From e3d4a6773923a884986aaa4bb272431ce27764e2 Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <henry.tjf@antfin.com>
Date: Tue, 2 Apr 2019 17:13:00 +0800
Subject: support /proc/net/snmp

This proc file contains statistics according to [1].

[1] https://tools.ietf.org/html/rfc2013

Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
Change-Id: I9662132085edd8a7783d356ce4237d7ac0800d94
---
 pkg/sentry/fs/proc/net.go       | 117 +++++++++++++++++++++-
 pkg/sentry/inet/inet.go         |  20 ++++
 test/syscalls/linux/BUILD       |   1 +
 test/syscalls/linux/proc_net.cc | 213 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 349 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index f70239449..ec1bc1d17 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -18,6 +18,7 @@ import (
 	"bytes"
 	"fmt"
 	"io"
+	"reflect"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -33,6 +34,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // newNet creates a new proc net entry.
@@ -40,7 +42,8 @@ func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSo
 	var contents map[string]*fs.Inode
 	if s := p.k.NetworkStack(); s != nil {
 		contents = map[string]*fs.Inode{
-			"dev": seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc),
+			"dev":  seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc),
+			"snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc),
 
 			// The following files are simple stubs until they are
 			// implemented in netstack, if the file contains a
@@ -195,6 +198,118 @@ func (n *netDev) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
 	return data, 0
 }
 
+// netSnmp implements seqfile.SeqSource for /proc/net/snmp.
+//
+// +stateify savable
+type netSnmp struct {
+	s inet.Stack
+}
+
+// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
+func (n *netSnmp) NeedsUpdate(generation int64) bool {
+	return true
+}
+
+type snmpLine struct {
+	prefix string
+	header string
+}
+
+var snmp = []snmpLine{
+	{
+		prefix: "Ip",
+		header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates",
+	},
+	{
+		prefix: "Icmp",
+		header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps",
+	},
+	{
+		prefix: "IcmpMsg",
+	},
+	{
+		prefix: "Tcp",
+		header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors",
+	},
+	{
+		prefix: "Udp",
+		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
+	},
+	{
+		prefix: "UdpLite",
+		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
+	},
+}
+
+func toSlice(a interface{}) []uint64 {
+	v := reflect.Indirect(reflect.ValueOf(a))
+	return v.Slice(0, v.Len()).Interface().([]uint64)
+}
+
+func sprintSlice(s []uint64) string {
+	if len(s) == 0 {
+		return ""
+	}
+	r := fmt.Sprint(s)
+	return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice.
+}
+
+// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. See Linux's
+// net/core/net-procfs.c:dev_seq_show.
+func (n *netSnmp) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+	if h != nil {
+		return nil, 0
+	}
+
+	contents := make([]string, 0, len(snmp)*2)
+	types := []interface{}{
+		&inet.StatSNMPIP{},
+		&inet.StatSNMPICMP{},
+		nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats.
+		&inet.StatSNMPTCP{},
+		&inet.StatSNMPUDP{},
+		&inet.StatSNMPUDPLite{},
+	}
+	for i, stat := range types {
+		line := snmp[i]
+		if stat == nil {
+			contents = append(
+				contents,
+				fmt.Sprintf("%s:\n", line.prefix),
+				fmt.Sprintf("%s:\n", line.prefix),
+			)
+			continue
+		}
+		if err := n.s.Statistics(stat, line.prefix); err != nil {
+			if err == syserror.EOPNOTSUPP {
+				log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
+			} else {
+				log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
+			}
+		}
+		var values string
+		if line.prefix == "Tcp" {
+			tcp := stat.(*inet.StatSNMPTCP)
+			// "Tcp" needs special processing because MaxConn is signed. RFC 2012.
+			values = fmt.Sprintf("%s %d %s", sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:]))
+		} else {
+			values = sprintSlice(toSlice(stat))
+		}
+		contents = append(
+			contents,
+			fmt.Sprintf("%s: %s\n", line.prefix, line.header),
+			fmt.Sprintf("%s: %s\n", line.prefix, values),
+		)
+	}
+
+	data := make([]seqfile.SeqData, 0, len(snmp)*2)
+	for _, l := range contents {
+		data = append(data, seqfile.SeqData{Buf: []byte(l), Handle: (*netSnmp)(nil)})
+	}
+
+	return data, 0
+}
+
 // netUnix implements seqfile.SeqSource for /proc/net/unix.
 //
 // +stateify savable
diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index 80f227dbe..bc6cb1095 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -153,3 +153,23 @@ type Route struct {
 	// GatewayAddr is the route gateway address (RTA_GATEWAY).
 	GatewayAddr []byte
 }
+
+// Below SNMP metrics are from Linux/usr/include/linux/snmp.h.
+
+// StatSNMPIP describes Ip line of /proc/net/snmp.
+type StatSNMPIP [19]uint64
+
+// StatSNMPICMP describes Icmp line of /proc/net/snmp.
+type StatSNMPICMP [27]uint64
+
+// StatSNMPICMPMSG describes IcmpMsg line of /proc/net/snmp.
+type StatSNMPICMPMSG [512]uint64
+
+// StatSNMPTCP describes Tcp line of /proc/net/snmp.
+type StatSNMPTCP [15]uint64
+
+// StatSNMPUDP describes Udp line of /proc/net/snmp.
+type StatSNMPUDP [8]uint64
+
+// StatSNMPUDPLite describes UdpLite line of /proc/net/snmp.
+type StatSNMPUDPLite [8]uint64
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 84a8eb76c..d243be9e4 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1552,6 +1552,7 @@ cc_binary(
     srcs = ["proc_net.cc"],
     linkstatic = 1,
     deps = [
+        ":socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index efdaf202b..af4cd616a 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -12,9 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "gtest/gtest.h"
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+
+#include "absl/strings/str_split.h"
 #include "gtest/gtest.h"
 #include "test/util/capability_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/fs_util.h"
 #include "test/util/test_util.h"
@@ -57,6 +65,209 @@ TEST(ProcSysNetIpv4Sack, CanReadAndWrite) {
   EXPECT_EQ(buf, to_write);
 }
 
+PosixErrorOr<uint64_t> GetSNMPMetricFromProc(const std::string snmp,
+                                             const std::string &type,
+                                             const std::string &item) {
+  std::vector<std::string> snmp_vec = absl::StrSplit(snmp, '\n');
+
+  // /proc/net/snmp prints a line of headers followed by a line of metrics.
+  // Only search the headers.
+  for (unsigned i = 0; i < snmp_vec.size(); i = i + 2) {
+    if (!absl::StartsWith(snmp_vec[i], type)) continue;
+
+    std::vector<std::string> fields =
+        absl::StrSplit(snmp_vec[i], ' ', absl::SkipWhitespace());
+
+    EXPECT_TRUE((i + 1) < snmp_vec.size());
+    std::vector<std::string> values =
+        absl::StrSplit(snmp_vec[i + 1], ' ', absl::SkipWhitespace());
+
+    EXPECT_TRUE(!fields.empty() && fields.size() == values.size());
+
+    // Metrics start at the first index.
+    for (unsigned j = 1; j < fields.size(); j++) {
+      if (fields[j] == item) {
+        uint64_t val;
+        if (!absl::SimpleAtoi(values[j], &val)) {
+          return PosixError(EINVAL,
+                            absl::StrCat("field is not a number: ", values[j]));
+        }
+
+        return val;
+      }
+    }
+  }
+  // We should never get here.
+  return PosixError(
+      EINVAL, absl::StrCat("failed to find ", type, "/", item, " in:", snmp));
+}
+
+TEST(ProcNetSnmp, TcpReset) {
+  // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
+  const DisableSave ds;
+
+  uint64_t oldAttemptFails;
+  uint64_t oldActiveOpens;
+  uint64_t oldOutRsts;
+  auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  oldActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+  oldOutRsts = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "OutRsts"));
+  oldAttemptFails = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "AttemptFails"));
+
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
+
+  struct sockaddr_in sin = {
+    .sin_family = AF_INET,
+    .sin_port = htons(1234),
+  };
+  sin.sin_addr.s_addr = inet_addr("127.0.0.1");
+  ASSERT_THAT(connect(s.get(), (struct sockaddr *)&sin, sizeof(sin)),
+              SyscallFailsWithErrno(ECONNREFUSED));
+
+  uint64_t newAttemptFails;
+  uint64_t newActiveOpens;
+  uint64_t newOutRsts;
+  snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  newActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+  newOutRsts = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "OutRsts"));
+  newAttemptFails = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "AttemptFails"));
+
+  EXPECT_EQ(oldActiveOpens, newActiveOpens - 1);
+  EXPECT_EQ(oldOutRsts, newOutRsts - 1);
+  EXPECT_EQ(oldAttemptFails, newAttemptFails - 1);
+}
+
+TEST(ProcNetSnmp, TcpEstab) {
+  // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
+  const DisableSave ds;
+
+  uint64_t oldEstabResets;
+  uint64_t oldActiveOpens;
+  uint64_t oldPassiveOpens;
+  uint64_t oldCurrEstab;
+  auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  oldActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+  oldPassiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "PassiveOpens"));
+  oldCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
+  oldEstabResets = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "EstabResets"));
+
+  FileDescriptor s_listen =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
+
+  struct sockaddr_in sin = {
+    .sin_family = AF_INET,
+    .sin_port = htons(1234),
+  };
+  sin.sin_addr.s_addr = inet_addr("127.0.0.1");
+  ASSERT_THAT(bind(s_listen.get(), (struct sockaddr *)&sin, sizeof(sin)),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(s_listen.get(), 1), SyscallSucceeds());
+
+  FileDescriptor s_connect =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
+  ASSERT_THAT(connect(s_connect.get(), (struct sockaddr *)&sin, sizeof(sin)),
+              SyscallSucceeds());
+
+  auto s_accept =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(s_listen.get(), nullptr, nullptr));
+
+  uint64_t newEstabResets;
+  uint64_t newActiveOpens;
+  uint64_t newPassiveOpens;
+  uint64_t newCurrEstab;
+  snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  newActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+  newPassiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "PassiveOpens"));
+  newCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
+
+  EXPECT_EQ(oldActiveOpens, newActiveOpens - 1);
+  EXPECT_EQ(oldPassiveOpens, newPassiveOpens - 1);
+  EXPECT_EQ(oldCurrEstab, newCurrEstab - 2);
+
+  ASSERT_THAT(send(s_connect.get(), "a", 1, 0), SyscallSucceedsWithValue(1));
+
+  s_accept.reset(-1);
+  s_connect.reset(-1);
+
+  snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  newCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
+  newEstabResets = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "EstabResets"));
+
+  EXPECT_EQ(oldCurrEstab, newCurrEstab);
+  EXPECT_EQ(oldEstabResets, newEstabResets - 2);
+}
+
+TEST(ProcNetSnmp, UdpNoPorts) {
+  // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
+  const DisableSave ds;
+
+  uint64_t oldOutDatagrams;
+  uint64_t oldNoPorts;
+  auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  oldOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+  oldNoPorts = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "NoPorts"));
+
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+  struct sockaddr_in sin = {
+    .sin_family = AF_INET,
+    .sin_port = htons(1234),
+  };
+  sin.sin_addr.s_addr = inet_addr("127.0.0.1");
+  ASSERT_THAT(sendto(s.get(), "a", 1, 0, (struct sockaddr *)&sin, sizeof(sin)),
+              SyscallSucceedsWithValue(1));
+
+  uint64_t newOutDatagrams;
+  uint64_t newNoPorts;
+  snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  newOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+  newNoPorts = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "NoPorts"));
+
+  EXPECT_EQ(oldOutDatagrams, newOutDatagrams - 1);
+  EXPECT_EQ(oldNoPorts, newNoPorts - 1);
+}
+
+TEST(ProcNetSnmp, UdpIn) {
+  // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
+  const DisableSave ds;
+
+  uint64_t oldOutDatagrams;
+  uint64_t oldInDatagrams;
+  auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  oldOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+  oldInDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "InDatagrams"));
+
+  FileDescriptor server =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+  struct sockaddr_in sin = {
+    .sin_family = AF_INET,
+    .sin_port = htons(1234),
+  };
+  sin.sin_addr.s_addr = inet_addr("127.0.0.1");
+  ASSERT_THAT(bind(server.get(), (struct sockaddr *)&sin, sizeof(sin)),
+      SyscallSucceeds());
+
+  FileDescriptor client =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  ASSERT_THAT(sendto(client.get(), "a", 1, 0, (struct sockaddr *)&sin,
+                     sizeof(sin)), SyscallSucceedsWithValue(1));
+
+  char buf[128];
+  ASSERT_THAT(recvfrom(server.get(), buf, sizeof(buf), 0, NULL, NULL),
+              SyscallSucceedsWithValue(1));
+
+  uint64_t newOutDatagrams;
+  uint64_t newInDatagrams;
+  snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  newOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+  newInDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "InDatagrams"));
+
+  EXPECT_EQ(oldOutDatagrams, newOutDatagrams - 1);
+  EXPECT_EQ(oldInDatagrams, newInDatagrams - 1);
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From b94505ecc020e63a7e5cab0f1bb5ea898ea05ec5 Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <henry.tjf@antfin.com>
Date: Thu, 18 Apr 2019 11:41:13 +0800
Subject: support /proc/net/route

This proc file reports routing information to applications inside the
container.

Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
Change-Id: I498e47f8c4c185419befbb42d849d0b099ec71f3
---
 pkg/abi/linux/netlink_route.go |  6 ++++
 pkg/sentry/fs/proc/BUILD       |  1 +
 pkg/sentry/fs/proc/net.go      | 78 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go
index 152f6b144..3898d2314 100644
--- a/pkg/abi/linux/netlink_route.go
+++ b/pkg/abi/linux/netlink_route.go
@@ -325,3 +325,9 @@ const (
 	RTA_SPORT         = 28
 	RTA_DPORT         = 29
 )
+
+// Route flags, from include/uapi/linux/route.h.
+const (
+	RTF_GATEWAY = 0x2
+	RTF_UP      = 0x1
+)
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index 1c93e8886..c307603a6 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -53,6 +53,7 @@ go_library(
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
         "//pkg/syserror",
+        "//pkg/tcpip/header",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index ec1bc1d17..402919924 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -35,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
 // newNet creates a new proc net entry.
@@ -60,7 +61,7 @@ func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSo
 			// (ClockGetres returns 1ns resolution).
 			"psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))),
 			"ptype":  newStaticProcInode(ctx, msrc, []byte("Type Device      Function")),
-			"route":  newStaticProcInode(ctx, msrc, []byte("Iface   Destination     Gateway         Flags   RefCnt  Use     Metric  Mask            MTU     Window  IRTT")),
+			"route":  seqfile.NewSeqFileInode(ctx, &netRoute{s: s}, msrc),
 			"tcp":    seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc),
 			"udp":    seqfile.NewSeqFileInode(ctx, &netUDP{k: k}, msrc),
 			"unix":   seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc),
@@ -310,6 +311,81 @@ func (n *netSnmp) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
 	return data, 0
 }
 
+// netRoute implements seqfile.SeqSource for /proc/net/route.
+//
+// +stateify savable
+type netRoute struct {
+	s inet.Stack
+}
+
+// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
+func (n *netRoute) NeedsUpdate(generation int64) bool {
+	return true
+}
+
+// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData.
+// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
+func (n *netRoute) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+	if h != nil {
+		return nil, 0
+	}
+
+	interfaces := n.s.Interfaces()
+	contents := []string{"Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT"}
+	for _, rt := range n.s.RouteTable() {
+		// /proc/net/route only includes ipv4 routes.
+		if rt.Family != linux.AF_INET {
+			continue
+		}
+
+		// /proc/net/route does not include broadcast or multicast routes.
+		if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST {
+			continue
+		}
+
+		iface, ok := interfaces[rt.OutputInterface]
+		if !ok || iface.Name == "lo" {
+			continue
+		}
+
+		var (
+			gw     uint32
+			prefix uint32
+			flags  = linux.RTF_UP
+		)
+		if len(rt.GatewayAddr) == header.IPv4AddressSize {
+			flags |= linux.RTF_GATEWAY
+			gw = usermem.ByteOrder.Uint32(rt.GatewayAddr)
+		}
+		if len(rt.DstAddr) == header.IPv4AddressSize {
+			prefix = usermem.ByteOrder.Uint32(rt.DstAddr)
+		}
+		l := fmt.Sprintf(
+			"%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d",
+			iface.Name,
+			prefix,
+			gw,
+			flags,
+			0, // RefCnt.
+			0, // Use.
+			0, // Metric.
+			(uint32(1)<<rt.DstLen)-1,
+			0, // MTU.
+			0, // Window.
+			0, // RTT.
+		)
+		contents = append(contents, l)
+	}
+
+	var data []seqfile.SeqData
+	for _, l := range contents {
+		l = fmt.Sprintf("%-127s\n", l)
+		data = append(data, seqfile.SeqData{Buf: []byte(l), Handle: (*netRoute)(nil)})
+	}
+
+	return data, 0
+}
+
 // netUnix implements seqfile.SeqSource for /proc/net/unix.
 //
 // +stateify savable
-- 
cgit v1.2.3


From dd7d1f825d2f6464b61287b3a324c13139b0d661 Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <henry.tjf@antfin.com>
Date: Thu, 16 May 2019 16:21:46 +0800
Subject: hostinet: support /proc/net/snmp and /proc/net/dev

For hostinet, we inherit the data from host procfs. To to that, we
cache the fds for these files for later reads.

Fixes #506

Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
Change-Id: I2f81215477455b9c59acf67e33f5b9af28ee0165
---
 pkg/sentry/socket/hostinet/stack.go | 106 +++++++++++++++++++++++++++++++++++-
 1 file changed, 105 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index 3a4fdec47..d4387f5d4 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -16,8 +16,11 @@ package hostinet
 
 import (
 	"fmt"
+	"io"
 	"io/ioutil"
 	"os"
+	"reflect"
+	"strconv"
 	"strings"
 	"syscall"
 
@@ -26,6 +29,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -51,6 +55,8 @@ type Stack struct {
 	tcpRecvBufSize inet.TCPBufferSize
 	tcpSendBufSize inet.TCPBufferSize
 	tcpSACKEnabled bool
+	netDevFile     *os.File
+	netSNMPFile    *os.File
 }
 
 // NewStack returns an empty Stack containing no configuration.
@@ -98,6 +104,18 @@ func (s *Stack) Configure() error {
 		log.Warningf("Failed to read if TCP SACK if enabled, setting to true")
 	}
 
+	if f, err := os.Open("/proc/net/dev"); err != nil {
+		log.Warningf("Failed to open /proc/net/dev: %v", err)
+	} else {
+		s.netDevFile = f
+	}
+
+	if f, err := os.Open("/proc/net/snmp"); err != nil {
+		log.Warningf("Failed to open /proc/net/snmp: %v", err)
+	} else {
+		s.netSNMPFile = f
+	}
+
 	return nil
 }
 
@@ -326,9 +344,95 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
 	return syserror.EACCES
 }
 
+// getLine reads one line from proc file, with specified prefix.
+// The last argument, withHeader, specifies if it contains line header.
+func getLine(f *os.File, prefix string, withHeader bool) string {
+	data := make([]byte, 4096)
+
+	if _, err := f.Seek(0, 0); err != nil {
+		return ""
+	}
+
+	if _, err := io.ReadFull(f, data); err != io.ErrUnexpectedEOF {
+		return ""
+	}
+
+	prefix = prefix + ":"
+	lines := strings.Split(string(data), "\n")
+	for _, l := range lines {
+		l = strings.TrimSpace(l)
+		if strings.HasPrefix(l, prefix) {
+			if withHeader {
+				withHeader = false
+				continue
+			}
+			return l
+		}
+	}
+	return ""
+}
+
+func toSlice(i interface{}) []uint64 {
+	v := reflect.Indirect(reflect.ValueOf(i))
+	return v.Slice(0, v.Len()).Interface().([]uint64)
+}
+
 // Statistics implements inet.Stack.Statistics.
 func (s *Stack) Statistics(stat interface{}, arg string) error {
-	return syserror.EOPNOTSUPP
+	var (
+		snmpTCP   bool
+		rawLine   string
+		sliceStat []uint64
+	)
+
+	switch stat.(type) {
+	case *inet.StatDev:
+		if s.netDevFile == nil {
+			return fmt.Errorf("/proc/net/dev is not opened for hostinet")
+		}
+		rawLine = getLine(s.netDevFile, arg, false /* with no header */)
+	case *inet.StatSNMPIP, *inet.StatSNMPICMP, *inet.StatSNMPICMPMSG, *inet.StatSNMPTCP, *inet.StatSNMPUDP, *inet.StatSNMPUDPLite:
+		if s.netSNMPFile == nil {
+			return fmt.Errorf("/proc/net/snmp is not opened for hostinet")
+		}
+		rawLine = getLine(s.netSNMPFile, arg, true)
+	default:
+		return syserr.ErrEndpointOperation.ToError()
+	}
+
+	if rawLine == "" {
+		return fmt.Errorf("Failed to get raw line")
+	}
+
+	parts := strings.SplitN(rawLine, ":", 2)
+	if len(parts) != 2 {
+		return fmt.Errorf("Failed to get prefix from: %q", rawLine)
+	}
+
+	sliceStat = toSlice(stat)
+	fields := strings.Fields(strings.TrimSpace(parts[1]))
+	if len(fields) != len(sliceStat) {
+		return fmt.Errorf("Failed to parse fields: %q", rawLine)
+	}
+	if _, ok := stat.(*inet.StatSNMPTCP); ok {
+		snmpTCP = true
+	}
+	for i := 0; i < len(sliceStat); i++ {
+		var err error
+		if snmpTCP && i == 3 {
+			var tmp int64
+			// MaxConn field is signed, RFC 2012.
+			tmp, err = strconv.ParseInt(fields[i], 10, 64)
+			sliceStat[i] = uint64(tmp) // Convert back to int before use.
+		} else {
+			sliceStat[i], err = strconv.ParseUint(fields[i], 10, 64)
+		}
+		if err != nil {
+			return fmt.Errorf("Failed to parse field %d from: %q, %v", i, rawLine, err)
+		}
+	}
+
+	return nil
 }
 
 // RouteTable implements inet.Stack.RouteTable.
-- 
cgit v1.2.3


From aee2c93366f451b9cc0a62430185749556fc3900 Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <henry.tjf@antfin.com>
Date: Thu, 29 Aug 2019 16:23:11 +0000
Subject: netstack: add counters for tcp CurrEstab and EstabResets

Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
---
 pkg/sentry/socket/netstack/netstack.go |  2 ++
 pkg/tcpip/tcpip.go                     | 14 ++++++++++++++
 pkg/tcpip/transport/tcp/accept.go      |  6 +++++-
 pkg/tcpip/transport/tcp/connect.go     | 17 ++++++++++++++++-
 pkg/tcpip/transport/tcp/endpoint.go    |  1 +
 pkg/tcpip/transport/tcp/snd.go         |  1 +
 6 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 0ae573b45..6fd43fcbd 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -148,6 +148,8 @@ var Metrics = tcpip.Stats{
 	TCP: tcpip.TCPStats{
 		ActiveConnectionOpenings:           mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
 		PassiveConnectionOpenings:          mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
+		CurrentEstablished:                 mustCreateMetric("/netstack/tcp/current_established", "Number of connections in either ESTABLISHED or CLOSE-WAIT state now."),
+		EstablishedResets:                  mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"),
 		ListenOverflowSynDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."),
 		ListenOverflowAckDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."),
 		ListenOverflowSynCookieSent:        mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."),
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 9d3752032..26f338d8d 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -673,6 +673,11 @@ func (s *StatCounter) Increment() {
 	s.IncrementBy(1)
 }
 
+// Decrement minuses one to the counter.
+func (s *StatCounter) Decrement() {
+	s.IncrementBy(^uint64(0))
+}
+
 // Value returns the current value of the counter.
 func (s *StatCounter) Value() uint64 {
 	return atomic.LoadUint64(&s.count)
@@ -881,6 +886,15 @@ type TCPStats struct {
 	// successfully via Listen.
 	PassiveConnectionOpenings *StatCounter
 
+	// CurrentEstablished is the number of TCP connections for which the
+	// current state is either ESTABLISHED or CLOSE-WAIT.
+	CurrentEstablished *StatCounter
+
+	// EstablishedResets is the number of times TCP connections have made
+	// a direct transition to the CLOSED state from either the
+	// ESTABLISHED state or the CLOSE-WAIT state.
+	EstablishedResets *StatCounter
+
 	// ListenOverflowSynDrop is the number of times the listen queue overflowed
 	// and a SYN was dropped.
 	ListenOverflowSynDrop *StatCounter
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 844959fa0..2b4c5c2f9 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -297,7 +297,10 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 		return nil, err
 	}
 	ep.mu.Lock()
-	ep.state = StateEstablished
+	if ep.state != StateEstablished {
+		ep.stack.Stats().TCP.CurrentEstablished.Increment()
+		ep.state = StateEstablished
+	}
 	ep.mu.Unlock()
 
 	// Update the receive window scaling. We can't do it before the
@@ -519,6 +522,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		n.tsOffset = 0
 
 		// Switch state to connected.
+		n.stack.Stats().TCP.CurrentEstablished.Increment()
 		n.state = StateEstablished
 
 		// Do the delivery in a separate goroutine so
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 5ea036bea..4467dda82 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -754,6 +754,10 @@ func (e *endpoint) handleClose() *tcpip.Error {
 func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 	// Only send a reset if the connection is being aborted for a reason
 	// other than receiving a reset.
+	if e.state == StateEstablished || e.state == StateCloseWait {
+		e.stack.Stats().TCP.EstablishedResets.Increment()
+		e.stack.Stats().TCP.CurrentEstablished.Decrement()
+	}
 	e.state = StateError
 	e.HardError = err
 	if err != tcpip.ErrConnectionReset {
@@ -924,6 +928,10 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			e.lastErrorMu.Unlock()
 
 			e.mu.Lock()
+			if e.state == StateEstablished || e.state == StateCloseWait {
+				e.stack.Stats().TCP.EstablishedResets.Increment()
+				e.stack.Stats().TCP.CurrentEstablished.Decrement()
+			}
 			e.state = StateError
 			e.HardError = err
 
@@ -954,7 +962,10 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 
 	// Tell waiters that the endpoint is connected and writable.
 	e.mu.Lock()
-	e.state = StateEstablished
+	if e.state != StateEstablished {
+		e.stack.Stats().TCP.CurrentEstablished.Increment()
+		e.state = StateEstablished
+	}
 	drained := e.drainDone != nil
 	e.mu.Unlock()
 	if drained {
@@ -1115,6 +1126,10 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	// Mark endpoint as closed.
 	e.mu.Lock()
 	if e.state != StateError {
+		if e.state == StateEstablished || e.state == StateCloseWait {
+			e.stack.Stats().TCP.EstablishedResets.Increment()
+			e.stack.Stats().TCP.CurrentEstablished.Decrement()
+		}
 		e.state = StateClose
 	}
 	// Lock released below.
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index a1b784b49..31a22c1eb 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1729,6 +1729,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 		e.segmentQueue.mu.Unlock()
 		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
 		e.state = StateEstablished
+		e.stack.Stats().TCP.CurrentEstablished.Increment()
 	}
 
 	if run {
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 8332a0179..d3f7c9125 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -674,6 +674,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 		default:
 			s.ep.state = StateFinWait1
 		}
+		s.ep.stack.Stats().TCP.CurrentEstablished.Decrement()
 		s.ep.mu.Unlock()
 	} else {
 		// We're sending a non-FIN segment.
-- 
cgit v1.2.3


From d277bfba2702b319d8336b65429cf8775661ea2f Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <henry.tjf@antfin.com>
Date: Mon, 20 May 2019 11:26:10 +0000
Subject: epsocket: support /proc/net/snmp

Netstack has its own stats, we use this to fill /proc/net/snmp.

Note that some metrics are not recorded in Netstack, which will be shown
as 0 in the proc file.

Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
Change-Id: Ie0089184507d16f49bc0057b4b0482094417ebe1
---
 pkg/sentry/socket/netstack/stack.go | 93 ++++++++++++++++++++++++++++++++++++-
 pkg/tcpip/transport/tcp/accept.go   |  6 +--
 pkg/tcpip/transport/tcp/connect.go  | 12 ++---
 test/syscalls/linux/proc_net.cc     | 23 +++++++++
 4 files changed, 121 insertions(+), 13 deletions(-)

diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index fda0156e5..d5db8c17c 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -144,7 +144,98 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
 
 // Statistics implements inet.Stack.Statistics.
 func (s *Stack) Statistics(stat interface{}, arg string) error {
-	return syserr.ErrEndpointOperation.ToError()
+	switch stats := stat.(type) {
+	case *inet.StatSNMPIP:
+		ip := Metrics.IP
+		*stats = inet.StatSNMPIP{
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/Forwarding.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/DefaultTTL.
+			ip.PacketsReceived.Value(),          // InReceives.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/InHdrErrors.
+			ip.InvalidAddressesReceived.Value(), // InAddrErrors.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ForwDatagrams.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/InUnknownProtos.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/InDiscards.
+			ip.PacketsDelivered.Value(),         // InDelivers.
+			ip.PacketsSent.Value(),              // OutRequests.
+			ip.OutgoingPacketErrors.Value(),     // OutDiscards.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/OutNoRoutes.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ReasmTimeout.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ReasmReqds.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ReasmOKs.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ReasmFails.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/FragOKs.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/FragFails.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/FragCreates.
+		}
+	case *inet.StatSNMPICMP:
+		in := Metrics.ICMP.V4PacketsReceived.ICMPv4PacketStats
+		out := Metrics.ICMP.V4PacketsSent.ICMPv4PacketStats
+		*stats = inet.StatSNMPICMP{
+			0, // TODO(gvisor.dev/issue/969): Support Icmp/InMsgs.
+			Metrics.ICMP.V4PacketsSent.Dropped.Value(), // InErrors.
+			0,                         // TODO(gvisor.dev/issue/969): Support Icmp/InCsumErrors.
+			in.DstUnreachable.Value(), // InDestUnreachs.
+			in.TimeExceeded.Value(),   // InTimeExcds.
+			in.ParamProblem.Value(),   // InParmProbs.
+			in.SrcQuench.Value(),      // InSrcQuenchs.
+			in.Redirect.Value(),       // InRedirects.
+			in.Echo.Value(),           // InEchos.
+			in.EchoReply.Value(),      // InEchoReps.
+			in.Timestamp.Value(),      // InTimestamps.
+			in.TimestampReply.Value(), // InTimestampReps.
+			in.InfoRequest.Value(),    // InAddrMasks.
+			in.InfoReply.Value(),      // InAddrMaskReps.
+			0,                         // TODO(gvisor.dev/issue/969): Support Icmp/OutMsgs.
+			Metrics.ICMP.V4PacketsReceived.Invalid.Value(), // OutErrors.
+			out.DstUnreachable.Value(),                     // OutDestUnreachs.
+			out.TimeExceeded.Value(),                       // OutTimeExcds.
+			out.ParamProblem.Value(),                       // OutParmProbs.
+			out.SrcQuench.Value(),                          // OutSrcQuenchs.
+			out.Redirect.Value(),                           // OutRedirects.
+			out.Echo.Value(),                               // OutEchos.
+			out.EchoReply.Value(),                          // OutEchoReps.
+			out.Timestamp.Value(),                          // OutTimestamps.
+			out.TimestampReply.Value(),                     // OutTimestampReps.
+			out.InfoRequest.Value(),                        // OutAddrMasks.
+			out.InfoReply.Value(),                          // OutAddrMaskReps.
+		}
+	case *inet.StatSNMPTCP:
+		tcp := Metrics.TCP
+		// RFC 2012 (updates 1213):  SNMPv2-MIB-TCP.
+		*stats = inet.StatSNMPTCP{
+			1,                                     // RtoAlgorithm.
+			200,                                   // RtoMin.
+			120000,                                // RtoMax.
+			(1<<64 - 1),                           // MaxConn.
+			tcp.ActiveConnectionOpenings.Value(),  // ActiveOpens.
+			tcp.PassiveConnectionOpenings.Value(), // PassiveOpens.
+			tcp.FailedConnectionAttempts.Value(),  // AttemptFails.
+			tcp.EstablishedResets.Value(),         // EstabResets.
+			tcp.CurrentEstablished.Value(),        // CurrEstab.
+			tcp.ValidSegmentsReceived.Value(),     // InSegs.
+			tcp.SegmentsSent.Value(),              // OutSegs.
+			tcp.Retransmits.Value(),               // RetransSegs.
+			tcp.InvalidSegmentsReceived.Value(),   // InErrs.
+			tcp.ResetsSent.Value(),                // OutRsts.
+			tcp.ChecksumErrors.Value(),            // InCsumErrors.
+		}
+	case *inet.StatSNMPUDP:
+		udp := Metrics.UDP
+		*stats = inet.StatSNMPUDP{
+			udp.PacketsReceived.Value(),     // InDatagrams.
+			udp.UnknownPortErrors.Value(),   // NoPorts.
+			0,                               // TODO(gvisor.dev/issue/969): Support Udp/InErrors.
+			udp.PacketsSent.Value(),         // OutDatagrams.
+			udp.ReceiveBufferErrors.Value(), // RcvbufErrors.
+			0,                               // TODO(gvisor.dev/issue/969): Support Udp/SndbufErrors.
+			0,                               // TODO(gvisor.dev/issue/969): Support Udp/InCsumErrors.
+			0,                               // TODO(gvisor.dev/issue/969): Support Udp/IgnoredMulti.
+		}
+	default:
+		return syserr.ErrEndpointOperation.ToError()
+	}
+	return nil
 }
 
 // RouteTable implements inet.Stack.RouteTable.
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 2b4c5c2f9..65c346046 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -297,10 +297,8 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 		return nil, err
 	}
 	ep.mu.Lock()
-	if ep.state != StateEstablished {
-		ep.stack.Stats().TCP.CurrentEstablished.Increment()
-		ep.state = StateEstablished
-	}
+	ep.stack.Stats().TCP.CurrentEstablished.Increment()
+	ep.state = StateEstablished
 	ep.mu.Unlock()
 
 	// Update the receive window scaling. We can't do it before the
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 4467dda82..b724d02bb 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -928,10 +928,8 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			e.lastErrorMu.Unlock()
 
 			e.mu.Lock()
-			if e.state == StateEstablished || e.state == StateCloseWait {
-				e.stack.Stats().TCP.EstablishedResets.Increment()
-				e.stack.Stats().TCP.CurrentEstablished.Decrement()
-			}
+			e.stack.Stats().TCP.EstablishedResets.Increment()
+			e.stack.Stats().TCP.CurrentEstablished.Decrement()
 			e.state = StateError
 			e.HardError = err
 
@@ -1126,10 +1124,8 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	// Mark endpoint as closed.
 	e.mu.Lock()
 	if e.state != StateError {
-		if e.state == StateEstablished || e.state == StateCloseWait {
-			e.stack.Stats().TCP.EstablishedResets.Increment()
-			e.stack.Stats().TCP.CurrentEstablished.Decrement()
-		}
+		e.stack.Stats().TCP.EstablishedResets.Increment()
+		e.stack.Stats().TCP.CurrentEstablished.Decrement()
 		e.state = StateClose
 	}
 	// Lock released below.
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index af4cd616a..d0ef8d380 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -15,11 +15,14 @@
 #include <arpa/inet.h>
 #include <errno.h>
 #include <netinet/in.h>
+#include <poll.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/syscall.h>
 
 #include "absl/strings/str_split.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
 #include "gtest/gtest.h"
 #include "test/util/capability_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
@@ -184,11 +187,31 @@ TEST(ProcNetSnmp, TcpEstab) {
   EXPECT_EQ(oldPassiveOpens, newPassiveOpens - 1);
   EXPECT_EQ(oldCurrEstab, newCurrEstab - 2);
 
+  // Send 1 byte from client to server.
   ASSERT_THAT(send(s_connect.get(), "a", 1, 0), SyscallSucceedsWithValue(1));
 
+  constexpr int kPollTimeoutMs = 20000;  // Wait up to 20 seconds for the data.
+
+  // Wait until server-side fd sees the data on its side but don't read it.
+  struct pollfd poll_fd = {s_accept.get(), POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+              SyscallSucceedsWithValue(1));
+
+  // Now close server-side fd without reading the data which leads to a RST
+  // packet sent to client side.
   s_accept.reset(-1);
+
+  // Wait until client-side fd sees RST packet.
+  struct pollfd poll_fd1 = {s_connect.get(), POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&poll_fd1, 1, kPollTimeoutMs),
+              SyscallSucceedsWithValue(1));
+
+  // Now close client-side fd.
   s_connect.reset(-1);
 
+  // Wait until the process of the netstack.
+  absl::SleepFor(absl::Seconds(1.0));
+
   snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
   newCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
   newEstabResets = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "EstabResets"));
-- 
cgit v1.2.3


From 2cee0669299cd2b980aa9ae253c24107a4813b26 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Thu, 26 Sep 2019 06:09:32 +0000
Subject: enable ring0 to support arm64

This patch enabled the basic framework for arm64 guest.

Serveral jobs were finished in this patch:
1, ring0.Vectors()
2, switchToUser()
3, basic framwork for Arm64 guest.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/ring0/BUILD            |  33 +-
 pkg/sentry/platform/ring0/aarch64.go       | 109 ++++++
 pkg/sentry/platform/ring0/defs.go          |  11 -
 pkg/sentry/platform/ring0/defs_amd64.go    |  11 +
 pkg/sentry/platform/ring0/defs_arm64.go    | 133 +++++++
 pkg/sentry/platform/ring0/entry_arm64.go   |  60 +++
 pkg/sentry/platform/ring0/entry_arm64.s    | 565 +++++++++++++++++++++++++++++
 pkg/sentry/platform/ring0/kernel_arm64.go  |  58 +++
 pkg/sentry/platform/ring0/lib_arm64.go     |  25 ++
 pkg/sentry/platform/ring0/offsets_arm64.go | 124 +++++++
 10 files changed, 1105 insertions(+), 24 deletions(-)
 create mode 100644 pkg/sentry/platform/ring0/aarch64.go
 create mode 100644 pkg/sentry/platform/ring0/defs_arm64.go
 create mode 100644 pkg/sentry/platform/ring0/entry_arm64.go
 create mode 100644 pkg/sentry/platform/ring0/entry_arm64.s
 create mode 100644 pkg/sentry/platform/ring0/kernel_arm64.go
 create mode 100644 pkg/sentry/platform/ring0/lib_arm64.go
 create mode 100644 pkg/sentry/platform/ring0/offsets_arm64.go

diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD
index 8ed6c7652..939a0033a 100644
--- a/pkg/sentry/platform/ring0/BUILD
+++ b/pkg/sentry/platform/ring0/BUILD
@@ -1,17 +1,16 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
-
 go_template(
     name = "defs",
-    srcs = [
-        "defs.go",
-        "defs_amd64.go",
-        "offsets_amd64.go",
-        "x86.go",
-    ],
+    srcs = select(
+    {
+        "@bazel_tools//src/conditions:linux_aarch64": ["defs.go", "defs_arm64.go", "offsets_arm64.go", "aarch64.go",],
+        "//conditions:default": ["defs.go", "defs_amd64.go", "offsets_amd64.go", "x86.go",],
+    },
+    ),
     visibility = [":__subpackages__"],
 )
 
@@ -23,10 +22,15 @@ go_template_instance(
 )
 
 genrule(
-    name = "entry_impl_amd64",
-    srcs = ["entry_amd64.s"],
-    outs = ["entry_impl_amd64.s"],
-    cmd = "(echo -e '// build +amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@",
+    name = "entry_impl",
+    srcs = ["entry_amd64.s", "entry_arm64.s"],
+    outs = ["entry_impl.s"],
+    cmd = select(
+    {
+        "@bazel_tools//src/conditions:linux_aarch64": "(echo -e '// build +arm64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(location entry_arm64.s)) > $@",
+        "//conditions:default": "(echo -e '// build +amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(location entry_amd64.s)) > $@",
+    },
+    ),
     tools = ["//pkg/sentry/platform/ring0/gen_offsets"],
 )
 
@@ -35,12 +39,15 @@ go_library(
     srcs = [
         "defs_impl.go",
         "entry_amd64.go",
-        "entry_impl_amd64.s",
+        "entry_arm64.go",
+        "entry_impl.s",
         "kernel.go",
         "kernel_amd64.go",
+        "kernel_arm64.go",
         "kernel_unsafe.go",
         "lib_amd64.go",
         "lib_amd64.s",
+        "lib_arm64.go",
         "ring0.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0",
diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go
new file mode 100644
index 000000000..6b078cd1e
--- /dev/null
+++ b/pkg/sentry/platform/ring0/aarch64.go
@@ -0,0 +1,109 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package ring0
+
+// Useful bits.
+const (
+	_PGD_PGT_BASE = 0x1000
+	_PGD_PGT_SIZE = 0x1000
+	_PUD_PGT_BASE = 0x2000
+	_PUD_PGT_SIZE = 0x1000
+	_PMD_PGT_BASE = 0x3000
+	_PMD_PGT_SIZE = 0x4000
+	_PTE_PGT_BASE = 0x7000
+	_PTE_PGT_SIZE = 0x1000
+
+	_PSR_MODE_EL0t = 0x0
+	_PSR_MODE_EL1t = 0x4
+	_PSR_MODE_EL1h = 0x5
+	_PSR_EL_MASK   = 0xf
+
+	_PSR_D_BIT = 0x200
+	_PSR_A_BIT = 0x100
+	_PSR_I_BIT = 0x80
+	_PSR_F_BIT = 0x40
+)
+
+const (
+	// KernelFlagsSet should always be set in the kernel.
+	KernelFlagsSet = _PSR_MODE_EL1h
+
+	// UserFlagsSet are always set in userspace.
+	UserFlagsSet = _PSR_MODE_EL0t
+
+	KernelFlagsClear = _PSR_EL_MASK
+	UserFlagsClear   = _PSR_EL_MASK
+
+	PsrDefaultSet = _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT
+)
+
+// Vector is an exception vector.
+type Vector uintptr
+
+// Exception vectors.
+const (
+	El1SyncInvalid = iota
+	El1IrqInvalid
+	El1FiqInvalid
+	El1ErrorInvalid
+	El1Sync
+	El1Irq
+	El1Fiq
+	El1Error
+	El0Sync
+	El0Irq
+	El0Fiq
+	El0Error
+	El0Sync_invalid
+	El0Irq_invalid
+	El0Fiq_invalid
+	El0Error_invalid
+	El1Sync_da
+	El1Sync_ia
+	El1Sync_sp_pc
+	El1Sync_undef
+	El1Sync_dbg
+	El1Sync_inv
+	El0Sync_svc
+	El0Sync_da
+	El0Sync_ia
+	El0Sync_fpsimd_acc
+	El0Sync_sve_acc
+	El0Sync_sys
+	El0Sync_sp_pc
+	El0Sync_undef
+	El0Sync_dbg
+	El0Sync_inv
+	VirtualizationException
+	_NR_INTERRUPTS
+)
+
+// System call vectors.
+const (
+	Syscall   Vector = El0Sync_svc
+	PageFault Vector = El0Sync_da
+)
+
+// VirtualAddressBits returns the number bits available for virtual addresses.
+func VirtualAddressBits() uint32 {
+	return 48
+}
+
+// PhysicalAddressBits returns the number of bits available for physical addresses.
+func PhysicalAddressBits() uint32 {
+	return 40
+}
diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go
index 076063f85..3f094c2a7 100644
--- a/pkg/sentry/platform/ring0/defs.go
+++ b/pkg/sentry/platform/ring0/defs.go
@@ -20,17 +20,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
-var (
-	// UserspaceSize is the total size of userspace.
-	UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1)
-
-	// MaximumUserAddress is the largest possible user address.
-	MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1)
-
-	// KernelStartAddress is the starting kernel address.
-	KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
-)
-
 // Kernel is a global kernel object.
 //
 // This contains global state, shared by multiple CPUs.
diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go
index 7206322b1..10dbd381f 100644
--- a/pkg/sentry/platform/ring0/defs_amd64.go
+++ b/pkg/sentry/platform/ring0/defs_amd64.go
@@ -20,6 +20,17 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
 )
 
+var (
+	// UserspaceSize is the total size of userspace.
+	UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1)
+
+	// MaximumUserAddress is the largest possible user address.
+	MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1)
+
+	// KernelStartAddress is the starting kernel address.
+	KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
+)
+
 // Segment indices and Selectors.
 const (
 	// Index into GDT array.
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go
new file mode 100644
index 000000000..fbfbd9bab
--- /dev/null
+++ b/pkg/sentry/platform/ring0/defs_arm64.go
@@ -0,0 +1,133 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package ring0
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+)
+
+var (
+	// UserspaceSize is the total size of userspace.
+	UserspaceSize = uintptr(1) << (VirtualAddressBits())
+
+	// MaximumUserAddress is the largest possible user address.
+	MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1)
+
+	// KernelStartAddress is the starting kernel address.
+	KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
+)
+
+// KernelOpts has initialization options for the kernel.
+type KernelOpts struct {
+	// PageTables are the kernel pagetables; this must be provided.
+	PageTables *pagetables.PageTables
+}
+
+// KernelArchState contains architecture-specific state.
+type KernelArchState struct {
+	KernelOpts
+}
+
+// CPUArchState contains CPU-specific arch state.
+type CPUArchState struct {
+	// stack is the stack used for interrupts on this CPU.
+	stack [512]byte
+
+	// errorCode is the error code from the last exception.
+	errorCode uintptr
+
+	// errorType indicates the type of error code here, it is always set
+	// along with the errorCode value above.
+	//
+	// It will either by 1, which indicates a user error, or 0 indicating a
+	// kernel error. If the error code below returns false (kernel error),
+	// then it cannot provide relevant information about the last
+	// exception.
+	errorType uintptr
+
+	// faultAddr is the value of far_el1.
+	faultAddr uintptr
+
+	// ttbr0Kvm is the value of ttbr0_el1 for sentry.
+	ttbr0Kvm uintptr
+
+	// ttbr0App is the value of ttbr0_el1 for applicaton.
+	ttbr0App uintptr
+
+	// exception vector.
+	vecCode Vector
+
+	// application context pointer.
+	appAddr uintptr
+}
+
+// ErrorCode returns the last error code.
+//
+// The returned boolean indicates whether the error code corresponds to the
+// last user error or not. If it does not, then fault information must be
+// ignored. This is generally the result of a kernel fault while servicing a
+// user fault.
+//
+//go:nosplit
+func (c *CPU) ErrorCode() (value uintptr, user bool) {
+	return c.errorCode, c.errorType != 0
+}
+
+// ClearErrorCode resets the error code.
+//
+//go:nosplit
+func (c *CPU) ClearErrorCode() {
+	c.errorCode = 0 // No code.
+	c.errorType = 1 // User mode.
+}
+
+//go:nosplit
+func (c *CPU) GetFaultAddr() (value uintptr) {
+	return c.faultAddr
+}
+
+//go:nosplit
+func (c *CPU) SetTtbr0Kvm(value uintptr) {
+	c.ttbr0Kvm = value
+}
+
+//go:nosplit
+func (c *CPU) SetTtbr0App(value uintptr) {
+	c.ttbr0App = value
+}
+
+//go:nosplit
+func (c *CPU) GetVector() (value Vector) {
+	return c.vecCode
+}
+
+//go:nosplit
+func (c *CPU) SetAppAddr(value uintptr) {
+	c.appAddr = value
+}
+
+// SwitchArchOpts are embedded in SwitchOpts.
+type SwitchArchOpts struct {
+	// UserASID indicates that the application ASID to be used on switch,
+	UserASID uint16
+
+	// KernelASID indicates that the kernel ASID to be used on return,
+	KernelASID uint16
+}
+
+func init() {
+}
diff --git a/pkg/sentry/platform/ring0/entry_arm64.go b/pkg/sentry/platform/ring0/entry_arm64.go
new file mode 100644
index 000000000..0dfa42c36
--- /dev/null
+++ b/pkg/sentry/platform/ring0/entry_arm64.go
@@ -0,0 +1,60 @@
+// Copyright 2019 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package ring0
+
+// This is an assembly function.
+//
+// The sysenter function is invoked in two situations:
+//
+//  (1) The guest kernel has executed a system call.
+//  (2) The guest application has executed a system call.
+//
+// The interrupt flag is examined to determine whether the system call was
+// executed from kernel mode or not and the appropriate stub is called.
+
+func El1_sync_invalid()
+func El1_irq_invalid()
+func El1_fiq_invalid()
+func El1_error_invalid()
+
+func El1_sync()
+func El1_irq()
+func El1_fiq()
+func El1_error()
+
+func El0_sync()
+func El0_irq()
+func El0_fiq()
+func El0_error()
+
+func El0_sync_invalid()
+func El0_irq_invalid()
+func El0_fiq_invalid()
+func El0_error_invalid()
+
+func Vectors()
+
+// Start is the CPU entrypoint.
+//
+// The CPU state will be set to c.Registers().
+func Start()
+func kernelExitToEl1()
+
+func kernelExitToEl0()
+
+// Shutdown execution
+func Shutdown()
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
new file mode 100644
index 000000000..29c475882
--- /dev/null
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -0,0 +1,565 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+// NB: Offsets are programatically generated (see BUILD).
+//
+// This file is concatenated with the definitions.
+
+// Saves a register set.
+//
+// This is a macro because it may need to executed in contents where a stack is
+// not available for calls.
+//
+
+#define ERET() \
+  WORD $0xd69f03e0
+
+#define RSV_REG 	R18_PLATFORM
+#define RSV_REG_APP 	R9
+
+#define REGISTERS_SAVE(reg, offset) \
+  MOVD R0, offset+PTRACE_R0(reg); \
+  MOVD R1, offset+PTRACE_R1(reg); \
+  MOVD R2, offset+PTRACE_R2(reg); \
+  MOVD R3, offset+PTRACE_R3(reg); \
+  MOVD R4, offset+PTRACE_R4(reg); \
+  MOVD R5, offset+PTRACE_R5(reg); \
+  MOVD R6, offset+PTRACE_R6(reg); \
+  MOVD R7, offset+PTRACE_R7(reg); \
+  MOVD R8, offset+PTRACE_R8(reg); \
+  MOVD R10, offset+PTRACE_R10(reg); \
+  MOVD R11, offset+PTRACE_R11(reg); \
+  MOVD R12, offset+PTRACE_R12(reg); \
+  MOVD R13, offset+PTRACE_R13(reg); \
+  MOVD R14, offset+PTRACE_R14(reg); \
+  MOVD R15, offset+PTRACE_R15(reg); \
+  MOVD R16, offset+PTRACE_R16(reg); \
+  MOVD R17, offset+PTRACE_R17(reg); \
+  MOVD R19, offset+PTRACE_R19(reg); \
+  MOVD R20, offset+PTRACE_R20(reg); \
+  MOVD R21, offset+PTRACE_R21(reg); \
+  MOVD R22, offset+PTRACE_R22(reg); \
+  MOVD R23, offset+PTRACE_R23(reg); \
+  MOVD R24, offset+PTRACE_R24(reg); \
+  MOVD R25, offset+PTRACE_R25(reg); \
+  MOVD R26, offset+PTRACE_R26(reg); \
+  MOVD R27, offset+PTRACE_R27(reg); \
+  MOVD g,   offset+PTRACE_R28(reg); \
+  MOVD R29, offset+PTRACE_R29(reg); \
+  MOVD R30, offset+PTRACE_R30(reg);
+
+#define REGISTERS_LOAD(reg, offset) \
+  MOVD offset+PTRACE_R0(reg), R0; \
+  MOVD offset+PTRACE_R1(reg), R1; \
+  MOVD offset+PTRACE_R2(reg), R2; \
+  MOVD offset+PTRACE_R3(reg), R3; \
+  MOVD offset+PTRACE_R4(reg), R4; \
+  MOVD offset+PTRACE_R5(reg), R5; \
+  MOVD offset+PTRACE_R6(reg), R6; \
+  MOVD offset+PTRACE_R7(reg), R7; \
+  MOVD offset+PTRACE_R8(reg), R8; \
+  MOVD offset+PTRACE_R10(reg), R10; \
+  MOVD offset+PTRACE_R11(reg), R11; \
+  MOVD offset+PTRACE_R12(reg), R12; \
+  MOVD offset+PTRACE_R13(reg), R13; \
+  MOVD offset+PTRACE_R14(reg), R14; \
+  MOVD offset+PTRACE_R15(reg), R15; \
+  MOVD offset+PTRACE_R16(reg), R16; \
+  MOVD offset+PTRACE_R17(reg), R17; \
+  MOVD offset+PTRACE_R19(reg), R19; \
+  MOVD offset+PTRACE_R20(reg), R20; \
+  MOVD offset+PTRACE_R21(reg), R21; \
+  MOVD offset+PTRACE_R22(reg), R22; \
+  MOVD offset+PTRACE_R23(reg), R23; \
+  MOVD offset+PTRACE_R24(reg), R24; \
+  MOVD offset+PTRACE_R25(reg), R25; \
+  MOVD offset+PTRACE_R26(reg), R26; \
+  MOVD offset+PTRACE_R27(reg), R27; \
+  MOVD offset+PTRACE_R28(reg), g; \
+  MOVD offset+PTRACE_R29(reg), R29; \
+  MOVD offset+PTRACE_R30(reg), R30;
+
+//NOP
+#define nop31Instructions() \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f; \
+        WORD $0xd503201f;
+
+#define ESR_ELx_EC_UNKNOWN	(0x00)
+#define ESR_ELx_EC_WFx		(0x01)
+/* Unallocated EC: 0x02 */
+#define ESR_ELx_EC_CP15_32	(0x03)
+#define ESR_ELx_EC_CP15_64	(0x04)
+#define ESR_ELx_EC_CP14_MR	(0x05)
+#define ESR_ELx_EC_CP14_LS	(0x06)
+#define ESR_ELx_EC_FP_ASIMD	(0x07)
+#define ESR_ELx_EC_CP10_ID	(0x08)	/* EL2 only */
+#define ESR_ELx_EC_PAC		(0x09)	/* EL2 and above */
+/* Unallocated EC: 0x0A - 0x0B */
+#define ESR_ELx_EC_CP14_64	(0x0C)
+/* Unallocated EC: 0x0d */
+#define ESR_ELx_EC_ILL		(0x0E)
+/* Unallocated EC: 0x0F - 0x10 */
+#define ESR_ELx_EC_SVC32	(0x11)
+#define ESR_ELx_EC_HVC32	(0x12)	/* EL2 only */
+#define ESR_ELx_EC_SMC32	(0x13)	/* EL2 and above */
+/* Unallocated EC: 0x14 */
+#define ESR_ELx_EC_SVC64	(0x15)
+#define ESR_ELx_EC_HVC64	(0x16)	/* EL2 and above */
+#define ESR_ELx_EC_SMC64	(0x17)	/* EL2 and above */
+#define ESR_ELx_EC_SYS64	(0x18)
+#define ESR_ELx_EC_SVE		(0x19)
+/* Unallocated EC: 0x1A - 0x1E */
+#define ESR_ELx_EC_IMP_DEF	(0x1f)	/* EL3 only */
+#define ESR_ELx_EC_IABT_LOW	(0x20)
+#define ESR_ELx_EC_IABT_CUR	(0x21)
+#define ESR_ELx_EC_PC_ALIGN	(0x22)
+/* Unallocated EC: 0x23 */
+#define ESR_ELx_EC_DABT_LOW	(0x24)
+#define ESR_ELx_EC_DABT_CUR	(0x25)
+#define ESR_ELx_EC_SP_ALIGN	(0x26)
+/* Unallocated EC: 0x27 */
+#define ESR_ELx_EC_FP_EXC32	(0x28)
+/* Unallocated EC: 0x29 - 0x2B */
+#define ESR_ELx_EC_FP_EXC64	(0x2C)
+/* Unallocated EC: 0x2D - 0x2E */
+#define ESR_ELx_EC_SERROR	(0x2F)
+#define ESR_ELx_EC_BREAKPT_LOW	(0x30)
+#define ESR_ELx_EC_BREAKPT_CUR	(0x31)
+#define ESR_ELx_EC_SOFTSTP_LOW	(0x32)
+#define ESR_ELx_EC_SOFTSTP_CUR	(0x33)
+#define ESR_ELx_EC_WATCHPT_LOW	(0x34)
+#define ESR_ELx_EC_WATCHPT_CUR	(0x35)
+/* Unallocated EC: 0x36 - 0x37 */
+#define ESR_ELx_EC_BKPT32	(0x38)
+/* Unallocated EC: 0x39 */
+#define ESR_ELx_EC_VECTOR32	(0x3A)	/* EL2 only */
+/* Unallocted EC: 0x3B */
+#define ESR_ELx_EC_BRK64	(0x3C)
+/* Unallocated EC: 0x3D - 0x3F */
+#define ESR_ELx_EC_MAX		(0x3F)
+
+#define ESR_ELx_EC_SHIFT	(26)
+#define ESR_ELx_EC_MASK		(UL(0x3F) << ESR_ELx_EC_SHIFT)
+#define ESR_ELx_EC(esr)		(((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT)
+
+#define ESR_ELx_IL_SHIFT	(25)
+#define ESR_ELx_IL		(UL(1) << ESR_ELx_IL_SHIFT)
+#define ESR_ELx_ISS_MASK	(ESR_ELx_IL - 1)
+
+/* ISS field definitions shared by different classes */
+#define ESR_ELx_WNR_SHIFT	(6)
+#define ESR_ELx_WNR		(UL(1) << ESR_ELx_WNR_SHIFT)
+
+/* Asynchronous Error Type */
+#define ESR_ELx_IDS_SHIFT	(24)
+#define ESR_ELx_IDS		(UL(1) << ESR_ELx_IDS_SHIFT)
+#define ESR_ELx_AET_SHIFT	(10)
+#define ESR_ELx_AET		(UL(0x7) << ESR_ELx_AET_SHIFT)
+
+#define ESR_ELx_AET_UC		(UL(0) << ESR_ELx_AET_SHIFT)
+#define ESR_ELx_AET_UEU		(UL(1) << ESR_ELx_AET_SHIFT)
+#define ESR_ELx_AET_UEO		(UL(2) << ESR_ELx_AET_SHIFT)
+#define ESR_ELx_AET_UER		(UL(3) << ESR_ELx_AET_SHIFT)
+#define ESR_ELx_AET_CE		(UL(6) << ESR_ELx_AET_SHIFT)
+
+/* Shared ISS field definitions for Data/Instruction aborts */
+#define ESR_ELx_SET_SHIFT	(11)
+#define ESR_ELx_SET_MASK	(UL(3) << ESR_ELx_SET_SHIFT)
+#define ESR_ELx_FnV_SHIFT	(10)
+#define ESR_ELx_FnV		(UL(1) << ESR_ELx_FnV_SHIFT)
+#define ESR_ELx_EA_SHIFT	(9)
+#define ESR_ELx_EA		(UL(1) << ESR_ELx_EA_SHIFT)
+#define ESR_ELx_S1PTW_SHIFT	(7)
+#define ESR_ELx_S1PTW		(UL(1) << ESR_ELx_S1PTW_SHIFT)
+
+/* Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts */
+#define ESR_ELx_FSC		(0x3F)
+#define ESR_ELx_FSC_TYPE	(0x3C)
+#define ESR_ELx_FSC_EXTABT	(0x10)
+#define ESR_ELx_FSC_SERROR	(0x11)
+#define ESR_ELx_FSC_ACCESS	(0x08)
+#define ESR_ELx_FSC_FAULT	(0x04)
+#define ESR_ELx_FSC_PERM	(0x0C)
+
+/* ISS field definitions for Data Aborts */
+#define ESR_ELx_ISV_SHIFT	(24)
+#define ESR_ELx_ISV		(UL(1) << ESR_ELx_ISV_SHIFT)
+#define ESR_ELx_SAS_SHIFT	(22)
+#define ESR_ELx_SAS		(UL(3) << ESR_ELx_SAS_SHIFT)
+#define ESR_ELx_SSE_SHIFT	(21)
+#define ESR_ELx_SSE		(UL(1) << ESR_ELx_SSE_SHIFT)
+#define ESR_ELx_SRT_SHIFT	(16)
+#define ESR_ELx_SRT_MASK	(UL(0x1F) << ESR_ELx_SRT_SHIFT)
+#define ESR_ELx_SF_SHIFT	(15)
+#define ESR_ELx_SF 		(UL(1) << ESR_ELx_SF_SHIFT)
+#define ESR_ELx_AR_SHIFT	(14)
+#define ESR_ELx_AR 		(UL(1) << ESR_ELx_AR_SHIFT)
+#define ESR_ELx_CM_SHIFT	(8)
+#define ESR_ELx_CM 		(UL(1) << ESR_ELx_CM_SHIFT)
+
+/* ISS field definitions for exceptions taken in to Hyp */
+#define ESR_ELx_CV		(UL(1) << 24)
+#define ESR_ELx_COND_SHIFT	(20)
+#define ESR_ELx_COND_MASK	(UL(0xF) << ESR_ELx_COND_SHIFT)
+#define ESR_ELx_WFx_ISS_TI	(UL(1) << 0)
+#define ESR_ELx_WFx_ISS_WFI	(UL(0) << 0)
+#define ESR_ELx_WFx_ISS_WFE	(UL(1) << 0)
+#define ESR_ELx_xVC_IMM_MASK	((1UL << 16) - 1)
+
+#define LOAD_KERNEL_ADDRESS(from, to) \
+	MOVD from, to; \
+	ORR $0xffff000000000000, to, to;
+
+// LOAD_KERNEL_STACK loads the kernel temporary stack.
+#define LOAD_KERNEL_STACK(from) \
+	LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \
+	MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \
+	MOVD RSV_REG, RSP; \
+	ISB $15; \
+	DSB $15;
+
+#define SWITCH_TO_APP_PAGETABLE(from) \
+	MOVD CPU_TTBR0_APP(from), RSV_REG; \
+	WORD $0xd5182012; \	//        MSR R18, TTBR0_EL1
+	ISB $15; \
+	DSB $15;
+
+#define SWITCH_TO_KVM_PAGETABLE(from) \
+	MOVD CPU_TTBR0_KVM(from), RSV_REG; \
+	WORD $0xd5182012; \	//        MSR R18, TTBR0_EL1
+	ISB $15; \
+	DSB $15;
+
+#define IRQ_ENABLE \
+	MSR $2, DAIFSet;
+
+#define IRQ_DISABLE \
+	MSR $2, DAIFClr;
+
+#define KERNEL_ENTRY_FROM_EL0 \
+	SUB $16, RSP, RSP; \		// step1, save r18, r9 into kernel temporary stack.
+	STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \
+	WORD $0xd538d092; \    //MRS   TPIDR_EL1, R18, step2, switch user pagetable.
+	SWITCH_TO_KVM_PAGETABLE(RSV_REG); \
+	WORD $0xd538d092; \    //MRS   TPIDR_EL1, R18
+	MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step3, load app context pointer.
+	REGISTERS_SAVE(RSV_REG_APP, 0); \          // step4, save app context.
+	MOVD RSV_REG_APP, R20; \
+	LDP 16*0(RSP), (RSV_REG, RSV_REG_APP); \
+	ADD $16, RSP, RSP; \
+	MOVD RSV_REG, PTRACE_R18(R20); \
+	MOVD RSV_REG_APP, PTRACE_R9(R20); \
+	MOVD R20, RSV_REG_APP; \
+	WORD $0xd5384003; \      //  MRS SPSR_EL1, R3
+	MOVD R3, PTRACE_PSTATE(RSV_REG_APP); \
+	MRS ELR_EL1, R3; \
+	MOVD R3, PTRACE_PC(RSV_REG_APP); \
+	WORD $0xd5384103; \      //  MRS SP_EL0, R3
+	MOVD R3, PTRACE_SP(RSV_REG_APP);
+
+#define KERNEL_ENTRY_FROM_EL1 \
+	WORD $0xd538d092; \   //MRS   TPIDR_EL1, R18
+	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \	// save sentry context
+	MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG); \
+	WORD $0xd5384004; \    //    MRS SPSR_EL1, R4
+	MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \
+	MRS ELR_EL1, R4; \
+	MOVD R4, CPU_REGISTERS+PTRACE_PC(RSV_REG); \
+	MOVD RSP, R4; \
+	MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG);
+
+TEXT ·Halt(SB),NOSPLIT,$0
+	// clear bluepill.
+	WORD $0xd538d092   //MRS   TPIDR_EL1, R18
+	CMP RSV_REG, R9
+	BNE mmio_exit
+	MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG)
+mmio_exit:
+	// MMIO_EXIT.
+	MOVD $0, R9
+	MOVD R0, 0xffff000000001000(R9)
+	B ·kernelExitToEl1(SB)
+
+TEXT ·Shutdown(SB),NOSPLIT,$0
+	// PSCI EVENT.
+	MOVD $0x84000009, R0
+	HVC $0
+
+// See kernel.go.
+TEXT ·Current(SB),NOSPLIT,$0-8
+	MOVD CPU_SELF(RSV_REG), R8
+	MOVD R8, ret+0(FP)
+	RET
+
+#define STACK_FRAME_SIZE 16
+
+TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
+	ERET()
+
+TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
+	ERET()
+
+TEXT ·Start(SB),NOSPLIT,$0
+	IRQ_DISABLE
+	MOVD R8, RSV_REG
+	ORR $0xffff000000000000, RSV_REG, RSV_REG
+	WORD $0xd518d092        //MSR R18, TPIDR_EL1
+
+	B ·kernelExitToEl1(SB)
+
+TEXT ·El1_sync_invalid(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·El1_irq_invalid(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·El1_fiq_invalid(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·El1_error_invalid(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·El1_sync(SB),NOSPLIT,$0
+	KERNEL_ENTRY_FROM_EL1
+	WORD $0xd5385219        // MRS ESR_EL1, R25
+	LSR  $ESR_ELx_EC_SHIFT, R25, R24
+	CMP $ESR_ELx_EC_DABT_CUR, R24
+	BEQ el1_da
+	CMP $ESR_ELx_EC_IABT_CUR, R24
+	BEQ el1_ia
+	CMP $ESR_ELx_EC_SYS64, R24
+	BEQ el1_undef
+	CMP $ESR_ELx_EC_SP_ALIGN, R24
+	BEQ el1_sp_pc
+	CMP $ESR_ELx_EC_PC_ALIGN, R24
+	BEQ el1_sp_pc
+	CMP $ESR_ELx_EC_UNKNOWN, R24
+	BEQ el1_undef
+	CMP $ESR_ELx_EC_SVC64, R24
+	BEQ el1_svc
+	CMP $ESR_ELx_EC_BREAKPT_CUR, R24
+	BGE el1_dbg
+	B el1_invalid
+
+el1_da:
+	B ·Halt(SB)
+
+el1_ia:
+	B ·Halt(SB)
+
+el1_sp_pc:
+	B ·Shutdown(SB)
+
+el1_undef:
+	B ·Shutdown(SB)
+
+el1_svc:
+	B ·Halt(SB)
+
+el1_dbg:
+	B ·Shutdown(SB)
+
+el1_invalid:
+	B ·Shutdown(SB)
+
+TEXT ·El1_irq(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·El1_fiq(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·El1_error(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·El0_sync(SB),NOSPLIT,$0
+	KERNEL_ENTRY_FROM_EL0
+	WORD $0xd5385219	// MRS ESR_EL1, R25
+	LSR  $ESR_ELx_EC_SHIFT, R25, R24
+	CMP $ESR_ELx_EC_SVC64, R24
+	BEQ el0_svc
+	CMP $ESR_ELx_EC_DABT_LOW, R24
+	BEQ el0_da
+	CMP $ESR_ELx_EC_IABT_LOW, R24
+	BEQ el0_ia
+	CMP $ESR_ELx_EC_FP_ASIMD, R24
+	BEQ el0_fpsimd_acc
+	CMP $ESR_ELx_EC_SVE, R24
+	BEQ el0_sve_acc
+	CMP $ESR_ELx_EC_FP_EXC64, R24
+	BEQ el0_fpsimd_exc
+	CMP $ESR_ELx_EC_SP_ALIGN, R24
+	BEQ el0_sp_pc
+	CMP $ESR_ELx_EC_PC_ALIGN, R24
+	BEQ el0_sp_pc
+	CMP $ESR_ELx_EC_UNKNOWN, R24
+	BEQ el0_undef
+	CMP $ESR_ELx_EC_BREAKPT_LOW, R24
+	BGE el0_dbg
+	B   el0_invalid
+
+el0_svc:
+	B ·Halt(SB)
+
+el0_da:
+	B ·Halt(SB)
+
+el0_ia:
+	B ·Shutdown(SB)
+
+el0_fpsimd_acc:
+	B ·Shutdown(SB)
+
+el0_sve_acc:
+	B ·Shutdown(SB)
+
+el0_fpsimd_exc:
+	B ·Shutdown(SB)
+
+el0_sp_pc:
+	B ·Shutdown(SB)
+
+el0_undef:
+	B ·Shutdown(SB)
+
+el0_dbg:
+	B ·Shutdown(SB)
+
+el0_invalid:
+	B ·Shutdown(SB)
+
+TEXT ·El0_irq(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·El0_fiq(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·El0_error(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·El0_sync_invalid(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·El0_irq_invalid(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·El0_fiq_invalid(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·El0_error_invalid(SB),NOSPLIT,$0
+	B ·Shutdown(SB)
+
+TEXT ·Vectors(SB),NOSPLIT,$0
+	B ·El1_sync_invalid(SB)
+	nop31Instructions()
+	B ·El1_irq_invalid(SB)
+	nop31Instructions()
+	B ·El1_fiq_invalid(SB)
+	nop31Instructions()
+	B ·El1_error_invalid(SB)
+	nop31Instructions()
+
+	B ·El1_sync(SB)
+	nop31Instructions()
+	B ·El1_irq(SB)
+	nop31Instructions()
+	B ·El1_fiq(SB)
+	nop31Instructions()
+	B ·El1_error(SB)
+	nop31Instructions()
+
+	B ·El0_sync(SB)
+	nop31Instructions()
+	B ·El0_irq(SB)
+	nop31Instructions()
+	B ·El0_fiq(SB)
+	nop31Instructions()
+	B ·El0_error(SB)
+	nop31Instructions()
+
+	B ·El0_sync_invalid(SB)
+	nop31Instructions()
+	B ·El0_irq_invalid(SB)
+	nop31Instructions()
+	B ·El0_fiq_invalid(SB)
+	nop31Instructions()
+	B ·El0_error_invalid(SB)
+	nop31Instructions()
+
+	WORD $0xd503201f	//nop
+	nop31Instructions()
+	WORD $0xd503201f
+	nop31Instructions()
+	WORD $0xd503201f
+	nop31Instructions()
+	WORD $0xd503201f
+	nop31Instructions()
+
+	WORD $0xd503201f
+	nop31Instructions()
+	WORD $0xd503201f
+	nop31Instructions()
+	WORD $0xd503201f
+	nop31Instructions()
+	WORD $0xd503201f
+	nop31Instructions()
+
+	WORD $0xd503201f
+	nop31Instructions()
+	WORD $0xd503201f
+	nop31Instructions()
+	WORD $0xd503201f
+	nop31Instructions()
+	WORD $0xd503201f
+	nop31Instructions()
+
+	WORD $0xd503201f
+	nop31Instructions()
+	WORD $0xd503201f
+	nop31Instructions()
+	WORD $0xd503201f
+	nop31Instructions()
+	WORD $0xd503201f
+	nop31Instructions()
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
new file mode 100644
index 000000000..ed82a131e
--- /dev/null
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -0,0 +1,58 @@
+// Copyright 2019 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package ring0
+
+// init initializes architecture-specific state.
+func (k *Kernel) init(opts KernelOpts) {
+	// Save the root page tables.
+	k.PageTables = opts.PageTables
+}
+
+// init initializes architecture-specific state.
+func (c *CPU) init() {
+	// Set the kernel stack pointer(virtual address).
+	c.registers.Sp = uint64(c.StackTop())
+
+}
+
+// StackTop returns the kernel's stack address.
+//
+//go:nosplit
+func (c *CPU) StackTop() uint64 {
+	return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack))
+}
+
+// IsCanonical indicates whether addr is canonical per the arm64 spec.
+//
+//go:nosplit
+func IsCanonical(addr uint64) bool {
+	return addr <= 0x0000ffffffffffff || addr > 0xffff000000000000
+}
+
+//go:nosplit
+func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
+	// Sanitize registers.
+	regs := switchOpts.Registers
+
+	regs.Pstate &= ^uint64(UserFlagsClear)
+	regs.Pstate |= UserFlagsSet
+	kernelExitToEl0()
+	vector = c.vecCode
+
+	// Perform the switch.
+	return
+}
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
new file mode 100644
index 000000000..900ee6380
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -0,0 +1,25 @@
+// Copyright 2019 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package ring0
+
+// LoadFloatingPoint loads floating point state by the most efficient mechanism
+// available (set by Init).
+var LoadFloatingPoint func(*byte)
+
+// SaveFloatingPoint saves floating point state by the most efficient mechanism
+// available (set by Init).
+var SaveFloatingPoint func(*byte)
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
new file mode 100644
index 000000000..d7aa1c7cc
--- /dev/null
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -0,0 +1,124 @@
+// Copyright 2019 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package ring0
+
+import (
+	"fmt"
+	"io"
+	"reflect"
+	"syscall"
+)
+
+// Emit prints architecture-specific offsets.
+func Emit(w io.Writer) {
+	fmt.Fprintf(w, "// Automatically generated, do not edit.\n")
+
+	c := &CPU{}
+	fmt.Fprintf(w, "\n// CPU offsets.\n")
+	fmt.Fprintf(w, "#define CPU_SELF             0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer())
+	fmt.Fprintf(w, "#define CPU_REGISTERS        0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer())
+	fmt.Fprintf(w, "#define CPU_STACK_TOP        0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack)))
+	fmt.Fprintf(w, "#define CPU_ERROR_CODE       0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer())
+	fmt.Fprintf(w, "#define CPU_ERROR_TYPE       0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer())
+	fmt.Fprintf(w, "#define CPU_FAULT_ADDR       0x%02x\n", reflect.ValueOf(&c.faultAddr).Pointer()-reflect.ValueOf(c).Pointer())
+	fmt.Fprintf(w, "#define CPU_TTBR0_KVM	     0x%02x\n", reflect.ValueOf(&c.ttbr0Kvm).Pointer()-reflect.ValueOf(c).Pointer())
+	fmt.Fprintf(w, "#define CPU_TTBR0_APP        0x%02x\n", reflect.ValueOf(&c.ttbr0App).Pointer()-reflect.ValueOf(c).Pointer())
+	fmt.Fprintf(w, "#define CPU_VECTOR_CODE      0x%02x\n", reflect.ValueOf(&c.vecCode).Pointer()-reflect.ValueOf(c).Pointer())
+	fmt.Fprintf(w, "#define CPU_APP_ADDR         0x%02x\n", reflect.ValueOf(&c.appAddr).Pointer()-reflect.ValueOf(c).Pointer())
+
+	fmt.Fprintf(w, "\n// Bits.\n")
+	fmt.Fprintf(w, "#define _KERNEL_FLAGS        0x%02x\n", KernelFlagsSet)
+
+	fmt.Fprintf(w, "\n// Vectors.\n")
+	fmt.Fprintf(w, "#define El1SyncInvalid  0x%02x\n", El1SyncInvalid)
+	fmt.Fprintf(w, "#define El1IrqInvalid 0x%02x\n", El1IrqInvalid)
+	fmt.Fprintf(w, "#define El1FiqInvalid 0x%02x\n", El1FiqInvalid)
+	fmt.Fprintf(w, "#define El1ErrorInvalid 0x%02x\n", El1ErrorInvalid)
+
+	fmt.Fprintf(w, "#define El1Sync 0x%02x\n", El1Sync)
+	fmt.Fprintf(w, "#define El1Irq 0x%02x\n", El1Irq)
+	fmt.Fprintf(w, "#define El1Fiq 0x%02x\n", El1Fiq)
+	fmt.Fprintf(w, "#define El1Error 0x%02x\n", El1Error)
+
+	fmt.Fprintf(w, "#define El0Sync 0x%02x\n", El0Sync)
+	fmt.Fprintf(w, "#define El0Irq 0x%02x\n", El0Irq)
+	fmt.Fprintf(w, "#define El0Fiq 0x%02x\n", El0Fiq)
+	fmt.Fprintf(w, "#define El0Error 0x%02x\n", El0Error)
+
+	fmt.Fprintf(w, "#define El0Sync_invalid 0x%02x\n", El0Sync_invalid)
+	fmt.Fprintf(w, "#define El0Irq_invalid 0x%02x\n", El0Irq_invalid)
+	fmt.Fprintf(w, "#define El0Fiq_invalid 0x%02x\n", El0Fiq_invalid)
+	fmt.Fprintf(w, "#define El0Error_invalid 0x%02x\n", El0Error_invalid)
+
+	fmt.Fprintf(w, "#define El1Sync_da 0x%02x\n", El1Sync_da)
+	fmt.Fprintf(w, "#define El1Sync_ia 0x%02x\n", El1Sync_ia)
+	fmt.Fprintf(w, "#define El1Sync_sp_pc 0x%02x\n", El1Sync_sp_pc)
+	fmt.Fprintf(w, "#define El1Sync_undef 0x%02x\n", El1Sync_undef)
+	fmt.Fprintf(w, "#define El1Sync_dbg 0x%02x\n", El1Sync_dbg)
+	fmt.Fprintf(w, "#define El1Sync_inv 0x%02x\n", El1Sync_inv)
+
+	fmt.Fprintf(w, "#define El0Sync_svc 0x%02x\n", El0Sync_svc)
+	fmt.Fprintf(w, "#define El0Sync_da 0x%02x\n", El0Sync_da)
+	fmt.Fprintf(w, "#define El0Sync_ia 0x%02x\n", El0Sync_ia)
+	fmt.Fprintf(w, "#define El0Sync_fpsimd_acc 0x%02x\n", El0Sync_fpsimd_acc)
+	fmt.Fprintf(w, "#define El0Sync_sve_acc 0x%02x\n", El0Sync_sve_acc)
+	fmt.Fprintf(w, "#define El0Sync_sys 0x%02x\n", El0Sync_sys)
+	fmt.Fprintf(w, "#define El0Sync_sp_pc 0x%02x\n", El0Sync_sp_pc)
+	fmt.Fprintf(w, "#define El0Sync_undef 0x%02x\n", El0Sync_undef)
+	fmt.Fprintf(w, "#define El0Sync_dbg 0x%02x\n", El0Sync_dbg)
+	fmt.Fprintf(w, "#define El0Sync_inv 0x%02x\n", El0Sync_inv)
+
+	fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault)
+	fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall)
+
+	p := &syscall.PtraceRegs{}
+	fmt.Fprintf(w, "\n// Ptrace registers.\n")
+	fmt.Fprintf(w, "#define PTRACE_R0       0x%02x\n", reflect.ValueOf(&p.Regs[0]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R1       0x%02x\n", reflect.ValueOf(&p.Regs[1]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R2       0x%02x\n", reflect.ValueOf(&p.Regs[2]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R3       0x%02x\n", reflect.ValueOf(&p.Regs[3]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R4       0x%02x\n", reflect.ValueOf(&p.Regs[4]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R5       0x%02x\n", reflect.ValueOf(&p.Regs[5]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R6       0x%02x\n", reflect.ValueOf(&p.Regs[6]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R7       0x%02x\n", reflect.ValueOf(&p.Regs[7]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R8       0x%02x\n", reflect.ValueOf(&p.Regs[8]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R9       0x%02x\n", reflect.ValueOf(&p.Regs[9]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R10      0x%02x\n", reflect.ValueOf(&p.Regs[10]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R11      0x%02x\n", reflect.ValueOf(&p.Regs[11]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R12      0x%02x\n", reflect.ValueOf(&p.Regs[12]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R13      0x%02x\n", reflect.ValueOf(&p.Regs[13]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R14      0x%02x\n", reflect.ValueOf(&p.Regs[14]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R15      0x%02x\n", reflect.ValueOf(&p.Regs[15]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R16      0x%02x\n", reflect.ValueOf(&p.Regs[16]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R17      0x%02x\n", reflect.ValueOf(&p.Regs[17]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R18      0x%02x\n", reflect.ValueOf(&p.Regs[18]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R19      0x%02x\n", reflect.ValueOf(&p.Regs[19]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R20      0x%02x\n", reflect.ValueOf(&p.Regs[20]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R21      0x%02x\n", reflect.ValueOf(&p.Regs[21]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R22      0x%02x\n", reflect.ValueOf(&p.Regs[22]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R23      0x%02x\n", reflect.ValueOf(&p.Regs[23]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R24      0x%02x\n", reflect.ValueOf(&p.Regs[24]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R25      0x%02x\n", reflect.ValueOf(&p.Regs[25]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R26      0x%02x\n", reflect.ValueOf(&p.Regs[26]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R27      0x%02x\n", reflect.ValueOf(&p.Regs[27]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R28      0x%02x\n", reflect.ValueOf(&p.Regs[28]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R29      0x%02x\n", reflect.ValueOf(&p.Regs[29]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_R30      0x%02x\n", reflect.ValueOf(&p.Regs[30]).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_SP       0x%02x\n", reflect.ValueOf(&p.Sp).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_PC       0x%02x\n", reflect.ValueOf(&p.Pc).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_PSTATE   0x%02x\n", reflect.ValueOf(&p.Pstate).Pointer()-reflect.ValueOf(p).Pointer())
+}
-- 
cgit v1.2.3


From 345f140169dc59bfc73bc522f4877f1683b2558a Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Fri, 20 Sep 2019 14:37:33 +0000
Subject: Optimize kvm/physical_map.go on Arm platform

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/kvm/BUILD                 |  2 ++
 pkg/sentry/platform/kvm/physical_map.go       | 12 +-----------
 pkg/sentry/platform/kvm/physical_map_amd64.go | 22 ++++++++++++++++++++++
 pkg/sentry/platform/kvm/physical_map_arm64.go | 19 +++++++++++++++++++
 4 files changed, 44 insertions(+), 11 deletions(-)
 create mode 100644 pkg/sentry/platform/kvm/physical_map_amd64.go
 create mode 100644 pkg/sentry/platform/kvm/physical_map_arm64.go

diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index fe979dccf..2046df525 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -24,6 +24,8 @@ go_library(
         "machine_amd64_unsafe.go",
         "machine_unsafe.go",
         "physical_map.go",
+        "physical_map_amd64.go",
+        "physical_map_arm64.go",
         "virtual_map.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/platform/kvm",
diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go
index 586e91bb2..91de5dab1 100644
--- a/pkg/sentry/platform/kvm/physical_map.go
+++ b/pkg/sentry/platform/kvm/physical_map.go
@@ -24,15 +24,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
-const (
-	// reservedMemory is a chunk of physical memory reserved starting at
-	// physical address zero. There are some special pages in this region,
-	// so we just call the whole thing off.
-	//
-	// Other architectures may define this to be zero.
-	reservedMemory = 0x100000000
-)
-
 type region struct {
 	virtual uintptr
 	length  uintptr
@@ -59,8 +50,7 @@ func fillAddressSpace() (excludedRegions []region) {
 	// We can cut vSize in half, because the kernel will be using the top
 	// half and we ignore it while constructing mappings. It's as if we've
 	// already excluded half the possible addresses.
-	vSize := uintptr(1) << ring0.VirtualAddressBits()
-	vSize = vSize >> 1
+	vSize := ring0.UserspaceSize
 
 	// We exclude reservedMemory below from our physical memory size, so it
 	// needs to be dropped here as well. Otherwise, we could end up with
diff --git a/pkg/sentry/platform/kvm/physical_map_amd64.go b/pkg/sentry/platform/kvm/physical_map_amd64.go
new file mode 100644
index 000000000..c5adfb577
--- /dev/null
+++ b/pkg/sentry/platform/kvm/physical_map_amd64.go
@@ -0,0 +1,22 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+const (
+	// reservedMemory is a chunk of physical memory reserved starting at
+	// physical address zero. There are some special pages in this region,
+	// so we just call the whole thing off.
+	reservedMemory = 0x100000000
+)
diff --git a/pkg/sentry/platform/kvm/physical_map_arm64.go b/pkg/sentry/platform/kvm/physical_map_arm64.go
new file mode 100644
index 000000000..4d8561453
--- /dev/null
+++ b/pkg/sentry/platform/kvm/physical_map_arm64.go
@@ -0,0 +1,19 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+const (
+	reservedMemory = 0
+)
-- 
cgit v1.2.3


From c0065e296f6e840ec1f6797fb0fd55cde0fff785 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Wed, 23 Oct 2019 12:58:40 -0700
Subject: Remove comparison between signed and unsigned int

Some compilers don't like the comparison between int and size_t. Remove it.

The other changes are minor style cleanups.

PiperOrigin-RevId: 276333450
---
 test/syscalls/linux/sendfile_socket.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc
index 1c56540bc..3331288b7 100644
--- a/test/syscalls/linux/sendfile_socket.cc
+++ b/test/syscalls/linux/sendfile_socket.cc
@@ -185,7 +185,7 @@ TEST_P(SendFileTest, Shutdown) {
   // Create a socket.
   std::tuple<int, int> fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets());
   const FileDescriptor client(std::get<0>(fds));
-  FileDescriptor server(std::get<1>(fds));  // non-const, released below.
+  FileDescriptor server(std::get<1>(fds));  // non-const, reset below.
 
   // If this is a TCP socket, then turn off linger.
   if (GetParam() == AF_INET) {
@@ -210,14 +210,14 @@ TEST_P(SendFileTest, Shutdown) {
   // checking the contents (other tests do that), so we just re-use the same
   // buffer as above.
   ScopedThread t([&]() {
-    int done = 0;
+    size_t done = 0;
     while (done < data.size()) {
-      int n = read(server.get(), data.data(), data.size());
+      int n = RetryEINTR(read)(server.get(), data.data(), data.size());
       ASSERT_THAT(n, SyscallSucceeds());
       done += n;
     }
     // Close the server side socket.
-    ASSERT_THAT(close(server.release()), SyscallSucceeds());
+    server.reset();
   });
 
   // Continuously stream from the file to the socket. Note we do not assert
-- 
cgit v1.2.3


From de3dbf8a09afdccba75d7ca3c129ce33c569c086 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 23 Oct 2019 13:25:14 -0700
Subject: Inform netstack integrator when Duplicate Address Detection completes

This change introduces a new interface, stack.NDPDispatcher. It can be
implemented by the netstack integrator to receive NDP related events. As of this
change, only DAD related events are supported.

Tests: Existing tests were modified to use the NDPDispatcher's DAD events for
DAD tests where it needed to wait for DAD completing (failing and resolving).
PiperOrigin-RevId: 276338733
---
 pkg/tcpip/network/ipv6/icmp.go |   8 +--
 pkg/tcpip/stack/ndp.go         | 118 +++++++++++++++++++++-----------
 pkg/tcpip/stack/ndp_test.go    | 149 +++++++++++++++++++++++++++++++++--------
 pkg/tcpip/stack/nic.go         |   3 +-
 pkg/tcpip/stack/stack.go       |   9 +++
 pkg/tcpip/stack/stack_test.go  |  46 ++++++++-----
 6 files changed, 245 insertions(+), 88 deletions(-)

diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 6c14b4aae..b289e902f 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -246,15 +246,15 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			return
 		}
 
-		// At this point we know that the targetAddress is not tentaive
+		// At this point we know that the targetAddress is not tentative
 		// on rxNICID. However, targetAddr may still be assigned to
 		// rxNICID but not tentative (it could be permanent). Such a
 		// scenario is beyond the scope of RFC 4862. As such, we simply
 		// ignore such a scenario for now and proceed as normal.
 		//
-		// TODO(b/140896005): Handle the scenario described above
-		// (inform the netstack integration that a duplicate address was
-		// was detected)
+		// TODO(b/143147598): Handle the scenario described above. Also
+		// inform the netstack integration that a duplicate address was
+		// detected outside of DAD.
 
 		e.linkAddrCache.AddLinkAddress(e.nicid, targetAddr, r.RemoteLinkAddress)
 		if targetAddr != r.RemoteAddress {
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 921d1c9c7..ea2dbed2e 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -51,6 +51,22 @@ const (
 	minimumRetransmitTimer = time.Millisecond
 )
 
+// NDPDispatcher is the interface integrators of netstack must implement to
+// receive and handle NDP related events.
+type NDPDispatcher interface {
+	// OnDuplicateAddressDetectionStatus will be called when the DAD process
+	// for an address (addr) on a NIC (with ID nicid) completes. resolved
+	// will be set to true if DAD completed successfully (no duplicate addr
+	// detected); false otherwise (addr was detected to be a duplicate on
+	// the link the NIC is a part of, or it was stopped for some other
+	// reason, such as the address being removed). If an error occured
+	// during DAD, err will be set and resolved must be ignored.
+	//
+	// This function is permitted to block indefinitely without interfering
+	// with the stack's operation.
+	OnDuplicateAddressDetectionStatus(nicid tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error)
+}
+
 // NDPConfigurations is the NDP configurations for the netstack.
 type NDPConfigurations struct {
 	// The number of Neighbor Solicitation messages to send when doing
@@ -88,6 +104,9 @@ func (c *NDPConfigurations) validate() {
 
 // ndpState is the per-interface NDP state.
 type ndpState struct {
+	// The NIC this ndpState is for.
+	nic *NIC
+
 	// The DAD state to send the next NS message, or resolve the address.
 	dad map[tcpip.Address]dadState
 }
@@ -110,8 +129,8 @@ type dadState struct {
 // This function must only be called by IPv6 addresses that are currently
 // tentative.
 //
-// The NIC that ndp belongs to (n) MUST be locked.
-func (ndp *ndpState) startDuplicateAddressDetection(n *NIC, addr tcpip.Address, ref *referencedNetworkEndpoint) *tcpip.Error {
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *referencedNetworkEndpoint) *tcpip.Error {
 	// addr must be a valid unicast IPv6 address.
 	if !header.IsV6UnicastAddress(addr) {
 		return tcpip.ErrAddressFamilyNotSupported
@@ -127,13 +146,13 @@ func (ndp *ndpState) startDuplicateAddressDetection(n *NIC, addr tcpip.Address,
 		// reference count would have been increased without doing the
 		// work that would have been done for an address that was brand
 		// new. See NIC.addPermanentAddressLocked.
-		panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, n.ID()))
+		panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID()))
 	}
 
-	remaining := n.stack.ndpConfigs.DupAddrDetectTransmits
+	remaining := ndp.nic.stack.ndpConfigs.DupAddrDetectTransmits
 
 	{
-		done, err := ndp.doDuplicateAddressDetection(n, addr, remaining, ref)
+		done, err := ndp.doDuplicateAddressDetection(addr, remaining, ref)
 		if err != nil {
 			return err
 		}
@@ -146,41 +165,59 @@ func (ndp *ndpState) startDuplicateAddressDetection(n *NIC, addr tcpip.Address,
 
 	var done bool
 	var timer *time.Timer
-	timer = time.AfterFunc(n.stack.ndpConfigs.RetransmitTimer, func() {
-		n.mu.Lock()
-		defer n.mu.Unlock()
+	timer = time.AfterFunc(ndp.nic.stack.ndpConfigs.RetransmitTimer, func() {
+		var d bool
+		var err *tcpip.Error
+
+		// doDadIteration does a single iteration of the DAD loop.
+		//
+		// Returns true if the integrator needs to be informed of DAD
+		// completing.
+		doDadIteration := func() bool {
+			ndp.nic.mu.Lock()
+			defer ndp.nic.mu.Unlock()
+
+			if done {
+				// If we reach this point, it means that the DAD
+				// timer fired after another goroutine already
+				// obtained the NIC lock and stopped DAD before
+				// this function obtained the NIC lock. Simply
+				// return here and do nothing further.
+				return false
+			}
 
-		if done {
-			// If we reach this point, it means that the DAD timer
-			// fired after another goroutine already obtained the
-			// NIC lock and stopped DAD before it this function
-			// obtained the NIC lock. Simply return here and do
-			// nothing further.
-			return
-		}
+			ref, ok := ndp.nic.endpoints[NetworkEndpointID{addr}]
+			if !ok {
+				// This should never happen.
+				// We should have an endpoint for addr since we
+				// are still performing DAD on it. If the
+				// endpoint does not exist, but we are doing DAD
+				// on it, then we started DAD at some point, but
+				// forgot to stop it when the endpoint was
+				// deleted.
+				panic(fmt.Sprintf("ndpdad: unrecognized addr %s for NIC(%d)", addr, ndp.nic.ID()))
+			}
 
-		ref, ok := n.endpoints[NetworkEndpointID{addr}]
-		if !ok {
-			// This should never happen.
-			// We should have an endpoint for addr since we are
-			// still performing DAD on it. If the endpoint does not
-			// exist, but we are doing DAD on it, then we started
-			// DAD at some point, but forgot to stop it when the
-			// endpoint was deleted.
-			panic(fmt.Sprintf("ndpdad: unrecognized addr %s for NIC(%d)", addr, n.ID()))
-		}
+			d, err = ndp.doDuplicateAddressDetection(addr, remaining, ref)
+			if err != nil || d {
+				delete(ndp.dad, addr)
 
-		if done, err := ndp.doDuplicateAddressDetection(n, addr, remaining, ref); err != nil || done {
-			if err != nil {
-				log.Printf("ndpdad: Error occured during DAD iteration for addr (%s) on NIC(%d); err = %s", addr, n.ID(), err)
+				if err != nil {
+					log.Printf("ndpdad: Error occured during DAD iteration for addr (%s) on NIC(%d); err = %s", addr, ndp.nic.ID(), err)
+				}
+
+				// Let the integrator know DAD has completed.
+				return true
 			}
 
-			ndp.stopDuplicateAddressDetection(addr)
-			return
+			remaining--
+			timer.Reset(ndp.nic.stack.ndpConfigs.RetransmitTimer)
+			return false
 		}
 
-		timer.Reset(n.stack.ndpConfigs.RetransmitTimer)
-		remaining--
+		if doDadIteration() && ndp.nic.stack.ndpDisp != nil {
+			ndp.nic.stack.ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, d, err)
+		}
 
 	})
 
@@ -204,11 +241,11 @@ func (ndp *ndpState) startDuplicateAddressDetection(n *NIC, addr tcpip.Address,
 // The NIC that ndp belongs to (n) MUST be locked.
 //
 // Returns true if DAD has resolved; false if DAD is still ongoing.
-func (ndp *ndpState) doDuplicateAddressDetection(n *NIC, addr tcpip.Address, remaining uint8, ref *referencedNetworkEndpoint) (bool, *tcpip.Error) {
+func (ndp *ndpState) doDuplicateAddressDetection(addr tcpip.Address, remaining uint8, ref *referencedNetworkEndpoint) (bool, *tcpip.Error) {
 	if ref.getKind() != permanentTentative {
 		// The endpoint should still be marked as tentative
 		// since we are still performing DAD on it.
-		panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, n.ID()))
+		panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.nic.ID()))
 	}
 
 	if remaining == 0 {
@@ -219,17 +256,17 @@ func (ndp *ndpState) doDuplicateAddressDetection(n *NIC, addr tcpip.Address, rem
 
 	// Send a new NS.
 	snmc := header.SolicitedNodeAddr(addr)
-	snmcRef, ok := n.endpoints[NetworkEndpointID{snmc}]
+	snmcRef, ok := ndp.nic.endpoints[NetworkEndpointID{snmc}]
 	if !ok {
 		// This should never happen as if we have the
 		// address, we should have the solicited-node
 		// address.
-		panic(fmt.Sprintf("ndpdad: NIC(%d) is not in the solicited-node multicast group (%s) but it has addr %s", n.ID(), snmc, addr))
+		panic(fmt.Sprintf("ndpdad: NIC(%d) is not in the solicited-node multicast group (%s) but it has addr %s", ndp.nic.ID(), snmc, addr))
 	}
 
 	// Use the unspecified address as the source address when performing
 	// DAD.
-	r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, n.linkEP.LinkAddress(), snmcRef, false, false)
+	r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), snmcRef, false, false)
 
 	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
 	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
@@ -275,5 +312,8 @@ func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) {
 
 	delete(ndp.dad, addr)
 
-	return
+	// Let the integrator know DAD did not resolve.
+	if ndp.nic.stack.ndpDisp != nil {
+		go ndp.nic.stack.ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, false, nil)
+	}
 }
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 8995fbfc3..b089ce2ae 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -67,6 +67,35 @@ func TestDADDisabled(t *testing.T) {
 	}
 }
 
+// ndpDADEvent is a set of parameters that was passed to
+// ndpDispatcher.OnDuplicateAddressDetectionStatus.
+type ndpDADEvent struct {
+	nicid    tcpip.NICID
+	addr     tcpip.Address
+	resolved bool
+	err      *tcpip.Error
+}
+
+var _ stack.NDPDispatcher = (*ndpDispatcher)(nil)
+
+// ndpDispatcher implements NDPDispatcher so tests can know when various NDP
+// related events happen for test purposes.
+type ndpDispatcher struct {
+	dadC chan ndpDADEvent
+}
+
+// Implements stack.NDPDispatcher.OnDuplicateAddressDetectionStatus.
+//
+// If the DAD event matches what we are expecting, send signal on n.dadC.
+func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicid tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) {
+	n.dadC <- ndpDADEvent{
+		nicid,
+		addr,
+		resolved,
+		err,
+	}
+}
+
 // TestDADResolve tests that an address successfully resolves after performing
 // DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
 // Included in the subtests is a test to make sure that an invalid
@@ -88,8 +117,12 @@ func TestDADResolve(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				dadC: make(chan ndpDADEvent),
+			}
 			opts := stack.Options{
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPDisp:          &ndpDisp,
 			}
 			opts.NDPConfigs.RetransmitTimer = test.retransTimer
 			opts.NDPConfigs.DupAddrDetectTransmits = test.dupAddrDetectTransmits
@@ -106,8 +139,7 @@ func TestDADResolve(t *testing.T) {
 
 			stat := s.Stats().ICMP.V6PacketsSent.NeighborSolicit
 
-			// Should have sent an NDP NS almost immediately.
-			time.Sleep(100 * time.Millisecond)
+			// Should have sent an NDP NS immediately.
 			if got := stat.Value(); got != 1 {
 				t.Fatalf("got NeighborSolicit = %d, want = 1", got)
 
@@ -123,16 +155,10 @@ func TestDADResolve(t *testing.T) {
 				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
 			}
 
-			// Wait for the remaining time - 500ms, to make sure
-			// the address is still not resolved. Note, we subtract
-			// 600ms because we already waited for 100ms earlier,
-			// so our remaining time is 100ms less than the expected
-			// time.
-			// (X - 100ms) - 500ms = X - 600ms
-			//
-			// TODO(b/140896005): Use events from the netstack to
-			// be signalled before DAD resolves.
-			time.Sleep(test.expectedRetransmitTimer*time.Duration(test.dupAddrDetectTransmits) - 600*time.Millisecond)
+			// Wait for the remaining time - some delta (500ms), to
+			// make sure the address is still not resolved.
+			const delta = 500 * time.Millisecond
+			time.Sleep(test.expectedRetransmitTimer*time.Duration(test.dupAddrDetectTransmits) - delta)
 			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
 			if err != nil {
 				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
@@ -141,13 +167,30 @@ func TestDADResolve(t *testing.T) {
 				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
 			}
 
-			// Wait for the remaining time + 250ms, at which point
-			// the address should be resolved. Note, the remaining
-			// time is 500ms. See above comments.
-			//
-			// TODO(b/140896005): Use events from the netstack to
-			// know immediately when DAD completes.
-			time.Sleep(750 * time.Millisecond)
+			// Wait for DAD to resolve.
+			select {
+			case <-time.After(2 * delta):
+				// We should get a resolution event after 500ms
+				// (delta) since we wait for 500ms less than the
+				// expected resolution time above to make sure
+				// that the address did not yet resolve. Waiting
+				// for 1s (2x delta) without a resolution event
+				// means something is wrong.
+				t.Fatal("timed out waiting for DAD resolution")
+			case e := <-ndpDisp.dadC:
+				if e.err != nil {
+					t.Fatal("got DAD error: ", e.err)
+				}
+				if e.nicid != 1 {
+					t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+				}
+				if e.addr != addr1 {
+					t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
+				}
+				if !e.resolved {
+					t.Fatal("got DAD event w/ resolved = false, want = true")
+				}
+			}
 			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
 			if err != nil {
 				t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
@@ -250,9 +293,14 @@ func TestDADFail(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				dadC: make(chan ndpDADEvent),
+			}
+			ndpConfigs := stack.DefaultNDPConfigurations()
 			opts := stack.Options{
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				NDPConfigs:       stack.DefaultNDPConfigurations(),
+				NDPConfigs:       ndpConfigs,
+				NDPDisp:          &ndpDisp,
 			}
 			opts.NDPConfigs.RetransmitTimer = time.Second * 2
 
@@ -286,8 +334,28 @@ func TestDADFail(t *testing.T) {
 				t.Fatalf("got stat = %d, want = 1", got)
 			}
 
-			// Wait 3 seconds to make sure that DAD did not resolve
-			time.Sleep(3 * time.Second)
+			// Wait for DAD to fail and make sure the address did
+			// not get resolved.
+			select {
+			case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+				// If we don't get a failure event after the
+				// expected resolution time + extra 1s buffer,
+				// something is wrong.
+				t.Fatal("timed out waiting for DAD failure")
+			case e := <-ndpDisp.dadC:
+				if e.err != nil {
+					t.Fatal("got DAD error: ", e.err)
+				}
+				if e.nicid != 1 {
+					t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+				}
+				if e.addr != addr1 {
+					t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
+				}
+				if e.resolved {
+					t.Fatal("got DAD event w/ resolved = true, want = false")
+				}
+			}
 			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
 			if err != nil {
 				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
@@ -302,11 +370,18 @@ func TestDADFail(t *testing.T) {
 // TestDADStop tests to make sure that the DAD process stops when an address is
 // removed.
 func TestDADStop(t *testing.T) {
+	ndpDisp := ndpDispatcher{
+		dadC: make(chan ndpDADEvent),
+	}
+	ndpConfigs := stack.NDPConfigurations{
+		RetransmitTimer:        time.Second,
+		DupAddrDetectTransmits: 2,
+	}
 	opts := stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPDisp:          &ndpDisp,
+		NDPConfigs:       ndpConfigs,
 	}
-	opts.NDPConfigs.RetransmitTimer = time.Second
-	opts.NDPConfigs.DupAddrDetectTransmits = 2
 
 	e := channel.New(10, 1280, linkAddr1)
 	s := stack.New(opts)
@@ -332,11 +407,27 @@ func TestDADStop(t *testing.T) {
 		t.Fatalf("RemoveAddress(_, %s) = %s", addr1, err)
 	}
 
-	// Wait for the time to normally resolve
-	// DupAddrDetectTransmits(2) * RetransmitTimer(1s) = 2s.
-	// An extra 250ms is added to make sure that if DAD was still running
-	// it resolves and the check below fails.
-	time.Sleep(2*time.Second + 250*time.Millisecond)
+	// Wait for DAD to fail (since the address was removed during DAD).
+	select {
+	case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+		// If we don't get a failure event after the expected resolution
+		// time + extra 1s buffer, something is wrong.
+		t.Fatal("timed out waiting for DAD failure")
+	case e := <-ndpDisp.dadC:
+		if e.err != nil {
+			t.Fatal("got DAD error: ", e.err)
+		}
+		if e.nicid != 1 {
+			t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+		}
+		if e.addr != addr1 {
+			t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
+		}
+		if e.resolved {
+			t.Fatal("got DAD event w/ resolved = true, want = false")
+		}
+
+	}
 	addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
 	if err != nil {
 		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index e456e05f4..2d29fa88e 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -108,6 +108,7 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback
 			dad: make(map[tcpip.Address]dadState),
 		},
 	}
+	nic.ndp.nic = nic
 
 	// Register supported packet endpoint protocols.
 	for _, netProto := range header.Ethertypes {
@@ -432,7 +433,7 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 
 	// If we are adding a tentative IPv6 address, start DAD.
 	if isIPv6Unicast && kind == permanentTentative {
-		if err := n.ndp.startDuplicateAddressDetection(n, protocolAddress.AddressWithPrefix.Address, ref); err != nil {
+		if err := n.ndp.startDuplicateAddressDetection(protocolAddress.AddressWithPrefix.Address, ref); err != nil {
 			return nil, err
 		}
 	}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 284280917..5ea432a24 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -406,6 +406,10 @@ type Stack struct {
 	// to auto-generate an IPv6 link-local address for newly enabled NICs.
 	// See the AutoGenIPv6LinkLocal field of Options for more details.
 	autoGenIPv6LinkLocal bool
+
+	// ndpDisp is the NDP event dispatcher that is used to send the netstack
+	// integrator NDP related events.
+	ndpDisp NDPDispatcher
 }
 
 // Options contains optional Stack configuration.
@@ -448,6 +452,10 @@ type Options struct {
 	// guidelines.
 	AutoGenIPv6LinkLocal bool
 
+	// NDPDisp is the NDP event dispatcher that an integrator can provide to
+	// receive NDP related events.
+	NDPDisp NDPDispatcher
+
 	// RawFactory produces raw endpoints. Raw endpoints are enabled only if
 	// this is non-nil.
 	RawFactory RawFactory
@@ -514,6 +522,7 @@ func New(opts Options) *Stack {
 		portSeed:             generateRandUint32(),
 		ndpConfigs:           opts.NDPConfigs,
 		autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
+		ndpDisp:              opts.NDPDisp,
 	}
 
 	// Add specified network protocols.
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 9a8906a0d..9dae853d0 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -1971,13 +1971,15 @@ func TestNICAutoGenAddr(t *testing.T) {
 // TestNICAutoGenAddrDoesDAD tests that the successful auto-generation of IPv6
 // link-local addresses will only be assigned after the DAD process resolves.
 func TestNICAutoGenAddrDoesDAD(t *testing.T) {
+	ndpDisp := ndpDispatcher{
+		dadC: make(chan ndpDADEvent),
+	}
+	ndpConfigs := stack.DefaultNDPConfigurations()
 	opts := stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			RetransmitTimer:        time.Second,
-			DupAddrDetectTransmits: 1,
-		},
+		NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs:           ndpConfigs,
 		AutoGenIPv6LinkLocal: true,
+		NDPDisp:              &ndpDisp,
 	}
 
 	e := channel.New(10, 1280, linkAddr1)
@@ -1996,21 +1998,35 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) {
 		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
 	}
 
-	// Wait for the address to resolve (an extra
-	// 250ms to make sure the address resolves).
-	//
-	// TODO(b/140896005): Use events from the
-	// netstack to know immediately when DAD
-	// completes.
-	time.Sleep(time.Second + 250*time.Millisecond)
+	linkLocalAddr := header.LinkLocalAddr(linkAddr1)
 
-	// Should have auto-generated an address and
-	// resolved (if DAD).
+	// Wait for DAD to resolve.
+	select {
+	case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+		// We should get a resolution event after 1s (default time to
+		// resolve as per default NDP configurations). Waiting for that
+		// resolution time + an extra 1s without a resolution event
+		// means something is wrong.
+		t.Fatal("timed out waiting for DAD resolution")
+	case e := <-ndpDisp.dadC:
+		if e.err != nil {
+			t.Fatal("got DAD error: ", e.err)
+		}
+		if e.nicid != 1 {
+			t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+		}
+		if e.addr != linkLocalAddr {
+			t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, linkLocalAddr)
+		}
+		if !e.resolved {
+			t.Fatal("got DAD event w/ resolved = false, want = true")
+		}
+	}
 	addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
 	if err != nil {
 		t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
 	}
-	if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddr(linkAddr1), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
+	if want := (tcpip.AddressWithPrefix{Address: linkLocalAddr, PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
 		t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
 	}
 }
-- 
cgit v1.2.3


From fbe6b50d5628bc3d522f87eee2abcc5a923df420 Mon Sep 17 00:00:00 2001
From: DarcySail <darcysail@gmail.com>
Date: Wed, 23 Oct 2019 14:26:23 -0700
Subject: Keep minimal available fd to accelerate fd allocation

Use fd.next to store the iteration start position, which can be used to accelerate allocating new FDs.
And adding the corresponding gtest benchmark to measure performance.
@tanjianfeng

COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/758 from DarcySail:master 96685ec7886dfe1a64988406831d3bc002b438cc
PiperOrigin-RevId: 276351250
---
 pkg/sentry/kernel/fd_table.go      | 22 ++++++++++++++++++++++
 pkg/sentry/kernel/fd_table_test.go | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index cc3f43a45..11f613a11 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -81,6 +81,9 @@ type FDTable struct {
 	// mu protects below.
 	mu sync.Mutex `state:"nosave"`
 
+	// next is start position to find fd.
+	next int32
+
 	// used contains the number of non-nil entries. It must be accessed
 	// atomically. It may be read atomically without holding mu (but not
 	// written).
@@ -226,6 +229,11 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
+	// From f.next to find available fd.
+	if fd < f.next {
+		fd = f.next
+	}
+
 	// Install all entries.
 	for i := fd; i < end && len(fds) < len(files); i++ {
 		if d, _, _ := f.get(i); d == nil {
@@ -242,6 +250,11 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 		return nil, syscall.EMFILE
 	}
 
+	if fd == f.next {
+		// Update next search start position.
+		f.next = fds[len(fds)-1] + 1
+	}
+
 	return fds, nil
 }
 
@@ -361,6 +374,11 @@ func (f *FDTable) Remove(fd int32) *fs.File {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
+	// Update current available position.
+	if fd < f.next {
+		f.next = fd
+	}
+
 	orig, _, _ := f.get(fd)
 	if orig != nil {
 		orig.IncRef()             // Reference for caller.
@@ -377,6 +395,10 @@ func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) {
 	f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
 		if cond(file, flags) {
 			f.set(fd, nil, FDFlags{}) // Clear from table.
+			// Update current available position.
+			if fd < f.next {
+				f.next = fd
+			}
 		}
 	})
 }
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index 2413788e7..2bcb6216a 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -70,6 +70,42 @@ func TestFDTableMany(t *testing.T) {
 		if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil {
 			t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err)
 		}
+
+		i := int32(2)
+		fdTable.Remove(i)
+		if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i {
+			t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err)
+		}
+	})
+}
+
+func TestFDTableOverLimit(t *testing.T) {
+	runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) {
+		if _, err := fdTable.NewFDs(ctx, maxFD, []*fs.File{file}, FDFlags{}); err == nil {
+			t.Fatalf("fdTable.NewFDs(maxFD, f): got nil, wanted error")
+		}
+
+		if _, err := fdTable.NewFDs(ctx, maxFD-2, []*fs.File{file, file, file}, FDFlags{}); err == nil {
+			t.Fatalf("fdTable.NewFDs(maxFD-2, {f,f,f}): got nil, wanted error")
+		}
+
+		if fds, err := fdTable.NewFDs(ctx, maxFD-3, []*fs.File{file, file, file}, FDFlags{}); err != nil {
+			t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err)
+		} else {
+			for _, fd := range fds {
+				fdTable.Remove(fd)
+			}
+		}
+
+		if fds, err := fdTable.NewFDs(ctx, maxFD-1, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != maxFD-1 {
+			t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err)
+		}
+
+		if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil {
+			t.Fatalf("Adding an FD to a resized map: got %v, want nil", err)
+		} else if len(fds) != 1 || fds[0] != 0 {
+			t.Fatalf("Added an FD to a resized map: got %v, want {1}", fds)
+		}
 	})
 }
 
-- 
cgit v1.2.3


From 072af49059a1818e0e06188be81fe425363acf55 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 23 Oct 2019 17:20:07 -0700
Subject: Add check for proper settings to AF_PACKET tests.

As in packet_socket_raw.cc, we should check that certain proc files are set
correctly.

PiperOrigin-RevId: 276384534
---
 test/syscalls/linux/packet_socket.cc | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index fcf64ee59..92ae55eec 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -130,6 +130,20 @@ void CookedPacketTest::SetUp() {
     GTEST_SKIP();
   }
 
+  if (!IsRunningOnGvisor()) {
+    FileDescriptor acceptLocal = ASSERT_NO_ERRNO_AND_VALUE(
+        Open("/proc/sys/net/ipv4/conf/lo/accept_local", O_RDONLY));
+    FileDescriptor routeLocalnet = ASSERT_NO_ERRNO_AND_VALUE(
+        Open("/proc/sys/net/ipv4/conf/lo/route_localnet", O_RDONLY));
+    char enabled;
+    ASSERT_THAT(read(acceptLocal.get(), &enabled, 1),
+                SyscallSucceedsWithValue(1));
+    ASSERT_EQ(enabled, '1');
+    ASSERT_THAT(read(routeLocalnet.get(), &enabled, 1),
+                SyscallSucceedsWithValue(1));
+    ASSERT_EQ(enabled, '1');
+  }
+
   ASSERT_THAT(socket_ = socket(AF_PACKET, SOCK_DGRAM, htons(GetParam())),
               SyscallSucceeds());
 }
-- 
cgit v1.2.3


From 7ca50236c42ad1b1aa19951815d03b62c0c722ed Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 23 Oct 2019 22:21:33 -0700
Subject: Handle AT_EMPTY_PATH flag in execveat.

PiperOrigin-RevId: 276419967
---
 pkg/sentry/syscalls/linux/linux64_amd64.go |  2 +-
 pkg/sentry/syscalls/linux/sys_thread.go    | 32 ++++++++++-----
 test/syscalls/linux/exec.cc                | 62 ++++++++++++++++++++++++++++++
 test/util/multiprocess_util.h              |  9 +++++
 4 files changed, 95 insertions(+), 10 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index aedb6d774..6d3801ad9 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -362,7 +362,7 @@ var AMD64 = &kernel.SyscallTable{
 		319: syscalls.Supported("memfd_create", MemfdCreate),
 		320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil),
 		321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
-		322: syscalls.PartiallySupported("execveat", Execveat, "No support for AT_EMPTY_PATH, AT_SYMLINK_FOLLOW.", nil),
+		322: syscalls.PartiallySupported("execveat", Execveat, "No support for AT_SYMLINK_FOLLOW.", nil),
 		323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
 		324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(b/118904897)
 		325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 6e425f1ec..7ece7ba6f 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -105,18 +105,26 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 		}
 	}
 
-	if flags != 0 {
-		// TODO(b/128449944): Handle AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW.
+	if flags&linux.AT_SYMLINK_NOFOLLOW != 0 {
+		// TODO(b/128449944): Handle AT_SYMLINK_NOFOLLOW.
 		t.Kernel().EmitUnimplementedEvent(t)
 		return 0, nil, syserror.ENOSYS
 	}
 
+	atEmptyPath := flags&linux.AT_EMPTY_PATH != 0
+	if !atEmptyPath && len(pathname) == 0 {
+		return 0, nil, syserror.ENOENT
+	}
+
 	root := t.FSContext().RootDirectory()
 	defer root.DecRef()
 
 	var wd *fs.Dirent
+	var executable *fs.File
 	if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) {
-		// If pathname is absolute, LoadTaskImage() will ignore the wd.
+		// Even if the pathname is absolute, we may still need the wd
+		// for interpreter scripts if the path of the interpreter is
+		// relative.
 		wd = t.FSContext().WorkingDirectory()
 	} else {
 		// Need to extract the given FD.
@@ -126,17 +134,23 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 		}
 		defer f.DecRef()
 
-		wd = f.Dirent
-		wd.IncRef()
-		if !fs.IsDir(wd.Inode.StableAttr) {
-			return 0, nil, syserror.ENOTDIR
+		if atEmptyPath && len(pathname) == 0 {
+			executable = f
+		} else {
+			wd = f.Dirent
+			wd.IncRef()
+			if !fs.IsDir(wd.Inode.StableAttr) {
+				return 0, nil, syserror.ENOTDIR
+			}
 		}
 	}
-	defer wd.DecRef()
+	if wd != nil {
+		defer wd.DecRef()
+	}
 
 	// Load the new TaskContext.
 	maxTraversals := uint(linux.MaxSymlinkTraversals)
-	tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, pathname, nil, argv, envv, t.Arch().FeatureSet())
+	tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, pathname, executable, argv, envv, t.Arch().FeatureSet())
 	if se != nil {
 		return 0, nil, se.ToError()
 	}
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 85734c290..03ec9f75f 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -550,6 +550,18 @@ TEST(ExecveatTest, Basic) {
                 ArgEnvExitStatus(0, 0), absl::StrCat(absolute_path, "\n"));
 }
 
+TEST(ExecveatTest, FDNotADirectory) {
+  std::string absolute_path = WorkloadPath(kBasicWorkload);
+  std::string relative_path = std::string(Basename(absolute_path));
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(absolute_path, 0));
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(fd.get(), relative_path,
+                                            {absolute_path}, {}, /*flags=*/0,
+                                            /*child=*/nullptr, &execve_errno));
+  EXPECT_EQ(execve_errno, ENOTDIR);
+}
+
 TEST(ExecveatTest, AbsolutePathWithFDCWD) {
   std::string path = WorkloadPath(kBasicWorkload);
   CheckExecveat(AT_FDCWD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0,
@@ -564,6 +576,56 @@ TEST(ExecveatTest, AbsolutePath) {
                 absl::StrCat(path, "\n"));
 }
 
+TEST(ExecveatTest, EmptyPathBasic) {
+  std::string path = WorkloadPath(kBasicWorkload);
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
+
+  CheckExecveat(fd.get(), "", {path}, {}, AT_EMPTY_PATH, ArgEnvExitStatus(0, 0),
+                absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, EmptyPathWithDirFD) {
+  std::string path = WorkloadPath(kBasicWorkload);
+  std::string parent_dir = std::string(Dirname(path));
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(dirfd.get(), "", {path}, {},
+                                            AT_EMPTY_PATH,
+                                            /*child=*/nullptr, &execve_errno));
+  EXPECT_EQ(execve_errno, EACCES);
+}
+
+TEST(ExecveatTest, EmptyPathWithoutEmptyPathFlag) {
+  std::string path = WorkloadPath(kBasicWorkload);
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(
+      fd.get(), "", {path}, {}, /*flags=*/0, /*child=*/nullptr, &execve_errno));
+  EXPECT_EQ(execve_errno, ENOENT);
+}
+
+TEST(ExecveatTest, AbsolutePathWithEmptyPathFlag) {
+  std::string path = WorkloadPath(kBasicWorkload);
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
+
+  CheckExecveat(fd.get(), path, {path}, {}, AT_EMPTY_PATH,
+                ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, RelativePathWithEmptyPathFlag) {
+  std::string absolute_path = WorkloadPath(kBasicWorkload);
+  std::string parent_dir = std::string(Dirname(absolute_path));
+  std::string relative_path = std::string(Basename(absolute_path));
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
+
+  CheckExecveat(dirfd.get(), relative_path, {absolute_path}, {}, AT_EMPTY_PATH,
+                ArgEnvExitStatus(0, 0), absl::StrCat(absolute_path, "\n"));
+}
+
 // Priority consistent across calls to execve()
 TEST(GetpriorityTest, ExecveMaintainsPriority) {
   int prio = 16;
diff --git a/test/util/multiprocess_util.h b/test/util/multiprocess_util.h
index c413d63ea..61526b4e7 100644
--- a/test/util/multiprocess_util.h
+++ b/test/util/multiprocess_util.h
@@ -109,6 +109,15 @@ PosixErrorOr<Cleanup> ForkAndExecveat(int32_t dirfd, const std::string& pathname
                                       const std::function<void()>& fn,
                                       pid_t* child, int* execve_errno);
 
+inline PosixErrorOr<Cleanup> ForkAndExecveat(int32_t dirfd,
+                                             const std::string& pathname,
+                                             const ExecveArray& argv,
+                                             const ExecveArray& envv, int flags,
+                                             pid_t* child, int* execve_errno) {
+  return ForkAndExecveat(
+      dirfd, pathname, argv, envv, flags, [] {}, child, execve_errno);
+}
+
 // Calls fn in a forked subprocess and returns the exit status of the
 // subprocess.
 //
-- 
cgit v1.2.3


From d9fd5363409facbc5cf04b85b3b0e7dade085dd9 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 24 Oct 2019 01:44:03 -0700
Subject: Handle AT_SYMLINK_NOFOLLOW flag for execveat.

PiperOrigin-RevId: 276441249
---
 pkg/sentry/kernel/kernel.go                |  2 +-
 pkg/sentry/kernel/task_context.go          |  4 +-
 pkg/sentry/loader/elf.go                   |  2 +-
 pkg/sentry/loader/loader.go                | 24 ++++++---
 pkg/sentry/syscalls/linux/linux64_amd64.go |  2 +-
 pkg/sentry/syscalls/linux/sys_thread.go    | 10 ++--
 test/syscalls/linux/exec.cc                | 79 +++++++++++++++++++++++++++---
 7 files changed, 96 insertions(+), 27 deletions(-)

diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 3cda03891..d70ad5c09 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -805,7 +805,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	// Create a fresh task context.
 	remainingTraversals = uint(args.MaxSymlinkTraversals)
 
-	tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet)
+	tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, true /*resolveFinal*/, k.featureSet)
 	if se != nil {
 		return nil, 0, errors.New(se.String())
 	}
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 8639d379f..1da718b27 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -145,7 +145,7 @@ func (t *Task) Stack() *arch.Stack {
 //  * argv: Binary argv
 //  * envv: Binary envv
 //  * fs: Binary FeatureSet
-func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, file *fs.File, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) {
+func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, file *fs.File, argv, envv []string, resolveFinal bool, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) {
 	// If File is not nil, we should load that instead of resolving filename.
 	if file != nil {
 		filename = file.MappedName(ctx)
@@ -155,7 +155,7 @@ func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, r
 	m := mm.NewMemoryManager(k, k)
 	defer m.DecUsers(ctx)
 
-	os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, envv, k.extraAuxv, k.vdso)
+	os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, envv, resolveFinal, k.extraAuxv, k.vdso)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 2d9251e92..86f6b269b 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -640,7 +640,7 @@ func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace
 
 	var interp loadedELF
 	if bin.interpreter != "" {
-		d, i, err := openPath(ctx, mounts, root, wd, maxTraversals, bin.interpreter)
+		d, i, err := openPath(ctx, mounts, root, wd, maxTraversals, bin.interpreter, true /*resolveFinal*/)
 		if err != nil {
 			ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err)
 			return loadedELF{}, nil, err
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 089d1635b..f5303491d 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -57,13 +57,19 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in
 // installed in the Task FDTable. The caller takes ownership of both.
 //
 // name must be a readable, executable, regular file.
-func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, name string) (*fs.Dirent, *fs.File, error) {
+func openPath(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, name string, resolveFinal bool) (*fs.Dirent, *fs.File, error) {
+	var err error
 	if name == "" {
 		ctx.Infof("cannot open empty name")
 		return nil, nil, syserror.ENOENT
 	}
 
-	d, err := mm.FindInode(ctx, root, wd, name, maxTraversals)
+	var d *fs.Dirent
+	if resolveFinal {
+		d, err = mounts.FindInode(ctx, root, wd, name, maxTraversals)
+	} else {
+		d, err = mounts.FindLink(ctx, root, wd, name, maxTraversals)
+	}
 	if err != nil {
 		return nil, nil, err
 	}
@@ -71,10 +77,13 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m
 	// Open file will take a reference to Dirent, so destroy this one.
 	defer d.DecRef()
 
+	if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
+		return nil, nil, syserror.ELOOP
+	}
+
 	return openFile(ctx, nil, d, name)
 }
 
-// openFile performs checks on a file to be executed. If provided a *fs.File,
 // openFile takes that file's Dirent and performs checks on it. If provided a
 // *fs.Dirent and not a *fs.File, it creates a *fs.File object from the Dirent's
 // Inode and performs checks on that.
@@ -181,7 +190,7 @@ const (
 //  * arch.Context matching the binary arch
 //  * fs.Dirent of the binary file
 //  * Possibly updated argv
-func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, features *cpuid.FeatureSet, filename string, passedFile *fs.File, argv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
+func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, features *cpuid.FeatureSet, filename string, passedFile *fs.File, argv []string, resolveFinal bool) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
 	for i := 0; i < maxLoaderAttempts; i++ {
 		var (
 			d   *fs.Dirent
@@ -189,8 +198,7 @@ func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamesp
 			err error
 		)
 		if passedFile == nil {
-			d, f, err = openPath(ctx, mounts, root, wd, remainingTraversals, filename)
-
+			d, f, err = openPath(ctx, mounts, root, wd, remainingTraversals, filename, resolveFinal)
 		} else {
 			d, f, err = openFile(ctx, passedFile, nil, "")
 			// Set to nil in case we loop on a Interpreter Script.
@@ -255,9 +263,9 @@ func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamesp
 // Preconditions:
 //  * The Task MemoryManager is empty.
 //  * Load is called on the Task goroutine.
-func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, file *fs.File, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
+func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, file *fs.File, argv, envv []string, resolveFinal bool, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
 	// Load the binary itself.
-	loaded, ac, d, argv, err := loadBinary(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv)
+	loaded, ac, d, argv, err := loadBinary(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, resolveFinal)
 	if err != nil {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", filename, err), syserr.FromError(err).ToLinux())
 	}
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 6d3801ad9..3021440ed 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -362,7 +362,7 @@ var AMD64 = &kernel.SyscallTable{
 		319: syscalls.Supported("memfd_create", MemfdCreate),
 		320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil),
 		321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
-		322: syscalls.PartiallySupported("execveat", Execveat, "No support for AT_SYMLINK_FOLLOW.", nil),
+		322: syscalls.Supported("execveat", Execveat),
 		323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
 		324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(b/118904897)
 		325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 7ece7ba6f..effe16186 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -105,16 +105,14 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 		}
 	}
 
-	if flags&linux.AT_SYMLINK_NOFOLLOW != 0 {
-		// TODO(b/128449944): Handle AT_SYMLINK_NOFOLLOW.
-		t.Kernel().EmitUnimplementedEvent(t)
-		return 0, nil, syserror.ENOSYS
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
 	}
-
 	atEmptyPath := flags&linux.AT_EMPTY_PATH != 0
 	if !atEmptyPath && len(pathname) == 0 {
 		return 0, nil, syserror.ENOENT
 	}
+	resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0
 
 	root := t.FSContext().RootDirectory()
 	defer root.DecRef()
@@ -150,7 +148,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 
 	// Load the new TaskContext.
 	maxTraversals := uint(linux.MaxSymlinkTraversals)
-	tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, pathname, executable, argv, envv, t.Arch().FeatureSet())
+	tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, pathname, executable, argv, envv, resolveFinal, t.Arch().FeatureSet())
 	if se != nil {
 		return 0, nil, se.ToError()
 	}
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 03ec9f75f..21a5ffd40 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -542,23 +542,23 @@ TEST(ExecveatTest, BasicWithFDCWD) {
 TEST(ExecveatTest, Basic) {
   std::string absolute_path = WorkloadPath(kBasicWorkload);
   std::string parent_dir = std::string(Dirname(absolute_path));
-  std::string relative_path = std::string(Basename(absolute_path));
+  std::string base = std::string(Basename(absolute_path));
   const FileDescriptor dirfd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
 
-  CheckExecveat(dirfd.get(), relative_path, {absolute_path}, {}, /*flags=*/0,
+  CheckExecveat(dirfd.get(), base, {absolute_path}, {}, /*flags=*/0,
                 ArgEnvExitStatus(0, 0), absl::StrCat(absolute_path, "\n"));
 }
 
 TEST(ExecveatTest, FDNotADirectory) {
   std::string absolute_path = WorkloadPath(kBasicWorkload);
-  std::string relative_path = std::string(Basename(absolute_path));
+  std::string base = std::string(Basename(absolute_path));
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(absolute_path, 0));
 
   int execve_errno;
-  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(fd.get(), relative_path,
-                                            {absolute_path}, {}, /*flags=*/0,
-                                            /*child=*/nullptr, &execve_errno));
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(fd.get(), base, {absolute_path}, {},
+                                            /*flags=*/0, /*child=*/nullptr,
+                                            &execve_errno));
   EXPECT_EQ(execve_errno, ENOTDIR);
 }
 
@@ -618,14 +618,77 @@ TEST(ExecveatTest, AbsolutePathWithEmptyPathFlag) {
 TEST(ExecveatTest, RelativePathWithEmptyPathFlag) {
   std::string absolute_path = WorkloadPath(kBasicWorkload);
   std::string parent_dir = std::string(Dirname(absolute_path));
-  std::string relative_path = std::string(Basename(absolute_path));
+  std::string base = std::string(Basename(absolute_path));
   const FileDescriptor dirfd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
 
-  CheckExecveat(dirfd.get(), relative_path, {absolute_path}, {}, AT_EMPTY_PATH,
+  CheckExecveat(dirfd.get(), base, {absolute_path}, {}, AT_EMPTY_PATH,
                 ArgEnvExitStatus(0, 0), absl::StrCat(absolute_path, "\n"));
 }
 
+TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) {
+  std::string parent_dir = "/tmp";
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(parent_dir, WorkloadPath(kBasicWorkload)));
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
+  std::string base = std::string(Basename(link.path()));
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(dirfd.get(), base, {base}, {},
+                                            AT_SYMLINK_NOFOLLOW,
+                                            /*child=*/nullptr, &execve_errno));
+  EXPECT_EQ(execve_errno, ELOOP);
+}
+
+TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) {
+  std::string parent_dir = "/tmp";
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(parent_dir, WorkloadPath(kBasicWorkload)));
+  std::string path = link.path();
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(AT_FDCWD, path, {path}, {},
+                                            AT_SYMLINK_NOFOLLOW,
+                                            /*child=*/nullptr, &execve_errno));
+  EXPECT_EQ(execve_errno, ELOOP);
+}
+
+TEST(ExecveatTest, SymlinkNoFollowAndEmptyPath) {
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+  std::string path = link.path();
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, 0));
+
+  CheckExecveat(fd.get(), "", {path}, {}, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
+                ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, SymlinkNoFollowIgnoreSymlinkAncestor) {
+  TempPath parent_link =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateSymlinkTo("/tmp", "/bin"));
+  std::string path_with_symlink = JoinPath(parent_link.path(), "echo");
+
+  CheckExecveat(AT_FDCWD, path_with_symlink, {path_with_symlink}, {},
+                AT_SYMLINK_NOFOLLOW, ArgEnvExitStatus(0, 0), "");
+}
+
+TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) {
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/bin", O_DIRECTORY));
+
+  CheckExecveat(dirfd.get(), "echo", {"echo"}, {}, AT_SYMLINK_NOFOLLOW,
+                ArgEnvExitStatus(0, 0), "");
+}
+
+TEST(ExecveatTest, InvalidFlags) {
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(
+      /*dirfd=*/-1, "", {}, {}, /*flags=*/0xFFFF, /*child=*/nullptr,
+      &execve_errno));
+  EXPECT_EQ(execve_errno, EINVAL);
+}
+
 // Priority consistent across calls to execve()
 TEST(GetpriorityTest, ExecveMaintainsPriority) {
   int prio = 16;
-- 
cgit v1.2.3


From 7f9c391cf10a8cad57666535ab9b6db4b9086235 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Thu, 24 Oct 2019 09:15:21 +0000
Subject: slight changes to pkg/abi

In glibc, some structures are defined differently on different
platforms.
Such as: C.struct_stat

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/abi/linux/BUILD         |  2 ++
 pkg/abi/linux/file.go       | 19 -------------------
 pkg/abi/linux/file_amd64.go | 34 ++++++++++++++++++++++++++++++++++
 pkg/abi/linux/file_arm64.go | 35 +++++++++++++++++++++++++++++++++++
 4 files changed, 71 insertions(+), 19 deletions(-)
 create mode 100644 pkg/abi/linux/file_amd64.go
 create mode 100644 pkg/abi/linux/file_arm64.go

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 7c17109a6..51774c6b6 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -23,6 +23,8 @@ go_library(
         "exec.go",
         "fcntl.go",
         "file.go",
+        "file_amd64.go",
+        "file_arm64.go",
         "fs.go",
         "futex.go",
         "inotify.go",
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index 257f67222..c9ee098f4 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -186,25 +186,6 @@ const (
 	RWF_VALID = RWF_HIPRI | RWF_DSYNC | RWF_SYNC
 )
 
-// Stat represents struct stat.
-type Stat struct {
-	Dev     uint64
-	Ino     uint64
-	Nlink   uint64
-	Mode    uint32
-	UID     uint32
-	GID     uint32
-	_       int32
-	Rdev    uint64
-	Size    int64
-	Blksize int64
-	Blocks  int64
-	ATime   Timespec
-	MTime   Timespec
-	CTime   Timespec
-	_       [3]int64
-}
-
 // SizeOfStat is the size of a Stat struct.
 var SizeOfStat = binary.Size(Stat{})
 
diff --git a/pkg/abi/linux/file_amd64.go b/pkg/abi/linux/file_amd64.go
new file mode 100644
index 000000000..74c554be6
--- /dev/null
+++ b/pkg/abi/linux/file_amd64.go
@@ -0,0 +1,34 @@
+// Copyright 2018 The gVisor Authors.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// Stat represents struct stat.
+type Stat struct {
+	Dev     uint64
+	Ino     uint64
+	Nlink   uint64
+	Mode    uint32
+	UID     uint32
+	GID     uint32
+	_       int32
+	Rdev    uint64
+	Size    int64
+	Blksize int64
+	Blocks  int64
+	ATime   Timespec
+	MTime   Timespec
+	CTime   Timespec
+	_       [3]int64
+}
diff --git a/pkg/abi/linux/file_arm64.go b/pkg/abi/linux/file_arm64.go
new file mode 100644
index 000000000..f16c07589
--- /dev/null
+++ b/pkg/abi/linux/file_arm64.go
@@ -0,0 +1,35 @@
+// Copyright 2019 The gVisor Authors.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// Stat represents struct stat.
+type Stat struct {
+	Dev     uint64
+	Ino     uint64
+	Mode    uint32
+	Nlink   uint32
+	UID     uint32
+	GID     uint32
+	Rdev    uint64
+	_       uint64
+	Size    int64
+	Blksize int32
+	_       int32
+	Blocks  int64
+	ATime   Timespec
+	MTime   Timespec
+	CTime   Timespec
+	_       [2]int32
+}
-- 
cgit v1.2.3


From f034790ad8c0af42bf510f6c9763e599ac64192d Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 24 Oct 2019 11:07:58 -0700
Subject: Use interface-specific NDP configurations instead of the stack-wide
 default.

This change makes it so that NDP work is done using the per-interface NDP
configurations instead of the stack-wide default NDP configurations to correctly
implement RFC 4861 section 6.3.2 (note here, a host is a single NIC operating
as a host device), and RFC 4862 section 5.1.

Test: Test that we can set NDP configurations on a per-interface basis without
affecting the configurations of other interfaces or the stack-wide default. Also
make sure that after the configurations are updated, the updated configurations
are used for NDP processes (e.g. Duplicate Address Detection).
PiperOrigin-RevId: 276525661
---
 pkg/tcpip/stack/ndp.go      |   8 ++-
 pkg/tcpip/stack/ndp_test.go | 166 ++++++++++++++++++++++++++++++++++++++++++++
 pkg/tcpip/stack/nic.go      |  20 +++++-
 pkg/tcpip/stack/stack.go    |  23 +++++-
 4 files changed, 211 insertions(+), 6 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index ea2dbed2e..03ddebdbd 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -107,6 +107,9 @@ type ndpState struct {
 	// The NIC this ndpState is for.
 	nic *NIC
 
+	// configs is the per-interface NDP configurations.
+	configs NDPConfigurations
+
 	// The DAD state to send the next NS message, or resolve the address.
 	dad map[tcpip.Address]dadState
 }
@@ -149,7 +152,7 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID()))
 	}
 
-	remaining := ndp.nic.stack.ndpConfigs.DupAddrDetectTransmits
+	remaining := ndp.configs.DupAddrDetectTransmits
 
 	{
 		done, err := ndp.doDuplicateAddressDetection(addr, remaining, ref)
@@ -165,7 +168,7 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 
 	var done bool
 	var timer *time.Timer
-	timer = time.AfterFunc(ndp.nic.stack.ndpConfigs.RetransmitTimer, func() {
+	timer = time.AfterFunc(ndp.configs.RetransmitTimer, func() {
 		var d bool
 		var err *tcpip.Error
 
@@ -218,7 +221,6 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		if doDadIteration() && ndp.nic.stack.ndpDisp != nil {
 			ndp.nic.stack.ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, d, err)
 		}
-
 	})
 
 	ndp.dad[addr] = dadState{
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index b089ce2ae..525a25218 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -31,6 +31,7 @@ import (
 const (
 	addr1     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
 	addr2     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+	addr3     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"
 	linkAddr1 = "\x02\x02\x03\x04\x05\x06"
 )
 
@@ -441,3 +442,168 @@ func TestDADStop(t *testing.T) {
 		t.Fatalf("got NeighborSolicit = %d, want <= 1", got)
 	}
 }
+
+// TestSetNDPConfigurationFailsForBadNICID tests to make sure we get an error if
+// we attempt to update NDP configurations using an invalid NICID.
+func TestSetNDPConfigurationFailsForBadNICID(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+	})
+
+	// No NIC with ID 1 yet.
+	if got := s.SetNDPConfigurations(1, stack.NDPConfigurations{}); got != tcpip.ErrUnknownNICID {
+		t.Fatalf("got s.SetNDPConfigurations = %v, want = %s", got, tcpip.ErrUnknownNICID)
+	}
+}
+
+// TestSetNDPConfigurations tests that we can update and use per-interface NDP
+// configurations without affecting the default NDP configurations or other
+// interfaces' configurations.
+func TestSetNDPConfigurations(t *testing.T) {
+	tests := []struct {
+		name                    string
+		dupAddrDetectTransmits  uint8
+		retransmitTimer         time.Duration
+		expectedRetransmitTimer time.Duration
+	}{
+		{
+			"OK",
+			1,
+			time.Second,
+			time.Second,
+		},
+		{
+			"Invalid Retransmit Timer",
+			1,
+			0,
+			time.Second,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				dadC: make(chan ndpDADEvent),
+			}
+			e := channel.New(10, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPDisp:          &ndpDisp,
+			})
+
+			// This NIC(1)'s NDP configurations will be updated to
+			// be different from the default.
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(1) = %s", err)
+			}
+
+			// Created before updating NIC(1)'s NDP configurations
+			// but updating NIC(1)'s NDP configurations should not
+			// affect other existing NICs.
+			if err := s.CreateNIC(2, e); err != nil {
+				t.Fatalf("CreateNIC(2) = %s", err)
+			}
+
+			// Update the NDP configurations on NIC(1) to use DAD.
+			configs := stack.NDPConfigurations{
+				DupAddrDetectTransmits: test.dupAddrDetectTransmits,
+				RetransmitTimer:        test.retransmitTimer,
+			}
+			if err := s.SetNDPConfigurations(1, configs); err != nil {
+				t.Fatalf("got SetNDPConfigurations(1, _) = %s", err)
+			}
+
+			// Created after updating NIC(1)'s NDP configurations
+			// but the stack's default NDP configurations should not
+			// have been updated.
+			if err := s.CreateNIC(3, e); err != nil {
+				t.Fatalf("CreateNIC(3) = %s", err)
+			}
+
+			// Add addresses for each NIC.
+			if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(1, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+			}
+			if err := s.AddAddress(2, header.IPv6ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(2, %d, %s) = %s", header.IPv6ProtocolNumber, addr2, err)
+			}
+			if err := s.AddAddress(3, header.IPv6ProtocolNumber, addr3); err != nil {
+				t.Fatalf("AddAddress(3, %d, %s) = %s", header.IPv6ProtocolNumber, addr3, err)
+			}
+
+			// Address should not be considered bound to NIC(1) yet
+			// (DAD ongoing).
+			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+			}
+
+			// Should get the address on NIC(2) and NIC(3)
+			// immediately since we should not have performed DAD on
+			// it as the stack was configured to not do DAD by
+			// default and we only updated the NDP configurations on
+			// NIC(1).
+			addr, err = s.GetMainNICAddress(2, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("stack.GetMainNICAddress(2, _) err = %s", err)
+			}
+			if addr.Address != addr2 {
+				t.Fatalf("got stack.GetMainNICAddress(2, _) = %s, want = %s", addr, addr2)
+			}
+			addr, err = s.GetMainNICAddress(3, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("stack.GetMainNICAddress(3, _) err = %s", err)
+			}
+			if addr.Address != addr3 {
+				t.Fatalf("got stack.GetMainNICAddress(3, _) = %s, want = %s", addr, addr3)
+			}
+
+			// Sleep until right (500ms before) before resolution to
+			// make sure the address didn't resolve on NIC(1) yet.
+			const delta = 500 * time.Millisecond
+			time.Sleep(time.Duration(test.dupAddrDetectTransmits)*test.expectedRetransmitTimer - delta)
+			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+			}
+
+			// Wait for DAD to resolve.
+			select {
+			case <-time.After(2 * delta):
+				// We should get a resolution event after 500ms
+				// (delta) since we wait for 500ms less than the
+				// expected resolution time above to make sure
+				// that the address did not yet resolve. Waiting
+				// for 1s (2x delta) without a resolution event
+				// means something is wrong.
+				t.Fatal("timed out waiting for DAD resolution")
+			case e := <-ndpDisp.dadC:
+				if e.err != nil {
+					t.Fatal("got DAD error: ", e.err)
+				}
+				if e.nicid != 1 {
+					t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+				}
+				if e.addr != addr1 {
+					t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
+				}
+				if !e.resolved {
+					t.Fatal("got DAD event w/ resolved = false, want = true")
+				}
+			}
+			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("stack.GetMainNICAddress(1, _) err = %s", err)
+			}
+			if addr.Address != addr1 {
+				t.Fatalf("got stack.GetMainNICAddress(1, _) = %s, want = %s", addr, addr1)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 2d29fa88e..a867f8c00 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -46,6 +46,10 @@ type NIC struct {
 
 	stats NICStats
 
+	// ndp is the NDP related state for NIC.
+	//
+	// Note, read and write operations on ndp require that the NIC is
+	// appropriately locked.
 	ndp ndpState
 }
 
@@ -80,6 +84,7 @@ const (
 	NeverPrimaryEndpoint
 )
 
+// newNIC returns a new NIC using the default NDP configurations from stack.
 func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback bool) *NIC {
 	// TODO(b/141011931): Validate a LinkEndpoint (ep) is valid. For
 	// example, make sure that the link address it provides is a valid
@@ -105,7 +110,8 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback
 			},
 		},
 		ndp: ndpState{
-			dad: make(map[tcpip.Address]dadState),
+			configs: stack.ndpConfigs,
+			dad:     make(map[tcpip.Address]dadState),
 		},
 	}
 	nic.ndp.nic = nic
@@ -937,6 +943,18 @@ func (n *NIC) dupTentativeAddrDetected(addr tcpip.Address) *tcpip.Error {
 	return n.removePermanentAddressLocked(addr)
 }
 
+// setNDPConfigs sets the NDP configurations for n.
+//
+// Note, if c contains invalid NDP configuration values, it will be fixed to
+// use default values for the erroneous values.
+func (n *NIC) setNDPConfigs(c NDPConfigurations) {
+	c.validate()
+
+	n.mu.Lock()
+	n.ndp.configs = c
+	n.mu.Unlock()
+}
+
 type networkEndpointKind int32
 
 const (
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 5ea432a24..242d2150c 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -399,7 +399,7 @@ type Stack struct {
 	// TODO(gvisor.dev/issue/940): S/R this field.
 	portSeed uint32
 
-	// ndpConfigs is the NDP configurations used by interfaces.
+	// ndpConfigs is the default NDP configurations used by interfaces.
 	ndpConfigs NDPConfigurations
 
 	// autoGenIPv6LinkLocal determines whether or not the stack will attempt
@@ -433,7 +433,7 @@ type Options struct {
 	// stack (false).
 	HandleLocal bool
 
-	// NDPConfigs is the NDP configurations used by interfaces.
+	// NDPConfigs is the default NDP configurations used by interfaces.
 	//
 	// By default, NDPConfigs will have a zero value for its
 	// DupAddrDetectTransmits field, implying that DAD will not be performed
@@ -1425,6 +1425,25 @@ func (s *Stack) DupTentativeAddrDetected(id tcpip.NICID, addr tcpip.Address) *tc
 	return nic.dupTentativeAddrDetected(addr)
 }
 
+// SetNDPConfigurations sets the per-interface NDP configurations on the NIC
+// with ID id to c.
+//
+// Note, if c contains invalid NDP configuration values, it will be fixed to
+// use default values for the erroneous values.
+func (s *Stack) SetNDPConfigurations(id tcpip.NICID, c NDPConfigurations) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	nic, ok := s.nics[id]
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+
+	nic.setNDPConfigs(c)
+
+	return nil
+}
+
 // PortSeed returns a 32 bit value that can be used as a seed value for port
 // picking.
 //
-- 
cgit v1.2.3


From e50a1f5739adc9bcb74456d365959ae718ff2197 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 24 Oct 2019 15:18:43 -0700
Subject: Remove the amss field from tcpip.tcp.handshake as it was unused

The amss field in the tcpip.tcp.handshake was not used anywhere. Removed it to
not cause confusion with the amss field in the tcpip.tcp.endpoint struct, which
was documented to be used (and is actually being used) for the same purpose.

PiperOrigin-RevId: 276577088
---
 pkg/tcpip/transport/tcp/connect.go | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 8db1cc028..790e89cc3 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -78,9 +78,6 @@ type handshake struct {
 	// mss is the maximum segment size received from the peer.
 	mss uint16
 
-	// amss is the maximum segment size advertised by us to the peer.
-	amss uint16
-
 	// sndWndScale is the send window scale, as defined in RFC 1323. A
 	// negative value means no scaling is supported by the peer.
 	sndWndScale int
-- 
cgit v1.2.3


From e8ba10c0085d404378ce649e018624b93cf4aa65 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 24 Oct 2019 16:35:29 -0700
Subject: Fix early deletion of rootDir

container.startContainers() cannot be called twice in a test
(e.g. TestMultiContainerLoadSandbox) because the cleanup
function deletes the rootDir, together with information from
all other containers that may exist.

PiperOrigin-RevId: 276591806
---
 runsc/container/container_test.go       |   3 +-
 runsc/container/multi_container_test.go | 198 +++++++++++++++++++++++++-------
 runsc/testutil/testutil.go              |   7 --
 test/root/oom_score_adj_test.go         |  28 +++--
 4 files changed, 176 insertions(+), 60 deletions(-)

diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index c4c56b2e0..07eacaac0 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -1548,7 +1548,8 @@ func TestAbbreviatedIDs(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfigWithRoot(rootDir)
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
 
 	cids := []string{
 		"foo-" + testutil.UniqueContainerID(),
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 9e02a825e..a5a62378c 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -60,13 +60,8 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 }
 
 func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
-	// Setup root dir if one hasn't been provided.
 	if len(conf.RootDir) == 0 {
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			return nil, nil, fmt.Errorf("error creating root dir: %v", err)
-		}
-		conf.RootDir = rootDir
+		panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
 	}
 
 	var containers []*Container
@@ -78,7 +73,6 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
 		for _, b := range bundles {
 			os.RemoveAll(b)
 		}
-		os.RemoveAll(conf.RootDir)
 	}
 	for i, spec := range specs {
 		bundleDir, err := testutil.SetupBundleDir(spec)
@@ -144,6 +138,13 @@ func TestMultiContainerSanity(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		specs, ids := createSpecs(sleep, sleep)
@@ -175,6 +176,13 @@ func TestMultiPIDNS(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		testSpecs, ids := createSpecs(sleep, sleep)
@@ -213,6 +221,13 @@ func TestMultiPIDNSPath(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		testSpecs, ids := createSpecs(sleep, sleep, sleep)
@@ -268,13 +283,21 @@ func TestMultiPIDNSPath(t *testing.T) {
 }
 
 func TestMultiContainerWait(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// The first container should run the entire duration of the test.
 	cmd1 := []string{"sleep", "100"}
 	// We'll wait on the second container, which is much shorter lived.
 	cmd2 := []string{"sleep", "1"}
 	specs, ids := createSpecs(cmd1, cmd2)
 
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -344,12 +367,14 @@ func TestExecWait(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// The first container should run the entire duration of the test.
 	cmd1 := []string{"sleep", "100"}
 	// We'll wait on the second container, which is much shorter lived.
 	cmd2 := []string{"sleep", "1"}
 	specs, ids := createSpecs(cmd1, cmd2)
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -432,7 +457,15 @@ func TestMultiContainerMount(t *testing.T) {
 	})
 
 	// Setup the containers.
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
 	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	containers, cleanup, err := startContainers(conf, sps, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -454,6 +487,13 @@ func TestMultiContainerSignal(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		specs, ids := createSpecs(sleep, sleep)
@@ -548,6 +588,13 @@ func TestMultiContainerDestroy(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// First container will remain intact while the second container is killed.
 		podSpecs, ids := createSpecs(
 			[]string{"sleep", "100"},
@@ -599,13 +646,21 @@ func TestMultiContainerDestroy(t *testing.T) {
 }
 
 func TestMultiContainerProcesses(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// Note: use curly braces to keep 'sh' process around. Otherwise, shell
 	// will just execve into 'sleep' and both containers will look the
 	// same.
 	specs, ids := createSpecs(
 		[]string{"sleep", "100"},
 		[]string{"sh", "-c", "{ sleep 100; }"})
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -650,6 +705,15 @@ func TestMultiContainerProcesses(t *testing.T) {
 // TestMultiContainerKillAll checks that all process that belong to a container
 // are killed when SIGKILL is sent to *all* processes in that container.
 func TestMultiContainerKillAll(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	for _, tc := range []struct {
 		killContainer bool
 	}{
@@ -665,7 +729,6 @@ func TestMultiContainerKillAll(t *testing.T) {
 		specs, ids := createSpecs(
 			[]string{app, "task-tree", "--depth=2", "--width=2"},
 			[]string{app, "task-tree", "--depth=4", "--width=2"})
-		conf := testutil.TestConfig()
 		containers, cleanup, err := startContainers(conf, specs, ids)
 		if err != nil {
 			t.Fatalf("error starting containers: %v", err)
@@ -739,19 +802,13 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) {
 	specs, ids := createSpecs(
 		[]string{"/bin/sleep", "100"},
 		[]string{"/bin/sleep", "100"})
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
-	conf := testutil.TestConfigWithRoot(rootDir)
 
-	// Create and start root container.
-	rootBundleDir, err := testutil.SetupBundleDir(specs[0])
+	conf := testutil.TestConfig()
+	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
+	defer os.RemoveAll(rootDir)
 	defer os.RemoveAll(rootBundleDir)
 
 	rootArgs := Args{
@@ -800,19 +857,12 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
 	}
 	specs, ids := createSpecs(cmds...)
 
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
-	conf := testutil.TestConfigWithRoot(rootDir)
-
-	// Create and start root container.
-	rootBundleDir, err := testutil.SetupBundleDir(specs[0])
+	conf := testutil.TestConfig()
+	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
+	defer os.RemoveAll(rootDir)
 	defer os.RemoveAll(rootBundleDir)
 
 	rootArgs := Args{
@@ -886,9 +936,17 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) {
 	script := fmt.Sprintf("if [ -f %q ]; then exit 1; else touch %q; fi", filename, filename)
 	cmd := []string{"sh", "-c", script}
 
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// Make sure overlay is enabled, and none of the root filesystems are
 	// read-only, otherwise we won't be able to create the file.
-	conf := testutil.TestConfig()
 	conf.Overlay = true
 	specs, ids := createSpecs(cmdRoot, cmd, cmd)
 	for _, s := range specs {
@@ -941,26 +999,21 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 	}
 	allSpecs, allIDs := createSpecs(cmds...)
 
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
 	// Split up the specs and IDs.
 	rootSpec := allSpecs[0]
 	rootID := allIDs[0]
 	childrenSpecs := allSpecs[1:]
 	childrenIDs := allIDs[1:]
 
-	bundleDir, err := testutil.SetupBundleDir(rootSpec)
+	conf := testutil.TestConfig()
+	rootDir, bundleDir, err := testutil.SetupContainer(rootSpec, conf)
 	if err != nil {
-		t.Fatalf("error setting up bundle dir: %v", err)
+		t.Fatalf("error setting up container: %v", err)
 	}
+	defer os.RemoveAll(rootDir)
 	defer os.RemoveAll(bundleDir)
 
 	// Start root container.
-	conf := testutil.TestConfigWithRoot(rootDir)
 	rootArgs := Args{
 		ID:        rootID,
 		Spec:      rootSpec,
@@ -1029,6 +1082,13 @@ func TestMultiContainerSharedMount(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		podSpec, ids := createSpecs(sleep, sleep)
@@ -1137,6 +1197,13 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		podSpec, ids := createSpecs(sleep, sleep)
@@ -1197,6 +1264,13 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		podSpec, ids := createSpecs(sleep, sleep)
@@ -1300,8 +1374,14 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 // Test that unsupported pod mounts options are ignored when matching master and
 // slave mounts.
 func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
 	conf := testutil.TestConfig()
-	t.Logf("Running test with conf: %+v", conf)
+	conf.RootDir = rootDir
 
 	// Setup the containers.
 	sleep := []string{"/bin/sleep", "100"}
@@ -1376,6 +1456,15 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 		Type:        "tmpfs",
 	}
 
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// Create the specs.
 	specs, ids := createSpecs(
 		[]string{"sleep", "1000"},
@@ -1386,7 +1475,6 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 	specs[1].Mounts = append(specs[2].Mounts, sharedMnt, writeableMnt)
 	specs[2].Mounts = append(specs[1].Mounts, sharedMnt)
 
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -1405,9 +1493,17 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 
 // Test that container is destroyed when Gofer is killed.
 func TestMultiContainerGoferKilled(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	sleep := []string{"sleep", "100"}
 	specs, ids := createSpecs(sleep, sleep, sleep)
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -1483,7 +1579,15 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 func TestMultiContainerLoadSandbox(t *testing.T) {
 	sleep := []string{"sleep", "100"}
 	specs, ids := createSpecs(sleep, sleep, sleep)
+
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
 	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
 
 	// Create containers for the sandbox.
 	wants, cleanup, err := startContainers(conf, specs, ids)
@@ -1576,7 +1680,15 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
 		Type:        "bind",
 	})
 
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
 	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	pod, cleanup, err := startContainers(conf, podSpecs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index 26467bdc7..9632776d2 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -151,13 +151,6 @@ func TestConfig() *boot.Config {
 	}
 }
 
-// TestConfigWithRoot returns the default configuration to use in tests.
-func TestConfigWithRoot(rootDir string) *boot.Config {
-	conf := TestConfig()
-	conf.RootDir = rootDir
-	return conf
-}
-
 // NewSpecWithArgs creates a simple spec with the given args suitable for use
 // in tests.
 func NewSpecWithArgs(args ...string) *specs.Spec {
diff --git a/test/root/oom_score_adj_test.go b/test/root/oom_score_adj_test.go
index 6cd378a1b..126f0975a 100644
--- a/test/root/oom_score_adj_test.go
+++ b/test/root/oom_score_adj_test.go
@@ -40,6 +40,15 @@ var (
 // TestOOMScoreAdjSingle tests that oom_score_adj is set properly in a
 // single container sandbox.
 func TestOOMScoreAdjSingle(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	ppid, err := specutils.GetParentPid(os.Getpid())
 	if err != nil {
 		t.Fatalf("getting parent pid: %v", err)
@@ -84,7 +93,6 @@ func TestOOMScoreAdjSingle(t *testing.T) {
 			s := testutil.NewSpecWithArgs("sleep", "1000")
 			s.Process.OOMScoreAdj = testCase.OOMScoreAdj
 
-			conf := testutil.TestConfig()
 			containers, cleanup, err := startContainers(conf, []*specs.Spec{s}, []string{id})
 			if err != nil {
 				t.Fatalf("error starting containers: %v", err)
@@ -123,6 +131,15 @@ func TestOOMScoreAdjSingle(t *testing.T) {
 // TestOOMScoreAdjMulti tests that oom_score_adj is set properly in a
 // multi-container sandbox.
 func TestOOMScoreAdjMulti(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	ppid, err := specutils.GetParentPid(os.Getpid())
 	if err != nil {
 		t.Fatalf("getting parent pid: %v", err)
@@ -240,7 +257,6 @@ func TestOOMScoreAdjMulti(t *testing.T) {
 				}
 			}
 
-			conf := testutil.TestConfig()
 			containers, cleanup, err := startContainers(conf, specs, ids)
 			if err != nil {
 				t.Fatalf("error starting containers: %v", err)
@@ -327,13 +343,8 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 }
 
 func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*container.Container, func(), error) {
-	// Setup root dir if one hasn't been provided.
 	if len(conf.RootDir) == 0 {
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			return nil, nil, fmt.Errorf("error creating root dir: %v", err)
-		}
-		conf.RootDir = rootDir
+		panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
 	}
 
 	var containers []*container.Container
@@ -345,7 +356,6 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*c
 		for _, b := range bundles {
 			os.RemoveAll(b)
 		}
-		os.RemoveAll(conf.RootDir)
 	}
 	for i, spec := range specs {
 		bundleDir, err := testutil.SetupBundleDir(spec)
-- 
cgit v1.2.3


From 27e896f2905eea612855b1c92d9b43ebaa09cbf3 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 24 Oct 2019 16:51:41 -0700
Subject: Add a type to represent the NDP Prefix Information option.

This change is in preparation for NDP Prefix Discovery and SLAAC where the stack
will need to handle NDP Prefix Information options.

Tests: Test that given an NDP Prefix Information option buffer, correct values
are returned by the field getters.
PiperOrigin-RevId: 276594592
---
 pkg/tcpip/header/ndp_options.go | 164 ++++++++++++++++++++++++++++++++++++++++
 pkg/tcpip/header/ndp_test.go    |  71 +++++++++++++++++
 2 files changed, 235 insertions(+)

diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
index b28bde15b..98310ea23 100644
--- a/pkg/tcpip/header/ndp_options.go
+++ b/pkg/tcpip/header/ndp_options.go
@@ -15,6 +15,9 @@
 package header
 
 import (
+	"encoding/binary"
+	"time"
+
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
@@ -27,6 +30,65 @@ const (
 	// Link Layer Option for an Ethernet address.
 	ndpTargetEthernetLinkLayerAddressSize = 8
 
+	// ndpPrefixInformationType is the type of the Prefix Information
+	// option, as per RFC 4861 section 4.6.2.
+	ndpPrefixInformationType = 3
+
+	// ndpPrefixInformationLength is the expected length, in bytes, of the
+	// body of an NDP Prefix Information option, as per RFC 4861 section
+	// 4.6.2 which specifies that the Length field is 4. Given this, the
+	// expected length, in bytes, is 30 becuase 4 * lengthByteUnits (8) - 2
+	// (Type & Length) = 30.
+	ndpPrefixInformationLength = 30
+
+	// ndpPrefixInformationPrefixLengthOffset is the offset of the Prefix
+	// Length field within an NDPPrefixInformation.
+	ndpPrefixInformationPrefixLengthOffset = 0
+
+	// ndpPrefixInformationFlagsOffset is the offset of the flags byte
+	// within an NDPPrefixInformation.
+	ndpPrefixInformationFlagsOffset = 1
+
+	// ndpPrefixInformationOnLinkFlagMask is the mask of the On-Link Flag
+	// field in the flags byte within an NDPPrefixInformation.
+	ndpPrefixInformationOnLinkFlagMask = (1 << 7)
+
+	// ndpPrefixInformationAutoAddrConfFlagMask is the mask of the
+	// Autonomous Address-Configuration flag field in the flags byte within
+	// an NDPPrefixInformation.
+	ndpPrefixInformationAutoAddrConfFlagMask = (1 << 6)
+
+	// ndpPrefixInformationReserved1FlagsMask is the mask of the Reserved1
+	// field in the flags byte within an NDPPrefixInformation.
+	ndpPrefixInformationReserved1FlagsMask = 63
+
+	// ndpPrefixInformationValidLifetimeOffset is the start of the 4-byte
+	// Valid Lifetime field within an NDPPrefixInformation.
+	ndpPrefixInformationValidLifetimeOffset = 2
+
+	// ndpPrefixInformationPreferredLifetimeOffset is the start of the
+	// 4-byte Preferred Lifetime field within an NDPPrefixInformation.
+	ndpPrefixInformationPreferredLifetimeOffset = 6
+
+	// ndpPrefixInformationReserved2Offset is the start of the 4-byte
+	// Reserved2 field within an NDPPrefixInformation.
+	ndpPrefixInformationReserved2Offset = 10
+
+	// ndpPrefixInformationReserved2Length is the length of the Reserved2
+	// field.
+	//
+	// It is 4 bytes.
+	ndpPrefixInformationReserved2Length = 4
+
+	// ndpPrefixInformationPrefixOffset is the start of the Prefix field
+	// within an NDPPrefixInformation.
+	ndpPrefixInformationPrefixOffset = 14
+
+	// NDPPrefixInformationInfiniteLifetime is a value that represents
+	// infinity for the Valid and Preferred Lifetime fields in a NDP Prefix
+	// Information option. Its value is (2^32 - 1)s = 4294967295s
+	NDPPrefixInformationInfiniteLifetime = time.Second * 4294967295
+
 	// lengthByteUnits is the multiplier factor for the Length field of an
 	// NDP option. That is, the length field for NDP options is in units of
 	// 8 octets, as per RFC 4861 section 4.6.
@@ -154,6 +216,9 @@ func (b NDPOptionsSerializer) Length() int {
 
 // NDPTargetLinkLayerAddressOption is the NDP Target Link Layer Option
 // as defined by RFC 4861 section 4.6.1.
+//
+// It is the first X bytes following the NDP option's Type and Length field
+// where X is the value in Length multiplied by lengthByteUnits - 2 bytes.
 type NDPTargetLinkLayerAddressOption tcpip.LinkAddress
 
 // Type implements ndpOption.Type.
@@ -170,3 +235,102 @@ func (o NDPTargetLinkLayerAddressOption) Length() int {
 func (o NDPTargetLinkLayerAddressOption) serializeInto(b []byte) int {
 	return copy(b, o)
 }
+
+// NDPPrefixInformation is the NDP Prefix Information option as defined by
+// RFC 4861 section 4.6.2.
+//
+// The length, in bytes, of a valid NDP Prefix Information option body MUST be
+// ndpPrefixInformationLength bytes.
+type NDPPrefixInformation []byte
+
+// Type implements ndpOption.Type.
+func (o NDPPrefixInformation) Type() uint8 {
+	return ndpPrefixInformationType
+}
+
+// Length implements ndpOption.Length.
+func (o NDPPrefixInformation) Length() int {
+	return ndpPrefixInformationLength
+}
+
+// serializeInto implements ndpOption.serializeInto.
+func (o NDPPrefixInformation) serializeInto(b []byte) int {
+	used := copy(b, o)
+
+	// Zero out the Reserved1 field.
+	b[ndpPrefixInformationFlagsOffset] &^= ndpPrefixInformationReserved1FlagsMask
+
+	// Zero out the Reserved2 field.
+	reserved2 := b[ndpPrefixInformationReserved2Offset:][:ndpPrefixInformationReserved2Length]
+	for i := range reserved2 {
+		reserved2[i] = 0
+	}
+
+	return used
+}
+
+// PrefixLength returns the value in the number of leading bits in the Prefix
+// that are valid.
+//
+// Valid values are in the range [0, 128], but o may not always contain valid
+// values. It is up to the caller to valdiate the Prefix Information option.
+func (o NDPPrefixInformation) PrefixLength() uint8 {
+	return o[ndpPrefixInformationPrefixLengthOffset]
+}
+
+// OnLinkFlag returns true of the prefix is considered on-link. On-link means
+// that a forwarding node is not needed to send packets to other nodes on the
+// same prefix.
+//
+// Note, when this function returns false, no statement is made about the
+// on-link property of a prefix. That is, if OnLinkFlag returns false, the
+// caller MUST NOT conclude that the prefix is off-link and MUST NOT update any
+// previously stored state for this prefix about its on-link status.
+func (o NDPPrefixInformation) OnLinkFlag() bool {
+	return o[ndpPrefixInformationFlagsOffset]&ndpPrefixInformationOnLinkFlagMask != 0
+}
+
+// AutonomousAddressConfigurationFlag returns true if the prefix can be used for
+// Stateless Address Auto-Configuration (as specified in RFC 4862).
+func (o NDPPrefixInformation) AutonomousAddressConfigurationFlag() bool {
+	return o[ndpPrefixInformationFlagsOffset]&ndpPrefixInformationAutoAddrConfFlagMask != 0
+}
+
+// ValidLifetime returns the length of time that the prefix is valid for the
+// purpose of on-link determination. This value is relative to the send time of
+// the packet that the Prefix Information option was present in.
+//
+// Note, a value of 0 implies the prefix should not be considered as on-link,
+// and a value of infinity/forever is represented by
+// NDPPrefixInformationInfiniteLifetime.
+func (o NDPPrefixInformation) ValidLifetime() time.Duration {
+	// The field is the time in seconds, as per RFC 4861 section 4.6.2.
+	return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpPrefixInformationValidLifetimeOffset:]))
+}
+
+// PreferredLifetime returns the length of time that an address generated from
+// the prefix via Stateless Address Auto-Configuration remains preferred. This
+// value is relative to the send time of the packet that the Prefix Information
+// option was present in.
+//
+// Note, a value of 0 implies that addresses generated from the prefix should
+// no longer remain preferred, and a value of infinity is represented by
+// NDPPrefixInformationInfiniteLifetime.
+//
+// Also note that the value of this field MUST NOT exceed the Valid Lifetime
+// field to avoid preferring addresses that are no longer valid, for the
+// purpose of Stateless Address Auto-Configuration.
+func (o NDPPrefixInformation) PreferredLifetime() time.Duration {
+	// The field is the time in seconds, as per RFC 4861 section 4.6.2.
+	return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpPrefixInformationPreferredLifetimeOffset:]))
+}
+
+// Prefix returns an IPv6 address or a prefix of an IPv6 address. The Prefix
+// Length field (see NDPPrefixInformation.PrefixLength) contains the number
+// of valid leading bits in the prefix.
+//
+// Hosts SHOULD ignore an NDP Prefix Information option where the Prefix field
+// holds the link-local prefix (fe80::).
+func (o NDPPrefixInformation) Prefix() tcpip.Address {
+	return tcpip.Address(o[ndpPrefixInformationPrefixOffset:][:IPv6AddressSize])
+}
diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go
index 0aac14f43..0bbf67a2b 100644
--- a/pkg/tcpip/header/ndp_test.go
+++ b/pkg/tcpip/header/ndp_test.go
@@ -197,3 +197,74 @@ func TestNDPTargetLinkLayerAddressOptionSerialize(t *testing.T) {
 		})
 	}
 }
+
+// TestNDPPrefixInformationOption tests the field getters and serialization of a
+// NDPPrefixInformation.
+func TestNDPPrefixInformationOption(t *testing.T) {
+	b := []byte{
+		43, 127,
+		1, 2, 3, 4,
+		5, 6, 7, 8,
+		5, 5, 5, 5,
+		9, 10, 11, 12,
+		13, 14, 15, 16,
+		17, 18, 19, 20,
+		21, 22, 23, 24,
+	}
+
+	targetBuf := []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
+	opts := NDPOptions(targetBuf)
+	serializer := NDPOptionsSerializer{
+		NDPPrefixInformation(b),
+	}
+	opts.Serialize(serializer)
+	expectedBuf := []byte{
+		3, 4, 43, 64,
+		1, 2, 3, 4,
+		5, 6, 7, 8,
+		0, 0, 0, 0,
+		9, 10, 11, 12,
+		13, 14, 15, 16,
+		17, 18, 19, 20,
+		21, 22, 23, 24,
+	}
+	if !bytes.Equal(targetBuf, expectedBuf) {
+		t.Fatalf("got targetBuf = %x, want = %x", targetBuf, expectedBuf)
+	}
+
+	// First two bytes are the Type and Length fields, which are not part of
+	// the option body.
+	pi := NDPPrefixInformation(targetBuf[2:])
+
+	if got := pi.Type(); got != 3 {
+		t.Fatalf("got Type = %d, want = 3", got)
+	}
+
+	if got := pi.Length(); got != 30 {
+		t.Fatalf("got Length = %d, want = 30", got)
+	}
+
+	if got := pi.PrefixLength(); got != 43 {
+		t.Fatalf("got PrefixLength = %d, want = 43", got)
+	}
+
+	if pi.OnLinkFlag() {
+		t.Fatalf("got OnLinkFlag = true, want = false")
+	}
+
+	if !pi.AutonomousAddressConfigurationFlag() {
+		t.Fatalf("got AutonomousAddressConfigurationFlag = false, want = true")
+	}
+
+	if got, want := pi.ValidLifetime(), 16909060*time.Second; got != want {
+		t.Fatalf("got ValidLifetime = %d, want = %d", got, want)
+	}
+
+	if got, want := pi.PreferredLifetime(), 84281096*time.Second; got != want {
+		t.Fatalf("got PreferredLifetime = %d, want = %d", got, want)
+	}
+
+	if got, want := pi.Prefix(), tcpip.Address("\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18"); got != want {
+		t.Fatalf("got Prefix = %s, want = %s", got, want)
+	}
+}
-- 
cgit v1.2.3


From fd598912bee1965c32dee1a5933678ed34e768bc Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 25 Oct 2019 10:47:49 -0700
Subject: platform/ptrace: use tgkill instead of kill

The syscall filters don't allow kill, just tgkill.

PiperOrigin-RevId: 276718421
---
 pkg/sentry/platform/ptrace/subprocess.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index b699b057d..ddb1f41e3 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -335,7 +335,8 @@ func (t *thread) unexpectedStubExit() {
 		// these cases, we don't need to panic. There is no reasons to
 		// think that something wrong in gVisor.
 		log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid)
-		syscall.Kill(os.Getpid(), syscall.SIGKILL)
+		pid := os.Getpid()
+		syscall.Tgkill(pid, pid, syscall.Signal(syscall.SIGKILL))
 	}
 	t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err))
 }
-- 
cgit v1.2.3


From e0c84f284c8cfadc456a5cf3e7cdacbf4f459b96 Mon Sep 17 00:00:00 2001
From: Haibo <Haibo.Xu@arm.com>
Date: Fri, 25 Oct 2019 12:39:20 -0700
Subject: test/syscall:  Remove duplicated gtest/gtest.h.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I05a7ec69b98b88931ba4a8adb3e8a7b822006001
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/1023 from xiaobo55x:syscall_test d44a8b1f827ed4081997af96cd58ba7449e0a9e1
PiperOrigin-RevId: 276740442
---
 test/syscalls/linux/accept_bind.cc                                 | 1 -
 test/syscalls/linux/accept_bind_stream.cc                          | 1 -
 test/syscalls/linux/bind.cc                                        | 1 -
 test/syscalls/linux/chroot.cc                                      | 1 -
 test/syscalls/linux/connect_external.cc                            | 1 -
 test/syscalls/linux/file_base.h                                    | 1 -
 test/syscalls/linux/ioctl.cc                                       | 1 -
 test/syscalls/linux/madvise.cc                                     | 1 -
 test/syscalls/linux/memory_accounting.cc                           | 1 -
 test/syscalls/linux/pipe.cc                                        | 1 -
 test/syscalls/linux/pread64.cc                                     | 1 -
 test/syscalls/linux/preadv.cc                                      | 1 -
 test/syscalls/linux/preadv2.cc                                     | 1 -
 test/syscalls/linux/proc_net.cc                                    | 1 -
 test/syscalls/linux/proc_net_tcp.cc                                | 1 -
 test/syscalls/linux/proc_net_udp.cc                                | 1 -
 test/syscalls/linux/proc_net_unix.cc                               | 1 -
 test/syscalls/linux/pwrite64.cc                                    | 1 -
 test/syscalls/linux/pwritev2.cc                                    | 1 -
 test/syscalls/linux/readv.cc                                       | 1 -
 test/syscalls/linux/readv_common.cc                                | 1 -
 test/syscalls/linux/readv_socket.cc                                | 1 -
 test/syscalls/linux/rename.cc                                      | 1 -
 test/syscalls/linux/select.cc                                      | 1 -
 test/syscalls/linux/sigaltstack.cc                                 | 1 -
 test/syscalls/linux/signalfd.cc                                    | 1 -
 test/syscalls/linux/socket_bind_to_device.cc                       | 1 -
 test/syscalls/linux/socket_bind_to_device_distribution.cc          | 1 -
 test/syscalls/linux/socket_bind_to_device_sequence.cc              | 1 -
 test/syscalls/linux/socket_blocking.cc                             | 1 -
 test/syscalls/linux/socket_generic.cc                              | 1 -
 test/syscalls/linux/socket_ip_tcp_generic.cc                       | 1 -
 test/syscalls/linux/socket_ip_tcp_udp_generic.cc                   | 1 -
 test/syscalls/linux/socket_ip_udp_generic.cc                       | 1 -
 test/syscalls/linux/socket_ip_unbound.cc                           | 1 -
 test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc | 1 -
 test/syscalls/linux/socket_ipv4_udp_unbound.cc                     | 1 -
 test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc | 1 -
 test/syscalls/linux/socket_non_blocking.cc                         | 1 -
 test/syscalls/linux/socket_non_stream.cc                           | 1 -
 test/syscalls/linux/socket_non_stream_blocking.cc                  | 1 -
 test/syscalls/linux/socket_stream.cc                               | 1 -
 test/syscalls/linux/socket_stream_blocking.cc                      | 1 -
 test/syscalls/linux/socket_stream_nonblock.cc                      | 1 -
 test/syscalls/linux/socket_test_util.h                             | 1 -
 test/syscalls/linux/socket_unix.cc                                 | 1 -
 test/syscalls/linux/socket_unix_cmsg.cc                            | 1 -
 test/syscalls/linux/socket_unix_dgram.cc                           | 1 -
 test/syscalls/linux/socket_unix_dgram_non_blocking.cc              | 1 -
 test/syscalls/linux/socket_unix_non_stream.cc                      | 1 -
 test/syscalls/linux/socket_unix_seqpacket.cc                       | 1 -
 test/syscalls/linux/socket_unix_stream.cc                          | 1 -
 test/syscalls/linux/socket_unix_unbound_abstract.cc                | 1 -
 test/syscalls/linux/socket_unix_unbound_dgram.cc                   | 1 -
 test/syscalls/linux/socket_unix_unbound_filesystem.cc              | 1 -
 test/syscalls/linux/socket_unix_unbound_seqpacket.cc               | 1 -
 test/syscalls/linux/socket_unix_unbound_stream.cc                  | 1 -
 test/syscalls/linux/stat.cc                                        | 1 -
 58 files changed, 58 deletions(-)

diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc
index 328192a05..427c42ede 100644
--- a/test/syscalls/linux/accept_bind.cc
+++ b/test/syscalls/linux/accept_bind.cc
@@ -17,7 +17,6 @@
 #include <algorithm>
 #include <vector>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/file_descriptor.h"
diff --git a/test/syscalls/linux/accept_bind_stream.cc b/test/syscalls/linux/accept_bind_stream.cc
index b6cdb3f4f..7bcd91e9e 100644
--- a/test/syscalls/linux/accept_bind_stream.cc
+++ b/test/syscalls/linux/accept_bind_stream.cc
@@ -17,7 +17,6 @@
 #include <algorithm>
 #include <vector>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/file_descriptor.h"
diff --git a/test/syscalls/linux/bind.cc b/test/syscalls/linux/bind.cc
index de8cca53b..9547c4ab2 100644
--- a/test/syscalls/linux/bind.cc
+++ b/test/syscalls/linux/bind.cc
@@ -16,7 +16,6 @@
 #include <sys/socket.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index 498c45f16..de1611c21 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -24,7 +24,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
diff --git a/test/syscalls/linux/connect_external.cc b/test/syscalls/linux/connect_external.cc
index 98032ac19..bfe1da82e 100644
--- a/test/syscalls/linux/connect_external.cc
+++ b/test/syscalls/linux/connect_external.cc
@@ -21,7 +21,6 @@
 #include <string>
 #include <tuple>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h
index 36efabcae..4d155b618 100644
--- a/test/syscalls/linux/file_base.h
+++ b/test/syscalls/linux/file_base.h
@@ -32,7 +32,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "absl/strings/string_view.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc
index 4948a76f0..c4f8bff08 100644
--- a/test/syscalls/linux/ioctl.cc
+++ b/test/syscalls/linux/ioctl.cc
@@ -25,7 +25,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/madvise.cc b/test/syscalls/linux/madvise.cc
index 08ff4052c..7fd0ea20c 100644
--- a/test/syscalls/linux/madvise.cc
+++ b/test/syscalls/linux/madvise.cc
@@ -25,7 +25,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/logging.h"
 #include "test/util/memory_util.h"
diff --git a/test/syscalls/linux/memory_accounting.cc b/test/syscalls/linux/memory_accounting.cc
index a6e20f9c3..ff2f49863 100644
--- a/test/syscalls/linux/memory_accounting.cc
+++ b/test/syscalls/linux/memory_accounting.cc
@@ -15,7 +15,6 @@
 #include <sys/mman.h>
 #include <map>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/match.h"
 #include "absl/strings/numbers.h"
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index 10e2a6dfc..c0b354e65 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -19,7 +19,6 @@
 
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/str_cat.h"
 #include "absl/synchronization/notification.h"
diff --git a/test/syscalls/linux/pread64.cc b/test/syscalls/linux/pread64.cc
index 5e3eb1735..2cecf2e5f 100644
--- a/test/syscalls/linux/pread64.cc
+++ b/test/syscalls/linux/pread64.cc
@@ -19,7 +19,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/temp_path.h"
diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc
index eebd129f2..f7ea44054 100644
--- a/test/syscalls/linux/preadv.cc
+++ b/test/syscalls/linux/preadv.cc
@@ -21,7 +21,6 @@
 #include <atomic>
 #include <string>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
diff --git a/test/syscalls/linux/preadv2.cc b/test/syscalls/linux/preadv2.cc
index aac960130..c9246367d 100644
--- a/test/syscalls/linux/preadv2.cc
+++ b/test/syscalls/linux/preadv2.cc
@@ -20,7 +20,6 @@
 #include <string>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/memory/memory.h"
 #include "test/syscalls/linux/file_base.h"
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index dcfd5f86c..65bad06d4 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -20,7 +20,6 @@
 #include <sys/syscall.h>
 #include <sys/types.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/str_split.h"
 #include "absl/time/clock.h"
diff --git a/test/syscalls/linux/proc_net_tcp.cc b/test/syscalls/linux/proc_net_tcp.cc
index f61795592..2659f6a98 100644
--- a/test/syscalls/linux/proc_net_tcp.cc
+++ b/test/syscalls/linux/proc_net_tcp.cc
@@ -17,7 +17,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_join.h"
diff --git a/test/syscalls/linux/proc_net_udp.cc b/test/syscalls/linux/proc_net_udp.cc
index 369df8e0e..f06f1a24b 100644
--- a/test/syscalls/linux/proc_net_udp.cc
+++ b/test/syscalls/linux/proc_net_udp.cc
@@ -17,7 +17,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_join.h"
diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc
index 83dbd1364..66db0acaa 100644
--- a/test/syscalls/linux/proc_net_unix.cc
+++ b/test/syscalls/linux/proc_net_unix.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_format.h"
diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc
index e1603fc2d..b48fe540d 100644
--- a/test/syscalls/linux/pwrite64.cc
+++ b/test/syscalls/linux/pwrite64.cc
@@ -18,7 +18,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/pwritev2.cc b/test/syscalls/linux/pwritev2.cc
index f6a0fc96c..1dbc0d6df 100644
--- a/test/syscalls/linux/pwritev2.cc
+++ b/test/syscalls/linux/pwritev2.cc
@@ -20,7 +20,6 @@
 #include <string>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/util/file_descriptor.h"
diff --git a/test/syscalls/linux/readv.cc b/test/syscalls/linux/readv.cc
index f327ec3a9..4069cbc7e 100644
--- a/test/syscalls/linux/readv.cc
+++ b/test/syscalls/linux/readv.cc
@@ -18,7 +18,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/syscalls/linux/readv_common.h"
diff --git a/test/syscalls/linux/readv_common.cc b/test/syscalls/linux/readv_common.cc
index 35d2dd9e3..9658f7d42 100644
--- a/test/syscalls/linux/readv_common.cc
+++ b/test/syscalls/linux/readv_common.cc
@@ -18,7 +18,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/readv_socket.cc b/test/syscalls/linux/readv_socket.cc
index 3c315cc02..9b6972201 100644
--- a/test/syscalls/linux/readv_socket.cc
+++ b/test/syscalls/linux/readv_socket.cc
@@ -18,7 +18,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/syscalls/linux/readv_common.h"
diff --git a/test/syscalls/linux/rename.cc b/test/syscalls/linux/rename.cc
index c9d76c2e2..5b474ff32 100644
--- a/test/syscalls/linux/rename.cc
+++ b/test/syscalls/linux/rename.cc
@@ -16,7 +16,6 @@
 #include <stdio.h>
 #include <string>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/string_view.h"
 #include "test/util/capability_util.h"
diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc
index 88c010aec..e06a2666d 100644
--- a/test/syscalls/linux/select.cc
+++ b/test/syscalls/linux/select.cc
@@ -20,7 +20,6 @@
 #include <csignal>
 #include <cstdio>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/time/time.h"
 #include "test/syscalls/linux/base_poll_test.h"
diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc
index 69b6e4f90..6fd3989a4 100644
--- a/test/syscalls/linux/sigaltstack.cc
+++ b/test/syscalls/linux/sigaltstack.cc
@@ -21,7 +21,6 @@
 #include <functional>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/util/cleanup.h"
 #include "test/util/fs_util.h"
diff --git a/test/syscalls/linux/signalfd.cc b/test/syscalls/linux/signalfd.cc
index 9379d5878..09ecad34a 100644
--- a/test/syscalls/linux/signalfd.cc
+++ b/test/syscalls/linux/signalfd.cc
@@ -23,7 +23,6 @@
 #include <functional>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/synchronization/mutex.h"
 #include "test/util/file_descriptor.h"
diff --git a/test/syscalls/linux/socket_bind_to_device.cc b/test/syscalls/linux/socket_bind_to_device.cc
index d20821cac..6b27f6eab 100644
--- a/test/syscalls/linux/socket_bind_to_device.cc
+++ b/test/syscalls/linux/socket_bind_to_device.cc
@@ -32,7 +32,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_bind_to_device_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_bind_to_device_distribution.cc b/test/syscalls/linux/socket_bind_to_device_distribution.cc
index 4d2400328..5767181a1 100644
--- a/test/syscalls/linux/socket_bind_to_device_distribution.cc
+++ b/test/syscalls/linux/socket_bind_to_device_distribution.cc
@@ -33,7 +33,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_bind_to_device_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
index a7365d139..e4641c62e 100644
--- a/test/syscalls/linux/socket_bind_to_device_sequence.cc
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -33,7 +33,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_bind_to_device_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_blocking.cc b/test/syscalls/linux/socket_blocking.cc
index 00c50d1bf..d7ce57566 100644
--- a/test/syscalls/linux/socket_blocking.cc
+++ b/test/syscalls/linux/socket_blocking.cc
@@ -19,7 +19,6 @@
 #include <sys/un.h>
 #include <cstdio>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc
index 51d614639..e8f24a59e 100644
--- a/test/syscalls/linux/socket_generic.cc
+++ b/test/syscalls/linux/socket_generic.cc
@@ -19,7 +19,6 @@
 #include <sys/socket.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index bfa7943b1..7e0deda05 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -23,7 +23,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
index de63f79d9..f178f1af9 100644
--- a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
@@ -21,7 +21,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 044394ba7..2a4ed04a5 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -23,7 +23,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_ip_unbound.cc b/test/syscalls/linux/socket_ip_unbound.cc
index fa9a9df6f..b02872308 100644
--- a/test/syscalls/linux/socket_ip_unbound.cc
+++ b/test/syscalls/linux/socket_ip_unbound.cc
@@ -23,7 +23,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
index 3a068aacf..3c3712b50 100644
--- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
@@ -23,7 +23,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
 
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index 67d29af0a..b828b6844 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -20,7 +20,6 @@
 #include <sys/un.h>
 #include <cstdio>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
index 8b8993d3d..98ae414f3 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
@@ -27,7 +27,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_non_blocking.cc b/test/syscalls/linux/socket_non_blocking.cc
index 73e6dc618..c3520cadd 100644
--- a/test/syscalls/linux/socket_non_blocking.cc
+++ b/test/syscalls/linux/socket_non_blocking.cc
@@ -19,7 +19,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_non_stream.cc b/test/syscalls/linux/socket_non_stream.cc
index 3c599b6e8..d91c5ed39 100644
--- a/test/syscalls/linux/socket_non_stream.cc
+++ b/test/syscalls/linux/socket_non_stream.cc
@@ -18,7 +18,6 @@
 #include <sys/socket.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_non_stream_blocking.cc b/test/syscalls/linux/socket_non_stream_blocking.cc
index 76127d181..62d87c1af 100644
--- a/test/syscalls/linux/socket_non_stream_blocking.cc
+++ b/test/syscalls/linux/socket_non_stream_blocking.cc
@@ -19,7 +19,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
diff --git a/test/syscalls/linux/socket_stream.cc b/test/syscalls/linux/socket_stream.cc
index 0417dd347..346443f96 100644
--- a/test/syscalls/linux/socket_stream.cc
+++ b/test/syscalls/linux/socket_stream.cc
@@ -19,7 +19,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/time/clock.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_stream_blocking.cc b/test/syscalls/linux/socket_stream_blocking.cc
index 8367460d2..e9cc082bf 100644
--- a/test/syscalls/linux/socket_stream_blocking.cc
+++ b/test/syscalls/linux/socket_stream_blocking.cc
@@ -19,7 +19,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
diff --git a/test/syscalls/linux/socket_stream_nonblock.cc b/test/syscalls/linux/socket_stream_nonblock.cc
index b00748b97..74d608741 100644
--- a/test/syscalls/linux/socket_stream_nonblock.cc
+++ b/test/syscalls/linux/socket_stream_nonblock.cc
@@ -19,7 +19,6 @@
 #include <sys/uio.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h
index 70710195c..be38907c2 100644
--- a/test/syscalls/linux/socket_test_util.h
+++ b/test/syscalls/linux/socket_test_util.h
@@ -29,7 +29,6 @@
 #include <utility>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/str_format.h"
 #include "test/util/file_descriptor.h"
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc
index 875f0391f..8a28202a8 100644
--- a/test/syscalls/linux/socket_unix.cc
+++ b/test/syscalls/linux/socket_unix.cc
@@ -24,7 +24,6 @@
 
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/string_view.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_cmsg.cc b/test/syscalls/linux/socket_unix_cmsg.cc
index 1092e29b1..1159c5229 100644
--- a/test/syscalls/linux/socket_unix_cmsg.cc
+++ b/test/syscalls/linux/socket_unix_cmsg.cc
@@ -24,7 +24,6 @@
 
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/string_view.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_dgram.cc b/test/syscalls/linux/socket_unix_dgram.cc
index 3e0f611d2..3245cf7c9 100644
--- a/test/syscalls/linux/socket_unix_dgram.cc
+++ b/test/syscalls/linux/socket_unix_dgram.cc
@@ -17,7 +17,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
index 707052af8..cd4fba25c 100644
--- a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
+++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
@@ -15,7 +15,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc
index b5c82cd67..276a94eb8 100644
--- a/test/syscalls/linux/socket_unix_non_stream.cc
+++ b/test/syscalls/linux/socket_unix_non_stream.cc
@@ -18,7 +18,6 @@
 #include <sys/mman.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_seqpacket.cc b/test/syscalls/linux/socket_unix_seqpacket.cc
index 6f6367dd5..60fa9e38a 100644
--- a/test/syscalls/linux/socket_unix_seqpacket.cc
+++ b/test/syscalls/linux/socket_unix_seqpacket.cc
@@ -17,7 +17,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_stream.cc b/test/syscalls/linux/socket_unix_stream.cc
index 8f38ed92f..563467365 100644
--- a/test/syscalls/linux/socket_unix_stream.cc
+++ b/test/syscalls/linux/socket_unix_stream.cc
@@ -16,7 +16,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_abstract.cc b/test/syscalls/linux/socket_unix_unbound_abstract.cc
index 4b5832de8..7f5816ace 100644
--- a/test/syscalls/linux/socket_unix_unbound_abstract.cc
+++ b/test/syscalls/linux/socket_unix_unbound_abstract.cc
@@ -15,7 +15,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_dgram.cc b/test/syscalls/linux/socket_unix_unbound_dgram.cc
index 52aef891f..907dca0f1 100644
--- a/test/syscalls/linux/socket_unix_unbound_dgram.cc
+++ b/test/syscalls/linux/socket_unix_unbound_dgram.cc
@@ -16,7 +16,6 @@
 #include <sys/socket.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_filesystem.cc b/test/syscalls/linux/socket_unix_unbound_filesystem.cc
index 8cb03c450..b14f24086 100644
--- a/test/syscalls/linux/socket_unix_unbound_filesystem.cc
+++ b/test/syscalls/linux/socket_unix_unbound_filesystem.cc
@@ -15,7 +15,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
index 0575f2e1d..50ffa1d04 100644
--- a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
+++ b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
@@ -15,7 +15,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_stream.cc b/test/syscalls/linux/socket_unix_unbound_stream.cc
index e483d2777..344918c34 100644
--- a/test/syscalls/linux/socket_unix_unbound_stream.cc
+++ b/test/syscalls/linux/socket_unix_unbound_stream.cc
@@ -15,7 +15,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index 88ab90b5b..30de2f8ff 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -24,7 +24,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
-- 
cgit v1.2.3


From 8f029b3f823342e43d23e2a238bc599596bdca24 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Fri, 25 Oct 2019 13:14:02 -0700
Subject: Convert DelayOption to the newer/faster SockOpt int type.

DelayOption is set on all new endpoints in gVisor.

PiperOrigin-RevId: 276746791
---
 pkg/sentry/socket/netstack/netstack.go | 10 ++++-----
 pkg/tcpip/tcpip.go                     | 10 ++++-----
 pkg/tcpip/transport/tcp/endpoint.go    | 38 ++++++++++++++++++----------------
 pkg/tcpip/transport/tcp/tcp_test.go    |  8 +++----
 4 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 69dbfd197..27c6692c4 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -281,7 +281,7 @@ type SocketOperations struct {
 // New creates a new endpoint socket.
 func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) {
 	if skType == linux.SOCK_STREAM {
-		if err := endpoint.SetSockOpt(tcpip.DelayOption(1)); err != nil {
+		if err := endpoint.SetSockOptInt(tcpip.DelayOption, 1); err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 	}
@@ -1055,8 +1055,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.DelayOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptInt(tcpip.DelayOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
@@ -1497,11 +1497,11 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		var o tcpip.DelayOption
+		var o int
 		if v == 0 {
 			o = 1
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(o))
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.DelayOption, o))
 
 	case linux.TCP_CORK:
 		if len(optVal) < sizeOfInt32 {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 353ecd49b..03be7d3d4 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -489,6 +489,11 @@ const (
 	// number of unread bytes in the output buffer should be returned.
 	SendQueueSizeOption
 
+	// DelayOption is used by SetSockOpt/GetSockOpt to specify if data
+	// should be sent out immediately by the transport protocol. For TCP,
+	// it determines if the Nagle algorithm is on or off.
+	DelayOption
+
 	// TODO(b/137664753): convert all int socket options to be handled via
 	// GetSockOptInt.
 )
@@ -501,11 +506,6 @@ type ErrorOption struct{}
 // socket is to be restricted to sending and receiving IPv6 packets only.
 type V6OnlyOption int
 
-// DelayOption is used by SetSockOpt/GetSockOpt to specify if data should be
-// sent out immediately by the transport protocol. For TCP, it determines if the
-// Nagle algorithm is on or off.
-type DelayOption int
-
 // CorkOption is used by SetSockOpt/GetSockOpt to specify if data should be
 // held until segments are full by the TCP transport protocol.
 type CorkOption int
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index c6bc5528c..6ca0d73a9 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1133,16 +1133,6 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
 		e.sndBufMu.Unlock()
 		return nil
 
-	default:
-		return nil
-	}
-}
-
-// SetSockOpt sets a socket option.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
-	const inetECNMask = 3
-	switch v := opt.(type) {
 	case tcpip.DelayOption:
 		if v == 0 {
 			atomic.StoreUint32(&e.delay, 0)
@@ -1154,6 +1144,16 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		}
 		return nil
 
+	default:
+		return nil
+	}
+}
+
+// SetSockOpt sets a socket option.
+func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
+	const inetECNMask = 3
+	switch v := opt.(type) {
 	case tcpip.CorkOption:
 		if v == 0 {
 			atomic.StoreUint32(&e.cork, 0)
@@ -1345,6 +1345,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
 	switch opt {
 	case tcpip.ReceiveQueueSizeOption:
 		return e.readyReceiveSize()
+
 	case tcpip.SendBufferSizeOption:
 		e.sndBufMu.Lock()
 		v := e.sndBufSize
@@ -1357,8 +1358,16 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
 		e.rcvListMu.Unlock()
 		return v, nil
 
+	case tcpip.DelayOption:
+		var o int
+		if v := atomic.LoadUint32(&e.delay); v != 0 {
+			o = 1
+		}
+		return o, nil
+
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
 	}
-	return -1, tcpip.ErrUnknownProtocolOption
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
@@ -1379,13 +1388,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		*o = header.TCPDefaultMSS
 		return nil
 
-	case *tcpip.DelayOption:
-		*o = 0
-		if v := atomic.LoadUint32(&e.delay); v != 0 {
-			*o = 1
-		}
-		return nil
-
 	case *tcpip.CorkOption:
 		*o = 0
 		if v := atomic.LoadUint32(&e.cork); v != 0 {
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 6d022a266..6d808328c 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -1623,7 +1623,7 @@ func TestDelay(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	c.EP.SetSockOpt(tcpip.DelayOption(1))
+	c.EP.SetSockOptInt(tcpip.DelayOption, 1)
 
 	var allData []byte
 	for i, data := range [][]byte{{0}, {1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} {
@@ -1671,7 +1671,7 @@ func TestUndelay(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	c.EP.SetSockOpt(tcpip.DelayOption(1))
+	c.EP.SetSockOptInt(tcpip.DelayOption, 1)
 
 	allData := [][]byte{{0}, {1, 2, 3}}
 	for i, data := range allData {
@@ -1704,7 +1704,7 @@ func TestUndelay(t *testing.T) {
 	// Check that we don't get the second packet yet.
 	c.CheckNoPacketTimeout("delayed second packet transmitted", 100*time.Millisecond)
 
-	c.EP.SetSockOpt(tcpip.DelayOption(0))
+	c.EP.SetSockOptInt(tcpip.DelayOption, 0)
 
 	// Check that data is received.
 	second := c.GetPacket()
@@ -1741,7 +1741,7 @@ func TestMSSNotDelayed(t *testing.T) {
 		fn   func(tcpip.Endpoint)
 	}{
 		{"no-op", func(tcpip.Endpoint) {}},
-		{"delay", func(ep tcpip.Endpoint) { ep.SetSockOpt(tcpip.DelayOption(1)) }},
+		{"delay", func(ep tcpip.Endpoint) { ep.SetSockOptInt(tcpip.DelayOption, 1) }},
 		{"cork", func(ep tcpip.Endpoint) { ep.SetSockOpt(tcpip.CorkOption(1)) }},
 	}
 
-- 
cgit v1.2.3


From 5a421058a07477e23f6ca23bb510894419224080 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 25 Oct 2019 16:05:31 -0700
Subject: Validate the checksum for incoming ICMPv6 packets

This change validates the ICMPv6 checksum field before further processing an
ICMPv6 packet.

Tests: Unittests to make sure that only ICMPv6 packets with a valid checksum
are accepted/processed. Existing tests using checker.ICMPv6 now also check the
ICMPv6 checksum field.
PiperOrigin-RevId: 276779148
---
 pkg/tcpip/checker/BUILD              |   1 +
 pkg/tcpip/checker/checker.go         |   7 +
 pkg/tcpip/header/icmpv6.go           |   4 +-
 pkg/tcpip/network/ip_test.go         |   7 +-
 pkg/tcpip/network/ipv6/icmp.go       |  12 +
 pkg/tcpip/network/ipv6/icmp_test.go  | 532 ++++++++++++++++++++++++++++++++++-
 pkg/tcpip/transport/icmp/endpoint.go |   6 +-
 7 files changed, 562 insertions(+), 7 deletions(-)

diff --git a/pkg/tcpip/checker/BUILD b/pkg/tcpip/checker/BUILD
index 4cecfb989..b6fa6fc37 100644
--- a/pkg/tcpip/checker/BUILD
+++ b/pkg/tcpip/checker/BUILD
@@ -10,6 +10,7 @@ go_library(
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
         "//pkg/tcpip/seqnum",
     ],
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 02137e1c9..2f15bf1f1 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -22,6 +22,7 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 )
@@ -639,6 +640,8 @@ func ICMPv4Code(want byte) TransportChecker {
 
 // ICMPv6 creates a checker that checks that the transport protocol is ICMPv6 and
 // potentially additional ICMPv6 header fields.
+//
+// ICMPv6 will validate the checksum field before calling checkers.
 func ICMPv6(checkers ...TransportChecker) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
 		t.Helper()
@@ -650,6 +653,10 @@ func ICMPv6(checkers ...TransportChecker) NetworkChecker {
 		}
 
 		icmp := header.ICMPv6(last.Payload())
+		if got, want := icmp.Checksum(), header.ICMPv6Checksum(icmp, last.SourceAddress(), last.DestinationAddress(), buffer.VectorisedView{}); got != want {
+			t.Fatalf("Bad ICMPv6 checksum; got %d, want %d", got, want)
+		}
+
 		for _, f := range checkers {
 			f(t, icmp)
 		}
diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go
index c2bfd8c79..b4037b6c8 100644
--- a/pkg/tcpip/header/icmpv6.go
+++ b/pkg/tcpip/header/icmpv6.go
@@ -132,7 +132,7 @@ func (b ICMPv6) Checksum() uint16 {
 	return binary.BigEndian.Uint16(b[icmpv6ChecksumOffset:])
 }
 
-// SetChecksum calculates and sets the ICMP checksum field.
+// SetChecksum sets the ICMP checksum field.
 func (b ICMPv6) SetChecksum(checksum uint16) {
 	binary.BigEndian.PutUint16(b[icmpv6ChecksumOffset:], checksum)
 }
@@ -197,7 +197,7 @@ func (b ICMPv6) Payload() []byte {
 	return b[ICMPv6PayloadOffset:]
 }
 
-// ICMPv6Checksum calculates the ICMP checksum over the provided ICMP header,
+// ICMPv6Checksum calculates the ICMP checksum over the provided ICMPv6 header,
 // IPv6 src/dst addresses and the payload.
 func ICMPv6Checksum(h ICMPv6, src, dst tcpip.Address, vv buffer.VectorisedView) uint16 {
 	// Calculate the IPv6 pseudo-header upper-layer checksum.
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index 8d74497ba..666d8b92a 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -519,6 +519,7 @@ func TestIPv6ReceiveControl(t *testing.T) {
 	newUint16 := func(v uint16) *uint16 { return &v }
 
 	const mtu = 0xffff
+	const outerSrcAddr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xaa"
 	cases := []struct {
 		name           string
 		expectedCount  int
@@ -570,7 +571,7 @@ func TestIPv6ReceiveControl(t *testing.T) {
 				PayloadLength: uint16(len(view) - header.IPv6MinimumSize - c.trunc),
 				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
 				HopLimit:      20,
-				SrcAddr:       "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xaa",
+				SrcAddr:       outerSrcAddr,
 				DstAddr:       localIpv6Addr,
 			})
 
@@ -618,6 +619,10 @@ func TestIPv6ReceiveControl(t *testing.T) {
 			o.extra = c.expectedExtra
 
 			vv := view[:len(view)-c.trunc].ToVectorisedView()
+
+			// Set ICMPv6 checksum.
+			icmp.SetChecksum(header.ICMPv6Checksum(icmp, outerSrcAddr, localIpv6Addr, buffer.VectorisedView{}))
+
 			ep.HandlePacket(&r, vv)
 			if want := c.expectedCount; o.controlCalls != want {
 				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want)
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index b289e902f..c3f1dd488 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -72,6 +72,18 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 	h := header.ICMPv6(v)
 	iph := header.IPv6(netHeader)
 
+	// Validate ICMPv6 checksum before processing the packet.
+	//
+	// Only the first view in vv is accounted for by h. To account for the
+	// rest of vv, a shallow copy is made and the first view is removed.
+	// This copy is used as extra payload during the checksum calculation.
+	payload := vv
+	payload.RemoveFirst()
+	if got, want := h.Checksum(), header.ICMPv6Checksum(h, iph.SourceAddress(), iph.DestinationAddress(), payload); got != want {
+		received.Invalid.Increment()
+		return
+	}
+
 	// As per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1, 7.1.2 and
 	// 8.1, nodes MUST silently drop NDP packets where the Hop Limit field
 	// in the IPv6 header is not set to 255.
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 7c11dde55..b112303b6 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -30,7 +30,7 @@ import (
 )
 
 const (
-	linkAddr0 = tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06")
+	linkAddr0 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
 	linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0f")
 )
 
@@ -359,3 +359,533 @@ func TestLinkResolution(t *testing.T) {
 		routeICMPv6Packet(t, args, nil)
 	}
 }
+
+func TestICMPChecksumValidationSimple(t *testing.T) {
+	types := []struct {
+		name        string
+		typ         header.ICMPv6Type
+		size        int
+		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+	}{
+		{
+			"DstUnreachable",
+			header.ICMPv6DstUnreachable,
+			header.ICMPv6DstUnreachableMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.DstUnreachable
+			},
+		},
+		{
+			"PacketTooBig",
+			header.ICMPv6PacketTooBig,
+			header.ICMPv6PacketTooBigMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.PacketTooBig
+			},
+		},
+		{
+			"TimeExceeded",
+			header.ICMPv6TimeExceeded,
+			header.ICMPv6MinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.TimeExceeded
+			},
+		},
+		{
+			"ParamProblem",
+			header.ICMPv6ParamProblem,
+			header.ICMPv6MinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.ParamProblem
+			},
+		},
+		{
+			"EchoRequest",
+			header.ICMPv6EchoRequest,
+			header.ICMPv6EchoMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.EchoRequest
+			},
+		},
+		{
+			"EchoReply",
+			header.ICMPv6EchoReply,
+			header.ICMPv6EchoMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.EchoReply
+			},
+		},
+		{
+			"RouterSolicit",
+			header.ICMPv6RouterSolicit,
+			header.ICMPv6MinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RouterSolicit
+			},
+		},
+		{
+			"RouterAdvert",
+			header.ICMPv6RouterAdvert,
+			header.ICMPv6MinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RouterAdvert
+			},
+		},
+		{
+			"NeighborSolicit",
+			header.ICMPv6NeighborSolicit,
+			header.ICMPv6NeighborSolicitMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.NeighborSolicit
+			},
+		},
+		{
+			"NeighborAdvert",
+			header.ICMPv6NeighborAdvert,
+			header.ICMPv6NeighborAdvertSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.NeighborAdvert
+			},
+		},
+		{
+			"RedirectMsg",
+			header.ICMPv6RedirectMsg,
+			header.ICMPv6MinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RedirectMsg
+			},
+		},
+	}
+
+	for _, typ := range types {
+		t.Run(typ.name, func(t *testing.T) {
+			e := channel.New(10, 1280, linkAddr0)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+			}
+			{
+				subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+				if err != nil {
+					t.Fatal(err)
+				}
+				s.SetRouteTable(
+					[]tcpip.Route{{
+						Destination: subnet,
+						NIC:         1,
+					}},
+				)
+			}
+
+			handleIPv6Payload := func(typ header.ICMPv6Type, size int, checksum bool) {
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + size)
+				pkt := header.ICMPv6(hdr.Prepend(size))
+				pkt.SetType(typ)
+				if checksum {
+					pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+				}
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(size),
+					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+					HopLimit:      header.NDPHopLimit,
+					SrcAddr:       lladdr1,
+					DstAddr:       lladdr0,
+				})
+				e.Inject(ProtocolNumber, hdr.View().ToVectorisedView())
+			}
+
+			stats := s.Stats().ICMP.V6PacketsReceived
+			invalid := stats.Invalid
+			typStat := typ.statCounter(stats)
+
+			// Initial stat counts should be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+			if got := typStat.Value(); got != 0 {
+				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			}
+
+			// Without setting checksum, the incoming packet should
+			// be invalid.
+			handleIPv6Payload(typ.typ, typ.size, false)
+			if got := invalid.Value(); got != 1 {
+				t.Fatalf("got invalid = %d, want = 1", got)
+			}
+			// Rx count of type typ.typ should not have increased.
+			if got := typStat.Value(); got != 0 {
+				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			}
+
+			// When checksum is set, it should be received.
+			handleIPv6Payload(typ.typ, typ.size, true)
+			if got := typStat.Value(); got != 1 {
+				t.Fatalf("got %s = %d, want = 1", typ.name, got)
+			}
+			// Invalid count should not have increased again.
+			if got := invalid.Value(); got != 1 {
+				t.Fatalf("got invalid = %d, want = 1", got)
+			}
+		})
+	}
+}
+
+func TestICMPChecksumValidationWithPayload(t *testing.T) {
+	const simpleBodySize = 64
+	simpleBody := func(view buffer.View) {
+		for i := 0; i < simpleBodySize; i++ {
+			view[i] = uint8(i)
+		}
+	}
+
+	const errorICMPBodySize = header.IPv6MinimumSize + simpleBodySize
+	errorICMPBody := func(view buffer.View) {
+		ip := header.IPv6(view)
+		ip.Encode(&header.IPv6Fields{
+			PayloadLength: simpleBodySize,
+			NextHeader:    10,
+			HopLimit:      20,
+			SrcAddr:       lladdr0,
+			DstAddr:       lladdr1,
+		})
+		simpleBody(view[header.IPv6MinimumSize:])
+	}
+
+	types := []struct {
+		name        string
+		typ         header.ICMPv6Type
+		size        int
+		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+		payloadSize int
+		payload     func(buffer.View)
+	}{
+		{
+			"DstUnreachable",
+			header.ICMPv6DstUnreachable,
+			header.ICMPv6DstUnreachableMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.DstUnreachable
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"PacketTooBig",
+			header.ICMPv6PacketTooBig,
+			header.ICMPv6PacketTooBigMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.PacketTooBig
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"TimeExceeded",
+			header.ICMPv6TimeExceeded,
+			header.ICMPv6MinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.TimeExceeded
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"ParamProblem",
+			header.ICMPv6ParamProblem,
+			header.ICMPv6MinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.ParamProblem
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"EchoRequest",
+			header.ICMPv6EchoRequest,
+			header.ICMPv6EchoMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.EchoRequest
+			},
+			simpleBodySize,
+			simpleBody,
+		},
+		{
+			"EchoReply",
+			header.ICMPv6EchoReply,
+			header.ICMPv6EchoMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.EchoReply
+			},
+			simpleBodySize,
+			simpleBody,
+		},
+	}
+
+	for _, typ := range types {
+		t.Run(typ.name, func(t *testing.T) {
+			e := channel.New(10, 1280, linkAddr0)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+			}
+			{
+				subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+				if err != nil {
+					t.Fatal(err)
+				}
+				s.SetRouteTable(
+					[]tcpip.Route{{
+						Destination: subnet,
+						NIC:         1,
+					}},
+				)
+			}
+
+			handleIPv6Payload := func(typ header.ICMPv6Type, size, payloadSize int, payloadFn func(buffer.View), checksum bool) {
+				icmpSize := size + payloadSize
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
+				pkt := header.ICMPv6(hdr.Prepend(icmpSize))
+				pkt.SetType(typ)
+				payloadFn(pkt.Payload())
+
+				if checksum {
+					pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+				}
+
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(icmpSize),
+					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+					HopLimit:      header.NDPHopLimit,
+					SrcAddr:       lladdr1,
+					DstAddr:       lladdr0,
+				})
+				e.Inject(ProtocolNumber, hdr.View().ToVectorisedView())
+			}
+
+			stats := s.Stats().ICMP.V6PacketsReceived
+			invalid := stats.Invalid
+			typStat := typ.statCounter(stats)
+
+			// Initial stat counts should be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+			if got := typStat.Value(); got != 0 {
+				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			}
+
+			// Without setting checksum, the incoming packet should
+			// be invalid.
+			handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, false)
+			if got := invalid.Value(); got != 1 {
+				t.Fatalf("got invalid = %d, want = 1", got)
+			}
+			// Rx count of type typ.typ should not have increased.
+			if got := typStat.Value(); got != 0 {
+				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			}
+
+			// When checksum is set, it should be received.
+			handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, true)
+			if got := typStat.Value(); got != 1 {
+				t.Fatalf("got %s = %d, want = 1", typ.name, got)
+			}
+			// Invalid count should not have increased again.
+			if got := invalid.Value(); got != 1 {
+				t.Fatalf("got invalid = %d, want = 1", got)
+			}
+		})
+	}
+}
+
+func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
+	const simpleBodySize = 64
+	simpleBody := func(view buffer.View) {
+		for i := 0; i < simpleBodySize; i++ {
+			view[i] = uint8(i)
+		}
+	}
+
+	const errorICMPBodySize = header.IPv6MinimumSize + simpleBodySize
+	errorICMPBody := func(view buffer.View) {
+		ip := header.IPv6(view)
+		ip.Encode(&header.IPv6Fields{
+			PayloadLength: simpleBodySize,
+			NextHeader:    10,
+			HopLimit:      20,
+			SrcAddr:       lladdr0,
+			DstAddr:       lladdr1,
+		})
+		simpleBody(view[header.IPv6MinimumSize:])
+	}
+
+	types := []struct {
+		name        string
+		typ         header.ICMPv6Type
+		size        int
+		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+		payloadSize int
+		payload     func(buffer.View)
+	}{
+		{
+			"DstUnreachable",
+			header.ICMPv6DstUnreachable,
+			header.ICMPv6DstUnreachableMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.DstUnreachable
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"PacketTooBig",
+			header.ICMPv6PacketTooBig,
+			header.ICMPv6PacketTooBigMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.PacketTooBig
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"TimeExceeded",
+			header.ICMPv6TimeExceeded,
+			header.ICMPv6MinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.TimeExceeded
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"ParamProblem",
+			header.ICMPv6ParamProblem,
+			header.ICMPv6MinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.ParamProblem
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"EchoRequest",
+			header.ICMPv6EchoRequest,
+			header.ICMPv6EchoMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.EchoRequest
+			},
+			simpleBodySize,
+			simpleBody,
+		},
+		{
+			"EchoReply",
+			header.ICMPv6EchoReply,
+			header.ICMPv6EchoMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.EchoReply
+			},
+			simpleBodySize,
+			simpleBody,
+		},
+	}
+
+	for _, typ := range types {
+		t.Run(typ.name, func(t *testing.T) {
+			e := channel.New(10, 1280, linkAddr0)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+			}
+			{
+				subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+				if err != nil {
+					t.Fatal(err)
+				}
+				s.SetRouteTable(
+					[]tcpip.Route{{
+						Destination: subnet,
+						NIC:         1,
+					}},
+				)
+			}
+
+			handleIPv6Payload := func(typ header.ICMPv6Type, size, payloadSize int, payloadFn func(buffer.View), checksum bool) {
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + size)
+				pkt := header.ICMPv6(hdr.Prepend(size))
+				pkt.SetType(typ)
+
+				payload := buffer.NewView(payloadSize)
+				payloadFn(payload)
+
+				if checksum {
+					pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, payload.ToVectorisedView()))
+				}
+
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(size + payloadSize),
+					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+					HopLimit:      header.NDPHopLimit,
+					SrcAddr:       lladdr1,
+					DstAddr:       lladdr0,
+				})
+				e.Inject(ProtocolNumber,
+					buffer.NewVectorisedView(header.IPv6MinimumSize+size+payloadSize,
+						[]buffer.View{hdr.View(), payload}))
+			}
+
+			stats := s.Stats().ICMP.V6PacketsReceived
+			invalid := stats.Invalid
+			typStat := typ.statCounter(stats)
+
+			// Initial stat counts should be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+			if got := typStat.Value(); got != 0 {
+				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			}
+
+			// Without setting checksum, the incoming packet should
+			// be invalid.
+			handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, false)
+			if got := invalid.Value(); got != 1 {
+				t.Fatalf("got invalid = %d, want = 1", got)
+			}
+			// Rx count of type typ.typ should not have increased.
+			if got := typStat.Value(); got != 0 {
+				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			}
+
+			// When checksum is set, it should be received.
+			handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, true)
+			if got := typStat.Value(); got != 1 {
+				t.Fatalf("got %s = %d, want = 1", typ.name, got)
+			}
+			// Invalid count should not have increased again.
+			if got := invalid.Value(); got != 1 {
+				t.Fatalf("got invalid = %d, want = 1", got)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 3187b336b..043467519 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -445,13 +445,13 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	icmpv6.SetChecksum(0)
-	icmpv6.SetChecksum(^header.Checksum(icmpv6, header.Checksum(data, 0)))
+	dataVV := data.ToVectorisedView()
+	icmpv6.SetChecksum(header.ICMPv6Checksum(icmpv6, r.LocalAddress, r.RemoteAddress, dataVV))
 
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	return r.WritePacket(nil /* gso */, hdr, data.ToVectorisedView(), stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS})
+	return r.WritePacket(nil /* gso */, hdr, dataVV, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS})
 }
 
 func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-- 
cgit v1.2.3


From 1c480abc39b9957606ff8bf125a5c253ad8a76cb Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 25 Oct 2019 22:31:35 -0700
Subject: Aggregate arguments for loading executables into a single struct.

This change simplifies the function signatures of functions related to loading
executables, such as LoadTaskImage, Load, loadBinary.

PiperOrigin-RevId: 276821187
---
 pkg/sentry/kernel/kernel.go             |  14 +++-
 pkg/sentry/kernel/task_context.go       |  27 ++-----
 pkg/sentry/loader/elf.go                |  17 ++--
 pkg/sentry/loader/loader.go             | 132 +++++++++++++++++++++-----------
 pkg/sentry/syscalls/linux/BUILD         |   1 +
 pkg/sentry/syscalls/linux/sys_thread.go |  18 ++++-
 6 files changed, 136 insertions(+), 73 deletions(-)

diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index d70ad5c09..fcfe7a16d 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -804,8 +804,20 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 
 	// Create a fresh task context.
 	remainingTraversals = uint(args.MaxSymlinkTraversals)
+	loadArgs := loader.LoadArgs{
+		Mounts:              mounts,
+		Root:                root,
+		WorkingDirectory:    wd,
+		RemainingTraversals: &remainingTraversals,
+		ResolveFinal:        true,
+		Filename:            args.Filename,
+		File:                args.File,
+		Argv:                args.Argv,
+		Envv:                args.Envv,
+		Features:            k.featureSet,
+	}
 
-	tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, true /*resolveFinal*/, k.featureSet)
+	tc, se := k.LoadTaskImage(ctx, loadArgs)
 	if se != nil {
 		return nil, 0, errors.New(se.String())
 	}
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 1da718b27..bb5560acf 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -18,10 +18,8 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/loader"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -132,30 +130,21 @@ func (t *Task) Stack() *arch.Stack {
 	return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())}
 }
 
-// LoadTaskImage loads filename into a new TaskContext.
+// LoadTaskImage loads a specified file into a new TaskContext.
 //
-// It takes several arguments:
-//  * mounts: MountNamespace to lookup filename in
-//  * root: Root to lookup filename under
-//  * wd: Working directory to lookup filename under
-//  * maxTraversals: maximum number of symlinks to follow
-//  * filename: path to binary to load
-//  * file: an open fs.File object of the binary to load. If set,
-//  file will be loaded and not filename.
-//  * argv: Binary argv
-//  * envv: Binary envv
-//  * fs: Binary FeatureSet
-func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, file *fs.File, argv, envv []string, resolveFinal bool, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) {
-	// If File is not nil, we should load that instead of resolving filename.
-	if file != nil {
-		filename = file.MappedName(ctx)
+// args.MemoryManager does not need to be set by the caller.
+func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) {
+	// If File is not nil, we should load that instead of resolving Filename.
+	if args.File != nil {
+		args.Filename = args.File.MappedName(ctx)
 	}
 
 	// Prepare a new user address space to load into.
 	m := mm.NewMemoryManager(k, k)
 	defer m.DecUsers(ctx)
+	args.MemoryManager = m
 
-	os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, envv, resolveFinal, k.extraAuxv, k.vdso)
+	os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 86f6b269b..3ea037e4d 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -624,15 +624,15 @@ func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, in
 	return loadParsedELF(ctx, m, f, info, 0)
 }
 
-// loadELF loads f into the Task address space.
+// loadELF loads args.File into the Task address space.
 //
 // If loadELF returns ErrSwitchFile it should be called again with the returned
 // path and argv.
 //
 // Preconditions:
-//  * f is an ELF file
-func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) {
-	bin, ac, err := loadInitialELF(ctx, m, fs, f)
+//  * args.File is an ELF file
+func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error) {
+	bin, ac, err := loadInitialELF(ctx, args.MemoryManager, args.Features, args.File)
 	if err != nil {
 		ctx.Infof("Error loading binary: %v", err)
 		return loadedELF{}, nil, err
@@ -640,7 +640,12 @@ func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace
 
 	var interp loadedELF
 	if bin.interpreter != "" {
-		d, i, err := openPath(ctx, mounts, root, wd, maxTraversals, bin.interpreter, true /*resolveFinal*/)
+		// Even if we do not allow the final link of the script to be
+		// resolved, the interpreter should still be resolved if it is
+		// a symlink.
+		args.ResolveFinal = true
+		args.Filename = bin.interpreter
+		d, i, err := openPath(ctx, args)
 		if err != nil {
 			ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err)
 			return loadedELF{}, nil, err
@@ -649,7 +654,7 @@ func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace
 		// We don't need the Dirent.
 		d.DecRef()
 
-		interp, err = loadInterpreterELF(ctx, m, i, bin)
+		interp, err = loadInterpreterELF(ctx, args.MemoryManager, i, bin)
 		if err != nil {
 			ctx.Infof("Error loading interpreter: %v", err)
 			return loadedELF{}, nil, err
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index f5303491d..818941762 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package loader loads a binary into a MemoryManager.
+// Package loader loads an executable file into a MemoryManager.
 package loader
 
 import (
@@ -35,6 +35,48 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// LoadArgs holds specifications for an executable file to be loaded.
+type LoadArgs struct {
+	// MemoryManager is the memory manager to load the executable into.
+	MemoryManager *mm.MemoryManager
+
+	// Mounts is the mount namespace in which to look up Filename.
+	Mounts *fs.MountNamespace
+
+	// Root is the root directory under which to look up Filename.
+	Root *fs.Dirent
+
+	// WorkingDirectory is the working directory under which to look up
+	// Filename.
+	WorkingDirectory *fs.Dirent
+
+	// RemainingTraversals is the maximum number of symlinks to follow to
+	// resolve Filename. This counter is passed by reference to keep it
+	// updated throughout the call stack.
+	RemainingTraversals *uint
+
+	// ResolveFinal indicates whether the final link of Filename should be
+	// resolved, if it is a symlink.
+	ResolveFinal bool
+
+	// Filename is the path for the executable.
+	Filename string
+
+	// File is an open fs.File object of the executable. If File is not
+	// nil, then File will be loaded and Filename will be ignored.
+	File *fs.File
+
+	// Argv is the vector of arguments to pass to the executable.
+	Argv []string
+
+	// Envv is the vector of environment variables to pass to the
+	// executable.
+	Envv []string
+
+	// Features specifies the CPU feature set for the executable.
+	Features *cpuid.FeatureSet
+}
+
 // readFull behaves like io.ReadFull for an *fs.File.
 func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
 	var total int64
@@ -51,24 +93,24 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in
 	return total, nil
 }
 
-// openPath opens name for loading.
+// openPath opens args.Filename for loading.
 //
-// openPath returns the fs.Dirent and an *fs.File for name, which is not
-// installed in the Task FDTable. The caller takes ownership of both.
+// openPath returns the fs.Dirent and an *fs.File for args.Filename, which is
+// not installed in the Task FDTable. The caller takes ownership of both.
 //
-// name must be a readable, executable, regular file.
-func openPath(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, name string, resolveFinal bool) (*fs.Dirent, *fs.File, error) {
+// args.Filename must be a readable, executable, regular file.
+func openPath(ctx context.Context, args LoadArgs) (*fs.Dirent, *fs.File, error) {
 	var err error
-	if name == "" {
+	if args.Filename == "" {
 		ctx.Infof("cannot open empty name")
 		return nil, nil, syserror.ENOENT
 	}
 
 	var d *fs.Dirent
-	if resolveFinal {
-		d, err = mounts.FindInode(ctx, root, wd, name, maxTraversals)
+	if args.ResolveFinal {
+		d, err = args.Mounts.FindInode(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals)
 	} else {
-		d, err = mounts.FindLink(ctx, root, wd, name, maxTraversals)
+		d, err = args.Mounts.FindLink(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals)
 	}
 	if err != nil {
 		return nil, nil, err
@@ -77,11 +119,11 @@ func openPath(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Diren
 	// Open file will take a reference to Dirent, so destroy this one.
 	defer d.DecRef()
 
-	if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
+	if !args.ResolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
 		return nil, nil, syserror.ELOOP
 	}
 
-	return openFile(ctx, nil, d, name)
+	return openFile(ctx, nil, d, args.Filename)
 }
 
 // openFile takes that file's Dirent and performs checks on it. If provided a
@@ -182,34 +224,33 @@ const (
 	maxLoaderAttempts = 6
 )
 
-// loadBinary loads a binary that is pointed to by "file". If nil, the path
-// "filename" is resolved and loaded.
+// loadExecutable loads an executable that is pointed to by args.File. If nil,
+// the path args.Filename is resolved and loaded. If the executable is an
+// interpreter script rather than an ELF, the binary of the corresponding
+// interpreter will be loaded.
 //
 // It returns:
 //  * loadedELF, description of the loaded binary
 //  * arch.Context matching the binary arch
 //  * fs.Dirent of the binary file
-//  * Possibly updated argv
-func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, features *cpuid.FeatureSet, filename string, passedFile *fs.File, argv []string, resolveFinal bool) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
+//  * Possibly updated args.Argv
+func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
 	for i := 0; i < maxLoaderAttempts; i++ {
 		var (
 			d   *fs.Dirent
-			f   *fs.File
 			err error
 		)
-		if passedFile == nil {
-			d, f, err = openPath(ctx, mounts, root, wd, remainingTraversals, filename, resolveFinal)
+		if args.File == nil {
+			d, args.File, err = openPath(ctx, args)
 		} else {
-			d, f, err = openFile(ctx, passedFile, nil, "")
-			// Set to nil in case we loop on a Interpreter Script.
-			passedFile = nil
+			d, args.File, err = openFile(ctx, args.File, nil, "")
 		}
 
 		if err != nil {
-			ctx.Infof("Error opening %s: %v", filename, err)
+			ctx.Infof("Error opening %s: %v", args.Filename, err)
 			return loadedELF{}, nil, nil, nil, err
 		}
-		defer f.DecRef()
+		defer args.File.DecRef()
 		// We will return d in the successful case, but defer a DecRef
 		// for intermediate loops and failure cases.
 		defer d.DecRef()
@@ -217,9 +258,9 @@ func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamesp
 		// Check the header. Is this an ELF or interpreter script?
 		var hdr [4]uint8
 		// N.B. We assume that reading from a regular file cannot block.
-		_, err = readFull(ctx, f, usermem.BytesIOSequence(hdr[:]), 0)
-		// Allow unexpected EOF, as a valid executable could be only three
-		// bytes (e.g., #!a).
+		_, err = readFull(ctx, args.File, usermem.BytesIOSequence(hdr[:]), 0)
+		// Allow unexpected EOF, as a valid executable could be only
+		// three bytes (e.g., #!a).
 		if err != nil && err != io.ErrUnexpectedEOF {
 			if err == io.EOF {
 				err = syserror.ENOEXEC
@@ -229,33 +270,33 @@ func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamesp
 
 		switch {
 		case bytes.Equal(hdr[:], []byte(elfMagic)):
-			loaded, ac, err := loadELF(ctx, m, mounts, root, wd, remainingTraversals, features, f)
+			loaded, ac, err := loadELF(ctx, args)
 			if err != nil {
 				ctx.Infof("Error loading ELF: %v", err)
 				return loadedELF{}, nil, nil, nil, err
 			}
 			// An ELF is always terminal. Hold on to d.
 			d.IncRef()
-			return loaded, ac, d, argv, err
+			return loaded, ac, d, args.Argv, err
 		case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)):
-			newpath, newargv, err := parseInterpreterScript(ctx, filename, f, argv)
+			args.Filename, args.Argv, err = parseInterpreterScript(ctx, args.Filename, args.File, args.Argv)
 			if err != nil {
 				ctx.Infof("Error loading interpreter script: %v", err)
 				return loadedELF{}, nil, nil, nil, err
 			}
-			filename = newpath
-			argv = newargv
 		default:
 			ctx.Infof("Unknown magic: %v", hdr)
 			return loadedELF{}, nil, nil, nil, syserror.ENOEXEC
 		}
+		// Set to nil in case we loop on a Interpreter Script.
+		args.File = nil
 	}
 
 	return loadedELF{}, nil, nil, nil, syserror.ELOOP
 }
 
-// Load loads "file" into a MemoryManager. If file is nil, the path "filename"
-// is resolved and loaded instead.
+// Load loads args.File into a MemoryManager. If args.File is nil, the path
+// args.Filename is resolved and loaded instead.
 //
 // If Load returns ErrSwitchFile it should be called again with the returned
 // path and argv.
@@ -263,37 +304,37 @@ func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamesp
 // Preconditions:
 //  * The Task MemoryManager is empty.
 //  * Load is called on the Task goroutine.
-func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, file *fs.File, argv, envv []string, resolveFinal bool, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
-	// Load the binary itself.
-	loaded, ac, d, argv, err := loadBinary(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, resolveFinal)
+func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
+	// Load the executable itself.
+	loaded, ac, d, newArgv, err := loadExecutable(ctx, args)
 	if err != nil {
-		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", filename, err), syserr.FromError(err).ToLinux())
+		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
 	}
 	defer d.DecRef()
 
 	// Load the VDSO.
-	vdsoAddr, err := loadVDSO(ctx, m, vdso, loaded)
+	vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded)
 	if err != nil {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Error loading VDSO: %v", err), syserr.FromError(err).ToLinux())
 	}
 
 	// Setup the heap. brk starts at the next page after the end of the
-	// binary. Userspace can assume that the remainer of the page after
+	// executable. Userspace can assume that the remainer of the page after
 	// loaded.end is available for its use.
 	e, ok := loaded.end.RoundUp()
 	if !ok {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("brk overflows: %#x", loaded.end), linux.ENOEXEC)
 	}
-	m.BrkSetup(ctx, e)
+	args.MemoryManager.BrkSetup(ctx, e)
 
 	// Allocate our stack.
-	stack, err := allocStack(ctx, m, ac)
+	stack, err := allocStack(ctx, args.MemoryManager, ac)
 	if err != nil {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to allocate stack: %v", err), syserr.FromError(err).ToLinux())
 	}
 
 	// Push the original filename to the stack, for AT_EXECFN.
-	execfn, err := stack.Push(filename)
+	execfn, err := stack.Push(args.Filename)
 	if err != nil {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push exec filename: %v", err), syserr.FromError(err).ToLinux())
 	}
@@ -327,11 +368,12 @@ func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, r
 	}...)
 	auxv = append(auxv, extraAuxv...)
 
-	sl, err := stack.Load(argv, envv, auxv)
+	sl, err := stack.Load(newArgv, args.Envv, auxv)
 	if err != nil {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load stack: %v", err), syserr.FromError(err).ToLinux())
 	}
 
+	m := args.MemoryManager
 	m.SetArgvStart(sl.ArgvStart)
 	m.SetArgvEnd(sl.ArgvEnd)
 	m.SetEnvvStart(sl.EnvvStart)
@@ -342,7 +384,7 @@ func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, r
 	ac.SetIP(uintptr(loaded.entry))
 	ac.SetStack(uintptr(stack.Bottom))
 
-	name := path.Base(filename)
+	name := path.Base(args.Filename)
 	if len(name) > linux.TASK_COMM_LEN-1 {
 		name = name[:linux.TASK_COMM_LEN-1]
 	}
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index fb2c1777f..4c0bf96e4 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -79,6 +79,7 @@ go_library(
         "//pkg/sentry/kernel/signalfd",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
         "//pkg/sentry/safemem",
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index effe16186..2476f8858 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -147,8 +148,21 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 	}
 
 	// Load the new TaskContext.
-	maxTraversals := uint(linux.MaxSymlinkTraversals)
-	tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, pathname, executable, argv, envv, resolveFinal, t.Arch().FeatureSet())
+	remainingTraversals := uint(linux.MaxSymlinkTraversals)
+	loadArgs := loader.LoadArgs{
+		Mounts:              t.MountNamespace(),
+		Root:                root,
+		WorkingDirectory:    wd,
+		RemainingTraversals: &remainingTraversals,
+		ResolveFinal:        resolveFinal,
+		Filename:            pathname,
+		File:                executable,
+		Argv:                argv,
+		Envv:                envv,
+		Features:            t.Arch().FeatureSet(),
+	}
+
+	tc, se := t.Kernel().LoadTaskImage(t, loadArgs)
 	if se != nil {
 		return 0, nil, se.ToError()
 	}
-- 
cgit v1.2.3


From dec831b4939a6332cac5d186a604ff2cbbcaf7af Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Thu, 24 Oct 2019 03:18:18 +0000
Subject: Cast the Stat_t.Nlink to uint64 on arm64.

Since the syscall.Stat_t.Nlink is defined as different types on
amd64 and arm64(uint64 and uint32 respectively), we need to cast
them to a unified uint64 type in gVisor code.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I7542b99b195c708f3fc49b1cbe6adebdd2f6e96b
---
 pkg/p9/p9.go             | 2 +-
 runsc/fsgofer/fsgofer.go | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go
index 25530adca..6039f5a42 100644
--- a/pkg/p9/p9.go
+++ b/pkg/p9/p9.go
@@ -814,7 +814,7 @@ func StatToAttr(s *syscall.Stat_t, req AttrMask) (Attr, AttrMask) {
 		attr.Mode = FileMode(s.Mode)
 	}
 	if req.NLink {
-		attr.NLink = s.Nlink
+		attr.NLink = uint64(s.Nlink)
 	}
 	if req.UID {
 		attr.UID = UID(s.Uid)
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 3fceecb3d..18b853e2e 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -601,7 +601,7 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error)
 		Mode:             p9.FileMode(stat.Mode),
 		UID:              p9.UID(stat.Uid),
 		GID:              p9.GID(stat.Gid),
-		NLink:            stat.Nlink,
+		NLink:            uint64(stat.Nlink),
 		RDev:             stat.Rdev,
 		Size:             uint64(stat.Size),
 		BlockSize:        uint64(stat.Blksize),
-- 
cgit v1.2.3


From e9d43f9022e014a31d40b7d04c5e7f5b6d3be2b9 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Thu, 17 Oct 2019 14:51:01 +0800
Subject: Add tools/go_stateify to support Arm64

2 jobs were finished in this patch:
  1, add a new attribute to specify the target platform.
  2, check the source files that whether we can build this file for target platform by checking file name and build tags.
Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 tools/go_stateify/defs.bzl |   9 ++++
 tools/go_stateify/main.go  | 116 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+)

diff --git a/tools/go_stateify/defs.bzl b/tools/go_stateify/defs.bzl
index 3ce36c1c8..a7961c2fb 100644
--- a/tools/go_stateify/defs.bzl
+++ b/tools/go_stateify/defs.bzl
@@ -44,6 +44,7 @@ def _go_stateify_impl(ctx):
     # Run the stateify command.
     args = ["-output=%s" % output.path]
     args += ["-pkg=%s" % ctx.attr.package]
+    args += ["-arch=%s" % ctx.attr.arch]
     if ctx.attr._statepkg:
         args += ["-statepkg=%s" % ctx.attr._statepkg]
     if ctx.attr.imports:
@@ -83,6 +84,10 @@ for statified types.
             doc = "The package name for the input sources.",
             mandatory = True,
         ),
+        "arch": attr.string(
+            doc = "Target platform.",
+            mandatory = True,
+        ),
         "out": attr.output(
             doc = """
 The name of the generated file output. This must not conflict with any other
@@ -118,6 +123,10 @@ def go_library(name, srcs, deps = [], imports = [], **kwargs):
             srcs = [src for src in srcs if src.endswith(".go")],
             imports = imports,
             package = name,
+            arch = select({
+                   "@bazel_tools//src/conditions:linux_aarch64": "arm64",
+                   "//conditions:default": "amd64",
+            }),
             out = name + "_state_autogen.go",
         )
         all_srcs = srcs + [name + "_state_autogen.go"]
diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go
index db7a7107b..47c8ea1d7 100644
--- a/tools/go_stateify/main.go
+++ b/tools/go_stateify/main.go
@@ -22,7 +22,9 @@ import (
 	"go/ast"
 	"go/parser"
 	"go/token"
+	"io/ioutil"
 	"os"
+	"path/filepath"
 	"reflect"
 	"strings"
 	"sync"
@@ -33,8 +35,117 @@ var (
 	imports  = flag.String("imports", "", "extra imports for the output file")
 	output   = flag.String("output", "", "output file")
 	statePkg = flag.String("statepkg", "", "state import package; defaults to empty")
+	arch     = flag.String("arch", "", "specify the target platform")
 )
 
+// The known architectures.
+var okgoarch = []string{
+	"386",
+	"amd64",
+	"arm",
+	"arm64",
+	"mips",
+	"mipsle",
+	"mips64",
+	"mips64le",
+	"ppc64",
+	"ppc64le",
+	"riscv64",
+	"s390x",
+	"sparc64",
+	"wasm",
+}
+
+// readfile returns the content of the named file.
+func readfile(file string) string {
+	data, err := ioutil.ReadFile(file)
+	if err != nil {
+		panic(fmt.Sprintf("readfile err: %v", err))
+	}
+	return string(data)
+}
+
+// matchfield reports whether the field (x,y,z) matches this build.
+// all the elements in the field must be satisfied.
+func matchfield(f string, goarch string) bool {
+	for _, tag := range strings.Split(f, ",") {
+		if !matchtag(tag, goarch) {
+			return false
+		}
+	}
+	return true
+}
+
+// matchtag reports whether the tag (x or !x) matches this build.
+func matchtag(tag string, goarch string) bool {
+	if tag == "" {
+		return false
+	}
+	if tag[0] == '!' {
+		if len(tag) == 1 || tag[1] == '!' {
+			return false
+		}
+		return !matchtag(tag[1:], goarch)
+	}
+	return tag == goarch
+}
+
+// canBuild reports whether we can build this file for target platform by checking file name and build tags.
+// The code is derived from the Go source cmd.dist.build.shouldbuild.
+func canBuild(file, goTargetArch string) bool {
+	name := filepath.Base(file)
+	excluded := func(list []string, ok string) bool {
+		for _, x := range list {
+			if x == ok || (ok == "android" && x == "linux") || (ok == "illumos" && x == "solaris") {
+				continue
+			}
+			i := strings.Index(name, x)
+			if i <= 0 || name[i-1] != '_' {
+				continue
+			}
+			i += len(x)
+			if i == len(name) || name[i] == '.' || name[i] == '_' {
+				return true
+			}
+		}
+		return false
+	}
+	if excluded(okgoarch, goTargetArch) {
+		return false
+	}
+
+	// Check file contents for // +build lines.
+	for _, p := range strings.Split(readfile(file), "\n") {
+		p = strings.TrimSpace(p)
+		if p == "" {
+			continue
+		}
+		code := p
+		i := strings.Index(code, "//")
+		if i > 0 {
+			code = strings.TrimSpace(code[:i])
+		}
+		if !strings.HasPrefix(p, "//") {
+			break
+		}
+		if !strings.Contains(p, "+build") {
+			continue
+		}
+		fields := strings.Fields(p[2:])
+		if len(fields) < 1 || fields[0] != "+build" {
+			continue
+		}
+		for _, p := range fields[1:] {
+			if matchfield(p, goTargetArch) {
+				goto fieldmatch
+			}
+		}
+		return false
+	fieldmatch:
+	}
+	return true
+}
+
 // resolveTypeName returns a qualified type name.
 func resolveTypeName(name string, typ ast.Expr) (field string, qualified string) {
 	for done := false; !done; {
@@ -256,6 +367,11 @@ func main() {
 			fmt.Fprintf(os.Stderr, "Input %q can't be parsed: %v\n", filename, err)
 			os.Exit(1)
 		}
+
+		if !canBuild(filename, *arch) {
+			continue
+		}
+
 		files = append(files, f)
 	}
 
-- 
cgit v1.2.3


From 198f1cddb82d46570ae63cb704b4a1b88cf0de1f Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 28 Oct 2019 10:18:55 -0700
Subject: Update comment

FDTable.GetFile doesn't exist.

PiperOrigin-RevId: 277089842
---
 pkg/sentry/kernel/task.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index c82ef5486..11a8c6c87 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -709,9 +709,9 @@ func (t *Task) FDTable() *FDTable {
 	return t.fdTable
 }
 
-// GetFile is a convenience wrapper t.FDTable().GetFile.
+// GetFile is a convenience wrapper t.FDTable().Get.
 //
-// Precondition: same as FDTable.
+// Precondition: same as FDTable.Get.
 func (t *Task) GetFile(fd int32) *fs.File {
 	f, _ := t.fdTable.Get(fd)
 	return f
-- 
cgit v1.2.3


From 0864549ecc26e734bae3dcf40e0d761232f8bdad Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Mon, 28 Oct 2019 18:19:12 -0700
Subject: Use the user supplied TCP MSS when creating a new active socket

This change supports using a user supplied TCP MSS for new active TCP
connections. Note, the user supplied MSS must be less than or equal to the
maximum possible MSS for a TCP connection's route. If it is greater than the
maximum possible MSS, the maximum possible MSS will be used as the connection's
MSS instead.

This change does not use this user supplied MSS for connections accepted from
listening sockets - that will come in a later change.

Test: Test that outgoing TCP SYN segments contain a TCP MSS option with the user
supplied MSS if it is not greater than the maximum possible MSS for the route.
PiperOrigin-RevId: 277185125
---
 pkg/tcpip/stack/nic.go              |   5 ++
 pkg/tcpip/transport/tcp/accept.go   |   6 +-
 pkg/tcpip/transport/tcp/connect.go  |   2 +-
 pkg/tcpip/transport/tcp/endpoint.go |  24 ++++++-
 pkg/tcpip/transport/tcp/tcp_test.go | 124 ++++++++++++++++++++++++++++++++++++
 5 files changed, 155 insertions(+), 6 deletions(-)

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index a867f8c00..a01a208b8 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -89,6 +89,11 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback
 	// TODO(b/141011931): Validate a LinkEndpoint (ep) is valid. For
 	// example, make sure that the link address it provides is a valid
 	// unicast ethernet address.
+
+	// TODO(b/143357959): RFC 8200 section 5 requires that IPv6 endpoints
+	// observe an MTU of at least 1280 bytes. Ensure that this requirement
+	// of IPv6 is supported on this endpoint's LinkEndpoint.
+
 	nic := &NIC{
 		stack:      stack,
 		id:         id,
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 65c346046..1dd00d026 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -400,6 +400,9 @@ func (e *endpoint) acceptQueueIsFull() bool {
 // handleListenSegment is called when a listening endpoint receives a segment
 // and needs to handle it.
 func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
+	// TODO(b/143300739): Use the userMSS of the listening socket
+	// for accepted sockets.
+
 	switch s.flags {
 	case header.TCPFlagSyn:
 		opts := parseSynSegmentOptions(s)
@@ -434,13 +437,12 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			//
 			// Enable Timestamp option if the original syn did have
 			// the timestamp option specified.
-			mss := mssForRoute(&s.route)
 			synOpts := header.TCPSynOptions{
 				WS:    -1,
 				TS:    opts.TS,
 				TSVal: tcpTimeStamp(timeStampOffset()),
 				TSEcr: opts.TSVal,
-				MSS:   uint16(mss),
+				MSS:   mssForRoute(&s.route),
 			}
 			e.sendSynTCP(&s.route, s.id, e.ttl, e.sendTOS, header.TCPFlagSyn|header.TCPFlagAck, cookie, s.sequenceNumber+1, ctx.rcvWnd, synOpts)
 			e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment()
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 790e89cc3..ca982c451 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -442,7 +442,7 @@ func (h *handshake) execute() *tcpip.Error {
 
 	// Send the initial SYN segment and loop until the handshake is
 	// completed.
-	h.ep.amss = mssForRoute(&h.ep.route)
+	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
 
 	synOpts := header.TCPSynOptions{
 		WS:            h.rcvWndScale,
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 6ca0d73a9..8234a8b53 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -411,7 +411,7 @@ type endpoint struct {
 
 	// userMSS if non-zero is the MSS value explicitly set by the user
 	// for this endpoint using the TCP_MAXSEG setsockopt.
-	userMSS int
+	userMSS uint16
 
 	// The following fields are used to manage the send buffer. When
 	// segments are ready to be sent, they are added to sndQueue and the
@@ -504,6 +504,21 @@ type endpoint struct {
 	stats Stats `state:"nosave"`
 }
 
+// calculateAdvertisedMSS calculates the MSS to advertise.
+//
+// If userMSS is non-zero and is not greater than the maximum possible MSS for
+// r, it will be used; otherwise, the maximum possible MSS will be used.
+func calculateAdvertisedMSS(userMSS uint16, r stack.Route) uint16 {
+	// The maximum possible MSS is dependent on the route.
+	maxMSS := mssForRoute(&r)
+
+	if userMSS != 0 && userMSS < maxMSS {
+		return userMSS
+	}
+
+	return maxMSS
+}
+
 // StopWork halts packet processing. Only to be used in tests.
 func (e *endpoint) StopWork() {
 	e.workMu.Lock()
@@ -752,7 +767,9 @@ func (e *endpoint) initialReceiveWindow() int {
 	if rcvWnd > math.MaxUint16 {
 		rcvWnd = math.MaxUint16
 	}
-	routeWnd := InitialCwnd * int(mssForRoute(&e.route)) * 2
+
+	// Use the user supplied MSS, if available.
+	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
 	if rcvWnd > routeWnd {
 		rcvWnd = routeWnd
 	}
@@ -1206,7 +1223,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 			return tcpip.ErrInvalidOptionValue
 		}
 		e.mu.Lock()
-		e.userMSS = int(userMSS)
+		e.userMSS = uint16(userMSS)
 		e.mu.Unlock()
 		e.notifyProtocolGoroutine(notifyMSSChanged)
 		return nil
@@ -2383,5 +2400,6 @@ func (e *endpoint) Stats() tcpip.EndpointStats {
 }
 
 func mssForRoute(r *stack.Route) uint16 {
+	// TODO(b/143359391): Respect TCP Min and Max size.
 	return uint16(r.MTU() - header.TCPMinimumSize)
 }
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 6d808328c..126f26ed3 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -474,6 +474,130 @@ func TestSimpleReceive(t *testing.T) {
 	)
 }
 
+// TestUserSuppliedMSSOnConnectV4 tests that the user supplied MSS is used when
+// creating a new active IPv4 TCP socket. It should be present in the sent TCP
+// SYN segment.
+func TestUserSuppliedMSSOnConnectV4(t *testing.T) {
+	const mtu = 5000
+	const maxMSS = mtu - header.IPv4MinimumSize - header.TCPMinimumSize
+	tests := []struct {
+		name   string
+		setMSS uint16
+		expMSS uint16
+	}{
+		{
+			"EqualToMaxMSS",
+			maxMSS,
+			maxMSS,
+		},
+		{
+			"LessThanMTU",
+			maxMSS - 1,
+			maxMSS - 1,
+		},
+		{
+			"GreaterThanMTU",
+			maxMSS + 1,
+			maxMSS,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			c := context.New(t, mtu)
+			defer c.Cleanup()
+
+			c.Create(-1)
+
+			// Set the MSS socket option.
+			opt := tcpip.MaxSegOption(test.setMSS)
+			if err := c.EP.SetSockOpt(opt); err != nil {
+				t.Fatalf("SetSockOpt(%#v) failed: %s", opt, err)
+			}
+
+			// Get expected window size.
+			rcvBufSize, err := c.EP.GetSockOptInt(tcpip.ReceiveBufferSizeOption)
+			if err != nil {
+				t.Fatalf("GetSockOpt(%v) failed: %s", tcpip.ReceiveBufferSizeOption, err)
+			}
+			ws := tcp.FindWndScale(seqnum.Size(rcvBufSize))
+
+			// Start connection attempt to IPv4 address.
+			if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
+				t.Fatalf("Unexpected return value from Connect: %v", err)
+			}
+
+			// Receive SYN packet with our user supplied MSS.
+			checker.IPv4(t, c.GetPacket(), checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.TCPFlags(header.TCPFlagSyn),
+				checker.TCPSynOptions(header.TCPSynOptions{MSS: test.expMSS, WS: ws})))
+		})
+	}
+}
+
+// TestUserSuppliedMSSOnConnectV6 tests that the user supplied MSS is used when
+// creating a new active IPv6 TCP socket. It should be present in the sent TCP
+// SYN segment.
+func TestUserSuppliedMSSOnConnectV6(t *testing.T) {
+	const mtu = 5000
+	const maxMSS = mtu - header.IPv6MinimumSize - header.TCPMinimumSize
+	tests := []struct {
+		name   string
+		setMSS uint16
+		expMSS uint16
+	}{
+		{
+			"EqualToMaxMSS",
+			maxMSS,
+			maxMSS,
+		},
+		{
+			"LessThanMTU",
+			maxMSS - 1,
+			maxMSS - 1,
+		},
+		{
+			"GreaterThanMTU",
+			maxMSS + 1,
+			maxMSS,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			c := context.New(t, mtu)
+			defer c.Cleanup()
+
+			c.CreateV6Endpoint(true)
+
+			// Set the MSS socket option.
+			opt := tcpip.MaxSegOption(test.setMSS)
+			if err := c.EP.SetSockOpt(opt); err != nil {
+				t.Fatalf("SetSockOpt(%#v) failed: %s", opt, err)
+			}
+
+			// Get expected window size.
+			rcvBufSize, err := c.EP.GetSockOptInt(tcpip.ReceiveBufferSizeOption)
+			if err != nil {
+				t.Fatalf("GetSockOpt(%v) failed: %s", tcpip.ReceiveBufferSizeOption, err)
+			}
+			ws := tcp.FindWndScale(seqnum.Size(rcvBufSize))
+
+			// Start connection attempt to IPv6 address.
+			if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestV6Addr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
+				t.Fatalf("Unexpected return value from Connect: %v", err)
+			}
+
+			// Receive SYN packet with our user supplied MSS.
+			checker.IPv6(t, c.GetV6Packet(), checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.TCPFlags(header.TCPFlagSyn),
+				checker.TCPSynOptions(header.TCPSynOptions{MSS: test.expMSS, WS: ws})))
+		})
+	}
+}
+
 func TestTOSV4(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
-- 
cgit v1.2.3


From dbeaf9d4dbeea4cde670c3d07a78b56a45fa8f21 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Mon, 28 Oct 2019 18:48:35 -0700
Subject: Deflake TestCheckpointRestore

PiperOrigin-RevId: 277189064
---
 test/e2e/integration_test.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go
index 7cc0de129..28064e557 100644
--- a/test/e2e/integration_test.go
+++ b/test/e2e/integration_test.go
@@ -175,6 +175,9 @@ func TestCheckpointRestore(t *testing.T) {
 		t.Fatal(err)
 	}
 
+	// TODO(b/143498576): Remove after github.com/moby/moby/issues/38963 is fixed.
+	time.Sleep(1 * time.Second)
+
 	if err := d.Restore("test"); err != nil {
 		t.Fatal("docker restore failed:", err)
 	}
-- 
cgit v1.2.3


From 29273b03842a85bce8314799348231520ceb6e9c Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 29 Oct 2019 10:03:18 -0700
Subject: Disallow execveat on interpreter scripts with fd opened with
 O_CLOEXEC.

When an interpreter script is opened with O_CLOEXEC and the resulting fd is
passed into execveat, an ENOENT error should occur (the script would otherwise
be inaccessible to the interpreter). This matches the actual behavior of
Linux's execveat.

PiperOrigin-RevId: 277306680
---
 pkg/sentry/kernel/kernel.go             |  1 +
 pkg/sentry/loader/loader.go             |  9 +++++++++
 pkg/sentry/syscalls/linux/sys_thread.go |  5 ++++-
 test/syscalls/linux/exec.cc             | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index fcfe7a16d..e64d648e2 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -812,6 +812,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		ResolveFinal:        true,
 		Filename:            args.Filename,
 		File:                args.File,
+		CloseOnExec:         false,
 		Argv:                args.Argv,
 		Envv:                args.Envv,
 		Features:            k.featureSet,
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 818941762..f75ebe08a 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -66,6 +66,12 @@ type LoadArgs struct {
 	// nil, then File will be loaded and Filename will be ignored.
 	File *fs.File
 
+	// CloseOnExec indicates that the executable (or one of its parent
+	// directories) was opened with O_CLOEXEC. If the executable is an
+	// interpreter script, then cause an ENOENT error to occur, since the
+	// script would otherwise be inaccessible to the interpreter.
+	CloseOnExec bool
+
 	// Argv is the vector of arguments to pass to the executable.
 	Argv []string
 
@@ -279,6 +285,9 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 			d.IncRef()
 			return loaded, ac, d, args.Argv, err
 		case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)):
+			if args.CloseOnExec {
+				return loadedELF{}, nil, nil, nil, syserror.ENOENT
+			}
 			args.Filename, args.Argv, err = parseInterpreterScript(ctx, args.Filename, args.File, args.Argv)
 			if err != nil {
 				ctx.Infof("Error loading interpreter script: %v", err)
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 2476f8858..4115116ff 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -120,6 +120,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 
 	var wd *fs.Dirent
 	var executable *fs.File
+	var closeOnExec bool
 	if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) {
 		// Even if the pathname is absolute, we may still need the wd
 		// for interpreter scripts if the path of the interpreter is
@@ -127,11 +128,12 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 		wd = t.FSContext().WorkingDirectory()
 	} else {
 		// Need to extract the given FD.
-		f := t.GetFile(dirFD)
+		f, fdFlags := t.FDTable().Get(dirFD)
 		if f == nil {
 			return 0, nil, syserror.EBADF
 		}
 		defer f.DecRef()
+		closeOnExec = fdFlags.CloseOnExec
 
 		if atEmptyPath && len(pathname) == 0 {
 			executable = f
@@ -157,6 +159,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 		ResolveFinal:        resolveFinal,
 		Filename:            pathname,
 		File:                executable,
+		CloseOnExec:         closeOnExec,
 		Argv:                argv,
 		Envv:                envv,
 		Features:            t.Arch().FeatureSet(),
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 21a5ffd40..a9067df2a 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -681,6 +681,39 @@ TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) {
                 ArgEnvExitStatus(0, 0), "");
 }
 
+TEST(ExecveatTest, BasicWithCloexecFD) {
+  std::string path = WorkloadPath(kBasicWorkload);
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC));
+
+  CheckExecveat(fd.get(), "", {path}, {}, AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH,
+                ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, InterpreterScriptWithCloexecFD) {
+  std::string path = WorkloadPath(kExitScript);
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC));
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(fd.get(), "", {path}, {},
+                                            AT_EMPTY_PATH, /*child=*/nullptr,
+                                            &execve_errno));
+  EXPECT_EQ(execve_errno, ENOENT);
+}
+
+TEST(ExecveatTest, InterpreterScriptWithCloexecDirFD) {
+  std::string absolute_path = WorkloadPath(kExitScript);
+  std::string parent_dir = std::string(Dirname(absolute_path));
+  std::string base = std::string(Basename(absolute_path));
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_CLOEXEC | O_DIRECTORY));
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(dirfd.get(), base, {base}, {},
+                                            /*flags=*/0, /*child=*/nullptr,
+                                            &execve_errno));
+  EXPECT_EQ(execve_errno, ENOENT);
+}
+
 TEST(ExecveatTest, InvalidFlags) {
   int execve_errno;
   ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(
-- 
cgit v1.2.3


From 41e2df1bdee527eccdc4622ef2013201afc8e2cc Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 29 Oct 2019 10:28:55 -0700
Subject: Support iterating an NDP options buffer.

This change helps support iterating over an NDP options buffer so that
implementations can handle all the NDP options present in an NDP packet.

Note, this change does not yet actually handle these options, it just provides
the tools to do so (in preparation for NDP's Prefix, Parameter, and a complete
implementation of Neighbor Discovery).

Tests: Unittests to make sure we can iterate over a valid NDP options buffer
that may contain multiple options. Also tests to check an iterator before
using it to see if the NDP options buffer is malformed.
PiperOrigin-RevId: 277312487
---
 pkg/tcpip/header/ndp_options.go | 163 ++++++++++++++++--
 pkg/tcpip/header/ndp_test.go    | 360 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 475 insertions(+), 48 deletions(-)

diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
index 98310ea23..a2b9d7435 100644
--- a/pkg/tcpip/header/ndp_options.go
+++ b/pkg/tcpip/header/ndp_options.go
@@ -16,6 +16,7 @@ package header
 
 import (
 	"encoding/binary"
+	"errors"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -30,9 +31,9 @@ const (
 	// Link Layer Option for an Ethernet address.
 	ndpTargetEthernetLinkLayerAddressSize = 8
 
-	// ndpPrefixInformationType is the type of the Prefix Information
+	// NDPPrefixInformationType is the type of the Prefix Information
 	// option, as per RFC 4861 section 4.6.2.
-	ndpPrefixInformationType = 3
+	NDPPrefixInformationType = 3
 
 	// ndpPrefixInformationLength is the expected length, in bytes, of the
 	// body of an NDP Prefix Information option, as per RFC 4861 section
@@ -95,9 +96,122 @@ const (
 	lengthByteUnits = 8
 )
 
+// NDPOptionIterator is an iterator of NDPOption.
+//
+// Note, between when an NDPOptionIterator is obtained and last used, no changes
+// to the NDPOptions may happen. Doing so may cause undefined and unexpected
+// behaviour. It is fine to obtain an NDPOptionIterator, iterate over the first
+// few NDPOption then modify the backing NDPOptions so long as the
+// NDPOptionIterator obtained before modification is no longer used.
+type NDPOptionIterator struct {
+	// The NDPOptions this NDPOptionIterator is iterating over.
+	opts NDPOptions
+}
+
+// Potential errors when iterating over an NDPOptions.
+var (
+	ErrNDPOptBufExhausted  = errors.New("Buffer unexpectedly exhausted")
+	ErrNDPOptZeroLength    = errors.New("NDP option has zero-valued Length field")
+	ErrNDPOptMalformedBody = errors.New("NDP option has a malformed body")
+)
+
+// Next returns the next element in the backing NDPOptions, or true if we are
+// done, or false if an error occured.
+//
+// The return can be read as option, done, error. Note, option should only be
+// used if done is false and error is nil.
+func (i *NDPOptionIterator) Next() (NDPOption, bool, error) {
+	for {
+		// Do we still have elements to look at?
+		if len(i.opts) == 0 {
+			return nil, true, nil
+		}
+
+		// Do we have enough bytes for an NDP option that has a Length
+		// field of at least 1? Note, 0 in the Length field is invalid.
+		if len(i.opts) < lengthByteUnits {
+			return nil, true, ErrNDPOptBufExhausted
+		}
+
+		// Get the Type field.
+		t := i.opts[0]
+
+		// Get the Length field.
+		l := i.opts[1]
+
+		// This would indicate an erroneous NDP option as the Length
+		// field should never be 0.
+		if l == 0 {
+			return nil, true, ErrNDPOptZeroLength
+		}
+
+		// How many bytes are in the option body?
+		numBytes := int(l) * lengthByteUnits
+		numBodyBytes := numBytes - 2
+
+		potentialBody := i.opts[2:]
+
+		// This would indicate an erroenous NDPOptions buffer as we ran
+		// out of the buffer in the middle of an NDP option.
+		if left := len(potentialBody); left < numBodyBytes {
+			return nil, true, ErrNDPOptBufExhausted
+		}
+
+		// Get only the options body, leaving the rest of the options
+		// buffer alone.
+		body := potentialBody[:numBodyBytes]
+
+		// Update opts with the remaining options body.
+		i.opts = i.opts[numBytes:]
+
+		switch t {
+		case NDPTargetLinkLayerAddressOptionType:
+			return NDPTargetLinkLayerAddressOption(body), false, nil
+
+		case NDPPrefixInformationType:
+			// Make sure the length of a Prefix Information option
+			// body is ndpPrefixInformationLength, as per RFC 4861
+			// section 4.6.2.
+			if numBodyBytes != ndpPrefixInformationLength {
+				return nil, true, ErrNDPOptMalformedBody
+			}
+
+			return NDPPrefixInformation(body), false, nil
+		default:
+			// We do not yet recognize the option, just skip for
+			// now. This is okay because RFC 4861 allows us to
+			// skip/ignore any unrecognized options. However,
+			// we MUST recognized all the options in RFC 4861.
+			//
+			// TODO(b/141487990): Handle all NDP options as defined
+			//                    by RFC 4861.
+		}
+	}
+}
+
 // NDPOptions is a buffer of NDP options as defined by RFC 4861 section 4.6.
 type NDPOptions []byte
 
+// Iter returns an iterator of NDPOption.
+//
+// If check is true, Iter will do an integrity check on the options by iterating
+// over it and returning an error if detected.
+//
+// See NDPOptionIterator for more information.
+func (b NDPOptions) Iter(check bool) (NDPOptionIterator, error) {
+	it := NDPOptionIterator{opts: b}
+
+	if check {
+		for it2 := it; true; {
+			if _, done, err := it2.Next(); err != nil || done {
+				return it, err
+			}
+		}
+	}
+
+	return it, nil
+}
+
 // Serialize serializes the provided list of NDP options into o.
 //
 // Note, b must be of sufficient size to hold all the options in s. See
@@ -137,15 +251,15 @@ func (b NDPOptions) Serialize(s NDPOptionsSerializer) int {
 	return done
 }
 
-// ndpOption is the set of functions to be implemented by all NDP option types.
-type ndpOption interface {
-	// Type returns the type of this ndpOption.
+// NDPOption is the set of functions to be implemented by all NDP option types.
+type NDPOption interface {
+	// Type returns the type of the receiver.
 	Type() uint8
 
-	// Length returns the length of the body of this ndpOption, in bytes.
+	// Length returns the length of the body of the receiver, in bytes.
 	Length() int
 
-	// serializeInto serializes this ndpOption into the provided byte
+	// serializeInto serializes the receiver into the provided byte
 	// buffer.
 	//
 	// Note, the caller MUST provide a byte buffer with size of at least
@@ -154,15 +268,15 @@ type ndpOption interface {
 	// buffer is not of sufficient size.
 	//
 	// serializeInto will return the number of bytes that was used to
-	// serialize this ndpOption. Implementers must only use the number of
-	// bytes required to serialize this ndpOption. Callers MAY provide a
+	// serialize the receiver. Implementers must only use the number of
+	// bytes required to serialize the receiver. Callers MAY provide a
 	// larger buffer than required to serialize into.
 	serializeInto([]byte) int
 }
 
 // paddedLength returns the length of o, in bytes, with any padding bytes, if
 // required.
-func paddedLength(o ndpOption) int {
+func paddedLength(o NDPOption) int {
 	l := o.Length()
 
 	if l == 0 {
@@ -201,7 +315,7 @@ func paddedLength(o ndpOption) int {
 }
 
 // NDPOptionsSerializer is a serializer for NDP options.
-type NDPOptionsSerializer []ndpOption
+type NDPOptionsSerializer []NDPOption
 
 // Length returns the total number of bytes required to serialize.
 func (b NDPOptionsSerializer) Length() int {
@@ -221,21 +335,34 @@ func (b NDPOptionsSerializer) Length() int {
 // where X is the value in Length multiplied by lengthByteUnits - 2 bytes.
 type NDPTargetLinkLayerAddressOption tcpip.LinkAddress
 
-// Type implements ndpOption.Type.
+// Type implements NDPOption.Type.
 func (o NDPTargetLinkLayerAddressOption) Type() uint8 {
 	return NDPTargetLinkLayerAddressOptionType
 }
 
-// Length implements ndpOption.Length.
+// Length implements NDPOption.Length.
 func (o NDPTargetLinkLayerAddressOption) Length() int {
 	return len(o)
 }
 
-// serializeInto implements ndpOption.serializeInto.
+// serializeInto implements NDPOption.serializeInto.
 func (o NDPTargetLinkLayerAddressOption) serializeInto(b []byte) int {
 	return copy(b, o)
 }
 
+// EthernetAddress will return an ethernet (MAC) address if the
+// NDPTargetLinkLayerAddressOption's body has at minimum EthernetAddressSize
+// bytes. If the body has more than EthernetAddressSize bytes, only the first
+// EthernetAddressSize bytes are returned as that is all that is needed for an
+// Ethernet address.
+func (o NDPTargetLinkLayerAddressOption) EthernetAddress() tcpip.LinkAddress {
+	if len(o) >= EthernetAddressSize {
+		return tcpip.LinkAddress(o[:EthernetAddressSize])
+	}
+
+	return tcpip.LinkAddress([]byte(nil))
+}
+
 // NDPPrefixInformation is the NDP Prefix Information option as defined by
 // RFC 4861 section 4.6.2.
 //
@@ -243,17 +370,17 @@ func (o NDPTargetLinkLayerAddressOption) serializeInto(b []byte) int {
 // ndpPrefixInformationLength bytes.
 type NDPPrefixInformation []byte
 
-// Type implements ndpOption.Type.
+// Type implements NDPOption.Type.
 func (o NDPPrefixInformation) Type() uint8 {
-	return ndpPrefixInformationType
+	return NDPPrefixInformationType
 }
 
-// Length implements ndpOption.Length.
+// Length implements NDPOption.Length.
 func (o NDPPrefixInformation) Length() int {
 	return ndpPrefixInformationLength
 }
 
-// serializeInto implements ndpOption.serializeInto.
+// serializeInto implements NDPOption.serializeInto.
 func (o NDPPrefixInformation) serializeInto(b []byte) int {
 	used := copy(b, o)
 
diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go
index 0bbf67a2b..ad6daafcd 100644
--- a/pkg/tcpip/header/ndp_test.go
+++ b/pkg/tcpip/header/ndp_test.go
@@ -36,18 +36,18 @@ func TestNDPNeighborSolicit(t *testing.T) {
 	ns := NDPNeighborSolicit(b)
 	addr := tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10")
 	if got := ns.TargetAddress(); got != addr {
-		t.Fatalf("got ns.TargetAddress = %s, want %s", got, addr)
+		t.Errorf("got ns.TargetAddress = %s, want %s", got, addr)
 	}
 
 	// Test updating the Target Address.
 	addr2 := tcpip.Address("\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x11")
 	ns.SetTargetAddress(addr2)
 	if got := ns.TargetAddress(); got != addr2 {
-		t.Fatalf("got ns.TargetAddress = %s, want %s", got, addr2)
+		t.Errorf("got ns.TargetAddress = %s, want %s", got, addr2)
 	}
 	// Make sure the address got updated in the backing buffer.
 	if got := tcpip.Address(b[ndpNSTargetAddessOffset:][:IPv6AddressSize]); got != addr2 {
-		t.Fatalf("got targetaddress buffer = %s, want %s", got, addr2)
+		t.Errorf("got targetaddress buffer = %s, want %s", got, addr2)
 	}
 }
 
@@ -65,56 +65,56 @@ func TestNDPNeighborAdvert(t *testing.T) {
 	na := NDPNeighborAdvert(b)
 	addr := tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10")
 	if got := na.TargetAddress(); got != addr {
-		t.Fatalf("got TargetAddress = %s, want %s", got, addr)
+		t.Errorf("got TargetAddress = %s, want %s", got, addr)
 	}
 
 	// Test getting the Router Flag.
 	if got := na.RouterFlag(); !got {
-		t.Fatalf("got RouterFlag = false, want = true")
+		t.Errorf("got RouterFlag = false, want = true")
 	}
 
 	// Test getting the Solicited Flag.
 	if got := na.SolicitedFlag(); got {
-		t.Fatalf("got SolicitedFlag = true, want = false")
+		t.Errorf("got SolicitedFlag = true, want = false")
 	}
 
 	// Test getting the Override Flag.
 	if got := na.OverrideFlag(); !got {
-		t.Fatalf("got OverrideFlag = false, want = true")
+		t.Errorf("got OverrideFlag = false, want = true")
 	}
 
 	// Test updating the Target Address.
 	addr2 := tcpip.Address("\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x11")
 	na.SetTargetAddress(addr2)
 	if got := na.TargetAddress(); got != addr2 {
-		t.Fatalf("got TargetAddress = %s, want %s", got, addr2)
+		t.Errorf("got TargetAddress = %s, want %s", got, addr2)
 	}
 	// Make sure the address got updated in the backing buffer.
 	if got := tcpip.Address(b[ndpNATargetAddressOffset:][:IPv6AddressSize]); got != addr2 {
-		t.Fatalf("got targetaddress buffer = %s, want %s", got, addr2)
+		t.Errorf("got targetaddress buffer = %s, want %s", got, addr2)
 	}
 
 	// Test updating the Router Flag.
 	na.SetRouterFlag(false)
 	if got := na.RouterFlag(); got {
-		t.Fatalf("got RouterFlag = true, want = false")
+		t.Errorf("got RouterFlag = true, want = false")
 	}
 
 	// Test updating the Solicited Flag.
 	na.SetSolicitedFlag(true)
 	if got := na.SolicitedFlag(); !got {
-		t.Fatalf("got SolicitedFlag = false, want = true")
+		t.Errorf("got SolicitedFlag = false, want = true")
 	}
 
 	// Test updating the Override Flag.
 	na.SetOverrideFlag(false)
 	if got := na.OverrideFlag(); got {
-		t.Fatalf("got OverrideFlag = true, want = false")
+		t.Errorf("got OverrideFlag = true, want = false")
 	}
 
 	// Make sure flags got updated in the backing buffer.
 	if got := b[ndpNAFlagsOffset]; got != 64 {
-		t.Fatalf("got flags byte = %d, want = 64")
+		t.Errorf("got flags byte = %d, want = 64")
 	}
 }
 
@@ -128,30 +128,66 @@ func TestNDPRouterAdvert(t *testing.T) {
 	ra := NDPRouterAdvert(b)
 
 	if got := ra.CurrHopLimit(); got != 64 {
-		t.Fatalf("got ra.CurrHopLimit = %d, want = 64", got)
+		t.Errorf("got ra.CurrHopLimit = %d, want = 64", got)
 	}
 
 	if got := ra.ManagedAddrConfFlag(); !got {
-		t.Fatalf("got ManagedAddrConfFlag = false, want = true")
+		t.Errorf("got ManagedAddrConfFlag = false, want = true")
 	}
 
 	if got := ra.OtherConfFlag(); got {
-		t.Fatalf("got OtherConfFlag = true, want = false")
+		t.Errorf("got OtherConfFlag = true, want = false")
 	}
 
 	if got, want := ra.RouterLifetime(), time.Second*258; got != want {
-		t.Fatalf("got ra.RouterLifetime = %d, want = %d", got, want)
+		t.Errorf("got ra.RouterLifetime = %d, want = %d", got, want)
 	}
 
 	if got, want := ra.ReachableTime(), time.Millisecond*50595078; got != want {
-		t.Fatalf("got ra.ReachableTime = %d, want = %d", got, want)
+		t.Errorf("got ra.ReachableTime = %d, want = %d", got, want)
 	}
 
 	if got, want := ra.RetransTimer(), time.Millisecond*117967114; got != want {
-		t.Fatalf("got ra.RetransTimer = %d, want = %d", got, want)
+		t.Errorf("got ra.RetransTimer = %d, want = %d", got, want)
 	}
 }
 
+// TestNDPTargetLinkLayerAddressOptionEthernetAddress tests getting the
+// Ethernet address from an NDPTargetLinkLayerAddressOption.
+func TestNDPTargetLinkLayerAddressOptionEthernetAddress(t *testing.T) {
+	tests := []struct {
+		name     string
+		buf      []byte
+		expected tcpip.LinkAddress
+	}{
+		{
+			"ValidMAC",
+			[]byte{1, 2, 3, 4, 5, 6},
+			tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+		},
+		{
+			"TLLBodyTooShort",
+			[]byte{1, 2, 3, 4, 5},
+			tcpip.LinkAddress([]byte(nil)),
+		},
+		{
+			"TLLBodyLargerThanNeeded",
+			[]byte{1, 2, 3, 4, 5, 6, 7, 8},
+			tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			tll := NDPTargetLinkLayerAddressOption(test.buf)
+			if got := tll.EthernetAddress(); got != test.expected {
+				t.Errorf("got tll.EthernetAddress = %s, want = %s", got, test.expected)
+			}
+		})
+	}
+
+}
+
 // TestNDPTargetLinkLayerAddressOptionSerialize tests serializing a
 // NDPTargetLinkLayerAddressOption.
 func TestNDPTargetLinkLayerAddressOptionSerialize(t *testing.T) {
@@ -194,6 +230,44 @@ func TestNDPTargetLinkLayerAddressOptionSerialize(t *testing.T) {
 			if !bytes.Equal(test.buf, test.expectedBuf) {
 				t.Fatalf("got b = %d, want = %d", test.buf, test.expectedBuf)
 			}
+
+			it, err := opts.Iter(true)
+			if err != nil {
+				t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+			}
+
+			if len(test.expectedBuf) > 0 {
+				next, done, err := it.Next()
+				if err != nil {
+					t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+				}
+				if done {
+					t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+				}
+				if got := next.Type(); got != NDPTargetLinkLayerAddressOptionType {
+					t.Fatalf("got Type %= %d, want = %d", got, NDPTargetLinkLayerAddressOptionType)
+				}
+				tll := next.(NDPTargetLinkLayerAddressOption)
+				if got, want := []byte(tll), test.expectedBuf[2:]; !bytes.Equal(got, want) {
+					t.Fatalf("got Next = (%x, _, _), want = (%x, _, _)", got, want)
+				}
+
+				if got, want := tll.EthernetAddress(), tcpip.LinkAddress(test.expectedBuf[2:][:EthernetAddressSize]); got != want {
+					t.Errorf("got tll.MACAddress = %s, want = %s", got, want)
+				}
+			}
+
+			// Iterator should not return anything else.
+			next, done, err := it.Next()
+			if err != nil {
+				t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+			}
+			if !done {
+				t.Error("got Next = (_, false, _), want = (_, true, _)")
+			}
+			if next != nil {
+				t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+			}
 		})
 	}
 }
@@ -232,39 +306,265 @@ func TestNDPPrefixInformationOption(t *testing.T) {
 		t.Fatalf("got targetBuf = %x, want = %x", targetBuf, expectedBuf)
 	}
 
-	// First two bytes are the Type and Length fields, which are not part of
-	// the option body.
-	pi := NDPPrefixInformation(targetBuf[2:])
+	it, err := opts.Iter(true)
+	if err != nil {
+		t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+	}
+
+	next, done, err := it.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got := next.Type(); got != NDPPrefixInformationType {
+		t.Errorf("got Type = %d, want = %d", got, NDPPrefixInformationType)
+	}
+
+	pi := next.(NDPPrefixInformation)
 
 	if got := pi.Type(); got != 3 {
-		t.Fatalf("got Type = %d, want = 3", got)
+		t.Errorf("got Type = %d, want = 3", got)
 	}
 
 	if got := pi.Length(); got != 30 {
-		t.Fatalf("got Length = %d, want = 30", got)
+		t.Errorf("got Length = %d, want = 30", got)
 	}
 
 	if got := pi.PrefixLength(); got != 43 {
-		t.Fatalf("got PrefixLength = %d, want = 43", got)
+		t.Errorf("got PrefixLength = %d, want = 43", got)
 	}
 
 	if pi.OnLinkFlag() {
-		t.Fatalf("got OnLinkFlag = true, want = false")
+		t.Error("got OnLinkFlag = true, want = false")
 	}
 
 	if !pi.AutonomousAddressConfigurationFlag() {
-		t.Fatalf("got AutonomousAddressConfigurationFlag = false, want = true")
+		t.Error("got AutonomousAddressConfigurationFlag = false, want = true")
 	}
 
 	if got, want := pi.ValidLifetime(), 16909060*time.Second; got != want {
-		t.Fatalf("got ValidLifetime = %d, want = %d", got, want)
+		t.Errorf("got ValidLifetime = %d, want = %d", got, want)
 	}
 
 	if got, want := pi.PreferredLifetime(), 84281096*time.Second; got != want {
-		t.Fatalf("got PreferredLifetime = %d, want = %d", got, want)
+		t.Errorf("got PreferredLifetime = %d, want = %d", got, want)
 	}
 
 	if got, want := pi.Prefix(), tcpip.Address("\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18"); got != want {
-		t.Fatalf("got Prefix = %s, want = %s", got, want)
+		t.Errorf("got Prefix = %s, want = %s", got, want)
+	}
+
+	// Iterator should not return anything else.
+	next, done, err = it.Next()
+	if err != nil {
+		t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if !done {
+		t.Error("got Next = (_, false, _), want = (_, true, _)")
+	}
+	if next != nil {
+		t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+	}
+}
+
+// TestNDPOptionsIterCheck tests that Iter will return false if the NDPOptions
+// the iterator was returned for is malformed.
+func TestNDPOptionsIterCheck(t *testing.T) {
+	tests := []struct {
+		name     string
+		buf      []byte
+		expected error
+	}{
+		{
+			"ZeroLengthField",
+			[]byte{0, 0, 0, 0, 0, 0, 0, 0},
+			ErrNDPOptZeroLength,
+		},
+		{
+			"ValidTargetLinkLayerAddressOption",
+			[]byte{2, 1, 1, 2, 3, 4, 5, 6},
+			nil,
+		},
+		{
+			"TooSmallTargetLinkLayerAddressOption",
+			[]byte{2, 1, 1, 2, 3, 4, 5},
+			ErrNDPOptBufExhausted,
+		},
+		{
+			"ValidPrefixInformation",
+			[]byte{
+				3, 4, 43, 64,
+				1, 2, 3, 4,
+				5, 6, 7, 8,
+				0, 0, 0, 0,
+				9, 10, 11, 12,
+				13, 14, 15, 16,
+				17, 18, 19, 20,
+				21, 22, 23, 24,
+			},
+			nil,
+		},
+		{
+			"TooSmallPrefixInformation",
+			[]byte{
+				3, 4, 43, 64,
+				1, 2, 3, 4,
+				5, 6, 7, 8,
+				0, 0, 0, 0,
+				9, 10, 11, 12,
+				13, 14, 15, 16,
+				17, 18, 19, 20,
+				21, 22, 23,
+			},
+			ErrNDPOptBufExhausted,
+		},
+		{
+			"InvalidPrefixInformationLength",
+			[]byte{
+				3, 3, 43, 64,
+				1, 2, 3, 4,
+				5, 6, 7, 8,
+				0, 0, 0, 0,
+				9, 10, 11, 12,
+				13, 14, 15, 16,
+			},
+			ErrNDPOptMalformedBody,
+		},
+		{
+			"ValidTargetLinkLayerAddressWithPrefixInformation",
+			[]byte{
+				// Target Link-Layer Address.
+				2, 1, 1, 2, 3, 4, 5, 6,
+
+				// Prefix information.
+				3, 4, 43, 64,
+				1, 2, 3, 4,
+				5, 6, 7, 8,
+				0, 0, 0, 0,
+				9, 10, 11, 12,
+				13, 14, 15, 16,
+				17, 18, 19, 20,
+				21, 22, 23, 24,
+			},
+			nil,
+		},
+		{
+			"ValidTargetLinkLayerAddressWithPrefixInformationWithUnrecognized",
+			[]byte{
+				// Target Link-Layer Address.
+				2, 1, 1, 2, 3, 4, 5, 6,
+
+				// 255 is an unrecognized type. If 255 ends up
+				// being the type for some recognized type,
+				// update 255 to some other unrecognized value.
+				255, 2, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 8,
+
+				// Prefix information.
+				3, 4, 43, 64,
+				1, 2, 3, 4,
+				5, 6, 7, 8,
+				0, 0, 0, 0,
+				9, 10, 11, 12,
+				13, 14, 15, 16,
+				17, 18, 19, 20,
+				21, 22, 23, 24,
+			},
+			nil,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opts := NDPOptions(test.buf)
+
+			if _, err := opts.Iter(true); err != test.expected {
+				t.Fatalf("got Iter(true) = (_, %v), want = (_, %v)", err, test.expected)
+			}
+
+			// test.buf may be malformed but we chose not to check
+			// the iterator so it must return true.
+			if _, err := opts.Iter(false); err != nil {
+				t.Fatalf("got Iter(false) = (_, %s), want = (_, nil)", err)
+			}
+		})
+	}
+}
+
+// TestNDPOptionsIter tests that we can iterator over a valid NDPOptions. Note,
+// this test does not actually check any of the option's getters, it simply
+// checks the option Type and Body. We have other tests that tests the option
+// field gettings given an option body and don't need to duplicate those tests
+// here.
+func TestNDPOptionsIter(t *testing.T) {
+	buf := []byte{
+		// Target Link-Layer Address.
+		2, 1, 1, 2, 3, 4, 5, 6,
+
+		// 255 is an unrecognized type. If 255 ends up being the type
+		// for some recognized type, update 255 to some other
+		// unrecognized value. Note, this option should be skipped when
+		// iterating.
+		255, 2, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 8,
+
+		// Prefix information.
+		3, 4, 43, 64,
+		1, 2, 3, 4,
+		5, 6, 7, 8,
+		0, 0, 0, 0,
+		9, 10, 11, 12,
+		13, 14, 15, 16,
+		17, 18, 19, 20,
+		21, 22, 23, 24,
+	}
+
+	opts := NDPOptions(buf)
+	it, err := opts.Iter(true)
+	if err != nil {
+		t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+	}
+
+	// Test the first (Taret Link-Layer) option.
+	next, done, err := it.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got, want := []byte(next.(NDPTargetLinkLayerAddressOption)), buf[2:][:6]; !bytes.Equal(got, want) {
+		t.Errorf("got Next = (%x, _, _), want = (%x, _, _)", got, want)
+	}
+	if got := next.Type(); got != NDPTargetLinkLayerAddressOptionType {
+		t.Errorf("got Type = %d, want = %d", got, NDPTargetLinkLayerAddressOptionType)
+	}
+
+	// Test the next (Prefix Information) option.
+	// Note, the unrecognized option should be skipped.
+	next, done, err = it.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got, want := next.(NDPPrefixInformation), buf[26:][:30]; !bytes.Equal(got, want) {
+		t.Errorf("got Next = (%x, _, _), want = (%x, _, _)", got, want)
+	}
+	if got := next.Type(); got != NDPPrefixInformationType {
+		t.Errorf("got Type = %d, want = %d", got, NDPPrefixInformationType)
+	}
+
+	// Iterator should not return anything else.
+	next, done, err = it.Next()
+	if err != nil {
+		t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if !done {
+		t.Error("got Next = (_, false, _), want = (_, true, _)")
+	}
+	if next != nil {
+		t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
 	}
 }
-- 
cgit v1.2.3


From 7d80e85835fbe47b2395eedf287cf902ed78599a Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 29 Oct 2019 11:19:04 -0700
Subject: Allow waiting for Endpoint worker goroutines to finish.

Updates #837

PiperOrigin-RevId: 277325162
---
 pkg/tcpip/stack/registration.go      | 14 ++++++++++++++
 pkg/tcpip/stack/transport_demuxer.go | 20 ++++++++++++++++++++
 pkg/tcpip/stack/transport_test.go    |  5 +++--
 pkg/tcpip/transport/icmp/endpoint.go |  3 +++
 pkg/tcpip/transport/raw/endpoint.go  |  3 +++
 pkg/tcpip/transport/tcp/endpoint.go  | 16 ++++++++++++++++
 pkg/tcpip/transport/udp/endpoint.go  |  3 +++
 7 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 0869fb084..0360187b8 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -67,6 +67,20 @@ type TransportEndpoint interface {
 	// HandleControlPacket is called by the stack when new control (e.g.,
 	// ICMP) packets arrive to this transport endpoint.
 	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView)
+
+	// Close puts the endpoint in a closed state and frees all resources
+	// associated with it. This cleanup may happen asynchronously. Wait can
+	// be used to block on this asynchronous cleanup.
+	Close()
+
+	// Wait waits for any worker goroutines owned by the endpoint to stop.
+	//
+	// An endpoint can be requested to stop its worker goroutines by calling
+	// its Close method.
+	//
+	// Wait will not block if the endpoint hasn't started any goroutines
+	// yet, even if it might later.
+	Wait()
 }
 
 // RawTransportEndpoint is the interface that needs to be implemented by raw
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index 97a1aec4b..9aff90a3d 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -240,6 +240,26 @@ func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, v
 	ep.mu.RUnlock() // Don't use defer for performance reasons.
 }
 
+// Close implements stack.TransportEndpoint.Close.
+func (ep *multiPortEndpoint) Close() {
+	ep.mu.RLock()
+	eps := append([]TransportEndpoint(nil), ep.endpointsArr...)
+	ep.mu.RUnlock()
+	for _, e := range eps {
+		e.Close()
+	}
+}
+
+// Wait implements stack.TransportEndpoint.Wait.
+func (ep *multiPortEndpoint) Wait() {
+	ep.mu.RLock()
+	eps := append([]TransportEndpoint(nil), ep.endpointsArr...)
+	ep.mu.RUnlock()
+	for _, e := range eps {
+		e.Wait()
+	}
+}
+
 // singleRegisterEndpoint tries to add an endpoint to the multiPortEndpoint
 // list. The list might be empty already.
 func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, reusePort bool) *tcpip.Error {
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 86c62be25..db951c9ce 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -225,8 +225,9 @@ func (f *fakeTransportEndpoint) IPTables() (iptables.IPTables, error) {
 	return iptables.IPTables{}, nil
 }
 
-func (f *fakeTransportEndpoint) Resume(*stack.Stack) {
-}
+func (f *fakeTransportEndpoint) Resume(*stack.Stack) {}
+
+func (f *fakeTransportEndpoint) Wait() {}
 
 type fakeTransportGoodOption bool
 
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 043467519..d0dd383fd 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -798,3 +798,6 @@ func (e *endpoint) Info() tcpip.EndpointInfo {
 func (e *endpoint) Stats() tcpip.EndpointStats {
 	return &e.stats
 }
+
+// Wait implements stack.TransportEndpoint.Wait.
+func (*endpoint) Wait() {}
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 308f10d24..951d317ed 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -641,3 +641,6 @@ func (e *endpoint) Info() tcpip.EndpointInfo {
 func (e *endpoint) Stats() tcpip.EndpointStats {
 	return &e.stats
 }
+
+// Wait implements stack.TransportEndpoint.Wait.
+func (*endpoint) Wait() {}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 8234a8b53..ce8307cee 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2399,6 +2399,22 @@ func (e *endpoint) Stats() tcpip.EndpointStats {
 	return &e.stats
 }
 
+// Wait implements stack.TransportEndpoint.Wait.
+func (e *endpoint) Wait() {
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	e.waiterQueue.EventRegister(&waitEntry, waiter.EventHUp)
+	defer e.waiterQueue.EventUnregister(&waitEntry)
+	for {
+		e.mu.Lock()
+		running := e.workerRunning
+		e.mu.Unlock()
+		if !running {
+			break
+		}
+		<-notifyCh
+	}
+}
+
 func mssForRoute(r *stack.Route) uint16 {
 	// TODO(b/143359391): Respect TCP Min and Max size.
 	return uint16(r.MTU() - header.TCPMinimumSize)
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 91c8487f3..cda302bb7 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -1234,6 +1234,9 @@ func (e *endpoint) Stats() tcpip.EndpointStats {
 	return &e.stats
 }
 
+// Wait implements tcpip.Endpoint.Wait.
+func (*endpoint) Wait() {}
+
 func isBroadcastOrMulticast(a tcpip.Address) bool {
 	return a == header.IPv4Broadcast || header.IsV4MulticastAddress(a) || header.IsV6MulticastAddress(a)
 }
-- 
cgit v1.2.3


From 392c56149531c82ef3c07e2899939c0d63f0980b Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 29 Oct 2019 12:15:33 -0700
Subject: Fix PollWithFullBufferBlocks.

Set the snd/rcv buffer sizes so that the test is deterministic and runs in a
reasonable amount of time. It also ensures that we disable any auto-tuning of
the send/receive buffer which may happen.

PiperOrigin-RevId: 277337232
---
 test/syscalls/linux/tcp_socket.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index bfa031bce..277d6835a 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -394,8 +394,15 @@ TEST_P(TcpSocketTest, PollWithFullBufferBlocks) {
                          sizeof(tcp_nodelay_flag)),
               SyscallSucceeds());
 
+  // Set a 256KB send/receive buffer.
+  int buf_sz = 1 << 18;
+  EXPECT_THAT(setsockopt(t_, SOL_SOCKET, SO_RCVBUF, &buf_sz, sizeof(buf_sz)),
+              SyscallSucceedsWithValue(0));
+  EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &buf_sz, sizeof(buf_sz)),
+              SyscallSucceedsWithValue(0));
+
   // Create a large buffer that will be used for sending.
-  std::vector<char> buf(10 * sendbuf_size_);
+  std::vector<char> buf(1 << 16);
 
   // Write until we receive an error.
   while (RetryEINTR(send)(s_, buf.data(), buf.size(), 0) != -1) {
-- 
cgit v1.2.3


From 2e00771d5abb3d821703965953c2b21ef7c20911 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 29 Oct 2019 12:50:03 -0700
Subject: Refactor logic for loadExecutable.

Separate the handling of filenames and *fs.File objects in a more explicit way
for the sake of clarity.

PiperOrigin-RevId: 277344203
---
 pkg/sentry/loader/loader.go | 134 ++++++++++++++++++++++----------------------
 1 file changed, 67 insertions(+), 67 deletions(-)

diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index f75ebe08a..803e7d41e 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"io"
 	"path"
+	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -99,20 +100,20 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in
 	return total, nil
 }
 
-// openPath opens args.Filename for loading.
+// openPath opens args.Filename and checks that it is valid for loading.
 //
-// openPath returns the fs.Dirent and an *fs.File for args.Filename, which is
-// not installed in the Task FDTable. The caller takes ownership of both.
+// openPath returns an *fs.Dirent and *fs.File for args.Filename, which is not
+// installed in the Task FDTable. The caller takes ownership of both.
 //
 // args.Filename must be a readable, executable, regular file.
 func openPath(ctx context.Context, args LoadArgs) (*fs.Dirent, *fs.File, error) {
-	var err error
 	if args.Filename == "" {
 		ctx.Infof("cannot open empty name")
 		return nil, nil, syserror.ENOENT
 	}
 
 	var d *fs.Dirent
+	var err error
 	if args.ResolveFinal {
 		d, err = args.Mounts.FindInode(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals)
 	} else {
@@ -121,67 +122,60 @@ func openPath(ctx context.Context, args LoadArgs) (*fs.Dirent, *fs.File, error)
 	if err != nil {
 		return nil, nil, err
 	}
-
-	// Open file will take a reference to Dirent, so destroy this one.
+	// Defer a DecRef for the sake of failure cases.
 	defer d.DecRef()
 
 	if !args.ResolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
 		return nil, nil, syserror.ELOOP
 	}
 
-	return openFile(ctx, nil, d, args.Filename)
-}
+	if err := checkPermission(ctx, d); err != nil {
+		return nil, nil, err
+	}
 
-// openFile takes that file's Dirent and performs checks on it. If provided a
-// *fs.Dirent and not a *fs.File, it creates a *fs.File object from the Dirent's
-// Inode and performs checks on that.
-//
-// openFile returns an *fs.File and *fs.Dirent, and the caller takes ownership
-// of both.
-//
-// "dirent" and "file" must not both be nil and point to a readable, executable, regular file.
-func openFile(ctx context.Context, file *fs.File, dirent *fs.Dirent, name string) (*fs.Dirent, *fs.File, error) {
-	// file and dirent must not be nil.
-	if dirent == nil && file == nil {
-		ctx.Infof("dirent and file cannot both be nil.")
-		return nil, nil, syserror.ENOENT
+	// If they claim it's a directory, then make sure.
+	//
+	// N.B. we reject directories below, but we must first reject
+	// non-directories passed as directories.
+	if strings.HasSuffix(args.Filename, "/") && !fs.IsDir(d.Inode.StableAttr) {
+		return nil, nil, syserror.ENOTDIR
+	}
+
+	if err := checkIsRegularFile(ctx, d, args.Filename); err != nil {
+		return nil, nil, err
 	}
 
-	if file != nil {
-		dirent = file.Dirent
+	f, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
+	if err != nil {
+		return nil, nil, err
 	}
+	// Defer a DecRef for the sake of failure cases.
+	defer f.DecRef()
 
-	// Perform permissions checks on the file.
-	if err := checkFile(ctx, dirent, name); err != nil {
+	if err := checkPread(ctx, f, args.Filename); err != nil {
 		return nil, nil, err
 	}
 
-	if file == nil {
-		var ferr error
-		if file, ferr = dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true}); ferr != nil {
-			return nil, nil, ferr
-		}
-	} else {
-		// GetFile takes a reference to the created file, so make one in the case
-		// that the file reference already existed.
-		file.IncRef()
+	d.IncRef()
+	f.IncRef()
+	return d, f, err
+}
+
+// checkFile performs checks on a file to be executed.
+func checkFile(ctx context.Context, f *fs.File, filename string) error {
+	if err := checkPermission(ctx, f.Dirent); err != nil {
+		return err
 	}
 
-	// We must be able to read at arbitrary offsets.
-	if !file.Flags().Pread {
-		file.DecRef()
-		ctx.Infof("%s cannot be read at an offset: %+v", file.MappedName(ctx), file.Flags())
-		return nil, nil, syserror.EACCES
+	if err := checkIsRegularFile(ctx, f.Dirent, filename); err != nil {
+		return err
 	}
 
-	// Grab reference for caller.
-	dirent.IncRef()
-	return dirent, file, nil
+	return checkPread(ctx, f, filename)
 }
 
-// checkFile performs file permissions checks for binaries called in openPath
-// and openFile
-func checkFile(ctx context.Context, d *fs.Dirent, name string) error {
+// checkPermission checks whether the file is readable and executable.
+func checkPermission(ctx context.Context, d *fs.Dirent) error {
 	perms := fs.PermMask{
 		// TODO(gvisor.dev/issue/160): Linux requires only execute
 		// permission, not read. However, our backing filesystems may
@@ -192,26 +186,26 @@ func checkFile(ctx context.Context, d *fs.Dirent, name string) error {
 		Read:    true,
 		Execute: true,
 	}
-	if err := d.Inode.CheckPermission(ctx, perms); err != nil {
-		return err
-	}
+	return d.Inode.CheckPermission(ctx, perms)
+}
 
-	// If they claim it's a directory, then make sure.
-	//
-	// N.B. we reject directories below, but we must first reject
-	// non-directories passed as directories.
-	if len(name) > 0 && name[len(name)-1] == '/' && !fs.IsDir(d.Inode.StableAttr) {
-		return syserror.ENOTDIR
+// checkIsRegularFile prevents us from trying to execute a directory, pipe, etc.
+func checkIsRegularFile(ctx context.Context, d *fs.Dirent, filename string) error {
+	attr := d.Inode.StableAttr
+	if !fs.IsRegular(attr) {
+		ctx.Infof("%s is not regular: %v", filename, attr)
+		return syserror.EACCES
 	}
+	return nil
+}
 
-	// No exec-ing directories, pipes, etc!
-	if !fs.IsRegular(d.Inode.StableAttr) {
-		ctx.Infof("%s is not regular: %v", name, d.Inode.StableAttr)
+// checkPread checks whether we can read the file at arbitrary offsets.
+func checkPread(ctx context.Context, f *fs.File, filename string) error {
+	if !f.Flags().Pread {
+		ctx.Infof("%s cannot be read at an offset: %+v", filename, f.Flags())
 		return syserror.EACCES
 	}
-
 	return nil
-
 }
 
 // allocStack allocates and maps a stack in to any available part of the address space.
@@ -248,25 +242,31 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 		)
 		if args.File == nil {
 			d, args.File, err = openPath(ctx, args)
+			// We will return d in the successful case, but defer a DecRef for the
+			// sake of intermediate loops and failure cases.
+			if d != nil {
+				defer d.DecRef()
+			}
+			if args.File != nil {
+				defer args.File.DecRef()
+			}
 		} else {
-			d, args.File, err = openFile(ctx, args.File, nil, "")
+			d = args.File.Dirent
+			d.IncRef()
+			defer d.DecRef()
+			err = checkFile(ctx, args.File, args.Filename)
 		}
-
 		if err != nil {
 			ctx.Infof("Error opening %s: %v", args.Filename, err)
 			return loadedELF{}, nil, nil, nil, err
 		}
-		defer args.File.DecRef()
-		// We will return d in the successful case, but defer a DecRef
-		// for intermediate loops and failure cases.
-		defer d.DecRef()
 
 		// Check the header. Is this an ELF or interpreter script?
 		var hdr [4]uint8
 		// N.B. We assume that reading from a regular file cannot block.
 		_, err = readFull(ctx, args.File, usermem.BytesIOSequence(hdr[:]), 0)
-		// Allow unexpected EOF, as a valid executable could be only
-		// three bytes (e.g., #!a).
+		// Allow unexpected EOF, as a valid executable could be only three bytes
+		// (e.g., #!a).
 		if err != nil && err != io.ErrUnexpectedEOF {
 			if err == io.EOF {
 				err = syserror.ENOEXEC
-- 
cgit v1.2.3


From c0b8fd4b6a9fcb595f3200577b93d07737cfaacd Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 29 Oct 2019 13:17:01 -0700
Subject: Update build tags to allow Go 1.14

Currently there are no ABI changes. We should check again closer to release.

PiperOrigin-RevId: 277349744
---
 pkg/procid/procid_amd64.s                              | 2 +-
 pkg/procid/procid_arm64.s                              | 2 +-
 pkg/sentry/platform/kvm/bluepill_unsafe.go             | 2 +-
 pkg/sentry/platform/kvm/machine_unsafe.go              | 2 +-
 pkg/sentry/platform/ptrace/subprocess_unsafe.go        | 2 +-
 pkg/sentry/vfs/mount_unsafe.go                         | 2 +-
 pkg/sleep/sleep_unsafe.go                              | 2 +-
 pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go    | 2 +-
 pkg/tcpip/time_unsafe.go                               | 2 +-
 third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go | 2 +-
 third_party/gvsync/downgradable_rwmutex_unsafe.go      | 2 +-
 third_party/gvsync/memmove_unsafe.go                   | 2 +-
 12 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/pkg/procid/procid_amd64.s b/pkg/procid/procid_amd64.s
index 30ec8e6e2..38cea9be3 100644
--- a/pkg/procid/procid_amd64.s
+++ b/pkg/procid/procid_amd64.s
@@ -14,7 +14,7 @@
 
 // +build amd64
 // +build go1.8
-// +build !go1.14
+// +build !go1.15
 
 #include "textflag.h"
 
diff --git a/pkg/procid/procid_arm64.s b/pkg/procid/procid_arm64.s
index e340d9f98..4f4b70fef 100644
--- a/pkg/procid/procid_arm64.s
+++ b/pkg/procid/procid_arm64.s
@@ -14,7 +14,7 @@
 
 // +build arm64
 // +build go1.8
-// +build !go1.14
+// +build !go1.15
 
 #include "textflag.h"
 
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 7e8e9f42a..ee730ad70 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.14
+// +build !go1.15
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index 405e00292..e00c7ae40 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.14
+// +build !go1.15
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
index b80a3604d..2ae6b9f9d 100644
--- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.14
+// +build !go1.15
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index b0511aa40..75e6c7dfa 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.14
+// +build !go1.15
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go
index 8f5e60a25..acbf0229b 100644
--- a/pkg/sleep/sleep_unsafe.go
+++ b/pkg/sleep/sleep_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.11
-// +build !go1.14
+// +build !go1.15
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
index dda3b10a6..0b5a6cf49 100644
--- a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
+++ b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
@@ -14,7 +14,7 @@
 
 // +build linux,amd64 linux,arm64
 // +build go1.12
-// +build !go1.14
+// +build !go1.15
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go
index a52262e87..48764b978 100644
--- a/pkg/tcpip/time_unsafe.go
+++ b/pkg/tcpip/time_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.9
-// +build !go1.14
+// +build !go1.15
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go b/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go
index 8baec5458..3b9346843 100644
--- a/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go
+++ b/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.13
-// +build !go1.14
+// +build !go1.15
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/third_party/gvsync/downgradable_rwmutex_unsafe.go b/third_party/gvsync/downgradable_rwmutex_unsafe.go
index 1f6007aa1..b7862d185 100644
--- a/third_party/gvsync/downgradable_rwmutex_unsafe.go
+++ b/third_party/gvsync/downgradable_rwmutex_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.12
-// +build !go1.14
+// +build !go1.15
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/third_party/gvsync/memmove_unsafe.go b/third_party/gvsync/memmove_unsafe.go
index 84b69f215..9dd1d6142 100644
--- a/third_party/gvsync/memmove_unsafe.go
+++ b/third_party/gvsync/memmove_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.12
-// +build !go1.14
+// +build !go1.15
 
 // Check go:linkname function signatures when updating Go version.
 
-- 
cgit v1.2.3


From 38330e93774e68324d8f43adb27178453dee18b6 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 29 Oct 2019 13:58:09 -0700
Subject: Update symlink traversal limit when resolving interpreter path.

When execveat is called on an interpreter script, the symlink count for
resolving the script path should be separate from the count for resolving the
the corresponding interpreter. An ELOOP error should not occur if we do not hit
the symlink limit along any individual path, even if the total number of
symlinks encountered exceeds the limit.

Closes #574

PiperOrigin-RevId: 277358474
---
 pkg/sentry/loader/elf.go    |  2 ++
 pkg/sentry/loader/loader.go |  2 ++
 test/syscalls/linux/exec.cc | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+)

diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 3ea037e4d..c2c3ec06e 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -644,6 +644,8 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error
 		// resolved, the interpreter should still be resolved if it is
 		// a symlink.
 		args.ResolveFinal = true
+		// Refresh the traversal limit.
+		*args.RemainingTraversals = linux.MaxSymlinkTraversals
 		args.Filename = bin.interpreter
 		d, i, err := openPath(ctx, args)
 		if err != nil {
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 803e7d41e..b03eeb005 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -293,6 +293,8 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 				ctx.Infof("Error loading interpreter script: %v", err)
 				return loadedELF{}, nil, nil, nil, err
 			}
+			// Refresh the traversal limit for the interpreter.
+			*args.RemainingTraversals = linux.MaxSymlinkTraversals
 		default:
 			ctx.Infof("Unknown magic: %v", hdr)
 			return loadedELF{}, nil, nil, nil, syserror.ENOEXEC
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index a9067df2a..581f03533 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -533,6 +533,47 @@ TEST(ExecTest, CloexecEventfd) {
             W_EXITCODE(0, 0), "");
 }
 
+constexpr int kLinuxMaxSymlinks = 40;
+
+TEST(ExecTest, SymlinkLimitExceeded) {
+  std::string path = WorkloadPath(kBasicWorkload);
+
+  // Hold onto TempPath objects so they are not destructed prematurely.
+  std::vector<TempPath> symlinks;
+  for (int i = 0; i < kLinuxMaxSymlinks + 1; i++) {
+    symlinks.push_back(
+        ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateSymlinkTo("/tmp", path)));
+    path = symlinks[i].path();
+  }
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(
+      ForkAndExec(path, {path}, {}, /*child=*/nullptr, &execve_errno));
+  EXPECT_EQ(execve_errno, ELOOP);
+}
+
+TEST(ExecTest, SymlinkLimitRefreshedForInterpreter) {
+  std::string tmp_dir = "/tmp";
+  std::string interpreter_path = "/bin/echo";
+  TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      tmp_dir, absl::StrCat("#!", interpreter_path), 0755));
+  std::string script_path = script.path();
+
+  // Hold onto TempPath objects so they are not destructed prematurely.
+  std::vector<TempPath> interpreter_symlinks;
+  std::vector<TempPath> script_symlinks;
+  for (int i = 0; i < kLinuxMaxSymlinks; i++) {
+    interpreter_symlinks.push_back(ASSERT_NO_ERRNO_AND_VALUE(
+        TempPath::CreateSymlinkTo(tmp_dir, interpreter_path)));
+    interpreter_path = interpreter_symlinks[i].path();
+    script_symlinks.push_back(ASSERT_NO_ERRNO_AND_VALUE(
+        TempPath::CreateSymlinkTo(tmp_dir, script_path)));
+    script_path = script_symlinks[i].path();
+  }
+
+  CheckExec(script_path, {script_path}, {}, ArgEnvExitStatus(0, 0), "");
+}
+
 TEST(ExecveatTest, BasicWithFDCWD) {
   std::string path = WorkloadPath(kBasicWorkload);
   CheckExecveat(AT_FDCWD, path, {path}, {}, /*flags=*/0, ArgEnvExitStatus(0, 0),
-- 
cgit v1.2.3


From d7f5e823e24501c33a377ee6c73210b00bf3d89f Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 29 Oct 2019 13:58:20 -0700
Subject: Fix grammar in comment.

Missing "for".

PiperOrigin-RevId: 277358513
---
 pkg/sentry/kernel/task.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 11a8c6c87..9be3dae3c 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -709,7 +709,7 @@ func (t *Task) FDTable() *FDTable {
 	return t.fdTable
 }
 
-// GetFile is a convenience wrapper t.FDTable().Get.
+// GetFile is a convenience wrapper for t.FDTable().Get.
 //
 // Precondition: same as FDTable.Get.
 func (t *Task) GetFile(fd int32) *fs.File {
-- 
cgit v1.2.3


From a2c51efe3669f0380042b2375eae79e403d3680c Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 29 Oct 2019 16:13:43 -0700
Subject: Add endpoint tracking to the stack.

In the future this will replace DanglingEndpoints. DanglingEndpoints must be
kept for now due to issues with save/restore.

This is arguably a cleaner design and allows the stack to know which transport
endpoints might still be using its link endpoints.

Updates #837

PiperOrigin-RevId: 277386633
---
 pkg/sentry/inet/BUILD                     |  5 ++-
 pkg/sentry/inet/inet.go                   | 12 ++++++
 pkg/sentry/inet/test_stack.go             | 16 +++++++-
 pkg/sentry/socket/hostinet/BUILD          |  1 +
 pkg/sentry/socket/hostinet/stack.go       | 10 +++++
 pkg/sentry/socket/netstack/stack.go       | 15 +++++++
 pkg/sentry/socket/rpcinet/BUILD           |  1 +
 pkg/sentry/socket/rpcinet/stack.go        | 10 +++++
 pkg/tcpip/stack/stack.go                  | 59 ++++++++++++++++++++++++++--
 pkg/tcpip/stack/transport_demuxer.go      | 65 +++++++++++++++++++++++--------
 pkg/tcpip/stack/transport_test.go         |  3 +-
 pkg/tcpip/transport/tcp/endpoint.go       |  5 ++-
 pkg/tcpip/transport/tcp/endpoint_state.go |  7 +++-
 13 files changed, 182 insertions(+), 27 deletions(-)

diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index d5284f0d9..8d60ad4ad 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -13,5 +13,8 @@ go_library(
         "test_stack.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/inet",
-    deps = ["//pkg/sentry/context"],
+    deps = [
+        "//pkg/sentry/context",
+        "//pkg/tcpip/stack",
+    ],
 )
diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index bc6cb1095..a7dfb78a7 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -15,6 +15,8 @@
 // Package inet defines semantics for IP stacks.
 package inet
 
+import "gvisor.dev/gvisor/pkg/tcpip/stack"
+
 // Stack represents a TCP/IP stack.
 type Stack interface {
 	// Interfaces returns all network interfaces as a mapping from interface
@@ -58,6 +60,16 @@ type Stack interface {
 
 	// Resume restarts the network stack after restore.
 	Resume()
+
+	// RegisteredEndpoints returns all endpoints which are currently registered.
+	RegisteredEndpoints() []stack.TransportEndpoint
+
+	// CleanupEndpoints returns endpoints currently in the cleanup state.
+	CleanupEndpoints() []stack.TransportEndpoint
+
+	// RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful
+	// for restoring a stack after a save.
+	RestoreCleanupEndpoints([]stack.TransportEndpoint)
 }
 
 // Interface contains information about a network interface.
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index b9eed7c3a..dcfcbd97e 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -14,6 +14,8 @@
 
 package inet
 
+import "gvisor.dev/gvisor/pkg/tcpip/stack"
+
 // TestStack is a dummy implementation of Stack for tests.
 type TestStack struct {
 	InterfacesMap     map[int32]Interface
@@ -94,5 +96,17 @@ func (s *TestStack) RouteTable() []Route {
 }
 
 // Resume implements Stack.Resume.
-func (s *TestStack) Resume() {
+func (s *TestStack) Resume() {}
+
+// RegisteredEndpoints implements inet.Stack.RegisteredEndpoints.
+func (s *TestStack) RegisteredEndpoints() []stack.TransportEndpoint {
+	return nil
 }
+
+// CleanupEndpoints implements inet.Stack.CleanupEndpoints.
+func (s *TestStack) CleanupEndpoints() []stack.TransportEndpoint {
+	return nil
+}
+
+// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
+func (s *TestStack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {}
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index 4d174dda4..8b66a719d 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -32,6 +32,7 @@ go_library(
         "//pkg/sentry/usermem",
         "//pkg/syserr",
         "//pkg/syserror",
+        "//pkg/tcpip/stack",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index d4387f5d4..e67b46c9e 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -31,6 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
 var defaultRecvBufSize = inet.TCPBufferSize{
@@ -442,3 +443,12 @@ func (s *Stack) RouteTable() []inet.Route {
 
 // Resume implements inet.Stack.Resume.
 func (s *Stack) Resume() {}
+
+// RegisteredEndpoints implements inet.Stack.RegisteredEndpoints.
+func (s *Stack) RegisteredEndpoints() []stack.TransportEndpoint { return nil }
+
+// CleanupEndpoints implements inet.Stack.CleanupEndpoints.
+func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil }
+
+// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
+func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {}
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index d5db8c17c..a0db2d4fd 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -291,3 +291,18 @@ func (s *Stack) FillDefaultIPTables() {
 func (s *Stack) Resume() {
 	s.Stack.Resume()
 }
+
+// RegisteredEndpoints implements inet.Stack.RegisteredEndpoints.
+func (s *Stack) RegisteredEndpoints() []stack.TransportEndpoint {
+	return s.Stack.RegisteredEndpoints()
+}
+
+// CleanupEndpoints implements inet.Stack.CleanupEndpoints.
+func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint {
+	return s.Stack.CleanupEndpoints()
+}
+
+// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
+func (s *Stack) RestoreCleanupEndpoints(es []stack.TransportEndpoint) {
+	s.Stack.RestoreCleanupEndpoints(es)
+}
diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD
index 3a6baa308..4668b87d1 100644
--- a/pkg/sentry/socket/rpcinet/BUILD
+++ b/pkg/sentry/socket/rpcinet/BUILD
@@ -37,6 +37,7 @@ go_library(
         "//pkg/syserror",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
+        "//pkg/tcpip/stack",
         "//pkg/unet",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go
index 5dcb6b455..f7878a760 100644
--- a/pkg/sentry/socket/rpcinet/stack.go
+++ b/pkg/sentry/socket/rpcinet/stack.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn"
 	"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/notifier"
 	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/unet"
 )
 
@@ -165,3 +166,12 @@ func (s *Stack) RouteTable() []inet.Route {
 
 // Resume implements inet.Stack.Resume.
 func (s *Stack) Resume() {}
+
+// RegisteredEndpoints implements inet.Stack.RegisteredEndpoints.
+func (s *Stack) RegisteredEndpoints() []stack.TransportEndpoint { return nil }
+
+// CleanupEndpoints implements inet.Stack.CleanupEndpoints.
+func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil }
+
+// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
+func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 242d2150c..360c54b2d 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -361,9 +361,10 @@ type Stack struct {
 
 	linkAddrCache *linkAddrCache
 
-	mu         sync.RWMutex
-	nics       map[tcpip.NICID]*NIC
-	forwarding bool
+	mu               sync.RWMutex
+	nics             map[tcpip.NICID]*NIC
+	forwarding       bool
+	cleanupEndpoints map[TransportEndpoint]struct{}
 
 	// route is the route table passed in by the user via SetRouteTable(),
 	// it is used by FindRoute() to build a route for a specific
@@ -513,6 +514,7 @@ func New(opts Options) *Stack {
 		networkProtocols:     make(map[tcpip.NetworkProtocolNumber]NetworkProtocol),
 		linkAddrResolvers:    make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver),
 		nics:                 make(map[tcpip.NICID]*NIC),
+		cleanupEndpoints:     make(map[TransportEndpoint]struct{}),
 		linkAddrCache:        newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts),
 		PortManager:          ports.NewPortManager(),
 		clock:                clock,
@@ -1136,6 +1138,25 @@ func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip
 	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, bindToDevice)
 }
 
+// StartTransportEndpointCleanup removes the endpoint with the given id from
+// the stack transport dispatcher. It also transitions it to the cleanup stage.
+func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	s.cleanupEndpoints[ep] = struct{}{}
+
+	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, bindToDevice)
+}
+
+// CompleteTransportEndpointCleanup removes the endpoint from the cleanup
+// stage.
+func (s *Stack) CompleteTransportEndpointCleanup(ep TransportEndpoint) {
+	s.mu.Lock()
+	delete(s.cleanupEndpoints, ep)
+	s.mu.Unlock()
+}
+
 // RegisterRawTransportEndpoint registers the given endpoint with the stack
 // transport dispatcher. Received packets that match the provided transport
 // protocol will be delivered to the given endpoint.
@@ -1157,6 +1178,38 @@ func (s *Stack) RegisterRestoredEndpoint(e ResumableEndpoint) {
 	s.mu.Unlock()
 }
 
+// RegisteredEndpoints returns all endpoints which are currently registered.
+func (s *Stack) RegisteredEndpoints() []TransportEndpoint {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	var es []TransportEndpoint
+	for _, e := range s.demux.protocol {
+		es = append(es, e.transportEndpoints()...)
+	}
+	return es
+}
+
+// CleanupEndpoints returns endpoints currently in the cleanup state.
+func (s *Stack) CleanupEndpoints() []TransportEndpoint {
+	s.mu.Lock()
+	es := make([]TransportEndpoint, 0, len(s.cleanupEndpoints))
+	for e := range s.cleanupEndpoints {
+		es = append(es, e)
+	}
+	s.mu.Unlock()
+	return es
+}
+
+// RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful
+// for restoring a stack after a save.
+func (s *Stack) RestoreCleanupEndpoints(es []TransportEndpoint) {
+	s.mu.Lock()
+	for _, e := range es {
+		s.cleanupEndpoints[e] = struct{}{}
+	}
+	s.mu.Unlock()
+}
+
 // Resume restarts the stack after a restore. This must be called after the
 // entire system has been restored.
 func (s *Stack) Resume() {
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index 9aff90a3d..f633632f0 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -41,6 +41,31 @@ type transportEndpoints struct {
 	rawEndpoints []RawTransportEndpoint
 }
 
+// unregisterEndpoint unregisters the endpoint with the given id such that it
+// won't receive any more packets.
+func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
+	eps.mu.Lock()
+	defer eps.mu.Unlock()
+	epsByNic, ok := eps.endpoints[id]
+	if !ok {
+		return
+	}
+	if !epsByNic.unregisterEndpoint(bindToDevice, ep) {
+		return
+	}
+	delete(eps.endpoints, id)
+}
+
+func (eps *transportEndpoints) transportEndpoints() []TransportEndpoint {
+	eps.mu.RLock()
+	defer eps.mu.RUnlock()
+	es := make([]TransportEndpoint, 0, len(eps.endpoints))
+	for _, e := range eps.endpoints {
+		es = append(es, e.transportEndpoints()...)
+	}
+	return es
+}
+
 type endpointsByNic struct {
 	mu        sync.RWMutex
 	endpoints map[tcpip.NICID]*multiPortEndpoint
@@ -48,6 +73,16 @@ type endpointsByNic struct {
 	seed uint32
 }
 
+func (epsByNic *endpointsByNic) transportEndpoints() []TransportEndpoint {
+	epsByNic.mu.RLock()
+	defer epsByNic.mu.RUnlock()
+	var eps []TransportEndpoint
+	for _, ep := range epsByNic.endpoints {
+		eps = append(eps, ep.transportEndpoints()...)
+	}
+	return eps
+}
+
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
 func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) {
@@ -127,21 +162,6 @@ func (epsByNic *endpointsByNic) unregisterEndpoint(bindToDevice tcpip.NICID, t T
 	return len(epsByNic.endpoints) == 0
 }
 
-// unregisterEndpoint unregisters the endpoint with the given id such that it
-// won't receive any more packets.
-func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
-	eps.mu.Lock()
-	defer eps.mu.Unlock()
-	epsByNic, ok := eps.endpoints[id]
-	if !ok {
-		return
-	}
-	if !epsByNic.unregisterEndpoint(bindToDevice, ep) {
-		return
-	}
-	delete(eps.endpoints, id)
-}
-
 // transportDemuxer demultiplexes packets targeted at a transport endpoint
 // (i.e., after they've been parsed by the network layer). It does two levels
 // of demultiplexing: first based on the network and transport protocols, then
@@ -183,14 +203,27 @@ func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNum
 // multiPortEndpoint is a container for TransportEndpoints which are bound to
 // the same pair of address and port. endpointsArr always has at least one
 // element.
+//
+// FIXME(gvisor.dev/issue/873): Restore this properly. Currently, we just save
+// this to ensure that the underlying endpoints get saved/restored, but not not
+// use the restored copy.
+//
+// +stateify savable
 type multiPortEndpoint struct {
-	mu           sync.RWMutex
+	mu           sync.RWMutex `state:"nosave"`
 	endpointsArr []TransportEndpoint
 	endpointsMap map[TransportEndpoint]int
 	// reuse indicates if more than one endpoint is allowed.
 	reuse bool
 }
 
+func (ep *multiPortEndpoint) transportEndpoints() []TransportEndpoint {
+	ep.mu.RLock()
+	eps := append([]TransportEndpoint(nil), ep.endpointsArr...)
+	ep.mu.RUnlock()
+	return eps
+}
+
 // reciprocalScale scales a value into range [0, n).
 //
 // This is similar to val % n, but faster.
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index db951c9ce..ae6fda3a9 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -218,8 +218,7 @@ func (f *fakeTransportEndpoint) State() uint32 {
 	return 0
 }
 
-func (f *fakeTransportEndpoint) ModerateRecvBuf(copied int) {
-}
+func (f *fakeTransportEndpoint) ModerateRecvBuf(copied int) {}
 
 func (f *fakeTransportEndpoint) IPTables() (iptables.IPTables, error) {
 	return iptables.IPTables{}, nil
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index ce8307cee..8a3ca0f1b 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -686,7 +686,7 @@ func (e *endpoint) Close() {
 	// in Listen() when trying to register.
 	if e.state == StateListen && e.isPortReserved {
 		if e.isRegistered {
-			e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
+			e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
 			e.isRegistered = false
 		}
 
@@ -747,7 +747,7 @@ func (e *endpoint) cleanupLocked() {
 	e.workerCleanup = false
 
 	if e.isRegistered {
-		e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
+		e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
 		e.isRegistered = false
 	}
 
@@ -757,6 +757,7 @@ func (e *endpoint) cleanupLocked() {
 	}
 
 	e.route.Release()
+	e.stack.CompleteTransportEndpointCleanup(e)
 	tcpip.DeleteDanglingEndpoint(e)
 }
 
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index eae17237e..19f003b6b 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -193,8 +193,10 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		if len(e.BindAddr) == 0 {
 			e.BindAddr = e.ID.LocalAddress
 		}
-		if err := e.Bind(tcpip.FullAddress{Addr: e.BindAddr, Port: e.ID.LocalPort}); err != nil {
-			panic("endpoint binding failed: " + err.String())
+		addr := e.BindAddr
+		port := e.ID.LocalPort
+		if err := e.Bind(tcpip.FullAddress{Addr: addr, Port: port}); err != nil {
+			panic(fmt.Sprintf("endpoint binding [%v]:%d failed: %v", addr, port, err))
 		}
 	}
 
@@ -265,6 +267,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		}
 		fallthrough
 	case StateError:
+		e.stack.CompleteTransportEndpointCleanup(e)
 		tcpip.DeleteDanglingEndpoint(e)
 	}
 }
-- 
cgit v1.2.3


From dc21c5ca16dbc43755185ffdf53764c7bb4c3a12 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 29 Oct 2019 17:21:01 -0700
Subject: Add Close and Wait methods to stack.

Link endpoints still don't have a unified way to be requested to stop.

Updates #837

PiperOrigin-RevId: 277398952
---
 pkg/tcpip/stack/stack.go | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 360c54b2d..6d6ddc0ff 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -1210,6 +1210,37 @@ func (s *Stack) RestoreCleanupEndpoints(es []TransportEndpoint) {
 	s.mu.Unlock()
 }
 
+// Close closes all currently registered transport endpoints.
+//
+// Endpoints created or modified during this call may not get closed.
+func (s *Stack) Close() {
+	for _, e := range s.RegisteredEndpoints() {
+		e.Close()
+	}
+}
+
+// Wait waits for all transport and link endpoints to halt their worker
+// goroutines.
+//
+// Endpoints created or modified during this call may not get waited on.
+//
+// Note that link endpoints must be stopped via an implementation specific
+// mechanism.
+func (s *Stack) Wait() {
+	for _, e := range s.RegisteredEndpoints() {
+		e.Wait()
+	}
+	for _, e := range s.CleanupEndpoints() {
+		e.Wait()
+	}
+
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	for _, n := range s.nics {
+		n.linkEP.Wait()
+	}
+}
+
 // Resume restarts the stack after a restore. This must be called after the
 // entire system has been restored.
 func (s *Stack) Resume() {
-- 
cgit v1.2.3


From 80d0db274ef88f4c53d2d08df52c0f9c58ca53ac Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Wed, 30 Oct 2019 03:06:34 +0000
Subject: Enable runsc/fsgofer support on arm64.

newfstatat() syscall is not supported on arm64, so we resort
to use the fstatat() syscall.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I9e89d46c5ec9ae07db201c9da5b6dda9bfd2eaf0
---
 runsc/fsgofer/BUILD                   |  2 ++
 runsc/fsgofer/filter/BUILD            |  2 ++
 runsc/fsgofer/filter/config.go        |  7 +----
 runsc/fsgofer/filter/config_amd64.go  | 33 +++++++++++++++++++++++
 runsc/fsgofer/filter/config_arm64.go  | 27 +++++++++++++++++++
 runsc/fsgofer/fsgofer_amd64_unsafe.go | 49 +++++++++++++++++++++++++++++++++++
 runsc/fsgofer/fsgofer_arm64_unsafe.go | 49 +++++++++++++++++++++++++++++++++++
 runsc/fsgofer/fsgofer_unsafe.go       | 25 ------------------
 8 files changed, 163 insertions(+), 31 deletions(-)
 create mode 100644 runsc/fsgofer/filter/config_amd64.go
 create mode 100644 runsc/fsgofer/filter/config_arm64.go
 create mode 100644 runsc/fsgofer/fsgofer_amd64_unsafe.go
 create mode 100644 runsc/fsgofer/fsgofer_arm64_unsafe.go

diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index 80a4aa2fe..afcb41801 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -6,6 +6,8 @@ go_library(
     name = "fsgofer",
     srcs = [
         "fsgofer.go",
+        "fsgofer_amd64_unsafe.go",
+        "fsgofer_arm64_unsafe.go",
         "fsgofer_unsafe.go",
     ],
     importpath = "gvisor.dev/gvisor/runsc/fsgofer",
diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD
index 02168ad1b..bac73f89d 100644
--- a/runsc/fsgofer/filter/BUILD
+++ b/runsc/fsgofer/filter/BUILD
@@ -6,6 +6,8 @@ go_library(
     name = "filter",
     srcs = [
         "config.go",
+        "config_amd64.go",
+        "config_arm64.go",
         "extra_filters.go",
         "extra_filters_msan.go",
         "extra_filters_race.go",
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
index 2ea95f8fb..a1792330f 100644
--- a/runsc/fsgofer/filter/config.go
+++ b/runsc/fsgofer/filter/config.go
@@ -25,11 +25,7 @@ import (
 
 // allowedSyscalls is the set of syscalls executed by the gofer.
 var allowedSyscalls = seccomp.SyscallRules{
-	syscall.SYS_ACCEPT: {},
-	syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
-		{seccomp.AllowValue(linux.ARCH_GET_FS)},
-		{seccomp.AllowValue(linux.ARCH_SET_FS)},
-	},
+	syscall.SYS_ACCEPT:        {},
 	syscall.SYS_CLOCK_GETTIME: {},
 	syscall.SYS_CLONE: []seccomp.Rule{
 		{
@@ -155,7 +151,6 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_MPROTECT:   {},
 	syscall.SYS_MUNMAP:     {},
 	syscall.SYS_NANOSLEEP:  {},
-	syscall.SYS_NEWFSTATAT: {},
 	syscall.SYS_OPENAT:     {},
 	syscall.SYS_PPOLL:      {},
 	syscall.SYS_PREAD64:    {},
diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go
new file mode 100644
index 000000000..a4b28cb8b
--- /dev/null
+++ b/runsc/fsgofer/filter/config_amd64.go
@@ -0,0 +1,33 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+func init() {
+	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
+		{seccomp.AllowValue(linux.ARCH_GET_FS)},
+		{seccomp.AllowValue(linux.ARCH_SET_FS)},
+	}
+
+	allowedSyscalls[syscall.SYS_NEWFSTATAT] = []seccomp.Rule{}
+}
diff --git a/runsc/fsgofer/filter/config_arm64.go b/runsc/fsgofer/filter/config_arm64.go
new file mode 100644
index 000000000..d2697deb7
--- /dev/null
+++ b/runsc/fsgofer/filter/config_arm64.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+func init() {
+	allowedSyscalls[syscall.SYS_FSTATAT] = []seccomp.Rule{}
+}
diff --git a/runsc/fsgofer/fsgofer_amd64_unsafe.go b/runsc/fsgofer/fsgofer_amd64_unsafe.go
new file mode 100644
index 000000000..5d4aab597
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_amd64_unsafe.go
@@ -0,0 +1,49 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package fsgofer
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/syserr"
+)
+
+func statAt(dirFd int, name string) (syscall.Stat_t, error) {
+	nameBytes, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return syscall.Stat_t{}, err
+	}
+	namePtr := unsafe.Pointer(nameBytes)
+
+	var stat syscall.Stat_t
+	statPtr := unsafe.Pointer(&stat)
+
+	if _, _, errno := syscall.Syscall6(
+		syscall.SYS_NEWFSTATAT,
+		uintptr(dirFd),
+		uintptr(namePtr),
+		uintptr(statPtr),
+		linux.AT_SYMLINK_NOFOLLOW,
+		0,
+		0); errno != 0 {
+
+		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+	}
+	return stat, nil
+}
diff --git a/runsc/fsgofer/fsgofer_arm64_unsafe.go b/runsc/fsgofer/fsgofer_arm64_unsafe.go
new file mode 100644
index 000000000..8041fd352
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_arm64_unsafe.go
@@ -0,0 +1,49 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package fsgofer
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/syserr"
+)
+
+func statAt(dirFd int, name string) (syscall.Stat_t, error) {
+	nameBytes, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return syscall.Stat_t{}, err
+	}
+	namePtr := unsafe.Pointer(nameBytes)
+
+	var stat syscall.Stat_t
+	statPtr := unsafe.Pointer(&stat)
+
+	if _, _, errno := syscall.Syscall6(
+		syscall.SYS_FSTATAT,
+		uintptr(dirFd),
+		uintptr(namePtr),
+		uintptr(statPtr),
+		linux.AT_SYMLINK_NOFOLLOW,
+		0,
+		0); errno != 0 {
+
+		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+	}
+	return stat, nil
+}
diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go
index ff2556aee..542b54365 100644
--- a/runsc/fsgofer/fsgofer_unsafe.go
+++ b/runsc/fsgofer/fsgofer_unsafe.go
@@ -18,34 +18,9 @@ import (
 	"syscall"
 	"unsafe"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/syserr"
 )
 
-func statAt(dirFd int, name string) (syscall.Stat_t, error) {
-	nameBytes, err := syscall.BytePtrFromString(name)
-	if err != nil {
-		return syscall.Stat_t{}, err
-	}
-	namePtr := unsafe.Pointer(nameBytes)
-
-	var stat syscall.Stat_t
-	statPtr := unsafe.Pointer(&stat)
-
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_NEWFSTATAT,
-		uintptr(dirFd),
-		uintptr(namePtr),
-		uintptr(statPtr),
-		linux.AT_SYMLINK_NOFOLLOW,
-		0,
-		0); errno != 0 {
-
-		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
-	}
-	return stat, nil
-}
-
 func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error {
 	// utimensat(2) doesn't accept empty name, instead name must be nil to make it
 	// operate directly on 'dirFd' unlike other *at syscalls.
-- 
cgit v1.2.3


From 8bc7b8dba2dcc339ab5bd1b05c83f74a6211a7d0 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 30 Oct 2019 13:29:56 -0700
Subject: Clean up typos in test names.

PiperOrigin-RevId: 277572791
---
 test/syscalls/linux/socket_ip_tcp_generic.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 7e0deda05..592448289 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -30,7 +30,7 @@
 namespace gvisor {
 namespace testing {
 
-TEST_P(TCPSocketPairTest, TcpInfoSucceedes) {
+TEST_P(TCPSocketPairTest, TcpInfoSucceeds) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
   struct tcp_info opt = {};
@@ -39,7 +39,7 @@ TEST_P(TCPSocketPairTest, TcpInfoSucceedes) {
               SyscallSucceeds());
 }
 
-TEST_P(TCPSocketPairTest, ShortTcpInfoSucceedes) {
+TEST_P(TCPSocketPairTest, ShortTcpInfoSucceeds) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
   struct tcp_info opt = {};
@@ -48,7 +48,7 @@ TEST_P(TCPSocketPairTest, ShortTcpInfoSucceedes) {
               SyscallSucceeds());
 }
 
-TEST_P(TCPSocketPairTest, ZeroTcpInfoSucceedes) {
+TEST_P(TCPSocketPairTest, ZeroTcpInfoSucceeds) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
   struct tcp_info opt = {};
-- 
cgit v1.2.3


From db37483cb6acf55b66132d534bb734f09555b1cf Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 30 Oct 2019 15:32:20 -0700
Subject: Store endpoints inside multiPortEndpoint in a sorted order

It is required to guarantee the same order of endpoints after save/restore.

PiperOrigin-RevId: 277598665
---
 pkg/tcpip/stack/registration.go             |   3 +
 pkg/tcpip/stack/stack.go                    |  29 ++++++++
 pkg/tcpip/stack/transport_demuxer.go        |  10 +++
 pkg/tcpip/stack/transport_test.go           |  11 ++-
 pkg/tcpip/transport/icmp/endpoint.go        |   7 ++
 pkg/tcpip/transport/tcp/endpoint.go         |   7 ++
 pkg/tcpip/transport/udp/endpoint.go         |   7 ++
 runsc/boot/loader.go                        |   5 +-
 test/syscalls/linux/socket_inet_loopback.cc | 107 ++++++++++++++++++++++++++++
 9 files changed, 181 insertions(+), 5 deletions(-)

diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 0360187b8..94015ba54 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -60,6 +60,9 @@ const (
 // TransportEndpoint is the interface that needs to be implemented by transport
 // protocol (e.g., tcp, udp) endpoints that can handle packets.
 type TransportEndpoint interface {
+	// UniqueID returns an unique ID for this transport endpoint.
+	UniqueID() uint64
+
 	// HandlePacket is called by the stack when new packets arrive to
 	// this transport endpoint.
 	HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView)
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 6d6ddc0ff..115a6fcb8 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -22,6 +22,7 @@ package stack
 import (
 	"encoding/binary"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"golang.org/x/time/rate"
@@ -344,6 +345,13 @@ type ResumableEndpoint interface {
 	Resume(*Stack)
 }
 
+// uniqueIDGenerator is a default unique ID generator.
+type uniqueIDGenerator uint64
+
+func (u *uniqueIDGenerator) UniqueID() uint64 {
+	return atomic.AddUint64((*uint64)(u), 1)
+}
+
 // Stack is a networking stack, with all supported protocols, NICs, and route
 // table.
 type Stack struct {
@@ -411,6 +419,14 @@ type Stack struct {
 	// ndpDisp is the NDP event dispatcher that is used to send the netstack
 	// integrator NDP related events.
 	ndpDisp NDPDispatcher
+
+	// uniqueIDGenerator is a generator of unique identifiers.
+	uniqueIDGenerator UniqueID
+}
+
+// UniqueID is an abstract generator of unique identifiers.
+type UniqueID interface {
+	UniqueID() uint64
 }
 
 // Options contains optional Stack configuration.
@@ -434,6 +450,9 @@ type Options struct {
 	// stack (false).
 	HandleLocal bool
 
+	// UniqueID is an optional generator of unique identifiers.
+	UniqueID UniqueID
+
 	// NDPConfigs is the default NDP configurations used by interfaces.
 	//
 	// By default, NDPConfigs will have a zero value for its
@@ -506,6 +525,10 @@ func New(opts Options) *Stack {
 		clock = &tcpip.StdClock{}
 	}
 
+	if opts.UniqueID == nil {
+		opts.UniqueID = new(uniqueIDGenerator)
+	}
+
 	// Make sure opts.NDPConfigs contains valid values only.
 	opts.NDPConfigs.validate()
 
@@ -524,6 +547,7 @@ func New(opts Options) *Stack {
 		portSeed:             generateRandUint32(),
 		ndpConfigs:           opts.NDPConfigs,
 		autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
+		uniqueIDGenerator:    opts.UniqueID,
 		ndpDisp:              opts.NDPDisp,
 	}
 
@@ -551,6 +575,11 @@ func New(opts Options) *Stack {
 	return s
 }
 
+// UniqueID returns a unique identifier.
+func (s *Stack) UniqueID() uint64 {
+	return s.uniqueIDGenerator.UniqueID()
+}
+
 // SetNetworkProtocolOption allows configuring individual protocol level
 // options. This method returns an error if the protocol is not supported or
 // option is not supported by the protocol implementation or the provided value
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index f633632f0..ccd3d030e 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -17,6 +17,7 @@ package stack
 import (
 	"fmt"
 	"math/rand"
+	"sort"
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -310,6 +311,15 @@ func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, reusePo
 	// endpointsMap. This will allow us to remove endpoint from the array fast.
 	ep.endpointsMap[t] = len(ep.endpointsArr)
 	ep.endpointsArr = append(ep.endpointsArr, t)
+
+	// ep.endpointsArr is sorted by endpoint unique IDs, so that endpoints
+	// can be restored in the same order.
+	sort.Slice(ep.endpointsArr, func(i, j int) bool {
+		return ep.endpointsArr[i].UniqueID() < ep.endpointsArr[j].UniqueID()
+	})
+	for i, e := range ep.endpointsArr {
+		ep.endpointsMap[e] = i
+	}
 	return nil
 }
 
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index ae6fda3a9..203e79f56 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -43,6 +43,7 @@ type fakeTransportEndpoint struct {
 	proto    *fakeTransportProtocol
 	peerAddr tcpip.Address
 	route    stack.Route
+	uniqueID uint64
 
 	// acceptQueue is non-nil iff bound.
 	acceptQueue []fakeTransportEndpoint
@@ -56,8 +57,8 @@ func (f *fakeTransportEndpoint) Stats() tcpip.EndpointStats {
 	return nil
 }
 
-func newFakeTransportEndpoint(s *stack.Stack, proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber) tcpip.Endpoint {
-	return &fakeTransportEndpoint{stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto}
+func newFakeTransportEndpoint(s *stack.Stack, proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint {
+	return &fakeTransportEndpoint{stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID}
 }
 
 func (f *fakeTransportEndpoint) Close() {
@@ -144,6 +145,10 @@ func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	return nil
 }
 
+func (f *fakeTransportEndpoint) UniqueID() uint64 {
+	return f.uniqueID
+}
+
 func (f *fakeTransportEndpoint) ConnectEndpoint(e tcpip.Endpoint) *tcpip.Error {
 	return nil
 }
@@ -251,7 +256,7 @@ func (*fakeTransportProtocol) Number() tcpip.TransportProtocolNumber {
 }
 
 func (f *fakeTransportProtocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
-	return newFakeTransportEndpoint(stack, f, netProto), nil
+	return newFakeTransportEndpoint(stack, f, netProto, stack.UniqueID()), nil
 }
 
 func (f *fakeTransportProtocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index d0dd383fd..114a69b4e 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -58,6 +58,7 @@ type endpoint struct {
 	// immutable.
 	stack       *stack.Stack `state:"manual"`
 	waiterQueue *waiter.Queue
+	uniqueID    uint64
 
 	// The following fields are used to manage the receive queue, and are
 	// protected by rcvMu.
@@ -90,9 +91,15 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt
 		rcvBufSizeMax: 32 * 1024,
 		sndBufSize:    32 * 1024,
 		state:         stateInitial,
+		uniqueID:      s.UniqueID(),
 	}, nil
 }
 
+// UniqueID implements stack.TransportEndpoint.UniqueID.
+func (e *endpoint) UniqueID() uint64 {
+	return e.uniqueID
+}
+
 // Close puts the endpoint in a closed state and frees all resources
 // associated with it.
 func (e *endpoint) Close() {
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 8a3ca0f1b..a1efd8d55 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -287,6 +287,7 @@ type endpoint struct {
 	// change throughout the lifetime of the endpoint.
 	stack       *stack.Stack  `state:"manual"`
 	waiterQueue *waiter.Queue `state:"wait"`
+	uniqueID    uint64
 
 	// lastError represents the last error that the endpoint reported;
 	// access to it is protected by the following mutex.
@@ -504,6 +505,11 @@ type endpoint struct {
 	stats Stats `state:"nosave"`
 }
 
+// UniqueID implements stack.TransportEndpoint.UniqueID.
+func (e *endpoint) UniqueID() uint64 {
+	return e.uniqueID
+}
+
 // calculateAdvertisedMSS calculates the MSS to advertise.
 //
 // If userMSS is non-zero and is not greater than the maximum possible MSS for
@@ -565,6 +571,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 			interval: 75 * time.Second,
 			count:    9,
 		},
+		uniqueID: s.UniqueID(),
 	}
 
 	var ss SendBufferSizeOption
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index cda302bb7..68977dc25 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -80,6 +80,7 @@ type endpoint struct {
 	// change throughout the lifetime of the endpoint.
 	stack       *stack.Stack `state:"manual"`
 	waiterQueue *waiter.Queue
+	uniqueID    uint64
 
 	// The following fields are used to manage the receive queue, and are
 	// protected by rcvMu.
@@ -160,9 +161,15 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		rcvBufSizeMax: 32 * 1024,
 		sndBufSize:    32 * 1024,
 		state:         StateInitial,
+		uniqueID:      s.UniqueID(),
 	}
 }
 
+// UniqueID implements stack.TransportEndpoint.UniqueID.
+func (e *endpoint) UniqueID() uint64 {
+	return e.uniqueID
+}
+
 // Close puts the endpoint in a closed state and frees all resources
 // associated with it.
 func (e *endpoint) Close() {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 0c0eba99e..86df384f8 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -232,7 +232,7 @@ func New(args Args) (*Loader, error) {
 	// this point. Netns is configured before Run() is called. Netstack is
 	// configured using a control uRPC message. Host network is configured inside
 	// Run().
-	networkStack, err := newEmptyNetworkStack(args.Conf, k)
+	networkStack, err := newEmptyNetworkStack(args.Conf, k, k)
 	if err != nil {
 		return nil, fmt.Errorf("creating network: %v", err)
 	}
@@ -905,7 +905,7 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
+func newEmptyNetworkStack(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
 	switch conf.Network {
 	case NetworkHost:
 		return hostinet.NewStack(), nil
@@ -923,6 +923,7 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
 			// Enable raw sockets for users with sufficient
 			// privileges.
 			RawFactory: raw.EndpointFactory{},
+			UniqueID:   uniqueID,
 		})}
 
 		// Enable SACK Recovery.
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 322ee07ad..ab375aaaf 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -16,6 +16,7 @@
 #include <netinet/in.h>
 #include <poll.h>
 #include <string.h>
+#include <sys/epoll.h>
 #include <sys/socket.h>
 
 #include <atomic>
@@ -516,6 +517,112 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) {
                 EquivalentWithin((kConnectAttempts / kThreadCount), 0.10));
 }
 
+TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
+  auto const& param = GetParam();
+
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+  sockaddr_storage listen_addr = listener.addr;
+  sockaddr_storage conn_addr = connector.addr;
+  constexpr int kThreadCount = 3;
+
+  // TODO(b/141211329): endpointsByNic.seed has to be saved/restored.
+  const DisableSave ds141211329;
+
+  // Create listening sockets.
+  FileDescriptor listener_fds[kThreadCount];
+  for (int i = 0; i < kThreadCount; i++) {
+    listener_fds[i] =
+        ASSERT_NO_ERRNO_AND_VALUE(Socket(listener.family(), SOCK_DGRAM, 0));
+    int fd = listener_fds[i].get();
+
+    ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                           sizeof(kSockOptOn)),
+                SyscallSucceeds());
+    ASSERT_THAT(
+        bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
+        SyscallSucceeds());
+
+    // On the first bind we need to determine which port was bound.
+    if (i != 0) {
+      continue;
+    }
+
+    // Get the port bound by the listening socket.
+    socklen_t addrlen = listener.addr_len;
+    ASSERT_THAT(
+        getsockname(listener_fds[0].get(),
+                    reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+        SyscallSucceeds());
+    uint16_t const port =
+        ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+    ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
+    ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  }
+
+  constexpr int kConnectAttempts = 10;
+  FileDescriptor client_fds[kConnectAttempts];
+
+  // Do the first run without save/restore.
+  DisableSave ds;
+  for (int i = 0; i < kConnectAttempts; i++) {
+    client_fds[i] =
+        ASSERT_NO_ERRNO_AND_VALUE(Socket(connector.family(), SOCK_DGRAM, 0));
+    EXPECT_THAT(RetryEINTR(sendto)(client_fds[i].get(), &i, sizeof(i), 0,
+                                   reinterpret_cast<sockaddr*>(&conn_addr),
+                                   connector.addr_len),
+                SyscallSucceedsWithValue(sizeof(i)));
+  }
+  ds.reset();
+
+  // Check that a mapping of client and server sockets has
+  // not been change after save/restore.
+  for (int i = 0; i < kConnectAttempts; i++) {
+    EXPECT_THAT(RetryEINTR(sendto)(client_fds[i].get(), &i, sizeof(i), 0,
+                                   reinterpret_cast<sockaddr*>(&conn_addr),
+                                   connector.addr_len),
+                SyscallSucceedsWithValue(sizeof(i)));
+  }
+
+  int epollfd;
+  ASSERT_THAT(epollfd = epoll_create1(0), SyscallSucceeds());
+
+  for (int i = 0; i < kThreadCount; i++) {
+    int fd = listener_fds[i].get();
+    struct epoll_event ev;
+    ev.data.fd = fd;
+    ev.events = EPOLLIN;
+    ASSERT_THAT(epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev), SyscallSucceeds());
+  }
+
+  std::map<uint16_t, int> portToFD;
+
+  for (int i = 0; i < kConnectAttempts * 2; i++) {
+    struct sockaddr_storage addr = {};
+    socklen_t addrlen = sizeof(addr);
+    struct epoll_event ev;
+    int data, fd;
+
+    ASSERT_THAT(epoll_wait(epollfd, &ev, 1, -1), SyscallSucceedsWithValue(1));
+
+    fd = ev.data.fd;
+    EXPECT_THAT(RetryEINTR(recvfrom)(fd, &data, sizeof(data), 0,
+                                     reinterpret_cast<struct sockaddr*>(&addr),
+                                     &addrlen),
+                SyscallSucceedsWithValue(sizeof(data)));
+    uint16_t const port =
+        ASSERT_NO_ERRNO_AND_VALUE(AddrPort(connector.family(), addr));
+    auto prev_port = portToFD.find(port);
+    // Check that all packets from one client have been delivered to the same
+    // server socket.
+    if (prev_port == portToFD.end()) {
+      portToFD[port] = ev.data.fd;
+    } else {
+      EXPECT_EQ(portToFD[port], ev.data.fd);
+    }
+  }
+}
+
 INSTANTIATE_TEST_SUITE_P(
     All, SocketInetReusePortTest,
     ::testing::Values(
-- 
cgit v1.2.3


From ca90dad0e21c758925968d217ad15a05015bd82c Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 30 Oct 2019 15:36:42 -0700
Subject: Fix container locking

Sandbox root dir was not being saved with the Container state,
so it would point to the wrong directory location when attempting
to lock the sandbox. This led to race conditions saving and
loading container state. Fixing it, led to multiple deadlocks.

I've moved the saving and locking logic to a separate struct and
moved the lock file inside the RootDir (instead of container
root dir), which allows the lock to be taken inside Destroy,
and removes the need to lock the sandbox.

PiperOrigin-RevId: 277599612
---
 runsc/container/BUILD         |   1 +
 runsc/container/container.go  | 344 ++++++++++++++++--------------------------
 runsc/container/state_file.go | 185 +++++++++++++++++++++++
 3 files changed, 312 insertions(+), 218 deletions(-)
 create mode 100644 runsc/container/state_file.go

diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 26d1cd5ab..2bd12120d 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "container.go",
         "hook.go",
+        "state_file.go",
         "status.go",
     ],
     importpath = "gvisor.dev/gvisor/runsc/container",
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 32510d427..68782c4be 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -17,13 +17,11 @@ package container
 
 import (
 	"context"
-	"encoding/json"
 	"fmt"
 	"io/ioutil"
 	"os"
 	"os/exec"
 	"os/signal"
-	"path/filepath"
 	"regexp"
 	"strconv"
 	"strings"
@@ -31,7 +29,6 @@ import (
 	"time"
 
 	"github.com/cenkalti/backoff"
-	"github.com/gofrs/flock"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
@@ -41,17 +38,6 @@ import (
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
-const (
-	// metadataFilename is the name of the metadata file relative to the
-	// container root directory that holds sandbox metadata.
-	metadataFilename = "meta.json"
-
-	// metadataLockFilename is the name of a lock file in the container
-	// root directory that is used to prevent concurrent modifications to
-	// the container state and metadata.
-	metadataLockFilename = "meta.lock"
-)
-
 // validateID validates the container id.
 func validateID(id string) error {
 	// See libcontainer/factory_linux.go.
@@ -99,11 +85,6 @@ type Container struct {
 	// BundleDir is the directory containing the container bundle.
 	BundleDir string `json:"bundleDir"`
 
-	// Root is the directory containing the container metadata file. If this
-	// container is the root container, Root and RootContainerDir will be the
-	// same.
-	Root string `json:"root"`
-
 	// CreatedAt is the time the container was created.
 	CreatedAt time.Time `json:"createdAt"`
 
@@ -121,21 +102,24 @@ type Container struct {
 	// be 0 if the gofer has been killed.
 	GoferPid int `json:"goferPid"`
 
+	// Sandbox is the sandbox this container is running in. It's set when the
+	// container is created and reset when the sandbox is destroyed.
+	Sandbox *sandbox.Sandbox `json:"sandbox"`
+
+	// Saver handles load from/save to the state file safely from multiple
+	// processes.
+	Saver StateFile `json:"saver"`
+
+	//
+	// Fields below this line are not saved in the state file and will not
+	// be preserved across commands.
+	//
+
 	// goferIsChild is set if a gofer process is a child of the current process.
 	//
 	// This field isn't saved to json, because only a creator of a gofer
 	// process will have it as a child process.
 	goferIsChild bool
-
-	// Sandbox is the sandbox this container is running in. It's set when the
-	// container is created and reset when the sandbox is destroyed.
-	Sandbox *sandbox.Sandbox `json:"sandbox"`
-
-	// RootContainerDir is the root directory containing the metadata file of the
-	// sandbox root container. It's used to lock in order to serialize creating
-	// and deleting this Container's metadata directory. If this container is the
-	// root container, this is the same as Root.
-	RootContainerDir string
 }
 
 // loadSandbox loads all containers that belong to the sandbox with the given
@@ -166,43 +150,35 @@ func loadSandbox(rootDir, id string) ([]*Container, error) {
 	return containers, nil
 }
 
-// Load loads a container with the given id from a metadata file. id may be an
-// abbreviation of the full container id, in which case Load loads the
-// container to which id unambiguously refers to.
-// Returns ErrNotExist if container doesn't exist.
-func Load(rootDir, id string) (*Container, error) {
-	log.Debugf("Load container %q %q", rootDir, id)
-	if err := validateID(id); err != nil {
+// Load loads a container with the given id from a metadata file. partialID may
+// be an abbreviation of the full container id, in which case Load loads the
+// container to which id unambiguously refers to. Returns ErrNotExist if
+// container doesn't exist.
+func Load(rootDir, partialID string) (*Container, error) {
+	log.Debugf("Load container %q %q", rootDir, partialID)
+	if err := validateID(partialID); err != nil {
 		return nil, fmt.Errorf("validating id: %v", err)
 	}
 
-	cRoot, err := findContainerRoot(rootDir, id)
+	id, err := findContainerID(rootDir, partialID)
 	if err != nil {
 		// Preserve error so that callers can distinguish 'not found' errors.
 		return nil, err
 	}
 
-	// Lock the container metadata to prevent other runsc instances from
-	// writing to it while we are reading it.
-	unlock, err := lockContainerMetadata(cRoot)
-	if err != nil {
-		return nil, err
+	state := StateFile{
+		RootDir: rootDir,
+		ID:      id,
 	}
-	defer unlock()
+	defer state.close()
 
-	// Read the container metadata file and create a new Container from it.
-	metaFile := filepath.Join(cRoot, metadataFilename)
-	metaBytes, err := ioutil.ReadFile(metaFile)
-	if err != nil {
+	c := &Container{}
+	if err := state.load(c); err != nil {
 		if os.IsNotExist(err) {
 			// Preserve error so that callers can distinguish 'not found' errors.
 			return nil, err
 		}
-		return nil, fmt.Errorf("reading container metadata file %q: %v", metaFile, err)
-	}
-	var c Container
-	if err := json.Unmarshal(metaBytes, &c); err != nil {
-		return nil, fmt.Errorf("unmarshaling container metadata from %q: %v", metaFile, err)
+		return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err)
 	}
 
 	// If the status is "Running" or "Created", check that the sandbox
@@ -223,57 +199,37 @@ func Load(rootDir, id string) (*Container, error) {
 		}
 	}
 
-	return &c, nil
+	return c, nil
 }
 
-func findContainerRoot(rootDir, partialID string) (string, error) {
+func findContainerID(rootDir, partialID string) (string, error) {
 	// Check whether the id fully specifies an existing container.
-	cRoot := filepath.Join(rootDir, partialID)
-	if _, err := os.Stat(cRoot); err == nil {
-		return cRoot, nil
+	stateFile := buildStatePath(rootDir, partialID)
+	if _, err := os.Stat(stateFile); err == nil {
+		return partialID, nil
 	}
 
 	// Now see whether id could be an abbreviation of exactly 1 of the
 	// container ids. If id is ambiguous (it could match more than 1
 	// container), it is an error.
-	cRoot = ""
 	ids, err := List(rootDir)
 	if err != nil {
 		return "", err
 	}
+	rv := ""
 	for _, id := range ids {
 		if strings.HasPrefix(id, partialID) {
-			if cRoot != "" {
-				return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, cRoot, id)
+			if rv != "" {
+				return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id)
 			}
-			cRoot = id
+			rv = id
 		}
 	}
-	if cRoot == "" {
+	if rv == "" {
 		return "", os.ErrNotExist
 	}
-	log.Debugf("abbreviated id %q resolves to full id %q", partialID, cRoot)
-	return filepath.Join(rootDir, cRoot), nil
-}
-
-// List returns all container ids in the given root directory.
-func List(rootDir string) ([]string, error) {
-	log.Debugf("List containers %q", rootDir)
-	fs, err := ioutil.ReadDir(rootDir)
-	if err != nil {
-		return nil, fmt.Errorf("reading dir %q: %v", rootDir, err)
-	}
-	var out []string
-	for _, f := range fs {
-		// Filter out directories that do no belong to a container.
-		cid := f.Name()
-		if validateID(cid) == nil {
-			if _, err := os.Stat(filepath.Join(rootDir, cid, metadataFilename)); err == nil {
-				out = append(out, f.Name())
-			}
-		}
-	}
-	return out, nil
+	log.Debugf("abbreviated id %q resolves to full id %q", partialID, rv)
+	return rv, nil
 }
 
 // Args is used to configure a new container.
@@ -316,44 +272,34 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 		return nil, err
 	}
 
-	unlockRoot, err := maybeLockRootContainer(args.Spec, conf.RootDir)
-	if err != nil {
-		return nil, err
+	if err := os.MkdirAll(conf.RootDir, 0711); err != nil {
+		return nil, fmt.Errorf("creating container root directory: %v", err)
 	}
-	defer unlockRoot()
+
+	c := &Container{
+		ID:            args.ID,
+		Spec:          args.Spec,
+		ConsoleSocket: args.ConsoleSocket,
+		BundleDir:     args.BundleDir,
+		Status:        Creating,
+		CreatedAt:     time.Now(),
+		Owner:         os.Getenv("USER"),
+		Saver: StateFile{
+			RootDir: conf.RootDir,
+			ID:      args.ID,
+		},
+	}
+	// The Cleanup object cleans up partially created containers when an error
+	// occurs. Any errors occurring during cleanup itself are ignored.
+	cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
+	defer cu.Clean()
 
 	// Lock the container metadata file to prevent concurrent creations of
 	// containers with the same id.
-	containerRoot := filepath.Join(conf.RootDir, args.ID)
-	unlock, err := lockContainerMetadata(containerRoot)
-	if err != nil {
+	if err := c.Saver.lockForNew(); err != nil {
 		return nil, err
 	}
-	defer unlock()
-
-	// Check if the container already exists by looking for the metadata
-	// file.
-	if _, err := os.Stat(filepath.Join(containerRoot, metadataFilename)); err == nil {
-		return nil, fmt.Errorf("container with id %q already exists", args.ID)
-	} else if !os.IsNotExist(err) {
-		return nil, fmt.Errorf("looking for existing container in %q: %v", containerRoot, err)
-	}
-
-	c := &Container{
-		ID:               args.ID,
-		Spec:             args.Spec,
-		ConsoleSocket:    args.ConsoleSocket,
-		BundleDir:        args.BundleDir,
-		Root:             containerRoot,
-		Status:           Creating,
-		CreatedAt:        time.Now(),
-		Owner:            os.Getenv("USER"),
-		RootContainerDir: conf.RootDir,
-	}
-	// The Cleanup object cleans up partially created containers when an error occurs.
-	// Any errors occuring during cleanup itself are ignored.
-	cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
-	defer cu.Clean()
+	defer c.Saver.unlock()
 
 	// If the metadata annotations indicate that this container should be
 	// started in an existing sandbox, we must do so. The metadata will
@@ -431,7 +377,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 	c.changeStatus(Created)
 
 	// Save the metadata file.
-	if err := c.save(); err != nil {
+	if err := c.saveLocked(); err != nil {
 		return nil, err
 	}
 
@@ -451,17 +397,12 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 func (c *Container) Start(conf *boot.Config) error {
 	log.Debugf("Start container %q", c.ID)
 
-	unlockRoot, err := maybeLockRootContainer(c.Spec, c.RootContainerDir)
-	if err != nil {
+	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	defer unlockRoot()
+	unlock := specutils.MakeCleanup(func() { c.Saver.unlock() })
+	defer unlock.Clean()
 
-	unlock, err := c.lock()
-	if err != nil {
-		return err
-	}
-	defer unlock()
 	if err := c.requireStatus("start", Created); err != nil {
 		return err
 	}
@@ -509,14 +450,15 @@ func (c *Container) Start(conf *boot.Config) error {
 	}
 
 	c.changeStatus(Running)
-	if err := c.save(); err != nil {
+	if err := c.saveLocked(); err != nil {
 		return err
 	}
 
-	// Adjust the oom_score_adj for sandbox. This must be done after
-	// save().
-	err = adjustSandboxOOMScoreAdj(c.Sandbox, c.RootContainerDir, false)
-	if err != nil {
+	// Release lock before adjusting OOM score because the lock is acquired there.
+	unlock.Clean()
+
+	// Adjust the oom_score_adj for sandbox. This must be done after saveLocked().
+	if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Saver.RootDir, false); err != nil {
 		return err
 	}
 
@@ -529,11 +471,10 @@ func (c *Container) Start(conf *boot.Config) error {
 // to restore a container from its state file.
 func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error {
 	log.Debugf("Restore container %q", c.ID)
-	unlock, err := c.lock()
-	if err != nil {
+	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	defer unlock()
+	defer c.Saver.unlock()
 
 	if err := c.requireStatus("restore", Created); err != nil {
 		return err
@@ -551,7 +492,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str
 		return err
 	}
 	c.changeStatus(Running)
-	return c.save()
+	return c.saveLocked()
 }
 
 // Run is a helper that calls Create + Start + Wait.
@@ -711,11 +652,10 @@ func (c *Container) Checkpoint(f *os.File) error {
 // The call only succeeds if the container's status is created or running.
 func (c *Container) Pause() error {
 	log.Debugf("Pausing container %q", c.ID)
-	unlock, err := c.lock()
-	if err != nil {
+	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	defer unlock()
+	defer c.Saver.unlock()
 
 	if c.Status != Created && c.Status != Running {
 		return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status)
@@ -725,18 +665,17 @@ func (c *Container) Pause() error {
 		return fmt.Errorf("pausing container: %v", err)
 	}
 	c.changeStatus(Paused)
-	return c.save()
+	return c.saveLocked()
 }
 
 // Resume unpauses the container and its kernel.
 // The call only succeeds if the container's status is paused.
 func (c *Container) Resume() error {
 	log.Debugf("Resuming container %q", c.ID)
-	unlock, err := c.lock()
-	if err != nil {
+	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	defer unlock()
+	defer c.Saver.unlock()
 
 	if c.Status != Paused {
 		return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status)
@@ -745,7 +684,7 @@ func (c *Container) Resume() error {
 		return fmt.Errorf("resuming container: %v", err)
 	}
 	c.changeStatus(Running)
-	return c.save()
+	return c.saveLocked()
 }
 
 // State returns the metadata of the container.
@@ -773,6 +712,17 @@ func (c *Container) Processes() ([]*control.Process, error) {
 func (c *Container) Destroy() error {
 	log.Debugf("Destroy container %q", c.ID)
 
+	if err := c.Saver.lock(); err != nil {
+		return err
+	}
+	defer func() {
+		c.Saver.unlock()
+		c.Saver.close()
+	}()
+
+	// Stored for later use as stop() sets c.Sandbox to nil.
+	sb := c.Sandbox
+
 	// We must perform the following cleanup steps:
 	// * stop the container and gofer processes,
 	// * remove the container filesystem on the host, and
@@ -782,48 +732,43 @@ func (c *Container) Destroy() error {
 	// do our best to perform all of the cleanups. Hence, we keep a slice
 	// of errors return their concatenation.
 	var errs []string
-
-	unlock, err := maybeLockRootContainer(c.Spec, c.RootContainerDir)
-	if err != nil {
-		return err
-	}
-	defer unlock()
-
-	// Stored for later use as stop() sets c.Sandbox to nil.
-	sb := c.Sandbox
-
 	if err := c.stop(); err != nil {
 		err = fmt.Errorf("stopping container: %v", err)
 		log.Warningf("%v", err)
 		errs = append(errs, err.Error())
 	}
 
-	if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) {
-		err = fmt.Errorf("deleting container root directory %q: %v", c.Root, err)
+	if err := c.Saver.destroy(); err != nil {
+		err = fmt.Errorf("deleting container state files: %v", err)
 		log.Warningf("%v", err)
 		errs = append(errs, err.Error())
 	}
 
 	c.changeStatus(Stopped)
 
-	// Adjust oom_score_adj for the sandbox. This must be done after the
-	// container is stopped and the directory at c.Root is removed.
-	// We must test if the sandbox is nil because Destroy should be
-	// idempotent.
-	if sb != nil {
-		if err := adjustSandboxOOMScoreAdj(sb, c.RootContainerDir, true); err != nil {
+	// Adjust oom_score_adj for the sandbox. This must be done after the container
+	// is stopped and the directory at c.Root is removed. Adjustment can be
+	// skipped if the root container is exiting, because it brings down the entire
+	// sandbox.
+	//
+	// Use 'sb' to tell whether it has been executed before because Destroy must
+	// be idempotent.
+	if sb != nil && !isRoot(c.Spec) {
+		if err := adjustSandboxOOMScoreAdj(sb, c.Saver.RootDir, true); err != nil {
 			errs = append(errs, err.Error())
 		}
 	}
 
 	// "If any poststop hook fails, the runtime MUST log a warning, but the
-	// remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec.
-	// Based on the OCI, "The post-stop hooks MUST be called after the container is
-	// deleted but before the delete operation returns"
+	// remaining hooks and lifecycle continue as if the hook had
+	// succeeded" - OCI spec.
+	//
+	// Based on the OCI, "The post-stop hooks MUST be called after the container
+	// is deleted but before the delete operation returns"
 	// Run it here to:
 	// 1) Conform to the OCI.
-	// 2) Make sure it only runs once, because the root has been deleted, the container
-	// can't be loaded again.
+	// 2) Make sure it only runs once, because the root has been deleted, the
+	// container can't be loaded again.
 	if c.Spec.Hooks != nil {
 		executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
 	}
@@ -834,18 +779,13 @@ func (c *Container) Destroy() error {
 	return fmt.Errorf(strings.Join(errs, "\n"))
 }
 
-// save saves the container metadata to a file.
+// saveLocked saves the container metadata to a file.
 //
 // Precondition: container must be locked with container.lock().
-func (c *Container) save() error {
+func (c *Container) saveLocked() error {
 	log.Debugf("Save container %q", c.ID)
-	metaFile := filepath.Join(c.Root, metadataFilename)
-	meta, err := json.Marshal(c)
-	if err != nil {
-		return fmt.Errorf("invalid container metadata: %v", err)
-	}
-	if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil {
-		return fmt.Errorf("writing container metadata: %v", err)
+	if err := c.Saver.saveLocked(c); err != nil {
+		return fmt.Errorf("saving container metadata: %v", err)
 	}
 	return nil
 }
@@ -1106,48 +1046,6 @@ func (c *Container) requireStatus(action string, statuses ...Status) error {
 	return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status)
 }
 
-// lock takes a file lock on the container metadata lock file.
-func (c *Container) lock() (func() error, error) {
-	return lockContainerMetadata(filepath.Join(c.Root, c.ID))
-}
-
-// lockContainerMetadata takes a file lock on the metadata lock file in the
-// given container root directory.
-func lockContainerMetadata(containerRootDir string) (func() error, error) {
-	if err := os.MkdirAll(containerRootDir, 0711); err != nil {
-		return nil, fmt.Errorf("creating container root directory %q: %v", containerRootDir, err)
-	}
-	f := filepath.Join(containerRootDir, metadataLockFilename)
-	l := flock.NewFlock(f)
-	if err := l.Lock(); err != nil {
-		return nil, fmt.Errorf("acquiring lock on container lock file %q: %v", f, err)
-	}
-	return l.Unlock, nil
-}
-
-// maybeLockRootContainer locks the sandbox root container. It is used to
-// prevent races to create and delete child container sandboxes.
-func maybeLockRootContainer(spec *specs.Spec, rootDir string) (func() error, error) {
-	if isRoot(spec) {
-		return func() error { return nil }, nil
-	}
-
-	sbid, ok := specutils.SandboxID(spec)
-	if !ok {
-		return nil, fmt.Errorf("no sandbox ID found when locking root container")
-	}
-	sb, err := Load(rootDir, sbid)
-	if err != nil {
-		return nil, err
-	}
-
-	unlock, err := sb.lock()
-	if err != nil {
-		return nil, err
-	}
-	return unlock, nil
-}
-
 func isRoot(spec *specs.Spec) bool {
 	return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer
 }
@@ -1170,7 +1068,12 @@ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
 func (c *Container) adjustGoferOOMScoreAdj() error {
 	if c.GoferPid != 0 && c.Spec.Process.OOMScoreAdj != nil {
 		if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil {
-			return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
+			// Ignore NotExist error because it can be returned when the sandbox
+			// exited while OOM score was being adjusted.
+			if !os.IsNotExist(err) {
+				return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
+			}
+			log.Warningf("Gofer process (%d) not found setting oom_score_adj", c.GoferPid)
 		}
 	}
 
@@ -1252,7 +1155,12 @@ func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool)
 
 	// Set the lowest of all containers oom_score_adj to the sandbox.
 	if err := setOOMScoreAdj(s.Pid, lowScore); err != nil {
-		return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", s.ID, err)
+		// Ignore NotExist error because it can be returned when the sandbox
+		// exited while OOM score was being adjusted.
+		if !os.IsNotExist(err) {
+			return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", s.ID, err)
+		}
+		log.Warningf("Sandbox process (%d) not found setting oom_score_adj", s.Pid)
 	}
 
 	return nil
diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go
new file mode 100644
index 000000000..d95151ea5
--- /dev/null
+++ b/runsc/container/state_file.go
@@ -0,0 +1,185 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"sync"
+
+	"github.com/gofrs/flock"
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+const stateFileExtension = ".state"
+
+// StateFile handles load from/save to container state safely from multiple
+// processes. It uses a lock file to provide synchronization between operations.
+//
+// The lock file is located at: "${s.RootDir}/${s.ID}.lock".
+// The state file is located at: "${s.RootDir}/${s.ID}.state".
+type StateFile struct {
+	// RootDir is the directory containing the container metadata file.
+	RootDir string `json:"rootDir"`
+
+	// ID is the container ID.
+	ID string `json:"id"`
+
+	//
+	// Fields below this line are not saved in the state file and will not
+	// be preserved across commands.
+	//
+
+	once  sync.Once
+	flock *flock.Flock
+}
+
+// List returns all container ids in the given root directory.
+func List(rootDir string) ([]string, error) {
+	log.Debugf("List containers %q", rootDir)
+	list, err := filepath.Glob(filepath.Join(rootDir, "*"+stateFileExtension))
+	if err != nil {
+		return nil, err
+	}
+	var out []string
+	for _, path := range list {
+		// Filter out files that do no belong to a container.
+		fileName := filepath.Base(path)
+		if len(fileName) < len(stateFileExtension) {
+			panic(fmt.Sprintf("invalid file match %q", path))
+		}
+		// Remove the extension.
+		cid := fileName[:len(fileName)-len(stateFileExtension)]
+		if validateID(cid) == nil {
+			out = append(out, cid)
+		}
+	}
+	return out, nil
+}
+
+// lock globally locks all locking operations for the container.
+func (s *StateFile) lock() error {
+	s.once.Do(func() {
+		s.flock = flock.NewFlock(s.lockPath())
+	})
+
+	if err := s.flock.Lock(); err != nil {
+		return fmt.Errorf("acquiring lock on %q: %v", s.flock, err)
+	}
+	return nil
+}
+
+// lockForNew acquires the lock and checks if the state file doesn't exist. This
+// is done to ensure that more than one creation didn't race to create
+// containers with the same ID.
+func (s *StateFile) lockForNew() error {
+	if err := s.lock(); err != nil {
+		return err
+	}
+
+	// Checks if the container already exists by looking for the metadata file.
+	if _, err := os.Stat(s.statePath()); err == nil {
+		s.unlock()
+		return fmt.Errorf("container already exists")
+	} else if !os.IsNotExist(err) {
+		s.unlock()
+		return fmt.Errorf("looking for existing container: %v", err)
+	}
+	return nil
+}
+
+// unlock globally unlocks all locking operations for the container.
+func (s *StateFile) unlock() error {
+	if !s.flock.Locked() {
+		panic("unlock called without lock held")
+	}
+
+	if err := s.flock.Unlock(); err != nil {
+		log.Warningf("Error to release lock on %q: %v", s.flock, err)
+		return fmt.Errorf("releasing lock on %q: %v", s.flock, err)
+	}
+	return nil
+}
+
+// saveLocked saves 'v' to the state file.
+//
+// Preconditions: lock() must been called before.
+func (s *StateFile) saveLocked(v interface{}) error {
+	if !s.flock.Locked() {
+		panic("saveLocked called without lock held")
+	}
+
+	meta, err := json.Marshal(v)
+	if err != nil {
+		return err
+	}
+	if err := ioutil.WriteFile(s.statePath(), meta, 0640); err != nil {
+		return fmt.Errorf("writing json file: %v", err)
+	}
+	return nil
+}
+
+func (s *StateFile) load(v interface{}) error {
+	if err := s.lock(); err != nil {
+		return err
+	}
+	defer s.unlock()
+
+	metaBytes, err := ioutil.ReadFile(s.statePath())
+	if err != nil {
+		return err
+	}
+	return json.Unmarshal(metaBytes, &v)
+}
+
+func (s *StateFile) close() error {
+	if s.flock == nil {
+		return nil
+	}
+	if s.flock.Locked() {
+		panic("Closing locked file")
+	}
+	return s.flock.Close()
+}
+
+func buildStatePath(rootDir, id string) string {
+	return filepath.Join(rootDir, id+stateFileExtension)
+}
+
+// statePath is the full path to the state file.
+func (s *StateFile) statePath() string {
+	return buildStatePath(s.RootDir, s.ID)
+}
+
+// lockPath is the full path to the lock file.
+func (s *StateFile) lockPath() string {
+	return filepath.Join(s.RootDir, s.ID+".lock")
+}
+
+// destroy deletes all state created by the stateFile. It may be called with the
+// lock file held. In that case, the lock file must still be unlocked and
+// properly closed after destroy returns.
+func (s *StateFile) destroy() error {
+	if err := os.Remove(s.statePath()); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	if err := os.Remove(s.lockPath()); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	return nil
+}
-- 
cgit v1.2.3


From ca933329fa46ce219b39f4cf8cba1754b36cc2c2 Mon Sep 17 00:00:00 2001
From: lubinszARM <34124929+lubinszARM@users.noreply.github.com>
Date: Wed, 30 Oct 2019 15:51:42 -0700
Subject: support using KVM_MEM_READONLY for arm64 regions

On Arm platform, "setMemoryRegion" has extra permission checks.
In virt/kvm/arm/mmu.c: kvm_arch_prepare_memory_region()
      ....
      if (writable && !(vma->vm_flags & VM_WRITE)) {
             ret = -EPERM;
             break;
       }
        ....
So, for Arm platform, the "flags" for kvm_memory_region is required.
And on x86 platform, the "flags" can be always set as '0'.

Signed-off-by: Bin Lu <bin.lu@arm.com>
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/810 from lubinszARM:pr_setregion 8c99b19cfb0c859c6630a1cfff951db65fcf87ac
PiperOrigin-RevId: 277602603
---
 pkg/sentry/platform/kvm/BUILD                   |  1 +
 pkg/sentry/platform/kvm/address_space.go        |  2 +-
 pkg/sentry/platform/kvm/allocator.go            |  2 +-
 pkg/sentry/platform/kvm/bluepill_fault.go       | 10 ++--
 pkg/sentry/platform/kvm/bluepill_unsafe.go      |  2 +-
 pkg/sentry/platform/kvm/kvm_const.go            |  7 +++
 pkg/sentry/platform/kvm/machine.go              | 26 +++++++++--
 pkg/sentry/platform/kvm/machine_amd64.go        | 10 ++++
 pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 24 ----------
 pkg/sentry/platform/kvm/machine_arm64.go        | 61 +++++++++++++++++++++++++
 pkg/sentry/platform/kvm/machine_unsafe.go       | 24 ++++++++++
 11 files changed, 133 insertions(+), 36 deletions(-)
 create mode 100644 pkg/sentry/platform/kvm/machine_arm64.go

diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 31fa48ec5..6803d488c 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -23,6 +23,7 @@ go_library(
         "machine.go",
         "machine_amd64.go",
         "machine_amd64_unsafe.go",
+        "machine_arm64.go",
         "machine_unsafe.go",
         "physical_map.go",
         "virtual_map.go",
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index acd41f73d..ea8b9632e 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -127,7 +127,7 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac
 		// not have physical mappings, the KVM module may inject
 		// spurious exceptions when emulation fails (i.e. it tries to
 		// emulate because the RIP is pointed at those pages).
-		as.machine.mapPhysical(physical, length)
+		as.machine.mapPhysical(physical, length, physicalRegions, _KVM_MEM_FLAGS_NONE)
 
 		// Install the page table mappings. Note that the ordering is
 		// important; if the pagetable mappings were installed before
diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/allocator.go
index 80942e9c9..3f35414bb 100644
--- a/pkg/sentry/platform/kvm/allocator.go
+++ b/pkg/sentry/platform/kvm/allocator.go
@@ -54,7 +54,7 @@ func (a allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr {
 //
 //go:nosplit
 func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
-	virtualStart, physicalStart, _, ok := calculateBluepillFault(physical)
+	virtualStart, physicalStart, _, ok := calculateBluepillFault(physical, physicalRegions)
 	if !ok {
 		panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical))
 	}
diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go
index b97476053..f6459cda9 100644
--- a/pkg/sentry/platform/kvm/bluepill_fault.go
+++ b/pkg/sentry/platform/kvm/bluepill_fault.go
@@ -46,9 +46,9 @@ func yield() {
 // calculateBluepillFault calculates the fault address range.
 //
 //go:nosplit
-func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, length uintptr, ok bool) {
+func calculateBluepillFault(physical uintptr, phyRegions []physicalRegion) (virtualStart, physicalStart, length uintptr, ok bool) {
 	alignedPhysical := physical &^ uintptr(usermem.PageSize-1)
-	for _, pr := range physicalRegions {
+	for _, pr := range phyRegions {
 		end := pr.physical + pr.length
 		if physical < pr.physical || physical >= end {
 			continue
@@ -77,12 +77,12 @@ func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, leng
 // The corresponding virtual address is returned. This may throw on error.
 //
 //go:nosplit
-func handleBluepillFault(m *machine, physical uintptr) (uintptr, bool) {
+func handleBluepillFault(m *machine, physical uintptr, phyRegions []physicalRegion, flags uint32) (uintptr, bool) {
 	// Paging fault: we need to map the underlying physical pages for this
 	// fault. This all has to be done in this function because we're in a
 	// signal handler context. (We can't call any functions that might
 	// split the stack.)
-	virtualStart, physicalStart, length, ok := calculateBluepillFault(physical)
+	virtualStart, physicalStart, length, ok := calculateBluepillFault(physical, phyRegions)
 	if !ok {
 		return 0, false
 	}
@@ -96,7 +96,7 @@ func handleBluepillFault(m *machine, physical uintptr) (uintptr, bool) {
 		yield() // Race with another call.
 		slot = atomic.SwapUint32(&m.nextSlot, ^uint32(0))
 	}
-	errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart)
+	errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart, flags)
 	if errno == 0 {
 		// Successfully added region; we can increment nextSlot and
 		// allow another set to proceed here.
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index ee730ad70..3734bfb7a 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -162,7 +162,7 @@ func bluepillHandler(context unsafe.Pointer) {
 
 			// For MMIO, the physical address is the first data item.
 			physical := uintptr(c.runData.data[0])
-			virtual, ok := handleBluepillFault(c.machine, physical)
+			virtual, ok := handleBluepillFault(c.machine, physical, physicalRegions, _KVM_MEM_FLAGS_NONE)
 			if !ok {
 				c.die(bluepillArchContext(context), "invalid physical address")
 				return
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index d05f05c29..766131d60 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -62,3 +62,10 @@ const (
 	_KVM_NR_INTERRUPTS    = 0x100
 	_KVM_NR_CPUID_ENTRIES = 0x100
 )
+
+// KVM kvm_memory_region::flags.
+const (
+	_KVM_MEM_LOG_DIRTY_PAGES = uint32(1) << 0
+	_KVM_MEM_READONLY        = uint32(1) << 1
+	_KVM_MEM_FLAGS_NONE      = 0
+)
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index cc6c138b2..7d02ebf19 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -215,6 +215,17 @@ func newMachine(vm int) (*machine, error) {
 		return true // Keep iterating.
 	})
 
+	var physicalRegionsReadOnly []physicalRegion
+	var physicalRegionsAvailable []physicalRegion
+
+	physicalRegionsReadOnly = rdonlyRegionsForSetMem()
+	physicalRegionsAvailable = availableRegionsForSetMem()
+
+	// Map all read-only regions.
+	for _, r := range physicalRegionsReadOnly {
+		m.mapPhysical(r.physical, r.length, physicalRegionsReadOnly, _KVM_MEM_READONLY)
+	}
+
 	// Ensure that the currently mapped virtual regions are actually
 	// available in the VM. Note that this doesn't guarantee no future
 	// faults, however it should guarantee that everything is available to
@@ -223,6 +234,13 @@ func newMachine(vm int) (*machine, error) {
 		if excludeVirtualRegion(vr) {
 			return // skip region.
 		}
+
+		for _, r := range physicalRegionsReadOnly {
+			if vr.virtual == r.virtual {
+				return
+			}
+		}
+
 		for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
 			physical, length, ok := translateToPhysical(virtual)
 			if !ok {
@@ -236,7 +254,7 @@ func newMachine(vm int) (*machine, error) {
 			}
 
 			// Ensure the physical range is mapped.
-			m.mapPhysical(physical, length)
+			m.mapPhysical(physical, length, physicalRegionsAvailable, _KVM_MEM_FLAGS_NONE)
 			virtual += length
 		}
 	})
@@ -256,9 +274,9 @@ func newMachine(vm int) (*machine, error) {
 // not available. This attempts to be efficient for calls in the hot path.
 //
 // This panics on error.
-func (m *machine) mapPhysical(physical, length uintptr) {
+func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion, flags uint32) {
 	for end := physical + length; physical < end; {
-		_, physicalStart, length, ok := calculateBluepillFault(physical)
+		_, physicalStart, length, ok := calculateBluepillFault(physical, phyRegions)
 		if !ok {
 			// Should never happen.
 			panic("mapPhysical on unknown physical address")
@@ -266,7 +284,7 @@ func (m *machine) mapPhysical(physical, length uintptr) {
 
 		if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok {
 			// Not present in the cache; requires setting the slot.
-			if _, ok := handleBluepillFault(m, physical); !ok {
+			if _, ok := handleBluepillFault(m, physical, phyRegions, flags); !ok {
 				panic("handleBluepillFault failed")
 			}
 		}
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index c1cbe33be..b99fe425e 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -355,3 +355,13 @@ func (m *machine) retryInGuest(fn func()) {
 		}
 	}
 }
+
+// On x86 platform, the flags for "setMemoryRegion" can always be set as 0.
+// There is no need to return read-only physicalRegions.
+func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
+	return nil
+}
+
+func availableRegionsForSetMem() (phyRegions []physicalRegion) {
+	return physicalRegions
+}
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 506ec9af1..61227cafb 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -26,30 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/time"
 )
 
-// setMemoryRegion initializes a region.
-//
-// This may be called from bluepillHandler, and therefore returns an errno
-// directly (instead of wrapping in an error) to avoid allocations.
-//
-//go:nosplit
-func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno {
-	userRegion := userMemoryRegion{
-		slot:          uint32(slot),
-		flags:         0,
-		guestPhysAddr: uint64(physical),
-		memorySize:    uint64(length),
-		userspaceAddr: uint64(virtual),
-	}
-
-	// Set the region.
-	_, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(m.fd),
-		_KVM_SET_USER_MEMORY_REGION,
-		uintptr(unsafe.Pointer(&userRegion)))
-	return errno
-}
-
 // loadSegments copies the current segments.
 //
 // This may be called from within the signal context and throws on error.
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
new file mode 100644
index 000000000..b7e2cfb9d
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -0,0 +1,61 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+// Get all read-only physicalRegions.
+func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
+	var rdonlyRegions []region
+
+	applyVirtualRegions(func(vr virtualRegion) {
+		if excludeVirtualRegion(vr) {
+			return
+		}
+
+		if !vr.accessType.Write && vr.accessType.Read {
+			rdonlyRegions = append(rdonlyRegions, vr.region)
+		}
+	})
+
+	for _, r := range rdonlyRegions {
+		physical, _, ok := translateToPhysical(r.virtual)
+		if !ok {
+			continue
+		}
+
+		phyRegions = append(phyRegions, physicalRegion{
+			region: region{
+				virtual: r.virtual,
+				length:  r.length,
+			},
+			physical: physical,
+		})
+	}
+
+	return phyRegions
+}
+
+// Get all available physicalRegions.
+func availableRegionsForSetMem() (phyRegions []physicalRegion) {
+	var excludeRegions []region
+	applyVirtualRegions(func(vr virtualRegion) {
+		if !vr.accessType.Write {
+			excludeRegions = append(excludeRegions, vr.region)
+		}
+	})
+
+	phyRegions = computePhysicalRegions(excludeRegions)
+
+	return phyRegions
+}
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index e00c7ae40..ed9433311 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -35,6 +35,30 @@ func entersyscall()
 //go:linkname exitsyscall runtime.exitsyscall
 func exitsyscall()
 
+// setMemoryRegion initializes a region.
+//
+// This may be called from bluepillHandler, and therefore returns an errno
+// directly (instead of wrapping in an error) to avoid allocations.
+//
+//go:nosplit
+func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr, flags uint32) syscall.Errno {
+	userRegion := userMemoryRegion{
+		slot:          uint32(slot),
+		flags:         uint32(flags),
+		guestPhysAddr: uint64(physical),
+		memorySize:    uint64(length),
+		userspaceAddr: uint64(virtual),
+	}
+
+	// Set the region.
+	_, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(m.fd),
+		_KVM_SET_USER_MEMORY_REGION,
+		uintptr(unsafe.Pointer(&userRegion)))
+	return errno
+}
+
 // mapRunData maps the vCPU run data.
 func mapRunData(fd int) (*runData, error) {
 	r, _, errno := syscall.RawSyscall6(
-- 
cgit v1.2.3


From df125c986948fbbae2bc30de33213e2095762a86 Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Wed, 30 Oct 2019 16:14:30 -0700
Subject: Add Kokoro config for new runtime tests

PiperOrigin-RevId: 277607217
---
 kokoro/runtime_tests.cfg     |  1 +
 scripts/runtime_tests.sh     | 25 +++++++++++++++++++++++++
 test/runtimes/BUILD          | 10 +++++-----
 test/runtimes/build_defs.bzl | 39 +++++++++++++++++++++++++++------------
 4 files changed, 58 insertions(+), 17 deletions(-)
 create mode 100644 kokoro/runtime_tests.cfg
 create mode 100755 scripts/runtime_tests.sh

diff --git a/kokoro/runtime_tests.cfg b/kokoro/runtime_tests.cfg
new file mode 100644
index 000000000..7d56d5aca
--- /dev/null
+++ b/kokoro/runtime_tests.cfg
@@ -0,0 +1 @@
+build_file: "repo/scripts/runtime_tests.sh"
diff --git a/scripts/runtime_tests.sh b/scripts/runtime_tests.sh
new file mode 100755
index 000000000..fb82b2491
--- /dev/null
+++ b/scripts/runtime_tests.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+if [ ! -v RUNTIME ]; then
+  echo 'Must set $RUNTIME' >&2
+  exit 1
+fi
+
+install_runsc_for_test runtimes
+test_runsc "//test/runtimes:${RUNTIME}_test"
diff --git a/test/runtimes/BUILD b/test/runtimes/BUILD
index 2e125525b..367295206 100644
--- a/test/runtimes/BUILD
+++ b/test/runtimes/BUILD
@@ -16,32 +16,32 @@ go_binary(
 )
 
 runtime_test(
+    name = "go1.12",
     blacklist_file = "blacklist_go1.12.csv",
-    image = "gcr.io/gvisor-presubmit/go1.12",
     lang = "go",
 )
 
 runtime_test(
+    name = "java11",
     blacklist_file = "blacklist_java11.csv",
-    image = "gcr.io/gvisor-presubmit/java11",
     lang = "java",
 )
 
 runtime_test(
+    name = "nodejs12.4.0",
     blacklist_file = "blacklist_nodejs12.4.0.csv",
-    image = "gcr.io/gvisor-presubmit/nodejs12.4.0",
     lang = "nodejs",
 )
 
 runtime_test(
+    name = "php7.3.6",
     blacklist_file = "blacklist_php7.3.6.csv",
-    image = "gcr.io/gvisor-presubmit/php7.3.6",
     lang = "php",
 )
 
 runtime_test(
+    name = "python3.7.3",
     blacklist_file = "blacklist_python3.7.3.csv",
-    image = "gcr.io/gvisor-presubmit/python3.7.3",
     lang = "python",
 )
 
diff --git a/test/runtimes/build_defs.bzl b/test/runtimes/build_defs.bzl
index 7c11624b4..d458df1fd 100644
--- a/test/runtimes/build_defs.bzl
+++ b/test/runtimes/build_defs.bzl
@@ -2,32 +2,48 @@
 
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
-# runtime_test is a macro that will create targets to run the given test target
-# with different runtime options.
 def runtime_test(
+        name,
         lang,
-        image,
+        image_repo = "gcr.io/gvisor-presubmit",
+        image_name = None,
+        blacklist_file = None,
         shard_count = 50,
-        size = "enormous",
-        blacklist_file = ""):
+        size = "enormous"):
+    """Generates sh_test and blacklist test targets for a given runtime.
+
+    Args:
+      name: The name of the runtime being tested. Typically, the lang + version.
+          This is used in the names of the generated test targets.
+      lang: The language being tested.
+      image_repo: The docker repository containing the proctor image to run.
+          i.e., the prefix to the fully qualified docker image id.
+      image_name: The name of the image in the image_repo.
+          Defaults to the test name.
+      blacklist_file: A test blacklist to pass to the runtime test's runner.
+      shard_count: See Bazel common test attributes.
+      size: See Bazel common test attributes.
+    """
+    if image_name == None:
+        image_name = name
     args = [
         "--lang",
         lang,
         "--image",
-        image,
+        "/".join([image_repo, image_name]),
     ]
     data = [
         ":runner",
     ]
-    if blacklist_file != "":
+    if blacklist_file:
         args += ["--blacklist_file", "test/runtimes/" + blacklist_file]
         data += [blacklist_file]
 
         # Add a test that the blacklist parses correctly.
-        blacklist_test(lang, blacklist_file)
+        blacklist_test(name, blacklist_file)
 
     sh_test(
-        name = lang + "_test",
+        name = name + "_test",
         srcs = ["runner.sh"],
         args = args,
         data = data,
@@ -35,15 +51,14 @@ def runtime_test(
         shard_count = shard_count,
         tags = [
             # Requires docker and runsc to be configured before the test runs.
-            "manual",
             "local",
         ],
     )
 
-def blacklist_test(lang, blacklist_file):
+def blacklist_test(name, blacklist_file):
     """Test that a blacklist parses correctly."""
     go_test(
-        name = lang + "_blacklist_test",
+        name = name + "_blacklist_test",
         embed = [":runner"],
         srcs = ["blacklist_test.go"],
         args = ["--blacklist_file", "test/runtimes/" + blacklist_file],
-- 
cgit v1.2.3


From 3246040447c6d0a08cc12c5721480c06f77f5dfe Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 30 Oct 2019 17:00:29 -0700
Subject: Deep copy dispatcher views.

When VectorisedViews were passed up the stack from packet_dispatchers, we were
passing a sub-slice of the dispatcher's views fields. The dispatchers then
immediately set those views to nil.

This wasn't caught before because every implementer copied the data in these
views before returning.

PiperOrigin-RevId: 277615351
---
 pkg/tcpip/link/fdbased/endpoint_test.go      | 10 +++++++---
 pkg/tcpip/link/fdbased/packet_dispatchers.go |  4 ++--
 pkg/tcpip/stack/nic.go                       |  2 +-
 pkg/tcpip/stack/registration.go              | 16 ++++++++++++++++
 pkg/tcpip/transport/icmp/endpoint.go         |  5 +----
 pkg/tcpip/transport/packet/endpoint.go       |  8 ++------
 pkg/tcpip/transport/raw/endpoint.go          |  6 +-----
 pkg/tcpip/transport/udp/endpoint.go          |  5 +----
 8 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index 59378b96c..e7c05ca4f 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -45,7 +45,7 @@ const (
 type packetInfo struct {
 	raddr      tcpip.LinkAddress
 	proto      tcpip.NetworkProtocolNumber
-	contents   buffer.View
+	contents   buffer.VectorisedView
 	linkHeader buffer.View
 }
 
@@ -94,7 +94,7 @@ func (c *context) cleanup() {
 }
 
 func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
-	c.ch <- packetInfo{remote, protocol, vv.ToView(), linkHeader}
+	c.ch <- packetInfo{remote, protocol, vv, linkHeader}
 }
 
 func TestNoEthernetProperties(t *testing.T) {
@@ -319,13 +319,17 @@ func TestDeliverPacket(t *testing.T) {
 					want := packetInfo{
 						raddr:      raddr,
 						proto:      proto,
-						contents:   b,
+						contents:   buffer.View(b).ToVectorisedView(),
 						linkHeader: buffer.View(hdr),
 					}
 					if !eth {
 						want.proto = header.IPv4ProtocolNumber
 						want.raddr = ""
 					}
+					// want.contents will be a single view,
+					// so make pi do the same for the
+					// DeepEqual check.
+					pi.contents = pi.contents.ToView().ToVectorisedView()
 					if !reflect.DeepEqual(want, pi) {
 						t.Fatalf("Unexpected received packet: %+v, want %+v", pi, want)
 					}
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go
index 12168a1dc..3331b6453 100644
--- a/pkg/tcpip/link/fdbased/packet_dispatchers.go
+++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go
@@ -139,7 +139,7 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	used := d.capViews(n, BufConfig)
-	vv := buffer.NewVectorisedView(n, d.views[:used])
+	vv := buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...))
 	vv.TrimFront(d.e.hdrSize)
 
 	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv, buffer.View(eth))
@@ -293,7 +293,7 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
 		}
 
 		used := d.capViews(k, int(n), BufConfig)
-		vv := buffer.NewVectorisedView(int(n), d.views[k][:used])
+		vv := buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...))
 		vv.TrimFront(d.e.hdrSize)
 		d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv, buffer.View(eth))
 
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index a01a208b8..fe8f83d58 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -762,7 +762,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	}
 	n.mu.RUnlock()
 	for _, ep := range packetEPs {
-		ep.HandlePacket(n.id, local, protocol, vv, linkHeader)
+		ep.HandlePacket(n.id, local, protocol, vv.Clone(nil), linkHeader)
 	}
 
 	if netProto.Number() == header.IPv4ProtocolNumber || netProto.Number() == header.IPv6ProtocolNumber {
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 94015ba54..d7c124e81 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -65,10 +65,14 @@ type TransportEndpoint interface {
 
 	// HandlePacket is called by the stack when new packets arrive to
 	// this transport endpoint.
+	//
+	// HandlePacket takes ownership of vv.
 	HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView)
 
 	// HandleControlPacket is called by the stack when new control (e.g.,
 	// ICMP) packets arrive to this transport endpoint.
+	//
+	// HandleControlPacket takes ownership of vv.
 	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView)
 
 	// Close puts the endpoint in a closed state and frees all resources
@@ -94,6 +98,8 @@ type RawTransportEndpoint interface {
 	// HandlePacket is called by the stack when new packets arrive to
 	// this transport endpoint. The packet contains all data from the link
 	// layer up.
+	//
+	// HandlePacket takes ownership of packet and netHeader.
 	HandlePacket(r *Route, netHeader buffer.View, packet buffer.VectorisedView)
 }
 
@@ -110,6 +116,8 @@ type PacketEndpoint interface {
 	//
 	// linkHeader may have a length of 0, in which case the PacketEndpoint
 	// should construct its own ethernet header for applications.
+	//
+	// HandlePacket takes ownership of packet and linkHeader.
 	HandlePacket(nicid tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, packet buffer.VectorisedView, linkHeader buffer.View)
 }
 
@@ -160,10 +168,14 @@ type TransportDispatcher interface {
 	// DeliverTransportPacket delivers packets to the appropriate
 	// transport protocol endpoint. It also returns the network layer
 	// header for the enpoint to inspect or pass up the stack.
+	//
+	// DeliverTransportPacket takes ownership of vv and netHeader.
 	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView)
 
 	// DeliverTransportControlPacket delivers control packets to the
 	// appropriate transport protocol endpoint.
+	//
+	// DeliverTransportControlPacket takes ownership of vv.
 	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView)
 }
 
@@ -237,6 +249,8 @@ type NetworkEndpoint interface {
 
 	// HandlePacket is called by the link layer when new packets arrive to
 	// this network endpoint.
+	//
+	// HandlePacket takes ownership of vv.
 	HandlePacket(r *Route, vv buffer.VectorisedView)
 
 	// Close is called when the endpoint is reomved from a stack.
@@ -282,6 +296,8 @@ type NetworkDispatcher interface {
 	// DeliverNetworkPacket finds the appropriate network protocol endpoint
 	// and hands the packet over for further processing. linkHeader may have
 	// length 0 when the caller does not have ethernet data.
+	//
+	// DeliverNetworkPacket takes ownership of vv and linkHeader.
 	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View)
 }
 
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 114a69b4e..33405eb7d 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -31,9 +31,6 @@ type icmpPacket struct {
 	senderAddress tcpip.FullAddress
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
-	// views is used as buffer for data when its length is large
-	// enough to store a VectorisedView.
-	views [8]buffer.View `state:"nosave"`
 }
 
 type endpointState int
@@ -767,7 +764,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
 		},
 	}
 
-	pkt.data = vv.Clone(pkt.views[:])
+	pkt.data = vv
 
 	e.rcvList.PushBack(pkt)
 	e.rcvBufSize += pkt.data.Size()
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 73cdaa265..ead83b83d 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -41,10 +41,6 @@ type packet struct {
 	// data holds the actual packet data, including any headers and
 	// payload.
 	data buffer.VectorisedView `state:".(buffer.VectorisedView)"`
-	// views is pre-allocated space to back data. As long as the packet is
-	// made up of fewer than 8 buffer.Views, no extra allocation is
-	// necessary to store packet data.
-	views [8]buffer.View `state:"nosave"`
 	// timestampNS is the unix time at which the packet was received.
 	timestampNS int64
 	// senderAddr is the network address of the sender.
@@ -310,7 +306,7 @@ func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress,
 
 	if ep.cooked {
 		// Cooked packets can simply be queued.
-		packet.data = vv.Clone(packet.views[:])
+		packet.data = vv
 	} else {
 		// Raw packets need their ethernet headers prepended before
 		// queueing.
@@ -328,7 +324,7 @@ func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress,
 		}
 		combinedVV := buffer.View(ethHeader).ToVectorisedView()
 		combinedVV.Append(vv)
-		packet.data = combinedVV.Clone(packet.views[:])
+		packet.data = combinedVV
 	}
 	packet.timestampNS = ep.stack.NowNanoseconds()
 
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 951d317ed..23922a30e 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -42,10 +42,6 @@ type rawPacket struct {
 	// data holds the actual packet data, including any headers and
 	// payload.
 	data buffer.VectorisedView `state:".(buffer.VectorisedView)"`
-	// views is pre-allocated space to back data. As long as the packet is
-	// made up of fewer than 8 buffer.Views, no extra allocation is
-	// necessary to store packet data.
-	views [8]buffer.View `state:"nosave"`
 	// timestampNS is the unix time at which the packet was received.
 	timestampNS int64
 	// senderAddr is the network address of the sender.
@@ -609,7 +605,7 @@ func (e *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv bu
 
 	combinedVV := netHeader.ToVectorisedView()
 	combinedVV.Append(vv)
-	pkt.data = combinedVV.Clone(pkt.views[:])
+	pkt.data = combinedVV
 	pkt.timestampNS = e.stack.NowNanoseconds()
 
 	e.rcvList.PushBack(pkt)
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 68977dc25..03bd5c8fd 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -31,9 +31,6 @@ type udpPacket struct {
 	senderAddress tcpip.FullAddress
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
-	// views is used as buffer for data when its length is large
-	// enough to store a VectorisedView.
-	views [8]buffer.View `state:"nosave"`
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -1202,7 +1199,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
 			Port: hdr.SourcePort(),
 		},
 	}
-	pkt.data = vv.Clone(pkt.views[:])
+	pkt.data = vv
 	e.rcvList.PushBack(pkt)
 	e.rcvBufSize += vv.Size()
 
-- 
cgit v1.2.3


From 7dcfcd53e4f3f0e1384ac42eacf2622a57d1b37c Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Thu, 31 Oct 2019 11:25:19 -0700
Subject: Fix overloaded use of $RUNTIME.

Turns out we use $RUNTIME in scripts/common.sh to give a name to the runsc
runtime used by the tests.

PiperOrigin-RevId: 277764383
---
 scripts/runtime_tests.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/runtime_tests.sh b/scripts/runtime_tests.sh
index fb82b2491..9ee991e42 100755
--- a/scripts/runtime_tests.sh
+++ b/scripts/runtime_tests.sh
@@ -16,10 +16,10 @@
 
 source $(dirname $0)/common.sh
 
-if [ ! -v RUNTIME ]; then
-  echo 'Must set $RUNTIME' >&2
+if [ ! -v RUNTIME_TEST_NAME ]; then
+  echo 'Must set $RUNTIME_TEST_NAME' >&2
   exit 1
 fi
 
 install_runsc_for_test runtimes
-test_runsc "//test/runtimes:${RUNTIME}_test"
+test_runsc "//test/runtimes:${RUNTIME_TEST_NAME}_test"
-- 
cgit v1.2.3


From f7dbddaf77a6059c2f5a441d068a39219fe593bd Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Thu, 31 Oct 2019 12:27:46 -0700
Subject: platform/kvm: calll sigtimedwait with zero timeout

sigtimedwait is used to check pending signals and
it should not block.

PiperOrigin-RevId: 277777269
---
 pkg/sentry/platform/kvm/bluepill_unsafe.go | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 3734bfb7a..ca011ef78 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -80,13 +80,17 @@ func bluepillHandler(context unsafe.Pointer) {
 			// interrupted KVM. Since we're in a signal handler
 			// currently, all signals are masked and the signal
 			// must have been delivered directly to this thread.
+			timeout := syscall.Timespec{}
 			sig, _, errno := syscall.RawSyscall6(
 				syscall.SYS_RT_SIGTIMEDWAIT,
 				uintptr(unsafe.Pointer(&bounceSignalMask)),
-				0, // siginfo.
-				0, // timeout.
-				8, // sigset size.
+				0,                                 // siginfo.
+				uintptr(unsafe.Pointer(&timeout)), // timeout.
+				8,                                 // sigset size.
 				0, 0)
+			if errno == syscall.EAGAIN {
+				continue
+			}
 			if errno != 0 {
 				throw("error waiting for pending signal")
 			}
-- 
cgit v1.2.3


From fe2e0764ac600fe19a3d87069a58d7463a5223ab Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 31 Oct 2019 12:51:50 -0700
Subject: Add LICENSE and AUTHORS to the go branch.

Also, construct the README directly so that edits can be made.

PiperOrigin-RevId: 277782095
---
 tools/go_branch.sh | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/tools/go_branch.sh b/tools/go_branch.sh
index ddb9b6e7b..0ac16e266 100755
--- a/tools/go_branch.sh
+++ b/tools/go_branch.sh
@@ -17,9 +17,9 @@
 set -eo pipefail
 
 # Discovery the package name from the go.mod file.
-declare -r gomod="$(pwd)/go.mod"
-declare -r module=$(cat "${gomod}" | grep -E "^module" | cut -d' ' -f2)
-declare -r gosum="$(pwd)/go.sum"
+declare -r module=$(cat go.mod | grep -E "^module" | cut -d' ' -f2)
+declare -r origpwd=$(pwd)
+declare -r othersrc=("go.mod" "go.sum" "AUTHORS" "LICENSE")
 
 # Check that gopath has been built.
 declare -r gopath_dir="$(pwd)/bazel-bin/gopath/src/${module}"
@@ -65,10 +65,22 @@ git checkout -b go "${go_branch}"
 git merge --no-commit --strategy ours ${head} || \
   git merge --allow-unrelated-histories --no-commit --strategy ours ${head}
 
-# Sync the entire gopath_dir and go.mod.
-rsync --recursive --verbose --delete --exclude .git --exclude README.md -L "${gopath_dir}/" .
-cp "${gomod}" .
-cp "${gosum}" .
+# Sync the entire gopath_dir.
+rsync --recursive --verbose --delete --exclude .git -L "${gopath_dir}/" .
+
+# Add additional files.
+for file in "${othersrc[@]}"; do
+  cp "${origpwd}"/"${file}" .
+done
+
+# Construct a new README.md.
+cat > README.md <<EOF
+# gVisor
+
+This branch is a synthetic branch, containing only Go sources, that is
+compatible with standard Go tools. See the `master` branch for authoritative
+sources and tests.
+EOF
 
 # There are a few solitary files that can get left behind due to the way bazel
 # constructs the gopath target. Note that we don't find all Go files here
-- 
cgit v1.2.3


From 36837c4ad3f3c840791379db81d02b60d918c0f5 Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Thu, 31 Oct 2019 17:37:54 -0700
Subject: Add systemd-cgroup flag option.

Adds a systemd-cgroup flag option that prints an error letting the user know
that systemd cgroups are not supported and points them to the relevant issue.

Issue #193

PiperOrigin-RevId: 277837162
---
 runsc/main.go | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/runsc/main.go b/runsc/main.go
index ae906c661..711f60d4f 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -46,6 +46,8 @@ var (
 	logFormat   = flag.String("log-format", "text", "log format: text (default), json, or json-k8s.")
 	debug       = flag.Bool("debug", false, "enable debug logging.")
 	showVersion = flag.Bool("version", false, "show version and exit.")
+	// TODO(gvisor.dev/issue/193): support systemd cgroups
+	systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.")
 
 	// These flags are unique to runsc, and are used to configure parts of the
 	// system that are not covered by the runtime spec.
@@ -136,6 +138,12 @@ func main() {
 		os.Exit(0)
 	}
 
+	// TODO(gvisor.dev/issue/193): support systemd cgroups
+	if *systemdCgroup {
+		fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193")
+		os.Exit(1)
+	}
+
 	var errorLogger io.Writer
 	if *logFD > -1 {
 		errorLogger = os.NewFile(uintptr(*logFD), "error log file")
-- 
cgit v1.2.3


From a99d3479a84ca86843e500dbdf58db0af389b536 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 31 Oct 2019 18:02:04 -0700
Subject: Add context to state.

PiperOrigin-RevId: 277840416
---
 pkg/sentry/context/context.go      | 63 +++++++++++++++++++++++---------------
 pkg/sentry/kernel/context.go       | 32 +++++++++++++++++++
 pkg/sentry/kernel/kernel.go        | 13 ++++----
 pkg/sentry/pgalloc/save_restore.go | 13 ++++----
 pkg/state/decode.go                |  4 +++
 pkg/state/encode.go                |  4 +++
 pkg/state/map.go                   | 11 +++++++
 pkg/state/state.go                 |  7 +++--
 pkg/state/state_test.go            | 11 ++++---
 9 files changed, 115 insertions(+), 43 deletions(-)

diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go
index dfd62cbdb..23e009ef3 100644
--- a/pkg/sentry/context/context.go
+++ b/pkg/sentry/context/context.go
@@ -12,10 +12,20 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package context defines the sentry's Context type.
+// Package context defines an internal context type.
+//
+// The given Context conforms to the standard Go context, but mandates
+// additional methods that are specific to the kernel internals. Note however,
+// that the Context described by this package carries additional constraints
+// regarding concurrent access and retaining beyond the scope of a call.
+//
+// See the Context type for complete details.
 package context
 
 import (
+	"context"
+	"time"
+
 	"gvisor.dev/gvisor/pkg/amutex"
 	"gvisor.dev/gvisor/pkg/log"
 )
@@ -59,6 +69,7 @@ func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) {
 type Context interface {
 	log.Logger
 	amutex.Sleeper
+	context.Context
 
 	// UninterruptibleSleepStart indicates the beginning of an uninterruptible
 	// sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). If deactivate
@@ -72,19 +83,36 @@ type Context interface {
 	// AddressSpace is activated. Normally activate is the same value as the
 	// deactivate parameter passed to UninterruptibleSleepStart.
 	UninterruptibleSleepFinish(activate bool)
+}
+
+// NoopSleeper is a noop implementation of amutex.Sleeper and UninterruptibleSleep
+// methods for anonymous embedding in other types that do not implement sleeps.
+type NoopSleeper struct {
+	amutex.NoopSleeper
+}
+
+// UninterruptibleSleepStart does nothing.
+func (NoopSleeper) UninterruptibleSleepStart(bool) {}
+
+// UninterruptibleSleepFinish does nothing.
+func (NoopSleeper) UninterruptibleSleepFinish(bool) {}
+
+// Deadline returns zero values, meaning no deadline.
+func (NoopSleeper) Deadline() (time.Time, bool) {
+	return time.Time{}, false
+}
+
+// Done returns nil.
+func (NoopSleeper) Done() <-chan struct{} {
+	return nil
+}
 
-	// Value returns the value associated with this Context for key, or nil if
-	// no value is associated with key. Successive calls to Value with the same
-	// key returns the same result.
-	//
-	// A key identifies a specific value in a Context. Functions that wish to
-	// retrieve values from Context typically allocate a key in a global
-	// variable then use that key as the argument to Context.Value. A key can
-	// be any type that supports equality; packages should define keys as an
-	// unexported type to avoid collisions.
-	Value(key interface{}) interface{}
+// Err returns nil.
+func (NoopSleeper) Err() error {
+	return nil
 }
 
+// logContext implements basic logging.
 type logContext struct {
 	log.Logger
 	NoopSleeper
@@ -95,19 +123,6 @@ func (logContext) Value(key interface{}) interface{} {
 	return nil
 }
 
-// NoopSleeper is a noop implementation of amutex.Sleeper and
-// Context.UninterruptibleSleep* methods for anonymous embedding in other types
-// that do not want to notify kernel.Task about sleeps.
-type NoopSleeper struct {
-	amutex.NoopSleeper
-}
-
-// UninterruptibleSleepStart does nothing.
-func (NoopSleeper) UninterruptibleSleepStart(bool) {}
-
-// UninterruptibleSleepFinish does nothing.
-func (NoopSleeper) UninterruptibleSleepFinish(bool) {}
-
 // bgContext is the context returned by context.Background.
 var bgContext = &logContext{Logger: log.Log()}
 
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
index e3f5b0d83..3c9dceaba 100644
--- a/pkg/sentry/kernel/context.go
+++ b/pkg/sentry/kernel/context.go
@@ -15,6 +15,8 @@
 package kernel
 
 import (
+	"time"
+
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 )
@@ -97,6 +99,21 @@ func TaskFromContext(ctx context.Context) *Task {
 	return nil
 }
 
+// Deadline implements context.Context.Deadline.
+func (*Task) Deadline() (time.Time, bool) {
+	return time.Time{}, false
+}
+
+// Done implements context.Context.Done.
+func (*Task) Done() <-chan struct{} {
+	return nil
+}
+
+// Err implements context.Context.Err.
+func (*Task) Err() error {
+	return nil
+}
+
 // AsyncContext returns a context.Context that may be used by goroutines that
 // do work on behalf of t and therefore share its contextual values, but are
 // not t's task goroutine (e.g. asynchronous I/O).
@@ -129,6 +146,21 @@ func (ctx taskAsyncContext) IsLogging(level log.Level) bool {
 	return ctx.t.IsLogging(level)
 }
 
+// Deadline implements context.Context.Deadline.
+func (ctx taskAsyncContext) Deadline() (time.Time, bool) {
+	return ctx.t.Deadline()
+}
+
+// Done implements context.Context.Done.
+func (ctx taskAsyncContext) Done() <-chan struct{} {
+	return ctx.t.Done()
+}
+
+// Err implements context.Context.Err.
+func (ctx taskAsyncContext) Err() error {
+	return ctx.t.Err()
+}
+
 // Value implements context.Context.Value.
 func (ctx taskAsyncContext) Value(key interface{}) interface{} {
 	return ctx.t.Value(key)
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index e64d648e2..28ba950bd 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -391,7 +391,7 @@ func (k *Kernel) SaveTo(w io.Writer) error {
 	//
 	// N.B. This will also be saved along with the full kernel save below.
 	cpuidStart := time.Now()
-	if err := state.Save(w, k.FeatureSet(), nil); err != nil {
+	if err := state.Save(k.SupervisorContext(), w, k.FeatureSet(), nil); err != nil {
 		return err
 	}
 	log.Infof("CPUID save took [%s].", time.Since(cpuidStart))
@@ -399,7 +399,7 @@ func (k *Kernel) SaveTo(w io.Writer) error {
 	// Save the kernel state.
 	kernelStart := time.Now()
 	var stats state.Stats
-	if err := state.Save(w, k, &stats); err != nil {
+	if err := state.Save(k.SupervisorContext(), w, k, &stats); err != nil {
 		return err
 	}
 	log.Infof("Kernel save stats: %s", &stats)
@@ -407,7 +407,7 @@ func (k *Kernel) SaveTo(w io.Writer) error {
 
 	// Save the memory file's state.
 	memoryStart := time.Now()
-	if err := k.mf.SaveTo(w); err != nil {
+	if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil {
 		return err
 	}
 	log.Infof("Memory save took [%s].", time.Since(memoryStart))
@@ -542,7 +542,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
 	// don't need to explicitly install it in the Kernel.
 	cpuidStart := time.Now()
 	var features cpuid.FeatureSet
-	if err := state.Load(r, &features, nil); err != nil {
+	if err := state.Load(k.SupervisorContext(), r, &features, nil); err != nil {
 		return err
 	}
 	log.Infof("CPUID load took [%s].", time.Since(cpuidStart))
@@ -558,7 +558,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
 	// Load the kernel state.
 	kernelStart := time.Now()
 	var stats state.Stats
-	if err := state.Load(r, k, &stats); err != nil {
+	if err := state.Load(k.SupervisorContext(), r, k, &stats); err != nil {
 		return err
 	}
 	log.Infof("Kernel load stats: %s", &stats)
@@ -566,7 +566,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
 
 	// Load the memory file's state.
 	memoryStart := time.Now()
-	if err := k.mf.LoadFrom(r); err != nil {
+	if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil {
 		return err
 	}
 	log.Infof("Memory load took [%s].", time.Since(memoryStart))
@@ -1322,6 +1322,7 @@ func (k *Kernel) ListSockets() []*SocketEntry {
 	return socks
 }
 
+// supervisorContext is a privileged context.
 type supervisorContext struct {
 	context.NoopSleeper
 	log.Logger
diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go
index 1effc7735..aafce1d00 100644
--- a/pkg/sentry/pgalloc/save_restore.go
+++ b/pkg/sentry/pgalloc/save_restore.go
@@ -16,6 +16,7 @@ package pgalloc
 
 import (
 	"bytes"
+	"context"
 	"fmt"
 	"io"
 	"runtime"
@@ -29,7 +30,7 @@ import (
 )
 
 // SaveTo writes f's state to the given stream.
-func (f *MemoryFile) SaveTo(w io.Writer) error {
+func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error {
 	// Wait for reclaim.
 	f.mu.Lock()
 	defer f.mu.Unlock()
@@ -78,10 +79,10 @@ func (f *MemoryFile) SaveTo(w io.Writer) error {
 	}
 
 	// Save metadata.
-	if err := state.Save(w, &f.fileSize, nil); err != nil {
+	if err := state.Save(ctx, w, &f.fileSize, nil); err != nil {
 		return err
 	}
-	if err := state.Save(w, &f.usage, nil); err != nil {
+	if err := state.Save(ctx, w, &f.usage, nil); err != nil {
 		return err
 	}
 
@@ -114,9 +115,9 @@ func (f *MemoryFile) SaveTo(w io.Writer) error {
 }
 
 // LoadFrom loads MemoryFile state from the given stream.
-func (f *MemoryFile) LoadFrom(r io.Reader) error {
+func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader) error {
 	// Load metadata.
-	if err := state.Load(r, &f.fileSize, nil); err != nil {
+	if err := state.Load(ctx, r, &f.fileSize, nil); err != nil {
 		return err
 	}
 	if err := f.file.Truncate(f.fileSize); err != nil {
@@ -124,7 +125,7 @@ func (f *MemoryFile) LoadFrom(r io.Reader) error {
 	}
 	newMappings := make([]uintptr, f.fileSize>>chunkShift)
 	f.mappings.Store(newMappings)
-	if err := state.Load(r, &f.usage, nil); err != nil {
+	if err := state.Load(ctx, r, &f.usage, nil); err != nil {
 		return err
 	}
 
diff --git a/pkg/state/decode.go b/pkg/state/decode.go
index 47e6b878a..590c241a3 100644
--- a/pkg/state/decode.go
+++ b/pkg/state/decode.go
@@ -16,6 +16,7 @@ package state
 
 import (
 	"bytes"
+	"context"
 	"encoding/binary"
 	"errors"
 	"fmt"
@@ -133,6 +134,9 @@ func (os *objectState) findCycle() []*objectState {
 // to ensure that all callbacks are executed, otherwise the callback graph was
 // not acyclic.
 type decodeState struct {
+	// ctx is the decode context.
+	ctx context.Context
+
 	// objectByID is the set of objects in progress.
 	objectsByID map[uint64]*objectState
 
diff --git a/pkg/state/encode.go b/pkg/state/encode.go
index 5d9409a45..c5118d3a9 100644
--- a/pkg/state/encode.go
+++ b/pkg/state/encode.go
@@ -16,6 +16,7 @@ package state
 
 import (
 	"container/list"
+	"context"
 	"encoding/binary"
 	"fmt"
 	"io"
@@ -38,6 +39,9 @@ type queuedObject struct {
 // The encoding process is a breadth-first traversal of the object graph. The
 // inherent races and dependencies are much simpler than the decode case.
 type encodeState struct {
+	// ctx is the encode context.
+	ctx context.Context
+
 	// lastID is the last object ID.
 	//
 	// See idsByObject for context. Because of the special zero encoding
diff --git a/pkg/state/map.go b/pkg/state/map.go
index 7e6fefed4..4f3ebb0da 100644
--- a/pkg/state/map.go
+++ b/pkg/state/map.go
@@ -15,6 +15,7 @@
 package state
 
 import (
+	"context"
 	"fmt"
 	"reflect"
 	"sort"
@@ -219,3 +220,13 @@ func (m Map) AfterLoad(fn func()) {
 	// data dependencies have been cleared.
 	m.os.callbacks = append(m.os.callbacks, fn)
 }
+
+// Context returns the current context object.
+func (m Map) Context() context.Context {
+	if m.es != nil {
+		return m.es.ctx
+	} else if m.ds != nil {
+		return m.ds.ctx
+	}
+	return context.Background() // No context.
+}
diff --git a/pkg/state/state.go b/pkg/state/state.go
index d408ff84a..dbe507ab4 100644
--- a/pkg/state/state.go
+++ b/pkg/state/state.go
@@ -50,6 +50,7 @@
 package state
 
 import (
+	"context"
 	"fmt"
 	"io"
 	"reflect"
@@ -86,9 +87,10 @@ func UnwrapErrState(err error) error {
 }
 
 // Save saves the given object state.
-func Save(w io.Writer, rootPtr interface{}, stats *Stats) error {
+func Save(ctx context.Context, w io.Writer, rootPtr interface{}, stats *Stats) error {
 	// Create the encoding state.
 	es := &encodeState{
+		ctx:         ctx,
 		idsByObject: make(map[uintptr]uint64),
 		w:           w,
 		stats:       stats,
@@ -101,9 +103,10 @@ func Save(w io.Writer, rootPtr interface{}, stats *Stats) error {
 }
 
 // Load loads a checkpoint.
-func Load(r io.Reader, rootPtr interface{}, stats *Stats) error {
+func Load(ctx context.Context, r io.Reader, rootPtr interface{}, stats *Stats) error {
 	// Create the decoding state.
 	ds := &decodeState{
+		ctx:         ctx,
 		objectsByID: make(map[uint64]*objectState),
 		deferred:    make(map[uint64]*pb.Object),
 		r:           r,
diff --git a/pkg/state/state_test.go b/pkg/state/state_test.go
index 7c24bbcda..d7221e9e8 100644
--- a/pkg/state/state_test.go
+++ b/pkg/state/state_test.go
@@ -16,6 +16,7 @@ package state
 
 import (
 	"bytes"
+	"context"
 	"io/ioutil"
 	"math"
 	"reflect"
@@ -46,7 +47,7 @@ func runTest(t *testing.T, tests []TestCase) {
 			saveBuffer := &bytes.Buffer{}
 			saveObjectPtr := reflect.New(reflect.TypeOf(root))
 			saveObjectPtr.Elem().Set(reflect.ValueOf(root))
-			if err := Save(saveBuffer, saveObjectPtr.Interface(), nil); err != nil && !test.Fail {
+			if err := Save(context.Background(), saveBuffer, saveObjectPtr.Interface(), nil); err != nil && !test.Fail {
 				t.Errorf("    FAIL: Save failed unexpectedly: %v", err)
 				continue
 			} else if err != nil {
@@ -56,7 +57,7 @@ func runTest(t *testing.T, tests []TestCase) {
 
 			// Load a new copy of the object.
 			loadObjectPtr := reflect.New(reflect.TypeOf(root))
-			if err := Load(bytes.NewReader(saveBuffer.Bytes()), loadObjectPtr.Interface(), nil); err != nil && !test.Fail {
+			if err := Load(context.Background(), bytes.NewReader(saveBuffer.Bytes()), loadObjectPtr.Interface(), nil); err != nil && !test.Fail {
 				t.Errorf("    FAIL: Load failed unexpectedly: %v", err)
 				continue
 			} else if err != nil {
@@ -624,7 +625,7 @@ func BenchmarkEncoding(b *testing.B) {
 	bs := buildObject(b.N)
 	var stats Stats
 	b.StartTimer()
-	if err := Save(ioutil.Discard, bs, &stats); err != nil {
+	if err := Save(context.Background(), ioutil.Discard, bs, &stats); err != nil {
 		b.Errorf("save failed: %v", err)
 	}
 	b.StopTimer()
@@ -638,12 +639,12 @@ func BenchmarkDecoding(b *testing.B) {
 	bs := buildObject(b.N)
 	var newBS benchStruct
 	buf := &bytes.Buffer{}
-	if err := Save(buf, bs, nil); err != nil {
+	if err := Save(context.Background(), buf, bs, nil); err != nil {
 		b.Errorf("save failed: %v", err)
 	}
 	var stats Stats
 	b.StartTimer()
-	if err := Load(buf, &newBS, &stats); err != nil {
+	if err := Load(context.Background(), buf, &newBS, &stats); err != nil {
 		b.Errorf("load failed: %v", err)
 	}
 	b.StopTimer()
-- 
cgit v1.2.3


From af6af2c34131c4ec5e3195be99c1deb6a2669c06 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 1 Nov 2019 11:21:06 -0700
Subject: tests: don't use ASSERT_THAT after fork

PiperOrigin-RevId: 277965624
---
 test/syscalls/linux/semaphore.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc
index 40c57f543..e9b131ca9 100644
--- a/test/syscalls/linux/semaphore.cc
+++ b/test/syscalls/linux/semaphore.cc
@@ -447,9 +447,8 @@ TEST(SemaphoreTest, SemCtlGetPidFork) {
 
   const pid_t child_pid = fork();
   if (child_pid == 0) {
-    ASSERT_THAT(semctl(sem.get(), 0, SETVAL, 1), SyscallSucceeds());
-    ASSERT_THAT(semctl(sem.get(), 0, GETPID),
-                SyscallSucceedsWithValue(getpid()));
+    TEST_PCHECK(semctl(sem.get(), 0, SETVAL, 1) == 0);
+    TEST_PCHECK(semctl(sem.get(), 0, GETPID) == getpid());
 
     _exit(0);
   }
-- 
cgit v1.2.3


From 5694bd080e0e95ba18cbf77038f450fe33b9f8df Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 1 Nov 2019 11:43:33 -0700
Subject: Don't log "p9.channel.service: flipcall connection shutdown".

This gets quite spammy, especially in tests.

PiperOrigin-RevId: 277970468
---
 pkg/flipcall/ctrl_futex.go  |  2 +-
 pkg/flipcall/flipcall.go    | 10 ++++++----
 pkg/flipcall/futex_linux.go |  6 +++---
 pkg/p9/server.go            |  6 +++++-
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/pkg/flipcall/ctrl_futex.go b/pkg/flipcall/ctrl_futex.go
index 8390915a2..e7c3a3a0b 100644
--- a/pkg/flipcall/ctrl_futex.go
+++ b/pkg/flipcall/ctrl_futex.go
@@ -113,7 +113,7 @@ func (ep *Endpoint) enterFutexWait() error {
 		return nil
 	case epsBlocked | epsShutdown:
 		atomic.AddInt32(&ep.ctrl.state, -epsBlocked)
-		return shutdownError{}
+		return ShutdownError{}
 	default:
 		// Most likely due to ep.enterFutexWait() being called concurrently
 		// from multiple goroutines.
diff --git a/pkg/flipcall/flipcall.go b/pkg/flipcall/flipcall.go
index 386cee42c..3cdb576e1 100644
--- a/pkg/flipcall/flipcall.go
+++ b/pkg/flipcall/flipcall.go
@@ -136,8 +136,8 @@ func (ep *Endpoint) unmapPacket() {
 
 // Shutdown causes concurrent and future calls to ep.Connect(), ep.SendRecv(),
 // ep.RecvFirst(), and ep.SendLast(), as well as the same calls in the peer
-// Endpoint, to unblock and return errors. It does not wait for concurrent
-// calls to return. Successive calls to Shutdown have no effect.
+// Endpoint, to unblock and return ShutdownErrors. It does not wait for
+// concurrent calls to return. Successive calls to Shutdown have no effect.
 //
 // Shutdown is the only Endpoint method that may be called concurrently with
 // other methods on the same Endpoint.
@@ -154,10 +154,12 @@ func (ep *Endpoint) isShutdownLocally() bool {
 	return atomic.LoadUint32(&ep.shutdown) != 0
 }
 
-type shutdownError struct{}
+// ShutdownError is returned by most Endpoint methods after Endpoint.Shutdown()
+// has been called.
+type ShutdownError struct{}
 
 // Error implements error.Error.
-func (shutdownError) Error() string {
+func (ShutdownError) Error() string {
 	return "flipcall connection shutdown"
 }
 
diff --git a/pkg/flipcall/futex_linux.go b/pkg/flipcall/futex_linux.go
index b127a2bbb..168c1ccff 100644
--- a/pkg/flipcall/futex_linux.go
+++ b/pkg/flipcall/futex_linux.go
@@ -61,7 +61,7 @@ func (ep *Endpoint) futexSwitchToPeer() error {
 	if !atomic.CompareAndSwapUint32(ep.connState(), ep.activeState, ep.inactiveState) {
 		switch cs := atomic.LoadUint32(ep.connState()); cs {
 		case csShutdown:
-			return shutdownError{}
+			return ShutdownError{}
 		default:
 			return fmt.Errorf("unexpected connection state before FUTEX_WAKE: %v", cs)
 		}
@@ -81,14 +81,14 @@ func (ep *Endpoint) futexSwitchFromPeer() error {
 			return nil
 		case ep.inactiveState:
 			if ep.isShutdownLocally() {
-				return shutdownError{}
+				return ShutdownError{}
 			}
 			if err := ep.futexWaitConnState(ep.inactiveState); err != nil {
 				return fmt.Errorf("failed to FUTEX_WAIT for peer Endpoint: %v", err)
 			}
 			continue
 		case csShutdown:
-			return shutdownError{}
+			return ShutdownError{}
 		default:
 			return fmt.Errorf("unexpected connection state before FUTEX_WAIT: %v", cs)
 		}
diff --git a/pkg/p9/server.go b/pkg/p9/server.go
index e717e6161..40b8fa023 100644
--- a/pkg/p9/server.go
+++ b/pkg/p9/server.go
@@ -453,7 +453,11 @@ func (cs *connState) initializeChannels() (err error) {
 		go func() { // S/R-SAFE: Server side.
 			defer cs.channelWg.Done()
 			if err := res.service(cs); err != nil {
-				log.Warningf("p9.channel.service: %v", err)
+				// Don't log flipcall.ShutdownErrors, which we expect to be
+				// returned during server shutdown.
+				if _, ok := err.(flipcall.ShutdownError); !ok {
+					log.Warningf("p9.channel.service: %v", err)
+				}
 			}
 		}()
 	}
-- 
cgit v1.2.3


From e70f28664af53b0428405c695c90a91b9bb43f67 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 1 Nov 2019 11:44:07 -0700
Subject: Allow the watchdog to detect when the sandbox is stuck during setup.

The watchdog currently can find stuck tasks, but has no way to tell if the
sandbox is stuck before the application starts executing.

This CL adds a startup timeout and action to the watchdog. If Start() is not
called before the given timeout (if non-zero), then the watchdog will take the
action.

PiperOrigin-RevId: 277970577
---
 pkg/sentry/watchdog/watchdog.go | 152 ++++++++++++++++++++++++++++------------
 runsc/boot/controller.go        |   4 +-
 runsc/boot/loader.go            |   4 +-
 3 files changed, 112 insertions(+), 48 deletions(-)

diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index 145102c0d..ecce6c69f 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -42,8 +42,35 @@ import (
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 )
 
-// DefaultTimeout is a resonable timeout value for most applications.
-const DefaultTimeout = 3 * time.Minute
+// Opts configures the watchdog.
+type Opts struct {
+	// TaskTimeout is the amount of time to allow a task to execute the
+	// same syscall without blocking before it's declared stuck.
+	TaskTimeout time.Duration
+
+	// TaskTimeoutAction indicates what action to take when a stuck tasks
+	// is detected.
+	TaskTimeoutAction Action
+
+	// StartupTimeout is the amount of time to allow between watchdog
+	// creation and calling watchdog.Start.
+	StartupTimeout time.Duration
+
+	// StartupTimeoutAction indicates what action to take when
+	// watchdog.Start is not called within the timeout.
+	StartupTimeoutAction Action
+}
+
+// DefaultOpts is a default set of options for the watchdog.
+var DefaultOpts = Opts{
+	// Task timeout.
+	TaskTimeout:       3 * time.Minute,
+	TaskTimeoutAction: LogWarning,
+
+	// Startup timeout.
+	StartupTimeout:       30 * time.Second,
+	StartupTimeoutAction: LogWarning,
+}
 
 // descheduleThreshold is the amount of time scheduling needs to be off before the entire wait period
 // is discounted from task's last update time. It's set high enough that small scheduling delays won't
@@ -61,6 +88,7 @@ type Action int
 const (
 	// LogWarning logs warning message followed by stack trace.
 	LogWarning Action = iota
+
 	// Panic will do the same logging as LogWarning and panic().
 	Panic
 )
@@ -80,17 +108,13 @@ func (a Action) String() string {
 // Watchdog is the main watchdog class. It controls a goroutine that periodically
 // analyses all tasks and reports if any of them appear to be stuck.
 type Watchdog struct {
+	// Configuration options are embedded.
+	Opts
+
 	// period indicates how often to check all tasks. It's calculated based on
-	// 'taskTimeout'.
+	// opts.TaskTimeout.
 	period time.Duration
 
-	// taskTimeout is the amount of time to allow a task to execute the same syscall
-	// without blocking before it's declared stuck.
-	taskTimeout time.Duration
-
-	// timeoutAction indicates what action to take when a stuck tasks is detected.
-	timeoutAction Action
-
 	// k is where the tasks come from.
 	k *kernel.Kernel
 
@@ -113,8 +137,12 @@ type Watchdog struct {
 	// mu protects the fields below.
 	mu sync.Mutex
 
-	// started is true if the watchdog has been started before.
-	started bool
+	// running is true if the watchdog is running.
+	running bool
+
+	// startCalled is true if Start has ever been called. It remains true
+	// even if Stop is called.
+	startCalled bool
 }
 
 type offender struct {
@@ -122,58 +150,81 @@ type offender struct {
 }
 
 // New creates a new watchdog.
-func New(k *kernel.Kernel, taskTimeout time.Duration, a Action) *Watchdog {
-	// 4 is arbitrary, just don't want to prolong 'taskTimeout' too much.
-	period := taskTimeout / 4
-	return &Watchdog{
-		k:             k,
-		period:        period,
-		taskTimeout:   taskTimeout,
-		timeoutAction: a,
-		offenders:     make(map[*kernel.Task]*offender),
-		stop:          make(chan struct{}),
-		done:          make(chan struct{}),
+func New(k *kernel.Kernel, opts Opts) *Watchdog {
+	// 4 is arbitrary, just don't want to prolong 'TaskTimeout' too much.
+	period := opts.TaskTimeout / 4
+	w := &Watchdog{
+		Opts:      opts,
+		k:         k,
+		period:    period,
+		offenders: make(map[*kernel.Task]*offender),
+		stop:      make(chan struct{}),
+		done:      make(chan struct{}),
+	}
+
+	// Handle StartupTimeout if it exists.
+	if w.StartupTimeout > 0 {
+		log.Infof("Watchdog waiting %v for startup", w.StartupTimeout)
+		go w.waitForStart() // S/R-SAFE: watchdog is stopped buring save and restarted after restore.
 	}
+
+	return w
 }
 
 // Start starts the watchdog.
 func (w *Watchdog) Start() {
-	if w.taskTimeout == 0 {
-		log.Infof("Watchdog disabled")
-		return
-	}
-
 	w.mu.Lock()
 	defer w.mu.Unlock()
-	if w.started {
+	w.startCalled = true
+
+	if w.running {
 		return
 	}
 
+	if w.TaskTimeout == 0 {
+		log.Infof("Watchdog task timeout disabled")
+		return
+	}
 	w.lastRun = w.k.MonotonicClock().Now()
 
-	log.Infof("Starting watchdog, period: %v, timeout: %v, action: %v", w.period, w.taskTimeout, w.timeoutAction)
+	log.Infof("Starting watchdog, period: %v, timeout: %v, action: %v", w.period, w.TaskTimeout, w.TaskTimeoutAction)
 	go w.loop() // S/R-SAFE: watchdog is stopped during save and restarted after restore.
-	w.started = true
+	w.running = true
 }
 
 // Stop requests the watchdog to stop and wait for it.
 func (w *Watchdog) Stop() {
-	if w.taskTimeout == 0 {
+	if w.TaskTimeout == 0 {
 		return
 	}
 
 	w.mu.Lock()
 	defer w.mu.Unlock()
-	if !w.started {
+	if !w.running {
 		return
 	}
 	log.Infof("Stopping watchdog")
 	w.stop <- struct{}{}
 	<-w.done
-	w.started = false
+	w.running = false
 	log.Infof("Watchdog stopped")
 }
 
+// waitForStart waits for Start to be called and takes action if it does not
+// happen within the startup timeout.
+func (w *Watchdog) waitForStart() {
+	<-time.After(w.StartupTimeout)
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	if w.startCalled {
+		// We are fine.
+		return
+	}
+	var buf bytes.Buffer
+	buf.WriteString("Watchdog.Start() not called within %s:\n")
+	w.doAction(w.StartupTimeoutAction, false, &buf)
+}
+
 // loop is the main watchdog routine. It only returns when 'Stop()' is called.
 func (w *Watchdog) loop() {
 	// Loop until someone stops it.
@@ -202,7 +253,7 @@ func (w *Watchdog) runTurn() {
 
 	select {
 	case <-done:
-	case <-time.After(w.taskTimeout):
+	case <-time.After(w.TaskTimeout):
 		// Report if the watchdog is not making progress.
 		// No one is wathching the watchdog watcher though.
 		w.reportStuckWatchdog()
@@ -231,7 +282,7 @@ func (w *Watchdog) runTurn() {
 		if tsched.State == kernel.TaskGoroutineRunningSys {
 			lastUpdateTime := ktime.FromNanoseconds(int64(tsched.Timestamp * uint64(linux.ClockTick)))
 			elapsed := now.Sub(lastUpdateTime) - discount
-			if elapsed > w.taskTimeout {
+			if elapsed > w.TaskTimeout {
 				tc, ok := w.offenders[t]
 				if !ok {
 					// New stuck task detected.
@@ -261,28 +312,34 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo
 		tid := w.k.TaskSet().Root.IDOfTask(t)
 		buf.WriteString(fmt.Sprintf("\tTask tid: %v (%#x), entered RunSys state %v ago.\n", tid, uint64(tid), now.Sub(o.lastUpdateTime)))
 	}
+
 	buf.WriteString("Search for '(*Task).run(0x..., 0x<tid>)' in the stack dump to find the offending goroutine")
-	w.onStuckTask(newTaskFound, &buf)
+
+	// Dump stack only if a new task is detected or if it sometime has
+	// passed since the last time a stack dump was generated.
+	skipStack := newTaskFound || time.Since(w.lastStackDump) >= stackDumpSameTaskPeriod
+	w.doAction(w.TaskTimeoutAction, skipStack, &buf)
 }
 
 func (w *Watchdog) reportStuckWatchdog() {
 	var buf bytes.Buffer
 	buf.WriteString("Watchdog goroutine is stuck:\n")
-	w.onStuckTask(true, &buf)
+	w.doAction(w.TaskTimeoutAction, false, &buf)
 }
 
-func (w *Watchdog) onStuckTask(newTaskFound bool, msg *bytes.Buffer) {
-	switch w.timeoutAction {
+// doAction will take the given action. If the action is LogWarnind and
+// skipStack is true, then the stack printing will be skipped.
+func (w *Watchdog) doAction(action Action, skipStack bool, msg *bytes.Buffer) {
+	switch action {
 	case LogWarning:
-		// Dump stack only if a new task is detected or if it sometime has passed since
-		// the last time a stack dump was generated.
-		if !newTaskFound && time.Since(w.lastStackDump) < stackDumpSameTaskPeriod {
+		if skipStack {
 			msg.WriteString("\n...[stack dump skipped]...")
 			log.Warningf(msg.String())
-		} else {
-			log.TracebackAll(msg.String())
-			w.lastStackDump = time.Now()
+			return
+
 		}
+		log.TracebackAll(msg.String())
+		w.lastStackDump = time.Now()
 
 	case Panic:
 		// Panic will skip over running tasks, which is likely the culprit here. So manually
@@ -301,5 +358,8 @@ func (w *Watchdog) onStuckTask(newTaskFound bool, msg *bytes.Buffer) {
 		case <-time.After(1 * time.Second):
 		}
 		panic(fmt.Sprintf("Stack for running G's are skipped while panicking.\n%s", msg.String()))
+	default:
+		panic(fmt.Sprintf("Unknown watchdog action %v", action))
+
 	}
 }
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 928285683..f62be4c59 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -380,7 +380,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	}
 
 	// Since we have a new kernel we also must make a new watchdog.
-	dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
+	dogOpts := watchdog.DefaultOpts
+	dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction
+	dog := watchdog.New(k, dogOpts)
 
 	// Change the loader fields to reflect the changes made when restoring.
 	cm.l.k = k
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 86df384f8..4d1bd2d08 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -300,7 +300,9 @@ func New(args Args) (*Loader, error) {
 	}
 
 	// Create a watchdog.
-	dog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
+	dogOpts := watchdog.DefaultOpts
+	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
+	dog := watchdog.New(k, dogOpts)
 
 	procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
 	if err != nil {
-- 
cgit v1.2.3


From 2a709a1b7b150cae6121cb97259db5cbeb57b330 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 1 Nov 2019 11:51:50 -0700
Subject: Add "manual" tag back to runtime tests.

PiperOrigin-RevId: 277971910
---
 test/runtimes/build_defs.bzl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/runtimes/build_defs.bzl b/test/runtimes/build_defs.bzl
index d458df1fd..6f84ca852 100644
--- a/test/runtimes/build_defs.bzl
+++ b/test/runtimes/build_defs.bzl
@@ -52,6 +52,8 @@ def runtime_test(
         tags = [
             # Requires docker and runsc to be configured before the test runs.
             "local",
+            # Don't include test target in wildcard target patterns.
+            "manual",
         ],
     )
 
-- 
cgit v1.2.3


From 515fee5b6d4f3270c951f72283aef79a28d463dd Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Fri, 1 Nov 2019 12:42:04 -0700
Subject: Add SO_PASSCRED support to netlink sockets

Since we only supporting sending messages from the kernel, the peer is always
the kernel, simplifying handling.

There are currently no known users of SO_PASSCRED that would actually receive
messages from gVisor, but adding full support is barely more work than stubbing
out fake support.

Updates #1117
Fixes #1119

PiperOrigin-RevId: 277981465
---
 pkg/sentry/socket/netlink/BUILD             |   1 +
 pkg/sentry/socket/netlink/socket.go         |  76 ++++++++++++++++++-
 test/syscalls/linux/socket_netlink_route.cc | 110 +++++++++++++++++++++++++++-
 3 files changed, 183 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index f95803f91..79589e3c8 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -20,6 +20,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/safemem",
         "//pkg/sentry/socket",
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index b2732ca29..05dac4f0a 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -27,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
@@ -61,7 +62,7 @@ var netlinkSocketDevice = device.NewAnonDevice()
 // This implementation only supports userspace sending and receiving messages
 // to/from the kernel.
 //
-// Socket implements socket.Socket.
+// Socket implements socket.Socket and transport.Credentialer.
 //
 // +stateify savable
 type Socket struct {
@@ -104,9 +105,13 @@ type Socket struct {
 	// sendBufferSize is the send buffer "size". We don't actually have a
 	// fixed buffer but only consume this many bytes.
 	sendBufferSize uint32
+
+	// passcred indicates if this socket wants SCM credentials.
+	passcred bool
 }
 
 var _ socket.Socket = (*Socket)(nil)
+var _ transport.Credentialer = (*Socket)(nil)
 
 // NewSocket creates a new Socket.
 func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socket, *syserr.Error) {
@@ -172,6 +177,22 @@ func (s *Socket) EventUnregister(e *waiter.Entry) {
 	s.ep.EventUnregister(e)
 }
 
+// Passcred implements transport.Credentialer.Passcred.
+func (s *Socket) Passcred() bool {
+	s.mu.Lock()
+	passcred := s.passcred
+	s.mu.Unlock()
+	return passcred
+}
+
+// ConnectedPasscred implements transport.Credentialer.ConnectedPasscred.
+func (s *Socket) ConnectedPasscred() bool {
+	// This socket is connected to the kernel, which doesn't need creds.
+	//
+	// This is arbitrary, as ConnectedPasscred on this type has no callers.
+	return false
+}
+
 // Ioctl implements fs.FileOperations.Ioctl.
 func (*Socket) Ioctl(context.Context, *fs.File, usermem.IO, arch.SyscallArguments) (uintptr, error) {
 	// TODO(b/68878065): no ioctls supported.
@@ -309,9 +330,20 @@ func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.
 			// We don't have limit on receiving size.
 			return int32(math.MaxInt32), nil
 
+		case linux.SO_PASSCRED:
+			if outLen < sizeOfInt32 {
+				return nil, syserr.ErrInvalidArgument
+			}
+			var passcred int32
+			if s.Passcred() {
+				passcred = 1
+			}
+			return passcred, nil
+
 		default:
 			socket.GetSockOptEmitUnimplementedEvent(t, name)
 		}
+
 	case linux.SOL_NETLINK:
 		switch name {
 		case linux.NETLINK_BROADCAST_ERROR,
@@ -348,6 +380,7 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy
 			s.sendBufferSize = size
 			s.mu.Unlock()
 			return nil
+
 		case linux.SO_RCVBUF:
 			if len(opt) < sizeOfInt32 {
 				return syserr.ErrInvalidArgument
@@ -355,6 +388,18 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy
 			// We don't have limit on receiving size. So just accept anything as
 			// valid for compatibility.
 			return nil
+
+		case linux.SO_PASSCRED:
+			if len(opt) < sizeOfInt32 {
+				return syserr.ErrInvalidArgument
+			}
+			passcred := usermem.ByteOrder.Uint32(opt)
+
+			s.mu.Lock()
+			s.passcred = passcred != 0
+			s.mu.Unlock()
+			return nil
+
 		default:
 			socket.SetSockOptEmitUnimplementedEvent(t, name)
 		}
@@ -483,6 +528,26 @@ func (s *Socket) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _
 	})
 }
 
+// kernelSCM implements control.SCMCredentials with credentials that represent
+// the kernel itself rather than a Task.
+//
+// +stateify savable
+type kernelSCM struct{}
+
+// Equals implements transport.CredentialsControlMessage.Equals.
+func (kernelSCM) Equals(oc transport.CredentialsControlMessage) bool {
+	_, ok := oc.(kernelSCM)
+	return ok
+}
+
+// Credentials implements control.SCMCredentials.Credentials.
+func (kernelSCM) Credentials(*kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) {
+	return 0, auth.RootUID, auth.RootGID
+}
+
+// kernelCreds is the concrete version of kernelSCM used in all creds.
+var kernelCreds = &kernelSCM{}
+
 // sendResponse sends the response messages in ms back to userspace.
 func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error {
 	// Linux combines multiple netlink messages into a single datagram.
@@ -491,10 +556,15 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error
 		bufs = append(bufs, m.Finalize())
 	}
 
+	// All messages are from the kernel.
+	cms := transport.ControlMessages{
+		Credentials: kernelCreds,
+	}
+
 	if len(bufs) > 0 {
 		// RecvMsg never receives the address, so we don't need to send
 		// one.
-		_, notify, err := s.connection.Send(bufs, transport.ControlMessages{}, tcpip.FullAddress{})
+		_, notify, err := s.connection.Send(bufs, cms, tcpip.FullAddress{})
 		// If the buffer is full, we simply drop messages, just like
 		// Linux.
 		if err != nil && err != syserr.ErrWouldBlock {
@@ -521,7 +591,7 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error
 		// Add the dump_done_errno payload.
 		m.Put(int64(0))
 
-		_, notify, err := s.connection.Send([][]byte{m.Finalize()}, transport.ControlMessages{}, tcpip.FullAddress{})
+		_, notify, err := s.connection.Send([][]byte{m.Finalize()}, cms, tcpip.FullAddress{})
 		if err != nil && err != syserr.ErrWouldBlock {
 			return err
 		}
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index dd4a11655..be0dadcd6 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -195,7 +195,8 @@ INSTANTIATE_TEST_SUITE_P(
         std::make_tuple(SO_DOMAIN, IsEqual(AF_NETLINK),
                         absl::StrFormat("AF_NETLINK (%d)", AF_NETLINK)),
         std::make_tuple(SO_PROTOCOL, IsEqual(NETLINK_ROUTE),
-                        absl::StrFormat("NETLINK_ROUTE (%d)", NETLINK_ROUTE))));
+                        absl::StrFormat("NETLINK_ROUTE (%d)", NETLINK_ROUTE)),
+        std::make_tuple(SO_PASSCRED, IsEqual(0), "0")));
 
 // Validates the reponses to RTM_GETLINK + NLM_F_DUMP.
 void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
@@ -692,6 +693,113 @@ TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
   } while (type != NLMSG_DONE && type != NLMSG_ERROR);
 }
 
+// No SCM_CREDENTIALS are received without SO_PASSCRED set.
+TEST(NetlinkRouteTest, NoPasscredNoCreds) {
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+
+  ASSERT_THAT(setsockopt(fd.get(), SOL_SOCKET, SO_PASSCRED, &kSockOptOff,
+                         sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct rtgenmsg rgm;
+  };
+
+  constexpr uint32_t kSeq = 12345;
+
+  struct request req;
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETADDR;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_seq = kSeq;
+  req.rgm.rtgen_family = AF_UNSPEC;
+
+  struct iovec iov = {};
+  iov.iov_base = &req;
+  iov.iov_len = sizeof(req);
+
+  struct msghdr msg = {};
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+  iov.iov_base = NULL;
+  iov.iov_len = 0;
+
+  char control[CMSG_SPACE(sizeof(struct ucred))] = {};
+  msg.msg_control = control;
+  msg.msg_controllen = sizeof(control);
+
+  // Note: This test assumes at least one message is returned by the
+  // RTM_GETADDR request.
+  ASSERT_THAT(RetryEINTR(recvmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+  // No control messages.
+  EXPECT_EQ(CMSG_FIRSTHDR(&msg), nullptr);
+}
+
+// SCM_CREDENTIALS are received with SO_PASSCRED set.
+TEST(NetlinkRouteTest, PasscredCreds) {
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+
+  ASSERT_THAT(setsockopt(fd.get(), SOL_SOCKET, SO_PASSCRED, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct rtgenmsg rgm;
+  };
+
+  constexpr uint32_t kSeq = 12345;
+
+  struct request req;
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETADDR;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_seq = kSeq;
+  req.rgm.rtgen_family = AF_UNSPEC;
+
+  struct iovec iov = {};
+  iov.iov_base = &req;
+  iov.iov_len = sizeof(req);
+
+  struct msghdr msg = {};
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+  iov.iov_base = NULL;
+  iov.iov_len = 0;
+
+  char control[CMSG_SPACE(sizeof(struct ucred))] = {};
+  msg.msg_control = control;
+  msg.msg_controllen = sizeof(control);
+
+  // Note: This test assumes at least one message is returned by the
+  // RTM_GETADDR request.
+  ASSERT_THAT(RetryEINTR(recvmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+  struct ucred creds;
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+  ASSERT_NE(cmsg, nullptr);
+  ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(creds)));
+  ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+  ASSERT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+
+  memcpy(&creds, CMSG_DATA(cmsg), sizeof(creds));
+
+  // The peer is the kernel, which is "PID" 0.
+  EXPECT_EQ(creds.pid, 0);
+  // The kernel identifies as root. Also allow nobody in case this test is
+  // running in a userns without root mapped.
+  EXPECT_THAT(creds.uid, AnyOf(Eq(0), Eq(65534)));
+  EXPECT_THAT(creds.gid, AnyOf(Eq(0), Eq(65534)));
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From 3b4f5445d03f7d2f170d68a8a4969b8acbad773e Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 4 Nov 2019 09:54:14 -0800
Subject: Update membarrier bug

Updates #267

PiperOrigin-RevId: 278402684
---
 pkg/sentry/syscalls/linux/linux64_amd64.go | 2 +-
 pkg/sentry/syscalls/linux/linux64_arm64.go | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 3021440ed..81e4f93a6 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -364,7 +364,7 @@ var AMD64 = &kernel.SyscallTable{
 		321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
 		322: syscalls.Supported("execveat", Execveat),
 		323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
-		324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(b/118904897)
+		324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
 		325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
 
 		// Syscalls after 325 are "backports" from versions of Linux after 4.4.
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 4cf7f836a..a809115e0 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -297,7 +297,7 @@ var ARM64 = &kernel.SyscallTable{
 		280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
 		281: syscalls.ErrorWithEvent("execveat", syserror.ENOSYS, "", []string{"gvisor.dev/issue/265"}),    // TODO(b/118901836)
 		282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
-		283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(b/118904897)
+		283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
 		284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
 		285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
 		286: syscalls.Supported("preadv2", Preadv2),
-- 
cgit v1.2.3


From b23b36e701c40827065217f4652a51eebc5f9913 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 4 Nov 2019 10:06:00 -0800
Subject: Add NETLINK_KOBJECT_UEVENT socket support

NETLINK_KOBJECT_UEVENT sockets send udev-style messages for device events.
gVisor doesn't have any device events, so our sockets don't need to do anything
once created.

systemd's device manager needs to be able to create one of these sockets. It
also wants to install a BPF filter on the socket. Since we'll never send any
messages, the filter would never be invoked, thus we just fake it out.

Fixes #1117
Updates #1119

PiperOrigin-RevId: 278405893
---
 pkg/sentry/socket/netlink/provider.go        |   7 ++
 pkg/sentry/socket/netlink/route/protocol.go  |   5 +
 pkg/sentry/socket/netlink/socket.go          |  42 ++++++++
 pkg/sentry/socket/netlink/uevent/BUILD       |  17 +++
 pkg/sentry/socket/netlink/uevent/protocol.go |  60 +++++++++++
 runsc/boot/BUILD                             |   1 +
 runsc/boot/loader.go                         |   1 +
 test/syscalls/BUILD                          |   4 +
 test/syscalls/linux/BUILD                    |  29 +++++
 test/syscalls/linux/socket_netdevice.cc      |   3 +-
 test/syscalls/linux/socket_netlink.cc        | 153 +++++++++++++++++++++++++++
 test/syscalls/linux/socket_netlink_route.cc  | 140 ++++--------------------
 test/syscalls/linux/socket_netlink_uevent.cc |  83 +++++++++++++++
 test/syscalls/linux/socket_netlink_util.cc   |   5 +-
 test/syscalls/linux/socket_netlink_util.h    |   5 +-
 15 files changed, 431 insertions(+), 124 deletions(-)
 create mode 100644 pkg/sentry/socket/netlink/uevent/BUILD
 create mode 100644 pkg/sentry/socket/netlink/uevent/protocol.go
 create mode 100644 test/syscalls/linux/socket_netlink.cc
 create mode 100644 test/syscalls/linux/socket_netlink_uevent.cc

diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go
index 689cad997..be005df24 100644
--- a/pkg/sentry/socket/netlink/provider.go
+++ b/pkg/sentry/socket/netlink/provider.go
@@ -30,6 +30,13 @@ type Protocol interface {
 	// Protocol returns the Linux netlink protocol value.
 	Protocol() int
 
+	// CanSend returns true if this protocol may ever send messages.
+	//
+	// TODO(gvisor.dev/issue/1119): This is a workaround to allow
+	// advertising support for otherwise unimplemented features on sockets
+	// that will never send messages, thus making those features no-ops.
+	CanSend() bool
+
 	// ProcessMessage processes a single message from userspace.
 	//
 	// If err == nil, any messages added to ms will be sent back to the
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index cc70ac237..6b4a0ecf4 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -61,6 +61,11 @@ func (p *Protocol) Protocol() int {
 	return linux.NETLINK_ROUTE
 }
 
+// CanSend implements netlink.Protocol.CanSend.
+func (p *Protocol) CanSend() bool {
+	return true
+}
+
 // dumpLinks handles RTM_GETLINK + NLM_F_DUMP requests.
 func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
 	// NLM_F_DUMP + RTM_GETLINK messages are supposed to include an
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 05dac4f0a..4a1b87a9a 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -54,6 +54,8 @@ const (
 	maxSendBufferSize = 4 << 20 // 4MB
 )
 
+var errNoFilter = syserr.New("no filter attached", linux.ENOENT)
+
 // netlinkSocketDevice is the netlink socket virtual device.
 var netlinkSocketDevice = device.NewAnonDevice()
 
@@ -108,6 +110,12 @@ type Socket struct {
 
 	// passcred indicates if this socket wants SCM credentials.
 	passcred bool
+
+	// filter indicates that this socket has a BPF filter "installed".
+	//
+	// TODO(gvisor.dev/issue/1119): We don't actually support filtering,
+	// this is just bookkeeping for tracking add/remove.
+	filter bool
 }
 
 var _ socket.Socket = (*Socket)(nil)
@@ -400,6 +408,40 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy
 			s.mu.Unlock()
 			return nil
 
+		case linux.SO_ATTACH_FILTER:
+			// TODO(gvisor.dev/issue/1119): We don't actually
+			// support filtering. If this socket can't ever send
+			// messages, then there is nothing to filter and we can
+			// advertise support. Otherwise, be conservative and
+			// return an error.
+			if s.protocol.CanSend() {
+				socket.SetSockOptEmitUnimplementedEvent(t, name)
+				return syserr.ErrProtocolNotAvailable
+			}
+
+			s.mu.Lock()
+			s.filter = true
+			s.mu.Unlock()
+			return nil
+
+		case linux.SO_DETACH_FILTER:
+			// TODO(gvisor.dev/issue/1119): See above.
+			if s.protocol.CanSend() {
+				socket.SetSockOptEmitUnimplementedEvent(t, name)
+				return syserr.ErrProtocolNotAvailable
+			}
+
+			s.mu.Lock()
+			filter := s.filter
+			s.filter = false
+			s.mu.Unlock()
+
+			if !filter {
+				return errNoFilter
+			}
+
+			return nil
+
 		default:
 			socket.SetSockOptEmitUnimplementedEvent(t, name)
 		}
diff --git a/pkg/sentry/socket/netlink/uevent/BUILD b/pkg/sentry/socket/netlink/uevent/BUILD
new file mode 100644
index 000000000..0777f3baf
--- /dev/null
+++ b/pkg/sentry/socket/netlink/uevent/BUILD
@@ -0,0 +1,17 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "uevent",
+    srcs = ["protocol.go"],
+    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent",
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/context",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/socket/netlink",
+        "//pkg/syserr",
+    ],
+)
diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go
new file mode 100644
index 000000000..b5d7808d7
--- /dev/null
+++ b/pkg/sentry/socket/netlink/uevent/protocol.go
@@ -0,0 +1,60 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package uevent provides a NETLINK_KOBJECT_UEVENT socket protocol.
+//
+// NETLINK_KOBJECT_UEVENT sockets send udev-style device events. gVisor does
+// not support any device events, so these sockets never send any messages.
+package uevent
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
+	"gvisor.dev/gvisor/pkg/syserr"
+)
+
+// Protocol implements netlink.Protocol.
+//
+// +stateify savable
+type Protocol struct{}
+
+var _ netlink.Protocol = (*Protocol)(nil)
+
+// NewProtocol creates a NETLINK_KOBJECT_UEVENT netlink.Protocol.
+func NewProtocol(t *kernel.Task) (netlink.Protocol, *syserr.Error) {
+	return &Protocol{}, nil
+}
+
+// Protocol implements netlink.Protocol.Protocol.
+func (p *Protocol) Protocol() int {
+	return linux.NETLINK_KOBJECT_UEVENT
+}
+
+// CanSend implements netlink.Protocol.CanSend.
+func (p *Protocol) CanSend() bool {
+	return false
+}
+
+// ProcessMessage implements netlink.Protocol.ProcessMessage.
+func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+	// Silently ignore all messages.
+	return nil
+}
+
+// init registers the NETLINK_KOBJECT_UEVENT provider.
+func init() {
+	netlink.RegisterProvider(linux.NETLINK_KOBJECT_UEVENT, NewProtocol)
+}
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 6fe2b57de..58e86ae7f 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -60,6 +60,7 @@ go_library(
         "//pkg/sentry/socket/hostinet",
         "//pkg/sentry/socket/netlink",
         "//pkg/sentry/socket/netlink/route",
+        "//pkg/sentry/socket/netlink/uevent",
         "//pkg/sentry/socket/netstack",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/state",
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 4d1bd2d08..f05d5973f 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -65,6 +65,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route"
+	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
 )
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index a53a23afd..3e5b6b3c3 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -511,8 +511,12 @@ syscall_test(test = "//test/syscalls/linux:socket_ip_unbound_test")
 
 syscall_test(test = "//test/syscalls/linux:socket_netdevice_test")
 
+syscall_test(test = "//test/syscalls/linux:socket_netlink_test")
+
 syscall_test(test = "//test/syscalls/linux:socket_netlink_route_test")
 
+syscall_test(test = "//test/syscalls/linux:socket_netlink_uevent_test")
+
 syscall_test(test = "//test/syscalls/linux:socket_blocking_local_test")
 
 syscall_test(test = "//test/syscalls/linux:socket_blocking_ip_test")
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 833fbaa09..93bff8299 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2675,6 +2675,20 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "socket_netlink_test",
+    testonly = 1,
+    srcs = ["socket_netlink.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        "//test/util:file_descriptor",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_binary(
     name = "socket_netlink_route_test",
     testonly = 1,
@@ -2692,6 +2706,21 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "socket_netlink_uevent_test",
+    testonly = 1,
+    srcs = ["socket_netlink_uevent.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_netlink_util",
+        ":socket_test_util",
+        "//test/util:file_descriptor",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 # These socket tests are in a library because the test cases are shared
 # across several test build targets.
 cc_library(
diff --git a/test/syscalls/linux/socket_netdevice.cc b/test/syscalls/linux/socket_netdevice.cc
index 765f8e0e4..405dbbd73 100644
--- a/test/syscalls/linux/socket_netdevice.cc
+++ b/test/syscalls/linux/socket_netdevice.cc
@@ -68,7 +68,8 @@ TEST(NetdeviceTest, Netmask) {
 
   // Use a netlink socket to get the netmask, which we'll then compare to the
   // netmask obtained via ioctl.
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
diff --git a/test/syscalls/linux/socket_netlink.cc b/test/syscalls/linux/socket_netlink.cc
new file mode 100644
index 000000000..4ec0fd4fa
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink.cc
@@ -0,0 +1,153 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/netlink.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+// Tests for all netlink socket protocols.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// NetlinkTest parameter is the protocol to test.
+using NetlinkTest = ::testing::TestWithParam<int>;
+
+// Netlink sockets must be SOCK_DGRAM or SOCK_RAW.
+TEST_P(NetlinkTest, Types) {
+  const int protocol = GetParam();
+
+  EXPECT_THAT(socket(AF_NETLINK, SOCK_STREAM, protocol),
+              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+  EXPECT_THAT(socket(AF_NETLINK, SOCK_SEQPACKET, protocol),
+              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+  EXPECT_THAT(socket(AF_NETLINK, SOCK_RDM, protocol),
+              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+  EXPECT_THAT(socket(AF_NETLINK, SOCK_DCCP, protocol),
+              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+  EXPECT_THAT(socket(AF_NETLINK, SOCK_PACKET, protocol),
+              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+
+  int fd;
+  EXPECT_THAT(fd = socket(AF_NETLINK, SOCK_DGRAM, protocol), SyscallSucceeds());
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+
+  EXPECT_THAT(fd = socket(AF_NETLINK, SOCK_RAW, protocol), SyscallSucceeds());
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST_P(NetlinkTest, AutomaticPort) {
+  const int protocol = GetParam();
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, protocol));
+
+  struct sockaddr_nl addr = {};
+  addr.nl_family = AF_NETLINK;
+
+  EXPECT_THAT(
+      bind(fd.get(), reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+      SyscallSucceeds());
+
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                          &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, sizeof(addr));
+  // This is the only netlink socket in the process, so it should get the PID as
+  // the port id.
+  //
+  // N.B. Another process could theoretically have explicitly reserved our pid
+  // as a port ID, but that is very unlikely.
+  EXPECT_EQ(addr.nl_pid, getpid());
+}
+
+// Calling connect automatically binds to an automatic port.
+TEST_P(NetlinkTest, ConnectBinds) {
+  const int protocol = GetParam();
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, protocol));
+
+  struct sockaddr_nl addr = {};
+  addr.nl_family = AF_NETLINK;
+
+  EXPECT_THAT(connect(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                      sizeof(addr)),
+              SyscallSucceeds());
+
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                          &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, sizeof(addr));
+
+  // Each test is running in a pid namespace, so another process can explicitly
+  // reserve our pid as a port ID. In this case, a negative portid value will be
+  // set.
+  if (static_cast<pid_t>(addr.nl_pid) > 0) {
+    EXPECT_EQ(addr.nl_pid, getpid());
+  }
+
+  memset(&addr, 0, sizeof(addr));
+  addr.nl_family = AF_NETLINK;
+
+  // Connecting again is allowed, but keeps the same port.
+  EXPECT_THAT(connect(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                      sizeof(addr)),
+              SyscallSucceeds());
+
+  addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                          &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, sizeof(addr));
+  EXPECT_EQ(addr.nl_pid, getpid());
+}
+
+TEST_P(NetlinkTest, GetPeerName) {
+  const int protocol = GetParam();
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, protocol));
+
+  struct sockaddr_nl addr = {};
+  socklen_t addrlen = sizeof(addr);
+
+  EXPECT_THAT(getpeername(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                          &addrlen),
+              SyscallSucceeds());
+
+  EXPECT_EQ(addrlen, sizeof(addr));
+  EXPECT_EQ(addr.nl_family, AF_NETLINK);
+  // Peer is the kernel if we didn't connect elsewhere.
+  EXPECT_EQ(addr.nl_pid, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(ProtocolTest, NetlinkTest,
+                         ::testing::Values(NETLINK_ROUTE,
+                                           NETLINK_KOBJECT_UEVENT));
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index be0dadcd6..ef567f512 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -41,112 +41,7 @@ namespace {
 using ::testing::AnyOf;
 using ::testing::Eq;
 
-// Netlink sockets must be SOCK_DGRAM or SOCK_RAW.
-TEST(NetlinkRouteTest, Types) {
-  EXPECT_THAT(socket(AF_NETLINK, SOCK_STREAM, NETLINK_ROUTE),
-              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
-  EXPECT_THAT(socket(AF_NETLINK, SOCK_SEQPACKET, NETLINK_ROUTE),
-              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
-  EXPECT_THAT(socket(AF_NETLINK, SOCK_RDM, NETLINK_ROUTE),
-              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
-  EXPECT_THAT(socket(AF_NETLINK, SOCK_DCCP, NETLINK_ROUTE),
-              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
-  EXPECT_THAT(socket(AF_NETLINK, SOCK_PACKET, NETLINK_ROUTE),
-              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
-
-  int fd;
-  EXPECT_THAT(fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE),
-              SyscallSucceeds());
-  EXPECT_THAT(close(fd), SyscallSucceeds());
-
-  EXPECT_THAT(fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE),
-              SyscallSucceeds());
-  EXPECT_THAT(close(fd), SyscallSucceeds());
-}
-
-TEST(NetlinkRouteTest, AutomaticPort) {
-  FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE));
-
-  struct sockaddr_nl addr = {};
-  addr.nl_family = AF_NETLINK;
-
-  EXPECT_THAT(
-      bind(fd.get(), reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
-      SyscallSucceeds());
-
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
-                          &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, sizeof(addr));
-  // This is the only netlink socket in the process, so it should get the PID as
-  // the port id.
-  //
-  // N.B. Another process could theoretically have explicitly reserved our pid
-  // as a port ID, but that is very unlikely.
-  EXPECT_EQ(addr.nl_pid, getpid());
-}
-
-// Calling connect automatically binds to an automatic port.
-TEST(NetlinkRouteTest, ConnectBinds) {
-  FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE));
-
-  struct sockaddr_nl addr = {};
-  addr.nl_family = AF_NETLINK;
-
-  EXPECT_THAT(connect(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
-                      sizeof(addr)),
-              SyscallSucceeds());
-
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
-                          &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, sizeof(addr));
-
-  // Each test is running in a pid namespace, so another process can explicitly
-  // reserve our pid as a port ID. In this case, a negative portid value will be
-  // set.
-  if (static_cast<pid_t>(addr.nl_pid) > 0) {
-    EXPECT_EQ(addr.nl_pid, getpid());
-  }
-
-  memset(&addr, 0, sizeof(addr));
-  addr.nl_family = AF_NETLINK;
-
-  // Connecting again is allowed, but keeps the same port.
-  EXPECT_THAT(connect(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
-                      sizeof(addr)),
-              SyscallSucceeds());
-
-  addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
-                          &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, sizeof(addr));
-  EXPECT_EQ(addr.nl_pid, getpid());
-}
-
-TEST(NetlinkRouteTest, GetPeerName) {
-  FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE));
-
-  struct sockaddr_nl addr = {};
-  socklen_t addrlen = sizeof(addr);
-
-  EXPECT_THAT(getpeername(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
-                          &addrlen),
-              SyscallSucceeds());
-
-  EXPECT_EQ(addrlen, sizeof(addr));
-  EXPECT_EQ(addr.nl_family, AF_NETLINK);
-  // Peer is the kernel if we didn't connect elsewhere.
-  EXPECT_EQ(addr.nl_pid, 0);
-}
-
-// Parameters for GetSockOpt test. They are:
+// Parameters for SockOptTest. They are:
 // 0: Socket option to query.
 // 1: A predicate to run on the returned sockopt value. Should return true if
 //    the value is considered ok.
@@ -219,7 +114,8 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
 }
 
 TEST(NetlinkRouteTest, GetLinkDump) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
@@ -260,7 +156,8 @@ TEST(NetlinkRouteTest, GetLinkDump) {
 }
 
 TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
@@ -293,7 +190,8 @@ TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
 }
 
 TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
@@ -332,7 +230,8 @@ TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
 }
 
 TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
@@ -373,7 +272,8 @@ TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
 }
 
 TEST(NetlinkRouteTest, ControlMessageIgnored) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
@@ -408,7 +308,8 @@ TEST(NetlinkRouteTest, ControlMessageIgnored) {
 }
 
 TEST(NetlinkRouteTest, GetAddrDump) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
@@ -468,7 +369,8 @@ TEST(NetlinkRouteTest, LookupAll) {
 
 // GetRouteDump tests a RTM_GETROUTE + NLM_F_DUMP request.
 TEST(NetlinkRouteTest, GetRouteDump) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
@@ -544,7 +446,8 @@ TEST(NetlinkRouteTest, GetRouteDump) {
 // buffer. MSG_TRUNC with a zero length buffer should consume subsequent
 // messages off the socket.
 TEST(NetlinkRouteTest, RecvmsgTrunc) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
@@ -620,7 +523,8 @@ TEST(NetlinkRouteTest, RecvmsgTrunc) {
 // it, so a properly sized buffer can be allocated to store the message. This
 // test tests that scenario.
 TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
@@ -695,7 +599,8 @@ TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
 
 // No SCM_CREDENTIALS are received without SO_PASSCRED set.
 TEST(NetlinkRouteTest, NoPasscredNoCreds) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   ASSERT_THAT(setsockopt(fd.get(), SOL_SOCKET, SO_PASSCRED, &kSockOptOff,
                          sizeof(kSockOptOff)),
@@ -742,7 +647,8 @@ TEST(NetlinkRouteTest, NoPasscredNoCreds) {
 
 // SCM_CREDENTIALS are received with SO_PASSCRED set.
 TEST(NetlinkRouteTest, PasscredCreds) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   ASSERT_THAT(setsockopt(fd.get(), SOL_SOCKET, SO_PASSCRED, &kSockOptOn,
                          sizeof(kSockOptOn)),
diff --git a/test/syscalls/linux/socket_netlink_uevent.cc b/test/syscalls/linux/socket_netlink_uevent.cc
new file mode 100644
index 000000000..da425bed4
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_uevent.cc
@@ -0,0 +1,83 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/filter.h>
+#include <linux/netlink.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_netlink_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+// Tests for NETLINK_KOBJECT_UEVENT sockets.
+//
+// gVisor never sends any messages on these sockets, so we don't test the events
+// themselves.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// SO_PASSCRED can be enabled. Since no messages are sent in gVisor, we don't
+// actually test receiving credentials.
+TEST(NetlinkUeventTest, PassCred) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_KOBJECT_UEVENT));
+
+  EXPECT_THAT(setsockopt(fd.get(), SOL_SOCKET, SO_PASSCRED, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+}
+
+// SO_DETACH_FILTER fails without a filter already installed.
+TEST(NetlinkUeventTest, DetachNoFilter) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_KOBJECT_UEVENT));
+
+  int opt;
+  EXPECT_THAT(
+      setsockopt(fd.get(), SOL_SOCKET, SO_DETACH_FILTER, &opt, sizeof(opt)),
+      SyscallFailsWithErrno(ENOENT));
+}
+
+// We can attach a BPF filter.
+TEST(NetlinkUeventTest, AttachFilter) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_KOBJECT_UEVENT));
+
+  // Minimal BPF program: a single ret.
+  struct sock_filter filter = {0x6, 0, 0, 0};
+  struct sock_fprog prog = {};
+  prog.len = 1;
+  prog.filter = &filter;
+
+  EXPECT_THAT(
+      setsockopt(fd.get(), SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog)),
+      SyscallSucceeds());
+
+  int opt;
+  EXPECT_THAT(
+      setsockopt(fd.get(), SOL_SOCKET, SO_DETACH_FILTER, &opt, sizeof(opt)),
+      SyscallSucceeds());
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index fcb8f8a88..5f05bab10 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -16,7 +16,6 @@
 
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
-#include <linux/rtnetlink.h>
 
 #include <vector>
 
@@ -27,9 +26,9 @@
 namespace gvisor {
 namespace testing {
 
-PosixErrorOr<FileDescriptor> NetlinkBoundSocket() {
+PosixErrorOr<FileDescriptor> NetlinkBoundSocket(int protocol) {
   FileDescriptor fd;
-  ASSIGN_OR_RETURN_ERRNO(fd, Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE));
+  ASSIGN_OR_RETURN_ERRNO(fd, Socket(AF_NETLINK, SOCK_RAW, protocol));
 
   struct sockaddr_nl addr = {};
   addr.nl_family = AF_NETLINK;
diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index db8639a2f..da99f0d60 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -17,7 +17,6 @@
 
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
-#include <linux/rtnetlink.h>
 
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
@@ -25,8 +24,8 @@
 namespace gvisor {
 namespace testing {
 
-// Returns a bound NETLINK_ROUTE socket.
-PosixErrorOr<FileDescriptor> NetlinkBoundSocket();
+// Returns a bound netlink socket.
+PosixErrorOr<FileDescriptor> NetlinkBoundSocket(int protocol);
 
 // Returns the port ID of the passed socket.
 PosixErrorOr<uint32_t> NetlinkPortID(int fd);
-- 
cgit v1.2.3


From 4fdd69d681bb3abb68a043377a2fb0ec8a031d54 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 4 Nov 2019 10:56:13 -0800
Subject: Check that a file is a regular file with open(O_TRUNC).

It was possible to panic the sentry by opening a cache revalidating folder with
O_TRUNC|O_CREAT.

PiperOrigin-RevId: 278417533
---
 pkg/sentry/fs/inode_operations.go     | 2 ++
 pkg/sentry/syscalls/linux/sys_file.go | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go
index 5cde9d215..d6c35c2dc 100644
--- a/pkg/sentry/fs/inode_operations.go
+++ b/pkg/sentry/fs/inode_operations.go
@@ -221,6 +221,8 @@ type InodeOperations interface {
 	// sys_ftruncate.
 	//
 	// Implementations need not check that length >= 0.
+	//
+	// Truncate must only be called on regular files.
 	Truncate(ctx context.Context, inode *Inode, size int64) error
 
 	// Allocate allows the caller to reserve disk space for the inode.
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index b9a8e3e21..c9f57fe27 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -169,7 +169,7 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint
 			if dirPath {
 				return syserror.ENOTDIR
 			}
-			if flags&linux.O_TRUNC != 0 {
+			if flags&linux.O_TRUNC != 0 && fs.IsRegular(d.Inode.StableAttr) {
 				if err := d.Inode.Truncate(t, d, 0); err != nil {
 					return err
 				}
@@ -397,7 +397,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
 			}
 
 			// Should we truncate the file?
-			if flags&linux.O_TRUNC != 0 {
+			if flags&linux.O_TRUNC != 0 && fs.IsRegular(found.Inode.StableAttr) {
 				if err := found.Inode.Truncate(t, found, 0); err != nil {
 					return err
 				}
@@ -1483,7 +1483,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		if fs.IsDir(d.Inode.StableAttr) {
 			return syserror.EISDIR
 		}
-		if !fs.IsFile(d.Inode.StableAttr) {
+		if !fs.IsRegular(d.Inode.StableAttr) {
 			return syserror.EINVAL
 		}
 
@@ -1523,7 +1523,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 
 	// Note that this is different from truncate(2) above, where a
 	// directory returns EISDIR.
-	if !fs.IsFile(file.Dirent.Inode.StableAttr) {
+	if !fs.IsRegular(file.Dirent.Inode.StableAttr) {
 		return 0, nil, syserror.EINVAL
 	}
 
-- 
cgit v1.2.3


From 1e21496e95e9587b69339aa88d4e228013e4d0bf Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Mon, 4 Nov 2019 11:26:28 -0800
Subject: Bump rules_go to v0.20.2 and go toolchain to v1.13.4.

PiperOrigin-RevId: 278424814
---
 WORKSPACE | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index 57e6f3558..f6d2f4f32 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -3,10 +3,10 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 
 http_archive(
     name = "io_bazel_rules_go",
-    sha256 = "842ec0e6b4fbfdd3de6150b61af92901eeb73681fd4d185746644c338f51d4c0",
+    sha256 = "b9aa86ec08a292b97ec4591cf578e020b35f98e12173bbd4a921f84f583aebd9",
     urls = [
-        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/rules_go/releases/download/v0.20.1/rules_go-v0.20.1.tar.gz",
-        "https://github.com/bazelbuild/rules_go/releases/download/v0.20.1/rules_go-v0.20.1.tar.gz",
+        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/rules_go/releases/download/v0.20.2/rules_go-v0.20.2.tar.gz",
+        "https://github.com/bazelbuild/rules_go/releases/download/v0.20.2/rules_go-v0.20.2.tar.gz",
     ],
 )
 
@@ -24,7 +24,7 @@ load("@io_bazel_rules_go//go:deps.bzl", "go_rules_dependencies", "go_register_to
 go_rules_dependencies()
 
 go_register_toolchains(
-    go_version = "1.13.3",
+    go_version = "1.13.4",
     nogo = "@//:nogo",
 )
 
-- 
cgit v1.2.3


From 493334f8b594eb1c2b0f5a6133dbedad4e0ecd32 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 4 Nov 2019 15:59:11 -0800
Subject: kokoro: run KVM syscall tests

We don't know how stable they are, so let's start with warning.

PiperOrigin-RevId: 278484186
---
 kokoro/syscall_kvm_tests.cfg  |  9 +++++++++
 scripts/syscall_kvm_tests.sh  | 21 +++++++++++++++++++++
 test/syscalls/linux/itimer.cc |  6 ++++++
 3 files changed, 36 insertions(+)
 create mode 100644 kokoro/syscall_kvm_tests.cfg
 create mode 100755 scripts/syscall_kvm_tests.sh

diff --git a/kokoro/syscall_kvm_tests.cfg b/kokoro/syscall_kvm_tests.cfg
new file mode 100644
index 000000000..3b99e9c13
--- /dev/null
+++ b/kokoro/syscall_kvm_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/syscall_kvm_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/scripts/syscall_kvm_tests.sh b/scripts/syscall_kvm_tests.sh
new file mode 100755
index 000000000..de85daa5a
--- /dev/null
+++ b/scripts/syscall_kvm_tests.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# TODO(b/112165693): "test --test_tag_filters=runsc_kvm" can be used
+# when the "manual" tag will be removed for kvm tests.
+test `bazel query "attr(tags, runsc_kvm, tests(//test/syscalls/...))"`
diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc
index 930d2b940..b77e4cbd1 100644
--- a/test/syscalls/linux/itimer.cc
+++ b/test/syscalls/linux/itimer.cc
@@ -267,6 +267,9 @@ int TestSIGPROFFairness(absl::Duration sleep) {
 // Random save/restore is disabled as it introduces additional latency and
 // unpredictable distribution patterns.
 TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive_NoRandomSave) {
+  // TODO(b/143247272): CPU time accounting is inaccurate for the KVM platform.
+  SKIP_IF(GvisorPlatform() == Platform::kKVM);
+
   pid_t child;
   int execve_errno;
   auto kill = ASSERT_NO_ERRNO_AND_VALUE(
@@ -288,6 +291,9 @@ TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive_NoRandomSave) {
 // Random save/restore is disabled as it introduces additional latency and
 // unpredictable distribution patterns.
 TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyIdle_NoRandomSave) {
+  // TODO(b/143247272): CPU time accounting is inaccurate for the KVM platform.
+  SKIP_IF(GvisorPlatform() == Platform::kKVM);
+
   pid_t child;
   int execve_errno;
   auto kill = ASSERT_NO_ERRNO_AND_VALUE(
-- 
cgit v1.2.3


From e904823833bb166a514c98bd628704379de93b47 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 5 Nov 2019 15:06:06 -0800
Subject: Fix repository build scripts.

This fixes a number of issues with the repository build process:

 * Fix the overall structure of the repository.
 * Fix the debian package description.
 * Fix the broken version number for packages.
 * Update the digest algorithm used for signing the release.

I've validated that installation works from a separate staging bucket.

Updates #852

PiperOrigin-RevId: 278716914
---
 kokoro/build.cfg         |  1 +
 runsc/BUILD              | 10 ++++-
 runsc/debian/description |  6 +--
 scripts/build.sh         | 96 ++++++++++++++++++++++++++----------------------
 tools/make_repository.sh | 67 +++++++++++++++++++++++----------
 5 files changed, 111 insertions(+), 69 deletions(-)

diff --git a/kokoro/build.cfg b/kokoro/build.cfg
index 6c1d262d4..c9ceda947 100644
--- a/kokoro/build.cfg
+++ b/kokoro/build.cfg
@@ -19,5 +19,6 @@ action {
     regex: "**/runsc"
     regex: "**/runsc.*"
     regex: "**/dists/**"
+    regex: "**/pool/**"
   }
 }
diff --git a/runsc/BUILD b/runsc/BUILD
index e4e8e64a3..e5587421d 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -76,16 +76,24 @@ pkg_tar(
 
 genrule(
     name = "deb-version",
+    # Note that runsc must appear in the srcs parameter and not the tools
+    # parameter, otherwise it will not be stamped. This is reasonable, as tools
+    # may be encoded differently in the build graph (cached more aggressively
+    # because they are assumes to be hermetic).
+    srcs = [":runsc"],
     outs = ["version.txt"],
     cmd = "$(location :runsc) -version | grep 'runsc version' | sed 's/^[^0-9]*//' > $@",
     stamp = 1,
-    tools = [":runsc"],
 )
 
 pkg_deb(
     name = "runsc-debian",
     architecture = "amd64",
     data = ":debian-data",
+    # Note that the description_file will be flatten (all newlines removed),
+    # and therefore it is kept to a simple one-line description. The expected
+    # format for debian packages is "short summary\nLonger explanation of
+    # tool." and this is impossible with the flattening.
     description_file = "debian/description",
     homepage = "https://gvisor.dev/",
     maintainer = "The gVisor Authors <gvisor-dev@googlegroups.com>",
diff --git a/runsc/debian/description b/runsc/debian/description
index 6e3b1b2c0..9e8e08805 100644
--- a/runsc/debian/description
+++ b/runsc/debian/description
@@ -1,5 +1 @@
-gVisor is a user-space kernel, written in Go, that implements a substantial
-portion of the Linux system surface. It includes an Open Container Initiative
-(OCI) runtime called runsc that provides an isolation boundary between the
-application and the host kernel. The runsc runtime integrates with Docker and
-Kubernetes, making it simple to run sandboxed containers.
+gVisor container sandbox runtime
diff --git a/scripts/build.sh b/scripts/build.sh
index 0b3d1b316..8b2094cb0 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -17,63 +17,71 @@
 source $(dirname $0)/common.sh
 
 # Install required packages for make_repository.sh et al.
-sudo apt-get update && sudo apt-get install -y dpkg-sig coreutils apt-utils
+sudo apt-get update && sudo apt-get install -y dpkg-sig coreutils apt-utils xz-utils
 
 # Build runsc.
 runsc=$(build -c opt //runsc)
 
 # Build packages.
-pkg=$(build -c opt //runsc:runsc-debian)
+pkgs=$(build -c opt //runsc:runsc-debian)
+
+# Stop here if we have no artifacts directory.
+[[ -v KOKORO_ARTIFACTS_DIR ]] || exit 0
+
+# install_raw installs raw artifacts.
+install_raw() {
+  mkdir -p "$1"
+  cp -f "${runsc}" "$1"/runsc
+  sha512sum "$1"/runsc | awk '{print $1 "  runsc"}' > "$1"/runsc.sha512
+}
 
 # Build a repository, if the key is available.
+#
+# Note that make_repository.sh script will install packages into the provided
+# root, but will output to stdout a directory that can be copied arbitrarily
+# into "${KOKORO_ARTIFACTS_DIR}"/dists/XXX. We do things this way because we
+# will copy the same repository structure into multiple locations, below.
 if [[ -v KOKORO_REPO_KEY ]]; then
-  repo=$(tools/make_repository.sh "${KOKORO_KEYSTORE_DIR}/${KOKORO_REPO_KEY}" gvisor-bot@google.com main ${pkg})
+  repo=$(tools/make_repository.sh \
+          "${KOKORO_KEYSTORE_DIR}/${KOKORO_REPO_KEY}" \
+          gvisor-bot@google.com \
+          main \
+          "${KOKORO_ARTIFACTS_DIR}" \
+          ${pkgs})
 fi
 
-# Install installs artifacts.
-install() {
-  local -r binaries_dir="$1"
-  local -r repo_dir="$2"
-  mkdir -p "${binaries_dir}"
-  cp -f "${runsc}" "${binaries_dir}"/runsc
-  sha512sum "${binaries_dir}"/runsc | awk '{print $1 "  runsc"}' > "${binaries_dir}"/runsc.sha512
+# install_repo installs a repository.
+#
+# Note that packages are already installed, as noted above.
+install_repo() {
   if [[ -v repo ]]; then
-    rm -rf "${repo_dir}" && mkdir -p "$(dirname "${repo_dir}")"
-    cp -a "${repo}" "${repo_dir}"
+    rm -rf "$1" && mkdir -p "$(dirname "$1")" && cp -a "${repo}" "$1"
   fi
 }
 
-# Move the runsc binary into "latest" directory, and also a directory with the
-# current date. If the current commit happens to correpond to a tag, then we
-# will also move everything into a directory named after the given tag.
-if [[ -v KOKORO_ARTIFACTS_DIR ]]; then
-  if [[ "${KOKORO_BUILD_NIGHTLY:-false}" == "true" ]]; then
-    # The "latest" directory and current date.
-    stamp="$(date -Idate)"
-    install "${KOKORO_ARTIFACTS_DIR}/nightly/latest" \
-            "${KOKORO_ARTIFACTS_DIR}/dists/nightly/latest"
-    install "${KOKORO_ARTIFACTS_DIR}/nightly/${stamp}" \
-            "${KOKORO_ARTIFACTS_DIR}/dists/nightly/${stamp}"
-  else
-    # Is it a tagged release? Build that instead. In that case, we also try to
-    # update the base release directory, in case this is an update. Finally, we
-    # update the "release" directory, which has the last released version.
-    tags="$(git tag --points-at HEAD)"
-    if ! [[ -z "${tags}" ]]; then
-      # Note that a given commit can match any number of tags. We have to
-      # iterate through all possible tags and produce associated artifacts.
-      for tag in ${tags}; do
-        name=$(echo "${tag}" | cut -d'-' -f2)
-        base=$(echo "${name}" | cut -d'.' -f1)
-        install "${KOKORO_ARTIFACTS_DIR}/release/${name}" \
-                "${KOKORO_ARTIFACTS_DIR}/dists/${name}"
-        if [[ "${base}" != "${tag}" ]]; then
-          install "${KOKORO_ARTIFACTS_DIR}/release/${base}" \
-                  "${KOKORO_ARTIFACTS_DIR}/dists/${base}"
-        fi
-        install "${KOKORO_ARTIFACTS_DIR}/release/latest" \
-                "${KOKORO_ARTIFACTS_DIR}/dists/latest"
-      done
-    fi
+# If nightly, install only nightly artifacts.
+if [[ "${KOKORO_BUILD_NIGHTLY:-false}" == "true" ]]; then
+  # The "latest" directory and current date.
+  stamp="$(date -Idate)"
+  install_raw  "${KOKORO_ARTIFACTS_DIR}/nightly/latest"
+  install_raw  "${KOKORO_ARTIFACTS_DIR}/nightly/${stamp}"
+  install_repo "${KOKORO_ARTIFACTS_DIR}/dists/nightly"
+else
+  # We keep only the latest master raw release.
+  install_raw  "${KOKORO_ARTIFACTS_DIR}/master/latest"
+  install_repo "${KOKORO_ARTIFACTS_DIR}/dists/master"
+
+  # Is it a tagged release? Build that too.
+  tags="$(git tag --points-at HEAD)"
+  if ! [[ -z "${tags}" ]]; then
+    # Note that a given commit can match any number of tags. We have to iterate
+    # through all possible tags and produce associated artifacts.
+    for tag in ${tags}; do
+      name=$(echo "${tag}" | cut -d'-' -f2)
+      base=$(echo "${name}" | cut -d'.' -f1)
+      install_raw  "${KOKORO_ARTIFACTS_DIR}/release/${name}"
+      install_repo "${KOKORO_ARTIFACTS_DIR}/dists/release"
+      install_repo "${KOKORO_ARTIFACTS_DIR}/dists/${base}"
+    done
   fi
 fi
diff --git a/tools/make_repository.sh b/tools/make_repository.sh
index 071f72b74..27ffbc9f3 100755
--- a/tools/make_repository.sh
+++ b/tools/make_repository.sh
@@ -17,13 +17,13 @@
 # Parse arguments. We require more than two arguments, which are the private
 # keyring, the e-mail associated with the signer, and the list of packages.
 if [ "$#" -le 3 ]; then
-  echo "usage: $0 <private-key> <signer-email> <component> <packages...>"
+  echo "usage: $0 <private-key> <signer-email> <component> <root> <packages...>"
   exit 1
 fi
-declare -r private_key=$(readlink -e "$1")
-declare -r signer="$2"
-declare -r component="$3"
-shift; shift; shift
+declare -r private_key=$(readlink -e "$1"); shift
+declare -r signer="$1"; shift
+declare -r component="$1"; shift
+declare -r root="$1"; shift
 
 # Verbose from this point.
 set -xeo pipefail
@@ -40,7 +40,7 @@ cleanup() {
 trap cleanup EXIT
 gpg --no-default-keyring --keyring "${keyring}" --import "${private_key}" >&2
 
-# Copy the packages, and ensure permissions are correct.
+# Copy the packages into the root.
 for pkg in "$@"; do
   name=$(basename "${pkg}" .deb)
   name=$(basename "${name}" .changes)
@@ -48,32 +48,61 @@ for pkg in "$@"; do
   if [[ "${name}" == "${arch}" ]]; then
     continue # Not a regular package.
   fi
-  mkdir -p "${tmpdir}"/"${component}"/binary-"${arch}"
-  cp -a "${pkg}" "${tmpdir}"/"${component}"/binary-"${arch}"
+  if [[ "${pkg}" =~ ^.*\.deb$ ]]; then
+    # Extract from the debian file.
+    version=$(dpkg --info "${pkg}" | grep -E 'Version:' | cut -d':' -f2)
+  elif [[ "${pkg}" =~ ^.*\.changes$ ]]; then
+    # Extract from the changes file.
+    version=$(grep -E 'Version:' "${pkg}" | cut -d':' -f2)
+  else
+    # Unsupported file type.
+    echo "Unknown file type: ${pkg}"
+    exit 1
+  fi
+  version=${version// /} # Trim whitespace.
+  mkdir -p "${root}"/pool/"${version}"/binary-"${arch}"
+  cp -a "${pkg}" "${root}"/pool/"${version}"/binary-"${arch}"
 done
-find "${tmpdir}" -type f -exec chmod 0644 {} \;
 
-# Ensure there are no symlinks hanging around; these may be remnants of the
-# build process. They may be useful for other things, but we are going to build
-# an index of the actual packages here.
-find "${tmpdir}" -type l -exec rm -f {} \;
+# Ensure all permissions are correct.
+find "${root}"/pool -type f -exec chmod 0644 {} \;
 
 # Sign all packages.
-for file in "${tmpdir}"/"${component}"/binary-*/*.deb; do
+for file in "${root}"/pool/*/binary-*/*.deb; do
   dpkg-sig -g "--no-default-keyring --keyring ${keyring}" --sign builder "${file}" >&2
 done
 
 # Build the package list.
-for dir in "${tmpdir}"/"${component}"/binary-*; do
-  (cd "${dir}" && apt-ftparchive packages . | gzip > Packages.gz)
+declare arches=()
+for dir in "${root}"/pool/*/binary-*; do
+  name=$(basename "${dir}")
+  arch=${name##binary-}
+  arches+=("${arch}")
+  repo_packages="${tmpdir}"/"${component}"/"${name}"
+  mkdir -p "${repo_packages}"
+  (cd "${root}" && apt-ftparchive --arch "${arch}" packages pool > "${repo_packages}"/Packages)
+  (cd "${repo_packages}" && cat Packages | gzip > Packages.gz)
+  (cd "${repo_packages}" && cat Packages | xz > Packages.xz)
 done
 
 # Build the release list.
-(cd "${tmpdir}" && apt-ftparchive release . > Release)
+cat > "${tmpdir}"/apt.conf <<EOF
+APT {
+  FTPArchive {
+    Release {
+      Architectures "${arches[@]}";
+      Components "${component}";
+    };
+  };
+};
+EOF
+(cd "${tmpdir}" && apt-ftparchive -c=apt.conf release . > Release)
+rm "${tmpdir}"/apt.conf
 
 # Sign the release.
-(cd "${tmpdir}" && gpg --no-default-keyring --keyring "${keyring}" --clearsign -o InRelease Release >&2)
-(cd "${tmpdir}" && gpg --no-default-keyring --keyring "${keyring}" -abs -o Release.gpg Release >&2)
+declare -r digest_opts=("--digest-algo" "SHA512" "--cert-digest-algo" "SHA512")
+(cd "${tmpdir}" && gpg --no-default-keyring --keyring "${keyring}" --clearsign "${digest_opts[@]}" -o InRelease Release >&2)
+(cd "${tmpdir}" && gpg --no-default-keyring --keyring "${keyring}" -abs "${digest_opts[@]}" -o Release.gpg Release >&2)
 
 # Show the results.
 echo "${tmpdir}"
-- 
cgit v1.2.3


From 57f6dbc4be5c9c5416c9d3a442eacfb797e57e9c Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 5 Nov 2019 17:02:15 -0800
Subject: test/root: check that memory accouting works as expected

PiperOrigin-RevId: 278739427
---
 test/root/cgroup_test.go | 54 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/test/root/cgroup_test.go b/test/root/cgroup_test.go
index 76f1e4f2a..4038661cb 100644
--- a/test/root/cgroup_test.go
+++ b/test/root/cgroup_test.go
@@ -24,6 +24,7 @@ import (
 	"strconv"
 	"strings"
 	"testing"
+	"time"
 
 	"gvisor.dev/gvisor/runsc/cgroup"
 	"gvisor.dev/gvisor/runsc/dockerutil"
@@ -55,6 +56,59 @@ func verifyPid(pid int, path string) error {
 	return fmt.Errorf("got: %s, want: %d", gots, pid)
 }
 
+// TestCgroup sets cgroup options and checks that cgroup was properly configured.
+func TestMemCGroup(t *testing.T) {
+	allocMemSize := 128 << 20
+	if err := dockerutil.Pull("python"); err != nil {
+		t.Fatal("docker pull failed:", err)
+	}
+	d := dockerutil.MakeDocker("memusage-test")
+
+	// Start a new container and allocate the specified about of memory.
+	args := []string{
+		"--memory=256MB",
+		"python",
+		"python",
+		"-c",
+		fmt.Sprintf("import time; s = 'a' * %d; time.sleep(100)", allocMemSize),
+	}
+	if err := d.Run(args...); err != nil {
+		t.Fatal("docker create failed:", err)
+	}
+	defer d.CleanUp()
+
+	gid, err := d.ID()
+	if err != nil {
+		t.Fatalf("Docker.ID() failed: %v", err)
+	}
+	t.Logf("cgroup ID: %s", gid)
+
+	path := filepath.Join("/sys/fs/cgroup/memory/docker", gid, "memory.usage_in_bytes")
+	memUsage := 0
+
+	// Wait when the container will allocate memory.
+	start := time.Now()
+	for time.Now().Sub(start) < 30*time.Second {
+		outRaw, err := ioutil.ReadFile(path)
+		if err != nil {
+			t.Fatalf("failed to read %q: %v", path, err)
+		}
+		out := strings.TrimSpace(string(outRaw))
+		memUsage, err = strconv.Atoi(out)
+		if err != nil {
+			t.Fatalf("Atoi(%v): %v", out, err)
+		}
+
+		if memUsage > allocMemSize {
+			return
+		}
+
+		time.Sleep(100 * time.Millisecond)
+	}
+
+	t.Fatalf("%vMB is less than %vMB: %v", memUsage>>20, allocMemSize>>20)
+}
+
 // TestCgroup sets cgroup options and checks that cgroup was properly configured.
 func TestCgroup(t *testing.T) {
 	if err := dockerutil.Pull("alpine"); err != nil {
-- 
cgit v1.2.3


From a824b48ceac4e2e3bacd23d63e72881c76d669c8 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 6 Nov 2019 10:38:02 -0800
Subject: Validate incoming NDP Router Advertisements, as per RFC 4861 section
 6.1.2

This change validates incoming NDP Router Advertisements as per RFC 4861 section
6.1.2. It also includes the skeleton to handle Router Advertiements that arrive
on some NIC.

Tests: Unittest to make sure only valid NDP Router Advertisements are received/
not dropped.
PiperOrigin-RevId: 278891972
---
 pkg/tcpip/network/ipv6/icmp.go      |  51 +++++++++-
 pkg/tcpip/network/ipv6/icmp_test.go |   4 +-
 pkg/tcpip/network/ipv6/ndp_test.go  | 189 +++++++++++++++++++++++++++++++++++-
 pkg/tcpip/stack/ndp.go              |  55 +++++++++++
 pkg/tcpip/stack/nic.go              |  13 ++-
 pkg/tcpip/stack/stack.go            |  16 +++
 6 files changed, 322 insertions(+), 6 deletions(-)

diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index c3f1dd488..05e8c075b 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -86,7 +86,8 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 
 	// As per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1, 7.1.2 and
 	// 8.1, nodes MUST silently drop NDP packets where the Hop Limit field
-	// in the IPv6 header is not set to 255.
+	// in the IPv6 header is not set to 255, or the ICMPv6 Code field is not
+	// set to 0.
 	switch h.Type() {
 	case header.ICMPv6NeighborSolicit,
 		header.ICMPv6NeighborAdvert,
@@ -97,6 +98,11 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			received.Invalid.Increment()
 			return
 		}
+
+		if h.Code() != 0 {
+			received.Invalid.Increment()
+			return
+		}
 	}
 
 	// TODO(b/112892170): Meaningfully handle all ICMP types.
@@ -309,8 +315,51 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 		received.RouterSolicit.Increment()
 
 	case header.ICMPv6RouterAdvert:
+		routerAddr := iph.SourceAddress()
+
+		//
+		// Validate the RA as per RFC 4861 section 6.1.2.
+		//
+
+		// Is the IP Source Address a link-local address?
+		if !header.IsV6LinkLocalAddress(routerAddr) {
+			// ...No, silently drop the packet.
+			received.Invalid.Increment()
+			return
+		}
+
+		p := h.NDPPayload()
+
+		// Is the NDP payload of sufficient size to hold a Router
+		// Advertisement?
+		if len(p) < header.NDPRAMinimumSize {
+			// ...No, silently drop the packet.
+			received.Invalid.Increment()
+			return
+		}
+
+		ra := header.NDPRouterAdvert(p)
+		opts := ra.Options()
+
+		// Are options valid as per the wire format?
+		if _, err := opts.Iter(true); err != nil {
+			// ...No, silently drop the packet.
+			received.Invalid.Increment()
+			return
+		}
+
+		//
+		// At this point, we have a valid Router Advertisement, as far
+		// as RFC 4861 section 6.1.2 is concerned.
+		//
+
 		received.RouterAdvert.Increment()
 
+		// Tell the NIC to handle the RA.
+		stack := r.Stack()
+		rxNICID := r.NICID()
+		stack.HandleNDPRA(rxNICID, routerAddr, ra)
+
 	case header.ICMPv6RedirectMsg:
 		received.RedirectMsg.Increment()
 
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index b112303b6..d686f79ce 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -131,7 +131,7 @@ func TestICMPCounts(t *testing.T) {
 		{header.ICMPv6EchoRequest, header.ICMPv6EchoMinimumSize},
 		{header.ICMPv6EchoReply, header.ICMPv6EchoMinimumSize},
 		{header.ICMPv6RouterSolicit, header.ICMPv6MinimumSize},
-		{header.ICMPv6RouterAdvert, header.ICMPv6MinimumSize},
+		{header.ICMPv6RouterAdvert, header.ICMPv6HeaderSize + header.NDPRAMinimumSize},
 		{header.ICMPv6NeighborSolicit, header.ICMPv6NeighborSolicitMinimumSize},
 		{header.ICMPv6NeighborAdvert, header.ICMPv6NeighborAdvertSize},
 		{header.ICMPv6RedirectMsg, header.ICMPv6MinimumSize},
@@ -426,7 +426,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 		{
 			"RouterAdvert",
 			header.ICMPv6RouterAdvert,
-			header.ICMPv6MinimumSize,
+			header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
 			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.RouterAdvert
 			},
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index c32716f2e..69ab7ba12 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
 )
@@ -109,7 +110,7 @@ func TestHopLimitValidation(t *testing.T) {
 		{"RouterSolicit", header.ICMPv6RouterSolicit, header.ICMPv6MinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 			return stats.RouterSolicit
 		}},
-		{"RouterAdvert", header.ICMPv6RouterAdvert, header.ICMPv6MinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+		{"RouterAdvert", header.ICMPv6RouterAdvert, header.ICMPv6HeaderSize + header.NDPRAMinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 			return stats.RouterAdvert
 		}},
 		{"NeighborSolicit", header.ICMPv6NeighborSolicit, header.ICMPv6NeighborSolicitMinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
@@ -179,3 +180,189 @@ func TestHopLimitValidation(t *testing.T) {
 		})
 	}
 }
+
+// TestRouterAdvertValidation tests that when the NIC is configured to handle
+// NDP Router Advertisement packets, it validates the Router Advertisement
+// properly before handling them.
+func TestRouterAdvertValidation(t *testing.T) {
+	tests := []struct {
+		name            string
+		src             tcpip.Address
+		hopLimit        uint8
+		code            uint8
+		ndpPayload      []byte
+		expectedSuccess bool
+	}{
+		{
+			"OK",
+			lladdr0,
+			255,
+			0,
+			[]byte{
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+			},
+			true,
+		},
+		{
+			"NonLinkLocalSourceAddr",
+			addr1,
+			255,
+			0,
+			[]byte{
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+			},
+			false,
+		},
+		{
+			"HopLimitNot255",
+			lladdr0,
+			254,
+			0,
+			[]byte{
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+			},
+			false,
+		},
+		{
+			"NonZeroCode",
+			lladdr0,
+			255,
+			1,
+			[]byte{
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+			},
+			false,
+		},
+		{
+			"NDPPayloadTooSmall",
+			lladdr0,
+			255,
+			0,
+			[]byte{
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0,
+			},
+			false,
+		},
+		{
+			"OKWithOptions",
+			lladdr0,
+			255,
+			0,
+			[]byte{
+				// RA payload
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+
+				// Option #1 (TargetLinkLayerAddress)
+				2, 1, 0, 0, 0, 0, 0, 0,
+
+				// Option #2 (unrecognized)
+				255, 1, 0, 0, 0, 0, 0, 0,
+
+				// Option #3 (PrefixInformation)
+				3, 4, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+			},
+			true,
+		},
+		{
+			"OptionWithZeroLength",
+			lladdr0,
+			255,
+			0,
+			[]byte{
+				// RA payload
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+
+				// Option #1 (TargetLinkLayerAddress)
+				// Invalid as it has 0 length.
+				2, 0, 0, 0, 0, 0, 0, 0,
+
+				// Option #2 (unrecognized)
+				255, 1, 0, 0, 0, 0, 0, 0,
+
+				// Option #3 (PrefixInformation)
+				3, 4, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+			},
+			false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e := channel.New(10, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			icmpSize := header.ICMPv6HeaderSize + len(test.ndpPayload)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
+			pkt := header.ICMPv6(hdr.Prepend(icmpSize))
+			pkt.SetType(header.ICMPv6RouterAdvert)
+			pkt.SetCode(test.code)
+			copy(pkt.NDPPayload(), test.ndpPayload)
+			payloadLength := hdr.UsedLength()
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.src, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(icmp.ProtocolNumber6),
+				HopLimit:      test.hopLimit,
+				SrcAddr:       test.src,
+				DstAddr:       header.IPv6AllNodesMulticastAddress,
+			})
+
+			stats := s.Stats().ICMP.V6PacketsReceived
+			invalid := stats.Invalid
+			rxRA := stats.RouterAdvert
+
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+			if got := rxRA.Value(); got != 0 {
+				t.Fatalf("got rxRA = %d, want = 0", got)
+			}
+
+			e.Inject(header.IPv6ProtocolNumber, hdr.View().ToVectorisedView())
+
+			if test.expectedSuccess {
+				if got := invalid.Value(); got != 0 {
+					t.Fatalf("got invalid = %d, want = 0", got)
+				}
+				if got := rxRA.Value(); got != 1 {
+					t.Fatalf("got rxRA = %d, want = 1", got)
+				}
+
+			} else {
+				if got := invalid.Value(); got != 1 {
+					t.Fatalf("got invalid = %d, want = 1", got)
+				}
+				if got := rxRA.Value(); got != 0 {
+					t.Fatalf("got rxRA = %d, want = 0", got)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 03ddebdbd..d5352bb5f 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -38,6 +38,19 @@ const (
 	// Default = 1s (from RFC 4861 section 10).
 	defaultRetransmitTimer = time.Second
 
+	// defaultHandleRAs is the default configuration for whether or not to
+	// handle incoming Router Advertisements as a host.
+	//
+	// Default = true.
+	defaultHandleRAs = true
+
+	// defaultDiscoverDefaultRouters is the default configuration for
+	// whether or not to discover default routers from incoming Router
+	// Advertisements as a host.
+	//
+	// Default = true.
+	defaultDiscoverDefaultRouters = true
+
 	// minimumRetransmitTimer is the minimum amount of time to wait between
 	// sending NDP Neighbor solicitation messages. Note, RFC 4861 does
 	// not impose a minimum Retransmit Timer, but we do here to make sure
@@ -49,6 +62,13 @@ const (
 	//
 	// Min = 1ms.
 	minimumRetransmitTimer = time.Millisecond
+
+	// MaxDiscoveredDefaultRouters is the maximum number of discovered
+	// default routers. The stack should stop discovering new routers after
+	// discovering MaxDiscoveredDefaultRouters routers.
+	//
+	// Max = 10.
+	MaxDiscoveredDefaultRouters = 10
 )
 
 // NDPDispatcher is the interface integrators of netstack must implement to
@@ -80,6 +100,15 @@ type NDPConfigurations struct {
 	//
 	// Must be greater than 0.5s.
 	RetransmitTimer time.Duration
+
+	// HandleRAs determines whether or not Router Advertisements will be
+	// processed.
+	HandleRAs bool
+
+	// DiscoverDefaultRouters determines whether or not default routers will
+	// be discovered from Router Advertisements. This configuration is
+	// ignored if HandleRAs is false.
+	DiscoverDefaultRouters bool
 }
 
 // DefaultNDPConfigurations returns an NDPConfigurations populated with
@@ -88,6 +117,8 @@ func DefaultNDPConfigurations() NDPConfigurations {
 	return NDPConfigurations{
 		DupAddrDetectTransmits: defaultDupAddrDetectTransmits,
 		RetransmitTimer:        defaultRetransmitTimer,
+		HandleRAs:              defaultHandleRAs,
+		DiscoverDefaultRouters: defaultDiscoverDefaultRouters,
 	}
 }
 
@@ -112,6 +143,9 @@ type ndpState struct {
 
 	// The DAD state to send the next NS message, or resolve the address.
 	dad map[tcpip.Address]dadState
+
+	// The default routers discovered through Router Advertisements.
+	defaultRouters map[tcpip.Address]defaultRouterState
 }
 
 // dadState holds the Duplicate Address Detection timer and channel to signal
@@ -127,6 +161,12 @@ type dadState struct {
 	done *bool
 }
 
+// defaultRouterState holds data associated with a default router discovered by
+// a Router Advertisement.
+type defaultRouterState struct {
+	invalidationTimer *time.Timer
+}
+
 // startDuplicateAddressDetection performs Duplicate Address Detection.
 //
 // This function must only be called by IPv6 addresses that are currently
@@ -319,3 +359,18 @@ func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) {
 		go ndp.nic.stack.ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, false, nil)
 	}
 }
+
+// handleRA handles a Router Advertisement message that arrived on the NIC
+// this ndp is for.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
+	// Is the NIC configured to handle RAs at all?
+	if !ndp.configs.HandleRAs {
+		return
+	}
+
+	// TODO(b/140882146): Do Router Discovery.
+	// TODO(b/140948104): Do Prefix Discovery.
+	// TODO(b/141556115): Do Parameter Discovery.
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index fe8f83d58..12969c74e 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -115,8 +115,9 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback
 			},
 		},
 		ndp: ndpState{
-			configs: stack.ndpConfigs,
-			dad:     make(map[tcpip.Address]dadState),
+			configs:        stack.ndpConfigs,
+			dad:            make(map[tcpip.Address]dadState),
+			defaultRouters: make(map[tcpip.Address]defaultRouterState),
 		},
 	}
 	nic.ndp.nic = nic
@@ -960,6 +961,14 @@ func (n *NIC) setNDPConfigs(c NDPConfigurations) {
 	n.mu.Unlock()
 }
 
+// handleNDPRA handles an NDP Router Advertisement message that arrived on n.
+func (n *NIC) handleNDPRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	n.ndp.handleRA(ip, ra)
+}
+
 type networkEndpointKind int32
 
 const (
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 115a6fcb8..8b141cafd 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -1557,6 +1557,22 @@ func (s *Stack) SetNDPConfigurations(id tcpip.NICID, c NDPConfigurations) *tcpip
 	return nil
 }
 
+// HandleNDPRA provides a NIC with ID id a validated NDP Router Advertisement
+// message that it needs to handle.
+func (s *Stack) HandleNDPRA(id tcpip.NICID, ip tcpip.Address, ra header.NDPRouterAdvert) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	nic, ok := s.nics[id]
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+
+	nic.handleNDPRA(ip, ra)
+
+	return nil
+}
+
 // PortSeed returns a 32 bit value that can be used as a seed value for port
 // picking.
 //
-- 
cgit v1.2.3


From d0d89ceeddd21f1f22e818d78dc3b07d3669dbb5 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 6 Nov 2019 10:42:00 -0800
Subject: Send a TCP RST in response to a TCP SYN-ACK on a listening endpoint

This change better follows what is outlined in RFC 793 section 3.4 figure 12
where a listening socket should not accept a SYN-ACK segment in response to a
(potentially) old SYN segment.

Tests: Test that checks the TCP RST segment sent in response to a TCP SYN-ACK
segment received on a listening TCP endpoint.
PiperOrigin-RevId: 278893114
---
 pkg/tcpip/transport/tcp/accept.go   |  9 ++++++
 pkg/tcpip/transport/tcp/segment.go  | 10 +++++--
 pkg/tcpip/transport/tcp/tcp_test.go | 56 +++++++++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 1dd00d026..cb0e13ebc 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -400,6 +401,14 @@ func (e *endpoint) acceptQueueIsFull() bool {
 // handleListenSegment is called when a listening endpoint receives a segment
 // and needs to handle it.
 func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
+	if s.flagsAreSet(header.TCPFlagSyn | header.TCPFlagAck) {
+		// RFC 793 section 3.4 page 35 (figure 12) outlines that a RST
+		// must be sent in response to a SYN-ACK while in the listen
+		// state to prevent completing a handshake from an old SYN.
+		e.sendTCP(&s.route, s.id, buffer.VectorisedView{}, e.ttl, e.sendTOS, header.TCPFlagRst, s.ackNumber, 0, 0, nil, nil)
+		return
+	}
+
 	// TODO(b/143300739): Use the userMSS of the listening socket
 	// for accepted sockets.
 
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index ea725d513..c4a89525e 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -99,8 +99,14 @@ func (s *segment) clone() *segment {
 	return t
 }
 
-func (s *segment) flagIsSet(flag uint8) bool {
-	return (s.flags & flag) != 0
+// flagIsSet checks if at least one flag in flags is set in s.flags.
+func (s *segment) flagIsSet(flags uint8) bool {
+	return s.flags&flags != 0
+}
+
+// flagsAreSet checks if all flags in flags are set in s.flags.
+func (s *segment) flagsAreSet(flags uint8) bool {
+	return s.flags&flags == flags
 }
 
 func (s *segment) decRef() {
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 126f26ed3..beaa40210 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -598,6 +598,62 @@ func TestUserSuppliedMSSOnConnectV6(t *testing.T) {
 	}
 }
 
+func TestSendRstOnListenerRxSynAckV4(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn | header.TCPFlagAck,
+		SeqNum:  100,
+		AckNum:  200,
+	})
+
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst),
+		checker.SeqNum(200)))
+}
+
+func TestSendRstOnListenerRxSynAckV6(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateV6Endpoint(true)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	c.SendV6Packet(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn | header.TCPFlagAck,
+		SeqNum:  100,
+		AckNum:  200,
+	})
+
+	checker.IPv6(t, c.GetV6Packet(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst),
+		checker.SeqNum(200)))
+}
+
 func TestTOSV4(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
-- 
cgit v1.2.3


From e1b21f3c8ca989dc94b25526fda1bb107691f1af Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 6 Nov 2019 14:24:38 -0800
Subject: Use PacketBuffers, rather than VectorisedViews, in netstack.

PacketBuffers are analogous to Linux's sk_buff. They hold all information about
a packet, headers, and payload. This is important for:

* iptables to access various headers of packets
* Preventing the clutter of passing different net and link headers along with
  VectorisedViews to packet handling functions.

This change only affects the incoming packet path, and a future change will
change the outgoing path.

Benchmark               Regular         PacketBufferPtr  PacketBufferConcrete
--------------------------------------------------------------------------------
BM_Recvmsg             400.715MB/s      373.676MB/s      396.276MB/s
BM_Sendmsg             361.832MB/s      333.003MB/s      335.571MB/s
BM_Recvfrom            453.336MB/s      393.321MB/s      381.650MB/s
BM_Sendto              378.052MB/s      372.134MB/s      341.342MB/s
BM_SendmsgTCP/0/1k     353.711MB/s      316.216MB/s      322.747MB/s
BM_SendmsgTCP/0/2k     600.681MB/s      588.776MB/s      565.050MB/s
BM_SendmsgTCP/0/4k     995.301MB/s      888.808MB/s      941.888MB/s
BM_SendmsgTCP/0/8k     1.517GB/s        1.274GB/s        1.345GB/s
BM_SendmsgTCP/0/16k    1.872GB/s        1.586GB/s        1.698GB/s
BM_SendmsgTCP/0/32k    1.017GB/s        1.020GB/s        1.133GB/s
BM_SendmsgTCP/0/64k    475.626MB/s      584.587MB/s      627.027MB/s
BM_SendmsgTCP/0/128k   416.371MB/s      503.434MB/s      409.850MB/s
BM_SendmsgTCP/0/256k   323.449MB/s      449.599MB/s      388.852MB/s
BM_SendmsgTCP/0/512k   243.992MB/s      267.676MB/s      314.474MB/s
BM_SendmsgTCP/0/1M     95.138MB/s       95.874MB/s       95.417MB/s
BM_SendmsgTCP/0/2M     96.261MB/s       94.977MB/s       96.005MB/s
BM_SendmsgTCP/0/4M     96.512MB/s       95.978MB/s       95.370MB/s
BM_SendmsgTCP/0/8M     95.603MB/s       95.541MB/s       94.935MB/s
BM_SendmsgTCP/0/16M    94.598MB/s       94.696MB/s       94.521MB/s
BM_SendmsgTCP/0/32M    94.006MB/s       94.671MB/s       94.768MB/s
BM_SendmsgTCP/0/64M    94.133MB/s       94.333MB/s       94.746MB/s
BM_SendmsgTCP/0/128M   93.615MB/s       93.497MB/s       93.573MB/s
BM_SendmsgTCP/0/256M   93.241MB/s       95.100MB/s       93.272MB/s
BM_SendmsgTCP/1/1k     303.644MB/s      316.074MB/s      308.430MB/s
BM_SendmsgTCP/1/2k     537.093MB/s      584.962MB/s      529.020MB/s
BM_SendmsgTCP/1/4k     882.362MB/s      939.087MB/s      892.285MB/s
BM_SendmsgTCP/1/8k     1.272GB/s        1.394GB/s        1.296GB/s
BM_SendmsgTCP/1/16k    1.802GB/s        2.019GB/s        1.830GB/s
BM_SendmsgTCP/1/32k    2.084GB/s        2.173GB/s        2.156GB/s
BM_SendmsgTCP/1/64k    2.515GB/s        2.463GB/s        2.473GB/s
BM_SendmsgTCP/1/128k   2.811GB/s        3.004GB/s        2.946GB/s
BM_SendmsgTCP/1/256k   3.008GB/s        3.159GB/s        3.171GB/s
BM_SendmsgTCP/1/512k   2.980GB/s        3.150GB/s        3.126GB/s
BM_SendmsgTCP/1/1M     2.165GB/s        2.233GB/s        2.163GB/s
BM_SendmsgTCP/1/2M     2.370GB/s        2.219GB/s        2.453GB/s
BM_SendmsgTCP/1/4M     2.005GB/s        2.091GB/s        2.214GB/s
BM_SendmsgTCP/1/8M     2.111GB/s        2.013GB/s        2.109GB/s
BM_SendmsgTCP/1/16M    1.902GB/s        1.868GB/s        1.897GB/s
BM_SendmsgTCP/1/32M    1.655GB/s        1.665GB/s        1.635GB/s
BM_SendmsgTCP/1/64M    1.575GB/s        1.547GB/s        1.575GB/s
BM_SendmsgTCP/1/128M   1.524GB/s        1.584GB/s        1.580GB/s
BM_SendmsgTCP/1/256M   1.579GB/s        1.607GB/s        1.593GB/s

PiperOrigin-RevId: 278940079
---
 pkg/tcpip/BUILD                                    |  2 +
 pkg/tcpip/link/channel/channel.go                  | 10 ++--
 pkg/tcpip/link/fdbased/endpoint.go                 |  4 +-
 pkg/tcpip/link/fdbased/endpoint_test.go            | 27 ++++-----
 pkg/tcpip/link/fdbased/mmap.go                     |  5 +-
 pkg/tcpip/link/fdbased/packet_dispatchers.go       | 18 ++++--
 pkg/tcpip/link/loopback/loopback.go                | 10 +++-
 pkg/tcpip/link/muxed/injectable.go                 |  4 +-
 pkg/tcpip/link/sharedmem/sharedmem.go              |  7 ++-
 pkg/tcpip/link/sharedmem/sharedmem_test.go         |  9 ++-
 pkg/tcpip/link/sniffer/sniffer.go                  | 12 ++--
 pkg/tcpip/link/waitable/waitable.go                |  4 +-
 pkg/tcpip/link/waitable/waitable_test.go           |  8 +--
 pkg/tcpip/network/arp/arp.go                       |  4 +-
 pkg/tcpip/network/arp/arp_test.go                  |  4 +-
 pkg/tcpip/network/ip_test.go                       | 34 ++++++++----
 pkg/tcpip/network/ipv4/icmp.go                     | 34 +++++++-----
 pkg/tcpip/network/ipv4/ipv4.go                     | 43 +++++++++------
 pkg/tcpip/network/ipv4/ipv4_test.go                |  4 +-
 pkg/tcpip/network/ipv6/icmp.go                     | 48 ++++++++--------
 pkg/tcpip/network/ipv6/icmp_test.go                | 24 +++++---
 pkg/tcpip/network/ipv6/ipv6.go                     | 28 ++++++----
 pkg/tcpip/network/ipv6/ipv6_test.go                |  8 ++-
 pkg/tcpip/network/ipv6/ndp_test.go                 |  8 ++-
 pkg/tcpip/packet_buffer.go                         | 54 ++++++++++++++++++
 pkg/tcpip/packet_buffer_state.go                   | 26 +++++++++
 pkg/tcpip/stack/ndp_test.go                        |  4 +-
 pkg/tcpip/stack/nic.go                             | 48 ++++++++--------
 pkg/tcpip/stack/registration.go                    | 64 +++++++++++++---------
 pkg/tcpip/stack/stack.go                           |  4 +-
 pkg/tcpip/stack/stack_test.go                      | 50 +++++++++++------
 pkg/tcpip/stack/transport_demuxer.go               | 53 +++++++++---------
 pkg/tcpip/stack/transport_demuxer_test.go          |  4 +-
 pkg/tcpip/stack/transport_test.go                  | 34 ++++++++----
 pkg/tcpip/transport/icmp/endpoint.go               | 18 +++---
 pkg/tcpip/transport/icmp/protocol.go               |  2 +-
 pkg/tcpip/transport/packet/endpoint.go             | 19 ++++---
 pkg/tcpip/transport/raw/endpoint.go                | 17 +++---
 pkg/tcpip/transport/tcp/endpoint.go                |  6 +-
 pkg/tcpip/transport/tcp/forwarder.go               |  5 +-
 pkg/tcpip/transport/tcp/protocol.go                |  4 +-
 pkg/tcpip/transport/tcp/segment.go                 |  5 +-
 pkg/tcpip/transport/tcp/testing/context/context.go | 16 ++++--
 pkg/tcpip/transport/udp/endpoint.go                | 20 +++----
 pkg/tcpip/transport/udp/forwarder.go               |  9 ++-
 pkg/tcpip/transport/udp/protocol.go                | 30 +++++-----
 pkg/tcpip/transport/udp/udp_test.go                | 19 +++++--
 test/syscalls/linux/raw_socket_icmp.cc             |  2 +-
 48 files changed, 542 insertions(+), 330 deletions(-)
 create mode 100644 pkg/tcpip/packet_buffer.go
 create mode 100644 pkg/tcpip/packet_buffer_state.go

diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
index 3c2b2b5ea..65d4d0cd8 100644
--- a/pkg/tcpip/BUILD
+++ b/pkg/tcpip/BUILD
@@ -6,6 +6,8 @@ package(licenses = ["notice"])
 go_library(
     name = "tcpip",
     srcs = [
+        "packet_buffer.go",
+        "packet_buffer_state.go",
         "tcpip.go",
         "time_unsafe.go",
     ],
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 14f197a77..22eefb564 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -65,14 +65,14 @@ func (e *Endpoint) Drain() int {
 	}
 }
 
-// Inject injects an inbound packet.
-func (e *Endpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
-	e.InjectLinkAddr(protocol, "", vv)
+// InjectInbound injects an inbound packet.
+func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+	e.InjectLinkAddr(protocol, "", pkt)
 }
 
 // InjectLinkAddr injects an inbound packet with a remote link address.
-func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, vv buffer.VectorisedView) {
-	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, vv.Clone(nil), nil /* linkHeader */)
+func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt tcpip.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt)
 }
 
 // Attach saves the stack network-layer dispatcher for use later when packets
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index ae4858529..edef7db26 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -598,8 +598,8 @@ func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
 }
 
 // InjectInbound injects an inbound packet.
-func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */)
+func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, pkt)
 }
 
 // NewInjectable creates a new fd-based InjectableEndpoint.
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index e7c05ca4f..7e08e033b 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -43,10 +43,9 @@ const (
 )
 
 type packetInfo struct {
-	raddr      tcpip.LinkAddress
-	proto      tcpip.NetworkProtocolNumber
-	contents   buffer.VectorisedView
-	linkHeader buffer.View
+	raddr    tcpip.LinkAddress
+	proto    tcpip.NetworkProtocolNumber
+	contents tcpip.PacketBuffer
 }
 
 type context struct {
@@ -93,8 +92,8 @@ func (c *context) cleanup() {
 	syscall.Close(c.fds[1])
 }
 
-func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
-	c.ch <- packetInfo{remote, protocol, vv, linkHeader}
+func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+	c.ch <- packetInfo{remote, protocol, pkt}
 }
 
 func TestNoEthernetProperties(t *testing.T) {
@@ -317,19 +316,21 @@ func TestDeliverPacket(t *testing.T) {
 				select {
 				case pi := <-c.ch:
 					want := packetInfo{
-						raddr:      raddr,
-						proto:      proto,
-						contents:   buffer.View(b).ToVectorisedView(),
-						linkHeader: buffer.View(hdr),
+						raddr: raddr,
+						proto: proto,
+						contents: tcpip.PacketBuffer{
+							Data:       buffer.View(b).ToVectorisedView(),
+							LinkHeader: buffer.View(hdr),
+						},
 					}
 					if !eth {
 						want.proto = header.IPv4ProtocolNumber
 						want.raddr = ""
 					}
-					// want.contents will be a single view,
-					// so make pi do the same for the
+					// want.contents.Data will be a single
+					// view, so make pi do the same for the
 					// DeepEqual check.
-					pi.contents = pi.contents.ToView().ToVectorisedView()
+					pi.contents.Data = pi.contents.Data.ToView().ToVectorisedView()
 					if !reflect.DeepEqual(want, pi) {
 						t.Fatalf("Unexpected received packet: %+v, want %+v", pi, want)
 					}
diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go
index 554d45715..62ed1e569 100644
--- a/pkg/tcpip/link/fdbased/mmap.go
+++ b/pkg/tcpip/link/fdbased/mmap.go
@@ -190,6 +190,9 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	pkt = pkt[d.e.hdrSize:]
-	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}), buffer.View(eth))
+	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, tcpip.PacketBuffer{
+		Data:       buffer.View(pkt).ToVectorisedView(),
+		LinkHeader: buffer.View(eth),
+	})
 	return true, nil
 }
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go
index 3331b6453..c67d684ce 100644
--- a/pkg/tcpip/link/fdbased/packet_dispatchers.go
+++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go
@@ -139,10 +139,13 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	used := d.capViews(n, BufConfig)
-	vv := buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...))
-	vv.TrimFront(d.e.hdrSize)
+	pkt := tcpip.PacketBuffer{
+		Data:       buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...)),
+		LinkHeader: buffer.View(eth),
+	}
+	pkt.Data.TrimFront(d.e.hdrSize)
 
-	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv, buffer.View(eth))
+	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, pkt)
 
 	// Prepare e.views for another packet: release used views.
 	for i := 0; i < used; i++ {
@@ -293,9 +296,12 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
 		}
 
 		used := d.capViews(k, int(n), BufConfig)
-		vv := buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...))
-		vv.TrimFront(d.e.hdrSize)
-		d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv, buffer.View(eth))
+		pkt := tcpip.PacketBuffer{
+			Data:       buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...)),
+			LinkHeader: buffer.View(eth),
+		}
+		pkt.Data.TrimFront(d.e.hdrSize)
+		d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, pkt)
 
 		// Prepare e.views for another packet: release used views.
 		for i := 0; i < used; i++ {
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index a3b48fa73..bc5d8a2f3 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -80,12 +80,13 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prependa
 	views := make([]buffer.View, 1, 1+len(payload.Views()))
 	views[0] = hdr.View()
 	views = append(views, payload.Views()...)
-	vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views)
 
 	// Because we're immediately turning around and writing the packet back to the
 	// rx path, we intentionally don't preserve the remote and local link
 	// addresses from the stack.Route we're passed.
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */)
+	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, tcpip.PacketBuffer{
+		Data: buffer.NewVectorisedView(len(views[0])+payload.Size(), views),
+	})
 
 	return nil
 }
@@ -105,7 +106,10 @@ func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
 	// There should be an ethernet header at the beginning of packet.
 	linkHeader := header.Ethernet(packet.First()[:header.EthernetMinimumSize])
 	packet.TrimFront(len(linkHeader))
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), packet, buffer.View(linkHeader))
+	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), tcpip.PacketBuffer{
+		Data:       packet,
+		LinkHeader: buffer.View(linkHeader),
+	})
 
 	return nil
 }
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
index 682b60291..9a8e8ebfe 100644
--- a/pkg/tcpip/link/muxed/injectable.go
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -80,8 +80,8 @@ func (m *InjectableEndpoint) IsAttached() bool {
 }
 
 // InjectInbound implements stack.InjectableLinkEndpoint.
-func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
-	m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */)
+func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+	m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, pkt)
 }
 
 // WritePackets writes outbound packets to the appropriate
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 279e2b457..2bace5298 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -273,8 +273,11 @@ func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
 		}
 
 		// Send packet up the stack.
-		eth := header.Ethernet(b)
-		d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(), buffer.View(eth))
+		eth := header.Ethernet(b[:header.EthernetMinimumSize])
+		d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), tcpip.PacketBuffer{
+			Data:       buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(),
+			LinkHeader: buffer.View(eth),
+		})
 	}
 
 	// Clean state.
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index f3e9705c9..199406886 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -131,13 +131,12 @@ func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress
 	return c
 }
 
-func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
+func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	c.mu.Lock()
 	c.packets = append(c.packets, packetInfo{
-		addr:       remoteLinkAddr,
-		proto:      proto,
-		vv:         vv.Clone(nil),
-		linkHeader: linkHeader,
+		addr:  remoteLinkAddr,
+		proto: proto,
+		vv:    pkt.Data.Clone(nil),
 	})
 	c.mu.Unlock()
 
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 39757ea2a..d71a03cd2 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -116,19 +116,19 @@ func NewWithFile(lower stack.LinkEndpoint, file *os.File, snapLen uint32) (stack
 // DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is
 // called by the link-layer endpoint being wrapped when a packet arrives, and
 // logs the packet before forwarding to the actual dispatcher.
-func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
+func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("recv", protocol, vv.First(), nil)
+		logPacket("recv", protocol, pkt.Data.First(), nil)
 	}
 	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
-		vs := vv.Views()
-		length := vv.Size()
+		vs := pkt.Data.Views()
+		length := pkt.Data.Size()
 		if length > int(e.maxPCAPLen) {
 			length = int(e.maxPCAPLen)
 		}
 
 		buf := bytes.NewBuffer(make([]byte, 0, pcapPacketHeaderLen+length))
-		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(vv.Size()))); err != nil {
+		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(pkt.Data.Size()))); err != nil {
 			panic(err)
 		}
 		for _, v := range vs {
@@ -147,7 +147,7 @@ func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local
 			panic(err)
 		}
 	}
-	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv, linkHeader)
+	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt)
 }
 
 // Attach implements the stack.LinkEndpoint interface. It saves the dispatcher
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
index a04fc1062..b440970e0 100644
--- a/pkg/tcpip/link/waitable/waitable.go
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -50,12 +50,12 @@ func New(lower stack.LinkEndpoint) *Endpoint {
 // It is called by the link-layer endpoint being wrapped when a packet arrives,
 // and only forwards to the actual dispatcher if Wait or WaitDispatch haven't
 // been called.
-func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
+func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	if !e.dispatchGate.Enter() {
 		return
 	}
 
-	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv, linkHeader)
+	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt)
 	e.dispatchGate.Leave()
 }
 
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
index 5f0f8fa2d..df2e70e54 100644
--- a/pkg/tcpip/link/waitable/waitable_test.go
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -35,7 +35,7 @@ type countedEndpoint struct {
 	dispatcher stack.NetworkDispatcher
 }
 
-func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
+func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	e.dispatchCount++
 }
 
@@ -120,21 +120,21 @@ func TestWaitDispatch(t *testing.T) {
 	}
 
 	// Dispatch and check that it goes through.
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
 	if want := 1; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on writes, then try to dispatch. It must go through.
 	wep.WaitWrite()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on dispatches, then try to dispatch. It must not go through.
 	wep.WaitDispatch()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 46178459e..4161ebf87 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -92,8 +92,8 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.Vect
 	return tcpip.ErrNotSupported
 }
 
-func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
-	v := vv.First()
+func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+	v := pkt.Data.First()
 	h := header.ARP(v)
 	if !h.IsValid() {
 		return
diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
index 88b57ec03..47098bfdc 100644
--- a/pkg/tcpip/network/arp/arp_test.go
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -102,7 +102,9 @@ func TestDirectRequest(t *testing.T) {
 
 	inject := func(addr tcpip.Address) {
 		copy(h.ProtocolAddressTarget(), addr)
-		c.linkEP.Inject(arp.ProtocolNumber, v.ToVectorisedView())
+		c.linkEP.InjectInbound(arp.ProtocolNumber, tcpip.PacketBuffer{
+			Data: v.ToVectorisedView(),
+		})
 	}
 
 	for i, address := range []tcpip.Address{stackAddr1, stackAddr2} {
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index 666d8b92a..fe499d47e 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -96,16 +96,16 @@ func (t *testObject) checkValues(protocol tcpip.TransportProtocolNumber, vv buff
 // DeliverTransportPacket is called by network endpoints after parsing incoming
 // packets. This is used by the test object to verify that the results of the
 // parsing are expected.
-func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView) {
-	t.checkValues(protocol, vv, r.RemoteAddress, r.LocalAddress)
+func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) {
+	t.checkValues(protocol, pkt.Data, r.RemoteAddress, r.LocalAddress)
 	t.dataCalls++
 }
 
 // DeliverTransportControlPacket is called by network endpoints after parsing
 // incoming control (ICMP) packets. This is used by the test object to verify
 // that the results of the parsing are expected.
-func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
-	t.checkValues(trans, vv, remote, local)
+func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+	t.checkValues(trans, pkt.Data, remote, local)
 	if typ != t.typ {
 		t.t.Errorf("typ = %v, want %v", typ, t.typ)
 	}
@@ -279,7 +279,9 @@ func TestIPv4Receive(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	ep.HandlePacket(&r, view.ToVectorisedView())
+	ep.HandlePacket(&r, tcpip.PacketBuffer{
+		Data: view.ToVectorisedView(),
+	})
 	if o.dataCalls != 1 {
 		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
 	}
@@ -367,7 +369,9 @@ func TestIPv4ReceiveControl(t *testing.T) {
 			o.extra = c.expectedExtra
 
 			vv := view[:len(view)-c.trunc].ToVectorisedView()
-			ep.HandlePacket(&r, vv)
+			ep.HandlePacket(&r, tcpip.PacketBuffer{
+				Data: vv,
+			})
 			if want := c.expectedCount; o.controlCalls != want {
 				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want)
 			}
@@ -430,13 +434,17 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 	}
 
 	// Send first segment.
-	ep.HandlePacket(&r, frag1.ToVectorisedView())
+	ep.HandlePacket(&r, tcpip.PacketBuffer{
+		Data: frag1.ToVectorisedView(),
+	})
 	if o.dataCalls != 0 {
 		t.Fatalf("Bad number of data calls: got %x, want 0", o.dataCalls)
 	}
 
 	// Send second segment.
-	ep.HandlePacket(&r, frag2.ToVectorisedView())
+	ep.HandlePacket(&r, tcpip.PacketBuffer{
+		Data: frag2.ToVectorisedView(),
+	})
 	if o.dataCalls != 1 {
 		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
 	}
@@ -509,7 +517,9 @@ func TestIPv6Receive(t *testing.T) {
 		t.Fatalf("could not find route: %v", err)
 	}
 
-	ep.HandlePacket(&r, view.ToVectorisedView())
+	ep.HandlePacket(&r, tcpip.PacketBuffer{
+		Data: view.ToVectorisedView(),
+	})
 	if o.dataCalls != 1 {
 		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
 	}
@@ -618,12 +628,12 @@ func TestIPv6ReceiveControl(t *testing.T) {
 			o.typ = c.expectedTyp
 			o.extra = c.expectedExtra
 
-			vv := view[:len(view)-c.trunc].ToVectorisedView()
-
 			// Set ICMPv6 checksum.
 			icmp.SetChecksum(header.ICMPv6Checksum(icmp, outerSrcAddr, localIpv6Addr, buffer.VectorisedView{}))
 
-			ep.HandlePacket(&r, vv)
+			ep.HandlePacket(&r, tcpip.PacketBuffer{
+				Data: view[:len(view)-c.trunc].ToVectorisedView(),
+			})
 			if want := c.expectedCount; o.controlCalls != want {
 				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want)
 			}
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
index 50b363dc4..ce771631c 100644
--- a/pkg/tcpip/network/ipv4/icmp.go
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -15,6 +15,7 @@
 package ipv4
 
 import (
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -24,8 +25,8 @@ import (
 // the original packet that caused the ICMP one to be sent. This information is
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
-func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
-	h := header.IPv4(vv.First())
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+	h := header.IPv4(pkt.Data.First())
 
 	// We don't use IsValid() here because ICMP only requires that the IP
 	// header plus 8 bytes of the transport header be included. So it's
@@ -39,7 +40,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.
 	}
 
 	hlen := int(h.HeaderLength())
-	if vv.Size() < hlen || h.FragmentOffset() != 0 {
+	if pkt.Data.Size() < hlen || h.FragmentOffset() != 0 {
 		// We won't be able to handle this if it doesn't contain the
 		// full IPv4 header, or if it's a fragment not at offset 0
 		// (because it won't have the transport header).
@@ -47,15 +48,15 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.
 	}
 
 	// Skip the ip header, then deliver control message.
-	vv.TrimFront(hlen)
+	pkt.Data.TrimFront(hlen)
 	p := h.TransportProtocol()
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, vv)
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
-func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.VectorisedView) {
+func (e *endpoint) handleICMP(r *stack.Route, pkt tcpip.PacketBuffer) {
 	stats := r.Stats()
 	received := stats.ICMP.V4PacketsReceived
-	v := vv.First()
+	v := pkt.Data.First()
 	if len(v) < header.ICMPv4MinimumSize {
 		received.Invalid.Increment()
 		return
@@ -73,20 +74,23 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 		// checksum. We'll have to reset this before we hand the packet
 		// off.
 		h.SetChecksum(0)
-		gotChecksum := ^header.ChecksumVV(vv, 0 /* initial */)
+		gotChecksum := ^header.ChecksumVV(pkt.Data, 0 /* initial */)
 		if gotChecksum != wantChecksum {
 			// It's possible that a raw socket expects to receive this.
 			h.SetChecksum(wantChecksum)
-			e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, netHeader, vv)
+			e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, pkt)
 			received.Invalid.Increment()
 			return
 		}
 
 		// It's possible that a raw socket expects to receive this.
 		h.SetChecksum(wantChecksum)
-		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, netHeader, vv)
+		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, tcpip.PacketBuffer{
+			Data:          pkt.Data.Clone(nil),
+			NetworkHeader: append(buffer.View(nil), pkt.NetworkHeader...),
+		})
 
-		vv := vv.Clone(nil)
+		vv := pkt.Data.Clone(nil)
 		vv.TrimFront(header.ICMPv4MinimumSize)
 		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize)
 		pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize))
@@ -104,19 +108,19 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 	case header.ICMPv4EchoReply:
 		received.EchoReply.Increment()
 
-		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, netHeader, vv)
+		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, pkt)
 
 	case header.ICMPv4DstUnreachable:
 		received.DstUnreachable.Increment()
 
-		vv.TrimFront(header.ICMPv4MinimumSize)
+		pkt.Data.TrimFront(header.ICMPv4MinimumSize)
 		switch h.Code() {
 		case header.ICMPv4PortUnreachable:
-			e.handleControl(stack.ControlPortUnreachable, 0, vv)
+			e.handleControl(stack.ControlPortUnreachable, 0, pkt)
 
 		case header.ICMPv4FragmentationNeeded:
 			mtu := uint32(h.MTU())
-			e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), vv)
+			e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), pkt)
 		}
 
 	case header.ICMPv4SrcQuench:
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 1339f8474..26f1402ed 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -198,7 +198,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, hdr buff
 	return nil
 }
 
-func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) {
+func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) header.IPv4 {
 	ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
 	length := uint16(hdr.UsedLength() + payloadSize)
 	id := uint32(0)
@@ -218,19 +218,24 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 		DstAddr:     r.RemoteAddress,
 	})
 	ip.SetChecksum(^ip.CalculateChecksum())
+	return ip
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
-	e.addIPHeader(r, &hdr, payload.Size(), params)
+	ip := e.addIPHeader(r, &hdr, payload.Size(), params)
 
 	if loop&stack.PacketLoop != 0 {
 		views := make([]buffer.View, 1, 1+len(payload.Views()))
 		views[0] = hdr.View()
 		views = append(views, payload.Views()...)
-		vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views)
 		loopedR := r.MakeLoopedRoute()
-		e.HandlePacket(&loopedR, vv)
+
+		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
+			Data:          buffer.NewVectorisedView(len(views[0])+payload.Size(), views),
+			NetworkHeader: buffer.View(ip),
+		})
+
 		loopedR.Release()
 	}
 	if loop&stack.PacketOut == 0 {
@@ -301,7 +306,10 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.Vect
 	ip.SetChecksum(^ip.CalculateChecksum())
 
 	if loop&stack.PacketLoop != 0 {
-		e.HandlePacket(r, payload)
+		e.HandlePacket(r, tcpip.PacketBuffer{
+			Data:          payload,
+			NetworkHeader: buffer.View(ip),
+		})
 	}
 	if loop&stack.PacketOut == 0 {
 		return nil
@@ -314,22 +322,23 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.Vect
 
 // HandlePacket is called by the link layer when new ipv4 packets arrive for
 // this endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
-	headerView := vv.First()
+func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+	headerView := pkt.Data.First()
 	h := header.IPv4(headerView)
-	if !h.IsValid(vv.Size()) {
+	if !h.IsValid(pkt.Data.Size()) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
 		return
 	}
+	pkt.NetworkHeader = headerView[:h.HeaderLength()]
 
 	hlen := int(h.HeaderLength())
 	tlen := int(h.TotalLength())
-	vv.TrimFront(hlen)
-	vv.CapLength(tlen - hlen)
+	pkt.Data.TrimFront(hlen)
+	pkt.Data.CapLength(tlen - hlen)
 
 	more := (h.Flags() & header.IPv4FlagMoreFragments) != 0
 	if more || h.FragmentOffset() != 0 {
-		if vv.Size() == 0 {
+		if pkt.Data.Size() == 0 {
 			// Drop the packet as it's marked as a fragment but has
 			// no payload.
 			r.Stats().IP.MalformedPacketsReceived.Increment()
@@ -337,10 +346,10 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
 			return
 		}
 		// The packet is a fragment, let's try to reassemble it.
-		last := h.FragmentOffset() + uint16(vv.Size()) - 1
+		last := h.FragmentOffset() + uint16(pkt.Data.Size()) - 1
 		// Drop the packet if the fragmentOffset is incorrect. i.e the
-		// combination of fragmentOffset and vv.size() causes a wrap
-		// around resulting in last being less than the offset.
+		// combination of fragmentOffset and pkt.Data.size() causes a
+		// wrap around resulting in last being less than the offset.
 		if last < h.FragmentOffset() {
 			r.Stats().IP.MalformedPacketsReceived.Increment()
 			r.Stats().IP.MalformedFragmentsReceived.Increment()
@@ -348,7 +357,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
 		}
 		var ready bool
 		var err error
-		vv, ready, err = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, vv)
+		pkt.Data, ready, err = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, pkt.Data)
 		if err != nil {
 			r.Stats().IP.MalformedPacketsReceived.Increment()
 			r.Stats().IP.MalformedFragmentsReceived.Increment()
@@ -361,11 +370,11 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
 	p := h.TransportProtocol()
 	if p == header.ICMPv4ProtocolNumber {
 		headerView.CapLength(hlen)
-		e.handleICMP(r, headerView, vv)
+		e.handleICMP(r, pkt)
 		return
 	}
 	r.Stats().IP.PacketsDelivered.Increment()
-	e.dispatcher.DeliverTransportPacket(r, p, headerView, vv)
+	e.dispatcher.DeliverTransportPacket(r, p, pkt)
 }
 
 // Close cleans up resources associated with the endpoint.
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index 99f84acd7..f100d84ee 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -464,7 +464,9 @@ func TestInvalidFragments(t *testing.T) {
 			s.CreateNIC(nicid, sniffer.New(ep))
 
 			for _, pkt := range tc.packets {
-				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, buffer.NewVectorisedView(len(pkt), []buffer.View{pkt}))
+				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, tcpip.PacketBuffer{
+					Data: buffer.NewVectorisedView(len(pkt), []buffer.View{pkt}),
+				})
 			}
 
 			if got, want := s.Stats().IP.MalformedPacketsReceived.Value(), tc.wantMalformedIPPackets; got != want {
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 05e8c075b..58f8e80df 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -25,8 +25,8 @@ import (
 // the original packet that caused the ICMP one to be sent. This information is
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
-func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
-	h := header.IPv6(vv.First())
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+	h := header.IPv6(pkt.Data.First())
 
 	// We don't use IsValid() here because ICMP only requires that up to
 	// 1280 bytes of the original packet be included. So it's likely that it
@@ -40,10 +40,10 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.
 
 	// Skip the IP header, then handle the fragmentation header if there
 	// is one.
-	vv.TrimFront(header.IPv6MinimumSize)
+	pkt.Data.TrimFront(header.IPv6MinimumSize)
 	p := h.TransportProtocol()
 	if p == header.IPv6FragmentHeader {
-		f := header.IPv6Fragment(vv.First())
+		f := header.IPv6Fragment(pkt.Data.First())
 		if !f.IsValid() || f.FragmentOffset() != 0 {
 			// We can't handle fragments that aren't at offset 0
 			// because they don't have the transport headers.
@@ -52,19 +52,19 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.
 
 		// Skip fragmentation header and find out the actual protocol
 		// number.
-		vv.TrimFront(header.IPv6FragmentHeaderSize)
+		pkt.Data.TrimFront(header.IPv6FragmentHeaderSize)
 		p = f.TransportProtocol()
 	}
 
 	// Deliver the control packet to the transport endpoint.
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, vv)
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
-func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.VectorisedView) {
+func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.PacketBuffer) {
 	stats := r.Stats().ICMP
 	sent := stats.V6PacketsSent
 	received := stats.V6PacketsReceived
-	v := vv.First()
+	v := pkt.Data.First()
 	if len(v) < header.ICMPv6MinimumSize {
 		received.Invalid.Increment()
 		return
@@ -77,7 +77,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 	// Only the first view in vv is accounted for by h. To account for the
 	// rest of vv, a shallow copy is made and the first view is removed.
 	// This copy is used as extra payload during the checksum calculation.
-	payload := vv
+	payload := pkt.Data
 	payload.RemoveFirst()
 	if got, want := h.Checksum(), header.ICMPv6Checksum(h, iph.SourceAddress(), iph.DestinationAddress(), payload); got != want {
 		received.Invalid.Increment()
@@ -113,9 +113,9 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			received.Invalid.Increment()
 			return
 		}
-		vv.TrimFront(header.ICMPv6PacketTooBigMinimumSize)
+		pkt.Data.TrimFront(header.ICMPv6PacketTooBigMinimumSize)
 		mtu := h.MTU()
-		e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), vv)
+		e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), pkt)
 
 	case header.ICMPv6DstUnreachable:
 		received.DstUnreachable.Increment()
@@ -123,10 +123,10 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			received.Invalid.Increment()
 			return
 		}
-		vv.TrimFront(header.ICMPv6DstUnreachableMinimumSize)
+		pkt.Data.TrimFront(header.ICMPv6DstUnreachableMinimumSize)
 		switch h.Code() {
 		case header.ICMPv6PortUnreachable:
-			e.handleControl(stack.ControlPortUnreachable, 0, vv)
+			e.handleControl(stack.ControlPortUnreachable, 0, pkt)
 		}
 
 	case header.ICMPv6NeighborSolicit:
@@ -189,9 +189,9 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress[:]),
 		}
 		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length()))
-		pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
-		pkt.SetType(header.ICMPv6NeighborAdvert)
-		na := header.NDPNeighborAdvert(pkt.NDPPayload())
+		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
+		packet.SetType(header.ICMPv6NeighborAdvert)
+		na := header.NDPNeighborAdvert(packet.NDPPayload())
 		na.SetSolicitedFlag(true)
 		na.SetOverrideFlag(true)
 		na.SetTargetAddress(targetAddr)
@@ -209,7 +209,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 		r := r.Clone()
 		defer r.Release()
 		r.LocalAddress = targetAddr
-		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
 		// TODO(tamird/ghanan): there exists an explicit NDP option that is
 		// used to update the neighbor table with link addresses for a
@@ -285,13 +285,13 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			received.Invalid.Increment()
 			return
 		}
-		vv.TrimFront(header.ICMPv6EchoMinimumSize)
+		pkt.Data.TrimFront(header.ICMPv6EchoMinimumSize)
 		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize)
-		pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize))
-		copy(pkt, h)
-		pkt.SetType(header.ICMPv6EchoReply)
-		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, vv))
-		if err := r.WritePacket(nil /* gso */, hdr, vv, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}); err != nil {
+		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize))
+		copy(packet, h)
+		packet.SetType(header.ICMPv6EchoReply)
+		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data))
+		if err := r.WritePacket(nil /* gso */, hdr, pkt.Data, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}); err != nil {
 			sent.Dropped.Increment()
 			return
 		}
@@ -303,7 +303,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			received.Invalid.Increment()
 			return
 		}
-		e.dispatcher.DeliverTransportPacket(r, header.ICMPv6ProtocolNumber, netHeader, vv)
+		e.dispatcher.DeliverTransportPacket(r, header.ICMPv6ProtocolNumber, pkt)
 
 	case header.ICMPv6TimeExceeded:
 		received.TimeExceeded.Increment()
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index d686f79ce..6037a1ef8 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -65,7 +65,7 @@ type stubDispatcher struct {
 	stack.TransportDispatcher
 }
 
-func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, buffer.View, buffer.VectorisedView) {
+func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, tcpip.PacketBuffer) {
 }
 
 type stubLinkAddressCache struct {
@@ -147,7 +147,9 @@ func TestICMPCounts(t *testing.T) {
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
 		})
-		ep.HandlePacket(&r, hdr.View().ToVectorisedView())
+		ep.HandlePacket(&r, tcpip.PacketBuffer{
+			Data: hdr.View().ToVectorisedView(),
+		})
 	}
 
 	for _, typ := range types {
@@ -280,7 +282,9 @@ func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.
 		views := []buffer.View{pkt.Header, pkt.Payload}
 		size := len(pkt.Header) + len(pkt.Payload)
 		vv := buffer.NewVectorisedView(size, views)
-		args.dst.InjectLinkAddr(pkt.Proto, args.dst.LinkAddress(), vv)
+		args.dst.InjectLinkAddr(pkt.Proto, args.dst.LinkAddress(), tcpip.PacketBuffer{
+			Data: vv,
+		})
 	}
 
 	if pkt.Proto != ProtocolNumber {
@@ -498,7 +502,9 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.Inject(ProtocolNumber, hdr.View().ToVectorisedView())
+				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+					Data: hdr.View().ToVectorisedView(),
+				})
 			}
 
 			stats := s.Stats().ICMP.V6PacketsReceived
@@ -673,7 +679,9 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.Inject(ProtocolNumber, hdr.View().ToVectorisedView())
+				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+					Data: hdr.View().ToVectorisedView(),
+				})
 			}
 
 			stats := s.Stats().ICMP.V6PacketsReceived
@@ -849,9 +857,9 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.Inject(ProtocolNumber,
-					buffer.NewVectorisedView(header.IPv6MinimumSize+size+payloadSize,
-						[]buffer.View{hdr.View(), payload}))
+				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+					Data: buffer.NewVectorisedView(header.IPv6MinimumSize+size+payloadSize, []buffer.View{hdr.View(), payload}),
+				})
 			}
 
 			stats := s.Stats().ICMP.V6PacketsReceived
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 5898f8f9e..805d1739c 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -97,7 +97,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
 	return 0
 }
 
-func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) {
+func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) header.IPv6 {
 	length := uint16(hdr.UsedLength() + payloadSize)
 	ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
 	ip.Encode(&header.IPv6Fields{
@@ -108,19 +108,24 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 		SrcAddr:       r.LocalAddress,
 		DstAddr:       r.RemoteAddress,
 	})
+	return ip
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
-	e.addIPHeader(r, &hdr, payload.Size(), params)
+	ip := e.addIPHeader(r, &hdr, payload.Size(), params)
 
 	if loop&stack.PacketLoop != 0 {
 		views := make([]buffer.View, 1, 1+len(payload.Views()))
 		views[0] = hdr.View()
 		views = append(views, payload.Views()...)
-		vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views)
 		loopedR := r.MakeLoopedRoute()
-		e.HandlePacket(&loopedR, vv)
+
+		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
+			Data:          buffer.NewVectorisedView(len(views[0])+payload.Size(), views),
+			NetworkHeader: buffer.View(ip),
+		})
+
 		loopedR.Release()
 	}
 	if loop&stack.PacketOut == 0 {
@@ -160,24 +165,25 @@ func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.Vector
 
 // HandlePacket is called by the link layer when new ipv6 packets arrive for
 // this endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
-	headerView := vv.First()
+func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+	headerView := pkt.Data.First()
 	h := header.IPv6(headerView)
-	if !h.IsValid(vv.Size()) {
+	if !h.IsValid(pkt.Data.Size()) {
 		return
 	}
 
-	vv.TrimFront(header.IPv6MinimumSize)
-	vv.CapLength(int(h.PayloadLength()))
+	pkt.NetworkHeader = headerView[:header.IPv6MinimumSize]
+	pkt.Data.TrimFront(header.IPv6MinimumSize)
+	pkt.Data.CapLength(int(h.PayloadLength()))
 
 	p := h.TransportProtocol()
 	if p == header.ICMPv6ProtocolNumber {
-		e.handleICMP(r, headerView, vv)
+		e.handleICMP(r, headerView, pkt)
 		return
 	}
 
 	r.Stats().IP.PacketsDelivered.Increment()
-	e.dispatcher.DeliverTransportPacket(r, p, headerView, vv)
+	e.dispatcher.DeliverTransportPacket(r, p, pkt)
 }
 
 // Close cleans up resources associated with the endpoint.
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index deaa9b7f3..1cbfa7278 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -55,7 +55,9 @@ func testReceiveICMP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 		DstAddr:       dst,
 	})
 
-	e.Inject(ProtocolNumber, hdr.View().ToVectorisedView())
+	e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+		Data: hdr.View().ToVectorisedView(),
+	})
 
 	stats := s.Stats().ICMP.V6PacketsReceived
 
@@ -111,7 +113,9 @@ func testReceiveUDP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 		DstAddr:       dst,
 	})
 
-	e.Inject(ProtocolNumber, hdr.View().ToVectorisedView())
+	e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+		Data: hdr.View().ToVectorisedView(),
+	})
 
 	stat := s.Stats().UDP.PacketsReceived
 
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index 69ab7ba12..0dbce14a0 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -98,7 +98,9 @@ func TestHopLimitValidation(t *testing.T) {
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
 		})
-		ep.HandlePacket(r, hdr.View().ToVectorisedView())
+		ep.HandlePacket(r, tcpip.PacketBuffer{
+			Data: hdr.View().ToVectorisedView(),
+		})
 	}
 
 	types := []struct {
@@ -345,7 +347,9 @@ func TestRouterAdvertValidation(t *testing.T) {
 				t.Fatalf("got rxRA = %d, want = 0", got)
 			}
 
-			e.Inject(header.IPv6ProtocolNumber, hdr.View().ToVectorisedView())
+			e.InjectInbound(header.IPv6ProtocolNumber, tcpip.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
 
 			if test.expectedSuccess {
 				if got := invalid.Value(); got != 0 {
diff --git a/pkg/tcpip/packet_buffer.go b/pkg/tcpip/packet_buffer.go
new file mode 100644
index 000000000..10b04239d
--- /dev/null
+++ b/pkg/tcpip/packet_buffer.go
@@ -0,0 +1,54 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at //
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcpip
+
+import "gvisor.dev/gvisor/pkg/tcpip/buffer"
+
+// A PacketBuffer contains all the data of a network packet.
+//
+// As a PacketBuffer traverses up the stack, it may be necessary to pass it to
+// multiple endpoints. Clone() should be called in such cases so that
+// modifications to the Data field do not affect other copies.
+//
+// +stateify savable
+type PacketBuffer struct {
+	// Data holds the payload of the packet. For inbound packets, it also
+	// holds the headers, which are consumed as the packet moves up the
+	// stack. Headers are guaranteed not to be split across views.
+	//
+	// The bytes backing Data are immutable, but Data itself may be trimmed
+	// or otherwise modified.
+	Data buffer.VectorisedView
+
+	// The bytes backing these views are immutable. Each field may be nil
+	// if either it has not been set yet or no such header exists (e.g.
+	// packets sent via loopback may not have a link header).
+	//
+	// These fields may be Views into other Views. SR dosen't support this,
+	// so deep copies are necessary in some cases.
+	LinkHeader      buffer.View
+	NetworkHeader   buffer.View
+	TransportHeader buffer.View
+}
+
+// Clone makes a copy of pk. It clones the Data field, which creates a new
+// VectorisedView but does not deep copy the underlying bytes.
+func (pk PacketBuffer) Clone() PacketBuffer {
+	return PacketBuffer{
+		Data:            pk.Data.Clone(nil),
+		LinkHeader:      pk.LinkHeader,
+		NetworkHeader:   pk.NetworkHeader,
+		TransportHeader: pk.TransportHeader,
+	}
+}
diff --git a/pkg/tcpip/packet_buffer_state.go b/pkg/tcpip/packet_buffer_state.go
new file mode 100644
index 000000000..04c4cf136
--- /dev/null
+++ b/pkg/tcpip/packet_buffer_state.go
@@ -0,0 +1,26 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcpip
+
+import "gvisor.dev/gvisor/pkg/tcpip/buffer"
+
+// beforeSave is invoked by stateify.
+func (pk *PacketBuffer) beforeSave() {
+	// Non-Data fields may be slices of the Data field. This causes
+	// problems for SR, so during save we make each header independent.
+	pk.LinkHeader = append(buffer.View(nil), pk.LinkHeader...)
+	pk.NetworkHeader = append(buffer.View(nil), pk.NetworkHeader...)
+	pk.TransportHeader = append(buffer.View(nil), pk.TransportHeader...)
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 525a25218..cc789b5af 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -328,7 +328,9 @@ func TestDADFail(t *testing.T) {
 			// Receive a packet to simulate multiple nodes owning or
 			// attempting to own the same address.
 			hdr := test.makeBuf(addr1)
-			e.Inject(header.IPv6ProtocolNumber, hdr.View().ToVectorisedView())
+			e.InjectInbound(header.IPv6ProtocolNumber, tcpip.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
 
 			stat := test.getStat(s.Stats().ICMP.V6PacketsReceived)
 			if got := stat.Value(); got != 1 {
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 12969c74e..28a28ae6e 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -723,10 +723,10 @@ func (n *NIC) leaveGroupLocked(addr tcpip.Address) *tcpip.Error {
 	return nil
 }
 
-func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, vv buffer.VectorisedView) {
+func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt tcpip.PacketBuffer) {
 	r := makeRoute(protocol, dst, src, localLinkAddr, ref, false /* handleLocal */, false /* multicastLoop */)
 	r.RemoteLinkAddress = remotelinkAddr
-	ref.ep.HandlePacket(&r, vv)
+	ref.ep.HandlePacket(&r, pkt)
 	ref.decRef()
 }
 
@@ -736,9 +736,9 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address,
 // Note that the ownership of the slice backing vv is retained by the caller.
 // This rule applies only to the slice itself, not to the items of the slice;
 // the ownership of the items is not retained by the caller.
-func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
+func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	n.stats.Rx.Packets.Increment()
-	n.stats.Rx.Bytes.IncrementBy(uint64(vv.Size()))
+	n.stats.Rx.Bytes.IncrementBy(uint64(pkt.Data.Size()))
 
 	netProto, ok := n.stack.networkProtocols[protocol]
 	if !ok {
@@ -763,22 +763,22 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	}
 	n.mu.RUnlock()
 	for _, ep := range packetEPs {
-		ep.HandlePacket(n.id, local, protocol, vv.Clone(nil), linkHeader)
+		ep.HandlePacket(n.id, local, protocol, pkt.Clone())
 	}
 
 	if netProto.Number() == header.IPv4ProtocolNumber || netProto.Number() == header.IPv6ProtocolNumber {
 		n.stack.stats.IP.PacketsReceived.Increment()
 	}
 
-	if len(vv.First()) < netProto.MinimumPacketSize() {
+	if len(pkt.Data.First()) < netProto.MinimumPacketSize() {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
 
-	src, dst := netProto.ParseAddresses(vv.First())
+	src, dst := netProto.ParseAddresses(pkt.Data.First())
 
 	if ref := n.getRef(protocol, dst); ref != nil {
-		handlePacket(protocol, dst, src, linkEP.LinkAddress(), remote, ref, vv)
+		handlePacket(protocol, dst, src, linkEP.LinkAddress(), remote, ref, pkt)
 		return
 	}
 
@@ -806,20 +806,20 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 		if ok {
 			r.RemoteAddress = src
 			// TODO(b/123449044): Update the source NIC as well.
-			ref.ep.HandlePacket(&r, vv)
+			ref.ep.HandlePacket(&r, pkt)
 			ref.decRef()
 		} else {
 			// n doesn't have a destination endpoint.
 			// Send the packet out of n.
-			hdr := buffer.NewPrependableFromView(vv.First())
-			vv.RemoveFirst()
+			hdr := buffer.NewPrependableFromView(pkt.Data.First())
+			pkt.Data.RemoveFirst()
 
 			// TODO(b/128629022): use route.WritePacket.
-			if err := n.linkEP.WritePacket(&r, nil /* gso */, hdr, vv, protocol); err != nil {
+			if err := n.linkEP.WritePacket(&r, nil /* gso */, hdr, pkt.Data, protocol); err != nil {
 				r.Stats().IP.OutgoingPacketErrors.Increment()
 			} else {
 				n.stats.Tx.Packets.Increment()
-				n.stats.Tx.Bytes.IncrementBy(uint64(hdr.UsedLength() + vv.Size()))
+				n.stats.Tx.Bytes.IncrementBy(uint64(hdr.UsedLength() + pkt.Data.Size()))
 			}
 		}
 		return
@@ -833,7 +833,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 
 // DeliverTransportPacket delivers the packets to the appropriate transport
 // protocol endpoint.
-func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView) {
+func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) {
 	state, ok := n.stack.transportProtocols[protocol]
 	if !ok {
 		n.stack.stats.UnknownProtocolRcvdPackets.Increment()
@@ -845,41 +845,41 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 	// Raw socket packets are delivered based solely on the transport
 	// protocol number. We do not inspect the payload to ensure it's
 	// validly formed.
-	n.stack.demux.deliverRawPacket(r, protocol, netHeader, vv)
+	n.stack.demux.deliverRawPacket(r, protocol, pkt)
 
-	if len(vv.First()) < transProto.MinimumPacketSize() {
+	if len(pkt.Data.First()) < transProto.MinimumPacketSize() {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
 
-	srcPort, dstPort, err := transProto.ParsePorts(vv.First())
+	srcPort, dstPort, err := transProto.ParsePorts(pkt.Data.First())
 	if err != nil {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
 
 	id := TransportEndpointID{dstPort, r.LocalAddress, srcPort, r.RemoteAddress}
-	if n.stack.demux.deliverPacket(r, protocol, netHeader, vv, id) {
+	if n.stack.demux.deliverPacket(r, protocol, pkt, id) {
 		return
 	}
 
 	// Try to deliver to per-stack default handler.
 	if state.defaultHandler != nil {
-		if state.defaultHandler(r, id, netHeader, vv) {
+		if state.defaultHandler(r, id, pkt) {
 			return
 		}
 	}
 
 	// We could not find an appropriate destination for this packet, so
 	// deliver it to the global handler.
-	if !transProto.HandleUnknownDestinationPacket(r, id, netHeader, vv) {
+	if !transProto.HandleUnknownDestinationPacket(r, id, pkt) {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 	}
 }
 
 // DeliverTransportControlPacket delivers control packets to the appropriate
 // transport protocol endpoint.
-func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView) {
+func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer) {
 	state, ok := n.stack.transportProtocols[trans]
 	if !ok {
 		return
@@ -890,17 +890,17 @@ func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcp
 	// ICMPv4 only guarantees that 8 bytes of the transport protocol will
 	// be present in the payload. We know that the ports are within the
 	// first 8 bytes for all known transport protocols.
-	if len(vv.First()) < 8 {
+	if len(pkt.Data.First()) < 8 {
 		return
 	}
 
-	srcPort, dstPort, err := transProto.ParsePorts(vv.First())
+	srcPort, dstPort, err := transProto.ParsePorts(pkt.Data.First())
 	if err != nil {
 		return
 	}
 
 	id := TransportEndpointID{srcPort, local, dstPort, remote}
-	if n.stack.demux.deliverControlPacket(n, net, trans, typ, extra, vv, id) {
+	if n.stack.demux.deliverControlPacket(n, net, trans, typ, extra, pkt, id) {
 		return
 	}
 }
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index d7c124e81..5806d294c 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -64,16 +64,15 @@ type TransportEndpoint interface {
 	UniqueID() uint64
 
 	// HandlePacket is called by the stack when new packets arrive to
-	// this transport endpoint.
+	// this transport endpoint. It sets pkt.TransportHeader.
 	//
-	// HandlePacket takes ownership of vv.
-	HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView)
+	// HandlePacket takes ownership of pkt.
+	HandlePacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer)
 
-	// HandleControlPacket is called by the stack when new control (e.g.,
+	// HandleControlPacket is called by the stack when new control (e.g.
 	// ICMP) packets arrive to this transport endpoint.
-	//
-	// HandleControlPacket takes ownership of vv.
-	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView)
+	// HandleControlPacket takes ownership of pkt.
+	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt tcpip.PacketBuffer)
 
 	// Close puts the endpoint in a closed state and frees all resources
 	// associated with it. This cleanup may happen asynchronously. Wait can
@@ -99,8 +98,8 @@ type RawTransportEndpoint interface {
 	// this transport endpoint. The packet contains all data from the link
 	// layer up.
 	//
-	// HandlePacket takes ownership of packet and netHeader.
-	HandlePacket(r *Route, netHeader buffer.View, packet buffer.VectorisedView)
+	// HandlePacket takes ownership of pkt.
+	HandlePacket(r *Route, pkt tcpip.PacketBuffer)
 }
 
 // PacketEndpoint is the interface that needs to be implemented by packet
@@ -117,8 +116,8 @@ type PacketEndpoint interface {
 	// linkHeader may have a length of 0, in which case the PacketEndpoint
 	// should construct its own ethernet header for applications.
 	//
-	// HandlePacket takes ownership of packet and linkHeader.
-	HandlePacket(nicid tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, packet buffer.VectorisedView, linkHeader buffer.View)
+	// HandlePacket takes ownership of pkt.
+	HandlePacket(nicid tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
 }
 
 // TransportProtocol is the interface that needs to be implemented by transport
@@ -148,7 +147,9 @@ type TransportProtocol interface {
 	//
 	// The return value indicates whether the packet was well-formed (for
 	// stats purposes only).
-	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool
+	//
+	// HandleUnknownDestinationPacket takes ownership of pkt.
+	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) bool
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
@@ -166,17 +167,21 @@ type TransportProtocol interface {
 // the network layer.
 type TransportDispatcher interface {
 	// DeliverTransportPacket delivers packets to the appropriate
-	// transport protocol endpoint. It also returns the network layer
-	// header for the enpoint to inspect or pass up the stack.
+	// transport protocol endpoint.
+	//
+	// pkt.NetworkHeader must be set before calling DeliverTransportPacket.
 	//
-	// DeliverTransportPacket takes ownership of vv and netHeader.
-	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView)
+	// DeliverTransportPacket takes ownership of pkt.
+	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer)
 
 	// DeliverTransportControlPacket delivers control packets to the
 	// appropriate transport protocol endpoint.
 	//
-	// DeliverTransportControlPacket takes ownership of vv.
-	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView)
+	// pkt.NetworkHeader must be set before calling
+	// DeliverTransportControlPacket.
+	//
+	// DeliverTransportControlPacket takes ownership of pkt.
+	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer)
 }
 
 // PacketLooping specifies where an outbound packet should be sent.
@@ -248,10 +253,10 @@ type NetworkEndpoint interface {
 	NICID() tcpip.NICID
 
 	// HandlePacket is called by the link layer when new packets arrive to
-	// this network endpoint.
+	// this network endpoint. It sets pkt.NetworkHeader.
 	//
-	// HandlePacket takes ownership of vv.
-	HandlePacket(r *Route, vv buffer.VectorisedView)
+	// HandlePacket takes ownership of pkt.
+	HandlePacket(r *Route, pkt tcpip.PacketBuffer)
 
 	// Close is called when the endpoint is reomved from a stack.
 	Close()
@@ -294,11 +299,14 @@ type NetworkProtocol interface {
 // the data link layer.
 type NetworkDispatcher interface {
 	// DeliverNetworkPacket finds the appropriate network protocol endpoint
-	// and hands the packet over for further processing. linkHeader may have
-	// length 0 when the caller does not have ethernet data.
+	// and hands the packet over for further processing.
+	//
+	// pkt.LinkHeader may or may not be set before calling
+	// DeliverNetworkPacket. Some packets do not have link headers (e.g.
+	// packets sent via loopback), and won't have the field set.
 	//
-	// DeliverNetworkPacket takes ownership of vv and linkHeader.
-	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View)
+	// DeliverNetworkPacket takes ownership of pkt.
+	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
 }
 
 // LinkEndpointCapabilities is the type associated with the capabilities
@@ -329,7 +337,9 @@ const (
 
 // LinkEndpoint is the interface implemented by data link layer protocols (e.g.,
 // ethernet, loopback, raw) and used by network layer protocols to send packets
-// out through the implementer's data link endpoint.
+// out through the implementer's data link endpoint. When a link header exists,
+// it sets each tcpip.PacketBuffer's LinkHeader field before passing it up the
+// stack.
 type LinkEndpoint interface {
 	// MTU is the maximum transmission unit for this endpoint. This is
 	// usually dictated by the backing physical network; when such a
@@ -395,7 +405,7 @@ type InjectableLinkEndpoint interface {
 	LinkEndpoint
 
 	// InjectInbound injects an inbound packet.
-	InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView)
+	InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
 
 	// InjectOutbound writes a fully formed outbound packet directly to the
 	// link.
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 8b141cafd..08599d765 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -51,7 +51,7 @@ const (
 
 type transportProtocolState struct {
 	proto          TransportProtocol
-	defaultHandler func(r *Route, id TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool
+	defaultHandler func(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) bool
 }
 
 // TCPProbeFunc is the expected function type for a TCP probe function to be
@@ -641,7 +641,7 @@ func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber,
 //
 // It must be called only during initialization of the stack. Changing it as the
 // stack is operating is not supported.
-func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, buffer.View, buffer.VectorisedView) bool) {
+func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, tcpip.PacketBuffer) bool) {
 	state := s.transportProtocols[p]
 	if state != nil {
 		state.defaultHandler = h
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 9dae853d0..1fac5477f 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -86,28 +86,28 @@ func (f *fakeNetworkEndpoint) ID() *stack.NetworkEndpointID {
 	return &f.id
 }
 
-func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
+func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 	// Increment the received packet count in the protocol descriptor.
 	f.proto.packetCount[int(f.id.LocalAddress[0])%len(f.proto.packetCount)]++
 
 	// Consume the network header.
-	b := vv.First()
-	vv.TrimFront(fakeNetHeaderLen)
+	b := pkt.Data.First()
+	pkt.Data.TrimFront(fakeNetHeaderLen)
 
 	// Handle control packets.
 	if b[2] == uint8(fakeControlProtocol) {
-		nb := vv.First()
+		nb := pkt.Data.First()
 		if len(nb) < fakeNetHeaderLen {
 			return
 		}
 
-		vv.TrimFront(fakeNetHeaderLen)
-		f.dispatcher.DeliverTransportControlPacket(tcpip.Address(nb[1:2]), tcpip.Address(nb[0:1]), fakeNetNumber, tcpip.TransportProtocolNumber(nb[2]), stack.ControlPortUnreachable, 0, vv)
+		pkt.Data.TrimFront(fakeNetHeaderLen)
+		f.dispatcher.DeliverTransportControlPacket(tcpip.Address(nb[1:2]), tcpip.Address(nb[0:1]), fakeNetNumber, tcpip.TransportProtocolNumber(nb[2]), stack.ControlPortUnreachable, 0, pkt)
 		return
 	}
 
 	// Dispatch the packet to the transport protocol.
-	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(b[2]), buffer.View([]byte{}), vv)
+	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(b[2]), pkt)
 }
 
 func (f *fakeNetworkEndpoint) MaxHeaderLength() uint16 {
@@ -138,7 +138,9 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr bu
 		views[0] = hdr.View()
 		views = append(views, payload.Views()...)
 		vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views)
-		f.HandlePacket(r, vv)
+		f.HandlePacket(r, tcpip.PacketBuffer{
+			Data: vv,
+		})
 	}
 	if loop&stack.PacketOut == 0 {
 		return nil
@@ -259,7 +261,9 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet with wrong address is not delivered.
 	buf[0] = 3
-	ep.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeNet.packetCount[1] != 0 {
 		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 0)
 	}
@@ -269,7 +273,9 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet is delivered to first endpoint.
 	buf[0] = 1
-	ep.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeNet.packetCount[1] != 1 {
 		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1)
 	}
@@ -279,7 +285,9 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet is delivered to second endpoint.
 	buf[0] = 2
-	ep.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeNet.packetCount[1] != 1 {
 		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1)
 	}
@@ -288,7 +296,9 @@ func TestNetworkReceive(t *testing.T) {
 	}
 
 	// Make sure packet is not delivered if protocol number is wrong.
-	ep.Inject(fakeNetNumber-1, buf.ToVectorisedView())
+	ep.InjectInbound(fakeNetNumber-1, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeNet.packetCount[1] != 1 {
 		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1)
 	}
@@ -298,7 +308,9 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet that is too small is dropped.
 	buf.CapLength(2)
-	ep.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeNet.packetCount[1] != 1 {
 		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1)
 	}
@@ -373,7 +385,9 @@ func testFailingRecv(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte b
 
 func testRecvInternal(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte byte, ep *channel.Endpoint, buf buffer.View, want int) {
 	t.Helper()
-	ep.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if got := fakeNet.PacketCount(localAddrByte); got != want {
 		t.Errorf("receive packet count: got = %d, want %d", got, want)
 	}
@@ -1795,7 +1809,9 @@ func TestNICStats(t *testing.T) {
 
 	// Send a packet to address 1.
 	buf := buffer.NewView(30)
-	ep1.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if got, want := s.NICInfo()[1].Stats.Rx.Packets.Value(), uint64(1); got != want {
 		t.Errorf("got Rx.Packets.Value() = %d, want = %d", got, want)
 	}
@@ -1855,7 +1871,9 @@ func TestNICForwarding(t *testing.T) {
 	// Send a packet to address 3.
 	buf := buffer.NewView(30)
 	buf[0] = 3
-	ep1.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 
 	select {
 	case <-ep2.C:
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index ccd3d030e..594570216 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -21,7 +21,6 @@ import (
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
@@ -86,7 +85,7 @@ func (epsByNic *endpointsByNic) transportEndpoints() []TransportEndpoint {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) {
+func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
 	epsByNic.mu.RLock()
 
 	mpep, ok := epsByNic.endpoints[r.ref.nic.ID()]
@@ -100,18 +99,18 @@ func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, v
 	// If this is a broadcast or multicast datagram, deliver the datagram to all
 	// endpoints bound to the right device.
 	if isMulticastOrBroadcast(id.LocalAddress) {
-		mpep.handlePacketAll(r, id, vv)
+		mpep.handlePacketAll(r, id, pkt)
 		epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
 		return
 	}
 
 	// multiPortEndpoints are guaranteed to have at least one element.
-	selectEndpoint(id, mpep, epsByNic.seed).HandlePacket(r, id, vv)
+	selectEndpoint(id, mpep, epsByNic.seed).HandlePacket(r, id, pkt)
 	epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView) {
+func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt tcpip.PacketBuffer) {
 	epsByNic.mu.RLock()
 	defer epsByNic.mu.RUnlock()
 
@@ -127,7 +126,7 @@ func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpoint
 	// broadcast like we are doing with handlePacket above?
 
 	// multiPortEndpoints are guaranteed to have at least one element.
-	selectEndpoint(id, mpep, epsByNic.seed).HandleControlPacket(id, typ, extra, vv)
+	selectEndpoint(id, mpep, epsByNic.seed).HandleControlPacket(id, typ, extra, pkt)
 }
 
 // registerEndpoint returns true if it succeeds. It fails and returns
@@ -258,18 +257,16 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32
 	return mpep.endpointsArr[idx]
 }
 
-func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, vv buffer.VectorisedView) {
+func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
 	ep.mu.RLock()
 	for i, endpoint := range ep.endpointsArr {
-		// HandlePacket modifies vv, so each endpoint needs its own copy except for
-		// the final one.
+		// HandlePacket takes ownership of pkt, so each endpoint needs
+		// its own copy except for the final one.
 		if i == len(ep.endpointsArr)-1 {
-			endpoint.HandlePacket(r, id, vv)
+			endpoint.HandlePacket(r, id, pkt)
 			break
 		}
-		vvCopy := buffer.NewView(vv.Size())
-		copy(vvCopy, vv.ToView())
-		endpoint.HandlePacket(r, id, vvCopy.ToVectorisedView())
+		endpoint.HandlePacket(r, id, pkt.Clone())
 	}
 	ep.mu.RUnlock() // Don't use defer for performance reasons.
 }
@@ -395,7 +392,7 @@ var loopbackSubnet = func() tcpip.Subnet {
 // deliverPacket attempts to find one or more matching transport endpoints, and
 // then, if matches are found, delivers the packet to them. Returns true if it
 // found one or more endpoints, false otherwise.
-func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
 	if !ok {
 		return false
@@ -408,8 +405,8 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 	// transport endpoints.
 	var destEps []*endpointsByNic
 	if protocol == header.UDPProtocolNumber && isMulticastOrBroadcast(id.LocalAddress) {
-		destEps = d.findAllEndpointsLocked(eps, vv, id)
-	} else if ep := d.findEndpointLocked(eps, vv, id); ep != nil {
+		destEps = d.findAllEndpointsLocked(eps, id)
+	} else if ep := d.findEndpointLocked(eps, id); ep != nil {
 		destEps = append(destEps, ep)
 	}
 
@@ -424,17 +421,19 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 		return false
 	}
 
-	// Deliver the packet.
-	for _, ep := range destEps {
-		ep.handlePacket(r, id, vv)
+	// HandlePacket takes ownership of pkt, so each endpoint needs its own
+	// copy except for the final one.
+	for _, ep := range destEps[:len(destEps)-1] {
+		ep.handlePacket(r, id, pkt.Clone())
 	}
+	destEps[len(destEps)-1].handlePacket(r, id, pkt)
 
 	return true
 }
 
 // deliverRawPacket attempts to deliver the given packet and returns whether it
 // was delivered successfully.
-func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView) bool {
+func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) bool {
 	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
 	if !ok {
 		return false
@@ -448,7 +447,7 @@ func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportPr
 	for _, rawEP := range eps.rawEndpoints {
 		// Each endpoint gets its own copy of the packet for the sake
 		// of save/restore.
-		rawEP.HandlePacket(r, buffer.NewViewFromBytes(netHeader), vv.ToView().ToVectorisedView())
+		rawEP.HandlePacket(r, pkt)
 		foundRaw = true
 	}
 	eps.mu.RUnlock()
@@ -458,7 +457,7 @@ func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportPr
 
 // deliverControlPacket attempts to deliver the given control packet. Returns
 // true if it found an endpoint, false otherwise.
-func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{net, trans}]
 	if !ok {
 		return false
@@ -466,7 +465,7 @@ func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtoco
 
 	// Try to find the endpoint.
 	eps.mu.RLock()
-	ep := d.findEndpointLocked(eps, vv, id)
+	ep := d.findEndpointLocked(eps, id)
 	eps.mu.RUnlock()
 
 	// Fail if we didn't find one.
@@ -475,12 +474,12 @@ func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtoco
 	}
 
 	// Deliver the packet.
-	ep.handleControlPacket(n, id, typ, extra, vv)
+	ep.handleControlPacket(n, id, typ, extra, pkt)
 
 	return true
 }
 
-func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, vv buffer.VectorisedView, id TransportEndpointID) []*endpointsByNic {
+func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, id TransportEndpointID) []*endpointsByNic {
 	var matchedEPs []*endpointsByNic
 	// Try to find a match with the id as provided.
 	if ep, ok := eps.endpoints[id]; ok {
@@ -514,8 +513,8 @@ func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, vv bu
 
 // findEndpointLocked returns the endpoint that most closely matches the given
 // id.
-func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer.VectorisedView, id TransportEndpointID) *endpointsByNic {
-	if matchedEPs := d.findAllEndpointsLocked(eps, vv, id); len(matchedEPs) > 0 {
+func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, id TransportEndpointID) *endpointsByNic {
+	if matchedEPs := d.findAllEndpointsLocked(eps, id); len(matchedEPs) > 0 {
 		return matchedEPs[0]
 	}
 	return nil
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 210233dc0..f54117c4e 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -156,7 +156,9 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpName string
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEPs[linkEpName].Inject(ipv6.ProtocolNumber, buf.ToVectorisedView())
+	c.linkEPs[linkEpName].InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 }
 
 func TestTransportDemuxerRegister(t *testing.T) {
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 203e79f56..2cacea99a 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -197,7 +197,7 @@ func (*fakeTransportEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Erro
 	return tcpip.FullAddress{}, nil
 }
 
-func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ buffer.VectorisedView) {
+func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ tcpip.PacketBuffer) {
 	// Increment the number of received packets.
 	f.proto.packetCount++
 	if f.acceptQueue != nil {
@@ -214,7 +214,7 @@ func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportE
 	}
 }
 
-func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, buffer.VectorisedView) {
+func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, tcpip.PacketBuffer) {
 	// Increment the number of received control packets.
 	f.proto.controlCount++
 }
@@ -271,7 +271,7 @@ func (*fakeTransportProtocol) ParsePorts(buffer.View) (src, dst uint16, err *tcp
 	return 0, 0, nil
 }
 
-func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, buffer.View, buffer.VectorisedView) bool {
+func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
 	return true
 }
 
@@ -342,7 +342,9 @@ func TestTransportReceive(t *testing.T) {
 	// Make sure packet with wrong protocol is not delivered.
 	buf[0] = 1
 	buf[2] = 0
-	linkEP.Inject(fakeNetNumber, buf.ToVectorisedView())
+	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeTrans.packetCount != 0 {
 		t.Errorf("packetCount = %d, want %d", fakeTrans.packetCount, 0)
 	}
@@ -351,7 +353,9 @@ func TestTransportReceive(t *testing.T) {
 	buf[0] = 1
 	buf[1] = 3
 	buf[2] = byte(fakeTransNumber)
-	linkEP.Inject(fakeNetNumber, buf.ToVectorisedView())
+	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeTrans.packetCount != 0 {
 		t.Errorf("packetCount = %d, want %d", fakeTrans.packetCount, 0)
 	}
@@ -360,7 +364,9 @@ func TestTransportReceive(t *testing.T) {
 	buf[0] = 1
 	buf[1] = 2
 	buf[2] = byte(fakeTransNumber)
-	linkEP.Inject(fakeNetNumber, buf.ToVectorisedView())
+	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeTrans.packetCount != 1 {
 		t.Errorf("packetCount = %d, want %d", fakeTrans.packetCount, 1)
 	}
@@ -413,7 +419,9 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 0
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = 0
-	linkEP.Inject(fakeNetNumber, buf.ToVectorisedView())
+	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeTrans.controlCount != 0 {
 		t.Errorf("controlCount = %d, want %d", fakeTrans.controlCount, 0)
 	}
@@ -422,7 +430,9 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 3
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
-	linkEP.Inject(fakeNetNumber, buf.ToVectorisedView())
+	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeTrans.controlCount != 0 {
 		t.Errorf("controlCount = %d, want %d", fakeTrans.controlCount, 0)
 	}
@@ -431,7 +441,9 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 2
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
-	linkEP.Inject(fakeNetNumber, buf.ToVectorisedView())
+	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeTrans.controlCount != 1 {
 		t.Errorf("controlCount = %d, want %d", fakeTrans.controlCount, 1)
 	}
@@ -584,7 +596,9 @@ func TestTransportForwarding(t *testing.T) {
 	req[0] = 1
 	req[1] = 3
 	req[2] = byte(fakeTransNumber)
-	ep2.Inject(fakeNetNumber, req.ToVectorisedView())
+	ep2.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: req.ToVectorisedView(),
+	})
 
 	aep, _, err := ep.Accept()
 	if err != nil || aep == nil {
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 33405eb7d..0092d0ea9 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -718,18 +718,18 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
 	// Only accept echo replies.
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
-		h := header.ICMPv4(vv.First())
+		h := header.ICMPv4(pkt.Data.First())
 		if h.Type() != header.ICMPv4EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 			return
 		}
 	case header.IPv6ProtocolNumber:
-		h := header.ICMPv6(vv.First())
+		h := header.ICMPv6(pkt.Data.First())
 		if h.Type() != header.ICMPv6EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
@@ -757,19 +757,19 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
 	wasEmpty := e.rcvBufSize == 0
 
 	// Push new packet into receive list and increment the buffer size.
-	pkt := &icmpPacket{
+	packet := &icmpPacket{
 		senderAddress: tcpip.FullAddress{
 			NIC:  r.NICID(),
 			Addr: id.RemoteAddress,
 		},
 	}
 
-	pkt.data = vv
+	packet.data = pkt.Data
 
-	e.rcvList.PushBack(pkt)
-	e.rcvBufSize += pkt.data.Size()
+	e.rcvList.PushBack(packet)
+	e.rcvBufSize += packet.data.Size()
 
-	pkt.timestamp = e.stack.NowNanoseconds()
+	packet.timestamp = e.stack.NowNanoseconds()
 
 	e.rcvMu.Unlock()
 	e.stats.PacketsReceived.Increment()
@@ -780,7 +780,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
 }
 
 // State implements tcpip.Endpoint.State. The ICMP endpoint currently doesn't
diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go
index bfb16f7c3..9ce500e80 100644
--- a/pkg/tcpip/transport/icmp/protocol.go
+++ b/pkg/tcpip/transport/icmp/protocol.go
@@ -104,7 +104,7 @@ func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error)
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (p *protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, buffer.View, buffer.VectorisedView) bool {
+func (p *protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
 	return true
 }
 
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index ead83b83d..26335094e 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -266,7 +266,7 @@ func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 // HandlePacket implements stack.PacketEndpoint.HandlePacket.
-func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, ethHeader buffer.View) {
+func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	ep.rcvMu.Lock()
 
 	// Drop the packet if our buffer is currently full.
@@ -289,9 +289,9 @@ func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress,
 	// Push new packet into receive list and increment the buffer size.
 	var packet packet
 	// TODO(b/129292371): Return network protocol.
-	if len(ethHeader) > 0 {
+	if len(pkt.LinkHeader) > 0 {
 		// Get info directly from the ethernet header.
-		hdr := header.Ethernet(ethHeader)
+		hdr := header.Ethernet(pkt.LinkHeader)
 		packet.senderAddr = tcpip.FullAddress{
 			NIC:  nicid,
 			Addr: tcpip.Address(hdr.SourceAddress()),
@@ -306,11 +306,12 @@ func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress,
 
 	if ep.cooked {
 		// Cooked packets can simply be queued.
-		packet.data = vv
+		packet.data = pkt.Data
 	} else {
 		// Raw packets need their ethernet headers prepended before
 		// queueing.
-		if len(ethHeader) == 0 {
+		var linkHeader buffer.View
+		if len(pkt.LinkHeader) == 0 {
 			// We weren't provided with an actual ethernet header,
 			// so fake one.
 			ethFields := header.EthernetFields{
@@ -320,10 +321,12 @@ func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress,
 			}
 			fakeHeader := make(header.Ethernet, header.EthernetMinimumSize)
 			fakeHeader.Encode(&ethFields)
-			ethHeader = buffer.View(fakeHeader)
+			linkHeader = buffer.View(fakeHeader)
+		} else {
+			linkHeader = append(buffer.View(nil), pkt.LinkHeader...)
 		}
-		combinedVV := buffer.View(ethHeader).ToVectorisedView()
-		combinedVV.Append(vv)
+		combinedVV := linkHeader.ToVectorisedView()
+		combinedVV.Append(pkt.Data)
 		packet.data = combinedVV
 	}
 	packet.timestampNS = ep.stack.NowNanoseconds()
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 23922a30e..230a1537a 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -555,7 +555,7 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 // HandlePacket implements stack.RawTransportEndpoint.HandlePacket.
-func (e *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv buffer.VectorisedView) {
+func (e *endpoint) HandlePacket(route *stack.Route, pkt tcpip.PacketBuffer) {
 	e.rcvMu.Lock()
 
 	// Drop the packet if our buffer is currently full.
@@ -596,20 +596,21 @@ func (e *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv bu
 	wasEmpty := e.rcvBufSize == 0
 
 	// Push new packet into receive list and increment the buffer size.
-	pkt := &rawPacket{
+	packet := &rawPacket{
 		senderAddr: tcpip.FullAddress{
 			NIC:  route.NICID(),
 			Addr: route.RemoteAddress,
 		},
 	}
 
-	combinedVV := netHeader.ToVectorisedView()
-	combinedVV.Append(vv)
-	pkt.data = combinedVV
-	pkt.timestampNS = e.stack.NowNanoseconds()
+	networkHeader := append(buffer.View(nil), pkt.NetworkHeader...)
+	combinedVV := networkHeader.ToVectorisedView()
+	combinedVV.Append(pkt.Data)
+	packet.data = combinedVV
+	packet.timestampNS = e.stack.NowNanoseconds()
 
-	e.rcvList.PushBack(pkt)
-	e.rcvBufSize += pkt.data.Size()
+	e.rcvList.PushBack(packet)
+	e.rcvBufSize += packet.data.Size()
 
 	e.rcvMu.Unlock()
 	e.stats.PacketsReceived.Increment()
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index a1efd8d55..e31464c9b 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2029,8 +2029,8 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) {
-	s := newSegment(r, id, vv)
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+	s := newSegment(r, id, pkt)
 	if !s.parse() {
 		e.stack.Stats().MalformedRcvdPackets.Increment()
 		e.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
@@ -2065,7 +2065,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
 	switch typ {
 	case stack.ControlPacketTooBig:
 		e.sndBufMu.Lock()
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index 63666f0b3..4983bca81 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -18,7 +18,6 @@ import (
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -63,8 +62,8 @@ func NewForwarder(s *stack.Stack, rcvWnd, maxInFlight int, handler func(*Forward
 //
 // This function is expected to be passed as an argument to the
 // stack.SetTransportProtocolHandler function.
-func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool {
-	s := newSegment(r, id, vv)
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+	s := newSegment(r, id, pkt)
 	defer s.decRef()
 
 	// We only care about well-formed SYN packets.
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index db40785d3..c4f1a84bb 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -126,8 +126,8 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 // a reset is sent in response to any incoming segment except another reset. In
 // particular, SYNs addressed to a non-existent connection are rejected by this
 // means."
-func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool {
-	s := newSegment(r, id, vv)
+func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+	s := newSegment(r, id, pkt)
 	defer s.decRef()
 
 	if !s.parse() || !s.csumValid {
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index c4a89525e..1c10da5ca 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -18,6 +18,7 @@ import (
 	"sync/atomic"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
@@ -60,13 +61,13 @@ type segment struct {
 	xmitTime time.Time `state:".(unixTime)"`
 }
 
-func newSegment(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) *segment {
+func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) *segment {
 	s := &segment{
 		refCnt: 1,
 		id:     id,
 		route:  r.Clone(),
 	}
-	s.data = vv.Clone(s.views[:])
+	s.data = pkt.Data.Clone(s.views[:])
 	s.rcvdTime = time.Now()
 	return s
 }
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index ef823e4ae..4854e719d 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -302,7 +302,9 @@ func (c *Context) SendICMPPacket(typ header.ICMPv4Type, code uint8, p1, p2 []byt
 	copy(icmp[header.ICMPv4PayloadOffset:], p2)
 
 	// Inject packet.
-	c.linkEP.Inject(ipv4.ProtocolNumber, buf.ToVectorisedView())
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 }
 
 // BuildSegment builds a TCP segment based on the given Headers and payload.
@@ -350,13 +352,17 @@ func (c *Context) BuildSegment(payload []byte, h *Headers) buffer.VectorisedView
 // SendSegment sends a TCP segment that has already been built and written to a
 // buffer.VectorisedView.
 func (c *Context) SendSegment(s buffer.VectorisedView) {
-	c.linkEP.Inject(ipv4.ProtocolNumber, s)
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+		Data: s,
+	})
 }
 
 // SendPacket builds and sends a TCP segment(with the provided payload & TCP
 // headers) in an IPv4 packet via the link layer endpoint.
 func (c *Context) SendPacket(payload []byte, h *Headers) {
-	c.linkEP.Inject(ipv4.ProtocolNumber, c.BuildSegment(payload, h))
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+		Data: c.BuildSegment(payload, h),
+	})
 }
 
 // SendAck sends an ACK packet.
@@ -518,7 +524,9 @@ func (c *Context) SendV6Packet(payload []byte, h *Headers) {
 	t.SetChecksum(^t.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.Inject(ipv6.ProtocolNumber, buf.ToVectorisedView())
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 }
 
 // CreateConnected creates a connected TCP endpoint.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 03bd5c8fd..4e11de9db 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -1158,17 +1158,17 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
 	// Get the header then trim it from the view.
-	hdr := header.UDP(vv.First())
-	if int(hdr.Length()) > vv.Size() {
+	hdr := header.UDP(pkt.Data.First())
+	if int(hdr.Length()) > pkt.Data.Size() {
 		// Malformed packet.
 		e.stack.Stats().UDP.MalformedPacketsReceived.Increment()
 		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 		return
 	}
 
-	vv.TrimFront(header.UDPMinimumSize)
+	pkt.Data.TrimFront(header.UDPMinimumSize)
 
 	e.rcvMu.Lock()
 	e.stack.Stats().UDP.PacketsReceived.Increment()
@@ -1192,18 +1192,18 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
 	wasEmpty := e.rcvBufSize == 0
 
 	// Push new packet into receive list and increment the buffer size.
-	pkt := &udpPacket{
+	packet := &udpPacket{
 		senderAddress: tcpip.FullAddress{
 			NIC:  r.NICID(),
 			Addr: id.RemoteAddress,
 			Port: hdr.SourcePort(),
 		},
 	}
-	pkt.data = vv
-	e.rcvList.PushBack(pkt)
-	e.rcvBufSize += vv.Size()
+	packet.data = pkt.Data
+	e.rcvList.PushBack(packet)
+	e.rcvBufSize += pkt.Data.Size()
 
-	pkt.timestamp = e.stack.NowNanoseconds()
+	packet.timestamp = e.stack.NowNanoseconds()
 
 	e.rcvMu.Unlock()
 
@@ -1214,7 +1214,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
 }
 
 // State implements tcpip.Endpoint.State.
diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go
index d399ec722..fc706ede2 100644
--- a/pkg/tcpip/transport/udp/forwarder.go
+++ b/pkg/tcpip/transport/udp/forwarder.go
@@ -16,7 +16,6 @@ package udp
 
 import (
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -44,12 +43,12 @@ func NewForwarder(s *stack.Stack, handler func(*ForwarderRequest)) *Forwarder {
 //
 // This function is expected to be passed as an argument to the
 // stack.SetTransportProtocolHandler function.
-func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool {
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
 	f.handler(&ForwarderRequest{
 		stack: f.stack,
 		route: r,
 		id:    id,
-		vv:    vv,
+		pkt:   pkt,
 	})
 
 	return true
@@ -62,7 +61,7 @@ type ForwarderRequest struct {
 	stack *stack.Stack
 	route *stack.Route
 	id    stack.TransportEndpointID
-	vv    buffer.VectorisedView
+	pkt   tcpip.PacketBuffer
 }
 
 // ID returns the 4-tuple (src address, src port, dst address, dst port) that
@@ -90,7 +89,7 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint,
 	ep.rcvReady = true
 	ep.rcvMu.Unlock()
 
-	ep.HandlePacket(r.route, r.id, r.vv)
+	ep.HandlePacket(r.route, r.id, r.pkt)
 
 	return ep, nil
 }
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 5c3358a5e..43f11b700 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -66,10 +66,10 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool {
+func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
 	// Get the header then trim it from the view.
-	hdr := header.UDP(vv.First())
-	if int(hdr.Length()) > vv.Size() {
+	hdr := header.UDP(pkt.Data.First())
+	if int(hdr.Length()) > pkt.Data.Size() {
 		// Malformed packet.
 		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
 		return true
@@ -116,20 +116,18 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		}
 		headerLen := int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize
 		available := int(mtu) - headerLen
-		payloadLen := len(netHeader) + vv.Size()
+		payloadLen := len(pkt.NetworkHeader) + pkt.Data.Size()
 		if payloadLen > available {
 			payloadLen = available
 		}
 
-		// The buffers used by vv and netHeader may be used elsewhere
-		// in the system.  For example, a raw or packet socket may use
-		// what UDP considers an unreachable destination. Thus we deep
-		// copy vv and netHeader to prevent multiple ownership and SR
-		// errors.
-		newNetHeader := make(buffer.View, len(netHeader))
-		copy(newNetHeader, netHeader)
-		payload := buffer.NewVectorisedView(len(newNetHeader), []buffer.View{newNetHeader})
-		payload.Append(vv.ToView().ToVectorisedView())
+		// The buffers used by pkt may be used elsewhere in the system.
+		// For example, a raw or packet socket may use what UDP
+		// considers an unreachable destination. Thus we deep copy pkt
+		// to prevent multiple ownership and SR errors.
+		newNetHeader := append(buffer.View(nil), pkt.NetworkHeader...)
+		payload := newNetHeader.ToVectorisedView()
+		payload.Append(pkt.Data.ToView().ToVectorisedView())
 		payload.CapLength(payloadLen)
 
 		hdr := buffer.NewPrependable(headerLen)
@@ -158,12 +156,12 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		}
 		headerLen := int(r.MaxHeaderLength()) + header.ICMPv6DstUnreachableMinimumSize
 		available := int(mtu) - headerLen
-		payloadLen := len(netHeader) + vv.Size()
+		payloadLen := len(pkt.NetworkHeader) + pkt.Data.Size()
 		if payloadLen > available {
 			payloadLen = available
 		}
-		payload := buffer.NewVectorisedView(len(netHeader), []buffer.View{netHeader})
-		payload.Append(vv)
+		payload := buffer.NewVectorisedView(len(pkt.NetworkHeader), []buffer.View{pkt.NetworkHeader})
+		payload.Append(pkt.Data)
 		payload.CapLength(payloadLen)
 
 		hdr := buffer.NewPrependable(headerLen)
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index b724d788c..30ee9801b 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -397,7 +397,8 @@ func (c *testContext) injectPacket(flow testFlow, payload []byte) {
 func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool) {
 	// Allocate a buffer for data and headers.
 	buf := buffer.NewView(header.UDPMinimumSize + header.IPv6MinimumSize + len(payload))
-	copy(buf[len(buf)-len(payload):], payload)
+	payloadStart := len(buf) - len(payload)
+	copy(buf[payloadStart:], payload)
 
 	// Initialize the IP header.
 	ip := header.IPv6(buf)
@@ -431,7 +432,11 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.Inject(ipv6.ProtocolNumber, buf.ToVectorisedView())
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+		Data:            buf.ToVectorisedView(),
+		NetworkHeader:   buffer.View(ip),
+		TransportHeader: buffer.View(u),
+	})
 }
 
 // injectV4Packet creates a V4 test packet with the given payload and header
@@ -441,7 +446,8 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool) {
 	// Allocate a buffer for data and headers.
 	buf := buffer.NewView(header.UDPMinimumSize + header.IPv4MinimumSize + len(payload))
-	copy(buf[len(buf)-len(payload):], payload)
+	payloadStart := len(buf) - len(payload)
+	copy(buf[payloadStart:], payload)
 
 	// Initialize the IP header.
 	ip := header.IPv4(buf)
@@ -471,7 +477,12 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.Inject(ipv4.ProtocolNumber, buf.ToVectorisedView())
+
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+		Data:            buf.ToVectorisedView(),
+		NetworkHeader:   buffer.View(ip),
+		TransportHeader: buffer.View(u),
+	})
 }
 
 func newPayload() []byte {
diff --git a/test/syscalls/linux/raw_socket_icmp.cc b/test/syscalls/linux/raw_socket_icmp.cc
index 8bcaba6f1..3de898df7 100644
--- a/test/syscalls/linux/raw_socket_icmp.cc
+++ b/test/syscalls/linux/raw_socket_icmp.cc
@@ -129,7 +129,7 @@ TEST_F(RawSocketICMPTest, SendAndReceiveBadChecksum) {
   EXPECT_THAT(RetryEINTR(recv)(s_, recv_buf, sizeof(recv_buf), MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
-//
+
 // Send and receive an ICMP packet.
 TEST_F(RawSocketICMPTest, SendAndReceive) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-- 
cgit v1.2.3


From e63db5e7bbf8decc6f799965f54fcf7aa6673527 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 6 Nov 2019 16:28:25 -0800
Subject: Discover default routers from Router Advertisements

This change allows the netstack to do NDP's Router Discovery as outlined by
RFC 4861 section 6.3.4.

Note, this change will not break existing uses of netstack as the default
configuration for the stack options is set in such a way that Router Discovery
will not be performed. See `stack.Options` and `stack.NDPConfigurations` for
more details.

This change introduces 2 options required to take advantage of Router Discovery,
all available under NDPConfigurations:
- HandleRAs: Whether or not NDP RAs are processes
- DiscoverDefaultRouters: Whether or not Router Discovery is performed

Another note: for a NIC to process Router Advertisements, it must not be a
router itself. Currently the netstack does not have per-interface routing
configuration; the routing/forwarding configuration is controlled stack-wide.
Therefore, if the stack is configured to enable forwarding/routing, no Router
Advertisements will be processed.

Tests: Unittest to make sure that Router Discovery and updates to the routing
table only occur if explicitly configured to do so. Unittest to make sure at
max stack.MaxDiscoveredDefaultRouters discovered default routers are remembered.
PiperOrigin-RevId: 278965143
---
 pkg/tcpip/header/ipv6.go    |   4 +-
 pkg/tcpip/stack/ndp.go      | 166 ++++++++++++++++-
 pkg/tcpip/stack/ndp_test.go | 426 ++++++++++++++++++++++++++++++++++++++++++--
 pkg/tcpip/tcpip.go          |   7 +
 4 files changed, 586 insertions(+), 17 deletions(-)

diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index f1e60911b..0caa51c1e 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -92,7 +92,9 @@ const (
 	IPv6Any tcpip.Address = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
 )
 
-// IPv6EmptySubnet is the empty IPv6 subnet.
+// IPv6EmptySubnet is the empty IPv6 subnet. It may also be known as the
+// catch-all or wildcard subnet. That is, all IPv6 addresses are considered to
+// be contained within this subnet.
 var IPv6EmptySubnet = func() tcpip.Subnet {
 	subnet, err := tcpip.NewSubnet(IPv6Any, tcpip.AddressMask(IPv6Any))
 	if err != nil {
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index d5352bb5f..a216242d8 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -67,6 +67,9 @@ const (
 	// default routers. The stack should stop discovering new routers after
 	// discovering MaxDiscoveredDefaultRouters routers.
 	//
+	// This value MUST be at minimum 2 as per RFC 4861 section 6.3.4, and
+	// SHOULD be more.
+	//
 	// Max = 10.
 	MaxDiscoveredDefaultRouters = 10
 )
@@ -85,6 +88,24 @@ type NDPDispatcher interface {
 	// This function is permitted to block indefinitely without interfering
 	// with the stack's operation.
 	OnDuplicateAddressDetectionStatus(nicid tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error)
+
+	// OnDefaultRouterDiscovered will be called when a new default router is
+	// discovered. Implementations must return true along with a new valid
+	// route table if the newly discovered router should be remembered. If
+	// an implementation returns false, the second return value will be
+	// ignored.
+	//
+	// This function is not permitted to block indefinitely. This function
+	// is also not permitted to call into the stack.
+	OnDefaultRouterDiscovered(nicid tcpip.NICID, addr tcpip.Address) (bool, []tcpip.Route)
+
+	// OnDefaultRouterInvalidated will be called when a discovered default
+	// router is invalidated. Implementers must return a new valid route
+	// table.
+	//
+	// This function is not permitted to block indefinitely. This function
+	// is also not permitted to call into the stack.
+	OnDefaultRouterInvalidated(nicid tcpip.NICID, addr tcpip.Address) []tcpip.Route
 }
 
 // NDPConfigurations is the NDP configurations for the netstack.
@@ -165,6 +186,22 @@ type dadState struct {
 // a Router Advertisement.
 type defaultRouterState struct {
 	invalidationTimer *time.Timer
+
+	// Used to signal the timer not to invalidate the default router (R) in
+	// a race condition (T1 is a goroutine that handles an RA from R and T2
+	// is the goroutine that handles R's invalidation timer firing):
+	//   T1: Receive a new RA from R
+	//   T1: Obtain the NIC's lock before processing the RA
+	//   T2: R's invalidation timer fires, and gets blocked on obtaining the
+	//       NIC's lock
+	//   T1: Refreshes/extends R's lifetime & releases NIC's lock
+	//   T2: Obtains NIC's lock & invalidates R immediately
+	//
+	// To resolve this, T1 will check to see if the timer already fired, and
+	// signal the timer using this channel to not invalidate R, so that once
+	// T2 obtains the lock, it will see that there is an event on this
+	// channel and do nothing further.
+	doNotInvalidateC chan struct{}
 }
 
 // startDuplicateAddressDetection performs Duplicate Address Detection.
@@ -361,16 +398,137 @@ func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) {
 }
 
 // handleRA handles a Router Advertisement message that arrived on the NIC
-// this ndp is for.
+// this ndp is for. Does nothing if the NIC is configured to not handle RAs.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The NIC that ndp belongs to and its associated stack MUST be locked.
 func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 	// Is the NIC configured to handle RAs at all?
-	if !ndp.configs.HandleRAs {
+	//
+	// Currently, the stack does not determine router interface status on a
+	// per-interface basis; it is a stack-wide configuration, so we check
+	// stack's forwarding flag to determine if the NIC is a routing
+	// interface.
+	if !ndp.configs.HandleRAs || ndp.nic.stack.forwarding {
 		return
 	}
 
-	// TODO(b/140882146): Do Router Discovery.
+	// Is the NIC configured to discover default routers?
+	if ndp.configs.DiscoverDefaultRouters {
+		rtr, ok := ndp.defaultRouters[ip]
+		rl := ra.RouterLifetime()
+		switch {
+		case !ok && rl != 0:
+			// This is a new default router we are discovering.
+			//
+			// Only remember it if we currently know about less than
+			// MaxDiscoveredDefaultRouters routers.
+			if len(ndp.defaultRouters) < MaxDiscoveredDefaultRouters {
+				ndp.rememberDefaultRouter(ip, rl)
+			}
+
+		case ok && rl != 0:
+			// This is an already discovered default router. Update
+			// the invalidation timer.
+			timer := rtr.invalidationTimer
+
+			// We should ALWAYS have an invalidation timer for a
+			// discovered router.
+			if timer == nil {
+				panic("ndphandlera: RA invalidation timer should not be nil")
+			}
+
+			if !timer.Stop() {
+				// If we reach this point, then we know the
+				// timer fired after we already took the NIC
+				// lock. Signal the timer so that once it
+				// obtains the lock, it doesn't actually
+				// invalidate the router as we just got a new
+				// RA that refreshes its lifetime to a non-zero
+				// value. See
+				// defaultRouterState.doNotInvalidateC for more
+				// details.
+				rtr.doNotInvalidateC <- struct{}{}
+			}
+
+			timer.Reset(rl)
+
+		case ok && rl == 0:
+			// We know about the router but it is no longer to be
+			// used as a default router so invalidate it.
+			ndp.invalidateDefaultRouter(ip)
+		}
+	}
+
 	// TODO(b/140948104): Do Prefix Discovery.
 	// TODO(b/141556115): Do Parameter Discovery.
 }
+
+// invalidateDefaultRouter invalidates a discovered default router.
+//
+// The NIC that ndp belongs to and its associated stack MUST be locked.
+func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
+	rtr, ok := ndp.defaultRouters[ip]
+
+	// Is the router still discovered?
+	if !ok {
+		// ...Nope, do nothing further.
+		return
+	}
+
+	rtr.invalidationTimer.Stop()
+	rtr.invalidationTimer = nil
+	close(rtr.doNotInvalidateC)
+	rtr.doNotInvalidateC = nil
+
+	delete(ndp.defaultRouters, ip)
+
+	// Let the integrator know a discovered default router is invalidated.
+	if ndp.nic.stack.ndpDisp != nil {
+		ndp.nic.stack.routeTable = ndp.nic.stack.ndpDisp.OnDefaultRouterInvalidated(ndp.nic.ID(), ip)
+	}
+}
+
+// rememberDefaultRouter remembers a newly discovered default router with IPv6
+// link-local address ip with lifetime rl.
+//
+// The router identified by ip MUST NOT already be known by the NIC.
+//
+// The NIC that ndp belongs to and its associated stack MUST be locked.
+func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
+	if ndp.nic.stack.ndpDisp == nil {
+		return
+	}
+
+	// Inform the integrator when we discovered a default router.
+	remember, routeTable := ndp.nic.stack.ndpDisp.OnDefaultRouterDiscovered(ndp.nic.ID(), ip)
+	if !remember {
+		// Informed by the integrator to not remember the router, do
+		// nothing further.
+		return
+	}
+
+	// Used to signal the timer not to invalidate the default router (R) in
+	// a race condition. See defaultRouterState.doNotInvalidateC for more
+	// details.
+	doNotInvalidateC := make(chan struct{}, 1)
+
+	ndp.defaultRouters[ip] = defaultRouterState{
+		invalidationTimer: time.AfterFunc(rl, func() {
+			ndp.nic.stack.mu.Lock()
+			defer ndp.nic.stack.mu.Unlock()
+			ndp.nic.mu.Lock()
+			defer ndp.nic.mu.Unlock()
+
+			select {
+			case <-doNotInvalidateC:
+				return
+			default:
+			}
+
+			ndp.invalidateDefaultRouter(ip)
+		}),
+		doNotInvalidateC: doNotInvalidateC,
+	}
+
+	ndp.nic.stack.routeTable = routeTable
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index cc789b5af..0dbe4da9d 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -15,9 +15,12 @@
 package stack_test
 
 import (
+	"encoding/binary"
+	"fmt"
 	"testing"
 	"time"
 
+	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/checker"
@@ -29,10 +32,19 @@ import (
 )
 
 const (
-	addr1     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
-	addr2     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
-	addr3     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"
-	linkAddr1 = "\x02\x02\x03\x04\x05\x06"
+	addr1          = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+	addr2          = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+	addr3          = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"
+	linkAddr1      = "\x02\x02\x03\x04\x05\x06"
+	linkAddr2      = "\x02\x02\x03\x04\x05\x07"
+	linkAddr3      = "\x02\x02\x03\x04\x05\x08"
+	defaultTimeout = 250 * time.Millisecond
+)
+
+var (
+	llAddr1 = header.LinkLocalAddr(linkAddr1)
+	llAddr2 = header.LinkLocalAddr(linkAddr2)
+	llAddr3 = header.LinkLocalAddr(linkAddr3)
 )
 
 // TestDADDisabled tests that an address successfully resolves immediately
@@ -77,26 +89,86 @@ type ndpDADEvent struct {
 	err      *tcpip.Error
 }
 
+type ndpRouterEvent struct {
+	nicid tcpip.NICID
+	addr  tcpip.Address
+	// true if router was discovered, false if invalidated.
+	discovered bool
+}
+
 var _ stack.NDPDispatcher = (*ndpDispatcher)(nil)
 
 // ndpDispatcher implements NDPDispatcher so tests can know when various NDP
 // related events happen for test purposes.
 type ndpDispatcher struct {
-	dadC chan ndpDADEvent
+	dadC           chan ndpDADEvent
+	routerC        chan ndpRouterEvent
+	rememberRouter bool
+	routeTable     []tcpip.Route
 }
 
 // Implements stack.NDPDispatcher.OnDuplicateAddressDetectionStatus.
-//
-// If the DAD event matches what we are expecting, send signal on n.dadC.
 func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicid tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) {
-	n.dadC <- ndpDADEvent{
-		nicid,
-		addr,
-		resolved,
-		err,
+	if n.dadC != nil {
+		n.dadC <- ndpDADEvent{
+			nicid,
+			addr,
+			resolved,
+			err,
+		}
 	}
 }
 
+// Implements stack.NDPDispatcher.OnDefaultRouterDiscovered.
+func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicid tcpip.NICID, addr tcpip.Address) (bool, []tcpip.Route) {
+	if n.routerC != nil {
+		n.routerC <- ndpRouterEvent{
+			nicid,
+			addr,
+			true,
+		}
+	}
+
+	if !n.rememberRouter {
+		return false, nil
+	}
+
+	rt := append([]tcpip.Route(nil), n.routeTable...)
+	rt = append(rt, tcpip.Route{
+		Destination: header.IPv6EmptySubnet,
+		Gateway:     addr,
+		NIC:         nicid,
+	})
+	n.routeTable = rt
+	return true, rt
+}
+
+// Implements stack.NDPDispatcher.OnDefaultRouterInvalidated.
+func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicid tcpip.NICID, addr tcpip.Address) []tcpip.Route {
+	if n.routerC != nil {
+		n.routerC <- ndpRouterEvent{
+			nicid,
+			addr,
+			false,
+		}
+	}
+
+	var rt []tcpip.Route
+	exclude := tcpip.Route{
+		Destination: header.IPv6EmptySubnet,
+		Gateway:     addr,
+		NIC:         nicid,
+	}
+
+	for _, r := range n.routeTable {
+		if r != exclude {
+			rt = append(rt, r)
+		}
+	}
+	n.routeTable = rt
+	return rt
+}
+
 // TestDADResolve tests that an address successfully resolves after performing
 // DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
 // Included in the subtests is a test to make sure that an invalid
@@ -609,3 +681,333 @@ func TestSetNDPConfigurations(t *testing.T) {
 		})
 	}
 }
+
+// raBuf returns a valid NDP Router Advertisement.
+//
+// Note, raBuf does not populate any of the RA fields other than the
+// Router Lifetime.
+func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer {
+	icmpSize := header.ICMPv6HeaderSize + header.NDPRAMinimumSize
+	hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
+	pkt := header.ICMPv6(hdr.Prepend(icmpSize))
+	pkt.SetType(header.ICMPv6RouterAdvert)
+	pkt.SetCode(0)
+	// Populate the Router Lifetime.
+	binary.BigEndian.PutUint16(pkt.NDPPayload()[2:], rl)
+	pkt.SetChecksum(header.ICMPv6Checksum(pkt, ip, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
+	payloadLength := hdr.UsedLength()
+	iph := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+	iph.Encode(&header.IPv6Fields{
+		PayloadLength: uint16(payloadLength),
+		NextHeader:    uint8(icmp.ProtocolNumber6),
+		HopLimit:      header.NDPHopLimit,
+		SrcAddr:       ip,
+		DstAddr:       header.IPv6AllNodesMulticastAddress,
+	})
+
+	return tcpip.PacketBuffer{Data: hdr.View().ToVectorisedView()}
+}
+
+// TestNoRouterDiscovery tests that router discovery will not be performed if
+// configured not to.
+func TestNoRouterDiscovery(t *testing.T) {
+	// Being configured to discover routers means handle and
+	// discover are set to true and forwarding is set to false.
+	// This tests all possible combinations of the configurations,
+	// except for the configuration where handle = true, discover =
+	// true and forwarding = false (the required configuration to do
+	// router discovery) - that will done in other tests.
+	for i := 0; i < 7; i++ {
+		handle := i&1 != 0
+		discover := i&2 != 0
+		forwarding := i&4 == 0
+
+		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverDefaultRouters(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				routerC: make(chan ndpRouterEvent, 10),
+			}
+			e := channel.New(10, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:              handle,
+					DiscoverDefaultRouters: discover,
+				},
+				NDPDisp: &ndpDisp,
+			})
+			s.SetForwarding(forwarding)
+
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(1) = %s", err)
+			}
+
+			// Rx an RA with non-zero lifetime.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000))
+			select {
+			case <-ndpDisp.routerC:
+				t.Fatal("unexpectedly discovered a router when configured not to")
+			case <-time.After(defaultTimeout):
+			}
+		})
+	}
+}
+
+// TestRouterDiscoveryDispatcherNoRemember tests that the stack does not
+// remember a discovered router when the dispatcher asks it not to.
+func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
+	ndpDisp := ndpDispatcher{
+		routerC: make(chan ndpRouterEvent, 10),
+	}
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverDefaultRouters: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	routeTable := []tcpip.Route{
+		{
+			header.IPv6EmptySubnet,
+			llAddr3,
+			1,
+		},
+	}
+	s.SetRouteTable(routeTable)
+
+	// Rx an RA with short lifetime.
+	lifetime := time.Duration(1)
+	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, uint16(lifetime)))
+	select {
+	case r := <-ndpDisp.routerC:
+		if r.nicid != 1 {
+			t.Fatalf("got r.nicid = %d, want = 1", r.nicid)
+		}
+		if r.addr != llAddr2 {
+			t.Fatalf("got r.addr = %s, want = %s", r.addr, llAddr2)
+		}
+		if !r.discovered {
+			t.Fatal("got r.discovered = false, want = true")
+		}
+	case <-time.After(defaultTimeout):
+		t.Fatal("timeout waiting for router discovery event")
+	}
+
+	// Original route table should not have been modified.
+	if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable)
+	}
+
+	// Wait for the normal invalidation time plus an extra second to
+	// make sure we do not actually receive any invalidation events as
+	// we should not have remembered the router in the first place.
+	select {
+	case <-ndpDisp.routerC:
+		t.Fatal("should not have received any router events")
+	case <-time.After(lifetime*time.Second + defaultTimeout):
+	}
+
+	// Original route table should not have been modified.
+	if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable)
+	}
+}
+
+func TestRouterDiscovery(t *testing.T) {
+	ndpDisp := ndpDispatcher{
+		routerC:        make(chan ndpRouterEvent, 10),
+		rememberRouter: true,
+	}
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverDefaultRouters: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	waitForEvent := func(addr tcpip.Address, discovered bool, timeout time.Duration) {
+		t.Helper()
+
+		select {
+		case r := <-ndpDisp.routerC:
+			if r.nicid != 1 {
+				t.Fatalf("got r.nicid = %d, want = 1", r.nicid)
+			}
+			if r.addr != addr {
+				t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+			}
+			if r.discovered != discovered {
+				t.Fatalf("got r.discovered = %t, want = %t", r.discovered, discovered)
+			}
+		case <-time.After(timeout):
+			t.Fatal("timeout waiting for router discovery event")
+		}
+	}
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// Rx an RA from lladdr2 with zero lifetime. It should not be
+	// remembered.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 0))
+	select {
+	case <-ndpDisp.routerC:
+		t.Fatal("unexpectedly discovered a router with 0 lifetime")
+	case <-time.After(defaultTimeout):
+	}
+
+	// Rx an RA from lladdr2 with a huge lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000))
+	waitForEvent(llAddr2, true, defaultTimeout)
+
+	// Should have a default route through the discovered router.
+	if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}}; !cmp.Equal(got, want) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	}
+
+	// Rx an RA from another router (lladdr3) with non-zero lifetime.
+	l3Lifetime := time.Duration(6)
+	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr3, uint16(l3Lifetime)))
+	waitForEvent(llAddr3, true, defaultTimeout)
+
+	// Should have default routes through the discovered routers.
+	if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}, {header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	}
+
+	// Rx an RA from lladdr2 with lesser lifetime.
+	l2Lifetime := time.Duration(2)
+	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, uint16(l2Lifetime)))
+	select {
+	case <-ndpDisp.routerC:
+		t.Fatal("Should not receive a router event when updating lifetimes for known routers")
+	case <-time.After(defaultTimeout):
+	}
+
+	// Should still have a default route through the discovered routers.
+	if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}, {header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	}
+
+	// Wait for lladdr2's router invalidation timer to fire. The lifetime
+	// of the router should have been updated to the most recent (smaller)
+	// lifetime.
+	//
+	// Wait for the normal lifetime plus an extra bit for the
+	// router to get invalidated. If we don't get an invalidation
+	// event after this time, then something is wrong.
+	waitForEvent(llAddr2, false, l2Lifetime*time.Second+defaultTimeout)
+
+	// Should no longer have the default route through lladdr2.
+	if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	}
+
+	// Rx an RA from lladdr2 with huge lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000))
+	waitForEvent(llAddr2, true, defaultTimeout)
+
+	// Should have a default route through the discovered routers.
+	if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}, {header.IPv6EmptySubnet, llAddr2, 1}}; !cmp.Equal(got, want) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	}
+
+	// Rx an RA from lladdr2 with zero lifetime. It should be invalidated.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 0))
+	waitForEvent(llAddr2, false, defaultTimeout)
+
+	// Should have deleted the default route through the router that just
+	// got invalidated.
+	if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	}
+
+	// Wait for lladdr3's router invalidation timer to fire. The lifetime
+	// of the router should have been updated to the most recent (smaller)
+	// lifetime.
+	//
+	// Wait for the normal lifetime plus an extra bit for the
+	// router to get invalidated. If we don't get an invalidation
+	// event after this time, then something is wrong.
+	waitForEvent(llAddr3, false, l3Lifetime*time.Second+defaultTimeout)
+
+	// Should not have any routes now that all discovered routers have been
+	// invalidated.
+	if got := len(s.GetRouteTable()); got != 0 {
+		t.Fatalf("got len(s.GetRouteTable()) = %d, want = 0", got)
+	}
+}
+
+// TestRouterDiscoveryMaxRouters tests that only
+// stack.MaxDiscoveredDefaultRouters discovered routers are remembered.
+func TestRouterDiscoveryMaxRouters(t *testing.T) {
+	ndpDisp := ndpDispatcher{
+		routerC:        make(chan ndpRouterEvent, 10),
+		rememberRouter: true,
+	}
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverDefaultRouters: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	expectedRt := [stack.MaxDiscoveredDefaultRouters]tcpip.Route{}
+
+	// Receive an RA from 2 more than the max number of discovered routers.
+	for i := 1; i <= stack.MaxDiscoveredDefaultRouters+2; i++ {
+		linkAddr := []byte{2, 2, 3, 4, 5, 0}
+		linkAddr[5] = byte(i)
+		llAddr := header.LinkLocalAddr(tcpip.LinkAddress(linkAddr))
+
+		e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr, 5))
+
+		if i <= stack.MaxDiscoveredDefaultRouters {
+			expectedRt[i-1] = tcpip.Route{header.IPv6EmptySubnet, llAddr, 1}
+			select {
+			case r := <-ndpDisp.routerC:
+				if r.nicid != 1 {
+					t.Fatalf("got r.nicid = %d, want = 1", r.nicid)
+				}
+				if r.addr != llAddr {
+					t.Fatalf("got r.addr = %s, want = %s", r.addr, llAddr)
+				}
+				if !r.discovered {
+					t.Fatal("got r.discovered = false, want = true")
+				}
+			case <-time.After(defaultTimeout):
+				t.Fatal("timeout waiting for router discovery event")
+			}
+
+		} else {
+			select {
+			case <-ndpDisp.routerC:
+				t.Fatal("should not have discovered a new router after we already discovered the max number of routers")
+			case <-time.After(defaultTimeout):
+			}
+		}
+	}
+
+	// Should only have default routes for the first
+	// stack.MaxDiscoveredDefaultRouters discovered routers.
+	if got := s.GetRouteTable(); !cmp.Equal(got, expectedRt[:]) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, expectedRt)
+	}
+}
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 03be7d3d4..3edb513d4 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -231,6 +231,13 @@ func (s *Subnet) Broadcast() Address {
 	return Address(addr)
 }
 
+// Equal returns true if s equals o.
+//
+// Needed to use cmp.Equal on Subnet as its fields are unexported.
+func (s Subnet) Equal(o Subnet) bool {
+	return s == o
+}
+
 // NICID is a number that uniquely identifies a NIC.
 type NICID int32
 
-- 
cgit v1.2.3


From f8ffadddb39e132605f0ef3e3d39e5d7ad6e0ecf Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 6 Nov 2019 17:10:30 -0800
Subject: Add p9.OpenTruncate.

This is required to implement O_TRUNC correctly on filesystems backed by
gofers.

9P2000.L: "lopen prepares fid for file I/O. flags contains Linux open(2) flags
bits, e.g. O_RDONLY, O_RDWR, O_WRONLY."

open(2): "The argument flags must include one of the following access modes:
O_RDONLY, O_WRONLY, or O_RDWR. ... In addition, zero or more file creation
flags and file status flags can be bitwise-or'd in flags."

The reference 9P2000.L implementation also appears to expect arbitrary flags,
not just access modes, in Tlopen.flags:
https://github.com/chaos/diod/blob/master/diod/ops.c#L703

PiperOrigin-RevId: 278972683
---
 pkg/p9/file.go           |  2 +-
 pkg/p9/p9.go             | 37 ++++++++++++++++++++++++-------------
 runsc/fsgofer/fsgofer.go | 13 +++++++------
 3 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/pkg/p9/file.go b/pkg/p9/file.go
index 907445e15..96d1f2a8e 100644
--- a/pkg/p9/file.go
+++ b/pkg/p9/file.go
@@ -116,7 +116,7 @@ type File interface {
 	// N.B. The server must resolve any lazy paths when open is called.
 	// After this point, read and write may be called on files with no
 	// deletion check, so resolving in the data path is not viable.
-	Open(mode OpenFlags) (*fd.FD, QID, uint32, error)
+	Open(flags OpenFlags) (*fd.FD, QID, uint32, error)
 
 	// Read reads from this file. Open must be called first.
 	//
diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go
index 6039f5a42..415200d60 100644
--- a/pkg/p9/p9.go
+++ b/pkg/p9/p9.go
@@ -32,18 +32,22 @@ import (
 type OpenFlags uint32
 
 const (
-	// ReadOnly is a Topen and Tcreate flag indicating read-only mode.
+	// ReadOnly is a Tlopen and Tlcreate flag indicating read-only mode.
 	ReadOnly OpenFlags = 0
 
-	// WriteOnly is a Topen and Tcreate flag indicating write-only mode.
+	// WriteOnly is a Tlopen and Tlcreate flag indicating write-only mode.
 	WriteOnly OpenFlags = 1
 
-	// ReadWrite is a Topen flag indicates read-write mode.
+	// ReadWrite is a Tlopen flag indicates read-write mode.
 	ReadWrite OpenFlags = 2
 
 	// OpenFlagsModeMask is a mask of valid OpenFlags mode bits.
 	OpenFlagsModeMask OpenFlags = 3
 
+	// OpenTruncate is a Tlopen flag indicating that the opened file should be
+	// truncated.
+	OpenTruncate OpenFlags = 01000
+
 	// OpenFlagsIgnoreMask is a list of OpenFlags mode bits that are ignored for Tlopen.
 	// Note that syscall.O_LARGEFILE is set to zero, use value from Linux fcntl.h.
 	OpenFlagsIgnoreMask OpenFlags = syscall.O_DIRECTORY | syscall.O_NOATIME | 0100000
@@ -71,25 +75,32 @@ const (
 
 // OSFlags converts a p9.OpenFlags to an int compatible with open(2).
 func (o OpenFlags) OSFlags() int {
-	return int(o & OpenFlagsModeMask)
+	// "flags contains Linux open(2) flags bits" - 9P2000.L
+	return int(o)
 }
 
 // String implements fmt.Stringer.
 func (o OpenFlags) String() string {
-	switch o {
+	var buf strings.Builder
+	switch mode := o & OpenFlagsModeMask; mode {
 	case ReadOnly:
-		return "ReadOnly"
+		buf.WriteString("ReadOnly")
 	case WriteOnly:
-		return "WriteOnly"
+		buf.WriteString("WriteOnly")
 	case ReadWrite:
-		return "ReadWrite"
-	case OpenFlagsModeMask:
-		return "OpenFlagsModeMask"
-	case OpenFlagsIgnoreMask:
-		return "OpenFlagsIgnoreMask"
+		buf.WriteString("ReadWrite")
 	default:
-		return "UNDEFINED"
+		fmt.Fprintf(&buf, "%#o", mode)
+	}
+	otherFlags := o &^ OpenFlagsModeMask
+	if otherFlags&OpenTruncate != 0 {
+		buf.WriteString("|OpenTruncate")
+		otherFlags &^= OpenTruncate
+	}
+	if otherFlags != 0 {
+		fmt.Fprintf(&buf, "|%#o", otherFlags)
 	}
+	return buf.String()
 }
 
 // Tag is a message tag.
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 18b853e2e..9117d9616 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -366,23 +366,24 @@ func fchown(fd int, uid p9.UID, gid p9.GID) error {
 }
 
 // Open implements p9.File.
-func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
+func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 	if l.isOpen() {
 		panic(fmt.Sprintf("attempting to open already opened file: %q", l.hostPath))
 	}
 
 	// Check if control file can be used or if a new open must be created.
 	var newFile *fd.FD
-	if mode == p9.ReadOnly {
-		log.Debugf("Open reusing control file, mode: %v, %q", mode, l.hostPath)
+	if flags == p9.ReadOnly {
+		log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath)
 		newFile = l.file
 	} else {
 		// Ideally reopen would call name_to_handle_at (with empty name) and
 		// open_by_handle_at to reopen the file without using 'hostPath'. However,
 		// name_to_handle_at and open_by_handle_at aren't supported by overlay2.
-		log.Debugf("Open reopening file, mode: %v, %q", mode, l.hostPath)
+		log.Debugf("Open reopening file, flags: %v, %q", flags, l.hostPath)
 		var err error
-		newFile, err = reopenProcFd(l.file, openFlags|mode.OSFlags())
+		// Constrain open flags to the open mode and O_TRUNC.
+		newFile, err = reopenProcFd(l.file, openFlags|(flags.OSFlags()&(syscall.O_ACCMODE|syscall.O_TRUNC)))
 		if err != nil {
 			return nil, p9.QID{}, 0, extractErrno(err)
 		}
@@ -409,7 +410,7 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 		}
 		l.file = newFile
 	}
-	l.mode = mode
+	l.mode = flags & p9.OpenFlagsModeMask
 	return fd, l.attachPoint.makeQID(stat), 0, nil
 }
 
-- 
cgit v1.2.3


From adb10f4d53cb951c7329e1355a784345ceea4b62 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 6 Nov 2019 17:55:04 -0800
Subject: Internal change.

PiperOrigin-RevId: 278979065
---
 pkg/tcpip/transport/tcp/endpoint.go |  5 +++++
 pkg/tcpip/transport/tcp/protocol.go | 18 ++++++++++++++-
 pkg/tcpip/transport/tcp/tcp_test.go | 45 +++++++++++++++++++++++++++++++++++++
 3 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index e31464c9b..d29f0f81b 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -594,6 +594,11 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		e.rcvAutoParams.disabled = !bool(mrb)
 	}
 
+	var de DelayEnabled
+	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
+		e.SetSockOptInt(tcpip.DelayOption, 1)
+	}
+
 	if p := s.GetTCPProbe(); p != nil {
 		e.probe = p
 	}
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index c4f1a84bb..c8e4a0d7e 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -60,6 +60,9 @@ const (
 // protocol. See: https://tools.ietf.org/html/rfc2018.
 type SACKEnabled bool
 
+// DelayEnabled option can be used to enable Nagle's algorithm in the TCP protocol.
+type DelayEnabled bool
+
 // SendBufferSizeOption allows the default, min and max send buffer sizes for
 // TCP endpoints to be queried or configured.
 type SendBufferSizeOption struct {
@@ -84,6 +87,7 @@ const (
 type protocol struct {
 	mu                         sync.Mutex
 	sackEnabled                bool
+	delayEnabled               bool
 	sendBufferSize             SendBufferSizeOption
 	recvBufferSize             ReceiveBufferSizeOption
 	congestionControl          string
@@ -97,7 +101,7 @@ func (*protocol) Number() tcpip.TransportProtocolNumber {
 }
 
 // NewEndpoint creates a new tcp endpoint.
-func (*protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	return newEndpoint(stack, netProto, waiterQueue), nil
 }
 
@@ -165,6 +169,12 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 		p.mu.Unlock()
 		return nil
 
+	case DelayEnabled:
+		p.mu.Lock()
+		p.delayEnabled = bool(v)
+		p.mu.Unlock()
+		return nil
+
 	case SendBufferSizeOption:
 		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
 			return tcpip.ErrInvalidOptionValue
@@ -216,6 +226,12 @@ func (p *protocol) Option(option interface{}) *tcpip.Error {
 		p.mu.Unlock()
 		return nil
 
+	case *DelayEnabled:
+		p.mu.Lock()
+		*v = DelayEnabled(p.delayEnabled)
+		p.mu.Unlock()
+		return nil
+
 	case *SendBufferSizeOption:
 		p.mu.Lock()
 		*v = p.sendBufferSize
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index beaa40210..f4ea5f091 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -4848,3 +4848,48 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 		payloadSize *= 2
 	}
 }
+
+func TestDelayEnabled(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	checkDelayOption(t, c, false, 0) // Delay is disabled by default.
+
+	for _, v := range []struct {
+		delayEnabled    tcp.DelayEnabled
+		wantDelayOption int
+	}{
+		{delayEnabled: false, wantDelayOption: 0},
+		{delayEnabled: true, wantDelayOption: 1},
+	} {
+		c := context.New(t, defaultMTU)
+		defer c.Cleanup()
+		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, v.delayEnabled); err != nil {
+			t.Fatalf("SetTransportProtocolOption(tcp, %t) failed: %v", v.delayEnabled, err)
+		}
+		checkDelayOption(t, c, v.delayEnabled, v.wantDelayOption)
+	}
+}
+
+func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.DelayEnabled, wantDelayOption int) {
+	t.Helper()
+
+	var gotDelayEnabled tcp.DelayEnabled
+	if err := c.Stack().TransportProtocolOption(tcp.ProtocolNumber, &gotDelayEnabled); err != nil {
+		t.Fatalf("TransportProtocolOption(tcp, &gotDelayEnabled) failed: %v", err)
+	}
+	if gotDelayEnabled != wantDelayEnabled {
+		t.Errorf("TransportProtocolOption(tcp, &gotDelayEnabled) got %t, want %t", gotDelayEnabled, wantDelayEnabled)
+	}
+
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, new(waiter.Queue))
+	if err != nil {
+		t.Fatalf("NewEndPoint(tcp, ipv4, new(waiter.Queue)) failed: %v", err)
+	}
+	gotDelayOption, err := ep.GetSockOptInt(tcpip.DelayOption)
+	if err != nil {
+		t.Fatalf("ep.GetSockOptInt(tcpip.DelayOption) failed: %v", err)
+	}
+	if gotDelayOption != wantDelayOption {
+		t.Errorf("ep.GetSockOptInt(tcpip.DelayOption) got: %d, want: %d", gotDelayOption, wantDelayOption)
+	}
+}
-- 
cgit v1.2.3


From 0c424ea73198866066ddc5e7047a3a357d313f46 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 6 Nov 2019 19:39:57 -0800
Subject: Rename nicid to nicID to follow go-readability initialisms

https://github.com/golang/go/wiki/CodeReviewComments#initialisms

This change does not introduce any new functionality. It just renames variables
from `nicid` to `nicID`.

PiperOrigin-RevId: 278992966
---
 pkg/tcpip/network/arp/arp.go              | 12 ++---
 pkg/tcpip/network/ipv4/ipv4.go            |  8 +--
 pkg/tcpip/network/ipv4/ipv4_test.go       |  4 +-
 pkg/tcpip/network/ipv6/icmp.go            |  8 +--
 pkg/tcpip/network/ipv6/ipv6.go            |  8 +--
 pkg/tcpip/stack/ndp.go                    |  8 +--
 pkg/tcpip/stack/ndp_test.go               | 48 ++++++++---------
 pkg/tcpip/stack/registration.go           | 12 ++---
 pkg/tcpip/stack/stack.go                  | 30 +++++------
 pkg/tcpip/stack/stack_test.go             | 86 +++++++++++++++----------------
 pkg/tcpip/stack/transport_demuxer_test.go |  8 +--
 pkg/tcpip/transport/icmp/endpoint.go      | 26 +++++-----
 pkg/tcpip/transport/packet/endpoint.go    |  6 +--
 pkg/tcpip/transport/tcp/endpoint.go       | 18 +++----
 pkg/tcpip/transport/udp/endpoint.go       | 50 +++++++++---------
 15 files changed, 166 insertions(+), 166 deletions(-)

diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 4161ebf87..0ee509ebe 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -42,7 +42,7 @@ const (
 
 // endpoint implements stack.NetworkEndpoint.
 type endpoint struct {
-	nicid         tcpip.NICID
+	nicID         tcpip.NICID
 	linkEP        stack.LinkEndpoint
 	linkAddrCache stack.LinkAddressCache
 }
@@ -58,7 +58,7 @@ func (e *endpoint) MTU() uint32 {
 }
 
 func (e *endpoint) NICID() tcpip.NICID {
-	return e.nicid
+	return e.nicID
 }
 
 func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
@@ -102,7 +102,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 	switch h.Op() {
 	case header.ARPRequest:
 		localAddr := tcpip.Address(h.ProtocolAddressTarget())
-		if e.linkAddrCache.CheckLocalAddress(e.nicid, header.IPv4ProtocolNumber, localAddr) == 0 {
+		if e.linkAddrCache.CheckLocalAddress(e.nicID, header.IPv4ProtocolNumber, localAddr) == 0 {
 			return // we have no useful answer, ignore the request
 		}
 		hdr := buffer.NewPrependable(int(e.linkEP.MaxHeaderLength()) + header.ARPSize)
@@ -118,7 +118,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 	case header.ARPReply:
 		addr := tcpip.Address(h.ProtocolAddressSender())
 		linkAddr := tcpip.LinkAddress(h.HardwareAddressSender())
-		e.linkAddrCache.AddLinkAddress(e.nicid, addr, linkAddr)
+		e.linkAddrCache.AddLinkAddress(e.nicID, addr, linkAddr)
 	}
 }
 
@@ -135,12 +135,12 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 	return tcpip.Address(h.ProtocolAddressSender()), ProtocolAddress
 }
 
-func (p *protocol) NewEndpoint(nicid tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, sender stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) {
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, sender stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) {
 	if addrWithPrefix.Address != ProtocolAddress {
 		return nil, tcpip.ErrBadLocalAddress
 	}
 	return &endpoint{
-		nicid:         nicid,
+		nicID:         nicID,
 		linkEP:        sender,
 		linkAddrCache: linkAddrCache,
 	}, nil
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 26f1402ed..ac16c8add 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -47,7 +47,7 @@ const (
 )
 
 type endpoint struct {
-	nicid         tcpip.NICID
+	nicID         tcpip.NICID
 	id            stack.NetworkEndpointID
 	prefixLen     int
 	linkEP        stack.LinkEndpoint
@@ -57,9 +57,9 @@ type endpoint struct {
 }
 
 // NewEndpoint creates a new ipv4 endpoint.
-func (p *protocol) NewEndpoint(nicid tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) {
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) {
 	e := &endpoint{
-		nicid:         nicid,
+		nicID:         nicID,
 		id:            stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
 		prefixLen:     addrWithPrefix.PrefixLen,
 		linkEP:        linkEP,
@@ -89,7 +89,7 @@ func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
 
 // NICID returns the ID of the NIC this endpoint belongs to.
 func (e *endpoint) NICID() tcpip.NICID {
-	return e.nicid
+	return e.nicID
 }
 
 // ID returns the ipv4 endpoint ID.
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index f100d84ee..01dfb5f20 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -451,7 +451,7 @@ func TestInvalidFragments(t *testing.T) {
 
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
-			const nicid tcpip.NICID = 42
+			const nicID tcpip.NICID = 42
 			s := stack.New(stack.Options{
 				NetworkProtocols: []stack.NetworkProtocol{
 					ipv4.NewProtocol(),
@@ -461,7 +461,7 @@ func TestInvalidFragments(t *testing.T) {
 			var linkAddr = tcpip.LinkAddress([]byte{0x30, 0x30, 0x30, 0x30, 0x30, 0x30})
 			var remoteLinkAddr = tcpip.LinkAddress([]byte{0x30, 0x30, 0x30, 0x30, 0x30, 0x31})
 			ep := channel.New(10, 1500, linkAddr)
-			s.CreateNIC(nicid, sniffer.New(ep))
+			s.CreateNIC(nicID, sniffer.New(ep))
 
 			for _, pkt := range tc.packets {
 				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, tcpip.PacketBuffer{
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 58f8e80df..6629951c6 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -180,7 +180,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		// rxNICID so the packet is processed as defined in RFC 4861,
 		// as per RFC 4862 section 5.4.3.
 
-		if e.linkAddrCache.CheckLocalAddress(e.nicid, ProtocolNumber, targetAddr) == 0 {
+		if e.linkAddrCache.CheckLocalAddress(e.nicID, ProtocolNumber, targetAddr) == 0 {
 			// We don't have a useful answer; the best we can do is ignore the request.
 			return
 		}
@@ -218,7 +218,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		//
 		// Furthermore, the entirety of NDP handling here seems to be
 		// contradicted by RFC 4861.
-		e.linkAddrCache.AddLinkAddress(e.nicid, r.RemoteAddress, r.RemoteLinkAddress)
+		e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, r.RemoteLinkAddress)
 
 		// RFC 4861 Neighbor Discovery for IP version 6 (IPv6)
 		//
@@ -274,9 +274,9 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		// inform the netstack integration that a duplicate address was
 		// detected outside of DAD.
 
-		e.linkAddrCache.AddLinkAddress(e.nicid, targetAddr, r.RemoteLinkAddress)
+		e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, r.RemoteLinkAddress)
 		if targetAddr != r.RemoteAddress {
-			e.linkAddrCache.AddLinkAddress(e.nicid, r.RemoteAddress, r.RemoteLinkAddress)
+			e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, r.RemoteLinkAddress)
 		}
 
 	case header.ICMPv6EchoRequest:
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 805d1739c..4cee848a1 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -43,7 +43,7 @@ const (
 )
 
 type endpoint struct {
-	nicid         tcpip.NICID
+	nicID         tcpip.NICID
 	id            stack.NetworkEndpointID
 	prefixLen     int
 	linkEP        stack.LinkEndpoint
@@ -65,7 +65,7 @@ func (e *endpoint) MTU() uint32 {
 
 // NICID returns the ID of the NIC this endpoint belongs to.
 func (e *endpoint) NICID() tcpip.NICID {
-	return e.nicid
+	return e.nicID
 }
 
 // ID returns the ipv6 endpoint ID.
@@ -218,9 +218,9 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 }
 
 // NewEndpoint creates a new ipv6 endpoint.
-func (p *protocol) NewEndpoint(nicid tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) {
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) {
 	return &endpoint{
-		nicid:         nicid,
+		nicID:         nicID,
 		id:            stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
 		prefixLen:     addrWithPrefix.PrefixLen,
 		linkEP:        linkEP,
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index a216242d8..8e49f7a56 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -78,7 +78,7 @@ const (
 // receive and handle NDP related events.
 type NDPDispatcher interface {
 	// OnDuplicateAddressDetectionStatus will be called when the DAD process
-	// for an address (addr) on a NIC (with ID nicid) completes. resolved
+	// for an address (addr) on a NIC (with ID nicID) completes. resolved
 	// will be set to true if DAD completed successfully (no duplicate addr
 	// detected); false otherwise (addr was detected to be a duplicate on
 	// the link the NIC is a part of, or it was stopped for some other
@@ -87,7 +87,7 @@ type NDPDispatcher interface {
 	//
 	// This function is permitted to block indefinitely without interfering
 	// with the stack's operation.
-	OnDuplicateAddressDetectionStatus(nicid tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error)
+	OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error)
 
 	// OnDefaultRouterDiscovered will be called when a new default router is
 	// discovered. Implementations must return true along with a new valid
@@ -97,7 +97,7 @@ type NDPDispatcher interface {
 	//
 	// This function is not permitted to block indefinitely. This function
 	// is also not permitted to call into the stack.
-	OnDefaultRouterDiscovered(nicid tcpip.NICID, addr tcpip.Address) (bool, []tcpip.Route)
+	OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) (bool, []tcpip.Route)
 
 	// OnDefaultRouterInvalidated will be called when a discovered default
 	// router is invalidated. Implementers must return a new valid route
@@ -105,7 +105,7 @@ type NDPDispatcher interface {
 	//
 	// This function is not permitted to block indefinitely. This function
 	// is also not permitted to call into the stack.
-	OnDefaultRouterInvalidated(nicid tcpip.NICID, addr tcpip.Address) []tcpip.Route
+	OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) []tcpip.Route
 }
 
 // NDPConfigurations is the NDP configurations for the netstack.
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 0dbe4da9d..50ce1bbfa 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -83,14 +83,14 @@ func TestDADDisabled(t *testing.T) {
 // ndpDADEvent is a set of parameters that was passed to
 // ndpDispatcher.OnDuplicateAddressDetectionStatus.
 type ndpDADEvent struct {
-	nicid    tcpip.NICID
+	nicID    tcpip.NICID
 	addr     tcpip.Address
 	resolved bool
 	err      *tcpip.Error
 }
 
 type ndpRouterEvent struct {
-	nicid tcpip.NICID
+	nicID tcpip.NICID
 	addr  tcpip.Address
 	// true if router was discovered, false if invalidated.
 	discovered bool
@@ -108,10 +108,10 @@ type ndpDispatcher struct {
 }
 
 // Implements stack.NDPDispatcher.OnDuplicateAddressDetectionStatus.
-func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicid tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) {
+func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) {
 	if n.dadC != nil {
 		n.dadC <- ndpDADEvent{
-			nicid,
+			nicID,
 			addr,
 			resolved,
 			err,
@@ -120,10 +120,10 @@ func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicid tcpip.NICID, add
 }
 
 // Implements stack.NDPDispatcher.OnDefaultRouterDiscovered.
-func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicid tcpip.NICID, addr tcpip.Address) (bool, []tcpip.Route) {
+func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) (bool, []tcpip.Route) {
 	if n.routerC != nil {
 		n.routerC <- ndpRouterEvent{
-			nicid,
+			nicID,
 			addr,
 			true,
 		}
@@ -137,17 +137,17 @@ func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicid tcpip.NICID, addr tcpip.
 	rt = append(rt, tcpip.Route{
 		Destination: header.IPv6EmptySubnet,
 		Gateway:     addr,
-		NIC:         nicid,
+		NIC:         nicID,
 	})
 	n.routeTable = rt
 	return true, rt
 }
 
 // Implements stack.NDPDispatcher.OnDefaultRouterInvalidated.
-func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicid tcpip.NICID, addr tcpip.Address) []tcpip.Route {
+func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) []tcpip.Route {
 	if n.routerC != nil {
 		n.routerC <- ndpRouterEvent{
-			nicid,
+			nicID,
 			addr,
 			false,
 		}
@@ -157,7 +157,7 @@ func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicid tcpip.NICID, addr tcpip
 	exclude := tcpip.Route{
 		Destination: header.IPv6EmptySubnet,
 		Gateway:     addr,
-		NIC:         nicid,
+		NIC:         nicID,
 	}
 
 	for _, r := range n.routeTable {
@@ -254,8 +254,8 @@ func TestDADResolve(t *testing.T) {
 				if e.err != nil {
 					t.Fatal("got DAD error: ", e.err)
 				}
-				if e.nicid != 1 {
-					t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+				if e.nicID != 1 {
+					t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
 				}
 				if e.addr != addr1 {
 					t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
@@ -421,8 +421,8 @@ func TestDADFail(t *testing.T) {
 				if e.err != nil {
 					t.Fatal("got DAD error: ", e.err)
 				}
-				if e.nicid != 1 {
-					t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+				if e.nicID != 1 {
+					t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
 				}
 				if e.addr != addr1 {
 					t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
@@ -492,8 +492,8 @@ func TestDADStop(t *testing.T) {
 		if e.err != nil {
 			t.Fatal("got DAD error: ", e.err)
 		}
-		if e.nicid != 1 {
-			t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+		if e.nicID != 1 {
+			t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
 		}
 		if e.addr != addr1 {
 			t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
@@ -661,8 +661,8 @@ func TestSetNDPConfigurations(t *testing.T) {
 				if e.err != nil {
 					t.Fatal("got DAD error: ", e.err)
 				}
-				if e.nicid != 1 {
-					t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+				if e.nicID != 1 {
+					t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
 				}
 				if e.addr != addr1 {
 					t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
@@ -786,8 +786,8 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
 	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, uint16(lifetime)))
 	select {
 	case r := <-ndpDisp.routerC:
-		if r.nicid != 1 {
-			t.Fatalf("got r.nicid = %d, want = 1", r.nicid)
+		if r.nicID != 1 {
+			t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
 		}
 		if r.addr != llAddr2 {
 			t.Fatalf("got r.addr = %s, want = %s", r.addr, llAddr2)
@@ -839,8 +839,8 @@ func TestRouterDiscovery(t *testing.T) {
 
 		select {
 		case r := <-ndpDisp.routerC:
-			if r.nicid != 1 {
-				t.Fatalf("got r.nicid = %d, want = 1", r.nicid)
+			if r.nicID != 1 {
+				t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
 			}
 			if r.addr != addr {
 				t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
@@ -983,8 +983,8 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
 			expectedRt[i-1] = tcpip.Route{header.IPv6EmptySubnet, llAddr, 1}
 			select {
 			case r := <-ndpDisp.routerC:
-				if r.nicid != 1 {
-					t.Fatalf("got r.nicid = %d, want = 1", r.nicid)
+				if r.nicID != 1 {
+					t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
 				}
 				if r.addr != llAddr {
 					t.Fatalf("got r.addr = %s, want = %s", r.addr, llAddr)
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 5806d294c..c0026f5a3 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -117,7 +117,7 @@ type PacketEndpoint interface {
 	// should construct its own ethernet header for applications.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(nicid tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
+	HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
 }
 
 // TransportProtocol is the interface that needs to be implemented by transport
@@ -281,7 +281,7 @@ type NetworkProtocol interface {
 	ParseAddresses(v buffer.View) (src, dst tcpip.Address)
 
 	// NewEndpoint creates a new endpoint of this protocol.
-	NewEndpoint(nicid tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, sender LinkEndpoint) (NetworkEndpoint, *tcpip.Error)
+	NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, sender LinkEndpoint) (NetworkEndpoint, *tcpip.Error)
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
@@ -440,10 +440,10 @@ type LinkAddressResolver interface {
 type LinkAddressCache interface {
 	// CheckLocalAddress determines if the given local address exists, and if it
 	// does not exist.
-	CheckLocalAddress(nicid tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID
+	CheckLocalAddress(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID
 
 	// AddLinkAddress adds a link address to the cache.
-	AddLinkAddress(nicid tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress)
+	AddLinkAddress(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress)
 
 	// GetLinkAddress looks up the cache to translate address to link address (e.g. IP -> MAC).
 	// If the LinkEndpoint requests address resolution and there is a LinkAddressResolver
@@ -454,10 +454,10 @@ type LinkAddressCache interface {
 	// If address resolution is required, ErrNoLinkAddress and a notification channel is
 	// returned for the top level caller to block. Channel is closed once address resolution
 	// is complete (success or not).
-	GetLinkAddress(nicid tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, w *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error)
+	GetLinkAddress(nicID tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, w *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error)
 
 	// RemoveWaker removes a waker that has been added in GetLinkAddress().
-	RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.Waker)
+	RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep.Waker)
 }
 
 // RawFactory produces endpoints for writing various types of raw packets.
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 08599d765..99809df75 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -1055,13 +1055,13 @@ func (s *Stack) CheckNetworkProtocol(protocol tcpip.NetworkProtocolNumber) bool
 // CheckLocalAddress determines if the given local address exists, and if it
 // does, returns the id of the NIC it's bound to. Returns 0 if the address
 // does not exist.
-func (s *Stack) CheckLocalAddress(nicid tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID {
+func (s *Stack) CheckLocalAddress(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
 	// If a NIC is specified, we try to find the address there only.
-	if nicid != 0 {
-		nic := s.nics[nicid]
+	if nicID != 0 {
+		nic := s.nics[nicID]
 		if nic == nil {
 			return 0
 		}
@@ -1120,35 +1120,35 @@ func (s *Stack) SetSpoofing(nicID tcpip.NICID, enable bool) *tcpip.Error {
 }
 
 // AddLinkAddress adds a link address to the stack link cache.
-func (s *Stack) AddLinkAddress(nicid tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) {
-	fullAddr := tcpip.FullAddress{NIC: nicid, Addr: addr}
+func (s *Stack) AddLinkAddress(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) {
+	fullAddr := tcpip.FullAddress{NIC: nicID, Addr: addr}
 	s.linkAddrCache.add(fullAddr, linkAddr)
 	// TODO: provide a way for a transport endpoint to receive a signal
 	// that AddLinkAddress for a particular address has been called.
 }
 
 // GetLinkAddress implements LinkAddressCache.GetLinkAddress.
-func (s *Stack) GetLinkAddress(nicid tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) {
+func (s *Stack) GetLinkAddress(nicID tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) {
 	s.mu.RLock()
-	nic := s.nics[nicid]
+	nic := s.nics[nicID]
 	if nic == nil {
 		s.mu.RUnlock()
 		return "", nil, tcpip.ErrUnknownNICID
 	}
 	s.mu.RUnlock()
 
-	fullAddr := tcpip.FullAddress{NIC: nicid, Addr: addr}
+	fullAddr := tcpip.FullAddress{NIC: nicID, Addr: addr}
 	linkRes := s.linkAddrResolvers[protocol]
 	return s.linkAddrCache.get(fullAddr, linkRes, localAddr, nic.linkEP, waker)
 }
 
 // RemoveWaker implements LinkAddressCache.RemoveWaker.
-func (s *Stack) RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) {
+func (s *Stack) RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
-	if nic := s.nics[nicid]; nic == nil {
-		fullAddr := tcpip.FullAddress{NIC: nicid, Addr: addr}
+	if nic := s.nics[nicID]; nic == nil {
+		fullAddr := tcpip.FullAddress{NIC: nicID, Addr: addr}
 		s.linkAddrCache.removeWaker(fullAddr, waker)
 	}
 }
@@ -1344,9 +1344,9 @@ func (s *Stack) unregisterPacketEndpointLocked(nicID tcpip.NICID, netProto tcpip
 
 // WritePacket writes data directly to the specified NIC. It adds an ethernet
 // header based on the arguments.
-func (s *Stack) WritePacket(nicid tcpip.NICID, dst tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) *tcpip.Error {
+func (s *Stack) WritePacket(nicID tcpip.NICID, dst tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) *tcpip.Error {
 	s.mu.Lock()
-	nic, ok := s.nics[nicid]
+	nic, ok := s.nics[nicID]
 	s.mu.Unlock()
 	if !ok {
 		return tcpip.ErrUnknownDevice
@@ -1372,9 +1372,9 @@ func (s *Stack) WritePacket(nicid tcpip.NICID, dst tcpip.LinkAddress, netProto t
 
 // WriteRawPacket writes data directly to the specified NIC without adding any
 // headers.
-func (s *Stack) WriteRawPacket(nicid tcpip.NICID, payload buffer.VectorisedView) *tcpip.Error {
+func (s *Stack) WriteRawPacket(nicID tcpip.NICID, payload buffer.VectorisedView) *tcpip.Error {
 	s.mu.Lock()
-	nic, ok := s.nics[nicid]
+	nic, ok := s.nics[nicID]
 	s.mu.Unlock()
 	if !ok {
 		return tcpip.ErrUnknownDevice
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 1fac5477f..bf1d6974c 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -58,7 +58,7 @@ const (
 // use the first three: destination address, source address, and transport
 // protocol. They're all one byte fields to simplify parsing.
 type fakeNetworkEndpoint struct {
-	nicid      tcpip.NICID
+	nicID      tcpip.NICID
 	id         stack.NetworkEndpointID
 	prefixLen  int
 	proto      *fakeNetworkProtocol
@@ -71,7 +71,7 @@ func (f *fakeNetworkEndpoint) MTU() uint32 {
 }
 
 func (f *fakeNetworkEndpoint) NICID() tcpip.NICID {
-	return f.nicid
+	return f.nicID
 }
 
 func (f *fakeNetworkEndpoint) PrefixLen() int {
@@ -199,9 +199,9 @@ func (*fakeNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Addres
 	return tcpip.Address(v[1:2]), tcpip.Address(v[0:1])
 }
 
-func (f *fakeNetworkProtocol) NewEndpoint(nicid tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) {
+func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) {
 	return &fakeNetworkEndpoint{
-		nicid:      nicid,
+		nicID:      nicID,
 		id:         stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
 		prefixLen:  addrWithPrefix.PrefixLen,
 		proto:      f,
@@ -682,11 +682,11 @@ func TestAddressRemovalWithRouteHeld(t *testing.T) {
 	}
 }
 
-func verifyAddress(t *testing.T, s *stack.Stack, nicid tcpip.NICID, addr tcpip.Address) {
+func verifyAddress(t *testing.T, s *stack.Stack, nicID tcpip.NICID, addr tcpip.Address) {
 	t.Helper()
-	info, ok := s.NICInfo()[nicid]
+	info, ok := s.NICInfo()[nicID]
 	if !ok {
-		t.Fatalf("NICInfo() failed to find nicid=%d", nicid)
+		t.Fatalf("NICInfo() failed to find nicID=%d", nicID)
 	}
 	if len(addr) == 0 {
 		// No address given, verify that there is no address assigned to the NIC.
@@ -719,7 +719,7 @@ func TestEndpointExpiration(t *testing.T) {
 		localAddrByte byte          = 0x01
 		remoteAddr    tcpip.Address = "\x03"
 		noAddr        tcpip.Address = ""
-		nicid         tcpip.NICID   = 1
+		nicID         tcpip.NICID   = 1
 	)
 	localAddr := tcpip.Address([]byte{localAddrByte})
 
@@ -731,7 +731,7 @@ func TestEndpointExpiration(t *testing.T) {
 				})
 
 				ep := channel.New(10, defaultMTU, "")
-				if err := s.CreateNIC(nicid, ep); err != nil {
+				if err := s.CreateNIC(nicID, ep); err != nil {
 					t.Fatal("CreateNIC failed:", err)
 				}
 
@@ -748,13 +748,13 @@ func TestEndpointExpiration(t *testing.T) {
 				buf[0] = localAddrByte
 
 				if promiscuous {
-					if err := s.SetPromiscuousMode(nicid, true); err != nil {
+					if err := s.SetPromiscuousMode(nicID, true); err != nil {
 						t.Fatal("SetPromiscuousMode failed:", err)
 					}
 				}
 
 				if spoofing {
-					if err := s.SetSpoofing(nicid, true); err != nil {
+					if err := s.SetSpoofing(nicID, true); err != nil {
 						t.Fatal("SetSpoofing failed:", err)
 					}
 				}
@@ -762,7 +762,7 @@ func TestEndpointExpiration(t *testing.T) {
 				// 1. No Address yet, send should only work for spoofing, receive for
 				// promiscuous mode.
 				//-----------------------
-				verifyAddress(t, s, nicid, noAddr)
+				verifyAddress(t, s, nicID, noAddr)
 				if promiscuous {
 					testRecv(t, fakeNet, localAddrByte, ep, buf)
 				} else {
@@ -777,20 +777,20 @@ func TestEndpointExpiration(t *testing.T) {
 
 				// 2. Add Address, everything should work.
 				//-----------------------
-				if err := s.AddAddress(nicid, fakeNetNumber, localAddr); err != nil {
+				if err := s.AddAddress(nicID, fakeNetNumber, localAddr); err != nil {
 					t.Fatal("AddAddress failed:", err)
 				}
-				verifyAddress(t, s, nicid, localAddr)
+				verifyAddress(t, s, nicID, localAddr)
 				testRecv(t, fakeNet, localAddrByte, ep, buf)
 				testSendTo(t, s, remoteAddr, ep, nil)
 
 				// 3. Remove the address, send should only work for spoofing, receive
 				// for promiscuous mode.
 				//-----------------------
-				if err := s.RemoveAddress(nicid, localAddr); err != nil {
+				if err := s.RemoveAddress(nicID, localAddr); err != nil {
 					t.Fatal("RemoveAddress failed:", err)
 				}
-				verifyAddress(t, s, nicid, noAddr)
+				verifyAddress(t, s, nicID, noAddr)
 				if promiscuous {
 					testRecv(t, fakeNet, localAddrByte, ep, buf)
 				} else {
@@ -805,10 +805,10 @@ func TestEndpointExpiration(t *testing.T) {
 
 				// 4. Add Address back, everything should work again.
 				//-----------------------
-				if err := s.AddAddress(nicid, fakeNetNumber, localAddr); err != nil {
+				if err := s.AddAddress(nicID, fakeNetNumber, localAddr); err != nil {
 					t.Fatal("AddAddress failed:", err)
 				}
-				verifyAddress(t, s, nicid, localAddr)
+				verifyAddress(t, s, nicID, localAddr)
 				testRecv(t, fakeNet, localAddrByte, ep, buf)
 				testSendTo(t, s, remoteAddr, ep, nil)
 
@@ -826,10 +826,10 @@ func TestEndpointExpiration(t *testing.T) {
 				// 6. Remove the address. Send should only work for spoofing, receive
 				// for promiscuous mode.
 				//-----------------------
-				if err := s.RemoveAddress(nicid, localAddr); err != nil {
+				if err := s.RemoveAddress(nicID, localAddr); err != nil {
 					t.Fatal("RemoveAddress failed:", err)
 				}
-				verifyAddress(t, s, nicid, noAddr)
+				verifyAddress(t, s, nicID, noAddr)
 				if promiscuous {
 					testRecv(t, fakeNet, localAddrByte, ep, buf)
 				} else {
@@ -845,10 +845,10 @@ func TestEndpointExpiration(t *testing.T) {
 
 				// 7. Add Address back, everything should work again.
 				//-----------------------
-				if err := s.AddAddress(nicid, fakeNetNumber, localAddr); err != nil {
+				if err := s.AddAddress(nicID, fakeNetNumber, localAddr); err != nil {
 					t.Fatal("AddAddress failed:", err)
 				}
-				verifyAddress(t, s, nicid, localAddr)
+				verifyAddress(t, s, nicID, localAddr)
 				testRecv(t, fakeNet, localAddrByte, ep, buf)
 				testSendTo(t, s, remoteAddr, ep, nil)
 				testSend(t, r, ep, nil)
@@ -856,17 +856,17 @@ func TestEndpointExpiration(t *testing.T) {
 				// 8. Remove the route, sendTo/recv should still work.
 				//-----------------------
 				r.Release()
-				verifyAddress(t, s, nicid, localAddr)
+				verifyAddress(t, s, nicID, localAddr)
 				testRecv(t, fakeNet, localAddrByte, ep, buf)
 				testSendTo(t, s, remoteAddr, ep, nil)
 
 				// 9. Remove the address. Send should only work for spoofing, receive
 				// for promiscuous mode.
 				//-----------------------
-				if err := s.RemoveAddress(nicid, localAddr); err != nil {
+				if err := s.RemoveAddress(nicID, localAddr); err != nil {
 					t.Fatal("RemoveAddress failed:", err)
 				}
-				verifyAddress(t, s, nicid, noAddr)
+				verifyAddress(t, s, nicID, noAddr)
 				if promiscuous {
 					testRecv(t, fakeNet, localAddrByte, ep, buf)
 				} else {
@@ -1659,12 +1659,12 @@ func verifyAddresses(t *testing.T, expectedAddresses, gotAddresses []tcpip.Proto
 }
 
 func TestAddAddress(t *testing.T) {
-	const nicid = 1
+	const nicID = 1
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
 	})
 	ep := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(nicid, ep); err != nil {
+	if err := s.CreateNIC(nicID, ep); err != nil {
 		t.Fatal("CreateNIC failed:", err)
 	}
 
@@ -1672,7 +1672,7 @@ func TestAddAddress(t *testing.T) {
 	expectedAddresses := make([]tcpip.ProtocolAddress, 0, 2)
 	for _, addrLen := range []int{4, 16} {
 		address := addrGen.next(addrLen)
-		if err := s.AddAddress(nicid, fakeNetNumber, address); err != nil {
+		if err := s.AddAddress(nicID, fakeNetNumber, address); err != nil {
 			t.Fatalf("AddAddress(address=%s) failed: %s", address, err)
 		}
 		expectedAddresses = append(expectedAddresses, tcpip.ProtocolAddress{
@@ -1681,17 +1681,17 @@ func TestAddAddress(t *testing.T) {
 		})
 	}
 
-	gotAddresses := s.AllAddresses()[nicid]
+	gotAddresses := s.AllAddresses()[nicID]
 	verifyAddresses(t, expectedAddresses, gotAddresses)
 }
 
 func TestAddProtocolAddress(t *testing.T) {
-	const nicid = 1
+	const nicID = 1
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
 	})
 	ep := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(nicid, ep); err != nil {
+	if err := s.CreateNIC(nicID, ep); err != nil {
 		t.Fatal("CreateNIC failed:", err)
 	}
 
@@ -1708,24 +1708,24 @@ func TestAddProtocolAddress(t *testing.T) {
 					PrefixLen: prefixLen,
 				},
 			}
-			if err := s.AddProtocolAddress(nicid, protocolAddress); err != nil {
+			if err := s.AddProtocolAddress(nicID, protocolAddress); err != nil {
 				t.Errorf("AddProtocolAddress(%+v) failed: %s", protocolAddress, err)
 			}
 			expectedAddresses = append(expectedAddresses, protocolAddress)
 		}
 	}
 
-	gotAddresses := s.AllAddresses()[nicid]
+	gotAddresses := s.AllAddresses()[nicID]
 	verifyAddresses(t, expectedAddresses, gotAddresses)
 }
 
 func TestAddAddressWithOptions(t *testing.T) {
-	const nicid = 1
+	const nicID = 1
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
 	})
 	ep := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(nicid, ep); err != nil {
+	if err := s.CreateNIC(nicID, ep); err != nil {
 		t.Fatal("CreateNIC failed:", err)
 	}
 
@@ -1736,7 +1736,7 @@ func TestAddAddressWithOptions(t *testing.T) {
 	for _, addrLen := range addrLenRange {
 		for _, behavior := range behaviorRange {
 			address := addrGen.next(addrLen)
-			if err := s.AddAddressWithOptions(nicid, fakeNetNumber, address, behavior); err != nil {
+			if err := s.AddAddressWithOptions(nicID, fakeNetNumber, address, behavior); err != nil {
 				t.Fatalf("AddAddressWithOptions(address=%s, behavior=%d) failed: %s", address, behavior, err)
 			}
 			expectedAddresses = append(expectedAddresses, tcpip.ProtocolAddress{
@@ -1746,17 +1746,17 @@ func TestAddAddressWithOptions(t *testing.T) {
 		}
 	}
 
-	gotAddresses := s.AllAddresses()[nicid]
+	gotAddresses := s.AllAddresses()[nicID]
 	verifyAddresses(t, expectedAddresses, gotAddresses)
 }
 
 func TestAddProtocolAddressWithOptions(t *testing.T) {
-	const nicid = 1
+	const nicID = 1
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
 	})
 	ep := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(nicid, ep); err != nil {
+	if err := s.CreateNIC(nicID, ep); err != nil {
 		t.Fatal("CreateNIC failed:", err)
 	}
 
@@ -1775,7 +1775,7 @@ func TestAddProtocolAddressWithOptions(t *testing.T) {
 						PrefixLen: prefixLen,
 					},
 				}
-				if err := s.AddProtocolAddressWithOptions(nicid, protocolAddress, behavior); err != nil {
+				if err := s.AddProtocolAddressWithOptions(nicID, protocolAddress, behavior); err != nil {
 					t.Fatalf("AddProtocolAddressWithOptions(%+v, %d) failed: %s", protocolAddress, behavior, err)
 				}
 				expectedAddresses = append(expectedAddresses, protocolAddress)
@@ -1783,7 +1783,7 @@ func TestAddProtocolAddressWithOptions(t *testing.T) {
 		}
 	}
 
-	gotAddresses := s.AllAddresses()[nicid]
+	gotAddresses := s.AllAddresses()[nicID]
 	verifyAddresses(t, expectedAddresses, gotAddresses)
 }
 
@@ -2030,8 +2030,8 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) {
 		if e.err != nil {
 			t.Fatal("got DAD error: ", e.err)
 		}
-		if e.nicid != 1 {
-			t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+		if e.nicID != 1 {
+			t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
 		}
 		if e.addr != linkLocalAddr {
 			t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, linkLocalAddr)
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index f54117c4e..3b28b06d0 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -79,17 +79,17 @@ func newDualTestContextMultiNic(t *testing.T, mtu uint32, linkEpNames []string)
 	linkEPs := make(map[string]*channel.Endpoint)
 	for i, linkEpName := range linkEpNames {
 		channelEP := channel.New(256, mtu, "")
-		nicid := tcpip.NICID(i + 1)
-		if err := s.CreateNamedNIC(nicid, linkEpName, channelEP); err != nil {
+		nicID := tcpip.NICID(i + 1)
+		if err := s.CreateNamedNIC(nicID, linkEpName, channelEP); err != nil {
 			t.Fatalf("CreateNIC failed: %v", err)
 		}
 		linkEPs[linkEpName] = channelEP
 
-		if err := s.AddAddress(nicid, ipv4.ProtocolNumber, stackAddr); err != nil {
+		if err := s.AddAddress(nicID, ipv4.ProtocolNumber, stackAddr); err != nil {
 			t.Fatalf("AddAddress IPv4 failed: %v", err)
 		}
 
-		if err := s.AddAddress(nicid, ipv6.ProtocolNumber, stackV6Addr); err != nil {
+		if err := s.AddAddress(nicID, ipv6.ProtocolNumber, stackV6Addr); err != nil {
 			t.Fatalf("AddAddress IPv6 failed: %v", err)
 		}
 	}
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 0092d0ea9..70e008d36 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -278,13 +278,13 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 	} else {
 		// Reject destination address if it goes through a different
 		// NIC than the endpoint was bound to.
-		nicid := to.NIC
+		nicID := to.NIC
 		if e.BindNICID != 0 {
-			if nicid != 0 && nicid != e.BindNICID {
+			if nicID != 0 && nicID != e.BindNICID {
 				return 0, nil, tcpip.ErrNoRoute
 			}
 
-			nicid = e.BindNICID
+			nicID = e.BindNICID
 		}
 
 		toCopy := *to
@@ -295,7 +295,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		}
 
 		// Find the enpoint.
-		r, err := e.stack.FindRoute(nicid, e.BindAddr, to.Addr, netProto, false /* multicastLoop */)
+		r, err := e.stack.FindRoute(nicID, e.BindAddr, to.Addr, netProto, false /* multicastLoop */)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -483,7 +483,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
-	nicid := addr.NIC
+	nicID := addr.NIC
 	localPort := uint16(0)
 	switch e.state {
 	case stateBound, stateConnected:
@@ -492,11 +492,11 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 			break
 		}
 
-		if nicid != 0 && nicid != e.BindNICID {
+		if nicID != 0 && nicID != e.BindNICID {
 			return tcpip.ErrInvalidEndpointState
 		}
 
-		nicid = e.BindNICID
+		nicID = e.BindNICID
 	default:
 		return tcpip.ErrInvalidEndpointState
 	}
@@ -507,7 +507,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	}
 
 	// Find a route to the desired destination.
-	r, err := e.stack.FindRoute(nicid, e.BindAddr, addr.Addr, netProto, false /* multicastLoop */)
+	r, err := e.stack.FindRoute(nicID, e.BindAddr, addr.Addr, netProto, false /* multicastLoop */)
 	if err != nil {
 		return err
 	}
@@ -524,14 +524,14 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	// v6only is set to false and this is an ipv6 endpoint.
 	netProtos := []tcpip.NetworkProtocolNumber{netProto}
 
-	id, err = e.registerWithStack(nicid, netProtos, id)
+	id, err = e.registerWithStack(nicID, netProtos, id)
 	if err != nil {
 		return err
 	}
 
 	e.ID = id
 	e.route = r.Clone()
-	e.RegisterNICID = nicid
+	e.RegisterNICID = nicID
 
 	e.state = stateConnected
 
@@ -582,18 +582,18 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	return nil, nil, tcpip.ErrNotSupported
 }
 
-func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) {
+func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) {
 	if id.LocalPort != 0 {
 		// The endpoint already has a local port, just attempt to
 		// register it.
-		err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.TransProto, id, e, false /* reuse */, 0 /* bindToDevice */)
+		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, false /* reuse */, 0 /* bindToDevice */)
 		return id, err
 	}
 
 	// We need to find a port for the endpoint.
 	_, err := e.stack.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) {
 		id.LocalPort = p
-		err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.TransProto, id, e, false /* reuse */, 0 /* bindtodevice */)
+		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, false /* reuse */, 0 /* bindtodevice */)
 		switch err {
 		case nil:
 			return true, nil
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 26335094e..0010b5e5f 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -266,7 +266,7 @@ func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 // HandlePacket implements stack.PacketEndpoint.HandlePacket.
-func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	ep.rcvMu.Lock()
 
 	// Drop the packet if our buffer is currently full.
@@ -293,13 +293,13 @@ func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress,
 		// Get info directly from the ethernet header.
 		hdr := header.Ethernet(pkt.LinkHeader)
 		packet.senderAddr = tcpip.FullAddress{
-			NIC:  nicid,
+			NIC:  nicID,
 			Addr: tcpip.Address(hdr.SourceAddress()),
 		}
 	} else {
 		// Guess the would-be ethernet header.
 		packet.senderAddr = tcpip.FullAddress{
-			NIC:  nicid,
+			NIC:  nicID,
 			Addr: tcpip.Address(localAddr),
 		}
 	}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index d29f0f81b..79fec6b77 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1214,9 +1214,9 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 			e.bindToDevice = 0
 			return nil
 		}
-		for nicid, nic := range e.stack.NICInfo() {
+		for nicID, nic := range e.stack.NICInfo() {
 			if nic.Name == string(v) {
-				e.bindToDevice = nicid
+				e.bindToDevice = nicID
 				return nil
 			}
 		}
@@ -1634,7 +1634,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 		return tcpip.ErrAlreadyConnected
 	}
 
-	nicid := addr.NIC
+	nicID := addr.NIC
 	switch e.state {
 	case StateBound:
 		// If we're already bound to a NIC but the caller is requesting
@@ -1643,11 +1643,11 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 			break
 		}
 
-		if nicid != 0 && nicid != e.boundNICID {
+		if nicID != 0 && nicID != e.boundNICID {
 			return tcpip.ErrNoRoute
 		}
 
-		nicid = e.boundNICID
+		nicID = e.boundNICID
 
 	case StateInitial:
 		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
@@ -1666,7 +1666,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	}
 
 	// Find a route to the desired destination.
-	r, err := e.stack.FindRoute(nicid, e.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
+	r, err := e.stack.FindRoute(nicID, e.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
 	if err != nil {
 		return err
 	}
@@ -1681,7 +1681,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 
 	if e.ID.LocalPort != 0 {
 		// The endpoint is bound to a port, attempt to register it.
-		err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, e.ID, e, e.reusePort, e.bindToDevice)
+		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, e.ID, e, e.reusePort, e.bindToDevice)
 		if err != nil {
 			return err
 		}
@@ -1716,7 +1716,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 
 			id := e.ID
 			id.LocalPort = p
-			switch e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) {
+			switch e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) {
 			case nil:
 				e.ID = id
 				return true, nil
@@ -1741,7 +1741,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	e.isRegistered = true
 	e.state = StateConnecting
 	e.route = r.Clone()
-	e.boundNICID = nicid
+	e.boundNICID = nicID
 	e.effectiveNetProtos = netProtos
 	e.connectingAddress = connectingAddr
 
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 4e11de9db..5270f24df 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -282,7 +282,7 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi
 // connectRoute establishes a route to the specified interface or the
 // configured multicast interface if no interface is specified and the
 // specified address is a multicast address.
-func (e *endpoint) connectRoute(nicid tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (stack.Route, tcpip.NICID, *tcpip.Error) {
+func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (stack.Route, tcpip.NICID, *tcpip.Error) {
 	localAddr := e.ID.LocalAddress
 	if isBroadcastOrMulticast(localAddr) {
 		// A packet can only originate from a unicast address (i.e., an interface).
@@ -290,20 +290,20 @@ func (e *endpoint) connectRoute(nicid tcpip.NICID, addr tcpip.FullAddress, netPr
 	}
 
 	if header.IsV4MulticastAddress(addr.Addr) || header.IsV6MulticastAddress(addr.Addr) {
-		if nicid == 0 {
-			nicid = e.multicastNICID
+		if nicID == 0 {
+			nicID = e.multicastNICID
 		}
-		if localAddr == "" && nicid == 0 {
+		if localAddr == "" && nicID == 0 {
 			localAddr = e.multicastAddr
 		}
 	}
 
 	// Find a route to the desired destination.
-	r, err := e.stack.FindRoute(nicid, localAddr, addr.Addr, netProto, e.multicastLoop)
+	r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.multicastLoop)
 	if err != nil {
 		return stack.Route{}, 0, err
 	}
-	return r, nicid, nil
+	return r, nicID, nil
 }
 
 // Write writes data to the endpoint's peer. This method does not block
@@ -382,13 +382,13 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 	} else {
 		// Reject destination address if it goes through a different
 		// NIC than the endpoint was bound to.
-		nicid := to.NIC
+		nicID := to.NIC
 		if e.BindNICID != 0 {
-			if nicid != 0 && nicid != e.BindNICID {
+			if nicID != 0 && nicID != e.BindNICID {
 				return 0, nil, tcpip.ErrNoRoute
 			}
 
-			nicid = e.BindNICID
+			nicID = e.BindNICID
 		}
 
 		if to.Addr == header.IPv4Broadcast && !e.broadcast {
@@ -400,7 +400,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 			return 0, nil, err
 		}
 
-		r, _, err := e.connectRoute(nicid, *to, netProto)
+		r, _, err := e.connectRoute(nicID, *to, netProto)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -622,9 +622,9 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 			e.bindToDevice = 0
 			return nil
 		}
-		for nicid, nic := range e.stack.NICInfo() {
+		for nicID, nic := range e.stack.NICInfo() {
 			if nic.Name == string(v) {
-				e.bindToDevice = nicid
+				e.bindToDevice = nicID
 				return nil
 			}
 		}
@@ -907,7 +907,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
-	nicid := addr.NIC
+	nicID := addr.NIC
 	var localPort uint16
 	switch e.state {
 	case StateInitial:
@@ -917,16 +917,16 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 			break
 		}
 
-		if nicid != 0 && nicid != e.BindNICID {
+		if nicID != 0 && nicID != e.BindNICID {
 			return tcpip.ErrInvalidEndpointState
 		}
 
-		nicid = e.BindNICID
+		nicID = e.BindNICID
 	default:
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	r, nicid, err := e.connectRoute(nicid, addr, netProto)
+	r, nicID, err := e.connectRoute(nicID, addr, netProto)
 	if err != nil {
 		return err
 	}
@@ -954,7 +954,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		}
 	}
 
-	id, err = e.registerWithStack(nicid, netProtos, id)
+	id, err = e.registerWithStack(nicID, netProtos, id)
 	if err != nil {
 		return err
 	}
@@ -967,7 +967,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	e.ID = id
 	e.route = r.Clone()
 	e.dstPort = addr.Port
-	e.RegisterNICID = nicid
+	e.RegisterNICID = nicID
 	e.effectiveNetProtos = netProtos
 
 	e.state = StateConnected
@@ -1022,7 +1022,7 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	return nil, nil, tcpip.ErrNotSupported
 }
 
-func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) {
+func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) {
 	if e.ID.LocalPort == 0 {
 		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.reusePort, e.bindToDevice)
 		if err != nil {
@@ -1031,7 +1031,7 @@ func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.Networ
 		id.LocalPort = port
 	}
 
-	err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice)
+	err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice)
 	if err != nil {
 		e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.bindToDevice)
 	}
@@ -1061,11 +1061,11 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		}
 	}
 
-	nicid := addr.NIC
+	nicID := addr.NIC
 	if len(addr.Addr) != 0 && !isBroadcastOrMulticast(addr.Addr) {
 		// A local unicast address was specified, verify that it's valid.
-		nicid = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
-		if nicid == 0 {
+		nicID = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
+		if nicID == 0 {
 			return tcpip.ErrBadLocalAddress
 		}
 	}
@@ -1074,13 +1074,13 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		LocalPort:    addr.Port,
 		LocalAddress: addr.Addr,
 	}
-	id, err = e.registerWithStack(nicid, netProtos, id)
+	id, err = e.registerWithStack(nicID, netProtos, id)
 	if err != nil {
 		return err
 	}
 
 	e.ID = id
-	e.RegisterNICID = nicid
+	e.RegisterNICID = nicID
 	e.effectiveNetProtos = netProtos
 
 	// Mark endpoint as bound.
-- 
cgit v1.2.3


From 3552691137284525a33d3de7e3c2d170da66c8ac Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 6 Nov 2019 22:28:41 -0800
Subject: Fix data race in syscall_test_runner.go

Fixes #1140

PiperOrigin-RevId: 279012793
---
 test/syscalls/syscall_test_runner.go | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
index 856398994..7186a8ddc 100644
--- a/test/syscalls/syscall_test_runner.go
+++ b/test/syscalls/syscall_test_runner.go
@@ -208,14 +208,15 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
 		}
 		log.Warningf("%s: Got signal: %v", name, s)
 		done := make(chan bool)
-		go func() {
-			dArgs := append(args, "-alsologtostderr=true", "debug", "--stacks", id)
+		dArgs := append([]string{}, args...)
+		dArgs = append(dArgs, "-alsologtostderr=true", "debug", "--stacks", id)
+		go func(dArgs []string) {
 			cmd := exec.Command(*runscPath, dArgs...)
 			cmd.Stdout = os.Stdout
 			cmd.Stderr = os.Stderr
 			cmd.Run()
 			done <- true
-		}()
+		}(dArgs)
 
 		timeout := time.After(3 * time.Second)
 		select {
@@ -225,7 +226,7 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
 		}
 
 		log.Warningf("Send SIGTERM to the sandbox process")
-		dArgs := append(args, "debug",
+		dArgs = append(args, "debug",
 			fmt.Sprintf("--signal=%d", syscall.SIGTERM),
 			id)
 		cmd = exec.Command(*runscPath, dArgs...)
-- 
cgit v1.2.3


From 2326224a9652201938df2881be055ab352672587 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 6 Nov 2019 23:50:54 -0800
Subject: Fix yet another data race.

Fixes #1140

PiperOrigin-RevId: 279020846
---
 test/syscalls/syscall_test_runner.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
index 7186a8ddc..accf46347 100644
--- a/test/syscalls/syscall_test_runner.go
+++ b/test/syscalls/syscall_test_runner.go
@@ -229,7 +229,7 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
 		dArgs = append(args, "debug",
 			fmt.Sprintf("--signal=%d", syscall.SIGTERM),
 			id)
-		cmd = exec.Command(*runscPath, dArgs...)
+		cmd := exec.Command(*runscPath, dArgs...)
 		cmd.Stdout = os.Stdout
 		cmd.Stderr = os.Stderr
 		cmd.Run()
-- 
cgit v1.2.3


From 66ebb6575f929a389d3c929977ed5e31d706fcfe Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 7 Nov 2019 09:45:26 -0800
Subject: Add support for TIME_WAIT timeout.

This change adds explicit support for honoring the 2MSL timeout
for sockets in TIME_WAIT state. It also adds support for the
TCP_LINGER2 option that allows modification of the FIN_WAIT2
state timeout duration for a given socket.

It also adds an option to modify the Stack wide TIME_WAIT timeout
but this is only for testing. On Linux this is fixed at 60s.

Further, we also now correctly process RST's in CLOSE_WAIT and
close the socket similar to linux without moving it to error
state.

We also now handle SYN in ESTABLISHED state as per
RFC5961#section-4.1. Earlier we would just drop these SYNs.
Which can result in some tests that pass on linux to fail on
gVisor.

Netstack now honors TIME_WAIT correctly as well as handles the
following cases correctly.

- TCP RSTs in TIME_WAIT are ignored.
- A duplicate TCP FIN during TIME_WAIT extends the TIME_WAIT
  and a dup ACK is sent in response to the FIN as the dup FIN
  indicates potential loss of the original final ACK.
- An out of order segment during TIME_WAIT generates a dup ACK.
- A new SYN w/ a sequence number > the highest sequence number
  in the previous connection closes the TIME_WAIT early and
  opens a new connection.

Further to make the SYN case work correctly the ISN (Initial
Sequence Number) generation for Netstack has been updated to
be as per RFC. Its not a pure random number anymore and follows
the recommendation in https://tools.ietf.org/html/rfc6528#page-3.

The current hash used is not a cryptographically secure hash
function. A separate change will update the hash function used
to Siphash similar to what is used in Linux.

PiperOrigin-RevId: 279106406
---
 pkg/sentry/socket/netstack/netstack.go       |  20 +
 pkg/tcpip/adapters/gonet/gonet_test.go       |  12 +-
 pkg/tcpip/stack/stack.go                     |  20 +-
 pkg/tcpip/stack/transport_demuxer.go         |  33 +-
 pkg/tcpip/tcpip.go                           |  12 +-
 pkg/tcpip/transport/tcp/BUILD                |   2 +-
 pkg/tcpip/transport/tcp/accept.go            |  17 +-
 pkg/tcpip/transport/tcp/connect.go           | 322 ++++++++++++--
 pkg/tcpip/transport/tcp/endpoint.go          | 101 ++++-
 pkg/tcpip/transport/tcp/endpoint_state.go    |  26 +-
 pkg/tcpip/transport/tcp/protocol.go          |  43 ++
 pkg/tcpip/transport/tcp/rcv.go               | 167 ++++++-
 pkg/tcpip/transport/tcp/tcp_test.go          | 622 ++++++++++++++++++++++++++-
 test/syscalls/BUILD                          |  22 +-
 test/syscalls/linux/BUILD                    |   1 +
 test/syscalls/linux/socket_inet_loopback.cc  | 336 +++++++++++++++
 test/syscalls/linux/socket_ip_tcp_generic.cc |  93 ++++
 17 files changed, 1736 insertions(+), 113 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 27c6692c4..d92399efd 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1173,6 +1173,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 		copy(b, v)
 		return b, nil
 
+	case linux.TCP_LINGER2:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.TCPLingerTimeoutOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		return int32(time.Duration(v) / time.Second), nil
+
 	default:
 		emitUnimplementedEventTCP(t, name)
 	}
@@ -1556,6 +1568,14 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 		return nil
 
+	case linux.TCP_LINGER2:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+
+		v := usermem.ByteOrder.Uint32(optVal)
+		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))))
+
 	case linux.TCP_REPAIR_OPTIONS:
 		t.Kernel().EmitUnimplementedEvent(t)
 
diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go
index 8ced960bb..ee077ae83 100644
--- a/pkg/tcpip/adapters/gonet/gonet_test.go
+++ b/pkg/tcpip/adapters/gonet/gonet_test.go
@@ -151,10 +151,8 @@ func TestCloseReader(t *testing.T) {
 
 		buf := make([]byte, 256)
 		n, err := c.Read(buf)
-		got, ok := err.(*net.OpError)
-		want := tcpip.ErrConnectionAborted
-		if n != 0 || !ok || got.Err.Error() != want.String() {
-			t.Errorf("c.Read() = (%d, %v), want (0, OpError(%v))", n, err, want)
+		if n != 0 || err != io.EOF {
+			t.Errorf("c.Read() = (%d, %v), want (0, EOF)", n, err)
 		}
 	}()
 	sender, err := connect(s, addr)
@@ -203,10 +201,8 @@ func TestCloseReaderWithForwarder(t *testing.T) {
 
 		buf := make([]byte, 256)
 		n, e := c.Read(buf)
-		got, ok := e.(*net.OpError)
-		want := tcpip.ErrConnectionAborted
-		if n != 0 || !ok || got.Err.Error() != want.String() {
-			t.Errorf("c.Read() = (%d, %v), want (0, OpError(%v))", n, e, want)
+		if n != 0 || e != io.EOF {
+			t.Errorf("c.Read() = (%d, %v), want (0, EOF)", n, e)
 		}
 	})
 	s.SetTransportProtocolHandler(tcp.ProtocolNumber, fwd.HandlePacket)
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 99809df75..2f8d8e822 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -402,11 +402,11 @@ type Stack struct {
 	// by the stack.
 	icmpRateLimiter *ICMPRateLimiter
 
-	// portSeed is a one-time random value initialized at stack startup
+	// seed is a one-time random value initialized at stack startup
 	// and is used to seed the TCP port picking on active connections
 	//
 	// TODO(gvisor.dev/issue/940): S/R this field.
-	portSeed uint32
+	seed uint32
 
 	// ndpConfigs is the default NDP configurations used by interfaces.
 	ndpConfigs NDPConfigurations
@@ -544,7 +544,7 @@ func New(opts Options) *Stack {
 		stats:                opts.Stats.FillIn(),
 		handleLocal:          opts.HandleLocal,
 		icmpRateLimiter:      NewICMPRateLimiter(),
-		portSeed:             generateRandUint32(),
+		seed:                 generateRandUint32(),
 		ndpConfigs:           opts.NDPConfigs,
 		autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
 		uniqueIDGenerator:    opts.UniqueID,
@@ -1186,6 +1186,12 @@ func (s *Stack) CompleteTransportEndpointCleanup(ep TransportEndpoint) {
 	s.mu.Unlock()
 }
 
+// FindTransportEndpoint finds an endpoint that most closely matches the provided
+// id. If no endpoint is found it returns nil.
+func (s *Stack) FindTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, r *Route) TransportEndpoint {
+	return s.demux.findTransportEndpoint(netProto, transProto, id, r)
+}
+
 // RegisterRawTransportEndpoint registers the given endpoint with the stack
 // transport dispatcher. Received packets that match the provided transport
 // protocol will be delivered to the given endpoint.
@@ -1573,12 +1579,12 @@ func (s *Stack) HandleNDPRA(id tcpip.NICID, ip tcpip.Address, ra header.NDPRoute
 	return nil
 }
 
-// PortSeed returns a 32 bit value that can be used as a seed value for port
-// picking.
+// Seed returns a 32 bit value that can be used as a seed value for port
+// picking, ISN generation etc.
 //
 // NOTE: The seed is generated once during stack initialization only.
-func (s *Stack) PortSeed() uint32 {
-	return s.portSeed
+func (s *Stack) Seed() uint32 {
+	return s.seed
 }
 
 func generateRandUint32() uint32 {
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index 594570216..cb805522b 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -103,7 +103,6 @@ func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, p
 		epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
 		return
 	}
-
 	// multiPortEndpoints are guaranteed to have at least one element.
 	selectEndpoint(id, mpep, epsByNic.seed).HandlePacket(r, id, pkt)
 	epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
@@ -507,10 +506,40 @@ func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, id Tr
 	if ep, ok := eps.endpoints[nid]; ok {
 		matchedEPs = append(matchedEPs, ep)
 	}
-
 	return matchedEPs
 }
 
+// findTransportEndpoint find a single endpoint that most closely matches the provided id.
+func (d *transportDemuxer) findTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, r *Route) TransportEndpoint {
+	eps, ok := d.protocol[protocolIDs{netProto, transProto}]
+	if !ok {
+		return nil
+	}
+	// Try to find the endpoint.
+	eps.mu.RLock()
+	epsByNic := d.findEndpointLocked(eps, id)
+	// Fail if we didn't find one.
+	if epsByNic == nil {
+		eps.mu.RUnlock()
+		return nil
+	}
+
+	epsByNic.mu.RLock()
+	eps.mu.RUnlock()
+
+	mpep, ok := epsByNic.endpoints[r.ref.nic.ID()]
+	if !ok {
+		if mpep, ok = epsByNic.endpoints[0]; !ok {
+			epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+			return nil
+		}
+	}
+
+	ep := selectEndpoint(id, mpep, epsByNic.seed)
+	epsByNic.mu.RUnlock()
+	return ep
+}
+
 // findEndpointLocked returns the endpoint that most closely matches the given
 // id.
 func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, id TransportEndpointID) *endpointsByNic {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 3edb513d4..bd5eb89ca 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -586,6 +586,16 @@ type MaxSegOption int
 // A zero value indicates the default.
 type TTLOption uint8
 
+// TCPLingerTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
+// maximum duration for which a socket lingers in the TCP_FIN_WAIT_2 state
+// before being marked closed.
+type TCPLingerTimeoutOption time.Duration
+
+// TCPTimeWaitTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
+// maximum duration for which a socket lingers in the TIME_WAIT state
+// before being marked closed.
+type TCPTimeWaitTimeoutOption time.Duration
+
 // MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
 // TTL value for multicast messages. The default is 1.
 type MulticastTTLOption uint8
@@ -1329,8 +1339,8 @@ var (
 
 // GetDanglingEndpoints returns all dangling endpoints.
 func GetDanglingEndpoints() []Endpoint {
-	es := make([]Endpoint, 0, len(danglingEndpoints))
 	danglingEndpointsMu.Lock()
+	es := make([]Endpoint, 0, len(danglingEndpoints))
 	for e := range danglingEndpoints {
 		es = append(es, e)
 	}
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index f1dbc6f91..3f47b328d 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -71,7 +71,7 @@ filegroup(
 
 go_test(
     name = "tcp_test",
-    size = "small",
+    size = "medium",
     srcs = [
         "dual_stack_test.go",
         "sack_scoreboard_test.go",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index cb0e13ebc..0e8e0a2b4 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -269,8 +269,8 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions) (*endpoint, *tcpip.Error) {
 	// Create new endpoint.
 	irs := s.sequenceNumber
-	cookie := l.createCookie(s.id, irs, encodeMSS(opts.MSS))
-	ep, err := l.createConnectingEndpoint(s, cookie, irs, opts)
+	isn := generateSecureISN(s.id, l.stack.Seed())
+	ep, err := l.createConnectingEndpoint(s, isn, irs, opts)
 	if err != nil {
 		return nil, err
 	}
@@ -289,7 +289,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	// Perform the 3-way handshake.
 	h := newHandshake(ep, seqnum.Size(ep.initialReceiveWindow()))
 
-	h.resetToSynRcvd(cookie, irs, opts)
+	h.resetToSynRcvd(isn, irs, opts)
 	if err := h.execute(); err != nil {
 		ep.Close()
 		if l.listenEP != nil {
@@ -361,6 +361,7 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 	defer decSynRcvdCount()
 	defer e.decSynRcvdCount()
 	defer s.decRef()
+
 	n, err := ctx.createEndpointAndPerformHandshake(s, opts)
 	if err != nil {
 		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
@@ -368,6 +369,11 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 		return
 	}
 	ctx.removePendingEndpoint(n)
+	// Start the protocol goroutine.
+	wq := &waiter.Queue{}
+	n.startAcceptedLoop(wq)
+	e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
+
 	e.deliverAccepted(n)
 }
 
@@ -543,6 +549,11 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// number of goroutines as we do check before
 		// entering here that there was at least some
 		// space available in the backlog.
+
+		// Start the protocol goroutine.
+		wq := &waiter.Queue{}
+		n.startAcceptedLoop(wq)
+		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 		go e.deliverAccepted(n)
 	}
 }
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index ca982c451..a114c06c1 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -15,6 +15,7 @@
 package tcp
 
 import (
+	"encoding/binary"
 	"sync"
 	"time"
 
@@ -22,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -139,7 +141,32 @@ func (h *handshake) resetState() {
 	h.flags = header.TCPFlagSyn
 	h.ackNum = 0
 	h.mss = 0
-	h.iss = seqnum.Value(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24)
+	h.iss = generateSecureISN(h.ep.ID, h.ep.stack.Seed())
+}
+
+// generateSecureISN generates a secure Initial Sequence number based on the
+// recommendation here https://tools.ietf.org/html/rfc6528#page-3.
+func generateSecureISN(id stack.TransportEndpointID, seed uint32) seqnum.Value {
+	isnHasher := jenkins.Sum32(seed)
+	isnHasher.Write([]byte(id.LocalAddress))
+	isnHasher.Write([]byte(id.RemoteAddress))
+	portBuf := make([]byte, 2)
+	binary.LittleEndian.PutUint16(portBuf, id.LocalPort)
+	isnHasher.Write(portBuf)
+	binary.LittleEndian.PutUint16(portBuf, id.RemotePort)
+	isnHasher.Write(portBuf)
+	// The time period here is 64ns. This is similar to what linux uses
+	// generate a sequence number that overlaps less than one
+	// time per MSL (2 minutes).
+	//
+	// A 64ns clock ticks 10^9/64 = 15625000) times in a second.
+	// To wrap the whole 32 bit space would require
+	// 2^32/1562500 ~ 274 seconds.
+	//
+	// Which sort of guarantees that we won't reuse the ISN for a new
+	// connection for the same tuple for at least 274s.
+	isn := isnHasher.Sum32() + uint32(time.Now().UnixNano()>>6)
+	return seqnum.Value(isn)
 }
 
 // effectiveRcvWndScale returns the effective receive window scale to be used.
@@ -809,7 +836,19 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 	e.state = StateError
 	e.HardError = err
 	if err != tcpip.ErrConnectionReset {
-		e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, e.snd.sndUna, e.rcv.rcvNxt, 0)
+		// The exact sequence number to be used for the RST is the same as the
+		// one used by Linux. We need to handle the case of window being shrunk
+		// which can cause sndNxt to be outside the acceptable window on the
+		// receiver.
+		//
+		// See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more
+		// information.
+		sndWndEnd := e.snd.sndUna.Add(e.snd.sndWnd)
+		resetSeqNum := sndWndEnd
+		if !sndWndEnd.LessThan(e.snd.sndNxt) || e.snd.sndNxt.Size(sndWndEnd) < (1<<e.snd.sndWndScale) {
+			resetSeqNum = e.snd.sndNxt
+		}
+		e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.rcvNxt, 0)
 	}
 }
 
@@ -823,6 +862,51 @@ func (e *endpoint) completeWorkerLocked() {
 	}
 }
 
+func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
+	if e.rcv.acceptable(s.sequenceNumber, 0) {
+		// RFC 793, page 37 states that "in all states
+		// except SYN-SENT, all reset (RST) segments are
+		// validated by checking their SEQ-fields." So
+		// we only process it if it's acceptable.
+		s.decRef()
+		e.mu.Lock()
+		switch e.state {
+		// In case of a RST in CLOSE-WAIT linux moves
+		// the socket to closed state with an error set
+		// to indicate EPIPE.
+		//
+		// Technically this seems to be at odds w/ RFC.
+		// As per https://tools.ietf.org/html/rfc793#section-2.7
+		// page 69 the behavior for a segment arriving
+		// w/ RST bit set in CLOSE-WAIT is inlined below.
+		//
+		//  ESTABLISHED
+		//  FIN-WAIT-1
+		//  FIN-WAIT-2
+		//  CLOSE-WAIT
+
+		//  If the RST bit is set then, any outstanding RECEIVEs and
+		//  SEND should receive "reset" responses. All segment queues
+		//  should be flushed.  Users should also receive an unsolicited
+		//  general "connection reset" signal. Enter the CLOSED state,
+		//  delete the TCB, and return.
+		case StateCloseWait:
+			e.state = StateClose
+			e.HardError = tcpip.ErrAborted
+			// We need to set this explicitly here because otherwise
+			// the port registrations will not be released till the
+			// endpoint is actively closed by the application.
+			e.workerCleanup = true
+			e.mu.Unlock()
+			return false, nil
+		default:
+			e.mu.Unlock()
+			return false, tcpip.ErrConnectionReset
+		}
+	}
+	return true, nil
+}
+
 // handleSegments pulls segments from the queue and processes them. It returns
 // no error if the protocol loop should continue, an error otherwise.
 func (e *endpoint) handleSegments() *tcpip.Error {
@@ -840,14 +924,34 @@ func (e *endpoint) handleSegments() *tcpip.Error {
 		}
 
 		if s.flagIsSet(header.TCPFlagRst) {
-			if e.rcv.acceptable(s.sequenceNumber, 0) {
-				// RFC 793, page 37 states that "in all states
-				// except SYN-SENT, all reset (RST) segments are
-				// validated by checking their SEQ-fields." So
-				// we only process it if it's acceptable.
-				s.decRef()
-				return tcpip.ErrConnectionReset
+			if ok, err := e.handleReset(s); !ok {
+				return err
 			}
+		} else if s.flagIsSet(header.TCPFlagSyn) {
+			// See: https://tools.ietf.org/html/rfc5961#section-4.1
+			//   1) If the SYN bit is set, irrespective of the sequence number, TCP
+			//    MUST send an ACK (also referred to as challenge ACK) to the remote
+			//    peer:
+			//
+			//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+			//
+			//    After sending the acknowledgment, TCP MUST drop the unacceptable
+			//    segment and stop processing further.
+			//
+			// By sending an ACK, the remote peer is challenged to confirm the loss
+			// of the previous connection and the request to start a new connection.
+			// A legitimate peer, after restart, would not have a TCB in the
+			// synchronized state.  Thus, when the ACK arrives, the peer should send
+			// a RST segment back with the sequence number derived from the ACK
+			// field that caused the RST.
+
+			// This RST will confirm that the remote peer has indeed closed the
+			// previous connection.  Upon receipt of a valid RST, the local TCP
+			// endpoint MUST terminate its connection.  The local TCP endpoint
+			// should then rely on SYN retransmission from the remote end to
+			// re-establish the connection.
+
+			e.snd.sendAck()
 		} else if s.flagIsSet(header.TCPFlagAck) {
 			// Patch the window size in the segment according to the
 			// send window scale.
@@ -856,7 +960,15 @@ func (e *endpoint) handleSegments() *tcpip.Error {
 			// RFC 793, page 41 states that "once in the ESTABLISHED
 			// state all segments must carry current acknowledgment
 			// information."
-			e.rcv.handleRcvdSegment(s)
+			drop, err := e.rcv.handleRcvdSegment(s)
+			if err != nil {
+				s.decRef()
+				return err
+			}
+			if drop {
+				s.decRef()
+				continue
+			}
 			e.snd.handleRcvdSegment(s)
 		}
 		s.decRef()
@@ -955,7 +1067,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		}
 
 		e.mu.Unlock()
-
 		// When the protocol loop exits we should wake up our waiters.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 	}
@@ -1001,6 +1112,10 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		// RTT itself.
 		e.rcvAutoParams.prevCopied = initialRcvWnd
 		e.rcvListMu.Unlock()
+		e.stack.Stats().TCP.CurrentEstablished.Increment()
+		e.mu.Lock()
+		e.state = StateEstablished
+		e.mu.Unlock()
 	}
 
 	e.keepalive.timer.init(&e.keepalive.waker)
@@ -1008,10 +1123,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 
 	// Tell waiters that the endpoint is connected and writable.
 	e.mu.Lock()
-	if e.state != StateEstablished {
-		e.stack.Stats().TCP.CurrentEstablished.Increment()
-		e.state = StateEstablished
-	}
 	drained := e.drainDone != nil
 	e.mu.Unlock()
 	if drained {
@@ -1042,7 +1153,13 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		{
 			w: &closeWaker,
 			f: func() *tcpip.Error {
-				return tcpip.ErrConnectionAborted
+				// This means the socket is being closed due
+				// to the TCP_FIN_WAIT2 timeout was hit. Just
+				// mark the socket as closed.
+				e.mu.Lock()
+				e.state = StateClose
+				e.mu.Unlock()
+				return nil
 			},
 		},
 		{
@@ -1085,17 +1202,18 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
 					e.mu.Unlock()
 				}
+
 				if n&notifyClose != 0 && closeTimer == nil {
-					// Reset the connection 3 seconds after
-					// the endpoint has been closed.
-					//
-					// The timer could fire in background
-					// when the endpoint is drained. That's
-					// OK as the loop here will not honor
-					// the firing until the undrain arrives.
-					closeTimer = time.AfterFunc(3*time.Second, func() {
-						closeWaker.Assert()
-					})
+					e.mu.Lock()
+					if e.state == StateFinWait2 && e.closed {
+						// The socket has been closed and we are in FIN_WAIT2
+						// so start the FIN_WAIT2 timer.
+						closeTimer = time.AfterFunc(e.tcpLingerTimeout, func() {
+							closeWaker.Assert()
+						})
+						e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+					}
+					e.mu.Unlock()
 				}
 
 				if n&notifyKeepaliveChanged != 0 {
@@ -1117,6 +1235,12 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 					}
 				}
 
+				if n&notifyTickleWorker != 0 {
+					// Just a tickle notification. No need to do
+					// anything.
+					return nil
+				}
+
 				return nil
 			},
 		},
@@ -1143,15 +1267,16 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	}
 	e.rcvListMu.Unlock()
 
-	e.mu.RLock()
+	e.mu.Lock()
 	if e.workerCleanup {
 		e.notifyProtocolGoroutine(notifyClose)
 	}
-	e.mu.RUnlock()
 
 	// Main loop. Handle segments until both send and receive ends of the
 	// connection have completed.
-	for !e.rcv.closed || !e.snd.closed || e.snd.sndUna != e.snd.sndNxtList {
+
+	for e.state != StateTimeWait && e.state != StateClose && e.state != StateError {
+		e.mu.Unlock()
 		e.workMu.Unlock()
 		v, _ := s.Fetch(true)
 		e.workMu.Lock()
@@ -1167,6 +1292,23 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 
 			return nil
 		}
+		e.mu.Lock()
+	}
+
+	state := e.state
+	e.mu.Unlock()
+	var reuseTW func()
+	if state == StateTimeWait {
+		// Disable close timer as we now entering real TIME_WAIT.
+		if closeTimer != nil {
+			closeTimer.Stop()
+		}
+		// Mark the current sleeper done so as to free all associated
+		// wakers.
+		s.Done()
+		// Wake up any waiters before we enter TIME_WAIT.
+		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+		reuseTW = e.doTimeWait()
 	}
 
 	// Mark endpoint as closed.
@@ -1176,8 +1318,130 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		e.stack.Stats().TCP.CurrentEstablished.Decrement()
 		e.state = StateClose
 	}
+
 	// Lock released below.
 	epilogue()
 
+	// A new SYN was received during TIME_WAIT and we need to abort
+	// the timewait and redirect the segment to the listener queue
+	if reuseTW != nil {
+		reuseTW()
+	}
+
 	return nil
 }
+
+// handleTimeWaitSegments processes segments received during TIME_WAIT
+// state.
+func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) {
+	checkRequeue := true
+	for i := 0; i < maxSegmentsPerWake; i++ {
+		s := e.segmentQueue.dequeue()
+		if s == nil {
+			checkRequeue = false
+			break
+		}
+		extTW, newSyn := e.rcv.handleTimeWaitSegment(s)
+		if newSyn {
+			info := e.EndpointInfo.TransportEndpointInfo
+			newID := info.ID
+			newID.RemoteAddress = ""
+			newID.RemotePort = 0
+			netProtos := []tcpip.NetworkProtocolNumber{info.NetProto}
+			// If the local address is an IPv4 address then also
+			// look for IPv6 dual stack endpoints that might be
+			// listening on the local address.
+			if newID.LocalAddress.To4() != "" {
+				netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber}
+			}
+			for _, netProto := range netProtos {
+				if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, &s.route); listenEP != nil {
+					tcpEP := listenEP.(*endpoint)
+					if EndpointState(tcpEP.State()) == StateListen {
+						reuseTW = func() {
+							tcpEP.enqueueSegment(s)
+						}
+						// We explicitly do not decRef
+						// the segment as it's still
+						// valid and being reflected to
+						// a listening endpoint.
+						return false, reuseTW
+					}
+				}
+			}
+		}
+		if extTW {
+			extendTimeWait = true
+		}
+		s.decRef()
+	}
+	if checkRequeue && !e.segmentQueue.empty() {
+		e.newSegmentWaker.Assert()
+	}
+	return extendTimeWait, nil
+}
+
+// doTimeWait is responsible for handling the TCP behaviour once a socket
+// enters the TIME_WAIT state. Optionally it can return a closure that
+// should be executed after releasing the endpoint registrations. This is
+// done in cases where a new SYN is received during TIME_WAIT that carries
+// a sequence number larger than one see on the connection.
+func (e *endpoint) doTimeWait() (twReuse func()) {
+	// Trigger a 2 * MSL time wait state. During this period
+	// we will drop all incoming segments.
+	// NOTE: On Linux this is not configurable and is fixed at 60 seconds.
+	timeWaitDuration := DefaultTCPTimeWaitTimeout
+
+	// Get the stack wide configuration.
+	var tcpTW tcpip.TCPTimeWaitTimeoutOption
+	if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil {
+		timeWaitDuration = time.Duration(tcpTW)
+	}
+
+	const newSegment = 1
+	const notification = 2
+	const timeWaitDone = 3
+
+	s := sleep.Sleeper{}
+	s.AddWaker(&e.newSegmentWaker, newSegment)
+	s.AddWaker(&e.notificationWaker, notification)
+
+	var timeWaitWaker sleep.Waker
+	s.AddWaker(&timeWaitWaker, timeWaitDone)
+	timeWaitTimer := time.AfterFunc(timeWaitDuration, timeWaitWaker.Assert)
+	defer timeWaitTimer.Stop()
+
+	for {
+		e.workMu.Unlock()
+		v, _ := s.Fetch(true)
+		e.workMu.Lock()
+		switch v {
+		case newSegment:
+			extendTimeWait, reuseTW := e.handleTimeWaitSegments()
+			if reuseTW != nil {
+				return reuseTW
+			}
+			if extendTimeWait {
+				timeWaitTimer.Reset(timeWaitDuration)
+			}
+		case notification:
+			n := e.fetchNotifications()
+			if n&notifyClose != 0 {
+				return nil
+			}
+			if n&notifyDrain != 0 {
+				for !e.segmentQueue.empty() {
+					// Ignore extending TIME_WAIT during a
+					// save. For sockets in TIME_WAIT we just
+					// terminate the TIME_WAIT early.
+					e.handleTimeWaitSegments()
+				}
+				close(e.drainDone)
+				<-e.undrain
+				return nil
+			}
+		case timeWaitDone:
+			return nil
+		}
+	}
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 79fec6b77..04c92c04c 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -121,6 +121,11 @@ const (
 	notifyReset
 	notifyKeepaliveChanged
 	notifyMSSChanged
+	// notifyTickleWorker is used to tickle the protocol main loop during a
+	// restore after we update the endpoint state to the correct one. This
+	// ensures the loop terminates if the final state of the endpoint is
+	// say TIME_WAIT.
+	notifyTickleWorker
 )
 
 // SACKInfo holds TCP SACK related information for a given endpoint.
@@ -320,6 +325,11 @@ type endpoint struct {
 
 	state EndpointState `state:".(EndpointState)"`
 
+	// origEndpointState is only used during a restore phase to save the
+	// endpoint state at restore time as the socket is moved to it's correct
+	// state.
+	origEndpointState EndpointState `state:"nosave"`
+
 	isPortReserved    bool `state:"manual"`
 	isRegistered      bool
 	boundNICID        tcpip.NICID `state:"manual"`
@@ -503,6 +513,16 @@ type endpoint struct {
 
 	// TODO(b/142022063): Add ability to save and restore per endpoint stats.
 	stats Stats `state:"nosave"`
+
+	// tcpLingerTimeout is the maximum amount of a time a socket
+	// a socket stays in TIME_WAIT state before being marked
+	// closed.
+	tcpLingerTimeout time.Duration
+
+	// closed indicates that the user has called closed on the
+	// endpoint and at this point the endpoint is only around
+	// to complete the TCP shutdown.
+	closed bool
 }
 
 // UniqueID implements stack.TransportEndpoint.UniqueID.
@@ -599,6 +619,11 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		e.SetSockOptInt(tcpip.DelayOption, 1)
 	}
 
+	var tcpLT tcpip.TCPLingerTimeoutOption
+	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
+		e.tcpLingerTimeout = time.Duration(tcpLT)
+	}
+
 	if p := s.GetTCPProbe(); p != nil {
 		e.probe = p
 	}
@@ -686,6 +711,13 @@ func (e *endpoint) notifyProtocolGoroutine(n uint32) {
 // with it. It must be called only once and with no other concurrent calls to
 // the endpoint.
 func (e *endpoint) Close() {
+	e.mu.Lock()
+	closed := e.closed
+	e.mu.Unlock()
+	if closed {
+		return
+	}
+
 	// Issue a shutdown so that the peer knows we won't send any more data
 	// if we're connected, or stop accepting if we're listening.
 	e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
@@ -706,6 +738,8 @@ func (e *endpoint) Close() {
 		e.isPortReserved = false
 	}
 
+	// Mark endpoint as closed.
+	e.closed = true
 	// Either perform the local cleanup or kick the worker to make sure it
 	// knows it needs to cleanup.
 	tcpip.AddDanglingEndpoint(e)
@@ -731,9 +765,7 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
 	go func() {
 		defer close(done)
 		for n := range e.acceptedChan {
-			n.mu.Lock()
-			n.resetConnectionLocked(tcpip.ErrConnectionAborted)
-			n.mu.Unlock()
+			n.notifyProtocolGoroutine(notifyReset)
 			n.Close()
 		}
 	}()
@@ -1349,6 +1381,28 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case tcpip.TCPLingerTimeoutOption:
+		e.mu.Lock()
+		if v < 0 {
+			// Same as effectively disabling TCPLinger timeout.
+			v = 0
+		}
+		var stkTCPLingerTimeout tcpip.TCPLingerTimeoutOption
+		if err := e.stack.TransportProtocolOption(header.TCPProtocolNumber, &stkTCPLingerTimeout); err != nil {
+			// We were unable to retrieve a stack config, just use
+			// the DefaultTCPLingerTimeout.
+			if v > tcpip.TCPLingerTimeoutOption(DefaultTCPLingerTimeout) {
+				stkTCPLingerTimeout = tcpip.TCPLingerTimeoutOption(DefaultTCPLingerTimeout)
+			}
+		}
+		// Cap it to the stack wide TCPLinger timeout.
+		if v > stkTCPLingerTimeout {
+			v = stkTCPLingerTimeout
+		}
+		e.tcpLingerTimeout = time.Duration(v)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return nil
 	}
@@ -1562,6 +1616,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.RUnlock()
 		return nil
 
+	case *tcpip.TCPLingerTimeoutOption:
+		e.mu.Lock()
+		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -1696,7 +1756,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 		// src IP to ensure that for a given tuple (srcIP, destIP,
 		// destPort) the offset used as a starting point is the same to
 		// ensure that we can cycle through the port space effectively.
-		h := jenkins.Sum32(e.stack.PortSeed())
+		h := jenkins.Sum32(e.stack.Seed())
 		h.Write([]byte(e.ID.LocalAddress))
 		h.Write([]byte(e.ID.RemoteAddress))
 		portBuf := make([]byte, 2)
@@ -1782,9 +1842,8 @@ func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error {
 // peer.
 func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 	e.mu.Lock()
-	defer e.mu.Unlock()
 	e.shutdownFlags |= flags
-
+	finQueued := false
 	switch {
 	case e.state.connected():
 		// Close for read.
@@ -1799,6 +1858,7 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 			// the connection with a RST.
 			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
 				e.notifyProtocolGoroutine(notifyReset)
+				e.mu.Unlock()
 				return nil
 			}
 		}
@@ -1817,14 +1877,11 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 			s := newSegmentFromView(&e.route, e.ID, nil)
 			e.sndQueue.PushBack(s)
 			e.sndBufInQueue++
-
+			finQueued = true
 			// Mark endpoint as closed.
 			e.sndClosed = true
 
 			e.sndBufMu.Unlock()
-
-			// Tell protocol goroutine to close.
-			e.sndCloseWaker.Assert()
 		}
 
 	case e.state == StateListen:
@@ -1832,11 +1889,20 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 		if flags&tcpip.ShutdownRead != 0 {
 			e.notifyProtocolGoroutine(notifyClose)
 		}
-
 	default:
+		e.mu.Unlock()
 		return tcpip.ErrNotConnected
 	}
-
+	e.mu.Unlock()
+	if finQueued {
+		if e.workMu.TryLock() {
+			e.handleClose()
+			e.workMu.Unlock()
+		} else {
+			// Tell protocol goroutine to close.
+			e.sndCloseWaker.Assert()
+		}
+	}
 	return nil
 }
 
@@ -1928,12 +1994,7 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 		return nil, nil, tcpip.ErrWouldBlock
 	}
 
-	// Start the protocol goroutine.
-	wq := &waiter.Queue{}
-	n.startAcceptedLoop(wq)
-	e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
-
-	return n, wq, nil
+	return n, n.waiterQueue, nil
 }
 
 // Bind binds the endpoint to a specific local port and optionally address.
@@ -2058,6 +2119,10 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		e.stack.Stats().TCP.ResetsReceived.Increment()
 	}
 
+	e.enqueueSegment(s)
+}
+
+func (e *endpoint) enqueueSegment(s *segment) {
 	// Send packet to worker goroutine.
 	if e.segmentQueue.enqueue(s) {
 		e.newSegmentWaker.Assert()
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 19f003b6b..7aa4c3f0e 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -78,7 +78,7 @@ func (e *endpoint) beforeSave() {
 		}
 		fallthrough
 	case StateError, StateClose:
-		for e.state == StateError && e.workerRunning {
+		for (e.state == StateError || e.state == StateClose) && e.workerRunning {
 			e.mu.Unlock()
 			time.Sleep(100 * time.Millisecond)
 			e.mu.Lock()
@@ -165,6 +165,12 @@ func (e *endpoint) loadState(state EndpointState) {
 
 // afterLoad is invoked by stateify.
 func (e *endpoint) afterLoad() {
+	// Freeze segment queue before registering to prevent any segments
+	// from being delivered while it is being restored.
+	e.origEndpointState = e.state
+	// Restore the endpoint to InitialState as it will be moved to
+	// its origEndpointState during Resume.
+	e.state = StateInitial
 	stack.StackFromEnv.RegisterRestoredEndpoint(e)
 }
 
@@ -173,8 +179,8 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	e.stack = s
 	e.segmentQueue.setLimit(MaxUnprocessedSegments)
 	e.workMu.Init()
+	state := e.origEndpointState
 
-	state := e.state
 	switch state {
 	case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
 		var ss SendBufferSizeOption
@@ -189,7 +195,6 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	}
 
 	bind := func() {
-		e.state = StateInitial
 		if len(e.BindAddr) == 0 {
 			e.BindAddr = e.ID.LocalAddress
 		}
@@ -219,6 +224,16 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		if err := e.connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.ID.RemotePort}, false, e.workerRunning); err != tcpip.ErrConnectStarted {
 			panic("endpoint connecting failed: " + err.String())
 		}
+		e.mu.Lock()
+		e.state = e.origEndpointState
+		closed := e.closed
+		e.mu.Unlock()
+		e.notifyProtocolGoroutine(notifyTickleWorker)
+		if state == StateFinWait2 && closed {
+			// If the endpoint has been closed then make sure we notify so
+			// that the FIN_WAIT2 timer is started after a restore.
+			e.notifyProtocolGoroutine(notifyClose)
+		}
 		connectedLoading.Done()
 	case StateListen:
 		tcpip.AsyncLoading.Add(1)
@@ -265,8 +280,11 @@ func (e *endpoint) Resume(s *stack.Stack) {
 				tcpip.AsyncLoading.Done()
 			}()
 		}
-		fallthrough
+		e.state = StateClose
+		e.stack.CompleteTransportEndpointCleanup(e)
+		tcpip.DeleteDanglingEndpoint(e)
 	case StateError:
+		e.state = StateError
 		e.stack.CompleteTransportEndpointCleanup(e)
 		tcpip.DeleteDanglingEndpoint(e)
 	}
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index c8e4a0d7e..89b965c23 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -23,6 +23,7 @@ package tcp
 import (
 	"strings"
 	"sync"
+	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -54,6 +55,14 @@ const (
 	// MaxUnprocessedSegments is the maximum number of unprocessed segments
 	// that can be queued for a given endpoint.
 	MaxUnprocessedSegments = 300
+
+	// DefaultTCPLingerTimeout is the amount of time that sockets linger in
+	// FIN_WAIT_2 state before being marked closed.
+	DefaultTCPLingerTimeout = 60 * time.Second
+
+	// DefaultTCPTimeWaitTimeout is the amount of time that sockets linger
+	// in TIME_WAIT state before being marked closed.
+	DefaultTCPTimeWaitTimeout = 60 * time.Second
 )
 
 // SACKEnabled option can be used to enable SACK support in the TCP
@@ -93,6 +102,8 @@ type protocol struct {
 	congestionControl          string
 	availableCongestionControl []string
 	moderateReceiveBuffer      bool
+	tcpLingerTimeout           time.Duration
+	tcpTimeWaitTimeout         time.Duration
 }
 
 // Number returns the tcp protocol number.
@@ -212,6 +223,24 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 		p.mu.Unlock()
 		return nil
 
+	case tcpip.TCPLingerTimeoutOption:
+		if v < 0 {
+			v = 0
+		}
+		p.mu.Lock()
+		p.tcpLingerTimeout = time.Duration(v)
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.TCPTimeWaitTimeoutOption:
+		if v < 0 {
+			v = 0
+		}
+		p.mu.Lock()
+		p.tcpTimeWaitTimeout = time.Duration(v)
+		p.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -262,6 +291,18 @@ func (p *protocol) Option(option interface{}) *tcpip.Error {
 		p.mu.Unlock()
 		return nil
 
+	case *tcpip.TCPLingerTimeoutOption:
+		p.mu.Lock()
+		*v = tcpip.TCPLingerTimeoutOption(p.tcpLingerTimeout)
+		p.mu.Unlock()
+		return nil
+
+	case *tcpip.TCPTimeWaitTimeoutOption:
+		p.mu.Lock()
+		*v = tcpip.TCPTimeWaitTimeoutOption(p.tcpTimeWaitTimeout)
+		p.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -274,5 +315,7 @@ func NewProtocol() stack.TransportProtocol {
 		recvBufferSize:             ReceiveBufferSizeOption{MinBufferSize, DefaultReceiveBufferSize, MaxBufferSize},
 		congestionControl:          ccReno,
 		availableCongestionControl: []string{ccReno, ccCubic},
+		tcpLingerTimeout:           DefaultTCPLingerTimeout,
+		tcpTimeWaitTimeout:         DefaultTCPTimeWaitTimeout,
 	}
 }
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index e90f9a7d9..068b90fb6 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -18,6 +18,7 @@ import (
 	"container/heap"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 )
@@ -209,6 +210,11 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		switch r.ep.state {
 		case StateFinWait1:
 			r.ep.state = StateFinWait2
+			// Notify protocol goroutine that we have received an
+			// ACK to our FIN so that it can start the FIN_WAIT2
+			// timer to abort connection if the other side does
+			// not close within 2MSL.
+			r.ep.notifyProtocolGoroutine(notifyClose)
 		case StateClosing:
 			r.ep.state = StateTimeWait
 		case StateLastAck:
@@ -253,23 +259,105 @@ func (r *receiver) updateRTT() {
 	r.ep.rcvListMu.Unlock()
 }
 
-// handleRcvdSegment handles TCP segments directed at the connection managed by
-// r as they arrive. It is called by the protocol main loop.
-func (r *receiver) handleRcvdSegment(s *segment) {
+func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, closed bool) (drop bool, err *tcpip.Error) {
+	r.ep.rcvListMu.Lock()
+	rcvClosed := r.ep.rcvClosed || r.closed
+	r.ep.rcvListMu.Unlock()
+
+	// If we are in one of the shutdown states then we need to do
+	// additional checks before we try and process the segment.
+	switch state {
+	case StateCloseWait, StateClosing, StateLastAck:
+		if !s.sequenceNumber.LessThanEq(r.rcvNxt) {
+			s.decRef()
+			// Just drop the segment as we have
+			// already received a FIN and this
+			// segment is after the sequence number
+			// for the FIN.
+			return true, nil
+		}
+		fallthrough
+	case StateFinWait1:
+		fallthrough
+	case StateFinWait2:
+		// If we are closed for reads (either due to an
+		// incoming FIN or the user calling shutdown(..,
+		// SHUT_RD) then any data past the rcvNxt should
+		// trigger a RST.
+		endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
+		if rcvClosed && r.rcvNxt.LessThan(endDataSeq) {
+			s.decRef()
+			return true, tcpip.ErrConnectionAborted
+		}
+		if state == StateFinWait1 {
+			break
+		}
+
+		// If it's a retransmission of an old data segment
+		// or a pure ACK then allow it.
+		if s.sequenceNumber.Add(s.logicalLen()).LessThanEq(r.rcvNxt) ||
+			s.logicalLen() == 0 {
+			break
+		}
+
+		// In FIN-WAIT2 if the socket is fully
+		// closed(not owned by application on our end
+		// then the only acceptable segment is a
+		// FIN. Since FIN can technically also carry
+		// data we verify that the segment carrying a
+		// FIN ends at exactly e.rcvNxt+1.
+		//
+		// From RFC793 page 25.
+		//
+		// For sequence number purposes, the SYN is
+		// considered to occur before the first actual
+		// data octet of the segment in which it occurs,
+		// while the FIN is considered to occur after
+		// the last actual data octet in a segment in
+		// which it occurs.
+		if closed && (!s.flagIsSet(header.TCPFlagFin) || s.sequenceNumber.Add(s.logicalLen()) != r.rcvNxt+1) {
+			s.decRef()
+			return true, tcpip.ErrConnectionAborted
+		}
+	}
+
 	// We don't care about receive processing anymore if the receive side
 	// is closed.
-	if r.closed {
-		return
+	//
+	// NOTE: We still want to permit a FIN as it's possible only our
+	// end has closed and the peer is yet to send a FIN. Hence we
+	// compare only the payload.
+	segEnd := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
+	if rcvClosed && !segEnd.LessThanEq(r.rcvNxt) {
+		return true, nil
+	}
+	return false, nil
+}
+
+// handleRcvdSegment handles TCP segments directed at the connection managed by
+// r as they arrive. It is called by the protocol main loop.
+func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
+	r.ep.mu.RLock()
+	state := r.ep.state
+	closed := r.ep.closed
+	r.ep.mu.RUnlock()
+
+	if state != StateEstablished {
+		drop, err := r.handleRcvdSegmentClosing(s, state, closed)
+		if drop || err != nil {
+			return drop, err
+		}
 	}
 
 	segLen := seqnum.Size(s.data.Size())
 	segSeq := s.sequenceNumber
 
 	// If the sequence number range is outside the acceptable range, just
-	// send an ACK. This is according to RFC 793, page 37.
+	// send an ACK and stop further processing of the segment.
+	// This is according to RFC 793, page 68.
 	if !r.acceptable(segSeq, segLen) {
 		r.ep.snd.sendAck()
-		return
+		return true, nil
 	}
 
 	// Defer segment processing if it can't be consumed now.
@@ -288,7 +376,7 @@ func (r *receiver) handleRcvdSegment(s *segment) {
 			// have to retransmit.
 			r.ep.snd.sendAck()
 		}
-		return
+		return false, nil
 	}
 
 	// Since we consumed a segment update the receiver's RTT estimate
@@ -315,4 +403,67 @@ func (r *receiver) handleRcvdSegment(s *segment) {
 		r.pendingBufUsed -= s.logicalLen()
 		s.decRef()
 	}
+	return false, nil
+}
+
+// handleTimeWaitSegment handles inbound segments received when the endpoint
+// has entered the TIME_WAIT state.
+func (r *receiver) handleTimeWaitSegment(s *segment) (resetTimeWait bool, newSyn bool) {
+	segSeq := s.sequenceNumber
+	segLen := seqnum.Size(s.data.Size())
+
+	// Just silently drop any RST packets in TIME_WAIT. We do not support
+	// TIME_WAIT assasination as a result we confirm w/ fix 1 as described
+	// in https://tools.ietf.org/html/rfc1337#section-3.
+	if s.flagIsSet(header.TCPFlagRst) {
+		return false, false
+	}
+
+	// If it's a SYN and the sequence number is higher than any seen before
+	// for this connection then try and redirect it to a listening endpoint
+	// if available.
+	//
+	// RFC 1122:
+	//   "When a connection is [...] on TIME-WAIT state [...]
+	//   [a TCP] MAY accept a new SYN from the remote TCP to
+	//   reopen the connection directly, if it:
+
+	//    (1) assigns its initial sequence number for the new
+	//     connection to be larger than the largest sequence
+	//     number it used on the previous connection incarnation,
+	//     and
+
+	//    (2) returns to TIME-WAIT state if the SYN turns out
+	//      to be an old duplicate".
+	if s.flagIsSet(header.TCPFlagSyn) && r.rcvNxt.LessThan(segSeq) {
+
+		return false, true
+	}
+
+	// Drop the segment if it does not contain an ACK.
+	if !s.flagIsSet(header.TCPFlagAck) {
+		return false, false
+	}
+
+	// Update Timestamp if required. See RFC7323, section-4.3.
+	if r.ep.sendTSOk && s.parsedOptions.TS {
+		r.ep.updateRecentTimestamp(s.parsedOptions.TSVal, r.ep.snd.maxSentAck, segSeq)
+	}
+
+	if segSeq.Add(1) == r.rcvNxt && s.flagIsSet(header.TCPFlagFin) {
+		// If it's a FIN-ACK then resetTimeWait and send an ACK, as it
+		// indicates our final ACK could have been lost.
+		r.ep.snd.sendAck()
+		return true, false
+	}
+
+	// If the sequence number range is outside the acceptable range or
+	// carries data then just send an ACK. This is according to RFC 793,
+	// page 37.
+	//
+	// NOTE: In TIME_WAIT the only acceptable sequence number is rcvNxt.
+	if segSeq != r.rcvNxt || segLen != 0 {
+		r.ep.snd.sendAck()
+	}
+	return false, false
 }
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index f4ea5f091..0c1704d74 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -206,17 +206,18 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
+	// Set TCPLingerTimeout to 5 seconds so that sockets are marked closed
 	wq := &waiter.Queue{}
 	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
 	if err != nil {
-		t.Fatalf("NewEndpoint failed: %v", err)
+		t.Fatalf("NewEndpoint failed: %s", err)
 	}
 	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
-		t.Fatalf("Bind failed: %v", err)
+		t.Fatalf("Bind failed: %s", err)
 	}
 
 	if err := ep.Listen(10); err != nil {
-		t.Fatalf("Listen failed: %v", err)
+		t.Fatalf("Listen failed: %s", err)
 	}
 
 	// Send a SYN request.
@@ -256,7 +257,7 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 		case <-ch:
 			c.EP, _, err = ep.Accept()
 			if err != nil {
-				t.Fatalf("Accept failed: %v", err)
+				t.Fatalf("Accept failed: %s", err)
 			}
 
 		case <-time.After(1 * time.Second):
@@ -264,6 +265,13 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 		}
 	}
 
+	// Lower stackwide TIME_WAIT timeout so that the reservations
+	// are released instantly on Close.
+	tcpTW := tcpip.TCPTimeWaitTimeoutOption(1 * time.Millisecond)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpTW); err != nil {
+		t.Fatalf("e.stack.SetTransportProtocolOption(%d, %s) = %s", tcp.ProtocolNumber, tcpTW, err)
+	}
+
 	c.EP.Close()
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
@@ -285,6 +293,11 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 	// Get the ACK to the FIN we just sent.
 	c.GetPacket()
 
+	// Since an active close was done we need to wait for a little more than
+	// tcpLingerTimeout for the port reservations to be released and the
+	// socket to move to a CLOSED state.
+	time.Sleep(20 * time.Millisecond)
+
 	// Now resend the same ACK, this ACK should generate a RST as there
 	// should be no endpoint in SYN-RCVD state and we are not using
 	// syn-cookies yet. The reason we send the same ACK is we need a valid
@@ -376,6 +389,13 @@ func TestConnectResetAfterClose(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
+	// Set TCPLinger to 3 seconds so that sockets are marked closed
+	// after 3 second in FIN_WAIT2 state.
+	tcpLingerTimeout := 3 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPLingerTimeoutOption(tcpLingerTimeout)); err != nil {
+		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpLingerTimeout, err)
+	}
+
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 	ep := c.EP
 	c.EP = nil
@@ -396,12 +416,24 @@ func TestConnectResetAfterClose(t *testing.T) {
 		DstPort: c.Port,
 		Flags:   header.TCPFlagAck,
 		SeqNum:  790,
-		AckNum:  c.IRS.Add(1),
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// Wait for the ep to give up waiting for a FIN.
+	time.Sleep(tcpLingerTimeout + 1*time.Second)
+
+	// Now send an ACK and it should trigger a RST as the endpoint should
+	// not exist anymore.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(2),
 		RcvWnd:  30000,
 	})
 
-	// Wait for the ep to give up waiting for a FIN, and send a RST.
-	time.Sleep(3 * time.Second)
 	for {
 		b := c.GetPacket()
 		tcpHdr := header.TCP(header.IPv4(b).Payload())
@@ -413,7 +445,7 @@ func TestConnectResetAfterClose(t *testing.T) {
 		checker.IPv4(t, b,
 			checker.TCP(
 				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(c.IRS)+1),
+				checker.SeqNum(uint32(c.IRS)+2),
 				checker.AckNum(790),
 				checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
 			),
@@ -1110,8 +1142,7 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
 		checker.TCP(
 			checker.DstPort(context.TestPort),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
-			// We shouldn't consume a sequence number on RST.
-			checker.SeqNum(uint32(c.IRS)+1),
+			checker.SeqNum(uint32(c.IRS)+2),
 		))
 	// The RST puts the endpoint into an error state.
 	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
@@ -3085,6 +3116,13 @@ func TestReadAfterClosedState(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
+	// Set TCPTimeWaitTimeout to 1 seconds so that sockets are marked closed
+	// after 1 second in TIME_WAIT state.
+	tcpTimeWaitTimeout := 1 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
+		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeout(%d) failed: %s", tcpTimeWaitTimeout, err)
+	}
+
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	we, ch := waiter.NewChannelEntry(nil)
@@ -3092,12 +3130,12 @@ func TestReadAfterClosedState(t *testing.T) {
 	defer c.WQ.EventUnregister(&we)
 
 	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
-		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock)
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %s", err, tcpip.ErrWouldBlock)
 	}
 
 	// Shutdown immediately for write, check that we get a FIN.
 	if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil {
-		t.Fatalf("Shutdown failed: %v", err)
+		t.Fatalf("Shutdown failed: %s", err)
 	}
 
 	checker.IPv4(t, c.GetPacket(),
@@ -3135,10 +3173,9 @@ func TestReadAfterClosedState(t *testing.T) {
 		),
 	)
 
-	// Give the stack the chance to transition to closed state. Note that since
-	// both the sender and receiver are now closed, we effectively skip the
-	// TIME-WAIT state.
-	time.Sleep(1 * time.Second)
+	// Give the stack the chance to transition to closed state from
+	// TIME_WAIT.
+	time.Sleep(tcpTimeWaitTimeout * 2)
 
 	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateClose; got != want {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
@@ -3155,7 +3192,7 @@ func TestReadAfterClosedState(t *testing.T) {
 	peekBuf := make([]byte, 10)
 	n, _, err := c.EP.Peek([][]byte{peekBuf})
 	if err != nil {
-		t.Fatalf("Peek failed: %v", err)
+		t.Fatalf("Peek failed: %s", err)
 	}
 
 	peekBuf = peekBuf[:n]
@@ -3166,7 +3203,7 @@ func TestReadAfterClosedState(t *testing.T) {
 	// Receive data.
 	v, _, err := c.EP.Read(nil)
 	if err != nil {
-		t.Fatalf("Read failed: %v", err)
+		t.Fatalf("Read failed: %s", err)
 	}
 
 	if !bytes.Equal(data, v) {
@@ -3176,11 +3213,11 @@ func TestReadAfterClosedState(t *testing.T) {
 	// Now that we drained the queue, check that functions fail with the
 	// right error code.
 	if _, _, err := c.EP.Read(nil); err != tcpip.ErrClosedForReceive {
-		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrClosedForReceive)
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %s", err, tcpip.ErrClosedForReceive)
 	}
 
 	if _, _, err := c.EP.Peek([][]byte{peekBuf}); err != tcpip.ErrClosedForReceive {
-		t.Fatalf("got c.EP.Peek(...) = %v, want = %v", err, tcpip.ErrClosedForReceive)
+		t.Fatalf("got c.EP.Peek(...) = %v, want = %s", err, tcpip.ErrClosedForReceive)
 	}
 }
 
@@ -4347,7 +4384,8 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
 	// Send a SYN request.
 	irs := seqnum.Value(789)
 	c.SendPacket(nil, &context.Headers{
-		SrcPort: context.TestPort,
+		// pick a different src port for new SYN.
+		SrcPort: context.TestPort + 1,
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  irs,
@@ -4893,3 +4931,545 @@ func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.Del
 		t.Errorf("ep.GetSockOptInt(tcpip.DelayOption) got: %d, want: %d", gotDelayOption, wantDelayOption)
 	}
 }
+
+func TestTCPLingerTimeout(t *testing.T) {
+	c := context.New(t, 1500 /* mtu */)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	testCases := []struct {
+		name             string
+		tcpLingerTimeout time.Duration
+		want             time.Duration
+	}{
+		{"NegativeLingerTimeout", -123123, 0},
+		{"ZeroLingerTimeout", 0, 0},
+		{"InRangeLingerTimeout", 10 * time.Second, 10 * time.Second},
+		// Values > stack's TCPLingerTimeout are capped to the stack's
+		// value. Defaults to tcp.DefaultTCPLingerTimeout(60 seconds)
+		{"AboveMaxLingerTimeout", 65 * time.Second, 60 * time.Second},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			if err := c.EP.SetSockOpt(tcpip.TCPLingerTimeoutOption(tc.tcpLingerTimeout)); err != nil {
+				t.Fatalf("SetSockOpt(%s) = %s", tc.tcpLingerTimeout, err)
+			}
+			var v tcpip.TCPLingerTimeoutOption
+			if err := c.EP.GetSockOpt(&v); err != nil {
+				t.Fatalf("GetSockOpt(tcpip.TCPLingerTimeoutOption) = %s", err)
+			}
+			if got, want := time.Duration(v), tc.want; got != want {
+				t.Fatalf("unexpected linger timeout got: %s, want: %s", got, want)
+			}
+		})
+	}
+}
+
+func TestTCPTimeWaitRSTIgnored(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+			if err != nil {
+				t.Fatalf("Accept failed: %s", err)
+			}
+
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+1),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	}
+
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Now send a RST and this should be ignored and not
+	// generate an ACK.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagRst,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	})
+
+	c.CheckNoPacketTimeout("unexpected packet received in TIME_WAIT state", 1*time.Second)
+
+	// Out of order ACK should generate an immediate ACK in
+	// TIME_WAIT.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 3,
+	})
+
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+}
+
+func TestTCPTimeWaitOutOfOrder(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+			if err != nil {
+				t.Fatalf("Accept failed: %s", err)
+			}
+
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+1),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	}
+
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Out of order ACK should generate an immediate ACK in
+	// TIME_WAIT.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 3,
+	})
+
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+}
+
+func TestTCPTimeWaitNewSyn(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+			if err != nil {
+				t.Fatalf("Accept failed: %s", err)
+			}
+
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+1),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	}
+
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Send a SYN request w/ sequence number lower than
+	// the highest sequence number sent. We just reuse
+	// the same number.
+	iss = seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	c.CheckNoPacketTimeout("unexpected packet received in response to SYN", 1*time.Second)
+
+	// Send a SYN request w/ sequence number higher than
+	// the highest sequence number sent.
+	iss = seqnum.Value(792)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	// Receive the SYN-ACK reply.
+	b = c.GetPacket()
+	tcpHdr = header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders = &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+			if err != nil {
+				t.Fatalf("Accept failed: %s", err)
+			}
+
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+}
+
+func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed
+	// after 5 seconds in TIME_WAIT state.
+	tcpTimeWaitTimeout := 5 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
+		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err)
+	}
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+			if err != nil {
+				t.Fatalf("Accept failed: %s", err)
+			}
+
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+1),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	}
+
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	time.Sleep(2 * time.Second)
+
+	// Now send a duplicate FIN. This should cause the TIME_WAIT to extend
+	// by another 5 seconds and also send us a duplicate ACK as it should
+	// indicate that the final ACK was potentially lost.
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Sleep for 4 seconds so at this point we are 1 second past the
+	// original tcpLingerTimeout of 5 seconds.
+	time.Sleep(4 * time.Second)
+
+	// Send an ACK and it should not generate any packet as the socket
+	// should still be in TIME_WAIT for another another 5 seconds due
+	// to the duplicate FIN we sent earlier.
+	*ackHeaders = *finHeaders
+	ackHeaders.SeqNum = ackHeaders.SeqNum + 1
+	ackHeaders.Flags = header.TCPFlagAck
+	c.SendPacket(nil, ackHeaders)
+
+	c.CheckNoPacketTimeout("unexpected packet received from endpoint in TIME_WAIT", 1*time.Second)
+	// Now sleep for another 2 seconds so that we are past the
+	// extended TIME_WAIT of 7 seconds (2 + 5).
+	time.Sleep(2 * time.Second)
+
+	// Resend the same ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Receive the RST that should be generated as there is no valid
+	// endpoint.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(ackHeaders.AckNum)),
+		checker.AckNum(uint32(ackHeaders.SeqNum)),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 3e5b6b3c3..722d14b53 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -9,7 +9,7 @@ syscall_test(test = "//test/syscalls/linux:accept_bind_stream_test")
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:accept_bind_test",
 )
 
@@ -434,7 +434,7 @@ syscall_test(
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_abstract_test",
 )
 
@@ -445,7 +445,7 @@ syscall_test(
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_domain_test",
 )
 
@@ -458,19 +458,19 @@ syscall_test(
 syscall_test(
     size = "large",
     add_overlay = True,
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_filesystem_test",
 )
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_inet_loopback_test",
 )
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_tcp_generic_loopback_test",
 )
 
@@ -481,13 +481,13 @@ syscall_test(
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_tcp_loopback_test",
 )
 
 syscall_test(
     size = "medium",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_tcp_udp_generic_loopback_test",
 )
 
@@ -498,7 +498,7 @@ syscall_test(
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_udp_loopback_test",
 )
 
@@ -560,7 +560,7 @@ syscall_test(
 syscall_test(
     size = "large",
     add_overlay = True,
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_unix_pair_test",
 )
 
@@ -599,7 +599,7 @@ syscall_test(
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_unix_unbound_stream_test",
 )
 
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 93bff8299..f8b8cb724 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2141,6 +2141,7 @@ cc_library(
     deps = [
         ":socket_test_util",
         "//test/util:test_util",
+        "//test/util:thread_util",
         "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index ab375aaaf..2eeee352e 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <arpa/inet.h>
+#include <linux/tcp.h>
 #include <netinet/in.h>
 #include <poll.h>
 #include <string.h>
@@ -31,6 +32,7 @@
 #include "gtest/gtest.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
+#include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
@@ -267,6 +269,340 @@ TEST_P(SocketInetLoopbackTest, TCPbacklog) {
   }
 }
 
+// TCPFinWait2Test creates a pair of connected sockets then closes one end to
+// trigger FIN_WAIT2 state for the closed endpoint. Then it binds the same local
+// IP/port on a new socket and tries to connect. The connect should fail w/
+// an EADDRINUSE. Then we wait till the FIN_WAIT2 timeout is over and try the
+// connect again with a new socket and this time it should succeed.
+//
+// TCP timers are not S/R today, this can cause this test to be flaky when run
+// under random S/R due to timer being reset on a restore.
+TEST_P(SocketInetLoopbackTest, TCPFinWait2Test_NoRandomSave) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  // Lower FIN_WAIT2 state to 5 seconds for test.
+  constexpr int kTCPLingerTimeout = 5;
+  EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2,
+                         &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
+              SyscallSucceedsWithValue(0));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+  // Get the address/port bound by the connecting socket.
+  sockaddr_storage conn_bound_addr;
+  socklen_t conn_addrlen = connector.addr_len;
+  ASSERT_THAT(
+      getsockname(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+                  &conn_addrlen),
+      SyscallSucceeds());
+
+  // close the connecting FD to trigger FIN_WAIT2  on the connected fd.
+  conn_fd.reset();
+
+  // Now bind and connect a new socket.
+  const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  // Disable cooperative saves after this point. As a save between the first
+  // bind/connect and the second one can cause the linger timeout timer to
+  // be restarted causing the final bind/connect to fail.
+  DisableSave ds;
+
+  // TODO(gvisor.dev/issue/1030): Portmanager does not track all 5 tuple
+  //   reservations which causes the bind() to succeed on gVisor but connect
+  //   correctly fails.
+  if (IsRunningOnGvisor()) {
+    ASSERT_THAT(
+        bind(conn_fd2.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+             conn_addrlen),
+        SyscallSucceeds());
+    ASSERT_THAT(RetryEINTR(connect)(conn_fd2.get(),
+                                    reinterpret_cast<sockaddr*>(&conn_addr),
+                                    conn_addrlen),
+                SyscallFailsWithErrno(EADDRINUSE));
+  } else {
+    ASSERT_THAT(
+        bind(conn_fd2.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+             conn_addrlen),
+        SyscallFailsWithErrno(EADDRINUSE));
+  }
+
+  // Sleep for a little over the linger timeout to reduce flakiness in
+  // save/restore tests.
+  absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+
+  ds.reset();
+
+  if (!IsRunningOnGvisor()) {
+    ASSERT_THAT(
+        bind(conn_fd2.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+             conn_addrlen),
+        SyscallSucceeds());
+  }
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd2.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  conn_addrlen),
+              SyscallSucceeds());
+}
+
+// TCPLinger2TimeoutAfterClose creates a pair of connected sockets
+// then closes one end to trigger FIN_WAIT2 state for the closed endpont.
+// It then sleeps for the TCP_LINGER2 timeout and verifies that bind/
+// connecting the same address succeeds.
+//
+// TCP timers are not S/R today, this can cause this test to be flaky when run
+// under random S/R due to timer being reset on a restore.
+TEST_P(SocketInetLoopbackTest, TCPLinger2TimeoutAfterClose_NoRandomSave) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+  // Get the address/port bound by the connecting socket.
+  sockaddr_storage conn_bound_addr;
+  socklen_t conn_addrlen = connector.addr_len;
+  ASSERT_THAT(
+      getsockname(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+                  &conn_addrlen),
+      SyscallSucceeds());
+
+  constexpr int kTCPLingerTimeout = 5;
+  EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2,
+                         &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
+              SyscallSucceedsWithValue(0));
+
+  // close the connecting FD to trigger FIN_WAIT2  on the connected fd.
+  conn_fd.reset();
+
+  absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+
+  // Now bind and connect a new socket and verify that we can immediately
+  // rebind the address bound by the conn_fd as it never entered TIME_WAIT.
+  const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  ASSERT_THAT(bind(conn_fd2.get(),
+                   reinterpret_cast<sockaddr*>(&conn_bound_addr), conn_addrlen),
+              SyscallSucceeds());
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd2.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  conn_addrlen),
+              SyscallSucceeds());
+}
+
+// TCPResetAfterClose creates a pair of connected sockets then closes
+// one end to trigger FIN_WAIT2 state for the closed endpoint verifies
+// that we generate RSTs for any new data after the socket is fully
+// closed.
+TEST_P(SocketInetLoopbackTest, TCPResetAfterClose) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+  // close the connecting FD to trigger FIN_WAIT2  on the connected fd.
+  conn_fd.reset();
+
+  int data = 1234;
+
+  // Now send data which should trigger a RST as the other end should
+  // have timed out and closed the socket.
+  EXPECT_THAT(RetryEINTR(send)(accepted.get(), &data, sizeof(data), 0),
+              SyscallSucceeds());
+  // Sleep for a shortwhile to get a RST back.
+  absl::SleepFor(absl::Seconds(1));
+
+  // Try writing again and we should get an EPIPE back.
+  EXPECT_THAT(RetryEINTR(send)(accepted.get(), &data, sizeof(data), 0),
+              SyscallFailsWithErrno(EPIPE));
+
+  // Trying to read should return zero as the other end did send
+  // us a FIN. We do it twice to verify that the RST does not cause an
+  // ECONNRESET on the read after EOF has been read by applicaiton.
+  EXPECT_THAT(RetryEINTR(recv)(accepted.get(), &data, sizeof(data), 0),
+              SyscallSucceedsWithValue(0));
+  EXPECT_THAT(RetryEINTR(recv)(accepted.get(), &data, sizeof(data), 0),
+              SyscallSucceedsWithValue(0));
+}
+
+// This test is disabled under random save as the the restore run
+// results in the stack.Seed() being different which can cause
+// sequence number of final connect to be one that is considered
+// old and can cause the test to be flaky.
+TEST_P(SocketInetLoopbackTest, TCPTimeWaitTest_NoRandomSave) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  // We disable saves after this point as a S/R causes the netstack seed
+  // to be regenerated which changes what ports/ISN is picked for a given
+  // tuple (src ip,src port, dst ip, dst port). This can cause the final
+  // SYN to use a sequence number that looks like one from the current
+  // connection in TIME_WAIT and will not be accepted causing the test
+  // to timeout.
+  //
+  // TODO(gvisor.dev/issue/940): S/R portSeed/portHint
+  DisableSave ds;
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+  // Get the address/port bound by the connecting socket.
+  sockaddr_storage conn_bound_addr;
+  socklen_t conn_addrlen = connector.addr_len;
+  ASSERT_THAT(
+      getsockname(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+                  &conn_addrlen),
+      SyscallSucceeds());
+
+  // close the accept FD to trigger TIME_WAIT on the accepted socket which
+  // should cause the conn_fd to follow CLOSE_WAIT->LAST_ACK->CLOSED instead of
+  // TIME_WAIT.
+  accepted.reset();
+  absl::SleepFor(absl::Seconds(1));
+  conn_fd.reset();
+  absl::SleepFor(absl::Seconds(1));
+
+  // Now bind and connect a new socket and verify that we can immediately
+  // rebind the address bound by the conn_fd as it never entered TIME_WAIT.
+  const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  ASSERT_THAT(bind(conn_fd2.get(),
+                   reinterpret_cast<sockaddr*>(&conn_bound_addr), conn_addrlen),
+              SyscallSucceeds());
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd2.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  conn_addrlen),
+              SyscallSucceeds());
+}
+
 INSTANTIATE_TEST_SUITE_P(
     All, SocketInetLoopbackTest,
     ::testing::Values(
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 592448289..a37b49447 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -26,6 +26,7 @@
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
+#include "test/util/thread_util.h"
 
 namespace gvisor {
 namespace testing {
@@ -243,6 +244,31 @@ TEST_P(TCPSocketPairTest, ShutdownRdAllowsReadOfReceivedDataBeforeEOF) {
               SyscallSucceedsWithValue(0));
 }
 
+// This test verifies that a shutdown(wr) by the server after sending
+// data allows the client to still read() the queued data and a client
+// close after sending response allows server to read the incoming
+// response.
+TEST_P(TCPSocketPairTest, ShutdownWrServerClientClose) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  char buf[10] = {};
+  ScopedThread t([&]() {
+    ASSERT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)),
+                SyscallSucceedsWithValue(sizeof(buf)));
+    ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
+                SyscallSucceedsWithValue(sizeof(buf)));
+    ASSERT_THAT(close(sockets->release_first_fd()),
+                SyscallSucceedsWithValue(0));
+  });
+  ASSERT_THAT(RetryEINTR(write)(sockets->second_fd(), buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+  ASSERT_THAT(RetryEINTR(shutdown)(sockets->second_fd(), SHUT_WR),
+              SyscallSucceedsWithValue(0));
+  t.Join();
+
+  ASSERT_THAT(RetryEINTR(read)(sockets->second_fd(), buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+}
+
 TEST_P(TCPSocketPairTest, ClosedReadNonBlockingSocket) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -696,5 +722,72 @@ TEST_P(TCPSocketPairTest, SetCongestionControlFailsForUnsupported) {
   EXPECT_EQ(0, memcmp(got_cc, old_cc, sizeof(old_cc)));
 }
 
+// Linux and Netstack both default to a 60s TCP_LINGER2 timeout.
+constexpr int kDefaultTCPLingerTimeout = 60;
+
+TEST_P(TCPSocketPairTest, TCPLingerTimeoutDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kDefaultTCPLingerTimeout);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutZeroOrLess) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kZero = 0;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &kZero,
+                         sizeof(kZero)),
+              SyscallSucceedsWithValue(0));
+
+  constexpr int kNegative = -1234;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2,
+                         &kNegative, sizeof(kNegative)),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutAboveDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // Values above the net.ipv4.tcp_fin_timeout are capped to tcp_fin_timeout
+  // on linux (defaults to 60 seconds on linux).
+  constexpr int kAboveDefault = kDefaultTCPLingerTimeout + 1;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2,
+                         &kAboveDefault, sizeof(kAboveDefault)),
+              SyscallSucceedsWithValue(0));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kDefaultTCPLingerTimeout);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPLingerTimeout) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // Values above the net.ipv4.tcp_fin_timeout are capped to tcp_fin_timeout
+  // on linux (defaults to 60 seconds on linux).
+  constexpr int kTCPLingerTimeout = kDefaultTCPLingerTimeout - 1;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2,
+                         &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
+              SyscallSucceedsWithValue(0));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kTCPLingerTimeout);
+}
+
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From d2df9d76eb6b6410758863455459171fc89fddf7 Mon Sep 17 00:00:00 2001
From: Andrew Dunham <andrew@du.nham.ca>
Date: Thu, 7 Nov 2019 22:19:33 -0800
Subject: Bump gazelle to v0.19.1

---
 WORKSPACE | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index f6d2f4f32..0ad2bb17c 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -12,10 +12,10 @@ http_archive(
 
 http_archive(
     name = "bazel_gazelle",
-    sha256 = "41bff2a0b32b02f20c227d234aa25ef3783998e5453f7eade929704dcff7cd4b",
+    sha256 = "86c6d481b3f7aedc1d60c1c211c6f76da282ae197c3b3160f54bd3a8f847896f",
     urls = [
-        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/bazel-gazelle/releases/download/v0.19.0/bazel-gazelle-v0.19.0.tar.gz",
-        "https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.19.0/bazel-gazelle-v0.19.0.tar.gz",
+        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/bazel-gazelle/releases/download/v0.19.1/bazel-gazelle-v0.19.1.tar.gz",
+        "https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.19.1/bazel-gazelle-v0.19.1.tar.gz",
     ],
 )
 
-- 
cgit v1.2.3


From af58a4e3bb0ba81c103429317eb0c2735450136c Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 8 Nov 2019 12:18:40 -0800
Subject: Automated rollback of changelist 278417533

PiperOrigin-RevId: 279365629
---
 pkg/sentry/fs/inode_operations.go     | 2 --
 pkg/sentry/syscalls/linux/sys_file.go | 8 ++++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go
index d6c35c2dc..5cde9d215 100644
--- a/pkg/sentry/fs/inode_operations.go
+++ b/pkg/sentry/fs/inode_operations.go
@@ -221,8 +221,6 @@ type InodeOperations interface {
 	// sys_ftruncate.
 	//
 	// Implementations need not check that length >= 0.
-	//
-	// Truncate must only be called on regular files.
 	Truncate(ctx context.Context, inode *Inode, size int64) error
 
 	// Allocate allows the caller to reserve disk space for the inode.
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index c9f57fe27..b9a8e3e21 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -169,7 +169,7 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint
 			if dirPath {
 				return syserror.ENOTDIR
 			}
-			if flags&linux.O_TRUNC != 0 && fs.IsRegular(d.Inode.StableAttr) {
+			if flags&linux.O_TRUNC != 0 {
 				if err := d.Inode.Truncate(t, d, 0); err != nil {
 					return err
 				}
@@ -397,7 +397,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
 			}
 
 			// Should we truncate the file?
-			if flags&linux.O_TRUNC != 0 && fs.IsRegular(found.Inode.StableAttr) {
+			if flags&linux.O_TRUNC != 0 {
 				if err := found.Inode.Truncate(t, found, 0); err != nil {
 					return err
 				}
@@ -1483,7 +1483,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		if fs.IsDir(d.Inode.StableAttr) {
 			return syserror.EISDIR
 		}
-		if !fs.IsRegular(d.Inode.StableAttr) {
+		if !fs.IsFile(d.Inode.StableAttr) {
 			return syserror.EINVAL
 		}
 
@@ -1523,7 +1523,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 
 	// Note that this is different from truncate(2) above, where a
 	// directory returns EISDIR.
-	if !fs.IsRegular(file.Dirent.Inode.StableAttr) {
+	if !fs.IsFile(file.Dirent.Inode.StableAttr) {
 		return 0, nil, syserror.EINVAL
 	}
 
-- 
cgit v1.2.3


From 50d6236111485acce0e728794c4f53884097ea7d Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 8 Nov 2019 14:07:34 -0800
Subject: Update kokoro images to install junitparser

junitparser will be used to merge junit xml files.

PiperOrigin-RevId: 279387305
---
 kokoro/ubuntu1604/40_kokoro.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kokoro/ubuntu1604/40_kokoro.sh b/kokoro/ubuntu1604/40_kokoro.sh
index 64772d74d..b132abcc8 100755
--- a/kokoro/ubuntu1604/40_kokoro.sh
+++ b/kokoro/ubuntu1604/40_kokoro.sh
@@ -23,7 +23,10 @@ declare -r ssh_public_keys=(
 )
 
 # Install dependencies.
-apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm
+apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm python-pip
+
+# junitparser is used to merge junit xml files.
+pip install junitparser
 
 # We need a kbuilder user.
 if useradd -c "kbuilder user" -m -s /bin/bash kbuilder; then
-- 
cgit v1.2.3


From 14f4461f93a4c0014314a35a374ce07eec25636c Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 8 Nov 2019 15:43:47 -0800
Subject: kokoro: update images to install zip

PiperOrigin-RevId: 279406266
---
 kokoro/ubuntu1604/40_kokoro.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kokoro/ubuntu1604/40_kokoro.sh b/kokoro/ubuntu1604/40_kokoro.sh
index b132abcc8..3f50929d5 100755
--- a/kokoro/ubuntu1604/40_kokoro.sh
+++ b/kokoro/ubuntu1604/40_kokoro.sh
@@ -23,7 +23,7 @@ declare -r ssh_public_keys=(
 )
 
 # Install dependencies.
-apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm python-pip
+apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm python-pip zip
 
 # junitparser is used to merge junit xml files.
 pip install junitparser
-- 
cgit v1.2.3


From b91ad8fa0950c752ab08af7d08727d5a97b14b12 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 8 Nov 2019 16:39:25 -0800
Subject: test: merge log files of all shards for each test suite

This significantly speeds up a process of uploading this files
to sponge and resultstore by kokoro.

PiperOrigin-RevId: 279416349
---
 scripts/common_bazel.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/scripts/common_bazel.sh b/scripts/common_bazel.sh
index f8ec967b1..a82163297 100755
--- a/scripts/common_bazel.sh
+++ b/scripts/common_bazel.sh
@@ -71,6 +71,13 @@ function run_as_root() {
 function collect_logs() {
   # Zip out everything into a convenient form.
   if [[ -v KOKORO_ARTIFACTS_DIR ]] && [[ -e bazel-testlogs ]]; then
+    # Merge results files of all shards for each test suite.
+    for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do
+      junitparser merge `find $d -name test.xml` $d/test.xml
+      cat $d/shard_*_of_*/test.log > $d/test.log
+      ls -l $d/shard_*_of_*/outputs.zip && zip -r -1 $d/outputs.zip $d/shard_*_of_*/outputs.zip
+    done
+    find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf
     # Move test logs to Kokoro directory. tar is used to conveniently perform
     # renames while moving files.
     find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" |
-- 
cgit v1.2.3


From 773071680021a2fb985f3a3af7e9f65cdc1bd1ed Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Mon, 11 Nov 2019 14:13:50 -0800
Subject: Make `connect` on socket returned by `accept` correctly error out
 with EISCONN

PiperOrigin-RevId: 279814493
---
 pkg/tcpip/transport/tcp/accept.go   |  2 ++
 pkg/tcpip/transport/tcp/tcp_test.go |  3 +++
 test/syscalls/linux/tcp_socket.cc   | 13 +++++++++++++
 3 files changed, 18 insertions(+)

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 0e8e0a2b4..f24b51b91 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -300,6 +300,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	ep.mu.Lock()
 	ep.stack.Stats().TCP.CurrentEstablished.Increment()
 	ep.state = StateEstablished
+	ep.isConnectNotified = true
 	ep.mu.Unlock()
 
 	// Update the receive window scaling. We can't do it before the
@@ -539,6 +540,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// Switch state to connected.
 		n.stack.Stats().TCP.CurrentEstablished.Increment()
 		n.state = StateEstablished
+		n.isConnectNotified = true
 
 		// Do the delivery in a separate goroutine so
 		// that we don't block the listen loop in case
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 0c1704d74..84579ce52 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -4599,6 +4599,9 @@ func TestEndpointBindListenAcceptState(t *testing.T) {
 	if got, want := tcp.EndpointState(aep.State()), tcp.StateEstablished; got != want {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
 	}
+	if err := aep.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrAlreadyConnected {
+		t.Errorf("Unexpected error attempting to call connect on an established endpoint, got: %v, want: %v", err, tcpip.ErrAlreadyConnected)
+	}
 	// Listening endpoint remains in listen state.
 	if got, want := tcp.EndpointState(ep.State()), tcp.StateListen; got != want {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 277d6835a..bfc77ffc2 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -130,6 +130,19 @@ void TcpSocketTest::TearDown() {
   }
 }
 
+TEST_P(TcpSocketTest, ConnectOnEstablishedConnection) {
+  sockaddr_storage addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t addrlen = sizeof(addr);
+
+  ASSERT_THAT(
+      connect(s_, reinterpret_cast<const struct sockaddr*>(&addr), addrlen),
+      SyscallFailsWithErrno(EISCONN));
+  ASSERT_THAT(
+      connect(t_, reinterpret_cast<const struct sockaddr*>(&addr), addrlen),
+      SyscallFailsWithErrno(EISCONN));
+}
+
 TEST_P(TcpSocketTest, DataCoalesced) {
   char buf[10];
 
-- 
cgit v1.2.3


From e09e7bf72f3e0208c7f557d9931407ee8729ebb2 Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Mon, 11 Nov 2019 14:41:44 -0800
Subject: Add more extended features.

PiperOrigin-RevId: 279820435
---
 pkg/cpuid/cpuid.go | 48 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go
index 5d61dc2ff..d37047368 100644
--- a/pkg/cpuid/cpuid.go
+++ b/pkg/cpuid/cpuid.go
@@ -183,6 +183,33 @@ const (
 	X86FeatureAVX512VBMI
 	X86FeatureUMIP
 	X86FeaturePKU
+	X86FeatureOSPKE
+	X86FeatureWAITPKG
+	X86FeatureAVX512_VBMI2
+	_ // ecx bit 7 is reserved
+	X86FeatureGFNI
+	X86FeatureVAES
+	X86FeatureVPCLMULQDQ
+	X86FeatureAVX512_VNNI
+	X86FeatureAVX512_BITALG
+	X86FeatureTME
+	X86FeatureAVX512_VPOPCNTDQ
+	_ // ecx bit 15 is reserved
+	X86FeatureLA57
+	// ecx bits 17-21 are reserved
+	_
+	_
+	_
+	_
+	_
+	X86FeatureRDPID
+	// ecx bits 23-24 are reserved
+	_
+	_
+	X86FeatureCLDEMOTE
+	_ // ecx bit 26 is reserved
+	X86FeatureMOVDIRI
+	X86FeatureMOVDIR64B
 )
 
 // Block 4 constants are for xsave capabilities in CPUID.(EAX=0DH,ECX=01H):EAX.
@@ -353,9 +380,24 @@ var x86FeatureStrings = map[Feature]string{
 	X86FeatureAVX512VL:   "avx512vl",
 
 	// Block 3.
-	X86FeatureAVX512VBMI: "avx512vbmi",
-	X86FeatureUMIP:       "umip",
-	X86FeaturePKU:        "pku",
+	X86FeatureAVX512VBMI:       "avx512vbmi",
+	X86FeatureUMIP:             "umip",
+	X86FeaturePKU:              "pku",
+	X86FeatureOSPKE:            "ospke",
+	X86FeatureWAITPKG:          "waitpkg",
+	X86FeatureAVX512_VBMI2:     "avx512_vbmi2",
+	X86FeatureGFNI:             "gfni",
+	X86FeatureVAES:             "vaes",
+	X86FeatureVPCLMULQDQ:       "vpclmulqdq",
+	X86FeatureAVX512_VNNI:      "avx512_vnni",
+	X86FeatureAVX512_BITALG:    "avx512_bitalg",
+	X86FeatureTME:              "tme",
+	X86FeatureAVX512_VPOPCNTDQ: "avx512_vpopcntdq",
+	X86FeatureLA57:             "la57",
+	X86FeatureRDPID:            "rdpid",
+	X86FeatureCLDEMOTE:         "cldemote",
+	X86FeatureMOVDIRI:          "movdiri",
+	X86FeatureMOVDIR64B:        "movdir64b",
 
 	// Block 4.
 	X86FeatureXSAVEOPT: "xsaveopt",
-- 
cgit v1.2.3


From 2b0e4dc6aa7fb8a3f619220b72537a8fff2f95b4 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Mon, 11 Nov 2019 15:49:49 -0800
Subject: Remove obsolete TODO. This is now fixed.

PiperOrigin-RevId: 279835100
---
 test/syscalls/linux/tcp_socket.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index bfc77ffc2..99863b0ed 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -425,6 +425,11 @@ TEST_P(TcpSocketTest, PollWithFullBufferBlocks) {
   }
   // The last error should have been EWOULDBLOCK.
   ASSERT_EQ(errno, EWOULDBLOCK);
+
+  // Now polling on the FD with a timeout should return 0 corresponding to no
+  // FDs ready.
+  struct pollfd poll_fd = {s_, POLLOUT, 0};
+  EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10), SyscallSucceedsWithValue(0));
 }
 
 TEST_P(TcpSocketTest, MsgTrunc) {
-- 
cgit v1.2.3


From b82bd24f9495435cadd2713db829b19ce8fcce9d Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Mon, 11 Nov 2019 18:34:28 -0800
Subject: Update ephemeral port reservation tests.

The existing tests which are disabled on gVisor are failing because we default
to SO_REUSEADDR being enabled for TCP sockets. Update the test comments.

Also add new tests for enabled SO_REUSEADDR.

PiperOrigin-RevId: 279862275
---
 test/syscalls/linux/socket_inet_loopback.cc | 223 ++++++++++++++++++++++++++--
 1 file changed, 212 insertions(+), 11 deletions(-)

diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 2eeee352e..96a1731cf 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -1156,10 +1156,9 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6OnlyV6AnyReservesV6) {
     sockaddr_storage addr_dual = test_addr_dual.addr;
     const FileDescriptor fd_dual = ASSERT_NO_ERRNO_AND_VALUE(
         Socket(test_addr_dual.family(), param.type, 0));
-    int one = 1;
-    EXPECT_THAT(
-        setsockopt(fd_dual.get(), IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one)),
-        SyscallSucceeds());
+    EXPECT_THAT(setsockopt(fd_dual.get(), IPPROTO_IPV6, IPV6_V6ONLY,
+                           &kSockOptOn, sizeof(kSockOptOn)),
+                SyscallSucceeds());
     ASSERT_THAT(bind(fd_dual.get(), reinterpret_cast<sockaddr*>(&addr_dual),
                      test_addr_dual.addr_len),
                 SyscallSucceeds());
@@ -1207,7 +1206,8 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6OnlyV6AnyReservesV6) {
 TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) {
   auto const& param = GetParam();
 
-  // FIXME(b/114268588)
+  // FIXME(b/76031995): Support disabling SO_REUSEADDR for TCP sockets and make
+  // it disabled by default.
   SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM);
 
   for (int i = 0; true; i++) {
@@ -1305,10 +1305,76 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) {
   }
 }
 
+TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReservedReuseAddr) {
+  auto const& param = GetParam();
+
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_DGRAM);
+
+  // Bind the v6 loopback on a dual stack socket.
+  TestAddress const& test_addr = V6Loopback();
+  sockaddr_storage bound_addr = test_addr.addr;
+  const FileDescriptor bound_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                   test_addr.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Listen iff TCP.
+  if (param.type == SOCK_STREAM) {
+    ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds());
+  }
+
+  // Get the port that we bound.
+  socklen_t bound_addr_len = test_addr.addr_len;
+  ASSERT_THAT(
+      getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                  &bound_addr_len),
+      SyscallSucceeds());
+
+  // Connect to bind an ephemeral port.
+  const FileDescriptor connected_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(connect(connected_fd.get(),
+                      reinterpret_cast<sockaddr*>(&bound_addr), bound_addr_len),
+              SyscallSucceeds());
+
+  // Get the ephemeral port.
+  sockaddr_storage connected_addr = {};
+  socklen_t connected_addr_len = sizeof(connected_addr);
+  ASSERT_THAT(getsockname(connected_fd.get(),
+                          reinterpret_cast<sockaddr*>(&connected_addr),
+                          &connected_addr_len),
+              SyscallSucceeds());
+  uint16_t const ephemeral_port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
+
+  // Verify that we actually got an ephemeral port.
+  ASSERT_NE(ephemeral_port, 0);
+
+  // Verify that the ephemeral port is not reserved.
+  const FileDescriptor checking_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(checking_fd.get(), reinterpret_cast<sockaddr*>(&connected_addr),
+           connected_addr_len),
+      SyscallSucceeds());
+}
+
 TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) {
   auto const& param = GetParam();
 
-  // FIXME(b/114268588)
+  // FIXME(b/76031995): Support disabling SO_REUSEADDR for TCP sockets and make
+  // it disabled by default.
   SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM);
 
   for (int i = 0; true; i++) {
@@ -1408,9 +1474,8 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) {
       // v6-only socket.
       const FileDescriptor fd_v6_only_any = ASSERT_NO_ERRNO_AND_VALUE(
           Socket(test_addr_v6_any.family(), param.type, 0));
-      int one = 1;
       EXPECT_THAT(setsockopt(fd_v6_only_any.get(), IPPROTO_IPV6, IPV6_V6ONLY,
-                             &one, sizeof(one)),
+                             &kSockOptOn, sizeof(kSockOptOn)),
                   SyscallSucceeds());
       ret =
           bind(fd_v6_only_any.get(), reinterpret_cast<sockaddr*>(&addr_v6_any),
@@ -1429,10 +1494,78 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) {
   }
 }
 
+TEST_P(SocketMultiProtocolInetLoopbackTest,
+       V4MappedEphemeralPortReservedResueAddr) {
+  auto const& param = GetParam();
+
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_DGRAM);
+
+  // Bind the v4 loopback on a dual stack socket.
+  TestAddress const& test_addr = V4MappedLoopback();
+  sockaddr_storage bound_addr = test_addr.addr;
+  const FileDescriptor bound_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                   test_addr.addr_len),
+              SyscallSucceeds());
+
+  ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Listen iff TCP.
+  if (param.type == SOCK_STREAM) {
+    ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds());
+  }
+
+  // Get the port that we bound.
+  socklen_t bound_addr_len = test_addr.addr_len;
+  ASSERT_THAT(
+      getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                  &bound_addr_len),
+      SyscallSucceeds());
+
+  // Connect to bind an ephemeral port.
+  const FileDescriptor connected_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(connect(connected_fd.get(),
+                      reinterpret_cast<sockaddr*>(&bound_addr), bound_addr_len),
+              SyscallSucceeds());
+
+  // Get the ephemeral port.
+  sockaddr_storage connected_addr = {};
+  socklen_t connected_addr_len = sizeof(connected_addr);
+  ASSERT_THAT(getsockname(connected_fd.get(),
+                          reinterpret_cast<sockaddr*>(&connected_addr),
+                          &connected_addr_len),
+              SyscallSucceeds());
+  uint16_t const ephemeral_port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
+
+  // Verify that we actually got an ephemeral port.
+  ASSERT_NE(ephemeral_port, 0);
+
+  // Verify that the ephemeral port is not reserved.
+  const FileDescriptor checking_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(checking_fd.get(), reinterpret_cast<sockaddr*>(&connected_addr),
+           connected_addr_len),
+      SyscallSucceeds());
+}
+
 TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) {
   auto const& param = GetParam();
 
-  // FIXME(b/114268588)
+  // FIXME(b/76031995): Support disabling SO_REUSEADDR for TCP sockets and make
+  // it disabled by default.
   SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM);
 
   for (int i = 0; true; i++) {
@@ -1533,9 +1666,8 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) {
       // v6-only socket.
       const FileDescriptor fd_v6_only_any = ASSERT_NO_ERRNO_AND_VALUE(
           Socket(test_addr_v6_any.family(), param.type, 0));
-      int one = 1;
       EXPECT_THAT(setsockopt(fd_v6_only_any.get(), IPPROTO_IPV6, IPV6_V6ONLY,
-                             &one, sizeof(one)),
+                             &kSockOptOn, sizeof(kSockOptOn)),
                   SyscallSucceeds());
       ret =
           bind(fd_v6_only_any.get(), reinterpret_cast<sockaddr*>(&addr_v6_any),
@@ -1554,6 +1686,75 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) {
   }
 }
 
+TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReservedReuseAddr) {
+  auto const& param = GetParam();
+
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_DGRAM);
+
+  // Bind the v4 loopback on a v4 socket.
+  TestAddress const& test_addr = V4Loopback();
+  sockaddr_storage bound_addr = test_addr.addr;
+  const FileDescriptor bound_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+
+  ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                   test_addr.addr_len),
+              SyscallSucceeds());
+
+  // Listen iff TCP.
+  if (param.type == SOCK_STREAM) {
+    ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds());
+  }
+
+  // Get the port that we bound.
+  socklen_t bound_addr_len = test_addr.addr_len;
+  ASSERT_THAT(
+      getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                  &bound_addr_len),
+      SyscallSucceeds());
+
+  // Connect to bind an ephemeral port.
+  const FileDescriptor connected_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+
+  ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(connect(connected_fd.get(),
+                      reinterpret_cast<sockaddr*>(&bound_addr), bound_addr_len),
+              SyscallSucceeds());
+
+  // Get the ephemeral port.
+  sockaddr_storage connected_addr = {};
+  socklen_t connected_addr_len = sizeof(connected_addr);
+  ASSERT_THAT(getsockname(connected_fd.get(),
+                          reinterpret_cast<sockaddr*>(&connected_addr),
+                          &connected_addr_len),
+              SyscallSucceeds());
+  uint16_t const ephemeral_port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
+
+  // Verify that we actually got an ephemeral port.
+  ASSERT_NE(ephemeral_port, 0);
+
+  // Verify that the ephemeral port is not reserved.
+  const FileDescriptor checking_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(checking_fd.get(), reinterpret_cast<sockaddr*>(&connected_addr),
+           connected_addr_len),
+      SyscallSucceeds());
+}
+
 TEST_P(SocketMultiProtocolInetLoopbackTest, PortReuseTwoSockets) {
   auto const& param = GetParam();
   TestAddress const& test_addr = V4Loopback();
-- 
cgit v1.2.3


From 548d65b2b6116beecb2aa782a0b5428fb20f89a0 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 12 Nov 2019 12:00:16 -0800
Subject: kokoro: correct a path to outputs.zip

PiperOrigin-RevId: 280021914
---
 scripts/common_bazel.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/common_bazel.sh b/scripts/common_bazel.sh
index a82163297..bbc1a038e 100755
--- a/scripts/common_bazel.sh
+++ b/scripts/common_bazel.sh
@@ -75,7 +75,7 @@ function collect_logs() {
     for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do
       junitparser merge `find $d -name test.xml` $d/test.xml
       cat $d/shard_*_of_*/test.log > $d/test.log
-      ls -l $d/shard_*_of_*/outputs.zip && zip -r -1 $d/outputs.zip $d/shard_*_of_*/outputs.zip
+      ls -l $d/shard_*_of_*/test.outputs/outputs.zip && zip -r -1 $d/outputs.zip $d/shard_*_of_*/test.outputs/outputs.zip
     done
     find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf
     # Move test logs to Kokoro directory. tar is used to conveniently perform
-- 
cgit v1.2.3


From 57a2a5ea3359e0879f5e4cc40fdb9ad973c689a8 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 12 Nov 2019 14:02:53 -0800
Subject: Add tests for SO_REUSEADDR and SO_REUSEPORT.

* Basic tests for the SO_REUSEADDR and SO_REUSEPORT options.
* SO_REUSEADDR functional tests for TCP and UDP.
* SO_REUSEADDR and SO_REUSEPORT interaction tests for UDP.
* Stubbed support for UDP getsockopt(SO_REUSEADDR).

PiperOrigin-RevId: 280049265
---
 pkg/tcpip/transport/udp/endpoint.go                |    4 +
 test/syscalls/linux/BUILD                          |    1 +
 test/syscalls/linux/socket_ip_udp_generic.cc       |  128 ++-
 test/syscalls/linux/socket_ipv4_udp_unbound.cc     | 1116 ++++++++++++--------
 test/syscalls/linux/socket_ipv4_udp_unbound.h      |    4 +-
 .../linux/socket_ipv4_udp_unbound_loopback.cc      |   13 +-
 test/syscalls/linux/socket_test_util.h             |    3 +
 7 files changed, 789 insertions(+), 480 deletions(-)

diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 5270f24df..dda7af910 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -732,6 +732,10 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		*o = tcpip.MulticastLoopOption(v)
 		return nil
 
+	case *tcpip.ReuseAddressOption:
+		*o = 0
+		return nil
+
 	case *tcpip.ReusePortOption:
 		e.mu.RLock()
 		v := e.reusePort
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index f8b8cb724..6345ea28c 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2234,6 +2234,7 @@ cc_library(
         ":ip_socket_test_util",
         ":socket_test_util",
         "//test/util:test_util",
+        "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 2a4ed04a5..66eb68857 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -35,7 +35,7 @@ TEST_P(UDPSocketPairTest, MulticastTTLDefault) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -52,7 +52,7 @@ TEST_P(UDPSocketPairTest, SetUDPMulticastTTLMin) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -69,7 +69,7 @@ TEST_P(UDPSocketPairTest, SetUDPMulticastTTLMax) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -91,7 +91,7 @@ TEST_P(UDPSocketPairTest, SetUDPMulticastTTLNegativeOne) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -126,7 +126,7 @@ TEST_P(UDPSocketPairTest, SetUDPMulticastTTLChar) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -147,7 +147,7 @@ TEST_P(UDPSocketPairTest, MulticastLoopDefault) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -163,7 +163,7 @@ TEST_P(UDPSocketPairTest, SetMulticastLoop) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -173,7 +173,7 @@ TEST_P(UDPSocketPairTest, SetMulticastLoop) {
                          &kSockOptOn, sizeof(kSockOptOn)),
               SyscallSucceeds());
 
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -192,7 +192,7 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -202,12 +202,120 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
                          &kSockOptOnChar, sizeof(kSockOptOnChar)),
               SyscallSucceeds());
 
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
   EXPECT_EQ(get, kSockOptOn);
 }
 
+TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(UDPSocketPairTest, SetReuseAddr) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOff, sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(UDPSocketPairTest, ReusePortDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(UDPSocketPairTest, SetReusePort) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT,
+                         &kSockOptOff, sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(UDPSocketPairTest, SetReuseAddrReusePort) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index b828b6844..00dc24928 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -18,9 +18,11 @@
 #include <sys/ioctl.h>
 #include <sys/socket.h>
 #include <sys/un.h>
+
 #include <cstdio>
 
 #include "gtest/gtest.h"
+#include "absl/memory/memory.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
@@ -51,26 +53,27 @@ TestAddress V4Broadcast() {
 
 // Check that packets are not received without a group membership. Default send
 // interface configured by bind.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNoGroup) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNoGroup) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the first FD to the loopback. This is an alternative to
   // IP_MULTICAST_IF for setting the default send interface.
   auto sender_addr = V4Loopback();
   EXPECT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
 
   // Bind the second FD to the v4 any address. If multicast worked like unicast,
   // this would ensure that we get the packet.
   auto receiver_addr = V4Any();
-  EXPECT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -82,33 +85,33 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNoGroup) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  EXPECT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  EXPECT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
 
 // Check that not setting a default send interface prevents multicast packets
 // from being sent. Group membership interface configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddrNoDefaultSendIf) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackAddrNoDefaultSendIf) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the second FD to the v4 any address to ensure that we can receive any
   // unicast packet.
   auto receiver_addr = V4Any();
-  EXPECT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -118,8 +121,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddrNoDefaultSendIf) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -128,27 +131,27 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddrNoDefaultSendIf) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  EXPECT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallFailsWithErrno(ENETUNREACH));
+  EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallFailsWithErrno(ENETUNREACH));
 }
 
 // Check that not setting a default send interface prevents multicast packets
 // from being sent. Group membership interface configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNicNoDefaultSendIf) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNicNoDefaultSendIf) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the second FD to the v4 any address to ensure that we can receive any
   // unicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -158,8 +161,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNicNoDefaultSendIf) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -168,35 +171,35 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNicNoDefaultSendIf) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  EXPECT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallFailsWithErrno(ENETUNREACH));
+  EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallFailsWithErrno(ENETUNREACH));
 }
 
 // Check that multicast works when the default send interface is configured by
 // bind and the group membership is configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the first FD to the loopback. This is an alternative to
   // IP_MULTICAST_IF for setting the default send interface.
   auto sender_addr = V4Loopback();
   ASSERT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -206,8 +209,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -216,43 +219,42 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
 
 // Check that multicast works when the default send interface is configured by
 // bind and the group membership is configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNic) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the first FD to the loopback. This is an alternative to
   // IP_MULTICAST_IF for setting the default send interface.
   auto sender_addr = V4Loopback();
   ASSERT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -262,8 +264,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -272,17 +274,15 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -290,25 +290,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in sendto, and the group
 // membership is configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -318,8 +319,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -328,17 +329,15 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -346,25 +345,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in sendto, and the group
 // membership is configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNic) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreqn iface = {};
   iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -374,8 +374,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -384,17 +384,15 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -402,25 +400,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in connect, and the group
 // membership is configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrConnect) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -430,8 +429,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -439,22 +438,20 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) {
   reinterpret_cast<sockaddr_in*>(&connect_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   ASSERT_THAT(
-      RetryEINTR(connect)(sockets->first_fd(),
+      RetryEINTR(connect)(socket1->get(),
                           reinterpret_cast<sockaddr*>(&connect_addr.addr),
                           connect_addr.addr_len),
       SyscallSucceeds());
 
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(send)(socket1->get(), send_buf, sizeof(send_buf), 0),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -462,25 +459,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in connect, and the group
 // membership is configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicConnect) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreqn iface = {};
   iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -490,8 +488,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -499,22 +497,20 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) {
   reinterpret_cast<sockaddr_in*>(&connect_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   ASSERT_THAT(
-      RetryEINTR(connect)(sockets->first_fd(),
+      RetryEINTR(connect)(socket1->get(),
                           reinterpret_cast<sockaddr*>(&connect_addr.addr),
                           connect_addr.addr_len),
       SyscallSucceeds());
 
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(send)(socket1->get(), send_buf, sizeof(send_buf), 0),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -522,25 +518,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in sendto, and the group
 // membership is configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelf) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrSelf) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the first FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -550,8 +547,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelf) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -560,17 +557,15 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelf) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -578,25 +573,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelf) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in sendto, and the group
 // membership is configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelf) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelf) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreqn iface = {};
   iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the first FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -606,8 +602,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelf) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -616,17 +612,15 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelf) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -634,25 +628,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelf) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in connect, and the group
 // membership is configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfConnect) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrSelfConnect) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the first FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -662,8 +657,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfConnect) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -671,20 +666,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfConnect) {
   reinterpret_cast<sockaddr_in*>(&connect_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   EXPECT_THAT(
-      RetryEINTR(connect)(sockets->first_fd(),
+      RetryEINTR(connect)(socket1->get(),
                           reinterpret_cast<sockaddr*>(&connect_addr.addr),
                           connect_addr.addr_len),
       SyscallSucceeds());
 
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(send)(socket1->get(), send_buf, sizeof(send_buf), 0),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf),
+  EXPECT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
@@ -692,25 +686,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfConnect) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in connect, and the group
 // membership is configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfConnect) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelfConnect) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreqn iface = {};
   iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the first FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -720,8 +715,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfConnect) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -729,20 +724,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfConnect) {
   reinterpret_cast<sockaddr_in*>(&connect_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   ASSERT_THAT(
-      RetryEINTR(connect)(sockets->first_fd(),
+      RetryEINTR(connect)(socket1->get(),
                           reinterpret_cast<sockaddr*>(&connect_addr.addr),
                           connect_addr.addr_len),
       SyscallSucceeds());
 
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(send)(socket1->get(), send_buf, sizeof(send_buf), 0),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf),
+  EXPECT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
@@ -750,29 +744,30 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfConnect) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in sendto, and the group
 // membership is configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfNoLoop) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrSelfNoLoop) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &kSockOptOff, sizeof(kSockOptOff)),
               SyscallSucceeds());
 
   // Bind the first FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -782,8 +777,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfNoLoop) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -792,17 +787,15 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfNoLoop) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -810,29 +803,30 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfNoLoop) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in sendto, and the group
 // membership is configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfNoLoop) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelfNoLoop) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreqn iface = {};
   iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &kSockOptOff, sizeof(kSockOptOff)),
               SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -842,8 +836,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfNoLoop) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -852,57 +846,57 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfNoLoop) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
 
 // Check that dropping a group membership that does not exist fails.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastInvalidDrop) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastInvalidDrop) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Unregister from a membership that we didn't have.
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallFailsWithErrno(EADDRNOTAVAIL));
 }
 
 // Check that dropping a group membership prevents multicast packets from being
 // delivered. Default send address configured by bind and group membership
 // interface configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropAddr) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastDropAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the first FD to the loopback. This is an alternative to
   // IP_MULTICAST_IF for setting the default send interface.
   auto sender_addr = V4Loopback();
   EXPECT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  EXPECT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -912,11 +906,11 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropAddr) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -925,15 +919,14 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropAddr) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  EXPECT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  EXPECT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
@@ -941,26 +934,27 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropAddr) {
 // Check that dropping a group membership prevents multicast packets from being
 // delivered. Default send address configured by bind and group membership
 // interface configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropNic) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastDropNic) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the first FD to the loopback. This is an alternative to
   // IP_MULTICAST_IF for setting the default send interface.
   auto sender_addr = V4Loopback();
   EXPECT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  EXPECT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -970,11 +964,11 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropNic) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -983,50 +977,53 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropNic) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  EXPECT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  EXPECT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfZero) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfZero) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn iface = {};
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfInvalidNic) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfInvalidNic) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn iface = {};
   iface.imr_ifindex = -1;
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallFailsWithErrno(EADDRNOTAVAIL));
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfInvalidAddr) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfInvalidAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreq iface = {};
   iface.imr_interface.s_addr = inet_addr("255.255.255");
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallFailsWithErrno(EADDRNOTAVAIL));
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetShort) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetShort) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Create a valid full-sized request.
   ip_mreqn iface = {};
@@ -1034,29 +1031,31 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetShort) {
 
   // Send an optlen of 1 to check that optlen is enforced.
   EXPECT_THAT(
-      setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &iface, 1),
+      setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, 1),
       SyscallFailsWithErrno(EINVAL));
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfDefault) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfDefault) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   in_addr get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
   EXPECT_EQ(size, sizeof(get));
   EXPECT_EQ(get.s_addr, 0);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfDefaultReqn) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfDefaultReqn) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
 
   // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the
@@ -1071,19 +1070,20 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfDefaultReqn) {
   EXPECT_EQ(get.imr_ifindex, 0);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetAddrGetReqn) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetAddrGetReqn) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   in_addr set = {};
   set.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
                          sizeof(set)),
               SyscallSucceeds());
 
   ip_mreqn get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
 
   // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the
@@ -1095,19 +1095,20 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetAddrGetReqn) {
   EXPECT_EQ(get.imr_ifindex, 0);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetReqAddrGetReqn) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetReqAddrGetReqn) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreq set = {};
   set.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
                          sizeof(set)),
               SyscallSucceeds());
 
   ip_mreqn get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
 
   // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the
@@ -1119,19 +1120,20 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetReqAddrGetReqn) {
   EXPECT_EQ(get.imr_ifindex, 0);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetNicGetReqn) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetNicGetReqn) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn set = {};
   set.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
                          sizeof(set)),
               SyscallSucceeds());
 
   ip_mreqn get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
   EXPECT_EQ(size, sizeof(in_addr));
   EXPECT_EQ(get.imr_multiaddr.s_addr, 0);
@@ -1139,87 +1141,93 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetNicGetReqn) {
   EXPECT_EQ(get.imr_ifindex, 0);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetAddr) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   in_addr set = {};
   set.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
                          sizeof(set)),
               SyscallSucceeds());
 
   in_addr get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
 
   EXPECT_EQ(size, sizeof(get));
   EXPECT_EQ(get.s_addr, set.s_addr);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetReqAddr) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetReqAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreq set = {};
   set.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
                          sizeof(set)),
               SyscallSucceeds());
 
   in_addr get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
 
   EXPECT_EQ(size, sizeof(get));
   EXPECT_EQ(get.s_addr, set.imr_interface.s_addr);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetNic) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetNic) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn set = {};
   set.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
                          sizeof(set)),
               SyscallSucceeds());
 
   in_addr get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
   EXPECT_EQ(size, sizeof(get));
   EXPECT_EQ(get.s_addr, 0);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, TestJoinGroupNoIf) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestJoinGroupNoIf) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallFailsWithErrno(ENODEV));
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, TestJoinGroupInvalidIf) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestJoinGroupInvalidIf) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn group = {};
   group.imr_address.s_addr = inet_addr("255.255.255");
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallFailsWithErrno(ENODEV));
 }
 
 // Check that multiple memberships are not allowed on the same socket.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestMultipleJoinsOnSingleSocket) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-  auto fd = sockets->first_fd();
+TEST_P(IPv4UDPUnboundSocketTest, TestMultipleJoinsOnSingleSocket) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto fd = socket1->get();
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
@@ -1234,41 +1242,44 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestMultipleJoinsOnSingleSocket) {
 }
 
 // Check that two sockets can join the same multicast group at the same time.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestTwoSocketsJoinSameMulticastGroup) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestTwoSocketsJoinSameMulticastGroup) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Drop the membership twice on each socket, the second call for each socket
   // should fail.
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallFailsWithErrno(EADDRNOTAVAIL));
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallFailsWithErrno(EADDRNOTAVAIL));
 }
 
 // Check that two sockets can join the same multicast group at the same time,
 // and both will receive data on it.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestMcastReceptionOnTwoSockets) {
+TEST_P(IPv4UDPUnboundSocketTest, TestMcastReceptionOnTwoSockets) {
   std::unique_ptr<SocketPair> socket_pairs[2] = {
-      ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()),
-      ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair())};
+      absl::make_unique<FDSocketPair>(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+                                      ASSERT_NO_ERRNO_AND_VALUE(NewSocket())),
+      absl::make_unique<FDSocketPair>(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+                                      ASSERT_NO_ERRNO_AND_VALUE(NewSocket()))};
 
   ip_mreq iface = {}, group = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
@@ -1338,11 +1349,12 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestMcastReceptionOnTwoSockets) {
 // Check that on two sockets that joined a group and listen on ANY, dropping
 // memberships one by one will continue to deliver packets to both sockets until
 // both memberships have been dropped.
-TEST_P(IPv4UDPUnboundSocketPairTest,
-       TestMcastReceptionWhenDroppingMemberships) {
+TEST_P(IPv4UDPUnboundSocketTest, TestMcastReceptionWhenDroppingMemberships) {
   std::unique_ptr<SocketPair> socket_pairs[2] = {
-      ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()),
-      ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair())};
+      absl::make_unique<FDSocketPair>(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+                                      ASSERT_NO_ERRNO_AND_VALUE(NewSocket())),
+      absl::make_unique<FDSocketPair>(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+                                      ASSERT_NO_ERRNO_AND_VALUE(NewSocket()))};
 
   ip_mreq iface = {}, group = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
@@ -1437,18 +1449,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest,
 
 // Check that a receiving socket can bind to the multicast address before
 // joining the group and receive data once the group has been joined.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenJoinThenReceive) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToMcastThenJoinThenReceive) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind second socket (receiver) to the multicast address.
   auto receiver_addr = V4Multicast();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   // Update receiver_addr with the correct port number.
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -1458,30 +1471,29 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenJoinThenReceive) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet on the first socket out the loopback interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
   auto sendto_addr = V4Multicast();
   reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&sendto_addr.addr),
-                         sendto_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+                                 sendto_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallSucceedsWithValue(sizeof(recv_buf)));
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
@@ -1489,18 +1501,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenJoinThenReceive) {
 
 // Check that a receiving socket can bind to the multicast address and won't
 // receive multicast data if it hasn't joined the group.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenNoJoinThenNoReceive) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToMcastThenNoJoinThenNoReceive) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind second socket (receiver) to the multicast address.
   auto receiver_addr = V4Multicast();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   // Update receiver_addr with the correct port number.
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -1509,40 +1522,40 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenNoJoinThenNoReceive) {
   // Send a multicast packet on the first socket out the loopback interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
   auto sendto_addr = V4Multicast();
   reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&sendto_addr.addr),
-                         sendto_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+                                 sendto_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we don't receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
 
 // Check that a socket can bind to a multicast address and still send out
 // packets.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenSend) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToMcastThenSend) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind second socket (receiver) to the ANY address.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -1551,11 +1564,11 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenSend) {
   // Bind the first socket (sender) to the multicast address.
   auto sender_addr = V4Multicast();
   ASSERT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
   socklen_t sender_addr_len = sender_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&sender_addr.addr),
                           &sender_addr_len),
               SyscallSucceeds());
@@ -1567,15 +1580,14 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenSend) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&sendto_addr.addr),
-                         sendto_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+                                 sendto_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallSucceedsWithValue(sizeof(recv_buf)));
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
@@ -1583,46 +1595,46 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenSend) {
 
 // Check that a receiving socket can bind to the broadcast address and receive
 // broadcast packets.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToBcastThenReceive) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToBcastThenReceive) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind second socket (receiver) to the broadcast address.
   auto receiver_addr = V4Broadcast();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
   EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
 
   // Send a broadcast packet on the first socket out the loopback interface.
-  EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_BROADCAST,
-                         &kSockOptOn, sizeof(kSockOptOn)),
+  EXPECT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn,
+                         sizeof(kSockOptOn)),
               SyscallSucceedsWithValue(0));
   // Note: Binding to the loopback interface makes the broadcast go out of it.
   auto sender_bind_addr = V4Loopback();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&sender_bind_addr.addr),
-                   sender_bind_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_bind_addr.addr),
+           sender_bind_addr.addr_len),
+      SyscallSucceeds());
   auto sendto_addr = V4Broadcast();
   reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&sendto_addr.addr),
-                         sendto_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+                                 sendto_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallSucceedsWithValue(sizeof(recv_buf)));
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
@@ -1630,17 +1642,18 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToBcastThenReceive) {
 
 // Check that a socket can bind to the broadcast address and still send out
 // packets.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToBcastThenSend) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToBcastThenSend) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind second socket (receiver) to the ANY address.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -1649,11 +1662,11 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToBcastThenSend) {
   // Bind the first socket (sender) to the broadcast address.
   auto sender_addr = V4Broadcast();
   ASSERT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
   socklen_t sender_addr_len = sender_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&sender_addr.addr),
                           &sender_addr_len),
               SyscallSucceeds());
@@ -1665,19 +1678,202 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToBcastThenSend) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&sendto_addr.addr),
-                         sendto_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+                                 sendto_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallSucceedsWithValue(sizeof(recv_buf)));
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
 
+// Check that SO_REUSEADDR always delivers to the most recently bound socket.
+TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrDistribution) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  std::vector<std::unique_ptr<FileDescriptor>> sockets;
+  sockets.emplace_back(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()));
+
+  ASSERT_THAT(setsockopt(sockets[0]->get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(sockets[0]->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(sockets[0]->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  constexpr int kMessageSize = 200;
+
+  for (int i = 0; i < 10; i++) {
+    // Add a new receiver.
+    sockets.emplace_back(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()));
+    auto& last = sockets.back();
+    ASSERT_THAT(setsockopt(last->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                           sizeof(kSockOptOn)),
+                SyscallSucceeds());
+    ASSERT_THAT(bind(last->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                     addr.addr_len),
+                SyscallSucceeds());
+
+    // Send a new message to the SO_REUSEADDR group. We use a new socket each
+    // time so that a new ephemeral port will be used each time. This ensures
+    // that we aren't doing REUSEPORT-like hash load blancing.
+    auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+    char send_buf[kMessageSize];
+    RandomizeBuffer(send_buf, sizeof(send_buf));
+    EXPECT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+                                   reinterpret_cast<sockaddr*>(&addr.addr),
+                                   addr.addr_len),
+                SyscallSucceedsWithValue(sizeof(send_buf)));
+
+    // Verify that the most recent socket got the message. We don't expect any
+    // of the other sockets to have received it, but we will check that later.
+    char recv_buf[sizeof(send_buf)] = {};
+    EXPECT_THAT(
+        RetryEINTR(recv)(last->get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT),
+        SyscallSucceedsWithValue(sizeof(send_buf)));
+    EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+  }
+
+  // Verify that no other messages were received.
+  for (auto& socket : sockets) {
+    char recv_buf[kMessageSize] = {};
+    EXPECT_THAT(RetryEINTR(recv)(socket->get(), recv_buf, sizeof(recv_buf),
+                                 MSG_DONTWAIT),
+                SyscallFailsWithErrno(EAGAIN));
+  }
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrThenReusePort) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, only with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReusePortThenReuseAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, only with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallFailsWithErrno(EADDRINUSE));
+}
+
+// Check that REUSEPORT takes precedence over REUSEADDR.
+TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
+  auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto receiver2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  ASSERT_THAT(setsockopt(receiver1->get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(receiver1->get(), SOL_SOCKET, SO_REUSEPORT,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(receiver1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(receiver1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind receiver2 to the same address as socket1, also with REUSEADDR and
+  // REUSEPORT.
+  ASSERT_THAT(setsockopt(receiver2->get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(receiver2->get(), SOL_SOCKET, SO_REUSEPORT,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(receiver2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  constexpr int kMessageSize = 10;
+
+  for (int i = 0; i < 100; ++i) {
+    // Send a new message to the REUSEADDR/REUSEPORT group. We use a new socket
+    // each time so that a new ephemerial port will be used each time. This
+    // ensures that we cycle through hashes.
+    auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+    char send_buf[kMessageSize] = {};
+    EXPECT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+                                   reinterpret_cast<sockaddr*>(&addr.addr),
+                                   addr.addr_len),
+                SyscallSucceedsWithValue(sizeof(send_buf)));
+  }
+
+  // Check that both receivers got messages. This checks that we are using load
+  // balancing (REUSEPORT) instead of the most recently bound socket
+  // (REUSEADDR).
+  char recv_buf[kMessageSize] = {};
+  EXPECT_THAT(RetryEINTR(recv)(receiver1->get(), recv_buf, sizeof(recv_buf),
+                               MSG_DONTWAIT),
+              SyscallSucceedsWithValue(kMessageSize));
+  EXPECT_THAT(RetryEINTR(recv)(receiver2->get(), recv_buf, sizeof(recv_buf),
+                               MSG_DONTWAIT),
+              SyscallSucceedsWithValue(kMessageSize));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.h b/test/syscalls/linux/socket_ipv4_udp_unbound.h
index 8e07bfbbf..f64c57645 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.h
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.h
@@ -20,8 +20,8 @@
 namespace gvisor {
 namespace testing {
 
-// Test fixture for tests that apply to pairs of IPv4 UDP sockets.
-using IPv4UDPUnboundSocketPairTest = SocketPairTest;
+// Test fixture for tests that apply to IPv4 UDP sockets.
+using IPv4UDPUnboundSocketTest = SimpleSocketTest;
 
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc
index cb0105471..f121c044d 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc
@@ -22,14 +22,11 @@
 namespace gvisor {
 namespace testing {
 
-std::vector<SocketPairKind> GetSocketPairs() {
-  return ApplyVec<SocketPairKind>(
-      IPv4UDPUnboundSocketPair,
-      AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK}));
-}
-
-INSTANTIATE_TEST_SUITE_P(IPv4UDPSockets, IPv4UDPUnboundSocketPairTest,
-                         ::testing::ValuesIn(GetSocketPairs()));
+INSTANTIATE_TEST_SUITE_P(
+    IPv4UDPSockets, IPv4UDPUnboundSocketTest,
+    ::testing::ValuesIn(ApplyVec<SocketKind>(IPv4UDPUnboundSocket,
+                                             AllBitwiseCombinations(List<int>{
+                                                 0, SOCK_NONBLOCK}))));
 
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h
index be38907c2..2dbb8bed3 100644
--- a/test/syscalls/linux/socket_test_util.h
+++ b/test/syscalls/linux/socket_test_util.h
@@ -114,6 +114,9 @@ class FDSocketPair : public SocketPair {
  public:
   FDSocketPair(int first_fd, int second_fd)
       : first_(first_fd), second_(second_fd) {}
+  FDSocketPair(std::unique_ptr<FileDescriptor> first_fd,
+               std::unique_ptr<FileDescriptor> second_fd)
+      : first_(first_fd->release()), second_(second_fd->release()) {}
 
   int first_fd() const override { return first_.get(); }
   int second_fd() const override { return second_.get(); }
-- 
cgit v1.2.3


From 5398530e45634b6f5ea4344d1a34b41cc8123457 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 12 Nov 2019 14:02:53 -0800
Subject: Discover on-link prefixes from Router Advertisements' Prefix
 Information options

This change allows the netstack to do NDP's Prefix Discovery as outlined by
RFC 4861 section 6.3.4. If configured to do so, when a new on-link prefix is
discovered, the routing table will be updated with a device route through
the nic the RA arrived at. Likewise, when such a prefix gets invalidated, the
device route will be removed.

Note, this change will not break existing uses of netstack as the default
configuration for the stack options is set in such a way that Prefix Discovery
will not be performed. See `stack.Options` and `stack.NDPConfigurations` for
more details.

This change reuses 1 option and introduces a new one that is required to take
advantage of Prefix Discovery, all available under NDPConfigurations:
- HandleRAs: Whether or not NDP RAs are processes
- DiscoverOnLinkPrefixes: Whether or not Prefix Discovery is performed (new)

Another note: for a NIC to process Prefix Information options (in Router
Advertisements), it must not be a router itself. Currently the netstack does not
have per-interface routing configuration; the routing/forwarding configuration
is controlled stack-wide. Therefore, if the stack is configured to enable
forwarding/routing, no router Advertisements (and by extension the Prefix
Information options) will be processed.

Tests: Unittest to make sure that Prefix Discovery and updates to the routing
table only occur if explicitly configured to do so. Unittest to make sure at
max stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are
remembered.
PiperOrigin-RevId: 280049278
---
 pkg/tcpip/header/ndp_options.go |  25 +-
 pkg/tcpip/stack/ndp.go          | 309 +++++++++++++++++++++--
 pkg/tcpip/stack/ndp_test.go     | 534 +++++++++++++++++++++++++++++++++++++++-
 pkg/tcpip/stack/nic.go          |   1 +
 4 files changed, 836 insertions(+), 33 deletions(-)

diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
index a2b9d7435..1ca6199ef 100644
--- a/pkg/tcpip/header/ndp_options.go
+++ b/pkg/tcpip/header/ndp_options.go
@@ -85,17 +85,22 @@ const (
 	// within an NDPPrefixInformation.
 	ndpPrefixInformationPrefixOffset = 14
 
-	// NDPPrefixInformationInfiniteLifetime is a value that represents
-	// infinity for the Valid and Preferred Lifetime fields in a NDP Prefix
-	// Information option. Its value is (2^32 - 1)s = 4294967295s
-	NDPPrefixInformationInfiniteLifetime = time.Second * 4294967295
-
 	// lengthByteUnits is the multiplier factor for the Length field of an
 	// NDP option. That is, the length field for NDP options is in units of
 	// 8 octets, as per RFC 4861 section 4.6.
 	lengthByteUnits = 8
 )
 
+var (
+	// NDPPrefixInformationInfiniteLifetime is a value that represents
+	// infinity for the Valid and Preferred Lifetime fields in a NDP Prefix
+	// Information option. Its value is (2^32 - 1)s = 4294967295s
+	//
+	// This is a variable instead of a constant so that tests can change
+	// this value to a smaller value. It should only be modified by tests.
+	NDPPrefixInformationInfiniteLifetime = time.Second * 4294967295
+)
+
 // NDPOptionIterator is an iterator of NDPOption.
 //
 // Note, between when an NDPOptionIterator is obtained and last used, no changes
@@ -461,3 +466,13 @@ func (o NDPPrefixInformation) PreferredLifetime() time.Duration {
 func (o NDPPrefixInformation) Prefix() tcpip.Address {
 	return tcpip.Address(o[ndpPrefixInformationPrefixOffset:][:IPv6AddressSize])
 }
+
+// Subnet returns the Prefix field and Prefix Length field represented in a
+// tcpip.Subnet.
+func (o NDPPrefixInformation) Subnet() tcpip.Subnet {
+	addrWithPrefix := tcpip.AddressWithPrefix{
+		Address:   o.Prefix(),
+		PrefixLen: int(o.PrefixLength()),
+	}
+	return addrWithPrefix.Subnet()
+}
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 8e49f7a56..8357dca77 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -46,11 +46,18 @@ const (
 
 	// defaultDiscoverDefaultRouters is the default configuration for
 	// whether or not to discover default routers from incoming Router
-	// Advertisements as a host.
+	// Advertisements, as a host.
 	//
 	// Default = true.
 	defaultDiscoverDefaultRouters = true
 
+	// defaultDiscoverOnLinkPrefixes is the default configuration for
+	// whether or not to discover on-link prefixes from incoming Router
+	// Advertisements' Prefix Information option, as a host.
+	//
+	// Default = true.
+	defaultDiscoverOnLinkPrefixes = true
+
 	// minimumRetransmitTimer is the minimum amount of time to wait between
 	// sending NDP Neighbor solicitation messages. Note, RFC 4861 does
 	// not impose a minimum Retransmit Timer, but we do here to make sure
@@ -72,6 +79,14 @@ const (
 	//
 	// Max = 10.
 	MaxDiscoveredDefaultRouters = 10
+
+	// MaxDiscoveredOnLinkPrefixes is the maximum number of discovered
+	// on-link prefixes. The stack should stop discovering new on-link
+	// prefixes after discovering MaxDiscoveredOnLinkPrefixes on-link
+	// prefixes.
+	//
+	// Max = 10.
+	MaxDiscoveredOnLinkPrefixes = 10
 )
 
 // NDPDispatcher is the interface integrators of netstack must implement to
@@ -106,6 +121,24 @@ type NDPDispatcher interface {
 	// This function is not permitted to block indefinitely. This function
 	// is also not permitted to call into the stack.
 	OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) []tcpip.Route
+
+	// OnOnLinkPrefixDiscovered will be called when a new on-link prefix is
+	// discovered. Implementations must return true along with a new valid
+	// route table if the newly discovered on-link prefix should be
+	// remembered. If an implementation returns false, the second return
+	// value will be ignored.
+	//
+	// This function is not permitted to block indefinitely. This function
+	// is also not permitted to call into the stack.
+	OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) (bool, []tcpip.Route)
+
+	// OnOnLinkPrefixInvalidated will be called when a discovered on-link
+	// prefix is invalidated. Implementers must return a new valid route
+	// table.
+	//
+	// This function is not permitted to block indefinitely. This function
+	// is also not permitted to call into the stack.
+	OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) []tcpip.Route
 }
 
 // NDPConfigurations is the NDP configurations for the netstack.
@@ -130,6 +163,11 @@ type NDPConfigurations struct {
 	// be discovered from Router Advertisements. This configuration is
 	// ignored if HandleRAs is false.
 	DiscoverDefaultRouters bool
+
+	// DiscoverOnLinkPrefixes determines whether or not on-link prefixes
+	// will be discovered from Router Advertisements' Prefix Information
+	// option. This configuration is ignored if HandleRAs is false.
+	DiscoverOnLinkPrefixes bool
 }
 
 // DefaultNDPConfigurations returns an NDPConfigurations populated with
@@ -140,6 +178,7 @@ func DefaultNDPConfigurations() NDPConfigurations {
 		RetransmitTimer:        defaultRetransmitTimer,
 		HandleRAs:              defaultHandleRAs,
 		DiscoverDefaultRouters: defaultDiscoverDefaultRouters,
+		DiscoverOnLinkPrefixes: defaultDiscoverOnLinkPrefixes,
 	}
 }
 
@@ -167,6 +206,10 @@ type ndpState struct {
 
 	// The default routers discovered through Router Advertisements.
 	defaultRouters map[tcpip.Address]defaultRouterState
+
+	// The on-link prefixes discovered through Router Advertisements' Prefix
+	// Information option.
+	onLinkPrefixes map[tcpip.Subnet]onLinkPrefixState
 }
 
 // dadState holds the Duplicate Address Detection timer and channel to signal
@@ -183,11 +226,11 @@ type dadState struct {
 }
 
 // defaultRouterState holds data associated with a default router discovered by
-// a Router Advertisement.
+// a Router Advertisement (RA).
 type defaultRouterState struct {
 	invalidationTimer *time.Timer
 
-	// Used to signal the timer not to invalidate the default router (R) in
+	// Used to inform the timer not to invalidate the default router (R) in
 	// a race condition (T1 is a goroutine that handles an RA from R and T2
 	// is the goroutine that handles R's invalidation timer firing):
 	//   T1: Receive a new RA from R
@@ -198,10 +241,33 @@ type defaultRouterState struct {
 	//   T2: Obtains NIC's lock & invalidates R immediately
 	//
 	// To resolve this, T1 will check to see if the timer already fired, and
-	// signal the timer using this channel to not invalidate R, so that once
-	// T2 obtains the lock, it will see that there is an event on this
-	// channel and do nothing further.
-	doNotInvalidateC chan struct{}
+	// inform the timer using doNotInvalidate to not invalidate R, so that
+	// once T2 obtains the lock, it will see that it is set to true and do
+	// nothing further.
+	doNotInvalidate *bool
+}
+
+// onLinkPrefixState holds data associated with an on-link prefix discovered by
+// a Router Advertisement's Prefix Information option (PI) when the NDP
+// configurations was configured to do so.
+type onLinkPrefixState struct {
+	invalidationTimer *time.Timer
+
+	// Used to signal the timer not to invalidate the on-link prefix (P) in
+	// a race condition (T1 is a goroutine that handles a PI for P and T2
+	// is the goroutine that handles P's invalidation timer firing):
+	//   T1: Receive a new PI for P
+	//   T1: Obtain the NIC's lock before processing the PI
+	//   T2: P's invalidation timer fires, and gets blocked on obtaining the
+	//       NIC's lock
+	//   T1: Refreshes/extends P's lifetime & releases NIC's lock
+	//   T2: Obtains NIC's lock & invalidates P immediately
+	//
+	// To resolve this, T1 will check to see if the timer already fired, and
+	// inform the timer using doNotInvalidate to not invalidate P, so that
+	// once T2 obtains the lock, it will see that it is set to true and do
+	// nothing further.
+	doNotInvalidate *bool
 }
 
 // startDuplicateAddressDetection performs Duplicate Address Detection.
@@ -440,14 +506,13 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 			if !timer.Stop() {
 				// If we reach this point, then we know the
 				// timer fired after we already took the NIC
-				// lock. Signal the timer so that once it
-				// obtains the lock, it doesn't actually
-				// invalidate the router as we just got a new
-				// RA that refreshes its lifetime to a non-zero
-				// value. See
-				// defaultRouterState.doNotInvalidateC for more
+				// lock. Inform the timer not to invalidate the
+				// router when it obtains the lock as we just
+				// got a new RA that refreshes its lifetime to a
+				// non-zero value. See
+				// defaultRouterState.doNotInvalidate for more
 				// details.
-				rtr.doNotInvalidateC <- struct{}{}
+				*rtr.doNotInvalidate = true
 			}
 
 			timer.Reset(rl)
@@ -459,8 +524,117 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 		}
 	}
 
-	// TODO(b/140948104): Do Prefix Discovery.
-	// TODO(b/141556115): Do Parameter Discovery.
+	// TODO(b/141556115): Do (RetransTimer, ReachableTime)) Parameter
+	//                    Discovery.
+
+	// We know the options is valid as far as wire format is concerned since
+	// we got the Router Advertisement, as documented by this fn. Given this
+	// we do not check the iterator for errors on calls to Next.
+	it, _ := ra.Options().Iter(false)
+	for opt, done, _ := it.Next(); !done; opt, done, _ = it.Next() {
+		switch opt.Type() {
+		case header.NDPPrefixInformationType:
+			if !ndp.configs.DiscoverOnLinkPrefixes {
+				continue
+			}
+
+			pi := opt.(header.NDPPrefixInformation)
+
+			prefix := pi.Subnet()
+
+			// Is the prefix a link-local?
+			if header.IsV6LinkLocalAddress(prefix.ID()) {
+				// ...Yes, skip as per RFC 4861 section 6.3.4.
+				continue
+			}
+
+			// Is the Prefix Length 0?
+			if prefix.Prefix() == 0 {
+				// ...Yes, skip as this is an invalid prefix
+				// as all IPv6 addresses cannot be on-link.
+				continue
+			}
+
+			if !pi.OnLinkFlag() {
+				// Not on-link so don't "discover" it as an
+				// on-link prefix.
+				continue
+			}
+
+			prefixState, ok := ndp.onLinkPrefixes[prefix]
+			vl := pi.ValidLifetime()
+			switch {
+			case !ok && vl == 0:
+				// Don't know about this prefix but has a zero
+				// valid lifetime, so just ignore.
+				continue
+
+			case !ok && vl != 0:
+				// This is a new on-link prefix we are
+				// discovering.
+				//
+				// Only remember it if we currently know about
+				// less than MaxDiscoveredOnLinkPrefixes on-link
+				// prefixes.
+				if len(ndp.onLinkPrefixes) < MaxDiscoveredOnLinkPrefixes {
+					ndp.rememberOnLinkPrefix(prefix, vl)
+				}
+				continue
+
+			case ok && vl == 0:
+				// We know about the on-link prefix, but it is
+				// no longer to be considered on-link, so
+				// invalidate it.
+				ndp.invalidateOnLinkPrefix(prefix)
+				continue
+			}
+
+			// This is an already discovered on-link prefix with a
+			// new non-zero valid lifetime.
+			// Update the invalidation timer.
+			timer := prefixState.invalidationTimer
+
+			if timer == nil && vl >= header.NDPPrefixInformationInfiniteLifetime {
+				// Had infinite valid lifetime before and
+				// continues to have an invalid lifetime. Do
+				// nothing further.
+				continue
+			}
+
+			if timer != nil && !timer.Stop() {
+				// If we reach this point, then we know the
+				// timer already fired after we took the NIC
+				// lock. Inform the timer to not invalidate
+				// the prefix once it obtains the lock as we
+				// just got a new PI that refeshes its lifetime
+				// to a non-zero value. See
+				// onLinkPrefixState.doNotInvalidate for more
+				// details.
+				*prefixState.doNotInvalidate = true
+			}
+
+			if vl >= header.NDPPrefixInformationInfiniteLifetime {
+				// Prefix is now valid forever so we don't need
+				// an invalidation timer.
+				prefixState.invalidationTimer = nil
+				ndp.onLinkPrefixes[prefix] = prefixState
+				continue
+			}
+
+			if timer != nil {
+				// We already have a timer so just reset it to
+				// expire after the new valid lifetime.
+				timer.Reset(vl)
+				continue
+			}
+
+			// We do not have a timer so just create a new one.
+			prefixState.invalidationTimer = ndp.prefixInvalidationCallback(prefix, vl, prefixState.doNotInvalidate)
+			ndp.onLinkPrefixes[prefix] = prefixState
+		}
+
+		// TODO(b/141556115): Do (MTU) Parameter Discovery.
+	}
 }
 
 // invalidateDefaultRouter invalidates a discovered default router.
@@ -477,8 +651,8 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
 
 	rtr.invalidationTimer.Stop()
 	rtr.invalidationTimer = nil
-	close(rtr.doNotInvalidateC)
-	rtr.doNotInvalidateC = nil
+	*rtr.doNotInvalidate = true
+	rtr.doNotInvalidate = nil
 
 	delete(ndp.defaultRouters, ip)
 
@@ -508,9 +682,9 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
 	}
 
 	// Used to signal the timer not to invalidate the default router (R) in
-	// a race condition. See defaultRouterState.doNotInvalidateC for more
+	// a race condition. See defaultRouterState.doNotInvalidate for more
 	// details.
-	doNotInvalidateC := make(chan struct{}, 1)
+	var doNotInvalidate bool
 
 	ndp.defaultRouters[ip] = defaultRouterState{
 		invalidationTimer: time.AfterFunc(rl, func() {
@@ -519,16 +693,103 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
 			ndp.nic.mu.Lock()
 			defer ndp.nic.mu.Unlock()
 
-			select {
-			case <-doNotInvalidateC:
+			if doNotInvalidate {
+				doNotInvalidate = false
 				return
-			default:
 			}
 
 			ndp.invalidateDefaultRouter(ip)
 		}),
-		doNotInvalidateC: doNotInvalidateC,
+		doNotInvalidate: &doNotInvalidate,
+	}
+
+	ndp.nic.stack.routeTable = routeTable
+}
+
+// rememberOnLinkPrefix remembers a newly discovered on-link prefix with IPv6
+// address with prefix prefix with lifetime l.
+//
+// The prefix identified by prefix MUST NOT already be known.
+//
+// The NIC that ndp belongs to and its associated stack MUST be locked.
+func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) {
+	if ndp.nic.stack.ndpDisp == nil {
+		return
+	}
+
+	// Inform the integrator when we discovered an on-link prefix.
+	remember, routeTable := ndp.nic.stack.ndpDisp.OnOnLinkPrefixDiscovered(ndp.nic.ID(), prefix)
+	if !remember {
+		// Informed by the integrator to not remember the prefix, do
+		// nothing further.
+		return
+	}
+
+	// Used to signal the timer not to invalidate the on-link prefix (P) in
+	// a race condition. See onLinkPrefixState.doNotInvalidate for more
+	// details.
+	var doNotInvalidate bool
+	var timer *time.Timer
+
+	// Only create a timer if the lifetime is not infinite.
+	if l < header.NDPPrefixInformationInfiniteLifetime {
+		timer = ndp.prefixInvalidationCallback(prefix, l, &doNotInvalidate)
+	}
+
+	ndp.onLinkPrefixes[prefix] = onLinkPrefixState{
+		invalidationTimer: timer,
+		doNotInvalidate:   &doNotInvalidate,
 	}
 
 	ndp.nic.stack.routeTable = routeTable
 }
+
+// invalidateOnLinkPrefix invalidates a discovered on-link prefix.
+//
+// The NIC that ndp belongs to and its associated stack MUST be locked.
+func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
+	s, ok := ndp.onLinkPrefixes[prefix]
+
+	// Is the on-link prefix still discovered?
+	if !ok {
+		// ...Nope, do nothing further.
+		return
+	}
+
+	if s.invalidationTimer != nil {
+		s.invalidationTimer.Stop()
+		s.invalidationTimer = nil
+		*s.doNotInvalidate = true
+	}
+
+	s.doNotInvalidate = nil
+
+	delete(ndp.onLinkPrefixes, prefix)
+
+	// Let the integrator know a discovered on-link prefix is invalidated.
+	if ndp.nic.stack.ndpDisp != nil {
+		ndp.nic.stack.routeTable = ndp.nic.stack.ndpDisp.OnOnLinkPrefixInvalidated(ndp.nic.ID(), prefix)
+	}
+}
+
+// prefixInvalidationCallback returns a new on-link prefix invalidation timer
+// for prefix that fires after vl.
+//
+// doNotInvalidate is used to signal the timer when it fires at the same time
+// that a prefix's valid lifetime gets refreshed. See
+// onLinkPrefixState.doNotInvalidate for more details.
+func (ndp *ndpState) prefixInvalidationCallback(prefix tcpip.Subnet, vl time.Duration, doNotInvalidate *bool) *time.Timer {
+	return time.AfterFunc(vl, func() {
+		ndp.nic.stack.mu.Lock()
+		defer ndp.nic.stack.mu.Unlock()
+		ndp.nic.mu.Lock()
+		defer ndp.nic.mu.Unlock()
+
+		if *doNotInvalidate {
+			*doNotInvalidate = false
+			return
+		}
+
+		ndp.invalidateOnLinkPrefix(prefix)
+	})
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 50ce1bbfa..494244368 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -96,6 +96,13 @@ type ndpRouterEvent struct {
 	discovered bool
 }
 
+type ndpPrefixEvent struct {
+	nicID  tcpip.NICID
+	prefix tcpip.Subnet
+	// true if prefix was discovered, false if invalidated.
+	discovered bool
+}
+
 var _ stack.NDPDispatcher = (*ndpDispatcher)(nil)
 
 // ndpDispatcher implements NDPDispatcher so tests can know when various NDP
@@ -104,6 +111,8 @@ type ndpDispatcher struct {
 	dadC           chan ndpDADEvent
 	routerC        chan ndpRouterEvent
 	rememberRouter bool
+	prefixC        chan ndpPrefixEvent
+	rememberPrefix bool
 	routeTable     []tcpip.Route
 }
 
@@ -169,6 +178,54 @@ func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip
 	return rt
 }
 
+// Implements stack.NDPDispatcher.OnOnLinkPrefixDiscovered.
+func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) (bool, []tcpip.Route) {
+	if n.prefixC != nil {
+		n.prefixC <- ndpPrefixEvent{
+			nicID,
+			prefix,
+			true,
+		}
+	}
+
+	if !n.rememberPrefix {
+		return false, nil
+	}
+
+	rt := append([]tcpip.Route(nil), n.routeTable...)
+	rt = append(rt, tcpip.Route{
+		Destination: prefix,
+		NIC:         nicID,
+	})
+	n.routeTable = rt
+	return true, rt
+}
+
+// Implements stack.NDPDispatcher.OnOnLinkPrefixInvalidated.
+func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) []tcpip.Route {
+	if n.prefixC != nil {
+		n.prefixC <- ndpPrefixEvent{
+			nicID,
+			prefix,
+			false,
+		}
+	}
+
+	rt := make([]tcpip.Route, 0)
+	exclude := tcpip.Route{
+		Destination: prefix,
+		NIC:         nicID,
+	}
+
+	for _, r := range n.routeTable {
+		if r != exclude {
+			rt = append(rt, r)
+		}
+	}
+	n.routeTable = rt
+	return rt
+}
+
 // TestDADResolve tests that an address successfully resolves after performing
 // DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
 // Included in the subtests is a test to make sure that an invalid
@@ -682,16 +739,19 @@ func TestSetNDPConfigurations(t *testing.T) {
 	}
 }
 
-// raBuf returns a valid NDP Router Advertisement.
+// raBufWithOpts returns a valid NDP Router Advertisement with options.
 //
-// Note, raBuf does not populate any of the RA fields other than the
+// Note, raBufWithOpts does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer {
-	icmpSize := header.ICMPv6HeaderSize + header.NDPRAMinimumSize
+func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) tcpip.PacketBuffer {
+	icmpSize := header.ICMPv6HeaderSize + header.NDPRAMinimumSize + int(optSer.Length())
 	hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
 	pkt := header.ICMPv6(hdr.Prepend(icmpSize))
 	pkt.SetType(header.ICMPv6RouterAdvert)
 	pkt.SetCode(0)
+	ra := header.NDPRouterAdvert(pkt.NDPPayload())
+	opts := ra.Options()
+	opts.Serialize(optSer)
 	// Populate the Router Lifetime.
 	binary.BigEndian.PutUint16(pkt.NDPPayload()[2:], rl)
 	pkt.SetChecksum(header.ICMPv6Checksum(pkt, ip, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
@@ -708,6 +768,35 @@ func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer {
 	return tcpip.PacketBuffer{Data: hdr.View().ToVectorisedView()}
 }
 
+// raBuf returns a valid NDP Router Advertisement.
+//
+// Note, raBuf does not populate any of the RA fields other than the
+// Router Lifetime.
+func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer {
+	return raBufWithOpts(ip, rl, header.NDPOptionsSerializer{})
+}
+
+// raBufWithPI returns a valid NDP Router Advertisement with a single Prefix
+// Information option.
+//
+// Note, raBufWithPI does not populate any of the RA fields other than the
+// Router Lifetime.
+func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink bool, vl uint32) tcpip.PacketBuffer {
+	flags := uint8(0)
+	if onLink {
+		flags |= 128
+	}
+
+	buf := [30]byte{}
+	buf[0] = uint8(prefix.PrefixLen)
+	buf[1] = flags
+	binary.BigEndian.PutUint32(buf[2:], vl)
+	copy(buf[14:], prefix.Address)
+	return raBufWithOpts(ip, rl, header.NDPOptionsSerializer{
+		header.NDPPrefixInformation(buf[:]),
+	})
+}
+
 // TestNoRouterDiscovery tests that router discovery will not be performed if
 // configured not to.
 func TestNoRouterDiscovery(t *testing.T) {
@@ -1011,3 +1100,440 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
 		t.Fatalf("got GetRouteTable = %v, want = %v", got, expectedRt)
 	}
 }
+
+// TestNoPrefixDiscovery tests that prefix discovery will not be performed if
+// configured not to.
+func TestNoPrefixDiscovery(t *testing.T) {
+	prefix := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
+		PrefixLen: 64,
+	}
+
+	// Being configured to discover prefixes means handle and
+	// discover are set to true and forwarding is set to false.
+	// This tests all possible combinations of the configurations,
+	// except for the configuration where handle = true, discover =
+	// true and forwarding = false (the required configuration to do
+	// prefix discovery) - that will done in other tests.
+	for i := 0; i < 7; i++ {
+		handle := i&1 != 0
+		discover := i&2 != 0
+		forwarding := i&4 == 0
+
+		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverOnLinkPrefixes(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				prefixC: make(chan ndpPrefixEvent, 10),
+			}
+			e := channel.New(10, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:              handle,
+					DiscoverOnLinkPrefixes: discover,
+				},
+				NDPDisp: &ndpDisp,
+			})
+			s.SetForwarding(forwarding)
+
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(1) = %s", err)
+			}
+
+			// Rx an RA with prefix with non-zero lifetime.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, 10))
+
+			select {
+			case <-ndpDisp.prefixC:
+				t.Fatal("unexpectedly discovered a prefix when configured not to")
+			case <-time.After(defaultTimeout):
+			}
+		})
+	}
+}
+
+// TestPrefixDiscoveryDispatcherNoRemember tests that the stack does not
+// remember a discovered on-link prefix when the dispatcher asks it not to.
+func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
+	prefix := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
+		PrefixLen: 64,
+	}
+	subnet := prefix.Subnet()
+
+	ndpDisp := ndpDispatcher{
+		prefixC: make(chan ndpPrefixEvent, 10),
+	}
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverDefaultRouters: false,
+			DiscoverOnLinkPrefixes: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	routeTable := []tcpip.Route{
+		{
+			header.IPv6EmptySubnet,
+			llAddr3,
+			1,
+		},
+	}
+	s.SetRouteTable(routeTable)
+
+	// Rx an RA with prefix with a short lifetime.
+	const lifetime = 1
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, lifetime))
+	select {
+	case r := <-ndpDisp.prefixC:
+		if r.nicID != 1 {
+			t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+		}
+		if r.prefix != subnet {
+			t.Fatalf("got r.prefix = %s, want = %s", r.prefix, subnet)
+		}
+		if !r.discovered {
+			t.Fatal("got r.discovered = false, want = true")
+		}
+	case <-time.After(defaultTimeout):
+		t.Fatal("timeout waiting for prefix discovery event")
+	}
+
+	// Original route table should not have been modified.
+	if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable)
+	}
+
+	// Wait for the normal invalidation time plus some buffer to
+	// make sure we do not actually receive any invalidation events as
+	// we should not have remembered the prefix in the first place.
+	select {
+	case <-ndpDisp.prefixC:
+		t.Fatal("should not have received any prefix events")
+	case <-time.After(lifetime*time.Second + defaultTimeout):
+	}
+
+	// Original route table should not have been modified.
+	if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable)
+	}
+}
+
+func TestPrefixDiscovery(t *testing.T) {
+	prefix1 := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
+		PrefixLen: 64,
+	}
+	prefix2 := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x09\x00\x00\x00\x00\x00\x00\x00\x00"),
+		PrefixLen: 64,
+	}
+	prefix3 := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x09\x0a\x00\x00\x00\x00\x00\x00\x00"),
+		PrefixLen: 72,
+	}
+	subnet1 := prefix1.Subnet()
+	subnet2 := prefix2.Subnet()
+	subnet3 := prefix3.Subnet()
+
+	ndpDisp := ndpDispatcher{
+		prefixC:        make(chan ndpPrefixEvent, 10),
+		rememberPrefix: true,
+	}
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverOnLinkPrefixes: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	waitForEvent := func(subnet tcpip.Subnet, discovered bool, timeout time.Duration) {
+		t.Helper()
+
+		select {
+		case r := <-ndpDisp.prefixC:
+			if r.nicID != 1 {
+				t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+			}
+			if r.prefix != subnet {
+				t.Fatalf("got r.prefix = %s, want = %s", r.prefix, subnet)
+			}
+			if r.discovered != discovered {
+				t.Fatalf("got r.discovered = %t, want = %t", r.discovered, discovered)
+			}
+		case <-time.After(timeout):
+			t.Fatal("timeout waiting for prefix discovery event")
+		}
+	}
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with zero valid lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 0))
+	select {
+	case <-ndpDisp.prefixC:
+		t.Fatal("unexpectedly discovered a prefix with 0 lifetime")
+	case <-time.After(defaultTimeout):
+	}
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with non-zero lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 100))
+	waitForEvent(subnet1, true, defaultTimeout)
+
+	// Should have added a device route for subnet1 through the nic.
+	if got, want := s.GetRouteTable(), []tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	}
+
+	// Receive an RA with prefix2 in a PI.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, 100))
+	waitForEvent(subnet2, true, defaultTimeout)
+
+	// Should have added a device route for subnet2 through the nic.
+	if got, want := s.GetRouteTable(), []tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}, {subnet2, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	}
+
+	// Receive an RA with prefix3 in a PI.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, 100))
+	waitForEvent(subnet3, true, defaultTimeout)
+
+	// Should have added a device route for subnet3 through the nic.
+	if got, want := s.GetRouteTable(), []tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}, {subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	}
+
+	// Receive an RA with prefix1 in a PI with lifetime = 0.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 0))
+	waitForEvent(subnet1, false, defaultTimeout)
+
+	// Should have removed the device route for subnet1 through the nic.
+	if got, want := s.GetRouteTable(), []tcpip.Route{{subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	}
+
+	// Receive an RA with prefix2 in a PI with lesser lifetime.
+	lifetime := uint32(2)
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, lifetime))
+	select {
+	case <-ndpDisp.prefixC:
+		t.Fatal("unexpectedly received prefix event when updating lifetime")
+	case <-time.After(defaultTimeout):
+	}
+
+	// Should not have updated route table.
+	if got, want := s.GetRouteTable(), []tcpip.Route{{subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	}
+
+	// Wait for prefix2's most recent invalidation timer plus some buffer to
+	// expire.
+	waitForEvent(subnet2, false, time.Duration(lifetime)*time.Second+defaultTimeout)
+
+	// Should have removed the device route for subnet2 through the nic.
+	if got, want := s.GetRouteTable(), []tcpip.Route{{subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	}
+
+	// Receive RA to invalidate prefix3.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, 0))
+	waitForEvent(subnet3, false, defaultTimeout)
+
+	// Should not have any routes.
+	if got := len(s.GetRouteTable()); got != 0 {
+		t.Fatalf("got len(s.GetRouteTable()) = %d, want = 0", got)
+	}
+}
+
+func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
+	// Update the infinite lifetime value to a smaller value so we can test
+	// that when we receive a PI with such a lifetime value, we do not
+	// invalidate the prefix.
+	const testInfiniteLifetimeSeconds = 2
+	const testInfiniteLifetime = testInfiniteLifetimeSeconds * time.Second
+	saved := header.NDPPrefixInformationInfiniteLifetime
+	header.NDPPrefixInformationInfiniteLifetime = testInfiniteLifetime
+	defer func() {
+		header.NDPPrefixInformationInfiniteLifetime = saved
+	}()
+
+	prefix := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
+		PrefixLen: 64,
+	}
+	subnet := prefix.Subnet()
+
+	ndpDisp := ndpDispatcher{
+		prefixC:        make(chan ndpPrefixEvent, 10),
+		rememberPrefix: true,
+	}
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverOnLinkPrefixes: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	waitForEvent := func(discovered bool, timeout time.Duration) {
+		t.Helper()
+
+		select {
+		case r := <-ndpDisp.prefixC:
+			if r.nicID != 1 {
+				t.Errorf("got r.nicID = %d, want = 1", r.nicID)
+			}
+			if r.prefix != subnet {
+				t.Errorf("got r.prefix = %s, want = %s", r.prefix, subnet)
+			}
+			if r.discovered != discovered {
+				t.Errorf("got r.discovered = %t, want = %t", r.discovered, discovered)
+			}
+		case <-time.After(timeout):
+			t.Fatal("timeout waiting for prefix discovery event")
+		}
+	}
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// Receive an RA with prefix in an NDP Prefix Information option (PI)
+	// with infinite valid lifetime which should not get invalidated.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds))
+	waitForEvent(true, defaultTimeout)
+	select {
+	case <-ndpDisp.prefixC:
+		t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
+	case <-time.After(testInfiniteLifetime + defaultTimeout):
+	}
+
+	// Receive an RA with finite lifetime.
+	// The prefix should get invalidated after 1s.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds-1))
+	waitForEvent(false, testInfiniteLifetime)
+
+	// Receive an RA with finite lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds-1))
+	waitForEvent(true, defaultTimeout)
+
+	// Receive an RA with prefix with an infinite lifetime.
+	// The prefix should not be invalidated.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds))
+	select {
+	case <-ndpDisp.prefixC:
+		t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
+	case <-time.After(testInfiniteLifetime + defaultTimeout):
+	}
+
+	// Receive an RA with a prefix with a lifetime value greater than the
+	// set infinite lifetime value.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds+1))
+	select {
+	case <-ndpDisp.prefixC:
+		t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
+	case <-time.After((testInfiniteLifetimeSeconds+1)*time.Second + defaultTimeout):
+	}
+
+	// Receive an RA with 0 lifetime.
+	// The prefix should get invalidated.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, 0))
+	waitForEvent(false, defaultTimeout)
+}
+
+// TestPrefixDiscoveryMaxRouters tests that only
+// stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are remembered.
+func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
+	ndpDisp := ndpDispatcher{
+		prefixC:        make(chan ndpPrefixEvent, stack.MaxDiscoveredOnLinkPrefixes+3),
+		rememberPrefix: true,
+	}
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverDefaultRouters: false,
+			DiscoverOnLinkPrefixes: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	optSer := make(header.NDPOptionsSerializer, stack.MaxDiscoveredOnLinkPrefixes+2)
+	expectedRt := [stack.MaxDiscoveredOnLinkPrefixes]tcpip.Route{}
+	prefixes := [stack.MaxDiscoveredOnLinkPrefixes + 2]tcpip.Subnet{}
+
+	// Receive an RA with 2 more than the max number of discovered on-link
+	// prefixes.
+	for i := 0; i < stack.MaxDiscoveredOnLinkPrefixes+2; i++ {
+		prefixAddr := [16]byte{1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0}
+		prefixAddr[7] = byte(i)
+		prefix := tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(prefixAddr[:]),
+			PrefixLen: 64,
+		}
+		prefixes[i] = prefix.Subnet()
+		buf := [30]byte{}
+		buf[0] = uint8(prefix.PrefixLen)
+		buf[1] = 128
+		binary.BigEndian.PutUint32(buf[2:], 10)
+		copy(buf[14:], prefix.Address)
+
+		optSer[i] = header.NDPPrefixInformation(buf[:])
+
+		if i < stack.MaxDiscoveredOnLinkPrefixes {
+			expectedRt[i] = tcpip.Route{prefixes[i], tcpip.Address([]byte(nil)), 1}
+		}
+	}
+
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, optSer))
+	for i := 0; i < stack.MaxDiscoveredOnLinkPrefixes+2; i++ {
+		if i < stack.MaxDiscoveredOnLinkPrefixes {
+			select {
+			case r := <-ndpDisp.prefixC:
+				if r.nicID != 1 {
+					t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+				}
+				if r.prefix != prefixes[i] {
+					t.Fatalf("got r.prefix = %s, want = %s", r.prefix, prefixes[i])
+				}
+				if !r.discovered {
+					t.Fatal("got r.discovered = false, want = true")
+				}
+			case <-time.After(defaultTimeout):
+				t.Fatal("timeout waiting for prefix discovery event")
+			}
+		} else {
+			select {
+			case <-ndpDisp.prefixC:
+				t.Fatal("should not have discovered a new prefix after we already discovered the max number of prefixes")
+			case <-time.After(defaultTimeout):
+			}
+		}
+	}
+
+	// Should only have device routes for the first
+	// stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes.
+	if got := s.GetRouteTable(); !cmp.Equal(got, expectedRt[:]) {
+		t.Fatalf("got GetRouteTable = %v, want = %v", got, expectedRt)
+	}
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 28a28ae6e..9ed9e1e7c 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -118,6 +118,7 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback
 			configs:        stack.ndpConfigs,
 			dad:            make(map[tcpip.Address]dadState),
 			defaultRouters: make(map[tcpip.Address]defaultRouterState),
+			onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState),
 		},
 	}
 	nic.ndp.nic = nic
-- 
cgit v1.2.3


From 3f51bef8cdad5f0555e7c6b05f777769d23aaf77 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 12 Nov 2019 15:48:34 -0800
Subject: Do not handle TCP packets that include a non-unicast IP address

This change drops TCP packets with a non-unicast IP address as the source or
destination address as TCP is meant for communication between two endpoints.

Test: Make sure that if the source or destination address contains a non-unicast
address, no TCP packet is sent in response and the packet is dropped.
PiperOrigin-RevId: 280073731
---
 pkg/tcpip/stack/transport_demuxer.go               |  45 ++++-
 pkg/tcpip/transport/tcp/tcp_test.go                | 204 +++++++++++++++++++++
 pkg/tcpip/transport/tcp/testing/context/context.go |  34 +++-
 3 files changed, 269 insertions(+), 14 deletions(-)

diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index cb805522b..67c21be42 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -389,8 +389,8 @@ var loopbackSubnet = func() tcpip.Subnet {
 }()
 
 // deliverPacket attempts to find one or more matching transport endpoints, and
-// then, if matches are found, delivers the packet to them. Returns true if it
-// found one or more endpoints, false otherwise.
+// then, if matches are found, delivers the packet to them. Returns true if
+// the packet no longer needs to be handled.
 func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
 	if !ok {
@@ -400,13 +400,38 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 	eps.mu.RLock()
 
 	// Determine which transport endpoint or endpoints to deliver this packet to.
-	// If the packet is a broadcast or multicast, then find all matching
-	// transport endpoints.
+	// If the packet is a UDP broadcast or multicast, then find all matching
+	// transport endpoints. If the packet is a TCP packet with a non-unicast
+	// source or destination address, then do nothing further and instruct
+	// the caller to do the same.
 	var destEps []*endpointsByNic
-	if protocol == header.UDPProtocolNumber && isMulticastOrBroadcast(id.LocalAddress) {
-		destEps = d.findAllEndpointsLocked(eps, id)
-	} else if ep := d.findEndpointLocked(eps, id); ep != nil {
-		destEps = append(destEps, ep)
+	switch protocol {
+	case header.UDPProtocolNumber:
+		if isMulticastOrBroadcast(id.LocalAddress) {
+			destEps = d.findAllEndpointsLocked(eps, id)
+			break
+		}
+
+		if ep := d.findEndpointLocked(eps, id); ep != nil {
+			destEps = append(destEps, ep)
+		}
+
+	case header.TCPProtocolNumber:
+		if !(isUnicast(r.LocalAddress) && isUnicast(r.RemoteAddress)) {
+			// TCP can only be used to communicate between a single
+			// source and a single destination; the addresses must
+			// be unicast.
+			eps.mu.RUnlock()
+			r.Stats().TCP.InvalidSegmentsReceived.Increment()
+			return true
+		}
+
+		fallthrough
+
+	default:
+		if ep := d.findEndpointLocked(eps, id); ep != nil {
+			destEps = append(destEps, ep)
+		}
 	}
 
 	eps.mu.RUnlock()
@@ -587,3 +612,7 @@ func (d *transportDemuxer) unregisterRawEndpoint(netProto tcpip.NetworkProtocolN
 func isMulticastOrBroadcast(addr tcpip.Address) bool {
 	return addr == header.IPv4Broadcast || header.IsV4MulticastAddress(addr) || header.IsV6MulticastAddress(addr)
 }
+
+func isUnicast(addr tcpip.Address) bool {
+	return addr != header.IPv4Any && addr != header.IPv6Any && !isMulticastOrBroadcast(addr)
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 84579ce52..b443fe9dc 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -4242,6 +4242,210 @@ func TestListenBacklogFull(t *testing.T) {
 	}
 }
 
+// TestListenNoAcceptMulticastBroadcastV4 makes sure that TCP segments with a
+// non unicast IPv4 address are not accepted.
+func TestListenNoAcceptNonUnicastV4(t *testing.T) {
+	multicastAddr := tcpip.Address("\xe0\x00\x01\x02")
+	otherMulticastAddr := tcpip.Address("\xe0\x00\x01\x03")
+
+	tests := []struct {
+		name    string
+		srcAddr tcpip.Address
+		dstAddr tcpip.Address
+	}{
+		{
+			"SourceUnspecified",
+			header.IPv4Any,
+			context.StackAddr,
+		},
+		{
+			"SourceBroadcast",
+			header.IPv4Broadcast,
+			context.StackAddr,
+		},
+		{
+			"SourceOurMulticast",
+			multicastAddr,
+			context.StackAddr,
+		},
+		{
+			"SourceOtherMulticast",
+			otherMulticastAddr,
+			context.StackAddr,
+		},
+		{
+			"DestUnspecified",
+			context.TestAddr,
+			header.IPv4Any,
+		},
+		{
+			"DestBroadcast",
+			context.TestAddr,
+			header.IPv4Broadcast,
+		},
+		{
+			"DestOurMulticast",
+			context.TestAddr,
+			multicastAddr,
+		},
+		{
+			"DestOtherMulticast",
+			context.TestAddr,
+			otherMulticastAddr,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			t.Parallel()
+
+			c := context.New(t, defaultMTU)
+			defer c.Cleanup()
+
+			c.Create(-1)
+
+			if err := c.Stack().JoinGroup(header.IPv4ProtocolNumber, 1, multicastAddr); err != nil {
+				t.Fatalf("JoinGroup failed: %s", err)
+			}
+
+			if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+				t.Fatalf("Bind failed: %s", err)
+			}
+
+			if err := c.EP.Listen(1); err != nil {
+				t.Fatalf("Listen failed: %s", err)
+			}
+
+			irs := seqnum.Value(789)
+			c.SendPacketWithAddrs(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: context.StackPort,
+				Flags:   header.TCPFlagSyn,
+				SeqNum:  irs,
+				RcvWnd:  30000,
+			}, test.srcAddr, test.dstAddr)
+			c.CheckNoPacket("Should not have received a response")
+
+			// Handle normal packet.
+			c.SendPacketWithAddrs(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: context.StackPort,
+				Flags:   header.TCPFlagSyn,
+				SeqNum:  irs,
+				RcvWnd:  30000,
+			}, context.TestAddr, context.StackAddr)
+			checker.IPv4(t, c.GetPacket(),
+				checker.TCP(
+					checker.SrcPort(context.StackPort),
+					checker.DstPort(context.TestPort),
+					checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
+					checker.AckNum(uint32(irs)+1)))
+		})
+	}
+}
+
+// TestListenNoAcceptMulticastBroadcastV6 makes sure that TCP segments with a
+// non unicast IPv6 address are not accepted.
+func TestListenNoAcceptNonUnicastV6(t *testing.T) {
+	multicastAddr := tcpip.Address("\xff\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x01")
+	otherMulticastAddr := tcpip.Address("\xff\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02")
+
+	tests := []struct {
+		name    string
+		srcAddr tcpip.Address
+		dstAddr tcpip.Address
+	}{
+		{
+			"SourceUnspecified",
+			header.IPv6Any,
+			context.StackV6Addr,
+		},
+		{
+			"SourceAllNodes",
+			header.IPv6AllNodesMulticastAddress,
+			context.StackV6Addr,
+		},
+		{
+			"SourceOurMulticast",
+			multicastAddr,
+			context.StackV6Addr,
+		},
+		{
+			"SourceOtherMulticast",
+			otherMulticastAddr,
+			context.StackV6Addr,
+		},
+		{
+			"DestUnspecified",
+			context.TestV6Addr,
+			header.IPv6Any,
+		},
+		{
+			"DestAllNodes",
+			context.TestV6Addr,
+			header.IPv6AllNodesMulticastAddress,
+		},
+		{
+			"DestOurMulticast",
+			context.TestV6Addr,
+			multicastAddr,
+		},
+		{
+			"DestOtherMulticast",
+			context.TestV6Addr,
+			otherMulticastAddr,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			t.Parallel()
+
+			c := context.New(t, defaultMTU)
+			defer c.Cleanup()
+
+			c.CreateV6Endpoint(true)
+
+			if err := c.Stack().JoinGroup(header.IPv6ProtocolNumber, 1, multicastAddr); err != nil {
+				t.Fatalf("JoinGroup failed: %s", err)
+			}
+
+			if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+				t.Fatalf("Bind failed: %s", err)
+			}
+
+			if err := c.EP.Listen(1); err != nil {
+				t.Fatalf("Listen failed: %s", err)
+			}
+
+			irs := seqnum.Value(789)
+			c.SendV6PacketWithAddrs(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: context.StackPort,
+				Flags:   header.TCPFlagSyn,
+				SeqNum:  irs,
+				RcvWnd:  30000,
+			}, test.srcAddr, test.dstAddr)
+			c.CheckNoPacket("Should not have received a response")
+
+			// Handle normal packet.
+			c.SendV6PacketWithAddrs(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: context.StackPort,
+				Flags:   header.TCPFlagSyn,
+				SeqNum:  irs,
+				RcvWnd:  30000,
+			}, context.TestV6Addr, context.StackV6Addr)
+			checker.IPv6(t, c.GetV6Packet(),
+				checker.TCP(
+					checker.SrcPort(context.StackPort),
+					checker.DstPort(context.TestPort),
+					checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
+					checker.AckNum(uint32(irs)+1)))
+		})
+	}
+}
+
 func TestListenSynRcvdQueueFull(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 4854e719d..0a733fa94 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -309,6 +309,12 @@ func (c *Context) SendICMPPacket(typ header.ICMPv4Type, code uint8, p1, p2 []byt
 
 // BuildSegment builds a TCP segment based on the given Headers and payload.
 func (c *Context) BuildSegment(payload []byte, h *Headers) buffer.VectorisedView {
+	return c.BuildSegmentWithAddrs(payload, h, TestAddr, StackAddr)
+}
+
+// BuildSegmentWithAddrs builds a TCP segment based on the given Headers,
+// payload and source and destination IPv4 addresses.
+func (c *Context) BuildSegmentWithAddrs(payload []byte, h *Headers, src, dst tcpip.Address) buffer.VectorisedView {
 	// Allocate a buffer for data and headers.
 	buf := buffer.NewView(header.TCPMinimumSize + header.IPv4MinimumSize + len(h.TCPOpts) + len(payload))
 	copy(buf[len(buf)-len(payload):], payload)
@@ -321,8 +327,8 @@ func (c *Context) BuildSegment(payload []byte, h *Headers) buffer.VectorisedView
 		TotalLength: uint16(len(buf)),
 		TTL:         65,
 		Protocol:    uint8(tcp.ProtocolNumber),
-		SrcAddr:     TestAddr,
-		DstAddr:     StackAddr,
+		SrcAddr:     src,
+		DstAddr:     dst,
 	})
 	ip.SetChecksum(^ip.CalculateChecksum())
 
@@ -339,7 +345,7 @@ func (c *Context) BuildSegment(payload []byte, h *Headers) buffer.VectorisedView
 	})
 
 	// Calculate the TCP pseudo-header checksum.
-	xsum := header.PseudoHeaderChecksum(tcp.ProtocolNumber, TestAddr, StackAddr, uint16(len(t)))
+	xsum := header.PseudoHeaderChecksum(tcp.ProtocolNumber, src, dst, uint16(len(t)))
 
 	// Calculate the TCP checksum and set it.
 	xsum = header.Checksum(payload, xsum)
@@ -365,6 +371,15 @@ func (c *Context) SendPacket(payload []byte, h *Headers) {
 	})
 }
 
+// SendPacketWithAddrs builds and sends a TCP segment(with the provided payload
+// & TCPheaders) in an IPv4 packet via the link layer endpoint using the
+// provided source and destination IPv4 addresses.
+func (c *Context) SendPacketWithAddrs(payload []byte, h *Headers, src, dst tcpip.Address) {
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+		Data: c.BuildSegmentWithAddrs(payload, h, src, dst),
+	})
+}
+
 // SendAck sends an ACK packet.
 func (c *Context) SendAck(seq seqnum.Value, bytesReceived int) {
 	c.SendAckWithSACK(seq, bytesReceived, nil)
@@ -490,6 +505,13 @@ func (c *Context) GetV6Packet() []byte {
 // SendV6Packet builds and sends an IPv6 Packet via the link layer endpoint of
 // the context.
 func (c *Context) SendV6Packet(payload []byte, h *Headers) {
+	c.SendV6PacketWithAddrs(payload, h, TestV6Addr, StackV6Addr)
+}
+
+// SendV6PacketWithAddrs builds and sends an IPv6 Packet via the link layer
+// endpoint of the context using the provided source and destination IPv6
+// addresses.
+func (c *Context) SendV6PacketWithAddrs(payload []byte, h *Headers, src, dst tcpip.Address) {
 	// Allocate a buffer for data and headers.
 	buf := buffer.NewView(header.TCPMinimumSize + header.IPv6MinimumSize + len(payload))
 	copy(buf[len(buf)-len(payload):], payload)
@@ -500,8 +522,8 @@ func (c *Context) SendV6Packet(payload []byte, h *Headers) {
 		PayloadLength: uint16(header.TCPMinimumSize + len(payload)),
 		NextHeader:    uint8(tcp.ProtocolNumber),
 		HopLimit:      65,
-		SrcAddr:       TestV6Addr,
-		DstAddr:       StackV6Addr,
+		SrcAddr:       src,
+		DstAddr:       dst,
 	})
 
 	// Initialize the TCP header.
@@ -517,7 +539,7 @@ func (c *Context) SendV6Packet(payload []byte, h *Headers) {
 	})
 
 	// Calculate the TCP pseudo-header checksum.
-	xsum := header.PseudoHeaderChecksum(tcp.ProtocolNumber, TestV6Addr, StackV6Addr, uint16(len(t)))
+	xsum := header.PseudoHeaderChecksum(tcp.ProtocolNumber, src, dst, uint16(len(t)))
 
 	// Calculate the TCP checksum and set it.
 	xsum = header.Checksum(payload, xsum)
-- 
cgit v1.2.3


From ca9cba66d2062811db9fa2b89a610f8eaa13fe99 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 12 Nov 2019 15:58:41 -0800
Subject: seccomp: introduce the GreaterThan rule type

PiperOrigin-RevId: 280075805
---
 pkg/seccomp/seccomp.go       | 17 ++++++++++++++++
 pkg/seccomp/seccomp_rules.go |  3 +++
 pkg/seccomp/seccomp_test.go  | 48 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+)

diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go
index c7503f2cc..fc36efa23 100644
--- a/pkg/seccomp/seccomp.go
+++ b/pkg/seccomp/seccomp.go
@@ -199,6 +199,10 @@ func ruleViolationLabel(ruleSetIdx int, sysno uintptr, idx int) string {
 	return fmt.Sprintf("ruleViolation_%v_%v_%v", ruleSetIdx, sysno, idx)
 }
 
+func ruleLabel(ruleSetIdx int, sysno uintptr, idx int, name string) string {
+	return fmt.Sprintf("rule_%v_%v_%v_%v", ruleSetIdx, sysno, idx, name)
+}
+
 func checkArgsLabel(sysno uintptr) string {
 	return fmt.Sprintf("checkArgs_%v", sysno)
 }
@@ -223,6 +227,19 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action linux.BPFAc
 					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i))
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					labelled = true
+				case GreaterThan:
+					labelGood := fmt.Sprintf("gt%v", i)
+					high, low := uint32(a>>32), uint32(a)
+					// assert arg_high < high
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i))
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
+					// arg_high > high
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+					// arg_low < low
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i))
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jgt|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
+					p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+					labelled = true
 				default:
 					return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a))
 				}
diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go
index 29eec8db1..84c841d7f 100644
--- a/pkg/seccomp/seccomp_rules.go
+++ b/pkg/seccomp/seccomp_rules.go
@@ -49,6 +49,9 @@ func (a AllowAny) String() (s string) {
 // AllowValue specifies a value that needs to be strictly matched.
 type AllowValue uintptr
 
+// GreaterThan specifies a value that needs to be strictly smaller.
+type GreaterThan uintptr
+
 func (a AllowValue) String() (s string) {
 	return fmt.Sprintf("%#x ", uintptr(a))
 }
diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go
index 353686ed3..abbee7051 100644
--- a/pkg/seccomp/seccomp_test.go
+++ b/pkg/seccomp/seccomp_test.go
@@ -340,6 +340,54 @@ func TestBasic(t *testing.T) {
 				},
 			},
 		},
+		{
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								GreaterThan(0xf),
+								GreaterThan(0xabcd000d),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			specs: []spec{
+				{
+					desc: "GreaterThan: Syscall argument allowed",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0x10, 0xffffffff}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "GreaterThan: Syscall argument disallowed (equal)",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0xf, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "Syscall argument disallowed (smaller)",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0x0, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "GreaterThan2: Syscall argument allowed",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0x10, 0xfbcd000d}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "GreaterThan2: Syscall argument disallowed (equal)",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0x10, 0xabcd000d}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "GreaterThan2: Syscall argument disallowed (smaller)",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0x10, 0xa000ffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
 	} {
 		instrs, err := BuildProgram(test.ruleSets, test.defaultAction)
 		if err != nil {
-- 
cgit v1.2.3


From 2c6c9af904c99371fe4381517375cd114917db59 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 12 Nov 2019 20:37:40 -0800
Subject: Add UDP SO_REUSEADDR/SO_REUSEPORT conversion tests.

Add additional tests for UDP SO_REUSEADDR and SO_REUSEPORT interaction.

If all existing all currently bound sockets as well as the current binding
socket have SO_REUSEADDR, or if all existing all currently bound sockets as
well as the current binding socket have SO_REUSEPORT, binding a currently bound
address is allowed. This seems odd since it means that the
SO_REUSEADDR/SO_REUSEPORT behavior can change with the binding of additional
sockets.

PiperOrigin-RevId: 280116163
---
 test/syscalls/linux/socket_ipv4_udp_unbound.cc | 274 +++++++++++++++++++++++++
 1 file changed, 274 insertions(+)

diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index 00dc24928..6b1af6c17 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -1814,6 +1814,280 @@ TEST_P(IPv4UDPUnboundSocketTest, BindReusePortThenReuseAddr) {
               SyscallFailsWithErrno(EADDRINUSE));
 }
 
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReusePort) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR and REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, only with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  // Bind socket3 to the same address as socket1, only with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR and REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, only with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  // Bind socket3 to the same address as socket1, only with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConversionReversable1) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR and REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, only with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  // Close socket2 to revert to just socket1 with REUSEADDR and REUSEPORT.
+  socket2->reset();
+
+  // Bind socket3 to the same address as socket1, only with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConversionReversable2) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR and REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, only with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  // Close socket2 to revert to just socket1 with REUSEADDR and REUSEPORT.
+  socket2->reset();
+
+  // Bind socket3 to the same address as socket1, only with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindDoubleReuseAddrReusePortThenReusePort) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR and REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, also with REUSEADDR and
+  // REUSEPORT.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  // Bind socket3 to the same address as socket1, only with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindDoubleReuseAddrReusePortThenReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR and REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, also with REUSEADDR and
+  // REUSEPORT.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  // Bind socket3 to the same address as socket1, only with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+}
+
 // Check that REUSEPORT takes precedence over REUSEADDR.
 TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
   auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
-- 
cgit v1.2.3


From 05871a1cdc73e98df58f56841be23a4eac27225c Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Mon, 11 Nov 2019 08:20:18 +0000
Subject: Enable runsc/boot support on arm64.

This patch also include a minor change to replace syscall.Dup2
with syscall.Dup3 which was missed in a previous commit(ref a25a976).

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I00beb9cc492e44c762ebaa3750201c63c1f7c2f3
---
 pkg/seccomp/seccomp_test_victim.go |  2 +-
 pkg/sentry/fs/gofer/inode.go       |  2 +-
 runsc/boot/BUILD                   |  2 ++
 runsc/boot/filter/BUILD            |  2 ++
 runsc/boot/filter/config.go        |  6 +-----
 runsc/boot/filter/config_amd64.go  | 31 +++++++++++++++++++++++++++++++
 runsc/boot/filter/config_arm64.go  | 21 +++++++++++++++++++++
 runsc/boot/loader.go               |  4 ----
 runsc/boot/loader_amd64.go         | 28 ++++++++++++++++++++++++++++
 runsc/boot/loader_arm64.go         | 28 ++++++++++++++++++++++++++++
 10 files changed, 115 insertions(+), 11 deletions(-)
 create mode 100644 runsc/boot/filter/config_amd64.go
 create mode 100644 runsc/boot/filter/config_arm64.go
 create mode 100644 runsc/boot/loader_amd64.go
 create mode 100644 runsc/boot/loader_arm64.go

diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go
index 48413f1fb..da6b9eaaf 100644
--- a/pkg/seccomp/seccomp_test_victim.go
+++ b/pkg/seccomp/seccomp_test_victim.go
@@ -38,7 +38,7 @@ func main() {
 		syscall.SYS_CLONE:           {},
 		syscall.SYS_CLOSE:           {},
 		syscall.SYS_DUP:             {},
-		syscall.SYS_DUP2:            {},
+		syscall.SYS_DUP3:            {},
 		syscall.SYS_EPOLL_CREATE1:   {},
 		syscall.SYS_EPOLL_CTL:       {},
 		syscall.SYS_EPOLL_WAIT:      {},
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index 99910388f..54a8ceef8 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -268,7 +268,7 @@ func (i *inodeFileState) recreateReadHandles(ctx context.Context, writer *handle
 	// operations on the old will see the new data. Then, make the new handle take
 	// ownereship of the old FD and mark the old readHandle to not close the FD
 	// when done.
-	if err := syscall.Dup2(h.Host.FD(), i.readHandles.Host.FD()); err != nil {
+	if err := syscall.Dup3(h.Host.FD(), i.readHandles.Host.FD(), 0); err != nil {
 		return err
 	}
 
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 58e86ae7f..847d2f91c 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -15,6 +15,8 @@ go_library(
         "fs.go",
         "limits.go",
         "loader.go",
+        "loader_amd64.go",
+        "loader_arm64.go",
         "network.go",
         "pprof.go",
         "strace.go",
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
index f5509b6b7..3a9dcfc04 100644
--- a/runsc/boot/filter/BUILD
+++ b/runsc/boot/filter/BUILD
@@ -6,6 +6,8 @@ go_library(
     name = "filter",
     srcs = [
         "config.go",
+        "config_amd64.go",
+        "config_arm64.go",
         "extra_filters.go",
         "extra_filters_msan.go",
         "extra_filters_race.go",
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 5ad108261..b5bd61a3a 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -26,10 +26,6 @@ import (
 
 // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
 var allowedSyscalls = seccomp.SyscallRules{
-	syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
-		{seccomp.AllowValue(linux.ARCH_GET_FS)},
-		{seccomp.AllowValue(linux.ARCH_SET_FS)},
-	},
 	syscall.SYS_CLOCK_GETTIME: {},
 	syscall.SYS_CLONE: []seccomp.Rule{
 		{
@@ -44,7 +40,7 @@ var allowedSyscalls = seccomp.SyscallRules{
 	},
 	syscall.SYS_CLOSE:         {},
 	syscall.SYS_DUP:           {},
-	syscall.SYS_DUP2:          {},
+	syscall.SYS_DUP3:          {},
 	syscall.SYS_EPOLL_CREATE1: {},
 	syscall.SYS_EPOLL_CTL:     {},
 	syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
new file mode 100644
index 000000000..058d9c264
--- /dev/null
+++ b/runsc/boot/filter/config_amd64.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+func init() {
+	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
+		{seccomp.AllowValue(linux.ARCH_GET_FS)},
+		{seccomp.AllowValue(linux.ARCH_SET_FS)},
+	}
+}
diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go
new file mode 100644
index 000000000..7fa9bbda3
--- /dev/null
+++ b/runsc/boot/filter/config_arm64.go
@@ -0,0 +1,21 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package filter
+
+// Reserve for future customization.
+func init() {
+}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index f05d5973f..df6052c88 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -43,7 +43,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/sighandling"
-	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
 	"gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
@@ -147,9 +146,6 @@ type execProcess struct {
 func init() {
 	// Initialize the random number generator.
 	mrand.Seed(gtime.Now().UnixNano())
-
-	// Register the global syscall table.
-	kernel.RegisterSyscallTable(slinux.AMD64)
 }
 
 // Args are the arguments for New().
diff --git a/runsc/boot/loader_amd64.go b/runsc/boot/loader_amd64.go
new file mode 100644
index 000000000..d16d20d89
--- /dev/null
+++ b/runsc/boot/loader_amd64.go
@@ -0,0 +1,28 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+// Package boot loads the kernel and runs a container.
+package boot
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+)
+
+func init() {
+	// Register the global syscall table.
+	kernel.RegisterSyscallTable(linux.AMD64)
+}
diff --git a/runsc/boot/loader_arm64.go b/runsc/boot/loader_arm64.go
new file mode 100644
index 000000000..8712e764a
--- /dev/null
+++ b/runsc/boot/loader_arm64.go
@@ -0,0 +1,28 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+// Package boot loads the kernel and runs a container.
+package boot
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+)
+
+func init() {
+	// Register the global syscall table.
+	kernel.RegisterSyscallTable(linux.ARM64)
+}
-- 
cgit v1.2.3


From c5d9b5b8816e99507661e1d39ec51033fb69e212 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Mon, 11 Nov 2019 09:15:45 +0000
Subject: Enable sentry/fs/host support on arm64.

newfstatat() syscall is not supported on arm64, so we resort
to use the fstatat() syscall.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: Iea95550ea53bcf85c01f7b3b95da70ad0952177d
---
 pkg/sentry/fs/host/BUILD                |  2 ++
 pkg/sentry/fs/host/util.go              |  2 +-
 pkg/sentry/fs/host/util_amd64_unsafe.go | 41 +++++++++++++++++++++++++++++++++
 pkg/sentry/fs/host/util_arm64_unsafe.go | 41 +++++++++++++++++++++++++++++++++
 pkg/sentry/fs/host/util_unsafe.go       | 19 ---------------
 5 files changed, 85 insertions(+), 20 deletions(-)
 create mode 100644 pkg/sentry/fs/host/util_amd64_unsafe.go
 create mode 100644 pkg/sentry/fs/host/util_arm64_unsafe.go

diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index 1cbed07ae..23daeb528 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -21,6 +21,8 @@ go_library(
         "socket_unsafe.go",
         "tty.go",
         "util.go",
+        "util_amd64_unsafe.go",
+        "util_arm64_unsafe.go",
         "util_unsafe.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/fs/host",
diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go
index bad61a9a1..e37e687c6 100644
--- a/pkg/sentry/fs/host/util.go
+++ b/pkg/sentry/fs/host/util.go
@@ -155,7 +155,7 @@ func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr {
 		AccessTime:       ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec),
 		ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
 		StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
-		Links:            s.Nlink,
+		Links:            uint64(s.Nlink),
 	}
 }
 
diff --git a/pkg/sentry/fs/host/util_amd64_unsafe.go b/pkg/sentry/fs/host/util_amd64_unsafe.go
new file mode 100644
index 000000000..66da6e9f5
--- /dev/null
+++ b/pkg/sentry/fs/host/util_amd64_unsafe.go
@@ -0,0 +1,41 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) {
+	var stat syscall.Stat_t
+	namePtr, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return stat, err
+	}
+	_, _, errno := syscall.Syscall6(
+		syscall.SYS_NEWFSTATAT,
+		uintptr(fd),
+		uintptr(unsafe.Pointer(namePtr)),
+		uintptr(unsafe.Pointer(&stat)),
+		uintptr(flags),
+		0, 0)
+	if errno != 0 {
+		return stat, errno
+	}
+	return stat, nil
+}
diff --git a/pkg/sentry/fs/host/util_arm64_unsafe.go b/pkg/sentry/fs/host/util_arm64_unsafe.go
new file mode 100644
index 000000000..e8cb94aeb
--- /dev/null
+++ b/pkg/sentry/fs/host/util_arm64_unsafe.go
@@ -0,0 +1,41 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) {
+	var stat syscall.Stat_t
+	namePtr, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return stat, err
+	}
+	_, _, errno := syscall.Syscall6(
+		syscall.SYS_FSTATAT,
+		uintptr(fd),
+		uintptr(unsafe.Pointer(namePtr)),
+		uintptr(unsafe.Pointer(&stat)),
+		uintptr(flags),
+		0, 0)
+	if errno != 0 {
+		return stat, errno
+	}
+	return stat, nil
+}
diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go
index 2b76f1065..3ab36b088 100644
--- a/pkg/sentry/fs/host/util_unsafe.go
+++ b/pkg/sentry/fs/host/util_unsafe.go
@@ -116,22 +116,3 @@ func setTimestamps(fd int, ts fs.TimeSpec) error {
 	}
 	return nil
 }
-
-func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) {
-	var stat syscall.Stat_t
-	namePtr, err := syscall.BytePtrFromString(name)
-	if err != nil {
-		return stat, err
-	}
-	_, _, errno := syscall.Syscall6(
-		syscall.SYS_NEWFSTATAT,
-		uintptr(fd),
-		uintptr(unsafe.Pointer(namePtr)),
-		uintptr(unsafe.Pointer(&stat)),
-		uintptr(flags),
-		0, 0)
-	if errno != 0 {
-		return stat, errno
-	}
-	return stat, nil
-}
-- 
cgit v1.2.3


From 1d8b7292d72ce93d465e4ded19237fb92c08bc56 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Mon, 11 Nov 2019 09:42:04 +0000
Subject: Fix some build errors on arm64.

Initialize the VDSO "os" and "arch" fields explicitly,
or the VDSO load process would failed on arm64 platform.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: Ic6768df88e43cd7c7956eb630511672ae11ac52f
---
 pkg/sentry/kernel/ptrace_arm64.go | 1 -
 pkg/sentry/loader/vdso.go         | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go
index 0acdf769d..61e412911 100644
--- a/pkg/sentry/kernel/ptrace_arm64.go
+++ b/pkg/sentry/kernel/ptrace_arm64.go
@@ -17,7 +17,6 @@
 package kernel
 
 import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index ada28aea3..df8a81907 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -268,6 +268,8 @@ func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, er
 		// some applications may not be able to handle multiple [vdso]
 		// hints.
 		vdso:  mm.NewSpecialMappable("", mfp, vdso),
+		os:    info.os,
+		arch:  info.arch,
 		phdrs: info.phdrs,
 	}, nil
 }
-- 
cgit v1.2.3


From c2d3dc0c13e1adfc182c33d57e410c46fe12415f Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Tue, 12 Nov 2019 22:49:22 -0800
Subject: Use overlay MountSource when binding socket in overlay.

PiperOrigin-RevId: 280131840
---
 pkg/sentry/fs/inode_overlay.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index 5a388dad1..a09147080 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -462,7 +462,9 @@ func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name stri
 		inode.DecRef()
 		return nil, err
 	}
-	return NewDirent(ctx, newOverlayInode(ctx, entry, inode.MountSource), name), nil
+	// Use the parent's MountSource, since that corresponds to the overlay,
+	// and not the upper filesystem.
+	return NewDirent(ctx, newOverlayInode(ctx, entry, parent.Inode.MountSource), name), nil
 }
 
 func overlayBoundEndpoint(o *overlayEntry, path string) transport.BoundEndpoint {
-- 
cgit v1.2.3


From 683e8798ab4c2bde60f067563eef0cf06dc9bda5 Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Wed, 13 Nov 2019 13:20:29 -0800
Subject: Extract linux-specific test setup to separate file

PiperOrigin-RevId: 280264564
---
 test/util/BUILD             |  8 +++++++-
 test/util/test_util.cc      | 13 -------------
 test/util/test_util.h       |  2 ++
 test/util/test_util_impl.cc | 38 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 14 deletions(-)
 create mode 100644 test/util/test_util_impl.cc

diff --git a/test/util/BUILD b/test/util/BUILD
index 5d2a9cc2c..4526bb3f1 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -232,7 +232,13 @@ cc_library(
 cc_library(
     name = "test_util",
     testonly = 1,
-    srcs = ["test_util.cc"],
+    srcs = [
+        "test_util.cc",
+    ] + select_for_linux(
+        [
+            "test_util_impl.cc",
+        ],
+    ),
     hdrs = ["test_util.h"],
     deps = [
         ":fs_util",
diff --git a/test/util/test_util.cc b/test/util/test_util.cc
index ba0dcf7d0..9cb050735 100644
--- a/test/util/test_util.cc
+++ b/test/util/test_util.cc
@@ -116,9 +116,6 @@ PosixErrorOr<KernelVersion> GetKernelVersion() {
   return ParseKernelVersion(buf.release);
 }
 
-void SetupGvisorDeathTest() {
-}
-
 std::string CPUSetToString(const cpu_set_t& set, size_t cpus) {
   std::string str = "cpuset[";
   for (unsigned int n = 0; n < cpus; n++) {
@@ -224,15 +221,5 @@ bool Equivalent(uint64_t current, uint64_t target, double tolerance) {
   return abs_diff <= static_cast<uint64_t>(tolerance * target);
 }
 
-void TestInit(int* argc, char*** argv) {
-  ::testing::InitGoogleTest(argc, *argv);
-  ::absl::ParseCommandLine(*argc, *argv);
-
-  // Always mask SIGPIPE as it's common and tests aren't expected to handle it.
-  struct sigaction sa = {};
-  sa.sa_handler = SIG_IGN;
-  TEST_CHECK(sigaction(SIGPIPE, &sa, nullptr) == 0);
-}
-
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/util/test_util.h b/test/util/test_util.h
index b9d2dc2ba..dc30575b8 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -221,7 +221,9 @@ enum class Platform {
 bool IsRunningOnGvisor();
 Platform GvisorPlatform();
 
+#ifdef __linux__
 void SetupGvisorDeathTest();
+#endif
 
 struct KernelVersion {
   int major;
diff --git a/test/util/test_util_impl.cc b/test/util/test_util_impl.cc
new file mode 100644
index 000000000..ba7c0a85b
--- /dev/null
+++ b/test/util/test_util_impl.cc
@@ -0,0 +1,38 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+
+#include "gtest/gtest.h"
+#include "absl/flags/flag.h"
+#include "absl/flags/parse.h"
+#include "test/util/logging.h"
+
+namespace gvisor {
+namespace testing {
+
+void SetupGvisorDeathTest() {}
+
+void TestInit(int* argc, char*** argv) {
+  ::testing::InitGoogleTest(argc, *argv);
+  ::absl::ParseCommandLine(*argc, *argv);
+
+  // Always mask SIGPIPE as it's common and tests aren't expected to handle it.
+  struct sigaction sa = {};
+  sa.sa_handler = SIG_IGN;
+  TEST_CHECK(sigaction(SIGPIPE, &sa, nullptr) == 0);
+}
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From 6dd4c9ee74828b27cd12ea343756f5625bae683c Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 13 Nov 2019 14:27:46 -0800
Subject: Fix flaky behaviour during S/R.

PiperOrigin-RevId: 280280156
---
 pkg/tcpip/transport/tcp/connect.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index a114c06c1..be066d877 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -1229,7 +1229,9 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 							return err
 						}
 					}
-					if e.state != StateError {
+					if e.state != StateClose && e.state != StateError {
+						// Only block the worker if the endpoint
+						// is not in closed state or error state.
 						close(e.drainDone)
 						<-e.undrain
 					}
-- 
cgit v1.2.3


From 1e55eb3800a60c1a1118b84f2534b78481702f38 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 13 Nov 2019 15:34:47 -0800
Subject: test/syscalls/proc: check an return code of waitid

PiperOrigin-RevId: 280295208
---
 test/syscalls/linux/proc.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index e4c030bbb..512de5ee0 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -183,7 +183,8 @@ PosixError WithSubprocess(SubprocessCallback const& running,
   siginfo_t info;
   // Wait until the child process has exited (WEXITED flag) but don't
   // reap the child (WNOWAIT flag).
-  waitid(P_PID, child_pid, &info, WNOWAIT | WEXITED);
+  EXPECT_THAT(waitid(P_PID, child_pid, &info, WNOWAIT | WEXITED),
+              SyscallSucceeds());
 
   if (zombied) {
     // Arg of "Z" refers to a Zombied Process.
-- 
cgit v1.2.3


From 3f7d9370909a598cf83dfa07a1e87545a66e182f Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 14 Nov 2019 10:14:07 -0800
Subject: Use PacketBuffers for outgoing packets.

PiperOrigin-RevId: 280455453
---
 pkg/tcpip/buffer/prependable.go                    |   6 ++
 pkg/tcpip/link/channel/channel.go                  |  42 ++++----
 pkg/tcpip/link/fdbased/endpoint.go                 |  21 ++--
 pkg/tcpip/link/fdbased/endpoint_test.go            |  10 +-
 pkg/tcpip/link/loopback/loopback.go                |  30 +++---
 pkg/tcpip/link/muxed/injectable.go                 |   6 +-
 pkg/tcpip/link/muxed/injectable_test.go            |  12 ++-
 pkg/tcpip/link/sharedmem/sharedmem.go              |  13 +--
 pkg/tcpip/link/sharedmem/sharedmem_test.go         |  59 ++++++++---
 pkg/tcpip/link/sniffer/sniffer.go                  |  33 +++---
 pkg/tcpip/link/waitable/waitable.go                |   8 +-
 pkg/tcpip/link/waitable/waitable_test.go           |  10 +-
 pkg/tcpip/network/arp/arp.go                       |  26 +++--
 pkg/tcpip/network/arp/arp_test.go                  |  10 +-
 pkg/tcpip/network/ip_test.go                       |  18 ++--
 pkg/tcpip/network/ipv4/icmp.go                     |   6 +-
 pkg/tcpip/network/ipv4/ipv4.go                     | 116 ++++++++++++---------
 pkg/tcpip/network/ipv4/ipv4_test.go                |  41 ++++----
 pkg/tcpip/network/ipv6/icmp.go                     |  13 ++-
 pkg/tcpip/network/ipv6/icmp_test.go                |  16 +--
 pkg/tcpip/network/ipv6/ipv6.go                     |  16 +--
 pkg/tcpip/packet_buffer.go                         |  21 ++--
 pkg/tcpip/packet_buffer_state.go                   |   1 +
 pkg/tcpip/stack/ndp.go                             |   4 +-
 pkg/tcpip/stack/ndp_test.go                        |   2 +-
 pkg/tcpip/stack/nic.go                             |   6 +-
 pkg/tcpip/stack/registration.go                    |  17 +--
 pkg/tcpip/stack/route.go                           |  12 +--
 pkg/tcpip/stack/stack.go                           |   6 +-
 pkg/tcpip/stack/stack_test.go                      |  22 ++--
 pkg/tcpip/stack/transport_test.go                  |   9 +-
 pkg/tcpip/transport/icmp/endpoint.go               |  12 ++-
 pkg/tcpip/transport/raw/endpoint.go                |   9 +-
 pkg/tcpip/transport/tcp/connect.go                 |   5 +-
 pkg/tcpip/transport/tcp/testing/context/context.go |  18 ++--
 pkg/tcpip/transport/udp/endpoint.go                |   5 +-
 pkg/tcpip/transport/udp/protocol.go                |  10 +-
 pkg/tcpip/transport/udp/udp_test.go                |  14 +--
 38 files changed, 406 insertions(+), 279 deletions(-)

diff --git a/pkg/tcpip/buffer/prependable.go b/pkg/tcpip/buffer/prependable.go
index 48a2a2713..ba21f4eca 100644
--- a/pkg/tcpip/buffer/prependable.go
+++ b/pkg/tcpip/buffer/prependable.go
@@ -77,3 +77,9 @@ func (p *Prependable) Prepend(size int) []byte {
 	p.usedIdx -= size
 	return p.View()[:size:size]
 }
+
+// DeepCopy copies p and the bytes backing it.
+func (p Prependable) DeepCopy() Prependable {
+	p.buf = append(View(nil), p.buf...)
+	return p
+}
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 22eefb564..9fe8e9f9d 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -25,10 +25,9 @@ import (
 
 // PacketInfo holds all the information about an outbound packet.
 type PacketInfo struct {
-	Header  buffer.View
-	Payload buffer.View
-	Proto   tcpip.NetworkProtocolNumber
-	GSO     *stack.GSO
+	Pkt   tcpip.PacketBuffer
+	Proto tcpip.NetworkProtocolNumber
+	GSO   *stack.GSO
 }
 
 // Endpoint is link layer endpoint that stores outbound packets in a channel
@@ -118,12 +117,11 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
 }
 
 // WritePacket stores outbound packets into the channel.
-func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
+func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
 	p := PacketInfo{
-		Header:  hdr.View(),
-		Proto:   protocol,
-		Payload: payload.ToView(),
-		GSO:     gso,
+		Pkt:   pkt,
+		Proto: protocol,
+		GSO:   gso,
 	}
 
 	select {
@@ -139,15 +137,16 @@ func (e *Endpoint) WritePackets(_ *stack.Route, gso *stack.GSO, hdrs []stack.Pac
 	payloadView := payload.ToView()
 	n := 0
 packetLoop:
-	for i := range hdrs {
-		hdr := &hdrs[i].Hdr
-		off := hdrs[i].Off
-		size := hdrs[i].Size
+	for _, hdr := range hdrs {
+		off := hdr.Off
+		size := hdr.Size
 		p := PacketInfo{
-			Header:  hdr.View(),
-			Proto:   protocol,
-			Payload: buffer.NewViewFromBytes(payloadView[off : off+size]),
-			GSO:     gso,
+			Pkt: tcpip.PacketBuffer{
+				Header: hdr.Hdr,
+				Data:   buffer.NewViewFromBytes(payloadView[off : off+size]).ToVectorisedView(),
+			},
+			Proto: protocol,
+			GSO:   gso,
 		}
 
 		select {
@@ -162,12 +161,11 @@ packetLoop:
 }
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *Endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	p := PacketInfo{
-		Header:  packet.ToView(),
-		Proto:   0,
-		Payload: buffer.View{},
-		GSO:     nil,
+		Pkt:   tcpip.PacketBuffer{Data: vv},
+		Proto: 0,
+		GSO:   nil,
 	}
 
 	select {
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index edef7db26..98109c5dc 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -386,10 +386,11 @@ const (
 
 // WritePacket writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
 	if e.hdrSize > 0 {
 		// Add ethernet header if needed.
-		eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize))
+		eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
+		pkt.LinkHeader = buffer.View(eth)
 		ethHdr := &header.EthernetFields{
 			DstAddr: r.RemoteLinkAddress,
 			Type:    protocol,
@@ -408,13 +409,13 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 		vnetHdr := virtioNetHdr{}
 		vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr)
 		if gso != nil {
-			vnetHdr.hdrLen = uint16(hdr.UsedLength())
+			vnetHdr.hdrLen = uint16(pkt.Header.UsedLength())
 			if gso.NeedsCsum {
 				vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
 				vnetHdr.csumStart = header.EthernetMinimumSize + gso.L3HdrLen
 				vnetHdr.csumOffset = gso.CsumOffset
 			}
-			if gso.Type != stack.GSONone && uint16(payload.Size()) > gso.MSS {
+			if gso.Type != stack.GSONone && uint16(pkt.Data.Size()) > gso.MSS {
 				switch gso.Type {
 				case stack.GSOTCPv4:
 					vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
@@ -427,14 +428,14 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 			}
 		}
 
-		return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, hdr.View(), payload.ToView())
+		return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, pkt.Header.View(), pkt.Data.ToView())
 	}
 
-	if payload.Size() == 0 {
-		return rawfile.NonBlockingWrite(e.fds[0], hdr.View())
+	if pkt.Data.Size() == 0 {
+		return rawfile.NonBlockingWrite(e.fds[0], pkt.Header.View())
 	}
 
-	return rawfile.NonBlockingWrite3(e.fds[0], hdr.View(), payload.ToView(), nil)
+	return rawfile.NonBlockingWrite3(e.fds[0], pkt.Header.View(), pkt.Data.ToView(), nil)
 }
 
 // WritePackets writes outbound packets to the file descriptor. If it is not
@@ -555,8 +556,8 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.Pac
 }
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
-	return rawfile.NonBlockingWrite(e.fds[0], packet.ToView())
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	return rawfile.NonBlockingWrite(e.fds[0], vv.ToView())
 }
 
 // InjectOutobund implements stack.InjectableEndpoint.InjectOutbound.
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index 7e08e033b..2066987eb 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -168,7 +168,10 @@ func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32) {
 			L3HdrLen:   header.IPv4MaximumHeaderSize,
 		}
 	}
-	if err := c.ep.WritePacket(r, gso, hdr, payload.ToVectorisedView(), proto); err != nil {
+	if err := c.ep.WritePacket(r, gso, proto, tcpip.PacketBuffer{
+		Header: hdr,
+		Data:   payload.ToVectorisedView(),
+	}); err != nil {
 		t.Fatalf("WritePacket failed: %v", err)
 	}
 
@@ -258,7 +261,10 @@ func TestPreserveSrcAddress(t *testing.T) {
 	// WritePacket panics given a prependable with anything less than
 	// the minimum size of the ethernet header.
 	hdr := buffer.NewPrependable(header.EthernetMinimumSize)
-	if err := c.ep.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, proto); err != nil {
+	if err := c.ep.WritePacket(r, nil /* gso */, proto, tcpip.PacketBuffer{
+		Header: hdr,
+		Data:   buffer.VectorisedView{},
+	}); err != nil {
 		t.Fatalf("WritePacket failed: %v", err)
 	}
 
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index bc5d8a2f3..563a67188 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -76,16 +76,16 @@ func (*endpoint) Wait() {}
 
 // WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound
 // packets to the network-layer dispatcher.
-func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
-	views := make([]buffer.View, 1, 1+len(payload.Views()))
-	views[0] = hdr.View()
-	views = append(views, payload.Views()...)
-
-	// Because we're immediately turning around and writing the packet back to the
-	// rx path, we intentionally don't preserve the remote and local link
-	// addresses from the stack.Route we're passed.
+func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+	views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
+	views[0] = pkt.Header.View()
+	views = append(views, pkt.Data.Views()...)
+
+	// Because we're immediately turning around and writing the packet back
+	// to the rx path, we intentionally don't preserve the remote and local
+	// link addresses from the stack.Route we're passed.
 	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, tcpip.PacketBuffer{
-		Data: buffer.NewVectorisedView(len(views[0])+payload.Size(), views),
+		Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 	})
 
 	return nil
@@ -97,17 +97,17 @@ func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, hdrs []stack.Packe
 }
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	// Reject the packet if it's shorter than an ethernet header.
-	if packet.Size() < header.EthernetMinimumSize {
+	if vv.Size() < header.EthernetMinimumSize {
 		return tcpip.ErrBadAddress
 	}
 
-	// There should be an ethernet header at the beginning of packet.
-	linkHeader := header.Ethernet(packet.First()[:header.EthernetMinimumSize])
-	packet.TrimFront(len(linkHeader))
+	// There should be an ethernet header at the beginning of vv.
+	linkHeader := header.Ethernet(vv.First()[:header.EthernetMinimumSize])
+	vv.TrimFront(len(linkHeader))
 	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), tcpip.PacketBuffer{
-		Data:       packet,
+		Data:       vv,
 		LinkHeader: buffer.View(linkHeader),
 	})
 
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
index 9a8e8ebfe..55ed2a28e 100644
--- a/pkg/tcpip/link/muxed/injectable.go
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -98,15 +98,15 @@ func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs [
 // WritePacket writes outbound packets to the appropriate LinkInjectableEndpoint
 // based on the RemoteAddress. HandleLocal only works if r.RemoteAddress has a
 // route registered in this endpoint.
-func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
+func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
 	if endpoint, ok := m.routes[r.RemoteAddress]; ok {
-		return endpoint.WritePacket(r, gso, hdr, payload, protocol)
+		return endpoint.WritePacket(r, gso, protocol, pkt)
 	}
 	return tcpip.ErrNoRoute
 }
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (m *InjectableEndpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+func (m *InjectableEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
 	// WriteRawPacket doesn't get a route or network address, so there's
 	// nowhere to write this.
 	return tcpip.ErrNoRoute
diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go
index 9cd300af8..63b249837 100644
--- a/pkg/tcpip/link/muxed/injectable_test.go
+++ b/pkg/tcpip/link/muxed/injectable_test.go
@@ -50,8 +50,10 @@ func TestInjectableEndpointDispatch(t *testing.T) {
 	hdr.Prepend(1)[0] = 0xFA
 	packetRoute := stack.Route{RemoteAddress: dstIP}
 
-	endpoint.WritePacket(&packetRoute, nil /* gso */, hdr,
-		buffer.NewViewFromBytes([]byte{0xFB}).ToVectorisedView(), ipv4.ProtocolNumber)
+	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, tcpip.PacketBuffer{
+		Header: hdr,
+		Data:   buffer.NewViewFromBytes([]byte{0xFB}).ToVectorisedView(),
+	})
 
 	buf := make([]byte, 6500)
 	bytesRead, err := sock.Read(buf)
@@ -68,8 +70,10 @@ func TestInjectableEndpointDispatchHdrOnly(t *testing.T) {
 	hdr := buffer.NewPrependable(1)
 	hdr.Prepend(1)[0] = 0xFA
 	packetRoute := stack.Route{RemoteAddress: dstIP}
-	endpoint.WritePacket(&packetRoute, nil /* gso */, hdr,
-		buffer.NewView(0).ToVectorisedView(), ipv4.ProtocolNumber)
+	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, tcpip.PacketBuffer{
+		Header: hdr,
+		Data:   buffer.NewView(0).ToVectorisedView(),
+	})
 	buf := make([]byte, 6500)
 	bytesRead, err := sock.Read(buf)
 	if err != nil {
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 2bace5298..88947a03a 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -185,9 +185,10 @@ func (e *endpoint) LinkAddress() tcpip.LinkAddress {
 
 // WritePacket writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
 	// Add the ethernet header here.
-	eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize))
+	eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
+	pkt.LinkHeader = buffer.View(eth)
 	ethHdr := &header.EthernetFields{
 		DstAddr: r.RemoteLinkAddress,
 		Type:    protocol,
@@ -199,10 +200,10 @@ func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.Prependa
 	}
 	eth.Encode(ethHdr)
 
-	v := payload.ToView()
+	v := pkt.Data.ToView()
 	// Transmit the packet.
 	e.mu.Lock()
-	ok := e.tx.transmit(hdr.View(), v)
+	ok := e.tx.transmit(pkt.Header.View(), v)
 	e.mu.Unlock()
 
 	if !ok {
@@ -218,8 +219,8 @@ func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, hdrs []stack.Packe
 }
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
-	v := packet.ToView()
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	v := vv.ToView()
 	// Transmit the packet.
 	e.mu.Lock()
 	ok := e.tx.transmit(v, buffer.View{})
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 199406886..89603c48f 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -273,7 +273,10 @@ func TestSimpleSend(t *testing.T) {
 			randomFill(buf)
 
 			proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
-			if err := c.ep.WritePacket(&r, nil /* gso */, hdr, buf.ToVectorisedView(), proto); err != nil {
+			if err := c.ep.WritePacket(&r, nil /* gso */, proto, tcpip.PacketBuffer{
+				Header: hdr,
+				Data:   buf.ToVectorisedView(),
+			}); err != nil {
 				t.Fatalf("WritePacket failed: %v", err)
 			}
 
@@ -342,7 +345,9 @@ func TestPreserveSrcAddressInSend(t *testing.T) {
 	hdr := buffer.NewPrependable(header.EthernetMinimumSize)
 
 	proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
-	if err := c.ep.WritePacket(&r, nil /* gso */, hdr, buffer.VectorisedView{}, proto); err != nil {
+	if err := c.ep.WritePacket(&r, nil /* gso */, proto, tcpip.PacketBuffer{
+		Header: hdr,
+	}); err != nil {
 		t.Fatalf("WritePacket failed: %v", err)
 	}
 
@@ -396,7 +401,10 @@ func TestFillTxQueue(t *testing.T) {
 	for i := queuePipeSize / 40; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
 
-		if err := c.ep.WritePacket(&r, nil /* gso */, hdr, buf.ToVectorisedView(), header.IPv4ProtocolNumber); err != nil {
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+			Header: hdr,
+			Data:   buf.ToVectorisedView(),
+		}); err != nil {
 			t.Fatalf("WritePacket failed unexpectedly: %v", err)
 		}
 
@@ -411,7 +419,10 @@ func TestFillTxQueue(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, hdr, buf.ToVectorisedView(), header.IPv4ProtocolNumber); err != want {
+	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		Header: hdr,
+		Data:   buf.ToVectorisedView(),
+	}); err != want {
 		t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want)
 	}
 }
@@ -436,7 +447,10 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 	// Send two packets so that the id slice has at least two slots.
 	for i := 2; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, hdr, buf.ToVectorisedView(), header.IPv4ProtocolNumber); err != nil {
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+			Header: hdr,
+			Data:   buf.ToVectorisedView(),
+		}); err != nil {
 			t.Fatalf("WritePacket failed unexpectedly: %v", err)
 		}
 	}
@@ -456,7 +470,10 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 	ids := make(map[uint64]struct{})
 	for i := queuePipeSize / 40; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, hdr, buf.ToVectorisedView(), header.IPv4ProtocolNumber); err != nil {
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+			Header: hdr,
+			Data:   buf.ToVectorisedView(),
+		}); err != nil {
 			t.Fatalf("WritePacket failed unexpectedly: %v", err)
 		}
 
@@ -471,7 +488,10 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, hdr, buf.ToVectorisedView(), header.IPv4ProtocolNumber); err != want {
+	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		Header: hdr,
+		Data:   buf.ToVectorisedView(),
+	}); err != want {
 		t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want)
 	}
 }
@@ -494,7 +514,10 @@ func TestFillTxMemory(t *testing.T) {
 	ids := make(map[uint64]struct{})
 	for i := queueDataSize / bufferSize; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, hdr, buf.ToVectorisedView(), header.IPv4ProtocolNumber); err != nil {
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+			Header: hdr,
+			Data:   buf.ToVectorisedView(),
+		}); err != nil {
 			t.Fatalf("WritePacket failed unexpectedly: %v", err)
 		}
 
@@ -510,7 +533,10 @@ func TestFillTxMemory(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	err := c.ep.WritePacket(&r, nil /* gso */, hdr, buf.ToVectorisedView(), header.IPv4ProtocolNumber)
+	err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		Header: hdr,
+		Data:   buf.ToVectorisedView(),
+	})
 	if want := tcpip.ErrWouldBlock; err != want {
 		t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want)
 	}
@@ -535,7 +561,10 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	// until there is only one buffer left.
 	for i := queueDataSize/bufferSize - 1; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, hdr, buf.ToVectorisedView(), header.IPv4ProtocolNumber); err != nil {
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+			Header: hdr,
+			Data:   buf.ToVectorisedView(),
+		}); err != nil {
 			t.Fatalf("WritePacket failed unexpectedly: %v", err)
 		}
 
@@ -548,7 +577,10 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	{
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
 		uu := buffer.NewView(bufferSize).ToVectorisedView()
-		if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, hdr, uu, header.IPv4ProtocolNumber); err != want {
+		if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+			Header: hdr,
+			Data:   uu,
+		}); err != want {
 			t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want)
 		}
 	}
@@ -556,7 +588,10 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	// Attempt to write the one-buffer packet again. It must succeed.
 	{
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, hdr, buf.ToVectorisedView(), header.IPv4ProtocolNumber); err != nil {
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+			Header: hdr,
+			Data:   buf.ToVectorisedView(),
+		}); err != nil {
 			t.Fatalf("WritePacket failed unexpectedly: %v", err)
 		}
 	}
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index d71a03cd2..122680e10 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -193,19 +193,19 @@ func (e *endpoint) GSOMaxSize() uint32 {
 	return 0
 }
 
-func (e *endpoint) dumpPacket(gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) {
+func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("send", protocol, hdr.View(), gso)
+		logPacket("send", protocol, pkt.Header.View(), gso)
 	}
 	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
-		hdrBuf := hdr.View()
-		length := len(hdrBuf) + payload.Size()
+		hdrBuf := pkt.Header.View()
+		length := len(hdrBuf) + pkt.Data.Size()
 		if length > int(e.maxPCAPLen) {
 			length = int(e.maxPCAPLen)
 		}
 
 		buf := bytes.NewBuffer(make([]byte, 0, pcapPacketHeaderLen+length))
-		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(len(hdrBuf)+payload.Size()))); err != nil {
+		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(len(hdrBuf)+pkt.Data.Size()))); err != nil {
 			panic(err)
 		}
 		if len(hdrBuf) > length {
@@ -215,7 +215,7 @@ func (e *endpoint) dumpPacket(gso *stack.GSO, hdr buffer.Prependable, payload bu
 			panic(err)
 		}
 		length -= len(hdrBuf)
-		logVectorisedView(payload, length, buf)
+		logVectorisedView(pkt.Data, length, buf)
 		if _, err := e.file.Write(buf.Bytes()); err != nil {
 			panic(err)
 		}
@@ -225,9 +225,9 @@ func (e *endpoint) dumpPacket(gso *stack.GSO, hdr buffer.Prependable, payload bu
 // WritePacket implements the stack.LinkEndpoint interface. It is called by
 // higher-level protocols to write packets; it just logs the packet and
 // forwards the request to the lower endpoint.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
-	e.dumpPacket(gso, hdr, payload, protocol)
-	return e.lower.WritePacket(r, gso, hdr, payload, protocol)
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+	e.dumpPacket(gso, protocol, pkt)
+	return e.lower.WritePacket(r, gso, protocol, pkt)
 }
 
 // WritePackets implements the stack.LinkEndpoint interface. It is called by
@@ -236,32 +236,35 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	view := payload.ToView()
 	for _, d := range hdrs {
-		e.dumpPacket(gso, d.Hdr, buffer.NewVectorisedView(d.Size, []buffer.View{view[d.Off:][:d.Size]}), protocol)
+		e.dumpPacket(gso, protocol, tcpip.PacketBuffer{
+			Header: d.Hdr,
+			Data:   view[d.Off:][:d.Size].ToVectorisedView(),
+		})
 	}
 	return e.lower.WritePackets(r, gso, hdrs, payload, protocol)
 }
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
 		logPacket("send", 0, buffer.View("[raw packet, no header available]"), nil /* gso */)
 	}
 	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
-		length := packet.Size()
+		length := vv.Size()
 		if length > int(e.maxPCAPLen) {
 			length = int(e.maxPCAPLen)
 		}
 
 		buf := bytes.NewBuffer(make([]byte, 0, pcapPacketHeaderLen+length))
-		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(packet.Size()))); err != nil {
+		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(vv.Size()))); err != nil {
 			panic(err)
 		}
-		logVectorisedView(packet, length, buf)
+		logVectorisedView(vv, length, buf)
 		if _, err := e.file.Write(buf.Bytes()); err != nil {
 			panic(err)
 		}
 	}
-	return e.lower.WriteRawPacket(packet)
+	return e.lower.WriteRawPacket(vv)
 }
 
 func logVectorisedView(vv buffer.VectorisedView, length int, buf *bytes.Buffer) {
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
index b440970e0..12e7c1932 100644
--- a/pkg/tcpip/link/waitable/waitable.go
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -99,12 +99,12 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
 // WritePacket implements stack.LinkEndpoint.WritePacket. It is called by
 // higher-level protocols to write packets. It only forwards packets to the
 // lower endpoint if Wait or WaitWrite haven't been called.
-func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
 	if !e.writeGate.Enter() {
 		return nil
 	}
 
-	err := e.lower.WritePacket(r, gso, hdr, payload, protocol)
+	err := e.lower.WritePacket(r, gso, protocol, pkt)
 	e.writeGate.Leave()
 	return err
 }
@@ -123,12 +123,12 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.Pac
 }
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *Endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	if !e.writeGate.Enter() {
 		return nil
 	}
 
-	err := e.lower.WriteRawPacket(packet)
+	err := e.lower.WriteRawPacket(vv)
 	e.writeGate.Leave()
 	return err
 }
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
index df2e70e54..0fc0c2ebe 100644
--- a/pkg/tcpip/link/waitable/waitable_test.go
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -65,7 +65,7 @@ func (e *countedEndpoint) LinkAddress() tcpip.LinkAddress {
 	return e.linkAddr
 }
 
-func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
+func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
 	e.writeCount++
 	return nil
 }
@@ -76,7 +76,7 @@ func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, hdrs []stac
 	return len(hdrs), nil
 }
 
-func (e *countedEndpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+func (e *countedEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
 	e.writeCount++
 	return nil
 }
@@ -89,21 +89,21 @@ func TestWaitWrite(t *testing.T) {
 	wep := New(ep)
 
 	// Write and check that it goes through.
-	wep.WritePacket(nil, nil /* gso */, buffer.Prependable{}, buffer.VectorisedView{}, 0)
+	wep.WritePacket(nil, nil /* gso */, 0, tcpip.PacketBuffer{})
 	if want := 1; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
 
 	// Wait on dispatches, then try to write. It must go through.
 	wep.WaitDispatch()
-	wep.WritePacket(nil, nil /* gso */, buffer.Prependable{}, buffer.VectorisedView{}, 0)
+	wep.WritePacket(nil, nil /* gso */, 0, tcpip.PacketBuffer{})
 	if want := 2; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
 
 	// Wait on writes, then try to write. It must not go through.
 	wep.WaitWrite()
-	wep.WritePacket(nil, nil /* gso */, buffer.Prependable{}, buffer.VectorisedView{}, 0)
+	wep.WritePacket(nil, nil /* gso */, 0, tcpip.PacketBuffer{})
 	if want := 2; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 0ee509ebe..30aec9ba7 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -79,7 +79,7 @@ func (e *endpoint) MaxHeaderLength() uint16 {
 
 func (e *endpoint) Close() {}
 
-func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, buffer.Prependable, buffer.VectorisedView, stack.NetworkHeaderParams, stack.PacketLooping) *tcpip.Error {
+func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, stack.PacketLooping, tcpip.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
@@ -88,7 +88,7 @@ func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []stack.PacketDescript
 	return 0, tcpip.ErrNotSupported
 }
 
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
@@ -106,14 +106,16 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 			return // we have no useful answer, ignore the request
 		}
 		hdr := buffer.NewPrependable(int(e.linkEP.MaxHeaderLength()) + header.ARPSize)
-		pkt := header.ARP(hdr.Prepend(header.ARPSize))
-		pkt.SetIPv4OverEthernet()
-		pkt.SetOp(header.ARPReply)
-		copy(pkt.HardwareAddressSender(), r.LocalLinkAddress[:])
-		copy(pkt.ProtocolAddressSender(), h.ProtocolAddressTarget())
-		copy(pkt.HardwareAddressTarget(), h.HardwareAddressSender())
-		copy(pkt.ProtocolAddressTarget(), h.ProtocolAddressSender())
-		e.linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber)
+		packet := header.ARP(hdr.Prepend(header.ARPSize))
+		packet.SetIPv4OverEthernet()
+		packet.SetOp(header.ARPReply)
+		copy(packet.HardwareAddressSender(), r.LocalLinkAddress[:])
+		copy(packet.ProtocolAddressSender(), h.ProtocolAddressTarget())
+		copy(packet.HardwareAddressTarget(), h.HardwareAddressSender())
+		copy(packet.ProtocolAddressTarget(), h.ProtocolAddressSender())
+		e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, tcpip.PacketBuffer{
+			Header: hdr,
+		})
 		fallthrough // also fill the cache from requests
 	case header.ARPReply:
 		addr := tcpip.Address(h.ProtocolAddressSender())
@@ -165,7 +167,9 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 	copy(h.ProtocolAddressSender(), localAddr)
 	copy(h.ProtocolAddressTarget(), addr)
 
-	return linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber)
+	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, tcpip.PacketBuffer{
+		Header: hdr,
+	})
 }
 
 // ResolveStaticAddress implements stack.LinkAddressResolver.
diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
index 47098bfdc..8e6048a21 100644
--- a/pkg/tcpip/network/arp/arp_test.go
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -110,13 +110,13 @@ func TestDirectRequest(t *testing.T) {
 	for i, address := range []tcpip.Address{stackAddr1, stackAddr2} {
 		t.Run(strconv.Itoa(i), func(t *testing.T) {
 			inject(address)
-			pkt := <-c.linkEP.C
-			if pkt.Proto != arp.ProtocolNumber {
-				t.Fatalf("expected ARP response, got network protocol number %d", pkt.Proto)
+			pi := <-c.linkEP.C
+			if pi.Proto != arp.ProtocolNumber {
+				t.Fatalf("expected ARP response, got network protocol number %d", pi.Proto)
 			}
-			rep := header.ARP(pkt.Header)
+			rep := header.ARP(pi.Pkt.Header.View())
 			if !rep.IsValid() {
-				t.Fatalf("invalid ARP response len(pkt.Header)=%d", len(pkt.Header))
+				t.Fatalf("invalid ARP response pi.Pkt.Header.UsedLength()=%d", pi.Pkt.Header.UsedLength())
 			}
 			if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr; got != want {
 				t.Errorf("got HardwareAddressSender = %s, want = %s", got, want)
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index fe499d47e..1de188738 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -150,24 +150,24 @@ func (*testObject) Wait() {}
 // WritePacket is called by network endpoints after producing a packet and
 // writing it to the link endpoint. This is used by the test object to verify
 // that the produced packet is as expected.
-func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
+func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
 	var prot tcpip.TransportProtocolNumber
 	var srcAddr tcpip.Address
 	var dstAddr tcpip.Address
 
 	if t.v4 {
-		h := header.IPv4(hdr.View())
+		h := header.IPv4(pkt.Header.View())
 		prot = tcpip.TransportProtocolNumber(h.Protocol())
 		srcAddr = h.SourceAddress()
 		dstAddr = h.DestinationAddress()
 
 	} else {
-		h := header.IPv6(hdr.View())
+		h := header.IPv6(pkt.Header.View())
 		prot = tcpip.TransportProtocolNumber(h.NextHeader())
 		srcAddr = h.SourceAddress()
 		dstAddr = h.DestinationAddress()
 	}
-	t.checkValues(prot, payload, srcAddr, dstAddr)
+	t.checkValues(prot, pkt.Data, srcAddr, dstAddr)
 	return nil
 }
 
@@ -239,7 +239,10 @@ func TestIPv4Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, hdr, payload.ToVectorisedView(), stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketOut); err != nil {
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketOut, tcpip.PacketBuffer{
+		Header: hdr,
+		Data:   payload.ToVectorisedView(),
+	}); err != nil {
 		t.Fatalf("WritePacket failed: %v", err)
 	}
 }
@@ -477,7 +480,10 @@ func TestIPv6Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, hdr, payload.ToVectorisedView(), stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketOut); err != nil {
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketOut, tcpip.PacketBuffer{
+		Header: hdr,
+		Data:   payload.ToVectorisedView(),
+	}); err != nil {
 		t.Fatalf("WritePacket failed: %v", err)
 	}
 }
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
index ce771631c..32bf39e43 100644
--- a/pkg/tcpip/network/ipv4/icmp.go
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -99,7 +99,11 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt tcpip.PacketBuffer) {
 		pkt.SetChecksum(0)
 		pkt.SetChecksum(^header.Checksum(pkt, header.ChecksumVV(vv, 0)))
 		sent := stats.ICMP.V4PacketsSent
-		if err := r.WritePacket(nil /* gso */, hdr, vv, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}); err != nil {
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+			Header:          hdr,
+			Data:            vv,
+			TransportHeader: buffer.View(pkt),
+		}); err != nil {
 			sent.Dropped.Increment()
 			return
 		}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index ac16c8add..040329a74 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -117,13 +117,14 @@ func (e *endpoint) GSOMaxSize() uint32 {
 }
 
 // writePacketFragments calls e.linkEP.WritePacket with each packet fragment to
-// write. It assumes that the IP header is entirely in hdr but does not assume
-// that only the IP header is in hdr. It assumes that the input packet's stated
-// length matches the length of the hdr+payload. mtu includes the IP header and
-// options. This does not support the DontFragment IP flag.
-func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, mtu int) *tcpip.Error {
+// write. It assumes that the IP header is entirely in pkt.Header but does not
+// assume that only the IP header is in pkt.Header. It assumes that the input
+// packet's stated length matches the length of the header+payload. mtu
+// includes the IP header and options. This does not support the DontFragment
+// IP flag.
+func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt tcpip.PacketBuffer) *tcpip.Error {
 	// This packet is too big, it needs to be fragmented.
-	ip := header.IPv4(hdr.View())
+	ip := header.IPv4(pkt.Header.View())
 	flags := ip.Flags()
 
 	// Update mtu to take into account the header, which will exist in all
@@ -137,62 +138,77 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, hdr buff
 
 	outerMTU := innerMTU + int(ip.HeaderLength())
 	offset := ip.FragmentOffset()
-	originalAvailableLength := hdr.AvailableLength()
+	originalAvailableLength := pkt.Header.AvailableLength()
 	for i := 0; i < n; i++ {
 		// Where possible, the first fragment that is sent has the same
-		// hdr.UsedLength() as the input packet. The link-layer endpoint may depends
-		// on this for looking at, eg, L4 headers.
+		// pkt.Header.UsedLength() as the input packet. The link-layer
+		// endpoint may depend on this for looking at, eg, L4 headers.
 		h := ip
 		if i > 0 {
-			hdr = buffer.NewPrependable(int(ip.HeaderLength()) + originalAvailableLength)
-			h = header.IPv4(hdr.Prepend(int(ip.HeaderLength())))
+			pkt.Header = buffer.NewPrependable(int(ip.HeaderLength()) + originalAvailableLength)
+			h = header.IPv4(pkt.Header.Prepend(int(ip.HeaderLength())))
 			copy(h, ip[:ip.HeaderLength()])
 		}
 		if i != n-1 {
 			h.SetTotalLength(uint16(outerMTU))
 			h.SetFlagsFragmentOffset(flags|header.IPv4FlagMoreFragments, offset)
 		} else {
-			h.SetTotalLength(uint16(h.HeaderLength()) + uint16(payload.Size()))
+			h.SetTotalLength(uint16(h.HeaderLength()) + uint16(pkt.Data.Size()))
 			h.SetFlagsFragmentOffset(flags, offset)
 		}
 		h.SetChecksum(0)
 		h.SetChecksum(^h.CalculateChecksum())
 		offset += uint16(innerMTU)
 		if i > 0 {
-			newPayload := payload.Clone([]buffer.View{})
+			newPayload := pkt.Data.Clone(nil)
 			newPayload.CapLength(innerMTU)
-			if err := e.linkEP.WritePacket(r, gso, hdr, newPayload, ProtocolNumber); err != nil {
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, tcpip.PacketBuffer{
+				Header:        pkt.Header,
+				Data:          newPayload,
+				NetworkHeader: buffer.View(h),
+			}); err != nil {
 				return err
 			}
 			r.Stats().IP.PacketsSent.Increment()
-			payload.TrimFront(newPayload.Size())
+			pkt.Data.TrimFront(newPayload.Size())
 			continue
 		}
-		// Special handling for the first fragment because it comes from the hdr.
-		if outerMTU >= hdr.UsedLength() {
-			// This fragment can fit all of hdr and possibly some of payload, too.
-			newPayload := payload.Clone([]buffer.View{})
-			newPayloadLength := outerMTU - hdr.UsedLength()
+		// Special handling for the first fragment because it comes
+		// from the header.
+		if outerMTU >= pkt.Header.UsedLength() {
+			// This fragment can fit all of pkt.Header and possibly
+			// some of pkt.Data, too.
+			newPayload := pkt.Data.Clone(nil)
+			newPayloadLength := outerMTU - pkt.Header.UsedLength()
 			newPayload.CapLength(newPayloadLength)
-			if err := e.linkEP.WritePacket(r, gso, hdr, newPayload, ProtocolNumber); err != nil {
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, tcpip.PacketBuffer{
+				Header:        pkt.Header,
+				Data:          newPayload,
+				NetworkHeader: buffer.View(h),
+			}); err != nil {
 				return err
 			}
 			r.Stats().IP.PacketsSent.Increment()
-			payload.TrimFront(newPayloadLength)
+			pkt.Data.TrimFront(newPayloadLength)
 		} else {
-			// The fragment is too small to fit all of hdr.
-			startOfHdr := hdr
-			startOfHdr.TrimBack(hdr.UsedLength() - outerMTU)
+			// The fragment is too small to fit all of pkt.Header.
+			startOfHdr := pkt.Header
+			startOfHdr.TrimBack(pkt.Header.UsedLength() - outerMTU)
 			emptyVV := buffer.NewVectorisedView(0, []buffer.View{})
-			if err := e.linkEP.WritePacket(r, gso, startOfHdr, emptyVV, ProtocolNumber); err != nil {
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, tcpip.PacketBuffer{
+				Header:        startOfHdr,
+				Data:          emptyVV,
+				NetworkHeader: buffer.View(h),
+			}); err != nil {
 				return err
 			}
 			r.Stats().IP.PacketsSent.Increment()
-			// Add the unused bytes of hdr into the payload that remains to be sent.
-			restOfHdr := hdr.View()[outerMTU:]
+			// Add the unused bytes of pkt.Header into the pkt.Data
+			// that remains to be sent.
+			restOfHdr := pkt.Header.View()[outerMTU:]
 			tmp := buffer.NewVectorisedView(len(restOfHdr), []buffer.View{buffer.NewViewFromBytes(restOfHdr)})
-			tmp.Append(payload)
-			payload = tmp
+			tmp.Append(pkt.Data)
+			pkt.Data = tmp
 		}
 	}
 	return nil
@@ -222,17 +238,17 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
-	ip := e.addIPHeader(r, &hdr, payload.Size(), params)
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
 
 	if loop&stack.PacketLoop != 0 {
-		views := make([]buffer.View, 1, 1+len(payload.Views()))
-		views[0] = hdr.View()
-		views = append(views, payload.Views()...)
+		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
+		views[0] = pkt.Header.View()
+		views = append(views, pkt.Data.Views()...)
 		loopedR := r.MakeLoopedRoute()
 
 		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
-			Data:          buffer.NewVectorisedView(len(views[0])+payload.Size(), views),
+			Data:          buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 			NetworkHeader: buffer.View(ip),
 		})
 
@@ -241,10 +257,10 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 	if loop&stack.PacketOut == 0 {
 		return nil
 	}
-	if hdr.UsedLength()+payload.Size() > int(e.linkEP.MTU()) && (gso == nil || gso.Type == stack.GSONone) {
-		return e.writePacketFragments(r, gso, hdr, payload, int(e.linkEP.MTU()))
+	if pkt.Header.UsedLength()+pkt.Data.Size() > int(e.linkEP.MTU()) && (gso == nil || gso.Type == stack.GSONone) {
+		return e.writePacketFragments(r, gso, int(e.linkEP.MTU()), pkt)
 	}
-	if err := e.linkEP.WritePacket(r, gso, hdr, payload, ProtocolNumber); err != nil {
+	if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
 		return err
 	}
 	r.Stats().IP.PacketsSent.Increment()
@@ -270,16 +286,16 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.Pac
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
 	// The packet already has an IP header, but there are a few required
 	// checks.
-	ip := header.IPv4(payload.First())
-	if !ip.IsValid(payload.Size()) {
+	ip := header.IPv4(pkt.Data.First())
+	if !ip.IsValid(pkt.Data.Size()) {
 		return tcpip.ErrInvalidOptionValue
 	}
 
 	// Always set the total length.
-	ip.SetTotalLength(uint16(payload.Size()))
+	ip.SetTotalLength(uint16(pkt.Data.Size()))
 
 	// Set the source address when zero.
 	if ip.SourceAddress() == tcpip.Address(([]byte{0, 0, 0, 0})) {
@@ -293,7 +309,7 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.Vect
 	// Set the packet ID when zero.
 	if ip.ID() == 0 {
 		id := uint32(0)
-		if payload.Size() > header.IPv4MaximumHeaderSize+8 {
+		if pkt.Data.Size() > header.IPv4MaximumHeaderSize+8 {
 			// Packets of 68 bytes or less are required by RFC 791 to not be
 			// fragmented, so we only assign ids to larger packets.
 			id = atomic.AddUint32(&e.protocol.ids[hashRoute(r, 0 /* protocol */, e.protocol.hashIV)%buckets], 1)
@@ -306,18 +322,18 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.Vect
 	ip.SetChecksum(^ip.CalculateChecksum())
 
 	if loop&stack.PacketLoop != 0 {
-		e.HandlePacket(r, tcpip.PacketBuffer{
-			Data:          payload,
-			NetworkHeader: buffer.View(ip),
-		})
+		e.HandlePacket(r, pkt.Clone())
 	}
 	if loop&stack.PacketOut == 0 {
 		return nil
 	}
 
-	hdr := buffer.NewPrependableFromView(payload.ToView())
 	r.Stats().IP.PacketsSent.Increment()
-	return e.linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber)
+
+	ip = ip[:ip.HeaderLength()]
+	pkt.Header = buffer.NewPrependableFromView(buffer.View(ip))
+	pkt.Data.TrimFront(int(ip.HeaderLength()))
+	return e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt)
 }
 
 // HandlePacket is called by the link layer when new ipv4 packets arrive for
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index 01dfb5f20..e900f1b45 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -113,12 +113,12 @@ func makeHdrAndPayload(hdrLength int, extraLength int, viewSizes []int) (buffer.
 
 // comparePayloads compared the contents of all the packets against the contents
 // of the source packet.
-func compareFragments(t *testing.T, packets []packetInfo, sourcePacketInfo packetInfo, mtu uint32) {
+func compareFragments(t *testing.T, packets []tcpip.PacketBuffer, sourcePacketInfo tcpip.PacketBuffer, mtu uint32) {
 	t.Helper()
 	// Make a complete array of the sourcePacketInfo packet.
 	source := header.IPv4(packets[0].Header.View()[:header.IPv4MinimumSize])
 	source = append(source, sourcePacketInfo.Header.View()...)
-	source = append(source, sourcePacketInfo.Payload.ToView()...)
+	source = append(source, sourcePacketInfo.Data.ToView()...)
 
 	// Make a copy of the IP header, which will be modified in some fields to make
 	// an expected header.
@@ -132,7 +132,7 @@ func compareFragments(t *testing.T, packets []packetInfo, sourcePacketInfo packe
 	for i, packet := range packets {
 		// Confirm that the packet is valid.
 		allBytes := packet.Header.View().ToVectorisedView()
-		allBytes.Append(packet.Payload)
+		allBytes.Append(packet.Data)
 		ip := header.IPv4(allBytes.ToView())
 		if !ip.IsValid(len(ip)) {
 			t.Errorf("IP packet is invalid:\n%s", hex.Dump(ip))
@@ -173,7 +173,7 @@ func compareFragments(t *testing.T, packets []packetInfo, sourcePacketInfo packe
 
 type errorChannel struct {
 	*channel.Endpoint
-	Ch                    chan packetInfo
+	Ch                    chan tcpip.PacketBuffer
 	packetCollectorErrors []*tcpip.Error
 }
 
@@ -183,17 +183,11 @@ type errorChannel struct {
 func newErrorChannel(size int, mtu uint32, linkAddr tcpip.LinkAddress, packetCollectorErrors []*tcpip.Error) *errorChannel {
 	return &errorChannel{
 		Endpoint:              channel.New(size, mtu, linkAddr),
-		Ch:                    make(chan packetInfo, size),
+		Ch:                    make(chan tcpip.PacketBuffer, size),
 		packetCollectorErrors: packetCollectorErrors,
 	}
 }
 
-// packetInfo holds all the information about an outbound packet.
-type packetInfo struct {
-	Header  buffer.Prependable
-	Payload buffer.VectorisedView
-}
-
 // Drain removes all outbound packets from the channel and counts them.
 func (e *errorChannel) Drain() int {
 	c := 0
@@ -208,14 +202,9 @@ func (e *errorChannel) Drain() int {
 }
 
 // WritePacket stores outbound packets into the channel.
-func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
-	p := packetInfo{
-		Header:  hdr,
-		Payload: payload,
-	}
-
+func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
 	select {
-	case e.Ch <- p:
+	case e.Ch <- pkt:
 	default:
 	}
 
@@ -292,18 +281,21 @@ func TestFragmentation(t *testing.T) {
 	for _, ft := range fragTests {
 		t.Run(ft.description, func(t *testing.T) {
 			hdr, payload := makeHdrAndPayload(ft.hdrLength, ft.extraLength, ft.payloadViewsSizes)
-			source := packetInfo{
+			source := tcpip.PacketBuffer{
 				Header: hdr,
 				// Save the source payload because WritePacket will modify it.
-				Payload: payload.Clone([]buffer.View{}),
+				Data: payload.Clone(nil),
 			}
 			c := buildContext(t, nil, ft.mtu)
-			err := c.Route.WritePacket(ft.gso, hdr, payload, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS})
+			err := c.Route.WritePacket(ft.gso, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+				Header: hdr,
+				Data:   payload,
+			})
 			if err != nil {
 				t.Errorf("err got %v, want %v", err, nil)
 			}
 
-			var results []packetInfo
+			var results []tcpip.PacketBuffer
 		L:
 			for {
 				select {
@@ -345,7 +337,10 @@ func TestFragmentationErrors(t *testing.T) {
 		t.Run(ft.description, func(t *testing.T) {
 			hdr, payload := makeHdrAndPayload(ft.hdrLength, header.IPv4MinimumSize, ft.payloadViewsSizes)
 			c := buildContext(t, ft.packetCollectorErrors, ft.mtu)
-			err := c.Route.WritePacket(&stack.GSO{}, hdr, payload, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS})
+			err := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+				Header: hdr,
+				Data:   payload,
+			})
 			for i := 0; i < len(ft.packetCollectorErrors)-1; i++ {
 				if got, want := ft.packetCollectorErrors[i], (*tcpip.Error)(nil); got != want {
 					t.Errorf("ft.packetCollectorErrors[%d] got %v, want %v", i, got, want)
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 6629951c6..1c3410618 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -226,7 +226,9 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		//
 		// The IP Hop Limit field has a value of 255, i.e., the packet
 		// could not possibly have been forwarded by a router.
-		if err := r.WritePacket(nil /* gso */, hdr, buffer.VectorisedView{}, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}); err != nil {
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+			Header: hdr,
+		}); err != nil {
 			sent.Dropped.Increment()
 			return
 		}
@@ -291,7 +293,10 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		copy(packet, h)
 		packet.SetType(header.ICMPv6EchoReply)
 		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data))
-		if err := r.WritePacket(nil /* gso */, hdr, pkt.Data, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}); err != nil {
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+			Header: hdr,
+			Data:   pkt.Data,
+		}); err != nil {
 			sent.Dropped.Increment()
 			return
 		}
@@ -417,7 +422,9 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 	})
 
 	// TODO(stijlist): count this in ICMP stats.
-	return linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber)
+	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, tcpip.PacketBuffer{
+		Header: hdr,
+	})
 }
 
 // ResolveStaticAddress implements stack.LinkAddressResolver.
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 6037a1ef8..335f634d5 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -55,7 +55,7 @@ func (*stubLinkEndpoint) LinkAddress() tcpip.LinkAddress {
 	return ""
 }
 
-func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, buffer.Prependable, buffer.VectorisedView, tcpip.NetworkProtocolNumber) *tcpip.Error {
+func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, tcpip.PacketBuffer) *tcpip.Error {
 	return nil
 }
 
@@ -276,22 +276,22 @@ type routeArgs struct {
 func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.ICMPv6)) {
 	t.Helper()
 
-	pkt := <-args.src.C
+	pi := <-args.src.C
 
 	{
-		views := []buffer.View{pkt.Header, pkt.Payload}
-		size := len(pkt.Header) + len(pkt.Payload)
+		views := []buffer.View{pi.Pkt.Header.View(), pi.Pkt.Data.ToView()}
+		size := pi.Pkt.Header.UsedLength() + pi.Pkt.Data.Size()
 		vv := buffer.NewVectorisedView(size, views)
-		args.dst.InjectLinkAddr(pkt.Proto, args.dst.LinkAddress(), tcpip.PacketBuffer{
+		args.dst.InjectLinkAddr(pi.Proto, args.dst.LinkAddress(), tcpip.PacketBuffer{
 			Data: vv,
 		})
 	}
 
-	if pkt.Proto != ProtocolNumber {
-		t.Errorf("unexpected protocol number %d", pkt.Proto)
+	if pi.Proto != ProtocolNumber {
+		t.Errorf("unexpected protocol number %d", pi.Proto)
 		return
 	}
-	ipv6 := header.IPv6(pkt.Header)
+	ipv6 := header.IPv6(pi.Pkt.Header.View())
 	transProto := tcpip.TransportProtocolNumber(ipv6.NextHeader())
 	if transProto != header.ICMPv6ProtocolNumber {
 		t.Errorf("unexpected transport protocol number %d", transProto)
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 4cee848a1..8d1578ed9 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -112,17 +112,17 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
-	ip := e.addIPHeader(r, &hdr, payload.Size(), params)
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
 
 	if loop&stack.PacketLoop != 0 {
-		views := make([]buffer.View, 1, 1+len(payload.Views()))
-		views[0] = hdr.View()
-		views = append(views, payload.Views()...)
+		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
+		views[0] = pkt.Header.View()
+		views = append(views, pkt.Data.Views()...)
 		loopedR := r.MakeLoopedRoute()
 
 		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
-			Data:          buffer.NewVectorisedView(len(views[0])+payload.Size(), views),
+			Data:          buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 			NetworkHeader: buffer.View(ip),
 		})
 
@@ -133,7 +133,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 	}
 
 	r.Stats().IP.PacketsSent.Increment()
-	return e.linkEP.WritePacket(r, gso, hdr, payload, ProtocolNumber)
+	return e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt)
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
@@ -158,7 +158,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.Pac
 
 // WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
 // supported by IPv6.
-func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error {
+func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
 	// TODO(b/119580726): Support IPv6 header-included packets.
 	return tcpip.ErrNotSupported
 }
diff --git a/pkg/tcpip/packet_buffer.go b/pkg/tcpip/packet_buffer.go
index 10b04239d..695f7b188 100644
--- a/pkg/tcpip/packet_buffer.go
+++ b/pkg/tcpip/packet_buffer.go
@@ -31,12 +31,19 @@ type PacketBuffer struct {
 	// or otherwise modified.
 	Data buffer.VectorisedView
 
+	// Header holds the headers of outbound packets. As a packet is passed
+	// down the stack, each layer adds to Header.
+	Header buffer.Prependable
+
+	// These fields are used by both inbound and outbound packets. They
+	// typically overlap with the Data and Header fields.
+	//
 	// The bytes backing these views are immutable. Each field may be nil
 	// if either it has not been set yet or no such header exists (e.g.
 	// packets sent via loopback may not have a link header).
 	//
-	// These fields may be Views into other Views. SR dosen't support this,
-	// so deep copies are necessary in some cases.
+	// These fields may be Views into other slices (either Data or Header).
+	// SR dosen't support this, so deep copies are necessary in some cases.
 	LinkHeader      buffer.View
 	NetworkHeader   buffer.View
 	TransportHeader buffer.View
@@ -44,11 +51,9 @@ type PacketBuffer struct {
 
 // Clone makes a copy of pk. It clones the Data field, which creates a new
 // VectorisedView but does not deep copy the underlying bytes.
+//
+// Clone also does not deep copy any of its other fields.
 func (pk PacketBuffer) Clone() PacketBuffer {
-	return PacketBuffer{
-		Data:            pk.Data.Clone(nil),
-		LinkHeader:      pk.LinkHeader,
-		NetworkHeader:   pk.NetworkHeader,
-		TransportHeader: pk.TransportHeader,
-	}
+	pk.Data = pk.Data.Clone(nil)
+	return pk
 }
diff --git a/pkg/tcpip/packet_buffer_state.go b/pkg/tcpip/packet_buffer_state.go
index 04c4cf136..ad3cc24fa 100644
--- a/pkg/tcpip/packet_buffer_state.go
+++ b/pkg/tcpip/packet_buffer_state.go
@@ -20,6 +20,7 @@ import "gvisor.dev/gvisor/pkg/tcpip/buffer"
 func (pk *PacketBuffer) beforeSave() {
 	// Non-Data fields may be slices of the Data field. This causes
 	// problems for SR, so during save we make each header independent.
+	pk.Header = pk.Header.DeepCopy()
 	pk.LinkHeader = append(buffer.View(nil), pk.LinkHeader...)
 	pk.NetworkHeader = append(buffer.View(nil), pk.NetworkHeader...)
 	pk.TransportHeader = append(buffer.View(nil), pk.TransportHeader...)
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 8357dca77..cfdd0496e 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -421,7 +421,9 @@ func (ndp *ndpState) doDuplicateAddressDetection(addr tcpip.Address, remaining u
 	pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
 	sent := r.Stats().ICMP.V6PacketsSent
-	if err := r.WritePacket(nil, hdr, buffer.VectorisedView{}, NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: DefaultTOS}); err != nil {
+	if err := r.WritePacket(nil, NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: DefaultTOS}, tcpip.PacketBuffer{
+		Header: hdr,
+	}); err != nil {
 		sent.Dropped.Increment()
 		return false, err
 	}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 494244368..5b901f947 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -344,7 +344,7 @@ func TestDADResolve(t *testing.T) {
 				}
 
 				// Check NDP packet.
-				checker.IPv6(t, p.Header.ToVectorisedView().First(),
+				checker.IPv6(t, p.Pkt.Header.View().ToVectorisedView().First(),
 					checker.TTL(header.NDPHopLimit),
 					checker.NDPNS(
 						checker.NDPNSTargetAddress(addr1)))
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 9ed9e1e7c..3f8d7312c 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -812,15 +812,15 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 		} else {
 			// n doesn't have a destination endpoint.
 			// Send the packet out of n.
-			hdr := buffer.NewPrependableFromView(pkt.Data.First())
+			pkt.Header = buffer.NewPrependableFromView(pkt.Data.First())
 			pkt.Data.RemoveFirst()
 
 			// TODO(b/128629022): use route.WritePacket.
-			if err := n.linkEP.WritePacket(&r, nil /* gso */, hdr, pkt.Data, protocol); err != nil {
+			if err := n.linkEP.WritePacket(&r, nil /* gso */, protocol, pkt); err != nil {
 				r.Stats().IP.OutgoingPacketErrors.Increment()
 			} else {
 				n.stats.Tx.Packets.Increment()
-				n.stats.Tx.Bytes.IncrementBy(uint64(hdr.UsedLength() + pkt.Data.Size()))
+				n.stats.Tx.Bytes.IncrementBy(uint64(pkt.Header.UsedLength() + pkt.Data.Size()))
 			}
 		}
 		return
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index c0026f5a3..7fd4e4a65 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -232,8 +232,9 @@ type NetworkEndpoint interface {
 	MaxHeaderLength() uint16
 
 	// WritePacket writes a packet to the given destination address and
-	// protocol.
-	WritePacket(r *Route, gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params NetworkHeaderParams, loop PacketLooping) *tcpip.Error
+	// protocol. It sets pkt.NetworkHeader. pkt.TransportHeader must have
+	// already been set.
+	WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, loop PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error
 
 	// WritePackets writes packets to the given destination address and
 	// protocol.
@@ -241,7 +242,7 @@ type NetworkEndpoint interface {
 
 	// WriteHeaderIncludedPacket writes a packet that includes a network
 	// header to the given destination address.
-	WriteHeaderIncludedPacket(r *Route, payload buffer.VectorisedView, loop PacketLooping) *tcpip.Error
+	WriteHeaderIncludedPacket(r *Route, loop PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error
 
 	// ID returns the network protocol endpoint ID.
 	ID() *NetworkEndpointID
@@ -361,13 +362,15 @@ type LinkEndpoint interface {
 	// link endpoint.
 	LinkAddress() tcpip.LinkAddress
 
-	// WritePacket writes a packet with the given protocol through the given
-	// route.
+	// WritePacket writes a packet with the given protocol through the
+	// given route. It sets pkt.LinkHeader if a link layer header exists.
+	// pkt.NetworkHeader and pkt.TransportHeader must have already been
+	// set.
 	//
 	// To participate in transparent bridging, a LinkEndpoint implementation
 	// should call eth.Encode with header.EthernetFields.SrcAddr set to
 	// r.LocalLinkAddress if it is provided.
-	WritePacket(r *Route, gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error
+	WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error
 
 	// WritePackets writes packets with the given protocol through the
 	// given route.
@@ -379,7 +382,7 @@ type LinkEndpoint interface {
 
 	// WriteRawPacket writes a packet directly to the link. The packet
 	// should already have an ethernet header.
-	WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error
+	WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error
 
 	// Attach attaches the data link layer endpoint to the network-layer
 	// dispatcher of the stack.
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 1a0a51b57..617f5a57c 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -154,17 +154,17 @@ func (r *Route) IsResolutionRequired() bool {
 }
 
 // WritePacket writes the packet through the given route.
-func (r *Route) WritePacket(gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params NetworkHeaderParams) *tcpip.Error {
+func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
 	if !r.ref.isValidForOutgoing() {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	err := r.ref.ep.WritePacket(r, gso, hdr, payload, params, r.Loop)
+	err := r.ref.ep.WritePacket(r, gso, params, r.Loop, pkt)
 	if err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 	} else {
 		r.ref.nic.stats.Tx.Packets.Increment()
-		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(hdr.UsedLength() + payload.Size()))
+		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(pkt.Header.UsedLength() + pkt.Data.Size()))
 	}
 	return err
 }
@@ -209,17 +209,17 @@ func (r *Route) WritePackets(gso *GSO, hdrs []PacketDescriptor, payload buffer.V
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
-func (r *Route) WriteHeaderIncludedPacket(payload buffer.VectorisedView) *tcpip.Error {
+func (r *Route) WriteHeaderIncludedPacket(pkt tcpip.PacketBuffer) *tcpip.Error {
 	if !r.ref.isValidForOutgoing() {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	if err := r.ref.ep.WriteHeaderIncludedPacket(r, payload, r.Loop); err != nil {
+	if err := r.ref.ep.WriteHeaderIncludedPacket(r, r.Loop, pkt); err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 		return err
 	}
 	r.ref.nic.stats.Tx.Packets.Increment()
-	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(payload.Size()))
+	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(pkt.Data.Size()))
 	return nil
 }
 
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 2f8d8e822..0e88643a4 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -1366,10 +1366,10 @@ func (s *Stack) WritePacket(nicID tcpip.NICID, dst tcpip.LinkAddress, netProto t
 	}
 	fakeHeader := make(header.Ethernet, header.EthernetMinimumSize)
 	fakeHeader.Encode(&ethFields)
-	ethHeader := buffer.View(fakeHeader).ToVectorisedView()
-	ethHeader.Append(payload)
+	vv := buffer.View(fakeHeader).ToVectorisedView()
+	vv.Append(payload)
 
-	if err := nic.linkEP.WriteRawPacket(ethHeader); err != nil {
+	if err := nic.linkEP.WriteRawPacket(vv); err != nil {
 		return err
 	}
 
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index bf1d6974c..f979e2b1a 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -122,31 +122,30 @@ func (f *fakeNetworkEndpoint) Capabilities() stack.LinkEndpointCapabilities {
 	return f.ep.Capabilities()
 }
 
-func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
+func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
 	// Increment the sent packet count in the protocol descriptor.
 	f.proto.sendPacketCount[int(r.RemoteAddress[0])%len(f.proto.sendPacketCount)]++
 
 	// Add the protocol's header to the packet and send it to the link
 	// endpoint.
-	b := hdr.Prepend(fakeNetHeaderLen)
+	b := pkt.Header.Prepend(fakeNetHeaderLen)
 	b[0] = r.RemoteAddress[0]
 	b[1] = f.id.LocalAddress[0]
 	b[2] = byte(params.Protocol)
 
 	if loop&stack.PacketLoop != 0 {
-		views := make([]buffer.View, 1, 1+len(payload.Views()))
-		views[0] = hdr.View()
-		views = append(views, payload.Views()...)
-		vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views)
+		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
+		views[0] = pkt.Header.View()
+		views = append(views, pkt.Data.Views()...)
 		f.HandlePacket(r, tcpip.PacketBuffer{
-			Data: vv,
+			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 		})
 	}
 	if loop&stack.PacketOut == 0 {
 		return nil
 	}
 
-	return f.ep.WritePacket(r, gso, hdr, payload, fakeNetNumber)
+	return f.ep.WritePacket(r, gso, fakeNetNumber, pkt)
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
@@ -154,7 +153,7 @@ func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs
 	panic("not implemented")
 }
 
-func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error {
+func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
@@ -330,7 +329,10 @@ func sendTo(s *stack.Stack, addr tcpip.Address, payload buffer.View) *tcpip.Erro
 
 func send(r stack.Route, payload buffer.View) *tcpip.Error {
 	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()))
-	return r.WritePacket(nil /* gso */, hdr, payload.ToVectorisedView(), stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS})
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		Header: hdr,
+		Data:   payload.ToVectorisedView(),
+	})
 }
 
 func testSendTo(t *testing.T, s *stack.Stack, addr tcpip.Address, ep *channel.Endpoint, payload buffer.View) {
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 2cacea99a..748ce4ea5 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -83,7 +83,10 @@ func (f *fakeTransportEndpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions
 	if err != nil {
 		return 0, nil, err
 	}
-	if err := f.route.WritePacket(nil /* gso */, hdr, buffer.View(v).ToVectorisedView(), stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}); err != nil {
+	if err := f.route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		Header: hdr,
+		Data:   buffer.View(v).ToVectorisedView(),
+	}); err != nil {
 		return 0, nil, err
 	}
 
@@ -617,10 +620,10 @@ func TestTransportForwarding(t *testing.T) {
 		t.Fatal("Response packet not forwarded")
 	}
 
-	if dst := p.Header[0]; dst != 3 {
+	if dst := p.Pkt.Header.View()[0]; dst != 3 {
 		t.Errorf("Response packet has incorrect destination addresss: got = %d, want = 3", dst)
 	}
-	if src := p.Header[1]; src != 1 {
+	if src := p.Pkt.Header.View()[1]; src != 1 {
 		t.Errorf("Response packet has incorrect source addresss: got = %d, want = 3", src)
 	}
 }
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 70e008d36..9c40931b5 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -429,7 +429,11 @@ func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	return r.WritePacket(nil /* gso */, hdr, data.ToVectorisedView(), stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS})
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		Header:          hdr,
+		Data:            data.ToVectorisedView(),
+		TransportHeader: buffer.View(icmpv4),
+	})
 }
 
 func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Error {
@@ -455,7 +459,11 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	return r.WritePacket(nil /* gso */, hdr, dataVV, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS})
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		Header:          hdr,
+		Data:            dataVV,
+		TransportHeader: buffer.View(icmpv6),
+	})
 }
 
 func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 230a1537a..5aafe2615 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -338,13 +338,18 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64,
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
 		if !e.associated {
-			if err := route.WriteHeaderIncludedPacket(buffer.View(payloadBytes).ToVectorisedView()); err != nil {
+			if err := route.WriteHeaderIncludedPacket(tcpip.PacketBuffer{
+				Data: buffer.View(payloadBytes).ToVectorisedView(),
+			}); err != nil {
 				return 0, nil, err
 			}
 			break
 		}
 		hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength()))
-		if err := route.WritePacket(nil /* gso */, hdr, buffer.View(payloadBytes).ToVectorisedView(), stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}); err != nil {
+		if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+			Header: hdr,
+			Data:   buffer.View(payloadBytes).ToVectorisedView(),
+		}); err != nil {
 			return 0, nil, err
 		}
 
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index be066d877..49f2b9685 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -723,7 +723,10 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	if err := r.WritePacket(gso, d.Hdr, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}); err != nil {
+	if err := r.WritePacket(gso, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, tcpip.PacketBuffer{
+		Header: d.Hdr,
+		Data:   data,
+	}); err != nil {
 		r.Stats().TCP.SegmentSendErrors.Increment()
 		return err
 	}
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 0a733fa94..04fdaaed1 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -236,9 +236,9 @@ func (c *Context) GetPacket() []byte {
 		if p.Proto != ipv4.ProtocolNumber {
 			c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber)
 		}
-		b := make([]byte, len(p.Header)+len(p.Payload))
-		copy(b, p.Header)
-		copy(b[len(p.Header):], p.Payload)
+
+		hdr := p.Pkt.Header.View()
+		b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...)
 
 		if p.GSO != nil && p.GSO.L3HdrLen != header.IPv4MinimumSize {
 			c.t.Errorf("L3HdrLen %v (expected %v)", p.GSO.L3HdrLen, header.IPv4MinimumSize)
@@ -264,9 +264,9 @@ func (c *Context) GetPacketNonBlocking() []byte {
 		if p.Proto != ipv4.ProtocolNumber {
 			c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber)
 		}
-		b := make([]byte, len(p.Header)+len(p.Payload))
-		copy(b, p.Header)
-		copy(b[len(p.Header):], p.Payload)
+
+		hdr := p.Pkt.Header.View()
+		b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...)
 
 		checker.IPv4(c.t, b, checker.SrcAddr(StackAddr), checker.DstAddr(TestAddr))
 		return b
@@ -488,9 +488,9 @@ func (c *Context) GetV6Packet() []byte {
 		if p.Proto != ipv6.ProtocolNumber {
 			c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv6.ProtocolNumber)
 		}
-		b := make([]byte, len(p.Header)+len(p.Payload))
-		copy(b, p.Header)
-		copy(b[len(p.Header):], p.Payload)
+		b := make([]byte, p.Pkt.Header.UsedLength()+p.Pkt.Data.Size())
+		copy(b, p.Pkt.Header.View())
+		copy(b[p.Pkt.Header.UsedLength():], p.Pkt.Data.ToView())
 
 		checker.IPv6(c.t, b, checker.SrcAddr(StackV6Addr), checker.DstAddr(TestV6Addr))
 		return b
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index dda7af910..2d97d1398 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -817,7 +817,10 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 	if useDefaultTTL {
 		ttl = r.DefaultTTL()
 	}
-	if err := r.WritePacket(nil /* gso */, hdr, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}); err != nil {
+	if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, tcpip.PacketBuffer{
+		Header: hdr,
+		Data:   data,
+	}); err != nil {
 		r.Stats().UDP.PacketSendErrors.Increment()
 		return err
 	}
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 43f11b700..259c3072a 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -135,7 +135,10 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		pkt.SetType(header.ICMPv4DstUnreachable)
 		pkt.SetCode(header.ICMPv4PortUnreachable)
 		pkt.SetChecksum(header.ICMPv4Checksum(pkt, payload))
-		r.WritePacket(nil /* gso */, hdr, payload, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS})
+		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+			Header: hdr,
+			Data:   payload,
+		})
 
 	case header.IPv6AddressSize:
 		if !r.Stack().AllowICMPMessage() {
@@ -169,7 +172,10 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		pkt.SetType(header.ICMPv6DstUnreachable)
 		pkt.SetCode(header.ICMPv6PortUnreachable)
 		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, payload))
-		r.WritePacket(nil /* gso */, hdr, payload, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS})
+		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+			Header: hdr,
+			Data:   payload,
+		})
 	}
 	return true
 }
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 30ee9801b..7051a7a9c 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -356,9 +356,9 @@ func (c *testContext) getPacketAndVerify(flow testFlow, checkers ...checker.Netw
 		if p.Proto != flow.netProto() {
 			c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, flow.netProto())
 		}
-		b := make([]byte, len(p.Header)+len(p.Payload))
-		copy(b, p.Header)
-		copy(b[len(p.Header):], p.Payload)
+
+		hdr := p.Pkt.Header.View()
+		b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...)
 
 		h := flow.header4Tuple(outgoing)
 		checkers := append(
@@ -1453,8 +1453,8 @@ func TestV4UnknownDestination(t *testing.T) {
 			select {
 			case p := <-c.linkEP.C:
 				var pkt []byte
-				pkt = append(pkt, p.Header...)
-				pkt = append(pkt, p.Payload...)
+				pkt = append(pkt, p.Pkt.Header.View()...)
+				pkt = append(pkt, p.Pkt.Data.ToView()...)
 				if got, want := len(pkt), header.IPv4MinimumProcessableDatagramSize; got > want {
 					t.Fatalf("got an ICMP packet of size: %d, want: sz <= %d", got, want)
 				}
@@ -1527,8 +1527,8 @@ func TestV6UnknownDestination(t *testing.T) {
 			select {
 			case p := <-c.linkEP.C:
 				var pkt []byte
-				pkt = append(pkt, p.Header...)
-				pkt = append(pkt, p.Payload...)
+				pkt = append(pkt, p.Pkt.Header.View()...)
+				pkt = append(pkt, p.Pkt.Data.ToView()...)
 				if got, want := len(pkt), header.IPv6MinimumMTU; got > want {
 					t.Fatalf("got an ICMP packet of size: %d, want: sz <= %d", got, want)
 				}
-- 
cgit v1.2.3


From 9ca15dbf14ac6318a34540354020b8a71d789077 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 14 Nov 2019 14:03:24 -0800
Subject: Avoid unnecessary slice allocation in
 usermem.BytesIO.blocksFromAddrRanges().

PiperOrigin-RevId: 280507239
---
 pkg/sentry/usermem/bytes_io.go | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/pkg/sentry/usermem/bytes_io.go b/pkg/sentry/usermem/bytes_io.go
index 8d88396ba..7898851b3 100644
--- a/pkg/sentry/usermem/bytes_io.go
+++ b/pkg/sentry/usermem/bytes_io.go
@@ -102,19 +102,34 @@ func (b *BytesIO) rangeCheck(addr Addr, length int) (int, error) {
 }
 
 func (b *BytesIO) blocksFromAddrRanges(ars AddrRangeSeq) (safemem.BlockSeq, error) {
-	blocks := make([]safemem.Block, 0, ars.NumRanges())
-	for !ars.IsEmpty() {
-		ar := ars.Head()
-		n, err := b.rangeCheck(ar.Start, int(ar.Length()))
-		if n != 0 {
-			blocks = append(blocks, safemem.BlockFromSafeSlice(b.Bytes[int(ar.Start):int(ar.Start)+n]))
+	switch ars.NumRanges() {
+	case 0:
+		return safemem.BlockSeq{}, nil
+	case 1:
+		block, err := b.blockFromAddrRange(ars.Head())
+		return safemem.BlockSeqOf(block), err
+	default:
+		blocks := make([]safemem.Block, 0, ars.NumRanges())
+		for !ars.IsEmpty() {
+			block, err := b.blockFromAddrRange(ars.Head())
+			if block.Len() != 0 {
+				blocks = append(blocks, block)
+			}
+			if err != nil {
+				return safemem.BlockSeqFromSlice(blocks), err
+			}
+			ars = ars.Tail()
 		}
-		if err != nil {
-			return safemem.BlockSeqFromSlice(blocks), err
-		}
-		ars = ars.Tail()
+		return safemem.BlockSeqFromSlice(blocks), nil
+	}
+}
+
+func (b *BytesIO) blockFromAddrRange(ar AddrRange) (safemem.Block, error) {
+	n, err := b.rangeCheck(ar.Start, int(ar.Length()))
+	if n == 0 {
+		return safemem.Block{}, err
 	}
-	return safemem.BlockSeqFromSlice(blocks), nil
+	return safemem.BlockFromSafeSlice(b.Bytes[int(ar.Start) : int(ar.Start)+n]), err
 }
 
 // BytesIOSequence returns an IOSequence representing the given byte slice.
-- 
cgit v1.2.3


From 1e1f5ce08210af6211bcb1c8da293a63a79165fe Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 14 Nov 2019 15:04:25 -0800
Subject: Allow all runtime tests for a language to be run via a single
 command.

This was intended behavior per the README, but running tests without the --test
flag caused an error. Users can now omit the --test flag to run every test for a
runtime.

PiperOrigin-RevId: 280522025
---
 test/runtimes/images/proctor/proctor.go | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/test/runtimes/images/proctor/proctor.go b/test/runtimes/images/proctor/proctor.go
index e6178e82b..b54abe434 100644
--- a/test/runtimes/images/proctor/proctor.go
+++ b/test/runtimes/images/proctor/proctor.go
@@ -39,10 +39,10 @@ type TestRunner interface {
 }
 
 var (
-	runtime = flag.String("runtime", "", "name of runtime")
-	list    = flag.Bool("list", false, "list all available tests")
-	test    = flag.String("test", "", "run a single test from the list of available tests")
-	pause   = flag.Bool("pause", false, "cause container to pause indefinitely, reaping any zombie children")
+	runtime  = flag.String("runtime", "", "name of runtime")
+	list     = flag.Bool("list", false, "list all available tests")
+	testName = flag.String("test", "", "run a single test from the list of available tests")
+	pause    = flag.Bool("pause", false, "cause container to pause indefinitely, reaping any zombie children")
 )
 
 func main() {
@@ -74,14 +74,23 @@ func main() {
 		return
 	}
 
-	// Run a single test.
-	if *test == "" {
-		log.Fatalf("test flag must be provided")
+	var tests []string
+	if *testName == "" {
+		// Run every test.
+		tests, err = tr.ListTests()
+		if err != nil {
+			log.Fatalf("failed to get all tests: %v", err)
+		}
+	} else {
+		// Run a single test.
+		tests = []string{*testName}
 	}
-	cmd := tr.TestCmd(*test)
-	cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
-	if err := cmd.Run(); err != nil {
-		log.Fatalf("FAIL: %v", err)
+	for _, test := range tests {
+		cmd := tr.TestCmd(test)
+		cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
+		if err := cmd.Run(); err != nil {
+			log.Fatalf("FAIL: %v", err)
+		}
 	}
 }
 
-- 
cgit v1.2.3


From 339536de5eefe782813aabae4aeeb312b3c4dde7 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 14 Nov 2019 15:55:07 -0800
Subject: Check that a file is a regular file with open(O_TRUNC).

It was possible to panic the sentry by opening a cache revalidating folder with
O_TRUNC|O_CREAT.

Avoids breaking php tests.

PiperOrigin-RevId: 280533213
---
 pkg/sentry/fs/inode.go                |  4 ++++
 pkg/sentry/fs/tty/master.go           |  1 +
 pkg/sentry/fs/tty/slave.go            |  1 +
 pkg/sentry/syscalls/linux/sys_file.go |  9 +++++----
 test/syscalls/linux/open.cc           | 22 ++++++++++++++++++++++
 test/syscalls/linux/open_create.cc    | 24 ++++++++++++++++++++++++
 test/syscalls/linux/pty.cc            | 20 +++++++++++++++++++-
 test/util/pty_util.cc                 | 10 +++++++++-
 test/util/pty_util.h                  |  3 +++
 9 files changed, 88 insertions(+), 6 deletions(-)

diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index f4ddfa406..2d43dff1d 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -344,6 +344,10 @@ func (i *Inode) SetTimestamps(ctx context.Context, d *Dirent, ts TimeSpec) error
 
 // Truncate calls i.InodeOperations.Truncate with i as the Inode.
 func (i *Inode) Truncate(ctx context.Context, d *Dirent, size int64) error {
+	if IsDir(i.StableAttr) {
+		return syserror.EISDIR
+	}
+
 	if i.overlay != nil {
 		return overlayTruncate(ctx, i.overlay, d, size)
 	}
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index 19b7557d5..bc56be696 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -32,6 +32,7 @@ import (
 // +stateify savable
 type masterInodeOperations struct {
 	fsutil.SimpleFileInode
+	fsutil.InodeNoopTruncate
 
 	// d is the containing dir.
 	d *dirInodeOperations
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index 944c4ada1..4cbea0367 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -31,6 +31,7 @@ import (
 // +stateify savable
 type slaveInodeOperations struct {
 	fsutil.SimpleFileInode
+	fsutil.InodeNoopTruncate
 
 	// d is the containing dir.
 	d *dirInodeOperations
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index b9a8e3e21..167c2b60b 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -169,10 +169,11 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint
 			if dirPath {
 				return syserror.ENOTDIR
 			}
-			if flags&linux.O_TRUNC != 0 {
-				if err := d.Inode.Truncate(t, d, 0); err != nil {
-					return err
-				}
+		}
+
+		if flags&linux.O_TRUNC != 0 {
+			if err := d.Inode.Truncate(t, d, 0); err != nil {
+				return err
 			}
 		}
 
diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc
index 2b1df52ce..267ae19f6 100644
--- a/test/syscalls/linux/open.cc
+++ b/test/syscalls/linux/open.cc
@@ -73,6 +73,28 @@ class OpenTest : public FileTest {
   const std::string test_data_ = "hello world\n";
 };
 
+TEST_F(OpenTest, OTrunc) {
+  auto dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
+  ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
+  ASSERT_THAT(open(dirpath.c_str(), O_TRUNC, 0666),
+              SyscallFailsWithErrno(EISDIR));
+}
+
+TEST_F(OpenTest, OTruncAndReadOnlyDir) {
+  auto dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
+  ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
+  ASSERT_THAT(open(dirpath.c_str(), O_TRUNC | O_RDONLY, 0666),
+              SyscallFailsWithErrno(EISDIR));
+}
+
+TEST_F(OpenTest, OTruncAndReadOnlyFile) {
+  auto dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncfile");
+  const FileDescriptor existing =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dirpath.c_str(), O_RDWR | O_CREAT, 0666));
+  const FileDescriptor otrunc = ASSERT_NO_ERRNO_AND_VALUE(
+      Open(dirpath.c_str(), O_TRUNC | O_RDONLY, 0666));
+}
+
 TEST_F(OpenTest, ReadOnly) {
   char buf;
   const FileDescriptor ro_file =
diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc
index e5a85ef9d..431733dbe 100644
--- a/test/syscalls/linux/open_create.cc
+++ b/test/syscalls/linux/open_create.cc
@@ -88,6 +88,30 @@ TEST(CreateTest, CreateExclusively) {
               SyscallFailsWithErrno(EEXIST));
 }
 
+TEST(CreateTeast, CreatWithOTrunc) {
+  std::string dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
+  ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
+  ASSERT_THAT(open(dirpath.c_str(), O_CREAT | O_TRUNC, 0666),
+              SyscallFailsWithErrno(EISDIR));
+}
+
+TEST(CreateTeast, CreatDirWithOTruncAndReadOnly) {
+  std::string dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
+  ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
+  ASSERT_THAT(open(dirpath.c_str(), O_CREAT | O_TRUNC | O_RDONLY, 0666),
+              SyscallFailsWithErrno(EISDIR));
+}
+
+TEST(CreateTeast, CreatFileWithOTruncAndReadOnly) {
+  std::string dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncfile");
+  int dirfd;
+  ASSERT_THAT(dirfd = open(dirpath.c_str(), O_RDWR | O_CREAT, 0666),
+              SyscallSucceeds());
+  ASSERT_THAT(open(dirpath.c_str(), O_CREAT | O_TRUNC | O_RDONLY, 0666),
+              SyscallSucceeds());
+  ASSERT_THAT(close(dirfd), SyscallSucceeds());
+}
+
 TEST(CreateTest, CreateFailsOnUnpermittedDir) {
   // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to
   // always override directory permissions.
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
index 99a0df235..dafe64d20 100644
--- a/test/syscalls/linux/pty.cc
+++ b/test/syscalls/linux/pty.cc
@@ -70,6 +70,8 @@ constexpr absl::Duration kTimeout = absl::Seconds(20);
 // The maximum line size in bytes returned per read from a pty file.
 constexpr int kMaxLineSize = 4096;
 
+constexpr char kMasterPath[] = "/dev/ptmx";
+
 // glibc defines its own, different, version of struct termios. We care about
 // what the kernel does, not glibc.
 #define KERNEL_NCCS 19
@@ -376,9 +378,25 @@ PosixErrorOr<size_t> PollAndReadFd(int fd, void* buf, size_t count,
   return PosixError(ETIMEDOUT, "Poll timed out");
 }
 
+TEST(PtyTrunc, Truncate) {
+  // Opening PTYs with O_TRUNC shouldn't cause an error, but calls to
+  // (f)truncate should.
+  FileDescriptor master =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(kMasterPath, O_RDWR | O_TRUNC));
+  int n = ASSERT_NO_ERRNO_AND_VALUE(SlaveID(master));
+  std::string spath = absl::StrCat("/dev/pts/", n);
+  FileDescriptor slave =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(spath, O_RDWR | O_NONBLOCK | O_TRUNC));
+
+  EXPECT_THAT(truncate(kMasterPath, 0), SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(truncate(spath.c_str(), 0), SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(ftruncate(master.get(), 0), SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(ftruncate(slave.get(), 0), SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(BasicPtyTest, StatUnopenedMaster) {
   struct stat s;
-  ASSERT_THAT(stat("/dev/ptmx", &s), SyscallSucceeds());
+  ASSERT_THAT(stat(kMasterPath, &s), SyscallSucceeds());
 
   EXPECT_EQ(s.st_rdev, makedev(TTYAUX_MAJOR, kPtmxMinor));
   EXPECT_EQ(s.st_size, 0);
diff --git a/test/util/pty_util.cc b/test/util/pty_util.cc
index c0fd9a095..c01f916aa 100644
--- a/test/util/pty_util.cc
+++ b/test/util/pty_util.cc
@@ -24,6 +24,14 @@ namespace gvisor {
 namespace testing {
 
 PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master) {
+  PosixErrorOr<int> n = SlaveID(master);
+  if (!n.ok()) {
+    return PosixErrorOr<FileDescriptor>(n.error());
+  }
+  return Open(absl::StrCat("/dev/pts/", n.ValueOrDie()), O_RDWR | O_NONBLOCK);
+}
+
+PosixErrorOr<int> SlaveID(const FileDescriptor& master) {
   // Get pty index.
   int n;
   int ret = ioctl(master.get(), TIOCGPTN, &n);
@@ -38,7 +46,7 @@ PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master) {
     return PosixError(errno, "ioctl(TIOSPTLCK) failed");
   }
 
-  return Open(absl::StrCat("/dev/pts/", n), O_RDWR | O_NONBLOCK);
+  return n;
 }
 
 }  // namespace testing
diff --git a/test/util/pty_util.h b/test/util/pty_util.h
index 367b14f15..0722da379 100644
--- a/test/util/pty_util.h
+++ b/test/util/pty_util.h
@@ -24,6 +24,9 @@ namespace testing {
 // Opens the slave end of the passed master as R/W and nonblocking.
 PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master);
 
+// Get the number of the slave end of the master.
+PosixErrorOr<int> SlaveID(const FileDescriptor& master);
+
 }  // namespace testing
 }  // namespace gvisor
 
-- 
cgit v1.2.3


From af323eb7c1830053627de6161f8ce73ac5f06d4e Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Thu, 14 Nov 2019 17:02:59 -0800
Subject: Fix return codes for {get,set}sockopt for some nullptr cases.

Updates #1092

PiperOrigin-RevId: 280547239
---
 pkg/sentry/syscalls/linux/sys_socket.go  | 23 +++++++++--------------
 test/syscalls/linux/socket_ip_unbound.cc | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index b5a72ce63..ab1001f16 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -447,16 +447,13 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 		return 0, nil, syserror.ENOTSOCK
 	}
 
-	// Read the length if present. Reject negative values.
+	// Read the length. Reject negative values.
 	optLen := int32(0)
-	if optLenAddr != 0 {
-		if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
-			return 0, nil, err
-		}
-
-		if optLen < 0 {
-			return 0, nil, syserror.EINVAL
-		}
+	if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+		return 0, nil, err
+	}
+	if optLen < 0 {
+		return 0, nil, syserror.EINVAL
 	}
 
 	// Call syscall implementation then copy both value and value len out.
@@ -465,11 +462,9 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 		return 0, nil, e.ToError()
 	}
 
-	if optLenAddr != 0 {
-		vLen := int32(binary.Size(v))
-		if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
-			return 0, nil, err
-		}
+	vLen := int32(binary.Size(v))
+	if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+		return 0, nil, err
 	}
 
 	if v != nil {
diff --git a/test/syscalls/linux/socket_ip_unbound.cc b/test/syscalls/linux/socket_ip_unbound.cc
index b02872308..b6754111f 100644
--- a/test/syscalls/linux/socket_ip_unbound.cc
+++ b/test/syscalls/linux/socket_ip_unbound.cc
@@ -354,6 +354,38 @@ TEST_P(IPUnboundSocketTest, InvalidNegativeTOS) {
   EXPECT_EQ(get, expect);
 }
 
+TEST_P(IPUnboundSocketTest, NullTOS) {
+  auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  TOSOption t = GetTOSOption(GetParam().domain);
+  int set_sz = sizeof(int);
+  if (GetParam().domain == AF_INET) {
+    EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, nullptr, set_sz),
+                SyscallFailsWithErrno(EFAULT));
+  } else {  // AF_INET6
+    // The AF_INET6 behavior is not yet compatible. gVisor will try to read
+    // optval from user memory at syscall handler, it needs substantial
+    // refactoring to implement this behavior just for IPv6.
+    if (IsRunningOnGvisor()) {
+      EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, nullptr, set_sz),
+                  SyscallFailsWithErrno(EFAULT));
+    } else {
+      // Linux's IPv6 stack treats nullptr optval as input of 0, so the call
+      // succeeds. (net/ipv6/ipv6_sockglue.c, do_ipv6_setsockopt())
+      //
+      // Linux's implementation would need fixing as passing a nullptr as optval
+      // and non-zero optlen may not be valid.
+      EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, nullptr, set_sz),
+                  SyscallSucceedsWithValue(0));
+    }
+  }
+  socklen_t get_sz = sizeof(int);
+  EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, nullptr, &get_sz),
+              SyscallFailsWithErrno(EFAULT));
+  int get = -1;
+  EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, &get, nullptr),
+              SyscallFailsWithErrno(EFAULT));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     IPUnboundSockets, IPUnboundSocketTest,
     ::testing::ValuesIn(VecCat<SocketKind>(VecCat<SocketKind>(
-- 
cgit v1.2.3


From 23574b1b87ce5aed7b78a53663eac61ae030e9d5 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 14 Nov 2019 22:54:01 -0800
Subject: Fix panic when logging raw packets via sniffer.

Sniffer assumed that outgoing packets have transport headers, but
users can write packets via SOCK_RAW with arbitrary transport headers that
netstack doesn't know about. We now explicitly check for the presence of network
and transport headers before assuming they exist.

PiperOrigin-RevId: 280594395
---
 pkg/tcpip/link/sniffer/sniffer.go | 277 +++++++++++++++++++-------------------
 1 file changed, 140 insertions(+), 137 deletions(-)

diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 122680e10..147d4e242 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -118,7 +118,7 @@ func NewWithFile(lower stack.LinkEndpoint, file *os.File, snapLen uint32) (stack
 // logs the packet before forwarding to the actual dispatcher.
 func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("recv", protocol, pkt.Data.First(), nil)
+		logPacket("recv", protocol, pkt, nil)
 	}
 	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
 		vs := pkt.Data.Views()
@@ -195,7 +195,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
 
 func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("send", protocol, pkt.Header.View(), gso)
+		logPacket("send", protocol, pkt, gso)
 	}
 	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
 		hdrBuf := pkt.Header.View()
@@ -247,7 +247,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.Pac
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("send", 0, buffer.View("[raw packet, no header available]"), nil /* gso */)
+		logPacket("send raw packet", 0, tcpip.PacketBuffer{}, nil /* gso */)
 	}
 	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
 		length := vv.Size()
@@ -289,7 +289,7 @@ func logVectorisedView(vv buffer.VectorisedView, length int, buf *bytes.Buffer)
 // Wait implements stack.LinkEndpoint.Wait.
 func (*endpoint) Wait() {}
 
-func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.View, gso *stack.GSO) {
+func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer, gso *stack.GSO) {
 	// Figure out the network layer info.
 	var transProto uint8
 	src := tcpip.Address("unknown")
@@ -298,39 +298,40 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 	size := uint16(0)
 	var fragmentOffset uint16
 	var moreFragments bool
-	switch protocol {
-	case header.IPv4ProtocolNumber:
-		ipv4 := header.IPv4(b)
-		fragmentOffset = ipv4.FragmentOffset()
-		moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments
-		src = ipv4.SourceAddress()
-		dst = ipv4.DestinationAddress()
-		transProto = ipv4.Protocol()
-		size = ipv4.TotalLength() - uint16(ipv4.HeaderLength())
-		b = b[ipv4.HeaderLength():]
-		id = int(ipv4.ID())
-
-	case header.IPv6ProtocolNumber:
-		ipv6 := header.IPv6(b)
-		src = ipv6.SourceAddress()
-		dst = ipv6.DestinationAddress()
-		transProto = ipv6.NextHeader()
-		size = ipv6.PayloadLength()
-		b = b[header.IPv6MinimumSize:]
-
-	case header.ARPProtocolNumber:
-		arp := header.ARP(b)
-		log.Infof(
-			"%s arp %v (%v) -> %v (%v) valid:%v",
-			prefix,
-			tcpip.Address(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()),
-			tcpip.Address(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()),
-			arp.IsValid(),
-		)
-		return
-	default:
-		log.Infof("%s unknown network protocol", prefix)
-		return
+
+	if pkt.NetworkHeader != nil {
+		switch protocol {
+		case header.IPv4ProtocolNumber:
+			ipv4 := header.IPv4(pkt.NetworkHeader)
+			fragmentOffset = ipv4.FragmentOffset()
+			moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments
+			src = ipv4.SourceAddress()
+			dst = ipv4.DestinationAddress()
+			transProto = ipv4.Protocol()
+			size = ipv4.TotalLength() - uint16(ipv4.HeaderLength())
+			id = int(ipv4.ID())
+
+		case header.IPv6ProtocolNumber:
+			ipv6 := header.IPv6(pkt.NetworkHeader)
+			src = ipv6.SourceAddress()
+			dst = ipv6.DestinationAddress()
+			transProto = ipv6.NextHeader()
+			size = ipv6.PayloadLength()
+
+		case header.ARPProtocolNumber:
+			arp := header.ARP(pkt.NetworkHeader)
+			log.Infof(
+				"%s arp %v (%v) -> %v (%v) valid:%v",
+				prefix,
+				tcpip.Address(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()),
+				tcpip.Address(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()),
+				arp.IsValid(),
+			)
+			return
+		default:
+			log.Infof("%s unknown network protocol", prefix)
+			return
+		}
 	}
 
 	// Figure out the transport layer info.
@@ -338,118 +339,120 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 	srcPort := uint16(0)
 	dstPort := uint16(0)
 	details := ""
-	switch tcpip.TransportProtocolNumber(transProto) {
-	case header.ICMPv4ProtocolNumber:
-		transName = "icmp"
-		icmp := header.ICMPv4(b)
-		icmpType := "unknown"
-		if fragmentOffset == 0 {
+	if pkt.TransportHeader != nil {
+		switch tcpip.TransportProtocolNumber(transProto) {
+		case header.ICMPv4ProtocolNumber:
+			transName = "icmp"
+			icmp := header.ICMPv4(pkt.TransportHeader)
+			icmpType := "unknown"
+			if fragmentOffset == 0 {
+				switch icmp.Type() {
+				case header.ICMPv4EchoReply:
+					icmpType = "echo reply"
+				case header.ICMPv4DstUnreachable:
+					icmpType = "destination unreachable"
+				case header.ICMPv4SrcQuench:
+					icmpType = "source quench"
+				case header.ICMPv4Redirect:
+					icmpType = "redirect"
+				case header.ICMPv4Echo:
+					icmpType = "echo"
+				case header.ICMPv4TimeExceeded:
+					icmpType = "time exceeded"
+				case header.ICMPv4ParamProblem:
+					icmpType = "param problem"
+				case header.ICMPv4Timestamp:
+					icmpType = "timestamp"
+				case header.ICMPv4TimestampReply:
+					icmpType = "timestamp reply"
+				case header.ICMPv4InfoRequest:
+					icmpType = "info request"
+				case header.ICMPv4InfoReply:
+					icmpType = "info reply"
+				}
+			}
+			log.Infof("%s %s %v -> %v %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+			return
+
+		case header.ICMPv6ProtocolNumber:
+			transName = "icmp"
+			icmp := header.ICMPv6(pkt.TransportHeader)
+			icmpType := "unknown"
 			switch icmp.Type() {
-			case header.ICMPv4EchoReply:
-				icmpType = "echo reply"
-			case header.ICMPv4DstUnreachable:
+			case header.ICMPv6DstUnreachable:
 				icmpType = "destination unreachable"
-			case header.ICMPv4SrcQuench:
-				icmpType = "source quench"
-			case header.ICMPv4Redirect:
-				icmpType = "redirect"
-			case header.ICMPv4Echo:
-				icmpType = "echo"
-			case header.ICMPv4TimeExceeded:
+			case header.ICMPv6PacketTooBig:
+				icmpType = "packet too big"
+			case header.ICMPv6TimeExceeded:
 				icmpType = "time exceeded"
-			case header.ICMPv4ParamProblem:
+			case header.ICMPv6ParamProblem:
 				icmpType = "param problem"
-			case header.ICMPv4Timestamp:
-				icmpType = "timestamp"
-			case header.ICMPv4TimestampReply:
-				icmpType = "timestamp reply"
-			case header.ICMPv4InfoRequest:
-				icmpType = "info request"
-			case header.ICMPv4InfoReply:
-				icmpType = "info reply"
+			case header.ICMPv6EchoRequest:
+				icmpType = "echo request"
+			case header.ICMPv6EchoReply:
+				icmpType = "echo reply"
+			case header.ICMPv6RouterSolicit:
+				icmpType = "router solicit"
+			case header.ICMPv6RouterAdvert:
+				icmpType = "router advert"
+			case header.ICMPv6NeighborSolicit:
+				icmpType = "neighbor solicit"
+			case header.ICMPv6NeighborAdvert:
+				icmpType = "neighbor advert"
+			case header.ICMPv6RedirectMsg:
+				icmpType = "redirect message"
 			}
-		}
-		log.Infof("%s %s %v -> %v %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
-		return
-
-	case header.ICMPv6ProtocolNumber:
-		transName = "icmp"
-		icmp := header.ICMPv6(b)
-		icmpType := "unknown"
-		switch icmp.Type() {
-		case header.ICMPv6DstUnreachable:
-			icmpType = "destination unreachable"
-		case header.ICMPv6PacketTooBig:
-			icmpType = "packet too big"
-		case header.ICMPv6TimeExceeded:
-			icmpType = "time exceeded"
-		case header.ICMPv6ParamProblem:
-			icmpType = "param problem"
-		case header.ICMPv6EchoRequest:
-			icmpType = "echo request"
-		case header.ICMPv6EchoReply:
-			icmpType = "echo reply"
-		case header.ICMPv6RouterSolicit:
-			icmpType = "router solicit"
-		case header.ICMPv6RouterAdvert:
-			icmpType = "router advert"
-		case header.ICMPv6NeighborSolicit:
-			icmpType = "neighbor solicit"
-		case header.ICMPv6NeighborAdvert:
-			icmpType = "neighbor advert"
-		case header.ICMPv6RedirectMsg:
-			icmpType = "redirect message"
-		}
-		log.Infof("%s %s %v -> %v %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
-		return
-
-	case header.UDPProtocolNumber:
-		transName = "udp"
-		udp := header.UDP(b)
-		if fragmentOffset == 0 && len(udp) >= header.UDPMinimumSize {
-			srcPort = udp.SourcePort()
-			dstPort = udp.DestinationPort()
-			details = fmt.Sprintf("xsum: 0x%x", udp.Checksum())
-			size -= header.UDPMinimumSize
-		}
+			log.Infof("%s %s %v -> %v %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+			return
 
-	case header.TCPProtocolNumber:
-		transName = "tcp"
-		tcp := header.TCP(b)
-		if fragmentOffset == 0 && len(tcp) >= header.TCPMinimumSize {
-			offset := int(tcp.DataOffset())
-			if offset < header.TCPMinimumSize {
-				details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset)
-				break
-			}
-			if offset > len(tcp) && !moreFragments {
-				details += fmt.Sprintf("invalid packet: tcp data offset %d larger than packet buffer length %d", offset, len(tcp))
-				break
+		case header.UDPProtocolNumber:
+			transName = "udp"
+			udp := header.UDP(pkt.TransportHeader)
+			if fragmentOffset == 0 && len(udp) >= header.UDPMinimumSize {
+				srcPort = udp.SourcePort()
+				dstPort = udp.DestinationPort()
+				details = fmt.Sprintf("xsum: 0x%x", udp.Checksum())
+				size -= header.UDPMinimumSize
 			}
 
-			srcPort = tcp.SourcePort()
-			dstPort = tcp.DestinationPort()
-			size -= uint16(offset)
+		case header.TCPProtocolNumber:
+			transName = "tcp"
+			tcp := header.TCP(pkt.TransportHeader)
+			if fragmentOffset == 0 && len(tcp) >= header.TCPMinimumSize {
+				offset := int(tcp.DataOffset())
+				if offset < header.TCPMinimumSize {
+					details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset)
+					break
+				}
+				if offset > len(tcp) && !moreFragments {
+					details += fmt.Sprintf("invalid packet: tcp data offset %d larger than packet buffer length %d", offset, len(tcp))
+					break
+				}
 
-			// Initialize the TCP flags.
-			flags := tcp.Flags()
-			flagsStr := []byte("FSRPAU")
-			for i := range flagsStr {
-				if flags&(1<<uint(i)) == 0 {
-					flagsStr[i] = ' '
+				srcPort = tcp.SourcePort()
+				dstPort = tcp.DestinationPort()
+				size -= uint16(offset)
+
+				// Initialize the TCP flags.
+				flags := tcp.Flags()
+				flagsStr := []byte("FSRPAU")
+				for i := range flagsStr {
+					if flags&(1<<uint(i)) == 0 {
+						flagsStr[i] = ' '
+					}
+				}
+				details = fmt.Sprintf("flags:0x%02x (%v) seqnum: %v ack: %v win: %v xsum:0x%x", flags, string(flagsStr), tcp.SequenceNumber(), tcp.AckNumber(), tcp.WindowSize(), tcp.Checksum())
+				if flags&header.TCPFlagSyn != 0 {
+					details += fmt.Sprintf(" options: %+v", header.ParseSynOptions(tcp.Options(), flags&header.TCPFlagAck != 0))
+				} else {
+					details += fmt.Sprintf(" options: %+v", tcp.ParsedOptions())
 				}
 			}
-			details = fmt.Sprintf("flags:0x%02x (%v) seqnum: %v ack: %v win: %v xsum:0x%x", flags, string(flagsStr), tcp.SequenceNumber(), tcp.AckNumber(), tcp.WindowSize(), tcp.Checksum())
-			if flags&header.TCPFlagSyn != 0 {
-				details += fmt.Sprintf(" options: %+v", header.ParseSynOptions(tcp.Options(), flags&header.TCPFlagAck != 0))
-			} else {
-				details += fmt.Sprintf(" options: %+v", tcp.ParsedOptions())
-			}
-		}
 
-	default:
-		log.Infof("%s %v -> %v unknown transport protocol: %d", prefix, src, dst, transProto)
-		return
+		default:
+			log.Infof("%s %v -> %v unknown transport protocol: %d", prefix, src, dst, transProto)
+			return
+		}
 	}
 
 	if gso != nil {
-- 
cgit v1.2.3


From 76039f895995c3fe0deef5958f843868685ecc38 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 15 Nov 2019 11:39:25 -0800
Subject: Do not set finalizer on p9.ClientFile.

Aside from the performance hit, there is no guarantee that p9.ClientFile's
finalizer runs before the associated p9.Client is closed.

PiperOrigin-RevId: 280702509
---
 pkg/p9/client_file.go | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go
index a6cc0617e..de9357389 100644
--- a/pkg/p9/client_file.go
+++ b/pkg/p9/client_file.go
@@ -17,7 +17,6 @@ package p9
 import (
 	"fmt"
 	"io"
-	"runtime"
 	"sync/atomic"
 	"syscall"
 
@@ -45,15 +44,10 @@ func (c *Client) Attach(name string) (File, error) {
 
 // newFile returns a new client file.
 func (c *Client) newFile(fid FID) *clientFile {
-	cf := &clientFile{
+	return &clientFile{
 		client: c,
 		fid:    fid,
 	}
-
-	// Make sure the file is closed.
-	runtime.SetFinalizer(cf, (*clientFile).Close)
-
-	return cf
 }
 
 // clientFile is provided to clients.
@@ -192,7 +186,6 @@ func (c *clientFile) Remove() error {
 	if !atomic.CompareAndSwapUint32(&c.closed, 0, 1) {
 		return syscall.EBADF
 	}
-	runtime.SetFinalizer(c, nil)
 
 	// Send the remove message.
 	if err := c.client.sendRecv(&Tremove{FID: c.fid}, &Rremove{}); err != nil {
@@ -214,7 +207,6 @@ func (c *clientFile) Close() error {
 	if !atomic.CompareAndSwapUint32(&c.closed, 0, 1) {
 		return syscall.EBADF
 	}
-	runtime.SetFinalizer(c, nil)
 
 	// Send the close message.
 	if err := c.client.sendRecv(&Tclunk{FID: c.fid}, &Rclunk{}); err != nil {
-- 
cgit v1.2.3


From 3e534f2974f469a889534221b83c3bbbd1b0318c Mon Sep 17 00:00:00 2001
From: Mithun Iyer <iyerm@google.com>
Date: Fri, 15 Nov 2019 11:44:02 -0800
Subject: Handle in-flight TCP segments when moving to CLOSE.

As we move to CLOSE state from LAST-ACK or TIME-WAIT,
ensure that we re-match all in-flight segments to any
listening endpoint.

Also fix LISTEN state handling of any ACK segments as per RFC793.

Fixes #1153

PiperOrigin-RevId: 280703556
---
 pkg/tcpip/transport/tcp/accept.go   |  14 ++-
 pkg/tcpip/transport/tcp/connect.go  |  60 +++++++++--
 pkg/tcpip/transport/tcp/rcv.go      |   2 +-
 pkg/tcpip/transport/tcp/tcp_test.go | 196 ++++++++++++++++++++++++++++++++++++
 4 files changed, 261 insertions(+), 11 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index f24b51b91..023045ec1 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -419,8 +419,8 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 	// TODO(b/143300739): Use the userMSS of the listening socket
 	// for accepted sockets.
 
-	switch s.flags {
-	case header.TCPFlagSyn:
+	switch {
+	case s.flags == header.TCPFlagSyn:
 		opts := parseSynSegmentOptions(s)
 		if incSynRcvdCount() {
 			// Only handle the syn if the following conditions hold
@@ -464,7 +464,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment()
 		}
 
-	case header.TCPFlagAck:
+	case (s.flags & header.TCPFlagAck) != 0:
 		if e.acceptQueueIsFull() {
 			// Silently drop the ack as the application can't accept
 			// the connection at this point. The ack will be
@@ -478,6 +478,14 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		}
 
 		if !synCookiesInUse() {
+			// When not using SYN cookies, as per RFC 793, section 3.9, page 64:
+			// Any acknowledgment is bad if it arrives on a connection still in
+			// the LISTEN state.  An acceptable reset segment should be formed
+			// for any arriving ACK-bearing segment.  The RST should be
+			// formatted as follows:
+			//
+			//  <SEQ=SEG.ACK><CTL=RST>
+			//
 			// Send a reset as this is an ACK for which there is no
 			// half open connections and we are not using cookies
 			// yet.
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 49f2b9685..364067731 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -865,6 +865,33 @@ func (e *endpoint) completeWorkerLocked() {
 	}
 }
 
+// transitionToStateCloseLocked ensures that the endpoint is
+// cleaned up from the transport demuxer, "before" moving to
+// StateClose. This will ensure that no packet will be
+// delivered to this endpoint from the demuxer when the endpoint
+// is transitioned to StateClose.
+func (e *endpoint) transitionToStateCloseLocked() {
+	if e.state == StateClose {
+		return
+	}
+	e.cleanupLocked()
+	e.state = StateClose
+}
+
+// tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed
+// segment to any other endpoint other than the current one. This is called
+// only when the endpoint is in StateClose and we want to deliver the segment
+// to any other listening endpoint. We reply with RST if we cannot find one.
+func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
+	ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.ID, &s.route)
+	if ep == nil {
+		replyWithReset(s)
+		s.decRef()
+		return
+	}
+	ep.(*endpoint).enqueueSegment(s)
+}
+
 func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 	if e.rcv.acceptable(s.sequenceNumber, 0) {
 		// RFC 793, page 37 states that "in all states
@@ -894,12 +921,8 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 		//  general "connection reset" signal. Enter the CLOSED state,
 		//  delete the TCB, and return.
 		case StateCloseWait:
-			e.state = StateClose
+			e.transitionToStateCloseLocked()
 			e.HardError = tcpip.ErrAborted
-			// We need to set this explicitly here because otherwise
-			// the port registrations will not be released till the
-			// endpoint is actively closed by the application.
-			e.workerCleanup = true
 			e.mu.Unlock()
 			return false, nil
 		default:
@@ -915,6 +938,20 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 func (e *endpoint) handleSegments() *tcpip.Error {
 	checkRequeue := true
 	for i := 0; i < maxSegmentsPerWake; i++ {
+		e.mu.RLock()
+		state := e.state
+		e.mu.RUnlock()
+		if state == StateClose {
+			// When we get into StateClose while processing from the queue,
+			// return immediately and let the protocolMainloop handle it.
+			//
+			// We can reach StateClose only while processing a previous segment
+			// or a notification from the protocolMainLoop (caller goroutine).
+			// This means that with this return, the segment dequeue below can
+			// never occur on a closed endpoint.
+			return nil
+		}
+
 		s := e.segmentQueue.dequeue()
 		if s == nil {
 			checkRequeue = false
@@ -1160,7 +1197,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 				// to the TCP_FIN_WAIT2 timeout was hit. Just
 				// mark the socket as closed.
 				e.mu.Lock()
-				e.state = StateClose
+				e.transitionToStateCloseLocked()
 				e.mu.Unlock()
 				return nil
 			},
@@ -1321,12 +1358,21 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	if e.state != StateError {
 		e.stack.Stats().TCP.EstablishedResets.Increment()
 		e.stack.Stats().TCP.CurrentEstablished.Decrement()
-		e.state = StateClose
+		e.transitionToStateCloseLocked()
 	}
 
 	// Lock released below.
 	epilogue()
 
+	// epilogue removes the endpoint from the transport-demuxer and
+	// unlocks e.mu. Now that no new segments can get enqueued to this
+	// endpoint, try to re-match the segment to a different endpoint
+	// as the current endpoint is closed.
+	for !e.segmentQueue.empty() {
+		s := e.segmentQueue.dequeue()
+		e.tryDeliverSegmentFromClosedEndpoint(s)
+	}
+
 	// A new SYN was received during TIME_WAIT and we need to abort
 	// the timewait and redirect the segment to the listener queue
 	if reuseTW != nil {
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 068b90fb6..857dc445f 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -218,7 +218,7 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		case StateClosing:
 			r.ep.state = StateTimeWait
 		case StateLastAck:
-			r.ep.state = StateClose
+			r.ep.transitionToStateCloseLocked()
 		}
 		r.ep.mu.Unlock()
 	}
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index b443fe9dc..64f765c70 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -454,6 +454,112 @@ func TestConnectResetAfterClose(t *testing.T) {
 	}
 }
 
+// TestClosingWithEnqueuedSegments tests handling of
+// still enqueued segments when the endpoint transitions
+// to StateClose. The in-flight segments would be re-enqueued
+// to a any listening endpoint.
+func TestClosingWithEnqueuedSegments(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+	ep := c.EP
+	c.EP = nil
+
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateEstablished; got != want {
+		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+	}
+
+	// Send a FIN for ESTABLISHED --> CLOSED-WAIT
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagFin | header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Get the ACK for the FIN we sent.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(791),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateCloseWait; got != want {
+		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+	}
+
+	// Close the application endpoint for CLOSE_WAIT --> LAST_ACK
+	ep.Close()
+
+	// Get the FIN
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(791),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+		),
+	)
+
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateLastAck; got != want {
+		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+	}
+
+	// Pause the endpoint`s protocolMainLoop.
+	ep.(interface{ StopWork() }).StopWork()
+
+	// Enqueue last ACK followed by an ACK matching the endpoint
+	//
+	// Send Last ACK for LAST_ACK --> CLOSED
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  791,
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// Send a packet with ACK set, this would generate RST when
+	// not using SYN cookies as in this test.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  792,
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// Unpause endpoint`s protocolMainLoop.
+	ep.(interface{ ResumeWork() }).ResumeWork()
+
+	// Wait for the protocolMainLoop to resume and update state.
+	time.Sleep(1 * time.Millisecond)
+
+	// Expect the endpoint to be closed.
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateClose; got != want {
+		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+	}
+
+	// Check if the endpoint was moved to CLOSED and netstack a reset in
+	// response to the ACK packet that we sent after last-ACK.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+2),
+			checker.AckNum(793),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+		),
+	)
+}
+
 func TestSimpleReceive(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
@@ -686,6 +792,96 @@ func TestSendRstOnListenerRxSynAckV6(t *testing.T) {
 		checker.SeqNum(200)))
 }
 
+func TestSendRstOnListenerRxAckV4(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1 /* epRcvBuf */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagFin | header.TCPFlagAck,
+		SeqNum:  100,
+		AckNum:  200,
+	})
+
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(200)))
+}
+
+func TestSendRstOnListenerRxAckV6(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateV6Endpoint(true /* v6Only */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	c.SendV6Packet(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagFin | header.TCPFlagAck,
+		SeqNum:  100,
+		AckNum:  200,
+	})
+
+	checker.IPv6(t, c.GetV6Packet(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(200)))
+}
+
+// TestListenShutdown tests for the listening endpoint not processing
+// any receive when it is on read shutdown.
+func TestListenShutdown(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1 /* epRcvBuf */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	if err := c.EP.Shutdown(tcpip.ShutdownRead); err != nil {
+		t.Fatal("Shutdown failed:", err)
+	}
+
+	// Wait for the endpoint state to be propagated.
+	time.Sleep(10 * time.Millisecond)
+
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  100,
+		AckNum:  200,
+	})
+
+	c.CheckNoPacket("Packet received when listening socket was shutdown")
+}
+
 func TestTOSV4(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
-- 
cgit v1.2.3


From 5107e6b6bd75a77f05e68503ff958c7a9354ea73 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Fri, 15 Nov 2019 16:51:11 -0800
Subject: Automated rollback of changelist 280594395

PiperOrigin-RevId: 280763655
---
 pkg/tcpip/link/sniffer/sniffer.go | 277 +++++++++++++++++++-------------------
 1 file changed, 137 insertions(+), 140 deletions(-)

diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 147d4e242..122680e10 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -118,7 +118,7 @@ func NewWithFile(lower stack.LinkEndpoint, file *os.File, snapLen uint32) (stack
 // logs the packet before forwarding to the actual dispatcher.
 func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("recv", protocol, pkt, nil)
+		logPacket("recv", protocol, pkt.Data.First(), nil)
 	}
 	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
 		vs := pkt.Data.Views()
@@ -195,7 +195,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
 
 func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("send", protocol, pkt, gso)
+		logPacket("send", protocol, pkt.Header.View(), gso)
 	}
 	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
 		hdrBuf := pkt.Header.View()
@@ -247,7 +247,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.Pac
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("send raw packet", 0, tcpip.PacketBuffer{}, nil /* gso */)
+		logPacket("send", 0, buffer.View("[raw packet, no header available]"), nil /* gso */)
 	}
 	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
 		length := vv.Size()
@@ -289,7 +289,7 @@ func logVectorisedView(vv buffer.VectorisedView, length int, buf *bytes.Buffer)
 // Wait implements stack.LinkEndpoint.Wait.
 func (*endpoint) Wait() {}
 
-func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer, gso *stack.GSO) {
+func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.View, gso *stack.GSO) {
 	// Figure out the network layer info.
 	var transProto uint8
 	src := tcpip.Address("unknown")
@@ -298,40 +298,39 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt tcpip.Pa
 	size := uint16(0)
 	var fragmentOffset uint16
 	var moreFragments bool
-
-	if pkt.NetworkHeader != nil {
-		switch protocol {
-		case header.IPv4ProtocolNumber:
-			ipv4 := header.IPv4(pkt.NetworkHeader)
-			fragmentOffset = ipv4.FragmentOffset()
-			moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments
-			src = ipv4.SourceAddress()
-			dst = ipv4.DestinationAddress()
-			transProto = ipv4.Protocol()
-			size = ipv4.TotalLength() - uint16(ipv4.HeaderLength())
-			id = int(ipv4.ID())
-
-		case header.IPv6ProtocolNumber:
-			ipv6 := header.IPv6(pkt.NetworkHeader)
-			src = ipv6.SourceAddress()
-			dst = ipv6.DestinationAddress()
-			transProto = ipv6.NextHeader()
-			size = ipv6.PayloadLength()
-
-		case header.ARPProtocolNumber:
-			arp := header.ARP(pkt.NetworkHeader)
-			log.Infof(
-				"%s arp %v (%v) -> %v (%v) valid:%v",
-				prefix,
-				tcpip.Address(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()),
-				tcpip.Address(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()),
-				arp.IsValid(),
-			)
-			return
-		default:
-			log.Infof("%s unknown network protocol", prefix)
-			return
-		}
+	switch protocol {
+	case header.IPv4ProtocolNumber:
+		ipv4 := header.IPv4(b)
+		fragmentOffset = ipv4.FragmentOffset()
+		moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments
+		src = ipv4.SourceAddress()
+		dst = ipv4.DestinationAddress()
+		transProto = ipv4.Protocol()
+		size = ipv4.TotalLength() - uint16(ipv4.HeaderLength())
+		b = b[ipv4.HeaderLength():]
+		id = int(ipv4.ID())
+
+	case header.IPv6ProtocolNumber:
+		ipv6 := header.IPv6(b)
+		src = ipv6.SourceAddress()
+		dst = ipv6.DestinationAddress()
+		transProto = ipv6.NextHeader()
+		size = ipv6.PayloadLength()
+		b = b[header.IPv6MinimumSize:]
+
+	case header.ARPProtocolNumber:
+		arp := header.ARP(b)
+		log.Infof(
+			"%s arp %v (%v) -> %v (%v) valid:%v",
+			prefix,
+			tcpip.Address(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()),
+			tcpip.Address(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()),
+			arp.IsValid(),
+		)
+		return
+	default:
+		log.Infof("%s unknown network protocol", prefix)
+		return
 	}
 
 	// Figure out the transport layer info.
@@ -339,120 +338,118 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt tcpip.Pa
 	srcPort := uint16(0)
 	dstPort := uint16(0)
 	details := ""
-	if pkt.TransportHeader != nil {
-		switch tcpip.TransportProtocolNumber(transProto) {
-		case header.ICMPv4ProtocolNumber:
-			transName = "icmp"
-			icmp := header.ICMPv4(pkt.TransportHeader)
-			icmpType := "unknown"
-			if fragmentOffset == 0 {
-				switch icmp.Type() {
-				case header.ICMPv4EchoReply:
-					icmpType = "echo reply"
-				case header.ICMPv4DstUnreachable:
-					icmpType = "destination unreachable"
-				case header.ICMPv4SrcQuench:
-					icmpType = "source quench"
-				case header.ICMPv4Redirect:
-					icmpType = "redirect"
-				case header.ICMPv4Echo:
-					icmpType = "echo"
-				case header.ICMPv4TimeExceeded:
-					icmpType = "time exceeded"
-				case header.ICMPv4ParamProblem:
-					icmpType = "param problem"
-				case header.ICMPv4Timestamp:
-					icmpType = "timestamp"
-				case header.ICMPv4TimestampReply:
-					icmpType = "timestamp reply"
-				case header.ICMPv4InfoRequest:
-					icmpType = "info request"
-				case header.ICMPv4InfoReply:
-					icmpType = "info reply"
-				}
-			}
-			log.Infof("%s %s %v -> %v %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
-			return
-
-		case header.ICMPv6ProtocolNumber:
-			transName = "icmp"
-			icmp := header.ICMPv6(pkt.TransportHeader)
-			icmpType := "unknown"
+	switch tcpip.TransportProtocolNumber(transProto) {
+	case header.ICMPv4ProtocolNumber:
+		transName = "icmp"
+		icmp := header.ICMPv4(b)
+		icmpType := "unknown"
+		if fragmentOffset == 0 {
 			switch icmp.Type() {
-			case header.ICMPv6DstUnreachable:
+			case header.ICMPv4EchoReply:
+				icmpType = "echo reply"
+			case header.ICMPv4DstUnreachable:
 				icmpType = "destination unreachable"
-			case header.ICMPv6PacketTooBig:
-				icmpType = "packet too big"
-			case header.ICMPv6TimeExceeded:
+			case header.ICMPv4SrcQuench:
+				icmpType = "source quench"
+			case header.ICMPv4Redirect:
+				icmpType = "redirect"
+			case header.ICMPv4Echo:
+				icmpType = "echo"
+			case header.ICMPv4TimeExceeded:
 				icmpType = "time exceeded"
-			case header.ICMPv6ParamProblem:
+			case header.ICMPv4ParamProblem:
 				icmpType = "param problem"
-			case header.ICMPv6EchoRequest:
-				icmpType = "echo request"
-			case header.ICMPv6EchoReply:
-				icmpType = "echo reply"
-			case header.ICMPv6RouterSolicit:
-				icmpType = "router solicit"
-			case header.ICMPv6RouterAdvert:
-				icmpType = "router advert"
-			case header.ICMPv6NeighborSolicit:
-				icmpType = "neighbor solicit"
-			case header.ICMPv6NeighborAdvert:
-				icmpType = "neighbor advert"
-			case header.ICMPv6RedirectMsg:
-				icmpType = "redirect message"
+			case header.ICMPv4Timestamp:
+				icmpType = "timestamp"
+			case header.ICMPv4TimestampReply:
+				icmpType = "timestamp reply"
+			case header.ICMPv4InfoRequest:
+				icmpType = "info request"
+			case header.ICMPv4InfoReply:
+				icmpType = "info reply"
 			}
-			log.Infof("%s %s %v -> %v %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
-			return
+		}
+		log.Infof("%s %s %v -> %v %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+		return
 
-		case header.UDPProtocolNumber:
-			transName = "udp"
-			udp := header.UDP(pkt.TransportHeader)
-			if fragmentOffset == 0 && len(udp) >= header.UDPMinimumSize {
-				srcPort = udp.SourcePort()
-				dstPort = udp.DestinationPort()
-				details = fmt.Sprintf("xsum: 0x%x", udp.Checksum())
-				size -= header.UDPMinimumSize
+	case header.ICMPv6ProtocolNumber:
+		transName = "icmp"
+		icmp := header.ICMPv6(b)
+		icmpType := "unknown"
+		switch icmp.Type() {
+		case header.ICMPv6DstUnreachable:
+			icmpType = "destination unreachable"
+		case header.ICMPv6PacketTooBig:
+			icmpType = "packet too big"
+		case header.ICMPv6TimeExceeded:
+			icmpType = "time exceeded"
+		case header.ICMPv6ParamProblem:
+			icmpType = "param problem"
+		case header.ICMPv6EchoRequest:
+			icmpType = "echo request"
+		case header.ICMPv6EchoReply:
+			icmpType = "echo reply"
+		case header.ICMPv6RouterSolicit:
+			icmpType = "router solicit"
+		case header.ICMPv6RouterAdvert:
+			icmpType = "router advert"
+		case header.ICMPv6NeighborSolicit:
+			icmpType = "neighbor solicit"
+		case header.ICMPv6NeighborAdvert:
+			icmpType = "neighbor advert"
+		case header.ICMPv6RedirectMsg:
+			icmpType = "redirect message"
+		}
+		log.Infof("%s %s %v -> %v %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+		return
+
+	case header.UDPProtocolNumber:
+		transName = "udp"
+		udp := header.UDP(b)
+		if fragmentOffset == 0 && len(udp) >= header.UDPMinimumSize {
+			srcPort = udp.SourcePort()
+			dstPort = udp.DestinationPort()
+			details = fmt.Sprintf("xsum: 0x%x", udp.Checksum())
+			size -= header.UDPMinimumSize
+		}
+
+	case header.TCPProtocolNumber:
+		transName = "tcp"
+		tcp := header.TCP(b)
+		if fragmentOffset == 0 && len(tcp) >= header.TCPMinimumSize {
+			offset := int(tcp.DataOffset())
+			if offset < header.TCPMinimumSize {
+				details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset)
+				break
+			}
+			if offset > len(tcp) && !moreFragments {
+				details += fmt.Sprintf("invalid packet: tcp data offset %d larger than packet buffer length %d", offset, len(tcp))
+				break
 			}
 
-		case header.TCPProtocolNumber:
-			transName = "tcp"
-			tcp := header.TCP(pkt.TransportHeader)
-			if fragmentOffset == 0 && len(tcp) >= header.TCPMinimumSize {
-				offset := int(tcp.DataOffset())
-				if offset < header.TCPMinimumSize {
-					details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset)
-					break
-				}
-				if offset > len(tcp) && !moreFragments {
-					details += fmt.Sprintf("invalid packet: tcp data offset %d larger than packet buffer length %d", offset, len(tcp))
-					break
-				}
+			srcPort = tcp.SourcePort()
+			dstPort = tcp.DestinationPort()
+			size -= uint16(offset)
 
-				srcPort = tcp.SourcePort()
-				dstPort = tcp.DestinationPort()
-				size -= uint16(offset)
-
-				// Initialize the TCP flags.
-				flags := tcp.Flags()
-				flagsStr := []byte("FSRPAU")
-				for i := range flagsStr {
-					if flags&(1<<uint(i)) == 0 {
-						flagsStr[i] = ' '
-					}
-				}
-				details = fmt.Sprintf("flags:0x%02x (%v) seqnum: %v ack: %v win: %v xsum:0x%x", flags, string(flagsStr), tcp.SequenceNumber(), tcp.AckNumber(), tcp.WindowSize(), tcp.Checksum())
-				if flags&header.TCPFlagSyn != 0 {
-					details += fmt.Sprintf(" options: %+v", header.ParseSynOptions(tcp.Options(), flags&header.TCPFlagAck != 0))
-				} else {
-					details += fmt.Sprintf(" options: %+v", tcp.ParsedOptions())
+			// Initialize the TCP flags.
+			flags := tcp.Flags()
+			flagsStr := []byte("FSRPAU")
+			for i := range flagsStr {
+				if flags&(1<<uint(i)) == 0 {
+					flagsStr[i] = ' '
 				}
 			}
-
-		default:
-			log.Infof("%s %v -> %v unknown transport protocol: %d", prefix, src, dst, transProto)
-			return
+			details = fmt.Sprintf("flags:0x%02x (%v) seqnum: %v ack: %v win: %v xsum:0x%x", flags, string(flagsStr), tcp.SequenceNumber(), tcp.AckNumber(), tcp.WindowSize(), tcp.Checksum())
+			if flags&header.TCPFlagSyn != 0 {
+				details += fmt.Sprintf(" options: %+v", header.ParseSynOptions(tcp.Options(), flags&header.TCPFlagAck != 0))
+			} else {
+				details += fmt.Sprintf(" options: %+v", tcp.ParsedOptions())
+			}
 		}
+
+	default:
+		log.Infof("%s %v -> %v unknown transport protocol: %d", prefix, src, dst, transProto)
+		return
 	}
 
 	if gso != nil {
-- 
cgit v1.2.3


From 96019436854b0c59ae380a4920381586d05d9c31 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 18 Nov 2019 11:21:18 -0800
Subject: release: fix tag script

The tag script, when not run interactively, will fail without a provided commit
message (since it now uses annotated tags). For now, use a trivial message. In
the future, this could be extended to provide automated release notes.

PiperOrigin-RevId: 281112651
---
 tools/tag_release.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/tag_release.sh b/tools/tag_release.sh
index 9d5a60583..f33b902d6 100755
--- a/tools/tag_release.sh
+++ b/tools/tag_release.sh
@@ -64,5 +64,6 @@ fi
 
 # Tag the given commit (annotated, to record the committer).
 declare -r tag="release-${release}"
-(git tag -a "${tag}" "${commit}" && git push origin tag "${tag}") || \
+(git tag -m "Release ${release}" -a "${tag}" "${commit}" && \
+  git push origin tag "${tag}") || \
   (git tag -d "${tag}" && false)
-- 
cgit v1.2.3


From 26b3341b9ae08bb72971d5465c77e6c8db82c996 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 18 Nov 2019 14:55:30 -0800
Subject: platform/ptrace: use host.GetCPU instead of the getcpu syscall

This should save ~200ns from switchToApp (on ptrace too). // mpratt

PiperOrigin-RevId: 281159895
---
 pkg/sentry/platform/ptrace/BUILD                    |  1 +
 .../platform/ptrace/subprocess_linux_unsafe.go      | 21 +++------------------
 2 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index ebcc8c098..0df8cfa0f 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -28,6 +28,7 @@ go_library(
         "//pkg/procid",
         "//pkg/seccomp",
         "//pkg/sentry/arch",
+        "//pkg/sentry/hostcpu",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/interrupt",
         "//pkg/sentry/platform/safecopy",
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
index de6783fb0..2e6fbe488 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
@@ -25,6 +25,7 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
 )
 
 // maskPool contains reusable CPU masks for setting affinity. Unfortunately,
@@ -49,20 +50,6 @@ func unmaskAllSignals() syscall.Errno {
 	return errno
 }
 
-// getCPU gets the current CPU.
-//
-// Precondition: the current runtime thread should be locked.
-func getCPU() (uint32, error) {
-	var cpu uintptr
-	if _, _, errno := syscall.RawSyscall(
-		unix.SYS_GETCPU,
-		uintptr(unsafe.Pointer(&cpu)),
-		0, 0); errno != 0 {
-		return 0, errno
-	}
-	return uint32(cpu), nil
-}
-
 // setCPU sets the CPU affinity.
 func (t *thread) setCPU(cpu uint32) error {
 	mask := maskPool.Get().([]uintptr)
@@ -93,10 +80,8 @@ func (t *thread) setCPU(cpu uint32) error {
 //
 // Precondition: the current runtime thread should be locked.
 func (t *thread) bind() {
-	currentCPU, err := getCPU()
-	if err != nil {
-		return
-	}
+	currentCPU := hostcpu.GetCPU()
+
 	if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU {
 		// Set the affinity on the thread and save the CPU for next
 		// round; we don't expect CPUs to bounce around too frequently.
-- 
cgit v1.2.3


From ef6f93625457c166628fc9de57c15d986ae83159 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 18 Nov 2019 16:25:03 -0800
Subject: Add vfs.GenericParseMountOptions().

Equivalent to fs.GenericMountSourceOptions().

PiperOrigin-RevId: 281179287
---
 pkg/sentry/vfs/BUILD                   |  1 +
 pkg/sentry/vfs/filesystem_impl_util.go | 43 ++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 pkg/sentry/vfs/filesystem_impl_util.go

diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index eff4b44f6..4f2c2de9f 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -12,6 +12,7 @@ go_library(
         "file_description.go",
         "file_description_impl_util.go",
         "filesystem.go",
+        "filesystem_impl_util.go",
         "filesystem_type.go",
         "mount.go",
         "mount_unsafe.go",
diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go
new file mode 100644
index 000000000..465e610e0
--- /dev/null
+++ b/pkg/sentry/vfs/filesystem_impl_util.go
@@ -0,0 +1,43 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"strings"
+)
+
+// GenericParseMountOptions parses a comma-separated list of options of the
+// form "key" or "key=value", where neither key nor value contain commas, and
+// returns it as a map. If str contains duplicate keys, then the last value
+// wins. For example:
+//
+// str = "key0=value0,key1,key2=value2,key0=value3" -> map{'key0':'value3','key1':'','key2':'value2'}
+//
+// GenericParseMountOptions is not appropriate if values may contain commas,
+// e.g. in the case of the mpol mount option for tmpfs(5).
+func GenericParseMountOptions(str string) map[string]string {
+	m := make(map[string]string)
+	for _, opt := range strings.Split(str, ",") {
+		if len(opt) > 0 {
+			res := strings.SplitN(opt, "=", 2)
+			if len(res) == 2 {
+				m[res[0]] = res[1]
+			} else {
+				m[opt] = ""
+			}
+		}
+	}
+	return m
+}
-- 
cgit v1.2.3


From 012102eefd2b145ddee774cba28e4fa889fadd49 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Wed, 20 Nov 2019 14:54:03 -0800
Subject: Pass OpenTruncate to gofer in Open call when opening file with
 O_TRUNC.

Note that the Sentry still calls Truncate() on the file before calling Open.

A new p9 version check was added to ensure that the p9 server can handle the
the OpenTruncate flag. If not, then the flag is stripped before sending.

PiperOrigin-RevId: 281609112
---
 pkg/p9/handlers.go                 | 18 ++++-----
 pkg/p9/p9.go                       |  4 --
 pkg/p9/p9test/client_test.go       | 78 +++++++++++++++++++++++---------------
 pkg/p9/version.go                  |  8 +++-
 pkg/sentry/fs/flags.go             |  7 ++++
 pkg/sentry/fs/gofer/file_state.go  |  8 +++-
 pkg/sentry/fs/gofer/handles.go     |  5 ++-
 pkg/sentry/fs/gofer/inode.go       | 18 ++++++---
 pkg/sentry/syscalls/linux/flags.go |  1 +
 9 files changed, 95 insertions(+), 52 deletions(-)

diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index ba9a55d6d..51869c7d6 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -272,15 +272,15 @@ func (t *Tlopen) handle(cs *connState) message {
 		return newErr(syscall.EINVAL)
 	}
 
-	// Are flags valid?
-	flags := t.Flags &^ OpenFlagsIgnoreMask
-	if flags&^OpenFlagsModeMask != 0 {
-		return newErr(syscall.EINVAL)
-	}
-
-	// Is this an attempt to open a directory as writable? Don't accept.
-	if ref.mode.IsDir() && flags != ReadOnly {
-		return newErr(syscall.EINVAL)
+	if ref.mode.IsDir() {
+		// Directory must be opened ReadOnly.
+		if t.Flags&OpenFlagsModeMask != ReadOnly {
+			return newErr(syscall.EISDIR)
+		}
+		// Directory not truncatable.
+		if t.Flags&OpenTruncate != 0 {
+			return newErr(syscall.EISDIR)
+		}
 	}
 
 	var (
diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go
index 415200d60..d3090535a 100644
--- a/pkg/p9/p9.go
+++ b/pkg/p9/p9.go
@@ -47,10 +47,6 @@ const (
 	// OpenTruncate is a Tlopen flag indicating that the opened file should be
 	// truncated.
 	OpenTruncate OpenFlags = 01000
-
-	// OpenFlagsIgnoreMask is a list of OpenFlags mode bits that are ignored for Tlopen.
-	// Note that syscall.O_LARGEFILE is set to zero, use value from Linux fcntl.h.
-	OpenFlagsIgnoreMask OpenFlags = syscall.O_DIRECTORY | syscall.O_NOATIME | 0100000
 )
 
 // ConnectFlags is the mode passed to Connect operations.
diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go
index 8bbdb2488..6e758148d 100644
--- a/pkg/p9/p9test/client_test.go
+++ b/pkg/p9/p9test/client_test.go
@@ -1044,11 +1044,11 @@ func TestReaddir(t *testing.T) {
 			if _, err := f.Readdir(0, 1); err != syscall.EINVAL {
 				t.Errorf("readdir got %v, wanted EINVAL", err)
 			}
-			if _, _, _, err := f.Open(p9.ReadWrite); err != syscall.EINVAL {
-				t.Errorf("readdir got %v, wanted EINVAL", err)
+			if _, _, _, err := f.Open(p9.ReadWrite); err != syscall.EISDIR {
+				t.Errorf("readdir got %v, wanted EISDIR", err)
 			}
-			if _, _, _, err := f.Open(p9.WriteOnly); err != syscall.EINVAL {
-				t.Errorf("readdir got %v, wanted EINVAL", err)
+			if _, _, _, err := f.Open(p9.WriteOnly); err != syscall.EISDIR {
+				t.Errorf("readdir got %v, wanted EISDIR", err)
 			}
 			backend.EXPECT().Open(p9.ReadOnly).Times(1)
 			if _, _, _, err := f.Open(p9.ReadOnly); err != nil {
@@ -1065,75 +1065,93 @@ func TestReaddir(t *testing.T) {
 func TestOpen(t *testing.T) {
 	type openTest struct {
 		name  string
-		mode  p9.OpenFlags
+		flags p9.OpenFlags
 		err   error
 		match func(p9.FileMode) bool
 	}
 
 	cases := []openTest{
-		{
-			name:  "invalid",
-			mode:  ^p9.OpenFlagsModeMask,
-			err:   syscall.EINVAL,
-			match: func(p9.FileMode) bool { return true },
-		},
 		{
 			name:  "not-openable-read-only",
-			mode:  p9.ReadOnly,
+			flags: p9.ReadOnly,
 			err:   syscall.EINVAL,
 			match: func(mode p9.FileMode) bool { return !p9.CanOpen(mode) },
 		},
 		{
 			name:  "not-openable-write-only",
-			mode:  p9.WriteOnly,
+			flags: p9.WriteOnly,
 			err:   syscall.EINVAL,
 			match: func(mode p9.FileMode) bool { return !p9.CanOpen(mode) },
 		},
 		{
 			name:  "not-openable-read-write",
-			mode:  p9.ReadWrite,
+			flags: p9.ReadWrite,
 			err:   syscall.EINVAL,
 			match: func(mode p9.FileMode) bool { return !p9.CanOpen(mode) },
 		},
 		{
 			name:  "directory-read-only",
-			mode:  p9.ReadOnly,
+			flags: p9.ReadOnly,
 			err:   nil,
 			match: func(mode p9.FileMode) bool { return mode.IsDir() },
 		},
 		{
 			name:  "directory-read-write",
-			mode:  p9.ReadWrite,
-			err:   syscall.EINVAL,
+			flags: p9.ReadWrite,
+			err:   syscall.EISDIR,
 			match: func(mode p9.FileMode) bool { return mode.IsDir() },
 		},
 		{
 			name:  "directory-write-only",
-			mode:  p9.WriteOnly,
-			err:   syscall.EINVAL,
+			flags: p9.WriteOnly,
+			err:   syscall.EISDIR,
 			match: func(mode p9.FileMode) bool { return mode.IsDir() },
 		},
 		{
 			name:  "read-only",
-			mode:  p9.ReadOnly,
+			flags: p9.ReadOnly,
 			err:   nil,
 			match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) },
 		},
 		{
 			name:  "write-only",
-			mode:  p9.WriteOnly,
+			flags: p9.WriteOnly,
 			err:   nil,
 			match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) && !mode.IsDir() },
 		},
 		{
 			name:  "read-write",
-			mode:  p9.ReadWrite,
+			flags: p9.ReadWrite,
+			err:   nil,
+			match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) && !mode.IsDir() },
+		},
+		{
+			name:  "directory-read-only-truncate",
+			flags: p9.ReadOnly | p9.OpenTruncate,
+			err:   syscall.EISDIR,
+			match: func(mode p9.FileMode) bool { return mode.IsDir() },
+		},
+		{
+			name:  "read-only-truncate",
+			flags: p9.ReadOnly | p9.OpenTruncate,
+			err:   nil,
+			match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) && !mode.IsDir() },
+		},
+		{
+			name:  "write-only-truncate",
+			flags: p9.WriteOnly | p9.OpenTruncate,
+			err:   nil,
+			match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) && !mode.IsDir() },
+		},
+		{
+			name:  "read-write-truncate",
+			flags: p9.ReadWrite | p9.OpenTruncate,
 			err:   nil,
 			match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) && !mode.IsDir() },
 		},
 	}
 
-	// Open(mode OpenFlags) (*fd.FD, QID, uint32, error)
+	// Open(flags OpenFlags) (*fd.FD, QID, uint32, error)
 	// - only works on Regular, NamedPipe, BLockDevice, CharacterDevice
 	// - returning a file works as expected
 	for name := range newTypeMap(nil) {
@@ -1171,25 +1189,25 @@ func TestOpen(t *testing.T) {
 				// Attempt the given open.
 				if tc.err != nil {
 					// We expect an error, just test and return.
-					if _, _, _, err := f.Open(tc.mode); err != tc.err {
-						t.Fatalf("open with mode %v got %v, want %v", tc.mode, err, tc.err)
+					if _, _, _, err := f.Open(tc.flags); err != tc.err {
+						t.Fatalf("open with flags %v got %v, want %v", tc.flags, err, tc.err)
 					}
 					return
 				}
 
 				// Run an FD test, since we expect success.
 				fdTest(t, func(send *fd.FD) *fd.FD {
-					backend.EXPECT().Open(tc.mode).Return(send, p9.QID{}, uint32(0), nil).Times(1)
-					recv, _, _, err := f.Open(tc.mode)
+					backend.EXPECT().Open(tc.flags).Return(send, p9.QID{}, uint32(0), nil).Times(1)
+					recv, _, _, err := f.Open(tc.flags)
 					if err != tc.err {
-						t.Fatalf("open with mode %v got %v, want %v", tc.mode, err, tc.err)
+						t.Fatalf("open with flags %v got %v, want %v", tc.flags, err, tc.err)
 					}
 					return recv
 				})
 
 				// If the open was successful, attempt another one.
-				if _, _, _, err := f.Open(tc.mode); err != syscall.EINVAL {
-					t.Errorf("second open with mode %v got %v, want EINVAL", tc.mode, err)
+				if _, _, _, err := f.Open(tc.flags); err != syscall.EINVAL {
+					t.Errorf("second open with flags %v got %v, want EINVAL", tc.flags, err)
 				}
 
 				// Ensure that all illegal operations fail.
diff --git a/pkg/p9/version.go b/pkg/p9/version.go
index f1ffdd23a..36a694c58 100644
--- a/pkg/p9/version.go
+++ b/pkg/p9/version.go
@@ -26,7 +26,7 @@ const (
 	//
 	// Clients are expected to start requesting this version number and
 	// to continuously decrement it until a Tversion request succeeds.
-	highestSupportedVersion uint32 = 8
+	highestSupportedVersion uint32 = 9
 
 	// lowestSupportedVersion is the lowest supported version X in a
 	// version string of the format 9P2000.L.Google.X.
@@ -155,3 +155,9 @@ func versionSupportsTallocate(v uint32) bool {
 func versionSupportsFlipcall(v uint32) bool {
 	return v >= 8
 }
+
+// VersionSupportsOpenTruncateFlag returns true if version v supports
+// passing the OpenTruncate flag to Tlopen.
+func VersionSupportsOpenTruncateFlag(v uint32) bool {
+	return v >= 9
+}
diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go
index 0fab876a9..4338ae1fa 100644
--- a/pkg/sentry/fs/flags.go
+++ b/pkg/sentry/fs/flags.go
@@ -64,6 +64,10 @@ type FileFlags struct {
 
 	// NonSeekable indicates that file.offset isn't used.
 	NonSeekable bool
+
+	// Truncate indicates that the file should be truncated before opened.
+	// This is only applicable if the file is regular.
+	Truncate bool
 }
 
 // SettableFileFlags is a subset of FileFlags above that can be changed
@@ -118,6 +122,9 @@ func (f FileFlags) ToLinux() (mask uint) {
 	if f.LargeFile {
 		mask |= linux.O_LARGEFILE
 	}
+	if f.Truncate {
+		mask |= linux.O_TRUNC
+	}
 
 	switch {
 	case f.Read && f.Write:
diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go
index c2fbb4be9..bb8312849 100644
--- a/pkg/sentry/fs/gofer/file_state.go
+++ b/pkg/sentry/fs/gofer/file_state.go
@@ -28,8 +28,14 @@ func (f *fileOperations) afterLoad() {
 
 		// Manually load the open handles.
 		var err error
+
+		// The file may have been opened with Truncate, but we don't
+		// want to re-open it with Truncate or we will lose data.
+		flags := f.flags
+		flags.Truncate = false
+
 		// TODO(b/38173783): Context is not plumbed to save/restore.
-		f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), f.flags, f.inodeOperations.cachingInodeOps)
+		f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), flags, f.inodeOperations.cachingInodeOps)
 		if err != nil {
 			return fmt.Errorf("failed to re-open handle: %v", err)
 		}
diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go
index 39c8ec33d..b86c49b39 100644
--- a/pkg/sentry/fs/gofer/handles.go
+++ b/pkg/sentry/fs/gofer/handles.go
@@ -64,7 +64,7 @@ func (h *handles) DecRef() {
 	})
 }
 
-func newHandles(ctx context.Context, file contextFile, flags fs.FileFlags) (*handles, error) {
+func newHandles(ctx context.Context, client *p9.Client, file contextFile, flags fs.FileFlags) (*handles, error) {
 	_, newFile, err := file.walk(ctx, nil)
 	if err != nil {
 		return nil, err
@@ -81,6 +81,9 @@ func newHandles(ctx context.Context, file contextFile, flags fs.FileFlags) (*han
 	default:
 		panic("impossible fs.FileFlags")
 	}
+	if flags.Truncate && p9.VersionSupportsOpenTruncateFlag(client.Version()) {
+		p9flags |= p9.OpenTruncate
+	}
 
 	hostFile, _, _, err := newFile.open(ctx, p9flags)
 	if err != nil {
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index 99910388f..4237bf353 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -180,7 +180,7 @@ func (i *inodeFileState) setSharedHandlesLocked(flags fs.FileFlags, h *handles)
 // given flags.
 func (i *inodeFileState) getHandles(ctx context.Context, flags fs.FileFlags, cache *fsutil.CachingInodeOperations) (*handles, error) {
 	if !i.canShareHandles() {
-		return newHandles(ctx, i.file, flags)
+		return newHandles(ctx, i.s.client, i.file, flags)
 	}
 
 	i.handlesMu.Lock()
@@ -201,19 +201,25 @@ func (i *inodeFileState) getHandles(ctx context.Context, flags fs.FileFlags, cac
 // whether previously open read handle was recreated. Host mappings must be
 // invalidated if so.
 func (i *inodeFileState) getHandlesLocked(ctx context.Context, flags fs.FileFlags) (*handles, bool, error) {
-	// Do we already have usable shared handles?
-	if flags.Write {
+	// Check if we are able to use cached handles.
+	if flags.Truncate && p9.VersionSupportsOpenTruncateFlag(i.s.client.Version()) {
+		// If we are truncating (and the gofer supports it), then we
+		// always need a new handle. Don't return one from the cache.
+	} else if flags.Write {
 		if i.writeHandles != nil && (i.writeHandlesRW || !flags.Read) {
+			// File is opened for writing, and we have cached write
+			// handles that we can use.
 			i.writeHandles.IncRef()
 			return i.writeHandles, false, nil
 		}
 	} else if i.readHandles != nil {
+		// File is opened for reading and we have cached handles.
 		i.readHandles.IncRef()
 		return i.readHandles, false, nil
 	}
 
-	// No; get new handles and cache them for future sharing.
-	h, err := newHandles(ctx, i.file, flags)
+	// Get new handles and cache them for future sharing.
+	h, err := newHandles(ctx, i.s.client, i.file, flags)
 	if err != nil {
 		return nil, false, err
 	}
@@ -239,7 +245,7 @@ func (i *inodeFileState) recreateReadHandles(ctx context.Context, writer *handle
 	if !flags.Read {
 		// Writer can't be used for read, must create a new handle.
 		var err error
-		h, err = newHandles(ctx, i.file, fs.FileFlags{Read: true})
+		h, err = newHandles(ctx, i.s.client, i.file, fs.FileFlags{Read: true})
 		if err != nil {
 			return err
 		}
diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go
index 444f2b004..07961dad9 100644
--- a/pkg/sentry/syscalls/linux/flags.go
+++ b/pkg/sentry/syscalls/linux/flags.go
@@ -50,5 +50,6 @@ func linuxToFlags(mask uint) fs.FileFlags {
 		Directory:   mask&linux.O_DIRECTORY != 0,
 		Async:       mask&linux.O_ASYNC != 0,
 		LargeFile:   mask&linux.O_LARGEFILE != 0,
+		Truncate:    mask&linux.O_TRUNC != 0,
 	}
 }
-- 
cgit v1.2.3


From b6a00aa375e674617f1914b90db5ddb222b5a04e Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 20 Nov 2019 15:28:02 -0800
Subject: Use a GitHub credential for tagging a release.

PiperOrigin-RevId: 281617882
---
 kokoro/release.cfg | 14 ++++++++++++++
 scripts/release.sh | 11 +++++++++++
 2 files changed, 25 insertions(+)

diff --git a/kokoro/release.cfg b/kokoro/release.cfg
index b9d35bc51..5cec1790a 100644
--- a/kokoro/release.cfg
+++ b/kokoro/release.cfg
@@ -1 +1,15 @@
 build_file: "repo/scripts/release.sh"
+
+before_action {
+  fetch_keystore {
+    keystore_resource {
+      keystore_config_id: 73898
+      keyname: "kokoro-github-access-token"
+    }
+  }
+}
+
+env_vars {
+  key: "KOKORO_GITHUB_ACCESS_TOKEN"
+  value: "73898_kokoro-github-access-token"
+}
diff --git a/scripts/release.sh b/scripts/release.sh
index b936bcc77..091abf87f 100755
--- a/scripts/release.sh
+++ b/scripts/release.sh
@@ -34,5 +34,16 @@ declare -r EMAIL=${EMAIL:-${KOKORO_RELEASE_AUTHOR}@google.com}
 git config --get user.name || git config user.name "gVisor-bot"
 git config --get user.email || git config user.email "${EMAIL}"
 
+# Provide a credential if available.
+if [[ -v KOKORO_GITHUB_ACCESS_TOKEN ]]; then
+  git config --global credential.helper cache
+  git credential approve <<EOF
+protocol=https
+host=github.com
+username=$(cat "${KOKORO_KEYSTORE_DIR}/${KOKORO_GITHUB_ACCESS_TOKEN}")
+password=x-oauth-basic
+EOF
+fi
+
 # Run the release tool, which pushes to the origin repository.
 tools/tag_release.sh "${KOKORO_RELEASE_COMMIT}" "${KOKORO_RELEASE_TAG}"
-- 
cgit v1.2.3


From c0f89eba6ebdec08460bd796fc62d6aef674d141 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 21 Nov 2019 11:29:49 -0800
Subject: Import and structure cleanup.

PiperOrigin-RevId: 281795269
---
 pkg/eventchannel/BUILD                             |   1 +
 pkg/flipcall/BUILD                                 |   2 +-
 pkg/flipcall/flipcall_unsafe.go                    |  10 +-
 pkg/sentry/BUILD                                   |   3 +
 pkg/sentry/fs/BUILD                                |   2 +-
 pkg/sentry/fs/fdpipe/pipe_opener_test.go           |   1 +
 pkg/sentry/fs/overlay.go                           |   4 +-
 pkg/sentry/fsimpl/memfs/BUILD                      |   3 +-
 pkg/sentry/kernel/BUILD                            |   4 +-
 pkg/sentry/kernel/auth/BUILD                       |   2 +-
 pkg/sentry/kernel/futex/BUILD                      |   2 +-
 pkg/sentry/kernel/signalfd/BUILD                   |   4 +-
 pkg/sentry/kernel/task.go                          |   4 +-
 pkg/sentry/mm/BUILD                                |   2 +-
 pkg/sentry/mm/mm.go                                |   6 +-
 pkg/sentry/strace/strace.proto                     |   3 +-
 pkg/sentry/time/BUILD                              |   4 +-
 pkg/sentry/vfs/BUILD                               |   2 +-
 pkg/sentry/vfs/mount_unsafe.go                     |   4 +-
 pkg/state/object.proto                             |  56 ++++----
 pkg/syncutil/BUILD                                 |  54 ++++++++
 pkg/syncutil/LICENSE                               |  27 ++++
 pkg/syncutil/README.md                             |   5 +
 pkg/syncutil/atomicptr_unsafe.go                   |  47 +++++++
 pkg/syncutil/atomicptrtest/BUILD                   |  29 ++++
 pkg/syncutil/atomicptrtest/atomicptr_test.go       |  31 +++++
 pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go   |  21 +++
 pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go   |  16 +++
 pkg/syncutil/downgradable_rwmutex_test.go          | 150 ++++++++++++++++++++
 pkg/syncutil/downgradable_rwmutex_unsafe.go        | 143 +++++++++++++++++++
 pkg/syncutil/memmove_unsafe.go                     |  28 ++++
 pkg/syncutil/norace_unsafe.go                      |  35 +++++
 pkg/syncutil/race_unsafe.go                        |  41 ++++++
 pkg/syncutil/seqatomic_unsafe.go                   |  72 ++++++++++
 pkg/syncutil/seqatomictest/BUILD                   |  35 +++++
 pkg/syncutil/seqatomictest/seqatomic_test.go       | 132 ++++++++++++++++++
 pkg/syncutil/seqcount.go                           | 149 ++++++++++++++++++++
 pkg/syncutil/seqcount_test.go                      | 153 +++++++++++++++++++++
 pkg/syncutil/syncutil.go                           |   7 +
 test/syscalls/linux/accept_bind.cc                 |   2 +
 test/syscalls/linux/accept_bind_stream.cc          |   2 +
 test/syscalls/linux/chmod.cc                       |   1 +
 test/syscalls/linux/chroot.cc                      |   1 +
 test/syscalls/linux/clock_gettime.cc               |   1 +
 test/syscalls/linux/concurrency.cc                 |   1 +
 test/syscalls/linux/exec_binary.cc                 |   1 +
 test/syscalls/linux/file_base.h                    |   1 +
 test/syscalls/linux/flock.cc                       |   1 +
 test/syscalls/linux/fork.cc                        |   1 +
 test/syscalls/linux/getdents.cc                    |   1 +
 test/syscalls/linux/ip_socket_test_util.cc         |   5 +-
 test/syscalls/linux/memory_accounting.cc           |   1 +
 test/syscalls/linux/mlock.cc                       |   1 +
 test/syscalls/linux/mmap.cc                        |   1 +
 test/syscalls/linux/mount.cc                       |   1 +
 test/syscalls/linux/read.cc                        |   1 +
 test/syscalls/linux/rename.cc                      |   1 +
 test/syscalls/linux/seccomp.cc                     |   1 +
 test/syscalls/linux/select.cc                      |   1 +
 test/syscalls/linux/shm.cc                         |   1 -
 test/syscalls/linux/socket_blocking.cc             |   1 +
 test/syscalls/linux/socket_ip_loopback_blocking.cc |   1 +
 .../linux/socket_ip_tcp_generic_loopback.cc        |   1 +
 .../linux/socket_ip_tcp_loopback_blocking.cc       |   1 +
 .../linux/socket_ip_tcp_loopback_nonblock.cc       |   1 +
 .../socket_ipv4_tcp_unbound_external_networking.cc |   1 +
 ...et_ipv4_tcp_unbound_external_networking_test.cc |   3 +-
 ...et_ipv4_udp_unbound_external_networking_test.cc |   3 +-
 test/syscalls/linux/socket_netlink_util.cc         |   4 +-
 test/syscalls/linux/socket_unix_blocking_local.cc  |   3 +-
 test/syscalls/linux/socket_unix_dgram.cc           |   1 +
 .../linux/socket_unix_dgram_non_blocking.cc        |   1 +
 .../linux/socket_unix_non_stream_blocking_local.cc |   3 +-
 test/syscalls/linux/socket_unix_seqpacket.cc       |   1 +
 .../linux/socket_unix_stream_blocking_local.cc     |   3 +-
 .../linux/socket_unix_stream_nonblock_local.cc     |   3 +-
 .../syscalls/linux/socket_unix_unbound_abstract.cc |   1 +
 .../linux/socket_unix_unbound_filesystem.cc        |   1 +
 .../linux/socket_unix_unbound_seqpacket.cc         |   1 +
 test/syscalls/linux/socket_unix_unbound_stream.cc  |   1 +
 test/syscalls/linux/sync.cc                        |   3 +-
 test/syscalls/linux/truncate.cc                    |   1 +
 .../syscalls/linux/unix_domain_socket_test_util.cc |   1 +
 test/syscalls/linux/unix_domain_socket_test_util.h |   1 +
 test/syscalls/linux/utimes.cc                      |   1 +
 test/syscalls/linux/vdso_clock_gettime.cc          |   1 +
 test/util/fs_util_test.cc                          |   4 +-
 test/util/mount_util.h                             |   1 +
 test/util/posix_error_test.cc                      |   1 +
 test/util/rlimit_util.cc                           |   1 +
 test/util/signal_util.cc                           |   1 +
 test/util/signal_util.h                            |   1 +
 test/util/temp_path.h                              |   1 +
 test/util/test_util_test.cc                        |   1 +
 third_party/gvsync/BUILD                           |  53 -------
 third_party/gvsync/LICENSE                         |  27 ----
 third_party/gvsync/README.md                       |   3 -
 third_party/gvsync/atomicptr_unsafe.go             |  47 -------
 third_party/gvsync/atomicptrtest/BUILD             |  28 ----
 third_party/gvsync/atomicptrtest/atomicptr_test.go |  31 -----
 .../gvsync/downgradable_rwmutex_1_12_unsafe.go     |  21 ---
 .../gvsync/downgradable_rwmutex_1_13_unsafe.go     |  16 ---
 third_party/gvsync/downgradable_rwmutex_test.go    | 150 --------------------
 third_party/gvsync/downgradable_rwmutex_unsafe.go  | 143 -------------------
 third_party/gvsync/gvsync.go                       |   7 -
 third_party/gvsync/memmove_unsafe.go               |  28 ----
 third_party/gvsync/norace_unsafe.go                |  35 -----
 third_party/gvsync/race_unsafe.go                  |  41 ------
 third_party/gvsync/seqatomic_unsafe.go             |  72 ----------
 third_party/gvsync/seqatomictest/BUILD             |  34 -----
 third_party/gvsync/seqatomictest/seqatomic_test.go | 132 ------------------
 third_party/gvsync/seqcount.go                     | 149 --------------------
 third_party/gvsync/seqcount_test.go                | 153 ---------------------
 tools/go_marshal/test/BUILD                        |   3 +-
 tools/go_marshal/test/external/BUILD               |   4 +-
 115 files changed, 1302 insertions(+), 1250 deletions(-)
 create mode 100644 pkg/syncutil/BUILD
 create mode 100644 pkg/syncutil/LICENSE
 create mode 100644 pkg/syncutil/README.md
 create mode 100644 pkg/syncutil/atomicptr_unsafe.go
 create mode 100644 pkg/syncutil/atomicptrtest/BUILD
 create mode 100644 pkg/syncutil/atomicptrtest/atomicptr_test.go
 create mode 100644 pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go
 create mode 100644 pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go
 create mode 100644 pkg/syncutil/downgradable_rwmutex_test.go
 create mode 100644 pkg/syncutil/downgradable_rwmutex_unsafe.go
 create mode 100644 pkg/syncutil/memmove_unsafe.go
 create mode 100644 pkg/syncutil/norace_unsafe.go
 create mode 100644 pkg/syncutil/race_unsafe.go
 create mode 100644 pkg/syncutil/seqatomic_unsafe.go
 create mode 100644 pkg/syncutil/seqatomictest/BUILD
 create mode 100644 pkg/syncutil/seqatomictest/seqatomic_test.go
 create mode 100644 pkg/syncutil/seqcount.go
 create mode 100644 pkg/syncutil/seqcount_test.go
 create mode 100644 pkg/syncutil/syncutil.go
 delete mode 100644 third_party/gvsync/BUILD
 delete mode 100644 third_party/gvsync/LICENSE
 delete mode 100644 third_party/gvsync/README.md
 delete mode 100644 third_party/gvsync/atomicptr_unsafe.go
 delete mode 100644 third_party/gvsync/atomicptrtest/BUILD
 delete mode 100644 third_party/gvsync/atomicptrtest/atomicptr_test.go
 delete mode 100644 third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go
 delete mode 100644 third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go
 delete mode 100644 third_party/gvsync/downgradable_rwmutex_test.go
 delete mode 100644 third_party/gvsync/downgradable_rwmutex_unsafe.go
 delete mode 100644 third_party/gvsync/gvsync.go
 delete mode 100644 third_party/gvsync/memmove_unsafe.go
 delete mode 100644 third_party/gvsync/norace_unsafe.go
 delete mode 100644 third_party/gvsync/race_unsafe.go
 delete mode 100644 third_party/gvsync/seqatomic_unsafe.go
 delete mode 100644 third_party/gvsync/seqatomictest/BUILD
 delete mode 100644 third_party/gvsync/seqatomictest/seqatomic_test.go
 delete mode 100644 third_party/gvsync/seqcount.go
 delete mode 100644 third_party/gvsync/seqcount_test.go

diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD
index 71f2abc83..0b4b7cc44 100644
--- a/pkg/eventchannel/BUILD
+++ b/pkg/eventchannel/BUILD
@@ -25,6 +25,7 @@ go_library(
 proto_library(
     name = "eventchannel_proto",
     srcs = ["event.proto"],
+    visibility = ["//:sandbox"],
 )
 
 go_proto_library(
diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD
index 5643d5f26..e590a71ba 100644
--- a/pkg/flipcall/BUILD
+++ b/pkg/flipcall/BUILD
@@ -19,7 +19,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/log",
         "//pkg/memutil",
-        "//third_party/gvsync",
+        "//pkg/syncutil",
     ],
 )
 
diff --git a/pkg/flipcall/flipcall_unsafe.go b/pkg/flipcall/flipcall_unsafe.go
index a37952637..27b8939fc 100644
--- a/pkg/flipcall/flipcall_unsafe.go
+++ b/pkg/flipcall/flipcall_unsafe.go
@@ -18,7 +18,7 @@ import (
 	"reflect"
 	"unsafe"
 
-	"gvisor.dev/gvisor/third_party/gvsync"
+	"gvisor.dev/gvisor/pkg/syncutil"
 )
 
 // Packets consist of a 16-byte header followed by an arbitrarily-sized
@@ -75,13 +75,13 @@ func (ep *Endpoint) Data() []byte {
 var ioSync int64
 
 func raceBecomeActive() {
-	if gvsync.RaceEnabled {
-		gvsync.RaceAcquire((unsafe.Pointer)(&ioSync))
+	if syncutil.RaceEnabled {
+		syncutil.RaceAcquire((unsafe.Pointer)(&ioSync))
 	}
 }
 
 func raceBecomeInactive() {
-	if gvsync.RaceEnabled {
-		gvsync.RaceReleaseMerge((unsafe.Pointer)(&ioSync))
+	if syncutil.RaceEnabled {
+		syncutil.RaceReleaseMerge((unsafe.Pointer)(&ioSync))
 	}
 }
diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD
index 2d6379c86..2a7122957 100644
--- a/pkg/sentry/BUILD
+++ b/pkg/sentry/BUILD
@@ -10,5 +10,8 @@ package_group(
         "//runsc/...",
         # Code generated by go_marshal relies on go_marshal libraries.
         "//tools/go_marshal/...",
+
+        # Keep the old paths as a temporary measure.
+        "//third_party/golang/gvisor/pkg/sentry/...",
     ],
 )
diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD
index 378602cc9..c035ffff7 100644
--- a/pkg/sentry/fs/BUILD
+++ b/pkg/sentry/fs/BUILD
@@ -68,9 +68,9 @@ go_library(
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
         "//pkg/state",
+        "//pkg/syncutil",
         "//pkg/syserror",
         "//pkg/waiter",
-        "//third_party/gvsync",
     ],
 )
 
diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go
index 8e4d839e1..577445148 100644
--- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go
+++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go
@@ -25,6 +25,7 @@ import (
 	"time"
 
 	"github.com/google/uuid"
+
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go
index 1d3ff39e0..25573e986 100644
--- a/pkg/sentry/fs/overlay.go
+++ b/pkg/sentry/fs/overlay.go
@@ -23,8 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syncutil"
 	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/third_party/gvsync"
 )
 
 // The virtual filesystem implements an overlay configuration. For a high-level
@@ -199,7 +199,7 @@ type overlayEntry struct {
 	upper *Inode
 
 	// dirCacheMu protects dirCache.
-	dirCacheMu gvsync.DowngradableRWMutex `state:"nosave"`
+	dirCacheMu syncutil.DowngradableRWMutex `state:"nosave"`
 
 	// dirCache is cache of DentAttrs from upper and lower Inodes.
 	dirCache *SortedDentryMap
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index 04d667273..952b20c51 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -1,10 +1,9 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
 go_template_instance(
     name = "dentry_list",
     out = "dentry_list.go",
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index e041c51b3..2706927ff 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -35,7 +35,7 @@ go_template_instance(
     out = "seqatomic_taskgoroutineschedinfo_unsafe.go",
     package = "kernel",
     suffix = "TaskGoroutineSchedInfo",
-    template = "//third_party/gvsync:generic_seqatomic",
+    template = "//pkg/syncutil:generic_seqatomic",
     types = {
         "Value": "TaskGoroutineSchedInfo",
     },
@@ -209,12 +209,12 @@ go_library(
         "//pkg/sentry/usermem",
         "//pkg/state",
         "//pkg/state/statefile",
+        "//pkg/syncutil",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip",
         "//pkg/tcpip/stack",
         "//pkg/waiter",
-        "//third_party/gvsync",
     ],
 )
 
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 51de4568a..04c244447 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -8,7 +8,7 @@ go_template_instance(
     out = "atomicptr_credentials_unsafe.go",
     package = "auth",
     suffix = "Credentials",
-    template = "//third_party/gvsync:generic_atomicptr",
+    template = "//pkg/syncutil:generic_atomicptr",
     types = {
         "Value": "Credentials",
     },
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index 34286c7a8..75ec31761 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -9,7 +9,7 @@ go_template_instance(
     out = "atomicptr_bucket_unsafe.go",
     package = "futex",
     suffix = "Bucket",
-    template = "//third_party/gvsync:generic_atomicptr",
+    template = "//pkg/syncutil:generic_atomicptr",
     types = {
         "Value": "bucket",
     },
diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD
index 50b69d154..9f7e19b4d 100644
--- a/pkg/sentry/kernel/signalfd/BUILD
+++ b/pkg/sentry/kernel/signalfd/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "signalfd",
     srcs = ["signalfd.go"],
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 9be3dae3c..80c8e5464 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -35,8 +35,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syncutil"
 	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/third_party/gvsync"
 )
 
 // Task represents a thread of execution in the untrusted app.  It
@@ -83,7 +83,7 @@ type Task struct {
 	//
 	// gosched is protected by goschedSeq. gosched is owned by the task
 	// goroutine.
-	goschedSeq gvsync.SeqCount `state:"nosave"`
+	goschedSeq syncutil.SeqCount `state:"nosave"`
 	gosched    TaskGoroutineSchedInfo
 
 	// yieldCount is the number of times the task goroutine has called
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index a804b8b5c..839931f67 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -118,9 +118,9 @@ go_library(
         "//pkg/sentry/safemem",
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
+        "//pkg/syncutil",
         "//pkg/syserror",
         "//pkg/tcpip/buffer",
-        "//third_party/gvsync",
     ],
 )
 
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index f350e0109..58a5c186d 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -44,7 +44,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/third_party/gvsync"
+	"gvisor.dev/gvisor/pkg/syncutil"
 )
 
 // MemoryManager implements a virtual address space.
@@ -82,7 +82,7 @@ type MemoryManager struct {
 	users int32
 
 	// mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
-	mappingMu gvsync.DowngradableRWMutex `state:"nosave"`
+	mappingMu syncutil.DowngradableRWMutex `state:"nosave"`
 
 	// vmas stores virtual memory areas. Since vmas are stored by value,
 	// clients should usually use vmaIterator.ValuePtr() instead of
@@ -125,7 +125,7 @@ type MemoryManager struct {
 
 	// activeMu is loosely analogous to Linux's struct
 	// mm_struct::page_table_lock.
-	activeMu gvsync.DowngradableRWMutex `state:"nosave"`
+	activeMu syncutil.DowngradableRWMutex `state:"nosave"`
 
 	// pmas stores platform mapping areas used to implement vmas. Since pmas
 	// are stored by value, clients should usually use pmaIterator.ValuePtr()
diff --git a/pkg/sentry/strace/strace.proto b/pkg/sentry/strace/strace.proto
index 4b2f73a5f..906c52c51 100644
--- a/pkg/sentry/strace/strace.proto
+++ b/pkg/sentry/strace/strace.proto
@@ -32,8 +32,7 @@ message Strace {
   }
 }
 
-message StraceEnter {
-}
+message StraceEnter {}
 
 message StraceExit {
   // Return value formatted as string.
diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD
index d3a4cd943..18e212dff 100644
--- a/pkg/sentry/time/BUILD
+++ b/pkg/sentry/time/BUILD
@@ -9,7 +9,7 @@ go_template_instance(
     out = "seqatomic_parameters_unsafe.go",
     package = "time",
     suffix = "Parameters",
-    template = "//third_party/gvsync:generic_seqatomic",
+    template = "//pkg/syncutil:generic_seqatomic",
     types = {
         "Value": "Parameters",
     },
@@ -36,8 +36,8 @@ go_library(
     deps = [
         "//pkg/log",
         "//pkg/metric",
+        "//pkg/syncutil",
         "//pkg/syserror",
-        "//third_party/gvsync",
     ],
 )
 
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 4f2c2de9f..74a325309 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -33,9 +33,9 @@ go_library(
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
         "//pkg/sentry/usermem",
+        "//pkg/syncutil",
         "//pkg/syserror",
         "//pkg/waiter",
-        "//third_party/gvsync",
     ],
 )
 
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index 75e6c7dfa..c98b42f91 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -26,7 +26,7 @@ import (
 	"sync/atomic"
 	"unsafe"
 
-	"gvisor.dev/gvisor/third_party/gvsync"
+	"gvisor.dev/gvisor/pkg/syncutil"
 )
 
 // mountKey represents the location at which a Mount is mounted. It is
@@ -72,7 +72,7 @@ type mountTable struct {
 	// intrinsics and inline assembly, limiting the performance of this
 	// approach.)
 
-	seq  gvsync.SeqCount
+	seq  syncutil.SeqCount
 	seed uint32 // for hashing keys
 
 	// size holds both length (number of elements) and capacity (number of
diff --git a/pkg/state/object.proto b/pkg/state/object.proto
index 952289069..5ebcfb151 100644
--- a/pkg/state/object.proto
+++ b/pkg/state/object.proto
@@ -18,8 +18,8 @@ package gvisor.state.statefile;
 
 // Slice is a slice value.
 message Slice {
-  uint32 length    = 1;
-  uint32 capacity  = 2;
+  uint32 length = 1;
+  uint32 capacity = 2;
   uint64 ref_value = 3;
 }
 
@@ -30,13 +30,13 @@ message Array {
 
 // Map is a map value.
 message Map {
-  repeated Object keys   = 1;
+  repeated Object keys = 1;
   repeated Object values = 2;
 }
 
 // Interface is an interface value.
 message Interface {
-  string type  = 1;
+  string type = 1;
   Object value = 2;
 }
 
@@ -47,7 +47,7 @@ message Struct {
 
 // Field encodes a single field.
 message Field {
-  string name  = 1;
+  string name = 1;
   Object value = 2;
 }
 
@@ -113,28 +113,28 @@ message Float32s {
 // Note that ref_value references an Object.id, below.
 message Object {
   oneof value {
-    bool      bool_value          = 1;
-    bytes     string_value        = 2;
-    int64     int64_value         = 3;
-    uint64    uint64_value        = 4;
-    double    double_value        = 5;
-    uint64    ref_value           = 6;
-    Slice     slice_value         = 7;
-    Array     array_value         = 8;
-    Interface interface_value     = 9;
-    Struct    struct_value        = 10;
-    Map       map_value           = 11;
-    bytes     byte_array_value    = 12;
-    Uint16s   uint16_array_value  = 13;
-    Uint32s   uint32_array_value  = 14;
-    Uint64s   uint64_array_value  = 15;
-    Uintptrs  uintptr_array_value = 16;
-    Int8s     int8_array_value    = 17;
-    Int16s    int16_array_value   = 18;
-    Int32s    int32_array_value   = 19;
-    Int64s    int64_array_value   = 20;
-    Bools     bool_array_value    = 21;
-    Float64s  float64_array_value = 22;
-    Float32s  float32_array_value = 23;
+    bool bool_value = 1;
+    bytes string_value = 2;
+    int64 int64_value = 3;
+    uint64 uint64_value = 4;
+    double double_value = 5;
+    uint64 ref_value = 6;
+    Slice slice_value = 7;
+    Array array_value = 8;
+    Interface interface_value = 9;
+    Struct struct_value = 10;
+    Map map_value = 11;
+    bytes byte_array_value = 12;
+    Uint16s uint16_array_value = 13;
+    Uint32s uint32_array_value = 14;
+    Uint64s uint64_array_value = 15;
+    Uintptrs uintptr_array_value = 16;
+    Int8s int8_array_value = 17;
+    Int16s int16_array_value = 18;
+    Int32s int32_array_value = 19;
+    Int64s int64_array_value = 20;
+    Bools bool_array_value = 21;
+    Float64s float64_array_value = 22;
+    Float32s float32_array_value = 23;
   }
 }
diff --git a/pkg/syncutil/BUILD b/pkg/syncutil/BUILD
new file mode 100644
index 000000000..b06a90bef
--- /dev/null
+++ b/pkg/syncutil/BUILD
@@ -0,0 +1,54 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template")
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+exports_files(["LICENSE"])
+
+go_template(
+    name = "generic_atomicptr",
+    srcs = ["atomicptr_unsafe.go"],
+    types = [
+        "Value",
+    ],
+)
+
+go_template(
+    name = "generic_seqatomic",
+    srcs = ["seqatomic_unsafe.go"],
+    types = [
+        "Value",
+    ],
+    deps = [
+        ":sync",
+    ],
+)
+
+go_library(
+    name = "syncutil",
+    srcs = [
+        "downgradable_rwmutex_1_12_unsafe.go",
+        "downgradable_rwmutex_1_13_unsafe.go",
+        "downgradable_rwmutex_unsafe.go",
+        "memmove_unsafe.go",
+        "norace_unsafe.go",
+        "race_unsafe.go",
+        "seqcount.go",
+        "syncutil.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/syncutil",
+)
+
+go_test(
+    name = "syncutil_test",
+    size = "small",
+    srcs = [
+        "downgradable_rwmutex_test.go",
+        "seqcount_test.go",
+    ],
+    embed = [":syncutil"],
+)
diff --git a/pkg/syncutil/LICENSE b/pkg/syncutil/LICENSE
new file mode 100644
index 000000000..6a66aea5e
--- /dev/null
+++ b/pkg/syncutil/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/pkg/syncutil/README.md b/pkg/syncutil/README.md
new file mode 100644
index 000000000..2183c4e20
--- /dev/null
+++ b/pkg/syncutil/README.md
@@ -0,0 +1,5 @@
+# Syncutil
+
+This package provides additional synchronization primitives not provided by the
+Go stdlib 'sync' package. It is partially derived from the upstream 'sync'
+package from go1.10.
diff --git a/pkg/syncutil/atomicptr_unsafe.go b/pkg/syncutil/atomicptr_unsafe.go
new file mode 100644
index 000000000..525c4beed
--- /dev/null
+++ b/pkg/syncutil/atomicptr_unsafe.go
@@ -0,0 +1,47 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package template doesn't exist. This file must be instantiated using the
+// go_template_instance rule in tools/go_generics/defs.bzl.
+package template
+
+import (
+	"sync/atomic"
+	"unsafe"
+)
+
+// Value is a required type parameter.
+type Value struct{}
+
+// An AtomicPtr is a pointer to a value of type Value that can be atomically
+// loaded and stored. The zero value of an AtomicPtr represents nil.
+//
+// Note that copying AtomicPtr by value performs a non-atomic read of the
+// stored pointer, which is unsafe if Store() can be called concurrently; in
+// this case, do `dst.Store(src.Load())` instead.
+//
+// +stateify savable
+type AtomicPtr struct {
+	ptr unsafe.Pointer `state:".(*Value)"`
+}
+
+func (p *AtomicPtr) savePtr() *Value {
+	return p.Load()
+}
+
+func (p *AtomicPtr) loadPtr(v *Value) {
+	p.Store(v)
+}
+
+// Load returns the value set by the most recent Store. It returns nil if there
+// has been no previous call to Store.
+func (p *AtomicPtr) Load() *Value {
+	return (*Value)(atomic.LoadPointer(&p.ptr))
+}
+
+// Store sets the value returned by Load to x.
+func (p *AtomicPtr) Store(x *Value) {
+	atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x))
+}
diff --git a/pkg/syncutil/atomicptrtest/BUILD b/pkg/syncutil/atomicptrtest/BUILD
new file mode 100644
index 000000000..63f411a90
--- /dev/null
+++ b/pkg/syncutil/atomicptrtest/BUILD
@@ -0,0 +1,29 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "atomicptr_int",
+    out = "atomicptr_int_unsafe.go",
+    package = "atomicptr",
+    suffix = "Int",
+    template = "//pkg/syncutil:generic_atomicptr",
+    types = {
+        "Value": "int",
+    },
+)
+
+go_library(
+    name = "atomicptr",
+    srcs = ["atomicptr_int_unsafe.go"],
+    importpath = "gvisor.dev/gvisor/pkg/syncutil/atomicptr",
+)
+
+go_test(
+    name = "atomicptr_test",
+    size = "small",
+    srcs = ["atomicptr_test.go"],
+    embed = [":atomicptr"],
+)
diff --git a/pkg/syncutil/atomicptrtest/atomicptr_test.go b/pkg/syncutil/atomicptrtest/atomicptr_test.go
new file mode 100644
index 000000000..8fdc5112e
--- /dev/null
+++ b/pkg/syncutil/atomicptrtest/atomicptr_test.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomicptr
+
+import (
+	"testing"
+)
+
+func newInt(val int) *int {
+	return &val
+}
+
+func TestAtomicPtr(t *testing.T) {
+	var p AtomicPtrInt
+	if got := p.Load(); got != nil {
+		t.Errorf("initial value is %p (%v), wanted nil", got, got)
+	}
+	want := newInt(42)
+	p.Store(want)
+	if got := p.Load(); got != want {
+		t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want)
+	}
+	want = newInt(100)
+	p.Store(want)
+	if got := p.Load(); got != want {
+		t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want)
+	}
+}
diff --git a/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go
new file mode 100644
index 000000000..7c6336e62
--- /dev/null
+++ b/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go
@@ -0,0 +1,21 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright 2019 The gVisor Authors.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.12
+// +build !go1.13
+
+// TODO(b/133868570): Delete once Go 1.12 is no longer supported.
+
+package syncutil
+
+import _ "unsafe"
+
+//go:linkname runtimeSemrelease112 sync.runtime_Semrelease
+func runtimeSemrelease112(s *uint32, handoff bool)
+
+func runtimeSemrelease(s *uint32, handoff bool, skipframes int) {
+	// 'skipframes' is only available starting from 1.13.
+	runtimeSemrelease112(s, handoff)
+}
diff --git a/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go
new file mode 100644
index 000000000..3c3673119
--- /dev/null
+++ b/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go
@@ -0,0 +1,16 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright 2019 The gVisor Authors.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.13
+// +build !go1.15
+
+// Check go:linkname function signatures when updating Go version.
+
+package syncutil
+
+import _ "unsafe"
+
+//go:linkname runtimeSemrelease sync.runtime_Semrelease
+func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
diff --git a/pkg/syncutil/downgradable_rwmutex_test.go b/pkg/syncutil/downgradable_rwmutex_test.go
new file mode 100644
index 000000000..ffaf7ecc7
--- /dev/null
+++ b/pkg/syncutil/downgradable_rwmutex_test.go
@@ -0,0 +1,150 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright 2019 The gVisor Authors.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GOMAXPROCS=10 go test
+
+// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the
+// addition of downgradingWriter and the renaming of num_iterations to
+// numIterations to shut up Golint.
+
+package syncutil
+
+import (
+	"fmt"
+	"runtime"
+	"sync/atomic"
+	"testing"
+)
+
+func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) {
+	m.RLock()
+	clocked <- true
+	<-cunlock
+	m.RUnlock()
+	cdone <- true
+}
+
+func doTestParallelReaders(numReaders, gomaxprocs int) {
+	runtime.GOMAXPROCS(gomaxprocs)
+	var m DowngradableRWMutex
+	clocked := make(chan bool)
+	cunlock := make(chan bool)
+	cdone := make(chan bool)
+	for i := 0; i < numReaders; i++ {
+		go parallelReader(&m, clocked, cunlock, cdone)
+	}
+	// Wait for all parallel RLock()s to succeed.
+	for i := 0; i < numReaders; i++ {
+		<-clocked
+	}
+	for i := 0; i < numReaders; i++ {
+		cunlock <- true
+	}
+	// Wait for the goroutines to finish.
+	for i := 0; i < numReaders; i++ {
+		<-cdone
+	}
+}
+
+func TestParallelReaders(t *testing.T) {
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
+	doTestParallelReaders(1, 4)
+	doTestParallelReaders(3, 4)
+	doTestParallelReaders(4, 2)
+}
+
+func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
+	for i := 0; i < numIterations; i++ {
+		rwm.RLock()
+		n := atomic.AddInt32(activity, 1)
+		if n < 1 || n >= 10000 {
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ {
+		}
+		atomic.AddInt32(activity, -1)
+		rwm.RUnlock()
+	}
+	cdone <- true
+}
+
+func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
+	for i := 0; i < numIterations; i++ {
+		rwm.Lock()
+		n := atomic.AddInt32(activity, 10000)
+		if n != 10000 {
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ {
+		}
+		atomic.AddInt32(activity, -10000)
+		rwm.Unlock()
+	}
+	cdone <- true
+}
+
+func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
+	for i := 0; i < numIterations; i++ {
+		rwm.Lock()
+		n := atomic.AddInt32(activity, 10000)
+		if n != 10000 {
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ {
+		}
+		atomic.AddInt32(activity, -10000)
+		rwm.DowngradeLock()
+		n = atomic.AddInt32(activity, 1)
+		if n < 1 || n >= 10000 {
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ {
+		}
+		n = atomic.AddInt32(activity, -1)
+		rwm.RUnlock()
+	}
+	cdone <- true
+}
+
+func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) {
+	runtime.GOMAXPROCS(gomaxprocs)
+	// Number of active readers + 10000 * number of active writers.
+	var activity int32
+	var rwm DowngradableRWMutex
+	cdone := make(chan bool)
+	go writer(&rwm, numIterations, &activity, cdone)
+	go downgradingWriter(&rwm, numIterations, &activity, cdone)
+	var i int
+	for i = 0; i < numReaders/2; i++ {
+		go reader(&rwm, numIterations, &activity, cdone)
+	}
+	go writer(&rwm, numIterations, &activity, cdone)
+	go downgradingWriter(&rwm, numIterations, &activity, cdone)
+	for ; i < numReaders; i++ {
+		go reader(&rwm, numIterations, &activity, cdone)
+	}
+	// Wait for the 4 writers and all readers to finish.
+	for i := 0; i < 4+numReaders; i++ {
+		<-cdone
+	}
+}
+
+func TestDowngradableRWMutex(t *testing.T) {
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
+	n := 1000
+	if testing.Short() {
+		n = 5
+	}
+	HammerDowngradableRWMutex(1, 1, n)
+	HammerDowngradableRWMutex(1, 3, n)
+	HammerDowngradableRWMutex(1, 10, n)
+	HammerDowngradableRWMutex(4, 1, n)
+	HammerDowngradableRWMutex(4, 3, n)
+	HammerDowngradableRWMutex(4, 10, n)
+	HammerDowngradableRWMutex(10, 1, n)
+	HammerDowngradableRWMutex(10, 3, n)
+	HammerDowngradableRWMutex(10, 10, n)
+	HammerDowngradableRWMutex(10, 5, n)
+}
diff --git a/pkg/syncutil/downgradable_rwmutex_unsafe.go b/pkg/syncutil/downgradable_rwmutex_unsafe.go
new file mode 100644
index 000000000..07feca402
--- /dev/null
+++ b/pkg/syncutil/downgradable_rwmutex_unsafe.go
@@ -0,0 +1,143 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright 2019 The gVisor Authors.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.12
+// +build !go1.15
+
+// Check go:linkname function signatures when updating Go version.
+
+// This is mostly copied from the standard library's sync/rwmutex.go.
+//
+// Happens-before relationships indicated to the race detector:
+// - Unlock -> Lock (via writerSem)
+// - Unlock -> RLock (via readerSem)
+// - RUnlock -> Lock (via writerSem)
+// - DowngradeLock -> RLock (via readerSem)
+
+package syncutil
+
+import (
+	"sync"
+	"sync/atomic"
+	"unsafe"
+)
+
+//go:linkname runtimeSemacquire sync.runtime_Semacquire
+func runtimeSemacquire(s *uint32)
+
+// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock
+// method.
+type DowngradableRWMutex struct {
+	w           sync.Mutex // held if there are pending writers
+	writerSem   uint32     // semaphore for writers to wait for completing readers
+	readerSem   uint32     // semaphore for readers to wait for completing writers
+	readerCount int32      // number of pending readers
+	readerWait  int32      // number of departing readers
+}
+
+const rwmutexMaxReaders = 1 << 30
+
+// RLock locks rw for reading.
+func (rw *DowngradableRWMutex) RLock() {
+	if RaceEnabled {
+		RaceDisable()
+	}
+	if atomic.AddInt32(&rw.readerCount, 1) < 0 {
+		// A writer is pending, wait for it.
+		runtimeSemacquire(&rw.readerSem)
+	}
+	if RaceEnabled {
+		RaceEnable()
+		RaceAcquire(unsafe.Pointer(&rw.readerSem))
+	}
+}
+
+// RUnlock undoes a single RLock call.
+func (rw *DowngradableRWMutex) RUnlock() {
+	if RaceEnabled {
+		RaceReleaseMerge(unsafe.Pointer(&rw.writerSem))
+		RaceDisable()
+	}
+	if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 {
+		if r+1 == 0 || r+1 == -rwmutexMaxReaders {
+			panic("RUnlock of unlocked DowngradableRWMutex")
+		}
+		// A writer is pending.
+		if atomic.AddInt32(&rw.readerWait, -1) == 0 {
+			// The last reader unblocks the writer.
+			runtimeSemrelease(&rw.writerSem, false, 0)
+		}
+	}
+	if RaceEnabled {
+		RaceEnable()
+	}
+}
+
+// Lock locks rw for writing.
+func (rw *DowngradableRWMutex) Lock() {
+	if RaceEnabled {
+		RaceDisable()
+	}
+	// First, resolve competition with other writers.
+	rw.w.Lock()
+	// Announce to readers there is a pending writer.
+	r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders
+	// Wait for active readers.
+	if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 {
+		runtimeSemacquire(&rw.writerSem)
+	}
+	if RaceEnabled {
+		RaceEnable()
+		RaceAcquire(unsafe.Pointer(&rw.writerSem))
+	}
+}
+
+// Unlock unlocks rw for writing.
+func (rw *DowngradableRWMutex) Unlock() {
+	if RaceEnabled {
+		RaceRelease(unsafe.Pointer(&rw.writerSem))
+		RaceRelease(unsafe.Pointer(&rw.readerSem))
+		RaceDisable()
+	}
+	// Announce to readers there is no active writer.
+	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders)
+	if r >= rwmutexMaxReaders {
+		panic("Unlock of unlocked DowngradableRWMutex")
+	}
+	// Unblock blocked readers, if any.
+	for i := 0; i < int(r); i++ {
+		runtimeSemrelease(&rw.readerSem, false, 0)
+	}
+	// Allow other writers to proceed.
+	rw.w.Unlock()
+	if RaceEnabled {
+		RaceEnable()
+	}
+}
+
+// DowngradeLock atomically unlocks rw for writing and locks it for reading.
+func (rw *DowngradableRWMutex) DowngradeLock() {
+	if RaceEnabled {
+		RaceRelease(unsafe.Pointer(&rw.readerSem))
+		RaceDisable()
+	}
+	// Announce to readers there is no active writer and one additional reader.
+	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1)
+	if r >= rwmutexMaxReaders+1 {
+		panic("DowngradeLock of unlocked DowngradableRWMutex")
+	}
+	// Unblock blocked readers, if any. Note that this loop starts as 1 since r
+	// includes this goroutine.
+	for i := 1; i < int(r); i++ {
+		runtimeSemrelease(&rw.readerSem, false, 0)
+	}
+	// Allow other writers to proceed to rw.w.Lock(). Note that they will still
+	// block on rw.writerSem since at least this reader exists, such that
+	// DowngradeLock() is atomic with the previous write lock.
+	rw.w.Unlock()
+	if RaceEnabled {
+		RaceEnable()
+	}
+}
diff --git a/pkg/syncutil/memmove_unsafe.go b/pkg/syncutil/memmove_unsafe.go
new file mode 100644
index 000000000..348675baa
--- /dev/null
+++ b/pkg/syncutil/memmove_unsafe.go
@@ -0,0 +1,28 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.12
+// +build !go1.15
+
+// Check go:linkname function signatures when updating Go version.
+
+package syncutil
+
+import (
+	"unsafe"
+)
+
+//go:linkname memmove runtime.memmove
+//go:noescape
+func memmove(to, from unsafe.Pointer, n uintptr)
+
+// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad<T>, which can't
+// define it because go_generics can't update the go:linkname annotation.
+// Furthermore, go:linkname silently doesn't work if the local name is exported
+// (this is of course undocumented), which is why this indirection is
+// necessary.
+func Memmove(to, from unsafe.Pointer, n uintptr) {
+	memmove(to, from, n)
+}
diff --git a/pkg/syncutil/norace_unsafe.go b/pkg/syncutil/norace_unsafe.go
new file mode 100644
index 000000000..0a0a9deda
--- /dev/null
+++ b/pkg/syncutil/norace_unsafe.go
@@ -0,0 +1,35 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !race
+
+package syncutil
+
+import (
+	"unsafe"
+)
+
+// RaceEnabled is true if the Go data race detector is enabled.
+const RaceEnabled = false
+
+// RaceDisable has the same semantics as runtime.RaceDisable.
+func RaceDisable() {
+}
+
+// RaceEnable has the same semantics as runtime.RaceEnable.
+func RaceEnable() {
+}
+
+// RaceAcquire has the same semantics as runtime.RaceAcquire.
+func RaceAcquire(addr unsafe.Pointer) {
+}
+
+// RaceRelease has the same semantics as runtime.RaceRelease.
+func RaceRelease(addr unsafe.Pointer) {
+}
+
+// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge.
+func RaceReleaseMerge(addr unsafe.Pointer) {
+}
diff --git a/pkg/syncutil/race_unsafe.go b/pkg/syncutil/race_unsafe.go
new file mode 100644
index 000000000..206067ec1
--- /dev/null
+++ b/pkg/syncutil/race_unsafe.go
@@ -0,0 +1,41 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build race
+
+package syncutil
+
+import (
+	"runtime"
+	"unsafe"
+)
+
+// RaceEnabled is true if the Go data race detector is enabled.
+const RaceEnabled = true
+
+// RaceDisable has the same semantics as runtime.RaceDisable.
+func RaceDisable() {
+	runtime.RaceDisable()
+}
+
+// RaceEnable has the same semantics as runtime.RaceEnable.
+func RaceEnable() {
+	runtime.RaceEnable()
+}
+
+// RaceAcquire has the same semantics as runtime.RaceAcquire.
+func RaceAcquire(addr unsafe.Pointer) {
+	runtime.RaceAcquire(addr)
+}
+
+// RaceRelease has the same semantics as runtime.RaceRelease.
+func RaceRelease(addr unsafe.Pointer) {
+	runtime.RaceRelease(addr)
+}
+
+// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge.
+func RaceReleaseMerge(addr unsafe.Pointer) {
+	runtime.RaceReleaseMerge(addr)
+}
diff --git a/pkg/syncutil/seqatomic_unsafe.go b/pkg/syncutil/seqatomic_unsafe.go
new file mode 100644
index 000000000..cb6d2eb22
--- /dev/null
+++ b/pkg/syncutil/seqatomic_unsafe.go
@@ -0,0 +1,72 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package template doesn't exist. This file must be instantiated using the
+// go_template_instance rule in tools/go_generics/defs.bzl.
+package template
+
+import (
+	"fmt"
+	"reflect"
+	"strings"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/syncutil"
+)
+
+// Value is a required type parameter.
+//
+// Value must not contain any pointers, including interface objects, function
+// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs
+// containing any of the above. An init() function will panic if this property
+// does not hold.
+type Value struct{}
+
+// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
+// with any writer critical sections in sc.
+func SeqAtomicLoad(sc *syncutil.SeqCount, ptr *Value) Value {
+	// This function doesn't use SeqAtomicTryLoad because doing so is
+	// measurably, significantly (~20%) slower; Go is awful at inlining.
+	var val Value
+	for {
+		epoch := sc.BeginRead()
+		if syncutil.RaceEnabled {
+			// runtime.RaceDisable() doesn't actually stop the race detector,
+			// so it can't help us here. Instead, call runtime.memmove
+			// directly, which is not instrumented by the race detector.
+			syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
+		} else {
+			// This is ~40% faster for short reads than going through memmove.
+			val = *ptr
+		}
+		if sc.ReadOk(epoch) {
+			break
+		}
+	}
+	return val
+}
+
+// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section
+// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read
+// would race with a writer critical section, SeqAtomicTryLoad returns
+// (unspecified, false).
+func SeqAtomicTryLoad(sc *syncutil.SeqCount, epoch syncutil.SeqCountEpoch, ptr *Value) (Value, bool) {
+	var val Value
+	if syncutil.RaceEnabled {
+		syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
+	} else {
+		val = *ptr
+	}
+	return val, sc.ReadOk(epoch)
+}
+
+func init() {
+	var val Value
+	typ := reflect.TypeOf(val)
+	name := typ.Name()
+	if ptrs := syncutil.PointersInType(typ, name); len(ptrs) != 0 {
+		panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n")))
+	}
+}
diff --git a/pkg/syncutil/seqatomictest/BUILD b/pkg/syncutil/seqatomictest/BUILD
new file mode 100644
index 000000000..ba18f3238
--- /dev/null
+++ b/pkg/syncutil/seqatomictest/BUILD
@@ -0,0 +1,35 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "seqatomic_int",
+    out = "seqatomic_int_unsafe.go",
+    package = "seqatomic",
+    suffix = "Int",
+    template = "//pkg/syncutil:generic_seqatomic",
+    types = {
+        "Value": "int",
+    },
+)
+
+go_library(
+    name = "seqatomic",
+    srcs = ["seqatomic_int_unsafe.go"],
+    importpath = "gvisor.dev/gvisor/pkg/syncutil/seqatomic",
+    deps = [
+        "//pkg/syncutil",
+    ],
+)
+
+go_test(
+    name = "seqatomic_test",
+    size = "small",
+    srcs = ["seqatomic_test.go"],
+    embed = [":seqatomic"],
+    deps = [
+        "//pkg/syncutil",
+    ],
+)
diff --git a/pkg/syncutil/seqatomictest/seqatomic_test.go b/pkg/syncutil/seqatomictest/seqatomic_test.go
new file mode 100644
index 000000000..b0db44999
--- /dev/null
+++ b/pkg/syncutil/seqatomictest/seqatomic_test.go
@@ -0,0 +1,132 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package seqatomic
+
+import (
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/syncutil"
+)
+
+func TestSeqAtomicLoadUncontended(t *testing.T) {
+	var seq syncutil.SeqCount
+	const want = 1
+	data := want
+	if got := SeqAtomicLoadInt(&seq, &data); got != want {
+		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
+	}
+}
+
+func TestSeqAtomicLoadAfterWrite(t *testing.T) {
+	var seq syncutil.SeqCount
+	var data int
+	const want = 1
+	seq.BeginWrite()
+	data = want
+	seq.EndWrite()
+	if got := SeqAtomicLoadInt(&seq, &data); got != want {
+		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
+	}
+}
+
+func TestSeqAtomicLoadDuringWrite(t *testing.T) {
+	var seq syncutil.SeqCount
+	var data int
+	const want = 1
+	seq.BeginWrite()
+	go func() {
+		time.Sleep(time.Second)
+		data = want
+		seq.EndWrite()
+	}()
+	if got := SeqAtomicLoadInt(&seq, &data); got != want {
+		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
+	}
+}
+
+func TestSeqAtomicTryLoadUncontended(t *testing.T) {
+	var seq syncutil.SeqCount
+	const want = 1
+	data := want
+	epoch := seq.BeginRead()
+	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want {
+		t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want)
+	}
+}
+
+func TestSeqAtomicTryLoadDuringWrite(t *testing.T) {
+	var seq syncutil.SeqCount
+	var data int
+	epoch := seq.BeginRead()
+	seq.BeginWrite()
+	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok {
+		t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got)
+	}
+	seq.EndWrite()
+}
+
+func TestSeqAtomicTryLoadAfterWrite(t *testing.T) {
+	var seq syncutil.SeqCount
+	var data int
+	epoch := seq.BeginRead()
+	seq.BeginWrite()
+	seq.EndWrite()
+	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok {
+		t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got)
+	}
+}
+
+func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) {
+	var seq syncutil.SeqCount
+	const want = 42
+	data := want
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			if got := SeqAtomicLoadInt(&seq, &data); got != want {
+				b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
+			}
+		}
+	})
+}
+
+func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) {
+	var seq syncutil.SeqCount
+	const want = 42
+	data := want
+	b.RunParallel(func(pb *testing.PB) {
+		epoch := seq.BeginRead()
+		for pb.Next() {
+			if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want {
+				b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want)
+			}
+		}
+	})
+}
+
+// For comparison:
+func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) {
+	var a atomic.Value
+	const want = 42
+	a.Store(int(want))
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			if got := a.Load().(int); got != want {
+				b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want)
+			}
+		}
+	})
+}
diff --git a/pkg/syncutil/seqcount.go b/pkg/syncutil/seqcount.go
new file mode 100644
index 000000000..11d8dbfaa
--- /dev/null
+++ b/pkg/syncutil/seqcount.go
@@ -0,0 +1,149 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syncutil
+
+import (
+	"fmt"
+	"reflect"
+	"runtime"
+	"sync/atomic"
+)
+
+// SeqCount is a synchronization primitive for optimistic reader/writer
+// synchronization in cases where readers can work with stale data and
+// therefore do not need to block writers.
+//
+// Compared to sync/atomic.Value:
+//
+// - Mutation of SeqCount-protected data does not require memory allocation,
+// whereas atomic.Value generally does. This is a significant advantage when
+// writes are common.
+//
+// - Atomic reads of SeqCount-protected data require copying. This is a
+// disadvantage when atomic reads are common.
+//
+// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other
+// operations to be made atomic with reads of SeqCount-protected data.
+//
+// - SeqCount may be less flexible: as of this writing, SeqCount-protected data
+// cannot include pointers.
+//
+// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected
+// data require instantiating function templates using go_generics (see
+// seqatomic.go).
+type SeqCount struct {
+	// epoch is incremented by BeginWrite and EndWrite, such that epoch is odd
+	// if a writer critical section is active, and a read from data protected
+	// by this SeqCount is atomic iff epoch is the same even value before and
+	// after the read.
+	epoch uint32
+}
+
+// SeqCountEpoch tracks writer critical sections in a SeqCount.
+type SeqCountEpoch struct {
+	val uint32
+}
+
+// We assume that:
+//
+// - All functions in sync/atomic that perform a memory read are at least a
+// read fence: memory reads before calls to such functions cannot be reordered
+// after the call, and memory reads after calls to such functions cannot be
+// reordered before the call, even if those reads do not use sync/atomic.
+//
+// - All functions in sync/atomic that perform a memory write are at least a
+// write fence: memory writes before calls to such functions cannot be
+// reordered after the call, and memory writes after calls to such functions
+// cannot be reordered before the call, even if those writes do not use
+// sync/atomic.
+//
+// As of this writing, the Go memory model completely fails to describe
+// sync/atomic, but these properties are implied by
+// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8.
+
+// BeginRead indicates the beginning of a reader critical section. Reader
+// critical sections DO NOT BLOCK writer critical sections, so operations in a
+// reader critical section MAY RACE with writer critical sections. Races are
+// detected by ReadOk at the end of the reader critical section. Thus, the
+// low-level structure of readers is generally:
+//
+//     for {
+//         epoch := seq.BeginRead()
+//         // do something idempotent with seq-protected data
+//         if seq.ReadOk(epoch) {
+//             break
+//         }
+//     }
+//
+// However, since reader critical sections may race with writer critical
+// sections, the Go race detector will (accurately) flag data races in readers
+// using this pattern. Most users of SeqCount will need to use the
+// SeqAtomicLoad function template in seqatomic.go.
+func (s *SeqCount) BeginRead() SeqCountEpoch {
+	epoch := atomic.LoadUint32(&s.epoch)
+	for epoch&1 != 0 {
+		runtime.Gosched()
+		epoch = atomic.LoadUint32(&s.epoch)
+	}
+	return SeqCountEpoch{epoch}
+}
+
+// ReadOk returns true if the reader critical section initiated by a previous
+// call to BeginRead() that returned epoch did not race with any writer critical
+// sections.
+//
+// ReadOk may be called any number of times during a reader critical section.
+// Reader critical sections do not need to be explicitly terminated; the last
+// call to ReadOk is implicitly the end of the reader critical section.
+func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool {
+	return atomic.LoadUint32(&s.epoch) == epoch.val
+}
+
+// BeginWrite indicates the beginning of a writer critical section.
+//
+// SeqCount does not support concurrent writer critical sections; clients with
+// concurrent writers must synchronize them using e.g. sync.Mutex.
+func (s *SeqCount) BeginWrite() {
+	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 {
+		panic("SeqCount.BeginWrite during writer critical section")
+	}
+}
+
+// EndWrite ends the effect of a preceding BeginWrite.
+func (s *SeqCount) EndWrite() {
+	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 {
+		panic("SeqCount.EndWrite outside writer critical section")
+	}
+}
+
+// PointersInType returns a list of pointers reachable from values named
+// valName of the given type.
+//
+// PointersInType is not exhaustive, but it is guaranteed that if typ contains
+// at least one pointer, then PointersInTypeOf returns a non-empty list.
+func PointersInType(typ reflect.Type, valName string) []string {
+	switch kind := typ.Kind(); kind {
+	case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
+		return nil
+
+	case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer:
+		return []string{valName}
+
+	case reflect.Array:
+		return PointersInType(typ.Elem(), valName+"[]")
+
+	case reflect.Struct:
+		var ptrs []string
+		for i, n := 0, typ.NumField(); i < n; i++ {
+			field := typ.Field(i)
+			ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...)
+		}
+		return ptrs
+
+	default:
+		return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)}
+	}
+}
diff --git a/pkg/syncutil/seqcount_test.go b/pkg/syncutil/seqcount_test.go
new file mode 100644
index 000000000..14d6aedea
--- /dev/null
+++ b/pkg/syncutil/seqcount_test.go
@@ -0,0 +1,153 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syncutil
+
+import (
+	"reflect"
+	"testing"
+	"time"
+)
+
+func TestSeqCountWriteUncontended(t *testing.T) {
+	var seq SeqCount
+	seq.BeginWrite()
+	seq.EndWrite()
+}
+
+func TestSeqCountReadUncontended(t *testing.T) {
+	var seq SeqCount
+	epoch := seq.BeginRead()
+	if !seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got false, wanted true")
+	}
+}
+
+func TestSeqCountBeginReadAfterWrite(t *testing.T) {
+	var seq SeqCount
+	var data int32
+	const want = 1
+	seq.BeginWrite()
+	data = want
+	seq.EndWrite()
+	epoch := seq.BeginRead()
+	if data != want {
+		t.Errorf("Reader: got %v, wanted %v", data, want)
+	}
+	if !seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got false, wanted true")
+	}
+}
+
+func TestSeqCountBeginReadDuringWrite(t *testing.T) {
+	var seq SeqCount
+	var data int
+	const want = 1
+	seq.BeginWrite()
+	go func() {
+		time.Sleep(time.Second)
+		data = want
+		seq.EndWrite()
+	}()
+	epoch := seq.BeginRead()
+	if data != want {
+		t.Errorf("Reader: got %v, wanted %v", data, want)
+	}
+	if !seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got false, wanted true")
+	}
+}
+
+func TestSeqCountReadOkAfterWrite(t *testing.T) {
+	var seq SeqCount
+	epoch := seq.BeginRead()
+	seq.BeginWrite()
+	seq.EndWrite()
+	if seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got true, wanted false")
+	}
+}
+
+func TestSeqCountReadOkDuringWrite(t *testing.T) {
+	var seq SeqCount
+	epoch := seq.BeginRead()
+	seq.BeginWrite()
+	if seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got true, wanted false")
+	}
+	seq.EndWrite()
+}
+
+func BenchmarkSeqCountWriteUncontended(b *testing.B) {
+	var seq SeqCount
+	for i := 0; i < b.N; i++ {
+		seq.BeginWrite()
+		seq.EndWrite()
+	}
+}
+
+func BenchmarkSeqCountReadUncontended(b *testing.B) {
+	var seq SeqCount
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			epoch := seq.BeginRead()
+			if !seq.ReadOk(epoch) {
+				b.Fatalf("ReadOk: got false, wanted true")
+			}
+		}
+	})
+}
+
+func TestPointersInType(t *testing.T) {
+	for _, test := range []struct {
+		name string // used for both test and value name
+		val  interface{}
+		ptrs []string
+	}{
+		{
+			name: "EmptyStruct",
+			val:  struct{}{},
+		},
+		{
+			name: "Int",
+			val:  int(0),
+		},
+		{
+			name: "MixedStruct",
+			val: struct {
+				b             bool
+				I             int
+				ExportedPtr   *struct{}
+				unexportedPtr *struct{}
+				arr           [2]int
+				ptrArr        [2]*int
+				nestedStruct  struct {
+					nestedNonptr int
+					nestedPtr    *int
+				}
+				structArr [1]struct {
+					nonptr int
+					ptr    *int
+				}
+			}{},
+			ptrs: []string{
+				"MixedStruct.ExportedPtr",
+				"MixedStruct.unexportedPtr",
+				"MixedStruct.ptrArr[]",
+				"MixedStruct.nestedStruct.nestedPtr",
+				"MixedStruct.structArr[].ptr",
+			},
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			typ := reflect.TypeOf(test.val)
+			ptrs := PointersInType(typ, test.name)
+			t.Logf("Found pointers: %v", ptrs)
+			if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) {
+				t.Errorf("Got %v, wanted %v", ptrs, test.ptrs)
+			}
+		})
+	}
+}
diff --git a/pkg/syncutil/syncutil.go b/pkg/syncutil/syncutil.go
new file mode 100644
index 000000000..66e750d06
--- /dev/null
+++ b/pkg/syncutil/syncutil.go
@@ -0,0 +1,7 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package syncutil provides synchronization primitives.
+package syncutil
diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc
index 427c42ede..e08c578f0 100644
--- a/test/syscalls/linux/accept_bind.cc
+++ b/test/syscalls/linux/accept_bind.cc
@@ -14,8 +14,10 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include <algorithm>
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/accept_bind_stream.cc b/test/syscalls/linux/accept_bind_stream.cc
index 7bcd91e9e..4857f160b 100644
--- a/test/syscalls/linux/accept_bind_stream.cc
+++ b/test/syscalls/linux/accept_bind_stream.cc
@@ -14,8 +14,10 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include <algorithm>
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/chmod.cc b/test/syscalls/linux/chmod.cc
index 7e918b9b2..a06b5cfd6 100644
--- a/test/syscalls/linux/chmod.cc
+++ b/test/syscalls/linux/chmod.cc
@@ -16,6 +16,7 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
+
 #include <string>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index de1611c21..04bc2d7b9 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -19,6 +19,7 @@
 #include <sys/stat.h>
 #include <syscall.h>
 #include <unistd.h>
+
 #include <string>
 #include <vector>
 
diff --git a/test/syscalls/linux/clock_gettime.cc b/test/syscalls/linux/clock_gettime.cc
index c9e3ed6b2..2aa91691e 100644
--- a/test/syscalls/linux/clock_gettime.cc
+++ b/test/syscalls/linux/clock_gettime.cc
@@ -14,6 +14,7 @@
 
 #include <pthread.h>
 #include <sys/time.h>
+
 #include <cerrno>
 #include <cstdint>
 #include <ctime>
diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc
index 4e0a13f8b..00b96b34a 100644
--- a/test/syscalls/linux/concurrency.cc
+++ b/test/syscalls/linux/concurrency.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <signal.h>
+
 #include <atomic>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc
index 0a3931e5a..736452b0c 100644
--- a/test/syscalls/linux/exec_binary.cc
+++ b/test/syscalls/linux/exec_binary.cc
@@ -20,6 +20,7 @@
 #include <sys/types.h>
 #include <sys/user.h>
 #include <unistd.h>
+
 #include <algorithm>
 #include <functional>
 #include <iterator>
diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h
index 4d155b618..4e048320e 100644
--- a/test/syscalls/linux/file_base.h
+++ b/test/syscalls/linux/file_base.h
@@ -27,6 +27,7 @@
 #include <sys/types.h>
 #include <sys/uio.h>
 #include <unistd.h>
+
 #include <cstring>
 #include <string>
 
diff --git a/test/syscalls/linux/flock.cc b/test/syscalls/linux/flock.cc
index b4a91455d..3ecb8db8e 100644
--- a/test/syscalls/linux/flock.cc
+++ b/test/syscalls/linux/flock.cc
@@ -14,6 +14,7 @@
 
 #include <errno.h>
 #include <sys/file.h>
+
 #include <string>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index dd6e1a422..371890110 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -20,6 +20,7 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
+
 #include <atomic>
 #include <cstdlib>
 
diff --git a/test/syscalls/linux/getdents.cc b/test/syscalls/linux/getdents.cc
index fe9cfafe8..ad2dbacb8 100644
--- a/test/syscalls/linux/getdents.cc
+++ b/test/syscalls/linux/getdents.cc
@@ -23,6 +23,7 @@
 #include <sys/types.h>
 #include <syscall.h>
 #include <unistd.h>
+
 #include <map>
 #include <string>
 #include <unordered_map>
diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index 57e99596f..8398fc95f 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -12,13 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "test/syscalls/linux/ip_socket_test_util.h"
+
 #include <net/if.h>
 #include <netinet/in.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
-#include <cstring>
 
-#include "test/syscalls/linux/ip_socket_test_util.h"
+#include <cstring>
 
 namespace gvisor {
 namespace testing {
diff --git a/test/syscalls/linux/memory_accounting.cc b/test/syscalls/linux/memory_accounting.cc
index ff2f49863..94aea4077 100644
--- a/test/syscalls/linux/memory_accounting.cc
+++ b/test/syscalls/linux/memory_accounting.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <sys/mman.h>
+
 #include <map>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc
index 283c21ed3..620b4f8b4 100644
--- a/test/syscalls/linux/mlock.cc
+++ b/test/syscalls/linux/mlock.cc
@@ -16,6 +16,7 @@
 #include <sys/resource.h>
 #include <sys/syscall.h>
 #include <unistd.h>
+
 #include <cerrno>
 #include <cstring>
 
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index a112316e9..6f2639d8a 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -28,6 +28,7 @@
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>
+
 #include <vector>
 
 #include "gmock/gmock.h"
diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc
index e35be3cab..a3e9745cf 100644
--- a/test/syscalls/linux/mount.cc
+++ b/test/syscalls/linux/mount.cc
@@ -18,6 +18,7 @@
 #include <sys/mount.h>
 #include <sys/stat.h>
 #include <unistd.h>
+
 #include <functional>
 #include <memory>
 #include <string>
diff --git a/test/syscalls/linux/read.cc b/test/syscalls/linux/read.cc
index 4430fa3c2..2633ba31b 100644
--- a/test/syscalls/linux/read.cc
+++ b/test/syscalls/linux/read.cc
@@ -14,6 +14,7 @@
 
 #include <fcntl.h>
 #include <unistd.h>
+
 #include <vector>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/rename.cc b/test/syscalls/linux/rename.cc
index 5b474ff32..833c0dc4f 100644
--- a/test/syscalls/linux/rename.cc
+++ b/test/syscalls/linux/rename.cc
@@ -14,6 +14,7 @@
 
 #include <fcntl.h>
 #include <stdio.h>
+
 #include <string>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index e77586852..7e41fe7d8 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -25,6 +25,7 @@
 #include <time.h>
 #include <ucontext.h>
 #include <unistd.h>
+
 #include <atomic>
 
 #include "gmock/gmock.h"
diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc
index e06a2666d..424e2a67f 100644
--- a/test/syscalls/linux/select.cc
+++ b/test/syscalls/linux/select.cc
@@ -16,6 +16,7 @@
 #include <sys/resource.h>
 #include <sys/select.h>
 #include <sys/time.h>
+
 #include <climits>
 #include <csignal>
 #include <cstdio>
diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc
index eb7a3966f..7ba752599 100644
--- a/test/syscalls/linux/shm.cc
+++ b/test/syscalls/linux/shm.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include <stdio.h>
-
 #include <sys/ipc.h>
 #include <sys/mman.h>
 #include <sys/shm.h>
diff --git a/test/syscalls/linux/socket_blocking.cc b/test/syscalls/linux/socket_blocking.cc
index d7ce57566..7e88aa2d9 100644
--- a/test/syscalls/linux/socket_blocking.cc
+++ b/test/syscalls/linux/socket_blocking.cc
@@ -17,6 +17,7 @@
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <sys/un.h>
+
 #include <cstdio>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/socket_ip_loopback_blocking.cc b/test/syscalls/linux/socket_ip_loopback_blocking.cc
index d7fc9715b..e58eedaba 100644
--- a/test/syscalls/linux/socket_ip_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_loopback_blocking.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <netinet/tcp.h>
+
 #include <vector>
 
 #include "test/syscalls/linux/ip_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
index 0dc274e2d..d11f7cc23 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <netinet/tcp.h>
+
 #include <vector>
 
 #include "test/syscalls/linux/ip_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
index cd3ad97d0..fcd20102f 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <netinet/tcp.h>
+
 #include <vector>
 
 #include "test/syscalls/linux/ip_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
index 1acdecc17..63a05b799 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <netinet/tcp.h>
+
 #include <vector>
 
 #include "test/syscalls/linux/ip_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
index 3c3712b50..80f12b0a9 100644
--- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
@@ -18,6 +18,7 @@
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <sys/un.h>
+
 #include <cstdio>
 #include <cstring>
 
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
index 92f03e045..3ac790873 100644
--- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h"
+
 #include <vector>
 
 #include "test/syscalls/linux/ip_socket_test_util.h"
-#include "test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
 
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
index 9d4e1ab97..8f47952b0 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h"
+
 #include <vector>
 
 #include "test/syscalls/linux/ip_socket_test_util.h"
-#include "test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
 
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index 5f05bab10..723f5d728 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -12,15 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <sys/socket.h>
+#include "test/syscalls/linux/socket_netlink_util.h"
 
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
+#include <sys/socket.h>
 
 #include <vector>
 
 #include "absl/strings/str_cat.h"
-#include "test/syscalls/linux/socket_netlink_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 
 namespace gvisor {
diff --git a/test/syscalls/linux/socket_unix_blocking_local.cc b/test/syscalls/linux/socket_unix_blocking_local.cc
index 1994139e6..6f84221b2 100644
--- a/test/syscalls/linux/socket_unix_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_blocking_local.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "test/syscalls/linux/socket_blocking.h"
-
 #include <vector>
 
+#include "test/syscalls/linux/socket_blocking.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_dgram.cc b/test/syscalls/linux/socket_unix_dgram.cc
index 3245cf7c9..af0df4fb4 100644
--- a/test/syscalls/linux/socket_unix_dgram.cc
+++ b/test/syscalls/linux/socket_unix_dgram.cc
@@ -16,6 +16,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
index cd4fba25c..2db8b68d3 100644
--- a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
+++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
@@ -14,6 +14,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
index da762cd83..8855d5001 100644
--- a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "test/syscalls/linux/socket_non_stream_blocking.h"
-
 #include <vector>
 
+#include "test/syscalls/linux/socket_non_stream_blocking.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_seqpacket.cc b/test/syscalls/linux/socket_unix_seqpacket.cc
index 60fa9e38a..84d3a569e 100644
--- a/test/syscalls/linux/socket_unix_seqpacket.cc
+++ b/test/syscalls/linux/socket_unix_seqpacket.cc
@@ -16,6 +16,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
index fa0a9d367..08e579ba7 100644
--- a/test/syscalls/linux/socket_unix_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "test/syscalls/linux/socket_stream_blocking.h"
-
 #include <vector>
 
+#include "test/syscalls/linux/socket_stream_blocking.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
index ec777c59f..1936aa135 100644
--- a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
@@ -11,10 +11,9 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "test/syscalls/linux/socket_stream_nonblock.h"
-
 #include <vector>
 
+#include "test/syscalls/linux/socket_stream_nonblock.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_abstract.cc b/test/syscalls/linux/socket_unix_unbound_abstract.cc
index 7f5816ace..8b1762000 100644
--- a/test/syscalls/linux/socket_unix_unbound_abstract.cc
+++ b/test/syscalls/linux/socket_unix_unbound_abstract.cc
@@ -14,6 +14,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_filesystem.cc b/test/syscalls/linux/socket_unix_unbound_filesystem.cc
index b14f24086..cab912152 100644
--- a/test/syscalls/linux/socket_unix_unbound_filesystem.cc
+++ b/test/syscalls/linux/socket_unix_unbound_filesystem.cc
@@ -14,6 +14,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
index 50ffa1d04..cb99030f5 100644
--- a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
+++ b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
@@ -14,6 +14,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_stream.cc b/test/syscalls/linux/socket_unix_unbound_stream.cc
index 344918c34..f185dded3 100644
--- a/test/syscalls/linux/socket_unix_unbound_stream.cc
+++ b/test/syscalls/linux/socket_unix_unbound_stream.cc
@@ -14,6 +14,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/sync.cc b/test/syscalls/linux/sync.cc
index fe479390d..8aa2525a9 100644
--- a/test/syscalls/linux/sync.cc
+++ b/test/syscalls/linux/sync.cc
@@ -14,10 +14,9 @@
 
 #include <fcntl.h>
 #include <stdio.h>
-#include <unistd.h>
-
 #include <sys/syscall.h>
 #include <unistd.h>
+
 #include <string>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/truncate.cc b/test/syscalls/linux/truncate.cc
index e5cc5d97c..c988c6380 100644
--- a/test/syscalls/linux/truncate.cc
+++ b/test/syscalls/linux/truncate.cc
@@ -19,6 +19,7 @@
 #include <sys/vfs.h>
 #include <time.h>
 #include <unistd.h>
+
 #include <iostream>
 #include <string>
 
diff --git a/test/syscalls/linux/unix_domain_socket_test_util.cc b/test/syscalls/linux/unix_domain_socket_test_util.cc
index 7fb9eed8d..b05ab2900 100644
--- a/test/syscalls/linux/unix_domain_socket_test_util.cc
+++ b/test/syscalls/linux/unix_domain_socket_test_util.cc
@@ -15,6 +15,7 @@
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 
 #include <sys/un.h>
+
 #include <vector>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/unix_domain_socket_test_util.h b/test/syscalls/linux/unix_domain_socket_test_util.h
index 5eca0b7f0..b8073db17 100644
--- a/test/syscalls/linux/unix_domain_socket_test_util.h
+++ b/test/syscalls/linux/unix_domain_socket_test_util.h
@@ -16,6 +16,7 @@
 #define GVISOR_TEST_SYSCALLS_UNIX_DOMAIN_SOCKET_TEST_UTIL_H_
 
 #include <string>
+
 #include "test/syscalls/linux/socket_test_util.h"
 
 namespace gvisor {
diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc
index 80716859a..12b925a51 100644
--- a/test/syscalls/linux/utimes.cc
+++ b/test/syscalls/linux/utimes.cc
@@ -20,6 +20,7 @@
 #include <time.h>
 #include <unistd.h>
 #include <utime.h>
+
 #include <string>
 
 #include "absl/time/time.h"
diff --git a/test/syscalls/linux/vdso_clock_gettime.cc b/test/syscalls/linux/vdso_clock_gettime.cc
index 40c0014b9..ce1899f45 100644
--- a/test/syscalls/linux/vdso_clock_gettime.cc
+++ b/test/syscalls/linux/vdso_clock_gettime.cc
@@ -17,6 +17,7 @@
 #include <syscall.h>
 #include <time.h>
 #include <unistd.h>
+
 #include <map>
 #include <string>
 #include <utility>
diff --git a/test/util/fs_util_test.cc b/test/util/fs_util_test.cc
index 2a200320a..657b6a46e 100644
--- a/test/util/fs_util_test.cc
+++ b/test/util/fs_util_test.cc
@@ -12,12 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "test/util/fs_util.h"
+
 #include <errno.h>
+
 #include <vector>
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "test/util/fs_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
diff --git a/test/util/mount_util.h b/test/util/mount_util.h
index 38ec6c8a1..484de560e 100644
--- a/test/util/mount_util.h
+++ b/test/util/mount_util.h
@@ -17,6 +17,7 @@
 
 #include <errno.h>
 #include <sys/mount.h>
+
 #include <functional>
 #include <string>
 
diff --git a/test/util/posix_error_test.cc b/test/util/posix_error_test.cc
index d67270842..bf9465abb 100644
--- a/test/util/posix_error_test.cc
+++ b/test/util/posix_error_test.cc
@@ -15,6 +15,7 @@
 #include "test/util/posix_error.h"
 
 #include <errno.h>
+
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
diff --git a/test/util/rlimit_util.cc b/test/util/rlimit_util.cc
index 684253f78..d7bfc1606 100644
--- a/test/util/rlimit_util.cc
+++ b/test/util/rlimit_util.cc
@@ -15,6 +15,7 @@
 #include "test/util/rlimit_util.h"
 
 #include <sys/resource.h>
+
 #include <cerrno>
 
 #include "test/util/cleanup.h"
diff --git a/test/util/signal_util.cc b/test/util/signal_util.cc
index 26738864f..5ee95ee80 100644
--- a/test/util/signal_util.cc
+++ b/test/util/signal_util.cc
@@ -15,6 +15,7 @@
 #include "test/util/signal_util.h"
 
 #include <signal.h>
+
 #include <ostream>
 
 #include "gtest/gtest.h"
diff --git a/test/util/signal_util.h b/test/util/signal_util.h
index 7fd2af015..bcf85c337 100644
--- a/test/util/signal_util.h
+++ b/test/util/signal_util.h
@@ -18,6 +18,7 @@
 #include <signal.h>
 #include <sys/syscall.h>
 #include <unistd.h>
+
 #include <ostream>
 
 #include "gmock/gmock.h"
diff --git a/test/util/temp_path.h b/test/util/temp_path.h
index 92d669503..9e5ac11f4 100644
--- a/test/util/temp_path.h
+++ b/test/util/temp_path.h
@@ -16,6 +16,7 @@
 #define GVISOR_TEST_UTIL_TEMP_PATH_H_
 
 #include <sys/stat.h>
+
 #include <string>
 #include <utility>
 
diff --git a/test/util/test_util_test.cc b/test/util/test_util_test.cc
index b7300d9e5..f42100374 100644
--- a/test/util/test_util_test.cc
+++ b/test/util/test_util_test.cc
@@ -15,6 +15,7 @@
 #include "test/util/test_util.h"
 
 #include <errno.h>
+
 #include <vector>
 
 #include "gmock/gmock.h"
diff --git a/third_party/gvsync/BUILD b/third_party/gvsync/BUILD
deleted file mode 100644
index 7d6d59c48..000000000
--- a/third_party/gvsync/BUILD
+++ /dev/null
@@ -1,53 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template")
-
-package(
-    default_visibility = ["//:sandbox"],
-    licenses = ["notice"],
-)
-
-exports_files(["LICENSE"])
-
-go_template(
-    name = "generic_atomicptr",
-    srcs = ["atomicptr_unsafe.go"],
-    types = [
-        "Value",
-    ],
-)
-
-go_template(
-    name = "generic_seqatomic",
-    srcs = ["seqatomic_unsafe.go"],
-    types = [
-        "Value",
-    ],
-    deps = [
-        ":sync",
-    ],
-)
-
-go_library(
-    name = "gvsync",
-    srcs = [
-        "downgradable_rwmutex_1_12_unsafe.go",
-        "downgradable_rwmutex_1_13_unsafe.go",
-        "downgradable_rwmutex_unsafe.go",
-        "gvsync.go",
-        "memmove_unsafe.go",
-        "norace_unsafe.go",
-        "race_unsafe.go",
-        "seqcount.go",
-    ],
-    importpath = "gvisor.dev/gvisor/third_party/gvsync",
-)
-
-go_test(
-    name = "gvsync_test",
-    size = "small",
-    srcs = [
-        "downgradable_rwmutex_test.go",
-        "seqcount_test.go",
-    ],
-    embed = [":gvsync"],
-)
diff --git a/third_party/gvsync/LICENSE b/third_party/gvsync/LICENSE
deleted file mode 100644
index 6a66aea5e..000000000
--- a/third_party/gvsync/LICENSE
+++ /dev/null
@@ -1,27 +0,0 @@
-Copyright (c) 2009 The Go Authors. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-   * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-   * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-   * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/gvsync/README.md b/third_party/gvsync/README.md
deleted file mode 100644
index fcc7e6f44..000000000
--- a/third_party/gvsync/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-This package provides additional synchronization primitives not provided by the
-Go stdlib 'sync' package. It is partially derived from the upstream 'sync'
-package.
diff --git a/third_party/gvsync/atomicptr_unsafe.go b/third_party/gvsync/atomicptr_unsafe.go
deleted file mode 100644
index 525c4beed..000000000
--- a/third_party/gvsync/atomicptr_unsafe.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package template doesn't exist. This file must be instantiated using the
-// go_template_instance rule in tools/go_generics/defs.bzl.
-package template
-
-import (
-	"sync/atomic"
-	"unsafe"
-)
-
-// Value is a required type parameter.
-type Value struct{}
-
-// An AtomicPtr is a pointer to a value of type Value that can be atomically
-// loaded and stored. The zero value of an AtomicPtr represents nil.
-//
-// Note that copying AtomicPtr by value performs a non-atomic read of the
-// stored pointer, which is unsafe if Store() can be called concurrently; in
-// this case, do `dst.Store(src.Load())` instead.
-//
-// +stateify savable
-type AtomicPtr struct {
-	ptr unsafe.Pointer `state:".(*Value)"`
-}
-
-func (p *AtomicPtr) savePtr() *Value {
-	return p.Load()
-}
-
-func (p *AtomicPtr) loadPtr(v *Value) {
-	p.Store(v)
-}
-
-// Load returns the value set by the most recent Store. It returns nil if there
-// has been no previous call to Store.
-func (p *AtomicPtr) Load() *Value {
-	return (*Value)(atomic.LoadPointer(&p.ptr))
-}
-
-// Store sets the value returned by Load to x.
-func (p *AtomicPtr) Store(x *Value) {
-	atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x))
-}
diff --git a/third_party/gvsync/atomicptrtest/BUILD b/third_party/gvsync/atomicptrtest/BUILD
deleted file mode 100644
index 447ecf96a..000000000
--- a/third_party/gvsync/atomicptrtest/BUILD
+++ /dev/null
@@ -1,28 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
-package(licenses = ["notice"])
-
-go_template_instance(
-    name = "atomicptr_int",
-    out = "atomicptr_int_unsafe.go",
-    package = "atomicptr",
-    suffix = "Int",
-    template = "//third_party/gvsync:generic_atomicptr",
-    types = {
-        "Value": "int",
-    },
-)
-
-go_library(
-    name = "atomicptr",
-    srcs = ["atomicptr_int_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/third_party/gvsync/atomicptr",
-)
-
-go_test(
-    name = "atomicptr_test",
-    size = "small",
-    srcs = ["atomicptr_test.go"],
-    embed = [":atomicptr"],
-)
diff --git a/third_party/gvsync/atomicptrtest/atomicptr_test.go b/third_party/gvsync/atomicptrtest/atomicptr_test.go
deleted file mode 100644
index 8fdc5112e..000000000
--- a/third_party/gvsync/atomicptrtest/atomicptr_test.go
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package atomicptr
-
-import (
-	"testing"
-)
-
-func newInt(val int) *int {
-	return &val
-}
-
-func TestAtomicPtr(t *testing.T) {
-	var p AtomicPtrInt
-	if got := p.Load(); got != nil {
-		t.Errorf("initial value is %p (%v), wanted nil", got, got)
-	}
-	want := newInt(42)
-	p.Store(want)
-	if got := p.Load(); got != want {
-		t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want)
-	}
-	want = newInt(100)
-	p.Store(want)
-	if got := p.Load(); got != want {
-		t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want)
-	}
-}
diff --git a/third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go b/third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go
deleted file mode 100644
index 855b2a2b1..000000000
--- a/third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.12
-// +build !go1.13
-
-// TODO(b/133868570): Delete once Go 1.12 is no longer supported.
-
-package gvsync
-
-import _ "unsafe"
-
-//go:linkname runtimeSemrelease112 sync.runtime_Semrelease
-func runtimeSemrelease112(s *uint32, handoff bool)
-
-func runtimeSemrelease(s *uint32, handoff bool, skipframes int) {
-	// 'skipframes' is only available starting from 1.13.
-	runtimeSemrelease112(s, handoff)
-}
diff --git a/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go b/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go
deleted file mode 100644
index 3b9346843..000000000
--- a/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.13
-// +build !go1.15
-
-// Check go:linkname function signatures when updating Go version.
-
-package gvsync
-
-import _ "unsafe"
-
-//go:linkname runtimeSemrelease sync.runtime_Semrelease
-func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
diff --git a/third_party/gvsync/downgradable_rwmutex_test.go b/third_party/gvsync/downgradable_rwmutex_test.go
deleted file mode 100644
index 40c384b8b..000000000
--- a/third_party/gvsync/downgradable_rwmutex_test.go
+++ /dev/null
@@ -1,150 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// GOMAXPROCS=10 go test
-
-// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the
-// addition of downgradingWriter and the renaming of num_iterations to
-// numIterations to shut up Golint.
-
-package gvsync
-
-import (
-	"fmt"
-	"runtime"
-	"sync/atomic"
-	"testing"
-)
-
-func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) {
-	m.RLock()
-	clocked <- true
-	<-cunlock
-	m.RUnlock()
-	cdone <- true
-}
-
-func doTestParallelReaders(numReaders, gomaxprocs int) {
-	runtime.GOMAXPROCS(gomaxprocs)
-	var m DowngradableRWMutex
-	clocked := make(chan bool)
-	cunlock := make(chan bool)
-	cdone := make(chan bool)
-	for i := 0; i < numReaders; i++ {
-		go parallelReader(&m, clocked, cunlock, cdone)
-	}
-	// Wait for all parallel RLock()s to succeed.
-	for i := 0; i < numReaders; i++ {
-		<-clocked
-	}
-	for i := 0; i < numReaders; i++ {
-		cunlock <- true
-	}
-	// Wait for the goroutines to finish.
-	for i := 0; i < numReaders; i++ {
-		<-cdone
-	}
-}
-
-func TestParallelReaders(t *testing.T) {
-	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
-	doTestParallelReaders(1, 4)
-	doTestParallelReaders(3, 4)
-	doTestParallelReaders(4, 2)
-}
-
-func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
-	for i := 0; i < numIterations; i++ {
-		rwm.RLock()
-		n := atomic.AddInt32(activity, 1)
-		if n < 1 || n >= 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		atomic.AddInt32(activity, -1)
-		rwm.RUnlock()
-	}
-	cdone <- true
-}
-
-func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
-	for i := 0; i < numIterations; i++ {
-		rwm.Lock()
-		n := atomic.AddInt32(activity, 10000)
-		if n != 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		atomic.AddInt32(activity, -10000)
-		rwm.Unlock()
-	}
-	cdone <- true
-}
-
-func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
-	for i := 0; i < numIterations; i++ {
-		rwm.Lock()
-		n := atomic.AddInt32(activity, 10000)
-		if n != 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		atomic.AddInt32(activity, -10000)
-		rwm.DowngradeLock()
-		n = atomic.AddInt32(activity, 1)
-		if n < 1 || n >= 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		n = atomic.AddInt32(activity, -1)
-		rwm.RUnlock()
-	}
-	cdone <- true
-}
-
-func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) {
-	runtime.GOMAXPROCS(gomaxprocs)
-	// Number of active readers + 10000 * number of active writers.
-	var activity int32
-	var rwm DowngradableRWMutex
-	cdone := make(chan bool)
-	go writer(&rwm, numIterations, &activity, cdone)
-	go downgradingWriter(&rwm, numIterations, &activity, cdone)
-	var i int
-	for i = 0; i < numReaders/2; i++ {
-		go reader(&rwm, numIterations, &activity, cdone)
-	}
-	go writer(&rwm, numIterations, &activity, cdone)
-	go downgradingWriter(&rwm, numIterations, &activity, cdone)
-	for ; i < numReaders; i++ {
-		go reader(&rwm, numIterations, &activity, cdone)
-	}
-	// Wait for the 4 writers and all readers to finish.
-	for i := 0; i < 4+numReaders; i++ {
-		<-cdone
-	}
-}
-
-func TestDowngradableRWMutex(t *testing.T) {
-	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
-	n := 1000
-	if testing.Short() {
-		n = 5
-	}
-	HammerDowngradableRWMutex(1, 1, n)
-	HammerDowngradableRWMutex(1, 3, n)
-	HammerDowngradableRWMutex(1, 10, n)
-	HammerDowngradableRWMutex(4, 1, n)
-	HammerDowngradableRWMutex(4, 3, n)
-	HammerDowngradableRWMutex(4, 10, n)
-	HammerDowngradableRWMutex(10, 1, n)
-	HammerDowngradableRWMutex(10, 3, n)
-	HammerDowngradableRWMutex(10, 10, n)
-	HammerDowngradableRWMutex(10, 5, n)
-}
diff --git a/third_party/gvsync/downgradable_rwmutex_unsafe.go b/third_party/gvsync/downgradable_rwmutex_unsafe.go
deleted file mode 100644
index b7862d185..000000000
--- a/third_party/gvsync/downgradable_rwmutex_unsafe.go
+++ /dev/null
@@ -1,143 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.12
-// +build !go1.15
-
-// Check go:linkname function signatures when updating Go version.
-
-// This is mostly copied from the standard library's sync/rwmutex.go.
-//
-// Happens-before relationships indicated to the race detector:
-// - Unlock -> Lock (via writerSem)
-// - Unlock -> RLock (via readerSem)
-// - RUnlock -> Lock (via writerSem)
-// - DowngradeLock -> RLock (via readerSem)
-
-package gvsync
-
-import (
-	"sync"
-	"sync/atomic"
-	"unsafe"
-)
-
-//go:linkname runtimeSemacquire sync.runtime_Semacquire
-func runtimeSemacquire(s *uint32)
-
-// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock
-// method.
-type DowngradableRWMutex struct {
-	w           sync.Mutex // held if there are pending writers
-	writerSem   uint32     // semaphore for writers to wait for completing readers
-	readerSem   uint32     // semaphore for readers to wait for completing writers
-	readerCount int32      // number of pending readers
-	readerWait  int32      // number of departing readers
-}
-
-const rwmutexMaxReaders = 1 << 30
-
-// RLock locks rw for reading.
-func (rw *DowngradableRWMutex) RLock() {
-	if RaceEnabled {
-		RaceDisable()
-	}
-	if atomic.AddInt32(&rw.readerCount, 1) < 0 {
-		// A writer is pending, wait for it.
-		runtimeSemacquire(&rw.readerSem)
-	}
-	if RaceEnabled {
-		RaceEnable()
-		RaceAcquire(unsafe.Pointer(&rw.readerSem))
-	}
-}
-
-// RUnlock undoes a single RLock call.
-func (rw *DowngradableRWMutex) RUnlock() {
-	if RaceEnabled {
-		RaceReleaseMerge(unsafe.Pointer(&rw.writerSem))
-		RaceDisable()
-	}
-	if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 {
-		if r+1 == 0 || r+1 == -rwmutexMaxReaders {
-			panic("RUnlock of unlocked DowngradableRWMutex")
-		}
-		// A writer is pending.
-		if atomic.AddInt32(&rw.readerWait, -1) == 0 {
-			// The last reader unblocks the writer.
-			runtimeSemrelease(&rw.writerSem, false, 0)
-		}
-	}
-	if RaceEnabled {
-		RaceEnable()
-	}
-}
-
-// Lock locks rw for writing.
-func (rw *DowngradableRWMutex) Lock() {
-	if RaceEnabled {
-		RaceDisable()
-	}
-	// First, resolve competition with other writers.
-	rw.w.Lock()
-	// Announce to readers there is a pending writer.
-	r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders
-	// Wait for active readers.
-	if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 {
-		runtimeSemacquire(&rw.writerSem)
-	}
-	if RaceEnabled {
-		RaceEnable()
-		RaceAcquire(unsafe.Pointer(&rw.writerSem))
-	}
-}
-
-// Unlock unlocks rw for writing.
-func (rw *DowngradableRWMutex) Unlock() {
-	if RaceEnabled {
-		RaceRelease(unsafe.Pointer(&rw.writerSem))
-		RaceRelease(unsafe.Pointer(&rw.readerSem))
-		RaceDisable()
-	}
-	// Announce to readers there is no active writer.
-	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders)
-	if r >= rwmutexMaxReaders {
-		panic("Unlock of unlocked DowngradableRWMutex")
-	}
-	// Unblock blocked readers, if any.
-	for i := 0; i < int(r); i++ {
-		runtimeSemrelease(&rw.readerSem, false, 0)
-	}
-	// Allow other writers to proceed.
-	rw.w.Unlock()
-	if RaceEnabled {
-		RaceEnable()
-	}
-}
-
-// DowngradeLock atomically unlocks rw for writing and locks it for reading.
-func (rw *DowngradableRWMutex) DowngradeLock() {
-	if RaceEnabled {
-		RaceRelease(unsafe.Pointer(&rw.readerSem))
-		RaceDisable()
-	}
-	// Announce to readers there is no active writer and one additional reader.
-	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1)
-	if r >= rwmutexMaxReaders+1 {
-		panic("DowngradeLock of unlocked DowngradableRWMutex")
-	}
-	// Unblock blocked readers, if any. Note that this loop starts as 1 since r
-	// includes this goroutine.
-	for i := 1; i < int(r); i++ {
-		runtimeSemrelease(&rw.readerSem, false, 0)
-	}
-	// Allow other writers to proceed to rw.w.Lock(). Note that they will still
-	// block on rw.writerSem since at least this reader exists, such that
-	// DowngradeLock() is atomic with the previous write lock.
-	rw.w.Unlock()
-	if RaceEnabled {
-		RaceEnable()
-	}
-}
diff --git a/third_party/gvsync/gvsync.go b/third_party/gvsync/gvsync.go
deleted file mode 100644
index 3bbef13c3..000000000
--- a/third_party/gvsync/gvsync.go
+++ /dev/null
@@ -1,7 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package gvsync provides synchronization primitives.
-package gvsync
diff --git a/third_party/gvsync/memmove_unsafe.go b/third_party/gvsync/memmove_unsafe.go
deleted file mode 100644
index 9dd1d6142..000000000
--- a/third_party/gvsync/memmove_unsafe.go
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.12
-// +build !go1.15
-
-// Check go:linkname function signatures when updating Go version.
-
-package gvsync
-
-import (
-	"unsafe"
-)
-
-//go:linkname memmove runtime.memmove
-//go:noescape
-func memmove(to, from unsafe.Pointer, n uintptr)
-
-// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad<T>, which can't
-// define it because go_generics can't update the go:linkname annotation.
-// Furthermore, go:linkname silently doesn't work if the local name is exported
-// (this is of course undocumented), which is why this indirection is
-// necessary.
-func Memmove(to, from unsafe.Pointer, n uintptr) {
-	memmove(to, from, n)
-}
diff --git a/third_party/gvsync/norace_unsafe.go b/third_party/gvsync/norace_unsafe.go
deleted file mode 100644
index e3852db8c..000000000
--- a/third_party/gvsync/norace_unsafe.go
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !race
-
-package gvsync
-
-import (
-	"unsafe"
-)
-
-// RaceEnabled is true if the Go data race detector is enabled.
-const RaceEnabled = false
-
-// RaceDisable has the same semantics as runtime.RaceDisable.
-func RaceDisable() {
-}
-
-// RaceEnable has the same semantics as runtime.RaceEnable.
-func RaceEnable() {
-}
-
-// RaceAcquire has the same semantics as runtime.RaceAcquire.
-func RaceAcquire(addr unsafe.Pointer) {
-}
-
-// RaceRelease has the same semantics as runtime.RaceRelease.
-func RaceRelease(addr unsafe.Pointer) {
-}
-
-// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge.
-func RaceReleaseMerge(addr unsafe.Pointer) {
-}
diff --git a/third_party/gvsync/race_unsafe.go b/third_party/gvsync/race_unsafe.go
deleted file mode 100644
index 13c02a830..000000000
--- a/third_party/gvsync/race_unsafe.go
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build race
-
-package gvsync
-
-import (
-	"runtime"
-	"unsafe"
-)
-
-// RaceEnabled is true if the Go data race detector is enabled.
-const RaceEnabled = true
-
-// RaceDisable has the same semantics as runtime.RaceDisable.
-func RaceDisable() {
-	runtime.RaceDisable()
-}
-
-// RaceEnable has the same semantics as runtime.RaceEnable.
-func RaceEnable() {
-	runtime.RaceEnable()
-}
-
-// RaceAcquire has the same semantics as runtime.RaceAcquire.
-func RaceAcquire(addr unsafe.Pointer) {
-	runtime.RaceAcquire(addr)
-}
-
-// RaceRelease has the same semantics as runtime.RaceRelease.
-func RaceRelease(addr unsafe.Pointer) {
-	runtime.RaceRelease(addr)
-}
-
-// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge.
-func RaceReleaseMerge(addr unsafe.Pointer) {
-	runtime.RaceReleaseMerge(addr)
-}
diff --git a/third_party/gvsync/seqatomic_unsafe.go b/third_party/gvsync/seqatomic_unsafe.go
deleted file mode 100644
index 382eeed43..000000000
--- a/third_party/gvsync/seqatomic_unsafe.go
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package template doesn't exist. This file must be instantiated using the
-// go_template_instance rule in tools/go_generics/defs.bzl.
-package template
-
-import (
-	"fmt"
-	"reflect"
-	"strings"
-	"unsafe"
-
-	"gvisor.dev/gvisor/third_party/gvsync"
-)
-
-// Value is a required type parameter.
-//
-// Value must not contain any pointers, including interface objects, function
-// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs
-// containing any of the above. An init() function will panic if this property
-// does not hold.
-type Value struct{}
-
-// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
-// with any writer critical sections in sc.
-func SeqAtomicLoad(sc *gvsync.SeqCount, ptr *Value) Value {
-	// This function doesn't use SeqAtomicTryLoad because doing so is
-	// measurably, significantly (~20%) slower; Go is awful at inlining.
-	var val Value
-	for {
-		epoch := sc.BeginRead()
-		if gvsync.RaceEnabled {
-			// runtime.RaceDisable() doesn't actually stop the race detector,
-			// so it can't help us here. Instead, call runtime.memmove
-			// directly, which is not instrumented by the race detector.
-			gvsync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
-		} else {
-			// This is ~40% faster for short reads than going through memmove.
-			val = *ptr
-		}
-		if sc.ReadOk(epoch) {
-			break
-		}
-	}
-	return val
-}
-
-// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section
-// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read
-// would race with a writer critical section, SeqAtomicTryLoad returns
-// (unspecified, false).
-func SeqAtomicTryLoad(sc *gvsync.SeqCount, epoch gvsync.SeqCountEpoch, ptr *Value) (Value, bool) {
-	var val Value
-	if gvsync.RaceEnabled {
-		gvsync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
-	} else {
-		val = *ptr
-	}
-	return val, sc.ReadOk(epoch)
-}
-
-func init() {
-	var val Value
-	typ := reflect.TypeOf(val)
-	name := typ.Name()
-	if ptrs := gvsync.PointersInType(typ, name); len(ptrs) != 0 {
-		panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n")))
-	}
-}
diff --git a/third_party/gvsync/seqatomictest/BUILD b/third_party/gvsync/seqatomictest/BUILD
deleted file mode 100644
index c858c20c4..000000000
--- a/third_party/gvsync/seqatomictest/BUILD
+++ /dev/null
@@ -1,34 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
-package(licenses = ["notice"])
-
-go_template_instance(
-    name = "seqatomic_int",
-    out = "seqatomic_int_unsafe.go",
-    package = "seqatomic",
-    suffix = "Int",
-    template = "//third_party/gvsync:generic_seqatomic",
-    types = {
-        "Value": "int",
-    },
-)
-
-go_library(
-    name = "seqatomic",
-    srcs = ["seqatomic_int_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/third_party/gvsync/seqatomic",
-    deps = [
-        "//third_party/gvsync",
-    ],
-)
-
-go_test(
-    name = "seqatomic_test",
-    size = "small",
-    srcs = ["seqatomic_test.go"],
-    embed = [":seqatomic"],
-    deps = [
-        "//third_party/gvsync",
-    ],
-)
diff --git a/third_party/gvsync/seqatomictest/seqatomic_test.go b/third_party/gvsync/seqatomictest/seqatomic_test.go
deleted file mode 100644
index a5447f589..000000000
--- a/third_party/gvsync/seqatomictest/seqatomic_test.go
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package seqatomic
-
-import (
-	"sync/atomic"
-	"testing"
-	"time"
-
-	"gvisor.dev/gvisor/third_party/gvsync"
-)
-
-func TestSeqAtomicLoadUncontended(t *testing.T) {
-	var seq gvsync.SeqCount
-	const want = 1
-	data := want
-	if got := SeqAtomicLoadInt(&seq, &data); got != want {
-		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
-	}
-}
-
-func TestSeqAtomicLoadAfterWrite(t *testing.T) {
-	var seq gvsync.SeqCount
-	var data int
-	const want = 1
-	seq.BeginWrite()
-	data = want
-	seq.EndWrite()
-	if got := SeqAtomicLoadInt(&seq, &data); got != want {
-		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
-	}
-}
-
-func TestSeqAtomicLoadDuringWrite(t *testing.T) {
-	var seq gvsync.SeqCount
-	var data int
-	const want = 1
-	seq.BeginWrite()
-	go func() {
-		time.Sleep(time.Second)
-		data = want
-		seq.EndWrite()
-	}()
-	if got := SeqAtomicLoadInt(&seq, &data); got != want {
-		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
-	}
-}
-
-func TestSeqAtomicTryLoadUncontended(t *testing.T) {
-	var seq gvsync.SeqCount
-	const want = 1
-	data := want
-	epoch := seq.BeginRead()
-	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want {
-		t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want)
-	}
-}
-
-func TestSeqAtomicTryLoadDuringWrite(t *testing.T) {
-	var seq gvsync.SeqCount
-	var data int
-	epoch := seq.BeginRead()
-	seq.BeginWrite()
-	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok {
-		t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got)
-	}
-	seq.EndWrite()
-}
-
-func TestSeqAtomicTryLoadAfterWrite(t *testing.T) {
-	var seq gvsync.SeqCount
-	var data int
-	epoch := seq.BeginRead()
-	seq.BeginWrite()
-	seq.EndWrite()
-	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok {
-		t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got)
-	}
-}
-
-func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) {
-	var seq gvsync.SeqCount
-	const want = 42
-	data := want
-	b.RunParallel(func(pb *testing.PB) {
-		for pb.Next() {
-			if got := SeqAtomicLoadInt(&seq, &data); got != want {
-				b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
-			}
-		}
-	})
-}
-
-func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) {
-	var seq gvsync.SeqCount
-	const want = 42
-	data := want
-	b.RunParallel(func(pb *testing.PB) {
-		epoch := seq.BeginRead()
-		for pb.Next() {
-			if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want {
-				b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want)
-			}
-		}
-	})
-}
-
-// For comparison:
-func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) {
-	var a atomic.Value
-	const want = 42
-	a.Store(int(want))
-	b.RunParallel(func(pb *testing.PB) {
-		for pb.Next() {
-			if got := a.Load().(int); got != want {
-				b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want)
-			}
-		}
-	})
-}
diff --git a/third_party/gvsync/seqcount.go b/third_party/gvsync/seqcount.go
deleted file mode 100644
index 2c9c2c3d6..000000000
--- a/third_party/gvsync/seqcount.go
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package gvsync
-
-import (
-	"fmt"
-	"reflect"
-	"runtime"
-	"sync/atomic"
-)
-
-// SeqCount is a synchronization primitive for optimistic reader/writer
-// synchronization in cases where readers can work with stale data and
-// therefore do not need to block writers.
-//
-// Compared to sync/atomic.Value:
-//
-// - Mutation of SeqCount-protected data does not require memory allocation,
-// whereas atomic.Value generally does. This is a significant advantage when
-// writes are common.
-//
-// - Atomic reads of SeqCount-protected data require copying. This is a
-// disadvantage when atomic reads are common.
-//
-// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other
-// operations to be made atomic with reads of SeqCount-protected data.
-//
-// - SeqCount may be less flexible: as of this writing, SeqCount-protected data
-// cannot include pointers.
-//
-// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected
-// data require instantiating function templates using go_generics (see
-// seqatomic.go).
-type SeqCount struct {
-	// epoch is incremented by BeginWrite and EndWrite, such that epoch is odd
-	// if a writer critical section is active, and a read from data protected
-	// by this SeqCount is atomic iff epoch is the same even value before and
-	// after the read.
-	epoch uint32
-}
-
-// SeqCountEpoch tracks writer critical sections in a SeqCount.
-type SeqCountEpoch struct {
-	val uint32
-}
-
-// We assume that:
-//
-// - All functions in sync/atomic that perform a memory read are at least a
-// read fence: memory reads before calls to such functions cannot be reordered
-// after the call, and memory reads after calls to such functions cannot be
-// reordered before the call, even if those reads do not use sync/atomic.
-//
-// - All functions in sync/atomic that perform a memory write are at least a
-// write fence: memory writes before calls to such functions cannot be
-// reordered after the call, and memory writes after calls to such functions
-// cannot be reordered before the call, even if those writes do not use
-// sync/atomic.
-//
-// As of this writing, the Go memory model completely fails to describe
-// sync/atomic, but these properties are implied by
-// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8.
-
-// BeginRead indicates the beginning of a reader critical section. Reader
-// critical sections DO NOT BLOCK writer critical sections, so operations in a
-// reader critical section MAY RACE with writer critical sections. Races are
-// detected by ReadOk at the end of the reader critical section. Thus, the
-// low-level structure of readers is generally:
-//
-//     for {
-//         epoch := seq.BeginRead()
-//         // do something idempotent with seq-protected data
-//         if seq.ReadOk(epoch) {
-//             break
-//         }
-//     }
-//
-// However, since reader critical sections may race with writer critical
-// sections, the Go race detector will (accurately) flag data races in readers
-// using this pattern. Most users of SeqCount will need to use the
-// SeqAtomicLoad function template in seqatomic.go.
-func (s *SeqCount) BeginRead() SeqCountEpoch {
-	epoch := atomic.LoadUint32(&s.epoch)
-	for epoch&1 != 0 {
-		runtime.Gosched()
-		epoch = atomic.LoadUint32(&s.epoch)
-	}
-	return SeqCountEpoch{epoch}
-}
-
-// ReadOk returns true if the reader critical section initiated by a previous
-// call to BeginRead() that returned epoch did not race with any writer critical
-// sections.
-//
-// ReadOk may be called any number of times during a reader critical section.
-// Reader critical sections do not need to be explicitly terminated; the last
-// call to ReadOk is implicitly the end of the reader critical section.
-func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool {
-	return atomic.LoadUint32(&s.epoch) == epoch.val
-}
-
-// BeginWrite indicates the beginning of a writer critical section.
-//
-// SeqCount does not support concurrent writer critical sections; clients with
-// concurrent writers must synchronize them using e.g. sync.Mutex.
-func (s *SeqCount) BeginWrite() {
-	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 {
-		panic("SeqCount.BeginWrite during writer critical section")
-	}
-}
-
-// EndWrite ends the effect of a preceding BeginWrite.
-func (s *SeqCount) EndWrite() {
-	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 {
-		panic("SeqCount.EndWrite outside writer critical section")
-	}
-}
-
-// PointersInType returns a list of pointers reachable from values named
-// valName of the given type.
-//
-// PointersInType is not exhaustive, but it is guaranteed that if typ contains
-// at least one pointer, then PointersInTypeOf returns a non-empty list.
-func PointersInType(typ reflect.Type, valName string) []string {
-	switch kind := typ.Kind(); kind {
-	case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
-		return nil
-
-	case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer:
-		return []string{valName}
-
-	case reflect.Array:
-		return PointersInType(typ.Elem(), valName+"[]")
-
-	case reflect.Struct:
-		var ptrs []string
-		for i, n := 0, typ.NumField(); i < n; i++ {
-			field := typ.Field(i)
-			ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...)
-		}
-		return ptrs
-
-	default:
-		return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)}
-	}
-}
diff --git a/third_party/gvsync/seqcount_test.go b/third_party/gvsync/seqcount_test.go
deleted file mode 100644
index 085e574b3..000000000
--- a/third_party/gvsync/seqcount_test.go
+++ /dev/null
@@ -1,153 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package gvsync
-
-import (
-	"reflect"
-	"testing"
-	"time"
-)
-
-func TestSeqCountWriteUncontended(t *testing.T) {
-	var seq SeqCount
-	seq.BeginWrite()
-	seq.EndWrite()
-}
-
-func TestSeqCountReadUncontended(t *testing.T) {
-	var seq SeqCount
-	epoch := seq.BeginRead()
-	if !seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got false, wanted true")
-	}
-}
-
-func TestSeqCountBeginReadAfterWrite(t *testing.T) {
-	var seq SeqCount
-	var data int32
-	const want = 1
-	seq.BeginWrite()
-	data = want
-	seq.EndWrite()
-	epoch := seq.BeginRead()
-	if data != want {
-		t.Errorf("Reader: got %v, wanted %v", data, want)
-	}
-	if !seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got false, wanted true")
-	}
-}
-
-func TestSeqCountBeginReadDuringWrite(t *testing.T) {
-	var seq SeqCount
-	var data int
-	const want = 1
-	seq.BeginWrite()
-	go func() {
-		time.Sleep(time.Second)
-		data = want
-		seq.EndWrite()
-	}()
-	epoch := seq.BeginRead()
-	if data != want {
-		t.Errorf("Reader: got %v, wanted %v", data, want)
-	}
-	if !seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got false, wanted true")
-	}
-}
-
-func TestSeqCountReadOkAfterWrite(t *testing.T) {
-	var seq SeqCount
-	epoch := seq.BeginRead()
-	seq.BeginWrite()
-	seq.EndWrite()
-	if seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got true, wanted false")
-	}
-}
-
-func TestSeqCountReadOkDuringWrite(t *testing.T) {
-	var seq SeqCount
-	epoch := seq.BeginRead()
-	seq.BeginWrite()
-	if seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got true, wanted false")
-	}
-	seq.EndWrite()
-}
-
-func BenchmarkSeqCountWriteUncontended(b *testing.B) {
-	var seq SeqCount
-	for i := 0; i < b.N; i++ {
-		seq.BeginWrite()
-		seq.EndWrite()
-	}
-}
-
-func BenchmarkSeqCountReadUncontended(b *testing.B) {
-	var seq SeqCount
-	b.RunParallel(func(pb *testing.PB) {
-		for pb.Next() {
-			epoch := seq.BeginRead()
-			if !seq.ReadOk(epoch) {
-				b.Fatalf("ReadOk: got false, wanted true")
-			}
-		}
-	})
-}
-
-func TestPointersInType(t *testing.T) {
-	for _, test := range []struct {
-		name string // used for both test and value name
-		val  interface{}
-		ptrs []string
-	}{
-		{
-			name: "EmptyStruct",
-			val:  struct{}{},
-		},
-		{
-			name: "Int",
-			val:  int(0),
-		},
-		{
-			name: "MixedStruct",
-			val: struct {
-				b             bool
-				I             int
-				ExportedPtr   *struct{}
-				unexportedPtr *struct{}
-				arr           [2]int
-				ptrArr        [2]*int
-				nestedStruct  struct {
-					nestedNonptr int
-					nestedPtr    *int
-				}
-				structArr [1]struct {
-					nonptr int
-					ptr    *int
-				}
-			}{},
-			ptrs: []string{
-				"MixedStruct.ExportedPtr",
-				"MixedStruct.unexportedPtr",
-				"MixedStruct.ptrArr[]",
-				"MixedStruct.nestedStruct.nestedPtr",
-				"MixedStruct.structArr[].ptr",
-			},
-		},
-	} {
-		t.Run(test.name, func(t *testing.T) {
-			typ := reflect.TypeOf(test.val)
-			ptrs := PointersInType(typ, test.name)
-			t.Logf("Found pointers: %v", ptrs)
-			if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) {
-				t.Errorf("Got %v, wanted %v", ptrs, test.ptrs)
-			}
-		})
-	}
-}
diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD
index fa82f8e9b..d412e1ccf 100644
--- a/tools/go_marshal/test/BUILD
+++ b/tools/go_marshal/test/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_marshal:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_marshal:defs.bzl", "go_library")
-
 package_group(
     name = "gomarshal_test",
     packages = [
diff --git a/tools/go_marshal/test/external/BUILD b/tools/go_marshal/test/external/BUILD
index 8fb43179b..9bb89e1da 100644
--- a/tools/go_marshal/test/external/BUILD
+++ b/tools/go_marshal/test/external/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_marshal:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "external",
     testonly = 1,
-- 
cgit v1.2.3


From 4e27ba372e12e3186c0d03b32a7829b0d50f7a89 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 22 Nov 2019 10:42:57 -0800
Subject: tests: include sys/socket.h before linux/if_arp.h

This is how it has to be accoding to the man page.

PiperOrigin-RevId: 281998068
---
 test/syscalls/linux/socket_netlink_util.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index da99f0d60..76e772c48 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -15,6 +15,8 @@
 #ifndef GVISOR_TEST_SYSCALLS_SOCKET_NETLINK_UTIL_H_
 #define GVISOR_TEST_SYSCALLS_SOCKET_NETLINK_UTIL_H_
 
+#include <sys/socket.h>
+// socket.h has to be included before if_arp.h.
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
 
-- 
cgit v1.2.3


From 07635d20d40e1a279c4b063abaaad51048400ed7 Mon Sep 17 00:00:00 2001
From: lubinszARM <34124929+lubinszARM@users.noreply.github.com>
Date: Fri, 22 Nov 2019 11:54:04 -0800
Subject: enable ring0/pagetables to support arm64

Signed-off-by: Bin Lu <bin.lu@arm.com>
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/891 from lubinszARM:pr_pagetable 2385de75a8662af3ab1ae289dd74dd0e5dcfaf66
PiperOrigin-RevId: 282013224
---
 pkg/sentry/platform/ring0/pagetables/BUILD         |  16 +-
 pkg/sentry/platform/ring0/pagetables/pagetables.go |   9 -
 .../ring0/pagetables/pagetables_aarch64.go         | 212 ++++++++++++++
 .../platform/ring0/pagetables/pagetables_amd64.go  |   9 +
 .../platform/ring0/pagetables/pagetables_arm64.go  |  57 ++++
 .../ring0/pagetables/pagetables_arm64_test.go      |  80 ++++++
 .../platform/ring0/pagetables/walker_arm64.go      | 314 +++++++++++++++++++++
 7 files changed, 684 insertions(+), 13 deletions(-)
 create mode 100644 pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
 create mode 100644 pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
 create mode 100644 pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go
 create mode 100644 pkg/sentry/platform/ring0/pagetables/walker_arm64.go

diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 934a90378..e2e15ba5c 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -1,14 +1,17 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
 
+config_setting(
+    name = "aarch64",
+    constraint_values = ["@bazel_tools//platforms:aarch64"],
+)
+
 go_template(
     name = "generic_walker",
-    srcs = [
-        "walker_amd64.go",
-    ],
+    srcs = ["walker_amd64.go"],
     opt_types = [
         "Visitor",
     ],
@@ -76,9 +79,13 @@ go_library(
         "allocator.go",
         "allocator_unsafe.go",
         "pagetables.go",
+        "pagetables_aarch64.go",
         "pagetables_amd64.go",
+        "pagetables_arm64.go",
         "pagetables_x86.go",
         "pcids_x86.go",
+        "walker_amd64.go",
+        "walker_arm64.go",
         "walker_empty.go",
         "walker_lookup.go",
         "walker_map.go",
@@ -97,6 +104,7 @@ go_test(
     size = "small",
     srcs = [
         "pagetables_amd64_test.go",
+        "pagetables_arm64_test.go",
         "pagetables_test.go",
         "walker_check.go",
     ],
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
index 904f1a6de..30c64a372 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -48,15 +48,6 @@ func New(a Allocator) *PageTables {
 	return p
 }
 
-// Init initializes a set of PageTables.
-//
-//go:nosplit
-func (p *PageTables) Init(allocator Allocator) {
-	p.Allocator = allocator
-	p.root = p.Allocator.NewPTEs()
-	p.rootPhysical = p.Allocator.PhysicalFor(p.root)
-}
-
 // mapVisitor is used for map.
 type mapVisitor struct {
 	target   uintptr // Input.
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
new file mode 100644
index 000000000..e78424766
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
@@ -0,0 +1,212 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package pagetables
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// archPageTables is architecture-specific data.
+type archPageTables struct {
+	// root is the pagetable root for kernel space.
+	root *PTEs
+
+	// rootPhysical is the cached physical address of the root.
+	//
+	// This is saved only to prevent constant translation.
+	rootPhysical uintptr
+
+	asid uint16
+}
+
+// TTBR0_EL1 returns the translation table base register 0.
+//
+//go:nosplit
+func (p *PageTables) TTBR0_EL1(noFlush bool, asid uint16) uint64 {
+	return uint64(p.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
+}
+
+// TTBR1_EL1 returns the translation table base register 1.
+//
+//go:nosplit
+func (p *PageTables) TTBR1_EL1(noFlush bool, asid uint16) uint64 {
+	return uint64(p.archPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
+}
+
+// Bits in page table entries.
+const (
+	typeTable   = 0x3 << 0
+	typeSect    = 0x1 << 0
+	typePage    = 0x3 << 0
+	pteValid    = 0x1 << 0
+	pteTableBit = 0x1 << 1
+	pteTypeMask = 0x3 << 0
+	present     = pteValid | pteTableBit
+	user        = 0x1 << 6 /* AP[1] */
+	readOnly    = 0x1 << 7 /* AP[2] */
+	accessed    = 0x1 << 10
+	dbm         = 0x1 << 51
+	writable    = dbm
+	cont        = 0x1 << 52
+	pxn         = 0x1 << 53
+	xn          = 0x1 << 54
+	dirty       = 0x1 << 55
+	nG          = 0x1 << 11
+	shared      = 0x3 << 8
+)
+
+const (
+	mtNormal = 0x4 << 2
+)
+
+const (
+	executeDisable = xn
+	optionMask     = 0xfff | 0xfff<<48
+	protDefault    = accessed | shared | mtNormal
+)
+
+// MapOpts are x86 options.
+type MapOpts struct {
+	// AccessType defines permissions.
+	AccessType usermem.AccessType
+
+	// Global indicates the page is globally accessible.
+	Global bool
+
+	// User indicates the page is a user page.
+	User bool
+}
+
+// PTE is a page table entry.
+type PTE uintptr
+
+// Clear clears this PTE, including sect page information.
+//
+//go:nosplit
+func (p *PTE) Clear() {
+	atomic.StoreUintptr((*uintptr)(p), 0)
+}
+
+// Valid returns true iff this entry is valid.
+//
+//go:nosplit
+func (p *PTE) Valid() bool {
+	return atomic.LoadUintptr((*uintptr)(p))&present != 0
+}
+
+// Opts returns the PTE options.
+//
+// These are all options except Valid and Sect.
+//
+//go:nosplit
+func (p *PTE) Opts() MapOpts {
+	v := atomic.LoadUintptr((*uintptr)(p))
+
+	return MapOpts{
+		AccessType: usermem.AccessType{
+			Read:    true,
+			Write:   v&readOnly == 0,
+			Execute: v&xn == 0,
+		},
+		Global: v&nG == 0,
+		User:   v&user != 0,
+	}
+}
+
+// SetSect sets this page as a sect page.
+//
+// The page must not be valid or a panic will result.
+//
+//go:nosplit
+func (p *PTE) SetSect() {
+	if p.Valid() {
+		// This is not allowed.
+		panic("SetSect called on valid page!")
+	}
+	atomic.StoreUintptr((*uintptr)(p), typeSect)
+}
+
+// IsSect returns true iff this page is a sect page.
+//
+//go:nosplit
+func (p *PTE) IsSect() bool {
+	return atomic.LoadUintptr((*uintptr)(p))&pteTypeMask == typeSect
+}
+
+// Set sets this PTE value.
+//
+// This does not change the sect page property.
+//
+//go:nosplit
+func (p *PTE) Set(addr uintptr, opts MapOpts) {
+	if !opts.AccessType.Any() {
+		p.Clear()
+		return
+	}
+	v := (addr &^ optionMask) | protDefault | nG | readOnly
+
+	if p.IsSect() {
+		// Note that this is inherited from the previous instance. Set
+		// does not change the value of Sect. See above.
+		v |= typeSect
+	} else {
+		v |= typePage
+	}
+
+	if opts.Global {
+		v = v &^ nG
+	}
+
+	if opts.AccessType.Execute {
+		v = v &^ executeDisable
+	} else {
+		v |= executeDisable
+	}
+	if opts.AccessType.Write {
+		v = v &^ readOnly
+	}
+
+	if opts.User {
+		v |= user
+	} else {
+		v = v &^ user
+	}
+	atomic.StoreUintptr((*uintptr)(p), v)
+}
+
+// setPageTable sets this PTE value and forces the write bit and sect bit to
+// be cleared. This is used explicitly for breaking sect pages.
+//
+//go:nosplit
+func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) {
+	addr := pt.Allocator.PhysicalFor(ptes)
+	if addr&^optionMask != addr {
+		// This should never happen.
+		panic("unaligned physical address!")
+	}
+	v := addr | typeTable | protDefault
+	atomic.StoreUintptr((*uintptr)(p), v)
+}
+
+// Address extracts the address. This should only be used if Valid returns true.
+//
+//go:nosplit
+func (p *PTE) Address() uintptr {
+	return atomic.LoadUintptr((*uintptr)(p)) &^ optionMask
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
index 7aa6c524e..0c153cf8c 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
@@ -41,5 +41,14 @@ const (
 	entriesPerPage = 512
 )
 
+// Init initializes a set of PageTables.
+//
+//go:nosplit
+func (p *PageTables) Init(allocator Allocator) {
+	p.Allocator = allocator
+	p.root = p.Allocator.NewPTEs()
+	p.rootPhysical = p.Allocator.PhysicalFor(p.root)
+}
+
 // PTEs is a collection of entries.
 type PTEs [entriesPerPage]PTE
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
new file mode 100644
index 000000000..1a49f12a2
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
@@ -0,0 +1,57 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+// Address constraints.
+//
+// The lowerTop and upperBottom currently apply to four-level pagetables;
+// additional refactoring would be necessary to support five-level pagetables.
+const (
+	lowerTop    = 0x0000ffffffffffff
+	upperBottom = 0xffff000000000000
+	pteShift    = 12
+	pmdShift    = 21
+	pudShift    = 30
+	pgdShift    = 39
+
+	pteMask = 0x1ff << pteShift
+	pmdMask = 0x1ff << pmdShift
+	pudMask = 0x1ff << pudShift
+	pgdMask = 0x1ff << pgdShift
+
+	pteSize = 1 << pteShift
+	pmdSize = 1 << pmdShift
+	pudSize = 1 << pudShift
+	pgdSize = 1 << pgdShift
+
+	ttbrASIDOffset = 55
+	ttbrASIDMask   = 0xff
+
+	entriesPerPage = 512
+)
+
+// Init initializes a set of PageTables.
+//
+//go:nosplit
+func (p *PageTables) Init(allocator Allocator) {
+	p.Allocator = allocator
+	p.root = p.Allocator.NewPTEs()
+	p.rootPhysical = p.Allocator.PhysicalFor(p.root)
+	p.archPageTables.root = p.Allocator.NewPTEs()
+	p.archPageTables.rootPhysical = p.Allocator.PhysicalFor(p.archPageTables.root)
+}
+
+// PTEs is a collection of entries.
+type PTEs [entriesPerPage]PTE
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go
new file mode 100644
index 000000000..254116233
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go
@@ -0,0 +1,80 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package pagetables
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+func Test2MAnd4K(t *testing.T) {
+	pt := New(NewRuntimeAllocator())
+
+	// Map a small page and a huge page.
+	pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: true}, pteSize*42)
+	pt.Map(0x0000ff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: true}, pmdSize*47)
+
+	pt.Map(0xffff000000400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: false}, pteSize*42)
+	pt.Map(0xffffff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: false}, pmdSize*47)
+
+	checkMappings(t, pt, []mapping{
+		{0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: true}},
+		{0x0000ff0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read, User: true}},
+		{0xffff000000400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: false}},
+		{0xffffff0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read, User: false}},
+	})
+}
+
+func Test1GAnd4K(t *testing.T) {
+	pt := New(NewRuntimeAllocator())
+
+	// Map a small page and a super page.
+	pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: true}, pteSize*42)
+	pt.Map(0x0000ff0000000000, pudSize, MapOpts{AccessType: usermem.Read, User: true}, pudSize*47)
+
+	checkMappings(t, pt, []mapping{
+		{0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: true}},
+		{0x0000ff0000000000, pudSize, pudSize * 47, MapOpts{AccessType: usermem.Read, User: true}},
+	})
+}
+
+func TestSplit1GPage(t *testing.T) {
+	pt := New(NewRuntimeAllocator())
+
+	// Map a super page and knock out the middle.
+	pt.Map(0x0000ff0000000000, pudSize, MapOpts{AccessType: usermem.Read, User: true}, pudSize*42)
+	pt.Unmap(usermem.Addr(0x0000ff0000000000+pteSize), pudSize-(2*pteSize))
+
+	checkMappings(t, pt, []mapping{
+		{0x0000ff0000000000, pteSize, pudSize * 42, MapOpts{AccessType: usermem.Read, User: true}},
+		{0x0000ff0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, MapOpts{AccessType: usermem.Read, User: true}},
+	})
+}
+
+func TestSplit2MPage(t *testing.T) {
+	pt := New(NewRuntimeAllocator())
+
+	// Map a huge page and knock out the middle.
+	pt.Map(0x0000ff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: true}, pmdSize*42)
+	pt.Unmap(usermem.Addr(0x0000ff0000000000+pteSize), pmdSize-(2*pteSize))
+
+	checkMappings(t, pt, []mapping{
+		{0x0000ff0000000000, pteSize, pmdSize * 42, MapOpts{AccessType: usermem.Read, User: true}},
+		{0x0000ff0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: usermem.Read, User: true}},
+	})
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/walker_arm64.go b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go
new file mode 100644
index 000000000..c261d393a
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go
@@ -0,0 +1,314 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package pagetables
+
+// Visitor is a generic type.
+type Visitor interface {
+	// visit is called on each PTE.
+	visit(start uintptr, pte *PTE, align uintptr)
+
+	// requiresAlloc indicates that new entries should be allocated within
+	// the walked range.
+	requiresAlloc() bool
+
+	// requiresSplit indicates that entries in the given range should be
+	// split if they are huge or jumbo pages.
+	requiresSplit() bool
+}
+
+// Walker walks page tables.
+type Walker struct {
+	// pageTables are the tables to walk.
+	pageTables *PageTables
+
+	// Visitor is the set of arguments.
+	visitor Visitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is sect pages. If a valid sect page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of sect pages whenever
+// possible. Whether a sect page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+//
+// Precondition: start must be less than end.
+//
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *Walker) iterateRange(start, end uintptr) {
+	if start%pteSize != 0 {
+		panic("unaligned start")
+	}
+	if end < start {
+		panic("start > end")
+	}
+	if start < lowerTop {
+		if end <= lowerTop {
+			w.iterateRangeCanonical(start, end)
+		} else if end > lowerTop && end <= upperBottom {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(start, lowerTop)
+		} else {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(start, lowerTop)
+			w.iterateRangeCanonical(upperBottom, end)
+		}
+	} else if start < upperBottom {
+		if end <= upperBottom {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+		} else {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(upperBottom, end)
+		}
+	} else {
+		w.iterateRangeCanonical(start, end)
+	}
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func next(start uintptr, size uintptr) uintptr {
+	start &= ^(size - 1)
+	start += size
+	return start
+}
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *Walker) iterateRangeCanonical(start, end uintptr) {
+	pgdEntryIndex := w.pageTables.root
+	if start >= upperBottom {
+		pgdEntryIndex = w.pageTables.archPageTables.root
+	}
+
+	for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+		var (
+			pgdEntry   = &pgdEntryIndex[pgdIndex]
+			pudEntries *PTEs
+		)
+		if !pgdEntry.Valid() {
+			if !w.visitor.requiresAlloc() {
+				// Skip over this entry.
+				start = next(start, pgdSize)
+				continue
+			}
+
+			// Allocate a new pgd.
+			pudEntries = w.pageTables.Allocator.NewPTEs()
+			pgdEntry.setPageTable(w.pageTables, pudEntries)
+		} else {
+			pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+		}
+
+		// Map the next level.
+		clearPUDEntries := uint16(0)
+
+		for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+			var (
+				pudEntry   = &pudEntries[pudIndex]
+				pmdEntries *PTEs
+			)
+			if !pudEntry.Valid() {
+				if !w.visitor.requiresAlloc() {
+					// Skip over this entry.
+					clearPUDEntries++
+					start = next(start, pudSize)
+					continue
+				}
+
+				// This level has 1-GB sect pages. Is this
+				// entire region at least as large as a single
+				// PUD entry?  If so, we can skip allocating a
+				// new page for the pmd.
+				if start&(pudSize-1) == 0 && end-start >= pudSize {
+					pudEntry.SetSect()
+					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+					if pudEntry.Valid() {
+						start = next(start, pudSize)
+						continue
+					}
+				}
+
+				// Allocate a new pud.
+				pmdEntries = w.pageTables.Allocator.NewPTEs()
+				pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+			} else if pudEntry.IsSect() {
+				// Does this page need to be split?
+				if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < next(start, pudSize)) {
+					// Install the relevant entries.
+					pmdEntries = w.pageTables.Allocator.NewPTEs()
+					for index := uint16(0); index < entriesPerPage; index++ {
+						pmdEntries[index].SetSect()
+						pmdEntries[index].Set(
+							pudEntry.Address()+(pmdSize*uintptr(index)),
+							pudEntry.Opts())
+					}
+					pudEntry.setPageTable(w.pageTables, pmdEntries)
+				} else {
+					// A sect page to be checked directly.
+					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+
+					// Might have been cleared.
+					if !pudEntry.Valid() {
+						clearPUDEntries++
+					}
+
+					// Note that the sect page was changed.
+					start = next(start, pudSize)
+					continue
+				}
+
+			} else {
+				pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+			}
+
+			// Map the next level, since this is valid.
+			clearPMDEntries := uint16(0)
+
+			for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+				var (
+					pmdEntry   = &pmdEntries[pmdIndex]
+					pteEntries *PTEs
+				)
+				if !pmdEntry.Valid() {
+					if !w.visitor.requiresAlloc() {
+						// Skip over this entry.
+						clearPMDEntries++
+						start = next(start, pmdSize)
+						continue
+					}
+
+					// This level has 2-MB huge pages. If this
+					// region is contined in a single PMD entry?
+					// As above, we can skip allocating a new page.
+					if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+						pmdEntry.SetSect()
+						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+						if pmdEntry.Valid() {
+							start = next(start, pmdSize)
+							continue
+						}
+					}
+
+					// Allocate a new pmd.
+					pteEntries = w.pageTables.Allocator.NewPTEs()
+					pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+				} else if pmdEntry.IsSect() {
+					// Does this page need to be split?
+					if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < next(start, pmdSize)) {
+						// Install the relevant entries.
+						pteEntries = w.pageTables.Allocator.NewPTEs()
+						for index := uint16(0); index < entriesPerPage; index++ {
+							pteEntries[index].Set(
+								pmdEntry.Address()+(pteSize*uintptr(index)),
+								pmdEntry.Opts())
+						}
+						pmdEntry.setPageTable(w.pageTables, pteEntries)
+					} else {
+						// A huge page to be checked directly.
+						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+
+						// Might have been cleared.
+						if !pmdEntry.Valid() {
+							clearPMDEntries++
+						}
+
+						// Note that the huge page was changed.
+						start = next(start, pmdSize)
+						continue
+					}
+
+				} else {
+					pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+				}
+
+				// Map the next level, since this is valid.
+				clearPTEEntries := uint16(0)
+
+				for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+					var (
+						pteEntry = &pteEntries[pteIndex]
+					)
+					if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+						clearPTEEntries++
+						start += pteSize
+						continue
+					}
+
+					// At this point, we are guaranteed that start%pteSize == 0.
+					w.visitor.visit(uintptr(start), pteEntry, pteSize-1)
+					if !pteEntry.Valid() {
+						if w.visitor.requiresAlloc() {
+							panic("PTE not set after iteration with requiresAlloc!")
+						}
+						clearPTEEntries++
+					}
+
+					// Note that the pte was changed.
+					start += pteSize
+					continue
+				}
+
+				// Check if we no longer need this page.
+				if clearPTEEntries == entriesPerPage {
+					pmdEntry.Clear()
+					w.pageTables.Allocator.FreePTEs(pteEntries)
+					clearPMDEntries++
+				}
+			}
+
+			// Check if we no longer need this page.
+			if clearPMDEntries == entriesPerPage {
+				pudEntry.Clear()
+				w.pageTables.Allocator.FreePTEs(pmdEntries)
+				clearPUDEntries++
+			}
+		}
+
+		// Check if we no longer need this page.
+		if clearPUDEntries == entriesPerPage {
+			pgdEntry.Clear()
+			w.pageTables.Allocator.FreePTEs(pudEntries)
+		}
+	}
+}
-- 
cgit v1.2.3


From f27f38d13717a25721efb2b37fabadae5c34e374 Mon Sep 17 00:00:00 2001
From: Mithun Iyer <iyerm@google.com>
Date: Fri, 22 Nov 2019 12:53:49 -0800
Subject: Add segment dequeue check while emptying segment queue.

PiperOrigin-RevId: 282023891
---
 pkg/tcpip/transport/tcp/connect.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 364067731..75b7c0828 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -1368,8 +1368,11 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	// unlocks e.mu. Now that no new segments can get enqueued to this
 	// endpoint, try to re-match the segment to a different endpoint
 	// as the current endpoint is closed.
-	for !e.segmentQueue.empty() {
+	for {
 		s := e.segmentQueue.dequeue()
+		if s == nil {
+			break
+		}
 		e.tryDeliverSegmentFromClosedEndpoint(s)
 	}
 
-- 
cgit v1.2.3


From 9db08c4e583e758e3eb1aed03875743ce02b8cff Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 22 Nov 2019 14:41:04 -0800
Subject: Use PacketBuffers with GSO.

PiperOrigin-RevId: 282045221
---
 pkg/tcpip/link/channel/channel.go        | 12 ++++-----
 pkg/tcpip/link/fdbased/endpoint.go       | 25 +++++++++++++------
 pkg/tcpip/link/loopback/loopback.go      |  2 +-
 pkg/tcpip/link/muxed/injectable.go       |  4 +--
 pkg/tcpip/link/sharedmem/sharedmem.go    |  2 +-
 pkg/tcpip/link/sniffer/sniffer.go        | 12 ++++-----
 pkg/tcpip/link/waitable/waitable.go      |  6 ++---
 pkg/tcpip/link/waitable/waitable_test.go |  6 ++---
 pkg/tcpip/network/arp/arp.go             |  2 +-
 pkg/tcpip/network/ip_test.go             |  2 +-
 pkg/tcpip/network/ipv4/ipv4.go           | 10 ++++----
 pkg/tcpip/network/ipv6/ipv6.go           | 12 ++++-----
 pkg/tcpip/packet_buffer.go               |  8 ++++++
 pkg/tcpip/stack/registration.go          |  8 +++---
 pkg/tcpip/stack/route.go                 | 29 ++++-----------------
 pkg/tcpip/stack/stack_test.go            |  2 +-
 pkg/tcpip/transport/tcp/connect.go       | 43 ++++++++++++++++++--------------
 17 files changed, 95 insertions(+), 90 deletions(-)

diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 9fe8e9f9d..70188551f 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -133,16 +133,16 @@ func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 }
 
 // WritePackets stores outbound packets into the channel.
-func (e *Endpoint) WritePackets(_ *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
-	payloadView := payload.ToView()
+func (e *Endpoint) WritePackets(_ *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	payloadView := pkts[0].Data.ToView()
 	n := 0
 packetLoop:
-	for _, hdr := range hdrs {
-		off := hdr.Off
-		size := hdr.Size
+	for _, pkt := range pkts {
+		off := pkt.DataOffset
+		size := pkt.DataSize
 		p := PacketInfo{
 			Pkt: tcpip.PacketBuffer{
-				Header: hdr.Hdr,
+				Header: pkt.Header,
 				Data:   buffer.NewViewFromBytes(payloadView[off : off+size]).ToVectorisedView(),
 			},
 			Proto: protocol,
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index 98109c5dc..fa8a703d9 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -440,7 +440,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 
 // WritePackets writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	var ethHdrBuf []byte
 	// hdr + data
 	iovLen := 2
@@ -463,9 +463,9 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.Pac
 		iovLen++
 	}
 
-	n := len(hdrs)
+	n := len(pkts)
 
-	views := payload.Views()
+	views := pkts[0].Data.Views()
 	/*
 	 * Each bondary in views can add one more iovec.
 	 *
@@ -483,14 +483,20 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.Pac
 	viewOff := 0
 	off := 0
 	nextOff := 0
-	for i := range hdrs {
+	for i := range pkts {
+		// TODO(b/134618279): Different packets may have different data
+		// in the future. We should handle this.
+		if !viewsEqual(pkts[i].Data.Views(), views) {
+			panic("All packets in pkts should have the same Data.")
+		}
+
 		prevIovecIdx := iovecIdx
 		mmsgHdr := &mmsgHdrs[i]
 		mmsgHdr.Msg.Iov = &iovec[iovecIdx]
-		packetSize := hdrs[i].Size
-		hdr := &hdrs[i].Hdr
+		packetSize := pkts[i].DataSize
+		hdr := &pkts[i].Header
 
-		off = hdrs[i].Off
+		off = pkts[i].DataOffset
 		if off != nextOff {
 			// We stop in a different point last time.
 			size := packetSize
@@ -555,6 +561,11 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.Pac
 	return packets, nil
 }
 
+// viewsEqual tests whether v1 and v2 refer to the same backing bytes.
+func viewsEqual(vs1, vs2 []buffer.View) bool {
+	return len(vs1) == len(vs2) && (len(vs1) == 0 || &vs1[0] == &vs2[0])
+}
+
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	return rawfile.NonBlockingWrite(e.fds[0], vv.ToView())
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index 563a67188..499cc608f 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -92,7 +92,7 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
index 55ed2a28e..445b22c17 100644
--- a/pkg/tcpip/link/muxed/injectable.go
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -87,12 +87,12 @@ func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber,
 // WritePackets writes outbound packets to the appropriate
 // LinkInjectableEndpoint based on the RemoteAddress. HandleLocal only works if
 // r.RemoteAddress has a route registered in this endpoint.
-func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	endpoint, ok := m.routes[r.RemoteAddress]
 	if !ok {
 		return 0, tcpip.ErrNoRoute
 	}
-	return endpoint.WritePackets(r, gso, hdrs, payload, protocol)
+	return endpoint.WritePackets(r, gso, pkts, protocol)
 }
 
 // WritePacket writes outbound packets to the appropriate LinkInjectableEndpoint
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 88947a03a..080f9d667 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -214,7 +214,7 @@ func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 122680e10..767f14303 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -233,15 +233,15 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 // WritePackets implements the stack.LinkEndpoint interface. It is called by
 // higher-level protocols to write packets; it just logs the packet and
 // forwards the request to the lower endpoint.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
-	view := payload.ToView()
-	for _, d := range hdrs {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	view := pkts[0].Data.ToView()
+	for _, pkt := range pkts {
 		e.dumpPacket(gso, protocol, tcpip.PacketBuffer{
-			Header: d.Hdr,
-			Data:   view[d.Off:][:d.Size].ToVectorisedView(),
+			Header: pkt.Header,
+			Data:   view[pkt.DataOffset:][:pkt.DataSize].ToVectorisedView(),
 		})
 	}
-	return e.lower.WritePackets(r, gso, hdrs, payload, protocol)
+	return e.lower.WritePackets(r, gso, pkts, protocol)
 }
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
index 12e7c1932..a8de38979 100644
--- a/pkg/tcpip/link/waitable/waitable.go
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -112,12 +112,12 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 // WritePackets implements stack.LinkEndpoint.WritePackets. It is called by
 // higher-level protocols to write packets. It only forwards packets to the
 // lower endpoint if Wait or WaitWrite haven't been called.
-func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	if !e.writeGate.Enter() {
-		return len(hdrs), nil
+		return len(pkts), nil
 	}
 
-	n, err := e.lower.WritePackets(r, gso, hdrs, payload, protocol)
+	n, err := e.lower.WritePackets(r, gso, pkts, protocol)
 	e.writeGate.Leave()
 	return n, err
 }
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
index 0fc0c2ebe..31b11a27a 100644
--- a/pkg/tcpip/link/waitable/waitable_test.go
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -71,9 +71,9 @@ func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcp
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
-	e.writeCount += len(hdrs)
-	return len(hdrs), nil
+func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	e.writeCount += len(pkts)
+	return len(pkts), nil
 }
 
 func (e *countedEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 30aec9ba7..da8482509 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -84,7 +84,7 @@ func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderPara
 }
 
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []stack.PacketDescriptor, buffer.VectorisedView, stack.NetworkHeaderParams, stack.PacketLooping) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, stack.NetworkHeaderParams, stack.PacketLooping) (int, *tcpip.Error) {
 	return 0, tcpip.ErrNotSupported
 }
 
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index 1de188738..4144a7837 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -172,7 +172,7 @@ func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Ne
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, hdr []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 040329a74..7059600f5 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -268,18 +268,18 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 }
 
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
 	if loop&stack.PacketLoop != 0 {
 		panic("multiple packets in local loop")
 	}
 	if loop&stack.PacketOut == 0 {
-		return len(hdrs), nil
+		return len(pkts), nil
 	}
 
-	for i := range hdrs {
-		e.addIPHeader(r, &hdrs[i].Hdr, hdrs[i].Size, params)
+	for i := range pkts {
+		e.addIPHeader(r, &pkts[i].Header, pkts[i].DataSize, params)
 	}
-	n, err := e.linkEP.WritePackets(r, gso, hdrs, payload, ProtocolNumber)
+	n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
 	r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
 	return n, err
 }
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 8d1578ed9..c9087ffa7 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -137,21 +137,21 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
 	if loop&stack.PacketLoop != 0 {
 		panic("not implemented")
 	}
 	if loop&stack.PacketOut == 0 {
-		return len(hdrs), nil
+		return len(pkts), nil
 	}
 
-	for i := range hdrs {
-		hdr := &hdrs[i].Hdr
-		size := hdrs[i].Size
+	for i := range pkts {
+		hdr := &pkts[i].Header
+		size := pkts[i].DataSize
 		e.addIPHeader(r, hdr, size, params)
 	}
 
-	n, err := e.linkEP.WritePackets(r, gso, hdrs, payload, ProtocolNumber)
+	n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
 	r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
 	return n, err
 }
diff --git a/pkg/tcpip/packet_buffer.go b/pkg/tcpip/packet_buffer.go
index 695f7b188..ab24372e7 100644
--- a/pkg/tcpip/packet_buffer.go
+++ b/pkg/tcpip/packet_buffer.go
@@ -31,6 +31,14 @@ type PacketBuffer struct {
 	// or otherwise modified.
 	Data buffer.VectorisedView
 
+	// DataOffset is used for GSO output. It is the offset into the Data
+	// field where the payload of this packet starts.
+	DataOffset int
+
+	// DataSize is used for GSO output. It is the size of this packet's
+	// payload.
+	DataSize int
+
 	// Header holds the headers of outbound packets. As a packet is passed
 	// down the stack, each layer adds to Header.
 	Header buffer.Prependable
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 7fd4e4a65..61fd46d66 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -237,8 +237,8 @@ type NetworkEndpoint interface {
 	WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, loop PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error
 
 	// WritePackets writes packets to the given destination address and
-	// protocol.
-	WritePackets(r *Route, gso *GSO, hdrs []PacketDescriptor, payload buffer.VectorisedView, params NetworkHeaderParams, loop PacketLooping) (int, *tcpip.Error)
+	// protocol. pkts must not be zero length.
+	WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams, loop PacketLooping) (int, *tcpip.Error)
 
 	// WriteHeaderIncludedPacket writes a packet that includes a network
 	// header to the given destination address.
@@ -373,12 +373,12 @@ type LinkEndpoint interface {
 	WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error
 
 	// WritePackets writes packets with the given protocol through the
-	// given route.
+	// given route. pkts must not be zero length.
 	//
 	// Right now, WritePackets is used only when the software segmentation
 	// offload is enabled. If it will be used for something else, it may
 	// require to change syscall filters.
-	WritePackets(r *Route, gso *GSO, hdrs []PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
+	WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
 
 	// WriteRawPacket writes a packet directly to the link. The packet
 	// should already have an ethernet header.
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 617f5a57c..34307ae07 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -17,7 +17,6 @@ package stack
 import (
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
@@ -169,39 +168,21 @@ func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt tcpip.Pack
 	return err
 }
 
-// PacketDescriptor is a packet descriptor which contains a packet header and
-// offset and size of packet data in a payload view.
-type PacketDescriptor struct {
-	Hdr  buffer.Prependable
-	Off  int
-	Size int
-}
-
-// NewPacketDescriptors allocates a set of packet descriptors.
-func NewPacketDescriptors(n int, hdrSize int) []PacketDescriptor {
-	buf := make([]byte, n*hdrSize)
-	hdrs := make([]PacketDescriptor, n)
-	for i := range hdrs {
-		hdrs[i].Hdr = buffer.NewEmptyPrependableFromView(buf[i*hdrSize:][:hdrSize])
-	}
-	return hdrs
-}
-
 // WritePackets writes the set of packets through the given route.
-func (r *Route) WritePackets(gso *GSO, hdrs []PacketDescriptor, payload buffer.VectorisedView, params NetworkHeaderParams) (int, *tcpip.Error) {
+func (r *Route) WritePackets(gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) {
 	if !r.ref.isValidForOutgoing() {
 		return 0, tcpip.ErrInvalidEndpointState
 	}
 
-	n, err := r.ref.ep.WritePackets(r, gso, hdrs, payload, params, r.Loop)
+	n, err := r.ref.ep.WritePackets(r, gso, pkts, params, r.Loop)
 	if err != nil {
-		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(len(hdrs) - n))
+		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(len(pkts) - n))
 	}
 	r.ref.nic.stats.Tx.Packets.IncrementBy(uint64(n))
 	payloadSize := 0
 	for i := 0; i < n; i++ {
-		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(hdrs[i].Hdr.UsedLength()))
-		payloadSize += hdrs[i].Size
+		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(pkts[i].Header.UsedLength()))
+		payloadSize += pkts[i].DataSize
 	}
 	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(payloadSize))
 	return n, err
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index f979e2b1a..8fc034ca1 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -149,7 +149,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
+func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 75b7c0828..00c0c9a92 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -631,11 +631,11 @@ func (e *endpoint) sendTCP(r *stack.Route, id stack.TransportEndpointID, data bu
 	return nil
 }
 
-func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, d *stack.PacketDescriptor, data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) {
+func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, pkt *tcpip.PacketBuffer, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) {
 	optLen := len(opts)
-	hdr := &d.Hdr
-	packetSize := d.Size
-	off := d.Off
+	hdr := &pkt.Header
+	packetSize := pkt.DataSize
+	off := pkt.DataOffset
 	// Initialize the header.
 	tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen))
 	tcp.Encode(&header.TCPFields{
@@ -659,7 +659,7 @@ func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, d *stack.PacketDe
 		// header and data and get the right sum of the TCP packet.
 		tcp.SetChecksum(xsum)
 	} else if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 {
-		xsum = header.ChecksumVVWithOffset(data, xsum, off, packetSize)
+		xsum = header.ChecksumVVWithOffset(pkt.Data, xsum, off, packetSize)
 		tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
 	}
 
@@ -674,7 +674,13 @@ func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.Vect
 	mss := int(gso.MSS)
 	n := (data.Size() + mss - 1) / mss
 
-	hdrs := stack.NewPacketDescriptors(n, header.TCPMinimumSize+int(r.MaxHeaderLength())+optLen)
+	// Allocate one big slice for all the headers.
+	hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen
+	buf := make([]byte, n*hdrSize)
+	pkts := make([]tcpip.PacketBuffer, n)
+	for i := range pkts {
+		pkts[i].Header = buffer.NewEmptyPrependableFromView(buf[i*hdrSize:][:hdrSize])
+	}
 
 	size := data.Size()
 	off := 0
@@ -684,16 +690,17 @@ func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.Vect
 			packetSize = size
 		}
 		size -= packetSize
-		hdrs[i].Off = off
-		hdrs[i].Size = packetSize
-		buildTCPHdr(r, id, &hdrs[i], data, flags, seq, ack, rcvWnd, opts, gso)
+		pkts[i].DataOffset = off
+		pkts[i].DataSize = packetSize
+		pkts[i].Data = data
+		buildTCPHdr(r, id, &pkts[i], flags, seq, ack, rcvWnd, opts, gso)
 		off += packetSize
 		seq = seq.Add(seqnum.Size(packetSize))
 	}
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	sent, err := r.WritePackets(gso, hdrs, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos})
+	sent, err := r.WritePackets(gso, pkts, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos})
 	if err != nil {
 		r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent))
 	}
@@ -713,20 +720,18 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise
 		return sendTCPBatch(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso)
 	}
 
-	d := &stack.PacketDescriptor{
-		Hdr:  buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen),
-		Off:  0,
-		Size: data.Size(),
+	pkt := tcpip.PacketBuffer{
+		Header:     buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen),
+		DataOffset: 0,
+		DataSize:   data.Size(),
+		Data:       data,
 	}
-	buildTCPHdr(r, id, d, data, flags, seq, ack, rcvWnd, opts, gso)
+	buildTCPHdr(r, id, &pkt, flags, seq, ack, rcvWnd, opts, gso)
 
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	if err := r.WritePacket(gso, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, tcpip.PacketBuffer{
-		Header: d.Hdr,
-		Data:   data,
-	}); err != nil {
+	if err := r.WritePacket(gso, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, pkt); err != nil {
 		r.Stats().TCP.SegmentSendErrors.Increment()
 		return err
 	}
-- 
cgit v1.2.3


From 8eb68912e40bc87c932baeb13d151fd590d7d279 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Fri, 22 Nov 2019 14:56:54 -0800
Subject: Store SO_BINDTODEVICE state at bind.

This allows us to ensure that the correct port reservation is released.

Fixes #1217

PiperOrigin-RevId: 282048155
---
 pkg/tcpip/transport/tcp/accept.go         |  2 +-
 pkg/tcpip/transport/tcp/endpoint.go       | 26 ++++++++++++++++-------
 pkg/tcpip/transport/udp/endpoint.go       | 35 ++++++++++++++++++++-----------
 pkg/tcpip/transport/udp/endpoint_state.go |  2 +-
 4 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 023045ec1..f543a6105 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -243,7 +243,7 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 	n.initGSO()
 
 	// Register new endpoint so that packets are routed to it.
-	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.bindToDevice); err != nil {
+	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil {
 		n.Close()
 		return nil, err
 	}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 04c92c04c..9d4a87e30 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -340,6 +340,9 @@ type endpoint struct {
 	// TCP should never broadcast but Linux nevertheless supports enabling/
 	// disabling SO_BROADCAST, albeit as a NOOP.
 	broadcast bool
+	// Values used to reserve a port or register a transport endpoint
+	// (which ever happens first).
+	boundBindToDevice tcpip.NICID
 
 	// effectiveNetProtos contains the network protocols actually in use. In
 	// most cases it will only contain "netProto", but in cases like IPv6
@@ -730,12 +733,13 @@ func (e *endpoint) Close() {
 	// in Listen() when trying to register.
 	if e.state == StateListen && e.isPortReserved {
 		if e.isRegistered {
-			e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
+			e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
 			e.isRegistered = false
 		}
 
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.bindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
 		e.isPortReserved = false
+		e.boundBindToDevice = 0
 	}
 
 	// Mark endpoint as closed.
@@ -791,14 +795,15 @@ func (e *endpoint) cleanupLocked() {
 	e.workerCleanup = false
 
 	if e.isRegistered {
-		e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
+		e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
 		e.isRegistered = false
 	}
 
 	if e.isPortReserved {
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.bindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
 		e.isPortReserved = false
 	}
+	e.boundBindToDevice = 0
 
 	e.route.Release()
 	e.stack.CompleteTransportEndpointCleanup(e)
@@ -1741,7 +1746,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 
 	if e.ID.LocalPort != 0 {
 		// The endpoint is bound to a port, attempt to register it.
-		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, e.ID, e, e.reusePort, e.bindToDevice)
+		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, e.ID, e, e.reusePort, e.boundBindToDevice)
 		if err != nil {
 			return err
 		}
@@ -1778,7 +1783,10 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 			id.LocalPort = p
 			switch e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) {
 			case nil:
+				// Port picking successful. Save the details of
+				// the selected port.
 				e.ID = id
+				e.boundBindToDevice = e.bindToDevice
 				return true, nil
 			case tcpip.ErrPortInUse:
 				return false, nil
@@ -1794,7 +1802,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	// before Connect: in such a case we don't want to hold on to
 	// reservations anymore.
 	if e.isPortReserved {
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.bindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.boundBindToDevice)
 		e.isPortReserved = false
 	}
 
@@ -1950,7 +1958,7 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	}
 
 	// Register the endpoint.
-	if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.reusePort, e.bindToDevice); err != nil {
+	if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.reusePort, e.boundBindToDevice); err != nil {
 		return err
 	}
 
@@ -2031,6 +2039,7 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
 		return err
 	}
 
+	e.boundBindToDevice = e.bindToDevice
 	e.isPortReserved = true
 	e.effectiveNetProtos = netProtos
 	e.ID.LocalPort = port
@@ -2044,8 +2053,9 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
 			e.ID.LocalPort = 0
 			e.ID.LocalAddress = ""
 			e.boundNICID = 0
+			e.boundBindToDevice = 0
 		}
-	}(e.bindToDevice)
+	}(e.boundBindToDevice)
 
 	// If an address is specified, we must ensure that it's one of our
 	// local addresses.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 2d97d1398..23c1da717 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -104,6 +104,10 @@ type endpoint struct {
 	bindToDevice   tcpip.NICID
 	broadcast      bool
 
+	// Values used to reserve a port or register a transport endpoint.
+	// (which ever happens first).
+	boundBindToDevice tcpip.NICID
+
 	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
 	// applied while sending packets. Defaults to 0 as on Linux.
 	sendTOS uint8
@@ -175,8 +179,9 @@ func (e *endpoint) Close() {
 
 	switch e.state {
 	case StateBound, StateConnected:
-		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.bindToDevice)
+		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+		e.boundBindToDevice = 0
 	}
 
 	for _, mem := range e.multicastMemberships {
@@ -870,7 +875,10 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 	if e.state != StateConnected {
 		return nil
 	}
-	id := stack.TransportEndpointID{}
+	var (
+		id  stack.TransportEndpointID
+		btd tcpip.NICID
+	)
 	// Exclude ephemerally bound endpoints.
 	if e.BindNICID != 0 || e.ID.LocalAddress == "" {
 		var err *tcpip.Error
@@ -878,7 +886,7 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 			LocalPort:    e.ID.LocalPort,
 			LocalAddress: e.ID.LocalAddress,
 		}
-		id, err = e.registerWithStack(e.RegisterNICID, e.effectiveNetProtos, id)
+		id, btd, err = e.registerWithStack(e.RegisterNICID, e.effectiveNetProtos, id)
 		if err != nil {
 			return err
 		}
@@ -886,13 +894,14 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 	} else {
 		if e.ID.LocalPort != 0 {
 			// Release the ephemeral port.
-			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.bindToDevice)
+			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
 		}
 		e.state = StateInitial
 	}
 
-	e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
+	e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
 	e.ID = id
+	e.boundBindToDevice = btd
 	e.route.Release()
 	e.route = stack.Route{}
 	e.dstPort = 0
@@ -961,17 +970,18 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		}
 	}
 
-	id, err = e.registerWithStack(nicID, netProtos, id)
+	id, btd, err := e.registerWithStack(nicID, netProtos, id)
 	if err != nil {
 		return err
 	}
 
 	// Remove the old registration.
 	if e.ID.LocalPort != 0 {
-		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
+		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
 	}
 
 	e.ID = id
+	e.boundBindToDevice = btd
 	e.route = r.Clone()
 	e.dstPort = addr.Port
 	e.RegisterNICID = nicID
@@ -1029,11 +1039,11 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	return nil, nil, tcpip.ErrNotSupported
 }
 
-func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) {
+func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) {
 	if e.ID.LocalPort == 0 {
 		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.reusePort, e.bindToDevice)
 		if err != nil {
-			return id, err
+			return id, e.bindToDevice, err
 		}
 		id.LocalPort = port
 	}
@@ -1042,7 +1052,7 @@ func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.Networ
 	if err != nil {
 		e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.bindToDevice)
 	}
-	return id, err
+	return id, e.bindToDevice, err
 }
 
 func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
@@ -1081,12 +1091,13 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		LocalPort:    addr.Port,
 		LocalAddress: addr.Addr,
 	}
-	id, err = e.registerWithStack(nicID, netProtos, id)
+	id, btd, err := e.registerWithStack(nicID, netProtos, id)
 	if err != nil {
 		return err
 	}
 
 	e.ID = id
+	e.boundBindToDevice = btd
 	e.RegisterNICID = nicID
 	e.effectiveNetProtos = netProtos
 
diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go
index b227e353b..43fb047ed 100644
--- a/pkg/tcpip/transport/udp/endpoint_state.go
+++ b/pkg/tcpip/transport/udp/endpoint_state.go
@@ -109,7 +109,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	// pass it to the reservation machinery.
 	id := e.ID
 	e.ID.LocalPort = 0
-	e.ID, err = e.registerWithStack(e.RegisterNICID, e.effectiveNetProtos, id)
+	e.ID, e.boundBindToDevice, err = e.registerWithStack(e.RegisterNICID, e.effectiveNetProtos, id)
 	if err != nil {
 		panic(err)
 	}
-- 
cgit v1.2.3


From 5eb522193cf206a36c1663d909b9e53ae93b2b6a Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Fri, 22 Nov 2019 15:21:31 -0800
Subject: Force timezone initialization before filter installation

The first use of time.Local (usually via time.Time.Date, et. al) performs
initialization of the local timezone, which involves open several tzdata files
from the host.

Since filter installation disallows open, we should explicitly force this
initialization rather than implicitly depending on the first logging (or other
time) call occurring before filter installation.

PiperOrigin-RevId: 282053121
---
 runsc/main.go | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/runsc/main.go b/runsc/main.go
index 711f60d4f..4682b308c 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -26,6 +26,7 @@ import (
 	"path/filepath"
 	"strings"
 	"syscall"
+	"time"
 
 	"flag"
 
@@ -237,6 +238,18 @@ func main() {
 		log.SetLevel(log.Debug)
 	}
 
+	// Logging will include the local date and time via the time package.
+	//
+	// On first use, time.Local initializes the local time zone, which
+	// involves opening tzdata files on the host. Since this requires
+	// opening host files, it must be done before syscall filter
+	// installation.
+	//
+	// Generally there will be a log message before filter installation
+	// that will force initialization, but force initialization here in
+	// case that does not occur.
+	_ = time.Local.String()
+
 	subcommand := flag.CommandLine.Arg(0)
 
 	var e log.Emitter
-- 
cgit v1.2.3


From b0a1bbd3e248888cf6c9e6fa73df5d2c22490f85 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 22 Nov 2019 16:45:00 -0800
Subject: Internal change.

PiperOrigin-RevId: 282068093
---
 pkg/tcpip/transport/tcp/connect.go  |   9 +++
 pkg/tcpip/transport/tcp/tcp_test.go | 121 +++++++++++++++++++++++++++++++++++-
 2 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 00c0c9a92..f14f0ca65 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -299,6 +299,15 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 		return nil
 	}
 
+	// RFC 793, Section 3.9, page 69, states that in the SYN-RCVD state, a
+	// sequence number outside of the window causes an ACK with the proper seq
+	// number and "After sending the acknowledgment, drop the unacceptable
+	// segment and return."
+	if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
+		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd)
+		return nil
+	}
+
 	if s.flagIsSet(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 {
 		// We received two SYN segments with different sequence
 		// numbers, so we reset this and restart the whole
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 64f765c70..c4b45aa6f 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -4673,7 +4673,7 @@ func TestListenSynRcvdQueueFull(t *testing.T) {
 		SrcPort: context.TestPort,
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
-		SeqNum:  seqnum.Value(789),
+		SeqNum:  irs,
 		RcvWnd:  30000,
 	})
 
@@ -4825,6 +4825,125 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
 	}
 }
 
+func TestSynRcvdBadSeqNumber(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Create TCP endpoint.
+	var err *tcpip.Error
+	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+
+	// Bind to wildcard.
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	// Start listening.
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN to get a SYN-ACK. This should put the ep into SYN-RCVD state
+	irs := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  irs,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	iss := seqnum.Value(tcpHdr.SequenceNumber())
+	tcpCheckers := []checker.TransportChecker{
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
+		checker.AckNum(uint32(irs) + 1),
+	}
+	checker.IPv4(t, b, checker.TCP(tcpCheckers...))
+
+	// Now send a packet with an out-of-window sequence number
+	largeSeqnum := irs + seqnum.Value(tcpHdr.WindowSize()) + 1
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  largeSeqnum,
+		AckNum:  iss + 1,
+		RcvWnd:  30000,
+	})
+
+	// Should receive an ACK with the expected SEQ number
+	b = c.GetPacket()
+	tcpCheckers = []checker.TransportChecker{
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.AckNum(uint32(irs) + 1),
+		checker.SeqNum(uint32(iss + 1)),
+	}
+	checker.IPv4(t, b, checker.TCP(tcpCheckers...))
+
+	// Now that the socket replied appropriately with the ACK,
+	// complete the connection to test that the large SEQ num
+	// did not change the state from SYN-RCVD.
+
+	// Send ACK to move to ESTABLISHED state.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+		RcvWnd:  30000,
+	})
+
+	newEP, _, err := c.EP.Accept()
+
+	if err != nil && err != tcpip.ErrWouldBlock {
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	if err == tcpip.ErrWouldBlock {
+		// Try to accept the connections in the backlog.
+		we, ch := waiter.NewChannelEntry(nil)
+		c.WQ.EventRegister(&we, waiter.EventIn)
+		defer c.WQ.EventUnregister(&we)
+
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			newEP, _, err = c.EP.Accept()
+			if err != nil {
+				t.Fatalf("Accept failed: %s", err)
+			}
+
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+
+	// Now verify that the TCP socket is usable and in a connected state.
+	data := "Don't panic"
+	_, _, err = newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{})
+
+	if err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	pkt := c.GetPacket()
+	tcpHdr = header.TCP(header.IPv4(pkt).Payload())
+	if string(tcpHdr.Payload()) != data {
+		t.Fatalf("Unexpected data: got %s, want %s", string(tcpHdr.Payload()), data)
+	}
+}
+
 func TestPassiveConnectionAttemptIncrement(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
-- 
cgit v1.2.3


From f697d1a33e4e7cefb4164ec977c38ccc2a228099 Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <henry.tjf@antfin.com>
Date: Tue, 8 Oct 2019 17:43:01 +0000
Subject: gofer: reduce CPU usage on GC as of frequent readdir

Refer to golang mallocgc(), each time of allocating an object > 32 KB,
a gc will be triggered.

When we do readdir, sentry always passes 65535, which leads to a malloc
of 65535 * sizeof(p9.Direnta) > 32 KB.

Considering we already use slice append, let's avoid defining the
capability for this slide.

Command for test:

Before this change:

  (container)$ time tree linux-5.3.1 > /dev/null

  real    0m54.272s
  user    0m2.010s
  sys     0m1.740s
  (CPU usage of Gofer: ~30 cores)

  (host)$ perf top -p <pid-of-gofer>

    42.57%  runsc        [.] runtime.gcDrain
    23.41%  runsc        [.] runtime.(*lfstack).pop
     9.74%  runsc        [.] runtime.greyobject
     8.06%  runsc        [.] runtime.(*lfstack).push
     4.33%  runsc        [.] runtime.scanobject
     1.69%  runsc        [.] runtime.findObject
     1.12%  runsc        [.] runtime.findrunnable
     0.69%  runsc        [.] runtime.runqgrab
    ...

  (host)$ mkdir test && cd test
  (host)$ for i in `seq 1 65536`; do mkdir $i; done
  (container)$ time ls test/ > /dev/null

  real    2m10.934s
  user    0m0.280s
  sys     0m4.260s
  (CPU usage of Gofer: ~1 core)

After this change:

  (container)$ time tree linux-5.3.1 > /dev/null

  real    0m22.465s
  user    0m1.270s
  sys     0m1.310s
  (CPU usage of Gofer: ~1 core)

  $ perf top -p <pid-of-gofer>

    20.57%  runsc        [.] runtime.gcDrain
     7.15%  runsc        [.] runtime.(*lfstack).pop
     4.11%  runsc        [.] runtime.scanobject
     3.78%  runsc        [.] runtime.greyobject
     2.78%  runsc        [.] runtime.(*lfstack).push
    ...

  (host)$ mkdir test && cd test
  (host)$ for i in `seq 1 65536`; do mkdir $i; done
  (container)$ time ls test/ > /dev/null

  real    0m13.338s
  user    0m0.190s
  sys     0m3.980s
  (CPU usage of Gofer: ~0.8 core)

Fixes #898

Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
---
 runsc/fsgofer/fsgofer.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 9117d9616..c9add64ec 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -956,14 +956,14 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
 }
 
 func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) ([]p9.Dirent, error) {
+	var dirents []p9.Dirent
+
 	// Limit 'count' to cap the slice size that is returned.
 	const maxCount = 100000
 	if count > maxCount {
 		count = maxCount
 	}
 
-	dirents := make([]p9.Dirent, 0, count)
-
 	// Pre-allocate buffers that will be reused to get partial results.
 	direntsBuf := make([]byte, 8192)
 	names := make([]string, 0, 100)
-- 
cgit v1.2.3


From c3b93afeafeff4555b57aa22c2a91375f9e38e28 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Sat, 23 Nov 2019 23:21:04 -0800
Subject: Cleanup visibility.

PiperOrigin-RevId: 282194656
---
 pkg/tcpip/hash/jenkins/BUILD                  |  4 +---
 pkg/tcpip/link/channel/BUILD                  |  2 +-
 pkg/tcpip/link/fdbased/BUILD                  |  4 +---
 pkg/tcpip/link/loopback/BUILD                 |  2 +-
 pkg/tcpip/link/muxed/BUILD                    |  4 +---
 pkg/tcpip/link/rawfile/BUILD                  |  4 +---
 pkg/tcpip/link/sharedmem/BUILD                |  4 +---
 pkg/tcpip/link/sharedmem/pipe/BUILD           |  2 +-
 pkg/tcpip/link/sharedmem/queue/BUILD          |  2 +-
 pkg/tcpip/link/sniffer/BUILD                  |  4 +---
 pkg/tcpip/link/tun/BUILD                      |  4 +---
 pkg/tcpip/link/waitable/BUILD                 |  4 +---
 pkg/tcpip/network/arp/BUILD                   |  4 +---
 pkg/tcpip/network/fragmentation/BUILD         | 10 +---------
 pkg/tcpip/network/ipv4/BUILD                  |  4 +---
 pkg/tcpip/network/ipv6/BUILD                  |  4 +---
 pkg/tcpip/ports/BUILD                         |  2 +-
 pkg/tcpip/seqnum/BUILD                        |  4 +---
 pkg/tcpip/stack/BUILD                         | 12 +-----------
 pkg/tcpip/transport/icmp/BUILD                |  8 --------
 pkg/tcpip/transport/packet/BUILD              |  8 --------
 pkg/tcpip/transport/raw/BUILD                 |  8 --------
 pkg/tcpip/transport/tcp/BUILD                 |  8 --------
 pkg/tcpip/transport/tcp/testing/context/BUILD |  2 +-
 pkg/tcpip/transport/udp/BUILD                 |  8 --------
 pkg/waiter/BUILD                              |  8 --------
 26 files changed, 20 insertions(+), 110 deletions(-)

diff --git a/pkg/tcpip/hash/jenkins/BUILD b/pkg/tcpip/hash/jenkins/BUILD
index 0c5c20cea..e648efa71 100644
--- a/pkg/tcpip/hash/jenkins/BUILD
+++ b/pkg/tcpip/hash/jenkins/BUILD
@@ -7,9 +7,7 @@ go_library(
     name = "jenkins",
     srcs = ["jenkins.go"],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins",
-    visibility = [
-        "//visibility:public",
-    ],
+    visibility = ["//visibility:public"],
 )
 
 go_test(
diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD
index 97a794986..7dbc05754 100644
--- a/pkg/tcpip/link/channel/BUILD
+++ b/pkg/tcpip/link/channel/BUILD
@@ -6,7 +6,7 @@ go_library(
     name = "channel",
     srcs = ["channel.go"],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/link/channel",
-    visibility = ["//:sandbox"],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD
index 8fa9e3984..897c94821 100644
--- a/pkg/tcpip/link/fdbased/BUILD
+++ b/pkg/tcpip/link/fdbased/BUILD
@@ -14,9 +14,7 @@ go_library(
         "packet_dispatchers.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/link/fdbased",
-    visibility = [
-        "//visibility:public",
-    ],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD
index 23e4d1418..f35fcdff4 100644
--- a/pkg/tcpip/link/loopback/BUILD
+++ b/pkg/tcpip/link/loopback/BUILD
@@ -6,7 +6,7 @@ go_library(
     name = "loopback",
     srcs = ["loopback.go"],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/link/loopback",
-    visibility = ["//:sandbox"],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/link/muxed/BUILD b/pkg/tcpip/link/muxed/BUILD
index 1bab380b0..1ac7948b6 100644
--- a/pkg/tcpip/link/muxed/BUILD
+++ b/pkg/tcpip/link/muxed/BUILD
@@ -7,9 +7,7 @@ go_library(
     name = "muxed",
     srcs = ["injectable.go"],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/link/muxed",
-    visibility = [
-        "//visibility:public",
-    ],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD
index 05c7b8024..d8211e93d 100644
--- a/pkg/tcpip/link/rawfile/BUILD
+++ b/pkg/tcpip/link/rawfile/BUILD
@@ -13,9 +13,7 @@ go_library(
         "rawfile_unsafe.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/link/rawfile",
-    visibility = [
-        "//visibility:public",
-    ],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
         "@org_golang_x_sys//unix:go_default_library",
diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD
index 0a5ea3dc4..a4f9cdd69 100644
--- a/pkg/tcpip/link/sharedmem/BUILD
+++ b/pkg/tcpip/link/sharedmem/BUILD
@@ -12,9 +12,7 @@ go_library(
         "tx.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem",
-    visibility = [
-        "//:sandbox",
-    ],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
         "//pkg/tcpip",
diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD
index 330ed5e94..6b5bc542c 100644
--- a/pkg/tcpip/link/sharedmem/pipe/BUILD
+++ b/pkg/tcpip/link/sharedmem/pipe/BUILD
@@ -12,7 +12,7 @@ go_library(
         "tx.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe",
-    visibility = ["//:sandbox"],
+    visibility = ["//visibility:public"],
 )
 
 go_test(
diff --git a/pkg/tcpip/link/sharedmem/queue/BUILD b/pkg/tcpip/link/sharedmem/queue/BUILD
index de1ce043d..8c9234d54 100644
--- a/pkg/tcpip/link/sharedmem/queue/BUILD
+++ b/pkg/tcpip/link/sharedmem/queue/BUILD
@@ -10,7 +10,7 @@ go_library(
         "tx.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue",
-    visibility = ["//:sandbox"],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
         "//pkg/tcpip/link/sharedmem/pipe",
diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD
index 1756114e6..d6ae0368a 100644
--- a/pkg/tcpip/link/sniffer/BUILD
+++ b/pkg/tcpip/link/sniffer/BUILD
@@ -9,9 +9,7 @@ go_library(
         "sniffer.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sniffer",
-    visibility = [
-        "//visibility:public",
-    ],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
         "//pkg/tcpip",
diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD
index 92dce8fac..a71a493fc 100644
--- a/pkg/tcpip/link/tun/BUILD
+++ b/pkg/tcpip/link/tun/BUILD
@@ -6,7 +6,5 @@ go_library(
     name = "tun",
     srcs = ["tun_unsafe.go"],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/link/tun",
-    visibility = [
-        "//visibility:public",
-    ],
+    visibility = ["//visibility:public"],
 )
diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD
index 0746dc8ec..134837943 100644
--- a/pkg/tcpip/link/waitable/BUILD
+++ b/pkg/tcpip/link/waitable/BUILD
@@ -9,9 +9,7 @@ go_library(
         "waitable.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/link/waitable",
-    visibility = [
-        "//visibility:public",
-    ],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/gate",
         "//pkg/tcpip",
diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD
index df0d3a8c0..e7617229b 100644
--- a/pkg/tcpip/network/arp/BUILD
+++ b/pkg/tcpip/network/arp/BUILD
@@ -7,9 +7,7 @@ go_library(
     name = "arp",
     srcs = ["arp.go"],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/network/arp",
-    visibility = [
-        "//visibility:public",
-    ],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD
index 2cad0a0b6..acf1e022c 100644
--- a/pkg/tcpip/network/fragmentation/BUILD
+++ b/pkg/tcpip/network/fragmentation/BUILD
@@ -25,7 +25,7 @@ go_library(
         "reassembler_list.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/network/fragmentation",
-    visibility = ["//:sandbox"],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
         "//pkg/tcpip",
@@ -44,11 +44,3 @@ go_test(
     embed = [":fragmentation"],
     deps = ["//pkg/tcpip/buffer"],
 )
-
-filegroup(
-    name = "autogen",
-    srcs = [
-        "reassembler_list.go",
-    ],
-    visibility = ["//:sandbox"],
-)
diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD
index 58e537aad..aeddfcdd4 100644
--- a/pkg/tcpip/network/ipv4/BUILD
+++ b/pkg/tcpip/network/ipv4/BUILD
@@ -10,9 +10,7 @@ go_library(
         "ipv4.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/network/ipv4",
-    visibility = [
-        "//visibility:public",
-    ],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD
index f06622a8b..e4e273460 100644
--- a/pkg/tcpip/network/ipv6/BUILD
+++ b/pkg/tcpip/network/ipv6/BUILD
@@ -10,9 +10,7 @@ go_library(
         "ipv6.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/network/ipv6",
-    visibility = [
-        "//visibility:public",
-    ],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD
index 11efb4e44..4839f0a65 100644
--- a/pkg/tcpip/ports/BUILD
+++ b/pkg/tcpip/ports/BUILD
@@ -7,7 +7,7 @@ go_library(
     name = "ports",
     srcs = ["ports.go"],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/ports",
-    visibility = ["//:sandbox"],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
     ],
diff --git a/pkg/tcpip/seqnum/BUILD b/pkg/tcpip/seqnum/BUILD
index 29b7d761c..b31ddba2f 100644
--- a/pkg/tcpip/seqnum/BUILD
+++ b/pkg/tcpip/seqnum/BUILD
@@ -6,7 +6,5 @@ go_library(
     name = "seqnum",
     srcs = ["seqnum.go"],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/seqnum",
-    visibility = [
-        "//visibility:public",
-    ],
+    visibility = ["//visibility:public"],
 )
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 460db3cf8..69077669a 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -31,9 +31,7 @@ go_library(
         "transport_demuxer.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/stack",
-    visibility = [
-        "//visibility:public",
-    ],
+    visibility = ["//visibility:public"],
     deps = [
         "//pkg/ilist",
         "//pkg/rand",
@@ -87,11 +85,3 @@ go_test(
         "//pkg/tcpip",
     ],
 )
-
-filegroup(
-    name = "autogen",
-    srcs = [
-        "linkaddrentry_list.go",
-    ],
-    visibility = ["//:sandbox"],
-)
diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD
index 9254c3dea..d8c5b5058 100644
--- a/pkg/tcpip/transport/icmp/BUILD
+++ b/pkg/tcpip/transport/icmp/BUILD
@@ -38,11 +38,3 @@ go_library(
         "//pkg/waiter",
     ],
 )
-
-filegroup(
-    name = "autogen",
-    srcs = [
-        "icmp_packet_list.go",
-    ],
-    visibility = ["//:sandbox"],
-)
diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD
index 8ea2e6ee5..44b58ff6b 100644
--- a/pkg/tcpip/transport/packet/BUILD
+++ b/pkg/tcpip/transport/packet/BUILD
@@ -36,11 +36,3 @@ go_library(
         "//pkg/waiter",
     ],
 )
-
-filegroup(
-    name = "autogen",
-    srcs = [
-        "packet_list.go",
-    ],
-    visibility = ["//:sandbox"],
-)
diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD
index 4af49218c..00991ac8e 100644
--- a/pkg/tcpip/transport/raw/BUILD
+++ b/pkg/tcpip/transport/raw/BUILD
@@ -38,11 +38,3 @@ go_library(
         "//pkg/waiter",
     ],
 )
-
-filegroup(
-    name = "autogen",
-    srcs = [
-        "raw_packet_list.go",
-    ],
-    visibility = ["//:sandbox"],
-)
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 3f47b328d..dd1728f9c 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -61,14 +61,6 @@ go_library(
     ],
 )
 
-filegroup(
-    name = "autogen",
-    srcs = [
-        "tcp_segment_list.go",
-    ],
-    visibility = ["//:sandbox"],
-)
-
 go_test(
     name = "tcp_test",
     size = "medium",
diff --git a/pkg/tcpip/transport/tcp/testing/context/BUILD b/pkg/tcpip/transport/tcp/testing/context/BUILD
index 19b0d31c5..b33ec2087 100644
--- a/pkg/tcpip/transport/tcp/testing/context/BUILD
+++ b/pkg/tcpip/transport/tcp/testing/context/BUILD
@@ -8,7 +8,7 @@ go_library(
     srcs = ["context.go"],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context",
     visibility = [
-        "//:sandbox",
+        "//visibility:public",
     ],
     deps = [
         "//pkg/tcpip",
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index c9460aa0d..8d4c3808f 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -59,11 +59,3 @@ go_test(
         "//pkg/waiter",
     ],
 )
-
-filegroup(
-    name = "autogen",
-    srcs = [
-        "udp_packet_list.go",
-    ],
-    visibility = ["//:sandbox"],
-)
diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD
index 1f7efb064..0427bc41f 100644
--- a/pkg/waiter/BUILD
+++ b/pkg/waiter/BUILD
@@ -34,11 +34,3 @@ go_test(
     ],
     embed = [":waiter"],
 )
-
-filegroup(
-    name = "autogen",
-    srcs = [
-        "waiter_list.go",
-    ],
-    visibility = ["//:sandbox"],
-)
-- 
cgit v1.2.3


From 2b1b51f1d7dd96f14b0af3b2663c33bc7ab67f63 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 25 Nov 2019 08:35:09 -0800
Subject: Fix panic in sniffer.

Packets written via SOCK_RAW are guaranteed to have network headers, but not
transport headers. Check first whether there are enough bytes left in the packet
to contain a transport header before attempting to parse it.

PiperOrigin-RevId: 282363895
---
 pkg/tcpip/link/sniffer/sniffer.go | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 767f14303..3392b7edd 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -49,6 +49,13 @@ var LogPackets uint32 = 1
 // LogPacketsToFile must be accessed atomically.
 var LogPacketsToFile uint32 = 1
 
+var transportProtocolMinSizes map[tcpip.TransportProtocolNumber]int = map[tcpip.TransportProtocolNumber]int{
+	header.ICMPv4ProtocolNumber: header.IPv4MinimumSize,
+	header.ICMPv6ProtocolNumber: header.IPv6MinimumSize,
+	header.UDPProtocolNumber:    header.UDPMinimumSize,
+	header.TCPProtocolNumber:    header.TCPMinimumSize,
+}
+
 type endpoint struct {
 	dispatcher stack.NetworkDispatcher
 	lower      stack.LinkEndpoint
@@ -333,6 +340,13 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 		return
 	}
 
+	// We aren't guaranteed to have a transport header - it's possible for
+	// writes via raw endpoints to contain only network headers.
+	if minSize, ok := transportProtocolMinSizes[tcpip.TransportProtocolNumber(transProto)]; ok && len(b) < minSize {
+		log.Infof("%s %v -> %v transport protocol: %d, but no transport header found (possible raw packet)", prefix, src, dst, transProto)
+		return
+	}
+
 	// Figure out the transport layer info.
 	transName := "unknown"
 	srcPort := uint16(0)
-- 
cgit v1.2.3


From 1641338b14204ea941c547cf4c1a70665922ca05 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 25 Nov 2019 09:26:30 -0800
Subject: Set transport and network headers on outbound packets.

These are necessary for iptables to read and parse headers for packet filtering.

PiperOrigin-RevId: 282372811
---
 pkg/tcpip/network/ipv4/ipv4.go      | 9 ++++++---
 pkg/tcpip/network/ipv6/ipv6.go      | 9 ++++++---
 pkg/tcpip/transport/tcp/connect.go  | 1 +
 pkg/tcpip/transport/udp/endpoint.go | 5 +++--
 4 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 7059600f5..e645cf62c 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -240,16 +240,18 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 // WritePacket writes a packet to the given destination address and protocol.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
 	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
+	pkt.NetworkHeader = buffer.View(ip)
 
 	if loop&stack.PacketLoop != 0 {
+		// The inbound path expects the network header to still be in
+		// the PacketBuffer's Data field.
 		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
 		views[0] = pkt.Header.View()
 		views = append(views, pkt.Data.Views()...)
 		loopedR := r.MakeLoopedRoute()
 
 		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
-			Data:          buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
-			NetworkHeader: buffer.View(ip),
+			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 		})
 
 		loopedR.Release()
@@ -277,7 +279,8 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 	}
 
 	for i := range pkts {
-		e.addIPHeader(r, &pkts[i].Header, pkts[i].DataSize, params)
+		ip := e.addIPHeader(r, &pkts[i].Header, pkts[i].DataSize, params)
+		pkts[i].NetworkHeader = buffer.View(ip)
 	}
 	n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
 	r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index c9087ffa7..dd31f0fb7 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -114,16 +114,18 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 // WritePacket writes a packet to the given destination address and protocol.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
 	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
+	pkt.NetworkHeader = buffer.View(ip)
 
 	if loop&stack.PacketLoop != 0 {
+		// The inbound path expects the network header to still be in
+		// the PacketBuffer's Data field.
 		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
 		views[0] = pkt.Header.View()
 		views = append(views, pkt.Data.Views()...)
 		loopedR := r.MakeLoopedRoute()
 
 		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
-			Data:          buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
-			NetworkHeader: buffer.View(ip),
+			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 		})
 
 		loopedR.Release()
@@ -148,7 +150,8 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 	for i := range pkts {
 		hdr := &pkts[i].Header
 		size := pkts[i].DataSize
-		e.addIPHeader(r, hdr, size, params)
+		ip := e.addIPHeader(r, hdr, size, params)
+		pkts[i].NetworkHeader = buffer.View(ip)
 	}
 
 	n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index f14f0ca65..4206db8b6 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -647,6 +647,7 @@ func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, pkt *tcpip.Packet
 	off := pkt.DataOffset
 	// Initialize the header.
 	tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen))
+	pkt.TransportHeader = buffer.View(tcp)
 	tcp.Encode(&header.TCPFields{
 		SrcPort:    id.LocalPort,
 		DstPort:    id.RemotePort,
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 23c1da717..24cb88c13 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -823,8 +823,9 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 		ttl = r.DefaultTTL()
 	}
 	if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, tcpip.PacketBuffer{
-		Header: hdr,
-		Data:   data,
+		Header:          hdr,
+		Data:            data,
+		TransportHeader: buffer.View(udp),
 	}); err != nil {
 		r.Stats().UDP.PacketSendErrors.Increment()
 		return err
-- 
cgit v1.2.3


From a5f7b82036f4a062183f1fed9c27227636c8eed5 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 25 Nov 2019 11:19:15 -0800
Subject: Internal change.

PiperOrigin-RevId: 282396322
---
 pkg/sentry/BUILD | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD
index 2a7122957..2d6379c86 100644
--- a/pkg/sentry/BUILD
+++ b/pkg/sentry/BUILD
@@ -10,8 +10,5 @@ package_group(
         "//runsc/...",
         # Code generated by go_marshal relies on go_marshal libraries.
         "//tools/go_marshal/...",
-
-        # Keep the old paths as a temporary measure.
-        "//third_party/golang/gvisor/pkg/sentry/...",
     ],
 )
-- 
cgit v1.2.3


From 97d2c9a94e802bcb450e50816a913dfc18afc0e3 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Mon, 25 Nov 2019 11:41:39 -0800
Subject: Use mount hints to determine FileAccessType

PiperOrigin-RevId: 282401165
---
 runsc/boot/fs.go      | 18 ++++++++++++++--
 runsc/boot/fs_test.go | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 76036c147..bc9ffaf81 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -465,6 +465,13 @@ func (m *mountHint) checkCompatible(mount specs.Mount) error {
 	return nil
 }
 
+func (m *mountHint) fileAccessType() FileAccessType {
+	if m.share == container {
+		return FileAccessExclusive
+	}
+	return FileAccessShared
+}
+
 func filterUnsupportedOptions(mount specs.Mount) []string {
 	rv := make([]string, 0, len(mount.Options))
 	for _, o := range mount.Options {
@@ -764,8 +771,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	case bind:
 		fd := c.fds.remove()
 		fsName = "9p"
-		// Non-root bind mounts are always shared.
-		opts = p9MountOptions(fd, FileAccessShared)
+		opts = p9MountOptions(fd, c.getMountAccessType(m))
 		// If configured, add overlay to all writable mounts.
 		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
 
@@ -778,6 +784,14 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	return fsName, opts, useOverlay, nil
 }
 
+func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
+	if hint := c.hints.findMount(mount); hint != nil {
+		return hint.fileAccessType()
+	}
+	// Non-root bind mounts are always shared if no hints were provided.
+	return FileAccessShared
+}
+
 // mountSubmount mounts volumes inside the container's root. Because mounts may
 // be readonly, a lower ramfs overlay is added to create the mount point dir.
 // Another overlay is added with tmpfs on top if Config.Overlay is true.
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 49ab34b33..0396a4cfb 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -191,3 +191,61 @@ func TestPodMountHintsErrors(t *testing.T) {
 		})
 	}
 }
+
+func TestGetMountAccessType(t *testing.T) {
+	const source = "foo"
+	for _, tst := range []struct {
+		name        string
+		annotations map[string]string
+		want        FileAccessType
+	}{
+		{
+			name: "container=exclusive",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1", "source"): source,
+				path.Join(MountPrefix, "mount1", "type"):   "bind",
+				path.Join(MountPrefix, "mount1", "share"):  "container",
+			},
+			want: FileAccessExclusive,
+		},
+		{
+			name: "pod=shared",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1", "source"): source,
+				path.Join(MountPrefix, "mount1", "type"):   "bind",
+				path.Join(MountPrefix, "mount1", "share"):  "pod",
+			},
+			want: FileAccessShared,
+		},
+		{
+			name: "shared=shared",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1", "source"): source,
+				path.Join(MountPrefix, "mount1", "type"):   "bind",
+				path.Join(MountPrefix, "mount1", "share"):  "shared",
+			},
+			want: FileAccessShared,
+		},
+		{
+			name: "default=shared",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1", "source"): source + "mismatch",
+				path.Join(MountPrefix, "mount1", "type"):   "bind",
+				path.Join(MountPrefix, "mount1", "share"):  "container",
+			},
+			want: FileAccessShared,
+		},
+	} {
+		t.Run(tst.name, func(t *testing.T) {
+			spec := &specs.Spec{Annotations: tst.annotations}
+			podHints, err := newPodMountHints(spec)
+			if err != nil {
+				t.Fatalf("newPodMountHints failed: %v", err)
+			}
+			mounter := containerMounter{hints: podHints}
+			if got := mounter.getMountAccessType(specs.Mount{Source: source}); got != tst.want {
+				t.Errorf("getMountAccessType(), want: %v, got: %v", tst.want, got)
+			}
+		})
+	}
+}
-- 
cgit v1.2.3


From d92dc065fd98b5875a0945ccc062f91fc4d39190 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 25 Nov 2019 13:51:13 -0800
Subject: Fix typo in go_branch.sh script.

With the ticks, the command `master` is actually be run and the output included
(which is nothing). This is confusing, as we actually mean to say "master" in
the description of the Go branch.

PiperOrigin-RevId: 282426081
---
 tools/go_branch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/go_branch.sh b/tools/go_branch.sh
index 0ac16e266..f97a74aaf 100755
--- a/tools/go_branch.sh
+++ b/tools/go_branch.sh
@@ -78,7 +78,7 @@ cat > README.md <<EOF
 # gVisor
 
 This branch is a synthetic branch, containing only Go sources, that is
-compatible with standard Go tools. See the `master` branch for authoritative
+compatible with standard Go tools. See the master branch for authoritative
 sources and tests.
 EOF
 
-- 
cgit v1.2.3


From 128948d6ae94009c6ad13a0bd96e03e45a560477 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 25 Nov 2019 15:20:25 -0800
Subject: Implement basic umounting for vfs2.

This is required to test filesystems with a non-trivial implementation of
FilesystemImpl.Release(). Propagation isn't handled yet, and umount isn't yet
plumbed out to VirtualFilesystem.UmountAt(), but otherwise the implementation
of umount is believed to be correct.

- Move entering mountTable.seq writer critical sections to callers of
  mountTable.{insert,remove}Seqed. This is required since umount(2) must ensure
  that no new references are taken on the candidate mount after checking that
  it isn't busy, which is only possible by entering a vfs.mountTable.seq writer
  critical section before the check and remaining in it until after
  VFS.umountRecursiveLocked() is complete. (Linux does the same thing:
  fs/namespace.c:do_umount() => lock_mount_hash(),
  fs/pnode.c:propagate_mount_busy(), umount_tree(), unlock_mount_hash().)

- It's not possible for dentry deletion to umount while only holding
  VFS.mountMu for reading, but it's also very unappealing to hold VFS.mountMu
  exclusively around e.g. gofer unlink RPCs. Introduce dentry.mu to avoid these
  problems. This means that VFS.mountMu is never acquired for reading, so
  change it to a sync.Mutex.

PiperOrigin-RevId: 282444343
---
 pkg/sentry/fsimpl/memfs/BUILD             |   4 +-
 pkg/sentry/fsimpl/memfs/benchmark_test.go |  22 ++-
 pkg/sentry/vfs/README.md                  |   4 +-
 pkg/sentry/vfs/dentry.go                  | 128 +++++++++---
 pkg/sentry/vfs/mount.go                   | 319 +++++++++++++++++++++---------
 pkg/sentry/vfs/mount_test.go              |  34 ++--
 pkg/sentry/vfs/mount_unsafe.go            |  60 +++---
 pkg/sentry/vfs/resolving_path.go          |   8 +-
 pkg/sentry/vfs/syscalls.go                |   2 +
 pkg/sentry/vfs/vfs.go                     |  11 +-
 10 files changed, 423 insertions(+), 169 deletions(-)

diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index 952b20c51..bc5c0b591 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -1,9 +1,10 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
 go_template_instance(
     name = "dentry_list",
     out = "dentry_list.go",
@@ -48,6 +49,7 @@ go_test(
     deps = [
         ":memfs",
         "//pkg/abi/linux",
+        "//pkg/refs",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
index a94b17db6..23a846c08 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -21,6 +21,7 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -160,6 +161,8 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
 					b.Fatalf("stat(%q) failed: %v", filePath, err)
 				}
 			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
 		})
 	}
 }
@@ -177,6 +180,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
+			defer mntns.DecRef(vfsObj)
 
 			var filePathBuilder strings.Builder
 			filePathBuilder.WriteByte('/')
@@ -186,7 +190,6 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			defer root.DecRef()
 			vd := root
 			vd.IncRef()
-			defer vd.DecRef()
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				pop := vfs.PathOperation{
@@ -219,6 +222,8 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
 				Mode:  0644,
 			})
+			vd.DecRef()
+			vd = vfs.VirtualDentry{}
 			if err != nil {
 				b.Fatalf("failed to create file %q: %v", filename, err)
 			}
@@ -243,6 +248,8 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
 				}
 			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
 		})
 	}
 }
@@ -343,6 +350,8 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
 					b.Fatalf("stat(%q) failed: %v", filePath, err)
 				}
 			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
 		})
 	}
 }
@@ -360,6 +369,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
+			defer mntns.DecRef(vfsObj)
 
 			var filePathBuilder strings.Builder
 			filePathBuilder.WriteByte('/')
@@ -395,7 +405,6 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to walk to mount root: %v", err)
 			}
-			defer vd.DecRef()
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				pop := vfs.PathOperation{
@@ -435,6 +444,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
 				Mode:  0644,
 			})
+			vd.DecRef()
 			if err != nil {
 				b.Fatalf("failed to create file %q: %v", filename, err)
 			}
@@ -459,6 +469,14 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
 				}
 			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
 		})
 	}
 }
+
+func init() {
+	// Turn off reference leak checking for a fair comparison between vfs1 and
+	// vfs2.
+	refs.SetLeakMode(refs.NoLeakChecking)
+}
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
index 7847854bc..9aa133bcb 100644
--- a/pkg/sentry/vfs/README.md
+++ b/pkg/sentry/vfs/README.md
@@ -39,8 +39,8 @@ Mount references are held by:
 -   Mount: Each referenced Mount holds a reference on its parent, which is the
     mount containing its mount point.
 
--   VirtualFilesystem: A reference is held on all Mounts that are attached
-    (reachable by Mount traversal).
+-   VirtualFilesystem: A reference is held on each Mount that has not been
+    umounted.
 
 MountNamespace and FileDescription references are held by users of VFS. The
 expectation is that each `kernel.Task` holds a reference on its corresponding
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 45912fc58..09ed5a70e 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -16,6 +16,7 @@ package vfs
 
 import (
 	"fmt"
+	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -50,7 +51,7 @@ import (
 // and not inodes. Furthermore, when parties outside the scope of VFS can
 // rename inodes on such filesystems, VFS generally cannot "follow" the rename,
 // both due to synchronization issues and because it may not even be able to
-// name the destination path; this implies that it would in fact be *incorrect*
+// name the destination path; this implies that it would in fact be incorrect
 // for Dentries to be associated with inodes on such filesystems. Consequently,
 // operations that are inode operations in Linux are FilesystemImpl methods
 // and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do
@@ -84,6 +85,9 @@ type Dentry struct {
 	// mounts is accessed using atomic memory operations.
 	mounts uint32
 
+	// mu synchronizes disowning and mounting over this Dentry.
+	mu sync.Mutex
+
 	// children are child Dentries.
 	children map[string]*Dentry
 
@@ -228,36 +232,48 @@ func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dent
 			panic("d is already disowned")
 		}
 	}
-	vfs.mountMu.RLock()
-	if _, ok := mntns.mountpoints[d]; ok {
-		vfs.mountMu.RUnlock()
+	vfs.mountMu.Lock()
+	if mntns.mountpoints[d] != 0 {
+		vfs.mountMu.Unlock()
 		return syserror.EBUSY
 	}
-	// Return with vfs.mountMu locked, which will be unlocked by
-	// AbortDeleteDentry or CommitDeleteDentry.
+	d.mu.Lock()
+	vfs.mountMu.Unlock()
+	// Return with d.mu locked to block attempts to mount over it; it will be
+	// unlocked by AbortDeleteDentry or CommitDeleteDentry.
 	return nil
 }
 
 // AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion
 // fails.
-func (vfs *VirtualFilesystem) AbortDeleteDentry() {
-	vfs.mountMu.RUnlock()
+func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) {
+	d.mu.Unlock()
 }
 
 // CommitDeleteDentry must be called after the file represented by d is
 // deleted, and causes d to become disowned.
 //
+// CommitDeleteDentry is a mutator of d and d.Parent().
+//
 // Preconditions: PrepareDeleteDentry was previously called on d.
 func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
-	delete(d.parent.children, d.name)
+	if d.parent != nil {
+		delete(d.parent.children, d.name)
+	}
 	d.setDisowned()
-	// TODO: lazily unmount mounts at d
-	vfs.mountMu.RUnlock()
+	d.mu.Unlock()
+	if d.isMounted() {
+		vfs.forgetDisownedMountpoint(d)
+	}
 }
 
 // DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as
 // appropriate for in-memory filesystems that don't need to ensure that some
 // external state change succeeds before committing the deletion.
+//
+// DeleteDentry is a mutator of d and d.Parent().
+//
+// Preconditions: d is a child Dentry.
 func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error {
 	if err := vfs.PrepareDeleteDentry(mntns, d); err != nil {
 		return err
@@ -266,6 +282,27 @@ func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) err
 	return nil
 }
 
+// ForceDeleteDentry causes d to become disowned. It should only be used in
+// cases where VFS has no ability to stop the deletion (e.g. d represents the
+// local state of a file on a remote filesystem on which the file has already
+// been deleted).
+//
+// ForceDeleteDentry is a mutator of d and d.Parent().
+//
+// Preconditions: d is a child Dentry.
+func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) {
+	if checkInvariants {
+		if d.parent == nil {
+			panic("d is independent")
+		}
+		if d.IsDisowned() {
+			panic("d is already disowned")
+		}
+	}
+	d.mu.Lock()
+	vfs.CommitDeleteDentry(d)
+}
+
 // PrepareRenameDentry must be called before attempting to rename the file
 // represented by from. If to is not nil, it represents the file that will be
 // replaced or exchanged by the rename. If PrepareRenameDentry succeeds, the
@@ -291,18 +328,21 @@ func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, t
 			}
 		}
 	}
-	vfs.mountMu.RLock()
-	if _, ok := mntns.mountpoints[from]; ok {
-		vfs.mountMu.RUnlock()
+	vfs.mountMu.Lock()
+	if mntns.mountpoints[from] != 0 {
+		vfs.mountMu.Unlock()
 		return syserror.EBUSY
 	}
 	if to != nil {
-		if _, ok := mntns.mountpoints[to]; ok {
-			vfs.mountMu.RUnlock()
+		if mntns.mountpoints[to] != 0 {
+			vfs.mountMu.Unlock()
 			return syserror.EBUSY
 		}
+		to.mu.Lock()
 	}
-	// Return with vfs.mountMu locked, which will be unlocked by
+	from.mu.Lock()
+	vfs.mountMu.Unlock()
+	// Return with from.mu and to.mu locked, which will be unlocked by
 	// AbortRenameDentry, CommitRenameReplaceDentry, or
 	// CommitRenameExchangeDentry.
 	return nil
@@ -310,38 +350,76 @@ func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, t
 
 // AbortRenameDentry must be called after PrepareRenameDentry if the rename
 // fails.
-func (vfs *VirtualFilesystem) AbortRenameDentry() {
-	vfs.mountMu.RUnlock()
+func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) {
+	from.mu.Unlock()
+	if to != nil {
+		to.mu.Unlock()
+	}
 }
 
 // CommitRenameReplaceDentry must be called after the file represented by from
 // is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file
 // that was replaced by from.
 //
+// CommitRenameReplaceDentry is a mutator of from, to, from.Parent(), and
+// to.Parent().
+//
 // Preconditions: PrepareRenameDentry was previously called on from and to.
 // newParent.Child(newName) == to.
 func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) {
-	if to != nil {
-		to.setDisowned()
-		// TODO: lazily unmount mounts at d
-	}
 	if newParent.children == nil {
 		newParent.children = make(map[string]*Dentry)
 	}
 	newParent.children[newName] = from
 	from.parent = newParent
 	from.name = newName
-	vfs.mountMu.RUnlock()
+	from.mu.Unlock()
+	if to != nil {
+		to.setDisowned()
+		to.mu.Unlock()
+		if to.isMounted() {
+			vfs.forgetDisownedMountpoint(to)
+		}
+	}
 }
 
 // CommitRenameExchangeDentry must be called after the files represented by
 // from and to are exchanged by rename(RENAME_EXCHANGE).
 //
+// CommitRenameExchangeDentry is a mutator of from, to, from.Parent(), and
+// to.Parent().
+//
 // Preconditions: PrepareRenameDentry was previously called on from and to.
 func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) {
 	from.parent, to.parent = to.parent, from.parent
 	from.name, to.name = to.name, from.name
 	from.parent.children[from.name] = from
 	to.parent.children[to.name] = to
-	vfs.mountMu.RUnlock()
+	from.mu.Unlock()
+	to.mu.Unlock()
+}
+
+// forgetDisownedMountpoint is called when a mount point is deleted to umount
+// all mounts using it in all other mount namespaces.
+//
+// forgetDisownedMountpoint is analogous to Linux's
+// fs/namespace.c:__detach_mounts().
+func (vfs *VirtualFilesystem) forgetDisownedMountpoint(d *Dentry) {
+	var (
+		vdsToDecRef    []VirtualDentry
+		mountsToDecRef []*Mount
+	)
+	vfs.mountMu.Lock()
+	vfs.mounts.seq.BeginWrite()
+	for mnt := range vfs.mountpoints[d] {
+		vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(mnt, &umountRecursiveOptions{}, vdsToDecRef, mountsToDecRef)
+	}
+	vfs.mounts.seq.EndWrite()
+	vfs.mountMu.Unlock()
+	for _, vd := range vdsToDecRef {
+		vd.DecRef()
+	}
+	for _, mnt := range mountsToDecRef {
+		mnt.decRef()
+	}
 }
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 11702f720..198fb8067 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -38,16 +38,12 @@ import (
 // Mount is analogous to Linux's struct mount. (gVisor does not distinguish
 // between struct mount and struct vfsmount.)
 type Mount struct {
-	// The lower 63 bits of refs are a reference count. The MSB of refs is set
-	// if the Mount has been eagerly unmounted, as by umount(2) without the
-	// MNT_DETACH flag. refs is accessed using atomic memory operations.
-	refs int64
-
-	// The lower 63 bits of writers is the number of calls to
-	// Mount.CheckBeginWrite() that have not yet been paired with a call to
-	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
-	// writers is accessed using atomic memory operations.
-	writers int64
+	// vfs, fs, and root are immutable. References are held on fs and root.
+	//
+	// Invariant: root belongs to fs.
+	vfs  *VirtualFilesystem
+	fs   *Filesystem
+	root *Dentry
 
 	// key is protected by VirtualFilesystem.mountMu and
 	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
@@ -57,13 +53,29 @@ type Mount struct {
 	// key.parent.fs.
 	key mountKey
 
-	// fs, root, and ns are immutable. References are held on fs and root (but
-	// not ns).
-	//
-	// Invariant: root belongs to fs.
-	fs   *Filesystem
-	root *Dentry
-	ns   *MountNamespace
+	// ns is the namespace in which this Mount was mounted. ns is protected by
+	// VirtualFilesystem.mountMu.
+	ns *MountNamespace
+
+	// The lower 63 bits of refs are a reference count. The MSB of refs is set
+	// if the Mount has been eagerly umounted, as by umount(2) without the
+	// MNT_DETACH flag. refs is accessed using atomic memory operations.
+	refs int64
+
+	// children is the set of all Mounts for which Mount.key.parent is this
+	// Mount. children is protected by VirtualFilesystem.mountMu.
+	children map[*Mount]struct{}
+
+	// umounted is true if VFS.umountRecursiveLocked() has been called on this
+	// Mount. VirtualFilesystem does not hold a reference on Mounts for which
+	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
+	umounted bool
+
+	// The lower 63 bits of writers is the number of calls to
+	// Mount.CheckBeginWrite() that have not yet been paired with a call to
+	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
+	// writers is accessed using atomic memory operations.
+	writers int64
 }
 
 // A MountNamespace is a collection of Mounts.
@@ -73,13 +85,16 @@ type Mount struct {
 //
 // MountNamespace is analogous to Linux's struct mnt_namespace.
 type MountNamespace struct {
-	refs int64 // accessed using atomic memory operations
-
 	// root is the MountNamespace's root mount. root is immutable.
 	root *Mount
 
-	// mountpoints contains all Dentries which are mount points in this
-	// namespace. mountpoints is protected by VirtualFilesystem.mountMu.
+	// refs is the reference count. refs is accessed using atomic memory
+	// operations.
+	refs int64
+
+	// mountpoints maps all Dentries which are mount points in this namespace
+	// to the number of Mounts for which they are mount points. mountpoints is
+	// protected by VirtualFilesystem.mountMu.
 	//
 	// mountpoints is used to determine if a Dentry can be moved or removed
 	// (which requires that the Dentry is not a mount point in the calling
@@ -89,7 +104,7 @@ type MountNamespace struct {
 	// MountNamespace; this is required to ensure that
 	// VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate
 	// correctly on unreferenced MountNamespaces.
-	mountpoints map[*Dentry]struct{}
+	mountpoints map[*Dentry]uint32
 }
 
 // NewMountNamespace returns a new mount namespace with a root filesystem
@@ -106,9 +121,10 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 	}
 	mntns := &MountNamespace{
 		refs:        1,
-		mountpoints: make(map[*Dentry]struct{}),
+		mountpoints: make(map[*Dentry]uint32),
 	}
 	mntns.root = &Mount{
+		vfs:  vfs,
 		fs:   fs,
 		root: root,
 		ns:   mntns,
@@ -136,8 +152,10 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
 		return err
 	}
 	vfs.mountMu.Lock()
+	vd.dentry.mu.Lock()
 	for {
 		if vd.dentry.IsDisowned() {
+			vd.dentry.mu.Unlock()
 			vfs.mountMu.Unlock()
 			vd.DecRef()
 			root.decRef(fs)
@@ -153,36 +171,208 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
 		if nextmnt == nil {
 			break
 		}
-		nextmnt.incRef()
+		// It's possible that nextmnt has been umounted but not disconnected,
+		// in which case vfs no longer holds a reference on it, and the last
+		// reference may be concurrently dropped even though we're holding
+		// vfs.mountMu.
+		if !nextmnt.tryIncMountedRef() {
+			break
+		}
+		// This can't fail since we're holding vfs.mountMu.
 		nextmnt.root.incRef(nextmnt.fs)
+		vd.dentry.mu.Unlock()
 		vd.DecRef()
 		vd = VirtualDentry{
 			mount:  nextmnt,
 			dentry: nextmnt.root,
 		}
+		vd.dentry.mu.Lock()
 	}
 	// TODO: Linux requires that either both the mount point and the mount root
 	// are directories, or neither are, and returns ENOTDIR if this is not the
 	// case.
 	mntns := vd.mount.ns
 	mnt := &Mount{
+		vfs:  vfs,
 		fs:   fs,
 		root: root,
 		ns:   mntns,
 		refs: 1,
 	}
-	mnt.storeKey(vd.mount, vd.dentry)
+	vfs.mounts.seq.BeginWrite()
+	vfs.connectLocked(mnt, vd, mntns)
+	vfs.mounts.seq.EndWrite()
+	vd.dentry.mu.Unlock()
+	vfs.mountMu.Unlock()
+	return nil
+}
+
+type umountRecursiveOptions struct {
+	// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
+	// on umounted mounts fail.
+	//
+	// eager is analogous to Linux's UMOUNT_SYNC.
+	eager bool
+
+	// If disconnectHierarchy is true, Mounts that are umounted hierarchically
+	// should be disconnected from their parents. (Mounts whose parents are not
+	// umounted, which in most cases means the Mount passed to the initial call
+	// to umountRecursiveLocked, are unconditionally disconnected for
+	// consistency with Linux.)
+	//
+	// disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED.
+	disconnectHierarchy bool
+}
+
+// umountRecursiveLocked marks mnt and its descendants as umounted. It does not
+// release mount or dentry references; instead, it appends VirtualDentries and
+// Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef
+// respectively, and returns updated slices. (This is necessary because
+// filesystem locks possibly taken by DentryImpl.DecRef() may precede
+// vfs.mountMu in the lock order, and Mount.decRef() may lock vfs.mountMu.)
+//
+// umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree().
+//
+// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
+// writer critical section.
+func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) {
+	if !mnt.umounted {
+		mnt.umounted = true
+		mountsToDecRef = append(mountsToDecRef, mnt)
+		if parent := mnt.parent(); parent != nil && (opts.disconnectHierarchy || !parent.umounted) {
+			vdsToDecRef = append(vdsToDecRef, vfs.disconnectLocked(mnt))
+		}
+	}
+	if opts.eager {
+		for {
+			refs := atomic.LoadInt64(&mnt.refs)
+			if refs < 0 {
+				break
+			}
+			if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs|math.MinInt64) {
+				break
+			}
+		}
+	}
+	for child := range mnt.children {
+		vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(child, opts, vdsToDecRef, mountsToDecRef)
+	}
+	return vdsToDecRef, mountsToDecRef
+}
+
+// connectLocked makes vd the mount parent/point for mnt. It consumes
+// references held by vd.
+//
+// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
+// writer critical section. d.mu must be locked. mnt.parent() == nil.
+func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
+	mnt.storeKey(vd)
+	if vd.mount.children == nil {
+		vd.mount.children = make(map[*Mount]struct{})
+	}
+	vd.mount.children[mnt] = struct{}{}
 	atomic.AddUint32(&vd.dentry.mounts, 1)
-	mntns.mountpoints[vd.dentry] = struct{}{}
+	mntns.mountpoints[vd.dentry]++
+	vfs.mounts.insertSeqed(mnt)
 	vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
 	if !ok {
 		vfsmpmounts = make(map[*Mount]struct{})
 		vfs.mountpoints[vd.dentry] = vfsmpmounts
 	}
 	vfsmpmounts[mnt] = struct{}{}
-	vfs.mounts.Insert(mnt)
-	vfs.mountMu.Unlock()
-	return nil
+}
+
+// disconnectLocked makes vd have no mount parent/point and returns its old
+// mount parent/point with a reference held.
+//
+// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
+// writer critical section. mnt.parent() != nil.
+func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
+	vd := mnt.loadKey()
+	mnt.storeKey(VirtualDentry{})
+	delete(vd.mount.children, mnt)
+	atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
+	mnt.ns.mountpoints[vd.dentry]--
+	if mnt.ns.mountpoints[vd.dentry] == 0 {
+		delete(mnt.ns.mountpoints, vd.dentry)
+	}
+	vfs.mounts.removeSeqed(mnt)
+	vfsmpmounts := vfs.mountpoints[vd.dentry]
+	delete(vfsmpmounts, mnt)
+	if len(vfsmpmounts) == 0 {
+		delete(vfs.mountpoints, vd.dentry)
+	}
+	return vd
+}
+
+// tryIncMountedRef increments mnt's reference count and returns true. If mnt's
+// reference count is already zero, or has been eagerly umounted,
+// tryIncMountedRef does nothing and returns false.
+//
+// tryIncMountedRef does not require that a reference is held on mnt.
+func (mnt *Mount) tryIncMountedRef() bool {
+	for {
+		refs := atomic.LoadInt64(&mnt.refs)
+		if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+func (mnt *Mount) incRef() {
+	// In general, negative values for mnt.refs are valid because the MSB is
+	// the eager-unmount bit.
+	atomic.AddInt64(&mnt.refs, 1)
+}
+
+func (mnt *Mount) decRef() {
+	refs := atomic.AddInt64(&mnt.refs, -1)
+	if refs&^math.MinInt64 == 0 { // mask out MSB
+		var vd VirtualDentry
+		if mnt.parent() != nil {
+			mnt.vfs.mountMu.Lock()
+			mnt.vfs.mounts.seq.BeginWrite()
+			vd = mnt.vfs.disconnectLocked(mnt)
+			mnt.vfs.mounts.seq.EndWrite()
+			mnt.vfs.mountMu.Unlock()
+		}
+		mnt.root.decRef(mnt.fs)
+		mnt.fs.decRef()
+		if vd.Ok() {
+			vd.DecRef()
+		}
+	}
+}
+
+// IncRef increments mntns' reference count.
+func (mntns *MountNamespace) IncRef() {
+	if atomic.AddInt64(&mntns.refs, 1) <= 1 {
+		panic("MountNamespace.IncRef() called without holding a reference")
+	}
+}
+
+// DecRef decrements mntns' reference count.
+func (mntns *MountNamespace) DecRef(vfs *VirtualFilesystem) {
+	if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 {
+		vfs.mountMu.Lock()
+		vfs.mounts.seq.BeginWrite()
+		vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{
+			disconnectHierarchy: true,
+		}, nil, nil)
+		vfs.mounts.seq.EndWrite()
+		vfs.mountMu.Unlock()
+		for _, vd := range vdsToDecRef {
+			vd.DecRef()
+		}
+		for _, mnt := range mountsToDecRef {
+			mnt.decRef()
+		}
+	} else if refs < 0 {
+		panic("MountNamespace.DecRef() called without holding a reference")
+	}
 }
 
 // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
@@ -231,12 +421,12 @@ retryFirst:
 }
 
 // getMountpointAt returns the mount point for the stack of Mounts including
-// mnt. It takes a reference on the returned Mount and Dentry. If no such mount
+// mnt. It takes a reference on the returned VirtualDentry. If no such mount
 // point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil).
 //
 // Preconditions: References are held on mnt and root. vfsroot is not (mnt,
 // mnt.root).
-func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) (*Mount, *Dentry) {
+func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
 	// The first mount is special-cased:
 	//
 	// - The caller must have already checked mnt against vfsroot.
@@ -246,12 +436,12 @@ func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry)
 	// - We don't drop the caller's reference on mnt.
 retryFirst:
 	epoch := vfs.mounts.seq.BeginRead()
-	parent, point := mnt.loadKey()
+	parent, point := mnt.parent(), mnt.point()
 	if !vfs.mounts.seq.ReadOk(epoch) {
 		goto retryFirst
 	}
 	if parent == nil {
-		return nil, nil
+		return VirtualDentry{}
 	}
 	if !parent.tryIncMountedRef() {
 		// Raced with umount.
@@ -263,6 +453,11 @@ retryFirst:
 		parent.decRef()
 		goto retryFirst
 	}
+	if !vfs.mounts.seq.ReadOk(epoch) {
+		point.decRef(parent.fs)
+		parent.decRef()
+		goto retryFirst
+	}
 	mnt = parent
 	d := point
 	for {
@@ -274,7 +469,7 @@ retryFirst:
 		}
 	retryNotFirst:
 		epoch := vfs.mounts.seq.BeginRead()
-		parent, point := mnt.loadKey()
+		parent, point := mnt.parent(), mnt.point()
 		if !vfs.mounts.seq.ReadOk(epoch) {
 			goto retryNotFirst
 		}
@@ -301,43 +496,7 @@ retryFirst:
 		mnt = parent
 		d = point
 	}
-	return mnt, d
-}
-
-// tryIncMountedRef increments mnt's reference count and returns true. If mnt's
-// reference count is already zero, or has been eagerly unmounted,
-// tryIncMountedRef does nothing and returns false.
-//
-// tryIncMountedRef does not require that a reference is held on mnt.
-func (mnt *Mount) tryIncMountedRef() bool {
-	for {
-		refs := atomic.LoadInt64(&mnt.refs)
-		if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted
-			return false
-		}
-		if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) {
-			return true
-		}
-	}
-}
-
-func (mnt *Mount) incRef() {
-	// In general, negative values for mnt.refs are valid because the MSB is
-	// the eager-unmount bit.
-	atomic.AddInt64(&mnt.refs, 1)
-}
-
-func (mnt *Mount) decRef() {
-	refs := atomic.AddInt64(&mnt.refs, -1)
-	if refs&^math.MinInt64 == 0 { // mask out MSB
-		parent, point := mnt.loadKey()
-		if point != nil {
-			point.decRef(parent.fs)
-			parent.decRef()
-		}
-		mnt.root.decRef(mnt.fs)
-		mnt.fs.decRef()
-	}
+	return VirtualDentry{mnt, d}
 }
 
 // CheckBeginWrite increments the counter of in-progress write operations on
@@ -360,7 +519,7 @@ func (mnt *Mount) EndWrite() {
 	atomic.AddInt64(&mnt.writers, -1)
 }
 
-// Preconditions: VirtualFilesystem.mountMu must be locked for writing.
+// Preconditions: VirtualFilesystem.mountMu must be locked.
 func (mnt *Mount) setReadOnlyLocked(ro bool) error {
 	if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro {
 		return nil
@@ -383,22 +542,6 @@ func (mnt *Mount) Filesystem() *Filesystem {
 	return mnt.fs
 }
 
-// IncRef increments mntns' reference count.
-func (mntns *MountNamespace) IncRef() {
-	if atomic.AddInt64(&mntns.refs, 1) <= 1 {
-		panic("MountNamespace.IncRef() called without holding a reference")
-	}
-}
-
-// DecRef decrements mntns' reference count.
-func (mntns *MountNamespace) DecRef() {
-	if refs := atomic.AddInt64(&mntns.refs, 0); refs == 0 {
-		// TODO: unmount mntns.root
-	} else if refs < 0 {
-		panic("MountNamespace.DecRef() called without holding a reference")
-	}
-}
-
 // Root returns mntns' root. A reference is taken on the returned
 // VirtualDentry.
 func (mntns *MountNamespace) Root() VirtualDentry {
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index f394d7483..adff0b94b 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -37,7 +37,7 @@ func TestMountTableInsertLookup(t *testing.T) {
 	mt.Init()
 
 	mount := &Mount{}
-	mount.storeKey(&Mount{}, &Dentry{})
+	mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
 	mt.Insert(mount)
 
 	if m := mt.Lookup(mount.parent(), mount.point()); m != mount {
@@ -78,18 +78,10 @@ const enableComparativeBenchmarks = false
 
 func newBenchMount() *Mount {
 	mount := &Mount{}
-	mount.storeKey(&Mount{}, &Dentry{})
+	mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
 	return mount
 }
 
-func vdkey(mnt *Mount) VirtualDentry {
-	parent, point := mnt.loadKey()
-	return VirtualDentry{
-		mount:  parent,
-		dentry: point,
-	}
-}
-
 func BenchmarkMountTableParallelLookup(b *testing.B) {
 	for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 {
 		for _, numMounts := range benchNumMounts {
@@ -101,7 +93,7 @@ func BenchmarkMountTableParallelLookup(b *testing.B) {
 				for i := 0; i < numMounts; i++ {
 					mount := newBenchMount()
 					mt.Insert(mount)
-					keys = append(keys, vdkey(mount))
+					keys = append(keys, mount.loadKey())
 				}
 
 				var ready sync.WaitGroup
@@ -153,7 +145,7 @@ func BenchmarkMountMapParallelLookup(b *testing.B) {
 				keys := make([]VirtualDentry, 0, numMounts)
 				for i := 0; i < numMounts; i++ {
 					mount := newBenchMount()
-					key := vdkey(mount)
+					key := mount.loadKey()
 					ms[key] = mount
 					keys = append(keys, key)
 				}
@@ -208,7 +200,7 @@ func BenchmarkMountSyncMapParallelLookup(b *testing.B) {
 				keys := make([]VirtualDentry, 0, numMounts)
 				for i := 0; i < numMounts; i++ {
 					mount := newBenchMount()
-					key := vdkey(mount)
+					key := mount.loadKey()
 					ms.Store(key, mount)
 					keys = append(keys, key)
 				}
@@ -290,7 +282,7 @@ func BenchmarkMountMapNegativeLookup(b *testing.B) {
 			ms := make(map[VirtualDentry]*Mount)
 			for i := 0; i < numMounts; i++ {
 				mount := newBenchMount()
-				ms[vdkey(mount)] = mount
+				ms[mount.loadKey()] = mount
 			}
 			negkeys := make([]VirtualDentry, 0, numMounts)
 			for i := 0; i < numMounts; i++ {
@@ -325,7 +317,7 @@ func BenchmarkMountSyncMapNegativeLookup(b *testing.B) {
 			var ms sync.Map
 			for i := 0; i < numMounts; i++ {
 				mount := newBenchMount()
-				ms.Store(vdkey(mount), mount)
+				ms.Store(mount.loadKey(), mount)
 			}
 			negkeys := make([]VirtualDentry, 0, numMounts)
 			for i := 0; i < numMounts; i++ {
@@ -379,7 +371,7 @@ func BenchmarkMountMapInsert(b *testing.B) {
 	b.ResetTimer()
 	for i := range mounts {
 		mount := mounts[i]
-		ms[vdkey(mount)] = mount
+		ms[mount.loadKey()] = mount
 	}
 }
 
@@ -399,7 +391,7 @@ func BenchmarkMountSyncMapInsert(b *testing.B) {
 	b.ResetTimer()
 	for i := range mounts {
 		mount := mounts[i]
-		ms.Store(vdkey(mount), mount)
+		ms.Store(mount.loadKey(), mount)
 	}
 }
 
@@ -432,13 +424,13 @@ func BenchmarkMountMapRemove(b *testing.B) {
 	ms := make(map[VirtualDentry]*Mount)
 	for i := range mounts {
 		mount := mounts[i]
-		ms[vdkey(mount)] = mount
+		ms[mount.loadKey()] = mount
 	}
 
 	b.ResetTimer()
 	for i := range mounts {
 		mount := mounts[i]
-		delete(ms, vdkey(mount))
+		delete(ms, mount.loadKey())
 	}
 }
 
@@ -454,12 +446,12 @@ func BenchmarkMountSyncMapRemove(b *testing.B) {
 	var ms sync.Map
 	for i := range mounts {
 		mount := mounts[i]
-		ms.Store(vdkey(mount), mount)
+		ms.Store(mount.loadKey(), mount)
 	}
 
 	b.ResetTimer()
 	for i := range mounts {
 		mount := mounts[i]
-		ms.Delete(vdkey(mount))
+		ms.Delete(mount.loadKey())
 	}
 }
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index c98b42f91..ab13fa461 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -38,16 +38,6 @@ type mountKey struct {
 	point  unsafe.Pointer // *Dentry
 }
 
-// Invariant: mnt.key's fields are nil. parent and point are non-nil.
-func (mnt *Mount) storeKey(parent *Mount, point *Dentry) {
-	atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent))
-	atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point))
-}
-
-func (mnt *Mount) loadKey() (*Mount, *Dentry) {
-	return (*Mount)(atomic.LoadPointer(&mnt.key.parent)), (*Dentry)(atomic.LoadPointer(&mnt.key.point))
-}
-
 func (mnt *Mount) parent() *Mount {
 	return (*Mount)(atomic.LoadPointer(&mnt.key.parent))
 }
@@ -56,6 +46,19 @@ func (mnt *Mount) point() *Dentry {
 	return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
 }
 
+func (mnt *Mount) loadKey() VirtualDentry {
+	return VirtualDentry{
+		mount:  mnt.parent(),
+		dentry: mnt.point(),
+	}
+}
+
+// Invariant: mnt.key.parent == nil. vd.Ok().
+func (mnt *Mount) storeKey(vd VirtualDentry) {
+	atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount))
+	atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry))
+}
+
 // mountTable maps (mount parent, mount point) pairs to mounts. It supports
 // efficient concurrent lookup, even in the presence of concurrent mutators
 // (provided mutation is sufficiently uncommon).
@@ -201,9 +204,19 @@ loop:
 
 // Insert inserts the given mount into mt.
 //
-// Preconditions: There are no concurrent mutators of mt. mt must not already
-// contain a Mount with the same mount point and parent.
+// Preconditions: mt must not already contain a Mount with the same mount point
+// and parent.
 func (mt *mountTable) Insert(mount *Mount) {
+	mt.seq.BeginWrite()
+	mt.insertSeqed(mount)
+	mt.seq.EndWrite()
+}
+
+// insertSeqed inserts the given mount into mt.
+//
+// Preconditions: mt.seq must be in a writer critical section. mt must not
+// already contain a Mount with the same mount point and parent.
+func (mt *mountTable) insertSeqed(mount *Mount) {
 	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
 
 	// We're under the maximum load factor if:
@@ -215,10 +228,8 @@ func (mt *mountTable) Insert(mount *Mount) {
 	tcap := uintptr(1) << order
 	if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) {
 		// Atomically insert the new element into the table.
-		mt.seq.BeginWrite()
 		atomic.AddUint64(&mt.size, mtSizeLenOne)
 		mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash)
-		mt.seq.EndWrite()
 		return
 	}
 
@@ -241,8 +252,6 @@ func (mt *mountTable) Insert(mount *Mount) {
 	for {
 		oldSlot := (*mountSlot)(oldCur)
 		if oldSlot.value != nil {
-			// Don't need to lock mt.seq yet since newSlots isn't visible
-			// to readers.
 			mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash)
 		}
 		if oldCur == oldLast {
@@ -252,11 +261,9 @@ func (mt *mountTable) Insert(mount *Mount) {
 	}
 	// Insert the new element into the new table.
 	mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash)
-	// Atomically switch to the new table.
-	mt.seq.BeginWrite()
+	// Switch to the new table.
 	atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne)
 	atomic.StorePointer(&mt.slots, newSlots)
-	mt.seq.EndWrite()
 }
 
 // Preconditions: There are no concurrent mutators of the table (slots, cap).
@@ -294,9 +301,18 @@ func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, has
 
 // Remove removes the given mount from mt.
 //
-// Preconditions: There are no concurrent mutators of mt. mt must contain
-// mount.
+// Preconditions: mt must contain mount.
 func (mt *mountTable) Remove(mount *Mount) {
+	mt.seq.BeginWrite()
+	mt.removeSeqed(mount)
+	mt.seq.EndWrite()
+}
+
+// removeSeqed removes the given mount from mt.
+//
+// Preconditions: mt.seq must be in a writer critical section. mt must contain
+// mount.
+func (mt *mountTable) removeSeqed(mount *Mount) {
 	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
 	tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
 	mask := tcap - 1
@@ -311,7 +327,6 @@ func (mt *mountTable) Remove(mount *Mount) {
 			// backward until we either find an empty slot, or an element that
 			// is already in its first-probed slot. (This is backward shift
 			// deletion.)
-			mt.seq.BeginWrite()
 			for {
 				nextOff := (off + mountSlotBytes) & offmask
 				nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff))
@@ -330,7 +345,6 @@ func (mt *mountTable) Remove(mount *Mount) {
 			}
 			atomic.StorePointer(&slot.value, nil)
 			atomic.AddUint64(&mt.size, mtSizeLenNegOne)
-			mt.seq.EndWrite()
 			return
 		}
 		if checkInvariants && slotValue == nil {
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 8d05c8583..61bce6426 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -269,11 +269,11 @@ func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
 		parent = d
 	} else if d == rp.mount.root {
 		// At mount root ...
-		mnt, mntpt := rp.vfs.getMountpointAt(rp.mount, rp.root)
-		if mnt != nil {
+		vd := rp.vfs.getMountpointAt(rp.mount, rp.root)
+		if vd.Ok() {
 			// ... of non-root mount.
-			rp.nextMount = mnt
-			rp.nextStart = mntpt
+			rp.nextMount = vd.mount
+			rp.nextStart = vd.dentry
 			return nil, resolveMountRootError{}
 		}
 		// ... of root mount.
diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go
index abde0feaa..49952b2cc 100644
--- a/pkg/sentry/vfs/syscalls.go
+++ b/pkg/sentry/vfs/syscalls.go
@@ -230,6 +230,8 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) err
 //
 // - VFS.SymlinkAt()
 //
+// - VFS.UmountAt()
+//
 // - VFS.UnlinkAt()
 //
 // - FileDescription.(almost everything)
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 4a8a69540..88e865d86 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -16,9 +16,14 @@
 //
 // Lock order:
 //
-// Filesystem implementation locks
+// FilesystemImpl/FileDescriptionImpl locks
 //   VirtualFilesystem.mountMu
+//     Dentry.mu
+//       Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
 // VirtualFilesystem.fsTypesMu
+//
+// Locking Dentry.mu in multiple Dentries requires holding
+// VirtualFilesystem.mountMu.
 package vfs
 
 import (
@@ -33,7 +38,7 @@ type VirtualFilesystem struct {
 	// mountMu serializes mount mutations.
 	//
 	// mountMu is analogous to Linux's namespace_sem.
-	mountMu sync.RWMutex
+	mountMu sync.Mutex
 
 	// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
 	// are uniquely namespaced, including mount parent in the key correctly
@@ -52,7 +57,7 @@ type VirtualFilesystem struct {
 	// mountpoints maps mount points to mounts at those points in all
 	// namespaces. mountpoints is protected by mountMu.
 	//
-	// mountpoints is used to find mounts that must be unmounted due to
+	// mountpoints is used to find mounts that must be umounted due to
 	// removal of a mount point Dentry from another mount namespace. ("A file
 	// or directory that is a mount point in one namespace that is not a mount
 	// point in another namespace, may be renamed, unlinked, or removed
-- 
cgit v1.2.3


From b72e1b3c0873ea29d031db42e39ca053923eecff Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 25 Nov 2019 18:09:15 -0800
Subject: Minor VFS2 interface changes.

- Remove the Filesystem argument from DentryImpl.*Ref(); in general DentryImpls
  that need the Filesystem for reference counting will probably also need it
  for other interface methods that don't plumb Filesystem, so it's easier to
  just store a pointer to the filesystem in the DentryImpl.

- Add a pointer to the VirtualFilesystem to Filesystem, which is needed by the
  gofer client to disown dentries for cache eviction triggered by dentry
  reference count changes.

- Rename FilesystemType.NewFilesystem to GetFilesystem; in some cases (e.g.
  sysfs, cgroupfs) it's much cleaner for there to be only one Filesystem that
  is used by all mounts, and in at least one case (devtmpfs) it's visibly
  incorrect not to do so, so NewFilesystem doesn't always actually create and
  return a *new* Filesystem.

- Require callers of FileDescription.Init() to increment Mount/Dentry
  references. This is because the gofer client may, in the OpenAt() path, take
  a reference on a dentry with 0 references, which is safe due to
  synchronization that is outside the scope of this CL, and it would be safer
  to still have its implementation of DentryImpl.IncRef() check for an
  increment for 0 references in other cases.

- Add FileDescription.TryIncRef. This is used by the gofer client to take
  references on "special file descriptions" (FDs for files such as pipes,
  sockets, and devices), which use per-FD handles (fids) instead of
  dentry-shared handles, for sync() and syncfs().

PiperOrigin-RevId: 282473364
---
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go |  4 +-
 pkg/sentry/fsimpl/ext/block_map_file.go           |  4 +-
 pkg/sentry/fsimpl/ext/block_map_test.go           |  4 +-
 pkg/sentry/fsimpl/ext/dentry.go                   | 10 ++--
 pkg/sentry/fsimpl/ext/ext.go                      | 10 ++--
 pkg/sentry/fsimpl/ext/ext_test.go                 | 14 +++---
 pkg/sentry/fsimpl/ext/extent_file.go              |  6 +--
 pkg/sentry/fsimpl/ext/extent_test.go              |  4 +-
 pkg/sentry/fsimpl/ext/file_description.go         |  4 +-
 pkg/sentry/fsimpl/ext/inode.go                    | 28 +++++++-----
 pkg/sentry/fsimpl/memfs/benchmark_test.go         |  6 +--
 pkg/sentry/fsimpl/memfs/filesystem.go             | 15 ++++--
 pkg/sentry/fsimpl/memfs/memfs.go                  | 16 +++----
 pkg/sentry/fsimpl/memfs/named_pipe.go             |  5 +-
 pkg/sentry/fsimpl/memfs/pipe_test.go              |  2 +-
 pkg/sentry/vfs/dentry.go                          | 24 ++++++----
 pkg/sentry/vfs/file_description.go                | 33 +++++++++++--
 pkg/sentry/vfs/file_description_impl_util_test.go |  2 +-
 pkg/sentry/vfs/filesystem.go                      | 20 ++++++--
 pkg/sentry/vfs/filesystem_type.go                 | 10 ++--
 pkg/sentry/vfs/mount.go                           | 56 ++++++++++++-----------
 pkg/sentry/vfs/resolving_path.go                  |  8 ++--
 pkg/sentry/vfs/syscalls.go                        |  2 +-
 pkg/sentry/vfs/testutil.go                        | 12 ++---
 pkg/sentry/vfs/vfs.go                             |  8 ++--
 25 files changed, 186 insertions(+), 121 deletions(-)

diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 10a8083a0..94cd74095 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -50,7 +50,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
 	// Create VFS.
 	vfsObj := vfs.New()
 	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
 	if err != nil {
 		f.Close()
 		return nil, nil, nil, nil, err
@@ -81,7 +81,7 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf
 	ctx := contexttest.Context(b)
 	creds := auth.CredentialsFromContext(ctx)
 
-	if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())}); err != nil {
+	if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}); err != nil {
 		b.Fatalf("failed to mount tmpfs submount: %v", err)
 	}
 	return func() {
diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go
index cea89bcd9..a2d8c3ad6 100644
--- a/pkg/sentry/fsimpl/ext/block_map_file.go
+++ b/pkg/sentry/fsimpl/ext/block_map_file.go
@@ -154,7 +154,7 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds
 			toRead = len(dst)
 		}
 
-		n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff))
+		n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff))
 		if n < toRead {
 			return n, syserror.EIO
 		}
@@ -174,7 +174,7 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds
 	curChildOff := relFileOff % childCov
 	for i := startIdx; i < endIdx; i++ {
 		var childPhyBlk uint32
-		err := readFromDisk(f.regFile.inode.dev, curPhyBlkOff+int64(i*4), &childPhyBlk)
+		err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk)
 		if err != nil {
 			return read, err
 		}
diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go
index 213aa3919..181727ef7 100644
--- a/pkg/sentry/fsimpl/ext/block_map_test.go
+++ b/pkg/sentry/fsimpl/ext/block_map_test.go
@@ -87,12 +87,14 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
 	mockDisk := make([]byte, mockBMDiskSize)
 	regFile := regularFile{
 		inode: inode{
+			fs: &filesystem{
+				dev: bytes.NewReader(mockDisk),
+			},
 			diskInode: &disklayout.InodeNew{
 				InodeOld: disklayout.InodeOld{
 					SizeLo: getMockBMFileFize(),
 				},
 			},
-			dev:     bytes.NewReader(mockDisk),
 			blkSize: uint64(mockBMBlkSize),
 		},
 	}
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index 054fb42b6..a080cb189 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -41,16 +41,18 @@ func newDentry(in *inode) *dentry {
 }
 
 // IncRef implements vfs.DentryImpl.IncRef.
-func (d *dentry) IncRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) IncRef() {
 	d.inode.incRef()
 }
 
 // TryIncRef implements vfs.DentryImpl.TryIncRef.
-func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+func (d *dentry) TryIncRef() bool {
 	return d.inode.tryIncRef()
 }
 
 // DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef(vfsfs *vfs.Filesystem) {
-	d.inode.decRef(vfsfs.Impl().(*filesystem))
+func (d *dentry) DecRef() {
+	// FIXME(b/134676337): filesystem.mu may not be locked as required by
+	// inode.decRef().
+	d.inode.decRef()
 }
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
index f10accafc..4b7d17dc6 100644
--- a/pkg/sentry/fsimpl/ext/ext.go
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -40,14 +40,14 @@ var _ vfs.FilesystemType = (*FilesystemType)(nil)
 // Currently there are two ways of mounting an ext(2/3/4) fs:
 //   1. Specify a mount with our internal special MountType in the OCI spec.
 //   2. Expose the device to the container and mount it from application layer.
-func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, error) {
+func getDeviceFd(source string, opts vfs.GetFilesystemOptions) (io.ReaderAt, error) {
 	if opts.InternalData == nil {
 		// User mount call.
 		// TODO(b/134676337): Open the device specified by `source` and return that.
 		panic("unimplemented")
 	}
 
-	// NewFilesystem call originated from within the sentry.
+	// GetFilesystem call originated from within the sentry.
 	devFd, ok := opts.InternalData.(int)
 	if !ok {
 		return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device")
@@ -91,8 +91,8 @@ func isCompatible(sb disklayout.SuperBlock) bool {
 	return true
 }
 
-// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
-func (FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	// TODO(b/134676337): Ensure that the user is mounting readonly. If not,
 	// EACCESS should be returned according to mount(2). Filesystem independent
 	// flags (like readonly) are currently not available in pkg/sentry/vfs.
@@ -103,7 +103,7 @@ func (FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials
 	}
 
 	fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)}
-	fs.vfsfs.Init(&fs)
+	fs.vfsfs.Init(vfsObj, &fs)
 	fs.sb, err = readSuperBlock(dev)
 	if err != nil {
 		return nil, nil, err
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 1aa2bd6a4..307e4d68c 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -66,7 +66,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
 	// Create VFS.
 	vfsObj := vfs.New()
 	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
 	if err != nil {
 		f.Close()
 		return nil, nil, nil, nil, err
@@ -509,27 +509,27 @@ func TestIterDirents(t *testing.T) {
 	}
 
 	wantDirents := []vfs.Dirent{
-		vfs.Dirent{
+		{
 			Name: ".",
 			Type: linux.DT_DIR,
 		},
-		vfs.Dirent{
+		{
 			Name: "..",
 			Type: linux.DT_DIR,
 		},
-		vfs.Dirent{
+		{
 			Name: "lost+found",
 			Type: linux.DT_DIR,
 		},
-		vfs.Dirent{
+		{
 			Name: "file.txt",
 			Type: linux.DT_REG,
 		},
-		vfs.Dirent{
+		{
 			Name: "bigfile.txt",
 			Type: linux.DT_REG,
 		},
-		vfs.Dirent{
+		{
 			Name: "symlink.txt",
 			Type: linux.DT_LNK,
 		},
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index 38b68a2d3..3d3ebaca6 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -99,7 +99,7 @@ func (f *extentFile) buildExtTree() error {
 func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*disklayout.ExtentNode, error) {
 	var header disklayout.ExtentHeader
 	off := entry.PhysicalBlock() * f.regFile.inode.blkSize
-	err := readFromDisk(f.regFile.inode.dev, int64(off), &header)
+	err := readFromDisk(f.regFile.inode.fs.dev, int64(off), &header)
 	if err != nil {
 		return nil, err
 	}
@@ -115,7 +115,7 @@ func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*diskla
 			curEntry = &disklayout.ExtentIdx{}
 		}
 
-		err := readFromDisk(f.regFile.inode.dev, int64(off), curEntry)
+		err := readFromDisk(f.regFile.inode.fs.dev, int64(off), curEntry)
 		if err != nil {
 			return nil, err
 		}
@@ -229,7 +229,7 @@ func (f *extentFile) readFromExtent(ex *disklayout.Extent, off uint64, dst []byt
 		toRead = len(dst)
 	}
 
-	n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], int64(readStart))
+	n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], int64(readStart))
 	if n < toRead {
 		return n, syserror.EIO
 	}
diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go
index 42d0a484b..a2382daa3 100644
--- a/pkg/sentry/fsimpl/ext/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/extent_test.go
@@ -180,13 +180,15 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []
 	mockExtentFile := &extentFile{
 		regFile: regularFile{
 			inode: inode{
+				fs: &filesystem{
+					dev: bytes.NewReader(mockDisk),
+				},
 				diskInode: &disklayout.InodeNew{
 					InodeOld: disklayout.InodeOld{
 						SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
 					},
 				},
 				blkSize: mockExtentBlkSize,
-				dev:     bytes.NewReader(mockDisk),
 			},
 		},
 	}
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index 4d18b28cb..5eca2b83f 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -36,11 +36,11 @@ type fileDescription struct {
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
-	return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
+	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
 }
 
 func (fd *fileDescription) inode() *inode {
-	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
+	return fd.vfsfd.Dentry().Impl().(*dentry).inode
 }
 
 // StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index e6c847a71..24249525c 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -16,7 +16,6 @@ package ext
 
 import (
 	"fmt"
-	"io"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -42,13 +41,13 @@ type inode struct {
 	// refs is a reference count. refs is accessed using atomic memory operations.
 	refs int64
 
+	// fs is the containing filesystem.
+	fs *filesystem
+
 	// inodeNum is the inode number of this inode on disk. This is used to
 	// identify inodes within the ext filesystem.
 	inodeNum uint32
 
-	// dev represents the underlying device. Same as filesystem.dev.
-	dev io.ReaderAt
-
 	// blkSize is the fs data block size. Same as filesystem.sb.BlockSize().
 	blkSize uint64
 
@@ -81,10 +80,10 @@ func (in *inode) tryIncRef() bool {
 // decRef decrements the inode ref count and releases the inode resources if
 // the ref count hits 0.
 //
-// Precondition: Must have locked fs.mu.
-func (in *inode) decRef(fs *filesystem) {
+// Precondition: Must have locked filesystem.mu.
+func (in *inode) decRef() {
 	if refs := atomic.AddInt64(&in.refs, -1); refs == 0 {
-		delete(fs.inodeCache, in.inodeNum)
+		delete(in.fs.inodeCache, in.inodeNum)
 	} else if refs < 0 {
 		panic("ext.inode.decRef() called without holding a reference")
 	}
@@ -117,8 +116,8 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 
 	// Build the inode based on its type.
 	inode := inode{
+		fs:        fs,
 		inodeNum:  inodeNum,
-		dev:       fs.dev,
 		blkSize:   blkSize,
 		diskInode: diskInode,
 	}
@@ -154,11 +153,14 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 	if err := in.checkPermissions(rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
+	mnt := rp.Mount()
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
 		fd.flags = flags
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		mnt.IncRef()
+		vfsd.IncRef()
+		fd.vfsfd.Init(&fd, mnt, vfsd)
 		return &fd.vfsfd, nil
 	case *directory:
 		// Can't open directories writably. This check is not necessary for a read
@@ -167,8 +169,10 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
 		fd.flags = flags
+		mnt.IncRef()
+		vfsd.IncRef()
+		fd.vfsfd.Init(&fd, mnt, vfsd)
 		return &fd.vfsfd, nil
 	case *symlink:
 		if flags&linux.O_PATH == 0 {
@@ -177,7 +181,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 		}
 		var fd symlinkFD
 		fd.flags = flags
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		mnt.IncRef()
+		vfsd.IncRef()
+		fd.vfsfd.Init(&fd, mnt, vfsd)
 		return &fd.vfsfd, nil
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", in.impl))
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
index 23a846c08..ea6417ce7 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -176,7 +176,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			// Create VFS.
 			vfsObj := vfs.New()
 			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
@@ -365,7 +365,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			// Create VFS.
 			vfsObj := vfs.New()
 			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
@@ -394,7 +394,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			}
 			defer mountPoint.DecRef()
 			// Create and mount the submount.
-			if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.NewFilesystemOptions{}); err != nil {
+			if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.GetFilesystemOptions{}); err != nil {
 				b.Fatalf("failed to mount tmpfs submount: %v", err)
 			}
 			filePathBuilder.WriteString(mountPointName)
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index f006c40cd..08a9cb8ef 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -159,7 +159,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 			return nil, err
 		}
 	}
-	inode.incRef() // vfsd.IncRef(&fs.vfsfs)
+	inode.incRef()
 	return vfsd, nil
 }
 
@@ -379,6 +379,7 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 			return nil, err
 		}
 	}
+	mnt := rp.Mount()
 	switch impl := i.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
@@ -386,12 +387,14 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
 		fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
 		if fd.writable {
-			if err := rp.Mount().CheckBeginWrite(); err != nil {
+			if err := mnt.CheckBeginWrite(); err != nil {
 				return nil, err
 			}
-			// Mount.EndWrite() is called by regularFileFD.Release().
+			// mnt.EndWrite() is called by regularFileFD.Release().
 		}
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		mnt.IncRef()
+		vfsd.IncRef()
+		fd.vfsfd.Init(&fd, mnt, vfsd)
 		if flags&linux.O_TRUNC != 0 {
 			impl.mu.Lock()
 			impl.data = impl.data[:0]
@@ -405,7 +408,9 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		mnt.IncRef()
+		vfsd.IncRef()
+		fd.vfsfd.Init(&fd, mnt, vfsd)
 		fd.flags = flags
 		return &fd.vfsfd, nil
 	case *symlink:
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
index 64c851c1a..4cb2a4e0f 100644
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -52,10 +52,10 @@ type filesystem struct {
 	nextInoMinusOne uint64 // accessed using atomic memory operations
 }
 
-// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
-func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	var fs filesystem
-	fs.vfsfs.Init(&fs)
+	fs.vfsfs.Init(vfsObj, &fs)
 	root := fs.newDentry(fs.newDirectory(creds, 01777))
 	return &fs.vfsfs, &root.vfsd, nil
 }
@@ -99,17 +99,17 @@ func (fs *filesystem) newDentry(inode *inode) *dentry {
 }
 
 // IncRef implements vfs.DentryImpl.IncRef.
-func (d *dentry) IncRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) IncRef() {
 	d.inode.incRef()
 }
 
 // TryIncRef implements vfs.DentryImpl.TryIncRef.
-func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+func (d *dentry) TryIncRef() bool {
 	return d.inode.tryIncRef()
 }
 
 // DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) DecRef() {
 	d.inode.decRef()
 }
 
@@ -266,11 +266,11 @@ type fileDescription struct {
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
-	return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
+	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
 }
 
 func (fd *fileDescription) inode() *inode {
-	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
+	return fd.vfsfd.Dentry().Impl().(*dentry).inode
 }
 
 // StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go
index 732ed7c58..91cb4b1fc 100644
--- a/pkg/sentry/fsimpl/memfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/memfs/named_pipe.go
@@ -54,6 +54,9 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v
 	if err != nil {
 		return nil, err
 	}
-	fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+	mnt := rp.Mount()
+	mnt.IncRef()
+	vfsd.IncRef()
+	fd.vfsfd.Init(&fd, mnt, vfsd)
 	return &fd.vfsfd, nil
 }
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go
index 0674b81a3..a3a870571 100644
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/memfs/pipe_test.go
@@ -152,7 +152,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	// Create VFS.
 	vfsObj := vfs.New()
 	vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
 	}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 09ed5a70e..40f4c1d09 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -118,7 +118,7 @@ func (d *Dentry) Impl() DentryImpl {
 type DentryImpl interface {
 	// IncRef increments the Dentry's reference count. A Dentry with a non-zero
 	// reference count must remain coherent with the state of the filesystem.
-	IncRef(fs *Filesystem)
+	IncRef()
 
 	// TryIncRef increments the Dentry's reference count and returns true. If
 	// the Dentry's reference count is zero, TryIncRef may do nothing and
@@ -126,10 +126,10 @@ type DentryImpl interface {
 	// guarantee that the Dentry is coherent with the state of the filesystem.)
 	//
 	// TryIncRef does not require that a reference is held on the Dentry.
-	TryIncRef(fs *Filesystem) bool
+	TryIncRef() bool
 
 	// DecRef decrements the Dentry's reference count.
-	DecRef(fs *Filesystem)
+	DecRef()
 }
 
 // IsDisowned returns true if d is disowned.
@@ -146,16 +146,20 @@ func (d *Dentry) isMounted() bool {
 	return atomic.LoadUint32(&d.mounts) != 0
 }
 
-func (d *Dentry) incRef(fs *Filesystem) {
-	d.impl.IncRef(fs)
+// IncRef increments d's reference count.
+func (d *Dentry) IncRef() {
+	d.impl.IncRef()
 }
 
-func (d *Dentry) tryIncRef(fs *Filesystem) bool {
-	return d.impl.TryIncRef(fs)
+// TryIncRef increments d's reference count and returns true. If d's reference
+// count is zero, TryIncRef may instead do nothing and return false.
+func (d *Dentry) TryIncRef() bool {
+	return d.impl.TryIncRef()
 }
 
-func (d *Dentry) decRef(fs *Filesystem) {
-	d.impl.DecRef(fs)
+// DecRef decrements d's reference count.
+func (d *Dentry) DecRef() {
+	d.impl.DecRef()
 }
 
 // These functions are exported so that filesystem implementations can use
@@ -420,6 +424,6 @@ func (vfs *VirtualFilesystem) forgetDisownedMountpoint(d *Dentry) {
 		vd.DecRef()
 	}
 	for _, mnt := range mountsToDecRef {
-		mnt.decRef()
+		mnt.DecRef()
 	}
 }
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 3a9665800..34007eb57 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -47,15 +47,14 @@ type FileDescription struct {
 	impl FileDescriptionImpl
 }
 
-// Init must be called before first use of fd. It takes references on mnt and
-// d.
+// Init must be called before first use of fd. It takes ownership of references
+// on mnt and d held by the caller.
 func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) {
 	fd.refs = 1
 	fd.vd = VirtualDentry{
 		mount:  mnt,
 		dentry: d,
 	}
-	fd.vd.IncRef()
 	fd.impl = impl
 }
 
@@ -64,6 +63,18 @@ func (fd *FileDescription) Impl() FileDescriptionImpl {
 	return fd.impl
 }
 
+// Mount returns the mount on which fd was opened. It does not take a reference
+// on the returned Mount.
+func (fd *FileDescription) Mount() *Mount {
+	return fd.vd.mount
+}
+
+// Dentry returns the dentry at which fd was opened. It does not take a
+// reference on the returned Dentry.
+func (fd *FileDescription) Dentry() *Dentry {
+	return fd.vd.dentry
+}
+
 // VirtualDentry returns the location at which fd was opened. It does not take
 // a reference on the returned VirtualDentry.
 func (fd *FileDescription) VirtualDentry() VirtualDentry {
@@ -75,6 +86,22 @@ func (fd *FileDescription) IncRef() {
 	atomic.AddInt64(&fd.refs, 1)
 }
 
+// TryIncRef increments fd's reference count and returns true. If fd's
+// reference count is already zero, TryIncRef does nothing and returns false.
+//
+// TryIncRef does not require that a reference is held on fd.
+func (fd *FileDescription) TryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&fd.refs)
+		if refs <= 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
 // DecRef decrements fd's reference count.
 func (fd *FileDescription) DecRef() {
 	if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 511b829fc..a5561dcbe 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -90,7 +90,7 @@ func TestGenCountFD(t *testing.T) {
 
 	vfsObj := New() // vfs.New()
 	vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &NewFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("failed to create testfs root mount: %v", err)
 	}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 7a074b718..76ff8cf51 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -33,29 +33,41 @@ type Filesystem struct {
 	// operations.
 	refs int64
 
+	// vfs is the VirtualFilesystem that uses this Filesystem. vfs is
+	// immutable.
+	vfs *VirtualFilesystem
+
 	// impl is the FilesystemImpl associated with this Filesystem. impl is
 	// immutable. This should be the last field in Dentry.
 	impl FilesystemImpl
 }
 
 // Init must be called before first use of fs.
-func (fs *Filesystem) Init(impl FilesystemImpl) {
+func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) {
 	fs.refs = 1
+	fs.vfs = vfsObj
 	fs.impl = impl
 }
 
+// VirtualFilesystem returns the containing VirtualFilesystem.
+func (fs *Filesystem) VirtualFilesystem() *VirtualFilesystem {
+	return fs.vfs
+}
+
 // Impl returns the FilesystemImpl associated with fs.
 func (fs *Filesystem) Impl() FilesystemImpl {
 	return fs.impl
 }
 
-func (fs *Filesystem) incRef() {
+// IncRef increments fs' reference count.
+func (fs *Filesystem) IncRef() {
 	if atomic.AddInt64(&fs.refs, 1) <= 1 {
-		panic("Filesystem.incRef() called without holding a reference")
+		panic("Filesystem.IncRef() called without holding a reference")
 	}
 }
 
-func (fs *Filesystem) decRef() {
+// DecRef decrements fs' reference count.
+func (fs *Filesystem) DecRef() {
 	if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
 		fs.impl.Release()
 	} else if refs < 0 {
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index f401ad7f3..c335e206d 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -25,21 +25,21 @@ import (
 //
 // FilesystemType is analogous to Linux's struct file_system_type.
 type FilesystemType interface {
-	// NewFilesystem returns a Filesystem configured by the given options,
+	// GetFilesystem returns a Filesystem configured by the given options,
 	// along with its mount root. A reference is taken on the returned
 	// Filesystem and Dentry.
-	NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error)
+	GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error)
 }
 
-// NewFilesystemOptions contains options to FilesystemType.NewFilesystem.
-type NewFilesystemOptions struct {
+// GetFilesystemOptions contains options to FilesystemType.GetFilesystem.
+type GetFilesystemOptions struct {
 	// Data is the string passed as the 5th argument to mount(2), which is
 	// usually a comma-separated list of filesystem-specific mount options.
 	Data string
 
 	// InternalData holds opaque FilesystemType-specific data. There is
 	// intentionally no way for applications to specify InternalData; if it is
-	// not nil, the call to NewFilesystem originates from within the sentry.
+	// not nil, the call to GetFilesystem originates from within the sentry.
 	InternalData interface{}
 }
 
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 198fb8067..1c3b2e987 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -110,12 +110,12 @@ type MountNamespace struct {
 // NewMountNamespace returns a new mount namespace with a root filesystem
 // configured by the given arguments. A reference is taken on the returned
 // MountNamespace.
-func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *NewFilesystemOptions) (*MountNamespace, error) {
+func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
 	fsType := vfs.getFilesystemType(fsTypeName)
 	if fsType == nil {
 		return nil, syserror.ENODEV
 	}
-	fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts)
+	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
 	if err != nil {
 		return nil, err
 	}
@@ -133,13 +133,13 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 	return mntns, nil
 }
 
-// NewMount creates and mounts a new Filesystem.
-func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *NewFilesystemOptions) error {
+// NewMount creates and mounts a Filesystem configured by the given arguments.
+func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *GetFilesystemOptions) error {
 	fsType := vfs.getFilesystemType(fsTypeName)
 	if fsType == nil {
 		return syserror.ENODEV
 	}
-	fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts)
+	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
 	if err != nil {
 		return err
 	}
@@ -147,8 +147,8 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
 	// lock ordering.
 	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
 	if err != nil {
-		root.decRef(fs)
-		fs.decRef()
+		root.DecRef()
+		fs.DecRef()
 		return err
 	}
 	vfs.mountMu.Lock()
@@ -158,8 +158,8 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
 			vd.dentry.mu.Unlock()
 			vfs.mountMu.Unlock()
 			vd.DecRef()
-			root.decRef(fs)
-			fs.decRef()
+			root.DecRef()
+			fs.DecRef()
 			return syserror.ENOENT
 		}
 		// vd might have been mounted over between vfs.GetDentryAt() and
@@ -179,7 +179,7 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
 			break
 		}
 		// This can't fail since we're holding vfs.mountMu.
-		nextmnt.root.incRef(nextmnt.fs)
+		nextmnt.root.IncRef()
 		vd.dentry.mu.Unlock()
 		vd.DecRef()
 		vd = VirtualDentry{
@@ -229,7 +229,7 @@ type umountRecursiveOptions struct {
 // Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef
 // respectively, and returns updated slices. (This is necessary because
 // filesystem locks possibly taken by DentryImpl.DecRef() may precede
-// vfs.mountMu in the lock order, and Mount.decRef() may lock vfs.mountMu.)
+// vfs.mountMu in the lock order, and Mount.DecRef() may lock vfs.mountMu.)
 //
 // umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree().
 //
@@ -322,13 +322,15 @@ func (mnt *Mount) tryIncMountedRef() bool {
 	}
 }
 
-func (mnt *Mount) incRef() {
+// IncRef increments mnt's reference count.
+func (mnt *Mount) IncRef() {
 	// In general, negative values for mnt.refs are valid because the MSB is
 	// the eager-unmount bit.
 	atomic.AddInt64(&mnt.refs, 1)
 }
 
-func (mnt *Mount) decRef() {
+// DecRef decrements mnt's reference count.
+func (mnt *Mount) DecRef() {
 	refs := atomic.AddInt64(&mnt.refs, -1)
 	if refs&^math.MinInt64 == 0 { // mask out MSB
 		var vd VirtualDentry
@@ -339,8 +341,8 @@ func (mnt *Mount) decRef() {
 			mnt.vfs.mounts.seq.EndWrite()
 			mnt.vfs.mountMu.Unlock()
 		}
-		mnt.root.decRef(mnt.fs)
-		mnt.fs.decRef()
+		mnt.root.DecRef()
+		mnt.fs.DecRef()
 		if vd.Ok() {
 			vd.DecRef()
 		}
@@ -368,7 +370,7 @@ func (mntns *MountNamespace) DecRef(vfs *VirtualFilesystem) {
 			vd.DecRef()
 		}
 		for _, mnt := range mountsToDecRef {
-			mnt.decRef()
+			mnt.DecRef()
 		}
 	} else if refs < 0 {
 		panic("MountNamespace.DecRef() called without holding a reference")
@@ -413,7 +415,7 @@ retryFirst:
 			// Raced with umount.
 			continue
 		}
-		mnt.decRef()
+		mnt.DecRef()
 		mnt = next
 		d = next.root
 	}
@@ -447,15 +449,15 @@ retryFirst:
 		// Raced with umount.
 		goto retryFirst
 	}
-	if !point.tryIncRef(parent.fs) {
+	if !point.TryIncRef() {
 		// Since Mount holds a reference on Mount.key.point, this can only
 		// happen due to a racing change to Mount.key.
-		parent.decRef()
+		parent.DecRef()
 		goto retryFirst
 	}
 	if !vfs.mounts.seq.ReadOk(epoch) {
-		point.decRef(parent.fs)
-		parent.decRef()
+		point.DecRef()
+		parent.DecRef()
 		goto retryFirst
 	}
 	mnt = parent
@@ -480,19 +482,19 @@ retryFirst:
 			// Raced with umount.
 			goto retryNotFirst
 		}
-		if !point.tryIncRef(parent.fs) {
+		if !point.TryIncRef() {
 			// Since Mount holds a reference on Mount.key.point, this can
 			// only happen due to a racing change to Mount.key.
-			parent.decRef()
+			parent.DecRef()
 			goto retryNotFirst
 		}
 		if !vfs.mounts.seq.ReadOk(epoch) {
-			point.decRef(parent.fs)
-			parent.decRef()
+			point.DecRef()
+			parent.DecRef()
 			goto retryNotFirst
 		}
-		d.decRef(mnt.fs)
-		mnt.decRef()
+		d.DecRef()
+		mnt.DecRef()
 		mnt = parent
 		d = point
 	}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 61bce6426..621f5a6f8 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -149,20 +149,20 @@ func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) {
 
 func (rp *ResolvingPath) decRefStartAndMount() {
 	if rp.flags&rpflagsHaveStartRef != 0 {
-		rp.start.decRef(rp.mount.fs)
+		rp.start.DecRef()
 	}
 	if rp.flags&rpflagsHaveMountRef != 0 {
-		rp.mount.decRef()
+		rp.mount.DecRef()
 	}
 }
 
 func (rp *ResolvingPath) releaseErrorState() {
 	if rp.nextStart != nil {
-		rp.nextStart.decRef(rp.nextMount.fs)
+		rp.nextStart.DecRef()
 		rp.nextStart = nil
 	}
 	if rp.nextMount != nil {
-		rp.nextMount.decRef()
+		rp.nextMount.DecRef()
 		rp.nextMount = nil
 	}
 }
diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go
index 49952b2cc..436151afa 100644
--- a/pkg/sentry/vfs/syscalls.go
+++ b/pkg/sentry/vfs/syscalls.go
@@ -63,7 +63,7 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede
 				mount:  rp.mount,
 				dentry: d,
 			}
-			rp.mount.incRef()
+			rp.mount.IncRef()
 			vfs.putResolvingPath(rp)
 			return vd, nil
 		}
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
index 70b192ece..593144cb7 100644
--- a/pkg/sentry/vfs/testutil.go
+++ b/pkg/sentry/vfs/testutil.go
@@ -33,10 +33,10 @@ type FDTestFilesystem struct {
 	vfsfs Filesystem
 }
 
-// NewFilesystem implements FilesystemType.NewFilesystem.
-func (fstype FDTestFilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) {
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (fstype FDTestFilesystemType) GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) {
 	var fs FDTestFilesystem
-	fs.vfsfs.Init(&fs)
+	fs.vfsfs.Init(vfsObj, &fs)
 	return &fs.vfsfs, fs.NewDentry(), nil
 }
 
@@ -126,14 +126,14 @@ func (fs *FDTestFilesystem) NewDentry() *Dentry {
 }
 
 // IncRef implements DentryImpl.IncRef.
-func (d *fdTestDentry) IncRef(vfsfs *Filesystem) {
+func (d *fdTestDentry) IncRef() {
 }
 
 // TryIncRef implements DentryImpl.TryIncRef.
-func (d *fdTestDentry) TryIncRef(vfsfs *Filesystem) bool {
+func (d *fdTestDentry) TryIncRef() bool {
 	return true
 }
 
 // DecRef implements DentryImpl.DecRef.
-func (d *fdTestDentry) DecRef(vfsfs *Filesystem) {
+func (d *fdTestDentry) DecRef() {
 }
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 88e865d86..f0cd3ffe5 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -116,15 +116,15 @@ func (vd VirtualDentry) Ok() bool {
 // IncRef increments the reference counts on the Mount and Dentry represented
 // by vd.
 func (vd VirtualDentry) IncRef() {
-	vd.mount.incRef()
-	vd.dentry.incRef(vd.mount.fs)
+	vd.mount.IncRef()
+	vd.dentry.IncRef()
 }
 
 // DecRef decrements the reference counts on the Mount and Dentry represented
 // by vd.
 func (vd VirtualDentry) DecRef() {
-	vd.dentry.decRef(vd.mount.fs)
-	vd.mount.decRef()
+	vd.dentry.DecRef()
+	vd.mount.DecRef()
 }
 
 // Mount returns the Mount associated with vd. It does not take a reference on
-- 
cgit v1.2.3


From e91c1675cd49254936d04f01b814a0cd802ff6de Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Tue, 26 Nov 2019 15:32:45 +0800
Subject: passed the kvm test case of "TestKernelFault" on Arm64 platform

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/ring0/entry_arm64.s | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 29c475882..0ba4c6b73 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -385,6 +385,16 @@ TEXT ·El1_sync(SB),NOSPLIT,$0
 	B el1_invalid
 
 el1_da:
+	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
+	WORD $0xd538601a     //MRS   FAR_EL1, R26
+
+	MOVD R26, CPU_FAULT_ADDR(RSV_REG)
+
+	MOVD $0, CPU_ERROR_TYPE(RSV_REG)
+
+	MOVD $PageFault, R3
+	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
+
 	B ·Halt(SB)
 
 el1_ia:
-- 
cgit v1.2.3


From e710f654015d2d35d8a8df07ef54ddbc442cbc9d Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Tue, 26 Nov 2019 15:53:26 +0800
Subject: Prepare the vcpu environment for a container application

    Mainly 2 jobs were finished in this patch:
    1, context switching for a container application:
        a, R0-R30 b, pc\pstate\sp_el0 c, pagetable_el0 for container application
       This job can help us to pass the following test cases:
       "TestApplicationSyscall", "TestApplicationFault"
    2, checking pagetable_el0 is empty
       This job can help us to pass the following test case: "TestInvalidate"

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/ring0/entry_arm64.s | 67 +++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 29c475882..674c569e1 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -337,6 +337,73 @@ TEXT ·Current(SB),NOSPLIT,$0-8
 #define STACK_FRAME_SIZE 16
 
 TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
+	// Step1, save sentry context into memory.
+	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS)
+	MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG)
+
+	WORD $0xd5384003    //    MRS SPSR_EL1, R3
+	MOVD R3, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG)
+	MOVD R30, CPU_REGISTERS+PTRACE_PC(RSV_REG)
+	MOVD RSP, R3
+	MOVD R3, CPU_REGISTERS+PTRACE_SP(RSV_REG)
+
+	MOVD CPU_REGISTERS+PTRACE_R3(RSV_REG), R3
+
+	// Step2, save SP_EL1, PSTATE into kernel temporary stack.
+	// switch to temporary stack.
+	LOAD_KERNEL_STACK(RSV_REG)
+	WORD $0xd538d092    //MRS   TPIDR_EL1, R18
+
+	SUB $STACK_FRAME_SIZE, RSP, RSP
+	MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R11
+	MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R12
+	STP (R11, R12), 16*0(RSP)
+
+	MOVD CPU_REGISTERS+PTRACE_R11(RSV_REG), R11
+	MOVD CPU_REGISTERS+PTRACE_R12(RSV_REG), R12
+
+	// Step3, test user pagetable.
+	// If user pagetable is empty, trapped in el1_ia.
+	WORD $0xd538d092    //MRS   TPIDR_EL1, R18
+	SWITCH_TO_APP_PAGETABLE(RSV_REG)
+	WORD $0xd538d092    //MRS   TPIDR_EL1, R18
+	SWITCH_TO_KVM_PAGETABLE(RSV_REG)
+	WORD $0xd538d092    //MRS   TPIDR_EL1, R18
+
+	// If pagetable is not empty, recovery kernel temporary stack.
+	ADD $STACK_FRAME_SIZE, RSP, RSP
+
+	// Step4, load app context pointer.
+	MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP
+
+	// Step5, prepare the environment for container application.
+	// set sp_el0.
+	MOVD PTRACE_SP(RSV_REG_APP), R1
+	WORD $0xd5184101        //MSR R1, SP_EL0
+	// set pc.
+	MOVD PTRACE_PC(RSV_REG_APP), R1
+	MSR R1, ELR_EL1
+	// set pstate.
+	MOVD PTRACE_PSTATE(RSV_REG_APP), R1
+	WORD $0xd5184001  //MSR R1, SPSR_EL1
+
+	// RSV_REG & RSV_REG_APP will be loaded at the end.
+	REGISTERS_LOAD(RSV_REG_APP, 0)
+
+	// switch to user pagetable.
+	MOVD PTRACE_R18(RSV_REG_APP), RSV_REG
+	MOVD PTRACE_R9(RSV_REG_APP), RSV_REG_APP
+
+	SUB $STACK_FRAME_SIZE, RSP, RSP
+	STP (RSV_REG, RSV_REG_APP), 16*0(RSP)
+
+	WORD $0xd538d092    //MRS   TPIDR_EL1, R18
+
+	SWITCH_TO_APP_PAGETABLE(RSV_REG)
+
+	LDP 16*0(RSP), (RSV_REG, RSV_REG_APP)
+	ADD $STACK_FRAME_SIZE, RSP, RSP
+
 	ERET()
 
 TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
-- 
cgit v1.2.3


From 3e0062480650ded910ef6bc80883668da89e2ef8 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Tue, 26 Nov 2019 15:57:07 +0800
Subject: passed the kvm test case of "TestApplicationSyscall" on Arm64
 platform

For test case "TestApplicationSyscall",
Syscall in guest user level will be trapped in el0_svc.
And in el0_svc, we use mmio_exit to leave the KVM guest for now.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/ring0/entry_arm64.s | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 29c475882..bd9f09469 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -441,6 +441,16 @@ TEXT ·El0_sync(SB),NOSPLIT,$0
 	B   el0_invalid
 
 el0_svc:
+	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
+
+	MOVD $0, CPU_ERROR_CODE(RSV_REG) // Clear error code.
+
+	MOVD $1, R3
+	MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user.
+
+	MOVD $Syscall, R3
+	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
+
 	B ·Halt(SB)
 
 el0_da:
-- 
cgit v1.2.3


From 3f0e91b00450f926d8378ebd518b557c1f273712 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Tue, 26 Nov 2019 15:59:51 +0800
Subject: passed the kvm test case of "TestApplicationFault" on Arm64 platform

For test case "TestApplicationFault",
Memory-fault in guest user level will be trapped in el0_da.
And in el0_da, we use mmio_exit to leave the KVM guest.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/ring0/entry_arm64.s | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 29c475882..7b7420785 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -444,6 +444,17 @@ el0_svc:
 	B ·Halt(SB)
 
 el0_da:
+	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
+	WORD $0xd538601a     //MRS   FAR_EL1, R26
+
+	MOVD R26, CPU_FAULT_ADDR(RSV_REG)
+
+	MOVD $1, R3
+	MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user.
+
+	MOVD $PageFault, R3
+	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
+
 	B ·Halt(SB)
 
 el0_ia:
-- 
cgit v1.2.3


From 519ceabdf90129664fa1f70f49d0472a9106910f Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 26 Nov 2019 17:01:56 -0800
Subject: Mark execveat as supported for linux64_arm64.

PiperOrigin-RevId: 282667122
---
 pkg/sentry/syscalls/linux/linux64_arm64.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index a809115e0..f1dd4b0c0 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -295,7 +295,7 @@ var ARM64 = &kernel.SyscallTable{
 		278: syscalls.Supported("getrandom", GetRandom),
 		279: syscalls.Supported("memfd_create", MemfdCreate),
 		280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
-		281: syscalls.ErrorWithEvent("execveat", syserror.ENOSYS, "", []string{"gvisor.dev/issue/265"}),    // TODO(b/118901836)
+		281: syscalls.Supported("execveat", Execveat),
 		282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
 		283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
 		284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
-- 
cgit v1.2.3


From 20279c305ece6a458006999c8dafc5672ca92803 Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Tue, 26 Nov 2019 18:19:47 -0800
Subject: Allow open(O_TRUNC) and (f)truncate for proc files.

This allows writable proc and devices files to be opened with O_CREAT|O_TRUNC.
This is encountered most frequently when interacting with proc or devices files
via the command line.
e.g. $ echo 8192 1048576 4194304 > /proc/sys/net/ipv4/tcp_rmem

Also adds a test to test the behavior of open(O_TRUNC), truncate, and ftruncate
on named pipes.

Fixes #1116

PiperOrigin-RevId: 282677425
---
 pkg/sentry/fs/proc/sys_net.go         | 17 ++++++++++++++---
 pkg/sentry/fs/tty/master.go           |  6 +++++-
 pkg/sentry/fs/tty/slave.go            |  6 +++++-
 pkg/sentry/syscalls/linux/sys_file.go | 12 ++++++++++--
 test/syscalls/linux/pipe.cc           | 14 ++++++++++++++
 5 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index f3b63dfc2..bd93f83fa 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -64,7 +64,7 @@ var _ fs.InodeOperations = (*tcpMemInode)(nil)
 
 func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir tcpMemDir) *fs.Inode {
 	tm := &tcpMemInode{
-		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC),
+		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
 		s:               s,
 		dir:             dir,
 	}
@@ -77,6 +77,11 @@ func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir
 	return fs.NewInode(ctx, tm, msrc, sattr)
 }
 
+// Truncate implements fs.InodeOperations.Truncate.
+func (tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
 // GetFile implements fs.InodeOperations.GetFile.
 func (m *tcpMemInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
 	flags.Pread = true
@@ -168,14 +173,15 @@ func writeSize(dirType tcpMemDir, s inet.Stack, size inet.TCPBufferSize) error {
 
 // +stateify savable
 type tcpSack struct {
+	fsutil.SimpleFileInode
+
 	stack   inet.Stack `state:"wait"`
 	enabled *bool
-	fsutil.SimpleFileInode
 }
 
 func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
 	ts := &tcpSack{
-		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC),
+		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
 		stack:           s,
 	}
 	sattr := fs.StableAttr{
@@ -187,6 +193,11 @@ func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *f
 	return fs.NewInode(ctx, ts, msrc, sattr)
 }
 
+// Truncate implements fs.InodeOperations.Truncate.
+func (tcpSack) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
 // GetFile implements fs.InodeOperations.GetFile.
 func (s *tcpSack) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
 	flags.Pread = true
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index bc56be696..934828c12 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -32,7 +32,6 @@ import (
 // +stateify savable
 type masterInodeOperations struct {
 	fsutil.SimpleFileInode
-	fsutil.InodeNoopTruncate
 
 	// d is the containing dir.
 	d *dirInodeOperations
@@ -77,6 +76,11 @@ func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwn
 func (mi *masterInodeOperations) Release(ctx context.Context) {
 }
 
+// Truncate implements fs.InodeOperations.Truncate.
+func (masterInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
 // GetFile implements fs.InodeOperations.GetFile.
 //
 // It allocates a new terminal.
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index 4cbea0367..2a51e6bab 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -31,7 +31,6 @@ import (
 // +stateify savable
 type slaveInodeOperations struct {
 	fsutil.SimpleFileInode
-	fsutil.InodeNoopTruncate
 
 	// d is the containing dir.
 	d *dirInodeOperations
@@ -73,6 +72,11 @@ func (si *slaveInodeOperations) Release(ctx context.Context) {
 	si.t.DecRef()
 }
 
+// Truncate implements fs.InodeOperations.Truncate.
+func (slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
 // GetFile implements fs.InodeOperations.GetFile.
 //
 // This may race with destruction of the terminal. If the terminal is gone, it
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 167c2b60b..3b9181002 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -171,6 +171,9 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint
 			}
 		}
 
+		// Truncate is called when O_TRUNC is specified for any kind of
+		// existing Dirent. Behavior is delegated to the entry's Truncate
+		// implementation.
 		if flags&linux.O_TRUNC != 0 {
 			if err := d.Inode.Truncate(t, d, 0); err != nil {
 				return err
@@ -397,7 +400,9 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
 				return err
 			}
 
-			// Should we truncate the file?
+			// Truncate is called when O_TRUNC is specified for any kind of
+			// existing Dirent. Behavior is delegated to the entry's Truncate
+			// implementation.
 			if flags&linux.O_TRUNC != 0 {
 				if err := found.Inode.Truncate(t, found, 0); err != nil {
 					return err
@@ -1484,6 +1489,8 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		if fs.IsDir(d.Inode.StableAttr) {
 			return syserror.EISDIR
 		}
+		// In contrast to open(O_TRUNC), truncate(2) is only valid for file
+		// types.
 		if !fs.IsFile(d.Inode.StableAttr) {
 			return syserror.EINVAL
 		}
@@ -1522,7 +1529,8 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, syserror.EINVAL
 	}
 
-	// Note that this is different from truncate(2) above, where a
+	// In contrast to open(O_TRUNC), truncate(2) is only valid for file
+	// types. Note that this is different from truncate(2) above, where a
 	// directory returns EISDIR.
 	if !fs.IsFile(file.Dirent.Inode.StableAttr) {
 		return 0, nil, syserror.EINVAL
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index c0b354e65..ac9b21b24 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -212,6 +212,20 @@ TEST(Pipe2Test, BadOptions) {
   EXPECT_THAT(pipe2(fds, 0xDEAD), SyscallFailsWithErrno(EINVAL));
 }
 
+// Tests that opening named pipes with O_TRUNC shouldn't cause an error, but
+// calls to (f)truncate should.
+TEST(NamedPipeTest, Truncate) {
+  const std::string tmp_path = NewTempAbsPath();
+  SKIP_IF(mkfifo(tmp_path.c_str(), 0644) != 0);
+
+  ASSERT_THAT(open(tmp_path.c_str(), O_NONBLOCK | O_RDONLY), SyscallSucceeds());
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Open(tmp_path.c_str(), O_RDWR | O_NONBLOCK | O_TRUNC));
+
+  ASSERT_THAT(truncate(tmp_path.c_str(), 0), SyscallFailsWithErrno(EINVAL));
+  ASSERT_THAT(ftruncate(fd.get(), 0), SyscallFailsWithErrno(EINVAL));
+}
+
 TEST_P(PipeTest, Seek) {
   SKIP_IF(!CreateBlocking());
 
-- 
cgit v1.2.3


From 58afb4be695e6804925ba2be5f2d8c245f079cba Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Wed, 27 Nov 2019 13:47:44 -0800
Subject: Add floating point exception tests

PiperOrigin-RevId: 282828273
---
 test/syscalls/linux/exceptions.cc | 181 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)

diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc
index 370e85166..3d564e720 100644
--- a/test/syscalls/linux/exceptions.cc
+++ b/test/syscalls/linux/exceptions.cc
@@ -22,6 +22,23 @@
 namespace gvisor {
 namespace testing {
 
+// Default value for the x87 FPU control word. See Intel SDM Vol 1, Ch 8.1.5
+// "x87 FPU Control Word".
+constexpr uint16_t kX87ControlWordDefault = 0x37f;
+
+// Mask for the divide-by-zero exception.
+constexpr uint16_t kX87ControlWordDiv0Mask = 1 << 2;
+
+// Default value for the SSE control register (MXCSR). See Intel SDM Vol 1, Ch
+// 11.6.4 "Initialization of SSE/SSE3 Extensions".
+constexpr uint32_t kMXCSRDefault = 0x1f80;
+
+// Mask for the divide-by-zero exception.
+constexpr uint32_t kMXCSRDiv0Mask = 1 << 9;
+
+// Flag for a pending divide-by-zero exception.
+constexpr uint32_t kMXCSRDiv0Flag = 1 << 2;
+
 void inline Halt() { asm("hlt\r\n"); }
 
 void inline SetAlignmentCheck() {
@@ -107,6 +124,170 @@ TEST(ExceptionTest, DivideByZero) {
       ::testing::KilledBySignal(SIGFPE), "");
 }
 
+// By default, x87 exceptions are masked and simply return a default value.
+TEST(ExceptionTest, X87DivideByZeroMasked) {
+  int32_t quotient;
+  int32_t value = 1;
+  int32_t divisor = 0;
+  asm("fildl %[value]\r\n"
+      "fidivl %[divisor]\r\n"
+      "fistpl %[quotient]\r\n"
+      : [ quotient ] "=m"(quotient)
+      : [ value ] "m"(value), [ divisor ] "m"(divisor));
+
+  EXPECT_EQ(quotient, INT32_MIN);
+}
+
+// When unmasked, division by zero raises SIGFPE.
+TEST(ExceptionTest, X87DivideByZeroUnmasked) {
+  // See above.
+  struct sigaction sa = {};
+  sa.sa_handler = SIG_DFL;
+  auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa));
+
+  EXPECT_EXIT(
+      {
+        // Clear the divide by zero exception mask.
+        constexpr uint16_t kControlWord =
+            kX87ControlWordDefault & ~kX87ControlWordDiv0Mask;
+
+        int32_t quotient;
+        int32_t value = 1;
+        int32_t divisor = 0;
+        asm volatile(
+            "fldcw %[cw]\r\n"
+            "fildl %[value]\r\n"
+            "fidivl %[divisor]\r\n"
+            "fistpl %[quotient]\r\n"
+            : [ quotient ] "=m"(quotient)
+            : [ cw ] "m"(kControlWord), [ value ] "m"(value),
+              [ divisor ] "m"(divisor));
+      },
+      ::testing::KilledBySignal(SIGFPE), "");
+}
+
+// Pending exceptions in the x87 status register are not clobbered by syscalls.
+TEST(ExceptionTest, X87StatusClobber) {
+  // See above.
+  struct sigaction sa = {};
+  sa.sa_handler = SIG_DFL;
+  auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa));
+
+  EXPECT_EXIT(
+      {
+        // Clear the divide by zero exception mask.
+        constexpr uint16_t kControlWord =
+            kX87ControlWordDefault & ~kX87ControlWordDiv0Mask;
+
+        int32_t quotient;
+        int32_t value = 1;
+        int32_t divisor = 0;
+        asm volatile(
+            "fildl %[value]\r\n"
+            "fidivl %[divisor]\r\n"
+            // Exception is masked, so it does not occur here.
+            "fistpl %[quotient]\r\n"
+
+            // SYS_getpid placed in rax by constraint.
+            "syscall\r\n"
+
+            // Unmask exception. The syscall didn't clobber the pending
+            // exception, so now it can be raised.
+            //
+            // N.B. "a floating-point exception will be generated upon execution
+            // of the *next* floating-point instruction".
+            "fldcw %[cw]\r\n"
+            "fwait\r\n"
+            : [ quotient ] "=m"(quotient)
+            : [ value ] "m"(value), [ divisor ] "m"(divisor), "a"(SYS_getpid),
+              [ cw ] "m"(kControlWord)
+            : "rcx", "r11");
+      },
+      ::testing::KilledBySignal(SIGFPE), "");
+}
+
+// By default, SSE exceptions are masked and simply return a default value.
+TEST(ExceptionTest, SSEDivideByZeroMasked) {
+  uint32_t status;
+  int32_t quotient;
+  int32_t value = 1;
+  int32_t divisor = 0;
+  asm("cvtsi2ssl %[value], %%xmm0\r\n"
+      "cvtsi2ssl %[divisor], %%xmm1\r\n"
+      "divss %%xmm1, %%xmm0\r\n"
+      "cvtss2sil %%xmm0, %[quotient]\r\n"
+      : [ quotient ] "=r"(quotient), [ status ] "=r"(status)
+      : [ value ] "r"(value), [ divisor ] "r"(divisor)
+      : "xmm0", "xmm1");
+
+  EXPECT_EQ(quotient, INT32_MIN);
+}
+
+// When unmasked, division by zero raises SIGFPE.
+TEST(ExceptionTest, SSEDivideByZeroUnmasked) {
+  // See above.
+  struct sigaction sa = {};
+  sa.sa_handler = SIG_DFL;
+  auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa));
+
+  EXPECT_EXIT(
+      {
+        // Clear the divide by zero exception mask.
+        constexpr uint32_t kMXCSR = kMXCSRDefault & ~kMXCSRDiv0Mask;
+
+        int32_t quotient;
+        int32_t value = 1;
+        int32_t divisor = 0;
+        asm volatile(
+            "ldmxcsr %[mxcsr]\r\n"
+            "cvtsi2ssl %[value], %%xmm0\r\n"
+            "cvtsi2ssl %[divisor], %%xmm1\r\n"
+            "divss %%xmm1, %%xmm0\r\n"
+            "cvtss2sil %%xmm0, %[quotient]\r\n"
+            : [ quotient ] "=r"(quotient)
+            : [ mxcsr ] "m"(kMXCSR), [ value ] "r"(value),
+              [ divisor ] "r"(divisor)
+            : "xmm0", "xmm1");
+      },
+      ::testing::KilledBySignal(SIGFPE), "");
+}
+
+// Pending exceptions in the SSE status register are not clobbered by syscalls.
+TEST(ExceptionTest, SSEStatusClobber) {
+  uint32_t mxcsr;
+  int32_t quotient;
+  int32_t value = 1;
+  int32_t divisor = 0;
+  asm("cvtsi2ssl %[value], %%xmm0\r\n"
+      "cvtsi2ssl %[divisor], %%xmm1\r\n"
+      "divss %%xmm1, %%xmm0\r\n"
+      // Exception is masked, so it does not occur here.
+      "cvtss2sil %%xmm0, %[quotient]\r\n"
+
+      // SYS_getpid placed in rax by constraint.
+      "syscall\r\n"
+
+      // Intel SDM Vol 1, Ch 10.2.3.1 "SIMD Floating-Point Mask and Flag Bits":
+      // "If LDMXCSR or FXRSTOR clears a mask bit and sets the corresponding
+      // exception flag bit, a SIMD floating-point exception will not be
+      // generated as a result of this change. The unmasked exception will be
+      // generated only upon the execution of the next SSE/SSE2/SSE3 instruction
+      // that detects the unmasked exception condition."
+      //
+      // Though ambiguous, empirical evidence indicates that this means that
+      // exception flags set in the status register will never cause an
+      // exception to be raised; only a new exception condition will do so.
+      //
+      // Thus here we just check for the flag itself rather than trying to raise
+      // the exception.
+      "stmxcsr %[mxcsr]\r\n"
+      : [ quotient ] "=r"(quotient), [ mxcsr ] "+m"(mxcsr)
+      : [ value ] "r"(value), [ divisor ] "r"(divisor), "a"(SYS_getpid)
+      : "xmm0", "xmm1", "rcx", "r11");
+
+  EXPECT_TRUE(mxcsr & kMXCSRDiv0Flag);
+}
+
 TEST(ExceptionTest, IOAccessFault) {
   // See above.
   struct sigaction sa = {};
-- 
cgit v1.2.3


From 684f757a228f88e5fabe6ebe6ed54f0db20fd63d Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 27 Nov 2019 16:19:35 -0800
Subject: Add support for receiving TOS and TCLASS control messages in
 hostinet.

This involves allowing getsockopt/setsockopt for the corresponding socket
options, as well as allowing hostinet to process control messages received from
the actual recvmsg syscall.

PiperOrigin-RevId: 282851425
---
 pkg/abi/linux/socket.go                 |  9 ++++
 pkg/sentry/socket/control/control.go    | 24 ++++++++-
 pkg/sentry/socket/hostinet/BUILD        |  1 +
 pkg/sentry/socket/hostinet/socket.go    | 92 ++++++++++++++++++++++++---------
 pkg/sentry/syscalls/linux/sys_socket.go |  8 +++
 pkg/tcpip/tcpip.go                      | 14 ++++-
 runsc/boot/filter/config.go             | 29 +++++++++--
 7 files changed, 145 insertions(+), 32 deletions(-)

diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go
index 2e2cc6be7..766ee4014 100644
--- a/pkg/abi/linux/socket.go
+++ b/pkg/abi/linux/socket.go
@@ -422,6 +422,15 @@ type ControlMessageRights []int32
 // ControlMessageRights.
 const SizeOfControlMessageRight = 4
 
+// SizeOfControlMessageInq is the size of a TCP_INQ control message.
+const SizeOfControlMessageInq = 4
+
+// SizeOfControlMessageTOS is the size of an IP_TOS control message.
+const SizeOfControlMessageTOS = 1
+
+// SizeOfControlMessageTClass is the size of an IPV6_TCLASS control message.
+const SizeOfControlMessageTClass = 4
+
 // SCM_MAX_FD is the maximum number of FDs accepted in a single sendmsg call.
 // From net/scm.h.
 const SCM_MAX_FD = 253
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 4e95101b7..0371acede 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -320,11 +320,33 @@ func PackInq(t *kernel.Task, inq int32, buf []byte) []byte {
 		buf,
 		linux.SOL_TCP,
 		linux.TCP_INQ,
-		4,
+		t.Arch().Width(),
 		inq,
 	)
 }
 
+// PackTOS packs an IP_TOS socket control message.
+func PackTOS(t *kernel.Task, tos int8, buf []byte) []byte {
+	return putCmsgStruct(
+		buf,
+		linux.SOL_IP,
+		linux.IP_TOS,
+		t.Arch().Width(),
+		tos,
+	)
+}
+
+// PackTClass packs an IPV6_TCLASS socket control message.
+func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
+	return putCmsgStruct(
+		buf,
+		linux.SOL_IPV6,
+		linux.IPV6_TCLASS,
+		t.Arch().Width(),
+		tClass,
+	)
+}
+
 // Parse parses a raw socket control message into portable objects.
 func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport.ControlMessages, error) {
 	var (
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index 8b66a719d..b1cf1126f 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -34,5 +34,6 @@ go_library(
         "//pkg/syserror",
         "//pkg/tcpip/stack",
         "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 92beb1bcf..aa234f760 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -18,6 +18,7 @@ import (
 	"fmt"
 	"syscall"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
@@ -41,6 +42,10 @@ const (
 	// sizeofSockaddr is the size in bytes of the largest sockaddr type
 	// supported by this package.
 	sizeofSockaddr = syscall.SizeofSockaddrInet6 // sizeof(sockaddr_in6) > sizeof(sockaddr_in)
+
+	// maxControlLen is the maximum size of a control message buffer used in a
+	// recvmsg syscall.
+	maxControlLen = 1024
 )
 
 // socketOperations implements fs.FileOperations and socket.Socket for a socket
@@ -281,26 +286,32 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 	// Whitelist options and constrain option length.
 	var optlen int
 	switch level {
-	case syscall.SOL_IPV6:
+	case linux.SOL_IP:
+		switch name {
+		case linux.IP_RECVTOS:
+			optlen = sizeofInt32
+		}
+	case linux.SOL_IPV6:
 		switch name {
-		case syscall.IPV6_V6ONLY:
+		case linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY:
 			optlen = sizeofInt32
 		}
-	case syscall.SOL_SOCKET:
+	case linux.SOL_SOCKET:
 		switch name {
-		case syscall.SO_ERROR, syscall.SO_KEEPALIVE, syscall.SO_SNDBUF, syscall.SO_RCVBUF, syscall.SO_REUSEADDR:
+		case linux.SO_ERROR, linux.SO_KEEPALIVE, linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR:
 			optlen = sizeofInt32
-		case syscall.SO_LINGER:
+		case linux.SO_LINGER:
 			optlen = syscall.SizeofLinger
 		}
-	case syscall.SOL_TCP:
+	case linux.SOL_TCP:
 		switch name {
-		case syscall.TCP_NODELAY:
+		case linux.TCP_NODELAY:
 			optlen = sizeofInt32
-		case syscall.TCP_INFO:
+		case linux.TCP_INFO:
 			optlen = int(linux.SizeOfTCPInfo)
 		}
 	}
+
 	if optlen == 0 {
 		return nil, syserr.ErrProtocolNotAvailable // ENOPROTOOPT
 	}
@@ -320,19 +331,24 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 	// Whitelist options and constrain option length.
 	var optlen int
 	switch level {
-	case syscall.SOL_IPV6:
+	case linux.SOL_IP:
 		switch name {
-		case syscall.IPV6_V6ONLY:
+		case linux.IP_RECVTOS:
 			optlen = sizeofInt32
 		}
-	case syscall.SOL_SOCKET:
+	case linux.SOL_IPV6:
 		switch name {
-		case syscall.SO_SNDBUF, syscall.SO_RCVBUF, syscall.SO_REUSEADDR:
+		case linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY:
 			optlen = sizeofInt32
 		}
-	case syscall.SOL_TCP:
+	case linux.SOL_SOCKET:
 		switch name {
-		case syscall.TCP_NODELAY:
+		case linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR:
+			optlen = sizeofInt32
+		}
+	case linux.SOL_TCP:
+		switch name {
+		case linux.TCP_NODELAY:
 			optlen = sizeofInt32
 		}
 	}
@@ -354,11 +370,11 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 }
 
 // RecvMsg implements socket.Socket.RecvMsg.
-func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
+func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
 	// Whitelist flags.
 	//
 	// FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary
-	// messages that netstack/tcpip/transport/unix doesn't understand. Kill the
+	// messages that gvisor/pkg/tcpip/transport/unix doesn't understand. Kill the
 	// Socket interface's dependence on netstack.
 	if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_PEEK|syscall.MSG_TRUNC) != 0 {
 		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument
@@ -370,6 +386,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 		senderAddrBuf = make([]byte, sizeofSockaddr)
 	}
 
+	var controlBuf []byte
 	var msgFlags int
 
 	recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
@@ -384,11 +401,6 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 		// We always do a non-blocking recv*().
 		sysflags := flags | syscall.MSG_DONTWAIT
 
-		if dsts.NumBlocks() == 1 {
-			// Skip allocating []syscall.Iovec.
-			return recvfrom(s.fd, dsts.Head().ToSlice(), sysflags, &senderAddrBuf)
-		}
-
 		iovs := iovecsFromBlockSeq(dsts)
 		msg := syscall.Msghdr{
 			Iov:    &iovs[0],
@@ -398,12 +410,18 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 			msg.Name = &senderAddrBuf[0]
 			msg.Namelen = uint32(len(senderAddrBuf))
 		}
+		if controlLen > 0 {
+			controlBuf = make([]byte, maxControlLen)
+			msg.Control = &controlBuf[0]
+			msg.Controllen = maxControlLen
+		}
 		n, err := recvmsg(s.fd, &msg, sysflags)
 		if err != nil {
 			return 0, err
 		}
 		senderAddrBuf = senderAddrBuf[:msg.Namelen]
 		msgFlags = int(msg.Flags)
+		controlLen = uint64(msg.Controllen)
 		return n, nil
 	})
 
@@ -429,14 +447,38 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 			n, err = dst.CopyOutFrom(t, recvmsgToBlocks)
 		}
 	}
-
-	// We don't allow control messages.
-	msgFlags &^= linux.MSG_CTRUNC
+	if err != nil {
+		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
+	}
 
 	if senderRequested {
 		senderAddr = socket.UnmarshalSockAddr(s.family, senderAddrBuf)
 	}
-	return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), socket.ControlMessages{}, syserr.FromError(err)
+
+	unixControlMessages, err := unix.ParseSocketControlMessage(controlBuf[:controlLen])
+	if err != nil {
+		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
+	}
+
+	controlMessages := socket.ControlMessages{}
+	for _, unixCmsg := range unixControlMessages {
+		switch unixCmsg.Header.Level {
+		case syscall.SOL_IP:
+			switch unixCmsg.Header.Type {
+			case syscall.IP_TOS:
+				controlMessages.IP.HasTOS = true
+				binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTOS], usermem.ByteOrder, &controlMessages.IP.TOS)
+			}
+		case syscall.SOL_IPV6:
+			switch unixCmsg.Header.Type {
+			case syscall.IPV6_TCLASS:
+				controlMessages.IP.HasTClass = true
+				binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTClass], usermem.ByteOrder, &controlMessages.IP.TClass)
+			}
+		}
+	}
+
+	return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), controlMessages, nil
 }
 
 // SendMsg implements socket.Socket.SendMsg.
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index ab1001f16..13f77565f 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -802,6 +802,14 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
 		controlData = control.PackInq(t, cms.IP.Inq, controlData)
 	}
 
+	if cms.IP.HasTOS {
+		controlData = control.PackTOS(t, cms.IP.TOS, controlData)
+	}
+
+	if cms.IP.HasTClass {
+		controlData = control.PackTClass(t, cms.IP.TClass, controlData)
+	}
+
 	if cms.Unix.Rights != nil {
 		controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
 	}
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index bd5eb89ca..5746043cc 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -308,7 +308,7 @@ type ControlMessages struct {
 	// HasTimestamp indicates whether Timestamp is valid/set.
 	HasTimestamp bool
 
-	// Timestamp is the time (in ns) that the last packed used to create
+	// Timestamp is the time (in ns) that the last packet used to create
 	// the read data was received.
 	Timestamp int64
 
@@ -317,6 +317,18 @@ type ControlMessages struct {
 
 	// Inq is the number of bytes ready to be received.
 	Inq int32
+
+	// HasTOS indicates whether Tos is valid/set.
+	HasTOS bool
+
+	// TOS is the IPv4 type of service of the associated packet.
+	TOS int8
+
+	// HasTClass indicates whether Tclass is valid/set.
+	HasTClass bool
+
+	// Tclass is the IPv6 traffic class of the associated packet.
+	TClass int32
 }
 
 // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 677356193..bf690160c 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -134,11 +134,6 @@ var allowedSyscalls = seccomp.SyscallRules{
 			seccomp.AllowValue(syscall.SOL_SOCKET),
 			seccomp.AllowValue(syscall.SO_SNDBUF),
 		},
-		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_REUSEADDR),
-		},
 	},
 	syscall.SYS_GETTID:       {},
 	syscall.SYS_GETTIMEOFDAY: {},
@@ -315,6 +310,16 @@ func hostInetFilters() seccomp.SyscallRules {
 		syscall.SYS_GETPEERNAME: {},
 		syscall.SYS_GETSOCKNAME: {},
 		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_RECVTOS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+			},
 			{
 				seccomp.AllowAny{},
 				seccomp.AllowValue(syscall.SOL_IPV6),
@@ -418,6 +423,20 @@ func hostInetFilters() seccomp.SyscallRules {
 				seccomp.AllowAny{},
 				seccomp.AllowValue(4),
 			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_RECVTOS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
 		},
 		syscall.SYS_SHUTDOWN: []seccomp.Rule{
 			{
-- 
cgit v1.2.3


From 10bbcf97d25b824aa0565af114e3272d3e314d19 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 28 Nov 2019 17:13:46 -0800
Subject: Test handling segments on completed but not yet accepted TCP
 connections

This change does not introduce any new features, or modify existing ones.

This change tests handling TCP segments right away for connections that were
completed from a listening endpoint.

PiperOrigin-RevId: 282986457
---
 pkg/tcpip/transport/tcp/tcp_test.go                | 122 ++++++++++++++++++++-
 pkg/tcpip/transport/tcp/testing/context/context.go |   6 +
 2 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index c4b45aa6f..50829ae27 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -792,6 +792,82 @@ func TestSendRstOnListenerRxSynAckV6(t *testing.T) {
 		checker.SeqNum(200)))
 }
 
+// TestTCPAckBeforeAcceptV4 tests that once the 3-way handshake is complete,
+// peers can send data and expect a response within a reasonable ammount of time
+// without calling Accept on the listening endpoint first.
+//
+// This test uses IPv4.
+func TestTCPAckBeforeAcceptV4(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	// Send data before accepting the connection.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
+
+// TestTCPAckBeforeAcceptV6 tests that once the 3-way handshake is complete,
+// peers can send data and expect a response within a reasonable ammount of time
+// without calling Accept on the listening endpoint first.
+//
+// This test uses IPv6.
+func TestTCPAckBeforeAcceptV6(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateV6Endpoint(true)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	irs, iss := executeV6Handshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	// Send data before accepting the connection.
+	c.SendV6Packet([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv6(t, c.GetV6Packet(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
+
 func TestSendRstOnListenerRxAckV4(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
@@ -4303,7 +4379,7 @@ func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCooki
 		RcvWnd:  30000,
 	})
 
-	// Receive the SYN-ACK reply.w
+	// Receive the SYN-ACK reply.
 	b := c.GetPacket()
 	tcp := header.TCP(header.IPv4(b).Payload())
 	iss = seqnum.Value(tcp.SequenceNumber())
@@ -4336,6 +4412,50 @@ func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCooki
 	return irs, iss
 }
 
+func executeV6Handshake(t *testing.T, c *context.Context, srcPort uint16, synCookieInUse bool) (irs, iss seqnum.Value) {
+	// Send a SYN request.
+	irs = seqnum.Value(789)
+	c.SendV6Packet(nil, &context.Headers{
+		SrcPort: srcPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  irs,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetV6Packet()
+	tcp := header.TCP(header.IPv6(b).Payload())
+	iss = seqnum.Value(tcp.SequenceNumber())
+	tcpCheckers := []checker.TransportChecker{
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(srcPort),
+		checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
+		checker.AckNum(uint32(irs) + 1),
+	}
+
+	if synCookieInUse {
+		// When cookies are in use window scaling is disabled.
+		tcpCheckers = append(tcpCheckers, checker.TCPSynOptions(header.TCPSynOptions{
+			WS:  -1,
+			MSS: c.MSSWithoutOptionsV6(),
+		}))
+	}
+
+	checker.IPv6(t, b, checker.TCP(tcpCheckers...))
+
+	// Send ACK.
+	c.SendV6Packet(nil, &context.Headers{
+		SrcPort: srcPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+		RcvWnd:  30000,
+	})
+	return irs, iss
+}
+
 // TestListenBacklogFull tests that netstack does not complete handshakes if the
 // listen backlog for the endpoint is full.
 func TestListenBacklogFull(t *testing.T) {
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 04fdaaed1..6cb66c1af 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -1089,3 +1089,9 @@ func (c *Context) SetGSOEnabled(enable bool) {
 func (c *Context) MSSWithoutOptions() uint16 {
 	return uint16(c.linkEP.MTU() - header.IPv4MinimumSize - header.TCPMinimumSize)
 }
+
+// MSSWithoutOptionsV6 returns the value for the MSS used by the stack when no
+// options are in use for IPv6 packets.
+func (c *Context) MSSWithoutOptionsV6() uint16 {
+	return uint16(c.linkEP.MTU() - header.IPv6MinimumSize - header.TCPMinimumSize)
+}
-- 
cgit v1.2.3


From aa70523da21534d8518eaa52f36db002e3d61885 Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Mon, 2 Dec 2019 05:37:09 -0800
Subject: Port tests in udp_socket.cc to Fuchsia

Separate out a test in udp_socket.cc that depends on <linux/errqueue.h> so the
rest of the tests can run on Fuchsia.

PiperOrigin-RevId: 283322633
---
 test/syscalls/linux/BUILD                          |   23 +-
 test/syscalls/linux/udp_socket.cc                  | 1321 +-------------------
 .../linux/udp_socket_errqueue_test_case.cc         |   54 +
 test/syscalls/linux/udp_socket_test_cases.cc       | 1279 +++++++++++++++++++
 test/syscalls/linux/udp_socket_test_cases.h        |   74 ++
 5 files changed, 1427 insertions(+), 1324 deletions(-)
 create mode 100644 test/syscalls/linux/udp_socket_errqueue_test_case.cc
 create mode 100644 test/syscalls/linux/udp_socket_test_cases.cc
 create mode 100644 test/syscalls/linux/udp_socket_test_cases.h

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 6345ea28c..2dd115409 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3351,11 +3351,15 @@ cc_binary(
     ],
 )
 
-cc_binary(
-    name = "udp_socket_test",
+cc_library(
+    name = "udp_socket_test_cases",
     testonly = 1,
-    srcs = ["udp_socket.cc"],
-    linkstatic = 1,
+    srcs = [
+        "udp_socket_test_cases.cc",
+    ] + select_for_linux([
+        "udp_socket_errqueue_test_case.cc",
+    ]),
+    hdrs = ["udp_socket_test_cases.h"],
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
@@ -3366,6 +3370,17 @@ cc_binary(
         "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
     ],
+    alwayslink = 1,
+)
+
+cc_binary(
+    name = "udp_socket_test",
+    testonly = 1,
+    srcs = ["udp_socket.cc"],
+    linkstatic = 1,
+    deps = [
+        ":udp_socket_test_cases",
+    ],
 )
 
 cc_binary(
diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc
index 111dbacdf..7a8ac30a4 100644
--- a/test/syscalls/linux/udp_socket.cc
+++ b/test/syscalls/linux/udp_socket.cc
@@ -12,1332 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <arpa/inet.h>
-#include <fcntl.h>
-#include <linux/errqueue.h>
-#include <netinet/in.h>
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-
-#include "gtest/gtest.h"
-#include "absl/base/macros.h"
-#include "absl/time/clock.h"
-#include "absl/time/time.h"
-#include "test/syscalls/linux/socket_test_util.h"
-#include "test/syscalls/linux/unix_domain_socket_test_util.h"
-#include "test/util/test_util.h"
-#include "test/util/thread_util.h"
+#include "test/syscalls/linux/udp_socket_test_cases.h"
 
 namespace gvisor {
 namespace testing {
 
 namespace {
 
-// The initial port to be be used on gvisor.
-constexpr int TestPort = 40000;
-
-// Fixture for tests parameterized by the address family to use (AF_INET and
-// AF_INET6) when creating sockets.
-class UdpSocketTest : public ::testing::TestWithParam<AddressFamily> {
- protected:
-  // Creates two sockets that will be used by test cases.
-  void SetUp() override;
-
-  // Closes the sockets created by SetUp().
-  void TearDown() override {
-    EXPECT_THAT(close(s_), SyscallSucceeds());
-    EXPECT_THAT(close(t_), SyscallSucceeds());
-
-    for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
-      ASSERT_NO_ERRNO(FreeAvailablePort(ports_[i]));
-    }
-  }
-
-  // First UDP socket.
-  int s_;
-
-  // Second UDP socket.
-  int t_;
-
-  // The length of the socket address.
-  socklen_t addrlen_;
-
-  // Initialized address pointing to loopback and port TestPort+i.
-  struct sockaddr* addr_[3];
-
-  // Initialize "any" address.
-  struct sockaddr* anyaddr_;
-
-  // Used ports.
-  int ports_[3];
-
- private:
-  // Storage for the loopback addresses.
-  struct sockaddr_storage addr_storage_[3];
-
-  // Storage for the "any" address.
-  struct sockaddr_storage anyaddr_storage_;
-};
-
-// Gets a pointer to the port component of the given address.
-uint16_t* Port(struct sockaddr_storage* addr) {
-  switch (addr->ss_family) {
-    case AF_INET: {
-      auto sin = reinterpret_cast<struct sockaddr_in*>(addr);
-      return &sin->sin_port;
-    }
-    case AF_INET6: {
-      auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr);
-      return &sin6->sin6_port;
-    }
-  }
-
-  return nullptr;
-}
-
-void UdpSocketTest::SetUp() {
-  int type;
-  if (GetParam() == AddressFamily::kIpv4) {
-    type = AF_INET;
-    auto sin = reinterpret_cast<struct sockaddr_in*>(&anyaddr_storage_);
-    addrlen_ = sizeof(*sin);
-    sin->sin_addr.s_addr = htonl(INADDR_ANY);
-  } else {
-    type = AF_INET6;
-    auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&anyaddr_storage_);
-    addrlen_ = sizeof(*sin6);
-    if (GetParam() == AddressFamily::kIpv6) {
-      sin6->sin6_addr = IN6ADDR_ANY_INIT;
-    } else {
-      TestAddress const& v4_mapped_any = V4MappedAny();
-      sin6->sin6_addr =
-          reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
-              ->sin6_addr;
-    }
-  }
-  ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
-
-  ASSERT_THAT(t_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
-
-  memset(&anyaddr_storage_, 0, sizeof(anyaddr_storage_));
-  anyaddr_ = reinterpret_cast<struct sockaddr*>(&anyaddr_storage_);
-  anyaddr_->sa_family = type;
-
-  if (gvisor::testing::IsRunningOnGvisor()) {
-    for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
-      ports_[i] = TestPort + i;
-    }
-  } else {
-    // When not under gvisor, use utility function to pick port. Assert that
-    // all ports are different.
-    std::string error;
-    for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
-      // Find an unused port, we specify port 0 to allow the kernel to provide
-      // the port.
-      bool unique = true;
-      do {
-        ports_[i] = ASSERT_NO_ERRNO_AND_VALUE(PortAvailable(
-            0, AddressFamily::kDualStack, SocketType::kUdp, false));
-        ASSERT_GT(ports_[i], 0);
-        for (size_t j = 0; j < i; ++j) {
-          if (ports_[j] == ports_[i]) {
-            unique = false;
-            break;
-          }
-        }
-      } while (!unique);
-    }
-  }
-
-  // Initialize the sockaddrs.
-  for (size_t i = 0; i < ABSL_ARRAYSIZE(addr_); ++i) {
-    memset(&addr_storage_[i], 0, sizeof(addr_storage_[i]));
-
-    addr_[i] = reinterpret_cast<struct sockaddr*>(&addr_storage_[i]);
-    addr_[i]->sa_family = type;
-
-    switch (type) {
-      case AF_INET: {
-        auto sin = reinterpret_cast<struct sockaddr_in*>(addr_[i]);
-        sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-        sin->sin_port = htons(ports_[i]);
-        break;
-      }
-      case AF_INET6: {
-        auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr_[i]);
-        sin6->sin6_addr = in6addr_loopback;
-        sin6->sin6_port = htons(ports_[i]);
-        break;
-      }
-    }
-  }
-}
-
-TEST_P(UdpSocketTest, Creation) {
-  int type = AF_INET6;
-  if (GetParam() == AddressFamily::kIpv4) {
-    type = AF_INET;
-  }
-
-  int s_;
-
-  ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
-  EXPECT_THAT(close(s_), SyscallSucceeds());
-
-  ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, 0), SyscallSucceeds());
-  EXPECT_THAT(close(s_), SyscallSucceeds());
-
-  ASSERT_THAT(s_ = socket(type, SOCK_STREAM, IPPROTO_UDP), SyscallFails());
-}
-
-TEST_P(UdpSocketTest, Getsockname) {
-  // Check that we're not bound.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, anyaddr_, addrlen_), 0);
-
-  // Bind, then check that we get the right address.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, Getpeername) {
-  // Check that we're not connected.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallFailsWithErrno(ENOTCONN));
-
-  // Connect, then check that we get the right address.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  addrlen = sizeof(addr);
-  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, SendNotConnected) {
-  // Do send & write, they must fail.
-  char buf[512];
-  EXPECT_THAT(send(s_, buf, sizeof(buf), 0),
-              SyscallFailsWithErrno(EDESTADDRREQ));
-
-  EXPECT_THAT(write(s_, buf, sizeof(buf)), SyscallFailsWithErrno(EDESTADDRREQ));
-
-  // Use sendto.
-  ASSERT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Check that we're bound now.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_NE(*Port(&addr), 0);
-}
-
-TEST_P(UdpSocketTest, ConnectBinds) {
-  // Connect the socket.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Check that we're bound now.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_NE(*Port(&addr), 0);
-}
-
-TEST_P(UdpSocketTest, ReceiveNotBound) {
-  char buf[512];
-  EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, Bind) {
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Try to bind again.
-  EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL));
-
-  // Check that we're still bound to the original address.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, BindInUse) {
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Try to bind again.
-  EXPECT_THAT(bind(t_, addr_[0], addrlen_), SyscallFailsWithErrno(EADDRINUSE));
-}
-
-TEST_P(UdpSocketTest, ReceiveAfterConnect) {
-  // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Get the address s_ was bound to during connect.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-
-  // Send from t_ to s_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0,
-                     reinterpret_cast<sockaddr*>(&addr), addrlen),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Receive the data.
-  char received[sizeof(buf)];
-  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, ReceiveAfterDisconnect) {
-  // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Get the address s_ was bound to during connect.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-
-  for (int i = 0; i < 2; i++) {
-    // Send from t_ to s_.
-    char buf[512];
-    RandomizeBuffer(buf, sizeof(buf));
-    EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-                SyscallSucceeds());
-    ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0,
-                       reinterpret_cast<sockaddr*>(&addr), addrlen),
-                SyscallSucceedsWithValue(sizeof(buf)));
-
-    // Receive the data.
-    char received[sizeof(buf)];
-    EXPECT_THAT(recv(s_, received, sizeof(received), 0),
-                SyscallSucceedsWithValue(sizeof(received)));
-    EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-
-    // Disconnect s_.
-    struct sockaddr addr = {};
-    addr.sa_family = AF_UNSPEC;
-    ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)), SyscallSucceeds());
-    // Connect s_ loopback:TestPort.
-    ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-  }
-}
-
-TEST_P(UdpSocketTest, Connect) {
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Check that we're connected to the right peer.
-  struct sockaddr_storage peer;
-  socklen_t peerlen = sizeof(peer);
-  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
-              SyscallSucceeds());
-  EXPECT_EQ(peerlen, addrlen_);
-  EXPECT_EQ(memcmp(&peer, addr_[0], addrlen_), 0);
-
-  // Try to bind after connect.
-  EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL));
-
-  // Try to connect again.
-  EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
-
-  // Check that peer name changed.
-  peerlen = sizeof(peer);
-  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
-              SyscallSucceeds());
-  EXPECT_EQ(peerlen, addrlen_);
-  EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
-}
-
-void ConnectAny(AddressFamily family, int sockfd, uint16_t port) {
-  struct sockaddr_storage addr = {};
-
-  // Precondition check.
-  {
-    socklen_t addrlen = sizeof(addr);
-    EXPECT_THAT(
-        getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-        SyscallSucceeds());
-
-    if (family == AddressFamily::kIpv4) {
-      auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
-      EXPECT_EQ(addrlen, sizeof(*addr_out));
-      EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
-    } else {
-      auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
-      EXPECT_EQ(addrlen, sizeof(*addr_out));
-      struct in6_addr any = IN6ADDR_ANY_INIT;
-      EXPECT_EQ(memcmp(&addr_out->sin6_addr, &any, sizeof(in6_addr)), 0);
-    }
-
-    {
-      socklen_t addrlen = sizeof(addr);
-      EXPECT_THAT(
-          getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-          SyscallFailsWithErrno(ENOTCONN));
-    }
-
-    struct sockaddr_storage baddr = {};
-    if (family == AddressFamily::kIpv4) {
-      auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
-      addrlen = sizeof(*addr_in);
-      addr_in->sin_family = AF_INET;
-      addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
-      addr_in->sin_port = port;
-    } else {
-      auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
-      addrlen = sizeof(*addr_in);
-      addr_in->sin6_family = AF_INET6;
-      addr_in->sin6_port = port;
-      if (family == AddressFamily::kIpv6) {
-        addr_in->sin6_addr = IN6ADDR_ANY_INIT;
-      } else {
-        TestAddress const& v4_mapped_any = V4MappedAny();
-        addr_in->sin6_addr =
-            reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
-                ->sin6_addr;
-      }
-    }
-
-    // TODO(b/138658473): gVisor doesn't allow connecting to the zero port.
-    if (port == 0) {
-      SKIP_IF(IsRunningOnGvisor());
-    }
-
-    ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen),
-                SyscallSucceeds());
-  }
-
-  // Postcondition check.
-  {
-    socklen_t addrlen = sizeof(addr);
-    EXPECT_THAT(
-        getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-        SyscallSucceeds());
-
-    if (family == AddressFamily::kIpv4) {
-      auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
-      EXPECT_EQ(addrlen, sizeof(*addr_out));
-      EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK));
-    } else {
-      auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
-      EXPECT_EQ(addrlen, sizeof(*addr_out));
-      struct in6_addr loopback;
-      if (family == AddressFamily::kIpv6) {
-        loopback = IN6ADDR_LOOPBACK_INIT;
-      } else {
-        TestAddress const& v4_mapped_loopback = V4MappedLoopback();
-        loopback = reinterpret_cast<const struct sockaddr_in6*>(
-                       &v4_mapped_loopback.addr)
-                       ->sin6_addr;
-      }
-
-      EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
-    }
-
-    addrlen = sizeof(addr);
-    if (port == 0) {
-      EXPECT_THAT(
-          getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-          SyscallFailsWithErrno(ENOTCONN));
-    } else {
-      EXPECT_THAT(
-          getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-          SyscallSucceeds());
-    }
-  }
-}
-
-TEST_P(UdpSocketTest, ConnectAny) { ConnectAny(GetParam(), s_, 0); }
-
-TEST_P(UdpSocketTest, ConnectAnyWithPort) {
-  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
-  ConnectAny(GetParam(), s_, port);
-}
-
-void DisconnectAfterConnectAny(AddressFamily family, int sockfd, int port) {
-  struct sockaddr_storage addr = {};
-
-  socklen_t addrlen = sizeof(addr);
-  struct sockaddr_storage baddr = {};
-  if (family == AddressFamily::kIpv4) {
-    auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
-    addrlen = sizeof(*addr_in);
-    addr_in->sin_family = AF_INET;
-    addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
-    addr_in->sin_port = port;
-  } else {
-    auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
-    addrlen = sizeof(*addr_in);
-    addr_in->sin6_family = AF_INET6;
-    addr_in->sin6_port = port;
-    if (family == AddressFamily::kIpv6) {
-      addr_in->sin6_addr = IN6ADDR_ANY_INIT;
-    } else {
-      TestAddress const& v4_mapped_any = V4MappedAny();
-      addr_in->sin6_addr =
-          reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
-              ->sin6_addr;
-    }
-  }
-
-  // TODO(b/138658473): gVisor doesn't allow connecting to the zero port.
-  if (port == 0) {
-    SKIP_IF(IsRunningOnGvisor());
-  }
-
-  ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen),
-              SyscallSucceeds());
-  // Now the socket is bound to the loopback address.
-
-  // Disconnect
-  addrlen = sizeof(addr);
-  addr.ss_family = AF_UNSPEC;
-  ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&addr), addrlen),
-              SyscallSucceeds());
-
-  // Check that after disconnect the socket is bound to the ANY address.
-  EXPECT_THAT(getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  if (family == AddressFamily::kIpv4) {
-    auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
-    EXPECT_EQ(addrlen, sizeof(*addr_out));
-    EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
-  } else {
-    auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
-    EXPECT_EQ(addrlen, sizeof(*addr_out));
-    struct in6_addr loopback = IN6ADDR_ANY_INIT;
-
-    EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
-  }
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterConnectAny) {
-  DisconnectAfterConnectAny(GetParam(), s_, 0);
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterConnectAnyWithPort) {
-  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
-  DisconnectAfterConnectAny(GetParam(), s_, port);
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterBind) {
-  ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
-  // Connect the socket.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  struct sockaddr_storage addr = {};
-  addr.ss_family = AF_UNSPEC;
-  EXPECT_THAT(
-      connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
-      SyscallSucceeds());
-
-  // Check that we're still bound.
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0);
-
-  addrlen = sizeof(addr);
-  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallFailsWithErrno(ENOTCONN));
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterBindToAny) {
-  struct sockaddr_storage baddr = {};
-  socklen_t addrlen;
-  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
-  if (GetParam() == AddressFamily::kIpv4) {
-    auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
-    addr_in->sin_family = AF_INET;
-    addr_in->sin_port = port;
-    addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
-  } else {
-    auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
-    addr_in->sin6_family = AF_INET6;
-    addr_in->sin6_port = port;
-    addr_in->sin6_scope_id = 0;
-    addr_in->sin6_addr = IN6ADDR_ANY_INIT;
-  }
-  ASSERT_THAT(bind(s_, reinterpret_cast<sockaddr*>(&baddr), addrlen_),
-              SyscallSucceeds());
-  // Connect the socket.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  struct sockaddr_storage addr = {};
-  addr.ss_family = AF_UNSPEC;
-  EXPECT_THAT(
-      connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
-      SyscallSucceeds());
-
-  // Check that we're still bound.
-  addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, &baddr, addrlen), 0);
-
-  addrlen = sizeof(addr);
-  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallFailsWithErrno(ENOTCONN));
-}
-
-TEST_P(UdpSocketTest, Disconnect) {
-  for (int i = 0; i < 2; i++) {
-    // Try to connect again.
-    EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
-
-    // Check that we're connected to the right peer.
-    struct sockaddr_storage peer;
-    socklen_t peerlen = sizeof(peer);
-    EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
-                SyscallSucceeds());
-    EXPECT_EQ(peerlen, addrlen_);
-    EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
-
-    // Try to disconnect.
-    struct sockaddr_storage addr = {};
-    addr.ss_family = AF_UNSPEC;
-    EXPECT_THAT(
-        connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
-        SyscallSucceeds());
-
-    peerlen = sizeof(peer);
-    EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
-                SyscallFailsWithErrno(ENOTCONN));
-
-    // Check that we're still bound.
-    socklen_t addrlen = sizeof(addr);
-    EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-                SyscallSucceeds());
-    EXPECT_EQ(addrlen, addrlen_);
-    EXPECT_EQ(*Port(&addr), 0);
-  }
-}
-
-TEST_P(UdpSocketTest, ConnectBadAddress) {
-  struct sockaddr addr = {};
-  addr.sa_family = addr_[0]->sa_family;
-  ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)),
-              SyscallFailsWithErrno(EINVAL));
-}
-
-TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) {
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Send to a different destination than we're connected to.
-  char buf[512];
-  EXPECT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[1], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-}
-
-TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
-  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Bind t_ to loopback:TestPort+1.
-  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
-
-  char buf[3];
-  // Send zero length packet from s_ to t_.
-  ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
-  // Receive the packet.
-  char received[3];
-  EXPECT_THAT(read(t_, received, sizeof(received)),
-              SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
-  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Bind t_ to loopback:TestPort+1.
-  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Set t_ to non-blocking.
-  int opts = 0;
-  ASSERT_THAT(opts = fcntl(t_, F_GETFL), SyscallSucceeds());
-  ASSERT_THAT(fcntl(t_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds());
-
-  char buf[3];
-  // Send zero length packet from s_ to t_.
-  ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
-  // Receive the packet.
-  char received[3];
-  EXPECT_THAT(read(t_, received, sizeof(received)),
-              SyscallSucceedsWithValue(0));
-  EXPECT_THAT(read(t_, received, sizeof(received)),
-              SyscallFailsWithErrno(EAGAIN));
-}
-
-TEST_P(UdpSocketTest, SendAndReceiveNotConnected) {
-  // Bind s_ to loopback.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Send some data to s_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Receive the data.
-  char received[sizeof(buf)];
-  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, SendAndReceiveConnected) {
-  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Bind t_ to loopback:TestPort+1.
-  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Send some data from t_ to s_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Receive the data.
-  char received[sizeof(buf)];
-  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, ReceiveFromNotConnected) {
-  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Bind t_ to loopback:TestPort+2.
-  ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
-
-  // Send some data from t_ to s_.
-  char buf[512];
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Check that the data isn't_ received because it was sent from a different
-  // address than we're connected.
-  EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, ReceiveBeforeConnect) {
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Bind t_ to loopback:TestPort+2.
-  ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
-
-  // Send some data from t_ to s_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Connect to loopback:TestPort+1.
-  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Receive the data. It works because it was sent before the connect.
-  char received[sizeof(buf)];
-  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-
-  // Send again. This time it should not be received.
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, ReceiveFrom) {
-  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Bind t_ to loopback:TestPort+1.
-  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Send some data from t_ to s_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Receive the data and sender address.
-  char received[sizeof(buf)];
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(recvfrom(s_, received, sizeof(received), 0,
-                       reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, Listen) {
-  ASSERT_THAT(listen(s_, SOMAXCONN), SyscallFailsWithErrno(EOPNOTSUPP));
-}
-
-TEST_P(UdpSocketTest, Accept) {
-  ASSERT_THAT(accept(s_, nullptr, nullptr), SyscallFailsWithErrno(EOPNOTSUPP));
-}
-
-// This test validates that a read shutdown with pending data allows the read
-// to proceed with the data before returning EAGAIN.
-TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) {
-  char received[512];
-
-  // Bind t_ to loopback:TestPort+2.
-  ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Connect the socket, then try to shutdown again.
-  ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
-
-  // Verify that we get EWOULDBLOCK when there is nothing to read.
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  const char* buf = "abc";
-  EXPECT_THAT(write(t_, buf, 3), SyscallSucceedsWithValue(3));
-
-  int opts = 0;
-  ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
-  ASSERT_THAT(fcntl(s_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds());
-  ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
-  ASSERT_NE(opts & O_NONBLOCK, 0);
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
-  // We should get the data even though read has been shutdown.
-  EXPECT_THAT(recv(s_, received, 2, 0), SyscallSucceedsWithValue(2));
-
-  // Because we read less than the entire packet length, since it's a packet
-  // based socket any subsequent reads should return EWOULDBLOCK.
-  EXPECT_THAT(recv(s_, received, 1, 0), SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-// This test is validating that even after a socket is shutdown if it's
-// reconnected it will reset the shutdown state.
-TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) {
-  char received[512];
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
-
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  // Connect the socket, then try to shutdown again.
-  ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
-
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, ReadShutdown) {
-  char received[512];
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
-
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  // Connect the socket, then try to shutdown again.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
-  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UdpSocketTest, ReadShutdownDifferentThread) {
-  char received[512];
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  // Connect the socket, then shutdown from another thread.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  ScopedThread t([&] {
-    absl::SleepFor(absl::Milliseconds(200));
-    EXPECT_THAT(shutdown(this->s_, SHUT_RD), SyscallSucceeds());
-  });
-  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(0));
-  t.Join();
-
-  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UdpSocketTest, WriteShutdown) {
-  EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-  EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceeds());
-}
-
-TEST_P(UdpSocketTest, SynchronousReceive) {
-  // Bind s_ to loopback.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Send some data to s_ from another thread.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  // Receive the data prior to actually starting the other thread.
-  char received[512];
-  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  // Start the thread.
-  ScopedThread t([&] {
-    absl::SleepFor(absl::Milliseconds(200));
-    ASSERT_THAT(
-        sendto(this->t_, buf, sizeof(buf), 0, this->addr_[0], this->addrlen_),
-        SyscallSucceedsWithValue(sizeof(buf)));
-  });
-
-  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(512));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, BoundaryPreserved_SendRecv) {
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Send 3 packets from t_ to s_.
-  constexpr int psize = 100;
-  char buf[3 * psize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  for (int i = 0; i < 3; ++i) {
-    ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_),
-                SyscallSucceedsWithValue(psize));
-  }
-
-  // Receive the data as 3 separate packets.
-  char received[6 * psize];
-  for (int i = 0; i < 3; ++i) {
-    EXPECT_THAT(recv(s_, received + i * psize, 3 * psize, 0),
-                SyscallSucceedsWithValue(psize));
-  }
-  EXPECT_EQ(memcmp(buf, received, 3 * psize), 0);
-}
-
-TEST_P(UdpSocketTest, BoundaryPreserved_WritevReadv) {
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Direct writes from t_ to s_.
-  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Send 2 packets from t_ to s_, where each packet's data consists of 2
-  // discontiguous iovecs.
-  constexpr size_t kPieceSize = 100;
-  char buf[4 * kPieceSize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  for (int i = 0; i < 2; i++) {
-    struct iovec iov[2];
-    for (int j = 0; j < 2; j++) {
-      iov[j].iov_base = reinterpret_cast<void*>(
-          reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
-      iov[j].iov_len = kPieceSize;
-    }
-    ASSERT_THAT(writev(t_, iov, 2), SyscallSucceedsWithValue(2 * kPieceSize));
-  }
-
-  // Receive the data as 2 separate packets.
-  char received[6 * kPieceSize];
-  for (int i = 0; i < 2; i++) {
-    struct iovec iov[3];
-    for (int j = 0; j < 3; j++) {
-      iov[j].iov_base = reinterpret_cast<void*>(
-          reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
-      iov[j].iov_len = kPieceSize;
-    }
-    ASSERT_THAT(readv(s_, iov, 3), SyscallSucceedsWithValue(2 * kPieceSize));
-  }
-  EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
-}
-
-TEST_P(UdpSocketTest, BoundaryPreserved_SendMsgRecvMsg) {
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Send 2 packets from t_ to s_, where each packet's data consists of 2
-  // discontiguous iovecs.
-  constexpr size_t kPieceSize = 100;
-  char buf[4 * kPieceSize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  for (int i = 0; i < 2; i++) {
-    struct iovec iov[2];
-    for (int j = 0; j < 2; j++) {
-      iov[j].iov_base = reinterpret_cast<void*>(
-          reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
-      iov[j].iov_len = kPieceSize;
-    }
-    struct msghdr msg = {};
-    msg.msg_name = addr_[0];
-    msg.msg_namelen = addrlen_;
-    msg.msg_iov = iov;
-    msg.msg_iovlen = 2;
-    ASSERT_THAT(sendmsg(t_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize));
-  }
-
-  // Receive the data as 2 separate packets.
-  char received[6 * kPieceSize];
-  for (int i = 0; i < 2; i++) {
-    struct iovec iov[3];
-    for (int j = 0; j < 3; j++) {
-      iov[j].iov_base = reinterpret_cast<void*>(
-          reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
-      iov[j].iov_len = kPieceSize;
-    }
-    struct msghdr msg = {};
-    msg.msg_iov = iov;
-    msg.msg_iovlen = 3;
-    ASSERT_THAT(recvmsg(s_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize));
-  }
-  EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
-}
-
-TEST_P(UdpSocketTest, FIONREADShutdown) {
-  int n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  // A UDP socket must be connected before it can be shutdown.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-}
-
-TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
-  int n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // A UDP socket must be connected before it can be shutdown.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  const char str[] = "abc";
-  ASSERT_THAT(send(s_, str, sizeof(str), 0),
-              SyscallSucceedsWithValue(sizeof(str)));
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, sizeof(str));
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, sizeof(str));
-}
-
-TEST_P(UdpSocketTest, FIONREAD) {
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Check that the bound socket with an empty buffer reports an empty first
-  // packet.
-  int n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  // Send 3 packets from t_ to s_.
-  constexpr int psize = 100;
-  char buf[3 * psize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  for (int i = 0; i < 3; ++i) {
-    ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_),
-                SyscallSucceedsWithValue(psize));
-
-    // Check that regardless of how many packets are in the queue, the size
-    // reported is that of a single packet.
-    n = -1;
-    EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-    EXPECT_EQ(n, psize);
-  }
-}
-
-TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) {
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Check that the bound socket with an empty buffer reports an empty first
-  // packet.
-  int n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  // Send 3 packets from t_ to s_.
-  constexpr int psize = 100;
-  char buf[3 * psize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  for (int i = 0; i < 3; ++i) {
-    ASSERT_THAT(sendto(t_, buf + i * psize, 0, 0, addr_[0], addrlen_),
-                SyscallSucceedsWithValue(0));
-
-    // Check that regardless of how many packets are in the queue, the size
-    // reported is that of a single packet.
-    n = -1;
-    EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-    EXPECT_EQ(n, 0);
-  }
-}
-
-TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) {
-  int n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // A UDP socket must be connected before it can be shutdown.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  const char str[] = "abc";
-  ASSERT_THAT(send(s_, str, 0, 0), SyscallSucceedsWithValue(0));
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-}
-
-TEST_P(UdpSocketTest, ErrorQueue) {
-  char cmsgbuf[CMSG_SPACE(sizeof(sock_extended_err))];
-  msghdr msg;
-  memset(&msg, 0, sizeof(msg));
-  iovec iov;
-  memset(&iov, 0, sizeof(iov));
-  msg.msg_iov = &iov;
-  msg.msg_iovlen = 1;
-  msg.msg_control = cmsgbuf;
-  msg.msg_controllen = sizeof(cmsgbuf);
-
-  // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT.
-  EXPECT_THAT(RetryEINTR(recvmsg)(s_, &msg, MSG_ERRQUEUE),
-              SyscallFailsWithErrno(EAGAIN));
-}
-
-TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
-  int v = -1;
-  socklen_t optlen = sizeof(v);
-  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, &optlen),
-              SyscallSucceeds());
-  ASSERT_EQ(v, kSockOptOff);
-  ASSERT_EQ(optlen, sizeof(v));
-}
-
-TEST_P(UdpSocketTest, SoTimestamp) {
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
-  int v = 1;
-  ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
-              SyscallSucceeds());
-
-  char buf[3];
-  // Send zero length packet from t_ to s_.
-  ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
-
-  char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
-  msghdr msg;
-  memset(&msg, 0, sizeof(msg));
-  iovec iov;
-  memset(&iov, 0, sizeof(iov));
-  msg.msg_iov = &iov;
-  msg.msg_iovlen = 1;
-  msg.msg_control = cmsgbuf;
-  msg.msg_controllen = sizeof(cmsgbuf);
-
-  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
-
-  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
-  ASSERT_NE(cmsg, nullptr);
-  ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
-  ASSERT_EQ(cmsg->cmsg_type, SO_TIMESTAMP);
-  ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct timeval)));
-
-  struct timeval tv = {};
-  memcpy(&tv, CMSG_DATA(cmsg), sizeof(struct timeval));
-
-  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
-
-  // There should be nothing to get via ioctl.
-  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT));
-}
-
-TEST_P(UdpSocketTest, WriteShutdownNotConnected) {
-  EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
-}
-
-TEST_P(UdpSocketTest, TimestampIoctl) {
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
-  char buf[3];
-  // Send packet from t_ to s_.
-  ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // There should be no control messages.
-  char recv_buf[sizeof(buf)];
-  ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
-
-  // A nonzero timeval should be available via ioctl.
-  struct timeval tv = {};
-  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds());
-  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
-}
-
-TEST_P(UdpSocketTest, TimetstampIoctlNothingRead) {
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
-  struct timeval tv = {};
-  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT));
-}
-
-// Test that the timestamp accessed via SIOCGSTAMP is still accessible after
-// SO_TIMESTAMP is enabled and used to retrieve a timestamp.
-TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
-  char buf[3];
-  // Send packet from t_ to s_.
-  ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
-              SyscallSucceedsWithValue(sizeof(buf)));
-  ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
-
-  // There should be no control messages.
-  char recv_buf[sizeof(buf)];
-  ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
-
-  // A nonzero timeval should be available via ioctl.
-  struct timeval tv = {};
-  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds());
-  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
-
-  // Enable SO_TIMESTAMP and send a message.
-  int v = 1;
-  EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
-              SyscallSucceeds());
-  ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
-
-  // There should be a message for SO_TIMESTAMP.
-  char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
-  msghdr msg = {};
-  iovec iov = {};
-  msg.msg_iov = &iov;
-  msg.msg_iovlen = 1;
-  msg.msg_control = cmsgbuf;
-  msg.msg_controllen = sizeof(cmsgbuf);
-  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
-  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
-  cmsg = CMSG_FIRSTHDR(&msg);
-  ASSERT_NE(cmsg, nullptr);
-
-  // The ioctl should return the exact same values as before.
-  struct timeval tv2 = {};
-  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv2), SyscallSucceeds());
-  ASSERT_EQ(tv.tv_sec, tv2.tv_sec);
-  ASSERT_EQ(tv.tv_usec, tv2.tv_usec);
-}
-
 INSTANTIATE_TEST_SUITE_P(AllInetTests, UdpSocketTest,
                          ::testing::Values(AddressFamily::kIpv4,
                                            AddressFamily::kIpv6,
diff --git a/test/syscalls/linux/udp_socket_errqueue_test_case.cc b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
new file mode 100644
index 000000000..147978f46
--- /dev/null
+++ b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
@@ -0,0 +1,54 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/udp_socket_test_cases.h"
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <linux/errqueue.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST_P(UdpSocketTest, ErrorQueue) {
+  char cmsgbuf[CMSG_SPACE(sizeof(sock_extended_err))];
+  msghdr msg;
+  memset(&msg, 0, sizeof(msg));
+  iovec iov;
+  memset(&iov, 0, sizeof(iov));
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+  msg.msg_control = cmsgbuf;
+  msg.msg_controllen = sizeof(cmsgbuf);
+
+  // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT.
+  EXPECT_THAT(RetryEINTR(recvmsg)(s_, &msg, MSG_ERRQUEUE),
+              SyscallFailsWithErrno(EAGAIN));
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
new file mode 100644
index 000000000..b6090ac66
--- /dev/null
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -0,0 +1,1279 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/udp_socket_test_cases.h"
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Gets a pointer to the port component of the given address.
+uint16_t* Port(struct sockaddr_storage* addr) {
+  switch (addr->ss_family) {
+    case AF_INET: {
+      auto sin = reinterpret_cast<struct sockaddr_in*>(addr);
+      return &sin->sin_port;
+    }
+    case AF_INET6: {
+      auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr);
+      return &sin6->sin6_port;
+    }
+  }
+
+  return nullptr;
+}
+
+void UdpSocketTest::SetUp() {
+  int type;
+  if (GetParam() == AddressFamily::kIpv4) {
+    type = AF_INET;
+    auto sin = reinterpret_cast<struct sockaddr_in*>(&anyaddr_storage_);
+    addrlen_ = sizeof(*sin);
+    sin->sin_addr.s_addr = htonl(INADDR_ANY);
+  } else {
+    type = AF_INET6;
+    auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&anyaddr_storage_);
+    addrlen_ = sizeof(*sin6);
+    if (GetParam() == AddressFamily::kIpv6) {
+      sin6->sin6_addr = IN6ADDR_ANY_INIT;
+    } else {
+      TestAddress const& v4_mapped_any = V4MappedAny();
+      sin6->sin6_addr =
+          reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
+              ->sin6_addr;
+    }
+  }
+  ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
+
+  ASSERT_THAT(t_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
+
+  memset(&anyaddr_storage_, 0, sizeof(anyaddr_storage_));
+  anyaddr_ = reinterpret_cast<struct sockaddr*>(&anyaddr_storage_);
+  anyaddr_->sa_family = type;
+
+  if (gvisor::testing::IsRunningOnGvisor()) {
+    for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
+      ports_[i] = TestPort + i;
+    }
+  } else {
+    // When not under gvisor, use utility function to pick port. Assert that
+    // all ports are different.
+    std::string error;
+    for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
+      // Find an unused port, we specify port 0 to allow the kernel to provide
+      // the port.
+      bool unique = true;
+      do {
+        ports_[i] = ASSERT_NO_ERRNO_AND_VALUE(PortAvailable(
+            0, AddressFamily::kDualStack, SocketType::kUdp, false));
+        ASSERT_GT(ports_[i], 0);
+        for (size_t j = 0; j < i; ++j) {
+          if (ports_[j] == ports_[i]) {
+            unique = false;
+            break;
+          }
+        }
+      } while (!unique);
+    }
+  }
+
+  // Initialize the sockaddrs.
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(addr_); ++i) {
+    memset(&addr_storage_[i], 0, sizeof(addr_storage_[i]));
+
+    addr_[i] = reinterpret_cast<struct sockaddr*>(&addr_storage_[i]);
+    addr_[i]->sa_family = type;
+
+    switch (type) {
+      case AF_INET: {
+        auto sin = reinterpret_cast<struct sockaddr_in*>(addr_[i]);
+        sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+        sin->sin_port = htons(ports_[i]);
+        break;
+      }
+      case AF_INET6: {
+        auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr_[i]);
+        sin6->sin6_addr = in6addr_loopback;
+        sin6->sin6_port = htons(ports_[i]);
+        break;
+      }
+    }
+  }
+}
+
+TEST_P(UdpSocketTest, Creation) {
+  int type = AF_INET6;
+  if (GetParam() == AddressFamily::kIpv4) {
+    type = AF_INET;
+  }
+
+  int s_;
+
+  ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
+  EXPECT_THAT(close(s_), SyscallSucceeds());
+
+  ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, 0), SyscallSucceeds());
+  EXPECT_THAT(close(s_), SyscallSucceeds());
+
+  ASSERT_THAT(s_ = socket(type, SOCK_STREAM, IPPROTO_UDP), SyscallFails());
+}
+
+TEST_P(UdpSocketTest, Getsockname) {
+  // Check that we're not bound.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, anyaddr_, addrlen_), 0);
+
+  // Bind, then check that we get the right address.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, Getpeername) {
+  // Check that we're not connected.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallFailsWithErrno(ENOTCONN));
+
+  // Connect, then check that we get the right address.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  addrlen = sizeof(addr);
+  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, SendNotConnected) {
+  // Do send & write, they must fail.
+  char buf[512];
+  EXPECT_THAT(send(s_, buf, sizeof(buf), 0),
+              SyscallFailsWithErrno(EDESTADDRREQ));
+
+  EXPECT_THAT(write(s_, buf, sizeof(buf)), SyscallFailsWithErrno(EDESTADDRREQ));
+
+  // Use sendto.
+  ASSERT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Check that we're bound now.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_NE(*Port(&addr), 0);
+}
+
+TEST_P(UdpSocketTest, ConnectBinds) {
+  // Connect the socket.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Check that we're bound now.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_NE(*Port(&addr), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveNotBound) {
+  char buf[512];
+  EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, Bind) {
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Try to bind again.
+  EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL));
+
+  // Check that we're still bound to the original address.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, BindInUse) {
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Try to bind again.
+  EXPECT_THAT(bind(t_, addr_[0], addrlen_), SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(UdpSocketTest, ReceiveAfterConnect) {
+  // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Get the address s_ was bound to during connect.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+
+  // Send from t_ to s_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0,
+                     reinterpret_cast<sockaddr*>(&addr), addrlen),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Receive the data.
+  char received[sizeof(buf)];
+  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveAfterDisconnect) {
+  // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Get the address s_ was bound to during connect.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+
+  for (int i = 0; i < 2; i++) {
+    // Send from t_ to s_.
+    char buf[512];
+    RandomizeBuffer(buf, sizeof(buf));
+    EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+                SyscallSucceeds());
+    ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0,
+                       reinterpret_cast<sockaddr*>(&addr), addrlen),
+                SyscallSucceedsWithValue(sizeof(buf)));
+
+    // Receive the data.
+    char received[sizeof(buf)];
+    EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+                SyscallSucceedsWithValue(sizeof(received)));
+    EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+
+    // Disconnect s_.
+    struct sockaddr addr = {};
+    addr.sa_family = AF_UNSPEC;
+    ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)), SyscallSucceeds());
+    // Connect s_ loopback:TestPort.
+    ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+  }
+}
+
+TEST_P(UdpSocketTest, Connect) {
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Check that we're connected to the right peer.
+  struct sockaddr_storage peer;
+  socklen_t peerlen = sizeof(peer);
+  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
+              SyscallSucceeds());
+  EXPECT_EQ(peerlen, addrlen_);
+  EXPECT_EQ(memcmp(&peer, addr_[0], addrlen_), 0);
+
+  // Try to bind after connect.
+  EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL));
+
+  // Try to connect again.
+  EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
+
+  // Check that peer name changed.
+  peerlen = sizeof(peer);
+  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
+              SyscallSucceeds());
+  EXPECT_EQ(peerlen, addrlen_);
+  EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
+}
+
+void ConnectAny(AddressFamily family, int sockfd, uint16_t port) {
+  struct sockaddr_storage addr = {};
+
+  // Precondition check.
+  {
+    socklen_t addrlen = sizeof(addr);
+    EXPECT_THAT(
+        getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+        SyscallSucceeds());
+
+    if (family == AddressFamily::kIpv4) {
+      auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
+      EXPECT_EQ(addrlen, sizeof(*addr_out));
+      EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
+    } else {
+      auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
+      EXPECT_EQ(addrlen, sizeof(*addr_out));
+      struct in6_addr any = IN6ADDR_ANY_INIT;
+      EXPECT_EQ(memcmp(&addr_out->sin6_addr, &any, sizeof(in6_addr)), 0);
+    }
+
+    {
+      socklen_t addrlen = sizeof(addr);
+      EXPECT_THAT(
+          getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+          SyscallFailsWithErrno(ENOTCONN));
+    }
+
+    struct sockaddr_storage baddr = {};
+    if (family == AddressFamily::kIpv4) {
+      auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
+      addrlen = sizeof(*addr_in);
+      addr_in->sin_family = AF_INET;
+      addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
+      addr_in->sin_port = port;
+    } else {
+      auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
+      addrlen = sizeof(*addr_in);
+      addr_in->sin6_family = AF_INET6;
+      addr_in->sin6_port = port;
+      if (family == AddressFamily::kIpv6) {
+        addr_in->sin6_addr = IN6ADDR_ANY_INIT;
+      } else {
+        TestAddress const& v4_mapped_any = V4MappedAny();
+        addr_in->sin6_addr =
+            reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
+                ->sin6_addr;
+      }
+    }
+
+    // TODO(b/138658473): gVisor doesn't allow connecting to the zero port.
+    if (port == 0) {
+      SKIP_IF(IsRunningOnGvisor());
+    }
+
+    ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen),
+                SyscallSucceeds());
+  }
+
+  // Postcondition check.
+  {
+    socklen_t addrlen = sizeof(addr);
+    EXPECT_THAT(
+        getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+        SyscallSucceeds());
+
+    if (family == AddressFamily::kIpv4) {
+      auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
+      EXPECT_EQ(addrlen, sizeof(*addr_out));
+      EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK));
+    } else {
+      auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
+      EXPECT_EQ(addrlen, sizeof(*addr_out));
+      struct in6_addr loopback;
+      if (family == AddressFamily::kIpv6) {
+        loopback = IN6ADDR_LOOPBACK_INIT;
+      } else {
+        TestAddress const& v4_mapped_loopback = V4MappedLoopback();
+        loopback = reinterpret_cast<const struct sockaddr_in6*>(
+                       &v4_mapped_loopback.addr)
+                       ->sin6_addr;
+      }
+
+      EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
+    }
+
+    addrlen = sizeof(addr);
+    if (port == 0) {
+      EXPECT_THAT(
+          getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+          SyscallFailsWithErrno(ENOTCONN));
+    } else {
+      EXPECT_THAT(
+          getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+          SyscallSucceeds());
+    }
+  }
+}
+
+TEST_P(UdpSocketTest, ConnectAny) { ConnectAny(GetParam(), s_, 0); }
+
+TEST_P(UdpSocketTest, ConnectAnyWithPort) {
+  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
+  ConnectAny(GetParam(), s_, port);
+}
+
+void DisconnectAfterConnectAny(AddressFamily family, int sockfd, int port) {
+  struct sockaddr_storage addr = {};
+
+  socklen_t addrlen = sizeof(addr);
+  struct sockaddr_storage baddr = {};
+  if (family == AddressFamily::kIpv4) {
+    auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
+    addrlen = sizeof(*addr_in);
+    addr_in->sin_family = AF_INET;
+    addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
+    addr_in->sin_port = port;
+  } else {
+    auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
+    addrlen = sizeof(*addr_in);
+    addr_in->sin6_family = AF_INET6;
+    addr_in->sin6_port = port;
+    if (family == AddressFamily::kIpv6) {
+      addr_in->sin6_addr = IN6ADDR_ANY_INIT;
+    } else {
+      TestAddress const& v4_mapped_any = V4MappedAny();
+      addr_in->sin6_addr =
+          reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
+              ->sin6_addr;
+    }
+  }
+
+  // TODO(b/138658473): gVisor doesn't allow connecting to the zero port.
+  if (port == 0) {
+    SKIP_IF(IsRunningOnGvisor());
+  }
+
+  ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen),
+              SyscallSucceeds());
+  // Now the socket is bound to the loopback address.
+
+  // Disconnect
+  addrlen = sizeof(addr);
+  addr.ss_family = AF_UNSPEC;
+  ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&addr), addrlen),
+              SyscallSucceeds());
+
+  // Check that after disconnect the socket is bound to the ANY address.
+  EXPECT_THAT(getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  if (family == AddressFamily::kIpv4) {
+    auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
+    EXPECT_EQ(addrlen, sizeof(*addr_out));
+    EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
+  } else {
+    auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
+    EXPECT_EQ(addrlen, sizeof(*addr_out));
+    struct in6_addr loopback = IN6ADDR_ANY_INIT;
+
+    EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
+  }
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterConnectAny) {
+  DisconnectAfterConnectAny(GetParam(), s_, 0);
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterConnectAnyWithPort) {
+  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
+  DisconnectAfterConnectAny(GetParam(), s_, port);
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterBind) {
+  ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
+  // Connect the socket.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  struct sockaddr_storage addr = {};
+  addr.ss_family = AF_UNSPEC;
+  EXPECT_THAT(
+      connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
+      SyscallSucceeds());
+
+  // Check that we're still bound.
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0);
+
+  addrlen = sizeof(addr);
+  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterBindToAny) {
+  struct sockaddr_storage baddr = {};
+  socklen_t addrlen;
+  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
+  if (GetParam() == AddressFamily::kIpv4) {
+    auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
+    addr_in->sin_family = AF_INET;
+    addr_in->sin_port = port;
+    addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
+  } else {
+    auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
+    addr_in->sin6_family = AF_INET6;
+    addr_in->sin6_port = port;
+    addr_in->sin6_scope_id = 0;
+    addr_in->sin6_addr = IN6ADDR_ANY_INIT;
+  }
+  ASSERT_THAT(bind(s_, reinterpret_cast<sockaddr*>(&baddr), addrlen_),
+              SyscallSucceeds());
+  // Connect the socket.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  struct sockaddr_storage addr = {};
+  addr.ss_family = AF_UNSPEC;
+  EXPECT_THAT(
+      connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
+      SyscallSucceeds());
+
+  // Check that we're still bound.
+  addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, &baddr, addrlen), 0);
+
+  addrlen = sizeof(addr);
+  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, Disconnect) {
+  for (int i = 0; i < 2; i++) {
+    // Try to connect again.
+    EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
+
+    // Check that we're connected to the right peer.
+    struct sockaddr_storage peer;
+    socklen_t peerlen = sizeof(peer);
+    EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
+                SyscallSucceeds());
+    EXPECT_EQ(peerlen, addrlen_);
+    EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
+
+    // Try to disconnect.
+    struct sockaddr_storage addr = {};
+    addr.ss_family = AF_UNSPEC;
+    EXPECT_THAT(
+        connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
+        SyscallSucceeds());
+
+    peerlen = sizeof(peer);
+    EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
+                SyscallFailsWithErrno(ENOTCONN));
+
+    // Check that we're still bound.
+    socklen_t addrlen = sizeof(addr);
+    EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+                SyscallSucceeds());
+    EXPECT_EQ(addrlen, addrlen_);
+    EXPECT_EQ(*Port(&addr), 0);
+  }
+}
+
+TEST_P(UdpSocketTest, ConnectBadAddress) {
+  struct sockaddr addr = {};
+  addr.sa_family = addr_[0]->sa_family;
+  ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) {
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Send to a different destination than we're connected to.
+  char buf[512];
+  EXPECT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[1], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+}
+
+TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
+  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Bind t_ to loopback:TestPort+1.
+  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+  char buf[3];
+  // Send zero length packet from s_ to t_.
+  ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
+  // Receive the packet.
+  char received[3];
+  EXPECT_THAT(read(t_, received, sizeof(received)),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
+  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Bind t_ to loopback:TestPort+1.
+  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Set t_ to non-blocking.
+  int opts = 0;
+  ASSERT_THAT(opts = fcntl(t_, F_GETFL), SyscallSucceeds());
+  ASSERT_THAT(fcntl(t_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds());
+
+  char buf[3];
+  // Send zero length packet from s_ to t_.
+  ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
+  // Receive the packet.
+  char received[3];
+  EXPECT_THAT(read(t_, received, sizeof(received)),
+              SyscallSucceedsWithValue(0));
+  EXPECT_THAT(read(t_, received, sizeof(received)),
+              SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(UdpSocketTest, SendAndReceiveNotConnected) {
+  // Bind s_ to loopback.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Send some data to s_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Receive the data.
+  char received[sizeof(buf)];
+  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, SendAndReceiveConnected) {
+  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Bind t_ to loopback:TestPort+1.
+  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Send some data from t_ to s_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Receive the data.
+  char received[sizeof(buf)];
+  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveFromNotConnected) {
+  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Bind t_ to loopback:TestPort+2.
+  ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
+
+  // Send some data from t_ to s_.
+  char buf[512];
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Check that the data isn't_ received because it was sent from a different
+  // address than we're connected.
+  EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReceiveBeforeConnect) {
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Bind t_ to loopback:TestPort+2.
+  ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
+
+  // Send some data from t_ to s_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Connect to loopback:TestPort+1.
+  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Receive the data. It works because it was sent before the connect.
+  char received[sizeof(buf)];
+  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+
+  // Send again. This time it should not be received.
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReceiveFrom) {
+  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Bind t_ to loopback:TestPort+1.
+  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Send some data from t_ to s_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Receive the data and sender address.
+  char received[sizeof(buf)];
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(recvfrom(s_, received, sizeof(received), 0,
+                       reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, Listen) {
+  ASSERT_THAT(listen(s_, SOMAXCONN), SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+TEST_P(UdpSocketTest, Accept) {
+  ASSERT_THAT(accept(s_, nullptr, nullptr), SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+// This test validates that a read shutdown with pending data allows the read
+// to proceed with the data before returning EAGAIN.
+TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) {
+  char received[512];
+
+  // Bind t_ to loopback:TestPort+2.
+  ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Connect the socket, then try to shutdown again.
+  ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
+
+  // Verify that we get EWOULDBLOCK when there is nothing to read.
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  const char* buf = "abc";
+  EXPECT_THAT(write(t_, buf, 3), SyscallSucceedsWithValue(3));
+
+  int opts = 0;
+  ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
+  ASSERT_THAT(fcntl(s_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds());
+  ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
+  ASSERT_NE(opts & O_NONBLOCK, 0);
+
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+  // We should get the data even though read has been shutdown.
+  EXPECT_THAT(recv(s_, received, 2, 0), SyscallSucceedsWithValue(2));
+
+  // Because we read less than the entire packet length, since it's a packet
+  // based socket any subsequent reads should return EWOULDBLOCK.
+  EXPECT_THAT(recv(s_, received, 1, 0), SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+// This test is validating that even after a socket is shutdown if it's
+// reconnected it will reset the shutdown state.
+TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) {
+  char received[512];
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
+
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Connect the socket, then try to shutdown again.
+  ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
+
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReadShutdown) {
+  char received[512];
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
+
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Connect the socket, then try to shutdown again.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, ReadShutdownDifferentThread) {
+  char received[512];
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Connect the socket, then shutdown from another thread.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  ScopedThread t([&] {
+    absl::SleepFor(absl::Milliseconds(200));
+    EXPECT_THAT(shutdown(this->s_, SHUT_RD), SyscallSucceeds());
+  });
+  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(0));
+  t.Join();
+
+  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, WriteShutdown) {
+  EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+  EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceeds());
+}
+
+TEST_P(UdpSocketTest, SynchronousReceive) {
+  // Bind s_ to loopback.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Send some data to s_ from another thread.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  // Receive the data prior to actually starting the other thread.
+  char received[512];
+  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Start the thread.
+  ScopedThread t([&] {
+    absl::SleepFor(absl::Milliseconds(200));
+    ASSERT_THAT(
+        sendto(this->t_, buf, sizeof(buf), 0, this->addr_[0], this->addrlen_),
+        SyscallSucceedsWithValue(sizeof(buf)));
+  });
+
+  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(512));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_SendRecv) {
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Send 3 packets from t_ to s_.
+  constexpr int psize = 100;
+  char buf[3 * psize];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_),
+                SyscallSucceedsWithValue(psize));
+  }
+
+  // Receive the data as 3 separate packets.
+  char received[6 * psize];
+  for (int i = 0; i < 3; ++i) {
+    EXPECT_THAT(recv(s_, received + i * psize, 3 * psize, 0),
+                SyscallSucceedsWithValue(psize));
+  }
+  EXPECT_EQ(memcmp(buf, received, 3 * psize), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_WritevReadv) {
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Direct writes from t_ to s_.
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Send 2 packets from t_ to s_, where each packet's data consists of 2
+  // discontiguous iovecs.
+  constexpr size_t kPieceSize = 100;
+  char buf[4 * kPieceSize];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  for (int i = 0; i < 2; i++) {
+    struct iovec iov[2];
+    for (int j = 0; j < 2; j++) {
+      iov[j].iov_base = reinterpret_cast<void*>(
+          reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
+      iov[j].iov_len = kPieceSize;
+    }
+    ASSERT_THAT(writev(t_, iov, 2), SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+
+  // Receive the data as 2 separate packets.
+  char received[6 * kPieceSize];
+  for (int i = 0; i < 2; i++) {
+    struct iovec iov[3];
+    for (int j = 0; j < 3; j++) {
+      iov[j].iov_base = reinterpret_cast<void*>(
+          reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
+      iov[j].iov_len = kPieceSize;
+    }
+    ASSERT_THAT(readv(s_, iov, 3), SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+  EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_SendMsgRecvMsg) {
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Send 2 packets from t_ to s_, where each packet's data consists of 2
+  // discontiguous iovecs.
+  constexpr size_t kPieceSize = 100;
+  char buf[4 * kPieceSize];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  for (int i = 0; i < 2; i++) {
+    struct iovec iov[2];
+    for (int j = 0; j < 2; j++) {
+      iov[j].iov_base = reinterpret_cast<void*>(
+          reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
+      iov[j].iov_len = kPieceSize;
+    }
+    struct msghdr msg = {};
+    msg.msg_name = addr_[0];
+    msg.msg_namelen = addrlen_;
+    msg.msg_iov = iov;
+    msg.msg_iovlen = 2;
+    ASSERT_THAT(sendmsg(t_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+
+  // Receive the data as 2 separate packets.
+  char received[6 * kPieceSize];
+  for (int i = 0; i < 2; i++) {
+    struct iovec iov[3];
+    for (int j = 0; j < 3; j++) {
+      iov[j].iov_base = reinterpret_cast<void*>(
+          reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
+      iov[j].iov_len = kPieceSize;
+    }
+    struct msghdr msg = {};
+    msg.msg_iov = iov;
+    msg.msg_iovlen = 3;
+    ASSERT_THAT(recvmsg(s_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+  EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
+}
+
+TEST_P(UdpSocketTest, FIONREADShutdown) {
+  int n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // A UDP socket must be connected before it can be shutdown.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+}
+
+TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
+  int n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // A UDP socket must be connected before it can be shutdown.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  const char str[] = "abc";
+  ASSERT_THAT(send(s_, str, sizeof(str), 0),
+              SyscallSucceedsWithValue(sizeof(str)));
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, sizeof(str));
+
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, sizeof(str));
+}
+
+TEST_P(UdpSocketTest, Fionread) {
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Check that the bound socket with an empty buffer reports an empty first
+  // packet.
+  int n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // Send 3 packets from t_ to s_.
+  constexpr int psize = 100;
+  char buf[3 * psize];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_),
+                SyscallSucceedsWithValue(psize));
+
+    // Check that regardless of how many packets are in the queue, the size
+    // reported is that of a single packet.
+    n = -1;
+    EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+    EXPECT_EQ(n, psize);
+  }
+}
+
+TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) {
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Check that the bound socket with an empty buffer reports an empty first
+  // packet.
+  int n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // Send 3 packets from t_ to s_.
+  constexpr int psize = 100;
+  char buf[3 * psize];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_THAT(sendto(t_, buf + i * psize, 0, 0, addr_[0], addrlen_),
+                SyscallSucceedsWithValue(0));
+
+    // Check that regardless of how many packets are in the queue, the size
+    // reported is that of a single packet.
+    n = -1;
+    EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+    EXPECT_EQ(n, 0);
+  }
+}
+
+TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) {
+  int n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // A UDP socket must be connected before it can be shutdown.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  const char str[] = "abc";
+  ASSERT_THAT(send(s_, str, 0, 0), SyscallSucceedsWithValue(0));
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+}
+
+TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
+  int v = -1;
+  socklen_t optlen = sizeof(v);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, &optlen),
+              SyscallSucceeds());
+  ASSERT_EQ(v, kSockOptOff);
+  ASSERT_EQ(optlen, sizeof(v));
+}
+
+TEST_P(UdpSocketTest, SoTimestamp) {
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  int v = 1;
+  ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
+              SyscallSucceeds());
+
+  char buf[3];
+  // Send zero length packet from t_ to s_.
+  ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
+
+  char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
+  msghdr msg;
+  memset(&msg, 0, sizeof(msg));
+  iovec iov;
+  memset(&iov, 0, sizeof(iov));
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+  msg.msg_control = cmsgbuf;
+  msg.msg_controllen = sizeof(cmsgbuf);
+
+  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
+
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+  ASSERT_NE(cmsg, nullptr);
+  ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+  ASSERT_EQ(cmsg->cmsg_type, SO_TIMESTAMP);
+  ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct timeval)));
+
+  struct timeval tv = {};
+  memcpy(&tv, CMSG_DATA(cmsg), sizeof(struct timeval));
+
+  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+
+  // There should be nothing to get via ioctl.
+  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_P(UdpSocketTest, WriteShutdownNotConnected) {
+  EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, TimestampIoctl) {
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  char buf[3];
+  // Send packet from t_ to s_.
+  ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // There should be no control messages.
+  char recv_buf[sizeof(buf)];
+  ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
+
+  // A nonzero timeval should be available via ioctl.
+  struct timeval tv = {};
+  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds());
+  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+}
+
+TEST_P(UdpSocketTest, TimetstampIoctlNothingRead) {
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  struct timeval tv = {};
+  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT));
+}
+
+// Test that the timestamp accessed via SIOCGSTAMP is still accessible after
+// SO_TIMESTAMP is enabled and used to retrieve a timestamp.
+TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  char buf[3];
+  // Send packet from t_ to s_.
+  ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+  ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
+
+  // There should be no control messages.
+  char recv_buf[sizeof(buf)];
+  ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
+
+  // A nonzero timeval should be available via ioctl.
+  struct timeval tv = {};
+  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds());
+  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+
+  // Enable SO_TIMESTAMP and send a message.
+  int v = 1;
+  EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
+              SyscallSucceeds());
+  ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
+
+  // There should be a message for SO_TIMESTAMP.
+  char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
+  msghdr msg = {};
+  iovec iov = {};
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+  msg.msg_control = cmsgbuf;
+  msg.msg_controllen = sizeof(cmsgbuf);
+  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+  cmsg = CMSG_FIRSTHDR(&msg);
+  ASSERT_NE(cmsg, nullptr);
+
+  // The ioctl should return the exact same values as before.
+  struct timeval tv2 = {};
+  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv2), SyscallSucceeds());
+  ASSERT_EQ(tv.tv_sec, tv2.tv_sec);
+  ASSERT_EQ(tv.tv_usec, tv2.tv_usec);
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.h b/test/syscalls/linux/udp_socket_test_cases.h
new file mode 100644
index 000000000..2fd79d99e
--- /dev/null
+++ b/test/syscalls/linux/udp_socket_test_cases.h
@@ -0,0 +1,74 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_
+#define THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// The initial port to be be used on gvisor.
+constexpr int TestPort = 40000;
+
+// Fixture for tests parameterized by the address family to use (AF_INET and
+// AF_INET6) when creating sockets.
+class UdpSocketTest
+    : public ::testing::TestWithParam<gvisor::testing::AddressFamily> {
+ protected:
+  // Creates two sockets that will be used by test cases.
+  void SetUp() override;
+
+  // Closes the sockets created by SetUp().
+  void TearDown() override {
+    EXPECT_THAT(close(s_), SyscallSucceeds());
+    EXPECT_THAT(close(t_), SyscallSucceeds());
+
+    for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
+      ASSERT_NO_ERRNO(FreeAvailablePort(ports_[i]));
+    }
+  }
+
+  // First UDP socket.
+  int s_;
+
+  // Second UDP socket.
+  int t_;
+
+  // The length of the socket address.
+  socklen_t addrlen_;
+
+  // Initialized address pointing to loopback and port TestPort+i.
+  struct sockaddr* addr_[3];
+
+  // Initialize "any" address.
+  struct sockaddr* anyaddr_;
+
+  // Used ports.
+  int ports_[3];
+
+ private:
+  // Storage for the loopback addresses.
+  struct sockaddr_storage addr_storage_[3];
+
+  // Storage for the "any" address.
+  struct sockaddr_storage anyaddr_storage_;
+};
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_
-- 
cgit v1.2.3


From 1518f7fd38cc2367ee966443a5895a3f25621d83 Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Mon, 2 Dec 2019 08:32:27 -0800
Subject: Fix typo, s/Convertable/Convertible/g

PiperOrigin-RevId: 283345791
---
 test/syscalls/linux/socket_ipv4_udp_unbound.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index 6b1af6c17..aa6fb4e3f 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -1814,7 +1814,7 @@ TEST_P(IPv4UDPUnboundSocketTest, BindReusePortThenReuseAddr) {
               SyscallFailsWithErrno(EADDRINUSE));
 }
 
-TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReusePort) {
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertibleToReusePort) {
   auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
@@ -1855,7 +1855,7 @@ TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReusePort) {
               SyscallFailsWithErrno(EADDRINUSE));
 }
 
-TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReuseAddr) {
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertibleToReuseAddr) {
   // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
   SKIP_IF(IsRunningOnGvisor());
 
-- 
cgit v1.2.3


From 9194aab2aaada137b377fdfcb812a7c015857d5d Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 2 Dec 2019 08:38:45 -0800
Subject: Support sending IP_TOS and IPV6_TCLASS control messages with hostinet
 sockets.

There are two potential ways of sending a TOS byte with outgoing packets:
including a control message in sendmsg, or setting the IP_TOS/IPV6_TCLASS
socket options (for IPV4 and IPV6 respectively). This change lets hostinet
support the former.

PiperOrigin-RevId: 283346737
---
 pkg/sentry/socket/control/BUILD         |   1 +
 pkg/sentry/socket/control/control.go    | 154 +++++++++++++++++++++-----------
 pkg/sentry/socket/hostinet/BUILD        |   1 +
 pkg/sentry/socket/hostinet/socket.go    |   8 +-
 pkg/sentry/syscalls/linux/sys_socket.go |   4 +-
 5 files changed, 115 insertions(+), 53 deletions(-)

diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 4a6e83a8b..357517ed4 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -17,6 +17,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usermem",
         "//pkg/syserror",
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 0371acede..782a3cb92 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -347,30 +348,63 @@ func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
 	)
 }
 
+func addSpaceForCmsg(cmsgDataLen int, buf []byte) []byte {
+	newBuf := make([]byte, 0, len(buf)+linux.SizeOfControlMessageHeader+cmsgDataLen)
+	return append(newBuf, buf...)
+}
+
+// PackControlMessages converts the given ControlMessages struct into a buffer.
+// We skip control messages specific to Unix domain sockets.
+func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages) []byte {
+	var buf []byte
+	// The use of t.Arch().Width() is analogous to Linux's use of sizeof(long) in
+	// CMSG_ALIGN.
+	width := t.Arch().Width()
+
+	if cmsgs.IP.HasTimestamp {
+		buf = addSpaceForCmsg(int(width), buf)
+		buf = PackTimestamp(t, cmsgs.IP.Timestamp, buf)
+	}
+
+	if cmsgs.IP.HasInq {
+		// In Linux, TCP_CM_INQ is added after SO_TIMESTAMP.
+		buf = addSpaceForCmsg(AlignUp(linux.SizeOfControlMessageInq, width), buf)
+		buf = PackInq(t, cmsgs.IP.Inq, buf)
+	}
+
+	if cmsgs.IP.HasTOS {
+		buf = addSpaceForCmsg(AlignUp(linux.SizeOfControlMessageTOS, width), buf)
+		buf = PackTOS(t, cmsgs.IP.TOS, buf)
+	}
+
+	if cmsgs.IP.HasTClass {
+		buf = addSpaceForCmsg(AlignUp(linux.SizeOfControlMessageTClass, width), buf)
+		buf = PackTClass(t, cmsgs.IP.TClass, buf)
+	}
+
+	return buf
+}
+
 // Parse parses a raw socket control message into portable objects.
-func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport.ControlMessages, error) {
+func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.ControlMessages, error) {
 	var (
-		fds       linux.ControlMessageRights
-		haveCreds bool
-		creds     linux.ControlMessageCredentials
+		cmsgs socket.ControlMessages
+		fds   linux.ControlMessageRights
 	)
 
 	for i := 0; i < len(buf); {
 		if i+linux.SizeOfControlMessageHeader > len(buf) {
-			return transport.ControlMessages{}, syserror.EINVAL
+			return cmsgs, syserror.EINVAL
 		}
 
 		var h linux.ControlMessageHeader
 		binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageHeader], usermem.ByteOrder, &h)
 
 		if h.Length < uint64(linux.SizeOfControlMessageHeader) {
-			return transport.ControlMessages{}, syserror.EINVAL
+			return socket.ControlMessages{}, syserror.EINVAL
 		}
 		if h.Length > uint64(len(buf)-i) {
-			return transport.ControlMessages{}, syserror.EINVAL
-		}
-		if h.Level != linux.SOL_SOCKET {
-			return transport.ControlMessages{}, syserror.EINVAL
+			return socket.ControlMessages{}, syserror.EINVAL
 		}
 
 		i += linux.SizeOfControlMessageHeader
@@ -380,59 +414,79 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport.
 		// sizeof(long) in CMSG_ALIGN.
 		width := t.Arch().Width()
 
-		switch h.Type {
-		case linux.SCM_RIGHTS:
-			rightsSize := AlignDown(length, linux.SizeOfControlMessageRight)
-			numRights := rightsSize / linux.SizeOfControlMessageRight
-
-			if len(fds)+numRights > linux.SCM_MAX_FD {
-				return transport.ControlMessages{}, syserror.EINVAL
+		switch h.Level {
+		case linux.SOL_SOCKET:
+			switch h.Type {
+			case linux.SCM_RIGHTS:
+				rightsSize := AlignDown(length, linux.SizeOfControlMessageRight)
+				numRights := rightsSize / linux.SizeOfControlMessageRight
+
+				if len(fds)+numRights > linux.SCM_MAX_FD {
+					return socket.ControlMessages{}, syserror.EINVAL
+				}
+
+				for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight {
+					fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight])))
+				}
+
+				i += AlignUp(length, width)
+
+			case linux.SCM_CREDENTIALS:
+				if length < linux.SizeOfControlMessageCredentials {
+					return socket.ControlMessages{}, syserror.EINVAL
+				}
+
+				var creds linux.ControlMessageCredentials
+				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], usermem.ByteOrder, &creds)
+				scmCreds, err := NewSCMCredentials(t, creds)
+				if err != nil {
+					return socket.ControlMessages{}, err
+				}
+				cmsgs.Unix.Credentials = scmCreds
+				i += AlignUp(length, width)
+
+			default:
+				// Unknown message type.
+				return socket.ControlMessages{}, syserror.EINVAL
 			}
-
-			for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight {
-				fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight])))
+		case linux.SOL_IP:
+			switch h.Type {
+			case linux.IP_TOS:
+				cmsgs.IP.HasTOS = true
+				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS)
+				i += AlignUp(length, width)
+
+			default:
+				return socket.ControlMessages{}, syserror.EINVAL
 			}
-
-			i += AlignUp(length, width)
-
-		case linux.SCM_CREDENTIALS:
-			if length < linux.SizeOfControlMessageCredentials {
-				return transport.ControlMessages{}, syserror.EINVAL
+		case linux.SOL_IPV6:
+			switch h.Type {
+			case linux.IPV6_TCLASS:
+				cmsgs.IP.HasTClass = true
+				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], usermem.ByteOrder, &cmsgs.IP.TClass)
+				i += AlignUp(length, width)
+
+			default:
+				return socket.ControlMessages{}, syserror.EINVAL
 			}
-
-			binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], usermem.ByteOrder, &creds)
-			haveCreds = true
-			i += AlignUp(length, width)
-
 		default:
-			// Unknown message type.
-			return transport.ControlMessages{}, syserror.EINVAL
+			return socket.ControlMessages{}, syserror.EINVAL
 		}
 	}
 
-	var credentials SCMCredentials
-	if haveCreds {
-		var err error
-		if credentials, err = NewSCMCredentials(t, creds); err != nil {
-			return transport.ControlMessages{}, err
-		}
-	} else {
-		credentials = makeCreds(t, socketOrEndpoint)
+	if cmsgs.Unix.Credentials == nil {
+		cmsgs.Unix.Credentials = makeCreds(t, socketOrEndpoint)
 	}
 
-	var rights SCMRights
 	if len(fds) > 0 {
-		var err error
-		if rights, err = NewSCMRights(t, fds); err != nil {
-			return transport.ControlMessages{}, err
+		rights, err := NewSCMRights(t, fds)
+		if err != nil {
+			return socket.ControlMessages{}, err
 		}
+		cmsgs.Unix.Rights = rights
 	}
 
-	if credentials == nil && rights == nil {
-		return transport.ControlMessages{}, nil
-	}
-
-	return transport.ControlMessages{Credentials: credentials, Rights: rights}, nil
+	return cmsgs, nil
 }
 
 func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials {
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index b1cf1126f..4c44c7c0f 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -29,6 +29,7 @@ go_library(
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/safemem",
         "//pkg/sentry/socket",
+        "//pkg/sentry/socket/control",
         "//pkg/sentry/usermem",
         "//pkg/syserr",
         "//pkg/syserror",
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index aa234f760..8d9363aac 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -30,6 +30,7 @@ import (
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/control"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -488,6 +489,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 		return 0, syserr.ErrInvalidArgument
 	}
 
+	controlBuf := control.PackControlMessages(t, controlMessages)
 	sendmsgFromBlocks := safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) {
 		// Refuse to do anything if any part of src.Addrs was unusable.
 		if uint64(src.NumBytes()) != srcs.NumBytes() {
@@ -500,7 +502,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 		// We always do a non-blocking send*().
 		sysflags := flags | syscall.MSG_DONTWAIT
 
-		if srcs.NumBlocks() == 1 {
+		if srcs.NumBlocks() == 1 && len(controlBuf) == 0 {
 			// Skip allocating []syscall.Iovec.
 			src := srcs.Head()
 			n, _, errno := syscall.Syscall6(syscall.SYS_SENDTO, uintptr(s.fd), src.Addr(), uintptr(src.Len()), uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to)))
@@ -519,6 +521,10 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 			msg.Name = &to[0]
 			msg.Namelen = uint32(len(to))
 		}
+		if len(controlBuf) != 0 {
+			msg.Control = &controlBuf[0]
+			msg.Controllen = uint64(len(controlBuf))
+		}
 		return sendmsg(s.fd, &msg, sysflags)
 	})
 
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 13f77565f..d8acae063 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -1068,10 +1068,10 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
 	}
 
 	// Call the syscall implementation.
-	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: controlMessages})
+	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
 	err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
 	if err != nil {
-		controlMessages.Release()
+		controlMessages.Unix.Release()
 	}
 	return uintptr(n), err
 }
-- 
cgit v1.2.3


From b41277049c6c6c15581d8698fd9418ef9c2cec8a Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 2 Dec 2019 15:35:51 -0800
Subject: test/syscal: Don't skip ClockGettime.CputimeId

We skipped it due to the issue in the golang scheduler
which has been fixed in go1.13.

PiperOrigin-RevId: 283432226
---
 test/syscalls/linux/clock_gettime.cc | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/test/syscalls/linux/clock_gettime.cc b/test/syscalls/linux/clock_gettime.cc
index 2aa91691e..7f6015049 100644
--- a/test/syscalls/linux/clock_gettime.cc
+++ b/test/syscalls/linux/clock_gettime.cc
@@ -56,11 +56,6 @@ void spin_ns(int64_t ns) {
 
 // Test that CLOCK_PROCESS_CPUTIME_ID is a superset of CLOCK_THREAD_CPUTIME_ID.
 TEST(ClockGettime, CputimeId) {
-  // TODO(b/128871825,golang.org/issue/10958): Test times out when there is a
-  // small number of core because one goroutine starves the others.
-  printf("CPUS: %d\n", std::thread::hardware_concurrency());
-  SKIP_IF(std::thread::hardware_concurrency() <= 2);
-
   constexpr int kNumThreads = 13;  // arbitrary
 
   absl::Duration spin_time = absl::Seconds(1);
-- 
cgit v1.2.3


From 7ac46c50486eef252ecaa4de1a2fe2581f73f79c Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Mon, 2 Dec 2019 17:59:08 -0800
Subject: Allow non-unique UIDs in bazel docker containers

Allow non-unique UIDs in the bazel docker container in order to avoid failures
using host UIDs that are already present in the image.

Issue #1267

PiperOrigin-RevId: 283456369
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 1735c07df..a73bc0c36 100644
--- a/Makefile
+++ b/Makefile
@@ -22,7 +22,7 @@ bazel-server-start: docker-build
 		--privileged \
 		gvisor-bazel \
 		sh -c "while :; do sleep 100; done" && \
-	docker exec --user 0:0 -i gvisor-bazel sh -c "groupadd --gid $(GID) --non-unique gvisor && useradd --uid $(UID) --gid $(GID) -d $(HOME) gvisor"
+	docker exec --user 0:0 -i gvisor-bazel sh -c "groupadd --gid $(GID) --non-unique gvisor && useradd --uid $(UID) --non-unique --gid $(GID) -d $(HOME) gvisor"
 
 bazel-server:
 	docker exec gvisor-bazel true || \
-- 
cgit v1.2.3


From 61f2274cb6f05579e4abe1e794182c04a622b58f Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Mon, 18 Nov 2019 09:07:00 +0000
Subject: Enable runsc compatLog support on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I3fd5e552f5f03b5144ed52647f75af3b8253b1d6
---
 runsc/boot/BUILD           |  1 +
 runsc/boot/compat.go       | 58 ++++++++++++++++++++++------
 runsc/boot/compat_amd64.go | 87 +++++++++++++++++++++++++----------------
 runsc/boot/compat_arm64.go | 96 ++++++++++++++++++++++++++++++++++++++++++++++
 runsc/boot/compat_test.go  | 47 +++++++++++++----------
 5 files changed, 223 insertions(+), 66 deletions(-)
 create mode 100644 runsc/boot/compat_arm64.go

diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 847d2f91c..3b6a29c6e 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "compat.go",
         "compat_amd64.go",
+        "compat_arm64.go",
         "config.go",
         "controller.go",
         "debug.go",
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 07e35ab10..b7283f56c 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -21,10 +21,8 @@ import (
 	"syscall"
 
 	"github.com/golang/protobuf/proto"
-	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/eventchannel"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
 	ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
 	"gvisor.dev/gvisor/pkg/sentry/strace"
@@ -53,9 +51,9 @@ type compatEmitter struct {
 }
 
 func newCompatEmitter(logFD int) (*compatEmitter, error) {
-	nameMap, ok := strace.Lookup(abi.Linux, arch.AMD64)
+	nameMap, ok := getSyscallNameMap()
 	if !ok {
-		return nil, fmt.Errorf("amd64 Linux syscall table not found")
+		return nil, fmt.Errorf("Linux syscall table not found")
 	}
 
 	c := &compatEmitter{
@@ -86,16 +84,16 @@ func (c *compatEmitter) Emit(msg proto.Message) (bool, error) {
 }
 
 func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
-	regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64
+	regs := us.Registers
 
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
-	sysnr := regs.OrigRax
+	sysnr := syscallNum(regs)
 	tr := c.trackers[sysnr]
 	if tr == nil {
 		switch sysnr {
-		case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL:
+		case syscall.SYS_PRCTL:
 			// args: cmd, ...
 			tr = newArgsTracker(0)
 
@@ -112,10 +110,11 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
 			tr = newArgsTracker(2)
 
 		default:
-			tr = &onceTracker{}
+			tr = newArchArgsTracker(sysnr)
 		}
 		c.trackers[sysnr] = tr
 	}
+
 	if tr.shouldReport(regs) {
 		c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs)
 		tr.onReported(regs)
@@ -139,10 +138,10 @@ func (c *compatEmitter) Close() error {
 // the syscall and arguments.
 type syscallTracker interface {
 	// shouldReport returns true is the syscall should be reported.
-	shouldReport(regs *rpb.AMD64Registers) bool
+	shouldReport(regs *rpb.Registers) bool
 
 	// onReported marks the syscall as reported.
-	onReported(regs *rpb.AMD64Registers)
+	onReported(regs *rpb.Registers)
 }
 
 // onceTracker reports only a single time, used for most syscalls.
@@ -150,10 +149,45 @@ type onceTracker struct {
 	reported bool
 }
 
-func (o *onceTracker) shouldReport(_ *rpb.AMD64Registers) bool {
+func (o *onceTracker) shouldReport(_ *rpb.Registers) bool {
 	return !o.reported
 }
 
-func (o *onceTracker) onReported(_ *rpb.AMD64Registers) {
+func (o *onceTracker) onReported(_ *rpb.Registers) {
 	o.reported = true
 }
+
+// argsTracker reports only once for each different combination of arguments.
+// It's used for generic syscalls like ioctl to report once per 'cmd'.
+type argsTracker struct {
+	// argsIdx is the syscall arguments to use as unique ID.
+	argsIdx  []int
+	reported map[string]struct{}
+	count    int
+}
+
+func newArgsTracker(argIdx ...int) *argsTracker {
+	return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
+}
+
+// key returns the command based on the syscall argument index.
+func (a *argsTracker) key(regs *rpb.Registers) string {
+	var rv string
+	for _, idx := range a.argsIdx {
+		rv += fmt.Sprintf("%d|", argVal(idx, regs))
+	}
+	return rv
+}
+
+func (a *argsTracker) shouldReport(regs *rpb.Registers) bool {
+	if a.count >= reportLimit {
+		return false
+	}
+	_, ok := a.reported[a.key(regs)]
+	return !ok
+}
+
+func (a *argsTracker) onReported(regs *rpb.Registers) {
+	a.count++
+	a.reported[a.key(regs)] = struct{}{}
+}
diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go
index 43cd0db94..bfb094577 100644
--- a/runsc/boot/compat_amd64.go
+++ b/runsc/boot/compat_amd64.go
@@ -16,62 +16,83 @@ package boot
 
 import (
 	"fmt"
+	"syscall"
 
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
 )
 
 // reportLimit is the max number of events that should be reported per tracker.
 const reportLimit = 100
 
-// argsTracker reports only once for each different combination of arguments.
-// It's used for generic syscalls like ioctl to report once per 'cmd'.
-type argsTracker struct {
-	// argsIdx is the syscall arguments to use as unique ID.
-	argsIdx  []int
-	reported map[string]struct{}
-	count    int
+// newRegs create a empty Registers instance.
+func newRegs() *rpb.Registers {
+	return &rpb.Registers{
+		Arch: &rpb.Registers_Amd64{
+			Amd64: &rpb.AMD64Registers{},
+		},
+	}
 }
 
-func newArgsTracker(argIdx ...int) *argsTracker {
-	return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
-}
+func argVal(argIdx int, regs *rpb.Registers) uint32 {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
 
-// cmd returns the command based on the syscall argument index.
-func (a *argsTracker) key(regs *rpb.AMD64Registers) string {
-	var rv string
-	for _, idx := range a.argsIdx {
-		rv += fmt.Sprintf("%d|", argVal(idx, regs))
+	switch argIdx {
+	case 0:
+		return uint32(amd64Regs.Rdi)
+	case 1:
+		return uint32(amd64Regs.Rsi)
+	case 2:
+		return uint32(amd64Regs.Rdx)
+	case 3:
+		return uint32(amd64Regs.R10)
+	case 4:
+		return uint32(amd64Regs.R8)
+	case 5:
+		return uint32(amd64Regs.R9)
 	}
-	return rv
+	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 }
 
-func argVal(argIdx int, regs *rpb.AMD64Registers) uint32 {
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+
 	switch argIdx {
 	case 0:
-		return uint32(regs.Rdi)
+		amd64Regs.Rdi = argVal
 	case 1:
-		return uint32(regs.Rsi)
+		amd64Regs.Rsi = argVal
 	case 2:
-		return uint32(regs.Rdx)
+		amd64Regs.Rdx = argVal
 	case 3:
-		return uint32(regs.R10)
+		amd64Regs.R10 = argVal
 	case 4:
-		return uint32(regs.R8)
+		amd64Regs.R8 = argVal
 	case 5:
-		return uint32(regs.R9)
+		amd64Regs.R9 = argVal
+	default:
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 	}
-	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 }
 
-func (a *argsTracker) shouldReport(regs *rpb.AMD64Registers) bool {
-	if a.count >= reportLimit {
-		return false
-	}
-	_, ok := a.reported[a.key(regs)]
-	return !ok
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+	return strace.Lookup(abi.Linux, arch.AMD64)
 }
 
-func (a *argsTracker) onReported(regs *rpb.AMD64Registers) {
-	a.count++
-	a.reported[a.key(regs)] = struct{}{}
+func syscallNum(regs *rpb.Registers) uint64 {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+	return amd64Regs.OrigRax
+}
+
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+	switch sysnr {
+	case syscall.SYS_ARCH_PRCTL:
+		// args: cmd, ...
+		return newArgsTracker(0)
+
+	default:
+		return &onceTracker{}
+	}
 }
diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go
new file mode 100644
index 000000000..50947d7a9
--- /dev/null
+++ b/runsc/boot/compat_arm64.go
@@ -0,0 +1,96 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
+)
+
+// reportLimit is the max number of events that should be reported per tracker.
+const reportLimit = 100
+
+// newRegs create a empty Registers instance.
+func newRegs() *rpb.Registers {
+	return &rpb.Registers{
+		Arch: &rpb.Registers_Arm64{
+			Arm64: &rpb.ARM64Registers{},
+		},
+	}
+}
+
+func argVal(argIdx int, regs *rpb.Registers) uint32 {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+	switch argIdx {
+	case 0:
+		return uint32(arm64Regs.R0)
+	case 1:
+		return uint32(arm64Regs.R1)
+	case 2:
+		return uint32(arm64Regs.R2)
+	case 3:
+		return uint32(arm64Regs.R3)
+	case 4:
+		return uint32(arm64Regs.R4)
+	case 5:
+		return uint32(arm64Regs.R5)
+	}
+	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+}
+
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+	switch argIdx {
+	case 0:
+		arm64Regs.R0 = argVal
+	case 1:
+		arm64Regs.R1 = argVal
+	case 2:
+		arm64Regs.R2 = argVal
+	case 3:
+		arm64Regs.R3 = argVal
+	case 4:
+		arm64Regs.R4 = argVal
+	case 5:
+		arm64Regs.R5 = argVal
+	default:
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+	}
+}
+
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+	return strace.Lookup(abi.Linux, arch.ARM64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+	return arm64Regs.R8
+}
+
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+
+	switch sysnr {
+	// currently, no arch specific syscalls need to be handled here.
+	default:
+		return &onceTracker{}
+	}
+}
diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go
index 388298d8d..4bb520898 100644
--- a/runsc/boot/compat_test.go
+++ b/runsc/boot/compat_test.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2019 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -16,8 +16,6 @@ package boot
 
 import (
 	"testing"
-
-	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
 )
 
 func TestOnceTracker(t *testing.T) {
@@ -35,31 +33,34 @@ func TestOnceTracker(t *testing.T) {
 
 func TestArgsTracker(t *testing.T) {
 	for _, tc := range []struct {
-		name string
-		idx  []int
-		rdi1 uint64
-		rdi2 uint64
-		rsi1 uint64
-		rsi2 uint64
-		want bool
+		name   string
+		idx    []int
+		arg1_1 uint64
+		arg1_2 uint64
+		arg2_1 uint64
+		arg2_2 uint64
+		want   bool
 	}{
-		{name: "same rdi", idx: []int{0}, rdi1: 123, rdi2: 123, want: false},
-		{name: "same rsi", idx: []int{1}, rsi1: 123, rsi2: 123, want: false},
-		{name: "diff rdi", idx: []int{0}, rdi1: 123, rdi2: 321, want: true},
-		{name: "diff rsi", idx: []int{1}, rsi1: 123, rsi2: 321, want: true},
-		{name: "cmd is uint32", idx: []int{0}, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false},
-		{name: "same 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 123, rdi2: 321, want: false},
-		{name: "diff 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 789, rdi2: 987, want: true},
+		{name: "same arg1", idx: []int{0}, arg1_1: 123, arg1_2: 123, want: false},
+		{name: "same arg2", idx: []int{1}, arg2_1: 123, arg2_2: 123, want: false},
+		{name: "diff arg1", idx: []int{0}, arg1_1: 123, arg1_2: 321, want: true},
+		{name: "diff arg2", idx: []int{1}, arg2_1: 123, arg2_2: 321, want: true},
+		{name: "cmd is uint32", idx: []int{0}, arg2_1: 0xdead00000123, arg2_2: 0xbeef00000123, want: false},
+		{name: "same 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 123, arg1_2: 321, want: false},
+		{name: "diff 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 789, arg1_2: 987, want: true},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
 			c := newArgsTracker(tc.idx...)
-			regs := &rpb.AMD64Registers{Rdi: tc.rdi1, Rsi: tc.rsi1}
+			regs := newRegs()
+			setArgVal(0, tc.arg1_1, regs)
+			setArgVal(1, tc.arg2_1, regs)
 			if !c.shouldReport(regs) {
 				t.Error("first call to shouldReport, got: false, want: true")
 			}
 			c.onReported(regs)
 
-			regs.Rdi, regs.Rsi = tc.rdi2, tc.rsi2
+			setArgVal(0, tc.arg1_2, regs)
+			setArgVal(1, tc.arg2_2, regs)
 			if got := c.shouldReport(regs); tc.want != got {
 				t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want)
 			}
@@ -70,7 +71,9 @@ func TestArgsTracker(t *testing.T) {
 func TestArgsTrackerLimit(t *testing.T) {
 	c := newArgsTracker(0, 1)
 	for i := 0; i < reportLimit; i++ {
-		regs := &rpb.AMD64Registers{Rdi: 123, Rsi: uint64(i)}
+		regs := newRegs()
+		setArgVal(0, 123, regs)
+		setArgVal(1, uint64(i), regs)
 		if !c.shouldReport(regs) {
 			t.Error("shouldReport before limit was reached, got: false, want: true")
 		}
@@ -78,7 +81,9 @@ func TestArgsTrackerLimit(t *testing.T) {
 	}
 
 	// Should hit the count limit now.
-	regs := &rpb.AMD64Registers{Rdi: 123, Rsi: 123456}
+	regs := newRegs()
+	setArgVal(0, 123, regs)
+	setArgVal(1, 123456, regs)
 	if c.shouldReport(regs) {
 		t.Error("shouldReport after limit was reached, got: true, want: false")
 	}
-- 
cgit v1.2.3


From ce32c0684311923fb80dd04221d5fd5120170cf9 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Mon, 2 Dec 2019 22:51:55 -0800
Subject: Import benchmark-tools to main repository.

This has adapted for use with bazel from the original commit
a26e93769ebefd82593a43e22fb13a09717cfa6d.

In particular, the style has been made consistent with internal python style
guidelines, and the packages (including the main entrypoint) have been
refactored in order to allow bazel testing targets.

PiperOrigin-RevId: 283484433
---
 Dockerfile                                         |   4 +-
 WORKSPACE                                          |  23 +-
 benchmarks/BUILD                                   |   9 +
 benchmarks/README.md                               | 172 ++++++++
 benchmarks/defs.bzl                                |  18 +
 benchmarks/examples/localhost.yaml                 |   2 +
 benchmarks/harness/BUILD                           |  89 ++++
 benchmarks/harness/__init__.py                     |  25 ++
 benchmarks/harness/benchmark_driver.py             |  85 ++++
 benchmarks/harness/container.py                    | 181 ++++++++
 benchmarks/harness/machine.py                      | 191 +++++++++
 benchmarks/harness/machine_mocks/BUILD             |   9 +
 benchmarks/harness/machine_mocks/__init__.py       |  81 ++++
 benchmarks/harness/machine_producers/BUILD         |  35 ++
 benchmarks/harness/machine_producers/__init__.py   |  13 +
 .../harness/machine_producers/machine_producer.py  |  30 ++
 .../harness/machine_producers/mock_producer.py     |  31 ++
 .../harness/machine_producers/yaml_producer.py     | 106 +++++
 benchmarks/harness/ssh_connection.py               | 111 +++++
 benchmarks/harness/tunnel_dispatcher.py            |  82 ++++
 benchmarks/requirements.txt                        |  32 ++
 benchmarks/run.py                                  |  19 +
 benchmarks/runner/BUILD                            |  50 +++
 benchmarks/runner/__init__.py                      | 301 +++++++++++++
 benchmarks/runner/runner_test.py                   |  59 +++
 benchmarks/suites/BUILD                            | 130 ++++++
 benchmarks/suites/__init__.py                      | 119 ++++++
 benchmarks/suites/absl.py                          |  37 ++
 benchmarks/suites/density.py                       | 121 ++++++
 benchmarks/suites/fio.py                           | 165 +++++++
 benchmarks/suites/helpers.py                       |  57 +++
 benchmarks/suites/http.py                          | 138 ++++++
 benchmarks/suites/media.py                         |  42 ++
 benchmarks/suites/ml.py                            |  33 ++
 benchmarks/suites/network.py                       | 101 +++++
 benchmarks/suites/redis.py                         |  46 ++
 benchmarks/suites/startup.py                       | 110 +++++
 benchmarks/suites/sysbench.py                      | 119 ++++++
 benchmarks/suites/syscall.py                       |  37 ++
 benchmarks/tcp/BUILD                               |  41 ++
 benchmarks/tcp/README.md                           |  87 ++++
 benchmarks/tcp/nsjoin.c                            |  47 ++
 benchmarks/tcp/tcp_benchmark.sh                    | 369 ++++++++++++++++
 benchmarks/tcp/tcp_proxy.go                        | 436 +++++++++++++++++++
 benchmarks/workloads/BUILD                         |  35 ++
 benchmarks/workloads/__init__.py                   |  14 +
 benchmarks/workloads/ab/BUILD                      |  35 ++
 benchmarks/workloads/ab/Dockerfile                 |  15 +
 benchmarks/workloads/ab/__init__.py                |  88 ++++
 benchmarks/workloads/ab/ab_test.py                 |  42 ++
 benchmarks/workloads/absl/BUILD                    |  35 ++
 benchmarks/workloads/absl/Dockerfile               |  24 ++
 benchmarks/workloads/absl/__init__.py              |  63 +++
 benchmarks/workloads/absl/absl_test.py             |  31 ++
 benchmarks/workloads/curl/BUILD                    |  11 +
 benchmarks/workloads/curl/Dockerfile               |  14 +
 benchmarks/workloads/ffmpeg/BUILD                  |  16 +
 benchmarks/workloads/ffmpeg/Dockerfile             |  10 +
 benchmarks/workloads/ffmpeg/__init__.py            |  20 +
 benchmarks/workloads/fio/BUILD                     |  35 ++
 benchmarks/workloads/fio/Dockerfile                |  23 +
 benchmarks/workloads/fio/__init__.py               | 369 ++++++++++++++++
 benchmarks/workloads/fio/fio_test.py               |  44 ++
 benchmarks/workloads/httpd/BUILD                   |  11 +
 benchmarks/workloads/httpd/Dockerfile              |  27 ++
 benchmarks/workloads/iperf/BUILD                   |  35 ++
 benchmarks/workloads/iperf/Dockerfile              |  14 +
 benchmarks/workloads/iperf/__init__.py             |  40 ++
 benchmarks/workloads/iperf/iperf_test.py           |  28 ++
 benchmarks/workloads/netcat/BUILD                  |  11 +
 benchmarks/workloads/netcat/Dockerfile             |  14 +
 benchmarks/workloads/nginx/BUILD                   |  11 +
 benchmarks/workloads/nginx/Dockerfile              |   1 +
 benchmarks/workloads/node/BUILD                    |  13 +
 benchmarks/workloads/node/Dockerfile               |   2 +
 benchmarks/workloads/node/index.js                 |  28 ++
 benchmarks/workloads/node/package.json             |  19 +
 benchmarks/workloads/node_template/BUILD           |  15 +
 benchmarks/workloads/node_template/Dockerfile      |   5 +
 benchmarks/workloads/node_template/index.hbs       |   8 +
 benchmarks/workloads/node_template/index.js        |  43 ++
 .../workloads/node_template/package-lock.json      | 476 +++++++++++++++++++++
 benchmarks/workloads/node_template/package.json    |  19 +
 benchmarks/workloads/redis/BUILD                   |  11 +
 benchmarks/workloads/redis/Dockerfile              |   1 +
 benchmarks/workloads/redisbenchmark/BUILD          |  35 ++
 benchmarks/workloads/redisbenchmark/Dockerfile     |   4 +
 benchmarks/workloads/redisbenchmark/__init__.py    |  85 ++++
 .../redisbenchmark/redisbenchmark_test.py          |  51 +++
 benchmarks/workloads/ruby/BUILD                    |  15 +
 benchmarks/workloads/ruby/Dockerfile               |  28 ++
 benchmarks/workloads/ruby/Gemfile                  |  12 +
 benchmarks/workloads/ruby/Gemfile.lock             |  55 +++
 benchmarks/workloads/ruby/config.ru                |   2 +
 benchmarks/workloads/ruby/index.rb                 |  14 +
 benchmarks/workloads/ruby_template/BUILD           |  16 +
 benchmarks/workloads/ruby_template/Dockerfile      |  38 ++
 benchmarks/workloads/ruby_template/Gemfile         |   5 +
 benchmarks/workloads/ruby_template/Gemfile.lock    |  26 ++
 benchmarks/workloads/ruby_template/config.ru       |   2 +
 benchmarks/workloads/ruby_template/index.erb       |   8 +
 benchmarks/workloads/ruby_template/main.rb         |  27 ++
 benchmarks/workloads/sleep/BUILD                   |  11 +
 benchmarks/workloads/sleep/Dockerfile              |   3 +
 benchmarks/workloads/sysbench/BUILD                |  35 ++
 benchmarks/workloads/sysbench/Dockerfile           |  16 +
 benchmarks/workloads/sysbench/__init__.py          | 167 ++++++++
 benchmarks/workloads/sysbench/sysbench_test.py     |  34 ++
 benchmarks/workloads/syscall/BUILD                 |  36 ++
 benchmarks/workloads/syscall/Dockerfile            |   6 +
 benchmarks/workloads/syscall/__init__.py           |  29 ++
 benchmarks/workloads/syscall/syscall.c             |  55 +++
 benchmarks/workloads/syscall/syscall_test.py       |  27 ++
 benchmarks/workloads/tensorflow/BUILD              |  16 +
 benchmarks/workloads/tensorflow/Dockerfile         |  14 +
 benchmarks/workloads/tensorflow/__init__.py        |  20 +
 benchmarks/workloads/true/BUILD                    |  11 +
 benchmarks/workloads/true/Dockerfile               |   3 +
 scripts/benchmarks.sh                              |  53 +++
 scripts/simple_tests.sh                            |   2 +-
 120 files changed, 6706 insertions(+), 4 deletions(-)
 create mode 100644 benchmarks/BUILD
 create mode 100644 benchmarks/README.md
 create mode 100644 benchmarks/defs.bzl
 create mode 100644 benchmarks/examples/localhost.yaml
 create mode 100644 benchmarks/harness/BUILD
 create mode 100644 benchmarks/harness/__init__.py
 create mode 100644 benchmarks/harness/benchmark_driver.py
 create mode 100644 benchmarks/harness/container.py
 create mode 100644 benchmarks/harness/machine.py
 create mode 100644 benchmarks/harness/machine_mocks/BUILD
 create mode 100644 benchmarks/harness/machine_mocks/__init__.py
 create mode 100644 benchmarks/harness/machine_producers/BUILD
 create mode 100644 benchmarks/harness/machine_producers/__init__.py
 create mode 100644 benchmarks/harness/machine_producers/machine_producer.py
 create mode 100644 benchmarks/harness/machine_producers/mock_producer.py
 create mode 100644 benchmarks/harness/machine_producers/yaml_producer.py
 create mode 100644 benchmarks/harness/ssh_connection.py
 create mode 100644 benchmarks/harness/tunnel_dispatcher.py
 create mode 100644 benchmarks/requirements.txt
 create mode 100644 benchmarks/run.py
 create mode 100644 benchmarks/runner/BUILD
 create mode 100644 benchmarks/runner/__init__.py
 create mode 100644 benchmarks/runner/runner_test.py
 create mode 100644 benchmarks/suites/BUILD
 create mode 100644 benchmarks/suites/__init__.py
 create mode 100644 benchmarks/suites/absl.py
 create mode 100644 benchmarks/suites/density.py
 create mode 100644 benchmarks/suites/fio.py
 create mode 100644 benchmarks/suites/helpers.py
 create mode 100644 benchmarks/suites/http.py
 create mode 100644 benchmarks/suites/media.py
 create mode 100644 benchmarks/suites/ml.py
 create mode 100644 benchmarks/suites/network.py
 create mode 100644 benchmarks/suites/redis.py
 create mode 100644 benchmarks/suites/startup.py
 create mode 100644 benchmarks/suites/sysbench.py
 create mode 100644 benchmarks/suites/syscall.py
 create mode 100644 benchmarks/tcp/BUILD
 create mode 100644 benchmarks/tcp/README.md
 create mode 100644 benchmarks/tcp/nsjoin.c
 create mode 100755 benchmarks/tcp/tcp_benchmark.sh
 create mode 100644 benchmarks/tcp/tcp_proxy.go
 create mode 100644 benchmarks/workloads/BUILD
 create mode 100644 benchmarks/workloads/__init__.py
 create mode 100644 benchmarks/workloads/ab/BUILD
 create mode 100644 benchmarks/workloads/ab/Dockerfile
 create mode 100644 benchmarks/workloads/ab/__init__.py
 create mode 100644 benchmarks/workloads/ab/ab_test.py
 create mode 100644 benchmarks/workloads/absl/BUILD
 create mode 100644 benchmarks/workloads/absl/Dockerfile
 create mode 100644 benchmarks/workloads/absl/__init__.py
 create mode 100644 benchmarks/workloads/absl/absl_test.py
 create mode 100644 benchmarks/workloads/curl/BUILD
 create mode 100644 benchmarks/workloads/curl/Dockerfile
 create mode 100644 benchmarks/workloads/ffmpeg/BUILD
 create mode 100644 benchmarks/workloads/ffmpeg/Dockerfile
 create mode 100644 benchmarks/workloads/ffmpeg/__init__.py
 create mode 100644 benchmarks/workloads/fio/BUILD
 create mode 100644 benchmarks/workloads/fio/Dockerfile
 create mode 100644 benchmarks/workloads/fio/__init__.py
 create mode 100644 benchmarks/workloads/fio/fio_test.py
 create mode 100644 benchmarks/workloads/httpd/BUILD
 create mode 100644 benchmarks/workloads/httpd/Dockerfile
 create mode 100644 benchmarks/workloads/iperf/BUILD
 create mode 100644 benchmarks/workloads/iperf/Dockerfile
 create mode 100644 benchmarks/workloads/iperf/__init__.py
 create mode 100644 benchmarks/workloads/iperf/iperf_test.py
 create mode 100644 benchmarks/workloads/netcat/BUILD
 create mode 100644 benchmarks/workloads/netcat/Dockerfile
 create mode 100644 benchmarks/workloads/nginx/BUILD
 create mode 100644 benchmarks/workloads/nginx/Dockerfile
 create mode 100644 benchmarks/workloads/node/BUILD
 create mode 100644 benchmarks/workloads/node/Dockerfile
 create mode 100644 benchmarks/workloads/node/index.js
 create mode 100644 benchmarks/workloads/node/package.json
 create mode 100644 benchmarks/workloads/node_template/BUILD
 create mode 100644 benchmarks/workloads/node_template/Dockerfile
 create mode 100644 benchmarks/workloads/node_template/index.hbs
 create mode 100644 benchmarks/workloads/node_template/index.js
 create mode 100644 benchmarks/workloads/node_template/package-lock.json
 create mode 100644 benchmarks/workloads/node_template/package.json
 create mode 100644 benchmarks/workloads/redis/BUILD
 create mode 100644 benchmarks/workloads/redis/Dockerfile
 create mode 100644 benchmarks/workloads/redisbenchmark/BUILD
 create mode 100644 benchmarks/workloads/redisbenchmark/Dockerfile
 create mode 100644 benchmarks/workloads/redisbenchmark/__init__.py
 create mode 100644 benchmarks/workloads/redisbenchmark/redisbenchmark_test.py
 create mode 100644 benchmarks/workloads/ruby/BUILD
 create mode 100644 benchmarks/workloads/ruby/Dockerfile
 create mode 100644 benchmarks/workloads/ruby/Gemfile
 create mode 100644 benchmarks/workloads/ruby/Gemfile.lock
 create mode 100755 benchmarks/workloads/ruby/config.ru
 create mode 100755 benchmarks/workloads/ruby/index.rb
 create mode 100644 benchmarks/workloads/ruby_template/BUILD
 create mode 100755 benchmarks/workloads/ruby_template/Dockerfile
 create mode 100755 benchmarks/workloads/ruby_template/Gemfile
 create mode 100644 benchmarks/workloads/ruby_template/Gemfile.lock
 create mode 100755 benchmarks/workloads/ruby_template/config.ru
 create mode 100755 benchmarks/workloads/ruby_template/index.erb
 create mode 100755 benchmarks/workloads/ruby_template/main.rb
 create mode 100644 benchmarks/workloads/sleep/BUILD
 create mode 100644 benchmarks/workloads/sleep/Dockerfile
 create mode 100644 benchmarks/workloads/sysbench/BUILD
 create mode 100644 benchmarks/workloads/sysbench/Dockerfile
 create mode 100644 benchmarks/workloads/sysbench/__init__.py
 create mode 100644 benchmarks/workloads/sysbench/sysbench_test.py
 create mode 100644 benchmarks/workloads/syscall/BUILD
 create mode 100644 benchmarks/workloads/syscall/Dockerfile
 create mode 100644 benchmarks/workloads/syscall/__init__.py
 create mode 100644 benchmarks/workloads/syscall/syscall.c
 create mode 100644 benchmarks/workloads/syscall/syscall_test.py
 create mode 100644 benchmarks/workloads/tensorflow/BUILD
 create mode 100644 benchmarks/workloads/tensorflow/Dockerfile
 create mode 100644 benchmarks/workloads/tensorflow/__init__.py
 create mode 100644 benchmarks/workloads/true/BUILD
 create mode 100644 benchmarks/workloads/true/Dockerfile
 create mode 100755 scripts/benchmarks.sh

diff --git a/Dockerfile b/Dockerfile
index 6e9d870db..5b95822f9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,8 @@
 FROM ubuntu:bionic
 
-RUN apt-get update && apt-get install -y curl gnupg2 git python3
+RUN apt-get update && apt-get install -y curl gnupg2 git python3 python3-distutils python3-pip
 RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-	curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
+    curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
 RUN apt-get update && apt-get install -y bazel && apt-get clean
 
 WORKDIR /gvisor
diff --git a/WORKSPACE b/WORKSPACE
index 0ad2bb17c..4561ed8fc 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,6 +1,7 @@
-# Load go bazel rules and gazelle.
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
 
+# Load go bazel rules and gazelle.
 http_archive(
     name = "io_bazel_rules_go",
     sha256 = "b9aa86ec08a292b97ec4591cf578e020b35f98e12173bbd4a921f84f583aebd9",
@@ -58,6 +59,26 @@ load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps")
 
 protobuf_deps()
 
+# Load python dependencies.
+git_repository(
+    name = "rules_python",
+    commit = "94677401bc56ed5d756f50b441a6a5c7f735a6d4",
+    remote = "https://github.com/bazelbuild/rules_python.git",
+    shallow_since = "1573842889 -0500",
+)
+
+load("@rules_python//python:pip.bzl", "pip_import")
+
+pip_import(
+    name = "pydeps",
+    python_interpreter = "python3",
+    requirements = "//benchmarks:requirements.txt",
+)
+
+load("@pydeps//:requirements.bzl", "pip_install")
+
+pip_install()
+
 # Load bazel_toolchain to support Remote Build Execution.
 # See releases at https://releases.bazel.build/bazel-toolchains.html
 http_archive(
diff --git a/benchmarks/BUILD b/benchmarks/BUILD
new file mode 100644
index 000000000..dbadeeaf2
--- /dev/null
+++ b/benchmarks/BUILD
@@ -0,0 +1,9 @@
+package(licenses = ["notice"])
+
+py_binary(
+    name = "benchmarks",
+    srcs = ["run.py"],
+    main = "run.py",
+    python_version = "PY3",
+    deps = ["//benchmarks/runner"],
+)
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 000000000..ad44cd6ac
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,172 @@
+# Benchmark tools
+
+These scripts are tools for collecting performance data for Docker-based tests.
+
+## Setup
+
+The scripts assume the following:
+
+*   You have a local machine with bazel installed.
+*   You have some machine(s) with docker installed. These machines will be
+    refered to as the "Environment".
+*   Environment machines have the runtime(s) under test installed, such that you
+    can run docker with a command like: `docker run --runtime=$RUNTIME
+    your/image`.
+*   You are able to login to machines in the environment with the local machine
+    via ssh and the user for ssh can run docker commands without using `sudo`.
+*   The docker daemon on each of your environment machines is listening on
+    `unix:///var/run/docker.sock` (docker's default).
+
+For configuring the environment manually, consult the
+[dockerd documentation][dockerd].
+
+## Environment
+
+All benchmarks require a user defined yaml file describe the environment. These
+files are of the form:
+
+```yaml
+machine1: local
+machine2:
+  hostname: 100.100.100.100
+  username: username
+  key_path: ~/private_keyfile
+  key_password: passphrase
+machine3:
+  hostname: 100.100.100.101
+  username: username
+  key_path: ~/private_keyfile
+  key_password: passphrase
+```
+
+The yaml file defines an environment with three machines named `machine1`,
+`machine2` and `machine3`. `machine1` is the local machine, `machine2` and
+`machine3` are remote machines. Both `machine2` and `machine3` should be
+reachable by `ssh`. For example, the command `ssh -i ~/private_keyfile
+username@100.100.100.100` (using the passphrase `passphrase`) should connect to
+`machine2`.
+
+The above is an example only. Machines should be uniform, since they are treated
+as such by the tests. Machines must also be accessible to each other via their
+default routes. Furthermore, some benchmarks will meaningless if running on the
+local machine, such as density.
+
+For remote machines, `hostname`, `key_path`, and `username` are required and
+others are optional. In addition key files must be generated
+[using the instrcutions below](#generating-ssh-keys).
+
+The above yaml file can be checked for correctness with the `validate` command
+in the top level perf.py script:
+
+`bazel run :benchmarks -- validate $PWD/examples/localhost.yaml`
+
+## Running benchmarks
+
+To list available benchmarks, use the `list` commmand:
+
+```bash
+bazel run :benchmarks -- list
+
+...
+Benchmark: sysbench.cpu
+Metrics: events_per_second
+    Run sysbench CPU test. Additional arguments can be provided for sysbench.
+
+    :param max_prime: The maximum prime number to search.
+```
+
+To run benchmarks, use the `run` command. For example, to run the sysbench
+benchmark above:
+
+```bash
+bazel run :benchmarks -- run --env $PWD/examples/localhost.yaml sysbench.cpu
+```
+
+You can run parameterized benchmarks, for example to run with different
+runtimes:
+
+```bash
+bazel run :benchmarks -- run --env $PWD/examples/localhost.yaml --runtime=runc --runtime=runsc sysbench.cpu
+```
+
+Or with different parameters:
+
+```bash
+bazel run :benchmarks -- run --env $PWD/examples/localhost.yaml --max_prime=10 --max_prime=100 sysbench.cpu
+```
+
+## Writing benchmarks
+
+To write new benchmarks, you should familiarize yourself with the structure of
+the repository. There are three key components.
+
+## Harness
+
+The harness makes use of the [docker py SDK][docker-py]. It is advisable that
+you familiarize yourself with that API when making changes, specifically:
+
+*   clients
+*   containers
+*   images
+
+In general, benchmarks need only interact with the `Machine` objects provided to
+the benchmark function, which are the machines defined in the environment. These
+objects allow the benchmark to define the relationships between different
+containers, and parse the output.
+
+## Workloads
+
+The harness requires workloads to run. These are all available in the
+`workloads` directory.
+
+In general, a workload consists of a Dockerfile to build it (while these are not
+hermetic, in general they should be as fixed and isolated as possible), some
+parses for output if required, parser tests and sample data. Provided the test
+is named after the workload package and contains a function named `sample`, this
+variable will be used to automatically mock workload output when the `--mock`
+flag is provided to the main tool.
+
+## Writing benchmarks
+
+Benchmarks define the tests themselves. All benchmarks have the following
+function signature:
+
+```python
+def my_func(output) -> float:
+    return float(output)
+
+@benchmark(metrics = my_func, machines = 1)
+def my_benchmark(machine: machine.Machine, arg: str):
+    return "3.4432"
+```
+
+Each benchmark takes a variable amount of position arguments as
+`harness.Machine` objects and some set of keyword arguments. It is recommended
+that you accept arbitrary keyword arguments and pass them through when
+constructing the container under test.
+
+To write a new benchmark, open a module in the `suites` directory and use the
+above signature. You should add a descriptive doc string to describe what your
+benchmark is and any test centric arguments.
+
+## Generating SSH Keys
+
+The scripts only support RSA Keys, and ssh library used in paramiko. Paramiko
+only supports RSA keys that look like the following (PEM format):
+
+```bash
+$ cat /path/to/ssh/key
+
+-----BEGIN RSA PRIVATE KEY-----
+...private key text...
+-----END RSA PRIVATE KEY-----
+
+```
+
+To generate ssh keys in PEM format, use the [`-t rsa -m PEM -b 4096`][RSA-keys].
+option.
+
+[dockerd]: https://docs.docker.com/engine/reference/commandline/dockerd/
+[docker-py]: https://docker-py.readthedocs.io/en/stable/
+[paramiko]: http://docs.paramiko.org/en/2.4/api/client.html
+[RSA-keys]: https://serverfault.com/questions/939909/ssh-keygen-does-not-create-rsa-private-key
diff --git a/benchmarks/defs.bzl b/benchmarks/defs.bzl
new file mode 100644
index 000000000..79e6cdbc8
--- /dev/null
+++ b/benchmarks/defs.bzl
@@ -0,0 +1,18 @@
+"""Provides python helper functions."""
+
+load("@pydeps//:requirements.bzl", _requirement = "requirement")
+
+def filter_deps(deps = None):
+    if deps == None:
+        deps = []
+    return [dep for dep in deps if dep]
+
+def py_library(deps = None, **kwargs):
+    return native.py_library(deps = filter_deps(deps), **kwargs)
+
+def py_test(deps = None, **kwargs):
+    return native.py_test(deps = filter_deps(deps), **kwargs)
+
+def requirement(name, direct = True):
+    """ requirement returns the required dependency. """
+    return _requirement(name)
diff --git a/benchmarks/examples/localhost.yaml b/benchmarks/examples/localhost.yaml
new file mode 100644
index 000000000..f70fe0fb7
--- /dev/null
+++ b/benchmarks/examples/localhost.yaml
@@ -0,0 +1,2 @@
+client: localhost
+server: localhost
diff --git a/benchmarks/harness/BUILD b/benchmarks/harness/BUILD
new file mode 100644
index 000000000..9546220c4
--- /dev/null
+++ b/benchmarks/harness/BUILD
@@ -0,0 +1,89 @@
+load("//benchmarks:defs.bzl", "py_library", "requirement")
+
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "harness",
+    srcs = ["__init__.py"],
+)
+
+py_library(
+    name = "benchmark_driver",
+    srcs = ["benchmark_driver.py"],
+    deps = [
+        "//benchmarks/harness/machine_mocks",
+        "//benchmarks/harness/machine_producers:machine_producer",
+        "//benchmarks/suites",
+    ],
+)
+
+py_library(
+    name = "container",
+    srcs = ["container.py"],
+    deps = [
+        requirement("asn1crypto", False),
+        requirement("chardet", False),
+        requirement("certifi", False),
+        requirement("docker", True),
+        requirement("docker-pycreds", False),
+        requirement("idna", False),
+        requirement("ptyprocess", False),
+        requirement("requests", False),
+        requirement("urllib3", False),
+        requirement("websocket-client", False),
+    ],
+)
+
+py_library(
+    name = "machine",
+    srcs = ["machine.py"],
+    deps = [
+        "//benchmarks/harness",
+        "//benchmarks/harness:container",
+        "//benchmarks/harness:ssh_connection",
+        "//benchmarks/harness:tunnel_dispatcher",
+        requirement("asn1crypto", False),
+        requirement("chardet", False),
+        requirement("certifi", False),
+        requirement("docker", True),
+        requirement("docker-pycreds", False),
+        requirement("idna", False),
+        requirement("ptyprocess", False),
+        requirement("requests", False),
+        requirement("urllib3", False),
+        requirement("websocket-client", False),
+    ],
+)
+
+py_library(
+    name = "ssh_connection",
+    srcs = ["ssh_connection.py"],
+    deps = [
+        "//benchmarks/harness",
+        requirement("bcrypt", False),
+        requirement("cffi", False),
+        requirement("paramiko", True),
+        requirement("cryptography", False),
+    ],
+)
+
+py_library(
+    name = "tunnel_dispatcher",
+    srcs = ["tunnel_dispatcher.py"],
+    deps = [
+        requirement("asn1crypto", False),
+        requirement("chardet", False),
+        requirement("certifi", False),
+        requirement("docker", True),
+        requirement("docker-pycreds", False),
+        requirement("idna", False),
+        requirement("pexpect", True),
+        requirement("ptyprocess", False),
+        requirement("requests", False),
+        requirement("urllib3", False),
+        requirement("websocket-client", False),
+    ],
+)
diff --git a/benchmarks/harness/__init__.py b/benchmarks/harness/__init__.py
new file mode 100644
index 000000000..a7f34da9e
--- /dev/null
+++ b/benchmarks/harness/__init__.py
@@ -0,0 +1,25 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Core benchmark utilities."""
+
+import os
+
+# LOCAL_WORKLOADS_PATH defines the path to use for local workloads. This is a
+# format string that accepts a single string parameter.
+LOCAL_WORKLOADS_PATH = os.path.join(
+    os.path.dirname(__file__), "../workloads/{}")
+
+# REMOTE_WORKLOADS_PATH defines the path to use for storing the workloads on the
+# remote host. This is a format string that accepts a single string parameter.
+REMOTE_WORKLOADS_PATH = "workloads/{}"
diff --git a/benchmarks/harness/benchmark_driver.py b/benchmarks/harness/benchmark_driver.py
new file mode 100644
index 000000000..9abc21b54
--- /dev/null
+++ b/benchmarks/harness/benchmark_driver.py
@@ -0,0 +1,85 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Main driver for benchmarks."""
+
+import copy
+import statistics
+import threading
+import types
+
+from benchmarks import suites
+from benchmarks.harness.machine_producers import machine_producer
+
+
+# pylint: disable=too-many-instance-attributes
+class BenchmarkDriver:
+  """Allocates machines and invokes a benchmark method."""
+
+  def __init__(self,
+               producer: machine_producer.MachineProducer,
+               method: types.FunctionType,
+               runs: int = 1,
+               **kwargs):
+
+    self._producer = producer
+    self._method = method
+    self._kwargs = copy.deepcopy(kwargs)
+    self._threads = []
+    self.lock = threading.RLock()
+    self._runs = runs
+    self._metric_results = {}
+
+  def start(self):
+    """Starts a benchmark thread."""
+    for _ in range(self._runs):
+      thread = threading.Thread(target=self._run_method)
+      thread.start()
+      self._threads.append(thread)
+
+  def join(self):
+    """Joins the thread."""
+    # pylint: disable=expression-not-assigned
+    [t.join() for t in self._threads]
+
+  def _run_method(self):
+    """Runs all benchmarks."""
+    machines = self._producer.get_machines(
+        suites.benchmark_machines(self._method))
+    try:
+      result = self._method(*machines, **self._kwargs)
+      for name, res in result:
+        with self.lock:
+          if name in self._metric_results:
+            self._metric_results[name].append(res)
+          else:
+            self._metric_results[name] = [res]
+    finally:
+      # Always release.
+      self._producer.release_machines(machines)
+
+  def median(self):
+    """Returns the median result, after join is finished."""
+    for key, value in self._metric_results.items():
+      yield key, [statistics.median(value)]
+
+  def all(self):
+    """Returns all results."""
+    for key, value in self._metric_results.items():
+      yield key, value
+
+  def meanstd(self):
+    """Returns all results."""
+    for key, value in self._metric_results.items():
+      mean = statistics.mean(value)
+      yield key, [mean, statistics.stdev(value, xbar=mean)]
diff --git a/benchmarks/harness/container.py b/benchmarks/harness/container.py
new file mode 100644
index 000000000..585436e20
--- /dev/null
+++ b/benchmarks/harness/container.py
@@ -0,0 +1,181 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Container definitions."""
+
+import contextlib
+import logging
+import pydoc
+import types
+from typing import Tuple
+
+import docker
+import docker.errors
+
+from benchmarks import workloads
+
+
+class Container:
+  """Abstract container.
+
+  Must be a context manager.
+
+  Usage:
+
+    with Container(client, image, ...):
+        ...
+  """
+
+  def run(self, **env) -> str:
+    """Run the container synchronously."""
+    raise NotImplementedError
+
+  def detach(self, **env):
+    """Run the container asynchronously."""
+    raise NotImplementedError
+
+  def address(self) -> Tuple[str, int]:
+    """Return the bound address for the container."""
+    raise NotImplementedError
+
+  def get_names(self) -> types.GeneratorType:
+    """Return names of all containers."""
+    raise NotImplementedError
+
+
+# pylint: disable=too-many-instance-attributes
+class DockerContainer(Container):
+  """Class that handles creating a docker container."""
+
+  # pylint: disable=too-many-arguments
+  def __init__(self,
+               client: docker.DockerClient,
+               host: str,
+               image: str,
+               count: int = 1,
+               runtime: str = "runc",
+               port: int = 0,
+               **kwargs):
+    """Trys to setup "count" containers.
+
+    Args:
+      client: A docker client from dockerpy.
+      host: The host address the image is running on.
+      image: The name of the image to run.
+      count: The number of containers to setup.
+      runtime: The container runtime to use.
+      port: The port to reserve.
+      **kwargs: Additional container options.
+    """
+    assert count >= 1
+    assert port == 0 or count == 1
+    self._client = client
+    self._host = host
+    self._containers = []
+    self._count = count
+    self._image = image
+    self._runtime = runtime
+    self._port = port
+    self._kwargs = kwargs
+    if port != 0:
+      self._ports = {"%d/tcp" % port: None}
+    else:
+      self._ports = {}
+
+  @contextlib.contextmanager
+  def detach(self, **env):
+    env = ["%s=%s" % (key, value) for (key, value) in env.items()]
+    # Start all containers.
+    for _ in range(self._count):
+      try:
+        # Start the container in a detached mode.
+        container = self._client.containers.run(
+            self._image,
+            detach=True,
+            remove=True,
+            runtime=self._runtime,
+            ports=self._ports,
+            environment=env,
+            **self._kwargs)
+        logging.info("Started detached container %s -> %s", self._image,
+                     container.attrs["Id"])
+        self._containers.append(container)
+      except Exception as exc:
+        self._clean_containers()
+        raise exc
+    try:
+      # Wait for all containers to be up.
+      for container in self._containers:
+        while not container.attrs["State"]["Running"]:
+          container = self._client.containers.get(container.attrs["Id"])
+      yield self
+    finally:
+      self._clean_containers()
+
+  def address(self) -> Tuple[str, int]:
+    assert self._count == 1
+    assert self._port != 0
+    container = self._client.containers.get(self._containers[0].attrs["Id"])
+    port = container.attrs["NetworkSettings"]["Ports"][
+        "%d/tcp" % self._port][0]["HostPort"]
+    return (self._host, port)
+
+  def get_names(self) -> types.GeneratorType:
+    for container in self._containers:
+      yield container.name
+
+  def run(self, **env) -> str:
+    env = ["%s=%s" % (key, value) for (key, value) in env.items()]
+    return self._client.containers.run(
+        self._image,
+        runtime=self._runtime,
+        ports=self._ports,
+        remove=True,
+        environment=env,
+        **self._kwargs).decode("utf-8")
+
+  def _clean_containers(self):
+    """Kills all containers."""
+    for container in self._containers:
+      try:
+        container.kill()
+      except docker.errors.NotFound:
+        pass
+
+
+class MockContainer(Container):
+  """Mock of Container."""
+
+  def __init__(self, workload: str):
+    self._workload = workload
+
+  def __enter__(self):
+    return self
+
+  def run(self, **env):
+    # Lookup sample data if any exists for the workload module. We use a
+    # well-defined test locate and a well-defined sample function.
+    mod = pydoc.locate(workloads.__name__ + "." + self._workload)
+    if hasattr(mod, "sample"):
+      return mod.sample(**env)
+    return ""  # No output.
+
+  def address(self) -> Tuple[str, int]:
+    return ("example.com", 80)
+
+  def get_names(self) -> types.GeneratorType:
+    yield "mock"
+
+  @contextlib.contextmanager
+  def detach(self, **env):
+    yield self
diff --git a/benchmarks/harness/machine.py b/benchmarks/harness/machine.py
new file mode 100644
index 000000000..2166d040a
--- /dev/null
+++ b/benchmarks/harness/machine.py
@@ -0,0 +1,191 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Machine abstraction. This is the primary API for benchmarks."""
+
+import logging
+import re
+import subprocess
+import time
+from typing import Tuple
+
+import docker
+
+from benchmarks import harness
+from benchmarks.harness import container
+from benchmarks.harness import machine_mocks
+from benchmarks.harness import ssh_connection
+from benchmarks.harness import tunnel_dispatcher
+
+
+class Machine:
+  """The machine object is the primary object for benchmarks.
+
+  Machine objects are passed to each metric function call and benchmarks use
+  machines to access real connections to those machines.
+  """
+
+  def run(self, cmd: str) -> Tuple[str, str]:
+    """Convenience method for running a bash command on a machine object.
+
+    Some machines may point to the local machine, and thus, do not have ssh
+    connections. Run runs a command either local or over ssh and returns the
+    output stdout and stderr as strings.
+
+    Args:
+      cmd: The command to run as a string.
+
+    Returns:
+      The command output.
+    """
+    raise NotImplementedError
+
+  def read(self, path: str) -> str:
+    """Reads the contents of some file.
+
+    This will be mocked.
+
+    Args:
+      path: The path to the file to be read.
+
+    Returns:
+      The file contents.
+    """
+    raise NotImplementedError
+
+  def pull(self, workload: str) -> str:
+    """Send the given workload to the machine, build and tag it.
+
+    All images must be defined by the workloads directory.
+
+    Args:
+      workload: The workload name.
+
+    Returns:
+      The workload tag.
+    """
+    raise NotImplementedError
+
+  def container(self, image: str, **kwargs) -> container.Container:
+    """Returns a container object.
+
+    Args:
+      image: The pulled image tag.
+      **kwargs: Additional container options.
+
+    Returns:
+        :return: a container.Container object.
+    """
+    raise NotImplementedError
+
+  def sleep(self, amount: float):
+    """Sleeps the given amount of time."""
+    raise NotImplementedError
+
+
+class MockMachine(Machine):
+  """A mocked machine."""
+
+  def run(self, cmd: str) -> Tuple[str, str]:
+    return "", ""
+
+  def read(self, path: str) -> str:
+    return machine_mocks.Readfile(path)
+
+  def pull(self, workload: str) -> str:
+    return workload  # Workload is the tag.
+
+  def container(self, image: str, **kwargs) -> container.Container:
+    return container.MockContainer(image)
+
+  def sleep(self, amount: float):
+    pass
+
+
+def get_address(machine: Machine) -> str:
+  """Return a machine's default address."""
+  default_route, _ = machine.run("ip route get 8.8.8.8")
+  return re.search(" src ([0-9.]+) ", default_route).group(1)
+
+
+class LocalMachine(Machine):
+  """The local machine."""
+
+  def __init__(self, name):
+    self._name = name
+    self._docker_client = docker.from_env()
+
+  def __str__(self):
+    return self._name
+
+  def run(self, cmd: str) -> Tuple[str, str]:
+    process = subprocess.Popen(
+        cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdout, stderr = process.communicate()
+    return stdout.decode("utf-8"), stderr.decode("utf-8")
+
+  def read(self, path: str) -> str:
+    # Read the exact path locally.
+    return open(path, "r").read()
+
+  def pull(self, workload: str) -> str:
+    # Run the docker build command locally.
+    logging.info("Building %s@%s locally...", workload, self._name)
+    self.run("docker build --tag={} {}".format(
+        workload, harness.LOCAL_WORKLOADS_PATH.format(workload)))
+    return workload  # Workload is the tag.
+
+  def container(self, image: str, **kwargs) -> container.Container:
+    # Return a local docker container directly.
+    return container.DockerContainer(self._docker_client, get_address(self),
+                                     image, **kwargs)
+
+  def sleep(self, amount: float):
+    time.sleep(amount)
+
+
+class RemoteMachine(Machine):
+  """Remote machine accessible via an SSH connection."""
+
+  def __init__(self, name, **kwargs):
+    self._name = name
+    self._ssh_connection = ssh_connection.SSHConnection(name, **kwargs)
+    self._tunnel = tunnel_dispatcher.Tunnel(name, **kwargs)
+    self._tunnel.connect()
+    self._docker_client = self._tunnel.get_docker_client()
+
+  def __str__(self):
+    return self._name
+
+  def run(self, cmd: str) -> Tuple[str, str]:
+    return self._ssh_connection.run(cmd)
+
+  def read(self, path: str) -> str:
+    # Just cat remotely.
+    stdout, stderr = self._ssh_connection.run("cat '{}'".format(path))
+    return stdout + stderr
+
+  def pull(self, workload: str) -> str:
+    # Push to the remote machine and build.
+    logging.info("Building %s@%s remotely...", workload, self._name)
+    remote_path = self._ssh_connection.send_workload(workload)
+    self.run("docker build --tag={} {}".format(workload, remote_path))
+    return workload  # Workload is the tag.
+
+  def container(self, image: str, **kwargs) -> container.Container:
+    # Return a remote docker container.
+    return container.DockerContainer(self._docker_client, get_address(self),
+                                     image, **kwargs)
+
+  def sleep(self, amount: float):
+    time.sleep(amount)
diff --git a/benchmarks/harness/machine_mocks/BUILD b/benchmarks/harness/machine_mocks/BUILD
new file mode 100644
index 000000000..c8ec4bc79
--- /dev/null
+++ b/benchmarks/harness/machine_mocks/BUILD
@@ -0,0 +1,9 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "machine_mocks",
+    srcs = ["__init__.py"],
+)
diff --git a/benchmarks/harness/machine_mocks/__init__.py b/benchmarks/harness/machine_mocks/__init__.py
new file mode 100644
index 000000000..00f0085d7
--- /dev/null
+++ b/benchmarks/harness/machine_mocks/__init__.py
@@ -0,0 +1,81 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Machine mock files."""
+
+MEMINFO = """\
+MemTotal:        7652344 kB
+MemFree:         7174724 kB
+MemAvailable:    7152008 kB
+Buffers:            7544 kB
+Cached:           178856 kB
+SwapCached:            0 kB
+Active:           270928 kB
+Inactive:          68436 kB
+Active(anon):     153124 kB
+Inactive(anon):      880 kB
+Active(file):     117804 kB
+Inactive(file):    67556 kB
+Unevictable:           0 kB
+Mlocked:               0 kB
+SwapTotal:             0 kB
+SwapFree:              0 kB
+Dirty:               900 kB
+Writeback:             0 kB
+AnonPages:        153000 kB
+Mapped:           129120 kB
+Shmem:              1044 kB
+Slab:              60864 kB
+SReclaimable:      22792 kB
+SUnreclaim:        38072 kB
+KernelStack:        2672 kB
+PageTables:         5756 kB
+NFS_Unstable:          0 kB
+Bounce:                0 kB
+WritebackTmp:          0 kB
+CommitLimit:     3826172 kB
+Committed_AS:     663836 kB
+VmallocTotal:   34359738367 kB
+VmallocUsed:           0 kB
+VmallocChunk:          0 kB
+HardwareCorrupted:     0 kB
+AnonHugePages:         0 kB
+ShmemHugePages:        0 kB
+ShmemPmdMapped:        0 kB
+CmaTotal:              0 kB
+CmaFree:               0 kB
+HugePages_Total:       0
+HugePages_Free:        0
+HugePages_Rsvd:        0
+HugePages_Surp:        0
+Hugepagesize:       2048 kB
+DirectMap4k:       94196 kB
+DirectMap2M:     4624384 kB
+DirectMap1G:     3145728 kB
+"""
+
+CONTENTS = {
+    "/proc/meminfo": MEMINFO,
+}
+
+
+def Readfile(path: str) -> str:
+  """Reads a mock file.
+
+  Args:
+    path: The target path.
+
+  Returns:
+    Mocked file contents or None.
+  """
+  return CONTENTS.get(path, None)
diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD
new file mode 100644
index 000000000..5b2228e01
--- /dev/null
+++ b/benchmarks/harness/machine_producers/BUILD
@@ -0,0 +1,35 @@
+load("//benchmarks:defs.bzl", "py_library", "requirement")
+
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "harness",
+    srcs = ["__init__.py"],
+)
+
+py_library(
+    name = "machine_producer",
+    srcs = ["machine_producer.py"],
+)
+
+py_library(
+    name = "mock_producer",
+    srcs = ["mock_producer.py"],
+    deps = [
+        "//benchmarks/harness:machine",
+        "//benchmarks/harness/machine_producers:machine_producer",
+    ],
+)
+
+py_library(
+    name = "yaml_producer",
+    srcs = ["yaml_producer.py"],
+    deps = [
+        "//benchmarks/harness:machine",
+        "//benchmarks/harness/machine_producers:machine_producer",
+        requirement("PyYAML", False),
+    ],
+)
diff --git a/benchmarks/harness/machine_producers/__init__.py b/benchmarks/harness/machine_producers/__init__.py
new file mode 100644
index 000000000..634ef4843
--- /dev/null
+++ b/benchmarks/harness/machine_producers/__init__.py
@@ -0,0 +1,13 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/benchmarks/harness/machine_producers/machine_producer.py b/benchmarks/harness/machine_producers/machine_producer.py
new file mode 100644
index 000000000..124ee14cc
--- /dev/null
+++ b/benchmarks/harness/machine_producers/machine_producer.py
@@ -0,0 +1,30 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Abstract types."""
+
+from typing import List
+
+from benchmarks.harness import machine
+
+
+class MachineProducer:
+  """Abstract Machine producer."""
+
+  def get_machines(self, num_machines: int) -> List[machine.Machine]:
+    """Returns the requested number of machines."""
+    raise NotImplementedError
+
+  def release_machines(self, machine_list: List[machine.Machine]):
+    """Releases the given set of machines."""
+    raise NotImplementedError
diff --git a/benchmarks/harness/machine_producers/mock_producer.py b/benchmarks/harness/machine_producers/mock_producer.py
new file mode 100644
index 000000000..4f29ad53f
--- /dev/null
+++ b/benchmarks/harness/machine_producers/mock_producer.py
@@ -0,0 +1,31 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Producers of mocks."""
+
+from typing import List
+
+from benchmarks.harness import machine
+from benchmarks.harness.machine_producers import machine_producer
+
+
+class MockMachineProducer(machine_producer.MachineProducer):
+  """Produces MockMachine objects."""
+
+  def get_machines(self, num_machines: int) -> List[machine.MockMachine]:
+    """Returns the request number of MockMachines."""
+    return [machine.MockMachine() for i in range(num_machines)]
+
+  def release_machines(self, machine_list: List[machine.MockMachine]):
+    """No-op."""
+    return
diff --git a/benchmarks/harness/machine_producers/yaml_producer.py b/benchmarks/harness/machine_producers/yaml_producer.py
new file mode 100644
index 000000000..5d334e480
--- /dev/null
+++ b/benchmarks/harness/machine_producers/yaml_producer.py
@@ -0,0 +1,106 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Producers based on yaml files."""
+
+import os
+import threading
+from typing import Dict
+from typing import List
+
+import yaml
+
+from benchmarks.harness import machine
+from benchmarks.harness.machine_producers import machine_producer
+
+
+class YamlMachineProducer(machine_producer.MachineProducer):
+  """Loads machines from a yaml file."""
+
+  def __init__(self, path: str):
+    self.machines = build_machines(path)
+    self.max_machines = len(self.machines)
+    self.machine_condition = threading.Condition()
+
+  def get_machines(self, num_machines: int) -> List[machine.Machine]:
+    if num_machines > self.max_machines:
+      raise ValueError(
+          "Insufficient Ammount of Machines. {ask} asked for and have {max_num} max."
+          .format(ask=num_machines, max_num=self.max_machines))
+
+    with self.machine_condition:
+      while not self._enough_machines(num_machines):
+        self.machine_condition.wait(timeout=1)
+      return [self.machines.pop(0) for _ in range(num_machines)]
+
+  def release_machines(self, machine_list: List[machine.Machine]):
+    with self.machine_condition:
+      while machine_list:
+        next_machine = machine_list.pop()
+        self.machines.append(next_machine)
+      self.machine_condition.notify()
+
+  def _enough_machines(self, ask: int):
+    return ask <= len(self.machines)
+
+
+def build_machines(path: str, num_machines: str = -1) -> List[machine.Machine]:
+  """Builds machine objects defined by the yaml file "path".
+
+  Args:
+    path: The path to a yaml file which defines machines.
+    num_machines: Optional limit on how many machine objects to build.
+
+  Returns:
+    Machine objects in a list.
+
+    If num_machines is set, len(machines) <= num_machines.
+  """
+  data = parse_yaml(path)
+  machines = []
+  for key, value in data.items():
+    if len(machines) == num_machines:
+      return machines
+    if isinstance(value, dict):
+      machines.append(machine.RemoteMachine(key, **value))
+    else:
+      machines.append(machine.LocalMachine(key))
+  return machines
+
+
+def parse_yaml(path: str) -> Dict[str, Dict[str, str]]:
+  """Parse the yaml file pointed by path.
+
+  Args:
+    path: The path to yaml file.
+
+  Returns:
+    The contents of the yaml file as a dictionary.
+  """
+  data = get_file_contents(path)
+  return yaml.load(data, Loader=yaml.Loader)
+
+
+def get_file_contents(path: str) -> str:
+  """Dumps the file contents to a string and returns them.
+
+  Args:
+    path: The path to dump.
+
+  Returns:
+    The file contents as a string.
+  """
+  if not os.path.isabs(path):
+    path = os.path.abspath(path)
+  with open(path) as input_file:
+    return input_file.read()
diff --git a/benchmarks/harness/ssh_connection.py b/benchmarks/harness/ssh_connection.py
new file mode 100644
index 000000000..fcbfbcdb2
--- /dev/null
+++ b/benchmarks/harness/ssh_connection.py
@@ -0,0 +1,111 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""SSHConnection handles the details of SSH connections."""
+
+import os
+import warnings
+
+import paramiko
+
+from benchmarks import harness
+
+# Get rid of paramiko Cryptography Warnings.
+warnings.filterwarnings(action="ignore", module=".*paramiko.*")
+
+
+def send_one_file(client: paramiko.SSHClient, path: str, remote_dir: str):
+  """Sends a single file via an SSH client.
+
+  Args:
+    client: The existing SSH client.
+    path: The local path.
+    remote_dir: The remote directory.
+  """
+  filename = path.split("/").pop()
+  client.exec_command("mkdir -p " + remote_dir)
+  with client.open_sftp() as ftp_client:
+    ftp_client.put(path, os.path.join(remote_dir, filename))
+
+
+class SSHConnection:
+  """SSH connection to a remote machine."""
+
+  def __init__(self, name: str, hostname: str, key_path: str, username: str,
+               **kwargs):
+    """Sets up a paramiko ssh connection to the given hostname."""
+    self._name = name  # Unused.
+    self._hostname = hostname
+    self._username = username
+    self._key_path = key_path  # RSA Key path
+    self._kwargs = kwargs
+    # SSHConnection wraps paramiko. paramiko supports RSA, ECDSA, and Ed25519
+    # keys, and we've chosen to only suport and require RSA keys. paramiko
+    # supports RSA keys that begin with '----BEGIN RSAKEY----'.
+    # https://stackoverflow.com/questions/53600581/ssh-key-generated-by-ssh-keygen-is-not-recognized-by-paramiko
+    self.rsa_key = self._rsa()
+    self.run("true")  # Validate.
+
+  def _client(self) -> paramiko.SSHClient:
+    """Returns a connected SSH client."""
+    client = paramiko.SSHClient()
+    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    client.connect(
+        hostname=self._hostname,
+        port=22,
+        username=self._username,
+        pkey=self.rsa_key,
+        allow_agent=False,
+        look_for_keys=False)
+    return client
+
+  def _rsa(self):
+    if "key_password" in self._kwargs:
+      password = self._kwargs["key_password"]
+    else:
+      password = None
+    rsa = paramiko.RSAKey.from_private_key_file(self._key_path, password)
+    return rsa
+
+  def run(self, cmd: str) -> (str, str):
+    """Runs a command via ssh.
+
+    Args:
+      cmd: The shell command to run.
+
+    Returns:
+      The contents of stdout and stderr.
+    """
+    with self._client() as client:
+      _, stdout, stderr = client.exec_command(command=cmd)
+      stdout.channel.recv_exit_status()
+      stdout = stdout.read().decode("utf-8")
+      stderr = stderr.read().decode("utf-8")
+    return stdout, stderr
+
+  def send_workload(self, name: str) -> str:
+    """Sends a workload to the remote machine.
+
+    Args:
+      name: The workload name.
+
+    Returns:
+      The remote path.
+    """
+    with self._client() as client:
+      for dirpath, _, filenames in os.walk(
+          harness.LOCAL_WORKLOADS_PATH.format(name)):
+        for filename in filenames:
+          send_one_file(client, os.path.join(dirpath, filename),
+                        harness.REMOTE_WORKLOADS_PATH.format(name))
+    return harness.REMOTE_WORKLOADS_PATH.format(name)
diff --git a/benchmarks/harness/tunnel_dispatcher.py b/benchmarks/harness/tunnel_dispatcher.py
new file mode 100644
index 000000000..8dfe2862a
--- /dev/null
+++ b/benchmarks/harness/tunnel_dispatcher.py
@@ -0,0 +1,82 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tunnel handles setting up connections to remote machines."""
+
+import os
+import tempfile
+import time
+
+import docker
+import pexpect
+
+SSH_TUNNEL_COMMAND = """ssh
+ -o GlobalKnownHostsFile=/dev/null
+ -o UserKnownHostsFile=/dev/null
+ -o StrictHostKeyChecking=no
+ -nNT -L {filename}:/var/run/docker.sock
+ -i {key_path}
+ {username}@{hostname}"""
+
+
+class Tunnel:
+  """The tunnel object represents the tunnel via ssh.
+
+  This connects a local unix domain socket with a remote socket.
+  """
+
+  def __init__(self, name, hostname: str, username: str, key_path: str,
+               **kwargs):
+    self._filename = tempfile.NamedTemporaryFile(prefix=name).name
+    self._hostname = hostname
+    self._username = username
+    self._key_path = key_path
+    self._kwargs = kwargs
+    self._process = None
+
+  def connect(self):
+    """Connects the SSH tunnel."""
+    cmd = SSH_TUNNEL_COMMAND.format(
+        filename=self._filename,
+        key_path=self._key_path,
+        username=self._username,
+        hostname=self._hostname)
+    self._process = pexpect.spawn(cmd, timeout=10)
+
+    # If given a password, assume we'll be asked for it.
+    if "key_password" in self._kwargs:
+      self._process.expect(["Enter passphrase for key .*: "])
+      self._process.sendline(self._kwargs["key_password"])
+
+    while True:
+      # Wait for the tunnel to appear.
+      if self._process.exitstatus is not None:
+        raise ConnectionError("Error in setting up ssh tunnel")
+      if os.path.exists(self._filename):
+        return
+      time.sleep(0.1)
+
+  def path(self):
+    """Return the socket file."""
+    return self._filename
+
+  def get_docker_client(self):
+    """Returns a docker client for this Tunne0l."""
+    return docker.DockerClient(base_url="unix:/" + self._filename)
+
+  def __del__(self):
+    """Closes the ssh connection process and deletes the socket file."""
+    if self._process:
+      self._process.close()
+    if os.path.exists(self._filename):
+      os.remove(self._filename)
diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt
new file mode 100644
index 000000000..577eb1a2e
--- /dev/null
+++ b/benchmarks/requirements.txt
@@ -0,0 +1,32 @@
+asn1crypto==1.2.0
+atomicwrites==1.3.0
+attrs==19.3.0
+bcrypt==3.1.7
+certifi==2019.9.11
+cffi==1.13.2
+chardet==3.0.4
+Click==7.0
+cryptography==2.8
+docker==3.7.0
+docker-pycreds==0.4.0
+idna==2.8
+importlib-metadata==0.23
+more-itertools==7.2.0
+packaging==19.2
+paramiko==2.6.0
+pathlib2==2.3.5
+pexpect==4.7.0
+pluggy==0.9.0
+ptyprocess==0.6.0
+py==1.8.0
+pycparser==2.19
+PyNaCl==1.3.0
+pyparsing==2.4.5
+pytest==4.3.0
+PyYAML==5.1.2
+requests==2.22.0
+six==1.13.0
+urllib3==1.25.7
+wcwidth==0.1.7
+websocket-client==0.56.0
+zipp==0.6.0
diff --git a/benchmarks/run.py b/benchmarks/run.py
new file mode 100644
index 000000000..a22eb8641
--- /dev/null
+++ b/benchmarks/run.py
@@ -0,0 +1,19 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Benchmark runner."""
+
+from benchmarks import runner
+
+if __name__ == "__main__":
+  runner.runner()
diff --git a/benchmarks/runner/BUILD b/benchmarks/runner/BUILD
new file mode 100644
index 000000000..a3941da42
--- /dev/null
+++ b/benchmarks/runner/BUILD
@@ -0,0 +1,50 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+
+package(licenses = ["notice"])
+
+py_library(
+    name = "runner",
+    srcs = ["__init__.py"],
+    data = [
+        "//benchmarks/workloads:files",
+    ],
+    visibility = ["//benchmarks:__pkg__"],
+    deps = [
+        "//benchmarks/harness:benchmark_driver",
+        "//benchmarks/harness/machine_producers:mock_producer",
+        "//benchmarks/harness/machine_producers:yaml_producer",
+        "//benchmarks/suites",
+        "//benchmarks/suites:absl",
+        "//benchmarks/suites:density",
+        "//benchmarks/suites:fio",
+        "//benchmarks/suites:helpers",
+        "//benchmarks/suites:http",
+        "//benchmarks/suites:media",
+        "//benchmarks/suites:ml",
+        "//benchmarks/suites:network",
+        "//benchmarks/suites:redis",
+        "//benchmarks/suites:startup",
+        "//benchmarks/suites:sysbench",
+        "//benchmarks/suites:syscall",
+        requirement("click", True),
+    ],
+)
+
+py_test(
+    name = "runner_test",
+    srcs = ["runner_test.py"],
+    python_version = "PY3",
+    tags = ["local"],
+    deps = [
+        ":runner",
+        requirement("click", True),
+        requirement("attrs", False),
+        requirement("atomicwrites", False),
+        requirement("more-itertools", False),
+        requirement("pathlib2", False),
+        requirement("pluggy", False),
+        requirement("py", False),
+        requirement("pytest", True),
+        requirement("six", False),
+    ],
+)
diff --git a/benchmarks/runner/__init__.py b/benchmarks/runner/__init__.py
new file mode 100644
index 000000000..9bf9cfd65
--- /dev/null
+++ b/benchmarks/runner/__init__.py
@@ -0,0 +1,301 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""High-level benchmark utility."""
+
+import copy
+import csv
+import logging
+import pkgutil
+import pydoc
+import re
+import sys
+import types
+from typing import List
+from typing import Tuple
+
+import click
+
+from benchmarks import suites
+from benchmarks.harness import benchmark_driver
+from benchmarks.harness.machine_producers import mock_producer
+from benchmarks.harness.machine_producers import yaml_producer
+
+
+@click.group()
+@click.option(
+    "--verbose/--no-verbose", default=False, help="Enable verbose logging.")
+@click.option("--debug/--no-debug", default=False, help="Enable debug logging.")
+def runner(verbose: bool = False, debug: bool = False):
+  """Run distributed benchmarks.
+
+  See the run and list commands for details.
+
+  Args:
+    verbose: Enable verbose logging.
+    debug: Enable debug logging (supercedes verbose).
+  """
+  if debug:
+    logging.basicConfig(level=logging.DEBUG)
+  elif verbose:
+    logging.basicConfig(level=logging.INFO)
+
+
+def find_benchmarks(
+    regex: str) -> List[Tuple[str, types.ModuleType, types.FunctionType]]:
+  """Finds all available benchmarks.
+
+  Args:
+    regex: A regular expression to match.
+
+  Returns:
+    A (short_name, module, function) tuple for each match.
+  """
+  pkgs = pkgutil.walk_packages(suites.__path__, suites.__name__ + ".")
+  found = []
+  for _, name, _ in pkgs:
+    mod = pydoc.locate(name)
+    funcs = [
+        getattr(mod, x)
+        for x in dir(mod)
+        if suites.is_benchmark(getattr(mod, x))
+    ]
+    for func in funcs:
+      # Use the short_name with the benchmarks. prefix stripped.
+      prefix_len = len(suites.__name__ + ".")
+      short_name = mod.__name__[prefix_len:] + "." + func.__name__
+      # Add to the list if a pattern is provided.
+      if re.compile(regex).match(short_name):
+        found.append((short_name, mod, func))
+  return found
+
+
+@runner.command("list")
+@click.argument("method", nargs=-1)
+def list_all(method):
+  """Lists available benchmarks."""
+  if not method:
+    method = ".*"
+  else:
+    method = "(" + ",".join(method) + ")"
+  for (short_name, _, func) in find_benchmarks(method):
+    print("Benchmark %s:" % short_name)
+    metrics = suites.benchmark_metrics(func)
+    if func.__doc__:
+      print("    " + func.__doc__.lstrip().rstrip())
+    if metrics:
+      print("\n    Metrics:")
+    for metric in metrics:
+      print("\t{name}: {doc}".format(name=metric[0], doc=metric[1]))
+    print("\n")
+
+
+# pylint: disable=too-many-arguments
+# pylint: disable=too-many-branches
+# pylint: disable=too-many-locals
+@runner.command(
+    context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
+@click.pass_context
+@click.argument("method")
+@click.option("--mock/--no-mock", default=False, help="Mock the machines.")
+@click.option("--env", default=None, help="Specify a yaml file with machines.")
+@click.option(
+    "--runtime", default=["runc"], help="The runtime to use.", multiple=True)
+@click.option("--metric", help="The metric to extract.", multiple=True)
+@click.option(
+    "--runs", default=1, help="The number of times to run each benchmark.")
+@click.option(
+    "--stat",
+    default="median",
+    help="How to aggregate the data from all runs."
+    "\nmedian - returns the median of all runs (default)"
+    "\nall - returns all results comma separated"
+    "\nmeanstd - returns result as mean,std")
+# pylint: disable=too-many-statements
+def run(ctx, method: str, runs: int, env: str, mock: bool, runtime: List[str],
+        metric: List[str], stat: str, **kwargs):
+  """Runs arbitrary benchmarks.
+
+  All unknown command line flags are passed through to the underlying benchmark
+  method. Flags may be specified multiple times, in which case it is considered
+  a "dimension" for the test, and a comma-separated table will be emitted
+  instead of a single result.
+
+  See the output of list to see available metrics for any given benchmark
+  method. The method parameter is a regular expression that will match against
+  available benchmarks. If multiple benchmarks match, then that is considered a
+  distinct "dimension" for the test.
+
+  All benchmarks are run in parallel where possible, but have exclusive
+  ownership over the individual machines.
+
+  Exactly one of the --mock and --env flag must be specified.
+
+  Every benchmark method will be run the times indicated by --runs.
+
+  Args:
+    ctx: Click context.
+    method: A regular expression for methods to be run.
+    runs: Number of runs.
+    env: Environment to use.
+    mock: If true, use mocked environment (supercedes env).
+    runtime: A list of runtimes to test.
+    metric: A list of metrics to extract.
+    stat: The class of statistics to extract.
+    **kwargs: Dimensions to test.
+  """
+  # First, calculate additional arguments.
+  #
+  # This essentially calculates any arguments that appear multiple times, and
+  # moves those to the "dimensions" dictionary, which maps to lists. These
+  # dimensions are then iterated over to generate the relevant csv output.
+  dimensions = {}
+
+  if stat not in ["median", "all", "meanstd"]:
+    raise ValueError("Illegal value for --result, see help.")
+
+  def squish(key: str, value: str):
+    """Collapse an argument into kwargs or dimensions."""
+    if key in dimensions:
+      # Extend an existing dimension.
+      dimensions[key].append(value)
+    elif key in kwargs:
+      # Create a new dimension.
+      dimensions[key] = [kwargs[key], value]
+      del kwargs[key]
+    else:
+      # A single value.
+      kwargs[key] = value
+
+  for item in ctx.args:
+    if "=" in method:
+      # This must be the method. The method is simply set to the first
+      # non-matching argument, which we're also parsing here.
+      item, method = method, item
+    if "=" not in item:
+      logging.error("illegal argument: %s", item)
+      sys.exit(1)
+    (key, value) = item.lstrip("-").split("=", 1)
+    squish(key, value)
+
+  # Convert runtime and metric to dimensions.
+  #
+  # They exist only in the arguments above for documentation purposes.
+  # Essentially here we are treating them like anything else. Note however,
+  # that an empty set here will result in a dimension. This is important for
+  # metrics, where an empty set actually means all metrics.
+  def fold(key: str, value, allow_flatten=False):
+    """Collapse a list value into kwargs or dimensions."""
+    if len(value) == 1 and allow_flatten:
+      kwargs[key] = value[0]
+    else:
+      dimensions[key] = value
+
+  fold("runtime", runtime, allow_flatten=True)
+  fold("metric", metric)
+
+  # Lookup the methods.
+  #
+  # We match the method parameter to a regular expression. This allows you to
+  # do things like `run --mock .*` for a broad test. Note that we track the
+  # short_names in the dimensions here, and look up again in the recursion.
+  methods = {
+      short_name: func for (short_name, _, func) in find_benchmarks(method)
+  }
+  if not methods:
+    # Must match at least one method.
+    logging.error("no matching benchmarks for %s: try list.", method)
+    sys.exit(1)
+  fold("method", list(methods.keys()), allow_flatten=True)
+
+  # Construct the environment.
+  if mock and env:
+    # You can't provide both.
+    logging.error("both --mock and --env are set: which one is it?")
+    sys.exit(1)
+  elif mock:
+    producer = mock_producer.MockMachineProducer()
+  elif env:
+    producer = yaml_producer.YamlMachineProducer(env)
+  else:
+    # You must provide one of mock or env.
+    logging.error("no enviroment provided: use --mock or --env.")
+    sys.exit(1)
+
+  # Spin up the drivers.
+  #
+  # We ensure that metric is the last entry, because we have special behavior.
+  # They actually run the test once and the benchmark is a generator that
+  # produces all viable metrics.
+  dimension_keys = list(dimensions.keys())
+  if "metric" in dimension_keys:
+    dimension_keys.remove("metric")
+    dimension_keys.append("metric")
+  drivers = []
+
+  def _start(keywords, finished, left):
+    """Runs a test across dimensions recursively."""
+    # Resolve the method fully, it starts as a string.
+    if "method" in keywords and isinstance(keywords["method"], str):
+      keywords["method"] = methods[keywords["method"]]
+    # Is this a non-recursive case?
+    if not left:
+      driver = benchmark_driver.BenchmarkDriver(producer, runs=runs, **keywords)
+      driver.start()
+      drivers.append((finished, driver))
+    else:
+      # Recurse on the next dimension.
+      current, left = left[0], left[1:]
+      keywords = copy.deepcopy(keywords)
+      if current == "metric":
+        # We use a generator, popped below. Note that metric is
+        # guaranteed to be the last element here, and we will provide
+        # the value for 'done' below when generating the csv.
+        keywords[current] = dimensions[current]
+        _start(keywords, finished, left)
+      else:
+        # Generate manually.
+        for value in dimensions[current]:
+          keywords[current] = value
+          _start(keywords, finished + [value], left)
+
+  # Start all the drivers, recursively.
+  _start(kwargs, [], dimension_keys)
+
+  # Finish all tests, write results.
+  output = csv.writer(sys.stdout)
+  output.writerow(dimension_keys + ["result"])
+  for (done, driver) in drivers:
+    driver.join()
+    for (metric_name, result) in getattr(driver, stat)():
+      output.writerow([  # Collapse the method name.
+          hasattr(x, "__name__") and x.__name__ or x for x in done
+      ] + [metric_name] + result)
+
+
+@runner.command()
+@click.argument("env")
+@click.option(
+    "--cmd", default="uname -a", help="command to run on all found machines")
+@click.option(
+    "--workload", default="true", help="workload to run all found machines")
+def validate(env, cmd, workload):
+  """Validates an environment described by yaml file."""
+  producer = yaml_producer.YamlMachineProducer(env)
+  for machine in producer.machines:
+    print("Machine %s:" % machine)
+    stdout, _ = machine.run(cmd)
+    print("  Output of '%s': %s" % (cmd, stdout.lstrip().rstrip()))
+    image = machine.pull(workload)
+    stdout = machine.container(image).run()
+    print("  Container %s: %s" % (workload, stdout.lstrip().rstrip()))
diff --git a/benchmarks/runner/runner_test.py b/benchmarks/runner/runner_test.py
new file mode 100644
index 000000000..5719c2838
--- /dev/null
+++ b/benchmarks/runner/runner_test.py
@@ -0,0 +1,59 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Top-level tests."""
+
+import os
+import subprocess
+import sys
+
+from click import testing
+import pytest
+
+from benchmarks import runner
+
+
+def _get_locale():
+  output = subprocess.check_output(["locale", "-a"])
+  locales = output.split()
+  if b"en_US.utf8" in locales:
+    return "en_US.UTF-8"
+  else:
+    return "C.UTF-8"
+
+
+def _set_locale():
+  locale = _get_locale()
+  if os.getenv("LANG") != locale:
+    os.environ["LANG"] = locale
+    os.environ["LC_ALL"] = locale
+    os.execv("/proc/self/exe", ["python"] + sys.argv)
+
+
+def test_list():
+  cli_runner = testing.CliRunner()
+  result = cli_runner.invoke(runner.runner, ["list"])
+  print(result.output)
+  assert result.exit_code == 0
+
+
+def test_run():
+  cli_runner = testing.CliRunner()
+  result = cli_runner.invoke(runner.runner, ["run", "--mock", "."])
+  print(result.output)
+  assert result.exit_code == 0
+
+
+if __name__ == "__main__":
+  _set_locale()
+  sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/suites/BUILD b/benchmarks/suites/BUILD
new file mode 100644
index 000000000..04fc23261
--- /dev/null
+++ b/benchmarks/suites/BUILD
@@ -0,0 +1,130 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "suites",
+    srcs = ["__init__.py"],
+)
+
+py_library(
+    name = "absl",
+    srcs = ["absl.py"],
+    deps = [
+        "//benchmarks/harness:machine",
+        "//benchmarks/suites",
+        "//benchmarks/workloads/absl",
+    ],
+)
+
+py_library(
+    name = "density",
+    srcs = ["density.py"],
+    deps = [
+        "//benchmarks/harness:container",
+        "//benchmarks/harness:machine",
+        "//benchmarks/suites",
+        "//benchmarks/suites:helpers",
+    ],
+)
+
+py_library(
+    name = "fio",
+    srcs = ["fio.py"],
+    deps = [
+        "//benchmarks/harness:machine",
+        "//benchmarks/suites",
+        "//benchmarks/suites:helpers",
+        "//benchmarks/workloads/fio",
+    ],
+)
+
+py_library(
+    name = "helpers",
+    srcs = ["helpers.py"],
+    deps = ["//benchmarks/harness:machine"],
+)
+
+py_library(
+    name = "http",
+    srcs = ["http.py"],
+    deps = [
+        "//benchmarks/harness:machine",
+        "//benchmarks/suites",
+        "//benchmarks/workloads/ab",
+    ],
+)
+
+py_library(
+    name = "media",
+    srcs = ["media.py"],
+    deps = [
+        "//benchmarks/harness:machine",
+        "//benchmarks/suites",
+        "//benchmarks/suites:helpers",
+        "//benchmarks/workloads/ffmpeg",
+    ],
+)
+
+py_library(
+    name = "ml",
+    srcs = ["ml.py"],
+    deps = [
+        "//benchmarks/harness:machine",
+        "//benchmarks/suites",
+        "//benchmarks/suites:startup",
+        "//benchmarks/workloads/tensorflow",
+    ],
+)
+
+py_library(
+    name = "network",
+    srcs = ["network.py"],
+    deps = [
+        "//benchmarks/harness:machine",
+        "//benchmarks/suites",
+        "//benchmarks/suites:helpers",
+        "//benchmarks/workloads/iperf",
+    ],
+)
+
+py_library(
+    name = "redis",
+    srcs = ["redis.py"],
+    deps = [
+        "//benchmarks/harness:machine",
+        "//benchmarks/suites",
+        "//benchmarks/workloads/redisbenchmark",
+    ],
+)
+
+py_library(
+    name = "startup",
+    srcs = ["startup.py"],
+    deps = [
+        "//benchmarks/harness:machine",
+        "//benchmarks/suites",
+        "//benchmarks/suites:helpers",
+    ],
+)
+
+py_library(
+    name = "sysbench",
+    srcs = ["sysbench.py"],
+    deps = [
+        "//benchmarks/harness:machine",
+        "//benchmarks/suites",
+        "//benchmarks/workloads/sysbench",
+    ],
+)
+
+py_library(
+    name = "syscall",
+    srcs = ["syscall.py"],
+    deps = [
+        "//benchmarks/harness:machine",
+        "//benchmarks/suites",
+        "//benchmarks/workloads/syscall",
+    ],
+)
diff --git a/benchmarks/suites/__init__.py b/benchmarks/suites/__init__.py
new file mode 100644
index 000000000..360736cc3
--- /dev/null
+++ b/benchmarks/suites/__init__.py
@@ -0,0 +1,119 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Core benchmark annotations."""
+
+import functools
+import inspect
+import types
+from typing import List
+from typing import Tuple
+
+BENCHMARK_METRICS = '__benchmark_metrics__'
+BENCHMARK_MACHINES = '__benchmark_machines__'
+
+
+def is_benchmark(func: types.FunctionType) -> bool:
+  """Returns true if the given function is a benchmark."""
+  return isinstance(func, types.FunctionType) and \
+      hasattr(func, BENCHMARK_METRICS) and \
+      hasattr(func, BENCHMARK_MACHINES)
+
+
+def benchmark_metrics(func: types.FunctionType) -> List[Tuple[str, str]]:
+  """Returns the list of available metrics."""
+  return [(metric.__name__, metric.__doc__)
+          for metric in getattr(func, BENCHMARK_METRICS)]
+
+
+def benchmark_machines(func: types.FunctionType) -> int:
+  """Returns the number of machines required."""
+  return getattr(func, BENCHMARK_MACHINES)
+
+
+# pylint: disable=unused-argument
+def default(value, **kwargs):
+  """Returns the passed value."""
+  return value
+
+
+def benchmark(metrics: List[types.FunctionType] = None,
+              machines: int = 1) -> types.FunctionType:
+  """Define a benchmark function with metrics.
+
+  Args:
+    metrics: A list of metric functions.
+    machines: The number of machines required.
+
+  Returns:
+    A function that accepts the given number of machines, and iteratively
+    returns a set of (metric_name, metric_value) pairs when called repeatedly.
+  """
+  if not metrics:
+    # The default passes through.
+    metrics = [default]
+
+  def decorator(func: types.FunctionType) -> types.FunctionType:
+    """Decorator function."""
+    # Every benchmark should accept at least two parameters:
+    #   runtime: The runtime to use for the benchmark (str, required).
+    #   metrics: The metrics to use, if not the default (str, optional).
+    @functools.wraps(func)
+    def wrapper(*args, runtime: str, metric: list = None, **kwargs):
+      """Wrapper function."""
+      # First -- ensure that we marshall all types appropriately. In
+      # general, we will call this with only strings. These strings will
+      # need to be converted to their underlying types/classes.
+      sig = inspect.signature(func)
+      for param in sig.parameters.values():
+        if param.annotation != inspect.Parameter.empty and \
+           param.name in kwargs and not isinstance(kwargs[param.name], param.annotation):
+          try:
+            # Marshall to the appropriate type.
+            kwargs[param.name] = param.annotation(kwargs[param.name])
+          except Exception as exc:
+            raise ValueError(
+                'illegal type for %s(%s=%s): %s' %
+                (func.__name__, param.name, kwargs[param.name], exc))
+        elif param.default != inspect.Parameter.empty and \
+             param.name not in kwargs:
+          # Ensure that we have the value set, because it will
+          # be passed to the metric function for evaluation.
+          kwargs[param.name] = param.default
+
+      # Next, figure out how to apply a metric. We do this prior to
+      # running the underlying function to prevent having to wait a few
+      # minutes for a result just to see some error.
+      if not metric:
+        # Return all metrics in the iterator.
+        result = func(*args, runtime=runtime, **kwargs)
+        for metric_func in metrics:
+          yield (metric_func.__name__, metric_func(result, **kwargs))
+      else:
+        result = None
+        for single_metric in metric:
+          for metric_func in metrics:
+            # Is this a function that matches the name?
+            # Apply this function to the result.
+            if metric_func.__name__ == single_metric:
+              if not result:
+                # Lazy evaluation: only if metric matches.
+                result = func(*args, runtime=runtime, **kwargs)
+              yield single_metric, metric_func(result, **kwargs)
+
+    # Set metadata on the benchmark (used above).
+    setattr(wrapper, BENCHMARK_METRICS, metrics)
+    setattr(wrapper, BENCHMARK_MACHINES, machines)
+    return wrapper
+
+  return decorator
diff --git a/benchmarks/suites/absl.py b/benchmarks/suites/absl.py
new file mode 100644
index 000000000..5d9b57a09
--- /dev/null
+++ b/benchmarks/suites/absl.py
@@ -0,0 +1,37 @@
+# python3
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""absl build benchmark."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.workloads import absl
+
+
+@suites.benchmark(metrics=[absl.elapsed_time], machines=1)
+def build(target: machine.Machine, **kwargs) -> str:
+  """Runs the absl workload and report the absl build time.
+
+    Runs the 'bazel build //absl/...' in a clean bazel directory and
+    monitors time elapsed.
+
+  Args:
+    target: A machine object.
+    **kwargs: Additional container options.
+
+  Returns:
+    Container output.
+  """
+  image = target.pull("absl")
+  return target.container(image, **kwargs).run()
diff --git a/benchmarks/suites/density.py b/benchmarks/suites/density.py
new file mode 100644
index 000000000..89d29fb26
--- /dev/null
+++ b/benchmarks/suites/density.py
@@ -0,0 +1,121 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Density tests."""
+
+import re
+import types
+
+from benchmarks import suites
+from benchmarks.harness import container
+from benchmarks.harness import machine
+from benchmarks.suites import helpers
+
+
+# pylint: disable=unused-argument
+def memory_usage(value, **kwargs):
+  """Returns the passed value."""
+  return value
+
+
+def density(target: machine.Machine,
+            workload: str,
+            count: int = 50,
+            wait: float = 0,
+            load_func: types.FunctionType = None,
+            **kwargs):
+  """Calculate the average memory usage per container.
+
+  Args:
+    target: A machine object.
+    workload: The workload to run.
+    count: The number of containers to start.
+    wait: The time to wait after starting.
+    load_func: Callback that is called after count images have been started on
+      the given machine.
+    **kwargs: Additional container options.
+
+  Returns:
+    The average usage in Kb per container.
+  """
+  count = int(count)
+
+  # Drop all caches.
+  helpers.drop_caches(target)
+  before = target.read("/proc/meminfo")
+
+  # Load the workload.
+  image = target.pull(workload)
+
+  with target.container(
+      image=image, count=count, **kwargs).detach() as containers:
+    # Call the optional load function callback if given.
+    if load_func:
+      load_func(target, containers)
+    # Wait 'wait' time before taking a measurement.
+    target.sleep(wait)
+
+    # Drop caches again.
+    helpers.drop_caches(target)
+    after = target.read("/proc/meminfo")
+
+  # Calculate the memory used.
+  available_re = re.compile(r"MemAvailable:\s*(\d+)\skB\n")
+  before_available = available_re.findall(before)
+  after_available = available_re.findall(after)
+  return 1024 * float(int(before_available[0]) -
+                      int(after_available[0])) / float(count)
+
+
+def load_redis(target: machine.Machine, containers: container.Container):
+  """Use redis-benchmark "LPUSH" to load each container with 1G of data.
+
+  Args:
+    target: A machine object.
+    containers: A set of containers.
+  """
+  target.pull("redisbenchmark")
+  for name in containers.get_names():
+    flags = "-d 10000 -t LPUSH"
+    target.container(
+        "redisbenchmark", links={
+            name: name
+        }).run(
+            host=name, flags=flags)
+
+
+@suites.benchmark(metrics=[memory_usage], machines=1)
+def empty(target: machine.Machine, **kwargs) -> float:
+  """Run trivial containers in a density test."""
+  return density(target, workload="sleep", wait=1.0, **kwargs)
+
+
+@suites.benchmark(metrics=[memory_usage], machines=1)
+def node(target: machine.Machine, **kwargs) -> float:
+  """Run node containers in a density test."""
+  return density(target, workload="node", wait=3.0, **kwargs)
+
+
+@suites.benchmark(metrics=[memory_usage], machines=1)
+def ruby(target: machine.Machine, **kwargs) -> float:
+  """Run ruby containers in a density test."""
+  return density(target, workload="ruby", wait=3.0, **kwargs)
+
+
+@suites.benchmark(metrics=[memory_usage], machines=1)
+def redis(target: machine.Machine, **kwargs) -> float:
+  """Run redis containers in a density test."""
+  if "count" not in kwargs:
+    kwargs["count"] = 5
+  return density(
+      target, workload="redis", wait=3.0, load_func=load_redis, **kwargs)
diff --git a/benchmarks/suites/fio.py b/benchmarks/suites/fio.py
new file mode 100644
index 000000000..2171790c5
--- /dev/null
+++ b/benchmarks/suites/fio.py
@@ -0,0 +1,165 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""File I/O tests."""
+
+import os
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.suites import helpers
+from benchmarks.workloads import fio
+
+
+# pylint: disable=too-many-arguments
+# pylint: disable=too-many-locals
+def run_fio(target: machine.Machine,
+            test: str,
+            ioengine: str = "sync",
+            size: int = 1024 * 1024 * 1024,
+            iodepth: int = 4,
+            blocksize: int = 1024 * 1024,
+            time: int = -1,
+            mount_dir: str = "",
+            filename: str = "file.dat",
+            tmpfs: bool = False,
+            ramp_time: int = 0,
+            **kwargs) -> str:
+  """FIO benchmarks.
+
+  For more on fio see:
+    https://media.readthedocs.org/pdf/fio/latest/fio.pdf
+
+  Args:
+    target: A machine object.
+    test: The test to run (read, write, randread, randwrite, etc.)
+    ioengine: The engine for I/O.
+    size: The size of the generated file in bytes (if an integer) or 5g, 16k,
+      etc.
+    iodepth: The I/O for certain engines.
+    blocksize: The blocksize for reads and writes in bytes (if an integer) or
+      4k, etc.
+    time: If test is time based, how long to run in seconds.
+    mount_dir: The absolute path on the host to mount a bind mount.
+    filename: The name of the file to creat inside container. For a path of
+      /dir/dir/file, the script setup a volume like 'docker run -v
+        mount_dir:/dir/dir fio' and fio will create (and delete) the file
+          /dir/dir/file. If tmpfs is set, this /dir/dir will be a tmpfs.
+    tmpfs: If true, mount on tmpfs.
+    ramp_time: The time to run before recording statistics
+    **kwargs: Additional container options.
+
+  Returns:
+    The output of fio as a string.
+  """
+  # Pull the image before dropping caches.
+  image = target.pull("fio")
+
+  if not mount_dir:
+    stdout, _ = target.run("pwd")
+    mount_dir = stdout.rstrip()
+
+  # Setup the volumes.
+  volumes = {mount_dir: {"bind": "/disk", "mode": "rw"}} if not tmpfs else None
+  tmpfs = {"/disk": ""} if tmpfs else None
+
+  # Construct a file in the volume.
+  filepath = os.path.join("/disk", filename)
+
+  # If we are running a read test, us fio to write a file and then flush file
+  # data from memory.
+  if "read" in test:
+    target.container(
+        image, volumes=volumes, tmpfs=tmpfs, **kwargs).run(
+            test="write",
+            ioengine="sync",
+            size=size,
+            iodepth=iodepth,
+            blocksize=blocksize,
+            path=filepath)
+    helpers.drop_caches(target)
+
+  # Run the test.
+  time_str = "--time_base --runtime={time}".format(
+      time=time) if int(time) > 0 else ""
+  res = target.container(
+      image, volumes=volumes, tmpfs=tmpfs, **kwargs).run(
+          test=test,
+          ioengine=ioengine,
+          size=size,
+          iodepth=iodepth,
+          blocksize=blocksize,
+          time=time_str,
+          path=filepath,
+          ramp_time=ramp_time)
+
+  target.run(
+      "rm {path}".format(path=os.path.join(mount_dir.rstrip(), filename)))
+
+  return res
+
+
+@suites.benchmark(metrics=[fio.read_bandwidth, fio.read_io_ops], machines=1)
+def read(*args, **kwargs):
+  """Read test.
+
+  Args:
+    *args: None.
+    **kwargs: Additional container options.
+
+  Returns:
+    The output of fio.
+  """
+  return run_fio(*args, test="read", **kwargs)
+
+
+@suites.benchmark(metrics=[fio.read_bandwidth, fio.read_io_ops], machines=1)
+def randread(*args, **kwargs):
+  """Random read test.
+
+  Args:
+    *args: None.
+    **kwargs: Additional container options.
+
+  Returns:
+    The output of fio.
+  """
+  return run_fio(*args, test="randread", **kwargs)
+
+
+@suites.benchmark(metrics=[fio.write_bandwidth, fio.write_io_ops], machines=1)
+def write(*args, **kwargs):
+  """Write test.
+
+  Args:
+    *args: None.
+    **kwargs: Additional container options.
+
+  Returns:
+    The output of fio.
+  """
+  return run_fio(*args, test="write", **kwargs)
+
+
+@suites.benchmark(metrics=[fio.write_bandwidth, fio.write_io_ops], machines=1)
+def randwrite(*args, **kwargs):
+  """Random write test.
+
+  Args:
+    *args: None.
+    **kwargs: Additional container options.
+
+  Returns:
+    The output of fio.
+  """
+  return run_fio(*args, test="randwrite", **kwargs)
diff --git a/benchmarks/suites/helpers.py b/benchmarks/suites/helpers.py
new file mode 100644
index 000000000..b3c7360ab
--- /dev/null
+++ b/benchmarks/suites/helpers.py
@@ -0,0 +1,57 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Benchmark helpers."""
+
+import datetime
+from benchmarks.harness import machine
+
+
+class Timer:
+  """Helper to time runtime of some call.
+
+  Usage:
+
+    with Timer as t:
+        # do something.
+        t.get_time_in_seconds()
+  """
+
+  def __init__(self):
+    self._start = datetime.datetime.now()
+
+  def __enter__(self):
+    self.start()
+    return self
+
+  def start(self):
+    """Starts the timer."""
+    self._start = datetime.datetime.now()
+
+  def elapsed(self) -> float:
+    """Returns the elapsed time in seconds."""
+    return (datetime.datetime.now() - self._start).total_seconds()
+
+  def __exit__(self, exception_type, exception_value, exception_traceback):
+    pass
+
+
+def drop_caches(target: machine.Machine):
+  """Drops caches on the machine.
+
+  Args:
+    target: A machine object.
+  """
+  target.run("sudo sync")
+  target.run("sudo sysctl vm.drop_caches=3")
+  target.run("sudo sysctl vm.drop_caches=3")
diff --git a/benchmarks/suites/http.py b/benchmarks/suites/http.py
new file mode 100644
index 000000000..ea9024e43
--- /dev/null
+++ b/benchmarks/suites/http.py
@@ -0,0 +1,138 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""HTTP benchmarks."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.workloads import ab
+
+
+# pylint: disable=too-many-arguments
+def http(server: machine.Machine,
+         client: machine.Machine,
+         workload: str,
+         requests: int = 5000,
+         connections: int = 10,
+         port: int = 80,
+         path: str = "notfound",
+         **kwargs) -> str:
+  """Run apachebench (ab) against an http server.
+
+  Args:
+    server: A machine object.
+    client: A machine object.
+    workload: The http-serving workload.
+    requests: Number of requests to send the server. Default is 5000.
+    connections: Number of concurent connections to use. Default is 10.
+    port: The port to access in benchmarking.
+    path: File to download, generally workload-specific.
+    **kwargs: Additional container options.
+
+  Returns:
+    The full apachebench output.
+  """
+  # Pull the client & server.
+  apachebench = client.pull("ab")
+  netcat = client.pull("netcat")
+  image = server.pull(workload)
+
+  with server.container(image, port=port, **kwargs).detach() as container:
+    (host, port) = container.address()
+    # Wait for the server to come up.
+    client.container(netcat).run(host=host, port=port)
+    # Run the benchmark, no arguments.
+    return client.container(apachebench).run(
+        host=host,
+        port=port,
+        requests=requests,
+        connections=connections,
+        path=path)
+
+
+# pylint: disable=too-many-arguments
+# pylint: disable=too-many-locals
+def http_app(server: machine.Machine,
+             client: machine.Machine,
+             workload: str,
+             requests: int = 5000,
+             connections: int = 10,
+             port: int = 80,
+             path: str = "notfound",
+             **kwargs) -> str:
+  """Run apachebench (ab) against an http application.
+
+  Args:
+    server: A machine object.
+    client: A machine object.
+    workload: The http-serving workload.
+    requests: Number of requests to send the server. Default is 5000.
+    connections: Number of concurent connections to use. Default is 10.
+    port: The port to use for benchmarking.
+    path: File to download, generally workload-specific.
+    **kwargs: Additional container options.
+
+  Returns:
+    The full apachebench output.
+  """
+  # Pull the client & server.
+  apachebench = client.pull("ab")
+  netcat = client.pull("netcat")
+  server_netcat = server.pull("netcat")
+  redis = server.pull("redis")
+  image = server.pull(workload)
+  redis_port = 6379
+  redis_name = "redis_server"
+
+  with server.container(redis, name=redis_name).detach():
+    server.container(server_netcat, links={redis_name: redis_name})\
+        .run(host=redis_name, port=redis_port)
+    with server.container(image, port=port, links={redis_name: redis_name}, **kwargs)\
+            .detach(host=redis_name) as container:
+      (host, port) = container.address()
+      # Wait for the server to come up.
+      client.container(netcat).run(host=host, port=port)
+      # Run the benchmark, no arguments.
+      return client.container(apachebench).run(
+          host=host,
+          port=port,
+          requests=requests,
+          connections=connections,
+          path=path)
+
+
+@suites.benchmark(metrics=[ab.transfer_rate, ab.latency], machines=2)
+def httpd(*args, **kwargs) -> str:
+  """Apache2 benchmark."""
+  return http(*args, workload="httpd", port=80, **kwargs)
+
+
+@suites.benchmark(
+    metrics=[ab.transfer_rate, ab.latency, ab.requests_per_second], machines=2)
+def nginx(*args, **kwargs) -> str:
+  """Nginx benchmark."""
+  return http(*args, workload="nginx", port=80, **kwargs)
+
+
+@suites.benchmark(
+    metrics=[ab.transfer_rate, ab.latency, ab.requests_per_second], machines=2)
+def node(*args, **kwargs) -> str:
+  """Node benchmark."""
+  return http_app(*args, workload="node_template", path="", port=8080, **kwargs)
+
+
+@suites.benchmark(
+    metrics=[ab.transfer_rate, ab.latency, ab.requests_per_second], machines=2)
+def ruby(*args, **kwargs) -> str:
+  """Ruby benchmark."""
+  return http_app(*args, workload="ruby_template", path="", port=9292, **kwargs)
diff --git a/benchmarks/suites/media.py b/benchmarks/suites/media.py
new file mode 100644
index 000000000..9cbffdaa1
--- /dev/null
+++ b/benchmarks/suites/media.py
@@ -0,0 +1,42 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Media processing benchmarks."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.suites import helpers
+from benchmarks.workloads import ffmpeg
+
+
+@suites.benchmark(metrics=[ffmpeg.run_time], machines=1)
+def transcode(target: machine.Machine, **kwargs) -> float:
+  """Runs a video transcoding workload and times it.
+
+  Args:
+    target: A machine object.
+    **kwargs: Additional container options.
+
+  Returns:
+    Total workload runtime.
+  """
+  # Load before timing.
+  image = target.pull("ffmpeg")
+
+  # Drop caches.
+  helpers.drop_caches(target)
+
+  # Time startup + transcoding.
+  with helpers.Timer() as timer:
+    target.container(image, **kwargs).run()
+    return timer.elapsed()
diff --git a/benchmarks/suites/ml.py b/benchmarks/suites/ml.py
new file mode 100644
index 000000000..a394d1f69
--- /dev/null
+++ b/benchmarks/suites/ml.py
@@ -0,0 +1,33 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Machine Learning tests."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.suites import startup
+from benchmarks.workloads import tensorflow
+
+
+@suites.benchmark(metrics=[tensorflow.run_time], machines=1)
+def train(target: machine.Machine, **kwargs):
+  """Run the tensorflow benchmark and return the runtime in seconds of workload.
+
+  Args:
+    target: A machine object.
+    **kwargs: Additional container options.
+
+  Returns:
+    The total runtime.
+  """
+  return startup.startup(target, workload="tensorflow", count=1, **kwargs)
diff --git a/benchmarks/suites/network.py b/benchmarks/suites/network.py
new file mode 100644
index 000000000..f973cf3f1
--- /dev/null
+++ b/benchmarks/suites/network.py
@@ -0,0 +1,101 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Network microbenchmarks."""
+
+from typing import Dict
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.suites import helpers
+from benchmarks.workloads import iperf
+
+
+def run_iperf(client: machine.Machine,
+              server: machine.Machine,
+              client_kwargs: Dict[str, str] = None,
+              server_kwargs: Dict[str, str] = None) -> str:
+  """Measure iperf performance.
+
+  Args:
+    client: A machine object.
+    server: A machine object.
+    client_kwargs: Additional client container options.
+    server_kwargs: Additional server container options.
+
+  Returns:
+    The output of iperf.
+  """
+  if not client_kwargs:
+    client_kwargs = dict()
+  if not server_kwargs:
+    server_kwargs = dict()
+
+  # Pull images.
+  netcat = client.pull("netcat")
+  iperf_client_image = client.pull("iperf")
+  iperf_server_image = server.pull("iperf")
+
+  # Set this due to a bug in the kernel that resets connections.
+  client.run("sudo /sbin/sysctl -w net.netfilter.nf_conntrack_tcp_be_liberal=1")
+  server.run("sudo /sbin/sysctl -w net.netfilter.nf_conntrack_tcp_be_liberal=1")
+
+  with server.container(
+      iperf_server_image, port=5001, **server_kwargs).detach() as iperf_server:
+    (host, port) = iperf_server.address()
+    # Wait until the service is available.
+    client.container(netcat).run(host=host, port=port)
+    # Run a warm-up run.
+    client.container(
+        iperf_client_image, stderr=True, **client_kwargs).run(
+            host=host, port=port)
+    # Run the client with relevant arguments.
+    res = client.container(iperf_client_image, stderr=True, **client_kwargs)\
+        .run(host=host, port=port)
+    helpers.drop_caches(client)
+    return res
+
+
+@suites.benchmark(metrics=[iperf.bandwidth], machines=2)
+def upload(client: machine.Machine, server: machine.Machine, **kwargs) -> str:
+  """Measure upload performance.
+
+  Args:
+    client: A machine object.
+    server: A machine object.
+    **kwargs: Client container options.
+
+  Returns:
+    The output of iperf.
+  """
+  if kwargs["runtime"] == "runc":
+    kwargs["network_mode"] = "host"
+  return run_iperf(client, server, client_kwargs=kwargs)
+
+
+@suites.benchmark(metrics=[iperf.bandwidth], machines=2)
+def download(client: machine.Machine, server: machine.Machine, **kwargs) -> str:
+  """Measure download performance.
+
+  Args:
+    client: A machine object.
+    server: A machine object.
+    **kwargs: Server container options.
+
+  Returns:
+    The output of iperf.
+  """
+
+  client_kwargs = {"network_mode": "host"}
+  return run_iperf(
+      client, server, client_kwargs=client_kwargs, server_kwargs=kwargs)
diff --git a/benchmarks/suites/redis.py b/benchmarks/suites/redis.py
new file mode 100644
index 000000000..b84dd073d
--- /dev/null
+++ b/benchmarks/suites/redis.py
@@ -0,0 +1,46 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Redis benchmarks."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.workloads import redisbenchmark
+
+
+@suites.benchmark(metrics=list(redisbenchmark.METRICS.values()), machines=2)
+def redis(server: machine.Machine,
+          client: machine.Machine,
+          flags: str = "",
+          **kwargs) -> str:
+  """Run redis-benchmark on client pointing at server machine.
+
+  Args:
+    server: A machine object.
+    client: A machine object.
+    flags: Flags to pass redis-benchmark.
+    **kwargs: Additional container options.
+
+  Returns:
+    Output from redis-benchmark.
+  """
+  redis_server = server.pull("redis")
+  redis_client = client.pull("redisbenchmark")
+  netcat = client.pull("netcat")
+  with server.container(
+      redis_server, port=6379, **kwargs).detach() as container:
+    (host, port) = container.address()
+    # Wait for the container to be up.
+    client.container(netcat).run(host=host, port=port)
+    # Run all redis benchmarks.
+    return client.container(redis_client).run(host=host, port=port, flags=flags)
diff --git a/benchmarks/suites/startup.py b/benchmarks/suites/startup.py
new file mode 100644
index 000000000..a1b6c5753
--- /dev/null
+++ b/benchmarks/suites/startup.py
@@ -0,0 +1,110 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Start-up benchmarks."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.suites import helpers
+
+
+# pylint: disable=unused-argument
+def startup_time_ms(value, **kwargs):
+  """Returns average startup time per container in milliseconds.
+
+  Args:
+    value: The floating point time in seconds.
+    **kwargs: Ignored.
+
+  Returns:
+    The time given in milliseconds.
+  """
+  return value * 1000
+
+
+def startup(target: machine.Machine,
+            workload: str,
+            count: int = 5,
+            port: int = 0,
+            **kwargs):
+  """Time the startup of some workload.
+
+  Args:
+    target: A machine object.
+    workload: The workload to run.
+    count: Number of containers to start.
+    port: The port to check for liveness, if provided.
+    **kwargs: Additional container options.
+
+  Returns:
+    The mean start-up time in seconds.
+  """
+  # Load before timing.
+  image = target.pull(workload)
+  netcat = target.pull("netcat")
+  count = int(count)
+  port = int(port)
+
+  with helpers.Timer() as timer:
+    for _ in range(count):
+      if not port:
+        # Run the container synchronously.
+        target.container(image, **kwargs).run()
+      else:
+        # Run a detached container until httpd available.
+        with target.container(image, port=port, **kwargs).detach() as server:
+          (server_host, server_port) = server.address()
+          target.container(netcat).run(host=server_host, port=server_port)
+    return timer.elapsed() / float(count)
+
+
+@suites.benchmark(metrics=[startup_time_ms], machines=1)
+def empty(target: machine.Machine, **kwargs) -> float:
+  """Time the startup of a trivial container.
+
+  Args:
+    target: A machine object.
+    **kwargs: Additional startup options.
+
+  Returns:
+    The time to run the container.
+  """
+  return startup(target, workload="true", **kwargs)
+
+
+@suites.benchmark(metrics=[startup_time_ms], machines=1)
+def node(target: machine.Machine, **kwargs) -> float:
+  """Time the startup of the node container.
+
+  Args:
+    target: A machine object.
+    **kwargs: Additional statup options.
+
+  Returns:
+    The time to run the container.
+  """
+  return startup(target, workload="node", port=8080, **kwargs)
+
+
+@suites.benchmark(metrics=[startup_time_ms], machines=1)
+def ruby(target: machine.Machine, **kwargs) -> float:
+  """Time the startup of the ruby container.
+
+  Args:
+    target: A machine object.
+    **kwargs: Additional startup options.
+
+  Returns:
+    The time to run the container.
+  """
+  return startup(target, workload="ruby", port=3000, **kwargs)
diff --git a/benchmarks/suites/sysbench.py b/benchmarks/suites/sysbench.py
new file mode 100644
index 000000000..2a6e2126c
--- /dev/null
+++ b/benchmarks/suites/sysbench.py
@@ -0,0 +1,119 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Sysbench-based benchmarks."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.workloads import sysbench
+
+
+def run_sysbench(target: machine.Machine,
+                 test: str = "cpu",
+                 threads: int = 8,
+                 time: int = 5,
+                 options: str = "",
+                 **kwargs) -> str:
+  """Run sysbench container with arguments.
+
+  Args:
+    target: A machine object.
+    test: Relevant sysbench test to run (e.g. cpu, memory).
+    threads: The number of threads to use for tests.
+    time: The time to run tests.
+    options: Additional sysbench options.
+    **kwargs: Additional container options.
+
+  Returns:
+    The output of the command as a string.
+  """
+  image = target.pull("sysbench")
+  return target.container(image, **kwargs).run(
+      test=test, threads=threads, time=time, options=options)
+
+
+@suites.benchmark(metrics=[sysbench.cpu_events_per_second], machines=1)
+def cpu(target: machine.Machine, max_prime: int = 5000, **kwargs) -> str:
+  """Run sysbench CPU test.
+
+  Additional arguments can be provided for sysbench.
+
+  Args:
+    target: A machine object.
+    max_prime: The maximum prime number to search.
+    **kwargs:
+      - threads: The number of threads to use for tests.
+      - time: The time to run tests.
+      - options: Additional sysbench options. See sysbench tool:
+        https://github.com/akopytov/sysbench
+
+  Returns:
+    Sysbench output.
+  """
+  options = kwargs.pop("options", "")
+  options += " --cpu-max-prime={}".format(max_prime)
+  return run_sysbench(target, test="cpu", options=options, **kwargs)
+
+
+@suites.benchmark(metrics=[sysbench.memory_ops_per_second], machines=1)
+def memory(target: machine.Machine, **kwargs) -> str:
+  """Run sysbench memory test.
+
+  Additional arguments can be provided per sysbench.
+
+  Args:
+    target: A machine object.
+    **kwargs:
+        - threads: The number of threads to use for tests.
+        - time: The time to run tests.
+        - options: Additional sysbench options. See sysbench tool:
+          https://github.com/akopytov/sysbench
+
+  Returns:
+    Sysbench output.
+  """
+  return run_sysbench(target, test="memory", **kwargs)
+
+
+@suites.benchmark(
+    metrics=[
+        sysbench.mutex_time, sysbench.mutex_latency, sysbench.mutex_deviation
+    ],
+    machines=1)
+def mutex(target: machine.Machine,
+          locks: int = 4,
+          count: int = 10000000,
+          threads: int = 8,
+          **kwargs) -> str:
+  """Run sysbench mutex test.
+
+  Additional arguments can be provided per sysbench.
+
+  Args:
+    target: A machine object.
+    locks: The number of locks to use.
+    count: The number of mutexes.
+    threads: The number of threads to use for tests.
+    **kwargs:
+        - time: The time to run tests.
+        - options: Additional sysbench options. See sysbench tool:
+          https://github.com/akopytov/sysbench
+
+  Returns:
+    Sysbench output.
+  """
+  options = kwargs.pop("options", "")
+  options += " --mutex-loops=1 --mutex-locks={} --mutex-num={}".format(
+      count, locks)
+  return run_sysbench(
+      target, test="mutex", options=options, threads=threads, **kwargs)
diff --git a/benchmarks/suites/syscall.py b/benchmarks/suites/syscall.py
new file mode 100644
index 000000000..fa7665b00
--- /dev/null
+++ b/benchmarks/suites/syscall.py
@@ -0,0 +1,37 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Syscall microbenchmark."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.workloads.syscall import syscall_time_ns
+
+
+@suites.benchmark(metrics=[syscall_time_ns], machines=1)
+def syscall(target: machine.Machine, count: int = 1000000, **kwargs) -> str:
+  """Runs the syscall workload and report the syscall time.
+
+  Runs the syscall 'SYS_gettimeofday(0,0)' 'count' times and monitors time
+  elapsed based on the runtime's MONOTONIC clock.
+
+  Args:
+    target: A machine object.
+    count: The number of syscalls to execute.
+    **kwargs: Additional container options.
+
+  Returns:
+    Container output.
+  """
+  image = target.pull("syscall")
+  return target.container(image, **kwargs).run(count=count)
diff --git a/benchmarks/tcp/BUILD b/benchmarks/tcp/BUILD
new file mode 100644
index 000000000..735d7127f
--- /dev/null
+++ b/benchmarks/tcp/BUILD
@@ -0,0 +1,41 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("@rules_cc//cc:defs.bzl", "cc_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "tcp_proxy",
+    srcs = ["tcp_proxy.go"],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/adapters/gonet",
+        "//pkg/tcpip/link/fdbased",
+        "//pkg/tcpip/network/arp",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/tcp",
+        "//pkg/tcpip/transport/udp",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+# nsjoin is a trivial replacement for nsenter. This is used because nsenter is
+# not available on all systems where this benchmark is run (and we aim to
+# minimize external dependencies.)
+
+cc_binary(
+    name = "nsjoin",
+    srcs = ["nsjoin.c"],
+    visibility = ["//:sandbox"],
+)
+
+sh_binary(
+    name = "tcp_benchmark",
+    srcs = ["tcp_benchmark.sh"],
+    data = [
+        ":nsjoin",
+        ":tcp_proxy",
+    ],
+    visibility = ["//:sandbox"],
+)
diff --git a/benchmarks/tcp/README.md b/benchmarks/tcp/README.md
new file mode 100644
index 000000000..38e6e69f0
--- /dev/null
+++ b/benchmarks/tcp/README.md
@@ -0,0 +1,87 @@
+# TCP Benchmarks
+
+This directory contains a standardized TCP benchmark. This helps to evaluate the
+performance of netstack and native networking stacks under various conditions.
+
+## `tcp_benchmark`
+
+This benchmark allows TCP throughput testing under various conditions. The setup
+consists of an iperf client, a client proxy, a server proxy and an iperf server.
+The client proxy and server proxy abstract the network mechanism used to
+communicate between the iperf client and server.
+
+The setup looks like the following:
+
+```
+ +--------------+  (native)            +--------------+
+ | iperf client |[lo @ 10.0.0.1]------>| client proxy |
+ +--------------+                      +--------------+
+                                    [client.0 @ 10.0.0.2]
+                            (netstack)  |            |  (native)
+                                        +------+-----+
+                                               |
+                                             [br0]
+                                               |
+          Network emulation applied ---> [wan.0:wan.1]
+                                               |
+                                             [br1]
+                                               |
+                                        +------+-----+
+                            (netstack)  |            |  (native)
+                                     [server.0 @ 10.0.0.3]
+ +--------------+                      +--------------+
+ | iperf server |<------[lo @ 10.0.0.4]| server proxy |
+ +--------------+            (native)  +--------------+
+```
+
+Different configurations can be run using different arguments. For example:
+
+*   Native test under normal internet conditions: `tcp_benchmark`
+*   Native test under ideal conditions: `tcp_benchmark --ideal`
+*   Netstack client under ideal conditions: `tcp_benchmark --client --ideal`
+*   Netstack client with 5% packet loss: `tcp_benchmark --client --ideal --loss
+    5`
+
+Use `tcp_benchmark --help` for full arguments.
+
+This tool may be used to easily generate data for graphing. For example, to
+generate a CSV for various latencies, you might do:
+
+```
+rm -f /tmp/netstack_latency.csv /tmp/native_latency.csv
+latencies=$(seq 0 5 50;
+            seq 60 10 100;
+            seq 125 25 250;
+            seq 300 50 500)
+for latency in $latencies; do
+  read throughput client_cpu server_cpu <<< \
+    $(./tcp_benchmark --duration 30 --client --ideal --latency $latency)
+  echo $latency,$throughput,$client_cpu >> /tmp/netstack_latency.csv
+done
+for latency in $latencies; do
+  read throughput client_cpu server_cpu <<< \
+    $(./tcp_benchmark --duration 30 --ideal --latency $latency)
+  echo $latency,$throughput,$client_cpu >> /tmp/native_latency.csv
+done
+```
+
+Similarly, to generate a CSV for various levels of packet loss, the following
+would be appropriate:
+
+```
+rm -f /tmp/netstack_loss.csv /tmp/native_loss.csv
+losses=$(seq 0 0.1 1.0;
+         seq 1.2 0.2 2.0;
+         seq 2.5 0.5 5.0;
+         seq 6.0 1.0 10.0)
+for loss in $losses; do
+  read throughput client_cpu server_cpu <<< \
+    $(./tcp_benchmark --duration 30 --client --ideal --latency 10 --loss $loss)
+  echo $loss,$throughput,$client_cpu >> /tmp/netstack_loss.csv
+done
+for loss in $losses; do
+  read throughput client_cpu server_cpu <<< \
+    $(./tcp_benchmark --duration 30 --ideal --latency 10 --loss $loss)
+  echo $loss,$throughput,$client_cpu >> /tmp/native_loss.csv
+done
+```
diff --git a/benchmarks/tcp/nsjoin.c b/benchmarks/tcp/nsjoin.c
new file mode 100644
index 000000000..524b4d549
--- /dev/null
+++ b/benchmarks/tcp/nsjoin.c
@@ -0,0 +1,47 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+int main(int argc, char** argv) {
+  if (argc <= 2) {
+    fprintf(stderr, "error: must provide a namespace file.\n");
+    fprintf(stderr, "usage: %s <file> [arguments...]\n", argv[0]);
+    return 1;
+  }
+
+  int fd = open(argv[1], O_RDONLY);
+  if (fd < 0) {
+    fprintf(stderr, "error opening %s: %s\n", argv[1], strerror(errno));
+    return 1;
+  }
+  if (setns(fd, 0) < 0) {
+    fprintf(stderr, "error joining %s: %s\n", argv[1], strerror(errno));
+    return 1;
+  }
+
+  execvp(argv[2], &argv[2]);
+  return 1;
+}
diff --git a/benchmarks/tcp/tcp_benchmark.sh b/benchmarks/tcp/tcp_benchmark.sh
new file mode 100755
index 000000000..69344c9c3
--- /dev/null
+++ b/benchmarks/tcp/tcp_benchmark.sh
@@ -0,0 +1,369 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TCP benchmark; see README.md for documentation.
+
+# Fixed parameters.
+iperf_port=45201 # Not likely to be privileged.
+proxy_port=44000 # Ditto.
+client_addr=10.0.0.1
+client_proxy_addr=10.0.0.2
+server_proxy_addr=10.0.0.3
+server_addr=10.0.0.4
+mask=8
+
+# Defaults; this provides a reasonable approximation of a decent internet link.
+# Parameters can be varied independently from this set to see response to
+# various changes in the kind of link available.
+client=false
+server=false
+verbose=false
+gso=0
+swgso=false
+mtu=1280                # 1280 is a reasonable lowest-common-denominator.
+latency=10              # 10ms approximates a fast, dedicated connection.
+latency_variation=1     # +/- 1ms is a relatively low amount of jitter.
+loss=0.1                # 0.1% loss is non-zero, but not extremely high.
+duplicate=0.1           # 0.1% means duplicates are 1/10x as frequent as losses.
+duration=30             # 30s is enough time to consistent results (experimentally).
+helper_dir=$(dirname $0)
+netstack_opts=
+
+# Check for netem support.
+lsmod_output=$(lsmod | grep sch_netem)
+if [ "$?" != "0" ]; then
+  echo "warning: sch_netem may not be installed." >&2
+fi
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --client)
+      client=true
+      ;;
+    --client_tcp_probe_file)
+      shift
+      netstack_opts="${netstack_opts} -client_tcp_probe_file=$1"
+      ;;
+    --server)
+      server=true
+      ;;
+    --verbose)
+      verbose=true
+      ;;
+    --gso)
+      shift
+      gso=$1
+      ;;
+    --swgso)
+      swgso=true
+      ;;
+    --server_tcp_probe_file)
+      shift
+      netstack_opts="${netstack_opts} -server_tcp_probe_file=$1"
+      ;;
+    --ideal)
+      mtu=1500            # Standard ethernet.
+      latency=0           # No latency.
+      latency_variation=0 # No jitter.
+      loss=0              # No loss.
+      duplicate=0         # No duplicates.
+      ;;
+    --mtu)
+      shift
+      [ "$#" -le 0 ] && echo "no mtu provided" && exit 1
+      mtu=$1
+      ;;
+    --sack)
+      netstack_opts="${netstack_opts} -sack"
+      ;;
+    --cubic)
+      netstack_opts="${netstack_opts} -cubic"
+      ;;
+    --duration)
+      shift
+      [ "$#" -le 0 ] && echo "no duration provided" && exit 1
+      duration=$1
+      ;;
+    --latency)
+      shift
+      [ "$#" -le 0 ] && echo "no latency provided" && exit 1
+      latency=$1
+      ;;
+    --latency-variation)
+      shift
+      [ "$#" -le 0 ] && echo "no latency variation provided" && exit 1
+      latency_variation=$1
+      ;;
+    --loss)
+      shift
+      [ "$#" -le 0 ] && echo "no loss probability provided" && exit 1
+      loss=$1
+      ;;
+    --duplicate)
+      shift
+      [ "$#" -le 0 ] && echo "no duplicate provided" && exit 1
+      duplicate=$1
+      ;;
+    --cpuprofile)
+      shift
+      netstack_opts="${netstack_opts} -cpuprofile=$1"
+      ;;
+    --memprofile)
+      shift
+      netstack_opts="${netstack_opts} -memprofile=$1"
+      ;;
+    --helpers)
+      shift
+      [ "$#" -le 0 ] && echo "no helper dir provided" && exit 1
+      helper_dir=$1
+      ;;
+    *)
+      echo "usage: $0 [options]"
+      echo "options:"
+      echo " --help                show this message"
+      echo " --verbose             verbose output"
+      echo " --client              use netstack as the client"
+      echo " --ideal               reset all network emulation"
+      echo " --server              use netstack as the server"
+      echo " --mtu                 set the mtu (bytes)"
+      echo " --sack                enable SACK support"
+      echo " --cubic               enable CUBIC congestion control for Netstack"
+      echo " --duration            set the test duration (s)"
+      echo " --latency             set the latency (ms)"
+      echo " --latency-variation   set the latency variation"
+      echo " --loss                set the loss probability (%)"
+      echo " --duplicate           set the duplicate probability (%)"
+      echo " --helpers             set the helper directory"
+      echo ""
+      echo "The output will of the script will be:"
+      echo "  <throughput> <client-cpu-usage> <server-cpu-usage>"
+      exit 1
+  esac
+  shift
+done
+
+if [ ${verbose} == "true" ]; then
+  set -x
+fi
+
+# Latency needs to be halved, since it's applied on both ways.
+half_latency=$(echo ${latency}/2 | bc -l | awk '{printf "%1.2f", $0}')
+half_loss=$(echo ${loss}/2 | bc -l | awk '{printf "%1.6f", $0}')
+half_duplicate=$(echo ${duplicate}/2 | bc -l | awk '{printf "%1.6f", $0}')
+helper_dir=${helper_dir#$(pwd)/} # Use relative paths.
+proxy_binary=${helper_dir}/tcp_proxy
+nsjoin_binary=${helper_dir}/nsjoin
+
+if [ ! -e ${proxy_binary} ]; then
+  echo "Could not locate ${proxy_binary}, please make sure you've built the binary"
+  exit 1
+fi
+
+if [ ! -e ${nsjoin_binary} ]; then
+  echo "Could not locate ${nsjoin_binary}, please make sure you've built the binary"
+  exit 1
+fi
+
+if [ $(echo ${latency_variation} | awk '{printf "%1.2f", $0}') != "0.00" ]; then
+  # As long as there's some jitter, then we use the paretonormal distribution.
+  # This will preserve the minimum RTT, but add a realistic amount of jitter to
+  # the connection and cause re-ordering, etc. The regular pareto distribution
+  # appears to an unreasonable level of delay (we want only small spikes.)
+  distribution="distribution paretonormal"
+else
+  distribution=""
+fi
+
+# Client proxy that will listen on the client's iperf target forward traffic
+# using the host networking stack.
+client_args="${proxy_binary} -port ${proxy_port} -forward ${server_proxy_addr}:${proxy_port}"
+if ${client}; then
+  # Client proxy that will listen on the client's iperf target
+  # and forward traffic using netstack.
+  client_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -client \\
+      -mtu ${mtu} -iface client.0 -addr ${client_proxy_addr} -mask ${mask} \\
+      -forward ${server_proxy_addr}:${proxy_port} -gso=${gso} -swgso=${swgso}"
+fi
+
+# Server proxy that will listen on the proxy port and forward to the server's
+# iperf server using the host networking stack.
+server_args="${proxy_binary} -port ${proxy_port} -forward ${server_addr}:${iperf_port}"
+if ${server}; then
+  # Server proxy that will listen on the proxy port and forward to the servers'
+  # iperf server using netstack.
+  server_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -server \\
+      -mtu ${mtu} -iface server.0 -addr ${server_proxy_addr} -mask ${mask} \\
+      -forward ${server_addr}:${iperf_port} -gso=${gso} -swgso=${swgso}"
+fi
+
+# Specify loss and duplicate parameters only if they are non-zero
+loss_opt=""
+if [ "$(echo $half_loss | bc -q)" != "0" ]; then
+  loss_opt="loss random ${half_loss}%"
+fi
+duplicate_opt=""
+if [ "$(echo $half_duplicate | bc -q)" != "0" ]; then
+  duplicate_opt="duplicate ${half_duplicate}%"
+fi
+
+exec unshare -U -m -n -r -f -p --mount-proc /bin/bash << EOF
+set -e -m
+
+if [ ${verbose} == "true" ]; then
+  set -x
+fi
+
+mount -t tmpfs netstack-bench /tmp
+
+# We may have reset the path in the unshare if the shell loaded some public
+# profiles. Ensure that tools are discoverable via the parent's PATH.
+export PATH=${PATH}
+
+# Add client, server interfaces.
+ip link add client.0 type veth peer name client.1
+ip link add server.0 type veth peer name server.1
+
+# Add network emulation devices.
+ip link add wan.0 type veth peer name wan.1
+ip link set wan.0 up
+ip link set wan.1 up
+
+# Enroll on the bridge.
+ip link add name br0 type bridge
+ip link add name br1 type bridge
+ip link set client.1 master br0
+ip link set server.1 master br1
+ip link set wan.0 master br0
+ip link set wan.1 master br1
+ip link set br0 up
+ip link set br1 up
+
+# Set the MTU appropriately.
+ip link set client.0 mtu ${mtu}
+ip link set server.0 mtu ${mtu}
+ip link set wan.0 mtu ${mtu}
+ip link set wan.1 mtu ${mtu}
+
+# Add appropriate latency, loss and duplication.
+#
+# This is added in at the point of bridge connection.
+for device in wan.0 wan.1; do
+  # NOTE: We don't support a loss correlation as testing has shown that it
+  # actually doesn't work. The man page actually has a small comment about this
+  # "It is also possible to add a correlation, but this option is now deprecated
+  # due to the noticed bad behavior." For more information see netem(8).
+  tc qdisc add dev \$device root netem \\
+    delay ${half_latency}ms ${latency_variation}ms ${distribution} \\
+    ${loss_opt} ${duplicate_opt}
+done
+
+# Start a client proxy.
+touch /tmp/client.netns
+unshare -n mount --bind /proc/self/ns/net /tmp/client.netns
+
+# Move the endpoint into the namespace.
+while ip link | grep client.0 > /dev/null; do
+  ip link set dev client.0 netns /tmp/client.netns
+done
+
+if ! ${client}; then
+  # Only add the address to NIC if netstack is not in use. Otherwise the host
+  # will also process the inbound SYN and send a RST back.
+  ${nsjoin_binary} /tmp/client.netns ip addr add ${client_proxy_addr}/${mask} dev client.0
+fi
+
+# Start a server proxy.
+touch /tmp/server.netns
+unshare -n mount --bind /proc/self/ns/net /tmp/server.netns
+# Move the endpoint into the namespace.
+while ip link | grep server.0 > /dev/null; do
+  ip link set dev server.0 netns /tmp/server.netns
+done
+if ! ${server}; then
+  # Only add the address to NIC if netstack is not in use. Otherwise the host
+  # will also process the inbound SYN and send a RST back.
+  ${nsjoin_binary} /tmp/server.netns ip addr add ${server_proxy_addr}/${mask} dev server.0
+fi
+
+# Add client and server addresses, and bring everything up.
+${nsjoin_binary} /tmp/client.netns ip addr add ${client_addr}/${mask} dev client.0
+${nsjoin_binary} /tmp/server.netns ip addr add ${server_addr}/${mask} dev server.0
+${nsjoin_binary} /tmp/client.netns ip link set client.0 up
+${nsjoin_binary} /tmp/client.netns ip link set lo up
+${nsjoin_binary} /tmp/server.netns ip link set server.0 up
+${nsjoin_binary} /tmp/server.netns ip link set lo up
+ip link set dev client.1 up
+ip link set dev server.1 up
+
+${nsjoin_binary} /tmp/client.netns ${client_args} &
+client_pid=\$!
+${nsjoin_binary} /tmp/server.netns ${server_args} &
+server_pid=\$!
+
+# Start the iperf server.
+${nsjoin_binary} /tmp/server.netns iperf -p ${iperf_port} -s >&2 &
+iperf_pid=\$!
+
+# Show traffic information.
+if ! ${client} && ! ${server}; then
+  ${nsjoin_binary} /tmp/client.netns ping -c 100 -i 0.001 -W 1 ${server_addr} >&2 || true
+fi
+
+results_file=\$(mktemp)
+function cleanup {
+  rm -f \$results_file
+  kill -TERM \$client_pid
+  kill -TERM \$server_pid
+  wait \$client_pid
+  wait \$server_pid
+  kill -9 \$iperf_pid 2>/dev/null
+}
+
+# Allow failure from this point.
+set +e
+trap cleanup EXIT
+
+# Run the benchmark, recording the results file.
+while ${nsjoin_binary} /tmp/client.netns iperf \\
+    -p ${proxy_port} -c ${client_addr} -t ${duration} -f m 2>&1 \\
+    | tee \$results_file \\
+    | grep "connect failed" >/dev/null; do
+  sleep 0.1 # Wait for all services.
+done
+
+# Unlink all relevant devices from the bridge. This is because when the bridge
+# is deleted, the kernel may hang. It appears that this problem is fixed in
+# upstream commit 1ce5cce895309862d2c35d922816adebe094fe4a.
+ip link set client.1 nomaster
+ip link set server.1 nomaster
+ip link set wan.0 nomaster
+ip link set wan.1 nomaster
+
+# Emit raw results.
+cat \$results_file >&2
+
+# Emit a useful result (final throughput).
+mbits=\$(grep Mbits/sec \$results_file \\
+  | sed -n -e 's/^.*[[:space:]]\\([[:digit:]]\\+\\(\\.[[:digit:]]\\+\\)\\?\\)[[:space:]]*Mbits\\/sec.*/\\1/p')
+client_cpu_ticks=\$(cat /proc/\$client_pid/stat \\
+  | awk '{print (\$14+\$15);}')
+server_cpu_ticks=\$(cat /proc/\$server_pid/stat \\
+  | awk '{print (\$14+\$15);}')
+ticks_per_sec=\$(getconf CLK_TCK)
+client_cpu_load=\$(bc -l <<< \$client_cpu_ticks/\$ticks_per_sec/${duration})
+server_cpu_load=\$(bc -l <<< \$server_cpu_ticks/\$ticks_per_sec/${duration})
+echo \$mbits \$client_cpu_load \$server_cpu_load
+EOF
diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go
new file mode 100644
index 000000000..361a56755
--- /dev/null
+++ b/benchmarks/tcp/tcp_proxy.go
@@ -0,0 +1,436 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary tcp_proxy is a simple TCP proxy.
+package main
+
+import (
+	"encoding/gob"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"math/rand"
+	"net"
+	"os"
+	"os/signal"
+	"regexp"
+	"runtime"
+	"runtime/pprof"
+	"strconv"
+	"syscall"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
+	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
+	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+)
+
+var (
+	port    = flag.Int("port", 0, "bind port (all addresses)")
+	forward = flag.String("forward", "", "forwarding target")
+	client  = flag.Bool("client", false, "use netstack for listen")
+	server  = flag.Bool("server", false, "use netstack for dial")
+
+	// Netstack-specific options.
+	mtu                = flag.Int("mtu", 1280, "mtu for network stack")
+	addr               = flag.String("addr", "", "address for tap-based netstack")
+	mask               = flag.Int("mask", 8, "mask size for address")
+	iface              = flag.String("iface", "", "network interface name to bind for netstack")
+	sack               = flag.Bool("sack", false, "enable SACK support for netstack")
+	cubic              = flag.Bool("cubic", false, "enable use of CUBIC congestion control for netstack")
+	gso                = flag.Int("gso", 0, "GSO maximum size")
+	swgso              = flag.Bool("swgso", false, "software-level GSO")
+	clientTCPProbeFile = flag.String("client_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.")
+	serverTCPProbeFile = flag.String("server_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.")
+	cpuprofile         = flag.String("cpuprofile", "", "write cpu profile to the specified file.")
+	memprofile         = flag.String("memprofile", "", "write memory profile to the specified file.")
+)
+
+type impl interface {
+	dial(address string) (net.Conn, error)
+	listen(port int) (net.Listener, error)
+	printStats()
+}
+
+type netImpl struct{}
+
+func (netImpl) dial(address string) (net.Conn, error) {
+	return net.Dial("tcp", address)
+}
+
+func (netImpl) listen(port int) (net.Listener, error) {
+	return net.Listen("tcp", fmt.Sprintf(":%d", port))
+}
+
+func (netImpl) printStats() {
+}
+
+const (
+	nicID      = 1       // Fixed.
+	rcvBufSize = 1 << 20 // 1MB.
+)
+
+type netstackImpl struct {
+	s    *stack.Stack
+	addr tcpip.Address
+	mode string
+}
+
+func setupNetwork(ifaceName string) (fd int, err error) {
+	// Get all interfaces in the namespace.
+	ifaces, err := net.Interfaces()
+	if err != nil {
+		return -1, fmt.Errorf("querying interfaces: %v", err)
+	}
+
+	for _, iface := range ifaces {
+		if iface.Name != ifaceName {
+			continue
+		}
+		// Create the socket.
+		const protocol = 0x0300 // htons(ETH_P_ALL)
+		fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
+		if err != nil {
+			return -1, fmt.Errorf("unable to create raw socket: %v", err)
+		}
+
+		// Bind to the appropriate device.
+		ll := syscall.SockaddrLinklayer{
+			Protocol: protocol,
+			Ifindex:  iface.Index,
+			Pkttype:  syscall.PACKET_HOST,
+		}
+		if err := syscall.Bind(fd, &ll); err != nil {
+			return -1, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
+		}
+
+		// RAW Sockets by default have a very small SO_RCVBUF of 256KB,
+		// up it to at least 1MB to reduce packet drops.
+		if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, rcvBufSize); err != nil {
+			return -1, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", rcvBufSize, err)
+		}
+
+		if !*swgso && *gso != 0 {
+			if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
+				return -1, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
+			}
+		}
+		return fd, nil
+	}
+	return -1, fmt.Errorf("failed to find interface: %v", ifaceName)
+}
+
+func newNetstackImpl(mode string) (impl, error) {
+	fd, err := setupNetwork(*iface)
+	if err != nil {
+		return nil, err
+	}
+
+	// Parse details.
+	parsedAddr := tcpip.Address(net.ParseIP(*addr).To4())
+	parsedDest := tcpip.Address("")     // Filled in below.
+	parsedMask := tcpip.AddressMask("") // Filled in below.
+	switch *mask {
+	case 8:
+		parsedDest = tcpip.Address([]byte{parsedAddr[0], 0, 0, 0})
+		parsedMask = tcpip.AddressMask([]byte{0xff, 0, 0, 0})
+	case 16:
+		parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], 0, 0})
+		parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0, 0})
+	case 24:
+		parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], parsedAddr[2], 0})
+		parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0xff, 0})
+	default:
+		// This is just laziness; we don't expect a different mask.
+		return nil, fmt.Errorf("mask %d not supported", mask)
+	}
+
+	// Create a new network stack.
+	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), arp.NewProtocol()}
+	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol()}
+	s := stack.New(stack.Options{
+		NetworkProtocols:   netProtos,
+		TransportProtocols: transProtos,
+	})
+
+	// Generate a new mac for the eth device.
+	mac := make(net.HardwareAddr, 6)
+	rand.Read(mac) // Fill with random data.
+	mac[0] &^= 0x1 // Clear multicast bit.
+	mac[0] |= 0x2  // Set local assignment bit (IEEE802).
+	ep, err := fdbased.New(&fdbased.Options{
+		FDs:            []int{fd},
+		MTU:            uint32(*mtu),
+		EthernetHeader: true,
+		Address:        tcpip.LinkAddress(mac),
+		// Enable checksum generation as we need to generate valid
+		// checksums for the veth device to deliver our packets to the
+		// peer. But we do want to disable checksum verification as veth
+		// devices do perform GRO and the linux host kernel may not
+		// regenerate valid checksums after GRO.
+		TXChecksumOffload:  false,
+		RXChecksumOffload:  true,
+		PacketDispatchMode: fdbased.RecvMMsg,
+		GSOMaxSize:         uint32(*gso),
+		SoftwareGSOEnabled: *swgso,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to create FD endpoint: %v", err)
+	}
+	if err := s.CreateNIC(nicID, ep); err != nil {
+		return nil, fmt.Errorf("error creating NIC %q: %v", *iface, err)
+	}
+	if err := s.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+		return nil, fmt.Errorf("error adding ARP address to %q: %v", *iface, err)
+	}
+	if err := s.AddAddress(nicID, ipv4.ProtocolNumber, parsedAddr); err != nil {
+		return nil, fmt.Errorf("error adding IP address to %q: %v", *iface, err)
+	}
+
+	subnet, err := tcpip.NewSubnet(parsedDest, parsedMask)
+	if err != nil {
+		return nil, fmt.Errorf("tcpip.Subnet(%s, %s): %s", parsedDest, parsedMask, err)
+	}
+	// Add default route; we only support
+	s.SetRouteTable([]tcpip.Route{
+		{
+			Destination: subnet,
+			NIC:         nicID,
+		},
+	})
+
+	// Set protocol options.
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(*sack)); err != nil {
+		return nil, fmt.Errorf("SetTransportProtocolOption for SACKEnabled failed: %v", err)
+	}
+
+	// Set Congestion Control to cubic if requested.
+	if *cubic {
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.CongestionControlOption("cubic")); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption for CongestionControlOption(cubic) failed: %v", err)
+		}
+	}
+
+	return netstackImpl{
+		s:    s,
+		addr: parsedAddr,
+		mode: mode,
+	}, nil
+}
+
+func (n netstackImpl) dial(address string) (net.Conn, error) {
+	host, port, err := net.SplitHostPort(address)
+	if err != nil {
+		return nil, err
+	}
+	if host == "" {
+		// A host must be provided for the dial.
+		return nil, fmt.Errorf("no host provided")
+	}
+	portNumber, err := strconv.Atoi(port)
+	if err != nil {
+		return nil, err
+	}
+	addr := tcpip.FullAddress{
+		NIC:  nicID,
+		Addr: tcpip.Address(net.ParseIP(host).To4()),
+		Port: uint16(portNumber),
+	}
+	conn, err := gonet.DialTCP(n.s, addr, ipv4.ProtocolNumber)
+	if err != nil {
+		return nil, err
+	}
+	return conn, nil
+}
+
+func (n netstackImpl) listen(port int) (net.Listener, error) {
+	addr := tcpip.FullAddress{
+		NIC:  nicID,
+		Port: uint16(port),
+	}
+	listener, err := gonet.NewListener(n.s, addr, ipv4.ProtocolNumber)
+	if err != nil {
+		return nil, err
+	}
+	return listener, nil
+}
+
+var zeroFieldsRegexp = regexp.MustCompile(`\s*[a-zA-Z0-9]*:0`)
+
+func (n netstackImpl) printStats() {
+	// Don't show zero fields.
+	stats := zeroFieldsRegexp.ReplaceAllString(fmt.Sprintf("%+v", n.s.Stats()), "")
+	log.Printf("netstack %s Stats: %+v\n", n.mode, stats)
+}
+
+// installProbe installs a TCP Probe function that will dump endpoint
+// state to the specified file. It also returns a close func() that
+// can be used to close the probeFile.
+func (n netstackImpl) installProbe(probeFileName string) (close func()) {
+	// Install Probe to dump out end point state.
+	probeFile, err := os.Create(probeFileName)
+	if err != nil {
+		log.Fatalf("failed to create tcp_probe file %s: %v", probeFileName, err)
+	}
+	probeEncoder := gob.NewEncoder(probeFile)
+	// Install a TCP Probe.
+	n.s.AddTCPProbe(func(state stack.TCPEndpointState) {
+		probeEncoder.Encode(state)
+	})
+	return func() { probeFile.Close() }
+}
+
+func main() {
+	flag.Parse()
+	if *port == 0 {
+		log.Fatalf("no port provided")
+	}
+	if *forward == "" {
+		log.Fatalf("no forward provided")
+	}
+	// Seed the random number generator to ensure that we are given MAC addresses that don't
+	// for the case of the client and server stack.
+	rand.Seed(time.Now().UTC().UnixNano())
+
+	if *cpuprofile != "" {
+		f, err := os.Create(*cpuprofile)
+		if err != nil {
+			log.Fatal("could not create CPU profile: ", err)
+		}
+		defer func() {
+			if err := f.Close(); err != nil {
+				log.Print("error closing CPU profile: ", err)
+			}
+		}()
+		if err := pprof.StartCPUProfile(f); err != nil {
+			log.Fatal("could not start CPU profile: ", err)
+		}
+		defer pprof.StopCPUProfile()
+	}
+
+	var (
+		in  impl
+		out impl
+		err error
+	)
+	if *server {
+		in, err = newNetstackImpl("server")
+		if *serverTCPProbeFile != "" {
+			defer in.(netstackImpl).installProbe(*serverTCPProbeFile)()
+		}
+
+	} else {
+		in = netImpl{}
+	}
+	if err != nil {
+		log.Fatalf("netstack error: %v", err)
+	}
+	if *client {
+		out, err = newNetstackImpl("client")
+		if *clientTCPProbeFile != "" {
+			defer out.(netstackImpl).installProbe(*clientTCPProbeFile)()
+		}
+	} else {
+		out = netImpl{}
+	}
+	if err != nil {
+		log.Fatalf("netstack error: %v", err)
+	}
+
+	// Dial forward before binding.
+	var next net.Conn
+	for {
+		next, err = out.dial(*forward)
+		if err == nil {
+			break
+		}
+		time.Sleep(50 * time.Millisecond)
+		log.Printf("connect failed retrying: %v", err)
+	}
+
+	// Bind once to the server socket.
+	listener, err := in.listen(*port)
+	if err != nil {
+		// Should not happen, everything must be bound by this time
+		// this proxy is started.
+		log.Fatalf("unable to listen: %v", err)
+	}
+	log.Printf("client=%v, server=%v, ready.", *client, *server)
+
+	sigs := make(chan os.Signal, 1)
+	signal.Notify(sigs, syscall.SIGTERM)
+	go func() {
+		<-sigs
+		if *cpuprofile != "" {
+			pprof.StopCPUProfile()
+		}
+		if *memprofile != "" {
+			f, err := os.Create(*memprofile)
+			if err != nil {
+				log.Fatal("could not create memory profile: ", err)
+			}
+			defer func() {
+				if err := f.Close(); err != nil {
+					log.Print("error closing memory profile: ", err)
+				}
+			}()
+			runtime.GC() // get up-to-date statistics
+			if err := pprof.WriteHeapProfile(f); err != nil {
+				log.Fatalf("Unable to write heap profile: %v", err)
+			}
+		}
+		os.Exit(0)
+	}()
+
+	for {
+		// Forward all connections.
+		inConn, err := listener.Accept()
+		if err != nil {
+			// This should not happen; we are listening
+			// successfully. Exhausted all available FDs?
+			log.Fatalf("accept error: %v", err)
+		}
+		log.Printf("incoming connection established.")
+
+		// Copy both ways.
+		go io.Copy(inConn, next)
+		go io.Copy(next, inConn)
+
+		// Print stats every second.
+		go func() {
+			t := time.NewTicker(time.Second)
+			defer t.Stop()
+			for {
+				<-t.C
+				in.printStats()
+				out.printStats()
+			}
+		}()
+
+		for {
+			// Dial again.
+			next, err = out.dial(*forward)
+			if err == nil {
+				break
+			}
+		}
+	}
+}
diff --git a/benchmarks/workloads/BUILD b/benchmarks/workloads/BUILD
new file mode 100644
index 000000000..643806105
--- /dev/null
+++ b/benchmarks/workloads/BUILD
@@ -0,0 +1,35 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "workloads",
+    srcs = ["__init__.py"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "//benchmarks/workloads/ab:files",
+        "//benchmarks/workloads/absl:files",
+        "//benchmarks/workloads/curl:files",
+        "//benchmarks/workloads/ffmpeg:files",
+        "//benchmarks/workloads/fio:files",
+        "//benchmarks/workloads/httpd:files",
+        "//benchmarks/workloads/iperf:files",
+        "//benchmarks/workloads/netcat:files",
+        "//benchmarks/workloads/nginx:files",
+        "//benchmarks/workloads/node:files",
+        "//benchmarks/workloads/node_template:files",
+        "//benchmarks/workloads/redis:files",
+        "//benchmarks/workloads/redisbenchmark:files",
+        "//benchmarks/workloads/ruby:files",
+        "//benchmarks/workloads/ruby_template:files",
+        "//benchmarks/workloads/sleep:files",
+        "//benchmarks/workloads/sysbench:files",
+        "//benchmarks/workloads/syscall:files",
+        "//benchmarks/workloads/tensorflow:files",
+        "//benchmarks/workloads/true:files",
+    ],
+)
diff --git a/benchmarks/workloads/__init__.py b/benchmarks/workloads/__init__.py
new file mode 100644
index 000000000..e12651e76
--- /dev/null
+++ b/benchmarks/workloads/__init__.py
@@ -0,0 +1,14 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Workloads, parsers and test data."""
diff --git a/benchmarks/workloads/ab/BUILD b/benchmarks/workloads/ab/BUILD
new file mode 100644
index 000000000..e99a8d674
--- /dev/null
+++ b/benchmarks/workloads/ab/BUILD
@@ -0,0 +1,35 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "ab",
+    srcs = ["__init__.py"],
+)
+
+py_test(
+    name = "ab_test",
+    srcs = ["ab_test.py"],
+    python_version = "PY3",
+    deps = [
+        ":ab",
+        requirement("attrs", False),
+        requirement("atomicwrites", False),
+        requirement("more-itertools", False),
+        requirement("pathlib2", False),
+        requirement("pluggy", False),
+        requirement("py", False),
+        requirement("pytest", True),
+        requirement("six", False),
+    ],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/ab/Dockerfile b/benchmarks/workloads/ab/Dockerfile
new file mode 100644
index 000000000..0d0b6e2eb
--- /dev/null
+++ b/benchmarks/workloads/ab/Dockerfile
@@ -0,0 +1,15 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+        && apt-get update \
+        && apt-get install -y \
+            apache2-utils \
+        && rm -rf /var/lib/apt/lists/*
+
+# Parameterized workload.
+ENV requests 5000
+ENV connections 10
+ENV host localhost
+ENV port 8080
+ENV path notfound
+CMD ["sh", "-c", "ab -n ${requests} -c ${connections} http://${host}:${port}/${path}"]
diff --git a/benchmarks/workloads/ab/__init__.py b/benchmarks/workloads/ab/__init__.py
new file mode 100644
index 000000000..eedf8e083
--- /dev/null
+++ b/benchmarks/workloads/ab/__init__.py
@@ -0,0 +1,88 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Apachebench tool."""
+
+import re
+
+SAMPLE_DATA = """This is ApacheBench, Version 2.3 <$Revision: 1826891 $>
+Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/
+Licensed to The Apache Software Foundation, http://www.apache.org/
+
+Benchmarking 10.10.10.10 (be patient).....done
+
+
+Server Software:        Apache/2.4.38
+Server Hostname:        10.10.10.10
+Server Port:            80
+
+Document Path:          /latin10k.txt
+Document Length:        210 bytes
+
+Concurrency Level:      1
+Time taken for tests:   0.180 seconds
+Complete requests:      100
+Failed requests:        0
+Non-2xx responses:      100
+Total transferred:      38800 bytes
+HTML transferred:       21000 bytes
+Requests per second:    556.44 [#/sec] (mean)
+Time per request:       1.797 [ms] (mean)
+Time per request:       1.797 [ms] (mean, across all concurrent requests)
+Transfer rate:          210.84 [Kbytes/sec] received
+
+Connection Times (ms)
+              min  mean[+/-sd] median   max
+Connect:        0    0   0.2      0       2
+Processing:     1    2   1.0      1       8
+Waiting:        1    1   1.0      1       7
+Total:          1    2   1.2      1      10
+
+Percentage of the requests served within a certain time (ms)
+  50%      1
+  66%      2
+  75%      2
+  80%      2
+  90%      2
+  95%      3
+  98%      7
+  99%     10
+ 100%     10 (longest request)"""
+
+
+# pylint: disable=unused-argument
+def sample(**kwargs) -> str:
+  return SAMPLE_DATA
+
+
+# pylint: disable=unused-argument
+def transfer_rate(data: str, **kwargs) -> float:
+  """Mean transfer rate in Kbytes/sec."""
+  regex = r"Transfer rate:\s+(\d+\.?\d+?)\s+\[Kbytes/sec\]\s+received"
+  return float(re.compile(regex).search(data).group(1))
+
+
+# pylint: disable=unused-argument
+def latency(data: str, **kwargs) -> float:
+  """Mean latency in milliseconds."""
+  regex = r"Total:\s+\d+\s+(\d+)\s+(\d+\.?\d+?)\s+\d+\s+\d+\s"
+  res = re.compile(regex).search(data)
+  return float(res.group(1))
+
+
+# pylint: disable=unused-argument
+def requests_per_second(data: str, **kwargs) -> float:
+  """Requests per second."""
+  regex = r"Requests per second:\s+(\d+\.?\d+?)\s+"
+  res = re.compile(regex).search(data)
+  return float(res.group(1))
diff --git a/benchmarks/workloads/ab/ab_test.py b/benchmarks/workloads/ab/ab_test.py
new file mode 100644
index 000000000..4afac2996
--- /dev/null
+++ b/benchmarks/workloads/ab/ab_test.py
@@ -0,0 +1,42 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Parser test."""
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import ab
+
+
+def test_transfer_rate_parser():
+  """Test transfer rate parser."""
+  res = ab.transfer_rate(ab.sample())
+  assert res == 210.84
+
+
+def test_latency_parser():
+  """Test latency parser."""
+  res = ab.latency(ab.sample())
+  assert res == 2
+
+
+def test_requests_per_second():
+  """Test requests per second parser."""
+  res = ab.requests_per_second(ab.sample())
+  assert res == 556.44
+
+
+if __name__ == "__main__":
+  sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/absl/BUILD b/benchmarks/workloads/absl/BUILD
new file mode 100644
index 000000000..bb499620e
--- /dev/null
+++ b/benchmarks/workloads/absl/BUILD
@@ -0,0 +1,35 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "absl",
+    srcs = ["__init__.py"],
+)
+
+py_test(
+    name = "absl_test",
+    srcs = ["absl_test.py"],
+    python_version = "PY3",
+    deps = [
+        ":absl",
+        requirement("attrs", False),
+        requirement("atomicwrites", False),
+        requirement("more-itertools", False),
+        requirement("pathlib2", False),
+        requirement("pluggy", False),
+        requirement("py", False),
+        requirement("pytest", True),
+        requirement("six", False),
+    ],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/absl/Dockerfile b/benchmarks/workloads/absl/Dockerfile
new file mode 100644
index 000000000..e935c5ddc
--- /dev/null
+++ b/benchmarks/workloads/absl/Dockerfile
@@ -0,0 +1,24 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+        && apt-get update \
+        && apt-get install -y \
+            wget \
+            git \
+            pkg-config \
+            zip \
+            g++ \
+            zlib1g-dev \
+            unzip \
+            python3 \
+        && rm -rf /var/lib/apt/lists/*
+RUN wget https://github.com/bazelbuild/bazel/releases/download/0.27.0/bazel-0.27.0-installer-linux-x86_64.sh
+RUN chmod +x bazel-0.27.0-installer-linux-x86_64.sh
+RUN ./bazel-0.27.0-installer-linux-x86_64.sh
+
+RUN git clone https://github.com/abseil/abseil-cpp.git
+WORKDIR abseil-cpp
+RUN git checkout 43ef2148c0936ebf7cb4be6b19927a9d9d145b8f
+RUN bazel clean
+ENV path "absl/base/..."
+CMD bazel build ${path} 2>&1
diff --git a/benchmarks/workloads/absl/__init__.py b/benchmarks/workloads/absl/__init__.py
new file mode 100644
index 000000000..b40e3f915
--- /dev/null
+++ b/benchmarks/workloads/absl/__init__.py
@@ -0,0 +1,63 @@
+# python3
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ABSL build benchmark."""
+
+import re
+
+SAMPLE_BAZEL_OUTPUT = """Extracting Bazel installation...
+Starting local Bazel server and connecting to it...
+Loading:
+Loading: 0 packages loaded
+Loading: 0 packages loaded
+    currently loading: absl/algorithm ... (11 packages)
+Analyzing: 241 targets (16 packages loaded, 0 targets configured)
+Analyzing: 241 targets (21 packages loaded, 617 targets configured)
+Analyzing: 241 targets (27 packages loaded, 687 targets configured)
+Analyzing: 241 targets (32 packages loaded, 1105 targets configured)
+Analyzing: 241 targets (32 packages loaded, 1294 targets configured)
+Analyzing: 241 targets (35 packages loaded, 1575 targets configured)
+Analyzing: 241 targets (35 packages loaded, 1575 targets configured)
+Analyzing: 241 targets (36 packages loaded, 1603 targets configured)
+Analyzing: 241 targets (36 packages loaded, 1603 targets configured)
+INFO: Analyzed 241 targets (37 packages loaded, 1864 targets configured).
+INFO: Found 241 targets...
+[0 / 5] [Prepa] BazelWorkspaceStatusAction stable-status.txt
+[16 / 50] [Analy] Compiling absl/base/dynamic_annotations.cc ... (20 actions, 10 running)
+[60 / 77] Compiling external/com_google_googletest/googletest/src/gtest.cc; 5s processwrapper-sandbox ... (12 actions, 11 running)
+[158 / 174] Compiling absl/container/internal/raw_hash_set_test.cc; 2s processwrapper-sandbox ... (12 actions, 11 running)
+[278 / 302] Compiling absl/container/internal/raw_hash_set_test.cc; 6s processwrapper-sandbox ... (12 actions, 11 running)
+[384 / 406] Compiling absl/container/internal/raw_hash_set_test.cc; 10s processwrapper-sandbox ... (12 actions, 11 running)
+[581 / 604] Compiling absl/container/flat_hash_set_test.cc; 11s processwrapper-sandbox ... (12 actions, 11 running)
+[722 / 745] Compiling absl/container/node_hash_set_test.cc; 9s processwrapper-sandbox ... (12 actions, 11 running)
+[846 / 867] Compiling absl/hash/hash_test.cc; 11s processwrapper-sandbox ... (12 actions, 11 running)
+INFO: From Compiling absl/debugging/symbolize_test.cc:
+/tmp/cclCVipU.s: Assembler messages:
+/tmp/cclCVipU.s:1662: Warning: ignoring changed section attributes for .text
+[999 / 1,022] Compiling absl/hash/hash_test.cc; 19s processwrapper-sandbox ... (12 actions, 11 running)
+[1,082 / 1,084] Compiling absl/container/flat_hash_map_test.cc; 7s processwrapper-sandbox
+INFO: Elapsed time: 81.861s, Critical Path: 23.81s
+INFO: 515 processes: 515 processwrapper-sandbox.
+INFO: Build completed successfully, 1084 total actions
+INFO: Build completed successfully, 1084 total actions"""
+
+
+def sample():
+  return SAMPLE_BAZEL_OUTPUT
+
+
+# pylint: disable=unused-argument
+def elapsed_time(data: str, **kwargs) -> float:
+  """Returns the elapsed time for running an absl build."""
+  return float(re.compile(r"Elapsed time: (\d*.?\d*)s").search(data).group(1))
diff --git a/benchmarks/workloads/absl/absl_test.py b/benchmarks/workloads/absl/absl_test.py
new file mode 100644
index 000000000..41f216999
--- /dev/null
+++ b/benchmarks/workloads/absl/absl_test.py
@@ -0,0 +1,31 @@
+# python3
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ABSL build test."""
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import absl
+
+
+def test_elapsed_time():
+  """Test elapsed_time."""
+  res = absl.elapsed_time(absl.sample())
+  assert res == 81.861
+
+
+if __name__ == "__main__":
+  sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/curl/BUILD b/benchmarks/workloads/curl/BUILD
new file mode 100644
index 000000000..83f3c71a0
--- /dev/null
+++ b/benchmarks/workloads/curl/BUILD
@@ -0,0 +1,11 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/curl/Dockerfile b/benchmarks/workloads/curl/Dockerfile
new file mode 100644
index 000000000..336cb088a
--- /dev/null
+++ b/benchmarks/workloads/curl/Dockerfile
@@ -0,0 +1,14 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+        && apt-get update \
+        && apt-get install -y \
+            curl \
+        && rm -rf /var/lib/apt/lists/*
+
+# Accept a host and port parameter.
+ENV host localhost
+ENV port 8080
+
+# Spin until we make a successful request.
+CMD ["sh", "-c", "while ! curl -v -i http://$host:$port; do true; done"]
diff --git a/benchmarks/workloads/ffmpeg/BUILD b/benchmarks/workloads/ffmpeg/BUILD
new file mode 100644
index 000000000..c1f2afc40
--- /dev/null
+++ b/benchmarks/workloads/ffmpeg/BUILD
@@ -0,0 +1,16 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "ffmpeg",
+    srcs = ["__init__.py"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/ffmpeg/Dockerfile b/benchmarks/workloads/ffmpeg/Dockerfile
new file mode 100644
index 000000000..f2f530d7c
--- /dev/null
+++ b/benchmarks/workloads/ffmpeg/Dockerfile
@@ -0,0 +1,10 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+        && apt-get update \
+        && apt-get install -y \
+            ffmpeg \
+        && rm -rf /var/lib/apt/lists/*
+WORKDIR /media
+ADD https://samples.ffmpeg.org/MPEG-4/video.mp4 video.mp4
+CMD ["ffmpeg", "-i", "video.mp4", "-c:v", "libx264", "-preset", "veryslow", "output.mp4"]
diff --git a/benchmarks/workloads/ffmpeg/__init__.py b/benchmarks/workloads/ffmpeg/__init__.py
new file mode 100644
index 000000000..7578a443b
--- /dev/null
+++ b/benchmarks/workloads/ffmpeg/__init__.py
@@ -0,0 +1,20 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Simple ffmpeg workload."""
+
+
+# pylint: disable=unused-argument
+def run_time(value, **kwargs):
+  """Returns the startup and runtime of the ffmpeg workload in seconds."""
+  return value
diff --git a/benchmarks/workloads/fio/BUILD b/benchmarks/workloads/fio/BUILD
new file mode 100644
index 000000000..7fc96cfa5
--- /dev/null
+++ b/benchmarks/workloads/fio/BUILD
@@ -0,0 +1,35 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "fio",
+    srcs = ["__init__.py"],
+)
+
+py_test(
+    name = "fio_test",
+    srcs = ["fio_test.py"],
+    python_version = "PY3",
+    deps = [
+        ":fio",
+        requirement("attrs", False),
+        requirement("atomicwrites", False),
+        requirement("more-itertools", False),
+        requirement("pathlib2", False),
+        requirement("pluggy", False),
+        requirement("py", False),
+        requirement("pytest", True),
+        requirement("six", False),
+    ],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/fio/Dockerfile b/benchmarks/workloads/fio/Dockerfile
new file mode 100644
index 000000000..b3cf864eb
--- /dev/null
+++ b/benchmarks/workloads/fio/Dockerfile
@@ -0,0 +1,23 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+        && apt-get update \
+        && apt-get install -y \
+            fio \
+        && rm -rf /var/lib/apt/lists/*
+
+# Parameterized test.
+ENV test write
+ENV ioengine sync
+ENV size 5000000
+ENV iodepth 4
+ENV blocksize "1m"
+ENV time ""
+ENV path "/disk/file.dat"
+ENV ramp_time 0
+
+CMD ["sh", "-c", "fio --output-format=json --name=test --ramp_time=${ramp_time} --ioengine=${ioengine} --size=${size} \
+--filename=${path} --iodepth=${iodepth} --bs=${blocksize} --rw=${test} ${time}"]
+
+
+
diff --git a/benchmarks/workloads/fio/__init__.py b/benchmarks/workloads/fio/__init__.py
new file mode 100644
index 000000000..52711e956
--- /dev/null
+++ b/benchmarks/workloads/fio/__init__.py
@@ -0,0 +1,369 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""FIO benchmark tool."""
+
+import json
+
+SAMPLE_DATA = """
+{
+  "fio version" : "fio-3.1",
+  "timestamp" : 1554837456,
+  "timestamp_ms" : 1554837456621,
+  "time" : "Tue Apr  9 19:17:36 2019",
+  "jobs" : [
+    {
+      "jobname" : "test",
+      "groupid" : 0,
+      "error" : 0,
+      "eta" : 2147483647,
+      "elapsed" : 1,
+      "job options" : {
+        "name" : "test",
+        "ioengine" : "sync",
+        "size" : "1073741824",
+        "filename" : "/disk/file.dat",
+        "iodepth" : "4",
+        "bs" : "4096",
+        "rw" : "write"
+      },
+      "read" : {
+        "io_bytes" : 0,
+        "io_kbytes" : 0,
+        "bw" : 0,
+        "iops" : 0.000000,
+        "runtime" : 0,
+        "total_ios" : 0,
+        "short_ios" : 0,
+        "drop_ios" : 0,
+        "slat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000
+        },
+        "clat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000,
+          "percentile" : {
+            "1.000000" : 0,
+            "5.000000" : 0,
+            "10.000000" : 0,
+            "20.000000" : 0,
+            "30.000000" : 0,
+            "40.000000" : 0,
+            "50.000000" : 0,
+            "60.000000" : 0,
+            "70.000000" : 0,
+            "80.000000" : 0,
+            "90.000000" : 0,
+            "95.000000" : 0,
+            "99.000000" : 0,
+            "99.500000" : 0,
+            "99.900000" : 0,
+            "99.950000" : 0,
+            "99.990000" : 0,
+            "0.00" : 0,
+            "0.00" : 0,
+            "0.00" : 0
+          }
+        },
+        "lat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000
+        },
+        "bw_min" : 0,
+        "bw_max" : 0,
+        "bw_agg" : 0.000000,
+        "bw_mean" : 0.000000,
+        "bw_dev" : 0.000000,
+        "bw_samples" : 0,
+        "iops_min" : 0,
+        "iops_max" : 0,
+        "iops_mean" : 0.000000,
+        "iops_stddev" : 0.000000,
+        "iops_samples" : 0
+      },
+      "write" : {
+        "io_bytes" : 1073741824,
+        "io_kbytes" : 1048576,
+        "bw" : 1753471,
+        "iops" : 438367.892977,
+        "runtime" : 598,
+        "total_ios" : 262144,
+        "short_ios" : 0,
+        "drop_ios" : 0,
+        "slat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000
+        },
+        "clat_ns" : {
+          "min" : 1693,
+          "max" : 754733,
+          "mean" : 2076.404373,
+          "stddev" : 1724.195529,
+          "percentile" : {
+            "1.000000" : 1736,
+            "5.000000" : 1752,
+            "10.000000" : 1768,
+            "20.000000" : 1784,
+            "30.000000" : 1800,
+            "40.000000" : 1800,
+            "50.000000" : 1816,
+            "60.000000" : 1816,
+            "70.000000" : 1848,
+            "80.000000" : 1928,
+            "90.000000" : 2512,
+            "95.000000" : 2992,
+            "99.000000" : 6176,
+            "99.500000" : 6304,
+            "99.900000" : 11328,
+            "99.950000" : 15168,
+            "99.990000" : 17792,
+            "0.00" : 0,
+            "0.00" : 0,
+            "0.00" : 0
+          }
+        },
+        "lat_ns" : {
+          "min" : 1731,
+          "max" : 754770,
+          "mean" : 2117.878979,
+          "stddev" : 1730.290512
+        },
+        "bw_min" : 1731120,
+        "bw_max" : 1731120,
+        "bw_agg" : 98.725328,
+        "bw_mean" : 1731120.000000,
+        "bw_dev" : 0.000000,
+        "bw_samples" : 1,
+        "iops_min" : 432780,
+        "iops_max" : 432780,
+        "iops_mean" : 432780.000000,
+        "iops_stddev" : 0.000000,
+        "iops_samples" : 1
+      },
+      "trim" : {
+        "io_bytes" : 0,
+        "io_kbytes" : 0,
+        "bw" : 0,
+        "iops" : 0.000000,
+        "runtime" : 0,
+        "total_ios" : 0,
+        "short_ios" : 0,
+        "drop_ios" : 0,
+        "slat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000
+        },
+        "clat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000,
+          "percentile" : {
+            "1.000000" : 0,
+            "5.000000" : 0,
+            "10.000000" : 0,
+            "20.000000" : 0,
+            "30.000000" : 0,
+            "40.000000" : 0,
+            "50.000000" : 0,
+            "60.000000" : 0,
+            "70.000000" : 0,
+            "80.000000" : 0,
+            "90.000000" : 0,
+            "95.000000" : 0,
+            "99.000000" : 0,
+            "99.500000" : 0,
+            "99.900000" : 0,
+            "99.950000" : 0,
+            "99.990000" : 0,
+            "0.00" : 0,
+            "0.00" : 0,
+            "0.00" : 0
+          }
+        },
+        "lat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000
+        },
+        "bw_min" : 0,
+        "bw_max" : 0,
+        "bw_agg" : 0.000000,
+        "bw_mean" : 0.000000,
+        "bw_dev" : 0.000000,
+        "bw_samples" : 0,
+        "iops_min" : 0,
+        "iops_max" : 0,
+        "iops_mean" : 0.000000,
+        "iops_stddev" : 0.000000,
+        "iops_samples" : 0
+      },
+      "usr_cpu" : 17.922948,
+      "sys_cpu" : 81.574539,
+      "ctx" : 3,
+      "majf" : 0,
+      "minf" : 10,
+      "iodepth_level" : {
+        "1" : 100.000000,
+        "2" : 0.000000,
+        "4" : 0.000000,
+        "8" : 0.000000,
+        "16" : 0.000000,
+        "32" : 0.000000,
+        ">=64" : 0.000000
+      },
+      "latency_ns" : {
+        "2" : 0.000000,
+        "4" : 0.000000,
+        "10" : 0.000000,
+        "20" : 0.000000,
+        "50" : 0.000000,
+        "100" : 0.000000,
+        "250" : 0.000000,
+        "500" : 0.000000,
+        "750" : 0.000000,
+        "1000" : 0.000000
+      },
+      "latency_us" : {
+        "2" : 82.737350,
+        "4" : 12.605286,
+        "10" : 4.543686,
+        "20" : 0.107956,
+        "50" : 0.010000,
+        "100" : 0.000000,
+        "250" : 0.000000,
+        "500" : 0.000000,
+        "750" : 0.000000,
+        "1000" : 0.010000
+      },
+      "latency_ms" : {
+        "2" : 0.000000,
+        "4" : 0.000000,
+        "10" : 0.000000,
+        "20" : 0.000000,
+        "50" : 0.000000,
+        "100" : 0.000000,
+        "250" : 0.000000,
+        "500" : 0.000000,
+        "750" : 0.000000,
+        "1000" : 0.000000,
+        "2000" : 0.000000,
+        ">=2000" : 0.000000
+      },
+      "latency_depth" : 4,
+      "latency_target" : 0,
+      "latency_percentile" : 100.000000,
+      "latency_window" : 0
+    }
+  ],
+  "disk_util" : [
+    {
+      "name" : "dm-1",
+      "read_ios" : 0,
+      "write_ios" : 3,
+      "read_merges" : 0,
+      "write_merges" : 0,
+      "read_ticks" : 0,
+      "write_ticks" : 0,
+      "in_queue" : 0,
+      "util" : 0.000000,
+      "aggr_read_ios" : 0,
+      "aggr_write_ios" : 3,
+      "aggr_read_merges" : 0,
+      "aggr_write_merge" : 0,
+      "aggr_read_ticks" : 0,
+      "aggr_write_ticks" : 0,
+      "aggr_in_queue" : 0,
+      "aggr_util" : 0.000000
+    },
+    {
+      "name" : "dm-0",
+      "read_ios" : 0,
+      "write_ios" : 3,
+      "read_merges" : 0,
+      "write_merges" : 0,
+      "read_ticks" : 0,
+      "write_ticks" : 0,
+      "in_queue" : 0,
+      "util" : 0.000000,
+      "aggr_read_ios" : 0,
+      "aggr_write_ios" : 3,
+      "aggr_read_merges" : 0,
+      "aggr_write_merge" : 0,
+      "aggr_read_ticks" : 0,
+      "aggr_write_ticks" : 2,
+      "aggr_in_queue" : 0,
+      "aggr_util" : 0.000000
+    },
+    {
+      "name" : "nvme0n1",
+      "read_ios" : 0,
+      "write_ios" : 3,
+      "read_merges" : 0,
+      "write_merges" : 0,
+      "read_ticks" : 0,
+      "write_ticks" : 2,
+      "in_queue" : 0,
+      "util" : 0.000000
+    }
+  ]
+}
+"""
+
+
+# pylint: disable=unused-argument
+def sample(**kwargs) -> str:
+  return SAMPLE_DATA
+
+
+# pylint: disable=unused-argument
+def read_bandwidth(data: str, **kwargs) -> int:
+  """File I/O bandwidth."""
+  return json.loads(data)["jobs"][0]["read"]["bw"] * 1024
+
+
+# pylint: disable=unused-argument
+def write_bandwidth(data: str, **kwargs) -> int:
+  """File I/O bandwidth."""
+  return json.loads(data)["jobs"][0]["write"]["bw"] * 1024
+
+
+# pylint: disable=unused-argument
+def read_io_ops(data: str, **kwargs) -> float:
+  """File I/O operations per second."""
+  return float(json.loads(data)["jobs"][0]["read"]["iops"])
+
+
+# pylint: disable=unused-argument
+def write_io_ops(data: str, **kwargs) -> float:
+  """File I/O operations per second."""
+  return float(json.loads(data)["jobs"][0]["write"]["iops"])
+
+
+# Change function names so we just print "bandwidth" and "io_ops".
+read_bandwidth.__name__ = "bandwidth"
+write_bandwidth.__name__ = "bandwidth"
+read_io_ops.__name__ = "io_ops"
+write_io_ops.__name__ = "io_ops"
diff --git a/benchmarks/workloads/fio/fio_test.py b/benchmarks/workloads/fio/fio_test.py
new file mode 100644
index 000000000..04a6eeb7e
--- /dev/null
+++ b/benchmarks/workloads/fio/fio_test.py
@@ -0,0 +1,44 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Parser tests."""
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import fio
+
+
+def test_read_io_ops():
+  """Test read ops parser."""
+  assert fio.read_io_ops(fio.sample()) == 0.0
+
+
+def test_write_io_ops():
+  """Test write ops parser."""
+  assert fio.write_io_ops(fio.sample()) == 438367.892977
+
+
+def test_read_bandwidth():
+  """Test read bandwidth parser."""
+  assert fio.read_bandwidth(fio.sample()) == 0.0
+
+
+def test_write_bandwith():
+  """Test write bandwidth parser."""
+  assert fio.write_bandwidth(fio.sample()) == 1753471 * 1024
+
+
+if __name__ == "__main__":
+  sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/httpd/BUILD b/benchmarks/workloads/httpd/BUILD
new file mode 100644
index 000000000..83f3c71a0
--- /dev/null
+++ b/benchmarks/workloads/httpd/BUILD
@@ -0,0 +1,11 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/httpd/Dockerfile b/benchmarks/workloads/httpd/Dockerfile
new file mode 100644
index 000000000..5259c8f4f
--- /dev/null
+++ b/benchmarks/workloads/httpd/Dockerfile
@@ -0,0 +1,27 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+        && apt-get update \
+        && apt-get install -y \
+            apache2 \
+        && rm -rf /var/lib/apt/lists/*
+
+# Link the htdoc directory to tmp.
+RUN mkdir -p /usr/local/apache2/htdocs && \
+        cd /usr/local/apache2 && ln -s /tmp htdocs
+
+# Generate a bunch of relevant files.
+RUN mkdir -p /local && \
+        for size in 1 10 100 1000 1024 10240; do \
+                dd if=/dev/zero of=/local/latin${size}k.txt count=${size} bs=1024; \
+        done
+
+# Standard settings.
+ENV APACHE_RUN_DIR /tmp
+ENV APACHE_RUN_USER nobody
+ENV APACHE_RUN_GROUP nogroup
+ENV APACHE_LOG_DIR /tmp
+ENV APACHE_PID_FILE /tmp/apache.pid
+
+# Copy on start-up; serve everything from /tmp (including the configuration).
+CMD ["sh", "-c", "cp -a /local/* /tmp && apache2 -c \"ServerName localhost\" -c \"DocumentRoot /tmp\" -X"]
diff --git a/benchmarks/workloads/iperf/BUILD b/benchmarks/workloads/iperf/BUILD
new file mode 100644
index 000000000..fe0acbfce
--- /dev/null
+++ b/benchmarks/workloads/iperf/BUILD
@@ -0,0 +1,35 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "iperf",
+    srcs = ["__init__.py"],
+)
+
+py_test(
+    name = "iperf_test",
+    srcs = ["iperf_test.py"],
+    python_version = "PY3",
+    deps = [
+        ":iperf",
+        requirement("attrs", False),
+        requirement("atomicwrites", False),
+        requirement("more-itertools", False),
+        requirement("pathlib2", False),
+        requirement("pluggy", False),
+        requirement("py", False),
+        requirement("pytest", True),
+        requirement("six", False),
+    ],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/iperf/Dockerfile b/benchmarks/workloads/iperf/Dockerfile
new file mode 100644
index 000000000..9704c506c
--- /dev/null
+++ b/benchmarks/workloads/iperf/Dockerfile
@@ -0,0 +1,14 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+        && apt-get update \
+        && apt-get install -y \
+            iperf \
+        && rm -rf /var/lib/apt/lists/*
+
+# Accept a host parameter.
+ENV host ""
+ENV port 5001
+
+# Start as client if the host is provided.
+CMD ["sh", "-c", "test -z \"${host}\" && iperf -s || iperf -f K --realtime -c ${host} -p ${port}"]
diff --git a/benchmarks/workloads/iperf/__init__.py b/benchmarks/workloads/iperf/__init__.py
new file mode 100644
index 000000000..3817a7ade
--- /dev/null
+++ b/benchmarks/workloads/iperf/__init__.py
@@ -0,0 +1,40 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""iperf."""
+
+import re
+
+SAMPLE_DATA = """
+------------------------------------------------------------
+Client connecting to 10.138.15.215, TCP port 32779
+TCP window size: 45.0 KByte (default)
+------------------------------------------------------------
+[  3] local 10.138.15.216 port 32866 connected with 10.138.15.215 port 32779
+[ ID] Interval       Transfer     Bandwidth
+[  3]  0.0-10.0 sec  459520 KBytes  45900 KBytes/sec
+
+"""
+
+
+# pylint: disable=unused-argument
+def sample(**kwargs) -> str:
+  return SAMPLE_DATA
+
+
+# pylint: disable=unused-argument
+def bandwidth(data: str, **kwargs) -> float:
+  """Calculate the bandwidth."""
+  regex = r"\[\s*\d+\][^\n]+\s+(\d+\.?\d*)\s+KBytes/sec"
+  res = re.compile(regex).search(data)
+  return float(res.group(1)) * 1000
diff --git a/benchmarks/workloads/iperf/iperf_test.py b/benchmarks/workloads/iperf/iperf_test.py
new file mode 100644
index 000000000..6959b7e8a
--- /dev/null
+++ b/benchmarks/workloads/iperf/iperf_test.py
@@ -0,0 +1,28 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for iperf."""
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import iperf
+
+
+def test_bandwidth():
+  assert iperf.bandwidth(iperf.sample()) == 45900 * 1000
+
+
+if __name__ == "__main__":
+  sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/netcat/BUILD b/benchmarks/workloads/netcat/BUILD
new file mode 100644
index 000000000..83f3c71a0
--- /dev/null
+++ b/benchmarks/workloads/netcat/BUILD
@@ -0,0 +1,11 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/netcat/Dockerfile b/benchmarks/workloads/netcat/Dockerfile
new file mode 100644
index 000000000..d8548d89a
--- /dev/null
+++ b/benchmarks/workloads/netcat/Dockerfile
@@ -0,0 +1,14 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+        && apt-get update \
+        && apt-get install -y \
+            netcat \
+        && rm -rf /var/lib/apt/lists/*
+
+# Accept a host and port parameter.
+ENV host localhost
+ENV port 8080
+
+# Spin until we make a successful request.
+CMD ["sh", "-c", "while ! nc -zv $host $port; do true; done"]
diff --git a/benchmarks/workloads/nginx/BUILD b/benchmarks/workloads/nginx/BUILD
new file mode 100644
index 000000000..83f3c71a0
--- /dev/null
+++ b/benchmarks/workloads/nginx/BUILD
@@ -0,0 +1,11 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/nginx/Dockerfile b/benchmarks/workloads/nginx/Dockerfile
new file mode 100644
index 000000000..b64eb52ae
--- /dev/null
+++ b/benchmarks/workloads/nginx/Dockerfile
@@ -0,0 +1 @@
+FROM nginx:1.15.10
diff --git a/benchmarks/workloads/node/BUILD b/benchmarks/workloads/node/BUILD
new file mode 100644
index 000000000..59460d02f
--- /dev/null
+++ b/benchmarks/workloads/node/BUILD
@@ -0,0 +1,13 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+        "index.js",
+        "package.json",
+    ],
+)
diff --git a/benchmarks/workloads/node/Dockerfile b/benchmarks/workloads/node/Dockerfile
new file mode 100644
index 000000000..139a38bf5
--- /dev/null
+++ b/benchmarks/workloads/node/Dockerfile
@@ -0,0 +1,2 @@
+FROM node:onbuild
+CMD ["node", "index.js"]
diff --git a/benchmarks/workloads/node/index.js b/benchmarks/workloads/node/index.js
new file mode 100644
index 000000000..584158462
--- /dev/null
+++ b/benchmarks/workloads/node/index.js
@@ -0,0 +1,28 @@
+'use strict';
+
+var start = new Date().getTime();
+
+// Load dependencies to simulate an average nodejs app.
+var req_0 = require('async');
+var req_1 = require('bluebird');
+var req_2 = require('firebase');
+var req_3 = require('firebase-admin');
+var req_4 = require('@google-cloud/container');
+var req_5 = require('@google-cloud/logging');
+var req_6 = require('@google-cloud/monitoring');
+var req_7 = require('@google-cloud/spanner');
+var req_8 = require('lodash');
+var req_9 = require('mailgun-js');
+var req_10 = require('request');
+var express = require('express');
+var app = express();
+
+var loaded = new Date().getTime() - start;
+app.get('/', function(req, res) {
+  res.send('Hello World!<br>Loaded in ' + loaded + 'ms');
+});
+
+console.log('Loaded in ' + loaded + ' ms');
+app.listen(8080, function() {
+  console.log('Listening on port 8080...');
+});
diff --git a/benchmarks/workloads/node/package.json b/benchmarks/workloads/node/package.json
new file mode 100644
index 000000000..c00b9b3cb
--- /dev/null
+++ b/benchmarks/workloads/node/package.json
@@ -0,0 +1,19 @@
+{
+  "name": "node",
+  "version": "1.0.0",
+  "main": "index.js",
+  "dependencies": {
+    "@google-cloud/container": "^0.3.0",
+    "@google-cloud/logging": "^4.2.0",
+    "@google-cloud/monitoring": "^0.6.0",
+    "@google-cloud/spanner": "^2.2.1",
+    "async": "^2.6.1",
+    "bluebird": "^3.5.3",
+    "express": "^4.16.4",
+    "firebase": "^5.7.2",
+    "firebase-admin": "^6.4.0",
+    "lodash": "^4.17.11",
+    "mailgun-js": "^0.22.0",
+    "request": "^2.88.0"
+  }
+}
diff --git a/benchmarks/workloads/node_template/BUILD b/benchmarks/workloads/node_template/BUILD
new file mode 100644
index 000000000..ae7f121d3
--- /dev/null
+++ b/benchmarks/workloads/node_template/BUILD
@@ -0,0 +1,15 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+        "index.hbs",
+        "index.js",
+        "package.json",
+        "package-lock.json",
+    ],
+)
diff --git a/benchmarks/workloads/node_template/Dockerfile b/benchmarks/workloads/node_template/Dockerfile
new file mode 100644
index 000000000..7eb065d54
--- /dev/null
+++ b/benchmarks/workloads/node_template/Dockerfile
@@ -0,0 +1,5 @@
+FROM node:onbuild
+
+ENV host "127.0.0.1"
+
+CMD ["sh", "-c", "node index.js ${host}"]
diff --git a/benchmarks/workloads/node_template/index.hbs b/benchmarks/workloads/node_template/index.hbs
new file mode 100644
index 000000000..03feceb75
--- /dev/null
+++ b/benchmarks/workloads/node_template/index.hbs
@@ -0,0 +1,8 @@
+<!DOCTYPE html>
+<html>
+<body>
+    {{#each text}}
+        <p>{{this}}</p>
+    {{/each}}
+</body>
+</html>
diff --git a/benchmarks/workloads/node_template/index.js b/benchmarks/workloads/node_template/index.js
new file mode 100644
index 000000000..04a27f356
--- /dev/null
+++ b/benchmarks/workloads/node_template/index.js
@@ -0,0 +1,43 @@
+const app = require('express')();
+const path = require('path');
+const redis = require('redis');
+const srs = require('secure-random-string');
+
+// The hostname is the first argument.
+const host_name = process.argv[2];
+
+var client = redis.createClient({host: host_name, detect_buffers: true});
+
+app.set('views', __dirname);
+app.set('view engine', 'hbs');
+
+app.get('/', (req, res) => {
+  var tmp = [];
+  /* Pull four random keys from the redis server. */
+  for (i = 0; i < 4; i++) {
+    client.get(Math.floor(Math.random() * (100)), function(err, reply) {
+      tmp.push(reply.toString());
+    });
+  }
+
+  res.render('index', {text: tmp});
+});
+
+/**
+ * Securely generate a random string.
+ * @param {number} len
+ * @return {string}
+ */
+function randomBody(len) {
+  return srs({alphanumeric: true, length: len});
+}
+
+/** Mutates one hundred keys randomly. */
+function generateText() {
+  for (i = 0; i < 100; i++) {
+    client.set(i, randomBody(1024));
+  }
+}
+
+generateText();
+app.listen(8080);
diff --git a/benchmarks/workloads/node_template/package-lock.json b/benchmarks/workloads/node_template/package-lock.json
new file mode 100644
index 000000000..1653597a1
--- /dev/null
+++ b/benchmarks/workloads/node_template/package-lock.json
@@ -0,0 +1,476 @@
+{
+  "name": "nodedum",
+  "version": "1.0.0",
+  "lockfileVersion": 1,
+  "requires": true,
+  "dependencies": {
+    "accepts": {
+      "version": "1.3.5",
+      "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.5.tgz",
+      "integrity": "sha1-63d99gEXI6OxTopywIBcjoZ0a9I=",
+      "requires": {
+        "mime-types": "~2.1.18",
+        "negotiator": "0.6.1"
+      }
+    },
+    "array-flatten": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz",
+      "integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI="
+    },
+    "async": {
+      "version": "2.6.2",
+      "resolved": "https://registry.npmjs.org/async/-/async-2.6.2.tgz",
+      "integrity": "sha512-H1qVYh1MYhEEFLsP97cVKqCGo7KfCyTt6uEWqsTBr9SO84oK9Uwbyd/yCW+6rKJLHksBNUVWZDAjfS+Ccx0Bbg==",
+      "requires": {
+        "lodash": "^4.17.11"
+      }
+    },
+    "body-parser": {
+      "version": "1.18.3",
+      "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.18.3.tgz",
+      "integrity": "sha1-WykhmP/dVTs6DyDe0FkrlWlVyLQ=",
+      "requires": {
+        "bytes": "3.0.0",
+        "content-type": "~1.0.4",
+        "debug": "2.6.9",
+        "depd": "~1.1.2",
+        "http-errors": "~1.6.3",
+        "iconv-lite": "0.4.23",
+        "on-finished": "~2.3.0",
+        "qs": "6.5.2",
+        "raw-body": "2.3.3",
+        "type-is": "~1.6.16"
+      }
+    },
+    "bytes": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.0.0.tgz",
+      "integrity": "sha1-0ygVQE1olpn4Wk6k+odV3ROpYEg="
+    },
+    "commander": {
+      "version": "2.20.0",
+      "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.0.tgz",
+      "integrity": "sha512-7j2y+40w61zy6YC2iRNpUe/NwhNyoXrYpHMrSunaMG64nRnaf96zO/KMQR4OyN/UnE5KLyEBnKHd4aG3rskjpQ==",
+      "optional": true
+    },
+    "content-disposition": {
+      "version": "0.5.2",
+      "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.2.tgz",
+      "integrity": "sha1-DPaLud318r55YcOoUXjLhdunjLQ="
+    },
+    "content-type": {
+      "version": "1.0.4",
+      "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz",
+      "integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA=="
+    },
+    "cookie": {
+      "version": "0.3.1",
+      "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.3.1.tgz",
+      "integrity": "sha1-5+Ch+e9DtMi6klxcWpboBtFoc7s="
+    },
+    "cookie-signature": {
+      "version": "1.0.6",
+      "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz",
+      "integrity": "sha1-4wOogrNCzD7oylE6eZmXNNqzriw="
+    },
+    "debug": {
+      "version": "2.6.9",
+      "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
+      "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
+      "requires": {
+        "ms": "2.0.0"
+      }
+    },
+    "depd": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
+      "integrity": "sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak="
+    },
+    "destroy": {
+      "version": "1.0.4",
+      "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz",
+      "integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA="
+    },
+    "double-ended-queue": {
+      "version": "2.1.0-0",
+      "resolved": "https://registry.npmjs.org/double-ended-queue/-/double-ended-queue-2.1.0-0.tgz",
+      "integrity": "sha1-ED01J/0xUo9AGIEwyEHv3XgmTlw="
+    },
+    "ee-first": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz",
+      "integrity": "sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0="
+    },
+    "encodeurl": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
+      "integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k="
+    },
+    "escape-html": {
+      "version": "1.0.3",
+      "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz",
+      "integrity": "sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg="
+    },
+    "etag": {
+      "version": "1.8.1",
+      "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz",
+      "integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc="
+    },
+    "express": {
+      "version": "4.16.4",
+      "resolved": "https://registry.npmjs.org/express/-/express-4.16.4.tgz",
+      "integrity": "sha512-j12Uuyb4FMrd/qQAm6uCHAkPtO8FDTRJZBDd5D2KOL2eLaz1yUNdUB/NOIyq0iU4q4cFarsUCrnFDPBcnksuOg==",
+      "requires": {
+        "accepts": "~1.3.5",
+        "array-flatten": "1.1.1",
+        "body-parser": "1.18.3",
+        "content-disposition": "0.5.2",
+        "content-type": "~1.0.4",
+        "cookie": "0.3.1",
+        "cookie-signature": "1.0.6",
+        "debug": "2.6.9",
+        "depd": "~1.1.2",
+        "encodeurl": "~1.0.2",
+        "escape-html": "~1.0.3",
+        "etag": "~1.8.1",
+        "finalhandler": "1.1.1",
+        "fresh": "0.5.2",
+        "merge-descriptors": "1.0.1",
+        "methods": "~1.1.2",
+        "on-finished": "~2.3.0",
+        "parseurl": "~1.3.2",
+        "path-to-regexp": "0.1.7",
+        "proxy-addr": "~2.0.4",
+        "qs": "6.5.2",
+        "range-parser": "~1.2.0",
+        "safe-buffer": "5.1.2",
+        "send": "0.16.2",
+        "serve-static": "1.13.2",
+        "setprototypeof": "1.1.0",
+        "statuses": "~1.4.0",
+        "type-is": "~1.6.16",
+        "utils-merge": "1.0.1",
+        "vary": "~1.1.2"
+      }
+    },
+    "finalhandler": {
+      "version": "1.1.1",
+      "resolved": "http://registry.npmjs.org/finalhandler/-/finalhandler-1.1.1.tgz",
+      "integrity": "sha512-Y1GUDo39ez4aHAw7MysnUD5JzYX+WaIj8I57kO3aEPT1fFRL4sr7mjei97FgnwhAyyzRYmQZaTHb2+9uZ1dPtg==",
+      "requires": {
+        "debug": "2.6.9",
+        "encodeurl": "~1.0.2",
+        "escape-html": "~1.0.3",
+        "on-finished": "~2.3.0",
+        "parseurl": "~1.3.2",
+        "statuses": "~1.4.0",
+        "unpipe": "~1.0.0"
+      }
+    },
+    "foreachasync": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/foreachasync/-/foreachasync-3.0.0.tgz",
+      "integrity": "sha1-VQKYfchxS+M5IJfzLgBxyd7gfPY="
+    },
+    "forwarded": {
+      "version": "0.1.2",
+      "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz",
+      "integrity": "sha1-mMI9qxF1ZXuMBXPozszZGw/xjIQ="
+    },
+    "fresh": {
+      "version": "0.5.2",
+      "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz",
+      "integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac="
+    },
+    "handlebars": {
+      "version": "4.0.14",
+      "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.0.14.tgz",
+      "integrity": "sha512-E7tDoyAA8ilZIV3xDJgl18sX3M8xB9/fMw8+mfW4msLW8jlX97bAnWgT3pmaNXuvzIEgSBMnAHfuXsB2hdzfow==",
+      "requires": {
+        "async": "^2.5.0",
+        "optimist": "^0.6.1",
+        "source-map": "^0.6.1",
+        "uglify-js": "^3.1.4"
+      }
+    },
+    "hbs": {
+      "version": "4.0.4",
+      "resolved": "https://registry.npmjs.org/hbs/-/hbs-4.0.4.tgz",
+      "integrity": "sha512-esVlyV/V59mKkwFai5YmPRSNIWZzhqL5YMN0++ueMxyK1cCfPa5f6JiHtapPKAIVAhQR6rpGxow0troav9WMEg==",
+      "requires": {
+        "handlebars": "4.0.14",
+        "walk": "2.3.9"
+      }
+    },
+    "http-errors": {
+      "version": "1.6.3",
+      "resolved": "http://registry.npmjs.org/http-errors/-/http-errors-1.6.3.tgz",
+      "integrity": "sha1-i1VoC7S+KDoLW/TqLjhYC+HZMg0=",
+      "requires": {
+        "depd": "~1.1.2",
+        "inherits": "2.0.3",
+        "setprototypeof": "1.1.0",
+        "statuses": ">= 1.4.0 < 2"
+      }
+    },
+    "iconv-lite": {
+      "version": "0.4.23",
+      "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.23.tgz",
+      "integrity": "sha512-neyTUVFtahjf0mB3dZT77u+8O0QB89jFdnBkd5P1JgYPbPaia3gXXOVL2fq8VyU2gMMD7SaN7QukTB/pmXYvDA==",
+      "requires": {
+        "safer-buffer": ">= 2.1.2 < 3"
+      }
+    },
+    "inherits": {
+      "version": "2.0.3",
+      "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz",
+      "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4="
+    },
+    "ipaddr.js": {
+      "version": "1.8.0",
+      "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.8.0.tgz",
+      "integrity": "sha1-6qM9bd16zo9/b+DJygRA5wZzix4="
+    },
+    "lodash": {
+      "version": "4.17.11",
+      "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.11.tgz",
+      "integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg=="
+    },
+    "media-typer": {
+      "version": "0.3.0",
+      "resolved": "http://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
+      "integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g="
+    },
+    "merge-descriptors": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz",
+      "integrity": "sha1-sAqqVW3YtEVoFQ7J0blT8/kMu2E="
+    },
+    "methods": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz",
+      "integrity": "sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4="
+    },
+    "mime": {
+      "version": "1.4.1",
+      "resolved": "https://registry.npmjs.org/mime/-/mime-1.4.1.tgz",
+      "integrity": "sha512-KI1+qOZu5DcW6wayYHSzR/tXKCDC5Om4s1z2QJjDULzLcmf3DvzS7oluY4HCTrc+9FiKmWUgeNLg7W3uIQvxtQ=="
+    },
+    "mime-db": {
+      "version": "1.37.0",
+      "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.37.0.tgz",
+      "integrity": "sha512-R3C4db6bgQhlIhPU48fUtdVmKnflq+hRdad7IyKhtFj06VPNVdk2RhiYL3UjQIlso8L+YxAtFkobT0VK+S/ybg=="
+    },
+    "mime-types": {
+      "version": "2.1.21",
+      "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.21.tgz",
+      "integrity": "sha512-3iL6DbwpyLzjR3xHSFNFeb9Nz/M8WDkX33t1GFQnFOllWk8pOrh/LSrB5OXlnlW5P9LH73X6loW/eogc+F5lJg==",
+      "requires": {
+        "mime-db": "~1.37.0"
+      }
+    },
+    "minimist": {
+      "version": "0.0.10",
+      "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.10.tgz",
+      "integrity": "sha1-3j+YVD2/lggr5IrRoMfNqDYwHc8="
+    },
+    "ms": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
+      "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
+    },
+    "negotiator": {
+      "version": "0.6.1",
+      "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.1.tgz",
+      "integrity": "sha1-KzJxhOiZIQEXeyhWP7XnECrNDKk="
+    },
+    "on-finished": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.3.0.tgz",
+      "integrity": "sha1-IPEzZIGwg811M3mSoWlxqi2QaUc=",
+      "requires": {
+        "ee-first": "1.1.1"
+      }
+    },
+    "optimist": {
+      "version": "0.6.1",
+      "resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz",
+      "integrity": "sha1-2j6nRob6IaGaERwybpDrFaAZZoY=",
+      "requires": {
+        "minimist": "~0.0.1",
+        "wordwrap": "~0.0.2"
+      }
+    },
+    "parseurl": {
+      "version": "1.3.2",
+      "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.2.tgz",
+      "integrity": "sha1-/CidTtiZMRlGDBViUyYs3I3mW/M="
+    },
+    "path-to-regexp": {
+      "version": "0.1.7",
+      "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz",
+      "integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w="
+    },
+    "proxy-addr": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.4.tgz",
+      "integrity": "sha512-5erio2h9jp5CHGwcybmxmVqHmnCBZeewlfJ0pex+UW7Qny7OOZXTtH56TGNyBizkgiOwhJtMKrVzDTeKcySZwA==",
+      "requires": {
+        "forwarded": "~0.1.2",
+        "ipaddr.js": "1.8.0"
+      }
+    },
+    "qs": {
+      "version": "6.5.2",
+      "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz",
+      "integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA=="
+    },
+    "range-parser": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.0.tgz",
+      "integrity": "sha1-9JvmtIeJTdxA3MlKMi9hEJLgDV4="
+    },
+    "raw-body": {
+      "version": "2.3.3",
+      "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.3.3.tgz",
+      "integrity": "sha512-9esiElv1BrZoI3rCDuOuKCBRbuApGGaDPQfjSflGxdy4oyzqghxu6klEkkVIvBje+FF0BX9coEv8KqW6X/7njw==",
+      "requires": {
+        "bytes": "3.0.0",
+        "http-errors": "1.6.3",
+        "iconv-lite": "0.4.23",
+        "unpipe": "1.0.0"
+      }
+    },
+    "redis": {
+      "version": "2.8.0",
+      "resolved": "https://registry.npmjs.org/redis/-/redis-2.8.0.tgz",
+      "integrity": "sha512-M1OkonEQwtRmZv4tEWF2VgpG0JWJ8Fv1PhlgT5+B+uNq2cA3Rt1Yt/ryoR+vQNOQcIEgdCdfH0jr3bDpihAw1A==",
+      "requires": {
+        "double-ended-queue": "^2.1.0-0",
+        "redis-commands": "^1.2.0",
+        "redis-parser": "^2.6.0"
+      },
+      "dependencies": {
+        "redis-commands": {
+          "version": "1.4.0",
+          "resolved": "https://registry.npmjs.org/redis-commands/-/redis-commands-1.4.0.tgz",
+          "integrity": "sha512-cu8EF+MtkwI4DLIT0x9P8qNTLFhQD4jLfxLR0cCNkeGzs87FN6879JOJwNQR/1zD7aSYNbU0hgsV9zGY71Itvw=="
+        },
+        "redis-parser": {
+          "version": "2.6.0",
+          "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-2.6.0.tgz",
+          "integrity": "sha1-Uu0J2srBCPGmMcB+m2mUHnoZUEs="
+        }
+      }
+    },
+    "safe-buffer": {
+      "version": "5.1.2",
+      "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
+      "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g=="
+    },
+    "safer-buffer": {
+      "version": "2.1.2",
+      "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
+      "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="
+    },
+    "secure-random-string": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/secure-random-string/-/secure-random-string-1.1.0.tgz",
+      "integrity": "sha512-V/h8jqoz58zklNGybVhP++cWrxEPXlLM/6BeJ4e0a8zlb4BsbYRzFs16snrxByPa5LUxCVTD3M6EYIVIHR1fAg=="
+    },
+    "send": {
+      "version": "0.16.2",
+      "resolved": "https://registry.npmjs.org/send/-/send-0.16.2.tgz",
+      "integrity": "sha512-E64YFPUssFHEFBvpbbjr44NCLtI1AohxQ8ZSiJjQLskAdKuriYEP6VyGEsRDH8ScozGpkaX1BGvhanqCwkcEZw==",
+      "requires": {
+        "debug": "2.6.9",
+        "depd": "~1.1.2",
+        "destroy": "~1.0.4",
+        "encodeurl": "~1.0.2",
+        "escape-html": "~1.0.3",
+        "etag": "~1.8.1",
+        "fresh": "0.5.2",
+        "http-errors": "~1.6.2",
+        "mime": "1.4.1",
+        "ms": "2.0.0",
+        "on-finished": "~2.3.0",
+        "range-parser": "~1.2.0",
+        "statuses": "~1.4.0"
+      }
+    },
+    "serve-static": {
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.13.2.tgz",
+      "integrity": "sha512-p/tdJrO4U387R9oMjb1oj7qSMaMfmOyd4j9hOFoxZe2baQszgHcSWjuya/CiT5kgZZKRudHNOA0pYXOl8rQ5nw==",
+      "requires": {
+        "encodeurl": "~1.0.2",
+        "escape-html": "~1.0.3",
+        "parseurl": "~1.3.2",
+        "send": "0.16.2"
+      }
+    },
+    "setprototypeof": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.0.tgz",
+      "integrity": "sha512-BvE/TwpZX4FXExxOxZyRGQQv651MSwmWKZGqvmPcRIjDqWub67kTKuIMx43cZZrS/cBBzwBcNDWoFxt2XEFIpQ=="
+    },
+    "source-map": {
+      "version": "0.6.1",
+      "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
+      "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g=="
+    },
+    "statuses": {
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.4.0.tgz",
+      "integrity": "sha512-zhSCtt8v2NDrRlPQpCNtw/heZLtfUDqxBM1udqikb/Hbk52LK4nQSwr10u77iopCW5LsyHpuXS0GnEc48mLeew=="
+    },
+    "type-is": {
+      "version": "1.6.16",
+      "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.16.tgz",
+      "integrity": "sha512-HRkVv/5qY2G6I8iab9cI7v1bOIdhm94dVjQCPFElW9W+3GeDOSHmy2EBYe4VTApuzolPcmgFTN3ftVJRKR2J9Q==",
+      "requires": {
+        "media-typer": "0.3.0",
+        "mime-types": "~2.1.18"
+      }
+    },
+    "uglify-js": {
+      "version": "3.5.9",
+      "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.5.9.tgz",
+      "integrity": "sha512-WpT0RqsDtAWPNJK955DEnb6xjymR8Fn0OlK4TT4pS0ASYsVPqr5ELhgwOwLCP5J5vHeJ4xmMmz3DEgdqC10JeQ==",
+      "optional": true,
+      "requires": {
+        "commander": "~2.20.0",
+        "source-map": "~0.6.1"
+      }
+    },
+    "unpipe": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
+      "integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw="
+    },
+    "utils-merge": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz",
+      "integrity": "sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM="
+    },
+    "vary": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz",
+      "integrity": "sha1-IpnwLG3tMNSllhsLn3RSShj2NPw="
+    },
+    "walk": {
+      "version": "2.3.9",
+      "resolved": "https://registry.npmjs.org/walk/-/walk-2.3.9.tgz",
+      "integrity": "sha1-MbTbZnjyrgHDnqn7hyWpAx5Vins=",
+      "requires": {
+        "foreachasync": "^3.0.0"
+      }
+    },
+    "wordwrap": {
+      "version": "0.0.3",
+      "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz",
+      "integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc="
+    }
+  }
+}
diff --git a/benchmarks/workloads/node_template/package.json b/benchmarks/workloads/node_template/package.json
new file mode 100644
index 000000000..7dcadd523
--- /dev/null
+++ b/benchmarks/workloads/node_template/package.json
@@ -0,0 +1,19 @@
+{
+  "name": "nodedum",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "express": "^4.16.4",
+    "hbs": "^4.0.4",
+    "redis": "^2.8.0",
+    "redis-commands": "^1.2.0",
+    "redis-parser": "^2.6.0",
+    "secure-random-string": "^1.1.0"
+  }
+}
diff --git a/benchmarks/workloads/redis/BUILD b/benchmarks/workloads/redis/BUILD
new file mode 100644
index 000000000..83f3c71a0
--- /dev/null
+++ b/benchmarks/workloads/redis/BUILD
@@ -0,0 +1,11 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/redis/Dockerfile b/benchmarks/workloads/redis/Dockerfile
new file mode 100644
index 000000000..0f17249af
--- /dev/null
+++ b/benchmarks/workloads/redis/Dockerfile
@@ -0,0 +1 @@
+FROM redis:5.0.4
diff --git a/benchmarks/workloads/redisbenchmark/BUILD b/benchmarks/workloads/redisbenchmark/BUILD
new file mode 100644
index 000000000..d40e75a3a
--- /dev/null
+++ b/benchmarks/workloads/redisbenchmark/BUILD
@@ -0,0 +1,35 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "redisbenchmark",
+    srcs = ["__init__.py"],
+)
+
+py_test(
+    name = "redisbenchmark_test",
+    srcs = ["redisbenchmark_test.py"],
+    python_version = "PY3",
+    deps = [
+        ":redisbenchmark",
+        requirement("attrs", False),
+        requirement("atomicwrites", False),
+        requirement("more-itertools", False),
+        requirement("pathlib2", False),
+        requirement("pluggy", False),
+        requirement("py", False),
+        requirement("pytest", True),
+        requirement("six", False),
+    ],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/redisbenchmark/Dockerfile b/benchmarks/workloads/redisbenchmark/Dockerfile
new file mode 100644
index 000000000..f94f6442e
--- /dev/null
+++ b/benchmarks/workloads/redisbenchmark/Dockerfile
@@ -0,0 +1,4 @@
+FROM redis:5.0.4
+ENV host localhost
+ENV port 6379
+CMD ["sh", "-c", "redis-benchmark --csv -h ${host} -p ${port} ${flags}"]
diff --git a/benchmarks/workloads/redisbenchmark/__init__.py b/benchmarks/workloads/redisbenchmark/__init__.py
new file mode 100644
index 000000000..229cef5fa
--- /dev/null
+++ b/benchmarks/workloads/redisbenchmark/__init__.py
@@ -0,0 +1,85 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Redis-benchmark tool."""
+
+import re
+
+OPERATIONS = [
+    "PING_INLINE",
+    "PING_BULK",
+    "SET",
+    "GET",
+    "INCR",
+    "LPUSH",
+    "RPUSH",
+    "LPOP",
+    "RPOP",
+    "SADD",
+    "HSET",
+    "SPOP",
+    "LRANGE_100",
+    "LRANGE_300",
+    "LRANGE_500",
+    "LRANGE_600",
+    "MSET",
+]
+
+METRICS = dict()
+
+SAMPLE_DATA = """
+"PING_INLINE","48661.80"
+"PING_BULK","50301.81"
+"SET","48923.68"
+"GET","49382.71"
+"INCR","49975.02"
+"LPUSH","49875.31"
+"RPUSH","50276.52"
+"LPOP","50327.12"
+"RPOP","50556.12"
+"SADD","49504.95"
+"HSET","49504.95"
+"SPOP","50025.02"
+"LPUSH (needed to benchmark LRANGE)","48875.86"
+"LRANGE_100 (first 100 elements)","33955.86"
+"LRANGE_300 (first 300 elements)","16550.81"
+"LRANGE_500 (first 450 elements)","13653.74"
+"LRANGE_600 (first 600 elements)","11219.57"
+"MSET (10 keys)","44682.75"
+"""
+
+
+# pylint: disable=unused-argument
+def sample(**kwargs) -> str:
+  return SAMPLE_DATA
+
+
+# Bind a metric for each operation noted above.
+for op in OPERATIONS:
+
+  def bind(metric):
+    """Bind op to a new scope."""
+
+    # pylint: disable=unused-argument
+    def parse(data: str, **kwargs) -> float:
+      """Operation throughput in requests/sec."""
+      regex = r"\"" + metric + r"( .*)?\",\"(\d*.\d*)"
+      res = re.compile(regex).search(data)
+      if res:
+        return float(res.group(2))
+      return 0.0
+
+    parse.__name__ = metric
+    return parse
+
+  METRICS[op] = bind(op)
diff --git a/benchmarks/workloads/redisbenchmark/redisbenchmark_test.py b/benchmarks/workloads/redisbenchmark/redisbenchmark_test.py
new file mode 100644
index 000000000..419ced059
--- /dev/null
+++ b/benchmarks/workloads/redisbenchmark/redisbenchmark_test.py
@@ -0,0 +1,51 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Parser test."""
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import redisbenchmark
+
+RESULTS = {
+    "PING_INLINE": 48661.80,
+    "PING_BULK": 50301.81,
+    "SET": 48923.68,
+    "GET": 49382.71,
+    "INCR": 49975.02,
+    "LPUSH": 49875.31,
+    "RPUSH": 50276.52,
+    "LPOP": 50327.12,
+    "RPOP": 50556.12,
+    "SADD": 49504.95,
+    "HSET": 49504.95,
+    "SPOP": 50025.02,
+    "LRANGE_100": 33955.86,
+    "LRANGE_300": 16550.81,
+    "LRANGE_500": 13653.74,
+    "LRANGE_600": 11219.57,
+    "MSET": 44682.75
+}
+
+
+def test_metrics():
+  """Test all metrics."""
+  for (metric, func) in redisbenchmark.METRICS.items():
+    res = func(redisbenchmark.sample())
+    assert float(res) == RESULTS[metric]
+
+
+if __name__ == "__main__":
+  sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/ruby/BUILD b/benchmarks/workloads/ruby/BUILD
new file mode 100644
index 000000000..9846c7e70
--- /dev/null
+++ b/benchmarks/workloads/ruby/BUILD
@@ -0,0 +1,15 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+        "Gemfile",
+        "Gemfile.lock",
+        "config.ru",
+        "index.rb",
+    ],
+)
diff --git a/benchmarks/workloads/ruby/Dockerfile b/benchmarks/workloads/ruby/Dockerfile
new file mode 100644
index 000000000..a9a7a7086
--- /dev/null
+++ b/benchmarks/workloads/ruby/Dockerfile
@@ -0,0 +1,28 @@
+# example based on https://github.com/errm/fib
+
+FROM ruby:2.5
+
+RUN apt-get update -qq && apt-get install -y build-essential libpq-dev nodejs libsodium-dev
+
+# Set an environment variable where the Rails app is installed to inside of Docker image
+ENV RAILS_ROOT /var/www/app_name
+RUN mkdir -p $RAILS_ROOT
+
+# Set working directory
+WORKDIR $RAILS_ROOT
+
+# Setting env up
+ENV RAILS_ENV='production'
+ENV RACK_ENV='production'
+
+# Adding gems
+COPY Gemfile Gemfile
+COPY Gemfile.lock Gemfile.lock
+RUN bundle install --jobs 20 --retry 5 --without development test
+
+# Adding project files
+COPY . .
+
+EXPOSE $PORT
+STOPSIGNAL SIGINT
+CMD ["bundle", "exec", "puma", "config.ru"]
diff --git a/benchmarks/workloads/ruby/Gemfile b/benchmarks/workloads/ruby/Gemfile
new file mode 100644
index 000000000..8f1bdad6e
--- /dev/null
+++ b/benchmarks/workloads/ruby/Gemfile
@@ -0,0 +1,12 @@
+source "https://rubygems.org"
+# load a bunch of dependencies to take up memory
+gem "sinatra"
+gem "puma"
+gem "redis"
+gem 'rake'
+gem 'squid', '~> 1.4'
+gem 'cassandra-driver'
+gem 'ruby-fann'
+gem 'rbnacl'
+gem 'bcrypt'
+gem "activemerchant"
\ No newline at end of file
diff --git a/benchmarks/workloads/ruby/Gemfile.lock b/benchmarks/workloads/ruby/Gemfile.lock
new file mode 100644
index 000000000..b44817bd3
--- /dev/null
+++ b/benchmarks/workloads/ruby/Gemfile.lock
@@ -0,0 +1,55 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    activesupport (5.2.3)
+      concurrent-ruby (~> 1.0, >= 1.0.2)
+      i18n (>= 0.7, < 2)
+      minitest (~> 5.1)
+      tzinfo (~> 1.1)
+    cassandra-driver (3.2.3)
+      ione (~> 1.2)
+    concurrent-ruby (1.1.5)
+    i18n (1.6.0)
+      concurrent-ruby (~> 1.0)
+    ione (1.2.4)
+    minitest (5.11.3)
+    mustermann (1.0.3)
+    pdf-core (0.7.0)
+    prawn (2.2.2)
+      pdf-core (~> 0.7.0)
+      ttfunk (~> 1.5)
+    puma (3.12.1)
+    rack (2.0.7)
+    rack-protection (2.0.5)
+      rack
+    rake (12.3.2)
+    redis (4.1.1)
+    ruby-fann (1.2.6)
+    sinatra (2.0.5)
+      mustermann (~> 1.0)
+      rack (~> 2.0)
+      rack-protection (= 2.0.5)
+      tilt (~> 2.0)
+    squid (1.4.1)
+      activesupport (>= 4.0)
+      prawn (~> 2.2)
+    thread_safe (0.3.6)
+    tilt (2.0.9)
+    ttfunk (1.5.1)
+    tzinfo (1.2.5)
+      thread_safe (~> 0.1)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  cassandra-driver
+  puma
+  rake
+  redis
+  ruby-fann
+  sinatra
+  squid (~> 1.4)
+
+BUNDLED WITH
+   1.17.1
diff --git a/benchmarks/workloads/ruby/config.ru b/benchmarks/workloads/ruby/config.ru
new file mode 100755
index 000000000..fbd5acc82
--- /dev/null
+++ b/benchmarks/workloads/ruby/config.ru
@@ -0,0 +1,2 @@
+require './index'
+run Sinatra::Application
\ No newline at end of file
diff --git a/benchmarks/workloads/ruby/index.rb b/benchmarks/workloads/ruby/index.rb
new file mode 100755
index 000000000..5fa85af93
--- /dev/null
+++ b/benchmarks/workloads/ruby/index.rb
@@ -0,0 +1,14 @@
+require "sinatra"
+require "puma"
+require "redis"
+require "rake"
+require "squid"
+require "cassandra"
+require "ruby-fann"
+require "rbnacl"
+require "bcrypt"
+require "activemerchant"
+
+get "/" do
+  "Hello World!"
+end
\ No newline at end of file
diff --git a/benchmarks/workloads/ruby_template/BUILD b/benchmarks/workloads/ruby_template/BUILD
new file mode 100644
index 000000000..2b99892af
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/BUILD
@@ -0,0 +1,16 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+        "Gemfile",
+        "Gemfile.lock",
+        "config.ru",
+        "index.erb",
+        "main.rb",
+    ],
+)
diff --git a/benchmarks/workloads/ruby_template/Dockerfile b/benchmarks/workloads/ruby_template/Dockerfile
new file mode 100755
index 000000000..a06d68bf4
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/Dockerfile
@@ -0,0 +1,38 @@
+# example based on https://github.com/errm/fib
+
+FROM alpine:3.9 as build
+
+COPY Gemfile Gemfile.lock ./
+
+RUN apk add --no-cache ruby ruby-dev ruby-bundler ruby-json build-base bash \
+        && bundle install --frozen -j4 -r3 --no-cache --without development \
+        && apk del --no-cache ruby-bundler \
+        && rm -rf /usr/lib/ruby/gems/*/cache
+
+FROM alpine:3.9 as prod
+
+COPY --from=build /usr/lib/ruby/gems /usr/lib/ruby/gems
+RUN apk add --no-cache ruby ruby-json ruby-etc redis apache2-utils \
+        && ruby -e "Gem::Specification.map.each do |spec| \
+      Gem::Installer.for_spec( \
+        spec, \
+        wrappers: true, \
+        force: true, \
+        install_dir: spec.base_dir, \
+        build_args: spec.build_args, \
+      ).generate_bin \
+    end"
+
+WORKDIR /app
+COPY . /app/.
+
+ENV PORT=9292 \
+    WEB_CONCURRENCY=20 \
+    WEB_MAX_THREADS=20 \
+    RACK_ENV=production
+
+ENV host localhost
+EXPOSE $PORT
+USER nobody
+STOPSIGNAL SIGINT
+CMD ["sh", "-c", "/usr/bin/puma", "${host}"]
diff --git a/benchmarks/workloads/ruby_template/Gemfile b/benchmarks/workloads/ruby_template/Gemfile
new file mode 100755
index 000000000..ac521b32c
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/Gemfile
@@ -0,0 +1,5 @@
+source "https://rubygems.org"
+
+gem "sinatra"
+gem "puma"
+gem "redis"
\ No newline at end of file
diff --git a/benchmarks/workloads/ruby_template/Gemfile.lock b/benchmarks/workloads/ruby_template/Gemfile.lock
new file mode 100644
index 000000000..dd8d56fb7
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/Gemfile.lock
@@ -0,0 +1,26 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    mustermann (1.0.3)
+    puma (3.12.0)
+    rack (2.0.6)
+    rack-protection (2.0.5)
+      rack
+    sinatra (2.0.5)
+      mustermann (~> 1.0)
+      rack (~> 2.0)
+      rack-protection (= 2.0.5)
+      tilt (~> 2.0)
+    tilt (2.0.9)
+    redis (4.1.0)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  puma
+  sinatra
+  redis
+
+BUNDLED WITH
+   1.17.1
\ No newline at end of file
diff --git a/benchmarks/workloads/ruby_template/config.ru b/benchmarks/workloads/ruby_template/config.ru
new file mode 100755
index 000000000..b2d135cc0
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/config.ru
@@ -0,0 +1,2 @@
+require './main'
+run Sinatra::Application
\ No newline at end of file
diff --git a/benchmarks/workloads/ruby_template/index.erb b/benchmarks/workloads/ruby_template/index.erb
new file mode 100755
index 000000000..7f7300e80
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/index.erb
@@ -0,0 +1,8 @@
+<!DOCTYPE html>
+<html>
+<body>
+    <% text.each do |t| %>
+        <p><%= t %></p>
+    <% end %>
+</body>
+</html>
diff --git a/benchmarks/workloads/ruby_template/main.rb b/benchmarks/workloads/ruby_template/main.rb
new file mode 100755
index 000000000..35c239377
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/main.rb
@@ -0,0 +1,27 @@
+require "sinatra"
+require "securerandom"
+require "redis"
+
+redis_host = ENV["host"]
+$redis = Redis.new(host: redis_host)
+
+def generateText
+    for i in 0..99
+        $redis.set(i, randomBody(1024))
+    end
+end
+
+def randomBody(length)
+    return SecureRandom.alphanumeric(length)
+end
+
+generateText
+template = ERB.new(File.read('./index.erb'))
+
+get "/" do
+    texts = Array.new
+    for i in 0..4
+        texts.push($redis.get(rand(0..99)))
+    end
+    template.result_with_hash(text: texts)
+end
\ No newline at end of file
diff --git a/benchmarks/workloads/sleep/BUILD b/benchmarks/workloads/sleep/BUILD
new file mode 100644
index 000000000..83f3c71a0
--- /dev/null
+++ b/benchmarks/workloads/sleep/BUILD
@@ -0,0 +1,11 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/sleep/Dockerfile b/benchmarks/workloads/sleep/Dockerfile
new file mode 100644
index 000000000..24c72e07a
--- /dev/null
+++ b/benchmarks/workloads/sleep/Dockerfile
@@ -0,0 +1,3 @@
+FROM alpine:latest
+
+CMD ["sleep", "315360000"]
diff --git a/benchmarks/workloads/sysbench/BUILD b/benchmarks/workloads/sysbench/BUILD
new file mode 100644
index 000000000..35f4d460b
--- /dev/null
+++ b/benchmarks/workloads/sysbench/BUILD
@@ -0,0 +1,35 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "sysbench",
+    srcs = ["__init__.py"],
+)
+
+py_test(
+    name = "sysbench_test",
+    srcs = ["sysbench_test.py"],
+    python_version = "PY3",
+    deps = [
+        ":sysbench",
+        requirement("attrs", False),
+        requirement("atomicwrites", False),
+        requirement("more-itertools", False),
+        requirement("pathlib2", False),
+        requirement("pluggy", False),
+        requirement("py", False),
+        requirement("pytest", True),
+        requirement("six", False),
+    ],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/sysbench/Dockerfile b/benchmarks/workloads/sysbench/Dockerfile
new file mode 100644
index 000000000..8225e0e14
--- /dev/null
+++ b/benchmarks/workloads/sysbench/Dockerfile
@@ -0,0 +1,16 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+        && apt-get update \
+        && apt-get install -y \
+            sysbench \
+        && rm -rf /var/lib/apt/lists/*
+
+# Parameterize the tests.
+ENV test cpu
+ENV threads 1
+ENV options ""
+
+# run sysbench once as a warm-up and take the second result
+CMD ["sh", "-c", "sysbench --threads=8 --memory-total-size=5G memory run > /dev/null && \
+sysbench --threads=${threads} ${options} ${test} run"]
diff --git a/benchmarks/workloads/sysbench/__init__.py b/benchmarks/workloads/sysbench/__init__.py
new file mode 100644
index 000000000..de357b4db
--- /dev/null
+++ b/benchmarks/workloads/sysbench/__init__.py
@@ -0,0 +1,167 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Sysbench."""
+
+import re
+
+STD_REGEX = r"events per second:\s*(\d*.?\d*)\n"
+MEM_REGEX = r"Total\soperations:\s+\d*\s*\((\d*\.\d*)\sper\ssecond\)"
+ALT_REGEX = r"execution time \(avg/stddev\):\s*(\d*.?\d*)/(\d*.?\d*)"
+AVG_REGEX = r"avg:[^\n^\d]*(\d*\.?\d*)"
+
+SAMPLE_CPU_DATA = """
+sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3)
+
+Running the test with following options:
+Number of threads: 8
+Initializing random number generator from current time
+
+
+Prime numbers limit: 10000
+
+Initializing worker threads...
+
+Threads started!
+
+CPU speed:
+    events per second:  9093.38
+
+General statistics:
+    total time:                          10.0007s
+    total number of events:              90949
+
+Latency (ms):
+         min:                                  0.64
+         avg:                                  0.88
+         max:                                 24.65
+         95th percentile:                      1.55
+         sum:                              79936.91
+
+Threads fairness:
+    events (avg/stddev):           11368.6250/831.38
+    execution time (avg/stddev):   9.9921/0.01
+"""
+
+SAMPLE_MEMORY_DATA = """
+sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3)
+
+Running the test with following options:
+Number of threads: 8
+Initializing random number generator from current time
+
+
+Running memory speed test with the following options:
+  block size: 1KiB
+  total size: 102400MiB
+  operation: write
+  scope: global
+
+Initializing worker threads...
+
+Threads started!
+
+Total operations: 47999046 (9597428.64 per second)
+
+46874.07 MiB transferred (9372.49 MiB/sec)
+
+
+General statistics:
+    total time:                          5.0001s
+    total number of events:              47999046
+
+Latency (ms):
+         min:                                  0.00
+         avg:                                  0.00
+         max:                                  0.21
+         95th percentile:                      0.00
+         sum:                              33165.91
+
+Threads fairness:
+    events (avg/stddev):           5999880.7500/111242.52
+    execution time (avg/stddev):   4.1457/0.09
+"""
+
+SAMPLE_MUTEX_DATA = """
+sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3)
+
+Running the test with following options:
+Number of threads: 8
+Initializing random number generator from current time
+
+
+Initializing worker threads...
+
+Threads started!
+
+
+General statistics:
+    total time:                          3.7869s
+    total number of events:              8
+
+Latency (ms):
+         min:                               3688.56
+         avg:                               3754.03
+         max:                               3780.94
+         95th percentile:                   3773.42
+         sum:                              30032.28
+
+Threads fairness:
+    events (avg/stddev):           1.0000/0.00
+    execution time (avg/stddev):   3.7540/0.03
+"""
+
+
+# pylint: disable=unused-argument
+def sample(test, **kwargs):
+  switch = {
+      "cpu": SAMPLE_CPU_DATA,
+      "memory": SAMPLE_MEMORY_DATA,
+      "mutex": SAMPLE_MUTEX_DATA,
+      "randwr": SAMPLE_CPU_DATA
+  }
+  return switch[test]
+
+
+# pylint: disable=unused-argument
+def cpu_events_per_second(data: str, **kwargs) -> float:
+  """Returns events per second."""
+  return float(re.compile(STD_REGEX).search(data).group(1))
+
+
+# pylint: disable=unused-argument
+def memory_ops_per_second(data: str, **kwargs) -> float:
+  """Returns memory operations per second."""
+  return float(re.compile(MEM_REGEX).search(data).group(1))
+
+
+# pylint: disable=unused-argument
+def mutex_time(data: str, count: int, locks: int, threads: int,
+               **kwargs) -> float:
+  """Returns normalized mutex time (lower is better)."""
+  value = float(re.compile(ALT_REGEX).search(data).group(1))
+  contention = float(threads) / float(locks)
+  scale = contention * float(count) / 100000000.0
+  return value / scale
+
+
+# pylint: disable=unused-argument
+def mutex_deviation(data: str, **kwargs) -> float:
+  """Returns deviation for threads."""
+  return float(re.compile(ALT_REGEX).search(data).group(2))
+
+
+# pylint: disable=unused-argument
+def mutex_latency(data: str, **kwargs) -> float:
+  """Returns average mutex latency."""
+  return float(re.compile(AVG_REGEX).search(data).group(1))
diff --git a/benchmarks/workloads/sysbench/sysbench_test.py b/benchmarks/workloads/sysbench/sysbench_test.py
new file mode 100644
index 000000000..3fb541fd2
--- /dev/null
+++ b/benchmarks/workloads/sysbench/sysbench_test.py
@@ -0,0 +1,34 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Parser test."""
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import sysbench
+
+
+def test_sysbench_parser():
+  """Test the basic parser."""
+  assert sysbench.cpu_events_per_second(sysbench.sample("cpu")) == 9093.38
+  assert sysbench.memory_ops_per_second(sysbench.sample("memory")) == 9597428.64
+  assert sysbench.mutex_time(sysbench.sample("mutex"), 1, 1,
+                             100000000.0) == 3.754
+  assert sysbench.mutex_deviation(sysbench.sample("mutex")) == 0.03
+  assert sysbench.mutex_latency(sysbench.sample("mutex")) == 3754.03
+
+
+if __name__ == "__main__":
+  sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/syscall/BUILD b/benchmarks/workloads/syscall/BUILD
new file mode 100644
index 000000000..e1ff3059b
--- /dev/null
+++ b/benchmarks/workloads/syscall/BUILD
@@ -0,0 +1,36 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "syscall",
+    srcs = ["__init__.py"],
+)
+
+py_test(
+    name = "syscall_test",
+    srcs = ["syscall_test.py"],
+    python_version = "PY3",
+    deps = [
+        ":syscall",
+        requirement("attrs", False),
+        requirement("atomicwrites", False),
+        requirement("more-itertools", False),
+        requirement("pathlib2", False),
+        requirement("pluggy", False),
+        requirement("py", False),
+        requirement("pytest", True),
+        requirement("six", False),
+    ],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+        "syscall.c",
+    ],
+)
diff --git a/benchmarks/workloads/syscall/Dockerfile b/benchmarks/workloads/syscall/Dockerfile
new file mode 100644
index 000000000..a2088d953
--- /dev/null
+++ b/benchmarks/workloads/syscall/Dockerfile
@@ -0,0 +1,6 @@
+FROM gcc:latest
+COPY . /usr/src/syscall
+WORKDIR /usr/src/syscall
+RUN gcc -O2 -o syscall syscall.c
+ENV count 1000000
+CMD ["sh", "-c", "./syscall ${count}"]
diff --git a/benchmarks/workloads/syscall/__init__.py b/benchmarks/workloads/syscall/__init__.py
new file mode 100644
index 000000000..dc9028faa
--- /dev/null
+++ b/benchmarks/workloads/syscall/__init__.py
@@ -0,0 +1,29 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Simple syscall test."""
+
+import re
+
+SAMPLE_DATA = "Called getpid syscall 1000000 times: 1117 ms, 500 ns each."
+
+
+# pylint: disable=unused-argument
+def sample(**kwargs) -> str:
+  return SAMPLE_DATA
+
+
+# pylint: disable=unused-argument
+def syscall_time_ns(data: str, **kwargs) -> int:
+  """Returns average system call time."""
+  return float(re.compile(r"(\d+)\sns each.").search(data).group(1))
diff --git a/benchmarks/workloads/syscall/syscall.c b/benchmarks/workloads/syscall/syscall.c
new file mode 100644
index 000000000..ded030397
--- /dev/null
+++ b/benchmarks/workloads/syscall/syscall.c
@@ -0,0 +1,55 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+// Short program that calls getpid() a number of times and outputs time
+// diference from the MONOTONIC clock.
+int main(int argc, char** argv) {
+  struct timespec start, stop;
+  long result;
+  char buf[80];
+
+  if (argc < 2) {
+    printf("Usage:./syscall NUM_TIMES_TO_CALL");
+    return 1;
+  }
+
+  if (clock_gettime(CLOCK_MONOTONIC, &start)) return 1;
+
+  long loops = atoi(argv[1]);
+  for (long i = 0; i < loops; i++) {
+    syscall(SYS_gettimeofday, 0, 0);
+  }
+
+  if (clock_gettime(CLOCK_MONOTONIC, &stop)) return 1;
+
+  if ((stop.tv_nsec - start.tv_nsec) < 0) {
+    result = (stop.tv_sec - start.tv_sec - 1) * 1000;
+    result += (stop.tv_nsec - start.tv_nsec + 1000000000) / (1000 * 1000);
+  } else {
+    result = (stop.tv_sec - start.tv_sec) * 1000;
+    result += (stop.tv_nsec - start.tv_nsec) / (1000 * 1000);
+  }
+
+  printf("Called getpid syscall %d times: %lu ms, %lu ns each.\n", loops,
+         result, result * 1000000 / loops);
+
+  return 0;
+}
diff --git a/benchmarks/workloads/syscall/syscall_test.py b/benchmarks/workloads/syscall/syscall_test.py
new file mode 100644
index 000000000..72f027de1
--- /dev/null
+++ b/benchmarks/workloads/syscall/syscall_test.py
@@ -0,0 +1,27 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import syscall
+
+
+def test_syscall_time_ns():
+  assert syscall.syscall_time_ns(syscall.sample()) == 500
+
+
+if __name__ == "__main__":
+  sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/tensorflow/BUILD b/benchmarks/workloads/tensorflow/BUILD
new file mode 100644
index 000000000..17f1f8ebb
--- /dev/null
+++ b/benchmarks/workloads/tensorflow/BUILD
@@ -0,0 +1,16 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+py_library(
+    name = "tensorflow",
+    srcs = ["__init__.py"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/tensorflow/Dockerfile b/benchmarks/workloads/tensorflow/Dockerfile
new file mode 100644
index 000000000..262643b98
--- /dev/null
+++ b/benchmarks/workloads/tensorflow/Dockerfile
@@ -0,0 +1,14 @@
+FROM tensorflow/tensorflow:1.13.2
+
+RUN apt-get update \
+    && apt-get install -y git
+RUN git clone https://github.com/aymericdamien/TensorFlow-Examples.git
+RUN python -m pip install -U pip setuptools
+RUN python -m pip install matplotlib
+
+WORKDIR /TensorFlow-Examples/examples
+
+ENV PYTHONPATH="$PYTHONPATH:/TensorFlow-Examples/examples"
+
+ENV workload "3_NeuralNetworks/convolutional_network.py"
+CMD python ${workload}
diff --git a/benchmarks/workloads/tensorflow/__init__.py b/benchmarks/workloads/tensorflow/__init__.py
new file mode 100644
index 000000000..b5ec213f8
--- /dev/null
+++ b/benchmarks/workloads/tensorflow/__init__.py
@@ -0,0 +1,20 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A Tensorflow example."""
+
+
+# pylint: disable=unused-argument
+def run_time(value, **kwargs):
+  """Returns the startup and runtime of the Tensorflow workload in seconds."""
+  return value
diff --git a/benchmarks/workloads/true/BUILD b/benchmarks/workloads/true/BUILD
new file mode 100644
index 000000000..83f3c71a0
--- /dev/null
+++ b/benchmarks/workloads/true/BUILD
@@ -0,0 +1,11 @@
+package(
+    default_visibility = ["//benchmarks:__subpackages__"],
+    licenses = ["notice"],
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        "Dockerfile",
+    ],
+)
diff --git a/benchmarks/workloads/true/Dockerfile b/benchmarks/workloads/true/Dockerfile
new file mode 100644
index 000000000..2e97c921e
--- /dev/null
+++ b/benchmarks/workloads/true/Dockerfile
@@ -0,0 +1,3 @@
+FROM alpine:latest
+
+CMD ["true"]
diff --git a/scripts/benchmarks.sh b/scripts/benchmarks.sh
new file mode 100755
index 000000000..6b9065b07
--- /dev/null
+++ b/scripts/benchmarks.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!/usr/bin/env bash
+
+if [ "$#" -lt "1" ]; then
+  echo "usage: $0 <--mock |--env=<filename>> ..."
+  echo "example: $0 --mock --runs=8"
+  exit 1
+fi
+
+source $(dirname $0)/common.sh
+
+readonly TIMESTAMP=`date "+%Y%m%d-%H%M%S"`
+readonly OUTDIR="$(mktemp --tmpdir -d run-${TIMESTAMP}-XXX)"
+readonly DEFAULT_RUNTIMES="--runtime=runc --runtime=runsc --runtime=runsc-kvm"
+readonly ALL_RUNTIMES="--runtime=runc --runtime=runsc --runtime=runsc-kvm"
+
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'fio.(read|write)' --metric=bandwidth --size=5g --ioengine=sync --blocksize=1m > "${OUTDIR}/fio.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} fio.rand --metric=bandwidth --size=5g --ioengine=sync --blocksize=4k --time=30 > "${OUTDIR}/tmp_fio.csv"
+cat "${OUTDIR}/tmp_fio.csv" | grep "\(runc\|runsc\)" >> "${OUTDIR}/fio.csv" && rm "${OUTDIR}/tmp_fio.csv"
+
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'fio.(read|write)' --metric=bandwidth --tmpfs=True --size=5g --ioengine=sync --blocksize=1m > "${OUTDIR}/fio-tmpfs.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} fio.rand --metric=bandwidth --tmpfs=True --size=5g --ioengine=sync --blocksize=4k --time=30 > "${OUTDIR}/tmp_fio.csv"
+cat "${OUTDIR}/tmp_fio.csv" | grep "\(runc\|runsc\)" >> "${OUTDIR}/fio-tmpfs.csv" && rm "${OUTDIR}/tmp_fio.csv"
+
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} startup --count=50  >  "${OUTDIR}/startup.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} density > "${OUTDIR}/density.csv"
+
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} sysbench.cpu --threads=1 --max_prime=50000 --options='--max-time=5' > "${OUTDIR}/sysbench-cpu.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} sysbench.memory --threads=1 --options='--memory-block-size=1M --memory-total-size=500G'  > "${OUTDIR}/sysbench-memory.csv"
+run //benchmarks:perf -- run "$@" ${ALL_RUNTIMES} syscall > "${OUTDIR}/syscall.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'network.(upload|download)' --runs=20 > "${OUTDIR}/iperf.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} ml.tensorflow > "${OUTDIR}/tensorflow.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} media.ffmpeg > "${OUTDIR}/ffmpeg.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} http.httpd --path=latin100k.txt --connections=1 --connections=5 --connections=10 --connections=25 > "${OUTDIR}/httpd100k.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} http.httpd --path=latin10240k.txt --connections=1 --connections=5 --connections=10 --connections=25 > "${OUTDIR}/httpd10240k.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} redis > "${OUTDIR}/redis.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'http.(ruby|node)' > "${OUTDIR}/applications.csv"
+
+echo "${OUTPUT}" && exit 0
diff --git a/scripts/simple_tests.sh b/scripts/simple_tests.sh
index 585216aae..ef25afc2e 100755
--- a/scripts/simple_tests.sh
+++ b/scripts/simple_tests.sh
@@ -17,4 +17,4 @@
 source $(dirname $0)/common.sh
 
 # Run all simple tests (locally).
-test //pkg/... //runsc/... //tools/...
+test //pkg/... //runsc/... //tools/... //benchmarks/...
-- 
cgit v1.2.3


From 03760e5623f3ee736252fda7da033fd51144af1e Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Wed, 20 Nov 2019 09:24:41 +0000
Subject: platform/ptrace: make some operations arch specific

Make the patchSignalInfo/cpuid faulting/initial thread seccomp rules
operations architecture dependent.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: Iaf692dbe3700d2e01168ec2f1b4beeda9136fd62
---
 pkg/sentry/platform/ptrace/subprocess_amd64.go | 42 ++++++++++++++++++++++++++
 pkg/sentry/platform/ptrace/subprocess_arm64.go | 37 ++++++++++++++++++++++-
 pkg/sentry/platform/ptrace/subprocess_linux.go | 36 +++++-----------------
 3 files changed, 85 insertions(+), 30 deletions(-)

diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 4649a94a7..a55cff507 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -21,6 +21,8 @@ import (
 	"strings"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
@@ -143,3 +145,43 @@ func (t *thread) adjustInitRegsRip() {
 func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
 	initregs.R15 = uint64(ppid)
 }
+
+// patchSignalInfo patches the signal info to account for hitting the seccomp
+// filters from vsyscall emulation, specified below. We allow for SIGSYS as a
+// synchronous trap, but patch the structure to appear like a SIGSEGV with the
+// Rip as the faulting address.
+//
+// Note that this should only be called after verifying that the signalInfo has
+// been generated by the kernel.
+func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
+	if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
+		signalInfo.Signo = int32(linux.SIGSEGV)
+
+		// Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered
+		// with the si_call_addr field pointing to the current RIP. This field
+		// aligns with the si_addr field for a SIGSEGV, so we don't need to touch
+		// anything there. We do need to unwind emulation however, so we set the
+		// instruction pointer to the faulting value, and "unpop" the stack.
+		regs.Rip = signalInfo.Addr()
+		regs.Rsp -= 8
+	}
+}
+
+// enableCpuidFault enable cpuid-faulting; this may fail on older kernels or hardware,
+// so we just disregard the result. Host CPUID will be enabled.
+func enableCpuidFault() {
+	syscall.RawSyscall6(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0)
+}
+
+// appendArchSeccompRules append architecture specific seccomp rules when creating BPF program.
+// Ref attachedThread() for more detail.
+func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
+	return append(rules, seccomp.RuleSet{
+		Rules: seccomp.SyscallRules{
+			syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+				{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+			},
+		},
+		Action: linux.SECCOMP_RET_ALLOW,
+	})
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go
index bec884ba5..aed34e7ee 100644
--- a/pkg/sentry/platform/ptrace/subprocess_arm64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go
@@ -17,8 +17,12 @@
 package ptrace
 
 import (
+	"fmt"
+	"strings"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
@@ -37,7 +41,7 @@ const (
 // resetSysemuRegs sets up emulation registers.
 //
 // This should be called prior to calling sysemu.
-func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) {
+func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
 }
 
 // createSyscallRegs sets up syscall registers.
@@ -124,3 +128,34 @@ func (t *thread) adjustInitRegsRip() {
 func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
 	initregs.Regs[7] = uint64(ppid)
 }
+
+// patchSignalInfo patches the signal info to account for hitting the seccomp
+// filters from vsyscall emulation, specified below. We allow for SIGSYS as a
+// synchronous trap, but patch the structure to appear like a SIGSEGV with the
+// Rip as the faulting address.
+//
+// Note that this should only be called after verifying that the signalInfo has
+// been generated by the kernel.
+func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
+	if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
+		signalInfo.Signo = int32(linux.SIGSEGV)
+
+		// Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered
+		// with the si_call_addr field pointing to the current RIP. This field
+		// aligns with the si_addr field for a SIGSEGV, so we don't need to touch
+		// anything there. We do need to unwind emulation however, so we set the
+		// instruction pointer to the faulting value, and "unpop" the stack.
+		regs.Pc = signalInfo.Addr()
+		regs.Sp -= 8
+	}
+}
+
+// Noop on arm64.
+func enableCpuidFault() {
+}
+
+// appendArchSeccompRules append architecture specific seccomp rules when creating BPF program.
+// Ref attachedThread() for more detail.
+func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
+	return rules
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 3782d4332..cf13ea5e4 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"syscall"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/procid"
@@ -77,27 +78,6 @@ func probeSeccomp() bool {
 	}
 }
 
-// patchSignalInfo patches the signal info to account for hitting the seccomp
-// filters from vsyscall emulation, specified below. We allow for SIGSYS as a
-// synchronous trap, but patch the structure to appear like a SIGSEGV with the
-// Rip as the faulting address.
-//
-// Note that this should only be called after verifying that the signalInfo has
-// been generated by the kernel.
-func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
-	if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
-		signalInfo.Signo = int32(linux.SIGSEGV)
-
-		// Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered
-		// with the si_call_addr field pointing to the current RIP. This field
-		// aligns with the si_addr field for a SIGSEGV, so we don't need to touch
-		// anything there. We do need to unwind emulation however, so we set the
-		// instruction pointer to the faulting value, and "unpop" the stack.
-		regs.Rip = signalInfo.Addr()
-		regs.Rsp -= 8
-	}
-}
-
 // createStub creates a fresh stub processes.
 //
 // Precondition: the runtime OS thread must be locked.
@@ -149,7 +129,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 			Rules: seccomp.SyscallRules{
 				syscall.SYS_GETTIMEOFDAY: {},
 				syscall.SYS_TIME:         {},
-				309:                      {}, // SYS_GETCPU.
+				unix.SYS_GETCPU:          {}, // SYS_GETCPU was not defined in package syscall on amd64.
 			},
 			Action:   linux.SECCOMP_RET_TRAP,
 			Vsyscall: true,
@@ -173,10 +153,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 
 				// For the initial process creation.
 				syscall.SYS_WAIT4: {},
-				syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
-					{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
-				},
-				syscall.SYS_EXIT: {},
+				syscall.SYS_EXIT:  {},
 
 				// For the stub prctl dance (all).
 				syscall.SYS_PRCTL: []seccomp.Rule{
@@ -196,6 +173,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 			},
 			Action: linux.SECCOMP_RET_ALLOW,
 		})
+
+		rules = appendArchSeccompRules(rules)
 	}
 	instrs, err := seccomp.BuildProgram(rules, defaultAction)
 	if err != nil {
@@ -267,9 +246,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
 	}
 
-	// Enable cpuid-faulting; this may fail on older kernels or hardware,
-	// so we just disregard the result. Host CPUID will be enabled.
-	syscall.RawSyscall6(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0)
+	// Enable cpuid-faulting.
+	enableCpuidFault()
 
 	// Call the stub; should not return.
 	stubCall(stubStart, ppid)
-- 
cgit v1.2.3


From 19b2d997ec702e559bdb5f5e60634a7c5d7d288e Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 3 Dec 2019 08:32:03 -0800
Subject: Support IP_TOS and IPV6_TCLASS socket options for hostinet sockets.

There are two potential ways of sending a TOS byte with outgoing packets:
including a control message in sendmsg, or setting the IP_TOS/IPV6_TCLASS
socket options (for IPV4 and IPV6 respectively). This change lets hostinet
support the latter.

Fixes #1188

PiperOrigin-RevId: 283550925
---
 pkg/sentry/socket/hostinet/socket.go |  8 ++++----
 runsc/boot/filter/config.go          | 24 ++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 8d9363aac..a8c152b54 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -289,12 +289,12 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 	switch level {
 	case linux.SOL_IP:
 		switch name {
-		case linux.IP_RECVTOS:
+		case linux.IP_TOS, linux.IP_RECVTOS:
 			optlen = sizeofInt32
 		}
 	case linux.SOL_IPV6:
 		switch name {
-		case linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY:
+		case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY:
 			optlen = sizeofInt32
 		}
 	case linux.SOL_SOCKET:
@@ -334,12 +334,12 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 	switch level {
 	case linux.SOL_IP:
 		switch name {
-		case linux.IP_RECVTOS:
+		case linux.IP_TOS, linux.IP_RECVTOS:
 			optlen = sizeofInt32
 		}
 	case linux.SOL_IPV6:
 		switch name {
-		case linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY:
+		case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY:
 			optlen = sizeofInt32
 		}
 	case linux.SOL_SOCKET:
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index bf690160c..4fb9adca6 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -310,11 +310,21 @@ func hostInetFilters() seccomp.SyscallRules {
 		syscall.SYS_GETPEERNAME: {},
 		syscall.SYS_GETSOCKNAME: {},
 		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_TOS),
+			},
 			{
 				seccomp.AllowAny{},
 				seccomp.AllowValue(syscall.SOL_IP),
 				seccomp.AllowValue(syscall.IP_RECVTOS),
 			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_TCLASS),
+			},
 			{
 				seccomp.AllowAny{},
 				seccomp.AllowValue(syscall.SOL_IPV6),
@@ -423,6 +433,13 @@ func hostInetFilters() seccomp.SyscallRules {
 				seccomp.AllowAny{},
 				seccomp.AllowValue(4),
 			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_TOS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
 			{
 				seccomp.AllowAny{},
 				seccomp.AllowValue(syscall.SOL_IP),
@@ -430,6 +447,13 @@ func hostInetFilters() seccomp.SyscallRules {
 				seccomp.AllowAny{},
 				seccomp.AllowValue(4),
 			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_TCLASS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
 			{
 				seccomp.AllowAny{},
 				seccomp.AllowValue(syscall.SOL_IPV6),
-- 
cgit v1.2.3


From 812189664cab0a17ae29095e4029e2f8762a6779 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Tue, 3 Dec 2019 10:22:01 -0800
Subject: Remove TODO for obsolete bug.

PiperOrigin-RevId: 283571456
---
 pkg/sentry/socket/rpcinet/syscall_rpc.proto | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pkg/sentry/socket/rpcinet/syscall_rpc.proto b/pkg/sentry/socket/rpcinet/syscall_rpc.proto
index 9586f5923..b677e9eb3 100644
--- a/pkg/sentry/socket/rpcinet/syscall_rpc.proto
+++ b/pkg/sentry/socket/rpcinet/syscall_rpc.proto
@@ -3,7 +3,6 @@ syntax = "proto3";
 // package syscall_rpc is a set of networking related system calls that can be
 // forwarded to a socket gofer.
 //
-// TODO(b/77963526): Document individual RPCs.
 package syscall_rpc;
 
 message SendmsgRequest {
-- 
cgit v1.2.3


From d7cc2480cb6e465ce01eb245e7edbad2c68c44d8 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 3 Dec 2019 12:45:43 -0800
Subject: Add RunfilesPath to test_util

A few tests have their own ad-hoc implementations. Add a single common one.

PiperOrigin-RevId: 283601666
---
 test/syscalls/linux/exec.cc        | 145 +++++++++++++++++--------------------
 test/syscalls/linux/sigaltstack.cc |   8 +-
 test/util/BUILD                    |   2 +
 test/util/test_util.h              |   6 ++
 test/util/test_util_runfiles.cc    |  46 ++++++++++++
 5 files changed, 123 insertions(+), 84 deletions(-)
 create mode 100644 test/util/test_util_runfiles.cc

diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 581f03533..b5e0a512b 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -47,23 +47,14 @@ namespace testing {
 
 namespace {
 
-constexpr char kBasicWorkload[] = "exec_basic_workload";
-constexpr char kExitScript[] = "exit_script";
-constexpr char kStateWorkload[] = "exec_state_workload";
-constexpr char kProcExeWorkload[] = "exec_proc_exe_workload";
-constexpr char kAssertClosedWorkload[] = "exec_assert_closed_workload";
-constexpr char kPriorityWorkload[] = "priority_execve";
-
-std::string WorkloadPath(absl::string_view binary) {
-  std::string full_path;
-  char* test_src = getenv("TEST_SRCDIR");
-  if (test_src) {
-    full_path = JoinPath(test_src, "__main__/test/syscalls/linux", binary);
-  }
-
-  TEST_CHECK(full_path.empty() == false);
-  return full_path;
-}
+constexpr char kBasicWorkload[] = "test/syscalls/linux/exec_basic_workload";
+constexpr char kExitScript[] = "test/syscalls/linux/exit_script";
+constexpr char kStateWorkload[] = "test/syscalls/linux/exec_state_workload";
+constexpr char kProcExeWorkload[] =
+    "test/syscalls/linux/exec_proc_exe_workload";
+constexpr char kAssertClosedWorkload[] =
+    "test/syscalls/linux/exec_assert_closed_workload";
+constexpr char kPriorityWorkload[] = "test/syscalls/linux/priority_execve";
 
 constexpr char kExit42[] = "--exec_exit_42";
 constexpr char kExecWithThread[] = "--exec_exec_with_thread";
@@ -171,44 +162,44 @@ TEST(ExecTest, EmptyPath) {
 }
 
 TEST(ExecTest, Basic) {
-  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, {},
+  CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)}, {},
             ArgEnvExitStatus(0, 0),
-            absl::StrCat(WorkloadPath(kBasicWorkload), "\n"));
+            absl::StrCat(RunfilePath(kBasicWorkload), "\n"));
 }
 
 TEST(ExecTest, OneArg) {
-  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload), "1"},
-            {}, ArgEnvExitStatus(1, 0),
-            absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n"));
+  CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload), "1"}, {},
+            ArgEnvExitStatus(1, 0),
+            absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n"));
 }
 
 TEST(ExecTest, FiveArg) {
-  CheckExec(WorkloadPath(kBasicWorkload),
-            {WorkloadPath(kBasicWorkload), "1", "2", "3", "4", "5"}, {},
+  CheckExec(RunfilePath(kBasicWorkload),
+            {RunfilePath(kBasicWorkload), "1", "2", "3", "4", "5"}, {},
             ArgEnvExitStatus(5, 0),
-            absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
+            absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
 }
 
 TEST(ExecTest, OneEnv) {
-  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, {"1"},
+  CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)}, {"1"},
             ArgEnvExitStatus(0, 1),
-            absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n"));
+            absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n"));
 }
 
 TEST(ExecTest, FiveEnv) {
-  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)},
+  CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)},
             {"1", "2", "3", "4", "5"}, ArgEnvExitStatus(0, 5),
-            absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
+            absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
 }
 
 TEST(ExecTest, OneArgOneEnv) {
-  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload), "arg"},
+  CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload), "arg"},
             {"env"}, ArgEnvExitStatus(1, 1),
-            absl::StrCat(WorkloadPath(kBasicWorkload), "\narg\nenv\n"));
+            absl::StrCat(RunfilePath(kBasicWorkload), "\narg\nenv\n"));
 }
 
 TEST(ExecTest, InterpreterScript) {
-  CheckExec(WorkloadPath(kExitScript), {WorkloadPath(kExitScript), "25"}, {},
+  CheckExec(RunfilePath(kExitScript), {RunfilePath(kExitScript), "25"}, {},
             ArgEnvExitStatus(25, 0), "");
 }
 
@@ -216,7 +207,7 @@ TEST(ExecTest, InterpreterScript) {
 TEST(ExecTest, InterpreterScriptArgSplit) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " foo bar"),
@@ -230,7 +221,7 @@ TEST(ExecTest, InterpreterScriptArgSplit) {
 TEST(ExecTest, InterpreterScriptArgvZero) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
@@ -244,7 +235,7 @@ TEST(ExecTest, InterpreterScriptArgvZero) {
 TEST(ExecTest, InterpreterScriptArgvZeroRelative) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
@@ -261,7 +252,7 @@ TEST(ExecTest, InterpreterScriptArgvZeroRelative) {
 TEST(ExecTest, InterpreterScriptArgvZeroAdded) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
@@ -274,7 +265,7 @@ TEST(ExecTest, InterpreterScriptArgvZeroAdded) {
 TEST(ExecTest, InterpreterScriptArgNUL) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(),
@@ -289,7 +280,7 @@ TEST(ExecTest, InterpreterScriptArgNUL) {
 TEST(ExecTest, InterpreterScriptTrailingWhitespace) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), "  "), 0755));
@@ -302,7 +293,7 @@ TEST(ExecTest, InterpreterScriptTrailingWhitespace) {
 TEST(ExecTest, InterpreterScriptArgWhitespace) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), "  foo"), 0755));
@@ -325,7 +316,7 @@ TEST(ExecTest, InterpreterScriptNoPath) {
 TEST(ExecTest, ExecFn) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kStateWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kStateWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " PrintExecFn"),
@@ -342,7 +333,7 @@ TEST(ExecTest, ExecFn) {
 }
 
 TEST(ExecTest, ExecName) {
-  std::string path = WorkloadPath(kStateWorkload);
+  std::string path = RunfilePath(kStateWorkload);
 
   CheckExec(path, {path, "PrintExecName"}, {}, ArgEnvExitStatus(0, 0),
             absl::StrCat(Basename(path).substr(0, 15), "\n"));
@@ -351,7 +342,7 @@ TEST(ExecTest, ExecName) {
 TEST(ExecTest, ExecNameScript) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kStateWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kStateWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(),
@@ -405,13 +396,13 @@ TEST(ExecStateTest, HandlerReset) {
   ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds());
 
   ExecveArray args = {
-      WorkloadPath(kStateWorkload),
+      RunfilePath(kStateWorkload),
       "CheckSigHandler",
       absl::StrCat(SIGUSR1),
       absl::StrCat(absl::Hex(reinterpret_cast<uintptr_t>(SIG_DFL))),
   };
 
-  CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+  CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
 }
 
 // Ignored signal dispositions are not reset.
@@ -421,13 +412,13 @@ TEST(ExecStateTest, IgnorePreserved) {
   ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds());
 
   ExecveArray args = {
-      WorkloadPath(kStateWorkload),
+      RunfilePath(kStateWorkload),
       "CheckSigHandler",
       absl::StrCat(SIGUSR1),
       absl::StrCat(absl::Hex(reinterpret_cast<uintptr_t>(SIG_IGN))),
   };
 
-  CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+  CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
 }
 
 // Signal masks are not reset on exec
@@ -438,12 +429,12 @@ TEST(ExecStateTest, SignalMask) {
   ASSERT_THAT(sigprocmask(SIG_BLOCK, &s, nullptr), SyscallSucceeds());
 
   ExecveArray args = {
-      WorkloadPath(kStateWorkload),
+      RunfilePath(kStateWorkload),
       "CheckSigBlocked",
       absl::StrCat(SIGUSR1),
   };
 
-  CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+  CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
 }
 
 // itimers persist across execve.
@@ -471,7 +462,7 @@ TEST(ExecStateTest, ItimerPreserved) {
     }
   };
 
-  std::string filename = WorkloadPath(kStateWorkload);
+  std::string filename = RunfilePath(kStateWorkload);
   ExecveArray argv = {
       filename,
       "CheckItimerEnabled",
@@ -495,8 +486,8 @@ TEST(ExecStateTest, ItimerPreserved) {
 TEST(ProcSelfExe, ChangesAcrossExecve) {
   // See exec_proc_exe_workload for more details. We simply
   // assert that the /proc/self/exe link changes across execve.
-  CheckExec(WorkloadPath(kProcExeWorkload),
-            {WorkloadPath(kProcExeWorkload),
+  CheckExec(RunfilePath(kProcExeWorkload),
+            {RunfilePath(kProcExeWorkload),
              ASSERT_NO_ERRNO_AND_VALUE(ProcessExePath(getpid()))},
             {}, W_EXITCODE(0, 0), "");
 }
@@ -507,8 +498,8 @@ TEST(ExecTest, CloexecNormalFile) {
   const FileDescriptor fd_closed_on_exec =
       ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY | O_CLOEXEC));
 
-  CheckExec(WorkloadPath(kAssertClosedWorkload),
-            {WorkloadPath(kAssertClosedWorkload),
+  CheckExec(RunfilePath(kAssertClosedWorkload),
+            {RunfilePath(kAssertClosedWorkload),
              absl::StrCat(fd_closed_on_exec.get())},
             {}, W_EXITCODE(0, 0), "");
 
@@ -517,10 +508,10 @@ TEST(ExecTest, CloexecNormalFile) {
   const FileDescriptor fd_open_on_exec =
       ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY));
 
-  CheckExec(WorkloadPath(kAssertClosedWorkload),
-            {WorkloadPath(kAssertClosedWorkload),
-             absl::StrCat(fd_open_on_exec.get())},
-            {}, W_EXITCODE(2, 0), "");
+  CheckExec(
+      RunfilePath(kAssertClosedWorkload),
+      {RunfilePath(kAssertClosedWorkload), absl::StrCat(fd_open_on_exec.get())},
+      {}, W_EXITCODE(2, 0), "");
 }
 
 TEST(ExecTest, CloexecEventfd) {
@@ -528,15 +519,15 @@ TEST(ExecTest, CloexecEventfd) {
   ASSERT_THAT(efd = eventfd(0, EFD_CLOEXEC), SyscallSucceeds());
   FileDescriptor fd(efd);
 
-  CheckExec(WorkloadPath(kAssertClosedWorkload),
-            {WorkloadPath(kAssertClosedWorkload), absl::StrCat(fd.get())}, {},
+  CheckExec(RunfilePath(kAssertClosedWorkload),
+            {RunfilePath(kAssertClosedWorkload), absl::StrCat(fd.get())}, {},
             W_EXITCODE(0, 0), "");
 }
 
 constexpr int kLinuxMaxSymlinks = 40;
 
 TEST(ExecTest, SymlinkLimitExceeded) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
 
   // Hold onto TempPath objects so they are not destructed prematurely.
   std::vector<TempPath> symlinks;
@@ -575,13 +566,13 @@ TEST(ExecTest, SymlinkLimitRefreshedForInterpreter) {
 }
 
 TEST(ExecveatTest, BasicWithFDCWD) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   CheckExecveat(AT_FDCWD, path, {path}, {}, /*flags=*/0, ArgEnvExitStatus(0, 0),
                 absl::StrCat(path, "\n"));
 }
 
 TEST(ExecveatTest, Basic) {
-  std::string absolute_path = WorkloadPath(kBasicWorkload);
+  std::string absolute_path = RunfilePath(kBasicWorkload);
   std::string parent_dir = std::string(Dirname(absolute_path));
   std::string base = std::string(Basename(absolute_path));
   const FileDescriptor dirfd =
@@ -592,7 +583,7 @@ TEST(ExecveatTest, Basic) {
 }
 
 TEST(ExecveatTest, FDNotADirectory) {
-  std::string absolute_path = WorkloadPath(kBasicWorkload);
+  std::string absolute_path = RunfilePath(kBasicWorkload);
   std::string base = std::string(Basename(absolute_path));
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(absolute_path, 0));
 
@@ -604,13 +595,13 @@ TEST(ExecveatTest, FDNotADirectory) {
 }
 
 TEST(ExecveatTest, AbsolutePathWithFDCWD) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   CheckExecveat(AT_FDCWD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0,
                 absl::StrCat(path, "\n"));
 }
 
 TEST(ExecveatTest, AbsolutePath) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   // File descriptor should be ignored when an absolute path is given.
   const int32_t badFD = -1;
   CheckExecveat(badFD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0,
@@ -618,7 +609,7 @@ TEST(ExecveatTest, AbsolutePath) {
 }
 
 TEST(ExecveatTest, EmptyPathBasic) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
 
   CheckExecveat(fd.get(), "", {path}, {}, AT_EMPTY_PATH, ArgEnvExitStatus(0, 0),
@@ -626,7 +617,7 @@ TEST(ExecveatTest, EmptyPathBasic) {
 }
 
 TEST(ExecveatTest, EmptyPathWithDirFD) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   std::string parent_dir = std::string(Dirname(path));
   const FileDescriptor dirfd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
@@ -639,7 +630,7 @@ TEST(ExecveatTest, EmptyPathWithDirFD) {
 }
 
 TEST(ExecveatTest, EmptyPathWithoutEmptyPathFlag) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
 
   int execve_errno;
@@ -649,7 +640,7 @@ TEST(ExecveatTest, EmptyPathWithoutEmptyPathFlag) {
 }
 
 TEST(ExecveatTest, AbsolutePathWithEmptyPathFlag) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
 
   CheckExecveat(fd.get(), path, {path}, {}, AT_EMPTY_PATH,
@@ -657,7 +648,7 @@ TEST(ExecveatTest, AbsolutePathWithEmptyPathFlag) {
 }
 
 TEST(ExecveatTest, RelativePathWithEmptyPathFlag) {
-  std::string absolute_path = WorkloadPath(kBasicWorkload);
+  std::string absolute_path = RunfilePath(kBasicWorkload);
   std::string parent_dir = std::string(Dirname(absolute_path));
   std::string base = std::string(Basename(absolute_path));
   const FileDescriptor dirfd =
@@ -670,7 +661,7 @@ TEST(ExecveatTest, RelativePathWithEmptyPathFlag) {
 TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) {
   std::string parent_dir = "/tmp";
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo(parent_dir, WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo(parent_dir, RunfilePath(kBasicWorkload)));
   const FileDescriptor dirfd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
   std::string base = std::string(Basename(link.path()));
@@ -685,7 +676,7 @@ TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) {
 TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) {
   std::string parent_dir = "/tmp";
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo(parent_dir, WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo(parent_dir, RunfilePath(kBasicWorkload)));
   std::string path = link.path();
 
   int execve_errno;
@@ -697,7 +688,7 @@ TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) {
 
 TEST(ExecveatTest, SymlinkNoFollowAndEmptyPath) {
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
   std::string path = link.path();
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, 0));
 
@@ -723,7 +714,7 @@ TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) {
 }
 
 TEST(ExecveatTest, BasicWithCloexecFD) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC));
 
   CheckExecveat(fd.get(), "", {path}, {}, AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH,
@@ -731,7 +722,7 @@ TEST(ExecveatTest, BasicWithCloexecFD) {
 }
 
 TEST(ExecveatTest, InterpreterScriptWithCloexecFD) {
-  std::string path = WorkloadPath(kExitScript);
+  std::string path = RunfilePath(kExitScript);
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC));
 
   int execve_errno;
@@ -742,7 +733,7 @@ TEST(ExecveatTest, InterpreterScriptWithCloexecFD) {
 }
 
 TEST(ExecveatTest, InterpreterScriptWithCloexecDirFD) {
-  std::string absolute_path = WorkloadPath(kExitScript);
+  std::string absolute_path = RunfilePath(kExitScript);
   std::string parent_dir = std::string(Dirname(absolute_path));
   std::string base = std::string(Basename(absolute_path));
   const FileDescriptor dirfd =
@@ -775,7 +766,7 @@ TEST(GetpriorityTest, ExecveMaintainsPriority) {
 
   // Program run (priority_execve) will exit(X) where
   // X=getpriority(PRIO_PROCESS,0). Check that this exit value is prio.
-  CheckExec(WorkloadPath(kPriorityWorkload), {WorkloadPath(kPriorityWorkload)},
+  CheckExec(RunfilePath(kPriorityWorkload), {RunfilePath(kPriorityWorkload)},
             {}, W_EXITCODE(expected_exit_code, 0), "");
 }
 
diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc
index 6fd3989a4..a778fa639 100644
--- a/test/syscalls/linux/sigaltstack.cc
+++ b/test/syscalls/linux/sigaltstack.cc
@@ -95,13 +95,7 @@ TEST(SigaltstackTest, ResetByExecve) {
   auto const cleanup_sigstack =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaltstack(stack));
 
-  std::string full_path;
-  char* test_src = getenv("TEST_SRCDIR");
-  if (test_src) {
-    full_path = JoinPath(test_src, "../../linux/sigaltstack_check");
-  }
-
-  ASSERT_FALSE(full_path.empty());
+  std::string full_path = RunfilePath("test/syscalls/linux/sigaltstack_check");
 
   pid_t child_pid = -1;
   int execve_errno = 0;
diff --git a/test/util/BUILD b/test/util/BUILD
index 4526bb3f1..cbc728159 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -237,6 +237,7 @@ cc_library(
     ] + select_for_linux(
         [
             "test_util_impl.cc",
+            "test_util_runfiles.cc",
         ],
     ),
     hdrs = ["test_util.h"],
@@ -245,6 +246,7 @@ cc_library(
         ":logging",
         ":posix_error",
         ":save_util",
+        "@bazel_tools//tools/cpp/runfiles",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/flags:flag",
         "@com_google_absl//absl/flags:parse",
diff --git a/test/util/test_util.h b/test/util/test_util.h
index dc30575b8..ee6c2bf4d 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -764,6 +764,12 @@ MATCHER_P2(EquivalentWithin, target, tolerance,
   return Equivalent(arg, target, tolerance);
 }
 
+// Returns the absolute path to the a data dependency. 'path' is the runfile
+// location relative to workspace root.
+#ifdef __linux__
+std::string RunfilePath(std::string path);
+#endif
+
 void TestInit(int* argc, char*** argv);
 
 }  // namespace testing
diff --git a/test/util/test_util_runfiles.cc b/test/util/test_util_runfiles.cc
new file mode 100644
index 000000000..7210094eb
--- /dev/null
+++ b/test/util/test_util_runfiles.cc
@@ -0,0 +1,46 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <string>
+
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+#include "tools/cpp/runfiles/runfiles.h"
+
+namespace gvisor {
+namespace testing {
+
+std::string RunfilePath(std::string path) {
+  static const bazel::tools::cpp::runfiles::Runfiles* const runfiles = [] {
+    std::string error;
+    auto* runfiles =
+        bazel::tools::cpp::runfiles::Runfiles::CreateForTest(&error);
+    if (runfiles == nullptr) {
+      std::cerr << "Unable to find runfiles: " << error << std::endl;
+    }
+    return runfiles;
+  }();
+
+  if (!runfiles) {
+    // Can't find runfiles? This probably won't work, but __main__/path is our
+    // best guess.
+    return JoinPath("__main__", path);
+  }
+
+  return runfiles->Rlocation(JoinPath("__main__", path));
+}
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From 3e832bec1b48b95951c3b83eb5a7b70f29b1f10f Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 3 Dec 2019 13:31:07 -0800
Subject: Point TODOs to gvisor.dev

PiperOrigin-RevId: 283610781
---
 pkg/sentry/fsimpl/proc/filesystems.go | 2 +-
 pkg/sentry/fsimpl/proc/mounts.go      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/fsimpl/proc/filesystems.go b/pkg/sentry/fsimpl/proc/filesystems.go
index c36c4aff5..0e016bca5 100644
--- a/pkg/sentry/fsimpl/proc/filesystems.go
+++ b/pkg/sentry/fsimpl/proc/filesystems.go
@@ -19,7 +19,7 @@ package proc
 // +stateify savable
 type filesystemsData struct{}
 
-// TODO(b/138862512): Implement vfs.DynamicBytesSource.Generate for
+// TODO(gvisor.dev/issue/1195): Implement vfs.DynamicBytesSource.Generate for
 // filesystemsData. We would need to retrive filesystem names from
 // vfs.VirtualFilesystem. Also needs vfs replacement for
 // fs.Filesystem.AllowUserList() and fs.FilesystemRequiresDev.
diff --git a/pkg/sentry/fsimpl/proc/mounts.go b/pkg/sentry/fsimpl/proc/mounts.go
index e81b1e910..8683cf677 100644
--- a/pkg/sentry/fsimpl/proc/mounts.go
+++ b/pkg/sentry/fsimpl/proc/mounts.go
@@ -16,7 +16,7 @@ package proc
 
 import "gvisor.dev/gvisor/pkg/sentry/kernel"
 
-// TODO(b/138862512): Implement mountInfoFile and mountsFile.
+// TODO(gvisor.dev/issue/1195): Implement mountInfoFile and mountsFile.
 
 // mountInfoFile implements vfs.DynamicBytesSource for /proc/[pid]/mountinfo.
 //
-- 
cgit v1.2.3


From 154dcdec072ddad9e1c96b56e023d7f77fecf2ad Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 3 Dec 2019 13:42:30 -0800
Subject: Remove watchdog TODO

I have not seen a false positive stuck task yet.
Biggest offender was whitelistfs which is going away.

PiperOrigin-RevId: 283613064
---
 pkg/sentry/watchdog/watchdog.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index ecce6c69f..5e4611333 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -287,7 +287,9 @@ func (w *Watchdog) runTurn() {
 				if !ok {
 					// New stuck task detected.
 					//
-					// TODO(b/65849403): Tasks blocked doing IO may be considered stuck in kernel.
+					// Note that tasks blocked doing IO may be considered stuck in kernel,
+					// unless they are surrounded b
+					// Task.UninterruptibleSleepStart/Finish.
 					tc = &offender{lastUpdateTime: lastUpdateTime}
 					stuckTasks.Increment()
 					newTaskFound = true
-- 
cgit v1.2.3


From 43643752f05a0b25259b116558ccd870a539cc05 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 3 Dec 2019 13:46:09 -0800
Subject: strace: don't create a slice with a negative value

PiperOrigin-RevId: 283613824
---
 pkg/sentry/strace/socket.go             |  9 +++++++++
 test/syscalls/linux/socket_unix_cmsg.cc | 29 +++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index 94334f6d2..51f2efb39 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -208,6 +208,15 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
 		i += linux.SizeOfControlMessageHeader
 		width := t.Arch().Width()
 		length := int(h.Length) - linux.SizeOfControlMessageHeader
+		if length < 0 {
+			strs = append(strs, fmt.Sprintf(
+				"{level=%s, type=%s, length=%d, content too short}",
+				level,
+				typ,
+				h.Length,
+			))
+			break
+		}
 
 		if skipData {
 			strs = append(strs, fmt.Sprintf("{level=%s, type=%s, length=%d}", level, typ, h.Length))
diff --git a/test/syscalls/linux/socket_unix_cmsg.cc b/test/syscalls/linux/socket_unix_cmsg.cc
index 1159c5229..a16899493 100644
--- a/test/syscalls/linux/socket_unix_cmsg.cc
+++ b/test/syscalls/linux/socket_unix_cmsg.cc
@@ -149,6 +149,35 @@ TEST_P(UnixSocketPairCmsgTest, BadFDPass) {
               SyscallFailsWithErrno(EBADF));
 }
 
+TEST_P(UnixSocketPairCmsgTest, ShortCmsg) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  char sent_data[20];
+  RandomizeBuffer(sent_data, sizeof(sent_data));
+
+  int sent_fd = -1;
+
+  struct msghdr msg = {};
+  char control[CMSG_SPACE(sizeof(sent_fd))];
+  msg.msg_control = control;
+  msg.msg_controllen = sizeof(control);
+
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+  cmsg->cmsg_len = 1;
+  cmsg->cmsg_level = SOL_SOCKET;
+  cmsg->cmsg_type = SCM_RIGHTS;
+  memcpy(CMSG_DATA(cmsg), &sent_fd, sizeof(sent_fd));
+
+  struct iovec iov;
+  iov.iov_base = sent_data;
+  iov.iov_len = sizeof(sent_data);
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 // BasicFDPassNoSpace starts off by sending a single FD just like BasicFDPass.
 // The difference is that when calling recvmsg, no space for FDs is provided,
 // only space for the cmsg header.
-- 
cgit v1.2.3


From 27e2c4ddca553cf6867bd49f2847ef007ac560c0 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 3 Dec 2019 14:40:22 -0800
Subject: Fix panic due to early transition to Closed.

The code in rcv.consumeSegment incorrectly transitions to
CLOSED state from LAST-ACK before the final ACK for the FIN.

Further if receiving a segment changes a socket to a closed state
then we should not invoke the sender as the socket is now closed
and sending any segments is incorrect.

PiperOrigin-RevId: 283625300
---
 pkg/tcpip/transport/tcp/connect.go           |  32 ++---
 pkg/tcpip/transport/tcp/rcv.go               |   2 +-
 pkg/tcpip/transport/tcp/tcp_test.go          | 179 +++++++++++++++++++++++++++
 test/syscalls/linux/BUILD                    |   1 +
 test/syscalls/linux/socket_ip_tcp_generic.cc |  23 ++++
 5 files changed, 222 insertions(+), 15 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 4206db8b6..16f8aea12 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -953,20 +953,6 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 func (e *endpoint) handleSegments() *tcpip.Error {
 	checkRequeue := true
 	for i := 0; i < maxSegmentsPerWake; i++ {
-		e.mu.RLock()
-		state := e.state
-		e.mu.RUnlock()
-		if state == StateClose {
-			// When we get into StateClose while processing from the queue,
-			// return immediately and let the protocolMainloop handle it.
-			//
-			// We can reach StateClose only while processing a previous segment
-			// or a notification from the protocolMainLoop (caller goroutine).
-			// This means that with this return, the segment dequeue below can
-			// never occur on a closed endpoint.
-			return nil
-		}
-
 		s := e.segmentQueue.dequeue()
 		if s == nil {
 			checkRequeue = false
@@ -1024,6 +1010,24 @@ func (e *endpoint) handleSegments() *tcpip.Error {
 				s.decRef()
 				continue
 			}
+
+			// Now check if the received segment has caused us to transition
+			// to a CLOSED state, if yes then terminate processing and do
+			// not invoke the sender.
+			e.mu.RLock()
+			state := e.state
+			e.mu.RUnlock()
+			if state == StateClose {
+				// When we get into StateClose while processing from the queue,
+				// return immediately and let the protocolMainloop handle it.
+				//
+				// We can reach StateClose only while processing a previous segment
+				// or a notification from the protocolMainLoop (caller goroutine).
+				// This means that with this return, the segment dequeue below can
+				// never occur on a closed endpoint.
+				s.decRef()
+				return nil
+			}
 			e.snd.handleRcvdSegment(s)
 		}
 		s.decRef()
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 857dc445f..5ee499c36 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -205,7 +205,7 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 
 	// Handle ACK (not FIN-ACK, which we handled above) during one of the
 	// shutdown states.
-	if s.flagIsSet(header.TCPFlagAck) {
+	if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == r.ep.snd.sndNxt {
 		r.ep.mu.Lock()
 		switch r.ep.state {
 		case StateFinWait1:
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 50829ae27..d1f0d6ce7 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -5632,6 +5632,7 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) {
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  iss,
+		RcvWnd:  30000,
 	})
 
 	// Receive the SYN-ACK reply.
@@ -5750,6 +5751,7 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) {
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  iss,
+		RcvWnd:  30000,
 	})
 
 	// Receive the SYN-ACK reply.
@@ -5856,6 +5858,7 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  iss,
+		RcvWnd:  30000,
 	})
 
 	// Receive the SYN-ACK reply.
@@ -5929,6 +5932,7 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  iss,
+		RcvWnd:  30000,
 	})
 
 	c.CheckNoPacketTimeout("unexpected packet received in response to SYN", 1*time.Second)
@@ -5941,6 +5945,7 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  iss,
+		RcvWnd:  30000,
 	})
 
 	// Receive the SYN-ACK reply.
@@ -6007,6 +6012,7 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  iss,
+		RcvWnd:  30000,
 	})
 
 	// Receive the SYN-ACK reply.
@@ -6115,3 +6121,176 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 		checker.AckNum(uint32(ackHeaders.SeqNum)),
 		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
 }
+
+func TestTCPCloseWithData(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed
+	// after 5 seconds in TIME_WAIT state.
+	tcpTimeWaitTimeout := 5 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
+		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err)
+	}
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+		RcvWnd:  30000,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+			if err != nil {
+				t.Fatalf("Accept failed: %s", err)
+			}
+
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+
+	// Now trigger a passive close by sending a FIN.
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+		RcvWnd:  30000,
+	}
+
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Now write a few bytes and then close the endpoint.
+	data := []byte{1, 2, 3}
+	view := buffer.NewView(len(data))
+	copy(view, data)
+
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Check that data is received.
+	b = c.GetPacket()
+	checker.IPv4(t, b,
+		checker.PayloadLen(len(data)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(iss)+2), // Acknum is initial sequence number + 1
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	if p := b[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(data, p) {
+		t.Errorf("got data = %x, want = %x", p, data)
+	}
+
+	c.EP.Close()
+	// Check the FIN.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)+uint32(len(data))),
+		checker.AckNum(uint32(iss+2)),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	// First send a partial ACK.
+	ackHeaders = &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 2,
+		AckNum:  c.IRS + 1 + seqnum.Value(len(data)-1),
+		RcvWnd:  30000,
+	}
+	c.SendPacket(nil, ackHeaders)
+
+	// Now send a full ACK.
+	ackHeaders = &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 2,
+		AckNum:  c.IRS + 1 + seqnum.Value(len(data)),
+		RcvWnd:  30000,
+	}
+	c.SendPacket(nil, ackHeaders)
+
+	// Now ACK the FIN.
+	ackHeaders.AckNum++
+	c.SendPacket(nil, ackHeaders)
+
+	// Now send an ACK and we should get a RST back as the endpoint should
+	// be in CLOSED state.
+	ackHeaders = &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 2,
+		AckNum:  c.IRS + 1 + seqnum.Value(len(data)),
+		RcvWnd:  30000,
+	}
+	c.SendPacket(nil, ackHeaders)
+
+	// Check the RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(ackHeaders.AckNum)),
+		checker.AckNum(uint32(ackHeaders.SeqNum)),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+
+}
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 2dd115409..a865e8857 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2142,6 +2142,7 @@ cc_library(
         ":socket_test_util",
         "//test/util:test_util",
         "//test/util:thread_util",
+        "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index a37b49447..c74273436 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -24,6 +24,8 @@
 #include <sys/un.h>
 
 #include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
@@ -789,5 +791,26 @@ TEST_P(TCPSocketPairTest, SetTCPLingerTimeout) {
   EXPECT_EQ(get, kTCPLingerTimeout);
 }
 
+TEST_P(TCPSocketPairTest, TestTCPCloseWithData) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  ScopedThread t([&]() {
+    // Close one end to trigger sending of a FIN.
+    ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_WR), SyscallSucceeds());
+    char buf[3];
+    ASSERT_THAT(read(sockets->second_fd(), buf, 3),
+                SyscallSucceedsWithValue(3));
+    absl::SleepFor(absl::Milliseconds(50));
+    ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+  });
+
+  absl::SleepFor(absl::Milliseconds(50));
+  // Send some data then close.
+  constexpr char kStr[] = "abc";
+  ASSERT_THAT(write(sockets->first_fd(), kStr, 3), SyscallSucceedsWithValue(3));
+  t.Join();
+  ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+}
+
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 035407153989b189a3ce42df43d6f528fa95444f Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Tue, 3 Dec 2019 15:06:18 -0800
Subject: Fix printing /proc/[pid]/io for /proc/[pid]/task/[tid]/io.

PiperOrigin-RevId: 283630669
---
 pkg/sentry/fs/proc/task.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 87184ec67..2a598149d 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -639,7 +639,7 @@ func (i *ioData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
 	io.Accumulate(i.IOUsage())
 
 	var buf bytes.Buffer
-	fmt.Fprintf(&buf, "char: %d\n", io.CharsRead)
+	fmt.Fprintf(&buf, "rchar: %d\n", io.CharsRead)
 	fmt.Fprintf(&buf, "wchar: %d\n", io.CharsWritten)
 	fmt.Fprintf(&buf, "syscr: %d\n", io.ReadSyscalls)
 	fmt.Fprintf(&buf, "syscw: %d\n", io.WriteSyscalls)
-- 
cgit v1.2.3


From cf7f27c16793eaa41743e96488dad2ddfd1f5d59 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 3 Dec 2019 16:30:38 -0800
Subject: net/udp: return a local route address as the bound-to address

If the socket is bound to ANY and connected to a loopback address,
getsockname() has to return the loopback address. Without this fix,
getsockname() returns ANY.

PiperOrigin-RevId: 283647781
---
 pkg/tcpip/transport/udp/endpoint.go          |  7 ++++-
 test/syscalls/linux/udp_socket_test_cases.cc | 39 ++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 24cb88c13..4b161e404 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -1134,9 +1134,14 @@ func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 
+	addr := e.ID.LocalAddress
+	if e.state == StateConnected {
+		addr = e.route.LocalAddress
+	}
+
 	return tcpip.FullAddress{
 		NIC:  e.RegisterNICID,
-		Addr: e.ID.LocalAddress,
+		Addr: addr,
 		Port: e.ID.LocalPort,
 	}, nil
 }
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index b6090ac66..63b92d6a7 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -527,6 +527,45 @@ TEST_P(UdpSocketTest, DisconnectAfterBind) {
               SyscallFailsWithErrno(ENOTCONN));
 }
 
+TEST_P(UdpSocketTest, BindToAnyConnnectToLocalhost) {
+  struct sockaddr_storage baddr = {};
+  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
+  if (GetParam() == AddressFamily::kIpv4) {
+    auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
+    addr_in->sin_family = AF_INET;
+    addr_in->sin_port = port;
+    addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
+  } else {
+    auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
+    addr_in->sin6_family = AF_INET6;
+    addr_in->sin6_port = port;
+    addr_in->sin6_scope_id = 0;
+    addr_in->sin6_addr = IN6ADDR_ANY_INIT;
+  }
+  ASSERT_THAT(bind(s_, reinterpret_cast<sockaddr*>(&baddr), addrlen_),
+              SyscallSucceeds());
+  // Connect the socket.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  struct sockaddr_storage addr = {};
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  // If the socket is bound to ANY and connected to a loopback address,
+  // getsockname() has to return the loopback address.
+  if (GetParam() == AddressFamily::kIpv4) {
+    auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
+    EXPECT_EQ(addrlen, sizeof(*addr_out));
+    EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK));
+  } else {
+    auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
+    struct in6_addr loopback = IN6ADDR_LOOPBACK_INIT;
+    EXPECT_EQ(addrlen, sizeof(*addr_out));
+    EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
+  }
+}
+
 TEST_P(UdpSocketTest, DisconnectAfterBindToAny) {
   struct sockaddr_storage baddr = {};
   socklen_t addrlen;
-- 
cgit v1.2.3


From 4c1fa402b3586035b94635413125909ccd2e7800 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 4 Dec 2019 01:31:08 +0000
Subject: Bump lodash in /benchmarks/workloads/node_template

Bumps [lodash](https://github.com/lodash/lodash) from 4.17.11 to 4.17.15.
- [Release notes](https://github.com/lodash/lodash/releases)
- [Commits](https://github.com/lodash/lodash/compare/4.17.11...4.17.15)

Signed-off-by: dependabot[bot] <support@github.com>
---
 benchmarks/workloads/node_template/package-lock.json | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/benchmarks/workloads/node_template/package-lock.json b/benchmarks/workloads/node_template/package-lock.json
index 1653597a1..580e68aa5 100644
--- a/benchmarks/workloads/node_template/package-lock.json
+++ b/benchmarks/workloads/node_template/package-lock.json
@@ -233,9 +233,9 @@
       "integrity": "sha1-6qM9bd16zo9/b+DJygRA5wZzix4="
     },
     "lodash": {
-      "version": "4.17.11",
-      "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.11.tgz",
-      "integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg=="
+      "version": "4.17.15",
+      "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.15.tgz",
+      "integrity": "sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A=="
     },
     "media-typer": {
       "version": "0.3.0",
@@ -364,6 +364,16 @@
         }
       }
     },
+    "redis-commands": {
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/redis-commands/-/redis-commands-1.5.0.tgz",
+      "integrity": "sha512-6KxamqpZ468MeQC3bkWmCB1fp56XL64D4Kf0zJSwDZbVLLm7KFkoIcHrgRvQ+sk8dnhySs7+yBg94yIkAK7aJg=="
+    },
+    "redis-parser": {
+      "version": "2.6.0",
+      "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-2.6.0.tgz",
+      "integrity": "sha1-Uu0J2srBCPGmMcB+m2mUHnoZUEs="
+    },
     "safe-buffer": {
       "version": "5.1.2",
       "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
-- 
cgit v1.2.3


From bb641c54035e79e3e4c2752e07e6ac55c620b93f Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 3 Dec 2019 17:32:27 -0800
Subject: Point TODO to gvisor.dev

PiperOrigin-RevId: 283657725
---
 test/syscalls/linux/aio.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc
index b27d4e10a..a33daff17 100644
--- a/test/syscalls/linux/aio.cc
+++ b/test/syscalls/linux/aio.cc
@@ -129,7 +129,7 @@ TEST_F(AIOTest, BasicWrite) {
   // aio implementation uses aio_ring. gVisor doesn't and returns all zeroes.
   // Linux implements aio_ring, so skip the zeroes check.
   //
-  // TODO(b/65486370): Remove when gVisor implements aio_ring.
+  // TODO(gvisor.dev/issue/204): Remove when gVisor implements aio_ring.
   auto ring = reinterpret_cast<struct aio_ring*>(ctx_);
   auto magic = IsRunningOnGvisor() ? 0 : AIO_RING_MAGIC;
   EXPECT_EQ(ring->magic, magic);
-- 
cgit v1.2.3


From 80b7ba0c9709c0c7f4c3aef5637d23225bcb866b Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 3 Dec 2019 19:40:56 -0800
Subject: Clean up readv_socket test suite.

Get rid of the SocketTest class, which is only extended by ReadvSocketTest.
Also, get rid of TCP sockets (which were unused anyway) from readv_socket.cc.
This is a very old test suite that isn't the right place for TCP loopback
tests.

PiperOrigin-RevId: 283672772
---
 test/syscalls/linux/BUILD           |  1 -
 test/syscalls/linux/file_base.h     | 89 -------------------------------------
 test/syscalls/linux/readv_common.cc | 43 +++++++++++++++++-
 test/syscalls/linux/readv_socket.cc | 45 ++++++++++++++++---
 4 files changed, 80 insertions(+), 98 deletions(-)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index a865e8857..9cca78a93 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1795,7 +1795,6 @@ cc_binary(
     name = "readv_socket_test",
     testonly = 1,
     srcs = [
-        "file_base.h",
         "readv_common.cc",
         "readv_common.h",
         "readv_socket.cc",
diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h
index 4e048320e..6f80bc97c 100644
--- a/test/syscalls/linux/file_base.h
+++ b/test/syscalls/linux/file_base.h
@@ -111,95 +111,6 @@ class FileTest : public ::testing::Test {
   int test_pipe_[2];
 };
 
-class SocketTest : public ::testing::Test {
- public:
-  void SetUp() override {
-    test_unix_stream_socket_[0] = -1;
-    test_unix_stream_socket_[1] = -1;
-    test_unix_dgram_socket_[0] = -1;
-    test_unix_dgram_socket_[1] = -1;
-    test_unix_seqpacket_socket_[0] = -1;
-    test_unix_seqpacket_socket_[1] = -1;
-    test_tcp_socket_[0] = -1;
-    test_tcp_socket_[1] = -1;
-
-    ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, test_unix_stream_socket_),
-                SyscallSucceeds());
-    ASSERT_THAT(fcntl(test_unix_stream_socket_[0], F_SETFL, O_NONBLOCK),
-                SyscallSucceeds());
-    ASSERT_THAT(socketpair(AF_UNIX, SOCK_DGRAM, 0, test_unix_dgram_socket_),
-                SyscallSucceeds());
-    ASSERT_THAT(fcntl(test_unix_dgram_socket_[0], F_SETFL, O_NONBLOCK),
-                SyscallSucceeds());
-    ASSERT_THAT(
-        socketpair(AF_UNIX, SOCK_SEQPACKET, 0, test_unix_seqpacket_socket_),
-        SyscallSucceeds());
-    ASSERT_THAT(fcntl(test_unix_seqpacket_socket_[0], F_SETFL, O_NONBLOCK),
-                SyscallSucceeds());
-  }
-
-  void TearDown() override {
-    close(test_unix_stream_socket_[0]);
-    close(test_unix_stream_socket_[1]);
-
-    close(test_unix_dgram_socket_[0]);
-    close(test_unix_dgram_socket_[1]);
-
-    close(test_unix_seqpacket_socket_[0]);
-    close(test_unix_seqpacket_socket_[1]);
-
-    close(test_tcp_socket_[0]);
-    close(test_tcp_socket_[1]);
-  }
-
-  int test_unix_stream_socket_[2];
-  int test_unix_dgram_socket_[2];
-  int test_unix_seqpacket_socket_[2];
-  int test_tcp_socket_[2];
-};
-
-// MatchesStringLength checks that a tuple argument of (struct iovec *, int)
-// corresponding to an iovec array and its length, contains data that matches
-// the string length strlen.
-MATCHER_P(MatchesStringLength, strlen, "") {
-  struct iovec* iovs = arg.first;
-  int niov = arg.second;
-  int offset = 0;
-  for (int i = 0; i < niov; i++) {
-    offset += iovs[i].iov_len;
-  }
-  if (offset != static_cast<int>(strlen)) {
-    *result_listener << offset;
-    return false;
-  }
-  return true;
-}
-
-// MatchesStringValue checks that a tuple argument of (struct iovec *, int)
-// corresponding to an iovec array and its length, contains data that matches
-// the string value str.
-MATCHER_P(MatchesStringValue, str, "") {
-  struct iovec* iovs = arg.first;
-  int len = strlen(str);
-  int niov = arg.second;
-  int offset = 0;
-  for (int i = 0; i < niov; i++) {
-    struct iovec iov = iovs[i];
-    if (len < offset) {
-      *result_listener << "strlen " << len << " < offset " << offset;
-      return false;
-    }
-    if (strncmp(static_cast<char*>(iov.iov_base), &str[offset], iov.iov_len)) {
-      absl::string_view iovec_string(static_cast<char*>(iov.iov_base),
-                                     iov.iov_len);
-      *result_listener << iovec_string << " @offset " << offset;
-      return false;
-    }
-    offset += iov.iov_len;
-  }
-  return true;
-}
-
 }  // namespace testing
 }  // namespace gvisor
 
diff --git a/test/syscalls/linux/readv_common.cc b/test/syscalls/linux/readv_common.cc
index 9658f7d42..491d5f40f 100644
--- a/test/syscalls/linux/readv_common.cc
+++ b/test/syscalls/linux/readv_common.cc
@@ -19,12 +19,53 @@
 #include <unistd.h>
 
 #include "gtest/gtest.h"
-#include "test/syscalls/linux/file_base.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
 namespace testing {
 
+// MatchesStringLength checks that a tuple argument of (struct iovec *, int)
+// corresponding to an iovec array and its length, contains data that matches
+// the string length strlen.
+MATCHER_P(MatchesStringLength, strlen, "") {
+  struct iovec* iovs = arg.first;
+  int niov = arg.second;
+  int offset = 0;
+  for (int i = 0; i < niov; i++) {
+    offset += iovs[i].iov_len;
+  }
+  if (offset != static_cast<int>(strlen)) {
+    *result_listener << offset;
+    return false;
+  }
+  return true;
+}
+
+// MatchesStringValue checks that a tuple argument of (struct iovec *, int)
+// corresponding to an iovec array and its length, contains data that matches
+// the string value str.
+MATCHER_P(MatchesStringValue, str, "") {
+  struct iovec* iovs = arg.first;
+  int len = strlen(str);
+  int niov = arg.second;
+  int offset = 0;
+  for (int i = 0; i < niov; i++) {
+    struct iovec iov = iovs[i];
+    if (len < offset) {
+      *result_listener << "strlen " << len << " < offset " << offset;
+      return false;
+    }
+    if (strncmp(static_cast<char*>(iov.iov_base), &str[offset], iov.iov_len)) {
+      absl::string_view iovec_string(static_cast<char*>(iov.iov_base),
+                                     iov.iov_len);
+      *result_listener << iovec_string << " @offset " << offset;
+      return false;
+    }
+    offset += iov.iov_len;
+  }
+  return true;
+}
+
 extern const char kReadvTestData[] =
     "127.0.0.1      localhost"
     ""
diff --git a/test/syscalls/linux/readv_socket.cc b/test/syscalls/linux/readv_socket.cc
index 9b6972201..dd6fb7008 100644
--- a/test/syscalls/linux/readv_socket.cc
+++ b/test/syscalls/linux/readv_socket.cc
@@ -19,7 +19,6 @@
 #include <unistd.h>
 
 #include "gtest/gtest.h"
-#include "test/syscalls/linux/file_base.h"
 #include "test/syscalls/linux/readv_common.h"
 #include "test/util/test_util.h"
 
@@ -28,9 +27,30 @@ namespace testing {
 
 namespace {
 
-class ReadvSocketTest : public SocketTest {
+class ReadvSocketTest : public ::testing::Test {
+ public:
   void SetUp() override {
-    SocketTest::SetUp();
+    test_unix_stream_socket_[0] = -1;
+    test_unix_stream_socket_[1] = -1;
+    test_unix_dgram_socket_[0] = -1;
+    test_unix_dgram_socket_[1] = -1;
+    test_unix_seqpacket_socket_[0] = -1;
+    test_unix_seqpacket_socket_[1] = -1;
+
+    ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, test_unix_stream_socket_),
+                SyscallSucceeds());
+    ASSERT_THAT(fcntl(test_unix_stream_socket_[0], F_SETFL, O_NONBLOCK),
+                SyscallSucceeds());
+    ASSERT_THAT(socketpair(AF_UNIX, SOCK_DGRAM, 0, test_unix_dgram_socket_),
+                SyscallSucceeds());
+    ASSERT_THAT(fcntl(test_unix_dgram_socket_[0], F_SETFL, O_NONBLOCK),
+                SyscallSucceeds());
+    ASSERT_THAT(
+        socketpair(AF_UNIX, SOCK_SEQPACKET, 0, test_unix_seqpacket_socket_),
+        SyscallSucceeds());
+    ASSERT_THAT(fcntl(test_unix_seqpacket_socket_[0], F_SETFL, O_NONBLOCK),
+                SyscallSucceeds());
+
     ASSERT_THAT(
         write(test_unix_stream_socket_[1], kReadvTestData, kReadvTestDataSize),
         SyscallSucceedsWithValue(kReadvTestDataSize));
@@ -40,11 +60,22 @@ class ReadvSocketTest : public SocketTest {
     ASSERT_THAT(write(test_unix_seqpacket_socket_[1], kReadvTestData,
                       kReadvTestDataSize),
                 SyscallSucceedsWithValue(kReadvTestDataSize));
-    // FIXME(b/69821513): Enable when possible.
-    // ASSERT_THAT(write(test_tcp_socket_[1], kReadvTestData,
-    // kReadvTestDataSize),
-    //             SyscallSucceedsWithValue(kReadvTestDataSize));
   }
+
+  void TearDown() override {
+    close(test_unix_stream_socket_[0]);
+    close(test_unix_stream_socket_[1]);
+
+    close(test_unix_dgram_socket_[0]);
+    close(test_unix_dgram_socket_[1]);
+
+    close(test_unix_seqpacket_socket_[0]);
+    close(test_unix_seqpacket_socket_[1]);
+  }
+
+  int test_unix_stream_socket_[2];
+  int test_unix_dgram_socket_[2];
+  int test_unix_seqpacket_socket_[2];
 };
 
 TEST_F(ReadvSocketTest, ReadOneBufferPerByte_StreamSocket) {
-- 
cgit v1.2.3


From 70b68bb058e2f3281ba0e245ca404f264cfc547b Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Wed, 4 Dec 2019 16:28:16 +0800
Subject: Add a floating test case for Arm64

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/kvm/testutil/testutil_arm64.s | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
index 2cd28b2d2..0bebee852 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
@@ -50,6 +50,21 @@ TEXT ·SpinLoop(SB),NOSPLIT,$0
 start:
 	B start
 
+TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8
+	NO_LOCAL_POINTERS
+	FMOVD $(9.9), F0
+	MOVD $SYS_GETPID, R8 // getpid
+	SVC
+	FMOVD $(9.9), F1
+	FCMPD F0, F1
+	BNE isNaN
+	MOVD $1, R0
+	MOVD R0, ret+0(FP)
+	RET
+isNaN:
+	MOVD $0, ret+0(FP)
+	RET
+
 // MVN: bitwise logical NOT
 // This case simulates an application that modified R0-R30.
 #define TWIDDLE_REGS() \
-- 
cgit v1.2.3


From 1eda90d0848658f330e5f37ce18209bd3d069766 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 4 Dec 2019 13:53:08 -0800
Subject: Remove TODO since we don't plan to support debug registers

PiperOrigin-RevId: 283828423
---
 pkg/sentry/arch/arch_amd64.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index 9e7db8b30..67daa6c24 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -305,7 +305,7 @@ func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
 		buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs())
 		return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil
 	}
-	// TODO(b/34088053): debug registers
+	// Note: x86 debug registers are missing.
 	return c.Native(0), nil
 }
 
@@ -320,6 +320,6 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error {
 		_, err := c.PtraceSetRegs(bytes.NewBuffer(buf))
 		return err
 	}
-	// TODO(b/34088053): debug registers
+	// Note: x86 debug registers are missing.
 	return nil
 }
-- 
cgit v1.2.3


From 6ae64d793593eaf3c1364354ef01a555f230a0fe Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 4 Dec 2019 23:44:25 -0800
Subject: Allow syscall tests to run with hostinet.

Fixes #1207

PiperOrigin-RevId: 283914438
---
 test/syscalls/build_defs.bzl         | 17 +++++++++++++++++
 test/syscalls/syscall_test_runner.go | 10 ++++++----
 test/util/test_util.cc               |  6 ++++++
 test/util/test_util.h                |  1 +
 4 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index dcf5b73ed..aaf77c65b 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -9,6 +9,7 @@ def syscall_test(
         use_tmpfs = False,
         add_overlay = False,
         add_uds_tree = False,
+        add_hostinet = False,
         tags = None):
     _syscall_test(
         test = test,
@@ -65,6 +66,18 @@ def syscall_test(
             file_access = "shared",
         )
 
+    if add_hostinet:
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = "ptrace",
+            use_tmpfs = use_tmpfs,
+            network = "host",
+            add_uds_tree = add_uds_tree,
+            tags = tags,
+        )
+
 def _syscall_test(
         test,
         shard_count,
@@ -72,6 +85,7 @@ def _syscall_test(
         platform,
         use_tmpfs,
         tags,
+        network = "none",
         file_access = "exclusive",
         overlay = False,
         add_uds_tree = False):
@@ -85,6 +99,8 @@ def _syscall_test(
         name += "_shared"
     if overlay:
         name += "_overlay"
+    if network != "none":
+        name += "_" + network + "net"
 
     if tags == None:
         tags = []
@@ -107,6 +123,7 @@ def _syscall_test(
         # Arguments are passed directly to syscall_test_runner binary.
         "--test-name=" + test_name,
         "--platform=" + platform,
+        "--network=" + network,
         "--use-tmpfs=" + str(use_tmpfs),
         "--file-access=" + file_access,
         "--overlay=" + str(overlay),
diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
index accf46347..b9fd885ff 100644
--- a/test/syscalls/syscall_test_runner.go
+++ b/test/syscalls/syscall_test_runner.go
@@ -46,6 +46,7 @@ var (
 	debug      = flag.Bool("debug", false, "enable debug logs")
 	strace     = flag.Bool("strace", false, "enable strace logs")
 	platform   = flag.String("platform", "ptrace", "platform to run on")
+	network    = flag.String("network", "none", "network stack to run on (sandbox, host, none)")
 	useTmpfs   = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp")
 	fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode")
 	overlay    = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
@@ -137,7 +138,7 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
 
 	args := []string{
 		"-root", rootDir,
-		"-network=none",
+		"-network", *network,
 		"-log-format=text",
 		"-TESTONLY-unsafe-nonroot=true",
 		"-net-raw=true",
@@ -335,10 +336,11 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 		})
 	}
 
-	// Set environment variable that indicates we are
-	// running in gVisor and with the given platform.
+	// Set environment variables that indicate we are
+	// running in gVisor with the given platform and network.
 	platformVar := "TEST_ON_GVISOR"
-	env := append(os.Environ(), platformVar+"="+*platform)
+	networkVar := "GVISOR_NETWORK"
+	env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network)
 
 	// Remove env variables that cause the gunit binary to write output
 	// files, since they will stomp on eachother, and on the output files
diff --git a/test/util/test_util.cc b/test/util/test_util.cc
index 9cb050735..848504c88 100644
--- a/test/util/test_util.cc
+++ b/test/util/test_util.cc
@@ -41,6 +41,7 @@ namespace gvisor {
 namespace testing {
 
 #define TEST_ON_GVISOR "TEST_ON_GVISOR"
+#define GVISOR_NETWORK "GVISOR_NETWORK"
 
 bool IsRunningOnGvisor() { return GvisorPlatform() != Platform::kNative; }
 
@@ -60,6 +61,11 @@ Platform GvisorPlatform() {
   abort();
 }
 
+bool IsRunningWithHostinet() {
+  char* env = getenv(GVISOR_NETWORK);
+  return env && strcmp(env, "host") == 0;
+}
+
 // Inline cpuid instruction.  Preserve %ebx/%rbx register. In PIC compilations
 // %ebx contains the address of the global offset table. %rbx is occasionally
 // used to address stack variables in presence of dynamic allocas.
diff --git a/test/util/test_util.h b/test/util/test_util.h
index ee6c2bf4d..b3235c7e3 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -220,6 +220,7 @@ enum class Platform {
 };
 bool IsRunningOnGvisor();
 Platform GvisorPlatform();
+bool IsRunningWithHostinet();
 
 #ifdef __linux__
 void SetupGvisorDeathTest();
-- 
cgit v1.2.3


From 05758f34b2f65b7e6b118d3719cb8ce37eb4bc79 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 5 Dec 2019 05:43:52 -0800
Subject: Explicitly export files needed by other packages

PiperOrigin-RevId: 283955946
---
 test/syscalls/linux/BUILD | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 9cca78a93..7ce2e6270 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -6,6 +6,16 @@ package(
     licenses = ["notice"],
 )
 
+exports_files(
+    [
+        "socket.cc",
+        "socket_ipv4_udp_unbound_loopback.cc",
+        "tcp_socket.cc",
+        "udp_socket.cc",
+    ],
+    visibility = ["//:sandbox"],
+)
+
 cc_binary(
     name = "sigaltstack_check",
     testonly = 1,
-- 
cgit v1.2.3


From 10f7b109ab98c95783357b82e1934586f338c2b3 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 5 Dec 2019 10:40:18 -0800
Subject: Add a type to represent the NDP Recursive DNS Server option

This change adds a type to represent the NDP Recursive DNS Server option, as
defined by RFC 8106 section 5.1.

PiperOrigin-RevId: 284005493
---
 pkg/tcpip/header/BUILD          |   5 +-
 pkg/tcpip/header/ndp_options.go | 118 +++++++++++++++++++++-
 pkg/tcpip/header/ndp_test.go    | 215 ++++++++++++++++++++++++++++++++++++++++
 pkg/tcpip/stack/ndp.go          |   6 +-
 pkg/tcpip/stack/ndp_test.go     |   6 +-
 5 files changed, 339 insertions(+), 11 deletions(-)

diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index a3485b35c..8392cb9e5 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -55,5 +55,8 @@ go_test(
         "ndp_test.go",
     ],
     embed = [":header"],
-    deps = ["//pkg/tcpip"],
+    deps = [
+        "//pkg/tcpip",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
 )
diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
index 1ca6199ef..2652e7b67 100644
--- a/pkg/tcpip/header/ndp_options.go
+++ b/pkg/tcpip/header/ndp_options.go
@@ -85,6 +85,23 @@ const (
 	// within an NDPPrefixInformation.
 	ndpPrefixInformationPrefixOffset = 14
 
+	// NDPRecursiveDNSServerOptionType is the type of the Recursive DNS
+	// Server option, as per RFC 8106 section 5.1.
+	NDPRecursiveDNSServerOptionType = 25
+
+	// ndpRecursiveDNSServerLifetimeOffset is the start of the 4-byte
+	// Lifetime field within an NDPRecursiveDNSServer.
+	ndpRecursiveDNSServerLifetimeOffset = 2
+
+	// ndpRecursiveDNSServerAddressesOffset is the start of the addresses
+	// for IPv6 Recursive DNS Servers within an NDPRecursiveDNSServer.
+	ndpRecursiveDNSServerAddressesOffset = 6
+
+	// minNDPRecursiveDNSServerLength is the minimum NDP Recursive DNS
+	// Server option's length field value when it contains at least one
+	// IPv6 address.
+	minNDPRecursiveDNSServerLength = 3
+
 	// lengthByteUnits is the multiplier factor for the Length field of an
 	// NDP option. That is, the length field for NDP options is in units of
 	// 8 octets, as per RFC 4861 section 4.6.
@@ -92,13 +109,13 @@ const (
 )
 
 var (
-	// NDPPrefixInformationInfiniteLifetime is a value that represents
+	// NDPInfiniteLifetime is a value that represents
 	// infinity for the Valid and Preferred Lifetime fields in a NDP Prefix
 	// Information option. Its value is (2^32 - 1)s = 4294967295s
 	//
 	// This is a variable instead of a constant so that tests can change
 	// this value to a smaller value. It should only be modified by tests.
-	NDPPrefixInformationInfiniteLifetime = time.Second * 4294967295
+	NDPInfiniteLifetime = time.Second * 4294967295
 )
 
 // NDPOptionIterator is an iterator of NDPOption.
@@ -118,6 +135,7 @@ var (
 	ErrNDPOptBufExhausted  = errors.New("Buffer unexpectedly exhausted")
 	ErrNDPOptZeroLength    = errors.New("NDP option has zero-valued Length field")
 	ErrNDPOptMalformedBody = errors.New("NDP option has a malformed body")
+	ErrNDPInvalidLength    = errors.New("NDP option's Length value is invalid as per relevant RFC")
 )
 
 // Next returns the next element in the backing NDPOptions, or true if we are
@@ -182,6 +200,22 @@ func (i *NDPOptionIterator) Next() (NDPOption, bool, error) {
 			}
 
 			return NDPPrefixInformation(body), false, nil
+
+		case NDPRecursiveDNSServerOptionType:
+			// RFC 8106 section 5.3.1 outlines that the RDNSS option
+			// must have a minimum length of 3 so it contains at
+			// least one IPv6 address.
+			if l < minNDPRecursiveDNSServerLength {
+				return nil, true, ErrNDPInvalidLength
+			}
+
+			opt := NDPRecursiveDNSServer(body)
+			if len(opt.Addresses()) == 0 {
+				return nil, true, ErrNDPOptMalformedBody
+			}
+
+			return opt, false, nil
+
 		default:
 			// We do not yet recognize the option, just skip for
 			// now. This is okay because RFC 4861 allows us to
@@ -434,7 +468,7 @@ func (o NDPPrefixInformation) AutonomousAddressConfigurationFlag() bool {
 //
 // Note, a value of 0 implies the prefix should not be considered as on-link,
 // and a value of infinity/forever is represented by
-// NDPPrefixInformationInfiniteLifetime.
+// NDPInfiniteLifetime.
 func (o NDPPrefixInformation) ValidLifetime() time.Duration {
 	// The field is the time in seconds, as per RFC 4861 section 4.6.2.
 	return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpPrefixInformationValidLifetimeOffset:]))
@@ -447,7 +481,7 @@ func (o NDPPrefixInformation) ValidLifetime() time.Duration {
 //
 // Note, a value of 0 implies that addresses generated from the prefix should
 // no longer remain preferred, and a value of infinity is represented by
-// NDPPrefixInformationInfiniteLifetime.
+// NDPInfiniteLifetime.
 //
 // Also note that the value of this field MUST NOT exceed the Valid Lifetime
 // field to avoid preferring addresses that are no longer valid, for the
@@ -476,3 +510,79 @@ func (o NDPPrefixInformation) Subnet() tcpip.Subnet {
 	}
 	return addrWithPrefix.Subnet()
 }
+
+// NDPRecursiveDNSServer is the NDP Recursive DNS Server option, as defined by
+// RFC 8106 section 5.1.
+//
+// To make sure that the option meets its minimum length and does not end in the
+// middle of a DNS server's IPv6 address, the length of a valid
+// NDPRecursiveDNSServer must meet the following constraint:
+//   (Length - ndpRecursiveDNSServerAddressesOffset) % IPv6AddressSize == 0
+type NDPRecursiveDNSServer []byte
+
+// Type returns the type of an NDP Recursive DNS Server option.
+//
+// Type implements NDPOption.Type.
+func (NDPRecursiveDNSServer) Type() uint8 {
+	return NDPRecursiveDNSServerOptionType
+}
+
+// Length implements NDPOption.Length.
+func (o NDPRecursiveDNSServer) Length() int {
+	return len(o)
+}
+
+// serializeInto implements NDPOption.serializeInto.
+func (o NDPRecursiveDNSServer) serializeInto(b []byte) int {
+	used := copy(b, o)
+
+	// Zero out the reserved bytes that are before the Lifetime field.
+	for i := 0; i < ndpRecursiveDNSServerLifetimeOffset; i++ {
+		b[i] = 0
+	}
+
+	return used
+}
+
+// Lifetime returns the length of time that the DNS server addresses
+// in this option may be used for name resolution.
+//
+// Note, a value of 0 implies the addresses should no longer be used,
+// and a value of infinity/forever is represented by NDPInfiniteLifetime.
+//
+// Lifetime may panic if o does not have enough bytes to hold the Lifetime
+// field.
+func (o NDPRecursiveDNSServer) Lifetime() time.Duration {
+	// The field is the time in seconds, as per RFC 8106 section 5.1.
+	return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpRecursiveDNSServerLifetimeOffset:]))
+}
+
+// Addresses returns the recursive DNS server IPv6 addresses that may be
+// used for name resolution.
+//
+// Note, some of the addresses returned MAY be link-local addresses.
+//
+// Addresses may panic if o does not hold valid IPv6 addresses.
+func (o NDPRecursiveDNSServer) Addresses() []tcpip.Address {
+	l := len(o)
+	if l < ndpRecursiveDNSServerAddressesOffset {
+		return nil
+	}
+
+	l -= ndpRecursiveDNSServerAddressesOffset
+	if l%IPv6AddressSize != 0 {
+		return nil
+	}
+
+	buf := o[ndpRecursiveDNSServerAddressesOffset:]
+	var addrs []tcpip.Address
+	for len(buf) > 0 {
+		addr := tcpip.Address(buf[:IPv6AddressSize])
+		if !IsV6UnicastAddress(addr) {
+			return nil
+		}
+		addrs = append(addrs, addr)
+		buf = buf[IPv6AddressSize:]
+	}
+	return addrs
+}
diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go
index ad6daafcd..2c439d70c 100644
--- a/pkg/tcpip/header/ndp_test.go
+++ b/pkg/tcpip/header/ndp_test.go
@@ -19,6 +19,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
@@ -369,6 +370,175 @@ func TestNDPPrefixInformationOption(t *testing.T) {
 	}
 }
 
+func TestNDPRecursiveDNSServerOptionSerialize(t *testing.T) {
+	b := []byte{
+		9, 8,
+		1, 2, 4, 8,
+		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+	}
+	targetBuf := []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
+	expected := []byte{
+		25, 3, 0, 0,
+		1, 2, 4, 8,
+		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+	}
+	opts := NDPOptions(targetBuf)
+	serializer := NDPOptionsSerializer{
+		NDPRecursiveDNSServer(b),
+	}
+	if got, want := opts.Serialize(serializer), len(expected); got != want {
+		t.Errorf("got Serialize = %d, want = %d", got, want)
+	}
+	if !bytes.Equal(targetBuf, expected) {
+		t.Fatalf("got targetBuf = %x, want = %x", targetBuf, expected)
+	}
+
+	it, err := opts.Iter(true)
+	if err != nil {
+		t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+	}
+
+	next, done, err := it.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got := next.Type(); got != NDPRecursiveDNSServerOptionType {
+		t.Errorf("got Type = %d, want = %d", got, NDPRecursiveDNSServerOptionType)
+	}
+
+	opt, ok := next.(NDPRecursiveDNSServer)
+	if !ok {
+		t.Fatalf("next (type = %T) cannot be casted to an NDPRecursiveDNSServer", next)
+	}
+	if got := opt.Type(); got != 25 {
+		t.Errorf("got Type = %d, want = 31", got)
+	}
+	if got := opt.Length(); got != 22 {
+		t.Errorf("got Length = %d, want = 22", got)
+	}
+	if got, want := opt.Lifetime(), 16909320*time.Second; got != want {
+		t.Errorf("got Lifetime = %s, want = %s", got, want)
+	}
+	want := []tcpip.Address{
+		"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+	}
+	if got := opt.Addresses(); !cmp.Equal(got, want) {
+		t.Errorf("got Addresses = %v, want = %v", got, want)
+	}
+
+	// Iterator should not return anything else.
+	next, done, err = it.Next()
+	if err != nil {
+		t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if !done {
+		t.Error("got Next = (_, false, _), want = (_, true, _)")
+	}
+	if next != nil {
+		t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+	}
+}
+
+func TestNDPRecursiveDNSServerOption(t *testing.T) {
+	tests := []struct {
+		name     string
+		buf      []byte
+		lifetime time.Duration
+		addrs    []tcpip.Address
+	}{
+		{
+			"Valid1Addr",
+			[]byte{
+				25, 3, 0, 0,
+				0, 0, 0, 0,
+				0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+			},
+			0,
+			[]tcpip.Address{
+				"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+			},
+		},
+		{
+			"Valid2Addr",
+			[]byte{
+				25, 5, 0, 0,
+				0, 0, 0, 0,
+				0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+				17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16,
+			},
+			0,
+			[]tcpip.Address{
+				"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+				"\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x10",
+			},
+		},
+		{
+			"Valid3Addr",
+			[]byte{
+				25, 7, 0, 0,
+				0, 0, 0, 0,
+				0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+				17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16,
+				17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17,
+			},
+			0,
+			[]tcpip.Address{
+				"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+				"\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x10",
+				"\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x11",
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opts := NDPOptions(test.buf)
+			it, err := opts.Iter(true)
+			if err != nil {
+				t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+			}
+
+			// Iterator should get our option.
+			next, done, err := it.Next()
+			if err != nil {
+				t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+			}
+			if done {
+				t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+			}
+			if got := next.Type(); got != NDPRecursiveDNSServerOptionType {
+				t.Fatalf("got Type %= %d, want = %d", got, NDPRecursiveDNSServerOptionType)
+			}
+
+			opt, ok := next.(NDPRecursiveDNSServer)
+			if !ok {
+				t.Fatalf("next (type = %T) cannot be casted to an NDPRecursiveDNSServer", next)
+			}
+			if got := opt.Lifetime(); got != test.lifetime {
+				t.Errorf("got Lifetime = %d, want = %d", got, test.lifetime)
+			}
+			if got := opt.Addresses(); !cmp.Equal(got, test.addrs) {
+				t.Errorf("got Addresses = %v, want = %v", got, test.addrs)
+			}
+
+			// Iterator should not return anything else.
+			next, done, err = it.Next()
+			if err != nil {
+				t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+			}
+			if !done {
+				t.Error("got Next = (_, false, _), want = (_, true, _)")
+			}
+			if next != nil {
+				t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+			}
+		})
+	}
+}
+
 // TestNDPOptionsIterCheck tests that Iter will return false if the NDPOptions
 // the iterator was returned for is malformed.
 func TestNDPOptionsIterCheck(t *testing.T) {
@@ -473,6 +643,51 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 			},
 			nil,
 		},
+		{
+			"InvalidRecursiveDNSServerCutsOffAddress",
+			[]byte{
+				25, 4, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+				0, 1, 2, 3, 4, 5, 6, 7,
+			},
+			ErrNDPOptMalformedBody,
+		},
+		{
+			"InvalidRecursiveDNSServerInvalidLengthField",
+			[]byte{
+				25, 2, 0, 0,
+				0, 0, 0, 0,
+				0, 1, 2, 3, 4, 5, 6, 7, 8,
+			},
+			ErrNDPInvalidLength,
+		},
+		{
+			"RecursiveDNSServerTooSmall",
+			[]byte{
+				25, 1, 0, 0,
+				0, 0, 0,
+			},
+			ErrNDPOptBufExhausted,
+		},
+		{
+			"RecursiveDNSServerMulticast",
+			[]byte{
+				25, 3, 0, 0,
+				0, 0, 0, 0,
+				255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+			},
+			ErrNDPOptMalformedBody,
+		},
+		{
+			"RecursiveDNSServerUnspecified",
+			[]byte{
+				25, 3, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			},
+			ErrNDPOptMalformedBody,
+		},
 	}
 
 	for _, test := range tests {
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index cfdd0496e..1d202deb5 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -596,7 +596,7 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 			// Update the invalidation timer.
 			timer := prefixState.invalidationTimer
 
-			if timer == nil && vl >= header.NDPPrefixInformationInfiniteLifetime {
+			if timer == nil && vl >= header.NDPInfiniteLifetime {
 				// Had infinite valid lifetime before and
 				// continues to have an invalid lifetime. Do
 				// nothing further.
@@ -615,7 +615,7 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 				*prefixState.doNotInvalidate = true
 			}
 
-			if vl >= header.NDPPrefixInformationInfiniteLifetime {
+			if vl >= header.NDPInfiniteLifetime {
 				// Prefix is now valid forever so we don't need
 				// an invalidation timer.
 				prefixState.invalidationTimer = nil
@@ -734,7 +734,7 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration)
 	var timer *time.Timer
 
 	// Only create a timer if the lifetime is not infinite.
-	if l < header.NDPPrefixInformationInfiniteLifetime {
+	if l < header.NDPInfiniteLifetime {
 		timer = ndp.prefixInvalidationCallback(prefix, l, &doNotInvalidate)
 	}
 
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 5b901f947..b2af78212 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -1364,10 +1364,10 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 	// invalidate the prefix.
 	const testInfiniteLifetimeSeconds = 2
 	const testInfiniteLifetime = testInfiniteLifetimeSeconds * time.Second
-	saved := header.NDPPrefixInformationInfiniteLifetime
-	header.NDPPrefixInformationInfiniteLifetime = testInfiniteLifetime
+	saved := header.NDPInfiniteLifetime
+	header.NDPInfiniteLifetime = testInfiniteLifetime
 	defer func() {
-		header.NDPPrefixInformationInfiniteLifetime = saved
+		header.NDPInfiniteLifetime = saved
 	}()
 
 	prefix := tcpip.AddressWithPrefix{
-- 
cgit v1.2.3


From 02258607f97353932d56bfde9274d50dda18e374 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 5 Dec 2019 12:56:31 -0800
Subject: Add vfs.CheckSetStat() and its dependencies.

PiperOrigin-RevId: 284033820
---
 pkg/sentry/vfs/permissions.go | 62 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index f8e74355c..f1edb0680 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -119,3 +119,65 @@ func MayWriteFileWithOpenFlags(flags uint32) bool {
 		return false
 	}
 }
+
+// CheckSetStat checks that creds has permission to change the metadata of a
+// file with the given permissions, UID, and GID as specified by stat, subject
+// to the rules of Linux's fs/attr.c:setattr_prepare().
+func CheckSetStat(creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+	if stat.Mask&linux.STATX_MODE != 0 {
+		if !CanActAsOwner(creds, kuid) {
+			return syserror.EPERM
+		}
+		// TODO(b/30815691): "If the calling process is not privileged (Linux:
+		// does not have the CAP_FSETID capability), and the group of the file
+		// does not match the effective group ID of the process or one of its
+		// supplementary group IDs, the S_ISGID bit will be turned off, but
+		// this will not cause an error to be returned." - chmod(2)
+	}
+	if stat.Mask&linux.STATX_UID != 0 {
+		if !((creds.EffectiveKUID == kuid && auth.KUID(stat.UID) == kuid) ||
+			HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) {
+			return syserror.EPERM
+		}
+	}
+	if stat.Mask&linux.STATX_GID != 0 {
+		if !((creds.EffectiveKUID == kuid && creds.InGroup(auth.KGID(stat.GID))) ||
+			HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) {
+			return syserror.EPERM
+		}
+	}
+	if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 {
+		if !CanActAsOwner(creds, kuid) {
+			if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) ||
+				(stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW) ||
+				(stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) {
+				return syserror.EPERM
+			}
+			// isDir is irrelevant in the following call to
+			// GenericCheckPermissions since ats == MayWrite means that
+			// CAP_DAC_READ_SEARCH does not apply, and CAP_DAC_OVERRIDE
+			// applies, regardless of isDir.
+			if err := GenericCheckPermissions(creds, MayWrite, false /* isDir */, mode, kuid, kgid); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// CanActAsOwner returns true if creds can act as the owner of a file with the
+// given owning UID, consistent with Linux's
+// fs/inode.c:inode_owner_or_capable().
+func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool {
+	if creds.EffectiveKUID == kuid {
+		return true
+	}
+	return creds.HasCapability(linux.CAP_FOWNER) && creds.UserNamespace.MapFromKUID(kuid).Ok()
+}
+
+// HasCapabilityOnFile returns true if creds has the given capability with
+// respect to a file with the given owning UID and GID, consistent with Linux's
+// kernel/capability.c:capable_wrt_inode_uidgid().
+func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool {
+	return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok()
+}
-- 
cgit v1.2.3


From 0a32c0235744191947a6bf890031026e06788837 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Thu, 5 Dec 2019 13:22:31 -0800
Subject: Create correct file for /proc/[pid]/task/[tid]/io

PiperOrigin-RevId: 284038840
---
 pkg/sentry/fs/proc/task.go  | 32 +++++++++++++++++---------------
 test/syscalls/linux/proc.cc | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 2a598149d..0e46c5fb7 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -67,29 +67,28 @@ type taskDir struct {
 var _ fs.InodeOperations = (*taskDir)(nil)
 
 // newTaskDir creates a new proc task entry.
-func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, showSubtasks bool) *fs.Inode {
+func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
 	contents := map[string]*fs.Inode{
-		"auxv":    newAuxvec(t, msrc),
-		"cmdline": newExecArgInode(t, msrc, cmdlineExecArg),
-		"comm":    newComm(t, msrc),
-		"environ": newExecArgInode(t, msrc, environExecArg),
-		"exe":     newExe(t, msrc),
-		"fd":      newFdDir(t, msrc),
-		"fdinfo":  newFdInfoDir(t, msrc),
-		"gid_map": newGIDMap(t, msrc),
-		// FIXME(b/123511468): create the correct io file for threads.
-		"io":        newIO(t, msrc),
+		"auxv":      newAuxvec(t, msrc),
+		"cmdline":   newExecArgInode(t, msrc, cmdlineExecArg),
+		"comm":      newComm(t, msrc),
+		"environ":   newExecArgInode(t, msrc, environExecArg),
+		"exe":       newExe(t, msrc),
+		"fd":        newFdDir(t, msrc),
+		"fdinfo":    newFdInfoDir(t, msrc),
+		"gid_map":   newGIDMap(t, msrc),
+		"io":        newIO(t, msrc, isThreadGroup),
 		"maps":      newMaps(t, msrc),
 		"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
 		"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
 		"ns":        newNamespaceDir(t, msrc),
 		"smaps":     newSmaps(t, msrc),
-		"stat":      newTaskStat(t, msrc, showSubtasks, p.pidns),
+		"stat":      newTaskStat(t, msrc, isThreadGroup, p.pidns),
 		"statm":     newStatm(t, msrc),
 		"status":    newStatus(t, msrc, p.pidns),
 		"uid_map":   newUIDMap(t, msrc),
 	}
-	if showSubtasks {
+	if isThreadGroup {
 		contents["task"] = p.newSubtasks(t, msrc)
 	}
 	if len(p.cgroupControllers) > 0 {
@@ -619,8 +618,11 @@ type ioData struct {
 	ioUsage
 }
 
-func newIO(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
-	return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t)
+func newIO(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
+	if isThreadGroup {
+		return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t)
+	}
+	return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t}), msrc, fs.SpecialFile, t)
 }
 
 // NeedsUpdate returns whether the generation is old or not.
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 512de5ee0..8cf08991b 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -37,6 +37,7 @@
 #include <map>
 #include <memory>
 #include <ostream>
+#include <regex>
 #include <string>
 #include <unordered_set>
 #include <utility>
@@ -51,6 +52,7 @@
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
+#include "absl/synchronization/notification.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/util/capability_util.h"
@@ -1988,6 +1990,44 @@ TEST(Proc, GetdentsEnoent) {
               SyscallFailsWithErrno(ENOENT));
 }
 
+void CheckSyscwFromIOFile(const std::string& path, const std::string& regex) {
+  std::string output;
+  ASSERT_NO_ERRNO(GetContents(path, &output));
+  ASSERT_THAT(output, ContainsRegex(absl::StrCat("syscw:\\s+", regex, "\n")));
+}
+
+// Checks that there is variable accounting of IO between threads/tasks.
+TEST(Proc, PidTidIOAccounting) {
+  absl::Notification notification;
+
+  // Run a thread with a bunch of writes. Check that io account records exactly
+  // the number of write calls. File open/close is there to prevent buffering.
+  ScopedThread writer([&notification] {
+    const int num_writes = 100;
+    for (int i = 0; i < num_writes; i++) {
+      auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+      ASSERT_NO_ERRNO(SetContents(path.path(), "a"));
+    }
+    notification.Notify();
+    const std::string& writer_dir =
+        absl::StrCat("/proc/", getpid(), "/task/", gettid(), "/io");
+
+    CheckSyscwFromIOFile(writer_dir, std::to_string(num_writes));
+  });
+
+  // Run a thread and do no writes. Check that no writes are recorded.
+  ScopedThread noop([&notification] {
+    notification.WaitForNotification();
+    const std::string& noop_dir =
+        absl::StrCat("/proc/", getpid(), "/task/", gettid(), "/io");
+
+    CheckSyscwFromIOFile(noop_dir, "0");
+  });
+
+  writer.Join();
+  noop.Join();
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From f053c528122c246b4a454de54dacfffe0f7964f0 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 5 Dec 2019 13:55:37 -0800
Subject: Reduce flakiness under gotsan runs.

TcpPortReuseMultiThread creates lots of connections which result in
a lot of goroutines in the sentry. This can cause gotsan runs to
take really long and timeout. Increasing listen backlog and
reducing number of connections should help the connections complete
faster as well as reduce the number of goroutines that gotsan needs
to track.

PiperOrigin-RevId: 284046018
---
 test/syscalls/linux/socket_inet_loopback.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 96a1731cf..fa4358ae4 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -635,7 +635,9 @@ INSTANTIATE_TEST_SUITE_P(
 
 using SocketInetReusePortTest = ::testing::TestWithParam<TestParam>;
 
-TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
+// TODO(gvisor.dev/issue/940): Remove _NoRandomSave when portHint/stack.Seed is
+// saved/restored.
+TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
   auto const& param = GetParam();
 
   TestAddress const& listener = param.listener;
@@ -643,6 +645,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
   sockaddr_storage listen_addr = listener.addr;
   sockaddr_storage conn_addr = connector.addr;
   constexpr int kThreadCount = 3;
+  constexpr int kConnectAttempts = 4096;
 
   // Create the listening socket.
   FileDescriptor listener_fds[kThreadCount];
@@ -657,7 +660,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
     ASSERT_THAT(
         bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
         SyscallSucceeds());
-    ASSERT_THAT(listen(fd, 40), SyscallSucceeds());
+    ASSERT_THAT(listen(fd, kConnectAttempts / 3), SyscallSucceeds());
 
     // On the first bind we need to determine which port was bound.
     if (i != 0) {
@@ -676,7 +679,6 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
     ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
   }
 
-  constexpr int kConnectAttempts = 10000;
   std::atomic<int> connects_received = ATOMIC_VAR_INIT(0);
   std::unique_ptr<ScopedThread> listen_thread[kThreadCount];
   int accept_counts[kThreadCount] = {};
-- 
cgit v1.2.3


From 13f0f6069af4d49e236cbee4f0284c190784db37 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Thu, 5 Dec 2019 17:27:15 -0800
Subject: Implement F_GETOWN_EX and F_SETOWN_EX.

Some versions of glibc will convert F_GETOWN fcntl(2) calls into F_GETOWN_EX in
some cases.

PiperOrigin-RevId: 284089373
---
 pkg/abi/linux/fcntl.go                |  41 ++++++---
 pkg/sentry/syscalls/linux/sys_file.go |  70 +++++++++++++--
 test/syscalls/linux/BUILD             |   1 +
 test/syscalls/linux/fcntl.cc          | 162 +++++++++++++++++++++++++++++++++-
 test/syscalls/linux/ioctl.cc          |   3 +-
 5 files changed, 255 insertions(+), 22 deletions(-)

diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go
index f78315ebf..6663a199c 100644
--- a/pkg/abi/linux/fcntl.go
+++ b/pkg/abi/linux/fcntl.go
@@ -16,15 +16,17 @@ package linux
 
 // Commands from linux/fcntl.h.
 const (
-	F_DUPFD         = 0x0
-	F_GETFD         = 0x1
-	F_SETFD         = 0x2
-	F_GETFL         = 0x3
-	F_SETFL         = 0x4
-	F_SETLK         = 0x6
-	F_SETLKW        = 0x7
-	F_SETOWN        = 0x8
-	F_GETOWN        = 0x9
+	F_DUPFD         = 0
+	F_GETFD         = 1
+	F_SETFD         = 2
+	F_GETFL         = 3
+	F_SETFL         = 4
+	F_SETLK         = 6
+	F_SETLKW        = 7
+	F_SETOWN        = 8
+	F_GETOWN        = 9
+	F_SETOWN_EX     = 15
+	F_GETOWN_EX     = 16
 	F_DUPFD_CLOEXEC = 1024 + 6
 	F_SETPIPE_SZ    = 1024 + 7
 	F_GETPIPE_SZ    = 1024 + 8
@@ -32,9 +34,9 @@ const (
 
 // Commands for F_SETLK.
 const (
-	F_RDLCK = 0x0
-	F_WRLCK = 0x1
-	F_UNLCK = 0x2
+	F_RDLCK = 0
+	F_WRLCK = 1
+	F_UNLCK = 2
 )
 
 // Flags for fcntl.
@@ -42,7 +44,7 @@ const (
 	FD_CLOEXEC = 00000001
 )
 
-// Lock structure for F_SETLK.
+// Flock is the lock structure for F_SETLK.
 type Flock struct {
 	Type   int16
 	Whence int16
@@ -52,3 +54,16 @@ type Flock struct {
 	Pid    int32
 	_      [4]byte
 }
+
+// Flags for F_SETOWN_EX and F_GETOWN_EX.
+const (
+	F_OWNER_TID  = 0
+	F_OWNER_PID  = 1
+	F_OWNER_PGRP = 2
+)
+
+// FOwnerEx is the owner structure for F_SETOWN_EX and F_GETOWN_EX.
+type FOwnerEx struct {
+	Type int32
+	PID  int32
+}
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 3b9181002..9bc2445a5 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -840,25 +840,42 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 	return uintptr(newfd), nil, nil
 }
 
-func fGetOwn(t *kernel.Task, file *fs.File) int32 {
+func fGetOwnEx(t *kernel.Task, file *fs.File) linux.FOwnerEx {
 	ma := file.Async(nil)
 	if ma == nil {
-		return 0
+		return linux.FOwnerEx{}
 	}
 	a := ma.(*fasync.FileAsync)
 	ot, otg, opg := a.Owner()
 	switch {
 	case ot != nil:
-		return int32(t.PIDNamespace().IDOfTask(ot))
+		return linux.FOwnerEx{
+			Type: linux.F_OWNER_TID,
+			PID:  int32(t.PIDNamespace().IDOfTask(ot)),
+		}
 	case otg != nil:
-		return int32(t.PIDNamespace().IDOfThreadGroup(otg))
+		return linux.FOwnerEx{
+			Type: linux.F_OWNER_PID,
+			PID:  int32(t.PIDNamespace().IDOfThreadGroup(otg)),
+		}
 	case opg != nil:
-		return int32(-t.PIDNamespace().IDOfProcessGroup(opg))
+		return linux.FOwnerEx{
+			Type: linux.F_OWNER_PGRP,
+			PID:  int32(t.PIDNamespace().IDOfProcessGroup(opg)),
+		}
 	default:
-		return 0
+		return linux.FOwnerEx{}
 	}
 }
 
+func fGetOwn(t *kernel.Task, file *fs.File) int32 {
+	owner := fGetOwnEx(t, file)
+	if owner.Type == linux.F_OWNER_PGRP {
+		return -owner.PID
+	}
+	return owner.PID
+}
+
 // fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux.
 //
 // If who is positive, it represents a PID. If negative, it represents a PGID.
@@ -901,11 +918,13 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		t.FDTable().SetFlags(fd, kernel.FDFlags{
 			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
 		})
+		return 0, nil, nil
 	case linux.F_GETFL:
 		return uintptr(file.Flags().ToLinux()), nil, nil
 	case linux.F_SETFL:
 		flags := uint(args[2].Uint())
 		file.SetFlags(linuxToFlags(flags).Settable())
+		return 0, nil, nil
 	case linux.F_SETLK, linux.F_SETLKW:
 		// In Linux the file system can choose to provide lock operations for an inode.
 		// Normally pipe and socket types lack lock operations. We diverge and use a heavy
@@ -1008,6 +1027,44 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	case linux.F_SETOWN:
 		fSetOwn(t, file, args[2].Int())
 		return 0, nil, nil
+	case linux.F_GETOWN_EX:
+		addr := args[2].Pointer()
+		owner := fGetOwnEx(t, file)
+		_, err := t.CopyOut(addr, &owner)
+		return 0, nil, err
+	case linux.F_SETOWN_EX:
+		addr := args[2].Pointer()
+		var owner linux.FOwnerEx
+		n, err := t.CopyIn(addr, &owner)
+		if err != nil {
+			return 0, nil, err
+		}
+		a := file.Async(fasync.New).(*fasync.FileAsync)
+		switch owner.Type {
+		case linux.F_OWNER_TID:
+			task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID))
+			if task == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			a.SetOwnerTask(t, task)
+			return uintptr(n), nil, nil
+		case linux.F_OWNER_PID:
+			tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID))
+			if tg == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			a.SetOwnerThreadGroup(t, tg)
+			return uintptr(n), nil, nil
+		case linux.F_OWNER_PGRP:
+			pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID))
+			if pg == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			a.SetOwnerProcessGroup(t, pg)
+			return uintptr(n), nil, nil
+		default:
+			return 0, nil, syserror.EINVAL
+		}
 	case linux.F_GET_SEALS:
 		val, err := tmpfs.GetSeals(file.Dirent.Inode)
 		return uintptr(val), nil, err
@@ -1035,7 +1092,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		// Everything else is not yet supported.
 		return 0, nil, syserror.EINVAL
 	}
-	return 0, nil, nil
 }
 
 const (
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 7ce2e6270..61f310db9 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -753,6 +753,7 @@ cc_binary(
         "//test/util:eventfd_util",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
+        "//test/util:save_util",
         "//test/util:temp_path",
         "//test/util:test_util",
         "//test/util:timer_util",
diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 8a45be12a..4f3aa81d6 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -14,6 +14,7 @@
 
 #include <fcntl.h>
 #include <signal.h>
+#include <sys/types.h>
 #include <syscall.h>
 #include <unistd.h>
 
@@ -32,6 +33,7 @@
 #include "test/util/eventfd_util.h"
 #include "test/util/multiprocess_util.h"
 #include "test/util/posix_error.h"
+#include "test/util/save_util.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 #include "test/util/timer_util.h"
@@ -910,8 +912,166 @@ TEST(FcntlTest, GetOwn) {
   FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
       Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
 
-  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN),
+  EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), 0);
+  MaybeSave();
+}
+
+TEST(FcntlTest, GetOwnEx) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &owner),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST(FcntlTest, SetOwnExInvalidType) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = __pid_type(-1);
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(FcntlTest, SetOwnExInvalidTid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_TID;
+  owner.pid = -1;
+
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExInvalidPid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_PID;
+  owner.pid = -1;
+
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExInvalidPgrp) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_PGRP;
+  owner.pid = -1;
+
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExTid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_TID;
+  EXPECT_THAT(owner.pid = syscall(__NR_gettid), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallSucceeds());
+
+  EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), owner.pid);
+  MaybeSave();
+}
+
+TEST(FcntlTest, SetOwnExPid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_PID;
+  EXPECT_THAT(owner.pid = getpid(), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallSucceeds());
+
+  EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), owner.pid);
+  MaybeSave();
+}
+
+TEST(FcntlTest, SetOwnExPgrp) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_PGRP;
+  EXPECT_THAT(owner.pid = getpgrp(), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallSucceeds());
+
+  // NOTE(igudger): I don't understand why, but this is flaky on Linux.
+  // GetOwnExPgrp (below) does not have this issue.
+  SKIP_IF(!IsRunningOnGvisor());
+
+  EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), -owner.pid);
+  MaybeSave();
+}
+
+TEST(FcntlTest, GetOwnExTid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex set_owner = {};
+  set_owner.type = F_OWNER_TID;
+  EXPECT_THAT(set_owner.pid = syscall(__NR_gettid), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+              SyscallSucceeds());
+
+  f_owner_ex got_owner = {};
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(got_owner.type, set_owner.type);
+  EXPECT_EQ(got_owner.pid, set_owner.pid);
+}
+
+TEST(FcntlTest, GetOwnExPid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex set_owner = {};
+  set_owner.type = F_OWNER_PID;
+  EXPECT_THAT(set_owner.pid = getpid(), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+              SyscallSucceeds());
+
+  f_owner_ex got_owner = {};
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(got_owner.type, set_owner.type);
+  EXPECT_EQ(got_owner.pid, set_owner.pid);
+}
+
+TEST(FcntlTest, GetOwnExPgrp) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex set_owner = {};
+  set_owner.type = F_OWNER_PGRP;
+  EXPECT_THAT(set_owner.pid = getpgrp(), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+              SyscallSucceeds());
+
+  f_owner_ex got_owner = {};
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
               SyscallSucceedsWithValue(0));
+  EXPECT_EQ(got_owner.type, set_owner.type);
+  EXPECT_EQ(got_owner.pid, set_owner.pid);
 }
 
 }  // namespace
diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc
index c4f8bff08..b0a07a064 100644
--- a/test/syscalls/linux/ioctl.cc
+++ b/test/syscalls/linux/ioctl.cc
@@ -215,7 +215,8 @@ TEST_F(IoctlTest, FIOASYNCSelfTarget2) {
   auto mask_cleanup =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO));
 
-  pid_t pid = getpid();
+  pid_t pid = -1;
+  EXPECT_THAT(pid = getpid(), SyscallSucceeds());
   EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds());
 
   int set = 1;
-- 
cgit v1.2.3


From 40035d7d9c18d0467075cdaebe3d26d2dbd2720b Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 5 Dec 2019 17:57:07 -0800
Subject: Fix possible race condition destroying container

When the sandbox is destroyed, making URPC calls to destroy the
container will fail. The code was checking if the sandbox was
running before attempting to make the URPC call, but that is racy.

PiperOrigin-RevId: 284093764
---
 runsc/sandbox/sandbox.go | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index ee9327fc8..805233184 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -1004,16 +1004,22 @@ func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
 // DestroyContainer destroys the given container. If it is the root container,
 // then the entire sandbox is destroyed.
 func (s *Sandbox) DestroyContainer(cid string) error {
+	if err := s.destroyContainer(cid); err != nil {
+		// If the sandbox isn't running, the container has already been destroyed,
+		// ignore the error in this case.
+		if s.IsRunning() {
+			return err
+		}
+	}
+	return nil
+}
+
+func (s *Sandbox) destroyContainer(cid string) error {
 	if s.IsRootContainer(cid) {
 		log.Debugf("Destroying root container %q by destroying sandbox", cid)
 		return s.destroy()
 	}
 
-	if !s.IsRunning() {
-		// Sandbox isn't running anymore, container is already destroyed.
-		return nil
-	}
-
 	log.Debugf("Destroying container %q in sandbox %q", cid, s.ID)
 	conn, err := s.sandboxConnect()
 	if err != nil {
-- 
cgit v1.2.3


From f8bb3f79041bf819cdf803c1009a442154692301 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Fri, 6 Dec 2019 08:35:18 -0800
Subject: Document ELF PT_LOAD difference from Linux

PiperOrigin-RevId: 284191345
---
 pkg/sentry/loader/elf.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index c2c3ec06e..6299a3e2f 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -408,6 +408,8 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
 				start = vaddr
 			}
 			if vaddr < end {
+				// NOTE(b/37474556): Linux allows out-of-order
+				// segments, in violation of the spec.
 				ctx.Infof("PT_LOAD headers out-of-order. %#x < %#x", vaddr, end)
 				return loadedELF{}, syserror.ENOEXEC
 			}
-- 
cgit v1.2.3


From b0066217ecd830be1d816d2b4d824f89b278c556 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 6 Dec 2019 12:12:27 -0800
Subject: Add hostinet tests for UDP sockets.

We need to skip a subset of the tests, because of features that hostinet does
not currently support.

Fixes #1209

PiperOrigin-RevId: 284235911
---
 test/syscalls/BUILD                          |  1 +
 test/syscalls/linux/udp_socket_test_cases.cc | 35 ++++++++++++++++++++++++++--
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 722d14b53..6650984fa 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -669,6 +669,7 @@ syscall_test(test = "//test/syscalls/linux:udp_bind_test")
 
 syscall_test(
     size = "medium",
+    add_hostinet = True,
     shard_count = 10,
     test = "//test/syscalls/linux:udp_socket_test",
 )
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 63b92d6a7..4556f16d6 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -656,6 +656,9 @@ TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) {
 }
 
 TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
+  // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
+  SKIP_IF(IsRunningWithHostinet());
+
   // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
@@ -673,6 +676,9 @@ TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
 }
 
 TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
+  // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
+  SKIP_IF(IsRunningWithHostinet());
+
   // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
@@ -878,6 +884,10 @@ TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) {
 }
 
 TEST_P(UdpSocketTest, ReadShutdown) {
+  // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
+  // MSG_DONTWAIT blocks indefinitely.
+  SKIP_IF(IsRunningWithHostinet());
+
   char received[512];
   EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
               SyscallFailsWithErrno(EWOULDBLOCK));
@@ -900,6 +910,10 @@ TEST_P(UdpSocketTest, ReadShutdown) {
 }
 
 TEST_P(UdpSocketTest, ReadShutdownDifferentThread) {
+  // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
+  // MSG_DONTWAIT blocks indefinitely.
+  SKIP_IF(IsRunningWithHostinet());
+
   char received[512];
   EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
               SyscallFailsWithErrno(EWOULDBLOCK));
@@ -1189,6 +1203,10 @@ TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) {
 }
 
 TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
+  // TODO(gvisor.dev/issue/1202): SO_TIMESTAMP socket option not supported by
+  // hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   int v = -1;
   socklen_t optlen = sizeof(v);
   ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, &optlen),
@@ -1198,6 +1216,10 @@ TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
 }
 
 TEST_P(UdpSocketTest, SoTimestamp) {
+  // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
+  // supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1241,6 +1263,9 @@ TEST_P(UdpSocketTest, WriteShutdownNotConnected) {
 }
 
 TEST_P(UdpSocketTest, TimestampIoctl) {
+  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1259,7 +1284,10 @@ TEST_P(UdpSocketTest, TimestampIoctl) {
   ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
 }
 
-TEST_P(UdpSocketTest, TimetstampIoctlNothingRead) {
+TEST_P(UdpSocketTest, TimestampIoctlNothingRead) {
+  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1270,6 +1298,10 @@ TEST_P(UdpSocketTest, TimetstampIoctlNothingRead) {
 // Test that the timestamp accessed via SIOCGSTAMP is still accessible after
 // SO_TIMESTAMP is enabled and used to retrieve a timestamp.
 TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
+  // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
+  // supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1304,7 +1336,6 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
   msg.msg_controllen = sizeof(cmsgbuf);
   ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
   struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
-  cmsg = CMSG_FIRSTHDR(&msg);
   ASSERT_NE(cmsg, nullptr);
 
   // The ioctl should return the exact same values as before.
-- 
cgit v1.2.3


From ea7a100202f01601fba613a76f106a9a45c817c8 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 6 Dec 2019 13:50:12 -0800
Subject: Make annotations OCI compliant

Changed annotation to follow the standard defined here:
https://github.com/opencontainers/image-spec/blob/master/annotations.md

PiperOrigin-RevId: 284254847
---
 runsc/boot/fs.go                        | 23 +++++---
 runsc/boot/fs_test.go                   | 97 ++++++++++++++++-----------------
 runsc/container/multi_container_test.go |  8 +--
 3 files changed, 66 insertions(+), 62 deletions(-)

diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index bc9ffaf81..421ccd255 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -16,7 +16,6 @@ package boot
 
 import (
 	"fmt"
-	"path"
 	"path/filepath"
 	"sort"
 	"strconv"
@@ -52,7 +51,7 @@ const (
 	rootDevice = "9pfs-/"
 
 	// MountPrefix is the annotation prefix for mount hints.
-	MountPrefix = "gvisor.dev/spec/mount"
+	MountPrefix = "dev.gvisor.spec.mount."
 
 	// Filesystems that runsc supports.
 	bind     = "bind"
@@ -490,14 +489,15 @@ type podMountHints struct {
 func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
 	mnts := make(map[string]*mountHint)
 	for k, v := range spec.Annotations {
-		// Look for 'gvisor.dev/spec/mount' annotations and parse them.
+		// Look for 'dev.gvisor.spec.mount' annotations and parse them.
 		if strings.HasPrefix(k, MountPrefix) {
-			parts := strings.Split(k, "/")
-			if len(parts) != 5 {
+			// Remove the prefix and split the rest.
+			parts := strings.Split(k[len(MountPrefix):], ".")
+			if len(parts) != 2 {
 				return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
 			}
-			name := parts[3]
-			if len(name) == 0 || path.Clean(name) != name {
+			name := parts[0]
+			if len(name) == 0 {
 				return nil, fmt.Errorf("invalid mount name: %s", name)
 			}
 			mnt := mnts[name]
@@ -505,7 +505,7 @@ func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
 				mnt = &mountHint{name: name}
 				mnts[name] = mnt
 			}
-			if err := mnt.setField(parts[4], v); err != nil {
+			if err := mnt.setField(parts[1], v); err != nil {
 				return nil, err
 			}
 		}
@@ -575,6 +575,11 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 func (c *containerMounter) processHints(conf *Config) error {
 	ctx := c.k.SupervisorContext()
 	for _, hint := range c.hints.mounts {
+		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+		// common gofer to mount all shared volumes.
+		if hint.mount.Type != tmpfs {
+			continue
+		}
 		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
 		inode, err := c.mountSharedMaster(ctx, conf, hint)
 		if err != nil {
@@ -851,7 +856,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 		return fmt.Errorf("mount %q error: %v", m.Destination, err)
 	}
 
-	log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+	log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts)
 	return nil
 }
 
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 0396a4cfb..912037075 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -15,7 +15,6 @@
 package boot
 
 import (
-	"path"
 	"reflect"
 	"strings"
 	"testing"
@@ -26,19 +25,19 @@ import (
 func TestPodMountHintsHappy(t *testing.T) {
 	spec := &specs.Spec{
 		Annotations: map[string]string{
-			path.Join(MountPrefix, "mount1", "source"): "foo",
-			path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-			path.Join(MountPrefix, "mount1", "share"):  "pod",
+			MountPrefix + "mount1.source": "foo",
+			MountPrefix + "mount1.type":   "tmpfs",
+			MountPrefix + "mount1.share":  "pod",
 
-			path.Join(MountPrefix, "mount2", "source"):  "bar",
-			path.Join(MountPrefix, "mount2", "type"):    "bind",
-			path.Join(MountPrefix, "mount2", "share"):   "container",
-			path.Join(MountPrefix, "mount2", "options"): "rw,private",
+			MountPrefix + "mount2.source":  "bar",
+			MountPrefix + "mount2.type":    "bind",
+			MountPrefix + "mount2.share":   "container",
+			MountPrefix + "mount2.options": "rw,private",
 		},
 	}
 	podHints, err := newPodMountHints(spec)
 	if err != nil {
-		t.Errorf("newPodMountHints failed: %v", err)
+		t.Fatalf("newPodMountHints failed: %v", err)
 	}
 
 	// Check that fields were set correctly.
@@ -86,95 +85,95 @@ func TestPodMountHintsErrors(t *testing.T) {
 		{
 			name: "too short",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1"): "foo",
+				MountPrefix + "mount1": "foo",
 			},
 			error: "invalid mount annotation",
 		},
 		{
 			name: "no name",
 			annotations: map[string]string{
-				MountPrefix + "//source": "foo",
+				MountPrefix + ".source": "foo",
 			},
 			error: "invalid mount name",
 		},
 		{
 			name: "missing source",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "type"):  "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"): "pod",
+				MountPrefix + "mount1.type":  "tmpfs",
+				MountPrefix + "mount1.share": "pod",
 			},
 			error: "source field",
 		},
 		{
 			name: "missing type",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			error: "type field",
 		},
 		{
 			name: "missing share",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
 			},
 			error: "share field",
 		},
 		{
 			name: "invalid field name",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "invalid"): "foo",
+				MountPrefix + "mount1.invalid": "foo",
 			},
 			error: "invalid mount annotation",
 		},
 		{
 			name: "invalid source",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			error: "source cannot be empty",
 		},
 		{
 			name: "invalid type",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "invalid-type",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "invalid-type",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			error: "invalid type",
 		},
 		{
 			name: "invalid share",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):  "invalid-share",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "invalid-share",
 			},
 			error: "invalid share",
 		},
 		{
 			name: "invalid options",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"):  "foo",
-				path.Join(MountPrefix, "mount1", "type"):    "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):   "pod",
-				path.Join(MountPrefix, "mount1", "options"): "invalid-option",
+				MountPrefix + "mount1.source":  "foo",
+				MountPrefix + "mount1.type":    "tmpfs",
+				MountPrefix + "mount1.share":   "pod",
+				MountPrefix + "mount1.options": "invalid-option",
 			},
 			error: "unknown mount option",
 		},
 		{
 			name: "duplicate source",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "pod",
 
-				path.Join(MountPrefix, "mount2", "source"): "foo",
-				path.Join(MountPrefix, "mount2", "type"):   "bind",
-				path.Join(MountPrefix, "mount2", "share"):  "container",
+				MountPrefix + "mount2.source": "foo",
+				MountPrefix + "mount2.type":   "bind",
+				MountPrefix + "mount2.share":  "container",
 			},
 			error: "have the same mount source",
 		},
@@ -202,36 +201,36 @@ func TestGetMountAccessType(t *testing.T) {
 		{
 			name: "container=exclusive",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): source,
-				path.Join(MountPrefix, "mount1", "type"):   "bind",
-				path.Join(MountPrefix, "mount1", "share"):  "container",
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "container",
 			},
 			want: FileAccessExclusive,
 		},
 		{
 			name: "pod=shared",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): source,
-				path.Join(MountPrefix, "mount1", "type"):   "bind",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			want: FileAccessShared,
 		},
 		{
 			name: "shared=shared",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): source,
-				path.Join(MountPrefix, "mount1", "type"):   "bind",
-				path.Join(MountPrefix, "mount1", "share"):  "shared",
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "shared",
 			},
 			want: FileAccessShared,
 		},
 		{
 			name: "default=shared",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): source + "mismatch",
-				path.Join(MountPrefix, "mount1", "type"):   "bind",
-				path.Join(MountPrefix, "mount1", "share"):  "container",
+				MountPrefix + "mount1.source": source + "mismatch",
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "container",
 			},
 			want: FileAccessShared,
 		},
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index a5a62378c..de2fd3cf2 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -123,11 +123,11 @@ func execMany(execs []execDesc) error {
 
 func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
 	for _, spec := range pod {
-		spec.Annotations[path.Join(boot.MountPrefix, name, "source")] = mount.Source
-		spec.Annotations[path.Join(boot.MountPrefix, name, "type")] = mount.Type
-		spec.Annotations[path.Join(boot.MountPrefix, name, "share")] = "pod"
+		spec.Annotations[boot.MountPrefix+name+".source"] = mount.Source
+		spec.Annotations[boot.MountPrefix+name+".type"] = mount.Type
+		spec.Annotations[boot.MountPrefix+name+".share"] = "pod"
 		if len(mount.Options) > 0 {
-			spec.Annotations[path.Join(boot.MountPrefix, name, "options")] = strings.Join(mount.Options, ",")
+			spec.Annotations[boot.MountPrefix+name+".options"] = strings.Join(mount.Options, ",")
 		}
 	}
 }
-- 
cgit v1.2.3


From 663fe840f79ac3d8e2ce1a1f1409d84cf2a9d37e Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 6 Dec 2019 14:32:53 -0800
Subject: Implement TTY field in control.Processes().

Threadgroups already know their TTY (if they have one), which now contains the
TTY Index, and is returned in the Processes() call.

PiperOrigin-RevId: 284263850
---
 pkg/sentry/control/proc.go           | 26 +++++++---
 pkg/sentry/control/proc_test.go      | 10 ++--
 pkg/sentry/fs/tty/terminal.go        |  4 +-
 pkg/sentry/kernel/tty.go             | 11 +++++
 runsc/container/container_test.go    | 95 +++++++++++++++++++++++++++++++++++-
 runsc/container/test_app/BUILD       |  1 +
 runsc/container/test_app/test_app.go | 40 +++++++++++++++
 7 files changed, 173 insertions(+), 14 deletions(-)

diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index c35faeb4c..a6f90b2bb 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -268,7 +268,6 @@ func (proc *Proc) Ps(args *PsArgs, out *string) error {
 }
 
 // Process contains information about a single process in a Sandbox.
-// TODO(b/117881927): Implement TTY field.
 type Process struct {
 	UID auth.KUID       `json:"uid"`
 	PID kernel.ThreadID `json:"pid"`
@@ -276,6 +275,9 @@ type Process struct {
 	PPID kernel.ThreadID `json:"ppid"`
 	// Processor utilization
 	C int32 `json:"c"`
+	// TTY name of the process. Will be of the form "pts/N" if there is a
+	// TTY, or "?" if there is not.
+	TTY string `json:"tty"`
 	// Start time
 	STime string `json:"stime"`
 	// CPU time
@@ -285,18 +287,19 @@ type Process struct {
 }
 
 // ProcessListToTable prints a table with the following format:
-// UID       PID       PPID      C         STIME     TIME       CMD
-// 0         1         0         0         14:04     505262ns   tail
+// UID       PID       PPID      C         TTY		STIME     TIME       CMD
+// 0         1         0         0         pty/4	14:04     505262ns   tail
 func ProcessListToTable(pl []*Process) string {
 	var buf bytes.Buffer
 	tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0)
-	fmt.Fprint(tw, "UID\tPID\tPPID\tC\tSTIME\tTIME\tCMD")
+	fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD")
 	for _, d := range pl {
-		fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s",
+		fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s",
 			d.UID,
 			d.PID,
 			d.PPID,
 			d.C,
+			d.TTY,
 			d.STime,
 			d.Time,
 			d.Cmd)
@@ -347,7 +350,7 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error {
 		if p := tg.Leader().Parent(); p != nil {
 			ppid = p.PIDNamespace().IDOfThreadGroup(p.ThreadGroup())
 		}
-		*out = append(*out, &Process{
+		p := Process{
 			UID:   tg.Leader().Credentials().EffectiveKUID,
 			PID:   pid,
 			PPID:  ppid,
@@ -355,7 +358,9 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error {
 			C:     percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now),
 			Time:  tg.CPUStats().SysTime.String(),
 			Cmd:   tg.Leader().Name(),
-		})
+			TTY:   ttyName(tg.TTY()),
+		}
+		*out = append(*out, &p)
 	}
 	sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID })
 	return nil
@@ -395,3 +400,10 @@ func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 {
 	}
 	return int32(percentCPU)
 }
+
+func ttyName(tty *kernel.TTY) string {
+	if tty == nil {
+		return "?"
+	}
+	return fmt.Sprintf("pts/%d", tty.Index)
+}
diff --git a/pkg/sentry/control/proc_test.go b/pkg/sentry/control/proc_test.go
index d8ada2694..0a88459b2 100644
--- a/pkg/sentry/control/proc_test.go
+++ b/pkg/sentry/control/proc_test.go
@@ -34,7 +34,7 @@ func TestProcessListTable(t *testing.T) {
 	}{
 		{
 			pl:       []*Process{},
-			expected: "UID       PID       PPID      C         STIME     TIME      CMD",
+			expected: "UID       PID       PPID      C         TTY       STIME     TIME      CMD",
 		},
 		{
 			pl: []*Process{
@@ -43,6 +43,7 @@ func TestProcessListTable(t *testing.T) {
 					PID:   0,
 					PPID:  0,
 					C:     0,
+					TTY:   "?",
 					STime: "0",
 					Time:  "0",
 					Cmd:   "zero",
@@ -52,14 +53,15 @@ func TestProcessListTable(t *testing.T) {
 					PID:   1,
 					PPID:  1,
 					C:     1,
+					TTY:   "pts/4",
 					STime: "1",
 					Time:  "1",
 					Cmd:   "one",
 				},
 			},
-			expected: `UID       PID       PPID      C         STIME     TIME      CMD
-0         0         0         0         0         0         zero
-1         1         1         1         1         1         one`,
+			expected: `UID       PID       PPID      C         TTY       STIME     TIME      CMD
+0         0         0         0         ?         0         0         zero
+1         1         1         1         pts/4     1         1         one`,
 		},
 	}
 
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index ff8138820..917f90cc0 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -53,8 +53,8 @@ func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal
 		d:          d,
 		n:          n,
 		ld:         newLineDiscipline(termios),
-		masterKTTY: &kernel.TTY{},
-		slaveKTTY:  &kernel.TTY{},
+		masterKTTY: &kernel.TTY{Index: n},
+		slaveKTTY:  &kernel.TTY{Index: n},
 	}
 	t.EnableLeakCheck("tty.Terminal")
 	return &t
diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go
index 34f84487a..048de26dc 100644
--- a/pkg/sentry/kernel/tty.go
+++ b/pkg/sentry/kernel/tty.go
@@ -21,8 +21,19 @@ import "sync"
 //
 // +stateify savable
 type TTY struct {
+	// Index is the terminal index. It is immutable.
+	Index uint32
+
 	mu sync.Mutex `state:"nosave"`
 
 	// tg is protected by mu.
 	tg *ThreadGroup
 }
+
+// TTY returns the thread group's controlling terminal. If nil, there is no
+// controlling terminal.
+func (tg *ThreadGroup) TTY() *TTY {
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+	return tg.tty
+}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 07eacaac0..1d06f2780 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -98,10 +98,14 @@ func procListsEqual(got, want []*control.Process) bool {
 	for i := range got {
 		pd1 := got[i]
 		pd2 := want[i]
-		// Zero out unimplemented and timing dependant fields.
+		// Zero out timing dependant fields.
 		pd1.Time = ""
 		pd1.STime = ""
 		pd1.C = 0
+		// Ignore TTY field too, since it's not relevant in the cases
+		// where we use this method. Tests that care about the TTY
+		// field should check for it themselves.
+		pd1.TTY = ""
 		if *pd1 != *pd2 {
 			return false
 		}
@@ -2112,6 +2116,95 @@ func TestOverlayfsStaleRead(t *testing.T) {
 	}
 }
 
+// TestTTYField checks TTY field returned by container.Processes().
+func TestTTYField(t *testing.T) {
+	stop := testutil.StartReaper()
+	defer stop()
+
+	testApp, err := testutil.FindFile("runsc/container/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	testCases := []struct {
+		name         string
+		useTTY       bool
+		wantTTYField string
+	}{
+		{
+			name:         "no tty",
+			useTTY:       false,
+			wantTTYField: "?",
+		},
+		{
+			name:         "tty used",
+			useTTY:       true,
+			wantTTYField: "pts/0",
+		},
+	}
+
+	for _, test := range testCases {
+		t.Run(test.name, func(t *testing.T) {
+			conf := testutil.TestConfig()
+
+			// We will run /bin/sleep, possibly with an open TTY.
+			cmd := []string{"/bin/sleep", "10000"}
+			if test.useTTY {
+				// Run inside the "pty-runner".
+				cmd = append([]string{testApp, "pty-runner"}, cmd...)
+			}
+
+			spec := testutil.NewSpecWithArgs(cmd...)
+			rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer os.RemoveAll(rootDir)
+			defer os.RemoveAll(bundleDir)
+
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.UniqueContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Wait for sleep to be running, and check the TTY
+			// field.
+			var gotTTYField string
+			cb := func() error {
+				ps, err := c.Processes()
+				if err != nil {
+					err = fmt.Errorf("error getting process data from container: %v", err)
+					return &backoff.PermanentError{Err: err}
+				}
+				for _, p := range ps {
+					if strings.Contains(p.Cmd, "sleep") {
+						gotTTYField = p.TTY
+						return nil
+					}
+				}
+				return fmt.Errorf("sleep not running")
+			}
+			if err := testutil.Poll(cb, 30*time.Second); err != nil {
+				t.Fatalf("error waiting for sleep process: %v", err)
+			}
+
+			if gotTTYField != test.wantTTYField {
+				t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField)
+			}
+		})
+	}
+}
+
 // executeSync synchronously executes a new process.
 func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
 	pid, err := cont.Execute(args)
diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD
index 9bf9e6e9d..bfd338bb6 100644
--- a/runsc/container/test_app/BUILD
+++ b/runsc/container/test_app/BUILD
@@ -15,5 +15,6 @@ go_binary(
         "//pkg/unet",
         "//runsc/testutil",
         "@com_github_google_subcommands//:go_default_library",
+        "@com_github_kr_pty//:go_default_library",
     ],
 )
diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go
index 913d781c6..a1c8a741a 100644
--- a/runsc/container/test_app/test_app.go
+++ b/runsc/container/test_app/test_app.go
@@ -19,6 +19,7 @@ package main
 import (
 	"context"
 	"fmt"
+	"io"
 	"io/ioutil"
 	"log"
 	"net"
@@ -31,6 +32,7 @@ import (
 
 	"flag"
 	"github.com/google/subcommands"
+	"github.com/kr/pty"
 	"gvisor.dev/gvisor/runsc/testutil"
 )
 
@@ -41,6 +43,7 @@ func main() {
 	subcommands.Register(new(fdReceiver), "")
 	subcommands.Register(new(fdSender), "")
 	subcommands.Register(new(forkBomb), "")
+	subcommands.Register(new(ptyRunner), "")
 	subcommands.Register(new(reaper), "")
 	subcommands.Register(new(syscall), "")
 	subcommands.Register(new(taskTree), "")
@@ -352,3 +355,40 @@ func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...inter
 
 	return subcommands.ExitSuccess
 }
+
+type ptyRunner struct{}
+
+// Name implements subcommands.Command.
+func (*ptyRunner) Name() string {
+	return "pty-runner"
+}
+
+// Synopsis implements subcommands.Command.
+func (*ptyRunner) Synopsis() string {
+	return "runs the given command with an open pty terminal"
+}
+
+// Usage implements subcommands.Command.
+func (*ptyRunner) Usage() string {
+	return "pty-runner [command]"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (*ptyRunner) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.
+func (*ptyRunner) Execute(_ context.Context, fs *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus {
+	c := exec.Command(fs.Args()[0], fs.Args()[1:]...)
+	f, err := pty.Start(c)
+	if err != nil {
+		fmt.Printf("pty.Start failed: %v", err)
+		return subcommands.ExitFailure
+	}
+	defer f.Close()
+
+	// Copy stdout from the command to keep this process alive until the
+	// subprocess exits.
+	io.Copy(os.Stdout, f)
+
+	return subcommands.ExitSuccess
+}
-- 
cgit v1.2.3


From ab3f7bc39392aaa7e7961ae6d82d94f2cae18adb Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 6 Dec 2019 14:40:10 -0800
Subject: Do IPv6 Stateless Address Auto-Configuration (SLAAC)

This change allows the netstack to do SLAAC as outlined by RFC 4862 section 5.5.

Note, this change will not break existing uses of netstack as the default
configuration for the stack options is set in such a way that SLAAC
will not be performed. See `stack.Options` and `stack.NDPConfigurations` for
more details.

This change reuses 1 option and introduces a new one that is required to take
advantage of SLAAC, all available under NDPConfigurations:
- HandleRAs: Whether or not NDP RAs are processes
- AutoGenGlobalAddresses: Whether or not SLAAC is performed.

Also note, this change does not deprecate SLAAC generated addresses after the
preferred lifetime. That will come in a later change (b/143713887). Currently,
only the valid lifetime is honoured.

Tests: Unittest to make sure that SLAAC generates and adds addresses only when
configured to do so. Tests also makes sure that conflicts with static addresses
do not modify the static address.
PiperOrigin-RevId: 284265317
---
 pkg/tcpip/header/ipv6.go        |  49 +++-
 pkg/tcpip/header/ndp_options.go |   9 +-
 pkg/tcpip/stack/ndp.go          | 509 ++++++++++++++++++++++++++++-----
 pkg/tcpip/stack/ndp_test.go     | 616 +++++++++++++++++++++++++++++++++++++---
 pkg/tcpip/stack/nic.go          |  74 +++--
 5 files changed, 1109 insertions(+), 148 deletions(-)

diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index 0caa51c1e..5275b34d4 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -90,6 +90,18 @@ const (
 	// IPv6Any is the non-routable IPv6 "any" meta address. It is also
 	// known as the unspecified address.
 	IPv6Any tcpip.Address = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+
+	// IIDSize is the size of an interface identifier (IID), in bytes, as
+	// defined by RFC 4291 section 2.5.1.
+	IIDSize = 8
+
+	// IIDOffsetInIPv6Address is the offset, in bytes, from the start
+	// of an IPv6 address to the beginning of the interface identifier
+	// (IID) for auto-generated addresses. That is, all bytes before
+	// the IIDOffsetInIPv6Address-th byte are the prefix bytes, and all
+	// bytes including and after the IIDOffsetInIPv6Address-th byte are
+	// for the IID.
+	IIDOffsetInIPv6Address = 8
 )
 
 // IPv6EmptySubnet is the empty IPv6 subnet. It may also be known as the
@@ -266,6 +278,28 @@ func SolicitedNodeAddr(addr tcpip.Address) tcpip.Address {
 	return solicitedNodeMulticastPrefix + addr[len(addr)-3:]
 }
 
+// EthernetAdddressToEUI64IntoBuf populates buf with a EUI-64 from a 48-bit
+// Ethernet/MAC address.
+//
+// buf MUST be at least 8 bytes.
+func EthernetAdddressToEUI64IntoBuf(linkAddr tcpip.LinkAddress, buf []byte) {
+	buf[0] = linkAddr[0] ^ 2
+	buf[1] = linkAddr[1]
+	buf[2] = linkAddr[2]
+	buf[3] = 0xFE
+	buf[4] = 0xFE
+	buf[5] = linkAddr[3]
+	buf[6] = linkAddr[4]
+	buf[7] = linkAddr[5]
+}
+
+// EthernetAddressToEUI64 computes an EUI-64 from a 48-bit Ethernet/MAC address.
+func EthernetAddressToEUI64(linkAddr tcpip.LinkAddress) [IIDSize]byte {
+	var buf [IIDSize]byte
+	EthernetAdddressToEUI64IntoBuf(linkAddr, buf[:])
+	return buf
+}
+
 // LinkLocalAddr computes the default IPv6 link-local address from a link-layer
 // (MAC) address.
 func LinkLocalAddr(linkAddr tcpip.LinkAddress) tcpip.Address {
@@ -275,18 +309,11 @@ func LinkLocalAddr(linkAddr tcpip.LinkAddress) tcpip.Address {
 	// The conversion is very nearly:
 	//	aa:bb:cc:dd:ee:ff => FE80::Aabb:ccFF:FEdd:eeff
 	// Note the capital A. The conversion aa->Aa involves a bit flip.
-	lladdrb := [16]byte{
-		0:  0xFE,
-		1:  0x80,
-		8:  linkAddr[0] ^ 2,
-		9:  linkAddr[1],
-		10: linkAddr[2],
-		11: 0xFF,
-		12: 0xFE,
-		13: linkAddr[3],
-		14: linkAddr[4],
-		15: linkAddr[5],
+	lladdrb := [IPv6AddressSize]byte{
+		0: 0xFE,
+		1: 0x80,
 	}
+	EthernetAdddressToEUI64IntoBuf(linkAddr, lladdrb[IIDOffsetInIPv6Address:])
 	return tcpip.Address(lladdrb[:])
 }
 
diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
index 2652e7b67..06e0bace2 100644
--- a/pkg/tcpip/header/ndp_options.go
+++ b/pkg/tcpip/header/ndp_options.go
@@ -17,6 +17,7 @@ package header
 import (
 	"encoding/binary"
 	"errors"
+	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -109,13 +110,13 @@ const (
 )
 
 var (
-	// NDPInfiniteLifetime is a value that represents
-	// infinity for the Valid and Preferred Lifetime fields in a NDP Prefix
-	// Information option. Its value is (2^32 - 1)s = 4294967295s
+	// NDPInfiniteLifetime is a value that represents infinity for the
+	// 4-byte lifetime fields found in various NDP options. Its value is
+	// (2^32 - 1)s = 4294967295s.
 	//
 	// This is a variable instead of a constant so that tests can change
 	// this value to a smaller value. It should only be modified by tests.
-	NDPInfiniteLifetime = time.Second * 4294967295
+	NDPInfiniteLifetime = time.Second * math.MaxUint32
 )
 
 // NDPOptionIterator is an iterator of NDPOption.
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 1d202deb5..75d3ecdac 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -58,6 +58,14 @@ const (
 	// Default = true.
 	defaultDiscoverOnLinkPrefixes = true
 
+	// defaultAutoGenGlobalAddresses is the default configuration for
+	// whether or not to generate global IPv6 addresses in response to
+	// receiving a new Prefix Information option with its Autonomous
+	// Address AutoConfiguration flag set, as a host.
+	//
+	// Default = true.
+	defaultAutoGenGlobalAddresses = true
+
 	// minimumRetransmitTimer is the minimum amount of time to wait between
 	// sending NDP Neighbor solicitation messages. Note, RFC 4861 does
 	// not impose a minimum Retransmit Timer, but we do here to make sure
@@ -87,6 +95,24 @@ const (
 	//
 	// Max = 10.
 	MaxDiscoveredOnLinkPrefixes = 10
+
+	// validPrefixLenForAutoGen is the expected prefix length that an
+	// address can be generated for. Must be 64 bits as the interface
+	// identifier (IID) is 64 bits and an IPv6 address is 128 bits, so
+	// 128 - 64 = 64.
+	validPrefixLenForAutoGen = 64
+)
+
+var (
+	// MinPrefixInformationValidLifetimeForUpdate is the minimum Valid
+	// Lifetime to update the valid lifetime of a generated address by
+	// SLAAC.
+	//
+	// This is exported as a variable (instead of a constant) so tests
+	// can update it to a smaller value.
+	//
+	// Min = 2hrs.
+	MinPrefixInformationValidLifetimeForUpdate = 2 * time.Hour
 )
 
 // NDPDispatcher is the interface integrators of netstack must implement to
@@ -139,6 +165,22 @@ type NDPDispatcher interface {
 	// This function is not permitted to block indefinitely. This function
 	// is also not permitted to call into the stack.
 	OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) []tcpip.Route
+
+	// OnAutoGenAddress will be called when a new prefix with its
+	// autonomous address-configuration flag set has been received and SLAAC
+	// has been performed. Implementations may prevent the stack from
+	// assigning the address to the NIC by returning false.
+	//
+	// This function is not permitted to block indefinitely. It must not
+	// call functions on the stack itself.
+	OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool
+
+	// OnAutoGenAddressInvalidated will be called when an auto-generated
+	// address (as part of SLAAC) has been invalidated.
+	//
+	// This function is not permitted to block indefinitely. It must not
+	// call functions on the stack itself.
+	OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix)
 }
 
 // NDPConfigurations is the NDP configurations for the netstack.
@@ -168,6 +210,17 @@ type NDPConfigurations struct {
 	// will be discovered from Router Advertisements' Prefix Information
 	// option. This configuration is ignored if HandleRAs is false.
 	DiscoverOnLinkPrefixes bool
+
+	// AutoGenGlobalAddresses determines whether or not global IPv6
+	// addresses will be generated for a NIC in response to receiving a new
+	// Prefix Information option with its Autonomous Address
+	// AutoConfiguration flag set, as a host, as per RFC 4862 (SLAAC).
+	//
+	// Note, if an address was already generated for some unique prefix, as
+	// part of SLAAC, this option does not affect whether or not the
+	// lifetime(s) of the generated address changes; this option only
+	// affects the generation of new addresses as part of SLAAC.
+	AutoGenGlobalAddresses bool
 }
 
 // DefaultNDPConfigurations returns an NDPConfigurations populated with
@@ -179,6 +232,7 @@ func DefaultNDPConfigurations() NDPConfigurations {
 		HandleRAs:              defaultHandleRAs,
 		DiscoverDefaultRouters: defaultDiscoverDefaultRouters,
 		DiscoverOnLinkPrefixes: defaultDiscoverOnLinkPrefixes,
+		AutoGenGlobalAddresses: defaultAutoGenGlobalAddresses,
 	}
 }
 
@@ -210,6 +264,9 @@ type ndpState struct {
 	// The on-link prefixes discovered through Router Advertisements' Prefix
 	// Information option.
 	onLinkPrefixes map[tcpip.Subnet]onLinkPrefixState
+
+	// The addresses generated by SLAAC.
+	autoGenAddresses map[tcpip.Address]autoGenAddressState
 }
 
 // dadState holds the Duplicate Address Detection timer and channel to signal
@@ -270,6 +327,32 @@ type onLinkPrefixState struct {
 	doNotInvalidate *bool
 }
 
+// autoGenAddressState holds data associated with an address generated via
+// SLAAC.
+type autoGenAddressState struct {
+	invalidationTimer *time.Timer
+
+	// Used to signal the timer not to invalidate the SLAAC address (A) in
+	// a race condition (T1 is a goroutine that handles a PI for A and T2
+	// is the goroutine that handles A's invalidation timer firing):
+	//   T1: Receive a new PI for A
+	//   T1: Obtain the NIC's lock before processing the PI
+	//   T2: A's invalidation timer fires, and gets blocked on obtaining the
+	//       NIC's lock
+	//   T1: Refreshes/extends A's lifetime & releases NIC's lock
+	//   T2: Obtains NIC's lock & invalidates A immediately
+	//
+	// To resolve this, T1 will check to see if the timer already fired, and
+	// inform the timer using doNotInvalidate to not invalidate A, so that
+	// once T2 obtains the lock, it will see that it is set to true and do
+	// nothing further.
+	doNotInvalidate *bool
+
+	// Nonzero only when the address is not valid forever (invalidationTimer
+	// is not nil).
+	validUntil time.Time
+}
+
 // startDuplicateAddressDetection performs Duplicate Address Detection.
 //
 // This function must only be called by IPv6 addresses that are currently
@@ -536,17 +619,14 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 	for opt, done, _ := it.Next(); !done; opt, done, _ = it.Next() {
 		switch opt.Type() {
 		case header.NDPPrefixInformationType:
-			if !ndp.configs.DiscoverOnLinkPrefixes {
-				continue
-			}
-
 			pi := opt.(header.NDPPrefixInformation)
 
 			prefix := pi.Subnet()
 
 			// Is the prefix a link-local?
 			if header.IsV6LinkLocalAddress(prefix.ID()) {
-				// ...Yes, skip as per RFC 4861 section 6.3.4.
+				// ...Yes, skip as per RFC 4861 section 6.3.4,
+				// and RFC 4862 section 5.5.3.b (for SLAAC).
 				continue
 			}
 
@@ -557,82 +637,13 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 				continue
 			}
 
-			if !pi.OnLinkFlag() {
-				// Not on-link so don't "discover" it as an
-				// on-link prefix.
-				continue
-			}
-
-			prefixState, ok := ndp.onLinkPrefixes[prefix]
-			vl := pi.ValidLifetime()
-			switch {
-			case !ok && vl == 0:
-				// Don't know about this prefix but has a zero
-				// valid lifetime, so just ignore.
-				continue
-
-			case !ok && vl != 0:
-				// This is a new on-link prefix we are
-				// discovering.
-				//
-				// Only remember it if we currently know about
-				// less than MaxDiscoveredOnLinkPrefixes on-link
-				// prefixes.
-				if len(ndp.onLinkPrefixes) < MaxDiscoveredOnLinkPrefixes {
-					ndp.rememberOnLinkPrefix(prefix, vl)
-				}
-				continue
-
-			case ok && vl == 0:
-				// We know about the on-link prefix, but it is
-				// no longer to be considered on-link, so
-				// invalidate it.
-				ndp.invalidateOnLinkPrefix(prefix)
-				continue
-			}
-
-			// This is an already discovered on-link prefix with a
-			// new non-zero valid lifetime.
-			// Update the invalidation timer.
-			timer := prefixState.invalidationTimer
-
-			if timer == nil && vl >= header.NDPInfiniteLifetime {
-				// Had infinite valid lifetime before and
-				// continues to have an invalid lifetime. Do
-				// nothing further.
-				continue
-			}
-
-			if timer != nil && !timer.Stop() {
-				// If we reach this point, then we know the
-				// timer already fired after we took the NIC
-				// lock. Inform the timer to not invalidate
-				// the prefix once it obtains the lock as we
-				// just got a new PI that refeshes its lifetime
-				// to a non-zero value. See
-				// onLinkPrefixState.doNotInvalidate for more
-				// details.
-				*prefixState.doNotInvalidate = true
+			if pi.OnLinkFlag() {
+				ndp.handleOnLinkPrefixInformation(pi)
 			}
 
-			if vl >= header.NDPInfiniteLifetime {
-				// Prefix is now valid forever so we don't need
-				// an invalidation timer.
-				prefixState.invalidationTimer = nil
-				ndp.onLinkPrefixes[prefix] = prefixState
-				continue
+			if pi.AutonomousAddressConfigurationFlag() {
+				ndp.handleAutonomousPrefixInformation(pi)
 			}
-
-			if timer != nil {
-				// We already have a timer so just reset it to
-				// expire after the new valid lifetime.
-				timer.Reset(vl)
-				continue
-			}
-
-			// We do not have a timer so just create a new one.
-			prefixState.invalidationTimer = ndp.prefixInvalidationCallback(prefix, vl, prefixState.doNotInvalidate)
-			ndp.onLinkPrefixes[prefix] = prefixState
 		}
 
 		// TODO(b/141556115): Do (MTU) Parameter Discovery.
@@ -795,3 +806,345 @@ func (ndp *ndpState) prefixInvalidationCallback(prefix tcpip.Subnet, vl time.Dur
 		ndp.invalidateOnLinkPrefix(prefix)
 	})
 }
+
+// handleOnLinkPrefixInformation handles a Prefix Information option with
+// its on-link flag set, as per RFC 4861 section 6.3.4.
+//
+// handleOnLinkPrefixInformation assumes that the prefix this pi is for is
+// not the link-local prefix and the on-link flag is set.
+//
+// The NIC that ndp belongs to and its associated stack MUST be locked.
+func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformation) {
+	prefix := pi.Subnet()
+	prefixState, ok := ndp.onLinkPrefixes[prefix]
+	vl := pi.ValidLifetime()
+
+	if !ok && vl == 0 {
+		// Don't know about this prefix but it has a zero valid
+		// lifetime, so just ignore.
+		return
+	}
+
+	if !ok && vl != 0 {
+		// This is a new on-link prefix we are discovering
+		//
+		// Only remember it if we currently know about less than
+		// MaxDiscoveredOnLinkPrefixes on-link prefixes.
+		if ndp.configs.DiscoverOnLinkPrefixes && len(ndp.onLinkPrefixes) < MaxDiscoveredOnLinkPrefixes {
+			ndp.rememberOnLinkPrefix(prefix, vl)
+		}
+		return
+	}
+
+	if ok && vl == 0 {
+		// We know about the on-link prefix, but it is
+		// no longer to be considered on-link, so
+		// invalidate it.
+		ndp.invalidateOnLinkPrefix(prefix)
+		return
+	}
+
+	// This is an already discovered on-link prefix with a
+	// new non-zero valid lifetime.
+	// Update the invalidation timer.
+	timer := prefixState.invalidationTimer
+
+	if timer == nil && vl >= header.NDPInfiniteLifetime {
+		// Had infinite valid lifetime before and
+		// continues to have an invalid lifetime. Do
+		// nothing further.
+		return
+	}
+
+	if timer != nil && !timer.Stop() {
+		// If we reach this point, then we know the timer alread fired
+		// after we took the NIC lock. Inform the timer to not
+		// invalidate the prefix once it obtains the lock as we just
+		// got a new PI that refreshes its lifetime to a non-zero value.
+		// See onLinkPrefixState.doNotInvalidate for more details.
+		*prefixState.doNotInvalidate = true
+	}
+
+	if vl >= header.NDPInfiniteLifetime {
+		// Prefix is now valid forever so we don't need
+		// an invalidation timer.
+		prefixState.invalidationTimer = nil
+		ndp.onLinkPrefixes[prefix] = prefixState
+		return
+	}
+
+	if timer != nil {
+		// We already have a timer so just reset it to
+		// expire after the new valid lifetime.
+		timer.Reset(vl)
+		return
+	}
+
+	// We do not have a timer so just create a new one.
+	prefixState.invalidationTimer = ndp.prefixInvalidationCallback(prefix, vl, prefixState.doNotInvalidate)
+	ndp.onLinkPrefixes[prefix] = prefixState
+}
+
+// handleAutonomousPrefixInformation handles a Prefix Information option with
+// its autonomous flag set, as per RFC 4862 section 5.5.3.
+//
+// handleAutonomousPrefixInformation assumes that the prefix this pi is for is
+// not the link-local prefix and the autonomous flag is set.
+//
+// The NIC that ndp belongs to and its associated stack MUST be locked.
+func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInformation) {
+	vl := pi.ValidLifetime()
+	pl := pi.PreferredLifetime()
+
+	// If the preferred lifetime is greater than the valid lifetime,
+	// silently ignore the Prefix Information option, as per RFC 4862
+	// section 5.5.3.c.
+	if pl > vl {
+		return
+	}
+
+	prefix := pi.Subnet()
+
+	// Check if we already have an auto-generated address for prefix.
+	for _, ref := range ndp.nic.endpoints {
+		if ref.protocol != header.IPv6ProtocolNumber {
+			continue
+		}
+
+		if ref.configType != slaac {
+			continue
+		}
+
+		addr := ref.ep.ID().LocalAddress
+		refAddrWithPrefix := tcpip.AddressWithPrefix{Address: addr, PrefixLen: ref.ep.PrefixLen()}
+		if refAddrWithPrefix.Subnet() != prefix {
+			continue
+		}
+
+		//
+		// At this point, we know we are refreshing a SLAAC generated
+		// IPv6 address with the prefix, prefix. Do the work as outlined
+		// by RFC 4862 section 5.5.3.e.
+		//
+
+		addrState, ok := ndp.autoGenAddresses[addr]
+		if !ok {
+			panic(fmt.Sprintf("must have an autoGenAddressess entry for the SLAAC generated IPv6 address %s", addr))
+		}
+
+		// TODO(b/143713887): Handle deprecating auto-generated address
+		//                    after the preferred lifetime.
+
+		// As per RFC 4862 section 5.5.3.e, the valid lifetime of the
+		// address generated by SLAAC is as follows:
+		//
+		// 1) If the received Valid Lifetime is greater than 2 hours or
+		//    greater than RemainingLifetime, set the valid lifetime of
+		//    the address to the advertised Valid Lifetime.
+		//
+		// 2) If RemainingLifetime is less than or equal to 2 hours,
+		//    ignore the advertised Valid Lifetime.
+		//
+		// 3) Otherwise, reset the valid lifetime of the address to 2
+		//    hours.
+
+		// Handle the infinite valid lifetime separately as we do not
+		// keep a timer in this case.
+		if vl >= header.NDPInfiniteLifetime {
+			if addrState.invalidationTimer != nil {
+				// Valid lifetime was finite before, but now it
+				// is valid forever.
+				if !addrState.invalidationTimer.Stop() {
+					*addrState.doNotInvalidate = true
+				}
+				addrState.invalidationTimer = nil
+				addrState.validUntil = time.Time{}
+				ndp.autoGenAddresses[addr] = addrState
+			}
+
+			return
+		}
+
+		var effectiveVl time.Duration
+		var rl time.Duration
+
+		// If the address was originally set to be valid forever,
+		// assume the remaining time to be the maximum possible value.
+		if addrState.invalidationTimer == nil {
+			rl = header.NDPInfiniteLifetime
+		} else {
+			rl = time.Until(addrState.validUntil)
+		}
+
+		if vl > MinPrefixInformationValidLifetimeForUpdate || vl > rl {
+			effectiveVl = vl
+		} else if rl <= MinPrefixInformationValidLifetimeForUpdate {
+			ndp.autoGenAddresses[addr] = addrState
+			return
+		} else {
+			effectiveVl = MinPrefixInformationValidLifetimeForUpdate
+		}
+
+		if addrState.invalidationTimer == nil {
+			addrState.invalidationTimer = ndp.autoGenAddrInvalidationTimer(addr, effectiveVl, addrState.doNotInvalidate)
+		} else {
+			if !addrState.invalidationTimer.Stop() {
+				*addrState.doNotInvalidate = true
+			}
+			addrState.invalidationTimer.Reset(effectiveVl)
+		}
+
+		addrState.validUntil = time.Now().Add(effectiveVl)
+		ndp.autoGenAddresses[addr] = addrState
+		return
+	}
+
+	// We do not already have an address within the prefix, prefix. Do the
+	// work as outlined by RFC 4862 section 5.5.3.d if n is configured
+	// to auto-generated global addresses by SLAAC.
+
+	// Are we configured to auto-generate new global addresses?
+	if !ndp.configs.AutoGenGlobalAddresses {
+		return
+	}
+
+	// If we do not already have an address for this prefix and the valid
+	// lifetime is 0, no need to do anything further, as per RFC 4862
+	// section 5.5.3.d.
+	if vl == 0 {
+		return
+	}
+
+	// Make sure the prefix is valid (as far as its length is concerned) to
+	// generate a valid IPv6 address from an interface identifier (IID), as
+	// per RFC 4862 sectiion 5.5.3.d.
+	if prefix.Prefix() != validPrefixLenForAutoGen {
+		return
+	}
+
+	// Only attempt to generate an interface-specific IID if we have a valid
+	// link address.
+	//
+	// TODO(b/141011931): Validate a LinkEndpoint's link address
+	// (provided by LinkEndpoint.LinkAddress) before reaching this
+	// point.
+	linkAddr := ndp.nic.linkEP.LinkAddress()
+	if !header.IsValidUnicastEthernetAddress(linkAddr) {
+		return
+	}
+
+	// Generate an address within prefix from the EUI-64 of ndp's NIC's
+	// Ethernet MAC address.
+	addrBytes := make([]byte, header.IPv6AddressSize)
+	copy(addrBytes[:header.IIDOffsetInIPv6Address], prefix.ID()[:header.IIDOffsetInIPv6Address])
+	header.EthernetAdddressToEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+	addr := tcpip.Address(addrBytes)
+	addrWithPrefix := tcpip.AddressWithPrefix{
+		Address:   addr,
+		PrefixLen: validPrefixLenForAutoGen,
+	}
+
+	// If the nic already has this address, do nothing further.
+	if ndp.nic.hasPermanentAddrLocked(addr) {
+		return
+	}
+
+	// Inform the integrator that we have a new SLAAC address.
+	if ndp.nic.stack.ndpDisp == nil {
+		return
+	}
+	if !ndp.nic.stack.ndpDisp.OnAutoGenAddress(ndp.nic.ID(), addrWithPrefix) {
+		// Informed by the integrator not to add the address.
+		return
+	}
+
+	if _, err := ndp.nic.addAddressLocked(tcpip.ProtocolAddress{
+		Protocol:          header.IPv6ProtocolNumber,
+		AddressWithPrefix: addrWithPrefix,
+	}, FirstPrimaryEndpoint, permanent, slaac); err != nil {
+		panic(err)
+	}
+
+	// Setup the timers to deprecate and invalidate this newly generated
+	// address.
+
+	// TODO(b/143713887): Handle deprecating auto-generated addresses
+	//                    after the preferred lifetime.
+
+	var doNotInvalidate bool
+	var vTimer *time.Timer
+	if vl < header.NDPInfiniteLifetime {
+		vTimer = ndp.autoGenAddrInvalidationTimer(addr, vl, &doNotInvalidate)
+	}
+
+	ndp.autoGenAddresses[addr] = autoGenAddressState{
+		invalidationTimer: vTimer,
+		doNotInvalidate:   &doNotInvalidate,
+		validUntil:        time.Now().Add(vl),
+	}
+}
+
+// invalidateAutoGenAddress invalidates an auto-generated address.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) invalidateAutoGenAddress(addr tcpip.Address) {
+	if !ndp.cleanupAutoGenAddrResourcesAndNotify(addr) {
+		return
+	}
+
+	ndp.nic.removePermanentAddressLocked(addr)
+}
+
+// cleanupAutoGenAddrResourcesAndNotify cleans up an invalidated auto-generated
+// address's resources from ndp. If the stack has an NDP dispatcher, it will
+// be notified that addr has been invalidated.
+//
+// Returns true if ndp had resources for addr to cleanup.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bool {
+	state, ok := ndp.autoGenAddresses[addr]
+
+	if !ok {
+		return false
+	}
+
+	if state.invalidationTimer != nil {
+		state.invalidationTimer.Stop()
+		state.invalidationTimer = nil
+		*state.doNotInvalidate = true
+	}
+
+	state.doNotInvalidate = nil
+
+	delete(ndp.autoGenAddresses, addr)
+
+	if ndp.nic.stack.ndpDisp != nil {
+		ndp.nic.stack.ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), tcpip.AddressWithPrefix{
+			Address:   addr,
+			PrefixLen: validPrefixLenForAutoGen,
+		})
+	}
+
+	return true
+}
+
+// autoGenAddrInvalidationTimer returns a new invalidation timer for an
+// auto-generated address that fires after vl.
+//
+// doNotInvalidate is used to inform the timer when it fires at the same time
+// that an auto-generated address's valid lifetime gets refreshed. See
+// autoGenAddrState.doNotInvalidate for more details.
+func (ndp *ndpState) autoGenAddrInvalidationTimer(addr tcpip.Address, vl time.Duration, doNotInvalidate *bool) *time.Timer {
+	return time.AfterFunc(vl, func() {
+		ndp.nic.mu.Lock()
+		defer ndp.nic.mu.Unlock()
+
+		if *doNotInvalidate {
+			*doNotInvalidate = false
+			return
+		}
+
+		ndp.invalidateAutoGenAddress(addr)
+	})
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index b2af78212..e9aa20148 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -38,7 +38,7 @@ const (
 	linkAddr1      = "\x02\x02\x03\x04\x05\x06"
 	linkAddr2      = "\x02\x02\x03\x04\x05\x07"
 	linkAddr3      = "\x02\x02\x03\x04\x05\x08"
-	defaultTimeout = 250 * time.Millisecond
+	defaultTimeout = 100 * time.Millisecond
 )
 
 var (
@@ -47,6 +47,31 @@ var (
 	llAddr3 = header.LinkLocalAddr(linkAddr3)
 )
 
+// prefixSubnetAddr returns a prefix (Address + Length), the prefix's equivalent
+// tcpip.Subnet, and an address where the lower half of the address is composed
+// of the EUI-64 of linkAddr if it is a valid unicast ethernet address.
+func prefixSubnetAddr(offset uint8, linkAddr tcpip.LinkAddress) (tcpip.AddressWithPrefix, tcpip.Subnet, tcpip.AddressWithPrefix) {
+	prefixBytes := []byte{1, 2, 3, 4, 5, 6, 7, 8 + offset, 0, 0, 0, 0, 0, 0, 0, 0}
+	prefix := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address(prefixBytes),
+		PrefixLen: 64,
+	}
+
+	subnet := prefix.Subnet()
+
+	var addr tcpip.AddressWithPrefix
+	if header.IsValidUnicastEthernetAddress(linkAddr) {
+		addrBytes := []byte(subnet.ID())
+		header.EthernetAdddressToEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+		addr = tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(addrBytes),
+			PrefixLen: 64,
+		}
+	}
+
+	return prefix, subnet, addr
+}
+
 // TestDADDisabled tests that an address successfully resolves immediately
 // when DAD is not enabled (the default for an empty stack.Options).
 func TestDADDisabled(t *testing.T) {
@@ -103,6 +128,19 @@ type ndpPrefixEvent struct {
 	discovered bool
 }
 
+type ndpAutoGenAddrEventType int
+
+const (
+	newAddr ndpAutoGenAddrEventType = iota
+	invalidatedAddr
+)
+
+type ndpAutoGenAddrEvent struct {
+	nicID     tcpip.NICID
+	addr      tcpip.AddressWithPrefix
+	eventType ndpAutoGenAddrEventType
+}
+
 var _ stack.NDPDispatcher = (*ndpDispatcher)(nil)
 
 // ndpDispatcher implements NDPDispatcher so tests can know when various NDP
@@ -113,6 +151,7 @@ type ndpDispatcher struct {
 	rememberRouter bool
 	prefixC        chan ndpPrefixEvent
 	rememberPrefix bool
+	autoGenAddrC   chan ndpAutoGenAddrEvent
 	routeTable     []tcpip.Route
 }
 
@@ -211,7 +250,7 @@ func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpi
 		}
 	}
 
-	rt := make([]tcpip.Route, 0)
+	var rt []tcpip.Route
 	exclude := tcpip.Route{
 		Destination: prefix,
 		NIC:         nicID,
@@ -226,6 +265,27 @@ func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpi
 	return rt
 }
 
+func (n *ndpDispatcher) OnAutoGenAddress(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) bool {
+	if n.autoGenAddrC != nil {
+		n.autoGenAddrC <- ndpAutoGenAddrEvent{
+			nicID,
+			addr,
+			newAddr,
+		}
+	}
+	return true
+}
+
+func (n *ndpDispatcher) OnAutoGenAddressInvalidated(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) {
+	if n.autoGenAddrC != nil {
+		n.autoGenAddrC <- ndpAutoGenAddrEvent{
+			nicID,
+			addr,
+			invalidatedAddr,
+		}
+	}
+}
+
 // TestDADResolve tests that an address successfully resolves after performing
 // DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
 // Included in the subtests is a test to make sure that an invalid
@@ -247,6 +307,8 @@ func TestDADResolve(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
+			t.Parallel()
+
 			ndpDisp := ndpDispatcher{
 				dadC: make(chan ndpDADEvent),
 			}
@@ -781,16 +843,33 @@ func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer {
 //
 // Note, raBufWithPI does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink bool, vl uint32) tcpip.PacketBuffer {
+func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) tcpip.PacketBuffer {
 	flags := uint8(0)
 	if onLink {
-		flags |= 128
+		// The OnLink flag is the 7th bit in the flags byte.
+		flags |= 1 << 7
+	}
+	if auto {
+		// The Address Auto-Configuration flag is the 6th bit in the
+		// flags byte.
+		flags |= 1 << 6
 	}
 
+	// A valid header.NDPPrefixInformation must be 30 bytes.
 	buf := [30]byte{}
+	// The first byte in a header.NDPPrefixInformation is the Prefix Length
+	// field.
 	buf[0] = uint8(prefix.PrefixLen)
+	// The 2nd byte within a header.NDPPrefixInformation is the Flags field.
 	buf[1] = flags
+	// The Valid Lifetime field starts after the 2nd byte within a
+	// header.NDPPrefixInformation.
 	binary.BigEndian.PutUint32(buf[2:], vl)
+	// The Preferred Lifetime field starts after the 6th byte within a
+	// header.NDPPrefixInformation.
+	binary.BigEndian.PutUint32(buf[6:], pl)
+	// The Prefix Address field starts after the 14th byte within a
+	// header.NDPPrefixInformation.
 	copy(buf[14:], prefix.Address)
 	return raBufWithOpts(ip, rl, header.NDPOptionsSerializer{
 		header.NDPPrefixInformation(buf[:]),
@@ -800,6 +879,8 @@ func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, on
 // TestNoRouterDiscovery tests that router discovery will not be performed if
 // configured not to.
 func TestNoRouterDiscovery(t *testing.T) {
+	t.Parallel()
+
 	// Being configured to discover routers means handle and
 	// discover are set to true and forwarding is set to false.
 	// This tests all possible combinations of the configurations,
@@ -812,6 +893,8 @@ func TestNoRouterDiscovery(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverDefaultRouters(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
+			t.Parallel()
+
 			ndpDisp := ndpDispatcher{
 				routerC: make(chan ndpRouterEvent, 10),
 			}
@@ -844,6 +927,8 @@ func TestNoRouterDiscovery(t *testing.T) {
 // TestRouterDiscoveryDispatcherNoRemember tests that the stack does not
 // remember a discovered router when the dispatcher asks it not to.
 func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
+	t.Parallel()
+
 	ndpDisp := ndpDispatcher{
 		routerC: make(chan ndpRouterEvent, 10),
 	}
@@ -909,6 +994,8 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
 }
 
 func TestRouterDiscovery(t *testing.T) {
+	t.Parallel()
+
 	ndpDisp := ndpDispatcher{
 		routerC:        make(chan ndpRouterEvent, 10),
 		rememberRouter: true,
@@ -1040,6 +1127,8 @@ func TestRouterDiscovery(t *testing.T) {
 // TestRouterDiscoveryMaxRouters tests that only
 // stack.MaxDiscoveredDefaultRouters discovered routers are remembered.
 func TestRouterDiscoveryMaxRouters(t *testing.T) {
+	t.Parallel()
+
 	ndpDisp := ndpDispatcher{
 		routerC:        make(chan ndpRouterEvent, 10),
 		rememberRouter: true,
@@ -1104,6 +1193,8 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
 // TestNoPrefixDiscovery tests that prefix discovery will not be performed if
 // configured not to.
 func TestNoPrefixDiscovery(t *testing.T) {
+	t.Parallel()
+
 	prefix := tcpip.AddressWithPrefix{
 		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
 		PrefixLen: 64,
@@ -1121,6 +1212,8 @@ func TestNoPrefixDiscovery(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverOnLinkPrefixes(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
+			t.Parallel()
+
 			ndpDisp := ndpDispatcher{
 				prefixC: make(chan ndpPrefixEvent, 10),
 			}
@@ -1140,7 +1233,7 @@ func TestNoPrefixDiscovery(t *testing.T) {
 			}
 
 			// Rx an RA with prefix with non-zero lifetime.
-			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, 10))
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, 10, 0))
 
 			select {
 			case <-ndpDisp.prefixC:
@@ -1154,11 +1247,9 @@ func TestNoPrefixDiscovery(t *testing.T) {
 // TestPrefixDiscoveryDispatcherNoRemember tests that the stack does not
 // remember a discovered on-link prefix when the dispatcher asks it not to.
 func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
-	prefix := tcpip.AddressWithPrefix{
-		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
-		PrefixLen: 64,
-	}
-	subnet := prefix.Subnet()
+	t.Parallel()
+
+	prefix, subnet, _ := prefixSubnetAddr(0, "")
 
 	ndpDisp := ndpDispatcher{
 		prefixC: make(chan ndpPrefixEvent, 10),
@@ -1189,7 +1280,7 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
 
 	// Rx an RA with prefix with a short lifetime.
 	const lifetime = 1
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, lifetime))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, lifetime, 0))
 	select {
 	case r := <-ndpDisp.prefixC:
 		if r.nicID != 1 {
@@ -1226,21 +1317,11 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
 }
 
 func TestPrefixDiscovery(t *testing.T) {
-	prefix1 := tcpip.AddressWithPrefix{
-		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
-		PrefixLen: 64,
-	}
-	prefix2 := tcpip.AddressWithPrefix{
-		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x09\x00\x00\x00\x00\x00\x00\x00\x00"),
-		PrefixLen: 64,
-	}
-	prefix3 := tcpip.AddressWithPrefix{
-		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x09\x0a\x00\x00\x00\x00\x00\x00\x00"),
-		PrefixLen: 72,
-	}
-	subnet1 := prefix1.Subnet()
-	subnet2 := prefix2.Subnet()
-	subnet3 := prefix3.Subnet()
+	t.Parallel()
+
+	prefix1, subnet1, _ := prefixSubnetAddr(0, "")
+	prefix2, subnet2, _ := prefixSubnetAddr(1, "")
+	prefix3, subnet3, _ := prefixSubnetAddr(2, "")
 
 	ndpDisp := ndpDispatcher{
 		prefixC:        make(chan ndpPrefixEvent, 10),
@@ -1281,7 +1362,7 @@ func TestPrefixDiscovery(t *testing.T) {
 
 	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
 	// with zero valid lifetime.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 0))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 0, 0))
 	select {
 	case <-ndpDisp.prefixC:
 		t.Fatal("unexpectedly discovered a prefix with 0 lifetime")
@@ -1290,7 +1371,7 @@ func TestPrefixDiscovery(t *testing.T) {
 
 	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
 	// with non-zero lifetime.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 100))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 100, 0))
 	waitForEvent(subnet1, true, defaultTimeout)
 
 	// Should have added a device route for subnet1 through the nic.
@@ -1299,7 +1380,7 @@ func TestPrefixDiscovery(t *testing.T) {
 	}
 
 	// Receive an RA with prefix2 in a PI.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, 100))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, false, 100, 0))
 	waitForEvent(subnet2, true, defaultTimeout)
 
 	// Should have added a device route for subnet2 through the nic.
@@ -1308,7 +1389,7 @@ func TestPrefixDiscovery(t *testing.T) {
 	}
 
 	// Receive an RA with prefix3 in a PI.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, 100))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 100, 0))
 	waitForEvent(subnet3, true, defaultTimeout)
 
 	// Should have added a device route for subnet3 through the nic.
@@ -1317,7 +1398,7 @@ func TestPrefixDiscovery(t *testing.T) {
 	}
 
 	// Receive an RA with prefix1 in a PI with lifetime = 0.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 0))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 0, 0))
 	waitForEvent(subnet1, false, defaultTimeout)
 
 	// Should have removed the device route for subnet1 through the nic.
@@ -1327,7 +1408,7 @@ func TestPrefixDiscovery(t *testing.T) {
 
 	// Receive an RA with prefix2 in a PI with lesser lifetime.
 	lifetime := uint32(2)
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, lifetime))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, false, lifetime, 0))
 	select {
 	case <-ndpDisp.prefixC:
 		t.Fatal("unexpectedly received prefix event when updating lifetime")
@@ -1349,7 +1430,7 @@ func TestPrefixDiscovery(t *testing.T) {
 	}
 
 	// Receive RA to invalidate prefix3.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, 0))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 0, 0))
 	waitForEvent(subnet3, false, defaultTimeout)
 
 	// Should not have any routes.
@@ -1415,7 +1496,7 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 
 	// Receive an RA with prefix in an NDP Prefix Information option (PI)
 	// with infinite valid lifetime which should not get invalidated.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds, 0))
 	waitForEvent(true, defaultTimeout)
 	select {
 	case <-ndpDisp.prefixC:
@@ -1425,16 +1506,16 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 
 	// Receive an RA with finite lifetime.
 	// The prefix should get invalidated after 1s.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds-1))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0))
 	waitForEvent(false, testInfiniteLifetime)
 
 	// Receive an RA with finite lifetime.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds-1))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0))
 	waitForEvent(true, defaultTimeout)
 
 	// Receive an RA with prefix with an infinite lifetime.
 	// The prefix should not be invalidated.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds, 0))
 	select {
 	case <-ndpDisp.prefixC:
 		t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
@@ -1443,7 +1524,7 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 
 	// Receive an RA with a prefix with a lifetime value greater than the
 	// set infinite lifetime value.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds+1))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds+1, 0))
 	select {
 	case <-ndpDisp.prefixC:
 		t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
@@ -1452,13 +1533,15 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 
 	// Receive an RA with 0 lifetime.
 	// The prefix should get invalidated.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, 0))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, 0, 0))
 	waitForEvent(false, defaultTimeout)
 }
 
 // TestPrefixDiscoveryMaxRouters tests that only
 // stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are remembered.
 func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
+	t.Parallel()
+
 	ndpDisp := ndpDispatcher{
 		prefixC:        make(chan ndpPrefixEvent, stack.MaxDiscoveredOnLinkPrefixes+3),
 		rememberPrefix: true,
@@ -1537,3 +1620,458 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
 		t.Fatalf("got GetRouteTable = %v, want = %v", got, expectedRt)
 	}
 }
+
+// Checks to see if list contains an IPv6 address, item.
+func contains(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool {
+	protocolAddress := tcpip.ProtocolAddress{
+		Protocol:          header.IPv6ProtocolNumber,
+		AddressWithPrefix: item,
+	}
+
+	for _, i := range list {
+		if i == protocolAddress {
+			return true
+		}
+	}
+
+	return false
+}
+
+// TestNoAutoGenAddr tests that SLAAC is not performed when configured not to.
+func TestNoAutoGenAddr(t *testing.T) {
+	t.Parallel()
+
+	prefix, _, _ := prefixSubnetAddr(0, "")
+
+	// Being configured to auto-generate addresses means handle and
+	// autogen are set to true and forwarding is set to false.
+	// This tests all possible combinations of the configurations,
+	// except for the configuration where handle = true, autogen =
+	// true and forwarding = false (the required configuration to do
+	// SLAAC) - that will done in other tests.
+	for i := 0; i < 7; i++ {
+		handle := i&1 != 0
+		autogen := i&2 != 0
+		forwarding := i&4 == 0
+
+		t.Run(fmt.Sprintf("HandleRAs(%t), AutoGenAddr(%t), Forwarding(%t)", handle, autogen, forwarding), func(t *testing.T) {
+			t.Parallel()
+
+			ndpDisp := ndpDispatcher{
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+			}
+			e := channel.New(10, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:              handle,
+					AutoGenGlobalAddresses: autogen,
+				},
+				NDPDisp: &ndpDisp,
+			})
+			s.SetForwarding(forwarding)
+
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(1) = %s", err)
+			}
+
+			// Rx an RA with prefix with non-zero lifetime.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, false, true, 10, 0))
+
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly auto-generated an address when configured not to")
+			case <-time.After(defaultTimeout):
+			}
+		})
+	}
+}
+
+// TestAutoGenAddr tests that an address is properly generated and invalidated
+// when configured to do so.
+func TestAutoGenAddr(t *testing.T) {
+	const newMinVL = 2
+	newMinVLDuration := newMinVL * time.Second
+	saved := stack.MinPrefixInformationValidLifetimeForUpdate
+	defer func() {
+		stack.MinPrefixInformationValidLifetimeForUpdate = saved
+	}()
+	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
+
+	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+
+	ndpDisp := ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+	}
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	waitForEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
+		t.Helper()
+
+		select {
+		case r := <-ndpDisp.autoGenAddrC:
+			if r.nicID != 1 {
+				t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+			}
+			if r.addr != addr {
+				t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+			}
+			if r.eventType != eventType {
+				t.Fatalf("got r.eventType = %v, want = %v", r.eventType, eventType)
+			}
+		case <-time.After(timeout):
+			t.Fatal("timeout waiting for addr auto gen event")
+		}
+	}
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with zero valid lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 0, 0))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly auto-generated an address with 0 lifetime")
+	case <-time.After(defaultTimeout):
+	}
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with non-zero lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
+	waitForEvent(addr1, newAddr, defaultTimeout)
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+		t.Fatalf("Should have %s in the list of addresses", addr1)
+	}
+
+	// Receive an RA with prefix2 in an NDP Prefix Information option (PI)
+	// with preferred lifetime > valid lifetime
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 5, 6))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly auto-generated an address with preferred lifetime > valid lifetime")
+	case <-time.After(defaultTimeout):
+	}
+
+	// Receive an RA with prefix2 in a PI.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+	waitForEvent(addr2, newAddr, defaultTimeout)
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+		t.Fatalf("Should have %s in the list of addresses", addr1)
+	}
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) {
+		t.Fatalf("Should have %s in the list of addresses", addr2)
+	}
+
+	// Refresh valid lifetime for addr of prefix1.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly auto-generated an address when we already have an address for a prefix")
+	case <-time.After(defaultTimeout):
+	}
+
+	// Wait for addr of prefix1 to be invalidated.
+	waitForEvent(addr1, invalidatedAddr, newMinVLDuration+defaultTimeout)
+	if contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+		t.Fatalf("Should not have %s in the list of addresses", addr1)
+	}
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) {
+		t.Fatalf("Should have %s in the list of addresses", addr2)
+	}
+}
+
+// TestAutoGenAddrValidLifetimeUpdates tests that the valid lifetime of an
+// auto-generated address only gets updated when required to, as specified in
+// RFC 4862 section 5.5.3.e.
+func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
+	const infiniteVL = 4294967295
+	const newMinVL = 5
+	saved := stack.MinPrefixInformationValidLifetimeForUpdate
+	defer func() {
+		stack.MinPrefixInformationValidLifetimeForUpdate = saved
+	}()
+	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVL * time.Second
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+	tests := []struct {
+		name string
+		ovl  uint32
+		nvl  uint32
+		evl  uint32
+	}{
+		// Should update the VL to the minimum VL for updating if the
+		// new VL is less than newMinVL but was originally greater than
+		// it.
+		{
+			"LargeVLToVLLessThanMinVLForUpdate",
+			9999,
+			1,
+			newMinVL,
+		},
+		{
+			"LargeVLTo0",
+			9999,
+			0,
+			newMinVL,
+		},
+		{
+			"InfiniteVLToVLLessThanMinVLForUpdate",
+			infiniteVL,
+			1,
+			newMinVL,
+		},
+		{
+			"InfiniteVLTo0",
+			infiniteVL,
+			0,
+			newMinVL,
+		},
+
+		// Should not update VL if original VL was less than newMinVL
+		// and the new VL is also less than newMinVL.
+		{
+			"ShouldNotUpdateWhenBothOldAndNewAreLessThanMinVLForUpdate",
+			newMinVL - 1,
+			newMinVL - 3,
+			newMinVL - 1,
+		},
+
+		// Should take the new VL if the new VL is greater than the
+		// remaining time or is greater than newMinVL.
+		{
+			"MorethanMinVLToLesserButStillMoreThanMinVLForUpdate",
+			newMinVL + 5,
+			newMinVL + 3,
+			newMinVL + 3,
+		},
+		{
+			"SmallVLToGreaterVLButStillLessThanMinVLForUpdate",
+			newMinVL - 3,
+			newMinVL - 1,
+			newMinVL - 1,
+		},
+		{
+			"SmallVLToGreaterVLThatIsMoreThaMinVLForUpdate",
+			newMinVL - 3,
+			newMinVL + 1,
+			newMinVL + 1,
+		},
+	}
+
+	const delta = 500 * time.Millisecond
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			t.Parallel()
+
+			ndpDisp := ndpDispatcher{
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+			}
+			e := channel.New(10, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:              true,
+					AutoGenGlobalAddresses: true,
+				},
+				NDPDisp: &ndpDisp,
+			})
+
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(1) = %s", err)
+			}
+
+			// Receive an RA with prefix with initial VL, test.ovl.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.ovl, 0))
+			select {
+			case r := <-ndpDisp.autoGenAddrC:
+				if r.nicID != 1 {
+					t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+				}
+				if r.addr != addr {
+					t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+				}
+				if r.eventType != newAddr {
+					t.Fatalf("got r.eventType = %v, want = %v", r.eventType, newAddr)
+				}
+			case <-time.After(defaultTimeout):
+				t.Fatal("timeout waiting for addr auto gen event")
+			}
+
+			// Receive an new RA with prefix with new VL, test.nvl.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.nvl, 0))
+
+			//
+			// Validate that the VL for the address got set to
+			// test.evl.
+			//
+
+			// Make sure we do not get any invalidation events
+			// until atleast 500ms (delta) before test.evl.
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatalf("unexpectedly received an auto gen addr event")
+			case <-time.After(time.Duration(test.evl)*time.Second - delta):
+			}
+
+			// Wait for another second (2x delta), but now we expect
+			// the invalidation event.
+			select {
+			case r := <-ndpDisp.autoGenAddrC:
+				if r.nicID != 1 {
+					t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+				}
+				if r.addr != addr {
+					t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+				}
+				if r.eventType != invalidatedAddr {
+					t.Fatalf("got r.eventType = %v, want = %v", r.eventType, newAddr)
+				}
+			case <-time.After(2 * delta):
+				t.Fatal("timeout waiting for addr auto gen event")
+			}
+		})
+	}
+}
+
+// TestAutoGenAddrRemoval tests that when auto-generated addresses are removed
+// by the user, its resources will be cleaned up and an invalidation event will
+// be sent to the integrator.
+func TestAutoGenAddrRemoval(t *testing.T) {
+	t.Parallel()
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+	ndpDisp := ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+	}
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// Receive an RA with prefix with its valid lifetime = lifetime.
+	const lifetime = 5
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetime, 0))
+	select {
+	case r := <-ndpDisp.autoGenAddrC:
+		if r.nicID != 1 {
+			t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+		}
+		if r.addr != addr {
+			t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+		}
+		if r.eventType != newAddr {
+			t.Fatalf("got r.eventType = %v, want = %v", r.eventType, newAddr)
+		}
+	case <-time.After(defaultTimeout):
+		t.Fatal("timeout waiting for addr auto gen event")
+	}
+
+	// Remove the address.
+	if err := s.RemoveAddress(1, addr.Address); err != nil {
+		t.Fatalf("RemoveAddress(_, %s) = %s", addr.Address, err)
+	}
+
+	// Should get the invalidation event immediately.
+	select {
+	case r := <-ndpDisp.autoGenAddrC:
+		if r.nicID != 1 {
+			t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+		}
+		if r.addr != addr {
+			t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+		}
+		if r.eventType != invalidatedAddr {
+			t.Fatalf("got r.eventType = %v, want = %v", r.eventType, newAddr)
+		}
+	case <-time.After(defaultTimeout):
+		t.Fatal("timeout waiting for addr auto gen event")
+	}
+
+	// Wait for the original valid lifetime to make sure the original timer
+	// got stopped/cleaned up.
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatalf("unexpectedly received an auto gen addr event")
+	case <-time.After(lifetime*time.Second + defaultTimeout):
+	}
+}
+
+// TestAutoGenAddrStaticConflict tests that if SLAAC generates an address that
+// is already assigned to the NIC, the static address remains.
+func TestAutoGenAddrStaticConflict(t *testing.T) {
+	t.Parallel()
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+	ndpDisp := ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+	}
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// Add the address as a static address before SLAAC tries to add it.
+	if err := s.AddProtocolAddress(1, tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: addr}); err != nil {
+		t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr.Address, err)
+	}
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+		t.Fatalf("Should have %s in the list of addresses", addr1)
+	}
+
+	// Receive a PI where the generated address will be the same as the one
+	// that we already have assigned statically.
+	const lifetime = 5
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetime, 0))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly received an auto gen addr event for an address we already have statically")
+	case <-time.After(defaultTimeout):
+	}
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+		t.Fatalf("Should have %s in the list of addresses", addr1)
+	}
+
+	// Should not get an invalidation event after the PI's invalidation
+	// time.
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly received an auto gen addr event")
+	case <-time.After(lifetime*time.Second + defaultTimeout):
+	}
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+		t.Fatalf("Should have %s in the list of addresses", addr1)
+	}
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 3f8d7312c..e8401c673 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -115,10 +115,11 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback
 			},
 		},
 		ndp: ndpState{
-			configs:        stack.ndpConfigs,
-			dad:            make(map[tcpip.Address]dadState),
-			defaultRouters: make(map[tcpip.Address]defaultRouterState),
-			onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState),
+			configs:          stack.ndpConfigs,
+			dad:              make(map[tcpip.Address]dadState),
+			defaultRouters:   make(map[tcpip.Address]defaultRouterState),
+			onLinkPrefixes:   make(map[tcpip.Subnet]onLinkPrefixState),
+			autoGenAddresses: make(map[tcpip.Address]autoGenAddressState),
 		},
 	}
 	nic.ndp.nic = nic
@@ -244,6 +245,20 @@ func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedN
 	return nil
 }
 
+// hasPermanentAddrLocked returns true if n has a permanent (including currently
+// tentative) address, addr.
+func (n *NIC) hasPermanentAddrLocked(addr tcpip.Address) bool {
+	ref, ok := n.endpoints[NetworkEndpointID{addr}]
+
+	if !ok {
+		return false
+	}
+
+	kind := ref.getKind()
+
+	return kind == permanent || kind == permanentTentative
+}
+
 func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *referencedNetworkEndpoint {
 	return n.getRefOrCreateTemp(protocol, dst, CanBePrimaryEndpoint, n.promiscuous)
 }
@@ -335,7 +350,7 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 			Address:   address,
 			PrefixLen: netProto.DefaultPrefixLen(),
 		},
-	}, peb, temporary)
+	}, peb, temporary, static)
 
 	n.mu.Unlock()
 	return ref
@@ -384,10 +399,10 @@ func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, p
 		}
 	}
 
-	return n.addAddressLocked(protocolAddress, peb, permanent)
+	return n.addAddressLocked(protocolAddress, peb, permanent, static)
 }
 
-func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind) (*referencedNetworkEndpoint, *tcpip.Error) {
+func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind, configType networkEndpointConfigType) (*referencedNetworkEndpoint, *tcpip.Error) {
 	// TODO(b/141022673): Validate IP address before adding them.
 
 	// Sanity check.
@@ -417,11 +432,12 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 	}
 
 	ref := &referencedNetworkEndpoint{
-		refs:     1,
-		ep:       ep,
-		nic:      n,
-		protocol: protocolAddress.Protocol,
-		kind:     kind,
+		refs:       1,
+		ep:         ep,
+		nic:        n,
+		protocol:   protocolAddress.Protocol,
+		kind:       kind,
+		configType: configType,
 	}
 
 	// Set up cache if link address resolution exists for this protocol.
@@ -624,9 +640,18 @@ func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
 
 	isIPv6Unicast := r.protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(addr)
 
-	// If we are removing a tentative IPv6 unicast address, stop DAD.
-	if isIPv6Unicast && kind == permanentTentative {
-		n.ndp.stopDuplicateAddressDetection(addr)
+	if isIPv6Unicast {
+		// If we are removing a tentative IPv6 unicast address, stop
+		// DAD.
+		if kind == permanentTentative {
+			n.ndp.stopDuplicateAddressDetection(addr)
+		}
+
+		// If we are removing an address generated via SLAAC, cleanup
+		// its SLAAC resources and notify the integrator.
+		if r.configType == slaac {
+			n.ndp.cleanupAutoGenAddrResourcesAndNotify(addr)
+		}
 	}
 
 	r.setKind(permanentExpired)
@@ -989,7 +1014,7 @@ const (
 	// removing the permanent address from the NIC.
 	permanent
 
-	// An expired permanent endoint is a permanent endoint that had its address
+	// An expired permanent endpoint is a permanent endpoint that had its address
 	// removed from the NIC, and it is waiting to be removed once no more routes
 	// hold a reference to it. This is achieved by decreasing its reference count
 	// by 1. If its address is re-added before the endpoint is removed, its type
@@ -1035,6 +1060,19 @@ func (n *NIC) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep
 	}
 }
 
+type networkEndpointConfigType int32
+
+const (
+	// A statically configured endpoint is an address that was added by
+	// some user-specified action (adding an explicit address, joining a
+	// multicast group).
+	static networkEndpointConfigType = iota
+
+	// A slaac configured endpoint is an IPv6 endpoint that was
+	// added by SLAAC as per RFC 4862 section 5.5.3.
+	slaac
+)
+
 type referencedNetworkEndpoint struct {
 	ep       NetworkEndpoint
 	nic      *NIC
@@ -1050,6 +1088,10 @@ type referencedNetworkEndpoint struct {
 
 	// networkEndpointKind must only be accessed using {get,set}Kind().
 	kind networkEndpointKind
+
+	// configType is the method that was used to configure this endpoint.
+	// This must never change after the endpoint is added to a NIC.
+	configType networkEndpointConfigType
 }
 
 func (r *referencedNetworkEndpoint) getKind() networkEndpointKind {
-- 
cgit v1.2.3


From 7b81633ff828a2fdb3c96f2288407a1d4401b1ef Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Fri, 6 Dec 2019 15:25:16 -0800
Subject: Build with C++17

This will require a reasonably modern toolchain. I've put minimum compiler
versions in the README based on versions in
https://en.cppreference.com/w/cpp/compiler_support that have mostly complete
language and library support.

The minimum Bazel version bump is unrelated, but 0.28 is definitely not
supported anymore.

Please report issues on gvisor.dev/issue/1349.

Fixes #1349

PiperOrigin-RevId: 284274250
---
 .bazelrc  | 3 +++
 README.md | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.bazelrc b/.bazelrc
index 379fc8328..7f87e94b1 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Build with C++17.
+build --cxxopt=-std=c++17
+
 # Display the current git revision in the info block.
 build --stamp --workspace_status_command tools/workspace_status.sh
 
diff --git a/README.md b/README.md
index 5ac6f9046..de3e06f4e 100644
--- a/README.md
+++ b/README.md
@@ -48,9 +48,10 @@ Make sure the following dependencies are installed:
 
 *   Linux 4.14.77+ ([older linux][old-linux])
 *   [git][git]
-*   [Bazel][bazel] 0.28.0+
+*   [Bazel][bazel] 1.2+
 *   [Python][python]
 *   [Docker version 17.09.0 or greater][docker]
+*   C++ toolchain supporting C++17 (GCC 7+, Clang 5+)
 *   Gold linker (e.g. `binutils-gold` package on Ubuntu)
 
 ### Building
-- 
cgit v1.2.3


From 3e84777d2e2a2b56c00487cd77aa8d2fc25bbb16 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Fri, 6 Dec 2019 15:45:01 -0800
Subject: Fix flakiness in tcp_test.

This change marks the socket as ESTABLISHED and creates the receiver and sender
the moment we send the final ACK in case of an active TCP handshake or when we
receive the final ACK for a passive TCP handshake. Before this change there was
a short window in which an ACK can be received and processed but the state on
the socket is not yet ESTABLISHED.

This can be seen in TestConnectBindToDevice which is flaky because sometimes
the socket is in SYN-SENT and not ESTABLISHED even though the other side has
already received the final ACK of the handshake.

PiperOrigin-RevId: 284277713
---
 pkg/tcpip/transport/tcp/accept.go                  |  4 +-
 pkg/tcpip/transport/tcp/connect.go                 | 52 ++++++++++++++--------
 pkg/tcpip/transport/tcp/tcp_test.go                |  2 +-
 pkg/tcpip/transport/tcp/testing/context/context.go |  3 ++
 4 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index f543a6105..74df3edfb 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -298,8 +298,6 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 		return nil, err
 	}
 	ep.mu.Lock()
-	ep.stack.Stats().TCP.CurrentEstablished.Increment()
-	ep.state = StateEstablished
 	ep.isConnectNotified = true
 	ep.mu.Unlock()
 
@@ -546,6 +544,8 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		n.tsOffset = 0
 
 		// Switch state to connected.
+		// We do not use transitionToStateEstablishedLocked here as there is
+		// no handshake state available when doing a SYN cookie based accept.
 		n.stack.Stats().TCP.CurrentEstablished.Increment()
 		n.state = StateEstablished
 		n.isConnectNotified = true
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 16f8aea12..2975a1c3c 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -252,6 +252,11 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	// and the handshake is completed.
 	if s.flagIsSet(header.TCPFlagAck) {
 		h.state = handshakeCompleted
+
+		h.ep.mu.Lock()
+		h.ep.transitionToStateEstablishedLocked(h)
+		h.ep.mu.Unlock()
+
 		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
 		return nil
 	}
@@ -352,6 +357,10 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
 		}
 		h.state = handshakeCompleted
+		h.ep.mu.Lock()
+		h.ep.transitionToStateEstablishedLocked(h)
+		h.ep.mu.Unlock()
+
 		return nil
 	}
 
@@ -880,6 +889,30 @@ func (e *endpoint) completeWorkerLocked() {
 	}
 }
 
+// transitionToStateEstablisedLocked transitions a given endpoint
+// to an established state using the handshake parameters provided.
+// It also initializes sender/receiver if required.
+func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
+	if e.snd == nil {
+		// Transfer handshake state to TCP connection. We disable
+		// receive window scaling if the peer doesn't support it
+		// (indicated by a negative send window scale).
+		e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)
+	}
+	if e.rcv == nil {
+		rcvBufSize := seqnum.Size(e.receiveBufferSize())
+		e.rcvListMu.Lock()
+		e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize)
+		// Bootstrap the auto tuning algorithm. Starting at zero will
+		// result in a really large receive window after the first auto
+		// tuning adjustment.
+		e.rcvAutoParams.prevCopied = int(h.rcvWnd)
+		e.rcvListMu.Unlock()
+	}
+	h.ep.stack.Stats().TCP.CurrentEstablished.Increment()
+	e.state = StateEstablished
+}
+
 // transitionToStateCloseLocked ensures that the endpoint is
 // cleaned up from the transport demuxer, "before" moving to
 // StateClose. This will ensure that no packet will be
@@ -1156,25 +1189,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 
 			return err
 		}
-
-		// Transfer handshake state to TCP connection. We disable
-		// receive window scaling if the peer doesn't support it
-		// (indicated by a negative send window scale).
-		e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)
-
-		rcvBufSize := seqnum.Size(e.receiveBufferSize())
-		e.rcvListMu.Lock()
-		e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize)
-		// boot strap the auto tuning algorithm. Starting at zero will
-		// result in a large step function on the first proper causing
-		// the window to just go to a really large value after the first
-		// RTT itself.
-		e.rcvAutoParams.prevCopied = initialRcvWnd
-		e.rcvListMu.Unlock()
-		e.stack.Stats().TCP.CurrentEstablished.Increment()
-		e.mu.Lock()
-		e.state = StateEstablished
-		e.mu.Unlock()
 	}
 
 	e.keepalive.timer.init(&e.keepalive.waker)
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index d1f0d6ce7..52c2fa7e3 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -541,7 +541,7 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
 	ep.(interface{ ResumeWork() }).ResumeWork()
 
 	// Wait for the protocolMainLoop to resume and update state.
-	time.Sleep(1 * time.Millisecond)
+	time.Sleep(10 * time.Millisecond)
 
 	// Expect the endpoint to be closed.
 	if got, want := tcp.EndpointState(ep.State()), tcp.StateClose; got != want {
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 6cb66c1af..b0a376eba 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -231,6 +231,7 @@ func (c *Context) CheckNoPacket(errMsg string) {
 // addresses. It will fail with an error if no packet is received for
 // 2 seconds.
 func (c *Context) GetPacket() []byte {
+	c.t.Helper()
 	select {
 	case p := <-c.linkEP.C:
 		if p.Proto != ipv4.ProtocolNumber {
@@ -259,6 +260,7 @@ func (c *Context) GetPacket() []byte {
 // and destination address. If no packet is available it will return
 // nil immediately.
 func (c *Context) GetPacketNonBlocking() []byte {
+	c.t.Helper()
 	select {
 	case p := <-c.linkEP.C:
 		if p.Proto != ipv4.ProtocolNumber {
@@ -483,6 +485,7 @@ func (c *Context) CreateV6Endpoint(v6only bool) {
 // GetV6Packet reads a single packet from the link layer endpoint of the context
 // and asserts that it is an IPv6 Packet with the expected src/dest addresses.
 func (c *Context) GetV6Packet() []byte {
+	c.t.Helper()
 	select {
 	case p := <-c.linkEP.C:
 		if p.Proto != ipv6.ProtocolNumber {
-- 
cgit v1.2.3


From 371e210b83c244d8828ad2fa1b3d7cef15fbf463 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 6 Dec 2019 16:58:28 -0800
Subject: Add runtime tracing.

This adds meaningful annotations to the trace generated by the runtime/trace
package.

PiperOrigin-RevId: 284290115
---
 pkg/sentry/control/pprof.go       | 15 ++++++-
 pkg/sentry/kernel/kernel.go       | 20 ++++++++-
 pkg/sentry/kernel/syscalls.go     |  8 ++++
 pkg/sentry/kernel/task.go         | 20 +++++----
 pkg/sentry/kernel/task_block.go   |  8 +++-
 pkg/sentry/kernel/task_clone.go   |  1 +
 pkg/sentry/kernel/task_exec.go    |  3 +-
 pkg/sentry/kernel/task_exit.go    |  1 +
 pkg/sentry/kernel/task_log.go     | 86 +++++++++++++++++++++++++++++++++++++--
 pkg/sentry/kernel/task_run.go     | 14 +++++++
 pkg/sentry/kernel/task_start.go   |  8 ++--
 pkg/sentry/kernel/task_syscall.go |  8 ++++
 runsc/boot/controller.go          |  4 +-
 runsc/cmd/debug.go                | 29 +++++++------
 scripts/dev.sh                    |  3 +-
 15 files changed, 190 insertions(+), 38 deletions(-)

diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go
index 1f78d54a2..e1f2fea60 100644
--- a/pkg/sentry/control/pprof.go
+++ b/pkg/sentry/control/pprof.go
@@ -22,6 +22,7 @@ import (
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/fd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
@@ -56,6 +57,9 @@ type Profile struct {
 
 	// traceFile is the current execution trace output file.
 	traceFile *fd.FD
+
+	// Kernel is the kernel under profile.
+	Kernel *kernel.Kernel
 }
 
 // StartCPUProfile is an RPC stub which starts recording the CPU profile in a
@@ -147,6 +151,9 @@ func (p *Profile) StartTrace(o *ProfileOpts, _ *struct{}) error {
 		return err
 	}
 
+	// Ensure all trace contexts are registered.
+	p.Kernel.RebuildTraceContexts()
+
 	p.traceFile = output
 	return nil
 }
@@ -158,9 +165,15 @@ func (p *Profile) StopTrace(_, _ *struct{}) error {
 	defer p.mu.Unlock()
 
 	if p.traceFile == nil {
-		return errors.New("Execution tracing not start")
+		return errors.New("Execution tracing not started")
 	}
 
+	// Similarly to the case above, if tasks have not ended traces, we will
+	// lose information. Thus we need to rebuild the tasks in order to have
+	// complete information. This will not lose information if multiple
+	// traces are overlapping.
+	p.Kernel.RebuildTraceContexts()
+
 	trace.Stop()
 	p.traceFile.Close()
 	p.traceFile = nil
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 28ba950bd..bd3fb4c03 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -841,9 +841,11 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		AbstractSocketNamespace: args.AbstractSocketNamespace,
 		ContainerID:             args.ContainerID,
 	}
-	if _, err := k.tasks.NewTask(config); err != nil {
+	t, err := k.tasks.NewTask(config)
+	if err != nil {
 		return nil, 0, err
 	}
+	t.traceExecEvent(tc) // Simulate exec for tracing.
 
 	// Success.
 	tgid := k.tasks.Root.IDOfThreadGroup(tg)
@@ -1118,6 +1120,22 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
 	return lastErr
 }
 
+// RebuildTraceContexts rebuilds the trace context for all tasks.
+//
+// Unfortunately, if these are built while tracing is not enabled, then we will
+// not have meaningful trace data. Rebuilding here ensures that we can do so
+// after tracing has been enabled.
+func (k *Kernel) RebuildTraceContexts() {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	k.tasks.mu.RLock()
+	defer k.tasks.mu.RUnlock()
+
+	for t, tid := range k.tasks.Root.tids {
+		t.rebuildTraceContext(tid)
+	}
+}
+
 // FeatureSet returns the FeatureSet.
 func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
 	return k.featureSet
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 220fa73a2..2fdee0282 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -339,6 +339,14 @@ func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
 	return nil
 }
 
+// LookupName looks up a syscall name.
+func (s *SyscallTable) LookupName(sysno uintptr) string {
+	if sc, ok := s.Table[sysno]; ok {
+		return sc.Name
+	}
+	return fmt.Sprintf("sys_%d", sysno) // Unlikely.
+}
+
 // LookupEmulate looks up an emulation syscall number.
 func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
 	sysno, ok := s.Emulate[addr]
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 80c8e5464..ab0c6c4aa 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -15,6 +15,8 @@
 package kernel
 
 import (
+	gocontext "context"
+	"runtime/trace"
 	"sync"
 	"sync/atomic"
 
@@ -390,7 +392,14 @@ type Task struct {
 
 	// logPrefix is a string containing the task's thread ID in the root PID
 	// namespace, and is prepended to log messages emitted by Task.Infof etc.
-	logPrefix atomic.Value `state:".(string)"`
+	logPrefix atomic.Value `state:"nosave"`
+
+	// traceContext and traceTask are both used for tracing, and are
+	// updated along with the logPrefix in updateInfoLocked.
+	//
+	// These are exclusive to the task goroutine.
+	traceContext gocontext.Context `state:"nosave"`
+	traceTask    *trace.Task       `state:"nosave"`
 
 	// creds is the task's credentials.
 	//
@@ -528,14 +537,6 @@ func (t *Task) loadPtraceTracer(tracer *Task) {
 	t.ptraceTracer.Store(tracer)
 }
 
-func (t *Task) saveLogPrefix() string {
-	return t.logPrefix.Load().(string)
-}
-
-func (t *Task) loadLogPrefix(prefix string) {
-	t.logPrefix.Store(prefix)
-}
-
 func (t *Task) saveSyscallFilters() []bpf.Program {
 	if f := t.syscallFilters.Load(); f != nil {
 		return f.([]bpf.Program)
@@ -549,6 +550,7 @@ func (t *Task) loadSyscallFilters(filters []bpf.Program) {
 
 // afterLoad is invoked by stateify.
 func (t *Task) afterLoad() {
+	t.updateInfoLocked()
 	t.interruptChan = make(chan struct{}, 1)
 	t.gosched.State = TaskGoroutineNonexistent
 	if t.stop != nil {
diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go
index dd69939f9..4a4a69ee2 100644
--- a/pkg/sentry/kernel/task_block.go
+++ b/pkg/sentry/kernel/task_block.go
@@ -16,6 +16,7 @@ package kernel
 
 import (
 	"runtime"
+	"runtime/trace"
 	"time"
 
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -133,19 +134,24 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
 		runtime.Gosched()
 	}
 
+	region := trace.StartRegion(t.traceContext, blockRegion)
 	select {
 	case <-C:
+		region.End()
 		t.SleepFinish(true)
+		// Woken by event.
 		return nil
 
 	case <-interrupt:
+		region.End()
 		t.SleepFinish(false)
 		// Return the indicated error on interrupt.
 		return syserror.ErrInterrupted
 
 	case <-timerChan:
-		// We've timed out.
+		region.End()
 		t.SleepFinish(true)
+		// We've timed out.
 		return syserror.ETIMEDOUT
 	}
 }
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 0916fd658..3eadfedb4 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -299,6 +299,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	// nt that it must receive before its task goroutine starts running.
 	tid := nt.k.tasks.Root.IDOfTask(nt)
 	defer nt.Start(tid)
+	t.traceCloneEvent(tid)
 
 	// "If fork/clone and execve are allowed by @prog, any child processes will
 	// be constrained to the same filters and system call ABI as the parent." -
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 17a089b90..90a6190f1 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -129,6 +129,7 @@ type runSyscallAfterExecStop struct {
 }
 
 func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
+	t.traceExecEvent(r.tc)
 	t.tg.pidns.owner.mu.Lock()
 	t.tg.execing = nil
 	if t.killed() {
@@ -253,7 +254,7 @@ func (t *Task) promoteLocked() {
 
 	t.tg.leader = t
 	t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t])
-	t.updateLogPrefixLocked()
+	t.updateInfoLocked()
 	// Reap the original leader. If it has a tracer, detach it instead of
 	// waiting for it to acknowledge the original leader's death.
 	oldLeader.exitParentNotified = true
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index 535f03e50..435761e5a 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -236,6 +236,7 @@ func (*runExit) execute(t *Task) taskRunState {
 type runExitMain struct{}
 
 func (*runExitMain) execute(t *Task) taskRunState {
+	t.traceExitEvent()
 	lastExiter := t.exitThreadGroup()
 
 	// If the task has a cleartid, and the thread group wasn't killed by a
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index a29e9b9eb..0fb3661de 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -16,6 +16,7 @@ package kernel
 
 import (
 	"fmt"
+	"runtime/trace"
 	"sort"
 
 	"gvisor.dev/gvisor/pkg/log"
@@ -127,11 +128,88 @@ func (t *Task) debugDumpStack() {
 	}
 }
 
-// updateLogPrefix updates the task's cached log prefix to reflect its
-// current thread ID.
+// trace definitions.
+//
+// Note that all region names are prefixed by ':' in order to ensure that they
+// are lexically ordered before all system calls, which use the naked system
+// call name (e.g. "read") for maximum clarity.
+const (
+	traceCategory = "task"
+	runRegion     = ":run"
+	blockRegion   = ":block"
+	cpuidRegion   = ":cpuid"
+	faultRegion   = ":fault"
+)
+
+// updateInfoLocked updates the task's cached log prefix and tracing
+// information to reflect its current thread ID.
 //
 // Preconditions: The task's owning TaskSet.mu must be locked.
-func (t *Task) updateLogPrefixLocked() {
+func (t *Task) updateInfoLocked() {
 	// Use the task's TID in the root PID namespace for logging.
-	t.logPrefix.Store(fmt.Sprintf("[% 4d] ", t.tg.pidns.owner.Root.tids[t]))
+	tid := t.tg.pidns.owner.Root.tids[t]
+	t.logPrefix.Store(fmt.Sprintf("[% 4d] ", tid))
+	t.rebuildTraceContext(tid)
+}
+
+// rebuildTraceContext rebuilds the trace context.
+//
+// Precondition: the passed tid must be the tid in the root namespace.
+func (t *Task) rebuildTraceContext(tid ThreadID) {
+	// Re-initialize the trace context.
+	if t.traceTask != nil {
+		t.traceTask.End()
+	}
+
+	// Note that we define the "task type" to be the dynamic TID. This does
+	// not align perfectly with the documentation for "tasks" in the
+	// tracing package. Tasks may be assumed to be bounded by analysis
+	// tools. However, if we just use a generic "task" type here, then the
+	// "user-defined tasks" page on the tracing dashboard becomes nearly
+	// unusable, as it loads all traces from all tasks.
+	//
+	// We can assume that the number of tasks in the system is not
+	// arbitrarily large (in general it won't be, especially for cases
+	// where we're collecting a brief profile), so using the TID is a
+	// reasonable compromise in this case.
+	t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid))
+}
+
+// traceCloneEvent is called when a new task is spawned.
+//
+// ntid must be the new task's ThreadID in the root namespace.
+func (t *Task) traceCloneEvent(ntid ThreadID) {
+	if !trace.IsEnabled() {
+		return
+	}
+	trace.Logf(t.traceContext, traceCategory, "spawn: %d", ntid)
+}
+
+// traceExitEvent is called when a task exits.
+func (t *Task) traceExitEvent() {
+	if !trace.IsEnabled() {
+		return
+	}
+	trace.Logf(t.traceContext, traceCategory, "exit status: 0x%x", t.exitStatus.Status())
+}
+
+// traceExecEvent is called when a task calls exec.
+func (t *Task) traceExecEvent(tc *TaskContext) {
+	if !trace.IsEnabled() {
+		return
+	}
+	d := tc.MemoryManager.Executable()
+	if d == nil {
+		trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
+		return
+	}
+	defer d.DecRef()
+	root := t.fsContext.RootDirectory()
+	if root == nil {
+		trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>")
+		return
+	}
+	defer root.DecRef()
+	n, _ := d.FullName(root)
+	trace.Logf(t.traceContext, traceCategory, "exec: %s", n)
 }
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index c92266c59..d97f8c189 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"bytes"
 	"runtime"
+	"runtime/trace"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -205,9 +206,11 @@ func (*runApp) execute(t *Task) taskRunState {
 		t.tg.pidns.owner.mu.RUnlock()
 	}
 
+	region := trace.StartRegion(t.traceContext, runRegion)
 	t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
 	info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU)
 	t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
+	region.End()
 
 	if clearSinglestep {
 		t.Arch().ClearSingleStep()
@@ -225,6 +228,7 @@ func (*runApp) execute(t *Task) taskRunState {
 
 	case platform.ErrContextSignalCPUID:
 		// Is this a CPUID instruction?
+		region := trace.StartRegion(t.traceContext, cpuidRegion)
 		expected := arch.CPUIDInstruction[:]
 		found := make([]byte, len(expected))
 		_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
@@ -232,10 +236,12 @@ func (*runApp) execute(t *Task) taskRunState {
 			// Skip the cpuid instruction.
 			t.Arch().CPUIDEmulate(t)
 			t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
+			region.End()
 
 			// Resume execution.
 			return (*runApp)(nil)
 		}
+		region.End() // Not an actual CPUID, but required copy-in.
 
 		// The instruction at the given RIP was not a CPUID, and we
 		// fallthrough to the default signal deliver behavior below.
@@ -251,8 +257,10 @@ func (*runApp) execute(t *Task) taskRunState {
 		// an application-generated signal and we should continue execution
 		// normally.
 		if at.Any() {
+			region := trace.StartRegion(t.traceContext, faultRegion)
 			addr := usermem.Addr(info.Addr())
 			err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack()))
+			region.End()
 			if err == nil {
 				// The fault was handled appropriately.
 				// We can resume running the application.
@@ -260,6 +268,12 @@ func (*runApp) execute(t *Task) taskRunState {
 			}
 
 			// Is this a vsyscall that we need emulate?
+			//
+			// Note that we don't track vsyscalls as part of a
+			// specific trace region. This is because regions don't
+			// stack, and the actual system call will count as a
+			// region. We should be able to easily identify
+			// vsyscalls by having a <fault><syscall> pair.
 			if at.Execute {
 				if sysno, ok := t.tc.st.LookupEmulate(addr); ok {
 					return t.doVsyscall(addr, sysno)
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index ae6fc4025..3522a4ae5 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -154,10 +154,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 	// Below this point, newTask is expected not to fail (there is no rollback
 	// of assignTIDsLocked or any of the following).
 
-	// Logging on t's behalf will panic if t.logPrefix hasn't been initialized.
-	// This is the earliest point at which we can do so (since t now has thread
-	// IDs).
-	t.updateLogPrefixLocked()
+	// Logging on t's behalf will panic if t.logPrefix hasn't been
+	// initialized. This is the earliest point at which we can do so
+	// (since t now has thread IDs).
+	t.updateInfoLocked()
 
 	if cfg.InheritParent != nil {
 		t.parent = cfg.InheritParent.parent
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index b543d536a..3180f5560 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"fmt"
 	"os"
+	"runtime/trace"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -160,6 +161,10 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u
 		ctrl = ctrlStopAndReinvokeSyscall
 	} else {
 		fn := s.Lookup(sysno)
+		var region *trace.Region // Only non-nil if tracing == true.
+		if trace.IsEnabled() {
+			region = trace.StartRegion(t.traceContext, s.LookupName(sysno))
+		}
 		if fn != nil {
 			// Call our syscall implementation.
 			rval, ctrl, err = fn(t, args)
@@ -167,6 +172,9 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u
 			// Use the missing function if not found.
 			rval, err = t.SyscallTable().Missing(t, sysno, args)
 		}
+		if region != nil {
+			region.End()
+		}
 	}
 
 	if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index f62be4c59..9c9e94864 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -152,7 +152,9 @@ func newController(fd int, l *Loader) (*controller, error) {
 	srv.Register(&debug{})
 	srv.Register(&control.Logging{})
 	if l.conf.ProfileEnable {
-		srv.Register(&control.Profile{})
+		srv.Register(&control.Profile{
+			Kernel: l.k,
+		})
 	}
 
 	return &controller{
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 7313e473f..38da7ee02 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -32,16 +32,16 @@ import (
 
 // Debug implements subcommands.Command for the "debug" command.
 type Debug struct {
-	pid          int
-	stacks       bool
-	signal       int
-	profileHeap  string
-	profileCPU   string
-	profileDelay int
-	trace        string
-	strace       string
-	logLevel     string
-	logPackets   string
+	pid         int
+	stacks      bool
+	signal      int
+	profileHeap string
+	profileCPU  string
+	trace       string
+	strace      string
+	logLevel    string
+	logPackets  string
+	duration    time.Duration
 }
 
 // Name implements subcommands.Command.
@@ -65,7 +65,7 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
 	f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
 	f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
 	f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
-	f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stoping CPU profile")
+	f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
 	f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.")
 	f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox")
 	f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`)
@@ -163,7 +163,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		if err := c.Sandbox.StartCPUProfile(f); err != nil {
 			return Errorf(err.Error())
 		}
-		log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU)
+		log.Infof("CPU profile started for %v, writing to %q", d.duration, d.profileCPU)
 	}
 	if d.trace != "" {
 		delay = true
@@ -181,8 +181,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		if err := c.Sandbox.StartTrace(f); err != nil {
 			return Errorf(err.Error())
 		}
-		log.Infof("Tracing started for %d sec, writing to %q", d.profileDelay, d.trace)
-
+		log.Infof("Tracing started for %v, writing to %q", d.duration, d.trace)
 	}
 
 	if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 {
@@ -243,7 +242,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	if delay {
-		time.Sleep(time.Duration(d.profileDelay) * time.Second)
+		time.Sleep(d.duration)
 	}
 
 	return subcommands.ExitSuccess
diff --git a/scripts/dev.sh b/scripts/dev.sh
index c67003018..6238b4d0b 100755
--- a/scripts/dev.sh
+++ b/scripts/dev.sh
@@ -54,9 +54,10 @@ declare OUTPUT="$(build //runsc)"
 if [[ ${REFRESH} -eq 0 ]]; then
   install_runsc "${RUNTIME}"   --net-raw
   install_runsc "${RUNTIME}-d" --net-raw --debug --strace --log-packets
+  install_runsc "${RUNTIME}-p" --net-raw --profile
 
   echo
-  echo "Runtimes ${RUNTIME} and ${RUNTIME}-d (debug enabled) setup."
+  echo "Runtimes ${RUNTIME}, ${RUNTIME}-d (debug enabled), and ${RUNTIME}-p installed."
   echo "Use --runtime="${RUNTIME}" with your Docker command."
   echo "  docker run --rm --runtime="${RUNTIME}" hello-world"
   echo
-- 
cgit v1.2.3


From b1d44be7ad893bd6bdfd164a54a7142f4462414b Mon Sep 17 00:00:00 2001
From: Mithun Iyer <iyerm@google.com>
Date: Fri, 6 Dec 2019 17:15:52 -0800
Subject: Add TCP stats for connection close and keep-alive timeouts.

Fix bugs in updates to TCP CurrentEstablished stat.

Fixes #1277

PiperOrigin-RevId: 284292459
---
 pkg/sentry/socket/netstack/netstack.go |  2 ++
 pkg/tcpip/tcpip.go                     |  8 ++++++
 pkg/tcpip/transport/tcp/connect.go     |  5 ++--
 pkg/tcpip/transport/tcp/snd.go         |  1 -
 pkg/tcpip/transport/tcp/tcp_test.go    | 46 ++++++++++++++++++++++++++++++++++
 5 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index d92399efd..fe5a46aa3 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -151,6 +151,8 @@ var Metrics = tcpip.Stats{
 		PassiveConnectionOpenings:          mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
 		CurrentEstablished:                 mustCreateMetric("/netstack/tcp/current_established", "Number of connections in either ESTABLISHED or CLOSE-WAIT state now."),
 		EstablishedResets:                  mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"),
+		EstablishedClosed:                  mustCreateMetric("/netstack/tcp/established_closed", "number of times established TCP connections made a transition to CLOSED state."),
+		EstablishedTimedout:                mustCreateMetric("/netstack/tcp/established_timedout", "Number of times  an established connection was reset because of keep-alive time out."),
 		ListenOverflowSynDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."),
 		ListenOverflowAckDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."),
 		ListenOverflowSynCookieSent:        mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."),
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 5746043cc..d5bb5b6ed 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -924,6 +924,14 @@ type TCPStats struct {
 	// ESTABLISHED state or the CLOSE-WAIT state.
 	EstablishedResets *StatCounter
 
+	// EstablishedClosed is the number of times established TCP connections
+	// made a transition to CLOSED state.
+	EstablishedClosed *StatCounter
+
+	// EstablishedTimedout is the number of times an established connection
+	// was reset because of keep-alive time out.
+	EstablishedTimedout *StatCounter
+
 	// ListenOverflowSynDrop is the number of times the listen queue overflowed
 	// and a SYN was dropped.
 	ListenOverflowSynDrop *StatCounter
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 2975a1c3c..3d059c302 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -924,6 +924,7 @@ func (e *endpoint) transitionToStateCloseLocked() {
 	}
 	e.cleanupLocked()
 	e.state = StateClose
+	e.stack.Stats().TCP.EstablishedClosed.Increment()
 }
 
 // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed
@@ -1094,6 +1095,7 @@ func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
 
 	if e.keepalive.unacked >= e.keepalive.count {
 		e.keepalive.Unlock()
+		e.stack.Stats().TCP.EstablishedTimedout.Increment()
 		return tcpip.ErrTimeout
 	}
 
@@ -1179,8 +1181,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			e.lastErrorMu.Unlock()
 
 			e.mu.Lock()
-			e.stack.Stats().TCP.EstablishedResets.Increment()
-			e.stack.Stats().TCP.CurrentEstablished.Decrement()
 			e.state = StateError
 			e.HardError = err
 
@@ -1389,7 +1389,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	// Mark endpoint as closed.
 	e.mu.Lock()
 	if e.state != StateError {
-		e.stack.Stats().TCP.EstablishedResets.Increment()
 		e.stack.Stats().TCP.CurrentEstablished.Decrement()
 		e.transitionToStateCloseLocked()
 	}
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index d3f7c9125..8332a0179 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -674,7 +674,6 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 		default:
 			s.ep.state = StateFinWait1
 		}
-		s.ep.stack.Stats().TCP.CurrentEstablished.Decrement()
 		s.ep.mu.Unlock()
 	} else {
 		// We're sending a non-FIN segment.
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 52c2fa7e3..bc5cfcf0e 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -75,6 +75,20 @@ func TestGiveUpConnect(t *testing.T) {
 	if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != tcpip.ErrAborted {
 		t.Fatalf("got ep.GetSockOpt(tcpip.ErrorOption{}) = %v, want = %v", err, tcpip.ErrAborted)
 	}
+
+	// Call Connect again to retreive the handshake failure status
+	// and stats updates.
+	if err := ep.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrAborted {
+		t.Fatalf("got ep.Connect(...) = %v, want = %v", err, tcpip.ErrAborted)
+	}
+
+	if got := c.Stack().Stats().TCP.FailedConnectionAttempts.Value(); got != 1 {
+		t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %v, want = 1", got)
+	}
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+	}
 }
 
 func TestConnectIncrementActiveConnection(t *testing.T) {
@@ -548,6 +562,14 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
 	}
 
+	if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != 1 {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %v, want = 1", got)
+	}
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+	}
+
 	// Check if the endpoint was moved to CLOSED and netstack a reset in
 	// response to the ACK packet that we sent after last-ACK.
 	checker.IPv4(t, c.GetPacket(),
@@ -2694,6 +2716,13 @@ loop:
 	if tcp.EndpointState(c.EP.State()) != tcp.StateError {
 		t.Fatalf("got EP state is not StateError")
 	}
+
+	if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 {
+		t.Errorf("got stats.TCP.EstablishedResets.Value() = %v, want = 1", got)
+	}
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+	}
 }
 
 func TestSendOnResetConnection(t *testing.T) {
@@ -4363,9 +4392,17 @@ func TestKeepalive(t *testing.T) {
 		),
 	)
 
+	if got := c.Stack().Stats().TCP.EstablishedTimedout.Value(); got != 1 {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout.Value() = %v, want = 1", got)
+	}
+
 	if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
 		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout)
 	}
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+	}
 }
 
 func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCookieInUse bool) (irs, iss seqnum.Value) {
@@ -5992,6 +6029,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err)
 	}
 
+	want := c.Stack().Stats().TCP.EstablishedClosed.Value() + 1
+
 	wq := &waiter.Queue{}
 	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
 	if err != nil {
@@ -6120,6 +6159,13 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 		checker.SeqNum(uint32(ackHeaders.AckNum)),
 		checker.AckNum(uint32(ackHeaders.SeqNum)),
 		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+
+	if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != want {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %v, want = %v", got, want)
+	}
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+	}
 }
 
 func TestTCPCloseWithData(t *testing.T) {
-- 
cgit v1.2.3


From 3c2e2f7d12285e6093ecc225e0379fe59e8fd93f Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 6 Dec 2019 20:11:51 -0800
Subject: Update Kokoro image to install Golang 1.13

PiperOrigin-RevId: 284308422
---
 kokoro/ubuntu1604/10_core.sh |  4 ++--
 kokoro/ubuntu1604/README.md  | 34 ++++++++++++++++++++++++++++++++++
 scripts/go.sh                |  2 ++
 3 files changed, 38 insertions(+), 2 deletions(-)
 create mode 100644 kokoro/ubuntu1604/README.md

diff --git a/kokoro/ubuntu1604/10_core.sh b/kokoro/ubuntu1604/10_core.sh
index e87a6eee8..46dda6bb1 100755
--- a/kokoro/ubuntu1604/10_core.sh
+++ b/kokoro/ubuntu1604/10_core.sh
@@ -21,8 +21,8 @@ apt-get update && apt-get -y install make git-core build-essential linux-headers
 
 # Install a recent go toolchain.
 if ! [[ -d /usr/local/go ]]; then
-    wget https://dl.google.com/go/go1.12.linux-amd64.tar.gz
-    tar -xvf go1.12.linux-amd64.tar.gz
+    wget https://dl.google.com/go/go1.13.5.linux-amd64.tar.gz
+    tar -xvf go1.13.5.linux-amd64.tar.gz
     mv go /usr/local
 fi
 
diff --git a/kokoro/ubuntu1604/README.md b/kokoro/ubuntu1604/README.md
new file mode 100644
index 000000000..64f913b9a
--- /dev/null
+++ b/kokoro/ubuntu1604/README.md
@@ -0,0 +1,34 @@
+## Image Update
+
+After making changes to files in the directory, you must run the following
+commands to update the image Kokoro uses:
+
+```shell
+gcloud config set project gvisor-kokoro-testing
+third_party/gvisor/kokoro/ubuntu1604/build.sh
+third_party/gvisor/kokoro/ubuntu1804/build.sh
+```
+
+Note: the command above will change your default project for `gcloud`. Run
+`gcloud config set project` again to revert back to your default project.
+
+Note: Files in `third_party/gvisor/kokoro/ubuntu1804/` as symlinks to
+`ubuntu1604`, therefore both images must be updated.
+
+After the script finishes, the last few lines of the output will container the
+image name. If the output was lost, you can run `build.sh` again to print the
+image name.
+
+```
+NAME                    PROJECT                FAMILY  DEPRECATED  STATUS
+image-6777fa4666a968c8  gvisor-kokoro-testing                      READY
++ cleanup
++ gcloud compute instances delete --quiet build-tlfrdv
+Deleted [https://www.googleapis.com/compute/v1/projects/gvisor-kokoro-testing/zones/us-central1-f/instances/build-tlfrdv].
+```
+
+To setup Kokoro to use the new image, copy the image names to their
+corresponding file below:
+
+*   //devtools/kokoro/config/gcp/gvisor/ubuntu1604.gcl
+*   //devtools/kokoro/config/gcp/gvisor/ubuntu1804.gcl
diff --git a/scripts/go.sh b/scripts/go.sh
index 0dbfb7747..626ed8fa4 100755
--- a/scripts/go.sh
+++ b/scripts/go.sh
@@ -25,6 +25,8 @@ tools/go_branch.sh
 # Checkout the new branch.
 git checkout go && git clean -f
 
+go version
+
 # Build everything.
 go build ./...
 
-- 
cgit v1.2.3


From 01eadf51ea54b8f478c49b755d712f11fff2b28c Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 6 Dec 2019 23:08:39 -0800
Subject: Bump up Go 1.13 as minimum requirement

PiperOrigin-RevId: 284320186
---
 pkg/sentry/sighandling/sighandling.go            | 75 +++++-------------------
 pkg/sentry/sighandling/sighandling_unsafe.go     | 26 --------
 pkg/syncutil/BUILD                               |  2 -
 pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go | 21 -------
 pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go | 16 -----
 pkg/syncutil/downgradable_rwmutex_unsafe.go      |  5 +-
 runsc/boot/loader.go                             | 50 +++++++---------
 7 files changed, 41 insertions(+), 154 deletions(-)
 delete mode 100644 pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go
 delete mode 100644 pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go

diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go
index 2f65db70b..ba1f9043d 100644
--- a/pkg/sentry/sighandling/sighandling.go
+++ b/pkg/sentry/sighandling/sighandling.go
@@ -16,7 +16,6 @@
 package sighandling
 
 import (
-	"fmt"
 	"os"
 	"os/signal"
 	"reflect"
@@ -31,37 +30,25 @@ const numSignals = 32
 // handleSignals listens for incoming signals and calls the given handler
 // function.
 //
-// It starts when the start channel is closed, stops when the stop channel
-// is closed, and closes done once it will no longer deliver signals to k.
-func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start, stop, done chan struct{}) {
+// It stops when the stop channel is closed. The done channel is closed once it
+// will no longer deliver signals to k.
+func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), stop, done chan struct{}) {
 	// Build a select case.
-	sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(start)}}
+	sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)}}
 	for _, sigchan := range sigchans {
 		sc = append(sc, reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(sigchan)})
 	}
 
-	started := false
 	for {
 		// Wait for a notification.
 		index, _, ok := reflect.Select(sc)
 
-		// Was it the start / stop channel?
+		// Was it the stop channel?
 		if index == 0 {
 			if !ok {
-				if !started {
-					// start channel; start forwarding and
-					// swap this case for the stop channel
-					// to select stop requests.
-					started = true
-					sc[0] = reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)}
-				} else {
-					// stop channel; stop forwarding and
-					// clear this case so it is never
-					// selected again.
-					started = false
-					close(done)
-					sc[0].Chan = reflect.Value{}
-				}
+				// Stop forwarding and notify that it's done.
+				close(done)
+				return
 			}
 			continue
 		}
@@ -73,44 +60,17 @@ func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start,
 
 		// Otherwise, it was a signal on channel N. Index 0 represents the stop
 		// channel, so index N represents the channel for signal N.
-		signal := linux.Signal(index)
-
-		if !started {
-			// Kernel cannot receive signals, either because it is
-			// not ready yet or is shutting down.
-			//
-			// Kill ourselves if this signal would have killed the
-			// process before PrepareForwarding was called. i.e., all
-			// _SigKill signals; see Go
-			// src/runtime/sigtab_linux_generic.go.
-			//
-			// Otherwise ignore the signal.
-			//
-			// TODO(b/114489875): Drop in Go 1.12, which uses tgkill
-			// in runtime.raise.
-			switch signal {
-			case linux.SIGHUP, linux.SIGINT, linux.SIGTERM:
-				dieFromSignal(signal)
-				panic(fmt.Sprintf("Failed to die from signal %d", signal))
-			default:
-				continue
-			}
-		}
-
-		// Pass the signal to the handler.
-		handler(signal)
+		handler(linux.Signal(index))
 	}
 }
 
-// PrepareHandler ensures that synchronous signals are passed to the given
-// handler function and returns a callback that starts signal delivery, which
-// itself returns a callback that stops signal handling.
+// StartSignalForwarding ensures that synchronous signals are passed to the
+// given handler function and returns a callback that stops signal delivery.
 //
 // Note that this function permanently takes over signal handling. After the
 // stop callback, signals revert to the default Go runtime behavior, which
 // cannot be overridden with external calls to signal.Notify.
-func PrepareHandler(handler func(linux.Signal)) func() func() {
-	start := make(chan struct{})
+func StartSignalForwarding(handler func(linux.Signal)) func() {
 	stop := make(chan struct{})
 	done := make(chan struct{})
 
@@ -128,13 +88,10 @@ func PrepareHandler(handler func(linux.Signal)) func() func() {
 		signal.Notify(sigchan, syscall.Signal(sig))
 	}
 	// Start up our listener.
-	go handleSignals(sigchans, handler, start, stop, done) // S/R-SAFE: synchronized by Kernel.extMu.
+	go handleSignals(sigchans, handler, stop, done) // S/R-SAFE: synchronized by Kernel.extMu.
 
-	return func() func() {
-		close(start)
-		return func() {
-			close(stop)
-			<-done
-		}
+	return func() {
+		close(stop)
+		<-done
 	}
 }
diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go
index c303435d5..1ebe22d34 100644
--- a/pkg/sentry/sighandling/sighandling_unsafe.go
+++ b/pkg/sentry/sighandling/sighandling_unsafe.go
@@ -15,8 +15,6 @@
 package sighandling
 
 import (
-	"fmt"
-	"runtime"
 	"syscall"
 	"unsafe"
 
@@ -48,27 +46,3 @@ func IgnoreChildStop() error {
 
 	return nil
 }
-
-// dieFromSignal kills the current process with sig.
-//
-// Preconditions: The default action of sig is termination.
-func dieFromSignal(sig linux.Signal) {
-	runtime.LockOSThread()
-	defer runtime.UnlockOSThread()
-
-	sa := sigaction{handler: linux.SIG_DFL}
-	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, linux.SignalSetSize, 0, 0); e != 0 {
-		panic(fmt.Sprintf("rt_sigaction failed: %v", e))
-	}
-
-	set := linux.MakeSignalSet(sig)
-	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_UNBLOCK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0); e != 0 {
-		panic(fmt.Sprintf("rt_sigprocmask failed: %v", e))
-	}
-
-	if err := syscall.Tgkill(syscall.Getpid(), syscall.Gettid(), syscall.Signal(sig)); err != nil {
-		panic(fmt.Sprintf("tgkill failed: %v", err))
-	}
-
-	panic("failed to die")
-}
diff --git a/pkg/syncutil/BUILD b/pkg/syncutil/BUILD
index b06a90bef..cb1f41628 100644
--- a/pkg/syncutil/BUILD
+++ b/pkg/syncutil/BUILD
@@ -31,8 +31,6 @@ go_template(
 go_library(
     name = "syncutil",
     srcs = [
-        "downgradable_rwmutex_1_12_unsafe.go",
-        "downgradable_rwmutex_1_13_unsafe.go",
         "downgradable_rwmutex_unsafe.go",
         "memmove_unsafe.go",
         "norace_unsafe.go",
diff --git a/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go
deleted file mode 100644
index 7c6336e62..000000000
--- a/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.12
-// +build !go1.13
-
-// TODO(b/133868570): Delete once Go 1.12 is no longer supported.
-
-package syncutil
-
-import _ "unsafe"
-
-//go:linkname runtimeSemrelease112 sync.runtime_Semrelease
-func runtimeSemrelease112(s *uint32, handoff bool)
-
-func runtimeSemrelease(s *uint32, handoff bool, skipframes int) {
-	// 'skipframes' is only available starting from 1.13.
-	runtimeSemrelease112(s, handoff)
-}
diff --git a/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go
deleted file mode 100644
index 3c3673119..000000000
--- a/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.13
-// +build !go1.15
-
-// Check go:linkname function signatures when updating Go version.
-
-package syncutil
-
-import _ "unsafe"
-
-//go:linkname runtimeSemrelease sync.runtime_Semrelease
-func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
diff --git a/pkg/syncutil/downgradable_rwmutex_unsafe.go b/pkg/syncutil/downgradable_rwmutex_unsafe.go
index 07feca402..51e11555d 100644
--- a/pkg/syncutil/downgradable_rwmutex_unsafe.go
+++ b/pkg/syncutil/downgradable_rwmutex_unsafe.go
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build go1.12
+// +build go1.13
 // +build !go1.15
 
 // Check go:linkname function signatures when updating Go version.
@@ -27,6 +27,9 @@ import (
 //go:linkname runtimeSemacquire sync.runtime_Semacquire
 func runtimeSemacquire(s *uint32)
 
+//go:linkname runtimeSemrelease sync.runtime_Semrelease
+func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
+
 // DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock
 // method.
 type DowngradableRWMutex struct {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index df6052c88..bc1d0c1bb 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -93,10 +93,6 @@ type Loader struct {
 	// spec is the base configuration for the root container.
 	spec *specs.Spec
 
-	// startSignalForwarding enables forwarding of signals to the sandboxed
-	// container. It should be called after the init process is loaded.
-	startSignalForwarding func() func()
-
 	// stopSignalForwarding disables forwarding of signals to the sandboxed
 	// container. It should be called when a sandbox is destroyed.
 	stopSignalForwarding func()
@@ -336,29 +332,6 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("ignore child stop signals failed: %v", err)
 	}
 
-	// Handle signals by forwarding them to the root container process
-	// (except for panic signal, which should cause a panic).
-	l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) {
-		// Panic signal should cause a panic.
-		if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) {
-			panic("Signal-induced panic")
-		}
-
-		// Otherwise forward to root container.
-		deliveryMode := DeliverToProcess
-		if args.Console {
-			// Since we are running with a console, we should
-			// forward the signal to the foreground process group
-			// so that job control signals like ^C can be handled
-			// properly.
-			deliveryMode = DeliverToForegroundProcessGroup
-		}
-		log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
-		if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil {
-			log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err)
-		}
-	})
-
 	// Create the control server using the provided FD.
 	//
 	// This must be done *after* we have initialized the kernel since the
@@ -566,8 +539,27 @@ func (l *Loader) run() error {
 		ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup())
 	}
 
-	// Start signal forwarding only after an init process is created.
-	l.stopSignalForwarding = l.startSignalForwarding()
+	// Handle signals by forwarding them to the root container process
+	// (except for panic signal, which should cause a panic).
+	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
+		// Panic signal should cause a panic.
+		if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
+			panic("Signal-induced panic")
+		}
+
+		// Otherwise forward to root container.
+		deliveryMode := DeliverToProcess
+		if l.console {
+			// Since we are running with a console, we should forward the signal to
+			// the foreground process group so that job control signals like ^C can
+			// be handled properly.
+			deliveryMode = DeliverToForegroundProcessGroup
+		}
+		log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
+		if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
+			log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err)
+		}
+	})
 
 	log.Infof("Process should have started...")
 	l.watchdog.Start()
-- 
cgit v1.2.3


From 498595d54347d711dbd24247ed12c659b9d89c58 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 9 Dec 2019 11:21:01 -0800
Subject: Add tests for rseq(2)

Add a decent set of syscall tests for rseq(2). These are a bit awkward because
of issues with library integration. libc may register rseq on thread start
(including before main on the initial thread), precluding much testing. Thus we
run tests in a libc-free subprocess.

Support for rseq(2) in gVisor will come in a later commit.

PiperOrigin-RevId: 284595994
---
 test/syscalls/BUILD                 |   2 +
 test/syscalls/linux/BUILD           |  16 ++
 test/syscalls/linux/rseq.cc         | 198 +++++++++++++++++++
 test/syscalls/linux/rseq/BUILD      |  59 ++++++
 test/syscalls/linux/rseq/critical.S |  66 +++++++
 test/syscalls/linux/rseq/critical.h |  39 ++++
 test/syscalls/linux/rseq/rseq.cc    | 366 ++++++++++++++++++++++++++++++++++++
 test/syscalls/linux/rseq/start.S    |  45 +++++
 test/syscalls/linux/rseq/syscalls.h |  66 +++++++
 test/syscalls/linux/rseq/test.h     |  43 +++++
 test/syscalls/linux/rseq/types.h    |  31 +++
 test/syscalls/linux/rseq/uapi.h     |  54 ++++++
 12 files changed, 985 insertions(+)
 create mode 100644 test/syscalls/linux/rseq.cc
 create mode 100644 test/syscalls/linux/rseq/BUILD
 create mode 100644 test/syscalls/linux/rseq/critical.S
 create mode 100644 test/syscalls/linux/rseq/critical.h
 create mode 100644 test/syscalls/linux/rseq/rseq.cc
 create mode 100644 test/syscalls/linux/rseq/start.S
 create mode 100644 test/syscalls/linux/rseq/syscalls.h
 create mode 100644 test/syscalls/linux/rseq/test.h
 create mode 100644 test/syscalls/linux/rseq/types.h
 create mode 100644 test/syscalls/linux/rseq/uapi.h

diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 6650984fa..829693e8e 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -376,6 +376,8 @@ syscall_test(
 
 syscall_test(test = "//test/syscalls/linux:rlimits_test")
 
+syscall_test(test = "//test/syscalls/linux:rseq_test")
+
 syscall_test(test = "//test/syscalls/linux:rtsignal_test")
 
 syscall_test(test = "//test/syscalls/linux:sched_test")
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 61f310db9..c49445d62 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1852,6 +1852,22 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "rseq_test",
+    testonly = 1,
+    srcs = ["rseq.cc"],
+    data = ["//test/syscalls/linux/rseq"],
+    linkstatic = 1,
+    deps = [
+        "//test/syscalls/linux/rseq:lib",
+        "//test/util:logging",
+        "//test/util:multiprocess_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_binary(
     name = "rtsignal_test",
     testonly = 1,
diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc
new file mode 100644
index 000000000..106c045e3
--- /dev/null
+++ b/test/syscalls/linux/rseq.cc
@@ -0,0 +1,198 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/rseq/test.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+#include "test/util/logging.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Syscall test for rseq (restartable sequences).
+//
+// We must be very careful about how these tests are written. Each thread may
+// only have one struct rseq registration, which may be done automatically at
+// thread start (as of 2019-11-13, glibc does *not* support rseq and thus does
+// not do so).
+//
+// Testing of rseq is thus done primarily in a child process with no
+// registration. This means exec'ing a nostdlib binary, as rseq registration can
+// only be cleared by execve (or knowing the old rseq address), and glibc (based
+// on the current unmerged patches) register rseq before calling main()).
+
+int RSeq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
+  return syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
+}
+
+// Returns true if this kernel supports the rseq syscall.
+PosixErrorOr<bool> RSeqSupported() {
+  // We have to be careful here, there are three possible cases:
+  //
+  // 1. rseq is not supported -> ENOSYS
+  // 2. rseq is supported and not registered -> success, but we should
+  //    unregister.
+  // 3. rseq is supported and registered -> EINVAL (most likely).
+
+  // The only validation done on new registrations is that rseq is aligned and
+  // writable.
+  rseq rseq = {};
+  int ret = RSeq(&rseq, sizeof(rseq), 0, 0);
+  if (ret == 0) {
+    // Successfully registered, rseq is supported. Unregister.
+    ret = RSeq(&rseq, sizeof(rseq), kRseqFlagUnregister, 0);
+    if (ret != 0) {
+      return PosixError(errno);
+    }
+    return true;
+  }
+
+  switch (errno) {
+    case ENOSYS:
+      // Not supported.
+      return false;
+    case EINVAL:
+      // Supported, but already registered. EINVAL returned because we provided
+      // a different address.
+      return true;
+    default:
+      // Unknown error.
+      return PosixError(errno);
+  }
+}
+
+constexpr char kRseqBinary[] = "test/syscalls/linux/rseq/rseq";
+
+void RunChildTest(std::string test_case, int want_status) {
+  std::string path = RunfilePath(kRseqBinary);
+
+  pid_t child_pid = -1;
+  int execve_errno = 0;
+  auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+      ForkAndExec(path, {path, test_case}, {}, &child_pid, &execve_errno));
+
+  ASSERT_GT(child_pid, 0);
+  ASSERT_EQ(execve_errno, 0);
+
+  int status = 0;
+  ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+  ASSERT_EQ(status, want_status);
+}
+
+// Test that rseq must be aligned.
+TEST(RseqTest, Unaligned) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestUnaligned, 0);
+}
+
+// Sanity test that registration works.
+TEST(RseqTest, Register) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestRegister, 0);
+}
+
+// Registration can't be done twice.
+TEST(RseqTest, DoubleRegister) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestDoubleRegister, 0);
+}
+
+// Registration can be done again after unregister.
+TEST(RseqTest, RegisterUnregister) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestRegisterUnregister, 0);
+}
+
+// The pointer to rseq must match on register/unregister.
+TEST(RseqTest, UnregisterDifferentPtr) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestUnregisterDifferentPtr, 0);
+}
+
+// The signature must match on register/unregister.
+TEST(RseqTest, UnregisterDifferentSignature) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestUnregisterDifferentSignature, 0);
+}
+
+// The CPU ID is initialized.
+TEST(RseqTest, CPU) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestCPU, 0);
+}
+
+// Critical section is eventually aborted.
+TEST(RseqTest, Abort) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbort, 0);
+}
+
+// Abort may be before the critical section.
+TEST(RseqTest, AbortBefore) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbortBefore, 0);
+}
+
+// Signature must match.
+TEST(RseqTest, AbortSignature) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbortSignature, SIGSEGV);
+}
+
+// Abort must not be in the critical section.
+TEST(RseqTest, AbortPreCommit) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbortPreCommit, SIGSEGV);
+}
+
+// rseq.rseq_cs is cleared on abort.
+TEST(RseqTest, AbortClearsCS) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbortClearsCS, 0);
+}
+
+// rseq.rseq_cs is cleared on abort outside of critical section.
+TEST(RseqTest, InvalidAbortClearsCS) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestInvalidAbortClearsCS, 0);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/rseq/BUILD b/test/syscalls/linux/rseq/BUILD
new file mode 100644
index 000000000..5cfe4e56f
--- /dev/null
+++ b/test/syscalls/linux/rseq/BUILD
@@ -0,0 +1,59 @@
+# This package contains a standalone rseq test binary. This binary must not
+# depend on libc, which might use rseq itself.
+
+load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", "cc_flags_supplier")
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
+package(licenses = ["notice"])
+
+genrule(
+    name = "rseq_binary",
+    srcs = [
+        "critical.h",
+        "critical.S",
+        "rseq.cc",
+        "syscalls.h",
+        "start.S",
+        "test.h",
+        "types.h",
+        "uapi.h",
+    ],
+    outs = ["rseq"],
+    cmd = " ".join([
+        "$(CC)",
+        "$(CC_FLAGS) ",
+        "-I.",
+        "-Wall",
+        "-Werror",
+        "-O2",
+        "-std=c++17",
+        "-static",
+        "-nostdlib",
+        "-ffreestanding",
+        "-o",
+        "$(location rseq)",
+        "$(location critical.S)",
+        "$(location rseq.cc)",
+        "$(location start.S)",
+    ]),
+    toolchains = [
+        ":no_pie_cc_flags",
+        "@bazel_tools//tools/cpp:current_cc_toolchain",
+    ],
+    visibility = ["//:sandbox"],
+)
+
+cc_flags_supplier(
+    name = "no_pie_cc_flags",
+    features = ["-pie"],
+)
+
+cc_library(
+    name = "lib",
+    testonly = 1,
+    hdrs = [
+        "test.h",
+        "uapi.h",
+    ],
+    visibility = ["//:sandbox"],
+)
diff --git a/test/syscalls/linux/rseq/critical.S b/test/syscalls/linux/rseq/critical.S
new file mode 100644
index 000000000..8c0687e6d
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical.S
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Restartable sequences critical sections.
+
+// Loops continuously until aborted.
+//
+// void rseq_loop(struct rseq* r, struct rseq_cs* cs)
+
+  .text
+  .globl  rseq_loop
+  .type   rseq_loop, @function
+
+rseq_loop:
+  jmp begin
+
+  // Abort block before the critical section.
+  // Abort signature is 4 nops for simplicity.
+  .byte 0x90, 0x90, 0x90, 0x90
+  .globl  rseq_loop_early_abort
+rseq_loop_early_abort:
+  ret
+
+begin:
+  // r->rseq_cs = cs
+  movq %rsi, 8(%rdi)
+
+  // N.B. rseq_cs will be cleared by any preempt, even outside the critical
+  // section. Thus it must be set in or immediately before the critical section
+  // to ensure it is not cleared before the section begins.
+  .globl  rseq_loop_start
+rseq_loop_start:
+  jmp rseq_loop_start
+
+  // "Pre-commit": extra instructions inside the critical section.  These are
+  // used as the abort point in TestAbortPreCommit, which is not valid.
+  .globl  rseq_loop_pre_commit
+rseq_loop_pre_commit:
+  // Extra abort signature + nop for TestAbortPostCommit.
+  .byte 0x90, 0x90, 0x90, 0x90
+  nop
+
+  // "Post-commit": never reached in this case.
+  .globl  rseq_loop_post_commit
+rseq_loop_post_commit:
+
+  // Abort signature is 4 nops for simplicity.
+  .byte 0x90, 0x90, 0x90, 0x90
+
+  .globl  rseq_loop_abort
+rseq_loop_abort:
+  ret
+
+  .size  rseq_loop,.-rseq_loop
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/critical.h b/test/syscalls/linux/rseq/critical.h
new file mode 100644
index 000000000..ac987a25e
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical.h
@@ -0,0 +1,39 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
+
+#include "test/syscalls/linux/rseq/types.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+
+constexpr uint32_t kRseqSignature = 0x90909090;
+
+extern "C" {
+
+extern void rseq_loop(struct rseq* r, struct rseq_cs* cs);
+extern void* rseq_loop_early_abort;
+extern void* rseq_loop_start;
+extern void* rseq_loop_pre_commit;
+extern void* rseq_loop_post_commit;
+extern void* rseq_loop_abort;
+
+extern int rseq_getpid(struct rseq* r, struct rseq_cs* cs);
+extern void* rseq_getpid_start;
+extern void* rseq_getpid_post_commit;
+extern void* rseq_getpid_abort;
+
+}  // extern "C"
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
diff --git a/test/syscalls/linux/rseq/rseq.cc b/test/syscalls/linux/rseq/rseq.cc
new file mode 100644
index 000000000..f036db26d
--- /dev/null
+++ b/test/syscalls/linux/rseq/rseq.cc
@@ -0,0 +1,366 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/rseq/critical.h"
+#include "test/syscalls/linux/rseq/syscalls.h"
+#include "test/syscalls/linux/rseq/test.h"
+#include "test/syscalls/linux/rseq/types.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+
+namespace gvisor {
+namespace testing {
+
+extern "C" int main(int argc, char** argv, char** envp);
+
+// Standalone initialization before calling main().
+extern "C" void __init(uintptr_t* sp) {
+  int argc = sp[0];
+  char** argv = reinterpret_cast<char**>(&sp[1]);
+  char** envp = &argv[argc + 1];
+
+  // Call main() and exit.
+  sys_exit_group(main(argc, argv, envp));
+
+  // sys_exit_group does not return
+}
+
+int strcmp(const char* s1, const char* s2) {
+  const unsigned char* p1 = reinterpret_cast<const unsigned char*>(s1);
+  const unsigned char* p2 = reinterpret_cast<const unsigned char*>(s2);
+
+  while (*p1 == *p2) {
+    if (!*p1) {
+      return 0;
+    }
+    ++p1;
+    ++p2;
+  }
+  return static_cast<int>(*p1) - static_cast<int>(*p2);
+}
+
+int sys_rseq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
+  return raw_syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
+}
+
+// Test that rseq must be aligned.
+int TestUnaligned() {
+  constexpr uintptr_t kRequiredAlignment = alignof(rseq);
+
+  char buf[2 * kRequiredAlignment] = {};
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(&buf[0]);
+  if ((ptr & (kRequiredAlignment - 1)) == 0) {
+    // buf is already aligned. Misalign it.
+    ptr++;
+  }
+
+  int ret = sys_rseq(reinterpret_cast<rseq*>(ptr), sizeof(rseq), 0, 0);
+  if (sys_errno(ret) != EINVAL) {
+    return 1;
+  }
+  return 0;
+}
+
+// Sanity test that registration works.
+int TestRegister() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+  return 0;
+};
+
+// Registration can't be done twice.
+int TestDoubleRegister() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != EBUSY) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// Registration can be done again after unregister.
+int TestRegisterUnregister() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, 0);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// The pointer to rseq must match on register/unregister.
+int TestUnregisterDifferentPtr() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq r2 = {};
+  if (int ret = sys_rseq(&r2, sizeof(r2), kRseqFlagUnregister, 0);
+      sys_errno(ret) != EINVAL) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// The signature must match on register/unregister.
+int TestUnregisterDifferentSignature() {
+  constexpr int kSignature = 0;
+
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kSignature); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, kSignature + 1);
+      sys_errno(ret) != EPERM) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// The CPU ID is initialized.
+int TestCPU() {
+  struct rseq r = {};
+  r.cpu_id = kRseqCPUIDUninitialized;
+
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (__atomic_load_n(&r.cpu_id, __ATOMIC_RELAXED) < 0) {
+    return 1;
+  }
+  if (__atomic_load_n(&r.cpu_id_start, __ATOMIC_RELAXED) < 0) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// Critical section is eventually aborted.
+int TestAbort() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+  // Loops until abort. If this returns then abort occurred.
+  rseq_loop(&r, &cs);
+
+  return 0;
+};
+
+// Abort may be before the critical section.
+int TestAbortBefore() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_early_abort);
+
+  // Loops until abort. If this returns then abort occurred.
+  rseq_loop(&r, &cs);
+
+  return 0;
+};
+
+// Signature must match.
+int TestAbortSignature() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+  // Loops until abort. This should SIGSEGV on abort.
+  rseq_loop(&r, &cs);
+
+  return 1;
+};
+
+// Abort must not be in the critical section.
+int TestAbortPreCommit() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_pre_commit);
+
+  // Loops until abort. This should SIGSEGV on abort.
+  rseq_loop(&r, &cs);
+
+  return 1;
+};
+
+// rseq.rseq_cs is cleared on abort.
+int TestAbortClearsCS() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+  // Loops until abort. If this returns then abort occurred.
+  rseq_loop(&r, &cs);
+
+  if (__atomic_load_n(&r.rseq_cs, __ATOMIC_RELAXED)) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// rseq.rseq_cs is cleared on abort outside of critical section.
+int TestInvalidAbortClearsCS() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+  __atomic_store_n(&r.rseq_cs, &cs, __ATOMIC_RELAXED);
+
+  // When the next abort condition occurs, the kernel will clear cs once it
+  // determines we aren't in the critical section.
+  while (1) {
+    if (!__atomic_load_n(&r.rseq_cs, __ATOMIC_RELAXED)) {
+      break;
+    }
+  }
+
+  return 0;
+};
+
+// Exit codes:
+//  0 - Pass
+//  1 - Fail
+//  2 - Missing argument
+//  3 - Unknown test case
+extern "C" int main(int argc, char** argv, char** envp) {
+  if (argc != 2) {
+    // Usage: rseq <test case>
+    return 2;
+  }
+
+  if (strcmp(argv[1], kRseqTestUnaligned) == 0) {
+    return TestUnaligned();
+  }
+  if (strcmp(argv[1], kRseqTestRegister) == 0) {
+    return TestRegister();
+  }
+  if (strcmp(argv[1], kRseqTestDoubleRegister) == 0) {
+    return TestDoubleRegister();
+  }
+  if (strcmp(argv[1], kRseqTestRegisterUnregister) == 0) {
+    return TestRegisterUnregister();
+  }
+  if (strcmp(argv[1], kRseqTestUnregisterDifferentPtr) == 0) {
+    return TestUnregisterDifferentPtr();
+  }
+  if (strcmp(argv[1], kRseqTestUnregisterDifferentSignature) == 0) {
+    return TestUnregisterDifferentSignature();
+  }
+  if (strcmp(argv[1], kRseqTestCPU) == 0) {
+    return TestCPU();
+  }
+  if (strcmp(argv[1], kRseqTestAbort) == 0) {
+    return TestAbort();
+  }
+  if (strcmp(argv[1], kRseqTestAbortBefore) == 0) {
+    return TestAbortBefore();
+  }
+  if (strcmp(argv[1], kRseqTestAbortSignature) == 0) {
+    return TestAbortSignature();
+  }
+  if (strcmp(argv[1], kRseqTestAbortPreCommit) == 0) {
+    return TestAbortPreCommit();
+  }
+  if (strcmp(argv[1], kRseqTestAbortClearsCS) == 0) {
+    return TestAbortClearsCS();
+  }
+  if (strcmp(argv[1], kRseqTestInvalidAbortClearsCS) == 0) {
+    return TestInvalidAbortClearsCS();
+  }
+
+  return 3;
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/rseq/start.S b/test/syscalls/linux/rseq/start.S
new file mode 100644
index 000000000..b9611b276
--- /dev/null
+++ b/test/syscalls/linux/rseq/start.S
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+  .text
+  .align 4
+  .type  _start,@function
+  .globl  _start
+
+_start:
+  movq  %rsp,%rdi
+  call  __init
+  hlt
+
+  .size  _start,.-_start
+  .section  .note.GNU-stack,"",@progbits
+
+  .text
+  .globl  raw_syscall
+  .type   raw_syscall, @function
+
+raw_syscall:
+  mov  %rdi,%rax      // syscall #
+  mov  %rsi,%rdi      // arg0
+  mov  %rdx,%rsi      // arg1
+  mov  %rcx,%rdx      // arg2
+  mov  %r8,%r10       // arg3 (goes in r10 instead of rcx for system calls)
+  mov  %r9,%r8        // arg4
+  mov  0x8(%rsp),%r9  // arg5
+  syscall
+  ret
+
+  .size  raw_syscall,.-raw_syscall
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/syscalls.h b/test/syscalls/linux/rseq/syscalls.h
new file mode 100644
index 000000000..e5299c188
--- /dev/null
+++ b/test/syscalls/linux/rseq/syscalls.h
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
+
+#include "test/syscalls/linux/rseq/types.h"
+
+#ifdef __x86_64__
+// Syscall numbers.
+constexpr int kGetpid = 39;
+constexpr int kExitGroup = 231;
+#else
+#error "Unknown architecture"
+#endif
+
+namespace gvisor {
+namespace testing {
+
+// Standalone system call interfaces.
+// Note that these are all "raw" system call interfaces which encode
+// errors by setting the return value to a small negative number.
+// Use sys_errno() to check system call return values for errors.
+
+// Maximum Linux error number.
+constexpr int kMaxErrno = 4095;
+
+// Errno values.
+#define EPERM 1
+#define EFAULT 14
+#define EBUSY 16
+#define EINVAL 22
+
+// Get the error number from a raw system call return value.
+// Returns a positive error number or 0 if there was no error.
+static inline int sys_errno(uintptr_t rval) {
+  if (rval >= static_cast<uintptr_t>(-kMaxErrno)) {
+    return -static_cast<int>(rval);
+  }
+  return 0;
+}
+
+extern "C" uintptr_t raw_syscall(int number, ...);
+
+static inline void sys_exit_group(int status) {
+  raw_syscall(kExitGroup, status);
+}
+static inline int sys_getpid() {
+  return static_cast<int>(raw_syscall(kGetpid));
+}
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
diff --git a/test/syscalls/linux/rseq/test.h b/test/syscalls/linux/rseq/test.h
new file mode 100644
index 000000000..3b7bb74b1
--- /dev/null
+++ b/test/syscalls/linux/rseq/test.h
@@ -0,0 +1,43 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
+
+namespace gvisor {
+namespace testing {
+
+// Test cases supported by rseq binary.
+
+inline constexpr char kRseqTestUnaligned[] = "unaligned";
+inline constexpr char kRseqTestRegister[] = "register";
+inline constexpr char kRseqTestDoubleRegister[] = "double-register";
+inline constexpr char kRseqTestRegisterUnregister[] = "register-unregister";
+inline constexpr char kRseqTestUnregisterDifferentPtr[] =
+    "unregister-different-ptr";
+inline constexpr char kRseqTestUnregisterDifferentSignature[] =
+    "unregister-different-signature";
+inline constexpr char kRseqTestCPU[] = "cpu";
+inline constexpr char kRseqTestAbort[] = "abort";
+inline constexpr char kRseqTestAbortBefore[] = "abort-before";
+inline constexpr char kRseqTestAbortSignature[] = "abort-signature";
+inline constexpr char kRseqTestAbortPreCommit[] = "abort-precommit";
+inline constexpr char kRseqTestAbortClearsCS[] = "abort-clears-cs";
+inline constexpr char kRseqTestInvalidAbortClearsCS[] =
+    "invalid-abort-clears-cs";
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
diff --git a/test/syscalls/linux/rseq/types.h b/test/syscalls/linux/rseq/types.h
new file mode 100644
index 000000000..b6afe9817
--- /dev/null
+++ b/test/syscalls/linux/rseq/types.h
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
+
+using size_t = __SIZE_TYPE__;
+using uintptr_t = __UINTPTR_TYPE__;
+
+using uint8_t = __UINT8_TYPE__;
+using uint16_t = __UINT16_TYPE__;
+using uint32_t = __UINT32_TYPE__;
+using uint64_t = __UINT64_TYPE__;
+
+using int8_t = __INT8_TYPE__;
+using int16_t = __INT16_TYPE__;
+using int32_t = __INT32_TYPE__;
+using int64_t = __INT64_TYPE__;
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
diff --git a/test/syscalls/linux/rseq/uapi.h b/test/syscalls/linux/rseq/uapi.h
new file mode 100644
index 000000000..e3ff0579a
--- /dev/null
+++ b/test/syscalls/linux/rseq/uapi.h
@@ -0,0 +1,54 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
+
+// User-kernel ABI for restartable sequences.
+
+// Standard types.
+//
+// N.B. This header will be included in targets that do have the standard
+// library, so we can't shadow the standard type names.
+using __u32 = __UINT32_TYPE__;
+using __u64 = __UINT64_TYPE__;
+
+#ifdef __x86_64__
+// Syscall numbers.
+constexpr int kRseqSyscall = 334;
+#else
+#error "Unknown architecture"
+#endif  // __x86_64__
+
+struct rseq_cs {
+  __u32 version;
+  __u32 flags;
+  __u64 start_ip;
+  __u64 post_commit_offset;
+  __u64 abort_ip;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+// N.B. alignment is enforced by the kernel.
+struct rseq {
+  __u32 cpu_id_start;
+  __u32 cpu_id;
+  struct rseq_cs* rseq_cs;
+  __u32 flags;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+constexpr int kRseqFlagUnregister = 1 << 0;
+
+constexpr int kRseqCPUIDUninitialized = -1;
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
-- 
cgit v1.2.3


From cf477c86ca8bfd27551c97aa4015364d30b98f2e Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 9 Dec 2019 11:21:37 -0800
Subject: Mark runner_test as manual.

Because it is local-only, it should also be marked manual.

PiperOrigin-RevId: 284596186
---
 benchmarks/runner/BUILD | 5 ++++-
 scripts/simple_tests.sh | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/benchmarks/runner/BUILD b/benchmarks/runner/BUILD
index a3941da42..de24824cc 100644
--- a/benchmarks/runner/BUILD
+++ b/benchmarks/runner/BUILD
@@ -34,7 +34,10 @@ py_test(
     name = "runner_test",
     srcs = ["runner_test.py"],
     python_version = "PY3",
-    tags = ["local"],
+    tags = [
+        "local",
+        "manual",
+    ],
     deps = [
         ":runner",
         requirement("click", True),
diff --git a/scripts/simple_tests.sh b/scripts/simple_tests.sh
index ef25afc2e..3a15050c2 100755
--- a/scripts/simple_tests.sh
+++ b/scripts/simple_tests.sh
@@ -17,4 +17,4 @@
 source $(dirname $0)/common.sh
 
 # Run all simple tests (locally).
-test //pkg/... //runsc/... //tools/... //benchmarks/...
+test //pkg/... //runsc/... //tools/... //benchmarks/... //benchmarks/runner:runner_test
-- 
cgit v1.2.3


From cb5f9b8f863c93bb7e3757c1f4b3e1a64e6acdfb Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Mon, 9 Dec 2019 12:03:16 -0800
Subject: Mark test as non flaky.

PiperOrigin-RevId: 284606133
---
 test/syscalls/linux/BUILD | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index c49445d62..6ea922fb4 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3272,8 +3272,6 @@ cc_binary(
     testonly = 1,
     srcs = ["tcp_socket.cc"],
     linkstatic = 1,
-    # FIXME(b/135470853)
-    tags = ["flaky"],
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
-- 
cgit v1.2.3


From 898dcc2f839a975a9171271824af32176c2e5c27 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Mon, 9 Dec 2019 12:03:31 -0800
Subject: Redirect TODOs to gvisor.dev

PiperOrigin-RevId: 284606233
---
 pkg/sentry/fs/gofer/session.go             | 6 +++---
 pkg/sentry/kernel/semaphore/semaphore.go   | 6 +++---
 pkg/sentry/syscalls/linux/linux64_amd64.go | 2 +-
 pkg/sentry/syscalls/linux/linux64_arm64.go | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index 0da608548..4e358a46a 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -143,9 +143,9 @@ type session struct {
 	// socket files. This allows unix domain sockets to be used with paths that
 	// belong to a gofer.
 	//
-	// TODO(b/77154739): there are few possible races with someone stat'ing the
-	// file and another deleting it concurrently, where the file will not be
-	// reported as socket file.
+	// TODO(gvisor.dev/issue/1200): there are few possible races with someone
+	// stat'ing the file and another deleting it concurrently, where the file
+	// will not be reported as socket file.
 	endpoints *endpointMaps `state:"wait"`
 }
 
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index 93fe68a3e..de9617e9d 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -302,7 +302,7 @@ func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Cred
 		return syserror.ERANGE
 	}
 
-	// TODO(b/29354920): Clear undo entries in all processes
+	// TODO(gvisor.dev/issue/137): Clear undo entries in all processes.
 	sem.value = val
 	sem.pid = pid
 	s.changeTime = ktime.NowFromContext(ctx)
@@ -336,7 +336,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti
 	for i, val := range vals {
 		sem := &s.sems[i]
 
-		// TODO(b/29354920): Clear undo entries in all processes
+		// TODO(gvisor.dev/issue/137): Clear undo entries in all processes.
 		sem.value = int16(val)
 		sem.pid = pid
 		sem.wakeWaiters()
@@ -481,7 +481,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch
 	}
 
 	// All operations succeeded, apply them.
-	// TODO(b/29354920): handle undo operations.
+	// TODO(gvisor.dev/issue/137): handle undo operations.
 	for i, v := range tmpVals {
 		s.sems[i].value = v
 		s.sems[i].wakeWaiters()
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 81e4f93a6..5642d69ea 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -260,7 +260,7 @@ var AMD64 = &kernel.SyscallTable{
 		217: syscalls.Supported("getdents64", Getdents64),
 		218: syscalls.Supported("set_tid_address", SetTidAddress),
 		219: syscalls.Supported("restart_syscall", RestartSyscall),
-		220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920)
+		220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
 		221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
 		222: syscalls.Supported("timer_create", TimerCreate),
 		223: syscalls.Supported("timer_settime", TimerSettime),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index f1dd4b0c0..f897bfff8 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -224,7 +224,7 @@ var ARM64 = &kernel.SyscallTable{
 		189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
 		190: syscalls.Supported("semget", Semget),
 		191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
-		192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920)
+		192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
 		193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
 		194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
 		195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil),
-- 
cgit v1.2.3


From 17867c88f7afdac6ff1c212aeac9aee2045f4f5a Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Mon, 9 Dec 2019 13:35:56 -0800
Subject: Include <netinet/tcp.h> for TCP enums in proc_net tests

These are currently duplicated in ip_socket_test_util, so tests including
both netinet/tcp.h and ip_socket_test_util won't compile.

PiperOrigin-RevId: 284623958
---
 test/syscalls/linux/ip_socket_test_util.h | 19 -------------------
 test/syscalls/linux/proc_net_tcp.cc       |  1 +
 test/syscalls/linux/proc_net_udp.cc       |  1 +
 3 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 072230d85..9cb4566db 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -26,25 +26,6 @@
 namespace gvisor {
 namespace testing {
 
-// Possible values of the "st" field in a /proc/net/{tcp,udp} entry. Source:
-// Linux kernel, include/net/tcp_states.h.
-enum {
-  TCP_ESTABLISHED = 1,
-  TCP_SYN_SENT,
-  TCP_SYN_RECV,
-  TCP_FIN_WAIT1,
-  TCP_FIN_WAIT2,
-  TCP_TIME_WAIT,
-  TCP_CLOSE,
-  TCP_CLOSE_WAIT,
-  TCP_LAST_ACK,
-  TCP_LISTEN,
-  TCP_CLOSING,
-  TCP_NEW_SYN_RECV,
-
-  TCP_MAX_STATES
-};
-
 // Extracts the IP address from an inet sockaddr in network byte order.
 uint32_t IPFromInetSockaddr(const struct sockaddr* addr);
 
diff --git a/test/syscalls/linux/proc_net_tcp.cc b/test/syscalls/linux/proc_net_tcp.cc
index 2659f6a98..5b6e3e3cd 100644
--- a/test/syscalls/linux/proc_net_tcp.cc
+++ b/test/syscalls/linux/proc_net_tcp.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <netinet/tcp.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/types.h>
diff --git a/test/syscalls/linux/proc_net_udp.cc b/test/syscalls/linux/proc_net_udp.cc
index f06f1a24b..786b4b4af 100644
--- a/test/syscalls/linux/proc_net_udp.cc
+++ b/test/syscalls/linux/proc_net_udp.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <netinet/tcp.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/types.h>
-- 
cgit v1.2.3


From 18af75db9de5244bd3e180a86886a4b3cadd7547 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Mon, 9 Dec 2019 15:51:24 -0800
Subject: Add UDP SO_REUSEADDR support to the port manager.

Next steps include adding support to the transport demuxer and the UDP endpoint.

PiperOrigin-RevId: 284652151
---
 pkg/tcpip/ports/BUILD                              |   2 +-
 pkg/tcpip/ports/ports.go                           | 148 +++++++--
 pkg/tcpip/ports/ports_test.go                      | 182 +++++++----
 pkg/tcpip/transport/tcp/BUILD                      |   1 +
 pkg/tcpip/transport/tcp/endpoint.go                |  25 +-
 pkg/tcpip/transport/udp/BUILD                      |   1 +
 pkg/tcpip/transport/udp/endpoint.go                |  19 +-
 .../linux/socket_bind_to_device_sequence.cc        | 353 +++++++++++++++------
 8 files changed, 536 insertions(+), 195 deletions(-)

diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD
index 4839f0a65..e156b01f6 100644
--- a/pkg/tcpip/ports/BUILD
+++ b/pkg/tcpip/ports/BUILD
@@ -1,5 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go
index 30cea8996..6c5e19e8f 100644
--- a/pkg/tcpip/ports/ports.go
+++ b/pkg/tcpip/ports/ports.go
@@ -41,6 +41,30 @@ type portDescriptor struct {
 	port      uint16
 }
 
+// Flags represents the type of port reservation.
+//
+// +stateify savable
+type Flags struct {
+	// MostRecent represents UDP SO_REUSEADDR.
+	MostRecent bool
+
+	// LoadBalanced indicates SO_REUSEPORT.
+	//
+	// LoadBalanced takes precidence over MostRecent.
+	LoadBalanced bool
+}
+
+func (f Flags) bits() reuseFlag {
+	var rf reuseFlag
+	if f.MostRecent {
+		rf |= mostRecentFlag
+	}
+	if f.LoadBalanced {
+		rf |= loadBalancedFlag
+	}
+	return rf
+}
+
 // PortManager manages allocating, reserving and releasing ports.
 type PortManager struct {
 	mu             sync.RWMutex
@@ -54,9 +78,59 @@ type PortManager struct {
 	hint uint32
 }
 
+type reuseFlag int
+
+const (
+	mostRecentFlag reuseFlag = 1 << iota
+	loadBalancedFlag
+	nextFlag
+
+	flagMask = nextFlag - 1
+)
+
 type portNode struct {
-	reuse bool
-	refs  int
+	// refs stores the count for each possible flag combination.
+	refs [nextFlag]int
+}
+
+func (p portNode) totalRefs() int {
+	var total int
+	for _, r := range p.refs {
+		total += r
+	}
+	return total
+}
+
+// flagRefs returns the number of references with all specified flags.
+func (p portNode) flagRefs(flags reuseFlag) int {
+	var total int
+	for i, r := range p.refs {
+		if reuseFlag(i)&flags == flags {
+			total += r
+		}
+	}
+	return total
+}
+
+// allRefsHave returns if all references have all specified flags.
+func (p portNode) allRefsHave(flags reuseFlag) bool {
+	for i, r := range p.refs {
+		if reuseFlag(i)&flags == flags && r > 0 {
+			return false
+		}
+	}
+	return true
+}
+
+// intersectionRefs returns the set of flags shared by all references.
+func (p portNode) intersectionRefs() reuseFlag {
+	intersection := flagMask
+	for i, r := range p.refs {
+		if r > 0 {
+			intersection &= reuseFlag(i)
+		}
+	}
+	return intersection
 }
 
 // deviceNode is never empty. When it has no elements, it is removed from the
@@ -66,30 +140,44 @@ type deviceNode map[tcpip.NICID]portNode
 // isAvailable checks whether binding is possible by device. If not binding to a
 // device, check against all portNodes. If binding to a specific device, check
 // against the unspecified device and the provided device.
-func (d deviceNode) isAvailable(reuse bool, bindToDevice tcpip.NICID) bool {
+//
+// If either of the port reuse flags is enabled on any of the nodes, all nodes
+// sharing a port must share at least one reuse flag. This matches Linux's
+// behavior.
+func (d deviceNode) isAvailable(flags Flags, bindToDevice tcpip.NICID) bool {
+	flagBits := flags.bits()
 	if bindToDevice == 0 {
 		// Trying to binding all devices.
-		if !reuse {
+		if flagBits == 0 {
 			// Can't bind because the (addr,port) is already bound.
 			return false
 		}
+		intersection := flagMask
 		for _, p := range d {
-			if !p.reuse {
-				// Can't bind because the (addr,port) was previously bound without reuse.
+			i := p.intersectionRefs()
+			intersection &= i
+			if intersection&flagBits == 0 {
+				// Can't bind because the (addr,port) was
+				// previously bound without reuse.
 				return false
 			}
 		}
 		return true
 	}
 
+	intersection := flagMask
+
 	if p, ok := d[0]; ok {
-		if !reuse || !p.reuse {
+		intersection = p.intersectionRefs()
+		if intersection&flagBits == 0 {
 			return false
 		}
 	}
 
 	if p, ok := d[bindToDevice]; ok {
-		if !reuse || !p.reuse {
+		i := p.intersectionRefs()
+		intersection &= i
+		if intersection&flagBits == 0 {
 			return false
 		}
 	}
@@ -103,12 +191,12 @@ type bindAddresses map[tcpip.Address]deviceNode
 // isAvailable checks whether an IP address is available to bind to. If the
 // address is the "any" address, check all other addresses. Otherwise, just
 // check against the "any" address and the provided address.
-func (b bindAddresses) isAvailable(addr tcpip.Address, reuse bool, bindToDevice tcpip.NICID) bool {
+func (b bindAddresses) isAvailable(addr tcpip.Address, flags Flags, bindToDevice tcpip.NICID) bool {
 	if addr == anyIPAddress {
 		// If binding to the "any" address then check that there are no conflicts
 		// with all addresses.
 		for _, d := range b {
-			if !d.isAvailable(reuse, bindToDevice) {
+			if !d.isAvailable(flags, bindToDevice) {
 				return false
 			}
 		}
@@ -117,14 +205,14 @@ func (b bindAddresses) isAvailable(addr tcpip.Address, reuse bool, bindToDevice
 
 	// Check that there is no conflict with the "any" address.
 	if d, ok := b[anyIPAddress]; ok {
-		if !d.isAvailable(reuse, bindToDevice) {
+		if !d.isAvailable(flags, bindToDevice) {
 			return false
 		}
 	}
 
 	// Check that this is no conflict with the provided address.
 	if d, ok := b[addr]; ok {
-		if !d.isAvailable(reuse, bindToDevice) {
+		if !d.isAvailable(flags, bindToDevice) {
 			return false
 		}
 	}
@@ -190,17 +278,17 @@ func (s *PortManager) pickEphemeralPort(offset, count uint32, testPort func(p ui
 }
 
 // IsPortAvailable tests if the given port is available on all given protocols.
-func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
+func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool {
 	s.mu.Lock()
 	defer s.mu.Unlock()
-	return s.isPortAvailableLocked(networks, transport, addr, port, reuse, bindToDevice)
+	return s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice)
 }
 
-func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
+func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool {
 	for _, network := range networks {
 		desc := portDescriptor{network, transport, port}
 		if addrs, ok := s.allocatedPorts[desc]; ok {
-			if !addrs.isAvailable(addr, reuse, bindToDevice) {
+			if !addrs.isAvailable(addr, flags, bindToDevice) {
 				return false
 			}
 		}
@@ -212,14 +300,14 @@ func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumb
 // reserved by another endpoint. If port is zero, ReservePort will search for
 // an unreserved ephemeral port and reserve it, returning its value in the
 // "port" return value.
-func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) (reservedPort uint16, err *tcpip.Error) {
+func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) (reservedPort uint16, err *tcpip.Error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
 	// If a port is specified, just try to reserve it for all network
 	// protocols.
 	if port != 0 {
-		if !s.reserveSpecificPort(networks, transport, addr, port, reuse, bindToDevice) {
+		if !s.reserveSpecificPort(networks, transport, addr, port, flags, bindToDevice) {
 			return 0, tcpip.ErrPortInUse
 		}
 		return port, nil
@@ -227,15 +315,16 @@ func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transp
 
 	// A port wasn't specified, so try to find one.
 	return s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) {
-		return s.reserveSpecificPort(networks, transport, addr, p, reuse, bindToDevice), nil
+		return s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice), nil
 	})
 }
 
 // reserveSpecificPort tries to reserve the given port on all given protocols.
-func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
-	if !s.isPortAvailableLocked(networks, transport, addr, port, reuse, bindToDevice) {
+func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool {
+	if !s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice) {
 		return false
 	}
+	flagBits := flags.bits()
 
 	// Reserve port on all network protocols.
 	for _, network := range networks {
@@ -250,12 +339,9 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber
 			d = make(deviceNode)
 			m[addr] = d
 		}
-		if n, ok := d[bindToDevice]; ok {
-			n.refs++
-			d[bindToDevice] = n
-		} else {
-			d[bindToDevice] = portNode{reuse: reuse, refs: 1}
-		}
+		n := d[bindToDevice]
+		n.refs[flagBits]++
+		d[bindToDevice] = n
 	}
 
 	return true
@@ -263,10 +349,12 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber
 
 // ReleasePort releases the reservation on a port/IP combination so that it can
 // be reserved by other endpoints.
-func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, bindToDevice tcpip.NICID) {
+func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
+	flagBits := flags.bits()
+
 	for _, network := range networks {
 		desc := portDescriptor{network, transport, port}
 		if m, ok := s.allocatedPorts[desc]; ok {
@@ -278,9 +366,9 @@ func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transp
 			if !ok {
 				continue
 			}
-			n.refs--
+			n.refs[flagBits]--
 			d[bindToDevice] = n
-			if n.refs == 0 {
+			if n.refs == [nextFlag]int{} {
 				delete(d, bindToDevice)
 			}
 			if len(d) == 0 {
diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go
index 19f4833fc..d6969d050 100644
--- a/pkg/tcpip/ports/ports_test.go
+++ b/pkg/tcpip/ports/ports_test.go
@@ -33,7 +33,7 @@ type portReserveTestAction struct {
 	port    uint16
 	ip      tcpip.Address
 	want    *tcpip.Error
-	reuse   bool
+	flags   Flags
 	release bool
 	device  tcpip.NICID
 }
@@ -50,7 +50,7 @@ func TestPortReservation(t *testing.T) {
 				{port: 80, ip: fakeIPAddress1, want: nil},
 				/* N.B. Order of tests matters! */
 				{port: 80, ip: anyIPAddress, want: tcpip.ErrPortInUse},
-				{port: 80, ip: fakeIPAddress, want: tcpip.ErrPortInUse, reuse: true},
+				{port: 80, ip: fakeIPAddress, want: tcpip.ErrPortInUse, flags: Flags{LoadBalanced: true}},
 			},
 		},
 		{
@@ -61,7 +61,7 @@ func TestPortReservation(t *testing.T) {
 				/* release fakeIPAddress, but anyIPAddress is still inuse */
 				{port: 22, ip: fakeIPAddress, release: true},
 				{port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
-				{port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse, reuse: true},
+				{port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse, flags: Flags{LoadBalanced: true}},
 				/* Release port 22 from any IP address, then try to reserve fake IP address on 22 */
 				{port: 22, ip: anyIPAddress, want: nil, release: true},
 				{port: 22, ip: fakeIPAddress, want: nil},
@@ -71,36 +71,36 @@ func TestPortReservation(t *testing.T) {
 			actions: []portReserveTestAction{
 				{port: 00, ip: fakeIPAddress, want: nil},
 				{port: 00, ip: fakeIPAddress, want: nil},
-				{port: 00, ip: fakeIPAddress, reuse: true, want: nil},
+				{port: 00, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 			},
 		}, {
 			tname: "bind to ip with reuseport",
 			actions: []portReserveTestAction{
-				{port: 25, ip: fakeIPAddress, reuse: true, want: nil},
-				{port: 25, ip: fakeIPAddress, reuse: true, want: nil},
+				{port: 25, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 25, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 
-				{port: 25, ip: fakeIPAddress, reuse: false, want: tcpip.ErrPortInUse},
-				{port: 25, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse},
+				{port: 25, ip: fakeIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
+				{port: 25, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
 
-				{port: 25, ip: anyIPAddress, reuse: true, want: nil},
+				{port: 25, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 			},
 		}, {
 			tname: "bind to inaddr any with reuseport",
 			actions: []portReserveTestAction{
-				{port: 24, ip: anyIPAddress, reuse: true, want: nil},
-				{port: 24, ip: anyIPAddress, reuse: true, want: nil},
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 
-				{port: 24, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, reuse: false, want: tcpip.ErrPortInUse},
+				{port: 24, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
 
-				{port: 24, ip: fakeIPAddress, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, release: true, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, release: true, want: nil},
 
-				{port: 24, ip: anyIPAddress, release: true},
-				{port: 24, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse},
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, release: true},
+				{port: 24, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
 
-				{port: 24, ip: anyIPAddress, release: true},
-				{port: 24, ip: anyIPAddress, reuse: false, want: nil},
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, release: true},
+				{port: 24, ip: anyIPAddress, flags: Flags{}, want: nil},
 			},
 		}, {
 			tname: "bind twice with device fails",
@@ -125,88 +125,152 @@ func TestPortReservation(t *testing.T) {
 			actions: []portReserveTestAction{
 				{port: 24, ip: fakeIPAddress, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 				{port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 			},
 		}, {
 			tname: "bind with device",
 			actions: []portReserveTestAction{
 				{port: 24, ip: fakeIPAddress, device: 123, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 456, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 789, want: nil},
 				{port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 			},
 		}, {
-			tname: "bind with reuse",
+			tname: "bind with reuseport",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
 			},
 		}, {
-			tname: "binding with reuse and device",
+			tname: "binding with reuseport and device",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 456, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 999, want: tcpip.ErrPortInUse},
 			},
 		}, {
-			tname: "mixing reuse and not reuse by binding to device",
+			tname: "mixing reuseport and not reuseport by binding to device",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 456, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 999, want: nil},
 			},
 		}, {
-			tname: "can't bind to 0 after mixing reuse and not reuse",
+			tname: "can't bind to 0 after mixing reuseport and not reuseport",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 456, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 			},
 		}, {
 			tname: "bind and release",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
 
 				// Release the bind to device 0 and try again.
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil, release: true},
-				{port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: nil},
 			},
 		}, {
-			tname: "bind twice with reuse once",
+			tname: "bind twice with reuseport once",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 			},
 		}, {
 			tname: "release an unreserved device",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 456, reuse: false, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{}, want: nil},
 				// The below don't exist.
-				{port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: nil, release: true},
-				{port: 9999, ip: fakeIPAddress, device: 123, reuse: false, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: nil, release: true},
+				{port: 9999, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil, release: true},
 				// Release all.
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil, release: true},
-				{port: 24, ip: fakeIPAddress, device: 456, reuse: false, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{}, want: nil, release: true},
+			},
+		}, {
+			tname: "bind with reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{MostRecent: true}, want: nil},
+			},
+		}, {
+			tname: "bind twice with reuseaddr once",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport, and then reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport, and then reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport twice, and then reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport twice, and then reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+			},
+		}, {
+			tname: "bind with reuseaddr, and then reuseaddr and reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseport, and then reuseaddr and reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
 			},
 		},
 	} {
@@ -216,12 +280,12 @@ func TestPortReservation(t *testing.T) {
 
 			for _, test := range test.actions {
 				if test.release {
-					pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.device)
+					pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device)
 					continue
 				}
-				gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.reuse, test.device)
+				gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device)
 				if err != test.want {
-					t.Fatalf("ReservePort(.., .., %s, %d, %t, %d) = %v, want %v", test.ip, test.port, test.reuse, test.device, err, test.want)
+					t.Fatalf("ReservePort(.., .., %s, %d, %+v, %d) = %v, want %v", test.ip, test.port, test.flags, test.device, err, test.want)
 				}
 				if test.port == 0 && (gotPort == 0 || gotPort < FirstEphemeral) {
 					t.Fatalf("ReservePort(.., .., .., 0) = %d, want port number >= %d to be picked", gotPort, FirstEphemeral)
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index dd1728f9c..455a1c098 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -52,6 +52,7 @@ go_library(
         "//pkg/tcpip/hash/jenkins",
         "//pkg/tcpip/header",
         "//pkg/tcpip/iptables",
+        "//pkg/tcpip/ports",
         "//pkg/tcpip/seqnum",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 9d4a87e30..4861ab513 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -30,6 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tmutex"
@@ -343,6 +344,7 @@ type endpoint struct {
 	// Values used to reserve a port or register a transport endpoint
 	// (which ever happens first).
 	boundBindToDevice tcpip.NICID
+	boundPortFlags    ports.Flags
 
 	// effectiveNetProtos contains the network protocols actually in use. In
 	// most cases it will only contain "netProto", but in cases like IPv6
@@ -737,9 +739,10 @@ func (e *endpoint) Close() {
 			e.isRegistered = false
 		}
 
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
 		e.isPortReserved = false
 		e.boundBindToDevice = 0
+		e.boundPortFlags = ports.Flags{}
 	}
 
 	// Mark endpoint as closed.
@@ -800,10 +803,11 @@ func (e *endpoint) cleanupLocked() {
 	}
 
 	if e.isPortReserved {
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
 		e.isPortReserved = false
 	}
 	e.boundBindToDevice = 0
+	e.boundPortFlags = ports.Flags{}
 
 	e.route.Release()
 	e.stack.CompleteTransportEndpointCleanup(e)
@@ -1775,7 +1779,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 			}
 			// reusePort is false below because connect cannot reuse a port even if
 			// reusePort was set.
-			if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, false /* reusePort */, e.bindToDevice) {
+			if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, ports.Flags{LoadBalanced: false}, e.bindToDevice) {
 				return false, nil
 			}
 
@@ -1802,7 +1806,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	// before Connect: in such a case we don't want to hold on to
 	// reservations anymore.
 	if e.isPortReserved {
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
 		e.isPortReserved = false
 	}
 
@@ -2034,28 +2038,33 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
 		}
 	}
 
-	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort, e.bindToDevice)
+	flags := ports.Flags{
+		LoadBalanced: e.reusePort,
+	}
+	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, flags, e.bindToDevice)
 	if err != nil {
 		return err
 	}
 
 	e.boundBindToDevice = e.bindToDevice
+	e.boundPortFlags = flags
 	e.isPortReserved = true
 	e.effectiveNetProtos = netProtos
 	e.ID.LocalPort = port
 
 	// Any failures beyond this point must remove the port registration.
-	defer func(bindToDevice tcpip.NICID) {
+	defer func(portFlags ports.Flags, bindToDevice tcpip.NICID) {
 		if err != nil {
-			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, bindToDevice)
+			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, portFlags, bindToDevice)
 			e.isPortReserved = false
 			e.effectiveNetProtos = nil
 			e.ID.LocalPort = 0
 			e.ID.LocalAddress = ""
 			e.boundNICID = 0
 			e.boundBindToDevice = 0
+			e.boundPortFlags = ports.Flags{}
 		}
-	}(e.boundBindToDevice)
+	}(e.boundPortFlags, e.boundBindToDevice)
 
 	// If an address is specified, we must ensure that it's one of our
 	// local addresses.
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index 8d4c3808f..97e4d5825 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -34,6 +34,7 @@ go_library(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
         "//pkg/tcpip/iptables",
+        "//pkg/tcpip/ports",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
         "//pkg/waiter",
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 4b161e404..1ac4705af 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -107,6 +108,7 @@ type endpoint struct {
 	// Values used to reserve a port or register a transport endpoint.
 	// (which ever happens first).
 	boundBindToDevice tcpip.NICID
+	boundPortFlags    ports.Flags
 
 	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
 	// applied while sending packets. Defaults to 0 as on Linux.
@@ -180,8 +182,9 @@ func (e *endpoint) Close() {
 	switch e.state {
 	case StateBound, StateConnected:
 		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
 		e.boundBindToDevice = 0
+		e.boundPortFlags = ports.Flags{}
 	}
 
 	for _, mem := range e.multicastMemberships {
@@ -895,7 +898,8 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 	} else {
 		if e.ID.LocalPort != 0 {
 			// Release the ephemeral port.
-			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
+			e.boundPortFlags = ports.Flags{}
 		}
 		e.state = StateInitial
 	}
@@ -1042,16 +1046,23 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 
 func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) {
 	if e.ID.LocalPort == 0 {
-		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.reusePort, e.bindToDevice)
+		flags := ports.Flags{
+			LoadBalanced: e.reusePort,
+			// FIXME(b/129164367): Support SO_REUSEADDR.
+			MostRecent: false,
+		}
+		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, flags, e.bindToDevice)
 		if err != nil {
 			return id, e.bindToDevice, err
 		}
+		e.boundPortFlags = flags
 		id.LocalPort = port
 	}
 
 	err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice)
 	if err != nil {
-		e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.bindToDevice)
+		e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.boundPortFlags, e.bindToDevice)
+		e.boundPortFlags = ports.Flags{}
 	}
 	return id, e.bindToDevice, err
 }
diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
index e4641c62e..033fd80a5 100644
--- a/test/syscalls/linux/socket_bind_to_device_sequence.cc
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -97,12 +97,12 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
     sockets_to_close_.erase(socket_id);
   }
 
-  // Bind a socket with the reuse option and bind_to_device options.  Checks
+  // Bind a socket with the reuse options and bind_to_device options. Checks
   // that all steps succeed and that the bind command's error matches want.
   // Sets the socket_id to uniquely identify the socket bound if it is not
   // nullptr.
-  void BindSocket(bool reuse, int device_id = 0, int want = 0,
-                  int *socket_id = nullptr) {
+  void BindSocket(bool reuse_port, bool reuse_addr, int device_id = 0,
+                  int want = 0, int *socket_id = nullptr) {
     next_socket_id_++;
     sockets_to_close_[next_socket_id_] = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
     auto socket_fd = sockets_to_close_[next_socket_id_]->get();
@@ -110,13 +110,20 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
       *socket_id = next_socket_id_;
     }
 
-    // If reuse is indicated, do that.
-    if (reuse) {
+    // If reuse_port is indicated, do that.
+    if (reuse_port) {
       EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
                              sizeof(kSockOptOn)),
                   SyscallSucceedsWithValue(0));
     }
 
+    // If reuse_addr is indicated, do that.
+    if (reuse_addr) {
+      EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                             sizeof(kSockOptOn)),
+                  SyscallSucceedsWithValue(0));
+    }
+
     // If the device is non-zero, bind to that device.
     if (device_id != 0) {
       string device_name;
@@ -182,129 +189,289 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
 };
 
 TEST_P(BindToDeviceSequenceTest, BindTwiceWithDeviceFails) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 3));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 3, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 3));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 3, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindToDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 1));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 2));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 1));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 2));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindToDeviceAndThenWithoutDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindWithoutDevice) {
-  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ false));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindWithDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 456, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 789, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 456, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 789, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindWithReuse) {
-  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
   ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true, /* bind_to_device */ 0));
+      BindSocket(/* reusePort */ true, /* reuse_addr */ false));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false,
+      /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 0));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindingWithReuseAndDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 456));
-  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 789));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 999, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 456));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse_port */ true, /* reuse_addr */ false));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 789));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 999, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, MixingReuseAndNotReuseByBindingToDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 456, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 789, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 999, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 456, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 789, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 999, 0));
 }
 
 TEST_P(BindToDeviceSequenceTest, CannotBindTo0AfterMixingReuseAndNotReuse) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 456));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 456));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindAndRelease) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
   int to_release;
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, 0, &to_release));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 345, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 789));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, 0, &to_release));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 345, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 789));
   // Release the bind to device 0 and try again.
   ASSERT_NO_FATAL_FAILURE(ReleaseSocket(to_release));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 345));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 345));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindTwiceWithReuseOnce) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindWithReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
   ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+      BindSocket(/* reusePort */ false, /* reuse_addr */ true));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+}
+
+TEST_P(BindToDeviceSequenceTest,
+       CannotBindTo0AfterMixingReuseAddrAndNotReuseAddr) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 456));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReusePort) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReusePort) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ true, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ true, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReusePortThenReuseAddrReusePort) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrThenReuseAddr) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+// This behavior seems like a bug?
+TEST_P(BindToDeviceSequenceTest,
+       BindReuseAddrThenReuseAddrReusePortThenReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0));
 }
 
 INSTANTIATE_TEST_SUITE_P(BindToDeviceTest, BindToDeviceSequenceTest,
-- 
cgit v1.2.3


From 98aafb1334b816596b462ad12fa3e96784703061 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Mon, 9 Dec 2019 20:07:14 -0800
Subject: Add test for SO_BINDTODEVICE state bug.

This was accidentally dropped from the change which fixed the bug.

Updates #1217

PiperOrigin-RevId: 284689362
---
 .../linux/socket_bind_to_device_sequence.cc        | 29 ++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
index 033fd80a5..34b1058a9 100644
--- a/test/syscalls/linux/socket_bind_to_device_sequence.cc
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -97,6 +97,16 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
     sockets_to_close_.erase(socket_id);
   }
 
+  // SetDevice changes the bind_to_device option. It does not bind or re-bind.
+  void SetDevice(int socket_id, int device_id) {
+    auto socket_fd = sockets_to_close_[socket_id]->get();
+    string device_name;
+    ASSERT_NO_FATAL_FAILURE(GetDevice(device_id, &device_name));
+    EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE,
+                           device_name.c_str(), device_name.size() + 1),
+                SyscallSucceedsWithValue(0));
+  }
+
   // Bind a socket with the reuse options and bind_to_device options. Checks
   // that all steps succeed and that the bind command's error matches want.
   // Sets the socket_id to uniquely identify the socket bound if it is not
@@ -474,6 +484,25 @@ TEST_P(BindToDeviceSequenceTest,
                                      /* bind_to_device */ 0));
 }
 
+// Repro test for gvisor.dev/issue/1217. Not replicated in ports_test.go as this
+// test is different from the others and wouldn't fit well there.
+TEST_P(BindToDeviceSequenceTest, BindAndReleaseDifferentDevice) {
+  int to_release;
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 3, 0, &to_release));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 3, EADDRINUSE));
+  // Change the device. Since the socket was already bound, this should have no
+  // effect.
+  SetDevice(to_release, 2);
+  // Release the bind to device 3 and try again.
+  ASSERT_NO_FATAL_FAILURE(ReleaseSocket(to_release));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 3));
+}
+
 INSTANTIATE_TEST_SUITE_P(BindToDeviceTest, BindToDeviceSequenceTest,
                          ::testing::Values(IPv4UDPUnboundSocket(0),
                                            IPv4TCPUnboundSocket(0)));
-- 
cgit v1.2.3


From 4a19ebd431659578c9af0a91ff35d8b6d9de190e Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 10 Dec 2019 09:32:47 -0800
Subject: Add hostinet tests for sendmsg and recvmsg with TOS/TCLASS.

PiperOrigin-RevId: 284786069
---
 test/syscalls/linux/udp_socket_test_cases.cc | 149 +++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)

diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 4556f16d6..dc35c2f50 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1345,5 +1345,154 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
   ASSERT_EQ(tv.tv_usec, tv2.tv_usec);
 }
 
+// Test that a socket with IP_TOS or IPV6_TCLASS set will set the TOS byte on
+// outgoing packets, and that a receiving socket with IP_RECVTOS or
+// IPV6_RECVTCLASS will create the corresponding control message.
+TEST_P(UdpSocketTest, SetAndReceiveTOS) {
+  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
+  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Allow socket to receive control message.
+  int recv_level = SOL_IP;
+  int recv_type = IP_RECVTOS;
+  if (GetParam() != AddressFamily::kIpv4) {
+    recv_level = SOL_IPV6;
+    recv_type = IPV6_RECVTCLASS;
+  }
+  ASSERT_THAT(
+      setsockopt(s_, recv_level, recv_type, &kSockOptOn, sizeof(kSockOptOn)),
+      SyscallSucceeds());
+
+  // Set socket TOS.
+  int sent_level = recv_level;
+  int sent_type = IP_TOS;
+  if (sent_level == SOL_IPV6) {
+    sent_type = IPV6_TCLASS;
+  }
+  int sent_tos = IPTOS_LOWDELAY;  // Choose some TOS value.
+  ASSERT_THAT(
+      setsockopt(t_, sent_level, sent_type, &sent_tos, sizeof(sent_tos)),
+      SyscallSucceeds());
+
+  // Prepare message to send.
+  constexpr size_t kDataLength = 1024;
+  struct msghdr sent_msg = {};
+  struct iovec sent_iov = {};
+  char sent_data[kDataLength];
+  sent_iov.iov_base = &sent_data[0];
+  sent_iov.iov_len = kDataLength;
+  sent_msg.msg_iov = &sent_iov;
+  sent_msg.msg_iovlen = 1;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(t_, &sent_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  // Receive message.
+  struct msghdr received_msg = {};
+  struct iovec received_iov = {};
+  char received_data[kDataLength];
+  received_iov.iov_base = &received_data[0];
+  received_iov.iov_len = kDataLength;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+  size_t cmsg_data_len = sizeof(int8_t);
+  if (sent_type == IPV6_TCLASS) {
+    cmsg_data_len = sizeof(int);
+  }
+  std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+  received_msg.msg_control = &received_cmsgbuf[0];
+  received_msg.msg_controllen = received_cmsgbuf.size();
+  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &received_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+  ASSERT_NE(cmsg, nullptr);
+  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+  EXPECT_EQ(cmsg->cmsg_level, sent_level);
+  EXPECT_EQ(cmsg->cmsg_type, sent_type);
+  int8_t received_tos = 0;
+  memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
+  EXPECT_EQ(received_tos, sent_tos);
+}
+
+// Test that sendmsg with IP_TOS and IPV6_TCLASS control messages will set the
+// TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
+// IPV6_RECVTCLASS will create the corresponding control message.
+TEST_P(UdpSocketTest, SendAndReceiveTOS) {
+  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
+  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Allow socket to receive control message.
+  int recv_level = SOL_IP;
+  int recv_type = IP_RECVTOS;
+  if (GetParam() != AddressFamily::kIpv4) {
+    recv_level = SOL_IPV6;
+    recv_type = IPV6_RECVTCLASS;
+  }
+  int recv_opt = kSockOptOn;
+  ASSERT_THAT(
+      setsockopt(s_, recv_level, recv_type, &recv_opt, sizeof(recv_opt)),
+      SyscallSucceeds());
+
+  // Prepare message to send.
+  constexpr size_t kDataLength = 1024;
+  int sent_level = recv_level;
+  int sent_type = IP_TOS;
+  int sent_tos = IPTOS_LOWDELAY;  // Choose some TOS value.
+
+  struct msghdr sent_msg = {};
+  struct iovec sent_iov = {};
+  char sent_data[kDataLength];
+  sent_iov.iov_base = &sent_data[0];
+  sent_iov.iov_len = kDataLength;
+  sent_msg.msg_iov = &sent_iov;
+  sent_msg.msg_iovlen = 1;
+  size_t cmsg_data_len = sizeof(int8_t);
+  if (sent_level == SOL_IPV6) {
+    sent_type = IPV6_TCLASS;
+    cmsg_data_len = sizeof(int);
+  }
+  std::vector<char> sent_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+  sent_msg.msg_control = &sent_cmsgbuf[0];
+  sent_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+
+  // Manually add control message.
+  struct cmsghdr* sent_cmsg = CMSG_FIRSTHDR(&sent_msg);
+  sent_cmsg->cmsg_len = CMSG_LEN(cmsg_data_len);
+  sent_cmsg->cmsg_level = sent_level;
+  sent_cmsg->cmsg_type = sent_type;
+  *(int8_t*)CMSG_DATA(sent_cmsg) = sent_tos;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(t_, &sent_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  // Receive message.
+  struct msghdr received_msg = {};
+  struct iovec received_iov = {};
+  char received_data[kDataLength];
+  received_iov.iov_base = &received_data[0];
+  received_iov.iov_len = kDataLength;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+  std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+  received_msg.msg_control = &received_cmsgbuf[0];
+  received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &received_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+  ASSERT_NE(cmsg, nullptr);
+  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+  EXPECT_EQ(cmsg->cmsg_level, sent_level);
+  EXPECT_EQ(cmsg->cmsg_type, sent_type);
+  int8_t received_tos = 0;
+  memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
+  EXPECT_EQ(received_tos, sent_tos);
+}
+
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From aadbf322c63b0aa1d34cd9755dc1266af2e5ac58 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 10 Dec 2019 09:36:52 -0800
Subject: Disable execveat test that is causing files in /bin to be deleted.

Disable until gvisor.dev/issue/1366 is resolved.

Updates #1366

PiperOrigin-RevId: 284786895
---
 test/syscalls/linux/exec.cc | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index b5e0a512b..e402d5b27 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -696,15 +696,6 @@ TEST(ExecveatTest, SymlinkNoFollowAndEmptyPath) {
                 ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n"));
 }
 
-TEST(ExecveatTest, SymlinkNoFollowIgnoreSymlinkAncestor) {
-  TempPath parent_link =
-      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateSymlinkTo("/tmp", "/bin"));
-  std::string path_with_symlink = JoinPath(parent_link.path(), "echo");
-
-  CheckExecveat(AT_FDCWD, path_with_symlink, {path_with_symlink}, {},
-                AT_SYMLINK_NOFOLLOW, ArgEnvExitStatus(0, 0), "");
-}
-
 TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) {
   const FileDescriptor dirfd =
       ASSERT_NO_ERRNO_AND_VALUE(Open("/bin", O_DIRECTORY));
-- 
cgit v1.2.3


From 30f7316dc4a5fe0346c6e2e8e6854bd48a316a82 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 10 Dec 2019 09:59:36 -0800
Subject: Make comments clearer for control message handling.

PiperOrigin-RevId: 284791600
---
 pkg/sentry/socket/control/control.go | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 782a3cb92..fa3188d51 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -195,15 +195,15 @@ func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([
 	// the available space, we must align down.
 	//
 	// align must be >= 4 and each data int32 is 4 bytes. The length of the
-	// header is already aligned, so if we align to the with of the data there
+	// header is already aligned, so if we align to the width of the data there
 	// are two cases:
 	// 1. The aligned length is less than the length of the header. The
 	// unaligned length was also less than the length of the header, so we
 	// can't write anything.
 	// 2. The aligned length is greater than or equal to the length of the
-	// header. We can write the header plus zero or more datas. We can't write
-	// a partial int32, so the length of the message will be
-	// min(aligned length, header + datas).
+	// header. We can write the header plus zero or more bytes of data. We can't
+	// write a partial int32, so the length of the message will be
+	// min(aligned length, header + data).
 	if space < linux.SizeOfControlMessageHeader {
 		flags |= linux.MSG_CTRUNC
 		return buf, flags
@@ -240,12 +240,12 @@ func putCmsgStruct(buf []byte, msgLevel, msgType uint32, align uint, data interf
 
 	buf = binary.Marshal(buf, usermem.ByteOrder, data)
 
-	// Check if we went over.
+	// If the control message data brought us over capacity, omit it.
 	if cap(buf) != cap(ob) {
 		return hdrBuf
 	}
 
-	// Fix up length.
+	// Update control message length to include data.
 	putUint64(ob, uint64(len(buf)-len(ob)))
 
 	return alignSlice(buf, align)
-- 
cgit v1.2.3


From c15be3f8cfde692e94dbb936fc4111bc1560502c Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 10 Dec 2019 10:25:30 -0800
Subject: Add all upstream syscalls to tables

Package strace is missing some syscalls we actually implement (e.g.,
getrandom). We also see newer syscalls sometimes (e.g., membarrier) that would
be handy to have formatted.

Let's go ahead and add all syscalls in the latest upstream release (v5.4), even
though we only intend to implement v4.4. None of them are implemented, just
included as placeholders.

PiperOrigin-RevId: 284797577
---
 pkg/sentry/strace/linux64.go               | 28 ++++++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/linux64_amd64.go | 22 +++++++++++++++++++++-
 pkg/sentry/syscalls/linux/linux64_arm64.go | 19 +++++++++++++++++++
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go
index 5d57b75af..f2763b3f1 100644
--- a/pkg/sentry/strace/linux64.go
+++ b/pkg/sentry/strace/linux64.go
@@ -335,5 +335,33 @@ var linuxAMD64 = SyscallMap{
 	315: makeSyscallInfo("sched_getattr", Hex, Hex, Hex),
 	316: makeSyscallInfo("renameat2", FD, Path, Hex, Path, Hex),
 	317: makeSyscallInfo("seccomp", Hex, Hex, Hex),
+	318: makeSyscallInfo("getrandom", Hex, Hex, Hex),
+	319: makeSyscallInfo("memfd_create", Path, Hex), // Not quite a path, but close.
+	320: makeSyscallInfo("kexec_file_load", FD, FD, Hex, Hex, Hex),
+	321: makeSyscallInfo("bpf", Hex, Hex, Hex),
+	322: makeSyscallInfo("execveat", FD, Path, ExecveStringVector, ExecveStringVector, Hex),
+	323: makeSyscallInfo("userfaultfd", Hex),
+	324: makeSyscallInfo("membarrier", Hex, Hex),
+	325: makeSyscallInfo("mlock2", Hex, Hex, Hex),
+	326: makeSyscallInfo("copy_file_range", FD, Hex, FD, Hex, Hex, Hex),
+	327: makeSyscallInfo("preadv2", FD, ReadIOVec, Hex, Hex, Hex),
+	328: makeSyscallInfo("pwritev2", FD, WriteIOVec, Hex, Hex, Hex),
+	329: makeSyscallInfo("pkey_mprotect", Hex, Hex, Hex, Hex),
+	330: makeSyscallInfo("pkey_alloc", Hex, Hex),
+	331: makeSyscallInfo("pkey_free", Hex),
 	332: makeSyscallInfo("statx", FD, Path, Hex, Hex, Hex),
+	333: makeSyscallInfo("io_pgetevents", Hex, Hex, Hex, Hex, Timespec, SigSet),
+	334: makeSyscallInfo("rseq", Hex, Hex, Hex, Hex),
+	424: makeSyscallInfo("pidfd_send_signal", FD, Signal, Hex, Hex),
+	425: makeSyscallInfo("io_uring_setup", Hex, Hex),
+	426: makeSyscallInfo("io_uring_enter", FD, Hex, Hex, Hex, SigSet, Hex),
+	427: makeSyscallInfo("io_uring_register", FD, Hex, Hex, Hex),
+	428: makeSyscallInfo("open_tree", FD, Path, Hex),
+	429: makeSyscallInfo("move_mount", FD, Path, FD, Path, Hex),
+	430: makeSyscallInfo("fsopen", Path, Hex), // Not quite a path, but close.
+	431: makeSyscallInfo("fsconfig", FD, Hex, Hex, Hex, Hex),
+	432: makeSyscallInfo("fsmount", FD, Hex, Hex),
+	433: makeSyscallInfo("fspick", FD, Path, Hex),
+	434: makeSyscallInfo("pidfd_open", Hex, Hex),
+	435: makeSyscallInfo("clone3", Hex, Hex),
 }
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 5642d69ea..797542d28 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -367,11 +367,31 @@ var AMD64 = &kernel.SyscallTable{
 		324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
 		325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
 
-		// Syscalls after 325 are "backports" from versions of Linux after 4.4.
+		// Syscalls implemented after 325 are "backports" from versions
+		// of Linux after 4.4.
 		326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
 		327: syscalls.Supported("preadv2", Preadv2),
 		328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
+		329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
+		330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
+		331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
 		332: syscalls.Supported("statx", Statx),
+		333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
+		334: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil),
+
+		// Linux skips ahead to syscall 424 to sync numbers between arches.
+		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
+		425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
+		426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
+		427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
+		428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
+		429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
+		430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
+		431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
+		432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
+		433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
+		434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
+		435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
 	},
 
 	Emulate: map[usermem.Addr]uintptr{
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index f897bfff8..2bc7faff5 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -302,7 +302,26 @@ var ARM64 = &kernel.SyscallTable{
 		285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
 		286: syscalls.Supported("preadv2", Preadv2),
 		287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
+		288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
+		289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
+		290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
 		291: syscalls.Supported("statx", Statx),
+		292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
+		293: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil),
+
+		// Linux skips ahead to syscall 424 to sync numbers between arches.
+		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
+		425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
+		426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
+		427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
+		428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
+		429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
+		430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
+		431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
+		432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
+		433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
+		434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
+		435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
 	},
 	Emulate: map[usermem.Addr]uintptr{},
 
-- 
cgit v1.2.3


From f47eaffd5c59445b8cafda1b7a51e7f4be5d254a Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 10 Dec 2019 10:55:08 -0800
Subject: Do not consider symlinks as directories in fs utils.

IsDirectory() is used in RecursivelyDelete(), which should not follow symlinks.
The only other use (syscalls/linux/rename.cc) is not affected by this change.

Updates #1366.

PiperOrigin-RevId: 284803968
---
 test/util/fs_util.cc | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc
index 88b1e7911..042cec94a 100644
--- a/test/util/fs_util.cc
+++ b/test/util/fs_util.cc
@@ -105,6 +105,15 @@ PosixErrorOr<struct stat> Stat(absl::string_view path) {
   return stat_buf;
 }
 
+PosixErrorOr<struct stat> Lstat(absl::string_view path) {
+  struct stat stat_buf;
+  int res = lstat(std::string(path).c_str(), &stat_buf);
+  if (res < 0) {
+    return PosixError(errno, absl::StrCat("lstat ", path));
+  }
+  return stat_buf;
+}
+
 PosixErrorOr<struct stat> Fstat(int fd) {
   struct stat stat_buf;
   int res = fstat(fd, &stat_buf);
@@ -127,7 +136,7 @@ PosixErrorOr<bool> Exists(absl::string_view path) {
 }
 
 PosixErrorOr<bool> IsDirectory(absl::string_view path) {
-  ASSIGN_OR_RETURN_ERRNO(struct stat stat_buf, Stat(path));
+  ASSIGN_OR_RETURN_ERRNO(struct stat stat_buf, Lstat(path));
   if (S_ISDIR(stat_buf.st_mode)) {
     return true;
   }
-- 
cgit v1.2.3


From f6e87be82f189d7d2176dc2ca4a2d261481a032a Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 10 Dec 2019 10:56:51 -0800
Subject: Let socket.ControlMessages Release() the underlying
 transport.ControlMessages.

PiperOrigin-RevId: 284804370
---
 pkg/sentry/socket/socket.go             | 5 +++++
 pkg/sentry/syscalls/linux/sys_socket.go | 8 ++++----
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 8c250c325..2389a9cdb 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -43,6 +43,11 @@ type ControlMessages struct {
 	IP   tcpip.ControlMessages
 }
 
+// Release releases Unix domain socket credentials and rights.
+func (c *ControlMessages) Release() {
+	c.Unix.Release()
+}
+
 // Socket is the interface containing socket syscalls used by the syscall layer
 // to redirect them to the appropriate implementation.
 type Socket interface {
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index d8acae063..e3faa890b 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -764,7 +764,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
 		}
 		if !cms.Unix.Empty() {
 			mflags |= linux.MSG_CTRUNC
-			cms.Unix.Release()
+			cms.Release()
 		}
 
 		if int(msg.Flags) != mflags {
@@ -784,7 +784,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
 	if e != nil {
 		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
 	}
-	defer cms.Unix.Release()
+	defer cms.Release()
 
 	controlData := make([]byte, 0, msg.ControlLen)
 
@@ -885,7 +885,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag
 	}
 
 	n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
-	cm.Unix.Release()
+	cm.Release()
 	if e != nil {
 		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
 	}
@@ -1071,7 +1071,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
 	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
 	err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
 	if err != nil {
-		controlMessages.Unix.Release()
+		controlMessages.Release()
 	}
 	return uintptr(n), err
 }
-- 
cgit v1.2.3


From 769e1cdcbe539ca2347ad5ccd2706ae17777aed9 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 10 Dec 2019 11:40:29 -0800
Subject: Re-enable execveat test that was causing files in /bin to be deleted.

Test now no longer deletes files incorrectly, due to a fix in fs utils
used by TempPath (github.com/google/gvisor/pull/1368).

Fixes #1366

PiperOrigin-RevId: 284814605
---
 test/syscalls/linux/exec.cc | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index e402d5b27..b5e0a512b 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -696,6 +696,15 @@ TEST(ExecveatTest, SymlinkNoFollowAndEmptyPath) {
                 ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n"));
 }
 
+TEST(ExecveatTest, SymlinkNoFollowIgnoreSymlinkAncestor) {
+  TempPath parent_link =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateSymlinkTo("/tmp", "/bin"));
+  std::string path_with_symlink = JoinPath(parent_link.path(), "echo");
+
+  CheckExecveat(AT_FDCWD, path_with_symlink, {path_with_symlink}, {},
+                AT_SYMLINK_NOFOLLOW, ArgEnvExitStatus(0, 0), "");
+}
+
 TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) {
   const FileDescriptor dirfd =
       ASSERT_NO_ERRNO_AND_VALUE(Open("/bin", O_DIRECTORY));
-- 
cgit v1.2.3


From 39386d78bb9636e52d6a0487d5fa7bff6beab64e Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 10 Dec 2019 13:04:32 -0800
Subject: Format fd_set parameters in select(2)/pselect(2) for strace.

I1202 14:55:06.835076    7991 x:0] [   1] select_test E
  select(0xa, 0x7fc6ce924c28 [0 1], null, null, 0x7fc6ce924c08 {sec=0 usec=0})
I1202 14:55:06.835102    7991 x:0] [   1] select_test X
  select(0xa, 0x7fc6ce924c28 [0 1], null, null, 0x7fc6ce924c08 {sec=0 usec=0})

PiperOrigin-RevId: 284831805
---
 pkg/sentry/strace/BUILD               |  1 +
 pkg/sentry/strace/linux64.go          |  4 +-
 pkg/sentry/strace/select.go           | 53 ++++++++++++++++++++++++++
 pkg/sentry/strace/strace.go           |  2 +
 pkg/sentry/strace/syscalls.go         |  4 ++
 pkg/sentry/syscalls/linux/sys_poll.go | 71 +++++++++++++++++------------------
 6 files changed, 96 insertions(+), 39 deletions(-)
 create mode 100644 pkg/sentry/strace/select.go

diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 72ebf766d..d46421199 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -14,6 +14,7 @@ go_library(
         "open.go",
         "poll.go",
         "ptrace.go",
+        "select.go",
         "signal.go",
         "socket.go",
         "strace.go",
diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go
index f2763b3f1..e603f858f 100644
--- a/pkg/sentry/strace/linux64.go
+++ b/pkg/sentry/strace/linux64.go
@@ -40,7 +40,7 @@ var linuxAMD64 = SyscallMap{
 	20:  makeSyscallInfo("writev", FD, WriteIOVec, Hex),
 	21:  makeSyscallInfo("access", Path, Oct),
 	22:  makeSyscallInfo("pipe", PipeFDs),
-	23:  makeSyscallInfo("select", Hex, Hex, Hex, Hex, Timeval),
+	23:  makeSyscallInfo("select", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timeval),
 	24:  makeSyscallInfo("sched_yield"),
 	25:  makeSyscallInfo("mremap", Hex, Hex, Hex, Hex, Hex),
 	26:  makeSyscallInfo("msync", Hex, Hex, Hex),
@@ -287,7 +287,7 @@ var linuxAMD64 = SyscallMap{
 	267: makeSyscallInfo("readlinkat", FD, Path, ReadBuffer, Hex),
 	268: makeSyscallInfo("fchmodat", FD, Path, Mode),
 	269: makeSyscallInfo("faccessat", FD, Path, Oct, Hex),
-	270: makeSyscallInfo("pselect6", Hex, Hex, Hex, Hex, Hex, Hex),
+	270: makeSyscallInfo("pselect6", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timespec, SigSet),
 	271: makeSyscallInfo("ppoll", PollFDs, Hex, Timespec, SigSet, Hex),
 	272: makeSyscallInfo("unshare", CloneFlags),
 	273: makeSyscallInfo("set_robust_list", Hex, Hex),
diff --git a/pkg/sentry/strace/select.go b/pkg/sentry/strace/select.go
new file mode 100644
index 000000000..92c18083d
--- /dev/null
+++ b/pkg/sentry/strace/select.go
@@ -0,0 +1,53 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package strace
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+func fdsFromSet(t *kernel.Task, set []byte) []int {
+	var fds []int
+	// Append n if the n-th bit is 1.
+	for i, v := range set {
+		for j := 0; j < 8; j++ {
+			if (v>>uint(j))&1 == 1 {
+				fds = append(fds, i*8+j)
+			}
+		}
+	}
+	return fds
+}
+
+func fdSet(t *kernel.Task, nfds int, addr usermem.Addr) string {
+	if addr == 0 {
+		return "null"
+	}
+
+	// Calculate the size of the fd set (one bit per fd).
+	nBytes := (nfds + 7) / 8
+	nBitsInLastPartialByte := uint(nfds % 8)
+
+	set, err := linux.CopyInFDSet(t, addr, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return fmt.Sprintf("%#x (error decoding fdset: %s)", addr, err)
+	}
+
+	return fmt.Sprintf("%#x %v", addr, fdsFromSet(t, set))
+}
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 311389547..629c1f308 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -439,6 +439,8 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo
 			output = append(output, capData(t, args[arg-1].Pointer(), args[arg].Pointer()))
 		case PollFDs:
 			output = append(output, pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), false))
+		case SelectFDSet:
+			output = append(output, fdSet(t, int(args[0].Int()), args[arg].Pointer()))
 		case Oct:
 			output = append(output, "0o"+strconv.FormatUint(args[arg].Uint64(), 8))
 		case Hex:
diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go
index 3c389d375..e5d486c4e 100644
--- a/pkg/sentry/strace/syscalls.go
+++ b/pkg/sentry/strace/syscalls.go
@@ -206,6 +206,10 @@ const (
 	// PollFDs is an array of struct pollfd. The number of entries in the
 	// array is in the next argument.
 	PollFDs
+
+	// SelectFDSet is an fd_set argument in select(2)/pselect(2). The number of
+	// fds represented must be the first argument.
+	SelectFDSet
 )
 
 // defaultFormat is the syscall argument format to use if the actual format is
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index 7a13beac2..631dffec6 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -197,53 +197,51 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration)
 	return remainingTimeout, n, err
 }
 
+// CopyInFDSet copies an fd set from select(2)/pselect(2).
+func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes int, nBitsInLastPartialByte uint) ([]byte, error) {
+	set := make([]byte, nBytes)
+
+	if addr != 0 {
+		if _, err := t.CopyIn(addr, &set); err != nil {
+			return nil, err
+		}
+		// If we only use part of the last byte, mask out the extraneous bits.
+		//
+		// N.B. This only works on little-endian architectures.
+		if nBitsInLastPartialByte != 0 {
+			set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte
+		}
+	}
+	return set, nil
+}
+
 func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
 	if nfds < 0 || nfds > fileCap {
 		return 0, syserror.EINVAL
 	}
 
-	// Capture all the provided input vectors.
-	//
-	// N.B. This only works on little-endian architectures.
-	byteCount := (nfds + 7) / 8
-
-	bitsInLastPartialByte := uint(nfds % 8)
-	r := make([]byte, byteCount)
-	w := make([]byte, byteCount)
-	e := make([]byte, byteCount)
+	// Calculate the size of the fd sets (one bit per fd).
+	nBytes := (nfds + 7) / 8
+	nBitsInLastPartialByte := uint(nfds % 8)
 
-	if readFDs != 0 {
-		if _, err := t.CopyIn(readFDs, &r); err != nil {
-			return 0, err
-		}
-		// Mask out bits above nfds.
-		if bitsInLastPartialByte != 0 {
-			r[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
-		}
+	// Capture all the provided input vectors.
+	r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
 	}
-
-	if writeFDs != 0 {
-		if _, err := t.CopyIn(writeFDs, &w); err != nil {
-			return 0, err
-		}
-		if bitsInLastPartialByte != 0 {
-			w[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
-		}
+	w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
 	}
-
-	if exceptFDs != 0 {
-		if _, err := t.CopyIn(exceptFDs, &e); err != nil {
-			return 0, err
-		}
-		if bitsInLastPartialByte != 0 {
-			e[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
-		}
+	e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
 	}
 
 	// Count how many FDs are actually being requested so that we can build
 	// a PollFD array.
 	fdCount := 0
-	for i := 0; i < byteCount; i++ {
+	for i := 0; i < nBytes; i++ {
 		v := r[i] | w[i] | e[i]
 		for v != 0 {
 			v &= (v - 1)
@@ -254,7 +252,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
 	// Build the PollFD array.
 	pfd := make([]linux.PollFD, 0, fdCount)
 	var fd int32
-	for i := 0; i < byteCount; i++ {
+	for i := 0; i < nBytes; i++ {
 		rV, wV, eV := r[i], w[i], e[i]
 		v := rV | wV | eV
 		m := byte(1)
@@ -295,8 +293,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
 	}
 
 	// Do the syscall, then count the number of bits set.
-	_, _, err := pollBlock(t, pfd, timeout)
-	if err != nil {
+	if _, _, err = pollBlock(t, pfd, timeout); err != nil {
 		return 0, syserror.ConvertIntr(err, syserror.EINTR)
 	}
 
-- 
cgit v1.2.3


From 87337e92e3a65e69f6bfc6ac7d162c1b6ed18048 Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Tue, 10 Dec 2019 13:24:11 -0800
Subject: Add Kokoro configs for publishing Kythe xrefs.

PiperOrigin-RevId: 284835614
---
 kokoro/kythe/generate_xrefs.cfg | 28 ++++++++++++++++++++
 kokoro/kythe/generate_xrefs.sh  | 57 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 kokoro/kythe/generate_xrefs.cfg
 create mode 100644 kokoro/kythe/generate_xrefs.sh

diff --git a/kokoro/kythe/generate_xrefs.cfg b/kokoro/kythe/generate_xrefs.cfg
new file mode 100644
index 000000000..03e65c54e
--- /dev/null
+++ b/kokoro/kythe/generate_xrefs.cfg
@@ -0,0 +1,28 @@
+build_file: "gvisor/kokoro/kythe/generate_xrefs.sh"
+
+before_action {
+  fetch_keystore {
+    keystore_resource {
+      keystore_config_id: 73898
+      keyname: "kokoro-rbe-service-account"
+    }
+  }
+}
+
+bazel_setting {
+  project_id: "gvisor-rbe"
+  local_execution: false
+  auth_credential: {
+    keystore_config_id: 73898
+    keyname: "kokoro-rbe-service-account"
+  }
+  bes_backend_address: "buildeventservice.googleapis.com"
+  foundry_backend_address: "remotebuildexecution.googleapis.com"
+  upsalite_frontend_address: "https://source.cloud.google.com"
+}
+
+action {
+  define_artifacts {
+    regex: "*.kzip"
+  }
+}
diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
new file mode 100644
index 000000000..471b90e92
--- /dev/null
+++ b/kokoro/kythe/generate_xrefs.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -ex
+
+# Install the latest version of Bazel. The default on Kokoro images is out of
+# date.
+if command -v use_bazel.sh >/dev/null; then
+  use_bazel.sh latest
+fi
+bazel version
+
+# We need to use python 3.6 (the Kokoro PY3 default is 3.4) to compile `//...`
+# because benchmarktools requires a version of `requests` that is not a
+# available in 3.4.
+pyenv versions
+pyenv global 3.6.1
+
+readonly KYTHE_VERSION='v0.0.37'
+readonly WORKDIR="$(mktemp -d)"
+readonly KYTHE_DIR="${WORKDIR}/kythe-${KYTHE_VERSION}"
+if [[ -n "$KOKORO_GIT_COMMIT" ]]; then
+  readonly KZIP_FILENAME="${KOKORO_ARTIFACTS_DIR}/${KOKORO_GIT_COMMIT}.kzip"
+else
+  readonly KZIP_FILENAME="$(git rev-parse HEAD).kzip"
+fi
+
+wget -q -O "${WORKDIR}/kythe.tar.gz" \
+  "https://github.com/kythe/kythe/releases/download/${KYTHE_VERSION}/kythe-${KYTHE_VERSION}.tar.gz"
+tar --no-same-owner -xzf "${WORKDIR}/kythe.tar.gz" --directory "$WORKDIR"
+
+if [[ -n "$KOKORO_ARTIFACTS_DIR" ]]; then
+  cd "${KOKORO_ARTIFACTS_DIR}/github/gvisor"
+fi
+bazel \
+  --bazelrc="${KYTHE_DIR}/extractors.bazelrc" \
+  build \
+  --override_repository kythe_release="${KYTHE_DIR}" \
+  --define=kythe_corpus=gvisor.dev \
+  //...
+
+"${KYTHE_DIR}/tools/kzip" merge \
+  --output "$KZIP_FILENAME" \
+  $(find -L bazel-out/*/extra_actions/ -name '*.kzip')
-- 
cgit v1.2.3


From a0aa784ecfb0d5ef94decf3c2be3e1cd44b6cbc6 Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Tue, 10 Dec 2019 14:54:04 -0800
Subject: Remove pyenv calls but log the python 3 version in use.

Apparently our Kokoro VM images don't have pyenv -- I previously tested this on
the Kokoro QA shared pool.

PiperOrigin-RevId: 284855160
---
 kokoro/kythe/generate_xrefs.sh | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
index 471b90e92..49186eeeb 100644
--- a/kokoro/kythe/generate_xrefs.sh
+++ b/kokoro/kythe/generate_xrefs.sh
@@ -23,11 +23,7 @@ if command -v use_bazel.sh >/dev/null; then
 fi
 bazel version
 
-# We need to use python 3.6 (the Kokoro PY3 default is 3.4) to compile `//...`
-# because benchmarktools requires a version of `requests` that is not a
-# available in 3.4.
-pyenv versions
-pyenv global 3.6.1
+python3 -V
 
 readonly KYTHE_VERSION='v0.0.37'
 readonly WORKDIR="$(mktemp -d)"
-- 
cgit v1.2.3


From 4ff71b5be462f3f16808abea11de1f5e01567e5d Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 10 Dec 2019 18:03:43 -0800
Subject: Inform the integrator on receipt of an NDP Recursive DNS Server
 option

This change adds support to let an integrator know when it receives an NDP
Router Advertisement message with the NDP Recursive DNS Server option with at
least one DNS server's address. The stack will not maintain any state related to
the DNS servers - the integrator is expected to maintain any required state and
invalidate the servers after its valid lifetime expires, or refresh the lifetime
when a new one is received for a known DNS server.

Test: Unittest to make sure that an event is sent to the integrator when an NDP
Recursive DNS Server option is received with at least one address.
PiperOrigin-RevId: 284890502
---
 pkg/tcpip/stack/ndp.go      |  32 ++++++---
 pkg/tcpip/stack/ndp_test.go | 172 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 196 insertions(+), 8 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 75d3ecdac..060a2e7c6 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -181,6 +181,17 @@ type NDPDispatcher interface {
 	// This function is not permitted to block indefinitely. It must not
 	// call functions on the stack itself.
 	OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix)
+
+	// OnRecursiveDNSServerOption will be called when an NDP option with
+	// recursive DNS servers has been received. Note, addrs may contain
+	// link-local addresses.
+	//
+	// It is up to the caller to use the DNS Servers only for their valid
+	// lifetime. OnRecursiveDNSServerOption may be called for new or
+	// already known DNS servers. If called with known DNS servers, their
+	// valid lifetimes must be refreshed to lifetime (it may be increased,
+	// decreased, or completely invalidated when lifetime = 0).
+	OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration)
 }
 
 // NDPConfigurations is the NDP configurations for the netstack.
@@ -617,11 +628,16 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 	// we do not check the iterator for errors on calls to Next.
 	it, _ := ra.Options().Iter(false)
 	for opt, done, _ := it.Next(); !done; opt, done, _ = it.Next() {
-		switch opt.Type() {
-		case header.NDPPrefixInformationType:
-			pi := opt.(header.NDPPrefixInformation)
+		switch opt := opt.(type) {
+		case header.NDPRecursiveDNSServer:
+			if ndp.nic.stack.ndpDisp == nil {
+				continue
+			}
+
+			ndp.nic.stack.ndpDisp.OnRecursiveDNSServerOption(ndp.nic.ID(), opt.Addresses(), opt.Lifetime())
 
-			prefix := pi.Subnet()
+		case header.NDPPrefixInformation:
+			prefix := opt.Subnet()
 
 			// Is the prefix a link-local?
 			if header.IsV6LinkLocalAddress(prefix.ID()) {
@@ -637,12 +653,12 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 				continue
 			}
 
-			if pi.OnLinkFlag() {
-				ndp.handleOnLinkPrefixInformation(pi)
+			if opt.OnLinkFlag() {
+				ndp.handleOnLinkPrefixInformation(opt)
 			}
 
-			if pi.AutonomousAddressConfigurationFlag() {
-				ndp.handleAutonomousPrefixInformation(pi)
+			if opt.AutonomousAddressConfigurationFlag() {
+				ndp.handleAutonomousPrefixInformation(opt)
 			}
 		}
 
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index e9aa20148..8d811eb8e 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -141,6 +141,16 @@ type ndpAutoGenAddrEvent struct {
 	eventType ndpAutoGenAddrEventType
 }
 
+type ndpRDNSS struct {
+	addrs    []tcpip.Address
+	lifetime time.Duration
+}
+
+type ndpRDNSSEvent struct {
+	nicID tcpip.NICID
+	rdnss ndpRDNSS
+}
+
 var _ stack.NDPDispatcher = (*ndpDispatcher)(nil)
 
 // ndpDispatcher implements NDPDispatcher so tests can know when various NDP
@@ -152,6 +162,7 @@ type ndpDispatcher struct {
 	prefixC        chan ndpPrefixEvent
 	rememberPrefix bool
 	autoGenAddrC   chan ndpAutoGenAddrEvent
+	rdnssC         chan ndpRDNSSEvent
 	routeTable     []tcpip.Route
 }
 
@@ -286,6 +297,19 @@ func (n *ndpDispatcher) OnAutoGenAddressInvalidated(nicID tcpip.NICID, addr tcpi
 	}
 }
 
+// Implements stack.NDPDispatcher.OnRecursiveDNSServerOption.
+func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration) {
+	if n.rdnssC != nil {
+		n.rdnssC <- ndpRDNSSEvent{
+			nicID,
+			ndpRDNSS{
+				addrs,
+				lifetime,
+			},
+		}
+	}
+}
+
 // TestDADResolve tests that an address successfully resolves after performing
 // DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
 // Included in the subtests is a test to make sure that an invalid
@@ -2075,3 +2099,151 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
 	}
 }
+
+// TestNDPRecursiveDNSServerDispatch tests that we properly dispatch an event
+// to the integrator when an RA is received with the NDP Recursive DNS Server
+// option with at least one valid address.
+func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name     string
+		opt      header.NDPRecursiveDNSServer
+		expected *ndpRDNSS
+	}{
+		{
+			"Unspecified",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+				0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			}),
+			nil,
+		},
+		{
+			"Multicast",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+				255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+			}),
+			nil,
+		},
+		{
+			"OptionTooSmall",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+				1, 2, 3, 4, 5, 6, 7, 8,
+			}),
+			nil,
+		},
+		{
+			"0Addresses",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+			}),
+			nil,
+		},
+		{
+			"Valid1Address",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1,
+			}),
+			&ndpRDNSS{
+				[]tcpip.Address{
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01",
+				},
+				2 * time.Second,
+			},
+		},
+		{
+			"Valid2Addresses",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 1,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 2,
+			}),
+			&ndpRDNSS{
+				[]tcpip.Address{
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01",
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x02",
+				},
+				time.Second,
+			},
+		},
+		{
+			"Valid3Addresses",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 0,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 2,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 3,
+			}),
+			&ndpRDNSS{
+				[]tcpip.Address{
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01",
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x02",
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x03",
+				},
+				0,
+			},
+		},
+	}
+
+	for _, test := range tests {
+		test := test
+
+		t.Run(test.name, func(t *testing.T) {
+			t.Parallel()
+
+			ndpDisp := ndpDispatcher{
+				// We do not expect more than a single RDNSS
+				// event at any time for this test.
+				rdnssC: make(chan ndpRDNSSEvent, 1),
+			}
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs: true,
+				},
+				NDPDisp: &ndpDisp,
+			})
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(1) = %s", err)
+			}
+
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, header.NDPOptionsSerializer{test.opt}))
+
+			if test.expected != nil {
+				select {
+				case e := <-ndpDisp.rdnssC:
+					if e.nicID != 1 {
+						t.Errorf("got rdnss nicID = %d, want = 1", e.nicID)
+					}
+					if diff := cmp.Diff(e.rdnss.addrs, test.expected.addrs); diff != "" {
+						t.Errorf("rdnss addrs mismatch (-want +got):\n%s", diff)
+					}
+					if e.rdnss.lifetime != test.expected.lifetime {
+						t.Errorf("got rdnss lifetime = %s, want = %s", e.rdnss.lifetime, test.expected.lifetime)
+					}
+				default:
+					t.Fatal("expected an RDNSS option event")
+				}
+			}
+
+			// Should have no more RDNSS options.
+			select {
+			case e := <-ndpDisp.rdnssC:
+				t.Fatalf("unexpectedly got a new RDNSS option event: %+v", e)
+			default:
+			}
+		})
+	}
+}
-- 
cgit v1.2.3


From 46651a7d26559bdc69d460bdeb4de5968212d615 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 10 Dec 2019 18:16:47 -0800
Subject: Add most VFS methods for syscalls.

PiperOrigin-RevId: 284892289
---
 pkg/abi/linux/file.go                             |  10 +-
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go |   6 +-
 pkg/sentry/fsimpl/ext/ext_test.go                 |  29 +-
 pkg/sentry/fsimpl/memfs/benchmark_test.go         |   2 +-
 pkg/sentry/fsimpl/memfs/pipe_test.go              |   6 +-
 pkg/sentry/vfs/BUILD                              |   1 -
 pkg/sentry/vfs/file_description.go                |  93 ++++++
 pkg/sentry/vfs/file_description_impl_util_test.go |  10 +-
 pkg/sentry/vfs/filesystem.go                      |  22 ++
 pkg/sentry/vfs/mount.go                           |  69 +++-
 pkg/sentry/vfs/options.go                         |  12 +
 pkg/sentry/vfs/syscalls.go                        | 237 --------------
 pkg/sentry/vfs/vfs.go                             | 378 ++++++++++++++++++++++
 13 files changed, 606 insertions(+), 269 deletions(-)
 delete mode 100644 pkg/sentry/vfs/syscalls.go

diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index c9ee098f4..0f014d27f 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -144,9 +144,13 @@ const (
 	ModeCharacterDevice = S_IFCHR
 	ModeNamedPipe       = S_IFIFO
 
-	ModeSetUID = 04000
-	ModeSetGID = 02000
-	ModeSticky = 01000
+	S_ISUID = 04000
+	S_ISGID = 02000
+	S_ISVTX = 01000
+
+	ModeSetUID = S_ISUID
+	ModeSetGID = S_ISGID
+	ModeSticky = S_ISVTX
 
 	ModeUserAll     = 0700
 	ModeUserRead    = 0400
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 94cd74095..177ce2cb9 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -81,7 +81,11 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf
 	ctx := contexttest.Context(b)
 	creds := auth.CredentialsFromContext(ctx)
 
-	if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}); err != nil {
+	if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: int(f.Fd()),
+		},
+	}); err != nil {
 		b.Fatalf("failed to mount tmpfs submount: %v", err)
 	}
 	return func() {
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 307e4d68c..e9f756732 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -147,55 +147,54 @@ func TestSeek(t *testing.T) {
 				t.Fatalf("vfsfs.OpenAt failed: %v", err)
 			}
 
-			if n, err := fd.Impl().Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
+			if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
 				t.Errorf("expected seek position 0, got %d and error %v", n, err)
 			}
 
-			stat, err := fd.Impl().Stat(ctx, vfs.StatOptions{})
+			stat, err := fd.Stat(ctx, vfs.StatOptions{})
 			if err != nil {
 				t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err)
 			}
 
 			// We should be able to seek beyond the end of file.
 			size := int64(stat.Size)
-			if n, err := fd.Impl().Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
+			if n, err := fd.Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
 				t.Errorf("expected seek position %d, got %d and error %v", size, n, err)
 			}
 
 			// EINVAL should be returned if the resulting offset is negative.
-			if _, err := fd.Impl().Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
+			if _, err := fd.Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
 				t.Errorf("expected error EINVAL but got %v", err)
 			}
 
-			if n, err := fd.Impl().Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
+			if n, err := fd.Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
 				t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err)
 			}
 
 			// Make sure negative offsets work with SEEK_CUR.
-			if n, err := fd.Impl().Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
+			if n, err := fd.Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
 				t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
 			}
 
 			// EINVAL should be returned if the resulting offset is negative.
-			if _, err := fd.Impl().Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
+			if _, err := fd.Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
 				t.Errorf("expected error EINVAL but got %v", err)
 			}
 
 			// Make sure SEEK_END works with regular files.
-			switch fd.Impl().(type) {
-			case *regularFileFD:
+			if _, ok := fd.Impl().(*regularFileFD); ok {
 				// Seek back to 0.
-				if n, err := fd.Impl().Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
+				if n, err := fd.Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
 					t.Errorf("expected seek position %d, got %d and error %v", 0, n, err)
 				}
 
 				// Seek forward beyond EOF.
-				if n, err := fd.Impl().Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
+				if n, err := fd.Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
 					t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
 				}
 
 				// EINVAL should be returned if the resulting offset is negative.
-				if _, err := fd.Impl().Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
+				if _, err := fd.Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
 					t.Errorf("expected error EINVAL but got %v", err)
 				}
 			}
@@ -456,7 +455,7 @@ func TestRead(t *testing.T) {
 			want := make([]byte, 1)
 			for {
 				n, err := f.Read(want)
-				fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{})
+				fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{})
 
 				if diff := cmp.Diff(got, want); diff != "" {
 					t.Errorf("file data mismatch (-want +got):\n%s", diff)
@@ -464,7 +463,7 @@ func TestRead(t *testing.T) {
 
 				// Make sure there is no more file data left after getting EOF.
 				if n == 0 || err == io.EOF {
-					if n, _ := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
+					if n, _ := fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
 						t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image)
 					}
 
@@ -574,7 +573,7 @@ func TestIterDirents(t *testing.T) {
 			}
 
 			cb := &iterDirentsCb{}
-			if err = fd.Impl().IterDirents(ctx, cb); err != nil {
+			if err = fd.IterDirents(ctx, cb); err != nil {
 				t.Fatalf("dir fd.IterDirents() failed: %v", err)
 			}
 
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
index ea6417ce7..4a7a94a52 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -394,7 +394,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			}
 			defer mountPoint.DecRef()
 			// Create and mount the submount.
-			if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.GetFilesystemOptions{}); err != nil {
+			if err := vfsObj.MountAt(ctx, creds, "", &pop, "memfs", &vfs.MountOptions{}); err != nil {
 				b.Fatalf("failed to mount tmpfs submount: %v", err)
 			}
 			filePathBuilder.WriteString(mountPointName)
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go
index a3a870571..5bf527c80 100644
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/memfs/pipe_test.go
@@ -194,7 +194,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
 	readData := make([]byte, 1)
 	dst := usermem.BytesIOSequence(readData)
-	bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{})
+	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
 	if err != syserror.ErrWouldBlock {
 		t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err)
 	}
@@ -207,7 +207,7 @@ func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
 func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
 	writeData := []byte(msg)
 	src := usermem.BytesIOSequence(writeData)
-	bytesWritten, err := fd.Impl().Write(ctx, src, vfs.WriteOptions{})
+	bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{})
 	if err != nil {
 		t.Fatalf("error writing to pipe %q: %v", fileName, err)
 	}
@@ -220,7 +220,7 @@ func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg
 func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
 	readData := make([]byte, len(msg))
 	dst := usermem.BytesIOSequence(readData)
-	bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{})
+	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
 	if err != nil {
 		t.Fatalf("error reading from pipe %q: %v", fileName, err)
 	}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 74a325309..59237c3b9 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -19,7 +19,6 @@ go_library(
         "options.go",
         "permissions.go",
         "resolving_path.go",
-        "syscalls.go",
         "testutil.go",
         "vfs.go",
     ],
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 34007eb57..4473dfce8 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -241,3 +241,96 @@ type IterDirentsCallback interface {
 	// called.
 	Handle(dirent Dirent) bool
 }
+
+// OnClose is called when a file descriptor representing the FileDescription is
+// closed. Returning a non-nil error should not prevent the file descriptor
+// from being closed.
+func (fd *FileDescription) OnClose(ctx context.Context) error {
+	return fd.impl.OnClose(ctx)
+}
+
+// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
+func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
+	flags, err := fd.impl.StatusFlags(ctx)
+	flags |= linux.O_LARGEFILE
+	return flags, err
+}
+
+// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
+func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
+	return fd.impl.SetStatusFlags(ctx, flags)
+}
+
+// Stat returns metadata for the file represented by fd.
+func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+	return fd.impl.Stat(ctx, opts)
+}
+
+// SetStat updates metadata for the file represented by fd.
+func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
+	return fd.impl.SetStat(ctx, opts)
+}
+
+// StatFS returns metadata for the filesystem containing the file represented
+// by fd.
+func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+	return fd.impl.StatFS(ctx)
+}
+
+// PRead reads from the file represented by fd into dst, starting at the given
+// offset, and returns the number of bytes read. PRead is permitted to return
+// partial reads with a nil error.
+func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	return fd.impl.PRead(ctx, dst, offset, opts)
+}
+
+// Read is similar to PRead, but does not specify an offset.
+func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	return fd.impl.Read(ctx, dst, opts)
+}
+
+// PWrite writes src to the file represented by fd, starting at the given
+// offset, and returns the number of bytes written. PWrite is permitted to
+// return partial writes with a nil error.
+func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	return fd.impl.PWrite(ctx, src, offset, opts)
+}
+
+// Write is similar to PWrite, but does not specify an offset.
+func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	return fd.impl.Write(ctx, src, opts)
+}
+
+// IterDirents invokes cb on each entry in the directory represented by fd. If
+// IterDirents has been called since the last call to Seek, it continues
+// iteration from the end of the last call.
+func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error {
+	return fd.impl.IterDirents(ctx, cb)
+}
+
+// Seek changes fd's offset (assuming one exists) and returns its new value.
+func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return fd.impl.Seek(ctx, offset, whence)
+}
+
+// Sync has the semantics of fsync(2).
+func (fd *FileDescription) Sync(ctx context.Context) error {
+	return fd.impl.Sync(ctx)
+}
+
+// ConfigureMMap mutates opts to implement mmap(2) for the file represented by
+// fd.
+func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return fd.impl.ConfigureMMap(ctx, opts)
+}
+
+// Ioctl implements the ioctl(2) syscall.
+func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	return fd.impl.Ioctl(ctx, uio, args)
+}
+
+// SyncFS instructs the filesystem containing fd to execute the semantics of
+// syncfs(2).
+func (fd *FileDescription) SyncFS(ctx context.Context) error {
+	return fd.vd.mount.fs.impl.Sync(ctx)
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index a5561dcbe..ac7799296 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -103,7 +103,7 @@ func TestGenCountFD(t *testing.T) {
 	// The first read causes Generate to be called to fill the FD's buffer.
 	buf := make([]byte, 2)
 	ioseq := usermem.BytesIOSequence(buf)
-	n, err := fd.Impl().Read(ctx, ioseq, ReadOptions{})
+	n, err := fd.Read(ctx, ioseq, ReadOptions{})
 	if n != 1 || (err != nil && err != io.EOF) {
 		t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err)
 	}
@@ -112,17 +112,17 @@ func TestGenCountFD(t *testing.T) {
 	}
 
 	// A second read without seeking is still at EOF.
-	n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+	n, err = fd.Read(ctx, ioseq, ReadOptions{})
 	if n != 0 || err != io.EOF {
 		t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err)
 	}
 
 	// Seeking to the beginning of the file causes it to be regenerated.
-	n, err = fd.Impl().Seek(ctx, 0, linux.SEEK_SET)
+	n, err = fd.Seek(ctx, 0, linux.SEEK_SET)
 	if n != 0 || err != nil {
 		t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err)
 	}
-	n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+	n, err = fd.Read(ctx, ioseq, ReadOptions{})
 	if n != 1 || (err != nil && err != io.EOF) {
 		t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err)
 	}
@@ -131,7 +131,7 @@ func TestGenCountFD(t *testing.T) {
 	}
 
 	// PRead at the beginning of the file also causes it to be regenerated.
-	n, err = fd.Impl().PRead(ctx, ioseq, 0, ReadOptions{})
+	n, err = fd.PRead(ctx, ioseq, 0, ReadOptions{})
 	if n != 1 || (err != nil && err != io.EOF) {
 		t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err)
 	}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 76ff8cf51..dfbd2372a 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -47,6 +47,9 @@ func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) {
 	fs.refs = 1
 	fs.vfs = vfsObj
 	fs.impl = impl
+	vfsObj.filesystemsMu.Lock()
+	vfsObj.filesystems[fs] = struct{}{}
+	vfsObj.filesystemsMu.Unlock()
 }
 
 // VirtualFilesystem returns the containing VirtualFilesystem.
@@ -66,9 +69,28 @@ func (fs *Filesystem) IncRef() {
 	}
 }
 
+// TryIncRef increments fs' reference count and returns true. If fs' reference
+// count is zero, TryIncRef does nothing and returns false.
+//
+// TryIncRef does not require that a reference is held on fs.
+func (fs *Filesystem) TryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&fs.refs)
+		if refs <= 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
 // DecRef decrements fs' reference count.
 func (fs *Filesystem) DecRef() {
 	if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
+		fs.vfs.filesystemsMu.Lock()
+		delete(fs.vfs.filesystems, fs)
+		fs.vfs.filesystemsMu.Unlock()
 		fs.impl.Release()
 	} else if refs < 0 {
 		panic("Filesystem.decRef() called without holding a reference")
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 1c3b2e987..ec23ab0dd 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -18,6 +18,7 @@ import (
 	"math"
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -133,13 +134,13 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 	return mntns, nil
 }
 
-// NewMount creates and mounts a Filesystem configured by the given arguments.
-func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *GetFilesystemOptions) error {
+// MountAt creates and mounts a Filesystem configured by the given arguments.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
 	fsType := vfs.getFilesystemType(fsTypeName)
 	if fsType == nil {
 		return syserror.ENODEV
 	}
-	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
 	if err != nil {
 		return err
 	}
@@ -207,6 +208,68 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
 	return nil
 }
 
+// UmountAt removes the Mount at the given path.
+func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
+	if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
+		return syserror.EINVAL
+	}
+
+	// MNT_FORCE is currently unimplemented except for the permission check.
+	if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
+		return syserror.EPERM
+	}
+
+	vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
+	if err != nil {
+		return err
+	}
+	defer vd.DecRef()
+	if vd.dentry != vd.mount.root {
+		return syserror.EINVAL
+	}
+	vfs.mountMu.Lock()
+	if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns {
+		vfs.mountMu.Unlock()
+		return syserror.EINVAL
+	}
+
+	// TODO(jamieliu): Linux special-cases umount of the caller's root, which
+	// we don't implement yet (we'll just fail it since the caller holds a
+	// reference on it).
+
+	vfs.mounts.seq.BeginWrite()
+	if opts.Flags&linux.MNT_DETACH == 0 {
+		if len(vd.mount.children) != 0 {
+			vfs.mounts.seq.EndWrite()
+			vfs.mountMu.Unlock()
+			return syserror.EBUSY
+		}
+		// We are holding a reference on vd.mount.
+		expectedRefs := int64(1)
+		if !vd.mount.umounted {
+			expectedRefs = 2
+		}
+		if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB
+			vfs.mounts.seq.EndWrite()
+			vfs.mountMu.Unlock()
+			return syserror.EBUSY
+		}
+	}
+	vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{
+		eager:               opts.Flags&linux.MNT_DETACH == 0,
+		disconnectHierarchy: true,
+	}, nil, nil)
+	vfs.mounts.seq.EndWrite()
+	vfs.mountMu.Unlock()
+	for _, vd := range vdsToDecRef {
+		vd.DecRef()
+	}
+	for _, mnt := range mountsToDecRef {
+		mnt.DecRef()
+	}
+	return nil
+}
+
 type umountRecursiveOptions struct {
 	// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
 	// on umounted mounts fail.
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 3aa73d911..3ecbc8fc1 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -46,6 +46,12 @@ type MknodOptions struct {
 	DevMinor uint32
 }
 
+// MountOptions contains options to VirtualFilesystem.MountAt().
+type MountOptions struct {
+	// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
+	GetFilesystemOptions GetFilesystemOptions
+}
+
 // OpenOptions contains options to VirtualFilesystem.OpenAt() and
 // FilesystemImpl.OpenAt().
 type OpenOptions struct {
@@ -114,6 +120,12 @@ type StatOptions struct {
 	Sync uint32
 }
 
+// UmountOptions contains options to VirtualFilesystem.UmountAt().
+type UmountOptions struct {
+	// Flags contains flags as specified for umount2(2).
+	Flags uint32
+}
+
 // WriteOptions contains options to FileDescription.PWrite(),
 // FileDescriptionImpl.PWrite(), FileDescription.Write(), and
 // FileDescriptionImpl.Write().
diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go
deleted file mode 100644
index 436151afa..000000000
--- a/pkg/sentry/vfs/syscalls.go
+++ /dev/null
@@ -1,237 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// PathOperation specifies the path operated on by a VFS method.
-//
-// PathOperation is passed to VFS methods by pointer to reduce memory copying:
-// it's somewhat large and should never escape. (Options structs are passed by
-// pointer to VFS and FileDescription methods for the same reason.)
-type PathOperation struct {
-	// Root is the VFS root. References on Root are borrowed from the provider
-	// of the PathOperation.
-	//
-	// Invariants: Root.Ok().
-	Root VirtualDentry
-
-	// Start is the starting point for the path traversal. References on Start
-	// are borrowed from the provider of the PathOperation (i.e. the caller of
-	// the VFS method to which the PathOperation was passed).
-	//
-	// Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root.
-	Start VirtualDentry
-
-	// Path is the pathname traversed by this operation.
-	Pathname string
-
-	// If FollowFinalSymlink is true, and the Dentry traversed by the final
-	// path component represents a symbolic link, the symbolic link should be
-	// followed.
-	FollowFinalSymlink bool
-}
-
-// GetDentryAt returns a VirtualDentry representing the given path, at which a
-// file must exist. A reference is taken on the returned VirtualDentry.
-func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return VirtualDentry{}, err
-	}
-	for {
-		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
-		if err == nil {
-			vd := VirtualDentry{
-				mount:  rp.mount,
-				dentry: d,
-			}
-			rp.mount.IncRef()
-			vfs.putResolvingPath(rp)
-			return vd, nil
-		}
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return VirtualDentry{}, err
-		}
-	}
-}
-
-// MkdirAt creates a directory at the given path.
-func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
-	// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
-	// also honored." - mkdir(2)
-	opts.Mode &= 01777
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
-	}
-	for {
-		err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
-		if err == nil {
-			vfs.putResolvingPath(rp)
-			return nil
-		}
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return err
-		}
-	}
-}
-
-// MknodAt creates a file of the given mode at the given path. It returns an
-// error from the syserror package.
-func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil
-	}
-	for {
-		if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil {
-			vfs.putResolvingPath(rp)
-			return nil
-		}
-		// Handle mount traversals.
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return err
-		}
-	}
-}
-
-// OpenAt returns a FileDescription providing access to the file at the given
-// path. A reference is taken on the returned FileDescription.
-func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
-	// Remove:
-	//
-	// - O_LARGEFILE, which we always report in FileDescription status flags
-	// since only 64-bit architectures are supported at this time.
-	//
-	// - O_CLOEXEC, which affects file descriptors and therefore must be
-	// handled outside of VFS.
-	//
-	// - Unknown flags.
-	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
-	// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
-	if opts.Flags&linux.O_SYNC != 0 {
-		opts.Flags |= linux.O_DSYNC
-	}
-	// Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
-	// with O_DIRECTORY and a writable access mode (to ensure that it fails on
-	// filesystem implementations that do not support it).
-	if opts.Flags&linux.O_TMPFILE != 0 {
-		if opts.Flags&linux.O_DIRECTORY == 0 {
-			return nil, syserror.EINVAL
-		}
-		if opts.Flags&linux.O_CREAT != 0 {
-			return nil, syserror.EINVAL
-		}
-		if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
-			return nil, syserror.EINVAL
-		}
-	}
-	// O_PATH causes most other flags to be ignored.
-	if opts.Flags&linux.O_PATH != 0 {
-		opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
-	}
-	// "On Linux, the following bits are also honored in mode: [S_ISUID,
-	// S_ISGID, S_ISVTX]" - open(2)
-	opts.Mode &= 07777
-
-	if opts.Flags&linux.O_NOFOLLOW != 0 {
-		pop.FollowFinalSymlink = false
-	}
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil, err
-	}
-	if opts.Flags&linux.O_DIRECTORY != 0 {
-		rp.mustBeDir = true
-		rp.mustBeDirOrig = true
-	}
-	for {
-		fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
-		if err == nil {
-			vfs.putResolvingPath(rp)
-			return fd, nil
-		}
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return nil, err
-		}
-	}
-}
-
-// StatAt returns metadata for the file at the given path.
-func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return linux.Statx{}, err
-	}
-	for {
-		stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
-		if err == nil {
-			vfs.putResolvingPath(rp)
-			return stat, nil
-		}
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return linux.Statx{}, err
-		}
-	}
-}
-
-// StatusFlags returns file description status flags.
-func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	flags, err := fd.impl.StatusFlags(ctx)
-	flags |= linux.O_LARGEFILE
-	return flags, err
-}
-
-// SetStatusFlags sets file description status flags.
-func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	return fd.impl.SetStatusFlags(ctx, flags)
-}
-
-// TODO:
-//
-// - VFS.SyncAllFilesystems() for sync(2)
-//
-// - Something for syncfs(2)
-//
-// - VFS.LinkAt()
-//
-// - VFS.ReadlinkAt()
-//
-// - VFS.RenameAt()
-//
-// - VFS.RmdirAt()
-//
-// - VFS.SetStatAt()
-//
-// - VFS.StatFSAt()
-//
-// - VFS.SymlinkAt()
-//
-// - VFS.UmountAt()
-//
-// - VFS.UnlinkAt()
-//
-// - FileDescription.(almost everything)
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index f0cd3ffe5..7262b0d0a 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -20,6 +20,7 @@
 //   VirtualFilesystem.mountMu
 //     Dentry.mu
 //       Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
+//     VirtualFilesystem.filesystemsMu
 // VirtualFilesystem.fsTypesMu
 //
 // Locking Dentry.mu in multiple Dentries requires holding
@@ -28,6 +29,11 @@ package vfs
 
 import (
 	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts.
@@ -67,6 +73,11 @@ type VirtualFilesystem struct {
 	// mountpoints is analogous to Linux's mountpoint_hashtable.
 	mountpoints map[*Dentry]map[*Mount]struct{}
 
+	// filesystems contains all Filesystems. filesystems is protected by
+	// filesystemsMu.
+	filesystemsMu sync.Mutex
+	filesystems   map[*Filesystem]struct{}
+
 	// fsTypes contains all FilesystemTypes that are usable in the
 	// VirtualFilesystem. fsTypes is protected by fsTypesMu.
 	fsTypesMu sync.RWMutex
@@ -77,12 +88,379 @@ type VirtualFilesystem struct {
 func New() *VirtualFilesystem {
 	vfs := &VirtualFilesystem{
 		mountpoints: make(map[*Dentry]map[*Mount]struct{}),
+		filesystems: make(map[*Filesystem]struct{}),
 		fsTypes:     make(map[string]FilesystemType),
 	}
 	vfs.mounts.Init()
 	return vfs
 }
 
+// PathOperation specifies the path operated on by a VFS method.
+//
+// PathOperation is passed to VFS methods by pointer to reduce memory copying:
+// it's somewhat large and should never escape. (Options structs are passed by
+// pointer to VFS and FileDescription methods for the same reason.)
+type PathOperation struct {
+	// Root is the VFS root. References on Root are borrowed from the provider
+	// of the PathOperation.
+	//
+	// Invariants: Root.Ok().
+	Root VirtualDentry
+
+	// Start is the starting point for the path traversal. References on Start
+	// are borrowed from the provider of the PathOperation (i.e. the caller of
+	// the VFS method to which the PathOperation was passed).
+	//
+	// Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root.
+	Start VirtualDentry
+
+	// Path is the pathname traversed by this operation.
+	Pathname string
+
+	// If FollowFinalSymlink is true, and the Dentry traversed by the final
+	// path component represents a symbolic link, the symbolic link should be
+	// followed.
+	FollowFinalSymlink bool
+}
+
+// GetDentryAt returns a VirtualDentry representing the given path, at which a
+// file must exist. A reference is taken on the returned VirtualDentry.
+func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return VirtualDentry{}, err
+	}
+	for {
+		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
+		if err == nil {
+			vd := VirtualDentry{
+				mount:  rp.mount,
+				dentry: d,
+			}
+			rp.mount.IncRef()
+			vfs.putResolvingPath(rp)
+			return vd, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return VirtualDentry{}, err
+		}
+	}
+}
+
+// LinkAt creates a hard link at newpop representing the existing file at
+// oldpop.
+func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error {
+	oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
+	if err != nil {
+		return err
+	}
+	rp, err := vfs.getResolvingPath(creds, newpop)
+	if err != nil {
+		oldVD.DecRef()
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
+		if err == nil {
+			oldVD.DecRef()
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			oldVD.DecRef()
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// MkdirAt creates a directory at the given path.
+func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
+	// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
+	// also honored." - mkdir(2)
+	opts.Mode &= 0777 | linux.S_ISVTX
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// MknodAt creates a file of the given mode at the given path. It returns an
+// error from the syserror package.
+func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return nil
+	}
+	for {
+		if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		// Handle mount traversals.
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// OpenAt returns a FileDescription providing access to the file at the given
+// path. A reference is taken on the returned FileDescription.
+func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
+	// Remove:
+	//
+	// - O_LARGEFILE, which we always report in FileDescription status flags
+	// since only 64-bit architectures are supported at this time.
+	//
+	// - O_CLOEXEC, which affects file descriptors and therefore must be
+	// handled outside of VFS.
+	//
+	// - Unknown flags.
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
+	// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
+	if opts.Flags&linux.O_SYNC != 0 {
+		opts.Flags |= linux.O_DSYNC
+	}
+	// Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
+	// with O_DIRECTORY and a writable access mode (to ensure that it fails on
+	// filesystem implementations that do not support it).
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		if opts.Flags&linux.O_DIRECTORY == 0 {
+			return nil, syserror.EINVAL
+		}
+		if opts.Flags&linux.O_CREAT != 0 {
+			return nil, syserror.EINVAL
+		}
+		if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
+			return nil, syserror.EINVAL
+		}
+	}
+	// O_PATH causes most other flags to be ignored.
+	if opts.Flags&linux.O_PATH != 0 {
+		opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
+	}
+	// "On Linux, the following bits are also honored in mode: [S_ISUID,
+	// S_ISGID, S_ISVTX]" - open(2)
+	opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
+
+	if opts.Flags&linux.O_NOFOLLOW != 0 {
+		pop.FollowFinalSymlink = false
+	}
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return nil, err
+	}
+	if opts.Flags&linux.O_DIRECTORY != 0 {
+		rp.mustBeDir = true
+		rp.mustBeDirOrig = true
+	}
+	for {
+		fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return fd, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return nil, err
+		}
+	}
+}
+
+// ReadlinkAt returns the target of the symbolic link at the given path.
+func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return "", err
+	}
+	for {
+		target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return target, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return "", err
+		}
+	}
+}
+
+// RenameAt renames the file at oldpop to newpop.
+func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error {
+	oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
+	if err != nil {
+		return err
+	}
+	rp, err := vfs.getResolvingPath(creds, newpop)
+	if err != nil {
+		oldVD.DecRef()
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.RenameAt(ctx, rp, oldVD, *opts)
+		if err == nil {
+			oldVD.DecRef()
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			oldVD.DecRef()
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// RmdirAt removes the directory at the given path.
+func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.RmdirAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// SetStatAt changes metadata for the file at the given path.
+func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// StatAt returns metadata for the file at the given path.
+func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	for {
+		stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return stat, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return linux.Statx{}, err
+		}
+	}
+}
+
+// StatFSAt returns metadata for the filesystem containing the file at the
+// given path.
+func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	for {
+		statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return statfs, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return linux.Statfs{}, err
+		}
+	}
+}
+
+// SymlinkAt creates a symbolic link at the given path with the given target.
+func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// UnlinkAt deletes the non-directory file at the given path.
+func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// SyncAllFilesystems has the semantics of Linux's sync(2).
+func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
+	fss := make(map[*Filesystem]struct{})
+	vfs.filesystemsMu.Lock()
+	for fs := range vfs.filesystems {
+		if !fs.TryIncRef() {
+			continue
+		}
+		fss[fs] = struct{}{}
+	}
+	vfs.filesystemsMu.Unlock()
+	var retErr error
+	for fs := range fss {
+		if err := fs.impl.Sync(ctx); err != nil && retErr == nil {
+			retErr = err
+		}
+		fs.DecRef()
+	}
+	return retErr
+}
+
 // A VirtualDentry represents a node in a VFS tree, by combining a Dentry
 // (which represents a node in a Filesystem's tree) and a Mount (which
 // represents the Filesystem's position in a VFS mount tree).
-- 
cgit v1.2.3


From 2e3b9b0a68d8f191d061008feda6e4a4ce202a78 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 10 Dec 2019 19:14:05 -0800
Subject: Deduplicate and simplify control message processing for recvmsg and
 sendmsg.

Also, improve performance by calculating how much space is needed before making
an allocation for sendmsg in hostinet.

PiperOrigin-RevId: 284898581
---
 pkg/sentry/socket/control/control.go    | 51 ++++++++++++++++++++++-----------
 pkg/sentry/socket/hostinet/socket.go    | 18 +++++++++---
 pkg/sentry/syscalls/linux/sys_socket.go | 18 +-----------
 3 files changed, 50 insertions(+), 37 deletions(-)

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index fa3188d51..af1a4e95f 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -348,43 +348,62 @@ func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
 	)
 }
 
-func addSpaceForCmsg(cmsgDataLen int, buf []byte) []byte {
-	newBuf := make([]byte, 0, len(buf)+linux.SizeOfControlMessageHeader+cmsgDataLen)
-	return append(newBuf, buf...)
-}
-
-// PackControlMessages converts the given ControlMessages struct into a buffer.
+// PackControlMessages packs control messages into the given buffer.
+//
 // We skip control messages specific to Unix domain sockets.
-func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages) []byte {
-	var buf []byte
-	// The use of t.Arch().Width() is analogous to Linux's use of sizeof(long) in
-	// CMSG_ALIGN.
-	width := t.Arch().Width()
-
+//
+// Note that some control messages may be truncated if they do not fit under
+// the capacity of buf.
+func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byte) []byte {
 	if cmsgs.IP.HasTimestamp {
-		buf = addSpaceForCmsg(int(width), buf)
 		buf = PackTimestamp(t, cmsgs.IP.Timestamp, buf)
 	}
 
 	if cmsgs.IP.HasInq {
 		// In Linux, TCP_CM_INQ is added after SO_TIMESTAMP.
-		buf = addSpaceForCmsg(AlignUp(linux.SizeOfControlMessageInq, width), buf)
 		buf = PackInq(t, cmsgs.IP.Inq, buf)
 	}
 
 	if cmsgs.IP.HasTOS {
-		buf = addSpaceForCmsg(AlignUp(linux.SizeOfControlMessageTOS, width), buf)
 		buf = PackTOS(t, cmsgs.IP.TOS, buf)
 	}
 
 	if cmsgs.IP.HasTClass {
-		buf = addSpaceForCmsg(AlignUp(linux.SizeOfControlMessageTClass, width), buf)
 		buf = PackTClass(t, cmsgs.IP.TClass, buf)
 	}
 
 	return buf
 }
 
+// cmsgSpace is equivalent to CMSG_SPACE in Linux.
+func cmsgSpace(t *kernel.Task, dataLen int) int {
+	return linux.SizeOfControlMessageHeader + AlignUp(dataLen, t.Arch().Width())
+}
+
+// CmsgsSpace returns the number of bytes needed to fit the control messages
+// represented in cmsgs.
+func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int {
+	space := 0
+
+	if cmsgs.IP.HasTimestamp {
+		space += cmsgSpace(t, linux.SizeOfTimeval)
+	}
+
+	if cmsgs.IP.HasInq {
+		space += cmsgSpace(t, linux.SizeOfControlMessageInq)
+	}
+
+	if cmsgs.IP.HasTOS {
+		space += cmsgSpace(t, linux.SizeOfControlMessageTOS)
+	}
+
+	if cmsgs.IP.HasTClass {
+		space += cmsgSpace(t, linux.SizeOfControlMessageTClass)
+	}
+
+	return space
+}
+
 // Parse parses a raw socket control message into portable objects.
 func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.ControlMessages, error) {
 	var (
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index a8c152b54..c957b0f1d 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -45,7 +45,7 @@ const (
 	sizeofSockaddr = syscall.SizeofSockaddrInet6 // sizeof(sockaddr_in6) > sizeof(sockaddr_in)
 
 	// maxControlLen is the maximum size of a control message buffer used in a
-	// recvmsg syscall.
+	// recvmsg or sendmsg syscall.
 	maxControlLen = 1024
 )
 
@@ -412,9 +412,12 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 			msg.Namelen = uint32(len(senderAddrBuf))
 		}
 		if controlLen > 0 {
-			controlBuf = make([]byte, maxControlLen)
+			if controlLen > maxControlLen {
+				controlLen = maxControlLen
+			}
+			controlBuf = make([]byte, controlLen)
 			msg.Control = &controlBuf[0]
-			msg.Controllen = maxControlLen
+			msg.Controllen = controlLen
 		}
 		n, err := recvmsg(s.fd, &msg, sysflags)
 		if err != nil {
@@ -489,7 +492,14 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 		return 0, syserr.ErrInvalidArgument
 	}
 
-	controlBuf := control.PackControlMessages(t, controlMessages)
+	space := uint64(control.CmsgsSpace(t, controlMessages))
+	if space > maxControlLen {
+		space = maxControlLen
+	}
+	controlBuf := make([]byte, 0, space)
+	// PackControlMessages will append up to space bytes to controlBuf.
+	controlBuf = control.PackControlMessages(t, controlMessages, controlBuf)
+
 	sendmsgFromBlocks := safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) {
 		// Refuse to do anything if any part of src.Addrs was unusable.
 		if uint64(src.NumBytes()) != srcs.NumBytes() {
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index e3faa890b..4b5aafcc0 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -787,29 +787,13 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
 	defer cms.Release()
 
 	controlData := make([]byte, 0, msg.ControlLen)
+	controlData = control.PackControlMessages(t, cms, controlData)
 
 	if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() {
 		creds, _ := cms.Unix.Credentials.(control.SCMCredentials)
 		controlData, mflags = control.PackCredentials(t, creds, controlData, mflags)
 	}
 
-	if cms.IP.HasTimestamp {
-		controlData = control.PackTimestamp(t, cms.IP.Timestamp, controlData)
-	}
-
-	if cms.IP.HasInq {
-		// In Linux, TCP_CM_INQ is added after SO_TIMESTAMP.
-		controlData = control.PackInq(t, cms.IP.Inq, controlData)
-	}
-
-	if cms.IP.HasTOS {
-		controlData = control.PackTOS(t, cms.IP.TOS, controlData)
-	}
-
-	if cms.IP.HasTClass {
-		controlData = control.PackTClass(t, cms.IP.TClass, controlData)
-	}
-
 	if cms.Unix.Rights != nil {
 		controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
 	}
-- 
cgit v1.2.3


From 1643224af0f099d55d7ae7934606ec1987658dfc Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 11 Dec 2019 10:23:54 -0800
Subject: Finish incomplete comment.

PiperOrigin-RevId: 285012278
---
 runsc/fsgofer/fsgofer.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index c9add64ec..b59e1a70e 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -199,6 +199,7 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
 // The reason that the file is not opened initially as read-write is for better
 // performance with 'overlay2' storage driver. overlay2 eagerly copies the
 // entire file up when it's opened in write mode, and would perform badly when
+// multiple files are only being opened for read (esp. startup).
 type localFile struct {
 	p9.DefaultWalkGetAttr
 
-- 
cgit v1.2.3


From f8c5ad061bb529e0314bde17f4f1b4ddc82c0120 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 11 Dec 2019 10:30:43 -0800
Subject: runsc/debug: add an option to list all processes

runsc debug --ps list all processes with all threads. This option is added to
the debug command but not to the ps command, because it is going to be used for
debug purposes and we want to add any useful information without thinking about
backward compatibility.

This will help to investigate syzkaller issues.

PiperOrigin-RevId: 285013668
---
 pkg/sentry/control/proc.go              |  34 +++++----
 runsc/cmd/debug.go                      |  13 ++++
 runsc/container/console_test.go         |  11 +--
 runsc/container/container_test.go       | 120 +++++++++++++++++++-------------
 runsc/container/multi_container_test.go |  45 ++++++------
 5 files changed, 132 insertions(+), 91 deletions(-)

diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index a6f90b2bb..ced51c66c 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -272,7 +272,8 @@ type Process struct {
 	UID auth.KUID       `json:"uid"`
 	PID kernel.ThreadID `json:"pid"`
 	// Parent PID
-	PPID kernel.ThreadID `json:"ppid"`
+	PPID    kernel.ThreadID   `json:"ppid"`
+	Threads []kernel.ThreadID `json:"threads"`
 	// Processor utilization
 	C int32 `json:"c"`
 	// TTY name of the process. Will be of the form "pts/N" if there is a
@@ -310,7 +311,7 @@ func ProcessListToTable(pl []*Process) string {
 
 // ProcessListToJSON will return the JSON representation of ps.
 func ProcessListToJSON(pl []*Process) (string, error) {
-	b, err := json.Marshal(pl)
+	b, err := json.MarshalIndent(pl, "", "  ")
 	if err != nil {
 		return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err)
 	}
@@ -337,7 +338,9 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error {
 	ts := k.TaskSet()
 	now := k.RealtimeClock().Now()
 	for _, tg := range ts.Root.ThreadGroups() {
-		pid := tg.PIDNamespace().IDOfThreadGroup(tg)
+		pidns := tg.PIDNamespace()
+		pid := pidns.IDOfThreadGroup(tg)
+
 		// If tg has already been reaped ignore it.
 		if pid == 0 {
 			continue
@@ -348,19 +351,20 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error {
 
 		ppid := kernel.ThreadID(0)
 		if p := tg.Leader().Parent(); p != nil {
-			ppid = p.PIDNamespace().IDOfThreadGroup(p.ThreadGroup())
-		}
-		p := Process{
-			UID:   tg.Leader().Credentials().EffectiveKUID,
-			PID:   pid,
-			PPID:  ppid,
-			STime: formatStartTime(now, tg.Leader().StartTime()),
-			C:     percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now),
-			Time:  tg.CPUStats().SysTime.String(),
-			Cmd:   tg.Leader().Name(),
-			TTY:   ttyName(tg.TTY()),
+			ppid = pidns.IDOfThreadGroup(p.ThreadGroup())
 		}
-		*out = append(*out, &p)
+		threads := tg.MemberIDs(pidns)
+		*out = append(*out, &Process{
+			UID:     tg.Leader().Credentials().EffectiveKUID,
+			PID:     pid,
+			PPID:    ppid,
+			Threads: threads,
+			STime:   formatStartTime(now, tg.Leader().StartTime()),
+			C:       percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now),
+			Time:    tg.CPUStats().SysTime.String(),
+			Cmd:     tg.Leader().Name(),
+			TTY:     ttyName(tg.TTY()),
+		})
 	}
 	sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID })
 	return nil
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 38da7ee02..f37415810 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -42,6 +42,7 @@ type Debug struct {
 	logLevel    string
 	logPackets  string
 	duration    time.Duration
+	ps          bool
 }
 
 // Name implements subcommands.Command.
@@ -71,6 +72,7 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
 	f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`)
 	f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).")
 	f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.")
+	f.BoolVar(&d.ps, "ps", false, "lists processes")
 }
 
 // Execute implements subcommands.Command.Execute.
@@ -240,6 +242,17 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		}
 		log.Infof("Logging options changed")
 	}
+	if d.ps {
+		pList, err := c.Processes()
+		if err != nil {
+			Fatalf("getting processes for container: %v", err)
+		}
+		o, err := control.ProcessListToJSON(pList)
+		if err != nil {
+			Fatalf("generating JSON: %v", err)
+		}
+		log.Infof(o)
+	}
 
 	if delay {
 		time.Sleep(d.duration)
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 7d67c3a75..5ed131a7f 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -28,6 +28,7 @@ import (
 	"github.com/kr/pty"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/testutil"
@@ -219,9 +220,9 @@ func TestJobControlSignalExec(t *testing.T) {
 	// Make sure all the processes are running.
 	expectedPL := []*control.Process{
 		// Root container process.
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		// Bash from exec process.
-		{PID: 2, Cmd: "bash"},
+		{PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Error(err)
@@ -231,7 +232,7 @@ func TestJobControlSignalExec(t *testing.T) {
 	ptyMaster.Write([]byte("sleep 100\n"))
 
 	// Wait for it to start. Sleep's PPID is bash's PID.
-	expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"})
+	expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}})
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Error(err)
 	}
@@ -361,7 +362,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 
 	// Wait for bash to start.
 	expectedPL := []*control.Process{
-		{PID: 1, Cmd: "bash"},
+		{PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Fatal(err)
@@ -371,7 +372,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	ptyMaster.Write([]byte("sleep 100\n"))
 
 	// Wait for sleep to start.
-	expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep"})
+	expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}})
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Fatal(err)
 	}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 1d06f2780..2ced028f6 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
@@ -52,13 +53,14 @@ func waitForProcessList(cont *Container, want []*control.Process) error {
 			err = fmt.Errorf("error getting process data from container: %v", err)
 			return &backoff.PermanentError{Err: err}
 		}
-		if !procListsEqual(got, want) {
-			return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
+		if r, err := procListsEqual(got, want); !r {
+			return fmt.Errorf("container got process list: %s, want: %s: error: %v",
+				procListToString(got), procListToString(want), err)
 		}
 		return nil
 	}
 	// Gives plenty of time as tests can run slow under --race.
-	return testutil.Poll(cb, 30*time.Second)
+	return testutil.Poll(cb, 10*time.Second)
 }
 
 func waitForProcessCount(cont *Container, want int) error {
@@ -91,9 +93,9 @@ func blockUntilWaitable(pid int) error {
 
 // procListsEqual is used to check whether 2 Process lists are equal for all
 // implemented fields.
-func procListsEqual(got, want []*control.Process) bool {
+func procListsEqual(got, want []*control.Process) (bool, error) {
 	if len(got) != len(want) {
-		return false
+		return false, nil
 	}
 	for i := range got {
 		pd1 := got[i]
@@ -106,11 +108,19 @@ func procListsEqual(got, want []*control.Process) bool {
 		// where we use this method. Tests that care about the TTY
 		// field should check for it themselves.
 		pd1.TTY = ""
-		if *pd1 != *pd2 {
-			return false
+		pd1Json, err := control.ProcessListToJSON([]*control.Process{pd1})
+		if err != nil {
+			return false, err
+		}
+		pd2Json, err := control.ProcessListToJSON([]*control.Process{pd2})
+		if err != nil {
+			return false, err
+		}
+		if pd1Json != pd2Json {
+			return false, nil
 		}
 	}
-	return true
+	return true, nil
 }
 
 // getAndCheckProcLists is similar to waitForProcessList, but does not wait and retry the
@@ -120,7 +130,11 @@ func getAndCheckProcLists(cont *Container, want []*control.Process) error {
 	if err != nil {
 		return fmt.Errorf("error getting process data from container: %v", err)
 	}
-	if procListsEqual(got, want) {
+	equal, err := procListsEqual(got, want)
+	if err != nil {
+		return err
+	}
+	if equal {
 		return nil
 	}
 	return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
@@ -292,11 +306,12 @@ func TestLifecycle(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 		}
 		// Create the container.
@@ -594,18 +609,20 @@ func TestExec(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 			{
-				UID:  uid,
-				PID:  2,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     uid,
+				PID:     2,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{2},
 			},
 		}
 
@@ -1066,18 +1083,20 @@ func TestPauseResume(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 			{
-				UID:  uid,
-				PID:  2,
-				PPID: 0,
-				C:    0,
-				Cmd:  "bash",
+				UID:     uid,
+				PID:     2,
+				PPID:    0,
+				C:       0,
+				Cmd:     "bash",
+				Threads: []kernel.ThreadID{2},
 			},
 		}
 
@@ -1130,11 +1149,12 @@ func TestPauseResume(t *testing.T) {
 
 		expectedPL2 := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 		}
 
@@ -1245,18 +1265,20 @@ func TestCapabilities(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 			{
-				UID:  uid,
-				PID:  2,
-				PPID: 0,
-				C:    0,
-				Cmd:  "exe",
+				UID:     uid,
+				PID:     2,
+				PPID:    0,
+				C:       0,
+				Cmd:     "exe",
+				Threads: []kernel.ThreadID{2},
 			},
 		}
 		if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index de2fd3cf2..4ad09ceab 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -156,13 +156,13 @@ func TestMultiContainerSanity(t *testing.T) {
 
 		// Check via ps that multiple processes are running.
 		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
 		}
 		expectedPL = []*control.Process{
-			{PID: 2, Cmd: "sleep"},
+			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 		}
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -202,13 +202,13 @@ func TestMultiPIDNS(t *testing.T) {
 
 		// Check via ps that multiple processes are running.
 		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
 		}
 		expectedPL = []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -264,7 +264,7 @@ func TestMultiPIDNSPath(t *testing.T) {
 
 		// Check via ps that multiple processes are running.
 		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -274,7 +274,7 @@ func TestMultiPIDNSPath(t *testing.T) {
 		}
 
 		expectedPL = []*control.Process{
-			{PID: 2, Cmd: "sleep"},
+			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 		}
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -306,7 +306,7 @@ func TestMultiContainerWait(t *testing.T) {
 
 	// Check via ps that multiple processes are running.
 	expectedPL := []*control.Process{
-		{PID: 2, Cmd: "sleep"},
+		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 	}
 	if err := waitForProcessList(containers[1], expectedPL); err != nil {
 		t.Errorf("failed to wait for sleep to start: %v", err)
@@ -351,7 +351,7 @@ func TestMultiContainerWait(t *testing.T) {
 	// After Wait returns, ensure that the root container is running and
 	// the child has finished.
 	expectedPL = []*control.Process{
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(containers[0], expectedPL); err != nil {
 		t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err)
@@ -383,7 +383,7 @@ func TestExecWait(t *testing.T) {
 
 	// Check via ps that process is running.
 	expectedPL := []*control.Process{
-		{PID: 2, Cmd: "sleep"},
+		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 	}
 	if err := waitForProcessList(containers[1], expectedPL); err != nil {
 		t.Fatalf("failed to wait for sleep to start: %v", err)
@@ -418,7 +418,7 @@ func TestExecWait(t *testing.T) {
 
 	// Wait for the exec'd process to exit.
 	expectedPL = []*control.Process{
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(containers[0], expectedPL); err != nil {
 		t.Fatalf("failed to wait for second container to stop: %v", err)
@@ -505,7 +505,7 @@ func TestMultiContainerSignal(t *testing.T) {
 
 		// Check via ps that container 1 process is running.
 		expectedPL := []*control.Process{
-			{PID: 2, Cmd: "sleep"},
+			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 		}
 
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
@@ -519,7 +519,7 @@ func TestMultiContainerSignal(t *testing.T) {
 
 		// Make sure process 1 is still running.
 		expectedPL = []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -633,9 +633,10 @@ func TestMultiContainerDestroy(t *testing.T) {
 		if err != nil {
 			t.Fatalf("error getting process data from sandbox: %v", err)
 		}
-		expectedPL := []*control.Process{{PID: 1, Cmd: "sleep"}}
-		if !procListsEqual(pss, expectedPL) {
-			t.Errorf("container got process list: %s, want: %s", procListToString(pss), procListToString(expectedPL))
+		expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
+		if r, err := procListsEqual(pss, expectedPL); !r {
+			t.Errorf("container got process list: %s, want: %s: error: %v",
+				procListToString(pss), procListToString(expectedPL), err)
 		}
 
 		// Check that cont.Destroy is safe to call multiple times.
@@ -669,7 +670,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 
 	// Check root's container process list doesn't include other containers.
 	expectedPL0 := []*control.Process{
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(containers[0], expectedPL0); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
@@ -677,8 +678,8 @@ func TestMultiContainerProcesses(t *testing.T) {
 
 	// Same for the other container.
 	expectedPL1 := []*control.Process{
-		{PID: 2, Cmd: "sh"},
-		{PID: 3, PPID: 2, Cmd: "sleep"},
+		{PID: 2, Cmd: "sh", Threads: []kernel.ThreadID{2}},
+		{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
 	}
 	if err := waitForProcessList(containers[1], expectedPL1); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
@@ -692,7 +693,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 	if _, err := containers[1].Execute(args); err != nil {
 		t.Fatalf("error exec'ing: %v", err)
 	}
-	expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep"})
+	expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep", Threads: []kernel.ThreadID{4}})
 	if err := waitForProcessList(containers[1], expectedPL1); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
 	}
@@ -1513,7 +1514,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	// Ensure container is running
 	c := containers[2]
 	expectedPL := []*control.Process{
-		{PID: 3, Cmd: "sleep"},
+		{PID: 3, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Errorf("failed to wait for sleep to start: %v", err)
@@ -1541,7 +1542,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 			continue // container[2] has been killed.
 		}
 		pl := []*control.Process{
-			{PID: kernel.ThreadID(i + 1), Cmd: "sleep"},
+			{PID: kernel.ThreadID(i + 1), Cmd: "sleep", Threads: []kernel.ThreadID{kernel.ThreadID(i + 1)}},
 		}
 		if err := waitForProcessList(c, pl); err != nil {
 			t.Errorf("Container %q was affected by another container: %v", c.ID, err)
@@ -1561,7 +1562,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	// Wait until sandbox stops. waitForProcessList will loop until sandbox exits
 	// and RPC errors out.
 	impossiblePL := []*control.Process{
-		{PID: 100, Cmd: "non-existent-process"},
+		{PID: 100, Cmd: "non-existent-process", Threads: []kernel.ThreadID{100}},
 	}
 	if err := waitForProcessList(c, impossiblePL); err == nil {
 		t.Fatalf("Sandbox was not killed after gofer death")
-- 
cgit v1.2.3


From 0d027262e09184f61ea0707935534fc2fc4af7e7 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Wed, 11 Dec 2019 13:26:30 -0800
Subject: Add additional packages to go branch

We're missing several packages that runsc doesn't depend on. Most notable are
several tcpip link packages.

To find packages, I looked at a diff of directories on master vs go:

$ bazel build //:gopath
$ find bazel-bin/gopath/src/gvisor.dev/gvisor/ -type d > /tmp/gopath.txt
$ find . -type d > /tmp/master.txt
$ sed 's|bazel-bin/gopath/src/gvisor.dev/gvisor/||' < /tmp/gopath.txt > /tmp/gopath.trunc.txt
$ sed 's|./||' < /tmp/master.txt > /tmp/master.trunc.txt
$ vimdiff /tmp/gopath.trunc.txt /tmp/master.trunc.txt

Testing packages are still left out because :gopath can't depend on testonly
targets...

PiperOrigin-RevId: 285049029
---
 BUILD                                  | 11 +++++++++++
 pkg/tcpip/sample/tun_tcp_connect/BUILD |  1 +
 pkg/tcpip/sample/tun_tcp_echo/BUILD    |  1 +
 3 files changed, 13 insertions(+)

diff --git a/BUILD b/BUILD
index de410b008..76286174f 100644
--- a/BUILD
+++ b/BUILD
@@ -23,7 +23,18 @@ go_path(
         "//runsc",
 
         # Packages that are not dependencies of //runsc.
+        "//pkg/sentry/kernel/memevent",
+        "//pkg/tcpip/adapters/gonet",
         "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/link/muxed",
+        "//pkg/tcpip/link/sharedmem",
+        "//pkg/tcpip/link/sharedmem/pipe",
+        "//pkg/tcpip/link/sharedmem/queue",
+        "//pkg/tcpip/link/tun",
+        "//pkg/tcpip/link/waitable",
+        "//pkg/tcpip/sample/tun_tcp_connect",
+        "//pkg/tcpip/sample/tun_tcp_echo",
+        "//pkg/tcpip/transport/tcpconntrack",
     ],
 )
 
diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD
index a57752a7c..d7496fde6 100644
--- a/pkg/tcpip/sample/tun_tcp_connect/BUILD
+++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
 go_binary(
     name = "tun_tcp_connect",
     srcs = ["main.go"],
+    visibility = ["//:sandbox"],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/sample/tun_tcp_echo/BUILD b/pkg/tcpip/sample/tun_tcp_echo/BUILD
index dad8ef399..875561566 100644
--- a/pkg/tcpip/sample/tun_tcp_echo/BUILD
+++ b/pkg/tcpip/sample/tun_tcp_echo/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
 go_binary(
     name = "tun_tcp_echo",
     srcs = ["main.go"],
+    visibility = ["//:sandbox"],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/link/fdbased",
-- 
cgit v1.2.3


From 481dbfa5ab24ec2c0752b9e748d3617285603c4e Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 11 Dec 2019 13:40:57 -0800
Subject: Add vfs.Pathname{WithDeleted,ForGetcwd}.

The former is needed for vfs.FileDescription to implement
memmap.MappingIdentity, and the latter is needed to implement getcwd(2).

PiperOrigin-RevId: 285051855
---
 pkg/sentry/fsimpl/ext/BUILD            |   1 +
 pkg/sentry/fsimpl/ext/filesystem.go    |   8 ++
 pkg/sentry/fsimpl/memfs/BUILD          |   1 +
 pkg/sentry/fsimpl/memfs/filesystem.go  |   8 ++
 pkg/sentry/vfs/BUILD                   |   1 +
 pkg/sentry/vfs/filesystem.go           |  54 +++++++++++-
 pkg/sentry/vfs/filesystem_impl_util.go |  26 ++++++
 pkg/sentry/vfs/pathname.go             | 153 +++++++++++++++++++++++++++++++++
 pkg/sentry/vfs/testutil.go             |   9 ++
 9 files changed, 260 insertions(+), 1 deletion(-)
 create mode 100644 pkg/sentry/vfs/pathname.go

diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 7ccff8b0d..880b7bcd3 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -38,6 +38,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/binary",
         "//pkg/fd",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/sentry/arch",
         "//pkg/sentry/context",
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 2d15e8aaf..e7aa3b41b 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -20,6 +20,7 @@ import (
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -441,3 +442,10 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 
 	return syserror.EROFS
 }
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index bc5c0b591..0cc751eb8 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -32,6 +32,7 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
+        "//pkg/fspath",
         "//pkg/sentry/arch",
         "//pkg/sentry/context",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index 08a9cb8ef..1f2a5122a 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -19,6 +19,7 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -582,3 +583,10 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	inode.decLinksLocked()
 	return nil
 }
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 59237c3b9..e3e554b88 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -17,6 +17,7 @@ go_library(
         "mount.go",
         "mount_unsafe.go",
         "options.go",
+        "pathname.go",
         "permissions.go",
         "resolving_path.go",
         "testutil.go",
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index dfbd2372a..8011eba3f 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -18,6 +18,7 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 )
 
@@ -185,5 +186,56 @@ type FilesystemImpl interface {
 	// UnlinkAt removes the non-directory file at rp.
 	UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 
-	// TODO: d_path(); extended attributes; inotify_add_watch(); bind()
+	// PrependPath prepends a path from vd to vd.Mount().Root() to b.
+	//
+	// If vfsroot.Ok(), it is the contextual VFS root; if it is encountered
+	// before vd.Mount().Root(), PrependPath should stop prepending path
+	// components and return a PrependPathAtVFSRootError.
+	//
+	// If traversal of vd.Dentry()'s ancestors encounters an independent
+	// ("root") Dentry that is not vd.Mount().Root() (i.e. vd.Dentry() is not a
+	// descendant of vd.Mount().Root()), PrependPath should stop prepending
+	// path components and return a PrependPathAtNonMountRootError.
+	//
+	// Filesystems for which Dentries do not have meaningful paths may prepend
+	// an arbitrary descriptive string to b and then return a
+	// PrependPathSyntheticError.
+	//
+	// Most implementations can acquire the appropriate locks to ensure that
+	// Dentry.Name() and Dentry.Parent() are fixed for vd.Dentry() and all of
+	// its ancestors, then call GenericPrependPath.
+	//
+	// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
+	PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
+
+	// TODO: extended attributes; inotify_add_watch(); bind()
+}
+
+// PrependPathAtVFSRootError is returned by implementations of
+// FilesystemImpl.PrependPath() when they encounter the contextual VFS root.
+type PrependPathAtVFSRootError struct{}
+
+// Error implements error.Error.
+func (PrependPathAtVFSRootError) Error() string {
+	return "vfs.FilesystemImpl.PrependPath() reached VFS root"
+}
+
+// PrependPathAtNonMountRootError is returned by implementations of
+// FilesystemImpl.PrependPath() when they encounter an independent ancestor
+// Dentry that is not the Mount root.
+type PrependPathAtNonMountRootError struct{}
+
+// Error implements error.Error.
+func (PrependPathAtNonMountRootError) Error() string {
+	return "vfs.FilesystemImpl.PrependPath() reached root other than Mount root"
+}
+
+// PrependPathSyntheticError is returned by implementations of
+// FilesystemImpl.PrependPath() for which prepended names do not represent real
+// paths.
+type PrependPathSyntheticError struct{}
+
+// Error implements error.Error.
+func (PrependPathSyntheticError) Error() string {
+	return "vfs.FilesystemImpl.PrependPath() prepended synthetic name"
 }
diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go
index 465e610e0..7315a588e 100644
--- a/pkg/sentry/vfs/filesystem_impl_util.go
+++ b/pkg/sentry/vfs/filesystem_impl_util.go
@@ -16,6 +16,8 @@ package vfs
 
 import (
 	"strings"
+
+	"gvisor.dev/gvisor/pkg/fspath"
 )
 
 // GenericParseMountOptions parses a comma-separated list of options of the
@@ -41,3 +43,27 @@ func GenericParseMountOptions(str string) map[string]string {
 	}
 	return m
 }
+
+// GenericPrependPath may be used by implementations of
+// FilesystemImpl.PrependPath() for which a single statically-determined lock
+// or set of locks is sufficient to ensure its preconditions (as opposed to
+// e.g. per-Dentry locks).
+//
+// Preconditions: Dentry.Name() and Dentry.Parent() must be held constant for
+// vd.Dentry() and all of its ancestors.
+func GenericPrependPath(vfsroot, vd VirtualDentry, b *fspath.Builder) error {
+	mnt, d := vd.mount, vd.dentry
+	for {
+		if mnt == vfsroot.mount && d == vfsroot.dentry {
+			return PrependPathAtVFSRootError{}
+		}
+		if d == mnt.root {
+			return nil
+		}
+		if d.parent == nil {
+			return PrependPathAtNonMountRootError{}
+		}
+		b.PrependComponent(d.name)
+		d = d.parent
+	}
+}
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
new file mode 100644
index 000000000..8e155654f
--- /dev/null
+++ b/pkg/sentry/vfs/pathname.go
@@ -0,0 +1,153 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+var fspathBuilderPool = sync.Pool{
+	New: func() interface{} {
+		return &fspath.Builder{}
+	},
+}
+
+func getFSPathBuilder() *fspath.Builder {
+	return fspathBuilderPool.Get().(*fspath.Builder)
+}
+
+func putFSPathBuilder(b *fspath.Builder) {
+	// No methods can be called on b after b.String(), so reset it to its zero
+	// value (as returned by fspathBuilderPool.New) instead.
+	*b = fspath.Builder{}
+	fspathBuilderPool.Put(b)
+}
+
+// PathnameWithDeleted returns an absolute pathname to vd, consistent with
+// Linux's d_path(). In particular, if vd.Dentry() has been disowned,
+// PathnameWithDeleted appends " (deleted)" to the returned pathname.
+func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+	b := getFSPathBuilder()
+	defer putFSPathBuilder(b)
+	haveRef := false
+	defer func() {
+		if haveRef {
+			vd.DecRef()
+		}
+	}()
+
+	origD := vd.dentry
+loop:
+	for {
+		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+		switch err.(type) {
+		case nil:
+			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+				// GenericPrependPath() will have returned
+				// PrependPathAtVFSRootError in this case since it checks
+				// against vfsroot before mnt.root, but other implementations
+				// of FilesystemImpl.PrependPath() may return nil instead.
+				break loop
+			}
+			nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+			if !nextVD.Ok() {
+				break loop
+			}
+			if haveRef {
+				vd.DecRef()
+			}
+			vd = nextVD
+			haveRef = true
+			// continue loop
+		case PrependPathSyntheticError:
+			// Skip prepending "/" and appending " (deleted)".
+			return b.String(), nil
+		case PrependPathAtVFSRootError, PrependPathAtNonMountRootError:
+			break loop
+		default:
+			return "", err
+		}
+	}
+	b.PrependByte('/')
+	if origD.IsDisowned() {
+		b.AppendString(" (deleted)")
+	}
+	return b.String(), nil
+}
+
+// PathnameForGetcwd returns an absolute pathname to vd, consistent with
+// Linux's sys_getcwd().
+func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+	if vd.dentry.IsDisowned() {
+		return "", syserror.ENOENT
+	}
+
+	b := getFSPathBuilder()
+	defer putFSPathBuilder(b)
+	haveRef := false
+	defer func() {
+		if haveRef {
+			vd.DecRef()
+		}
+	}()
+	unreachable := false
+loop:
+	for {
+		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+		switch err.(type) {
+		case nil:
+			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+				break loop
+			}
+			nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+			if !nextVD.Ok() {
+				unreachable = true
+				break loop
+			}
+			if haveRef {
+				vd.DecRef()
+			}
+			vd = nextVD
+			haveRef = true
+		case PrependPathAtVFSRootError:
+			break loop
+		case PrependPathAtNonMountRootError, PrependPathSyntheticError:
+			unreachable = true
+			break loop
+		default:
+			return "", err
+		}
+	}
+	b.PrependByte('/')
+	if unreachable {
+		b.PrependString("(unreachable)")
+	}
+	return b.String(), nil
+}
+
+// As of this writing, we do not have equivalents to:
+//
+// - d_absolute_path(), which returns EINVAL if (effectively) any call to
+// FilesystemImpl.PrependPath() would return PrependPathAtNonMountRootError.
+//
+// - dentry_path(), which does not walk up mounts (and only returns the path
+// relative to Filesystem root), but also appends "//deleted" for disowned
+// Dentries.
+//
+// These should be added as necessary.
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
index 593144cb7..7a1d9e383 100644
--- a/pkg/sentry/vfs/testutil.go
+++ b/pkg/sentry/vfs/testutil.go
@@ -15,7 +15,10 @@
 package vfs
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -114,6 +117,12 @@ func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) err
 	return syserror.EPERM
 }
 
+// PrependPath implements FilesystemImpl.PrependPath.
+func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
+	b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry)))
+	return PrependPathSyntheticError{}
+}
+
 type fdTestDentry struct {
 	vfsd Dentry
 }
-- 
cgit v1.2.3


From e690651c67d38c2bd8532ddabd2967ebeef58c7e Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Wed, 11 Dec 2019 13:52:48 -0800
Subject: Run kythe build with -std=c++17

We seem to be getting some compiler errors when using the Kythe
extractors.bazelrc.

I'm not able to reproduce this on my local machine, so I'm hoping copying the
cxxopt from gvisor.dev/pr/1350 will fix the build.

PiperOrigin-RevId: 285054258
---
 kokoro/kythe/generate_xrefs.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
index 49186eeeb..799467a34 100644
--- a/kokoro/kythe/generate_xrefs.sh
+++ b/kokoro/kythe/generate_xrefs.sh
@@ -46,6 +46,7 @@ bazel \
   build \
   --override_repository kythe_release="${KYTHE_DIR}" \
   --define=kythe_corpus=gvisor.dev \
+  --cxxopt=-std=c++17 \
   //...
 
 "${KYTHE_DIR}/tools/kzip" merge \
-- 
cgit v1.2.3


From e2e3b38460096a00cabe9041177e729c54e07b3b Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Wed, 11 Dec 2019 15:38:07 -0800
Subject: GCloudProducer: tunnel_dispatch, mock_recorder, and machine.

Work to import GCloudProducer, written in gerrit, which is
too large to do in one CL. GCloudProducer sets up gcloud
instances to run benchmark workloads.

Included are:
- gcloud_mock_recorder - used to Mock GCloudProducer
- tunnel_dispatcher - updates to this module to bring it in line
with the style guide
- machine - updates to this module to bring it in line with the
 style guide

All changes are independent of the rest of the changes, and
should "just build".

PiperOrigin-RevId: 285076423
---
 benchmarks/harness/machine.py                      | 55 +++++++++---
 benchmarks/harness/machine_producers/BUILD         |  5 ++
 .../machine_producers/gcloud_mock_recorder.py      | 97 ++++++++++++++++++++++
 benchmarks/harness/tunnel_dispatcher.py            | 66 ++++++++++++---
 4 files changed, 199 insertions(+), 24 deletions(-)
 create mode 100644 benchmarks/harness/machine_producers/gcloud_mock_recorder.py

diff --git a/benchmarks/harness/machine.py b/benchmarks/harness/machine.py
index 2166d040a..66b719b63 100644
--- a/benchmarks/harness/machine.py
+++ b/benchmarks/harness/machine.py
@@ -11,7 +11,22 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Machine abstraction. This is the primary API for benchmarks."""
+"""Machine abstraction passed to benchmarks to run docker containers.
+
+Abstraction for interacting with test machines. Machines are produced
+by Machine producers and represent a local or remote machine. Benchmark
+methods in /benchmarks/suite are passed the required number of machines in order
+to run the benchmark. Machines contain methods to run commands via bash,
+possibly over ssh. Machines also hold a connection to the docker UNIX socket
+to run contianers.
+
+  Typical usage example:
+
+  machine = Machine()
+  machine.run(cmd)
+  machine.pull(path)
+  container = machine.container()
+"""
 
 import logging
 import re
@@ -28,12 +43,16 @@ from benchmarks.harness import ssh_connection
 from benchmarks.harness import tunnel_dispatcher
 
 
-class Machine:
+class Machine(object):
   """The machine object is the primary object for benchmarks.
 
   Machine objects are passed to each metric function call and benchmarks use
   machines to access real connections to those machines.
+
+  Attributes:
+    _name: Name as a string
   """
+  _name = ""
 
   def run(self, cmd: str) -> Tuple[str, str]:
     """Convenience method for running a bash command on a machine object.
@@ -90,11 +109,15 @@ class Machine:
 
   def sleep(self, amount: float):
     """Sleeps the given amount of time."""
-    raise NotImplementedError
+    time.sleep(amount)
+
+  def __str__(self):
+    return self._name
 
 
 class MockMachine(Machine):
   """A mocked machine."""
+  _name = "mock"
 
   def run(self, cmd: str) -> Tuple[str, str]:
     return "", ""
@@ -119,15 +142,18 @@ def get_address(machine: Machine) -> str:
 
 
 class LocalMachine(Machine):
-  """The local machine."""
+  """The local machine.
+
+  Attributes:
+    _name: Name as a string
+    _docker_client: a pythonic connection to to the local dockerd unix socket.
+      See: https://github.com/docker/docker-py
+  """
 
   def __init__(self, name):
     self._name = name
     self._docker_client = docker.from_env()
 
-  def __str__(self):
-    return self._name
-
   def run(self, cmd: str) -> Tuple[str, str]:
     process = subprocess.Popen(
         cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -155,7 +181,17 @@ class LocalMachine(Machine):
 
 
 class RemoteMachine(Machine):
-  """Remote machine accessible via an SSH connection."""
+  """Remote machine accessible via an SSH connection.
+
+  Attributes:
+    _name: Name as a string
+    _ssh_connection: a paramiko backed ssh connection which can be used to run
+      commands on this machine
+    _tunnel: a python wrapper around a port forwarded ssh connection between a
+      local unix socket and the remote machine's dockerd unix socket.
+    _docker_client: a pythonic wrapper backed by the _tunnel. Allows sending
+      docker commands: see https://github.com/docker/docker-py
+  """
 
   def __init__(self, name, **kwargs):
     self._name = name
@@ -164,9 +200,6 @@ class RemoteMachine(Machine):
     self._tunnel.connect()
     self._docker_client = self._tunnel.get_docker_client()
 
-  def __str__(self):
-    return self._name
-
   def run(self, cmd: str) -> Tuple[str, str]:
     return self._ssh_connection.run(cmd)
 
diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD
index 5b2228e01..a48da02a1 100644
--- a/benchmarks/harness/machine_producers/BUILD
+++ b/benchmarks/harness/machine_producers/BUILD
@@ -33,3 +33,8 @@ py_library(
         requirement("PyYAML", False),
     ],
 )
+
+py_library(
+    name = "gcloud_mock_recorder",
+    srcs = ["gcloud_mock_recorder.py"],
+)
diff --git a/benchmarks/harness/machine_producers/gcloud_mock_recorder.py b/benchmarks/harness/machine_producers/gcloud_mock_recorder.py
new file mode 100644
index 000000000..fd9837a37
--- /dev/null
+++ b/benchmarks/harness/machine_producers/gcloud_mock_recorder.py
@@ -0,0 +1,97 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A recorder and replay for testing the GCloudProducer.
+
+MockPrinter and MockReader handle printing and reading mock data for the
+purposes of testing. MockPrinter is passed to GCloudProducer objects. The user
+can then run scenarios and record them for playback in tests later.
+
+MockReader is passed to MockGcloudProducer objects and handles reading the
+previously recorded mock data.
+
+It is left to the user to check if data printed is properly redacted for their
+own use. The intended usecase for this class is data coming from gcloud
+commands, which will contain public IPs and other instance data.
+
+The data format is json and printed/read from the ./test_data directory. The
+data is the output of subprocess.CompletedProcess objects in json format.
+
+  Typical usage example:
+
+  recorder = MockPrinter()
+  producer = GCloudProducer(args, recorder)
+  machines = producer.get_machines(1)
+  with open("my_file.json") as fd:
+    recorder.write_out(fd)
+
+  reader = MockReader(filename)
+  producer = MockGcloudProducer(args, mock)
+  machines = producer.get_machines(1)
+  assert len(machines) == 1
+"""
+
+import io
+import json
+import subprocess
+
+
+class MockPrinter(object):
+  """Handles printing Mock data for MockGcloudProducer.
+
+  Attributes:
+    _records: list of json object records for printing
+  """
+
+  def __init__(self):
+    self._records = []
+
+  def record(self, entry: subprocess.CompletedProcess):
+    """Records data and strips out ip addresses."""
+
+    record = {
+        "args": entry.args,
+        "stdout": entry.stdout.decode("utf-8"),
+        "returncode": str(entry.returncode)
+    }
+    self._records.append(record)
+
+  def write_out(self, fd: io.FileIO):
+    """Prints out the data into the given filepath."""
+    fd.write(json.dumps(self._records, indent=4))
+
+
+class MockReader(object):
+  """Handles reading Mock data for MockGcloudProducer.
+
+  Attributes:
+    _records: List[json] records read from the passed in file.
+  """
+
+  def __init__(self, filepath: str):
+    with open(filepath, "rb") as file:
+      self._records = json.loads(file.read())
+      self._i = 0
+
+  def __iter__(self):
+    return self
+
+  def __next__(self, args) -> subprocess.CompletedProcess:
+    """Returns the next record as a CompletedProcess."""
+    if self._i < len(self._records):
+      record = self._records[self._i]
+      stdout = record["stdout"].encode("ascii")
+      returncode = int(record["returncode"])
+      return subprocess.CompletedProcess(
+          args=args, returncode=returncode, stdout=stdout, stderr=b"")
+    raise StopIteration()
diff --git a/benchmarks/harness/tunnel_dispatcher.py b/benchmarks/harness/tunnel_dispatcher.py
index 8dfe2862a..c56fd022a 100644
--- a/benchmarks/harness/tunnel_dispatcher.py
+++ b/benchmarks/harness/tunnel_dispatcher.py
@@ -11,7 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tunnel handles setting up connections to remote machines."""
+"""Tunnel handles setting up connections to remote machines.
+
+Tunnel dispatcher is a wrapper around the connection from a local UNIX socket
+and a remote UNIX socket via SSH with port forwarding. This is done to
+initialize the pythonic dockerpy client to run containers on the remote host by
+connecting to /var/run/docker.sock (where Docker is listening). Tunnel
+dispatcher sets up the local UNIX socket and calls the `ssh` command as a
+subprocess, and holds a reference to that subprocess. It manages clean-up on
+exit as best it can by killing the ssh subprocess and deleting the local UNIX
+socket,stored in /tmp for easy cleanup in most systems if this fails.
+
+  Typical usage example:
+
+  t = Tunnel(name, **kwargs)
+  t.connect()
+  client = t.get_docker_client() #
+  client.containers.run("ubuntu", "echo hello world")
+
+"""
 
 import os
 import tempfile
@@ -21,31 +39,53 @@ import docker
 import pexpect
 
 SSH_TUNNEL_COMMAND = """ssh
- -o GlobalKnownHostsFile=/dev/null
- -o UserKnownHostsFile=/dev/null
- -o StrictHostKeyChecking=no
- -nNT -L {filename}:/var/run/docker.sock
- -i {key_path}
- {username}@{hostname}"""
+    -o GlobalKnownHostsFile=/dev/null
+    -o UserKnownHostsFile=/dev/null
+    -o StrictHostKeyChecking=no
+    -o IdentitiesOnly=yes
+    -nNT -L {filename}:/var/run/docker.sock
+    -i {key_path}
+    {username}@{hostname}"""
 
 
-class Tunnel:
+class Tunnel(object):
   """The tunnel object represents the tunnel via ssh.
 
   This connects a local unix domain socket with a remote socket.
+
+  Attributes:
+      _filename: a temporary name of the UNIX socket prefixed by the name
+        argument.
+      _hostname: the IP or resolvable hostname of the remote host.
+      _username: the username of the ssh_key used to run ssh.
+      _key_path: path to a valid key.
+      _key_password: optional password to the ssh key in _key_path
+      _process: holds reference to the ssh subprocess created.
+
+    Returns:
+      The new minimum port.
+
+    Raises:
+      ConnectionError: If no available port is found.
   """
 
-  def __init__(self, name, hostname: str, username: str, key_path: str,
+  def __init__(self,
+               name: str,
+               hostname: str,
+               username: str,
+               key_path: str,
+               key_password: str = "",
                **kwargs):
     self._filename = tempfile.NamedTemporaryFile(prefix=name).name
     self._hostname = hostname
     self._username = username
     self._key_path = key_path
+    self._key_password = key_password
     self._kwargs = kwargs
     self._process = None
 
   def connect(self):
-    """Connects the SSH tunnel."""
+    """Connects the SSH tunnel and stores the subprocess reference in _process."""
     cmd = SSH_TUNNEL_COMMAND.format(
         filename=self._filename,
         key_path=self._key_path,
@@ -54,9 +94,9 @@ class Tunnel:
     self._process = pexpect.spawn(cmd, timeout=10)
 
     # If given a password, assume we'll be asked for it.
-    if "key_password" in self._kwargs:
+    if self._key_password:
       self._process.expect(["Enter passphrase for key .*: "])
-      self._process.sendline(self._kwargs["key_password"])
+      self._process.sendline(self._key_password)
 
     while True:
       # Wait for the tunnel to appear.
@@ -71,7 +111,7 @@ class Tunnel:
     return self._filename
 
   def get_docker_client(self):
-    """Returns a docker client for this Tunne0l."""
+    """Returns a docker client for this Tunnel."""
     return docker.DockerClient(base_url="unix:/" + self._filename)
 
   def __del__(self):
-- 
cgit v1.2.3


From 1601e78a52e9181d1ea8a3ff36399575e95ad0bf Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 11 Dec 2019 16:39:58 -0800
Subject: Add syscall tests for getxattr and setxattr.

Support for getxattr and setxattr are in subsequent commits.

PiperOrigin-RevId: 285088817
---
 test/syscalls/BUILD          |   5 +
 test/syscalls/linux/BUILD    |  21 ++
 test/syscalls/linux/xattr.cc | 491 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 517 insertions(+)
 create mode 100644 test/syscalls/linux/xattr.cc

diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 829693e8e..a3a85917d 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -717,6 +717,11 @@ syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test")
 
 syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
 
+syscall_test(
+    add_overlay = True,
+    test = "//test/syscalls/linux:xattr_test",
+)
+
 go_binary(
     name = "syscall_test_runner",
     testonly = 1,
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 6ea922fb4..0bbaaf28a 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3722,3 +3722,24 @@ cc_binary(
         "@com_google_googletest//:gtest",
     ],
 )
+
+cc_binary(
+    name = "xattr_test",
+    testonly = 1,
+    srcs = [
+        "file_base.h",
+        "xattr.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:posix_error",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
new file mode 100644
index 000000000..3e07b634b
--- /dev/null
+++ b/test/syscalls/linux/xattr.cc
@@ -0,0 +1,491 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/util/capability_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class XattrTest : public FileTest {};
+
+TEST_F(XattrTest, XattrNullName) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+
+  EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
+              SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, XattrEmptyName) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+
+  EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(getxattr(path, "", nullptr, 0), SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, XattrLargeName) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  std::string name = "user.";
+  name += std::string(XATTR_NAME_MAX - name.length(), 'a');
+  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+              SyscallSucceedsWithValue(0));
+
+  name += "a";
+  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+              SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, XattrInvalidPrefix) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  std::string name(XATTR_NAME_MAX, 'a');
+  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+TEST_F(XattrTest, XattrReadOnly) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  // Drop capabilities that allow us to override file and directory permissions.
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IRUSR));
+
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+              SyscallFailsWithErrno(EACCES));
+
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, XattrWriteOnly) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  // Drop capabilities that allow us to override file and directory permissions.
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+  ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IWUSR));
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
+}
+
+TEST_F(XattrTest, XattrTrustedWithNonadmin) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+  SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+  const char* path = test_file_name_.c_str();
+  const char name[] = "trusted.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, XattrOnDirectory) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(getxattr(dir.path().c_str(), name, NULL, 0),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, XattrOnSymlink) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(link.path().c_str(), name, NULL, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(getxattr(link.path().c_str(), name, NULL, 0),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, XattrOnInvalidFileTypes) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  char name[] = "user.abc";
+
+  char char_device[] = "/dev/zero";
+  EXPECT_THAT(setxattr(char_device, name, NULL, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(getxattr(char_device, name, NULL, 0),
+              SyscallFailsWithErrno(ENODATA));
+
+  // Use tmpfs, where creation of named pipes is supported.
+  const std::string fifo = NewTempAbsPathInDir("/dev/shm");
+  const char* path = fifo.c_str();
+  EXPECT_THAT(mknod(path, S_IFIFO | S_IRUSR | S_IWUSR, 0), SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, NULL, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(getxattr(path, name, NULL, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val = {'a', 'a'};
+  size_t size = 1;
+  EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf = {'-', '-'};
+  std::vector<char> expected_buf = {'a', '-'};
+  EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+              SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrZeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds());
+
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, XATTR_SIZE_MAX),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, SetxattrSizeTooLarge) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+
+  // Note that each particular fs implementation may stipulate a lower size
+  // limit, in which case we actually may fail (e.g. error with ENOSPC) for
+  // some sizes under XATTR_SIZE_MAX.
+  size_t size = XATTR_SIZE_MAX + 1;
+  std::vector<char> val(size);
+  EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+              SyscallFailsWithErrno(E2BIG));
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
+              SyscallFailsWithErrno(EFAULT));
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val(XATTR_SIZE_MAX + 1);
+  std::fill(val.begin(), val.end(), 'a');
+  size_t size = 1;
+  EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf = {'-', '-'};
+  std::vector<char> expected_buf = {'a', '-'};
+  EXPECT_THAT(getxattr(path, name, buf.data(), size),
+              SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val = {'a', 'a'};
+  EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf = {'-', '-'};
+  std::vector<char> expected_buf = {'a', '-'};
+  EXPECT_THAT(getxattr(path, name, buf.data(), 2), SyscallSucceedsWithValue(1));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrReplaceWithLarger) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val = {'a', 'a'};
+  EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf = {'-', '-'};
+  EXPECT_THAT(getxattr(path, name, buf.data(), 2), SyscallSucceedsWithValue(2));
+  EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, SetxattrCreateFlag) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
+              SyscallFailsWithErrno(EEXIST));
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrReplaceFlag) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
+              SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
+              SyscallSucceeds());
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrInvalidFlags) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  int invalid_flags = 0xff;
+  EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(XattrTest, Getxattr) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  int val = 1234;
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  int buf = 0;
+  EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val = {'a', 'a'};
+  size_t size = val.size();
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, 1), SyscallFailsWithErrno(ERANGE));
+  EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  EXPECT_THAT(setxattr(path, name, &val, 1, /*flags=*/0), SyscallSucceeds());
+
+  std::vector<char> buf(XATTR_SIZE_MAX);
+  std::fill(buf.begin(), buf.end(), '-');
+  std::vector<char> expected_buf = buf;
+  expected_buf[0] = 'a';
+  EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+              SyscallSucceedsWithValue(1));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, GetxattrZeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
+              SyscallSucceeds());
+
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, 0),
+              SyscallSucceedsWithValue(sizeof(val)));
+  EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, GetxattrSizeTooLarge) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf(XATTR_SIZE_MAX + 1);
+  std::fill(buf.begin(), buf.end(), '-');
+  std::vector<char> expected_buf = buf;
+  expected_buf[0] = 'a';
+  EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+              SyscallSucceedsWithValue(sizeof(val)));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, GetxattrNullValue) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  EXPECT_THAT(getxattr(path, name, nullptr, size),
+              SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  size_t size = sizeof(val);
+  // Set value with zero size.
+  EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds());
+  // Get value with nonzero size.
+  EXPECT_THAT(getxattr(path, name, nullptr, size), SyscallSucceedsWithValue(0));
+
+  // Set value with nonzero size.
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+  // Get value with zero size.
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(size));
+}
+
+TEST_F(XattrTest, GetxattrNonexistentName) {
+  // TODO(b/127675828): Support getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  std::string name = "user.nonexistent";
+  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+              SyscallFailsWithErrno(ENODATA));
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From 6fc9f0aefd89ce42ef2c38ea7853f9ba7c4bee04 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 11 Dec 2019 17:51:37 -0800
Subject: Add support for TCP_USER_TIMEOUT option.

The implementation follows the linux behavior where specifying
a TCP_USER_TIMEOUT will cause the resend timer to honor the
user specified timeout rather than the default rto based timeout.

Further it alters when connections are timedout due to keepalive
failures. It does not alter the behavior of when keepalives are
sent. This is as per the linux behavior.

PiperOrigin-RevId: 285099795
---
 pkg/sentry/socket/netstack/netstack.go       |  23 ++++
 pkg/tcpip/tcpip.go                           |   5 +
 pkg/tcpip/transport/tcp/BUILD                |   1 +
 pkg/tcpip/transport/tcp/accept.go            |  15 +++
 pkg/tcpip/transport/tcp/connect.go           |  19 ++-
 pkg/tcpip/transport/tcp/endpoint.go          |  19 +++
 pkg/tcpip/transport/tcp/protocol.go          |  21 ++-
 pkg/tcpip/transport/tcp/rcv.go               |  19 ++-
 pkg/tcpip/transport/tcp/rcv_state.go         |  29 ++++
 pkg/tcpip/transport/tcp/snd.go               |  48 ++++++-
 pkg/tcpip/transport/tcp/snd_state.go         |  10 ++
 pkg/tcpip/transport/tcp/tcp_test.go          | 194 ++++++++++++++++++++++++---
 test/syscalls/linux/socket_inet_loopback.cc  |  56 +++++++-
 test/syscalls/linux/socket_ip_tcp_generic.cc |  63 +++++++++
 test/syscalls/linux/tcp_socket.cc            |  25 ++++
 15 files changed, 509 insertions(+), 38 deletions(-)
 create mode 100644 pkg/tcpip/transport/tcp/rcv_state.go

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index fe5a46aa3..8a6522eac 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1127,6 +1127,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 
 		return int32(time.Duration(v) / time.Second), nil
 
+	case linux.TCP_USER_TIMEOUT:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.TCPUserTimeoutOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		return int32(time.Duration(v) / time.Millisecond), nil
+
 	case linux.TCP_INFO:
 		var v tcpip.TCPInfoOption
 		if err := ep.GetSockOpt(&v); err != nil {
@@ -1563,6 +1575,17 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))))
 
+	case linux.TCP_USER_TIMEOUT:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+
+		v := int32(usermem.ByteOrder.Uint32(optVal))
+		if v < 0 {
+			return syserr.ErrInvalidArgument
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))))
+
 	case linux.TCP_CONGESTION:
 		v := tcpip.CongestionControlOption(optVal)
 		if err := ep.SetSockOpt(v); err != nil {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index d5bb5b6ed..f62fd729f 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -576,6 +576,11 @@ type KeepaliveIntervalOption time.Duration
 // closed.
 type KeepaliveCountOption int
 
+// TCPUserTimeoutOption is used by SetSockOpt/GetSockOpt to specify a user
+// specified timeout for a given TCP connection.
+// See: RFC5482 for details.
+type TCPUserTimeoutOption time.Duration
+
 // CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get
 // the current congestion control algorithm.
 type CongestionControlOption string
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 455a1c098..3b353d56c 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -28,6 +28,7 @@ go_library(
         "forwarder.go",
         "protocol.go",
         "rcv.go",
+        "rcv_state.go",
         "reno.go",
         "sack.go",
         "sack_scoreboard.go",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 74df3edfb..5422ae80c 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -242,6 +242,13 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 
 	n.initGSO()
 
+	// Now inherit any socket options that should be inherited from the
+	// listening endpoint.
+	// In case of Forwarder listenEP will be nil and hence this check.
+	if l.listenEP != nil {
+		l.listenEP.propagateInheritableOptions(n)
+	}
+
 	// Register new endpoint so that packets are routed to it.
 	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil {
 		n.Close()
@@ -350,6 +357,14 @@ func (e *endpoint) deliverAccepted(n *endpoint) {
 	}
 }
 
+// propagateInheritableOptions propagates any options set on the listening
+// endpoint to the newly created endpoint.
+func (e *endpoint) propagateInheritableOptions(n *endpoint) {
+	e.mu.Lock()
+	n.userTimeout = e.userTimeout
+	e.mu.Unlock()
+}
+
 // handleSynSegment is called in its own goroutine once the listening endpoint
 // receives a SYN segment. It is responsible for completing the handshake and
 // queueing the new endpoint for acceptance.
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 3d059c302..4c34fc9d2 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -862,7 +862,7 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 	}
 	e.state = StateError
 	e.HardError = err
-	if err != tcpip.ErrConnectionReset {
+	if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout {
 		// The exact sequence number to be used for the RST is the same as the
 		// one used by Linux. We need to handle the case of window being shrunk
 		// which can cause sndNxt to be outside the acceptable window on the
@@ -1087,12 +1087,24 @@ func (e *endpoint) handleSegments() *tcpip.Error {
 // keepalive packets periodically when the connection is idle. If we don't hear
 // from the other side after a number of tries, we terminate the connection.
 func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
+	e.mu.RLock()
+	userTimeout := e.userTimeout
+	e.mu.RUnlock()
+
 	e.keepalive.Lock()
 	if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() {
 		e.keepalive.Unlock()
 		return nil
 	}
 
+	// If a userTimeout is set then abort the connection if it is
+	// exceeded.
+	if userTimeout != 0 && time.Since(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 {
+		e.keepalive.Unlock()
+		e.stack.Stats().TCP.EstablishedTimedout.Increment()
+		return tcpip.ErrTimeout
+	}
+
 	if e.keepalive.unacked >= e.keepalive.count {
 		e.keepalive.Unlock()
 		e.stack.Stats().TCP.EstablishedTimedout.Increment()
@@ -1112,7 +1124,6 @@ func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
 // whether it is enabled for this endpoint.
 func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
 	e.keepalive.Lock()
-	defer e.keepalive.Unlock()
 	if receivedData {
 		e.keepalive.unacked = 0
 	}
@@ -1120,6 +1131,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
 	// data to send.
 	if !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt {
 		e.keepalive.timer.disable()
+		e.keepalive.Unlock()
 		return
 	}
 	if e.keepalive.unacked > 0 {
@@ -1127,6 +1139,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
 	} else {
 		e.keepalive.timer.enable(e.keepalive.idle)
 	}
+	e.keepalive.Unlock()
 }
 
 // disableKeepaliveTimer stops the keepalive timer.
@@ -1239,6 +1252,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			w: &e.snd.resendWaker,
 			f: func() *tcpip.Error {
 				if !e.snd.retransmitTimerExpired() {
+					e.stack.Stats().TCP.EstablishedTimedout.Increment()
 					return tcpip.ErrTimeout
 				}
 				return nil
@@ -1405,6 +1419,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		if s == nil {
 			break
 		}
+
 		e.tryDeliverSegmentFromClosedEndpoint(s)
 	}
 
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 4861ab513..dd8b47cbe 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -341,6 +341,7 @@ type endpoint struct {
 	// TCP should never broadcast but Linux nevertheless supports enabling/
 	// disabling SO_BROADCAST, albeit as a NOOP.
 	broadcast bool
+
 	// Values used to reserve a port or register a transport endpoint
 	// (which ever happens first).
 	boundBindToDevice tcpip.NICID
@@ -474,6 +475,12 @@ type endpoint struct {
 	// without hearing a response, the connection is closed.
 	keepalive keepalive
 
+	// userTimeout if non-zero specifies a user specified timeout for
+	// a connection w/ pending data to send. A connection that has pending
+	// unacked data will be forcibily aborted if the timeout is reached
+	// without any data being acked.
+	userTimeout time.Duration
+
 	// pendingAccepted is a synchronization primitive used to track number
 	// of connections that are queued up to be delivered to the accepted
 	// channel. We use this to ensure that all goroutines blocked on writing
@@ -1333,6 +1340,12 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
 		return nil
 
+	case tcpip.TCPUserTimeoutOption:
+		e.mu.Lock()
+		e.userTimeout = time.Duration(v)
+		e.mu.Unlock()
+		return nil
+
 	case tcpip.BroadcastOption:
 		e.mu.Lock()
 		e.broadcast = v != 0
@@ -1591,6 +1604,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.keepalive.Unlock()
 		return nil
 
+	case *tcpip.TCPUserTimeoutOption:
+		e.mu.Lock()
+		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
+		e.mu.Unlock()
+		return nil
+
 	case *tcpip.OutOfBandInlineOption:
 		// We don't currently support disabling this option.
 		*o = 1
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 89b965c23..bc718064c 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -162,13 +162,26 @@ func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Transpo
 func replyWithReset(s *segment) {
 	// Get the seqnum from the packet if the ack flag is set.
 	seq := seqnum.Value(0)
+	ack := seqnum.Value(0)
+	flags := byte(header.TCPFlagRst)
+	// As per RFC 793 page 35 (Reset Generation)
+	//   1.  If the connection does not exist (CLOSED) then a reset is sent
+	//   in response to any incoming segment except another reset.  In
+	//   particular, SYNs addressed to a non-existent connection are rejected
+	//   by this means.
+
+	//   If the incoming segment has an ACK field, the reset takes its
+	//   sequence number from the ACK field of the segment, otherwise the
+	//   reset has sequence number zero and the ACK field is set to the sum
+	//   of the sequence number and segment length of the incoming segment.
+	//   The connection remains in the CLOSED state.
 	if s.flagIsSet(header.TCPFlagAck) {
 		seq = s.ackNumber
+	} else {
+		flags |= header.TCPFlagAck
+		ack = s.sequenceNumber.Add(s.logicalLen())
 	}
-
-	ack := s.sequenceNumber.Add(s.logicalLen())
-
-	sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */)
+	sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, flags, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */)
 }
 
 // SetOption implements TransportProtocol.SetOption.
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 5ee499c36..0a5534959 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -50,16 +50,20 @@ type receiver struct {
 	pendingRcvdSegments segmentHeap
 	pendingBufUsed      seqnum.Size
 	pendingBufSize      seqnum.Size
+
+	// Time when the last ack was received.
+	lastRcvdAckTime time.Time `state:".(unixTime)"`
 }
 
 func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8, pendingBufSize seqnum.Size) *receiver {
 	return &receiver{
-		ep:             ep,
-		rcvNxt:         irs + 1,
-		rcvAcc:         irs.Add(rcvWnd + 1),
-		rcvWnd:         rcvWnd,
-		rcvWndScale:    rcvWndScale,
-		pendingBufSize: pendingBufSize,
+		ep:              ep,
+		rcvNxt:          irs + 1,
+		rcvAcc:          irs.Add(rcvWnd + 1),
+		rcvWnd:          rcvWnd,
+		rcvWndScale:     rcvWndScale,
+		pendingBufSize:  pendingBufSize,
+		lastRcvdAckTime: time.Now(),
 	}
 }
 
@@ -360,6 +364,9 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
 		return true, nil
 	}
 
+	// Store the time of the last ack.
+	r.lastRcvdAckTime = time.Now()
+
 	// Defer segment processing if it can't be consumed now.
 	if !r.consumeSegment(s, segSeq, segLen) {
 		if segLen > 0 || s.flagIsSet(header.TCPFlagFin) {
diff --git a/pkg/tcpip/transport/tcp/rcv_state.go b/pkg/tcpip/transport/tcp/rcv_state.go
new file mode 100644
index 000000000..2bf21a2e7
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/rcv_state.go
@@ -0,0 +1,29 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"time"
+)
+
+// saveLastRcvdAckTime is invoked by stateify.
+func (r *receiver) saveLastRcvdAckTime() unixTime {
+	return unixTime{r.lastRcvdAckTime.Unix(), r.lastRcvdAckTime.UnixNano()}
+}
+
+// loadLastRcvdAckTime is invoked by stateify.
+func (r *receiver) loadLastRcvdAckTime(unix unixTime) {
+	r.lastRcvdAckTime = time.Unix(unix.second, unix.nano)
+}
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 8332a0179..8a947dc66 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -28,8 +28,11 @@ import (
 )
 
 const (
-	// minRTO is the minimum allowed value for the retransmit timeout.
-	minRTO = 200 * time.Millisecond
+	// MinRTO is the minimum allowed value for the retransmit timeout.
+	MinRTO = 200 * time.Millisecond
+
+	// MaxRTO is the maximum allowed value for the retransmit timeout.
+	MaxRTO = 120 * time.Second
 
 	// InitialCwnd is the initial congestion window.
 	InitialCwnd = 10
@@ -134,6 +137,10 @@ type sender struct {
 	// rttMeasureTime is the time when the rttMeasureSeqNum was sent.
 	rttMeasureTime time.Time `state:".(unixTime)"`
 
+	// firstRetransmittedSegXmitTime is the original transmit time of
+	// the first segment that was retransmitted due to RTO expiration.
+	firstRetransmittedSegXmitTime time.Time `state:".(unixTime)"`
+
 	closed      bool
 	writeNext   *segment
 	writeList   segmentList
@@ -392,8 +399,8 @@ func (s *sender) updateRTO(rtt time.Duration) {
 
 	s.rto = s.rtt.srtt + 4*s.rtt.rttvar
 	s.rtt.Unlock()
-	if s.rto < minRTO {
-		s.rto = minRTO
+	if s.rto < MinRTO {
+		s.rto = MinRTO
 	}
 }
 
@@ -438,8 +445,30 @@ func (s *sender) retransmitTimerExpired() bool {
 	s.ep.stack.Stats().TCP.Timeouts.Increment()
 	s.ep.stats.SendErrors.Timeouts.Increment()
 
-	// Give up if we've waited more than a minute since the last resend.
-	if s.rto >= 60*time.Second {
+	// Give up if we've waited more than a minute since the last resend or
+	// if a user time out is set and we have exceeded the user specified
+	// timeout since the first retransmission.
+	s.ep.mu.RLock()
+	uto := s.ep.userTimeout
+	s.ep.mu.RUnlock()
+
+	if s.firstRetransmittedSegXmitTime.IsZero() {
+		// We store the original xmitTime of the segment that we are
+		// about to retransmit as the retransmission time. This is
+		// required as by the time the retransmitTimer has expired the
+		// segment has already been sent and unacked for the RTO at the
+		// time the segment was sent.
+		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
+	}
+
+	elapsed := time.Since(s.firstRetransmittedSegXmitTime)
+	remaining := MaxRTO
+	if uto != 0 {
+		// Cap to the user specified timeout if one is specified.
+		remaining = uto - elapsed
+	}
+
+	if remaining <= 0 || s.rto >= MaxRTO {
 		return false
 	}
 
@@ -447,6 +476,11 @@ func (s *sender) retransmitTimerExpired() bool {
 	// below.
 	s.rto *= 2
 
+	// Cap RTO to remaining time.
+	if s.rto > remaining {
+		s.rto = remaining
+	}
+
 	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
 	//
 	// Retransmit timeouts:
@@ -1168,6 +1202,8 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 		// RFC 6298 Rule 5.3
 		if s.sndUna == s.sndNxt {
 			s.outstanding = 0
+			// Reset firstRetransmittedSegXmitTime to the zero value.
+			s.firstRetransmittedSegXmitTime = time.Time{}
 			s.resendTimer.disable()
 		}
 	}
diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go
index 12eff8afc..8b20c3455 100644
--- a/pkg/tcpip/transport/tcp/snd_state.go
+++ b/pkg/tcpip/transport/tcp/snd_state.go
@@ -48,3 +48,13 @@ func (s *sender) loadRttMeasureTime(unix unixTime) {
 func (s *sender) afterLoad() {
 	s.resendTimer.init(&s.resendWaker)
 }
+
+// saveFirstRetransmittedSegXmitTime is invoked by stateify.
+func (s *sender) saveFirstRetransmittedSegXmitTime() unixTime {
+	return unixTime{s.firstRetransmittedSegXmitTime.Unix(), s.firstRetransmittedSegXmitTime.UnixNano()}
+}
+
+// loadFirstRetransmittedSegXmitTime is invoked by stateify.
+func (s *sender) loadFirstRetransmittedSegXmitTime(unix unixTime) {
+	s.firstRetransmittedSegXmitTime = time.Unix(unix.second, unix.nano)
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index bc5cfcf0e..2a83f7bcc 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -323,8 +323,8 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.SeqNum(uint32(c.IRS+1)),
-		checker.AckNum(uint32(iss)+1),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
 }
 
 func TestTCPResetsReceivedIncrement(t *testing.T) {
@@ -460,18 +460,17 @@ func TestConnectResetAfterClose(t *testing.T) {
 			checker.TCP(
 				checker.DstPort(context.TestPort),
 				checker.SeqNum(uint32(c.IRS)+2),
-				checker.AckNum(790),
-				checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+				checker.AckNum(0),
+				checker.TCPFlags(header.TCPFlagRst),
 			),
 		)
 		break
 	}
 }
 
-// TestClosingWithEnqueuedSegments tests handling of
-// still enqueued segments when the endpoint transitions
-// to StateClose. The in-flight segments would be re-enqueued
-// to a any listening endpoint.
+// TestClosingWithEnqueuedSegments tests handling of still enqueued segments
+// when the endpoint transitions to StateClose. The in-flight segments would be
+// re-enqueued to a any listening endpoint.
 func TestClosingWithEnqueuedSegments(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
@@ -576,8 +575,8 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
 		checker.TCP(
 			checker.DstPort(context.TestPort),
 			checker.SeqNum(uint32(c.IRS)+2),
-			checker.AckNum(793),
-			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+			checker.AckNum(0),
+			checker.TCPFlags(header.TCPFlagRst),
 		),
 	)
 }
@@ -914,7 +913,7 @@ func TestSendRstOnListenerRxAckV4(t *testing.T) {
 
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.DstPort(context.TestPort),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.TCPFlags(header.TCPFlagRst),
 		checker.SeqNum(200)))
 }
 
@@ -942,7 +941,7 @@ func TestSendRstOnListenerRxAckV6(t *testing.T) {
 
 	checker.IPv6(t, c.GetV6Packet(), checker.TCP(
 		checker.DstPort(context.TestPort),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.TCPFlags(header.TCPFlagRst),
 		checker.SeqNum(200)))
 }
 
@@ -4291,8 +4290,9 @@ func TestKeepalive(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
+	const keepAliveInterval = 10 * time.Millisecond
 	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
-	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(10 * time.Millisecond))
+	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
 	c.EP.SetSockOpt(tcpip.KeepaliveCountOption(5))
 	c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1))
 
@@ -4382,13 +4382,29 @@ func TestKeepalive(t *testing.T) {
 		)
 	}
 
+	// Sleep for a litte over the KeepAlive interval to make sure
+	// the timer has time to fire after the last ACK and close the
+	// close the socket.
+	time.Sleep(keepAliveInterval + 5*time.Millisecond)
+
 	// The connection should be terminated after 5 unacked keepalives.
+	// Send an ACK to trigger a RST from the stack as the endpoint should
+	// be dead.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
 			checker.SeqNum(uint32(next)),
-			checker.AckNum(uint32(790)),
-			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+			checker.AckNum(uint32(0)),
+			checker.TCPFlags(header.TCPFlagRst),
 		),
 	)
 
@@ -6157,8 +6173,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.SeqNum(uint32(ackHeaders.AckNum)),
-		checker.AckNum(uint32(ackHeaders.SeqNum)),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
 
 	if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != want {
 		t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %v, want = %v", got, want)
@@ -6336,7 +6352,147 @@ func TestTCPCloseWithData(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.SeqNum(uint32(ackHeaders.AckNum)),
-		checker.AckNum(uint32(ackHeaders.SeqNum)),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
+}
+
+func TestTCPUserTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
 
+	userTimeout := 50 * time.Millisecond
+	c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
+
+	// Send some data and wait before ACKing it.
+	view := buffer.NewView(3)
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %v", err)
+	}
+
+	next := uint32(c.IRS) + 1
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(len(view)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	// Wait for a little over the minimum retransmit timeout of 200ms for
+	// the retransmitTimer to fire and close the connection.
+	time.Sleep(tcp.MinRTO + 10*time.Millisecond)
+
+	// No packet should be received as the connection should be silently
+	// closed due to timeout.
+	c.CheckNoPacket("unexpected packet received after userTimeout has expired")
+
+	next += uint32(len(view))
+
+	// The connection should be terminated after userTimeout has expired.
+	// Send an ACK to trigger a RST from the stack as the endpoint should
+	// be dead.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(next)),
+			checker.AckNum(uint32(0)),
+			checker.TCPFlags(header.TCPFlagRst),
+		),
+	)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout)
+	}
+
+	if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want)
+	}
+}
+
+func TestKeepaliveWithUserTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
+
+	const keepAliveInterval = 10 * time.Millisecond
+	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
+	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
+	c.EP.SetSockOpt(tcpip.KeepaliveCountOption(10))
+	c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1))
+
+	// Set userTimeout to be the duration for 3 keepalive probes.
+	userTimeout := 30 * time.Millisecond
+	c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
+
+	// Check that the connection is still alive.
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock)
+	}
+
+	// Now receive 2 keepalives, but don't ACK them. The connection should
+	// be reset when the 3rd one should be sent due to userTimeout being
+	// 30ms and each keepalive probe should be sent 10ms apart as set above after
+	// the connection has been idle for 10ms.
+	for i := 0; i < 2; i++ {
+		b := c.GetPacket()
+		checker.IPv4(t, b,
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)),
+				checker.AckNum(uint32(790)),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+	}
+
+	// Sleep for a litte over the KeepAlive interval to make sure
+	// the timer has time to fire after the last ACK and close the
+	// close the socket.
+	time.Sleep(keepAliveInterval + 5*time.Millisecond)
+
+	// The connection should be terminated after 30ms.
+	// Send an ACK to trigger a RST from the stack as the endpoint should
+	// be dead.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(c.IRS + 1),
+		RcvWnd:  30000,
+	})
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS+1)),
+			checker.AckNum(uint32(0)),
+			checker.TCPFlags(header.TCPFlagRst),
+		),
+	)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout)
+	}
+	if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want)
+	}
 }
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index fa4358ae4..761c3a9fe 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -206,7 +206,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   }
   // TODO(b/138400178): Fix cooperative S/R failure when ds.reset() is invoked
   // before function end.
-  // ds.reset()
+  // ds.reset();
 }
 
 TEST_P(SocketInetLoopbackTest, TCPbacklog) {
@@ -603,6 +603,60 @@ TEST_P(SocketInetLoopbackTest, TCPTimeWaitTest_NoRandomSave) {
               SyscallSucceeds());
 }
 
+TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Set the userTimeout on the listening socket.
+  constexpr int kUserTimeout = 10;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kUserTimeout, sizeof(kUserTimeout)),
+              SyscallSucceeds());
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+  // Verify that the accepted socket inherited the user timeout set on
+  // listening socket.
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(accepted.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kUserTimeout);
+}
+
 INSTANTIATE_TEST_SUITE_P(
     All, SocketInetLoopbackTest,
     ::testing::Values(
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index c74273436..57ce8e169 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -812,5 +812,68 @@ TEST_P(TCPSocketPairTest, TestTCPCloseWithData) {
   ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
 }
 
+TEST_P(TCPSocketPairTest, TCPUserTimeoutDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &get, &get_len),
+              SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);  // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutZero) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kZero = 0;
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kZero, sizeof(kZero)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &get, &get_len),
+              SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);  // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutBelowZero) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kNeg = -10;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kNeg, sizeof(kNeg)),
+              SyscallFailsWithErrno(EINVAL));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &get, &get_len),
+              SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);  // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kAbove = 10;
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kAbove, sizeof(kAbove)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &get, &get_len),
+              SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kAbove);
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 99863b0ed..c503f3568 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1175,6 +1175,31 @@ TEST_P(SimpleTcpSocketTest, SetMaxSegFailsForInvalidMSSValues) {
   }
 }
 
+TEST_P(SimpleTcpSocketTest, SetTCPUserTimeout) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  {
+    constexpr int kTCPUserTimeout = -1;
+    EXPECT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                           &kTCPUserTimeout, sizeof(kTCPUserTimeout)),
+                SyscallFailsWithErrno(EINVAL));
+  }
+
+  // kTCPUserTimeout is in milliseconds.
+  constexpr int kTCPUserTimeout = 100;
+  ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kTCPUserTimeout, sizeof(kTCPUserTimeout)),
+              SyscallSucceedsWithValue(0));
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kTCPUserTimeout);
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From b9aa62b9f907e8de5244ac7cdb518960faafa307 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 11 Dec 2019 19:12:51 -0800
Subject: Enable IPv6 in runsc

Fixes #1341

PiperOrigin-RevId: 285108973
---
 runsc/boot/network.go    | 35 ++++++++++++------
 runsc/sandbox/BUILD      |  1 +
 runsc/sandbox/network.go | 95 ++++++++++++++++++++++++++++--------------------
 3 files changed, 80 insertions(+), 51 deletions(-)

diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index f98c5fd36..dd4926bb9 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -80,7 +80,8 @@ type CreateLinksAndRoutesArgs struct {
 	LoopbackLinks []LoopbackLink
 	FDBasedLinks  []FDBasedLink
 
-	DefaultGateway DefaultRoute
+	Defaultv4Gateway DefaultRoute
+	Defaultv6Gateway DefaultRoute
 }
 
 // Empty returns true if route hasn't been set.
@@ -122,10 +123,10 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		nicID++
 		nicids[link.Name] = nicID
 
-		ep := loopback.New()
+		linkEP := loopback.New()
 
 		log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
-		if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, true /* loopback */); err != nil {
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, true /* loopback */); err != nil {
 			return err
 		}
 
@@ -157,7 +158,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 
 		mac := tcpip.LinkAddress(link.LinkAddress)
-		ep, err := fdbased.New(&fdbased.Options{
+		linkEP, err := fdbased.New(&fdbased.Options{
 			FDs:                FDs,
 			MTU:                uint32(link.MTU),
 			EthernetHeader:     true,
@@ -172,7 +173,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 
 		log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
-		if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, false /* loopback */); err != nil {
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil {
 			return err
 		}
 
@@ -186,12 +187,24 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 	}
 
-	if !args.DefaultGateway.Route.Empty() {
-		nicID, ok := nicids[args.DefaultGateway.Name]
+	if !args.Defaultv4Gateway.Route.Empty() {
+		nicID, ok := nicids[args.Defaultv4Gateway.Name]
 		if !ok {
-			return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name)
+			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name)
 		}
-		route, err := args.DefaultGateway.Route.toTcpipRoute(nicID)
+		route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID)
+		if err != nil {
+			return err
+		}
+		routes = append(routes, route)
+	}
+
+	if !args.Defaultv6Gateway.Route.Empty() {
+		nicID, ok := nicids[args.Defaultv6Gateway.Name]
+		if !ok {
+			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name)
+		}
+		route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID)
 		if err != nil {
 			return err
 		}
@@ -208,11 +221,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP, loopback bool) error {
 	if loopback {
 		if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(ep)); err != nil {
-			return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v) failed: %v", id, name, err)
+			return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, ep, err)
 		}
 	} else {
 		if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil {
-			return fmt.Errorf("CreateNamedNIC(%v, %v) failed: %v", id, name, err)
+			return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, ep, err)
 		}
 	}
 
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index 27459e6d1..8001949d5 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -19,6 +19,7 @@ go_library(
         "//pkg/log",
         "//pkg/sentry/control",
         "//pkg/sentry/platform",
+        "//pkg/tcpip/header",
         "//pkg/tcpip/stack",
         "//pkg/urpc",
         "//runsc/boot",
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index d42de0176..be8b72b3e 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -28,6 +28,7 @@ import (
 	"github.com/vishvananda/netlink"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot"
@@ -183,36 +184,39 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 			continue
 		}
 
-		// Keep only IPv4 addresses.
-		var ip4addrs []*net.IPNet
+		var ipAddrs []*net.IPNet
 		for _, ifaddr := range allAddrs {
 			ipNet, ok := ifaddr.(*net.IPNet)
 			if !ok {
 				return fmt.Errorf("address is not IPNet: %+v", ifaddr)
 			}
-			if ipNet.IP.To4() == nil {
-				log.Warningf("IPv6 is not supported, skipping: %v", ipNet)
-				continue
-			}
-			ip4addrs = append(ip4addrs, ipNet)
+			ipAddrs = append(ipAddrs, ipNet)
 		}
-		if len(ip4addrs) == 0 {
-			log.Warningf("No IPv4 address found for interface %q, skipping", iface.Name)
+		if len(ipAddrs) == 0 {
+			log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name)
 			continue
 		}
 
 		// Scrape the routes before removing the address, since that
 		// will remove the routes as well.
-		routes, def, err := routesForIface(iface)
+		routes, defv4, defv6, err := routesForIface(iface)
 		if err != nil {
 			return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err)
 		}
-		if def != nil {
-			if !args.DefaultGateway.Route.Empty() {
-				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, def, args.DefaultGateway)
+		if defv4 != nil {
+			if !args.Defaultv4Gateway.Route.Empty() {
+				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway)
 			}
-			args.DefaultGateway.Route = *def
-			args.DefaultGateway.Name = iface.Name
+			args.Defaultv4Gateway.Route = *defv4
+			args.Defaultv4Gateway.Name = iface.Name
+		}
+
+		if defv6 != nil {
+			if !args.Defaultv6Gateway.Route.Empty() {
+				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway)
+			}
+			args.Defaultv6Gateway.Route = *defv6
+			args.Defaultv6Gateway.Name = iface.Name
 		}
 
 		link := boot.FDBasedLink{
@@ -247,6 +251,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 			}
 			args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
 		}
+
 		if link.GSOMaxSize == 0 && softwareGSO {
 			// Hardware GSO is disabled. Let's enable software GSO.
 			link.GSOMaxSize = stack.SoftwareGSOMaxSize
@@ -255,7 +260,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 
 		// Collect the addresses for the interface, enable forwarding,
 		// and remove them from the host.
-		for _, addr := range ip4addrs {
+		for _, addr := range ipAddrs {
 			link.Addresses = append(link.Addresses, addr.IP)
 
 			// Steal IP address from NIC.
@@ -351,46 +356,56 @@ func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink,
 }
 
 // routesForIface iterates over all routes for the given interface and converts
-// them to boot.Routes.
-func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
+// them to boot.Routes. It also returns the a default v4/v6 route if found.
+func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) {
 	link, err := netlink.LinkByIndex(iface.Index)
 	if err != nil {
-		return nil, nil, err
+		return nil, nil, nil, err
 	}
 	rs, err := netlink.RouteList(link, netlink.FAMILY_ALL)
 	if err != nil {
-		return nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
+		return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
 	}
 
-	var def *boot.Route
+	var defv4, defv6 *boot.Route
 	var routes []boot.Route
 	for _, r := range rs {
 		// Is it a default route?
 		if r.Dst == nil {
 			if r.Gw == nil {
-				return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
-			}
-			if r.Gw.To4() == nil {
-				log.Warningf("IPv6 is not supported, skipping default route: %v", r)
-				continue
-			}
-			if def != nil {
-				return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, def, r)
+				return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
 			}
 			// Create a catch all route to the gateway.
-			def = &boot.Route{
-				Destination: net.IPNet{
-					IP:   net.IPv4zero,
-					Mask: net.IPMask(net.IPv4zero),
-				},
-				Gateway: r.Gw,
+			switch len(r.Gw) {
+			case header.IPv4AddressSize:
+				if defv4 != nil {
+					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r)
+				}
+				defv4 = &boot.Route{
+					Destination: net.IPNet{
+						IP:   net.IPv4zero,
+						Mask: net.IPMask(net.IPv4zero),
+					},
+					Gateway: r.Gw,
+				}
+			case header.IPv6AddressSize:
+				if defv6 != nil {
+					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r)
+				}
+
+				defv6 = &boot.Route{
+					Destination: net.IPNet{
+						IP:   net.IPv6zero,
+						Mask: net.IPMask(net.IPv6zero),
+					},
+					Gateway: r.Gw,
+				}
+			default:
+				return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r)
 			}
 			continue
 		}
-		if r.Dst.IP.To4() == nil {
-			log.Warningf("IPv6 is not supported, skipping route: %v", r)
-			continue
-		}
+
 		dst := *r.Dst
 		dst.IP = dst.IP.Mask(dst.Mask)
 		routes = append(routes, boot.Route{
@@ -398,7 +413,7 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
 			Gateway:     r.Gw,
 		})
 	}
-	return routes, def, nil
+	return routes, defv4, defv6, nil
 }
 
 // removeAddress removes IP address from network device. It's equivalent to:
-- 
cgit v1.2.3


From 378d6c1f3697b8b939e6632e980562bfc8fb2781 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Thu, 12 Dec 2019 11:07:25 -0800
Subject: unix: allow to bind unix sockets only to AF_UNIX addresses

Reported-by: syzbot+2c0bcfd87fb4e8b7b009@syzkaller.appspotmail.com
PiperOrigin-RevId: 285228312
---
 pkg/sentry/socket/netstack/netstack.go |  2 +-
 pkg/sentry/socket/unix/unix.go         |  3 +++
 test/syscalls/linux/socket_unix.cc     | 15 +++++++++++++++
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 8a6522eac..140851c17 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -326,7 +326,7 @@ func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress,
 	}
 
 	family := usermem.ByteOrder.Uint16(addr)
-	if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) {
+	if family != uint16(sfamily) && (strict || family != linux.AF_UNSPEC) {
 		return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported
 	}
 
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 1aaae8487..885758054 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -118,6 +118,9 @@ func (s *SocketOperations) Endpoint() transport.Endpoint {
 func extractPath(sockaddr []byte) (string, *syserr.Error) {
 	addr, _, err := netstack.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */)
 	if err != nil {
+		if err == syserr.ErrAddressFamilyNotSupported {
+			err = syserr.ErrInvalidArgument
+		}
 		return "", err
 	}
 
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc
index 8a28202a8..4cf1f76f1 100644
--- a/test/syscalls/linux/socket_unix.cc
+++ b/test/syscalls/linux/socket_unix.cc
@@ -65,6 +65,21 @@ TEST_P(UnixSocketPairTest, BindToBadName) {
       SyscallFailsWithErrno(ENOENT));
 }
 
+TEST_P(UnixSocketPairTest, BindToBadFamily) {
+  auto pair =
+      ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+  constexpr char kBadName[] = "/some/path/that/does/not/exist";
+  sockaddr_un sockaddr;
+  sockaddr.sun_family = AF_INET;
+  memcpy(sockaddr.sun_path, kBadName, sizeof(kBadName));
+
+  EXPECT_THAT(
+      bind(pair->first_fd(), reinterpret_cast<struct sockaddr*>(&sockaddr),
+           sizeof(sockaddr)),
+      SyscallFailsWithErrno(EINVAL));
+}
+
 TEST_P(UnixSocketPairTest, RecvmmsgTimeoutAfterRecv) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
   char sent_data[10];
-- 
cgit v1.2.3


From 007707a0726602bc99fc0dccaf2994ad967b9d96 Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Thu, 12 Dec 2019 11:19:18 -0800
Subject: Implement kernfs.

PiperOrigin-RevId: 285231002
---
 pkg/abi/linux/file.go                          |  23 +
 pkg/abi/linux/fs.go                            |   7 +
 pkg/sentry/fsimpl/kernfs/BUILD                 |  60 +++
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 131 +++++
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go       | 207 ++++++++
 pkg/sentry/fsimpl/kernfs/filesystem.go         | 691 +++++++++++++++++++++++++
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go    | 492 ++++++++++++++++++
 pkg/sentry/fsimpl/kernfs/kernfs.go             | 405 +++++++++++++++
 pkg/sentry/fsimpl/kernfs/kernfs_test.go        | 423 +++++++++++++++
 9 files changed, 2439 insertions(+)
 create mode 100644 pkg/sentry/fsimpl/kernfs/BUILD
 create mode 100644 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
 create mode 100644 pkg/sentry/fsimpl/kernfs/fd_impl_util.go
 create mode 100644 pkg/sentry/fsimpl/kernfs/filesystem.go
 create mode 100644 pkg/sentry/fsimpl/kernfs/inode_impl_util.go
 create mode 100644 pkg/sentry/fsimpl/kernfs/kernfs.go
 create mode 100644 pkg/sentry/fsimpl/kernfs/kernfs_test.go

diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index 0f014d27f..16791d03e 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -286,6 +286,29 @@ func (m FileMode) String() string {
 	return strings.Join(s, "|")
 }
 
+// DirentType maps file types to dirent types appropriate for (struct
+// dirent)::d_type.
+func (m FileMode) DirentType() uint8 {
+	switch m.FileType() {
+	case ModeSocket:
+		return DT_SOCK
+	case ModeSymlink:
+		return DT_LNK
+	case ModeRegular:
+		return DT_REG
+	case ModeBlockDevice:
+		return DT_BLK
+	case ModeDirectory:
+		return DT_DIR
+	case ModeCharacterDevice:
+		return DT_CHR
+	case ModeNamedPipe:
+		return DT_FIFO
+	default:
+		return DT_UNKNOWN
+	}
+}
+
 var modeExtraBits = abi.FlagSet{
 	{
 		Flag: ModeSetUID,
diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go
index b416e3472..2c652baa2 100644
--- a/pkg/abi/linux/fs.go
+++ b/pkg/abi/linux/fs.go
@@ -92,3 +92,10 @@ const (
 	SYNC_FILE_RANGE_WRITE       = 2
 	SYNC_FILE_RANGE_WAIT_AFTER  = 4
 )
+
+// Flag argument to renameat2(2), from include/uapi/linux/fs.h.
+const (
+	RENAME_NOREPLACE = (1 << 0) // Don't overwrite target.
+	RENAME_EXCHANGE  = (1 << 1) // Exchange src and dst.
+	RENAME_WHITEOUT  = (1 << 2) // Whiteout src.
+)
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
new file mode 100644
index 000000000..52596c090
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -0,0 +1,60 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "slot_list",
+    out = "slot_list.go",
+    package = "kernfs",
+    prefix = "slot",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*slot",
+        "Linker": "*slot",
+    },
+)
+
+go_library(
+    name = "kernfs",
+    srcs = [
+        "dynamic_bytes_file.go",
+        "fd_impl_util.go",
+        "filesystem.go",
+        "inode_impl_util.go",
+        "kernfs.go",
+        "slot_list.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs",
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/sentry/context",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
+
+go_test(
+    name = "kernfs_test",
+    size = "small",
+    srcs = ["kernfs_test.go"],
+    deps = [
+        ":kernfs",
+        "//pkg/abi/linux",
+        "//pkg/sentry/context",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
new file mode 100644
index 000000000..30c06baf0
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -0,0 +1,131 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// DynamicBytesFile implements kernfs.Inode and represents a read-only
+// file whose contents are backed by a vfs.DynamicBytesSource.
+//
+// Must be initialized with Init before first use.
+type DynamicBytesFile struct {
+	InodeAttrs
+	InodeNoopRefCount
+	InodeNotDirectory
+	InodeNotSymlink
+
+	data vfs.DynamicBytesSource
+}
+
+// Init intializes a dynamic bytes file.
+func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource) {
+	f.InodeAttrs.Init(creds, ino, linux.ModeRegular|0444)
+	f.data = data
+}
+
+// Open implements Inode.Open.
+func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	fd := &DynamicBytesFD{}
+	fd.Init(rp.Mount(), vfsd, f.data, flags)
+	return &fd.vfsfd, nil
+}
+
+// SetStat implements Inode.SetStat.
+func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+	// DynamicBytesFiles are immutable.
+	return syserror.EPERM
+}
+
+// DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a
+// DynamicBytesFile.
+//
+// Must be initialized with Init before first use.
+type DynamicBytesFD struct {
+	vfs.FileDescriptionDefaultImpl
+	vfs.DynamicBytesFileDescriptionImpl
+
+	vfsfd vfs.FileDescription
+	inode Inode
+	flags uint32
+}
+
+// Init initializes a DynamicBytesFD.
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) {
+	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
+	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
+	fd.flags = flags
+	fd.inode = d.Impl().(*Dentry).inode
+	fd.SetDataSource(data)
+	fd.vfsfd.Init(fd, m, d)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence)
+}
+
+// Read implmenets vfs.FileDescriptionImpl.Read.
+func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts)
+}
+
+// PRead implmenets vfs.FileDescriptionImpl.PRead.
+func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *DynamicBytesFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return fd.FileDescriptionDefaultImpl.Write(ctx, src, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return fd.FileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *DynamicBytesFD) Release() {}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return fd.inode.Stat(fs), nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error {
+	// DynamicBytesFiles are immutable.
+	return syserror.EPERM
+}
+
+// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
+func (fd *DynamicBytesFD) StatusFlags(ctx context.Context) (uint32, error) {
+	return fd.flags, nil
+}
+
+// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
+func (fd *DynamicBytesFD) SetStatusFlags(ctx context.Context, flags uint32) error {
+	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
+	// no-op.
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
new file mode 100644
index 000000000..d6c18937a
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -0,0 +1,207 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory
+// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not
+// compatible with dynamic directories.
+//
+// Note that GenericDirectoryFD holds a lock over OrderedChildren while calling
+// IterDirents callback. The IterDirents callback therefore cannot hash or
+// unhash children, or recursively call IterDirents on the same underlying
+// inode.
+//
+// Must be initialize with Init before first use.
+type GenericDirectoryFD struct {
+	vfs.FileDescriptionDefaultImpl
+	vfs.DirectoryFileDescriptionDefaultImpl
+
+	vfsfd    vfs.FileDescription
+	children *OrderedChildren
+	flags    uint32
+	off      int64
+}
+
+// Init initializes a GenericDirectoryFD.
+func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) {
+	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
+	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
+	fd.children = children
+	fd.flags = flags
+	fd.vfsfd.Init(fd, m, d)
+}
+
+// VFSFileDescription returns a pointer to the vfs.FileDescription representing
+// this object.
+func (fd *GenericDirectoryFD) VFSFileDescription() *vfs.FileDescription {
+	return &fd.vfsfd
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *GenericDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return fd.FileDescriptionDefaultImpl.ConfigureMMap(ctx, opts)
+}
+
+// Read implmenets vfs.FileDescriptionImpl.Read.
+func (fd *GenericDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.Read(ctx, dst, opts)
+}
+
+// PRead implmenets vfs.FileDescriptionImpl.PRead.
+func (fd *GenericDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.PRead(ctx, dst, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *GenericDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.Write(ctx, src, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
+}
+
+// Release implements vfs.FileDecriptionImpl.Release.
+func (fd *GenericDirectoryFD) Release() {}
+
+func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem {
+	return fd.vfsfd.VirtualDentry().Mount().Filesystem()
+}
+
+func (fd *GenericDirectoryFD) inode() Inode {
+	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+}
+
+// IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds
+// o.mu when calling cb.
+func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	vfsFS := fd.filesystem()
+	fs := vfsFS.Impl().(*Filesystem)
+	vfsd := fd.vfsfd.VirtualDentry().Dentry()
+
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	// Handle ".".
+	if fd.off == 0 {
+		stat := fd.inode().Stat(vfsFS)
+		dirent := vfs.Dirent{
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     stat.Ino,
+			NextOff: 1,
+		}
+		if !cb.Handle(dirent) {
+			return nil
+		}
+		fd.off++
+	}
+
+	// Handle "..".
+	if fd.off == 1 {
+		parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode
+		stat := parentInode.Stat(vfsFS)
+		dirent := vfs.Dirent{
+			Name:    "..",
+			Type:    linux.FileMode(stat.Mode).DirentType(),
+			Ino:     stat.Ino,
+			NextOff: 2,
+		}
+		if !cb.Handle(dirent) {
+			return nil
+		}
+		fd.off++
+	}
+
+	// Handle static children.
+	fd.children.mu.RLock()
+	defer fd.children.mu.RUnlock()
+	// fd.off accounts for "." and "..", but fd.children do not track
+	// these.
+	childIdx := fd.off - 2
+	for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
+		inode := it.Dentry.Impl().(*Dentry).inode
+		stat := inode.Stat(vfsFS)
+		dirent := vfs.Dirent{
+			Name:    it.Name,
+			Type:    linux.FileMode(stat.Mode).DirentType(),
+			Ino:     stat.Ino,
+			NextOff: fd.off + 1,
+		}
+		if !cb.Handle(dirent) {
+			return nil
+		}
+		fd.off++
+	}
+
+	return nil
+}
+
+// Seek implements vfs.FileDecriptionImpl.Seek.
+func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fs := fd.filesystem().Impl().(*Filesystem)
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as given.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
+func (fd *GenericDirectoryFD) StatusFlags(ctx context.Context) (uint32, error) {
+	return fd.flags, nil
+}
+
+// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
+func (fd *GenericDirectoryFD) SetStatusFlags(ctx context.Context, flags uint32) error {
+	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
+	// no-op.
+	return nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := fd.filesystem()
+	inode := fd.inode()
+	return inode.Stat(fs), nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	fs := fd.filesystem()
+	inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+	return inode.SetStat(fs, opts)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
new file mode 100644
index 000000000..db486b6c1
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -0,0 +1,691 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file implements vfs.FilesystemImpl for kernfs.
+
+package kernfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// stepExistingLocked resolves rp.Component() in parent directory vfsd.
+//
+// stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+//
+// Postcondition: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) (*vfs.Dentry, error) {
+	d := vfsd.Impl().(*Dentry)
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	// Directory searchable?
+	if err := d.inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+afterSymlink:
+	d.dirMu.Lock()
+	nextVFSD, err := rp.ResolveComponent(vfsd)
+	d.dirMu.Unlock()
+	if err != nil {
+		return nil, err
+	}
+	if nextVFSD != nil {
+		// Cached dentry exists, revalidate.
+		next := nextVFSD.Impl().(*Dentry)
+		if !next.inode.Valid(ctx) {
+			d.dirMu.Lock()
+			rp.VirtualFilesystem().ForceDeleteDentry(nextVFSD)
+			d.dirMu.Unlock()
+			fs.deferDecRef(nextVFSD) // Reference from Lookup.
+			nextVFSD = nil
+		}
+	}
+	if nextVFSD == nil {
+		// Dentry isn't cached; it either doesn't exist or failed
+		// revalidation. Attempt to resolve it via Lookup.
+		name := rp.Component()
+		var err error
+		nextVFSD, err = d.inode.Lookup(ctx, name)
+		// Reference on nextVFSD dropped by a corresponding Valid.
+		if err != nil {
+			return nil, err
+		}
+		d.InsertChild(name, nextVFSD)
+	}
+	next := nextVFSD.Impl().(*Dentry)
+
+	// Resolve any symlink at current path component.
+	if rp.ShouldFollowSymlink() && d.isSymlink() {
+		// TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks".
+		target, err := next.inode.Readlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink
+
+	}
+	rp.Advance()
+	return nextVFSD, nil
+}
+
+// walkExistingLocked resolves rp to an existing file.
+//
+// walkExistingLocked is loosely analogous to Linux's
+// fs/namei.c:path_lookupat().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
+	vfsd := rp.Start()
+	for !rp.Done() {
+		var err error
+		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+		if err != nil {
+			return nil, nil, err
+		}
+	}
+	d := vfsd.Impl().(*Dentry)
+	if rp.MustBeDir() && !d.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return vfsd, d.inode, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory. It does not check that the returned directory is
+// searchable by the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
+	vfsd := rp.Start()
+	for !rp.Final() {
+		var err error
+		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+		if err != nil {
+			return nil, nil, err
+		}
+	}
+	d := vfsd.Impl().(*Dentry)
+	if !d.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return vfsd, d.inode, nil
+}
+
+// checkCreateLocked checks that a file named rp.Component() may be created in
+// directory parentVFSD, then returns rp.Component().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. parentInode
+// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true.
+func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
+	if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return "", err
+	}
+	pc := rp.Component()
+	if pc == "." || pc == ".." {
+		return "", syserror.EEXIST
+	}
+	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
+	if err != nil {
+		return "", err
+	}
+	if childVFSD != nil {
+		return "", syserror.EEXIST
+	}
+	if parentVFSD.IsDisowned() {
+		return "", syserror.ENOENT
+	}
+	return pc, nil
+}
+
+// checkDeleteLocked checks that the file represented by vfsd may be deleted.
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+func checkDeleteLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
+	parentVFSD := vfsd.Parent()
+	if parentVFSD == nil {
+		return syserror.EBUSY
+	}
+	if parentVFSD.IsDisowned() {
+		return syserror.ENOENT
+	}
+	if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	return nil
+}
+
+// checkRenameLocked checks that a rename operation may be performed on the
+// target dentry across the given set of parent directories. The target dentry
+// may be nil.
+//
+// Precondition: isDir(dstInode) == true.
+func checkRenameLocked(creds *auth.Credentials, src, dstDir *vfs.Dentry, dstInode Inode) error {
+	srcDir := src.Parent()
+	if srcDir == nil {
+		return syserror.EBUSY
+	}
+	if srcDir.IsDisowned() {
+		return syserror.ENOENT
+	}
+	if dstDir.IsDisowned() {
+		return syserror.ENOENT
+	}
+	// Check for creation permissions on dst dir.
+	if err := dstInode.CheckPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *Filesystem) Release() {
+}
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *Filesystem) Sync(ctx context.Context) error {
+	// All filesystem state is in-memory.
+	return nil
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.processDeferredDecRefs()
+	defer fs.mu.RUnlock()
+	vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+	if err != nil {
+		return nil, err
+	}
+
+	if opts.CheckSearchable {
+		d := vfsd.Impl().(*Dentry)
+		if !d.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+			return nil, err
+		}
+	}
+	vfsd.IncRef() // Ownership transferred to caller.
+	return vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if rp.Mount() != vd.Mount() {
+		return syserror.EXDEV
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+
+	d := vd.Dentry().Impl().(*Dentry)
+	if d.isDir() {
+		return syserror.EPERM
+	}
+
+	child, err := parentInode.NewLink(ctx, pc, d.inode)
+	if err != nil {
+		return err
+	}
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+	return nil
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	child, err := parentInode.NewDir(ctx, pc, opts)
+	if err != nil {
+		return err
+	}
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+	return nil
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	new, err := parentInode.NewNode(ctx, pc, opts)
+	if err != nil {
+		return err
+	}
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, new)
+	return nil
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// Filter out flags that are not supported by kernfs. O_DIRECTORY and
+	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
+	// appropriate bits in rp), but are returned by
+	// FileDescriptionImpl.StatusFlags().
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW
+	ats := vfs.AccessTypesForOpenFlags(opts.Flags)
+
+	// Do not create new file.
+	if opts.Flags&linux.O_CREAT == 0 {
+		fs.mu.RLock()
+		defer fs.processDeferredDecRefs()
+		defer fs.mu.RUnlock()
+		vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+		if err != nil {
+			return nil, err
+		}
+		if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil {
+			return nil, err
+		}
+		return inode.Open(rp, vfsd, opts.Flags)
+	}
+
+	// May create new file.
+	mustCreate := opts.Flags&linux.O_EXCL != 0
+	vfsd := rp.Start()
+	inode := vfsd.Impl().(*Dentry).inode
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	if rp.Done() {
+		if rp.MustBeDir() {
+			return nil, syserror.EISDIR
+		}
+		if mustCreate {
+			return nil, syserror.EEXIST
+		}
+		if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil {
+			return nil, err
+		}
+		return inode.Open(rp, vfsd, opts.Flags)
+	}
+afterTrailingSymlink:
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+	// Reject attempts to open directories with O_CREAT.
+	if rp.MustBeDir() {
+		return nil, syserror.EISDIR
+	}
+	pc := rp.Component()
+	if pc == "." || pc == ".." {
+		return nil, syserror.EISDIR
+	}
+	// Determine whether or not we need to create a file.
+	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
+	if err != nil {
+		return nil, err
+	}
+	if childVFSD == nil {
+		// Already checked for searchability above; now check for writability.
+		if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+			return nil, err
+		}
+		if err := rp.Mount().CheckBeginWrite(); err != nil {
+			return nil, err
+		}
+		defer rp.Mount().EndWrite()
+		// Create and open the child.
+		child, err := parentInode.NewFile(ctx, pc, opts)
+		if err != nil {
+			return nil, err
+		}
+		parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+		return child.Impl().(*Dentry).inode.Open(rp, child, opts.Flags)
+	}
+	// Open existing file or follow symlink.
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	childDentry := childVFSD.Impl().(*Dentry)
+	childInode := childDentry.inode
+	if rp.ShouldFollowSymlink() {
+		if childDentry.isSymlink() {
+			target, err := childInode.Readlink(ctx)
+			if err != nil {
+				return nil, err
+			}
+			if err := rp.HandleSymlink(target); err != nil {
+				return nil, err
+			}
+			// rp.Final() may no longer be true since we now need to resolve the
+			// symlink target.
+			goto afterTrailingSymlink
+		}
+	}
+	if err := childInode.CheckPermissions(rp.Credentials(), ats); err != nil {
+		return nil, err
+	}
+	return childInode.Open(rp, childVFSD, opts.Flags)
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	fs.mu.RLock()
+	d, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return "", err
+	}
+	if !d.Impl().(*Dentry).isSymlink() {
+		return "", syserror.EINVAL
+	}
+	return inode.Readlink(ctx)
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
+	exchange := opts.Flags&linux.RENAME_EXCHANGE != 0
+	whiteout := opts.Flags&linux.RENAME_WHITEOUT != 0
+	if exchange && (noReplace || whiteout) {
+		// Can't specify RENAME_NOREPLACE or RENAME_WHITEOUT with RENAME_EXCHANGE.
+		return syserror.EINVAL
+	}
+	if exchange || whiteout {
+		// Exchange and Whiteout flags are not supported on kernfs.
+		return syserror.EINVAL
+	}
+
+	fs.mu.Lock()
+	defer fs.mu.Lock()
+
+	mnt := rp.Mount()
+	if mnt != vd.Mount() {
+		return syserror.EXDEV
+	}
+
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+
+	srcVFSD := vd.Dentry()
+	srcDirVFSD := srcVFSD.Parent()
+
+	// Can we remove the src dentry?
+	if err := checkDeleteLocked(rp, srcVFSD); err != nil {
+		return err
+	}
+
+	// Can we create the dst dentry?
+	var dstVFSD *vfs.Dentry
+	pc, err := checkCreateLocked(rp, dstDirVFSD, dstDirInode)
+	switch err {
+	case nil:
+		// Ok, continue with rename as replacement.
+	case syserror.EEXIST:
+		if noReplace {
+			// Won't overwrite existing node since RENAME_NOREPLACE was requested.
+			return syserror.EEXIST
+		}
+		dstVFSD, err = rp.ResolveChild(dstDirVFSD, pc)
+		if err != nil {
+			panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD))
+		}
+	default:
+		return err
+	}
+
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	virtfs := rp.VirtualFilesystem()
+
+	srcDirDentry := srcDirVFSD.Impl().(*Dentry)
+	dstDirDentry := dstDirVFSD.Impl().(*Dentry)
+
+	// We can't deadlock here due to lock ordering because we're protected from
+	// concurrent renames by fs.mu held for writing.
+	srcDirDentry.dirMu.Lock()
+	defer srcDirDentry.dirMu.Unlock()
+	dstDirDentry.dirMu.Lock()
+	defer dstDirDentry.dirMu.Unlock()
+
+	if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
+		return err
+	}
+	srcDirInode := srcDirDentry.inode
+	replaced, err := srcDirInode.Rename(ctx, srcVFSD.Name(), pc, srcVFSD, dstDirVFSD)
+	if err != nil {
+		virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
+		return err
+	}
+	virtfs.CommitRenameReplaceDentry(srcVFSD, dstDirVFSD, pc, replaced)
+	return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	if err := checkDeleteLocked(rp, vfsd); err != nil {
+		return err
+	}
+	if !vfsd.Impl().(*Dentry).isDir() {
+		return syserror.ENOTDIR
+	}
+	if inode.HasChildren() {
+		return syserror.ENOTEMPTY
+	}
+	virtfs := rp.VirtualFilesystem()
+	parentDentry := vfsd.Parent().Impl().(*Dentry)
+	parentDentry.dirMu.Lock()
+	defer parentDentry.dirMu.Unlock()
+	if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+		return err
+	}
+	if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
+		virtfs.AbortDeleteDentry(vfsd)
+		return err
+	}
+	virtfs.CommitDeleteDentry(vfsd)
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	fs.mu.RLock()
+	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return err
+	}
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	return inode.SetStat(fs.VFSFilesystem(), opts)
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	fs.mu.RLock()
+	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	return inode.Stat(fs.VFSFilesystem()), nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	// TODO: actually implement statfs
+	return linux.Statfs{}, syserror.ENOSYS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	child, err := parentInode.NewSymlink(ctx, pc, target)
+	if err != nil {
+		return err
+	}
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+	return nil
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	vfsd, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	if err := checkDeleteLocked(rp, vfsd); err != nil {
+		return err
+	}
+	if vfsd.Impl().(*Dentry).isDir() {
+		return syserror.EISDIR
+	}
+	virtfs := rp.VirtualFilesystem()
+	parentDentry := vfsd.Parent().Impl().(*Dentry)
+	parentDentry.dirMu.Lock()
+	defer parentDentry.dirMu.Unlock()
+	if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+		return err
+	}
+	if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
+		virtfs.AbortDeleteDentry(vfsd)
+		return err
+	}
+	virtfs.CommitDeleteDentry(vfsd)
+	return nil
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
new file mode 100644
index 000000000..7b45b702a
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -0,0 +1,492 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"fmt"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// InodeNoopRefCount partially implements the Inode interface, specifically the
+// inodeRefs sub interface. InodeNoopRefCount implements a simple reference
+// count for inodes, performing no extra actions when references are obtained or
+// released. This is suitable for simple file inodes that don't reference any
+// resources.
+type InodeNoopRefCount struct {
+}
+
+// IncRef implements Inode.IncRef.
+func (n *InodeNoopRefCount) IncRef() {
+}
+
+// DecRef implements Inode.DecRef.
+func (n *InodeNoopRefCount) DecRef() {
+}
+
+// TryIncRef implements Inode.TryIncRef.
+func (n *InodeNoopRefCount) TryIncRef() bool {
+	return true
+}
+
+// Destroy implements Inode.Destroy.
+func (n *InodeNoopRefCount) Destroy() {
+}
+
+// InodeDirectoryNoNewChildren partially implements the Inode interface.
+// InodeDirectoryNoNewChildren represents a directory inode which does not
+// support creation of new children.
+type InodeDirectoryNoNewChildren struct{}
+
+// NewFile implements Inode.NewFile.
+func (*InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewDir implements Inode.NewDir.
+func (*InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewLink implements Inode.NewLink.
+func (*InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (*InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewNode implements Inode.NewNode.
+func (*InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// InodeNotDirectory partially implements the Inode interface, specifically the
+// inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not
+// represent directories can embed this to provide no-op implementations for
+// directory-related functions.
+type InodeNotDirectory struct {
+}
+
+// HasChildren implements Inode.HasChildren.
+func (*InodeNotDirectory) HasChildren() bool {
+	return false
+}
+
+// NewFile implements Inode.NewFile.
+func (*InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+	panic("NewFile called on non-directory inode")
+}
+
+// NewDir implements Inode.NewDir.
+func (*InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+	panic("NewDir called on non-directory inode")
+}
+
+// NewLink implements Inode.NewLinkink.
+func (*InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+	panic("NewLink called on non-directory inode")
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (*InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+	panic("NewSymlink called on non-directory inode")
+}
+
+// NewNode implements Inode.NewNode.
+func (*InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+	panic("NewNode called on non-directory inode")
+}
+
+// Unlink implements Inode.Unlink.
+func (*InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
+	panic("Unlink called on non-directory inode")
+}
+
+// RmDir implements Inode.RmDir.
+func (*InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
+	panic("RmDir called on non-directory inode")
+}
+
+// Rename implements Inode.Rename.
+func (*InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
+	panic("Rename called on non-directory inode")
+}
+
+// Lookup implements Inode.Lookup.
+func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	panic("Lookup called on non-directory inode")
+}
+
+// Valid implements Inode.Valid.
+func (*InodeNotDirectory) Valid(context.Context) bool {
+	return true
+}
+
+// InodeNoDynamicLookup partially implements the Inode interface, specifically
+// the inodeDynamicLookup sub interface. Directory inodes that do not support
+// dymanic entries (i.e. entries that are not "hashed" into the
+// vfs.Dentry.children) can embed this to provide no-op implementations for
+// functions related to dynamic entries.
+type InodeNoDynamicLookup struct{}
+
+// Lookup implements Inode.Lookup.
+func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	return nil, syserror.ENOENT
+}
+
+// Valid implements Inode.Valid.
+func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool {
+	return true
+}
+
+// InodeNotSymlink partially implements the Inode interface, specifically the
+// inodeSymlink sub interface. All inodes that are not symlinks may embed this
+// to return the appropriate errors from symlink-related functions.
+type InodeNotSymlink struct{}
+
+// Readlink implements Inode.Readlink.
+func (*InodeNotSymlink) Readlink(context.Context) (string, error) {
+	return "", syserror.EINVAL
+}
+
+// InodeAttrs partially implements the Inode interface, specifically the
+// inodeMetadata sub interface. InodeAttrs provides functionality related to
+// inode attributes.
+//
+// Must be initialized by Init prior to first use.
+type InodeAttrs struct {
+	ino   uint64
+	mode  uint32
+	uid   uint32
+	gid   uint32
+	nlink uint32
+}
+
+// Init initializes this InodeAttrs.
+func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMode) {
+	if mode.FileType() == 0 {
+		panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode))
+	}
+
+	nlink := uint32(1)
+	if mode.FileType() == linux.ModeDirectory {
+		nlink = 2
+	}
+	atomic.StoreUint64(&a.ino, ino)
+	atomic.StoreUint32(&a.mode, uint32(mode))
+	atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID))
+	atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID))
+	atomic.StoreUint32(&a.nlink, nlink)
+}
+
+// Mode implements Inode.Mode.
+func (a *InodeAttrs) Mode() linux.FileMode {
+	return linux.FileMode(atomic.LoadUint32(&a.mode))
+}
+
+// Stat partially implements Inode.Stat. Note that this function doesn't provide
+// all the stat fields, and the embedder should consider extending the result
+// with filesystem-specific fields.
+func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx {
+	var stat linux.Statx
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
+	stat.Ino = atomic.LoadUint64(&a.ino)
+	stat.Mode = uint16(a.Mode())
+	stat.UID = atomic.LoadUint32(&a.uid)
+	stat.GID = atomic.LoadUint32(&a.gid)
+	stat.Nlink = atomic.LoadUint32(&a.nlink)
+
+	// TODO: Implement other stat fields like timestamps.
+
+	return stat
+}
+
+// SetStat implements Inode.SetStat.
+func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
+	stat := opts.Stat
+	if stat.Mask&linux.STATX_MODE != 0 {
+		for {
+			old := atomic.LoadUint32(&a.mode)
+			new := old | uint32(stat.Mode & ^uint16(linux.S_IFMT))
+			if swapped := atomic.CompareAndSwapUint32(&a.mode, old, new); swapped {
+				break
+			}
+		}
+	}
+
+	if stat.Mask&linux.STATX_UID != 0 {
+		atomic.StoreUint32(&a.uid, stat.UID)
+	}
+	if stat.Mask&linux.STATX_GID != 0 {
+		atomic.StoreUint32(&a.gid, stat.GID)
+	}
+
+	// Note that not all fields are modifiable. For example, the file type and
+	// inode numbers are immutable after node creation.
+
+	// TODO: Implement other stat fields like timestamps.
+
+	return nil
+}
+
+// CheckPermissions implements Inode.CheckPermissions.
+func (a *InodeAttrs) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	mode := a.Mode()
+	return vfs.GenericCheckPermissions(
+		creds,
+		ats,
+		mode.FileType() == linux.ModeDirectory,
+		uint16(mode),
+		auth.KUID(atomic.LoadUint32(&a.uid)),
+		auth.KGID(atomic.LoadUint32(&a.gid)),
+	)
+}
+
+// IncLinks implements Inode.IncLinks.
+func (a *InodeAttrs) IncLinks(n uint32) {
+	if atomic.AddUint32(&a.nlink, n) <= n {
+		panic("InodeLink.IncLinks called with no existing links")
+	}
+}
+
+// DecLinks implements Inode.DecLinks.
+func (a *InodeAttrs) DecLinks() {
+	if nlink := atomic.AddUint32(&a.nlink, ^uint32(0)); nlink == ^uint32(0) {
+		// Negative overflow
+		panic("Inode.DecLinks called at 0 links")
+	}
+}
+
+type slot struct {
+	Name   string
+	Dentry *vfs.Dentry
+	slotEntry
+}
+
+// OrderedChildrenOptions contains initialization options for OrderedChildren.
+type OrderedChildrenOptions struct {
+	// Writable indicates whether vfs.FilesystemImpl methods implemented by
+	// OrderedChildren may modify the tracked children. This applies to
+	// operations related to rename, unlink and rmdir. If an OrderedChildren is
+	// not writable, these operations all fail with EPERM.
+	Writable bool
+}
+
+// OrderedChildren partially implements the Inode interface. OrderedChildren can
+// be embedded in directory inodes to keep track of the children in the
+// directory, and can then be used to implement a generic directory FD -- see
+// GenericDirectoryFD. OrderedChildren is not compatible with dynamic
+// directories.
+//
+// Must be initialize with Init before first use.
+type OrderedChildren struct {
+	refs.AtomicRefCount
+
+	// Can children be modified by user syscalls? It set to false, interface
+	// methods that would modify the children return EPERM. Immutable.
+	writable bool
+
+	mu    sync.RWMutex
+	order slotList
+	set   map[string]*slot
+}
+
+// Init initializes an OrderedChildren.
+func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
+	o.writable = opts.Writable
+	o.set = make(map[string]*slot)
+}
+
+// DecRef implements Inode.DecRef.
+func (o *OrderedChildren) DecRef() {
+	o.AtomicRefCount.DecRefWithDestructor(o.Destroy)
+}
+
+// Destroy cleans up resources referenced by this OrderedChildren.
+func (o *OrderedChildren) Destroy() {
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	o.order.Reset()
+	o.set = nil
+}
+
+// Populate inserts children into this OrderedChildren, and d's dentry
+// cache. Populate returns the number of directories inserted, which the caller
+// may use to update the link count for the parent directory.
+//
+// Precondition: d.Impl() must be a kernfs Dentry. d must represent a directory
+// inode. children must not contain any conflicting entries already in o.
+func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 {
+	var links uint32
+	for name, child := range children {
+		if child.isDir() {
+			links++
+		}
+		if err := o.Insert(name, child.VFSDentry()); err != nil {
+			panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d))
+		}
+		d.InsertChild(name, child.VFSDentry())
+	}
+	return links
+}
+
+// HasChildren implements Inode.HasChildren.
+func (o *OrderedChildren) HasChildren() bool {
+	o.mu.RLock()
+	defer o.mu.RUnlock()
+	return len(o.set) > 0
+}
+
+// Insert inserts child into o. This ignores the writability of o, as this is
+// not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
+func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error {
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	if _, ok := o.set[name]; ok {
+		return syserror.EEXIST
+	}
+	s := &slot{
+		Name:   name,
+		Dentry: child,
+	}
+	o.order.PushBack(s)
+	o.set[name] = s
+	return nil
+}
+
+// Precondition: caller must hold o.mu for writing.
+func (o *OrderedChildren) removeLocked(name string) {
+	if s, ok := o.set[name]; ok {
+		delete(o.set, name)
+		o.order.Remove(s)
+	}
+}
+
+// Precondition: caller must hold o.mu for writing.
+func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry {
+	if s, ok := o.set[name]; ok {
+		// Existing slot with given name, simply replace the dentry.
+		var old *vfs.Dentry
+		old, s.Dentry = s.Dentry, new
+		return old
+	}
+
+	// No existing slot with given name, create and hash new slot.
+	s := &slot{
+		Name:   name,
+		Dentry: new,
+	}
+	o.order.PushBack(s)
+	o.set[name] = s
+	return nil
+}
+
+// Precondition: caller must hold o.mu for reading or writing.
+func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error {
+	s, ok := o.set[name]
+	if !ok {
+		return syserror.ENOENT
+	}
+	if s.Dentry != child {
+		panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child))
+	}
+	return nil
+}
+
+// Unlink implements Inode.Unlink.
+func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
+	if !o.writable {
+		return syserror.EPERM
+	}
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	if err := o.checkExistingLocked(name, child); err != nil {
+		return err
+	}
+	o.removeLocked(name)
+	return nil
+}
+
+// Rmdir implements Inode.Rmdir.
+func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
+	// We're not responsible for checking that child is a directory, that it's
+	// empty, or updating any link counts; so this is the same as unlink.
+	return o.Unlink(ctx, name, child)
+}
+
+type renameAcrossDifferentImplementationsError struct{}
+
+func (renameAcrossDifferentImplementationsError) Error() string {
+	return "rename across inodes with different implementations"
+}
+
+// Rename implements Inode.Rename.
+//
+// Precondition: Rename may only be called across two directory inodes with
+// identical implementations of Rename. Practically, this means filesystems that
+// implement Rename by embedding OrderedChildren for any directory
+// implementation must use OrderedChildren for all directory implementations
+// that will support Rename.
+//
+// Postcondition: reference on any replaced dentry transferred to caller.
+func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) {
+	dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren)
+	if !ok {
+		return nil, renameAcrossDifferentImplementationsError{}
+	}
+	if !o.writable || !dst.writable {
+		return nil, syserror.EPERM
+	}
+	// Note: There's a potential deadlock below if concurrent calls to Rename
+	// refer to the same src and dst directories in reverse. We avoid any
+	// ordering issues because the caller is required to serialize concurrent
+	// calls to Rename in accordance with the interface declaration.
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	if dst != o {
+		dst.mu.Lock()
+		defer dst.mu.Unlock()
+	}
+	if err := o.checkExistingLocked(oldname, child); err != nil {
+		return nil, err
+	}
+	replaced := dst.replaceChildLocked(newname, child)
+	return replaced, nil
+}
+
+// nthLocked returns an iterator to the nth child tracked by this object. The
+// iterator is valid until the caller releases o.mu. Returns nil if the
+// requested index falls out of bounds.
+//
+// Preconditon: Caller must hold o.mu for reading.
+func (o *OrderedChildren) nthLocked(i int64) *slot {
+	for it := o.order.Front(); it != nil && i >= 0; it = it.Next() {
+		if i == 0 {
+			return it
+		}
+		i--
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
new file mode 100644
index 000000000..bb01c3d01
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -0,0 +1,405 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kernfs provides the tools to implement inode-based filesystems.
+// Kernfs has two main features:
+//
+// 1. The Inode interface, which maps VFS2's path-based filesystem operations to
+//    specific filesystem nodes. Kernfs uses the Inode interface to provide a
+//    blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as
+//    the synchronization mechanism for all filesystem operations by holding a
+//    filesystem-wide lock across all operations.
+//
+// 2. Various utility types which provide generic implementations for various
+//    parts of the Inode and vfs.FileDescription interfaces. Client filesystems
+//    based on kernfs can embed the appropriate set of these to avoid having to
+//    reimplement common filesystem operations. See inode_impl_util.go and
+//    fd_impl_util.go.
+//
+// Reference Model:
+//
+// Kernfs dentries represents named pointers to inodes. Dentries and inode have
+// independent lifetimes and reference counts. A child dentry unconditionally
+// holds a reference on its parent directory's dentry. A dentry also holds a
+// reference on the inode it points to. Multiple dentries can point to the same
+// inode (for example, in the case of hardlinks). File descriptors hold a
+// reference to the dentry they're opened on.
+//
+// Dentries are guaranteed to exist while holding Filesystem.mu for
+// reading. Dropping dentries require holding Filesystem.mu for writing. To
+// queue dentries for destruction from a read critical section, see
+// Filesystem.deferDecRef.
+//
+// Lock ordering:
+//
+// kernfs.Filesystem.mu
+//   kernfs.Dentry.dirMu
+//     vfs.VirtualFilesystem.mountMu
+//       vfs.Dentry.mu
+//   kernfs.Filesystem.droppedDentriesMu
+//   (inode implementation locks, if any)
+package kernfs
+
+import (
+	"fmt"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
+// filesystem. Concrete implementations are expected to embed this in their own
+// Filesystem type.
+type Filesystem struct {
+	vfsfs vfs.Filesystem
+
+	droppedDentriesMu sync.Mutex
+
+	// droppedDentries is a list of dentries waiting to be DecRef()ed. This is
+	// used to defer dentry destruction until mu can be acquired for
+	// writing. Protected by droppedDentriesMu.
+	droppedDentries []*vfs.Dentry
+
+	// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
+	// for reading guarantees continued existence of any resolved dentries, but
+	// the dentry tree may be modified.
+	//
+	// Kernfs dentries can only be DecRef()ed while holding mu for writing. For
+	// example:
+	//
+	//   fs.mu.Lock()
+	//   defer fs.mu.Unlock()
+	//   ...
+	//   dentry1.DecRef()
+	//   defer dentry2.DecRef() // Ok, will run before Unlock.
+	//
+	// If discarding dentries in a read context, use Filesystem.deferDecRef. For
+	// example:
+	//
+	//   fs.mu.RLock()
+	//   fs.mu.processDeferredDecRefs()
+	//   defer fs.mu.RUnlock()
+	//   ...
+	//   fs.deferDecRef(dentry)
+	mu sync.RWMutex
+
+	// nextInoMinusOne is used to to allocate inode numbers on this
+	// filesystem. Must be accessed by atomic operations.
+	nextInoMinusOne uint64
+}
+
+// deferDecRef defers dropping a dentry ref until the next call to
+// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
+//
+// Precondition: d must not already be pending destruction.
+func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
+	fs.droppedDentriesMu.Lock()
+	fs.droppedDentries = append(fs.droppedDentries, d)
+	fs.droppedDentriesMu.Unlock()
+}
+
+// processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
+// droppedDentries list. See comment on Filesystem.mu.
+func (fs *Filesystem) processDeferredDecRefs() {
+	fs.mu.Lock()
+	fs.processDeferredDecRefsLocked()
+	fs.mu.Unlock()
+}
+
+// Precondition: fs.mu must be held for writing.
+func (fs *Filesystem) processDeferredDecRefsLocked() {
+	fs.droppedDentriesMu.Lock()
+	for _, d := range fs.droppedDentries {
+		d.DecRef()
+	}
+	fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse.
+	fs.droppedDentriesMu.Unlock()
+}
+
+// Init initializes a kernfs filesystem. This should be called from during
+// vfs.FilesystemType.NewFilesystem for the concrete filesystem embedding
+// kernfs.
+func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem) {
+	fs.vfsfs.Init(vfsObj, fs)
+}
+
+// VFSFilesystem returns the generic vfs filesystem object.
+func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem {
+	return &fs.vfsfs
+}
+
+// NextIno allocates a new inode number on this filesystem.
+func (fs *Filesystem) NextIno() uint64 {
+	return atomic.AddUint64(&fs.nextInoMinusOne, 1)
+}
+
+// These consts are used in the Dentry.flags field.
+const (
+	// Dentry points to a directory inode.
+	dflagsIsDir = 1 << iota
+
+	// Dentry points to a symlink inode.
+	dflagsIsSymlink
+)
+
+// Dentry implements vfs.DentryImpl.
+//
+// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a
+// named reference to an inode. A dentry generally lives as long as it's part of
+// a mounted filesystem tree. Kernfs doesn't cache dentries once all references
+// to them are removed. Dentries hold a single reference to the inode they point
+// to, and child dentries hold a reference on their parent.
+//
+// Must be initialized by Init prior to first use.
+type Dentry struct {
+	refs.AtomicRefCount
+
+	vfsd  vfs.Dentry
+	inode Inode
+
+	refs uint64
+
+	// flags caches useful information about the dentry from the inode. See the
+	// dflags* consts above. Must be accessed by atomic ops.
+	flags uint32
+
+	// dirMu protects vfsd.children for directory dentries.
+	dirMu sync.Mutex
+}
+
+// Init initializes this dentry.
+//
+// Precondition: Caller must hold a reference on inode.
+//
+// Postcondition: Caller's reference on inode is transferred to the dentry.
+func (d *Dentry) Init(inode Inode) {
+	d.vfsd.Init(d)
+	d.inode = inode
+	ftype := inode.Mode().FileType()
+	if ftype == linux.ModeDirectory {
+		d.flags |= dflagsIsDir
+	}
+	if ftype == linux.ModeSymlink {
+		d.flags |= dflagsIsSymlink
+	}
+}
+
+// VFSDentry returns the generic vfs dentry for this kernfs dentry.
+func (d *Dentry) VFSDentry() *vfs.Dentry {
+	return &d.vfsd
+}
+
+// isDir checks whether the dentry points to a directory inode.
+func (d *Dentry) isDir() bool {
+	return atomic.LoadUint32(&d.flags)&dflagsIsDir != 0
+}
+
+// isSymlink checks whether the dentry points to a symlink inode.
+func (d *Dentry) isSymlink() bool {
+	return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *Dentry) DecRef() {
+	d.AtomicRefCount.DecRefWithDestructor(d.destroy)
+}
+
+// Precondition: Dentry must be removed from VFS' dentry cache.
+func (d *Dentry) destroy() {
+	d.inode.DecRef() // IncRef from Init.
+	d.inode = nil
+	if parent := d.vfsd.Parent(); parent != nil {
+		parent.DecRef() // IncRef from Dentry.InsertChild.
+	}
+}
+
+// InsertChild inserts child into the vfs dentry cache with the given name under
+// this dentry. This does not update the directory inode, so calling this on
+// it's own isn't sufficient to insert a child into a directory. InsertChild
+// updates the link count on d if required.
+//
+// Precondition: d must represent a directory inode.
+func (d *Dentry) InsertChild(name string, child *vfs.Dentry) {
+	if !d.isDir() {
+		panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
+	}
+	vfsDentry := d.VFSDentry()
+	vfsDentry.IncRef() // DecRef in child's Dentry.destroy.
+	d.dirMu.Lock()
+	vfsDentry.InsertChild(child, name)
+	d.dirMu.Unlock()
+}
+
+// The Inode interface maps filesystem-level operations that operate on paths to
+// equivalent operations on specific filesystem nodes.
+//
+// The interface methods are groups into logical categories as sub interfaces
+// below. Generally, an implementation for each sub interface can be provided by
+// embedding an appropriate type from inode_impl_utils.go. The sub interfaces
+// are purely organizational. Methods declared directly in the main interface
+// have no generic implementations, and should be explicitly provided by the
+// client filesystem.
+//
+// Generally, implementations are not responsible for tasks that are common to
+// all filesystems. These include:
+//
+// - Checking that dentries passed to methods are of the appropriate file type.
+// - Checking permissions.
+// - Updating link and reference counts.
+//
+// Specific responsibilities of implementations are documented below.
+type Inode interface {
+	// Methods related to reference counting. A generic implementation is
+	// provided by InodeNoopRefCount. These methods are generally called by the
+	// equivalent Dentry methods.
+	inodeRefs
+
+	// Methods related to node metadata. A generic implementation is provided by
+	// InodeAttrs.
+	inodeMetadata
+
+	// Method for inodes that represent symlink. InodeNotSymlink provides a
+	// blanket implementation for all non-symlink inodes.
+	inodeSymlink
+
+	// Method for inodes that represent directories. InodeNotDirectory provides
+	// a blanket implementation for all non-directory inodes.
+	inodeDirectory
+
+	// Method for inodes that represent dynamic directories and their
+	// children. InodeNoDynamicLookup provides a blanket implementation for all
+	// non-dynamic-directory inodes.
+	inodeDynamicLookup
+
+	// Open creates a file description for the filesystem object represented by
+	// this inode. The returned file description should hold a reference on the
+	// inode for its lifetime.
+	//
+	// Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry.
+	Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error)
+}
+
+type inodeRefs interface {
+	IncRef()
+	DecRef()
+	TryIncRef() bool
+	// Destroy is called when the inode reaches zero references. Destroy release
+	// all resources (references) on objects referenced by the inode, including
+	// any child dentries.
+	Destroy()
+}
+
+type inodeMetadata interface {
+	// CheckPermissions checks that creds may access this inode for the
+	// requested access type, per the the rules of
+	// fs/namei.c:generic_permission().
+	CheckPermissions(creds *auth.Credentials, atx vfs.AccessTypes) error
+
+	// Mode returns the (struct stat)::st_mode value for this inode. This is
+	// separated from Stat for performance.
+	Mode() linux.FileMode
+
+	// Stat returns the metadata for this inode. This corresponds to
+	// vfs.FilesystemImpl.StatAt.
+	Stat(fs *vfs.Filesystem) linux.Statx
+
+	// SetStat updates the metadata for this inode. This corresponds to
+	// vfs.FilesystemImpl.SetStatAt.
+	SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error
+}
+
+// Precondition: All methods in this interface may only be called on directory
+// inodes.
+type inodeDirectory interface {
+	// The New{File,Dir,Node,Symlink} methods below should return a new inode
+	// hashed into this inode.
+	//
+	// These inode constructors are inode-level operations rather than
+	// filesystem-level operations to allow client filesystems to mix different
+	// implementations based on the new node's location in the
+	// filesystem.
+
+	// HasChildren returns true if the directory inode has any children.
+	HasChildren() bool
+
+	// NewFile creates a new regular file inode.
+	NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error)
+
+	// NewDir creates a new directory inode.
+	NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error)
+
+	// NewLink creates a new hardlink to a specified inode in this
+	// directory. Implementations should create a new kernfs Dentry pointing to
+	// target, and update target's link count.
+	NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error)
+
+	// NewSymlink creates a new symbolic link inode.
+	NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error)
+
+	// NewNode creates a new filesystem node for a mknod syscall.
+	NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error)
+
+	// Unlink removes a child dentry from this directory inode.
+	Unlink(ctx context.Context, name string, child *vfs.Dentry) error
+
+	// RmDir removes an empty child directory from this directory
+	// inode. Implementations must update the parent directory's link count,
+	// if required. Implementations are not responsible for checking that child
+	// is a directory, checking for an empty directory.
+	RmDir(ctx context.Context, name string, child *vfs.Dentry) error
+
+	// Rename is called on the source directory containing an inode being
+	// renamed. child should point to the resolved child in the source
+	// directory. If Rename replaces a dentry in the destination directory, it
+	// should return the replaced dentry or nil otherwise.
+	//
+	// Precondition: Caller must serialize concurrent calls to Rename.
+	Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error)
+}
+
+type inodeDynamicLookup interface {
+	// Lookup should return an appropriate dentry if name should resolve to a
+	// child of this dynamic directory inode. This gives the directory an
+	// opportunity on every lookup to resolve additional entries that aren't
+	// hashed into the directory. This is only called when the inode is a
+	// directory. If the inode is not a directory, or if the directory only
+	// contains a static set of children, the implementer can unconditionally
+	// return an appropriate error (ENOTDIR and ENOENT respectively).
+	//
+	// The child returned by Lookup will be hashed into the VFS dentry tree. Its
+	// lifetime can be controlled by the filesystem implementation with an
+	// appropriate implementation of Valid.
+	//
+	// Lookup returns the child with an extra reference and the caller owns this
+	// reference.
+	Lookup(ctx context.Context, name string) (*vfs.Dentry, error)
+
+	// Valid should return true if this inode is still valid, or needs to
+	// be resolved again by a call to Lookup.
+	Valid(ctx context.Context) bool
+}
+
+type inodeSymlink interface {
+	// Readlink resolves the target of a symbolic link. If an inode is not a
+	// symlink, the implementation should return EINVAL.
+	Readlink(ctx context.Context) (string, error)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
new file mode 100644
index 000000000..f78bb7b04
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -0,0 +1,423 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs_test
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"runtime"
+	"sync"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const defaultMode linux.FileMode = 01777
+const staticFileContent = "This is sample content for a static test file."
+
+// RootDentryFn is a generator function for creating the root dentry of a test
+// filesystem. See newTestSystem.
+type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
+
+// TestSystem represents the context for a single test.
+type TestSystem struct {
+	t     *testing.T
+	ctx   context.Context
+	creds *auth.Credentials
+	vfs   *vfs.VirtualFilesystem
+	mns   *vfs.MountNamespace
+	root  vfs.VirtualDentry
+}
+
+// newTestSystem sets up a minimal environment for running a test, including an
+// instance of a test filesystem. Tests can control the contents of the
+// filesystem by providing an appropriate rootFn, which should return a
+// pre-populated root dentry.
+func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem {
+	ctx := contexttest.Context(t)
+	creds := auth.CredentialsFromContext(ctx)
+	v := vfs.New()
+	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn})
+	mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("Failed to create testfs root mount: %v", err)
+	}
+
+	s := &TestSystem{
+		t:     t,
+		ctx:   ctx,
+		creds: creds,
+		vfs:   v,
+		mns:   mns,
+		root:  mns.Root(),
+	}
+	runtime.SetFinalizer(s, func(s *TestSystem) { s.root.DecRef() })
+	return s
+}
+
+// PathOpAtRoot constructs a vfs.PathOperation for a path from the
+// root of the test filesystem.
+//
+// Precondition: path should be relative path.
+func (s *TestSystem) PathOpAtRoot(path string) vfs.PathOperation {
+	return vfs.PathOperation{
+		Root:     s.root,
+		Start:    s.root,
+		Pathname: path,
+	}
+}
+
+// GetDentryOrDie attempts to resolve a dentry referred to by the
+// provided path operation. If unsuccessful, the test fails.
+func (s *TestSystem) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry {
+	vd, err := s.vfs.GetDentryAt(s.ctx, s.creds, &pop, &vfs.GetDentryOptions{})
+	if err != nil {
+		s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err)
+	}
+	return vd
+}
+
+func (s *TestSystem) ReadToEnd(fd *vfs.FileDescription) (string, error) {
+	buf := make([]byte, usermem.PageSize)
+	bufIOSeq := usermem.BytesIOSequence(buf)
+	opts := vfs.ReadOptions{}
+
+	var content bytes.Buffer
+	for {
+		n, err := fd.Impl().Read(s.ctx, bufIOSeq, opts)
+		if n == 0 || err != nil {
+			if err == io.EOF {
+				err = nil
+			}
+			return content.String(), err
+		}
+		content.Write(buf[:n])
+	}
+}
+
+type fsType struct {
+	rootFn RootDentryFn
+}
+
+type filesystem struct {
+	kernfs.Filesystem
+}
+
+type file struct {
+	kernfs.DynamicBytesFile
+	content string
+}
+
+func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry {
+	f := &file{}
+	f.content = content
+	f.DynamicBytesFile.Init(creds, fs.NextIno(), f)
+
+	d := &kernfs.Dentry{}
+	d.Init(f)
+	return d
+}
+
+func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%s", f.content)
+	return nil
+}
+
+type attrs struct {
+	kernfs.InodeAttrs
+}
+
+func (a *attrs) SetStat(fs *vfs.Filesystem, opt vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+type readonlyDir struct {
+	attrs
+	kernfs.InodeNotSymlink
+	kernfs.InodeNoDynamicLookup
+	kernfs.InodeDirectoryNoNewChildren
+
+	kernfs.OrderedChildren
+	dentry kernfs.Dentry
+}
+
+func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+	dir := &readonlyDir{}
+	dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	dir.dentry.Init(dir)
+
+	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
+
+	return &dir.dentry
+}
+
+func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	fd := &kernfs.GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+	return fd.VFSFileDescription(), nil
+}
+
+type dir struct {
+	attrs
+	kernfs.InodeNotSymlink
+	kernfs.InodeNoDynamicLookup
+
+	fs     *filesystem
+	dentry kernfs.Dentry
+	kernfs.OrderedChildren
+}
+
+func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+	dir := &dir{}
+	dir.fs = fs
+	dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
+	dir.dentry.Init(dir)
+
+	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
+
+	return &dir.dentry
+}
+
+func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	fd := &kernfs.GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+	return fd.VFSFileDescription(), nil
+}
+
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+	creds := auth.CredentialsFromContext(ctx)
+	dir := d.fs.newDir(creds, opts.Mode, nil)
+	dirVFSD := dir.VFSDentry()
+	if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil {
+		dir.DecRef()
+		return nil, err
+	}
+	d.IncLinks(1)
+	return dirVFSD, nil
+}
+
+func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+	creds := auth.CredentialsFromContext(ctx)
+	f := d.fs.newFile(creds, "")
+	fVFSD := f.VFSDentry()
+	if err := d.OrderedChildren.Insert(name, fVFSD); err != nil {
+		f.DecRef()
+		return nil, err
+	}
+	return fVFSD, nil
+}
+
+func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+func (fst *fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	fs := &filesystem{}
+	fs.Init(vfsObj)
+	root := fst.rootFn(creds, fs)
+	return fs.VFSFilesystem(), root.VFSDentry(), nil
+}
+
+// -------------------- Remainder of the file are test cases --------------------
+
+func TestBasic(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"file1": fs.newFile(creds, staticFileContent),
+		})
+	})
+	sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef()
+}
+
+func TestMkdirGetDentry(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"dir1": fs.newDir(creds, 0755, nil),
+		})
+	})
+
+	pop := sys.PathOpAtRoot("dir1/a new directory")
+	if err := sys.vfs.MkdirAt(sys.ctx, sys.creds, &pop, &vfs.MkdirOptions{Mode: 0755}); err != nil {
+		t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err)
+	}
+	sys.GetDentryOrDie(pop).DecRef()
+}
+
+func TestReadStaticFile(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"file1": fs.newFile(creds, staticFileContent),
+		})
+	})
+
+	pop := sys.PathOpAtRoot("file1")
+	fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+	if err != nil {
+		sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+	}
+	defer fd.DecRef()
+
+	content, err := sys.ReadToEnd(fd)
+	if err != nil {
+		sys.t.Fatalf("Read failed: %v", err)
+	}
+	if diff := cmp.Diff(staticFileContent, content); diff != "" {
+		sys.t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff)
+	}
+}
+
+func TestCreateNewFileInStaticDir(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"dir1": fs.newDir(creds, 0755, nil),
+		})
+	})
+
+	pop := sys.PathOpAtRoot("dir1/newfile")
+	opts := &vfs.OpenOptions{Flags: linux.O_CREAT | linux.O_EXCL, Mode: defaultMode}
+	fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, opts)
+	if err != nil {
+		sys.t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err)
+	}
+
+	// Close the file. The file should persist.
+	fd.DecRef()
+
+	fd, err = sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+	if err != nil {
+		sys.t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
+	}
+	fd.DecRef()
+}
+
+// direntCollector provides an implementation for vfs.IterDirentsCallback for
+// testing. It simply iterates to the end of a given directory FD and collects
+// all dirents emitted by the callback.
+type direntCollector struct {
+	mu      sync.Mutex
+	dirents map[string]vfs.Dirent
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (d *direntCollector) Handle(dirent vfs.Dirent) bool {
+	d.mu.Lock()
+	if d.dirents == nil {
+		d.dirents = make(map[string]vfs.Dirent)
+	}
+	d.dirents[dirent.Name] = dirent
+	d.mu.Unlock()
+	return true
+}
+
+// count returns the number of dirents currently in the collector.
+func (d *direntCollector) count() int {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	return len(d.dirents)
+}
+
+// contains checks whether the collector has a dirent with the given name and
+// type.
+func (d *direntCollector) contains(name string, typ uint8) error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	dirent, ok := d.dirents[name]
+	if !ok {
+		return fmt.Errorf("No dirent named %q found", name)
+	}
+	if dirent.Type != typ {
+		return fmt.Errorf("Dirent named %q found, but was expecting type %d, got: %+v", name, typ, dirent)
+	}
+	return nil
+}
+
+func TestDirFDReadWrite(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, nil)
+	})
+
+	pop := sys.PathOpAtRoot("/")
+	fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+	if err != nil {
+		sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+	}
+	defer fd.DecRef()
+
+	// Read/Write should fail for directory FDs.
+	if _, err := fd.Read(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR {
+		sys.t.Fatalf("Read for directory FD failed with unexpected error: %v", err)
+	}
+	if _, err := fd.Write(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR {
+		sys.t.Fatalf("Wrire for directory FD failed with unexpected error: %v", err)
+	}
+}
+
+func TestDirFDIterDirents(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			// Fill root with nodes backed by various inode implementations.
+			"dir1": fs.newReadonlyDir(creds, 0755, nil),
+			"dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{
+				"dir3": fs.newDir(creds, 0755, nil),
+			}),
+			"file1": fs.newFile(creds, staticFileContent),
+		})
+	})
+
+	pop := sys.PathOpAtRoot("/")
+	fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+	if err != nil {
+		sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+	}
+	defer fd.DecRef()
+
+	collector := &direntCollector{}
+	if err := fd.IterDirents(sys.ctx, collector); err != nil {
+		sys.t.Fatalf("IterDirent failed: %v", err)
+	}
+
+	// Root directory should contain ".", ".." and 3 children:
+	if collector.count() != 5 {
+		sys.t.Fatalf("IterDirent returned too many dirents")
+	}
+	for _, dirName := range []string{".", "..", "dir1", "dir2"} {
+		if err := collector.contains(dirName, linux.DT_DIR); err != nil {
+			sys.t.Fatalf("IterDirent had unexpected results: %v", err)
+		}
+	}
+	if err := collector.contains("file1", linux.DT_REG); err != nil {
+		sys.t.Fatalf("IterDirent had unexpected results: %v", err)
+	}
+
+}
-- 
cgit v1.2.3


From 93d429d5b1e3801fb4c29568bcd40d6854c9fe94 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 12 Dec 2019 13:17:47 -0800
Subject: Implement memmap.MappingIdentity for vfs.FileDescription.

PiperOrigin-RevId: 285255855
---
 pkg/sentry/memmap/BUILD                      |  1 -
 pkg/sentry/memmap/memmap.go                  |  8 +++--
 pkg/sentry/vfs/context.go                    | 13 ++++++++
 pkg/sentry/vfs/file_description.go           | 44 ++++++++++++++++++++++++++++
 pkg/sentry/vfs/file_description_impl_util.go |  9 ++++++
 5 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD
index 3ef84245b..112794e9c 100644
--- a/pkg/sentry/memmap/BUILD
+++ b/pkg/sentry/memmap/BUILD
@@ -41,7 +41,6 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/log",
-        "//pkg/refs",
         "//pkg/sentry/context",
         "//pkg/sentry/platform",
         "//pkg/sentry/usermem",
diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go
index 03b99aaea..16a722a13 100644
--- a/pkg/sentry/memmap/memmap.go
+++ b/pkg/sentry/memmap/memmap.go
@@ -18,7 +18,6 @@ package memmap
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -235,8 +234,11 @@ type InvalidateOpts struct {
 // coincidental; fs.File implements MappingIdentity, and some
 // fs.InodeOperations implement Mappable.)
 type MappingIdentity interface {
-	// MappingIdentity is reference-counted.
-	refs.RefCounter
+	// IncRef increments the MappingIdentity's reference count.
+	IncRef()
+
+	// DecRef decrements the MappingIdentity's reference count.
+	DecRef()
 
 	// MappedName returns the application-visible name shown in
 	// /proc/[pid]/maps.
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index 32cf9151b..705194ebc 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -24,6 +24,9 @@ type contextID int
 const (
 	// CtxMountNamespace is a Context.Value key for a MountNamespace.
 	CtxMountNamespace contextID = iota
+
+	// CtxRoot is a Context.Value key for a VFS root.
+	CtxRoot
 )
 
 // MountNamespaceFromContext returns the MountNamespace used by ctx. It does
@@ -35,3 +38,13 @@ func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
 	}
 	return nil
 }
+
+// RootFromContext returns the VFS root used by ctx. It takes a reference on
+// the returned VirtualDentry. If ctx does not have a specific VFS root,
+// RootFromContext returns a zero-value VirtualDentry.
+func RootFromContext(ctx context.Context) VirtualDentry {
+	if v := ctx.Value(CtxRoot); v != nil {
+		return v.(VirtualDentry)
+	}
+	return VirtualDentry{}
+}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 4473dfce8..6575afd16 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -334,3 +334,47 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
 func (fd *FileDescription) SyncFS(ctx context.Context) error {
 	return fd.vd.mount.fs.impl.Sync(ctx)
 }
+
+// MappedName implements memmap.MappingIdentity.MappedName.
+func (fd *FileDescription) MappedName(ctx context.Context) string {
+	vfsroot := RootFromContext(ctx)
+	s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd)
+	if vfsroot.Ok() {
+		vfsroot.DecRef()
+	}
+	return s
+}
+
+// DeviceID implements memmap.MappingIdentity.DeviceID.
+func (fd *FileDescription) DeviceID() uint64 {
+	stat, err := fd.impl.Stat(context.Background(), StatOptions{
+		// There is no STATX_DEV; we assume that Stat will return it if it's
+		// available regardless of mask.
+		Mask: 0,
+		// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_sb->s_dev
+		// directly.
+		Sync: linux.AT_STATX_DONT_SYNC,
+	})
+	if err != nil {
+		return 0
+	}
+	return uint64(linux.MakeDeviceID(uint16(stat.DevMajor), stat.DevMinor))
+}
+
+// InodeID implements memmap.MappingIdentity.InodeID.
+func (fd *FileDescription) InodeID() uint64 {
+	stat, err := fd.impl.Stat(context.Background(), StatOptions{
+		Mask: linux.STATX_INO,
+		// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly.
+		Sync: linux.AT_STATX_DONT_SYNC,
+	})
+	if err != nil || stat.Mask&linux.STATX_INO == 0 {
+		return 0
+	}
+	return stat.Ino
+}
+
+// Msync implements memmap.MappingIdentity.Msync.
+func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
+	return fd.impl.Sync(ctx)
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 4fbad7840..aae023254 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -252,3 +252,12 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6
 	fd.off = offset
 	return offset, nil
 }
+
+// GenericConfigureMMap may be used by most implementations of
+// FileDescriptionImpl.ConfigureMMap.
+func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error {
+	opts.Mappable = m
+	opts.MappingIdentity = fd
+	fd.IncRef()
+	return nil
+}
-- 
cgit v1.2.3


From be2754a4b99cc92f13f479f74a5da8b0e6cb5839 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 12 Dec 2019 14:40:36 -0800
Subject: Add iptables testing framework.

It would be preferrable to test iptables via syscall tests, but there are some
problems with that approach:

* We're limited to loopback-only, as syscall tests involve only a single
  container. Other link interfaces (e.g. fdbased) should be tested.
* We'd have to shell out to call iptables anyways, as the iptables syscall
  interface itself is too large and complex to work with alone.
* Running the Linux/native version of the syscall test will require root, which
  is a pain to configure, is inherently unsafe, and could leave host iptables
  misconfigured.

Using the go_test target allows there to be no new test runner.

PiperOrigin-RevId: 285274275
---
 WORKSPACE                       |  39 +++++++++
 kokoro/iptables_tests.cfg       |  10 +++
 runsc/dockerutil/dockerutil.go  |  10 +++
 scripts/iptables_tests.sh       |  27 ++++++
 test/iptables/BUILD             |  31 +++++++
 test/iptables/README.md         |  44 ++++++++++
 test/iptables/filter_input.go   | 124 ++++++++++++++++++++++++++++
 test/iptables/iptables.go       |  53 ++++++++++++
 test/iptables/iptables_test.go  | 179 ++++++++++++++++++++++++++++++++++++++++
 test/iptables/iptables_util.go  |  82 ++++++++++++++++++
 test/iptables/runner/BUILD      |  16 ++++
 test/iptables/runner/Dockerfile |   4 +
 test/iptables/runner/main.go    |  70 ++++++++++++++++
 13 files changed, 689 insertions(+)
 create mode 100644 kokoro/iptables_tests.cfg
 create mode 100755 scripts/iptables_tests.sh
 create mode 100644 test/iptables/BUILD
 create mode 100644 test/iptables/README.md
 create mode 100644 test/iptables/filter_input.go
 create mode 100644 test/iptables/iptables.go
 create mode 100644 test/iptables/iptables_test.go
 create mode 100644 test/iptables/iptables_util.go
 create mode 100644 test/iptables/runner/BUILD
 create mode 100644 test/iptables/runner/Dockerfile
 create mode 100644 test/iptables/runner/main.go

diff --git a/WORKSPACE b/WORKSPACE
index 4561ed8fc..4b5a3bfe2 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -106,6 +106,45 @@ load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
 
 rules_pkg_dependencies()
 
+# Container rules.
+http_archive(
+    name = "io_bazel_rules_docker",
+    sha256 = "14ac30773fdb393ddec90e158c9ec7ebb3f8a4fd533ec2abbfd8789ad81a284b",
+    strip_prefix = "rules_docker-0.12.1",
+    urls = ["https://github.com/bazelbuild/rules_docker/releases/download/v0.12.1/rules_docker-v0.12.1.tar.gz"],
+)
+
+load(
+    "@io_bazel_rules_docker//repositories:repositories.bzl",
+    container_repositories = "repositories",
+)
+
+container_repositories()
+
+load("@io_bazel_rules_docker//repositories:deps.bzl", container_deps = "deps")
+
+container_deps()
+
+load(
+    "@io_bazel_rules_docker//container:container.bzl",
+    "container_pull",
+)
+
+# This container is built from the Dockerfile in test/iptables/runner.
+container_pull(
+    name = "iptables-test",
+    registry = "gcr.io",
+    repository = "gvisor-presubmit/iptables-test",
+    digest = "sha256:a137d692a2eb9fc7bf95c5f4a568da090e2c31098e93634421ed88f3a3f1db65",
+)
+
+load(
+    "@io_bazel_rules_docker//go:image.bzl",
+    _go_image_repos = "repositories",
+)
+
+_go_image_repos()
+
 # External repositories, in sorted order.
 go_repository(
     name = "com_github_cenkalti_backoff",
diff --git a/kokoro/iptables_tests.cfg b/kokoro/iptables_tests.cfg
new file mode 100644
index 000000000..7af20629a
--- /dev/null
+++ b/kokoro/iptables_tests.cfg
@@ -0,0 +1,10 @@
+build_file: "repo/scripts/iptables_test.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+    regex: "**/runsc_logs_*.tar.gz"
+  }
+}
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
index 57f6ae8de..9b6346ca2 100644
--- a/runsc/dockerutil/dockerutil.go
+++ b/runsc/dockerutil/dockerutil.go
@@ -380,6 +380,16 @@ func (d *Docker) FindPort(sandboxPort int) (int, error) {
 	return port, nil
 }
 
+// FindIP returns the IP address of the container as a string.
+func (d *Docker) FindIP() (string, error) {
+	const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}`
+	out, err := do("inspect", "-f", format, d.Name)
+	if err != nil {
+		return "", fmt.Errorf("error retrieving IP: %v", err)
+	}
+	return strings.TrimSpace(out), nil
+}
+
 // SandboxPid returns the PID to the sandbox process.
 func (d *Docker) SandboxPid() (int, error) {
 	out, err := do("inspect", "-f={{.State.Pid}}", d.Name)
diff --git a/scripts/iptables_tests.sh b/scripts/iptables_tests.sh
new file mode 100755
index 000000000..c47cbd675
--- /dev/null
+++ b/scripts/iptables_tests.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+install_runsc_for_test iptables
+
+# Build the docker image for the test.
+run //test/iptables/runner --norun
+
+# TODO(gvisor.dev/issue/170): Also test this on runsc once iptables are better
+# supported
+test //test/iptables:iptables_test "--test_arg=--runtime=runc" \
+  "--test_arg=--image=bazel/test/iptables/runner:runner"
diff --git a/test/iptables/BUILD b/test/iptables/BUILD
new file mode 100644
index 000000000..fa833c3b2
--- /dev/null
+++ b/test/iptables/BUILD
@@ -0,0 +1,31 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "iptables",
+    srcs = [
+        "filter_input.go",
+        "iptables.go",
+        "iptables_util.go",
+    ],
+    importpath = "gvisor.dev/gvisor/test/iptables",
+    visibility = ["//test/iptables:__subpackages__"],
+)
+
+go_test(
+    name = "iptables_test",
+    srcs = [
+        "iptables_test.go",
+    ],
+    embed = [":iptables"],
+    tags = [
+        "local",
+        "manual",
+    ],
+    deps = [
+        "//pkg/log",
+        "//runsc/dockerutil",
+        "//runsc/testutil",
+    ],
+)
diff --git a/test/iptables/README.md b/test/iptables/README.md
new file mode 100644
index 000000000..b37cb2a96
--- /dev/null
+++ b/test/iptables/README.md
@@ -0,0 +1,44 @@
+# iptables Tests
+
+iptables tests are run via `scripts/iptables\_test.sh`.
+
+## Test Structure
+
+Each test implements `TestCase`, providing (1) a function to run inside the
+container and (2) a function to run locally. Those processes are given each
+others' IP addresses. The test succeeds when both functions succeed.
+
+The function inside the container (`ContainerAction`) typically sets some
+iptables rules and then tries to send or receive packets. The local function
+(`LocalAction`) will typically just send or receive packets.
+
+### Adding Tests
+
+1) Add your test to the `iptables` package.
+
+2) Register the test in an `init` function via `RegisterTestCase` (see
+`filter_input.go` as an example).
+
+3) Add it to `iptables_test.go` (see the other tests in that file).
+
+Your test is now runnable with bazel!
+
+## Run individual tests
+
+Build the testing Docker container:
+
+```bash
+$ bazel run //test/iptables/runner -- --norun
+```
+
+Run an individual test via:
+
+```bash
+$ bazel test //test/iptables:iptables_test --test_filter=<TESTNAME>
+```
+
+To run an individual test with `runc`:
+
+```bash
+$ bazel test //test/iptables:iptables_test --test_filter=<TESTNAME> --test_arg=--runtime=runc
+```
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
new file mode 100644
index 000000000..923f44e68
--- /dev/null
+++ b/test/iptables/filter_input.go
@@ -0,0 +1,124 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+	"fmt"
+	"net"
+	"time"
+)
+
+const (
+	dropPort         = 2401
+	acceptPort       = 2402
+	sendloopDuration = 2 * time.Second
+	network          = "udp4"
+)
+
+func init() {
+	RegisterTestCase(FilterInputDropUDP{})
+	RegisterTestCase(FilterInputDropUDPPort{})
+	RegisterTestCase(FilterInputDropDifferentUDPPort{})
+}
+
+// FilterInputDropUDP tests that we can drop UDP traffic.
+type FilterInputDropUDP struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDropUDP) Name() string {
+	return "FilterInputDropUDP"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDropUDP) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-p", "udp", "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort)
+	} else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+		return fmt.Errorf("error reading: %v", err)
+	}
+
+	// At this point we know that reading timed out and never received a
+	// packet.
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDropUDP) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// FilterInputDropUDPPort tests that we can drop UDP traffic by port.
+type FilterInputDropUDPPort struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDropUDPPort) Name() string {
+	return "FilterInputDropUDPPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDropUDPPort) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort)
+	} else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+		return fmt.Errorf("error reading: %v", err)
+	}
+
+	// At this point we know that reading timed out and never received a
+	// packet.
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDropUDPPort) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// FilterInputDropDifferentUDPPort tests that dropping traffic for a single UDP port
+// doesn't drop packets on other ports.
+type FilterInputDropDifferentUDPPort struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDropDifferentUDPPort) Name() string {
+	return "FilterInputDropDifferentUDPPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDropDifferentUDPPort) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on another port.
+	if err := listenUDP(acceptPort, sendloopDuration); err != nil {
+		return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %v", acceptPort, err)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDropDifferentUDPPort) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables.go b/test/iptables/iptables.go
new file mode 100644
index 000000000..2e565d988
--- /dev/null
+++ b/test/iptables/iptables.go
@@ -0,0 +1,53 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package iptables contains a set of iptables tests implemented as TestCases
+package iptables
+
+import (
+	"fmt"
+	"net"
+)
+
+// IPExchangePort is the port the container listens on to receive the IP
+// address of the local process.
+const IPExchangePort = 2349
+
+// A TestCase contains one action to run in the container and one to run
+// locally. The actions run concurrently and each must succeed for the test
+// pass.
+type TestCase interface {
+	// Name returns the name of the test.
+	Name() string
+
+	// ContainerAction runs inside the container. It receives the IP of the
+	// local process.
+	ContainerAction(ip net.IP) error
+
+	// LocalAction runs locally. It receives the IP of the container.
+	LocalAction(ip net.IP) error
+}
+
+// Tests maps test names to TestCase.
+//
+// New TestCases are added by calling RegisterTestCase in an init function.
+var Tests = map[string]TestCase{}
+
+// RegisterTestCase registers tc so it can be run.
+func RegisterTestCase(tc TestCase) {
+	if _, ok := Tests[tc.Name()]; ok {
+		panic(fmt.Sprintf("TestCase %s already registered.", tc.Name()))
+	}
+	Tests[tc.Name()] = tc
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
new file mode 100644
index 000000000..bfbf1bb87
--- /dev/null
+++ b/test/iptables/iptables_test.go
@@ -0,0 +1,179 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+	"fmt"
+	"net"
+	"os"
+	"path"
+	"testing"
+	"time"
+
+	"flag"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/runsc/testutil"
+)
+
+const timeout time.Duration = 10 * time.Second
+
+var image = flag.String("image", "bazel/test/iptables/runner:runner", "image to run tests in")
+
+type result struct {
+	output string
+	err    error
+}
+
+// singleTest runs a TestCase. Each test follows a pattern:
+// - Create a container.
+// - Get the container's IP.
+// - Send the container our IP.
+// - Start a new goroutine running the local action of the test.
+// - Wait for both the container and local actions to finish.
+//
+// Container output is logged to $TEST_UNDECLARED_OUTPUTS_DIR if it exists, or
+// to stderr.
+func singleTest(test TestCase) error {
+	if _, ok := Tests[test.Name()]; !ok {
+		return fmt.Errorf("no test found with name %q. Has it been registered?", test.Name())
+	}
+
+	// Create and start the container.
+	cont := dockerutil.MakeDocker("gvisor-iptables")
+	defer cont.CleanUp()
+	resultChan := make(chan *result)
+	go func() {
+		output, err := cont.RunFg("--cap-add=NET_ADMIN", *image, "-name", test.Name())
+		logContainer(output, err)
+		resultChan <- &result{output, err}
+	}()
+
+	// Get the container IP.
+	ip, err := getIP(cont)
+	if err != nil {
+		return fmt.Errorf("failed to get container IP: %v", err)
+	}
+
+	// Give the container our IP.
+	if err := sendIP(ip); err != nil {
+		return fmt.Errorf("failed to send IP to container: %v", err)
+	}
+
+	// Run our side of the test.
+	errChan := make(chan error)
+	go func() {
+		errChan <- test.LocalAction(ip)
+	}()
+
+	// Wait for both the container and local tests to finish.
+	var res *result
+	to := time.After(timeout)
+	for localDone := false; res == nil || !localDone; {
+		select {
+		case res = <-resultChan:
+			log.Infof("Container finished.")
+		case err, localDone = <-errChan:
+			log.Infof("Local finished.")
+			if err != nil {
+				return fmt.Errorf("local test failed: %v", err)
+			}
+		case <-to:
+			return fmt.Errorf("timed out after %f seconds", timeout.Seconds())
+		}
+	}
+
+	return res.err
+}
+
+func getIP(cont dockerutil.Docker) (net.IP, error) {
+	// The container might not have started yet, so retry a few times.
+	var ipStr string
+	to := time.After(timeout)
+	for ipStr == "" {
+		ipStr, _ = cont.FindIP()
+		select {
+		case <-to:
+			return net.IP{}, fmt.Errorf("timed out getting IP after %f seconds", timeout.Seconds())
+		default:
+			time.Sleep(250 * time.Millisecond)
+		}
+	}
+	ip := net.ParseIP(ipStr)
+	if ip == nil {
+		return net.IP{}, fmt.Errorf("invalid IP: %q", ipStr)
+	}
+	log.Infof("Container has IP of %s", ipStr)
+	return ip, nil
+}
+
+func sendIP(ip net.IP) error {
+	contAddr := net.TCPAddr{
+		IP:   ip,
+		Port: IPExchangePort,
+	}
+	var conn *net.TCPConn
+	// The container may not be listening when we first connect, so retry
+	// upon error.
+	cb := func() error {
+		c, err := net.DialTCP("tcp4", nil, &contAddr)
+		conn = c
+		return err
+	}
+	if err := testutil.Poll(cb, timeout); err != nil {
+		return fmt.Errorf("timed out waiting to send IP, most recent error: %v", err)
+	}
+	if _, err := conn.Write([]byte{0}); err != nil {
+		return fmt.Errorf("error writing to container: %v", err)
+	}
+	return nil
+}
+
+func logContainer(output string, err error) {
+	msg := fmt.Sprintf("Container error: %v\nContainer output:\n%v", err, output)
+	if artifactsDir := os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); artifactsDir != "" {
+		fpath := path.Join(artifactsDir, "container.log")
+		if file, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE, 0644); err != nil {
+			log.Warningf("Failed to open log file %q: %v", fpath, err)
+		} else {
+			defer file.Close()
+			if _, err := file.Write([]byte(msg)); err == nil {
+				return
+			}
+			log.Warningf("Failed to write to log file %s: %v", fpath, err)
+		}
+	}
+
+	// We couldn't write to the output directory -- just log to stderr.
+	log.Infof(msg)
+}
+
+func TestFilterInputDropUDP(t *testing.T) {
+	if err := singleTest(FilterInputDropUDP{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDropUDPPort(t *testing.T) {
+	if err := singleTest(FilterInputDropUDPPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDropDifferentUDPPort(t *testing.T) {
+	if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
new file mode 100644
index 000000000..3a4d11f1a
--- /dev/null
+++ b/test/iptables/iptables_util.go
@@ -0,0 +1,82 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+	"fmt"
+	"net"
+	"os/exec"
+	"time"
+)
+
+const iptablesBinary = "iptables"
+
+// filterTable calls `iptables -t filter` with the given args.
+func filterTable(args ...string) error {
+	args = append([]string{"-t", "filter"}, args...)
+	cmd := exec.Command(iptablesBinary, args...)
+	if out, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("error running iptables with args %v\nerror: %v\noutput: %s", args, err, string(out))
+	}
+	return nil
+}
+
+// listenUDP listens on a UDP port and returns the value of net.Conn.Read() for
+// the first read on that port.
+func listenUDP(port int, timeout time.Duration) error {
+	localAddr := net.UDPAddr{
+		Port: port,
+	}
+	conn, err := net.ListenUDP(network, &localAddr)
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+	conn.SetDeadline(time.Now().Add(timeout))
+	_, err = conn.Read([]byte{0})
+	return err
+}
+
+// sendUDPLoop sends 1 byte UDP packets repeatedly to the IP and port specified
+// over a duration.
+func sendUDPLoop(ip net.IP, port int, duration time.Duration) error {
+	// Send packets for a few seconds.
+	remote := net.UDPAddr{
+		IP:   ip,
+		Port: port,
+	}
+	conn, err := net.DialUDP(network, nil, &remote)
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	to := time.After(duration)
+	for timedOut := false; !timedOut; {
+		// This may return an error (connection refused) if the remote
+		// hasn't started listening yet or they're dropping our
+		// packets. So we ignore Write errors and depend on the remote
+		// to report a failure if it doesn't get a packet it needs.
+		conn.Write([]byte{0})
+		select {
+		case <-to:
+			timedOut = true
+		default:
+			time.Sleep(200 * time.Millisecond)
+		}
+	}
+
+	return nil
+}
diff --git a/test/iptables/runner/BUILD b/test/iptables/runner/BUILD
new file mode 100644
index 000000000..1c59e26b9
--- /dev/null
+++ b/test/iptables/runner/BUILD
@@ -0,0 +1,16 @@
+load("@io_bazel_rules_docker//container:container.bzl", "container_image")
+load("@io_bazel_rules_docker//go:image.bzl", "go_image")
+
+package(licenses = ["notice"])
+
+container_image(
+    name = "iptables-base",
+    base = "@iptables-test//image",
+)
+
+go_image(
+    name = "runner",
+    srcs = ["main.go"],
+    base = ":iptables-base",
+    deps = ["//test/iptables"],
+)
diff --git a/test/iptables/runner/Dockerfile b/test/iptables/runner/Dockerfile
new file mode 100644
index 000000000..b77db44a1
--- /dev/null
+++ b/test/iptables/runner/Dockerfile
@@ -0,0 +1,4 @@
+# This Dockerfile builds the image hosted at
+# gcr.io/gvisor-presubmit/iptables-test.
+FROM ubuntu
+RUN apt update && apt install -y iptables
diff --git a/test/iptables/runner/main.go b/test/iptables/runner/main.go
new file mode 100644
index 000000000..3c794114e
--- /dev/null
+++ b/test/iptables/runner/main.go
@@ -0,0 +1,70 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package main runs iptables tests from within a docker container.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"net"
+
+	"gvisor.dev/gvisor/test/iptables"
+)
+
+var name = flag.String("name", "", "name of the test to run")
+
+func main() {
+	flag.Parse()
+
+	// Find out which test we're running.
+	test, ok := iptables.Tests[*name]
+	if !ok {
+		log.Fatalf("No test found named %q", *name)
+	}
+	log.Printf("Running test %q", *name)
+
+	// Get the IP of the local process.
+	ip, err := getIP()
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Run the test.
+	if err := test.ContainerAction(ip); err != nil {
+		log.Fatalf("Failed running test %q: %v", *name, err)
+	}
+}
+
+// getIP listens for a connection from the local process and returns the source
+// IP of that connection.
+func getIP() (net.IP, error) {
+	localAddr := net.TCPAddr{
+		Port: iptables.IPExchangePort,
+	}
+	listener, err := net.ListenTCP("tcp4", &localAddr)
+	if err != nil {
+		return net.IP{}, fmt.Errorf("failed listening for IP: %v", err)
+	}
+	defer listener.Close()
+	conn, err := listener.AcceptTCP()
+	if err != nil {
+		return net.IP{}, fmt.Errorf("failed accepting IP: %v", err)
+	}
+	defer conn.Close()
+	log.Printf("Connected to %v", conn.RemoteAddr())
+
+	return conn.RemoteAddr().(*net.TCPAddr).IP, nil
+}
-- 
cgit v1.2.3


From ad80dcf47077a1938631fe36f6b406256f3f3f4f Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 13 Dec 2019 16:26:06 -0800
Subject: Properly generate the EUI64 interface identifier from an Ethernet
 address

Fixed a bug where the interface identifier was not properly generated from an
Ethernet address.

Tests: Unittests to make sure the functions generating the EUI64 interface
identifier are correct.
PiperOrigin-RevId: 285494562
---
 pkg/tcpip/header/BUILD        |  3 +++
 pkg/tcpip/header/ipv6.go      | 21 ++++++++++----------
 pkg/tcpip/header/ipv6_test.go | 45 +++++++++++++++++++++++++++++++++++++++++++
 pkg/tcpip/stack/ndp.go        |  6 +++---
 pkg/tcpip/stack/ndp_test.go   |  2 +-
 5 files changed, 63 insertions(+), 14 deletions(-)
 create mode 100644 pkg/tcpip/header/ipv6_test.go

diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index 8392cb9e5..f1d837196 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -38,12 +38,15 @@ go_test(
     size = "small",
     srcs = [
         "checksum_test.go",
+        "ipv6_test.go",
         "ipversion_test.go",
         "tcp_test.go",
     ],
     deps = [
         ":header",
+        "//pkg/tcpip",
         "//pkg/tcpip/buffer",
+        "@com_github_google_go-cmp//cmp:go_default_library",
     ],
 )
 
diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index 5275b34d4..fc671e439 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -278,33 +278,34 @@ func SolicitedNodeAddr(addr tcpip.Address) tcpip.Address {
 	return solicitedNodeMulticastPrefix + addr[len(addr)-3:]
 }
 
-// EthernetAdddressToEUI64IntoBuf populates buf with a EUI-64 from a 48-bit
-// Ethernet/MAC address.
+// EthernetAdddressToModifiedEUI64IntoBuf populates buf with a modified EUI-64
+// from a 48-bit Ethernet/MAC address, as per RFC 4291 section 2.5.1.
 //
 // buf MUST be at least 8 bytes.
-func EthernetAdddressToEUI64IntoBuf(linkAddr tcpip.LinkAddress, buf []byte) {
+func EthernetAdddressToModifiedEUI64IntoBuf(linkAddr tcpip.LinkAddress, buf []byte) {
 	buf[0] = linkAddr[0] ^ 2
 	buf[1] = linkAddr[1]
 	buf[2] = linkAddr[2]
-	buf[3] = 0xFE
+	buf[3] = 0xFF
 	buf[4] = 0xFE
 	buf[5] = linkAddr[3]
 	buf[6] = linkAddr[4]
 	buf[7] = linkAddr[5]
 }
 
-// EthernetAddressToEUI64 computes an EUI-64 from a 48-bit Ethernet/MAC address.
-func EthernetAddressToEUI64(linkAddr tcpip.LinkAddress) [IIDSize]byte {
+// EthernetAddressToModifiedEUI64 computes a modified EUI-64 from a 48-bit
+// Ethernet/MAC address, as per RFC 4291 section 2.5.1.
+func EthernetAddressToModifiedEUI64(linkAddr tcpip.LinkAddress) [IIDSize]byte {
 	var buf [IIDSize]byte
-	EthernetAdddressToEUI64IntoBuf(linkAddr, buf[:])
+	EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, buf[:])
 	return buf
 }
 
 // LinkLocalAddr computes the default IPv6 link-local address from a link-layer
 // (MAC) address.
 func LinkLocalAddr(linkAddr tcpip.LinkAddress) tcpip.Address {
-	// Convert a 48-bit MAC to an EUI-64 and then prepend the link-local
-	// header, FE80::.
+	// Convert a 48-bit MAC to a modified EUI-64 and then prepend the
+	// link-local header, FE80::.
 	//
 	// The conversion is very nearly:
 	//	aa:bb:cc:dd:ee:ff => FE80::Aabb:ccFF:FEdd:eeff
@@ -313,7 +314,7 @@ func LinkLocalAddr(linkAddr tcpip.LinkAddress) tcpip.Address {
 		0: 0xFE,
 		1: 0x80,
 	}
-	EthernetAdddressToEUI64IntoBuf(linkAddr, lladdrb[IIDOffsetInIPv6Address:])
+	EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, lladdrb[IIDOffsetInIPv6Address:])
 	return tcpip.Address(lladdrb[:])
 }
 
diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go
new file mode 100644
index 000000000..42c5c6fc1
--- /dev/null
+++ b/pkg/tcpip/header/ipv6_test.go
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header_test
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+const linkAddr = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+
+func TestEthernetAdddressToModifiedEUI64(t *testing.T) {
+	expectedIID := [header.IIDSize]byte{0, 2, 3, 255, 254, 4, 5, 6}
+
+	if diff := cmp.Diff(expectedIID, header.EthernetAddressToModifiedEUI64(linkAddr)); diff != "" {
+		t.Errorf("EthernetAddressToModifiedEUI64(%s) mismatch (-want +got):\n%s", linkAddr, diff)
+	}
+
+	var buf [header.IIDSize]byte
+	header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, buf[:])
+	if diff := cmp.Diff(expectedIID, buf); diff != "" {
+		t.Errorf("EthernetAddressToModifiedEUI64IntoBuf(%s, _) mismatch (-want +got):\n%s", linkAddr, diff)
+	}
+}
+
+func TestLinkLocalAddr(t *testing.T) {
+	if got, want := header.LinkLocalAddr(linkAddr), tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x02\x03\xff\xfe\x04\x05\x06"); got != want {
+		t.Errorf("got LinkLocalAddr(%s) = %s, want = %s", linkAddr, got, want)
+	}
+}
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 060a2e7c6..27bd02e76 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -1049,11 +1049,11 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 		return
 	}
 
-	// Generate an address within prefix from the EUI-64 of ndp's NIC's
-	// Ethernet MAC address.
+	// Generate an address within prefix from the modified EUI-64 of ndp's
+	// NIC's Ethernet MAC address.
 	addrBytes := make([]byte, header.IPv6AddressSize)
 	copy(addrBytes[:header.IIDOffsetInIPv6Address], prefix.ID()[:header.IIDOffsetInIPv6Address])
-	header.EthernetAdddressToEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+	header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
 	addr := tcpip.Address(addrBytes)
 	addrWithPrefix := tcpip.AddressWithPrefix{
 		Address:   addr,
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 8d811eb8e..d8e7ce67e 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -62,7 +62,7 @@ func prefixSubnetAddr(offset uint8, linkAddr tcpip.LinkAddress) (tcpip.AddressWi
 	var addr tcpip.AddressWithPrefix
 	if header.IsValidUnicastEthernetAddress(linkAddr) {
 		addrBytes := []byte(subnet.ID())
-		header.EthernetAdddressToEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+		header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
 		addr = tcpip.AddressWithPrefix{
 			Address:   tcpip.Address(addrBytes),
 			PrefixLen: 64,
-- 
cgit v1.2.3


From 6b424530397e5100b08628efe8f6c62178daa70b Mon Sep 17 00:00:00 2001
From: lubinszARM <34124929+lubinszARM@users.noreply.github.com>
Date: Fri, 13 Dec 2019 17:09:55 -0800
Subject: enable kvm to support arm64

   There are 4 jobs were finished in this package:
  	1, Virtual machine initialization.
	2, Bluepill implementation.
	3, Move ring0.Vectors() into the address with 11-bits alignment.
	4, Basic support for "SwitchToUser".
Signed-off-by: Bin Lu <bin.lu@arm.com>
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/919 from lubinszARM:pr_kvm eedea52db451bf62722759009a9f14c54a69c55f
PiperOrigin-RevId: 285501256
---
 pkg/sentry/platform/kvm/BUILD                    |  10 +-
 pkg/sentry/platform/kvm/bluepill.go              |  24 +-
 pkg/sentry/platform/kvm/bluepill_amd64.go        |  20 +-
 pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go |   7 -
 pkg/sentry/platform/kvm/bluepill_arm64.go        |  79 +++++
 pkg/sentry/platform/kvm/bluepill_arm64.s         |  87 ++++++
 pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go |  28 ++
 pkg/sentry/platform/kvm/bluepill_unsafe.go       |   9 +
 pkg/sentry/platform/kvm/filters.go               |  33 ---
 pkg/sentry/platform/kvm/filters_amd64.go         |  33 +++
 pkg/sentry/platform/kvm/filters_arm64.go         |  32 ++
 pkg/sentry/platform/kvm/kvm.go                   |   5 +-
 pkg/sentry/platform/kvm/kvm_amd64.go             |   9 +
 pkg/sentry/platform/kvm/kvm_arm64.go             |  83 ++++++
 pkg/sentry/platform/kvm/kvm_arm64_unsafe.go      |  39 +++
 pkg/sentry/platform/kvm/kvm_const.go             |   4 +-
 pkg/sentry/platform/kvm/kvm_const_arm64.go       | 132 +++++++++
 pkg/sentry/platform/kvm/machine_amd64_unsafe.go  |  40 +++
 pkg/sentry/platform/kvm/machine_arm64.go         | 122 ++++++++
 pkg/sentry/platform/kvm/machine_arm64_unsafe.go  | 362 +++++++++++++++++++++++
 pkg/sentry/platform/kvm/machine_unsafe.go        |  40 ---
 21 files changed, 1092 insertions(+), 106 deletions(-)
 create mode 100644 pkg/sentry/platform/kvm/bluepill_arm64.go
 create mode 100644 pkg/sentry/platform/kvm/bluepill_arm64.s
 create mode 100644 pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
 delete mode 100644 pkg/sentry/platform/kvm/filters.go
 create mode 100644 pkg/sentry/platform/kvm/filters_amd64.go
 create mode 100644 pkg/sentry/platform/kvm/filters_arm64.go
 create mode 100644 pkg/sentry/platform/kvm/kvm_arm64.go
 create mode 100644 pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
 create mode 100644 pkg/sentry/platform/kvm/kvm_const_arm64.go
 create mode 100644 pkg/sentry/platform/kvm/machine_arm64_unsafe.go

diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 6803d488c..5c52d4007 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -12,18 +12,26 @@ go_library(
         "bluepill_amd64.go",
         "bluepill_amd64.s",
         "bluepill_amd64_unsafe.go",
+        "bluepill_arm64.go",
+        "bluepill_arm64.s",
+        "bluepill_arm64_unsafe.go",
         "bluepill_fault.go",
         "bluepill_unsafe.go",
         "context.go",
-        "filters.go",
+        "filters_amd64.go",
+        "filters_arm64.go",
         "kvm.go",
         "kvm_amd64.go",
         "kvm_amd64_unsafe.go",
+        "kvm_arm64.go",
+        "kvm_arm64_unsafe.go",
         "kvm_const.go",
+        "kvm_const_arm64.go",
         "machine.go",
         "machine_amd64.go",
         "machine_amd64_unsafe.go",
         "machine_arm64.go",
+        "machine_arm64_unsafe.go",
         "machine_unsafe.go",
         "physical_map.go",
         "virtual_map.go",
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
index 043de51b3..30dbb74d6 100644
--- a/pkg/sentry/platform/kvm/bluepill.go
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -20,6 +20,7 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 	"gvisor.dev/gvisor/pkg/sentry/platform/safecopy"
 )
 
@@ -36,6 +37,18 @@ func sighandler()
 func dieTrampoline()
 
 var (
+	// bounceSignal is the signal used for bouncing KVM.
+	//
+	// We use SIGCHLD because it is not masked by the runtime, and
+	// it will be ignored properly by other parts of the kernel.
+	bounceSignal = syscall.SIGCHLD
+
+	// bounceSignalMask has only bounceSignal set.
+	bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1))
+
+	// bounce is the interrupt vector used to return to the kernel.
+	bounce = uint32(ring0.VirtualizationException)
+
 	// savedHandler is a pointer to the previous handler.
 	//
 	// This is called by bluepillHandler.
@@ -45,6 +58,13 @@ var (
 	dieTrampolineAddr uintptr
 )
 
+// redpill invokes a syscall with -1.
+//
+//go:nosplit
+func redpill() {
+	syscall.RawSyscall(^uintptr(0), 0, 0, 0)
+}
+
 // dieHandler is called by dieTrampoline.
 //
 //go:nosplit
@@ -73,8 +93,8 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) {
 
 func init() {
 	// Install the handler.
-	if err := safecopy.ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil {
-		panic(fmt.Sprintf("Unable to set handler for signal %d: %v", syscall.SIGSEGV, err))
+	if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil {
+		panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err))
 	}
 
 	// Extract the address for the trampoline.
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go
index 421c88220..133c2203d 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.go
@@ -24,26 +24,10 @@ import (
 )
 
 var (
-	// bounceSignal is the signal used for bouncing KVM.
-	//
-	// We use SIGCHLD because it is not masked by the runtime, and
-	// it will be ignored properly by other parts of the kernel.
-	bounceSignal = syscall.SIGCHLD
-
-	// bounceSignalMask has only bounceSignal set.
-	bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1))
-
-	// bounce is the interrupt vector used to return to the kernel.
-	bounce = uint32(ring0.VirtualizationException)
+	// The action for bluepillSignal is changed by sigaction().
+	bluepillSignal = syscall.SIGSEGV
 )
 
-// redpill on amd64 invokes a syscall with -1.
-//
-//go:nosplit
-func redpill() {
-	syscall.RawSyscall(^uintptr(0), 0, 0, 0)
-}
-
 // bluepillArchEnter is called during bluepillEnter.
 //
 //go:nosplit
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
index 9d8af143e..a63a6a071 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -23,13 +23,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
 
-// bluepillArchContext returns the arch-specific context.
-//
-//go:nosplit
-func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
-	return &((*arch.UContext64)(context).MContext)
-}
-
 // dieArchSetup initializes the state for dieTrampoline.
 //
 // The amd64 dieTrampoline requires the vCPU to be set in BX, and the last RIP
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
new file mode 100644
index 000000000..552341721
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -0,0 +1,79 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
+)
+
+var (
+	// The action for bluepillSignal is changed by sigaction().
+	bluepillSignal = syscall.SIGILL
+)
+
+// bluepillArchEnter is called during bluepillEnter.
+//
+//go:nosplit
+func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) {
+	c = vCPUPtr(uintptr(context.Regs[8]))
+	regs := c.CPU.Registers()
+	regs.Regs = context.Regs
+	regs.Sp = context.Sp
+	regs.Pc = context.Pc
+	regs.Pstate = context.Pstate
+	regs.Pstate &^= uint64(ring0.KernelFlagsClear)
+	regs.Pstate |= ring0.KernelFlagsSet
+	return
+}
+
+// bluepillArchExit is called during bluepillEnter.
+//
+//go:nosplit
+func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
+	regs := c.CPU.Registers()
+	context.Regs = regs.Regs
+	context.Sp = regs.Sp
+	context.Pc = regs.Pc
+	context.Pstate = regs.Pstate
+	context.Pstate &^= uint64(ring0.UserFlagsClear)
+	context.Pstate |= ring0.UserFlagsSet
+}
+
+// KernelSyscall handles kernel syscalls.
+//
+//go:nosplit
+func (c *vCPU) KernelSyscall() {
+	regs := c.Registers()
+	if regs.Regs[8] != ^uint64(0) {
+		regs.Pc -= 4 // Rewind.
+	}
+	ring0.Halt()
+}
+
+// KernelException handles kernel exceptions.
+//
+//go:nosplit
+func (c *vCPU) KernelException(vector ring0.Vector) {
+	regs := c.Registers()
+	if vector == ring0.Vector(bounce) {
+		regs.Pc = 0
+	}
+	ring0.Halt()
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s
new file mode 100644
index 000000000..c61700892
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.s
@@ -0,0 +1,87 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// VCPU_CPU is the location of the CPU in the vCPU struct.
+//
+// This is guaranteed to be zero.
+#define VCPU_CPU 0x0
+
+// CPU_SELF is the self reference in ring0's percpu.
+//
+// This is guaranteed to be zero.
+#define CPU_SELF 0x0
+
+// Context offsets.
+//
+// Only limited use of the context is done in the assembly stub below, most is
+// done in the Go handlers.
+#define SIGINFO_SIGNO 0x0
+#define CONTEXT_PC  0x1B8
+#define CONTEXT_R0 0xB8
+
+// See bluepill.go.
+TEXT ·bluepill(SB),NOSPLIT,$0
+begin:
+	MOVD	vcpu+0(FP), R8
+	MOVD	$VCPU_CPU(R8), R9
+	ORR	$0xffff000000000000, R9, R9
+	// Trigger sigill.
+	// In ring0.Start(), the value of R8 will be stored into tpidr_el1.
+	// When the context was loaded into vcpu successfully,
+	// we will check if the value of R10 and R9 are the same.
+	WORD	$0xd538d08a // MRS TPIDR_EL1, R10
+check_vcpu:
+	CMP	R10, R9
+	BEQ	right_vCPU
+wrong_vcpu:
+	CALL	·redpill(SB)
+	B	begin
+right_vCPU:
+	RET
+
+// sighandler: see bluepill.go for documentation.
+//
+// The arguments are the following:
+//
+// 	R0 - The signal number.
+// 	R1 - Pointer to siginfo_t structure.
+// 	R2 - Pointer to ucontext structure.
+//
+TEXT ·sighandler(SB),NOSPLIT,$0
+	// si_signo should be sigill.
+	MOVD	SIGINFO_SIGNO(R1), R7
+	CMPW	$4, R7
+	BNE	fallback
+
+	MOVD	CONTEXT_PC(R2), R7
+	CMPW	$0, R7
+	BEQ	fallback
+
+	MOVD	R2, 8(RSP)
+	BL	·bluepillHandler(SB)   // Call the handler.
+
+	RET
+
+fallback:
+	// Jump to the previous signal handler.
+	MOVD	·savedHandler(SB), R7
+	B	(R7)
+
+// dieTrampoline: see bluepill.go, bluepill_arm64_unsafe.go for documentation.
+TEXT ·dieTrampoline(SB),NOSPLIT,$0
+	// TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64.
+	MOVD	R9, 8(RSP)
+	BL	·dieHandler(SB)
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
new file mode 100644
index 000000000..e5fac0d6a
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -0,0 +1,28 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+//go:nosplit
+func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
+	// TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64.
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index ca011ef78..9add7c944 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -23,6 +23,8 @@ import (
 	"sync/atomic"
 	"syscall"
 	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 //go:linkname throw runtime.throw
@@ -49,6 +51,13 @@ func uintptrValue(addr *byte) uintptr {
 	return (uintptr)(unsafe.Pointer(addr))
 }
 
+// bluepillArchContext returns the UContext64.
+//
+//go:nosplit
+func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
+	return &((*arch.UContext64)(context).MContext)
+}
+
 // bluepillHandler is called from the signal stub.
 //
 // The world may be stopped while this is executing, and it executes on the
diff --git a/pkg/sentry/platform/kvm/filters.go b/pkg/sentry/platform/kvm/filters.go
deleted file mode 100644
index 7d949f1dd..000000000
--- a/pkg/sentry/platform/kvm/filters.go
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package kvm
-
-import (
-	"syscall"
-
-	"gvisor.dev/gvisor/pkg/seccomp"
-)
-
-// SyscallFilters returns syscalls made exclusively by the KVM platform.
-func (*KVM) SyscallFilters() seccomp.SyscallRules {
-	return seccomp.SyscallRules{
-		syscall.SYS_ARCH_PRCTL:      {},
-		syscall.SYS_IOCTL:           {},
-		syscall.SYS_MMAP:            {},
-		syscall.SYS_RT_SIGSUSPEND:   {},
-		syscall.SYS_RT_SIGTIMEDWAIT: {},
-		0xffffffffffffffff:          {}, // KVM uses syscall -1 to transition to host.
-	}
-}
diff --git a/pkg/sentry/platform/kvm/filters_amd64.go b/pkg/sentry/platform/kvm/filters_amd64.go
new file mode 100644
index 000000000..7d949f1dd
--- /dev/null
+++ b/pkg/sentry/platform/kvm/filters_amd64.go
@@ -0,0 +1,33 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// SyscallFilters returns syscalls made exclusively by the KVM platform.
+func (*KVM) SyscallFilters() seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_ARCH_PRCTL:      {},
+		syscall.SYS_IOCTL:           {},
+		syscall.SYS_MMAP:            {},
+		syscall.SYS_RT_SIGSUSPEND:   {},
+		syscall.SYS_RT_SIGTIMEDWAIT: {},
+		0xffffffffffffffff:          {}, // KVM uses syscall -1 to transition to host.
+	}
+}
diff --git a/pkg/sentry/platform/kvm/filters_arm64.go b/pkg/sentry/platform/kvm/filters_arm64.go
new file mode 100644
index 000000000..9245d07c2
--- /dev/null
+++ b/pkg/sentry/platform/kvm/filters_arm64.go
@@ -0,0 +1,32 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// SyscallFilters returns syscalls made exclusively by the KVM platform.
+func (*KVM) SyscallFilters() seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_IOCTL:           {},
+		syscall.SYS_MMAP:            {},
+		syscall.SYS_RT_SIGSUSPEND:   {},
+		syscall.SYS_RT_SIGTIMEDWAIT: {},
+		0xffffffffffffffff:          {}, // KVM uses syscall -1 to transition to host.
+	}
+}
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index ee4cd2f4d..f2c2c059e 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -21,7 +21,6 @@ import (
 	"sync"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
@@ -56,9 +55,7 @@ func New(deviceFile *os.File) (*KVM, error) {
 
 	// Ensure global initialization is done.
 	globalOnce.Do(func() {
-		physicalInit()
-		globalErr = updateSystemValues(int(fd))
-		ring0.Init(cpuid.HostFeatureSet())
+		globalErr = updateGlobalOnce(int(fd))
 	})
 	if globalErr != nil {
 		return nil, globalErr
diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go
index 5d8ef4761..c5a6f9c7d 100644
--- a/pkg/sentry/platform/kvm/kvm_amd64.go
+++ b/pkg/sentry/platform/kvm/kvm_amd64.go
@@ -17,6 +17,7 @@
 package kvm
 
 import (
+	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
 
@@ -211,3 +212,11 @@ type cpuidEntries struct {
 	_       uint32
 	entries [_KVM_NR_CPUID_ENTRIES]cpuidEntry
 }
+
+// updateGlobalOnce does global initialization. It has to be called only once.
+func updateGlobalOnce(fd int) error {
+	physicalInit()
+	err := updateSystemValues(int(fd))
+	ring0.Init(cpuid.HostFeatureSet())
+	return err
+}
diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go
new file mode 100644
index 000000000..2319c86d3
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_arm64.go
@@ -0,0 +1,83 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+	"syscall"
+)
+
+// userMemoryRegion is a region of physical memory.
+//
+// This mirrors kvm_memory_region.
+type userMemoryRegion struct {
+	slot          uint32
+	flags         uint32
+	guestPhysAddr uint64
+	memorySize    uint64
+	userspaceAddr uint64
+}
+
+type kvmOneReg struct {
+	id   uint64
+	addr uint64
+}
+
+const KVM_NR_SPSR = 5
+
+type userFpsimdState struct {
+	vregs    [64]uint64
+	fpsr     uint32
+	fpcr     uint32
+	reserved [2]uint32
+}
+
+type userRegs struct {
+	Regs    syscall.PtraceRegs
+	sp_el1  uint64
+	elr_el1 uint64
+	spsr    [KVM_NR_SPSR]uint64
+	fpRegs  userFpsimdState
+}
+
+// runData is the run structure. This may be mapped for synchronous register
+// access (although that doesn't appear to be supported by my kernel at least).
+//
+// This mirrors kvm_run.
+type runData struct {
+	requestInterruptWindow uint8
+	_                      [7]uint8
+
+	exitReason                 uint32
+	readyForInterruptInjection uint8
+	ifFlag                     uint8
+	_                          [2]uint8
+
+	cr8      uint64
+	apicBase uint64
+
+	// This is the union data for exits. Interpretation depends entirely on
+	// the exitReason above (see vCPU code for more information).
+	data [32]uint64
+}
+
+// updateGlobalOnce does global initialization. It has to be called only once.
+func updateGlobalOnce(fd int) error {
+	physicalInit()
+	err := updateSystemValues(int(fd))
+	updateVectorTable()
+	return err
+}
diff --git a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
new file mode 100644
index 000000000..6531bae1d
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
@@ -0,0 +1,39 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+	"fmt"
+	"syscall"
+)
+
+var (
+	runDataSize int
+)
+
+func updateSystemValues(fd int) error {
+	// Extract the mmap size.
+	sz, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_GET_VCPU_MMAP_SIZE, 0)
+	if errno != 0 {
+		return fmt.Errorf("getting VCPU mmap size: %v", errno)
+	}
+	// Save the data.
+	runDataSize = int(sz)
+
+	// Success.
+	return nil
+}
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 766131d60..1d5c77ff4 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -49,11 +49,13 @@ const (
 	_KVM_EXIT_SHUTDOWN        = 0x8
 	_KVM_EXIT_FAIL_ENTRY      = 0x9
 	_KVM_EXIT_INTERNAL_ERROR  = 0x11
+	_KVM_EXIT_SYSTEM_EVENT    = 0x18
 )
 
 // KVM capability options.
 const (
-	_KVM_CAP_MAX_VCPUS = 0x42
+	_KVM_CAP_MAX_VCPUS       = 0x42
+	_KVM_CAP_ARM_VM_IPA_SIZE = 0xa5
 )
 
 // KVM limits.
diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go
new file mode 100644
index 000000000..5a74c6e36
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go
@@ -0,0 +1,132 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+// KVM ioctls for Arm64.
+const (
+	_KVM_GET_ONE_REG = 0x4010aeab
+	_KVM_SET_ONE_REG = 0x4010aeac
+
+	_KVM_ARM_PREFERRED_TARGET = 0x8020aeaf
+	_KVM_ARM_VCPU_INIT        = 0x4020aeae
+	_KVM_ARM64_REGS_PSTATE    = 0x6030000000100042
+	_KVM_ARM64_REGS_SP_EL1    = 0x6030000000100044
+	_KVM_ARM64_REGS_R0        = 0x6030000000100000
+	_KVM_ARM64_REGS_R1        = 0x6030000000100002
+	_KVM_ARM64_REGS_R2        = 0x6030000000100004
+	_KVM_ARM64_REGS_R3        = 0x6030000000100006
+	_KVM_ARM64_REGS_R8        = 0x6030000000100010
+	_KVM_ARM64_REGS_R18       = 0x6030000000100024
+	_KVM_ARM64_REGS_PC        = 0x6030000000100040
+	_KVM_ARM64_REGS_MAIR_EL1  = 0x603000000013c510
+	_KVM_ARM64_REGS_TCR_EL1   = 0x603000000013c102
+	_KVM_ARM64_REGS_TTBR0_EL1 = 0x603000000013c100
+	_KVM_ARM64_REGS_TTBR1_EL1 = 0x603000000013c101
+	_KVM_ARM64_REGS_SCTLR_EL1 = 0x603000000013c080
+	_KVM_ARM64_REGS_CPACR_EL1 = 0x603000000013c082
+	_KVM_ARM64_REGS_VBAR_EL1  = 0x603000000013c600
+)
+
+// Arm64: Architectural Feature Access Control Register EL1.
+const (
+	_FPEN_NOTRAP = 0x3
+	_FPEN_SHIFT  = 0x20
+)
+
+// Arm64: System Control Register EL1.
+const (
+	_SCTLR_M = 1 << 0
+	_SCTLR_C = 1 << 2
+	_SCTLR_I = 1 << 12
+)
+
+// Arm64: Translation Control Register EL1.
+const (
+	_TCR_IPS_40BITS = 2 << 32 // PA=40
+	_TCR_IPS_48BITS = 5 << 32 // PA=48
+
+	_TCR_T0SZ_OFFSET = 0
+	_TCR_T1SZ_OFFSET = 16
+	_TCR_IRGN0_SHIFT = 8
+	_TCR_IRGN1_SHIFT = 24
+	_TCR_ORGN0_SHIFT = 10
+	_TCR_ORGN1_SHIFT = 26
+	_TCR_SH0_SHIFT   = 12
+	_TCR_SH1_SHIFT   = 28
+	_TCR_TG0_SHIFT   = 14
+	_TCR_TG1_SHIFT   = 30
+
+	_TCR_T0SZ_VA48 = 64 - 48 // VA=48
+	_TCR_T1SZ_VA48 = 64 - 48 // VA=48
+
+	_TCR_ASID16 = 1 << 36
+	_TCR_TBI0   = 1 << 37
+
+	_TCR_TXSZ_VA48 = (_TCR_T0SZ_VA48 << _TCR_T0SZ_OFFSET) | (_TCR_T1SZ_VA48 << _TCR_T1SZ_OFFSET)
+
+	_TCR_TG0_4K  = 0 << _TCR_TG0_SHIFT // 4K
+	_TCR_TG0_64K = 1 << _TCR_TG0_SHIFT // 64K
+
+	_TCR_TG1_4K = 2 << _TCR_TG1_SHIFT
+
+	_TCR_TG_FLAGS = _TCR_TG0_4K | _TCR_TG1_4K
+
+	_TCR_IRGN0_WBWA = 1 << _TCR_IRGN0_SHIFT
+	_TCR_IRGN1_WBWA = 1 << _TCR_IRGN1_SHIFT
+	_TCR_IRGN_WBWA  = _TCR_IRGN0_WBWA | _TCR_IRGN1_WBWA
+
+	_TCR_ORGN0_WBWA = 1 << _TCR_ORGN0_SHIFT
+	_TCR_ORGN1_WBWA = 1 << _TCR_ORGN1_SHIFT
+
+	_TCR_ORGN_WBWA = _TCR_ORGN0_WBWA | _TCR_ORGN1_WBWA
+
+	_TCR_SHARED = (3 << _TCR_SH0_SHIFT) | (3 << _TCR_SH1_SHIFT)
+
+	_TCR_CACHE_FLAGS = _TCR_IRGN_WBWA | _TCR_ORGN_WBWA
+)
+
+// Arm64: Memory Attribute Indirection Register EL1.
+const (
+	_MT_DEVICE_nGnRnE = 0
+	_MT_DEVICE_nGnRE  = 1
+	_MT_DEVICE_GRE    = 2
+	_MT_NORMAL_NC     = 3
+	_MT_NORMAL        = 4
+	_MT_NORMAL_WT     = 5
+	_MT_EL1_INIT      = (0 << _MT_DEVICE_nGnRnE) | (0x4 << _MT_DEVICE_nGnRE * 8) | (0xc << _MT_DEVICE_GRE * 8) | (0x44 << _MT_NORMAL_NC * 8) | (0xff << _MT_NORMAL * 8) | (0xbb << _MT_NORMAL_WT * 8)
+)
+
+const (
+	_KVM_ARM_VCPU_POWER_OFF = 0 // CPU is started in OFF state
+	_KVM_ARM_VCPU_PSCI_0_2  = 2 // CPU uses PSCI v0.2
+)
+
+// Arm64: Exception Syndrome Register EL1.
+const (
+	_ESR_ELx_FSC = 0x3F
+
+	_ESR_SEGV_MAPERR_L0 = 0x4
+	_ESR_SEGV_MAPERR_L1 = 0x5
+	_ESR_SEGV_MAPERR_L2 = 0x6
+	_ESR_SEGV_MAPERR_L3 = 0x7
+
+	_ESR_SEGV_ACCERR_L1 = 0x9
+	_ESR_SEGV_ACCERR_L2 = 0xa
+	_ESR_SEGV_ACCERR_L3 = 0xb
+
+	_ESR_SEGV_PEMERR_L1 = 0xd
+	_ESR_SEGV_PEMERR_L2 = 0xe
+	_ESR_SEGV_PEMERR_L3 = 0xf
+)
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 61227cafb..7156c245f 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -135,3 +135,43 @@ func (c *vCPU) setSignalMask() error {
 	}
 	return nil
 }
+
+// setUserRegisters sets user registers in the vCPU.
+func (c *vCPU) setUserRegisters(uregs *userRegs) error {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_REGS,
+		uintptr(unsafe.Pointer(uregs))); errno != 0 {
+		return fmt.Errorf("error setting user registers: %v", errno)
+	}
+	return nil
+}
+
+// getUserRegisters reloads user registers in the vCPU.
+//
+// This is safe to call from a nosplit context.
+//
+//go:nosplit
+func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_GET_REGS,
+		uintptr(unsafe.Pointer(uregs))); errno != 0 {
+		return errno
+	}
+	return 0
+}
+
+// setSystemRegisters sets system registers.
+func (c *vCPU) setSystemRegisters(sregs *systemRegs) error {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_SREGS,
+		uintptr(unsafe.Pointer(sregs))); errno != 0 {
+		return fmt.Errorf("error setting system registers: %v", errno)
+	}
+	return nil
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index b7e2cfb9d..7ae47f291 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -12,8 +12,38 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build arm64
+
 package kvm
 
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+type vCPUArchState struct {
+	// PCIDs is the set of PCIDs for this vCPU.
+	//
+	// This starts above fixedKernelPCID.
+	PCIDs *pagetables.PCIDs
+}
+
+const (
+	// fixedKernelPCID is a fixed kernel PCID used for the kernel page
+	// tables. We must start allocating user PCIDs above this in order to
+	// avoid any conflict (see below).
+	fixedKernelPCID = 1
+
+	// poolPCIDs is the number of PCIDs to record in the database. As this
+	// grows, assignment can take longer, since it is a simple linear scan.
+	// Beyond a relatively small number, there are likely few perform
+	// benefits, since the TLB has likely long since lost any translations
+	// from more than a few PCIDs past.
+	poolPCIDs = 8
+)
+
 // Get all read-only physicalRegions.
 func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
 	var rdonlyRegions []region
@@ -59,3 +89,95 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) {
 
 	return phyRegions
 }
+
+// dropPageTables drops cached page table entries.
+func (m *machine) dropPageTables(pt *pagetables.PageTables) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	// Clear from all PCIDs.
+	for _, c := range m.vCPUs {
+		c.PCIDs.Drop(pt)
+	}
+}
+
+// nonCanonical generates a canonical address return.
+//
+//go:nosplit
+func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.AccessType, error) {
+	*info = arch.SignalInfo{
+		Signo: signal,
+		Code:  arch.SignalInfoKernel,
+	}
+	info.SetAddr(addr) // Include address.
+	return usermem.NoAccess, platform.ErrContextSignal
+}
+
+// fault generates an appropriate fault return.
+//
+//go:nosplit
+func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) {
+	faultAddr := c.GetFaultAddr()
+	code, user := c.ErrorCode()
+
+	// Reset the pointed SignalInfo.
+	*info = arch.SignalInfo{Signo: signal}
+	info.SetAddr(uint64(faultAddr))
+
+	read := true
+	write := false
+	execute := true
+
+	ret := code & _ESR_ELx_FSC
+	switch ret {
+	case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3:
+		info.Code = 1 //SEGV_MAPERR
+		read = false
+		write = true
+		execute = false
+	case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3:
+		info.Code = 2 // SEGV_ACCERR.
+		read = true
+		write = false
+		execute = false
+	default:
+		info.Code = 2
+	}
+
+	if !user {
+		read = true
+		write = false
+		execute = true
+
+	}
+	accessType := usermem.AccessType{
+		Read:    read,
+		Write:   write,
+		Execute: execute,
+	}
+
+	return accessType, platform.ErrContextSignal
+}
+
+// retryInGuest runs the given function in guest mode.
+//
+// If the function does not complete in guest mode (due to execution of a
+// system call due to a GC stall, for example), then it will be retried. The
+// given function must be idempotent as a result of the retry mechanism.
+func (m *machine) retryInGuest(fn func()) {
+	c := m.Get()
+	defer m.Put(c)
+	for {
+		c.ClearErrorCode() // See below.
+		bluepill(c)        // Force guest mode.
+		fn()               // Execute the given function.
+		_, user := c.ErrorCode()
+		if user {
+			// If user is set, then we haven't bailed back to host
+			// mode via a kernel exception or system call. We
+			// consider the full function to have executed in guest
+			// mode and we can return.
+			break
+		}
+	}
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
new file mode 100644
index 000000000..3f2f97a6b
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -0,0 +1,362 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+	"fmt"
+	"reflect"
+	"sync/atomic"
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// setMemoryRegion initializes a region.
+//
+// This may be called from bluepillHandler, and therefore returns an errno
+// directly (instead of wrapping in an error) to avoid allocations.
+//
+//go:nosplit
+func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno {
+	userRegion := userMemoryRegion{
+		slot:          uint32(slot),
+		flags:         0,
+		guestPhysAddr: uint64(physical),
+		memorySize:    uint64(length),
+		userspaceAddr: uint64(virtual),
+	}
+
+	// Set the region.
+	_, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(m.fd),
+		_KVM_SET_USER_MEMORY_REGION,
+		uintptr(unsafe.Pointer(&userRegion)))
+	return errno
+}
+
+type kvmVcpuInit struct {
+	target   uint32
+	features [7]uint32
+}
+
+var vcpuInit kvmVcpuInit
+
+// initArchState initializes architecture-specific state.
+func (m *machine) initArchState() error {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(m.fd),
+		_KVM_ARM_PREFERRED_TARGET,
+		uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
+		panic(fmt.Sprintf("error setting KVM_ARM_PREFERRED_TARGET failed: %v", errno))
+	}
+	return nil
+}
+
+func getPageWithReflect(p uintptr) []byte {
+	return (*(*[0xFFFFFF]byte)(unsafe.Pointer(p & ^uintptr(syscall.Getpagesize()-1))))[:syscall.Getpagesize()]
+}
+
+// Work around: move ring0.Vectors() into a specific address with 11-bits alignment.
+//
+// According to the design documentation of Arm64,
+// the start address of exception vector table should be 11-bits aligned.
+// Please see the code in linux kernel as reference: arch/arm64/kernel/entry.S
+// But, we can't align a function's start address to a specific address by using golang.
+// We have raised this question in golang community:
+// https://groups.google.com/forum/m/#!topic/golang-dev/RPj90l5x86I
+// This function will be removed when golang supports this feature.
+//
+// There are 2 jobs were implemented in this function:
+// 1, move the start address of exception vector table into the specific address.
+// 2, modify the offset of each instruction.
+func updateVectorTable() {
+	fromLocation := reflect.ValueOf(ring0.Vectors).Pointer()
+	offset := fromLocation & (1<<11 - 1)
+	if offset != 0 {
+		offset = 1<<11 - offset
+	}
+
+	toLocation := fromLocation + offset
+	page := getPageWithReflect(toLocation)
+	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil {
+		panic(err)
+	}
+
+	page = getPageWithReflect(toLocation + 4096)
+	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil {
+		panic(err)
+	}
+
+	// Move exception-vector-table into the specific address.
+	var entry *uint32
+	var entryFrom *uint32
+	for i := 1; i <= 0x800; i++ {
+		entry = (*uint32)(unsafe.Pointer(toLocation + 0x800 - uintptr(i)))
+		entryFrom = (*uint32)(unsafe.Pointer(fromLocation + 0x800 - uintptr(i)))
+		*entry = *entryFrom
+	}
+
+	// The offset from the address of each unconditionally branch is changed.
+	// We should modify the offset of each instruction.
+	nums := []uint32{0x0, 0x80, 0x100, 0x180, 0x200, 0x280, 0x300, 0x380, 0x400, 0x480, 0x500, 0x580, 0x600, 0x680, 0x700, 0x780}
+	for _, num := range nums {
+		entry = (*uint32)(unsafe.Pointer(toLocation + uintptr(num)))
+		*entry = *entry - (uint32)(offset/4)
+	}
+
+	page = getPageWithReflect(toLocation)
+	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil {
+		panic(err)
+	}
+
+	page = getPageWithReflect(toLocation + 4096)
+	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil {
+		panic(err)
+	}
+}
+
+// initArchState initializes architecture-specific state.
+func (c *vCPU) initArchState() error {
+	var (
+		reg     kvmOneReg
+		data    uint64
+		regGet  kvmOneReg
+		dataGet uint64
+	)
+
+	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
+	regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer())
+
+	vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2)
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_ARM_VCPU_INIT,
+		uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
+		panic(fmt.Sprintf("error setting KVM_ARM_VCPU_INIT failed: %v", errno))
+	}
+
+	// cpacr_el1
+	reg.id = _KVM_ARM64_REGS_CPACR_EL1
+	data = (_FPEN_NOTRAP << _FPEN_SHIFT)
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// sctlr_el1
+	regGet.id = _KVM_ARM64_REGS_SCTLR_EL1
+	if err := c.getOneRegister(&regGet); err != nil {
+		return err
+	}
+
+	dataGet |= (_SCTLR_M | _SCTLR_C | _SCTLR_I)
+	data = dataGet
+	reg.id = _KVM_ARM64_REGS_SCTLR_EL1
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// tcr_el1
+	data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS
+	reg.id = _KVM_ARM64_REGS_TCR_EL1
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// mair_el1
+	data = _MT_EL1_INIT
+	reg.id = _KVM_ARM64_REGS_MAIR_EL1
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// ttbr0_el1
+	data = c.machine.kernel.PageTables.TTBR0_EL1(false, 0)
+
+	reg.id = _KVM_ARM64_REGS_TTBR0_EL1
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	c.SetTtbr0Kvm(uintptr(data))
+
+	// ttbr1_el1
+	data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0)
+
+	reg.id = _KVM_ARM64_REGS_TTBR1_EL1
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// sp_el1
+	data = c.CPU.StackTop()
+	reg.id = _KVM_ARM64_REGS_SP_EL1
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// pc
+	reg.id = _KVM_ARM64_REGS_PC
+	data = uint64(reflect.ValueOf(ring0.Start).Pointer())
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// r8
+	reg.id = _KVM_ARM64_REGS_R8
+	data = uint64(reflect.ValueOf(&c.CPU).Pointer())
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// vbar_el1
+	reg.id = _KVM_ARM64_REGS_VBAR_EL1
+
+	fromLocation := reflect.ValueOf(ring0.Vectors).Pointer()
+	offset := fromLocation & (1<<11 - 1)
+	if offset != 0 {
+		offset = 1<<11 - offset
+	}
+
+	toLocation := fromLocation + offset
+	data = uint64(ring0.KernelStartAddress | toLocation)
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	data = ring0.PsrDefaultSet | ring0.KernelFlagsSet
+	reg.id = _KVM_ARM64_REGS_PSTATE
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+//go:nosplit
+func (c *vCPU) loadSegments(tid uint64) {
+	// TODO(gvisor.dev/issue/1238):  TLS is not supported.
+	// Get TLS from tpidr_el0.
+	atomic.StoreUint64(&c.tid, tid)
+}
+
+func (c *vCPU) setOneRegister(reg *kvmOneReg) error {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_ONE_REG,
+		uintptr(unsafe.Pointer(reg))); errno != 0 {
+		return fmt.Errorf("error setting one register: %v", errno)
+	}
+	return nil
+}
+
+func (c *vCPU) getOneRegister(reg *kvmOneReg) error {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_GET_ONE_REG,
+		uintptr(unsafe.Pointer(reg))); errno != 0 {
+		return fmt.Errorf("error setting one register: %v", errno)
+	}
+	return nil
+}
+
+// setCPUID sets the CPUID to be used by the guest.
+func (c *vCPU) setCPUID() error {
+	return nil
+}
+
+// setSystemTime sets the TSC for the vCPU.
+func (c *vCPU) setSystemTime() error {
+	return nil
+}
+
+// setSignalMask sets the vCPU signal mask.
+//
+// This must be called prior to running the vCPU.
+func (c *vCPU) setSignalMask() error {
+	// The layout of this structure implies that it will not necessarily be
+	// the same layout chosen by the Go compiler. It gets fudged here.
+	var data struct {
+		length uint32
+		mask1  uint32
+		mask2  uint32
+		_      uint32
+	}
+	data.length = 8 // Fixed sigset size.
+	data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
+	data.mask2 = ^uint32(bounceSignalMask >> 32)
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_SIGNAL_MASK,
+		uintptr(unsafe.Pointer(&data))); errno != 0 {
+		return fmt.Errorf("error setting signal mask: %v", errno)
+	}
+
+	return nil
+}
+
+// SwitchToUser unpacks architectural-details.
+func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) {
+	// Check for canonical addresses.
+	if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Pc) {
+		return nonCanonical(regs.Pc, int32(syscall.SIGSEGV), info)
+	} else if !ring0.IsCanonical(regs.Sp) {
+		return nonCanonical(regs.Sp, int32(syscall.SIGBUS), info)
+	}
+
+	var vector ring0.Vector
+	ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0)
+	c.SetTtbr0App(uintptr(ttbr0App))
+
+	// TODO(gvisor.dev/issue/1238): full context-switch supporting for Arm64.
+	// The Arm64 user-mode execution state consists of:
+	// x0-x30
+	// PC, SP, PSTATE
+	// V0-V31: 32 128-bit registers for floating point, and simd
+	// FPSR
+	// TPIDR_EL0, used for TLS
+	appRegs := switchOpts.Registers
+	c.SetAppAddr(ring0.KernelStartAddress | uintptr(unsafe.Pointer(appRegs)))
+
+	entersyscall()
+	bluepill(c)
+	vector = c.CPU.SwitchToUser(switchOpts)
+	exitsyscall()
+
+	switch vector {
+	case ring0.Syscall:
+		// Fast path: system call executed.
+		return usermem.NoAccess, nil
+
+	case ring0.PageFault:
+		return c.fault(int32(syscall.SIGSEGV), info)
+	case 0xaa:
+		return usermem.NoAccess, nil
+	default:
+		return usermem.NoAccess, platform.ErrContextSignal
+	}
+
+}
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index ed9433311..f04be2ab5 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -87,46 +87,6 @@ func unmapRunData(r *runData) error {
 	return nil
 }
 
-// setUserRegisters sets user registers in the vCPU.
-func (c *vCPU) setUserRegisters(uregs *userRegs) error {
-	if _, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(c.fd),
-		_KVM_SET_REGS,
-		uintptr(unsafe.Pointer(uregs))); errno != 0 {
-		return fmt.Errorf("error setting user registers: %v", errno)
-	}
-	return nil
-}
-
-// getUserRegisters reloads user registers in the vCPU.
-//
-// This is safe to call from a nosplit context.
-//
-//go:nosplit
-func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno {
-	if _, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(c.fd),
-		_KVM_GET_REGS,
-		uintptr(unsafe.Pointer(uregs))); errno != 0 {
-		return errno
-	}
-	return 0
-}
-
-// setSystemRegisters sets system registers.
-func (c *vCPU) setSystemRegisters(sregs *systemRegs) error {
-	if _, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(c.fd),
-		_KVM_SET_SREGS,
-		uintptr(unsafe.Pointer(sregs))); errno != 0 {
-		return fmt.Errorf("error setting system registers: %v", errno)
-	}
-	return nil
-}
-
 // atomicAddressSpace is an atomic address space pointer.
 type atomicAddressSpace struct {
 	pointer unsafe.Pointer
-- 
cgit v1.2.3


From 8782f0e287df2a2fd9f9dfb3f0e1589cc15a4f91 Mon Sep 17 00:00:00 2001
From: Aleksandr Razumov <ar@gortc.io>
Date: Sun, 15 Dec 2019 20:57:23 +0300
Subject: Set CPU number to CPU quota

When application is not cgroups-aware, it can spawn excessive threads
which often defaults to CPU number.
Introduce a opt-in flag that will set CPU number accordingly to CPU
quota (if available).

Fixes #1391
---
 runsc/boot/config.go     |  9 +++++++++
 runsc/cgroup/cgroup.go   | 24 ++++++++++++++++++++++++
 runsc/main.go            |  2 ++
 runsc/sandbox/sandbox.go | 10 ++++++++++
 4 files changed, 45 insertions(+)

diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 72a33534f..7841d1a7a 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -250,6 +250,12 @@ type Config struct {
 	// multiple tests are run in parallel, since there is no way to pass
 	// parameters to the runtime from docker.
 	TestOnlyTestNameEnv string
+
+	// CPUNumFromQuota sets CPU number count to available CPU quota, using
+	// least integer value greater than or equal to quota.
+	//
+	// E.g. 0.2 CPU quota would result in 1, and 1.9 in 2.
+	CPUNumFromQuota bool
 }
 
 // ToFlags returns a slice of flags that correspond to the given Config.
@@ -282,6 +288,9 @@ func (c *Config) ToFlags() []string {
 		"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
 		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
 	}
+	if c.CPUNumFromQuota {
+		f = append(f, "--cpu-num-from-quota")
+	}
 	// Only include these if set since it is never to be used by users.
 	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
 		f = append(f, "--TESTONLY-unsafe-nonroot=true")
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index ab3a25b9b..653ca5f52 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -101,6 +101,14 @@ func getValue(path, name string) (string, error) {
 	return string(out), nil
 }
 
+func getInt(path, name string) (int, error) {
+	s, err := getValue(path, name)
+	if err != nil {
+		return 0, err
+	}
+	return strconv.Atoi(strings.TrimSpace(s))
+}
+
 // fillFromAncestor sets the value of a cgroup file from the first ancestor
 // that has content. It does nothing if the file in 'path' has already been set.
 func fillFromAncestor(path string) (string, error) {
@@ -323,6 +331,22 @@ func (c *Cgroup) Join() (func(), error) {
 	return undo, nil
 }
 
+func (c *Cgroup) CPUQuota() (float64, error) {
+	path := c.makePath("cpu")
+	quota, err := getInt(path, "cpu.cfs_quota_us")
+	if err != nil {
+		return -1, err
+	}
+	period, err := getInt(path, "cpu.cfs_period_us")
+	if err != nil {
+		return -1, err
+	}
+	if quota <= 0 || period <= 0 {
+		return -1, err
+	}
+	return float64(quota) / float64(period), nil
+}
+
 // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'.
 func (c *Cgroup) NumCPU() (int, error) {
 	path := c.makePath("cpuset")
diff --git a/runsc/main.go b/runsc/main.go
index 4682b308c..febd59aed 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -82,6 +82,7 @@ var (
 	numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
 	rootless           = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
 	referenceLeakMode  = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
+	cpuNumFromQuota    = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater than quota value)")
 
 	// Test flags, not to be used outside tests, ever.
 	testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
@@ -225,6 +226,7 @@ func main() {
 		AlsoLogToStderr:    *alsoLogToStderr,
 		ReferenceLeakMode:  refsLeakMode,
 		OverlayfsStaleRead: *overlayfsStaleRead,
+		CPUNumFromQuota:    *cpuNumFromQuota,
 
 		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
 		TestOnlyTestNameEnv:                        *testOnlyTestNameEnv,
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 805233184..cbfb873d1 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -18,6 +18,7 @@ package sandbox
 import (
 	"context"
 	"fmt"
+	"math"
 	"os"
 	"os/exec"
 	"strconv"
@@ -631,6 +632,15 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		if err != nil {
 			return fmt.Errorf("getting cpu count from cgroups: %v", err)
 		}
+		if conf.CPUNumFromQuota {
+			quota, err := s.Cgroup.CPUQuota()
+			if err != nil {
+				return fmt.Errorf("getting cpu qouta from cgroups: %v", err)
+			}
+			if quota > 0 {
+				cpuNum = int(math.Ceil(quota))
+			}
+		}
 		cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))
 
 		mem, err := s.Cgroup.MemoryLimit()
-- 
cgit v1.2.3


From 8a46e83111c70c7ee81368dd4e9df53559213fa0 Mon Sep 17 00:00:00 2001
From: Yong He <chenglang.hy@antfin.com>
Date: Sun, 15 Dec 2019 11:20:12 +0800
Subject: Fix UDS bind cause fd leak in gofer

After the finalizer optimize in 76039f895995c3fe0deef5958f843868685ecc38
commit, clientFile needs to closed before finalizer release it.
The clientFile is not closed if it is created via
gofer.(*inodeOperations).Bind, this will cause fd leak which is hold
by gofer process.

Fixes #1396

Signed-off-by: Yong He <chenglang.hy@antfin.com>
Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
---
 pkg/sentry/fs/gofer/path.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go
index 8c17603f8..c09f3b71c 100644
--- a/pkg/sentry/fs/gofer/path.go
+++ b/pkg/sentry/fs/gofer/path.go
@@ -234,6 +234,8 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
 	if err != nil {
 		return nil, err
 	}
+	// We're not going to use newFile after return.
+	defer newFile.close(ctx)
 
 	// Stabilize the endpoint map while creation is in progress.
 	unlock := i.session().endpoints.lock()
@@ -254,7 +256,6 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
 	// Get the attributes of the file to create inode key.
 	qid, mask, attr, err := getattr(ctx, newFile)
 	if err != nil {
-		newFile.close(ctx)
 		return nil, err
 	}
 
@@ -270,7 +271,6 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
 	// cloned and re-opened multiple times after creation.
 	_, unopened, err := i.fileState.file.walk(ctx, []string{name})
 	if err != nil {
-		newFile.close(ctx)
 		return nil, err
 	}
 
-- 
cgit v1.2.3


From bd5c7bf58dd656dc16b920b48a346ef08f1efba8 Mon Sep 17 00:00:00 2001
From: Yong He <chenglang.hy@antfin.com>
Date: Mon, 16 Dec 2019 14:04:03 +0800
Subject: Fix deadlock in overlay bind

Copy up parent when binding UDS on overlayfs is supported in commit
02ab1f187cd24c67b754b004229421d189cee264.
But the using of copyUp in overlayBind will cause sentry stuck, reason
is dead lock in renameMu.

1 [Process A] Invoke a Unix socket bind operation
  renameMu is hold in fs.(*Dirent).genericCreate by process A
2 [Process B] Invoke a read syscall on /proc/task/mounts
  waitng on Lock of renameMu in fs.(*MountNamespace).FindMount
3 [Process A] Continue Unix socket bind operation
  wating on RLock of renameMu in fs.copyUp

Root cause is recursive reading lock of reanmeMu in bind call trace,
if there are writing lock between the two reading lock, then deadlock
occured.

Fixes #1397
---
 pkg/sentry/fs/inode_overlay.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index a09147080..8d4303cc4 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -436,7 +436,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena
 }
 
 func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) {
-	if err := copyUp(ctx, parent); err != nil {
+	if err := copyUpLockedForRename(ctx, parent); err != nil {
 		return nil, err
 	}
 
-- 
cgit v1.2.3


From e6f4124afd951c3b089f9c75c499c14f4d90a590 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 16 Dec 2019 13:18:36 -0800
Subject: Implement checks for get/setxattr at the syscall layer.

Add checks for input arguments, file type, permissions, etc. that match
the Linux implementation. A call to get/setxattr that passes all the
checks will still currently return EOPNOTSUPP. Actual support will be
added in following commits.

Only allow user.* extended attributes for the time being.

PiperOrigin-RevId: 285835159
---
 pkg/abi/linux/BUILD                        |   1 +
 pkg/abi/linux/xattr.go                     |  27 +++++
 pkg/sentry/fs/inode.go                     |   8 ++
 pkg/sentry/fs/inode_overlay.go             |   5 +
 pkg/sentry/syscalls/linux/BUILD            |   1 +
 pkg/sentry/syscalls/linux/linux64_amd64.go |   4 +-
 pkg/sentry/syscalls/linux/linux64_arm64.go |   4 +-
 pkg/sentry/syscalls/linux/sys_xattr.go     | 169 +++++++++++++++++++++++++++++
 test/syscalls/linux/xattr.cc               |  79 +++++++-------
 9 files changed, 253 insertions(+), 45 deletions(-)
 create mode 100644 pkg/abi/linux/xattr.go
 create mode 100644 pkg/sentry/syscalls/linux/sys_xattr.go

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 51774c6b6..9553f164d 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -57,6 +57,7 @@ go_library(
         "uio.go",
         "utsname.go",
         "wait.go",
+        "xattr.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/abi/linux",
     visibility = ["//visibility:public"],
diff --git a/pkg/abi/linux/xattr.go b/pkg/abi/linux/xattr.go
new file mode 100644
index 000000000..a3b6406fa
--- /dev/null
+++ b/pkg/abi/linux/xattr.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// Constants for extended attributes.
+const (
+	XATTR_NAME_MAX = 255
+	XATTR_SIZE_MAX = 65536
+
+	XATTR_CREATE  = 1
+	XATTR_REPLACE = 2
+
+	XATTR_USER_PREFIX     = "user."
+	XATTR_USER_PREFIX_LEN = len(XATTR_USER_PREFIX)
+)
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index 2d43dff1d..91e2fde2f 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -270,6 +270,14 @@ func (i *Inode) Getxattr(name string) (string, error) {
 	return i.InodeOperations.Getxattr(i, name)
 }
 
+// Setxattr calls i.InodeOperations.Setxattr with i as the Inode.
+func (i *Inode) Setxattr(name, value string) error {
+	if i.overlay != nil {
+		return overlaySetxattr(i.overlay, name, value)
+	}
+	return i.InodeOperations.Setxattr(i, name, value)
+}
+
 // Listxattr calls i.InodeOperations.Listxattr with i as the Inode.
 func (i *Inode) Listxattr() (map[string]struct{}, error) {
 	if i.overlay != nil {
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index a09147080..63a991beb 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -552,6 +552,11 @@ func overlayGetxattr(o *overlayEntry, name string) (string, error) {
 	return s, err
 }
 
+// TODO(b/146028302): Support setxattr for overlayfs.
+func overlaySetxattr(o *overlayEntry, name, value string) error {
+	return syserror.EOPNOTSUPP
+}
+
 func overlayListxattr(o *overlayEntry) (map[string]struct{}, error) {
 	o.copyMu.RLock()
 	defer o.copyMu.RUnlock()
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 4c0bf96e4..6766ba587 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -49,6 +49,7 @@ go_library(
         "sys_tls.go",
         "sys_utsname.go",
         "sys_write.go",
+        "sys_xattr.go",
         "timespec.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls/linux",
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 797542d28..272ae9991 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -228,10 +228,10 @@ var AMD64 = &kernel.SyscallTable{
 		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
-		188: syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		188: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil),
 		189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		191: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil),
 		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 2bc7faff5..3b584eed9 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -41,10 +41,10 @@ var ARM64 = &kernel.SyscallTable{
 		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		5:   syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		5:   syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil),
 		6:   syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		7:   syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		8:   syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		8:   syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil),
 		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
new file mode 100644
index 000000000..97d9a65ea
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -0,0 +1,169 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Getxattr implements linux syscall getxattr(2).
+func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	valueLen := 0
+	err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		value, err := getxattr(t, d, dirPath, nameAddr)
+		if err != nil {
+			return err
+		}
+
+		valueLen = len(value)
+		if size == 0 {
+			return nil
+		}
+		if size > linux.XATTR_SIZE_MAX {
+			size = linux.XATTR_SIZE_MAX
+		}
+		if valueLen > int(size) {
+			return syserror.ERANGE
+		}
+
+		_, err = t.CopyOutBytes(valueAddr, []byte(value))
+		return err
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(valueLen), nil, nil
+}
+
+// getxattr implements getxattr from the given *fs.Dirent.
+func getxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr) (string, error) {
+	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+		return "", syserror.ENOTDIR
+	}
+
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
+		return "", err
+	}
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return "", err
+	}
+
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return "", syserror.EOPNOTSUPP
+	}
+
+	return d.Inode.Getxattr(name)
+}
+
+// Setxattr implements linux syscall setxattr(2).
+func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	flags := args[4].Uint()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		return setxattr(t, d, dirPath, nameAddr, valueAddr, size, flags)
+	})
+}
+
+// setxattr implements setxattr from the given *fs.Dirent.
+func setxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint, flags uint32) error {
+	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+		return syserror.ENOTDIR
+	}
+
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+		return err
+	}
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+
+	if size > linux.XATTR_SIZE_MAX {
+		return syserror.E2BIG
+	}
+	buf := make([]byte, size)
+	if _, err = t.CopyInBytes(valueAddr, buf); err != nil {
+		return err
+	}
+	value := string(buf)
+
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+
+	return d.Inode.Setxattr(name, value)
+}
+
+func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
+	name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1)
+	if err != nil {
+		if err == syserror.ENAMETOOLONG {
+			return "", syserror.ERANGE
+		}
+		return "", err
+	}
+	if len(name) == 0 {
+		return "", syserror.ERANGE
+	}
+	return name, nil
+}
+
+func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error {
+	// Restrict xattrs to regular files and directories.
+	//
+	// In Linux, this restriction technically only applies to xattrs in the
+	// "user.*" namespace, but we don't allow any other xattr prefixes anyway.
+	if !fs.IsRegular(i.StableAttr) && !fs.IsDir(i.StableAttr) {
+		if perms.Write {
+			return syserror.EPERM
+		}
+		return syserror.ENODATA
+	}
+
+	return i.CheckPermission(t, perms)
+}
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index 3e07b634b..75740238c 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -28,6 +28,7 @@
 #include "test/util/capability_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
+#include "test/util/test_util.h"
 
 namespace gvisor {
 namespace testing {
@@ -37,9 +38,6 @@ namespace {
 class XattrTest : public FileTest {};
 
 TEST_F(XattrTest, XattrNullName) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
@@ -49,9 +47,6 @@ TEST_F(XattrTest, XattrNullName) {
 }
 
 TEST_F(XattrTest, XattrEmptyName) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
@@ -60,16 +55,17 @@ TEST_F(XattrTest, XattrEmptyName) {
 }
 
 TEST_F(XattrTest, XattrLargeName) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   std::string name = "user.";
   name += std::string(XATTR_NAME_MAX - name.length(), 'a');
-  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
-              SyscallSucceeds());
-  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
-              SyscallSucceedsWithValue(0));
+
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+                SyscallSucceeds());
+    EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+                SyscallSucceedsWithValue(0));
+  }
 
   name += "a";
   EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
@@ -79,9 +75,6 @@ TEST_F(XattrTest, XattrLargeName) {
 }
 
 TEST_F(XattrTest, XattrInvalidPrefix) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   std::string name(XATTR_NAME_MAX, 'a');
   EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
@@ -91,9 +84,6 @@ TEST_F(XattrTest, XattrInvalidPrefix) {
 }
 
 TEST_F(XattrTest, XattrReadOnly) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -102,22 +92,28 @@ TEST_F(XattrTest, XattrReadOnly) {
   char name[] = "user.abc";
   char val = 'a';
   size_t size = sizeof(val);
-  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+                SyscallSucceeds());
+  }
 
   ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IRUSR));
 
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
               SyscallFailsWithErrno(EACCES));
 
-  char buf = '-';
-  EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
-  EXPECT_EQ(buf, val);
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    char buf = '-';
+    EXPECT_THAT(getxattr(path, name, &buf, size),
+                SyscallSucceedsWithValue(size));
+    EXPECT_EQ(buf, val);
+  }
 }
 
 TEST_F(XattrTest, XattrWriteOnly) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -128,7 +124,12 @@ TEST_F(XattrTest, XattrWriteOnly) {
   char name[] = "user.abc";
   char val = 'a';
   size_t size = sizeof(val);
-  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+                SyscallSucceeds());
+  }
 
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
 }
@@ -172,9 +173,6 @@ TEST_F(XattrTest, XattrOnSymlink) {
 }
 
 TEST_F(XattrTest, XattrOnInvalidFileTypes) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   char name[] = "user.abc";
 
   char char_device[] = "/dev/zero";
@@ -226,9 +224,6 @@ TEST_F(XattrTest, SetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrSizeTooLarge) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   char name[] = "user.abc";
 
@@ -240,19 +235,24 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
   EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
               SyscallFailsWithErrno(E2BIG));
 
-  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(getxattr(path, name, nullptr, 0),
+                SyscallFailsWithErrno(ENODATA));
+  }
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   char name[] = "user.abc";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
               SyscallFailsWithErrno(EFAULT));
 
-  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(getxattr(path, name, nullptr, 0),
+                SyscallFailsWithErrno(ENODATA));
+  }
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
@@ -350,9 +350,6 @@ TEST_F(XattrTest, SetxattrReplaceFlag) {
 }
 
 TEST_F(XattrTest, SetxattrInvalidFlags) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   int invalid_flags = 0xff;
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
-- 
cgit v1.2.3


From 3193b2fff8149fe43a3a59c266359e7f443a1563 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 16 Dec 2019 14:40:01 -0800
Subject: Drop unnecessary cast.

Bitshift operators with signed int is supported in Go 1.13.

PiperOrigin-RevId: 285853622
---
 go.mod                                | 32 ++++++++++++++++----------------
 pkg/sentry/strace/select.go           |  4 ++--
 pkg/sentry/syscalls/linux/sys_poll.go |  4 ++--
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/go.mod b/go.mod
index 821273d22..304b8bf13 100644
--- a/go.mod
+++ b/go.mod
@@ -1,21 +1,21 @@
 module gvisor.dev/gvisor
 
-go 1.12
+go 1.13
 
 require (
-	github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
-	github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
-	github.com/golang/mock v1.3.1
-	github.com/golang/protobuf v1.3.1
-	github.com/google/btree v1.0.0
-	github.com/google/go-cmp v0.2.0
-	github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
-	github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
-	github.com/kr/pty v1.1.1
-	github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
-	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
-	github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
-	github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
-	golang.org/x/net v0.0.0-20190311183353-d8887717615a
-	golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
+  github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
+  github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
+  github.com/golang/mock v1.3.1
+  github.com/golang/protobuf v1.3.1
+  github.com/google/btree v1.0.0
+  github.com/google/go-cmp v0.2.0
+  github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
+  github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
+  github.com/kr/pty v1.1.1
+  github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
+  github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
+  github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
+  github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
+  golang.org/x/net v0.0.0-20190311183353-d8887717615a
+  golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
 )
diff --git a/pkg/sentry/strace/select.go b/pkg/sentry/strace/select.go
index 92c18083d..dea309fda 100644
--- a/pkg/sentry/strace/select.go
+++ b/pkg/sentry/strace/select.go
@@ -27,7 +27,7 @@ func fdsFromSet(t *kernel.Task, set []byte) []int {
 	// Append n if the n-th bit is 1.
 	for i, v := range set {
 		for j := 0; j < 8; j++ {
-			if (v>>uint(j))&1 == 1 {
+			if (v>>j)&1 == 1 {
 				fds = append(fds, i*8+j)
 			}
 		}
@@ -42,7 +42,7 @@ func fdSet(t *kernel.Task, nfds int, addr usermem.Addr) string {
 
 	// Calculate the size of the fd set (one bit per fd).
 	nBytes := (nfds + 7) / 8
-	nBitsInLastPartialByte := uint(nfds % 8)
+	nBitsInLastPartialByte := nfds % 8
 
 	set, err := linux.CopyInFDSet(t, addr, nBytes, nBitsInLastPartialByte)
 	if err != nil {
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index 631dffec6..2b2df989a 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -198,7 +198,7 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration)
 }
 
 // CopyInFDSet copies an fd set from select(2)/pselect(2).
-func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes int, nBitsInLastPartialByte uint) ([]byte, error) {
+func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) {
 	set := make([]byte, nBytes)
 
 	if addr != 0 {
@@ -222,7 +222,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
 
 	// Calculate the size of the fd sets (one bit per fd).
 	nBytes := (nfds + 7) / 8
-	nBitsInLastPartialByte := uint(nfds % 8)
+	nBitsInLastPartialByte := nfds % 8
 
 	// Capture all the provided input vectors.
 	r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
-- 
cgit v1.2.3


From 0881abdfdda6e1f7908af67e99a81261c37cc04f Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Mon, 16 Dec 2019 16:31:43 -0800
Subject: Remove useless comments from p9/handlers.go.

These comments provided nothing, and have been copy-pasted into all
implementations. The code is clear without them.

I considered also removing the "handle implements handler.handle" comments, but
will let those stay for now.

PiperOrigin-RevId: 285876428
---
 pkg/p9/handlers.go | 36 ------------------------------------
 1 file changed, 36 deletions(-)

diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index 51869c7d6..b9582c07f 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -257,7 +257,6 @@ func CanOpen(mode FileMode) bool {
 
 // handle implements handler.handle.
 func (t *Tlopen) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -294,7 +293,6 @@ func (t *Tlopen) handle(cs *connState) message {
 			return syscall.EINVAL
 		}
 
-		// Do the open.
 		osFile, qid, ioUnit, err = ref.file.Open(t.Flags)
 		return err
 	}); err != nil {
@@ -311,12 +309,10 @@ func (t *Tlopen) handle(cs *connState) message {
 }
 
 func (t *Tlcreate) do(cs *connState, uid UID) (*Rlcreate, error) {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return nil, err
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return nil, syscall.EBADF
@@ -390,12 +386,10 @@ func (t *Tsymlink) handle(cs *connState) message {
 }
 
 func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return nil, err
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return nil, syscall.EBADF
@@ -426,19 +420,16 @@ func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) {
 
 // handle implements handler.handle.
 func (t *Tlink) handle(cs *connState) message {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return newErr(err)
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return newErr(syscall.EBADF)
 	}
 	defer ref.DecRef()
 
-	// Lookup the other FID.
 	refTarget, ok := cs.LookupFID(t.Target)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -467,7 +458,6 @@ func (t *Tlink) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Trenameat) handle(cs *connState) message {
-	// Don't allow complex names.
 	if err := checkSafeName(t.OldName); err != nil {
 		return newErr(err)
 	}
@@ -475,14 +465,12 @@ func (t *Trenameat) handle(cs *connState) message {
 		return newErr(err)
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.OldDirectory)
 	if !ok {
 		return newErr(syscall.EBADF)
 	}
 	defer ref.DecRef()
 
-	// Lookup the other FID.
 	refTarget, ok := cs.LookupFID(t.NewDirectory)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -523,12 +511,10 @@ func (t *Trenameat) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tunlinkat) handle(cs *connState) message {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return newErr(err)
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -577,19 +563,16 @@ func (t *Tunlinkat) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Trename) handle(cs *connState) message {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return newErr(err)
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
 	}
 	defer ref.DecRef()
 
-	// Lookup the target.
 	refTarget, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -641,7 +624,6 @@ func (t *Trename) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Treadlink) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -669,7 +651,6 @@ func (t *Treadlink) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tread) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -708,7 +689,6 @@ func (t *Tread) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Twrite) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -747,12 +727,10 @@ func (t *Tmknod) handle(cs *connState) message {
 }
 
 func (t *Tmknod) do(cs *connState, uid UID) (*Rmknod, error) {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return nil, err
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return nil, syscall.EBADF
@@ -791,12 +769,10 @@ func (t *Tmkdir) handle(cs *connState) message {
 }
 
 func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return nil, err
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return nil, syscall.EBADF
@@ -827,7 +803,6 @@ func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) {
 
 // handle implements handler.handle.
 func (t *Tgetattr) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -856,7 +831,6 @@ func (t *Tgetattr) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tsetattr) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -883,7 +857,6 @@ func (t *Tsetattr) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tallocate) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -917,7 +890,6 @@ func (t *Tallocate) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Txattrwalk) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -930,7 +902,6 @@ func (t *Txattrwalk) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Txattrcreate) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -943,7 +914,6 @@ func (t *Txattrcreate) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Treaddir) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -977,7 +947,6 @@ func (t *Treaddir) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tfsync) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -1001,7 +970,6 @@ func (t *Tfsync) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tstatfs) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -1192,7 +1160,6 @@ func doWalk(cs *connState, ref *fidRef, names []string, getattr bool) (qids []QI
 
 // handle implements handler.handle.
 func (t *Twalk) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -1213,7 +1180,6 @@ func (t *Twalk) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Twalkgetattr) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -1270,7 +1236,6 @@ func (t *Tumknod) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tlconnect) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -1303,7 +1268,6 @@ func (t *Tchannel) handle(cs *connState) message {
 		return newErr(err)
 	}
 
-	// Lookup the given channel.
 	ch := cs.lookupChannel(t.ID)
 	if ch == nil {
 		return newErr(syscall.ENOSYS)
-- 
cgit v1.2.3


From b661434202672f920291bf5685b68772103c66cb Mon Sep 17 00:00:00 2001
From: Aleksandr Razumov <a.razumov@corp.mail.ru>
Date: Tue, 17 Dec 2019 13:06:42 +0300
Subject: Add minimum CPU number and only lower CPUs on --cpu-num-from-quota

* Add `--cpu-num-min` flag to control minimum CPUs
* Only lower CPU count
* Fix comments
---
 runsc/boot/config.go     | 12 ++++++++++--
 runsc/main.go            |  4 +++-
 runsc/sandbox/sandbox.go | 10 ++++++++--
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 7841d1a7a..d9f5b67c0 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -254,8 +254,14 @@ type Config struct {
 	// CPUNumFromQuota sets CPU number count to available CPU quota, using
 	// least integer value greater than or equal to quota.
 	//
-	// E.g. 0.2 CPU quota would result in 1, and 1.9 in 2.
+	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
 	CPUNumFromQuota bool
+
+	// CPUNumMin is minimum value of CPU number setting when CPUNumFromQuota
+	// strategy is active.
+	//
+	// E.g. when CPUNumMin is 2, 0.2 CPU quota will result in 2 instead of 1.
+	CPUNumMin int
 }
 
 // ToFlags returns a slice of flags that correspond to the given Config.
@@ -289,7 +295,9 @@ func (c *Config) ToFlags() []string {
 		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
 	}
 	if c.CPUNumFromQuota {
-		f = append(f, "--cpu-num-from-quota")
+		f = append(f, "--cpu-num-from-quota",
+			"--cpu-num-min="+strconv.Itoa(c.CPUNumMin),
+		)
 	}
 	// Only include these if set since it is never to be used by users.
 	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
diff --git a/runsc/main.go b/runsc/main.go
index febd59aed..7c60cbb4b 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -82,7 +82,8 @@ var (
 	numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
 	rootless           = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
 	referenceLeakMode  = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
-	cpuNumFromQuota    = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater than quota value)")
+	cpuNumFromQuota    = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value)")
+	cpuNumMin          = flag.Int("cpu-num-min", 2, "minimum number of cpu to use with --cpu-num-from-quota")
 
 	// Test flags, not to be used outside tests, ever.
 	testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
@@ -227,6 +228,7 @@ func main() {
 		ReferenceLeakMode:  refsLeakMode,
 		OverlayfsStaleRead: *overlayfsStaleRead,
 		CPUNumFromQuota:    *cpuNumFromQuota,
+		CPUNumMin:          *cpuNumMin,
 
 		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
 		TestOnlyTestNameEnv:                        *testOnlyTestNameEnv,
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index cbfb873d1..f6feadf75 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -637,8 +637,14 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 			if err != nil {
 				return fmt.Errorf("getting cpu qouta from cgroups: %v", err)
 			}
-			if quota > 0 {
-				cpuNum = int(math.Ceil(quota))
+			if n := int(math.Ceil(quota)); n > 0 {
+				if n < conf.CPUNumMin {
+					n = conf.CPUNumMin
+				}
+				if n < cpuNum {
+					// Only lower the cpu number.
+					cpuNum = n
+				}
 			}
 		}
 		cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))
-- 
cgit v1.2.3


From 67000b929b9f5a3aedf6f5f56611c76411d02d78 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 17 Dec 2019 06:31:41 -0800
Subject: Explicitly export files needed by other packages

PiperOrigin-RevId: 285968611
---
 test/syscalls/linux/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 0bbaaf28a..e6568128e 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -9,6 +9,7 @@ package(
 exports_files(
     [
         "socket.cc",
+        "socket_ip_loopback_blocking.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
         "tcp_socket.cc",
         "udp_socket.cc",
-- 
cgit v1.2.3


From 67f678be27b3f4545d41539bd6855527da53a250 Mon Sep 17 00:00:00 2001
From: Aleksandr Razumov <a.razumov@corp.mail.ru>
Date: Tue, 17 Dec 2019 20:41:02 +0300
Subject: Leave minimum CPU number as a constant

Remove introduced CPUNumMin config and hard-code it as 2.
---
 runsc/boot/config.go     | 10 +---------
 runsc/main.go            |  4 +---
 runsc/sandbox/sandbox.go |  9 +++++++--
 3 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index d9f5b67c0..a878bc2ce 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -256,12 +256,6 @@ type Config struct {
 	//
 	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
 	CPUNumFromQuota bool
-
-	// CPUNumMin is minimum value of CPU number setting when CPUNumFromQuota
-	// strategy is active.
-	//
-	// E.g. when CPUNumMin is 2, 0.2 CPU quota will result in 2 instead of 1.
-	CPUNumMin int
 }
 
 // ToFlags returns a slice of flags that correspond to the given Config.
@@ -295,9 +289,7 @@ func (c *Config) ToFlags() []string {
 		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
 	}
 	if c.CPUNumFromQuota {
-		f = append(f, "--cpu-num-from-quota",
-			"--cpu-num-min="+strconv.Itoa(c.CPUNumMin),
-		)
+		f = append(f, "--cpu-num-from-quota")
 	}
 	// Only include these if set since it is never to be used by users.
 	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
diff --git a/runsc/main.go b/runsc/main.go
index 7c60cbb4b..abf929511 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -82,8 +82,7 @@ var (
 	numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
 	rootless           = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
 	referenceLeakMode  = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
-	cpuNumFromQuota    = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value)")
-	cpuNumMin          = flag.Int("cpu-num-min", 2, "minimum number of cpu to use with --cpu-num-from-quota")
+	cpuNumFromQuota    = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
 
 	// Test flags, not to be used outside tests, ever.
 	testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
@@ -228,7 +227,6 @@ func main() {
 		ReferenceLeakMode:  refsLeakMode,
 		OverlayfsStaleRead: *overlayfsStaleRead,
 		CPUNumFromQuota:    *cpuNumFromQuota,
-		CPUNumMin:          *cpuNumMin,
 
 		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
 		TestOnlyTestNameEnv:                        *testOnlyTestNameEnv,
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index f6feadf75..ce1452b87 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -633,13 +633,18 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 			return fmt.Errorf("getting cpu count from cgroups: %v", err)
 		}
 		if conf.CPUNumFromQuota {
+			// Dropping below 2 CPUs can trigger application to disable
+			// locks that can lead do hard to debug errors, so just
+			// leaving two cores as reasonable default.
+			const minCPUs = 2
+
 			quota, err := s.Cgroup.CPUQuota()
 			if err != nil {
 				return fmt.Errorf("getting cpu qouta from cgroups: %v", err)
 			}
 			if n := int(math.Ceil(quota)); n > 0 {
-				if n < conf.CPUNumMin {
-					n = conf.CPUNumMin
+				if n < minCPUs {
+					n = minCPUs
 				}
 				if n < cpuNum {
 					// Only lower the cpu number.
-- 
cgit v1.2.3


From 3f4d8fefb45d75937292302e4c158f76da5c7ca8 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 17 Dec 2019 10:08:47 -0800
Subject: Internal change.

PiperOrigin-RevId: 286003946
---
 pkg/tcpip/transport/tcp/connect.go  |  8 +++++
 pkg/tcpip/transport/tcp/tcp_test.go | 65 +++++++++++++++++++++++++++++++++
 test/syscalls/linux/tcp_socket.cc   | 72 +++++++++++++++++++++++++++++++++++++
 3 files changed, 145 insertions(+)

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 4c34fc9d2..cdd69f360 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -218,6 +218,14 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	// acceptable if the ack field acknowledges the SYN.
 	if s.flagIsSet(header.TCPFlagRst) {
 		if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == h.iss+1 {
+			// RFC 793, page 67, states that "If the RST bit is set [and] If the ACK
+			// was acceptable then signal the user "error: connection reset", drop
+			// the segment, enter CLOSED state, delete TCB, and return."
+			h.ep.mu.Lock()
+			h.ep.workerCleanup = true
+			h.ep.mu.Unlock()
+			// Although the RFC above calls out ECONNRESET, Linux actually returns
+			// ECONNREFUSED here so we do as well.
 			return tcpip.ErrConnectionRefused
 		}
 		return nil
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 2a83f7bcc..e8fe4dab5 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -1140,6 +1140,71 @@ func TestConnectBindToDevice(t *testing.T) {
 	}
 }
 
+func TestRstOnSynSent(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Create an endpoint, don't handshake because we want to interfere with the
+	// handshake process.
+	c.Create(-1)
+
+	// Start connection attempt.
+	waitEntry, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventOut)
+	defer c.WQ.EventUnregister(&waitEntry)
+
+	addr := tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}
+	if err := c.EP.Connect(addr); err != tcpip.ErrConnectStarted {
+		t.Fatalf("got Connect(%+v) = %v, want %s", addr, err, tcpip.ErrConnectStarted)
+	}
+
+	// Receive SYN packet.
+	b := c.GetPacket()
+	checker.IPv4(t, b,
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagSyn),
+		),
+	)
+
+	// Ensure that we've reached SynSent state
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want {
+		t.Fatalf("got State() = %s, want %s", got, want)
+	}
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	// Send a packet with a proper ACK and a RST flag to cause the socket
+	// to Error and close out
+	iss := seqnum.Value(789)
+	rcvWnd := seqnum.Size(30000)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: tcpHdr.DestinationPort(),
+		DstPort: tcpHdr.SourcePort(),
+		Flags:   header.TCPFlagRst | header.TCPFlagAck,
+		SeqNum:  iss,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  rcvWnd,
+		TCPOpts: nil,
+	})
+
+	// Wait for receive to be notified.
+	select {
+	case <-ch:
+	case <-time.After(3 * time.Second):
+		t.Fatal("timed out waiting for packet to arrive")
+	}
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionRefused {
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %s", err, tcpip.ErrConnectionRefused)
+	}
+
+	// Due to the RST the endpoint should be in an error state.
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
+		t.Fatalf("got State() = %s, want %s", got, want)
+	}
+}
+
 func TestOutOfOrderReceive(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index c503f3568..6b99c021d 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -967,6 +967,78 @@ TEST_P(SimpleTcpSocketTest, BlockingConnectRefused) {
   EXPECT_THAT(close(s.release()), SyscallSucceeds());
 }
 
+// Test that connecting to a non-listening port and thus receiving a RST is
+// handled appropriately by the socket - the port that the socket was bound to
+// is released and the expected error is returned.
+TEST_P(SimpleTcpSocketTest, CleanupOnConnectionRefused) {
+  // Create a socket that is known to not be listening. As is it bound but not
+  // listening, when another socket connects to the port, it will refuse..
+  FileDescriptor bound_s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage bound_addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t bound_addrlen = sizeof(bound_addr);
+
+  ASSERT_THAT(
+      bind(bound_s.get(), reinterpret_cast<struct sockaddr*>(&bound_addr),
+           bound_addrlen),
+      SyscallSucceeds());
+
+  // Get the addresses the socket is bound to because the port is chosen by the
+  // stack.
+  ASSERT_THAT(getsockname(bound_s.get(),
+                          reinterpret_cast<struct sockaddr*>(&bound_addr),
+                          &bound_addrlen),
+              SyscallSucceeds());
+
+  // Create, initialize, and bind the socket that is used to test connecting to
+  // the non-listening port.
+  FileDescriptor client_s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  // Initialize client address to the loopback one.
+  sockaddr_storage client_addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t client_addrlen = sizeof(client_addr);
+
+  ASSERT_THAT(
+      bind(client_s.get(), reinterpret_cast<struct sockaddr*>(&client_addr),
+           client_addrlen),
+      SyscallSucceeds());
+
+  ASSERT_THAT(getsockname(client_s.get(),
+                          reinterpret_cast<struct sockaddr*>(&client_addr),
+                          &client_addrlen),
+              SyscallSucceeds());
+
+  // Now the test: connect to the bound but not listening socket with the
+  // client socket. The bound socket should return a RST and cause the client
+  // socket to return an error and clean itself up immediately.
+  // The error being ECONNREFUSED diverges with RFC 793, page 37, but does what
+  // Linux does.
+  ASSERT_THAT(connect(client_s.get(),
+                      reinterpret_cast<const struct sockaddr*>(&bound_addr),
+                      bound_addrlen),
+              SyscallFailsWithErrno(ECONNREFUSED));
+
+  FileDescriptor new_s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  // Test binding to the address from the client socket. This should be okay
+  // if it was dropped correctly.
+  ASSERT_THAT(
+      bind(new_s.get(), reinterpret_cast<struct sockaddr*>(&client_addr),
+           client_addrlen),
+      SyscallSucceeds());
+
+  // Attempt #2, with the new socket and reused addr our connect should fail in
+  // the same way as before, not with an EADDRINUSE.
+  ASSERT_THAT(connect(client_s.get(),
+                      reinterpret_cast<const struct sockaddr*>(&bound_addr),
+                      bound_addrlen),
+              SyscallFailsWithErrno(ECONNREFUSED));
+}
+
 // Test that we get an ECONNREFUSED with a nonblocking socket.
 TEST_P(SimpleTcpSocketTest, NonBlockingConnectRefused) {
   FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
-- 
cgit v1.2.3


From 91f1ac731933ac1fe0f9ef30f4c9d06fa4031021 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 17 Dec 2019 13:57:39 -0800
Subject: Mark enableCpuidFault nosplit

This is called after fork, so it must be nosplit.

Updates #1408

PiperOrigin-RevId: 286053054
---
 pkg/sentry/platform/ptrace/subprocess_amd64.go | 10 ++++++++--
 pkg/sentry/platform/ptrace/subprocess_arm64.go |  2 ++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index a55cff507..606dc2b1d 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -167,8 +167,14 @@ func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
 	}
 }
 
-// enableCpuidFault enable cpuid-faulting; this may fail on older kernels or hardware,
-// so we just disregard the result. Host CPUID will be enabled.
+// enableCpuidFault enables cpuid-faulting.
+//
+// This may fail on older kernels or hardware, so we just disregard the result.
+// Host CPUID will be enabled.
+//
+// This is safe to call in an afterFork context.
+//
+//go:nosplit
 func enableCpuidFault() {
 	syscall.RawSyscall6(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0)
 }
diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go
index aed34e7ee..62a686ee7 100644
--- a/pkg/sentry/platform/ptrace/subprocess_arm64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go
@@ -151,6 +151,8 @@ func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
 }
 
 // Noop on arm64.
+//
+//go:nosplit
 func enableCpuidFault() {
 }
 
-- 
cgit v1.2.3


From 64d00cc63dc8c3cb5fde1f638d4525c8d329733d Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 17 Dec 2019 16:20:19 -0800
Subject: Internal change.

PiperOrigin-RevId: 286083614
---
 test/iptables/runner/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/iptables/runner/BUILD b/test/iptables/runner/BUILD
index 1c59e26b9..c6c42d870 100644
--- a/test/iptables/runner/BUILD
+++ b/test/iptables/runner/BUILD
@@ -1,5 +1,5 @@
-load("@io_bazel_rules_docker//container:container.bzl", "container_image")
 load("@io_bazel_rules_docker//go:image.bzl", "go_image")
+load("@io_bazel_rules_docker//container:container.bzl", "container_image")
 
 package(licenses = ["notice"])
 
-- 
cgit v1.2.3


From cb533f18cbb93e3f236ba191d1693e93716313b5 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Mon, 18 Nov 2019 09:34:02 +0000
Subject: Enable pkg/sentry/strace support on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I006a1845b6aab2c2fdb9d80fffc1868a6a132ecd
---
 pkg/sentry/strace/BUILD            |   3 +-
 pkg/sentry/strace/linux64.go       | 367 -----------------------------------
 pkg/sentry/strace/linux64_amd64.go | 382 +++++++++++++++++++++++++++++++++++++
 pkg/sentry/strace/linux64_arm64.go | 323 +++++++++++++++++++++++++++++++
 pkg/sentry/strace/syscalls.go      |   9 +-
 5 files changed, 708 insertions(+), 376 deletions(-)
 delete mode 100644 pkg/sentry/strace/linux64.go
 create mode 100644 pkg/sentry/strace/linux64_amd64.go
 create mode 100644 pkg/sentry/strace/linux64_arm64.go

diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index d46421199..aa1ac720c 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -10,7 +10,8 @@ go_library(
         "capability.go",
         "clone.go",
         "futex.go",
-        "linux64.go",
+        "linux64_amd64.go",
+        "linux64_arm64.go",
         "open.go",
         "poll.go",
         "ptrace.go",
diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go
deleted file mode 100644
index e603f858f..000000000
--- a/pkg/sentry/strace/linux64.go
+++ /dev/null
@@ -1,367 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package strace
-
-// linuxAMD64 provides a mapping of the Linux amd64 syscalls and their argument
-// types for display / formatting.
-var linuxAMD64 = SyscallMap{
-	0:   makeSyscallInfo("read", FD, ReadBuffer, Hex),
-	1:   makeSyscallInfo("write", FD, WriteBuffer, Hex),
-	2:   makeSyscallInfo("open", Path, OpenFlags, Mode),
-	3:   makeSyscallInfo("close", FD),
-	4:   makeSyscallInfo("stat", Path, Stat),
-	5:   makeSyscallInfo("fstat", FD, Stat),
-	6:   makeSyscallInfo("lstat", Path, Stat),
-	7:   makeSyscallInfo("poll", PollFDs, Hex, Hex),
-	8:   makeSyscallInfo("lseek", Hex, Hex, Hex),
-	9:   makeSyscallInfo("mmap", Hex, Hex, Hex, Hex, FD, Hex),
-	10:  makeSyscallInfo("mprotect", Hex, Hex, Hex),
-	11:  makeSyscallInfo("munmap", Hex, Hex),
-	12:  makeSyscallInfo("brk", Hex),
-	13:  makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction),
-	14:  makeSyscallInfo("rt_sigprocmask", SignalMaskAction, SigSet, PostSigSet, Hex),
-	15:  makeSyscallInfo("rt_sigreturn"),
-	16:  makeSyscallInfo("ioctl", FD, Hex, Hex),
-	17:  makeSyscallInfo("pread64", FD, ReadBuffer, Hex, Hex),
-	18:  makeSyscallInfo("pwrite64", FD, WriteBuffer, Hex, Hex),
-	19:  makeSyscallInfo("readv", FD, ReadIOVec, Hex),
-	20:  makeSyscallInfo("writev", FD, WriteIOVec, Hex),
-	21:  makeSyscallInfo("access", Path, Oct),
-	22:  makeSyscallInfo("pipe", PipeFDs),
-	23:  makeSyscallInfo("select", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timeval),
-	24:  makeSyscallInfo("sched_yield"),
-	25:  makeSyscallInfo("mremap", Hex, Hex, Hex, Hex, Hex),
-	26:  makeSyscallInfo("msync", Hex, Hex, Hex),
-	27:  makeSyscallInfo("mincore", Hex, Hex, Hex),
-	28:  makeSyscallInfo("madvise", Hex, Hex, Hex),
-	29:  makeSyscallInfo("shmget", Hex, Hex, Hex),
-	30:  makeSyscallInfo("shmat", Hex, Hex, Hex),
-	31:  makeSyscallInfo("shmctl", Hex, Hex, Hex),
-	32:  makeSyscallInfo("dup", FD),
-	33:  makeSyscallInfo("dup2", FD, FD),
-	34:  makeSyscallInfo("pause"),
-	35:  makeSyscallInfo("nanosleep", Timespec, PostTimespec),
-	36:  makeSyscallInfo("getitimer", ItimerType, PostItimerVal),
-	37:  makeSyscallInfo("alarm", Hex),
-	38:  makeSyscallInfo("setitimer", ItimerType, ItimerVal, PostItimerVal),
-	39:  makeSyscallInfo("getpid"),
-	40:  makeSyscallInfo("sendfile", FD, FD, Hex, Hex),
-	41:  makeSyscallInfo("socket", SockFamily, SockType, SockProtocol),
-	42:  makeSyscallInfo("connect", FD, SockAddr, Hex),
-	43:  makeSyscallInfo("accept", FD, PostSockAddr, SockLen),
-	44:  makeSyscallInfo("sendto", FD, Hex, Hex, Hex, SockAddr, Hex),
-	45:  makeSyscallInfo("recvfrom", FD, Hex, Hex, Hex, PostSockAddr, SockLen),
-	46:  makeSyscallInfo("sendmsg", FD, SendMsgHdr, Hex),
-	47:  makeSyscallInfo("recvmsg", FD, RecvMsgHdr, Hex),
-	48:  makeSyscallInfo("shutdown", FD, Hex),
-	49:  makeSyscallInfo("bind", FD, SockAddr, Hex),
-	50:  makeSyscallInfo("listen", FD, Hex),
-	51:  makeSyscallInfo("getsockname", FD, PostSockAddr, SockLen),
-	52:  makeSyscallInfo("getpeername", FD, PostSockAddr, SockLen),
-	53:  makeSyscallInfo("socketpair", SockFamily, SockType, SockProtocol, Hex),
-	54:  makeSyscallInfo("setsockopt", FD, Hex, Hex, Hex, Hex),
-	55:  makeSyscallInfo("getsockopt", FD, Hex, Hex, Hex, Hex),
-	56:  makeSyscallInfo("clone", CloneFlags, Hex, Hex, Hex, Hex),
-	57:  makeSyscallInfo("fork"),
-	58:  makeSyscallInfo("vfork"),
-	59:  makeSyscallInfo("execve", Path, ExecveStringVector, ExecveStringVector),
-	60:  makeSyscallInfo("exit", Hex),
-	61:  makeSyscallInfo("wait4", Hex, Hex, Hex, Rusage),
-	62:  makeSyscallInfo("kill", Hex, Signal),
-	63:  makeSyscallInfo("uname", Uname),
-	64:  makeSyscallInfo("semget", Hex, Hex, Hex),
-	65:  makeSyscallInfo("semop", Hex, Hex, Hex),
-	66:  makeSyscallInfo("semctl", Hex, Hex, Hex, Hex),
-	67:  makeSyscallInfo("shmdt", Hex),
-	68:  makeSyscallInfo("msgget", Hex, Hex),
-	69:  makeSyscallInfo("msgsnd", Hex, Hex, Hex, Hex),
-	70:  makeSyscallInfo("msgrcv", Hex, Hex, Hex, Hex, Hex),
-	71:  makeSyscallInfo("msgctl", Hex, Hex, Hex),
-	72:  makeSyscallInfo("fcntl", FD, Hex, Hex),
-	73:  makeSyscallInfo("flock", FD, Hex),
-	74:  makeSyscallInfo("fsync", FD),
-	75:  makeSyscallInfo("fdatasync", FD),
-	76:  makeSyscallInfo("truncate", Path, Hex),
-	77:  makeSyscallInfo("ftruncate", FD, Hex),
-	78:  makeSyscallInfo("getdents", FD, Hex, Hex),
-	79:  makeSyscallInfo("getcwd", PostPath, Hex),
-	80:  makeSyscallInfo("chdir", Path),
-	81:  makeSyscallInfo("fchdir", FD),
-	82:  makeSyscallInfo("rename", Path, Path),
-	83:  makeSyscallInfo("mkdir", Path, Oct),
-	84:  makeSyscallInfo("rmdir", Path),
-	85:  makeSyscallInfo("creat", Path, Oct),
-	86:  makeSyscallInfo("link", Path, Path),
-	87:  makeSyscallInfo("unlink", Path),
-	88:  makeSyscallInfo("symlink", Path, Path),
-	89:  makeSyscallInfo("readlink", Path, ReadBuffer, Hex),
-	90:  makeSyscallInfo("chmod", Path, Mode),
-	91:  makeSyscallInfo("fchmod", FD, Mode),
-	92:  makeSyscallInfo("chown", Path, Hex, Hex),
-	93:  makeSyscallInfo("fchown", FD, Hex, Hex),
-	94:  makeSyscallInfo("lchown", Path, Hex, Hex),
-	95:  makeSyscallInfo("umask", Hex),
-	96:  makeSyscallInfo("gettimeofday", Timeval, Hex),
-	97:  makeSyscallInfo("getrlimit", Hex, Hex),
-	98:  makeSyscallInfo("getrusage", Hex, Rusage),
-	99:  makeSyscallInfo("sysinfo", Hex),
-	100: makeSyscallInfo("times", Hex),
-	101: makeSyscallInfo("ptrace", PtraceRequest, Hex, Hex, Hex),
-	102: makeSyscallInfo("getuid"),
-	103: makeSyscallInfo("syslog", Hex, Hex, Hex),
-	104: makeSyscallInfo("getgid"),
-	105: makeSyscallInfo("setuid", Hex),
-	106: makeSyscallInfo("setgid", Hex),
-	107: makeSyscallInfo("geteuid"),
-	108: makeSyscallInfo("getegid"),
-	109: makeSyscallInfo("setpgid", Hex, Hex),
-	110: makeSyscallInfo("getppid"),
-	111: makeSyscallInfo("getpgrp"),
-	112: makeSyscallInfo("setsid"),
-	113: makeSyscallInfo("setreuid", Hex, Hex),
-	114: makeSyscallInfo("setregid", Hex, Hex),
-	115: makeSyscallInfo("getgroups", Hex, Hex),
-	116: makeSyscallInfo("setgroups", Hex, Hex),
-	117: makeSyscallInfo("setresuid", Hex, Hex, Hex),
-	118: makeSyscallInfo("getresuid", Hex, Hex, Hex),
-	119: makeSyscallInfo("setresgid", Hex, Hex, Hex),
-	120: makeSyscallInfo("getresgid", Hex, Hex, Hex),
-	121: makeSyscallInfo("getpgid", Hex),
-	122: makeSyscallInfo("setfsuid", Hex),
-	123: makeSyscallInfo("setfsgid", Hex),
-	124: makeSyscallInfo("getsid", Hex),
-	125: makeSyscallInfo("capget", CapHeader, PostCapData),
-	126: makeSyscallInfo("capset", CapHeader, CapData),
-	127: makeSyscallInfo("rt_sigpending", Hex),
-	128: makeSyscallInfo("rt_sigtimedwait", SigSet, Hex, Timespec, Hex),
-	129: makeSyscallInfo("rt_sigqueueinfo", Hex, Signal, Hex),
-	130: makeSyscallInfo("rt_sigsuspend", Hex),
-	131: makeSyscallInfo("sigaltstack", Hex, Hex),
-	132: makeSyscallInfo("utime", Path, Utimbuf),
-	133: makeSyscallInfo("mknod", Path, Mode, Hex),
-	134: makeSyscallInfo("uselib", Hex),
-	135: makeSyscallInfo("personality", Hex),
-	136: makeSyscallInfo("ustat", Hex, Hex),
-	137: makeSyscallInfo("statfs", Path, Hex),
-	138: makeSyscallInfo("fstatfs", FD, Hex),
-	139: makeSyscallInfo("sysfs", Hex, Hex, Hex),
-	140: makeSyscallInfo("getpriority", Hex, Hex),
-	141: makeSyscallInfo("setpriority", Hex, Hex, Hex),
-	142: makeSyscallInfo("sched_setparam", Hex, Hex),
-	143: makeSyscallInfo("sched_getparam", Hex, Hex),
-	144: makeSyscallInfo("sched_setscheduler", Hex, Hex, Hex),
-	145: makeSyscallInfo("sched_getscheduler", Hex),
-	146: makeSyscallInfo("sched_get_priority_max", Hex),
-	147: makeSyscallInfo("sched_get_priority_min", Hex),
-	148: makeSyscallInfo("sched_rr_get_interval", Hex, Hex),
-	149: makeSyscallInfo("mlock", Hex, Hex),
-	150: makeSyscallInfo("munlock", Hex, Hex),
-	151: makeSyscallInfo("mlockall", Hex),
-	152: makeSyscallInfo("munlockall"),
-	153: makeSyscallInfo("vhangup"),
-	154: makeSyscallInfo("modify_ldt", Hex, Hex, Hex),
-	155: makeSyscallInfo("pivot_root", Path, Path),
-	156: makeSyscallInfo("_sysctl", Hex),
-	157: makeSyscallInfo("prctl", Hex, Hex, Hex, Hex, Hex),
-	158: makeSyscallInfo("arch_prctl", Hex, Hex),
-	159: makeSyscallInfo("adjtimex", Hex),
-	160: makeSyscallInfo("setrlimit", Hex, Hex),
-	161: makeSyscallInfo("chroot", Path),
-	162: makeSyscallInfo("sync"),
-	163: makeSyscallInfo("acct", Hex),
-	164: makeSyscallInfo("settimeofday", Timeval, Hex),
-	165: makeSyscallInfo("mount", Path, Path, Path, Hex, Path),
-	166: makeSyscallInfo("umount2", Path, Hex),
-	167: makeSyscallInfo("swapon", Hex, Hex),
-	168: makeSyscallInfo("swapoff", Hex),
-	169: makeSyscallInfo("reboot", Hex, Hex, Hex, Hex),
-	170: makeSyscallInfo("sethostname", Hex, Hex),
-	171: makeSyscallInfo("setdomainname", Hex, Hex),
-	172: makeSyscallInfo("iopl", Hex),
-	173: makeSyscallInfo("ioperm", Hex, Hex, Hex),
-	174: makeSyscallInfo("create_module", Path, Hex),
-	175: makeSyscallInfo("init_module", Hex, Hex, Hex),
-	176: makeSyscallInfo("delete_module", Hex, Hex),
-	177: makeSyscallInfo("get_kernel_syms", Hex),
-	// 178: query_module (only present in Linux < 2.6)
-	179: makeSyscallInfo("quotactl", Hex, Hex, Hex, Hex),
-	180: makeSyscallInfo("nfsservctl", Hex, Hex, Hex),
-	// 181: getpmsg (not implemented in the Linux kernel)
-	// 182: putpmsg (not implemented in the Linux kernel)
-	// 183: afs_syscall (not implemented in the Linux kernel)
-	// 184: tuxcall (not implemented in the Linux kernel)
-	// 185: security (not implemented in the Linux kernel)
-	186: makeSyscallInfo("gettid"),
-	187: makeSyscallInfo("readahead", Hex, Hex, Hex),
-	188: makeSyscallInfo("setxattr", Path, Path, Hex, Hex, Hex),
-	189: makeSyscallInfo("lsetxattr", Path, Path, Hex, Hex, Hex),
-	190: makeSyscallInfo("fsetxattr", FD, Path, Hex, Hex, Hex),
-	191: makeSyscallInfo("getxattr", Path, Path, Hex, Hex),
-	192: makeSyscallInfo("lgetxattr", Path, Path, Hex, Hex),
-	193: makeSyscallInfo("fgetxattr", FD, Path, Hex, Hex),
-	194: makeSyscallInfo("listxattr", Path, Path, Hex),
-	195: makeSyscallInfo("llistxattr", Path, Path, Hex),
-	196: makeSyscallInfo("flistxattr", FD, Path, Hex),
-	197: makeSyscallInfo("removexattr", Path, Path),
-	198: makeSyscallInfo("lremovexattr", Path, Path),
-	199: makeSyscallInfo("fremovexattr", FD, Path),
-	200: makeSyscallInfo("tkill", Hex, Signal),
-	201: makeSyscallInfo("time", Hex),
-	202: makeSyscallInfo("futex", Hex, FutexOp, Hex, Timespec, Hex, Hex),
-	203: makeSyscallInfo("sched_setaffinity", Hex, Hex, Hex),
-	204: makeSyscallInfo("sched_getaffinity", Hex, Hex, Hex),
-	205: makeSyscallInfo("set_thread_area", Hex),
-	206: makeSyscallInfo("io_setup", Hex, Hex),
-	207: makeSyscallInfo("io_destroy", Hex),
-	208: makeSyscallInfo("io_getevents", Hex, Hex, Hex, Hex, Timespec),
-	209: makeSyscallInfo("io_submit", Hex, Hex, Hex),
-	210: makeSyscallInfo("io_cancel", Hex, Hex, Hex),
-	211: makeSyscallInfo("get_thread_area", Hex),
-	212: makeSyscallInfo("lookup_dcookie", Hex, Hex, Hex),
-	213: makeSyscallInfo("epoll_create", Hex),
-	// 214: epoll_ctl_old (not implemented in the Linux kernel)
-	// 215: epoll_wait_old (not implemented in the Linux kernel)
-	216: makeSyscallInfo("remap_file_pages", Hex, Hex, Hex, Hex, Hex),
-	217: makeSyscallInfo("getdents64", FD, Hex, Hex),
-	218: makeSyscallInfo("set_tid_address", Hex),
-	219: makeSyscallInfo("restart_syscall"),
-	220: makeSyscallInfo("semtimedop", Hex, Hex, Hex, Hex),
-	221: makeSyscallInfo("fadvise64", FD, Hex, Hex, Hex),
-	222: makeSyscallInfo("timer_create", Hex, Hex, Hex),
-	223: makeSyscallInfo("timer_settime", Hex, Hex, ItimerSpec, PostItimerSpec),
-	224: makeSyscallInfo("timer_gettime", Hex, PostItimerSpec),
-	225: makeSyscallInfo("timer_getoverrun", Hex),
-	226: makeSyscallInfo("timer_delete", Hex),
-	227: makeSyscallInfo("clock_settime", Hex, Timespec),
-	228: makeSyscallInfo("clock_gettime", Hex, PostTimespec),
-	229: makeSyscallInfo("clock_getres", Hex, PostTimespec),
-	230: makeSyscallInfo("clock_nanosleep", Hex, Hex, Timespec, PostTimespec),
-	231: makeSyscallInfo("exit_group", Hex),
-	232: makeSyscallInfo("epoll_wait", Hex, Hex, Hex, Hex),
-	233: makeSyscallInfo("epoll_ctl", Hex, Hex, FD, Hex),
-	234: makeSyscallInfo("tgkill", Hex, Hex, Signal),
-	235: makeSyscallInfo("utimes", Path, Timeval),
-	// 236: vserver (not implemented in the Linux kernel)
-	237: makeSyscallInfo("mbind", Hex, Hex, Hex, Hex, Hex, Hex),
-	238: makeSyscallInfo("set_mempolicy", Hex, Hex, Hex),
-	239: makeSyscallInfo("get_mempolicy", Hex, Hex, Hex, Hex, Hex),
-	240: makeSyscallInfo("mq_open", Hex, Hex, Hex, Hex),
-	241: makeSyscallInfo("mq_unlink", Hex),
-	242: makeSyscallInfo("mq_timedsend", Hex, Hex, Hex, Hex, Hex),
-	243: makeSyscallInfo("mq_timedreceive", Hex, Hex, Hex, Hex, Hex),
-	244: makeSyscallInfo("mq_notify", Hex, Hex),
-	245: makeSyscallInfo("mq_getsetattr", Hex, Hex, Hex),
-	246: makeSyscallInfo("kexec_load", Hex, Hex, Hex, Hex),
-	247: makeSyscallInfo("waitid", Hex, Hex, Hex, Hex, Rusage),
-	248: makeSyscallInfo("add_key", Hex, Hex, Hex, Hex, Hex),
-	249: makeSyscallInfo("request_key", Hex, Hex, Hex, Hex),
-	250: makeSyscallInfo("keyctl", Hex, Hex, Hex, Hex, Hex),
-	251: makeSyscallInfo("ioprio_set", Hex, Hex, Hex),
-	252: makeSyscallInfo("ioprio_get", Hex, Hex),
-	253: makeSyscallInfo("inotify_init"),
-	254: makeSyscallInfo("inotify_add_watch", Hex, Path, Hex),
-	255: makeSyscallInfo("inotify_rm_watch", Hex, Hex),
-	256: makeSyscallInfo("migrate_pages", Hex, Hex, Hex, Hex),
-	257: makeSyscallInfo("openat", FD, Path, OpenFlags, Mode),
-	258: makeSyscallInfo("mkdirat", FD, Path, Hex),
-	259: makeSyscallInfo("mknodat", FD, Path, Mode, Hex),
-	260: makeSyscallInfo("fchownat", FD, Path, Hex, Hex, Hex),
-	261: makeSyscallInfo("futimesat", FD, Path, Hex),
-	262: makeSyscallInfo("newfstatat", FD, Path, Stat, Hex),
-	263: makeSyscallInfo("unlinkat", FD, Path, Hex),
-	264: makeSyscallInfo("renameat", FD, Path, Hex, Path),
-	265: makeSyscallInfo("linkat", FD, Path, Hex, Path, Hex),
-	266: makeSyscallInfo("symlinkat", Path, Hex, Path),
-	267: makeSyscallInfo("readlinkat", FD, Path, ReadBuffer, Hex),
-	268: makeSyscallInfo("fchmodat", FD, Path, Mode),
-	269: makeSyscallInfo("faccessat", FD, Path, Oct, Hex),
-	270: makeSyscallInfo("pselect6", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timespec, SigSet),
-	271: makeSyscallInfo("ppoll", PollFDs, Hex, Timespec, SigSet, Hex),
-	272: makeSyscallInfo("unshare", CloneFlags),
-	273: makeSyscallInfo("set_robust_list", Hex, Hex),
-	274: makeSyscallInfo("get_robust_list", Hex, Hex, Hex),
-	275: makeSyscallInfo("splice", FD, Hex, FD, Hex, Hex, Hex),
-	276: makeSyscallInfo("tee", FD, FD, Hex, Hex),
-	277: makeSyscallInfo("sync_file_range", FD, Hex, Hex, Hex),
-	278: makeSyscallInfo("vmsplice", FD, Hex, Hex, Hex),
-	279: makeSyscallInfo("move_pages", Hex, Hex, Hex, Hex, Hex, Hex),
-	280: makeSyscallInfo("utimensat", FD, Path, UTimeTimespec, Hex),
-	281: makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex),
-	282: makeSyscallInfo("signalfd", Hex, Hex, Hex),
-	283: makeSyscallInfo("timerfd_create", Hex, Hex),
-	284: makeSyscallInfo("eventfd", Hex),
-	285: makeSyscallInfo("fallocate", FD, Hex, Hex, Hex),
-	286: makeSyscallInfo("timerfd_settime", FD, Hex, ItimerSpec, PostItimerSpec),
-	287: makeSyscallInfo("timerfd_gettime", FD, PostItimerSpec),
-	288: makeSyscallInfo("accept4", FD, PostSockAddr, SockLen, SockFlags),
-	289: makeSyscallInfo("signalfd4", Hex, Hex, Hex, Hex),
-	290: makeSyscallInfo("eventfd2", Hex, Hex),
-	291: makeSyscallInfo("epoll_create1", Hex),
-	292: makeSyscallInfo("dup3", FD, FD, Hex),
-	293: makeSyscallInfo("pipe2", PipeFDs, Hex),
-	294: makeSyscallInfo("inotify_init1", Hex),
-	295: makeSyscallInfo("preadv", FD, ReadIOVec, Hex, Hex),
-	296: makeSyscallInfo("pwritev", FD, WriteIOVec, Hex, Hex),
-	297: makeSyscallInfo("rt_tgsigqueueinfo", Hex, Hex, Signal, Hex),
-	298: makeSyscallInfo("perf_event_open", Hex, Hex, Hex, Hex, Hex),
-	299: makeSyscallInfo("recvmmsg", FD, Hex, Hex, Hex, Hex),
-	300: makeSyscallInfo("fanotify_init", Hex, Hex),
-	301: makeSyscallInfo("fanotify_mark", Hex, Hex, Hex, Hex, Hex),
-	302: makeSyscallInfo("prlimit64", Hex, Hex, Hex, Hex),
-	303: makeSyscallInfo("name_to_handle_at", FD, Hex, Hex, Hex, Hex),
-	304: makeSyscallInfo("open_by_handle_at", FD, Hex, Hex),
-	305: makeSyscallInfo("clock_adjtime", Hex, Hex),
-	306: makeSyscallInfo("syncfs", FD),
-	307: makeSyscallInfo("sendmmsg", FD, Hex, Hex, Hex),
-	308: makeSyscallInfo("setns", FD, Hex),
-	309: makeSyscallInfo("getcpu", Hex, Hex, Hex),
-	310: makeSyscallInfo("process_vm_readv", Hex, ReadIOVec, Hex, IOVec, Hex, Hex),
-	311: makeSyscallInfo("process_vm_writev", Hex, IOVec, Hex, WriteIOVec, Hex, Hex),
-	312: makeSyscallInfo("kcmp", Hex, Hex, Hex, Hex, Hex),
-	313: makeSyscallInfo("finit_module", Hex, Hex, Hex),
-	314: makeSyscallInfo("sched_setattr", Hex, Hex, Hex),
-	315: makeSyscallInfo("sched_getattr", Hex, Hex, Hex),
-	316: makeSyscallInfo("renameat2", FD, Path, Hex, Path, Hex),
-	317: makeSyscallInfo("seccomp", Hex, Hex, Hex),
-	318: makeSyscallInfo("getrandom", Hex, Hex, Hex),
-	319: makeSyscallInfo("memfd_create", Path, Hex), // Not quite a path, but close.
-	320: makeSyscallInfo("kexec_file_load", FD, FD, Hex, Hex, Hex),
-	321: makeSyscallInfo("bpf", Hex, Hex, Hex),
-	322: makeSyscallInfo("execveat", FD, Path, ExecveStringVector, ExecveStringVector, Hex),
-	323: makeSyscallInfo("userfaultfd", Hex),
-	324: makeSyscallInfo("membarrier", Hex, Hex),
-	325: makeSyscallInfo("mlock2", Hex, Hex, Hex),
-	326: makeSyscallInfo("copy_file_range", FD, Hex, FD, Hex, Hex, Hex),
-	327: makeSyscallInfo("preadv2", FD, ReadIOVec, Hex, Hex, Hex),
-	328: makeSyscallInfo("pwritev2", FD, WriteIOVec, Hex, Hex, Hex),
-	329: makeSyscallInfo("pkey_mprotect", Hex, Hex, Hex, Hex),
-	330: makeSyscallInfo("pkey_alloc", Hex, Hex),
-	331: makeSyscallInfo("pkey_free", Hex),
-	332: makeSyscallInfo("statx", FD, Path, Hex, Hex, Hex),
-	333: makeSyscallInfo("io_pgetevents", Hex, Hex, Hex, Hex, Timespec, SigSet),
-	334: makeSyscallInfo("rseq", Hex, Hex, Hex, Hex),
-	424: makeSyscallInfo("pidfd_send_signal", FD, Signal, Hex, Hex),
-	425: makeSyscallInfo("io_uring_setup", Hex, Hex),
-	426: makeSyscallInfo("io_uring_enter", FD, Hex, Hex, Hex, SigSet, Hex),
-	427: makeSyscallInfo("io_uring_register", FD, Hex, Hex, Hex),
-	428: makeSyscallInfo("open_tree", FD, Path, Hex),
-	429: makeSyscallInfo("move_mount", FD, Path, FD, Path, Hex),
-	430: makeSyscallInfo("fsopen", Path, Hex), // Not quite a path, but close.
-	431: makeSyscallInfo("fsconfig", FD, Hex, Hex, Hex, Hex),
-	432: makeSyscallInfo("fsmount", FD, Hex, Hex),
-	433: makeSyscallInfo("fspick", FD, Path, Hex),
-	434: makeSyscallInfo("pidfd_open", Hex, Hex),
-	435: makeSyscallInfo("clone3", Hex, Hex),
-}
diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go
new file mode 100644
index 000000000..9fa2f0e16
--- /dev/null
+++ b/pkg/sentry/strace/linux64_amd64.go
@@ -0,0 +1,382 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package strace
+
+import (
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+// linuxAMD64 provides a mapping of the Linux amd64 syscalls and their argument
+// types for display / formatting.
+var linuxAMD64 = SyscallMap{
+	0:   makeSyscallInfo("read", FD, ReadBuffer, Hex),
+	1:   makeSyscallInfo("write", FD, WriteBuffer, Hex),
+	2:   makeSyscallInfo("open", Path, OpenFlags, Mode),
+	3:   makeSyscallInfo("close", FD),
+	4:   makeSyscallInfo("stat", Path, Stat),
+	5:   makeSyscallInfo("fstat", FD, Stat),
+	6:   makeSyscallInfo("lstat", Path, Stat),
+	7:   makeSyscallInfo("poll", PollFDs, Hex, Hex),
+	8:   makeSyscallInfo("lseek", Hex, Hex, Hex),
+	9:   makeSyscallInfo("mmap", Hex, Hex, Hex, Hex, FD, Hex),
+	10:  makeSyscallInfo("mprotect", Hex, Hex, Hex),
+	11:  makeSyscallInfo("munmap", Hex, Hex),
+	12:  makeSyscallInfo("brk", Hex),
+	13:  makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction),
+	14:  makeSyscallInfo("rt_sigprocmask", SignalMaskAction, SigSet, PostSigSet, Hex),
+	15:  makeSyscallInfo("rt_sigreturn"),
+	16:  makeSyscallInfo("ioctl", FD, Hex, Hex),
+	17:  makeSyscallInfo("pread64", FD, ReadBuffer, Hex, Hex),
+	18:  makeSyscallInfo("pwrite64", FD, WriteBuffer, Hex, Hex),
+	19:  makeSyscallInfo("readv", FD, ReadIOVec, Hex),
+	20:  makeSyscallInfo("writev", FD, WriteIOVec, Hex),
+	21:  makeSyscallInfo("access", Path, Oct),
+	22:  makeSyscallInfo("pipe", PipeFDs),
+	23:  makeSyscallInfo("select", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timeval),
+	24:  makeSyscallInfo("sched_yield"),
+	25:  makeSyscallInfo("mremap", Hex, Hex, Hex, Hex, Hex),
+	26:  makeSyscallInfo("msync", Hex, Hex, Hex),
+	27:  makeSyscallInfo("mincore", Hex, Hex, Hex),
+	28:  makeSyscallInfo("madvise", Hex, Hex, Hex),
+	29:  makeSyscallInfo("shmget", Hex, Hex, Hex),
+	30:  makeSyscallInfo("shmat", Hex, Hex, Hex),
+	31:  makeSyscallInfo("shmctl", Hex, Hex, Hex),
+	32:  makeSyscallInfo("dup", FD),
+	33:  makeSyscallInfo("dup2", FD, FD),
+	34:  makeSyscallInfo("pause"),
+	35:  makeSyscallInfo("nanosleep", Timespec, PostTimespec),
+	36:  makeSyscallInfo("getitimer", ItimerType, PostItimerVal),
+	37:  makeSyscallInfo("alarm", Hex),
+	38:  makeSyscallInfo("setitimer", ItimerType, ItimerVal, PostItimerVal),
+	39:  makeSyscallInfo("getpid"),
+	40:  makeSyscallInfo("sendfile", FD, FD, Hex, Hex),
+	41:  makeSyscallInfo("socket", SockFamily, SockType, SockProtocol),
+	42:  makeSyscallInfo("connect", FD, SockAddr, Hex),
+	43:  makeSyscallInfo("accept", FD, PostSockAddr, SockLen),
+	44:  makeSyscallInfo("sendto", FD, Hex, Hex, Hex, SockAddr, Hex),
+	45:  makeSyscallInfo("recvfrom", FD, Hex, Hex, Hex, PostSockAddr, SockLen),
+	46:  makeSyscallInfo("sendmsg", FD, SendMsgHdr, Hex),
+	47:  makeSyscallInfo("recvmsg", FD, RecvMsgHdr, Hex),
+	48:  makeSyscallInfo("shutdown", FD, Hex),
+	49:  makeSyscallInfo("bind", FD, SockAddr, Hex),
+	50:  makeSyscallInfo("listen", FD, Hex),
+	51:  makeSyscallInfo("getsockname", FD, PostSockAddr, SockLen),
+	52:  makeSyscallInfo("getpeername", FD, PostSockAddr, SockLen),
+	53:  makeSyscallInfo("socketpair", SockFamily, SockType, SockProtocol, Hex),
+	54:  makeSyscallInfo("setsockopt", FD, Hex, Hex, Hex, Hex),
+	55:  makeSyscallInfo("getsockopt", FD, Hex, Hex, Hex, Hex),
+	56:  makeSyscallInfo("clone", CloneFlags, Hex, Hex, Hex, Hex),
+	57:  makeSyscallInfo("fork"),
+	58:  makeSyscallInfo("vfork"),
+	59:  makeSyscallInfo("execve", Path, ExecveStringVector, ExecveStringVector),
+	60:  makeSyscallInfo("exit", Hex),
+	61:  makeSyscallInfo("wait4", Hex, Hex, Hex, Rusage),
+	62:  makeSyscallInfo("kill", Hex, Signal),
+	63:  makeSyscallInfo("uname", Uname),
+	64:  makeSyscallInfo("semget", Hex, Hex, Hex),
+	65:  makeSyscallInfo("semop", Hex, Hex, Hex),
+	66:  makeSyscallInfo("semctl", Hex, Hex, Hex, Hex),
+	67:  makeSyscallInfo("shmdt", Hex),
+	68:  makeSyscallInfo("msgget", Hex, Hex),
+	69:  makeSyscallInfo("msgsnd", Hex, Hex, Hex, Hex),
+	70:  makeSyscallInfo("msgrcv", Hex, Hex, Hex, Hex, Hex),
+	71:  makeSyscallInfo("msgctl", Hex, Hex, Hex),
+	72:  makeSyscallInfo("fcntl", FD, Hex, Hex),
+	73:  makeSyscallInfo("flock", FD, Hex),
+	74:  makeSyscallInfo("fsync", FD),
+	75:  makeSyscallInfo("fdatasync", FD),
+	76:  makeSyscallInfo("truncate", Path, Hex),
+	77:  makeSyscallInfo("ftruncate", FD, Hex),
+	78:  makeSyscallInfo("getdents", FD, Hex, Hex),
+	79:  makeSyscallInfo("getcwd", PostPath, Hex),
+	80:  makeSyscallInfo("chdir", Path),
+	81:  makeSyscallInfo("fchdir", FD),
+	82:  makeSyscallInfo("rename", Path, Path),
+	83:  makeSyscallInfo("mkdir", Path, Oct),
+	84:  makeSyscallInfo("rmdir", Path),
+	85:  makeSyscallInfo("creat", Path, Oct),
+	86:  makeSyscallInfo("link", Path, Path),
+	87:  makeSyscallInfo("unlink", Path),
+	88:  makeSyscallInfo("symlink", Path, Path),
+	89:  makeSyscallInfo("readlink", Path, ReadBuffer, Hex),
+	90:  makeSyscallInfo("chmod", Path, Mode),
+	91:  makeSyscallInfo("fchmod", FD, Mode),
+	92:  makeSyscallInfo("chown", Path, Hex, Hex),
+	93:  makeSyscallInfo("fchown", FD, Hex, Hex),
+	94:  makeSyscallInfo("lchown", Path, Hex, Hex),
+	95:  makeSyscallInfo("umask", Hex),
+	96:  makeSyscallInfo("gettimeofday", Timeval, Hex),
+	97:  makeSyscallInfo("getrlimit", Hex, Hex),
+	98:  makeSyscallInfo("getrusage", Hex, Rusage),
+	99:  makeSyscallInfo("sysinfo", Hex),
+	100: makeSyscallInfo("times", Hex),
+	101: makeSyscallInfo("ptrace", PtraceRequest, Hex, Hex, Hex),
+	102: makeSyscallInfo("getuid"),
+	103: makeSyscallInfo("syslog", Hex, Hex, Hex),
+	104: makeSyscallInfo("getgid"),
+	105: makeSyscallInfo("setuid", Hex),
+	106: makeSyscallInfo("setgid", Hex),
+	107: makeSyscallInfo("geteuid"),
+	108: makeSyscallInfo("getegid"),
+	109: makeSyscallInfo("setpgid", Hex, Hex),
+	110: makeSyscallInfo("getppid"),
+	111: makeSyscallInfo("getpgrp"),
+	112: makeSyscallInfo("setsid"),
+	113: makeSyscallInfo("setreuid", Hex, Hex),
+	114: makeSyscallInfo("setregid", Hex, Hex),
+	115: makeSyscallInfo("getgroups", Hex, Hex),
+	116: makeSyscallInfo("setgroups", Hex, Hex),
+	117: makeSyscallInfo("setresuid", Hex, Hex, Hex),
+	118: makeSyscallInfo("getresuid", Hex, Hex, Hex),
+	119: makeSyscallInfo("setresgid", Hex, Hex, Hex),
+	120: makeSyscallInfo("getresgid", Hex, Hex, Hex),
+	121: makeSyscallInfo("getpgid", Hex),
+	122: makeSyscallInfo("setfsuid", Hex),
+	123: makeSyscallInfo("setfsgid", Hex),
+	124: makeSyscallInfo("getsid", Hex),
+	125: makeSyscallInfo("capget", CapHeader, PostCapData),
+	126: makeSyscallInfo("capset", CapHeader, CapData),
+	127: makeSyscallInfo("rt_sigpending", Hex),
+	128: makeSyscallInfo("rt_sigtimedwait", SigSet, Hex, Timespec, Hex),
+	129: makeSyscallInfo("rt_sigqueueinfo", Hex, Signal, Hex),
+	130: makeSyscallInfo("rt_sigsuspend", Hex),
+	131: makeSyscallInfo("sigaltstack", Hex, Hex),
+	132: makeSyscallInfo("utime", Path, Utimbuf),
+	133: makeSyscallInfo("mknod", Path, Mode, Hex),
+	134: makeSyscallInfo("uselib", Hex),
+	135: makeSyscallInfo("personality", Hex),
+	136: makeSyscallInfo("ustat", Hex, Hex),
+	137: makeSyscallInfo("statfs", Path, Hex),
+	138: makeSyscallInfo("fstatfs", FD, Hex),
+	139: makeSyscallInfo("sysfs", Hex, Hex, Hex),
+	140: makeSyscallInfo("getpriority", Hex, Hex),
+	141: makeSyscallInfo("setpriority", Hex, Hex, Hex),
+	142: makeSyscallInfo("sched_setparam", Hex, Hex),
+	143: makeSyscallInfo("sched_getparam", Hex, Hex),
+	144: makeSyscallInfo("sched_setscheduler", Hex, Hex, Hex),
+	145: makeSyscallInfo("sched_getscheduler", Hex),
+	146: makeSyscallInfo("sched_get_priority_max", Hex),
+	147: makeSyscallInfo("sched_get_priority_min", Hex),
+	148: makeSyscallInfo("sched_rr_get_interval", Hex, Hex),
+	149: makeSyscallInfo("mlock", Hex, Hex),
+	150: makeSyscallInfo("munlock", Hex, Hex),
+	151: makeSyscallInfo("mlockall", Hex),
+	152: makeSyscallInfo("munlockall"),
+	153: makeSyscallInfo("vhangup"),
+	154: makeSyscallInfo("modify_ldt", Hex, Hex, Hex),
+	155: makeSyscallInfo("pivot_root", Path, Path),
+	156: makeSyscallInfo("_sysctl", Hex),
+	157: makeSyscallInfo("prctl", Hex, Hex, Hex, Hex, Hex),
+	158: makeSyscallInfo("arch_prctl", Hex, Hex),
+	159: makeSyscallInfo("adjtimex", Hex),
+	160: makeSyscallInfo("setrlimit", Hex, Hex),
+	161: makeSyscallInfo("chroot", Path),
+	162: makeSyscallInfo("sync"),
+	163: makeSyscallInfo("acct", Hex),
+	164: makeSyscallInfo("settimeofday", Timeval, Hex),
+	165: makeSyscallInfo("mount", Path, Path, Path, Hex, Path),
+	166: makeSyscallInfo("umount2", Path, Hex),
+	167: makeSyscallInfo("swapon", Hex, Hex),
+	168: makeSyscallInfo("swapoff", Hex),
+	169: makeSyscallInfo("reboot", Hex, Hex, Hex, Hex),
+	170: makeSyscallInfo("sethostname", Hex, Hex),
+	171: makeSyscallInfo("setdomainname", Hex, Hex),
+	172: makeSyscallInfo("iopl", Hex),
+	173: makeSyscallInfo("ioperm", Hex, Hex, Hex),
+	174: makeSyscallInfo("create_module", Path, Hex),
+	175: makeSyscallInfo("init_module", Hex, Hex, Hex),
+	176: makeSyscallInfo("delete_module", Hex, Hex),
+	177: makeSyscallInfo("get_kernel_syms", Hex),
+	// 178: query_module (only present in Linux < 2.6)
+	179: makeSyscallInfo("quotactl", Hex, Hex, Hex, Hex),
+	180: makeSyscallInfo("nfsservctl", Hex, Hex, Hex),
+	// 181: getpmsg (not implemented in the Linux kernel)
+	// 182: putpmsg (not implemented in the Linux kernel)
+	// 183: afs_syscall (not implemented in the Linux kernel)
+	// 184: tuxcall (not implemented in the Linux kernel)
+	// 185: security (not implemented in the Linux kernel)
+	186: makeSyscallInfo("gettid"),
+	187: makeSyscallInfo("readahead", Hex, Hex, Hex),
+	188: makeSyscallInfo("setxattr", Path, Path, Hex, Hex, Hex),
+	189: makeSyscallInfo("lsetxattr", Path, Path, Hex, Hex, Hex),
+	190: makeSyscallInfo("fsetxattr", FD, Path, Hex, Hex, Hex),
+	191: makeSyscallInfo("getxattr", Path, Path, Hex, Hex),
+	192: makeSyscallInfo("lgetxattr", Path, Path, Hex, Hex),
+	193: makeSyscallInfo("fgetxattr", FD, Path, Hex, Hex),
+	194: makeSyscallInfo("listxattr", Path, Path, Hex),
+	195: makeSyscallInfo("llistxattr", Path, Path, Hex),
+	196: makeSyscallInfo("flistxattr", FD, Path, Hex),
+	197: makeSyscallInfo("removexattr", Path, Path),
+	198: makeSyscallInfo("lremovexattr", Path, Path),
+	199: makeSyscallInfo("fremovexattr", FD, Path),
+	200: makeSyscallInfo("tkill", Hex, Signal),
+	201: makeSyscallInfo("time", Hex),
+	202: makeSyscallInfo("futex", Hex, FutexOp, Hex, Timespec, Hex, Hex),
+	203: makeSyscallInfo("sched_setaffinity", Hex, Hex, Hex),
+	204: makeSyscallInfo("sched_getaffinity", Hex, Hex, Hex),
+	205: makeSyscallInfo("set_thread_area", Hex),
+	206: makeSyscallInfo("io_setup", Hex, Hex),
+	207: makeSyscallInfo("io_destroy", Hex),
+	208: makeSyscallInfo("io_getevents", Hex, Hex, Hex, Hex, Timespec),
+	209: makeSyscallInfo("io_submit", Hex, Hex, Hex),
+	210: makeSyscallInfo("io_cancel", Hex, Hex, Hex),
+	211: makeSyscallInfo("get_thread_area", Hex),
+	212: makeSyscallInfo("lookup_dcookie", Hex, Hex, Hex),
+	213: makeSyscallInfo("epoll_create", Hex),
+	// 214: epoll_ctl_old (not implemented in the Linux kernel)
+	// 215: epoll_wait_old (not implemented in the Linux kernel)
+	216: makeSyscallInfo("remap_file_pages", Hex, Hex, Hex, Hex, Hex),
+	217: makeSyscallInfo("getdents64", FD, Hex, Hex),
+	218: makeSyscallInfo("set_tid_address", Hex),
+	219: makeSyscallInfo("restart_syscall"),
+	220: makeSyscallInfo("semtimedop", Hex, Hex, Hex, Hex),
+	221: makeSyscallInfo("fadvise64", FD, Hex, Hex, Hex),
+	222: makeSyscallInfo("timer_create", Hex, Hex, Hex),
+	223: makeSyscallInfo("timer_settime", Hex, Hex, ItimerSpec, PostItimerSpec),
+	224: makeSyscallInfo("timer_gettime", Hex, PostItimerSpec),
+	225: makeSyscallInfo("timer_getoverrun", Hex),
+	226: makeSyscallInfo("timer_delete", Hex),
+	227: makeSyscallInfo("clock_settime", Hex, Timespec),
+	228: makeSyscallInfo("clock_gettime", Hex, PostTimespec),
+	229: makeSyscallInfo("clock_getres", Hex, PostTimespec),
+	230: makeSyscallInfo("clock_nanosleep", Hex, Hex, Timespec, PostTimespec),
+	231: makeSyscallInfo("exit_group", Hex),
+	232: makeSyscallInfo("epoll_wait", Hex, Hex, Hex, Hex),
+	233: makeSyscallInfo("epoll_ctl", Hex, Hex, FD, Hex),
+	234: makeSyscallInfo("tgkill", Hex, Hex, Signal),
+	235: makeSyscallInfo("utimes", Path, Timeval),
+	// 236: vserver (not implemented in the Linux kernel)
+	237: makeSyscallInfo("mbind", Hex, Hex, Hex, Hex, Hex, Hex),
+	238: makeSyscallInfo("set_mempolicy", Hex, Hex, Hex),
+	239: makeSyscallInfo("get_mempolicy", Hex, Hex, Hex, Hex, Hex),
+	240: makeSyscallInfo("mq_open", Hex, Hex, Hex, Hex),
+	241: makeSyscallInfo("mq_unlink", Hex),
+	242: makeSyscallInfo("mq_timedsend", Hex, Hex, Hex, Hex, Hex),
+	243: makeSyscallInfo("mq_timedreceive", Hex, Hex, Hex, Hex, Hex),
+	244: makeSyscallInfo("mq_notify", Hex, Hex),
+	245: makeSyscallInfo("mq_getsetattr", Hex, Hex, Hex),
+	246: makeSyscallInfo("kexec_load", Hex, Hex, Hex, Hex),
+	247: makeSyscallInfo("waitid", Hex, Hex, Hex, Hex, Rusage),
+	248: makeSyscallInfo("add_key", Hex, Hex, Hex, Hex, Hex),
+	249: makeSyscallInfo("request_key", Hex, Hex, Hex, Hex),
+	250: makeSyscallInfo("keyctl", Hex, Hex, Hex, Hex, Hex),
+	251: makeSyscallInfo("ioprio_set", Hex, Hex, Hex),
+	252: makeSyscallInfo("ioprio_get", Hex, Hex),
+	253: makeSyscallInfo("inotify_init"),
+	254: makeSyscallInfo("inotify_add_watch", Hex, Path, Hex),
+	255: makeSyscallInfo("inotify_rm_watch", Hex, Hex),
+	256: makeSyscallInfo("migrate_pages", Hex, Hex, Hex, Hex),
+	257: makeSyscallInfo("openat", FD, Path, OpenFlags, Mode),
+	258: makeSyscallInfo("mkdirat", FD, Path, Hex),
+	259: makeSyscallInfo("mknodat", FD, Path, Mode, Hex),
+	260: makeSyscallInfo("fchownat", FD, Path, Hex, Hex, Hex),
+	261: makeSyscallInfo("futimesat", FD, Path, Hex),
+	262: makeSyscallInfo("newfstatat", FD, Path, Stat, Hex),
+	263: makeSyscallInfo("unlinkat", FD, Path, Hex),
+	264: makeSyscallInfo("renameat", FD, Path, Hex, Path),
+	265: makeSyscallInfo("linkat", FD, Path, Hex, Path, Hex),
+	266: makeSyscallInfo("symlinkat", Path, Hex, Path),
+	267: makeSyscallInfo("readlinkat", FD, Path, ReadBuffer, Hex),
+	268: makeSyscallInfo("fchmodat", FD, Path, Mode),
+	269: makeSyscallInfo("faccessat", FD, Path, Oct, Hex),
+	270: makeSyscallInfo("pselect6", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timespec, SigSet),
+	271: makeSyscallInfo("ppoll", PollFDs, Hex, Timespec, SigSet, Hex),
+	272: makeSyscallInfo("unshare", CloneFlags),
+	273: makeSyscallInfo("set_robust_list", Hex, Hex),
+	274: makeSyscallInfo("get_robust_list", Hex, Hex, Hex),
+	275: makeSyscallInfo("splice", FD, Hex, FD, Hex, Hex, Hex),
+	276: makeSyscallInfo("tee", FD, FD, Hex, Hex),
+	277: makeSyscallInfo("sync_file_range", FD, Hex, Hex, Hex),
+	278: makeSyscallInfo("vmsplice", FD, Hex, Hex, Hex),
+	279: makeSyscallInfo("move_pages", Hex, Hex, Hex, Hex, Hex, Hex),
+	280: makeSyscallInfo("utimensat", FD, Path, UTimeTimespec, Hex),
+	281: makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex),
+	282: makeSyscallInfo("signalfd", Hex, Hex, Hex),
+	283: makeSyscallInfo("timerfd_create", Hex, Hex),
+	284: makeSyscallInfo("eventfd", Hex),
+	285: makeSyscallInfo("fallocate", FD, Hex, Hex, Hex),
+	286: makeSyscallInfo("timerfd_settime", FD, Hex, ItimerSpec, PostItimerSpec),
+	287: makeSyscallInfo("timerfd_gettime", FD, PostItimerSpec),
+	288: makeSyscallInfo("accept4", FD, PostSockAddr, SockLen, SockFlags),
+	289: makeSyscallInfo("signalfd4", Hex, Hex, Hex, Hex),
+	290: makeSyscallInfo("eventfd2", Hex, Hex),
+	291: makeSyscallInfo("epoll_create1", Hex),
+	292: makeSyscallInfo("dup3", FD, FD, Hex),
+	293: makeSyscallInfo("pipe2", PipeFDs, Hex),
+	294: makeSyscallInfo("inotify_init1", Hex),
+	295: makeSyscallInfo("preadv", FD, ReadIOVec, Hex, Hex),
+	296: makeSyscallInfo("pwritev", FD, WriteIOVec, Hex, Hex),
+	297: makeSyscallInfo("rt_tgsigqueueinfo", Hex, Hex, Signal, Hex),
+	298: makeSyscallInfo("perf_event_open", Hex, Hex, Hex, Hex, Hex),
+	299: makeSyscallInfo("recvmmsg", FD, Hex, Hex, Hex, Hex),
+	300: makeSyscallInfo("fanotify_init", Hex, Hex),
+	301: makeSyscallInfo("fanotify_mark", Hex, Hex, Hex, Hex, Hex),
+	302: makeSyscallInfo("prlimit64", Hex, Hex, Hex, Hex),
+	303: makeSyscallInfo("name_to_handle_at", FD, Hex, Hex, Hex, Hex),
+	304: makeSyscallInfo("open_by_handle_at", FD, Hex, Hex),
+	305: makeSyscallInfo("clock_adjtime", Hex, Hex),
+	306: makeSyscallInfo("syncfs", FD),
+	307: makeSyscallInfo("sendmmsg", FD, Hex, Hex, Hex),
+	308: makeSyscallInfo("setns", FD, Hex),
+	309: makeSyscallInfo("getcpu", Hex, Hex, Hex),
+	310: makeSyscallInfo("process_vm_readv", Hex, ReadIOVec, Hex, IOVec, Hex, Hex),
+	311: makeSyscallInfo("process_vm_writev", Hex, IOVec, Hex, WriteIOVec, Hex, Hex),
+	312: makeSyscallInfo("kcmp", Hex, Hex, Hex, Hex, Hex),
+	313: makeSyscallInfo("finit_module", Hex, Hex, Hex),
+	314: makeSyscallInfo("sched_setattr", Hex, Hex, Hex),
+	315: makeSyscallInfo("sched_getattr", Hex, Hex, Hex),
+	316: makeSyscallInfo("renameat2", FD, Path, Hex, Path, Hex),
+	317: makeSyscallInfo("seccomp", Hex, Hex, Hex),
+	318: makeSyscallInfo("getrandom", Hex, Hex, Hex),
+	319: makeSyscallInfo("memfd_create", Path, Hex), // Not quite a path, but close.
+	320: makeSyscallInfo("kexec_file_load", FD, FD, Hex, Hex, Hex),
+	321: makeSyscallInfo("bpf", Hex, Hex, Hex),
+	322: makeSyscallInfo("execveat", FD, Path, ExecveStringVector, ExecveStringVector, Hex),
+	323: makeSyscallInfo("userfaultfd", Hex),
+	324: makeSyscallInfo("membarrier", Hex, Hex),
+	325: makeSyscallInfo("mlock2", Hex, Hex, Hex),
+	326: makeSyscallInfo("copy_file_range", FD, Hex, FD, Hex, Hex, Hex),
+	327: makeSyscallInfo("preadv2", FD, ReadIOVec, Hex, Hex, Hex),
+	328: makeSyscallInfo("pwritev2", FD, WriteIOVec, Hex, Hex, Hex),
+	329: makeSyscallInfo("pkey_mprotect", Hex, Hex, Hex, Hex),
+	330: makeSyscallInfo("pkey_alloc", Hex, Hex),
+	331: makeSyscallInfo("pkey_free", Hex),
+	332: makeSyscallInfo("statx", FD, Path, Hex, Hex, Hex),
+	333: makeSyscallInfo("io_pgetevents", Hex, Hex, Hex, Hex, Timespec, SigSet),
+	334: makeSyscallInfo("rseq", Hex, Hex, Hex, Hex),
+	424: makeSyscallInfo("pidfd_send_signal", FD, Signal, Hex, Hex),
+	425: makeSyscallInfo("io_uring_setup", Hex, Hex),
+	426: makeSyscallInfo("io_uring_enter", FD, Hex, Hex, Hex, SigSet, Hex),
+	427: makeSyscallInfo("io_uring_register", FD, Hex, Hex, Hex),
+	428: makeSyscallInfo("open_tree", FD, Path, Hex),
+	429: makeSyscallInfo("move_mount", FD, Path, FD, Path, Hex),
+	430: makeSyscallInfo("fsopen", Path, Hex), // Not quite a path, but close.
+	431: makeSyscallInfo("fsconfig", FD, Hex, Hex, Hex, Hex),
+	432: makeSyscallInfo("fsmount", FD, Hex, Hex),
+	433: makeSyscallInfo("fspick", FD, Path, Hex),
+	434: makeSyscallInfo("pidfd_open", Hex, Hex),
+	435: makeSyscallInfo("clone3", Hex, Hex),
+}
+
+func init() {
+	syscallTables = append(syscallTables,
+		syscallTable{
+			os:       abi.Linux,
+			arch:     arch.AMD64,
+			syscalls: linuxAMD64})
+}
diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go
new file mode 100644
index 000000000..c3ac5248d
--- /dev/null
+++ b/pkg/sentry/strace/linux64_arm64.go
@@ -0,0 +1,323 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package strace
+
+import (
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+// linuxARM64 provides a mapping of the Linux arm64 syscalls and their argument
+// types for display / formatting.
+var linuxARM64 = SyscallMap{
+	0:   makeSyscallInfo("io_setup", Hex, Hex),
+	1:   makeSyscallInfo("io_destroy", Hex),
+	2:   makeSyscallInfo("io_submit", Hex, Hex, Hex),
+	3:   makeSyscallInfo("io_cancel", Hex, Hex, Hex),
+	4:   makeSyscallInfo("io_getevents", Hex, Hex, Hex, Hex, Timespec),
+	5:   makeSyscallInfo("setxattr", Path, Path, Hex, Hex, Hex),
+	6:   makeSyscallInfo("lsetxattr", Path, Path, Hex, Hex, Hex),
+	7:   makeSyscallInfo("fsetxattr", FD, Path, Hex, Hex, Hex),
+	8:   makeSyscallInfo("getxattr", Path, Path, Hex, Hex),
+	9:   makeSyscallInfo("lgetxattr", Path, Path, Hex, Hex),
+	10:  makeSyscallInfo("fgetxattr", FD, Path, Hex, Hex),
+	11:  makeSyscallInfo("listxattr", Path, Path, Hex),
+	12:  makeSyscallInfo("llistxattr", Path, Path, Hex),
+	13:  makeSyscallInfo("flistxattr", FD, Path, Hex),
+	14:  makeSyscallInfo("removexattr", Path, Path),
+	15:  makeSyscallInfo("lremovexattr", Path, Path),
+	16:  makeSyscallInfo("fremovexattr", FD, Path),
+	17:  makeSyscallInfo("getcwd", PostPath, Hex),
+	18:  makeSyscallInfo("lookup_dcookie", Hex, Hex, Hex),
+	19:  makeSyscallInfo("eventfd2", Hex, Hex),
+	20:  makeSyscallInfo("epoll_create1", Hex),
+	21:  makeSyscallInfo("epoll_ctl", Hex, Hex, FD, Hex),
+	22:  makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex),
+	23:  makeSyscallInfo("dup", FD),
+	24:  makeSyscallInfo("dup3", FD, FD, Hex),
+	25:  makeSyscallInfo("fcntl", FD, Hex, Hex),
+	26:  makeSyscallInfo("inotify_init1", Hex),
+	27:  makeSyscallInfo("inotify_add_watch", Hex, Path, Hex),
+	28:  makeSyscallInfo("inotify_rm_watch", Hex, Hex),
+	29:  makeSyscallInfo("ioctl", FD, Hex, Hex),
+	30:  makeSyscallInfo("ioprio_set", Hex, Hex, Hex),
+	31:  makeSyscallInfo("ioprio_get", Hex, Hex),
+	32:  makeSyscallInfo("flock", FD, Hex),
+	33:  makeSyscallInfo("mknodat", FD, Path, Mode, Hex),
+	34:  makeSyscallInfo("mkdirat", FD, Path, Hex),
+	35:  makeSyscallInfo("unlinkat", FD, Path, Hex),
+	36:  makeSyscallInfo("symlinkat", Path, Hex, Path),
+	37:  makeSyscallInfo("linkat", FD, Path, Hex, Path, Hex),
+	38:  makeSyscallInfo("renameat", FD, Path, Hex, Path),
+	39:  makeSyscallInfo("umount2", Path, Hex),
+	40:  makeSyscallInfo("mount", Path, Path, Path, Hex, Path),
+	41:  makeSyscallInfo("pivot_root", Path, Path),
+	42:  makeSyscallInfo("nfsservctl", Hex, Hex, Hex),
+	43:  makeSyscallInfo("statfs", Path, Hex),
+	44:  makeSyscallInfo("fstatfs", FD, Hex),
+	45:  makeSyscallInfo("truncate", Path, Hex),
+	46:  makeSyscallInfo("ftruncate", FD, Hex),
+	47:  makeSyscallInfo("fallocate", FD, Hex, Hex, Hex),
+	48:  makeSyscallInfo("faccessat", FD, Path, Oct, Hex),
+	49:  makeSyscallInfo("chdir", Path),
+	50:  makeSyscallInfo("fchdir", FD),
+	51:  makeSyscallInfo("chroot", Path),
+	52:  makeSyscallInfo("fchmod", FD, Mode),
+	53:  makeSyscallInfo("fchmodat", FD, Path, Mode),
+	54:  makeSyscallInfo("fchownat", FD, Path, Hex, Hex, Hex),
+	55:  makeSyscallInfo("fchown", FD, Hex, Hex),
+	56:  makeSyscallInfo("openat", FD, Path, OpenFlags, Mode),
+	57:  makeSyscallInfo("close", FD),
+	58:  makeSyscallInfo("vhangup"),
+	59:  makeSyscallInfo("pipe2", PipeFDs, Hex),
+	60:  makeSyscallInfo("quotactl", Hex, Hex, Hex, Hex),
+	61:  makeSyscallInfo("getdents64", FD, Hex, Hex),
+	62:  makeSyscallInfo("lseek", Hex, Hex, Hex),
+	63:  makeSyscallInfo("read", FD, ReadBuffer, Hex),
+	64:  makeSyscallInfo("write", FD, WriteBuffer, Hex),
+	65:  makeSyscallInfo("readv", FD, ReadIOVec, Hex),
+	66:  makeSyscallInfo("writev", FD, WriteIOVec, Hex),
+	67:  makeSyscallInfo("pread64", FD, ReadBuffer, Hex, Hex),
+	68:  makeSyscallInfo("pwrite64", FD, WriteBuffer, Hex, Hex),
+	69:  makeSyscallInfo("preadv", FD, ReadIOVec, Hex, Hex),
+	70:  makeSyscallInfo("pwritev", FD, WriteIOVec, Hex, Hex),
+	71:  makeSyscallInfo("sendfile", FD, FD, Hex, Hex),
+	72:  makeSyscallInfo("pselect6", Hex, Hex, Hex, Hex, Hex, Hex),
+	73:  makeSyscallInfo("ppoll", PollFDs, Hex, Timespec, SigSet, Hex),
+	74:  makeSyscallInfo("signalfd4", Hex, Hex, Hex, Hex),
+	75:  makeSyscallInfo("vmsplice", FD, Hex, Hex, Hex),
+	76:  makeSyscallInfo("splice", FD, Hex, FD, Hex, Hex, Hex),
+	77:  makeSyscallInfo("tee", FD, FD, Hex, Hex),
+	78:  makeSyscallInfo("readlinkat", FD, Path, ReadBuffer, Hex),
+	79:  makeSyscallInfo("fstatat", FD, Path, Stat, Hex),
+	80:  makeSyscallInfo("fstat", FD, Stat),
+	81:  makeSyscallInfo("sync"),
+	82:  makeSyscallInfo("fsync", FD),
+	83:  makeSyscallInfo("fdatasync", FD),
+	84:  makeSyscallInfo("sync_file_range", FD, Hex, Hex, Hex),
+	85:  makeSyscallInfo("timerfd_create", Hex, Hex),
+	86:  makeSyscallInfo("timerfd_settime", FD, Hex, ItimerSpec, PostItimerSpec),
+	87:  makeSyscallInfo("timerfd_gettime", FD, PostItimerSpec),
+	88:  makeSyscallInfo("utimensat", FD, Path, UTimeTimespec, Hex),
+	89:  makeSyscallInfo("acct", Hex),
+	90:  makeSyscallInfo("capget", CapHeader, PostCapData),
+	91:  makeSyscallInfo("capset", CapHeader, CapData),
+	92:  makeSyscallInfo("personality", Hex),
+	93:  makeSyscallInfo("exit", Hex),
+	94:  makeSyscallInfo("exit_group", Hex),
+	95:  makeSyscallInfo("waitid", Hex, Hex, Hex, Hex, Rusage),
+	96:  makeSyscallInfo("set_tid_address", Hex),
+	97:  makeSyscallInfo("unshare", CloneFlags),
+	98:  makeSyscallInfo("futex", Hex, FutexOp, Hex, Timespec, Hex, Hex),
+	99:  makeSyscallInfo("set_robust_list", Hex, Hex),
+	100: makeSyscallInfo("get_robust_list", Hex, Hex, Hex),
+	101: makeSyscallInfo("nanosleep", Timespec, PostTimespec),
+	102: makeSyscallInfo("getitimer", ItimerType, PostItimerVal),
+	103: makeSyscallInfo("setitimer", ItimerType, ItimerVal, PostItimerVal),
+	104: makeSyscallInfo("kexec_load", Hex, Hex, Hex, Hex),
+	105: makeSyscallInfo("init_module", Hex, Hex, Hex),
+	106: makeSyscallInfo("delete_module", Hex, Hex),
+	107: makeSyscallInfo("timer_create", Hex, Hex, Hex),
+	108: makeSyscallInfo("timer_gettime", Hex, PostItimerSpec),
+	109: makeSyscallInfo("timer_getoverrun", Hex),
+	110: makeSyscallInfo("timer_settime", Hex, Hex, ItimerSpec, PostItimerSpec),
+	111: makeSyscallInfo("timer_delete", Hex),
+	112: makeSyscallInfo("clock_settime", Hex, Timespec),
+	113: makeSyscallInfo("clock_gettime", Hex, PostTimespec),
+	114: makeSyscallInfo("clock_getres", Hex, PostTimespec),
+	115: makeSyscallInfo("clock_nanosleep", Hex, Hex, Timespec, PostTimespec),
+	116: makeSyscallInfo("syslog", Hex, Hex, Hex),
+	117: makeSyscallInfo("ptrace", PtraceRequest, Hex, Hex, Hex),
+	118: makeSyscallInfo("sched_setparam", Hex, Hex),
+	119: makeSyscallInfo("sched_setscheduler", Hex, Hex, Hex),
+	120: makeSyscallInfo("sched_getscheduler", Hex),
+	121: makeSyscallInfo("sched_getparam", Hex, Hex),
+	122: makeSyscallInfo("sched_setaffinity", Hex, Hex, Hex),
+	123: makeSyscallInfo("sched_getaffinity", Hex, Hex, Hex),
+	124: makeSyscallInfo("sched_yield"),
+	125: makeSyscallInfo("sched_get_priority_max", Hex),
+	126: makeSyscallInfo("sched_get_priority_min", Hex),
+	127: makeSyscallInfo("sched_rr_get_interval", Hex, Hex),
+	128: makeSyscallInfo("restart_syscall"),
+	129: makeSyscallInfo("kill", Hex, Signal),
+	130: makeSyscallInfo("tkill", Hex, Signal),
+	131: makeSyscallInfo("tgkill", Hex, Hex, Signal),
+	132: makeSyscallInfo("sigaltstack", Hex, Hex),
+	133: makeSyscallInfo("rt_sigsuspend", Hex),
+	134: makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction),
+	135: makeSyscallInfo("rt_sigprocmask", SignalMaskAction, SigSet, PostSigSet, Hex),
+	136: makeSyscallInfo("rt_sigpending", Hex),
+	137: makeSyscallInfo("rt_sigtimedwait", SigSet, Hex, Timespec, Hex),
+	138: makeSyscallInfo("rt_sigqueueinfo", Hex, Signal, Hex),
+	139: makeSyscallInfo("rt_sigreturn"),
+	140: makeSyscallInfo("setpriority", Hex, Hex, Hex),
+	141: makeSyscallInfo("getpriority", Hex, Hex),
+	142: makeSyscallInfo("reboot", Hex, Hex, Hex, Hex),
+	143: makeSyscallInfo("setregid", Hex, Hex),
+	144: makeSyscallInfo("setgid", Hex),
+	145: makeSyscallInfo("setreuid", Hex, Hex),
+	146: makeSyscallInfo("setuid", Hex),
+	147: makeSyscallInfo("setresuid", Hex, Hex, Hex),
+	148: makeSyscallInfo("getresuid", Hex, Hex, Hex),
+	149: makeSyscallInfo("setresgid", Hex, Hex, Hex),
+	150: makeSyscallInfo("getresgid", Hex, Hex, Hex),
+	151: makeSyscallInfo("setfsuid", Hex),
+	152: makeSyscallInfo("setfsgid", Hex),
+	153: makeSyscallInfo("times", Hex),
+	154: makeSyscallInfo("setpgid", Hex, Hex),
+	155: makeSyscallInfo("getpgid", Hex),
+	156: makeSyscallInfo("getsid", Hex),
+	157: makeSyscallInfo("setsid"),
+	158: makeSyscallInfo("getgroups", Hex, Hex),
+	159: makeSyscallInfo("setgroups", Hex, Hex),
+	160: makeSyscallInfo("uname", Uname),
+	161: makeSyscallInfo("sethostname", Hex, Hex),
+	162: makeSyscallInfo("setdomainname", Hex, Hex),
+	163: makeSyscallInfo("getrlimit", Hex, Hex),
+	164: makeSyscallInfo("setrlimit", Hex, Hex),
+	165: makeSyscallInfo("getrusage", Hex, Rusage),
+	166: makeSyscallInfo("umask", Hex),
+	167: makeSyscallInfo("prctl", Hex, Hex, Hex, Hex, Hex),
+	168: makeSyscallInfo("getcpu", Hex, Hex, Hex),
+	169: makeSyscallInfo("gettimeofday", Timeval, Hex),
+	170: makeSyscallInfo("settimeofday", Timeval, Hex),
+	171: makeSyscallInfo("adjtimex", Hex),
+	172: makeSyscallInfo("getpid"),
+	173: makeSyscallInfo("getppid"),
+	174: makeSyscallInfo("getuid"),
+	175: makeSyscallInfo("geteuid"),
+	176: makeSyscallInfo("getgid"),
+	177: makeSyscallInfo("getegid"),
+	178: makeSyscallInfo("gettid"),
+	179: makeSyscallInfo("sysinfo", Hex),
+	180: makeSyscallInfo("mq_open", Hex, Hex, Hex, Hex),
+	181: makeSyscallInfo("mq_unlink", Hex),
+	182: makeSyscallInfo("mq_timedsend", Hex, Hex, Hex, Hex, Hex),
+	183: makeSyscallInfo("mq_timedreceive", Hex, Hex, Hex, Hex, Hex),
+	184: makeSyscallInfo("mq_notify", Hex, Hex),
+	185: makeSyscallInfo("mq_getsetattr", Hex, Hex, Hex),
+	186: makeSyscallInfo("msgget", Hex, Hex),
+	187: makeSyscallInfo("msgctl", Hex, Hex, Hex),
+	188: makeSyscallInfo("msgrcv", Hex, Hex, Hex, Hex, Hex),
+	189: makeSyscallInfo("msgsnd", Hex, Hex, Hex, Hex),
+	190: makeSyscallInfo("semget", Hex, Hex, Hex),
+	191: makeSyscallInfo("semctl", Hex, Hex, Hex, Hex),
+	192: makeSyscallInfo("semtimedop", Hex, Hex, Hex, Hex),
+	193: makeSyscallInfo("semop", Hex, Hex, Hex),
+	194: makeSyscallInfo("shmget", Hex, Hex, Hex),
+	195: makeSyscallInfo("shmctl", Hex, Hex, Hex),
+	196: makeSyscallInfo("shmat", Hex, Hex, Hex),
+	197: makeSyscallInfo("shmdt", Hex),
+	198: makeSyscallInfo("socket", SockFamily, SockType, SockProtocol),
+	199: makeSyscallInfo("socketpair", SockFamily, SockType, SockProtocol, Hex),
+	200: makeSyscallInfo("bind", FD, SockAddr, Hex),
+	201: makeSyscallInfo("listen", FD, Hex),
+	202: makeSyscallInfo("accept", FD, PostSockAddr, SockLen),
+	203: makeSyscallInfo("connect", FD, SockAddr, Hex),
+	204: makeSyscallInfo("getsockname", FD, PostSockAddr, SockLen),
+	205: makeSyscallInfo("getpeername", FD, PostSockAddr, SockLen),
+	206: makeSyscallInfo("sendto", FD, Hex, Hex, Hex, SockAddr, Hex),
+	207: makeSyscallInfo("recvfrom", FD, Hex, Hex, Hex, PostSockAddr, SockLen),
+	208: makeSyscallInfo("setsockopt", FD, Hex, Hex, Hex, Hex),
+	209: makeSyscallInfo("getsockopt", FD, Hex, Hex, Hex, Hex),
+	210: makeSyscallInfo("shutdown", FD, Hex),
+	211: makeSyscallInfo("sendmsg", FD, SendMsgHdr, Hex),
+	212: makeSyscallInfo("recvmsg", FD, RecvMsgHdr, Hex),
+	213: makeSyscallInfo("readahead", Hex, Hex, Hex),
+	214: makeSyscallInfo("brk", Hex),
+	215: makeSyscallInfo("munmap", Hex, Hex),
+	216: makeSyscallInfo("mremap", Hex, Hex, Hex, Hex, Hex),
+	217: makeSyscallInfo("add_key", Hex, Hex, Hex, Hex, Hex),
+	218: makeSyscallInfo("request_key", Hex, Hex, Hex, Hex),
+	219: makeSyscallInfo("keyctl", Hex, Hex, Hex, Hex, Hex),
+	220: makeSyscallInfo("clone", CloneFlags, Hex, Hex, Hex, Hex),
+	221: makeSyscallInfo("execve", Path, ExecveStringVector, ExecveStringVector),
+	222: makeSyscallInfo("mmap", Hex, Hex, Hex, Hex, FD, Hex),
+	223: makeSyscallInfo("fadvise64", FD, Hex, Hex, Hex),
+	224: makeSyscallInfo("swapon", Hex, Hex),
+	225: makeSyscallInfo("swapoff", Hex),
+	226: makeSyscallInfo("mprotect", Hex, Hex, Hex),
+	227: makeSyscallInfo("msync", Hex, Hex, Hex),
+	228: makeSyscallInfo("mlock", Hex, Hex),
+	229: makeSyscallInfo("munlock", Hex, Hex),
+	230: makeSyscallInfo("mlockall", Hex),
+	231: makeSyscallInfo("munlockall"),
+	232: makeSyscallInfo("mincore", Hex, Hex, Hex),
+	233: makeSyscallInfo("madvise", Hex, Hex, Hex),
+	234: makeSyscallInfo("remap_file_pages", Hex, Hex, Hex, Hex, Hex),
+	235: makeSyscallInfo("mbind", Hex, Hex, Hex, Hex, Hex, Hex),
+	236: makeSyscallInfo("get_mempolicy", Hex, Hex, Hex, Hex, Hex),
+	237: makeSyscallInfo("set_mempolicy", Hex, Hex, Hex),
+	238: makeSyscallInfo("migrate_pages", Hex, Hex, Hex, Hex),
+	239: makeSyscallInfo("move_pages", Hex, Hex, Hex, Hex, Hex, Hex),
+	240: makeSyscallInfo("rt_tgsigqueueinfo", Hex, Hex, Signal, Hex),
+	241: makeSyscallInfo("perf_event_open", Hex, Hex, Hex, Hex, Hex),
+	242: makeSyscallInfo("accept4", FD, PostSockAddr, SockLen, SockFlags),
+	243: makeSyscallInfo("recvmmsg", FD, Hex, Hex, Hex, Hex),
+
+	260: makeSyscallInfo("wait4", Hex, Hex, Hex, Rusage),
+	261: makeSyscallInfo("prlimit64", Hex, Hex, Hex, Hex),
+	262: makeSyscallInfo("fanotify_init", Hex, Hex),
+	263: makeSyscallInfo("fanotify_mark", Hex, Hex, Hex, Hex, Hex),
+	264: makeSyscallInfo("name_to_handle_at", FD, Hex, Hex, Hex, Hex),
+	265: makeSyscallInfo("open_by_handle_at", FD, Hex, Hex),
+	266: makeSyscallInfo("clock_adjtime", Hex, Hex),
+	267: makeSyscallInfo("syncfs", FD),
+	268: makeSyscallInfo("setns", FD, Hex),
+	269: makeSyscallInfo("sendmmsg", FD, Hex, Hex, Hex),
+	270: makeSyscallInfo("process_vm_readv", Hex, ReadIOVec, Hex, IOVec, Hex, Hex),
+	271: makeSyscallInfo("process_vm_writev", Hex, IOVec, Hex, WriteIOVec, Hex, Hex),
+	272: makeSyscallInfo("kcmp", Hex, Hex, Hex, Hex, Hex),
+	273: makeSyscallInfo("finit_module", Hex, Hex, Hex),
+	274: makeSyscallInfo("sched_setattr", Hex, Hex, Hex),
+	275: makeSyscallInfo("sched_getattr", Hex, Hex, Hex),
+	276: makeSyscallInfo("renameat2", FD, Path, Hex, Path, Hex),
+	277: makeSyscallInfo("seccomp", Hex, Hex, Hex),
+	278: makeSyscallInfo("getrandom", Hex, Hex, Hex),
+	279: makeSyscallInfo("memfd_create", Path, Hex),
+	280: makeSyscallInfo("bpf", Hex, Hex, Hex),
+	281: makeSyscallInfo("execveat", FD, Path, Hex, Hex, Hex),
+	282: makeSyscallInfo("userfaultfd", Hex),
+	283: makeSyscallInfo("membarrier", Hex),
+	284: makeSyscallInfo("mlock2", Hex, Hex, Hex),
+	285: makeSyscallInfo("copy_file_range", FD, Hex, FD, Hex, Hex, Hex),
+	286: makeSyscallInfo("preadv2", FD, ReadIOVec, Hex, Hex, Hex),
+	287: makeSyscallInfo("pwritev2", FD, WriteIOVec, Hex, Hex, Hex),
+	291: makeSyscallInfo("statx", FD, Path, Hex, Hex, Hex),
+	292: makeSyscallInfo("io_pgetevents", Hex, Hex, Hex, Hex, Timespec, SigSet),
+	293: makeSyscallInfo("rseq", Hex, Hex, Hex, Hex),
+	424: makeSyscallInfo("pidfd_send_signal", FD, Signal, Hex, Hex),
+	425: makeSyscallInfo("io_uring_setup", Hex, Hex),
+	426: makeSyscallInfo("io_uring_enter", FD, Hex, Hex, Hex, SigSet, Hex),
+	427: makeSyscallInfo("io_uring_register", FD, Hex, Hex, Hex),
+	428: makeSyscallInfo("open_tree", FD, Path, Hex),
+	429: makeSyscallInfo("move_mount", FD, Path, FD, Path, Hex),
+	430: makeSyscallInfo("fsopen", Path, Hex), // Not quite a path, but close.
+	431: makeSyscallInfo("fsconfig", FD, Hex, Hex, Hex, Hex),
+	432: makeSyscallInfo("fsmount", FD, Hex, Hex),
+	433: makeSyscallInfo("fspick", FD, Path, Hex),
+	434: makeSyscallInfo("pidfd_open", Hex, Hex),
+	435: makeSyscallInfo("clone3", Hex, Hex),
+}
+
+func init() {
+	syscallTables = append(syscallTables,
+		syscallTable{
+			os:       abi.Linux,
+			arch:     arch.ARM64,
+			syscalls: linuxARM64})
+}
diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go
index e5d486c4e..24e29a2ba 100644
--- a/pkg/sentry/strace/syscalls.go
+++ b/pkg/sentry/strace/syscalls.go
@@ -250,14 +250,7 @@ type syscallTable struct {
 	syscalls SyscallMap
 }
 
-// syscallTables contains all syscall tables.
-var syscallTables = []syscallTable{
-	{
-		os:       abi.Linux,
-		arch:     arch.AMD64,
-		syscalls: linuxAMD64,
-	},
-}
+var syscallTables []syscallTable
 
 // Lookup returns the SyscallMap for the OS/Arch combination. The returned map
 // must not be changed.
-- 
cgit v1.2.3


From eb868be7434600c41fe9aae8313166607869ecb3 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Wed, 4 Dec 2019 16:39:47 +0800
Subject: supporting lazy-fpsimd in guest on Arm64

Several jobs were finished in this patch:
1, provide functions to get/set fpcr/fpsr/vregs
2, support lazy-fpsimd-context-switch in el1

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/ring0/BUILD            |   1 +
 pkg/sentry/platform/ring0/defs_arm64.go    |   3 +
 pkg/sentry/platform/ring0/entry_arm64.s    |  26 +++++++
 pkg/sentry/platform/ring0/lib_arm64.go     |  26 +++++--
 pkg/sentry/platform/ring0/lib_arm64.s      | 118 +++++++++++++++++++++++++++++
 pkg/sentry/platform/ring0/offsets_arm64.go |   1 +
 6 files changed, 169 insertions(+), 6 deletions(-)
 create mode 100644 pkg/sentry/platform/ring0/lib_arm64.s

diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD
index f1af18265..87f4552b5 100644
--- a/pkg/sentry/platform/ring0/BUILD
+++ b/pkg/sentry/platform/ring0/BUILD
@@ -71,6 +71,7 @@ go_library(
         "lib_amd64.go",
         "lib_amd64.s",
         "lib_arm64.go",
+        "lib_arm64.s",
         "ring0.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0",
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go
index fbfbd9bab..dc0eeec01 100644
--- a/pkg/sentry/platform/ring0/defs_arm64.go
+++ b/pkg/sentry/platform/ring0/defs_arm64.go
@@ -73,6 +73,9 @@ type CPUArchState struct {
 
 	// application context pointer.
 	appAddr uintptr
+
+	// lazyVFP is the value of cpacr_el1.
+	lazyVFP uintptr
 }
 
 // ErrorCode returns the last error code.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 29c475882..add2c3e08 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -31,6 +31,11 @@
 #define RSV_REG 	R18_PLATFORM
 #define RSV_REG_APP 	R9
 
+#define FPEN_NOTRAP 	0x3
+#define FPEN_SHIFT 	20
+
+#define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT)
+
 #define REGISTERS_SAVE(reg, offset) \
   MOVD R0, offset+PTRACE_R0(reg); \
   MOVD R1, offset+PTRACE_R1(reg); \
@@ -279,6 +284,16 @@
 #define IRQ_DISABLE \
 	MSR $2, DAIFClr;
 
+#define VFP_ENABLE \
+	MOVD $FPEN_ENABLE, R0; \
+	WORD $0xd5181040; \ //MSR R0, CPACR_EL1
+	ISB $15;
+
+#define VFP_DISABLE \
+	MOVD $0x0, R0; \
+	WORD $0xd5181040; \ //MSR R0, CPACR_EL1
+	ISB $15;
+
 #define KERNEL_ENTRY_FROM_EL0 \
 	SUB $16, RSP, RSP; \		// step1, save r18, r9 into kernel temporary stack.
 	STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \
@@ -318,6 +333,11 @@ TEXT ·Halt(SB),NOSPLIT,$0
 	BNE mmio_exit
 	MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG)
 mmio_exit:
+	// Disable fpsimd.
+	WORD $0xd5381041 // MRS CPACR_EL1, R1
+	MOVD R1, CPU_LAZY_VFP(RSV_REG)
+	VFP_DISABLE
+
 	// MMIO_EXIT.
 	MOVD $0, R9
 	MOVD R0, 0xffff000000001000(R9)
@@ -382,6 +402,8 @@ TEXT ·El1_sync(SB),NOSPLIT,$0
 	BEQ el1_svc
 	CMP $ESR_ELx_EC_BREAKPT_CUR, R24
 	BGE el1_dbg
+	CMP $ESR_ELx_EC_FP_ASIMD, R24
+	BEQ el1_fpsimd_acc
 	B el1_invalid
 
 el1_da:
@@ -402,6 +424,10 @@ el1_svc:
 el1_dbg:
 	B ·Shutdown(SB)
 
+el1_fpsimd_acc:
+	VFP_ENABLE
+	B ·kernelExitToEl1(SB)  // Resume.
+
 el1_invalid:
 	B ·Shutdown(SB)
 
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index 900ee6380..3f3ab5cfb 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -16,10 +16,24 @@
 
 package ring0
 
-// LoadFloatingPoint loads floating point state by the most efficient mechanism
-// available (set by Init).
-var LoadFloatingPoint func(*byte)
+// CPACREL1 returns the value of the CPACR_EL1 register.
+func CPACREL1() (value uintptr)
 
-// SaveFloatingPoint saves floating point state by the most efficient mechanism
-// available (set by Init).
-var SaveFloatingPoint func(*byte)
+// GetFPCR returns the value of FPCR register.
+func GetFPCR() (value uintptr)
+
+// SetFPCR writes the FPCR value.
+func SetFPCR(value uintptr)
+
+// GetFPSR returns the value of FPSR register.
+func GetFPSR() (value uintptr)
+
+// SetFPSR writes the FPSR value.
+func SetFPSR(value uintptr)
+
+// SaveVRegs saves V0-V31 registers.
+// V0-V31: 32 128-bit registers for floating point and simd.
+func SaveVRegs(*byte)
+
+// LoadVRegs loads V0-V31 registers.
+func LoadVRegs(*byte)
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s
new file mode 100644
index 000000000..7c96e8e9c
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_arm64.s
@@ -0,0 +1,118 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+TEXT ·CPACREL1(SB),NOSPLIT,$0-8
+	WORD $0xd5381041 	// MRS CPACR_EL1, R1
+	MOVD R1, ret+0(FP)
+	RET
+
+TEXT ·GetFPCR(SB),NOSPLIT,$0-8
+	WORD $0xd53b4201    	// MRS NZCV, R1
+	MOVD R1, ret+0(FP)
+	RET
+
+TEXT ·GetFPSR(SB),NOSPLIT,$0-8
+	WORD $0xd53b4421   	// MRS FPSR, R1
+	MOVD R1, ret+0(FP)
+	RET
+
+TEXT ·SetFPCR(SB),NOSPLIT,$0-8
+	MOVD addr+0(FP), R1
+	WORD $0xd51b4201  	// MSR R1, NZCV
+	RET
+
+TEXT ·SetFPSR(SB),NOSPLIT,$0-8
+	MOVD addr+0(FP), R1
+	WORD $0xd51b4421   	// MSR R1, FPSR
+	RET
+
+TEXT ·SaveVRegs(SB),NOSPLIT,$0-8
+	MOVD addr+0(FP), R0
+
+	// Skip aarch64_ctx, fpsr, fpcr.
+	FMOVD F0, 16*1(R0)
+	FMOVD F1, 16*2(R0)
+	FMOVD F2, 16*3(R0)
+	FMOVD F3, 16*4(R0)
+	FMOVD F4, 16*5(R0)
+	FMOVD F5, 16*6(R0)
+	FMOVD F6, 16*7(R0)
+	FMOVD F7, 16*8(R0)
+	FMOVD F8, 16*9(R0)
+	FMOVD F9, 16*10(R0)
+	FMOVD F10, 16*11(R0)
+	FMOVD F11, 16*12(R0)
+	FMOVD F12, 16*13(R0)
+	FMOVD F13, 16*14(R0)
+	FMOVD F14, 16*15(R0)
+	FMOVD F15, 16*16(R0)
+	FMOVD F16, 16*17(R0)
+	FMOVD F17, 16*18(R0)
+	FMOVD F18, 16*19(R0)
+	FMOVD F19, 16*20(R0)
+	FMOVD F20, 16*21(R0)
+	FMOVD F21, 16*22(R0)
+	FMOVD F22, 16*23(R0)
+	FMOVD F23, 16*24(R0)
+	FMOVD F24, 16*25(R0)
+	FMOVD F25, 16*26(R0)
+	FMOVD F26, 16*27(R0)
+	FMOVD F27, 16*28(R0)
+	FMOVD F28, 16*29(R0)
+	FMOVD F29, 16*30(R0)
+	FMOVD F30, 16*31(R0)
+	FMOVD F31, 16*32(R0)
+	ISB $15
+
+	RET
+
+TEXT ·LoadVRegs(SB),NOSPLIT,$0-8
+	MOVD addr+0(FP), R0
+
+	// Skip aarch64_ctx, fpsr, fpcr.
+	FMOVD 16*1(R0), F0
+	FMOVD 16*2(R0), F1
+	FMOVD 16*3(R0), F2
+	FMOVD 16*4(R0), F3
+	FMOVD 16*5(R0), F4
+	FMOVD 16*6(R0), F5
+	FMOVD 16*7(R0), F6
+	FMOVD 16*8(R0), F7
+	FMOVD 16*9(R0), F8
+	FMOVD 16*10(R0), F9
+	FMOVD 16*11(R0), F10
+	FMOVD 16*12(R0), F11
+	FMOVD 16*13(R0), F12
+	FMOVD 16*14(R0), F13
+	FMOVD 16*15(R0), F14
+	FMOVD 16*16(R0), F15
+	FMOVD 16*17(R0), F16
+	FMOVD 16*18(R0), F17
+	FMOVD 16*19(R0), F18
+	FMOVD 16*20(R0), F19
+	FMOVD 16*21(R0), F20
+	FMOVD 16*22(R0), F21
+	FMOVD 16*23(R0), F22
+	FMOVD 16*24(R0), F23
+	FMOVD 16*25(R0), F24
+	FMOVD 16*26(R0), F25
+	FMOVD 16*27(R0), F26
+	FMOVD 16*28(R0), F27
+	FMOVD 16*29(R0), F28
+	FMOVD 16*30(R0), F29
+	FMOVD 16*31(R0), F30
+	FMOVD 16*32(R0), F31
+	ISB $15
+
+	RET
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index d7aa1c7cc..cd2a65f97 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -39,6 +39,7 @@ func Emit(w io.Writer) {
 	fmt.Fprintf(w, "#define CPU_TTBR0_APP        0x%02x\n", reflect.ValueOf(&c.ttbr0App).Pointer()-reflect.ValueOf(c).Pointer())
 	fmt.Fprintf(w, "#define CPU_VECTOR_CODE      0x%02x\n", reflect.ValueOf(&c.vecCode).Pointer()-reflect.ValueOf(c).Pointer())
 	fmt.Fprintf(w, "#define CPU_APP_ADDR         0x%02x\n", reflect.ValueOf(&c.appAddr).Pointer()-reflect.ValueOf(c).Pointer())
+	fmt.Fprintf(w, "#define CPU_LAZY_VFP         0x%02x\n", reflect.ValueOf(&c.lazyVFP).Pointer()-reflect.ValueOf(c).Pointer())
 
 	fmt.Fprintf(w, "\n// Bits.\n")
 	fmt.Fprintf(w, "#define _KERNEL_FLAGS        0x%02x\n", KernelFlagsSet)
-- 
cgit v1.2.3


From 803437c96bb4b212dba425f0378ce4f6c0c9fff9 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Wed, 18 Dec 2019 12:20:16 -0800
Subject: Upgrade to Python 3

Python 3 tools must be listed in exec_tools for genrules.

PiperOrigin-RevId: 286241702
---
 vdso/BUILD | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vdso/BUILD b/vdso/BUILD
index 7ceed349e..2b6744c26 100644
--- a/vdso/BUILD
+++ b/vdso/BUILD
@@ -68,14 +68,14 @@ genrule(
           "&& $(location :check_vdso) " +
           "--check-data " +
           "--vdso $(location vdso.so) ",
+    exec_tools = [
+        ":check_vdso",
+    ],
     features = ["-pie"],
     toolchains = [
         "@bazel_tools//tools/cpp:current_cc_toolchain",
         ":no_pie_cc_flags",
     ],
-    tools = [
-        ":check_vdso",
-    ],
     visibility = ["//:sandbox"],
 )
 
@@ -87,6 +87,6 @@ cc_flags_supplier(
 py_binary(
     name = "check_vdso",
     srcs = ["check_vdso.py"],
-    python_version = "PY2",
+    python_version = "PY3",
     visibility = ["//:sandbox"],
 )
-- 
cgit v1.2.3


From 65f53c583364295cbc211b38fae126fb88f08ec0 Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Wed, 18 Dec 2019 12:28:13 -0800
Subject: Put GetSocketPairs() in unnamed namespace

This avoids conflicting definitions of GetSocketPairs() in outer namespace when
multiple such cc files are complied for one binary.

PiperOrigin-RevId: 286243045
---
 test/syscalls/linux/BUILD                          | 1 +
 test/syscalls/linux/socket_ip_loopback_blocking.cc | 2 ++
 test/syscalls/linux/socket_ip_tcp_loopback.cc      | 2 ++
 3 files changed, 5 insertions(+)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index e6568128e..675ff5cdb 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -10,6 +10,7 @@ exports_files(
     [
         "socket.cc",
         "socket_ip_loopback_blocking.cc",
+        "socket_ip_tcp_loopback.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
         "tcp_socket.cc",
         "udp_socket.cc",
diff --git a/test/syscalls/linux/socket_ip_loopback_blocking.cc b/test/syscalls/linux/socket_ip_loopback_blocking.cc
index e58eedaba..fda252dd7 100644
--- a/test/syscalls/linux/socket_ip_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_loopback_blocking.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(
@@ -43,5 +44,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingIPSockets, BlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback.cc b/test/syscalls/linux/socket_ip_tcp_loopback.cc
index 831de53b8..9db3037bc 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -34,5 +35,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllUnixDomainSockets, AllSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 334a513f11f0ecc260abcce549b1f1a74edc2c51 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Wed, 18 Dec 2019 12:59:32 -0800
Subject: Add Mems_allowed to /proc/PID/status

PiperOrigin-RevId: 286248378
---
 pkg/sentry/fs/proc/task.go     | 4 ++++
 pkg/sentry/fsimpl/proc/task.go | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 0e46c5fb7..9bf4b4527 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -604,6 +604,10 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) (
 	fmt.Fprintf(&buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
 	fmt.Fprintf(&buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
 	fmt.Fprintf(&buf, "Seccomp:\t%d\n", s.t.SeccompMode())
+	// We unconditionally report a single NUMA node. See
+	// pkg/sentry/syscalls/linux/sys_mempolicy.go.
+	fmt.Fprintf(&buf, "Mems_allowed:\t1\n")
+	fmt.Fprintf(&buf, "Mems_allowed_list:\t0\n")
 	return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*statusData)(nil)}}, 0
 }
 
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index c46e05c3a..0d87be52b 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -229,6 +229,10 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
 	fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
 	fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode())
+	// We unconditionally report a single NUMA node. See
+	// pkg/sentry/syscalls/linux/sys_mempolicy.go.
+	fmt.Fprintf(buf, "Mems_allowed:\t1\n")
+	fmt.Fprintf(buf, "Mems_allowed_list:\t0\n")
 	return nil
 }
 
-- 
cgit v1.2.3


From 18d6e59b457c8a91bf7db518fbb9193c49d2ee7c Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Wed, 18 Dec 2019 13:06:02 -0800
Subject: Switch to netinet/tcp.h and poll.h to for better platform
 portability.

PiperOrigin-RevId: 286249699
---
 test/syscalls/linux/BUILD                   |  1 +
 test/syscalls/linux/socket_inet_loopback.cc | 67 +++++++++++++++--------------
 2 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 675ff5cdb..064ce8429 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -9,6 +9,7 @@ package(
 exports_files(
     [
         "socket.cc",
+        "socket_inet_loopback.cc",
         "socket_ip_loopback_blocking.cc",
         "socket_ip_tcp_loopback.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 761c3a9fe..5bb9d2e99 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -13,12 +13,10 @@
 // limitations under the License.
 
 #include <arpa/inet.h>
-#include <linux/tcp.h>
 #include <netinet/in.h>
+#include <netinet/tcp.h>
 #include <poll.h>
 #include <string.h>
-#include <sys/epoll.h>
-#include <sys/socket.h>
 
 #include <atomic>
 #include <iostream>
@@ -46,6 +44,8 @@ namespace testing {
 
 namespace {
 
+using ::testing::Gt;
+
 PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) {
   switch (family) {
     case AF_INET:
@@ -976,41 +976,44 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
                 SyscallSucceedsWithValue(sizeof(i)));
   }
 
-  int epollfd;
-  ASSERT_THAT(epollfd = epoll_create1(0), SyscallSucceeds());
-
+  struct pollfd pollfds[kThreadCount];
   for (int i = 0; i < kThreadCount; i++) {
-    int fd = listener_fds[i].get();
-    struct epoll_event ev;
-    ev.data.fd = fd;
-    ev.events = EPOLLIN;
-    ASSERT_THAT(epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev), SyscallSucceeds());
+    pollfds[i].fd = listener_fds[i].get();
+    pollfds[i].events = POLLIN;
   }
 
   std::map<uint16_t, int> portToFD;
 
-  for (int i = 0; i < kConnectAttempts * 2; i++) {
-    struct sockaddr_storage addr = {};
-    socklen_t addrlen = sizeof(addr);
-    struct epoll_event ev;
-    int data, fd;
+  int received = 0;
+  while (received < kConnectAttempts * 2) {
+    ASSERT_THAT(poll(pollfds, kThreadCount, -1),
+                SyscallSucceedsWithValue(Gt(0)));
 
-    ASSERT_THAT(epoll_wait(epollfd, &ev, 1, -1), SyscallSucceedsWithValue(1));
+    for (int i = 0; i < kThreadCount; i++) {
+      if ((pollfds[i].revents & POLLIN) == 0) {
+        continue;
+      }
 
-    fd = ev.data.fd;
-    EXPECT_THAT(RetryEINTR(recvfrom)(fd, &data, sizeof(data), 0,
-                                     reinterpret_cast<struct sockaddr*>(&addr),
-                                     &addrlen),
-                SyscallSucceedsWithValue(sizeof(data)));
-    uint16_t const port =
-        ASSERT_NO_ERRNO_AND_VALUE(AddrPort(connector.family(), addr));
-    auto prev_port = portToFD.find(port);
-    // Check that all packets from one client have been delivered to the same
-    // server socket.
-    if (prev_port == portToFD.end()) {
-      portToFD[port] = ev.data.fd;
-    } else {
-      EXPECT_EQ(portToFD[port], ev.data.fd);
+      received++;
+
+      const int fd = pollfds[i].fd;
+      struct sockaddr_storage addr = {};
+      socklen_t addrlen = sizeof(addr);
+      int data;
+      EXPECT_THAT(RetryEINTR(recvfrom)(
+                      fd, &data, sizeof(data), 0,
+                      reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+                  SyscallSucceedsWithValue(sizeof(data)));
+      uint16_t const port =
+          ASSERT_NO_ERRNO_AND_VALUE(AddrPort(connector.family(), addr));
+      auto prev_port = portToFD.find(port);
+      // Check that all packets from one client have been delivered to the
+      // same server socket.
+      if (prev_port == portToFD.end()) {
+        portToFD[port] = fd;
+      } else {
+        EXPECT_EQ(portToFD[port], fd);
+      }
     }
   }
 }
@@ -1897,7 +1900,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, NoReusePortFollowingReusePort) {
 }
 
 INSTANTIATE_TEST_SUITE_P(
-    AllFamlies, SocketMultiProtocolInetLoopbackTest,
+    AllFamilies, SocketMultiProtocolInetLoopbackTest,
     ::testing::Values(ProtocolTestParam{"TCP", SOCK_STREAM},
                       ProtocolTestParam{"UDP", SOCK_DGRAM}),
     DescribeProtocolTestParam);
-- 
cgit v1.2.3


From 628948b1e197010177acc08ddc9f93d9925fba6b Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 18 Dec 2019 13:35:52 -0800
Subject: Cleanup NDP Tests

This change makes sure that test variables are captured before running tests
in parallel, and removes unneeded buffered channel allocations. This change also
removes unnecessary timeouts.

PiperOrigin-RevId: 286255066
---
 pkg/tcpip/stack/ndp_test.go | 625 ++++++++++++++++++++++----------------------
 1 file changed, 308 insertions(+), 317 deletions(-)

diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index d8e7ce67e..9f589a471 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -79,7 +79,7 @@ func TestDADDisabled(t *testing.T) {
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 	}
 
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(opts)
 	if err := s.CreateNIC(1, e); err != nil {
 		t.Fatalf("CreateNIC(_) = %s", err)
@@ -330,6 +330,8 @@ func TestDADResolve(t *testing.T) {
 	}
 
 	for _, test := range tests {
+		test := test
+
 		t.Run(test.name, func(t *testing.T) {
 			t.Parallel()
 
@@ -520,7 +522,7 @@ func TestDADFail(t *testing.T) {
 			}
 			opts.NDPConfigs.RetransmitTimer = time.Second * 2
 
-			e := channel.New(10, 1280, linkAddr1)
+			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(opts)
 			if err := s.CreateNIC(1, e); err != nil {
 				t.Fatalf("CreateNIC(_) = %s", err)
@@ -601,7 +603,7 @@ func TestDADStop(t *testing.T) {
 		NDPConfigs:       ndpConfigs,
 	}
 
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(opts)
 	if err := s.CreateNIC(1, e); err != nil {
 		t.Fatalf("CreateNIC(_) = %s", err)
@@ -702,7 +704,7 @@ func TestSetNDPConfigurations(t *testing.T) {
 			ndpDisp := ndpDispatcher{
 				dadC: make(chan ndpDADEvent),
 			}
-			e := channel.New(10, 1280, linkAddr1)
+			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 				NDPDisp:          &ndpDisp,
@@ -903,8 +905,6 @@ func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, on
 // TestNoRouterDiscovery tests that router discovery will not be performed if
 // configured not to.
 func TestNoRouterDiscovery(t *testing.T) {
-	t.Parallel()
-
 	// Being configured to discover routers means handle and
 	// discover are set to true and forwarding is set to false.
 	// This tests all possible combinations of the configurations,
@@ -920,9 +920,9 @@ func TestNoRouterDiscovery(t *testing.T) {
 			t.Parallel()
 
 			ndpDisp := ndpDispatcher{
-				routerC: make(chan ndpRouterEvent, 10),
+				routerC: make(chan ndpRouterEvent, 1),
 			}
-			e := channel.New(10, 1280, linkAddr1)
+			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 				NDPConfigs: stack.NDPConfigurations{
@@ -942,21 +942,27 @@ func TestNoRouterDiscovery(t *testing.T) {
 			select {
 			case <-ndpDisp.routerC:
 				t.Fatal("unexpectedly discovered a router when configured not to")
-			case <-time.After(defaultTimeout):
+			default:
 			}
 		})
 	}
 }
 
+// Check e to make sure that the event is for addr on nic with ID 1, and the
+// discovered flag set to discovered.
+func checkRouterEvent(e ndpRouterEvent, addr tcpip.Address, discovered bool) string {
+	return cmp.Diff(ndpRouterEvent{nicID: 1, addr: addr, discovered: discovered}, e, cmp.AllowUnexported(e))
+}
+
 // TestRouterDiscoveryDispatcherNoRemember tests that the stack does not
 // remember a discovered router when the dispatcher asks it not to.
 func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
 	t.Parallel()
 
 	ndpDisp := ndpDispatcher{
-		routerC: make(chan ndpRouterEvent, 10),
+		routerC: make(chan ndpRouterEvent, 1),
 	}
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 		NDPConfigs: stack.NDPConfigurations{
@@ -979,41 +985,35 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
 	}
 	s.SetRouteTable(routeTable)
 
-	// Rx an RA with short lifetime.
-	lifetime := time.Duration(1)
-	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, uint16(lifetime)))
+	// Receive an RA for a router we should not remember.
+	const lifetimeSeconds = 1
+	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, lifetimeSeconds))
 	select {
-	case r := <-ndpDisp.routerC:
-		if r.nicID != 1 {
-			t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
-		}
-		if r.addr != llAddr2 {
-			t.Fatalf("got r.addr = %s, want = %s", r.addr, llAddr2)
-		}
-		if !r.discovered {
-			t.Fatal("got r.discovered = false, want = true")
+	case e := <-ndpDisp.routerC:
+		if diff := checkRouterEvent(e, llAddr2, true); diff != "" {
+			t.Errorf("router event mismatch (-want +got):\n%s", diff)
 		}
-	case <-time.After(defaultTimeout):
-		t.Fatal("timeout waiting for router discovery event")
+	default:
+		t.Fatal("expected router discovery event")
 	}
 
 	// Original route table should not have been modified.
-	if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable)
+	if diff := cmp.Diff(routeTable, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
-	// Wait for the normal invalidation time plus an extra second to
-	// make sure we do not actually receive any invalidation events as
-	// we should not have remembered the router in the first place.
+	// Wait for the invalidation time plus some buffer to make sure we do
+	// not actually receive any invalidation events as we should not have
+	// remembered the router in the first place.
 	select {
 	case <-ndpDisp.routerC:
 		t.Fatal("should not have received any router events")
-	case <-time.After(lifetime*time.Second + defaultTimeout):
+	case <-time.After(lifetimeSeconds*time.Second + defaultTimeout):
 	}
 
 	// Original route table should not have been modified.
-	if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable)
+	if diff := cmp.Diff(routeTable, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 }
 
@@ -1021,10 +1021,10 @@ func TestRouterDiscovery(t *testing.T) {
 	t.Parallel()
 
 	ndpDisp := ndpDispatcher{
-		routerC:        make(chan ndpRouterEvent, 10),
+		routerC:        make(chan ndpRouterEvent, 1),
 		rememberRouter: true,
 	}
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 		NDPConfigs: stack.NDPConfigurations{
@@ -1034,22 +1034,29 @@ func TestRouterDiscovery(t *testing.T) {
 		NDPDisp: &ndpDisp,
 	})
 
-	waitForEvent := func(addr tcpip.Address, discovered bool, timeout time.Duration) {
+	expectRouterEvent := func(addr tcpip.Address, discovered bool) {
 		t.Helper()
 
 		select {
-		case r := <-ndpDisp.routerC:
-			if r.nicID != 1 {
-				t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+		case e := <-ndpDisp.routerC:
+			if diff := checkRouterEvent(e, addr, discovered); diff != "" {
+				t.Errorf("router event mismatch (-want +got):\n%s", diff)
 			}
-			if r.addr != addr {
-				t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
-			}
-			if r.discovered != discovered {
-				t.Fatalf("got r.discovered = %t, want = %t", r.discovered, discovered)
+		default:
+			t.Fatal("expected router discovery event")
+		}
+	}
+
+	expectAsyncRouterInvalidationEvent := func(addr tcpip.Address, timeout time.Duration) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.routerC:
+			if diff := checkRouterEvent(e, addr, false); diff != "" {
+				t.Errorf("router event mismatch (-want +got):\n%s", diff)
 			}
 		case <-time.After(timeout):
-			t.Fatal("timeout waiting for router discovery event")
+			t.Fatal("timed out waiting for router discovery event")
 		}
 	}
 
@@ -1063,26 +1070,27 @@ func TestRouterDiscovery(t *testing.T) {
 	select {
 	case <-ndpDisp.routerC:
 		t.Fatal("unexpectedly discovered a router with 0 lifetime")
-	case <-time.After(defaultTimeout):
+	default:
 	}
 
 	// Rx an RA from lladdr2 with a huge lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000))
-	waitForEvent(llAddr2, true, defaultTimeout)
+	expectRouterEvent(llAddr2, true)
 
 	// Should have a default route through the discovered router.
-	if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}}; !cmp.Equal(got, want) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	if diff := cmp.Diff([]tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}}, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
 	// Rx an RA from another router (lladdr3) with non-zero lifetime.
 	l3Lifetime := time.Duration(6)
 	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr3, uint16(l3Lifetime)))
-	waitForEvent(llAddr3, true, defaultTimeout)
+	expectRouterEvent(llAddr3, true)
 
 	// Should have default routes through the discovered routers.
-	if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}, {header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	want := []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}, {header.IPv6EmptySubnet, llAddr3, 1}}
+	if diff := cmp.Diff(want, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
 	// Rx an RA from lladdr2 with lesser lifetime.
@@ -1091,12 +1099,12 @@ func TestRouterDiscovery(t *testing.T) {
 	select {
 	case <-ndpDisp.routerC:
 		t.Fatal("Should not receive a router event when updating lifetimes for known routers")
-	case <-time.After(defaultTimeout):
+	default:
 	}
 
 	// Should still have a default route through the discovered routers.
-	if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}, {header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	if diff := cmp.Diff(want, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
 	// Wait for lladdr2's router invalidation timer to fire. The lifetime
@@ -1106,30 +1114,30 @@ func TestRouterDiscovery(t *testing.T) {
 	// Wait for the normal lifetime plus an extra bit for the
 	// router to get invalidated. If we don't get an invalidation
 	// event after this time, then something is wrong.
-	waitForEvent(llAddr2, false, l2Lifetime*time.Second+defaultTimeout)
+	expectAsyncRouterInvalidationEvent(llAddr2, l2Lifetime*time.Second+defaultTimeout)
 
 	// Should no longer have the default route through lladdr2.
-	if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	if diff := cmp.Diff([]tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}}, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
 	// Rx an RA from lladdr2 with huge lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000))
-	waitForEvent(llAddr2, true, defaultTimeout)
+	expectRouterEvent(llAddr2, true)
 
 	// Should have a default route through the discovered routers.
-	if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}, {header.IPv6EmptySubnet, llAddr2, 1}}; !cmp.Equal(got, want) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	if diff := cmp.Diff([]tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}, {header.IPv6EmptySubnet, llAddr2, 1}}, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
 	// Rx an RA from lladdr2 with zero lifetime. It should be invalidated.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 0))
-	waitForEvent(llAddr2, false, defaultTimeout)
+	expectRouterEvent(llAddr2, false)
 
 	// Should have deleted the default route through the router that just
 	// got invalidated.
-	if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	if diff := cmp.Diff([]tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}}, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
 	// Wait for lladdr3's router invalidation timer to fire. The lifetime
@@ -1139,7 +1147,7 @@ func TestRouterDiscovery(t *testing.T) {
 	// Wait for the normal lifetime plus an extra bit for the
 	// router to get invalidated. If we don't get an invalidation
 	// event after this time, then something is wrong.
-	waitForEvent(llAddr3, false, l3Lifetime*time.Second+defaultTimeout)
+	expectAsyncRouterInvalidationEvent(llAddr3, l3Lifetime*time.Second+defaultTimeout)
 
 	// Should not have any routes now that all discovered routers have been
 	// invalidated.
@@ -1154,10 +1162,10 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
 	t.Parallel()
 
 	ndpDisp := ndpDispatcher{
-		routerC:        make(chan ndpRouterEvent, 10),
+		routerC:        make(chan ndpRouterEvent, 1),
 		rememberRouter: true,
 	}
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 		NDPConfigs: stack.NDPConfigurations{
@@ -1184,41 +1192,33 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
 		if i <= stack.MaxDiscoveredDefaultRouters {
 			expectedRt[i-1] = tcpip.Route{header.IPv6EmptySubnet, llAddr, 1}
 			select {
-			case r := <-ndpDisp.routerC:
-				if r.nicID != 1 {
-					t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
-				}
-				if r.addr != llAddr {
-					t.Fatalf("got r.addr = %s, want = %s", r.addr, llAddr)
-				}
-				if !r.discovered {
-					t.Fatal("got r.discovered = false, want = true")
+			case e := <-ndpDisp.routerC:
+				if diff := checkRouterEvent(e, llAddr, true); diff != "" {
+					t.Errorf("router event mismatch (-want +got):\n%s", diff)
 				}
-			case <-time.After(defaultTimeout):
-				t.Fatal("timeout waiting for router discovery event")
+			default:
+				t.Fatal("expected router discovery event")
 			}
 
 		} else {
 			select {
 			case <-ndpDisp.routerC:
 				t.Fatal("should not have discovered a new router after we already discovered the max number of routers")
-			case <-time.After(defaultTimeout):
+			default:
 			}
 		}
 	}
 
 	// Should only have default routes for the first
 	// stack.MaxDiscoveredDefaultRouters discovered routers.
-	if got := s.GetRouteTable(); !cmp.Equal(got, expectedRt[:]) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, expectedRt)
+	if diff := cmp.Diff(expectedRt[:], s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 }
 
 // TestNoPrefixDiscovery tests that prefix discovery will not be performed if
 // configured not to.
 func TestNoPrefixDiscovery(t *testing.T) {
-	t.Parallel()
-
 	prefix := tcpip.AddressWithPrefix{
 		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
 		PrefixLen: 64,
@@ -1239,9 +1239,9 @@ func TestNoPrefixDiscovery(t *testing.T) {
 			t.Parallel()
 
 			ndpDisp := ndpDispatcher{
-				prefixC: make(chan ndpPrefixEvent, 10),
+				prefixC: make(chan ndpPrefixEvent, 1),
 			}
-			e := channel.New(10, 1280, linkAddr1)
+			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 				NDPConfigs: stack.NDPConfigurations{
@@ -1262,12 +1262,18 @@ func TestNoPrefixDiscovery(t *testing.T) {
 			select {
 			case <-ndpDisp.prefixC:
 				t.Fatal("unexpectedly discovered a prefix when configured not to")
-			case <-time.After(defaultTimeout):
+			default:
 			}
 		})
 	}
 }
 
+// Check e to make sure that the event is for prefix on nic with ID 1, and the
+// discovered flag set to discovered.
+func checkPrefixEvent(e ndpPrefixEvent, prefix tcpip.Subnet, discovered bool) string {
+	return cmp.Diff(ndpPrefixEvent{nicID: 1, prefix: prefix, discovered: discovered}, e, cmp.AllowUnexported(e))
+}
+
 // TestPrefixDiscoveryDispatcherNoRemember tests that the stack does not
 // remember a discovered on-link prefix when the dispatcher asks it not to.
 func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
@@ -1276,9 +1282,9 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
 	prefix, subnet, _ := prefixSubnetAddr(0, "")
 
 	ndpDisp := ndpDispatcher{
-		prefixC: make(chan ndpPrefixEvent, 10),
+		prefixC: make(chan ndpPrefixEvent, 1),
 	}
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 		NDPConfigs: stack.NDPConfigurations{
@@ -1302,41 +1308,35 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
 	}
 	s.SetRouteTable(routeTable)
 
-	// Rx an RA with prefix with a short lifetime.
-	const lifetime = 1
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, lifetime, 0))
+	// Receive an RA with prefix that we should not remember.
+	const lifetimeSeconds = 1
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, lifetimeSeconds, 0))
 	select {
-	case r := <-ndpDisp.prefixC:
-		if r.nicID != 1 {
-			t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+	case e := <-ndpDisp.prefixC:
+		if diff := checkPrefixEvent(e, subnet, true); diff != "" {
+			t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
 		}
-		if r.prefix != subnet {
-			t.Fatalf("got r.prefix = %s, want = %s", r.prefix, subnet)
-		}
-		if !r.discovered {
-			t.Fatal("got r.discovered = false, want = true")
-		}
-	case <-time.After(defaultTimeout):
-		t.Fatal("timeout waiting for prefix discovery event")
+	default:
+		t.Fatal("expected prefix discovery event")
 	}
 
 	// Original route table should not have been modified.
-	if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable)
+	if diff := cmp.Diff(routeTable, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
-	// Wait for the normal invalidation time plus some buffer to
-	// make sure we do not actually receive any invalidation events as
-	// we should not have remembered the prefix in the first place.
+	// Wait for the invalidation time plus some buffer to make sure we do
+	// not actually receive any invalidation events as we should not have
+	// remembered the prefix in the first place.
 	select {
 	case <-ndpDisp.prefixC:
 		t.Fatal("should not have received any prefix events")
-	case <-time.After(lifetime*time.Second + defaultTimeout):
+	case <-time.After(lifetimeSeconds*time.Second + defaultTimeout):
 	}
 
 	// Original route table should not have been modified.
-	if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable)
+	if diff := cmp.Diff(routeTable, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 }
 
@@ -1348,10 +1348,10 @@ func TestPrefixDiscovery(t *testing.T) {
 	prefix3, subnet3, _ := prefixSubnetAddr(2, "")
 
 	ndpDisp := ndpDispatcher{
-		prefixC:        make(chan ndpPrefixEvent, 10),
+		prefixC:        make(chan ndpPrefixEvent, 1),
 		rememberPrefix: true,
 	}
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 		NDPConfigs: stack.NDPConfigurations{
@@ -1361,73 +1361,68 @@ func TestPrefixDiscovery(t *testing.T) {
 		NDPDisp: &ndpDisp,
 	})
 
-	waitForEvent := func(subnet tcpip.Subnet, discovered bool, timeout time.Duration) {
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	expectPrefixEvent := func(prefix tcpip.Subnet, discovered bool) {
 		t.Helper()
 
 		select {
-		case r := <-ndpDisp.prefixC:
-			if r.nicID != 1 {
-				t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
-			}
-			if r.prefix != subnet {
-				t.Fatalf("got r.prefix = %s, want = %s", r.prefix, subnet)
+		case e := <-ndpDisp.prefixC:
+			if diff := checkPrefixEvent(e, prefix, discovered); diff != "" {
+				t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
 			}
-			if r.discovered != discovered {
-				t.Fatalf("got r.discovered = %t, want = %t", r.discovered, discovered)
-			}
-		case <-time.After(timeout):
-			t.Fatal("timeout waiting for prefix discovery event")
+		default:
+			t.Fatal("expected prefix discovery event")
 		}
 	}
 
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(1) = %s", err)
-	}
-
 	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
 	// with zero valid lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 0, 0))
 	select {
 	case <-ndpDisp.prefixC:
 		t.Fatal("unexpectedly discovered a prefix with 0 lifetime")
-	case <-time.After(defaultTimeout):
+	default:
 	}
 
 	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
 	// with non-zero lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 100, 0))
-	waitForEvent(subnet1, true, defaultTimeout)
+	expectPrefixEvent(subnet1, true)
 
 	// Should have added a device route for subnet1 through the nic.
-	if got, want := s.GetRouteTable(), []tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	if diff := cmp.Diff([]tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}}, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
 	// Receive an RA with prefix2 in a PI.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, false, 100, 0))
-	waitForEvent(subnet2, true, defaultTimeout)
+	expectPrefixEvent(subnet2, true)
 
 	// Should have added a device route for subnet2 through the nic.
-	if got, want := s.GetRouteTable(), []tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}, {subnet2, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	if diff := cmp.Diff([]tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}, {subnet2, tcpip.Address([]byte(nil)), 1}}, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
 	// Receive an RA with prefix3 in a PI.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 100, 0))
-	waitForEvent(subnet3, true, defaultTimeout)
+	expectPrefixEvent(subnet3, true)
 
 	// Should have added a device route for subnet3 through the nic.
-	if got, want := s.GetRouteTable(), []tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}, {subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	if diff := cmp.Diff([]tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}, {subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
 	// Receive an RA with prefix1 in a PI with lifetime = 0.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 0, 0))
-	waitForEvent(subnet1, false, defaultTimeout)
+	expectPrefixEvent(subnet1, false)
 
 	// Should have removed the device route for subnet1 through the nic.
-	if got, want := s.GetRouteTable(), []tcpip.Route{{subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	want := []tcpip.Route{{subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}
+	if diff := cmp.Diff(want, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
 	// Receive an RA with prefix2 in a PI with lesser lifetime.
@@ -1436,26 +1431,33 @@ func TestPrefixDiscovery(t *testing.T) {
 	select {
 	case <-ndpDisp.prefixC:
 		t.Fatal("unexpectedly received prefix event when updating lifetime")
-	case <-time.After(defaultTimeout):
+	default:
 	}
 
 	// Should not have updated route table.
-	if got, want := s.GetRouteTable(), []tcpip.Route{{subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	if diff := cmp.Diff(want, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
 	// Wait for prefix2's most recent invalidation timer plus some buffer to
 	// expire.
-	waitForEvent(subnet2, false, time.Duration(lifetime)*time.Second+defaultTimeout)
+	select {
+	case e := <-ndpDisp.prefixC:
+		if diff := checkPrefixEvent(e, subnet2, false); diff != "" {
+			t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(time.Duration(lifetime)*time.Second + defaultTimeout):
+		t.Fatal("timed out waiting for prefix discovery event")
+	}
 
 	// Should have removed the device route for subnet2 through the nic.
-	if got, want := s.GetRouteTable(), []tcpip.Route{{subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+	if diff := cmp.Diff([]tcpip.Route{{subnet3, tcpip.Address([]byte(nil)), 1}}, s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 
 	// Receive RA to invalidate prefix3.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 0, 0))
-	waitForEvent(subnet3, false, defaultTimeout)
+	expectPrefixEvent(subnet3, false)
 
 	// Should not have any routes.
 	if got := len(s.GetRouteTable()); got != 0 {
@@ -1482,10 +1484,10 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 	subnet := prefix.Subnet()
 
 	ndpDisp := ndpDispatcher{
-		prefixC:        make(chan ndpPrefixEvent, 10),
+		prefixC:        make(chan ndpPrefixEvent, 1),
 		rememberPrefix: true,
 	}
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 		NDPConfigs: stack.NDPConfigurations{
@@ -1495,33 +1497,27 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 		NDPDisp: &ndpDisp,
 	})
 
-	waitForEvent := func(discovered bool, timeout time.Duration) {
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	expectPrefixEvent := func(prefix tcpip.Subnet, discovered bool) {
 		t.Helper()
 
 		select {
-		case r := <-ndpDisp.prefixC:
-			if r.nicID != 1 {
-				t.Errorf("got r.nicID = %d, want = 1", r.nicID)
-			}
-			if r.prefix != subnet {
-				t.Errorf("got r.prefix = %s, want = %s", r.prefix, subnet)
-			}
-			if r.discovered != discovered {
-				t.Errorf("got r.discovered = %t, want = %t", r.discovered, discovered)
+		case e := <-ndpDisp.prefixC:
+			if diff := checkPrefixEvent(e, prefix, discovered); diff != "" {
+				t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
 			}
-		case <-time.After(timeout):
-			t.Fatal("timeout waiting for prefix discovery event")
+		default:
+			t.Fatal("expected prefix discovery event")
 		}
 	}
 
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(1) = %s", err)
-	}
-
 	// Receive an RA with prefix in an NDP Prefix Information option (PI)
 	// with infinite valid lifetime which should not get invalidated.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds, 0))
-	waitForEvent(true, defaultTimeout)
+	expectPrefixEvent(subnet, true)
 	select {
 	case <-ndpDisp.prefixC:
 		t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
@@ -1531,11 +1527,18 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 	// Receive an RA with finite lifetime.
 	// The prefix should get invalidated after 1s.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0))
-	waitForEvent(false, testInfiniteLifetime)
+	select {
+	case e := <-ndpDisp.prefixC:
+		if diff := checkPrefixEvent(e, subnet, false); diff != "" {
+			t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(testInfiniteLifetime):
+		t.Fatal("timed out waiting for prefix discovery event")
+	}
 
 	// Receive an RA with finite lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0))
-	waitForEvent(true, defaultTimeout)
+	expectPrefixEvent(subnet, true)
 
 	// Receive an RA with prefix with an infinite lifetime.
 	// The prefix should not be invalidated.
@@ -1558,7 +1561,7 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 	// Receive an RA with 0 lifetime.
 	// The prefix should get invalidated.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, 0, 0))
-	waitForEvent(false, defaultTimeout)
+	expectPrefixEvent(subnet, false)
 }
 
 // TestPrefixDiscoveryMaxRouters tests that only
@@ -1570,7 +1573,7 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
 		prefixC:        make(chan ndpPrefixEvent, stack.MaxDiscoveredOnLinkPrefixes+3),
 		rememberPrefix: true,
 	}
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 		NDPConfigs: stack.NDPConfigurations{
@@ -1616,32 +1619,26 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
 	for i := 0; i < stack.MaxDiscoveredOnLinkPrefixes+2; i++ {
 		if i < stack.MaxDiscoveredOnLinkPrefixes {
 			select {
-			case r := <-ndpDisp.prefixC:
-				if r.nicID != 1 {
-					t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
-				}
-				if r.prefix != prefixes[i] {
-					t.Fatalf("got r.prefix = %s, want = %s", r.prefix, prefixes[i])
+			case e := <-ndpDisp.prefixC:
+				if diff := checkPrefixEvent(e, prefixes[i], true); diff != "" {
+					t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
 				}
-				if !r.discovered {
-					t.Fatal("got r.discovered = false, want = true")
-				}
-			case <-time.After(defaultTimeout):
-				t.Fatal("timeout waiting for prefix discovery event")
+			default:
+				t.Fatal("expected prefix discovery event")
 			}
 		} else {
 			select {
 			case <-ndpDisp.prefixC:
 				t.Fatal("should not have discovered a new prefix after we already discovered the max number of prefixes")
-			case <-time.After(defaultTimeout):
+			default:
 			}
 		}
 	}
 
 	// Should only have device routes for the first
 	// stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes.
-	if got := s.GetRouteTable(); !cmp.Equal(got, expectedRt[:]) {
-		t.Fatalf("got GetRouteTable = %v, want = %v", got, expectedRt)
+	if diff := cmp.Diff(expectedRt[:], s.GetRouteTable()); diff != "" {
+		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
 	}
 }
 
@@ -1663,8 +1660,6 @@ func contains(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool {
 
 // TestNoAutoGenAddr tests that SLAAC is not performed when configured not to.
 func TestNoAutoGenAddr(t *testing.T) {
-	t.Parallel()
-
 	prefix, _, _ := prefixSubnetAddr(0, "")
 
 	// Being configured to auto-generate addresses means handle and
@@ -1682,9 +1677,9 @@ func TestNoAutoGenAddr(t *testing.T) {
 			t.Parallel()
 
 			ndpDisp := ndpDispatcher{
-				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
 			}
-			e := channel.New(10, 1280, linkAddr1)
+			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 				NDPConfigs: stack.NDPConfigurations{
@@ -1705,12 +1700,18 @@ func TestNoAutoGenAddr(t *testing.T) {
 			select {
 			case <-ndpDisp.autoGenAddrC:
 				t.Fatal("unexpectedly auto-generated an address when configured not to")
-			case <-time.After(defaultTimeout):
+			default:
 			}
 		})
 	}
 }
 
+// Check e to make sure that the event is for addr on nic with ID 1, and the
+// event type is set to eventType.
+func checkAutoGenAddrEvent(e ndpAutoGenAddrEvent, addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) string {
+	return cmp.Diff(ndpAutoGenAddrEvent{nicID: 1, addr: addr, eventType: eventType}, e, cmp.AllowUnexported(e))
+}
+
 // TestAutoGenAddr tests that an address is properly generated and invalidated
 // when configured to do so.
 func TestAutoGenAddr(t *testing.T) {
@@ -1726,9 +1727,9 @@ func TestAutoGenAddr(t *testing.T) {
 	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
 
 	ndpDisp := ndpDispatcher{
-		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
 	}
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 		NDPConfigs: stack.NDPConfigurations{
@@ -1738,42 +1739,36 @@ func TestAutoGenAddr(t *testing.T) {
 		NDPDisp: &ndpDisp,
 	})
 
-	waitForEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
 		t.Helper()
 
 		select {
-		case r := <-ndpDisp.autoGenAddrC:
-			if r.nicID != 1 {
-				t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
 			}
-			if r.addr != addr {
-				t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
-			}
-			if r.eventType != eventType {
-				t.Fatalf("got r.eventType = %v, want = %v", r.eventType, eventType)
-			}
-		case <-time.After(timeout):
-			t.Fatal("timeout waiting for addr auto gen event")
+		default:
+			t.Fatal("expected addr auto gen event")
 		}
 	}
 
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(1) = %s", err)
-	}
-
 	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
 	// with zero valid lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 0, 0))
 	select {
 	case <-ndpDisp.autoGenAddrC:
 		t.Fatal("unexpectedly auto-generated an address with 0 lifetime")
-	case <-time.After(defaultTimeout):
+	default:
 	}
 
 	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
 	// with non-zero lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
-	waitForEvent(addr1, newAddr, defaultTimeout)
+	expectAutoGenAddrEvent(addr1, newAddr)
 	if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
 	}
@@ -1784,12 +1779,12 @@ func TestAutoGenAddr(t *testing.T) {
 	select {
 	case <-ndpDisp.autoGenAddrC:
 		t.Fatal("unexpectedly auto-generated an address with preferred lifetime > valid lifetime")
-	case <-time.After(defaultTimeout):
+	default:
 	}
 
 	// Receive an RA with prefix2 in a PI.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
-	waitForEvent(addr2, newAddr, defaultTimeout)
+	expectAutoGenAddrEvent(addr2, newAddr)
 	if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
 	}
@@ -1802,11 +1797,18 @@ func TestAutoGenAddr(t *testing.T) {
 	select {
 	case <-ndpDisp.autoGenAddrC:
 		t.Fatal("unexpectedly auto-generated an address when we already have an address for a prefix")
-	case <-time.After(defaultTimeout):
+	default:
 	}
 
 	// Wait for addr of prefix1 to be invalidated.
-	waitForEvent(addr1, invalidatedAddr, newMinVLDuration+defaultTimeout)
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" {
+			t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(newMinVLDuration + defaultTimeout):
+		t.Fatal("timed out waiting for addr auto gen event")
+	}
 	if contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
 		t.Fatalf("Should not have %s in the list of addresses", addr1)
 	}
@@ -1896,78 +1898,81 @@ func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
 
 	const delta = 500 * time.Millisecond
 
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			t.Parallel()
-
-			ndpDisp := ndpDispatcher{
-				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
-			}
-			e := channel.New(10, 1280, linkAddr1)
-			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				NDPConfigs: stack.NDPConfigurations{
-					HandleRAs:              true,
-					AutoGenGlobalAddresses: true,
-				},
-				NDPDisp: &ndpDisp,
-			})
+	// This Run will not return until the parallel tests finish.
+	//
+	// We need this because we need to do some teardown work after the
+	// parallel tests complete.
+	//
+	// See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+	// more details.
+	t.Run("group", func(t *testing.T) {
+		for _, test := range tests {
+			test := test
 
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(1) = %s", err)
-			}
+			t.Run(test.name, func(t *testing.T) {
+				t.Parallel()
 
-			// Receive an RA with prefix with initial VL, test.ovl.
-			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.ovl, 0))
-			select {
-			case r := <-ndpDisp.autoGenAddrC:
-				if r.nicID != 1 {
-					t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
-				}
-				if r.addr != addr {
-					t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+				ndpDisp := ndpDispatcher{
+					autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
 				}
-				if r.eventType != newAddr {
-					t.Fatalf("got r.eventType = %v, want = %v", r.eventType, newAddr)
+				e := channel.New(10, 1280, linkAddr1)
+				s := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+					NDPConfigs: stack.NDPConfigurations{
+						HandleRAs:              true,
+						AutoGenGlobalAddresses: true,
+					},
+					NDPDisp: &ndpDisp,
+				})
+
+				if err := s.CreateNIC(1, e); err != nil {
+					t.Fatalf("CreateNIC(1) = %s", err)
 				}
-			case <-time.After(defaultTimeout):
-				t.Fatal("timeout waiting for addr auto gen event")
-			}
 
-			// Receive an new RA with prefix with new VL, test.nvl.
-			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.nvl, 0))
+				// Receive an RA with prefix with initial VL,
+				// test.ovl.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.ovl, 0))
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, newAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
 
-			//
-			// Validate that the VL for the address got set to
-			// test.evl.
-			//
+				// Receive an new RA with prefix with new VL,
+				// test.nvl.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.nvl, 0))
 
-			// Make sure we do not get any invalidation events
-			// until atleast 500ms (delta) before test.evl.
-			select {
-			case <-ndpDisp.autoGenAddrC:
-				t.Fatalf("unexpectedly received an auto gen addr event")
-			case <-time.After(time.Duration(test.evl)*time.Second - delta):
-			}
+				//
+				// Validate that the VL for the address got set
+				// to test.evl.
+				//
 
-			// Wait for another second (2x delta), but now we expect
-			// the invalidation event.
-			select {
-			case r := <-ndpDisp.autoGenAddrC:
-				if r.nicID != 1 {
-					t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
-				}
-				if r.addr != addr {
-					t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+				// Make sure we do not get any invalidation
+				// events until atleast 500ms (delta) before
+				// test.evl.
+				select {
+				case <-ndpDisp.autoGenAddrC:
+					t.Fatalf("unexpectedly received an auto gen addr event")
+				case <-time.After(time.Duration(test.evl)*time.Second - delta):
 				}
-				if r.eventType != invalidatedAddr {
-					t.Fatalf("got r.eventType = %v, want = %v", r.eventType, newAddr)
+
+				// Wait for another second (2x delta), but now
+				// we expect the invalidation event.
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+
+				case <-time.After(2 * delta):
+					t.Fatal("timeout waiting for addr auto gen event")
 				}
-			case <-time.After(2 * delta):
-				t.Fatal("timeout waiting for addr auto gen event")
-			}
-		})
-	}
+			})
+		}
+	})
 }
 
 // TestAutoGenAddrRemoval tests that when auto-generated addresses are removed
@@ -1979,9 +1984,9 @@ func TestAutoGenAddrRemoval(t *testing.T) {
 	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
 
 	ndpDisp := ndpDispatcher{
-		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
 	}
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 		NDPConfigs: stack.NDPConfigurations{
@@ -1995,51 +2000,37 @@ func TestAutoGenAddrRemoval(t *testing.T) {
 		t.Fatalf("CreateNIC(1) = %s", err)
 	}
 
-	// Receive an RA with prefix with its valid lifetime = lifetime.
-	const lifetime = 5
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetime, 0))
-	select {
-	case r := <-ndpDisp.autoGenAddrC:
-		if r.nicID != 1 {
-			t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
-		}
-		if r.addr != addr {
-			t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
-		}
-		if r.eventType != newAddr {
-			t.Fatalf("got r.eventType = %v, want = %v", r.eventType, newAddr)
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
 		}
-	case <-time.After(defaultTimeout):
-		t.Fatal("timeout waiting for addr auto gen event")
 	}
 
-	// Remove the address.
+	// Receive a PI to auto-generate an address.
+	const lifetimeSeconds = 1
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, 0))
+	expectAutoGenAddrEvent(addr, newAddr)
+
+	// Removing the address should result in an invalidation event
+	// immediately.
 	if err := s.RemoveAddress(1, addr.Address); err != nil {
 		t.Fatalf("RemoveAddress(_, %s) = %s", addr.Address, err)
 	}
-
-	// Should get the invalidation event immediately.
-	select {
-	case r := <-ndpDisp.autoGenAddrC:
-		if r.nicID != 1 {
-			t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
-		}
-		if r.addr != addr {
-			t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
-		}
-		if r.eventType != invalidatedAddr {
-			t.Fatalf("got r.eventType = %v, want = %v", r.eventType, newAddr)
-		}
-	case <-time.After(defaultTimeout):
-		t.Fatal("timeout waiting for addr auto gen event")
-	}
+	expectAutoGenAddrEvent(addr, invalidatedAddr)
 
 	// Wait for the original valid lifetime to make sure the original timer
 	// got stopped/cleaned up.
 	select {
 	case <-ndpDisp.autoGenAddrC:
 		t.Fatalf("unexpectedly received an auto gen addr event")
-	case <-time.After(lifetime*time.Second + defaultTimeout):
+	case <-time.After(lifetimeSeconds*time.Second + defaultTimeout):
 	}
 }
 
@@ -2051,9 +2042,9 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
 
 	ndpDisp := ndpDispatcher{
-		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
 	}
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 		NDPConfigs: stack.NDPConfigurations{
@@ -2077,12 +2068,12 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 
 	// Receive a PI where the generated address will be the same as the one
 	// that we already have assigned statically.
-	const lifetime = 5
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetime, 0))
+	const lifetimeSeconds = 1
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, 0))
 	select {
 	case <-ndpDisp.autoGenAddrC:
 		t.Fatal("unexpectedly received an auto gen addr event for an address we already have statically")
-	case <-time.After(defaultTimeout):
+	default:
 	}
 	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
@@ -2093,7 +2084,7 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 	select {
 	case <-ndpDisp.autoGenAddrC:
 		t.Fatal("unexpectedly received an auto gen addr event")
-	case <-time.After(lifetime*time.Second + defaultTimeout):
+	case <-time.After(lifetimeSeconds*time.Second + defaultTimeout):
 	}
 	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
-- 
cgit v1.2.3


From 8e6e87f8e8885eeadb8b3d891e24137f11ebdf31 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 18 Dec 2019 15:12:33 -0800
Subject: Allow 'out-of-line' routing table updates for Router and Prefix
 discovery events

This change removes the requirement that a new routing table be provided when a
router or prefix discovery event happens so that an updated routing table may
be provided to the stack at a later time from the event.

This change is to address the use case where the netstack integrator may need to
obtain a lock before providing updated routes in response to the events above.

As an example, say we have an integrator that performs the below two operations
operations as described:
A. Normal route update:
  1. Obtain integrator lock
  2. Update routes in the integrator
  3. Call Stack.SetRouteTable with the updated routes
    3.1. Obtain Stack lock
    3.2. Update routes in Stack
    3.3. Release Stack lock
  4. Release integrator lock
B. NDP event triggered route update:
  1. Obtain Stack lock
  2. Call event handler
    2.1. Obtain integrator lock
    2.2. Update routes in the integrator
    2.3. Release integrator lock
    2.4. Return updated routes to update Stack
  3. Update routes in Stack
  4. Release Stack lock

A deadlock may occur if a Normal route update was attemped at the same time an
NDP event triggered route update was attempted. With threads T1 and T2:
1) T1 -> A.1, A.2
2) T2 -> B.1
3) T1 -> A.3 (hangs at A.3.1 since Stack lock is taken in step 2)
4) T2 -> B.2 (hangs at B.2.1 since integrator lock is taken in step 1)

Test: Existing tests were modified to not provide or expect routing table
changes in response to Router and Prefix discovery events.
PiperOrigin-RevId: 286274712
---
 pkg/tcpip/stack/ndp.go      |  69 ++++++--------
 pkg/tcpip/stack/ndp_test.go | 223 ++++----------------------------------------
 2 files changed, 50 insertions(+), 242 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 27bd02e76..90664ba8a 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -131,40 +131,34 @@ type NDPDispatcher interface {
 	OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error)
 
 	// OnDefaultRouterDiscovered will be called when a new default router is
-	// discovered. Implementations must return true along with a new valid
-	// route table if the newly discovered router should be remembered. If
-	// an implementation returns false, the second return value will be
-	// ignored.
+	// discovered. Implementations must return true if the newly discovered
+	// router should be remembered.
 	//
 	// This function is not permitted to block indefinitely. This function
 	// is also not permitted to call into the stack.
-	OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) (bool, []tcpip.Route)
+	OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) bool
 
 	// OnDefaultRouterInvalidated will be called when a discovered default
-	// router is invalidated. Implementers must return a new valid route
-	// table.
+	// router that was remembered is invalidated.
 	//
 	// This function is not permitted to block indefinitely. This function
 	// is also not permitted to call into the stack.
-	OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) []tcpip.Route
+	OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address)
 
 	// OnOnLinkPrefixDiscovered will be called when a new on-link prefix is
-	// discovered. Implementations must return true along with a new valid
-	// route table if the newly discovered on-link prefix should be
-	// remembered. If an implementation returns false, the second return
-	// value will be ignored.
+	// discovered. Implementations must return true if the newly discovered
+	// on-link prefix should be remembered.
 	//
 	// This function is not permitted to block indefinitely. This function
 	// is also not permitted to call into the stack.
-	OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) (bool, []tcpip.Route)
+	OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) bool
 
 	// OnOnLinkPrefixInvalidated will be called when a discovered on-link
-	// prefix is invalidated. Implementers must return a new valid route
-	// table.
+	// prefix that was remembered is invalidated.
 	//
 	// This function is not permitted to block indefinitely. This function
 	// is also not permitted to call into the stack.
-	OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) []tcpip.Route
+	OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet)
 
 	// OnAutoGenAddress will be called when a new prefix with its
 	// autonomous address-configuration flag set has been received and SLAAC
@@ -668,7 +662,7 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 
 // invalidateDefaultRouter invalidates a discovered default router.
 //
-// The NIC that ndp belongs to and its associated stack MUST be locked.
+// The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
 	rtr, ok := ndp.defaultRouters[ip]
 
@@ -686,8 +680,8 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
 	delete(ndp.defaultRouters, ip)
 
 	// Let the integrator know a discovered default router is invalidated.
-	if ndp.nic.stack.ndpDisp != nil {
-		ndp.nic.stack.routeTable = ndp.nic.stack.ndpDisp.OnDefaultRouterInvalidated(ndp.nic.ID(), ip)
+	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnDefaultRouterInvalidated(ndp.nic.ID(), ip)
 	}
 }
 
@@ -696,15 +690,15 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
 //
 // The router identified by ip MUST NOT already be known by the NIC.
 //
-// The NIC that ndp belongs to and its associated stack MUST be locked.
+// The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
-	if ndp.nic.stack.ndpDisp == nil {
+	ndpDisp := ndp.nic.stack.ndpDisp
+	if ndpDisp == nil {
 		return
 	}
 
 	// Inform the integrator when we discovered a default router.
-	remember, routeTable := ndp.nic.stack.ndpDisp.OnDefaultRouterDiscovered(ndp.nic.ID(), ip)
-	if !remember {
+	if !ndpDisp.OnDefaultRouterDiscovered(ndp.nic.ID(), ip) {
 		// Informed by the integrator to not remember the router, do
 		// nothing further.
 		return
@@ -731,8 +725,6 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
 		}),
 		doNotInvalidate: &doNotInvalidate,
 	}
-
-	ndp.nic.stack.routeTable = routeTable
 }
 
 // rememberOnLinkPrefix remembers a newly discovered on-link prefix with IPv6
@@ -740,15 +732,15 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
 //
 // The prefix identified by prefix MUST NOT already be known.
 //
-// The NIC that ndp belongs to and its associated stack MUST be locked.
+// The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) {
-	if ndp.nic.stack.ndpDisp == nil {
+	ndpDisp := ndp.nic.stack.ndpDisp
+	if ndpDisp == nil {
 		return
 	}
 
 	// Inform the integrator when we discovered an on-link prefix.
-	remember, routeTable := ndp.nic.stack.ndpDisp.OnOnLinkPrefixDiscovered(ndp.nic.ID(), prefix)
-	if !remember {
+	if !ndpDisp.OnOnLinkPrefixDiscovered(ndp.nic.ID(), prefix) {
 		// Informed by the integrator to not remember the prefix, do
 		// nothing further.
 		return
@@ -769,13 +761,11 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration)
 		invalidationTimer: timer,
 		doNotInvalidate:   &doNotInvalidate,
 	}
-
-	ndp.nic.stack.routeTable = routeTable
 }
 
 // invalidateOnLinkPrefix invalidates a discovered on-link prefix.
 //
-// The NIC that ndp belongs to and its associated stack MUST be locked.
+// The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
 	s, ok := ndp.onLinkPrefixes[prefix]
 
@@ -796,8 +786,8 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
 	delete(ndp.onLinkPrefixes, prefix)
 
 	// Let the integrator know a discovered on-link prefix is invalidated.
-	if ndp.nic.stack.ndpDisp != nil {
-		ndp.nic.stack.routeTable = ndp.nic.stack.ndpDisp.OnOnLinkPrefixInvalidated(ndp.nic.ID(), prefix)
+	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnOnLinkPrefixInvalidated(ndp.nic.ID(), prefix)
 	}
 }
 
@@ -829,7 +819,7 @@ func (ndp *ndpState) prefixInvalidationCallback(prefix tcpip.Subnet, vl time.Dur
 // handleOnLinkPrefixInformation assumes that the prefix this pi is for is
 // not the link-local prefix and the on-link flag is set.
 //
-// The NIC that ndp belongs to and its associated stack MUST be locked.
+// The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformation) {
 	prefix := pi.Subnet()
 	prefixState, ok := ndp.onLinkPrefixes[prefix]
@@ -1066,10 +1056,11 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 	}
 
 	// Inform the integrator that we have a new SLAAC address.
-	if ndp.nic.stack.ndpDisp == nil {
+	ndpDisp := ndp.nic.stack.ndpDisp
+	if ndpDisp == nil {
 		return
 	}
-	if !ndp.nic.stack.ndpDisp.OnAutoGenAddress(ndp.nic.ID(), addrWithPrefix) {
+	if !ndpDisp.OnAutoGenAddress(ndp.nic.ID(), addrWithPrefix) {
 		// Informed by the integrator not to add the address.
 		return
 	}
@@ -1135,8 +1126,8 @@ func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bo
 
 	delete(ndp.autoGenAddresses, addr)
 
-	if ndp.nic.stack.ndpDisp != nil {
-		ndp.nic.stack.ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), tcpip.AddressWithPrefix{
+	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), tcpip.AddressWithPrefix{
 			Address:   addr,
 			PrefixLen: validPrefixLenForAutoGen,
 		})
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 9f589a471..666f86c33 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -163,7 +163,6 @@ type ndpDispatcher struct {
 	rememberPrefix bool
 	autoGenAddrC   chan ndpAutoGenAddrEvent
 	rdnssC         chan ndpRDNSSEvent
-	routeTable     []tcpip.Route
 }
 
 // Implements stack.NDPDispatcher.OnDuplicateAddressDetectionStatus.
@@ -179,106 +178,56 @@ func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, add
 }
 
 // Implements stack.NDPDispatcher.OnDefaultRouterDiscovered.
-func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) (bool, []tcpip.Route) {
-	if n.routerC != nil {
-		n.routerC <- ndpRouterEvent{
+func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) bool {
+	if c := n.routerC; c != nil {
+		c <- ndpRouterEvent{
 			nicID,
 			addr,
 			true,
 		}
 	}
 
-	if !n.rememberRouter {
-		return false, nil
-	}
-
-	rt := append([]tcpip.Route(nil), n.routeTable...)
-	rt = append(rt, tcpip.Route{
-		Destination: header.IPv6EmptySubnet,
-		Gateway:     addr,
-		NIC:         nicID,
-	})
-	n.routeTable = rt
-	return true, rt
+	return n.rememberRouter
 }
 
 // Implements stack.NDPDispatcher.OnDefaultRouterInvalidated.
-func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) []tcpip.Route {
-	if n.routerC != nil {
-		n.routerC <- ndpRouterEvent{
+func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) {
+	if c := n.routerC; c != nil {
+		c <- ndpRouterEvent{
 			nicID,
 			addr,
 			false,
 		}
 	}
-
-	var rt []tcpip.Route
-	exclude := tcpip.Route{
-		Destination: header.IPv6EmptySubnet,
-		Gateway:     addr,
-		NIC:         nicID,
-	}
-
-	for _, r := range n.routeTable {
-		if r != exclude {
-			rt = append(rt, r)
-		}
-	}
-	n.routeTable = rt
-	return rt
 }
 
 // Implements stack.NDPDispatcher.OnOnLinkPrefixDiscovered.
-func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) (bool, []tcpip.Route) {
-	if n.prefixC != nil {
-		n.prefixC <- ndpPrefixEvent{
+func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) bool {
+	if c := n.prefixC; c != nil {
+		c <- ndpPrefixEvent{
 			nicID,
 			prefix,
 			true,
 		}
 	}
 
-	if !n.rememberPrefix {
-		return false, nil
-	}
-
-	rt := append([]tcpip.Route(nil), n.routeTable...)
-	rt = append(rt, tcpip.Route{
-		Destination: prefix,
-		NIC:         nicID,
-	})
-	n.routeTable = rt
-	return true, rt
+	return n.rememberPrefix
 }
 
 // Implements stack.NDPDispatcher.OnOnLinkPrefixInvalidated.
-func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) []tcpip.Route {
-	if n.prefixC != nil {
-		n.prefixC <- ndpPrefixEvent{
+func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) {
+	if c := n.prefixC; c != nil {
+		c <- ndpPrefixEvent{
 			nicID,
 			prefix,
 			false,
 		}
 	}
-
-	var rt []tcpip.Route
-	exclude := tcpip.Route{
-		Destination: prefix,
-		NIC:         nicID,
-	}
-
-	for _, r := range n.routeTable {
-		if r != exclude {
-			rt = append(rt, r)
-		}
-	}
-	n.routeTable = rt
-	return rt
 }
 
 func (n *ndpDispatcher) OnAutoGenAddress(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) bool {
-	if n.autoGenAddrC != nil {
-		n.autoGenAddrC <- ndpAutoGenAddrEvent{
+	if c := n.autoGenAddrC; c != nil {
+		c <- ndpAutoGenAddrEvent{
 			nicID,
 			addr,
 			newAddr,
@@ -288,8 +237,8 @@ func (n *ndpDispatcher) OnAutoGenAddress(nicID tcpip.NICID, addr tcpip.AddressWi
 }
 
 func (n *ndpDispatcher) OnAutoGenAddressInvalidated(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) {
-	if n.autoGenAddrC != nil {
-		n.autoGenAddrC <- ndpAutoGenAddrEvent{
+	if c := n.autoGenAddrC; c != nil {
+		c <- ndpAutoGenAddrEvent{
 			nicID,
 			addr,
 			invalidatedAddr,
@@ -299,8 +248,8 @@ func (n *ndpDispatcher) OnAutoGenAddressInvalidated(nicID tcpip.NICID, addr tcpi
 
 // Implements stack.NDPDispatcher.OnRecursiveDNSServerOption.
 func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration) {
-	if n.rdnssC != nil {
-		n.rdnssC <- ndpRDNSSEvent{
+	if c := n.rdnssC; c != nil {
+		c <- ndpRDNSSEvent{
 			nicID,
 			ndpRDNSS{
 				addrs,
@@ -976,15 +925,6 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
 		t.Fatalf("CreateNIC(1) = %s", err)
 	}
 
-	routeTable := []tcpip.Route{
-		{
-			header.IPv6EmptySubnet,
-			llAddr3,
-			1,
-		},
-	}
-	s.SetRouteTable(routeTable)
-
 	// Receive an RA for a router we should not remember.
 	const lifetimeSeconds = 1
 	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, lifetimeSeconds))
@@ -997,11 +937,6 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
 		t.Fatal("expected router discovery event")
 	}
 
-	// Original route table should not have been modified.
-	if diff := cmp.Diff(routeTable, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Wait for the invalidation time plus some buffer to make sure we do
 	// not actually receive any invalidation events as we should not have
 	// remembered the router in the first place.
@@ -1010,11 +945,6 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
 		t.Fatal("should not have received any router events")
 	case <-time.After(lifetimeSeconds*time.Second + defaultTimeout):
 	}
-
-	// Original route table should not have been modified.
-	if diff := cmp.Diff(routeTable, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
 }
 
 func TestRouterDiscovery(t *testing.T) {
@@ -1077,22 +1007,11 @@ func TestRouterDiscovery(t *testing.T) {
 	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000))
 	expectRouterEvent(llAddr2, true)
 
-	// Should have a default route through the discovered router.
-	if diff := cmp.Diff([]tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}}, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Rx an RA from another router (lladdr3) with non-zero lifetime.
 	l3Lifetime := time.Duration(6)
 	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr3, uint16(l3Lifetime)))
 	expectRouterEvent(llAddr3, true)
 
-	// Should have default routes through the discovered routers.
-	want := []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}, {header.IPv6EmptySubnet, llAddr3, 1}}
-	if diff := cmp.Diff(want, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Rx an RA from lladdr2 with lesser lifetime.
 	l2Lifetime := time.Duration(2)
 	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, uint16(l2Lifetime)))
@@ -1102,11 +1021,6 @@ func TestRouterDiscovery(t *testing.T) {
 	default:
 	}
 
-	// Should still have a default route through the discovered routers.
-	if diff := cmp.Diff(want, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Wait for lladdr2's router invalidation timer to fire. The lifetime
 	// of the router should have been updated to the most recent (smaller)
 	// lifetime.
@@ -1116,30 +1030,14 @@ func TestRouterDiscovery(t *testing.T) {
 	// event after this time, then something is wrong.
 	expectAsyncRouterInvalidationEvent(llAddr2, l2Lifetime*time.Second+defaultTimeout)
 
-	// Should no longer have the default route through lladdr2.
-	if diff := cmp.Diff([]tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}}, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Rx an RA from lladdr2 with huge lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000))
 	expectRouterEvent(llAddr2, true)
 
-	// Should have a default route through the discovered routers.
-	if diff := cmp.Diff([]tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}, {header.IPv6EmptySubnet, llAddr2, 1}}, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Rx an RA from lladdr2 with zero lifetime. It should be invalidated.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 0))
 	expectRouterEvent(llAddr2, false)
 
-	// Should have deleted the default route through the router that just
-	// got invalidated.
-	if diff := cmp.Diff([]tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}}, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Wait for lladdr3's router invalidation timer to fire. The lifetime
 	// of the router should have been updated to the most recent (smaller)
 	// lifetime.
@@ -1148,12 +1046,6 @@ func TestRouterDiscovery(t *testing.T) {
 	// router to get invalidated. If we don't get an invalidation
 	// event after this time, then something is wrong.
 	expectAsyncRouterInvalidationEvent(llAddr3, l3Lifetime*time.Second+defaultTimeout)
-
-	// Should not have any routes now that all discovered routers have been
-	// invalidated.
-	if got := len(s.GetRouteTable()); got != 0 {
-		t.Fatalf("got len(s.GetRouteTable()) = %d, want = 0", got)
-	}
 }
 
 // TestRouterDiscoveryMaxRouters tests that only
@@ -1179,8 +1071,6 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
 		t.Fatalf("CreateNIC(1) = %s", err)
 	}
 
-	expectedRt := [stack.MaxDiscoveredDefaultRouters]tcpip.Route{}
-
 	// Receive an RA from 2 more than the max number of discovered routers.
 	for i := 1; i <= stack.MaxDiscoveredDefaultRouters+2; i++ {
 		linkAddr := []byte{2, 2, 3, 4, 5, 0}
@@ -1190,7 +1080,6 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
 		e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr, 5))
 
 		if i <= stack.MaxDiscoveredDefaultRouters {
-			expectedRt[i-1] = tcpip.Route{header.IPv6EmptySubnet, llAddr, 1}
 			select {
 			case e := <-ndpDisp.routerC:
 				if diff := checkRouterEvent(e, llAddr, true); diff != "" {
@@ -1208,12 +1097,6 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
 			}
 		}
 	}
-
-	// Should only have default routes for the first
-	// stack.MaxDiscoveredDefaultRouters discovered routers.
-	if diff := cmp.Diff(expectedRt[:], s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
 }
 
 // TestNoPrefixDiscovery tests that prefix discovery will not be performed if
@@ -1299,15 +1182,6 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
 		t.Fatalf("CreateNIC(1) = %s", err)
 	}
 
-	routeTable := []tcpip.Route{
-		{
-			header.IPv6EmptySubnet,
-			llAddr3,
-			1,
-		},
-	}
-	s.SetRouteTable(routeTable)
-
 	// Receive an RA with prefix that we should not remember.
 	const lifetimeSeconds = 1
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, lifetimeSeconds, 0))
@@ -1320,11 +1194,6 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
 		t.Fatal("expected prefix discovery event")
 	}
 
-	// Original route table should not have been modified.
-	if diff := cmp.Diff(routeTable, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Wait for the invalidation time plus some buffer to make sure we do
 	// not actually receive any invalidation events as we should not have
 	// remembered the prefix in the first place.
@@ -1333,11 +1202,6 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
 		t.Fatal("should not have received any prefix events")
 	case <-time.After(lifetimeSeconds*time.Second + defaultTimeout):
 	}
-
-	// Original route table should not have been modified.
-	if diff := cmp.Diff(routeTable, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
 }
 
 func TestPrefixDiscovery(t *testing.T) {
@@ -1392,39 +1256,18 @@ func TestPrefixDiscovery(t *testing.T) {
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 100, 0))
 	expectPrefixEvent(subnet1, true)
 
-	// Should have added a device route for subnet1 through the nic.
-	if diff := cmp.Diff([]tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}}, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Receive an RA with prefix2 in a PI.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, false, 100, 0))
 	expectPrefixEvent(subnet2, true)
 
-	// Should have added a device route for subnet2 through the nic.
-	if diff := cmp.Diff([]tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}, {subnet2, tcpip.Address([]byte(nil)), 1}}, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Receive an RA with prefix3 in a PI.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 100, 0))
 	expectPrefixEvent(subnet3, true)
 
-	// Should have added a device route for subnet3 through the nic.
-	if diff := cmp.Diff([]tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}, {subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Receive an RA with prefix1 in a PI with lifetime = 0.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 0, 0))
 	expectPrefixEvent(subnet1, false)
 
-	// Should have removed the device route for subnet1 through the nic.
-	want := []tcpip.Route{{subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}
-	if diff := cmp.Diff(want, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Receive an RA with prefix2 in a PI with lesser lifetime.
 	lifetime := uint32(2)
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, false, lifetime, 0))
@@ -1434,11 +1277,6 @@ func TestPrefixDiscovery(t *testing.T) {
 	default:
 	}
 
-	// Should not have updated route table.
-	if diff := cmp.Diff(want, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Wait for prefix2's most recent invalidation timer plus some buffer to
 	// expire.
 	select {
@@ -1450,19 +1288,9 @@ func TestPrefixDiscovery(t *testing.T) {
 		t.Fatal("timed out waiting for prefix discovery event")
 	}
 
-	// Should have removed the device route for subnet2 through the nic.
-	if diff := cmp.Diff([]tcpip.Route{{subnet3, tcpip.Address([]byte(nil)), 1}}, s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
-
 	// Receive RA to invalidate prefix3.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 0, 0))
 	expectPrefixEvent(subnet3, false)
-
-	// Should not have any routes.
-	if got := len(s.GetRouteTable()); got != 0 {
-		t.Fatalf("got len(s.GetRouteTable()) = %d, want = 0", got)
-	}
 }
 
 func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
@@ -1589,7 +1417,6 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
 	}
 
 	optSer := make(header.NDPOptionsSerializer, stack.MaxDiscoveredOnLinkPrefixes+2)
-	expectedRt := [stack.MaxDiscoveredOnLinkPrefixes]tcpip.Route{}
 	prefixes := [stack.MaxDiscoveredOnLinkPrefixes + 2]tcpip.Subnet{}
 
 	// Receive an RA with 2 more than the max number of discovered on-link
@@ -1609,10 +1436,6 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
 		copy(buf[14:], prefix.Address)
 
 		optSer[i] = header.NDPPrefixInformation(buf[:])
-
-		if i < stack.MaxDiscoveredOnLinkPrefixes {
-			expectedRt[i] = tcpip.Route{prefixes[i], tcpip.Address([]byte(nil)), 1}
-		}
 	}
 
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, optSer))
@@ -1634,12 +1457,6 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
 			}
 		}
 	}
-
-	// Should only have device routes for the first
-	// stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes.
-	if diff := cmp.Diff(expectedRt[:], s.GetRouteTable()); diff != "" {
-		t.Fatalf("GetRouteTable() mismatch (-want +got):\n%s", diff)
-	}
 }
 
 // Checks to see if list contains an IPv6 address, item.
-- 
cgit v1.2.3


From 744401297a8c93ce5992ba99aa84f3dcdc19ae9e Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 18 Dec 2019 15:47:24 -0800
Subject: Add VFS2 plumbing for extended attributes.

PiperOrigin-RevId: 286281274
---
 pkg/sentry/fsimpl/ext/filesystem.go          | 36 ++++++++++++
 pkg/sentry/fsimpl/kernfs/filesystem.go       | 52 +++++++++++++++++
 pkg/sentry/fsimpl/memfs/filesystem.go        | 48 +++++++++++++++
 pkg/sentry/vfs/file_description.go           | 49 +++++++++++++++-
 pkg/sentry/vfs/file_description_impl_util.go | 25 ++++++++
 pkg/sentry/vfs/filesystem.go                 | 16 ++++-
 pkg/sentry/vfs/options.go                    | 14 +++++
 pkg/sentry/vfs/testutil.go                   | 20 +++++++
 pkg/sentry/vfs/vfs.go                        | 87 ++++++++++++++++++++++++++++
 9 files changed, 345 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index e7aa3b41b..d7e87979a 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -443,6 +443,42 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return syserror.EROFS
 }
 
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return nil, err
+	}
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return "", err
+	}
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index db486b6c1..3cbbe4b20 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -683,6 +683,58 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return nil
 }
 
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return nil, err
+	}
+	// kernfs currently does not support extended attributes.
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return "", err
+	}
+	// kernfs currently does not support extended attributes.
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return err
+	}
+	// kernfs currently does not support extended attributes.
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return err
+	}
+	// kernfs currently does not support extended attributes.
+	return syserror.ENOTSUP
+}
+
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
 func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index 1f2a5122a..22f1e811f 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -584,6 +584,54 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return nil
 }
 
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, _, err := walkExistingLocked(rp)
+	if err != nil {
+		return nil, err
+	}
+	// TODO(b/127675828): support extended attributes
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, _, err := walkExistingLocked(rp)
+	if err != nil {
+		return "", err
+	}
+	// TODO(b/127675828): support extended attributes
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, _, err := walkExistingLocked(rp)
+	if err != nil {
+		return err
+	}
+	// TODO(b/127675828): support extended attributes
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, _, err := walkExistingLocked(rp)
+	if err != nil {
+		return err
+	}
+	// TODO(b/127675828): support extended attributes
+	return syserror.ENOTSUP
+}
+
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 6575afd16..c5a9adca3 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -212,7 +213,21 @@ type FileDescriptionImpl interface {
 	// Ioctl implements the ioctl(2) syscall.
 	Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
 
-	// TODO: extended attributes; file locking
+	// Listxattr returns all extended attribute names for the file.
+	Listxattr(ctx context.Context) ([]string, error)
+
+	// Getxattr returns the value associated with the given extended attribute
+	// for the file.
+	Getxattr(ctx context.Context, name string) (string, error)
+
+	// Setxattr changes the value associated with the given extended attribute
+	// for the file.
+	Setxattr(ctx context.Context, opts SetxattrOptions) error
+
+	// Removexattr removes the given extended attribute from the file.
+	Removexattr(ctx context.Context, name string) error
+
+	// TODO: file locking
 }
 
 // Dirent holds the information contained in struct linux_dirent64.
@@ -329,6 +344,38 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
 	return fd.impl.Ioctl(ctx, uio, args)
 }
 
+// Listxattr returns all extended attribute names for the file represented by
+// fd.
+func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
+	names, err := fd.impl.Listxattr(ctx)
+	if err == syserror.ENOTSUP {
+		// Linux doesn't actually return ENOTSUP in this case; instead,
+		// fs/xattr.c:vfs_listxattr() falls back to allowing the security
+		// subsystem to return security extended attributes, which by default
+		// don't exist.
+		return nil, nil
+	}
+	return names, err
+}
+
+// Getxattr returns the value associated with the given extended attribute for
+// the file represented by fd.
+func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+	return fd.impl.Getxattr(ctx, name)
+}
+
+// Setxattr changes the value associated with the given extended attribute for
+// the file represented by fd.
+func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+	return fd.impl.Setxattr(ctx, opts)
+}
+
+// Removexattr removes the given extended attribute from the file represented
+// by fd.
+func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
+	return fd.impl.Removexattr(ctx, name)
+}
+
 // SyncFS instructs the filesystem containing fd to execute the semantics of
 // syncfs(2).
 func (fd *FileDescription) SyncFS(ctx context.Context) error {
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index aae023254..3df49991c 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -127,6 +127,31 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
 	return 0, syserror.ENOTTY
 }
 
+// Listxattr implements FileDescriptionImpl.Listxattr analogously to
+// inode_operations::listxattr == NULL in Linux.
+func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) {
+	// This isn't exactly accurate; see FileDescription.Listxattr.
+	return nil, syserror.ENOTSUP
+}
+
+// Getxattr implements FileDescriptionImpl.Getxattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) {
+	return "", syserror.ENOTSUP
+}
+
+// Setxattr implements FileDescriptionImpl.Setxattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+	return syserror.ENOTSUP
+}
+
+// Removexattr implements FileDescriptionImpl.Removexattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error {
+	return syserror.ENOTSUP
+}
+
 // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
 // FileDescriptionImpl that always represent directories to obtain
 // implementations of non-directory I/O methods that return EISDIR.
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 8011eba3f..b766614e7 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -186,6 +186,20 @@ type FilesystemImpl interface {
 	// UnlinkAt removes the non-directory file at rp.
 	UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 
+	// ListxattrAt returns all extended attribute names for the file at rp.
+	ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error)
+
+	// GetxattrAt returns the value associated with the given extended
+	// attribute for the file at rp.
+	GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error)
+
+	// SetxattrAt changes the value associated with the given extended
+	// attribute for the file at rp.
+	SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
+
+	// RemovexattrAt removes the given extended attribute from the file at rp.
+	RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
+
 	// PrependPath prepends a path from vd to vd.Mount().Root() to b.
 	//
 	// If vfsroot.Ok(), it is the contextual VFS root; if it is encountered
@@ -208,7 +222,7 @@ type FilesystemImpl interface {
 	// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
 	PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
 
-	// TODO: extended attributes; inotify_add_watch(); bind()
+	// TODO: inotify_add_watch(); bind()
 }
 
 // PrependPathAtVFSRootError is returned by implementations of
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 3ecbc8fc1..97ee4a446 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -101,6 +101,20 @@ type SetStatOptions struct {
 	Stat linux.Statx
 }
 
+// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
+// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
+// FileDescriptionImpl.Setxattr().
+type SetxattrOptions struct {
+	// Name is the name of the extended attribute being mutated.
+	Name string
+
+	// Value is the extended attribute's new value.
+	Value string
+
+	// Flags contains flags as specified for setxattr/lsetxattr/fsetxattr(2).
+	Flags uint32
+}
+
 // StatOptions contains options to VirtualFilesystem.StatAt(),
 // FilesystemImpl.StatAt(), FileDescription.Stat(), and
 // FileDescriptionImpl.Stat().
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
index 7a1d9e383..d94117bce 100644
--- a/pkg/sentry/vfs/testutil.go
+++ b/pkg/sentry/vfs/testutil.go
@@ -117,6 +117,26 @@ func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) err
 	return syserror.EPERM
 }
 
+// ListxattrAt implements FilesystemImpl.ListxattrAt.
+func (fs *FDTestFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
+	return nil, syserror.EPERM
+}
+
+// GetxattrAt implements FilesystemImpl.GetxattrAt.
+func (fs *FDTestFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
+	return "", syserror.EPERM
+}
+
+// SetxattrAt implements FilesystemImpl.SetxattrAt.
+func (fs *FDTestFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
+	return syserror.EPERM
+}
+
+// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
+func (fs *FDTestFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
+	return syserror.EPERM
+}
+
 // PrependPath implements FilesystemImpl.PrependPath.
 func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
 	b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry)))
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 7262b0d0a..e60898d7c 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -440,6 +440,93 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
 	}
 }
 
+// ListxattrAt returns all extended attribute names for the file at the given
+// path.
+func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return nil, err
+	}
+	for {
+		names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return names, nil
+		}
+		if err == syserror.ENOTSUP {
+			// Linux doesn't actually return ENOTSUP in this case; instead,
+			// fs/xattr.c:vfs_listxattr() falls back to allowing the security
+			// subsystem to return security extended attributes, which by
+			// default don't exist.
+			vfs.putResolvingPath(rp)
+			return nil, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return nil, err
+		}
+	}
+}
+
+// GetxattrAt returns the value associated with the given extended attribute
+// for the file at the given path.
+func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return "", err
+	}
+	for {
+		val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return val, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return "", err
+		}
+	}
+}
+
+// SetxattrAt changes the value associated with the given extended attribute
+// for the file at the given path.
+func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// RemovexattrAt removes the given extended attribute from the file at rp.
+func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
 // SyncAllFilesystems has the semantics of Linux's sync(2).
 func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
 	fss := make(map[*Filesystem]struct{})
-- 
cgit v1.2.3


From 0d475cdb019e659c84e767a7d89452cd12332257 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 18 Dec 2019 17:09:08 -0800
Subject: Increase waitForProcessList timeout

It can take more than 10 seconds when running under --race.

PiperOrigin-RevId: 286296060
---
 runsc/container/container_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 2ced028f6..c10f85992 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -60,7 +60,7 @@ func waitForProcessList(cont *Container, want []*control.Process) error {
 		return nil
 	}
 	// Gives plenty of time as tests can run slow under --race.
-	return testutil.Poll(cb, 10*time.Second)
+	return testutil.Poll(cb, 30*time.Second)
 }
 
 func waitForProcessCount(cont *Container, want int) error {
-- 
cgit v1.2.3


From 57ce26c0b465dce332a59c9fabb05f737ff4241d Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 18 Dec 2019 18:22:50 -0800
Subject: net/tcp: allow to call listen without bind

When listen(2) is called on an unbound socket, the socket is
automatically bound to a random free port with the local address
set to INADDR_ANY.

PiperOrigin-RevId: 286305906
---
 pkg/tcpip/transport/tcp/endpoint.go         | 13 ++++++++++++
 test/syscalls/linux/socket_inet_loopback.cc | 33 +++++++++++++++++++++--------
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index dd8b47cbe..fe629aa40 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1974,6 +1974,15 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 		return nil
 	}
 
+	if e.state == StateInitial {
+		// The listen is called on an unbound socket, the socket is
+		// automatically bound to a random free port with the local
+		// address set to INADDR_ANY.
+		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
+			return err
+		}
+	}
+
 	// Endpoint must be bound before it can transition to listen mode.
 	if e.state != StateBound {
 		e.stats.ReadErrors.InvalidEndpointState.Increment()
@@ -2033,6 +2042,10 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
+	return e.bindLocked(addr)
+}
+
+func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 	// Don't allow binding once endpoint is not in the initial state
 	// anymore. This is because once the endpoint goes into a connected or
 	// listen state, it is already bound.
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 5bb9d2e99..619d41901 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -102,19 +102,17 @@ TEST(BadSocketPairArgs, ValidateErrForBadCallsToSocketPair) {
               SyscallFailsWithErrno(EAFNOSUPPORT));
 }
 
-TEST_P(SocketInetLoopbackTest, TCP) {
-  auto const& param = GetParam();
-
-  TestAddress const& listener = param.listener;
-  TestAddress const& connector = param.connector;
-
+void tcpSimpleConnectTest(TestAddress const& listener,
+                          TestAddress const& connector, bool unbound) {
   // Create the listening socket.
   const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
       Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
   sockaddr_storage listen_addr = listener.addr;
-  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
-                   listener.addr_len),
-              SyscallSucceeds());
+  if (!unbound) {
+    ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                     listener.addr_len),
+                SyscallSucceeds());
+  }
   ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
 
   // Get the port bound by the listening socket.
@@ -148,6 +146,23 @@ TEST_P(SocketInetLoopbackTest, TCP) {
   ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds());
 }
 
+TEST_P(SocketInetLoopbackTest, TCP) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  tcpSimpleConnectTest(listener, connector, true);
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenUnbound) {
+  auto const& param = GetParam();
+
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  tcpSimpleConnectTest(listener, connector, false);
+}
+
 TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   auto const& param = GetParam();
 
-- 
cgit v1.2.3


From bb00438f36ebd19968246a838c2ddd61b9e14b79 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Thu, 19 Dec 2019 14:29:57 -0800
Subject: Make masterInodeOperations.Truncate take a pointer receiver.

Otherwise a copy happens, which triggers a data race when reading
masterInodeOperations.SimpleFileOperations.uattr, which must be accessed with a
lock held.

PiperOrigin-RevId: 286464473
---
 pkg/sentry/fs/tty/master.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index 934828c12..6b07f6bf2 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -77,7 +77,7 @@ func (mi *masterInodeOperations) Release(ctx context.Context) {
 }
 
 // Truncate implements fs.InodeOperations.Truncate.
-func (masterInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+func (*masterInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
 	return nil
 }
 
-- 
cgit v1.2.3


From 80c8aecd51c8cda02fe36ed663d09e5b71a5b682 Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Thu, 19 Dec 2019 15:24:15 -0800
Subject: Install python2 in the Dockerfile.

Without it, we get a build failure (inside the container) when trying to build
//runsc.

PiperOrigin-RevId: 286474518
---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 5b95822f9..738623023 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 FROM ubuntu:bionic
 
-RUN apt-get update && apt-get install -y curl gnupg2 git python3 python3-distutils python3-pip
+RUN apt-get update && apt-get install -y curl gnupg2 git python python3 python3-distutils python3-pip
 RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
     curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
 RUN apt-get update && apt-get install -y bazel && apt-get clean
-- 
cgit v1.2.3


From 7419e0e5d74621b2be60e9b18e4e2d7bb2a65cc3 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 19 Dec 2019 16:05:35 -0800
Subject: Parameterize mmap tests.

This test suite has existed for quite a while and has become kind of messy.
Various tests can be joined together by parameterizing.

PiperOrigin-RevId: 286482240
---
 test/syscalls/linux/mmap.cc | 207 +++++++++++++++-----------------------------
 1 file changed, 69 insertions(+), 138 deletions(-)

diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index 6f2639d8a..1c4d9f1c7 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -814,23 +814,27 @@ class MMapFileTest : public MMapTest {
   }
 };
 
+class MMapFileParamTest
+    : public MMapFileTest,
+      public ::testing::WithParamInterface<std::tuple<int, int>> {
+ protected:
+  int prot() const { return std::get<0>(GetParam()); }
+
+  int flags() const { return std::get<1>(GetParam()); }
+};
+
 // MAP_POPULATE allowed.
 // There isn't a good way to verify it actually did anything.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, MapPopulate) {
-  ASSERT_THAT(
-      Map(0, kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd_.get(), 0),
-      SyscallSucceeds());
+TEST_P(MMapFileParamTest, MapPopulate) {
+  ASSERT_THAT(Map(0, kPageSize, prot(), flags() | MAP_POPULATE, fd_.get(), 0),
+              SyscallSucceeds());
 }
 
 // MAP_POPULATE on a short file.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, MapPopulateShort) {
-  ASSERT_THAT(Map(0, 2 * kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE,
-                  fd_.get(), 0),
-              SyscallSucceeds());
+TEST_P(MMapFileParamTest, MapPopulateShort) {
+  ASSERT_THAT(
+      Map(0, 2 * kPageSize, prot(), flags() | MAP_POPULATE, fd_.get(), 0),
+      SyscallSucceeds());
 }
 
 // Read contents from mapped file.
@@ -901,16 +905,6 @@ TEST_F(MMapFileTest, WritePrivateOnReadOnlyFd) {
             reinterpret_cast<volatile char*>(addr));
 }
 
-// MAP_PRIVATE PROT_READ is not allowed on write-only FDs.
-TEST_F(MMapFileTest, ReadPrivateOnWriteOnlyFd) {
-  const FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY));
-
-  uintptr_t addr;
-  EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_PRIVATE, fd.get(), 0),
-              SyscallFailsWithErrno(EACCES));
-}
-
 // MAP_SHARED PROT_WRITE not allowed on read-only FDs.
 TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) {
   const FileDescriptor fd =
@@ -922,28 +916,13 @@ TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) {
       SyscallFailsWithErrno(EACCES));
 }
 
-// MAP_SHARED PROT_READ not allowed on write-only FDs.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, ReadSharedOnWriteOnlyFd) {
-  const FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY));
-
-  uintptr_t addr;
-  EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd.get(), 0),
-              SyscallFailsWithErrno(EACCES));
-}
-
-// MAP_SHARED PROT_WRITE not allowed on write-only FDs.
-// The FD must always be readable.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, WriteSharedOnWriteOnlyFd) {
+// The FD must be readable.
+TEST_P(MMapFileParamTest, WriteOnlyFd) {
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY));
 
   uintptr_t addr;
-  EXPECT_THAT(addr = Map(0, kPageSize, PROT_WRITE, MAP_SHARED, fd.get(), 0),
+  EXPECT_THAT(addr = Map(0, kPageSize, prot(), flags(), fd.get(), 0),
               SyscallFailsWithErrno(EACCES));
 }
 
@@ -1182,7 +1161,7 @@ TEST_F(MMapFileTest, ReadSharedTruncateDownThenUp) {
   ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
               SyscallSucceeds());
 
-  // Check that the memory contains he file data.
+  // Check that the memory contains the file data.
   EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), buf.c_str(), kPageSize));
 
   // Truncate down, then up.
@@ -1371,125 +1350,68 @@ TEST_F(MMapFileTest, WritePrivate) {
               EqualsMemory(std::string(len, '\0')));
 }
 
-// SIGBUS raised when writing past end of file to a private mapping.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, SigBusDeathWritePrivate) {
+// SIGBUS raised when reading or writing past end of a mapped file.
+TEST_P(MMapFileParamTest, SigBusDeath) {
   SetupGvisorDeathTest();
 
   uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
-                         fd_.get(), 0),
+  ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0),
               SyscallSucceeds());
 
-  // MMapFileTest makes a file kPageSize/2 long. The entire first page will be
-  // accessible. Write just beyond that.
-  size_t len = strlen(kFileContents);
-  EXPECT_EXIT(std::copy(kFileContents, kFileContents + len,
-                        reinterpret_cast<volatile char*>(addr + kPageSize)),
-              ::testing::KilledBySignal(SIGBUS), "");
-}
-
-// SIGBUS raised when reading past end of file on a shared mapping.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, SigBusDeathReadShared) {
-  SetupGvisorDeathTest();
-
-  uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
-              SyscallSucceeds());
-
-  // MMapFileTest makes a file kPageSize/2 long. The entire first page will be
-  // accessible. Read just beyond that.
-  std::vector<char> in(kPageSize);
-  EXPECT_EXIT(
-      std::copy(reinterpret_cast<volatile char*>(addr + kPageSize),
-                reinterpret_cast<volatile char*>(addr + kPageSize) + kPageSize,
-                in.data()),
-      ::testing::KilledBySignal(SIGBUS), "");
+  auto* start = reinterpret_cast<volatile char*>(addr + kPageSize);
+
+  // MMapFileTest makes a file kPageSize/2 long. The entire first page should be
+  // accessible, but anything beyond it should not.
+  if (prot() & PROT_WRITE) {
+    // Write beyond first page.
+    size_t len = strlen(kFileContents);
+    EXPECT_EXIT(std::copy(kFileContents, kFileContents + len, start),
+                ::testing::KilledBySignal(SIGBUS), "");
+  } else {
+    // Read beyond first page.
+    std::vector<char> in(kPageSize);
+    EXPECT_EXIT(std::copy(start, start + kPageSize, in.data()),
+                ::testing::KilledBySignal(SIGBUS), "");
+  }
 }
 
-// SIGBUS raised when reading past end of file on a shared mapping.
+// Tests that SIGBUS is not raised when reading or writing to a file-mapped
+// page before EOF, even if part of the mapping extends beyond EOF.
 //
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, SigBusDeathWriteShared) {
-  SetupGvisorDeathTest();
-
+// See b/27877699.
+TEST_P(MMapFileParamTest, NoSigBusOnPagesBeforeEOF) {
   uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
-                         fd_.get(), 0),
-              SyscallSucceeds());
-
-  // MMapFileTest makes a file kPageSize/2 long. The entire first page will be
-  // accessible. Write just beyond that.
-  size_t len = strlen(kFileContents);
-  EXPECT_EXIT(std::copy(kFileContents, kFileContents + len,
-                        reinterpret_cast<volatile char*>(addr + kPageSize)),
-              ::testing::KilledBySignal(SIGBUS), "");
-}
-
-// Tests that SIGBUS is not raised when writing to a file-mapped page before
-// EOF, even if part of the mapping extends beyond EOF.
-TEST_F(MMapFileTest, NoSigBusOnPagesBeforeEOF) {
-  uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
-                         fd_.get(), 0),
+  ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0),
               SyscallSucceeds());
 
   // The test passes if this survives.
-  size_t len = strlen(kFileContents);
-  std::copy(kFileContents, kFileContents + len,
-            reinterpret_cast<volatile char*>(addr));
-}
-
-// Tests that SIGBUS is not raised when writing to a file-mapped page containing
-// EOF, *after* the EOF for a private mapping.
-TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWritePrivate) {
-  uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
-                         fd_.get(), 0),
-              SyscallSucceeds());
-
-  // The test passes if this survives. (Technically addr+kPageSize/2 is already
-  // beyond EOF, but +1 to check for fencepost errors.)
-  size_t len = strlen(kFileContents);
-  std::copy(kFileContents, kFileContents + len,
-            reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1));
-}
-
-// Tests that SIGBUS is not raised when reading from a file-mapped page
-// containing EOF, *after* the EOF for a shared mapping.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFReadShared) {
-  uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
-              SyscallSucceeds());
-
-  // The test passes if this survives. (Technically addr+kPageSize/2 is already
-  // beyond EOF, but +1 to check for fencepost errors.)
   auto* start = reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1);
   size_t len = strlen(kFileContents);
-  std::vector<char> in(len);
-  std::copy(start, start + len, in.data());
+  if (prot() & PROT_WRITE) {
+    std::copy(kFileContents, kFileContents + len, start);
+  } else {
+    std::vector<char> in(len);
+    std::copy(start, start + len, in.data());
+  }
 }
 
-// Tests that SIGBUS is not raised when writing to a file-mapped page containing
-// EOF, *after* the EOF for a shared mapping.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWriteShared) {
+// Tests that SIGBUS is not raised when reading or writing from a file-mapped
+// page containing EOF, *after* the EOF.
+TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) {
   uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
-                         fd_.get(), 0),
+  ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0),
               SyscallSucceeds());
 
   // The test passes if this survives. (Technically addr+kPageSize/2 is already
   // beyond EOF, but +1 to check for fencepost errors.)
+  auto* start = reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1);
   size_t len = strlen(kFileContents);
-  std::copy(kFileContents, kFileContents + len,
-            reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1));
+  if (prot() & PROT_WRITE) {
+    std::copy(kFileContents, kFileContents + len, start);
+  } else {
+    std::vector<char> in(len);
+    std::copy(start, start + len, in.data());
+  }
 }
 
 // Tests that reading from writable shared file-mapped pages succeeds.
@@ -1733,6 +1655,15 @@ TEST(MMapNoFixtureTest, Map32Bit) {
 
 #endif  // defined(__x86_64__)
 
+INSTANTIATE_TEST_SUITE_P(
+    ReadWriteSharedPrivate, MMapFileParamTest,
+    ::testing::Combine(::testing::ValuesIn({
+                           PROT_READ,
+                           PROT_WRITE,
+                           PROT_READ | PROT_WRITE,
+                       }),
+                       ::testing::ValuesIn({MAP_SHARED, MAP_PRIVATE})));
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From 29955a4797e8264f75886a989dbc81b2b5443f4c Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Thu, 19 Dec 2019 17:25:18 -0800
Subject: futex: wake one waiter if futex_wake is called with a non-positive
 value

This change is needed to be compatible with the Linux kernel.

There is no glibc wrapper for the futex system call, so it is easy to
make a mistake and call syscall(__NR_futex, FUTEX_WAKE, addr) without
the fourth argument. This works on Linux, because it wakes one waiter
even if val is nonpositive.

PiperOrigin-RevId: 286494396
---
 pkg/sentry/syscalls/linux/sys_futex.go | 10 ++++++++++
 test/syscalls/linux/futex.cc           | 21 +++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
index b9bd25464..bde17a767 100644
--- a/pkg/sentry/syscalls/linux/sys_futex.go
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -226,6 +226,11 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		if mask == 0 {
 			return 0, nil, syserror.EINVAL
 		}
+		if val <= 0 {
+			// The Linux kernel wakes one waiter even if val is
+			// non-positive.
+			val = 1
+		}
 		n, err := t.Futex().Wake(t, addr, private, mask, val)
 		return uintptr(n), nil, err
 
@@ -242,6 +247,11 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 
 	case linux.FUTEX_WAKE_OP:
 		op := uint32(val3)
+		if val <= 0 {
+			// The Linux kernel wakes one waiter even if val is
+			// non-positive.
+			val = 1
+		}
 		n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op)
 		return uintptr(n), nil, err
 
diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc
index d3e3f998c..40c80a6e1 100644
--- a/test/syscalls/linux/futex.cc
+++ b/test/syscalls/linux/futex.cc
@@ -239,6 +239,27 @@ TEST_P(PrivateAndSharedFutexTest, Wake1_NoRandomSave) {
   EXPECT_THAT(futex_wake(IsPrivate(), &a, 1), SyscallSucceedsWithValue(1));
 }
 
+TEST_P(PrivateAndSharedFutexTest, Wake0_NoRandomSave) {
+  constexpr int kInitialValue = 1;
+  std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+
+  // Prevent save/restore from interrupting futex_wait, which will cause it to
+  // return EAGAIN instead of the expected result if futex_wait is restarted
+  // after we change the value of a below.
+  DisableSave ds;
+  ScopedThread thread([&] {
+    EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue),
+                SyscallSucceedsWithValue(0));
+  });
+  absl::SleepFor(kWaiterStartupDelay);
+
+  // Change a so that if futex_wake happens before futex_wait, the latter
+  // returns EAGAIN instead of hanging the test.
+  a.fetch_add(1);
+  // The Linux kernel wakes one waiter even if val is 0 or negative.
+  EXPECT_THAT(futex_wake(IsPrivate(), &a, 0), SyscallSucceedsWithValue(1));
+}
+
 TEST_P(PrivateAndSharedFutexTest, WakeAll_NoRandomSave) {
   constexpr int kInitialValue = 1;
   std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
-- 
cgit v1.2.3


From 822d847ccaa1e6016b818bee289b5e33335f9fee Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 20 Dec 2019 08:43:15 -0800
Subject: Check for valid nfds before copying in an fd set.

Otherwise, CopyInFDSet will try to allocate a negative-length slice.

PiperOrigin-RevId: 286584907
---
 pkg/sentry/strace/select.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkg/sentry/strace/select.go b/pkg/sentry/strace/select.go
index dea309fda..c77d418e6 100644
--- a/pkg/sentry/strace/select.go
+++ b/pkg/sentry/strace/select.go
@@ -36,6 +36,9 @@ func fdsFromSet(t *kernel.Task, set []byte) []int {
 }
 
 func fdSet(t *kernel.Task, nfds int, addr usermem.Addr) string {
+	if nfds < 0 {
+		return fmt.Sprintf("%#x (negative nfds)", addr)
+	}
 	if addr == 0 {
 		return "null"
 	}
-- 
cgit v1.2.3


From 3eb489ed6c67b069bc135ab92cb031ce80b40d8f Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 20 Dec 2019 11:52:24 -0800
Subject: Move VFS2 file description status flags to vfs.FileDescription.

PiperOrigin-RevId: 286616668
---
 pkg/sentry/fsimpl/ext/file_description.go         |  19 ---
 pkg/sentry/fsimpl/ext/inode.go                    |   9 +-
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go    |  16 +--
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go          |  16 +--
 pkg/sentry/fsimpl/memfs/filesystem.go             |  11 +-
 pkg/sentry/fsimpl/memfs/memfs.go                  |  14 ---
 pkg/sentry/fsimpl/memfs/named_pipe.go             |   2 +-
 pkg/sentry/vfs/file_description.go                | 141 +++++++++++++++-------
 pkg/sentry/vfs/file_description_impl_util_test.go |   2 +-
 9 files changed, 107 insertions(+), 123 deletions(-)

diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index 5eca2b83f..841274daf 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -26,13 +26,6 @@ import (
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
-
-	// flags is the same as vfs.OpenOptions.Flags which are passed to
-	// vfs.FilesystemImpl.OpenAt.
-	// TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2),
-	// fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set.
-	// Only close(2), fstat(2), fstatfs(2) should work.
-	flags uint32
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
@@ -43,18 +36,6 @@ func (fd *fileDescription) inode() *inode {
 	return fd.vfsfd.Dentry().Impl().(*dentry).inode
 }
 
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
-}
-
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	var stat linux.Statx
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 24249525c..b2cc826c7 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -157,10 +157,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		fd.flags = flags
 		mnt.IncRef()
 		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
+		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *directory:
 		// Can't open directories writably. This check is not necessary for a read
@@ -169,10 +168,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		fd.flags = flags
 		mnt.IncRef()
 		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
+		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *symlink:
 		if flags&linux.O_PATH == 0 {
@@ -180,10 +178,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.ELOOP
 		}
 		var fd symlinkFD
-		fd.flags = flags
 		mnt.IncRef()
 		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
+		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", in.impl))
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 30c06baf0..51102ce48 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -65,17 +65,15 @@ type DynamicBytesFD struct {
 
 	vfsfd vfs.FileDescription
 	inode Inode
-	flags uint32
 }
 
 // Init initializes a DynamicBytesFD.
 func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) {
 	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
 	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
-	fd.flags = flags
 	fd.inode = d.Impl().(*Dentry).inode
 	fd.SetDataSource(data)
-	fd.vfsfd.Init(fd, m, d)
+	fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
 }
 
 // Seek implements vfs.FileDescriptionImpl.Seek.
@@ -117,15 +115,3 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error {
 	// DynamicBytesFiles are immutable.
 	return syserror.EPERM
 }
-
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *DynamicBytesFD) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *DynamicBytesFD) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index d6c18937a..bd402330f 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -39,7 +39,6 @@ type GenericDirectoryFD struct {
 
 	vfsfd    vfs.FileDescription
 	children *OrderedChildren
-	flags    uint32
 	off      int64
 }
 
@@ -48,8 +47,7 @@ func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *Ordere
 	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
 	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
 	fd.children = children
-	fd.flags = flags
-	fd.vfsfd.Init(fd, m, d)
+	fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
 }
 
 // VFSFileDescription returns a pointer to the vfs.FileDescription representing
@@ -180,18 +178,6 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
 	return offset, nil
 }
 
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *GenericDirectoryFD) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *GenericDirectoryFD) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
-}
-
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := fd.filesystem()
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index 22f1e811f..af4389459 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -282,9 +282,8 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	// Filter out flags that are not supported by memfs. O_DIRECTORY and
 	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
-	// appropriate bits in rp), but are returned by
-	// FileDescriptionImpl.StatusFlags(). O_NONBLOCK is supported only by
-	// pipes.
+	// appropriate bits in rp), but are visible in FD status flags. O_NONBLOCK
+	// is supported only by pipes.
 	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK
 
 	if opts.Flags&linux.O_CREAT == 0 {
@@ -384,7 +383,6 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 	switch impl := i.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		fd.flags = flags
 		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
 		fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
 		if fd.writable {
@@ -395,7 +393,7 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 		}
 		mnt.IncRef()
 		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
+		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		if flags&linux.O_TRUNC != 0 {
 			impl.mu.Lock()
 			impl.data = impl.data[:0]
@@ -411,8 +409,7 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 		var fd directoryFD
 		mnt.IncRef()
 		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
-		fd.flags = flags
+		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *symlink:
 		// Can't open symlinks without O_PATH (which is unimplemented).
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
index 4cb2a4e0f..9d509f6e4 100644
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -261,8 +261,6 @@ func (i *inode) direntType() uint8 {
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
-
-	flags uint32 // status flags; immutable
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
@@ -273,18 +271,6 @@ func (fd *fileDescription) inode() *inode {
 	return fd.vfsfd.Dentry().Impl().(*dentry).inode
 }
 
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
-}
-
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	var stat linux.Statx
diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go
index 91cb4b1fc..d5060850e 100644
--- a/pkg/sentry/fsimpl/memfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/memfs/named_pipe.go
@@ -57,6 +57,6 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v
 	mnt := rp.Mount()
 	mnt.IncRef()
 	vfsd.IncRef()
-	fd.vfsfd.Init(&fd, mnt, vfsd)
+	fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 	return &fd.vfsfd, nil
 }
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index c5a9adca3..df03886c3 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -39,49 +40,43 @@ type FileDescription struct {
 	// operations.
 	refs int64
 
+	// statusFlags contains status flags, "initialized by open(2) and possibly
+	// modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic
+	// memory operations.
+	statusFlags uint32
+
 	// vd is the filesystem location at which this FileDescription was opened.
 	// A reference is held on vd. vd is immutable.
 	vd VirtualDentry
 
+	opts FileDescriptionOptions
+
 	// impl is the FileDescriptionImpl associated with this Filesystem. impl is
 	// immutable. This should be the last field in FileDescription.
 	impl FileDescriptionImpl
 }
 
+// FileDescriptionOptions contains options to FileDescription.Init().
+type FileDescriptionOptions struct {
+	// If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is
+	// usually only the case if O_DIRECT would actually have an effect.
+	AllowDirectIO bool
+}
+
 // Init must be called before first use of fd. It takes ownership of references
-// on mnt and d held by the caller.
-func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) {
+// on mnt and d held by the caller. statusFlags is the initial file description
+// status flags, which is usually the full set of flags passed to open(2).
+func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) {
 	fd.refs = 1
+	fd.statusFlags = statusFlags | linux.O_LARGEFILE
 	fd.vd = VirtualDentry{
 		mount:  mnt,
 		dentry: d,
 	}
+	fd.opts = *opts
 	fd.impl = impl
 }
 
-// Impl returns the FileDescriptionImpl associated with fd.
-func (fd *FileDescription) Impl() FileDescriptionImpl {
-	return fd.impl
-}
-
-// Mount returns the mount on which fd was opened. It does not take a reference
-// on the returned Mount.
-func (fd *FileDescription) Mount() *Mount {
-	return fd.vd.mount
-}
-
-// Dentry returns the dentry at which fd was opened. It does not take a
-// reference on the returned Dentry.
-func (fd *FileDescription) Dentry() *Dentry {
-	return fd.vd.dentry
-}
-
-// VirtualDentry returns the location at which fd was opened. It does not take
-// a reference on the returned VirtualDentry.
-func (fd *FileDescription) VirtualDentry() VirtualDentry {
-	return fd.vd
-}
-
 // IncRef increments fd's reference count.
 func (fd *FileDescription) IncRef() {
 	atomic.AddInt64(&fd.refs, 1)
@@ -113,6 +108,82 @@ func (fd *FileDescription) DecRef() {
 	}
 }
 
+// Mount returns the mount on which fd was opened. It does not take a reference
+// on the returned Mount.
+func (fd *FileDescription) Mount() *Mount {
+	return fd.vd.mount
+}
+
+// Dentry returns the dentry at which fd was opened. It does not take a
+// reference on the returned Dentry.
+func (fd *FileDescription) Dentry() *Dentry {
+	return fd.vd.dentry
+}
+
+// VirtualDentry returns the location at which fd was opened. It does not take
+// a reference on the returned VirtualDentry.
+func (fd *FileDescription) VirtualDentry() VirtualDentry {
+	return fd.vd
+}
+
+// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
+func (fd *FileDescription) StatusFlags() uint32 {
+	return atomic.LoadUint32(&fd.statusFlags)
+}
+
+// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
+func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error {
+	// Compare Linux's fs/fcntl.c:setfl().
+	oldFlags := fd.StatusFlags()
+	// Linux documents this check as "O_APPEND cannot be cleared if the file is
+	// marked as append-only and the file is open for write", which would make
+	// sense. However, the check as actually implemented seems to be "O_APPEND
+	// cannot be changed if the file is marked as append-only".
+	if (flags^oldFlags)&linux.O_APPEND != 0 {
+		stat, err := fd.impl.Stat(ctx, StatOptions{
+			// There is no mask bit for stx_attributes.
+			Mask: 0,
+			// Linux just reads inode::i_flags directly.
+			Sync: linux.AT_STATX_DONT_SYNC,
+		})
+		if err != nil {
+			return err
+		}
+		if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) {
+			return syserror.EPERM
+		}
+	}
+	if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) {
+		stat, err := fd.impl.Stat(ctx, StatOptions{
+			Mask: linux.STATX_UID,
+			// Linux's inode_owner_or_capable() just reads inode::i_uid
+			// directly.
+			Sync: linux.AT_STATX_DONT_SYNC,
+		})
+		if err != nil {
+			return err
+		}
+		if stat.Mask&linux.STATX_UID == 0 {
+			return syserror.EPERM
+		}
+		if !CanActAsOwner(creds, auth.KUID(stat.UID)) {
+			return syserror.EPERM
+		}
+	}
+	if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO {
+		return syserror.EINVAL
+	}
+	// TODO(jamieliu): FileDescriptionImpl.SetOAsync()?
+	const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK
+	atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags))
+	return nil
+}
+
+// Impl returns the FileDescriptionImpl associated with fd.
+func (fd *FileDescription) Impl() FileDescriptionImpl {
+	return fd.impl
+}
+
 // FileDescriptionImpl contains implementation details for an FileDescription.
 // Implementations of FileDescriptionImpl should contain their associated
 // FileDescription by value as their first field.
@@ -132,14 +203,6 @@ type FileDescriptionImpl interface {
 	// prevent the file descriptor from being closed.
 	OnClose(ctx context.Context) error
 
-	// StatusFlags returns file description status flags, as for
-	// fcntl(F_GETFL).
-	StatusFlags(ctx context.Context) (uint32, error)
-
-	// SetStatusFlags sets file description status flags, as for
-	// fcntl(F_SETFL).
-	SetStatusFlags(ctx context.Context, flags uint32) error
-
 	// Stat returns metadata for the file represented by the FileDescription.
 	Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)
 
@@ -264,18 +327,6 @@ func (fd *FileDescription) OnClose(ctx context.Context) error {
 	return fd.impl.OnClose(ctx)
 }
 
-// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
-func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	flags, err := fd.impl.StatusFlags(ctx)
-	flags |= linux.O_LARGEFILE
-	return flags, err
-}
-
-// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
-func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	return fd.impl.SetStatusFlags(ctx, flags)
-}
-
 // Stat returns metadata for the file represented by fd.
 func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
 	return fd.impl.Stat(ctx, opts)
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index ac7799296..678be07fe 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -48,7 +48,7 @@ type genCountFD struct {
 
 func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription {
 	var fd genCountFD
-	fd.vfsfd.Init(&fd, mnt, vfsd)
+	fd.vfsfd.Init(&fd, 0 /* statusFlags */, mnt, vfsd, &FileDescriptionOptions{})
 	fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd)
 	return &fd.vfsfd
 }
-- 
cgit v1.2.3


From 08c39e25870821f84f6da1915ceefe13b3196e02 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 20 Dec 2019 14:17:57 -0800
Subject: Change TODO to track correct bug.

PiperOrigin-RevId: 286639163
---
 pkg/tcpip/network/ipv6/ipv6.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index dd31f0fb7..e13f1fabf 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -162,7 +162,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 // WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
 // supported by IPv6.
 func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
-	// TODO(b/119580726): Support IPv6 header-included packets.
+	// TODO(b/146666412): Support IPv6 header-included packets.
 	return tcpip.ErrNotSupported
 }
 
-- 
cgit v1.2.3


From 21a14e9532365fc5eb51a5796ab66cf7f007ede3 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 20 Dec 2019 16:43:34 -0800
Subject: Add vfs.Dentry.Children().

PiperOrigin-RevId: 286660774
---
 pkg/sentry/vfs/dentry.go | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 40f4c1d09..6209eb053 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -85,12 +85,12 @@ type Dentry struct {
 	// mounts is accessed using atomic memory operations.
 	mounts uint32
 
-	// mu synchronizes disowning and mounting over this Dentry.
-	mu sync.Mutex
-
 	// children are child Dentries.
 	children map[string]*Dentry
 
+	// mu synchronizes disowning and mounting over this Dentry.
+	mu sync.Mutex
+
 	// impl is the DentryImpl associated with this Dentry. impl is immutable.
 	// This should be the last field in Dentry.
 	impl DentryImpl
@@ -199,6 +199,18 @@ func (d *Dentry) HasChildren() bool {
 	return len(d.children) != 0
 }
 
+// Children returns a map containing all of d's children.
+func (d *Dentry) Children() map[string]*Dentry {
+	if !d.HasChildren() {
+		return nil
+	}
+	m := make(map[string]*Dentry)
+	for name, child := range d.children {
+		m[name] = child
+	}
+	return m
+}
+
 // InsertChild makes child a child of d with the given name.
 //
 // InsertChild is a mutator of d and child.
-- 
cgit v1.2.3


From 818eb22b11d6e0c056af6d4605e8cd246e622231 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 20 Dec 2019 17:40:18 -0800
Subject: Add vfs.ResolvingPath.HandleJump().

PiperOrigin-RevId: 286666533
---
 pkg/sentry/vfs/resolving_path.go | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 621f5a6f8..d580fd39e 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -85,11 +85,11 @@ func init() {
 // so error "constants" are really mutable vars, necessitating somewhat
 // expensive interface object comparisons.
 
-type resolveMountRootError struct{}
+type resolveMountRootOrJumpError struct{}
 
 // Error implements error.Error.
-func (resolveMountRootError) Error() string {
-	return "resolving mount root"
+func (resolveMountRootOrJumpError) Error() string {
+	return "resolving mount root or jump"
 }
 
 type resolveMountPointError struct{}
@@ -274,7 +274,7 @@ func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
 			// ... of non-root mount.
 			rp.nextMount = vd.mount
 			rp.nextStart = vd.dentry
-			return nil, resolveMountRootError{}
+			return nil, resolveMountRootOrJumpError{}
 		}
 		// ... of root mount.
 		parent = d
@@ -385,11 +385,32 @@ func (rp *ResolvingPath) relpathPrepend(path fspath.Path) {
 	}
 }
 
+// HandleJump is called when the current path component is a "magic" link to
+// the given VirtualDentry, like /proc/[pid]/fd/[fd]. If the calling Filesystem
+// method should continue path traversal, HandleMagicSymlink updates the path
+// component stream to reflect the magic link target and returns nil. Otherwise
+// it returns a non-nil error.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) HandleJump(target VirtualDentry) error {
+	if rp.symlinks >= linux.MaxSymlinkTraversals {
+		return syserror.ELOOP
+	}
+	rp.symlinks++
+	// Consume the path component that represented the magic link.
+	rp.Advance()
+	// Unconditionally return a resolveMountRootOrJumpError, even if the Mount
+	// isn't changing, to force restarting at the new Dentry.
+	target.IncRef()
+	rp.nextMount = target.mount
+	rp.nextStart = target.dentry
+	return resolveMountRootOrJumpError{}
+}
+
 func (rp *ResolvingPath) handleError(err error) bool {
 	switch err.(type) {
-	case resolveMountRootError:
-		// Switch to the new Mount. We hold references on the Mount and Dentry
-		// (from VFS.getMountpointAt()).
+	case resolveMountRootOrJumpError:
+		// Switch to the new Mount. We hold references on the Mount and Dentry.
 		rp.decRefStartAndMount()
 		rp.mount = rp.nextMount
 		rp.start = rp.nextStart
@@ -407,9 +428,8 @@ func (rp *ResolvingPath) handleError(err error) bool {
 		return true
 
 	case resolveMountPointError:
-		// Switch to the new Mount. We hold a reference on the Mount (from
-		// VFS.getMountAt()), but borrow the reference on the mount root from
-		// the Mount.
+		// Switch to the new Mount. We hold a reference on the Mount, but
+		// borrow the reference on the mount root from the Mount.
 		rp.decRefStartAndMount()
 		rp.mount = rp.nextMount
 		rp.start = rp.nextMount.root
-- 
cgit v1.2.3


From b6e31aadaa74b235c46bfa5e08b5ac66a3b4f8c5 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Mon, 23 Dec 2019 03:01:07 +0000
Subject: Replace syscall.PTRACE_SYSEMU with unix.PTRACE_SYSEMU

Linux PTRACE_SYSEMU support on arm64 was merged to mainline from
V5.3, and the corresponding support in go also enabled recently.

Since the "syscall" package is locked down from go 1.4, so the ptrace
PTRACE_SYSEMU definition can't be added to package "syscall" on arm64.
According to the golang community, updates required by new systems or
versions should use the corresponding package in the golang.org/x/sys
repository instead(https://golang.org/pkg/syscall/).

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I2f917bb2be62f990c3e158e2bb99e094ea03f751
---
 pkg/sentry/platform/ptrace/subprocess.go       | 5 +++--
 pkg/sentry/platform/ptrace/subprocess_linux.go | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index ddb1f41e3..821f6848d 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -21,6 +21,7 @@ import (
 	"sync"
 	"syscall"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/procid"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -541,14 +542,14 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 		if isSingleStepping(regs) {
 			if _, _, errno := syscall.RawSyscall6(
 				syscall.SYS_PTRACE,
-				syscall.PTRACE_SYSEMU_SINGLESTEP,
+				unix.PTRACE_SYSEMU_SINGLESTEP,
 				uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 				panic(fmt.Sprintf("ptrace sysemu failed: %v", errno))
 			}
 		} else {
 			if _, _, errno := syscall.RawSyscall6(
 				syscall.SYS_PTRACE,
-				syscall.PTRACE_SYSEMU,
+				unix.PTRACE_SYSEMU,
 				uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 				panic(fmt.Sprintf("ptrace sysemu failed: %v", errno))
 			}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index cf13ea5e4..74968dfdf 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -54,7 +54,7 @@ func probeSeccomp() bool {
 
 	for {
 		// Attempt an emulation.
-		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
+		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
 		}
 
-- 
cgit v1.2.3


From 5bc4ae9d5746e65909a0bdab60e7bd598d4401c7 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Mon, 23 Dec 2019 08:53:57 -0800
Subject: Clear any host-specific NDP state when becoming a router

This change supports clearing all host-only NDP state when NICs become routers.
All discovered routers, discovered on-link prefixes and auto-generated addresses
will be invalidated when becoming a router. This is because normally, routers do
not process Router Advertisements to discover routers or on-link prefixes, and
do not do SLAAC.

Tests: Unittest to make sure that all discovered routers, discovered prefixes
and auto-generated addresses get invalidated when transitioning from a host to
a router.
PiperOrigin-RevId: 286902309
---
 pkg/tcpip/stack/ndp.go      |  35 ++++++
 pkg/tcpip/stack/ndp_test.go | 283 ++++++++++++++++++++++++++++++++++++++++++--
 pkg/tcpip/stack/nic.go      |  12 ++
 pkg/tcpip/stack/stack.go    |  22 +++-
 4 files changed, 338 insertions(+), 14 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 90664ba8a..d9ab59336 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -1155,3 +1155,38 @@ func (ndp *ndpState) autoGenAddrInvalidationTimer(addr tcpip.Address, vl time.Du
 		ndp.invalidateAutoGenAddress(addr)
 	})
 }
+
+// cleanupHostOnlyState cleans up any state that is only useful for hosts.
+//
+// cleanupHostOnlyState MUST be called when ndp's NIC is transitioning from a
+// host to a router. This function will invalidate all discovered on-link
+// prefixes, discovered routers, and auto-generated addresses as routers do not
+// normally process Router Advertisements to discover default routers and
+// on-link prefixes, and auto-generate addresses via SLAAC.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupHostOnlyState() {
+	for addr, _ := range ndp.autoGenAddresses {
+		ndp.invalidateAutoGenAddress(addr)
+	}
+
+	if got := len(ndp.autoGenAddresses); got != 0 {
+		log.Fatalf("ndp: still have auto-generated addresses after cleaning up, found = %d", got)
+	}
+
+	for prefix, _ := range ndp.onLinkPrefixes {
+		ndp.invalidateOnLinkPrefix(prefix)
+	}
+
+	if got := len(ndp.onLinkPrefixes); got != 0 {
+		log.Fatalf("ndp: still have discovered on-link prefixes after cleaning up, found = %d", got)
+	}
+
+	for router, _ := range ndp.defaultRouters {
+		ndp.invalidateDefaultRouter(router)
+	}
+
+	if got := len(ndp.defaultRouters); got != 0 {
+		log.Fatalf("ndp: still have discovered default routers after cleaning up, found = %d", got)
+	}
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 666f86c33..64a9a2b20 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -47,6 +47,19 @@ var (
 	llAddr3 = header.LinkLocalAddr(linkAddr3)
 )
 
+func addrForSubnet(subnet tcpip.Subnet, linkAddr tcpip.LinkAddress) tcpip.AddressWithPrefix {
+	if !header.IsValidUnicastEthernetAddress(linkAddr) {
+		return tcpip.AddressWithPrefix{}
+	}
+
+	addrBytes := []byte(subnet.ID())
+	header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+	return tcpip.AddressWithPrefix{
+		Address:   tcpip.Address(addrBytes),
+		PrefixLen: 64,
+	}
+}
+
 // prefixSubnetAddr returns a prefix (Address + Length), the prefix's equivalent
 // tcpip.Subnet, and an address where the lower half of the address is composed
 // of the EUI-64 of linkAddr if it is a valid unicast ethernet address.
@@ -59,17 +72,7 @@ func prefixSubnetAddr(offset uint8, linkAddr tcpip.LinkAddress) (tcpip.AddressWi
 
 	subnet := prefix.Subnet()
 
-	var addr tcpip.AddressWithPrefix
-	if header.IsValidUnicastEthernetAddress(linkAddr) {
-		addrBytes := []byte(subnet.ID())
-		header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
-		addr = tcpip.AddressWithPrefix{
-			Address:   tcpip.Address(addrBytes),
-			PrefixLen: 64,
-		}
-	}
-
-	return prefix, subnet, addr
+	return prefix, subnet, addrForSubnet(subnet, linkAddr)
 }
 
 // TestDADDisabled tests that an address successfully resolves immediately
@@ -1772,7 +1775,7 @@ func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
 				// test.evl.
 				select {
 				case <-ndpDisp.autoGenAddrC:
-					t.Fatalf("unexpectedly received an auto gen addr event")
+					t.Fatal("unexpectedly received an auto gen addr event")
 				case <-time.After(time.Duration(test.evl)*time.Second - delta):
 				}
 
@@ -1846,7 +1849,7 @@ func TestAutoGenAddrRemoval(t *testing.T) {
 	// got stopped/cleaned up.
 	select {
 	case <-ndpDisp.autoGenAddrC:
-		t.Fatalf("unexpectedly received an auto gen addr event")
+		t.Fatal("unexpectedly received an auto gen addr event")
 	case <-time.After(lifetimeSeconds*time.Second + defaultTimeout):
 	}
 }
@@ -2055,3 +2058,257 @@ func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
 		})
 	}
 }
+
+// TestCleanupHostOnlyStateOnBecomingRouter tests that all discovered routers
+// and prefixes, and auto-generated addresses get invalidated when a NIC
+// becomes a router.
+func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
+	t.Parallel()
+
+	const (
+		lifetimeSeconds = 5
+		maxEvents       = 4
+		nicID1          = 1
+		nicID2          = 2
+	)
+
+	prefix1, subnet1, e1Addr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, subnet2, e1Addr2 := prefixSubnetAddr(1, linkAddr1)
+	e2Addr1 := addrForSubnet(subnet1, linkAddr2)
+	e2Addr2 := addrForSubnet(subnet2, linkAddr2)
+
+	ndpDisp := ndpDispatcher{
+		routerC:        make(chan ndpRouterEvent, maxEvents),
+		rememberRouter: true,
+		prefixC:        make(chan ndpPrefixEvent, maxEvents),
+		rememberPrefix: true,
+		autoGenAddrC:   make(chan ndpAutoGenAddrEvent, maxEvents),
+	}
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverDefaultRouters: true,
+			DiscoverOnLinkPrefixes: true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	e1 := channel.New(0, 1280, linkAddr1)
+	if err := s.CreateNIC(nicID1, e1); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
+	}
+
+	e2 := channel.New(0, 1280, linkAddr2)
+	if err := s.CreateNIC(nicID2, e2); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
+	}
+
+	expectRouterEvent := func() (bool, ndpRouterEvent) {
+		select {
+		case e := <-ndpDisp.routerC:
+			return true, e
+		default:
+		}
+
+		return false, ndpRouterEvent{}
+	}
+
+	expectPrefixEvent := func() (bool, ndpPrefixEvent) {
+		select {
+		case e := <-ndpDisp.prefixC:
+			return true, e
+		default:
+		}
+
+		return false, ndpPrefixEvent{}
+	}
+
+	expectAutoGenAddrEvent := func() (bool, ndpAutoGenAddrEvent) {
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			return true, e
+		default:
+		}
+
+		return false, ndpAutoGenAddrEvent{}
+	}
+
+	// Receive RAs on NIC(1) and NIC(2) from default routers (llAddr1 and
+	// llAddr2) w/ PI (for prefix1 in RA from llAddr1 and prefix2 in RA from
+	// llAddr2) to discover multiple routers and prefixes, and auto-gen
+	// multiple addresses.
+
+	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr1, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+	// We have other tests that make sure we receive the *correct* events
+	// on normal discovery of routers/prefixes, and auto-generated
+	// addresses. Here we just make sure we get an event and let other tests
+	// handle the correctness check.
+	if ok, _ := expectRouterEvent(); !ok {
+		t.Errorf("expected router event for %s on NIC(%d)", llAddr1, nicID1)
+	}
+	if ok, _ := expectPrefixEvent(); !ok {
+		t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1)
+	}
+	if ok, _ := expectAutoGenAddrEvent(); !ok {
+		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr1, nicID1)
+	}
+
+	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+	if ok, _ := expectRouterEvent(); !ok {
+		t.Errorf("expected router event for %s on NIC(%d)", llAddr2, nicID1)
+	}
+	if ok, _ := expectPrefixEvent(); !ok {
+		t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1)
+	}
+	if ok, _ := expectAutoGenAddrEvent(); !ok {
+		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID1)
+	}
+
+	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr1, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+	if ok, _ := expectRouterEvent(); !ok {
+		t.Errorf("expected router event for %s on NIC(%d)", llAddr1, nicID2)
+	}
+	if ok, _ := expectPrefixEvent(); !ok {
+		t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2)
+	}
+	if ok, _ := expectAutoGenAddrEvent(); !ok {
+		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID2)
+	}
+
+	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+	if ok, _ := expectRouterEvent(); !ok {
+		t.Errorf("expected router event for %s on NIC(%d)", llAddr2, nicID2)
+	}
+	if ok, _ := expectPrefixEvent(); !ok {
+		t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2)
+	}
+	if ok, _ := expectAutoGenAddrEvent(); !ok {
+		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e2Addr2, nicID2)
+	}
+
+	// We should have the auto-generated addresses added.
+	nicinfo := s.NICInfo()
+	nic1Addrs := nicinfo[nicID1].ProtocolAddresses
+	nic2Addrs := nicinfo[nicID2].ProtocolAddresses
+	if !contains(nic1Addrs, e1Addr1) {
+		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
+	}
+	if !contains(nic1Addrs, e1Addr2) {
+		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
+	}
+	if !contains(nic2Addrs, e2Addr1) {
+		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
+	}
+	if !contains(nic2Addrs, e2Addr2) {
+		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
+	}
+
+	// We can't proceed any further if we already failed the test (missing
+	// some discovery/auto-generated address events or addresses).
+	if t.Failed() {
+		t.FailNow()
+	}
+
+	s.SetForwarding(true)
+
+	// Collect invalidation events after becoming a router
+	gotRouterEvents := make(map[ndpRouterEvent]int)
+	for i := 0; i < maxEvents; i++ {
+		ok, e := expectRouterEvent()
+		if !ok {
+			t.Errorf("expected %d router events after becoming a router; got = %d", maxEvents, i)
+			break
+		}
+		gotRouterEvents[e]++
+	}
+	gotPrefixEvents := make(map[ndpPrefixEvent]int)
+	for i := 0; i < maxEvents; i++ {
+		ok, e := expectPrefixEvent()
+		if !ok {
+			t.Errorf("expected %d prefix events after becoming a router; got = %d", maxEvents, i)
+			break
+		}
+		gotPrefixEvents[e]++
+	}
+	gotAutoGenAddrEvents := make(map[ndpAutoGenAddrEvent]int)
+	for i := 0; i < maxEvents; i++ {
+		ok, e := expectAutoGenAddrEvent()
+		if !ok {
+			t.Errorf("expected %d auto-generated address events after becoming a router; got = %d", maxEvents, i)
+			break
+		}
+		gotAutoGenAddrEvents[e]++
+	}
+
+	// No need to proceed any further if we already failed the test (missing
+	// some invalidation events).
+	if t.Failed() {
+		t.FailNow()
+	}
+
+	expectedRouterEvents := map[ndpRouterEvent]int{
+		{nicID: nicID1, addr: llAddr1, discovered: false}: 1,
+		{nicID: nicID1, addr: llAddr2, discovered: false}: 1,
+		{nicID: nicID2, addr: llAddr1, discovered: false}: 1,
+		{nicID: nicID2, addr: llAddr2, discovered: false}: 1,
+	}
+	if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" {
+		t.Errorf("router events mismatch (-want +got):\n%s", diff)
+	}
+	expectedPrefixEvents := map[ndpPrefixEvent]int{
+		{nicID: nicID1, prefix: subnet1, discovered: false}: 1,
+		{nicID: nicID1, prefix: subnet2, discovered: false}: 1,
+		{nicID: nicID2, prefix: subnet1, discovered: false}: 1,
+		{nicID: nicID2, prefix: subnet2, discovered: false}: 1,
+	}
+	if diff := cmp.Diff(expectedPrefixEvents, gotPrefixEvents); diff != "" {
+		t.Errorf("prefix events mismatch (-want +got):\n%s", diff)
+	}
+	expectedAutoGenAddrEvents := map[ndpAutoGenAddrEvent]int{
+		{nicID: nicID1, addr: e1Addr1, eventType: invalidatedAddr}: 1,
+		{nicID: nicID1, addr: e1Addr2, eventType: invalidatedAddr}: 1,
+		{nicID: nicID2, addr: e2Addr1, eventType: invalidatedAddr}: 1,
+		{nicID: nicID2, addr: e2Addr2, eventType: invalidatedAddr}: 1,
+	}
+	if diff := cmp.Diff(expectedAutoGenAddrEvents, gotAutoGenAddrEvents); diff != "" {
+		t.Errorf("auto-generated address events mismatch (-want +got):\n%s", diff)
+	}
+
+	// Make sure the auto-generated addresses got removed.
+	nicinfo = s.NICInfo()
+	nic1Addrs = nicinfo[nicID1].ProtocolAddresses
+	nic2Addrs = nicinfo[nicID2].ProtocolAddresses
+	if contains(nic1Addrs, e1Addr1) {
+		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
+	}
+	if contains(nic1Addrs, e1Addr2) {
+		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
+	}
+	if contains(nic2Addrs, e2Addr1) {
+		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
+	}
+	if contains(nic2Addrs, e2Addr2) {
+		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
+	}
+
+	// Should not get any more events (invalidation timers should have been
+	// cancelled when we transitioned into a router).
+	time.Sleep(lifetimeSeconds*time.Second + defaultTimeout)
+	select {
+	case <-ndpDisp.routerC:
+		t.Error("unexpected router event")
+	default:
+	}
+	select {
+	case <-ndpDisp.prefixC:
+		t.Error("unexpected prefix event")
+	default:
+	}
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Error("unexpected auto-generated address event")
+	default:
+	}
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index e8401c673..ddd014658 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -203,6 +203,18 @@ func (n *NIC) enable() *tcpip.Error {
 	return err
 }
 
+// becomeIPv6Router transitions n into an IPv6 router.
+//
+// When transitioning into an IPv6 router, host-only state (NDP discovered
+// routers, discovered on-link prefixes, and auto-generated addresses) will
+// be cleaned up/invalidated.
+func (n *NIC) becomeIPv6Router() {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	n.ndp.cleanupHostOnlyState()
+}
+
 // attachLinkEndpoint attaches the NIC to the endpoint, which will enable it
 // to start delivering packets.
 func (n *NIC) attachLinkEndpoint() {
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 0e88643a4..7a9600679 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -662,11 +662,31 @@ func (s *Stack) Stats() tcpip.Stats {
 }
 
 // SetForwarding enables or disables the packet forwarding between NICs.
+//
+// When forwarding becomes enabled, any host-only state on all NICs will be
+// cleaned up.
 func (s *Stack) SetForwarding(enable bool) {
 	// TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward.
 	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// If forwarding status didn't change, do nothing further.
+	if s.forwarding == enable {
+		return
+	}
+
 	s.forwarding = enable
-	s.mu.Unlock()
+
+	// If this stack does not support IPv6, do nothing further.
+	if _, ok := s.networkProtocols[header.IPv6ProtocolNumber]; !ok {
+		return
+	}
+
+	if enable {
+		for _, nic := range s.nics {
+			nic.becomeIPv6Router()
+		}
+	}
 }
 
 // Forwarding returns if the packet forwarding between NICs is enabled.
-- 
cgit v1.2.3


From e548ce18051398fb3fe379326080411f59fda379 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Mon, 23 Dec 2019 11:48:03 -0800
Subject: Add python3-pip as dependency for Kokoro VM images.

bm-tools requires python3 and pip3 in order to run
tests. Add pip3 so that dependencies correctly install
for Kokoro runs with bazel.

PiperOrigin-RevId: 286923840
---
 kokoro/ubuntu1604/40_kokoro.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kokoro/ubuntu1604/40_kokoro.sh b/kokoro/ubuntu1604/40_kokoro.sh
index 3f50929d5..5f2dfc858 100755
--- a/kokoro/ubuntu1604/40_kokoro.sh
+++ b/kokoro/ubuntu1604/40_kokoro.sh
@@ -23,7 +23,7 @@ declare -r ssh_public_keys=(
 )
 
 # Install dependencies.
-apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm python-pip zip
+apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm python-pip python3-pip zip
 
 # junitparser is used to merge junit xml files.
 pip install junitparser
-- 
cgit v1.2.3


From f45df7505b0e7baf48a37f7c625f05051d144738 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 23 Dec 2019 13:17:29 -0800
Subject: Clean up vfs.FilesystemImpl methods that operate on parent
 directories.

- Make FilesystemImpl methods that operate on parent directories require
  !rp.Done() (i.e. there is at least one path component to resolve) as
  precondition and postcondition (in cases where they do not finish path
  resolution due to mount boundary / absolute symlink), and require that they
  do not need to follow the last path component (the file being created /
  deleted) as a symlink. Check for these in VFS.

- Add FilesystemImpl.GetParentDentryAt(), which is required to obtain the old
  parent directory for VFS.RenameAt(). (Passing the Dentry to be renamed
  instead has the wrong semantics if the file named by the old path is a mount
  point since the Dentry will be on the wrong Mount.)

- Update memfs to implement these methods correctly (?), including RenameAt.

- Change fspath.Parse() to allow empty paths (to simplify implementation of
  AT_EMPTY_PATH).

- Change vfs.PathOperation to take a fspath.Path instead of a raw pathname;
  non-test callers will need to fspath.Parse() pathnames themselves anyway in
  order to detect absolute paths and select PathOperation.Start accordingly.

PiperOrigin-RevId: 286934941
---
 pkg/fspath/BUILD                                  |   2 -
 pkg/fspath/fspath.go                              |  24 +-
 pkg/fspath/fspath_test.go                         |  25 +-
 pkg/sentry/fsimpl/ext/BUILD                       |   1 +
 pkg/sentry/fsimpl/ext/benchmark/BUILD             |   1 +
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go |  11 +-
 pkg/sentry/fsimpl/ext/ext_test.go                 |   9 +-
 pkg/sentry/fsimpl/ext/filesystem.go               |  12 +-
 pkg/sentry/fsimpl/kernfs/BUILD                    |   1 +
 pkg/sentry/fsimpl/kernfs/filesystem.go            | 138 +++--
 pkg/sentry/fsimpl/kernfs/kernfs_test.go           |   7 +-
 pkg/sentry/fsimpl/memfs/BUILD                     |   2 +
 pkg/sentry/fsimpl/memfs/benchmark_test.go         |  27 +-
 pkg/sentry/fsimpl/memfs/filesystem.go             | 667 ++++++++++++----------
 pkg/sentry/fsimpl/memfs/memfs.go                  |  29 +-
 pkg/sentry/fsimpl/memfs/pipe_test.go              |  18 +-
 pkg/sentry/vfs/dentry.go                          |  29 +-
 pkg/sentry/vfs/file_description.go                |  19 +
 pkg/sentry/vfs/filesystem.go                      | 251 +++++++-
 pkg/sentry/vfs/options.go                         |   3 +
 pkg/sentry/vfs/resolving_path.go                  |  46 +-
 pkg/sentry/vfs/testutil.go                        |   7 +-
 pkg/sentry/vfs/vfs.go                             | 259 ++++++---
 pkg/syserror/syserror.go                          |   1 +
 24 files changed, 1051 insertions(+), 538 deletions(-)

diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD
index 0c5f50397..ca540363c 100644
--- a/pkg/fspath/BUILD
+++ b/pkg/fspath/BUILD
@@ -14,7 +14,6 @@ go_library(
         "fspath.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/fspath",
-    deps = ["//pkg/syserror"],
 )
 
 go_test(
@@ -25,5 +24,4 @@ go_test(
         "fspath_test.go",
     ],
     embed = [":fspath"],
-    deps = ["//pkg/syserror"],
 )
diff --git a/pkg/fspath/fspath.go b/pkg/fspath/fspath.go
index f68752560..9fb3fee24 100644
--- a/pkg/fspath/fspath.go
+++ b/pkg/fspath/fspath.go
@@ -18,19 +18,17 @@ package fspath
 
 import (
 	"strings"
-
-	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 const pathSep = '/'
 
-// Parse parses a pathname as described by path_resolution(7).
-func Parse(pathname string) (Path, error) {
+// Parse parses a pathname as described by path_resolution(7), except that
+// empty pathnames will be parsed successfully to a Path for which
+// Path.Absolute == Path.Dir == Path.HasComponents() == false. (This is
+// necessary to support AT_EMPTY_PATH.)
+func Parse(pathname string) Path {
 	if len(pathname) == 0 {
-		// "... POSIX decrees that an empty pathname must not be resolved
-		// successfully. Linux returns ENOENT in this case." -
-		// path_resolution(7)
-		return Path{}, syserror.ENOENT
+		return Path{}
 	}
 	// Skip leading path separators.
 	i := 0
@@ -41,7 +39,7 @@ func Parse(pathname string) (Path, error) {
 			return Path{
 				Absolute: true,
 				Dir:      true,
-			}, nil
+			}
 		}
 	}
 	// Skip trailing path separators. This is required by Iterator.Next. This
@@ -64,7 +62,7 @@ func Parse(pathname string) (Path, error) {
 		},
 		Absolute: i != 0,
 		Dir:      j != len(pathname)-1,
-	}, nil
+	}
 }
 
 // Path contains the information contained in a pathname string.
@@ -111,6 +109,12 @@ func (p Path) String() string {
 	return b.String()
 }
 
+// HasComponents returns true if p contains a non-zero number of path
+// components.
+func (p Path) HasComponents() bool {
+	return p.Begin.Ok()
+}
+
 // An Iterator represents either a path component in a Path or a terminal
 // iterator indicating that the end of the path has been reached.
 //
diff --git a/pkg/fspath/fspath_test.go b/pkg/fspath/fspath_test.go
index 215b35622..d5e9a549a 100644
--- a/pkg/fspath/fspath_test.go
+++ b/pkg/fspath/fspath_test.go
@@ -18,15 +18,10 @@ import (
 	"reflect"
 	"strings"
 	"testing"
-
-	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 func TestParseIteratorPartialPathnames(t *testing.T) {
-	path, err := Parse("/foo//bar///baz////")
-	if err != nil {
-		t.Fatalf("Parse failed: %v", err)
-	}
+	path := Parse("/foo//bar///baz////")
 	// Parse strips leading slashes, and records their presence as
 	// Path.Absolute.
 	if !path.Absolute {
@@ -70,6 +65,12 @@ func TestParse(t *testing.T) {
 		dir      bool
 	}
 	tests := []testCase{
+		{
+			pathname: "",
+			relpath:  []string{},
+			abs:      false,
+			dir:      false,
+		},
 		{
 			pathname: "/",
 			relpath:  []string{},
@@ -113,10 +114,7 @@ func TestParse(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.pathname, func(t *testing.T) {
-			p, err := Parse(test.pathname)
-			if err != nil {
-				t.Fatalf("failed to parse pathname %q: %v", test.pathname, err)
-			}
+			p := Parse(test.pathname)
 			t.Logf("pathname %q => path %q", test.pathname, p)
 			if p.Absolute != test.abs {
 				t.Errorf("path absoluteness: got %v, wanted %v", p.Absolute, test.abs)
@@ -134,10 +132,3 @@ func TestParse(t *testing.T) {
 		})
 	}
 }
-
-func TestParseEmptyPathname(t *testing.T) {
-	p, err := Parse("")
-	if err != syserror.ENOENT {
-		t.Errorf("parsing empty pathname: got (%v, %v), wanted (<unspecified>, ENOENT)", p, err)
-	}
-}
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 880b7bcd3..bc90330bc 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -74,6 +74,7 @@ go_test(
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
+        "//pkg/fspath",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fsimpl/ext/disklayout",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD
index bfc46dfa6..4fc8296ef 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/BUILD
+++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD
@@ -7,6 +7,7 @@ go_test(
     size = "small",
     srcs = ["benchmark_test.go"],
     deps = [
+        "//pkg/fspath",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fsimpl/ext",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 177ce2cb9..2f46d2d13 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -24,6 +24,7 @@ import (
 	"strings"
 	"testing"
 
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext"
@@ -121,7 +122,7 @@ func BenchmarkVFS2Ext4fsStat(b *testing.B) {
 				stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               *root,
 					Start:              *root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
@@ -150,9 +151,9 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
 			creds := auth.CredentialsFromContext(ctx)
 			mountPointName := "/1/"
 			pop := vfs.PathOperation{
-				Root:     *root,
-				Start:    *root,
-				Pathname: mountPointName,
+				Root:  *root,
+				Start: *root,
+				Path:  fspath.Parse(mountPointName),
 			}
 
 			// Save the mount point for later use.
@@ -181,7 +182,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
 				stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               *root,
 					Start:              *root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index e9f756732..5d6c999bd 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -25,6 +25,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
@@ -140,7 +141,7 @@ func TestSeek(t *testing.T) {
 			fd, err := vfsfs.OpenAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
@@ -359,7 +360,7 @@ func TestStatAt(t *testing.T) {
 
 			got, err := vfsfs.StatAt(ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
 				&vfs.StatOptions{},
 			)
 			if err != nil {
@@ -429,7 +430,7 @@ func TestRead(t *testing.T) {
 			fd, err := vfsfs.OpenAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.absPath},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.absPath)},
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
@@ -565,7 +566,7 @@ func TestIterDirents(t *testing.T) {
 			fd, err := vfsfs.OpenAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index d7e87979a..616fc002a 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -275,6 +275,16 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 	return vfsd, nil
 }
 
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	vfsd, inode, err := fs.walk(rp, true)
+	if err != nil {
+		return nil, err
+	}
+	inode.incRef()
+	return vfsd, nil
+}
+
 // OpenAt implements vfs.FilesystemImpl.OpenAt.
 func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	vfsd, inode, err := fs.walk(rp, false)
@@ -378,7 +388,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 }
 
 // RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
 	if rp.Done() {
 		return syserror.ENOENT
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 52596c090..59f7f39e2 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -49,6 +49,7 @@ go_test(
     deps = [
         ":kernfs",
         "//pkg/abi/linux",
+        "//pkg/fspath",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 3cbbe4b20..a6f9fced5 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -44,39 +44,37 @@ func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingP
 		return nil, err
 	}
 afterSymlink:
+	name := rp.Component()
+	// Revalidation must be skipped if name is "." or ".."; d or its parent
+	// respectively can't be expected to transition from invalidated back to
+	// valid, so detecting invalidation and retrying would loop forever. This
+	// is consistent with Linux: fs/namei.c:walk_component() => lookup_fast()
+	// calls d_revalidate(), but walk_component() => handle_dots() does not.
+	if name == "." {
+		rp.Advance()
+		return vfsd, nil
+	}
+	if name == ".." {
+		nextVFSD, err := rp.ResolveParent(vfsd)
+		if err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return nextVFSD, nil
+	}
 	d.dirMu.Lock()
-	nextVFSD, err := rp.ResolveComponent(vfsd)
-	d.dirMu.Unlock()
+	nextVFSD, err := rp.ResolveChild(vfsd, name)
 	if err != nil {
+		d.dirMu.Unlock()
 		return nil, err
 	}
-	if nextVFSD != nil {
-		// Cached dentry exists, revalidate.
-		next := nextVFSD.Impl().(*Dentry)
-		if !next.inode.Valid(ctx) {
-			d.dirMu.Lock()
-			rp.VirtualFilesystem().ForceDeleteDentry(nextVFSD)
-			d.dirMu.Unlock()
-			fs.deferDecRef(nextVFSD) // Reference from Lookup.
-			nextVFSD = nil
-		}
-	}
-	if nextVFSD == nil {
-		// Dentry isn't cached; it either doesn't exist or failed
-		// revalidation. Attempt to resolve it via Lookup.
-		name := rp.Component()
-		var err error
-		nextVFSD, err = d.inode.Lookup(ctx, name)
-		// Reference on nextVFSD dropped by a corresponding Valid.
-		if err != nil {
-			return nil, err
-		}
-		d.InsertChild(name, nextVFSD)
+	next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, nextVFSD)
+	d.dirMu.Unlock()
+	if err != nil {
+		return nil, err
 	}
-	next := nextVFSD.Impl().(*Dentry)
-
 	// Resolve any symlink at current path component.
-	if rp.ShouldFollowSymlink() && d.isSymlink() {
+	if rp.ShouldFollowSymlink() && next.isSymlink() {
 		// TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks".
 		target, err := next.inode.Readlink(ctx)
 		if err != nil {
@@ -89,7 +87,44 @@ afterSymlink:
 
 	}
 	rp.Advance()
-	return nextVFSD, nil
+	return &next.vfsd, nil
+}
+
+// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
+// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
+// nil) to verify that the returned child (or lack thereof) is correct.
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+// parent.dirMu must be locked. parent.isDir(). name is not "." or "..".
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, childVFSD *vfs.Dentry) (*Dentry, error) {
+	if childVFSD != nil {
+		// Cached dentry exists, revalidate.
+		child := childVFSD.Impl().(*Dentry)
+		if !child.inode.Valid(ctx) {
+			vfsObj.ForceDeleteDentry(childVFSD)
+			fs.deferDecRef(childVFSD) // Reference from Lookup.
+			childVFSD = nil
+		}
+	}
+	if childVFSD == nil {
+		// Dentry isn't cached; it either doesn't exist or failed
+		// revalidation. Attempt to resolve it via Lookup.
+		//
+		// FIXME(b/144498111): Inode.Lookup() should return *(kernfs.)Dentry,
+		// not *vfs.Dentry, since (kernfs.)Filesystem assumes that all dentries
+		// in the filesystem are (kernfs.)Dentry and performs vfs.DentryImpl
+		// casts accordingly.
+		var err error
+		childVFSD, err = parent.inode.Lookup(ctx, name)
+		if err != nil {
+			return nil, err
+		}
+		// Reference on childVFSD dropped by a corresponding Valid.
+		parent.InsertChild(name, childVFSD)
+	}
+	return childVFSD.Impl().(*Dentry), nil
 }
 
 // walkExistingLocked resolves rp to an existing file.
@@ -242,6 +277,19 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 	return vfsd, nil
 }
 
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.processDeferredDecRefs()
+	defer fs.mu.RUnlock()
+	vfsd, _, err := fs.walkParentDirLocked(ctx, rp)
+	if err != nil {
+		return nil, err
+	}
+	vfsd.IncRef() // Ownership transferred to caller.
+	return vfsd, nil
+}
+
 // LinkAt implements vfs.FilesystemImpl.LinkAt.
 func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
 	if rp.Done() {
@@ -459,40 +507,42 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
 }
 
 // RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
-	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
-	exchange := opts.Flags&linux.RENAME_EXCHANGE != 0
-	whiteout := opts.Flags&linux.RENAME_WHITEOUT != 0
-	if exchange && (noReplace || whiteout) {
-		// Can't specify RENAME_NOREPLACE or RENAME_WHITEOUT with RENAME_EXCHANGE.
-		return syserror.EINVAL
-	}
-	if exchange || whiteout {
-		// Exchange and Whiteout flags are not supported on kernfs.
+func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	// Only RENAME_NOREPLACE is supported.
+	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
 		return syserror.EINVAL
 	}
+	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
 
 	fs.mu.Lock()
 	defer fs.mu.Lock()
 
+	// Resolve the destination directory first to verify that it's on this
+	// Mount.
+	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
 	mnt := rp.Mount()
-	if mnt != vd.Mount() {
+	if mnt != oldParentVD.Mount() {
 		return syserror.EXDEV
 	}
-
 	if err := mnt.CheckBeginWrite(); err != nil {
 		return err
 	}
 	defer mnt.EndWrite()
 
-	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+	srcDirVFSD := oldParentVD.Dentry()
+	srcDir := srcDirVFSD.Impl().(*Dentry)
+	srcDir.dirMu.Lock()
+	src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDirVFSD.Child(oldName))
+	srcDir.dirMu.Unlock()
 	fs.processDeferredDecRefsLocked()
 	if err != nil {
 		return err
 	}
-
-	srcVFSD := vd.Dentry()
-	srcDirVFSD := srcVFSD.Parent()
+	srcVFSD := &src.vfsd
 
 	// Can we remove the src dentry?
 	if err := checkDeleteLocked(rp, srcVFSD); err != nil {
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index f78bb7b04..73b6e43b5 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -24,6 +24,7 @@ import (
 
 	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
@@ -82,9 +83,9 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem {
 // Precondition: path should be relative path.
 func (s *TestSystem) PathOpAtRoot(path string) vfs.PathOperation {
 	return vfs.PathOperation{
-		Root:     s.root,
-		Start:    s.root,
-		Pathname: path,
+		Root:  s.root,
+		Start: s.root,
+		Path:  fspath.Parse(path),
 	}
 }
 
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index 0cc751eb8..5689bed3b 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -50,6 +50,7 @@ go_test(
     deps = [
         ":memfs",
         "//pkg/abi/linux",
+        "//pkg/fspath",
         "//pkg/refs",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
@@ -68,6 +69,7 @@ go_test(
     embed = [":memfs"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/fspath",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
index 4a7a94a52..6e987af88 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -21,6 +21,7 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
@@ -193,9 +194,9 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				pop := vfs.PathOperation{
-					Root:     root,
-					Start:    vd,
-					Pathname: name,
+					Root:  root,
+					Start: vd,
+					Path:  fspath.Parse(name),
 				}
 				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
 					Mode: 0755,
@@ -216,7 +217,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
 				Root:               root,
 				Start:              vd,
-				Pathname:           filename,
+				Path:               fspath.Parse(filename),
 				FollowFinalSymlink: true,
 			}, &vfs.OpenOptions{
 				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
@@ -237,7 +238,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               root,
 					Start:              root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
@@ -378,9 +379,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			root := mntns.Root()
 			defer root.DecRef()
 			pop := vfs.PathOperation{
-				Root:     root,
-				Start:    root,
-				Pathname: mountPointName,
+				Root:  root,
+				Start: root,
+				Path:  fspath.Parse(mountPointName),
 			}
 			if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
 				Mode: 0755,
@@ -408,9 +409,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				pop := vfs.PathOperation{
-					Root:     root,
-					Start:    vd,
-					Pathname: name,
+					Root:  root,
+					Start: vd,
+					Path:  fspath.Parse(name),
 				}
 				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
 					Mode: 0755,
@@ -438,7 +439,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
 				Root:               root,
 				Start:              vd,
-				Pathname:           filename,
+				Path:               fspath.Parse(filename),
 				FollowFinalSymlink: true,
 			}, &vfs.OpenOptions{
 				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
@@ -458,7 +459,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               root,
 					Start:              root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index af4389459..4a83f310c 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -25,323 +25,283 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// stepLocked resolves rp.Component() in parent directory vfsd.
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	// All filesystem state is in-memory.
+	return nil
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
 //
 // stepLocked is loosely analogous to fs/namei.c:walk_component().
 //
-// Preconditions: filesystem.mu must be locked. !rp.Done(). inode ==
-// vfsd.Impl().(*dentry).inode.
-func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode) (*vfs.Dentry, *inode, error) {
-	if !inode.isDir() {
-		return nil, nil, syserror.ENOTDIR
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+	if !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
 	}
-	if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
-		return nil, nil, err
+	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		return nil, err
 	}
 afterSymlink:
-	nextVFSD, err := rp.ResolveComponent(vfsd)
+	nextVFSD, err := rp.ResolveComponent(&d.vfsd)
 	if err != nil {
-		return nil, nil, err
+		return nil, err
 	}
 	if nextVFSD == nil {
 		// Since the Dentry tree is the sole source of truth for memfs, if it's
 		// not in the Dentry tree, it doesn't exist.
-		return nil, nil, syserror.ENOENT
+		return nil, syserror.ENOENT
 	}
-	nextInode := nextVFSD.Impl().(*dentry).inode
-	if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+	next := nextVFSD.Impl().(*dentry)
+	if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
 		// TODO: symlink traversals update access time
 		if err := rp.HandleSymlink(symlink.target); err != nil {
-			return nil, nil, err
+			return nil, err
 		}
 		goto afterSymlink // don't check the current directory again
 	}
 	rp.Advance()
-	return nextVFSD, nextInode, nil
+	return next, nil
 }
 
-// walkExistingLocked resolves rp to an existing file.
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
 //
-// walkExistingLocked is loosely analogous to Linux's
-// fs/namei.c:path_lookupat().
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
 //
-// Preconditions: filesystem.mu must be locked.
-func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*dentry).inode
-	for !rp.Done() {
-		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode)
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+	for !rp.Final() {
+		next, err := stepLocked(rp, d)
 		if err != nil {
-			return nil, nil, err
+			return nil, err
 		}
+		d = next
 	}
-	if rp.MustBeDir() && !inode.isDir() {
-		return nil, nil, syserror.ENOTDIR
+	if !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
 	}
-	return vfsd, inode, nil
+	return d, nil
 }
 
-// walkParentDirLocked resolves all but the last path component of rp to an
-// existing directory. It does not check that the returned directory is
-// searchable by the provider of rp.
+// resolveLocked resolves rp to an existing file.
 //
-// walkParentDirLocked is loosely analogous to Linux's
-// fs/namei.c:path_parentat().
+// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
 //
-// Preconditions: filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*dentry).inode
-	for !rp.Final() {
-		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode)
+// Preconditions: filesystem.mu must be locked.
+func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	for !rp.Done() {
+		next, err := stepLocked(rp, d)
 		if err != nil {
-			return nil, nil, err
+			return nil, err
 		}
+		d = next
 	}
-	if !inode.isDir() {
-		return nil, nil, syserror.ENOTDIR
+	if rp.MustBeDir() && !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
 	}
-	return vfsd, inode, nil
+	return d, nil
 }
 
-// checkCreateLocked checks that a file named rp.Component() may be created in
-// directory parentVFSD, then returns rp.Component().
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
 //
-// Preconditions: filesystem.mu must be locked. parentInode ==
-// parentVFSD.Impl().(*dentry).inode. parentInode.isDir() == true.
-func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *inode) (string, error) {
-	if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
-		return "", err
-	}
-	pc := rp.Component()
-	if pc == "." || pc == ".." {
-		return "", syserror.EEXIST
-	}
-	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
+// doCreateAt is loosely analogous to a conjunction of Linux's
+// fs/namei.c:filename_create() and done_path_create().
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
-		return "", err
+		return err
 	}
-	if childVFSD != nil {
-		return "", syserror.EEXIST
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
 	}
-	if parentVFSD.IsDisowned() {
-		return "", syserror.ENOENT
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EEXIST
 	}
-	return pc, nil
-}
-
-// checkDeleteLocked checks that the file represented by vfsd may be deleted.
-func checkDeleteLocked(vfsd *vfs.Dentry) error {
-	parentVFSD := vfsd.Parent()
-	if parentVFSD == nil {
-		return syserror.EBUSY
+	// Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
+	// because if the child exists we want to return EEXIST immediately instead
+	// of attempting symlink/mount traversal.
+	if parent.vfsd.Child(name) != nil {
+		return syserror.EEXIST
 	}
-	if parentVFSD.IsDisowned() {
+	if !dir && rp.MustBeDir() {
 		return syserror.ENOENT
 	}
-	return nil
+	// In memfs, the only way to cause a dentry to be disowned is by removing
+	// it from the filesystem, so this check is equivalent to checking if
+	// parent has been removed.
+	if parent.vfsd.IsDisowned() {
+		return syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	return create(parent, name)
 }
 
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
 func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	vfsd, inode, err := walkExistingLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return nil, err
 	}
 	if opts.CheckSearchable {
-		if !inode.isDir() {
+		if !d.inode.isDir() {
 			return nil, syserror.ENOTDIR
 		}
-		if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil {
 			return nil, err
 		}
 	}
-	inode.incRef()
-	return vfsd, nil
+	d.IncRef()
+	return &d.vfsd, nil
 }
 
-// LinkAt implements vfs.FilesystemImpl.LinkAt.
-func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
-		return err
-	}
-	if rp.Mount() != vd.Mount() {
-		return syserror.EXDEV
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	d := vd.Dentry().Impl().(*dentry)
-	if d.inode.isDir() {
-		return syserror.EPERM
+		return nil, err
 	}
-	d.inode.incLinksLocked()
-	child := fs.newDentry(d.inode)
-	parentVFSD.InsertChild(&child.vfsd, pc)
-	parentInode.impl.(*directory).childList.PushBack(child)
-	return nil
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		if rp.Mount() != vd.Mount() {
+			return syserror.EXDEV
+		}
+		d := vd.Dentry().Impl().(*dentry)
+		if d.inode.isDir() {
+			return syserror.EPERM
+		}
+		if d.inode.nlink == 0 {
+			return syserror.ENOENT
+		}
+		if d.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+		d.inode.incLinksLocked()
+		child := fs.newDentry(d.inode)
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
 }
 
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
-	parentVFSD.InsertChild(&child.vfsd, pc)
-	parentInode.impl.(*directory).childList.PushBack(child)
-	parentInode.incLinksLocked() // from child's ".."
-	return nil
+	return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error {
+		if parent.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+		parent.inode.incLinksLocked() // from child's ".."
+		child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
 }
 
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-
-	switch opts.Mode.FileType() {
-	case 0:
-		// "Zero file type is equivalent to type S_IFREG." - mknod(2)
-		fallthrough
-	case linux.ModeRegular:
-		// TODO(b/138862511): Implement.
-		return syserror.EINVAL
-
-	case linux.ModeNamedPipe:
-		child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
-		parentVFSD.InsertChild(&child.vfsd, pc)
-		parentInode.impl.(*directory).childList.PushBack(child)
-		return nil
-
-	case linux.ModeSocket:
-		// TODO(b/138862511): Implement.
-		return syserror.EINVAL
-
-	case linux.ModeCharacterDevice:
-		fallthrough
-	case linux.ModeBlockDevice:
-		// TODO(b/72101894): We don't support creating block or character
-		// devices at the moment.
-		//
-		// When we start supporting block and character devices, we'll
-		// need to check for CAP_MKNOD here.
-		return syserror.EPERM
-
-	default:
-		// "EINVAL - mode requested creation of something other than a
-		// regular file, device special file, FIFO or socket." - mknod(2)
-		return syserror.EINVAL
-	}
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		switch opts.Mode.FileType() {
+		case 0, linux.S_IFREG:
+			child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+			parent.vfsd.InsertChild(&child.vfsd, name)
+			parent.inode.impl.(*directory).childList.PushBack(child)
+			return nil
+		case linux.S_IFIFO:
+			child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
+			parent.vfsd.InsertChild(&child.vfsd, name)
+			parent.inode.impl.(*directory).childList.PushBack(child)
+			return nil
+		case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK:
+			// Not yet supported.
+			return syserror.EPERM
+		default:
+			return syserror.EINVAL
+		}
+	})
 }
 
 // OpenAt implements vfs.FilesystemImpl.OpenAt.
 func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	// Filter out flags that are not supported by memfs. O_DIRECTORY and
-	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
-	// appropriate bits in rp), but are visible in FD status flags. O_NONBLOCK
-	// is supported only by pipes.
-	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		// Not yet supported.
+		return nil, syserror.EOPNOTSUPP
+	}
 
+	// Handle O_CREAT and !O_CREAT separately, since in the latter case we
+	// don't need fs.mu for writing.
 	if opts.Flags&linux.O_CREAT == 0 {
 		fs.mu.RLock()
 		defer fs.mu.RUnlock()
-		vfsd, inode, err := walkExistingLocked(rp)
+		d, err := resolveLocked(rp)
 		if err != nil {
 			return nil, err
 		}
-		return inode.open(ctx, rp, vfsd, opts.Flags, false)
+		return d.open(ctx, rp, opts.Flags, false /* afterCreate */)
 	}
 
 	mustCreate := opts.Flags&linux.O_EXCL != 0
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*dentry).inode
+	start := rp.Start().Impl().(*dentry)
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
 	if rp.Done() {
+		// Reject attempts to open directories with O_CREAT.
 		if rp.MustBeDir() {
 			return nil, syserror.EISDIR
 		}
 		if mustCreate {
 			return nil, syserror.EEXIST
 		}
-		return inode.open(ctx, rp, vfsd, opts.Flags, false)
+		return start.open(ctx, rp, opts.Flags, false /* afterCreate */)
 	}
 afterTrailingSymlink:
-	// Walk to the parent directory of the last path component.
-	for !rp.Final() {
-		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode)
-		if err != nil {
-			return nil, err
-		}
-	}
-	if !inode.isDir() {
-		return nil, syserror.ENOTDIR
+	parent, err := walkParentDirLocked(rp, start)
+	if err != nil {
+		return nil, err
 	}
 	// Check for search permission in the parent directory.
-	if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
 		return nil, err
 	}
 	// Reject attempts to open directories with O_CREAT.
 	if rp.MustBeDir() {
 		return nil, syserror.EISDIR
 	}
-	pc := rp.Component()
-	if pc == "." || pc == ".." {
+	name := rp.Component()
+	if name == "." || name == ".." {
 		return nil, syserror.EISDIR
 	}
 	// Determine whether or not we need to create a file.
-	childVFSD, err := rp.ResolveChild(vfsd, pc)
-	if err != nil {
-		return nil, err
-	}
-	if childVFSD == nil {
+	child, err := stepLocked(rp, parent)
+	if err == syserror.ENOENT {
 		// Already checked for searchability above; now check for writability.
-		if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+		if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
 			return nil, err
 		}
 		if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -349,38 +309,35 @@ afterTrailingSymlink:
 		}
 		defer rp.Mount().EndWrite()
 		// Create and open the child.
-		childInode := fs.newRegularFile(rp.Credentials(), opts.Mode)
-		child := fs.newDentry(childInode)
-		vfsd.InsertChild(&child.vfsd, pc)
-		inode.impl.(*directory).childList.PushBack(child)
-		return childInode.open(ctx, rp, &child.vfsd, opts.Flags, true)
+		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return child.open(ctx, rp, opts.Flags, true)
 	}
-	// Open existing file or follow symlink.
-	if mustCreate {
-		return nil, syserror.EEXIST
+	if err != nil {
+		return nil, err
 	}
-	childInode := childVFSD.Impl().(*dentry).inode
-	if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
-		// TODO: symlink traversals update access time
-		if err := rp.HandleSymlink(symlink.target); err != nil {
-			return nil, err
-		}
-		// rp.Final() may no longer be true since we now need to resolve the
-		// symlink target.
+	// Do we need to resolve a trailing symlink?
+	if !rp.Done() {
+		start = parent
 		goto afterTrailingSymlink
 	}
-	return childInode.open(ctx, rp, childVFSD, opts.Flags, false)
+	// Open existing file.
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	return child.open(ctx, rp, opts.Flags, false)
 }
 
-func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(flags)
 	if !afterCreate {
-		if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil {
+		if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
 			return nil, err
 		}
 	}
 	mnt := rp.Mount()
-	switch impl := i.impl.(type) {
+	switch impl := d.inode.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
 		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
@@ -392,8 +349,8 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 			// mnt.EndWrite() is called by regularFileFD.Release().
 		}
 		mnt.IncRef()
-		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
+		d.IncRef()
+		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
 		if flags&linux.O_TRUNC != 0 {
 			impl.mu.Lock()
 			impl.data = impl.data[:0]
@@ -408,28 +365,28 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 		}
 		var fd directoryFD
 		mnt.IncRef()
-		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
+		d.IncRef()
+		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *symlink:
 		// Can't open symlinks without O_PATH (which is unimplemented).
 		return nil, syserror.ELOOP
 	case *namedPipe:
-		return newNamedPipeFD(ctx, impl, rp, vfsd, flags)
+		return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags)
 	default:
-		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
 	}
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
 func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
 	fs.mu.RLock()
-	_, inode, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return "", err
 	}
-	symlink, ok := inode.impl.(*symlink)
+	symlink, ok := d.inode.impl.(*symlink)
 	if !ok {
 		return "", syserror.EINVAL
 	}
@@ -437,63 +394,172 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
 }
 
 // RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
-	if rp.Done() {
-		return syserror.ENOENT
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	if opts.Flags != 0 {
+		// TODO(b/145974740): Support renameat2 flags.
+		return syserror.EINVAL
 	}
+
+	// Resolve newParent first to verify that it's on this Mount.
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
+	newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
-	_, err = checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
+	newName := rp.Component()
+	if newName == "." || newName == ".." {
+		return syserror.EBUSY
+	}
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
 		return err
 	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
+	defer mnt.EndWrite()
+
+	oldParent := oldParentVD.Dentry().Impl().(*dentry)
+	if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
 		return err
 	}
-	defer rp.Mount().EndWrite()
-	// TODO: actually implement RenameAt
-	return syserror.EPERM
+	// Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
+	// because if the existing child is a symlink or mount point then we want
+	// to rename over it rather than follow it.
+	renamedVFSD := oldParent.vfsd.Child(oldName)
+	if renamedVFSD == nil {
+		return syserror.ENOENT
+	}
+	renamed := renamedVFSD.Impl().(*dentry)
+	if renamed.inode.isDir() {
+		if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) {
+			return syserror.EINVAL
+		}
+		if oldParent != newParent {
+			// Writability is needed to change renamed's "..".
+			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil {
+				return err
+			}
+		}
+	} else {
+		if opts.MustBeDir || rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+
+	if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	replacedVFSD := newParent.vfsd.Child(newName)
+	var replaced *dentry
+	if replacedVFSD != nil {
+		replaced = replacedVFSD.Impl().(*dentry)
+		if replaced.inode.isDir() {
+			if !renamed.inode.isDir() {
+				return syserror.EISDIR
+			}
+			if replaced.vfsd.HasChildren() {
+				return syserror.ENOTEMPTY
+			}
+		} else {
+			if rp.MustBeDir() {
+				return syserror.ENOTDIR
+			}
+			if renamed.inode.isDir() {
+				return syserror.ENOTDIR
+			}
+		}
+	} else {
+		if renamed.inode.isDir() && newParent.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+	}
+	if newParent.vfsd.IsDisowned() {
+		return syserror.ENOENT
+	}
+
+	// Linux places this check before some of those above; we do it here for
+	// simplicity, under the assumption that applications are not intentionally
+	// doing noop renames expecting them to succeed where non-noop renames
+	// would fail.
+	if renamedVFSD == replacedVFSD {
+		return nil
+	}
+	vfsObj := rp.VirtualFilesystem()
+	oldParentDir := oldParent.inode.impl.(*directory)
+	newParentDir := newParent.inode.impl.(*directory)
+	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil {
+		return err
+	}
+	if replaced != nil {
+		newParentDir.childList.Remove(replaced)
+		if replaced.inode.isDir() {
+			newParent.inode.decLinksLocked() // from replaced's ".."
+		}
+		replaced.inode.decLinksLocked()
+	}
+	oldParentDir.childList.Remove(renamed)
+	newParentDir.childList.PushBack(renamed)
+	if renamed.inode.isDir() {
+		oldParent.inode.decLinksLocked()
+		newParent.inode.incLinksLocked()
+	}
+	// TODO: update timestamps and parent directory sizes
+	vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
+	return nil
 }
 
 // RmdirAt implements vfs.FilesystemImpl.RmdirAt.
 func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	vfsd, inode, err := walkExistingLocked(rp)
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
 		return err
 	}
-	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(vfsd); err != nil {
-		return err
+	name := rp.Component()
+	if name == "." {
+		return syserror.EINVAL
 	}
-	if !inode.isDir() {
+	if name == ".." {
+		return syserror.ENOTEMPTY
+	}
+	childVFSD := parent.vfsd.Child(name)
+	if childVFSD == nil {
+		return syserror.ENOENT
+	}
+	child := childVFSD.Impl().(*dentry)
+	if !child.inode.isDir() {
 		return syserror.ENOTDIR
 	}
-	if vfsd.HasChildren() {
+	if childVFSD.HasChildren() {
 		return syserror.ENOTEMPTY
 	}
-	if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
 		return err
 	}
-	// Remove from parent directory's childList.
-	vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
-	inode.decRef()
+	parent.inode.impl.(*directory).childList.Remove(child)
+	parent.inode.decLinksLocked() // from child's ".."
+	child.inode.decLinksLocked()
+	vfsObj.CommitDeleteDentry(childVFSD)
 	return nil
 }
 
 // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
 func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	fs.mu.RLock()
-	_, _, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
 	if err != nil {
 		return err
 	}
@@ -507,21 +573,21 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 // StatAt implements vfs.FilesystemImpl.StatAt.
 func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
 	fs.mu.RLock()
-	_, inode, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return linux.Statx{}, err
 	}
 	var stat linux.Statx
-	inode.statTo(&stat)
+	d.inode.statTo(&stat)
 	return stat, nil
 }
 
 // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
 func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
 	fs.mu.RLock()
-	_, _, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
 	if err != nil {
 		return linux.Statfs{}, err
 	}
@@ -531,53 +597,52 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
-	parentVFSD.InsertChild(&child.vfsd, pc)
-	parentInode.impl.(*directory).childList.PushBack(child)
-	return nil
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
 }
 
 // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
 func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	vfsd, inode, err := walkExistingLocked(rp)
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
 		return err
 	}
-	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(vfsd); err != nil {
-		return err
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EISDIR
 	}
-	if inode.isDir() {
+	childVFSD := parent.vfsd.Child(name)
+	if childVFSD == nil {
+		return syserror.ENOENT
+	}
+	child := childVFSD.Impl().(*dentry)
+	if child.inode.isDir() {
 		return syserror.EISDIR
 	}
-	if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+	if !rp.MustBeDir() {
+		return syserror.ENOTDIR
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
 		return err
 	}
-	// Remove from parent directory's childList.
-	vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
-	inode.decLinksLocked()
+	parent.inode.impl.(*directory).childList.Remove(child)
+	child.inode.decLinksLocked()
+	vfsObj.CommitDeleteDentry(childVFSD)
 	return nil
 }
 
@@ -585,7 +650,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, _, err := walkExistingLocked(rp)
+	_, err := resolveLocked(rp)
 	if err != nil {
 		return nil, err
 	}
@@ -597,7 +662,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
 func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, _, err := walkExistingLocked(rp)
+	_, err := resolveLocked(rp)
 	if err != nil {
 		return "", err
 	}
@@ -609,7 +674,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, nam
 func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, _, err := walkExistingLocked(rp)
+	_, err := resolveLocked(rp)
 	if err != nil {
 		return err
 	}
@@ -621,7 +686,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, _, err := walkExistingLocked(rp)
+	_, err := resolveLocked(rp)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
index 9d509f6e4..8d0167c93 100644
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -29,6 +29,7 @@ package memfs
 
 import (
 	"fmt"
+	"math"
 	"sync"
 	"sync/atomic"
 
@@ -64,12 +65,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 func (fs *filesystem) Release() {
 }
 
-// Sync implements vfs.FilesystemImpl.Sync.
-func (fs *filesystem) Sync(ctx context.Context) error {
-	// All filesystem state is in-memory.
-	return nil
-}
-
 // dentry implements vfs.DentryImpl.
 type dentry struct {
 	vfsd vfs.Dentry
@@ -137,6 +132,8 @@ type inode struct {
 	impl interface{} // immutable
 }
 
+const maxLinks = math.MaxUint32
+
 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
 	i.refs = 1
 	i.mode = uint32(mode)
@@ -147,20 +144,28 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials,
 	i.impl = impl
 }
 
-// Preconditions: filesystem.mu must be locked for writing.
+// incLinksLocked increments i's link count.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+// i.nlink < maxLinks.
 func (i *inode) incLinksLocked() {
-	if atomic.AddUint32(&i.nlink, 1) <= 1 {
+	if i.nlink == 0 {
 		panic("memfs.inode.incLinksLocked() called with no existing links")
 	}
+	if i.nlink == maxLinks {
+		panic("memfs.inode.incLinksLocked() called with maximum link count")
+	}
+	atomic.AddUint32(&i.nlink, 1)
 }
 
-// Preconditions: filesystem.mu must be locked for writing.
+// decLinksLocked decrements i's link count.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
 func (i *inode) decLinksLocked() {
-	if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 {
-		i.decRef()
-	} else if nlink == ^uint32(0) { // negative overflow
+	if i.nlink == 0 {
 		panic("memfs.inode.decLinksLocked() called with no existing links")
 	}
+	atomic.AddUint32(&i.nlink, ^uint32(0))
 }
 
 func (i *inode) incRef() {
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go
index 5bf527c80..be917aeee 100644
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/memfs/pipe_test.go
@@ -19,6 +19,7 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -38,7 +39,7 @@ func TestSeparateFDs(t *testing.T) {
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	rfdchan := make(chan *vfs.FileDescription)
@@ -76,7 +77,7 @@ func TestNonblockingRead(t *testing.T) {
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK}
@@ -108,7 +109,7 @@ func TestNonblockingWriteError(t *testing.T) {
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK}
@@ -126,7 +127,7 @@ func TestSingleFD(t *testing.T) {
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	openOpts := vfs.OpenOptions{Flags: linux.O_RDWR}
@@ -160,10 +161,9 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	// Create the pipe.
 	root := mntns.Root()
 	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Pathname:           fileName,
-		FollowFinalSymlink: true,
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(fileName),
 	}
 	mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644}
 	if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil {
@@ -174,7 +174,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}, &vfs.StatOptions{})
 	if err != nil {
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 6209eb053..1bc9c4a38 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -234,6 +234,18 @@ func (d *Dentry) InsertChild(child *Dentry, name string) {
 	child.name = name
 }
 
+// IsAncestorOf returns true if d is an ancestor of d2; that is, d is either
+// d2's parent or an ancestor of d2's parent.
+func (d *Dentry) IsAncestorOf(d2 *Dentry) bool {
+	for d2.parent != nil {
+		if d2.parent == d {
+			return true
+		}
+		d2 = d2.parent
+	}
+	return false
+}
+
 // PrepareDeleteDentry must be called before attempting to delete the file
 // represented by d. If PrepareDeleteDentry succeeds, the caller must call
 // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome.
@@ -283,21 +295,6 @@ func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
 	}
 }
 
-// DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as
-// appropriate for in-memory filesystems that don't need to ensure that some
-// external state change succeeds before committing the deletion.
-//
-// DeleteDentry is a mutator of d and d.Parent().
-//
-// Preconditions: d is a child Dentry.
-func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error {
-	if err := vfs.PrepareDeleteDentry(mntns, d); err != nil {
-		return err
-	}
-	vfs.CommitDeleteDentry(d)
-	return nil
-}
-
 // ForceDeleteDentry causes d to become disowned. It should only be used in
 // cases where VFS has no ability to stop the deletion (e.g. d represents the
 // local state of a file on a remote filesystem on which the file has already
@@ -326,7 +323,7 @@ func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) {
 // CommitRenameExchangeDentry depending on the rename's outcome.
 //
 // Preconditions: from is a child Dentry. If to is not nil, it must be a child
-// Dentry from the same Filesystem.
+// Dentry from the same Filesystem. from != to.
 func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
 	if checkInvariants {
 		if from.parent == nil {
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index df03886c3..0b053201a 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -192,6 +192,8 @@ func (fd *FileDescription) Impl() FileDescriptionImpl {
 // be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and
 // auth.KGID respectively).
 //
+// All methods may return errors not specified.
+//
 // FileDescriptionImpl is analogous to Linux's struct file_operations.
 type FileDescriptionImpl interface {
 	// Release is called when the associated FileDescription reaches zero
@@ -220,6 +222,10 @@ type FileDescriptionImpl interface {
 	// PRead reads from the file into dst, starting at the given offset, and
 	// returns the number of bytes read. PRead is permitted to return partial
 	// reads with a nil error.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
 	PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
 
 	// Read is similar to PRead, but does not specify an offset.
@@ -229,6 +235,10 @@ type FileDescriptionImpl interface {
 	// the number of bytes read; note that POSIX 2.9.7 "Thread Interactions
 	// with Regular File Operations" requires that all operations that may
 	// mutate the FileDescription offset are serialized.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP.
 	Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)
 
 	// PWrite writes src to the file, starting at the given offset, and returns
@@ -238,6 +248,11 @@ type FileDescriptionImpl interface {
 	// As in Linux (but not POSIX), if O_APPEND is in effect for the
 	// FileDescription, PWrite should ignore the offset and append data to the
 	// end of the file.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, PWrite returns
+	// EOPNOTSUPP.
 	PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
 
 	// Write is similar to PWrite, but does not specify an offset, which is
@@ -247,6 +262,10 @@ type FileDescriptionImpl interface {
 	// PWrite that uses a FileDescription offset, to make it possible for
 	// remote filesystems to implement O_APPEND correctly (i.e. atomically with
 	// respect to writers outside the scope of VFS).
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP.
 	Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error)
 
 	// IterDirents invokes cb on each entry in the directory represented by the
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index b766614e7..89bd58864 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -108,6 +108,24 @@ func (fs *Filesystem) DecRef() {
 // (responsible for actually implementing the operation) isn't known until path
 // resolution is complete.
 //
+// Unless otherwise specified, FilesystemImpl methods are responsible for
+// performing permission checks. In many cases, vfs package functions in
+// permissions.go may be used to help perform these checks.
+//
+// When multiple specified error conditions apply to a given method call, the
+// implementation may return any applicable errno unless otherwise specified,
+// but returning the earliest error specified is preferable to maximize
+// compatibility with Linux.
+//
+// All methods may return errors not specified, notably including:
+//
+// - ENOENT if a required path component does not exist.
+//
+// - ENOTDIR if an intermediate path component is not a directory.
+//
+// - Errors from vfs-package functions (ResolvingPath.Resolve*(),
+// Mount.CheckBeginWrite(), permission-checking functions, etc.)
+//
 // For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid
 // should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID
 // and auth.KGID respectively).
@@ -130,46 +148,223 @@ type FilesystemImpl interface {
 	// GetDentryAt does not correspond directly to a Linux syscall; it is used
 	// in the implementation of:
 	//
-	// - Syscalls that need to resolve two paths: rename(), renameat(),
-	// renameat2(), link(), linkat().
+	// - Syscalls that need to resolve two paths: link(), linkat().
 	//
 	// - Syscalls that need to refer to a filesystem position outside the
 	// context of a file description: chdir(), fchdir(), chroot(), mount(),
 	// umount().
 	GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error)
 
+	// GetParentDentryAt returns a Dentry representing the directory at the
+	// second-to-last path component in rp. (Note that, despite the name, this
+	// is not necessarily the parent directory of the file at rp, since the
+	// last path component in rp may be "." or "..".) A reference is taken on
+	// the returned Dentry.
+	//
+	// GetParentDentryAt does not correspond directly to a Linux syscall; it is
+	// used in the implementation of the rename() family of syscalls, which
+	// must resolve the parent directories of two paths.
+	//
+	// Preconditions: !rp.Done().
+	//
+	// Postconditions: If GetParentDentryAt returns a nil error, then
+	// rp.Final(). If GetParentDentryAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error)
+
 	// LinkAt creates a hard link at rp representing the same file as vd. It
 	// does not take ownership of references on vd.
 	//
-	// The implementation is responsible for checking that vd.Mount() ==
-	// rp.Mount(), and that vd does not represent a directory.
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", LinkAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, LinkAt returns EEXIST.
+	//
+	// - If rp.MustBeDir(), LinkAt returns ENOENT.
+	//
+	// - If the directory in which the link would be created has been removed
+	// by RmdirAt or RenameAt, LinkAt returns ENOENT.
+	//
+	// - If rp.Mount != vd.Mount(), LinkAt returns EXDEV.
+	//
+	// - If vd represents a directory, LinkAt returns EPERM.
+	//
+	// - If vd represents a file for which all existing links have been
+	// removed, or a file created by open(O_TMPFILE|O_EXCL), LinkAt returns
+	// ENOENT. Equivalently, if vd represents a file with a link count of 0 not
+	// created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If LinkAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error
 
 	// MkdirAt creates a directory at rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", MkdirAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, MkdirAt returns EEXIST.
+	//
+	// - If the directory in which the new directory would be created has been
+	// removed by RmdirAt or RenameAt, MkdirAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If MkdirAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error
 
 	// MknodAt creates a regular file, device special file, or named pipe at
 	// rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", MknodAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, MknodAt returns EEXIST.
+	//
+	// - If rp.MustBeDir(), MknodAt returns ENOENT.
+	//
+	// - If the directory in which the file would be created has been removed
+	// by RmdirAt or RenameAt, MknodAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If MknodAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error
 
 	// OpenAt returns an FileDescription providing access to the file at rp. A
 	// reference is taken on the returned FileDescription.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies O_TMPFILE and this feature is unsupported by
+	// the implementation, OpenAt returns EOPNOTSUPP. (All other unsupported
+	// features are silently ignored, consistently with Linux's open*(2).)
 	OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error)
 
 	// ReadlinkAt returns the target of the symbolic link at rp.
+	//
+	// Errors:
+	//
+	// - If the file at rp is not a symbolic link, ReadlinkAt returns EINVAL.
 	ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error)
 
-	// RenameAt renames the Dentry represented by vd to rp. It does not take
-	// ownership of references on vd.
+	// RenameAt renames the file named oldName in directory oldParentVD to rp.
+	// It does not take ownership of references on oldParentVD.
+	//
+	// Errors [1]:
+	//
+	// - If opts.Flags specifies unsupported options, RenameAt returns EINVAL.
+	//
+	// - If the last path component in rp is "." or "..", and opts.Flags
+	// contains RENAME_NOREPLACE, RenameAt returns EEXIST.
+	//
+	// - If the last path component in rp is "." or "..", and opts.Flags does
+	// not contain RENAME_NOREPLACE, RenameAt returns EBUSY.
+	//
+	// - If rp.Mount != oldParentVD.Mount(), RenameAt returns EXDEV.
 	//
-	// The implementation is responsible for checking that vd.Mount() ==
-	// rp.Mount().
-	RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error
+	// - If the renamed file is not a directory, and opts.MustBeDir is true,
+	// RenameAt returns ENOTDIR.
+	//
+	// - If renaming would replace an existing file and opts.Flags contains
+	// RENAME_NOREPLACE, RenameAt returns EEXIST.
+	//
+	// - If there is no existing file at rp and opts.Flags contains
+	// RENAME_EXCHANGE, RenameAt returns ENOENT.
+	//
+	// - If there is an existing non-directory file at rp, and rp.MustBeDir()
+	// is true, RenameAt returns ENOTDIR.
+	//
+	// - If the renamed file is not a directory, opts.Flags does not contain
+	// RENAME_EXCHANGE, and rp.MustBeDir() is true, RenameAt returns ENOTDIR.
+	// (This check is not subsumed by the check for directory replacement below
+	// since it applies even if there is no file to replace.)
+	//
+	// - If the renamed file is a directory, and the new parent directory of
+	// the renamed file is either the renamed directory or a descendant
+	// subdirectory of the renamed directory, RenameAt returns EINVAL.
+	//
+	// - If renaming would exchange the renamed file with an ancestor directory
+	// of the renamed file, RenameAt returns EINVAL.
+	//
+	// - If renaming would replace an ancestor directory of the renamed file,
+	// RenameAt returns ENOTEMPTY. (This check would be subsumed by the
+	// non-empty directory check below; however, this check takes place before
+	// the self-rename check.)
+	//
+	// - If the renamed file would replace or exchange with itself (i.e. the
+	// source and destination paths resolve to the same file), RenameAt returns
+	// nil, skipping the checks described below.
+	//
+	// - If the source or destination directory is not writable by the provider
+	// of rp.Credentials(), RenameAt returns EACCES.
+	//
+	// - If the renamed file is a directory, and renaming would replace a
+	// non-directory file, RenameAt returns ENOTDIR.
+	//
+	// - If the renamed file is not a directory, and renaming would replace a
+	// directory, RenameAt returns EISDIR.
+	//
+	// - If the new parent directory of the renamed file has been removed by
+	// RmdirAt or a preceding call to RenameAt, RenameAt returns ENOENT.
+	//
+	// - If the renamed file is a directory, it is not writable by the
+	// provider of rp.Credentials(), and the source and destination parent
+	// directories are different, RenameAt returns EACCES. (This is nominally
+	// required to change the ".." entry in the renamed directory.)
+	//
+	// - If renaming would replace a non-empty directory, RenameAt returns
+	// ENOTEMPTY.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink(). oldName is not "." or "..".
+	//
+	// Postconditions: If RenameAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	//
+	// [1] "The worst of all namespace operations - renaming directory.
+	// "Perverted" doesn't even start to describe it. Somebody in UCB had a
+	// heck of a trip..." - fs/namei.c:vfs_rename()
+	RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error
 
 	// RmdirAt removes the directory at rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is ".", RmdirAt returns EINVAL.
+	//
+	// - If the last path component in rp is "..", RmdirAt returns ENOTEMPTY.
+	//
+	// - If no file exists at rp, RmdirAt returns ENOENT.
+	//
+	// - If the file at rp exists but is not a directory, RmdirAt returns
+	// ENOTDIR.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If RmdirAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	RmdirAt(ctx context.Context, rp *ResolvingPath) error
 
 	// SetStatAt updates metadata for the file at the given path.
+	//
+	// Errors:
+	//
+	// - If opts specifies unsupported options, SetStatAt returns EINVAL.
 	SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error
 
 	// StatAt returns metadata for the file at rp.
@@ -181,9 +376,45 @@ type FilesystemImpl interface {
 	StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error)
 
 	// SymlinkAt creates a symbolic link at rp referring to the given target.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", SymlinkAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, SymlinkAt returns EEXIST.
+	//
+	// - If rp.MustBeDir(), SymlinkAt returns ENOENT.
+	//
+	// - If the directory in which the symbolic link would be created has been
+	// removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If SymlinkAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error
 
-	// UnlinkAt removes the non-directory file at rp.
+	// UnlinkAt removes the file at rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", UnlinkAt returns
+	// EISDIR.
+	//
+	// - If no file exists at rp, UnlinkAt returns ENOENT.
+	//
+	// - If rp.MustBeDir(), and the file at rp exists and is not a directory,
+	// UnlinkAt returns ENOTDIR.
+	//
+	// - If the file at rp exists but is a directory, UnlinkAt returns EISDIR.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If UnlinkAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 
 	// ListxattrAt returns all extended attribute names for the file at rp.
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 97ee4a446..87d2b0d1c 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -83,6 +83,9 @@ type ReadOptions struct {
 type RenameOptions struct {
 	// Flags contains flags as specified for renameat2(2).
 	Flags uint32
+
+	// If MustBeDir is true, the renamed file must be a directory.
+	MustBeDir bool
 }
 
 // SetStatOptions contains options to VirtualFilesystem.SetStatAt(),
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index d580fd39e..f0641d314 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -112,30 +112,26 @@ var resolvingPathPool = sync.Pool{
 	},
 }
 
-func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) (*ResolvingPath, error) {
-	path, err := fspath.Parse(pop.Pathname)
-	if err != nil {
-		return nil, err
-	}
+func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) *ResolvingPath {
 	rp := resolvingPathPool.Get().(*ResolvingPath)
 	rp.vfs = vfs
 	rp.root = pop.Root
 	rp.mount = pop.Start.mount
 	rp.start = pop.Start.dentry
-	rp.pit = path.Begin
+	rp.pit = pop.Path.Begin
 	rp.flags = 0
 	if pop.FollowFinalSymlink {
 		rp.flags |= rpflagsFollowFinalSymlink
 	}
-	rp.mustBeDir = path.Dir
-	rp.mustBeDirOrig = path.Dir
+	rp.mustBeDir = pop.Path.Dir
+	rp.mustBeDirOrig = pop.Path.Dir
 	rp.symlinks = 0
 	rp.curPart = 0
 	rp.numOrigParts = 1
 	rp.creds = creds
-	rp.parts[0] = path.Begin
-	rp.origParts[0] = path.Begin
-	return rp, nil
+	rp.parts[0] = pop.Path.Begin
+	rp.origParts[0] = pop.Path.Begin
+	return rp
 }
 
 func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) {
@@ -345,29 +341,34 @@ func (rp *ResolvingPath) ShouldFollowSymlink() bool {
 // symlink target and returns nil. Otherwise it returns a non-nil error.
 //
 // Preconditions: !rp.Done().
+//
+// Postconditions: If HandleSymlink returns a nil error, then !rp.Done().
 func (rp *ResolvingPath) HandleSymlink(target string) error {
 	if rp.symlinks >= linux.MaxSymlinkTraversals {
 		return syserror.ELOOP
 	}
-	targetPath, err := fspath.Parse(target)
-	if err != nil {
-		return err
+	if len(target) == 0 {
+		return syserror.ENOENT
 	}
 	rp.symlinks++
+	targetPath := fspath.Parse(target)
 	if targetPath.Absolute {
 		rp.absSymlinkTarget = targetPath
 		return resolveAbsSymlinkError{}
 	}
-	if !targetPath.Begin.Ok() {
-		panic(fmt.Sprintf("symbolic link has non-empty target %q that is both relative and has no path components?", target))
-	}
 	// Consume the path component that represented the symlink.
 	rp.Advance()
 	// Prepend the symlink target to the relative path.
+	if checkInvariants {
+		if !targetPath.HasComponents() {
+			panic(fmt.Sprintf("non-empty pathname %q parsed to relative path with no components", target))
+		}
+	}
 	rp.relpathPrepend(targetPath)
 	return nil
 }
 
+// Preconditions: path.HasComponents().
 func (rp *ResolvingPath) relpathPrepend(path fspath.Path) {
 	if rp.pit.Ok() {
 		rp.parts[rp.curPart] = rp.pit
@@ -467,6 +468,17 @@ func (rp *ResolvingPath) handleError(err error) bool {
 	}
 }
 
+// canHandleError returns true if err is an error returned by rp.Resolve*()
+// that rp.handleError() may attempt to handle.
+func (rp *ResolvingPath) canHandleError(err error) bool {
+	switch err.(type) {
+	case resolveMountRootOrJumpError, resolveMountPointError, resolveAbsSymlinkError:
+		return true
+	default:
+		return false
+	}
+}
+
 // MustBeDir returns true if the file traversed by rp must be a directory.
 func (rp *ResolvingPath) MustBeDir() bool {
 	return rp.mustBeDir
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
index d94117bce..ee5c8b9e2 100644
--- a/pkg/sentry/vfs/testutil.go
+++ b/pkg/sentry/vfs/testutil.go
@@ -57,6 +57,11 @@ func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath,
 	return nil, syserror.EPERM
 }
 
+// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt.
+func (fs *FDTestFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) {
+	return nil, syserror.EPERM
+}
+
 // LinkAt implements FilesystemImpl.LinkAt.
 func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
 	return syserror.EPERM
@@ -83,7 +88,7 @@ func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (
 }
 
 // RenameAt implements FilesystemImpl.RenameAt.
-func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error {
+func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error {
 	return syserror.EPERM
 }
 
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index e60898d7c..3e4df8558 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -28,9 +28,11 @@
 package vfs
 
 import (
+	"fmt"
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -111,11 +113,11 @@ type PathOperation struct {
 	// are borrowed from the provider of the PathOperation (i.e. the caller of
 	// the VFS method to which the PathOperation was passed).
 	//
-	// Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root.
+	// Invariants: Start.Ok(). If Path.Absolute, then Start == Root.
 	Start VirtualDentry
 
 	// Path is the pathname traversed by this operation.
-	Pathname string
+	Path fspath.Path
 
 	// If FollowFinalSymlink is true, and the Dentry traversed by the final
 	// path component represents a symbolic link, the symbolic link should be
@@ -126,10 +128,7 @@ type PathOperation struct {
 // GetDentryAt returns a VirtualDentry representing the given path, at which a
 // file must exist. A reference is taken on the returned VirtualDentry.
 func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return VirtualDentry{}, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
 		if err == nil {
@@ -148,6 +147,33 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede
 	}
 }
 
+// Preconditions: pop.Path.Begin.Ok().
+func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp)
+		if err == nil {
+			parentVD := VirtualDentry{
+				mount:  rp.mount,
+				dentry: parent,
+			}
+			rp.mount.IncRef()
+			name := rp.Component()
+			vfs.putResolvingPath(rp)
+			return parentVD, name, nil
+		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return VirtualDentry{}, "", err
+		}
+	}
+}
+
 // LinkAt creates a hard link at newpop representing the existing file at
 // oldpop.
 func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error {
@@ -155,21 +181,36 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
 	if err != nil {
 		return err
 	}
-	rp, err := vfs.getResolvingPath(creds, newpop)
-	if err != nil {
+
+	if !newpop.Path.Begin.Ok() {
 		oldVD.DecRef()
-		return err
+		if newpop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
 	}
+	if newpop.FollowFinalSymlink {
+		oldVD.DecRef()
+		ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, newpop)
 	for {
 		err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
 		if err == nil {
-			oldVD.DecRef()
 			vfs.putResolvingPath(rp)
+			oldVD.DecRef()
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
-			oldVD.DecRef()
 			vfs.putResolvingPath(rp)
+			oldVD.DecRef()
 			return err
 		}
 	}
@@ -177,19 +218,32 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
 
 // MkdirAt creates a directory at the given path.
 func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
+	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
+	}
 	// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
 	// also honored." - mkdir(2)
 	opts.Mode &= 0777 | linux.S_ISVTX
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
-	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -200,16 +254,29 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
 // MknodAt creates a file of the given mode at the given path. It returns an
 // error from the syserror package.
 func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
+	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
 	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil {
+		err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
+		if err != nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
-		// Handle mount traversals.
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -259,10 +326,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 	if opts.Flags&linux.O_NOFOLLOW != 0 {
 		pop.FollowFinalSymlink = false
 	}
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	if opts.Flags&linux.O_DIRECTORY != 0 {
 		rp.mustBeDir = true
 		rp.mustBeDirOrig = true
@@ -282,10 +346,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 
 // ReadlinkAt returns the target of the symbolic link at the given path.
 func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return "", err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
 		if err == nil {
@@ -301,25 +362,59 @@ func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Creden
 
 // RenameAt renames the file at oldpop to newpop.
 func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error {
-	oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
-	if err != nil {
-		return err
+	if !oldpop.Path.Begin.Ok() {
+		if oldpop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
 	}
-	rp, err := vfs.getResolvingPath(creds, newpop)
+	if oldpop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop)
 	if err != nil {
-		oldVD.DecRef()
 		return err
 	}
+	if oldName == "." || oldName == ".." {
+		oldParentVD.DecRef()
+		return syserror.EBUSY
+	}
+
+	if !newpop.Path.Begin.Ok() {
+		oldParentVD.DecRef()
+		if newpop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
+	}
+	if newpop.FollowFinalSymlink {
+		oldParentVD.DecRef()
+		ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, newpop)
+	renameOpts := *opts
+	if oldpop.Path.Dir {
+		renameOpts.MustBeDir = true
+	}
 	for {
-		err := rp.mount.fs.impl.RenameAt(ctx, rp, oldVD, *opts)
+		err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts)
 		if err == nil {
-			oldVD.DecRef()
 			vfs.putResolvingPath(rp)
+			oldParentVD.DecRef()
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
-			oldVD.DecRef()
 			vfs.putResolvingPath(rp)
+			oldParentVD.DecRef()
 			return err
 		}
 	}
@@ -327,16 +422,29 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
 
 // RmdirAt removes the directory at the given path.
 func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
 	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.RmdirAt(ctx, rp)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -346,10 +454,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia
 
 // SetStatAt changes metadata for the file at the given path.
 func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
 		if err == nil {
@@ -365,10 +470,7 @@ func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credent
 
 // StatAt returns metadata for the file at the given path.
 func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return linux.Statx{}, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
 		if err == nil {
@@ -385,10 +487,7 @@ func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credential
 // StatFSAt returns metadata for the filesystem containing the file at the
 // given path.
 func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return linux.Statfs{}, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
 		if err == nil {
@@ -404,16 +503,29 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti
 
 // SymlinkAt creates a symbolic link at the given path with the given target.
 func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
+	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
 	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -423,16 +535,29 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
 
 // UnlinkAt deletes the non-directory file at the given path.
 func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
+	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink")
+		return syserror.EINVAL
 	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -443,10 +568,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
 // ListxattrAt returns all extended attribute names for the file at the given
 // path.
 func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp)
 		if err == nil {
@@ -471,10 +593,7 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
 // GetxattrAt returns the value associated with the given extended attribute
 // for the file at the given path.
 func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return "", err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name)
 		if err == nil {
@@ -491,10 +610,7 @@ func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Creden
 // SetxattrAt changes the value associated with the given extended attribute
 // for the file at the given path.
 func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
 		if err == nil {
@@ -510,10 +626,7 @@ func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Creden
 
 // RemovexattrAt removes the given extended attribute from the file at rp.
 func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
 		if err == nil {
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go
index 1987e89cc..2269f6237 100644
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@@ -45,6 +45,7 @@ var (
 	ELIBBAD      = error(syscall.ELIBBAD)
 	ELOOP        = error(syscall.ELOOP)
 	EMFILE       = error(syscall.EMFILE)
+	EMLINK       = error(syscall.EMLINK)
 	EMSGSIZE     = error(syscall.EMSGSIZE)
 	ENAMETOOLONG = error(syscall.ENAMETOOLONG)
 	ENOATTR      = ENODATA
-- 
cgit v1.2.3


From 574e988f2bc6060078a17f37a377441703c52a22 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Mon, 23 Dec 2019 17:31:20 -0800
Subject: Fix deadlock in kernfs.Filesystem.revalidateChildLocked

It was calling Dentry.InsertChild with the dentry's mutex
already locked.

Updates #1035

PiperOrigin-RevId: 286962742
---
 pkg/sentry/fsimpl/kernfs/filesystem.go |  2 +-
 pkg/sentry/fsimpl/kernfs/kernfs.go     | 12 ++++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index a6f9fced5..79759e0fc 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -122,7 +122,7 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 			return nil, err
 		}
 		// Reference on childVFSD dropped by a corresponding Valid.
-		parent.InsertChild(name, childVFSD)
+		parent.insertChildLocked(name, childVFSD)
 	}
 	return childVFSD.Impl().(*Dentry), nil
 }
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index bb01c3d01..ac802218d 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -239,14 +239,22 @@ func (d *Dentry) destroy() {
 //
 // Precondition: d must represent a directory inode.
 func (d *Dentry) InsertChild(name string, child *vfs.Dentry) {
+	d.dirMu.Lock()
+	d.insertChildLocked(name, child)
+	d.dirMu.Unlock()
+}
+
+// insertChildLocked is equivalent to InsertChild, with additional
+// preconditions.
+//
+// Precondition: d.dirMu must be locked.
+func (d *Dentry) insertChildLocked(name string, child *vfs.Dentry) {
 	if !d.isDir() {
 		panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
 	}
 	vfsDentry := d.VFSDentry()
 	vfsDentry.IncRef() // DecRef in child's Dentry.destroy.
-	d.dirMu.Lock()
 	vfsDentry.InsertChild(child, name)
-	d.dirMu.Unlock()
 }
 
 // The Inode interface maps filesystem-level operations that operate on paths to
-- 
cgit v1.2.3


From 7b83d21856e569742397ab8b0146910eeff1462f Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Tue, 24 Dec 2019 10:50:05 +0800
Subject: slight changes to ring0&pagetables for Arm64

There are 2 jobs have been finished in this patch:
1, a comment was added to explain the purpose of the extra NOPs in Vectors().
2, some merge errors were fixed.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/ring0/entry_arm64.s    | 4 ++++
 pkg/sentry/platform/ring0/lib_arm64.s      | 7 +++++--
 pkg/sentry/platform/ring0/pagetables/BUILD | 5 ++++-
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index add2c3e08..813ef9822 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -554,6 +554,10 @@ TEXT ·Vectors(SB),NOSPLIT,$0
 	B ·El0_error_invalid(SB)
 	nop31Instructions()
 
+	// The exception-vector-table is required to be 11-bits aligned.
+	// Please see Linux source code as reference: arch/arm64/kernel/entry.s.
+	// For gvisor, I defined it as 4K in length, filled the 2nd 2K part with NOPs.
+	// So that, I can safely move the 1st 2K part into the address with 11-bits alignment.
 	WORD $0xd503201f	//nop
 	nop31Instructions()
 	WORD $0xd503201f
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s
index 1c9171004..0e6a6235b 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.s
+++ b/pkg/sentry/platform/ring0/lib_arm64.s
@@ -12,12 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "funcdata.h"
+#include "textflag.h"
+
 TEXT ·CPACREL1(SB),NOSPLIT,$0-8
 	WORD $0xd5381041 	// MRS CPACR_EL1, R1
 	MOVD R1, ret+0(FP)
 	RET
 
-TEXT ·FPCR(SB),NOSPLIT,$0-8
+TEXT ·GetFPCR(SB),NOSPLIT,$0-8
 	WORD $0xd53b4201    	// MRS NZCV, R1
 	MOVD R1, ret+0(FP)
 	RET
@@ -27,7 +30,7 @@ TEXT ·GetFPSR(SB),NOSPLIT,$0-8
 	MOVD R1, ret+0(FP)
 	RET
 
-TEXT ·FPCR(SB),NOSPLIT,$0-8
+TEXT ·SetFPCR(SB),NOSPLIT,$0-8
 	MOVD addr+0(FP), R1
 	WORD $0xd51b4201  	// MSR R1, NZCV
 	RET
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index e2e15ba5c..09ecc3b09 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -11,7 +11,10 @@ config_setting(
 
 go_template(
     name = "generic_walker",
-    srcs = ["walker_amd64.go"],
+    srcs = select({
+           ":aarch64": ["walker_arm64.go",],
+           "//conditions:default": ["walker_amd64.go",],
+    }),
     opt_types = [
         "Visitor",
     ],
-- 
cgit v1.2.3


From e013c48c78c9a7daf245b7de9563e3a0bd8a1e97 Mon Sep 17 00:00:00 2001
From: Ryan Heacock <rheacock@google.com>
Date: Tue, 24 Dec 2019 08:48:14 -0800
Subject: Enable IP_RECVTOS socket option for datagram sockets

Added the ability to get/set the IP_RECVTOS socket option on UDP endpoints. If
enabled, TOS from the incoming Network Header passed as ancillary data in the
ControlMessages.

Test:
* Added unit test to udp_test.go that tests getting/setting as well as
verifying that we receive expected TOS from incoming packet.
* Added a syscall test
PiperOrigin-RevId: 287029703
---
 pkg/sentry/socket/control/control.go         |  2 +-
 pkg/sentry/socket/netstack/netstack.go       | 42 ++++++++++++++++-
 pkg/tcpip/checker/checker.go                 | 16 +++++++
 pkg/tcpip/stack/nic.go                       |  2 +-
 pkg/tcpip/stack/stack.go                     |  2 +-
 pkg/tcpip/tcpip.go                           |  6 ++-
 pkg/tcpip/transport/raw/endpoint.go          |  2 +-
 pkg/tcpip/transport/udp/endpoint.go          | 31 ++++++++++++-
 pkg/tcpip/transport/udp/udp_test.go          | 69 ++++++++++++++++++++++++----
 test/syscalls/linux/socket_ip_udp_generic.cc | 40 ++++++++++++++++
 test/syscalls/linux/udp_socket_test_cases.cc |  8 ++--
 11 files changed, 201 insertions(+), 19 deletions(-)

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index af1a4e95f..b649dd021 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -327,7 +327,7 @@ func PackInq(t *kernel.Task, inq int32, buf []byte) []byte {
 }
 
 // PackTOS packs an IP_TOS socket control message.
-func PackTOS(t *kernel.Task, tos int8, buf []byte) []byte {
+func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
 	return putCmsgStruct(
 		buf,
 		linux.SOL_IP,
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 140851c17..d2f263402 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1323,6 +1323,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		}
 		return int32(v), nil
 
+	case linux.IP_RECVTOS:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.ReceiveTOSOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		if v {
+			return int32(1), nil
+		}
+		return int32(0), nil
+
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1808,6 +1823,16 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v)))
 
+	case linux.IP_RECVTOS:
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+
+		return syserr.TranslateNetstackError(ep.SetSockOpt(
+			tcpip.ReceiveTOSOption(v != 0),
+		))
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
@@ -1828,7 +1853,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		linux.IP_RECVFRAGSIZE,
 		linux.IP_RECVOPTS,
 		linux.IP_RECVORIGDSTADDR,
-		linux.IP_RECVTOS,
 		linux.IP_RECVTTL,
 		linux.IP_RETOPTS,
 		linux.IP_TRANSPARENT,
@@ -2139,6 +2163,21 @@ func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) {
 	cmsg.IP.Inq = int32(len(s.readView) + rcvBufUsed)
 }
 
+func (s *SocketOperations) fillCmsgTOS(cmsg *socket.ControlMessages) {
+	if s.skType != linux.SOCK_DGRAM {
+		return
+	}
+	var receiveTOS tcpip.ReceiveTOSOption
+	if err := s.Endpoint.GetSockOpt(&receiveTOS); err != nil {
+		return
+	}
+	if !receiveTOS {
+		return
+	}
+	cmsg.IP.HasTOS = s.readCM.HasTOS
+	cmsg.IP.TOS = s.readCM.TOS
+}
+
 // nonBlockingRead issues a non-blocking read.
 //
 // TODO(b/78348848): Support timestamps for stream sockets.
@@ -2244,6 +2283,7 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 
 	cmsg := s.controlMessages()
 	s.fillCmsgInq(&cmsg)
+	s.fillCmsgTOS(&cmsg)
 	return n, flags, addr, addrLen, cmsg, syserr.FromError(err)
 }
 
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 2f15bf1f1..542abc99d 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -33,6 +33,9 @@ type NetworkChecker func(*testing.T, []header.Network)
 // TransportChecker is a function to check a property of a transport packet.
 type TransportChecker func(*testing.T, header.Transport)
 
+// ControlMessagesChecker is a function to check a property of ancillary data.
+type ControlMessagesChecker func(*testing.T, tcpip.ControlMessages)
+
 // IPv4 checks the validity and properties of the given IPv4 packet. It is
 // expected to be used in conjunction with other network checkers for specific
 // properties. For example, to check the source and destination address, one
@@ -158,6 +161,19 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	}
 }
 
+// ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
+func ReceiveTOS(want uint8) ControlMessagesChecker {
+	return func(t *testing.T, cm tcpip.ControlMessages) {
+		t.Helper()
+		if !cm.HasTOS {
+			t.Fatalf("got cm.HasTOS = %t, want cm.TOS = %d", cm.HasTOS, want)
+		}
+		if got := cm.TOS; got != want {
+			t.Fatalf("got cm.TOS = %d, want %d", got, want)
+		}
+	}
+}
+
 // TOS creates a checker that checks the TOS field.
 func TOS(tos uint8, label uint32) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index ddd014658..a4556674b 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -575,7 +575,7 @@ func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) {
 	n.mu.Unlock()
 }
 
-// Subnets returns the Subnets associated with this NIC.
+// AddressRanges returns the Subnets associated with this NIC.
 func (n *NIC) AddressRanges() []tcpip.Subnet {
 	n.mu.RLock()
 	defer n.mu.RUnlock()
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 7a9600679..251336224 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -829,7 +829,7 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	return false
 }
 
-// NICSubnets returns a map of NICIDs to their associated subnets.
+// NICAddressRanges returns a map of NICIDs to their associated subnets.
 func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index f62fd729f..5c7b2af88 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -322,7 +322,7 @@ type ControlMessages struct {
 	HasTOS bool
 
 	// TOS is the IPv4 type of service of the associated packet.
-	TOS int8
+	TOS uint8
 
 	// HasTClass indicates whether Tclass is valid/set.
 	HasTClass bool
@@ -666,6 +666,10 @@ type IPv4TOSOption uint8
 // for all subsequent outgoing IPv6 packets from the endpoint.
 type IPv6TrafficClassOption uint8
 
+// ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
+// ancillary message is passed with incoming packets.
+type ReceiveTOSOption bool
+
 // Route is a row in the routing table. It specifies through which NIC (and
 // gateway) sets of packets should be routed. A row is considered viable if the
 // masked target address matches the destination address in the row.
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 5aafe2615..6d23ab5a1 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -510,7 +510,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
-func (ep *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 1ac4705af..269470ed4 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -32,6 +32,7 @@ type udpPacket struct {
 	senderAddress tcpip.FullAddress
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
+	tos           uint8
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -114,6 +115,10 @@ type endpoint struct {
 	// applied while sending packets. Defaults to 0 as on Linux.
 	sendTOS uint8
 
+	// receiveTOS determines if the incoming IPv4 TOS header field is passed
+	// as ancillary data to ControlMessages on Read.
+	receiveTOS bool
+
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -244,7 +249,12 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 		*addr = p.senderAddress
 	}
 
-	return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil
+	return p.data.ToView(), tcpip.ControlMessages{
+		HasTimestamp: true,
+		Timestamp:    p.timestamp,
+		HasTOS:       e.receiveTOS,
+		TOS:          p.tos,
+	}, nil
 }
 
 // prepareForWrite prepares the endpoint for sending data. In particular, it
@@ -656,6 +666,12 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.sendTOS = uint8(v)
 		e.mu.Unlock()
 		return nil
+
+	case tcpip.ReceiveTOSOption:
+		e.mu.Lock()
+		e.receiveTOS = bool(v)
+		e.mu.Unlock()
+		return nil
 	}
 	return nil
 }
@@ -792,6 +808,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.RUnlock()
 		return nil
 
+	case *tcpip.ReceiveTOSOption:
+		e.mu.RLock()
+		*o = tcpip.ReceiveTOSOption(e.receiveTOS)
+		e.mu.RUnlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -1238,6 +1260,13 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	e.rcvList.PushBack(packet)
 	e.rcvBufSize += pkt.Data.Size()
 
+	// Save any useful information from the NetworkHeader to the packet.
+	switch r.NetProto {
+	case header.IPv4ProtocolNumber:
+		// This packet has already been validated before being passed up the stack.
+		packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
+	}
+
 	packet.timestamp = e.stack.NowNanoseconds()
 
 	e.rcvMu.Unlock()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 7051a7a9c..43b8b35ba 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -56,6 +56,7 @@ const (
 	multicastAddr   = "\xe8\x2b\xd3\xea"
 	multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
 	broadcastAddr   = header.IPv4Broadcast
+	testTOS         = 0x80
 
 	// defaultMTU is the MTU, in bytes, used throughout the tests, except
 	// where another value is explicitly used. It is chosen to match the MTU
@@ -453,6 +454,7 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
 	ip := header.IPv4(buf)
 	ip.Encode(&header.IPv4Fields{
 		IHL:         header.IPv4MinimumSize,
+		TOS:         testTOS,
 		TotalLength: uint16(len(buf)),
 		TTL:         65,
 		Protocol:    uint8(udp.ProtocolNumber),
@@ -556,8 +558,8 @@ func TestBindToDeviceOption(t *testing.T) {
 // testReadInternal sends a packet of the given test flow into the stack by
 // injecting it into the link endpoint. It then attempts to read it from the
 // UDP endpoint and depending on if this was expected to succeed verifies its
-// correctness.
-func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool) {
+// correctness  including any additional checker functions provided.
+func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool, checkers ...checker.ControlMessagesChecker) {
 	c.t.Helper()
 
 	payload := newPayload()
@@ -572,12 +574,12 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
 
 	var addr tcpip.FullAddress
-	v, _, err := c.ep.Read(&addr)
+	v, cm, err := c.ep.Read(&addr)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for data to become available.
 		select {
 		case <-ch:
-			v, _, err = c.ep.Read(&addr)
+			v, cm, err = c.ep.Read(&addr)
 
 		case <-time.After(300 * time.Millisecond):
 			if packetShouldBeDropped {
@@ -610,15 +612,21 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	if !bytes.Equal(payload, v) {
 		c.t.Fatalf("bad payload: got %x, want %x", v, payload)
 	}
+
+	// Run any checkers against the ControlMessages.
+	for _, f := range checkers {
+		f(c.t, cm)
+	}
+
 	c.checkEndpointReadStats(1, epstats, err)
 }
 
 // testRead sends a packet of the given test flow into the stack by injecting it
 // into the link endpoint. It then reads it from the UDP endpoint and verifies
-// its correctness.
-func testRead(c *testContext, flow testFlow) {
+// its correctness including any additional checker functions provided.
+func testRead(c *testContext, flow testFlow, checkers ...checker.ControlMessagesChecker) {
 	c.t.Helper()
-	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */)
+	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */, checkers...)
 }
 
 // testFailingRead sends a packet of the given test flow into the stack by
@@ -1286,7 +1294,7 @@ func TestTOSV4(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = 0xC0
+			const tos = testTOS
 			var v tcpip.IPv4TOSOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1321,7 +1329,7 @@ func TestTOSV6(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = 0xC0
+			const tos = testTOS
 			var v tcpip.IPv6TrafficClassOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1348,6 +1356,49 @@ func TestTOSV6(t *testing.T) {
 	}
 }
 
+func TestReceiveTOSV4(t *testing.T) {
+	for _, flow := range []testFlow{unicastV4, broadcast} {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			c := newDualTestContext(t, defaultMTU)
+			defer c.cleanup()
+
+			c.createEndpointForFlow(flow)
+
+			// Verify that setting and reading the option works.
+			const recvTos = true
+			var v tcpip.ReceiveTOSOption
+			if err := c.ep.GetSockOpt(&v); err != nil {
+				c.t.Errorf("GetSockopt failed: %s", err)
+			}
+			// Test for expected default value.
+			if v != false {
+				c.t.Errorf("got GetSockOpt(...) = %t, want = %t", v, false)
+			}
+
+			if err := c.ep.SetSockOpt(tcpip.ReceiveTOSOption(recvTos)); err != nil {
+				c.t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.ReceiveTOSOption(recvTos), err)
+			}
+
+			if err := c.ep.GetSockOpt(&v); err != nil {
+				c.t.Errorf("GetSockopt failed: %s", err)
+			}
+
+			if want := tcpip.ReceiveTOSOption(recvTos); v != want {
+				c.t.Errorf("got GetSockOpt(...) = %t, want = %t", v, want)
+			}
+
+			// Bind to wildcard.
+			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+				c.t.Fatalf("Bind failed: %s", err)
+			}
+
+			// Verify that the correct received TOS is actually handed through as
+			// ancillary data to the ControlMessages struct.
+			testRead(c, flow, checker.ReceiveTOS(testTOS))
+		})
+	}
+}
+
 func TestMulticastInterfaceOption(t *testing.T) {
 	for _, flow := range []testFlow{multicastV4, multicastV4in6, multicastV6, multicastV6Only} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 66eb68857..53290bed7 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -209,6 +209,46 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
+// Ensure that Receiving TOS is off by default.
+TEST_P(UDPSocketPairTest, RecvTosDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that setting and getting IP_RECVTOS works as expected.
+TEST_P(UDPSocketPairTest, SetRecvTos) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
+                         &kSockOptOff, sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+}
+
 TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index dc35c2f50..68e0a8109 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,8 +1349,9 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
-  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
+          !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1421,7 +1422,8 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From 87e4d03fdf576348ac7023c599e0fc66ad4cccbd Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 26 Dec 2019 13:04:14 -0800
Subject: Automated rollback of changelist 287029703

PiperOrigin-RevId: 287217899
---
 pkg/sentry/socket/control/control.go         |  2 +-
 pkg/sentry/socket/netstack/netstack.go       | 42 +----------------
 pkg/tcpip/checker/checker.go                 | 16 -------
 pkg/tcpip/stack/nic.go                       |  2 +-
 pkg/tcpip/stack/stack.go                     |  2 +-
 pkg/tcpip/tcpip.go                           |  6 +--
 pkg/tcpip/transport/raw/endpoint.go          |  2 +-
 pkg/tcpip/transport/udp/endpoint.go          | 31 +------------
 pkg/tcpip/transport/udp/udp_test.go          | 69 ++++------------------------
 test/syscalls/linux/socket_ip_udp_generic.cc | 40 ----------------
 test/syscalls/linux/udp_socket_test_cases.cc |  8 ++--
 11 files changed, 19 insertions(+), 201 deletions(-)

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index b649dd021..af1a4e95f 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -327,7 +327,7 @@ func PackInq(t *kernel.Task, inq int32, buf []byte) []byte {
 }
 
 // PackTOS packs an IP_TOS socket control message.
-func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
+func PackTOS(t *kernel.Task, tos int8, buf []byte) []byte {
 	return putCmsgStruct(
 		buf,
 		linux.SOL_IP,
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index d2f263402..140851c17 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1323,21 +1323,6 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		}
 		return int32(v), nil
 
-	case linux.IP_RECVTOS:
-		if outLen < sizeOfInt32 {
-			return nil, syserr.ErrInvalidArgument
-		}
-
-		var v tcpip.ReceiveTOSOption
-		if err := ep.GetSockOpt(&v); err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-
-		if v {
-			return int32(1), nil
-		}
-		return int32(0), nil
-
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1823,16 +1808,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v)))
 
-	case linux.IP_RECVTOS:
-		v, err := parseIntOrChar(optVal)
-		if err != nil {
-			return err
-		}
-
-		return syserr.TranslateNetstackError(ep.SetSockOpt(
-			tcpip.ReceiveTOSOption(v != 0),
-		))
-
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
@@ -1853,6 +1828,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		linux.IP_RECVFRAGSIZE,
 		linux.IP_RECVOPTS,
 		linux.IP_RECVORIGDSTADDR,
+		linux.IP_RECVTOS,
 		linux.IP_RECVTTL,
 		linux.IP_RETOPTS,
 		linux.IP_TRANSPARENT,
@@ -2163,21 +2139,6 @@ func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) {
 	cmsg.IP.Inq = int32(len(s.readView) + rcvBufUsed)
 }
 
-func (s *SocketOperations) fillCmsgTOS(cmsg *socket.ControlMessages) {
-	if s.skType != linux.SOCK_DGRAM {
-		return
-	}
-	var receiveTOS tcpip.ReceiveTOSOption
-	if err := s.Endpoint.GetSockOpt(&receiveTOS); err != nil {
-		return
-	}
-	if !receiveTOS {
-		return
-	}
-	cmsg.IP.HasTOS = s.readCM.HasTOS
-	cmsg.IP.TOS = s.readCM.TOS
-}
-
 // nonBlockingRead issues a non-blocking read.
 //
 // TODO(b/78348848): Support timestamps for stream sockets.
@@ -2283,7 +2244,6 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 
 	cmsg := s.controlMessages()
 	s.fillCmsgInq(&cmsg)
-	s.fillCmsgTOS(&cmsg)
 	return n, flags, addr, addrLen, cmsg, syserr.FromError(err)
 }
 
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 542abc99d..2f15bf1f1 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -33,9 +33,6 @@ type NetworkChecker func(*testing.T, []header.Network)
 // TransportChecker is a function to check a property of a transport packet.
 type TransportChecker func(*testing.T, header.Transport)
 
-// ControlMessagesChecker is a function to check a property of ancillary data.
-type ControlMessagesChecker func(*testing.T, tcpip.ControlMessages)
-
 // IPv4 checks the validity and properties of the given IPv4 packet. It is
 // expected to be used in conjunction with other network checkers for specific
 // properties. For example, to check the source and destination address, one
@@ -161,19 +158,6 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	}
 }
 
-// ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
-func ReceiveTOS(want uint8) ControlMessagesChecker {
-	return func(t *testing.T, cm tcpip.ControlMessages) {
-		t.Helper()
-		if !cm.HasTOS {
-			t.Fatalf("got cm.HasTOS = %t, want cm.TOS = %d", cm.HasTOS, want)
-		}
-		if got := cm.TOS; got != want {
-			t.Fatalf("got cm.TOS = %d, want %d", got, want)
-		}
-	}
-}
-
 // TOS creates a checker that checks the TOS field.
 func TOS(tos uint8, label uint32) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index a4556674b..ddd014658 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -575,7 +575,7 @@ func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) {
 	n.mu.Unlock()
 }
 
-// AddressRanges returns the Subnets associated with this NIC.
+// Subnets returns the Subnets associated with this NIC.
 func (n *NIC) AddressRanges() []tcpip.Subnet {
 	n.mu.RLock()
 	defer n.mu.RUnlock()
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 251336224..7a9600679 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -829,7 +829,7 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	return false
 }
 
-// NICAddressRanges returns a map of NICIDs to their associated subnets.
+// NICSubnets returns a map of NICIDs to their associated subnets.
 func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 5c7b2af88..f62fd729f 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -322,7 +322,7 @@ type ControlMessages struct {
 	HasTOS bool
 
 	// TOS is the IPv4 type of service of the associated packet.
-	TOS uint8
+	TOS int8
 
 	// HasTClass indicates whether Tclass is valid/set.
 	HasTClass bool
@@ -666,10 +666,6 @@ type IPv4TOSOption uint8
 // for all subsequent outgoing IPv6 packets from the endpoint.
 type IPv6TrafficClassOption uint8
 
-// ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
-// ancillary message is passed with incoming packets.
-type ReceiveTOSOption bool
-
 // Route is a row in the routing table. It specifies through which NIC (and
 // gateway) sets of packets should be routed. A row is considered viable if the
 // masked target address matches the destination address in the row.
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 6d23ab5a1..5aafe2615 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -510,7 +510,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
-func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (ep *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 269470ed4..1ac4705af 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -32,7 +32,6 @@ type udpPacket struct {
 	senderAddress tcpip.FullAddress
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
-	tos           uint8
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -115,10 +114,6 @@ type endpoint struct {
 	// applied while sending packets. Defaults to 0 as on Linux.
 	sendTOS uint8
 
-	// receiveTOS determines if the incoming IPv4 TOS header field is passed
-	// as ancillary data to ControlMessages on Read.
-	receiveTOS bool
-
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -249,12 +244,7 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 		*addr = p.senderAddress
 	}
 
-	return p.data.ToView(), tcpip.ControlMessages{
-		HasTimestamp: true,
-		Timestamp:    p.timestamp,
-		HasTOS:       e.receiveTOS,
-		TOS:          p.tos,
-	}, nil
+	return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil
 }
 
 // prepareForWrite prepares the endpoint for sending data. In particular, it
@@ -666,12 +656,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.sendTOS = uint8(v)
 		e.mu.Unlock()
 		return nil
-
-	case tcpip.ReceiveTOSOption:
-		e.mu.Lock()
-		e.receiveTOS = bool(v)
-		e.mu.Unlock()
-		return nil
 	}
 	return nil
 }
@@ -808,12 +792,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.RUnlock()
 		return nil
 
-	case *tcpip.ReceiveTOSOption:
-		e.mu.RLock()
-		*o = tcpip.ReceiveTOSOption(e.receiveTOS)
-		e.mu.RUnlock()
-		return nil
-
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -1260,13 +1238,6 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	e.rcvList.PushBack(packet)
 	e.rcvBufSize += pkt.Data.Size()
 
-	// Save any useful information from the NetworkHeader to the packet.
-	switch r.NetProto {
-	case header.IPv4ProtocolNumber:
-		// This packet has already been validated before being passed up the stack.
-		packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
-	}
-
 	packet.timestamp = e.stack.NowNanoseconds()
 
 	e.rcvMu.Unlock()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 43b8b35ba..7051a7a9c 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -56,7 +56,6 @@ const (
 	multicastAddr   = "\xe8\x2b\xd3\xea"
 	multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
 	broadcastAddr   = header.IPv4Broadcast
-	testTOS         = 0x80
 
 	// defaultMTU is the MTU, in bytes, used throughout the tests, except
 	// where another value is explicitly used. It is chosen to match the MTU
@@ -454,7 +453,6 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
 	ip := header.IPv4(buf)
 	ip.Encode(&header.IPv4Fields{
 		IHL:         header.IPv4MinimumSize,
-		TOS:         testTOS,
 		TotalLength: uint16(len(buf)),
 		TTL:         65,
 		Protocol:    uint8(udp.ProtocolNumber),
@@ -558,8 +556,8 @@ func TestBindToDeviceOption(t *testing.T) {
 // testReadInternal sends a packet of the given test flow into the stack by
 // injecting it into the link endpoint. It then attempts to read it from the
 // UDP endpoint and depending on if this was expected to succeed verifies its
-// correctness  including any additional checker functions provided.
-func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool, checkers ...checker.ControlMessagesChecker) {
+// correctness.
+func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool) {
 	c.t.Helper()
 
 	payload := newPayload()
@@ -574,12 +572,12 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
 
 	var addr tcpip.FullAddress
-	v, cm, err := c.ep.Read(&addr)
+	v, _, err := c.ep.Read(&addr)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for data to become available.
 		select {
 		case <-ch:
-			v, cm, err = c.ep.Read(&addr)
+			v, _, err = c.ep.Read(&addr)
 
 		case <-time.After(300 * time.Millisecond):
 			if packetShouldBeDropped {
@@ -612,21 +610,15 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	if !bytes.Equal(payload, v) {
 		c.t.Fatalf("bad payload: got %x, want %x", v, payload)
 	}
-
-	// Run any checkers against the ControlMessages.
-	for _, f := range checkers {
-		f(c.t, cm)
-	}
-
 	c.checkEndpointReadStats(1, epstats, err)
 }
 
 // testRead sends a packet of the given test flow into the stack by injecting it
 // into the link endpoint. It then reads it from the UDP endpoint and verifies
-// its correctness including any additional checker functions provided.
-func testRead(c *testContext, flow testFlow, checkers ...checker.ControlMessagesChecker) {
+// its correctness.
+func testRead(c *testContext, flow testFlow) {
 	c.t.Helper()
-	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */, checkers...)
+	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */)
 }
 
 // testFailingRead sends a packet of the given test flow into the stack by
@@ -1294,7 +1286,7 @@ func TestTOSV4(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = testTOS
+			const tos = 0xC0
 			var v tcpip.IPv4TOSOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1329,7 +1321,7 @@ func TestTOSV6(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = testTOS
+			const tos = 0xC0
 			var v tcpip.IPv6TrafficClassOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1356,49 +1348,6 @@ func TestTOSV6(t *testing.T) {
 	}
 }
 
-func TestReceiveTOSV4(t *testing.T) {
-	for _, flow := range []testFlow{unicastV4, broadcast} {
-		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
-			c := newDualTestContext(t, defaultMTU)
-			defer c.cleanup()
-
-			c.createEndpointForFlow(flow)
-
-			// Verify that setting and reading the option works.
-			const recvTos = true
-			var v tcpip.ReceiveTOSOption
-			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
-			}
-			// Test for expected default value.
-			if v != false {
-				c.t.Errorf("got GetSockOpt(...) = %t, want = %t", v, false)
-			}
-
-			if err := c.ep.SetSockOpt(tcpip.ReceiveTOSOption(recvTos)); err != nil {
-				c.t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.ReceiveTOSOption(recvTos), err)
-			}
-
-			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
-			}
-
-			if want := tcpip.ReceiveTOSOption(recvTos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %t, want = %t", v, want)
-			}
-
-			// Bind to wildcard.
-			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-				c.t.Fatalf("Bind failed: %s", err)
-			}
-
-			// Verify that the correct received TOS is actually handed through as
-			// ancillary data to the ControlMessages struct.
-			testRead(c, flow, checker.ReceiveTOS(testTOS))
-		})
-	}
-}
-
 func TestMulticastInterfaceOption(t *testing.T) {
 	for _, flow := range []testFlow{multicastV4, multicastV4in6, multicastV6, multicastV6Only} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 53290bed7..66eb68857 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -209,46 +209,6 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
-// Ensure that Receiving TOS is off by default.
-TEST_P(UDPSocketPairTest, RecvTosDefault) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-}
-
-// Test that setting and getting IP_RECVTOS works as expected.
-TEST_P(UDPSocketPairTest, SetRecvTos) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOff, sizeof(kSockOptOff)),
-              SyscallSucceeds());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOn, sizeof(kSockOptOn)),
-              SyscallSucceeds());
-
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOn);
-}
-
 TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 68e0a8109..dc35c2f50 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,9 +1349,8 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
-  SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
-          !IsRunningWithHostinet());
+  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
+  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1422,8 +1421,7 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
-  // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
+  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From 3c125eb21946e1f6bf8f22f4169baafb7f07bf60 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 26 Dec 2019 14:42:19 -0800
Subject: Initial procfs implementation in VFSv2

Updates #1195

PiperOrigin-RevId: 287227722
---
 pkg/sentry/fsimpl/kernfs/BUILD                 |   1 +
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go |  20 +-
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go       |   5 +-
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go    |  20 ++
 pkg/sentry/fsimpl/kernfs/kernfs.go             |   9 +
 pkg/sentry/fsimpl/kernfs/kernfs_test.go        |   2 +-
 pkg/sentry/fsimpl/kernfs/symlink.go            |  45 +++
 pkg/sentry/fsimpl/proc/BUILD                   |  33 +-
 pkg/sentry/fsimpl/proc/boot_test.go            | 149 +++++++++
 pkg/sentry/fsimpl/proc/filesystem.go           |  69 +++++
 pkg/sentry/fsimpl/proc/filesystems.go          |  25 --
 pkg/sentry/fsimpl/proc/loadavg.go              |   8 +-
 pkg/sentry/fsimpl/proc/meminfo.go              |   6 +-
 pkg/sentry/fsimpl/proc/proc.go                 |  16 -
 pkg/sentry/fsimpl/proc/stat.go                 |   6 +-
 pkg/sentry/fsimpl/proc/task.go                 | 341 ++++++++------------
 pkg/sentry/fsimpl/proc/task_files.go           | 272 ++++++++++++++++
 pkg/sentry/fsimpl/proc/tasks.go                | 162 ++++++++++
 pkg/sentry/fsimpl/proc/tasks_files.go          |  92 ++++++
 pkg/sentry/fsimpl/proc/tasks_test.go           | 412 +++++++++++++++++++++++++
 pkg/sentry/fsimpl/proc/version.go              |   6 +-
 pkg/sentry/kernel/kernel.go                    |   7 +-
 pkg/sentry/kernel/task_clone.go                |   2 +-
 pkg/sentry/kernel/thread_group.go              |   8 +-
 pkg/sentry/vfs/file_description_impl_util.go   |  11 +
 25 files changed, 1454 insertions(+), 273 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/kernfs/symlink.go
 create mode 100644 pkg/sentry/fsimpl/proc/boot_test.go
 create mode 100644 pkg/sentry/fsimpl/proc/filesystem.go
 delete mode 100644 pkg/sentry/fsimpl/proc/filesystems.go
 delete mode 100644 pkg/sentry/fsimpl/proc/proc.go
 create mode 100644 pkg/sentry/fsimpl/proc/task_files.go
 create mode 100644 pkg/sentry/fsimpl/proc/tasks.go
 create mode 100644 pkg/sentry/fsimpl/proc/tasks_files.go
 create mode 100644 pkg/sentry/fsimpl/proc/tasks_test.go

diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 59f7f39e2..39c03ee9d 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -25,6 +25,7 @@ go_library(
         "inode_impl_util.go",
         "kernfs.go",
         "slot_list.go",
+        "symlink.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs",
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 51102ce48..c5fe65722 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -15,6 +15,8 @@
 package kernfs
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -26,7 +28,10 @@ import (
 // DynamicBytesFile implements kernfs.Inode and represents a read-only
 // file whose contents are backed by a vfs.DynamicBytesSource.
 //
-// Must be initialized with Init before first use.
+// Must be instantiated with NewDynamicBytesFile or initialized with Init
+// before first use.
+//
+// +stateify savable
 type DynamicBytesFile struct {
 	InodeAttrs
 	InodeNoopRefCount
@@ -36,9 +41,14 @@ type DynamicBytesFile struct {
 	data vfs.DynamicBytesSource
 }
 
-// Init intializes a dynamic bytes file.
-func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource) {
-	f.InodeAttrs.Init(creds, ino, linux.ModeRegular|0444)
+var _ Inode = (*DynamicBytesFile)(nil)
+
+// Init initializes a dynamic bytes file.
+func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) {
+	if perm&^linux.PermissionsMask != 0 {
+		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
+	}
+	f.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm)
 	f.data = data
 }
 
@@ -59,6 +69,8 @@ func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
 // DynamicBytesFile.
 //
 // Must be initialized with Init before first use.
+//
+// +stateify savable
 type DynamicBytesFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.DynamicBytesFileDescriptionImpl
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index bd402330f..77975583b 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -154,7 +154,10 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 		fd.off++
 	}
 
-	return nil
+	var err error
+	relOffset := fd.off - int64(len(fd.children.set)) - 2
+	fd.off, err = fd.inode().IterDirents(ctx, cb, fd.off, relOffset)
+	return err
 }
 
 // Seek implements vfs.FileDecriptionImpl.Seek.
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 7b45b702a..752e0f659 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -139,6 +139,11 @@ func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry,
 	panic("Lookup called on non-directory inode")
 }
 
+// IterDirents implements Inode.IterDirents.
+func (*InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+	panic("IterDirents called on non-directory inode")
+}
+
 // Valid implements Inode.Valid.
 func (*InodeNotDirectory) Valid(context.Context) bool {
 	return true
@@ -156,6 +161,11 @@ func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dent
 	return nil, syserror.ENOENT
 }
 
+// IterDirents implements Inode.IterDirents.
+func (*InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	return offset, nil
+}
+
 // Valid implements Inode.Valid.
 func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool {
 	return true
@@ -490,3 +500,13 @@ func (o *OrderedChildren) nthLocked(i int64) *slot {
 	}
 	return nil
 }
+
+// InodeSymlink partially implements Inode interface for symlinks.
+type InodeSymlink struct {
+	InodeNotDirectory
+}
+
+// Open implements Inode.Open.
+func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	return nil, syserror.ELOOP
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index ac802218d..d69b299ae 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -404,6 +404,15 @@ type inodeDynamicLookup interface {
 	// Valid should return true if this inode is still valid, or needs to
 	// be resolved again by a call to Lookup.
 	Valid(ctx context.Context) bool
+
+	// IterDirents is used to iterate over dynamically created entries. It invokes
+	// cb on each entry in the directory represented by the FileDescription.
+	// 'offset' is the offset for the entire IterDirents call, which may include
+	// results from the caller. 'relOffset' is the offset inside the entries
+	// returned by this IterDirents invocation. In other words,
+	// 'offset+relOffset+1' is the value that should be set in vfs.Dirent.NextOff,
+	// while 'relOffset' is the place where iteration should start from.
+	IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
 }
 
 type inodeSymlink interface {
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 73b6e43b5..3db12caa0 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -133,7 +133,7 @@ type file struct {
 func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry {
 	f := &file{}
 	f.content = content
-	f.DynamicBytesFile.Init(creds, fs.NextIno(), f)
+	f.DynamicBytesFile.Init(creds, fs.NextIno(), f, 0777)
 
 	d := &kernfs.Dentry{}
 	d.Init(f)
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
new file mode 100644
index 000000000..068063f4e
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+type staticSymlink struct {
+	InodeAttrs
+	InodeNoopRefCount
+	InodeSymlink
+
+	target string
+}
+
+var _ Inode = (*staticSymlink)(nil)
+
+// NewStaticSymlink creates a new symlink file pointing to 'target'.
+func NewStaticSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, target string) *Dentry {
+	inode := &staticSymlink{target: target}
+	inode.Init(creds, ino, linux.ModeSymlink|perm)
+
+	d := &Dentry{}
+	d.Init(inode)
+	return d
+}
+
+func (s *staticSymlink) Readlink(_ context.Context) (string, error) {
+	return s.target, nil
+}
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index ade6ac946..1f44b3217 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -6,15 +6,17 @@ package(licenses = ["notice"])
 go_library(
     name = "proc",
     srcs = [
-        "filesystems.go",
+        "filesystem.go",
         "loadavg.go",
         "meminfo.go",
         "mounts.go",
         "net.go",
-        "proc.go",
         "stat.go",
         "sys.go",
         "task.go",
+        "task_files.go",
+        "tasks.go",
+        "tasks_files.go",
         "version.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc",
@@ -24,8 +26,10 @@ go_library(
         "//pkg/log",
         "//pkg/sentry/context",
         "//pkg/sentry/fs",
+        "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
         "//pkg/sentry/limits",
         "//pkg/sentry/mm",
         "//pkg/sentry/socket",
@@ -34,17 +38,40 @@ go_library(
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
+        "//pkg/syserror",
     ],
 )
 
 go_test(
     name = "proc_test",
     size = "small",
-    srcs = ["net_test.go"],
+    srcs = [
+        "boot_test.go",
+        "net_test.go",
+        "tasks_test.go",
+    ],
     embed = [":proc"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/cpuid",
+        "//pkg/fspath",
+        "//pkg/memutil",
+        "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/fs",
         "//pkg/sentry/inet",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/sched",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/platform/kvm",
+        "//pkg/sentry/platform/ptrace",
+        "//pkg/sentry/time",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/fsimpl/proc/boot_test.go b/pkg/sentry/fsimpl/proc/boot_test.go
new file mode 100644
index 000000000..84a93ee56
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/boot_test.go
@@ -0,0 +1,149 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"runtime"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/memutil"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/time"
+
+	// Platforms are plugable.
+	_ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
+	_ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
+)
+
+var (
+	platformFlag = flag.String("platform", "ptrace", "specify which platform to use")
+)
+
+// boot initializes a new bare bones kernel for test.
+func boot() (*kernel.Kernel, error) {
+	platformCtr, err := platform.Lookup(*platformFlag)
+	if err != nil {
+		return nil, fmt.Errorf("platform not found: %v", err)
+	}
+	deviceFile, err := platformCtr.OpenDevice()
+	if err != nil {
+		return nil, fmt.Errorf("creating platform: %v", err)
+	}
+	plat, err := platformCtr.New(deviceFile)
+	if err != nil {
+		return nil, fmt.Errorf("creating platform: %v", err)
+	}
+
+	k := &kernel.Kernel{
+		Platform: plat,
+	}
+
+	mf, err := createMemoryFile()
+	if err != nil {
+		return nil, err
+	}
+	k.SetMemoryFile(mf)
+
+	// Pass k as the platform since it is savable, unlike the actual platform.
+	vdso, err := loader.PrepareVDSO(nil, k)
+	if err != nil {
+		return nil, fmt.Errorf("creating vdso: %v", err)
+	}
+
+	// Create timekeeper.
+	tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
+	if err != nil {
+		return nil, fmt.Errorf("creating timekeeper: %v", err)
+	}
+	tk.SetClocks(time.NewCalibratedClocks())
+
+	creds := auth.NewRootCredentials(auth.NewRootUserNamespace())
+
+	// Initiate the Kernel object, which is required by the Context passed
+	// to createVFS in order to mount (among other things) procfs.
+	if err = k.Init(kernel.InitKernelArgs{
+		ApplicationCores:            uint(runtime.GOMAXPROCS(-1)),
+		FeatureSet:                  cpuid.HostFeatureSet(),
+		Timekeeper:                  tk,
+		RootUserNamespace:           creds.UserNamespace,
+		Vdso:                        vdso,
+		RootUTSNamespace:            kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace),
+		RootIPCNamespace:            kernel.NewIPCNamespace(creds.UserNamespace),
+		RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+		PIDNamespace:                kernel.NewRootPIDNamespace(creds.UserNamespace),
+	}); err != nil {
+		return nil, fmt.Errorf("initializing kernel: %v", err)
+	}
+
+	ctx := k.SupervisorContext()
+
+	// Create mount namespace without root as it's the minimum required to create
+	// the global thread group.
+	mntns, err := fs.NewMountNamespace(ctx, nil)
+	if err != nil {
+		return nil, err
+	}
+	ls, err := limits.NewLinuxLimitSet()
+	if err != nil {
+		return nil, err
+	}
+	tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
+	k.TestOnly_SetGlobalInit(tg)
+
+	return k, nil
+}
+
+// createTask creates a new bare bones task for tests.
+func createTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) {
+	k := kernel.KernelFromContext(ctx)
+	config := &kernel.TaskConfig{
+		Kernel:                  k,
+		ThreadGroup:             tc,
+		TaskContext:             &kernel.TaskContext{Name: name},
+		Credentials:             auth.CredentialsFromContext(ctx),
+		AllowedCPUMask:          sched.NewFullCPUSet(k.ApplicationCores()),
+		UTSNamespace:            kernel.UTSNamespaceFromContext(ctx),
+		IPCNamespace:            kernel.IPCNamespaceFromContext(ctx),
+		AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+	}
+	return k.TaskSet().NewTask(config)
+}
+
+func createMemoryFile() (*pgalloc.MemoryFile, error) {
+	const memfileName = "test-memory"
+	memfd, err := memutil.CreateMemFD(memfileName, 0)
+	if err != nil {
+		return nil, fmt.Errorf("error creating memfd: %v", err)
+	}
+	memfile := os.NewFile(uintptr(memfd), memfileName)
+	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
+	if err != nil {
+		memfile.Close()
+		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
+	}
+	return mf, nil
+}
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
new file mode 100644
index 000000000..d09182c77
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -0,0 +1,69 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package proc implements a partial in-memory file system for procfs.
+package proc
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// procFSType is the factory class for procfs.
+//
+// +stateify savable
+type procFSType struct{}
+
+var _ vfs.FilesystemType = (*procFSType)(nil)
+
+// GetFilesystem implements vfs.FilesystemType.
+func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	k := kernel.KernelFromContext(ctx)
+	if k == nil {
+		return nil, nil, fmt.Errorf("procfs requires a kernel")
+	}
+	pidns := kernel.PIDNamespaceFromContext(ctx)
+	if pidns == nil {
+		return nil, nil, fmt.Errorf("procfs requires a PID namespace")
+	}
+
+	procfs := &kernfs.Filesystem{}
+	procfs.VFSFilesystem().Init(vfsObj, procfs)
+
+	_, dentry := newTasksInode(procfs, k, pidns)
+	return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
+}
+
+// dynamicInode is an overfitted interface for common Inodes with
+// dynamicByteSource types used in procfs.
+type dynamicInode interface {
+	kernfs.Inode
+	vfs.DynamicBytesSource
+
+	Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
+}
+
+func newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+	inode.Init(creds, ino, inode, perm)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
diff --git a/pkg/sentry/fsimpl/proc/filesystems.go b/pkg/sentry/fsimpl/proc/filesystems.go
deleted file mode 100644
index 0e016bca5..000000000
--- a/pkg/sentry/fsimpl/proc/filesystems.go
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-// filesystemsData implements vfs.DynamicBytesSource for /proc/filesystems.
-//
-// +stateify savable
-type filesystemsData struct{}
-
-// TODO(gvisor.dev/issue/1195): Implement vfs.DynamicBytesSource.Generate for
-// filesystemsData. We would need to retrive filesystem names from
-// vfs.VirtualFilesystem. Also needs vfs replacement for
-// fs.Filesystem.AllowUserList() and fs.FilesystemRequiresDev.
diff --git a/pkg/sentry/fsimpl/proc/loadavg.go b/pkg/sentry/fsimpl/proc/loadavg.go
index 9135afef1..5351d86e8 100644
--- a/pkg/sentry/fsimpl/proc/loadavg.go
+++ b/pkg/sentry/fsimpl/proc/loadavg.go
@@ -19,15 +19,17 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 )
 
 // loadavgData backs /proc/loadavg.
 //
 // +stateify savable
-type loadavgData struct{}
+type loadavgData struct {
+	kernfs.DynamicBytesFile
+}
 
-var _ vfs.DynamicBytesSource = (*loadavgData)(nil)
+var _ dynamicInode = (*loadavgData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
diff --git a/pkg/sentry/fsimpl/proc/meminfo.go b/pkg/sentry/fsimpl/proc/meminfo.go
index 9a827cd66..cbdd4f3fc 100644
--- a/pkg/sentry/fsimpl/proc/meminfo.go
+++ b/pkg/sentry/fsimpl/proc/meminfo.go
@@ -19,21 +19,23 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
 // meminfoData implements vfs.DynamicBytesSource for /proc/meminfo.
 //
 // +stateify savable
 type meminfoData struct {
+	kernfs.DynamicBytesFile
+
 	// k is the owning Kernel.
 	k *kernel.Kernel
 }
 
-var _ vfs.DynamicBytesSource = (*meminfoData)(nil)
+var _ dynamicInode = (*meminfoData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
diff --git a/pkg/sentry/fsimpl/proc/proc.go b/pkg/sentry/fsimpl/proc/proc.go
deleted file mode 100644
index 31dec36de..000000000
--- a/pkg/sentry/fsimpl/proc/proc.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package proc implements a partial in-memory file system for procfs.
-package proc
diff --git a/pkg/sentry/fsimpl/proc/stat.go b/pkg/sentry/fsimpl/proc/stat.go
index 720db3828..50894a534 100644
--- a/pkg/sentry/fsimpl/proc/stat.go
+++ b/pkg/sentry/fsimpl/proc/stat.go
@@ -20,8 +20,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
 // cpuStats contains the breakdown of CPU time for /proc/stat.
@@ -66,11 +66,13 @@ func (c cpuStats) String() string {
 //
 // +stateify savable
 type statData struct {
+	kernfs.DynamicBytesFile
+
 	// k is the owning Kernel.
 	k *kernel.Kernel
 }
 
-var _ vfs.DynamicBytesSource = (*statData)(nil)
+var _ dynamicInode = (*statData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 0d87be52b..11a64c777 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -15,251 +15,176 @@
 package proc
 
 import (
-	"bytes"
-	"fmt"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
-	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// mapsCommon is embedded by mapsData and smapsData.
-type mapsCommon struct {
-	t *kernel.Task
-}
-
-// mm gets the kernel task's MemoryManager. No additional reference is taken on
-// mm here. This is safe because MemoryManager.destroy is required to leave the
-// MemoryManager in a state where it's still usable as a DynamicBytesSource.
-func (md *mapsCommon) mm() *mm.MemoryManager {
-	var tmm *mm.MemoryManager
-	md.t.WithMuLocked(func(t *kernel.Task) {
-		if mm := t.MemoryManager(); mm != nil {
-			tmm = mm
-		}
-	})
-	return tmm
-}
-
-// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
+// taskInode represents the inode for /proc/PID/ directory.
 //
 // +stateify savable
-type mapsData struct {
-	mapsCommon
+type taskInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNoDynamicLookup
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
+
+	task *kernel.Task
 }
 
-var _ vfs.DynamicBytesSource = (*mapsData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (md *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	if mm := md.mm(); mm != nil {
-		mm.ReadMapsDataInto(ctx, buf)
+var _ kernfs.Inode = (*taskInode)(nil)
+
+func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool) *kernfs.Dentry {
+	contents := map[string]*kernfs.Dentry{
+		//"auxv":      newAuxvec(t, msrc),
+		//"cmdline":   newExecArgInode(t, msrc, cmdlineExecArg),
+		//"comm":      newComm(t, msrc),
+		//"environ":   newExecArgInode(t, msrc, environExecArg),
+		//"exe":       newExe(t, msrc),
+		//"fd":        newFdDir(t, msrc),
+		//"fdinfo":    newFdInfoDir(t, msrc),
+		//"gid_map":   newGIDMap(t, msrc),
+		"io":   newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, newIO(task, isThreadGroup)),
+		"maps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &mapsData{task: task}),
+		//"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
+		//"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
+		//"ns":        newNamespaceDir(t, msrc),
+		"smaps":  newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &smapsData{task: task}),
+		"stat":   newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}),
+		"statm":  newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statmData{t: task}),
+		"status": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statusData{t: task, pidns: pidns}),
+		//"uid_map":   newUIDMap(t, msrc),
 	}
-	return nil
-}
-
-// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps.
-//
-// +stateify savable
-type smapsData struct {
-	mapsCommon
-}
-
-var _ vfs.DynamicBytesSource = (*smapsData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (sd *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	if mm := sd.mm(); mm != nil {
-		mm.ReadSmapsDataInto(ctx, buf)
+	if isThreadGroup {
+		//contents["task"] = p.newSubtasks(t, msrc)
 	}
-	return nil
-}
-
-// +stateify savable
-type taskStatData struct {
-	t *kernel.Task
+	//if len(p.cgroupControllers) > 0 {
+	//	contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers)
+	//}
 
-	// If tgstats is true, accumulate fault stats (not implemented) and CPU
-	// time across all tasks in t's thread group.
-	tgstats bool
+	taskInode := &taskInode{task: task}
+	// Note: credentials are overridden by taskOwnedInode.
+	taskInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555)
 
-	// pidns is the PID namespace associated with the proc filesystem that
-	// includes the file using this statData.
-	pidns *kernel.PIDNamespace
-}
-
-var _ vfs.DynamicBytesSource = (*taskStatData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t))
-	fmt.Fprintf(buf, "(%s) ", s.t.Name())
-	fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0])
-	ppid := kernel.ThreadID(0)
-	if parent := s.t.Parent(); parent != nil {
-		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
-	}
-	fmt.Fprintf(buf, "%d ", ppid)
-	fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup()))
-	fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session()))
-	fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
-	fmt.Fprintf(buf, "0 " /* flags */)
-	fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
-	var cputime usage.CPUStats
-	if s.tgstats {
-		cputime = s.t.ThreadGroup().CPUStats()
-	} else {
-		cputime = s.t.CPUStats()
-	}
-	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
-	cputime = s.t.ThreadGroup().JoinedChildCPUStats()
-	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
-	fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness())
-	fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count())
+	inode := &taskOwnedInode{Inode: taskInode, owner: task}
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
 
-	// itrealvalue. Since kernel 2.6.17, this field is no longer
-	// maintained, and is hard coded as 0.
-	fmt.Fprintf(buf, "0 ")
+	taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	links := taskInode.OrderedChildren.Populate(dentry, contents)
+	taskInode.IncLinks(links)
 
-	// Start time is relative to boot time, expressed in clock ticks.
-	fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime())))
+	return dentry
+}
 
-	var vss, rss uint64
-	s.t.WithMuLocked(func(t *kernel.Task) {
-		if mm := t.MemoryManager(); mm != nil {
-			vss = mm.VirtualMemorySize()
-			rss = mm.ResidentSetSize()
-		}
-	})
-	fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize)
+// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long
+// as the task is still running. When it's dead, another tasks with the same
+// PID could replace it.
+func (i *taskInode) Valid(ctx context.Context) bool {
+	return i.task.ExitState() != kernel.TaskExitDead
+}
 
-	// rsslim.
-	fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur)
+// Open implements kernfs.Inode.
+func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	fd := &kernfs.GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+	return fd.VFSFileDescription(), nil
+}
 
-	fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
-	fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
-	fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
-	terminationSignal := linux.Signal(0)
-	if s.t == s.t.ThreadGroup().Leader() {
-		terminationSignal = s.t.ThreadGroup().TerminationSignal()
+// SetStat implements kernfs.Inode.
+func (i *taskInode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
+	stat := opts.Stat
+	if stat.Mask&linux.STATX_MODE != 0 {
+		return syserror.EPERM
 	}
-	fmt.Fprintf(buf, "%d ", terminationSignal)
-	fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
-	fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
-	fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
-	fmt.Fprintf(buf, "0\n" /* exit_code */)
-
 	return nil
 }
 
-// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm.
-//
-// +stateify savable
-type statmData struct {
-	t *kernel.Task
+// taskOwnedInode implements kernfs.Inode and overrides inode owner with task
+// effective user and group.
+type taskOwnedInode struct {
+	kernfs.Inode
+
+	// owner is the task that owns this inode.
+	owner *kernel.Task
 }
 
-var _ vfs.DynamicBytesSource = (*statmData)(nil)
+var _ kernfs.Inode = (*taskOwnedInode)(nil)
 
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	var vss, rss uint64
-	s.t.WithMuLocked(func(t *kernel.Task) {
-		if mm := t.MemoryManager(); mm != nil {
-			vss = mm.VirtualMemorySize()
-			rss = mm.ResidentSetSize()
-		}
-	})
+func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+	// Note: credentials are overridden by taskOwnedInode.
+	inode.Init(task.Credentials(), ino, inode, perm)
 
-	fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize)
-	return nil
+	taskInode := &taskOwnedInode{Inode: inode, owner: task}
+	d := &kernfs.Dentry{}
+	d.Init(taskInode)
+	return d
 }
 
-// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status.
-//
-// +stateify savable
-type statusData struct {
-	t     *kernel.Task
-	pidns *kernel.PIDNamespace
+// Stat implements kernfs.Inode.
+func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx {
+	stat := i.Inode.Stat(fs)
+	uid, gid := i.getOwner(linux.FileMode(stat.Mode))
+	stat.UID = uint32(uid)
+	stat.GID = uint32(gid)
+	return stat
 }
 
-var _ vfs.DynamicBytesSource = (*statusData)(nil)
+// CheckPermissions implements kernfs.Inode.
+func (i *taskOwnedInode) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	mode := i.Mode()
+	uid, gid := i.getOwner(mode)
+	return vfs.GenericCheckPermissions(
+		creds,
+		ats,
+		mode.FileType() == linux.ModeDirectory,
+		uint16(mode),
+		uid,
+		gid,
+	)
+}
 
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name())
-	fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus())
-	fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup()))
-	fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t))
-	ppid := kernel.ThreadID(0)
-	if parent := s.t.Parent(); parent != nil {
-		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) {
+	// By default, set the task owner as the file owner.
+	creds := i.owner.Credentials()
+	uid := creds.EffectiveKUID
+	gid := creds.EffectiveKGID
+
+	// Linux doesn't apply dumpability adjustments to world readable/executable
+	// directories so that applications can stat /proc/PID to determine the
+	// effective UID of a process. See fs/proc/base.c:task_dump_owner.
+	if mode.FileType() == linux.ModeDirectory && mode.Permissions() == 0555 {
+		return uid, gid
 	}
-	fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
-	tpid := kernel.ThreadID(0)
-	if tracer := s.t.Tracer(); tracer != nil {
-		tpid = s.pidns.IDOfTask(tracer)
+
+	// If the task is not dumpable, then root (in the namespace preferred)
+	// owns the file.
+	m := getMM(i.owner)
+	if m == nil {
+		return auth.RootKUID, auth.RootKGID
 	}
-	fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
-	var fds int
-	var vss, rss, data uint64
-	s.t.WithMuLocked(func(t *kernel.Task) {
-		if fdTable := t.FDTable(); fdTable != nil {
-			fds = fdTable.Size()
+	if m.Dumpability() != mm.UserDumpable {
+		uid = auth.RootKUID
+		if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() {
+			uid = kuid
 		}
-		if mm := t.MemoryManager(); mm != nil {
-			vss = mm.VirtualMemorySize()
-			rss = mm.ResidentSetSize()
-			data = mm.VirtualDataSize()
+		gid = auth.RootKGID
+		if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() {
+			gid = kgid
 		}
-	})
-	fmt.Fprintf(buf, "FDSize:\t%d\n", fds)
-	fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10)
-	fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
-	fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
-	fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count())
-	creds := s.t.Credentials()
-	fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
-	fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
-	fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
-	fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
-	fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode())
-	// We unconditionally report a single NUMA node. See
-	// pkg/sentry/syscalls/linux/sys_mempolicy.go.
-	fmt.Fprintf(buf, "Mems_allowed:\t1\n")
-	fmt.Fprintf(buf, "Mems_allowed_list:\t0\n")
-	return nil
-}
-
-// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider.
-type ioUsage interface {
-	// IOUsage returns the io usage data.
-	IOUsage() *usage.IO
-}
-
-// +stateify savable
-type ioData struct {
-	ioUsage
+	}
+	return uid, gid
 }
 
-var _ vfs.DynamicBytesSource = (*ioData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	io := usage.IO{}
-	io.Accumulate(i.IOUsage())
-
-	fmt.Fprintf(buf, "char: %d\n", io.CharsRead)
-	fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten)
-	fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls)
-	fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls)
-	fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead)
-	fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten)
-	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
-	return nil
+func newIO(t *kernel.Task, isThreadGroup bool) *ioData {
+	if isThreadGroup {
+		return &ioData{ioUsage: t.ThreadGroup()}
+	}
+	return &ioData{ioUsage: t}
 }
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
new file mode 100644
index 000000000..93f0e1aa8
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -0,0 +1,272 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// mm gets the kernel task's MemoryManager. No additional reference is taken on
+// mm here. This is safe because MemoryManager.destroy is required to leave the
+// MemoryManager in a state where it's still usable as a DynamicBytesSource.
+func getMM(task *kernel.Task) *mm.MemoryManager {
+	var tmm *mm.MemoryManager
+	task.WithMuLocked(func(t *kernel.Task) {
+		if mm := t.MemoryManager(); mm != nil {
+			tmm = mm
+		}
+	})
+	return tmm
+}
+
+// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
+//
+// +stateify savable
+type mapsData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*mapsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if mm := getMM(d.task); mm != nil {
+		mm.ReadMapsDataInto(ctx, buf)
+	}
+	return nil
+}
+
+// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps.
+//
+// +stateify savable
+type smapsData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*smapsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if mm := getMM(d.task); mm != nil {
+		mm.ReadSmapsDataInto(ctx, buf)
+	}
+	return nil
+}
+
+// +stateify savable
+type taskStatData struct {
+	kernfs.DynamicBytesFile
+
+	t *kernel.Task
+
+	// If tgstats is true, accumulate fault stats (not implemented) and CPU
+	// time across all tasks in t's thread group.
+	tgstats bool
+
+	// pidns is the PID namespace associated with the proc filesystem that
+	// includes the file using this statData.
+	pidns *kernel.PIDNamespace
+}
+
+var _ dynamicInode = (*taskStatData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t))
+	fmt.Fprintf(buf, "(%s) ", s.t.Name())
+	fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0])
+	ppid := kernel.ThreadID(0)
+	if parent := s.t.Parent(); parent != nil {
+		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+	}
+	fmt.Fprintf(buf, "%d ", ppid)
+	fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup()))
+	fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session()))
+	fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
+	fmt.Fprintf(buf, "0 " /* flags */)
+	fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
+	var cputime usage.CPUStats
+	if s.tgstats {
+		cputime = s.t.ThreadGroup().CPUStats()
+	} else {
+		cputime = s.t.CPUStats()
+	}
+	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
+	cputime = s.t.ThreadGroup().JoinedChildCPUStats()
+	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
+	fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness())
+	fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count())
+
+	// itrealvalue. Since kernel 2.6.17, this field is no longer
+	// maintained, and is hard coded as 0.
+	fmt.Fprintf(buf, "0 ")
+
+	// Start time is relative to boot time, expressed in clock ticks.
+	fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime())))
+
+	var vss, rss uint64
+	s.t.WithMuLocked(func(t *kernel.Task) {
+		if mm := t.MemoryManager(); mm != nil {
+			vss = mm.VirtualMemorySize()
+			rss = mm.ResidentSetSize()
+		}
+	})
+	fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize)
+
+	// rsslim.
+	fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur)
+
+	fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
+	fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
+	fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
+	terminationSignal := linux.Signal(0)
+	if s.t == s.t.ThreadGroup().Leader() {
+		terminationSignal = s.t.ThreadGroup().TerminationSignal()
+	}
+	fmt.Fprintf(buf, "%d ", terminationSignal)
+	fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
+	fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
+	fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
+	fmt.Fprintf(buf, "0\n" /* exit_code */)
+
+	return nil
+}
+
+// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm.
+//
+// +stateify savable
+type statmData struct {
+	kernfs.DynamicBytesFile
+
+	t *kernel.Task
+}
+
+var _ dynamicInode = (*statmData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	var vss, rss uint64
+	s.t.WithMuLocked(func(t *kernel.Task) {
+		if mm := t.MemoryManager(); mm != nil {
+			vss = mm.VirtualMemorySize()
+			rss = mm.ResidentSetSize()
+		}
+	})
+
+	fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize)
+	return nil
+}
+
+// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status.
+//
+// +stateify savable
+type statusData struct {
+	kernfs.DynamicBytesFile
+
+	t     *kernel.Task
+	pidns *kernel.PIDNamespace
+}
+
+var _ dynamicInode = (*statusData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name())
+	fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus())
+	fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup()))
+	fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t))
+	ppid := kernel.ThreadID(0)
+	if parent := s.t.Parent(); parent != nil {
+		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+	}
+	fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
+	tpid := kernel.ThreadID(0)
+	if tracer := s.t.Tracer(); tracer != nil {
+		tpid = s.pidns.IDOfTask(tracer)
+	}
+	fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
+	var fds int
+	var vss, rss, data uint64
+	s.t.WithMuLocked(func(t *kernel.Task) {
+		if fdTable := t.FDTable(); fdTable != nil {
+			fds = fdTable.Size()
+		}
+		if mm := t.MemoryManager(); mm != nil {
+			vss = mm.VirtualMemorySize()
+			rss = mm.ResidentSetSize()
+			data = mm.VirtualDataSize()
+		}
+	})
+	fmt.Fprintf(buf, "FDSize:\t%d\n", fds)
+	fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10)
+	fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
+	fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
+	fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count())
+	creds := s.t.Credentials()
+	fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
+	fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
+	fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
+	fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
+	fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode())
+	// We unconditionally report a single NUMA node. See
+	// pkg/sentry/syscalls/linux/sys_mempolicy.go.
+	fmt.Fprintf(buf, "Mems_allowed:\t1\n")
+	fmt.Fprintf(buf, "Mems_allowed_list:\t0\n")
+	return nil
+}
+
+// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider.
+type ioUsage interface {
+	// IOUsage returns the io usage data.
+	IOUsage() *usage.IO
+}
+
+// +stateify savable
+type ioData struct {
+	kernfs.DynamicBytesFile
+
+	ioUsage
+}
+
+var _ dynamicInode = (*ioData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	io := usage.IO{}
+	io.Accumulate(i.IOUsage())
+
+	fmt.Fprintf(buf, "char: %d\n", io.CharsRead)
+	fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten)
+	fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls)
+	fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls)
+	fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead)
+	fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten)
+	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
new file mode 100644
index 000000000..50b2a832f
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -0,0 +1,162 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"sort"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const defaultPermission = 0444
+
+// InoGenerator generates unique inode numbers for a given filesystem.
+type InoGenerator interface {
+	NextIno() uint64
+}
+
+// tasksInode represents the inode for /proc/ directory.
+//
+// +stateify savable
+type tasksInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
+
+	inoGen InoGenerator
+	pidns  *kernel.PIDNamespace
+}
+
+var _ kernfs.Inode = (*tasksInode)(nil)
+
+func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace) (*tasksInode, *kernfs.Dentry) {
+	root := auth.NewRootCredentials(pidns.UserNamespace())
+	contents := map[string]*kernfs.Dentry{
+		//"cpuinfo":     newCPUInfo(ctx, msrc),
+		//"filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc),
+		"loadavg":     newDentry(root, inoGen.NextIno(), defaultPermission, &loadavgData{}),
+		"meminfo":     newDentry(root, inoGen.NextIno(), defaultPermission, &meminfoData{k: k}),
+		"mounts":      kernfs.NewStaticSymlink(root, inoGen.NextIno(), defaultPermission, "self/mounts"),
+		"self":        newSelfSymlink(root, inoGen.NextIno(), defaultPermission, pidns),
+		"stat":        newDentry(root, inoGen.NextIno(), defaultPermission, &statData{k: k}),
+		"thread-self": newThreadSelfSymlink(root, inoGen.NextIno(), defaultPermission, pidns),
+		//"uptime":      newUptime(ctx, msrc),
+		//"version": newVersionData(root, inoGen.NextIno(), k),
+		"version": newDentry(root, inoGen.NextIno(), defaultPermission, &versionData{k: k}),
+	}
+
+	inode := &tasksInode{
+		pidns:  pidns,
+		inoGen: inoGen,
+	}
+	inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555)
+
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
+
+	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	links := inode.OrderedChildren.Populate(dentry, contents)
+	inode.IncLinks(links)
+
+	return inode, dentry
+}
+
+// Lookup implements kernfs.inodeDynamicLookup.
+func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	// Try to lookup a corresponding task.
+	tid, err := strconv.ParseUint(name, 10, 64)
+	if err != nil {
+		return nil, syserror.ENOENT
+	}
+
+	task := i.pidns.TaskWithID(kernel.ThreadID(tid))
+	if task == nil {
+		return nil, syserror.ENOENT
+	}
+
+	taskDentry := newTaskInode(i.inoGen, task, i.pidns, true)
+	return taskDentry.VFSDentry(), nil
+}
+
+// Valid implements kernfs.inodeDynamicLookup.
+func (i *tasksInode) Valid(ctx context.Context) bool {
+	return true
+}
+
+// IterDirents implements kernfs.inodeDynamicLookup.
+//
+// TODO(gvisor.dev/issue/1195): Use tgid N offset = TGID_OFFSET + N.
+func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	var tids []int
+
+	// Collect all tasks. Per linux we only include it in directory listings if
+	// it's the leader. But for whatever crazy reason, you can still walk to the
+	// given node.
+	for _, tg := range i.pidns.ThreadGroups() {
+		if leader := tg.Leader(); leader != nil {
+			tids = append(tids, int(i.pidns.IDOfThreadGroup(tg)))
+		}
+	}
+
+	if len(tids) == 0 {
+		return offset, nil
+	}
+	if relOffset >= int64(len(tids)) {
+		return offset, nil
+	}
+
+	sort.Ints(tids)
+	for _, tid := range tids[relOffset:] {
+		dirent := vfs.Dirent{
+			Name:    strconv.FormatUint(uint64(tid), 10),
+			Type:    linux.DT_DIR,
+			Ino:     i.inoGen.NextIno(),
+			NextOff: offset + 1,
+		}
+		if !cb.Handle(dirent) {
+			return offset, nil
+		}
+		offset++
+	}
+	return offset, nil
+}
+
+// Open implements kernfs.Inode.
+func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	fd := &kernfs.GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+	return fd.VFSFileDescription(), nil
+}
+
+func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx {
+	stat := i.InodeAttrs.Stat(vsfs)
+
+	// Add dynamic children to link count.
+	for _, tg := range i.pidns.ThreadGroups() {
+		if leader := tg.Leader(); leader != nil {
+			stat.Nlink++
+		}
+	}
+
+	return stat
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
new file mode 100644
index 000000000..91f30a798
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -0,0 +1,92 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"fmt"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+type selfSymlink struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	pidns *kernel.PIDNamespace
+}
+
+var _ kernfs.Inode = (*selfSymlink)(nil)
+
+func newSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry {
+	inode := &selfSymlink{pidns: pidns}
+	inode.Init(creds, ino, linux.ModeSymlink|perm)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// Who is reading this link?
+		return "", syserror.EINVAL
+	}
+	tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup())
+	if tgid == 0 {
+		return "", syserror.ENOENT
+	}
+	return strconv.FormatUint(uint64(tgid), 10), nil
+}
+
+type threadSelfSymlink struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	pidns *kernel.PIDNamespace
+}
+
+var _ kernfs.Inode = (*threadSelfSymlink)(nil)
+
+func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry {
+	inode := &threadSelfSymlink{pidns: pidns}
+	inode.Init(creds, ino, linux.ModeSymlink|perm)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// Who is reading this link?
+		return "", syserror.EINVAL
+	}
+	tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup())
+	tid := s.pidns.IDOfTask(t)
+	if tid == 0 || tgid == 0 {
+		return "", syserror.ENOENT
+	}
+	return fmt.Sprintf("%d/task/%d", tgid, tid), nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
new file mode 100644
index 000000000..48201d75a
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -0,0 +1,412 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"fmt"
+	"path"
+	"strconv"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+type testIterDirentsCallback struct {
+	dirents []vfs.Dirent
+}
+
+func (t *testIterDirentsCallback) Handle(d vfs.Dirent) bool {
+	t.dirents = append(t.dirents, d)
+	return true
+}
+
+func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) {
+	if got := len(dirs); got < 2 {
+		return dirs, fmt.Errorf("wrong number of dirents, want at least: 2, got: %d: %v", got, dirs)
+	}
+	for i, want := range []string{".", ".."} {
+		if got := dirs[i].Name; got != want {
+			return dirs, fmt.Errorf("wrong name, want: %s, got: %s", want, got)
+		}
+		if got := dirs[i].Type; got != linux.DT_DIR {
+			return dirs, fmt.Errorf("wrong type, want: %d, got: %d", linux.DT_DIR, got)
+		}
+	}
+	return dirs[2:], nil
+}
+
+func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
+	wants := map[string]vfs.Dirent{
+		"loadavg":     vfs.Dirent{Type: linux.DT_REG},
+		"meminfo":     vfs.Dirent{Type: linux.DT_REG},
+		"mounts":      vfs.Dirent{Type: linux.DT_LNK},
+		"self":        vfs.Dirent{Type: linux.DT_LNK},
+		"stat":        vfs.Dirent{Type: linux.DT_REG},
+		"thread-self": vfs.Dirent{Type: linux.DT_LNK},
+		"version":     vfs.Dirent{Type: linux.DT_REG},
+	}
+	return checkFiles(gots, wants)
+}
+
+func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
+	wants := map[string]vfs.Dirent{
+		"io":     vfs.Dirent{Type: linux.DT_REG},
+		"maps":   vfs.Dirent{Type: linux.DT_REG},
+		"smaps":  vfs.Dirent{Type: linux.DT_REG},
+		"stat":   vfs.Dirent{Type: linux.DT_REG},
+		"statm":  vfs.Dirent{Type: linux.DT_REG},
+		"status": vfs.Dirent{Type: linux.DT_REG},
+	}
+	return checkFiles(gots, wants)
+}
+
+func checkFiles(gots []vfs.Dirent, wants map[string]vfs.Dirent) ([]vfs.Dirent, error) {
+	// Go over all files, when there is a match, the file is removed from both
+	// 'gots' and 'wants'. wants is expected to reach 0, as all files must
+	// be present. Remaining files in 'gots', is returned to caller to decide
+	// whether this is valid or not.
+	for i := 0; i < len(gots); i++ {
+		got := gots[i]
+		want, ok := wants[got.Name]
+		if !ok {
+			continue
+		}
+		if want.Type != got.Type {
+			return gots, fmt.Errorf("wrong file type, want: %v, got: %v: %+v", want.Type, got.Type, got)
+		}
+
+		delete(wants, got.Name)
+		gots = append(gots[0:i], gots[i+1:]...)
+		i--
+	}
+	if len(wants) != 0 {
+		return gots, fmt.Errorf("not all files were found, missing: %+v", wants)
+	}
+	return gots, nil
+}
+
+func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) {
+	k, err := boot()
+	if err != nil {
+		return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("creating kernel: %v", err)
+	}
+
+	ctx := k.SupervisorContext()
+	creds := auth.CredentialsFromContext(ctx)
+
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType("procfs", &procFSType{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err)
+	}
+	return ctx, vfsObj, mntns.Root(), nil
+}
+
+func TestTasksEmpty(t *testing.T) {
+	ctx, vfsObj, root, err := setup()
+	if err != nil {
+		t.Fatalf("Setup failed: %v", err)
+	}
+	defer root.DecRef()
+
+	fd, err := vfsObj.OpenAt(
+		ctx,
+		auth.CredentialsFromContext(ctx),
+		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+		&vfs.OpenOptions{},
+	)
+	if err != nil {
+		t.Fatalf("vfsfs.OpenAt failed: %v", err)
+	}
+
+	cb := testIterDirentsCallback{}
+	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+		t.Fatalf("IterDirents(): %v", err)
+	}
+	cb.dirents, err = checkDots(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	cb.dirents, err = checkTasksStaticFiles(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	if len(cb.dirents) != 0 {
+		t.Error("found more files than expected: %+v", cb.dirents)
+	}
+}
+
+func TestTasks(t *testing.T) {
+	ctx, vfsObj, root, err := setup()
+	if err != nil {
+		t.Fatalf("Setup failed: %v", err)
+	}
+	defer root.DecRef()
+
+	k := kernel.KernelFromContext(ctx)
+	var tasks []*kernel.Task
+	for i := 0; i < 5; i++ {
+		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+		task, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc)
+		if err != nil {
+			t.Fatalf("CreateTask(): %v", err)
+		}
+		tasks = append(tasks, task)
+	}
+
+	fd, err := vfsObj.OpenAt(
+		ctx,
+		auth.CredentialsFromContext(ctx),
+		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+		&vfs.OpenOptions{},
+	)
+	if err != nil {
+		t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
+	}
+
+	cb := testIterDirentsCallback{}
+	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+		t.Fatalf("IterDirents(): %v", err)
+	}
+	cb.dirents, err = checkDots(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	cb.dirents, err = checkTasksStaticFiles(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	lastPid := 0
+	for _, d := range cb.dirents {
+		pid, err := strconv.Atoi(d.Name)
+		if err != nil {
+			t.Fatalf("Invalid process directory %q", d.Name)
+		}
+		if lastPid > pid {
+			t.Errorf("pids not in order: %v", cb.dirents)
+		}
+		found := false
+		for _, t := range tasks {
+			if k.TaskSet().Root.IDOfTask(t) == kernel.ThreadID(pid) {
+				found = true
+			}
+		}
+		if !found {
+			t.Errorf("Additional task ID %d listed: %v", pid, tasks)
+		}
+	}
+
+	// Test lookup.
+	for _, path := range []string{"/1", "/2"} {
+		fd, err := vfsObj.OpenAt(
+			ctx,
+			auth.CredentialsFromContext(ctx),
+			&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(path)},
+			&vfs.OpenOptions{},
+		)
+		if err != nil {
+			t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err)
+		}
+		buf := make([]byte, 1)
+		bufIOSeq := usermem.BytesIOSequence(buf)
+		if _, err := fd.Read(ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR {
+			t.Errorf("wrong error reading directory: %v", err)
+		}
+	}
+
+	if _, err := vfsObj.OpenAt(
+		ctx,
+		auth.CredentialsFromContext(ctx),
+		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/9999")},
+		&vfs.OpenOptions{},
+	); err != syserror.ENOENT {
+		t.Fatalf("wrong error from vfsfs.OpenAt(/9999): %v", err)
+	}
+}
+
+func TestTask(t *testing.T) {
+	ctx, vfsObj, root, err := setup()
+	if err != nil {
+		t.Fatalf("Setup failed: %v", err)
+	}
+	defer root.DecRef()
+
+	k := kernel.KernelFromContext(ctx)
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	_, err = createTask(ctx, "name", tc)
+	if err != nil {
+		t.Fatalf("CreateTask(): %v", err)
+	}
+
+	fd, err := vfsObj.OpenAt(
+		ctx,
+		auth.CredentialsFromContext(ctx),
+		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/1")},
+		&vfs.OpenOptions{},
+	)
+	if err != nil {
+		t.Fatalf("vfsfs.OpenAt(/1) failed: %v", err)
+	}
+
+	cb := testIterDirentsCallback{}
+	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+		t.Fatalf("IterDirents(): %v", err)
+	}
+	cb.dirents, err = checkDots(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	cb.dirents, err = checkTaskStaticFiles(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	if len(cb.dirents) != 0 {
+		t.Errorf("found more files than expected: %+v", cb.dirents)
+	}
+}
+
+func TestProcSelf(t *testing.T) {
+	ctx, vfsObj, root, err := setup()
+	if err != nil {
+		t.Fatalf("Setup failed: %v", err)
+	}
+	defer root.DecRef()
+
+	k := kernel.KernelFromContext(ctx)
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	task, err := createTask(ctx, "name", tc)
+	if err != nil {
+		t.Fatalf("CreateTask(): %v", err)
+	}
+
+	fd, err := vfsObj.OpenAt(
+		task,
+		auth.CredentialsFromContext(ctx),
+		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/self/"), FollowFinalSymlink: true},
+		&vfs.OpenOptions{},
+	)
+	if err != nil {
+		t.Fatalf("vfsfs.OpenAt(/self/) failed: %v", err)
+	}
+
+	cb := testIterDirentsCallback{}
+	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+		t.Fatalf("IterDirents(): %v", err)
+	}
+	cb.dirents, err = checkDots(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	cb.dirents, err = checkTaskStaticFiles(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	if len(cb.dirents) != 0 {
+		t.Errorf("found more files than expected: %+v", cb.dirents)
+	}
+}
+
+func iterateDir(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, fd *vfs.FileDescription) {
+	t.Logf("Iterating: /proc%s", fd.MappedName(ctx))
+
+	cb := testIterDirentsCallback{}
+	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+		t.Fatalf("IterDirents(): %v", err)
+	}
+	var err error
+	cb.dirents, err = checkDots(cb.dirents)
+	if err != nil {
+		t.Error(err.Error())
+	}
+	for _, d := range cb.dirents {
+		childPath := path.Join(fd.MappedName(ctx), d.Name)
+		if d.Type == linux.DT_LNK {
+			link, err := vfsObj.ReadlinkAt(
+				ctx,
+				auth.CredentialsFromContext(ctx),
+				&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)},
+			)
+			if err != nil {
+				t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", childPath, err)
+			} else {
+				t.Logf("Skipping symlink: /proc%s => %s", childPath, link)
+			}
+			continue
+		}
+
+		t.Logf("Opening: /proc%s", childPath)
+		child, err := vfsObj.OpenAt(
+			ctx,
+			auth.CredentialsFromContext(ctx),
+			&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)},
+			&vfs.OpenOptions{},
+		)
+		if err != nil {
+			t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err)
+			continue
+		}
+		stat, err := child.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			t.Errorf("Stat(%v) failed: %v", childPath, err)
+		}
+		if got := linux.FileMode(stat.Mode).DirentType(); got != d.Type {
+			t.Errorf("wrong file mode, stat: %v, dirent: %v", got, d.Type)
+		}
+		if d.Type == linux.DT_DIR {
+			// Found another dir, let's do it again!
+			iterateDir(ctx, t, vfsObj, root, child)
+		}
+	}
+}
+
+// TestTree iterates all directories and stats every file.
+func TestTree(t *testing.T) {
+	uberCtx, vfsObj, root, err := setup()
+	if err != nil {
+		t.Fatalf("Setup failed: %v", err)
+	}
+	defer root.DecRef()
+
+	k := kernel.KernelFromContext(uberCtx)
+	var tasks []*kernel.Task
+	for i := 0; i < 5; i++ {
+		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+		task, err := createTask(uberCtx, fmt.Sprintf("name-%d", i), tc)
+		if err != nil {
+			t.Fatalf("CreateTask(): %v", err)
+		}
+		tasks = append(tasks, task)
+	}
+
+	ctx := tasks[0]
+	fd, err := vfsObj.OpenAt(
+		ctx,
+		auth.CredentialsFromContext(uberCtx),
+		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+		&vfs.OpenOptions{},
+	)
+	if err != nil {
+		t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
+	}
+	iterateDir(ctx, t, vfsObj, root, fd)
+}
diff --git a/pkg/sentry/fsimpl/proc/version.go b/pkg/sentry/fsimpl/proc/version.go
index e1643d4e0..367f2396b 100644
--- a/pkg/sentry/fsimpl/proc/version.go
+++ b/pkg/sentry/fsimpl/proc/version.go
@@ -19,19 +19,21 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
 // versionData implements vfs.DynamicBytesSource for /proc/version.
 //
 // +stateify savable
 type versionData struct {
+	kernfs.DynamicBytesFile
+
 	// k is the owning Kernel.
 	k *kernel.Kernel
 }
 
-var _ vfs.DynamicBytesSource = (*versionData)(nil)
+var _ dynamicInode = (*versionData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index bd3fb4c03..8653d2f63 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -762,7 +762,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		mounts.IncRef()
 	}
 
-	tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
+	tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
 	ctx := args.NewContext(k)
 
 	// Get the root directory from the MountNamespace.
@@ -1191,6 +1191,11 @@ func (k *Kernel) GlobalInit() *ThreadGroup {
 	return k.globalInit
 }
 
+// TestOnly_SetGlobalInit sets the thread group with ID 1 in the root PID namespace.
+func (k *Kernel) TestOnly_SetGlobalInit(tg *ThreadGroup) {
+	k.globalInit = tg
+}
+
 // ApplicationCores returns the number of CPUs visible to sandboxed
 // applications.
 func (k *Kernel) ApplicationCores() uint {
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 3eadfedb4..5f3589493 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -243,7 +243,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		if opts.NewSignalHandlers {
 			sh = sh.Fork()
 		}
-		tg = t.k.newThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
+		tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy())
 	}
 
 	cfg := &TaskConfig{
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 72568d296..0cded73f6 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -256,20 +256,20 @@ type ThreadGroup struct {
 	tty *TTY
 }
 
-// newThreadGroup returns a new, empty thread group in PID namespace ns. The
+// NewThreadGroup returns a new, empty thread group in PID namespace ns. The
 // thread group leader will send its parent terminationSignal when it exits.
 // The new thread group isn't visible to the system until a task has been
 // created inside of it by a successful call to TaskSet.NewTask.
-func (k *Kernel) newThreadGroup(mounts *fs.MountNamespace, ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
+func (k *Kernel) NewThreadGroup(mntns *fs.MountNamespace, pidns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet) *ThreadGroup {
 	tg := &ThreadGroup{
 		threadGroupNode: threadGroupNode{
-			pidns: ns,
+			pidns: pidns,
 		},
 		signalHandlers:    sh,
 		terminationSignal: terminationSignal,
 		ioUsage:           &usage.IO{},
 		limits:            limits,
-		mounts:            mounts,
+		mounts:            mntns,
 	}
 	tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg})
 	tg.timers = make(map[linux.TimerID]*IntervalTimer)
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 3df49991c..de782e577 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -199,6 +199,17 @@ type DynamicBytesSource interface {
 	Generate(ctx context.Context, buf *bytes.Buffer) error
 }
 
+// StaticData implements DynamicBytesSource over a static string.
+type StaticData struct {
+	Data string
+}
+
+// Generate implements DynamicBytesSource.
+func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString(s.Data)
+	return nil
+}
+
 // SetDataSource must be called exactly once on fd before first use.
 func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
 	fd.data = data
-- 
cgit v1.2.3


From 796f53c0befc21570b185811e26b74e71950dfc3 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 27 Dec 2019 00:12:14 -0800
Subject: Add VFS2 support for /proc/filesystems.

Updates #1195

PiperOrigin-RevId: 287269106
---
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go |  4 +-
 pkg/sentry/fsimpl/ext/ext_test.go                 |  4 +-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go           |  4 +-
 pkg/sentry/fsimpl/memfs/benchmark_test.go         |  8 +++-
 pkg/sentry/fsimpl/memfs/pipe_test.go              |  4 +-
 pkg/sentry/fsimpl/proc/tasks_test.go              | 30 +++++++------
 pkg/sentry/vfs/file_description_impl_util_test.go |  2 +-
 pkg/sentry/vfs/filesystem_type.go                 | 55 ++++++++++++++++++++---
 pkg/sentry/vfs/mount.go                           | 15 ++++---
 pkg/sentry/vfs/options.go                         |  4 ++
 pkg/sentry/vfs/vfs.go                             | 12 ++---
 11 files changed, 103 insertions(+), 39 deletions(-)

diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 2f46d2d13..a56b03711 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -50,7 +50,9 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
 
 	// Create VFS.
 	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{})
+	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
 	if err != nil {
 		f.Close()
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 5d6c999bd..6c14a1e2d 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -66,7 +66,9 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
 
 	// Create VFS.
 	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{})
+	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
 	if err != nil {
 		f.Close()
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 3db12caa0..4b6b95f5f 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -59,7 +59,9 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem {
 	ctx := contexttest.Context(t)
 	creds := auth.CredentialsFromContext(ctx)
 	v := vfs.New()
-	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn})
+	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
 	mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create testfs root mount: %v", err)
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
index 6e987af88..a27876a4e 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -176,7 +176,9 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 
 			// Create VFS.
 			vfsObj := vfs.New()
-			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
+			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+				AllowUserMount: true,
+			})
 			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
@@ -365,7 +367,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 
 			// Create VFS.
 			vfsObj := vfs.New()
-			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
+			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+				AllowUserMount: true,
+			})
 			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go
index be917aeee..807c1af7a 100644
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/memfs/pipe_test.go
@@ -152,7 +152,9 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 
 	// Create VFS.
 	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{})
+	vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 48201d75a..2560fcef9 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -56,25 +56,25 @@ func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) {
 
 func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
 	wants := map[string]vfs.Dirent{
-		"loadavg":     vfs.Dirent{Type: linux.DT_REG},
-		"meminfo":     vfs.Dirent{Type: linux.DT_REG},
-		"mounts":      vfs.Dirent{Type: linux.DT_LNK},
-		"self":        vfs.Dirent{Type: linux.DT_LNK},
-		"stat":        vfs.Dirent{Type: linux.DT_REG},
-		"thread-self": vfs.Dirent{Type: linux.DT_LNK},
-		"version":     vfs.Dirent{Type: linux.DT_REG},
+		"loadavg":     {Type: linux.DT_REG},
+		"meminfo":     {Type: linux.DT_REG},
+		"mounts":      {Type: linux.DT_LNK},
+		"self":        {Type: linux.DT_LNK},
+		"stat":        {Type: linux.DT_REG},
+		"thread-self": {Type: linux.DT_LNK},
+		"version":     {Type: linux.DT_REG},
 	}
 	return checkFiles(gots, wants)
 }
 
 func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
 	wants := map[string]vfs.Dirent{
-		"io":     vfs.Dirent{Type: linux.DT_REG},
-		"maps":   vfs.Dirent{Type: linux.DT_REG},
-		"smaps":  vfs.Dirent{Type: linux.DT_REG},
-		"stat":   vfs.Dirent{Type: linux.DT_REG},
-		"statm":  vfs.Dirent{Type: linux.DT_REG},
-		"status": vfs.Dirent{Type: linux.DT_REG},
+		"io":     {Type: linux.DT_REG},
+		"maps":   {Type: linux.DT_REG},
+		"smaps":  {Type: linux.DT_REG},
+		"stat":   {Type: linux.DT_REG},
+		"statm":  {Type: linux.DT_REG},
+		"status": {Type: linux.DT_REG},
 	}
 	return checkFiles(gots, wants)
 }
@@ -114,7 +114,9 @@ func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error)
 	creds := auth.CredentialsFromContext(ctx)
 
 	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("procfs", &procFSType{})
+	vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &vfs.GetFilesystemOptions{})
 	if err != nil {
 		return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err)
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 678be07fe..9ed58512f 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -89,7 +89,7 @@ func TestGenCountFD(t *testing.T) {
 	creds := auth.CredentialsFromContext(ctx)
 
 	vfsObj := New() // vfs.New()
-	vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{})
+	vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{}, &RegisterFilesystemTypeOptions{})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("failed to create testfs root mount: %v", err)
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index c335e206d..023301780 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -15,6 +15,7 @@
 package vfs
 
 import (
+	"bytes"
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -43,28 +44,70 @@ type GetFilesystemOptions struct {
 	InternalData interface{}
 }
 
+type registeredFilesystemType struct {
+	fsType FilesystemType
+	opts   RegisterFilesystemTypeOptions
+}
+
+// RegisterFilesystemTypeOptions contains options to
+// VirtualFilesystem.RegisterFilesystem().
+type RegisterFilesystemTypeOptions struct {
+	// If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt()
+	// for which MountOptions.InternalMount == false to use this filesystem
+	// type.
+	AllowUserMount bool
+
+	// If AllowUserList is true, make this filesystem type visible in
+	// /proc/filesystems.
+	AllowUserList bool
+
+	// If RequiresDevice is true, indicate that mounting this filesystem
+	// requires a block device as the mount source in /proc/filesystems.
+	RequiresDevice bool
+}
+
 // RegisterFilesystemType registers the given FilesystemType in vfs with the
 // given name.
-func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error {
+func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) error {
 	vfs.fsTypesMu.Lock()
 	defer vfs.fsTypesMu.Unlock()
 	if existing, ok := vfs.fsTypes[name]; ok {
-		return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing)
+		return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing.fsType)
+	}
+	vfs.fsTypes[name] = &registeredFilesystemType{
+		fsType: fsType,
+		opts:   *opts,
 	}
-	vfs.fsTypes[name] = fsType
 	return nil
 }
 
 // MustRegisterFilesystemType is equivalent to RegisterFilesystemType but
 // panics on failure.
-func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) {
-	if err := vfs.RegisterFilesystemType(name, fsType); err != nil {
+func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) {
+	if err := vfs.RegisterFilesystemType(name, fsType, opts); err != nil {
 		panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err))
 	}
 }
 
-func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType {
+func (vfs *VirtualFilesystem) getFilesystemType(name string) *registeredFilesystemType {
 	vfs.fsTypesMu.RLock()
 	defer vfs.fsTypesMu.RUnlock()
 	return vfs.fsTypes[name]
 }
+
+// GenerateProcFilesystems emits the contents of /proc/filesystems for vfs to
+// buf.
+func (vfs *VirtualFilesystem) GenerateProcFilesystems(buf *bytes.Buffer) {
+	vfs.fsTypesMu.RLock()
+	defer vfs.fsTypesMu.RUnlock()
+	for name, rft := range vfs.fsTypes {
+		if !rft.opts.AllowUserList {
+			continue
+		}
+		var nodev string
+		if !rft.opts.RequiresDevice {
+			nodev = "nodev"
+		}
+		fmt.Fprintf(buf, "%s\t%s\n", nodev, name)
+	}
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index ec23ab0dd..00177b371 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -112,11 +112,11 @@ type MountNamespace struct {
 // configured by the given arguments. A reference is taken on the returned
 // MountNamespace.
 func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
-	fsType := vfs.getFilesystemType(fsTypeName)
-	if fsType == nil {
+	rft := vfs.getFilesystemType(fsTypeName)
+	if rft == nil {
 		return nil, syserror.ENODEV
 	}
-	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
 	if err != nil {
 		return nil, err
 	}
@@ -136,11 +136,14 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 
 // MountAt creates and mounts a Filesystem configured by the given arguments.
 func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
-	fsType := vfs.getFilesystemType(fsTypeName)
-	if fsType == nil {
+	rft := vfs.getFilesystemType(fsTypeName)
+	if rft == nil {
 		return syserror.ENODEV
 	}
-	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
+	if !opts.InternalMount && !rft.opts.AllowUserMount {
+		return syserror.ENODEV
+	}
+	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 87d2b0d1c..b7774bf28 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -50,6 +50,10 @@ type MknodOptions struct {
 type MountOptions struct {
 	// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
 	GetFilesystemOptions GetFilesystemOptions
+
+	// If InternalMount is true, allow the use of filesystem types for which
+	// RegisterFilesystemTypeOptions.AllowUserMount == false.
+	InternalMount bool
 }
 
 // OpenOptions contains options to VirtualFilesystem.OpenAt() and
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 3e4df8558..a3bdb5805 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -75,23 +75,23 @@ type VirtualFilesystem struct {
 	// mountpoints is analogous to Linux's mountpoint_hashtable.
 	mountpoints map[*Dentry]map[*Mount]struct{}
 
+	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
+	// fsTypesMu.
+	fsTypesMu sync.RWMutex
+	fsTypes   map[string]*registeredFilesystemType
+
 	// filesystems contains all Filesystems. filesystems is protected by
 	// filesystemsMu.
 	filesystemsMu sync.Mutex
 	filesystems   map[*Filesystem]struct{}
-
-	// fsTypes contains all FilesystemTypes that are usable in the
-	// VirtualFilesystem. fsTypes is protected by fsTypesMu.
-	fsTypesMu sync.RWMutex
-	fsTypes   map[string]FilesystemType
 }
 
 // New returns a new VirtualFilesystem with no mounts or FilesystemTypes.
 func New() *VirtualFilesystem {
 	vfs := &VirtualFilesystem{
 		mountpoints: make(map[*Dentry]map[*Mount]struct{}),
+		fsTypes:     make(map[string]*registeredFilesystemType),
 		filesystems: make(map[*Filesystem]struct{}),
-		fsTypes:     make(map[string]FilesystemType),
 	}
 	vfs.mounts.Init()
 	return vfs
-- 
cgit v1.2.3


From 03e53745cc04f674d4795fcafcca755c836e526f Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Tue, 24 Dec 2019 10:29:36 +0800
Subject: Add test/util/save_util_linux.cc:MaybeSave to support arm64

There is no syscall_create_module on Arm64.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 test/util/save_util_linux.cc | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/test/util/save_util_linux.cc b/test/util/save_util_linux.cc
index 7a0f14342..cd56118c0 100644
--- a/test/util/save_util_linux.cc
+++ b/test/util/save_util_linux.cc
@@ -18,13 +18,25 @@
 
 #include "test/util/save_util.h"
 
+#if defined(__x86_64__) || defined(__i386__)
+#define SYS_TRIGGER_SAVE SYS_create_module
+#elif defined(__aarch64__)
+#define SYS_TRIGGER_SAVE SYS_finit_module
+#else
+#error "Unknown architecture"
+#endif
+
 namespace gvisor {
 namespace testing {
 
 void MaybeSave() {
   if (internal::ShouldSave()) {
     int orig_errno = errno;
-    syscall(SYS_create_module, nullptr, 0);
+    // We use it to trigger saving the sentry state
+    // when this syscall is called.
+    // Notice: this needs to be a valid syscall
+    // that is not used in any of the syscall tests.
+    syscall(SYS_TRIGGER_SAVE, nullptr, 0);
     errno = orig_errno;
   }
 }
-- 
cgit v1.2.3


From 1f384ac42b9ee8b52000dc2bff79d975853519ed Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 30 Dec 2019 11:35:06 -0800
Subject: Add VFS2 support for device special files.

- Add FileDescriptionOptions.UseDentryMetadata, which reduces the amount of
  boilerplate needed for device FDs and the like between filesystems.

- Switch back to having FileDescription.Init() take references on the Mount and
  Dentry; otherwise managing refcounts around failed calls to
  OpenDeviceSpecialFile() / Device.Open() is tricky.

PiperOrigin-RevId: 287575574
---
 pkg/sentry/fsimpl/ext/inode.go                 |   6 --
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go |   2 -
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go       |   2 -
 pkg/sentry/fsimpl/memfs/filesystem.go          |   4 -
 pkg/sentry/fsimpl/memfs/named_pipe.go          |   2 -
 pkg/sentry/vfs/BUILD                           |   1 +
 pkg/sentry/vfs/device.go                       | 100 ++++++++++++++++++++++++
 pkg/sentry/vfs/file_description.go             | 101 +++++++++++++++++++++++--
 pkg/sentry/vfs/file_description_impl_util.go   |  15 ++++
 pkg/sentry/vfs/filesystem.go                   |  21 +++++
 pkg/sentry/vfs/vfs.go                          |   6 ++
 11 files changed, 236 insertions(+), 24 deletions(-)
 create mode 100644 pkg/sentry/vfs/device.go

diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index b2cc826c7..8608805bf 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -157,8 +157,6 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		mnt.IncRef()
-		vfsd.IncRef()
 		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *directory:
@@ -168,8 +166,6 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		mnt.IncRef()
-		vfsd.IncRef()
 		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *symlink:
@@ -178,8 +174,6 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.ELOOP
 		}
 		var fd symlinkFD
-		mnt.IncRef()
-		vfsd.IncRef()
 		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	default:
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index c5fe65722..606ca692d 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -81,8 +81,6 @@ type DynamicBytesFD struct {
 
 // Init initializes a DynamicBytesFD.
 func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) {
-	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
-	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
 	fd.inode = d.Impl().(*Dentry).inode
 	fd.SetDataSource(data)
 	fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 77975583b..bcf069b5f 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -44,8 +44,6 @@ type GenericDirectoryFD struct {
 
 // Init initializes a GenericDirectoryFD.
 func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) {
-	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
-	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
 	fd.children = children
 	fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
 }
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index 4a83f310c..b063e09a3 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -348,8 +348,6 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32,
 			}
 			// mnt.EndWrite() is called by regularFileFD.Release().
 		}
-		mnt.IncRef()
-		d.IncRef()
 		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
 		if flags&linux.O_TRUNC != 0 {
 			impl.mu.Lock()
@@ -364,8 +362,6 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32,
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		mnt.IncRef()
-		d.IncRef()
 		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *symlink:
diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go
index d5060850e..b5a204438 100644
--- a/pkg/sentry/fsimpl/memfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/memfs/named_pipe.go
@@ -55,8 +55,6 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v
 		return nil, err
 	}
 	mnt := rp.Mount()
-	mnt.IncRef()
-	vfsd.IncRef()
 	fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 	return &fd.vfsfd, nil
 }
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index e3e554b88..4c6aa04a1 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -9,6 +9,7 @@ go_library(
         "context.go",
         "debug.go",
         "dentry.go",
+        "device.go",
         "file_description.go",
         "file_description_impl_util.go",
         "filesystem.go",
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
new file mode 100644
index 000000000..cb672e36f
--- /dev/null
+++ b/pkg/sentry/vfs/device.go
@@ -0,0 +1,100 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// DeviceKind indicates whether a device is a block or character device.
+type DeviceKind uint32
+
+const (
+	// BlockDevice indicates a block device.
+	BlockDevice DeviceKind = iota
+
+	// CharDevice indicates a character device.
+	CharDevice
+)
+
+// String implements fmt.Stringer.String.
+func (kind DeviceKind) String() string {
+	switch kind {
+	case BlockDevice:
+		return "block"
+	case CharDevice:
+		return "character"
+	default:
+		return fmt.Sprintf("invalid device kind %d", kind)
+	}
+}
+
+type devTuple struct {
+	kind  DeviceKind
+	major uint32
+	minor uint32
+}
+
+// A Device backs device special files.
+type Device interface {
+	// Open returns a FileDescription representing this device.
+	Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error)
+}
+
+type registeredDevice struct {
+	dev  Device
+	opts RegisterDeviceOptions
+}
+
+// RegisterDeviceOptions contains options to
+// VirtualFilesystem.RegisterDevice().
+type RegisterDeviceOptions struct {
+	// GroupName is the name shown for this device registration in
+	// /proc/devices. If GroupName is empty, this registration will not be
+	// shown in /proc/devices.
+	GroupName string
+}
+
+// RegisterDevice registers the given Device in vfs with the given major and
+// minor device numbers.
+func (vfs *VirtualFilesystem) RegisterDevice(kind DeviceKind, major, minor uint32, dev Device, opts *RegisterDeviceOptions) error {
+	tup := devTuple{kind, major, minor}
+	vfs.devicesMu.Lock()
+	defer vfs.devicesMu.Unlock()
+	if existing, ok := vfs.devices[tup]; ok {
+		return fmt.Errorf("%s device number (%d, %d) is already registered to device type %T", kind, major, minor, existing.dev)
+	}
+	vfs.devices[tup] = &registeredDevice{
+		dev:  dev,
+		opts: *opts,
+	}
+	return nil
+}
+
+// OpenDeviceSpecialFile returns a FileDescription representing the given
+// device.
+func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mount, d *Dentry, kind DeviceKind, major, minor uint32, opts *OpenOptions) (*FileDescription, error) {
+	tup := devTuple{kind, major, minor}
+	vfs.devicesMu.RLock()
+	defer vfs.devicesMu.RUnlock()
+	rd, ok := vfs.devices[tup]
+	if !ok {
+		return nil, syserror.ENXIO
+	}
+	return rd.dev.Open(ctx, mnt, d, *opts)
+}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 0b053201a..6afe280bc 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -61,11 +61,25 @@ type FileDescriptionOptions struct {
 	// If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is
 	// usually only the case if O_DIRECT would actually have an effect.
 	AllowDirectIO bool
+
+	// If UseDentryMetadata is true, calls to FileDescription methods that
+	// interact with file and filesystem metadata (Stat, SetStat, StatFS,
+	// Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
+	// the corresponding FilesystemImpl methods instead of the corresponding
+	// FileDescriptionImpl methods.
+	//
+	// UseDentryMetadata is intended for file descriptions that are implemented
+	// outside of individual filesystems, such as pipes, sockets, and device
+	// special files. FileDescriptions for which UseDentryMetadata is true may
+	// embed DentryMetadataFileDescriptionImpl to obtain appropriate
+	// implementations of FileDescriptionImpl methods that should not be
+	// called.
+	UseDentryMetadata bool
 }
 
-// Init must be called before first use of fd. It takes ownership of references
-// on mnt and d held by the caller. statusFlags is the initial file description
-// status flags, which is usually the full set of flags passed to open(2).
+// Init must be called before first use of fd. It takes references on mnt and
+// d. statusFlags is the initial file description status flags, which is
+// usually the full set of flags passed to open(2).
 func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) {
 	fd.refs = 1
 	fd.statusFlags = statusFlags | linux.O_LARGEFILE
@@ -73,6 +87,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn
 		mount:  mnt,
 		dentry: d,
 	}
+	fd.vd.IncRef()
 	fd.opts = *opts
 	fd.impl = impl
 }
@@ -140,7 +155,7 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede
 	// sense. However, the check as actually implemented seems to be "O_APPEND
 	// cannot be changed if the file is marked as append-only".
 	if (flags^oldFlags)&linux.O_APPEND != 0 {
-		stat, err := fd.impl.Stat(ctx, StatOptions{
+		stat, err := fd.Stat(ctx, StatOptions{
 			// There is no mask bit for stx_attributes.
 			Mask: 0,
 			// Linux just reads inode::i_flags directly.
@@ -154,7 +169,7 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede
 		}
 	}
 	if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) {
-		stat, err := fd.impl.Stat(ctx, StatOptions{
+		stat, err := fd.Stat(ctx, StatOptions{
 			Mask: linux.STATX_UID,
 			// Linux's inode_owner_or_capable() just reads inode::i_uid
 			// directly.
@@ -348,17 +363,47 @@ func (fd *FileDescription) OnClose(ctx context.Context) error {
 
 // Stat returns metadata for the file represented by fd.
 func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
+		vfsObj.putResolvingPath(rp)
+		return stat, err
+	}
 	return fd.impl.Stat(ctx, opts)
 }
 
 // SetStat updates metadata for the file represented by fd.
 func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
+		vfsObj.putResolvingPath(rp)
+		return err
+	}
 	return fd.impl.SetStat(ctx, opts)
 }
 
 // StatFS returns metadata for the filesystem containing the file represented
 // by fd.
 func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
+		vfsObj.putResolvingPath(rp)
+		return statfs, err
+	}
 	return fd.impl.StatFS(ctx)
 }
 
@@ -417,6 +462,16 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
 // Listxattr returns all extended attribute names for the file represented by
 // fd.
 func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp)
+		vfsObj.putResolvingPath(rp)
+		return names, err
+	}
 	names, err := fd.impl.Listxattr(ctx)
 	if err == syserror.ENOTSUP {
 		// Linux doesn't actually return ENOTSUP in this case; instead,
@@ -431,18 +486,48 @@ func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
 // Getxattr returns the value associated with the given extended attribute for
 // the file represented by fd.
 func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name)
+		vfsObj.putResolvingPath(rp)
+		return val, err
+	}
 	return fd.impl.Getxattr(ctx, name)
 }
 
 // Setxattr changes the value associated with the given extended attribute for
 // the file represented by fd.
 func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts)
+		vfsObj.putResolvingPath(rp)
+		return err
+	}
 	return fd.impl.Setxattr(ctx, opts)
 }
 
 // Removexattr removes the given extended attribute from the file represented
 // by fd.
 func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+		vfsObj.putResolvingPath(rp)
+		return err
+	}
 	return fd.impl.Removexattr(ctx, name)
 }
 
@@ -464,7 +549,7 @@ func (fd *FileDescription) MappedName(ctx context.Context) string {
 
 // DeviceID implements memmap.MappingIdentity.DeviceID.
 func (fd *FileDescription) DeviceID() uint64 {
-	stat, err := fd.impl.Stat(context.Background(), StatOptions{
+	stat, err := fd.Stat(context.Background(), StatOptions{
 		// There is no STATX_DEV; we assume that Stat will return it if it's
 		// available regardless of mask.
 		Mask: 0,
@@ -480,7 +565,7 @@ func (fd *FileDescription) DeviceID() uint64 {
 
 // InodeID implements memmap.MappingIdentity.InodeID.
 func (fd *FileDescription) InodeID() uint64 {
-	stat, err := fd.impl.Stat(context.Background(), StatOptions{
+	stat, err := fd.Stat(context.Background(), StatOptions{
 		Mask: linux.STATX_INO,
 		// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly.
 		Sync: linux.AT_STATX_DONT_SYNC,
@@ -493,5 +578,5 @@ func (fd *FileDescription) InodeID() uint64 {
 
 // Msync implements memmap.MappingIdentity.Msync.
 func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
-	return fd.impl.Sync(ctx)
+	return fd.Sync(ctx)
 }
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index de782e577..66eb57bc2 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -177,6 +177,21 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme
 	return 0, syserror.EISDIR
 }
 
+// DentryMetadataFileDescriptionImpl may be embedded by implementations of
+// FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is
+// true to obtain implementations of Stat and SetStat that panic.
+type DentryMetadataFileDescriptionImpl struct{}
+
+// Stat implements FileDescriptionImpl.Stat.
+func (DentryMetadataFileDescriptionImpl) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+	panic("illegal call to DentryMetadataFileDescriptionImpl.Stat")
+}
+
+// SetStat implements FileDescriptionImpl.SetStat.
+func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetStatOptions) error {
+	panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat")
+}
+
 // DynamicBytesFileDescriptionImpl may be embedded by implementations of
 // FileDescriptionImpl that represent read-only regular files whose contents
 // are backed by a bytes.Buffer that is regenerated when necessary, consistent
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 89bd58864..ea78f555b 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -418,17 +418,38 @@ type FilesystemImpl interface {
 	UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 
 	// ListxattrAt returns all extended attribute names for the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem,
+	// ListxattrAt returns nil. (See FileDescription.Listxattr for an
+	// explanation.)
 	ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error)
 
 	// GetxattrAt returns the value associated with the given extended
 	// attribute for the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem, GetxattrAt
+	// returns ENOTSUP.
 	GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error)
 
 	// SetxattrAt changes the value associated with the given extended
 	// attribute for the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem, SetxattrAt
+	// returns ENOTSUP.
 	SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
 
 	// RemovexattrAt removes the given extended attribute from the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem,
+	// RemovexattrAt returns ENOTSUP.
 	RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
 
 	// PrependPath prepends a path from vd to vd.Mount().Root() to b.
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index a3bdb5805..ea2db7031 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -75,6 +75,11 @@ type VirtualFilesystem struct {
 	// mountpoints is analogous to Linux's mountpoint_hashtable.
 	mountpoints map[*Dentry]map[*Mount]struct{}
 
+	// devices contains all registered Devices. devices is protected by
+	// devicesMu.
+	devicesMu sync.RWMutex
+	devices   map[devTuple]*registeredDevice
+
 	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
 	// fsTypesMu.
 	fsTypesMu sync.RWMutex
@@ -90,6 +95,7 @@ type VirtualFilesystem struct {
 func New() *VirtualFilesystem {
 	vfs := &VirtualFilesystem{
 		mountpoints: make(map[*Dentry]map[*Mount]struct{}),
+		devices:     make(map[devTuple]*registeredDevice),
 		fsTypes:     make(map[string]*registeredFilesystemType),
 		filesystems: make(map[*Filesystem]struct{}),
 	}
-- 
cgit v1.2.3


From 200cf245c4ed43d8e2a37484cdbc36f5fbfa1ac9 Mon Sep 17 00:00:00 2001
From: Marek Majkowski <marek@cloudflare.com>
Date: Tue, 31 Dec 2019 16:49:30 +0100
Subject: netstack: minor fix typo in "if err" handler

---
 pkg/tcpip/sample/tun_tcp_connect/main.go | 2 +-
 pkg/tcpip/sample/tun_tcp_echo/main.go    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go
index 2239c1e66..0ab089208 100644
--- a/pkg/tcpip/sample/tun_tcp_connect/main.go
+++ b/pkg/tcpip/sample/tun_tcp_connect/main.go
@@ -164,7 +164,7 @@ func main() {
 	// Create TCP endpoint.
 	var wq waiter.Queue
 	ep, e := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
-	if err != nil {
+	if e != nil {
 		log.Fatal(e)
 	}
 
diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go
index bca73cbb1..9e37cab18 100644
--- a/pkg/tcpip/sample/tun_tcp_echo/main.go
+++ b/pkg/tcpip/sample/tun_tcp_echo/main.go
@@ -168,7 +168,7 @@ func main() {
 	// Create TCP endpoint, bind it, then start listening.
 	var wq waiter.Queue
 	ep, e := s.NewEndpoint(tcp.ProtocolNumber, proto, &wq)
-	if err != nil {
+	if e != nil {
 		log.Fatal(e)
 	}
 
-- 
cgit v1.2.3


From d1d878a801e066d6a54838ac3b2cdb43d65743e1 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 3 Jan 2020 12:58:40 -0800
Subject: Support generating opaque interface identifiers as defined by RFC
 7217

Support generating opaque interface identifiers as defined by RFC 7217 for
auto-generated IPv6 link-local addresses. Opaque interface identifiers will also
be used for IPv6 addresses auto-generated via SLAAC in a later change.

Note, this change does not handle retries in response to DAD conflicts yet.
That will also come in a later change.

Tests: Test that when configured to generated opaque IIDs, they are properly
generated as outlined by RFC 7217.
PiperOrigin-RevId: 288035349
---
 pkg/tcpip/header/BUILD        |   1 +
 pkg/tcpip/header/ipv6.go      |  45 ++++++++++++
 pkg/tcpip/header/ipv6_test.go | 163 ++++++++++++++++++++++++++++++++++++++++++
 pkg/tcpip/stack/BUILD         |   1 +
 pkg/tcpip/stack/nic.go        |  26 ++++---
 pkg/tcpip/stack/stack.go      |  36 ++++++++++
 pkg/tcpip/stack/stack_test.go | 127 ++++++++++++++++++++++++++++++--
 7 files changed, 384 insertions(+), 15 deletions(-)

diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index f1d837196..f2061c778 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -44,6 +44,7 @@ go_test(
     ],
     deps = [
         ":header",
+        "//pkg/rand",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "@com_github_google_go-cmp//cmp:go_default_library",
diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index fc671e439..135a60b12 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -15,6 +15,7 @@
 package header
 
 import (
+	"crypto/sha256"
 	"encoding/binary"
 	"strings"
 
@@ -102,6 +103,11 @@ const (
 	// bytes including and after the IIDOffsetInIPv6Address-th byte are
 	// for the IID.
 	IIDOffsetInIPv6Address = 8
+
+	// OpaqueIIDSecretKeyMinBytes is the recommended minimum number of bytes
+	// for the secret key used to generate an opaque interface identifier as
+	// outlined by RFC 7217.
+	OpaqueIIDSecretKeyMinBytes = 16
 )
 
 // IPv6EmptySubnet is the empty IPv6 subnet. It may also be known as the
@@ -326,3 +332,42 @@ func IsV6LinkLocalAddress(addr tcpip.Address) bool {
 	}
 	return addr[0] == 0xfe && (addr[1]&0xc0) == 0x80
 }
+
+// AppendOpaqueInterfaceIdentifier appends a 64 bit opaque interface identifier
+// (IID) to buf as outlined by RFC 7217 and returns the extended buffer.
+//
+// The opaque IID is generated from the cryptographic hash of the concatenation
+// of the prefix, NIC's name, DAD counter (DAD retry counter) and the secret
+// key. The secret key SHOULD be at least OpaqueIIDSecretKeyMinBytes bytes and
+// MUST be generated to a pseudo-random number. See RFC 4086 for randomness
+// requirements for security.
+//
+// If buf has enough capacity for the IID (IIDSize bytes), a new underlying
+// array for the buffer will not be allocated.
+func AppendOpaqueInterfaceIdentifier(buf []byte, prefix tcpip.Subnet, nicName string, dadCounter uint8, secretKey []byte) []byte {
+	// As per RFC 7217 section 5, the opaque identifier can be generated as a
+	// cryptographic hash of the concatenation of each of the function parameters.
+	// Note, we omit the optional Network_ID field.
+	h := sha256.New()
+	// h.Write never returns an error.
+	h.Write([]byte(prefix.ID()[:IIDOffsetInIPv6Address]))
+	h.Write([]byte(nicName))
+	h.Write([]byte{dadCounter})
+	h.Write(secretKey)
+
+	var sumBuf [sha256.Size]byte
+	sum := h.Sum(sumBuf[:0])
+
+	return append(buf, sum[:IIDSize]...)
+}
+
+// LinkLocalAddrWithOpaqueIID computes the default IPv6 link-local address with
+// an opaque IID.
+func LinkLocalAddrWithOpaqueIID(nicName string, dadCounter uint8, secretKey []byte) tcpip.Address {
+	lladdrb := [IPv6AddressSize]byte{
+		0: 0xFE,
+		1: 0x80,
+	}
+
+	return tcpip.Address(AppendOpaqueInterfaceIdentifier(lladdrb[:IIDOffsetInIPv6Address], IPv6LinkLocalPrefix.Subnet(), nicName, dadCounter, secretKey))
+}
diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go
index 42c5c6fc1..cd1862e42 100644
--- a/pkg/tcpip/header/ipv6_test.go
+++ b/pkg/tcpip/header/ipv6_test.go
@@ -15,9 +15,12 @@
 package header_test
 
 import (
+	"bytes"
+	"crypto/sha256"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
@@ -43,3 +46,163 @@ func TestLinkLocalAddr(t *testing.T) {
 		t.Errorf("got LinkLocalAddr(%s) = %s, want = %s", linkAddr, got, want)
 	}
 }
+
+func TestAppendOpaqueInterfaceIdentifier(t *testing.T) {
+	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes * 2]byte
+	if n, err := rand.Read(secretKeyBuf[:]); err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	} else if want := header.OpaqueIIDSecretKeyMinBytes * 2; n != want {
+		t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", want, n)
+	}
+
+	tests := []struct {
+		name       string
+		prefix     tcpip.Subnet
+		nicName    string
+		dadCounter uint8
+		secretKey  []byte
+	}{
+		{
+			name:       "SecretKey of minimum size",
+			prefix:     header.IPv6LinkLocalPrefix.Subnet(),
+			nicName:    "eth0",
+			dadCounter: 0,
+			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes],
+		},
+		{
+			name: "SecretKey of less than minimum size",
+			prefix: func() tcpip.Subnet {
+				addrWithPrefix := tcpip.AddressWithPrefix{
+					Address:   "\x01\x02\x03\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+					PrefixLen: header.IIDOffsetInIPv6Address * 8,
+				}
+				return addrWithPrefix.Subnet()
+			}(),
+			nicName:    "eth10",
+			dadCounter: 1,
+			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes/2],
+		},
+		{
+			name: "SecretKey of more than minimum size",
+			prefix: func() tcpip.Subnet {
+				addrWithPrefix := tcpip.AddressWithPrefix{
+					Address:   "\x01\x02\x03\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+					PrefixLen: header.IIDOffsetInIPv6Address * 8,
+				}
+				return addrWithPrefix.Subnet()
+			}(),
+			nicName:    "eth11",
+			dadCounter: 2,
+			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes*2],
+		},
+		{
+			name: "Nil SecretKey",
+			prefix: func() tcpip.Subnet {
+				addrWithPrefix := tcpip.AddressWithPrefix{
+					Address:   "\x01\x02\x03\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+					PrefixLen: header.IIDOffsetInIPv6Address * 8,
+				}
+				return addrWithPrefix.Subnet()
+			}(),
+			nicName:    "eth12",
+			dadCounter: 3,
+			secretKey:  nil,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			h := sha256.New()
+			h.Write([]byte(test.prefix.ID()[:header.IIDOffsetInIPv6Address]))
+			h.Write([]byte(test.nicName))
+			h.Write([]byte{test.dadCounter})
+			if k := test.secretKey; k != nil {
+				h.Write(k)
+			}
+			var hashSum [sha256.Size]byte
+			h.Sum(hashSum[:0])
+			want := hashSum[:header.IIDSize]
+
+			// Passing a nil buffer should result in a new buffer returned with the
+			// IID.
+			if got := header.AppendOpaqueInterfaceIdentifier(nil, test.prefix, test.nicName, test.dadCounter, test.secretKey); !bytes.Equal(got, want) {
+				t.Errorf("got AppendOpaqueInterfaceIdentifier(nil, %s, %s, %d, %x) = %x, want = %x", test.prefix, test.nicName, test.dadCounter, test.secretKey, got, want)
+			}
+
+			// Passing a buffer with sufficient capacity for the IID should populate
+			// the buffer provided.
+			var iidBuf [header.IIDSize]byte
+			if got := header.AppendOpaqueInterfaceIdentifier(iidBuf[:0], test.prefix, test.nicName, test.dadCounter, test.secretKey); !bytes.Equal(got, want) {
+				t.Errorf("got AppendOpaqueInterfaceIdentifier(iidBuf[:0], %s, %s, %d, %x) = %x, want = %x", test.prefix, test.nicName, test.dadCounter, test.secretKey, got, want)
+			}
+			if got := iidBuf[:]; !bytes.Equal(got, want) {
+				t.Errorf("got iidBuf = %x, want = %x", got, want)
+			}
+		})
+	}
+}
+
+func TestLinkLocalAddrWithOpaqueIID(t *testing.T) {
+	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes * 2]byte
+	if n, err := rand.Read(secretKeyBuf[:]); err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	} else if want := header.OpaqueIIDSecretKeyMinBytes * 2; n != want {
+		t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", want, n)
+	}
+
+	prefix := header.IPv6LinkLocalPrefix.Subnet()
+
+	tests := []struct {
+		name       string
+		prefix     tcpip.Subnet
+		nicName    string
+		dadCounter uint8
+		secretKey  []byte
+	}{
+		{
+			name:       "SecretKey of minimum size",
+			nicName:    "eth0",
+			dadCounter: 0,
+			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes],
+		},
+		{
+			name:       "SecretKey of less than minimum size",
+			nicName:    "eth10",
+			dadCounter: 1,
+			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes/2],
+		},
+		{
+			name:       "SecretKey of more than minimum size",
+			nicName:    "eth11",
+			dadCounter: 2,
+			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes*2],
+		},
+		{
+			name:       "Nil SecretKey",
+			nicName:    "eth12",
+			dadCounter: 3,
+			secretKey:  nil,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			addrBytes := [header.IPv6AddressSize]byte{
+				0: 0xFE,
+				1: 0x80,
+			}
+
+			want := tcpip.Address(header.AppendOpaqueInterfaceIdentifier(
+				addrBytes[:header.IIDOffsetInIPv6Address],
+				prefix,
+				test.nicName,
+				test.dadCounter,
+				test.secretKey,
+			))
+
+			if got := header.LinkLocalAddrWithOpaqueIID(test.nicName, test.dadCounter, test.secretKey); got != want {
+				t.Errorf("got LinkLocalAddrWithOpaqueIID(%s, %d, %x) = %s, want = %s", test.nicName, test.dadCounter, test.secretKey, got, want)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 69077669a..b8f9517d0 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -59,6 +59,7 @@ go_test(
     ],
     deps = [
         ":stack",
+        "//pkg/rand",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/checker",
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index ddd014658..3bed0af3c 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -178,20 +178,24 @@ func (n *NIC) enable() *tcpip.Error {
 		return nil
 	}
 
-	l2addr := n.linkEP.LinkAddress()
+	var addr tcpip.Address
+	if oIID := n.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
+		addr = header.LinkLocalAddrWithOpaqueIID(oIID.NICNameFromID(n.ID()), 0, oIID.SecretKey)
+	} else {
+		l2addr := n.linkEP.LinkAddress()
+
+		// Only attempt to generate the link-local address if we have a valid MAC
+		// address.
+		//
+		// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
+		// LinkEndpoint.LinkAddress) before reaching this point.
+		if !header.IsValidUnicastEthernetAddress(l2addr) {
+			return nil
+		}
 
-	// Only attempt to generate the link-local address if we have a
-	// valid MAC address.
-	//
-	// TODO(b/141011931): Validate a LinkEndpoint's link address
-	// (provided by LinkEndpoint.LinkAddress) before reaching this
-	// point.
-	if !header.IsValidUnicastEthernetAddress(l2addr) {
-		return nil
+		addr = header.LinkLocalAddr(l2addr)
 	}
 
-	addr := header.LinkLocalAddr(l2addr)
-
 	_, err := n.addPermanentAddressLocked(tcpip.ProtocolAddress{
 		Protocol: header.IPv6ProtocolNumber,
 		AddressWithPrefix: tcpip.AddressWithPrefix{
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 7a9600679..c6e6becf3 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -352,6 +352,33 @@ func (u *uniqueIDGenerator) UniqueID() uint64 {
 	return atomic.AddUint64((*uint64)(u), 1)
 }
 
+// NICNameFromID is a function that returns a stable name for the specified NIC,
+// even if the NIC ID changes over time.
+type NICNameFromID func(tcpip.NICID) string
+
+// OpaqueInterfaceIdentifierOptions holds the options related to the generation
+// of opaque interface indentifiers (IIDs) as defined by RFC 7217.
+type OpaqueInterfaceIdentifierOptions struct {
+	// NICNameFromID is a function that returns a stable name for a specified NIC,
+	// even if the NIC ID changes over time.
+	//
+	// Must be specified to generate the opaque IID.
+	NICNameFromID NICNameFromID
+
+	// SecretKey is a pseudo-random number used as the secret key when generating
+	// opaque IIDs as defined by RFC 7217. The key SHOULD be at least
+	// header.OpaqueIIDSecretKeyMinBytes bytes and MUST follow minimum randomness
+	// requirements for security as outlined by RFC 4086. SecretKey MUST NOT
+	// change between program runs, unless explicitly changed.
+	//
+	// OpaqueInterfaceIdentifierOptions takes ownership of SecretKey. SecretKey
+	// MUST NOT be modified after Stack is created.
+	//
+	// May be nil, but a nil value is highly discouraged to maintain
+	// some level of randomness between nodes.
+	SecretKey []byte
+}
+
 // Stack is a networking stack, with all supported protocols, NICs, and route
 // table.
 type Stack struct {
@@ -422,6 +449,10 @@ type Stack struct {
 
 	// uniqueIDGenerator is a generator of unique identifiers.
 	uniqueIDGenerator UniqueID
+
+	// opaqueIIDOpts hold the options for generating opaque interface identifiers
+	// (IIDs) as outlined by RFC 7217.
+	opaqueIIDOpts OpaqueInterfaceIdentifierOptions
 }
 
 // UniqueID is an abstract generator of unique identifiers.
@@ -479,6 +510,10 @@ type Options struct {
 	// RawFactory produces raw endpoints. Raw endpoints are enabled only if
 	// this is non-nil.
 	RawFactory RawFactory
+
+	// OpaqueIIDOpts hold the options for generating opaque interface identifiers
+	// (IIDs) as outlined by RFC 7217.
+	OpaqueIIDOpts OpaqueInterfaceIdentifierOptions
 }
 
 // TransportEndpointInfo holds useful information about a transport endpoint
@@ -549,6 +584,7 @@ func New(opts Options) *Stack {
 		autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
 		uniqueIDGenerator:    opts.UniqueID,
 		ndpDisp:              opts.NDPDisp,
+		opaqueIIDOpts:        opts.OpaqueIIDOpts,
 	}
 
 	// Add specified network protocols.
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 8fc034ca1..e18dfea83 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -27,6 +27,7 @@ import (
 	"time"
 
 	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -1894,55 +1895,67 @@ func TestNICForwarding(t *testing.T) {
 }
 
 // TestNICAutoGenAddr tests the auto-generation of IPv6 link-local addresses
-// (or lack there-of if disabled (default)). Note, DAD will be disabled in
-// these tests.
+// using the modified EUI-64 of the NIC's MAC address (or lack there-of if
+// disabled (default)). Note, DAD will be disabled in these tests.
 func TestNICAutoGenAddr(t *testing.T) {
 	tests := []struct {
 		name      string
 		autoGen   bool
 		linkAddr  tcpip.LinkAddress
+		iidOpts   stack.OpaqueInterfaceIdentifierOptions
 		shouldGen bool
 	}{
 		{
 			"Disabled",
 			false,
 			linkAddr1,
+			stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: func(nicID tcpip.NICID) string {
+					return fmt.Sprintf("nic%d", nicID)
+				},
+			},
 			false,
 		},
 		{
 			"Enabled",
 			true,
 			linkAddr1,
+			stack.OpaqueInterfaceIdentifierOptions{},
 			true,
 		},
 		{
 			"Nil MAC",
 			true,
 			tcpip.LinkAddress([]byte(nil)),
+			stack.OpaqueInterfaceIdentifierOptions{},
 			false,
 		},
 		{
 			"Empty MAC",
 			true,
 			tcpip.LinkAddress(""),
+			stack.OpaqueInterfaceIdentifierOptions{},
 			false,
 		},
 		{
 			"Invalid MAC",
 			true,
 			tcpip.LinkAddress("\x01\x02\x03"),
+			stack.OpaqueInterfaceIdentifierOptions{},
 			false,
 		},
 		{
 			"Multicast MAC",
 			true,
 			tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+			stack.OpaqueInterfaceIdentifierOptions{},
 			false,
 		},
 		{
 			"Unspecified MAC",
 			true,
 			tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"),
+			stack.OpaqueInterfaceIdentifierOptions{},
 			false,
 		},
 	}
@@ -1951,6 +1964,112 @@ func TestNICAutoGenAddr(t *testing.T) {
 		t.Run(test.name, func(t *testing.T) {
 			opts := stack.Options{
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				OpaqueIIDOpts:    test.iidOpts,
+			}
+
+			if test.autoGen {
+				// Only set opts.AutoGenIPv6LinkLocal when test.autoGen is true because
+				// opts.AutoGenIPv6LinkLocal should be false by default.
+				opts.AutoGenIPv6LinkLocal = true
+			}
+
+			e := channel.New(10, 1280, test.linkAddr)
+			s := stack.New(opts)
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+			}
+
+			if test.shouldGen {
+				// Should have auto-generated an address and resolved immediately (DAD
+				// is disabled).
+				if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddr(test.linkAddr), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
+					t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
+				}
+			} else {
+				// Should not have auto-generated an address.
+				if want := (tcpip.AddressWithPrefix{}); addr != want {
+					t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				}
+			}
+		})
+	}
+}
+
+// TestNICAutoGenAddrWithOpaque tests the auto-generation of IPv6 link-local
+// addresses with opaque interface identifiers. Link Local addresses should
+// always be generated with opaque IIDs if configured to use them, even if the
+// NIC has an invalid MAC address.
+func TestNICAutoGenAddrWithOpaque(t *testing.T) {
+	var secretKey [header.OpaqueIIDSecretKeyMinBytes]byte
+	n, err := rand.Read(secretKey[:])
+	if err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	}
+	if n != header.OpaqueIIDSecretKeyMinBytes {
+		t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", header.OpaqueIIDSecretKeyMinBytes, n)
+	}
+
+	iidOpts := stack.OpaqueInterfaceIdentifierOptions{
+		NICNameFromID: func(nicID tcpip.NICID) string {
+			return fmt.Sprintf("nic%d", nicID)
+		},
+		SecretKey: secretKey[:],
+	}
+
+	tests := []struct {
+		name     string
+		autoGen  bool
+		linkAddr tcpip.LinkAddress
+	}{
+		{
+			"Disabled",
+			false,
+			linkAddr1,
+		},
+		{
+			"Enabled",
+			true,
+			linkAddr1,
+		},
+		// These are all cases where we would not have generated a
+		// link-local address if opaque IIDs were disabled.
+		{
+			"Nil MAC",
+			true,
+			tcpip.LinkAddress([]byte(nil)),
+		},
+		{
+			"Empty MAC",
+			true,
+			tcpip.LinkAddress(""),
+		},
+		{
+			"Invalid MAC",
+			true,
+			tcpip.LinkAddress("\x01\x02\x03"),
+		},
+		{
+			"Multicast MAC",
+			true,
+			tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+		},
+		{
+			"Unspecified MAC",
+			true,
+			tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"),
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opts := stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				OpaqueIIDOpts:    iidOpts,
 			}
 
 			if test.autoGen {
@@ -1972,10 +2091,10 @@ func TestNICAutoGenAddr(t *testing.T) {
 				t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
 			}
 
-			if test.shouldGen {
+			if test.autoGen {
 				// Should have auto-generated an address and
 				// resolved immediately (DAD is disabled).
-				if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddr(test.linkAddr), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
+				if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddrWithOpaqueIID("nic1", 0, secretKey[:]), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
 					t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
 				}
 			} else {
-- 
cgit v1.2.3


From bf53d325ddcd533d202efcab40047535078a02f3 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Fri, 3 Jan 2020 17:46:04 -0800
Subject: Remove FIXME comments to close old bug.

PiperOrigin-RevId: 288075400
---
 pkg/sentry/mm/procfs.go | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go
index 8c2246bb4..79610acb7 100644
--- a/pkg/sentry/mm/procfs.go
+++ b/pkg/sentry/mm/procfs.go
@@ -66,8 +66,6 @@ func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer
 	var start usermem.Addr
 
 	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
-		// FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
-		// "panic: autosave error: type usermem.Addr is not registered".
 		mm.appendVMAMapsEntryLocked(ctx, vseg, buf)
 	}
 
@@ -81,7 +79,6 @@ func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer
 	//
 	// Artifically adjust the seqfile handle so we only output vsyscall entry once.
 	if start != vsyscallEnd {
-		// FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
 		buf.WriteString(vsyscallMapsEntry)
 	}
 }
@@ -97,8 +94,6 @@ func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile
 		start = *handle.(*usermem.Addr)
 	}
 	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
-		// FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
-		// "panic: autosave error: type usermem.Addr is not registered".
 		vmaAddr := vseg.End()
 		data = append(data, seqfile.SeqData{
 			Buf:    mm.vmaMapsEntryLocked(ctx, vseg),
@@ -116,7 +111,6 @@ func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile
 	//
 	// Artifically adjust the seqfile handle so we only output vsyscall entry once.
 	if start != vsyscallEnd {
-		// FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
 		vmaAddr := vsyscallEnd
 		data = append(data, seqfile.SeqData{
 			Buf:    []byte(vsyscallMapsEntry),
@@ -187,15 +181,12 @@ func (mm *MemoryManager) ReadSmapsDataInto(ctx context.Context, buf *bytes.Buffe
 	var start usermem.Addr
 
 	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
-		// FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
-		// "panic: autosave error: type usermem.Addr is not registered".
 		mm.vmaSmapsEntryIntoLocked(ctx, vseg, buf)
 	}
 
 	// We always emulate vsyscall, so advertise it here. See
 	// ReadMapsSeqFileData for additional commentary.
 	if start != vsyscallEnd {
-		// FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
 		buf.WriteString(vsyscallSmapsEntry)
 	}
 }
@@ -211,8 +202,6 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil
 		start = *handle.(*usermem.Addr)
 	}
 	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
-		// FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
-		// "panic: autosave error: type usermem.Addr is not registered".
 		vmaAddr := vseg.End()
 		data = append(data, seqfile.SeqData{
 			Buf:    mm.vmaSmapsEntryLocked(ctx, vseg),
@@ -223,7 +212,6 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil
 	// We always emulate vsyscall, so advertise it here. See
 	// ReadMapsSeqFileData for additional commentary.
 	if start != vsyscallEnd {
-		// FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
 		vmaAddr := vsyscallEnd
 		data = append(data, seqfile.SeqData{
 			Buf:    []byte(vsyscallSmapsEntry),
-- 
cgit v1.2.3


From 83ab47e87badd8b46f784739903361d9f824fa2c Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 3 Jan 2020 18:27:04 -0800
Subject: Use opaque interface identifiers when generating IPv6 addresses via
 SLAAC

Support using opaque interface identifiers when generating IPv6 addresses via
SLAAC when configured to do so.

Note, this change does not handle retries in response to DAD conflicts yet.
That will also come in a later change.

Test: Test that when SLAAC addresses are generated, they use opaque interface
identifiers when configured to do so.
PiperOrigin-RevId: 288078605
---
 pkg/tcpip/stack/ndp.go      |  32 +++++++-------
 pkg/tcpip/stack/ndp_test.go | 104 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+), 15 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index d9ab59336..ba6a57e6f 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -1028,22 +1028,24 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 		return
 	}
 
-	// Only attempt to generate an interface-specific IID if we have a valid
-	// link address.
-	//
-	// TODO(b/141011931): Validate a LinkEndpoint's link address
-	// (provided by LinkEndpoint.LinkAddress) before reaching this
-	// point.
-	linkAddr := ndp.nic.linkEP.LinkAddress()
-	if !header.IsValidUnicastEthernetAddress(linkAddr) {
-		return
-	}
+	addrBytes := []byte(prefix.ID())
+	if oIID := ndp.nic.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
+		addrBytes = header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], prefix, oIID.NICNameFromID(ndp.nic.ID()), 0 /* dadCounter */, oIID.SecretKey)
+	} else {
+		// Only attempt to generate an interface-specific IID if we have a valid
+		// link address.
+		//
+		// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
+		// LinkEndpoint.LinkAddress) before reaching this point.
+		linkAddr := ndp.nic.linkEP.LinkAddress()
+		if !header.IsValidUnicastEthernetAddress(linkAddr) {
+			return
+		}
 
-	// Generate an address within prefix from the modified EUI-64 of ndp's
-	// NIC's Ethernet MAC address.
-	addrBytes := make([]byte, header.IPv6AddressSize)
-	copy(addrBytes[:header.IIDOffsetInIPv6Address], prefix.ID()[:header.IIDOffsetInIPv6Address])
-	header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+		// Generate an address within prefix from the modified EUI-64 of ndp's NIC's
+		// Ethernet MAC address.
+		header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+	}
 	addr := tcpip.Address(addrBytes)
 	addrWithPrefix := tcpip.AddressWithPrefix{
 		Address:   addr,
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 64a9a2b20..8e817e730 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -21,6 +21,7 @@ import (
 	"time"
 
 	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/checker"
@@ -1911,6 +1912,109 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 	}
 }
 
+// TestAutoGenAddrWithOpaqueIID tests that SLAAC generated addresses will use
+// opaque interface identifiers when configured to do so.
+func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
+	t.Parallel()
+
+	const nicID = 1
+	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes]byte
+	secretKey := secretKeyBuf[:]
+	n, err := rand.Read(secretKey)
+	if err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	}
+	if n != header.OpaqueIIDSecretKeyMinBytes {
+		t.Fatalf("got rand.Read(_) = (%d, _), want = (%d, _)", n, header.OpaqueIIDSecretKeyMinBytes)
+	}
+
+	prefix1, subnet1, _ := prefixSubnetAddr(0, linkAddr1)
+	prefix2, subnet2, _ := prefixSubnetAddr(1, linkAddr1)
+	// addr1 and addr2 are the addresses that are expected to be generated when
+	// stack.Stack is configured to generate opaque interface identifiers as
+	// defined by RFC 7217.
+	addrBytes := []byte(subnet1.ID())
+	addr1 := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet1, "nic1", 0, secretKey)),
+		PrefixLen: 64,
+	}
+	addrBytes = []byte(subnet2.ID())
+	addr2 := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet2, "nic1", 0, secretKey)),
+		PrefixLen: 64,
+	}
+
+	ndpDisp := ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: &ndpDisp,
+		OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+			NICNameFromID: func(nicID tcpip.NICID) string {
+				return fmt.Sprintf("nic%d", nicID)
+			},
+			SecretKey: secretKey,
+		},
+	})
+
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	// Receive an RA with prefix1 in a PI.
+	const validLifetimeSecondPrefix1 = 1
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, validLifetimeSecondPrefix1, 0))
+	expectAutoGenAddrEvent(addr1, newAddr)
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should have %s in the list of addresses", addr1)
+	}
+
+	// Receive an RA with prefix2 in a PI with a large valid lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+	expectAutoGenAddrEvent(addr2, newAddr)
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should have %s in the list of addresses", addr1)
+	}
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+
+	// Wait for addr of prefix1 to be invalidated.
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" {
+			t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(validLifetimeSecondPrefix1*time.Second + defaultTimeout):
+		t.Fatal("timed out waiting for addr auto gen event")
+	}
+	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should not have %s in the list of addresses", addr1)
+	}
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+}
+
 // TestNDPRecursiveDNSServerDispatch tests that we properly dispatch an event
 // to the integrator when an RA is received with the NDP Recursive DNS Server
 // option with at least one valid address.
-- 
cgit v1.2.3


From de0d127ae61df783745880871a199ff86a720035 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Tue, 31 Dec 2019 08:44:06 +0000
Subject: Make some of the fcntl flags arch specific..

Some of the flags in the file system related system call
are architecture specific(O_NOFOLLOW/O_DIRECT..). Ref to
the fcntl.h file in the Linux src codes.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I354d988073bfd0c9ff5371d4e0be9da2b8fd019f
---
 pkg/abi/linux/file.go       | 38 +++++++++++++++++---------------------
 pkg/abi/linux/file_amd64.go |  8 ++++++++
 pkg/abi/linux/file_arm64.go |  8 ++++++++
 3 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index 16791d03e..64bee84b4 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -24,27 +24,23 @@ import (
 
 // Constants for open(2).
 const (
-	O_ACCMODE   = 000000003
-	O_RDONLY    = 000000000
-	O_WRONLY    = 000000001
-	O_RDWR      = 000000002
-	O_CREAT     = 000000100
-	O_EXCL      = 000000200
-	O_NOCTTY    = 000000400
-	O_TRUNC     = 000001000
-	O_APPEND    = 000002000
-	O_NONBLOCK  = 000004000
-	O_DSYNC     = 000010000
-	O_ASYNC     = 000020000
-	O_DIRECT    = 000040000
-	O_LARGEFILE = 000100000
-	O_DIRECTORY = 000200000
-	O_NOFOLLOW  = 000400000
-	O_NOATIME   = 001000000
-	O_CLOEXEC   = 002000000
-	O_SYNC      = 004000000 // __O_SYNC in Linux
-	O_PATH      = 010000000
-	O_TMPFILE   = 020000000 // __O_TMPFILE in Linux
+	O_ACCMODE  = 000000003
+	O_RDONLY   = 000000000
+	O_WRONLY   = 000000001
+	O_RDWR     = 000000002
+	O_CREAT    = 000000100
+	O_EXCL     = 000000200
+	O_NOCTTY   = 000000400
+	O_TRUNC    = 000001000
+	O_APPEND   = 000002000
+	O_NONBLOCK = 000004000
+	O_DSYNC    = 000010000
+	O_ASYNC    = 000020000
+	O_NOATIME  = 001000000
+	O_CLOEXEC  = 002000000
+	O_SYNC     = 004000000 // __O_SYNC in Linux
+	O_PATH     = 010000000
+	O_TMPFILE  = 020000000 // __O_TMPFILE in Linux
 )
 
 // Constants for fstatat(2).
diff --git a/pkg/abi/linux/file_amd64.go b/pkg/abi/linux/file_amd64.go
index 74c554be6..9d307e840 100644
--- a/pkg/abi/linux/file_amd64.go
+++ b/pkg/abi/linux/file_amd64.go
@@ -14,6 +14,14 @@
 
 package linux
 
+// Constants for open(2).
+const (
+	O_DIRECT    = 000040000
+	O_LARGEFILE = 000100000
+	O_DIRECTORY = 000200000
+	O_NOFOLLOW  = 000400000
+)
+
 // Stat represents struct stat.
 type Stat struct {
 	Dev     uint64
diff --git a/pkg/abi/linux/file_arm64.go b/pkg/abi/linux/file_arm64.go
index f16c07589..26a54f416 100644
--- a/pkg/abi/linux/file_arm64.go
+++ b/pkg/abi/linux/file_arm64.go
@@ -14,6 +14,14 @@
 
 package linux
 
+// Constants for open(2).
+const (
+	O_DIRECTORY = 000040000
+	O_NOFOLLOW  = 000100000
+	O_DIRECT    = 000200000
+	O_LARGEFILE = 000400000
+)
+
 // Stat represents struct stat.
 type Stat struct {
 	Dev     uint64
-- 
cgit v1.2.3


From 6410387ff9b4f0dbe88325ea0e30776f5f3efd5d Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 6 Jan 2020 09:27:35 -0800
Subject: Cleanup Shm reference handling

Currently, shm.Registry.FindByID will return Shm instances without taking an
additional reference on them, making it possible for them to disappear.

More explicitly handle references. All callers hold a reference for the
duration that they hold the instance. Registry.shms may transitively hold Shms
with no references, so it must TryIncRef to determine if they are still valid.

PiperOrigin-RevId: 288314529
---
 pkg/sentry/kernel/shm/shm.go         | 85 +++++++++++++++++++++++++-----------
 pkg/sentry/syscalls/linux/sys_shm.go |  7 ++-
 2 files changed, 66 insertions(+), 26 deletions(-)

diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 5bd610f68..19034a21e 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -71,9 +71,20 @@ type Registry struct {
 	mu sync.Mutex `state:"nosave"`
 
 	// shms maps segment ids to segments.
+	//
+	// shms holds all referenced segments, which are removed on the last
+	// DecRef. Thus, it cannot itself hold a reference on the Shm.
+	//
+	// Since removal only occurs after the last (unlocked) DecRef, there
+	// exists a short window during which a Shm still exists in Shm, but is
+	// unreferenced. Users must use TryIncRef to determine if the Shm is
+	// still valid.
 	shms map[ID]*Shm
 
 	// keysToShms maps segment keys to segments.
+	//
+	// Shms in keysToShms are guaranteed to be referenced, as they are
+	// removed by disassociateKey before the last DecRef.
 	keysToShms map[Key]*Shm
 
 	// Sum of the sizes of all existing segments rounded up to page size, in
@@ -95,10 +106,18 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry {
 }
 
 // FindByID looks up a segment given an ID.
+//
+// FindByID returns a reference on Shm.
 func (r *Registry) FindByID(id ID) *Shm {
 	r.mu.Lock()
 	defer r.mu.Unlock()
-	return r.shms[id]
+	s := r.shms[id]
+	// Take a reference on s. If TryIncRef fails, s has reached the last
+	// DecRef, but hasn't quite been removed from r.shms yet.
+	if s != nil && s.TryIncRef() {
+		return s
+	}
+	return nil
 }
 
 // dissociateKey removes the association between a segment and its key,
@@ -119,6 +138,8 @@ func (r *Registry) dissociateKey(s *Shm) {
 
 // FindOrCreate looks up or creates a segment in the registry. It's functionally
 // analogous to open(2).
+//
+// FindOrCreate returns a reference on Shm.
 func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) {
 	if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) {
 		// "A new segment was to be created and size is less than SHMMIN or
@@ -166,6 +187,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui
 				return nil, syserror.EEXIST
 			}
 
+			shm.IncRef()
 			return shm, nil
 		}
 
@@ -193,7 +215,14 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui
 	// Need to create a new segment.
 	creator := fs.FileOwnerFromContext(ctx)
 	perms := fs.FilePermsFromMode(mode)
-	return r.newShm(ctx, pid, key, creator, perms, size)
+	s, err := r.newShm(ctx, pid, key, creator, perms, size)
+	if err != nil {
+		return nil, err
+	}
+	// The initial reference is held by s itself. Take another to return to
+	// the caller.
+	s.IncRef()
+	return s, nil
 }
 
 // newShm creates a new segment in the registry.
@@ -296,22 +325,26 @@ func (r *Registry) remove(s *Shm) {
 
 // Shm represents a single shared memory segment.
 //
-// Shm segment are backed directly by an allocation from platform
-// memory. Segments are always mapped as a whole, greatly simplifying how
-// mappings are tracked. However note that mremap and munmap calls may cause the
-// vma for a segment to become fragmented; which requires special care when
-// unmapping a segment. See mm/shm.go.
+// Shm segment are backed directly by an allocation from platform memory.
+// Segments are always mapped as a whole, greatly simplifying how mappings are
+// tracked. However note that mremap and munmap calls may cause the vma for a
+// segment to become fragmented; which requires special care when unmapping a
+// segment. See mm/shm.go.
 //
 // Segments persist until they are explicitly marked for destruction via
-// shmctl(SHM_RMID).
+// MarkDestroyed().
 //
 // Shm implements memmap.Mappable and memmap.MappingIdentity.
 //
 // +stateify savable
 type Shm struct {
-	// AtomicRefCount tracks the number of references to this segment from
-	// maps. A segment always holds a reference to itself, until it's marked for
+	// AtomicRefCount tracks the number of references to this segment.
+	//
+	// A segment holds a reference to itself until it is marked for
 	// destruction.
+	//
+	// In addition to direct users, the MemoryManager will hold references
+	// via MappingIdentity.
 	refs.AtomicRefCount
 
 	mfp pgalloc.MemoryFileProvider
@@ -484,9 +517,8 @@ type AttachOpts struct {
 // ConfigureAttach creates an mmap configuration for the segment with the
 // requested attach options.
 //
-// ConfigureAttach returns with a ref on s on success. The caller should drop
-// this once the map is installed. This reference prevents s from being
-// destroyed before the returned configuration is used.
+// Postconditions: The returned MMapOpts are valid only as long as a reference
+// continues to be held on s.
 func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -504,7 +536,6 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts Attac
 		// in the user namespace that governs its IPC namespace." - man shmat(2)
 		return memmap.MMapOpts{}, syserror.EACCES
 	}
-	s.IncRef()
 	return memmap.MMapOpts{
 		Length: s.size,
 		Offset: 0,
@@ -549,10 +580,15 @@ func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) {
 	}
 	creds := auth.CredentialsFromContext(ctx)
 
-	nattach := uint64(s.ReadRefs())
-	// Don't report the self-reference we keep prior to being marked for
-	// destruction. However, also don't report a count of -1 for segments marked
-	// as destroyed, with no mappings.
+	// Use the reference count as a rudimentary count of the number of
+	// attaches. We exclude:
+	//
+	// 1. The reference the caller holds.
+	// 2. The self-reference held by s prior to destruction.
+	//
+	// Note that this may still overcount by including transient references
+	// used in concurrent calls.
+	nattach := uint64(s.ReadRefs()) - 1
 	if !s.pendingDestruction {
 		nattach--
 	}
@@ -620,18 +656,17 @@ func (s *Shm) MarkDestroyed() {
 	s.registry.dissociateKey(s)
 
 	s.mu.Lock()
-	// Only drop the segment's self-reference once, when destruction is
-	// requested. Otherwise, repeated calls to shmctl(IPC_RMID) would force a
-	// segment to be destroyed prematurely, potentially with active maps to the
-	// segment's address range. Remaining references are dropped when the
-	// segment is detached or unmaped.
+	defer s.mu.Unlock()
 	if !s.pendingDestruction {
 		s.pendingDestruction = true
-		s.mu.Unlock() // Must release s.mu before calling s.DecRef.
+		// Drop the self-reference so destruction occurs when all
+		// external references are gone.
+		//
+		// N.B. This cannot be the final DecRef, as the caller also
+		// holds a reference.
 		s.DecRef()
 		return
 	}
-	s.mu.Unlock()
 }
 
 // checkOwnership verifies whether a segment may be accessed by ctx as an
diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go
index d57ffb3a1..4a8bc24a2 100644
--- a/pkg/sentry/syscalls/linux/sys_shm.go
+++ b/pkg/sentry/syscalls/linux/sys_shm.go
@@ -39,10 +39,13 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	if err != nil {
 		return 0, nil, err
 	}
+	defer segment.DecRef()
 	return uintptr(segment.ID), nil, nil
 }
 
 // findSegment retrives a shm segment by the given id.
+//
+// findSegment returns a reference on Shm.
 func findSegment(t *kernel.Task, id shm.ID) (*shm.Shm, error) {
 	r := t.IPCNamespace().ShmRegistry()
 	segment := r.FindByID(id)
@@ -63,6 +66,7 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	if err != nil {
 		return 0, nil, syserror.EINVAL
 	}
+	defer segment.DecRef()
 
 	opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{
 		Execute:  flag&linux.SHM_EXEC == linux.SHM_EXEC,
@@ -72,7 +76,6 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	if err != nil {
 		return 0, nil, err
 	}
-	defer segment.DecRef()
 	addr, err = t.MemoryManager().MMap(t, opts)
 	return uintptr(addr), nil, err
 }
@@ -105,6 +108,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 		if err != nil {
 			return 0, nil, syserror.EINVAL
 		}
+		defer segment.DecRef()
 
 		stat, err := segment.IPCStat(t)
 		if err == nil {
@@ -128,6 +132,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	if err != nil {
 		return 0, nil, syserror.EINVAL
 	}
+	defer segment.DecRef()
 
 	switch cmd {
 	case linux.IPC_SET:
-- 
cgit v1.2.3


From 354a15a234c1270bcb9b902503f61835b2ccd2d0 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 6 Jan 2020 11:41:13 -0800
Subject: Implement rseq(2)

PiperOrigin-RevId: 288342928
---
 pkg/abi/linux/BUILD                        |   1 +
 pkg/abi/linux/rseq.go                      | 130 ++++++++++
 pkg/sentry/arch/arch.go                    |   6 +-
 pkg/sentry/arch/arch_amd64.go              |   4 +-
 pkg/sentry/kernel/rseq.go                  | 383 +++++++++++++++++++++++++----
 pkg/sentry/kernel/task.go                  |  43 +++-
 pkg/sentry/kernel/task_clone.go            |   7 +
 pkg/sentry/kernel/task_exec.go             |   6 +-
 pkg/sentry/kernel/task_run.go              |  16 +-
 pkg/sentry/kernel/task_start.go            |  10 +
 pkg/sentry/kernel/thread_group.go          |  18 +-
 pkg/sentry/syscalls/linux/BUILD            |   1 +
 pkg/sentry/syscalls/linux/linux64_amd64.go |   2 +-
 pkg/sentry/syscalls/linux/linux64_arm64.go |   2 +-
 pkg/sentry/syscalls/linux/sys_rseq.go      |  48 ++++
 15 files changed, 598 insertions(+), 79 deletions(-)
 create mode 100644 pkg/abi/linux/rseq.go
 create mode 100644 pkg/sentry/syscalls/linux/sys_rseq.go

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 9553f164d..716ff22d2 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -41,6 +41,7 @@ go_library(
         "poll.go",
         "prctl.go",
         "ptrace.go",
+        "rseq.go",
         "rusage.go",
         "sched.go",
         "seccomp.go",
diff --git a/pkg/abi/linux/rseq.go b/pkg/abi/linux/rseq.go
new file mode 100644
index 000000000..76253ba30
--- /dev/null
+++ b/pkg/abi/linux/rseq.go
@@ -0,0 +1,130 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// Flags passed to rseq(2).
+//
+// Defined in include/uapi/linux/rseq.h.
+const (
+	// RSEQ_FLAG_UNREGISTER unregisters the current thread.
+	RSEQ_FLAG_UNREGISTER = 1 << 0
+)
+
+// Critical section flags used in RSeqCriticalSection.Flags and RSeq.Flags.
+//
+// Defined in include/uapi/linux/rseq.h.
+const (
+	// RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT inhibits restart on preemption.
+	RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = 1 << 0
+
+	// RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL inhibits restart on signal
+	// delivery.
+	RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = 1 << 1
+
+	// RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE inhibits restart on CPU
+	// migration.
+	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = 1 << 2
+)
+
+// RSeqCriticalSection describes a restartable sequences critical section. It
+// is equivalent to struct rseq_cs, defined in include/uapi/linux/rseq.h.
+//
+// In userspace, this structure is always aligned to 32 bytes.
+//
+// +marshal
+type RSeqCriticalSection struct {
+	// Version is the version of this structure. Version 0 is defined here.
+	Version uint32
+
+	// Flags are the critical section flags, defined above.
+	Flags uint32
+
+	// Start is the start address of the critical section.
+	Start uint64
+
+	// PostCommitOffset is the offset from Start of the first instruction
+	// outside of the critical section.
+	PostCommitOffset uint64
+
+	// Abort is the abort address. It must be outside the critical section,
+	// and the 4 bytes prior must match the abort signature.
+	Abort uint64
+}
+
+const (
+	// SizeOfRSeqCriticalSection is the size of RSeqCriticalSection.
+	SizeOfRSeqCriticalSection = 32
+
+	// SizeOfRSeqSignature is the size of the signature immediately
+	// preceding RSeqCriticalSection.Abort.
+	SizeOfRSeqSignature = 4
+)
+
+// Special values for RSeq.CPUID, defined in include/uapi/linux/rseq.h.
+const (
+	// RSEQ_CPU_ID_UNINITIALIZED indicates that this thread has not
+	// performed rseq initialization.
+	RSEQ_CPU_ID_UNINITIALIZED = ^uint32(0) // -1
+
+	// RSEQ_CPU_ID_REGISTRATION_FAILED indicates that rseq initialization
+	// failed.
+	RSEQ_CPU_ID_REGISTRATION_FAILED = ^uint32(1) // -2
+)
+
+// RSeq is the thread-local restartable sequences config/status. It
+// is equivalent to struct rseq, defined in include/uapi/linux/rseq.h.
+//
+// In userspace, this structure is always aligned to 32 bytes.
+type RSeq struct {
+	// CPUIDStart contains the current CPU ID if rseq is initialized.
+	//
+	// This field should only be read by the thread which registered this
+	// structure, and must be read atomically.
+	CPUIDStart uint32
+
+	// CPUID contains the current CPU ID or one of the CPU ID special
+	// values defined above.
+	//
+	// This field should only be read by the thread which registered this
+	// structure, and must be read atomically.
+	CPUID uint32
+
+	// RSeqCriticalSection is a pointer to the current RSeqCriticalSection
+	// block, or NULL. It is reset to NULL by the kernel on restart or
+	// non-restarting preempt/signal.
+	//
+	// This field should only be written by the thread which registered
+	// this structure, and must be written atomically.
+	RSeqCriticalSection uint64
+
+	// Flags are the critical section flags that apply to all critical
+	// sections on this thread, defined above.
+	Flags uint32
+}
+
+const (
+	// SizeOfRSeq is the size of RSeq.
+	//
+	// Note that RSeq is naively 24 bytes. However, it has 32-byte
+	// alignment, which in C increases sizeof to 32. That is the size that
+	// the Linux kernel uses.
+	SizeOfRSeq = 32
+
+	// AlignOfRSeq is the standard alignment of RSeq.
+	AlignOfRSeq = 32
+
+	// OffsetOfRSeqCriticalSection is the offset of RSeqCriticalSection in RSeq.
+	OffsetOfRSeqCriticalSection = 8
+)
diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go
index 498ca4669..81ec98a77 100644
--- a/pkg/sentry/arch/arch.go
+++ b/pkg/sentry/arch/arch.go
@@ -125,9 +125,9 @@ type Context interface {
 	// SetTLS sets the current TLS pointer. Returns false if value is invalid.
 	SetTLS(value uintptr) bool
 
-	// SetRSEQInterruptedIP sets the register that contains the old IP when a
-	// restartable sequence is interrupted.
-	SetRSEQInterruptedIP(value uintptr)
+	// SetOldRSeqInterruptedIP sets the register that contains the old IP
+	// when an "old rseq" restartable sequence is interrupted.
+	SetOldRSeqInterruptedIP(value uintptr)
 
 	// StateData returns a pointer to underlying architecture state.
 	StateData() *State
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index 67daa6c24..2aa08b1a9 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -174,8 +174,8 @@ func (c *context64) SetTLS(value uintptr) bool {
 	return true
 }
 
-// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP.
-func (c *context64) SetRSEQInterruptedIP(value uintptr) {
+// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
+func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
 	c.Regs.R10 = uint64(value)
 }
 
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index 24ea002ba..b14429854 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -15,17 +15,29 @@
 package kernel
 
 import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// Restartable sequences, as described in https://lwn.net/Articles/650333/.
+// Restartable sequences.
+//
+// We support two different APIs for restartable sequences.
+//
+//  1. The upstream interface added in v4.18.
+//  2. The interface described in https://lwn.net/Articles/650333/.
+//
+// Throughout this file and other parts of the kernel, the latter is referred
+// to as "old rseq". This interface was never merged upstream, but is supported
+// for a limited set of applications that use it regardless.
 
-// RSEQCriticalRegion describes a restartable sequence critical region.
+// OldRSeqCriticalRegion describes an old rseq critical region.
 //
 // +stateify savable
-type RSEQCriticalRegion struct {
+type OldRSeqCriticalRegion struct {
 	// When a task in this thread group has its CPU preempted (as defined by
 	// platform.ErrContextCPUPreempted) or has a signal delivered to an
 	// application handler while its instruction pointer is in CriticalSection,
@@ -35,86 +47,359 @@ type RSEQCriticalRegion struct {
 	Restart         usermem.Addr
 }
 
-// RSEQAvailable returns true if t supports restartable sequences.
-func (t *Task) RSEQAvailable() bool {
+// RSeqAvailable returns true if t supports (old and new) restartable sequences.
+func (t *Task) RSeqAvailable() bool {
 	return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption()
 }
 
-// RSEQCriticalRegion returns a copy of t's thread group's current restartable
-// sequence.
-func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion {
-	return *t.tg.rscr.Load().(*RSEQCriticalRegion)
+// SetRSeq registers addr as this thread's rseq structure.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error {
+	if t.rseqAddr != 0 {
+		if t.rseqAddr != addr {
+			return syserror.EINVAL
+		}
+		if t.rseqSignature != signature {
+			return syserror.EINVAL
+		}
+		return syserror.EBUSY
+	}
+
+	// rseq must be aligned and correctly sized.
+	if addr&(linux.AlignOfRSeq-1) != 0 {
+		return syserror.EINVAL
+	}
+	if length != linux.SizeOfRSeq {
+		return syserror.EINVAL
+	}
+	if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok {
+		return syserror.EFAULT
+	}
+
+	t.rseqAddr = addr
+	t.rseqSignature = signature
+
+	// Initialize the CPUID.
+	//
+	// Linux implicitly does this on return from userspace, where failure
+	// would cause SIGSEGV.
+	if err := t.rseqUpdateCPU(); err != nil {
+		t.rseqAddr = 0
+		t.rseqSignature = 0
+
+		t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return syserror.EFAULT
+	}
+
+	return nil
 }
 
-// SetRSEQCriticalRegion replaces t's thread group's restartable sequence.
+// ClearRSeq unregisters addr as this thread's rseq structure.
 //
-// Preconditions: t.RSEQAvailable() == true.
-func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error {
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) ClearRSeq(addr usermem.Addr, length, signature uint32) error {
+	if t.rseqAddr == 0 {
+		return syserror.EINVAL
+	}
+	if t.rseqAddr != addr {
+		return syserror.EINVAL
+	}
+	if length != linux.SizeOfRSeq {
+		return syserror.EINVAL
+	}
+	if t.rseqSignature != signature {
+		return syserror.EPERM
+	}
+
+	if err := t.rseqClearCPU(); err != nil {
+		return err
+	}
+
+	t.rseqAddr = 0
+	t.rseqSignature = 0
+
+	if t.oldRSeqCPUAddr == 0 {
+		// rseqCPU no longer needed.
+		t.rseqCPU = -1
+	}
+
+	return nil
+}
+
+// OldRSeqCriticalRegion returns a copy of t's thread group's current
+// old restartable sequence.
+func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion {
+	return *t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
+}
+
+// SetOldRSeqCriticalRegion replaces t's thread group's old restartable
+// sequence.
+//
+// Preconditions: t.RSeqAvailable() == true.
+func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error {
 	// These checks are somewhat more lenient than in Linux, which (bizarrely)
-	// requires rscr.CriticalSection to be non-empty and rscr.Restart to be
-	// outside of rscr.CriticalSection, even if rscr.CriticalSection.Start == 0
+	// requires r.CriticalSection to be non-empty and r.Restart to be
+	// outside of r.CriticalSection, even if r.CriticalSection.Start == 0
 	// (which disables the critical region).
-	if rscr.CriticalSection.Start == 0 {
-		rscr.CriticalSection.End = 0
-		rscr.Restart = 0
-		t.tg.rscr.Store(&rscr)
+	if r.CriticalSection.Start == 0 {
+		r.CriticalSection.End = 0
+		r.Restart = 0
+		t.tg.oldRSeqCritical.Store(&r)
 		return nil
 	}
-	if rscr.CriticalSection.Start >= rscr.CriticalSection.End {
+	if r.CriticalSection.Start >= r.CriticalSection.End {
 		return syserror.EINVAL
 	}
-	if rscr.CriticalSection.Contains(rscr.Restart) {
+	if r.CriticalSection.Contains(r.Restart) {
 		return syserror.EINVAL
 	}
-	// TODO(jamieliu): check that rscr.CriticalSection and rscr.Restart are in
-	// the application address range, for consistency with Linux
-	t.tg.rscr.Store(&rscr)
+	// TODO(jamieliu): check that r.CriticalSection and r.Restart are in
+	// the application address range, for consistency with Linux.
+	t.tg.oldRSeqCritical.Store(&r)
 	return nil
 }
 
-// RSEQCPUAddr returns the address that RSEQ will keep updated with t's CPU
-// number.
+// OldRSeqCPUAddr returns the address that old rseq will keep updated with t's
+// CPU number.
 //
 // Preconditions: The caller must be running on the task goroutine.
-func (t *Task) RSEQCPUAddr() usermem.Addr {
-	return t.rseqCPUAddr
+func (t *Task) OldRSeqCPUAddr() usermem.Addr {
+	return t.oldRSeqCPUAddr
 }
 
-// SetRSEQCPUAddr replaces the address that RSEQ will keep updated with t's CPU
-// number.
+// SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with
+// t's CPU number.
 //
-// Preconditions: t.RSEQAvailable() == true. The caller must be running on the
+// Preconditions: t.RSeqAvailable() == true. The caller must be running on the
 // task goroutine. t's AddressSpace must be active.
-func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error {
-	t.rseqCPUAddr = addr
-	if addr != 0 {
-		t.rseqCPU = int32(hostcpu.GetCPU())
-		if err := t.rseqCopyOutCPU(); err != nil {
-			t.rseqCPUAddr = 0
-			t.rseqCPU = -1
-			return syserror.EINVAL // yes, EINVAL, not err or EFAULT
-		}
-	} else {
-		t.rseqCPU = -1
+func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
+	t.oldRSeqCPUAddr = addr
+
+	// Check that addr is writable.
+	//
+	// N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's
+	// unfortunate, but unlikely in a correct program.
+	if err := t.rseqUpdateCPU(); err != nil {
+		t.oldRSeqCPUAddr = 0
+		return syserror.EINVAL // yes, EINVAL, not err or EFAULT
 	}
 	return nil
 }
 
 // Preconditions: The caller must be running on the task goroutine. t's
 // AddressSpace must be active.
-func (t *Task) rseqCopyOutCPU() error {
+func (t *Task) rseqUpdateCPU() error {
+	if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
+		t.rseqCPU = -1
+		return nil
+	}
+
+	t.rseqCPU = int32(hostcpu.GetCPU())
+
+	// Update both CPUs, even if one fails.
+	rerr := t.rseqCopyOutCPU()
+	oerr := t.oldRSeqCopyOutCPU()
+
+	if rerr != nil {
+		return rerr
+	}
+	return oerr
+}
+
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) oldRSeqCopyOutCPU() error {
+	if t.oldRSeqCPUAddr == 0 {
+		return nil
+	}
+
 	buf := t.CopyScratchBuffer(4)
 	usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))
-	_, err := t.CopyOutBytes(t.rseqCPUAddr, buf)
+	_, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf)
+	return err
+}
+
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqCopyOutCPU() error {
+	if t.rseqAddr == 0 {
+		return nil
+	}
+
+	buf := t.CopyScratchBuffer(8)
+	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
+	usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))     // CPUIDStart
+	usermem.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID
+	// N.B. This write is not atomic, but since this occurs on the task
+	// goroutine then as long as userspace uses a single-instruction read
+	// it can't see an invalid value.
+	_, err := t.CopyOutBytes(t.rseqAddr, buf)
+	return err
+}
+
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqClearCPU() error {
+	buf := t.CopyScratchBuffer(8)
+	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
+	usermem.ByteOrder.PutUint32(buf, 0)                                   // CPUIDStart
+	usermem.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID
+	// N.B. This write is not atomic, but since this occurs on the task
+	// goroutine then as long as userspace uses a single-instruction read
+	// it can't see an invalid value.
+	_, err := t.CopyOutBytes(t.rseqAddr, buf)
 	return err
 }
 
+// rseqAddrInterrupt checks if IP is in a critical section, and aborts if so.
+//
+// This is a bit complex since both the RSeq and RSeqCriticalSection structs
+// are stored in userspace. So we must:
+//
+// 1. Copy in the address of RSeqCriticalSection from RSeq.
+// 2. Copy in RSeqCriticalSection itself.
+// 3. Validate critical section struct version, address range, abort address.
+// 4. Validate the abort signature (4 bytes preceding abort IP match expected
+//    signature).
+// 5. Clear address of RSeqCriticalSection from RSeq.
+// 6. Finally, conditionally abort.
+//
+// See kernel/rseq.c:rseq_ip_fixup for reference.
+//
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqAddrInterrupt() {
+	if t.rseqAddr == 0 {
+		return
+	}
+
+	critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection)
+	if !ok {
+		// SetRSeq should validate this.
+		panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr))
+	}
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		t.Debugf("Only 64-bit rseq supported.")
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	buf := t.CopyScratchBuffer(8)
+	if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil {
+		t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	critAddr := usermem.Addr(usermem.ByteOrder.Uint64(buf))
+	if critAddr == 0 {
+		return
+	}
+
+	buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection)
+	if _, err := t.CopyInBytes(critAddr, buf); err != nil {
+		t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	// Manually marshal RSeqCriticalSection as this is in the hot path when
+	// rseq is enabled. It must be as fast as possible.
+	//
+	// TODO(b/130243041): Replace with go_marshal.
+	cs := linux.RSeqCriticalSection{
+		Version:          usermem.ByteOrder.Uint32(buf[0:4]),
+		Flags:            usermem.ByteOrder.Uint32(buf[4:8]),
+		Start:            usermem.ByteOrder.Uint64(buf[8:16]),
+		PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]),
+		Abort:            usermem.ByteOrder.Uint64(buf[24:32]),
+	}
+
+	if cs.Version != 0 {
+		t.Debugf("Unknown version in %+v", cs)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	start := usermem.Addr(cs.Start)
+	critRange, ok := start.ToRange(cs.PostCommitOffset)
+	if !ok {
+		t.Debugf("Invalid start and offset in %+v", cs)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	abort := usermem.Addr(cs.Abort)
+	if critRange.Contains(abort) {
+		t.Debugf("Abort in critical section in %+v", cs)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	// Verify signature.
+	sigAddr := abort - linux.SizeOfRSeqSignature
+
+	buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature)
+	if _, err := t.CopyInBytes(sigAddr, buf); err != nil {
+		t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	sig := usermem.ByteOrder.Uint32(buf)
+	if sig != t.rseqSignature {
+		t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	// Clear the critical section address.
+	//
+	// NOTE(b/143949567): We don't support any rseq flags, so we always
+	// restart if we are in the critical section, and thus *always* clear
+	// critAddrAddr.
+	if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{
+		AddressSpaceActive: true,
+	}); err != nil {
+		t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	// Finally we can actually decide whether or not to restart.
+	if !critRange.Contains(usermem.Addr(t.Arch().IP())) {
+		return
+	}
+
+	t.Arch().SetIP(uintptr(cs.Abort))
+}
+
 // Preconditions: The caller must be running on the task goroutine.
-func (t *Task) rseqInterrupt() {
-	rscr := t.tg.rscr.Load().(*RSEQCriticalRegion)
-	if ip := t.Arch().IP(); rscr.CriticalSection.Contains(usermem.Addr(ip)) {
-		t.Debugf("Interrupted RSEQ critical section at %#x; restarting at %#x", ip, rscr.Restart)
-		t.Arch().SetIP(uintptr(rscr.Restart))
-		t.Arch().SetRSEQInterruptedIP(ip)
+func (t *Task) oldRSeqInterrupt() {
+	r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
+	if ip := t.Arch().IP(); r.CriticalSection.Contains(usermem.Addr(ip)) {
+		t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart)
+		t.Arch().SetIP(uintptr(r.Restart))
+		t.Arch().SetOldRSeqInterruptedIP(ip)
 	}
 }
+
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) rseqInterrupt() {
+	t.rseqAddrInterrupt()
+	t.oldRSeqInterrupt()
+}
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index ab0c6c4aa..d25a7903b 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -489,18 +489,43 @@ type Task struct {
 	// netns is protected by mu. netns is owned by the task goroutine.
 	netns bool
 
-	// If rseqPreempted is true, before the next call to p.Switch(), interrupt
-	// RSEQ critical regions as defined by tg.rseq and write the task
-	// goroutine's CPU number to rseqCPUAddr. rseqCPU is the last CPU number
-	// written to rseqCPUAddr.
+	// If rseqPreempted is true, before the next call to p.Switch(),
+	// interrupt rseq critical regions as defined by rseqAddr and
+	// tg.oldRSeqCritical and write the task goroutine's CPU number to
+	// rseqAddr/oldRSeqCPUAddr.
 	//
-	// If rseqCPUAddr is 0, rseqCPU is -1.
+	// We support two ABIs for restartable sequences:
 	//
-	// rseqCPUAddr, rseqCPU, and rseqPreempted are exclusive to the task
-	// goroutine.
+	//  1. The upstream interface added in v4.18,
+	//  2. An "old" interface never merged upstream. In the implementation,
+	//     this is referred to as "old rseq".
+	//
+	// rseqPreempted is exclusive to the task goroutine.
 	rseqPreempted bool `state:"nosave"`
-	rseqCPUAddr   usermem.Addr
-	rseqCPU       int32
+
+	// rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr.
+	//
+	// If rseq is unused, rseqCPU is -1 for convenient use in
+	// platform.Context.Switch.
+	//
+	// rseqCPU is exclusive to the task goroutine.
+	rseqCPU int32
+
+	// oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable.
+	//
+	// oldRSeqCPUAddr is exclusive to the task goroutine.
+	oldRSeqCPUAddr usermem.Addr
+
+	// rseqAddr is a pointer to the userspace linux.RSeq structure.
+	//
+	// rseqAddr is exclusive to the task goroutine.
+	rseqAddr usermem.Addr
+
+	// rseqSignature is the signature that the rseq abort IP must be signed
+	// with.
+	//
+	// rseqSignature is exclusive to the task goroutine.
+	rseqSignature uint32
 
 	// copyScratchBuffer is a buffer available to CopyIn/CopyOut
 	// implementations that require an intermediate buffer to copy data
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 5f3589493..247bd4aba 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -236,7 +236,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	} else if opts.NewPIDNamespace {
 		pidns = pidns.NewChild(userns)
 	}
+
 	tg := t.tg
+	rseqAddr := usermem.Addr(0)
+	rseqSignature := uint32(0)
 	if opts.NewThreadGroup {
 		tg.mounts.IncRef()
 		sh := t.tg.signalHandlers
@@ -244,6 +247,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 			sh = sh.Fork()
 		}
 		tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy())
+		rseqAddr = t.rseqAddr
+		rseqSignature = t.rseqSignature
 	}
 
 	cfg := &TaskConfig{
@@ -260,6 +265,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		UTSNamespace:            utsns,
 		IPCNamespace:            ipcns,
 		AbstractSocketNamespace: t.abstractSockets,
+		RSeqAddr:                rseqAddr,
+		RSeqSignature:           rseqSignature,
 		ContainerID:             t.ContainerID(),
 	}
 	if opts.NewThreadGroup {
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 90a6190f1..fa6528386 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -190,9 +190,11 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	t.updateRSSLocked()
 	// Restartable sequence state is discarded.
 	t.rseqPreempted = false
-	t.rseqCPUAddr = 0
 	t.rseqCPU = -1
-	t.tg.rscr.Store(&RSEQCriticalRegion{})
+	t.rseqAddr = 0
+	t.rseqSignature = 0
+	t.oldRSeqCPUAddr = 0
+	t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
 	t.tg.pidns.owner.mu.Unlock()
 
 	// Remove FDs with the CloseOnExec flag set.
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index d97f8c189..6357273d3 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -169,12 +169,22 @@ func (*runApp) execute(t *Task) taskRunState {
 	// Apply restartable sequences.
 	if t.rseqPreempted {
 		t.rseqPreempted = false
-		if t.rseqCPUAddr != 0 {
+		if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 {
+			// Linux writes the CPU on every preemption. We only do
+			// so if it changed. Thus we may delay delivery of
+			// SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid.
 			cpu := int32(hostcpu.GetCPU())
 			if t.rseqCPU != cpu {
 				t.rseqCPU = cpu
 				if err := t.rseqCopyOutCPU(); err != nil {
-					t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err)
+					t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
+					t.forceSignal(linux.SIGSEGV, false)
+					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+					// Re-enter the task run loop for signal delivery.
+					return (*runApp)(nil)
+				}
+				if err := t.oldRSeqCopyOutCPU(); err != nil {
+					t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err)
 					t.forceSignal(linux.SIGSEGV, false)
 					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
 					// Re-enter the task run loop for signal delivery.
@@ -320,7 +330,7 @@ func (*runApp) execute(t *Task) taskRunState {
 		return (*runApp)(nil)
 
 	case platform.ErrContextCPUPreempted:
-		// Ensure that RSEQ critical sections are interrupted and per-thread
+		// Ensure that rseq critical sections are interrupted and per-thread
 		// CPU values are updated before the next platform.Context.Switch().
 		t.rseqPreempted = true
 		return (*runApp)(nil)
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 3522a4ae5..58af16ee2 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -79,6 +80,13 @@ type TaskConfig struct {
 	// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
 	AbstractSocketNamespace *AbstractSocketNamespace
 
+	// RSeqAddr is a pointer to the the userspace linux.RSeq structure.
+	RSeqAddr usermem.Addr
+
+	// RSeqSignature is the signature that the rseq abort IP must be signed
+	// with.
+	RSeqSignature uint32
+
 	// ContainerID is the container the new task belongs to.
 	ContainerID string
 }
@@ -126,6 +134,8 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		ipcns:           cfg.IPCNamespace,
 		abstractSockets: cfg.AbstractSocketNamespace,
 		rseqCPU:         -1,
+		rseqAddr:        cfg.RSeqAddr,
+		rseqSignature:   cfg.RSeqSignature,
 		futexWaiter:     futex.NewWaiter(),
 		containerID:     cfg.ContainerID,
 	}
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 0cded73f6..c0197a563 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -238,8 +238,8 @@ type ThreadGroup struct {
 	// execed is protected by the TaskSet mutex.
 	execed bool
 
-	// rscr is the thread group's RSEQ critical region.
-	rscr atomic.Value `state:".(*RSEQCriticalRegion)"`
+	// oldRSeqCritical is the thread group's old rseq critical region.
+	oldRSeqCritical atomic.Value `state:".(*OldRSeqCriticalRegion)"`
 
 	// mounts is the thread group's mount namespace. This does not really
 	// correspond to a "mount namespace" in Linux, but is more like a
@@ -273,18 +273,18 @@ func (k *Kernel) NewThreadGroup(mntns *fs.MountNamespace, pidns *PIDNamespace, s
 	}
 	tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg})
 	tg.timers = make(map[linux.TimerID]*IntervalTimer)
-	tg.rscr.Store(&RSEQCriticalRegion{})
+	tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
 	return tg
 }
 
-// saveRscr is invoked by stateify.
-func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion {
-	return tg.rscr.Load().(*RSEQCriticalRegion)
+// saveOldRSeqCritical is invoked by stateify.
+func (tg *ThreadGroup) saveOldRSeqCritical() *OldRSeqCriticalRegion {
+	return tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
 }
 
-// loadRscr is invoked by stateify.
-func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) {
-	tg.rscr.Store(rscr)
+// loadOldRSeqCritical is invoked by stateify.
+func (tg *ThreadGroup) loadOldRSeqCritical(r *OldRSeqCriticalRegion) {
+	tg.oldRSeqCritical.Store(r)
 }
 
 // SignalHandlers returns the signal handlers used by tg.
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 6766ba587..a76975cee 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -30,6 +30,7 @@ go_library(
         "sys_random.go",
         "sys_read.go",
         "sys_rlimit.go",
+        "sys_rseq.go",
         "sys_rusage.go",
         "sys_sched.go",
         "sys_seccomp.go",
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 272ae9991..479c5f6ff 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -377,7 +377,7 @@ var AMD64 = &kernel.SyscallTable{
 		331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
 		332: syscalls.Supported("statx", Statx),
 		333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
-		334: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil),
+		334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
 
 		// Linux skips ahead to syscall 424 to sync numbers between arches.
 		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 3b584eed9..d3f61f5e8 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -307,7 +307,7 @@ var ARM64 = &kernel.SyscallTable{
 		290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
 		291: syscalls.Supported("statx", Statx),
 		292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
-		293: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil),
+		293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
 
 		// Linux skips ahead to syscall 424 to sync numbers between arches.
 		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_rseq.go b/pkg/sentry/syscalls/linux/sys_rseq.go
new file mode 100644
index 000000000..90db10ea6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_rseq.go
@@ -0,0 +1,48 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// RSeq implements syscall rseq(2).
+func RSeq(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].Uint()
+	flags := args[2].Int()
+	signature := args[3].Uint()
+
+	if !t.RSeqAvailable() {
+		// Event for applications that want rseq on a configuration
+		// that doesn't support them.
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, syserror.ENOSYS
+	}
+
+	switch flags {
+	case 0:
+		// Register.
+		return 0, nil, t.SetRSeq(addr, length, signature)
+	case linux.RSEQ_FLAG_UNREGISTER:
+		return 0, nil, t.ClearRSeq(addr, length, signature)
+	default:
+		// Unknown flag.
+		return 0, nil, syserror.EINVAL
+	}
+}
-- 
cgit v1.2.3


From 51f3ab85e024fcd74c49d273ce5202a207577d31 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Mon, 6 Jan 2020 12:51:35 -0800
Subject: Convert memfs into proto-tmpfs.

- Renamed memfs to tmpfs.
- Copied fileRangeSet bits from fs/fsutil/ to fsimpl/tmpfs/
- Changed tmpfs to be backed by filemem instead of byte slice.
- regularFileReadWriter uses a sync.Pool, similar to gofer client.

PiperOrigin-RevId: 288356380
---
 pkg/sentry/fs/fsutil/BUILD                   |   2 +-
 pkg/sentry/fs/fsutil/file_range_set.go       |  14 +-
 pkg/sentry/fsimpl/memfs/BUILD                |  80 ---
 pkg/sentry/fsimpl/memfs/benchmark_test.go    | 487 -------------------
 pkg/sentry/fsimpl/memfs/directory.go         | 187 -------
 pkg/sentry/fsimpl/memfs/filesystem.go        | 698 ---------------------------
 pkg/sentry/fsimpl/memfs/memfs.go             | 293 -----------
 pkg/sentry/fsimpl/memfs/named_pipe.go        |  60 ---
 pkg/sentry/fsimpl/memfs/pipe_test.go         | 235 ---------
 pkg/sentry/fsimpl/memfs/regular_file.go      | 154 ------
 pkg/sentry/fsimpl/memfs/symlink.go           |  36 --
 pkg/sentry/fsimpl/tmpfs/BUILD                |  92 ++++
 pkg/sentry/fsimpl/tmpfs/benchmark_test.go    | 487 +++++++++++++++++++
 pkg/sentry/fsimpl/tmpfs/directory.go         | 187 +++++++
 pkg/sentry/fsimpl/tmpfs/filesystem.go        | 698 +++++++++++++++++++++++++++
 pkg/sentry/fsimpl/tmpfs/named_pipe.go        |  60 +++
 pkg/sentry/fsimpl/tmpfs/pipe_test.go         | 235 +++++++++
 pkg/sentry/fsimpl/tmpfs/regular_file.go      | 357 ++++++++++++++
 pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 224 +++++++++
 pkg/sentry/fsimpl/tmpfs/symlink.go           |  36 ++
 pkg/sentry/fsimpl/tmpfs/tmpfs.go             | 299 ++++++++++++
 21 files changed, 2683 insertions(+), 2238 deletions(-)
 delete mode 100644 pkg/sentry/fsimpl/memfs/BUILD
 delete mode 100644 pkg/sentry/fsimpl/memfs/benchmark_test.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/directory.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/filesystem.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/memfs.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/named_pipe.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/pipe_test.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/regular_file.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/symlink.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/BUILD
 create mode 100644 pkg/sentry/fsimpl/tmpfs/benchmark_test.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/directory.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/filesystem.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/named_pipe.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/pipe_test.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/regular_file.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/regular_file_test.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/symlink.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/tmpfs.go

diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index b2e8d9c77..9ca695a95 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -53,7 +53,7 @@ go_template_instance(
         "Key": "uint64",
         "Range": "memmap.MappableRange",
         "Value": "uint64",
-        "Functions": "fileRangeSetFunctions",
+        "Functions": "FileRangeSetFunctions",
     },
 )
 
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
index 0a5466b0a..f52d712e3 100644
--- a/pkg/sentry/fs/fsutil/file_range_set.go
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -34,25 +34,25 @@ import (
 //
 // type FileRangeSet <generated by go_generics>
 
-// fileRangeSetFunctions implements segment.Functions for FileRangeSet.
-type fileRangeSetFunctions struct{}
+// FileRangeSetFunctions implements segment.Functions for FileRangeSet.
+type FileRangeSetFunctions struct{}
 
 // MinKey implements segment.Functions.MinKey.
-func (fileRangeSetFunctions) MinKey() uint64 {
+func (FileRangeSetFunctions) MinKey() uint64 {
 	return 0
 }
 
 // MaxKey implements segment.Functions.MaxKey.
-func (fileRangeSetFunctions) MaxKey() uint64 {
+func (FileRangeSetFunctions) MaxKey() uint64 {
 	return math.MaxUint64
 }
 
 // ClearValue implements segment.Functions.ClearValue.
-func (fileRangeSetFunctions) ClearValue(_ *uint64) {
+func (FileRangeSetFunctions) ClearValue(_ *uint64) {
 }
 
 // Merge implements segment.Functions.Merge.
-func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) {
+func (FileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) {
 	if frstart1+mr1.Length() != frstart2 {
 		return 0, false
 	}
@@ -60,7 +60,7 @@ func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _
 }
 
 // Split implements segment.Functions.Split.
-func (fileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) {
+func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) {
 	return frstart, frstart + (split - mr.Start)
 }
 
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
deleted file mode 100644
index 5689bed3b..000000000
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ /dev/null
@@ -1,80 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
-go_template_instance(
-    name = "dentry_list",
-    out = "dentry_list.go",
-    package = "memfs",
-    prefix = "dentry",
-    template = "//pkg/ilist:generic_list",
-    types = {
-        "Element": "*dentry",
-        "Linker": "*dentry",
-    },
-)
-
-go_library(
-    name = "memfs",
-    srcs = [
-        "dentry_list.go",
-        "directory.go",
-        "filesystem.go",
-        "memfs.go",
-        "named_pipe.go",
-        "regular_file.go",
-        "symlink.go",
-    ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs",
-    deps = [
-        "//pkg/abi/linux",
-        "//pkg/amutex",
-        "//pkg/fspath",
-        "//pkg/sentry/arch",
-        "//pkg/sentry/context",
-        "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/kernel/pipe",
-        "//pkg/sentry/usermem",
-        "//pkg/sentry/vfs",
-        "//pkg/syserror",
-    ],
-)
-
-go_test(
-    name = "benchmark_test",
-    size = "small",
-    srcs = ["benchmark_test.go"],
-    deps = [
-        ":memfs",
-        "//pkg/abi/linux",
-        "//pkg/fspath",
-        "//pkg/refs",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
-        "//pkg/sentry/fs",
-        "//pkg/sentry/fs/tmpfs",
-        "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/vfs",
-        "//pkg/syserror",
-    ],
-)
-
-go_test(
-    name = "memfs_test",
-    size = "small",
-    srcs = ["pipe_test.go"],
-    embed = [":memfs"],
-    deps = [
-        "//pkg/abi/linux",
-        "//pkg/fspath",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
-        "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/usermem",
-        "//pkg/sentry/vfs",
-        "//pkg/syserror",
-    ],
-)
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
deleted file mode 100644
index a27876a4e..000000000
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ /dev/null
@@ -1,487 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package benchmark_test
-
-import (
-	"fmt"
-	"runtime"
-	"strings"
-	"testing"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
-	"gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// Differences from stat_benchmark:
-//
-// - Syscall interception, CopyInPath, copyOutStat, and overlayfs overheads are
-// not included.
-//
-// - *MountStat benchmarks use a tmpfs root mount and a tmpfs submount at /tmp.
-// Non-MountStat benchmarks use a tmpfs root mount and no submounts.
-// stat_benchmark uses a varying root mount, a tmpfs submount at /tmp, and a
-// subdirectory /tmp/<top_dir> (assuming TEST_TMPDIR == "/tmp"). Thus
-// stat_benchmark at depth 1 does a comparable amount of work to *MountStat
-// benchmarks at depth 2, and non-MountStat benchmarks at depth 3.
-var depths = []int{1, 2, 3, 8, 64, 100}
-
-const (
-	mountPointName = "tmp"
-	filename       = "gvisor_test_temp_0_1557494568"
-)
-
-// This is copied from syscalls/linux/sys_file.go, with the dependency on
-// kernel.Task stripped out.
-func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error {
-	var (
-		d   *fs.Dirent // The file.
-		rel *fs.Dirent // The relative directory for search (if required.)
-		err error
-	)
-
-	// Extract the working directory (maybe).
-	if len(path) > 0 && path[0] == '/' {
-		// Absolute path; rel can be nil.
-	} else if dirFD == linux.AT_FDCWD {
-		// Need to reference the working directory.
-		rel = wd
-	} else {
-		// Need to extract the given FD.
-		return syserror.EBADF
-	}
-
-	// Lookup the node.
-	remainingTraversals := uint(linux.MaxSymlinkTraversals)
-	if resolve {
-		d, err = mntns.FindInode(ctx, root, rel, path, &remainingTraversals)
-	} else {
-		d, err = mntns.FindLink(ctx, root, rel, path, &remainingTraversals)
-	}
-	if err != nil {
-		return err
-	}
-
-	err = fn(root, d)
-	d.DecRef()
-	return err
-}
-
-func BenchmarkVFS1TmpfsStat(b *testing.B) {
-	for _, depth := range depths {
-		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
-			ctx := contexttest.Context(b)
-
-			// Create VFS.
-			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
-			if !ok {
-				b.Fatalf("failed to find tmpfs filesystem type")
-			}
-			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
-			if err != nil {
-				b.Fatalf("failed to create tmpfs root mount: %v", err)
-			}
-			mntns, err := fs.NewMountNamespace(ctx, rootInode)
-			if err != nil {
-				b.Fatalf("failed to create mount namespace: %v", err)
-			}
-			defer mntns.DecRef()
-
-			var filePathBuilder strings.Builder
-			filePathBuilder.WriteByte('/')
-
-			// Create nested directories with given depth.
-			root := mntns.Root()
-			defer root.DecRef()
-			d := root
-			d.IncRef()
-			defer d.DecRef()
-			for i := depth; i > 0; i-- {
-				name := fmt.Sprintf("%d", i)
-				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
-					b.Fatalf("failed to create directory %q: %v", name, err)
-				}
-				next, err := d.Walk(ctx, root, name)
-				if err != nil {
-					b.Fatalf("failed to walk to directory %q: %v", name, err)
-				}
-				d.DecRef()
-				d = next
-				filePathBuilder.WriteString(name)
-				filePathBuilder.WriteByte('/')
-			}
-
-			// Create the file that will be stat'd.
-			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
-			if err != nil {
-				b.Fatalf("failed to create file %q: %v", filename, err)
-			}
-			file.DecRef()
-			filePathBuilder.WriteString(filename)
-			filePath := filePathBuilder.String()
-
-			dirPath := false
-			runtime.GC()
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
-					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
-						return syserror.ENOTDIR
-					}
-					uattr, err := d.Inode.UnstableAttr(ctx)
-					if err != nil {
-						return err
-					}
-					// Sanity check.
-					if uattr.Perms.User.Execute {
-						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
-					}
-					return nil
-				})
-				if err != nil {
-					b.Fatalf("stat(%q) failed: %v", filePath, err)
-				}
-			}
-			// Don't include deferred cleanup in benchmark time.
-			b.StopTimer()
-		})
-	}
-}
-
-func BenchmarkVFS2MemfsStat(b *testing.B) {
-	for _, depth := range depths {
-		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
-			ctx := contexttest.Context(b)
-			creds := auth.CredentialsFromContext(ctx)
-
-			// Create VFS.
-			vfsObj := vfs.New()
-			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
-				AllowUserMount: true,
-			})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
-			if err != nil {
-				b.Fatalf("failed to create tmpfs root mount: %v", err)
-			}
-			defer mntns.DecRef(vfsObj)
-
-			var filePathBuilder strings.Builder
-			filePathBuilder.WriteByte('/')
-
-			// Create nested directories with given depth.
-			root := mntns.Root()
-			defer root.DecRef()
-			vd := root
-			vd.IncRef()
-			for i := depth; i > 0; i-- {
-				name := fmt.Sprintf("%d", i)
-				pop := vfs.PathOperation{
-					Root:  root,
-					Start: vd,
-					Path:  fspath.Parse(name),
-				}
-				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
-					Mode: 0755,
-				}); err != nil {
-					b.Fatalf("failed to create directory %q: %v", name, err)
-				}
-				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
-				if err != nil {
-					b.Fatalf("failed to walk to directory %q: %v", name, err)
-				}
-				vd.DecRef()
-				vd = nextVD
-				filePathBuilder.WriteString(name)
-				filePathBuilder.WriteByte('/')
-			}
-
-			// Create the file that will be stat'd.
-			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
-				Root:               root,
-				Start:              vd,
-				Path:               fspath.Parse(filename),
-				FollowFinalSymlink: true,
-			}, &vfs.OpenOptions{
-				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
-				Mode:  0644,
-			})
-			vd.DecRef()
-			vd = vfs.VirtualDentry{}
-			if err != nil {
-				b.Fatalf("failed to create file %q: %v", filename, err)
-			}
-			defer fd.DecRef()
-			filePathBuilder.WriteString(filename)
-			filePath := filePathBuilder.String()
-
-			runtime.GC()
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
-					Root:               root,
-					Start:              root,
-					Path:               fspath.Parse(filePath),
-					FollowFinalSymlink: true,
-				}, &vfs.StatOptions{})
-				if err != nil {
-					b.Fatalf("stat(%q) failed: %v", filePath, err)
-				}
-				// Sanity check.
-				if stat.Mode&^linux.S_IFMT != 0644 {
-					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
-				}
-			}
-			// Don't include deferred cleanup in benchmark time.
-			b.StopTimer()
-		})
-	}
-}
-
-func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
-	for _, depth := range depths {
-		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
-			ctx := contexttest.Context(b)
-
-			// Create VFS.
-			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
-			if !ok {
-				b.Fatalf("failed to find tmpfs filesystem type")
-			}
-			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
-			if err != nil {
-				b.Fatalf("failed to create tmpfs root mount: %v", err)
-			}
-			mntns, err := fs.NewMountNamespace(ctx, rootInode)
-			if err != nil {
-				b.Fatalf("failed to create mount namespace: %v", err)
-			}
-			defer mntns.DecRef()
-
-			var filePathBuilder strings.Builder
-			filePathBuilder.WriteByte('/')
-
-			// Create and mount the submount.
-			root := mntns.Root()
-			defer root.DecRef()
-			if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil {
-				b.Fatalf("failed to create mount point: %v", err)
-			}
-			mountPoint, err := root.Walk(ctx, root, mountPointName)
-			if err != nil {
-				b.Fatalf("failed to walk to mount point: %v", err)
-			}
-			defer mountPoint.DecRef()
-			submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
-			if err != nil {
-				b.Fatalf("failed to create tmpfs submount: %v", err)
-			}
-			if err := mntns.Mount(ctx, mountPoint, submountInode); err != nil {
-				b.Fatalf("failed to mount tmpfs submount: %v", err)
-			}
-			filePathBuilder.WriteString(mountPointName)
-			filePathBuilder.WriteByte('/')
-
-			// Create nested directories with given depth.
-			d, err := root.Walk(ctx, root, mountPointName)
-			if err != nil {
-				b.Fatalf("failed to walk to mount root: %v", err)
-			}
-			defer d.DecRef()
-			for i := depth; i > 0; i-- {
-				name := fmt.Sprintf("%d", i)
-				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
-					b.Fatalf("failed to create directory %q: %v", name, err)
-				}
-				next, err := d.Walk(ctx, root, name)
-				if err != nil {
-					b.Fatalf("failed to walk to directory %q: %v", name, err)
-				}
-				d.DecRef()
-				d = next
-				filePathBuilder.WriteString(name)
-				filePathBuilder.WriteByte('/')
-			}
-
-			// Create the file that will be stat'd.
-			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
-			if err != nil {
-				b.Fatalf("failed to create file %q: %v", filename, err)
-			}
-			file.DecRef()
-			filePathBuilder.WriteString(filename)
-			filePath := filePathBuilder.String()
-
-			dirPath := false
-			runtime.GC()
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
-					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
-						return syserror.ENOTDIR
-					}
-					uattr, err := d.Inode.UnstableAttr(ctx)
-					if err != nil {
-						return err
-					}
-					// Sanity check.
-					if uattr.Perms.User.Execute {
-						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
-					}
-					return nil
-				})
-				if err != nil {
-					b.Fatalf("stat(%q) failed: %v", filePath, err)
-				}
-			}
-			// Don't include deferred cleanup in benchmark time.
-			b.StopTimer()
-		})
-	}
-}
-
-func BenchmarkVFS2MemfsMountStat(b *testing.B) {
-	for _, depth := range depths {
-		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
-			ctx := contexttest.Context(b)
-			creds := auth.CredentialsFromContext(ctx)
-
-			// Create VFS.
-			vfsObj := vfs.New()
-			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
-				AllowUserMount: true,
-			})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
-			if err != nil {
-				b.Fatalf("failed to create tmpfs root mount: %v", err)
-			}
-			defer mntns.DecRef(vfsObj)
-
-			var filePathBuilder strings.Builder
-			filePathBuilder.WriteByte('/')
-
-			// Create the mount point.
-			root := mntns.Root()
-			defer root.DecRef()
-			pop := vfs.PathOperation{
-				Root:  root,
-				Start: root,
-				Path:  fspath.Parse(mountPointName),
-			}
-			if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
-				Mode: 0755,
-			}); err != nil {
-				b.Fatalf("failed to create mount point: %v", err)
-			}
-			// Save the mount point for later use.
-			mountPoint, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
-			if err != nil {
-				b.Fatalf("failed to walk to mount point: %v", err)
-			}
-			defer mountPoint.DecRef()
-			// Create and mount the submount.
-			if err := vfsObj.MountAt(ctx, creds, "", &pop, "memfs", &vfs.MountOptions{}); err != nil {
-				b.Fatalf("failed to mount tmpfs submount: %v", err)
-			}
-			filePathBuilder.WriteString(mountPointName)
-			filePathBuilder.WriteByte('/')
-
-			// Create nested directories with given depth.
-			vd, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
-			if err != nil {
-				b.Fatalf("failed to walk to mount root: %v", err)
-			}
-			for i := depth; i > 0; i-- {
-				name := fmt.Sprintf("%d", i)
-				pop := vfs.PathOperation{
-					Root:  root,
-					Start: vd,
-					Path:  fspath.Parse(name),
-				}
-				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
-					Mode: 0755,
-				}); err != nil {
-					b.Fatalf("failed to create directory %q: %v", name, err)
-				}
-				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
-				if err != nil {
-					b.Fatalf("failed to walk to directory %q: %v", name, err)
-				}
-				vd.DecRef()
-				vd = nextVD
-				filePathBuilder.WriteString(name)
-				filePathBuilder.WriteByte('/')
-			}
-
-			// Verify that we didn't create any directories under the mount
-			// point (i.e. they were all created on the submount).
-			firstDirName := fmt.Sprintf("%d", depth)
-			if child := mountPoint.Dentry().Child(firstDirName); child != nil {
-				b.Fatalf("created directory %q under root mount, not submount", firstDirName)
-			}
-
-			// Create the file that will be stat'd.
-			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
-				Root:               root,
-				Start:              vd,
-				Path:               fspath.Parse(filename),
-				FollowFinalSymlink: true,
-			}, &vfs.OpenOptions{
-				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
-				Mode:  0644,
-			})
-			vd.DecRef()
-			if err != nil {
-				b.Fatalf("failed to create file %q: %v", filename, err)
-			}
-			fd.DecRef()
-			filePathBuilder.WriteString(filename)
-			filePath := filePathBuilder.String()
-
-			runtime.GC()
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
-					Root:               root,
-					Start:              root,
-					Path:               fspath.Parse(filePath),
-					FollowFinalSymlink: true,
-				}, &vfs.StatOptions{})
-				if err != nil {
-					b.Fatalf("stat(%q) failed: %v", filePath, err)
-				}
-				// Sanity check.
-				if stat.Mode&^linux.S_IFMT != 0644 {
-					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
-				}
-			}
-			// Don't include deferred cleanup in benchmark time.
-			b.StopTimer()
-		})
-	}
-}
-
-func init() {
-	// Turn off reference leak checking for a fair comparison between vfs1 and
-	// vfs2.
-	refs.SetLeakMode(refs.NoLeakChecking)
-}
diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go
deleted file mode 100644
index 0bd82e480..000000000
--- a/pkg/sentry/fsimpl/memfs/directory.go
+++ /dev/null
@@ -1,187 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-type directory struct {
-	inode inode
-
-	// childList is a list containing (1) child Dentries and (2) fake Dentries
-	// (with inode == nil) that represent the iteration position of
-	// directoryFDs. childList is used to support directoryFD.IterDirents()
-	// efficiently. childList is protected by filesystem.mu.
-	childList dentryList
-}
-
-func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode {
-	dir := &directory{}
-	dir.inode.init(dir, fs, creds, mode)
-	dir.inode.nlink = 2 // from "." and parent directory or ".." for root
-	return &dir.inode
-}
-
-func (i *inode) isDir() bool {
-	_, ok := i.impl.(*directory)
-	return ok
-}
-
-type directoryFD struct {
-	fileDescription
-	vfs.DirectoryFileDescriptionDefaultImpl
-
-	// Protected by filesystem.mu.
-	iter *dentry
-	off  int64
-}
-
-// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *directoryFD) Release() {
-	if fd.iter != nil {
-		fs := fd.filesystem()
-		dir := fd.inode().impl.(*directory)
-		fs.mu.Lock()
-		dir.childList.Remove(fd.iter)
-		fs.mu.Unlock()
-		fd.iter = nil
-	}
-}
-
-// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
-func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
-	fs := fd.filesystem()
-	vfsd := fd.vfsfd.VirtualDentry().Dentry()
-
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-
-	if fd.off == 0 {
-		if !cb.Handle(vfs.Dirent{
-			Name:    ".",
-			Type:    linux.DT_DIR,
-			Ino:     vfsd.Impl().(*dentry).inode.ino,
-			NextOff: 1,
-		}) {
-			return nil
-		}
-		fd.off++
-	}
-	if fd.off == 1 {
-		parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
-		if !cb.Handle(vfs.Dirent{
-			Name:    "..",
-			Type:    parentInode.direntType(),
-			Ino:     parentInode.ino,
-			NextOff: 2,
-		}) {
-			return nil
-		}
-		fd.off++
-	}
-
-	dir := vfsd.Impl().(*dentry).inode.impl.(*directory)
-	var child *dentry
-	if fd.iter == nil {
-		// Start iteration at the beginning of dir.
-		child = dir.childList.Front()
-		fd.iter = &dentry{}
-	} else {
-		// Continue iteration from where we left off.
-		child = fd.iter.Next()
-		dir.childList.Remove(fd.iter)
-	}
-	for child != nil {
-		// Skip other directoryFD iterators.
-		if child.inode != nil {
-			if !cb.Handle(vfs.Dirent{
-				Name:    child.vfsd.Name(),
-				Type:    child.inode.direntType(),
-				Ino:     child.inode.ino,
-				NextOff: fd.off + 1,
-			}) {
-				dir.childList.InsertBefore(child, fd.iter)
-				return nil
-			}
-			fd.off++
-		}
-		child = child.Next()
-	}
-	dir.childList.PushBack(fd.iter)
-	return nil
-}
-
-// Seek implements vfs.FileDescriptionImpl.Seek.
-func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
-	fs := fd.filesystem()
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-
-	switch whence {
-	case linux.SEEK_SET:
-		// Use offset as given.
-	case linux.SEEK_CUR:
-		offset += fd.off
-	default:
-		return 0, syserror.EINVAL
-	}
-	if offset < 0 {
-		return 0, syserror.EINVAL
-	}
-
-	// If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't
-	// seek even if doing so might reposition the iterator due to concurrent
-	// mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek().
-	if fd.off == offset {
-		return offset, nil
-	}
-
-	fd.off = offset
-	// Compensate for "." and "..".
-	remChildren := int64(0)
-	if offset >= 2 {
-		remChildren = offset - 2
-	}
-
-	dir := fd.inode().impl.(*directory)
-
-	// Ensure that fd.iter exists and is not linked into dir.childList.
-	if fd.iter == nil {
-		fd.iter = &dentry{}
-	} else {
-		dir.childList.Remove(fd.iter)
-	}
-	// Insert fd.iter before the remChildren'th child, or at the end of the
-	// list if remChildren >= number of children.
-	child := dir.childList.Front()
-	for child != nil {
-		// Skip other directoryFD iterators.
-		if child.inode != nil {
-			if remChildren == 0 {
-				dir.childList.InsertBefore(child, fd.iter)
-				return offset, nil
-			}
-			remChildren--
-		}
-		child = child.Next()
-	}
-	dir.childList.PushBack(fd.iter)
-	return offset, nil
-}
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
deleted file mode 100644
index b063e09a3..000000000
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ /dev/null
@@ -1,698 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"fmt"
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// Sync implements vfs.FilesystemImpl.Sync.
-func (fs *filesystem) Sync(ctx context.Context) error {
-	// All filesystem state is in-memory.
-	return nil
-}
-
-// stepLocked resolves rp.Component() to an existing file, starting from the
-// given directory.
-//
-// stepLocked is loosely analogous to fs/namei.c:walk_component().
-//
-// Preconditions: filesystem.mu must be locked. !rp.Done().
-func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
-	if !d.inode.isDir() {
-		return nil, syserror.ENOTDIR
-	}
-	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
-		return nil, err
-	}
-afterSymlink:
-	nextVFSD, err := rp.ResolveComponent(&d.vfsd)
-	if err != nil {
-		return nil, err
-	}
-	if nextVFSD == nil {
-		// Since the Dentry tree is the sole source of truth for memfs, if it's
-		// not in the Dentry tree, it doesn't exist.
-		return nil, syserror.ENOENT
-	}
-	next := nextVFSD.Impl().(*dentry)
-	if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
-		// TODO: symlink traversals update access time
-		if err := rp.HandleSymlink(symlink.target); err != nil {
-			return nil, err
-		}
-		goto afterSymlink // don't check the current directory again
-	}
-	rp.Advance()
-	return next, nil
-}
-
-// walkParentDirLocked resolves all but the last path component of rp to an
-// existing directory, starting from the given directory (which is usually
-// rp.Start().Impl().(*dentry)). It does not check that the returned directory
-// is searchable by the provider of rp.
-//
-// walkParentDirLocked is loosely analogous to Linux's
-// fs/namei.c:path_parentat().
-//
-// Preconditions: filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
-	for !rp.Final() {
-		next, err := stepLocked(rp, d)
-		if err != nil {
-			return nil, err
-		}
-		d = next
-	}
-	if !d.inode.isDir() {
-		return nil, syserror.ENOTDIR
-	}
-	return d, nil
-}
-
-// resolveLocked resolves rp to an existing file.
-//
-// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
-//
-// Preconditions: filesystem.mu must be locked.
-func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
-	d := rp.Start().Impl().(*dentry)
-	for !rp.Done() {
-		next, err := stepLocked(rp, d)
-		if err != nil {
-			return nil, err
-		}
-		d = next
-	}
-	if rp.MustBeDir() && !d.inode.isDir() {
-		return nil, syserror.ENOTDIR
-	}
-	return d, nil
-}
-
-// doCreateAt checks that creating a file at rp is permitted, then invokes
-// create to do so.
-//
-// doCreateAt is loosely analogous to a conjunction of Linux's
-// fs/namei.c:filename_create() and done_path_create().
-//
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
-	if err != nil {
-		return err
-	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
-		return err
-	}
-	name := rp.Component()
-	if name == "." || name == ".." {
-		return syserror.EEXIST
-	}
-	// Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
-	// because if the child exists we want to return EEXIST immediately instead
-	// of attempting symlink/mount traversal.
-	if parent.vfsd.Child(name) != nil {
-		return syserror.EEXIST
-	}
-	if !dir && rp.MustBeDir() {
-		return syserror.ENOENT
-	}
-	// In memfs, the only way to cause a dentry to be disowned is by removing
-	// it from the filesystem, so this check is equivalent to checking if
-	// parent has been removed.
-	if parent.vfsd.IsDisowned() {
-		return syserror.ENOENT
-	}
-	mnt := rp.Mount()
-	if err := mnt.CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer mnt.EndWrite()
-	return create(parent, name)
-}
-
-// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
-func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	d, err := resolveLocked(rp)
-	if err != nil {
-		return nil, err
-	}
-	if opts.CheckSearchable {
-		if !d.inode.isDir() {
-			return nil, syserror.ENOTDIR
-		}
-		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil {
-			return nil, err
-		}
-	}
-	d.IncRef()
-	return &d.vfsd, nil
-}
-
-// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
-func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
-	if err != nil {
-		return nil, err
-	}
-	d.IncRef()
-	return &d.vfsd, nil
-}
-
-// LinkAt implements vfs.FilesystemImpl.LinkAt.
-func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
-	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
-		if rp.Mount() != vd.Mount() {
-			return syserror.EXDEV
-		}
-		d := vd.Dentry().Impl().(*dentry)
-		if d.inode.isDir() {
-			return syserror.EPERM
-		}
-		if d.inode.nlink == 0 {
-			return syserror.ENOENT
-		}
-		if d.inode.nlink == maxLinks {
-			return syserror.EMLINK
-		}
-		d.inode.incLinksLocked()
-		child := fs.newDentry(d.inode)
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
-		return nil
-	})
-}
-
-// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
-func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
-	return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error {
-		if parent.inode.nlink == maxLinks {
-			return syserror.EMLINK
-		}
-		parent.inode.incLinksLocked() // from child's ".."
-		child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
-		return nil
-	})
-}
-
-// MknodAt implements vfs.FilesystemImpl.MknodAt.
-func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
-	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
-		switch opts.Mode.FileType() {
-		case 0, linux.S_IFREG:
-			child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
-			parent.vfsd.InsertChild(&child.vfsd, name)
-			parent.inode.impl.(*directory).childList.PushBack(child)
-			return nil
-		case linux.S_IFIFO:
-			child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
-			parent.vfsd.InsertChild(&child.vfsd, name)
-			parent.inode.impl.(*directory).childList.PushBack(child)
-			return nil
-		case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK:
-			// Not yet supported.
-			return syserror.EPERM
-		default:
-			return syserror.EINVAL
-		}
-	})
-}
-
-// OpenAt implements vfs.FilesystemImpl.OpenAt.
-func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	if opts.Flags&linux.O_TMPFILE != 0 {
-		// Not yet supported.
-		return nil, syserror.EOPNOTSUPP
-	}
-
-	// Handle O_CREAT and !O_CREAT separately, since in the latter case we
-	// don't need fs.mu for writing.
-	if opts.Flags&linux.O_CREAT == 0 {
-		fs.mu.RLock()
-		defer fs.mu.RUnlock()
-		d, err := resolveLocked(rp)
-		if err != nil {
-			return nil, err
-		}
-		return d.open(ctx, rp, opts.Flags, false /* afterCreate */)
-	}
-
-	mustCreate := opts.Flags&linux.O_EXCL != 0
-	start := rp.Start().Impl().(*dentry)
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	if rp.Done() {
-		// Reject attempts to open directories with O_CREAT.
-		if rp.MustBeDir() {
-			return nil, syserror.EISDIR
-		}
-		if mustCreate {
-			return nil, syserror.EEXIST
-		}
-		return start.open(ctx, rp, opts.Flags, false /* afterCreate */)
-	}
-afterTrailingSymlink:
-	parent, err := walkParentDirLocked(rp, start)
-	if err != nil {
-		return nil, err
-	}
-	// Check for search permission in the parent directory.
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
-		return nil, err
-	}
-	// Reject attempts to open directories with O_CREAT.
-	if rp.MustBeDir() {
-		return nil, syserror.EISDIR
-	}
-	name := rp.Component()
-	if name == "." || name == ".." {
-		return nil, syserror.EISDIR
-	}
-	// Determine whether or not we need to create a file.
-	child, err := stepLocked(rp, parent)
-	if err == syserror.ENOENT {
-		// Already checked for searchability above; now check for writability.
-		if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
-			return nil, err
-		}
-		if err := rp.Mount().CheckBeginWrite(); err != nil {
-			return nil, err
-		}
-		defer rp.Mount().EndWrite()
-		// Create and open the child.
-		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
-		return child.open(ctx, rp, opts.Flags, true)
-	}
-	if err != nil {
-		return nil, err
-	}
-	// Do we need to resolve a trailing symlink?
-	if !rp.Done() {
-		start = parent
-		goto afterTrailingSymlink
-	}
-	// Open existing file.
-	if mustCreate {
-		return nil, syserror.EEXIST
-	}
-	return child.open(ctx, rp, opts.Flags, false)
-}
-
-func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
-	ats := vfs.AccessTypesForOpenFlags(flags)
-	if !afterCreate {
-		if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
-			return nil, err
-		}
-	}
-	mnt := rp.Mount()
-	switch impl := d.inode.impl.(type) {
-	case *regularFile:
-		var fd regularFileFD
-		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
-		fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
-		if fd.writable {
-			if err := mnt.CheckBeginWrite(); err != nil {
-				return nil, err
-			}
-			// mnt.EndWrite() is called by regularFileFD.Release().
-		}
-		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
-		if flags&linux.O_TRUNC != 0 {
-			impl.mu.Lock()
-			impl.data = impl.data[:0]
-			atomic.StoreInt64(&impl.dataLen, 0)
-			impl.mu.Unlock()
-		}
-		return &fd.vfsfd, nil
-	case *directory:
-		// Can't open directories writably.
-		if ats&vfs.MayWrite != 0 {
-			return nil, syserror.EISDIR
-		}
-		var fd directoryFD
-		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
-		return &fd.vfsfd, nil
-	case *symlink:
-		// Can't open symlinks without O_PATH (which is unimplemented).
-		return nil, syserror.ELOOP
-	case *namedPipe:
-		return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags)
-	default:
-		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
-	}
-}
-
-// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
-func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	d, err := resolveLocked(rp)
-	if err != nil {
-		return "", err
-	}
-	symlink, ok := d.inode.impl.(*symlink)
-	if !ok {
-		return "", syserror.EINVAL
-	}
-	return symlink.target, nil
-}
-
-// RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
-	if opts.Flags != 0 {
-		// TODO(b/145974740): Support renameat2 flags.
-		return syserror.EINVAL
-	}
-
-	// Resolve newParent first to verify that it's on this Mount.
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
-	if err != nil {
-		return err
-	}
-	newName := rp.Component()
-	if newName == "." || newName == ".." {
-		return syserror.EBUSY
-	}
-	mnt := rp.Mount()
-	if mnt != oldParentVD.Mount() {
-		return syserror.EXDEV
-	}
-	if err := mnt.CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer mnt.EndWrite()
-
-	oldParent := oldParentVD.Dentry().Impl().(*dentry)
-	if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
-		return err
-	}
-	// Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
-	// because if the existing child is a symlink or mount point then we want
-	// to rename over it rather than follow it.
-	renamedVFSD := oldParent.vfsd.Child(oldName)
-	if renamedVFSD == nil {
-		return syserror.ENOENT
-	}
-	renamed := renamedVFSD.Impl().(*dentry)
-	if renamed.inode.isDir() {
-		if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) {
-			return syserror.EINVAL
-		}
-		if oldParent != newParent {
-			// Writability is needed to change renamed's "..".
-			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil {
-				return err
-			}
-		}
-	} else {
-		if opts.MustBeDir || rp.MustBeDir() {
-			return syserror.ENOTDIR
-		}
-	}
-
-	if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
-		return err
-	}
-	replacedVFSD := newParent.vfsd.Child(newName)
-	var replaced *dentry
-	if replacedVFSD != nil {
-		replaced = replacedVFSD.Impl().(*dentry)
-		if replaced.inode.isDir() {
-			if !renamed.inode.isDir() {
-				return syserror.EISDIR
-			}
-			if replaced.vfsd.HasChildren() {
-				return syserror.ENOTEMPTY
-			}
-		} else {
-			if rp.MustBeDir() {
-				return syserror.ENOTDIR
-			}
-			if renamed.inode.isDir() {
-				return syserror.ENOTDIR
-			}
-		}
-	} else {
-		if renamed.inode.isDir() && newParent.inode.nlink == maxLinks {
-			return syserror.EMLINK
-		}
-	}
-	if newParent.vfsd.IsDisowned() {
-		return syserror.ENOENT
-	}
-
-	// Linux places this check before some of those above; we do it here for
-	// simplicity, under the assumption that applications are not intentionally
-	// doing noop renames expecting them to succeed where non-noop renames
-	// would fail.
-	if renamedVFSD == replacedVFSD {
-		return nil
-	}
-	vfsObj := rp.VirtualFilesystem()
-	oldParentDir := oldParent.inode.impl.(*directory)
-	newParentDir := newParent.inode.impl.(*directory)
-	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil {
-		return err
-	}
-	if replaced != nil {
-		newParentDir.childList.Remove(replaced)
-		if replaced.inode.isDir() {
-			newParent.inode.decLinksLocked() // from replaced's ".."
-		}
-		replaced.inode.decLinksLocked()
-	}
-	oldParentDir.childList.Remove(renamed)
-	newParentDir.childList.PushBack(renamed)
-	if renamed.inode.isDir() {
-		oldParent.inode.decLinksLocked()
-		newParent.inode.incLinksLocked()
-	}
-	// TODO: update timestamps and parent directory sizes
-	vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
-	return nil
-}
-
-// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
-func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
-	if err != nil {
-		return err
-	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
-		return err
-	}
-	name := rp.Component()
-	if name == "." {
-		return syserror.EINVAL
-	}
-	if name == ".." {
-		return syserror.ENOTEMPTY
-	}
-	childVFSD := parent.vfsd.Child(name)
-	if childVFSD == nil {
-		return syserror.ENOENT
-	}
-	child := childVFSD.Impl().(*dentry)
-	if !child.inode.isDir() {
-		return syserror.ENOTDIR
-	}
-	if childVFSD.HasChildren() {
-		return syserror.ENOTEMPTY
-	}
-	mnt := rp.Mount()
-	if err := mnt.CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer mnt.EndWrite()
-	vfsObj := rp.VirtualFilesystem()
-	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
-		return err
-	}
-	parent.inode.impl.(*directory).childList.Remove(child)
-	parent.inode.decLinksLocked() // from child's ".."
-	child.inode.decLinksLocked()
-	vfsObj.CommitDeleteDentry(childVFSD)
-	return nil
-}
-
-// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
-func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
-	if err != nil {
-		return err
-	}
-	if opts.Stat.Mask == 0 {
-		return nil
-	}
-	// TODO: implement inode.setStat
-	return syserror.EPERM
-}
-
-// StatAt implements vfs.FilesystemImpl.StatAt.
-func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	d, err := resolveLocked(rp)
-	if err != nil {
-		return linux.Statx{}, err
-	}
-	var stat linux.Statx
-	d.inode.statTo(&stat)
-	return stat, nil
-}
-
-// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
-func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
-	if err != nil {
-		return linux.Statfs{}, err
-	}
-	// TODO: actually implement statfs
-	return linux.Statfs{}, syserror.ENOSYS
-}
-
-// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
-func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
-	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
-		child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
-		return nil
-	})
-}
-
-// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
-func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
-	if err != nil {
-		return err
-	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
-		return err
-	}
-	name := rp.Component()
-	if name == "." || name == ".." {
-		return syserror.EISDIR
-	}
-	childVFSD := parent.vfsd.Child(name)
-	if childVFSD == nil {
-		return syserror.ENOENT
-	}
-	child := childVFSD.Impl().(*dentry)
-	if child.inode.isDir() {
-		return syserror.EISDIR
-	}
-	if !rp.MustBeDir() {
-		return syserror.ENOTDIR
-	}
-	mnt := rp.Mount()
-	if err := mnt.CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer mnt.EndWrite()
-	vfsObj := rp.VirtualFilesystem()
-	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
-		return err
-	}
-	parent.inode.impl.(*directory).childList.Remove(child)
-	child.inode.decLinksLocked()
-	vfsObj.CommitDeleteDentry(childVFSD)
-	return nil
-}
-
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
-	if err != nil {
-		return nil, err
-	}
-	// TODO(b/127675828): support extended attributes
-	return nil, syserror.ENOTSUP
-}
-
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
-	if err != nil {
-		return "", err
-	}
-	// TODO(b/127675828): support extended attributes
-	return "", syserror.ENOTSUP
-}
-
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
-	if err != nil {
-		return err
-	}
-	// TODO(b/127675828): support extended attributes
-	return syserror.ENOTSUP
-}
-
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
-	if err != nil {
-		return err
-	}
-	// TODO(b/127675828): support extended attributes
-	return syserror.ENOTSUP
-}
-
-// PrependPath implements vfs.FilesystemImpl.PrependPath.
-func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	return vfs.GenericPrependPath(vfsroot, vd, b)
-}
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
deleted file mode 100644
index 8d0167c93..000000000
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ /dev/null
@@ -1,293 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package memfs provides a filesystem implementation that behaves like tmpfs:
-// the Dentry tree is the sole source of truth for the state of the filesystem.
-//
-// memfs is intended primarily to demonstrate filesystem implementation
-// patterns. Real uses cases for an in-memory filesystem should use tmpfs
-// instead.
-//
-// Lock order:
-//
-// filesystem.mu
-//   regularFileFD.offMu
-//     regularFile.mu
-//   inode.mu
-package memfs
-
-import (
-	"fmt"
-	"math"
-	"sync"
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// FilesystemType implements vfs.FilesystemType.
-type FilesystemType struct{}
-
-// filesystem implements vfs.FilesystemImpl.
-type filesystem struct {
-	vfsfs vfs.Filesystem
-
-	// mu serializes changes to the Dentry tree.
-	mu sync.RWMutex
-
-	nextInoMinusOne uint64 // accessed using atomic memory operations
-}
-
-// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
-func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	var fs filesystem
-	fs.vfsfs.Init(vfsObj, &fs)
-	root := fs.newDentry(fs.newDirectory(creds, 01777))
-	return &fs.vfsfs, &root.vfsd, nil
-}
-
-// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
-}
-
-// dentry implements vfs.DentryImpl.
-type dentry struct {
-	vfsd vfs.Dentry
-
-	// inode is the inode represented by this dentry. Multiple Dentries may
-	// share a single non-directory inode (with hard links). inode is
-	// immutable.
-	inode *inode
-
-	// memfs doesn't count references on dentries; because the dentry tree is
-	// the sole source of truth, it is by definition always consistent with the
-	// state of the filesystem. However, it does count references on inodes,
-	// because inode resources are released when all references are dropped.
-	// (memfs doesn't really have resources to release, but we implement
-	// reference counting because tmpfs regular files will.)
-
-	// dentryEntry (ugh) links dentries into their parent directory.childList.
-	dentryEntry
-}
-
-func (fs *filesystem) newDentry(inode *inode) *dentry {
-	d := &dentry{
-		inode: inode,
-	}
-	d.vfsd.Init(d)
-	return d
-}
-
-// IncRef implements vfs.DentryImpl.IncRef.
-func (d *dentry) IncRef() {
-	d.inode.incRef()
-}
-
-// TryIncRef implements vfs.DentryImpl.TryIncRef.
-func (d *dentry) TryIncRef() bool {
-	return d.inode.tryIncRef()
-}
-
-// DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef() {
-	d.inode.decRef()
-}
-
-// inode represents a filesystem object.
-type inode struct {
-	// refs is a reference count. refs is accessed using atomic memory
-	// operations.
-	//
-	// A reference is held on all inodes that are reachable in the filesystem
-	// tree. For non-directories (which may have multiple hard links), this
-	// means that a reference is dropped when nlink reaches 0. For directories,
-	// nlink never reaches 0 due to the "." entry; instead,
-	// filesystem.RmdirAt() drops the reference.
-	refs int64
-
-	// Inode metadata; protected by mu and accessed using atomic memory
-	// operations unless otherwise specified.
-	mu    sync.RWMutex
-	mode  uint32 // excluding file type bits, which are based on impl
-	nlink uint32 // protected by filesystem.mu instead of inode.mu
-	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
-	gid   uint32 // auth.KGID, but ...
-	ino   uint64 // immutable
-
-	impl interface{} // immutable
-}
-
-const maxLinks = math.MaxUint32
-
-func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
-	i.refs = 1
-	i.mode = uint32(mode)
-	i.uid = uint32(creds.EffectiveKUID)
-	i.gid = uint32(creds.EffectiveKGID)
-	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
-	// i.nlink initialized by caller
-	i.impl = impl
-}
-
-// incLinksLocked increments i's link count.
-//
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
-// i.nlink < maxLinks.
-func (i *inode) incLinksLocked() {
-	if i.nlink == 0 {
-		panic("memfs.inode.incLinksLocked() called with no existing links")
-	}
-	if i.nlink == maxLinks {
-		panic("memfs.inode.incLinksLocked() called with maximum link count")
-	}
-	atomic.AddUint32(&i.nlink, 1)
-}
-
-// decLinksLocked decrements i's link count.
-//
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
-func (i *inode) decLinksLocked() {
-	if i.nlink == 0 {
-		panic("memfs.inode.decLinksLocked() called with no existing links")
-	}
-	atomic.AddUint32(&i.nlink, ^uint32(0))
-}
-
-func (i *inode) incRef() {
-	if atomic.AddInt64(&i.refs, 1) <= 1 {
-		panic("memfs.inode.incRef() called without holding a reference")
-	}
-}
-
-func (i *inode) tryIncRef() bool {
-	for {
-		refs := atomic.LoadInt64(&i.refs)
-		if refs == 0 {
-			return false
-		}
-		if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) {
-			return true
-		}
-	}
-}
-
-func (i *inode) decRef() {
-	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
-		// This is unnecessary; it's mostly to simulate what tmpfs would do.
-		if regfile, ok := i.impl.(*regularFile); ok {
-			regfile.mu.Lock()
-			regfile.data = nil
-			atomic.StoreInt64(&regfile.dataLen, 0)
-			regfile.mu.Unlock()
-		}
-	} else if refs < 0 {
-		panic("memfs.inode.decRef() called without holding a reference")
-	}
-}
-
-func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
-	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
-}
-
-// Go won't inline this function, and returning linux.Statx (which is quite
-// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
-// output parameter.
-func (i *inode) statTo(stat *linux.Statx) {
-	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
-	stat.Blksize = 1 // usermem.PageSize in tmpfs
-	stat.Nlink = atomic.LoadUint32(&i.nlink)
-	stat.UID = atomic.LoadUint32(&i.uid)
-	stat.GID = atomic.LoadUint32(&i.gid)
-	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
-	stat.Ino = i.ino
-	// TODO: device number
-	switch impl := i.impl.(type) {
-	case *regularFile:
-		stat.Mode |= linux.S_IFREG
-		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
-		stat.Size = uint64(atomic.LoadInt64(&impl.dataLen))
-		// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
-		// a uint64 accessed using atomic memory operations to avoid taking
-		// locks).
-		stat.Blocks = allocatedBlocksForSize(stat.Size)
-	case *directory:
-		stat.Mode |= linux.S_IFDIR
-	case *symlink:
-		stat.Mode |= linux.S_IFLNK
-		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
-		stat.Size = uint64(len(impl.target))
-		stat.Blocks = allocatedBlocksForSize(stat.Size)
-	case *namedPipe:
-		stat.Mode |= linux.S_IFIFO
-	default:
-		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
-	}
-}
-
-// allocatedBlocksForSize returns the number of 512B blocks needed to
-// accommodate the given size in bytes, as appropriate for struct
-// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
-// size is independent of the "preferred block size for I/O", struct
-// stat::st_blksize and struct statx::stx_blksize.)
-func allocatedBlocksForSize(size uint64) uint64 {
-	return (size + 511) / 512
-}
-
-func (i *inode) direntType() uint8 {
-	switch i.impl.(type) {
-	case *regularFile:
-		return linux.DT_REG
-	case *directory:
-		return linux.DT_DIR
-	case *symlink:
-		return linux.DT_LNK
-	default:
-		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
-	}
-}
-
-// fileDescription is embedded by memfs implementations of
-// vfs.FileDescriptionImpl.
-type fileDescription struct {
-	vfsfd vfs.FileDescription
-	vfs.FileDescriptionDefaultImpl
-}
-
-func (fd *fileDescription) filesystem() *filesystem {
-	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
-}
-
-func (fd *fileDescription) inode() *inode {
-	return fd.vfsfd.Dentry().Impl().(*dentry).inode
-}
-
-// Stat implements vfs.FileDescriptionImpl.Stat.
-func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
-	var stat linux.Statx
-	fd.inode().statTo(&stat)
-	return stat, nil
-}
-
-// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	if opts.Stat.Mask == 0 {
-		return nil
-	}
-	// TODO: implement inode.setStat
-	return syserror.EPERM
-}
diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go
deleted file mode 100644
index b5a204438..000000000
--- a/pkg/sentry/fsimpl/memfs/named_pipe.go
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-)
-
-type namedPipe struct {
-	inode inode
-
-	pipe *pipe.VFSPipe
-}
-
-// Preconditions:
-//   * fs.mu must be locked.
-//   * rp.Mount().CheckBeginWrite() has been called successfully.
-func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
-	file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)}
-	file.inode.init(file, fs, creds, mode)
-	file.inode.nlink = 1 // Only the parent has a link.
-	return &file.inode
-}
-
-// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented
-// entirely via struct embedding.
-type namedPipeFD struct {
-	fileDescription
-
-	*pipe.VFSPipeFD
-}
-
-func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
-	var err error
-	var fd namedPipeFD
-	fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags)
-	if err != nil {
-		return nil, err
-	}
-	mnt := rp.Mount()
-	fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
-	return &fd.vfsfd, nil
-}
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go
deleted file mode 100644
index 807c1af7a..000000000
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ /dev/null
@@ -1,235 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"bytes"
-	"testing"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-const fileName = "mypipe"
-
-func TestSeparateFDs(t *testing.T) {
-	ctx, creds, vfsObj, root := setup(t)
-	defer root.DecRef()
-
-	// Open the read side. This is done in a concurrently because opening
-	// One end the pipe blocks until the other end is opened.
-	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Path:               fspath.Parse(fileName),
-		FollowFinalSymlink: true,
-	}
-	rfdchan := make(chan *vfs.FileDescription)
-	go func() {
-		openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY}
-		rfd, _ := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
-		rfdchan <- rfd
-	}()
-
-	// Open the write side.
-	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY}
-	wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
-	if err != nil {
-		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
-	}
-	defer wfd.DecRef()
-
-	rfd, ok := <-rfdchan
-	if !ok {
-		t.Fatalf("failed to open pipe for reading %q", fileName)
-	}
-	defer rfd.DecRef()
-
-	const msg = "vamos azul"
-	checkEmpty(ctx, t, rfd)
-	checkWrite(ctx, t, wfd, msg)
-	checkRead(ctx, t, rfd, msg)
-}
-
-func TestNonblockingRead(t *testing.T) {
-	ctx, creds, vfsObj, root := setup(t)
-	defer root.DecRef()
-
-	// Open the read side as nonblocking.
-	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Path:               fspath.Parse(fileName),
-		FollowFinalSymlink: true,
-	}
-	openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK}
-	rfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
-	if err != nil {
-		t.Fatalf("failed to open pipe for reading %q: %v", fileName, err)
-	}
-	defer rfd.DecRef()
-
-	// Open the write side.
-	openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY}
-	wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
-	if err != nil {
-		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
-	}
-	defer wfd.DecRef()
-
-	const msg = "geh blau"
-	checkEmpty(ctx, t, rfd)
-	checkWrite(ctx, t, wfd, msg)
-	checkRead(ctx, t, rfd, msg)
-}
-
-func TestNonblockingWriteError(t *testing.T) {
-	ctx, creds, vfsObj, root := setup(t)
-	defer root.DecRef()
-
-	// Open the write side as nonblocking, which should return ENXIO.
-	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Path:               fspath.Parse(fileName),
-		FollowFinalSymlink: true,
-	}
-	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK}
-	_, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
-	if err != syserror.ENXIO {
-		t.Fatalf("expected ENXIO, but got error: %v", err)
-	}
-}
-
-func TestSingleFD(t *testing.T) {
-	ctx, creds, vfsObj, root := setup(t)
-	defer root.DecRef()
-
-	// Open the pipe as readable and writable.
-	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Path:               fspath.Parse(fileName),
-		FollowFinalSymlink: true,
-	}
-	openOpts := vfs.OpenOptions{Flags: linux.O_RDWR}
-	fd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
-	if err != nil {
-		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
-	}
-	defer fd.DecRef()
-
-	const msg = "forza blu"
-	checkEmpty(ctx, t, fd)
-	checkWrite(ctx, t, fd, msg)
-	checkRead(ctx, t, fd, msg)
-}
-
-// setup creates a VFS with a pipe in the root directory at path fileName. The
-// returned VirtualDentry must be DecRef()'d be the caller. It calls t.Fatal
-// upon failure.
-func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry) {
-	ctx := contexttest.Context(t)
-	creds := auth.CredentialsFromContext(ctx)
-
-	// Create VFS.
-	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
-		AllowUserMount: true,
-	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
-	if err != nil {
-		t.Fatalf("failed to create tmpfs root mount: %v", err)
-	}
-
-	// Create the pipe.
-	root := mntns.Root()
-	pop := vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(fileName),
-	}
-	mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644}
-	if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil {
-		t.Fatalf("failed to create file %q: %v", fileName, err)
-	}
-
-	// Sanity check: the file pipe exists and has the correct mode.
-	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Path:               fspath.Parse(fileName),
-		FollowFinalSymlink: true,
-	}, &vfs.StatOptions{})
-	if err != nil {
-		t.Fatalf("stat(%q) failed: %v", fileName, err)
-	}
-	if stat.Mode&^linux.S_IFMT != 0644 {
-		t.Errorf("got wrong permissions (%0o)", stat.Mode)
-	}
-	if stat.Mode&linux.S_IFMT != linux.ModeNamedPipe {
-		t.Errorf("got wrong file type (%0o)", stat.Mode)
-	}
-
-	return ctx, creds, vfsObj, root
-}
-
-// checkEmpty calls t.Fatal if the pipe in fd is not empty.
-func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
-	readData := make([]byte, 1)
-	dst := usermem.BytesIOSequence(readData)
-	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
-	if err != syserror.ErrWouldBlock {
-		t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err)
-	}
-	if bytesRead != 0 {
-		t.Fatalf("expected to read 0 bytes, but got %d", bytesRead)
-	}
-}
-
-// checkWrite calls t.Fatal if it fails to write all of msg to fd.
-func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
-	writeData := []byte(msg)
-	src := usermem.BytesIOSequence(writeData)
-	bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{})
-	if err != nil {
-		t.Fatalf("error writing to pipe %q: %v", fileName, err)
-	}
-	if bytesWritten != int64(len(writeData)) {
-		t.Fatalf("expected to write %d bytes, but wrote %d", len(writeData), bytesWritten)
-	}
-}
-
-// checkRead calls t.Fatal if it fails to read msg from fd.
-func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
-	readData := make([]byte, len(msg))
-	dst := usermem.BytesIOSequence(readData)
-	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
-	if err != nil {
-		t.Fatalf("error reading from pipe %q: %v", fileName, err)
-	}
-	if bytesRead != int64(len(msg)) {
-		t.Fatalf("expected to read %d bytes, but got %d", len(msg), bytesRead)
-	}
-	if !bytes.Equal(readData, []byte(msg)) {
-		t.Fatalf("expected to read %q from pipe, but got %q", msg, string(readData))
-	}
-}
diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go
deleted file mode 100644
index b7f4853b3..000000000
--- a/pkg/sentry/fsimpl/memfs/regular_file.go
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"io"
-	"sync"
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-type regularFile struct {
-	inode inode
-
-	mu   sync.RWMutex
-	data []byte
-	// dataLen is len(data), but accessed using atomic memory operations to
-	// avoid locking in inode.stat().
-	dataLen int64
-}
-
-func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
-	file := &regularFile{}
-	file.inode.init(file, fs, creds, mode)
-	file.inode.nlink = 1 // from parent directory
-	return &file.inode
-}
-
-type regularFileFD struct {
-	fileDescription
-
-	// These are immutable.
-	readable bool
-	writable bool
-
-	// off is the file offset. off is accessed using atomic memory operations.
-	// offMu serializes operations that may mutate off.
-	off   int64
-	offMu sync.Mutex
-}
-
-// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *regularFileFD) Release() {
-	if fd.writable {
-		fd.vfsfd.VirtualDentry().Mount().EndWrite()
-	}
-}
-
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
-	if !fd.readable {
-		return 0, syserror.EINVAL
-	}
-	f := fd.inode().impl.(*regularFile)
-	f.mu.RLock()
-	if offset >= int64(len(f.data)) {
-		f.mu.RUnlock()
-		return 0, io.EOF
-	}
-	n, err := dst.CopyOut(ctx, f.data[offset:])
-	f.mu.RUnlock()
-	return int64(n), err
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	fd.offMu.Lock()
-	n, err := fd.PRead(ctx, dst, fd.off, opts)
-	fd.off += n
-	fd.offMu.Unlock()
-	return n, err
-}
-
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	if !fd.writable {
-		return 0, syserror.EINVAL
-	}
-	if offset < 0 {
-		return 0, syserror.EINVAL
-	}
-	srclen := src.NumBytes()
-	if srclen == 0 {
-		return 0, nil
-	}
-	f := fd.inode().impl.(*regularFile)
-	f.mu.Lock()
-	end := offset + srclen
-	if end < offset {
-		// Overflow.
-		f.mu.Unlock()
-		return 0, syserror.EFBIG
-	}
-	if end > f.dataLen {
-		f.data = append(f.data, make([]byte, end-f.dataLen)...)
-		atomic.StoreInt64(&f.dataLen, end)
-	}
-	n, err := src.CopyIn(ctx, f.data[offset:end])
-	f.mu.Unlock()
-	return int64(n), err
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
-	fd.offMu.Lock()
-	n, err := fd.PWrite(ctx, src, fd.off, opts)
-	fd.off += n
-	fd.offMu.Unlock()
-	return n, err
-}
-
-// Seek implements vfs.FileDescriptionImpl.Seek.
-func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
-	fd.offMu.Lock()
-	defer fd.offMu.Unlock()
-	switch whence {
-	case linux.SEEK_SET:
-		// use offset as specified
-	case linux.SEEK_CUR:
-		offset += fd.off
-	case linux.SEEK_END:
-		offset += atomic.LoadInt64(&fd.inode().impl.(*regularFile).dataLen)
-	default:
-		return 0, syserror.EINVAL
-	}
-	if offset < 0 {
-		return 0, syserror.EINVAL
-	}
-	fd.off = offset
-	return offset, nil
-}
-
-// Sync implements vfs.FileDescriptionImpl.Sync.
-func (fd *regularFileFD) Sync(ctx context.Context) error {
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/memfs/symlink.go
deleted file mode 100644
index b2ac2cbeb..000000000
--- a/pkg/sentry/fsimpl/memfs/symlink.go
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-)
-
-type symlink struct {
-	inode  inode
-	target string // immutable
-}
-
-func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
-	link := &symlink{
-		target: target,
-	}
-	link.inode.init(link, fs, creds, 0777)
-	link.inode.nlink = 1 // from parent directory
-	return &link.inode
-}
-
-// O_PATH is unimplemented, so there's no way to get a FileDescription
-// representing a symlink yet.
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
new file mode 100644
index 000000000..a5b285987
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -0,0 +1,92 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "dentry_list",
+    out = "dentry_list.go",
+    package = "tmpfs",
+    prefix = "dentry",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*dentry",
+        "Linker": "*dentry",
+    },
+)
+
+go_library(
+    name = "tmpfs",
+    srcs = [
+        "dentry_list.go",
+        "directory.go",
+        "filesystem.go",
+        "named_pipe.go",
+        "regular_file.go",
+        "symlink.go",
+        "tmpfs.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/amutex",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/context",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/safemem",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
+
+go_test(
+    name = "benchmark_test",
+    size = "small",
+    srcs = ["benchmark_test.go"],
+    deps = [
+        ":tmpfs",
+        "//pkg/abi/linux",
+        "//pkg/fspath",
+        "//pkg/refs",
+        "//pkg/sentry/context",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
+
+go_test(
+    name = "tmpfs_test",
+    size = "small",
+    srcs = [
+        "pipe_test.go",
+        "regular_file_test.go",
+    ],
+    embed = [":tmpfs"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/fspath",
+        "//pkg/sentry/context",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/contexttest",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
new file mode 100644
index 000000000..d88c83499
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -0,0 +1,487 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package benchmark_test
+
+import (
+	"fmt"
+	"runtime"
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Differences from stat_benchmark:
+//
+// - Syscall interception, CopyInPath, copyOutStat, and overlayfs overheads are
+// not included.
+//
+// - *MountStat benchmarks use a tmpfs root mount and a tmpfs submount at /tmp.
+// Non-MountStat benchmarks use a tmpfs root mount and no submounts.
+// stat_benchmark uses a varying root mount, a tmpfs submount at /tmp, and a
+// subdirectory /tmp/<top_dir> (assuming TEST_TMPDIR == "/tmp"). Thus
+// stat_benchmark at depth 1 does a comparable amount of work to *MountStat
+// benchmarks at depth 2, and non-MountStat benchmarks at depth 3.
+var depths = []int{1, 2, 3, 8, 64, 100}
+
+const (
+	mountPointName = "tmp"
+	filename       = "gvisor_test_temp_0_1557494568"
+)
+
+// This is copied from syscalls/linux/sys_file.go, with the dependency on
+// kernel.Task stripped out.
+func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error {
+	var (
+		d   *fs.Dirent // The file.
+		rel *fs.Dirent // The relative directory for search (if required.)
+		err error
+	)
+
+	// Extract the working directory (maybe).
+	if len(path) > 0 && path[0] == '/' {
+		// Absolute path; rel can be nil.
+	} else if dirFD == linux.AT_FDCWD {
+		// Need to reference the working directory.
+		rel = wd
+	} else {
+		// Need to extract the given FD.
+		return syserror.EBADF
+	}
+
+	// Lookup the node.
+	remainingTraversals := uint(linux.MaxSymlinkTraversals)
+	if resolve {
+		d, err = mntns.FindInode(ctx, root, rel, path, &remainingTraversals)
+	} else {
+		d, err = mntns.FindLink(ctx, root, rel, path, &remainingTraversals)
+	}
+	if err != nil {
+		return err
+	}
+
+	err = fn(root, d)
+	d.DecRef()
+	return err
+}
+
+func BenchmarkVFS1TmpfsStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+
+			// Create VFS.
+			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
+			if !ok {
+				b.Fatalf("failed to find tmpfs filesystem type")
+			}
+			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			mntns, err := fs.NewMountNamespace(ctx, rootInode)
+			if err != nil {
+				b.Fatalf("failed to create mount namespace: %v", err)
+			}
+			defer mntns.DecRef()
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			root := mntns.Root()
+			defer root.DecRef()
+			d := root
+			d.IncRef()
+			defer d.DecRef()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				next, err := d.Walk(ctx, root, name)
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				d.DecRef()
+				d = next
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			file.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			dirPath := false
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+						return syserror.ENOTDIR
+					}
+					uattr, err := d.Inode.UnstableAttr(ctx)
+					if err != nil {
+						return err
+					}
+					// Sanity check.
+					if uattr.Perms.User.Execute {
+						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
+					}
+					return nil
+				})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
+		})
+	}
+}
+
+func BenchmarkVFS2MemfsStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+			creds := auth.CredentialsFromContext(ctx)
+
+			// Create VFS.
+			vfsObj := vfs.New()
+			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+				AllowUserMount: true,
+			})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			defer mntns.DecRef(vfsObj)
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			root := mntns.Root()
+			defer root.DecRef()
+			vd := root
+			vd.IncRef()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				pop := vfs.PathOperation{
+					Root:  root,
+					Start: vd,
+					Path:  fspath.Parse(name),
+				}
+				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+					Mode: 0755,
+				}); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				vd.DecRef()
+				vd = nextVD
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+				Root:               root,
+				Start:              vd,
+				Path:               fspath.Parse(filename),
+				FollowFinalSymlink: true,
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+				Mode:  0644,
+			})
+			vd.DecRef()
+			vd = vfs.VirtualDentry{}
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			defer fd.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+					Root:               root,
+					Start:              root,
+					Path:               fspath.Parse(filePath),
+					FollowFinalSymlink: true,
+				}, &vfs.StatOptions{})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+				// Sanity check.
+				if stat.Mode&^linux.S_IFMT != 0644 {
+					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
+				}
+			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
+		})
+	}
+}
+
+func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+
+			// Create VFS.
+			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
+			if !ok {
+				b.Fatalf("failed to find tmpfs filesystem type")
+			}
+			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			mntns, err := fs.NewMountNamespace(ctx, rootInode)
+			if err != nil {
+				b.Fatalf("failed to create mount namespace: %v", err)
+			}
+			defer mntns.DecRef()
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create and mount the submount.
+			root := mntns.Root()
+			defer root.DecRef()
+			if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil {
+				b.Fatalf("failed to create mount point: %v", err)
+			}
+			mountPoint, err := root.Walk(ctx, root, mountPointName)
+			if err != nil {
+				b.Fatalf("failed to walk to mount point: %v", err)
+			}
+			defer mountPoint.DecRef()
+			submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs submount: %v", err)
+			}
+			if err := mntns.Mount(ctx, mountPoint, submountInode); err != nil {
+				b.Fatalf("failed to mount tmpfs submount: %v", err)
+			}
+			filePathBuilder.WriteString(mountPointName)
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			d, err := root.Walk(ctx, root, mountPointName)
+			if err != nil {
+				b.Fatalf("failed to walk to mount root: %v", err)
+			}
+			defer d.DecRef()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				next, err := d.Walk(ctx, root, name)
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				d.DecRef()
+				d = next
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			file.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			dirPath := false
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+						return syserror.ENOTDIR
+					}
+					uattr, err := d.Inode.UnstableAttr(ctx)
+					if err != nil {
+						return err
+					}
+					// Sanity check.
+					if uattr.Perms.User.Execute {
+						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
+					}
+					return nil
+				})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
+		})
+	}
+}
+
+func BenchmarkVFS2MemfsMountStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+			creds := auth.CredentialsFromContext(ctx)
+
+			// Create VFS.
+			vfsObj := vfs.New()
+			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+				AllowUserMount: true,
+			})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			defer mntns.DecRef(vfsObj)
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create the mount point.
+			root := mntns.Root()
+			defer root.DecRef()
+			pop := vfs.PathOperation{
+				Root:  root,
+				Start: root,
+				Path:  fspath.Parse(mountPointName),
+			}
+			if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+				Mode: 0755,
+			}); err != nil {
+				b.Fatalf("failed to create mount point: %v", err)
+			}
+			// Save the mount point for later use.
+			mountPoint, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+			if err != nil {
+				b.Fatalf("failed to walk to mount point: %v", err)
+			}
+			defer mountPoint.DecRef()
+			// Create and mount the submount.
+			if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
+				b.Fatalf("failed to mount tmpfs submount: %v", err)
+			}
+			filePathBuilder.WriteString(mountPointName)
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			vd, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+			if err != nil {
+				b.Fatalf("failed to walk to mount root: %v", err)
+			}
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				pop := vfs.PathOperation{
+					Root:  root,
+					Start: vd,
+					Path:  fspath.Parse(name),
+				}
+				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+					Mode: 0755,
+				}); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				vd.DecRef()
+				vd = nextVD
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Verify that we didn't create any directories under the mount
+			// point (i.e. they were all created on the submount).
+			firstDirName := fmt.Sprintf("%d", depth)
+			if child := mountPoint.Dentry().Child(firstDirName); child != nil {
+				b.Fatalf("created directory %q under root mount, not submount", firstDirName)
+			}
+
+			// Create the file that will be stat'd.
+			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+				Root:               root,
+				Start:              vd,
+				Path:               fspath.Parse(filename),
+				FollowFinalSymlink: true,
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+				Mode:  0644,
+			})
+			vd.DecRef()
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			fd.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+					Root:               root,
+					Start:              root,
+					Path:               fspath.Parse(filePath),
+					FollowFinalSymlink: true,
+				}, &vfs.StatOptions{})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+				// Sanity check.
+				if stat.Mode&^linux.S_IFMT != 0644 {
+					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
+				}
+			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
+		})
+	}
+}
+
+func init() {
+	// Turn off reference leak checking for a fair comparison between vfs1 and
+	// vfs2.
+	refs.SetLeakMode(refs.NoLeakChecking)
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
new file mode 100644
index 000000000..887ca2619
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -0,0 +1,187 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+type directory struct {
+	inode inode
+
+	// childList is a list containing (1) child Dentries and (2) fake Dentries
+	// (with inode == nil) that represent the iteration position of
+	// directoryFDs. childList is used to support directoryFD.IterDirents()
+	// efficiently. childList is protected by filesystem.mu.
+	childList dentryList
+}
+
+func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode {
+	dir := &directory{}
+	dir.inode.init(dir, fs, creds, mode)
+	dir.inode.nlink = 2 // from "." and parent directory or ".." for root
+	return &dir.inode
+}
+
+func (i *inode) isDir() bool {
+	_, ok := i.impl.(*directory)
+	return ok
+}
+
+type directoryFD struct {
+	fileDescription
+	vfs.DirectoryFileDescriptionDefaultImpl
+
+	// Protected by filesystem.mu.
+	iter *dentry
+	off  int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release() {
+	if fd.iter != nil {
+		fs := fd.filesystem()
+		dir := fd.inode().impl.(*directory)
+		fs.mu.Lock()
+		dir.childList.Remove(fd.iter)
+		fs.mu.Unlock()
+		fd.iter = nil
+	}
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	fs := fd.filesystem()
+	vfsd := fd.vfsfd.VirtualDentry().Dentry()
+
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	if fd.off == 0 {
+		if !cb.Handle(vfs.Dirent{
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     vfsd.Impl().(*dentry).inode.ino,
+			NextOff: 1,
+		}) {
+			return nil
+		}
+		fd.off++
+	}
+	if fd.off == 1 {
+		parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
+		if !cb.Handle(vfs.Dirent{
+			Name:    "..",
+			Type:    parentInode.direntType(),
+			Ino:     parentInode.ino,
+			NextOff: 2,
+		}) {
+			return nil
+		}
+		fd.off++
+	}
+
+	dir := vfsd.Impl().(*dentry).inode.impl.(*directory)
+	var child *dentry
+	if fd.iter == nil {
+		// Start iteration at the beginning of dir.
+		child = dir.childList.Front()
+		fd.iter = &dentry{}
+	} else {
+		// Continue iteration from where we left off.
+		child = fd.iter.Next()
+		dir.childList.Remove(fd.iter)
+	}
+	for child != nil {
+		// Skip other directoryFD iterators.
+		if child.inode != nil {
+			if !cb.Handle(vfs.Dirent{
+				Name:    child.vfsd.Name(),
+				Type:    child.inode.direntType(),
+				Ino:     child.inode.ino,
+				NextOff: fd.off + 1,
+			}) {
+				dir.childList.InsertBefore(child, fd.iter)
+				return nil
+			}
+			fd.off++
+		}
+		child = child.Next()
+	}
+	dir.childList.PushBack(fd.iter)
+	return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fs := fd.filesystem()
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as given.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't
+	// seek even if doing so might reposition the iterator due to concurrent
+	// mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek().
+	if fd.off == offset {
+		return offset, nil
+	}
+
+	fd.off = offset
+	// Compensate for "." and "..".
+	remChildren := int64(0)
+	if offset >= 2 {
+		remChildren = offset - 2
+	}
+
+	dir := fd.inode().impl.(*directory)
+
+	// Ensure that fd.iter exists and is not linked into dir.childList.
+	if fd.iter == nil {
+		fd.iter = &dentry{}
+	} else {
+		dir.childList.Remove(fd.iter)
+	}
+	// Insert fd.iter before the remChildren'th child, or at the end of the
+	// list if remChildren >= number of children.
+	child := dir.childList.Front()
+	for child != nil {
+		// Skip other directoryFD iterators.
+		if child.inode != nil {
+			if remChildren == 0 {
+				dir.childList.InsertBefore(child, fd.iter)
+				return offset, nil
+			}
+			remChildren--
+		}
+		child = child.Next()
+	}
+	dir.childList.PushBack(fd.iter)
+	return offset, nil
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
new file mode 100644
index 000000000..26979729e
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -0,0 +1,698 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	// All filesystem state is in-memory.
+	return nil
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+	if !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		return nil, err
+	}
+afterSymlink:
+	nextVFSD, err := rp.ResolveComponent(&d.vfsd)
+	if err != nil {
+		return nil, err
+	}
+	if nextVFSD == nil {
+		// Since the Dentry tree is the sole source of truth for tmpfs, if it's
+		// not in the Dentry tree, it doesn't exist.
+		return nil, syserror.ENOENT
+	}
+	next := nextVFSD.Impl().(*dentry)
+	if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+		// TODO: symlink traversals update access time
+		if err := rp.HandleSymlink(symlink.target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance()
+	return next, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+	for !rp.Final() {
+		next, err := stepLocked(rp, d)
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
+//
+// Preconditions: filesystem.mu must be locked.
+func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	for !rp.Done() {
+		next, err := stepLocked(rp, d)
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if rp.MustBeDir() && !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
+//
+// doCreateAt is loosely analogous to a conjunction of Linux's
+// fs/namei.c:filename_create() and done_path_create().
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EEXIST
+	}
+	// Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
+	// because if the child exists we want to return EEXIST immediately instead
+	// of attempting symlink/mount traversal.
+	if parent.vfsd.Child(name) != nil {
+		return syserror.EEXIST
+	}
+	if !dir && rp.MustBeDir() {
+		return syserror.ENOENT
+	}
+	// In memfs, the only way to cause a dentry to be disowned is by removing
+	// it from the filesystem, so this check is equivalent to checking if
+	// parent has been removed.
+	if parent.vfsd.IsDisowned() {
+		return syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	return create(parent, name)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable {
+		if !d.inode.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil {
+			return nil, err
+		}
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return nil, err
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		if rp.Mount() != vd.Mount() {
+			return syserror.EXDEV
+		}
+		d := vd.Dentry().Impl().(*dentry)
+		if d.inode.isDir() {
+			return syserror.EPERM
+		}
+		if d.inode.nlink == 0 {
+			return syserror.ENOENT
+		}
+		if d.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+		d.inode.incLinksLocked()
+		child := fs.newDentry(d.inode)
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error {
+		if parent.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+		parent.inode.incLinksLocked() // from child's ".."
+		child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		switch opts.Mode.FileType() {
+		case 0, linux.S_IFREG:
+			child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+			parent.vfsd.InsertChild(&child.vfsd, name)
+			parent.inode.impl.(*directory).childList.PushBack(child)
+			return nil
+		case linux.S_IFIFO:
+			child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
+			parent.vfsd.InsertChild(&child.vfsd, name)
+			parent.inode.impl.(*directory).childList.PushBack(child)
+			return nil
+		case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK:
+			// Not yet supported.
+			return syserror.EPERM
+		default:
+			return syserror.EINVAL
+		}
+	})
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		// Not yet supported.
+		return nil, syserror.EOPNOTSUPP
+	}
+
+	// Handle O_CREAT and !O_CREAT separately, since in the latter case we
+	// don't need fs.mu for writing.
+	if opts.Flags&linux.O_CREAT == 0 {
+		fs.mu.RLock()
+		defer fs.mu.RUnlock()
+		d, err := resolveLocked(rp)
+		if err != nil {
+			return nil, err
+		}
+		return d.open(ctx, rp, opts.Flags, false /* afterCreate */)
+	}
+
+	mustCreate := opts.Flags&linux.O_EXCL != 0
+	start := rp.Start().Impl().(*dentry)
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	if rp.Done() {
+		// Reject attempts to open directories with O_CREAT.
+		if rp.MustBeDir() {
+			return nil, syserror.EISDIR
+		}
+		if mustCreate {
+			return nil, syserror.EEXIST
+		}
+		return start.open(ctx, rp, opts.Flags, false /* afterCreate */)
+	}
+afterTrailingSymlink:
+	parent, err := walkParentDirLocked(rp, start)
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		return nil, err
+	}
+	// Reject attempts to open directories with O_CREAT.
+	if rp.MustBeDir() {
+		return nil, syserror.EISDIR
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return nil, syserror.EISDIR
+	}
+	// Determine whether or not we need to create a file.
+	child, err := stepLocked(rp, parent)
+	if err == syserror.ENOENT {
+		// Already checked for searchability above; now check for writability.
+		if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+			return nil, err
+		}
+		if err := rp.Mount().CheckBeginWrite(); err != nil {
+			return nil, err
+		}
+		defer rp.Mount().EndWrite()
+		// Create and open the child.
+		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return child.open(ctx, rp, opts.Flags, true)
+	}
+	if err != nil {
+		return nil, err
+	}
+	// Do we need to resolve a trailing symlink?
+	if !rp.Done() {
+		start = parent
+		goto afterTrailingSymlink
+	}
+	// Open existing file.
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	return child.open(ctx, rp, opts.Flags, false)
+}
+
+func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(flags)
+	if !afterCreate {
+		if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
+			return nil, err
+		}
+	}
+	mnt := rp.Mount()
+	switch impl := d.inode.impl.(type) {
+	case *regularFile:
+		var fd regularFileFD
+		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
+		fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
+		if fd.writable {
+			if err := mnt.CheckBeginWrite(); err != nil {
+				return nil, err
+			}
+			// mnt.EndWrite() is called by regularFileFD.Release().
+		}
+		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
+		if flags&linux.O_TRUNC != 0 {
+			impl.mu.Lock()
+			impl.data.Truncate(0, impl.memFile)
+			atomic.StoreUint64(&impl.size, 0)
+			impl.mu.Unlock()
+		}
+		return &fd.vfsfd, nil
+	case *directory:
+		// Can't open directories writably.
+		if ats&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		var fd directoryFD
+		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
+		return &fd.vfsfd, nil
+	case *symlink:
+		// Can't open symlinks without O_PATH (which is unimplemented).
+		return nil, syserror.ELOOP
+	case *namedPipe:
+		return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags)
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
+	}
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
+	if err != nil {
+		return "", err
+	}
+	symlink, ok := d.inode.impl.(*symlink)
+	if !ok {
+		return "", syserror.EINVAL
+	}
+	return symlink.target, nil
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	if opts.Flags != 0 {
+		// TODO(b/145974740): Support renameat2 flags.
+		return syserror.EINVAL
+	}
+
+	// Resolve newParent first to verify that it's on this Mount.
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	newName := rp.Component()
+	if newName == "." || newName == ".." {
+		return syserror.EBUSY
+	}
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	oldParent := oldParentVD.Dentry().Impl().(*dentry)
+	if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	// Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
+	// because if the existing child is a symlink or mount point then we want
+	// to rename over it rather than follow it.
+	renamedVFSD := oldParent.vfsd.Child(oldName)
+	if renamedVFSD == nil {
+		return syserror.ENOENT
+	}
+	renamed := renamedVFSD.Impl().(*dentry)
+	if renamed.inode.isDir() {
+		if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) {
+			return syserror.EINVAL
+		}
+		if oldParent != newParent {
+			// Writability is needed to change renamed's "..".
+			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil {
+				return err
+			}
+		}
+	} else {
+		if opts.MustBeDir || rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+
+	if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	replacedVFSD := newParent.vfsd.Child(newName)
+	var replaced *dentry
+	if replacedVFSD != nil {
+		replaced = replacedVFSD.Impl().(*dentry)
+		if replaced.inode.isDir() {
+			if !renamed.inode.isDir() {
+				return syserror.EISDIR
+			}
+			if replaced.vfsd.HasChildren() {
+				return syserror.ENOTEMPTY
+			}
+		} else {
+			if rp.MustBeDir() {
+				return syserror.ENOTDIR
+			}
+			if renamed.inode.isDir() {
+				return syserror.ENOTDIR
+			}
+		}
+	} else {
+		if renamed.inode.isDir() && newParent.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+	}
+	if newParent.vfsd.IsDisowned() {
+		return syserror.ENOENT
+	}
+
+	// Linux places this check before some of those above; we do it here for
+	// simplicity, under the assumption that applications are not intentionally
+	// doing noop renames expecting them to succeed where non-noop renames
+	// would fail.
+	if renamedVFSD == replacedVFSD {
+		return nil
+	}
+	vfsObj := rp.VirtualFilesystem()
+	oldParentDir := oldParent.inode.impl.(*directory)
+	newParentDir := newParent.inode.impl.(*directory)
+	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil {
+		return err
+	}
+	if replaced != nil {
+		newParentDir.childList.Remove(replaced)
+		if replaced.inode.isDir() {
+			newParent.inode.decLinksLocked() // from replaced's ".."
+		}
+		replaced.inode.decLinksLocked()
+	}
+	oldParentDir.childList.Remove(renamed)
+	newParentDir.childList.PushBack(renamed)
+	if renamed.inode.isDir() {
+		oldParent.inode.decLinksLocked()
+		newParent.inode.incLinksLocked()
+	}
+	// TODO: update timestamps and parent directory sizes
+	vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
+	return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." {
+		return syserror.EINVAL
+	}
+	if name == ".." {
+		return syserror.ENOTEMPTY
+	}
+	childVFSD := parent.vfsd.Child(name)
+	if childVFSD == nil {
+		return syserror.ENOENT
+	}
+	child := childVFSD.Impl().(*dentry)
+	if !child.inode.isDir() {
+		return syserror.ENOTDIR
+	}
+	if childVFSD.HasChildren() {
+		return syserror.ENOTEMPTY
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+		return err
+	}
+	parent.inode.impl.(*directory).childList.Remove(child)
+	parent.inode.decLinksLocked() // from child's ".."
+	child.inode.decLinksLocked()
+	vfsObj.CommitDeleteDentry(childVFSD)
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return err
+	}
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	// TODO: implement inode.setStat
+	return syserror.EPERM
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	var stat linux.Statx
+	d.inode.statTo(&stat)
+	return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	// TODO: actually implement statfs
+	return linux.Statfs{}, syserror.ENOSYS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EISDIR
+	}
+	childVFSD := parent.vfsd.Child(name)
+	if childVFSD == nil {
+		return syserror.ENOENT
+	}
+	child := childVFSD.Impl().(*dentry)
+	if child.inode.isDir() {
+		return syserror.EISDIR
+	}
+	if !rp.MustBeDir() {
+		return syserror.ENOTDIR
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+		return err
+	}
+	parent.inode.impl.(*directory).childList.Remove(child)
+	child.inode.decLinksLocked()
+	vfsObj.CommitDeleteDentry(childVFSD)
+	return nil
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return nil, err
+	}
+	// TODO(b/127675828): support extended attributes
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return "", err
+	}
+	// TODO(b/127675828): support extended attributes
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return err
+	}
+	// TODO(b/127675828): support extended attributes
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return err
+	}
+	// TODO(b/127675828): support extended attributes
+	return syserror.ENOTSUP
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
new file mode 100644
index 000000000..40bde54de
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -0,0 +1,60 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+type namedPipe struct {
+	inode inode
+
+	pipe *pipe.VFSPipe
+}
+
+// Preconditions:
+//   * fs.mu must be locked.
+//   * rp.Mount().CheckBeginWrite() has been called successfully.
+func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
+	file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)}
+	file.inode.init(file, fs, creds, mode)
+	file.inode.nlink = 1 // Only the parent has a link.
+	return &file.inode
+}
+
+// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented
+// entirely via struct embedding.
+type namedPipeFD struct {
+	fileDescription
+
+	*pipe.VFSPipeFD
+}
+
+func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	var err error
+	var fd namedPipeFD
+	fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags)
+	if err != nil {
+		return nil, err
+	}
+	mnt := rp.Mount()
+	fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
+	return &fd.vfsfd, nil
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
new file mode 100644
index 000000000..70b42a6ec
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -0,0 +1,235 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"bytes"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const fileName = "mypipe"
+
+func TestSeparateFDs(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	// Open the read side. This is done in a concurrently because opening
+	// One end the pipe blocks until the other end is opened.
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}
+	rfdchan := make(chan *vfs.FileDescription)
+	go func() {
+		openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY}
+		rfd, _ := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+		rfdchan <- rfd
+	}()
+
+	// Open the write side.
+	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY}
+	wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
+	}
+	defer wfd.DecRef()
+
+	rfd, ok := <-rfdchan
+	if !ok {
+		t.Fatalf("failed to open pipe for reading %q", fileName)
+	}
+	defer rfd.DecRef()
+
+	const msg = "vamos azul"
+	checkEmpty(ctx, t, rfd)
+	checkWrite(ctx, t, wfd, msg)
+	checkRead(ctx, t, rfd, msg)
+}
+
+func TestNonblockingRead(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	// Open the read side as nonblocking.
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}
+	openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK}
+	rfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for reading %q: %v", fileName, err)
+	}
+	defer rfd.DecRef()
+
+	// Open the write side.
+	openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY}
+	wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
+	}
+	defer wfd.DecRef()
+
+	const msg = "geh blau"
+	checkEmpty(ctx, t, rfd)
+	checkWrite(ctx, t, wfd, msg)
+	checkRead(ctx, t, rfd, msg)
+}
+
+func TestNonblockingWriteError(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	// Open the write side as nonblocking, which should return ENXIO.
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}
+	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK}
+	_, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != syserror.ENXIO {
+		t.Fatalf("expected ENXIO, but got error: %v", err)
+	}
+}
+
+func TestSingleFD(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	// Open the pipe as readable and writable.
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}
+	openOpts := vfs.OpenOptions{Flags: linux.O_RDWR}
+	fd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
+	}
+	defer fd.DecRef()
+
+	const msg = "forza blu"
+	checkEmpty(ctx, t, fd)
+	checkWrite(ctx, t, fd, msg)
+	checkRead(ctx, t, fd, msg)
+}
+
+// setup creates a VFS with a pipe in the root directory at path fileName. The
+// returned VirtualDentry must be DecRef()'d be the caller. It calls t.Fatal
+// upon failure.
+func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry) {
+	ctx := contexttest.Context(t)
+	creds := auth.CredentialsFromContext(ctx)
+
+	// Create VFS.
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("failed to create tmpfs root mount: %v", err)
+	}
+
+	// Create the pipe.
+	root := mntns.Root()
+	pop := vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(fileName),
+	}
+	mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644}
+	if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil {
+		t.Fatalf("failed to create file %q: %v", fileName, err)
+	}
+
+	// Sanity check: the file pipe exists and has the correct mode.
+	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}, &vfs.StatOptions{})
+	if err != nil {
+		t.Fatalf("stat(%q) failed: %v", fileName, err)
+	}
+	if stat.Mode&^linux.S_IFMT != 0644 {
+		t.Errorf("got wrong permissions (%0o)", stat.Mode)
+	}
+	if stat.Mode&linux.S_IFMT != linux.ModeNamedPipe {
+		t.Errorf("got wrong file type (%0o)", stat.Mode)
+	}
+
+	return ctx, creds, vfsObj, root
+}
+
+// checkEmpty calls t.Fatal if the pipe in fd is not empty.
+func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
+	readData := make([]byte, 1)
+	dst := usermem.BytesIOSequence(readData)
+	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
+	if err != syserror.ErrWouldBlock {
+		t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err)
+	}
+	if bytesRead != 0 {
+		t.Fatalf("expected to read 0 bytes, but got %d", bytesRead)
+	}
+}
+
+// checkWrite calls t.Fatal if it fails to write all of msg to fd.
+func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
+	writeData := []byte(msg)
+	src := usermem.BytesIOSequence(writeData)
+	bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("error writing to pipe %q: %v", fileName, err)
+	}
+	if bytesWritten != int64(len(writeData)) {
+		t.Fatalf("expected to write %d bytes, but wrote %d", len(writeData), bytesWritten)
+	}
+}
+
+// checkRead calls t.Fatal if it fails to read msg from fd.
+func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
+	readData := make([]byte, len(msg))
+	dst := usermem.BytesIOSequence(readData)
+	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
+	if err != nil {
+		t.Fatalf("error reading from pipe %q: %v", fileName, err)
+	}
+	if bytesRead != int64(len(msg)) {
+		t.Fatalf("expected to read %d bytes, but got %d", len(msg), bytesRead)
+	}
+	if !bytes.Equal(readData, []byte(msg)) {
+		t.Fatalf("expected to read %q from pipe, but got %q", msg, string(readData))
+	}
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
new file mode 100644
index 000000000..f51e247a7
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -0,0 +1,357 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"io"
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+type regularFile struct {
+	inode inode
+
+	// memFile is a platform.File used to allocate pages to this regularFile.
+	memFile *pgalloc.MemoryFile
+
+	// mu protects the fields below.
+	mu sync.RWMutex
+
+	// data maps offsets into the file to offsets into memFile that store
+	// the file's data.
+	data fsutil.FileRangeSet
+
+	// size is the size of data, but accessed using atomic memory
+	// operations to avoid locking in inode.stat().
+	size uint64
+
+	// seals represents file seals on this inode.
+	seals uint32
+}
+
+func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
+	file := &regularFile{
+		memFile: fs.memFile,
+	}
+	file.inode.init(file, fs, creds, mode)
+	file.inode.nlink = 1 // from parent directory
+	return &file.inode
+}
+
+type regularFileFD struct {
+	fileDescription
+
+	// These are immutable.
+	readable bool
+	writable bool
+
+	// off is the file offset. off is accessed using atomic memory operations.
+	// offMu serializes operations that may mutate off.
+	off   int64
+	offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release() {
+	if fd.writable {
+		fd.vfsfd.VirtualDentry().Mount().EndWrite()
+	}
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if !fd.readable {
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if dst.NumBytes() == 0 {
+		return 0, nil
+	}
+	f := fd.inode().impl.(*regularFile)
+	rw := getRegularFileReadWriter(f, offset)
+	n, err := dst.CopyOutFrom(ctx, rw)
+	putRegularFileReadWriter(rw)
+	return int64(n), err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if !fd.writable {
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	srclen := src.NumBytes()
+	if srclen == 0 {
+		return 0, nil
+	}
+	f := fd.inode().impl.(*regularFile)
+	end := offset + srclen
+	if end < offset {
+		// Overflow.
+		return 0, syserror.EFBIG
+	}
+	rw := getRegularFileReadWriter(f, offset)
+	n, err := src.CopyInTo(ctx, rw)
+	putRegularFileReadWriter(rw)
+	return n, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock()
+	n, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += n
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+	switch whence {
+	case linux.SEEK_SET:
+		// use offset as specified
+	case linux.SEEK_CUR:
+		offset += fd.off
+	case linux.SEEK_END:
+		offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size))
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+	return nil
+}
+
+// regularFileReadWriter implements safemem.Reader and Safemem.Writer.
+type regularFileReadWriter struct {
+	file *regularFile
+
+	// Offset into the file to read/write at. Note that this may be
+	// different from the FD offset if PRead/PWrite is used.
+	off uint64
+}
+
+var regularFileReadWriterPool = sync.Pool{
+	New: func() interface{} {
+		return &regularFileReadWriter{}
+	},
+}
+
+func getRegularFileReadWriter(file *regularFile, offset int64) *regularFileReadWriter {
+	rw := regularFileReadWriterPool.Get().(*regularFileReadWriter)
+	rw.file = file
+	rw.off = uint64(offset)
+	return rw
+}
+
+func putRegularFileReadWriter(rw *regularFileReadWriter) {
+	rw.file = nil
+	regularFileReadWriterPool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	rw.file.mu.RLock()
+
+	// Compute the range to read (limited by file size and overflow-checked).
+	if rw.off >= rw.file.size {
+		rw.file.mu.RUnlock()
+		return 0, io.EOF
+	}
+	end := rw.file.size
+	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
+		end = rend
+	}
+
+	var done uint64
+	seg, gap := rw.file.data.Find(uint64(rw.off))
+	for rw.off < end {
+		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
+		switch {
+		case seg.Ok():
+			// Get internal mappings.
+			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
+			if err != nil {
+				rw.file.mu.RUnlock()
+				return done, err
+			}
+
+			// Copy from internal mappings.
+			n, err := safemem.CopySeq(dsts, ims)
+			done += n
+			rw.off += uint64(n)
+			dsts = dsts.DropFirst64(n)
+			if err != nil {
+				rw.file.mu.RUnlock()
+				return done, err
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			// Tmpfs holes are zero-filled.
+			gapmr := gap.Range().Intersect(mr)
+			dst := dsts.TakeFirst64(gapmr.Length())
+			n, err := safemem.ZeroSeq(dst)
+			done += n
+			rw.off += uint64(n)
+			dsts = dsts.DropFirst64(n)
+			if err != nil {
+				rw.file.mu.RUnlock()
+				return done, err
+			}
+
+			// Continue.
+			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+		}
+	}
+	rw.file.mu.RUnlock()
+	return done, nil
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	rw.file.mu.Lock()
+
+	// Compute the range to write (overflow-checked).
+	end := rw.off + srcs.NumBytes()
+	if end <= rw.off {
+		end = math.MaxInt64
+	}
+
+	// Check if seals prevent either file growth or all writes.
+	switch {
+	case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
+		rw.file.mu.Unlock()
+		return 0, syserror.EPERM
+	case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
+		// When growth is sealed, Linux effectively allows writes which would
+		// normally grow the file to partially succeed up to the current EOF,
+		// rounded down to the page boundary before the EOF.
+		//
+		// This happens because writes (and thus the growth check) for tmpfs
+		// files proceed page-by-page on Linux, and the final write to the page
+		// containing EOF fails, resulting in a partial write up to the start of
+		// that page.
+		//
+		// To emulate this behaviour, artifically truncate the write to the
+		// start of the page containing the current EOF.
+		//
+		// See Linux, mm/filemap.c:generic_perform_write() and
+		// mm/shmem.c:shmem_write_begin().
+		if pgstart := uint64(usermem.Addr(rw.file.size).RoundDown()); end > pgstart {
+			end = pgstart
+		}
+		if end <= rw.off {
+			// Truncation would result in no data being written.
+			rw.file.mu.Unlock()
+			return 0, syserror.EPERM
+		}
+	}
+
+	// Page-aligned mr for when we need to allocate memory. RoundUp can't
+	// overflow since end is an int64.
+	pgstartaddr := usermem.Addr(rw.off).RoundDown()
+	pgendaddr, _ := usermem.Addr(end).RoundUp()
+	pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}
+
+	var (
+		done   uint64
+		retErr error
+	)
+	seg, gap := rw.file.data.Find(uint64(rw.off))
+	for rw.off < end {
+		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
+		switch {
+		case seg.Ok():
+			// Get internal mappings.
+			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Copy to internal mappings.
+			n, err := safemem.CopySeq(ims, srcs)
+			done += n
+			rw.off += uint64(n)
+			srcs = srcs.DropFirst64(n)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			// Allocate memory for the write.
+			gapMR := gap.Range().Intersect(pgMR)
+			fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Write to that memory as usual.
+			seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
+		}
+	}
+exitLoop:
+	// If the write ends beyond the file's previous size, it causes the
+	// file to grow.
+	if rw.off > rw.file.size {
+		atomic.StoreUint64(&rw.file.size, rw.off)
+	}
+
+	rw.file.mu.Unlock()
+	return done, retErr
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
new file mode 100644
index 000000000..3731c5b6f
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -0,0 +1,224 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If
+// the returned err is not nil, then cleanup should be called when the FD is no
+// longer needed.
+func newFileFD(ctx context.Context, filename string) (*vfs.FileDescription, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
+	}
+	root := mntns.Root()
+
+	// Create the file that will be write/read.
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(filename),
+		FollowFinalSymlink: true,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+		Mode:  0644,
+	})
+	if err != nil {
+		root.DecRef()
+		mntns.DecRef(vfsObj)
+		return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err)
+	}
+
+	return fd, func() {
+		root.DecRef()
+		mntns.DecRef(vfsObj)
+	}, nil
+}
+
+// Test that we can write some data to a file and read it back.`
+func TestSimpleWriteRead(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, "simpleReadWrite")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Write.
+	data := []byte("foobarbaz")
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want {
+		t.Errorf("fd.Write left offset at %d, want %d", got, want)
+	}
+
+	// Seek back to beginning.
+	if _, err := fd.Seek(ctx, 0, linux.SEEK_SET); err != nil {
+		t.Fatalf("fd.Seek failed: %v", err)
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(0); got != want {
+		t.Errorf("fd.Seek(0) left offset at %d, want %d", got, want)
+	}
+
+	// Read.
+	buf := make([]byte, len(data))
+	n, err = fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+	if err != nil && err != io.EOF {
+		t.Fatalf("fd.Read failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Read got short read length %d, want %d", n, len(data))
+	}
+	if got, want := string(buf), string(data); got != want {
+		t.Errorf("Read got %q want %s", got, want)
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want {
+		t.Errorf("fd.Write left offset at %d, want %d", got, want)
+	}
+}
+
+func TestPWrite(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, "PRead")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Fill file with 1k 'a's.
+	data := bytes.Repeat([]byte{'a'}, 1000)
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+
+	// Write "gVisor is awesome" at various offsets.
+	buf := []byte("gVisor is awesome")
+	offsets := []int{0, 1, 2, 10, 20, 50, 100, len(data) - 100, len(data) - 1, len(data), len(data) + 1}
+	for _, offset := range offsets {
+		name := fmt.Sprintf("PWrite offset=%d", offset)
+		t.Run(name, func(t *testing.T) {
+			n, err := fd.PWrite(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.WriteOptions{})
+			if err != nil {
+				t.Errorf("fd.PWrite got err %v want nil", err)
+			}
+			if n != int64(len(buf)) {
+				t.Errorf("fd.PWrite got %d bytes want %d", n, len(buf))
+			}
+
+			// Update data to reflect expected file contents.
+			if len(data) < offset+len(buf) {
+				data = append(data, make([]byte, (offset+len(buf))-len(data))...)
+			}
+			copy(data[offset:], buf)
+
+			// Read the whole file and compare with data.
+			readBuf := make([]byte, len(data))
+			n, err = fd.PRead(ctx, usermem.BytesIOSequence(readBuf), 0, vfs.ReadOptions{})
+			if err != nil {
+				t.Fatalf("fd.PRead failed: %v", err)
+			}
+			if n != int64(len(data)) {
+				t.Errorf("fd.PRead got short read length %d, want %d", n, len(data))
+			}
+			if got, want := string(readBuf), string(data); got != want {
+				t.Errorf("PRead got %q want %s", got, want)
+			}
+
+		})
+	}
+}
+
+func TestPRead(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, "PRead")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Write 100 sequences of 'gVisor is awesome'.
+	data := bytes.Repeat([]byte("gVisor is awsome"), 100)
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+
+	// Read various sizes from various offsets.
+	sizes := []int{0, 1, 2, 10, 20, 50, 100, 1000}
+	offsets := []int{0, 1, 2, 10, 20, 50, 100, 1000, len(data) - 100, len(data) - 1, len(data), len(data) + 1}
+
+	for _, size := range sizes {
+		for _, offset := range offsets {
+			name := fmt.Sprintf("PRead offset=%d size=%d", offset, size)
+			t.Run(name, func(t *testing.T) {
+				var (
+					wantRead []byte
+					wantErr  error
+				)
+				if offset < len(data) {
+					wantRead = data[offset:]
+				} else if size > 0 {
+					wantErr = io.EOF
+				}
+				if offset+size < len(data) {
+					wantRead = wantRead[:size]
+				}
+				buf := make([]byte, size)
+				n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.ReadOptions{})
+				if err != wantErr {
+					t.Errorf("fd.PRead got err %v want %v", err, wantErr)
+				}
+				if n != int64(len(wantRead)) {
+					t.Errorf("fd.PRead got %d bytes want %d", n, len(wantRead))
+				}
+				if got := string(buf[:n]); got != string(wantRead) {
+					t.Errorf("fd.PRead got %q want %q", got, string(wantRead))
+				}
+			})
+		}
+	}
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
new file mode 100644
index 000000000..5246aca84
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -0,0 +1,36 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+type symlink struct {
+	inode  inode
+	target string // immutable
+}
+
+func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
+	link := &symlink{
+		target: target,
+	}
+	link.inode.init(link, fs, creds, 0777)
+	link.inode.nlink = 1 // from parent directory
+	return &link.inode
+}
+
+// O_PATH is unimplemented, so there's no way to get a FileDescription
+// representing a symlink yet.
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
new file mode 100644
index 000000000..7be6faa5b
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -0,0 +1,299 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package tmpfs provides a filesystem implementation that behaves like tmpfs:
+// the Dentry tree is the sole source of truth for the state of the filesystem.
+//
+// Lock order:
+//
+// filesystem.mu
+//   regularFileFD.offMu
+//     regularFile.mu
+//   inode.mu
+package tmpfs
+
+import (
+	"fmt"
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	vfsfs vfs.Filesystem
+
+	// memFile is used to allocate pages to for regular files.
+	memFile *pgalloc.MemoryFile
+
+	// mu serializes changes to the Dentry tree.
+	mu sync.RWMutex
+
+	nextInoMinusOne uint64 // accessed using atomic memory operations
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx)
+	if memFileProvider == nil {
+		panic("MemoryFileProviderFromContext returned nil")
+	}
+	fs := filesystem{
+		memFile: memFileProvider.MemoryFile(),
+	}
+	fs.vfsfs.Init(vfsObj, &fs)
+	root := fs.newDentry(fs.newDirectory(creds, 01777))
+	return &fs.vfsfs, &root.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+	vfsd vfs.Dentry
+
+	// inode is the inode represented by this dentry. Multiple Dentries may
+	// share a single non-directory inode (with hard links). inode is
+	// immutable.
+	inode *inode
+
+	// tmpfs doesn't count references on dentries; because the dentry tree is
+	// the sole source of truth, it is by definition always consistent with the
+	// state of the filesystem. However, it does count references on inodes,
+	// because inode resources are released when all references are dropped.
+	// (tmpfs doesn't really have resources to release, but we implement
+	// reference counting because tmpfs regular files will.)
+
+	// dentryEntry (ugh) links dentries into their parent directory.childList.
+	dentryEntry
+}
+
+func (fs *filesystem) newDentry(inode *inode) *dentry {
+	d := &dentry{
+		inode: inode,
+	}
+	d.vfsd.Init(d)
+	return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+	d.inode.incRef()
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+	return d.inode.tryIncRef()
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef() {
+	d.inode.decRef()
+}
+
+// inode represents a filesystem object.
+type inode struct {
+	// refs is a reference count. refs is accessed using atomic memory
+	// operations.
+	//
+	// A reference is held on all inodes that are reachable in the filesystem
+	// tree. For non-directories (which may have multiple hard links), this
+	// means that a reference is dropped when nlink reaches 0. For directories,
+	// nlink never reaches 0 due to the "." entry; instead,
+	// filesystem.RmdirAt() drops the reference.
+	refs int64
+
+	// Inode metadata; protected by mu and accessed using atomic memory
+	// operations unless otherwise specified.
+	mu    sync.RWMutex
+	mode  uint32 // excluding file type bits, which are based on impl
+	nlink uint32 // protected by filesystem.mu instead of inode.mu
+	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid   uint32 // auth.KGID, but ...
+	ino   uint64 // immutable
+
+	impl interface{} // immutable
+}
+
+const maxLinks = math.MaxUint32
+
+func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
+	i.refs = 1
+	i.mode = uint32(mode)
+	i.uid = uint32(creds.EffectiveKUID)
+	i.gid = uint32(creds.EffectiveKGID)
+	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
+	// i.nlink initialized by caller
+	i.impl = impl
+}
+
+// incLinksLocked increments i's link count.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+// i.nlink < maxLinks.
+func (i *inode) incLinksLocked() {
+	if i.nlink == 0 {
+		panic("tmpfs.inode.incLinksLocked() called with no existing links")
+	}
+	if i.nlink == maxLinks {
+		panic("memfs.inode.incLinksLocked() called with maximum link count")
+	}
+	atomic.AddUint32(&i.nlink, 1)
+}
+
+// decLinksLocked decrements i's link count.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+func (i *inode) decLinksLocked() {
+	if i.nlink == 0 {
+		panic("tmpfs.inode.decLinksLocked() called with no existing links")
+	}
+	atomic.AddUint32(&i.nlink, ^uint32(0))
+}
+
+func (i *inode) incRef() {
+	if atomic.AddInt64(&i.refs, 1) <= 1 {
+		panic("tmpfs.inode.incRef() called without holding a reference")
+	}
+}
+
+func (i *inode) tryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&i.refs)
+		if refs == 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+func (i *inode) decRef() {
+	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
+		// This is unnecessary; it's mostly to simulate what tmpfs would do.
+		if regFile, ok := i.impl.(*regularFile); ok {
+			regFile.mu.Lock()
+			regFile.data.DropAll(regFile.memFile)
+			atomic.StoreUint64(&regFile.size, 0)
+			regFile.mu.Unlock()
+		}
+	} else if refs < 0 {
+		panic("tmpfs.inode.decRef() called without holding a reference")
+	}
+}
+
+func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
+}
+
+// Go won't inline this function, and returning linux.Statx (which is quite
+// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
+// output parameter.
+func (i *inode) statTo(stat *linux.Statx) {
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+	stat.Blksize = 1 // usermem.PageSize in tmpfs
+	stat.Nlink = atomic.LoadUint32(&i.nlink)
+	stat.UID = atomic.LoadUint32(&i.uid)
+	stat.GID = atomic.LoadUint32(&i.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
+	stat.Ino = i.ino
+	// TODO: device number
+	switch impl := i.impl.(type) {
+	case *regularFile:
+		stat.Mode |= linux.S_IFREG
+		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+		stat.Size = uint64(atomic.LoadUint64(&impl.size))
+		// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
+		// a uint64 accessed using atomic memory operations to avoid taking
+		// locks).
+		stat.Blocks = allocatedBlocksForSize(stat.Size)
+	case *directory:
+		stat.Mode |= linux.S_IFDIR
+	case *symlink:
+		stat.Mode |= linux.S_IFLNK
+		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+		stat.Size = uint64(len(impl.target))
+		stat.Blocks = allocatedBlocksForSize(stat.Size)
+	case *namedPipe:
+		stat.Mode |= linux.S_IFIFO
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+// allocatedBlocksForSize returns the number of 512B blocks needed to
+// accommodate the given size in bytes, as appropriate for struct
+// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
+// size is independent of the "preferred block size for I/O", struct
+// stat::st_blksize and struct statx::stx_blksize.)
+func allocatedBlocksForSize(size uint64) uint64 {
+	return (size + 511) / 512
+}
+
+func (i *inode) direntType() uint8 {
+	switch i.impl.(type) {
+	case *regularFile:
+		return linux.DT_REG
+	case *directory:
+		return linux.DT_DIR
+	case *symlink:
+		return linux.DT_LNK
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+// fileDescription is embedded by tmpfs implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) inode() *inode {
+	return fd.vfsfd.Dentry().Impl().(*dentry).inode
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	fd.inode().statTo(&stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	// TODO: implement inode.setStat
+	return syserror.EPERM
+}
-- 
cgit v1.2.3


From 17c18241cdeb66e75738c3892730f1a434a4bd60 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 6 Jan 2020 15:53:21 -0800
Subject: platform/syscall: use syscall + int3 to execute a system call in a
 stub process

Right now, we need to call ptrace(PTRACE_SYSCALL) and wait() twice to execute
one system call in a stub process. With these changes, we will need to call
ptrace + wait only once.

In addition, this allows to workaround the kernel bug when a stub process
doesn't stop on syscall-exit-stop and starts executing the next system call.

Reported-by: syzbot+37143cafa8dc3b5008ee@syzkaller.appspotmail.com
PiperOrigin-RevId: 288393029
---
 pkg/sentry/platform/ptrace/stub_amd64.s        | 29 ++++++++++++++-----------
 pkg/sentry/platform/ptrace/stub_arm64.s        | 30 +++++++++++++++-----------
 pkg/sentry/platform/ptrace/subprocess.go       | 20 +++++------------
 pkg/sentry/platform/ptrace/subprocess_amd64.go |  4 +++-
 pkg/sentry/platform/ptrace/subprocess_arm64.go |  2 ++
 5 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/pkg/sentry/platform/ptrace/stub_amd64.s b/pkg/sentry/platform/ptrace/stub_amd64.s
index 64c718d21..16f9c523e 100644
--- a/pkg/sentry/platform/ptrace/stub_amd64.s
+++ b/pkg/sentry/platform/ptrace/stub_amd64.s
@@ -64,6 +64,8 @@ begin:
 	CMPQ AX, $0
 	JL error
 
+	MOVQ $0, BX
+
 	// SIGSTOP to wait for attach.
 	//
 	// The SYSCALL instruction will be used for future syscall injection by
@@ -73,23 +75,26 @@ begin:
 	MOVQ $SIGSTOP, SI
 	SYSCALL
 
-	// The tracer may "detach" and/or allow code execution here in three cases:
-	//
-	// 1. New (traced) stub threads are explicitly detached by the
-	// goroutine in newSubprocess. However, they are detached while in
-	// group-stop, so they do not execute code here.
-	//
-	// 2. If a tracer thread exits, it implicitly detaches from the stub,
-	// potentially allowing code execution here. However, the Go runtime
-	// never exits individual threads, so this case never occurs.
-	//
-	// 3. subprocess.createStub clones a new stub process that is untraced,
+	// The sentry sets BX to 1 when creating stub process.
+	CMPQ BX, $1
+	JE clone
+
+	// Notify the Sentry that syscall exited.
+done:
+	INT $3
+	// Be paranoid.
+	JMP done
+clone:
+	// subprocess.createStub clones a new stub process that is untraced,
 	// thus executing this code. We setup the PDEATHSIG before SIGSTOPing
 	// ourselves for attach by the tracer.
 	//
 	// R15 has been updated with the expected PPID.
-	JMP begin
+	CMPQ AX, $0
+	JE begin
 
+	// The clone syscall returns a non-zero value.
+	JMP done
 error:
 	// Exit with -errno.
 	MOVQ AX, DI
diff --git a/pkg/sentry/platform/ptrace/stub_arm64.s b/pkg/sentry/platform/ptrace/stub_arm64.s
index 2c5e4d5cb..6162df02a 100644
--- a/pkg/sentry/platform/ptrace/stub_arm64.s
+++ b/pkg/sentry/platform/ptrace/stub_arm64.s
@@ -59,6 +59,8 @@ begin:
 	CMP $0x0, R0
 	BLT error
 
+	MOVD $0, R9
+
 	// SIGSTOP to wait for attach.
 	//
 	// The SYSCALL instruction will be used for future syscall injection by
@@ -66,22 +68,26 @@ begin:
 	MOVD $SYS_KILL, R8
 	MOVD $SIGSTOP, R1
 	SVC
-	// The tracer may "detach" and/or allow code execution here in three cases:
-	//
-	// 1. New (traced) stub threads are explicitly detached by the
-	// goroutine in newSubprocess. However, they are detached while in
-	// group-stop, so they do not execute code here.
-	//
-	// 2. If a tracer thread exits, it implicitly detaches from the stub,
-	// potentially allowing code execution here. However, the Go runtime
-	// never exits individual threads, so this case never occurs.
-	//
-	// 3. subprocess.createStub clones a new stub process that is untraced,
+
+	// The sentry sets R9 to 1 when creating stub process.
+	CMP $1, R9
+	BEQ clone
+
+done:
+	// Notify the Sentry that syscall exited.
+	BRK $3
+	B done // Be paranoid.
+clone:
+	// subprocess.createStub clones a new stub process that is untraced,
 	// thus executing this code. We setup the PDEATHSIG before SIGSTOPing
 	// ourselves for attach by the tracer.
 	//
 	// R7 has been updated with the expected PPID.
-	B begin
+	CMP $0, R0
+	BEQ begin
+
+	// The clone system call returned a non-zero value.
+	B done
 
 error:
 	// Exit with -errno.
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 821f6848d..20244fd95 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -430,13 +430,15 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 	}
 
 	for {
-		// Execute the syscall instruction.
-		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
+		// Execute the syscall instruction. The task has to stop on the
+		// trap instruction which is right after the syscall
+		// instruction.
+		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
 		}
 
 		sig := t.wait(stopped)
-		if sig == (syscallEvent | syscall.SIGTRAP) {
+		if sig == syscall.SIGTRAP {
 			// Reached syscall-enter-stop.
 			break
 		} else {
@@ -448,18 +450,6 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 		}
 	}
 
-	// Complete the actual system call.
-	if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
-		panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
-	}
-
-	// Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens
-	// between syscall-enter-stop and syscall-exit-stop; it happens *after*
-	// syscall-exit-stop.)" - ptrace(2), "Syscall-stops"
-	if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) {
-		t.dumpAndPanic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig))
-	}
-
 	// Grab registers.
 	if err := t.getRegs(regs); err != nil {
 		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 606dc2b1d..e99798c56 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -141,9 +141,11 @@ func (t *thread) adjustInitRegsRip() {
 	t.initRegs.Rip -= initRegsRipAdjustment
 }
 
-// Pass the expected PPID to the child via R15 when creating stub process
+// Pass the expected PPID to the child via R15 when creating stub process.
 func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
 	initregs.R15 = uint64(ppid)
+	// Rbx has to be set to 1 when creating stub process.
+	initregs.Rbx = 1
 }
 
 // patchSignalInfo patches the signal info to account for hitting the seccomp
diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go
index 62a686ee7..7b975137f 100644
--- a/pkg/sentry/platform/ptrace/subprocess_arm64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go
@@ -127,6 +127,8 @@ func (t *thread) adjustInitRegsRip() {
 // Pass the expected PPID to the child via X7 when creating stub process
 func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
 	initregs.Regs[7] = uint64(ppid)
+	// R9 has to be set to 1 when creating stub process.
+	initregs.Regs[9] = 1
 }
 
 // patchSignalInfo patches the signal info to account for hitting the seccomp
-- 
cgit v1.2.3


From 8dfd92284016f7c719b5766506cf3d6ab9c39c0e Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Mon, 6 Jan 2020 16:04:19 -0800
Subject: Pass the NIC-internal name to the NIC name function when generating
 opaque IIDs

Pass the NIC-internal name to the NIC name function when generating opaque IIDs
so implementations can use the name that was provided when the NIC was created.
Previously, explicit NICID to NIC name resolution was required from the netstack
integrator.

Tests: Test that the name provided when creating a NIC is passed to the NIC name
function when generating opaque IIDs.
PiperOrigin-RevId: 288395359
---
 pkg/tcpip/header/ipv6_test.go |  8 ++--
 pkg/tcpip/stack/ndp.go        |  2 +-
 pkg/tcpip/stack/ndp_test.go   | 13 ++++---
 pkg/tcpip/stack/nic.go        |  2 +-
 pkg/tcpip/stack/stack.go      |  9 ++++-
 pkg/tcpip/stack/stack_test.go | 90 +++++++++++++++++++++++++------------------
 6 files changed, 72 insertions(+), 52 deletions(-)

diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go
index cd1862e42..1994003ed 100644
--- a/pkg/tcpip/header/ipv6_test.go
+++ b/pkg/tcpip/header/ipv6_test.go
@@ -96,7 +96,7 @@ func TestAppendOpaqueInterfaceIdentifier(t *testing.T) {
 			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes*2],
 		},
 		{
-			name: "Nil SecretKey",
+			name: "Nil SecretKey and empty nicName",
 			prefix: func() tcpip.Subnet {
 				addrWithPrefix := tcpip.AddressWithPrefix{
 					Address:   "\x01\x02\x03\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -104,7 +104,7 @@ func TestAppendOpaqueInterfaceIdentifier(t *testing.T) {
 				}
 				return addrWithPrefix.Subnet()
 			}(),
-			nicName:    "eth12",
+			nicName:    "",
 			dadCounter: 3,
 			secretKey:  nil,
 		},
@@ -178,8 +178,8 @@ func TestLinkLocalAddrWithOpaqueIID(t *testing.T) {
 			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes*2],
 		},
 		{
-			name:       "Nil SecretKey",
-			nicName:    "eth12",
+			name:       "Nil SecretKey and empty nicName",
+			nicName:    "",
 			dadCounter: 3,
 			secretKey:  nil,
 		},
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index ba6a57e6f..238bc27dc 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -1030,7 +1030,7 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 
 	addrBytes := []byte(prefix.ID())
 	if oIID := ndp.nic.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
-		addrBytes = header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], prefix, oIID.NICNameFromID(ndp.nic.ID()), 0 /* dadCounter */, oIID.SecretKey)
+		addrBytes = header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], prefix, oIID.NICNameFromID(ndp.nic.ID(), ndp.nic.name), 0 /* dadCounter */, oIID.SecretKey)
 	} else {
 		// Only attempt to generate an interface-specific IID if we have a valid
 		// link address.
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 8e817e730..9430844d3 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -1918,6 +1918,7 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
 	t.Parallel()
 
 	const nicID = 1
+	const nicName = "nic1"
 	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes]byte
 	secretKey := secretKeyBuf[:]
 	n, err := rand.Read(secretKey)
@@ -1935,12 +1936,12 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
 	// defined by RFC 7217.
 	addrBytes := []byte(subnet1.ID())
 	addr1 := tcpip.AddressWithPrefix{
-		Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet1, "nic1", 0, secretKey)),
+		Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet1, nicName, 0, secretKey)),
 		PrefixLen: 64,
 	}
 	addrBytes = []byte(subnet2.ID())
 	addr2 := tcpip.AddressWithPrefix{
-		Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet2, "nic1", 0, secretKey)),
+		Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet2, nicName, 0, secretKey)),
 		PrefixLen: 64,
 	}
 
@@ -1956,15 +1957,15 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
 		},
 		NDPDisp: &ndpDisp,
 		OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
-			NICNameFromID: func(nicID tcpip.NICID) string {
-				return fmt.Sprintf("nic%d", nicID)
+			NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+				return nicName
 			},
 			SecretKey: secretKey,
 		},
 	})
 
-	if err := s.CreateNIC(nicID, e); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	if err := s.CreateNamedNIC(nicID, nicName, e); err != nil {
+		t.Fatalf("CreateNamedNIC(%d, %q, _) = %s", nicID, nicName, err)
 	}
 
 	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 3bed0af3c..044fe5298 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -180,7 +180,7 @@ func (n *NIC) enable() *tcpip.Error {
 
 	var addr tcpip.Address
 	if oIID := n.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
-		addr = header.LinkLocalAddrWithOpaqueIID(oIID.NICNameFromID(n.ID()), 0, oIID.SecretKey)
+		addr = header.LinkLocalAddrWithOpaqueIID(oIID.NICNameFromID(n.ID(), n.name), 0, oIID.SecretKey)
 	} else {
 		l2addr := n.linkEP.LinkAddress()
 
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index c6e6becf3..ffb379363 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -353,8 +353,13 @@ func (u *uniqueIDGenerator) UniqueID() uint64 {
 }
 
 // NICNameFromID is a function that returns a stable name for the specified NIC,
-// even if the NIC ID changes over time.
-type NICNameFromID func(tcpip.NICID) string
+// even if different NIC IDs are used to refer to the same NIC in different
+// program runs. It is used when generating opaque interface identifiers (IIDs).
+// If the NIC was created with a name, it will be passed to NICNameFromID.
+//
+// NICNameFromID SHOULD return unique NIC names so unique opaque IIDs are
+// generated for the same prefix on differnt NICs.
+type NICNameFromID func(tcpip.NICID, string) string
 
 // OpaqueInterfaceIdentifierOptions holds the options related to the generation
 // of opaque interface indentifiers (IIDs) as defined by RFC 7217.
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index e18dfea83..f533949c0 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -1910,7 +1910,7 @@ func TestNICAutoGenAddr(t *testing.T) {
 			false,
 			linkAddr1,
 			stack.OpaqueInterfaceIdentifierOptions{
-				NICNameFromID: func(nicID tcpip.NICID) string {
+				NICNameFromID: func(nicID tcpip.NICID, _ string) string {
 					return fmt.Sprintf("nic%d", nicID)
 				},
 			},
@@ -2005,6 +2005,8 @@ func TestNICAutoGenAddr(t *testing.T) {
 // always be generated with opaque IIDs if configured to use them, even if the
 // NIC has an invalid MAC address.
 func TestNICAutoGenAddrWithOpaque(t *testing.T) {
+	const nicID = 1
+
 	var secretKey [header.OpaqueIIDSecretKeyMinBytes]byte
 	n, err := rand.Read(secretKey[:])
 	if err != nil {
@@ -2014,54 +2016,61 @@ func TestNICAutoGenAddrWithOpaque(t *testing.T) {
 		t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", header.OpaqueIIDSecretKeyMinBytes, n)
 	}
 
-	iidOpts := stack.OpaqueInterfaceIdentifierOptions{
-		NICNameFromID: func(nicID tcpip.NICID) string {
-			return fmt.Sprintf("nic%d", nicID)
-		},
-		SecretKey: secretKey[:],
-	}
-
 	tests := []struct {
-		name     string
-		autoGen  bool
-		linkAddr tcpip.LinkAddress
+		name      string
+		nicName   string
+		autoGen   bool
+		linkAddr  tcpip.LinkAddress
+		secretKey []byte
 	}{
 		{
-			"Disabled",
-			false,
-			linkAddr1,
+			name:      "Disabled",
+			nicName:   "nic1",
+			autoGen:   false,
+			linkAddr:  linkAddr1,
+			secretKey: secretKey[:],
 		},
 		{
-			"Enabled",
-			true,
-			linkAddr1,
+			name:      "Enabled",
+			nicName:   "nic1",
+			autoGen:   true,
+			linkAddr:  linkAddr1,
+			secretKey: secretKey[:],
 		},
 		// These are all cases where we would not have generated a
 		// link-local address if opaque IIDs were disabled.
 		{
-			"Nil MAC",
-			true,
-			tcpip.LinkAddress([]byte(nil)),
+			name:      "Nil MAC and empty nicName",
+			nicName:   "",
+			autoGen:   true,
+			linkAddr:  tcpip.LinkAddress([]byte(nil)),
+			secretKey: secretKey[:1],
 		},
 		{
-			"Empty MAC",
-			true,
-			tcpip.LinkAddress(""),
+			name:      "Empty MAC and empty nicName",
+			autoGen:   true,
+			linkAddr:  tcpip.LinkAddress(""),
+			secretKey: secretKey[:2],
 		},
 		{
-			"Invalid MAC",
-			true,
-			tcpip.LinkAddress("\x01\x02\x03"),
+			name:      "Invalid MAC",
+			nicName:   "test",
+			autoGen:   true,
+			linkAddr:  tcpip.LinkAddress("\x01\x02\x03"),
+			secretKey: secretKey[:3],
 		},
 		{
-			"Multicast MAC",
-			true,
-			tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+			name:      "Multicast MAC",
+			nicName:   "test2",
+			autoGen:   true,
+			linkAddr:  tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+			secretKey: secretKey[:4],
 		},
 		{
-			"Unspecified MAC",
-			true,
-			tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"),
+			name:     "Unspecified MAC and nil SecretKey",
+			nicName:  "test3",
+			autoGen:  true,
+			linkAddr: tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"),
 		},
 	}
 
@@ -2069,7 +2078,12 @@ func TestNICAutoGenAddrWithOpaque(t *testing.T) {
 		t.Run(test.name, func(t *testing.T) {
 			opts := stack.Options{
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				OpaqueIIDOpts:    iidOpts,
+				OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+					NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+						return nicName
+					},
+					SecretKey: test.secretKey,
+				},
 			}
 
 			if test.autoGen {
@@ -2082,19 +2096,19 @@ func TestNICAutoGenAddrWithOpaque(t *testing.T) {
 
 			e := channel.New(10, 1280, test.linkAddr)
 			s := stack.New(opts)
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
+			if err := s.CreateNamedNIC(nicID, test.nicName, e); err != nil {
+				t.Fatalf("CreateNamedNIC(%d, %q, _) = %s", nicID, test.nicName, err)
 			}
 
-			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+				t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err)
 			}
 
 			if test.autoGen {
 				// Should have auto-generated an address and
 				// resolved immediately (DAD is disabled).
-				if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddrWithOpaqueIID("nic1", 0, secretKey[:]), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
+				if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddrWithOpaqueIID(test.nicName, 0, test.secretKey), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
 					t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
 				}
 			} else {
-- 
cgit v1.2.3


From ed60bc326b7479995fee4a94d29cbc7d9dddef02 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 6 Jan 2020 16:48:07 -0800
Subject: Fix readme formatting.

PiperOrigin-RevId: 288402480
---
 test/iptables/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/iptables/README.md b/test/iptables/README.md
index b37cb2a96..9f8e34420 100644
--- a/test/iptables/README.md
+++ b/test/iptables/README.md
@@ -1,6 +1,6 @@
 # iptables Tests
 
-iptables tests are run via `scripts/iptables\_test.sh`.
+iptables tests are run via `scripts/iptables_test.sh`.
 
 ## Test Structure
 
-- 
cgit v1.2.3


From 2031cc4701d5bfd21b34d7b0f7dc86920a553385 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 7 Jan 2020 10:07:59 -0800
Subject: Disable auto-generation of IPv6 link-local addresses for loopback
 NICs

Test: Test that an IPv6 link-local address is not auto-generated for loopback
NICs, even when it is enabled for non-loopback NICS.
PiperOrigin-RevId: 288519591
---
 pkg/tcpip/stack/nic.go        |  3 ++-
 pkg/tcpip/stack/stack.go      | 18 +++++++++-------
 pkg/tcpip/stack/stack_test.go | 49 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 9 deletions(-)

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 044fe5298..523c2a699 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -174,7 +174,8 @@ func (n *NIC) enable() *tcpip.Error {
 		return err
 	}
 
-	if !n.stack.autoGenIPv6LinkLocal {
+	// Do not auto-generate an IPv6 link-local address for loopback devices.
+	if !n.stack.autoGenIPv6LinkLocal || n.loopback {
 		return nil
 	}
 
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index ffb379363..d4e98f277 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -444,8 +444,8 @@ type Stack struct {
 	ndpConfigs NDPConfigurations
 
 	// autoGenIPv6LinkLocal determines whether or not the stack will attempt
-	// to auto-generate an IPv6 link-local address for newly enabled NICs.
-	// See the AutoGenIPv6LinkLocal field of Options for more details.
+	// to auto-generate an IPv6 link-local address for newly enabled non-loopback
+	// NICs. See the AutoGenIPv6LinkLocal field of Options for more details.
 	autoGenIPv6LinkLocal bool
 
 	// ndpDisp is the NDP event dispatcher that is used to send the netstack
@@ -496,13 +496,15 @@ type Options struct {
 	// before assigning an address to a NIC.
 	NDPConfigs NDPConfigurations
 
-	// AutoGenIPv6LinkLocal determins whether or not the stack will attempt
-	// to auto-generate an IPv6 link-local address for newly enabled NICs.
+	// AutoGenIPv6LinkLocal determines whether or not the stack will attempt to
+	// auto-generate an IPv6 link-local address for newly enabled non-loopback
+	// NICs.
+	//
 	// Note, setting this to true does not mean that a link-local address
-	// will be assigned right away, or at all. If Duplicate Address
-	// Detection is enabled, an address will only be assigned if it
-	// successfully resolves. If it fails, no further attempt will be made
-	// to auto-generate an IPv6 link-local address.
+	// will be assigned right away, or at all. If Duplicate Address Detection
+	// is enabled, an address will only be assigned if it successfully resolves.
+	// If it fails, no further attempt will be made to auto-generate an IPv6
+	// link-local address.
 	//
 	// The generated link-local address will follow RFC 4291 Appendix A
 	// guidelines.
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index f533949c0..d970a4abb 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -2121,6 +2121,55 @@ func TestNICAutoGenAddrWithOpaque(t *testing.T) {
 	}
 }
 
+// TestNoLinkLocalAutoGenForLoopbackNIC tests that IPv6 link-local addresses are
+// not auto-generated for loopback NICs.
+func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) {
+	const nicID = 1
+	const nicName = "nicName"
+
+	tests := []struct {
+		name          string
+		opaqueIIDOpts stack.OpaqueInterfaceIdentifierOptions
+	}{
+		{
+			name:          "IID From MAC",
+			opaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{},
+		},
+		{
+			name: "Opaque IID",
+			opaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+					return nicName
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opts := stack.Options{
+				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+				AutoGenIPv6LinkLocal: true,
+				OpaqueIIDOpts:        test.opaqueIIDOpts,
+			}
+
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(opts)
+			if err := s.CreateNamedLoopbackNIC(nicID, nicName, e); err != nil {
+				t.Fatalf("CreateNamedLoopbackNIC(%d, %q, _) = %s", nicID, nicName, err)
+			}
+
+			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Errorf("got stack.GetMainNICAddress(%d, _) = %s, want = %s", nicID, addr, want)
+			}
+		})
+	}
+}
+
 // TestNICAutoGenAddrDoesDAD tests that the successful auto-generation of IPv6
 // link-local addresses will only be assigned after the DAD process resolves.
 func TestNICAutoGenAddrDoesDAD(t *testing.T) {
-- 
cgit v1.2.3


From 08a97a6d193f46cd547fadae3bb4125cc788543b Mon Sep 17 00:00:00 2001
From: Marek Majkowski <marek@cloudflare.com>
Date: Thu, 2 Jan 2020 12:09:24 +0100
Subject: #1398 - send ACK when available buffer space gets larger than 1 MSS

When receiving data, netstack avoids sending spurious acks. When
user does recv() should netstack send ack telling the sender that
the window was increased? It depends. Before this patch, netstack
_will_ send the ack in the case when window was zero or window >>
scale was zero. Basically - when recv space increased from zero.

This is not working right with silly-window-avoidance on the sender
side. Some network stacks refuse to transmit segments, that will fill
the window but are below MSS. Before this patch, this confuses
netstack. On one hand if the window was like 3 bytes, netstack
will _not_ send ack if the window increases. On the other hand
sending party will refuse to transmit 3-byte packet.

This patch changes that, making netstack will send an ACK when
the available buffer size increases to or above 1*MSS. This will
inform other party buffer is large enough, and hopefully uncork it.


Signed-off-by: Marek Majkowski <marek@cloudflare.com>
---
 pkg/tcpip/transport/tcp/endpoint.go |  30 +++++---
 pkg/tcpip/transport/tcp/rcv.go      |   6 --
 pkg/tcpip/transport/tcp/tcp_test.go | 139 ++++++++++++++++++++++++++++++++++++
 3 files changed, 161 insertions(+), 14 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index fe629aa40..5d42f8045 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -955,10 +955,20 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 	}
 
 	e.rcvBufUsed -= len(v)
+
+	avail := e.receiveBufferAvailableLocked()
+
+	// If the window was small before, lower than MSS, send immediate
+	// ack. Without this, the sender might be stuck  waiting for the
+	// window to grow, while we think the window is non-zero so we
+	// don't need to send acks. To avoid silly window syndrome, send
+	// ack only when the window grows above one MSS.
+	crossedMSS := avail-len(v) < int(e.amss) && avail >= int(e.amss)
+
 	// If the window was zero before this read and if the read freed up
 	// enough buffer space for the scaled window to be non-zero then notify
 	// the protocol goroutine to send a window update.
-	if e.zeroWindow && !e.zeroReceiveWindow(e.rcv.rcvWndScale) {
+	if (e.zeroWindow && !e.zeroReceiveWindow(e.rcv.rcvWndScale)) || crossedMSS {
 		e.zeroWindow = false
 		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
 	}
@@ -1138,11 +1148,8 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 //
 // It must be called with rcvListMu held.
 func (e *endpoint) zeroReceiveWindow(scale uint8) bool {
-	if e.rcvBufUsed >= e.rcvBufSize {
-		return true
-	}
-
-	return ((e.rcvBufSize - e.rcvBufUsed) >> scale) == 0
+	avail := e.receiveBufferAvailableLocked()
+	return (avail >> scale) == 0
 }
 
 // SetSockOptInt sets a socket option.
@@ -1181,9 +1188,16 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
 			size = math.MaxInt32 / 2
 		}
 
+		availBefore := e.receiveBufferAvailableLocked()
 		e.rcvBufSize = size
+		availAfter := e.receiveBufferAvailableLocked()
+
 		e.rcvAutoParams.disabled = true
-		if e.zeroWindow && !e.zeroReceiveWindow(scale) {
+
+		// Immediatelly send ACK in two cases: when the buffer
+		// grows so that it leaves zero-window state, when the
+		// buffer grows from small < MSS to >= MSS.
+		if (e.zeroWindow && !e.zeroReceiveWindow(scale)) || (availBefore < int(e.amss) && availAfter >= int(e.amss)) {
 			e.zeroWindow = false
 			mask |= notifyNonZeroReceiveWindow
 		}
@@ -2229,7 +2243,7 @@ func (e *endpoint) readyToRead(s *segment) {
 		// we set the zero window before we deliver the segment to ensure
 		// that a subsequent read of the segment will correctly trigger
 		// a non-zero notification.
-		if avail := e.receiveBufferAvailableLocked(); avail>>e.rcv.rcvWndScale == 0 {
+		if e.zeroReceiveWindow(e.rcv.rcvWndScale) {
 			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
 			e.zeroWindow = true
 		}
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 0a5534959..05c8488f8 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -98,12 +98,6 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
 // in such cases we may need to send an ack to indicate to our peer that it can
 // resume sending data.
 func (r *receiver) nonZeroWindow() {
-	if (r.rcvAcc-r.rcvNxt)>>r.rcvWndScale != 0 {
-		// We never got around to announcing a zero window size, so we
-		// don't need to immediately announce a nonzero one.
-		return
-	}
-
 	// Immediately send an ack.
 	r.ep.snd.sendAck()
 }
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index e8fe4dab5..a05365c6a 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -6561,3 +6561,142 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want)
 	}
 }
+
+func TestIncreaseWindowOnReceive(t *testing.T) {
+	// This test ensures that the endpoint sends an ack,
+	// after recv() when the window grows to more than 1 MSS.
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	const rcvBuf = 65535 * 10
+	c.CreateConnected(789, 30000, rcvBuf)
+
+	// Write chunks of ~30000 bytes. It's important that two
+	// payloads make it equal or longer than MSS.
+	remain := rcvBuf
+	sent := 0
+	data := make([]byte, defaultMTU/2)
+	lastWnd := uint16(0)
+
+	for remain > len(data) {
+		c.SendPacket(data, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  seqnum.Value(790 + sent),
+			AckNum:  c.IRS.Add(1),
+			RcvWnd:  30000,
+		})
+		sent += len(data)
+		remain -= len(data)
+
+		lastWnd = uint16(remain)
+		if remain > 0xffff {
+			lastWnd = 0xffff
+		}
+		checker.IPv4(t, c.GetPacket(),
+			checker.PayloadLen(header.TCPMinimumSize),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)+1),
+				checker.AckNum(uint32(790+sent)),
+				checker.Window(lastWnd),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+	}
+
+	if lastWnd == 0xffff || lastWnd == 0 {
+		t.Fatalf("expected small, non-zero window: %d", lastWnd)
+	}
+
+	// We now have < 1 MSS in the buffer space.  Read the data! An
+	// ack should be sent in response to that. The window was not
+	// zero, but it grew to larger than MSS.
+	_, _, err := c.EP.Read(nil)
+	if err != nil {
+		t.Fatalf("Read failed: %v", err)
+	}
+
+	_, _, err = c.EP.Read(nil)
+	if err != nil {
+		t.Fatalf("Read failed: %v", err)
+	}
+
+	// After reading two packets, we surely crossed MSS. See the ack:
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+sent)),
+			checker.Window(uint16(0xffff)),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
+
+func TestIncreaseWindowOnBufferResize(t *testing.T) {
+	// This test ensures that the endpoint sends an ack,
+	// after available recv buffer grows to more than 1 MSS.
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	const rcvBuf = 65535 * 10
+	c.CreateConnected(789, 30000, rcvBuf)
+
+	// Write chunks of ~30000 bytes. It's important that two
+	// payloads make it equal or longer than MSS.
+	remain := rcvBuf
+	sent := 0
+	data := make([]byte, defaultMTU/2)
+	lastWnd := uint16(0)
+
+	for remain > len(data) {
+		c.SendPacket(data, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  seqnum.Value(790 + sent),
+			AckNum:  c.IRS.Add(1),
+			RcvWnd:  30000,
+		})
+		sent += len(data)
+		remain -= len(data)
+
+		lastWnd = uint16(remain)
+		if remain > 0xffff {
+			lastWnd = 0xffff
+		}
+		checker.IPv4(t, c.GetPacket(),
+			checker.PayloadLen(header.TCPMinimumSize),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)+1),
+				checker.AckNum(uint32(790+sent)),
+				checker.Window(lastWnd),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+	}
+
+	if lastWnd == 0xffff || lastWnd == 0 {
+		t.Fatalf("expected small, non-zero window: %d", lastWnd)
+	}
+
+	// Increasing the buffer from should generate an ACK,
+	// since window grew from small value to larger equal MSS
+	c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBuf*2)
+
+	// After reading two packets, we surely crossed MSS. See the ack:
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+sent)),
+			checker.Window(uint16(0xffff)),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
-- 
cgit v1.2.3


From 4e19d165ccc8035cd23eb31f34af82f1d6389907 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 7 Jan 2020 13:40:43 -0800
Subject: Support deprecating SLAAC addresses after the preferred lifetime

Support deprecating network endpoints on a NIC. If an endpoint is deprecated, it
should not be used for new connections unless a more preferred endpoint is not
available, or unless the deprecated endpoint was explicitly requested.

Test: Test that deprecated endpoints are only returned when more preferred
endpoints are not available and SLAAC addresses are deprecated after its
preferred lifetime
PiperOrigin-RevId: 288562705
---
 pkg/tcpip/stack/ndp.go      | 289 ++++++++++++++++--------
 pkg/tcpip/stack/ndp_test.go | 539 +++++++++++++++++++++++++++++++++++++++++++-
 pkg/tcpip/stack/nic.go      |  93 +++++++-
 pkg/tcpip/stack/stack.go    |  15 +-
 4 files changed, 827 insertions(+), 109 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 238bc27dc..4722ec9ce 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -169,6 +169,15 @@ type NDPDispatcher interface {
 	// call functions on the stack itself.
 	OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool
 
+	// OnAutoGenAddressDeprecated will be called when an auto-generated
+	// address (as part of SLAAC) has been deprecated, but is still
+	// considered valid. Note, if an address is invalidated at the same
+	// time it is deprecated, the deprecation event MAY be omitted.
+	//
+	// This function is not permitted to block indefinitely. It must not
+	// call functions on the stack itself.
+	OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix)
+
 	// OnAutoGenAddressInvalidated will be called when an auto-generated
 	// address (as part of SLAAC) has been invalidated.
 	//
@@ -335,6 +344,17 @@ type onLinkPrefixState struct {
 // autoGenAddressState holds data associated with an address generated via
 // SLAAC.
 type autoGenAddressState struct {
+	// A reference to the referencedNetworkEndpoint that this autoGenAddressState
+	// is holding state for.
+	ref *referencedNetworkEndpoint
+
+	deprecationTimer *time.Timer
+
+	// Used to signal the timer not to deprecate the SLAAC address in a race
+	// condition. Used for the same reason as doNotInvalidate, but for deprecating
+	// an address.
+	doNotDeprecate *bool
+
 	invalidationTimer *time.Timer
 
 	// Used to signal the timer not to invalidate the SLAAC address (A) in
@@ -912,103 +932,30 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 	prefix := pi.Subnet()
 
 	// Check if we already have an auto-generated address for prefix.
-	for _, ref := range ndp.nic.endpoints {
-		if ref.protocol != header.IPv6ProtocolNumber {
-			continue
-		}
-
-		if ref.configType != slaac {
-			continue
-		}
-
-		addr := ref.ep.ID().LocalAddress
-		refAddrWithPrefix := tcpip.AddressWithPrefix{Address: addr, PrefixLen: ref.ep.PrefixLen()}
+	for addr, addrState := range ndp.autoGenAddresses {
+		refAddrWithPrefix := tcpip.AddressWithPrefix{Address: addr, PrefixLen: addrState.ref.ep.PrefixLen()}
 		if refAddrWithPrefix.Subnet() != prefix {
 			continue
 		}
 
-		//
-		// At this point, we know we are refreshing a SLAAC generated
-		// IPv6 address with the prefix, prefix. Do the work as outlined
-		// by RFC 4862 section 5.5.3.e.
-		//
-
-		addrState, ok := ndp.autoGenAddresses[addr]
-		if !ok {
-			panic(fmt.Sprintf("must have an autoGenAddressess entry for the SLAAC generated IPv6 address %s", addr))
-		}
-
-		// TODO(b/143713887): Handle deprecating auto-generated address
-		//                    after the preferred lifetime.
-
-		// As per RFC 4862 section 5.5.3.e, the valid lifetime of the
-		// address generated by SLAAC is as follows:
-		//
-		// 1) If the received Valid Lifetime is greater than 2 hours or
-		//    greater than RemainingLifetime, set the valid lifetime of
-		//    the address to the advertised Valid Lifetime.
-		//
-		// 2) If RemainingLifetime is less than or equal to 2 hours,
-		//    ignore the advertised Valid Lifetime.
-		//
-		// 3) Otherwise, reset the valid lifetime of the address to 2
-		//    hours.
-
-		// Handle the infinite valid lifetime separately as we do not
-		// keep a timer in this case.
-		if vl >= header.NDPInfiniteLifetime {
-			if addrState.invalidationTimer != nil {
-				// Valid lifetime was finite before, but now it
-				// is valid forever.
-				if !addrState.invalidationTimer.Stop() {
-					*addrState.doNotInvalidate = true
-				}
-				addrState.invalidationTimer = nil
-				addrState.validUntil = time.Time{}
-				ndp.autoGenAddresses[addr] = addrState
-			}
-
-			return
-		}
-
-		var effectiveVl time.Duration
-		var rl time.Duration
-
-		// If the address was originally set to be valid forever,
-		// assume the remaining time to be the maximum possible value.
-		if addrState.invalidationTimer == nil {
-			rl = header.NDPInfiniteLifetime
-		} else {
-			rl = time.Until(addrState.validUntil)
-		}
-
-		if vl > MinPrefixInformationValidLifetimeForUpdate || vl > rl {
-			effectiveVl = vl
-		} else if rl <= MinPrefixInformationValidLifetimeForUpdate {
-			ndp.autoGenAddresses[addr] = addrState
-			return
-		} else {
-			effectiveVl = MinPrefixInformationValidLifetimeForUpdate
-		}
-
-		if addrState.invalidationTimer == nil {
-			addrState.invalidationTimer = ndp.autoGenAddrInvalidationTimer(addr, effectiveVl, addrState.doNotInvalidate)
-		} else {
-			if !addrState.invalidationTimer.Stop() {
-				*addrState.doNotInvalidate = true
-			}
-			addrState.invalidationTimer.Reset(effectiveVl)
-		}
-
-		addrState.validUntil = time.Now().Add(effectiveVl)
-		ndp.autoGenAddresses[addr] = addrState
+		// At this point, we know we are refreshing a SLAAC generated IPv6 address
+		// with the prefix prefix. Do the work as outlined by RFC 4862 section
+		// 5.5.3.e.
+		ndp.refreshAutoGenAddressLifetimes(addr, pl, vl)
 		return
 	}
 
 	// We do not already have an address within the prefix, prefix. Do the
 	// work as outlined by RFC 4862 section 5.5.3.d if n is configured
 	// to auto-generated global addresses by SLAAC.
+	ndp.newAutoGenAddress(prefix, pl, vl)
+}
 
+// newAutoGenAddress generates a new SLAAC address with the provided lifetimes
+// for prefix.
+//
+// pl is the new preferred lifetime. vl is the new valid lifetime.
+func (ndp *ndpState) newAutoGenAddress(prefix tcpip.Subnet, pl, vl time.Duration) {
 	// Are we configured to auto-generate new global addresses?
 	if !ndp.configs.AutoGenGlobalAddresses {
 		return
@@ -1067,18 +1014,25 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 		return
 	}
 
-	if _, err := ndp.nic.addAddressLocked(tcpip.ProtocolAddress{
+	protocolAddr := tcpip.ProtocolAddress{
 		Protocol:          header.IPv6ProtocolNumber,
 		AddressWithPrefix: addrWithPrefix,
-	}, FirstPrimaryEndpoint, permanent, slaac); err != nil {
-		panic(err)
+	}
+	// If the preferred lifetime is zero, then the address should be considered
+	// deprecated.
+	deprecated := pl == 0
+	ref, err := ndp.nic.addAddressLocked(protocolAddr, FirstPrimaryEndpoint, permanent, slaac, deprecated)
+	if err != nil {
+		log.Fatalf("ndp: error when adding address %s: %s", protocolAddr, err)
 	}
 
-	// Setup the timers to deprecate and invalidate this newly generated
-	// address.
+	// Setup the timers to deprecate and invalidate this newly generated address.
 
-	// TODO(b/143713887): Handle deprecating auto-generated addresses
-	//                    after the preferred lifetime.
+	var doNotDeprecate bool
+	var pTimer *time.Timer
+	if !deprecated && pl < header.NDPInfiniteLifetime {
+		pTimer = ndp.autoGenAddrDeprecationTimer(addr, pl, &doNotDeprecate)
+	}
 
 	var doNotInvalidate bool
 	var vTimer *time.Timer
@@ -1087,12 +1041,126 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 	}
 
 	ndp.autoGenAddresses[addr] = autoGenAddressState{
+		ref:               ref,
+		deprecationTimer:  pTimer,
+		doNotDeprecate:    &doNotDeprecate,
 		invalidationTimer: vTimer,
 		doNotInvalidate:   &doNotInvalidate,
 		validUntil:        time.Now().Add(vl),
 	}
 }
 
+// refreshAutoGenAddressLifetimes refreshes the lifetime of a SLAAC generated
+// address addr.
+//
+// pl is the new preferred lifetime. vl is the new valid lifetime.
+func (ndp *ndpState) refreshAutoGenAddressLifetimes(addr tcpip.Address, pl, vl time.Duration) {
+	addrState, ok := ndp.autoGenAddresses[addr]
+	if !ok {
+		log.Fatalf("ndp: SLAAC state not found to refresh lifetimes for %s", addr)
+	}
+	defer func() { ndp.autoGenAddresses[addr] = addrState }()
+
+	// If the preferred lifetime is zero, then the address should be considered
+	// deprecated.
+	deprecated := pl == 0
+	wasDeprecated := addrState.ref.deprecated
+	addrState.ref.deprecated = deprecated
+
+	// Only send the deprecation event if the deprecated status for addr just
+	// changed from non-deprecated to deprecated.
+	if !wasDeprecated && deprecated {
+		ndp.notifyAutoGenAddressDeprecated(addr)
+	}
+
+	// If addr was preferred for some finite lifetime before, stop the deprecation
+	// timer so it can be reset.
+	if t := addrState.deprecationTimer; t != nil && !t.Stop() {
+		*addrState.doNotDeprecate = true
+	}
+
+	// Reset the deprecation timer.
+	if pl >= header.NDPInfiniteLifetime || deprecated {
+		// If addr is preferred forever or it has been deprecated already, there is
+		// no need for a deprecation timer.
+		addrState.deprecationTimer = nil
+	} else if addrState.deprecationTimer == nil {
+		// addr is now preferred for a finite lifetime.
+		addrState.deprecationTimer = ndp.autoGenAddrDeprecationTimer(addr, pl, addrState.doNotDeprecate)
+	} else {
+		// addr continues to be preferred for a finite lifetime.
+		addrState.deprecationTimer.Reset(pl)
+	}
+
+	// As per RFC 4862 section 5.5.3.e, the valid lifetime of the address
+	//
+	//
+	// 1) If the received Valid Lifetime is greater than 2 hours or greater than
+	//    RemainingLifetime, set the valid lifetime of the address to the
+	//    advertised Valid Lifetime.
+	//
+	// 2) If RemainingLifetime is less than or equal to 2 hours, ignore the
+	//    advertised Valid Lifetime.
+	//
+	// 3) Otherwise, reset the valid lifetime of the address to 2 hours.
+
+	// Handle the infinite valid lifetime separately as we do not keep a timer in
+	// this case.
+	if vl >= header.NDPInfiniteLifetime {
+		if addrState.invalidationTimer != nil {
+			// Valid lifetime was finite before, but now it is valid forever.
+			if !addrState.invalidationTimer.Stop() {
+				*addrState.doNotInvalidate = true
+			}
+			addrState.invalidationTimer = nil
+			addrState.validUntil = time.Time{}
+		}
+
+		return
+	}
+
+	var effectiveVl time.Duration
+	var rl time.Duration
+
+	// If the address was originally set to be valid forever, assume the remaining
+	// time to be the maximum possible value.
+	if addrState.invalidationTimer == nil {
+		rl = header.NDPInfiniteLifetime
+	} else {
+		rl = time.Until(addrState.validUntil)
+	}
+
+	if vl > MinPrefixInformationValidLifetimeForUpdate || vl > rl {
+		effectiveVl = vl
+	} else if rl <= MinPrefixInformationValidLifetimeForUpdate {
+		return
+	} else {
+		effectiveVl = MinPrefixInformationValidLifetimeForUpdate
+	}
+
+	if addrState.invalidationTimer == nil {
+		addrState.invalidationTimer = ndp.autoGenAddrInvalidationTimer(addr, effectiveVl, addrState.doNotInvalidate)
+	} else {
+		if !addrState.invalidationTimer.Stop() {
+			*addrState.doNotInvalidate = true
+		}
+		addrState.invalidationTimer.Reset(effectiveVl)
+	}
+
+	addrState.validUntil = time.Now().Add(effectiveVl)
+}
+
+// notifyAutoGenAddressDeprecated notifies the stack's NDP dispatcher that addr
+// has been deprecated.
+func (ndp *ndpState) notifyAutoGenAddressDeprecated(addr tcpip.Address) {
+	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnAutoGenAddressDeprecated(ndp.nic.ID(), tcpip.AddressWithPrefix{
+			Address:   addr,
+			PrefixLen: validPrefixLenForAutoGen,
+		})
+	}
+}
+
 // invalidateAutoGenAddress invalidates an auto-generated address.
 //
 // The NIC that ndp belongs to MUST be locked.
@@ -1118,6 +1186,14 @@ func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bo
 		return false
 	}
 
+	if state.deprecationTimer != nil {
+		state.deprecationTimer.Stop()
+		state.deprecationTimer = nil
+		*state.doNotDeprecate = true
+	}
+
+	state.doNotDeprecate = nil
+
 	if state.invalidationTimer != nil {
 		state.invalidationTimer.Stop()
 		state.invalidationTimer = nil
@@ -1138,6 +1214,33 @@ func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bo
 	return true
 }
 
+// autoGenAddrDeprecationTimer returns a new deprecation timer for an
+// auto-generated address that fires after pl.
+//
+// doNotDeprecate is used to inform the timer when it fires at the same time
+// that an auto-generated address's preferred lifetime gets refreshed. See
+// autoGenAddrState.doNotDeprecate for more details.
+func (ndp *ndpState) autoGenAddrDeprecationTimer(addr tcpip.Address, pl time.Duration, doNotDeprecate *bool) *time.Timer {
+	return time.AfterFunc(pl, func() {
+		ndp.nic.mu.Lock()
+		defer ndp.nic.mu.Unlock()
+
+		if *doNotDeprecate {
+			*doNotDeprecate = false
+			return
+		}
+
+		addrState, ok := ndp.autoGenAddresses[addr]
+		if !ok {
+			log.Fatalf("ndp: must have an autoGenAddressess entry for the SLAAC generated IPv6 address %s", addr)
+		}
+		addrState.ref.deprecated = true
+		ndp.notifyAutoGenAddressDeprecated(addr)
+		addrState.deprecationTimer = nil
+		ndp.autoGenAddresses[addr] = addrState
+	})
+}
+
 // autoGenAddrInvalidationTimer returns a new invalidation timer for an
 // auto-generated address that fires after vl.
 //
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 9430844d3..8d89859ba 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -30,6 +30,8 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/pkg/waiter"
 )
 
 const (
@@ -46,6 +48,10 @@ var (
 	llAddr1 = header.LinkLocalAddr(linkAddr1)
 	llAddr2 = header.LinkLocalAddr(linkAddr2)
 	llAddr3 = header.LinkLocalAddr(linkAddr3)
+	dstAddr = tcpip.FullAddress{
+		Addr: "\x0a\x0b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01",
+		Port: 25,
+	}
 )
 
 func addrForSubnet(subnet tcpip.Subnet, linkAddr tcpip.LinkAddress) tcpip.AddressWithPrefix {
@@ -136,6 +142,7 @@ type ndpAutoGenAddrEventType int
 
 const (
 	newAddr ndpAutoGenAddrEventType = iota
+	deprecatedAddr
 	invalidatedAddr
 )
 
@@ -240,6 +247,16 @@ func (n *ndpDispatcher) OnAutoGenAddress(nicID tcpip.NICID, addr tcpip.AddressWi
 	return true
 }
 
+func (n *ndpDispatcher) OnAutoGenAddressDeprecated(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) {
+	if c := n.autoGenAddrC; c != nil {
+		c <- ndpAutoGenAddrEvent{
+			nicID,
+			addr,
+			deprecatedAddr,
+		}
+	}
+}
+
 func (n *ndpDispatcher) OnAutoGenAddressInvalidated(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) {
 	if c := n.autoGenAddrC; c != nil {
 		c <- ndpAutoGenAddrEvent{
@@ -1638,12 +1655,532 @@ func TestAutoGenAddr(t *testing.T) {
 	}
 }
 
+// stackAndNdpDispatcherWithDefaultRoute returns an ndpDispatcher,
+// channel.Endpoint and stack.Stack.
+//
+// stack.Stack will have a default route through the router (llAddr3) installed
+// and a static link-address (linkAddr3) added to the link address cache for the
+// router.
+func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*ndpDispatcher, *channel.Endpoint, *stack.Stack) {
+	t.Helper()
+	ndpDisp := &ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: ndpDisp,
+	})
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+	s.SetRouteTable([]tcpip.Route{{
+		Destination: header.IPv6EmptySubnet,
+		Gateway:     llAddr3,
+		NIC:         nicID,
+	}})
+	s.AddLinkAddress(nicID, llAddr3, linkAddr3)
+	return ndpDisp, e, s
+}
+
+// addrForNewConnection returns the local address used when creating a new
+// connection.
+func addrForNewConnection(t *testing.T, s *stack.Stack) tcpip.Address {
+	wq := waiter.Queue{}
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+	defer close(ch)
+	ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq)
+	if err != nil {
+		t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
+	}
+	defer ep.Close()
+	v := tcpip.V6OnlyOption(1)
+	if err := ep.SetSockOpt(v); err != nil {
+		t.Fatalf("SetSockOpt(%+v): %s", v, err)
+	}
+	if err := ep.Connect(dstAddr); err != nil {
+		t.Fatalf("ep.Connect(%+v): %s", dstAddr, err)
+	}
+	got, err := ep.GetLocalAddress()
+	if err != nil {
+		t.Fatalf("ep.GetLocalAddress(): %s", err)
+	}
+	return got.Addr
+}
+
+// addrForNewConnectionWithAddr returns the local address used when creating a
+// new connection with a specific local address.
+func addrForNewConnectionWithAddr(t *testing.T, s *stack.Stack, addr tcpip.FullAddress) tcpip.Address {
+	wq := waiter.Queue{}
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+	defer close(ch)
+	ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq)
+	if err != nil {
+		t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
+	}
+	defer ep.Close()
+	v := tcpip.V6OnlyOption(1)
+	if err := ep.SetSockOpt(v); err != nil {
+		t.Fatalf("SetSockOpt(%+v): %s", v, err)
+	}
+	if err := ep.Bind(addr); err != nil {
+		t.Fatalf("ep.Bind(%+v): %s", addr, err)
+	}
+	if err := ep.Connect(dstAddr); err != nil {
+		t.Fatalf("ep.Connect(%+v): %s", dstAddr, err)
+	}
+	got, err := ep.GetLocalAddress()
+	if err != nil {
+		t.Fatalf("ep.GetLocalAddress(): %s", err)
+	}
+	return got.Addr
+}
+
+// TestAutoGenAddrDeprecateFromPI tests deprecating a SLAAC address when
+// receiving a PI with 0 preferred lifetime.
+func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
+	const nicID = 1
+
+	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+
+	ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+		t.Helper()
+
+		if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+			t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+		} else if got != addr {
+			t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+		}
+
+		if got := addrForNewConnection(t, s); got != addr.Address {
+			t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+		}
+	}
+
+	// Receive PI for prefix1.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+	expectAutoGenAddrEvent(addr1, newAddr)
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should have %s in the list of addresses", addr1)
+	}
+	expectPrimaryAddr(addr1)
+
+	// Deprecate addr for prefix1 immedaitely.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
+	expectAutoGenAddrEvent(addr1, deprecatedAddr)
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should have %s in the list of addresses", addr1)
+	}
+	// addr should still be the primary endpoint as there are no other addresses.
+	expectPrimaryAddr(addr1)
+
+	// Refresh lifetimes of addr generated from prefix1.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+	expectPrimaryAddr(addr1)
+
+	// Receive PI for prefix2.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+	expectAutoGenAddrEvent(addr2, newAddr)
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	expectPrimaryAddr(addr2)
+
+	// Deprecate addr for prefix2 immedaitely.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+	expectAutoGenAddrEvent(addr2, deprecatedAddr)
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	// addr1 should be the primary endpoint now since addr2 is deprecated but
+	// addr1 is not.
+	expectPrimaryAddr(addr1)
+	// addr2 is deprecated but if explicitly requested, it should be used.
+	fullAddr2 := tcpip.FullAddress{Addr: addr2.Address, NIC: nicID}
+	if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr2.Address)
+	}
+
+	// Another PI w/ 0 preferred lifetime should not result in a deprecation
+	// event.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+	expectPrimaryAddr(addr1)
+	if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr2.Address)
+	}
+
+	// Refresh lifetimes of addr generated from prefix2.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+	expectPrimaryAddr(addr2)
+}
+
+// TestAutoGenAddrTimerDeprecation tests that an address is properly deprecated
+// when its preferred lifetime expires.
+func TestAutoGenAddrTimerDeprecation(t *testing.T) {
+	const nicID = 1
+	const newMinVL = 2
+	newMinVLDuration := newMinVL * time.Second
+	saved := stack.MinPrefixInformationValidLifetimeForUpdate
+	defer func() {
+		stack.MinPrefixInformationValidLifetimeForUpdate = saved
+	}()
+	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
+
+	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+
+	ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	expectAutoGenAddrEventAfter := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		case <-time.After(timeout):
+			t.Fatal("timed out waiting for addr auto gen event")
+		}
+	}
+
+	expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+		t.Helper()
+
+		if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+			t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+		} else if got != addr {
+			t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+		}
+
+		if got := addrForNewConnection(t, s); got != addr.Address {
+			t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+		}
+	}
+
+	// Receive PI for prefix2.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+	expectAutoGenAddrEvent(addr2, newAddr)
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	expectPrimaryAddr(addr2)
+
+	// Receive a PI for prefix1.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 90))
+	expectAutoGenAddrEvent(addr1, newAddr)
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should have %s in the list of addresses", addr1)
+	}
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	expectPrimaryAddr(addr1)
+
+	// Refresh lifetime for addr of prefix1.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+	expectPrimaryAddr(addr1)
+
+	// Wait for addr of prefix1 to be deprecated.
+	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultTimeout)
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should not have %s in the list of addresses", addr1)
+	}
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	// addr2 should be the primary endpoint now since addr1 is deprecated but
+	// addr2 is not.
+	expectPrimaryAddr(addr2)
+	// addr1 is deprecated but if explicitly requested, it should be used.
+	fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID}
+	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address)
+	}
+
+	// Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make
+	// sure we do not get a deprecation event again.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+	expectPrimaryAddr(addr2)
+	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address)
+	}
+
+	// Refresh lifetimes for addr of prefix1.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+	// addr1 is the primary endpoint again since it is non-deprecated now.
+	expectPrimaryAddr(addr1)
+
+	// Wait for addr of prefix1 to be deprecated.
+	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultTimeout)
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should not have %s in the list of addresses", addr1)
+	}
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	// addr2 should be the primary endpoint now since it is not deprecated.
+	expectPrimaryAddr(addr2)
+	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address)
+	}
+
+	// Wait for addr of prefix1 to be invalidated.
+	expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultTimeout)
+	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should not have %s in the list of addresses", addr1)
+	}
+	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	expectPrimaryAddr(addr2)
+
+	// Refresh both lifetimes for addr of prefix2 to the same value.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, newMinVL, newMinVL))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+
+	// Wait for a deprecation then invalidation events, or just an invalidation
+	// event. We need to cover both cases but cannot deterministically hit both
+	// cases because the deprecation and invalidation handlers could be handled in
+	// either deprecation then invalidation, or invalidation then deprecation
+	// (which should be cancelled by the invalidation handler).
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		if diff := checkAutoGenAddrEvent(e, addr2, deprecatedAddr); diff == "" {
+			// If we get a deprecation event first, we should get an invalidation
+			// event almost immediately after.
+			select {
+			case e := <-ndpDisp.autoGenAddrC:
+				if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff != "" {
+					t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+				}
+			case <-time.After(defaultTimeout):
+				t.Fatal("timed out waiting for addr auto gen event")
+			}
+		} else if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff == "" {
+			// If we get an invalidation  event first, we should not get a deprecation
+			// event after.
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			case <-time.After(defaultTimeout):
+			}
+		} else {
+			t.Fatalf("got unexpected auto-generated event")
+		}
+
+	case <-time.After(newMinVLDuration + defaultTimeout):
+		t.Fatal("timed out waiting for addr auto gen event")
+	}
+	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should not have %s in the list of addresses", addr1)
+	}
+	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should not have %s in the list of addresses", addr2)
+	}
+	// Should not have any primary endpoints.
+	if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+		t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+	} else if want := (tcpip.AddressWithPrefix{}); got != want {
+		t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, want)
+	}
+	wq := waiter.Queue{}
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+	defer close(ch)
+	ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq)
+	if err != nil {
+		t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
+	}
+	defer ep.Close()
+	v := tcpip.V6OnlyOption(1)
+	if err := ep.SetSockOpt(v); err != nil {
+		t.Fatalf("SetSockOpt(%+v): %s", v, err)
+	}
+
+	if err := ep.Connect(dstAddr); err != tcpip.ErrNoRoute {
+		t.Errorf("got ep.Connect(%+v) = %v, want = %s", dstAddr, err, tcpip.ErrNoRoute)
+	}
+}
+
+// Tests transitioning a SLAAC address's valid lifetime between finite and
+// infinite values.
+func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) {
+	const infiniteVLSeconds = 2
+	const minVLSeconds = 1
+	savedIL := header.NDPInfiniteLifetime
+	savedMinVL := stack.MinPrefixInformationValidLifetimeForUpdate
+	defer func() {
+		stack.MinPrefixInformationValidLifetimeForUpdate = savedMinVL
+		header.NDPInfiniteLifetime = savedIL
+	}()
+	stack.MinPrefixInformationValidLifetimeForUpdate = minVLSeconds * time.Second
+	header.NDPInfiniteLifetime = infiniteVLSeconds * time.Second
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+	tests := []struct {
+		name       string
+		infiniteVL uint32
+	}{
+		{
+			name:       "EqualToInfiniteVL",
+			infiniteVL: infiniteVLSeconds,
+		},
+		// Our implementation supports changing header.NDPInfiniteLifetime for tests
+		// such that a packet can be received where the lifetime field has a value
+		// greater than header.NDPInfiniteLifetime. Because of this, we test to make
+		// sure that receiving a value greater than header.NDPInfiniteLifetime is
+		// handled the same as when receiving a value equal to
+		// header.NDPInfiniteLifetime.
+		{
+			name:       "MoreThanInfiniteVL",
+			infiniteVL: infiniteVLSeconds + 1,
+		},
+	}
+
+	// This Run will not return until the parallel tests finish.
+	//
+	// We need this because we need to do some teardown work after the
+	// parallel tests complete.
+	//
+	// See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+	// more details.
+	t.Run("group", func(t *testing.T) {
+		for _, test := range tests {
+			test := test
+
+			t.Run(test.name, func(t *testing.T) {
+				t.Parallel()
+
+				ndpDisp := ndpDispatcher{
+					autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+				}
+				e := channel.New(0, 1280, linkAddr1)
+				s := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+					NDPConfigs: stack.NDPConfigurations{
+						HandleRAs:              true,
+						AutoGenGlobalAddresses: true,
+					},
+					NDPDisp: &ndpDisp,
+				})
+
+				if err := s.CreateNIC(1, e); err != nil {
+					t.Fatalf("CreateNIC(1) = %s", err)
+				}
+
+				// Receive an RA with finite prefix.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, 0))
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, newAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
+
+				// Receive an new RA with prefix with infinite VL.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.infiniteVL, 0))
+
+				// Receive a new RA with prefix with finite VL.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, 0))
+
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+
+				case <-time.After(minVLSeconds*time.Second + defaultTimeout):
+					t.Fatal("timeout waiting for addr auto gen event")
+				}
+			})
+		}
+	})
+}
+
 // TestAutoGenAddrValidLifetimeUpdates tests that the valid lifetime of an
 // auto-generated address only gets updated when required to, as specified in
 // RFC 4862 section 5.5.3.e.
 func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
 	const infiniteVL = 4294967295
-	const newMinVL = 5
+	const newMinVL = 4
 	saved := stack.MinPrefixInformationValidLifetimeForUpdate
 	defer func() {
 		stack.MinPrefixInformationValidLifetimeForUpdate = saved
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 523c2a699..5726c3642 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -249,17 +249,47 @@ func (n *NIC) setSpoofing(enable bool) {
 
 // primaryEndpoint returns the primary endpoint of n for the given network
 // protocol.
+//
+// primaryEndpoint will return the first non-deprecated endpoint if such an
+// endpoint exists. If no non-deprecated endpoint exists, the first deprecated
+// endpoint will be returned.
 func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedNetworkEndpoint {
 	n.mu.RLock()
 	defer n.mu.RUnlock()
 
+	var deprecatedEndpoint *referencedNetworkEndpoint
 	for _, r := range n.primary[protocol] {
-		if r.isValidForOutgoing() && r.tryIncRef() {
-			return r
+		if !r.isValidForOutgoing() {
+			continue
+		}
+
+		if !r.deprecated {
+			if r.tryIncRef() {
+				// r is not deprecated, so return it immediately.
+				//
+				// If we kept track of a deprecated endpoint, decrement its reference
+				// count since it was incremented when we decided to keep track of it.
+				if deprecatedEndpoint != nil {
+					deprecatedEndpoint.decRefLocked()
+					deprecatedEndpoint = nil
+				}
+
+				return r
+			}
+		} else if deprecatedEndpoint == nil && r.tryIncRef() {
+			// We prefer an endpoint that is not deprecated, but we keep track of r in
+			// case n doesn't have any non-deprecated endpoints.
+			//
+			// If we end up finding a more preferred endpoint, r's reference count
+			// will be decremented when such an endpoint is found.
+			deprecatedEndpoint = r
 		}
 	}
 
-	return nil
+	// n doesn't have any valid non-deprecated endpoints, so return
+	// deprecatedEndpoint (which may be nil if n doesn't have any valid deprecated
+	// endpoints either).
+	return deprecatedEndpoint
 }
 
 // hasPermanentAddrLocked returns true if n has a permanent (including currently
@@ -367,7 +397,7 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 			Address:   address,
 			PrefixLen: netProto.DefaultPrefixLen(),
 		},
-	}, peb, temporary, static)
+	}, peb, temporary, static, false)
 
 	n.mu.Unlock()
 	return ref
@@ -416,10 +446,10 @@ func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, p
 		}
 	}
 
-	return n.addAddressLocked(protocolAddress, peb, permanent, static)
+	return n.addAddressLocked(protocolAddress, peb, permanent, static, false)
 }
 
-func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind, configType networkEndpointConfigType) (*referencedNetworkEndpoint, *tcpip.Error) {
+func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind, configType networkEndpointConfigType, deprecated bool) (*referencedNetworkEndpoint, *tcpip.Error) {
 	// TODO(b/141022673): Validate IP address before adding them.
 
 	// Sanity check.
@@ -455,6 +485,7 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 		protocol:   protocolAddress.Protocol,
 		kind:       kind,
 		configType: configType,
+		deprecated: deprecated,
 	}
 
 	// Set up cache if link address resolution exists for this protocol.
@@ -553,6 +584,51 @@ func (n *NIC) PrimaryAddresses() []tcpip.ProtocolAddress {
 	return addrs
 }
 
+// primaryAddress returns the primary address associated with this NIC.
+//
+// primaryAddress will return the first non-deprecated address if such an
+// address exists. If no non-deprecated address exists, the first deprecated
+// address will be returned.
+func (n *NIC) primaryAddress(proto tcpip.NetworkProtocolNumber) tcpip.AddressWithPrefix {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	list, ok := n.primary[proto]
+	if !ok {
+		return tcpip.AddressWithPrefix{}
+	}
+
+	var deprecatedEndpoint *referencedNetworkEndpoint
+	for _, ref := range list {
+		// Don't include tentative, expired or tempory endpoints to avoid confusion
+		// and prevent the caller from using those.
+		switch ref.getKind() {
+		case permanentTentative, permanentExpired, temporary:
+			continue
+		}
+
+		if !ref.deprecated {
+			return tcpip.AddressWithPrefix{
+				Address:   ref.ep.ID().LocalAddress,
+				PrefixLen: ref.ep.PrefixLen(),
+			}
+		}
+
+		if deprecatedEndpoint == nil {
+			deprecatedEndpoint = ref
+		}
+	}
+
+	if deprecatedEndpoint != nil {
+		return tcpip.AddressWithPrefix{
+			Address:   deprecatedEndpoint.ep.ID().LocalAddress,
+			PrefixLen: deprecatedEndpoint.ep.PrefixLen(),
+		}
+	}
+
+	return tcpip.AddressWithPrefix{}
+}
+
 // AddAddressRange adds a range of addresses to n, so that it starts accepting
 // packets targeted at the given addresses and network protocol. The range is
 // given by a subnet address, and all addresses contained in the subnet are
@@ -1109,6 +1185,11 @@ type referencedNetworkEndpoint struct {
 	// configType is the method that was used to configure this endpoint.
 	// This must never change after the endpoint is added to a NIC.
 	configType networkEndpointConfigType
+
+	// deprecated indicates whether or not the endpoint should be considered
+	// deprecated. That is, when deprecated is true, other endpoints that are not
+	// deprecated should be preferred.
+	deprecated bool
 }
 
 func (r *referencedNetworkEndpoint) getKind() networkEndpointKind {
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index d4e98f277..583ede3e5 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -1036,9 +1036,11 @@ func (s *Stack) AllAddresses() map[tcpip.NICID][]tcpip.ProtocolAddress {
 	return nics
 }
 
-// GetMainNICAddress returns the first primary address and prefix for the given
-// NIC and protocol. Returns an error if the NIC doesn't exist and an empty
-// value if the NIC doesn't have a primary address for the given protocol.
+// GetMainNICAddress returns the first non-deprecated primary address and prefix
+// for the given NIC and protocol. If no non-deprecated primary address exists,
+// a deprecated primary address and prefix will be returned. Returns an error if
+// the NIC doesn't exist and an empty value if the NIC doesn't have a primary
+// address for the given protocol.
 func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, *tcpip.Error) {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
@@ -1048,12 +1050,7 @@ func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocol
 		return tcpip.AddressWithPrefix{}, tcpip.ErrUnknownNICID
 	}
 
-	for _, a := range nic.PrimaryAddresses() {
-		if a.Protocol == protocol {
-			return a.AddressWithPrefix, nil
-		}
-	}
-	return tcpip.AddressWithPrefix{}, nil
+	return nic.primaryAddress(protocol), nil
 }
 
 func (s *Stack) getRefEP(nic *NIC, localAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (ref *referencedNetworkEndpoint) {
-- 
cgit v1.2.3


From e77ad574233b779519a253c6f58197c339e9100a Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 7 Jan 2020 17:25:18 -0800
Subject: Fix partial_bad_buffer write tests.

The write tests are fitted to Linux-specific behavior, but it is not
well-specified. Tweak the tests to allow for both acceptable outcomes.

PiperOrigin-RevId: 288606386
---
 test/syscalls/linux/partial_bad_buffer.cc | 138 ++++++++++++++----------------
 1 file changed, 64 insertions(+), 74 deletions(-)

diff --git a/test/syscalls/linux/partial_bad_buffer.cc b/test/syscalls/linux/partial_bad_buffer.cc
index 33822ee57..df7129acc 100644
--- a/test/syscalls/linux/partial_bad_buffer.cc
+++ b/test/syscalls/linux/partial_bad_buffer.cc
@@ -18,7 +18,9 @@
 #include <netinet/tcp.h>
 #include <sys/mman.h>
 #include <sys/socket.h>
+#include <sys/stat.h>
 #include <sys/syscall.h>
+#include <sys/types.h>
 #include <sys/uio.h>
 #include <unistd.h>
 
@@ -62,9 +64,9 @@ class PartialBadBufferTest : public ::testing::Test {
     // Write some initial data.
     size_t size = sizeof(kMessage) - 1;
     EXPECT_THAT(WriteFd(fd_, &kMessage, size), SyscallSucceedsWithValue(size));
-
     ASSERT_THAT(lseek(fd_, 0, SEEK_SET), SyscallSucceeds());
 
+    // Map a useable buffer.
     addr_ = mmap(0, 2 * kPageSize, PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
     ASSERT_NE(addr_, MAP_FAILED);
@@ -79,6 +81,15 @@ class PartialBadBufferTest : public ::testing::Test {
     bad_buffer_ = buf + kPageSize - 1;
   }
 
+  off_t Size() {
+    struct stat st;
+    int rc = fstat(fd_, &st);
+    if (rc < 0) {
+      return static_cast<off_t>(rc);
+    }
+    return st.st_size;
+  }
+
   void TearDown() override {
     EXPECT_THAT(munmap(addr_, 2 * kPageSize), SyscallSucceeds()) << addr_;
     EXPECT_THAT(close(fd_), SyscallSucceeds());
@@ -165,97 +176,99 @@ TEST_F(PartialBadBufferTest, PreadvSmall) {
 }
 
 TEST_F(PartialBadBufferTest, WriteBig) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, kPageSize),
-              SyscallFailsWithErrno(EFAULT));
+  ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+  EXPECT_THAT(
+      (n = RetryEINTR(write)(fd_, bad_buffer_, kPageSize)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, WriteSmall) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, 10),
-              SyscallFailsWithErrno(EFAULT));
+  ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+  EXPECT_THAT(
+      (n = RetryEINTR(write)(fd_, bad_buffer_, 10)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, PwriteBig) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(pwrite)(fd_, bad_buffer_, kPageSize, 0),
-              SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(
+      (n = RetryEINTR(pwrite)(fd_, bad_buffer_, kPageSize, orig_size)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, PwriteSmall) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(pwrite)(fd_, bad_buffer_, 10, 0),
-              SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(
+      (n = RetryEINTR(pwrite)(fd_, bad_buffer_, 10, orig_size)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, WritevBig) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
-
   struct iovec vec;
   vec.iov_base = bad_buffer_;
   vec.iov_len = kPageSize;
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(writev)(fd_, &vec, 1), SyscallFailsWithErrno(EFAULT));
+  ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+  EXPECT_THAT(
+      (n = RetryEINTR(writev)(fd_, &vec, 1)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, WritevSmall) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
-
   struct iovec vec;
   vec.iov_base = bad_buffer_;
   vec.iov_len = 10;
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(writev)(fd_, &vec, 1), SyscallFailsWithErrno(EFAULT));
+  ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+  EXPECT_THAT(
+      (n = RetryEINTR(writev)(fd_, &vec, 1)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, PwritevBig) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
-
   struct iovec vec;
   vec.iov_base = bad_buffer_;
   vec.iov_len = kPageSize;
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(pwritev)(fd_, &vec, 1, 0),
-              SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(
+      (n = RetryEINTR(pwritev)(fd_, &vec, 1, orig_size)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, PwritevSmall) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
-
   struct iovec vec;
   vec.iov_base = bad_buffer_;
   vec.iov_len = 10;
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(pwritev)(fd_, &vec, 1, 0),
-              SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(
+      (n = RetryEINTR(pwritev)(fd_, &vec, 1, orig_size)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 // getdents returns EFAULT when the you claim the buffer is large enough, but
@@ -283,29 +296,6 @@ TEST_F(PartialBadBufferTest, GetdentsOneEntry) {
       SyscallSucceedsWithValue(Gt(0)));
 }
 
-// Verify that when write returns EFAULT the kernel hasn't silently written
-// the initial valid bytes.
-TEST_F(PartialBadBufferTest, WriteEfaultIsntPartial) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
-
-  bad_buffer_[0] = 'A';
-  EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, 10),
-              SyscallFailsWithErrno(EFAULT));
-
-  size_t size = 255;
-  char buf[255];
-  memset(buf, 0, size);
-
-  EXPECT_THAT(RetryEINTR(pread)(fd_, buf, size, 0),
-              SyscallSucceedsWithValue(sizeof(kMessage) - 1));
-
-  // 'A' has not been written.
-  EXPECT_STREQ(buf, kMessage);
-}
-
 PosixErrorOr<sockaddr_storage> InetLoopbackAddr(int family) {
   struct sockaddr_storage addr;
   memset(&addr, 0, sizeof(addr));
-- 
cgit v1.2.3


From a53ac7307abfeb7172e67f48d0a7aaa4b5c3f31e Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 7 Jan 2020 23:52:59 -0800
Subject: fs/splice: don't report a partialResult error if there is no data
 loss

PiperOrigin-RevId: 288642552
---
 pkg/sentry/fs/file.go          |  7 +++++++
 pkg/sentry/fs/splice.go        |  5 +++++
 test/syscalls/linux/inotify.cc | 28 ++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+)

diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index c0a6e884b..a2f966cb6 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -555,6 +555,10 @@ type lockedWriter struct {
 	//
 	// This applies only to Write, not WriteAt.
 	Offset int64
+
+	// Err contains the first error encountered while copying. This is
+	// useful to determine whether Writer or Reader failed during io.Copy.
+	Err error
 }
 
 // Write implements io.Writer.Write.
@@ -590,5 +594,8 @@ func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) {
 			break
 		}
 	}
+	if w.Err == nil {
+		w.Err = err
+	}
 	return written, err
 }
diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go
index 311798811..389c330a0 100644
--- a/pkg/sentry/fs/splice.go
+++ b/pkg/sentry/fs/splice.go
@@ -167,6 +167,11 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64,
 		if !srcPipe && !opts.SrcOffset {
 			atomic.StoreInt64(&src.offset, src.offset+n)
 		}
+
+		// Don't report any errors if we have some progress without data loss.
+		if w.Err == nil {
+			err = nil
+		}
 	}
 
 	// Drop locks.
diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index 7384c27dc..59ec9940a 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -1591,6 +1591,34 @@ TEST(Inotify, EpollNoDeadlock) {
   }
 }
 
+TEST(Inotify, SpliceEvent) {
+  int pipes[2];
+  ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds());
+
+  const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      root.path(), "some content", TempPath::kDefaultFileMode));
+
+  const FileDescriptor file1_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+  const int watcher = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+  char buf;
+  EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+
+  EXPECT_THAT(splice(fd.get(), nullptr, pipes[1], nullptr,
+                     sizeof(struct inotify_event) + 1, SPLICE_F_NONBLOCK),
+              SyscallSucceedsWithValue(sizeof(struct inotify_event)));
+
+  const FileDescriptor read_fd(pipes[0]);
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(read_fd.get()));
+  ASSERT_THAT(events, Are({Event(IN_ACCESS, watcher)}));
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From c276e4740f225068dc394a500dfdddd64af1d18a Mon Sep 17 00:00:00 2001
From: Marek Majkowski <marek@cloudflare.com>
Date: Wed, 8 Jan 2020 12:56:39 +0000
Subject: Fix #1522 - implement silly window sydrome protection on rx side

Before, each of small read()'s that raises window either from zero
or above threshold of aMSS, would generate an ACK. In a classic
silly-window-syndrome scenario, we can imagine a pessimistic case
when small read()'s generate a stream of ACKs.

This PR fixes that, essentially treating window size < aMSS as zero.
We send ACK exactly in a moment when window increases to >= aMSS
or half of receive buffer size (whichever smaller).
---
 pkg/tcpip/transport/tcp/endpoint.go | 73 +++++++++++++++++++++----------------
 pkg/tcpip/transport/tcp/tcp_test.go | 14 ++++---
 2 files changed, 51 insertions(+), 36 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 5d42f8045..8ff125855 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -956,20 +956,11 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 
 	e.rcvBufUsed -= len(v)
 
-	avail := e.receiveBufferAvailableLocked()
-
-	// If the window was small before, lower than MSS, send immediate
-	// ack. Without this, the sender might be stuck  waiting for the
-	// window to grow, while we think the window is non-zero so we
-	// don't need to send acks. To avoid silly window syndrome, send
-	// ack only when the window grows above one MSS.
-	crossedMSS := avail-len(v) < int(e.amss) && avail >= int(e.amss)
-
-	// If the window was zero before this read and if the read freed up
-	// enough buffer space for the scaled window to be non-zero then notify
-	// the protocol goroutine to send a window update.
-	if (e.zeroWindow && !e.zeroReceiveWindow(e.rcv.rcvWndScale)) || crossedMSS {
-		e.zeroWindow = false
+	// If the window was small before this read and if the read
+	// freed up enough buffer space, to either fit an aMSS or half
+	// a receive buffer (whichever smaller), then notify the
+	// protocol goroutine to send a window update.
+	if e.windowCrossedACKThreshold(len(v)) == 1 {
 		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
 	}
 
@@ -1143,13 +1134,35 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	return num, tcpip.ControlMessages{}, nil
 }
 
-// zeroReceiveWindow checks if the receive window to be announced now would be
-// zero, based on the amount of available buffer and the receive window scaling.
+// windowCrossedACKThreshold checks if the receive window to be announced now would be
+// under aMSS or under half receive buffer, whichever smaller. This is useful as
+// a receive side silly window syndrome prevention mechanism. If window grows
+// to reasonable value, we should send ACK to the sender to inform the rx space is now
+// large. We also want ensure a series of small read()'s won't trigger a flood of
+// spurious tiny ACK's.
 //
-// It must be called with rcvListMu held.
-func (e *endpoint) zeroReceiveWindow(scale uint8) bool {
-	avail := e.receiveBufferAvailableLocked()
-	return (avail >> scale) == 0
+// For large receive buffers, the threshold is aMSS - once reader reads more than aMSS
+// we'll send ACK. For tiny receive buffers, the threshold is half of receive buffer size.
+// This is chosen arbitrairly.
+func (e *endpoint) windowCrossedACKThreshold(deltaBefore int) int {
+	newAvail := e.receiveBufferAvailableLocked()
+	oldAvail := newAvail - deltaBefore
+	if oldAvail < 0 {
+		oldAvail = 0
+	}
+
+	threshold := int(e.amss)
+	if threshold > e.rcvBufSize/2 {
+		threshold = e.rcvBufSize / 2
+	}
+
+	switch {
+	case oldAvail < threshold && newAvail >= threshold:
+		return 1
+	case oldAvail >= threshold && newAvail < threshold:
+		return -1
+	}
+	return 0
 }
 
 // SetSockOptInt sets a socket option.
@@ -1194,11 +1207,11 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
 
 		e.rcvAutoParams.disabled = true
 
-		// Immediatelly send ACK in two cases: when the buffer
-		// grows so that it leaves zero-window state, when the
-		// buffer grows from small < MSS to >= MSS.
-		if (e.zeroWindow && !e.zeroReceiveWindow(scale)) || (availBefore < int(e.amss) && availAfter >= int(e.amss)) {
-			e.zeroWindow = false
+		// Immediatelly send an ACK to uncork the sender silly
+		// window syndrome prevetion, when our available space
+		// grows above aMSS or half receive buffer, whichever
+		// smaller.
+		if e.windowCrossedACKThreshold(availAfter-availBefore) == 1 {
 			mask |= notifyNonZeroReceiveWindow
 		}
 		e.rcvListMu.Unlock()
@@ -2239,13 +2252,11 @@ func (e *endpoint) readyToRead(s *segment) {
 	if s != nil {
 		s.incRef()
 		e.rcvBufUsed += s.data.Size()
-		// Check if the receive window is now closed. If so make sure
-		// we set the zero window before we deliver the segment to ensure
-		// that a subsequent read of the segment will correctly trigger
-		// a non-zero notification.
-		if e.zeroReceiveWindow(e.rcv.rcvWndScale) {
+		// Increase counter if the receive window falls down
+		// below MSS or half receive buffer size, whichever
+		// smaller.
+		if e.windowCrossedACKThreshold(-s.data.Size()) == -1 {
 			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
-			e.zeroWindow = true
 		}
 		e.rcvList.PushBack(s)
 	} else {
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index a05365c6a..4c2e458e3 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -2091,10 +2091,14 @@ func TestZeroScaledWindowReceive(t *testing.T) {
 		)
 	}
 
-	// Read some data. An ack should be sent in response to that.
-	v, _, err := c.EP.Read(nil)
-	if err != nil {
-		t.Fatalf("Read failed: %v", err)
+	// Read at least 1MSS of data. An ack should be sent in response to that.
+	sz := 0
+	for sz < defaultMTU {
+		v, _, err := c.EP.Read(nil)
+		if err != nil {
+			t.Fatalf("Read failed: %v", err)
+		}
+		sz += len(v)
 	}
 
 	checker.IPv4(t, c.GetPacket(),
@@ -2103,7 +2107,7 @@ func TestZeroScaledWindowReceive(t *testing.T) {
 			checker.DstPort(context.TestPort),
 			checker.SeqNum(uint32(c.IRS)+1),
 			checker.AckNum(uint32(790+sent)),
-			checker.Window(uint16(len(v)>>ws)),
+			checker.Window(uint16(sz>>ws)),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
-- 
cgit v1.2.3


From 0cc1e74b57e539e66c1a421c047a08635c0008e8 Mon Sep 17 00:00:00 2001
From: Bert Muthalaly <stijlist@google.com>
Date: Wed, 8 Jan 2020 09:28:53 -0800
Subject: Add NIC.isLoopback()

...enabling us to remove the "CreateNamedLoopbackNIC" variant of
CreateNIC and all the plumbing to connect it through to where the value
is read in FindRoute.

PiperOrigin-RevId: 288713093
---
 pkg/tcpip/stack/nic.go        | 18 ++++++++++--------
 pkg/tcpip/stack/stack.go      | 24 +++++++++---------------
 pkg/tcpip/stack/stack_test.go |  7 ++++---
 runsc/boot/network.go         | 16 +++++-----------
 4 files changed, 28 insertions(+), 37 deletions(-)

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 5726c3642..4144d5d0f 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -27,11 +27,10 @@ import (
 // NIC represents a "network interface card" to which the networking stack is
 // attached.
 type NIC struct {
-	stack    *Stack
-	id       tcpip.NICID
-	name     string
-	linkEP   LinkEndpoint
-	loopback bool
+	stack  *Stack
+	id     tcpip.NICID
+	name   string
+	linkEP LinkEndpoint
 
 	mu            sync.RWMutex
 	spoofing      bool
@@ -85,7 +84,7 @@ const (
 )
 
 // newNIC returns a new NIC using the default NDP configurations from stack.
-func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback bool) *NIC {
+func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint) *NIC {
 	// TODO(b/141011931): Validate a LinkEndpoint (ep) is valid. For
 	// example, make sure that the link address it provides is a valid
 	// unicast ethernet address.
@@ -99,7 +98,6 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback
 		id:         id,
 		name:       name,
 		linkEP:     ep,
-		loopback:   loopback,
 		primary:    make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint),
 		endpoints:  make(map[NetworkEndpointID]*referencedNetworkEndpoint),
 		mcastJoins: make(map[NetworkEndpointID]int32),
@@ -175,7 +173,7 @@ func (n *NIC) enable() *tcpip.Error {
 	}
 
 	// Do not auto-generate an IPv6 link-local address for loopback devices.
-	if !n.stack.autoGenIPv6LinkLocal || n.loopback {
+	if !n.stack.autoGenIPv6LinkLocal || n.isLoopback() {
 		return nil
 	}
 
@@ -240,6 +238,10 @@ func (n *NIC) isPromiscuousMode() bool {
 	return rv
 }
 
+func (n *NIC) isLoopback() bool {
+	return n.linkEP.Capabilities()&CapabilityLoopback != 0
+}
+
 // setSpoofing enables or disables address spoofing.
 func (n *NIC) setSpoofing(enable bool) {
 	n.mu.Lock()
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 583ede3e5..807f910f6 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -798,7 +798,7 @@ func (s *Stack) NewPacketEndpoint(cooked bool, netProto tcpip.NetworkProtocolNum
 
 // createNIC creates a NIC with the provided id and link-layer endpoint, and
 // optionally enable it.
-func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled, loopback bool) *tcpip.Error {
+func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled bool) *tcpip.Error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
@@ -807,7 +807,7 @@ func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled,
 		return tcpip.ErrDuplicateNICID
 	}
 
-	n := newNIC(s, id, name, ep, loopback)
+	n := newNIC(s, id, name, ep)
 
 	s.nics[id] = n
 	if enabled {
@@ -819,32 +819,26 @@ func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled,
 
 // CreateNIC creates a NIC with the provided id and link-layer endpoint.
 func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
-	return s.createNIC(id, "", ep, true, false)
+	return s.createNIC(id, "", ep, true)
 }
 
 // CreateNamedNIC creates a NIC with the provided id and link-layer endpoint,
 // and a human-readable name.
 func (s *Stack) CreateNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error {
-	return s.createNIC(id, name, ep, true, false)
-}
-
-// CreateNamedLoopbackNIC creates a NIC with the provided id and link-layer
-// endpoint, and a human-readable name.
-func (s *Stack) CreateNamedLoopbackNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error {
-	return s.createNIC(id, name, ep, true, true)
+	return s.createNIC(id, name, ep, true)
 }
 
 // CreateDisabledNIC creates a NIC with the provided id and link-layer endpoint,
 // but leave it disable. Stack.EnableNIC must be called before the link-layer
 // endpoint starts delivering packets to it.
 func (s *Stack) CreateDisabledNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
-	return s.createNIC(id, "", ep, false, false)
+	return s.createNIC(id, "", ep, false)
 }
 
 // CreateDisabledNamedNIC is a combination of CreateNamedNIC and
 // CreateDisabledNIC.
 func (s *Stack) CreateDisabledNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error {
-	return s.createNIC(id, name, ep, false, false)
+	return s.createNIC(id, name, ep, false)
 }
 
 // EnableNIC enables the given NIC so that the link-layer endpoint can start
@@ -911,7 +905,7 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo {
 			Up:          true, // Netstack interfaces are always up.
 			Running:     nic.linkEP.IsAttached(),
 			Promiscuous: nic.isPromiscuousMode(),
-			Loopback:    nic.linkEP.Capabilities()&CapabilityLoopback != 0,
+			Loopback:    nic.isLoopback(),
 		}
 		nics[id] = NICInfo{
 			Name:              nic.name,
@@ -1072,7 +1066,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 	if id != 0 && !needRoute {
 		if nic, ok := s.nics[id]; ok {
 			if ref := s.getRefEP(nic, localAddr, netProto); ref != nil {
-				return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback), nil
+				return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()), nil
 			}
 		}
 	} else {
@@ -1088,7 +1082,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 						remoteAddr = ref.ep.ID().LocalAddress
 					}
 
-					r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback)
+					r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback())
 					if needRoute {
 						r.NextHop = route.Gateway
 					}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index d970a4abb..bf057745e 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -32,6 +32,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
@@ -2153,10 +2154,10 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) {
 				OpaqueIIDOpts:        test.opaqueIIDOpts,
 			}
 
-			e := channel.New(0, 1280, linkAddr1)
+			e := loopback.New()
 			s := stack.New(opts)
-			if err := s.CreateNamedLoopbackNIC(nicID, nicName, e); err != nil {
-				t.Fatalf("CreateNamedLoopbackNIC(%d, %q, _) = %s", nicID, nicName, err)
+			if err := s.CreateNamedNIC(nicID, nicName, e); err != nil {
+				t.Fatalf("CreateNamedNIC(%d, %q, _) = %s", nicID, nicName, err)
 			}
 
 			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index dd4926bb9..0240fe323 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -126,7 +126,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		linkEP := loopback.New()
 
 		log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
-		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, true /* loopback */); err != nil {
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
 			return err
 		}
 
@@ -173,7 +173,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 
 		log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
-		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil {
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
 			return err
 		}
 
@@ -218,15 +218,9 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 
 // createNICWithAddrs creates a NIC in the network stack and adds the given
 // addresses.
-func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP, loopback bool) error {
-	if loopback {
-		if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(ep)); err != nil {
-			return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, ep, err)
-		}
-	} else {
-		if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil {
-			return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, ep, err)
-		}
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error {
+	if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil {
+		return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, ep, err)
 	}
 
 	// Always start with an arp address for the NIC.
-- 
cgit v1.2.3


From 8cc1c35bbdc5c9bd6b3965311497885ce72317a8 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 12 Dec 2019 15:48:24 -0800
Subject: Write simple ACCEPT rules to the filter table.

This gets us closer to passing the iptables tests and opens up iptables
so it can be worked on by multiple people.

A few restrictions are enforced for security (i.e. we don't want to let
users write a bunch of iptables rules and then just not enforce them):

- Only the filter table is writable.
- Only ACCEPT rules with no matching criteria can be added.
---
 go.mod                                   |  31 +--
 go.sum                                   |  10 +
 pkg/abi/linux/netfilter.go               |  82 +++---
 pkg/sentry/socket/netfilter/BUILD        |   1 +
 pkg/sentry/socket/netfilter/netfilter.go | 411 ++++++++++++++++++++++++-------
 pkg/sentry/socket/netstack/netstack.go   |  23 +-
 pkg/tcpip/iptables/iptables.go           | 114 ++++++---
 pkg/tcpip/iptables/targets.go            |   8 +
 pkg/tcpip/iptables/types.go              |  55 ++---
 test/iptables/filter_input.go            |   1 +
 test/iptables/iptables_test.go           |  23 +-
 11 files changed, 550 insertions(+), 209 deletions(-)

diff --git a/go.mod b/go.mod
index 304b8bf13..4802359f8 100644
--- a/go.mod
+++ b/go.mod
@@ -3,19 +3,20 @@ module gvisor.dev/gvisor
 go 1.13
 
 require (
-  github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
-  github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
-  github.com/golang/mock v1.3.1
-  github.com/golang/protobuf v1.3.1
-  github.com/google/btree v1.0.0
-  github.com/google/go-cmp v0.2.0
-  github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
-  github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
-  github.com/kr/pty v1.1.1
-  github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
-  github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
-  github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
-  github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
-  golang.org/x/net v0.0.0-20190311183353-d8887717615a
-  golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
+	github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
+	github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
+	github.com/golang/mock v1.3.1
+	github.com/golang/protobuf v1.3.1
+	github.com/google/btree v1.0.0
+	github.com/google/go-cmp v0.2.0
+	github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
+	github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
+	github.com/kr/pty v1.1.1
+	github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
+	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
+	github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
+	github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
+	golang.org/x/net v0.0.0-20190311183353-d8887717615a
+	golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
+	golang.org/x/time v0.0.0-20191024005414-555d28b269f0
 )
diff --git a/go.sum b/go.sum
index 7a0bc175a..cf092956e 100644
--- a/go.sum
+++ b/go.sum
@@ -1,19 +1,29 @@
+github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422 h1:+FKjzBIdfBHYDvxCv+djmDJdes/AoDtg8gpcxowBlF8=
 github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422/go.mod h1:b6Nc7NRH5C4aCISLry0tLnTjcuTEvoiqcWDdsU0sOGM=
 github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079/go.mod h1:F1TvTiK9OcQqauNUHlbJvyl9Qa1QvF/gOUDKA14jxHU=
 github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y=
+github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
 github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo=
 github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
+github.com/google/go-cmp v0.2.0 h1:+dTQ8DZQJz0Mb/HjFlkptS1FeQ4cWSnN941F8aEG4SQ=
 github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
 github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk=
 github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/kr/pty v1.1.1 h1:VkoXIwSboBpnk99O/KFauAEILuNHv5DVFKZMBN/gUgw=
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78 h1:d9F+LNYwMyi3BDN4GzZdaSiq4otb8duVEWyZjeUtOQI=
 github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
+github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2 h1:b6uOv7YOFK0TYG7HtkIgExQo+2RdLuwRft63jn2HWj8=
 github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
 github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e/go.mod h1:+SR5DhBJrl6ZM7CoCKvpw5BKroDKQ+PJqOg65H/2ktk=
 github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936/go.mod h1:ZjcWmFBXmLKZu9Nxj3WKYEafiSqer2rnvPr0en9UNpI=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/time v0.0.0-20191024005414-555d28b269f0 h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs=
+golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index 269ba5567..0bcb232de 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -42,6 +42,13 @@ const (
 	NF_RETURN = -NF_REPEAT - 1
 )
 
+var VerdictStrings = map[int32]string{
+	-NF_DROP - 1:   "DROP",
+	-NF_ACCEPT - 1: "ACCEPT",
+	-NF_QUEUE - 1:  "QUEUE",
+	NF_RETURN:      "RETURN",
+}
+
 // Socket options. These correspond to values in
 // include/uapi/linux/netfilter_ipv4/ip_tables.h.
 const (
@@ -179,7 +186,7 @@ const SizeOfXTCounters = 16
 // the user data.
 type XTEntryMatch struct {
 	MatchSize uint16
-	Name      [XT_EXTENSION_MAXNAMELEN]byte
+	Name      ExtensionName
 	Revision  uint8
 	// Data is omitted here because it would cause XTEntryMatch to be an
 	// extra byte larger (see http://www.catb.org/esr/structure-packing/).
@@ -199,7 +206,7 @@ const SizeOfXTEntryMatch = 32
 // the user data.
 type XTEntryTarget struct {
 	TargetSize uint16
-	Name       [XT_EXTENSION_MAXNAMELEN]byte
+	Name       ExtensionName
 	Revision   uint8
 	// Data is omitted here because it would cause XTEntryTarget to be an
 	// extra byte larger (see http://www.catb.org/esr/structure-packing/).
@@ -226,9 +233,9 @@ const SizeOfXTStandardTarget = 40
 // ErrorName. It corresponds to struct xt_error_target in
 // include/uapi/linux/netfilter/x_tables.h.
 type XTErrorTarget struct {
-	Target    XTEntryTarget
-	ErrorName [XT_FUNCTION_MAXNAMELEN]byte
-	_         [2]byte
+	Target XTEntryTarget
+	Name   ErrorName
+	_      [2]byte
 }
 
 // SizeOfXTErrorTarget is the size of an XTErrorTarget.
@@ -237,7 +244,7 @@ const SizeOfXTErrorTarget = 64
 // IPTGetinfo is the argument for the IPT_SO_GET_INFO sockopt. It corresponds
 // to struct ipt_getinfo in include/uapi/linux/netfilter_ipv4/ip_tables.h.
 type IPTGetinfo struct {
-	Name       [XT_TABLE_MAXNAMELEN]byte
+	Name       TableName
 	ValidHooks uint32
 	HookEntry  [NF_INET_NUMHOOKS]uint32
 	Underflow  [NF_INET_NUMHOOKS]uint32
@@ -248,16 +255,11 @@ type IPTGetinfo struct {
 // SizeOfIPTGetinfo is the size of an IPTGetinfo.
 const SizeOfIPTGetinfo = 84
 
-// TableName returns the table name.
-func (info *IPTGetinfo) TableName() string {
-	return tableName(info.Name[:])
-}
-
 // IPTGetEntries is the argument for the IPT_SO_GET_ENTRIES sockopt. It
 // corresponds to struct ipt_get_entries in
 // include/uapi/linux/netfilter_ipv4/ip_tables.h.
 type IPTGetEntries struct {
-	Name [XT_TABLE_MAXNAMELEN]byte
+	Name TableName
 	Size uint32
 	_    [4]byte
 	// Entrytable is omitted here because it would cause IPTGetEntries to
@@ -266,34 +268,22 @@ type IPTGetEntries struct {
 	// Entrytable [0]IPTEntry
 }
 
-// TableName returns the entries' table name.
-func (entries *IPTGetEntries) TableName() string {
-	return tableName(entries.Name[:])
-}
-
 // SizeOfIPTGetEntries is the size of an IPTGetEntries.
 const SizeOfIPTGetEntries = 40
 
-// KernelIPTGetEntries is identical to IPTEntry, but includes the Elems field.
-// This struct marshaled via the binary package to write an KernelIPTGetEntries
-// to userspace.
+// KernelIPTGetEntries is identical to IPTGetEntries, but includes the
+// Entrytable field. This struct marshaled via the binary package to write an
+// KernelIPTGetEntries to userspace.
 type KernelIPTGetEntries struct {
-	Name       [XT_TABLE_MAXNAMELEN]byte
-	Size       uint32
-	_          [4]byte
+	IPTGetEntries
 	Entrytable []KernelIPTEntry
 }
 
-// TableName returns the entries' table name.
-func (entries *KernelIPTGetEntries) TableName() string {
-	return tableName(entries.Name[:])
-}
-
 // IPTReplace is the argument for the IPT_SO_SET_REPLACE sockopt. It
 // corresponds to struct ipt_replace in
 // include/uapi/linux/netfilter_ipv4/ip_tables.h.
 type IPTReplace struct {
-	Name        [XT_TABLE_MAXNAMELEN]byte
+	Name        TableName
 	ValidHooks  uint32
 	NumEntries  uint32
 	Size        uint32
@@ -306,14 +296,40 @@ type IPTReplace struct {
 	// Entries [0]IPTEntry
 }
 
+type KernelIPTReplace struct {
+	IPTReplace
+	Entries [0]IPTEntry
+}
+
 // SizeOfIPTReplace is the size of an IPTReplace.
 const SizeOfIPTReplace = 96
 
-func tableName(name []byte) string {
-	for i, c := range name {
+type ExtensionName [XT_EXTENSION_MAXNAMELEN]byte
+
+// String implements fmt.Stringer.
+func (en ExtensionName) String() string {
+	return name(en[:])
+}
+
+type TableName [XT_TABLE_MAXNAMELEN]byte
+
+// String implements fmt.Stringer.
+func (tn TableName) String() string {
+	return name(tn[:])
+}
+
+type ErrorName [XT_FUNCTION_MAXNAMELEN]byte
+
+// String implements fmt.Stringer.
+func (fn ErrorName) String() string {
+	return name(fn[:])
+}
+
+func name(cstring []byte) string {
+	for i, c := range cstring {
 		if c == 0 {
-			return string(name[:i])
+			return string(cstring[:i])
 		}
 	}
-	return string(name)
+	return string(cstring)
 }
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index 5eb06bbf4..b70047d81 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -14,6 +14,7 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
+        "//pkg/log",
         "//pkg/sentry/kernel",
         "//pkg/sentry/usermem",
         "//pkg/syserr",
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 9f87c32f1..8c7f3c7fc 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -21,6 +21,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
@@ -35,6 +36,7 @@ const errorTargetName = "ERROR"
 
 // metadata is opaque to netstack. It holds data that we need to translate
 // between Linux's and netstack's iptables representations.
+// TODO(gvisor.dev/issue/170): This might be removable.
 type metadata struct {
 	HookEntry  [linux.NF_INET_NUMHOOKS]uint32
 	Underflow  [linux.NF_INET_NUMHOOKS]uint32
@@ -51,7 +53,7 @@ func GetInfo(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr) (linux.IPTG
 	}
 
 	// Find the appropriate table.
-	table, err := findTable(ep, info.TableName())
+	table, err := findTable(ep, info.Name.String())
 	if err != nil {
 		return linux.IPTGetinfo{}, err
 	}
@@ -82,18 +84,19 @@ func GetEntries(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr, outLen i
 	}
 
 	// Find the appropriate table.
-	table, err := findTable(ep, userEntries.TableName())
+	table, err := findTable(ep, userEntries.Name.String())
 	if err != nil {
 		return linux.KernelIPTGetEntries{}, err
 	}
 
 	// Convert netstack's iptables rules to something that the iptables
 	// tool can understand.
-	entries, _, err := convertNetstackToBinary(userEntries.TableName(), table)
+	entries, _, err := convertNetstackToBinary(userEntries.Name.String(), table)
 	if err != nil {
 		return linux.KernelIPTGetEntries{}, err
 	}
 	if binary.Size(entries) > uintptr(outLen) {
+		log.Infof("Insufficient GetEntries output size: %d", uintptr(outLen))
 		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
 	}
 
@@ -142,103 +145,63 @@ func convertNetstackToBinary(name string, table iptables.Table) (linux.KernelIPT
 
 	// The table name has to fit in the struct.
 	if linux.XT_TABLE_MAXNAMELEN < len(name) {
+		log.Infof("Table name too long.")
 		return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
 	}
 	copy(entries.Name[:], name)
 
-	// Deal with the built in chains first (INPUT, OUTPUT, etc.). Each of
-	// these chains ends with an unconditional policy entry.
-	for hook := iptables.Prerouting; hook < iptables.NumHooks; hook++ {
-		chain, ok := table.BuiltinChains[hook]
-		if !ok {
-			// This table doesn't support this hook.
-			continue
-		}
-
-		// Sanity check.
-		if len(chain.Rules) < 1 {
-			return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
-		}
-
-		for ruleIdx, rule := range chain.Rules {
-			// If this is the first rule of a builtin chain, set
-			// the metadata hook entry point.
-			if ruleIdx == 0 {
+	for ruleIdx, rule := range table.Rules {
+		// Is this a chain entry point?
+		for hook, hookRuleIdx := range table.BuiltinChains {
+			if hookRuleIdx == ruleIdx {
 				meta.HookEntry[hook] = entries.Size
 			}
-
-			// Each rule corresponds to an entry.
-			entry := linux.KernelIPTEntry{
-				IPTEntry: linux.IPTEntry{
-					NextOffset:   linux.SizeOfIPTEntry,
-					TargetOffset: linux.SizeOfIPTEntry,
-				},
+		}
+		// Is this a chain underflow point? The underflow rule is the last rule
+		// in the chain, and is an unconditional rule (i.e. it matches any
+		// packet). This is enforced when saving iptables.
+		for underflow, underflowRuleIdx := range table.Underflows {
+			if underflowRuleIdx == ruleIdx {
+				meta.Underflow[underflow] = entries.Size
 			}
+		}
 
-			for _, matcher := range rule.Matchers {
-				// Serialize the matcher and add it to the
-				// entry.
-				serialized := marshalMatcher(matcher)
-				entry.Elems = append(entry.Elems, serialized...)
-				entry.NextOffset += uint16(len(serialized))
-				entry.TargetOffset += uint16(len(serialized))
-			}
+		// Each rule corresponds to an entry.
+		entry := linux.KernelIPTEntry{
+			IPTEntry: linux.IPTEntry{
+				NextOffset:   linux.SizeOfIPTEntry,
+				TargetOffset: linux.SizeOfIPTEntry,
+			},
+		}
 
-			// Serialize and append the target.
-			serialized := marshalTarget(rule.Target)
+		for _, matcher := range rule.Matchers {
+			// Serialize the matcher and add it to the
+			// entry.
+			serialized := marshalMatcher(matcher)
 			entry.Elems = append(entry.Elems, serialized...)
 			entry.NextOffset += uint16(len(serialized))
-
-			// The underflow rule is the last rule in the chain,
-			// and is an unconditional rule (i.e. it matches any
-			// packet). This is enforced when saving iptables.
-			if ruleIdx == len(chain.Rules)-1 {
-				meta.Underflow[hook] = entries.Size
-			}
-
-			entries.Size += uint32(entry.NextOffset)
-			entries.Entrytable = append(entries.Entrytable, entry)
-			meta.NumEntries++
+			entry.TargetOffset += uint16(len(serialized))
 		}
 
-	}
+		// Serialize and append the target.
+		serialized := marshalTarget(rule.Target)
+		entry.Elems = append(entry.Elems, serialized...)
+		entry.NextOffset += uint16(len(serialized))
 
-	// TODO(gvisor.dev/issue/170): Deal with the user chains here. Each of
-	// these starts with an error node holding the chain's name and ends
-	// with an unconditional return.
-
-	// Lastly, each table ends with an unconditional error target rule as
-	// its final entry.
-	errorEntry := linux.KernelIPTEntry{
-		IPTEntry: linux.IPTEntry{
-			NextOffset:   linux.SizeOfIPTEntry,
-			TargetOffset: linux.SizeOfIPTEntry,
-		},
+		entries.Size += uint32(entry.NextOffset)
+		entries.Entrytable = append(entries.Entrytable, entry)
+		meta.NumEntries++
 	}
-	var errorTarget linux.XTErrorTarget
-	errorTarget.Target.TargetSize = linux.SizeOfXTErrorTarget
-	copy(errorTarget.ErrorName[:], errorTargetName)
-	copy(errorTarget.Target.Name[:], errorTargetName)
-
-	// Serialize and add it to the list of entries.
-	errorTargetBuf := make([]byte, 0, linux.SizeOfXTErrorTarget)
-	serializedErrorTarget := binary.Marshal(errorTargetBuf, usermem.ByteOrder, errorTarget)
-	errorEntry.Elems = append(errorEntry.Elems, serializedErrorTarget...)
-	errorEntry.NextOffset += uint16(len(serializedErrorTarget))
-
-	entries.Size += uint32(errorEntry.NextOffset)
-	entries.Entrytable = append(entries.Entrytable, errorEntry)
-	meta.NumEntries++
-	meta.Size = entries.Size
 
+	meta.Size = entries.Size
 	return entries, meta, nil
 }
 
 func marshalMatcher(matcher iptables.Matcher) []byte {
 	switch matcher.(type) {
 	default:
-		// TODO(gvisor.dev/issue/170): We don't support any matchers yet, so
-		// any call to marshalMatcher will panic.
+		// TODO(gvisor.dev/issue/170): We don't support any matchers
+		// yet, so any call to marshalMatcher will panic.
 		panic(fmt.Errorf("unknown matcher of type %T", matcher))
 	}
 }
@@ -246,28 +209,46 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
 func marshalTarget(target iptables.Target) []byte {
 	switch target.(type) {
 	case iptables.UnconditionalAcceptTarget:
-		return marshalUnconditionalAcceptTarget()
+		return marshalStandardTarget(iptables.Accept)
+	case iptables.UnconditionalDropTarget:
+		return marshalStandardTarget(iptables.Drop)
+	case iptables.PanicTarget:
+		return marshalPanicTarget()
 	default:
 		panic(fmt.Errorf("unknown target of type %T", target))
 	}
 }
 
-func marshalUnconditionalAcceptTarget() []byte {
+func marshalStandardTarget(verdict iptables.Verdict) []byte {
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
 		Target: linux.XTEntryTarget{
 			TargetSize: linux.SizeOfXTStandardTarget,
 		},
-		Verdict: translateStandardVerdict(iptables.Accept),
+		Verdict: translateFromStandardVerdict(verdict),
 	}
 
 	ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
 	return binary.Marshal(ret, usermem.ByteOrder, target)
 }
 
-// translateStandardVerdict translates verdicts the same way as the iptables
+func marshalPanicTarget() []byte {
+	// This is an error target named error
+	target := linux.XTErrorTarget{
+		Target: linux.XTEntryTarget{
+			TargetSize: linux.SizeOfXTErrorTarget,
+		},
+	}
+	copy(target.Name[:], errorTargetName)
+	copy(target.Target.Name[:], errorTargetName)
+
+	ret := make([]byte, 0, linux.SizeOfXTErrorTarget)
+	return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
+// translateFromStandardVerdict translates verdicts the same way as the iptables
 // tool.
-func translateStandardVerdict(verdict iptables.Verdict) int32 {
+func translateFromStandardVerdict(verdict iptables.Verdict) int32 {
 	switch verdict {
 	case iptables.Accept:
 		return -linux.NF_ACCEPT - 1
@@ -280,7 +261,269 @@ func translateStandardVerdict(verdict iptables.Verdict) int32 {
 	case iptables.Jump:
 		// TODO(gvisor.dev/issue/170): Support Jump.
 		panic("Jump isn't supported yet")
+	}
+	panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
+}
+
+// translateToStandardVerdict translates from the value in a
+// linux.XTStandardTarget to an iptables.Verdict.
+func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
+	// TODO(gvisor.dev/issue/170): Support other verdicts.
+	switch val {
+	case -linux.NF_ACCEPT - 1:
+		return iptables.Accept, nil
+	case -linux.NF_DROP - 1:
+		return iptables.Drop, nil
+	case -linux.NF_QUEUE - 1:
+		log.Infof("Unsupported iptables verdict QUEUE.")
+	case linux.NF_RETURN:
+		log.Infof("Unsupported iptables verdict RETURN.")
+	}
+	log.Infof("Unknown iptables verdict %d.", val)
+	return iptables.Invalid, syserr.ErrInvalidArgument
+}
+
+// SetEntries sets iptables rules for a single table. See
+// net/ipv4/netfilter/ip_tables.c:translate_table for reference.
+func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
+	printReplace(optVal)
+
+	// Get the basic rules data (struct ipt_replace).
+	if len(optVal) < linux.SizeOfIPTReplace {
+		return syserr.ErrInvalidArgument
+	}
+	var replace linux.IPTReplace
+	replaceBuf := optVal[:linux.SizeOfIPTReplace]
+	optVal = optVal[linux.SizeOfIPTReplace:]
+	binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace)
+
+	// TODO(gvisor.dev/issue/170): Support other tables.
+	var table iptables.Table
+	switch replace.Name.String() {
+	case iptables.TablenameFilter:
+		table = iptables.EmptyFilterTable()
 	default:
-		panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
+		log.Infof(fmt.Sprintf("We don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()))
+		return syserr.ErrInvalidArgument
+	}
+
+	// Convert input into a list of rules and their offsets.
+	var offset uint32
+	var offsets []uint32
+	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+		// Get the struct ipt_entry.
+		if len(optVal) < linux.SizeOfIPTEntry {
+			return syserr.ErrInvalidArgument
+		}
+		var entry linux.IPTEntry
+		buf := optVal[:linux.SizeOfIPTEntry]
+		optVal = optVal[linux.SizeOfIPTEntry:]
+		binary.Unmarshal(buf, usermem.ByteOrder, &entry)
+		if entry.TargetOffset != linux.SizeOfIPTEntry {
+			// TODO(gvisor.dev/issue/170): Support matchers.
+			return syserr.ErrInvalidArgument
+		}
+
+		// TODO(gvisor.dev/issue/170): We should support IPTIP
+		// filtering. We reject any nonzero IPTIP values for now.
+		emptyIPTIP := linux.IPTIP{}
+		if entry.IP != emptyIPTIP {
+			return syserr.ErrInvalidArgument
+		}
+
+		// Get the target of the rule.
+		target, consumed, err := parseTarget(optVal)
+		if err != nil {
+			return err
+		}
+		optVal = optVal[consumed:]
+
+		table.Rules = append(table.Rules, iptables.Rule{Target: target})
+		offsets = append(offsets, offset)
+		offset += linux.SizeOfIPTEntry + consumed
+	}
+
+	// Go through the list of supported hooks for this table and, for each
+	// one, set the rule it corresponds to.
+	for hook, _ := range replace.HookEntry {
+		if table.ValidHooks()&uint32(hook) != 0 {
+			hk := hookFromLinux(hook)
+			for ruleIdx, offset := range offsets {
+				if offset == replace.HookEntry[hook] {
+					table.BuiltinChains[hk] = ruleIdx
+				}
+				if offset == replace.Underflow[hook] {
+					table.Underflows[hk] = ruleIdx
+				}
+			}
+			if ruleIdx := table.BuiltinChains[hk]; ruleIdx == iptables.HookUnset {
+				log.Infof("Hook %v is unset.", hk)
+				return syserr.ErrInvalidArgument
+			}
+			if ruleIdx := table.Underflows[hk]; ruleIdx == iptables.HookUnset {
+				log.Infof("Underflow %v is unset.", hk)
+				return syserr.ErrInvalidArgument
+			}
+		}
+	}
+
+	ipt := stack.IPTables()
+	table.SetMetadata(metadata{
+		HookEntry:  replace.HookEntry,
+		Underflow:  replace.Underflow,
+		NumEntries: replace.NumEntries,
+		Size:       replace.Size,
+	})
+	ipt.Tables[replace.Name.String()] = table
+	// TODO: Do we need to worry about locking? We could write rules while
+	// packets traverse tables.
+	stack.SetIPTables(ipt)
+
+	return nil
+}
+
+// parseTarget parses a target from the start of optVal and returns the target
+// along with the number of bytes it occupies in optVal.
+func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
+	if len(optVal) < linux.SizeOfXTEntryTarget {
+		return nil, 0, syserr.ErrInvalidArgument
+	}
+	var target linux.XTEntryTarget
+	buf := optVal[:linux.SizeOfXTEntryTarget]
+	binary.Unmarshal(buf, usermem.ByteOrder, &target)
+	switch target.Name.String() {
+	case "":
+		// Standard target.
+		if len(optVal) < linux.SizeOfXTStandardTarget {
+			return nil, 0, syserr.ErrInvalidArgument
+		}
+		var target linux.XTStandardTarget
+		buf = optVal[:linux.SizeOfXTStandardTarget]
+		binary.Unmarshal(buf, usermem.ByteOrder, &target)
+
+		verdict, err := translateToStandardVerdict(target.Verdict)
+		if err != nil {
+			return nil, 0, err
+		}
+		switch verdict {
+		case iptables.Accept:
+			return iptables.UnconditionalAcceptTarget{}, linux.SizeOfXTStandardTarget, nil
+		case iptables.Drop:
+			// TODO(gvisor.dev/issue/170): Return an
+			// iptables.UnconditionalDropTarget to support DROP.
+			log.Infof("netfilter DROP is not supported yet.")
+			return nil, 0, syserr.ErrInvalidArgument
+		default:
+			panic(fmt.Sprintf("Unknown verdict: %v", verdict))
+		}
+
+	case errorTargetName:
+		// Error target.
+		if len(optVal) < linux.SizeOfXTErrorTarget {
+			return nil, 0, syserr.ErrInvalidArgument
+		}
+		var target linux.XTErrorTarget
+		buf = optVal[:linux.SizeOfXTErrorTarget]
+		binary.Unmarshal(buf, usermem.ByteOrder, &target)
+
+		// Error targets are used in 2 cases:
+		// * An actual error case. These rules have an error
+		//   named errorTargetName. The last entry of the table
+		//   is usually an error case to catch any packets that
+		//   somehow fall through every rule.
+		// * To mark the start of a user defined chain. These
+		//   rules have an error with the name of the chain.
+		switch target.Name.String() {
+		case errorTargetName:
+			return iptables.PanicTarget{}, linux.SizeOfXTErrorTarget, nil
+		default:
+			log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", target.Name.String())
+			return nil, 0, syserr.ErrInvalidArgument
+		}
+	}
+
+	// Unknown target.
+	log.Infof("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
+	return nil, 0, syserr.ErrInvalidArgument
+}
+
+func chainNameFromHook(hook int) string {
+	switch hook {
+	case linux.NF_INET_PRE_ROUTING:
+		return iptables.ChainNamePrerouting
+	case linux.NF_INET_LOCAL_IN:
+		return iptables.ChainNameInput
+	case linux.NF_INET_FORWARD:
+		return iptables.ChainNameForward
+	case linux.NF_INET_LOCAL_OUT:
+		return iptables.ChainNameOutput
+	case linux.NF_INET_POST_ROUTING:
+		return iptables.ChainNamePostrouting
+	}
+	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain"))
+}
+
+func hookFromLinux(hook int) iptables.Hook {
+	switch hook {
+	case linux.NF_INET_PRE_ROUTING:
+		return iptables.Prerouting
+	case linux.NF_INET_LOCAL_IN:
+		return iptables.Input
+	case linux.NF_INET_FORWARD:
+		return iptables.Forward
+	case linux.NF_INET_LOCAL_OUT:
+		return iptables.Output
+	case linux.NF_INET_POST_ROUTING:
+		return iptables.Postrouting
+	}
+	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain"))
+}
+
+// printReplace prints information about the struct ipt_replace in optVal. It
+// is only for debugging.
+func printReplace(optVal []byte) {
+	// Basic replace info.
+	var replace linux.IPTReplace
+	replaceBuf := optVal[:linux.SizeOfIPTReplace]
+	optVal = optVal[linux.SizeOfIPTReplace:]
+	binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace)
+	log.Infof("kevin: Replacing table %q: %+v", replace.Name.String(), replace)
+
+	// Read in the list of entries at the end of replace.
+	var totalOffset uint16
+	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+		var entry linux.IPTEntry
+		entryBuf := optVal[:linux.SizeOfIPTEntry]
+		binary.Unmarshal(entryBuf, usermem.ByteOrder, &entry)
+		log.Infof("kevin: Entry %d (total offset %d): %+v", entryIdx, totalOffset, entry)
+
+		totalOffset += entry.NextOffset
+		if entry.TargetOffset == linux.SizeOfIPTEntry {
+			log.Infof("kevin: Entry has no matches.")
+		} else {
+			log.Infof("kevin: Entry has matches.")
+		}
+
+		var target linux.XTEntryTarget
+		targetBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTEntryTarget]
+		binary.Unmarshal(targetBuf, usermem.ByteOrder, &target)
+		log.Infof("kevin: Target named %q: %+v", target.Name.String(), target)
+
+		switch target.Name.String() {
+		case "":
+			var standardTarget linux.XTStandardTarget
+			stBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTStandardTarget]
+			binary.Unmarshal(stBuf, usermem.ByteOrder, &standardTarget)
+			log.Infof("kevin: Standard target with verdict %q (%d).", linux.VerdictStrings[standardTarget.Verdict], standardTarget.Verdict)
+		case errorTargetName:
+			var errorTarget linux.XTErrorTarget
+			etBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTErrorTarget]
+			binary.Unmarshal(etBuf, usermem.ByteOrder, &errorTarget)
+			log.Infof("kevin: Error target with name %q.", errorTarget.Name.String())
+		default:
+			log.Infof("kevin: Unknown target type.")
+		}
+
+		optVal = optVal[entry.NextOffset:]
 	}
 }
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 140851c17..f7caa45b4 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -326,7 +326,7 @@ func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress,
 	}
 
 	family := usermem.ByteOrder.Uint16(addr)
-	if family != uint16(sfamily) && (strict || family != linux.AF_UNSPEC) {
+	if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) {
 		return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported
 	}
 
@@ -1356,6 +1356,27 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
 		return nil
 	}
 
+	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
+		if name == linux.IPT_SO_SET_REPLACE {
+			if len(optVal) < linux.SizeOfIPTReplace {
+				return syserr.ErrInvalidArgument
+			}
+
+			stack := inet.StackFromContext(t)
+			if stack == nil {
+				return syserr.ErrNoDevice
+			}
+			// Stack must be a netstack stack.
+			if err := netfilter.SetEntries(stack.(*Stack).Stack, optVal); err != nil {
+				return err
+			}
+			return nil
+		} else if name == linux.IPT_SO_SET_ADD_COUNTERS {
+			// TODO(gvisor.dev/issue/170): Counter support.
+			return nil
+		}
+	}
+
 	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
 }
 
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 68c68d4aa..9e7005374 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -17,65 +17,107 @@
 package iptables
 
 const (
-	tablenameNat    = "nat"
-	tablenameMangle = "mangle"
+	TablenameNat    = "nat"
+	TablenameMangle = "mangle"
+	TablenameFilter = "filter"
 )
 
+// TODO: Make this an iota? Faster! Do it.
 // Chain names as defined by net/ipv4/netfilter/ip_tables.c.
 const (
-	chainNamePrerouting  = "PREROUTING"
-	chainNameInput       = "INPUT"
-	chainNameForward     = "FORWARD"
-	chainNameOutput      = "OUTPUT"
-	chainNamePostrouting = "POSTROUTING"
+	ChainNamePrerouting  = "PREROUTING"
+	ChainNameInput       = "INPUT"
+	ChainNameForward     = "FORWARD"
+	ChainNameOutput      = "OUTPUT"
+	ChainNamePostrouting = "POSTROUTING"
 )
 
+const HookUnset = -1
+
 // DefaultTables returns a default set of tables. Each chain is set to accept
 // all packets.
 func DefaultTables() IPTables {
 	return IPTables{
 		Tables: map[string]Table{
-			tablenameNat: Table{
-				BuiltinChains: map[Hook]Chain{
-					Prerouting:  unconditionalAcceptChain(chainNamePrerouting),
-					Input:       unconditionalAcceptChain(chainNameInput),
-					Output:      unconditionalAcceptChain(chainNameOutput),
-					Postrouting: unconditionalAcceptChain(chainNamePostrouting),
+			TablenameNat: Table{
+				Rules: []Rule{
+					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: PanicTarget{}},
+				},
+				BuiltinChains: map[Hook]int{
+					Prerouting:  0,
+					Input:       1,
+					Output:      2,
+					Postrouting: 3,
 				},
-				DefaultTargets: map[Hook]Target{
-					Prerouting:  UnconditionalAcceptTarget{},
-					Input:       UnconditionalAcceptTarget{},
-					Output:      UnconditionalAcceptTarget{},
-					Postrouting: UnconditionalAcceptTarget{},
+				Underflows: map[Hook]int{
+					Prerouting:  0,
+					Input:       1,
+					Output:      2,
+					Postrouting: 3,
 				},
-				UserChains: map[string]Chain{},
+				UserChains: map[string]int{},
 			},
-			tablenameMangle: Table{
-				BuiltinChains: map[Hook]Chain{
-					Prerouting: unconditionalAcceptChain(chainNamePrerouting),
-					Output:     unconditionalAcceptChain(chainNameOutput),
+			TablenameMangle: Table{
+				Rules: []Rule{
+					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: PanicTarget{}},
+				},
+				BuiltinChains: map[Hook]int{
+					Prerouting: 0,
+					Output:     1,
 				},
-				DefaultTargets: map[Hook]Target{
-					Prerouting: UnconditionalAcceptTarget{},
-					Output:     UnconditionalAcceptTarget{},
+				Underflows: map[Hook]int{
+					Prerouting: 0,
+					Output:     1,
 				},
-				UserChains: map[string]Chain{},
+				UserChains: map[string]int{},
+			},
+			TablenameFilter: Table{
+				Rules: []Rule{
+					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: PanicTarget{}},
+				},
+				BuiltinChains: map[Hook]int{
+					Input:   0,
+					Forward: 1,
+					Output:  2,
+				},
+				Underflows: map[Hook]int{
+					Input:   0,
+					Forward: 1,
+					Output:  2,
+				},
+				UserChains: map[string]int{},
 			},
 		},
 		Priorities: map[Hook][]string{
-			Prerouting: []string{tablenameMangle, tablenameNat},
-			Output:     []string{tablenameMangle, tablenameNat},
+			Input:      []string{TablenameNat, TablenameFilter},
+			Prerouting: []string{TablenameMangle, TablenameNat},
+			Output:     []string{TablenameMangle, TablenameNat, TablenameFilter},
 		},
 	}
 }
 
-func unconditionalAcceptChain(name string) Chain {
-	return Chain{
-		Name: name,
-		Rules: []Rule{
-			Rule{
-				Target: UnconditionalAcceptTarget{},
-			},
+func EmptyFilterTable() Table {
+	return Table{
+		Rules: []Rule{},
+		BuiltinChains: map[Hook]int{
+			Input:   HookUnset,
+			Forward: HookUnset,
+			Output:  HookUnset,
+		},
+		Underflows: map[Hook]int{
+			Input:   HookUnset,
+			Forward: HookUnset,
+			Output:  HookUnset,
 		},
+		UserChains: map[string]int{},
 	}
 }
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 19a7f77e3..03c9f19ff 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -33,3 +33,11 @@ type UnconditionalDropTarget struct{}
 func (UnconditionalDropTarget) Action(packet buffer.VectorisedView) (Verdict, string) {
 	return Drop, ""
 }
+
+// PanicTarget just panics.
+type PanicTarget struct{}
+
+// Actions implements Target.Action.
+func (PanicTarget) Action(packet buffer.VectorisedView) (Verdict, string) {
+	panic("PanicTarget triggered.")
+}
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 42a79ef9f..76364ff1f 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -61,9 +61,12 @@ const (
 type Verdict int
 
 const (
+	// Invalid indicates an unkonwn or erroneous verdict.
+	Invalid Verdict = iota
+
 	// Accept indicates the packet should continue traversing netstack as
 	// normal.
-	Accept Verdict = iota
+	Accept
 
 	// Drop inicates the packet should be dropped, stopping traversing
 	// netstack.
@@ -109,24 +112,18 @@ type IPTables struct {
 //   * nat
 //   * mangle
 type Table struct {
-	// BuiltinChains holds the un-deletable chains built into netstack. If
-	// a hook isn't present in the map, this table doesn't utilize that
-	// hook.
-	BuiltinChains map[Hook]Chain
+	// A table is just a list of rules with some entrypoints.
+	Rules []Rule
+
+	BuiltinChains map[Hook]int
+
+	Underflows map[Hook]int
 
-	// DefaultTargets holds a target for each hook that will be executed if
-	// chain traversal doesn't yield a verdict.
-	DefaultTargets map[Hook]Target
+	// DefaultTargets map[Hook]int
 
 	// UserChains holds user-defined chains for the keyed by name. Users
 	// can give their chains arbitrary names.
-	UserChains map[string]Chain
-
-	// Chains maps names to chains for both builtin and user-defined chains.
-	// Its entries point to Chains already either in BuiltinChains or
-	// UserChains, and its purpose is to make looking up tables by name
-	// fast.
-	Chains map[string]*Chain
+	UserChains map[string]int
 
 	// Metadata holds information about the Table that is useful to users
 	// of IPTables, but not to the netstack IPTables code itself.
@@ -152,20 +149,20 @@ func (table *Table) SetMetadata(metadata interface{}) {
 	table.metadata = metadata
 }
 
-// A Chain defines a list of rules for packet processing. When a packet
-// traverses a chain, it is checked against each rule until either a rule
-// returns a verdict or the chain ends.
-//
-// By convention, builtin chains end with a rule that matches everything and
-// returns either Accept or Drop. User-defined chains end with Return. These
-// aren't strictly necessary here, but the iptables tool writes tables this way.
-type Chain struct {
-	// Name is the chain name.
-	Name string
-
-	// Rules is the list of rules to traverse.
-	Rules []Rule
-}
+//// A Chain defines a list of rules for packet processing. When a packet
+//// traverses a chain, it is checked against each rule until either a rule
+//// returns a verdict or the chain ends.
+////
+//// By convention, builtin chains end with a rule that matches everything and
+//// returns either Accept or Drop. User-defined chains end with Return. These
+//// aren't strictly necessary here, but the iptables tool writes tables this way.
+//type Chain struct {
+//	// Name is the chain name.
+//	Name string
+
+//	// Rules is the list of rules to traverse.
+//	Rules []Rule
+//}
 
 // A Rule is a packet processing rule. It consists of two pieces. First it
 // contains zero or more matchers, each of which is a specification of which
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 923f44e68..0cb668635 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -44,6 +44,7 @@ func (FilterInputDropUDP) Name() string {
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputDropUDP) ContainerAction(ip net.IP) error {
 	if err := filterTable("-A", "INPUT", "-p", "udp", "-j", "DROP"); err != nil {
+		// if err := filterTable("-A", "INPUT", "-j", "ACCEPT"); err != nil {
 		return err
 	}
 
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index bfbf1bb87..e761e0f2f 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -23,6 +23,7 @@ import (
 	"time"
 
 	"flag"
+
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/dockerutil"
 	"gvisor.dev/gvisor/runsc/testutil"
@@ -166,14 +167,14 @@ func TestFilterInputDropUDP(t *testing.T) {
 	}
 }
 
-func TestFilterInputDropUDPPort(t *testing.T) {
-	if err := singleTest(FilterInputDropUDPPort{}); err != nil {
-		t.Fatal(err)
-	}
-}
-
-func TestFilterInputDropDifferentUDPPort(t *testing.T) {
-	if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil {
-		t.Fatal(err)
-	}
-}
+// func TestFilterInputDropUDPPort(t *testing.T) {
+// 	if err := singleTest(FilterInputDropUDPPort{}); err != nil {
+// 		t.Fatal(err)
+// 	}
+// }
+
+// func TestFilterInputDropDifferentUDPPort(t *testing.T) {
+// 	if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil {
+// 		t.Fatal(err)
+// 	}
+// }
-- 
cgit v1.2.3


From 9df018767cdfee5d837746b6dce6dafd9b9fcfce Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Wed, 8 Jan 2020 10:10:57 -0800
Subject: Remove redundant function argument

PacketLooping is already a member on the passed Route.

PiperOrigin-RevId: 288721500
---
 pkg/tcpip/network/arp/arp.go    |  6 +++---
 pkg/tcpip/network/ip_test.go    |  4 ++--
 pkg/tcpip/network/ipv4/ipv4.go  | 18 +++++++++---------
 pkg/tcpip/network/ipv6/ipv6.go  | 14 +++++++-------
 pkg/tcpip/stack/registration.go |  6 +++---
 pkg/tcpip/stack/route.go        |  6 +++---
 pkg/tcpip/stack/stack_test.go   | 10 +++++-----
 7 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index da8482509..42cacb8a6 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -79,16 +79,16 @@ func (e *endpoint) MaxHeaderLength() uint16 {
 
 func (e *endpoint) Close() {}
 
-func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, stack.PacketLooping, tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, tcpip.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, stack.NetworkHeaderParams, stack.PacketLooping) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	return 0, tcpip.ErrNotSupported
 }
 
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index 4144a7837..f1bc33adf 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -239,7 +239,7 @@ func TestIPv4Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketOut, tcpip.PacketBuffer{
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	}); err != nil {
@@ -480,7 +480,7 @@ func TestIPv6Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketOut, tcpip.PacketBuffer{
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	}); err != nil {
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index e645cf62c..4ee3d5b45 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -238,11 +238,11 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
 	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
 	pkt.NetworkHeader = buffer.View(ip)
 
-	if loop&stack.PacketLoop != 0 {
+	if r.Loop&stack.PacketLoop != 0 {
 		// The inbound path expects the network header to still be in
 		// the PacketBuffer's Data field.
 		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
@@ -256,7 +256,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 
 		loopedR.Release()
 	}
-	if loop&stack.PacketOut == 0 {
+	if r.Loop&stack.PacketOut == 0 {
 		return nil
 	}
 	if pkt.Header.UsedLength()+pkt.Data.Size() > int(e.linkEP.MTU()) && (gso == nil || gso.Type == stack.GSONone) {
@@ -270,11 +270,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 }
 
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
-	if loop&stack.PacketLoop != 0 {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+	if r.Loop&stack.PacketLoop != 0 {
 		panic("multiple packets in local loop")
 	}
-	if loop&stack.PacketOut == 0 {
+	if r.Loop&stack.PacketOut == 0 {
 		return len(pkts), nil
 	}
 
@@ -289,7 +289,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
 	// The packet already has an IP header, but there are a few required
 	// checks.
 	ip := header.IPv4(pkt.Data.First())
@@ -324,10 +324,10 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLo
 	ip.SetChecksum(0)
 	ip.SetChecksum(^ip.CalculateChecksum())
 
-	if loop&stack.PacketLoop != 0 {
+	if r.Loop&stack.PacketLoop != 0 {
 		e.HandlePacket(r, pkt.Clone())
 	}
-	if loop&stack.PacketOut == 0 {
+	if r.Loop&stack.PacketOut == 0 {
 		return nil
 	}
 
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index e13f1fabf..58c3c79b9 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -112,11 +112,11 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
 	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
 	pkt.NetworkHeader = buffer.View(ip)
 
-	if loop&stack.PacketLoop != 0 {
+	if r.Loop&stack.PacketLoop != 0 {
 		// The inbound path expects the network header to still be in
 		// the PacketBuffer's Data field.
 		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
@@ -130,7 +130,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 
 		loopedR.Release()
 	}
-	if loop&stack.PacketOut == 0 {
+	if r.Loop&stack.PacketOut == 0 {
 		return nil
 	}
 
@@ -139,11 +139,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
-	if loop&stack.PacketLoop != 0 {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+	if r.Loop&stack.PacketLoop != 0 {
 		panic("not implemented")
 	}
-	if loop&stack.PacketOut == 0 {
+	if r.Loop&stack.PacketOut == 0 {
 		return len(pkts), nil
 	}
 
@@ -161,7 +161,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 
 // WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
 // supported by IPv6.
-func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
 	// TODO(b/146666412): Support IPv6 header-included packets.
 	return tcpip.ErrNotSupported
 }
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 61fd46d66..2b8751d49 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -234,15 +234,15 @@ type NetworkEndpoint interface {
 	// WritePacket writes a packet to the given destination address and
 	// protocol. It sets pkt.NetworkHeader. pkt.TransportHeader must have
 	// already been set.
-	WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, loop PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error
+	WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error
 
 	// WritePackets writes packets to the given destination address and
 	// protocol. pkts must not be zero length.
-	WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams, loop PacketLooping) (int, *tcpip.Error)
+	WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error)
 
 	// WriteHeaderIncludedPacket writes a packet that includes a network
 	// header to the given destination address.
-	WriteHeaderIncludedPacket(r *Route, loop PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error
+	WriteHeaderIncludedPacket(r *Route, pkt tcpip.PacketBuffer) *tcpip.Error
 
 	// ID returns the network protocol endpoint ID.
 	ID() *NetworkEndpointID
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 34307ae07..517f4b941 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -158,7 +158,7 @@ func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt tcpip.Pack
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	err := r.ref.ep.WritePacket(r, gso, params, r.Loop, pkt)
+	err := r.ref.ep.WritePacket(r, gso, params, pkt)
 	if err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 	} else {
@@ -174,7 +174,7 @@ func (r *Route) WritePackets(gso *GSO, pkts []tcpip.PacketBuffer, params Network
 		return 0, tcpip.ErrInvalidEndpointState
 	}
 
-	n, err := r.ref.ep.WritePackets(r, gso, pkts, params, r.Loop)
+	n, err := r.ref.ep.WritePackets(r, gso, pkts, params)
 	if err != nil {
 		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(len(pkts) - n))
 	}
@@ -195,7 +195,7 @@ func (r *Route) WriteHeaderIncludedPacket(pkt tcpip.PacketBuffer) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	if err := r.ref.ep.WriteHeaderIncludedPacket(r, r.Loop, pkt); err != nil {
+	if err := r.ref.ep.WriteHeaderIncludedPacket(r, pkt); err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 		return err
 	}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index bf057745e..33f20579f 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -124,7 +124,7 @@ func (f *fakeNetworkEndpoint) Capabilities() stack.LinkEndpointCapabilities {
 	return f.ep.Capabilities()
 }
 
-func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
 	// Increment the sent packet count in the protocol descriptor.
 	f.proto.sendPacketCount[int(r.RemoteAddress[0])%len(f.proto.sendPacketCount)]++
 
@@ -135,7 +135,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
 	b[1] = f.id.LocalAddress[0]
 	b[2] = byte(params.Protocol)
 
-	if loop&stack.PacketLoop != 0 {
+	if r.Loop&stack.PacketLoop != 0 {
 		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
 		views[0] = pkt.Header.View()
 		views = append(views, pkt.Data.Views()...)
@@ -143,7 +143,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
 			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 		})
 	}
-	if loop&stack.PacketOut == 0 {
+	if r.Loop&stack.PacketOut == 0 {
 		return nil
 	}
 
@@ -151,11 +151,11 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
+func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
-func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
-- 
cgit v1.2.3


From db376e13924be59182ed4df95762328febf26298 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 8 Jan 2020 10:30:50 -0800
Subject: Make /proc/[pid] offset start at TGID_OFFSET

Updates #1195

PiperOrigin-RevId: 288725745
---
 pkg/sentry/fsimpl/proc/tasks.go      | 102 ++++++++++++++++++------
 pkg/sentry/fsimpl/proc/tasks_test.go | 147 ++++++++++++++++++++++++++++++++++-
 2 files changed, 223 insertions(+), 26 deletions(-)

diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 50b2a832f..d8f92d52f 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -27,7 +27,11 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-const defaultPermission = 0444
+const (
+	defaultPermission = 0444
+	selfName          = "self"
+	threadSelfName    = "thread-self"
+)
 
 // InoGenerator generates unique inode numbers for a given filesystem.
 type InoGenerator interface {
@@ -45,6 +49,11 @@ type tasksInode struct {
 
 	inoGen InoGenerator
 	pidns  *kernel.PIDNamespace
+
+	// '/proc/self' and '/proc/thread-self' have custom directory offsets in
+	// Linux. So handle them outside of OrderedChildren.
+	selfSymlink       *vfs.Dentry
+	threadSelfSymlink *vfs.Dentry
 }
 
 var _ kernfs.Inode = (*tasksInode)(nil)
@@ -54,20 +63,20 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames
 	contents := map[string]*kernfs.Dentry{
 		//"cpuinfo":     newCPUInfo(ctx, msrc),
 		//"filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc),
-		"loadavg":     newDentry(root, inoGen.NextIno(), defaultPermission, &loadavgData{}),
-		"meminfo":     newDentry(root, inoGen.NextIno(), defaultPermission, &meminfoData{k: k}),
-		"mounts":      kernfs.NewStaticSymlink(root, inoGen.NextIno(), defaultPermission, "self/mounts"),
-		"self":        newSelfSymlink(root, inoGen.NextIno(), defaultPermission, pidns),
-		"stat":        newDentry(root, inoGen.NextIno(), defaultPermission, &statData{k: k}),
-		"thread-self": newThreadSelfSymlink(root, inoGen.NextIno(), defaultPermission, pidns),
+		"loadavg": newDentry(root, inoGen.NextIno(), defaultPermission, &loadavgData{}),
+		"meminfo": newDentry(root, inoGen.NextIno(), defaultPermission, &meminfoData{k: k}),
+		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), defaultPermission, "self/mounts"),
+		"stat":    newDentry(root, inoGen.NextIno(), defaultPermission, &statData{k: k}),
 		//"uptime":      newUptime(ctx, msrc),
 		//"version": newVersionData(root, inoGen.NextIno(), k),
 		"version": newDentry(root, inoGen.NextIno(), defaultPermission, &versionData{k: k}),
 	}
 
 	inode := &tasksInode{
-		pidns:  pidns,
-		inoGen: inoGen,
+		pidns:             pidns,
+		inoGen:            inoGen,
+		selfSymlink:       newSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(),
+		threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(),
 	}
 	inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555)
 
@@ -86,6 +95,13 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 	// Try to lookup a corresponding task.
 	tid, err := strconv.ParseUint(name, 10, 64)
 	if err != nil {
+		// If it failed to parse, check if it's one of the special handled files.
+		switch name {
+		case selfName:
+			return i.selfSymlink, nil
+		case threadSelfName:
+			return i.threadSelfSymlink, nil
+		}
 		return nil, syserror.ENOENT
 	}
 
@@ -104,41 +120,81 @@ func (i *tasksInode) Valid(ctx context.Context) bool {
 }
 
 // IterDirents implements kernfs.inodeDynamicLookup.
-//
-// TODO(gvisor.dev/issue/1195): Use tgid N offset = TGID_OFFSET + N.
-func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
-	var tids []int
+func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
+	// fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
+	const FIRST_PROCESS_ENTRY = 256
+
+	// Use maxTaskID to shortcut searches that will result in 0 entries.
+	const maxTaskID = kernel.TasksLimit + 1
+	if offset >= maxTaskID {
+		return offset, nil
+	}
+
+	// According to Linux (fs/proc/base.c:proc_pid_readdir()), process directories
+	// start at offset FIRST_PROCESS_ENTRY with '/proc/self', followed by
+	// '/proc/thread-self' and then '/proc/[pid]'.
+	if offset < FIRST_PROCESS_ENTRY {
+		offset = FIRST_PROCESS_ENTRY
+	}
+
+	if offset == FIRST_PROCESS_ENTRY {
+		dirent := vfs.Dirent{
+			Name:    selfName,
+			Type:    linux.DT_LNK,
+			Ino:     i.inoGen.NextIno(),
+			NextOff: offset + 1,
+		}
+		if !cb.Handle(dirent) {
+			return offset, nil
+		}
+		offset++
+	}
+	if offset == FIRST_PROCESS_ENTRY+1 {
+		dirent := vfs.Dirent{
+			Name:    threadSelfName,
+			Type:    linux.DT_LNK,
+			Ino:     i.inoGen.NextIno(),
+			NextOff: offset + 1,
+		}
+		if !cb.Handle(dirent) {
+			return offset, nil
+		}
+		offset++
+	}
 
-	// Collect all tasks. Per linux we only include it in directory listings if
-	// it's the leader. But for whatever crazy reason, you can still walk to the
-	// given node.
+	// Collect all tasks that TGIDs are greater than the offset specified. Per
+	// Linux we only include in directory listings if it's the leader. But for
+	// whatever crazy reason, you can still walk to the given node.
+	var tids []int
+	startTid := offset - FIRST_PROCESS_ENTRY - 2
 	for _, tg := range i.pidns.ThreadGroups() {
+		tid := i.pidns.IDOfThreadGroup(tg)
+		if int64(tid) < startTid {
+			continue
+		}
 		if leader := tg.Leader(); leader != nil {
-			tids = append(tids, int(i.pidns.IDOfThreadGroup(tg)))
+			tids = append(tids, int(tid))
 		}
 	}
 
 	if len(tids) == 0 {
 		return offset, nil
 	}
-	if relOffset >= int64(len(tids)) {
-		return offset, nil
-	}
 
 	sort.Ints(tids)
-	for _, tid := range tids[relOffset:] {
+	for _, tid := range tids {
 		dirent := vfs.Dirent{
 			Name:    strconv.FormatUint(uint64(tid), 10),
 			Type:    linux.DT_DIR,
 			Ino:     i.inoGen.NextIno(),
-			NextOff: offset + 1,
+			NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1,
 		}
 		if !cb.Handle(dirent) {
 			return offset, nil
 		}
 		offset++
 	}
-	return offset, nil
+	return maxTaskID, nil
 }
 
 // Open implements kernfs.Inode.
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 2560fcef9..ca8c87ec2 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -16,6 +16,7 @@ package proc
 
 import (
 	"fmt"
+	"math"
 	"path"
 	"strconv"
 	"testing"
@@ -30,6 +31,18 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+var (
+	// Next offset 256 by convention. Adds 1 for the next offset.
+	selfLink       = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 0 + 1}
+	threadSelfLink = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 1 + 1}
+
+	// /proc/[pid] next offset starts at 256+2 (files above), then adds the
+	// PID, and adds 1 for the next offset.
+	proc1 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 1 + 1}
+	proc2 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 2 + 1}
+	proc3 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 3 + 1}
+)
+
 type testIterDirentsCallback struct {
 	dirents []vfs.Dirent
 }
@@ -59,9 +72,9 @@ func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
 		"loadavg":     {Type: linux.DT_REG},
 		"meminfo":     {Type: linux.DT_REG},
 		"mounts":      {Type: linux.DT_LNK},
-		"self":        {Type: linux.DT_LNK},
+		"self":        selfLink,
 		"stat":        {Type: linux.DT_REG},
-		"thread-self": {Type: linux.DT_LNK},
+		"thread-self": threadSelfLink,
 		"version":     {Type: linux.DT_REG},
 	}
 	return checkFiles(gots, wants)
@@ -93,6 +106,9 @@ func checkFiles(gots []vfs.Dirent, wants map[string]vfs.Dirent) ([]vfs.Dirent, e
 		if want.Type != got.Type {
 			return gots, fmt.Errorf("wrong file type, want: %v, got: %v: %+v", want.Type, got.Type, got)
 		}
+		if want.NextOff != 0 && want.NextOff != got.NextOff {
+			return gots, fmt.Errorf("wrong dirent offset, want: %v, got: %v: %+v", want.NextOff, got.NextOff, got)
+		}
 
 		delete(wants, got.Name)
 		gots = append(gots[0:i], gots[i+1:]...)
@@ -154,7 +170,7 @@ func TestTasksEmpty(t *testing.T) {
 		t.Error(err.Error())
 	}
 	if len(cb.dirents) != 0 {
-		t.Error("found more files than expected: %+v", cb.dirents)
+		t.Errorf("found more files than expected: %+v", cb.dirents)
 	}
 }
 
@@ -216,6 +232,11 @@ func TestTasks(t *testing.T) {
 		if !found {
 			t.Errorf("Additional task ID %d listed: %v", pid, tasks)
 		}
+		// Next offset starts at 256+2 ('self' and 'thread-self'), then adds the
+		// PID, and adds 1 for the next offset.
+		if want := int64(256 + 2 + pid + 1); d.NextOff != want {
+			t.Errorf("Wrong dirent offset want: %d got: %d: %+v", want, d.NextOff, d)
+		}
 	}
 
 	// Test lookup.
@@ -246,6 +267,126 @@ func TestTasks(t *testing.T) {
 	}
 }
 
+func TestTasksOffset(t *testing.T) {
+	ctx, vfsObj, root, err := setup()
+	if err != nil {
+		t.Fatalf("Setup failed: %v", err)
+	}
+	defer root.DecRef()
+
+	k := kernel.KernelFromContext(ctx)
+	for i := 0; i < 3; i++ {
+		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+		if _, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc); err != nil {
+			t.Fatalf("CreateTask(): %v", err)
+		}
+	}
+
+	for _, tc := range []struct {
+		name   string
+		offset int64
+		wants  map[string]vfs.Dirent
+	}{
+		{
+			name:   "small offset",
+			offset: 100,
+			wants: map[string]vfs.Dirent{
+				"self":        selfLink,
+				"thread-self": threadSelfLink,
+				"1":           proc1,
+				"2":           proc2,
+				"3":           proc3,
+			},
+		},
+		{
+			name:   "offset at start",
+			offset: 256,
+			wants: map[string]vfs.Dirent{
+				"self":        selfLink,
+				"thread-self": threadSelfLink,
+				"1":           proc1,
+				"2":           proc2,
+				"3":           proc3,
+			},
+		},
+		{
+			name:   "skip /proc/self",
+			offset: 257,
+			wants: map[string]vfs.Dirent{
+				"thread-self": threadSelfLink,
+				"1":           proc1,
+				"2":           proc2,
+				"3":           proc3,
+			},
+		},
+		{
+			name:   "skip symlinks",
+			offset: 258,
+			wants: map[string]vfs.Dirent{
+				"1": proc1,
+				"2": proc2,
+				"3": proc3,
+			},
+		},
+		{
+			name:   "skip first process",
+			offset: 260,
+			wants: map[string]vfs.Dirent{
+				"2": proc2,
+				"3": proc3,
+			},
+		},
+		{
+			name:   "last process",
+			offset: 261,
+			wants: map[string]vfs.Dirent{
+				"3": proc3,
+			},
+		},
+		{
+			name:   "after last",
+			offset: 262,
+			wants:  nil,
+		},
+		{
+			name:   "TaskLimit+1",
+			offset: kernel.TasksLimit + 1,
+			wants:  nil,
+		},
+		{
+			name:   "max",
+			offset: math.MaxInt64,
+			wants:  nil,
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			fd, err := vfsObj.OpenAt(
+				ctx,
+				auth.CredentialsFromContext(ctx),
+				&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+				&vfs.OpenOptions{},
+			)
+			if err != nil {
+				t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
+			}
+			if _, err := fd.Impl().Seek(ctx, tc.offset, linux.SEEK_SET); err != nil {
+				t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err)
+			}
+
+			cb := testIterDirentsCallback{}
+			if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+				t.Fatalf("IterDirents(): %v", err)
+			}
+			if cb.dirents, err = checkFiles(cb.dirents, tc.wants); err != nil {
+				t.Error(err.Error())
+			}
+			if len(cb.dirents) != 0 {
+				t.Errorf("found more files than expected: %+v", cb.dirents)
+			}
+		})
+	}
+}
+
 func TestTask(t *testing.T) {
 	ctx, vfsObj, root, err := setup()
 	if err != nil {
-- 
cgit v1.2.3


From 1e1921e2acdb7357972257219fdffb9edf17bf55 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 8 Jan 2020 11:15:46 -0800
Subject: Minor fixes to comments and logging

---
 pkg/sentry/socket/netfilter/netfilter.go | 10 +++++++---
 pkg/sentry/socket/netstack/netstack.go   |  8 +++++---
 pkg/sentry/syscalls/linux/sys_socket.go  |  2 +-
 pkg/tcpip/iptables/targets.go            |  2 +-
 pkg/tcpip/iptables/types.go              | 28 ++++++----------------------
 test/iptables/filter_input.go            |  3 +--
 test/iptables/iptables_test.go           | 22 +++++++++++-----------
 7 files changed, 32 insertions(+), 43 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 8c7f3c7fc..b7867a576 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -157,9 +157,7 @@ func convertNetstackToBinary(name string, table iptables.Table) (linux.KernelIPT
 				meta.HookEntry[hook] = entries.Size
 			}
 		}
-		// Is this a chain underflow point? The underflow rule is the last rule
-		// in the chain, and is an unconditional rule (i.e. it matches any
-		// packet). This is enforced when saving iptables.
+		// Is this a chain underflow point?
 		for underflow, underflowRuleIdx := range table.Underflows {
 			if underflowRuleIdx == ruleIdx {
 				meta.Underflow[underflow] = entries.Size
@@ -290,6 +288,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
+		log.Infof("netfilter.SetEntries: optVal has insufficient size for replace %d", len(optVal))
 		return syserr.ErrInvalidArgument
 	}
 	var replace linux.IPTReplace
@@ -313,6 +312,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
 		// Get the struct ipt_entry.
 		if len(optVal) < linux.SizeOfIPTEntry {
+			log.Infof("netfilter: optVal has insufficient size for entry %d", len(optVal))
 			return syserr.ErrInvalidArgument
 		}
 		var entry linux.IPTEntry
@@ -328,6 +328,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		// filtering. We reject any nonzero IPTIP values for now.
 		emptyIPTIP := linux.IPTIP{}
 		if entry.IP != emptyIPTIP {
+			log.Infof("netfilter: non-empty struct iptip found")
 			return syserr.ErrInvalidArgument
 		}
 
@@ -386,6 +387,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 // along with the number of bytes it occupies in optVal.
 func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 	if len(optVal) < linux.SizeOfXTEntryTarget {
+		log.Infof("netfilter: optVal has insufficient size for entry target %d", len(optVal))
 		return nil, 0, syserr.ErrInvalidArgument
 	}
 	var target linux.XTEntryTarget
@@ -395,6 +397,7 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 	case "":
 		// Standard target.
 		if len(optVal) < linux.SizeOfXTStandardTarget {
+			log.Infof("netfilter.SetEntries: optVal has insufficient size for standard target %d", len(optVal))
 			return nil, 0, syserr.ErrInvalidArgument
 		}
 		var target linux.XTStandardTarget
@@ -420,6 +423,7 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 	case errorTargetName:
 		// Error target.
 		if len(optVal) < linux.SizeOfXTErrorTarget {
+			log.Infof("netfilter.SetEntries: optVal has insufficient size for error target %d", len(optVal))
 			return nil, 0, syserr.ErrInvalidArgument
 		}
 		var target linux.XTErrorTarget
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index f7caa45b4..8c07eef4b 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -326,7 +326,7 @@ func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress,
 	}
 
 	family := usermem.ByteOrder.Uint16(addr)
-	if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) {
+	if family != uint16(sfamily) && (strict || family != linux.AF_UNSPEC) {
 		return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported
 	}
 
@@ -1357,7 +1357,8 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
 	}
 
 	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
-		if name == linux.IPT_SO_SET_REPLACE {
+		switch name {
+		case linux.IPT_SO_SET_REPLACE:
 			if len(optVal) < linux.SizeOfIPTReplace {
 				return syserr.ErrInvalidArgument
 			}
@@ -1371,7 +1372,8 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
 				return err
 			}
 			return nil
-		} else if name == linux.IPT_SO_SET_ADD_COUNTERS {
+
+		case linux.IPT_SO_SET_ADD_COUNTERS:
 			// TODO(gvisor.dev/issue/170): Counter support.
 			return nil
 		}
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 4b5aafcc0..cda517a81 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -41,7 +41,7 @@ const maxListenBacklog = 1024
 const maxAddrLen = 200
 
 // maxOptLen is the maximum sockopt parameter length we're willing to accept.
-const maxOptLen = 1024
+const maxOptLen = 1024 * 8
 
 // maxControlLen is the maximum length of the msghdr.msg_control buffer we're
 // willing to accept. Note that this limit is smaller than Linux, which allows
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 03c9f19ff..2c3598e3d 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -34,7 +34,7 @@ func (UnconditionalDropTarget) Action(packet buffer.VectorisedView) (Verdict, st
 	return Drop, ""
 }
 
-// PanicTarget just panics.
+// PanicTarget just panics. It represents a target that should be unreachable.
 type PanicTarget struct{}
 
 // Actions implements Target.Action.
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 76364ff1f..fe0394a31 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -107,20 +107,19 @@ type IPTables struct {
 	Priorities map[Hook][]string
 }
 
-// A Table defines a set of chains and hooks into the network stack. The
-// currently supported tables are:
-//   * nat
-//   * mangle
+// A Table defines a set of chains and hooks into the network stack. It is
+// really just a list of rules with some metadata for entrypoints and such.
 type Table struct {
-	// A table is just a list of rules with some entrypoints.
+	// Rules holds the rules that make up the table.
 	Rules []Rule
 
+	// BuiltinChains maps builtin chains to their entrypoints.
 	BuiltinChains map[Hook]int
 
+	// Underflows maps builtin chains to their underflow point (i.e. the
+	// rule to execute if the chain returns without a verdict).
 	Underflows map[Hook]int
 
-	// DefaultTargets map[Hook]int
-
 	// UserChains holds user-defined chains for the keyed by name. Users
 	// can give their chains arbitrary names.
 	UserChains map[string]int
@@ -149,21 +148,6 @@ func (table *Table) SetMetadata(metadata interface{}) {
 	table.metadata = metadata
 }
 
-//// A Chain defines a list of rules for packet processing. When a packet
-//// traverses a chain, it is checked against each rule until either a rule
-//// returns a verdict or the chain ends.
-////
-//// By convention, builtin chains end with a rule that matches everything and
-//// returns either Accept or Drop. User-defined chains end with Return. These
-//// aren't strictly necessary here, but the iptables tool writes tables this way.
-//type Chain struct {
-//	// Name is the chain name.
-//	Name string
-
-//	// Rules is the list of rules to traverse.
-//	Rules []Rule
-//}
-
 // A Rule is a packet processing rule. It consists of two pieces. First it
 // contains zero or more matchers, each of which is a specification of which
 // packets this rule applies to. If there are no matchers in the rule, it
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 0cb668635..34a85db97 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -43,8 +43,7 @@ func (FilterInputDropUDP) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputDropUDP) ContainerAction(ip net.IP) error {
-	if err := filterTable("-A", "INPUT", "-p", "udp", "-j", "DROP"); err != nil {
-		// if err := filterTable("-A", "INPUT", "-j", "ACCEPT"); err != nil {
+	if err := filterTable("-A", "INPUT", "-j", "ACCEPT"); err != nil {
 		return err
 	}
 
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index e761e0f2f..2465a4e65 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -167,14 +167,14 @@ func TestFilterInputDropUDP(t *testing.T) {
 	}
 }
 
-// func TestFilterInputDropUDPPort(t *testing.T) {
-// 	if err := singleTest(FilterInputDropUDPPort{}); err != nil {
-// 		t.Fatal(err)
-// 	}
-// }
-
-// func TestFilterInputDropDifferentUDPPort(t *testing.T) {
-// 	if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil {
-// 		t.Fatal(err)
-// 	}
-// }
+func TestFilterInputDropUDPPort(t *testing.T) {
+	if err := singleTest(FilterInputDropUDPPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDropDifferentUDPPort(t *testing.T) {
+	if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
-- 
cgit v1.2.3


From 899309c4ebadd0e4e22fbc4da12fbd6719d68a3a Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 8 Jan 2020 11:16:41 -0800
Subject: Revert filter_input change

---
 test/iptables/filter_input.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 34a85db97..923f44e68 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -43,7 +43,7 @@ func (FilterInputDropUDP) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputDropUDP) ContainerAction(ip net.IP) error {
-	if err := filterTable("-A", "INPUT", "-j", "ACCEPT"); err != nil {
+	if err := filterTable("-A", "INPUT", "-p", "udp", "-j", "DROP"); err != nil {
 		return err
 	}
 
-- 
cgit v1.2.3


From 2f02e15e54d41d72eb5815a98b49c265977c6507 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 8 Jan 2020 11:17:15 -0800
Subject: Newline

---
 test/iptables/iptables_test.go | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 2465a4e65..23d15bf71 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -15,6 +15,7 @@
 package iptables
 
 import (
+	"flag"
 	"fmt"
 	"net"
 	"os"
@@ -22,8 +23,6 @@ import (
 	"testing"
 	"time"
 
-	"flag"
-
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/dockerutil"
 	"gvisor.dev/gvisor/runsc/testutil"
-- 
cgit v1.2.3


From 446a250996d9c946d9a5279f7fd081cc1be0bd11 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 8 Jan 2020 11:20:48 -0800
Subject: Comment cleanup.

---
 pkg/abi/linux/netfilter.go               | 2 ++
 pkg/sentry/socket/netfilter/netfilter.go | 2 --
 pkg/tcpip/iptables/types.go              | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index 0bcb232de..35d66d622 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -42,6 +42,8 @@ const (
 	NF_RETURN = -NF_REPEAT - 1
 )
 
+// VerdictStrings maps int verdicts to the strings they represent. It is used
+// for debugging.
 var VerdictStrings = map[int32]string{
 	-NF_DROP - 1:   "DROP",
 	-NF_ACCEPT - 1: "ACCEPT",
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index b7867a576..347342f98 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -376,8 +376,6 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		Size:       replace.Size,
 	})
 	ipt.Tables[replace.Name.String()] = table
-	// TODO: Do we need to worry about locking? We could write rules while
-	// packets traverse tables.
 	stack.SetIPTables(ipt)
 
 	return nil
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index fe0394a31..540f8c0b4 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -113,11 +113,11 @@ type Table struct {
 	// Rules holds the rules that make up the table.
 	Rules []Rule
 
-	// BuiltinChains maps builtin chains to their entrypoints.
+	// BuiltinChains maps builtin chains to their entrypoint rule in Rules.
 	BuiltinChains map[Hook]int
 
-	// Underflows maps builtin chains to their underflow point (i.e. the
-	// rule to execute if the chain returns without a verdict).
+	// Underflows maps builtin chains to their underflow rule in Rules
+	// (i.e. the rule to execute if the chain returns without a verdict).
 	Underflows map[Hook]int
 
 	// UserChains holds user-defined chains for the keyed by name. Users
-- 
cgit v1.2.3


From 366e050ad516d6085bcae1215e8e122c6077e9ff Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 8 Jan 2020 11:24:55 -0800
Subject: Revert go.mod and go.sum

---
 go.mod | 31 +++++++++++++++----------------
 go.sum | 10 ----------
 2 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/go.mod b/go.mod
index 4802359f8..304b8bf13 100644
--- a/go.mod
+++ b/go.mod
@@ -3,20 +3,19 @@ module gvisor.dev/gvisor
 go 1.13
 
 require (
-	github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
-	github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
-	github.com/golang/mock v1.3.1
-	github.com/golang/protobuf v1.3.1
-	github.com/google/btree v1.0.0
-	github.com/google/go-cmp v0.2.0
-	github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
-	github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
-	github.com/kr/pty v1.1.1
-	github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
-	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
-	github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
-	github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
-	golang.org/x/net v0.0.0-20190311183353-d8887717615a
-	golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
-	golang.org/x/time v0.0.0-20191024005414-555d28b269f0
+  github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
+  github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
+  github.com/golang/mock v1.3.1
+  github.com/golang/protobuf v1.3.1
+  github.com/google/btree v1.0.0
+  github.com/google/go-cmp v0.2.0
+  github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
+  github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
+  github.com/kr/pty v1.1.1
+  github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
+  github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
+  github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
+  github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
+  golang.org/x/net v0.0.0-20190311183353-d8887717615a
+  golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
 )
diff --git a/go.sum b/go.sum
index cf092956e..7a0bc175a 100644
--- a/go.sum
+++ b/go.sum
@@ -1,29 +1,19 @@
-github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422 h1:+FKjzBIdfBHYDvxCv+djmDJdes/AoDtg8gpcxowBlF8=
 github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422/go.mod h1:b6Nc7NRH5C4aCISLry0tLnTjcuTEvoiqcWDdsU0sOGM=
 github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079/go.mod h1:F1TvTiK9OcQqauNUHlbJvyl9Qa1QvF/gOUDKA14jxHU=
 github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y=
-github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
 github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
-github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo=
 github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
-github.com/google/go-cmp v0.2.0 h1:+dTQ8DZQJz0Mb/HjFlkptS1FeQ4cWSnN941F8aEG4SQ=
 github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
 github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk=
 github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/kr/pty v1.1.1 h1:VkoXIwSboBpnk99O/KFauAEILuNHv5DVFKZMBN/gUgw=
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
-github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78 h1:d9F+LNYwMyi3BDN4GzZdaSiq4otb8duVEWyZjeUtOQI=
 github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
-github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2 h1:b6uOv7YOFK0TYG7HtkIgExQo+2RdLuwRft63jn2HWj8=
 github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
 github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e/go.mod h1:+SR5DhBJrl6ZM7CoCKvpw5BKroDKQ+PJqOg65H/2ktk=
 github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936/go.mod h1:ZjcWmFBXmLKZu9Nxj3WKYEafiSqer2rnvPr0en9UNpI=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-golang.org/x/time v0.0.0-20191024005414-555d28b269f0 h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs=
-golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
-- 
cgit v1.2.3


From 7cebd77806d164a3baec52eaeb05662e8c404967 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 8 Jan 2020 12:43:46 -0800
Subject: First commit -- re-adding DROP

---
 pkg/sentry/socket/netfilter/netfilter.go | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 347342f98..e4c493141 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -410,10 +410,7 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 		case iptables.Accept:
 			return iptables.UnconditionalAcceptTarget{}, linux.SizeOfXTStandardTarget, nil
 		case iptables.Drop:
-			// TODO(gvisor.dev/issue/170): Return an
-			// iptables.UnconditionalDropTarget to support DROP.
-			log.Infof("netfilter DROP is not supported yet.")
-			return nil, 0, syserr.ErrInvalidArgument
+			return iptables.UnconditionalDropTarget{}, linux.SizeOfXTStandardTarget, nil
 		default:
 			panic(fmt.Sprintf("Unknown verdict: %v", verdict))
 		}
-- 
cgit v1.2.3


From 447f64c561e6b5893c1bbae7d641187b7aca64ac Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 8 Jan 2020 12:48:17 -0800
Subject: Added test for unconditional DROP on the filter INPUT chain

---
 test/iptables/filter_input.go  | 32 ++++++++++++++++++++++++++++++++
 test/iptables/iptables_test.go |  6 ++++++
 2 files changed, 38 insertions(+)

diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 923f44e68..1723a4d3e 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -31,6 +31,7 @@ func init() {
 	RegisterTestCase(FilterInputDropUDP{})
 	RegisterTestCase(FilterInputDropUDPPort{})
 	RegisterTestCase(FilterInputDropDifferentUDPPort{})
+	RegisterTestCase(FilterInputDropAll{})
 }
 
 // FilterInputDropUDP tests that we can drop UDP traffic.
@@ -122,3 +123,34 @@ func (FilterInputDropDifferentUDPPort) ContainerAction(ip net.IP) error {
 func (FilterInputDropDifferentUDPPort) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
+
+// FilterInputDropAll tests that we can drop all traffic to the INPUT chain.
+type FilterInputDropAll struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDropAll) Name() string {
+	return "FilterInputDropAll"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDropAll) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for All packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets should have been dropped, but got a packet")
+	} else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+		return fmt.Errorf("error reading: %v", err)
+	}
+
+	// At this point we know that reading timed out and never received a
+	// packet.
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDropAll) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 23d15bf71..5927eb017 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -177,3 +177,9 @@ func TestFilterInputDropDifferentUDPPort(t *testing.T) {
 		t.Fatal(err)
 	}
 }
+
+func TestFilterInputDropAll(t *testing.T) {
+	if err := singleTest(FilterInputDropAll{}); err != nil {
+		t.Fatal(err)
+	}
+}
-- 
cgit v1.2.3


From d01240d871c8737989b1af27c137f6ae40bc6d37 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Wed, 8 Jan 2020 13:52:56 -0800
Subject: Take addresses as const

PiperOrigin-RevId: 288767927
---
 test/syscalls/linux/ip_socket_test_util.cc | 10 +++++-----
 test/syscalls/linux/ip_socket_test_util.h  |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index 8398fc95f..6b472eb2f 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -187,24 +187,24 @@ PosixErrorOr<int> IfAddrHelper::GetIndex(std::string name) {
   return InterfaceIndex(name);
 }
 
-std::string GetAddr4Str(in_addr* a) {
+std::string GetAddr4Str(const in_addr* a) {
   char str[INET_ADDRSTRLEN];
   inet_ntop(AF_INET, a, str, sizeof(str));
   return std::string(str);
 }
 
-std::string GetAddr6Str(in6_addr* a) {
+std::string GetAddr6Str(const in6_addr* a) {
   char str[INET6_ADDRSTRLEN];
   inet_ntop(AF_INET6, a, str, sizeof(str));
   return std::string(str);
 }
 
-std::string GetAddrStr(sockaddr* a) {
+std::string GetAddrStr(const sockaddr* a) {
   if (a->sa_family == AF_INET) {
-    auto src = &(reinterpret_cast<sockaddr_in*>(a)->sin_addr);
+    auto src = &(reinterpret_cast<const sockaddr_in*>(a)->sin_addr);
     return GetAddr4Str(src);
   } else if (a->sa_family == AF_INET6) {
-    auto src = &(reinterpret_cast<sockaddr_in6*>(a)->sin6_addr);
+    auto src = &(reinterpret_cast<const sockaddr_in6*>(a)->sin6_addr);
     return GetAddr6Str(src);
   }
   return std::string("<invalid>");
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 9cb4566db..0f58e0f77 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -105,14 +105,14 @@ class IfAddrHelper {
 };
 
 // GetAddr4Str returns the given IPv4 network address structure as a string.
-std::string GetAddr4Str(in_addr* a);
+std::string GetAddr4Str(const in_addr* a);
 
 // GetAddr6Str returns the given IPv6 network address structure as a string.
-std::string GetAddr6Str(in6_addr* a);
+std::string GetAddr6Str(const in6_addr* a);
 
 // GetAddrStr returns the given IPv4 or IPv6 network address structure as a
 // string.
-std::string GetAddrStr(sockaddr* a);
+std::string GetAddrStr(const sockaddr* a);
 
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From a271bccfc61390be64ca0175b8fc7d20e66d05b6 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Wed, 8 Jan 2020 14:16:38 -0800
Subject: Rename tcpip.SockOpt{,Int}

PiperOrigin-RevId: 288772878
---
 pkg/sentry/socket/netstack/netstack.go   |  4 ++--
 pkg/sentry/socket/unix/transport/unix.go |  8 ++++----
 pkg/tcpip/stack/transport_test.go        |  4 ++--
 pkg/tcpip/tcpip.go                       | 10 +++++-----
 pkg/tcpip/transport/icmp/endpoint.go     |  4 ++--
 pkg/tcpip/transport/packet/endpoint.go   |  4 ++--
 pkg/tcpip/transport/raw/endpoint.go      |  4 ++--
 pkg/tcpip/transport/tcp/endpoint.go      |  4 ++--
 pkg/tcpip/transport/udp/endpoint.go      |  4 ++--
 9 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 140851c17..5f91a0d1a 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -224,7 +224,7 @@ type commonEndpoint interface {
 
 	// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and
 	// transport.Endpoint.SetSockOptInt.
-	SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error
+	SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error
 
 	// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
 	// transport.Endpoint.GetSockOpt.
@@ -232,7 +232,7 @@ type commonEndpoint interface {
 
 	// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
 	// transport.Endpoint.GetSockOpt.
-	GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error)
+	GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
 }
 
 // SocketOperations encapsulates all the state needed to represent a network stack
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 529a7a7a9..fcba49435 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -177,7 +177,7 @@ type Endpoint interface {
 
 	// SetSockOptInt sets a socket option for simple cases when a value has
 	// the int type.
-	SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error
+	SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error
 
 	// GetSockOpt gets a socket option. opt should be a pointer to one of the
 	// tcpip.*Option types.
@@ -185,7 +185,7 @@ type Endpoint interface {
 
 	// GetSockOptInt gets a socket option for simple cases when a return
 	// value has the int type.
-	GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error)
+	GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
 
 	// State returns the current state of the socket, as represented by Linux in
 	// procfs.
@@ -851,11 +851,11 @@ func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 	return nil
 }
 
-func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	return nil
 }
 
-func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
 	case tcpip.ReceiveQueueSizeOption:
 		v := 0
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 748ce4ea5..095346f0b 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -103,12 +103,12 @@ func (*fakeTransportEndpoint) SetSockOpt(interface{}) *tcpip.Error {
 }
 
 // SetSockOptInt sets a socket option. Currently not supported.
-func (*fakeTransportEndpoint) SetSockOptInt(tcpip.SockOpt, int) *tcpip.Error {
+func (*fakeTransportEndpoint) SetSockOptInt(tcpip.SockOptInt, int) *tcpip.Error {
 	return tcpip.ErrInvalidEndpointState
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
-func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	return -1, tcpip.ErrUnknownProtocolOption
 }
 
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index f62fd729f..b172d71b0 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -425,7 +425,7 @@ type Endpoint interface {
 
 	// SetSockOptInt sets a socket option, for simple cases where a value
 	// has the int type.
-	SetSockOptInt(opt SockOpt, v int) *Error
+	SetSockOptInt(opt SockOptInt, v int) *Error
 
 	// GetSockOpt gets a socket option. opt should be a pointer to one of the
 	// *Option types.
@@ -433,7 +433,7 @@ type Endpoint interface {
 
 	// GetSockOptInt gets a socket option for simple cases where a return
 	// value has the int type.
-	GetSockOptInt(SockOpt) (int, *Error)
+	GetSockOptInt(SockOptInt) (int, *Error)
 
 	// State returns a socket's lifecycle state. The returned value is
 	// protocol-specific and is primarily used for diagnostics.
@@ -488,13 +488,13 @@ type WriteOptions struct {
 	Atomic bool
 }
 
-// SockOpt represents socket options which values have the int type.
-type SockOpt int
+// SockOptInt represents socket options which values have the int type.
+type SockOptInt int
 
 const (
 	// ReceiveQueueSizeOption is used in GetSockOptInt to specify that the
 	// number of unread bytes in the input buffer should be returned.
-	ReceiveQueueSizeOption SockOpt = iota
+	ReceiveQueueSizeOption SockOptInt = iota
 
 	// SendBufferSizeOption is used by SetSockOptInt/GetSockOptInt to
 	// specify the send buffer size option.
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 9c40931b5..5816ce49a 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -351,12 +351,12 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 // SetSockOptInt sets a socket option. Currently not supported.
-func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	return nil
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
-func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
 	case tcpip.ReceiveQueueSizeOption:
 		v := 0
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 0010b5e5f..6360ce880 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -251,12 +251,12 @@ func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
-func (ep *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
-func (ep *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	return 0, tcpip.ErrNotSupported
 }
 
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 5aafe2615..0fd9c456a 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -510,12 +510,12 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
-func (ep *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
-func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
 	case tcpip.ReceiveQueueSizeOption:
 		v := 0
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index fe629aa40..f79154b95 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1146,7 +1146,7 @@ func (e *endpoint) zeroReceiveWindow(scale uint8) bool {
 }
 
 // SetSockOptInt sets a socket option.
-func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	switch opt {
 	case tcpip.ReceiveBufferSizeOption:
 		// Make sure the receive buffer size is within the min and max
@@ -1447,7 +1447,7 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
-func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
 	case tcpip.ReceiveQueueSizeOption:
 		return e.readyReceiveSize()
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 1ac4705af..dae373ea7 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -457,7 +457,7 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 }
 
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
-func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	return nil
 }
 
@@ -661,7 +661,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
-func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
 	case tcpip.ReceiveQueueSizeOption:
 		v := 0
-- 
cgit v1.2.3


From b2a881784c8e525c1fea71c6f23663413d107f05 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 8 Jan 2020 14:48:47 -0800
Subject: Built dead-simple traversal, but now getting depedency cycle error
 :'(

---
 pkg/sentry/socket/netfilter/netfilter.go |  4 +++
 pkg/tcpip/iptables/BUILD                 |  4 ++-
 pkg/tcpip/iptables/iptables.go           | 59 ++++++++++++++++++++++++++++++++
 pkg/tcpip/network/ipv4/ipv4.go           |  6 ++++
 test/iptables/filter_input.go            |  2 +-
 5 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index e4c493141..57785220e 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -368,6 +368,10 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 	}
 
+	// TODO(gvisor.dev/issue/170): Check the following conditions:
+	// - There are no loops.
+	// - There are no chains without an unconditional final rule.
+
 	ipt := stack.IPTables()
 	table.SetMetadata(metadata{
 		HookEntry:  replace.HookEntry,
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index cc5f531e2..6ed7c6da0 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -11,5 +11,7 @@ go_library(
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/iptables",
     visibility = ["//visibility:public"],
-    deps = ["//pkg/tcpip/buffer"],
+    deps = [
+        "//pkg/tcpip",
+    ],
 )
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 9e7005374..025a4679d 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -16,6 +16,8 @@
 // tool.
 package iptables
 
+import "github.com/google/netstack/tcpip"
+
 const (
 	TablenameNat    = "nat"
 	TablenameMangle = "mangle"
@@ -121,3 +123,60 @@ func EmptyFilterTable() Table {
 		UserChains: map[string]int{},
 	}
 }
+
+// Check runs pkt through the rules for hook. It returns true when the packet
+// should continue traversing the network stack and false when it should be
+// dropped.
+func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
+	// TODO(gvisor.dev/issue/170): A lot of this is uncomplicated because
+	// we're missing features. Jumps, the call stack, etc. aren't checked
+	// for yet because we're yet to support them.
+	log.Infof("kevin: iptables.IPTables: checking hook %v", hook)
+
+	// Go through each table containing the hook.
+	for _, tablename := range it.Priorities[hook] {
+		verdict := it.checkTable(tablename)
+		switch verdict {
+		// TODO: We either got a final verdict or simply continue on.
+		}
+	}
+}
+
+func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) bool {
+	log.Infof("kevin: iptables.IPTables: checking table %q", tablename)
+	table := it.Tables[tablename]
+	ruleIdx := table.BuiltinChains[hook]
+
+	// Start from ruleIdx and go down until a rule gives us a verdict.
+	for ruleIdx := table.BuiltinChains[hook]; ruleIdx < len(table.Rules); ruleIdx++ {
+		verdict := checkRule(hook, pkt, table, ruleIdx)
+		switch verdict {
+		case Accept, Drop:
+			return verdict
+		case Continue:
+			continue
+		case Stolen, Queue, Repeat, None, Jump, Return:
+		}
+	}
+
+	panic("Traversed past the entire list of iptables rules.")
+}
+
+func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) Verdict {
+	rule := table.Rules[ruleIdx]
+	// Go through each rule matcher. If they all match, run
+	// the rule target.
+	for _, matcher := range rule.Matchers {
+		matches, hotdrop := matcher.Match(hook, pkt, "")
+		if hotdrop {
+			return Drop
+		}
+		if !matches {
+			return Continue
+		}
+	}
+
+	// All the matchers matched, so run the target.
+	verdict, _ := rule.Target.Action(pkt)
+	return verdict
+}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index e645cf62c..bbb5aafee 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -350,6 +350,12 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 	}
 	pkt.NetworkHeader = headerView[:h.HeaderLength()]
 
+	// iptables filtering.
+	if ok := iptables.Check(iptables.Input, pkt); !ok {
+		// iptables is telling us to drop the packet.
+		return
+	}
+
 	hlen := int(h.HeaderLength())
 	tlen := int(h.TotalLength())
 	pkt.Data.TrimFront(hlen)
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 1723a4d3e..7c4d469fa 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -138,7 +138,7 @@ func (FilterInputDropAll) ContainerAction(ip net.IP) error {
 		return err
 	}
 
-	// Listen for All packets on dropPort.
+	// Listen for all packets on dropPort.
 	if err := listenUDP(dropPort, sendloopDuration); err == nil {
 		return fmt.Errorf("packets should have been dropped, but got a packet")
 	} else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
-- 
cgit v1.2.3


From e21c5840569155d39e8e11ac18cee99bc6d67469 Mon Sep 17 00:00:00 2001
From: Bert Muthalaly <stijlist@google.com>
Date: Wed, 8 Jan 2020 14:49:12 -0800
Subject: Combine various Create*NIC methods into CreateNICWithOptions.

PiperOrigin-RevId: 288779416
---
 pkg/tcpip/stack/ndp_test.go                        |  6 +--
 pkg/tcpip/stack/stack.go                           | 46 ++++++++++------------
 pkg/tcpip/stack/stack_test.go                      | 10 +++--
 pkg/tcpip/stack/transport_demuxer_test.go          |  5 ++-
 pkg/tcpip/transport/tcp/tcp_test.go                |  5 ++-
 pkg/tcpip/transport/tcp/testing/context/context.go | 10 +++--
 pkg/tcpip/transport/udp/udp_test.go                |  5 ++-
 runsc/boot/network.go                              |  5 ++-
 8 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 8d89859ba..070d80c8d 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -2500,9 +2500,9 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
 			SecretKey: secretKey,
 		},
 	})
-
-	if err := s.CreateNamedNIC(nicID, nicName, e); err != nil {
-		t.Fatalf("CreateNamedNIC(%d, %q, _) = %s", nicID, nicName, err)
+	opts := stack.NICOptions{Name: nicName}
+	if err := s.CreateNICWithOptions(nicID, e, opts); err != nil {
+		t.Fatalf("CreateNICWithOptions(%d, _, %+v, _) = %s", nicID, opts, err)
 	}
 
 	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 807f910f6..fb7ac409e 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -796,9 +796,21 @@ func (s *Stack) NewPacketEndpoint(cooked bool, netProto tcpip.NetworkProtocolNum
 	return s.rawFactory.NewPacketEndpoint(s, cooked, netProto, waiterQueue)
 }
 
-// createNIC creates a NIC with the provided id and link-layer endpoint, and
-// optionally enable it.
-func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled bool) *tcpip.Error {
+// NICOptions specifies the configuration of a NIC as it is being created.
+// The zero value creates an enabled, unnamed NIC.
+type NICOptions struct {
+	// Name specifies the name of the NIC.
+	Name string
+
+	// Disabled specifies whether to avoid calling Attach on the passed
+	// LinkEndpoint.
+	Disabled bool
+}
+
+// CreateNICWithOptions creates a NIC with the provided id, LinkEndpoint, and
+// NICOptions. See the documentation on type NICOptions for details on how
+// NICs can be configured.
+func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOptions) *tcpip.Error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
@@ -807,38 +819,20 @@ func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled
 		return tcpip.ErrDuplicateNICID
 	}
 
-	n := newNIC(s, id, name, ep)
+	n := newNIC(s, id, opts.Name, ep)
 
 	s.nics[id] = n
-	if enabled {
+	if !opts.Disabled {
 		return n.enable()
 	}
 
 	return nil
 }
 
-// CreateNIC creates a NIC with the provided id and link-layer endpoint.
+// CreateNIC creates a NIC with the provided id and LinkEndpoint and calls
+// `LinkEndpoint.Attach` to start delivering packets to it.
 func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
-	return s.createNIC(id, "", ep, true)
-}
-
-// CreateNamedNIC creates a NIC with the provided id and link-layer endpoint,
-// and a human-readable name.
-func (s *Stack) CreateNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error {
-	return s.createNIC(id, name, ep, true)
-}
-
-// CreateDisabledNIC creates a NIC with the provided id and link-layer endpoint,
-// but leave it disable. Stack.EnableNIC must be called before the link-layer
-// endpoint starts delivering packets to it.
-func (s *Stack) CreateDisabledNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
-	return s.createNIC(id, "", ep, false)
-}
-
-// CreateDisabledNamedNIC is a combination of CreateNamedNIC and
-// CreateDisabledNIC.
-func (s *Stack) CreateDisabledNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error {
-	return s.createNIC(id, name, ep, false)
+	return s.CreateNICWithOptions(id, ep, NICOptions{})
 }
 
 // EnableNIC enables the given NIC so that the link-layer endpoint can start
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 33f20579f..9ac50bb23 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -2097,8 +2097,9 @@ func TestNICAutoGenAddrWithOpaque(t *testing.T) {
 
 			e := channel.New(10, 1280, test.linkAddr)
 			s := stack.New(opts)
-			if err := s.CreateNamedNIC(nicID, test.nicName, e); err != nil {
-				t.Fatalf("CreateNamedNIC(%d, %q, _) = %s", nicID, test.nicName, err)
+			nicOpts := stack.NICOptions{Name: test.nicName}
+			if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err)
 			}
 
 			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
@@ -2156,8 +2157,9 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) {
 
 			e := loopback.New()
 			s := stack.New(opts)
-			if err := s.CreateNamedNIC(nicID, nicName, e); err != nil {
-				t.Fatalf("CreateNamedNIC(%d, %q, _) = %s", nicID, nicName, err)
+			nicOpts := stack.NICOptions{Name: nicName}
+			if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, nicOpts, err)
 			}
 
 			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 3b28b06d0..33dbc0536 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -80,8 +80,9 @@ func newDualTestContextMultiNic(t *testing.T, mtu uint32, linkEpNames []string)
 	for i, linkEpName := range linkEpNames {
 		channelEP := channel.New(256, mtu, "")
 		nicID := tcpip.NICID(i + 1)
-		if err := s.CreateNamedNIC(nicID, linkEpName, channelEP); err != nil {
-			t.Fatalf("CreateNIC failed: %v", err)
+		opts := stack.NICOptions{Name: linkEpName}
+		if err := s.CreateNICWithOptions(nicID, channelEP, opts); err != nil {
+			t.Fatalf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err)
 		}
 		linkEPs[linkEpName] = channelEP
 
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index e8fe4dab5..9d7b0910d 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -3794,8 +3794,9 @@ func TestBindToDeviceOption(t *testing.T) {
 	}
 	defer ep.Close()
 
-	if err := s.CreateNamedNIC(321, "my_device", loopback.New()); err != nil {
-		t.Errorf("CreateNamedNIC failed: %v", err)
+	opts := stack.NICOptions{Name: "my_device"}
+	if err := s.CreateNICWithOptions(321, loopback.New(), opts); err != nil {
+		t.Errorf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err)
 	}
 
 	// Make an nameless NIC.
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index b0a376eba..50c81aa65 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -158,15 +158,17 @@ func New(t *testing.T, mtu uint32) *Context {
 	if testing.Verbose() {
 		wep = sniffer.New(ep)
 	}
-	if err := s.CreateNamedNIC(1, "nic1", wep); err != nil {
-		t.Fatalf("CreateNIC failed: %v", err)
+	opts := stack.NICOptions{Name: "nic1"}
+	if err := s.CreateNICWithOptions(1, wep, opts); err != nil {
+		t.Fatalf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err)
 	}
 	wep2 := stack.LinkEndpoint(channel.New(1000, mtu, ""))
 	if testing.Verbose() {
 		wep2 = sniffer.New(channel.New(1000, mtu, ""))
 	}
-	if err := s.CreateNamedNIC(2, "nic2", wep2); err != nil {
-		t.Fatalf("CreateNIC failed: %v", err)
+	opts2 := stack.NICOptions{Name: "nic2"}
+	if err := s.CreateNICWithOptions(2, wep2, opts2); err != nil {
+		t.Fatalf("CreateNICWithOptions(_, _, %+v) failed: %v", opts2, err)
 	}
 
 	if err := s.AddAddress(1, ipv4.ProtocolNumber, StackAddr); err != nil {
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 7051a7a9c..65382b7f1 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -508,8 +508,9 @@ func TestBindToDeviceOption(t *testing.T) {
 	}
 	defer ep.Close()
 
-	if err := s.CreateNamedNIC(321, "my_device", loopback.New()); err != nil {
-		t.Errorf("CreateNamedNIC failed: %v", err)
+	opts := stack.NICOptions{Name: "my_device"}
+	if err := s.CreateNICWithOptions(321, loopback.New(), opts); err != nil {
+		t.Errorf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err)
 	}
 
 	// Make an nameless NIC.
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 0240fe323..6a8765ec8 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -219,8 +219,9 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 // createNICWithAddrs creates a NIC in the network stack and adds the given
 // addresses.
 func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error {
-	if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil {
-		return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, ep, err)
+	opts := stack.NICOptions{Name: name}
+	if err := n.Stack.CreateNICWithOptions(id, sniffer.New(ep), opts); err != nil {
+		return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err)
 	}
 
 	// Always start with an arp address for the NIC.
-- 
cgit v1.2.3


From d530df2f95c3f75488ecc56b8fd205c3ee0966f8 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Wed, 8 Jan 2020 15:39:22 -0800
Subject: Introduce tcpip.SockOptBool

...and port V6OnlyOption to it.

PiperOrigin-RevId: 288789451
---
 pkg/sentry/socket/netstack/netstack.go             | 21 ++++--
 pkg/sentry/socket/unix/transport/unix.go           | 16 +++++
 pkg/tcpip/stack/ndp_test.go                        | 15 ++---
 pkg/tcpip/stack/transport_demuxer_test.go          | 48 +++++++-------
 pkg/tcpip/stack/transport_test.go                  | 10 +++
 pkg/tcpip/tcpip.go                                 | 21 ++++--
 pkg/tcpip/transport/icmp/endpoint.go               | 10 +++
 pkg/tcpip/transport/packet/endpoint.go             | 22 +++++--
 pkg/tcpip/transport/raw/endpoint.go                | 40 +++++++-----
 pkg/tcpip/transport/tcp/dual_stack_test.go         |  8 +--
 pkg/tcpip/transport/tcp/endpoint.go                | 75 ++++++++++++----------
 pkg/tcpip/transport/tcp/tcp_test.go                |  8 +--
 pkg/tcpip/transport/tcp/testing/context/context.go |  6 +-
 pkg/tcpip/transport/udp/endpoint.go                | 60 +++++++++--------
 pkg/tcpip/transport/udp/udp_test.go                |  2 +-
 15 files changed, 224 insertions(+), 138 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 5f91a0d1a..9e0d69046 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -222,6 +222,10 @@ type commonEndpoint interface {
 	// transport.Endpoint.SetSockOpt.
 	SetSockOpt(interface{}) *tcpip.Error
 
+	// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool and
+	// transport.Endpoint.SetSockOptBool.
+	SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error
+
 	// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and
 	// transport.Endpoint.SetSockOptInt.
 	SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error
@@ -230,6 +234,10 @@ type commonEndpoint interface {
 	// transport.Endpoint.GetSockOpt.
 	GetSockOpt(interface{}) *tcpip.Error
 
+	// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool and
+	// transport.Endpoint.GetSockOpt.
+	GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error)
+
 	// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
 	// transport.Endpoint.GetSockOpt.
 	GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
@@ -1213,12 +1221,15 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.V6OnlyOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.V6OnlyOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		var o uint32
+		if v {
+			o = 1
+		}
+		return int32(o), nil
 
 	case linux.IPV6_PATHMTU:
 		t.Kernel().EmitUnimplementedEvent(t)
@@ -1621,7 +1632,7 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.V6OnlyOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.V6OnlyOption, v != 0))
 
 	case linux.IPV6_ADD_MEMBERSHIP,
 		linux.IPV6_DROP_MEMBERSHIP,
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index fcba49435..37c7ac3c1 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -175,6 +175,10 @@ type Endpoint interface {
 	// types.
 	SetSockOpt(opt interface{}) *tcpip.Error
 
+	// SetSockOptBool sets a socket option for simple cases when a value has
+	// the int type.
+	SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error
+
 	// SetSockOptInt sets a socket option for simple cases when a value has
 	// the int type.
 	SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error
@@ -183,6 +187,10 @@ type Endpoint interface {
 	// tcpip.*Option types.
 	GetSockOpt(opt interface{}) *tcpip.Error
 
+	// GetSockOptBool gets a socket option for simple cases when a return
+	// value has the int type.
+	GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error)
+
 	// GetSockOptInt gets a socket option for simple cases when a return
 	// value has the int type.
 	GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
@@ -851,10 +859,18 @@ func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 	return nil
 }
 
+func (e *baseEndpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	return nil
+}
+
 func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	return nil
 }
 
+func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	return false, tcpip.ErrUnknownProtocolOption
+}
+
 func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
 	case tcpip.ReceiveQueueSizeOption:
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 070d80c8d..e51462a55 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -1701,9 +1701,8 @@ func addrForNewConnection(t *testing.T, s *stack.Stack) tcpip.Address {
 		t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
 	}
 	defer ep.Close()
-	v := tcpip.V6OnlyOption(1)
-	if err := ep.SetSockOpt(v); err != nil {
-		t.Fatalf("SetSockOpt(%+v): %s", v, err)
+	if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+		t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
 	}
 	if err := ep.Connect(dstAddr); err != nil {
 		t.Fatalf("ep.Connect(%+v): %s", dstAddr, err)
@@ -1728,9 +1727,8 @@ func addrForNewConnectionWithAddr(t *testing.T, s *stack.Stack, addr tcpip.FullA
 		t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
 	}
 	defer ep.Close()
-	v := tcpip.V6OnlyOption(1)
-	if err := ep.SetSockOpt(v); err != nil {
-		t.Fatalf("SetSockOpt(%+v): %s", v, err)
+	if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+		t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
 	}
 	if err := ep.Bind(addr); err != nil {
 		t.Fatalf("ep.Bind(%+v): %s", addr, err)
@@ -2066,9 +2064,8 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 		t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
 	}
 	defer ep.Close()
-	v := tcpip.V6OnlyOption(1)
-	if err := ep.SetSockOpt(v); err != nil {
-		t.Fatalf("SetSockOpt(%+v): %s", v, err)
+	if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+		t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
 	}
 
 	if err := ep.Connect(dstAddr); err != tcpip.ErrNoRoute {
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 33dbc0536..df5ced887 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -61,11 +61,7 @@ func (c *testContext) createV6Endpoint(v6only bool) {
 		c.t.Fatalf("NewEndpoint failed: %v", err)
 	}
 
-	var v tcpip.V6OnlyOption
-	if v6only {
-		v = 1
-	}
-	if err := c.ep.SetSockOpt(v); err != nil {
+	if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, v6only); err != nil {
 		c.t.Fatalf("SetSockOpt failed: %v", err)
 	}
 }
@@ -201,54 +197,54 @@ func TestDistribution(t *testing.T) {
 			"BindPortReuse",
 			// 5 endpoints that all have reuse set.
 			[]endpointSockopts{
-				endpointSockopts{1, ""},
-				endpointSockopts{1, ""},
-				endpointSockopts{1, ""},
-				endpointSockopts{1, ""},
-				endpointSockopts{1, ""},
+				{1, ""},
+				{1, ""},
+				{1, ""},
+				{1, ""},
+				{1, ""},
 			},
 			map[string][]float64{
 				// Injected packets on dev0 get distributed evenly.
-				"dev0": []float64{0.2, 0.2, 0.2, 0.2, 0.2},
+				"dev0": {0.2, 0.2, 0.2, 0.2, 0.2},
 			},
 		},
 		{
 			"BindToDevice",
 			// 3 endpoints with various bindings.
 			[]endpointSockopts{
-				endpointSockopts{0, "dev0"},
-				endpointSockopts{0, "dev1"},
-				endpointSockopts{0, "dev2"},
+				{0, "dev0"},
+				{0, "dev1"},
+				{0, "dev2"},
 			},
 			map[string][]float64{
 				// Injected packets on dev0 go only to the endpoint bound to dev0.
-				"dev0": []float64{1, 0, 0},
+				"dev0": {1, 0, 0},
 				// Injected packets on dev1 go only to the endpoint bound to dev1.
-				"dev1": []float64{0, 1, 0},
+				"dev1": {0, 1, 0},
 				// Injected packets on dev2 go only to the endpoint bound to dev2.
-				"dev2": []float64{0, 0, 1},
+				"dev2": {0, 0, 1},
 			},
 		},
 		{
 			"ReuseAndBindToDevice",
 			// 6 endpoints with various bindings.
 			[]endpointSockopts{
-				endpointSockopts{1, "dev0"},
-				endpointSockopts{1, "dev0"},
-				endpointSockopts{1, "dev1"},
-				endpointSockopts{1, "dev1"},
-				endpointSockopts{1, "dev1"},
-				endpointSockopts{1, ""},
+				{1, "dev0"},
+				{1, "dev0"},
+				{1, "dev1"},
+				{1, "dev1"},
+				{1, "dev1"},
+				{1, ""},
 			},
 			map[string][]float64{
 				// Injected packets on dev0 get distributed among endpoints bound to
 				// dev0.
-				"dev0": []float64{0.5, 0.5, 0, 0, 0, 0},
+				"dev0": {0.5, 0.5, 0, 0, 0, 0},
 				// Injected packets on dev1 get distributed among endpoints bound to
 				// dev1 or unbound.
-				"dev1": []float64{0, 0, 1. / 3, 1. / 3, 1. / 3, 0},
+				"dev1": {0, 0, 1. / 3, 1. / 3, 1. / 3, 0},
 				// Injected packets on dev999 go only to the unbound.
-				"dev999": []float64{0, 0, 0, 0, 0, 1},
+				"dev999": {0, 0, 0, 0, 0, 1},
 			},
 		},
 	} {
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 095346f0b..f50604a8a 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -102,11 +102,21 @@ func (*fakeTransportEndpoint) SetSockOpt(interface{}) *tcpip.Error {
 	return tcpip.ErrInvalidEndpointState
 }
 
+// SetSockOptBool sets a socket option. Currently not supported.
+func (*fakeTransportEndpoint) SetSockOptBool(tcpip.SockOptBool, bool) *tcpip.Error {
+	return tcpip.ErrInvalidEndpointState
+}
+
 // SetSockOptInt sets a socket option. Currently not supported.
 func (*fakeTransportEndpoint) SetSockOptInt(tcpip.SockOptInt, int) *tcpip.Error {
 	return tcpip.ErrInvalidEndpointState
 }
 
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (*fakeTransportEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	return false, tcpip.ErrUnknownProtocolOption
+}
+
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	return -1, tcpip.ErrUnknownProtocolOption
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index b172d71b0..1eca76c30 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -423,6 +423,10 @@ type Endpoint interface {
 	// SetSockOpt sets a socket option. opt should be one of the *Option types.
 	SetSockOpt(opt interface{}) *Error
 
+	// SetSockOptBool sets a socket option, for simple cases where a value
+	// has the bool type.
+	SetSockOptBool(opt SockOptBool, v bool) *Error
+
 	// SetSockOptInt sets a socket option, for simple cases where a value
 	// has the int type.
 	SetSockOptInt(opt SockOptInt, v int) *Error
@@ -431,6 +435,10 @@ type Endpoint interface {
 	// *Option types.
 	GetSockOpt(opt interface{}) *Error
 
+	// GetSockOptBool gets a socket option for simple cases where a return
+	// value has the bool type.
+	GetSockOptBool(SockOptBool) (bool, *Error)
+
 	// GetSockOptInt gets a socket option for simple cases where a return
 	// value has the int type.
 	GetSockOptInt(SockOptInt) (int, *Error)
@@ -488,6 +496,15 @@ type WriteOptions struct {
 	Atomic bool
 }
 
+// SockOptBool represents socket options which values have the bool type.
+type SockOptBool int
+
+const (
+	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
+	// socket is to be restricted to sending and receiving IPv6 packets only.
+	V6OnlyOption SockOptBool = iota
+)
+
 // SockOptInt represents socket options which values have the int type.
 type SockOptInt int
 
@@ -521,10 +538,6 @@ const (
 // the endpoint should be cleared and returned.
 type ErrorOption struct{}
 
-// V6OnlyOption is used by SetSockOpt/GetSockOpt to specify whether an IPv6
-// socket is to be restricted to sending and receiving IPv6 packets only.
-type V6OnlyOption int
-
 // CorkOption is used by SetSockOpt/GetSockOpt to specify if data should be
 // held until segments are full by the TCP transport protocol.
 type CorkOption int
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 5816ce49a..c7ce74cdd 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -350,11 +350,21 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 	return nil
 }
 
+// SetSockOptBool sets a socket option. Currently not supported.
+func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	return nil
+}
+
 // SetSockOptInt sets a socket option. Currently not supported.
 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	return nil
 }
 
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	return false, tcpip.ErrUnknownProtocolOption
+}
+
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 6360ce880..07ffa8aba 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -247,17 +247,17 @@ func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 // used with SetSockOpt, and this function always returns
 // tcpip.ErrNotSupported.
 func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	return tcpip.ErrNotSupported
+	return tcpip.ErrUnknownProtocolOption
 }
 
-// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
-func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
+func (ep *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
-// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
-func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
-	return 0, tcpip.ErrNotSupported
+// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
+func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
@@ -265,6 +265,16 @@ func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (ep *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	return false, tcpip.ErrNotSupported
+}
+
+// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
+func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
+	return 0, tcpip.ErrNotSupported
+}
+
 // HandlePacket implements stack.PacketEndpoint.HandlePacket.
 func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	ep.rcvMu.Lock()
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 0fd9c456a..85f7eb76b 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -509,11 +509,36 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
+// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
+func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
+// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
+func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+	switch o := opt.(type) {
+	case tcpip.ErrorOption:
+		return nil
+
+	case *tcpip.KeepaliveEnabledOption:
+		*o = 0
+		return nil
+
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	return false, tcpip.ErrUnknownProtocolOption
+}
+
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
@@ -544,21 +569,6 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	return -1, tcpip.ErrUnknownProtocolOption
 }
 
-// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch o := opt.(type) {
-	case tcpip.ErrorOption:
-		return nil
-
-	case *tcpip.KeepaliveEnabledOption:
-		*o = 0
-		return nil
-
-	default:
-		return tcpip.ErrUnknownProtocolOption
-	}
-}
-
 // HandlePacket implements stack.RawTransportEndpoint.HandlePacket.
 func (e *endpoint) HandlePacket(route *stack.Route, pkt tcpip.PacketBuffer) {
 	e.rcvMu.Lock()
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index dfaa4a559..4f361b226 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -391,9 +391,8 @@ func testV4Accept(t *testing.T, c *context.Context) {
 	// Make sure we get the same error when calling the original ep and the
 	// new one. This validates that v4-mapped endpoints are still able to
 	// query the V6Only flag, whereas pure v4 endpoints are not.
-	var v tcpip.V6OnlyOption
-	expected := c.EP.GetSockOpt(&v)
-	if err := nep.GetSockOpt(&v); err != expected {
+	_, expected := c.EP.GetSockOptBool(tcpip.V6OnlyOption)
+	if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != expected {
 		t.Fatalf("GetSockOpt returned unexpected value: got %v, want %v", err, expected)
 	}
 
@@ -531,8 +530,7 @@ func TestV6AcceptOnV6(t *testing.T) {
 
 	// Make sure we can still query the v6 only status of the new endpoint,
 	// that is, that it is in fact a v6 socket.
-	var v tcpip.V6OnlyOption
-	if err := nep.GetSockOpt(&v); err != nil {
+	if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != nil {
 		t.Fatalf("GetSockOpt failed failed: %v", err)
 	}
 
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index f79154b95..2ac1b6877 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1145,6 +1145,29 @@ func (e *endpoint) zeroReceiveWindow(scale uint8) bool {
 	return ((e.rcvBufSize - e.rcvBufUsed) >> scale) == 0
 }
 
+// SetSockOptBool sets a socket option.
+func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	switch opt {
+	case tcpip.V6OnlyOption:
+		// We only recognize this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return tcpip.ErrInvalidEndpointState
+		}
+
+		e.mu.Lock()
+		defer e.mu.Unlock()
+
+		// We only allow this to be set when we're in the initial state.
+		if e.state != StateInitial {
+			return tcpip.ErrInvalidEndpointState
+		}
+
+		e.v6only = v
+	}
+
+	return nil
+}
+
 // SetSockOptInt sets a socket option.
 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	switch opt {
@@ -1289,23 +1312,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.notifyProtocolGoroutine(notifyMSSChanged)
 		return nil
 
-	case tcpip.V6OnlyOption:
-		// We only recognize this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return tcpip.ErrInvalidEndpointState
-		}
-
-		e.mu.Lock()
-		defer e.mu.Unlock()
-
-		// We only allow this to be set when we're in the initial state.
-		if e.state != StateInitial {
-			return tcpip.ErrInvalidEndpointState
-		}
-
-		e.v6only = v != 0
-		return nil
-
 	case tcpip.TTLOption:
 		e.mu.Lock()
 		e.ttl = uint8(v)
@@ -1446,6 +1452,25 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
 	return e.rcvBufUsed, nil
 }
 
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	switch opt {
+	case tcpip.V6OnlyOption:
+		// We only recognize this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return false, tcpip.ErrUnknownProtocolOption
+		}
+
+		e.mu.Lock()
+		v := e.v6only
+		e.mu.Unlock()
+
+		return v, nil
+	}
+
+	return false, tcpip.ErrUnknownProtocolOption
+}
+
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
@@ -1540,22 +1565,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		}
 		return nil
 
-	case *tcpip.V6OnlyOption:
-		// We only recognize this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return tcpip.ErrUnknownProtocolOption
-		}
-
-		e.mu.Lock()
-		v := e.v6only
-		e.mu.Unlock()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
-
 	case *tcpip.TTLOption:
 		e.mu.Lock()
 		*o = tcpip.TTLOption(e.ttl)
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 9d7b0910d..15745ebd4 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -4028,12 +4028,12 @@ func TestConnectAvoidsBoundPorts(t *testing.T) {
 												switch network {
 												case "ipv4":
 												case "ipv6":
-													if err := ep.SetSockOpt(tcpip.V6OnlyOption(1)); err != nil {
-														t.Fatalf("SetSockOpt(V6OnlyOption(1)) failed: %v", err)
+													if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+														t.Fatalf("SetSockOpt(V6OnlyOption(true)) failed: %v", err)
 													}
 												case "dual":
-													if err := ep.SetSockOpt(tcpip.V6OnlyOption(0)); err != nil {
-														t.Fatalf("SetSockOpt(V6OnlyOption(0)) failed: %v", err)
+													if err := ep.SetSockOptBool(tcpip.V6OnlyOption, false); err != nil {
+														t.Fatalf("SetSockOpt(V6OnlyOption(false)) failed: %v", err)
 													}
 												default:
 													t.Fatalf("unknown network: '%s'", network)
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 50c81aa65..822907998 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -475,11 +475,7 @@ func (c *Context) CreateV6Endpoint(v6only bool) {
 		c.t.Fatalf("NewEndpoint failed: %v", err)
 	}
 
-	var v tcpip.V6OnlyOption
-	if v6only {
-		v = 1
-	}
-	if err := c.EP.SetSockOpt(v); err != nil {
+	if err := c.EP.SetSockOptBool(tcpip.V6OnlyOption, v6only); err != nil {
 		c.t.Fatalf("SetSockOpt failed failed: %v", err)
 	}
 }
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index dae373ea7..1a5ee6317 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -456,14 +456,9 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 	return 0, tcpip.ControlMessages{}, nil
 }
 
-// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
-func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
-	return nil
-}
-
-// SetSockOpt implements tcpip.Endpoint.SetSockOpt.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	switch v := opt.(type) {
+// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
+func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	switch opt {
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -478,8 +473,20 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 			return tcpip.ErrInvalidEndpointState
 		}
 
-		e.v6only = v != 0
+		e.v6only = v
+	}
+
+	return nil
+}
 
+// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	return nil
+}
+
+// SetSockOpt implements tcpip.Endpoint.SetSockOpt.
+func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+	switch v := opt.(type) {
 	case tcpip.TTLOption:
 		e.mu.Lock()
 		e.ttl = uint8(v)
@@ -660,6 +667,25 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 	return nil
 }
 
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	switch opt {
+	case tcpip.V6OnlyOption:
+		// We only recognize this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return false, tcpip.ErrUnknownProtocolOption
+		}
+
+		e.mu.Lock()
+		v := e.v6only
+		e.mu.Unlock()
+
+		return v, nil
+	}
+
+	return false, tcpip.ErrUnknownProtocolOption
+}
+
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
@@ -695,22 +721,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	case tcpip.ErrorOption:
 		return nil
 
-	case *tcpip.V6OnlyOption:
-		// We only recognize this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return tcpip.ErrUnknownProtocolOption
-		}
-
-		e.mu.Lock()
-		v := e.v6only
-		e.mu.Unlock()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
-
 	case *tcpip.TTLOption:
 		e.mu.Lock()
 		*o = tcpip.TTLOption(e.ttl)
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 65382b7f1..149fff999 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -335,7 +335,7 @@ func (c *testContext) createEndpointForFlow(flow testFlow) {
 
 	c.createEndpoint(flow.sockProto())
 	if flow.isV6Only() {
-		if err := c.ep.SetSockOpt(tcpip.V6OnlyOption(1)); err != nil {
+		if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
 			c.t.Fatalf("SetSockOpt failed: %v", err)
 		}
 	} else if flow.isBroadcast() {
-- 
cgit v1.2.3


From 0999ae8b34d83a4b2ea8342d0459c8131c35d6e1 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 8 Jan 2020 15:57:25 -0800
Subject: Getting a panic when running tests. For some reason the filter table
 is ending up with the wrong chains and is indexing -1 into rules.

---
 pkg/sentry/socket/netfilter/netfilter.go | 17 ++++++----------
 pkg/sentry/socket/netstack/netstack.go   | 12 +++++++++--
 pkg/tcpip/BUILD                          |  1 -
 pkg/tcpip/iptables/BUILD                 |  1 +
 pkg/tcpip/iptables/iptables.go           | 35 +++++++++++++++++++++++++-------
 pkg/tcpip/iptables/targets.go            |  8 ++++----
 pkg/tcpip/iptables/types.go              |  8 +++-----
 pkg/tcpip/network/arp/arp.go             |  2 +-
 pkg/tcpip/network/ipv4/BUILD             |  1 +
 pkg/tcpip/network/ipv4/ipv4.go           |  8 ++++++--
 pkg/tcpip/network/ipv6/ipv6.go           |  2 +-
 pkg/tcpip/stack/nic.go                   |  2 +-
 pkg/tcpip/stack/registration.go          |  2 +-
 pkg/tcpip/tcpip.go                       |  4 ----
 14 files changed, 63 insertions(+), 40 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 57785220e..3a857ef6d 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -25,7 +25,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
@@ -45,7 +44,7 @@ type metadata struct {
 }
 
 // GetInfo returns information about iptables.
-func GetInfo(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) {
+func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) {
 	// Read in the struct and table name.
 	var info linux.IPTGetinfo
 	if _, err := t.CopyIn(outPtr, &info); err != nil {
@@ -53,7 +52,7 @@ func GetInfo(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr) (linux.IPTG
 	}
 
 	// Find the appropriate table.
-	table, err := findTable(ep, info.Name.String())
+	table, err := findTable(stack, info.Name.String())
 	if err != nil {
 		return linux.IPTGetinfo{}, err
 	}
@@ -76,7 +75,7 @@ func GetInfo(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr) (linux.IPTG
 }
 
 // GetEntries returns netstack's iptables rules encoded for the iptables tool.
-func GetEntries(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
+func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
 	// Read in the struct and table name.
 	var userEntries linux.IPTGetEntries
 	if _, err := t.CopyIn(outPtr, &userEntries); err != nil {
@@ -84,7 +83,7 @@ func GetEntries(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr, outLen i
 	}
 
 	// Find the appropriate table.
-	table, err := findTable(ep, userEntries.Name.String())
+	table, err := findTable(stack, userEntries.Name.String())
 	if err != nil {
 		return linux.KernelIPTGetEntries{}, err
 	}
@@ -103,12 +102,8 @@ func GetEntries(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr, outLen i
 	return entries, nil
 }
 
-func findTable(ep tcpip.Endpoint, tableName string) (iptables.Table, *syserr.Error) {
-	ipt, err := ep.IPTables()
-	if err != nil {
-		return iptables.Table{}, syserr.FromError(err)
-	}
-	table, ok := ipt.Tables[tableName]
+func findTable(stack *stack.Stack, tableName string) (iptables.Table, *syserr.Error) {
+	table, ok := stack.IPTables().Tables[tableName]
 	if !ok {
 		return iptables.Table{}, syserr.ErrInvalidArgument
 	}
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 8c07eef4b..86a8104df 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -826,7 +826,11 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 				return nil, syserr.ErrInvalidArgument
 			}
 
-			info, err := netfilter.GetInfo(t, s.Endpoint, outPtr)
+			stack := inet.StackFromContext(t)
+			if stack == nil {
+				return nil, syserr.ErrNoDevice
+			}
+			info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr)
 			if err != nil {
 				return nil, err
 			}
@@ -837,7 +841,11 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 				return nil, syserr.ErrInvalidArgument
 			}
 
-			entries, err := netfilter.GetEntries(t, s.Endpoint, outPtr, outLen)
+			stack := inet.StackFromContext(t)
+			if stack == nil {
+				return nil, syserr.ErrNoDevice
+			}
+			entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen)
 			if err != nil {
 				return nil, err
 			}
diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
index 65d4d0cd8..36bc3a63b 100644
--- a/pkg/tcpip/BUILD
+++ b/pkg/tcpip/BUILD
@@ -15,7 +15,6 @@ go_library(
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip/buffer",
-        "//pkg/tcpip/iptables",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index 6ed7c6da0..2893c80cd 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -12,6 +12,7 @@ go_library(
     importpath = "gvisor.dev/gvisor/pkg/tcpip/iptables",
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/log",
         "//pkg/tcpip",
     ],
 )
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 025a4679d..aff8a680b 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -16,7 +16,12 @@
 // tool.
 package iptables
 
-import "github.com/google/netstack/tcpip"
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
 
 const (
 	TablenameNat    = "nat"
@@ -135,31 +140,47 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 
 	// Go through each table containing the hook.
 	for _, tablename := range it.Priorities[hook] {
-		verdict := it.checkTable(tablename)
+		verdict := it.checkTable(hook, pkt, tablename)
 		switch verdict {
-		// TODO: We either got a final verdict or simply continue on.
+		// If the table returns Accept, move on to the next table.
+		case Accept:
+			continue
+		// The Drop verdict is final.
+		case Drop:
+			log.Infof("kevin: Packet dropped")
+			return false
+		case Stolen, Queue, Repeat, None, Jump, Return, Continue:
+			panic(fmt.Sprintf("Unimplemented verdict %v.", verdict))
 		}
 	}
+
+	// Every table returned Accept.
+	log.Infof("kevin: Packet accepted")
+	return true
 }
 
-func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) bool {
+func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) Verdict {
 	log.Infof("kevin: iptables.IPTables: checking table %q", tablename)
 	table := it.Tables[tablename]
-	ruleIdx := table.BuiltinChains[hook]
+	log.Infof("kevin: iptables.IPTables: table %+v", table)
 
 	// Start from ruleIdx and go down until a rule gives us a verdict.
 	for ruleIdx := table.BuiltinChains[hook]; ruleIdx < len(table.Rules); ruleIdx++ {
-		verdict := checkRule(hook, pkt, table, ruleIdx)
+		verdict := it.checkRule(hook, pkt, table, ruleIdx)
 		switch verdict {
+		// For either of these cases, this table is done with the
+		// packet.
 		case Accept, Drop:
 			return verdict
+		// Continue traversing the rules of the table.
 		case Continue:
 			continue
 		case Stolen, Queue, Repeat, None, Jump, Return:
+			panic(fmt.Sprintf("Unimplemented verdict %v.", verdict))
 		}
 	}
 
-	panic("Traversed past the entire list of iptables rules.")
+	panic(fmt.Sprintf("Traversed past the entire list of iptables rules in table %q.", tablename))
 }
 
 func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) Verdict {
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 2c3598e3d..cb3ac1aff 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -16,13 +16,13 @@
 
 package iptables
 
-import "gvisor.dev/gvisor/pkg/tcpip/buffer"
+import "gvisor.dev/gvisor/pkg/tcpip"
 
 // UnconditionalAcceptTarget accepts all packets.
 type UnconditionalAcceptTarget struct{}
 
 // Action implements Target.Action.
-func (UnconditionalAcceptTarget) Action(packet buffer.VectorisedView) (Verdict, string) {
+func (UnconditionalAcceptTarget) Action(packet tcpip.PacketBuffer) (Verdict, string) {
 	return Accept, ""
 }
 
@@ -30,7 +30,7 @@ func (UnconditionalAcceptTarget) Action(packet buffer.VectorisedView) (Verdict,
 type UnconditionalDropTarget struct{}
 
 // Action implements Target.Action.
-func (UnconditionalDropTarget) Action(packet buffer.VectorisedView) (Verdict, string) {
+func (UnconditionalDropTarget) Action(packet tcpip.PacketBuffer) (Verdict, string) {
 	return Drop, ""
 }
 
@@ -38,6 +38,6 @@ func (UnconditionalDropTarget) Action(packet buffer.VectorisedView) (Verdict, st
 type PanicTarget struct{}
 
 // Actions implements Target.Action.
-func (PanicTarget) Action(packet buffer.VectorisedView) (Verdict, string) {
+func (PanicTarget) Action(packet tcpip.PacketBuffer) (Verdict, string) {
 	panic("PanicTarget triggered.")
 }
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 540f8c0b4..9f6906100 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -14,9 +14,7 @@
 
 package iptables
 
-import (
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
-)
+import "gvisor.dev/gvisor/pkg/tcpip"
 
 // A Hook specifies one of the hooks built into the network stack.
 //
@@ -165,7 +163,7 @@ type Matcher interface {
 	// Match returns whether the packet matches and whether the packet
 	// should be "hotdropped", i.e. dropped immediately. This is usually
 	// used for suspicious packets.
-	Match(hook Hook, packet buffer.VectorisedView, interfaceName string) (matches bool, hotdrop bool)
+	Match(hook Hook, packet tcpip.PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
 }
 
 // A Target is the interface for taking an action for a packet.
@@ -173,5 +171,5 @@ type Target interface {
 	// Action takes an action on the packet and returns a verdict on how
 	// traversal should (or should not) continue. If the return value is
 	// Jump, it also returns the name of the chain to jump to.
-	Action(packet buffer.VectorisedView) (Verdict, string)
+	Action(packet tcpip.PacketBuffer) (Verdict, string)
 }
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index da8482509..d88119f68 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -137,7 +137,7 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 	return tcpip.Address(h.ProtocolAddressSender()), ProtocolAddress
 }
 
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, sender stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) {
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, sender stack.LinkEndpoint, st *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) {
 	if addrWithPrefix.Address != ProtocolAddress {
 		return nil, tcpip.ErrBadLocalAddress
 	}
diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD
index aeddfcdd4..4e2aae9a3 100644
--- a/pkg/tcpip/network/ipv4/BUILD
+++ b/pkg/tcpip/network/ipv4/BUILD
@@ -15,6 +15,7 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/iptables",
         "//pkg/tcpip/network/fragmentation",
         "//pkg/tcpip/network/hash",
         "//pkg/tcpip/stack",
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index bbb5aafee..f856081e6 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
 	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -54,10 +55,11 @@ type endpoint struct {
 	dispatcher    stack.TransportDispatcher
 	fragmentation *fragmentation.Fragmentation
 	protocol      *protocol
+	stack         *stack.Stack
 }
 
 // NewEndpoint creates a new ipv4 endpoint.
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) {
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) {
 	e := &endpoint{
 		nicID:         nicID,
 		id:            stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
@@ -66,6 +68,7 @@ func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWi
 		dispatcher:    dispatcher,
 		fragmentation: fragmentation.NewFragmentation(fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout),
 		protocol:      p,
+		stack:         st,
 	}
 
 	return e, nil
@@ -351,7 +354,8 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 	pkt.NetworkHeader = headerView[:h.HeaderLength()]
 
 	// iptables filtering.
-	if ok := iptables.Check(iptables.Input, pkt); !ok {
+	ipt := e.stack.IPTables()
+	if ok := ipt.Check(iptables.Input, pkt); !ok {
 		// iptables is telling us to drop the packet.
 		return
 	}
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index e13f1fabf..4c940e9e5 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -221,7 +221,7 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 }
 
 // NewEndpoint creates a new ipv6 endpoint.
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) {
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) {
 	return &endpoint{
 		nicID:         nicID,
 		id:            stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 4144d5d0f..f2d338bd1 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -467,7 +467,7 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 	}
 
 	// Create the new network endpoint.
-	ep, err := netProto.NewEndpoint(n.id, protocolAddress.AddressWithPrefix, n.stack, n, n.linkEP)
+	ep, err := netProto.NewEndpoint(n.id, protocolAddress.AddressWithPrefix, n.stack, n, n.linkEP, n.stack)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 61fd46d66..754323e82 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -282,7 +282,7 @@ type NetworkProtocol interface {
 	ParseAddresses(v buffer.View) (src, dst tcpip.Address)
 
 	// NewEndpoint creates a new endpoint of this protocol.
-	NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, sender LinkEndpoint) (NetworkEndpoint, *tcpip.Error)
+	NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, sender LinkEndpoint, st *Stack) (NetworkEndpoint, *tcpip.Error)
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index f62fd729f..d02950c7a 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -40,7 +40,6 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -446,9 +445,6 @@ type Endpoint interface {
 	// NOTE: This method is a no-op for sockets other than TCP.
 	ModerateRecvBuf(copied int)
 
-	// IPTables returns the iptables for this endpoint's stack.
-	IPTables() (iptables.IPTables, error)
-
 	// Info returns a copy to the transport endpoint info.
 	Info() EndpointInfo
 
-- 
cgit v1.2.3


From 1c2420146777de5b5727f69331b50be1b57a3351 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 8 Jan 2020 15:40:00 -0800
Subject: Github bug reviver

For everyone's joy, this is a tool that reopens issues that
have been closed, but are still referenced by TODOs in the
code. The idea is to run it in Kokoro nightly. Kokoro changes
are coming up next.

PiperOrigin-RevId: 288789560
---
 WORKSPACE                                   |  21 +++
 go.mod                                      |   2 +
 go.sum                                      |   2 +
 tools/issue_reviver/BUILD                   |  12 ++
 tools/issue_reviver/github/BUILD            |  17 +++
 tools/issue_reviver/github/github.go        | 164 ++++++++++++++++++++++++
 tools/issue_reviver/main.go                 |  89 +++++++++++++
 tools/issue_reviver/reviver/BUILD           |  19 +++
 tools/issue_reviver/reviver/reviver.go      | 192 ++++++++++++++++++++++++++++
 tools/issue_reviver/reviver/reviver_test.go |  88 +++++++++++++
 10 files changed, 606 insertions(+)
 create mode 100644 tools/issue_reviver/BUILD
 create mode 100644 tools/issue_reviver/github/BUILD
 create mode 100644 tools/issue_reviver/github/github.go
 create mode 100644 tools/issue_reviver/main.go
 create mode 100644 tools/issue_reviver/reviver/BUILD
 create mode 100644 tools/issue_reviver/reviver/reviver.go
 create mode 100644 tools/issue_reviver/reviver/reviver_test.go

diff --git a/WORKSPACE b/WORKSPACE
index 4b5a3bfe2..e2afc073c 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -290,6 +290,27 @@ go_repository(
     version = "v1.3.1",
 )
 
+go_repository(
+    name = "com_github_google_go-github",
+    importpath = "github.com/google/go-github",
+    sum = "h1:N0LgJ1j65A7kfXrZnUDaYCs/Sf4rEjNlfyDHW9dolSY=",
+    version = "v17.0.0",
+)
+
+go_repository(
+    name = "org_golang_x_oauth2",
+    importpath = "golang.org/x/oauth2",
+    sum = "h1:pE8b58s1HRDMi8RDc79m0HISf9D4TzseP40cEA6IGfs=",
+    version = "v0.0.0-20191202225959-858c2ad4c8b6",
+)
+
+go_repository(
+    name = "com_github_google_go-querystring",
+    importpath = "github.com/google/go-querystring",
+    sum = "h1:Xkwi/a1rcvNg1PPYe5vI8GbeBY/jrVuDX5ASuANWTrk=",
+    version = "v1.0.0",
+)
+
 # System Call test dependencies.
 http_archive(
     name = "com_google_absl",
diff --git a/go.mod b/go.mod
index 304b8bf13..c4687ed02 100644
--- a/go.mod
+++ b/go.mod
@@ -9,6 +9,7 @@ require (
   github.com/golang/protobuf v1.3.1
   github.com/google/btree v1.0.0
   github.com/google/go-cmp v0.2.0
+  github.com/google/go-github/v28 v28.1.1
   github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
   github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
   github.com/kr/pty v1.1.1
@@ -17,5 +18,6 @@ require (
   github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
   github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
   golang.org/x/net v0.0.0-20190311183353-d8887717615a
+  golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6
   golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
 )
diff --git a/go.sum b/go.sum
index 7a0bc175a..434770beb 100644
--- a/go.sum
+++ b/go.sum
@@ -4,6 +4,7 @@ github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFU
 github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
 github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
 github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
+github.com/google/go-github/v28 v28.1.1/go.mod h1:bsqJWQX05omyWVmc00nEUql9mhQyv38lDZ8kPZcQVoM=
 github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk=
 github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
@@ -13,6 +14,7 @@ github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e/go.mod h1:+S
 github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936/go.mod h1:ZjcWmFBXmLKZu9Nxj3WKYEafiSqer2rnvPr0en9UNpI=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
diff --git a/tools/issue_reviver/BUILD b/tools/issue_reviver/BUILD
new file mode 100644
index 000000000..ee7ea11fd
--- /dev/null
+++ b/tools/issue_reviver/BUILD
@@ -0,0 +1,12 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "issue_reviver",
+    srcs = ["main.go"],
+    deps = [
+        "//tools/issue_reviver/github",
+        "//tools/issue_reviver/reviver",
+    ],
+)
diff --git a/tools/issue_reviver/github/BUILD b/tools/issue_reviver/github/BUILD
new file mode 100644
index 000000000..6da22ba1c
--- /dev/null
+++ b/tools/issue_reviver/github/BUILD
@@ -0,0 +1,17 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "github",
+    srcs = ["github.go"],
+    importpath = "gvisor.dev/gvisor/tools/issue_reviver/github",
+    visibility = [
+        "//tools/issue_reviver:__subpackages__",
+    ],
+    deps = [
+        "//tools/issue_reviver/reviver",
+        "@com_github_google_go-github//github:go_default_library",
+        "@org_golang_x_oauth2//:go_default_library",
+    ],
+)
diff --git a/tools/issue_reviver/github/github.go b/tools/issue_reviver/github/github.go
new file mode 100644
index 000000000..e07949c8f
--- /dev/null
+++ b/tools/issue_reviver/github/github.go
@@ -0,0 +1,164 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package github implements reviver.Bugger interface on top of Github issues.
+package github
+
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/google/go-github/github"
+	"golang.org/x/oauth2"
+	"gvisor.dev/gvisor/tools/issue_reviver/reviver"
+)
+
+// Bugger implements reviver.Bugger interface for github issues.
+type Bugger struct {
+	owner  string
+	repo   string
+	dryRun bool
+
+	client *github.Client
+	issues map[int]*github.Issue
+}
+
+// NewBugger creates a new Bugger.
+func NewBugger(token, owner, repo string, dryRun bool) (*Bugger, error) {
+	b := &Bugger{
+		owner:  owner,
+		repo:   repo,
+		dryRun: dryRun,
+		issues: map[int]*github.Issue{},
+	}
+	if err := b.load(token); err != nil {
+		return nil, err
+	}
+	return b, nil
+}
+
+func (b *Bugger) load(token string) error {
+	ctx := context.Background()
+	if len(token) == 0 {
+		fmt.Print("No OAUTH token provided, using unauthenticated account.\n")
+		b.client = github.NewClient(nil)
+	} else {
+		ts := oauth2.StaticTokenSource(
+			&oauth2.Token{AccessToken: token},
+		)
+		tc := oauth2.NewClient(ctx, ts)
+		b.client = github.NewClient(tc)
+	}
+
+	err := processAllPages(func(listOpts github.ListOptions) (*github.Response, error) {
+		opts := &github.IssueListByRepoOptions{State: "open", ListOptions: listOpts}
+		tmps, resp, err := b.client.Issues.ListByRepo(ctx, b.owner, b.repo, opts)
+		if err != nil {
+			return resp, err
+		}
+		for _, issue := range tmps {
+			b.issues[issue.GetNumber()] = issue
+		}
+		return resp, nil
+	})
+	if err != nil {
+		return err
+	}
+
+	fmt.Printf("Loaded %d issues from github.com/%s/%s\n", len(b.issues), b.owner, b.repo)
+	return nil
+}
+
+// Activate implements reviver.Bugger.
+func (b *Bugger) Activate(todo *reviver.Todo) (bool, error) {
+	const prefix = "gvisor.dev/issue/"
+
+	// First check if I can handle the TODO.
+	idStr := strings.TrimPrefix(todo.Issue, prefix)
+	if len(todo.Issue) == len(idStr) {
+		return false, nil
+	}
+
+	id, err := strconv.Atoi(idStr)
+	if err != nil {
+		return true, err
+	}
+
+	// Check against active issues cache.
+	if _, ok := b.issues[id]; ok {
+		fmt.Printf("%q is active: OK\n", todo.Issue)
+		return true, nil
+	}
+
+	fmt.Printf("%q is not active: reopening issue %d\n", todo.Issue, id)
+
+	// Format comment with TODO locations and search link.
+	comment := strings.Builder{}
+	fmt.Fprintln(&comment, "There are TODOs still referencing this issue:")
+	for _, l := range todo.Locations {
+		fmt.Fprintf(&comment,
+			"1. [%s:%d](https://github.com/%s/%s/blob/HEAD/%s#%d): %s\n",
+			l.File, l.Line, b.owner, b.repo, l.File, l.Line, l.Comment)
+	}
+	fmt.Fprintf(&comment,
+		"\n\nSearch [TODO](https://github.com/%s/%s/search?q=%%22%s%d%%22)", b.owner, b.repo, prefix, id)
+
+	if b.dryRun {
+		fmt.Printf("[dry-run: skipping change to issue %d]\n%s\n=======================\n", id, comment.String())
+		return true, nil
+	}
+
+	ctx := context.Background()
+	req := &github.IssueRequest{State: github.String("open")}
+	_, _, err = b.client.Issues.Edit(ctx, b.owner, b.repo, id, req)
+	if err != nil {
+		return true, fmt.Errorf("failed to reactivate issue %d: %v", id, err)
+	}
+
+	cmt := &github.IssueComment{
+		Body:      github.String(comment.String()),
+		Reactions: &github.Reactions{Confused: github.Int(1)},
+	}
+	if _, _, err := b.client.Issues.CreateComment(ctx, b.owner, b.repo, id, cmt); err != nil {
+		return true, fmt.Errorf("failed to add comment to issue %d: %v", id, err)
+	}
+
+	return true, nil
+}
+
+func processAllPages(fn func(github.ListOptions) (*github.Response, error)) error {
+	opts := github.ListOptions{PerPage: 1000}
+	for {
+		resp, err := fn(opts)
+		if err != nil {
+			if rateErr, ok := err.(*github.RateLimitError); ok {
+				duration := rateErr.Rate.Reset.Sub(time.Now())
+				if duration > 5*time.Minute {
+					return fmt.Errorf("Rate limited for too long: %v", duration)
+				}
+				fmt.Printf("Rate limited, sleeping for: %v\n", duration)
+				time.Sleep(duration)
+				continue
+			}
+			return err
+		}
+		if resp.NextPage == 0 {
+			return nil
+		}
+		opts.Page = resp.NextPage
+	}
+}
diff --git a/tools/issue_reviver/main.go b/tools/issue_reviver/main.go
new file mode 100644
index 000000000..4256f5a6c
--- /dev/null
+++ b/tools/issue_reviver/main.go
@@ -0,0 +1,89 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package main is the entry point for issue_reviver.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"os"
+
+	"gvisor.dev/gvisor/tools/issue_reviver/github"
+	"gvisor.dev/gvisor/tools/issue_reviver/reviver"
+)
+
+var (
+	owner     string
+	repo      string
+	tokenFile string
+	path      string
+	dryRun    bool
+)
+
+// Keep the options simple for now. Supports only a single path and repo.
+func init() {
+	flag.StringVar(&owner, "owner", "google", "Github project org/owner to look for issues")
+	flag.StringVar(&repo, "repo", "gvisor", "Github repo to look for issues")
+	flag.StringVar(&tokenFile, "oauth-token-file", "", "Path to file containing the OAUTH token to be used as credential to github")
+	flag.StringVar(&path, "path", "", "Path to scan for TODOs")
+	flag.BoolVar(&dryRun, "dry-run", false, "If set to true, no changes are made to issues")
+}
+
+func main() {
+	flag.Parse()
+
+	// Check for mandatory parameters.
+	if len(owner) == 0 {
+		fmt.Println("missing --owner option.")
+		flag.Usage()
+		os.Exit(1)
+	}
+	if len(repo) == 0 {
+		fmt.Println("missing --repo option.")
+		flag.Usage()
+		os.Exit(1)
+	}
+	if len(path) == 0 {
+		fmt.Println("missing --path option.")
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	// Token is passed as a file so it doesn't show up in command line arguments.
+	var token string
+	if len(tokenFile) != 0 {
+		bytes, err := ioutil.ReadFile(tokenFile)
+		if err != nil {
+			fmt.Println(err.Error())
+			os.Exit(1)
+		}
+		token = string(bytes)
+	}
+
+	bugger, err := github.NewBugger(token, owner, repo, dryRun)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "Error getting github issues:", err)
+		os.Exit(1)
+	}
+	rev := reviver.New([]string{path}, []reviver.Bugger{bugger})
+	if errs := rev.Run(); len(errs) > 0 {
+		fmt.Fprintf(os.Stderr, "Encountered %d errors:\n", len(errs))
+		for _, err := range errs {
+			fmt.Fprintf(os.Stderr, "\t%v\n", err)
+		}
+		os.Exit(1)
+	}
+}
diff --git a/tools/issue_reviver/reviver/BUILD b/tools/issue_reviver/reviver/BUILD
new file mode 100644
index 000000000..2c3675977
--- /dev/null
+++ b/tools/issue_reviver/reviver/BUILD
@@ -0,0 +1,19 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "reviver",
+    srcs = ["reviver.go"],
+    importpath = "gvisor.dev/gvisor/tools/issue_reviver/reviver",
+    visibility = [
+        "//tools/issue_reviver:__subpackages__",
+    ],
+)
+
+go_test(
+    name = "reviver_test",
+    size = "small",
+    srcs = ["reviver_test.go"],
+    embed = [":reviver"],
+)
diff --git a/tools/issue_reviver/reviver/reviver.go b/tools/issue_reviver/reviver/reviver.go
new file mode 100644
index 000000000..682db0c01
--- /dev/null
+++ b/tools/issue_reviver/reviver/reviver.go
@@ -0,0 +1,192 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package reviver scans the code looking for TODOs and pass them to registered
+// Buggers to ensure TODOs point to active issues.
+package reviver
+
+import (
+	"bufio"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"regexp"
+	"sync"
+)
+
+// This is how a TODO looks like.
+var regexTodo = regexp.MustCompile(`(\/\/|#)\s*(TODO|FIXME)\(([a-zA-Z0-9.\/]+)\):\s*(.+)`)
+
+// Bugger interface is called for every TODO found in the code. If it can handle
+// the TODO, it must return true. If it returns false, the next Bugger is
+// called. If no Bugger handles the TODO, it's dropped on the floor.
+type Bugger interface {
+	Activate(todo *Todo) (bool, error)
+}
+
+// Location saves the location where the TODO was found.
+type Location struct {
+	Comment string
+	File    string
+	Line    uint
+}
+
+// Todo represents a unique TODO. There can be several TODOs pointing to the
+// same issue in the code. They are all grouped together.
+type Todo struct {
+	Issue     string
+	Locations []Location
+}
+
+// Reviver scans the given paths for TODOs and calls Buggers to handle them.
+type Reviver struct {
+	paths   []string
+	buggers []Bugger
+
+	mu    sync.Mutex
+	todos map[string]*Todo
+	errs  []error
+}
+
+// New create a new Reviver.
+func New(paths []string, buggers []Bugger) *Reviver {
+	return &Reviver{
+		paths:   paths,
+		buggers: buggers,
+		todos:   map[string]*Todo{},
+	}
+}
+
+// Run runs. It returns all errors found during processing, it doesn't stop
+// on errors.
+func (r *Reviver) Run() []error {
+	// Process each directory in parallel.
+	wg := sync.WaitGroup{}
+	for _, path := range r.paths {
+		wg.Add(1)
+		go func(path string) {
+			defer wg.Done()
+			r.processPath(path, &wg)
+		}(path)
+	}
+
+	wg.Wait()
+
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	fmt.Printf("Processing %d TODOs (%d errors)...\n", len(r.todos), len(r.errs))
+	dropped := 0
+	for _, todo := range r.todos {
+		ok, err := r.processTodo(todo)
+		if err != nil {
+			r.errs = append(r.errs, err)
+		}
+		if !ok {
+			dropped++
+		}
+	}
+	fmt.Printf("Processed %d TODOs, %d were skipped (%d errors)\n", len(r.todos)-dropped, dropped, len(r.errs))
+
+	return r.errs
+}
+
+func (r *Reviver) processPath(path string, wg *sync.WaitGroup) {
+	fmt.Printf("Processing dir %q\n", path)
+	fis, err := ioutil.ReadDir(path)
+	if err != nil {
+		r.addErr(fmt.Errorf("error processing dir %q: %v", path, err))
+		return
+	}
+
+	for _, fi := range fis {
+		childPath := filepath.Join(path, fi.Name())
+		switch {
+		case fi.Mode().IsDir():
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				r.processPath(childPath, wg)
+			}()
+
+		case fi.Mode().IsRegular():
+			file, err := os.Open(childPath)
+			if err != nil {
+				r.addErr(err)
+				continue
+			}
+
+			scanner := bufio.NewScanner(file)
+			lineno := uint(0)
+			for scanner.Scan() {
+				lineno++
+				line := scanner.Text()
+				if todo := r.processLine(line, childPath, lineno); todo != nil {
+					r.addTodo(todo)
+				}
+			}
+		}
+	}
+}
+
+func (r *Reviver) processLine(line, path string, lineno uint) *Todo {
+	matches := regexTodo.FindStringSubmatch(line)
+	if matches == nil {
+		return nil
+	}
+	if len(matches) != 5 {
+		panic(fmt.Sprintf("regex returned wrong matches for %q: %v", line, matches))
+	}
+	return &Todo{
+		Issue: matches[3],
+		Locations: []Location{
+			{
+				File:    path,
+				Line:    lineno,
+				Comment: matches[4],
+			},
+		},
+	}
+}
+
+func (r *Reviver) addTodo(newTodo *Todo) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if todo := r.todos[newTodo.Issue]; todo == nil {
+		r.todos[newTodo.Issue] = newTodo
+	} else {
+		todo.Locations = append(todo.Locations, newTodo.Locations...)
+	}
+}
+
+func (r *Reviver) addErr(err error) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.errs = append(r.errs, err)
+}
+
+func (r *Reviver) processTodo(todo *Todo) (bool, error) {
+	for _, bugger := range r.buggers {
+		ok, err := bugger.Activate(todo)
+		if err != nil {
+			return false, err
+		}
+		if ok {
+			return true, nil
+		}
+	}
+	return false, nil
+}
diff --git a/tools/issue_reviver/reviver/reviver_test.go b/tools/issue_reviver/reviver/reviver_test.go
new file mode 100644
index 000000000..a9fb1f9f1
--- /dev/null
+++ b/tools/issue_reviver/reviver/reviver_test.go
@@ -0,0 +1,88 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package reviver
+
+import (
+	"testing"
+)
+
+func TestProcessLine(t *testing.T) {
+	for _, tc := range []struct {
+		line string
+		want *Todo
+	}{
+		{
+			line: "// TODO(foobar.com/issue/123): comment, bla. blabla.",
+			want: &Todo{
+				Issue: "foobar.com/issue/123",
+				Locations: []Location{
+					{Comment: "comment, bla. blabla."},
+				},
+			},
+		},
+		{
+			line: "// FIXME(b/123): internal bug",
+			want: &Todo{
+				Issue: "b/123",
+				Locations: []Location{
+					{Comment: "internal bug"},
+				},
+			},
+		},
+		{
+			line: "TODO(issue): not todo",
+		},
+		{
+			line: "FIXME(issue): not todo",
+		},
+		{
+			line: "// TODO (issue): not todo",
+		},
+		{
+			line: "// TODO(issue) not todo",
+		},
+		{
+			line: "// todo(issue): not todo",
+		},
+		{
+			line: "// TODO(issue):",
+		},
+	} {
+		t.Logf("Testing: %s", tc.line)
+		r := Reviver{}
+		got := r.processLine(tc.line, "test", 0)
+		if got == nil {
+			if tc.want != nil {
+				t.Errorf("failed to process line, want: %+v", tc.want)
+			}
+		} else {
+			if tc.want == nil {
+				t.Errorf("expected error, got: %+v", got)
+				continue
+			}
+			if got.Issue != tc.want.Issue {
+				t.Errorf("wrong issue, got: %v, want: %v", got.Issue, tc.want.Issue)
+			}
+			if len(got.Locations) != len(tc.want.Locations) {
+				t.Errorf("wrong number of locations, got: %v, want: %v, locations: %+v", len(got.Locations), len(tc.want.Locations), got.Locations)
+			}
+			for i, wantLoc := range tc.want.Locations {
+				if got.Locations[i].Comment != wantLoc.Comment {
+					t.Errorf("wrong comment, got: %v, want: %v", got.Locations[i].Comment, wantLoc.Comment)
+				}
+			}
+		}
+	}
+}
-- 
cgit v1.2.3


From b3ae8a62cfdf13821d35467d4150ed983ac556f1 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Wed, 8 Jan 2020 16:29:12 -0800
Subject: Fix slice bounds out of range panic in parsing socket control
 message.

Panic found by syzakller.

PiperOrigin-RevId: 288799046
---
 pkg/sentry/socket/control/control.go     |  6 ++++++
 test/syscalls/linux/socket_ip_unbound.cc | 33 ++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index af1a4e95f..4301b697c 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -471,6 +471,9 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 		case linux.SOL_IP:
 			switch h.Type {
 			case linux.IP_TOS:
+				if length < linux.SizeOfControlMessageTOS {
+					return socket.ControlMessages{}, syserror.EINVAL
+				}
 				cmsgs.IP.HasTOS = true
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS)
 				i += AlignUp(length, width)
@@ -481,6 +484,9 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 		case linux.SOL_IPV6:
 			switch h.Type {
 			case linux.IPV6_TCLASS:
+				if length < linux.SizeOfControlMessageTClass {
+					return socket.ControlMessages{}, syserror.EINVAL
+				}
 				cmsgs.IP.HasTClass = true
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], usermem.ByteOrder, &cmsgs.IP.TClass)
 				i += AlignUp(length, width)
diff --git a/test/syscalls/linux/socket_ip_unbound.cc b/test/syscalls/linux/socket_ip_unbound.cc
index b6754111f..ca597e267 100644
--- a/test/syscalls/linux/socket_ip_unbound.cc
+++ b/test/syscalls/linux/socket_ip_unbound.cc
@@ -129,6 +129,7 @@ TEST_P(IPUnboundSocketTest, InvalidNegativeTtl) {
 struct TOSOption {
   int level;
   int option;
+  int cmsg_level;
 };
 
 constexpr int INET_ECN_MASK = 3;
@@ -139,10 +140,12 @@ static TOSOption GetTOSOption(int domain) {
     case AF_INET:
       opt.level = IPPROTO_IP;
       opt.option = IP_TOS;
+      opt.cmsg_level = SOL_IP;
       break;
     case AF_INET6:
       opt.level = IPPROTO_IPV6;
       opt.option = IPV6_TCLASS;
+      opt.cmsg_level = SOL_IPV6;
       break;
   }
   return opt;
@@ -386,6 +389,36 @@ TEST_P(IPUnboundSocketTest, NullTOS) {
               SyscallFailsWithErrno(EFAULT));
 }
 
+TEST_P(IPUnboundSocketTest, InsufficientBufferTOS) {
+  SKIP_IF(GetParam().protocol == IPPROTO_TCP);
+
+  auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  TOSOption t = GetTOSOption(GetParam().domain);
+
+  in_addr addr4;
+  in6_addr addr6;
+  ASSERT_THAT(inet_pton(AF_INET, "127.0.0.1", &addr4), ::testing::Eq(1));
+  ASSERT_THAT(inet_pton(AF_INET6, "fe80::", &addr6), ::testing::Eq(1));
+
+  cmsghdr cmsg = {};
+  cmsg.cmsg_len = sizeof(cmsg);
+  cmsg.cmsg_level = t.cmsg_level;
+  cmsg.cmsg_type = t.option;
+
+  msghdr msg = {};
+  msg.msg_control = &cmsg;
+  msg.msg_controllen = sizeof(cmsg);
+  if (GetParam().domain == AF_INET) {
+    msg.msg_name = &addr4;
+    msg.msg_namelen = sizeof(addr4);
+  } else {
+    msg.msg_name = &addr6;
+    msg.msg_namelen = sizeof(addr6);
+  }
+
+  EXPECT_THAT(sendmsg(socket->get(), &msg, 0), SyscallFailsWithErrno(EINVAL));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     IPUnboundSockets, IPUnboundSocketTest,
     ::testing::ValuesIn(VecCat<SocketKind>(VecCat<SocketKind>(
-- 
cgit v1.2.3


From f26a576984052a235b63ec79081a8c4a8c8ffc00 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 8 Jan 2020 16:35:01 -0800
Subject: Addressed GH comments

---
 pkg/abi/linux/netfilter.go               | 15 ++++--
 pkg/sentry/socket/netfilter/netfilter.go | 87 +++++++++++++-------------------
 pkg/sentry/socket/netstack/netstack.go   |  5 +-
 3 files changed, 47 insertions(+), 60 deletions(-)

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index 35d66d622..c4f4ea0b1 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -298,6 +298,7 @@ type IPTReplace struct {
 	// Entries [0]IPTEntry
 }
 
+// KernelIPTEntry is identical to IPTReplace, but includes the Entries field.
 type KernelIPTReplace struct {
 	IPTReplace
 	Entries [0]IPTEntry
@@ -306,28 +307,32 @@ type KernelIPTReplace struct {
 // SizeOfIPTReplace is the size of an IPTReplace.
 const SizeOfIPTReplace = 96
 
+// ExtensionName holds the name of a netfilter extension.
 type ExtensionName [XT_EXTENSION_MAXNAMELEN]byte
 
 // String implements fmt.Stringer.
 func (en ExtensionName) String() string {
-	return name(en[:])
+	return goString(en[:])
 }
 
+// ExtensionName holds the name of a netfilter table.
 type TableName [XT_TABLE_MAXNAMELEN]byte
 
 // String implements fmt.Stringer.
 func (tn TableName) String() string {
-	return name(tn[:])
+	return goString(tn[:])
 }
 
+// ExtensionName holds the name of a netfilter error. These can also hold
+// user-defined chains.
 type ErrorName [XT_FUNCTION_MAXNAMELEN]byte
 
 // String implements fmt.Stringer.
-func (fn ErrorName) String() string {
-	return name(fn[:])
+func (en ErrorName) String() string {
+	return goString(en[:])
 }
 
-func name(cstring []byte) string {
+func goString(cstring []byte) string {
 	for i, c := range cstring {
 		if c == 0 {
 			return string(cstring[:i])
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 347342f98..799865b03 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -53,7 +53,7 @@ func GetInfo(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr) (linux.IPTG
 	}
 
 	// Find the appropriate table.
-	table, err := findTable(ep, info.Name.String())
+	table, err := findTable(ep, info.Name)
 	if err != nil {
 		return linux.IPTGetinfo{}, err
 	}
@@ -84,7 +84,7 @@ func GetEntries(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr, outLen i
 	}
 
 	// Find the appropriate table.
-	table, err := findTable(ep, userEntries.Name.String())
+	table, err := findTable(ep, userEntries.Name)
 	if err != nil {
 		return linux.KernelIPTGetEntries{}, err
 	}
@@ -96,19 +96,19 @@ func GetEntries(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr, outLen i
 		return linux.KernelIPTGetEntries{}, err
 	}
 	if binary.Size(entries) > uintptr(outLen) {
-		log.Infof("Insufficient GetEntries output size: %d", uintptr(outLen))
+		log.Warningf("Insufficient GetEntries output size: %d", uintptr(outLen))
 		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
 	}
 
 	return entries, nil
 }
 
-func findTable(ep tcpip.Endpoint, tableName string) (iptables.Table, *syserr.Error) {
+func findTable(ep tcpip.Endpoint, tablename linux.TableName) (iptables.Table, *syserr.Error) {
 	ipt, err := ep.IPTables()
 	if err != nil {
 		return iptables.Table{}, syserr.FromError(err)
 	}
-	table, ok := ipt.Tables[tableName]
+	table, ok := ipt.Tables[tablename.String()]
 	if !ok {
 		return iptables.Table{}, syserr.ErrInvalidArgument
 	}
@@ -138,17 +138,17 @@ func FillDefaultIPTables(stack *stack.Stack) {
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
 // jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(name string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) {
+func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) {
 	// Return values.
 	var entries linux.KernelIPTGetEntries
 	var meta metadata
 
 	// The table name has to fit in the struct.
-	if linux.XT_TABLE_MAXNAMELEN < len(name) {
-		log.Infof("Table name too long.")
+	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
+		log.Warningf("Table name %q too long.", tablename)
 		return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
 	}
-	copy(entries.Name[:], name)
+	copy(entries.Name[:], tablename)
 
 	for ruleIdx, rule := range table.Rules {
 		// Is this a chain entry point?
@@ -273,11 +273,12 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
 	case -linux.NF_DROP - 1:
 		return iptables.Drop, nil
 	case -linux.NF_QUEUE - 1:
-		log.Infof("Unsupported iptables verdict QUEUE.")
+		log.Warningf("Unsupported iptables verdict QUEUE.")
 	case linux.NF_RETURN:
-		log.Infof("Unsupported iptables verdict RETURN.")
+		log.Warningf("Unsupported iptables verdict RETURN.")
+	default:
+		log.Warningf("Unknown iptables verdict %d.", val)
 	}
-	log.Infof("Unknown iptables verdict %d.", val)
 	return iptables.Invalid, syserr.ErrInvalidArgument
 }
 
@@ -288,7 +289,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
-		log.Infof("netfilter.SetEntries: optVal has insufficient size for replace %d", len(optVal))
+		log.Warningf("netfilter.SetEntries: optVal has insufficient size for replace %d", len(optVal))
 		return syserr.ErrInvalidArgument
 	}
 	var replace linux.IPTReplace
@@ -302,7 +303,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	case iptables.TablenameFilter:
 		table = iptables.EmptyFilterTable()
 	default:
-		log.Infof(fmt.Sprintf("We don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()))
+		log.Warningf("We don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
 		return syserr.ErrInvalidArgument
 	}
 
@@ -312,7 +313,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
 		// Get the struct ipt_entry.
 		if len(optVal) < linux.SizeOfIPTEntry {
-			log.Infof("netfilter: optVal has insufficient size for entry %d", len(optVal))
+			log.Warningf("netfilter: optVal has insufficient size for entry %d", len(optVal))
 			return syserr.ErrInvalidArgument
 		}
 		var entry linux.IPTEntry
@@ -328,7 +329,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		// filtering. We reject any nonzero IPTIP values for now.
 		emptyIPTIP := linux.IPTIP{}
 		if entry.IP != emptyIPTIP {
-			log.Infof("netfilter: non-empty struct iptip found")
+			log.Warningf("netfilter: non-empty struct iptip found")
 			return syserr.ErrInvalidArgument
 		}
 
@@ -358,11 +359,11 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 				}
 			}
 			if ruleIdx := table.BuiltinChains[hk]; ruleIdx == iptables.HookUnset {
-				log.Infof("Hook %v is unset.", hk)
+				log.Warningf("Hook %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
 			if ruleIdx := table.Underflows[hk]; ruleIdx == iptables.HookUnset {
-				log.Infof("Underflow %v is unset.", hk)
+				log.Warningf("Underflow %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
 		}
@@ -385,7 +386,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 // along with the number of bytes it occupies in optVal.
 func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 	if len(optVal) < linux.SizeOfXTEntryTarget {
-		log.Infof("netfilter: optVal has insufficient size for entry target %d", len(optVal))
+		log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
 		return nil, 0, syserr.ErrInvalidArgument
 	}
 	var target linux.XTEntryTarget
@@ -395,14 +396,14 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 	case "":
 		// Standard target.
 		if len(optVal) < linux.SizeOfXTStandardTarget {
-			log.Infof("netfilter.SetEntries: optVal has insufficient size for standard target %d", len(optVal))
+			log.Warningf("netfilter.SetEntries: optVal has insufficient size for standard target %d", len(optVal))
 			return nil, 0, syserr.ErrInvalidArgument
 		}
-		var target linux.XTStandardTarget
+		var standardTarget linux.XTStandardTarget
 		buf = optVal[:linux.SizeOfXTStandardTarget]
-		binary.Unmarshal(buf, usermem.ByteOrder, &target)
+		binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
 
-		verdict, err := translateToStandardVerdict(target.Verdict)
+		verdict, err := translateToStandardVerdict(standardTarget.Verdict)
 		if err != nil {
 			return nil, 0, err
 		}
@@ -424,9 +425,9 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 			log.Infof("netfilter.SetEntries: optVal has insufficient size for error target %d", len(optVal))
 			return nil, 0, syserr.ErrInvalidArgument
 		}
-		var target linux.XTErrorTarget
+		var errorTarget linux.XTErrorTarget
 		buf = optVal[:linux.SizeOfXTErrorTarget]
-		binary.Unmarshal(buf, usermem.ByteOrder, &target)
+		binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget)
 
 		// Error targets are used in 2 cases:
 		// * An actual error case. These rules have an error
@@ -435,11 +436,11 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 		//   somehow fall through every rule.
 		// * To mark the start of a user defined chain. These
 		//   rules have an error with the name of the chain.
-		switch target.Name.String() {
+		switch errorTarget.Name.String() {
 		case errorTargetName:
 			return iptables.PanicTarget{}, linux.SizeOfXTErrorTarget, nil
 		default:
-			log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", target.Name.String())
+			log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
 			return nil, 0, syserr.ErrInvalidArgument
 		}
 	}
@@ -449,22 +450,6 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 	return nil, 0, syserr.ErrInvalidArgument
 }
 
-func chainNameFromHook(hook int) string {
-	switch hook {
-	case linux.NF_INET_PRE_ROUTING:
-		return iptables.ChainNamePrerouting
-	case linux.NF_INET_LOCAL_IN:
-		return iptables.ChainNameInput
-	case linux.NF_INET_FORWARD:
-		return iptables.ChainNameForward
-	case linux.NF_INET_LOCAL_OUT:
-		return iptables.ChainNameOutput
-	case linux.NF_INET_POST_ROUTING:
-		return iptables.ChainNamePostrouting
-	}
-	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain"))
-}
-
 func hookFromLinux(hook int) iptables.Hook {
 	switch hook {
 	case linux.NF_INET_PRE_ROUTING:
@@ -489,7 +474,7 @@ func printReplace(optVal []byte) {
 	replaceBuf := optVal[:linux.SizeOfIPTReplace]
 	optVal = optVal[linux.SizeOfIPTReplace:]
 	binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace)
-	log.Infof("kevin: Replacing table %q: %+v", replace.Name.String(), replace)
+	log.Infof("Replacing table %q: %+v", replace.Name.String(), replace)
 
 	// Read in the list of entries at the end of replace.
 	var totalOffset uint16
@@ -497,33 +482,33 @@ func printReplace(optVal []byte) {
 		var entry linux.IPTEntry
 		entryBuf := optVal[:linux.SizeOfIPTEntry]
 		binary.Unmarshal(entryBuf, usermem.ByteOrder, &entry)
-		log.Infof("kevin: Entry %d (total offset %d): %+v", entryIdx, totalOffset, entry)
+		log.Infof("Entry %d (total offset %d): %+v", entryIdx, totalOffset, entry)
 
 		totalOffset += entry.NextOffset
 		if entry.TargetOffset == linux.SizeOfIPTEntry {
-			log.Infof("kevin: Entry has no matches.")
+			log.Infof("Entry has no matches.")
 		} else {
-			log.Infof("kevin: Entry has matches.")
+			log.Infof("Entry has matches.")
 		}
 
 		var target linux.XTEntryTarget
 		targetBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTEntryTarget]
 		binary.Unmarshal(targetBuf, usermem.ByteOrder, &target)
-		log.Infof("kevin: Target named %q: %+v", target.Name.String(), target)
+		log.Infof("Target named %q: %+v", target.Name.String(), target)
 
 		switch target.Name.String() {
 		case "":
 			var standardTarget linux.XTStandardTarget
 			stBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTStandardTarget]
 			binary.Unmarshal(stBuf, usermem.ByteOrder, &standardTarget)
-			log.Infof("kevin: Standard target with verdict %q (%d).", linux.VerdictStrings[standardTarget.Verdict], standardTarget.Verdict)
+			log.Infof("Standard target with verdict %q (%d).", linux.VerdictStrings[standardTarget.Verdict], standardTarget.Verdict)
 		case errorTargetName:
 			var errorTarget linux.XTErrorTarget
 			etBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTErrorTarget]
 			binary.Unmarshal(etBuf, usermem.ByteOrder, &errorTarget)
-			log.Infof("kevin: Error target with name %q.", errorTarget.Name.String())
+			log.Infof("Error target with name %q.", errorTarget.Name.String())
 		default:
-			log.Infof("kevin: Unknown target type.")
+			log.Infof("Unknown target type.")
 		}
 
 		optVal = optVal[entry.NextOffset:]
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 8c07eef4b..cd3dd1a53 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1368,10 +1368,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
 				return syserr.ErrNoDevice
 			}
 			// Stack must be a netstack stack.
-			if err := netfilter.SetEntries(stack.(*Stack).Stack, optVal); err != nil {
-				return err
-			}
-			return nil
+			return netfilter.SetEntries(stack.(*Stack).Stack, optVal)
 
 		case linux.IPT_SO_SET_ADD_COUNTERS:
 			// TODO(gvisor.dev/issue/170): Counter support.
-- 
cgit v1.2.3


From 565b64148314018e1234196182b55c4f01772e77 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 8 Jan 2020 16:32:50 -0800
Subject: Define sizes for extent headers and entries separately to improve
 clarity.

PiperOrigin-RevId: 288799694
---
 pkg/sentry/fsimpl/ext/disklayout/extent.go      | 10 +++++++---
 pkg/sentry/fsimpl/ext/disklayout/extent_test.go |  6 +++---
 pkg/sentry/fsimpl/ext/extent_file.go            |  8 ++++----
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go
index 567523d32..4110649ab 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go
@@ -29,8 +29,12 @@ package disklayout
 //       byte (i * sb.BlockSize()) to ((i+1) * sb.BlockSize()).
 
 const (
-	// ExtentStructsSize is the size of all the three extent on-disk structs.
-	ExtentStructsSize = 12
+	// ExtentHeaderSize is the size of the header of an extent tree node.
+	ExtentHeaderSize = 12
+
+	// ExtentEntrySize is the size of an entry in an extent tree node.
+	// This size is the same for both leaf and internal nodes.
+	ExtentEntrySize = 12
 
 	// ExtentMagic is the magic number which must be present in the header.
 	ExtentMagic = 0xf30a
@@ -57,7 +61,7 @@ type ExtentNode struct {
 	Entries []ExtentEntryPair
 }
 
-// ExtentEntry reprsents an extent tree node entry. The entry can either be
+// ExtentEntry represents an extent tree node entry. The entry can either be
 // an ExtentIdx or Extent itself. This exists to simplify navigation logic.
 type ExtentEntry interface {
 	// FileBlock returns the first file block number covered by this entry.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
index b0fad9b71..8762b90db 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
@@ -21,7 +21,7 @@ import (
 // TestExtentSize tests that the extent structs are of the correct
 // size.
 func TestExtentSize(t *testing.T) {
-	assertSize(t, ExtentHeader{}, ExtentStructsSize)
-	assertSize(t, ExtentIdx{}, ExtentStructsSize)
-	assertSize(t, Extent{}, ExtentStructsSize)
+	assertSize(t, ExtentHeader{}, ExtentHeaderSize)
+	assertSize(t, ExtentIdx{}, ExtentEntrySize)
+	assertSize(t, Extent{}, ExtentEntrySize)
 }
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index 3d3ebaca6..11dcc0346 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -57,7 +57,7 @@ func newExtentFile(regFile regularFile) (*extentFile, error) {
 func (f *extentFile) buildExtTree() error {
 	rootNodeData := f.regFile.inode.diskInode.Data()
 
-	binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &f.root.Header)
+	binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header)
 
 	// Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
 	if f.root.Header.NumEntries > 4 {
@@ -67,7 +67,7 @@ func (f *extentFile) buildExtTree() error {
 	}
 
 	f.root.Entries = make([]disklayout.ExtentEntryPair, f.root.Header.NumEntries)
-	for i, off := uint16(0), disklayout.ExtentStructsSize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+	for i, off := uint16(0), disklayout.ExtentEntrySize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize {
 		var curEntry disklayout.ExtentEntry
 		if f.root.Header.Height == 0 {
 			// Leaf node.
@@ -76,7 +76,7 @@ func (f *extentFile) buildExtTree() error {
 			// Internal node.
 			curEntry = &disklayout.ExtentIdx{}
 		}
-		binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry)
+		binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry)
 		f.root.Entries[i].Entry = curEntry
 	}
 
@@ -105,7 +105,7 @@ func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*diskla
 	}
 
 	entries := make([]disklayout.ExtentEntryPair, header.NumEntries)
-	for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+	for i, off := uint16(0), off+disklayout.ExtentEntrySize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize {
 		var curEntry disklayout.ExtentEntry
 		if header.Height == 0 {
 			// Leaf node.
-- 
cgit v1.2.3


From fbb2c008e26a7e9d860f6cbf796ea7c375858502 Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Wed, 8 Jan 2020 16:35:43 -0800
Subject: Return correct length with MSG_TRUNC for unix sockets.

This change calls a new Truncate method on the EndpointReader in RecvMsg for
both netlink and unix sockets.  This allows readers such as sockets to peek at
the length of data without actually reading it to a buffer.

Fixes #993 #1240

PiperOrigin-RevId: 288800167
---
 pkg/sentry/socket/netlink/BUILD                   |   1 -
 pkg/sentry/socket/netlink/socket.go               |  29 +++---
 pkg/sentry/socket/unix/io.go                      |  13 +++
 pkg/sentry/socket/unix/unix.go                    |  23 ++++-
 test/syscalls/linux/BUILD                         |   1 -
 test/syscalls/linux/socket_non_stream.cc          | 113 +++++++++++++++++++++-
 test/syscalls/linux/socket_non_stream_blocking.cc |  37 +++++++
 test/syscalls/linux/socket_stream.cc              |  55 ++++++++++-
 8 files changed, 250 insertions(+), 22 deletions(-)

diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 79589e3c8..136821963 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -22,7 +22,6 @@ go_library(
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/netlink/port",
         "//pkg/sentry/socket/unix",
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 4a1b87a9a..d2e3644a6 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -29,7 +29,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
@@ -500,29 +499,29 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have
 	trunc := flags&linux.MSG_TRUNC != 0
 
 	r := unix.EndpointReader{
+		Ctx:      t,
 		Endpoint: s.ep,
 		Peek:     flags&linux.MSG_PEEK != 0,
 	}
 
+	doRead := func() (int64, error) {
+		return dst.CopyOutFrom(t, &r)
+	}
+
 	// If MSG_TRUNC is set with a zero byte destination then we still need
 	// to read the message and discard it, or in the case where MSG_PEEK is
 	// set, leave it be. In both cases the full message length must be
-	// returned. However, the memory manager for the destination will not read
-	// the endpoint if the destination is zero length.
-	//
-	// In order for the endpoint to be read when the destination size is zero,
-	// we must cause a read of the endpoint by using a separate fake zero
-	// length block sequence and calling the EndpointReader directly.
+	// returned.
 	if trunc && dst.Addrs.NumBytes() == 0 {
-		// Perform a read to a zero byte block sequence. We can ignore the
-		// original destination since it was zero bytes. The length returned by
-		// ReadToBlocks is ignored and we return the full message length to comply
-		// with MSG_TRUNC.
-		_, err := r.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(make([]byte, 0))))
-		return int(r.MsgSize), linux.MSG_TRUNC, from, fromLen, socket.ControlMessages{}, syserr.FromError(err)
+		doRead = func() (int64, error) {
+			err := r.Truncate()
+			// Always return zero for bytes read since the destination size is
+			// zero.
+			return 0, err
+		}
 	}
 
-	if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
+	if n, err := doRead(); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
 		var mflags int
 		if n < int64(r.MsgSize) {
 			mflags |= linux.MSG_TRUNC
@@ -540,7 +539,7 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have
 	defer s.EventUnregister(&e)
 
 	for {
-		if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock {
+		if n, err := doRead(); err != syserror.ErrWouldBlock {
 			var mflags int
 			if n < int64(r.MsgSize) {
 				mflags |= linux.MSG_TRUNC
diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go
index 2ec1a662d..2447f24ef 100644
--- a/pkg/sentry/socket/unix/io.go
+++ b/pkg/sentry/socket/unix/io.go
@@ -83,6 +83,19 @@ type EndpointReader struct {
 	ControlTrunc bool
 }
 
+// Truncate calls RecvMsg on the endpoint without writing to a destination.
+func (r *EndpointReader) Truncate() error {
+	// Ignore bytes read since it will always be zero.
+	_, ms, c, ct, err := r.Endpoint.RecvMsg(r.Ctx, [][]byte{}, r.Creds, r.NumRights, r.Peek, r.From)
+	r.Control = c
+	r.ControlTrunc = ct
+	r.MsgSize = ms
+	if err != nil {
+		return err.ToError()
+	}
+	return nil
+}
+
 // ReadToBlocks implements safemem.Reader.ReadToBlocks.
 func (r *EndpointReader) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 	return safemem.FromVecReaderFunc{func(bufs [][]byte) (int64, error) {
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 885758054..91effe89a 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -544,8 +544,27 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 	if senderRequested {
 		r.From = &tcpip.FullAddress{}
 	}
+
+	doRead := func() (int64, error) {
+		return dst.CopyOutFrom(t, &r)
+	}
+
+	// If MSG_TRUNC is set with a zero byte destination then we still need
+	// to read the message and discard it, or in the case where MSG_PEEK is
+	// set, leave it be. In both cases the full message length must be
+	// returned.
+	if trunc && dst.Addrs.NumBytes() == 0 {
+		doRead = func() (int64, error) {
+			err := r.Truncate()
+			// Always return zero for bytes read since the destination size is
+			// zero.
+			return 0, err
+		}
+
+	}
+
 	var total int64
-	if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || dontWait {
+	if n, err := doRead(); err != syserror.ErrWouldBlock || dontWait {
 		var from linux.SockAddr
 		var fromLen uint32
 		if r.From != nil && len([]byte(r.From.Addr)) != 0 {
@@ -580,7 +599,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 	defer s.EventUnregister(&e)
 
 	for {
-		if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock {
+		if n, err := doRead(); err != syserror.ErrWouldBlock {
 			var from linux.SockAddr
 			var fromLen uint32
 			if r.From != nil {
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 064ce8429..ce8abe217 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2888,7 +2888,6 @@ cc_library(
         ":unix_domain_socket_test_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "//test/util:timer_util",
         "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
     ],
diff --git a/test/syscalls/linux/socket_non_stream.cc b/test/syscalls/linux/socket_non_stream.cc
index d91c5ed39..c61817f14 100644
--- a/test/syscalls/linux/socket_non_stream.cc
+++ b/test/syscalls/linux/socket_non_stream.cc
@@ -113,7 +113,7 @@ TEST_P(NonStreamSocketPairTest, RecvmsgMsghdrFlagMsgTrunc) {
   EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data)));
 
   // Check that msghdr flags were updated.
-  EXPECT_EQ(msg.msg_flags, MSG_TRUNC);
+  EXPECT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
 }
 
 // Stream sockets allow data sent with multiple sends to be peeked at in a
@@ -193,7 +193,7 @@ TEST_P(NonStreamSocketPairTest, MsgTruncTruncationRecvmsgMsghdrFlagMsgTrunc) {
   EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data)));
 
   // Check that msghdr flags were updated.
-  EXPECT_EQ(msg.msg_flags, MSG_TRUNC);
+  EXPECT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
 }
 
 TEST_P(NonStreamSocketPairTest, MsgTruncSameSize) {
@@ -224,5 +224,114 @@ TEST_P(NonStreamSocketPairTest, MsgTruncNotFull) {
   EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
 }
 
+// This test tests reading from a socket with MSG_TRUNC and a zero length
+// receive buffer. The user should be able to get the message length.
+TEST_P(NonStreamSocketPairTest, RecvmsgMsgTruncZeroLen) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  char sent_data[10];
+  RandomizeBuffer(sent_data, sizeof(sent_data));
+  ASSERT_THAT(
+      RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+      SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  // The receive buffer is of zero length.
+  char received_data[0] = {};
+
+  struct iovec iov;
+  iov.iov_base = received_data;
+  iov.iov_len = sizeof(received_data);
+  struct msghdr msg = {};
+  msg.msg_flags = -1;
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+
+  // The syscall succeeds returning the full size of the message on the socket.
+  ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC),
+              SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  // Check that MSG_TRUNC is set on msghdr flags.
+  EXPECT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+}
+
+// This test tests reading from a socket with MSG_TRUNC | MSG_PEEK and a zero
+// length receive buffer. The user should be able to get the message length
+// without reading data off the socket.
+TEST_P(NonStreamSocketPairTest, RecvmsgMsgTruncMsgPeekZeroLen) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  char sent_data[10];
+  RandomizeBuffer(sent_data, sizeof(sent_data));
+  ASSERT_THAT(
+      RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+      SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  // The receive buffer is of zero length.
+  char peek_data[0] = {};
+
+  struct iovec peek_iov;
+  peek_iov.iov_base = peek_data;
+  peek_iov.iov_len = sizeof(peek_data);
+  struct msghdr peek_msg = {};
+  peek_msg.msg_flags = -1;
+  peek_msg.msg_iov = &peek_iov;
+  peek_msg.msg_iovlen = 1;
+
+  // The syscall succeeds returning the full size of the message on the socket.
+  ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &peek_msg,
+                                  MSG_TRUNC | MSG_PEEK),
+              SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  // Check that MSG_TRUNC is set on msghdr flags because the receive buffer is
+  // smaller than the message size.
+  EXPECT_EQ(peek_msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+
+  char received_data[sizeof(sent_data)] = {};
+
+  struct iovec received_iov;
+  received_iov.iov_base = received_data;
+  received_iov.iov_len = sizeof(received_data);
+  struct msghdr received_msg = {};
+  received_msg.msg_flags = -1;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+
+  // Next we can read the actual data.
+  ASSERT_THAT(
+      RetryEINTR(recvmsg)(sockets->second_fd(), &received_msg, MSG_TRUNC),
+      SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+  // Check that MSG_TRUNC is not set on msghdr flags because we read the whole
+  // message.
+  EXPECT_EQ(received_msg.msg_flags & MSG_TRUNC, 0);
+}
+
+// This test tests reading from a socket with MSG_TRUNC | MSG_PEEK and a zero
+// length receive buffer and MSG_DONTWAIT. The user should be able to get an
+// EAGAIN or EWOULDBLOCK error response.
+TEST_P(NonStreamSocketPairTest, RecvmsgTruncPeekDontwaitZeroLen) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // NOTE: We don't send any data on the socket.
+
+  // The receive buffer is of zero length.
+  char peek_data[0] = {};
+
+  struct iovec peek_iov;
+  peek_iov.iov_base = peek_data;
+  peek_iov.iov_len = sizeof(peek_data);
+  struct msghdr peek_msg = {};
+  peek_msg.msg_flags = -1;
+  peek_msg.msg_iov = &peek_iov;
+  peek_msg.msg_iovlen = 1;
+
+  // recvmsg fails with EAGAIN because no data is available on the socket.
+  ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &peek_msg,
+                                  MSG_TRUNC | MSG_PEEK | MSG_DONTWAIT),
+              SyscallFailsWithErrno(EAGAIN));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_non_stream_blocking.cc b/test/syscalls/linux/socket_non_stream_blocking.cc
index 62d87c1af..b052f6e61 100644
--- a/test/syscalls/linux/socket_non_stream_blocking.cc
+++ b/test/syscalls/linux/socket_non_stream_blocking.cc
@@ -25,6 +25,7 @@
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
+#include "test/util/thread_util.h"
 
 namespace gvisor {
 namespace testing {
@@ -44,5 +45,41 @@ TEST_P(BlockingNonStreamSocketPairTest, RecvLessThanBufferWaitAll) {
               SyscallSucceedsWithValue(sizeof(sent_data)));
 }
 
+// This test tests reading from a socket with MSG_TRUNC | MSG_PEEK and a zero
+// length receive buffer and MSG_DONTWAIT. The recvmsg call should block on
+// reading the data.
+TEST_P(BlockingNonStreamSocketPairTest,
+       RecvmsgTruncPeekDontwaitZeroLenBlocking) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // NOTE: We don't initially send any data on the socket.
+  const int data_size = 10;
+  char sent_data[data_size];
+  RandomizeBuffer(sent_data, data_size);
+
+  // The receive buffer is of zero length.
+  char peek_data[0] = {};
+
+  struct iovec peek_iov;
+  peek_iov.iov_base = peek_data;
+  peek_iov.iov_len = sizeof(peek_data);
+  struct msghdr peek_msg = {};
+  peek_msg.msg_flags = -1;
+  peek_msg.msg_iov = &peek_iov;
+  peek_msg.msg_iovlen = 1;
+
+  ScopedThread t([&]() {
+    // The syscall succeeds returning the full size of the message on the
+    // socket. This should block until there is data on the socket.
+    ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &peek_msg,
+                                    MSG_TRUNC | MSG_PEEK),
+                SyscallSucceedsWithValue(data_size));
+  });
+
+  absl::SleepFor(absl::Seconds(1));
+  ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), sent_data, data_size, 0),
+              SyscallSucceedsWithValue(data_size));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_stream.cc b/test/syscalls/linux/socket_stream.cc
index 346443f96..6522b2e01 100644
--- a/test/syscalls/linux/socket_stream.cc
+++ b/test/syscalls/linux/socket_stream.cc
@@ -104,7 +104,60 @@ TEST_P(StreamSocketPairTest, RecvmsgMsghdrFlagsNoMsgTrunc) {
   EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data)));
 
   // Check that msghdr flags were cleared (MSG_TRUNC was not set).
-  EXPECT_EQ(msg.msg_flags, 0);
+  ASSERT_EQ(msg.msg_flags & MSG_TRUNC, 0);
+}
+
+TEST_P(StreamSocketPairTest, RecvmsgTruncZeroLen) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  char sent_data[10];
+  RandomizeBuffer(sent_data, sizeof(sent_data));
+  ASSERT_THAT(
+      RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+      SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  char received_data[0] = {};
+
+  struct iovec iov;
+  iov.iov_base = received_data;
+  iov.iov_len = sizeof(received_data);
+  struct msghdr msg = {};
+  msg.msg_flags = -1;
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+
+  ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC),
+              SyscallSucceedsWithValue(0));
+
+  // Check that msghdr flags were cleared (MSG_TRUNC was not set).
+  ASSERT_EQ(msg.msg_flags & MSG_TRUNC, 0);
+}
+
+TEST_P(StreamSocketPairTest, RecvmsgTruncPeekZeroLen) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  char sent_data[10];
+  RandomizeBuffer(sent_data, sizeof(sent_data));
+  ASSERT_THAT(
+      RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+      SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  char received_data[0] = {};
+
+  struct iovec iov;
+  iov.iov_base = received_data;
+  iov.iov_len = sizeof(received_data);
+  struct msghdr msg = {};
+  msg.msg_flags = -1;
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+
+  ASSERT_THAT(
+      RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC | MSG_PEEK),
+      SyscallSucceedsWithValue(0));
+
+  // Check that msghdr flags were cleared (MSG_TRUNC was not set).
+  ASSERT_EQ(msg.msg_flags & MSG_TRUNC, 0);
 }
 
 TEST_P(StreamSocketPairTest, MsgTrunc) {
-- 
cgit v1.2.3


From ae060a63d9ad1bfb65b84a2ccbaf2893c5a50b76 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 8 Jan 2020 17:30:08 -0800
Subject: More GH comments.

---
 pkg/abi/linux/netfilter.go               |  6 +++---
 pkg/sentry/socket/netfilter/netfilter.go |  8 ++++----
 pkg/tcpip/iptables/BUILD                 |  5 ++++-
 pkg/tcpip/iptables/iptables.go           |  6 +++---
 pkg/tcpip/iptables/targets.go            | 16 +++++++++++-----
 5 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index c4f4ea0b1..33fcc6c95 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -298,7 +298,7 @@ type IPTReplace struct {
 	// Entries [0]IPTEntry
 }
 
-// KernelIPTEntry is identical to IPTReplace, but includes the Entries field.
+// KernelIPTReplace is identical to IPTReplace, but includes the Entries field.
 type KernelIPTReplace struct {
 	IPTReplace
 	Entries [0]IPTEntry
@@ -315,7 +315,7 @@ func (en ExtensionName) String() string {
 	return goString(en[:])
 }
 
-// ExtensionName holds the name of a netfilter table.
+// TableName holds the name of a netfilter table.
 type TableName [XT_TABLE_MAXNAMELEN]byte
 
 // String implements fmt.Stringer.
@@ -323,7 +323,7 @@ func (tn TableName) String() string {
 	return goString(tn[:])
 }
 
-// ExtensionName holds the name of a netfilter error. These can also hold
+// ErrorName holds the name of a netfilter error. These can also hold
 // user-defined chains.
 type ErrorName [XT_FUNCTION_MAXNAMELEN]byte
 
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 799865b03..60bb30a9f 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -210,8 +210,8 @@ func marshalTarget(target iptables.Target) []byte {
 		return marshalStandardTarget(iptables.Accept)
 	case iptables.UnconditionalDropTarget:
 		return marshalStandardTarget(iptables.Drop)
-	case iptables.PanicTarget:
-		return marshalPanicTarget()
+	case iptables.ErrorTarget:
+		return marshalErrorTarget()
 	default:
 		panic(fmt.Errorf("unknown target of type %T", target))
 	}
@@ -230,7 +230,7 @@ func marshalStandardTarget(verdict iptables.Verdict) []byte {
 	return binary.Marshal(ret, usermem.ByteOrder, target)
 }
 
-func marshalPanicTarget() []byte {
+func marshalErrorTarget() []byte {
 	// This is an error target named error
 	target := linux.XTErrorTarget{
 		Target: linux.XTEntryTarget{
@@ -438,7 +438,7 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 		//   rules have an error with the name of the chain.
 		switch errorTarget.Name.String() {
 		case errorTargetName:
-			return iptables.PanicTarget{}, linux.SizeOfXTErrorTarget, nil
+			return iptables.ErrorTarget{}, linux.SizeOfXTErrorTarget, nil
 		default:
 			log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
 			return nil, 0, syserr.ErrInvalidArgument
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index cc5f531e2..64769c333 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -11,5 +11,8 @@ go_library(
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/iptables",
     visibility = ["//visibility:public"],
-    deps = ["//pkg/tcpip/buffer"],
+    deps = [
+        "//pkg/log",
+        "//pkg/tcpip/buffer",
+    ],
 )
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 9e7005374..db0450a21 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -45,7 +45,7 @@ func DefaultTables() IPTables {
 					Rule{Target: UnconditionalAcceptTarget{}},
 					Rule{Target: UnconditionalAcceptTarget{}},
 					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: PanicTarget{}},
+					Rule{Target: ErrorTarget{}},
 				},
 				BuiltinChains: map[Hook]int{
 					Prerouting:  0,
@@ -65,7 +65,7 @@ func DefaultTables() IPTables {
 				Rules: []Rule{
 					Rule{Target: UnconditionalAcceptTarget{}},
 					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: PanicTarget{}},
+					Rule{Target: ErrorTarget{}},
 				},
 				BuiltinChains: map[Hook]int{
 					Prerouting: 0,
@@ -82,7 +82,7 @@ func DefaultTables() IPTables {
 					Rule{Target: UnconditionalAcceptTarget{}},
 					Rule{Target: UnconditionalAcceptTarget{}},
 					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: PanicTarget{}},
+					Rule{Target: ErrorTarget{}},
 				},
 				BuiltinChains: map[Hook]int{
 					Input:   0,
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 2c3598e3d..d65ed8df5 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -16,7 +16,10 @@
 
 package iptables
 
-import "gvisor.dev/gvisor/pkg/tcpip/buffer"
+import (
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
 
 // UnconditionalAcceptTarget accepts all packets.
 type UnconditionalAcceptTarget struct{}
@@ -34,10 +37,13 @@ func (UnconditionalDropTarget) Action(packet buffer.VectorisedView) (Verdict, st
 	return Drop, ""
 }
 
-// PanicTarget just panics. It represents a target that should be unreachable.
-type PanicTarget struct{}
+// ErrorTarget logs an error and drops the packet. It represents a target that
+// should be unreachable.
+type ErrorTarget struct{}
 
 // Actions implements Target.Action.
-func (PanicTarget) Action(packet buffer.VectorisedView) (Verdict, string) {
-	panic("PanicTarget triggered.")
+func (ErrorTarget) Action(packet buffer.VectorisedView) (Verdict, string) {
+	log.Warningf("ErrorTarget triggered.")
+	return Drop, ""
+
 }
-- 
cgit v1.2.3


From d057871f410088fe6825b1dde695f015e36abf73 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 8 Jan 2020 17:19:35 -0800
Subject: CancellableTimer to encapsulate the work of safely stopping timers

Add a new CancellableTimer type to encapsulate the work of safely stopping
timers when it fires at the same time some "related work" is being handled. The
term "related work" is some work that needs to be done while having obtained
some common lock (L).

Example: Say we have an invalidation timer that may be extended or cancelled by
some event. Creating a normal timer and simply cancelling may not be sufficient
as the timer may have already fired when the event handler attemps to cancel it.
Even if the timer and event handler obtains L before doing work, once the event
handler releases L, the timer will eventually obtain L and do some unwanted
work.

To prevent the timer from doing unwanted work, it checks if it should early
return instead of doing the normal work after obtaining L. When stopping the
timer callers must have L locked so the timer can be safely informed that it
should early return.

Test: Tests that CancellableTimer fires and resets properly. Test to make sure
the timer fn is not called after being stopped within the lock L.
PiperOrigin-RevId: 288806984
---
 pkg/tcpip/BUILD             |   8 +
 pkg/tcpip/stack/ndp.go      | 349 ++++++++------------------------------------
 pkg/tcpip/stack/ndp_test.go |  12 +-
 pkg/tcpip/timer.go          | 161 ++++++++++++++++++++
 pkg/tcpip/timer_test.go     | 236 ++++++++++++++++++++++++++++++
 5 files changed, 473 insertions(+), 293 deletions(-)
 create mode 100644 pkg/tcpip/timer.go
 create mode 100644 pkg/tcpip/timer_test.go

diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
index 65d4d0cd8..e07ebd153 100644
--- a/pkg/tcpip/BUILD
+++ b/pkg/tcpip/BUILD
@@ -10,6 +10,7 @@ go_library(
         "packet_buffer_state.go",
         "tcpip.go",
         "time_unsafe.go",
+        "timer.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip",
     visibility = ["//visibility:public"],
@@ -26,3 +27,10 @@ go_test(
     srcs = ["tcpip_test.go"],
     embed = [":tcpip"],
 )
+
+go_test(
+    name = "timer_test",
+    size = "small",
+    srcs = ["timer_test.go"],
+    deps = [":tcpip"],
+)
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 4722ec9ce..35825ebf7 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -299,46 +299,14 @@ type dadState struct {
 // defaultRouterState holds data associated with a default router discovered by
 // a Router Advertisement (RA).
 type defaultRouterState struct {
-	invalidationTimer *time.Timer
-
-	// Used to inform the timer not to invalidate the default router (R) in
-	// a race condition (T1 is a goroutine that handles an RA from R and T2
-	// is the goroutine that handles R's invalidation timer firing):
-	//   T1: Receive a new RA from R
-	//   T1: Obtain the NIC's lock before processing the RA
-	//   T2: R's invalidation timer fires, and gets blocked on obtaining the
-	//       NIC's lock
-	//   T1: Refreshes/extends R's lifetime & releases NIC's lock
-	//   T2: Obtains NIC's lock & invalidates R immediately
-	//
-	// To resolve this, T1 will check to see if the timer already fired, and
-	// inform the timer using doNotInvalidate to not invalidate R, so that
-	// once T2 obtains the lock, it will see that it is set to true and do
-	// nothing further.
-	doNotInvalidate *bool
+	invalidationTimer tcpip.CancellableTimer
 }
 
 // onLinkPrefixState holds data associated with an on-link prefix discovered by
 // a Router Advertisement's Prefix Information option (PI) when the NDP
 // configurations was configured to do so.
 type onLinkPrefixState struct {
-	invalidationTimer *time.Timer
-
-	// Used to signal the timer not to invalidate the on-link prefix (P) in
-	// a race condition (T1 is a goroutine that handles a PI for P and T2
-	// is the goroutine that handles P's invalidation timer firing):
-	//   T1: Receive a new PI for P
-	//   T1: Obtain the NIC's lock before processing the PI
-	//   T2: P's invalidation timer fires, and gets blocked on obtaining the
-	//       NIC's lock
-	//   T1: Refreshes/extends P's lifetime & releases NIC's lock
-	//   T2: Obtains NIC's lock & invalidates P immediately
-	//
-	// To resolve this, T1 will check to see if the timer already fired, and
-	// inform the timer using doNotInvalidate to not invalidate P, so that
-	// once T2 obtains the lock, it will see that it is set to true and do
-	// nothing further.
-	doNotInvalidate *bool
+	invalidationTimer tcpip.CancellableTimer
 }
 
 // autoGenAddressState holds data associated with an address generated via
@@ -348,33 +316,10 @@ type autoGenAddressState struct {
 	// is holding state for.
 	ref *referencedNetworkEndpoint
 
-	deprecationTimer *time.Timer
-
-	// Used to signal the timer not to deprecate the SLAAC address in a race
-	// condition. Used for the same reason as doNotInvalidate, but for deprecating
-	// an address.
-	doNotDeprecate *bool
+	deprecationTimer  tcpip.CancellableTimer
+	invalidationTimer tcpip.CancellableTimer
 
-	invalidationTimer *time.Timer
-
-	// Used to signal the timer not to invalidate the SLAAC address (A) in
-	// a race condition (T1 is a goroutine that handles a PI for A and T2
-	// is the goroutine that handles A's invalidation timer firing):
-	//   T1: Receive a new PI for A
-	//   T1: Obtain the NIC's lock before processing the PI
-	//   T2: A's invalidation timer fires, and gets blocked on obtaining the
-	//       NIC's lock
-	//   T1: Refreshes/extends A's lifetime & releases NIC's lock
-	//   T2: Obtains NIC's lock & invalidates A immediately
-	//
-	// To resolve this, T1 will check to see if the timer already fired, and
-	// inform the timer using doNotInvalidate to not invalidate A, so that
-	// once T2 obtains the lock, it will see that it is set to true and do
-	// nothing further.
-	doNotInvalidate *bool
-
-	// Nonzero only when the address is not valid forever (invalidationTimer
-	// is not nil).
+	// Nonzero only when the address is not valid forever.
 	validUntil time.Time
 }
 
@@ -576,7 +521,7 @@ func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) {
 // handleRA handles a Router Advertisement message that arrived on the NIC
 // this ndp is for. Does nothing if the NIC is configured to not handle RAs.
 //
-// The NIC that ndp belongs to and its associated stack MUST be locked.
+// The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 	// Is the NIC configured to handle RAs at all?
 	//
@@ -605,27 +550,9 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 		case ok && rl != 0:
 			// This is an already discovered default router. Update
 			// the invalidation timer.
-			timer := rtr.invalidationTimer
-
-			// We should ALWAYS have an invalidation timer for a
-			// discovered router.
-			if timer == nil {
-				panic("ndphandlera: RA invalidation timer should not be nil")
-			}
-
-			if !timer.Stop() {
-				// If we reach this point, then we know the
-				// timer fired after we already took the NIC
-				// lock. Inform the timer not to invalidate the
-				// router when it obtains the lock as we just
-				// got a new RA that refreshes its lifetime to a
-				// non-zero value. See
-				// defaultRouterState.doNotInvalidate for more
-				// details.
-				*rtr.doNotInvalidate = true
-			}
-
-			timer.Reset(rl)
+			rtr.invalidationTimer.StopLocked()
+			rtr.invalidationTimer.Reset(rl)
+			ndp.defaultRouters[ip] = rtr
 
 		case ok && rl == 0:
 			// We know about the router but it is no longer to be
@@ -692,10 +619,7 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
 		return
 	}
 
-	rtr.invalidationTimer.Stop()
-	rtr.invalidationTimer = nil
-	*rtr.doNotInvalidate = true
-	rtr.doNotInvalidate = nil
+	rtr.invalidationTimer.StopLocked()
 
 	delete(ndp.defaultRouters, ip)
 
@@ -724,27 +648,15 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
 		return
 	}
 
-	// Used to signal the timer not to invalidate the default router (R) in
-	// a race condition. See defaultRouterState.doNotInvalidate for more
-	// details.
-	var doNotInvalidate bool
-
-	ndp.defaultRouters[ip] = defaultRouterState{
-		invalidationTimer: time.AfterFunc(rl, func() {
-			ndp.nic.stack.mu.Lock()
-			defer ndp.nic.stack.mu.Unlock()
-			ndp.nic.mu.Lock()
-			defer ndp.nic.mu.Unlock()
-
-			if doNotInvalidate {
-				doNotInvalidate = false
-				return
-			}
-
+	state := defaultRouterState{
+		invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
 			ndp.invalidateDefaultRouter(ip)
 		}),
-		doNotInvalidate: &doNotInvalidate,
 	}
+
+	state.invalidationTimer.Reset(rl)
+
+	ndp.defaultRouters[ip] = state
 }
 
 // rememberOnLinkPrefix remembers a newly discovered on-link prefix with IPv6
@@ -766,21 +678,17 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration)
 		return
 	}
 
-	// Used to signal the timer not to invalidate the on-link prefix (P) in
-	// a race condition. See onLinkPrefixState.doNotInvalidate for more
-	// details.
-	var doNotInvalidate bool
-	var timer *time.Timer
+	state := onLinkPrefixState{
+		invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+			ndp.invalidateOnLinkPrefix(prefix)
+		}),
+	}
 
-	// Only create a timer if the lifetime is not infinite.
 	if l < header.NDPInfiniteLifetime {
-		timer = ndp.prefixInvalidationCallback(prefix, l, &doNotInvalidate)
+		state.invalidationTimer.Reset(l)
 	}
 
-	ndp.onLinkPrefixes[prefix] = onLinkPrefixState{
-		invalidationTimer: timer,
-		doNotInvalidate:   &doNotInvalidate,
-	}
+	ndp.onLinkPrefixes[prefix] = state
 }
 
 // invalidateOnLinkPrefix invalidates a discovered on-link prefix.
@@ -795,13 +703,7 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
 		return
 	}
 
-	if s.invalidationTimer != nil {
-		s.invalidationTimer.Stop()
-		s.invalidationTimer = nil
-		*s.doNotInvalidate = true
-	}
-
-	s.doNotInvalidate = nil
+	s.invalidationTimer.StopLocked()
 
 	delete(ndp.onLinkPrefixes, prefix)
 
@@ -811,28 +713,6 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
 	}
 }
 
-// prefixInvalidationCallback returns a new on-link prefix invalidation timer
-// for prefix that fires after vl.
-//
-// doNotInvalidate is used to signal the timer when it fires at the same time
-// that a prefix's valid lifetime gets refreshed. See
-// onLinkPrefixState.doNotInvalidate for more details.
-func (ndp *ndpState) prefixInvalidationCallback(prefix tcpip.Subnet, vl time.Duration, doNotInvalidate *bool) *time.Timer {
-	return time.AfterFunc(vl, func() {
-		ndp.nic.stack.mu.Lock()
-		defer ndp.nic.stack.mu.Unlock()
-		ndp.nic.mu.Lock()
-		defer ndp.nic.mu.Unlock()
-
-		if *doNotInvalidate {
-			*doNotInvalidate = false
-			return
-		}
-
-		ndp.invalidateOnLinkPrefix(prefix)
-	})
-}
-
 // handleOnLinkPrefixInformation handles a Prefix Information option with
 // its on-link flag set, as per RFC 4861 section 6.3.4.
 //
@@ -872,42 +752,17 @@ func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformatio
 
 	// This is an already discovered on-link prefix with a
 	// new non-zero valid lifetime.
+	//
 	// Update the invalidation timer.
-	timer := prefixState.invalidationTimer
-
-	if timer == nil && vl >= header.NDPInfiniteLifetime {
-		// Had infinite valid lifetime before and
-		// continues to have an invalid lifetime. Do
-		// nothing further.
-		return
-	}
 
-	if timer != nil && !timer.Stop() {
-		// If we reach this point, then we know the timer alread fired
-		// after we took the NIC lock. Inform the timer to not
-		// invalidate the prefix once it obtains the lock as we just
-		// got a new PI that refreshes its lifetime to a non-zero value.
-		// See onLinkPrefixState.doNotInvalidate for more details.
-		*prefixState.doNotInvalidate = true
-	}
+	prefixState.invalidationTimer.StopLocked()
 
-	if vl >= header.NDPInfiniteLifetime {
-		// Prefix is now valid forever so we don't need
-		// an invalidation timer.
-		prefixState.invalidationTimer = nil
-		ndp.onLinkPrefixes[prefix] = prefixState
-		return
-	}
-
-	if timer != nil {
-		// We already have a timer so just reset it to
-		// expire after the new valid lifetime.
-		timer.Reset(vl)
-		return
+	if vl < header.NDPInfiniteLifetime {
+		// Prefix is valid for a finite lifetime, reset the timer to expire after
+		// the new valid lifetime.
+		prefixState.invalidationTimer.Reset(vl)
 	}
 
-	// We do not have a timer so just create a new one.
-	prefixState.invalidationTimer = ndp.prefixInvalidationCallback(prefix, vl, prefixState.doNotInvalidate)
 	ndp.onLinkPrefixes[prefix] = prefixState
 }
 
@@ -917,7 +772,7 @@ func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformatio
 // handleAutonomousPrefixInformation assumes that the prefix this pi is for is
 // not the link-local prefix and the autonomous flag is set.
 //
-// The NIC that ndp belongs to and its associated stack MUST be locked.
+// The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInformation) {
 	vl := pi.ValidLifetime()
 	pl := pi.PreferredLifetime()
@@ -1026,28 +881,34 @@ func (ndp *ndpState) newAutoGenAddress(prefix tcpip.Subnet, pl, vl time.Duration
 		log.Fatalf("ndp: error when adding address %s: %s", protocolAddr, err)
 	}
 
-	// Setup the timers to deprecate and invalidate this newly generated address.
+	state := autoGenAddressState{
+		ref: ref,
+		deprecationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+			addrState, ok := ndp.autoGenAddresses[addr]
+			if !ok {
+				log.Fatalf("ndp: must have an autoGenAddressess entry for the SLAAC generated IPv6 address %s", addr)
+			}
+			addrState.ref.deprecated = true
+			ndp.notifyAutoGenAddressDeprecated(addr)
+		}),
+		invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+			ndp.invalidateAutoGenAddress(addr)
+		}),
+	}
+
+	// Setup the initial timers to deprecate and invalidate this newly generated
+	// address.
 
-	var doNotDeprecate bool
-	var pTimer *time.Timer
 	if !deprecated && pl < header.NDPInfiniteLifetime {
-		pTimer = ndp.autoGenAddrDeprecationTimer(addr, pl, &doNotDeprecate)
+		state.deprecationTimer.Reset(pl)
 	}
 
-	var doNotInvalidate bool
-	var vTimer *time.Timer
 	if vl < header.NDPInfiniteLifetime {
-		vTimer = ndp.autoGenAddrInvalidationTimer(addr, vl, &doNotInvalidate)
+		state.invalidationTimer.Reset(vl)
+		state.validUntil = time.Now().Add(vl)
 	}
 
-	ndp.autoGenAddresses[addr] = autoGenAddressState{
-		ref:               ref,
-		deprecationTimer:  pTimer,
-		doNotDeprecate:    &doNotDeprecate,
-		invalidationTimer: vTimer,
-		doNotInvalidate:   &doNotInvalidate,
-		validUntil:        time.Now().Add(vl),
-	}
+	ndp.autoGenAddresses[addr] = state
 }
 
 // refreshAutoGenAddressLifetimes refreshes the lifetime of a SLAAC generated
@@ -1075,20 +936,10 @@ func (ndp *ndpState) refreshAutoGenAddressLifetimes(addr tcpip.Address, pl, vl t
 
 	// If addr was preferred for some finite lifetime before, stop the deprecation
 	// timer so it can be reset.
-	if t := addrState.deprecationTimer; t != nil && !t.Stop() {
-		*addrState.doNotDeprecate = true
-	}
+	addrState.deprecationTimer.StopLocked()
 
-	// Reset the deprecation timer.
-	if pl >= header.NDPInfiniteLifetime || deprecated {
-		// If addr is preferred forever or it has been deprecated already, there is
-		// no need for a deprecation timer.
-		addrState.deprecationTimer = nil
-	} else if addrState.deprecationTimer == nil {
-		// addr is now preferred for a finite lifetime.
-		addrState.deprecationTimer = ndp.autoGenAddrDeprecationTimer(addr, pl, addrState.doNotDeprecate)
-	} else {
-		// addr continues to be preferred for a finite lifetime.
+	// Reset the deprecation timer if addr has a finite preferred lifetime.
+	if !deprecated && pl < header.NDPInfiniteLifetime {
 		addrState.deprecationTimer.Reset(pl)
 	}
 
@@ -1107,15 +958,8 @@ func (ndp *ndpState) refreshAutoGenAddressLifetimes(addr tcpip.Address, pl, vl t
 	// Handle the infinite valid lifetime separately as we do not keep a timer in
 	// this case.
 	if vl >= header.NDPInfiniteLifetime {
-		if addrState.invalidationTimer != nil {
-			// Valid lifetime was finite before, but now it is valid forever.
-			if !addrState.invalidationTimer.Stop() {
-				*addrState.doNotInvalidate = true
-			}
-			addrState.invalidationTimer = nil
-			addrState.validUntil = time.Time{}
-		}
-
+		addrState.invalidationTimer.StopLocked()
+		addrState.validUntil = time.Time{}
 		return
 	}
 
@@ -1124,7 +968,7 @@ func (ndp *ndpState) refreshAutoGenAddressLifetimes(addr tcpip.Address, pl, vl t
 
 	// If the address was originally set to be valid forever, assume the remaining
 	// time to be the maximum possible value.
-	if addrState.invalidationTimer == nil {
+	if addrState.validUntil == (time.Time{}) {
 		rl = header.NDPInfiniteLifetime
 	} else {
 		rl = time.Until(addrState.validUntil)
@@ -1138,15 +982,8 @@ func (ndp *ndpState) refreshAutoGenAddressLifetimes(addr tcpip.Address, pl, vl t
 		effectiveVl = MinPrefixInformationValidLifetimeForUpdate
 	}
 
-	if addrState.invalidationTimer == nil {
-		addrState.invalidationTimer = ndp.autoGenAddrInvalidationTimer(addr, effectiveVl, addrState.doNotInvalidate)
-	} else {
-		if !addrState.invalidationTimer.Stop() {
-			*addrState.doNotInvalidate = true
-		}
-		addrState.invalidationTimer.Reset(effectiveVl)
-	}
-
+	addrState.invalidationTimer.StopLocked()
+	addrState.invalidationTimer.Reset(effectiveVl)
 	addrState.validUntil = time.Now().Add(effectiveVl)
 }
 
@@ -1181,27 +1018,12 @@ func (ndp *ndpState) invalidateAutoGenAddress(addr tcpip.Address) {
 // The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bool {
 	state, ok := ndp.autoGenAddresses[addr]
-
 	if !ok {
 		return false
 	}
 
-	if state.deprecationTimer != nil {
-		state.deprecationTimer.Stop()
-		state.deprecationTimer = nil
-		*state.doNotDeprecate = true
-	}
-
-	state.doNotDeprecate = nil
-
-	if state.invalidationTimer != nil {
-		state.invalidationTimer.Stop()
-		state.invalidationTimer = nil
-		*state.doNotInvalidate = true
-	}
-
-	state.doNotInvalidate = nil
-
+	state.deprecationTimer.StopLocked()
+	state.invalidationTimer.StopLocked()
 	delete(ndp.autoGenAddresses, addr)
 
 	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
@@ -1214,53 +1036,6 @@ func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bo
 	return true
 }
 
-// autoGenAddrDeprecationTimer returns a new deprecation timer for an
-// auto-generated address that fires after pl.
-//
-// doNotDeprecate is used to inform the timer when it fires at the same time
-// that an auto-generated address's preferred lifetime gets refreshed. See
-// autoGenAddrState.doNotDeprecate for more details.
-func (ndp *ndpState) autoGenAddrDeprecationTimer(addr tcpip.Address, pl time.Duration, doNotDeprecate *bool) *time.Timer {
-	return time.AfterFunc(pl, func() {
-		ndp.nic.mu.Lock()
-		defer ndp.nic.mu.Unlock()
-
-		if *doNotDeprecate {
-			*doNotDeprecate = false
-			return
-		}
-
-		addrState, ok := ndp.autoGenAddresses[addr]
-		if !ok {
-			log.Fatalf("ndp: must have an autoGenAddressess entry for the SLAAC generated IPv6 address %s", addr)
-		}
-		addrState.ref.deprecated = true
-		ndp.notifyAutoGenAddressDeprecated(addr)
-		addrState.deprecationTimer = nil
-		ndp.autoGenAddresses[addr] = addrState
-	})
-}
-
-// autoGenAddrInvalidationTimer returns a new invalidation timer for an
-// auto-generated address that fires after vl.
-//
-// doNotInvalidate is used to inform the timer when it fires at the same time
-// that an auto-generated address's valid lifetime gets refreshed. See
-// autoGenAddrState.doNotInvalidate for more details.
-func (ndp *ndpState) autoGenAddrInvalidationTimer(addr tcpip.Address, vl time.Duration, doNotInvalidate *bool) *time.Timer {
-	return time.AfterFunc(vl, func() {
-		ndp.nic.mu.Lock()
-		defer ndp.nic.mu.Unlock()
-
-		if *doNotInvalidate {
-			*doNotInvalidate = false
-			return
-		}
-
-		ndp.invalidateAutoGenAddress(addr)
-	})
-}
-
 // cleanupHostOnlyState cleans up any state that is only useful for hosts.
 //
 // cleanupHostOnlyState MUST be called when ndp's NIC is transitioning from a
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index e51462a55..d334af289 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -1029,13 +1029,13 @@ func TestRouterDiscovery(t *testing.T) {
 	expectRouterEvent(llAddr2, true)
 
 	// Rx an RA from another router (lladdr3) with non-zero lifetime.
-	l3Lifetime := time.Duration(6)
-	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr3, uint16(l3Lifetime)))
+	const l3LifetimeSeconds = 6
+	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr3, l3LifetimeSeconds))
 	expectRouterEvent(llAddr3, true)
 
 	// Rx an RA from lladdr2 with lesser lifetime.
-	l2Lifetime := time.Duration(2)
-	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, uint16(l2Lifetime)))
+	const l2LifetimeSeconds = 2
+	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, l2LifetimeSeconds))
 	select {
 	case <-ndpDisp.routerC:
 		t.Fatal("Should not receive a router event when updating lifetimes for known routers")
@@ -1049,7 +1049,7 @@ func TestRouterDiscovery(t *testing.T) {
 	// Wait for the normal lifetime plus an extra bit for the
 	// router to get invalidated. If we don't get an invalidation
 	// event after this time, then something is wrong.
-	expectAsyncRouterInvalidationEvent(llAddr2, l2Lifetime*time.Second+defaultTimeout)
+	expectAsyncRouterInvalidationEvent(llAddr2, l2LifetimeSeconds*time.Second+defaultTimeout)
 
 	// Rx an RA from lladdr2 with huge lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000))
@@ -1066,7 +1066,7 @@ func TestRouterDiscovery(t *testing.T) {
 	// Wait for the normal lifetime plus an extra bit for the
 	// router to get invalidated. If we don't get an invalidation
 	// event after this time, then something is wrong.
-	expectAsyncRouterInvalidationEvent(llAddr3, l3Lifetime*time.Second+defaultTimeout)
+	expectAsyncRouterInvalidationEvent(llAddr3, l3LifetimeSeconds*time.Second+defaultTimeout)
 }
 
 // TestRouterDiscoveryMaxRouters tests that only
diff --git a/pkg/tcpip/timer.go b/pkg/tcpip/timer.go
new file mode 100644
index 000000000..f5f01f32f
--- /dev/null
+++ b/pkg/tcpip/timer.go
@@ -0,0 +1,161 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcpip
+
+import (
+	"sync"
+	"time"
+)
+
+// cancellableTimerInstance is a specific instance of CancellableTimer.
+//
+// Different instances are created each time CancellableTimer is Reset so each
+// timer has its own earlyReturn signal. This is to address a bug when a
+// CancellableTimer is stopped and reset in quick succession resulting in a
+// timer instance's earlyReturn signal being affected or seen by another timer
+// instance.
+//
+// Consider the following sceneario where timer instances share a common
+// earlyReturn signal (T1 creates, stops and resets a Cancellable timer under a
+// lock L; T2, T3, T4 and T5 are goroutines that handle the first (A), second
+// (B), third (C), and fourth (D) instance of the timer firing, respectively):
+//   T1: Obtain L
+//   T1: Create a new CancellableTimer w/ lock L (create instance A)
+//   T2: instance A fires, blocked trying to obtain L.
+//   T1: Attempt to stop instance A (set earlyReturn = true)
+//   T1: Reset timer (create instance B)
+//   T3: instance B fires, blocked trying to obtain L.
+//   T1: Attempt to stop instance B (set earlyReturn = true)
+//   T1: Reset timer (create instance C)
+//   T4: instance C fires, blocked trying to obtain L.
+//   T1: Attempt to stop instance C (set earlyReturn = true)
+//   T1: Reset timer (create instance D)
+//   T5: instance D fires, blocked trying to obtain L.
+//   T1: Release L
+//
+// Now that T1 has released L, any of the 4 timer instances can take L and check
+// earlyReturn. If the timers simply check earlyReturn and then do nothing
+// further, then instance D will never early return even though it was not
+// requested to stop. If the timers reset earlyReturn before early returning,
+// then all but one of the timers will do work when only one was expected to.
+// If CancellableTimer resets earlyReturn when resetting, then all the timers
+// will fire (again, when only one was expected to).
+//
+// To address the above concerns the simplest solution was to give each timer
+// its own earlyReturn signal.
+type cancellableTimerInstance struct {
+	timer *time.Timer
+
+	// Used to inform the timer to early return when it gets stopped while the
+	// lock the timer tries to obtain when fired is held (T1 is a goroutine that
+	// tries to cancel the timer and T2 is the goroutine that handles the timer
+	// firing):
+	//   T1: Obtain the lock, then call StopLocked()
+	//   T2: timer fires, and gets blocked on obtaining the lock
+	//   T1: Releases lock
+	//   T2: Obtains lock does unintended work
+	//
+	// To resolve this, T1 will check to see if the timer already fired, and
+	// inform the timer using earlyReturn to return early so that once T2 obtains
+	// the lock, it will see that it is set to true and do nothing further.
+	earlyReturn *bool
+}
+
+// stop stops the timer instance t from firing if it hasn't fired already. If it
+// has fired and is blocked at obtaining the lock, earlyReturn will be set to
+// true so that it will early return when it obtains the lock.
+func (t *cancellableTimerInstance) stop() {
+	if t.timer != nil {
+		t.timer.Stop()
+		*t.earlyReturn = true
+	}
+}
+
+// CancellableTimer is a timer that does some work and can be safely cancelled
+// when it fires at the same time some "related work" is being done.
+//
+// The term "related work" is defined as some work that needs to be done while
+// holding some lock that the timer must also hold while doing some work.
+type CancellableTimer struct {
+	// The active instance of a cancellable timer.
+	instance cancellableTimerInstance
+
+	// locker is the lock taken by the timer immediately after it fires and must
+	// be held when attempting to stop the timer.
+	//
+	// Must never change after being assigned.
+	locker sync.Locker
+
+	// fn is the function that will be called when a timer fires and has not been
+	// signaled to early return.
+	//
+	// fn MUST NOT attempt to lock locker.
+	//
+	// Must never change after being assigned.
+	fn func()
+}
+
+// StopLocked prevents the Timer from firing if it has not fired already.
+//
+// If the timer is blocked on obtaining the t.locker lock when StopLocked is
+// called, it will early return instead of calling t.fn.
+//
+// Note, t will be modified.
+//
+// t.locker MUST be locked.
+func (t *CancellableTimer) StopLocked() {
+	t.instance.stop()
+
+	// Nothing to do with the stopped instance anymore.
+	t.instance = cancellableTimerInstance{}
+}
+
+// Reset changes the timer to expire after duration d.
+//
+// Note, t will be modified.
+//
+// Reset should only be called on stopped or expired timers. To be safe, callers
+// should always call StopLocked before calling Reset.
+func (t *CancellableTimer) Reset(d time.Duration) {
+	// Create a new instance.
+	earlyReturn := false
+	t.instance = cancellableTimerInstance{
+		timer: time.AfterFunc(d, func() {
+			t.locker.Lock()
+			defer t.locker.Unlock()
+
+			if earlyReturn {
+				// If we reach this point, it means that the timer fired while another
+				// goroutine called StopLocked while it had the lock. Simply return
+				// here and do nothing further.
+				earlyReturn = false
+				return
+			}
+
+			t.fn()
+		}),
+		earlyReturn: &earlyReturn,
+	}
+}
+
+// MakeCancellableTimer returns an unscheduled CancellableTimer with the given
+// locker and fn.
+//
+// fn MUST NOT attempt to lock locker.
+//
+// Callers must call Reset to schedule the timer to fire.
+func MakeCancellableTimer(locker sync.Locker, fn func()) CancellableTimer {
+	return CancellableTimer{locker: locker, fn: fn}
+}
diff --git a/pkg/tcpip/timer_test.go b/pkg/tcpip/timer_test.go
new file mode 100644
index 000000000..1f735d735
--- /dev/null
+++ b/pkg/tcpip/timer_test.go
@@ -0,0 +1,236 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package timer_test
+
+import (
+	"sync"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	shortDuration  = 1 * time.Nanosecond
+	middleDuration = 100 * time.Millisecond
+	longDuration   = 1 * time.Second
+)
+
+func TestCancellableTimerFire(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{})
+	var lock sync.Mutex
+
+	timer := tcpip.MakeCancellableTimer(&lock, func() {
+		ch <- struct{}{}
+	})
+	timer.Reset(shortDuration)
+
+	// Wait for timer to fire.
+	select {
+	case <-ch:
+	case <-time.After(middleDuration):
+		t.Fatal("timed out waiting for timer to fire")
+	}
+
+	// The timer should have fired only once.
+	select {
+	case <-ch:
+		t.Fatal("no other timers should have fired")
+	case <-time.After(middleDuration):
+	}
+}
+
+func TestCancellableTimerResetFromLongDuration(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{})
+	var lock sync.Mutex
+
+	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer.Reset(middleDuration)
+
+	lock.Lock()
+	timer.StopLocked()
+	lock.Unlock()
+
+	timer.Reset(shortDuration)
+
+	// Wait for timer to fire.
+	select {
+	case <-ch:
+	case <-time.After(middleDuration):
+		t.Fatal("timed out waiting for timer to fire")
+	}
+
+	// The timer should have fired only once.
+	select {
+	case <-ch:
+		t.Fatal("no other timers should have fired")
+	case <-time.After(middleDuration):
+	}
+}
+
+func TestCancellableTimerResetFromShortDuration(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{})
+	var lock sync.Mutex
+
+	lock.Lock()
+	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer.Reset(shortDuration)
+	timer.StopLocked()
+	lock.Unlock()
+
+	// Wait for timer to fire if it wasn't correctly stopped.
+	select {
+	case <-ch:
+		t.Fatal("timer fired after being stopped")
+	case <-time.After(middleDuration):
+	}
+
+	timer.Reset(shortDuration)
+
+	// Wait for timer to fire.
+	select {
+	case <-ch:
+	case <-time.After(middleDuration):
+		t.Fatal("timed out waiting for timer to fire")
+	}
+
+	// The timer should have fired only once.
+	select {
+	case <-ch:
+		t.Fatal("no other timers should have fired")
+	case <-time.After(middleDuration):
+	}
+}
+
+func TestCancellableTimerImmediatelyStop(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{})
+	var lock sync.Mutex
+
+	for i := 0; i < 1000; i++ {
+		lock.Lock()
+		timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+		timer.Reset(shortDuration)
+		timer.StopLocked()
+		lock.Unlock()
+	}
+
+	// Wait for timer to fire if it wasn't correctly stopped.
+	select {
+	case <-ch:
+		t.Fatal("timer fired after being stopped")
+	case <-time.After(middleDuration):
+	}
+}
+
+func TestCancellableTimerStoppedResetWithoutLock(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{})
+	var lock sync.Mutex
+
+	lock.Lock()
+	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer.Reset(shortDuration)
+	timer.StopLocked()
+	lock.Unlock()
+
+	for i := 0; i < 10; i++ {
+		timer.Reset(middleDuration)
+
+		lock.Lock()
+		// Sleep until the timer fires and gets blocked trying to take the lock.
+		time.Sleep(middleDuration * 2)
+		timer.StopLocked()
+		lock.Unlock()
+	}
+
+	// Wait for double the duration so timers that weren't correctly stopped can
+	// fire.
+	select {
+	case <-ch:
+		t.Fatal("timer fired after being stopped")
+	case <-time.After(middleDuration * 2):
+	}
+}
+
+func TestManyCancellableTimerResetAfterBlockedOnLock(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{})
+	var lock sync.Mutex
+
+	lock.Lock()
+	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer.Reset(shortDuration)
+	for i := 0; i < 10; i++ {
+		// Sleep until the timer fires and gets blocked trying to take the lock.
+		time.Sleep(middleDuration)
+		timer.StopLocked()
+		timer.Reset(shortDuration)
+	}
+	lock.Unlock()
+
+	// Wait for double the duration for the last timer to fire.
+	select {
+	case <-ch:
+	case <-time.After(middleDuration):
+		t.Fatal("timed out waiting for timer to fire")
+	}
+
+	// The timer should have fired only once.
+	select {
+	case <-ch:
+		t.Fatal("no other timers should have fired")
+	case <-time.After(middleDuration):
+	}
+}
+
+func TestManyCancellableTimerResetUnderLock(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{})
+	var lock sync.Mutex
+
+	lock.Lock()
+	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer.Reset(shortDuration)
+	for i := 0; i < 10; i++ {
+		timer.StopLocked()
+		timer.Reset(shortDuration)
+	}
+	lock.Unlock()
+
+	// Wait for double the duration for the last timer to fire.
+	select {
+	case <-ch:
+	case <-time.After(middleDuration):
+		t.Fatal("timed out waiting for timer to fire")
+	}
+
+	// The timer should have fired only once.
+	select {
+	case <-ch:
+		t.Fatal("no other timers should have fired")
+	case <-time.After(middleDuration):
+	}
+}
-- 
cgit v1.2.3


From 781a68eeb65b4db09ae6591a4273e27a2bf60999 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 8 Jan 2020 21:39:13 -0800
Subject: It works! It drops some packets.

---
 pkg/sentry/socket/netfilter/netfilter.go | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 2fc7aeea3..014dfa625 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -52,7 +52,7 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 	}
 
 	// Find the appropriate table.
-	table, err := findTable(ep, info.Name)
+	table, err := findTable(stack, info.Name)
 	if err != nil {
 		return linux.IPTGetinfo{}, err
 	}
@@ -83,7 +83,7 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	}
 
 	// Find the appropriate table.
-	table, err := findTable(ep, userEntries.Name)
+	table, err := findTable(stack, userEntries.Name)
 	if err != nil {
 		return linux.KernelIPTGetEntries{}, err
 	}
@@ -102,11 +102,8 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	return entries, nil
 }
 
-func findTable(ep tcpip.Endpoint, tablename linux.TableName) (iptables.Table, *syserr.Error) {
-	ipt, err := ep.IPTables()
-	if err != nil {
-		return iptables.Table{}, syserr.FromError(err)
-	}
+func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, *syserr.Error) {
+	ipt := stack.IPTables()
 	table, ok := ipt.Tables[tablename.String()]
 	if !ok {
 		return iptables.Table{}, syserr.ErrInvalidArgument
@@ -347,7 +344,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	// Go through the list of supported hooks for this table and, for each
 	// one, set the rule it corresponds to.
 	for hook, _ := range replace.HookEntry {
-		if table.ValidHooks()&uint32(hook) != 0 {
+		if table.ValidHooks()&(1<<hook) != 0 {
 			hk := hookFromLinux(hook)
 			for ruleIdx, offset := range offsets {
 				if offset == replace.HookEntry[hook] {
-- 
cgit v1.2.3


From aeb3a4017b9bc038ebe5630fe270d5ea8691d141 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 8 Jan 2020 22:10:35 -0800
Subject: Working on filtering by protocol.

---
 pkg/tcpip/iptables/types.go    | 20 ++++++++++++++++++++
 test/iptables/iptables_test.go | 34 +++++++++++++++++-----------------
 2 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 9f6906100..4b2a9c294 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -151,6 +151,9 @@ func (table *Table) SetMetadata(metadata interface{}) {
 // packets this rule applies to. If there are no matchers in the rule, it
 // applies to any packet.
 type Rule struct {
+	// IPHeaderFilters holds basic IP filtering fields common to every rule.
+	IPHeaderFilter IPHeaderFilter
+
 	// Matchers is the list of matchers for this rule.
 	Matchers []Matcher
 
@@ -158,6 +161,23 @@ type Rule struct {
 	Target Target
 }
 
+// TODO: This is gross.
+// TODO: Save this in SetEntries.
+// TODO: Utilize this when traversing tables.
+type IPHeaderFilter struct {
+	Source              [4]byte
+	Destination         [4]byte
+	SourceMask          [4]byte
+	DestinationMask     [4]byte
+	OutputInterface     string
+	InputInterface      string
+	OutputInterfaceMask string
+	InputInterfaceMask  string
+	Protocol            uint16
+	Flags               uint8
+	InverseFlags        uint8
+}
+
 // A Matcher is the interface for matching packets.
 type Matcher interface {
 	// Match returns whether the packet matches and whether the packet
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 5927eb017..d040e971a 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -166,20 +166,20 @@ func TestFilterInputDropUDP(t *testing.T) {
 	}
 }
 
-func TestFilterInputDropUDPPort(t *testing.T) {
-	if err := singleTest(FilterInputDropUDPPort{}); err != nil {
-		t.Fatal(err)
-	}
-}
-
-func TestFilterInputDropDifferentUDPPort(t *testing.T) {
-	if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil {
-		t.Fatal(err)
-	}
-}
-
-func TestFilterInputDropAll(t *testing.T) {
-	if err := singleTest(FilterInputDropAll{}); err != nil {
-		t.Fatal(err)
-	}
-}
+// func TestFilterInputDropUDPPort(t *testing.T) {
+// 	if err := singleTest(FilterInputDropUDPPort{}); err != nil {
+// 		t.Fatal(err)
+// 	}
+// }
+
+// func TestFilterInputDropDifferentUDPPort(t *testing.T) {
+// 	if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil {
+// 		t.Fatal(err)
+// 	}
+// }
+
+// func TestFilterInputDropAll(t *testing.T) {
+// 	if err := singleTest(FilterInputDropAll{}); err != nil {
+// 		t.Fatal(err)
+// 	}
+// }
-- 
cgit v1.2.3


From fdfa05ff2c99b4a2f7c0b22fc491a268f1f2e164 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <eag0628@gmail.com>
Date: Thu, 9 Jan 2020 09:01:17 +0000
Subject: Avoid panic when c.PCIDs is nil

When PCID is disabled, there would throw a panic
when dropPageTables() access to c.PCID without check.

Signed-off-by: Lai Jiangshan <eag0628@gmail.com>
---
 pkg/sentry/platform/kvm/machine_amd64.go | 4 +++-
 pkg/sentry/platform/kvm/machine_arm64.go | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index b99fe425e..873e39dc7 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -90,7 +90,9 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
 
 	// Clear from all PCIDs.
 	for _, c := range m.vCPUs {
-		c.PCIDs.Drop(pt)
+		if c.PCIDs != nil {
+			c.PCIDs.Drop(pt)
+		}
 	}
 }
 
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index 7ae47f291..3b1f20219 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -97,7 +97,9 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
 
 	// Clear from all PCIDs.
 	for _, c := range m.vCPUs {
-		c.PCIDs.Drop(pt)
+		if c.PCIDs != nil {
+			c.PCIDs.Drop(pt)
+		}
 	}
 }
 
-- 
cgit v1.2.3


From 290908fa8ae2363c3d2a7af7cef8d5dda622cde7 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 9 Jan 2020 10:16:02 -0800
Subject: Configure issue reviver to run with Kokoro

PiperOrigin-RevId: 288921032
---
 kokoro/issue_reviver.cfg | 15 +++++++++++++++
 scripts/issue_reviver.sh | 27 +++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100644 kokoro/issue_reviver.cfg
 create mode 100755 scripts/issue_reviver.sh

diff --git a/kokoro/issue_reviver.cfg b/kokoro/issue_reviver.cfg
new file mode 100644
index 000000000..2370d9250
--- /dev/null
+++ b/kokoro/issue_reviver.cfg
@@ -0,0 +1,15 @@
+build_file: "repo/scripts/issue_reviver.sh"
+
+before_action {
+  fetch_keystore {
+    keystore_resource {
+      keystore_config_id: 73898
+      keyname: "kokoro-github-access-token"
+    }
+  }
+}
+
+env_vars {
+  key: "KOKORO_GITHUB_ACCESS_TOKEN"
+  value: "73898_kokoro-github-access-token"
+}
diff --git a/scripts/issue_reviver.sh b/scripts/issue_reviver.sh
new file mode 100755
index 000000000..bac9b9192
--- /dev/null
+++ b/scripts/issue_reviver.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DIR=$(dirname $0)
+source "${DIR}"/common.sh
+
+# Provide a credential file if available.
+export OAUTH_TOKEN_FILE=""
+if [[ -v KOKORO_GITHUB_ACCESS_TOKEN ]]; then
+  OAUTH_TOKEN_FILE="${KOKORO_KEYSTORE_DIR}/${KOKORO_GITHUB_ACCESS_TOKEN}"
+fi
+
+REPO_ROOT=$(cd "$(dirname "${DIR}")"; pwd)
+run //tools/issue_reviver:issue_reviver --path "${REPO_ROOT}" --oauth-token-file="${OAUTH_TOKEN_FILE}"
-- 
cgit v1.2.3


From 6cc8e2d814f99439e01c308e16f6631d75578ec0 Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Thu, 9 Jan 2020 10:03:22 -0800
Subject: Add test to check iptables redirect port rule

---
 test/iptables/filter_input.go  | 28 ++++++++++++++++++++++++++++
 test/iptables/iptables_test.go |  7 +++++++
 2 files changed, 35 insertions(+)

diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 923f44e68..41bb85369 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -23,6 +23,7 @@ import (
 const (
 	dropPort         = 2401
 	acceptPort       = 2402
+	redirectPort     = 42
 	sendloopDuration = 2 * time.Second
 	network          = "udp4"
 )
@@ -31,6 +32,7 @@ func init() {
 	RegisterTestCase(FilterInputDropUDP{})
 	RegisterTestCase(FilterInputDropUDPPort{})
 	RegisterTestCase(FilterInputDropDifferentUDPPort{})
+	RegisterTestCase(FilterInputRedirectUDPPort{})
 }
 
 // FilterInputDropUDP tests that we can drop UDP traffic.
@@ -122,3 +124,29 @@ func (FilterInputDropDifferentUDPPort) ContainerAction(ip net.IP) error {
 func (FilterInputDropDifferentUDPPort) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
+
+// FilterInputRedirectUDPPort tests that packets are redirected to different port.
+type FilterInputRedirectUDPPort struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputRedirectUDPPort) Name() string {
+        return "FilterInputRedirectUDPPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputRedirectUDPPort) ContainerAction(ip net.IP) error {
+        if err := filterTable("-t", "nat", "-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
+		return err
+	}
+
+	if err := listenUDP(redirectPort, sendloopDuration); err != nil {
+	        return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %v", acceptPort, redirectPort, err)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputRedirectUDPPort) LocalAction(ip net.IP) error {
+        return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index bfbf1bb87..d57ddc0fe 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -177,3 +177,10 @@ func TestFilterInputDropDifferentUDPPort(t *testing.T) {
 		t.Fatal(err)
 	}
 }
+
+func TestFilterInputRedirectUDPPort(t *testing.T) {
+	if err := singleTest(FilterInputRedirectUDPPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
-- 
cgit v1.2.3


From e752ddbb72d89b19863a6b50d99814149a08d5fe Mon Sep 17 00:00:00 2001
From: Bert Muthalaly <stijlist@google.com>
Date: Thu, 9 Jan 2020 10:34:30 -0800
Subject: Allow clients to store an opaque NICContext with NICs

...retrievable later via stack.NICInfo().

Clients of this library can use it to add metadata that should be tracked
alongside a NIC, to avoid having to keep a map[tcpip.NICID]metadata mirroring
stack.Stack's nic map.

PiperOrigin-RevId: 288924900
---
 pkg/tcpip/stack/nic.go        | 12 +++++++-----
 pkg/tcpip/stack/stack.go      | 16 +++++++++++++++-
 pkg/tcpip/stack/stack_test.go | 40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 4144d5d0f..3810c6602 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -27,10 +27,11 @@ import (
 // NIC represents a "network interface card" to which the networking stack is
 // attached.
 type NIC struct {
-	stack  *Stack
-	id     tcpip.NICID
-	name   string
-	linkEP LinkEndpoint
+	stack   *Stack
+	id      tcpip.NICID
+	name    string
+	linkEP  LinkEndpoint
+	context NICContext
 
 	mu            sync.RWMutex
 	spoofing      bool
@@ -84,7 +85,7 @@ const (
 )
 
 // newNIC returns a new NIC using the default NDP configurations from stack.
-func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint) *NIC {
+func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICContext) *NIC {
 	// TODO(b/141011931): Validate a LinkEndpoint (ep) is valid. For
 	// example, make sure that the link address it provides is a valid
 	// unicast ethernet address.
@@ -98,6 +99,7 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint) *NIC {
 		id:         id,
 		name:       name,
 		linkEP:     ep,
+		context:    ctx,
 		primary:    make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint),
 		endpoints:  make(map[NetworkEndpointID]*referencedNetworkEndpoint),
 		mcastJoins: make(map[NetworkEndpointID]int32),
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index fb7ac409e..e2a2edb2c 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -796,6 +796,9 @@ func (s *Stack) NewPacketEndpoint(cooked bool, netProto tcpip.NetworkProtocolNum
 	return s.rawFactory.NewPacketEndpoint(s, cooked, netProto, waiterQueue)
 }
 
+// NICContext is an opaque pointer used to store client-supplied NIC metadata.
+type NICContext interface{}
+
 // NICOptions specifies the configuration of a NIC as it is being created.
 // The zero value creates an enabled, unnamed NIC.
 type NICOptions struct {
@@ -805,6 +808,12 @@ type NICOptions struct {
 	// Disabled specifies whether to avoid calling Attach on the passed
 	// LinkEndpoint.
 	Disabled bool
+
+	// Context specifies user-defined data that will be returned in stack.NICInfo
+	// for the NIC. Clients of this library can use it to add metadata that
+	// should be tracked alongside a NIC, to avoid having to keep a
+	// map[tcpip.NICID]metadata mirroring stack.Stack's nic map.
+	Context NICContext
 }
 
 // CreateNICWithOptions creates a NIC with the provided id, LinkEndpoint, and
@@ -819,7 +828,7 @@ func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOp
 		return tcpip.ErrDuplicateNICID
 	}
 
-	n := newNIC(s, id, opts.Name, ep)
+	n := newNIC(s, id, opts.Name, ep, opts.Context)
 
 	s.nics[id] = n
 	if !opts.Disabled {
@@ -886,6 +895,10 @@ type NICInfo struct {
 	MTU uint32
 
 	Stats NICStats
+
+	// Context is user-supplied data optionally supplied in CreateNICWithOptions.
+	// See type NICOptions for more details.
+	Context NICContext
 }
 
 // NICInfo returns a map of NICIDs to their associated information.
@@ -908,6 +921,7 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo {
 			Flags:             flags,
 			MTU:               nic.linkEP.MTU(),
 			Stats:             nic.stats,
+			Context:           nic.context,
 		}
 	}
 	return nics
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 9ac50bb23..44e5229cc 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -2001,6 +2001,46 @@ func TestNICAutoGenAddr(t *testing.T) {
 	}
 }
 
+// TestNICContextPreservation tests that you can read out via stack.NICInfo the
+// Context data you pass via NICContext.Context in stack.CreateNICWithOptions.
+func TestNICContextPreservation(t *testing.T) {
+	var ctx *int
+	tests := []struct {
+		name string
+		opts stack.NICOptions
+		want stack.NICContext
+	}{
+		{
+			"context_set",
+			stack.NICOptions{Context: ctx},
+			ctx,
+		},
+		{
+			"context_not_set",
+			stack.NICOptions{},
+			nil,
+		},
+	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{})
+			id := tcpip.NICID(1)
+			ep := channel.New(0, 0, tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"))
+			if err := s.CreateNICWithOptions(id, ep, test.opts); err != nil {
+				t.Fatalf("got stack.CreateNICWithOptions(%d, %+v, %+v) = %s, want nil", id, ep, test.opts, err)
+			}
+			nicinfos := s.NICInfo()
+			nicinfo, ok := nicinfos[id]
+			if !ok {
+				t.Fatalf("got nicinfos[%d] = _, %t, want _, true; nicinfos = %+v", id, ok, nicinfos)
+			}
+			if got, want := nicinfo.Context == test.want, true; got != want {
+				t.Fatal("got nicinfo.Context == ctx = %t, want %t; nicinfo.Context = %p, ctx = %p", got, want, nicinfo.Context, test.want)
+			}
+		})
+	}
+}
+
 // TestNICAutoGenAddrWithOpaque tests the auto-generation of IPv6 link-local
 // addresses with opaque interface identifiers. Link Local addresses should
 // always be generated with opaque IIDs if configured to use them, even if the
-- 
cgit v1.2.3


From 8643933d6e58492cbe9d5c78124873ab40f65feb Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Thu, 9 Jan 2020 13:06:24 -0800
Subject: Change BindToDeviceOption to store NICID

This makes it possible to call the sockopt from go even when the NIC has no
name.

PiperOrigin-RevId: 288955236
---
 pkg/sentry/socket/netstack/netstack.go    | 29 ++++++++--
 pkg/tcpip/stack/stack.go                  |  8 +++
 pkg/tcpip/stack/transport_demuxer_test.go | 89 +++++++++++++++----------------
 pkg/tcpip/tcpip.go                        |  2 +-
 pkg/tcpip/transport/tcp/endpoint.go       | 27 ++++------
 pkg/tcpip/transport/tcp/tcp_test.go       | 42 +++++++--------
 pkg/tcpip/transport/udp/endpoint.go       | 27 ++++------
 pkg/tcpip/transport/udp/udp_test.go       | 31 +++++------
 8 files changed, 127 insertions(+), 128 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 9e0d69046..764f11a6b 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -985,13 +985,23 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
 		if err := ep.GetSockOpt(&v); err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		if len(v) == 0 {
+		if v == 0 {
 			return []byte{}, nil
 		}
 		if outLen < linux.IFNAMSIZ {
 			return nil, syserr.ErrInvalidArgument
 		}
-		return append([]byte(v), 0), nil
+		s := t.NetworkContext()
+		if s == nil {
+			return nil, syserr.ErrNoDevice
+		}
+		nic, ok := s.Interfaces()[int32(v)]
+		if !ok {
+			// The NICID no longer indicates a valid interface, probably because that
+			// interface was removed.
+			return nil, syserr.ErrUnknownDevice
+		}
+		return append([]byte(nic.Name), 0), nil
 
 	case linux.SO_BROADCAST:
 		if outLen < sizeOfInt32 {
@@ -1438,7 +1448,20 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
 		if n == -1 {
 			n = len(optVal)
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(optVal[:n])))
+		name := string(optVal[:n])
+		if name == "" {
+			return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(0)))
+		}
+		s := t.NetworkContext()
+		if s == nil {
+			return syserr.ErrNoDevice
+		}
+		for nicID, nic := range s.Interfaces() {
+			if nic.Name == name {
+				return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(nicID)))
+			}
+		}
+		return syserr.ErrUnknownDevice
 
 	case linux.SO_BROADCAST:
 		if len(optVal) < sizeOfInt32 {
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index e2a2edb2c..41bf9fd9b 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -901,6 +901,14 @@ type NICInfo struct {
 	Context NICContext
 }
 
+// HasNIC returns true if the NICID is defined in the stack.
+func (s *Stack) HasNIC(id tcpip.NICID) bool {
+	s.mu.RLock()
+	_, ok := s.nics[id]
+	s.mu.RUnlock()
+	return ok
+}
+
 // NICInfo returns a map of NICIDs to their associated information.
 func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo {
 	s.mu.RLock()
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index df5ced887..5e9237de9 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -41,7 +41,7 @@ const (
 
 type testContext struct {
 	t       *testing.T
-	linkEPs map[string]*channel.Endpoint
+	linkEps map[tcpip.NICID]*channel.Endpoint
 	s       *stack.Stack
 
 	ep tcpip.Endpoint
@@ -66,27 +66,24 @@ func (c *testContext) createV6Endpoint(v6only bool) {
 	}
 }
 
-// newDualTestContextMultiNic creates the testing context and also linkEpNames
-// named NICs.
-func newDualTestContextMultiNic(t *testing.T, mtu uint32, linkEpNames []string) *testContext {
+// newDualTestContextMultiNIC creates the testing context and also linkEpIDs NICs.
+func newDualTestContextMultiNIC(t *testing.T, mtu uint32, linkEpIDs []tcpip.NICID) *testContext {
 	s := stack.New(stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
 		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}})
-	linkEPs := make(map[string]*channel.Endpoint)
-	for i, linkEpName := range linkEpNames {
-		channelEP := channel.New(256, mtu, "")
-		nicID := tcpip.NICID(i + 1)
-		opts := stack.NICOptions{Name: linkEpName}
-		if err := s.CreateNICWithOptions(nicID, channelEP, opts); err != nil {
-			t.Fatalf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err)
+	linkEps := make(map[tcpip.NICID]*channel.Endpoint)
+	for _, linkEpID := range linkEpIDs {
+		channelEp := channel.New(256, mtu, "")
+		if err := s.CreateNIC(linkEpID, channelEp); err != nil {
+			t.Fatalf("CreateNIC failed: %v", err)
 		}
-		linkEPs[linkEpName] = channelEP
+		linkEps[linkEpID] = channelEp
 
-		if err := s.AddAddress(nicID, ipv4.ProtocolNumber, stackAddr); err != nil {
+		if err := s.AddAddress(linkEpID, ipv4.ProtocolNumber, stackAddr); err != nil {
 			t.Fatalf("AddAddress IPv4 failed: %v", err)
 		}
 
-		if err := s.AddAddress(nicID, ipv6.ProtocolNumber, stackV6Addr); err != nil {
+		if err := s.AddAddress(linkEpID, ipv6.ProtocolNumber, stackV6Addr); err != nil {
 			t.Fatalf("AddAddress IPv6 failed: %v", err)
 		}
 	}
@@ -105,7 +102,7 @@ func newDualTestContextMultiNic(t *testing.T, mtu uint32, linkEpNames []string)
 	return &testContext{
 		t:       t,
 		s:       s,
-		linkEPs: linkEPs,
+		linkEps: linkEps,
 	}
 }
 
@@ -122,7 +119,7 @@ func newPayload() []byte {
 	return b
 }
 
-func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpName string) {
+func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NICID) {
 	// Allocate a buffer for data and headers.
 	buf := buffer.NewView(header.UDPMinimumSize + header.IPv6MinimumSize + len(payload))
 	copy(buf[len(buf)-len(payload):], payload)
@@ -153,7 +150,7 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpName string
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEPs[linkEpName].InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 }
@@ -183,7 +180,7 @@ func TestTransportDemuxerRegister(t *testing.T) {
 func TestDistribution(t *testing.T) {
 	type endpointSockopts struct {
 		reuse        int
-		bindToDevice string
+		bindToDevice tcpip.NICID
 	}
 	for _, test := range []struct {
 		name string
@@ -191,71 +188,71 @@ func TestDistribution(t *testing.T) {
 		endpoints []endpointSockopts
 		// wantedDistribution is the wanted ratio of packets received on each
 		// endpoint for each NIC on which packets are injected.
-		wantedDistributions map[string][]float64
+		wantedDistributions map[tcpip.NICID][]float64
 	}{
 		{
 			"BindPortReuse",
 			// 5 endpoints that all have reuse set.
 			[]endpointSockopts{
-				{1, ""},
-				{1, ""},
-				{1, ""},
-				{1, ""},
-				{1, ""},
+				{1, 0},
+				{1, 0},
+				{1, 0},
+				{1, 0},
+				{1, 0},
 			},
-			map[string][]float64{
+			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 get distributed evenly.
-				"dev0": {0.2, 0.2, 0.2, 0.2, 0.2},
+				1: {0.2, 0.2, 0.2, 0.2, 0.2},
 			},
 		},
 		{
 			"BindToDevice",
 			// 3 endpoints with various bindings.
 			[]endpointSockopts{
-				{0, "dev0"},
-				{0, "dev1"},
-				{0, "dev2"},
+				{0, 1},
+				{0, 2},
+				{0, 3},
 			},
-			map[string][]float64{
+			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 go only to the endpoint bound to dev0.
-				"dev0": {1, 0, 0},
+				1: {1, 0, 0},
 				// Injected packets on dev1 go only to the endpoint bound to dev1.
-				"dev1": {0, 1, 0},
+				2: {0, 1, 0},
 				// Injected packets on dev2 go only to the endpoint bound to dev2.
-				"dev2": {0, 0, 1},
+				3: {0, 0, 1},
 			},
 		},
 		{
 			"ReuseAndBindToDevice",
 			// 6 endpoints with various bindings.
 			[]endpointSockopts{
-				{1, "dev0"},
-				{1, "dev0"},
-				{1, "dev1"},
-				{1, "dev1"},
-				{1, "dev1"},
-				{1, ""},
+				{1, 1},
+				{1, 1},
+				{1, 2},
+				{1, 2},
+				{1, 2},
+				{1, 0},
 			},
-			map[string][]float64{
+			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 get distributed among endpoints bound to
 				// dev0.
-				"dev0": {0.5, 0.5, 0, 0, 0, 0},
+				1: {0.5, 0.5, 0, 0, 0, 0},
 				// Injected packets on dev1 get distributed among endpoints bound to
 				// dev1 or unbound.
-				"dev1": {0, 0, 1. / 3, 1. / 3, 1. / 3, 0},
+				2: {0, 0, 1. / 3, 1. / 3, 1. / 3, 0},
 				// Injected packets on dev999 go only to the unbound.
-				"dev999": {0, 0, 0, 0, 0, 1},
+				1000: {0, 0, 0, 0, 0, 1},
 			},
 		},
 	} {
 		t.Run(test.name, func(t *testing.T) {
 			for device, wantedDistribution := range test.wantedDistributions {
-				t.Run(device, func(t *testing.T) {
-					var devices []string
+				t.Run(string(device), func(t *testing.T) {
+					var devices []tcpip.NICID
 					for d := range test.wantedDistributions {
 						devices = append(devices, d)
 					}
-					c := newDualTestContextMultiNic(t, defaultMTU, devices)
+					c := newDualTestContextMultiNIC(t, defaultMTU, devices)
 					defer c.cleanup()
 
 					c.createV6Endpoint(false)
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 1eca76c30..72b5ce179 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -552,7 +552,7 @@ type ReusePortOption int
 
 // BindToDeviceOption is used by SetSockOpt/GetSockOpt to specify that sockets
 // should bind only on a specific NIC.
-type BindToDeviceOption string
+type BindToDeviceOption NICID
 
 // QuickAckOption is stubbed out in SetSockOpt/GetSockOpt.
 type QuickAckOption int
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 2ac1b6877..920b24975 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1279,19 +1279,14 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case tcpip.BindToDeviceOption:
-		e.mu.Lock()
-		defer e.mu.Unlock()
-		if v == "" {
-			e.bindToDevice = 0
-			return nil
-		}
-		for nicID, nic := range e.stack.NICInfo() {
-			if nic.Name == string(v) {
-				e.bindToDevice = nicID
-				return nil
-			}
+		id := tcpip.NICID(v)
+		if id != 0 && !e.stack.HasNIC(id) {
+			return tcpip.ErrUnknownDevice
 		}
-		return tcpip.ErrUnknownDevice
+		e.mu.Lock()
+		e.bindToDevice = id
+		e.mu.Unlock()
+		return nil
 
 	case tcpip.QuickAckOption:
 		if v == 0 {
@@ -1550,12 +1545,8 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 
 	case *tcpip.BindToDeviceOption:
 		e.mu.RLock()
-		defer e.mu.RUnlock()
-		if nic, ok := e.stack.NICInfo()[e.bindToDevice]; ok {
-			*o = tcpip.BindToDeviceOption(nic.Name)
-			return nil
-		}
-		*o = ""
+		*o = tcpip.BindToDeviceOption(e.bindToDevice)
+		e.mu.RUnlock()
 		return nil
 
 	case *tcpip.QuickAckOption:
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 15745ebd4..1aa0733d0 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -1083,12 +1083,12 @@ func TestTrafficClassV6(t *testing.T) {
 func TestConnectBindToDevice(t *testing.T) {
 	for _, test := range []struct {
 		name   string
-		device string
+		device tcpip.NICID
 		want   tcp.EndpointState
 	}{
-		{"RightDevice", "nic1", tcp.StateEstablished},
-		{"WrongDevice", "nic2", tcp.StateSynSent},
-		{"AnyDevice", "", tcp.StateEstablished},
+		{"RightDevice", 1, tcp.StateEstablished},
+		{"WrongDevice", 2, tcp.StateSynSent},
+		{"AnyDevice", 0, tcp.StateEstablished},
 	} {
 		t.Run(test.name, func(t *testing.T) {
 			c := context.New(t, defaultMTU)
@@ -3794,47 +3794,41 @@ func TestBindToDeviceOption(t *testing.T) {
 	}
 	defer ep.Close()
 
-	opts := stack.NICOptions{Name: "my_device"}
-	if err := s.CreateNICWithOptions(321, loopback.New(), opts); err != nil {
-		t.Errorf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err)
-	}
-
-	// Make an nameless NIC.
-	if err := s.CreateNIC(54321, loopback.New()); err != nil {
+	if err := s.CreateNIC(321, loopback.New()); err != nil {
 		t.Errorf("CreateNIC failed: %v", err)
 	}
 
-	// strPtr is used instead of taking the address of string literals, which is
+	// nicIDPtr is used instead of taking the address of NICID literals, which is
 	// a compiler error.
-	strPtr := func(s string) *string {
+	nicIDPtr := func(s tcpip.NICID) *tcpip.NICID {
 		return &s
 	}
 
 	testActions := []struct {
 		name                 string
-		setBindToDevice      *string
+		setBindToDevice      *tcpip.NICID
 		setBindToDeviceError *tcpip.Error
 		getBindToDevice      tcpip.BindToDeviceOption
 	}{
-		{"GetDefaultValue", nil, nil, ""},
-		{"BindToNonExistent", strPtr("non_existent_device"), tcpip.ErrUnknownDevice, ""},
-		{"BindToExistent", strPtr("my_device"), nil, "my_device"},
-		{"UnbindToDevice", strPtr(""), nil, ""},
+		{"GetDefaultValue", nil, nil, 0},
+		{"BindToNonExistent", nicIDPtr(999), tcpip.ErrUnknownDevice, 0},
+		{"BindToExistent", nicIDPtr(321), nil, 321},
+		{"UnbindToDevice", nicIDPtr(0), nil, 0},
 	}
 	for _, testAction := range testActions {
 		t.Run(testAction.name, func(t *testing.T) {
 			if testAction.setBindToDevice != nil {
 				bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice)
-				if got, want := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; got != want {
-					t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, got, want)
+				if gotErr, wantErr := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr {
+					t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, gotErr, wantErr)
 				}
 			}
-			bindToDevice := tcpip.BindToDeviceOption("to be modified by GetSockOpt")
-			if ep.GetSockOpt(&bindToDevice) != nil {
-				t.Errorf("GetSockOpt got %v, want %v", ep.GetSockOpt(&bindToDevice), nil)
+			bindToDevice := tcpip.BindToDeviceOption(88888)
+			if err := ep.GetSockOpt(&bindToDevice); err != nil {
+				t.Errorf("GetSockOpt got %v, want %v", err, nil)
 			}
 			if got, want := bindToDevice, testAction.getBindToDevice; got != want {
-				t.Errorf("bindToDevice got %q, want %q", got, want)
+				t.Errorf("bindToDevice got %d, want %d", got, want)
 			}
 		})
 	}
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 1a5ee6317..864dc8733 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -631,19 +631,14 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 
 	case tcpip.BindToDeviceOption:
-		e.mu.Lock()
-		defer e.mu.Unlock()
-		if v == "" {
-			e.bindToDevice = 0
-			return nil
-		}
-		for nicID, nic := range e.stack.NICInfo() {
-			if nic.Name == string(v) {
-				e.bindToDevice = nicID
-				return nil
-			}
+		id := tcpip.NICID(v)
+		if id != 0 && !e.stack.HasNIC(id) {
+			return tcpip.ErrUnknownDevice
 		}
-		return tcpip.ErrUnknownDevice
+		e.mu.Lock()
+		e.bindToDevice = id
+		e.mu.Unlock()
+		return nil
 
 	case tcpip.BroadcastOption:
 		e.mu.Lock()
@@ -767,12 +762,8 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 
 	case *tcpip.BindToDeviceOption:
 		e.mu.RLock()
-		defer e.mu.RUnlock()
-		if nic, ok := e.stack.NICInfo()[e.bindToDevice]; ok {
-			*o = tcpip.BindToDeviceOption(nic.Name)
-			return nil
-		}
-		*o = tcpip.BindToDeviceOption("")
+		*o = tcpip.BindToDeviceOption(e.bindToDevice)
+		e.mu.RUnlock()
 		return nil
 
 	case *tcpip.KeepaliveEnabledOption:
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 149fff999..0a82bc4fa 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -513,42 +513,37 @@ func TestBindToDeviceOption(t *testing.T) {
 		t.Errorf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err)
 	}
 
-	// Make an nameless NIC.
-	if err := s.CreateNIC(54321, loopback.New()); err != nil {
-		t.Errorf("CreateNIC failed: %v", err)
-	}
-
-	// strPtr is used instead of taking the address of string literals, which is
+	// nicIDPtr is used instead of taking the address of NICID literals, which is
 	// a compiler error.
-	strPtr := func(s string) *string {
+	nicIDPtr := func(s tcpip.NICID) *tcpip.NICID {
 		return &s
 	}
 
 	testActions := []struct {
 		name                 string
-		setBindToDevice      *string
+		setBindToDevice      *tcpip.NICID
 		setBindToDeviceError *tcpip.Error
 		getBindToDevice      tcpip.BindToDeviceOption
 	}{
-		{"GetDefaultValue", nil, nil, ""},
-		{"BindToNonExistent", strPtr("non_existent_device"), tcpip.ErrUnknownDevice, ""},
-		{"BindToExistent", strPtr("my_device"), nil, "my_device"},
-		{"UnbindToDevice", strPtr(""), nil, ""},
+		{"GetDefaultValue", nil, nil, 0},
+		{"BindToNonExistent", nicIDPtr(999), tcpip.ErrUnknownDevice, 0},
+		{"BindToExistent", nicIDPtr(321), nil, 321},
+		{"UnbindToDevice", nicIDPtr(0), nil, 0},
 	}
 	for _, testAction := range testActions {
 		t.Run(testAction.name, func(t *testing.T) {
 			if testAction.setBindToDevice != nil {
 				bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice)
-				if got, want := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; got != want {
-					t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, got, want)
+				if gotErr, wantErr := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr {
+					t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, gotErr, wantErr)
 				}
 			}
-			bindToDevice := tcpip.BindToDeviceOption("to be modified by GetSockOpt")
-			if ep.GetSockOpt(&bindToDevice) != nil {
-				t.Errorf("GetSockOpt got %v, want %v", ep.GetSockOpt(&bindToDevice), nil)
+			bindToDevice := tcpip.BindToDeviceOption(88888)
+			if err := ep.GetSockOpt(&bindToDevice); err != nil {
+				t.Errorf("GetSockOpt got %v, want %v", err, nil)
 			}
 			if got, want := bindToDevice, testAction.getBindToDevice; got != want {
-				t.Errorf("bindToDevice got %q, want %q", got, want)
+				t.Errorf("bindToDevice got %d, want %d", got, want)
 			}
 		})
 	}
-- 
cgit v1.2.3


From 89d11b4d96b0c40e373f14ba72d570c9b894f976 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 9 Jan 2020 13:41:52 -0800
Subject: Added a test that we don't pass yet

---
 pkg/sentry/socket/netfilter/netfilter.go | 37 +++++++++++++++++++++++++----
 pkg/tcpip/iptables/iptables.go           |  7 ++++++
 pkg/tcpip/iptables/types.go              |  4 ++--
 test/iptables/BUILD                      |  4 ++++
 test/iptables/filter_input.go            | 30 ++++++++++++++++++++++++
 test/iptables/iptables_test.go           | 16 +++++++++----
 test/iptables/iptables_util.go           | 40 ++++++++++++++++++++++++++++++++
 test/iptables/runner/BUILD               |  1 +
 8 files changed, 127 insertions(+), 12 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 014dfa625..f30461936 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -323,10 +323,9 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 		// TODO(gvisor.dev/issue/170): We should support IPTIP
 		// filtering. We reject any nonzero IPTIP values for now.
-		emptyIPTIP := linux.IPTIP{}
-		if entry.IP != emptyIPTIP {
-			log.Warningf("netfilter: non-empty struct iptip found")
-			return syserr.ErrInvalidArgument
+		filter, err := filterFromIPTIP(entry.IP)
+		if err != nil {
+			return err
 		}
 
 		// Get the target of the rule.
@@ -336,7 +335,10 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 		optVal = optVal[consumed:]
 
-		table.Rules = append(table.Rules, iptables.Rule{Target: target})
+		table.Rules = append(table.Rules, iptables.Rule{
+			Filter: filter,
+			Target: target,
+		})
 		offsets = append(offsets, offset)
 		offset += linux.SizeOfIPTEntry + consumed
 	}
@@ -447,6 +449,31 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 	return nil, 0, syserr.ErrInvalidArgument
 }
 
+func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, *syserr.Error) {
+	if containsUnsupportedFields(iptip) {
+		log.Warningf("netfilter: unsupported fields in struct iptip: %+v")
+		return iptables.IPHeaderFilter{}, syserr.ErrInvalidArgument
+	}
+	return iptables.IPHeaderFilter{
+		Protocol: iptip.Protocol,
+	}, nil
+}
+
+func containsUnsupportedFields(iptip linux.IPTIP) bool {
+	// Currently we check that everything except protocol is zeroed.
+	var emptyInetAddr = linux.InetAddr{}
+	var emptyInterface = [linux.IFNAMSIZ]byte{}
+	return iptip.Dst != emptyInetAddr ||
+		iptip.SrcMask != emptyInetAddr ||
+		iptip.DstMask != emptyInetAddr ||
+		iptip.InputInterface != emptyInterface ||
+		iptip.OutputInterface != emptyInterface ||
+		iptip.InputInterfaceMask != emptyInterface ||
+		iptip.OutputInterfaceMask != emptyInterface ||
+		iptip.Flags != 0 ||
+		iptip.InverseFlags != 0
+}
+
 func hookFromLinux(hook int) iptables.Hook {
 	switch hook {
 	case linux.NF_INET_PRE_ROUTING:
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 91abbbea8..b8d70ec1e 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -185,6 +185,13 @@ func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename stri
 
 func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) Verdict {
 	rule := table.Rules[ruleIdx]
+
+	// First check whether the packet matches the IP header filter.
+	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
+	// if rule.Filter.Protocol != pkt.Protocol {
+	// 	return Continue
+	// }
+
 	// Go through each rule matcher. If they all match, run
 	// the rule target.
 	for _, matcher := range rule.Matchers {
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 4b2a9c294..4bedd9bc8 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -151,8 +151,8 @@ func (table *Table) SetMetadata(metadata interface{}) {
 // packets this rule applies to. If there are no matchers in the rule, it
 // applies to any packet.
 type Rule struct {
-	// IPHeaderFilters holds basic IP filtering fields common to every rule.
-	IPHeaderFilter IPHeaderFilter
+	// IPHeaderFilter holds basic IP filtering fields common to every rule.
+	Filter IPHeaderFilter
 
 	// Matchers is the list of matchers for this rule.
 	Matchers []Matcher
diff --git a/test/iptables/BUILD b/test/iptables/BUILD
index fa833c3b2..6a9d05828 100644
--- a/test/iptables/BUILD
+++ b/test/iptables/BUILD
@@ -4,6 +4,7 @@ package(licenses = ["notice"])
 
 go_library(
     name = "iptables",
+    testonly = 1,
     srcs = [
         "filter_input.go",
         "iptables.go",
@@ -11,6 +12,9 @@ go_library(
     ],
     importpath = "gvisor.dev/gvisor/test/iptables",
     visibility = ["//test/iptables:__subpackages__"],
+    deps = [
+        "//runsc/testutil",
+    ],
 )
 
 go_test(
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 7c4d469fa..a3f0052b5 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -28,6 +28,7 @@ const (
 )
 
 func init() {
+	RegisterTestCase(FilterInputDropOnlyUDP{})
 	RegisterTestCase(FilterInputDropUDP{})
 	RegisterTestCase(FilterInputDropUDPPort{})
 	RegisterTestCase(FilterInputDropDifferentUDPPort{})
@@ -65,6 +66,35 @@ func (FilterInputDropUDP) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, dropPort, sendloopDuration)
 }
 
+// FilterInputDropOnlyUDP tests that "-p udp -j DROP" only affects UDP traffic.
+type FilterInputDropOnlyUDP struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDropOnlyUDP) Name() string {
+	return "FilterInputDropOnlyUDP"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDropOnlyUDP) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-p", "udp", "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for a TCP connection, which should be allowed.
+	if err := listenTCP(acceptPort, sendloopDuration); err != nil {
+		return fmt.Errorf("failed to establish a connection %v", err)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDropOnlyUDP) LocalAction(ip net.IP) error {
+	// Try to establish a TCP connection with the container, which should
+	// succeed.
+	return connectLoopTCP(ip, acceptPort, sendloopDuration)
+}
+
 // FilterInputDropUDPPort tests that we can drop UDP traffic by port.
 type FilterInputDropUDPPort struct{}
 
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index d040e971a..beaaf519c 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -160,11 +160,11 @@ func logContainer(output string, err error) {
 	log.Infof(msg)
 }
 
-func TestFilterInputDropUDP(t *testing.T) {
-	if err := singleTest(FilterInputDropUDP{}); err != nil {
-		t.Fatal(err)
-	}
-}
+// func TestFilterInputDropUDP(t *testing.T) {
+// 	if err := singleTest(FilterInputDropUDP{}); err != nil {
+// 		t.Fatal(err)
+// 	}
+// }
 
 // func TestFilterInputDropUDPPort(t *testing.T) {
 // 	if err := singleTest(FilterInputDropUDPPort{}); err != nil {
@@ -183,3 +183,9 @@ func TestFilterInputDropUDP(t *testing.T) {
 // 		t.Fatal(err)
 // 	}
 // }
+
+func TestFilterInputDropOnlyUDP(t *testing.T) {
+	if err := singleTest(FilterInputDropOnlyUDP{}); err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index 3a4d11f1a..3dcaafb79 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -19,6 +19,8 @@ import (
 	"net"
 	"os/exec"
 	"time"
+
+	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 const iptablesBinary = "iptables"
@@ -80,3 +82,41 @@ func sendUDPLoop(ip net.IP, port int, duration time.Duration) error {
 
 	return nil
 }
+
+func listenTCP(port int, timeout time.Duration) error {
+	localAddr := net.TCPAddr{Port: acceptPort}
+	listener, err := net.ListenTCP("tcp4", &localAddr)
+	if err != nil {
+		return err
+	}
+	defer listener.Close()
+	listener.SetDeadline(time.Now().Add(timeout))
+	conn, err := listener.AcceptTCP()
+	if err != nil {
+		return fmt.Errorf("failed to establish a connection %v", err)
+	}
+	defer conn.Close()
+
+	return nil
+}
+
+func connectLoopTCP(ip net.IP, port int, timeout time.Duration) error {
+	contAddr := net.TCPAddr{
+		IP:   ip,
+		Port: port,
+	}
+	// The container may not be listening when we first connect, so retry
+	// upon error.
+	cb := func() error {
+		conn, err := net.DialTCP("tcp4", nil, &contAddr)
+		if conn != nil {
+			conn.Close()
+		}
+		return err
+	}
+	if err := testutil.Poll(cb, timeout); err != nil {
+		return fmt.Errorf("timed out waiting to send IP, most recent error: %v", err)
+	}
+
+	return nil
+}
diff --git a/test/iptables/runner/BUILD b/test/iptables/runner/BUILD
index c6c42d870..a5b6f082c 100644
--- a/test/iptables/runner/BUILD
+++ b/test/iptables/runner/BUILD
@@ -10,6 +10,7 @@ container_image(
 
 go_image(
     name = "runner",
+    testonly = 1,
     srcs = ["main.go"],
     base = ":iptables-base",
     deps = ["//test/iptables"],
-- 
cgit v1.2.3


From ff719159befaee7d2abcfeb88905a7486cd34845 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 9 Jan 2020 15:38:21 -0800
Subject: Confirmed that it works if I hardcode 17 in for pkt.Protocol. Need to
 address parsing the packet early :(

---
 pkg/tcpip/iptables/iptables.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index b8d70ec1e..4e1700fdb 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -188,9 +188,9 @@ func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ru
 
 	// First check whether the packet matches the IP header filter.
 	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
-	// if rule.Filter.Protocol != pkt.Protocol {
-	// 	return Continue
-	// }
+	if rule.Filter.Protocol != pkt.Protocol {
+		return Continue
+	}
 
 	// Go through each rule matcher. If they all match, run
 	// the rule target.
-- 
cgit v1.2.3


From 04abc9cf558930472605bf740a4333d6fafe5930 Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Thu, 9 Jan 2020 15:38:28 -0800
Subject: Add test for redirect port

Fix the indentation and print statements.
Moved the NAT redirect tests to new file.
Added negative test to check redirect rule on ports other than
redirected port.
---
 test/iptables/BUILD            |  1 +
 test/iptables/filter_input.go  | 28 --------------
 test/iptables/iptables_test.go |  9 ++++-
 test/iptables/nat.go           | 83 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 91 insertions(+), 30 deletions(-)
 create mode 100644 test/iptables/nat.go

diff --git a/test/iptables/BUILD b/test/iptables/BUILD
index fa833c3b2..68eed721e 100644
--- a/test/iptables/BUILD
+++ b/test/iptables/BUILD
@@ -8,6 +8,7 @@ go_library(
         "filter_input.go",
         "iptables.go",
         "iptables_util.go",
+        "nat.go",
     ],
     importpath = "gvisor.dev/gvisor/test/iptables",
     visibility = ["//test/iptables:__subpackages__"],
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 41bb85369..923f44e68 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -23,7 +23,6 @@ import (
 const (
 	dropPort         = 2401
 	acceptPort       = 2402
-	redirectPort     = 42
 	sendloopDuration = 2 * time.Second
 	network          = "udp4"
 )
@@ -32,7 +31,6 @@ func init() {
 	RegisterTestCase(FilterInputDropUDP{})
 	RegisterTestCase(FilterInputDropUDPPort{})
 	RegisterTestCase(FilterInputDropDifferentUDPPort{})
-	RegisterTestCase(FilterInputRedirectUDPPort{})
 }
 
 // FilterInputDropUDP tests that we can drop UDP traffic.
@@ -124,29 +122,3 @@ func (FilterInputDropDifferentUDPPort) ContainerAction(ip net.IP) error {
 func (FilterInputDropDifferentUDPPort) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
-
-// FilterInputRedirectUDPPort tests that packets are redirected to different port.
-type FilterInputRedirectUDPPort struct{}
-
-// Name implements TestCase.Name.
-func (FilterInputRedirectUDPPort) Name() string {
-        return "FilterInputRedirectUDPPort"
-}
-
-// ContainerAction implements TestCase.ContainerAction.
-func (FilterInputRedirectUDPPort) ContainerAction(ip net.IP) error {
-        if err := filterTable("-t", "nat", "-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
-		return err
-	}
-
-	if err := listenUDP(redirectPort, sendloopDuration); err != nil {
-	        return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %v", acceptPort, redirectPort, err)
-	}
-
-	return nil
-}
-
-// LocalAction implements TestCase.LocalAction.
-func (FilterInputRedirectUDPPort) LocalAction(ip net.IP) error {
-        return sendUDPLoop(ip, acceptPort, sendloopDuration)
-}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index d57ddc0fe..fce9247aa 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -178,9 +178,14 @@ func TestFilterInputDropDifferentUDPPort(t *testing.T) {
 	}
 }
 
-func TestFilterInputRedirectUDPPort(t *testing.T) {
-	if err := singleTest(FilterInputRedirectUDPPort{}); err != nil {
+func TestFilterNATRedirectUDPPort(t *testing.T) {
+	if err := singleTest(FilterNATRedirectUDPPort{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
+func TestFilterNATDropUDP(t *testing.T) {
+        if err := singleTest(FilterNATDropUDP{}); err != nil {
+	        t.Fatal(err)
+	}
+}
diff --git a/test/iptables/nat.go b/test/iptables/nat.go
new file mode 100644
index 000000000..6deabf217
--- /dev/null
+++ b/test/iptables/nat.go
@@ -0,0 +1,83 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+        "fmt"
+        "net"
+)
+
+const (
+        redirectPort     = 42
+)
+
+func init() {
+        RegisterTestCase(FilterNATRedirectUDPPort{})
+	RegisterTestCase(FilterNATDropUDP{})
+}
+
+// FilterInputRedirectUDPPort tests that packets are redirected to different port.
+type FilterNATRedirectUDPPort struct{}
+
+// Name implements TestCase.Name.
+func (FilterNATRedirectUDPPort) Name() string {
+        return "FilterNATRedirectUDPPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterNATRedirectUDPPort) ContainerAction(ip net.IP) error {
+        if err := filterTable("-t", "nat", "-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports",
+	fmt.Sprintf("%d", redirectPort)); err != nil {
+		return err
+	}
+
+	if err := listenUDP(redirectPort, sendloopDuration); err != nil {
+	        return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %v", redirectPort, err)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterNATRedirectUDPPort) LocalAction(ip net.IP) error {
+        return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterNATDropUDP tests that packets are not received in ports other than redirect port.
+type FilterNATDropUDP struct{}
+
+// Name implements TestCase.Name.
+func (FilterNATDropUDP) Name() string {
+        return "FilterNATDropUDP"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterNATDropUDP) ContainerAction(ip net.IP) error {
+        if err := filterTable("-t", "nat", "-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports",
+	fmt.Sprintf("%d", redirectPort)); err != nil {
+		return err
+	}
+
+	if err := listenUDP(acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets on port %d should have been redirected to port %d", acceptPort, redirectPort)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterNATDropUDP) LocalAction(ip net.IP) error {
+        return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
-- 
cgit v1.2.3


From 8fafd3142e85175fe56bc3333d859f1a8cfbb878 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 9 Jan 2020 15:55:24 -0800
Subject: Separate NDP tests into its own package

Internal tools timeout after 60s during tests that are required to pass before
changes can be submitted. Separate out NDP tests into its own package to help
prevent timeouts when testing.

PiperOrigin-RevId: 288990597
---
 pkg/tcpip/stack/BUILD         | 23 ++++++++++--
 pkg/tcpip/stack/ndp_test.go   | 87 +++++++++++++++++++++++++++++++++++++++----
 pkg/tcpip/stack/stack_test.go | 74 +++---------------------------------
 3 files changed, 106 insertions(+), 78 deletions(-)

diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index b8f9517d0..826fca4de 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -52,7 +52,6 @@ go_test(
     name = "stack_x_test",
     size = "small",
     srcs = [
-        "ndp_test.go",
         "stack_test.go",
         "transport_demuxer_test.go",
         "transport_test.go",
@@ -62,14 +61,12 @@ go_test(
         "//pkg/rand",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
-        "//pkg/tcpip/checker",
         "//pkg/tcpip/header",
         "//pkg/tcpip/iptables",
         "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/loopback",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
-        "//pkg/tcpip/transport/icmp",
         "//pkg/tcpip/transport/udp",
         "//pkg/waiter",
         "@com_github_google_go-cmp//cmp:go_default_library",
@@ -86,3 +83,23 @@ go_test(
         "//pkg/tcpip",
     ],
 )
+
+go_test(
+    name = "ndp_test",
+    size = "small",
+    srcs = ["ndp_test.go"],
+    deps = [
+        ":stack",
+        "//pkg/rand",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/checker",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/transport/icmp",
+        "//pkg/tcpip/transport/udp",
+        "//pkg/waiter",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
+)
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index d334af289..fa84c94a6 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package stack_test
+package ndp_test
 
 import (
 	"encoding/binary"
@@ -285,6 +285,8 @@ func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tc
 // Included in the subtests is a test to make sure that an invalid
 // RetransmitTimer (<1ms) values get fixed to the default RetransmitTimer of 1s.
 func TestDADResolve(t *testing.T) {
+	t.Parallel()
+
 	tests := []struct {
 		name                    string
 		dupAddrDetectTransmits  uint8
@@ -417,6 +419,8 @@ func TestDADResolve(t *testing.T) {
 // a node doing DAD for the same address), or if another node is detected to own
 // the address already (receive an NA message for the tentative address).
 func TestDADFail(t *testing.T) {
+	t.Parallel()
+
 	tests := []struct {
 		name    string
 		makeBuf func(tgt tcpip.Address) buffer.Prependable
@@ -560,6 +564,8 @@ func TestDADFail(t *testing.T) {
 // TestDADStop tests to make sure that the DAD process stops when an address is
 // removed.
 func TestDADStop(t *testing.T) {
+	t.Parallel()
+
 	ndpDisp := ndpDispatcher{
 		dadC: make(chan ndpDADEvent),
 	}
@@ -632,6 +638,71 @@ func TestDADStop(t *testing.T) {
 	}
 }
 
+// TestNICAutoGenAddrDoesDAD tests that the successful auto-generation of IPv6
+// link-local addresses will only be assigned after the DAD process resolves.
+func TestNICAutoGenAddrDoesDAD(t *testing.T) {
+	t.Parallel()
+
+	ndpDisp := ndpDispatcher{
+		dadC: make(chan ndpDADEvent),
+	}
+	ndpConfigs := stack.DefaultNDPConfigurations()
+	opts := stack.Options{
+		NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs:           ndpConfigs,
+		AutoGenIPv6LinkLocal: true,
+		NDPDisp:              &ndpDisp,
+	}
+
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(opts)
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(_) = %s", err)
+	}
+
+	// Address should not be considered bound to the
+	// NIC yet (DAD ongoing).
+	addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+	}
+	if want := (tcpip.AddressWithPrefix{}); addr != want {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+	}
+
+	linkLocalAddr := header.LinkLocalAddr(linkAddr1)
+
+	// Wait for DAD to resolve.
+	select {
+	case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+		// We should get a resolution event after 1s (default time to
+		// resolve as per default NDP configurations). Waiting for that
+		// resolution time + an extra 1s without a resolution event
+		// means something is wrong.
+		t.Fatal("timed out waiting for DAD resolution")
+	case e := <-ndpDisp.dadC:
+		if e.err != nil {
+			t.Fatal("got DAD error: ", e.err)
+		}
+		if e.nicID != 1 {
+			t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
+		}
+		if e.addr != linkLocalAddr {
+			t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, linkLocalAddr)
+		}
+		if !e.resolved {
+			t.Fatal("got DAD event w/ resolved = false, want = true")
+		}
+	}
+	addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+	}
+	if want := (tcpip.AddressWithPrefix{Address: linkLocalAddr, PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
+	}
+}
+
 // TestSetNDPConfigurationFailsForBadNICID tests to make sure we get an error if
 // we attempt to update NDP configurations using an invalid NICID.
 func TestSetNDPConfigurationFailsForBadNICID(t *testing.T) {
@@ -649,6 +720,8 @@ func TestSetNDPConfigurationFailsForBadNICID(t *testing.T) {
 // configurations without affecting the default NDP configurations or other
 // interfaces' configurations.
 func TestSetNDPConfigurations(t *testing.T) {
+	t.Parallel()
+
 	tests := []struct {
 		name                    string
 		dupAddrDetectTransmits  uint8
@@ -875,6 +948,8 @@ func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, on
 // TestNoRouterDiscovery tests that router discovery will not be performed if
 // configured not to.
 func TestNoRouterDiscovery(t *testing.T) {
+	t.Parallel()
+
 	// Being configured to discover routers means handle and
 	// discover are set to true and forwarding is set to false.
 	// This tests all possible combinations of the configurations,
@@ -887,8 +962,6 @@ func TestNoRouterDiscovery(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverDefaultRouters(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
-			t.Parallel()
-
 			ndpDisp := ndpDispatcher{
 				routerC: make(chan ndpRouterEvent, 1),
 			}
@@ -1123,6 +1196,8 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
 // TestNoPrefixDiscovery tests that prefix discovery will not be performed if
 // configured not to.
 func TestNoPrefixDiscovery(t *testing.T) {
+	t.Parallel()
+
 	prefix := tcpip.AddressWithPrefix{
 		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
 		PrefixLen: 64,
@@ -1140,8 +1215,6 @@ func TestNoPrefixDiscovery(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverOnLinkPrefixes(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
-			t.Parallel()
-
 			ndpDisp := ndpDispatcher{
 				prefixC: make(chan ndpPrefixEvent, 1),
 			}
@@ -1498,6 +1571,8 @@ func contains(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool {
 
 // TestNoAutoGenAddr tests that SLAAC is not performed when configured not to.
 func TestNoAutoGenAddr(t *testing.T) {
+	t.Parallel()
+
 	prefix, _, _ := prefixSubnetAddr(0, "")
 
 	// Being configured to auto-generate addresses means handle and
@@ -1512,8 +1587,6 @@ func TestNoAutoGenAddr(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), AutoGenAddr(%t), Forwarding(%t)", handle, autogen, forwarding), func(t *testing.T) {
-			t.Parallel()
-
 			ndpDisp := ndpDispatcher{
 				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
 			}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 44e5229cc..e8de4e87d 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -24,7 +24,6 @@ import (
 	"sort"
 	"strings"
 	"testing"
-	"time"
 
 	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/rand"
@@ -50,6 +49,8 @@ const (
 	// where another value is explicitly used. It is chosen to match the MTU
 	// of loopback interfaces on linux systems.
 	defaultMTU = 65536
+
+	linkAddr = "\x02\x02\x03\x04\x05\x06"
 )
 
 // fakeNetworkEndpoint is a network-layer protocol endpoint. It counts sent and
@@ -1909,7 +1910,7 @@ func TestNICAutoGenAddr(t *testing.T) {
 		{
 			"Disabled",
 			false,
-			linkAddr1,
+			linkAddr,
 			stack.OpaqueInterfaceIdentifierOptions{
 				NICNameFromID: func(nicID tcpip.NICID, _ string) string {
 					return fmt.Sprintf("nic%d", nicID)
@@ -1920,7 +1921,7 @@ func TestNICAutoGenAddr(t *testing.T) {
 		{
 			"Enabled",
 			true,
-			linkAddr1,
+			linkAddr,
 			stack.OpaqueInterfaceIdentifierOptions{},
 			true,
 		},
@@ -2068,14 +2069,14 @@ func TestNICAutoGenAddrWithOpaque(t *testing.T) {
 			name:      "Disabled",
 			nicName:   "nic1",
 			autoGen:   false,
-			linkAddr:  linkAddr1,
+			linkAddr:  linkAddr,
 			secretKey: secretKey[:],
 		},
 		{
 			name:      "Enabled",
 			nicName:   "nic1",
 			autoGen:   true,
-			linkAddr:  linkAddr1,
+			linkAddr:  linkAddr,
 			secretKey: secretKey[:],
 		},
 		// These are all cases where we would not have generated a
@@ -2213,69 +2214,6 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) {
 	}
 }
 
-// TestNICAutoGenAddrDoesDAD tests that the successful auto-generation of IPv6
-// link-local addresses will only be assigned after the DAD process resolves.
-func TestNICAutoGenAddrDoesDAD(t *testing.T) {
-	ndpDisp := ndpDispatcher{
-		dadC: make(chan ndpDADEvent),
-	}
-	ndpConfigs := stack.DefaultNDPConfigurations()
-	opts := stack.Options{
-		NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs:           ndpConfigs,
-		AutoGenIPv6LinkLocal: true,
-		NDPDisp:              &ndpDisp,
-	}
-
-	e := channel.New(10, 1280, linkAddr1)
-	s := stack.New(opts)
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(_) = %s", err)
-	}
-
-	// Address should not be considered bound to the
-	// NIC yet (DAD ongoing).
-	addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
-	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
-	}
-	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
-	}
-
-	linkLocalAddr := header.LinkLocalAddr(linkAddr1)
-
-	// Wait for DAD to resolve.
-	select {
-	case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
-		// We should get a resolution event after 1s (default time to
-		// resolve as per default NDP configurations). Waiting for that
-		// resolution time + an extra 1s without a resolution event
-		// means something is wrong.
-		t.Fatal("timed out waiting for DAD resolution")
-	case e := <-ndpDisp.dadC:
-		if e.err != nil {
-			t.Fatal("got DAD error: ", e.err)
-		}
-		if e.nicID != 1 {
-			t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
-		}
-		if e.addr != linkLocalAddr {
-			t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, linkLocalAddr)
-		}
-		if !e.resolved {
-			t.Fatal("got DAD event w/ resolved = false, want = true")
-		}
-	}
-	addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
-	if err != nil {
-		t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
-	}
-	if want := (tcpip.AddressWithPrefix{Address: linkLocalAddr, PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
-	}
-}
-
 // TestNewPEB tests that a new PrimaryEndpointBehavior value (peb) is respected
 // when an address's kind gets "promoted" to permanent from permanentExpired.
 func TestNewPEBOnPromotionToPermanent(t *testing.T) {
-- 
cgit v1.2.3


From 26c5653bb547450e85666f345d542b516b3417fc Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 9 Jan 2020 16:55:01 -0800
Subject: Inform NDPDispatcher when Stack learns about available configurations
 via DHCPv6

Inform the Stack's NDPDispatcher when it receives an NDP Router Advertisement
that updates the available configurations via DHCPv6. The Stack makes sure that
its NDPDispatcher isn't informed unless the avaiable configurations via DHCPv6
for a NIC is updated.

Tests: Test that a Stack's NDPDispatcher is informed when it receives an NDP
Router Advertisement that informs it of new configurations available via DHCPv6.
PiperOrigin-RevId: 289001283
---
 pkg/tcpip/stack/ndp.go      |  62 +++++++++++++++++
 pkg/tcpip/stack/ndp_test.go | 165 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 212 insertions(+), 15 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 35825ebf7..a9dd322db 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -115,6 +115,30 @@ var (
 	MinPrefixInformationValidLifetimeForUpdate = 2 * time.Hour
 )
 
+// DHCPv6ConfigurationFromNDPRA is a configuration available via DHCPv6 that an
+// NDP Router Advertisement informed the Stack about.
+type DHCPv6ConfigurationFromNDPRA int
+
+const (
+	// DHCPv6NoConfiguration indicates that no configurations are available via
+	// DHCPv6.
+	DHCPv6NoConfiguration DHCPv6ConfigurationFromNDPRA = iota
+
+	// DHCPv6ManagedAddress indicates that addresses are available via DHCPv6.
+	//
+	// DHCPv6ManagedAddress also implies DHCPv6OtherConfigurations because DHCPv6
+	// will return all available configuration information.
+	DHCPv6ManagedAddress
+
+	// DHCPv6OtherConfigurations indicates that other configuration information is
+	// available via DHCPv6.
+	//
+	// Other configurations are configurations other than addresses. Examples of
+	// other configurations are recursive DNS server list, DNS search lists and
+	// default gateway.
+	DHCPv6OtherConfigurations
+)
+
 // NDPDispatcher is the interface integrators of netstack must implement to
 // receive and handle NDP related events.
 type NDPDispatcher interface {
@@ -194,7 +218,20 @@ type NDPDispatcher interface {
 	// already known DNS servers. If called with known DNS servers, their
 	// valid lifetimes must be refreshed to lifetime (it may be increased,
 	// decreased, or completely invalidated when lifetime = 0).
+	//
+	// This function is not permitted to block indefinitely. It must not
+	// call functions on the stack itself.
 	OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration)
+
+	// OnDHCPv6Configuration will be called with an updated configuration that is
+	// available via DHCPv6 for a specified NIC.
+	//
+	// NDPDispatcher assumes that the initial configuration available by DHCPv6 is
+	// DHCPv6NoConfiguration.
+	//
+	// This function is not permitted to block indefinitely. It must not
+	// call functions on the stack itself.
+	OnDHCPv6Configuration(tcpip.NICID, DHCPv6ConfigurationFromNDPRA)
 }
 
 // NDPConfigurations is the NDP configurations for the netstack.
@@ -281,6 +318,9 @@ type ndpState struct {
 
 	// The addresses generated by SLAAC.
 	autoGenAddresses map[tcpip.Address]autoGenAddressState
+
+	// The last learned DHCPv6 configuration from an NDP RA.
+	dhcpv6Configuration DHCPv6ConfigurationFromNDPRA
 }
 
 // dadState holds the Duplicate Address Detection timer and channel to signal
@@ -533,6 +573,28 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 		return
 	}
 
+	// Only worry about the DHCPv6 configuration if we have an NDPDispatcher as we
+	// only inform the dispatcher on configuration changes. We do nothing else
+	// with the information.
+	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+		var configuration DHCPv6ConfigurationFromNDPRA
+		switch {
+		case ra.ManagedAddrConfFlag():
+			configuration = DHCPv6ManagedAddress
+
+		case ra.OtherConfFlag():
+			configuration = DHCPv6OtherConfigurations
+
+		default:
+			configuration = DHCPv6NoConfiguration
+		}
+
+		if ndp.dhcpv6Configuration != configuration {
+			ndp.dhcpv6Configuration = configuration
+			ndpDisp.OnDHCPv6Configuration(ndp.nic.ID(), configuration)
+		}
+	}
+
 	// Is the NIC configured to discover default routers?
 	if ndp.configs.DiscoverDefaultRouters {
 		rtr, ok := ndp.defaultRouters[ip]
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index fa84c94a6..108762b6e 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -162,18 +162,24 @@ type ndpRDNSSEvent struct {
 	rdnss ndpRDNSS
 }
 
+type ndpDHCPv6Event struct {
+	nicID         tcpip.NICID
+	configuration stack.DHCPv6ConfigurationFromNDPRA
+}
+
 var _ stack.NDPDispatcher = (*ndpDispatcher)(nil)
 
 // ndpDispatcher implements NDPDispatcher so tests can know when various NDP
 // related events happen for test purposes.
 type ndpDispatcher struct {
-	dadC           chan ndpDADEvent
-	routerC        chan ndpRouterEvent
-	rememberRouter bool
-	prefixC        chan ndpPrefixEvent
-	rememberPrefix bool
-	autoGenAddrC   chan ndpAutoGenAddrEvent
-	rdnssC         chan ndpRDNSSEvent
+	dadC                 chan ndpDADEvent
+	routerC              chan ndpRouterEvent
+	rememberRouter       bool
+	prefixC              chan ndpPrefixEvent
+	rememberPrefix       bool
+	autoGenAddrC         chan ndpAutoGenAddrEvent
+	rdnssC               chan ndpRDNSSEvent
+	dhcpv6ConfigurationC chan ndpDHCPv6Event
 }
 
 // Implements stack.NDPDispatcher.OnDuplicateAddressDetectionStatus.
@@ -280,6 +286,16 @@ func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tc
 	}
 }
 
+// Implements stack.NDPDispatcher.OnDHCPv6Configuration.
+func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration stack.DHCPv6ConfigurationFromNDPRA) {
+	if c := n.dhcpv6ConfigurationC; c != nil {
+		c <- ndpDHCPv6Event{
+			nicID,
+			configuration,
+		}
+	}
+}
+
 // TestDADResolve tests that an address successfully resolves after performing
 // DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
 // Included in the subtests is a test to make sure that an invalid
@@ -870,21 +886,32 @@ func TestSetNDPConfigurations(t *testing.T) {
 	}
 }
 
-// raBufWithOpts returns a valid NDP Router Advertisement with options.
-//
-// Note, raBufWithOpts does not populate any of the RA fields other than the
-// Router Lifetime.
-func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) tcpip.PacketBuffer {
+// raBufWithOptsAndDHCPv6 returns a valid NDP Router Advertisement with options
+// and DHCPv6 configurations specified.
+func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) tcpip.PacketBuffer {
 	icmpSize := header.ICMPv6HeaderSize + header.NDPRAMinimumSize + int(optSer.Length())
 	hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
 	pkt := header.ICMPv6(hdr.Prepend(icmpSize))
 	pkt.SetType(header.ICMPv6RouterAdvert)
 	pkt.SetCode(0)
-	ra := header.NDPRouterAdvert(pkt.NDPPayload())
+	raPayload := pkt.NDPPayload()
+	ra := header.NDPRouterAdvert(raPayload)
+	// Populate the Router Lifetime.
+	binary.BigEndian.PutUint16(raPayload[2:], rl)
+	// Populate the Managed Address flag field.
+	if managedAddress {
+		// The Managed Addresses flag field is the 7th bit of byte #1 (0-indexing)
+		// of the RA payload.
+		raPayload[1] |= (1 << 7)
+	}
+	// Populate the Other Configurations flag field.
+	if otherConfigurations {
+		// The Other Configurations flag field is the 6th bit of byte #1
+		// (0-indexing) of the RA payload.
+		raPayload[1] |= (1 << 6)
+	}
 	opts := ra.Options()
 	opts.Serialize(optSer)
-	// Populate the Router Lifetime.
-	binary.BigEndian.PutUint16(pkt.NDPPayload()[2:], rl)
 	pkt.SetChecksum(header.ICMPv6Checksum(pkt, ip, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
 	payloadLength := hdr.UsedLength()
 	iph := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
@@ -899,6 +926,23 @@ func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializ
 	return tcpip.PacketBuffer{Data: hdr.View().ToVectorisedView()}
 }
 
+// raBufWithOpts returns a valid NDP Router Advertisement with options.
+//
+// Note, raBufWithOpts does not populate any of the RA fields other than the
+// Router Lifetime.
+func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) tcpip.PacketBuffer {
+	return raBufWithOptsAndDHCPv6(ip, rl, false, false, optSer)
+}
+
+// raBufWithDHCPv6 returns a valid NDP Router Advertisement with DHCPv6 related
+// fields set.
+//
+// Note, raBufWithDHCPv6 does not populate any of the RA fields other than the
+// DHCPv6 related ones.
+func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bool) tcpip.PacketBuffer {
+	return raBufWithOptsAndDHCPv6(ip, 0, managedAddresses, otherConfiguratiosns, header.NDPOptionsSerializer{})
+}
+
 // raBuf returns a valid NDP Router Advertisement.
 //
 // Note, raBuf does not populate any of the RA fields other than the
@@ -3024,3 +3068,94 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	default:
 	}
 }
+
+// TestDHCPv6ConfigurationFromNDPDA tests that the NDPDispatcher is properly
+// informed when new information about what configurations are available via
+// DHCPv6 is learned.
+func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
+	const nicID = 1
+
+	ndpDisp := ndpDispatcher{
+		dhcpv6ConfigurationC: make(chan ndpDHCPv6Event, 1),
+		rememberRouter:       true,
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	expectDHCPv6Event := func(configuration stack.DHCPv6ConfigurationFromNDPRA) {
+		t.Helper()
+		select {
+		case e := <-ndpDisp.dhcpv6ConfigurationC:
+			if diff := cmp.Diff(ndpDHCPv6Event{nicID: nicID, configuration: configuration}, e, cmp.AllowUnexported(e)); diff != "" {
+				t.Errorf("dhcpv6 event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected DHCPv6 configuration event")
+		}
+	}
+
+	expectNoDHCPv6Event := func() {
+		t.Helper()
+		select {
+		case <-ndpDisp.dhcpv6ConfigurationC:
+			t.Fatal("unexpected DHCPv6 configuration event")
+		default:
+		}
+	}
+
+	// The initial DHCPv6 configuration should be stack.DHCPv6NoConfiguration.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false))
+	expectNoDHCPv6Event()
+
+	// Receive an RA that updates the DHCPv6 configuration to Other
+	// Configurations.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
+	expectDHCPv6Event(stack.DHCPv6OtherConfigurations)
+	// Receiving the same update again should not result in an event to the
+	// NDPDispatcher.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
+	expectNoDHCPv6Event()
+
+	// Receive an RA that updates the DHCPv6 configuration to Managed Address.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, false))
+	expectDHCPv6Event(stack.DHCPv6ManagedAddress)
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, false))
+	expectNoDHCPv6Event()
+
+	// Receive an RA that updates the DHCPv6 configuration to none.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false))
+	expectDHCPv6Event(stack.DHCPv6NoConfiguration)
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false))
+	expectNoDHCPv6Event()
+
+	// Receive an RA that updates the DHCPv6 configuration to Managed Address.
+	//
+	// Note, when the M flag is set, the O flag is redundant.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, true))
+	expectDHCPv6Event(stack.DHCPv6ManagedAddress)
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, true))
+	expectNoDHCPv6Event()
+	// Even though the DHCPv6 flags are different, the effective configuration is
+	// the same so we should not receive a new event.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, false))
+	expectNoDHCPv6Event()
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, true))
+	expectNoDHCPv6Event()
+
+	// Receive an RA that updates the DHCPv6 configuration to Other
+	// Configurations.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
+	expectDHCPv6Event(stack.DHCPv6OtherConfigurations)
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
+	expectNoDHCPv6Event()
+}
-- 
cgit v1.2.3


From f1b69b159f440b93845c11192fcf1c7633d2b6c8 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Thu, 9 Jan 2020 17:22:48 -0800
Subject: Implement gcloud_producer for bm-tools

bm-tools works via "Machine" objects, which are front
objects for actual virtual or physical machines. glcoud_producer
produces machines on GCP using the `gcloud` tool.

Included are:
- GCloudProducer - the class producing machines
- MockGCloudProducer - class for mocking GCloudProducer.
- tests using the mock
- test data as .json files. Code to generate this test data may be
included in a follow up.

PiperOrigin-RevId: 289005958
---
 benchmarks/BUILD                                   |   1 +
 benchmarks/harness/BUILD                           |   5 +-
 benchmarks/harness/machine_producers/BUILD         |  40 ++++
 .../harness/machine_producers/gcloud_producer.py   | 250 +++++++++++++++++++++
 .../machine_producers/gcloud_producer_test.py      |  48 ++++
 .../harness/machine_producers/mock_producer.py     |  23 +-
 .../machine_producers/testdata/get_five.json       | 211 +++++++++++++++++
 .../machine_producers/testdata/get_one.json        | 145 ++++++++++++
 8 files changed, 721 insertions(+), 2 deletions(-)
 create mode 100644 benchmarks/harness/machine_producers/gcloud_producer.py
 create mode 100644 benchmarks/harness/machine_producers/gcloud_producer_test.py
 create mode 100644 benchmarks/harness/machine_producers/testdata/get_five.json
 create mode 100644 benchmarks/harness/machine_producers/testdata/get_one.json

diff --git a/benchmarks/BUILD b/benchmarks/BUILD
index dbadeeaf2..1455c6c5b 100644
--- a/benchmarks/BUILD
+++ b/benchmarks/BUILD
@@ -5,5 +5,6 @@ py_binary(
     srcs = ["run.py"],
     main = "run.py",
     python_version = "PY3",
+    srcs_version = "PY3",
     deps = ["//benchmarks/runner"],
 )
diff --git a/benchmarks/harness/BUILD b/benchmarks/harness/BUILD
index 9546220c4..081a74243 100644
--- a/benchmarks/harness/BUILD
+++ b/benchmarks/harness/BUILD
@@ -24,6 +24,7 @@ py_library(
     name = "container",
     srcs = ["container.py"],
     deps = [
+        "//benchmarks/workloads",
         requirement("asn1crypto", False),
         requirement("chardet", False),
         requirement("certifi", False),
@@ -45,6 +46,7 @@ py_library(
         "//benchmarks/harness:container",
         "//benchmarks/harness:ssh_connection",
         "//benchmarks/harness:tunnel_dispatcher",
+        "//benchmarks/harness/machine_mocks",
         requirement("asn1crypto", False),
         requirement("chardet", False),
         requirement("certifi", False),
@@ -53,6 +55,7 @@ py_library(
         requirement("idna", False),
         requirement("ptyprocess", False),
         requirement("requests", False),
+        requirement("six", False),
         requirement("urllib3", False),
         requirement("websocket-client", False),
     ],
@@ -64,7 +67,7 @@ py_library(
     deps = [
         "//benchmarks/harness",
         requirement("bcrypt", False),
-        requirement("cffi", False),
+        requirement("cffi", True),
         requirement("paramiko", True),
         requirement("cryptography", False),
     ],
diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD
index a48da02a1..c4e943882 100644
--- a/benchmarks/harness/machine_producers/BUILD
+++ b/benchmarks/harness/machine_producers/BUILD
@@ -20,6 +20,7 @@ py_library(
     srcs = ["mock_producer.py"],
     deps = [
         "//benchmarks/harness:machine",
+        "//benchmarks/harness/machine_producers:gcloud_producer",
         "//benchmarks/harness/machine_producers:machine_producer",
     ],
 )
@@ -38,3 +39,42 @@ py_library(
     name = "gcloud_mock_recorder",
     srcs = ["gcloud_mock_recorder.py"],
 )
+
+py_library(
+    name = "gcloud_producer",
+    srcs = ["gcloud_producer.py"],
+    deps = [
+        "//benchmarks/harness:machine",
+        "//benchmarks/harness/machine_producers:gcloud_mock_recorder",
+        "//benchmarks/harness/machine_producers:machine_producer",
+    ],
+)
+
+filegroup(
+    name = "test_data",
+    srcs = [
+        "testdata/get_five.json",
+        "testdata/get_one.json",
+    ],
+)
+
+py_library(
+    name = "gcloud_producer_test_lib",
+    srcs = ["gcloud_producer_test.py"],
+    deps = [
+        "//benchmarks/harness/machine_producers:machine_producer",
+        "//benchmarks/harness/machine_producers:mock_producer",
+    ],
+)
+
+py_test(
+    name = "gcloud_producer_test",
+    srcs = [":gcloud_producer_test_lib"],
+    data = [
+        ":test_data",
+    ],
+    python_version = "PY3",
+    tags = [
+        "local",
+    ],
+)
diff --git a/benchmarks/harness/machine_producers/gcloud_producer.py b/benchmarks/harness/machine_producers/gcloud_producer.py
new file mode 100644
index 000000000..4693dd8a2
--- /dev/null
+++ b/benchmarks/harness/machine_producers/gcloud_producer.py
@@ -0,0 +1,250 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A machine producer which produces machine objects using `gcloud`.
+
+Machine producers produce valid harness.Machine objects which are backed by
+real machines. This producer produces those machines on the given user's GCP
+account using the `gcloud` tool.
+
+GCloudProducer creates instances on the given GCP account named like:
+`machine-XXXXXXX-XXXX-XXXX-XXXXXXXXXXXX` in a randomized fashion such that name
+collisions with user instances shouldn't happen.
+
+  Typical usage example:
+
+  producer = GCloudProducer(args)
+  machines = producer.get_machines(NUM_MACHINES)
+  # run stuff on machines with machines[i].run(CMD)
+  producer.release_machines(NUM_MACHINES)
+"""
+import datetime
+import getpass
+import json
+import subprocess
+import threading
+from typing import List, Dict, Any
+import uuid
+
+from benchmarks.harness import machine
+from benchmarks.harness.machine_producers import gcloud_mock_recorder
+from benchmarks.harness.machine_producers import machine_producer
+
+DEFAULT_USER = getpass.getuser()
+
+
+class GCloudProducer(machine_producer.MachineProducer):
+  """Implementation of MachineProducer backed by GCP.
+
+  Produces Machine objects backed by GCP instances.
+
+  Attributes:
+    project: The GCP project name under which to create the machines.
+    ssh_key_path: path to a valid ssh key. See README on vaild ssh keys.
+    image: image name as a string.
+    image_project: image project as a string.
+    zone: string to a valid GCP zone.
+    ssh_user: string of user name for ssh_key
+    ssh_password: string of password for ssh key
+    mock: a mock printer which will print mock data if required. Mock data is
+      recorded output from subprocess calls (returncode, stdout, args).
+    condition: mutex for this class around machine creation and deleteion.
+  """
+
+  def __init__(self,
+               project: str,
+               ssh_key_path: str,
+               image: str,
+               image_project: str,
+               zone: str,
+               ssh_user: str,
+               mock: gcloud_mock_recorder.MockPrinter = None):
+    self.project = project
+    self.ssh_key_path = ssh_key_path
+    self.image = image
+    self.image_project = image_project
+    self.zone = zone
+    self.ssh_user = ssh_user if ssh_user else DEFAULT_USER
+    self.mock = mock
+    self.condition = threading.Condition()
+
+  def get_machines(self, num_machines: int) -> List[machine.Machine]:
+    """Returns requested number of machines backed by GCP instances."""
+    if num_machines <= 0:
+      raise ValueError(
+          "Cannot ask for {num} machines!".format(num=num_machines))
+    with self.condition:
+      names = self._get_unique_names(num_machines)
+      self._build_instances(names)
+      instances = self._start_command(names)
+      self._add_ssh_key_to_instances(names)
+      return self._machines_from_instances(instances)
+
+  def release_machines(self, machine_list: List[machine.Machine]):
+    """Releases the requested number of machines, deleting the instances."""
+    if not machine_list:
+      return
+    with self.condition:
+      cmd = "gcloud compute instances delete --quiet".split(" ")
+      names = [str(m) for m in machine_list]
+      cmd.extend(names)
+      cmd.append("--zone={zone}".format(zone=self.zone))
+      self._run_command(cmd)
+
+  def _machines_from_instances(
+      self, instances: List[Dict[str, Any]]) -> List[machine.Machine]:
+    """Creates Machine Objects from json data describing created instances."""
+    machines = []
+    for instance in instances:
+      name = instance["name"]
+      kwargs = {
+          "hostname":
+              instance["networkInterfaces"][0]["accessConfigs"][0]["natIP"],
+          "key_path":
+              self.ssh_key_path,
+          "username":
+              self.ssh_user
+      }
+      machines.append(machine.RemoteMachine(name=name, **kwargs))
+    return machines
+
+  def _get_unique_names(self, num_names) -> List[str]:
+    """Returns num_names unique names based on data from the GCP project."""
+    curr_machines = self._list_machines()
+    curr_names = set([machine["name"] for machine in curr_machines])
+    ret = []
+    while len(ret) < num_names:
+      new_name = "machine-" + str(uuid.uuid4())
+      if new_name not in curr_names:
+        ret.append(new_name)
+        curr_names.update(new_name)
+    return ret
+
+  def _build_instances(self, names: List[str]) -> List[Dict[str, Any]]:
+    """Creates instances using gcloud command.
+
+    Runs the command `gcloud compute instances create` and returns json data
+    on created instances on success. Creates len(names) instances, one for each
+    name.
+
+    Args:
+      names: list of names of instances to create.
+
+    Returns:
+      List of json data describing created machines.
+    """
+    if not names:
+      raise ValueError(
+          "_build_instances cannot create instances without names.")
+    cmd = "gcloud compute instances create".split(" ")
+    cmd.extend(names)
+    cmd.extend("--preemptible --image={image} --zone={zone}".format(
+        image=self.image, zone=self.zone).split(" "))
+    if self.image_project:
+      cmd.append("--image-project={project}".format(project=self.image_project))
+      res = self._run_command(cmd)
+      return json.loads(res.stdout)
+
+  def _start_command(self, names):
+    """Starts instances using gcloud command.
+
+    Runs the command `gcloud compute instances start` on list of instances by
+    name and returns json data on started instances on success.
+
+    Args:
+      names: list of names of instances to start.
+
+    Returns:
+      List of json data describing started machines.
+    """
+    if not names:
+      raise ValueError("_start_command cannot start empty instance list.")
+    cmd = "gcloud compute instances start".split(" ")
+    cmd.extend(names)
+    cmd.append("--zone={zone}".format(zone=self.zone))
+    cmd.append("--project={project}".format(project=self.project))
+    res = self._run_command(cmd)
+    return json.loads(res.stdout)
+
+  def _add_ssh_key_to_instances(self, names: List[str]) -> None:
+    """Adds ssh key to instances by calling gcloud ssh command.
+
+    Runs the command `gcloud compute ssh instance_name` on list of images by
+    name. Tries to ssh into given instance
+
+    Args:
+      names: list of machine names to which to add the ssh-key
+        self.ssh_key_path.
+
+    Raises:
+      subprocess.CalledProcessError: when underlying subprocess call returns an
+      error other than 255 (Connection closed by remote host).
+      TimeoutError: when 3 unsuccessful tries to ssh into the host return 255.
+    """
+    for name in names:
+      cmd = "gcloud compute ssh {name}".format(name=name).split(" ")
+      cmd.append("--ssh-key-file={key}".format(key=self.ssh_key_path))
+      cmd.append("--zone={zone}".format(zone=self.zone))
+      cmd.append("--command=uname")
+      timeout = datetime.timedelta(seconds=5 * 60)
+      start = datetime.datetime.now()
+      while datetime.datetime.now() <= timeout + start:
+        try:
+          self._run_command(cmd)
+          break
+        except subprocess.CalledProcessError as e:
+          if datetime.datetime.now() > timeout + start:
+            raise TimeoutError(
+                "Could not SSH into instance after 5 min: {name}".format(
+                    name=name))
+          # 255 is the returncode for ssh connection refused.
+          elif e.returncode == 255:
+
+            continue
+          else:
+            raise e
+
+  def _list_machines(self) -> List[Dict[str, Any]]:
+    """Runs `list` gcloud command and returns list of Machine data."""
+    cmd = "gcloud compute instances list --project {project}".format(
+        project=self.project).split(" ")
+    res = self._run_command(cmd)
+    return json.loads(res.stdout)
+
+  def _run_command(self, cmd: List[str]) -> subprocess.CompletedProcess:
+    """Runs command as a subprocess.
+
+    Runs command as subprocess and returns the result.
+    If this has a mock recorder, use the record method to record the subprocess
+    call.
+
+    Args:
+      cmd: command to be run as a list of strings.
+
+    Returns:
+      Completed process object to be parsed by caller.
+
+    Raises:
+      CalledProcessError: if subprocess.run returns an error.
+    """
+    cmd = cmd + ["--format=json"]
+    res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    if self.mock:
+      self.mock.record(res)
+    if res.returncode != 0:
+      raise subprocess.CalledProcessError(
+          cmd=res.args,
+          output=res.stdout,
+          stderr=res.stderr,
+          returncode=res.returncode)
+    return res
diff --git a/benchmarks/harness/machine_producers/gcloud_producer_test.py b/benchmarks/harness/machine_producers/gcloud_producer_test.py
new file mode 100644
index 000000000..c8adb2bdc
--- /dev/null
+++ b/benchmarks/harness/machine_producers/gcloud_producer_test.py
@@ -0,0 +1,48 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests GCloudProducer using mock data.
+
+GCloudProducer produces machines using 'get_machines' and 'release_machines'
+methods. The tests check recorded data (jsonified subprocess.CompletedProcess
+objects) of the producer producing one and five machines.
+"""
+import os
+import types
+
+from benchmarks.harness.machine_producers import machine_producer
+from benchmarks.harness.machine_producers import mock_producer
+
+TEST_DIR = os.path.dirname(__file__)
+
+
+def run_get_release(producer: machine_producer.MachineProducer,
+                    num_machines: int,
+                    validator: types.FunctionType = None):
+  machines = producer.get_machines(num_machines)
+  assert len(machines) == num_machines
+  if validator:
+    validator(machines=machines, cmd="uname -a", workload=None)
+  producer.release_machines(machines)
+
+
+def test_run_one():
+  mock = mock_producer.MockReader(TEST_DIR + "get_one.json")
+  producer = mock_producer.MockGCloudProducer(mock)
+  run_get_release(producer, 1)
+
+
+def test_run_five():
+  mock = mock_producer.MockReader(TEST_DIR + "get_five.json")
+  producer = mock_producer.MockGCloudProducer(mock)
+  run_get_release(producer, 5)
diff --git a/benchmarks/harness/machine_producers/mock_producer.py b/benchmarks/harness/machine_producers/mock_producer.py
index 4f29ad53f..37e9cb4b7 100644
--- a/benchmarks/harness/machine_producers/mock_producer.py
+++ b/benchmarks/harness/machine_producers/mock_producer.py
@@ -13,9 +13,11 @@
 # limitations under the License.
 """Producers of mocks."""
 
-from typing import List
+from typing import List, Any
 
 from benchmarks.harness import machine
+from benchmarks.harness.machine_producers import gcloud_mock_recorder
+from benchmarks.harness.machine_producers import gcloud_producer
 from benchmarks.harness.machine_producers import machine_producer
 
 
@@ -29,3 +31,22 @@ class MockMachineProducer(machine_producer.MachineProducer):
   def release_machines(self, machine_list: List[machine.MockMachine]):
     """No-op."""
     return
+
+
+class MockGCloudProducer(gcloud_producer.GCloudProducer):
+  """Mocks GCloudProducer for testing purposes."""
+
+  def __init__(self, mock: gcloud_mock_recorder.MockReader, **kwargs):
+    gcloud_producer.GCloudProducer.__init__(
+        self, project="mock", ssh_private_key_path="mock", **kwargs)
+    self.mock = mock
+
+  def _validate_ssh_file(self):
+    pass
+
+  def _run_command(self, cmd):
+    return self.mock.pop(cmd)
+
+  def _machines_from_instances(
+      self, instances: List[Any]) -> List[machine.MockMachine]:
+    return [machine.MockMachine() for _ in instances]
diff --git a/benchmarks/harness/machine_producers/testdata/get_five.json b/benchmarks/harness/machine_producers/testdata/get_five.json
new file mode 100644
index 000000000..32bad1b06
--- /dev/null
+++ b/benchmarks/harness/machine_producers/testdata/get_five.json
@@ -0,0 +1,211 @@
+[
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "instances",
+            "list",
+            "--project",
+            "project",
+            "--format=json"
+        ],
+        "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":{\"natIP\":\"0.0.0.0\"}]}]}]",
+        "returncode": "0"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "instances",
+            "create",
+            "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+            "machine-5f28f145-cc2d-427d-9cbf-428d164cdb92",
+            "machine-da5859b5-bae6-435d-8005-0202d6f6e065",
+            "machine-880a8a2f-918c-4f9e-a43c-ed3c8e02ea05",
+            "machine-1149147d-71e2-43ea-8fe1-49256e5c441c",
+            "--preemptible",
+            "--image=ubuntu-1910-eoan-v20191204",
+            "--zone=us-west1-b",
+            "--image-project=ubuntu-os-cloud",
+            "--format=json"
+        ],
+        "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]",
+        "returncode": "0"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "instances",
+            "start",
+            "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+            "machine-5f28f145-cc2d-427d-9cbf-428d164cdb92",
+            "machine-da5859b5-bae6-435d-8005-0202d6f6e065",
+            "machine-880a8a2f-918c-4f9e-a43c-ed3c8e02ea05",
+            "machine-1149147d-71e2-43ea-8fe1-49256e5c441c",
+            "--zone=us-west1-b",
+            "--project=project",
+            "--format=json"
+        ],
+        "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]",
+        "returncode": "0"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "",
+        "returncode": "255"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "",
+        "returncode": "255"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "",
+        "returncode": "255"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "",
+        "returncode": "255"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "",
+        "returncode": "255"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "Linux\n[]\n",
+        "returncode": "0"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-5f28f145-cc2d-427d-9cbf-428d164cdb92",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "Linux\n[]\n",
+        "returncode": "0"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-da5859b5-bae6-435d-8005-0202d6f6e065",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "Linux\n[]\n",
+        "returncode": "0"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-880a8a2f-918c-4f9e-a43c-ed3c8e02ea05",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "Linux\n[]\n",
+        "returncode": "0"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-1149147d-71e2-43ea-8fe1-49256e5c441c",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "Linux\n[]\n",
+        "returncode": "0"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "instances",
+            "delete",
+            "--quiet",
+            "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+            "machine-5f28f145-cc2d-427d-9cbf-428d164cdb92",
+            "machine-da5859b5-bae6-435d-8005-0202d6f6e065",
+            "machine-880a8a2f-918c-4f9e-a43c-ed3c8e02ea05",
+            "machine-1149147d-71e2-43ea-8fe1-49256e5c441c",
+            "--zone=us-west1-b",
+            "--format=json"
+        ],
+        "stdout": "[]\n",
+        "returncode": "0"
+    }
+]
diff --git a/benchmarks/harness/machine_producers/testdata/get_one.json b/benchmarks/harness/machine_producers/testdata/get_one.json
new file mode 100644
index 000000000..c359c19c8
--- /dev/null
+++ b/benchmarks/harness/machine_producers/testdata/get_one.json
@@ -0,0 +1,145 @@
+[
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "instances",
+            "list",
+            "--project",
+            "linux-testing-user",
+            "--format=json"
+        ],
+        "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]",
+
+        "returncode": "0"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "instances",
+            "create",
+            "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+            "--preemptible",
+            "--image=ubuntu-1910-eoan-v20191204",
+            "--zone=us-west1-b",
+            "--image-project=ubuntu-os-cloud",
+            "--format=json"
+        ],
+        "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]",
+        "returncode": "0"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "instances",
+            "start",
+            "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+            "--zone=us-west1-b",
+            "--project=linux-testing-user",
+            "--format=json"
+        ],
+        "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]",
+        
+        "returncode": "0"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "",
+        "returncode": "255"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "",
+        "returncode": "255"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "",
+        "returncode": "255"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "",
+        "returncode": "255"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "",
+        "returncode": "255"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "ssh",
+            "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+            "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+            "--zone=us-west1-b",
+            "--command=uname",
+            "--format=json"
+        ],
+        "stdout": "Linux\n[]\n",
+        "returncode": "0"
+    },
+    {
+        "args": [
+            "gcloud",
+            "compute",
+            "instances",
+            "delete",
+            "--quiet",
+            "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+            "--zone=us-west1-b",
+            "--format=json"
+        ],
+        "stdout": "[]\n",
+        "returncode": "0"
+    }
+]
-- 
cgit v1.2.3


From 356d81146bafc4b4548163eb87e886c851b49e12 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 9 Jan 2020 17:56:58 -0800
Subject: Deflake a couple of TCP syscall tests when run under gotsan.

PiperOrigin-RevId: 289010316
---
 .../linux/socket_bind_to_device_distribution.cc    | 25 ++++++++++++++++++--
 test/syscalls/linux/socket_inet_loopback.cc        | 27 +++++++++++++++++++---
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/test/syscalls/linux/socket_bind_to_device_distribution.cc b/test/syscalls/linux/socket_bind_to_device_distribution.cc
index 5767181a1..5ed57625c 100644
--- a/test/syscalls/linux/socket_bind_to_device_distribution.cc
+++ b/test/syscalls/linux/socket_bind_to_device_distribution.cc
@@ -183,7 +183,14 @@ TEST_P(BindToDeviceDistributionTest, Tcp) {
             }
             // Receive some data from a socket to be sure that the connect()
             // system call has been completed on another side.
-            int data;
+            // Do a short read and then close the socket to trigger a RST. This
+            // ensures that both ends of the connection are cleaned up and no
+            // goroutines hang around in TIME-WAIT. We do this so that this test
+            // does not timeout under gotsan runs where lots of goroutines can
+            // cause the test to use absurd amounts of memory.
+            //
+            // See: https://tools.ietf.org/html/rfc2525#page-50 section 2.17
+            uint16_t data;
             EXPECT_THAT(
                 RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0),
                 SyscallSucceedsWithValue(sizeof(data)));
@@ -198,15 +205,29 @@ TEST_P(BindToDeviceDistributionTest, Tcp) {
   }
 
   for (int i = 0; i < kConnectAttempts; i++) {
-    FileDescriptor const fd = ASSERT_NO_ERRNO_AND_VALUE(
+    const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
         Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
     ASSERT_THAT(
         RetryEINTR(connect)(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
                             connector.addr_len),
         SyscallSucceeds());
 
+    // Do two separate sends to ensure two segments are received. This is
+    // required for netstack where read is incorrectly assuming a whole
+    // segment is read when endpoint.Read() is called which is technically
+    // incorrect as the syscall that invoked endpoint.Read() may only
+    // consume it partially. This results in a case where a close() of
+    // such a socket does not trigger a RST in netstack due to the
+    // endpoint assuming that the endpoint has no unread data.
     EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
                 SyscallSucceedsWithValue(sizeof(i)));
+
+    // TODO(gvisor.dev/issue/1449): Remove this block once netstack correctly
+    //   generates a RST.
+    if (IsRunningOnGvisor()) {
+      EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
+                  SyscallSucceedsWithValue(sizeof(i)));
+    }
   }
 
   // Join threads to be sure that all connections have been counted.
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 619d41901..138024d9e 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -714,7 +714,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
   sockaddr_storage listen_addr = listener.addr;
   sockaddr_storage conn_addr = connector.addr;
   constexpr int kThreadCount = 3;
-  constexpr int kConnectAttempts = 4096;
+  constexpr int kConnectAttempts = 10000;
 
   // Create the listening socket.
   FileDescriptor listener_fds[kThreadCount];
@@ -729,7 +729,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
     ASSERT_THAT(
         bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
         SyscallSucceeds());
-    ASSERT_THAT(listen(fd, kConnectAttempts / 3), SyscallSucceeds());
+    ASSERT_THAT(listen(fd, 40), SyscallSucceeds());
 
     // On the first bind we need to determine which port was bound.
     if (i != 0) {
@@ -772,7 +772,14 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
             }
             // Receive some data from a socket to be sure that the connect()
             // system call has been completed on another side.
-            int data;
+            // Do a short read and then close the socket to trigger a RST. This
+            // ensures that both ends of the connection are cleaned up and no
+            // goroutines hang around in TIME-WAIT. We do this so that this test
+            // does not timeout under gotsan runs where lots of goroutines can
+            // cause the test to use absurd amounts of memory.
+            //
+            // See: https://tools.ietf.org/html/rfc2525#page-50 section 2.17
+            uint16_t data;
             EXPECT_THAT(
                 RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0),
                 SyscallSucceedsWithValue(sizeof(data)));
@@ -795,8 +802,22 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
                               connector.addr_len),
           SyscallSucceeds());
 
+      // Do two separate sends to ensure two segments are received. This is
+      // required for netstack where read is incorrectly assuming a whole
+      // segment is read when endpoint.Read() is called which is technically
+      // incorrect as the syscall that invoked endpoint.Read() may only
+      // consume it partially. This results in a case where a close() of
+      // such a socket does not trigger a RST in netstack due to the
+      // endpoint assuming that the endpoint has no unread data.
       EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
                   SyscallSucceedsWithValue(sizeof(i)));
+
+      // TODO(gvisor.dev/issue/1449): Remove this block once netstack correctly
+      //   generates a RST.
+      if (IsRunningOnGvisor()) {
+        EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
+                    SyscallSucceedsWithValue(sizeof(i)));
+      }
     }
   });
 
-- 
cgit v1.2.3


From 27500d529f7fb87eef8812278fd1bbca67bcba72 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Thu, 9 Jan 2020 22:00:42 -0800
Subject: New sync package.

* Rename syncutil to sync.
* Add aliases to sync types.
* Replace existing usage of standard library sync package.

This will make it easier to swap out synchronization primitives. For example,
this will allow us to use primitives from github.com/sasha-s/go-deadlock to
check for lock ordering violations.

Updates #1472

PiperOrigin-RevId: 289033387
---
 pkg/amutex/BUILD                                   |   1 +
 pkg/amutex/amutex_test.go                          |   3 +-
 pkg/atomicbitops/BUILD                             |   1 +
 pkg/atomicbitops/atomic_bitops_test.go             |   3 +-
 pkg/compressio/BUILD                               |   5 +-
 pkg/compressio/compressio.go                       |   2 +-
 pkg/control/server/BUILD                           |   1 +
 pkg/control/server/server.go                       |   2 +-
 pkg/eventchannel/BUILD                             |   2 +
 pkg/eventchannel/event.go                          |   2 +-
 pkg/eventchannel/event_test.go                     |   2 +-
 pkg/fdchannel/BUILD                                |   1 +
 pkg/fdchannel/fdchannel_test.go                    |   3 +-
 pkg/fdnotifier/BUILD                               |   1 +
 pkg/fdnotifier/fdnotifier.go                       |   2 +-
 pkg/flipcall/BUILD                                 |   3 +-
 pkg/flipcall/flipcall_example_test.go              |   3 +-
 pkg/flipcall/flipcall_test.go                      |   3 +-
 pkg/flipcall/flipcall_unsafe.go                    |  10 +-
 pkg/gate/BUILD                                     |   1 +
 pkg/gate/gate_test.go                              |   2 +-
 pkg/linewriter/BUILD                               |   1 +
 pkg/linewriter/linewriter.go                       |   3 +-
 pkg/log/BUILD                                      |   5 +-
 pkg/log/log.go                                     |   2 +-
 pkg/metric/BUILD                                   |   1 +
 pkg/metric/metric.go                               |   2 +-
 pkg/p9/BUILD                                       |   1 +
 pkg/p9/client.go                                   |   2 +-
 pkg/p9/p9test/BUILD                                |   2 +
 pkg/p9/p9test/client_test.go                       |   2 +-
 pkg/p9/p9test/p9test.go                            |   2 +-
 pkg/p9/path_tree.go                                |   3 +-
 pkg/p9/pool.go                                     |   2 +-
 pkg/p9/server.go                                   |   2 +-
 pkg/p9/transport.go                                |   2 +-
 pkg/procid/BUILD                                   |   2 +
 pkg/procid/procid_test.go                          |   3 +-
 pkg/rand/BUILD                                     |   5 +-
 pkg/rand/rand_linux.go                             |   2 +-
 pkg/refs/BUILD                                     |   2 +
 pkg/refs/refcounter.go                             |   2 +-
 pkg/refs/refcounter_test.go                        |   3 +-
 pkg/sentry/arch/BUILD                              |   1 +
 pkg/sentry/arch/arch_x86.go                        |   2 +-
 pkg/sentry/control/BUILD                           |   1 +
 pkg/sentry/control/pprof.go                        |   2 +-
 pkg/sentry/device/BUILD                            |   5 +-
 pkg/sentry/device/device.go                        |   2 +-
 pkg/sentry/fs/BUILD                                |   3 +-
 pkg/sentry/fs/copy_up.go                           |   2 +-
 pkg/sentry/fs/copy_up_test.go                      |   2 +-
 pkg/sentry/fs/dirent.go                            |   2 +-
 pkg/sentry/fs/dirent_cache.go                      |   3 +-
 pkg/sentry/fs/dirent_cache_limiter.go              |   3 +-
 pkg/sentry/fs/fdpipe/BUILD                         |   1 +
 pkg/sentry/fs/fdpipe/pipe.go                       |   2 +-
 pkg/sentry/fs/fdpipe/pipe_state.go                 |   2 +-
 pkg/sentry/fs/file.go                              |   2 +-
 pkg/sentry/fs/file_overlay.go                      |   2 +-
 pkg/sentry/fs/filesystems.go                       |   2 +-
 pkg/sentry/fs/fs.go                                |   3 +-
 pkg/sentry/fs/fsutil/BUILD                         |   1 +
 pkg/sentry/fs/fsutil/host_file_mapper.go           |   2 +-
 pkg/sentry/fs/fsutil/host_mappable.go              |   2 +-
 pkg/sentry/fs/fsutil/inode.go                      |   3 +-
 pkg/sentry/fs/fsutil/inode_cached.go               |   2 +-
 pkg/sentry/fs/gofer/BUILD                          |   1 +
 pkg/sentry/fs/gofer/inode.go                       |   2 +-
 pkg/sentry/fs/gofer/session.go                     |   2 +-
 pkg/sentry/fs/host/BUILD                           |   1 +
 pkg/sentry/fs/host/inode.go                        |   2 +-
 pkg/sentry/fs/host/socket.go                       |   2 +-
 pkg/sentry/fs/host/tty.go                          |   3 +-
 pkg/sentry/fs/inode.go                             |   3 +-
 pkg/sentry/fs/inode_inotify.go                     |   3 +-
 pkg/sentry/fs/inotify.go                           |   2 +-
 pkg/sentry/fs/inotify_watch.go                     |   2 +-
 pkg/sentry/fs/lock/BUILD                           |   1 +
 pkg/sentry/fs/lock/lock.go                         |   2 +-
 pkg/sentry/fs/mounts.go                            |   2 +-
 pkg/sentry/fs/overlay.go                           |   5 +-
 pkg/sentry/fs/proc/BUILD                           |   1 +
 pkg/sentry/fs/proc/seqfile/BUILD                   |   1 +
 pkg/sentry/fs/proc/seqfile/seqfile.go              |   2 +-
 pkg/sentry/fs/proc/sys_net.go                      |   2 +-
 pkg/sentry/fs/ramfs/BUILD                          |   1 +
 pkg/sentry/fs/ramfs/dir.go                         |   2 +-
 pkg/sentry/fs/restore.go                           |   2 +-
 pkg/sentry/fs/tmpfs/BUILD                          |   1 +
 pkg/sentry/fs/tmpfs/inode_file.go                  |   2 +-
 pkg/sentry/fs/tty/BUILD                            |   1 +
 pkg/sentry/fs/tty/dir.go                           |   2 +-
 pkg/sentry/fs/tty/line_discipline.go               |   2 +-
 pkg/sentry/fs/tty/queue.go                         |   3 +-
 pkg/sentry/fsimpl/ext/BUILD                        |   1 +
 pkg/sentry/fsimpl/ext/directory.go                 |   3 +-
 pkg/sentry/fsimpl/ext/filesystem.go                |   2 +-
 pkg/sentry/fsimpl/ext/regular_file.go              |   2 +-
 pkg/sentry/fsimpl/kernfs/BUILD                     |   2 +
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go        |   2 +-
 pkg/sentry/fsimpl/kernfs/kernfs.go                 |   2 +-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go            |   2 +-
 pkg/sentry/fsimpl/tmpfs/BUILD                      |   1 +
 pkg/sentry/fsimpl/tmpfs/regular_file.go            |   2 +-
 pkg/sentry/fsimpl/tmpfs/tmpfs.go                   |   2 +-
 pkg/sentry/kernel/BUILD                            |   5 +-
 pkg/sentry/kernel/abstract_socket_namespace.go     |   2 +-
 pkg/sentry/kernel/auth/BUILD                       |   3 +-
 pkg/sentry/kernel/auth/user_namespace.go           |   2 +-
 pkg/sentry/kernel/epoll/BUILD                      |   1 +
 pkg/sentry/kernel/epoll/epoll.go                   |   2 +-
 pkg/sentry/kernel/eventfd/BUILD                    |   1 +
 pkg/sentry/kernel/eventfd/eventfd.go               |   2 +-
 pkg/sentry/kernel/fasync/BUILD                     |   1 +
 pkg/sentry/kernel/fasync/fasync.go                 |   3 +-
 pkg/sentry/kernel/fd_table.go                      |   2 +-
 pkg/sentry/kernel/fd_table_test.go                 |   2 +-
 pkg/sentry/kernel/fs_context.go                    |   2 +-
 pkg/sentry/kernel/futex/BUILD                      |   8 +-
 pkg/sentry/kernel/futex/futex.go                   |   3 +-
 pkg/sentry/kernel/futex/futex_test.go              |   2 +-
 pkg/sentry/kernel/kernel.go                        |   2 +-
 pkg/sentry/kernel/memevent/BUILD                   |   1 +
 pkg/sentry/kernel/memevent/memory_events.go        |   2 +-
 pkg/sentry/kernel/pipe/BUILD                       |   1 +
 pkg/sentry/kernel/pipe/buffer.go                   |   2 +-
 pkg/sentry/kernel/pipe/node.go                     |   3 +-
 pkg/sentry/kernel/pipe/pipe.go                     |   2 +-
 pkg/sentry/kernel/pipe/pipe_util.go                |   2 +-
 pkg/sentry/kernel/pipe/vfs.go                      |   3 +-
 pkg/sentry/kernel/semaphore/BUILD                  |   1 +
 pkg/sentry/kernel/semaphore/semaphore.go           |   2 +-
 pkg/sentry/kernel/shm/BUILD                        |   1 +
 pkg/sentry/kernel/shm/shm.go                       |   2 +-
 pkg/sentry/kernel/signal_handlers.go               |   3 +-
 pkg/sentry/kernel/signalfd/BUILD                   |   1 +
 pkg/sentry/kernel/signalfd/signalfd.go             |   3 +-
 pkg/sentry/kernel/syscalls.go                      |   2 +-
 pkg/sentry/kernel/syslog.go                        |   3 +-
 pkg/sentry/kernel/task.go                          |   5 +-
 pkg/sentry/kernel/thread_group.go                  |   2 +-
 pkg/sentry/kernel/threads.go                       |   2 +-
 pkg/sentry/kernel/time/BUILD                       |   1 +
 pkg/sentry/kernel/time/time.go                     |   2 +-
 pkg/sentry/kernel/timekeeper.go                    |   2 +-
 pkg/sentry/kernel/tty.go                           |   2 +-
 pkg/sentry/kernel/uts_namespace.go                 |   3 +-
 pkg/sentry/limits/BUILD                            |   1 +
 pkg/sentry/limits/limits.go                        |   3 +-
 pkg/sentry/mm/BUILD                                |   2 +-
 pkg/sentry/mm/aio_context.go                       |   3 +-
 pkg/sentry/mm/mm.go                                |   8 +-
 pkg/sentry/pgalloc/BUILD                           |   1 +
 pkg/sentry/pgalloc/pgalloc.go                      |   2 +-
 pkg/sentry/platform/interrupt/BUILD                |   1 +
 pkg/sentry/platform/interrupt/interrupt.go         |   3 +-
 pkg/sentry/platform/kvm/BUILD                      |   1 +
 pkg/sentry/platform/kvm/address_space.go           |   2 +-
 pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go   |   2 -
 pkg/sentry/platform/kvm/kvm.go                     |   2 +-
 pkg/sentry/platform/kvm/machine.go                 |   2 +-
 pkg/sentry/platform/ptrace/BUILD                   |   1 +
 pkg/sentry/platform/ptrace/ptrace.go               |   2 +-
 pkg/sentry/platform/ptrace/subprocess.go           |   2 +-
 .../platform/ptrace/subprocess_linux_unsafe.go     |   2 +-
 pkg/sentry/platform/ring0/defs.go                  |   2 +-
 pkg/sentry/platform/ring0/defs_amd64.go            |   1 +
 pkg/sentry/platform/ring0/defs_arm64.go            |   1 +
 pkg/sentry/platform/ring0/pagetables/BUILD         |   5 +-
 pkg/sentry/platform/ring0/pagetables/pcids_x86.go  |   2 +-
 pkg/sentry/socket/netlink/BUILD                    |   1 +
 pkg/sentry/socket/netlink/port/BUILD               |   1 +
 pkg/sentry/socket/netlink/port/port.go             |   3 +-
 pkg/sentry/socket/netlink/socket.go                |   2 +-
 pkg/sentry/socket/netstack/BUILD                   |   1 +
 pkg/sentry/socket/netstack/netstack.go             |   2 +-
 pkg/sentry/socket/rpcinet/conn/BUILD               |   1 +
 pkg/sentry/socket/rpcinet/conn/conn.go             |   2 +-
 pkg/sentry/socket/rpcinet/notifier/BUILD           |   1 +
 pkg/sentry/socket/rpcinet/notifier/notifier.go     |   2 +-
 pkg/sentry/socket/unix/transport/BUILD             |   1 +
 pkg/sentry/socket/unix/transport/connectioned.go   |   3 +-
 pkg/sentry/socket/unix/transport/queue.go          |   3 +-
 pkg/sentry/socket/unix/transport/unix.go           |   2 +-
 pkg/sentry/syscalls/linux/BUILD                    |   1 +
 pkg/sentry/syscalls/linux/error.go                 |   2 +-
 pkg/sentry/time/BUILD                              |   4 +-
 pkg/sentry/time/calibrated_clock.go                |   2 +-
 pkg/sentry/usage/BUILD                             |   1 +
 pkg/sentry/usage/memory.go                         |   2 +-
 pkg/sentry/vfs/BUILD                               |   3 +-
 pkg/sentry/vfs/dentry.go                           |   2 +-
 pkg/sentry/vfs/file_description_impl_util.go       |   2 +-
 pkg/sentry/vfs/mount_test.go                       |   3 +-
 pkg/sentry/vfs/mount_unsafe.go                     |   4 +-
 pkg/sentry/vfs/pathname.go                         |   3 +-
 pkg/sentry/vfs/resolving_path.go                   |   2 +-
 pkg/sentry/vfs/vfs.go                              |   2 +-
 pkg/sentry/watchdog/BUILD                          |   1 +
 pkg/sentry/watchdog/watchdog.go                    |   2 +-
 pkg/sync/BUILD                                     |  53 +++++++
 pkg/sync/LICENSE                                   |  27 ++++
 pkg/sync/README.md                                 |   5 +
 pkg/sync/aliases.go                                |  37 +++++
 pkg/sync/atomicptr_unsafe.go                       |  47 +++++++
 pkg/sync/atomicptrtest/BUILD                       |  29 ++++
 pkg/sync/atomicptrtest/atomicptr_test.go           |  31 +++++
 pkg/sync/downgradable_rwmutex_test.go              | 150 ++++++++++++++++++++
 pkg/sync/downgradable_rwmutex_unsafe.go            | 146 ++++++++++++++++++++
 pkg/sync/memmove_unsafe.go                         |  28 ++++
 pkg/sync/norace_unsafe.go                          |  35 +++++
 pkg/sync/race_unsafe.go                            |  41 ++++++
 pkg/sync/seqatomic_unsafe.go                       |  72 ++++++++++
 pkg/sync/seqatomictest/BUILD                       |  33 +++++
 pkg/sync/seqatomictest/seqatomic_test.go           | 132 ++++++++++++++++++
 pkg/sync/seqcount.go                               | 149 ++++++++++++++++++++
 pkg/sync/seqcount_test.go                          | 153 +++++++++++++++++++++
 pkg/sync/syncutil.go                               |   7 +
 pkg/syncutil/BUILD                                 |  52 -------
 pkg/syncutil/LICENSE                               |  27 ----
 pkg/syncutil/README.md                             |   5 -
 pkg/syncutil/atomicptr_unsafe.go                   |  47 -------
 pkg/syncutil/atomicptrtest/BUILD                   |  29 ----
 pkg/syncutil/atomicptrtest/atomicptr_test.go       |  31 -----
 pkg/syncutil/downgradable_rwmutex_test.go          | 150 --------------------
 pkg/syncutil/downgradable_rwmutex_unsafe.go        | 146 --------------------
 pkg/syncutil/memmove_unsafe.go                     |  28 ----
 pkg/syncutil/norace_unsafe.go                      |  35 -----
 pkg/syncutil/race_unsafe.go                        |  41 ------
 pkg/syncutil/seqatomic_unsafe.go                   |  72 ----------
 pkg/syncutil/seqatomictest/BUILD                   |  35 -----
 pkg/syncutil/seqatomictest/seqatomic_test.go       | 132 ------------------
 pkg/syncutil/seqcount.go                           | 149 --------------------
 pkg/syncutil/seqcount_test.go                      | 153 ---------------------
 pkg/syncutil/syncutil.go                           |   7 -
 pkg/tcpip/BUILD                                    |   1 +
 pkg/tcpip/adapters/gonet/BUILD                     |   1 +
 pkg/tcpip/adapters/gonet/gonet.go                  |   2 +-
 pkg/tcpip/link/fdbased/BUILD                       |   1 +
 pkg/tcpip/link/fdbased/endpoint.go                 |   2 +-
 pkg/tcpip/link/sharedmem/BUILD                     |   2 +
 pkg/tcpip/link/sharedmem/pipe/BUILD                |   1 +
 pkg/tcpip/link/sharedmem/pipe/pipe_test.go         |   3 +-
 pkg/tcpip/link/sharedmem/sharedmem.go              |   2 +-
 pkg/tcpip/link/sharedmem/sharedmem_test.go         |   2 +-
 pkg/tcpip/network/fragmentation/BUILD              |   1 +
 pkg/tcpip/network/fragmentation/fragmentation.go   |   2 +-
 pkg/tcpip/network/fragmentation/reassembler.go     |   2 +-
 pkg/tcpip/ports/BUILD                              |   1 +
 pkg/tcpip/ports/ports.go                           |   2 +-
 pkg/tcpip/stack/BUILD                              |   2 +
 pkg/tcpip/stack/linkaddrcache.go                   |   2 +-
 pkg/tcpip/stack/linkaddrcache_test.go              |   2 +-
 pkg/tcpip/stack/nic.go                             |   2 +-
 pkg/tcpip/stack/stack.go                           |   2 +-
 pkg/tcpip/stack/transport_demuxer.go               |   2 +-
 pkg/tcpip/tcpip.go                                 |   2 +-
 pkg/tcpip/transport/icmp/BUILD                     |   1 +
 pkg/tcpip/transport/icmp/endpoint.go               |   3 +-
 pkg/tcpip/transport/packet/BUILD                   |   1 +
 pkg/tcpip/transport/packet/endpoint.go             |   3 +-
 pkg/tcpip/transport/raw/BUILD                      |   1 +
 pkg/tcpip/transport/raw/endpoint.go                |   3 +-
 pkg/tcpip/transport/tcp/BUILD                      |   1 +
 pkg/tcpip/transport/tcp/accept.go                  |   2 +-
 pkg/tcpip/transport/tcp/connect.go                 |   2 +-
 pkg/tcpip/transport/tcp/endpoint.go                |   2 +-
 pkg/tcpip/transport/tcp/endpoint_state.go          |   2 +-
 pkg/tcpip/transport/tcp/forwarder.go               |   3 +-
 pkg/tcpip/transport/tcp/protocol.go                |   2 +-
 pkg/tcpip/transport/tcp/segment_queue.go           |   2 +-
 pkg/tcpip/transport/tcp/snd.go                     |   2 +-
 pkg/tcpip/transport/udp/BUILD                      |   1 +
 pkg/tcpip/transport/udp/endpoint.go                |   3 +-
 pkg/tmutex/BUILD                                   |   1 +
 pkg/tmutex/tmutex_test.go                          |   3 +-
 pkg/unet/BUILD                                     |   1 +
 pkg/unet/unet_test.go                              |   3 +-
 pkg/urpc/BUILD                                     |   1 +
 pkg/urpc/urpc.go                                   |   2 +-
 pkg/waiter/BUILD                                   |   1 +
 pkg/waiter/waiter.go                               |   2 +-
 runsc/boot/BUILD                                   |   2 +
 runsc/boot/compat.go                               |   2 +-
 runsc/boot/limits.go                               |   2 +-
 runsc/boot/loader.go                               |   2 +-
 runsc/boot/loader_test.go                          |   2 +-
 runsc/cmd/BUILD                                    |   1 +
 runsc/cmd/create.go                                |   1 +
 runsc/cmd/gofer.go                                 |   2 +-
 runsc/cmd/start.go                                 |   1 +
 runsc/container/BUILD                              |   2 +
 runsc/container/console_test.go                    |   2 +-
 runsc/container/container_test.go                  |   2 +-
 runsc/container/multi_container_test.go            |   2 +-
 runsc/container/state_file.go                      |   2 +-
 runsc/fsgofer/BUILD                                |   1 +
 runsc/fsgofer/fsgofer.go                           |   2 +-
 runsc/sandbox/BUILD                                |   1 +
 runsc/sandbox/sandbox.go                           |   2 +-
 runsc/testutil/BUILD                               |   1 +
 runsc/testutil/testutil.go                         |   2 +-
 303 files changed, 1507 insertions(+), 1368 deletions(-)
 create mode 100644 pkg/sync/BUILD
 create mode 100644 pkg/sync/LICENSE
 create mode 100644 pkg/sync/README.md
 create mode 100644 pkg/sync/aliases.go
 create mode 100644 pkg/sync/atomicptr_unsafe.go
 create mode 100644 pkg/sync/atomicptrtest/BUILD
 create mode 100644 pkg/sync/atomicptrtest/atomicptr_test.go
 create mode 100644 pkg/sync/downgradable_rwmutex_test.go
 create mode 100644 pkg/sync/downgradable_rwmutex_unsafe.go
 create mode 100644 pkg/sync/memmove_unsafe.go
 create mode 100644 pkg/sync/norace_unsafe.go
 create mode 100644 pkg/sync/race_unsafe.go
 create mode 100644 pkg/sync/seqatomic_unsafe.go
 create mode 100644 pkg/sync/seqatomictest/BUILD
 create mode 100644 pkg/sync/seqatomictest/seqatomic_test.go
 create mode 100644 pkg/sync/seqcount.go
 create mode 100644 pkg/sync/seqcount_test.go
 create mode 100644 pkg/sync/syncutil.go
 delete mode 100644 pkg/syncutil/BUILD
 delete mode 100644 pkg/syncutil/LICENSE
 delete mode 100644 pkg/syncutil/README.md
 delete mode 100644 pkg/syncutil/atomicptr_unsafe.go
 delete mode 100644 pkg/syncutil/atomicptrtest/BUILD
 delete mode 100644 pkg/syncutil/atomicptrtest/atomicptr_test.go
 delete mode 100644 pkg/syncutil/downgradable_rwmutex_test.go
 delete mode 100644 pkg/syncutil/downgradable_rwmutex_unsafe.go
 delete mode 100644 pkg/syncutil/memmove_unsafe.go
 delete mode 100644 pkg/syncutil/norace_unsafe.go
 delete mode 100644 pkg/syncutil/race_unsafe.go
 delete mode 100644 pkg/syncutil/seqatomic_unsafe.go
 delete mode 100644 pkg/syncutil/seqatomictest/BUILD
 delete mode 100644 pkg/syncutil/seqatomictest/seqatomic_test.go
 delete mode 100644 pkg/syncutil/seqcount.go
 delete mode 100644 pkg/syncutil/seqcount_test.go
 delete mode 100644 pkg/syncutil/syncutil.go

diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD
index 6bc486b62..d99e37b40 100644
--- a/pkg/amutex/BUILD
+++ b/pkg/amutex/BUILD
@@ -15,4 +15,5 @@ go_test(
     size = "small",
     srcs = ["amutex_test.go"],
     embed = [":amutex"],
+    deps = ["//pkg/sync"],
 )
diff --git a/pkg/amutex/amutex_test.go b/pkg/amutex/amutex_test.go
index 1d7f45641..8a3952f2a 100644
--- a/pkg/amutex/amutex_test.go
+++ b/pkg/amutex/amutex_test.go
@@ -15,9 +15,10 @@
 package amutex
 
 import (
-	"sync"
 	"testing"
 	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 type sleeper struct {
diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD
index 36beaade9..6403c60c2 100644
--- a/pkg/atomicbitops/BUILD
+++ b/pkg/atomicbitops/BUILD
@@ -20,4 +20,5 @@ go_test(
     size = "small",
     srcs = ["atomic_bitops_test.go"],
     embed = [":atomicbitops"],
+    deps = ["//pkg/sync"],
 )
diff --git a/pkg/atomicbitops/atomic_bitops_test.go b/pkg/atomicbitops/atomic_bitops_test.go
index 965e9be79..9466d3e23 100644
--- a/pkg/atomicbitops/atomic_bitops_test.go
+++ b/pkg/atomicbitops/atomic_bitops_test.go
@@ -16,8 +16,9 @@ package atomicbitops
 
 import (
 	"runtime"
-	"sync"
 	"testing"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 const iterations = 100
diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD
index a0b21d4bd..2bb581b18 100644
--- a/pkg/compressio/BUILD
+++ b/pkg/compressio/BUILD
@@ -8,7 +8,10 @@ go_library(
     srcs = ["compressio.go"],
     importpath = "gvisor.dev/gvisor/pkg/compressio",
     visibility = ["//:sandbox"],
-    deps = ["//pkg/binary"],
+    deps = [
+        "//pkg/binary",
+        "//pkg/sync",
+    ],
 )
 
 go_test(
diff --git a/pkg/compressio/compressio.go b/pkg/compressio/compressio.go
index 3b0bb086e..5f52cbe74 100644
--- a/pkg/compressio/compressio.go
+++ b/pkg/compressio/compressio.go
@@ -52,9 +52,9 @@ import (
 	"hash"
 	"io"
 	"runtime"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 var bufPool = sync.Pool{
diff --git a/pkg/control/server/BUILD b/pkg/control/server/BUILD
index 21adf3adf..adbd1e3f8 100644
--- a/pkg/control/server/BUILD
+++ b/pkg/control/server/BUILD
@@ -9,6 +9,7 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
+        "//pkg/sync",
         "//pkg/unet",
         "//pkg/urpc",
     ],
diff --git a/pkg/control/server/server.go b/pkg/control/server/server.go
index a56152d10..41abe1f2d 100644
--- a/pkg/control/server/server.go
+++ b/pkg/control/server/server.go
@@ -22,9 +22,9 @@ package server
 
 import (
 	"os"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/pkg/urpc"
 )
diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD
index 0b4b7cc44..9d68682c7 100644
--- a/pkg/eventchannel/BUILD
+++ b/pkg/eventchannel/BUILD
@@ -15,6 +15,7 @@ go_library(
     deps = [
         ":eventchannel_go_proto",
         "//pkg/log",
+        "//pkg/sync",
         "//pkg/unet",
         "@com_github_golang_protobuf//proto:go_default_library",
         "@com_github_golang_protobuf//ptypes:go_default_library_gen",
@@ -40,6 +41,7 @@ go_test(
     srcs = ["event_test.go"],
     embed = [":eventchannel"],
     deps = [
+        "//pkg/sync",
         "@com_github_golang_protobuf//proto:go_default_library",
     ],
 )
diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go
index d37ad0428..9a29c58bd 100644
--- a/pkg/eventchannel/event.go
+++ b/pkg/eventchannel/event.go
@@ -22,13 +22,13 @@ package eventchannel
 import (
 	"encoding/binary"
 	"fmt"
-	"sync"
 	"syscall"
 
 	"github.com/golang/protobuf/proto"
 	"github.com/golang/protobuf/ptypes"
 	pb "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 )
 
diff --git a/pkg/eventchannel/event_test.go b/pkg/eventchannel/event_test.go
index 3649097d6..7f41b4a27 100644
--- a/pkg/eventchannel/event_test.go
+++ b/pkg/eventchannel/event_test.go
@@ -16,11 +16,11 @@ package eventchannel
 
 import (
 	"fmt"
-	"sync"
 	"testing"
 	"time"
 
 	"github.com/golang/protobuf/proto"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // testEmitter is an emitter that can be used in tests. It records all events
diff --git a/pkg/fdchannel/BUILD b/pkg/fdchannel/BUILD
index 56495cbd9..b0478c672 100644
--- a/pkg/fdchannel/BUILD
+++ b/pkg/fdchannel/BUILD
@@ -15,4 +15,5 @@ go_test(
     size = "small",
     srcs = ["fdchannel_test.go"],
     embed = [":fdchannel"],
+    deps = ["//pkg/sync"],
 )
diff --git a/pkg/fdchannel/fdchannel_test.go b/pkg/fdchannel/fdchannel_test.go
index 5d01dc636..7a8a63a59 100644
--- a/pkg/fdchannel/fdchannel_test.go
+++ b/pkg/fdchannel/fdchannel_test.go
@@ -17,10 +17,11 @@ package fdchannel
 import (
 	"io/ioutil"
 	"os"
-	"sync"
 	"syscall"
 	"testing"
 	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 func TestSendRecvFD(t *testing.T) {
diff --git a/pkg/fdnotifier/BUILD b/pkg/fdnotifier/BUILD
index aca2d8a82..91a202a30 100644
--- a/pkg/fdnotifier/BUILD
+++ b/pkg/fdnotifier/BUILD
@@ -11,6 +11,7 @@ go_library(
     importpath = "gvisor.dev/gvisor/pkg/fdnotifier",
     visibility = ["//:sandbox"],
     deps = [
+        "//pkg/sync",
         "//pkg/waiter",
         "@org_golang_x_sys//unix:go_default_library",
     ],
diff --git a/pkg/fdnotifier/fdnotifier.go b/pkg/fdnotifier/fdnotifier.go
index f4aae1953..a6b63c982 100644
--- a/pkg/fdnotifier/fdnotifier.go
+++ b/pkg/fdnotifier/fdnotifier.go
@@ -22,10 +22,10 @@ package fdnotifier
 
 import (
 	"fmt"
-	"sync"
 	"syscall"
 
 	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD
index e590a71ba..85bd83af1 100644
--- a/pkg/flipcall/BUILD
+++ b/pkg/flipcall/BUILD
@@ -19,7 +19,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/log",
         "//pkg/memutil",
-        "//pkg/syncutil",
+        "//pkg/sync",
     ],
 )
 
@@ -31,4 +31,5 @@ go_test(
         "flipcall_test.go",
     ],
     embed = [":flipcall"],
+    deps = ["//pkg/sync"],
 )
diff --git a/pkg/flipcall/flipcall_example_test.go b/pkg/flipcall/flipcall_example_test.go
index 8d88b845d..2e28a149a 100644
--- a/pkg/flipcall/flipcall_example_test.go
+++ b/pkg/flipcall/flipcall_example_test.go
@@ -17,7 +17,8 @@ package flipcall
 import (
 	"bytes"
 	"fmt"
-	"sync"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 func Example() {
diff --git a/pkg/flipcall/flipcall_test.go b/pkg/flipcall/flipcall_test.go
index 168a487ec..33fd55a44 100644
--- a/pkg/flipcall/flipcall_test.go
+++ b/pkg/flipcall/flipcall_test.go
@@ -16,9 +16,10 @@ package flipcall
 
 import (
 	"runtime"
-	"sync"
 	"testing"
 	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 var testPacketWindowSize = pageSize
diff --git a/pkg/flipcall/flipcall_unsafe.go b/pkg/flipcall/flipcall_unsafe.go
index 27b8939fc..ac974b232 100644
--- a/pkg/flipcall/flipcall_unsafe.go
+++ b/pkg/flipcall/flipcall_unsafe.go
@@ -18,7 +18,7 @@ import (
 	"reflect"
 	"unsafe"
 
-	"gvisor.dev/gvisor/pkg/syncutil"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Packets consist of a 16-byte header followed by an arbitrarily-sized
@@ -75,13 +75,13 @@ func (ep *Endpoint) Data() []byte {
 var ioSync int64
 
 func raceBecomeActive() {
-	if syncutil.RaceEnabled {
-		syncutil.RaceAcquire((unsafe.Pointer)(&ioSync))
+	if sync.RaceEnabled {
+		sync.RaceAcquire((unsafe.Pointer)(&ioSync))
 	}
 }
 
 func raceBecomeInactive() {
-	if syncutil.RaceEnabled {
-		syncutil.RaceReleaseMerge((unsafe.Pointer)(&ioSync))
+	if sync.RaceEnabled {
+		sync.RaceReleaseMerge((unsafe.Pointer)(&ioSync))
 	}
 }
diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD
index 4b9321711..f22bd070d 100644
--- a/pkg/gate/BUILD
+++ b/pkg/gate/BUILD
@@ -19,5 +19,6 @@ go_test(
     ],
     deps = [
         ":gate",
+        "//pkg/sync",
     ],
 )
diff --git a/pkg/gate/gate_test.go b/pkg/gate/gate_test.go
index 5dbd8d712..850693df8 100644
--- a/pkg/gate/gate_test.go
+++ b/pkg/gate/gate_test.go
@@ -15,11 +15,11 @@
 package gate_test
 
 import (
-	"sync"
 	"testing"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/gate"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 func TestBasicEnter(t *testing.T) {
diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD
index a5d980d14..bcde6d308 100644
--- a/pkg/linewriter/BUILD
+++ b/pkg/linewriter/BUILD
@@ -8,6 +8,7 @@ go_library(
     srcs = ["linewriter.go"],
     importpath = "gvisor.dev/gvisor/pkg/linewriter",
     visibility = ["//visibility:public"],
+    deps = ["//pkg/sync"],
 )
 
 go_test(
diff --git a/pkg/linewriter/linewriter.go b/pkg/linewriter/linewriter.go
index cd6e4e2ce..a1b1285d4 100644
--- a/pkg/linewriter/linewriter.go
+++ b/pkg/linewriter/linewriter.go
@@ -17,7 +17,8 @@ package linewriter
 
 import (
 	"bytes"
-	"sync"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Writer is an io.Writer which buffers input, flushing
diff --git a/pkg/log/BUILD b/pkg/log/BUILD
index fc5f5779b..0df0f2849 100644
--- a/pkg/log/BUILD
+++ b/pkg/log/BUILD
@@ -16,7 +16,10 @@ go_library(
     visibility = [
         "//visibility:public",
     ],
-    deps = ["//pkg/linewriter"],
+    deps = [
+        "//pkg/linewriter",
+        "//pkg/sync",
+    ],
 )
 
 go_test(
diff --git a/pkg/log/log.go b/pkg/log/log.go
index 9387586e6..91a81b288 100644
--- a/pkg/log/log.go
+++ b/pkg/log/log.go
@@ -25,12 +25,12 @@ import (
 	stdlog "log"
 	"os"
 	"runtime"
-	"sync"
 	"sync/atomic"
 	"syscall"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/linewriter"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Level is the log level.
diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD
index dd6ca6d39..9145f3233 100644
--- a/pkg/metric/BUILD
+++ b/pkg/metric/BUILD
@@ -14,6 +14,7 @@ go_library(
         ":metric_go_proto",
         "//pkg/eventchannel",
         "//pkg/log",
+        "//pkg/sync",
     ],
 )
 
diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go
index eadde06e4..93d4f2b8c 100644
--- a/pkg/metric/metric.go
+++ b/pkg/metric/metric.go
@@ -18,12 +18,12 @@ package metric
 import (
 	"errors"
 	"fmt"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/eventchannel"
 	"gvisor.dev/gvisor/pkg/log"
 	pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 var (
diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD
index f32244c69..a3e05c96d 100644
--- a/pkg/p9/BUILD
+++ b/pkg/p9/BUILD
@@ -29,6 +29,7 @@ go_library(
         "//pkg/fdchannel",
         "//pkg/flipcall",
         "//pkg/log",
+        "//pkg/sync",
         "//pkg/unet",
         "@org_golang_x_sys//unix:go_default_library",
     ],
diff --git a/pkg/p9/client.go b/pkg/p9/client.go
index 221516c6c..4045e41fa 100644
--- a/pkg/p9/client.go
+++ b/pkg/p9/client.go
@@ -17,12 +17,12 @@ package p9
 import (
 	"errors"
 	"fmt"
-	"sync"
 	"syscall"
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/flipcall"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 )
 
diff --git a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD
index 28707c0ca..f4edd68b2 100644
--- a/pkg/p9/p9test/BUILD
+++ b/pkg/p9/p9test/BUILD
@@ -70,6 +70,7 @@ go_library(
         "//pkg/fd",
         "//pkg/log",
         "//pkg/p9",
+        "//pkg/sync",
         "//pkg/unet",
         "@com_github_golang_mock//gomock:go_default_library",
     ],
@@ -83,6 +84,7 @@ go_test(
     deps = [
         "//pkg/fd",
         "//pkg/p9",
+        "//pkg/sync",
         "@com_github_golang_mock//gomock:go_default_library",
     ],
 )
diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go
index 6e758148d..6e7bb3db2 100644
--- a/pkg/p9/p9test/client_test.go
+++ b/pkg/p9/p9test/client_test.go
@@ -22,7 +22,6 @@ import (
 	"os"
 	"reflect"
 	"strings"
-	"sync"
 	"syscall"
 	"testing"
 	"time"
@@ -30,6 +29,7 @@ import (
 	"github.com/golang/mock/gomock"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 func TestPanic(t *testing.T) {
diff --git a/pkg/p9/p9test/p9test.go b/pkg/p9/p9test/p9test.go
index 4d3271b37..dd8b01b6d 100644
--- a/pkg/p9/p9test/p9test.go
+++ b/pkg/p9/p9test/p9test.go
@@ -17,13 +17,13 @@ package p9test
 
 import (
 	"fmt"
-	"sync"
 	"sync/atomic"
 	"syscall"
 	"testing"
 
 	"github.com/golang/mock/gomock"
 	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 )
 
diff --git a/pkg/p9/path_tree.go b/pkg/p9/path_tree.go
index 865459411..72ef53313 100644
--- a/pkg/p9/path_tree.go
+++ b/pkg/p9/path_tree.go
@@ -16,7 +16,8 @@ package p9
 
 import (
 	"fmt"
-	"sync"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // pathNode is a single node in a path traversal.
diff --git a/pkg/p9/pool.go b/pkg/p9/pool.go
index 52de889e1..2b14a5ce3 100644
--- a/pkg/p9/pool.go
+++ b/pkg/p9/pool.go
@@ -15,7 +15,7 @@
 package p9
 
 import (
-	"sync"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // pool is a simple allocator.
diff --git a/pkg/p9/server.go b/pkg/p9/server.go
index 40b8fa023..fdfa83648 100644
--- a/pkg/p9/server.go
+++ b/pkg/p9/server.go
@@ -17,7 +17,6 @@ package p9
 import (
 	"io"
 	"runtime/debug"
-	"sync"
 	"sync/atomic"
 	"syscall"
 
@@ -25,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/fdchannel"
 	"gvisor.dev/gvisor/pkg/flipcall"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 )
 
diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go
index 6e8b4bbcd..9c11e28ce 100644
--- a/pkg/p9/transport.go
+++ b/pkg/p9/transport.go
@@ -19,11 +19,11 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 )
 
diff --git a/pkg/procid/BUILD b/pkg/procid/BUILD
index 078f084b2..b506813f0 100644
--- a/pkg/procid/BUILD
+++ b/pkg/procid/BUILD
@@ -21,6 +21,7 @@ go_test(
         "procid_test.go",
     ],
     embed = [":procid"],
+    deps = ["//pkg/sync"],
 )
 
 go_test(
@@ -31,4 +32,5 @@ go_test(
         "procid_test.go",
     ],
     embed = [":procid"],
+    deps = ["//pkg/sync"],
 )
diff --git a/pkg/procid/procid_test.go b/pkg/procid/procid_test.go
index 88dd0b3ae..9ec08c3d6 100644
--- a/pkg/procid/procid_test.go
+++ b/pkg/procid/procid_test.go
@@ -17,9 +17,10 @@ package procid
 import (
 	"os"
 	"runtime"
-	"sync"
 	"syscall"
 	"testing"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // runOnMain is used to send functions to run on the main (initial) thread.
diff --git a/pkg/rand/BUILD b/pkg/rand/BUILD
index f4f2001f3..9d5b4859b 100644
--- a/pkg/rand/BUILD
+++ b/pkg/rand/BUILD
@@ -10,5 +10,8 @@ go_library(
     ],
     importpath = "gvisor.dev/gvisor/pkg/rand",
     visibility = ["//:sandbox"],
-    deps = ["@org_golang_x_sys//unix:go_default_library"],
+    deps = [
+        "//pkg/sync",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
 )
diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go
index 2b92db3e6..0bdad5fad 100644
--- a/pkg/rand/rand_linux.go
+++ b/pkg/rand/rand_linux.go
@@ -19,9 +19,9 @@ package rand
 import (
 	"crypto/rand"
 	"io"
-	"sync"
 
 	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // reader implements an io.Reader that returns pseudorandom bytes.
diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD
index 7ad59dfd7..974d9af9b 100644
--- a/pkg/refs/BUILD
+++ b/pkg/refs/BUILD
@@ -27,6 +27,7 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
+        "//pkg/sync",
     ],
 )
 
@@ -35,4 +36,5 @@ go_test(
     size = "small",
     srcs = ["refcounter_test.go"],
     embed = [":refs"],
+    deps = ["//pkg/sync"],
 )
diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go
index ad69e0757..c45ba8200 100644
--- a/pkg/refs/refcounter.go
+++ b/pkg/refs/refcounter.go
@@ -21,10 +21,10 @@ import (
 	"fmt"
 	"reflect"
 	"runtime"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // RefCounter is the interface to be implemented by objects that are reference
diff --git a/pkg/refs/refcounter_test.go b/pkg/refs/refcounter_test.go
index ffd3d3f07..1ab4a4440 100644
--- a/pkg/refs/refcounter_test.go
+++ b/pkg/refs/refcounter_test.go
@@ -16,8 +16,9 @@ package refs
 
 import (
 	"reflect"
-	"sync"
 	"testing"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 type testCounter struct {
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index 18c73cc24..ae3e364cd 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -32,6 +32,7 @@ go_library(
         "//pkg/sentry/context",
         "//pkg/sentry/limits",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index 9294ac773..9f41e566f 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -19,7 +19,6 @@ package arch
 import (
 	"fmt"
 	"io"
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/binary"
@@ -27,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index 5522cecd0..2561a6109 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -30,6 +30,7 @@ go_library(
         "//pkg/sentry/strace",
         "//pkg/sentry/usage",
         "//pkg/sentry/watchdog",
+        "//pkg/sync",
         "//pkg/tcpip/link/sniffer",
         "//pkg/urpc",
     ],
diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go
index e1f2fea60..151808911 100644
--- a/pkg/sentry/control/pprof.go
+++ b/pkg/sentry/control/pprof.go
@@ -19,10 +19,10 @@ import (
 	"runtime"
 	"runtime/pprof"
 	"runtime/trace"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD
index 1098ed777..97fa1512c 100644
--- a/pkg/sentry/device/BUILD
+++ b/pkg/sentry/device/BUILD
@@ -8,7 +8,10 @@ go_library(
     srcs = ["device.go"],
     importpath = "gvisor.dev/gvisor/pkg/sentry/device",
     visibility = ["//pkg/sentry:internal"],
-    deps = ["//pkg/abi/linux"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sync",
+    ],
 )
 
 go_test(
diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go
index 47945d1a7..69e71e322 100644
--- a/pkg/sentry/device/device.go
+++ b/pkg/sentry/device/device.go
@@ -19,10 +19,10 @@ package device
 import (
 	"bytes"
 	"fmt"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Registry tracks all simple devices and related state on the system for
diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD
index c035ffff7..7d5d72d5a 100644
--- a/pkg/sentry/fs/BUILD
+++ b/pkg/sentry/fs/BUILD
@@ -68,7 +68,7 @@ go_library(
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
         "//pkg/state",
-        "//pkg/syncutil",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
@@ -115,6 +115,7 @@ go_test(
         "//pkg/sentry/fs/tmpfs",
         "//pkg/sentry/kernel/contexttest",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index 9ac62c84d..734177e90 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -17,12 +17,12 @@ package fs
 import (
 	"fmt"
 	"io"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go
index 1d80bf15a..738580c5f 100644
--- a/pkg/sentry/fs/copy_up_test.go
+++ b/pkg/sentry/fs/copy_up_test.go
@@ -19,13 +19,13 @@ import (
 	"crypto/rand"
 	"fmt"
 	"io"
-	"sync"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 const (
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index 3cb73bd78..31fc4d87b 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -18,7 +18,6 @@ import (
 	"fmt"
 	"path"
 	"sort"
-	"sync"
 	"sync/atomic"
 	"syscall"
 
@@ -28,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go
index 60a15a275..25514ace4 100644
--- a/pkg/sentry/fs/dirent_cache.go
+++ b/pkg/sentry/fs/dirent_cache.go
@@ -16,7 +16,8 @@ package fs
 
 import (
 	"fmt"
-	"sync"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // DirentCache is an LRU cache of Dirents. The Dirent's refCount is
diff --git a/pkg/sentry/fs/dirent_cache_limiter.go b/pkg/sentry/fs/dirent_cache_limiter.go
index ebb80bd50..525ee25f9 100644
--- a/pkg/sentry/fs/dirent_cache_limiter.go
+++ b/pkg/sentry/fs/dirent_cache_limiter.go
@@ -16,7 +16,8 @@ package fs
 
 import (
 	"fmt"
-	"sync"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // DirentCacheLimiter acts as a global limit for all dirent caches in the
diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD
index 277ee4c31..cc43de69d 100644
--- a/pkg/sentry/fs/fdpipe/BUILD
+++ b/pkg/sentry/fs/fdpipe/BUILD
@@ -23,6 +23,7 @@ go_library(
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/safemem",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go
index 669ffcb75..5b6cfeb0a 100644
--- a/pkg/sentry/fs/fdpipe/pipe.go
+++ b/pkg/sentry/fs/fdpipe/pipe.go
@@ -17,7 +17,6 @@ package fdpipe
 
 import (
 	"os"
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/fd"
@@ -29,6 +28,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go
index 29175fb3d..cee87f726 100644
--- a/pkg/sentry/fs/fdpipe/pipe_state.go
+++ b/pkg/sentry/fs/fdpipe/pipe_state.go
@@ -17,10 +17,10 @@ package fdpipe
 import (
 	"fmt"
 	"io/ioutil"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // beforeSave is invoked by stateify.
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index a2f966cb6..7c4586296 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -16,7 +16,6 @@ package fs
 
 import (
 	"math"
-	"sync"
 	"sync/atomic"
 	"time"
 
@@ -29,6 +28,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go
index 225e40186..8a633b1ba 100644
--- a/pkg/sentry/fs/file_overlay.go
+++ b/pkg/sentry/fs/file_overlay.go
@@ -16,13 +16,13 @@ package fs
 
 import (
 	"io"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go
index b157fd228..c5b51620a 100644
--- a/pkg/sentry/fs/filesystems.go
+++ b/pkg/sentry/fs/filesystems.go
@@ -18,9 +18,9 @@ import (
 	"fmt"
 	"sort"
 	"strings"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // FilesystemFlags matches include/linux/fs.h:file_system_type.fs_flags.
diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go
index 8b2a5e6b2..26abf49e2 100644
--- a/pkg/sentry/fs/fs.go
+++ b/pkg/sentry/fs/fs.go
@@ -54,10 +54,9 @@
 package fs
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 var (
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index 9ca695a95..945b6270d 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -93,6 +93,7 @@ go_library(
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
         "//pkg/state",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go
index b06a71cc2..837fc70b5 100644
--- a/pkg/sentry/fs/fsutil/host_file_mapper.go
+++ b/pkg/sentry/fs/fsutil/host_file_mapper.go
@@ -16,7 +16,6 @@ package fsutil
 
 import (
 	"fmt"
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/log"
@@ -24,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // HostFileMapper caches mappings of an arbitrary host file descriptor. It is
diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go
index 30475f340..a625f0e26 100644
--- a/pkg/sentry/fs/fsutil/host_mappable.go
+++ b/pkg/sentry/fs/fsutil/host_mappable.go
@@ -16,7 +16,6 @@ package fsutil
 
 import (
 	"math"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -24,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // HostMappable implements memmap.Mappable and platform.File over a
diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go
index 4e100a402..adf5ec69c 100644
--- a/pkg/sentry/fs/fsutil/inode.go
+++ b/pkg/sentry/fs/fsutil/inode.go
@@ -15,13 +15,12 @@
 package fsutil
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index 798920d18..20a014402 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -17,7 +17,6 @@ package fsutil
 import (
 	"fmt"
 	"io"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -30,6 +29,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Lock order (compare the lock order model in mm/mm.go):
diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD
index 4a005c605..fd870e8e1 100644
--- a/pkg/sentry/fs/gofer/BUILD
+++ b/pkg/sentry/fs/gofer/BUILD
@@ -44,6 +44,7 @@ go_library(
         "//pkg/sentry/safemem",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/unet",
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index 91263ebdc..245fe2ef1 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -16,7 +16,6 @@ package gofer
 
 import (
 	"errors"
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -31,6 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index 4e358a46a..edc796ce0 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -16,7 +16,6 @@ package gofer
 
 import (
 	"fmt"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/refs"
@@ -25,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 )
 
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index 23daeb528..2b581aa69 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -50,6 +50,7 @@ go_library(
         "//pkg/sentry/unimpl",
         "//pkg/sentry/uniqueid",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip",
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index a6e4a09e3..873a1c52d 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -15,7 +15,6 @@
 package host
 
 import (
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -28,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
index 107336a3e..c076d5bdd 100644
--- a/pkg/sentry/fs/host/socket.go
+++ b/pkg/sentry/fs/host/socket.go
@@ -16,7 +16,6 @@ package host
 
 import (
 	"fmt"
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -30,6 +29,7 @@ import (
 	unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index 90331e3b2..753ef8cd6 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -15,8 +15,6 @@
 package host
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -24,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index 91e2fde2f..468043df0 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -15,8 +15,6 @@
 package fs
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/metric"
@@ -26,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go
index 0f2a66a79..efd3c962b 100644
--- a/pkg/sentry/fs/inode_inotify.go
+++ b/pkg/sentry/fs/inode_inotify.go
@@ -16,7 +16,8 @@ package fs
 
 import (
 	"fmt"
-	"sync"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Watches is the collection of inotify watches on an inode.
diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go
index ba3e0233d..cc7dd1c92 100644
--- a/pkg/sentry/fs/inotify.go
+++ b/pkg/sentry/fs/inotify.go
@@ -16,7 +16,6 @@ package fs
 
 import (
 	"io"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -25,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go
index 0aa0a5e9b..900cba3ca 100644
--- a/pkg/sentry/fs/inotify_watch.go
+++ b/pkg/sentry/fs/inotify_watch.go
@@ -15,10 +15,10 @@
 package fs
 
 import (
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Watch represent a particular inotify watch created by inotify_add_watch.
diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD
index 8d62642e7..2c332a82a 100644
--- a/pkg/sentry/fs/lock/BUILD
+++ b/pkg/sentry/fs/lock/BUILD
@@ -44,6 +44,7 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/log",
+        "//pkg/sync",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go
index 636484424..41b040818 100644
--- a/pkg/sentry/fs/lock/lock.go
+++ b/pkg/sentry/fs/lock/lock.go
@@ -52,9 +52,9 @@ package lock
 import (
 	"fmt"
 	"math"
-	"sync"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index ac0398bd9..db3dfd096 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -19,7 +19,6 @@ import (
 	"math"
 	"path"
 	"strings"
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -27,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go
index 25573e986..4cad55327 100644
--- a/pkg/sentry/fs/overlay.go
+++ b/pkg/sentry/fs/overlay.go
@@ -17,13 +17,12 @@ package fs
 import (
 	"fmt"
 	"strings"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/syncutil"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -199,7 +198,7 @@ type overlayEntry struct {
 	upper *Inode
 
 	// dirCacheMu protects dirCache.
-	dirCacheMu syncutil.DowngradableRWMutex `state:"nosave"`
+	dirCacheMu sync.DowngradableRWMutex `state:"nosave"`
 
 	// dirCache is cache of DentAttrs from upper and lower Inodes.
 	dirCache *SortedDentryMap
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index 75cbb0622..94d46ab1b 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -51,6 +51,7 @@ go_library(
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/tcpip/header",
         "//pkg/waiter",
diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD
index fe7067be1..38b246dff 100644
--- a/pkg/sentry/fs/proc/seqfile/BUILD
+++ b/pkg/sentry/fs/proc/seqfile/BUILD
@@ -16,6 +16,7 @@ go_library(
         "//pkg/sentry/fs/proc/device",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go
index 5fe823000..f9af191d5 100644
--- a/pkg/sentry/fs/proc/seqfile/seqfile.go
+++ b/pkg/sentry/fs/proc/seqfile/seqfile.go
@@ -17,7 +17,6 @@ package seqfile
 
 import (
 	"io"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -26,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index bd93f83fa..a37e1fa06 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -17,7 +17,6 @@ package proc
 import (
 	"fmt"
 	"io"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -27,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD
index 012cb3e44..3fb7b0633 100644
--- a/pkg/sentry/fs/ramfs/BUILD
+++ b/pkg/sentry/fs/ramfs/BUILD
@@ -21,6 +21,7 @@ go_library(
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go
index 78e082b8e..dcbb8eb2e 100644
--- a/pkg/sentry/fs/ramfs/dir.go
+++ b/pkg/sentry/fs/ramfs/dir.go
@@ -17,7 +17,6 @@ package ramfs
 
 import (
 	"fmt"
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -25,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fs/restore.go b/pkg/sentry/fs/restore.go
index f10168125..64c6a6ae9 100644
--- a/pkg/sentry/fs/restore.go
+++ b/pkg/sentry/fs/restore.go
@@ -15,7 +15,7 @@
 package fs
 
 import (
-	"sync"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // RestoreEnvironment is the restore environment for file systems. It consists
diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD
index 59ce400c2..3400b940c 100644
--- a/pkg/sentry/fs/tmpfs/BUILD
+++ b/pkg/sentry/fs/tmpfs/BUILD
@@ -31,6 +31,7 @@ go_library(
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index f86dfaa36..f1c87fe41 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -17,7 +17,6 @@ package tmpfs
 import (
 	"fmt"
 	"io"
-	"sync"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -31,6 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index 95ad98cb0..f6f60d0cf 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -30,6 +30,7 @@ go_library(
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/unimpl",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go
index 2f639c823..88aa66b24 100644
--- a/pkg/sentry/fs/tty/dir.go
+++ b/pkg/sentry/fs/tty/dir.go
@@ -19,7 +19,6 @@ import (
 	"fmt"
 	"math"
 	"strconv"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -28,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go
index 7cc0eb409..894964260 100644
--- a/pkg/sentry/fs/tty/line_discipline.go
+++ b/pkg/sentry/fs/tty/line_discipline.go
@@ -16,13 +16,13 @@ package tty
 
 import (
 	"bytes"
-	"sync"
 	"unicode/utf8"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go
index 231e4e6eb..8b5d4699a 100644
--- a/pkg/sentry/fs/tty/queue.go
+++ b/pkg/sentry/fs/tty/queue.go
@@ -15,13 +15,12 @@
 package tty
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index bc90330bc..903874141 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -50,6 +50,7 @@ go_library(
         "//pkg/sentry/syscalls/linux",
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index 91802dc1e..8944171c8 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -15,8 +15,6 @@
 package ext
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/log"
@@ -25,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 616fc002a..9afb1a84c 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -17,13 +17,13 @@ package ext
 import (
 	"errors"
 	"io"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index aec33e00a..d11153c90 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -16,7 +16,6 @@ package ext
 
 import (
 	"io"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -24,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 39c03ee9d..809178250 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -39,6 +39,7 @@ go_library(
         "//pkg/sentry/memmap",
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
+        "//pkg/sync",
         "//pkg/syserror",
     ],
 )
@@ -56,6 +57,7 @@ go_test(
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
+        "//pkg/sync",
         "//pkg/syserror",
         "@com_github_google_go-cmp//cmp:go_default_library",
     ],
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 752e0f659..1d469a0db 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -16,7 +16,6 @@ package kernfs
 
 import (
 	"fmt"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -24,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index d69b299ae..bb12f39a2 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -53,7 +53,6 @@ package kernfs
 
 import (
 	"fmt"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -61,6 +60,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // FilesystemType implements vfs.FilesystemType.
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 4b6b95f5f..5c9d580e1 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -19,7 +19,6 @@ import (
 	"fmt"
 	"io"
 	"runtime"
-	"sync"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -31,6 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index a5b285987..82f5c2f41 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -47,6 +47,7 @@ go_library(
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
+        "//pkg/sync",
         "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index f51e247a7..f200e767d 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -17,7 +17,6 @@ package tmpfs
 import (
 	"io"
 	"math"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -30,6 +29,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 7be6faa5b..701826f90 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -26,7 +26,6 @@ package tmpfs
 import (
 	"fmt"
 	"math"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -34,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 2706927ff..ac85ba0c8 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -35,7 +35,7 @@ go_template_instance(
     out = "seqatomic_taskgoroutineschedinfo_unsafe.go",
     package = "kernel",
     suffix = "TaskGoroutineSchedInfo",
-    template = "//pkg/syncutil:generic_seqatomic",
+    template = "//pkg/sync:generic_seqatomic",
     types = {
         "Value": "TaskGoroutineSchedInfo",
     },
@@ -209,7 +209,7 @@ go_library(
         "//pkg/sentry/usermem",
         "//pkg/state",
         "//pkg/state/statefile",
-        "//pkg/syncutil",
+        "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip",
@@ -241,6 +241,7 @@ go_test(
         "//pkg/sentry/time",
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go
index 244655b5c..920fe4329 100644
--- a/pkg/sentry/kernel/abstract_socket_namespace.go
+++ b/pkg/sentry/kernel/abstract_socket_namespace.go
@@ -15,11 +15,11 @@
 package kernel
 
 import (
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // +stateify savable
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 04c244447..1aa72fa47 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -8,7 +8,7 @@ go_template_instance(
     out = "atomicptr_credentials_unsafe.go",
     package = "auth",
     suffix = "Credentials",
-    template = "//pkg/syncutil:generic_atomicptr",
+    template = "//pkg/sync:generic_atomicptr",
     types = {
         "Value": "Credentials",
     },
@@ -64,6 +64,7 @@ go_library(
         "//pkg/bits",
         "//pkg/log",
         "//pkg/sentry/context",
+        "//pkg/sync",
         "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go
index af28ccc65..9dd52c860 100644
--- a/pkg/sentry/kernel/auth/user_namespace.go
+++ b/pkg/sentry/kernel/auth/user_namespace.go
@@ -16,8 +16,8 @@ package auth
 
 import (
 	"math"
-	"sync"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD
index 3361e8b7d..c47f6b6fc 100644
--- a/pkg/sentry/kernel/epoll/BUILD
+++ b/pkg/sentry/kernel/epoll/BUILD
@@ -32,6 +32,7 @@ go_library(
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 9c0a4e1b4..430311cc0 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -18,7 +18,6 @@ package epoll
 
 import (
 	"fmt"
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/refs"
@@ -27,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD
index e65b961e8..c831fbab2 100644
--- a/pkg/sentry/kernel/eventfd/BUILD
+++ b/pkg/sentry/kernel/eventfd/BUILD
@@ -16,6 +16,7 @@ go_library(
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go
index 12f0d429b..687690679 100644
--- a/pkg/sentry/kernel/eventfd/eventfd.go
+++ b/pkg/sentry/kernel/eventfd/eventfd.go
@@ -18,7 +18,6 @@ package eventfd
 
 import (
 	"math"
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -28,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD
index 49d81b712..6b36bc63e 100644
--- a/pkg/sentry/kernel/fasync/BUILD
+++ b/pkg/sentry/kernel/fasync/BUILD
@@ -12,6 +12,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sync",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go
index 6b0bb0324..d32c3e90a 100644
--- a/pkg/sentry/kernel/fasync/fasync.go
+++ b/pkg/sentry/kernel/fasync/fasync.go
@@ -16,12 +16,11 @@
 package fasync
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 11f613a11..cd1501f85 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -18,7 +18,6 @@ import (
 	"bytes"
 	"fmt"
 	"math"
-	"sync"
 	"sync/atomic"
 	"syscall"
 
@@ -28,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // FDFlags define flags for an individual descriptor.
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index 2bcb6216a..eccb7d1e7 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -16,7 +16,6 @@ package kernel
 
 import (
 	"runtime"
-	"sync"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -24,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/filetest"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 const (
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index ded27d668..2448c1d99 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -16,10 +16,10 @@ package kernel
 
 import (
 	"fmt"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // FSContext contains filesystem context.
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index 75ec31761..50db443ce 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -9,7 +9,7 @@ go_template_instance(
     out = "atomicptr_bucket_unsafe.go",
     package = "futex",
     suffix = "Bucket",
-    template = "//pkg/syncutil:generic_atomicptr",
+    template = "//pkg/sync:generic_atomicptr",
     types = {
         "Value": "bucket",
     },
@@ -42,6 +42,7 @@ go_library(
         "//pkg/sentry/context",
         "//pkg/sentry/memmap",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
     ],
 )
@@ -51,5 +52,8 @@ go_test(
     size = "small",
     srcs = ["futex_test.go"],
     embed = [":futex"],
-    deps = ["//pkg/sentry/usermem"],
+    deps = [
+        "//pkg/sentry/usermem",
+        "//pkg/sync",
+    ],
 )
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
index 278cc8143..d1931c8f4 100644
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -18,11 +18,10 @@
 package futex
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go
index 65e5d1428..c23126ca5 100644
--- a/pkg/sentry/kernel/futex/futex_test.go
+++ b/pkg/sentry/kernel/futex/futex_test.go
@@ -17,13 +17,13 @@ package futex
 import (
 	"math"
 	"runtime"
-	"sync"
 	"sync/atomic"
 	"syscall"
 	"testing"
 	"unsafe"
 
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // testData implements the Target interface, and allows us to
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 8653d2f63..c85e97fef 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -36,7 +36,6 @@ import (
 	"fmt"
 	"io"
 	"path/filepath"
-	"sync"
 	"sync/atomic"
 	"time"
 
@@ -67,6 +66,7 @@ import (
 	uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
 	"gvisor.dev/gvisor/pkg/state"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD
index d7a7d1169..7f36252a9 100644
--- a/pkg/sentry/kernel/memevent/BUILD
+++ b/pkg/sentry/kernel/memevent/BUILD
@@ -16,6 +16,7 @@ go_library(
         "//pkg/metric",
         "//pkg/sentry/kernel",
         "//pkg/sentry/usage",
+        "//pkg/sync",
     ],
 )
 
diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go
index b0d98e7f0..200565bb8 100644
--- a/pkg/sentry/kernel/memevent/memory_events.go
+++ b/pkg/sentry/kernel/memevent/memory_events.go
@@ -17,7 +17,6 @@
 package memevent
 
 import (
-	"sync"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/eventchannel"
@@ -26,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	pb "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 var totalTicks = metric.MustCreateNewUint64Metric("/memory_events/ticks", false /*sync*/, "Total number of memory event periods that have elapsed since startup.")
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 9d34f6d4d..5eeaeff66 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -43,6 +43,7 @@ go_library(
         "//pkg/sentry/safemem",
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go
index 95bee2d37..1c0f34269 100644
--- a/pkg/sentry/kernel/pipe/buffer.go
+++ b/pkg/sentry/kernel/pipe/buffer.go
@@ -16,9 +16,9 @@ package pipe
 
 import (
 	"io"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // buffer encapsulates a queueable byte buffer.
diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go
index 4a19ab7ce..716f589af 100644
--- a/pkg/sentry/kernel/pipe/node.go
+++ b/pkg/sentry/kernel/pipe/node.go
@@ -15,12 +15,11 @@
 package pipe
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 1a1b38f83..e4fd7d420 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -17,12 +17,12 @@ package pipe
 
 import (
 	"fmt"
-	"sync"
 	"sync/atomic"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index ef9641e6a..8394eb78b 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -17,7 +17,6 @@ package pipe
 import (
 	"io"
 	"math"
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -25,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index 6416e0dd8..bf7461cbb 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -15,13 +15,12 @@
 package pipe
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD
index f4c00cd86..13a961594 100644
--- a/pkg/sentry/kernel/semaphore/BUILD
+++ b/pkg/sentry/kernel/semaphore/BUILD
@@ -31,6 +31,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
+        "//pkg/sync",
         "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index de9617e9d..18299814e 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -17,7 +17,6 @@ package semaphore
 
 import (
 	"fmt"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
@@ -25,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index cd48945e6..7321b22ed 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -24,6 +24,7 @@ go_library(
         "//pkg/sentry/platform",
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 19034a21e..8ddef7eb8 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -35,7 +35,6 @@ package shm
 
 import (
 	"fmt"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
@@ -49,6 +48,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go
index a16f3d57f..768fda220 100644
--- a/pkg/sentry/kernel/signal_handlers.go
+++ b/pkg/sentry/kernel/signal_handlers.go
@@ -15,10 +15,9 @@
 package kernel
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // SignalHandlers holds information about signal actions.
diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD
index 9f7e19b4d..89e4d84b1 100644
--- a/pkg/sentry/kernel/signalfd/BUILD
+++ b/pkg/sentry/kernel/signalfd/BUILD
@@ -16,6 +16,7 @@ go_library(
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/kernel",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go
index 4b08d7d72..28be4a939 100644
--- a/pkg/sentry/kernel/signalfd/signalfd.go
+++ b/pkg/sentry/kernel/signalfd/signalfd.go
@@ -16,8 +16,6 @@
 package signalfd
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -26,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 2fdee0282..d2d01add4 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -16,13 +16,13 @@ package kernel
 
 import (
 	"fmt"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // maxSyscallNum is the highest supported syscall number.
diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go
index 8227ecf1d..4607cde2f 100644
--- a/pkg/sentry/kernel/syslog.go
+++ b/pkg/sentry/kernel/syslog.go
@@ -17,7 +17,8 @@ package kernel
 import (
 	"fmt"
 	"math/rand"
-	"sync"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // syslog represents a sentry-global kernel log.
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index d25a7903b..978d66da8 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -17,7 +17,6 @@ package kernel
 import (
 	gocontext "context"
 	"runtime/trace"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -37,7 +36,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/syncutil"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -85,7 +84,7 @@ type Task struct {
 	//
 	// gosched is protected by goschedSeq. gosched is owned by the task
 	// goroutine.
-	goschedSeq syncutil.SeqCount `state:"nosave"`
+	goschedSeq sync.SeqCount `state:"nosave"`
 	gosched    TaskGoroutineSchedInfo
 
 	// yieldCount is the number of times the task goroutine has called
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index c0197a563..768e958d2 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -15,7 +15,6 @@
 package kernel
 
 import (
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -25,6 +24,7 @@ import (
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index 8267929a6..bf2dabb6e 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -16,9 +16,9 @@ package kernel
 
 import (
 	"fmt"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD
index 31847e1df..4e4de0512 100644
--- a/pkg/sentry/kernel/time/BUILD
+++ b/pkg/sentry/kernel/time/BUILD
@@ -13,6 +13,7 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/sentry/context",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index 107394183..706de83ef 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -19,10 +19,10 @@ package time
 import (
 	"fmt"
 	"math"
-	"sync"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go
index 76417342a..dc99301de 100644
--- a/pkg/sentry/kernel/timekeeper.go
+++ b/pkg/sentry/kernel/timekeeper.go
@@ -16,7 +16,6 @@ package kernel
 
 import (
 	"fmt"
-	"sync"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/log"
@@ -24,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Timekeeper manages all of the kernel clocks.
diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go
index 048de26dc..464d2306a 100644
--- a/pkg/sentry/kernel/tty.go
+++ b/pkg/sentry/kernel/tty.go
@@ -14,7 +14,7 @@
 
 package kernel
 
-import "sync"
+import "gvisor.dev/gvisor/pkg/sync"
 
 // TTY defines the relationship between a thread group and its controlling
 // terminal.
diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go
index 0a563e715..8ccf04bd1 100644
--- a/pkg/sentry/kernel/uts_namespace.go
+++ b/pkg/sentry/kernel/uts_namespace.go
@@ -15,9 +15,8 @@
 package kernel
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // UTSNamespace represents a UTS namespace, a holder of two system identifiers:
diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD
index 156e67bf8..9fa841e8b 100644
--- a/pkg/sentry/limits/BUILD
+++ b/pkg/sentry/limits/BUILD
@@ -15,6 +15,7 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/sentry/context",
+        "//pkg/sync",
     ],
 )
 
diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go
index b6c22656b..31b9e9ff6 100644
--- a/pkg/sentry/limits/limits.go
+++ b/pkg/sentry/limits/limits.go
@@ -16,8 +16,9 @@
 package limits
 
 import (
-	"sync"
 	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // LimitType defines a type of resource limit.
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index 839931f67..83e248431 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -118,7 +118,7 @@ go_library(
         "//pkg/sentry/safemem",
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
-        "//pkg/syncutil",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/tcpip/buffer",
     ],
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index 1b746d030..4b48866ad 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -15,8 +15,6 @@
 package mm
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -25,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 58a5c186d..fa86ebced 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -35,8 +35,6 @@
 package mm
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
@@ -44,7 +42,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/syncutil"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // MemoryManager implements a virtual address space.
@@ -82,7 +80,7 @@ type MemoryManager struct {
 	users int32
 
 	// mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
-	mappingMu syncutil.DowngradableRWMutex `state:"nosave"`
+	mappingMu sync.DowngradableRWMutex `state:"nosave"`
 
 	// vmas stores virtual memory areas. Since vmas are stored by value,
 	// clients should usually use vmaIterator.ValuePtr() instead of
@@ -125,7 +123,7 @@ type MemoryManager struct {
 
 	// activeMu is loosely analogous to Linux's struct
 	// mm_struct::page_table_lock.
-	activeMu syncutil.DowngradableRWMutex `state:"nosave"`
+	activeMu sync.DowngradableRWMutex `state:"nosave"`
 
 	// pmas stores platform mapping areas used to implement vmas. Since pmas
 	// are stored by value, clients should usually use pmaIterator.ValuePtr()
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index f404107af..a9a2642c5 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -73,6 +73,7 @@ go_library(
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
         "//pkg/state",
+        "//pkg/sync",
         "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index f7f7298c4..c99e023d9 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -25,7 +25,6 @@ import (
 	"fmt"
 	"math"
 	"os"
-	"sync"
 	"sync/atomic"
 	"syscall"
 	"time"
@@ -37,6 +36,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD
index b6d008dbe..85e882df9 100644
--- a/pkg/sentry/platform/interrupt/BUILD
+++ b/pkg/sentry/platform/interrupt/BUILD
@@ -10,6 +10,7 @@ go_library(
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/platform/interrupt",
     visibility = ["//pkg/sentry:internal"],
+    deps = ["//pkg/sync"],
 )
 
 go_test(
diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go
index a4651f500..57be41647 100644
--- a/pkg/sentry/platform/interrupt/interrupt.go
+++ b/pkg/sentry/platform/interrupt/interrupt.go
@@ -17,7 +17,8 @@ package interrupt
 
 import (
 	"fmt"
-	"sync"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Receiver receives interrupt notifications from a Forwarder.
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index f3afd98da..6a358d1d4 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -55,6 +55,7 @@ go_library(
         "//pkg/sentry/platform/safecopy",
         "//pkg/sentry/time",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
     ],
 )
 
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index ea8b9632e..a25f3c449 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -15,13 +15,13 @@
 package kvm
 
 import (
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/atomicbitops"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // dirtySet tracks vCPUs for invalidation.
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
index e5fac0d6a..2f02c03cf 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -17,8 +17,6 @@
 package kvm
 
 import (
-	"unsafe"
-
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index f2c2c059e..a7850faed 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -18,13 +18,13 @@ package kvm
 import (
 	"fmt"
 	"os"
-	"sync"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // KVM represents a lightweight VM context.
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 7d02ebf19..e6d912168 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -17,7 +17,6 @@ package kvm
 import (
 	"fmt"
 	"runtime"
-	"sync"
 	"sync/atomic"
 	"syscall"
 
@@ -27,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // machine contains state associated with the VM as a whole.
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index 0df8cfa0f..cd13390c3 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -33,6 +33,7 @@ go_library(
         "//pkg/sentry/platform/interrupt",
         "//pkg/sentry/platform/safecopy",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
index 7b120a15d..bb0e03880 100644
--- a/pkg/sentry/platform/ptrace/ptrace.go
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -46,13 +46,13 @@ package ptrace
 
 import (
 	"os"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/interrupt"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 var (
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 20244fd95..15dc46a5b 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -18,7 +18,6 @@ import (
 	"fmt"
 	"os"
 	"runtime"
-	"sync"
 	"syscall"
 
 	"golang.org/x/sys/unix"
@@ -27,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Linux kernel errnos which "should never be seen by user programs", but will
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
index 2e6fbe488..245b20722 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
@@ -18,7 +18,6 @@
 package ptrace
 
 import (
-	"sync"
 	"sync/atomic"
 	"syscall"
 	"unsafe"
@@ -26,6 +25,7 @@ import (
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // maskPool contains reusable CPU masks for setting affinity. Unfortunately,
diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go
index 3f094c2a7..86fd5ed58 100644
--- a/pkg/sentry/platform/ring0/defs.go
+++ b/pkg/sentry/platform/ring0/defs.go
@@ -17,7 +17,7 @@ package ring0
 import (
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
 )
 
 // Kernel is a global kernel object.
diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go
index 10dbd381f..9dae0dccb 100644
--- a/pkg/sentry/platform/ring0/defs_amd64.go
+++ b/pkg/sentry/platform/ring0/defs_amd64.go
@@ -18,6 +18,7 @@ package ring0
 
 import (
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
 var (
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go
index dc0eeec01..a850ce6cf 100644
--- a/pkg/sentry/platform/ring0/defs_arm64.go
+++ b/pkg/sentry/platform/ring0/defs_arm64.go
@@ -18,6 +18,7 @@ package ring0
 
 import (
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
 var (
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index e2e15ba5c..387a7f6c3 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -96,7 +96,10 @@ go_library(
         "//pkg/sentry/platform/kvm:__subpackages__",
         "//pkg/sentry/platform/ring0:__subpackages__",
     ],
-    deps = ["//pkg/sentry/usermem"],
+    deps = [
+        "//pkg/sentry/usermem",
+        "//pkg/sync",
+    ],
 )
 
 go_test(
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
index 0f029f25d..e199bae18 100644
--- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
@@ -17,7 +17,7 @@
 package pagetables
 
 import (
-	"sync"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // limitPCID is the number of valid PCIDs.
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 136821963..103933144 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -27,6 +27,7 @@ go_library(
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip",
diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD
index 463544c1a..2d9f4ba9b 100644
--- a/pkg/sentry/socket/netlink/port/BUILD
+++ b/pkg/sentry/socket/netlink/port/BUILD
@@ -8,6 +8,7 @@ go_library(
     srcs = ["port.go"],
     importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port",
     visibility = ["//pkg/sentry:internal"],
+    deps = ["//pkg/sync"],
 )
 
 go_test(
diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go
index e9d3275b1..2cd3afc22 100644
--- a/pkg/sentry/socket/netlink/port/port.go
+++ b/pkg/sentry/socket/netlink/port/port.go
@@ -24,7 +24,8 @@ import (
 	"fmt"
 	"math"
 	"math/rand"
-	"sync"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // maxPorts is a sanity limit on the maximum number of ports to allocate per
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index d2e3644a6..cea56f4ed 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -17,7 +17,6 @@ package netlink
 
 import (
 	"math"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
@@ -34,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index e414d8055..f78784569 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -34,6 +34,7 @@ go_library(
         "//pkg/sentry/socket/netfilter",
         "//pkg/sentry/unimpl",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip",
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 764f11a6b..0affb8071 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -29,7 +29,6 @@ import (
 	"io"
 	"math"
 	"reflect"
-	"sync"
 	"syscall"
 	"time"
 
@@ -49,6 +48,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
diff --git a/pkg/sentry/socket/rpcinet/conn/BUILD b/pkg/sentry/socket/rpcinet/conn/BUILD
index 23eadcb1b..b2677c659 100644
--- a/pkg/sentry/socket/rpcinet/conn/BUILD
+++ b/pkg/sentry/socket/rpcinet/conn/BUILD
@@ -10,6 +10,7 @@ go_library(
     deps = [
         "//pkg/binary",
         "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto",
+        "//pkg/sync",
         "//pkg/syserr",
         "//pkg/unet",
         "@com_github_golang_protobuf//proto:go_default_library",
diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go
index 356adad99..02f39c767 100644
--- a/pkg/sentry/socket/rpcinet/conn/conn.go
+++ b/pkg/sentry/socket/rpcinet/conn/conn.go
@@ -17,12 +17,12 @@ package conn
 
 import (
 	"fmt"
-	"sync"
 	"sync/atomic"
 	"syscall"
 
 	"github.com/golang/protobuf/proto"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/unet"
 
diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD
index a3585e10d..a5954f22b 100644
--- a/pkg/sentry/socket/rpcinet/notifier/BUILD
+++ b/pkg/sentry/socket/rpcinet/notifier/BUILD
@@ -10,6 +10,7 @@ go_library(
     deps = [
         "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto",
         "//pkg/sentry/socket/rpcinet/conn",
+        "//pkg/sync",
         "//pkg/waiter",
         "@org_golang_x_sys//unix:go_default_library",
     ],
diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go
index 7efe4301f..82b75d6dd 100644
--- a/pkg/sentry/socket/rpcinet/notifier/notifier.go
+++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go
@@ -17,12 +17,12 @@ package notifier
 
 import (
 	"fmt"
-	"sync"
 	"syscall"
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn"
 	pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index 788ad70d2..d7ba95dff 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -32,6 +32,7 @@ go_library(
         "//pkg/ilist",
         "//pkg/refs",
         "//pkg/sentry/context",
+        "//pkg/sync",
         "//pkg/syserr",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index dea11e253..9e6fbc111 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -15,10 +15,9 @@
 package transport
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/waiter"
diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go
index e27b1c714..5dcd3d95e 100644
--- a/pkg/sentry/socket/unix/transport/queue.go
+++ b/pkg/sentry/socket/unix/transport/queue.go
@@ -15,9 +15,8 @@
 package transport
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 37c7ac3c1..fcc0da332 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -16,11 +16,11 @@
 package transport
 
 import (
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index a76975cee..aa05e208a 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -91,6 +91,7 @@ go_library(
         "//pkg/sentry/syscalls",
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/waiter",
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index 1d9018c96..60469549d 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -16,13 +16,13 @@ package linux
 
 import (
 	"io"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD
index 18e212dff..3cde3a0be 100644
--- a/pkg/sentry/time/BUILD
+++ b/pkg/sentry/time/BUILD
@@ -9,7 +9,7 @@ go_template_instance(
     out = "seqatomic_parameters_unsafe.go",
     package = "time",
     suffix = "Parameters",
-    template = "//pkg/syncutil:generic_seqatomic",
+    template = "//pkg/sync:generic_seqatomic",
     types = {
         "Value": "Parameters",
     },
@@ -36,7 +36,7 @@ go_library(
     deps = [
         "//pkg/log",
         "//pkg/metric",
-        "//pkg/syncutil",
+        "//pkg/sync",
         "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go
index 318503277..f9a93115d 100644
--- a/pkg/sentry/time/calibrated_clock.go
+++ b/pkg/sentry/time/calibrated_clock.go
@@ -17,11 +17,11 @@
 package time
 
 import (
-	"sync"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/metric"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD
index c32fe3241..5518ac3d0 100644
--- a/pkg/sentry/usage/BUILD
+++ b/pkg/sentry/usage/BUILD
@@ -18,5 +18,6 @@ go_library(
     deps = [
         "//pkg/bits",
         "//pkg/memutil",
+        "//pkg/sync",
     ],
 )
diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go
index d6ef644d8..538c645eb 100644
--- a/pkg/sentry/usage/memory.go
+++ b/pkg/sentry/usage/memory.go
@@ -17,12 +17,12 @@ package usage
 import (
 	"fmt"
 	"os"
-	"sync"
 	"sync/atomic"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/memutil"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // MemoryKind represents a type of memory used by the application.
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 4c6aa04a1..35c7be259 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -34,7 +34,7 @@ go_library(
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
         "//pkg/sentry/usermem",
-        "//pkg/syncutil",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
@@ -54,6 +54,7 @@ go_test(
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/usermem",
+        "//pkg/sync",
         "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 1bc9c4a38..486a76475 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -16,9 +16,9 @@ package vfs
 
 import (
 	"fmt"
-	"sync"
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 66eb57bc2..c00b3c84b 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -17,13 +17,13 @@ package vfs
 import (
 	"bytes"
 	"io"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index adff0b94b..3b933468d 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -17,8 +17,9 @@ package vfs
 import (
 	"fmt"
 	"runtime"
-	"sync"
 	"testing"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 func TestMountTableLookupEmpty(t *testing.T) {
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index ab13fa461..bd90d36c4 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -26,7 +26,7 @@ import (
 	"sync/atomic"
 	"unsafe"
 
-	"gvisor.dev/gvisor/pkg/syncutil"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // mountKey represents the location at which a Mount is mounted. It is
@@ -75,7 +75,7 @@ type mountTable struct {
 	// intrinsics and inline assembly, limiting the performance of this
 	// approach.)
 
-	seq  syncutil.SeqCount
+	seq  sync.SeqCount
 	seed uint32 // for hashing keys
 
 	// size holds both length (number of elements) and capacity (number of
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
index 8e155654f..cf80df90e 100644
--- a/pkg/sentry/vfs/pathname.go
+++ b/pkg/sentry/vfs/pathname.go
@@ -15,10 +15,9 @@
 package vfs
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index f0641d314..8a0b382f6 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -16,11 +16,11 @@ package vfs
 
 import (
 	"fmt"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index ea2db7031..1f21b0b31 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -29,12 +29,12 @@ package vfs
 
 import (
 	"fmt"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/watchdog/BUILD b/pkg/sentry/watchdog/BUILD
index 4d8435265..28f21f13d 100644
--- a/pkg/sentry/watchdog/BUILD
+++ b/pkg/sentry/watchdog/BUILD
@@ -13,5 +13,6 @@ go_library(
         "//pkg/metric",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/time",
+        "//pkg/sync",
     ],
 )
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index 5e4611333..bfb2fac26 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -32,7 +32,6 @@ package watchdog
 import (
 	"bytes"
 	"fmt"
-	"sync"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -40,6 +39,7 @@ import (
 	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Opts configures the watchdog.
diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD
new file mode 100644
index 000000000..e8cd16b8f
--- /dev/null
+++ b/pkg/sync/BUILD
@@ -0,0 +1,53 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template")
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+exports_files(["LICENSE"])
+
+go_template(
+    name = "generic_atomicptr",
+    srcs = ["atomicptr_unsafe.go"],
+    types = [
+        "Value",
+    ],
+)
+
+go_template(
+    name = "generic_seqatomic",
+    srcs = ["seqatomic_unsafe.go"],
+    types = [
+        "Value",
+    ],
+    deps = [
+        ":sync",
+    ],
+)
+
+go_library(
+    name = "sync",
+    srcs = [
+        "aliases.go",
+        "downgradable_rwmutex_unsafe.go",
+        "memmove_unsafe.go",
+        "norace_unsafe.go",
+        "race_unsafe.go",
+        "seqcount.go",
+        "syncutil.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/sync",
+)
+
+go_test(
+    name = "sync_test",
+    size = "small",
+    srcs = [
+        "downgradable_rwmutex_test.go",
+        "seqcount_test.go",
+    ],
+    embed = [":sync"],
+)
diff --git a/pkg/sync/LICENSE b/pkg/sync/LICENSE
new file mode 100644
index 000000000..6a66aea5e
--- /dev/null
+++ b/pkg/sync/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/pkg/sync/README.md b/pkg/sync/README.md
new file mode 100644
index 000000000..2183c4e20
--- /dev/null
+++ b/pkg/sync/README.md
@@ -0,0 +1,5 @@
+# Syncutil
+
+This package provides additional synchronization primitives not provided by the
+Go stdlib 'sync' package. It is partially derived from the upstream 'sync'
+package from go1.10.
diff --git a/pkg/sync/aliases.go b/pkg/sync/aliases.go
new file mode 100644
index 000000000..20c7ca041
--- /dev/null
+++ b/pkg/sync/aliases.go
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync
+
+import (
+	"sync"
+)
+
+// Aliases of standard library types.
+type (
+	// Mutex is an alias of sync.Mutex.
+	Mutex = sync.Mutex
+
+	// RWMutex is an alias of sync.RWMutex.
+	RWMutex = sync.RWMutex
+
+	// Cond is an alias of sync.Cond.
+	Cond = sync.Cond
+
+	// Locker is an alias of sync.Locker.
+	Locker = sync.Locker
+
+	// Once is an alias of sync.Once.
+	Once = sync.Once
+
+	// Pool is an alias of sync.Pool.
+	Pool = sync.Pool
+
+	// WaitGroup is an alias of sync.WaitGroup.
+	WaitGroup = sync.WaitGroup
+
+	// Map is an alias of sync.Map.
+	Map = sync.Map
+)
diff --git a/pkg/sync/atomicptr_unsafe.go b/pkg/sync/atomicptr_unsafe.go
new file mode 100644
index 000000000..525c4beed
--- /dev/null
+++ b/pkg/sync/atomicptr_unsafe.go
@@ -0,0 +1,47 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package template doesn't exist. This file must be instantiated using the
+// go_template_instance rule in tools/go_generics/defs.bzl.
+package template
+
+import (
+	"sync/atomic"
+	"unsafe"
+)
+
+// Value is a required type parameter.
+type Value struct{}
+
+// An AtomicPtr is a pointer to a value of type Value that can be atomically
+// loaded and stored. The zero value of an AtomicPtr represents nil.
+//
+// Note that copying AtomicPtr by value performs a non-atomic read of the
+// stored pointer, which is unsafe if Store() can be called concurrently; in
+// this case, do `dst.Store(src.Load())` instead.
+//
+// +stateify savable
+type AtomicPtr struct {
+	ptr unsafe.Pointer `state:".(*Value)"`
+}
+
+func (p *AtomicPtr) savePtr() *Value {
+	return p.Load()
+}
+
+func (p *AtomicPtr) loadPtr(v *Value) {
+	p.Store(v)
+}
+
+// Load returns the value set by the most recent Store. It returns nil if there
+// has been no previous call to Store.
+func (p *AtomicPtr) Load() *Value {
+	return (*Value)(atomic.LoadPointer(&p.ptr))
+}
+
+// Store sets the value returned by Load to x.
+func (p *AtomicPtr) Store(x *Value) {
+	atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x))
+}
diff --git a/pkg/sync/atomicptrtest/BUILD b/pkg/sync/atomicptrtest/BUILD
new file mode 100644
index 000000000..418eda29c
--- /dev/null
+++ b/pkg/sync/atomicptrtest/BUILD
@@ -0,0 +1,29 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "atomicptr_int",
+    out = "atomicptr_int_unsafe.go",
+    package = "atomicptr",
+    suffix = "Int",
+    template = "//pkg/sync:generic_atomicptr",
+    types = {
+        "Value": "int",
+    },
+)
+
+go_library(
+    name = "atomicptr",
+    srcs = ["atomicptr_int_unsafe.go"],
+    importpath = "gvisor.dev/gvisor/pkg/sync/atomicptr",
+)
+
+go_test(
+    name = "atomicptr_test",
+    size = "small",
+    srcs = ["atomicptr_test.go"],
+    embed = [":atomicptr"],
+)
diff --git a/pkg/sync/atomicptrtest/atomicptr_test.go b/pkg/sync/atomicptrtest/atomicptr_test.go
new file mode 100644
index 000000000..8fdc5112e
--- /dev/null
+++ b/pkg/sync/atomicptrtest/atomicptr_test.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomicptr
+
+import (
+	"testing"
+)
+
+func newInt(val int) *int {
+	return &val
+}
+
+func TestAtomicPtr(t *testing.T) {
+	var p AtomicPtrInt
+	if got := p.Load(); got != nil {
+		t.Errorf("initial value is %p (%v), wanted nil", got, got)
+	}
+	want := newInt(42)
+	p.Store(want)
+	if got := p.Load(); got != want {
+		t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want)
+	}
+	want = newInt(100)
+	p.Store(want)
+	if got := p.Load(); got != want {
+		t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want)
+	}
+}
diff --git a/pkg/sync/downgradable_rwmutex_test.go b/pkg/sync/downgradable_rwmutex_test.go
new file mode 100644
index 000000000..f04496bc5
--- /dev/null
+++ b/pkg/sync/downgradable_rwmutex_test.go
@@ -0,0 +1,150 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright 2019 The gVisor Authors.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GOMAXPROCS=10 go test
+
+// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the
+// addition of downgradingWriter and the renaming of num_iterations to
+// numIterations to shut up Golint.
+
+package sync
+
+import (
+	"fmt"
+	"runtime"
+	"sync/atomic"
+	"testing"
+)
+
+func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) {
+	m.RLock()
+	clocked <- true
+	<-cunlock
+	m.RUnlock()
+	cdone <- true
+}
+
+func doTestParallelReaders(numReaders, gomaxprocs int) {
+	runtime.GOMAXPROCS(gomaxprocs)
+	var m DowngradableRWMutex
+	clocked := make(chan bool)
+	cunlock := make(chan bool)
+	cdone := make(chan bool)
+	for i := 0; i < numReaders; i++ {
+		go parallelReader(&m, clocked, cunlock, cdone)
+	}
+	// Wait for all parallel RLock()s to succeed.
+	for i := 0; i < numReaders; i++ {
+		<-clocked
+	}
+	for i := 0; i < numReaders; i++ {
+		cunlock <- true
+	}
+	// Wait for the goroutines to finish.
+	for i := 0; i < numReaders; i++ {
+		<-cdone
+	}
+}
+
+func TestParallelReaders(t *testing.T) {
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
+	doTestParallelReaders(1, 4)
+	doTestParallelReaders(3, 4)
+	doTestParallelReaders(4, 2)
+}
+
+func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
+	for i := 0; i < numIterations; i++ {
+		rwm.RLock()
+		n := atomic.AddInt32(activity, 1)
+		if n < 1 || n >= 10000 {
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ {
+		}
+		atomic.AddInt32(activity, -1)
+		rwm.RUnlock()
+	}
+	cdone <- true
+}
+
+func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
+	for i := 0; i < numIterations; i++ {
+		rwm.Lock()
+		n := atomic.AddInt32(activity, 10000)
+		if n != 10000 {
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ {
+		}
+		atomic.AddInt32(activity, -10000)
+		rwm.Unlock()
+	}
+	cdone <- true
+}
+
+func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
+	for i := 0; i < numIterations; i++ {
+		rwm.Lock()
+		n := atomic.AddInt32(activity, 10000)
+		if n != 10000 {
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ {
+		}
+		atomic.AddInt32(activity, -10000)
+		rwm.DowngradeLock()
+		n = atomic.AddInt32(activity, 1)
+		if n < 1 || n >= 10000 {
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ {
+		}
+		n = atomic.AddInt32(activity, -1)
+		rwm.RUnlock()
+	}
+	cdone <- true
+}
+
+func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) {
+	runtime.GOMAXPROCS(gomaxprocs)
+	// Number of active readers + 10000 * number of active writers.
+	var activity int32
+	var rwm DowngradableRWMutex
+	cdone := make(chan bool)
+	go writer(&rwm, numIterations, &activity, cdone)
+	go downgradingWriter(&rwm, numIterations, &activity, cdone)
+	var i int
+	for i = 0; i < numReaders/2; i++ {
+		go reader(&rwm, numIterations, &activity, cdone)
+	}
+	go writer(&rwm, numIterations, &activity, cdone)
+	go downgradingWriter(&rwm, numIterations, &activity, cdone)
+	for ; i < numReaders; i++ {
+		go reader(&rwm, numIterations, &activity, cdone)
+	}
+	// Wait for the 4 writers and all readers to finish.
+	for i := 0; i < 4+numReaders; i++ {
+		<-cdone
+	}
+}
+
+func TestDowngradableRWMutex(t *testing.T) {
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
+	n := 1000
+	if testing.Short() {
+		n = 5
+	}
+	HammerDowngradableRWMutex(1, 1, n)
+	HammerDowngradableRWMutex(1, 3, n)
+	HammerDowngradableRWMutex(1, 10, n)
+	HammerDowngradableRWMutex(4, 1, n)
+	HammerDowngradableRWMutex(4, 3, n)
+	HammerDowngradableRWMutex(4, 10, n)
+	HammerDowngradableRWMutex(10, 1, n)
+	HammerDowngradableRWMutex(10, 3, n)
+	HammerDowngradableRWMutex(10, 10, n)
+	HammerDowngradableRWMutex(10, 5, n)
+}
diff --git a/pkg/sync/downgradable_rwmutex_unsafe.go b/pkg/sync/downgradable_rwmutex_unsafe.go
new file mode 100644
index 000000000..9bb55cd3a
--- /dev/null
+++ b/pkg/sync/downgradable_rwmutex_unsafe.go
@@ -0,0 +1,146 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright 2019 The gVisor Authors.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.13
+// +build !go1.15
+
+// Check go:linkname function signatures when updating Go version.
+
+// This is mostly copied from the standard library's sync/rwmutex.go.
+//
+// Happens-before relationships indicated to the race detector:
+// - Unlock -> Lock (via writerSem)
+// - Unlock -> RLock (via readerSem)
+// - RUnlock -> Lock (via writerSem)
+// - DowngradeLock -> RLock (via readerSem)
+
+package sync
+
+import (
+	"sync"
+	"sync/atomic"
+	"unsafe"
+)
+
+//go:linkname runtimeSemacquire sync.runtime_Semacquire
+func runtimeSemacquire(s *uint32)
+
+//go:linkname runtimeSemrelease sync.runtime_Semrelease
+func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
+
+// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock
+// method.
+type DowngradableRWMutex struct {
+	w           sync.Mutex // held if there are pending writers
+	writerSem   uint32     // semaphore for writers to wait for completing readers
+	readerSem   uint32     // semaphore for readers to wait for completing writers
+	readerCount int32      // number of pending readers
+	readerWait  int32      // number of departing readers
+}
+
+const rwmutexMaxReaders = 1 << 30
+
+// RLock locks rw for reading.
+func (rw *DowngradableRWMutex) RLock() {
+	if RaceEnabled {
+		RaceDisable()
+	}
+	if atomic.AddInt32(&rw.readerCount, 1) < 0 {
+		// A writer is pending, wait for it.
+		runtimeSemacquire(&rw.readerSem)
+	}
+	if RaceEnabled {
+		RaceEnable()
+		RaceAcquire(unsafe.Pointer(&rw.readerSem))
+	}
+}
+
+// RUnlock undoes a single RLock call.
+func (rw *DowngradableRWMutex) RUnlock() {
+	if RaceEnabled {
+		RaceReleaseMerge(unsafe.Pointer(&rw.writerSem))
+		RaceDisable()
+	}
+	if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 {
+		if r+1 == 0 || r+1 == -rwmutexMaxReaders {
+			panic("RUnlock of unlocked DowngradableRWMutex")
+		}
+		// A writer is pending.
+		if atomic.AddInt32(&rw.readerWait, -1) == 0 {
+			// The last reader unblocks the writer.
+			runtimeSemrelease(&rw.writerSem, false, 0)
+		}
+	}
+	if RaceEnabled {
+		RaceEnable()
+	}
+}
+
+// Lock locks rw for writing.
+func (rw *DowngradableRWMutex) Lock() {
+	if RaceEnabled {
+		RaceDisable()
+	}
+	// First, resolve competition with other writers.
+	rw.w.Lock()
+	// Announce to readers there is a pending writer.
+	r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders
+	// Wait for active readers.
+	if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 {
+		runtimeSemacquire(&rw.writerSem)
+	}
+	if RaceEnabled {
+		RaceEnable()
+		RaceAcquire(unsafe.Pointer(&rw.writerSem))
+	}
+}
+
+// Unlock unlocks rw for writing.
+func (rw *DowngradableRWMutex) Unlock() {
+	if RaceEnabled {
+		RaceRelease(unsafe.Pointer(&rw.writerSem))
+		RaceRelease(unsafe.Pointer(&rw.readerSem))
+		RaceDisable()
+	}
+	// Announce to readers there is no active writer.
+	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders)
+	if r >= rwmutexMaxReaders {
+		panic("Unlock of unlocked DowngradableRWMutex")
+	}
+	// Unblock blocked readers, if any.
+	for i := 0; i < int(r); i++ {
+		runtimeSemrelease(&rw.readerSem, false, 0)
+	}
+	// Allow other writers to proceed.
+	rw.w.Unlock()
+	if RaceEnabled {
+		RaceEnable()
+	}
+}
+
+// DowngradeLock atomically unlocks rw for writing and locks it for reading.
+func (rw *DowngradableRWMutex) DowngradeLock() {
+	if RaceEnabled {
+		RaceRelease(unsafe.Pointer(&rw.readerSem))
+		RaceDisable()
+	}
+	// Announce to readers there is no active writer and one additional reader.
+	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1)
+	if r >= rwmutexMaxReaders+1 {
+		panic("DowngradeLock of unlocked DowngradableRWMutex")
+	}
+	// Unblock blocked readers, if any. Note that this loop starts as 1 since r
+	// includes this goroutine.
+	for i := 1; i < int(r); i++ {
+		runtimeSemrelease(&rw.readerSem, false, 0)
+	}
+	// Allow other writers to proceed to rw.w.Lock(). Note that they will still
+	// block on rw.writerSem since at least this reader exists, such that
+	// DowngradeLock() is atomic with the previous write lock.
+	rw.w.Unlock()
+	if RaceEnabled {
+		RaceEnable()
+	}
+}
diff --git a/pkg/sync/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go
new file mode 100644
index 000000000..ad4a3a37e
--- /dev/null
+++ b/pkg/sync/memmove_unsafe.go
@@ -0,0 +1,28 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.12
+// +build !go1.15
+
+// Check go:linkname function signatures when updating Go version.
+
+package sync
+
+import (
+	"unsafe"
+)
+
+//go:linkname memmove runtime.memmove
+//go:noescape
+func memmove(to, from unsafe.Pointer, n uintptr)
+
+// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad<T>, which can't
+// define it because go_generics can't update the go:linkname annotation.
+// Furthermore, go:linkname silently doesn't work if the local name is exported
+// (this is of course undocumented), which is why this indirection is
+// necessary.
+func Memmove(to, from unsafe.Pointer, n uintptr) {
+	memmove(to, from, n)
+}
diff --git a/pkg/sync/norace_unsafe.go b/pkg/sync/norace_unsafe.go
new file mode 100644
index 000000000..006055dd6
--- /dev/null
+++ b/pkg/sync/norace_unsafe.go
@@ -0,0 +1,35 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !race
+
+package sync
+
+import (
+	"unsafe"
+)
+
+// RaceEnabled is true if the Go data race detector is enabled.
+const RaceEnabled = false
+
+// RaceDisable has the same semantics as runtime.RaceDisable.
+func RaceDisable() {
+}
+
+// RaceEnable has the same semantics as runtime.RaceEnable.
+func RaceEnable() {
+}
+
+// RaceAcquire has the same semantics as runtime.RaceAcquire.
+func RaceAcquire(addr unsafe.Pointer) {
+}
+
+// RaceRelease has the same semantics as runtime.RaceRelease.
+func RaceRelease(addr unsafe.Pointer) {
+}
+
+// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge.
+func RaceReleaseMerge(addr unsafe.Pointer) {
+}
diff --git a/pkg/sync/race_unsafe.go b/pkg/sync/race_unsafe.go
new file mode 100644
index 000000000..31d8fa9a6
--- /dev/null
+++ b/pkg/sync/race_unsafe.go
@@ -0,0 +1,41 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build race
+
+package sync
+
+import (
+	"runtime"
+	"unsafe"
+)
+
+// RaceEnabled is true if the Go data race detector is enabled.
+const RaceEnabled = true
+
+// RaceDisable has the same semantics as runtime.RaceDisable.
+func RaceDisable() {
+	runtime.RaceDisable()
+}
+
+// RaceEnable has the same semantics as runtime.RaceEnable.
+func RaceEnable() {
+	runtime.RaceEnable()
+}
+
+// RaceAcquire has the same semantics as runtime.RaceAcquire.
+func RaceAcquire(addr unsafe.Pointer) {
+	runtime.RaceAcquire(addr)
+}
+
+// RaceRelease has the same semantics as runtime.RaceRelease.
+func RaceRelease(addr unsafe.Pointer) {
+	runtime.RaceRelease(addr)
+}
+
+// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge.
+func RaceReleaseMerge(addr unsafe.Pointer) {
+	runtime.RaceReleaseMerge(addr)
+}
diff --git a/pkg/sync/seqatomic_unsafe.go b/pkg/sync/seqatomic_unsafe.go
new file mode 100644
index 000000000..eda6fb131
--- /dev/null
+++ b/pkg/sync/seqatomic_unsafe.go
@@ -0,0 +1,72 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package template doesn't exist. This file must be instantiated using the
+// go_template_instance rule in tools/go_generics/defs.bzl.
+package template
+
+import (
+	"fmt"
+	"reflect"
+	"strings"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// Value is a required type parameter.
+//
+// Value must not contain any pointers, including interface objects, function
+// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs
+// containing any of the above. An init() function will panic if this property
+// does not hold.
+type Value struct{}
+
+// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
+// with any writer critical sections in sc.
+func SeqAtomicLoad(sc *sync.SeqCount, ptr *Value) Value {
+	// This function doesn't use SeqAtomicTryLoad because doing so is
+	// measurably, significantly (~20%) slower; Go is awful at inlining.
+	var val Value
+	for {
+		epoch := sc.BeginRead()
+		if sync.RaceEnabled {
+			// runtime.RaceDisable() doesn't actually stop the race detector,
+			// so it can't help us here. Instead, call runtime.memmove
+			// directly, which is not instrumented by the race detector.
+			sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
+		} else {
+			// This is ~40% faster for short reads than going through memmove.
+			val = *ptr
+		}
+		if sc.ReadOk(epoch) {
+			break
+		}
+	}
+	return val
+}
+
+// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section
+// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read
+// would race with a writer critical section, SeqAtomicTryLoad returns
+// (unspecified, false).
+func SeqAtomicTryLoad(sc *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (Value, bool) {
+	var val Value
+	if sync.RaceEnabled {
+		sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
+	} else {
+		val = *ptr
+	}
+	return val, sc.ReadOk(epoch)
+}
+
+func init() {
+	var val Value
+	typ := reflect.TypeOf(val)
+	name := typ.Name()
+	if ptrs := sync.PointersInType(typ, name); len(ptrs) != 0 {
+		panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n")))
+	}
+}
diff --git a/pkg/sync/seqatomictest/BUILD b/pkg/sync/seqatomictest/BUILD
new file mode 100644
index 000000000..eba21518d
--- /dev/null
+++ b/pkg/sync/seqatomictest/BUILD
@@ -0,0 +1,33 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "seqatomic_int",
+    out = "seqatomic_int_unsafe.go",
+    package = "seqatomic",
+    suffix = "Int",
+    template = "//pkg/sync:generic_seqatomic",
+    types = {
+        "Value": "int",
+    },
+)
+
+go_library(
+    name = "seqatomic",
+    srcs = ["seqatomic_int_unsafe.go"],
+    importpath = "gvisor.dev/gvisor/pkg/sync/seqatomic",
+    deps = [
+        "//pkg/sync",
+    ],
+)
+
+go_test(
+    name = "seqatomic_test",
+    size = "small",
+    srcs = ["seqatomic_test.go"],
+    embed = [":seqatomic"],
+    deps = ["//pkg/sync"],
+)
diff --git a/pkg/sync/seqatomictest/seqatomic_test.go b/pkg/sync/seqatomictest/seqatomic_test.go
new file mode 100644
index 000000000..2c4568b07
--- /dev/null
+++ b/pkg/sync/seqatomictest/seqatomic_test.go
@@ -0,0 +1,132 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package seqatomic
+
+import (
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+func TestSeqAtomicLoadUncontended(t *testing.T) {
+	var seq sync.SeqCount
+	const want = 1
+	data := want
+	if got := SeqAtomicLoadInt(&seq, &data); got != want {
+		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
+	}
+}
+
+func TestSeqAtomicLoadAfterWrite(t *testing.T) {
+	var seq sync.SeqCount
+	var data int
+	const want = 1
+	seq.BeginWrite()
+	data = want
+	seq.EndWrite()
+	if got := SeqAtomicLoadInt(&seq, &data); got != want {
+		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
+	}
+}
+
+func TestSeqAtomicLoadDuringWrite(t *testing.T) {
+	var seq sync.SeqCount
+	var data int
+	const want = 1
+	seq.BeginWrite()
+	go func() {
+		time.Sleep(time.Second)
+		data = want
+		seq.EndWrite()
+	}()
+	if got := SeqAtomicLoadInt(&seq, &data); got != want {
+		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
+	}
+}
+
+func TestSeqAtomicTryLoadUncontended(t *testing.T) {
+	var seq sync.SeqCount
+	const want = 1
+	data := want
+	epoch := seq.BeginRead()
+	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want {
+		t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want)
+	}
+}
+
+func TestSeqAtomicTryLoadDuringWrite(t *testing.T) {
+	var seq sync.SeqCount
+	var data int
+	epoch := seq.BeginRead()
+	seq.BeginWrite()
+	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok {
+		t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got)
+	}
+	seq.EndWrite()
+}
+
+func TestSeqAtomicTryLoadAfterWrite(t *testing.T) {
+	var seq sync.SeqCount
+	var data int
+	epoch := seq.BeginRead()
+	seq.BeginWrite()
+	seq.EndWrite()
+	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok {
+		t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got)
+	}
+}
+
+func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) {
+	var seq sync.SeqCount
+	const want = 42
+	data := want
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			if got := SeqAtomicLoadInt(&seq, &data); got != want {
+				b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
+			}
+		}
+	})
+}
+
+func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) {
+	var seq sync.SeqCount
+	const want = 42
+	data := want
+	b.RunParallel(func(pb *testing.PB) {
+		epoch := seq.BeginRead()
+		for pb.Next() {
+			if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want {
+				b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want)
+			}
+		}
+	})
+}
+
+// For comparison:
+func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) {
+	var a atomic.Value
+	const want = 42
+	a.Store(int(want))
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			if got := a.Load().(int); got != want {
+				b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want)
+			}
+		}
+	})
+}
diff --git a/pkg/sync/seqcount.go b/pkg/sync/seqcount.go
new file mode 100644
index 000000000..a1e895352
--- /dev/null
+++ b/pkg/sync/seqcount.go
@@ -0,0 +1,149 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync
+
+import (
+	"fmt"
+	"reflect"
+	"runtime"
+	"sync/atomic"
+)
+
+// SeqCount is a synchronization primitive for optimistic reader/writer
+// synchronization in cases where readers can work with stale data and
+// therefore do not need to block writers.
+//
+// Compared to sync/atomic.Value:
+//
+// - Mutation of SeqCount-protected data does not require memory allocation,
+// whereas atomic.Value generally does. This is a significant advantage when
+// writes are common.
+//
+// - Atomic reads of SeqCount-protected data require copying. This is a
+// disadvantage when atomic reads are common.
+//
+// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other
+// operations to be made atomic with reads of SeqCount-protected data.
+//
+// - SeqCount may be less flexible: as of this writing, SeqCount-protected data
+// cannot include pointers.
+//
+// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected
+// data require instantiating function templates using go_generics (see
+// seqatomic.go).
+type SeqCount struct {
+	// epoch is incremented by BeginWrite and EndWrite, such that epoch is odd
+	// if a writer critical section is active, and a read from data protected
+	// by this SeqCount is atomic iff epoch is the same even value before and
+	// after the read.
+	epoch uint32
+}
+
+// SeqCountEpoch tracks writer critical sections in a SeqCount.
+type SeqCountEpoch struct {
+	val uint32
+}
+
+// We assume that:
+//
+// - All functions in sync/atomic that perform a memory read are at least a
+// read fence: memory reads before calls to such functions cannot be reordered
+// after the call, and memory reads after calls to such functions cannot be
+// reordered before the call, even if those reads do not use sync/atomic.
+//
+// - All functions in sync/atomic that perform a memory write are at least a
+// write fence: memory writes before calls to such functions cannot be
+// reordered after the call, and memory writes after calls to such functions
+// cannot be reordered before the call, even if those writes do not use
+// sync/atomic.
+//
+// As of this writing, the Go memory model completely fails to describe
+// sync/atomic, but these properties are implied by
+// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8.
+
+// BeginRead indicates the beginning of a reader critical section. Reader
+// critical sections DO NOT BLOCK writer critical sections, so operations in a
+// reader critical section MAY RACE with writer critical sections. Races are
+// detected by ReadOk at the end of the reader critical section. Thus, the
+// low-level structure of readers is generally:
+//
+//     for {
+//         epoch := seq.BeginRead()
+//         // do something idempotent with seq-protected data
+//         if seq.ReadOk(epoch) {
+//             break
+//         }
+//     }
+//
+// However, since reader critical sections may race with writer critical
+// sections, the Go race detector will (accurately) flag data races in readers
+// using this pattern. Most users of SeqCount will need to use the
+// SeqAtomicLoad function template in seqatomic.go.
+func (s *SeqCount) BeginRead() SeqCountEpoch {
+	epoch := atomic.LoadUint32(&s.epoch)
+	for epoch&1 != 0 {
+		runtime.Gosched()
+		epoch = atomic.LoadUint32(&s.epoch)
+	}
+	return SeqCountEpoch{epoch}
+}
+
+// ReadOk returns true if the reader critical section initiated by a previous
+// call to BeginRead() that returned epoch did not race with any writer critical
+// sections.
+//
+// ReadOk may be called any number of times during a reader critical section.
+// Reader critical sections do not need to be explicitly terminated; the last
+// call to ReadOk is implicitly the end of the reader critical section.
+func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool {
+	return atomic.LoadUint32(&s.epoch) == epoch.val
+}
+
+// BeginWrite indicates the beginning of a writer critical section.
+//
+// SeqCount does not support concurrent writer critical sections; clients with
+// concurrent writers must synchronize them using e.g. sync.Mutex.
+func (s *SeqCount) BeginWrite() {
+	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 {
+		panic("SeqCount.BeginWrite during writer critical section")
+	}
+}
+
+// EndWrite ends the effect of a preceding BeginWrite.
+func (s *SeqCount) EndWrite() {
+	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 {
+		panic("SeqCount.EndWrite outside writer critical section")
+	}
+}
+
+// PointersInType returns a list of pointers reachable from values named
+// valName of the given type.
+//
+// PointersInType is not exhaustive, but it is guaranteed that if typ contains
+// at least one pointer, then PointersInTypeOf returns a non-empty list.
+func PointersInType(typ reflect.Type, valName string) []string {
+	switch kind := typ.Kind(); kind {
+	case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
+		return nil
+
+	case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer:
+		return []string{valName}
+
+	case reflect.Array:
+		return PointersInType(typ.Elem(), valName+"[]")
+
+	case reflect.Struct:
+		var ptrs []string
+		for i, n := 0, typ.NumField(); i < n; i++ {
+			field := typ.Field(i)
+			ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...)
+		}
+		return ptrs
+
+	default:
+		return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)}
+	}
+}
diff --git a/pkg/sync/seqcount_test.go b/pkg/sync/seqcount_test.go
new file mode 100644
index 000000000..6eb7b4b59
--- /dev/null
+++ b/pkg/sync/seqcount_test.go
@@ -0,0 +1,153 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync
+
+import (
+	"reflect"
+	"testing"
+	"time"
+)
+
+func TestSeqCountWriteUncontended(t *testing.T) {
+	var seq SeqCount
+	seq.BeginWrite()
+	seq.EndWrite()
+}
+
+func TestSeqCountReadUncontended(t *testing.T) {
+	var seq SeqCount
+	epoch := seq.BeginRead()
+	if !seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got false, wanted true")
+	}
+}
+
+func TestSeqCountBeginReadAfterWrite(t *testing.T) {
+	var seq SeqCount
+	var data int32
+	const want = 1
+	seq.BeginWrite()
+	data = want
+	seq.EndWrite()
+	epoch := seq.BeginRead()
+	if data != want {
+		t.Errorf("Reader: got %v, wanted %v", data, want)
+	}
+	if !seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got false, wanted true")
+	}
+}
+
+func TestSeqCountBeginReadDuringWrite(t *testing.T) {
+	var seq SeqCount
+	var data int
+	const want = 1
+	seq.BeginWrite()
+	go func() {
+		time.Sleep(time.Second)
+		data = want
+		seq.EndWrite()
+	}()
+	epoch := seq.BeginRead()
+	if data != want {
+		t.Errorf("Reader: got %v, wanted %v", data, want)
+	}
+	if !seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got false, wanted true")
+	}
+}
+
+func TestSeqCountReadOkAfterWrite(t *testing.T) {
+	var seq SeqCount
+	epoch := seq.BeginRead()
+	seq.BeginWrite()
+	seq.EndWrite()
+	if seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got true, wanted false")
+	}
+}
+
+func TestSeqCountReadOkDuringWrite(t *testing.T) {
+	var seq SeqCount
+	epoch := seq.BeginRead()
+	seq.BeginWrite()
+	if seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got true, wanted false")
+	}
+	seq.EndWrite()
+}
+
+func BenchmarkSeqCountWriteUncontended(b *testing.B) {
+	var seq SeqCount
+	for i := 0; i < b.N; i++ {
+		seq.BeginWrite()
+		seq.EndWrite()
+	}
+}
+
+func BenchmarkSeqCountReadUncontended(b *testing.B) {
+	var seq SeqCount
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			epoch := seq.BeginRead()
+			if !seq.ReadOk(epoch) {
+				b.Fatalf("ReadOk: got false, wanted true")
+			}
+		}
+	})
+}
+
+func TestPointersInType(t *testing.T) {
+	for _, test := range []struct {
+		name string // used for both test and value name
+		val  interface{}
+		ptrs []string
+	}{
+		{
+			name: "EmptyStruct",
+			val:  struct{}{},
+		},
+		{
+			name: "Int",
+			val:  int(0),
+		},
+		{
+			name: "MixedStruct",
+			val: struct {
+				b             bool
+				I             int
+				ExportedPtr   *struct{}
+				unexportedPtr *struct{}
+				arr           [2]int
+				ptrArr        [2]*int
+				nestedStruct  struct {
+					nestedNonptr int
+					nestedPtr    *int
+				}
+				structArr [1]struct {
+					nonptr int
+					ptr    *int
+				}
+			}{},
+			ptrs: []string{
+				"MixedStruct.ExportedPtr",
+				"MixedStruct.unexportedPtr",
+				"MixedStruct.ptrArr[]",
+				"MixedStruct.nestedStruct.nestedPtr",
+				"MixedStruct.structArr[].ptr",
+			},
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			typ := reflect.TypeOf(test.val)
+			ptrs := PointersInType(typ, test.name)
+			t.Logf("Found pointers: %v", ptrs)
+			if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) {
+				t.Errorf("Got %v, wanted %v", ptrs, test.ptrs)
+			}
+		})
+	}
+}
diff --git a/pkg/sync/syncutil.go b/pkg/sync/syncutil.go
new file mode 100644
index 000000000..b16cf5333
--- /dev/null
+++ b/pkg/sync/syncutil.go
@@ -0,0 +1,7 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package sync provides synchronization primitives.
+package sync
diff --git a/pkg/syncutil/BUILD b/pkg/syncutil/BUILD
deleted file mode 100644
index cb1f41628..000000000
--- a/pkg/syncutil/BUILD
+++ /dev/null
@@ -1,52 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template")
-
-package(
-    default_visibility = ["//:sandbox"],
-    licenses = ["notice"],
-)
-
-exports_files(["LICENSE"])
-
-go_template(
-    name = "generic_atomicptr",
-    srcs = ["atomicptr_unsafe.go"],
-    types = [
-        "Value",
-    ],
-)
-
-go_template(
-    name = "generic_seqatomic",
-    srcs = ["seqatomic_unsafe.go"],
-    types = [
-        "Value",
-    ],
-    deps = [
-        ":sync",
-    ],
-)
-
-go_library(
-    name = "syncutil",
-    srcs = [
-        "downgradable_rwmutex_unsafe.go",
-        "memmove_unsafe.go",
-        "norace_unsafe.go",
-        "race_unsafe.go",
-        "seqcount.go",
-        "syncutil.go",
-    ],
-    importpath = "gvisor.dev/gvisor/pkg/syncutil",
-)
-
-go_test(
-    name = "syncutil_test",
-    size = "small",
-    srcs = [
-        "downgradable_rwmutex_test.go",
-        "seqcount_test.go",
-    ],
-    embed = [":syncutil"],
-)
diff --git a/pkg/syncutil/LICENSE b/pkg/syncutil/LICENSE
deleted file mode 100644
index 6a66aea5e..000000000
--- a/pkg/syncutil/LICENSE
+++ /dev/null
@@ -1,27 +0,0 @@
-Copyright (c) 2009 The Go Authors. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-   * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-   * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-   * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/pkg/syncutil/README.md b/pkg/syncutil/README.md
deleted file mode 100644
index 2183c4e20..000000000
--- a/pkg/syncutil/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Syncutil
-
-This package provides additional synchronization primitives not provided by the
-Go stdlib 'sync' package. It is partially derived from the upstream 'sync'
-package from go1.10.
diff --git a/pkg/syncutil/atomicptr_unsafe.go b/pkg/syncutil/atomicptr_unsafe.go
deleted file mode 100644
index 525c4beed..000000000
--- a/pkg/syncutil/atomicptr_unsafe.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package template doesn't exist. This file must be instantiated using the
-// go_template_instance rule in tools/go_generics/defs.bzl.
-package template
-
-import (
-	"sync/atomic"
-	"unsafe"
-)
-
-// Value is a required type parameter.
-type Value struct{}
-
-// An AtomicPtr is a pointer to a value of type Value that can be atomically
-// loaded and stored. The zero value of an AtomicPtr represents nil.
-//
-// Note that copying AtomicPtr by value performs a non-atomic read of the
-// stored pointer, which is unsafe if Store() can be called concurrently; in
-// this case, do `dst.Store(src.Load())` instead.
-//
-// +stateify savable
-type AtomicPtr struct {
-	ptr unsafe.Pointer `state:".(*Value)"`
-}
-
-func (p *AtomicPtr) savePtr() *Value {
-	return p.Load()
-}
-
-func (p *AtomicPtr) loadPtr(v *Value) {
-	p.Store(v)
-}
-
-// Load returns the value set by the most recent Store. It returns nil if there
-// has been no previous call to Store.
-func (p *AtomicPtr) Load() *Value {
-	return (*Value)(atomic.LoadPointer(&p.ptr))
-}
-
-// Store sets the value returned by Load to x.
-func (p *AtomicPtr) Store(x *Value) {
-	atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x))
-}
diff --git a/pkg/syncutil/atomicptrtest/BUILD b/pkg/syncutil/atomicptrtest/BUILD
deleted file mode 100644
index 63f411a90..000000000
--- a/pkg/syncutil/atomicptrtest/BUILD
+++ /dev/null
@@ -1,29 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
-package(licenses = ["notice"])
-
-go_template_instance(
-    name = "atomicptr_int",
-    out = "atomicptr_int_unsafe.go",
-    package = "atomicptr",
-    suffix = "Int",
-    template = "//pkg/syncutil:generic_atomicptr",
-    types = {
-        "Value": "int",
-    },
-)
-
-go_library(
-    name = "atomicptr",
-    srcs = ["atomicptr_int_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/pkg/syncutil/atomicptr",
-)
-
-go_test(
-    name = "atomicptr_test",
-    size = "small",
-    srcs = ["atomicptr_test.go"],
-    embed = [":atomicptr"],
-)
diff --git a/pkg/syncutil/atomicptrtest/atomicptr_test.go b/pkg/syncutil/atomicptrtest/atomicptr_test.go
deleted file mode 100644
index 8fdc5112e..000000000
--- a/pkg/syncutil/atomicptrtest/atomicptr_test.go
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package atomicptr
-
-import (
-	"testing"
-)
-
-func newInt(val int) *int {
-	return &val
-}
-
-func TestAtomicPtr(t *testing.T) {
-	var p AtomicPtrInt
-	if got := p.Load(); got != nil {
-		t.Errorf("initial value is %p (%v), wanted nil", got, got)
-	}
-	want := newInt(42)
-	p.Store(want)
-	if got := p.Load(); got != want {
-		t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want)
-	}
-	want = newInt(100)
-	p.Store(want)
-	if got := p.Load(); got != want {
-		t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want)
-	}
-}
diff --git a/pkg/syncutil/downgradable_rwmutex_test.go b/pkg/syncutil/downgradable_rwmutex_test.go
deleted file mode 100644
index ffaf7ecc7..000000000
--- a/pkg/syncutil/downgradable_rwmutex_test.go
+++ /dev/null
@@ -1,150 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// GOMAXPROCS=10 go test
-
-// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the
-// addition of downgradingWriter and the renaming of num_iterations to
-// numIterations to shut up Golint.
-
-package syncutil
-
-import (
-	"fmt"
-	"runtime"
-	"sync/atomic"
-	"testing"
-)
-
-func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) {
-	m.RLock()
-	clocked <- true
-	<-cunlock
-	m.RUnlock()
-	cdone <- true
-}
-
-func doTestParallelReaders(numReaders, gomaxprocs int) {
-	runtime.GOMAXPROCS(gomaxprocs)
-	var m DowngradableRWMutex
-	clocked := make(chan bool)
-	cunlock := make(chan bool)
-	cdone := make(chan bool)
-	for i := 0; i < numReaders; i++ {
-		go parallelReader(&m, clocked, cunlock, cdone)
-	}
-	// Wait for all parallel RLock()s to succeed.
-	for i := 0; i < numReaders; i++ {
-		<-clocked
-	}
-	for i := 0; i < numReaders; i++ {
-		cunlock <- true
-	}
-	// Wait for the goroutines to finish.
-	for i := 0; i < numReaders; i++ {
-		<-cdone
-	}
-}
-
-func TestParallelReaders(t *testing.T) {
-	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
-	doTestParallelReaders(1, 4)
-	doTestParallelReaders(3, 4)
-	doTestParallelReaders(4, 2)
-}
-
-func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
-	for i := 0; i < numIterations; i++ {
-		rwm.RLock()
-		n := atomic.AddInt32(activity, 1)
-		if n < 1 || n >= 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		atomic.AddInt32(activity, -1)
-		rwm.RUnlock()
-	}
-	cdone <- true
-}
-
-func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
-	for i := 0; i < numIterations; i++ {
-		rwm.Lock()
-		n := atomic.AddInt32(activity, 10000)
-		if n != 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		atomic.AddInt32(activity, -10000)
-		rwm.Unlock()
-	}
-	cdone <- true
-}
-
-func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
-	for i := 0; i < numIterations; i++ {
-		rwm.Lock()
-		n := atomic.AddInt32(activity, 10000)
-		if n != 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		atomic.AddInt32(activity, -10000)
-		rwm.DowngradeLock()
-		n = atomic.AddInt32(activity, 1)
-		if n < 1 || n >= 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		n = atomic.AddInt32(activity, -1)
-		rwm.RUnlock()
-	}
-	cdone <- true
-}
-
-func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) {
-	runtime.GOMAXPROCS(gomaxprocs)
-	// Number of active readers + 10000 * number of active writers.
-	var activity int32
-	var rwm DowngradableRWMutex
-	cdone := make(chan bool)
-	go writer(&rwm, numIterations, &activity, cdone)
-	go downgradingWriter(&rwm, numIterations, &activity, cdone)
-	var i int
-	for i = 0; i < numReaders/2; i++ {
-		go reader(&rwm, numIterations, &activity, cdone)
-	}
-	go writer(&rwm, numIterations, &activity, cdone)
-	go downgradingWriter(&rwm, numIterations, &activity, cdone)
-	for ; i < numReaders; i++ {
-		go reader(&rwm, numIterations, &activity, cdone)
-	}
-	// Wait for the 4 writers and all readers to finish.
-	for i := 0; i < 4+numReaders; i++ {
-		<-cdone
-	}
-}
-
-func TestDowngradableRWMutex(t *testing.T) {
-	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
-	n := 1000
-	if testing.Short() {
-		n = 5
-	}
-	HammerDowngradableRWMutex(1, 1, n)
-	HammerDowngradableRWMutex(1, 3, n)
-	HammerDowngradableRWMutex(1, 10, n)
-	HammerDowngradableRWMutex(4, 1, n)
-	HammerDowngradableRWMutex(4, 3, n)
-	HammerDowngradableRWMutex(4, 10, n)
-	HammerDowngradableRWMutex(10, 1, n)
-	HammerDowngradableRWMutex(10, 3, n)
-	HammerDowngradableRWMutex(10, 10, n)
-	HammerDowngradableRWMutex(10, 5, n)
-}
diff --git a/pkg/syncutil/downgradable_rwmutex_unsafe.go b/pkg/syncutil/downgradable_rwmutex_unsafe.go
deleted file mode 100644
index 51e11555d..000000000
--- a/pkg/syncutil/downgradable_rwmutex_unsafe.go
+++ /dev/null
@@ -1,146 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.13
-// +build !go1.15
-
-// Check go:linkname function signatures when updating Go version.
-
-// This is mostly copied from the standard library's sync/rwmutex.go.
-//
-// Happens-before relationships indicated to the race detector:
-// - Unlock -> Lock (via writerSem)
-// - Unlock -> RLock (via readerSem)
-// - RUnlock -> Lock (via writerSem)
-// - DowngradeLock -> RLock (via readerSem)
-
-package syncutil
-
-import (
-	"sync"
-	"sync/atomic"
-	"unsafe"
-)
-
-//go:linkname runtimeSemacquire sync.runtime_Semacquire
-func runtimeSemacquire(s *uint32)
-
-//go:linkname runtimeSemrelease sync.runtime_Semrelease
-func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
-
-// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock
-// method.
-type DowngradableRWMutex struct {
-	w           sync.Mutex // held if there are pending writers
-	writerSem   uint32     // semaphore for writers to wait for completing readers
-	readerSem   uint32     // semaphore for readers to wait for completing writers
-	readerCount int32      // number of pending readers
-	readerWait  int32      // number of departing readers
-}
-
-const rwmutexMaxReaders = 1 << 30
-
-// RLock locks rw for reading.
-func (rw *DowngradableRWMutex) RLock() {
-	if RaceEnabled {
-		RaceDisable()
-	}
-	if atomic.AddInt32(&rw.readerCount, 1) < 0 {
-		// A writer is pending, wait for it.
-		runtimeSemacquire(&rw.readerSem)
-	}
-	if RaceEnabled {
-		RaceEnable()
-		RaceAcquire(unsafe.Pointer(&rw.readerSem))
-	}
-}
-
-// RUnlock undoes a single RLock call.
-func (rw *DowngradableRWMutex) RUnlock() {
-	if RaceEnabled {
-		RaceReleaseMerge(unsafe.Pointer(&rw.writerSem))
-		RaceDisable()
-	}
-	if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 {
-		if r+1 == 0 || r+1 == -rwmutexMaxReaders {
-			panic("RUnlock of unlocked DowngradableRWMutex")
-		}
-		// A writer is pending.
-		if atomic.AddInt32(&rw.readerWait, -1) == 0 {
-			// The last reader unblocks the writer.
-			runtimeSemrelease(&rw.writerSem, false, 0)
-		}
-	}
-	if RaceEnabled {
-		RaceEnable()
-	}
-}
-
-// Lock locks rw for writing.
-func (rw *DowngradableRWMutex) Lock() {
-	if RaceEnabled {
-		RaceDisable()
-	}
-	// First, resolve competition with other writers.
-	rw.w.Lock()
-	// Announce to readers there is a pending writer.
-	r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders
-	// Wait for active readers.
-	if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 {
-		runtimeSemacquire(&rw.writerSem)
-	}
-	if RaceEnabled {
-		RaceEnable()
-		RaceAcquire(unsafe.Pointer(&rw.writerSem))
-	}
-}
-
-// Unlock unlocks rw for writing.
-func (rw *DowngradableRWMutex) Unlock() {
-	if RaceEnabled {
-		RaceRelease(unsafe.Pointer(&rw.writerSem))
-		RaceRelease(unsafe.Pointer(&rw.readerSem))
-		RaceDisable()
-	}
-	// Announce to readers there is no active writer.
-	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders)
-	if r >= rwmutexMaxReaders {
-		panic("Unlock of unlocked DowngradableRWMutex")
-	}
-	// Unblock blocked readers, if any.
-	for i := 0; i < int(r); i++ {
-		runtimeSemrelease(&rw.readerSem, false, 0)
-	}
-	// Allow other writers to proceed.
-	rw.w.Unlock()
-	if RaceEnabled {
-		RaceEnable()
-	}
-}
-
-// DowngradeLock atomically unlocks rw for writing and locks it for reading.
-func (rw *DowngradableRWMutex) DowngradeLock() {
-	if RaceEnabled {
-		RaceRelease(unsafe.Pointer(&rw.readerSem))
-		RaceDisable()
-	}
-	// Announce to readers there is no active writer and one additional reader.
-	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1)
-	if r >= rwmutexMaxReaders+1 {
-		panic("DowngradeLock of unlocked DowngradableRWMutex")
-	}
-	// Unblock blocked readers, if any. Note that this loop starts as 1 since r
-	// includes this goroutine.
-	for i := 1; i < int(r); i++ {
-		runtimeSemrelease(&rw.readerSem, false, 0)
-	}
-	// Allow other writers to proceed to rw.w.Lock(). Note that they will still
-	// block on rw.writerSem since at least this reader exists, such that
-	// DowngradeLock() is atomic with the previous write lock.
-	rw.w.Unlock()
-	if RaceEnabled {
-		RaceEnable()
-	}
-}
diff --git a/pkg/syncutil/memmove_unsafe.go b/pkg/syncutil/memmove_unsafe.go
deleted file mode 100644
index 348675baa..000000000
--- a/pkg/syncutil/memmove_unsafe.go
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.12
-// +build !go1.15
-
-// Check go:linkname function signatures when updating Go version.
-
-package syncutil
-
-import (
-	"unsafe"
-)
-
-//go:linkname memmove runtime.memmove
-//go:noescape
-func memmove(to, from unsafe.Pointer, n uintptr)
-
-// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad<T>, which can't
-// define it because go_generics can't update the go:linkname annotation.
-// Furthermore, go:linkname silently doesn't work if the local name is exported
-// (this is of course undocumented), which is why this indirection is
-// necessary.
-func Memmove(to, from unsafe.Pointer, n uintptr) {
-	memmove(to, from, n)
-}
diff --git a/pkg/syncutil/norace_unsafe.go b/pkg/syncutil/norace_unsafe.go
deleted file mode 100644
index 0a0a9deda..000000000
--- a/pkg/syncutil/norace_unsafe.go
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !race
-
-package syncutil
-
-import (
-	"unsafe"
-)
-
-// RaceEnabled is true if the Go data race detector is enabled.
-const RaceEnabled = false
-
-// RaceDisable has the same semantics as runtime.RaceDisable.
-func RaceDisable() {
-}
-
-// RaceEnable has the same semantics as runtime.RaceEnable.
-func RaceEnable() {
-}
-
-// RaceAcquire has the same semantics as runtime.RaceAcquire.
-func RaceAcquire(addr unsafe.Pointer) {
-}
-
-// RaceRelease has the same semantics as runtime.RaceRelease.
-func RaceRelease(addr unsafe.Pointer) {
-}
-
-// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge.
-func RaceReleaseMerge(addr unsafe.Pointer) {
-}
diff --git a/pkg/syncutil/race_unsafe.go b/pkg/syncutil/race_unsafe.go
deleted file mode 100644
index 206067ec1..000000000
--- a/pkg/syncutil/race_unsafe.go
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build race
-
-package syncutil
-
-import (
-	"runtime"
-	"unsafe"
-)
-
-// RaceEnabled is true if the Go data race detector is enabled.
-const RaceEnabled = true
-
-// RaceDisable has the same semantics as runtime.RaceDisable.
-func RaceDisable() {
-	runtime.RaceDisable()
-}
-
-// RaceEnable has the same semantics as runtime.RaceEnable.
-func RaceEnable() {
-	runtime.RaceEnable()
-}
-
-// RaceAcquire has the same semantics as runtime.RaceAcquire.
-func RaceAcquire(addr unsafe.Pointer) {
-	runtime.RaceAcquire(addr)
-}
-
-// RaceRelease has the same semantics as runtime.RaceRelease.
-func RaceRelease(addr unsafe.Pointer) {
-	runtime.RaceRelease(addr)
-}
-
-// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge.
-func RaceReleaseMerge(addr unsafe.Pointer) {
-	runtime.RaceReleaseMerge(addr)
-}
diff --git a/pkg/syncutil/seqatomic_unsafe.go b/pkg/syncutil/seqatomic_unsafe.go
deleted file mode 100644
index cb6d2eb22..000000000
--- a/pkg/syncutil/seqatomic_unsafe.go
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package template doesn't exist. This file must be instantiated using the
-// go_template_instance rule in tools/go_generics/defs.bzl.
-package template
-
-import (
-	"fmt"
-	"reflect"
-	"strings"
-	"unsafe"
-
-	"gvisor.dev/gvisor/pkg/syncutil"
-)
-
-// Value is a required type parameter.
-//
-// Value must not contain any pointers, including interface objects, function
-// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs
-// containing any of the above. An init() function will panic if this property
-// does not hold.
-type Value struct{}
-
-// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
-// with any writer critical sections in sc.
-func SeqAtomicLoad(sc *syncutil.SeqCount, ptr *Value) Value {
-	// This function doesn't use SeqAtomicTryLoad because doing so is
-	// measurably, significantly (~20%) slower; Go is awful at inlining.
-	var val Value
-	for {
-		epoch := sc.BeginRead()
-		if syncutil.RaceEnabled {
-			// runtime.RaceDisable() doesn't actually stop the race detector,
-			// so it can't help us here. Instead, call runtime.memmove
-			// directly, which is not instrumented by the race detector.
-			syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
-		} else {
-			// This is ~40% faster for short reads than going through memmove.
-			val = *ptr
-		}
-		if sc.ReadOk(epoch) {
-			break
-		}
-	}
-	return val
-}
-
-// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section
-// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read
-// would race with a writer critical section, SeqAtomicTryLoad returns
-// (unspecified, false).
-func SeqAtomicTryLoad(sc *syncutil.SeqCount, epoch syncutil.SeqCountEpoch, ptr *Value) (Value, bool) {
-	var val Value
-	if syncutil.RaceEnabled {
-		syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
-	} else {
-		val = *ptr
-	}
-	return val, sc.ReadOk(epoch)
-}
-
-func init() {
-	var val Value
-	typ := reflect.TypeOf(val)
-	name := typ.Name()
-	if ptrs := syncutil.PointersInType(typ, name); len(ptrs) != 0 {
-		panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n")))
-	}
-}
diff --git a/pkg/syncutil/seqatomictest/BUILD b/pkg/syncutil/seqatomictest/BUILD
deleted file mode 100644
index ba18f3238..000000000
--- a/pkg/syncutil/seqatomictest/BUILD
+++ /dev/null
@@ -1,35 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
-package(licenses = ["notice"])
-
-go_template_instance(
-    name = "seqatomic_int",
-    out = "seqatomic_int_unsafe.go",
-    package = "seqatomic",
-    suffix = "Int",
-    template = "//pkg/syncutil:generic_seqatomic",
-    types = {
-        "Value": "int",
-    },
-)
-
-go_library(
-    name = "seqatomic",
-    srcs = ["seqatomic_int_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/pkg/syncutil/seqatomic",
-    deps = [
-        "//pkg/syncutil",
-    ],
-)
-
-go_test(
-    name = "seqatomic_test",
-    size = "small",
-    srcs = ["seqatomic_test.go"],
-    embed = [":seqatomic"],
-    deps = [
-        "//pkg/syncutil",
-    ],
-)
diff --git a/pkg/syncutil/seqatomictest/seqatomic_test.go b/pkg/syncutil/seqatomictest/seqatomic_test.go
deleted file mode 100644
index b0db44999..000000000
--- a/pkg/syncutil/seqatomictest/seqatomic_test.go
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package seqatomic
-
-import (
-	"sync/atomic"
-	"testing"
-	"time"
-
-	"gvisor.dev/gvisor/pkg/syncutil"
-)
-
-func TestSeqAtomicLoadUncontended(t *testing.T) {
-	var seq syncutil.SeqCount
-	const want = 1
-	data := want
-	if got := SeqAtomicLoadInt(&seq, &data); got != want {
-		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
-	}
-}
-
-func TestSeqAtomicLoadAfterWrite(t *testing.T) {
-	var seq syncutil.SeqCount
-	var data int
-	const want = 1
-	seq.BeginWrite()
-	data = want
-	seq.EndWrite()
-	if got := SeqAtomicLoadInt(&seq, &data); got != want {
-		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
-	}
-}
-
-func TestSeqAtomicLoadDuringWrite(t *testing.T) {
-	var seq syncutil.SeqCount
-	var data int
-	const want = 1
-	seq.BeginWrite()
-	go func() {
-		time.Sleep(time.Second)
-		data = want
-		seq.EndWrite()
-	}()
-	if got := SeqAtomicLoadInt(&seq, &data); got != want {
-		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
-	}
-}
-
-func TestSeqAtomicTryLoadUncontended(t *testing.T) {
-	var seq syncutil.SeqCount
-	const want = 1
-	data := want
-	epoch := seq.BeginRead()
-	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want {
-		t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want)
-	}
-}
-
-func TestSeqAtomicTryLoadDuringWrite(t *testing.T) {
-	var seq syncutil.SeqCount
-	var data int
-	epoch := seq.BeginRead()
-	seq.BeginWrite()
-	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok {
-		t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got)
-	}
-	seq.EndWrite()
-}
-
-func TestSeqAtomicTryLoadAfterWrite(t *testing.T) {
-	var seq syncutil.SeqCount
-	var data int
-	epoch := seq.BeginRead()
-	seq.BeginWrite()
-	seq.EndWrite()
-	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok {
-		t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got)
-	}
-}
-
-func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) {
-	var seq syncutil.SeqCount
-	const want = 42
-	data := want
-	b.RunParallel(func(pb *testing.PB) {
-		for pb.Next() {
-			if got := SeqAtomicLoadInt(&seq, &data); got != want {
-				b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
-			}
-		}
-	})
-}
-
-func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) {
-	var seq syncutil.SeqCount
-	const want = 42
-	data := want
-	b.RunParallel(func(pb *testing.PB) {
-		epoch := seq.BeginRead()
-		for pb.Next() {
-			if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want {
-				b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want)
-			}
-		}
-	})
-}
-
-// For comparison:
-func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) {
-	var a atomic.Value
-	const want = 42
-	a.Store(int(want))
-	b.RunParallel(func(pb *testing.PB) {
-		for pb.Next() {
-			if got := a.Load().(int); got != want {
-				b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want)
-			}
-		}
-	})
-}
diff --git a/pkg/syncutil/seqcount.go b/pkg/syncutil/seqcount.go
deleted file mode 100644
index 11d8dbfaa..000000000
--- a/pkg/syncutil/seqcount.go
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package syncutil
-
-import (
-	"fmt"
-	"reflect"
-	"runtime"
-	"sync/atomic"
-)
-
-// SeqCount is a synchronization primitive for optimistic reader/writer
-// synchronization in cases where readers can work with stale data and
-// therefore do not need to block writers.
-//
-// Compared to sync/atomic.Value:
-//
-// - Mutation of SeqCount-protected data does not require memory allocation,
-// whereas atomic.Value generally does. This is a significant advantage when
-// writes are common.
-//
-// - Atomic reads of SeqCount-protected data require copying. This is a
-// disadvantage when atomic reads are common.
-//
-// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other
-// operations to be made atomic with reads of SeqCount-protected data.
-//
-// - SeqCount may be less flexible: as of this writing, SeqCount-protected data
-// cannot include pointers.
-//
-// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected
-// data require instantiating function templates using go_generics (see
-// seqatomic.go).
-type SeqCount struct {
-	// epoch is incremented by BeginWrite and EndWrite, such that epoch is odd
-	// if a writer critical section is active, and a read from data protected
-	// by this SeqCount is atomic iff epoch is the same even value before and
-	// after the read.
-	epoch uint32
-}
-
-// SeqCountEpoch tracks writer critical sections in a SeqCount.
-type SeqCountEpoch struct {
-	val uint32
-}
-
-// We assume that:
-//
-// - All functions in sync/atomic that perform a memory read are at least a
-// read fence: memory reads before calls to such functions cannot be reordered
-// after the call, and memory reads after calls to such functions cannot be
-// reordered before the call, even if those reads do not use sync/atomic.
-//
-// - All functions in sync/atomic that perform a memory write are at least a
-// write fence: memory writes before calls to such functions cannot be
-// reordered after the call, and memory writes after calls to such functions
-// cannot be reordered before the call, even if those writes do not use
-// sync/atomic.
-//
-// As of this writing, the Go memory model completely fails to describe
-// sync/atomic, but these properties are implied by
-// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8.
-
-// BeginRead indicates the beginning of a reader critical section. Reader
-// critical sections DO NOT BLOCK writer critical sections, so operations in a
-// reader critical section MAY RACE with writer critical sections. Races are
-// detected by ReadOk at the end of the reader critical section. Thus, the
-// low-level structure of readers is generally:
-//
-//     for {
-//         epoch := seq.BeginRead()
-//         // do something idempotent with seq-protected data
-//         if seq.ReadOk(epoch) {
-//             break
-//         }
-//     }
-//
-// However, since reader critical sections may race with writer critical
-// sections, the Go race detector will (accurately) flag data races in readers
-// using this pattern. Most users of SeqCount will need to use the
-// SeqAtomicLoad function template in seqatomic.go.
-func (s *SeqCount) BeginRead() SeqCountEpoch {
-	epoch := atomic.LoadUint32(&s.epoch)
-	for epoch&1 != 0 {
-		runtime.Gosched()
-		epoch = atomic.LoadUint32(&s.epoch)
-	}
-	return SeqCountEpoch{epoch}
-}
-
-// ReadOk returns true if the reader critical section initiated by a previous
-// call to BeginRead() that returned epoch did not race with any writer critical
-// sections.
-//
-// ReadOk may be called any number of times during a reader critical section.
-// Reader critical sections do not need to be explicitly terminated; the last
-// call to ReadOk is implicitly the end of the reader critical section.
-func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool {
-	return atomic.LoadUint32(&s.epoch) == epoch.val
-}
-
-// BeginWrite indicates the beginning of a writer critical section.
-//
-// SeqCount does not support concurrent writer critical sections; clients with
-// concurrent writers must synchronize them using e.g. sync.Mutex.
-func (s *SeqCount) BeginWrite() {
-	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 {
-		panic("SeqCount.BeginWrite during writer critical section")
-	}
-}
-
-// EndWrite ends the effect of a preceding BeginWrite.
-func (s *SeqCount) EndWrite() {
-	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 {
-		panic("SeqCount.EndWrite outside writer critical section")
-	}
-}
-
-// PointersInType returns a list of pointers reachable from values named
-// valName of the given type.
-//
-// PointersInType is not exhaustive, but it is guaranteed that if typ contains
-// at least one pointer, then PointersInTypeOf returns a non-empty list.
-func PointersInType(typ reflect.Type, valName string) []string {
-	switch kind := typ.Kind(); kind {
-	case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
-		return nil
-
-	case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer:
-		return []string{valName}
-
-	case reflect.Array:
-		return PointersInType(typ.Elem(), valName+"[]")
-
-	case reflect.Struct:
-		var ptrs []string
-		for i, n := 0, typ.NumField(); i < n; i++ {
-			field := typ.Field(i)
-			ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...)
-		}
-		return ptrs
-
-	default:
-		return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)}
-	}
-}
diff --git a/pkg/syncutil/seqcount_test.go b/pkg/syncutil/seqcount_test.go
deleted file mode 100644
index 14d6aedea..000000000
--- a/pkg/syncutil/seqcount_test.go
+++ /dev/null
@@ -1,153 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package syncutil
-
-import (
-	"reflect"
-	"testing"
-	"time"
-)
-
-func TestSeqCountWriteUncontended(t *testing.T) {
-	var seq SeqCount
-	seq.BeginWrite()
-	seq.EndWrite()
-}
-
-func TestSeqCountReadUncontended(t *testing.T) {
-	var seq SeqCount
-	epoch := seq.BeginRead()
-	if !seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got false, wanted true")
-	}
-}
-
-func TestSeqCountBeginReadAfterWrite(t *testing.T) {
-	var seq SeqCount
-	var data int32
-	const want = 1
-	seq.BeginWrite()
-	data = want
-	seq.EndWrite()
-	epoch := seq.BeginRead()
-	if data != want {
-		t.Errorf("Reader: got %v, wanted %v", data, want)
-	}
-	if !seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got false, wanted true")
-	}
-}
-
-func TestSeqCountBeginReadDuringWrite(t *testing.T) {
-	var seq SeqCount
-	var data int
-	const want = 1
-	seq.BeginWrite()
-	go func() {
-		time.Sleep(time.Second)
-		data = want
-		seq.EndWrite()
-	}()
-	epoch := seq.BeginRead()
-	if data != want {
-		t.Errorf("Reader: got %v, wanted %v", data, want)
-	}
-	if !seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got false, wanted true")
-	}
-}
-
-func TestSeqCountReadOkAfterWrite(t *testing.T) {
-	var seq SeqCount
-	epoch := seq.BeginRead()
-	seq.BeginWrite()
-	seq.EndWrite()
-	if seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got true, wanted false")
-	}
-}
-
-func TestSeqCountReadOkDuringWrite(t *testing.T) {
-	var seq SeqCount
-	epoch := seq.BeginRead()
-	seq.BeginWrite()
-	if seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got true, wanted false")
-	}
-	seq.EndWrite()
-}
-
-func BenchmarkSeqCountWriteUncontended(b *testing.B) {
-	var seq SeqCount
-	for i := 0; i < b.N; i++ {
-		seq.BeginWrite()
-		seq.EndWrite()
-	}
-}
-
-func BenchmarkSeqCountReadUncontended(b *testing.B) {
-	var seq SeqCount
-	b.RunParallel(func(pb *testing.PB) {
-		for pb.Next() {
-			epoch := seq.BeginRead()
-			if !seq.ReadOk(epoch) {
-				b.Fatalf("ReadOk: got false, wanted true")
-			}
-		}
-	})
-}
-
-func TestPointersInType(t *testing.T) {
-	for _, test := range []struct {
-		name string // used for both test and value name
-		val  interface{}
-		ptrs []string
-	}{
-		{
-			name: "EmptyStruct",
-			val:  struct{}{},
-		},
-		{
-			name: "Int",
-			val:  int(0),
-		},
-		{
-			name: "MixedStruct",
-			val: struct {
-				b             bool
-				I             int
-				ExportedPtr   *struct{}
-				unexportedPtr *struct{}
-				arr           [2]int
-				ptrArr        [2]*int
-				nestedStruct  struct {
-					nestedNonptr int
-					nestedPtr    *int
-				}
-				structArr [1]struct {
-					nonptr int
-					ptr    *int
-				}
-			}{},
-			ptrs: []string{
-				"MixedStruct.ExportedPtr",
-				"MixedStruct.unexportedPtr",
-				"MixedStruct.ptrArr[]",
-				"MixedStruct.nestedStruct.nestedPtr",
-				"MixedStruct.structArr[].ptr",
-			},
-		},
-	} {
-		t.Run(test.name, func(t *testing.T) {
-			typ := reflect.TypeOf(test.val)
-			ptrs := PointersInType(typ, test.name)
-			t.Logf("Found pointers: %v", ptrs)
-			if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) {
-				t.Errorf("Got %v, wanted %v", ptrs, test.ptrs)
-			}
-		})
-	}
-}
diff --git a/pkg/syncutil/syncutil.go b/pkg/syncutil/syncutil.go
deleted file mode 100644
index 66e750d06..000000000
--- a/pkg/syncutil/syncutil.go
+++ /dev/null
@@ -1,7 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package syncutil provides synchronization primitives.
-package syncutil
diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
index e07ebd153..db06d02c6 100644
--- a/pkg/tcpip/BUILD
+++ b/pkg/tcpip/BUILD
@@ -15,6 +15,7 @@ go_library(
     importpath = "gvisor.dev/gvisor/pkg/tcpip",
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/sync",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/iptables",
         "//pkg/waiter",
diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD
index 78df5a0b1..3df7d18d3 100644
--- a/pkg/tcpip/adapters/gonet/BUILD
+++ b/pkg/tcpip/adapters/gonet/BUILD
@@ -9,6 +9,7 @@ go_library(
     importpath = "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet",
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/stack",
diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go
index cd6ce930a..a2f44b496 100644
--- a/pkg/tcpip/adapters/gonet/gonet.go
+++ b/pkg/tcpip/adapters/gonet/gonet.go
@@ -20,9 +20,9 @@ import (
 	"errors"
 	"io"
 	"net"
-	"sync"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD
index 897c94821..66cc53ed4 100644
--- a/pkg/tcpip/link/fdbased/BUILD
+++ b/pkg/tcpip/link/fdbased/BUILD
@@ -16,6 +16,7 @@ go_library(
     importpath = "gvisor.dev/gvisor/pkg/tcpip/link/fdbased",
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index fa8a703d9..b7f60178e 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -41,10 +41,10 @@ package fdbased
 
 import (
 	"fmt"
-	"sync"
 	"syscall"
 
 	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD
index a4f9cdd69..09165dd4c 100644
--- a/pkg/tcpip/link/sharedmem/BUILD
+++ b/pkg/tcpip/link/sharedmem/BUILD
@@ -15,6 +15,7 @@ go_library(
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
@@ -31,6 +32,7 @@ go_test(
     ],
     embed = [":sharedmem"],
     deps = [
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD
index 6b5bc542c..a0d4ad0be 100644
--- a/pkg/tcpip/link/sharedmem/pipe/BUILD
+++ b/pkg/tcpip/link/sharedmem/pipe/BUILD
@@ -21,4 +21,5 @@ go_test(
         "pipe_test.go",
     ],
     embed = [":pipe"],
+    deps = ["//pkg/sync"],
 )
diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go
index 59ef69a8b..dc239a0d0 100644
--- a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go
+++ b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go
@@ -18,8 +18,9 @@ import (
 	"math/rand"
 	"reflect"
 	"runtime"
-	"sync"
 	"testing"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 func TestSimpleReadWrite(t *testing.T) {
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 080f9d667..655e537c4 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -23,11 +23,11 @@
 package sharedmem
 
 import (
-	"sync"
 	"sync/atomic"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 89603c48f..5c729a439 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -22,11 +22,11 @@ import (
 	"math/rand"
 	"os"
 	"strings"
-	"sync"
 	"syscall"
 	"testing"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD
index acf1e022c..ed16076fd 100644
--- a/pkg/tcpip/network/fragmentation/BUILD
+++ b/pkg/tcpip/network/fragmentation/BUILD
@@ -28,6 +28,7 @@ go_library(
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
     ],
diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go
index 6da5238ec..92f2aa13a 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation.go
@@ -19,9 +19,9 @@ package fragmentation
 import (
 	"fmt"
 	"log"
-	"sync"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 )
 
diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go
index 9e002e396..0a83d81f2 100644
--- a/pkg/tcpip/network/fragmentation/reassembler.go
+++ b/pkg/tcpip/network/fragmentation/reassembler.go
@@ -18,9 +18,9 @@ import (
 	"container/heap"
 	"fmt"
 	"math"
-	"sync"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 )
 
diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD
index e156b01f6..a6ef3bdcc 100644
--- a/pkg/tcpip/ports/BUILD
+++ b/pkg/tcpip/ports/BUILD
@@ -9,6 +9,7 @@ go_library(
     importpath = "gvisor.dev/gvisor/pkg/tcpip/ports",
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/sync",
         "//pkg/tcpip",
     ],
 )
diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go
index 6c5e19e8f..b937cb84b 100644
--- a/pkg/tcpip/ports/ports.go
+++ b/pkg/tcpip/ports/ports.go
@@ -18,9 +18,9 @@ package ports
 import (
 	"math"
 	"math/rand"
-	"sync"
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 826fca4de..6a8654105 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -36,6 +36,7 @@ go_library(
         "//pkg/ilist",
         "//pkg/rand",
         "//pkg/sleep",
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/hash/jenkins",
@@ -80,6 +81,7 @@ go_test(
     embed = [":stack"],
     deps = [
         "//pkg/sleep",
+        "//pkg/sync",
         "//pkg/tcpip",
     ],
 )
diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go
index 267df60d1..403557fd7 100644
--- a/pkg/tcpip/stack/linkaddrcache.go
+++ b/pkg/tcpip/stack/linkaddrcache.go
@@ -16,10 +16,10 @@ package stack
 
 import (
 	"fmt"
-	"sync"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go
index 9946b8fe8..1baa498d0 100644
--- a/pkg/tcpip/stack/linkaddrcache_test.go
+++ b/pkg/tcpip/stack/linkaddrcache_test.go
@@ -16,12 +16,12 @@ package stack
 
 import (
 	"fmt"
-	"sync"
 	"sync/atomic"
 	"testing"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 3810c6602..fe557ccbd 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -16,9 +16,9 @@ package stack
 
 import (
 	"strings"
-	"sync"
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 41bf9fd9b..a47ceba54 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -21,13 +21,13 @@ package stack
 
 import (
 	"encoding/binary"
-	"sync"
 	"sync/atomic"
 	"time"
 
 	"golang.org/x/time/rate"
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index 67c21be42..f384a91de 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -18,8 +18,8 @@ import (
 	"fmt"
 	"math/rand"
 	"sort"
-	"sync"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 72b5ce179..4a090ac86 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -35,10 +35,10 @@ import (
 	"reflect"
 	"strconv"
 	"strings"
-	"sync"
 	"sync/atomic"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/waiter"
diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD
index d8c5b5058..3aa23d529 100644
--- a/pkg/tcpip/transport/icmp/BUILD
+++ b/pkg/tcpip/transport/icmp/BUILD
@@ -28,6 +28,7 @@ go_library(
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/sleep",
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index c7ce74cdd..330786f4c 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -15,8 +15,7 @@
 package icmp
 
 import (
-	"sync"
-
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD
index 44b58ff6b..4858d150c 100644
--- a/pkg/tcpip/transport/packet/BUILD
+++ b/pkg/tcpip/transport/packet/BUILD
@@ -28,6 +28,7 @@ go_library(
     deps = [
         "//pkg/log",
         "//pkg/sleep",
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 07ffa8aba..fc5bc69fa 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -25,8 +25,7 @@
 package packet
 
 import (
-	"sync"
-
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD
index 00991ac8e..2f2131ff7 100644
--- a/pkg/tcpip/transport/raw/BUILD
+++ b/pkg/tcpip/transport/raw/BUILD
@@ -29,6 +29,7 @@ go_library(
     deps = [
         "//pkg/log",
         "//pkg/sleep",
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 85f7eb76b..ee9c4c58b 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -26,8 +26,7 @@
 package raw
 
 import (
-	"sync"
-
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 3b353d56c..353bd06f4 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -48,6 +48,7 @@ go_library(
         "//pkg/log",
         "//pkg/rand",
         "//pkg/sleep",
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/hash/jenkins",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 5422ae80c..1ea996936 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -19,11 +19,11 @@ import (
 	"encoding/binary"
 	"hash"
 	"io"
-	"sync"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index cdd69f360..613ec1775 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -16,11 +16,11 @@ package tcp
 
 import (
 	"encoding/binary"
-	"sync"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 830bc1e3e..cca511fb9 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -19,12 +19,12 @@ import (
 	"fmt"
 	"math"
 	"strings"
-	"sync"
 	"sync/atomic"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 7aa4c3f0e..4b8d867bc 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -16,9 +16,9 @@ package tcp
 
 import (
 	"fmt"
-	"sync"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index 4983bca81..7eb613be5 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -15,8 +15,7 @@
 package tcp
 
 import (
-	"sync"
-
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index bc718064c..9a8f64aa6 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -22,9 +22,9 @@ package tcp
 
 import (
 	"strings"
-	"sync"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go
index e0759225e..bd20a7ee9 100644
--- a/pkg/tcpip/transport/tcp/segment_queue.go
+++ b/pkg/tcpip/transport/tcp/segment_queue.go
@@ -15,7 +15,7 @@
 package tcp
 
 import (
-	"sync"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // segmentQueue is a bounded, thread-safe queue of TCP segments.
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 8a947dc66..79f2d274b 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -16,11 +16,11 @@ package tcp
 
 import (
 	"math"
-	"sync"
 	"sync/atomic"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index 97e4d5825..57ff123e3 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -30,6 +30,7 @@ go_library(
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/sleep",
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 864dc8733..a4ff29a7d 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -15,8 +15,7 @@
 package udp
 
 import (
-	"sync"
-
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD
index 6afdb29b7..07778e4f7 100644
--- a/pkg/tmutex/BUILD
+++ b/pkg/tmutex/BUILD
@@ -15,4 +15,5 @@ go_test(
     size = "medium",
     srcs = ["tmutex_test.go"],
     embed = [":tmutex"],
+    deps = ["//pkg/sync"],
 )
diff --git a/pkg/tmutex/tmutex_test.go b/pkg/tmutex/tmutex_test.go
index ce34c7962..05540696a 100644
--- a/pkg/tmutex/tmutex_test.go
+++ b/pkg/tmutex/tmutex_test.go
@@ -17,10 +17,11 @@ package tmutex
 import (
 	"fmt"
 	"runtime"
-	"sync"
 	"sync/atomic"
 	"testing"
 	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 func TestBasicLock(t *testing.T) {
diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD
index 8f6f180e5..d1885ae66 100644
--- a/pkg/unet/BUILD
+++ b/pkg/unet/BUILD
@@ -24,4 +24,5 @@ go_test(
         "unet_test.go",
     ],
     embed = [":unet"],
+    deps = ["//pkg/sync"],
 )
diff --git a/pkg/unet/unet_test.go b/pkg/unet/unet_test.go
index a3cc6f5d3..5c4b9e8e9 100644
--- a/pkg/unet/unet_test.go
+++ b/pkg/unet/unet_test.go
@@ -19,10 +19,11 @@ import (
 	"os"
 	"path/filepath"
 	"reflect"
-	"sync"
 	"syscall"
 	"testing"
 	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 func randomFilename() (string, error) {
diff --git a/pkg/urpc/BUILD b/pkg/urpc/BUILD
index b6bbb0ea2..b8fdc3125 100644
--- a/pkg/urpc/BUILD
+++ b/pkg/urpc/BUILD
@@ -11,6 +11,7 @@ go_library(
     deps = [
         "//pkg/fd",
         "//pkg/log",
+        "//pkg/sync",
         "//pkg/unet",
     ],
 )
diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go
index df59ffab1..13b2ea314 100644
--- a/pkg/urpc/urpc.go
+++ b/pkg/urpc/urpc.go
@@ -27,10 +27,10 @@ import (
 	"os"
 	"reflect"
 	"runtime"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 )
 
diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD
index 0427bc41f..1c6890e52 100644
--- a/pkg/waiter/BUILD
+++ b/pkg/waiter/BUILD
@@ -24,6 +24,7 @@ go_library(
     ],
     importpath = "gvisor.dev/gvisor/pkg/waiter",
     visibility = ["//visibility:public"],
+    deps = ["//pkg/sync"],
 )
 
 go_test(
diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go
index 8a65ed164..f708e95fa 100644
--- a/pkg/waiter/waiter.go
+++ b/pkg/waiter/waiter.go
@@ -58,7 +58,7 @@
 package waiter
 
 import (
-	"sync"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // EventMask represents io events as used in the poll() syscall.
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 6226b63f8..3e20f8f2f 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -74,6 +74,7 @@ go_library(
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
         "//pkg/sentry/watchdog",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/tcpip",
         "//pkg/tcpip/link/fdbased",
@@ -114,6 +115,7 @@ go_test(
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sync",
         "//pkg/unet",
         "//runsc/fsgofer",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 352e710d2..9c23b9553 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -17,7 +17,6 @@ package boot
 import (
 	"fmt"
 	"os"
-	"sync"
 	"syscall"
 
 	"github.com/golang/protobuf/proto"
@@ -27,6 +26,7 @@ import (
 	ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
 	"gvisor.dev/gvisor/pkg/sentry/strace"
 	spb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 func initCompatLogs(fd int) error {
diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go
index d1c0bb9b5..ce62236e5 100644
--- a/runsc/boot/limits.go
+++ b/runsc/boot/limits.go
@@ -16,12 +16,12 @@ package boot
 
 import (
 	"fmt"
-	"sync"
 	"syscall"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Mapping from linux resource names to limits.LimitType.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index bc1d0c1bb..fad72f4ab 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -20,7 +20,6 @@ import (
 	mrand "math/rand"
 	"os"
 	"runtime"
-	"sync"
 	"sync/atomic"
 	"syscall"
 	gtime "time"
@@ -46,6 +45,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 147ff7703..bec0dc292 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -19,7 +19,6 @@ import (
 	"math/rand"
 	"os"
 	"reflect"
-	"sync"
 	"syscall"
 	"testing"
 	"time"
@@ -30,6 +29,7 @@ import (
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/runsc/fsgofer"
 )
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index 250845ad7..b94bc4fa0 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -44,6 +44,7 @@ go_library(
         "//pkg/sentry/control",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sync",
         "//pkg/unet",
         "//pkg/urpc",
         "//runsc/boot",
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
index a4e3071b3..1815c93b9 100644
--- a/runsc/cmd/create.go
+++ b/runsc/cmd/create.go
@@ -16,6 +16,7 @@ package cmd
 
 import (
 	"context"
+
 	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/runsc/boot"
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 4831210c0..7df7995f0 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -21,7 +21,6 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
-	"sync"
 	"syscall"
 
 	"flag"
@@ -30,6 +29,7 @@ import (
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/fsgofer"
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
index de2115dff..5e9bc53ab 100644
--- a/runsc/cmd/start.go
+++ b/runsc/cmd/start.go
@@ -16,6 +16,7 @@ package cmd
 
 import (
 	"context"
+
 	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/runsc/boot"
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 2bd12120d..6dea179e4 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -18,6 +18,7 @@ go_library(
     deps = [
         "//pkg/log",
         "//pkg/sentry/control",
+        "//pkg/sync",
         "//runsc/boot",
         "//runsc/cgroup",
         "//runsc/sandbox",
@@ -53,6 +54,7 @@ go_test(
         "//pkg/sentry/control",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sync",
         "//pkg/unet",
         "//pkg/urpc",
         "//runsc/boot",
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 5ed131a7f..060b63bf3 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -20,7 +20,6 @@ import (
 	"io"
 	"os"
 	"path/filepath"
-	"sync"
 	"syscall"
 	"testing"
 	"time"
@@ -29,6 +28,7 @@ import (
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/testutil"
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index c10f85992..b54d8f712 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -26,7 +26,6 @@ import (
 	"reflect"
 	"strconv"
 	"strings"
-	"sync"
 	"syscall"
 	"testing"
 	"time"
@@ -39,6 +38,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
 	"gvisor.dev/gvisor/runsc/specutils"
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 4ad09ceab..2da93ec5b 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -22,7 +22,6 @@ import (
 	"path"
 	"path/filepath"
 	"strings"
-	"sync"
 	"syscall"
 	"testing"
 	"time"
@@ -30,6 +29,7 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/specutils"
 	"gvisor.dev/gvisor/runsc/testutil"
diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go
index d95151ea5..17a251530 100644
--- a/runsc/container/state_file.go
+++ b/runsc/container/state_file.go
@@ -20,10 +20,10 @@ import (
 	"io/ioutil"
 	"os"
 	"path/filepath"
-	"sync"
 
 	"github.com/gofrs/flock"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 const stateFileExtension = ".state"
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index afcb41801..a9582d92b 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -19,6 +19,7 @@ go_library(
         "//pkg/fd",
         "//pkg/log",
         "//pkg/p9",
+        "//pkg/sync",
         "//pkg/syserr",
         "//runsc/specutils",
         "@org_golang_x_sys//unix:go_default_library",
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index b59e1a70e..93606d051 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -29,7 +29,6 @@ import (
 	"path/filepath"
 	"runtime"
 	"strconv"
-	"sync"
 	"syscall"
 
 	"golang.org/x/sys/unix"
@@ -37,6 +36,7 @@ import (
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index 8001949d5..ddbc37456 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -19,6 +19,7 @@ go_library(
         "//pkg/log",
         "//pkg/sentry/control",
         "//pkg/sentry/platform",
+        "//pkg/sync",
         "//pkg/tcpip/header",
         "//pkg/tcpip/stack",
         "//pkg/urpc",
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index ce1452b87..ec72bdbfd 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -22,7 +22,6 @@ import (
 	"os"
 	"os/exec"
 	"strconv"
-	"sync"
 	"syscall"
 	"time"
 
@@ -34,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
index c96ca2eb6..3c3027cb5 100644
--- a/runsc/testutil/BUILD
+++ b/runsc/testutil/BUILD
@@ -10,6 +10,7 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
+        "//pkg/sync",
         "//runsc/boot",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index 9632776d2..fb22eae39 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -34,7 +34,6 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
-	"sync"
 	"sync/atomic"
 	"syscall"
 	"time"
@@ -42,6 +41,7 @@ import (
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
-- 
cgit v1.2.3


From ebd25099bfb9ac6af9739dd9a7795aff13f8e34a Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Fri, 10 Jan 2020 16:45:45 +0800
Subject: enable //test/syscalls:proc_test support on Arm64

Problems with different platform architectures have been solved.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 test/syscalls/linux/proc.cc | 70 +++++++++++++++++++++++++++++++--------------
 1 file changed, 48 insertions(+), 22 deletions(-)

diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 8cf08991b..66f89ef64 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -102,7 +102,55 @@ namespace {
 
 // O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
 // because "it isn't needed", even though Linux can return it via F_GETFL.
+#if defined(__x86_64__) || defined(__i386__)
 constexpr int kOLargeFile = 00100000;
+#elif __aarch64__
+// The value originate from the Linux
+// kernel's arch/arm64/include/uapi/asm/fcntl.h.
+constexpr int kOLargeFile = 00400000;
+#else
+#error "Unknown architecture"
+#endif
+
+#if defined(__x86_64__) || defined(__i386__)
+  // This list of "required" fields is taken from reading the file
+  // arch/x86/kernel/cpu/proc.c and seeing which fields will be unconditionally
+  // printed by the kernel.
+  static const char* required_fields[] = {
+      "processor",
+      "vendor_id",
+      "cpu family",
+      "model\t\t:",
+      "model name",
+      "stepping",
+      "cpu MHz",
+      "fpu\t\t:",
+      "fpu_exception",
+      "cpuid level",
+      "wp",
+      "bogomips",
+      "clflush size",
+      "cache_alignment",
+      "address sizes",
+      "power management",
+  };
+#elif __aarch64__
+  // This list of "required" fields is taken from reading the file
+  // arch/arm64/kernel/cpuinfo.c and seeing which fields will be unconditionally
+  // printed by the kernel.
+  static const char* required_fields[] = {
+      "processor",
+      "BogoMIPS",
+      "Features",
+      "CPU implementer",
+      "CPU architecture",
+      "CPU variant",
+      "CPU part",
+      "CPU revision",
+  };
+#else
+#error "Unknown architecture"
+#endif
 
 // Takes the subprocess command line and pid.
 // If it returns !OK, WithSubprocess returns immediately.
@@ -717,28 +765,6 @@ TEST(ProcCpuinfo, RequiredFieldsArePresent) {
   ASSERT_FALSE(proc_cpuinfo.empty());
   std::vector<std::string> cpuinfo_fields = absl::StrSplit(proc_cpuinfo, '\n');
 
-  // This list of "required" fields is taken from reading the file
-  // arch/x86/kernel/cpu/proc.c and seeing which fields will be unconditionally
-  // printed by the kernel.
-  static const char* required_fields[] = {
-      "processor",
-      "vendor_id",
-      "cpu family",
-      "model\t\t:",
-      "model name",
-      "stepping",
-      "cpu MHz",
-      "fpu\t\t:",
-      "fpu_exception",
-      "cpuid level",
-      "wp",
-      "bogomips",
-      "clflush size",
-      "cache_alignment",
-      "address sizes",
-      "power management",
-  };
-
   // Check that the usual fields are there. We don't really care about the
   // contents.
   for (const std::string& field : required_fields) {
-- 
cgit v1.2.3


From dacd349d6fb4fc7453b1fbf694158fd25496ed42 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Fri, 10 Jan 2020 06:01:10 -0800
Subject: panic fix in retransmitTimerExpired.

This is a band-aid fix for now to prevent panics.

PiperOrigin-RevId: 289078453
---
 pkg/tcpip/transport/tcp/snd.go | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 79f2d274b..fdff7ed81 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -442,6 +442,13 @@ func (s *sender) retransmitTimerExpired() bool {
 		return true
 	}
 
+	// TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
+	// when writeList is empty. Remove this once we have a proper fix for this
+	// issue.
+	if s.writeList.Front() == nil {
+		return true
+	}
+
 	s.ep.stack.Stats().TCP.Timeouts.Increment()
 	s.ep.stats.SendErrors.Timeouts.Increment()
 
-- 
cgit v1.2.3


From 9aeb053bbaf834aab5b716b8645996943262b525 Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Fri, 10 Jan 2020 09:05:25 -0800
Subject: Add tests for redirect port

Fix indentation and change function names.
---
 test/iptables/iptables_test.go | 10 +++++-----
 test/iptables/nat.go           | 45 +++++++++++++++++++++---------------------
 2 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index fce9247aa..05f27569f 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -178,14 +178,14 @@ func TestFilterInputDropDifferentUDPPort(t *testing.T) {
 	}
 }
 
-func TestFilterNATRedirectUDPPort(t *testing.T) {
-	if err := singleTest(FilterNATRedirectUDPPort{}); err != nil {
+func TestNATRedirectUDPPort(t *testing.T) {
+	if err := singleTest(NATRedirectUDPPort{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
-func TestFilterNATDropUDP(t *testing.T) {
-        if err := singleTest(FilterNATDropUDP{}); err != nil {
-	        t.Fatal(err)
+func TestNATDropUDP(t *testing.T) {
+	if err := singleTest(NATDropUDP{}); err != nil {
+		t.Fatal(err)
 	}
 }
diff --git a/test/iptables/nat.go b/test/iptables/nat.go
index 6deabf217..72c413af2 100644
--- a/test/iptables/nat.go
+++ b/test/iptables/nat.go
@@ -1,4 +1,4 @@
-// Copyright 2019 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,56 +15,55 @@
 package iptables
 
 import (
-        "fmt"
-        "net"
+	"fmt"
+	"net"
 )
 
 const (
-        redirectPort     = 42
+	redirectPort     = 42
 )
 
 func init() {
-        RegisterTestCase(FilterNATRedirectUDPPort{})
-	RegisterTestCase(FilterNATDropUDP{})
+	RegisterTestCase(NATRedirectUDPPort{})
+	RegisterTestCase(NATDropUDP{})
 }
 
-// FilterInputRedirectUDPPort tests that packets are redirected to different port.
-type FilterNATRedirectUDPPort struct{}
+// InputRedirectUDPPort tests that packets are redirected to different port.
+type NATRedirectUDPPort struct{}
 
 // Name implements TestCase.Name.
-func (FilterNATRedirectUDPPort) Name() string {
-        return "FilterNATRedirectUDPPort"
+func (NATRedirectUDPPort) Name() string {
+	return "NATRedirectUDPPort"
 }
 
 // ContainerAction implements TestCase.ContainerAction.
-func (FilterNATRedirectUDPPort) ContainerAction(ip net.IP) error {
-        if err := filterTable("-t", "nat", "-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports",
+func (NATRedirectUDPPort) ContainerAction(ip net.IP) error {
+	if err := filterTable("-t", "nat", "-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports",
 	fmt.Sprintf("%d", redirectPort)); err != nil {
 		return err
 	}
 
 	if err := listenUDP(redirectPort, sendloopDuration); err != nil {
-	        return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %v", redirectPort, err)
+		return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %v", redirectPort, err)
 	}
-
 	return nil
 }
 
 // LocalAction implements TestCase.LocalAction.
-func (FilterNATRedirectUDPPort) LocalAction(ip net.IP) error {
-        return sendUDPLoop(ip, acceptPort, sendloopDuration)
+func (NATRedirectUDPPort) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
 
-// FilterNATDropUDP tests that packets are not received in ports other than redirect port.
-type FilterNATDropUDP struct{}
+// NATDropUDP tests that packets are not received in ports other than redirect port.
+type NATDropUDP struct{}
 
 // Name implements TestCase.Name.
-func (FilterNATDropUDP) Name() string {
-        return "FilterNATDropUDP"
+func (NATDropUDP) Name() string {
+	return "NATDropUDP"
 }
 
 // ContainerAction implements TestCase.ContainerAction.
-func (FilterNATDropUDP) ContainerAction(ip net.IP) error {
+func (NATDropUDP) ContainerAction(ip net.IP) error {
         if err := filterTable("-t", "nat", "-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports",
 	fmt.Sprintf("%d", redirectPort)); err != nil {
 		return err
@@ -78,6 +77,6 @@ func (FilterNATDropUDP) ContainerAction(ip net.IP) error {
 }
 
 // LocalAction implements TestCase.LocalAction.
-func (FilterNATDropUDP) LocalAction(ip net.IP) error {
-        return sendUDPLoop(ip, acceptPort, sendloopDuration)
+func (NATDropUDP) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
-- 
cgit v1.2.3


From 6b83111499e9a8f42b6aa3998839922ba70eefdc Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Fri, 10 Jan 2020 13:33:12 -0800
Subject: goid: new package

Allows retrieving the goroutine ID for concurrency testing when the race
detector is enabled.

Updates #1472

PiperOrigin-RevId: 289155308
---
 pkg/goid/BUILD          | 26 +++++++++++++++++
 pkg/goid/empty_test.go  | 22 +++++++++++++++
 pkg/goid/goid.go        | 24 ++++++++++++++++
 pkg/goid/goid_amd64.s   | 21 ++++++++++++++
 pkg/goid/goid_race.go   | 25 +++++++++++++++++
 pkg/goid/goid_test.go   | 74 +++++++++++++++++++++++++++++++++++++++++++++++++
 pkg/goid/goid_unsafe.go | 64 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 256 insertions(+)
 create mode 100644 pkg/goid/BUILD
 create mode 100644 pkg/goid/empty_test.go
 create mode 100644 pkg/goid/goid.go
 create mode 100644 pkg/goid/goid_amd64.s
 create mode 100644 pkg/goid/goid_race.go
 create mode 100644 pkg/goid/goid_test.go
 create mode 100644 pkg/goid/goid_unsafe.go

diff --git a/pkg/goid/BUILD b/pkg/goid/BUILD
new file mode 100644
index 000000000..5d31e5366
--- /dev/null
+++ b/pkg/goid/BUILD
@@ -0,0 +1,26 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "goid",
+    srcs = [
+        "goid.go",
+        "goid_amd64.s",
+        "goid_race.go",
+        "goid_unsafe.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/goid",
+    visibility = ["//visibility:public"],
+)
+
+go_test(
+    name = "goid_test",
+    size = "small",
+    srcs = [
+        "empty_test.go",
+        "goid_test.go",
+    ],
+    embed = [":goid"],
+)
diff --git a/pkg/goid/empty_test.go b/pkg/goid/empty_test.go
new file mode 100644
index 000000000..c0a4b17ab
--- /dev/null
+++ b/pkg/goid/empty_test.go
@@ -0,0 +1,22 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !race
+
+package goid
+
+import "testing"
+
+// TestNothing exists to make the build system happy.
+func TestNothing(t *testing.T) {}
diff --git a/pkg/goid/goid.go b/pkg/goid/goid.go
new file mode 100644
index 000000000..39df30031
--- /dev/null
+++ b/pkg/goid/goid.go
@@ -0,0 +1,24 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !race
+
+// Package goid provides access to the ID of the current goroutine in
+// race/gotsan builds.
+package goid
+
+// Get returns the ID of the current goroutine.
+func Get() int64 {
+	panic("unimplemented for non-race builds")
+}
diff --git a/pkg/goid/goid_amd64.s b/pkg/goid/goid_amd64.s
new file mode 100644
index 000000000..d9f5cd2a3
--- /dev/null
+++ b/pkg/goid/goid_amd64.s
@@ -0,0 +1,21 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// func getg() *g
+TEXT ·getg(SB),NOSPLIT,$0-8
+	MOVQ (TLS), R14
+	MOVQ R14, ret+0(FP)
+	RET
diff --git a/pkg/goid/goid_race.go b/pkg/goid/goid_race.go
new file mode 100644
index 000000000..1766beaee
--- /dev/null
+++ b/pkg/goid/goid_race.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Only available in race/gotsan builds.
+// +build race
+
+// Package goid provides access to the ID of the current goroutine in
+// race/gotsan builds.
+package goid
+
+// Get returns the ID of the current goroutine.
+func Get() int64 {
+	return goid()
+}
diff --git a/pkg/goid/goid_test.go b/pkg/goid/goid_test.go
new file mode 100644
index 000000000..31970ce79
--- /dev/null
+++ b/pkg/goid/goid_test.go
@@ -0,0 +1,74 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package goid
+
+import (
+	"runtime"
+	"sync"
+	"testing"
+)
+
+func TestInitialGoID(t *testing.T) {
+	const max = 10000
+	if id := goid(); id < 0 || id > max {
+		t.Errorf("got goid = %d, want 0 < goid <= %d", id, max)
+	}
+}
+
+// TestGoIDSquence verifies that goid returns values which could plausibly be
+// goroutine IDs. If this test breaks or becomes flaky, the structs in
+// goid_unsafe.go may need to be updated.
+func TestGoIDSquence(t *testing.T) {
+	// Goroutine IDs are cached by each P.
+	runtime.GOMAXPROCS(1)
+
+	// Fill any holes in lower range.
+	for i := 0; i < 50; i++ {
+		var wg sync.WaitGroup
+		wg.Add(1)
+		go func() {
+			wg.Done()
+
+			// Leak the goroutine to prevent the ID from being
+			// reused.
+			select {}
+		}()
+		wg.Wait()
+	}
+
+	id := goid()
+	for i := 0; i < 100; i++ {
+		var (
+			newID int64
+			wg    sync.WaitGroup
+		)
+		wg.Add(1)
+		go func() {
+			newID = goid()
+			wg.Done()
+
+			// Leak the goroutine to prevent the ID from being
+			// reused.
+			select {}
+		}()
+		wg.Wait()
+		if max := id + 100; newID <= id || newID > max {
+			t.Errorf("unexpected goroutine ID pattern, got goid = %d, want %d < goid <= %d (previous = %d)", newID, id, max, id)
+		}
+		id = newID
+	}
+}
diff --git a/pkg/goid/goid_unsafe.go b/pkg/goid/goid_unsafe.go
new file mode 100644
index 000000000..ded8004dd
--- /dev/null
+++ b/pkg/goid/goid_unsafe.go
@@ -0,0 +1,64 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package goid
+
+// Structs from Go runtime. These may change in the future and require
+// updating. These structs are currently the same on both AMD64 and ARM64,
+// but may diverge in the future.
+
+type stack struct {
+	lo uintptr
+	hi uintptr
+}
+
+type gobuf struct {
+	sp   uintptr
+	pc   uintptr
+	g    uintptr
+	ctxt uintptr
+	ret  uint64
+	lr   uintptr
+	bp   uintptr
+}
+
+type g struct {
+	stack       stack
+	stackguard0 uintptr
+	stackguard1 uintptr
+
+	_panic       uintptr
+	_defer       uintptr
+	m            uintptr
+	sched        gobuf
+	syscallsp    uintptr
+	syscallpc    uintptr
+	stktopsp     uintptr
+	param        uintptr
+	atomicstatus uint32
+	stackLock    uint32
+	goid         int64
+
+	// More fields...
+	//
+	// We only use goid and the fields before it are only listed to
+	// calculate the correct offset.
+}
+
+func getg() *g
+
+// goid returns the ID of the current goroutine.
+func goid() int64 {
+	return getg().goid
+}
-- 
cgit v1.2.3


From d147e6d1b29d25607bcdcdb0beddb5122fea085e Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 10 Jan 2020 13:58:46 -0800
Subject: Cleaned up logs.

---
 pkg/tcpip/iptables/iptables.go | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 91abbbea8..8a72feb77 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -19,7 +19,6 @@ package iptables
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
@@ -136,18 +135,15 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 	// TODO(gvisor.dev/issue/170): A lot of this is uncomplicated because
 	// we're missing features. Jumps, the call stack, etc. aren't checked
 	// for yet because we're yet to support them.
-	log.Infof("kevin: iptables.IPTables: checking hook %v", hook)
 
 	// Go through each table containing the hook.
 	for _, tablename := range it.Priorities[hook] {
-		verdict := it.checkTable(hook, pkt, tablename)
-		switch verdict {
+		switch verdict := it.checkTable(hook, pkt, tablename); verdict {
 		// If the table returns Accept, move on to the next table.
 		case Accept:
 			continue
 		// The Drop verdict is final.
 		case Drop:
-			log.Infof("kevin: Packet dropped")
 			return false
 		case Stolen, Queue, Repeat, None, Jump, Return, Continue:
 			panic(fmt.Sprintf("Unimplemented verdict %v.", verdict))
@@ -155,21 +151,16 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 	}
 
 	// Every table returned Accept.
-	log.Infof("kevin: Packet accepted")
 	return true
 }
 
 func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) Verdict {
-	log.Infof("kevin: iptables.IPTables: checking table %q", tablename)
+	// Start from ruleIdx and walk the list of rules until a rule gives us
+	// a verdict.
 	table := it.Tables[tablename]
-	log.Infof("kevin: iptables.IPTables: table %+v", table)
-
-	// Start from ruleIdx and go down until a rule gives us a verdict.
 	for ruleIdx := table.BuiltinChains[hook]; ruleIdx < len(table.Rules); ruleIdx++ {
-		verdict := it.checkRule(hook, pkt, table, ruleIdx)
-		switch verdict {
-		// For either of these cases, this table is done with the
-		// packet.
+		switch verdict := it.checkRule(hook, pkt, table, ruleIdx); verdict {
+		// In either of these cases, this table is done with the packet.
 		case Accept, Drop:
 			return verdict
 		// Continue traversing the rules of the table.
-- 
cgit v1.2.3


From bcedf6a8e48b958e39ad7a7dba908354620a0d09 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 10 Jan 2020 14:30:33 -0800
Subject: Put CancellableTimer tests in the tcpip_test package

CancellableTimer tests were in a timer_test package but lived within the
tcpip directory. This caused issues with go tools.

PiperOrigin-RevId: 289166345
---
 pkg/tcpip/BUILD         | 2 +-
 pkg/tcpip/timer_test.go | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
index db06d02c6..ebc8d0209 100644
--- a/pkg/tcpip/BUILD
+++ b/pkg/tcpip/BUILD
@@ -30,7 +30,7 @@ go_test(
 )
 
 go_test(
-    name = "timer_test",
+    name = "tcpip_x_test",
     size = "small",
     srcs = ["timer_test.go"],
     deps = [":tcpip"],
diff --git a/pkg/tcpip/timer_test.go b/pkg/tcpip/timer_test.go
index 1f735d735..2d20f7ef3 100644
--- a/pkg/tcpip/timer_test.go
+++ b/pkg/tcpip/timer_test.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package timer_test
+package tcpip_test
 
 import (
 	"sync"
-- 
cgit v1.2.3


From d27208463e93c01d4e39c0450c3b27c00c466728 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 10 Jan 2020 14:47:08 -0800
Subject: Automated rollback of changelist 288990597

PiperOrigin-RevId: 289169518
---
 pkg/tcpip/stack/BUILD         | 23 ++----------
 pkg/tcpip/stack/ndp_test.go   | 87 ++++---------------------------------------
 pkg/tcpip/stack/stack_test.go | 74 +++++++++++++++++++++++++++++++++---
 3 files changed, 78 insertions(+), 106 deletions(-)

diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 6a8654105..705e984c1 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -53,6 +53,7 @@ go_test(
     name = "stack_x_test",
     size = "small",
     srcs = [
+        "ndp_test.go",
         "stack_test.go",
         "transport_demuxer_test.go",
         "transport_test.go",
@@ -62,12 +63,14 @@ go_test(
         "//pkg/rand",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
+        "//pkg/tcpip/checker",
         "//pkg/tcpip/header",
         "//pkg/tcpip/iptables",
         "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/loopback",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/transport/icmp",
         "//pkg/tcpip/transport/udp",
         "//pkg/waiter",
         "@com_github_google_go-cmp//cmp:go_default_library",
@@ -85,23 +88,3 @@ go_test(
         "//pkg/tcpip",
     ],
 )
-
-go_test(
-    name = "ndp_test",
-    size = "small",
-    srcs = ["ndp_test.go"],
-    deps = [
-        ":stack",
-        "//pkg/rand",
-        "//pkg/tcpip",
-        "//pkg/tcpip/buffer",
-        "//pkg/tcpip/checker",
-        "//pkg/tcpip/header",
-        "//pkg/tcpip/link/channel",
-        "//pkg/tcpip/network/ipv6",
-        "//pkg/tcpip/transport/icmp",
-        "//pkg/tcpip/transport/udp",
-        "//pkg/waiter",
-        "@com_github_google_go-cmp//cmp:go_default_library",
-    ],
-)
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 108762b6e..f9bc18c55 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package ndp_test
+package stack_test
 
 import (
 	"encoding/binary"
@@ -301,8 +301,6 @@ func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration s
 // Included in the subtests is a test to make sure that an invalid
 // RetransmitTimer (<1ms) values get fixed to the default RetransmitTimer of 1s.
 func TestDADResolve(t *testing.T) {
-	t.Parallel()
-
 	tests := []struct {
 		name                    string
 		dupAddrDetectTransmits  uint8
@@ -435,8 +433,6 @@ func TestDADResolve(t *testing.T) {
 // a node doing DAD for the same address), or if another node is detected to own
 // the address already (receive an NA message for the tentative address).
 func TestDADFail(t *testing.T) {
-	t.Parallel()
-
 	tests := []struct {
 		name    string
 		makeBuf func(tgt tcpip.Address) buffer.Prependable
@@ -580,8 +576,6 @@ func TestDADFail(t *testing.T) {
 // TestDADStop tests to make sure that the DAD process stops when an address is
 // removed.
 func TestDADStop(t *testing.T) {
-	t.Parallel()
-
 	ndpDisp := ndpDispatcher{
 		dadC: make(chan ndpDADEvent),
 	}
@@ -654,71 +648,6 @@ func TestDADStop(t *testing.T) {
 	}
 }
 
-// TestNICAutoGenAddrDoesDAD tests that the successful auto-generation of IPv6
-// link-local addresses will only be assigned after the DAD process resolves.
-func TestNICAutoGenAddrDoesDAD(t *testing.T) {
-	t.Parallel()
-
-	ndpDisp := ndpDispatcher{
-		dadC: make(chan ndpDADEvent),
-	}
-	ndpConfigs := stack.DefaultNDPConfigurations()
-	opts := stack.Options{
-		NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs:           ndpConfigs,
-		AutoGenIPv6LinkLocal: true,
-		NDPDisp:              &ndpDisp,
-	}
-
-	e := channel.New(0, 1280, linkAddr1)
-	s := stack.New(opts)
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(_) = %s", err)
-	}
-
-	// Address should not be considered bound to the
-	// NIC yet (DAD ongoing).
-	addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
-	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
-	}
-	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
-	}
-
-	linkLocalAddr := header.LinkLocalAddr(linkAddr1)
-
-	// Wait for DAD to resolve.
-	select {
-	case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
-		// We should get a resolution event after 1s (default time to
-		// resolve as per default NDP configurations). Waiting for that
-		// resolution time + an extra 1s without a resolution event
-		// means something is wrong.
-		t.Fatal("timed out waiting for DAD resolution")
-	case e := <-ndpDisp.dadC:
-		if e.err != nil {
-			t.Fatal("got DAD error: ", e.err)
-		}
-		if e.nicID != 1 {
-			t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
-		}
-		if e.addr != linkLocalAddr {
-			t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, linkLocalAddr)
-		}
-		if !e.resolved {
-			t.Fatal("got DAD event w/ resolved = false, want = true")
-		}
-	}
-	addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
-	if err != nil {
-		t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
-	}
-	if want := (tcpip.AddressWithPrefix{Address: linkLocalAddr, PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
-	}
-}
-
 // TestSetNDPConfigurationFailsForBadNICID tests to make sure we get an error if
 // we attempt to update NDP configurations using an invalid NICID.
 func TestSetNDPConfigurationFailsForBadNICID(t *testing.T) {
@@ -736,8 +665,6 @@ func TestSetNDPConfigurationFailsForBadNICID(t *testing.T) {
 // configurations without affecting the default NDP configurations or other
 // interfaces' configurations.
 func TestSetNDPConfigurations(t *testing.T) {
-	t.Parallel()
-
 	tests := []struct {
 		name                    string
 		dupAddrDetectTransmits  uint8
@@ -992,8 +919,6 @@ func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, on
 // TestNoRouterDiscovery tests that router discovery will not be performed if
 // configured not to.
 func TestNoRouterDiscovery(t *testing.T) {
-	t.Parallel()
-
 	// Being configured to discover routers means handle and
 	// discover are set to true and forwarding is set to false.
 	// This tests all possible combinations of the configurations,
@@ -1006,6 +931,8 @@ func TestNoRouterDiscovery(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverDefaultRouters(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
+			t.Parallel()
+
 			ndpDisp := ndpDispatcher{
 				routerC: make(chan ndpRouterEvent, 1),
 			}
@@ -1240,8 +1167,6 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
 // TestNoPrefixDiscovery tests that prefix discovery will not be performed if
 // configured not to.
 func TestNoPrefixDiscovery(t *testing.T) {
-	t.Parallel()
-
 	prefix := tcpip.AddressWithPrefix{
 		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
 		PrefixLen: 64,
@@ -1259,6 +1184,8 @@ func TestNoPrefixDiscovery(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverOnLinkPrefixes(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
+			t.Parallel()
+
 			ndpDisp := ndpDispatcher{
 				prefixC: make(chan ndpPrefixEvent, 1),
 			}
@@ -1615,8 +1542,6 @@ func contains(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool {
 
 // TestNoAutoGenAddr tests that SLAAC is not performed when configured not to.
 func TestNoAutoGenAddr(t *testing.T) {
-	t.Parallel()
-
 	prefix, _, _ := prefixSubnetAddr(0, "")
 
 	// Being configured to auto-generate addresses means handle and
@@ -1631,6 +1556,8 @@ func TestNoAutoGenAddr(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), AutoGenAddr(%t), Forwarding(%t)", handle, autogen, forwarding), func(t *testing.T) {
+			t.Parallel()
+
 			ndpDisp := ndpDispatcher{
 				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
 			}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index e8de4e87d..44e5229cc 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -24,6 +24,7 @@ import (
 	"sort"
 	"strings"
 	"testing"
+	"time"
 
 	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/rand"
@@ -49,8 +50,6 @@ const (
 	// where another value is explicitly used. It is chosen to match the MTU
 	// of loopback interfaces on linux systems.
 	defaultMTU = 65536
-
-	linkAddr = "\x02\x02\x03\x04\x05\x06"
 )
 
 // fakeNetworkEndpoint is a network-layer protocol endpoint. It counts sent and
@@ -1910,7 +1909,7 @@ func TestNICAutoGenAddr(t *testing.T) {
 		{
 			"Disabled",
 			false,
-			linkAddr,
+			linkAddr1,
 			stack.OpaqueInterfaceIdentifierOptions{
 				NICNameFromID: func(nicID tcpip.NICID, _ string) string {
 					return fmt.Sprintf("nic%d", nicID)
@@ -1921,7 +1920,7 @@ func TestNICAutoGenAddr(t *testing.T) {
 		{
 			"Enabled",
 			true,
-			linkAddr,
+			linkAddr1,
 			stack.OpaqueInterfaceIdentifierOptions{},
 			true,
 		},
@@ -2069,14 +2068,14 @@ func TestNICAutoGenAddrWithOpaque(t *testing.T) {
 			name:      "Disabled",
 			nicName:   "nic1",
 			autoGen:   false,
-			linkAddr:  linkAddr,
+			linkAddr:  linkAddr1,
 			secretKey: secretKey[:],
 		},
 		{
 			name:      "Enabled",
 			nicName:   "nic1",
 			autoGen:   true,
-			linkAddr:  linkAddr,
+			linkAddr:  linkAddr1,
 			secretKey: secretKey[:],
 		},
 		// These are all cases where we would not have generated a
@@ -2214,6 +2213,69 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) {
 	}
 }
 
+// TestNICAutoGenAddrDoesDAD tests that the successful auto-generation of IPv6
+// link-local addresses will only be assigned after the DAD process resolves.
+func TestNICAutoGenAddrDoesDAD(t *testing.T) {
+	ndpDisp := ndpDispatcher{
+		dadC: make(chan ndpDADEvent),
+	}
+	ndpConfigs := stack.DefaultNDPConfigurations()
+	opts := stack.Options{
+		NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs:           ndpConfigs,
+		AutoGenIPv6LinkLocal: true,
+		NDPDisp:              &ndpDisp,
+	}
+
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(opts)
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(_) = %s", err)
+	}
+
+	// Address should not be considered bound to the
+	// NIC yet (DAD ongoing).
+	addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+	}
+	if want := (tcpip.AddressWithPrefix{}); addr != want {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+	}
+
+	linkLocalAddr := header.LinkLocalAddr(linkAddr1)
+
+	// Wait for DAD to resolve.
+	select {
+	case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+		// We should get a resolution event after 1s (default time to
+		// resolve as per default NDP configurations). Waiting for that
+		// resolution time + an extra 1s without a resolution event
+		// means something is wrong.
+		t.Fatal("timed out waiting for DAD resolution")
+	case e := <-ndpDisp.dadC:
+		if e.err != nil {
+			t.Fatal("got DAD error: ", e.err)
+		}
+		if e.nicID != 1 {
+			t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
+		}
+		if e.addr != linkLocalAddr {
+			t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, linkLocalAddr)
+		}
+		if !e.resolved {
+			t.Fatal("got DAD event w/ resolved = false, want = true")
+		}
+	}
+	addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+	}
+	if want := (tcpip.AddressWithPrefix{Address: linkLocalAddr, PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
+	}
+}
+
 // TestNewPEB tests that a new PrimaryEndpointBehavior value (peb) is respected
 // when an address's kind gets "promoted" to permanent from permanentExpired.
 func TestNewPEBOnPromotionToPermanent(t *testing.T) {
-- 
cgit v1.2.3


From bf6429b944aed6de073c62ceb446cfaed5042dbc Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Fri, 10 Jan 2020 16:34:59 -0800
Subject: Don't set RWF_HIPRI on InvalidOffset test.

This test fails on ubuntu 18.04 because preadv2 for some reason returns
EOPNOTSUPP instead of EINVAL. Instead of root-causing the failure, I'm dropping
the flag in the preadv2 call since it isn't under test in this scenario.

PiperOrigin-RevId: 289188358
---
 test/syscalls/linux/preadv2.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/syscalls/linux/preadv2.cc b/test/syscalls/linux/preadv2.cc
index c9246367d..cd936ea90 100644
--- a/test/syscalls/linux/preadv2.cc
+++ b/test/syscalls/linux/preadv2.cc
@@ -202,7 +202,7 @@ TEST(Preadv2Test, TestInvalidOffset) {
   iov[0].iov_len = 0;
 
   EXPECT_THAT(preadv2(fd.get(), iov.get(), /*iovcnt=*/1, /*offset=*/-8,
-                      /*flags=*/RWF_HIPRI),
+                      /*flags=*/0),
               SyscallFailsWithErrno(EINVAL));
 }
 
-- 
cgit v1.2.3


From d793677cd424fef10ac0b080871d181db0bcdec0 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 10 Jan 2020 18:07:15 -0800
Subject: I think INPUT works with protocol

---
 pkg/sentry/socket/netfilter/netfilter.go |  3 ++-
 pkg/tcpip/iptables/BUILD                 |  1 +
 pkg/tcpip/iptables/iptables.go           |  4 +++-
 pkg/tcpip/iptables/types.go              |  2 +-
 pkg/tcpip/network/ipv4/ipv4.go           |  3 ++-
 pkg/tcpip/packet_buffer.go               | 25 ++++++++++++++++++++++++-
 6 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index f30461936..175466f19 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
@@ -455,7 +456,7 @@ func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, *syserr.Error)
 		return iptables.IPHeaderFilter{}, syserr.ErrInvalidArgument
 	}
 	return iptables.IPHeaderFilter{
-		Protocol: iptip.Protocol,
+		Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
 	}, nil
 }
 
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index 2893c80cd..297eaccaf 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -14,5 +14,6 @@ go_library(
     deps = [
         "//pkg/log",
         "//pkg/tcpip",
+        "//pkg/tcpip/header",
     ],
 )
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 4e1700fdb..3cff879a2 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -21,6 +21,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
 const (
@@ -183,12 +184,13 @@ func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename stri
 	panic(fmt.Sprintf("Traversed past the entire list of iptables rules in table %q.", tablename))
 }
 
+// Precondition: pk.NetworkHeader is set.
 func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) Verdict {
 	rule := table.Rules[ruleIdx]
 
 	// First check whether the packet matches the IP header filter.
 	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
-	if rule.Filter.Protocol != pkt.Protocol {
+	if rule.Filter.Protocol != header.IPv4(pkt.NetworkHeader).TransportProtocol() {
 		return Continue
 	}
 
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 4bedd9bc8..4f2a4d65e 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -173,7 +173,7 @@ type IPHeaderFilter struct {
 	InputInterface      string
 	OutputInterfaceMask string
 	InputInterfaceMask  string
-	Protocol            uint16
+	Protocol            tcpip.TransportProtocolNumber
 	Flags               uint8
 	InverseFlags        uint8
 }
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index f856081e6..5388d2549 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -353,7 +353,8 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 	}
 	pkt.NetworkHeader = headerView[:h.HeaderLength()]
 
-	// iptables filtering.
+	// iptables filtering. All packets that reach here are intended for
+	// this machine and will not be forwarded.
 	ipt := e.stack.IPTables()
 	if ok := ipt.Check(iptables.Input, pkt); !ok {
 		// iptables is telling us to drop the packet.
diff --git a/pkg/tcpip/packet_buffer.go b/pkg/tcpip/packet_buffer.go
index ab24372e7..7a036b93c 100644
--- a/pkg/tcpip/packet_buffer.go
+++ b/pkg/tcpip/packet_buffer.go
@@ -13,7 +13,9 @@
 
 package tcpip
 
-import "gvisor.dev/gvisor/pkg/tcpip/buffer"
+import (
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
 
 // A PacketBuffer contains all the data of a network packet.
 //
@@ -65,3 +67,24 @@ func (pk PacketBuffer) Clone() PacketBuffer {
 	pk.Data = pk.Data.Clone(nil)
 	return pk
 }
+
+//// TransportProtocol returns the transport protocol of pk.
+////
+//// Precondition: pk.NetworkHeader is set.
+//func (pk PacketBuffer) TransportProtocolIPv4() uint16 {
+//	if pk.NetworkHeader == nil {
+//		panic("This should only be called when pk.NetworkHeader is set.")
+//	}
+//	return header.IPv4(pk.NetworkHeader).TransportProtocol()
+//}
+
+// func (pk Packet) findNetHeader() header.IPv4 {
+// 	// Inbound:
+// 	// Data holds everything, but may have had some headers shaved off.
+// 	// Figure out whether it's set or still somewhere in data and return
+// 	// appropriately.
+
+// 	// Outbound:
+// 	// NetworkHeader will be set if we've added one. Otherwise there's no
+// 	// header.
+// }
-- 
cgit v1.2.3


From 98327a94cce7597589ac22b8557c5d9a2a03464d Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Mon, 13 Jan 2020 09:11:40 -0800
Subject: Add test for iptables TCP rule

Added tests for tcp protocol with input and output rules including options sport and dport
Increased timeout in iptables_test as TCP tests were timing out with existing value.
---
 test/iptables/BUILD            |  1 +
 test/iptables/filter_input.go  | 66 +++++++++++++++++++++++++++++++
 test/iptables/filter_output.go | 89 ++++++++++++++++++++++++++++++++++++++++++
 test/iptables/iptables_test.go | 26 +++++++++++-
 test/iptables/iptables_util.go | 55 ++++++++++++++++++++++++++
 5 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 test/iptables/filter_output.go

diff --git a/test/iptables/BUILD b/test/iptables/BUILD
index 68eed721e..372ba7abf 100644
--- a/test/iptables/BUILD
+++ b/test/iptables/BUILD
@@ -6,6 +6,7 @@ go_library(
     name = "iptables",
     srcs = [
         "filter_input.go",
+        "filter_output.go",
         "iptables.go",
         "iptables_util.go",
         "nat.go",
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 923f44e68..1c04601df 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -31,6 +31,8 @@ func init() {
 	RegisterTestCase(FilterInputDropUDP{})
 	RegisterTestCase(FilterInputDropUDPPort{})
 	RegisterTestCase(FilterInputDropDifferentUDPPort{})
+	RegisterTestCase(FilterInputDropTCPDestPort{})
+	RegisterTestCase(FilterInputDropTCPSrcPort{})
 }
 
 // FilterInputDropUDP tests that we can drop UDP traffic.
@@ -122,3 +124,67 @@ func (FilterInputDropDifferentUDPPort) ContainerAction(ip net.IP) error {
 func (FilterInputDropDifferentUDPPort) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
+
+// FilterInputDropTCP tests that connections are not accepted on specified source ports.
+type FilterInputDropTCPDestPort struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDropTCPDestPort) Name() string {
+	return "FilterInputDropTCPDestPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDropTCPDestPort) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-p", "tcp", "-m", "tcp", "--dport",
+	fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on drop port.
+	if err := listenTCP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("Connections on port %d should not be accepted, but got accepted", dropPort)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDropTCPDestPort) LocalAction(ip net.IP) error {
+	if err := connectTCP(ip, dropPort, acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("Connection destined to port %d should not be accepted, but got accepted", dropPort)
+	}
+
+	return nil
+}
+
+// FilterInputDropTCPSrcPort tests that connections are not accepted on specified source ports.
+type FilterInputDropTCPSrcPort struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDropTCPSrcPort) Name() string {
+	return "FilterInputDropTCPSrcPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDropTCPSrcPort) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-p", "tcp", "-m", "tcp", "--sport",
+	fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on accept port.
+	if err := listenTCP(acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connections destined to port %d should not be accepted, but got accepted", dropPort)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDropTCPSrcPort) LocalAction(ip net.IP) error {
+	if err := connectTCP(ip, acceptPort, dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection sent from port %d should not be accepted", dropPort)
+	}
+
+	return nil
+}
diff --git a/test/iptables/filter_output.go b/test/iptables/filter_output.go
new file mode 100644
index 000000000..63d74e4f4
--- /dev/null
+++ b/test/iptables/filter_output.go
@@ -0,0 +1,89 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+	"fmt"
+	"net"
+)
+
+func init() {
+	RegisterTestCase(FilterOutputDropTCPDestPort{})
+	RegisterTestCase(FilterOutputDropTCPSrcPort{})
+}
+
+// FilterOutputDropTCPDestPort tests that connections are not accepted on specified source ports.
+type FilterOutputDropTCPDestPort struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputDropTCPDestPort) Name() string {
+	return "FilterOutputDropTCPDestPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputDropTCPDestPort) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "tcp", "-m", "tcp", "--dport",
+	fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on accept port.
+	if err := listenTCP(acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connections destined to port %d should not be accepted, but got accepted", dropPort)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputDropTCPDestPort) LocalAction(ip net.IP) error {
+	if err := connectTCP(ip, acceptPort, dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection sent from port %d should not be accepted, but got accepted", dropPort)
+	}
+
+	return nil
+}
+
+// FilterOutputDropTCPSrcPort tests that connections are not accepted on specified source ports.
+type FilterOutputDropTCPSrcPort struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputDropTCPSrcPort) Name() string {
+	return "FilterOutputDropTCPSrcPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputDropTCPSrcPort) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "tcp", "-m", "tcp", "--sport",
+	fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on drop port.
+	if err := listenTCP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connections on port %d should not be accepted, but got accepted", dropPort)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputDropTCPSrcPort) LocalAction(ip net.IP) error {
+	if err := connectTCP(ip, dropPort, acceptPort, sendloopDuration); err == nil {
+                return fmt.Errorf("connection destined to port %d should not be accepted, but got accepted", dropPort)
+        }
+
+	return nil
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 05f27569f..3eeb75b8b 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -28,7 +28,7 @@ import (
 	"gvisor.dev/gvisor/runsc/testutil"
 )
 
-const timeout time.Duration = 10 * time.Second
+const timeout time.Duration = 18 * time.Second
 
 var image = flag.String("image", "bazel/test/iptables/runner:runner", "image to run tests in")
 
@@ -189,3 +189,27 @@ func TestNATDropUDP(t *testing.T) {
 		t.Fatal(err)
 	}
 }
+
+func TestFilterInputDropTCPDestPort(t *testing.T) {
+	if err := singleTest(FilterInputDropTCPDestPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDropTCPSrcPort(t *testing.T) {
+	if err := singleTest(FilterInputDropTCPSrcPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterOutputDropTCPDestPort(t *testing.T) {
+	if err := singleTest(FilterOutputDropTCPDestPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterOutputDropTCPSrcPort(t *testing.T) {
+	if err := singleTest(FilterOutputDropTCPSrcPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index 3a4d11f1a..44945bd89 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -80,3 +80,58 @@ func sendUDPLoop(ip net.IP, port int, duration time.Duration) error {
 
 	return nil
 }
+
+// listenTCP listens for connections on a TCP port
+func listenTCP(port int, timeout time.Duration) error {
+	localAddr := net.TCPAddr{
+		Port: port,
+	}
+
+	// Starts listening on port
+	lConn, err := net.ListenTCP("tcp4", &localAddr)
+	if err != nil {
+		return err
+	}
+	defer lConn.Close()
+
+	// Accept connections on port
+	lConn.SetDeadline(time.Now().Add(timeout))
+	conn, err := lConn.AcceptTCP()
+	if err == nil {
+		conn.Close()
+	}
+	return err
+}
+
+// connectTCP connects the TCP server over specified local port, server IP
+// and remote/server port
+func connectTCP(ip net.IP, remotePort int, localPort int, duration time.Duration) error {
+	remote := net.TCPAddr{
+		IP: ip,
+		Port: remotePort,
+	}
+
+	local := net.TCPAddr{
+		Port: localPort,
+	}
+
+	// Container may not be up. Retry DialTCP
+	// over a given duration
+	to := time.After(duration)
+	var res error
+	for timedOut := false; !timedOut; {
+		conn, err := net.DialTCP("tcp4", &local, &remote)
+		res = err
+		if res == nil {
+			conn.Close()
+			return nil
+		}
+		select{
+		case <-to:
+			timedOut = true
+		default:
+			time.Sleep(200 * time.Millisecond)
+		}
+	}
+        return res
+}
-- 
cgit v1.2.3


From f54b9c0ee6e02f9c8bf32aa268c9028ff741bf7c Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 13 Jan 2020 10:14:30 -0800
Subject: tests: fix errors detected by asan.

PiperOrigin-RevId: 289467083
---
 test/syscalls/linux/inotify.cc      | 4 ++--
 test/syscalls/linux/poll.cc         | 3 ++-
 test/syscalls/linux/readv_common.cc | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index 59ec9940a..fdef646eb 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -977,7 +977,7 @@ TEST(Inotify, WatchOnRelativePath) {
       ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
 
   // Change working directory to root.
-  const char* old_working_dir = get_current_dir_name();
+  const FileDescriptor cwd = ASSERT_NO_ERRNO_AND_VALUE(Open(".", O_PATH));
   EXPECT_THAT(chdir(root.path().c_str()), SyscallSucceeds());
 
   // Add a watch on file1 with a relative path.
@@ -997,7 +997,7 @@ TEST(Inotify, WatchOnRelativePath) {
   // continue to hold a reference, random save/restore tests can fail if a save
   // is triggered after "root" is unlinked; we can't save deleted fs objects
   // with active references.
-  EXPECT_THAT(chdir(old_working_dir), SyscallSucceeds());
+  EXPECT_THAT(fchdir(cwd.get()), SyscallSucceeds());
 }
 
 TEST(Inotify, ZeroLengthReadWriteDoesNotGenerateEvent) {
diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc
index 9e5aa7fd0..c42472474 100644
--- a/test/syscalls/linux/poll.cc
+++ b/test/syscalls/linux/poll.cc
@@ -275,7 +275,8 @@ TEST_F(PollTest, Nfds) {
   // Each entry in the 'fds' array refers to the eventfd and polls for
   // "writable" events (events=POLLOUT). This essentially guarantees that the
   // poll() is a no-op and allows negative testing of the 'nfds' parameter.
-  std::vector<struct pollfd> fds(max_fds, {.fd = efd.get(), .events = POLLOUT});
+  std::vector<struct pollfd> fds(max_fds + 1,
+                                 {.fd = efd.get(), .events = POLLOUT});
 
   // Verify that 'nfds' up to RLIMIT_NOFILE are allowed.
   EXPECT_THAT(RetryEINTR(poll)(fds.data(), 1, 1), SyscallSucceedsWithValue(1));
diff --git a/test/syscalls/linux/readv_common.cc b/test/syscalls/linux/readv_common.cc
index 491d5f40f..2694dc64f 100644
--- a/test/syscalls/linux/readv_common.cc
+++ b/test/syscalls/linux/readv_common.cc
@@ -154,7 +154,7 @@ void ReadBuffersOverlapping(int fd) {
   char* expected_ptr = expected.data();
   memcpy(expected_ptr, &kReadvTestData[overlap_bytes], overlap_bytes);
   memcpy(&expected_ptr[overlap_bytes], &kReadvTestData[overlap_bytes],
-         kReadvTestDataSize);
+         kReadvTestDataSize - overlap_bytes);
 
   struct iovec iovs[2];
   iovs[0].iov_base = buffer.data();
-- 
cgit v1.2.3


From fff04769518b279a364c928307a71055eaa6166d Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 13 Jan 2020 13:08:36 -0800
Subject: benchmarks/tcp: set a number of channels to GOMAXPROCS

Updates #231

PiperOrigin-RevId: 289502669
---
 benchmarks/tcp/tcp_benchmark.sh | 21 ++++++++++++++-
 benchmarks/tcp/tcp_proxy.go     | 58 ++++++++++++++++++++++-------------------
 2 files changed, 51 insertions(+), 28 deletions(-)

diff --git a/benchmarks/tcp/tcp_benchmark.sh b/benchmarks/tcp/tcp_benchmark.sh
index 69344c9c3..e65801a7b 100755
--- a/benchmarks/tcp/tcp_benchmark.sh
+++ b/benchmarks/tcp/tcp_benchmark.sh
@@ -41,6 +41,8 @@ duplicate=0.1           # 0.1% means duplicates are 1/10x as frequent as losses.
 duration=30             # 30s is enough time to consistent results (experimentally).
 helper_dir=$(dirname $0)
 netstack_opts=
+disable_linux_gso=
+num_client_threads=1
 
 # Check for netem support.
 lsmod_output=$(lsmod | grep sch_netem)
@@ -125,6 +127,13 @@ while [ $# -gt 0 ]; do
       shift
       netstack_opts="${netstack_opts} -memprofile=$1"
       ;;
+    --disable-linux-gso)
+      disable_linux_gso=1
+      ;;
+    --num-client-threads)
+      shift
+      num_client_threads=$1
+      ;;
     --helpers)
       shift
       [ "$#" -le 0 ] && echo "no helper dir provided" && exit 1
@@ -147,6 +156,8 @@ while [ $# -gt 0 ]; do
       echo " --loss                set the loss probability (%)"
       echo " --duplicate           set the duplicate probability (%)"
       echo " --helpers             set the helper directory"
+      echo " --num-client-threads  number of parallel client threads to run"
+      echo " --disable-linux-gso   disable segmentation offload in the Linux network stack"
       echo ""
       echo "The output will of the script will be:"
       echo "  <throughput> <client-cpu-usage> <server-cpu-usage>"
@@ -301,6 +312,14 @@ fi
 # Add client and server addresses, and bring everything up.
 ${nsjoin_binary} /tmp/client.netns ip addr add ${client_addr}/${mask} dev client.0
 ${nsjoin_binary} /tmp/server.netns ip addr add ${server_addr}/${mask} dev server.0
+if [ "${disable_linux_gso}" == "1" ]; then
+  ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 tso off
+  ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 gro off
+  ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 gso off
+  ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 tso off
+  ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 gso off
+  ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 gro off
+fi
 ${nsjoin_binary} /tmp/client.netns ip link set client.0 up
 ${nsjoin_binary} /tmp/client.netns ip link set lo up
 ${nsjoin_binary} /tmp/server.netns ip link set server.0 up
@@ -338,7 +357,7 @@ trap cleanup EXIT
 
 # Run the benchmark, recording the results file.
 while ${nsjoin_binary} /tmp/client.netns iperf \\
-    -p ${proxy_port} -c ${client_addr} -t ${duration} -f m 2>&1 \\
+    -p ${proxy_port} -c ${client_addr} -t ${duration} -f m -P ${num_client_threads} 2>&1 \\
     | tee \$results_file \\
     | grep "connect failed" >/dev/null; do
   sleep 0.1 # Wait for all services.
diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go
index 361a56755..be0d7bdd6 100644
--- a/benchmarks/tcp/tcp_proxy.go
+++ b/benchmarks/tcp/tcp_proxy.go
@@ -94,11 +94,11 @@ type netstackImpl struct {
 	mode string
 }
 
-func setupNetwork(ifaceName string) (fd int, err error) {
+func setupNetwork(ifaceName string, numChannels int) (fds []int, err error) {
 	// Get all interfaces in the namespace.
 	ifaces, err := net.Interfaces()
 	if err != nil {
-		return -1, fmt.Errorf("querying interfaces: %v", err)
+		return nil, fmt.Errorf("querying interfaces: %v", err)
 	}
 
 	for _, iface := range ifaces {
@@ -107,39 +107,43 @@ func setupNetwork(ifaceName string) (fd int, err error) {
 		}
 		// Create the socket.
 		const protocol = 0x0300 // htons(ETH_P_ALL)
-		fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
-		if err != nil {
-			return -1, fmt.Errorf("unable to create raw socket: %v", err)
-		}
+		fds := make([]int, numChannels)
+		for i := range fds {
+			fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
+			if err != nil {
+				return nil, fmt.Errorf("unable to create raw socket: %v", err)
+			}
 
-		// Bind to the appropriate device.
-		ll := syscall.SockaddrLinklayer{
-			Protocol: protocol,
-			Ifindex:  iface.Index,
-			Pkttype:  syscall.PACKET_HOST,
-		}
-		if err := syscall.Bind(fd, &ll); err != nil {
-			return -1, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
-		}
+			// Bind to the appropriate device.
+			ll := syscall.SockaddrLinklayer{
+				Protocol: protocol,
+				Ifindex:  iface.Index,
+				Pkttype:  syscall.PACKET_HOST,
+			}
+			if err := syscall.Bind(fd, &ll); err != nil {
+				return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
+			}
 
-		// RAW Sockets by default have a very small SO_RCVBUF of 256KB,
-		// up it to at least 1MB to reduce packet drops.
-		if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, rcvBufSize); err != nil {
-			return -1, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", rcvBufSize, err)
-		}
+			// RAW Sockets by default have a very small SO_RCVBUF of 256KB,
+			// up it to at least 1MB to reduce packet drops.
+			if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, rcvBufSize); err != nil {
+				return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", rcvBufSize, err)
+			}
 
-		if !*swgso && *gso != 0 {
-			if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
-				return -1, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
+			if !*swgso && *gso != 0 {
+				if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
+					return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
+				}
 			}
+			fds[i] = fd
 		}
-		return fd, nil
+		return fds, nil
 	}
-	return -1, fmt.Errorf("failed to find interface: %v", ifaceName)
+	return nil, fmt.Errorf("failed to find interface: %v", ifaceName)
 }
 
 func newNetstackImpl(mode string) (impl, error) {
-	fd, err := setupNetwork(*iface)
+	fds, err := setupNetwork(*iface, runtime.GOMAXPROCS(-1))
 	if err != nil {
 		return nil, err
 	}
@@ -177,7 +181,7 @@ func newNetstackImpl(mode string) (impl, error) {
 	mac[0] &^= 0x1 // Clear multicast bit.
 	mac[0] |= 0x2  // Set local assignment bit (IEEE802).
 	ep, err := fdbased.New(&fdbased.Options{
-		FDs:            []int{fd},
+		FDs:            fds,
 		MTU:            uint32(*mtu),
 		EthernetHeader: true,
 		Address:        tcpip.LinkAddress(mac),
-- 
cgit v1.2.3


From 36641a21953b72d64d4378d4974ef467e901a5fe Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 13 Jan 2020 14:14:49 -0800
Subject: Only allow INPUT modifications.

---
 pkg/sentry/socket/netfilter/netfilter.go | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 37f726295..507a77483 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -365,9 +365,22 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 	}
 
+	// TODO(gvisor.dev/issue/170): Support other chains.
+	// Since we only support modifying the INPUT chain right now, make sure
+	// all other chains point to ACCEPT rules.
+	for hook, ruleIdx := range table.BuiltinChains {
+		if hook != iptables.Input {
+			if _, ok := table.Rules[ruleIdx].Target.(iptables.UnconditionalAcceptTarget); !ok {
+				log.Warningf("Hook %d is unsupported.", hook)
+				return syserr.ErrInvalidArgument
+			}
+		}
+	}
+
 	// TODO(gvisor.dev/issue/170): Check the following conditions:
 	// - There are no loops.
 	// - There are no chains without an unconditional final rule.
+	// - There are no chains without an unconditional underflow rule.
 
 	ipt := stack.IPTables()
 	table.SetMetadata(metadata{
-- 
cgit v1.2.3


From debd213da61cf35d7c91346820e93fc87bfa5896 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Mon, 13 Jan 2020 14:45:31 -0800
Subject: Allow dual stack sockets to operate on AF_INET

Fixes #1490
Fixes #1495

PiperOrigin-RevId: 289523250
---
 pkg/sentry/socket/netstack/netstack.go      |  65 +++++++++---
 pkg/sentry/socket/unix/unix.go              |   5 +-
 pkg/sentry/strace/socket.go                 |   2 +-
 pkg/tcpip/stack/stack.go                    |  43 ++++++++
 pkg/tcpip/transport/icmp/endpoint.go        |  22 ++--
 pkg/tcpip/transport/tcp/endpoint.go         |  23 +---
 pkg/tcpip/transport/udp/endpoint.go         |  41 ++------
 scripts/common.sh                           |   2 +-
 test/syscalls/linux/BUILD                   |   1 +
 test/syscalls/linux/socket_inet_loopback.cc | 156 ++++++++++++++++++++++++++++
 10 files changed, 278 insertions(+), 82 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 099319327..c020c11cb 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -324,22 +324,15 @@ func bytesToIPAddress(addr []byte) tcpip.Address {
 // converts it to the FullAddress format. It supports AF_UNIX, AF_INET,
 // AF_INET6, and AF_PACKET addresses.
 //
-// strict indicates whether addresses with the AF_UNSPEC family are accepted of not.
-//
 // AddressAndFamily returns an address and its family.
-func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, uint16, *syserr.Error) {
+func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
 	// Make sure we have at least 2 bytes for the address family.
 	if len(addr) < 2 {
 		return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument
 	}
 
-	family := usermem.ByteOrder.Uint16(addr)
-	if family != uint16(sfamily) && (strict || family != linux.AF_UNSPEC) {
-		return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported
-	}
-
 	// Get the rest of the fields based on the address family.
-	switch family {
+	switch family := usermem.ByteOrder.Uint16(addr); family {
 	case linux.AF_UNIX:
 		path := addr[2:]
 		if len(path) > linux.UnixPathMax {
@@ -638,10 +631,40 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return r
 }
 
+func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error {
+	if family == uint16(s.family) {
+		return nil
+	}
+	if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 {
+		v, err := s.Endpoint.GetSockOptBool(tcpip.V6OnlyOption)
+		if err != nil {
+			return syserr.TranslateNetstackError(err)
+		}
+		if !v {
+			return nil
+		}
+	}
+	return syserr.ErrInvalidArgument
+}
+
+// mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the
+// receiver's family is AF_INET6.
+//
+// This is a hack to work around the fact that both IPv4 and IPv6 ANY are
+// represented by the empty string.
+//
+// TODO(gvisor.dev/issues/1556): remove this function.
+func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
+	if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
+		addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00"
+	}
+	return addr
+}
+
 // Connect implements the linux syscall connect(2) for sockets backed by
 // tpcip.Endpoint.
 func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
-	addr, family, err := AddressAndFamily(s.family, sockaddr, false /* strict */)
+	addr, family, err := AddressAndFamily(sockaddr)
 	if err != nil {
 		return err
 	}
@@ -653,6 +676,12 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 		}
 		return syserr.TranslateNetstackError(err)
 	}
+
+	if err := s.checkFamily(family, false /* exact */); err != nil {
+		return err
+	}
+	addr = s.mapFamily(addr, family)
+
 	// Always return right away in the non-blocking case.
 	if !blocking {
 		return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
@@ -681,10 +710,14 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
-	addr, _, err := AddressAndFamily(s.family, sockaddr, true /* strict */)
+	addr, family, err := AddressAndFamily(sockaddr)
 	if err != nil {
 		return err
 	}
+	if err := s.checkFamily(family, true /* exact */); err != nil {
+		return err
+	}
+	addr = s.mapFamily(addr, family)
 
 	// Issue the bind request to the endpoint.
 	return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
@@ -2080,8 +2113,8 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32)
 
 	case linux.AF_INET6:
 		var out linux.SockAddrInet6
-		if len(addr.Addr) == 4 {
-			// Copy address is v4-mapped format.
+		if len(addr.Addr) == header.IPv4AddressSize {
+			// Copy address in v4-mapped format.
 			copy(out.Addr[12:], addr.Addr)
 			out.Addr[10] = 0xff
 			out.Addr[11] = 0xff
@@ -2395,10 +2428,14 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 
 	var addr *tcpip.FullAddress
 	if len(to) > 0 {
-		addrBuf, _, err := AddressAndFamily(s.family, to, true /* strict */)
+		addrBuf, family, err := AddressAndFamily(to)
 		if err != nil {
 			return 0, err
 		}
+		if err := s.checkFamily(family, false /* exact */); err != nil {
+			return 0, err
+		}
+		addrBuf = s.mapFamily(addrBuf, family)
 
 		addr = &addrBuf
 	}
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 91effe89a..7f49ba864 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -116,13 +116,16 @@ func (s *SocketOperations) Endpoint() transport.Endpoint {
 
 // extractPath extracts and validates the address.
 func extractPath(sockaddr []byte) (string, *syserr.Error) {
-	addr, _, err := netstack.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */)
+	addr, family, err := netstack.AddressAndFamily(sockaddr)
 	if err != nil {
 		if err == syserr.ErrAddressFamilyNotSupported {
 			err = syserr.ErrInvalidArgument
 		}
 		return "", err
 	}
+	if family != linux.AF_UNIX {
+		return "", syserr.ErrInvalidArgument
+	}
 
 	// The address is trimmed by GetAddress.
 	p := string(addr.Addr)
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index 51f2efb39..b6d7177f4 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -341,7 +341,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string {
 
 	switch family {
 	case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX:
-		fa, _, err := netstack.AddressAndFamily(int(family), b, true /* strict */)
+		fa, _, err := netstack.AddressAndFamily(b)
 		if err != nil {
 			return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err)
 		}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index a47ceba54..113b457fb 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -547,6 +547,49 @@ type TransportEndpointInfo struct {
 	RegisterNICID tcpip.NICID
 }
 
+// AddrNetProto unwraps the specified address if it is a V4-mapped V6 address
+// and returns the network protocol number to be used to communicate with the
+// specified address. It returns an error if the passed address is incompatible
+// with the receiver.
+func (e *TransportEndpointInfo) AddrNetProto(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	netProto := e.NetProto
+	switch len(addr.Addr) {
+	case header.IPv4AddressSize:
+		netProto = header.IPv4ProtocolNumber
+	case header.IPv6AddressSize:
+		if header.IsV4MappedAddress(addr.Addr) {
+			netProto = header.IPv4ProtocolNumber
+			addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
+			if addr.Addr == header.IPv4Any {
+				addr.Addr = ""
+			}
+		}
+	}
+
+	switch len(e.ID.LocalAddress) {
+	case header.IPv4AddressSize:
+		if len(addr.Addr) == header.IPv6AddressSize {
+			return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState
+		}
+	case header.IPv6AddressSize:
+		if len(addr.Addr) == header.IPv4AddressSize {
+			return tcpip.FullAddress{}, 0, tcpip.ErrNetworkUnreachable
+		}
+	}
+
+	switch {
+	case netProto == e.NetProto:
+	case netProto == header.IPv4ProtocolNumber && e.NetProto == header.IPv6ProtocolNumber:
+		if v6only {
+			return tcpip.FullAddress{}, 0, tcpip.ErrNoRoute
+		}
+	default:
+		return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState
+	}
+
+	return addr, netProto, nil
+}
+
 // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
 // marker interface.
 func (*TransportEndpointInfo) IsEndpointInfo() {}
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 330786f4c..42afb3f5b 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -288,7 +288,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 
 		toCopy := *to
 		to = &toCopy
-		netProto, err := e.checkV4Mapped(to, true)
+		netProto, err := e.checkV4Mapped(to)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -475,18 +475,12 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	})
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	netProto := e.NetProto
-	if header.IsV4MappedAddress(addr.Addr) {
-		return 0, tcpip.ErrNoRoute
-	}
-
-	// Fail if we're bound to an address length different from the one we're
-	// checking.
-	if l := len(e.ID.LocalAddress); !allowMismatch && l != 0 && l != len(addr.Addr) {
-		return 0, tcpip.ErrInvalidEndpointState
+func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, false /* v6only */)
+	if err != nil {
+		return 0, err
 	}
-
+	*addr = unwrapped
 	return netProto, nil
 }
 
@@ -518,7 +512,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr, false)
+	netProto, err := e.checkV4Mapped(&addr)
 	if err != nil {
 		return err
 	}
@@ -631,7 +625,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr, false)
+	netProto, err := e.checkV4Mapped(&addr)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index cca511fb9..cc8b533c8 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1691,26 +1691,11 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	netProto := e.NetProto
-	if header.IsV4MappedAddress(addr.Addr) {
-		// Fail if using a v4 mapped address on a v6only endpoint.
-		if e.v6only {
-			return 0, tcpip.ErrNoRoute
-		}
-
-		netProto = header.IPv4ProtocolNumber
-		addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
-		if addr.Addr == header.IPv4Any {
-			addr.Addr = ""
-		}
-	}
-
-	// Fail if we're bound to an address length different from the one we're
-	// checking.
-	if l := len(e.ID.LocalAddress); l != 0 && len(addr.Addr) != 0 && l != len(addr.Addr) {
-		return 0, tcpip.ErrInvalidEndpointState
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+	if err != nil {
+		return 0, err
 	}
-
+	*addr = unwrapped
 	return netProto, nil
 }
 
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index a4ff29a7d..13446f5d9 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -402,7 +402,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 			return 0, nil, tcpip.ErrBroadcastDisabled
 		}
 
-		netProto, err := e.checkV4Mapped(to, false)
+		netProto, err := e.checkV4Mapped(to)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -501,7 +501,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		defer e.mu.Unlock()
 
 		fa := tcpip.FullAddress{Addr: v.InterfaceAddr}
-		netProto, err := e.checkV4Mapped(&fa, false)
+		netProto, err := e.checkV4Mapped(&fa)
 		if err != nil {
 			return err
 		}
@@ -839,35 +839,12 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 	return nil
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	netProto := e.NetProto
-	if len(addr.Addr) == 0 {
-		return netProto, nil
-	}
-	if header.IsV4MappedAddress(addr.Addr) {
-		// Fail if using a v4 mapped address on a v6only endpoint.
-		if e.v6only {
-			return 0, tcpip.ErrNoRoute
-		}
-
-		netProto = header.IPv4ProtocolNumber
-		addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
-		if addr.Addr == header.IPv4Any {
-			addr.Addr = ""
-		}
-
-		// Fail if we are bound to an IPv6 address.
-		if !allowMismatch && len(e.ID.LocalAddress) == 16 {
-			return 0, tcpip.ErrNetworkUnreachable
-		}
-	}
-
-	// Fail if we're bound to an address length different from the one we're
-	// checking.
-	if l := len(e.ID.LocalAddress); l != 0 && l != len(addr.Addr) {
-		return 0, tcpip.ErrInvalidEndpointState
+func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+	if err != nil {
+		return 0, err
 	}
-
+	*addr = unwrapped
 	return netProto, nil
 }
 
@@ -916,7 +893,7 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 
 // Connect connects the endpoint to its peer. Specifying a NIC is optional.
 func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
-	netProto, err := e.checkV4Mapped(&addr, false)
+	netProto, err := e.checkV4Mapped(&addr)
 	if err != nil {
 		return err
 	}
@@ -1074,7 +1051,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr, true)
+	netProto, err := e.checkV4Mapped(&addr)
 	if err != nil {
 		return err
 	}
diff --git a/scripts/common.sh b/scripts/common.sh
index 6dabad141..fdb1aa142 100755
--- a/scripts/common.sh
+++ b/scripts/common.sh
@@ -73,7 +73,7 @@ function install_runsc() {
   sudo "${RUNSC_BIN}" install --experimental=true --runtime="${runtime}" -- --debug-log "${RUNSC_LOGS}" "$@"
 
   # Clear old logs files that may exist.
-  sudo rm -f "${RUNSC_LOGS_DIR}"/*
+  sudo rm -f "${RUNSC_LOGS_DIR}"/'*'
 
   # Restart docker to pick up the new runtime configuration.
   sudo systemctl restart docker
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index ce8abe217..4c7ec3f06 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2693,6 +2693,7 @@ cc_binary(
     srcs = ["socket_inet_loopback.cc"],
     linkstatic = 1,
     deps = [
+        ":ip_socket_test_util",
         ":socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:posix_error",
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 138024d9e..5d114d460 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -32,6 +32,7 @@
 #include "absl/strings/str_cat.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
@@ -102,6 +103,161 @@ TEST(BadSocketPairArgs, ValidateErrForBadCallsToSocketPair) {
               SyscallFailsWithErrno(EAFNOSUPPORT));
 }
 
+enum class Operation {
+  Bind,
+  Connect,
+  SendTo,
+};
+
+std::string OperationToString(Operation operation) {
+  switch (operation) {
+    case Operation::Bind:
+      return "Bind";
+    case Operation::Connect:
+      return "Connect";
+    case Operation::SendTo:
+      return "SendTo";
+  }
+}
+
+using OperationSequence = std::vector<Operation>;
+
+using DualStackSocketTest =
+    ::testing::TestWithParam<std::tuple<TestAddress, OperationSequence>>;
+
+TEST_P(DualStackSocketTest, AddressOperations) {
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_DGRAM, 0));
+
+  const TestAddress& addr = std::get<0>(GetParam());
+  const OperationSequence& operations = std::get<1>(GetParam());
+
+  auto addr_in = reinterpret_cast<const sockaddr*>(&addr.addr);
+
+  // sockets may only be bound once. Both `connect` and `sendto` cause a socket
+  // to be bound.
+  bool bound = false;
+  for (const Operation& operation : operations) {
+    bool sockname = false;
+    bool peername = false;
+    switch (operation) {
+      case Operation::Bind: {
+        ASSERT_NO_ERRNO(SetAddrPort(
+            addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 0));
+
+        int bind_ret = bind(fd.get(), addr_in, addr.addr_len);
+
+        // Dual stack sockets may only be bound to AF_INET6.
+        if (!bound && addr.family() == AF_INET6) {
+          EXPECT_THAT(bind_ret, SyscallSucceeds());
+          bound = true;
+
+          sockname = true;
+        } else {
+          EXPECT_THAT(bind_ret, SyscallFailsWithErrno(EINVAL));
+        }
+        break;
+      }
+      case Operation::Connect: {
+        ASSERT_NO_ERRNO(SetAddrPort(
+            addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 1337));
+
+        EXPECT_THAT(connect(fd.get(), addr_in, addr.addr_len),
+                    SyscallSucceeds())
+            << GetAddrStr(addr_in);
+        bound = true;
+
+        sockname = true;
+        peername = true;
+
+        break;
+      }
+      case Operation::SendTo: {
+        const char payload[] = "hello";
+        ASSERT_NO_ERRNO(SetAddrPort(
+            addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 1337));
+
+        ssize_t sendto_ret = sendto(fd.get(), &payload, sizeof(payload), 0,
+                                    addr_in, addr.addr_len);
+
+        EXPECT_THAT(sendto_ret, SyscallSucceedsWithValue(sizeof(payload)));
+        sockname = !bound;
+        bound = true;
+        break;
+      }
+    }
+
+    if (sockname) {
+      sockaddr_storage sock_addr;
+      socklen_t addrlen = sizeof(sock_addr);
+      ASSERT_THAT(getsockname(fd.get(), reinterpret_cast<sockaddr*>(&sock_addr),
+                              &addrlen),
+                  SyscallSucceeds());
+      ASSERT_EQ(addrlen, sizeof(struct sockaddr_in6));
+
+      auto sock_addr_in6 = reinterpret_cast<const sockaddr_in6*>(&sock_addr);
+
+      if (operation == Operation::SendTo) {
+        EXPECT_EQ(sock_addr_in6->sin6_family, AF_INET6);
+        EXPECT_TRUE(IN6_IS_ADDR_UNSPECIFIED(sock_addr_in6->sin6_addr.s6_addr32))
+            << OperationToString(operation) << " getsocknam="
+            << GetAddrStr(reinterpret_cast<sockaddr*>(&sock_addr));
+
+        EXPECT_NE(sock_addr_in6->sin6_port, 0);
+      } else if (IN6_IS_ADDR_V4MAPPED(
+                     reinterpret_cast<const sockaddr_in6*>(addr_in)
+                         ->sin6_addr.s6_addr32)) {
+        EXPECT_TRUE(IN6_IS_ADDR_V4MAPPED(sock_addr_in6->sin6_addr.s6_addr32))
+            << OperationToString(operation) << " getsocknam="
+            << GetAddrStr(reinterpret_cast<sockaddr*>(&sock_addr));
+      }
+    }
+
+    if (peername) {
+      sockaddr_storage peer_addr;
+      socklen_t addrlen = sizeof(peer_addr);
+      ASSERT_THAT(getpeername(fd.get(), reinterpret_cast<sockaddr*>(&peer_addr),
+                              &addrlen),
+                  SyscallSucceeds());
+      ASSERT_EQ(addrlen, sizeof(struct sockaddr_in6));
+
+      if (addr.family() == AF_INET ||
+          IN6_IS_ADDR_V4MAPPED(reinterpret_cast<const sockaddr_in6*>(addr_in)
+                                   ->sin6_addr.s6_addr32)) {
+        EXPECT_TRUE(IN6_IS_ADDR_V4MAPPED(
+            reinterpret_cast<const sockaddr_in6*>(&peer_addr)
+                ->sin6_addr.s6_addr32))
+            << OperationToString(operation) << " getpeername="
+            << GetAddrStr(reinterpret_cast<sockaddr*>(&peer_addr));
+      }
+    }
+  }
+}
+
+// TODO(gvisor.dev/issues/1556): uncomment V4MappedAny.
+INSTANTIATE_TEST_SUITE_P(
+    All, DualStackSocketTest,
+    ::testing::Combine(
+        ::testing::Values(V4Any(), V4Loopback(), /*V4MappedAny(),*/
+                          V4MappedLoopback(), V6Any(), V6Loopback()),
+        ::testing::ValuesIn<OperationSequence>(
+            {{Operation::Bind, Operation::Connect, Operation::SendTo},
+             {Operation::Bind, Operation::SendTo, Operation::Connect},
+             {Operation::Connect, Operation::Bind, Operation::SendTo},
+             {Operation::Connect, Operation::SendTo, Operation::Bind},
+             {Operation::SendTo, Operation::Bind, Operation::Connect},
+             {Operation::SendTo, Operation::Connect, Operation::Bind}})),
+    [](::testing::TestParamInfo<
+        std::tuple<TestAddress, OperationSequence>> const& info) {
+      const TestAddress& addr = std::get<0>(info.param);
+      const OperationSequence& operations = std::get<1>(info.param);
+      std::string s = addr.description;
+      for (const Operation& operation : operations) {
+        absl::StrAppend(&s, OperationToString(operation));
+      }
+      return s;
+    });
+
 void tcpSimpleConnectTest(TestAddress const& listener,
                           TestAddress const& connector, bool unbound) {
   // Create the listening socket.
-- 
cgit v1.2.3


From 1c3d3c70b93d483894dd49fb444171347f0ca250 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 13 Jan 2020 14:54:32 -0800
Subject: Fix test building.

---
 pkg/tcpip/network/ip_test.go        | 21 ++++++++++++++-------
 pkg/tcpip/network/ipv6/icmp_test.go |  2 +-
 pkg/tcpip/network/ipv6/ndp_test.go  |  2 +-
 pkg/tcpip/stack/stack_test.go       |  2 +-
 pkg/tcpip/transport/udp/udp_test.go | 10 ++++++++--
 5 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index f1bc33adf..f4d78f8c6 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -212,10 +212,17 @@ func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
 	return s.FindRoute(1, local, remote, ipv6.ProtocolNumber, false /* multicastLoop */)
 }
 
+func buildDummyStack() *stack.Stack {
+	return stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol(), tcp.NewProtocol()},
+	})
+}
+
 func TestIPv4Send(t *testing.T) {
 	o := testObject{t: t, v4: true}
 	proto := ipv4.NewProtocol()
-	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, nil, &o)
+	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, nil, &o, buildDummyStack())
 	if err != nil {
 		t.Fatalf("NewEndpoint failed: %v", err)
 	}
@@ -250,7 +257,7 @@ func TestIPv4Send(t *testing.T) {
 func TestIPv4Receive(t *testing.T) {
 	o := testObject{t: t, v4: true}
 	proto := ipv4.NewProtocol()
-	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, &o, nil)
+	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, &o, nil, buildDummyStack())
 	if err != nil {
 		t.Fatalf("NewEndpoint failed: %v", err)
 	}
@@ -318,7 +325,7 @@ func TestIPv4ReceiveControl(t *testing.T) {
 		t.Run(c.name, func(t *testing.T) {
 			o := testObject{t: t}
 			proto := ipv4.NewProtocol()
-			ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, &o, nil)
+			ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, &o, nil, buildDummyStack())
 			if err != nil {
 				t.Fatalf("NewEndpoint failed: %v", err)
 			}
@@ -385,7 +392,7 @@ func TestIPv4ReceiveControl(t *testing.T) {
 func TestIPv4FragmentationReceive(t *testing.T) {
 	o := testObject{t: t, v4: true}
 	proto := ipv4.NewProtocol()
-	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, &o, nil)
+	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, &o, nil, buildDummyStack())
 	if err != nil {
 		t.Fatalf("NewEndpoint failed: %v", err)
 	}
@@ -456,7 +463,7 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 func TestIPv6Send(t *testing.T) {
 	o := testObject{t: t}
 	proto := ipv6.NewProtocol()
-	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv6Addr, localIpv6PrefixLen}, nil, nil, &o)
+	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv6Addr, localIpv6PrefixLen}, nil, nil, &o, buildDummyStack())
 	if err != nil {
 		t.Fatalf("NewEndpoint failed: %v", err)
 	}
@@ -491,7 +498,7 @@ func TestIPv6Send(t *testing.T) {
 func TestIPv6Receive(t *testing.T) {
 	o := testObject{t: t}
 	proto := ipv6.NewProtocol()
-	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv6Addr, localIpv6PrefixLen}, nil, &o, nil)
+	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv6Addr, localIpv6PrefixLen}, nil, &o, nil, buildDummyStack())
 	if err != nil {
 		t.Fatalf("NewEndpoint failed: %v", err)
 	}
@@ -568,7 +575,7 @@ func TestIPv6ReceiveControl(t *testing.T) {
 		t.Run(c.name, func(t *testing.T) {
 			o := testObject{t: t}
 			proto := ipv6.NewProtocol()
-			ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv6Addr, localIpv6PrefixLen}, nil, &o, nil)
+			ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv6Addr, localIpv6PrefixLen}, nil, &o, nil, buildDummyStack())
 			if err != nil {
 				t.Fatalf("NewEndpoint failed: %v", err)
 			}
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 335f634d5..a2fdc5dcd 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -109,7 +109,7 @@ func TestICMPCounts(t *testing.T) {
 	if netProto == nil {
 		t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
 	}
-	ep, err := netProto.NewEndpoint(0, tcpip.AddressWithPrefix{lladdr1, netProto.DefaultPrefixLen()}, &stubLinkAddressCache{}, &stubDispatcher{}, nil)
+	ep, err := netProto.NewEndpoint(0, tcpip.AddressWithPrefix{lladdr1, netProto.DefaultPrefixLen()}, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s)
 	if err != nil {
 		t.Fatalf("NewEndpoint(_) = _, %s, want = _, nil", err)
 	}
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index 0dbce14a0..fe895b376 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -62,7 +62,7 @@ func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address) (*stack
 		t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
 	}
 
-	ep, err := netProto.NewEndpoint(0, tcpip.AddressWithPrefix{rlladdr, netProto.DefaultPrefixLen()}, &stubLinkAddressCache{}, &stubDispatcher{}, nil)
+	ep, err := netProto.NewEndpoint(0, tcpip.AddressWithPrefix{rlladdr, netProto.DefaultPrefixLen()}, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s)
 	if err != nil {
 		t.Fatalf("NewEndpoint(_) = _, %s, want = _, nil", err)
 	}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 44e5229cc..cf41e02eb 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -200,7 +200,7 @@ func (*fakeNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Addres
 	return tcpip.Address(v[1:2]), tcpip.Address(v[0:1])
 }
 
-func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) {
+func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint, _ *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) {
 	return &fakeNetworkEndpoint{
 		nicID:      nicID,
 		id:         stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 0a82bc4fa..d33507156 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -1228,7 +1228,10 @@ func TestTTL(t *testing.T) {
 				} else {
 					p = ipv6.NewProtocol()
 				}
-				ep, err := p.NewEndpoint(0, tcpip.AddressWithPrefix{}, nil, nil, nil)
+				ep, err := p.NewEndpoint(0, tcpip.AddressWithPrefix{}, nil, nil, nil, stack.New(stack.Options{
+					NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+					TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				}))
 				if err != nil {
 					t.Fatal(err)
 				}
@@ -1261,7 +1264,10 @@ func TestSetTTL(t *testing.T) {
 					} else {
 						p = ipv6.NewProtocol()
 					}
-					ep, err := p.NewEndpoint(0, tcpip.AddressWithPrefix{}, nil, nil, nil)
+					ep, err := p.NewEndpoint(0, tcpip.AddressWithPrefix{}, nil, nil, nil, stack.New(stack.Options{
+						NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+						TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+					}))
 					if err != nil {
 						t.Fatal(err)
 					}
-- 
cgit v1.2.3


From bd292894097ffdf316bc78d81aebd0a2988124f3 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 13 Jan 2020 16:10:00 -0800
Subject: Protocol filtering works.

---
 pkg/tcpip/iptables/iptables.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 921546984..5c813d8a0 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -187,7 +187,7 @@ func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ru
 
 	// First check whether the packet matches the IP header filter.
 	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
-	if rule.Filter.Protocol != header.IPv4(pkt.NetworkHeader).TransportProtocol() {
+	if rule.Filter.Protocol != 0 && rule.Filter.Protocol != header.IPv4(pkt.NetworkHeader).TransportProtocol() {
 		return Continue
 	}
 
-- 
cgit v1.2.3


From 1ad8381eac108304f7b96162674624b34b95ec7b Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Mon, 13 Jan 2020 17:56:44 -0800
Subject: Do Source Address Selection when choosing an IPv6 source address

Do Source Address Selection when choosing an IPv6 source address as per RFC 6724
section 5 rules 1-3:
1) Prefer same address
2) Prefer appropriate scope
3) Avoid deprecated addresses.

A later change will update Source Address Selection to follow rules 4-8.

Tests:
Rule 1 & 2: stack.TestIPv6SourceAddressSelectionScopeAndSameAddress,
Rule 3:     stack.TestAutoGenAddrTimerDeprecation,
            stack.TestAutoGenAddrDeprecateFromPI
PiperOrigin-RevId: 289559373
---
 pkg/tcpip/header/ipv6.go      |  43 ++++++++++++
 pkg/tcpip/header/ipv6_test.go |  96 +++++++++++++++++++++++++-
 pkg/tcpip/stack/ndp_test.go   |  22 ++++--
 pkg/tcpip/stack/nic.go        | 115 ++++++++++++++++++++++++++++++--
 pkg/tcpip/stack/stack.go      |   8 +--
 pkg/tcpip/stack/stack_test.go | 152 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 420 insertions(+), 16 deletions(-)

diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index 135a60b12..83425c614 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -333,6 +333,17 @@ func IsV6LinkLocalAddress(addr tcpip.Address) bool {
 	return addr[0] == 0xfe && (addr[1]&0xc0) == 0x80
 }
 
+// IsV6UniqueLocalAddress determines if the provided address is an IPv6
+// unique-local address (within the prefix FC00::/7).
+func IsV6UniqueLocalAddress(addr tcpip.Address) bool {
+	if len(addr) != IPv6AddressSize {
+		return false
+	}
+	// According to RFC 4193 section 3.1, a unique local address has the prefix
+	// FC00::/7.
+	return (addr[0] & 0xfe) == 0xfc
+}
+
 // AppendOpaqueInterfaceIdentifier appends a 64 bit opaque interface identifier
 // (IID) to buf as outlined by RFC 7217 and returns the extended buffer.
 //
@@ -371,3 +382,35 @@ func LinkLocalAddrWithOpaqueIID(nicName string, dadCounter uint8, secretKey []by
 
 	return tcpip.Address(AppendOpaqueInterfaceIdentifier(lladdrb[:IIDOffsetInIPv6Address], IPv6LinkLocalPrefix.Subnet(), nicName, dadCounter, secretKey))
 }
+
+// IPv6AddressScope is the scope of an IPv6 address.
+type IPv6AddressScope int
+
+const (
+	// LinkLocalScope indicates a link-local address.
+	LinkLocalScope IPv6AddressScope = iota
+
+	// UniqueLocalScope indicates a unique-local address.
+	UniqueLocalScope
+
+	// GlobalScope indicates a global address.
+	GlobalScope
+)
+
+// ScopeForIPv6Address returns the scope for an IPv6 address.
+func ScopeForIPv6Address(addr tcpip.Address) (IPv6AddressScope, *tcpip.Error) {
+	if len(addr) != IPv6AddressSize {
+		return GlobalScope, tcpip.ErrBadAddress
+	}
+
+	switch {
+	case IsV6LinkLocalAddress(addr):
+		return LinkLocalScope, nil
+
+	case IsV6UniqueLocalAddress(addr):
+		return UniqueLocalScope, nil
+
+	default:
+		return GlobalScope, nil
+	}
+}
diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go
index 1994003ed..29f54bc57 100644
--- a/pkg/tcpip/header/ipv6_test.go
+++ b/pkg/tcpip/header/ipv6_test.go
@@ -25,7 +25,13 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
-const linkAddr = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+const (
+	linkAddr         = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+	linkLocalAddr    = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	uniqueLocalAddr1 = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	uniqueLocalAddr2 = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+	globalAddr       = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+)
 
 func TestEthernetAdddressToModifiedEUI64(t *testing.T) {
 	expectedIID := [header.IIDSize]byte{0, 2, 3, 255, 254, 4, 5, 6}
@@ -206,3 +212,91 @@ func TestLinkLocalAddrWithOpaqueIID(t *testing.T) {
 		})
 	}
 }
+
+func TestIsV6UniqueLocalAddress(t *testing.T) {
+	tests := []struct {
+		name     string
+		addr     tcpip.Address
+		expected bool
+	}{
+		{
+			name:     "Valid Unique 1",
+			addr:     uniqueLocalAddr1,
+			expected: true,
+		},
+		{
+			name:     "Valid Unique 2",
+			addr:     uniqueLocalAddr1,
+			expected: true,
+		},
+		{
+			name:     "Link Local",
+			addr:     linkLocalAddr,
+			expected: false,
+		},
+		{
+			name:     "Global",
+			addr:     globalAddr,
+			expected: false,
+		},
+		{
+			name:     "IPv4",
+			addr:     "\x01\x02\x03\x04",
+			expected: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if got := header.IsV6UniqueLocalAddress(test.addr); got != test.expected {
+				t.Errorf("got header.IsV6UniqueLocalAddress(%s) = %t, want = %t", test.addr, got, test.expected)
+			}
+		})
+	}
+}
+
+func TestScopeForIPv6Address(t *testing.T) {
+	tests := []struct {
+		name  string
+		addr  tcpip.Address
+		scope header.IPv6AddressScope
+		err   *tcpip.Error
+	}{
+		{
+			name:  "Unique Local",
+			addr:  uniqueLocalAddr1,
+			scope: header.UniqueLocalScope,
+			err:   nil,
+		},
+		{
+			name:  "Link Local",
+			addr:  linkLocalAddr,
+			scope: header.LinkLocalScope,
+			err:   nil,
+		},
+		{
+			name:  "Global",
+			addr:  globalAddr,
+			scope: header.GlobalScope,
+			err:   nil,
+		},
+		{
+			name:  "IPv4",
+			addr:  "\x01\x02\x03\x04",
+			scope: header.GlobalScope,
+			err:   tcpip.ErrBadAddress,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			got, err := header.ScopeForIPv6Address(test.addr)
+			if err != test.err {
+				t.Errorf("got header.IsV6UniqueLocalAddress(%s) = (_, %v), want = (_, %v)", test.addr, err, test.err)
+			}
+			if got != test.scope {
+				t.Errorf("got header.IsV6UniqueLocalAddress(%s) = (%d, _), want = (%d, _)", test.addr, got, test.scope)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index f9bc18c55..d390c6312 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -1732,9 +1732,11 @@ func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*nd
 	return ndpDisp, e, s
 }
 
-// addrForNewConnection returns the local address used when creating a new
-// connection.
-func addrForNewConnection(t *testing.T, s *stack.Stack) tcpip.Address {
+// addrForNewConnectionTo returns the local address used when creating a new
+// connection to addr.
+func addrForNewConnectionTo(t *testing.T, s *stack.Stack, addr tcpip.FullAddress) tcpip.Address {
+	t.Helper()
+
 	wq := waiter.Queue{}
 	we, ch := waiter.NewChannelEntry(nil)
 	wq.EventRegister(&we, waiter.EventIn)
@@ -1748,8 +1750,8 @@ func addrForNewConnection(t *testing.T, s *stack.Stack) tcpip.Address {
 	if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
 		t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
 	}
-	if err := ep.Connect(dstAddr); err != nil {
-		t.Fatalf("ep.Connect(%+v): %s", dstAddr, err)
+	if err := ep.Connect(addr); err != nil {
+		t.Fatalf("ep.Connect(%+v): %s", addr, err)
 	}
 	got, err := ep.GetLocalAddress()
 	if err != nil {
@@ -1758,9 +1760,19 @@ func addrForNewConnection(t *testing.T, s *stack.Stack) tcpip.Address {
 	return got.Addr
 }
 
+// addrForNewConnection returns the local address used when creating a new
+// connection.
+func addrForNewConnection(t *testing.T, s *stack.Stack) tcpip.Address {
+	t.Helper()
+
+	return addrForNewConnectionTo(t, s, dstAddr)
+}
+
 // addrForNewConnectionWithAddr returns the local address used when creating a
 // new connection with a specific local address.
 func addrForNewConnectionWithAddr(t *testing.T, s *stack.Stack, addr tcpip.FullAddress) tcpip.Address {
+	t.Helper()
+
 	wq := waiter.Queue{}
 	we, ch := waiter.NewChannelEntry(nil)
 	wq.EventRegister(&we, waiter.EventIn)
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index fe557ccbd..abf73fe33 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -15,6 +15,8 @@
 package stack
 
 import (
+	"log"
+	"sort"
 	"strings"
 	"sync/atomic"
 
@@ -251,13 +253,17 @@ func (n *NIC) setSpoofing(enable bool) {
 	n.mu.Unlock()
 }
 
-// primaryEndpoint returns the primary endpoint of n for the given network
-// protocol.
-//
 // primaryEndpoint will return the first non-deprecated endpoint if such an
-// endpoint exists. If no non-deprecated endpoint exists, the first deprecated
-// endpoint will be returned.
-func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedNetworkEndpoint {
+// endpoint exists for the given protocol and remoteAddr. If no non-deprecated
+// endpoint exists, the first deprecated endpoint will be returned.
+//
+// If an IPv6 primary endpoint is requested, Source Address Selection (as
+// defined by RFC 6724 section 5) will be performed.
+func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber, remoteAddr tcpip.Address) *referencedNetworkEndpoint {
+	if protocol == header.IPv6ProtocolNumber && remoteAddr != "" {
+		return n.primaryIPv6Endpoint(remoteAddr)
+	}
+
 	n.mu.RLock()
 	defer n.mu.RUnlock()
 
@@ -296,6 +302,103 @@ func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedN
 	return deprecatedEndpoint
 }
 
+// ipv6AddrCandidate is an IPv6 candidate for Source Address Selection (RFC
+// 6724 section 5).
+type ipv6AddrCandidate struct {
+	ref   *referencedNetworkEndpoint
+	scope header.IPv6AddressScope
+}
+
+// primaryIPv6Endpoint returns an IPv6 endpoint following Source Address
+// Selection (RFC 6724 section 5).
+//
+// Note, only rules 1-3 are followed.
+//
+// remoteAddr must be a valid IPv6 address.
+func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEndpoint {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	primaryAddrs := n.primary[header.IPv6ProtocolNumber]
+
+	if len(primaryAddrs) == 0 {
+		return nil
+	}
+
+	// Create a candidate set of available addresses we can potentially use as a
+	// source address.
+	cs := make([]ipv6AddrCandidate, 0, len(primaryAddrs))
+	for _, r := range primaryAddrs {
+		// If r is not valid for outgoing connections, it is not a valid endpoint.
+		if !r.isValidForOutgoing() {
+			continue
+		}
+
+		addr := r.ep.ID().LocalAddress
+		scope, err := header.ScopeForIPv6Address(addr)
+		if err != nil {
+			// Should never happen as we got r from the primary IPv6 endpoint list and
+			// ScopeForIPv6Address only returns an error if addr is not an IPv6
+			// address.
+			log.Fatalf("header.ScopeForIPv6Address(%s): %s", addr, err)
+		}
+
+		cs = append(cs, ipv6AddrCandidate{
+			ref:   r,
+			scope: scope,
+		})
+	}
+
+	remoteScope, err := header.ScopeForIPv6Address(remoteAddr)
+	if err != nil {
+		// primaryIPv6Endpoint should never be called with an invalid IPv6 address.
+		log.Fatalf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err)
+	}
+
+	// Sort the addresses as per RFC 6724 section 5 rules 1-3.
+	//
+	// TODO(b/146021396): Implement rules 4-8 of RFC 6724 section 5.
+	sort.Slice(cs, func(i, j int) bool {
+		sa := cs[i]
+		sb := cs[j]
+
+		// Prefer same address as per RFC 6724 section 5 rule 1.
+		if sa.ref.ep.ID().LocalAddress == remoteAddr {
+			return true
+		}
+		if sb.ref.ep.ID().LocalAddress == remoteAddr {
+			return false
+		}
+
+		// Prefer appropriate scope as per RFC 6724 section 5 rule 2.
+		if sa.scope < sb.scope {
+			return sa.scope >= remoteScope
+		} else if sb.scope < sa.scope {
+			return sb.scope < remoteScope
+		}
+
+		// Avoid deprecated addresses as per RFC 6724 section 5 rule 3.
+		if saDep, sbDep := sa.ref.deprecated, sb.ref.deprecated; saDep != sbDep {
+			// If sa is not deprecated, it is preferred over sb.
+			return sbDep
+		}
+
+		// sa and sb are equal, return the endpoint that is closest to the front of
+		// the primary endpoint list.
+		return i < j
+	})
+
+	// Return the most preferred address that can have its reference count
+	// incremented.
+	for _, c := range cs {
+		if r := c.ref; r.tryIncRef() {
+			return r
+		}
+	}
+
+	return nil
+}
+
 // hasPermanentAddrLocked returns true if n has a permanent (including currently
 // tentative) address, addr.
 func (n *NIC) hasPermanentAddrLocked(addr tcpip.Address) bool {
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 113b457fb..f8d89248e 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -1106,9 +1106,9 @@ func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocol
 	return nic.primaryAddress(protocol), nil
 }
 
-func (s *Stack) getRefEP(nic *NIC, localAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (ref *referencedNetworkEndpoint) {
+func (s *Stack) getRefEP(nic *NIC, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (ref *referencedNetworkEndpoint) {
 	if len(localAddr) == 0 {
-		return nic.primaryEndpoint(netProto)
+		return nic.primaryEndpoint(netProto, remoteAddr)
 	}
 	return nic.findEndpoint(netProto, localAddr, CanBePrimaryEndpoint)
 }
@@ -1124,7 +1124,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 	needRoute := !(isBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr))
 	if id != 0 && !needRoute {
 		if nic, ok := s.nics[id]; ok {
-			if ref := s.getRefEP(nic, localAddr, netProto); ref != nil {
+			if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
 				return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()), nil
 			}
 		}
@@ -1134,7 +1134,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 				continue
 			}
 			if nic, ok := s.nics[route.NIC]; ok {
-				if ref := s.getRefEP(nic, localAddr, netProto); ref != nil {
+				if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
 					if len(remoteAddr) == 0 {
 						// If no remote address was provided, then the route
 						// provided will refer to the link local address.
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 44e5229cc..4b3d18f1b 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -35,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 )
 
 const (
@@ -2411,3 +2412,154 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 		}
 	}
 }
+
+func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
+	const (
+		linkLocalAddr1   = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		linkLocalAddr2   = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		uniqueLocalAddr1 = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		uniqueLocalAddr2 = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		globalAddr1      = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		globalAddr2      = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		nicID            = 1
+	)
+
+	// Rule 3 is not tested here, and is instead tested by NDP's AutoGenAddr test.
+	tests := []struct {
+		name              string
+		nicAddrs          []tcpip.Address
+		connectAddr       tcpip.Address
+		expectedLocalAddr tcpip.Address
+	}{
+		// Test Rule 1 of RFC 6724 section 5.
+		{
+			name:              "Same Global most preferred (last address)",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:       globalAddr1,
+			expectedLocalAddr: globalAddr1,
+		},
+		{
+			name:              "Same Global most preferred (first address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, linkLocalAddr1, uniqueLocalAddr1},
+			connectAddr:       globalAddr1,
+			expectedLocalAddr: globalAddr1,
+		},
+		{
+			name:              "Same Link Local most preferred (last address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, uniqueLocalAddr1, linkLocalAddr1},
+			connectAddr:       linkLocalAddr1,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Same Link Local most preferred (first address)",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:       linkLocalAddr1,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Same Unique Local most preferred (last address)",
+			nicAddrs:          []tcpip.Address{uniqueLocalAddr1, globalAddr1, linkLocalAddr1},
+			connectAddr:       uniqueLocalAddr1,
+			expectedLocalAddr: uniqueLocalAddr1,
+		},
+		{
+			name:              "Same Unique Local most preferred (first address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, linkLocalAddr1, uniqueLocalAddr1},
+			connectAddr:       uniqueLocalAddr1,
+			expectedLocalAddr: uniqueLocalAddr1,
+		},
+
+		// Test Rule 2 of RFC 6724 section 5.
+		{
+			name:              "Global most preferred (last address)",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:       globalAddr2,
+			expectedLocalAddr: globalAddr1,
+		},
+		{
+			name:              "Global most preferred (first address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, linkLocalAddr1, uniqueLocalAddr1},
+			connectAddr:       globalAddr2,
+			expectedLocalAddr: globalAddr1,
+		},
+		{
+			name:              "Link Local most preferred (last address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, uniqueLocalAddr1, linkLocalAddr1},
+			connectAddr:       linkLocalAddr2,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Link Local most preferred (first address)",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:       linkLocalAddr2,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Unique Local most preferred (last address)",
+			nicAddrs:          []tcpip.Address{uniqueLocalAddr1, globalAddr1, linkLocalAddr1},
+			connectAddr:       uniqueLocalAddr2,
+			expectedLocalAddr: uniqueLocalAddr1,
+		},
+		{
+			name:              "Unique Local most preferred (first address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, linkLocalAddr1, uniqueLocalAddr1},
+			connectAddr:       uniqueLocalAddr2,
+			expectedLocalAddr: uniqueLocalAddr1,
+		},
+
+		// Test returning the endpoint that is closest to the front when
+		// candidate addresses are "equal" from the perspective of RFC 6724
+		// section 5.
+		{
+			name:              "Unique Local for Global",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, uniqueLocalAddr2},
+			connectAddr:       globalAddr2,
+			expectedLocalAddr: uniqueLocalAddr1,
+		},
+		{
+			name:              "Link Local for Global",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, linkLocalAddr2},
+			connectAddr:       globalAddr2,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Link Local for Unique Local",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, linkLocalAddr2},
+			connectAddr:       uniqueLocalAddr2,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			s.SetRouteTable([]tcpip.Route{{
+				Destination: header.IPv6EmptySubnet,
+				Gateway:     llAddr3,
+				NIC:         nicID,
+			}})
+			s.AddLinkAddress(nicID, llAddr3, linkAddr3)
+
+			for _, a := range test.nicAddrs {
+				if err := s.AddAddress(nicID, ipv6.ProtocolNumber, a); err != nil {
+					t.Errorf("s.AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, a, err)
+				}
+			}
+
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			if got := addrForNewConnectionTo(t, s, tcpip.FullAddress{Addr: test.connectAddr, NIC: nicID, Port: 1234}); got != test.expectedLocalAddr {
+				t.Errorf("got local address = %s, want = %s", got, test.expectedLocalAddr)
+			}
+		})
+	}
+}
-- 
cgit v1.2.3


From 50625cee59aaff834c7968771ab385ad0e7b0e1f Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Tue, 14 Jan 2020 13:31:52 -0800
Subject: Implement {g,s}etsockopt(IP_RECVTOS) for UDP sockets

PiperOrigin-RevId: 289718534
---
 pkg/sentry/socket/control/control.go         |  2 +-
 pkg/sentry/socket/netstack/netstack.go       | 36 +++++++++++++--
 pkg/tcpip/checker/checker.go                 | 16 +++++++
 pkg/tcpip/stack/nic.go                       |  2 +-
 pkg/tcpip/stack/stack.go                     |  2 +-
 pkg/tcpip/tcpip.go                           |  8 +++-
 pkg/tcpip/transport/udp/endpoint.go          | 40 +++++++++++++++--
 pkg/tcpip/transport/udp/udp_test.go          | 67 ++++++++++++++++++++++++----
 test/syscalls/linux/socket_ip_udp_generic.cc | 40 +++++++++++++++++
 test/syscalls/linux/udp_socket_test_cases.cc |  8 ++--
 10 files changed, 197 insertions(+), 24 deletions(-)

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 4301b697c..1684dfc24 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -327,7 +327,7 @@ func PackInq(t *kernel.Task, inq int32, buf []byte) []byte {
 }
 
 // PackTOS packs an IP_TOS socket control message.
-func PackTOS(t *kernel.Task, tos int8, buf []byte) []byte {
+func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
 	return putCmsgStruct(
 		buf,
 		linux.SOL_IP,
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index c020c11cb..d2f7e987d 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1268,11 +1268,11 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		var o uint32
+		var o int32
 		if v {
 			o = 1
 		}
-		return int32(o), nil
+		return o, nil
 
 	case linux.IPV6_PATHMTU:
 		t.Kernel().EmitUnimplementedEvent(t)
@@ -1377,6 +1377,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		}
 		return int32(v), nil
 
+	case linux.IP_RECVTOS:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveTOSOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+		var o int32
+		if v {
+			o = 1
+		}
+		return o, nil
+
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1895,6 +1910,13 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v)))
 
+	case linux.IP_RECVTOS:
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0))
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
@@ -1915,7 +1937,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		linux.IP_RECVFRAGSIZE,
 		linux.IP_RECVOPTS,
 		linux.IP_RECVORIGDSTADDR,
-		linux.IP_RECVTOS,
 		linux.IP_RECVTTL,
 		linux.IP_RETOPTS,
 		linux.IP_TRANSPARENT,
@@ -2335,7 +2356,14 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 }
 
 func (s *SocketOperations) controlMessages() socket.ControlMessages {
-	return socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, Timestamp: s.readCM.Timestamp}}
+	return socket.ControlMessages{
+		IP: tcpip.ControlMessages{
+			HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp,
+			Timestamp:    s.readCM.Timestamp,
+			HasTOS:       s.readCM.HasTOS,
+			TOS:          s.readCM.TOS,
+		},
+	}
 }
 
 // updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 2f15bf1f1..542abc99d 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -33,6 +33,9 @@ type NetworkChecker func(*testing.T, []header.Network)
 // TransportChecker is a function to check a property of a transport packet.
 type TransportChecker func(*testing.T, header.Transport)
 
+// ControlMessagesChecker is a function to check a property of ancillary data.
+type ControlMessagesChecker func(*testing.T, tcpip.ControlMessages)
+
 // IPv4 checks the validity and properties of the given IPv4 packet. It is
 // expected to be used in conjunction with other network checkers for specific
 // properties. For example, to check the source and destination address, one
@@ -158,6 +161,19 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	}
 }
 
+// ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
+func ReceiveTOS(want uint8) ControlMessagesChecker {
+	return func(t *testing.T, cm tcpip.ControlMessages) {
+		t.Helper()
+		if !cm.HasTOS {
+			t.Fatalf("got cm.HasTOS = %t, want cm.TOS = %d", cm.HasTOS, want)
+		}
+		if got := cm.TOS; got != want {
+			t.Fatalf("got cm.TOS = %d, want %d", got, want)
+		}
+	}
+}
+
 // TOS creates a checker that checks the TOS field.
 func TOS(tos uint8, label uint32) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index abf73fe33..071221d5a 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -763,7 +763,7 @@ func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) {
 	n.mu.Unlock()
 }
 
-// Subnets returns the Subnets associated with this NIC.
+// AddressRanges returns the Subnets associated with this NIC.
 func (n *NIC) AddressRanges() []tcpip.Subnet {
 	n.mu.RLock()
 	defer n.mu.RUnlock()
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index f8d89248e..386eb6eec 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -912,7 +912,7 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	return false
 }
 
-// NICSubnets returns a map of NICIDs to their associated subnets.
+// NICAddressRanges returns a map of NICIDs to their associated subnets.
 func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 4a090ac86..b7813cbc0 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -322,7 +322,7 @@ type ControlMessages struct {
 	HasTOS bool
 
 	// TOS is the IPv4 type of service of the associated packet.
-	TOS int8
+	TOS uint8
 
 	// HasTClass indicates whether Tclass is valid/set.
 	HasTClass bool
@@ -500,9 +500,13 @@ type WriteOptions struct {
 type SockOptBool int
 
 const (
+	// ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
+	// ancillary message is passed with incoming packets.
+	ReceiveTOSOption SockOptBool = iota
+
 	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
 	// socket is to be restricted to sending and receiving IPv6 packets only.
-	V6OnlyOption SockOptBool = iota
+	V6OnlyOption
 )
 
 // SockOptInt represents socket options which values have the int type.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 13446f5d9..c9cbed8f4 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -31,6 +31,7 @@ type udpPacket struct {
 	senderAddress tcpip.FullAddress
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
+	tos           uint8
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -113,6 +114,10 @@ type endpoint struct {
 	// applied while sending packets. Defaults to 0 as on Linux.
 	sendTOS uint8
 
+	// receiveTOS determines if the incoming IPv4 TOS header field is passed
+	// as ancillary data to ControlMessages on Read.
+	receiveTOS bool
+
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -243,7 +248,18 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 		*addr = p.senderAddress
 	}
 
-	return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil
+	cm := tcpip.ControlMessages{
+		HasTimestamp: true,
+		Timestamp:    p.timestamp,
+	}
+	e.mu.RLock()
+	receiveTOS := e.receiveTOS
+	e.mu.RUnlock()
+	if receiveTOS {
+		cm.HasTOS = true
+		cm.TOS = p.tos
+	}
+	return p.data.ToView(), cm, nil
 }
 
 // prepareForWrite prepares the endpoint for sending data. In particular, it
@@ -458,6 +474,12 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
 func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 	switch opt {
+	case tcpip.ReceiveTOSOption:
+		e.mu.Lock()
+		e.receiveTOS = v
+		e.mu.Unlock()
+		return nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -664,15 +686,21 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
 func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	switch opt {
+	case tcpip.ReceiveTOSOption:
+		e.mu.RLock()
+		v := e.receiveTOS
+		e.mu.RUnlock()
+		return v, nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
 			return false, tcpip.ErrUnknownProtocolOption
 		}
 
-		e.mu.Lock()
+		e.mu.RLock()
 		v := e.v6only
-		e.mu.Unlock()
+		e.mu.RUnlock()
 
 		return v, nil
 	}
@@ -1215,6 +1243,12 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	e.rcvList.PushBack(packet)
 	e.rcvBufSize += pkt.Data.Size()
 
+	// Save any useful information from the network header to the packet.
+	switch r.NetProto {
+	case header.IPv4ProtocolNumber:
+		packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
+	}
+
 	packet.timestamp = e.stack.NowNanoseconds()
 
 	e.rcvMu.Unlock()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 0a82bc4fa..ee9d10555 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -56,6 +56,7 @@ const (
 	multicastAddr   = "\xe8\x2b\xd3\xea"
 	multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
 	broadcastAddr   = header.IPv4Broadcast
+	testTOS         = 0x80
 
 	// defaultMTU is the MTU, in bytes, used throughout the tests, except
 	// where another value is explicitly used. It is chosen to match the MTU
@@ -453,6 +454,7 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
 	ip := header.IPv4(buf)
 	ip.Encode(&header.IPv4Fields{
 		IHL:         header.IPv4MinimumSize,
+		TOS:         testTOS,
 		TotalLength: uint16(len(buf)),
 		TTL:         65,
 		Protocol:    uint8(udp.ProtocolNumber),
@@ -552,8 +554,8 @@ func TestBindToDeviceOption(t *testing.T) {
 // testReadInternal sends a packet of the given test flow into the stack by
 // injecting it into the link endpoint. It then attempts to read it from the
 // UDP endpoint and depending on if this was expected to succeed verifies its
-// correctness.
-func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool) {
+// correctness including any additional checker functions provided.
+func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool, checkers ...checker.ControlMessagesChecker) {
 	c.t.Helper()
 
 	payload := newPayload()
@@ -568,12 +570,12 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
 
 	var addr tcpip.FullAddress
-	v, _, err := c.ep.Read(&addr)
+	v, cm, err := c.ep.Read(&addr)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for data to become available.
 		select {
 		case <-ch:
-			v, _, err = c.ep.Read(&addr)
+			v, cm, err = c.ep.Read(&addr)
 
 		case <-time.After(300 * time.Millisecond):
 			if packetShouldBeDropped {
@@ -606,15 +608,21 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	if !bytes.Equal(payload, v) {
 		c.t.Fatalf("bad payload: got %x, want %x", v, payload)
 	}
+
+	// Run any checkers against the ControlMessages.
+	for _, f := range checkers {
+		f(c.t, cm)
+	}
+
 	c.checkEndpointReadStats(1, epstats, err)
 }
 
 // testRead sends a packet of the given test flow into the stack by injecting it
 // into the link endpoint. It then reads it from the UDP endpoint and verifies
-// its correctness.
-func testRead(c *testContext, flow testFlow) {
+// its correctness including any additional checker functions provided.
+func testRead(c *testContext, flow testFlow, checkers ...checker.ControlMessagesChecker) {
 	c.t.Helper()
-	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */)
+	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */, checkers...)
 }
 
 // testFailingRead sends a packet of the given test flow into the stack by
@@ -1282,7 +1290,7 @@ func TestTOSV4(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = 0xC0
+			const tos = testTOS
 			var v tcpip.IPv4TOSOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1317,7 +1325,7 @@ func TestTOSV6(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = 0xC0
+			const tos = testTOS
 			var v tcpip.IPv6TrafficClassOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1344,6 +1352,47 @@ func TestTOSV6(t *testing.T) {
 	}
 }
 
+func TestReceiveTOSV4(t *testing.T) {
+	for _, flow := range []testFlow{unicastV4, broadcast} {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			c := newDualTestContext(t, defaultMTU)
+			defer c.cleanup()
+
+			c.createEndpointForFlow(flow)
+
+			// Verify that setting and reading the option works.
+			v, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
+			if err != nil {
+				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
+			}
+			// Test for expected default value.
+			if v != false {
+				c.t.Errorf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", v, false)
+			}
+
+			want := true
+			if err := c.ep.SetSockOptBool(tcpip.ReceiveTOSOption, want); err != nil {
+				c.t.Fatalf("SetSockOptBool(tcpip.ReceiveTOSOption, %t) failed: %s", want, err)
+			}
+
+			got, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
+			if err != nil {
+				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
+			}
+			if got != want {
+				c.t.Fatalf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", got, want)
+			}
+
+			// Verify that the correct received TOS is handed through as
+			// ancillary data to the ControlMessages struct.
+			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+				c.t.Fatal("Bind failed:", err)
+			}
+			testRead(c, flow, checker.ReceiveTOS(testTOS))
+		})
+	}
+}
+
 func TestMulticastInterfaceOption(t *testing.T) {
 	for _, flow := range []testFlow{multicastV4, multicastV4in6, multicastV6, multicastV6Only} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 66eb68857..53290bed7 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -209,6 +209,46 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
+// Ensure that Receiving TOS is off by default.
+TEST_P(UDPSocketPairTest, RecvTosDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that setting and getting IP_RECVTOS works as expected.
+TEST_P(UDPSocketPairTest, SetRecvTos) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
+                         &kSockOptOff, sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+}
+
 TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index dc35c2f50..68e0a8109 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,8 +1349,9 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
-  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
+          !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1421,7 +1422,8 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From a611fdaee3c14abe2222140ae0a8a742ebfd31ab Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 14 Jan 2020 14:14:17 -0800
Subject: Changes TCP packet dispatch to use a pool of goroutines.

All inbound segments for connections in ESTABLISHED state are delivered to the
endpoint's queue but for every segment delivered we also queue the endpoint for
processing to a selected processor. This ensures that when there are a large
number of connections in ESTABLISHED state the inbound packets are all handled
by a small number of goroutines and significantly reduces the amount of work the
goscheduler has to perform.

We let connections in other states follow the current path where the
endpoint's goroutine directly handles the segments.

Updates #231

PiperOrigin-RevId: 289728325
---
 benchmarks/tcp/tcp_proxy.go                 |   6 +-
 pkg/sleep/sleep_test.go                     |  31 +++
 pkg/tcpip/stack/transport_demuxer.go        |  54 ++++-
 pkg/tcpip/transport/tcp/BUILD               |  15 +-
 pkg/tcpip/transport/tcp/accept.go           |   9 +-
 pkg/tcpip/transport/tcp/connect.go          | 310 ++++++++++++++++------------
 pkg/tcpip/transport/tcp/dispatcher.go       | 218 +++++++++++++++++++
 pkg/tcpip/transport/tcp/endpoint.go         | 303 ++++++++++++++++++---------
 pkg/tcpip/transport/tcp/endpoint_state.go   |  30 +--
 pkg/tcpip/transport/tcp/protocol.go         |  11 +
 pkg/tcpip/transport/tcp/rcv.go              |  21 +-
 pkg/tcpip/transport/tcp/snd.go              |  14 +-
 pkg/tcpip/transport/tcp/tcp_test.go         |  11 +-
 test/syscalls/linux/socket_inet_loopback.cc |   2 +-
 test/syscalls/linux/tcp_socket.cc           |  14 ++
 15 files changed, 769 insertions(+), 280 deletions(-)
 create mode 100644 pkg/tcpip/transport/tcp/dispatcher.go

diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go
index be0d7bdd6..dc96add66 100644
--- a/benchmarks/tcp/tcp_proxy.go
+++ b/benchmarks/tcp/tcp_proxy.go
@@ -85,7 +85,7 @@ func (netImpl) printStats() {
 
 const (
 	nicID      = 1       // Fixed.
-	rcvBufSize = 1 << 20 // 1MB.
+	rcvBufSize = 4 << 20 // 1MB.
 )
 
 type netstackImpl struct {
@@ -130,6 +130,10 @@ func setupNetwork(ifaceName string, numChannels int) (fds []int, err error) {
 				return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", rcvBufSize, err)
 			}
 
+			if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, rcvBufSize); err != nil {
+				return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", rcvBufSize, err)
+			}
+
 			if !*swgso && *gso != 0 {
 				if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
 					return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
diff --git a/pkg/sleep/sleep_test.go b/pkg/sleep/sleep_test.go
index 130806c86..af47e2ba1 100644
--- a/pkg/sleep/sleep_test.go
+++ b/pkg/sleep/sleep_test.go
@@ -376,6 +376,37 @@ func TestRace(t *testing.T) {
 	}
 }
 
+// TestRaceInOrder tests that multiple wakers can continuously send wake requests to
+// the sleeper and that the wakers are retrieved in the order asserted.
+func TestRaceInOrder(t *testing.T) {
+	const wakers = 100
+	const wakeRequests = 10000
+
+	w := make([]Waker, wakers)
+	s := Sleeper{}
+
+	// Associate each waker and start goroutines that will assert them.
+	for i := range w {
+		s.AddWaker(&w[i], i)
+	}
+	go func() {
+		n := 0
+		for n < wakeRequests {
+			wk := w[n%len(w)]
+			wk.Assert()
+			n++
+		}
+	}()
+
+	// Wait for all wake up notifications from all wakers.
+	for i := 0; i < wakeRequests; i++ {
+		v, _ := s.Fetch(true)
+		if got, want := v, i%wakers; got != want {
+			t.Fatalf("got  %d want %d", got, want)
+		}
+	}
+}
+
 // BenchmarkSleeperMultiSelect measures how long it takes to fetch a wake up
 // from 4 wakers when at least one is already asserted.
 func BenchmarkSleeperMultiSelect(b *testing.B) {
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index f384a91de..d686e6eb8 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -104,7 +104,14 @@ func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, p
 		return
 	}
 	// multiPortEndpoints are guaranteed to have at least one element.
-	selectEndpoint(id, mpep, epsByNic.seed).HandlePacket(r, id, pkt)
+	transEP := selectEndpoint(id, mpep, epsByNic.seed)
+	if queuedProtocol, mustQueue := mpep.demux.queuedProtocols[protocolIDs{mpep.netProto, mpep.transProto}]; mustQueue {
+		queuedProtocol.QueuePacket(r, transEP, id, pkt)
+		epsByNic.mu.RUnlock()
+		return
+	}
+
+	transEP.HandlePacket(r, id, pkt)
 	epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
 }
 
@@ -130,7 +137,7 @@ func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpoint
 
 // registerEndpoint returns true if it succeeds. It fails and returns
 // false if ep already has an element with the same key.
-func (epsByNic *endpointsByNic) registerEndpoint(t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+func (epsByNic *endpointsByNic) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
 	epsByNic.mu.Lock()
 	defer epsByNic.mu.Unlock()
 
@@ -140,7 +147,7 @@ func (epsByNic *endpointsByNic) registerEndpoint(t TransportEndpoint, reusePort
 	}
 
 	// This is a new binding.
-	multiPortEp := &multiPortEndpoint{}
+	multiPortEp := &multiPortEndpoint{demux: d, netProto: netProto, transProto: transProto}
 	multiPortEp.endpointsMap = make(map[TransportEndpoint]int)
 	multiPortEp.reuse = reusePort
 	epsByNic.endpoints[bindToDevice] = multiPortEp
@@ -168,18 +175,34 @@ func (epsByNic *endpointsByNic) unregisterEndpoint(bindToDevice tcpip.NICID, t T
 // newTransportDemuxer.
 type transportDemuxer struct {
 	// protocol is immutable.
-	protocol map[protocolIDs]*transportEndpoints
+	protocol        map[protocolIDs]*transportEndpoints
+	queuedProtocols map[protocolIDs]queuedTransportProtocol
+}
+
+// queuedTransportProtocol if supported by a protocol implementation will cause
+// the dispatcher to delivery packets to the QueuePacket method instead of
+// calling HandlePacket directly on the endpoint.
+type queuedTransportProtocol interface {
+	QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt tcpip.PacketBuffer)
 }
 
 func newTransportDemuxer(stack *Stack) *transportDemuxer {
-	d := &transportDemuxer{protocol: make(map[protocolIDs]*transportEndpoints)}
+	d := &transportDemuxer{
+		protocol:        make(map[protocolIDs]*transportEndpoints),
+		queuedProtocols: make(map[protocolIDs]queuedTransportProtocol),
+	}
 
 	// Add each network and transport pair to the demuxer.
 	for netProto := range stack.networkProtocols {
 		for proto := range stack.transportProtocols {
-			d.protocol[protocolIDs{netProto, proto}] = &transportEndpoints{
+			protoIDs := protocolIDs{netProto, proto}
+			d.protocol[protoIDs] = &transportEndpoints{
 				endpoints: make(map[TransportEndpointID]*endpointsByNic),
 			}
+			qTransProto, isQueued := (stack.transportProtocols[proto].proto).(queuedTransportProtocol)
+			if isQueued {
+				d.queuedProtocols[protoIDs] = qTransProto
+			}
 		}
 	}
 
@@ -209,7 +232,11 @@ func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNum
 //
 // +stateify savable
 type multiPortEndpoint struct {
-	mu           sync.RWMutex `state:"nosave"`
+	mu         sync.RWMutex `state:"nosave"`
+	demux      *transportDemuxer
+	netProto   tcpip.NetworkProtocolNumber
+	transProto tcpip.TransportProtocolNumber
+
 	endpointsArr []TransportEndpoint
 	endpointsMap map[TransportEndpoint]int
 	// reuse indicates if more than one endpoint is allowed.
@@ -258,13 +285,22 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32
 
 func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
 	ep.mu.RLock()
+	queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}]
 	for i, endpoint := range ep.endpointsArr {
 		// HandlePacket takes ownership of pkt, so each endpoint needs
 		// its own copy except for the final one.
 		if i == len(ep.endpointsArr)-1 {
+			if mustQueue {
+				queuedProtocol.QueuePacket(r, endpoint, id, pkt)
+				break
+			}
 			endpoint.HandlePacket(r, id, pkt)
 			break
 		}
+		if mustQueue {
+			queuedProtocol.QueuePacket(r, endpoint, id, pkt.Clone())
+			continue
+		}
 		endpoint.HandlePacket(r, id, pkt.Clone())
 	}
 	ep.mu.RUnlock() // Don't use defer for performance reasons.
@@ -357,7 +393,7 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
 
 	if epsByNic, ok := eps.endpoints[id]; ok {
 		// There was already a binding.
-		return epsByNic.registerEndpoint(ep, reusePort, bindToDevice)
+		return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
 	}
 
 	// This is a new binding.
@@ -367,7 +403,7 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
 	}
 	eps.endpoints[id] = epsByNic
 
-	return epsByNic.registerEndpoint(ep, reusePort, bindToDevice)
+	return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
 }
 
 // unregisterEndpoint unregisters the endpoint with the given id such that it
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 353bd06f4..0e3ab05ad 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -16,6 +16,18 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "tcp_endpoint_list",
+    out = "tcp_endpoint_list.go",
+    package = "tcp",
+    prefix = "endpoint",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*endpoint",
+        "Linker": "*endpoint",
+    },
+)
+
 go_library(
     name = "tcp",
     srcs = [
@@ -23,6 +35,7 @@ go_library(
         "connect.go",
         "cubic.go",
         "cubic_state.go",
+        "dispatcher.go",
         "endpoint.go",
         "endpoint_state.go",
         "forwarder.go",
@@ -38,6 +51,7 @@ go_library(
         "segment_state.go",
         "snd.go",
         "snd_state.go",
+        "tcp_endpoint_list.go",
         "tcp_segment_list.go",
         "timer.go",
     ],
@@ -45,7 +59,6 @@ go_library(
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
-        "//pkg/log",
         "//pkg/rand",
         "//pkg/sleep",
         "//pkg/sync",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 1ea996936..1a2e3efa9 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -285,7 +285,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	// listenEP is nil when listenContext is used by tcp.Forwarder.
 	if l.listenEP != nil {
 		l.listenEP.mu.Lock()
-		if l.listenEP.state != StateListen {
+		if l.listenEP.EndpointState() != StateListen {
 			l.listenEP.mu.Unlock()
 			return nil, tcpip.ErrConnectionAborted
 		}
@@ -344,11 +344,12 @@ func (l *listenContext) closeAllPendingEndpoints() {
 // instead.
 func (e *endpoint) deliverAccepted(n *endpoint) {
 	e.mu.Lock()
-	state := e.state
+	state := e.EndpointState()
 	e.pendingAccepted.Add(1)
 	defer e.pendingAccepted.Done()
 	acceptedChan := e.acceptedChan
 	e.mu.Unlock()
+
 	if state == StateListen {
 		acceptedChan <- n
 		e.waiterQueue.Notify(waiter.EventIn)
@@ -562,8 +563,8 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// We do not use transitionToStateEstablishedLocked here as there is
 		// no handshake state available when doing a SYN cookie based accept.
 		n.stack.Stats().TCP.CurrentEstablished.Increment()
-		n.state = StateEstablished
 		n.isConnectNotified = true
+		n.setEndpointState(StateEstablished)
 
 		// Do the delivery in a separate goroutine so
 		// that we don't block the listen loop in case
@@ -596,7 +597,7 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 		// handleSynSegment() from attempting to queue new connections
 		// to the endpoint.
 		e.mu.Lock()
-		e.state = StateClose
+		e.setEndpointState(StateClose)
 
 		// close any endpoints in SYN-RCVD state.
 		ctx.closeAllPendingEndpoints()
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 613ec1775..f3896715b 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -190,7 +190,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
 	h.mss = opts.MSS
 	h.sndWndScale = opts.WS
 	h.ep.mu.Lock()
-	h.ep.state = StateSynRecv
+	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
 }
 
@@ -274,14 +274,14 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	// SYN-RCVD state.
 	h.state = handshakeSynRcvd
 	h.ep.mu.Lock()
-	h.ep.state = StateSynRecv
 	ttl := h.ep.ttl
+	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
 	synOpts := header.TCPSynOptions{
 		WS:    int(h.effectiveRcvWndScale()),
 		TS:    rcvSynOpts.TS,
 		TSVal: h.ep.timestamp(),
-		TSEcr: h.ep.recentTS,
+		TSEcr: h.ep.recentTimestamp(),
 
 		// We only send SACKPermitted if the other side indicated it
 		// permits SACK. This is not explicitly defined in the RFC but
@@ -341,7 +341,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			WS:            h.rcvWndScale,
 			TS:            h.ep.sendTSOk,
 			TSVal:         h.ep.timestamp(),
-			TSEcr:         h.ep.recentTS,
+			TSEcr:         h.ep.recentTimestamp(),
 			SACKPermitted: h.ep.sackPermitted,
 			MSS:           h.ep.amss,
 		}
@@ -501,7 +501,7 @@ func (h *handshake) execute() *tcpip.Error {
 		WS:            h.rcvWndScale,
 		TS:            true,
 		TSVal:         h.ep.timestamp(),
-		TSEcr:         h.ep.recentTS,
+		TSEcr:         h.ep.recentTimestamp(),
 		SACKPermitted: bool(sackEnabled),
 		MSS:           h.ep.amss,
 	}
@@ -792,7 +792,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
 		// Ref: https://tools.ietf.org/html/rfc7323#section-5.4.
 		offset += header.EncodeNOP(options[offset:])
 		offset += header.EncodeNOP(options[offset:])
-		offset += header.EncodeTSOption(e.timestamp(), uint32(e.recentTS), options[offset:])
+		offset += header.EncodeTSOption(e.timestamp(), e.recentTimestamp(), options[offset:])
 	}
 	if e.sackPermitted && len(sackBlocks) > 0 {
 		offset += header.EncodeNOP(options[offset:])
@@ -811,7 +811,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
 // sendRaw sends a TCP segment to the endpoint's peer.
 func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error {
 	var sackBlocks []header.SACKBlock
-	if e.state == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
+	if e.EndpointState() == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
 		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
 	}
 	options := e.makeOptions(sackBlocks)
@@ -848,6 +848,9 @@ func (e *endpoint) handleWrite() *tcpip.Error {
 }
 
 func (e *endpoint) handleClose() *tcpip.Error {
+	if !e.EndpointState().connected() {
+		return nil
+	}
 	// Drain the send queue.
 	e.handleWrite()
 
@@ -864,11 +867,7 @@ func (e *endpoint) handleClose() *tcpip.Error {
 func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 	// Only send a reset if the connection is being aborted for a reason
 	// other than receiving a reset.
-	if e.state == StateEstablished || e.state == StateCloseWait {
-		e.stack.Stats().TCP.EstablishedResets.Increment()
-		e.stack.Stats().TCP.CurrentEstablished.Decrement()
-	}
-	e.state = StateError
+	e.setEndpointState(StateError)
 	e.HardError = err
 	if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout {
 		// The exact sequence number to be used for the RST is the same as the
@@ -888,9 +887,12 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 }
 
 // completeWorkerLocked is called by the worker goroutine when it's about to
-// exit. It marks the worker as completed and performs cleanup work if requested
-// by Close().
+// exit.
 func (e *endpoint) completeWorkerLocked() {
+	// Worker is terminating(either due to moving to
+	// CLOSED or ERROR state, ensure we release all
+	// registrations port reservations even if the socket
+	// itself is not yet closed by the application.
 	e.workerRunning = false
 	if e.workerCleanup {
 		e.cleanupLocked()
@@ -917,8 +919,7 @@ func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
 		e.rcvAutoParams.prevCopied = int(h.rcvWnd)
 		e.rcvListMu.Unlock()
 	}
-	h.ep.stack.Stats().TCP.CurrentEstablished.Increment()
-	e.state = StateEstablished
+	e.setEndpointState(StateEstablished)
 }
 
 // transitionToStateCloseLocked ensures that the endpoint is
@@ -927,11 +928,12 @@ func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
 // delivered to this endpoint from the demuxer when the endpoint
 // is transitioned to StateClose.
 func (e *endpoint) transitionToStateCloseLocked() {
-	if e.state == StateClose {
+	if e.EndpointState() == StateClose {
 		return
 	}
+	// Mark the endpoint as fully closed for reads/writes.
 	e.cleanupLocked()
-	e.state = StateClose
+	e.setEndpointState(StateClose)
 	e.stack.Stats().TCP.EstablishedClosed.Increment()
 }
 
@@ -946,7 +948,9 @@ func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 		s.decRef()
 		return
 	}
-	ep.(*endpoint).enqueueSegment(s)
+	if ep.(*endpoint).enqueueSegment(s) {
+		ep.(*endpoint).newSegmentWaker.Assert()
+	}
 }
 
 func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
@@ -955,9 +959,8 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 		// except SYN-SENT, all reset (RST) segments are
 		// validated by checking their SEQ-fields." So
 		// we only process it if it's acceptable.
-		s.decRef()
 		e.mu.Lock()
-		switch e.state {
+		switch e.EndpointState() {
 		// In case of a RST in CLOSE-WAIT linux moves
 		// the socket to closed state with an error set
 		// to indicate EPIPE.
@@ -981,103 +984,57 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 			e.transitionToStateCloseLocked()
 			e.HardError = tcpip.ErrAborted
 			e.mu.Unlock()
+			e.notifyProtocolGoroutine(notifyTickleWorker)
 			return false, nil
 		default:
 			e.mu.Unlock()
+			// RFC 793, page 37 states that "in all states
+			// except SYN-SENT, all reset (RST) segments are
+			// validated by checking their SEQ-fields." So
+			// we only process it if it's acceptable.
+
+			// Notify protocol goroutine. This is required when
+			// handleSegment is invoked from the processor goroutine
+			// rather than the worker goroutine.
+			e.notifyProtocolGoroutine(notifyResetByPeer)
 			return false, tcpip.ErrConnectionReset
 		}
 	}
 	return true, nil
 }
 
-// handleSegments pulls segments from the queue and processes them. It returns
-// no error if the protocol loop should continue, an error otherwise.
-func (e *endpoint) handleSegments() *tcpip.Error {
+// handleSegments processes all inbound segments.
+func (e *endpoint) handleSegments(fastPath bool) *tcpip.Error {
 	checkRequeue := true
 	for i := 0; i < maxSegmentsPerWake; i++ {
+		if e.EndpointState() == StateClose || e.EndpointState() == StateError {
+			return nil
+		}
 		s := e.segmentQueue.dequeue()
 		if s == nil {
 			checkRequeue = false
 			break
 		}
 
-		// Invoke the tcp probe if installed.
-		if e.probe != nil {
-			e.probe(e.completeState())
+		cont, err := e.handleSegment(s)
+		if err != nil {
+			s.decRef()
+			e.mu.Lock()
+			e.setEndpointState(StateError)
+			e.HardError = err
+			e.mu.Unlock()
+			return err
 		}
-
-		if s.flagIsSet(header.TCPFlagRst) {
-			if ok, err := e.handleReset(s); !ok {
-				return err
-			}
-		} else if s.flagIsSet(header.TCPFlagSyn) {
-			// See: https://tools.ietf.org/html/rfc5961#section-4.1
-			//   1) If the SYN bit is set, irrespective of the sequence number, TCP
-			//    MUST send an ACK (also referred to as challenge ACK) to the remote
-			//    peer:
-			//
-			//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
-			//
-			//    After sending the acknowledgment, TCP MUST drop the unacceptable
-			//    segment and stop processing further.
-			//
-			// By sending an ACK, the remote peer is challenged to confirm the loss
-			// of the previous connection and the request to start a new connection.
-			// A legitimate peer, after restart, would not have a TCB in the
-			// synchronized state.  Thus, when the ACK arrives, the peer should send
-			// a RST segment back with the sequence number derived from the ACK
-			// field that caused the RST.
-
-			// This RST will confirm that the remote peer has indeed closed the
-			// previous connection.  Upon receipt of a valid RST, the local TCP
-			// endpoint MUST terminate its connection.  The local TCP endpoint
-			// should then rely on SYN retransmission from the remote end to
-			// re-establish the connection.
-
-			e.snd.sendAck()
-		} else if s.flagIsSet(header.TCPFlagAck) {
-			// Patch the window size in the segment according to the
-			// send window scale.
-			s.window <<= e.snd.sndWndScale
-
-			// RFC 793, page 41 states that "once in the ESTABLISHED
-			// state all segments must carry current acknowledgment
-			// information."
-			drop, err := e.rcv.handleRcvdSegment(s)
-			if err != nil {
-				s.decRef()
-				return err
-			}
-			if drop {
-				s.decRef()
-				continue
-			}
-
-			// Now check if the received segment has caused us to transition
-			// to a CLOSED state, if yes then terminate processing and do
-			// not invoke the sender.
-			e.mu.RLock()
-			state := e.state
-			e.mu.RUnlock()
-			if state == StateClose {
-				// When we get into StateClose while processing from the queue,
-				// return immediately and let the protocolMainloop handle it.
-				//
-				// We can reach StateClose only while processing a previous segment
-				// or a notification from the protocolMainLoop (caller goroutine).
-				// This means that with this return, the segment dequeue below can
-				// never occur on a closed endpoint.
-				s.decRef()
-				return nil
-			}
-			e.snd.handleRcvdSegment(s)
+		if !cont {
+			s.decRef()
+			return nil
 		}
-		s.decRef()
 	}
 
-	// If the queue is not empty, make sure we'll wake up in the next
-	// iteration.
-	if checkRequeue && !e.segmentQueue.empty() {
+	// When fastPath is true we don't want to wake up the worker
+	// goroutine. If the endpoint has more segments to process the
+	// dispatcher will call handleSegments again anyway.
+	if !fastPath && checkRequeue && !e.segmentQueue.empty() {
 		e.newSegmentWaker.Assert()
 	}
 
@@ -1086,11 +1043,88 @@ func (e *endpoint) handleSegments() *tcpip.Error {
 		e.snd.sendAck()
 	}
 
-	e.resetKeepaliveTimer(true)
+	e.resetKeepaliveTimer(true /* receivedData */)
 
 	return nil
 }
 
+// handleSegment handles a given segment and notifies the worker goroutine if
+// if the connection should be terminated.
+func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
+	// Invoke the tcp probe if installed.
+	if e.probe != nil {
+		e.probe(e.completeState())
+	}
+
+	if s.flagIsSet(header.TCPFlagRst) {
+		if ok, err := e.handleReset(s); !ok {
+			return false, err
+		}
+	} else if s.flagIsSet(header.TCPFlagSyn) {
+		// See: https://tools.ietf.org/html/rfc5961#section-4.1
+		//   1) If the SYN bit is set, irrespective of the sequence number, TCP
+		//    MUST send an ACK (also referred to as challenge ACK) to the remote
+		//    peer:
+		//
+		//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+		//
+		//    After sending the acknowledgment, TCP MUST drop the unacceptable
+		//    segment and stop processing further.
+		//
+		// By sending an ACK, the remote peer is challenged to confirm the loss
+		// of the previous connection and the request to start a new connection.
+		// A legitimate peer, after restart, would not have a TCB in the
+		// synchronized state.  Thus, when the ACK arrives, the peer should send
+		// a RST segment back with the sequence number derived from the ACK
+		// field that caused the RST.
+
+		// This RST will confirm that the remote peer has indeed closed the
+		// previous connection.  Upon receipt of a valid RST, the local TCP
+		// endpoint MUST terminate its connection.  The local TCP endpoint
+		// should then rely on SYN retransmission from the remote end to
+		// re-establish the connection.
+
+		e.snd.sendAck()
+	} else if s.flagIsSet(header.TCPFlagAck) {
+		// Patch the window size in the segment according to the
+		// send window scale.
+		s.window <<= e.snd.sndWndScale
+
+		// RFC 793, page 41 states that "once in the ESTABLISHED
+		// state all segments must carry current acknowledgment
+		// information."
+		drop, err := e.rcv.handleRcvdSegment(s)
+		if err != nil {
+			return false, err
+		}
+		if drop {
+			return true, nil
+		}
+
+		// Now check if the received segment has caused us to transition
+		// to a CLOSED state, if yes then terminate processing and do
+		// not invoke the sender.
+		e.mu.RLock()
+		state := e.state
+		e.mu.RUnlock()
+		if state == StateClose {
+			// When we get into StateClose while processing from the queue,
+			// return immediately and let the protocolMainloop handle it.
+			//
+			// We can reach StateClose only while processing a previous segment
+			// or a notification from the protocolMainLoop (caller goroutine).
+			// This means that with this return, the segment dequeue below can
+			// never occur on a closed endpoint.
+			s.decRef()
+			return false, nil
+		}
+
+		e.snd.handleRcvdSegment(s)
+	}
+
+	return true, nil
+}
+
 // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP
 // keepalive packets periodically when the connection is idle. If we don't hear
 // from the other side after a number of tries, we terminate the connection.
@@ -1160,7 +1194,7 @@ func (e *endpoint) disableKeepaliveTimer() {
 // protocolMainLoop is the main loop of the TCP protocol. It runs in its own
 // goroutine and is responsible for sending segments and handling received
 // segments.
-func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
+func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) *tcpip.Error {
 	var closeTimer *time.Timer
 	var closeWaker sleep.Waker
 
@@ -1182,6 +1216,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		}
 
 		e.mu.Unlock()
+		e.workMu.Unlock()
 		// When the protocol loop exits we should wake up our waiters.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 	}
@@ -1193,7 +1228,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		initialRcvWnd := e.initialReceiveWindow()
 		h := newHandshake(e, seqnum.Size(initialRcvWnd))
 		e.mu.Lock()
-		h.ep.state = StateSynSent
+		h.ep.setEndpointState(StateSynSent)
 		e.mu.Unlock()
 
 		if err := h.execute(); err != nil {
@@ -1202,12 +1237,11 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			e.lastErrorMu.Unlock()
 
 			e.mu.Lock()
-			e.state = StateError
+			e.setEndpointState(StateError)
 			e.HardError = err
 
 			// Lock released below.
 			epilogue()
-
 			return err
 		}
 	}
@@ -1215,7 +1249,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	e.keepalive.timer.init(&e.keepalive.waker)
 	defer e.keepalive.timer.cleanup()
 
-	// Tell waiters that the endpoint is connected and writable.
 	e.mu.Lock()
 	drained := e.drainDone != nil
 	e.mu.Unlock()
@@ -1224,8 +1257,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		<-e.undrain
 	}
 
-	e.waiterQueue.Notify(waiter.EventOut)
-
 	// Set up the functions that will be called when the main protocol loop
 	// wakes up.
 	funcs := []struct {
@@ -1240,18 +1271,15 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			w: &e.sndCloseWaker,
 			f: e.handleClose,
 		},
-		{
-			w: &e.newSegmentWaker,
-			f: e.handleSegments,
-		},
 		{
 			w: &closeWaker,
 			f: func() *tcpip.Error {
 				// This means the socket is being closed due
-				// to the TCP_FIN_WAIT2 timeout was hit. Just
+				// to the TCP-FIN-WAIT2 timeout was hit. Just
 				// mark the socket as closed.
 				e.mu.Lock()
 				e.transitionToStateCloseLocked()
+				e.workerCleanup = true
 				e.mu.Unlock()
 				return nil
 			},
@@ -1266,6 +1294,12 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 				return nil
 			},
 		},
+		{
+			w: &e.newSegmentWaker,
+			f: func() *tcpip.Error {
+				return e.handleSegments(false /* fastPath */)
+			},
+		},
 		{
 			w: &e.keepalive.waker,
 			f: e.keepaliveTimerExpired,
@@ -1293,14 +1327,16 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 				}
 
 				if n&notifyReset != 0 {
-					e.mu.Lock()
-					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
-					e.mu.Unlock()
+					return tcpip.ErrConnectionAborted
+				}
+
+				if n&notifyResetByPeer != 0 {
+					return tcpip.ErrConnectionReset
 				}
 
 				if n&notifyClose != 0 && closeTimer == nil {
 					e.mu.Lock()
-					if e.state == StateFinWait2 && e.closed {
+					if e.EndpointState() == StateFinWait2 && e.closed {
 						// The socket has been closed and we are in FIN_WAIT2
 						// so start the FIN_WAIT2 timer.
 						closeTimer = time.AfterFunc(e.tcpLingerTimeout, func() {
@@ -1320,11 +1356,11 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 
 				if n&notifyDrain != 0 {
 					for !e.segmentQueue.empty() {
-						if err := e.handleSegments(); err != nil {
+						if err := e.handleSegments(false /* fastPath */); err != nil {
 							return err
 						}
 					}
-					if e.state != StateClose && e.state != StateError {
+					if e.EndpointState() != StateClose && e.EndpointState() != StateError {
 						// Only block the worker if the endpoint
 						// is not in closed state or error state.
 						close(e.drainDone)
@@ -1349,14 +1385,21 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		s.AddWaker(funcs[i].w, i)
 	}
 
+	// Notify the caller that the waker initialization is complete and the
+	// endpoint is ready.
+	if wakerInitDone != nil {
+		close(wakerInitDone)
+	}
+
+	// Tell waiters that the endpoint is connected and writable.
+	e.waiterQueue.Notify(waiter.EventOut)
+
 	// The following assertions and notifications are needed for restored
 	// endpoints. Fresh newly created endpoints have empty states and should
 	// not invoke any.
-	e.segmentQueue.mu.Lock()
-	if !e.segmentQueue.list.Empty() {
+	if !e.segmentQueue.empty() {
 		e.newSegmentWaker.Assert()
 	}
-	e.segmentQueue.mu.Unlock()
 
 	e.rcvListMu.Lock()
 	if !e.rcvList.Empty() {
@@ -1372,27 +1415,32 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	// Main loop. Handle segments until both send and receive ends of the
 	// connection have completed.
 
-	for e.state != StateTimeWait && e.state != StateClose && e.state != StateError {
+	for e.EndpointState() != StateTimeWait && e.EndpointState() != StateClose && e.EndpointState() != StateError {
 		e.mu.Unlock()
 		e.workMu.Unlock()
 		v, _ := s.Fetch(true)
 		e.workMu.Lock()
+		// We need to double check here because the notification
+		// maybe stale by the time we got around to processing it.
+		// NOTE: since we now hold the workMu the processors cannot
+		// change the state of the endpoint so it' safe to proceed
+		// after this check.
+		if e.EndpointState() == StateTimeWait || e.EndpointState() == StateClose || e.EndpointState() == StateError {
+			e.mu.Lock()
+			break
+		}
 		if err := funcs[v].f(); err != nil {
 			e.mu.Lock()
-			// Ensure we release all endpoint registration and route
-			// references as the connection is now in an error
-			// state.
 			e.workerCleanup = true
 			e.resetConnectionLocked(err)
 			// Lock released below.
 			epilogue()
-
 			return nil
 		}
 		e.mu.Lock()
 	}
 
-	state := e.state
+	state := e.EndpointState()
 	e.mu.Unlock()
 	var reuseTW func()
 	if state == StateTimeWait {
@@ -1405,13 +1453,15 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		s.Done()
 		// Wake up any waiters before we enter TIME_WAIT.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+		e.mu.Lock()
+		e.workerCleanup = true
+		e.mu.Unlock()
 		reuseTW = e.doTimeWait()
 	}
 
 	// Mark endpoint as closed.
 	e.mu.Lock()
-	if e.state != StateError {
-		e.stack.Stats().TCP.CurrentEstablished.Decrement()
+	if e.EndpointState() != StateError {
 		e.transitionToStateCloseLocked()
 	}
 
@@ -1468,7 +1518,11 @@ func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()
 					tcpEP := listenEP.(*endpoint)
 					if EndpointState(tcpEP.State()) == StateListen {
 						reuseTW = func() {
-							tcpEP.enqueueSegment(s)
+							if !tcpEP.enqueueSegment(s) {
+								s.decRef()
+								return
+							}
+							tcpEP.newSegmentWaker.Assert()
 						}
 						// We explicitly do not decRef
 						// the segment as it's still
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
new file mode 100644
index 000000000..a72f0c379
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -0,0 +1,218 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// epQueue is a queue of endpoints.
+type epQueue struct {
+	mu   sync.Mutex
+	list endpointList
+}
+
+// enqueue adds e to the queue if the endpoint is not already on the queue.
+func (q *epQueue) enqueue(e *endpoint) {
+	q.mu.Lock()
+	if e.pendingProcessing {
+		q.mu.Unlock()
+		return
+	}
+	q.list.PushBack(e)
+	e.pendingProcessing = true
+	q.mu.Unlock()
+}
+
+// dequeue removes and returns the first element from the queue if available,
+// returns nil otherwise.
+func (q *epQueue) dequeue() *endpoint {
+	q.mu.Lock()
+	if e := q.list.Front(); e != nil {
+		q.list.Remove(e)
+		e.pendingProcessing = false
+		q.mu.Unlock()
+		return e
+	}
+	q.mu.Unlock()
+	return nil
+}
+
+// empty returns true if the queue is empty, false otherwise.
+func (q *epQueue) empty() bool {
+	q.mu.Lock()
+	v := q.list.Empty()
+	q.mu.Unlock()
+	return v
+}
+
+// processor is responsible for processing packets queued to a tcp endpoint.
+type processor struct {
+	epQ              epQueue
+	newEndpointWaker sleep.Waker
+	id               int
+}
+
+func newProcessor(id int) *processor {
+	p := &processor{
+		id: id,
+	}
+	go p.handleSegments()
+	return p
+}
+
+func (p *processor) queueEndpoint(ep *endpoint) {
+	// Queue an endpoint for processing by the processor goroutine.
+	p.epQ.enqueue(ep)
+	p.newEndpointWaker.Assert()
+}
+
+func (p *processor) handleSegments() {
+	const newEndpointWaker = 1
+	s := sleep.Sleeper{}
+	s.AddWaker(&p.newEndpointWaker, newEndpointWaker)
+	defer s.Done()
+	for {
+		s.Fetch(true)
+		for ep := p.epQ.dequeue(); ep != nil; ep = p.epQ.dequeue() {
+			if ep.segmentQueue.empty() {
+				continue
+			}
+
+			// If socket has transitioned out of connected state
+			// then just let the worker handle the packet.
+			//
+			// NOTE: We read this outside of e.mu lock which means
+			// that by the time we get to handleSegments the
+			// endpoint may not be in ESTABLISHED. But this should
+			// be fine as all normal shutdown states are handled by
+			// handleSegments and if the endpoint moves to a
+			// CLOSED/ERROR state then handleSegments is a noop.
+			if ep.EndpointState() != StateEstablished {
+				ep.newSegmentWaker.Assert()
+				continue
+			}
+
+			if !ep.workMu.TryLock() {
+				ep.newSegmentWaker.Assert()
+				continue
+			}
+			// If the endpoint is in a connected state then we do
+			// direct delivery to ensure low latency and avoid
+			// scheduler interactions.
+			if err := ep.handleSegments(true /* fastPath */); err != nil || ep.EndpointState() == StateClose {
+				ep.notifyProtocolGoroutine(notifyTickleWorker)
+				ep.workMu.Unlock()
+				continue
+			}
+
+			if !ep.segmentQueue.empty() {
+				p.epQ.enqueue(ep)
+			}
+
+			ep.workMu.Unlock()
+		}
+	}
+}
+
+// dispatcher manages a pool of TCP endpoint processors which are responsible
+// for the processing of inbound segments. This fixed pool of processor
+// goroutines do full tcp processing. The processor is selected based on the
+// hash of the endpoint id to ensure that delivery for the same endpoint happens
+// in-order.
+type dispatcher struct {
+	processors []*processor
+	seed       uint32
+}
+
+func newDispatcher(nProcessors int) *dispatcher {
+	processors := []*processor{}
+	for i := 0; i < nProcessors; i++ {
+		processors = append(processors, newProcessor(i))
+	}
+	return &dispatcher{
+		processors: processors,
+		seed:       generateRandUint32(),
+	}
+}
+
+func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+	ep := stackEP.(*endpoint)
+	s := newSegment(r, id, pkt)
+	if !s.parse() {
+		ep.stack.Stats().MalformedRcvdPackets.Increment()
+		ep.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
+		ep.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
+		s.decRef()
+		return
+	}
+
+	if !s.csumValid {
+		ep.stack.Stats().MalformedRcvdPackets.Increment()
+		ep.stack.Stats().TCP.ChecksumErrors.Increment()
+		ep.stats.ReceiveErrors.ChecksumErrors.Increment()
+		s.decRef()
+		return
+	}
+
+	ep.stack.Stats().TCP.ValidSegmentsReceived.Increment()
+	ep.stats.SegmentsReceived.Increment()
+	if (s.flags & header.TCPFlagRst) != 0 {
+		ep.stack.Stats().TCP.ResetsReceived.Increment()
+	}
+
+	if !ep.enqueueSegment(s) {
+		s.decRef()
+		return
+	}
+
+	// For sockets not in established state let the worker goroutine
+	// handle the packets.
+	if ep.EndpointState() != StateEstablished {
+		ep.newSegmentWaker.Assert()
+		return
+	}
+
+	d.selectProcessor(id).queueEndpoint(ep)
+}
+
+func generateRandUint32() uint32 {
+	b := make([]byte, 4)
+	if _, err := rand.Read(b); err != nil {
+		panic(err)
+	}
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func (d *dispatcher) selectProcessor(id stack.TransportEndpointID) *processor {
+	payload := []byte{
+		byte(id.LocalPort),
+		byte(id.LocalPort >> 8),
+		byte(id.RemotePort),
+		byte(id.RemotePort >> 8)}
+
+	h := jenkins.Sum32(d.seed)
+	h.Write(payload)
+	h.Write([]byte(id.LocalAddress))
+	h.Write([]byte(id.RemoteAddress))
+
+	return d.processors[h.Sum32()%uint32(len(d.processors))]
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index cc8b533c8..1799c6e10 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -120,6 +120,7 @@ const (
 	notifyMTUChanged
 	notifyDrain
 	notifyReset
+	notifyResetByPeer
 	notifyKeepaliveChanged
 	notifyMSSChanged
 	// notifyTickleWorker is used to tickle the protocol main loop during a
@@ -127,6 +128,7 @@ const (
 	// ensures the loop terminates if the final state of the endpoint is
 	// say TIME_WAIT.
 	notifyTickleWorker
+	notifyError
 )
 
 // SACKInfo holds TCP SACK related information for a given endpoint.
@@ -283,6 +285,18 @@ func (*EndpointInfo) IsEndpointInfo() {}
 type endpoint struct {
 	EndpointInfo
 
+	// endpointEntry is used to queue endpoints for processing to the
+	// a given tcp processor goroutine.
+	//
+	// Precondition: epQueue.mu must be held to read/write this field..
+	endpointEntry `state:"nosave"`
+
+	// pendingProcessing is true if this endpoint is queued for processing
+	// to a TCP processor.
+	//
+	// Precondition: epQueue.mu must be held to read/write this field..
+	pendingProcessing bool `state:"nosave"`
+
 	// workMu is used to arbitrate which goroutine may perform protocol
 	// work. Only the main protocol goroutine is expected to call Lock() on
 	// it, but other goroutines (e.g., send) may call TryLock() to eagerly
@@ -324,6 +338,7 @@ type endpoint struct {
 	// The following fields are protected by the mutex.
 	mu sync.RWMutex `state:"nosave"`
 
+	// state must be read/set using the EndpointState()/setEndpointState() methods.
 	state EndpointState `state:".(EndpointState)"`
 
 	// origEndpointState is only used during a restore phase to save the
@@ -359,7 +374,7 @@ type endpoint struct {
 	workerRunning bool
 
 	// workerCleanup specifies if the worker goroutine must perform cleanup
-	// before exitting. This can only be set to true when workerRunning is
+	// before exiting. This can only be set to true when workerRunning is
 	// also true, and they're both protected by the mutex.
 	workerCleanup bool
 
@@ -371,6 +386,8 @@ type endpoint struct {
 	// recentTS is the timestamp that should be sent in the TSEcr field of
 	// the timestamp for future segments sent by the endpoint. This field is
 	// updated if required when a new segment is received by this endpoint.
+	//
+	// recentTS must be read/written atomically.
 	recentTS uint32
 
 	// tsOffset is a randomized offset added to the value of the
@@ -567,6 +584,47 @@ func (e *endpoint) ResumeWork() {
 	e.workMu.Unlock()
 }
 
+// setEndpointState updates the state of the endpoint to state atomically. This
+// method is unexported as the only place we should update the state is in this
+// package but we allow the state to be read freely without holding e.mu.
+//
+// Precondition: e.mu must be held to call this method.
+func (e *endpoint) setEndpointState(state EndpointState) {
+	oldstate := EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
+	switch state {
+	case StateEstablished:
+		e.stack.Stats().TCP.CurrentEstablished.Increment()
+	case StateError:
+		fallthrough
+	case StateClose:
+		if oldstate == StateCloseWait || oldstate == StateEstablished {
+			e.stack.Stats().TCP.EstablishedResets.Increment()
+		}
+		fallthrough
+	default:
+		if oldstate == StateEstablished {
+			e.stack.Stats().TCP.CurrentEstablished.Decrement()
+		}
+	}
+	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
+}
+
+// EndpointState returns the current state of the endpoint.
+func (e *endpoint) EndpointState() EndpointState {
+	return EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
+}
+
+// setRecentTimestamp atomically sets the recentTS field to the
+// provided value.
+func (e *endpoint) setRecentTimestamp(recentTS uint32) {
+	atomic.StoreUint32(&e.recentTS, recentTS)
+}
+
+// recentTimestamp atomically reads and returns the value of the recentTS field.
+func (e *endpoint) recentTimestamp() uint32 {
+	return atomic.LoadUint32(&e.recentTS)
+}
+
 // keepalive is a synchronization wrapper used to appease stateify. See the
 // comment in endpoint, where it is used.
 //
@@ -656,7 +714,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 
-	switch e.state {
+	switch e.EndpointState() {
 	case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
 		// Ready for nothing.
 
@@ -672,7 +730,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 			}
 		}
 	}
-	if e.state.connected() {
+	if e.EndpointState().connected() {
 		// Determine if the endpoint is writable if requested.
 		if (mask & waiter.EventOut) != 0 {
 			e.sndBufMu.Lock()
@@ -733,14 +791,20 @@ func (e *endpoint) Close() {
 	// Issue a shutdown so that the peer knows we won't send any more data
 	// if we're connected, or stop accepting if we're listening.
 	e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
+	e.closeNoShutdown()
+}
 
+// closeNoShutdown closes the endpoint without doing a full shutdown. This is
+// used when a connection needs to be aborted with a RST and we want to skip
+// a full 4 way TCP shutdown.
+func (e *endpoint) closeNoShutdown() {
 	e.mu.Lock()
 
 	// For listening sockets, we always release ports inline so that they
 	// are immediately available for reuse after Close() is called. If also
 	// registered, we unregister as well otherwise the next user would fail
 	// in Listen() when trying to register.
-	if e.state == StateListen && e.isPortReserved {
+	if e.EndpointState() == StateListen && e.isPortReserved {
 		if e.isRegistered {
 			e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
 			e.isRegistered = false
@@ -780,6 +844,8 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
 		defer close(done)
 		for n := range e.acceptedChan {
 			n.notifyProtocolGoroutine(notifyReset)
+			// close all connections that have completed but
+			// not accepted by the application.
 			n.Close()
 		}
 	}()
@@ -797,11 +863,13 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
 // after Close() is called and the worker goroutine (if any) is done with its
 // work.
 func (e *endpoint) cleanupLocked() {
+
 	// Close all endpoints that might have been accepted by TCP but not by
 	// the client.
 	if e.acceptedChan != nil {
 		e.closePendingAcceptableConnectionsLocked()
 	}
+
 	e.workerCleanup = false
 
 	if e.isRegistered {
@@ -920,7 +988,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 	// reads to proceed before returning a ECONNRESET.
 	e.rcvListMu.Lock()
 	bufUsed := e.rcvBufUsed
-	if s := e.state; !s.connected() && s != StateClose && bufUsed == 0 {
+	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
 		e.rcvListMu.Unlock()
 		he := e.HardError
 		e.mu.RUnlock()
@@ -944,7 +1012,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 
 func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 	if e.rcvBufUsed == 0 {
-		if e.rcvClosed || !e.state.connected() {
+		if e.rcvClosed || !e.EndpointState().connected() {
 			return buffer.View{}, tcpip.ErrClosedForReceive
 		}
 		return buffer.View{}, tcpip.ErrWouldBlock
@@ -980,8 +1048,8 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 // Caller must hold e.mu and e.sndBufMu
 func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
 	// The endpoint cannot be written to if it's not connected.
-	if !e.state.connected() {
-		switch e.state {
+	if !e.EndpointState().connected() {
+		switch e.EndpointState() {
 		case StateError:
 			return 0, e.HardError
 		default:
@@ -1039,42 +1107,86 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		return 0, nil, perr
 	}
 
-	if !opts.Atomic { // See above.
-		e.mu.RLock()
-		e.sndBufMu.Lock()
+	if opts.Atomic {
+		// Add data to the send queue.
+		s := newSegmentFromView(&e.route, e.ID, v)
+		e.sndBufUsed += len(v)
+		e.sndBufInQueue += seqnum.Size(len(v))
+		e.sndQueue.PushBack(s)
+		e.sndBufMu.Unlock()
+		// Release the endpoint lock to prevent deadlocks due to lock
+		// order inversion when acquiring workMu.
+		e.mu.RUnlock()
+	}
 
-		// Because we released the lock before copying, check state again
-		// to make sure the endpoint is still in a valid state for a write.
-		avail, err = e.isEndpointWritableLocked()
-		if err != nil {
+	if e.workMu.TryLock() {
+		// Since we released locks in between it's possible that the
+		// endpoint transitioned to a CLOSED/ERROR states so make
+		// sure endpoint is still writable before trying to write.
+		if !opts.Atomic { // See above.
+			e.mu.RLock()
+			e.sndBufMu.Lock()
+
+			// Because we released the lock before copying, check state again
+			// to make sure the endpoint is still in a valid state for a write.
+			avail, err = e.isEndpointWritableLocked()
+			if err != nil {
+				e.sndBufMu.Unlock()
+				e.mu.RUnlock()
+				e.stats.WriteErrors.WriteClosed.Increment()
+				return 0, nil, err
+			}
+
+			// Discard any excess data copied in due to avail being reduced due
+			// to a simultaneous write call to the socket.
+			if avail < len(v) {
+				v = v[:avail]
+			}
+			// Add data to the send queue.
+			s := newSegmentFromView(&e.route, e.ID, v)
+			e.sndBufUsed += len(v)
+			e.sndBufInQueue += seqnum.Size(len(v))
+			e.sndQueue.PushBack(s)
 			e.sndBufMu.Unlock()
+			// Release the endpoint lock to prevent deadlocks due to lock
+			// order inversion when acquiring workMu.
 			e.mu.RUnlock()
-			e.stats.WriteErrors.WriteClosed.Increment()
-			return 0, nil, err
-		}
 
-		// Discard any excess data copied in due to avail being reduced due
-		// to a simultaneous write call to the socket.
-		if avail < len(v) {
-			v = v[:avail]
 		}
-	}
-
-	// Add data to the send queue.
-	s := newSegmentFromView(&e.route, e.ID, v)
-	e.sndBufUsed += len(v)
-	e.sndBufInQueue += seqnum.Size(len(v))
-	e.sndQueue.PushBack(s)
-	e.sndBufMu.Unlock()
-	// Release the endpoint lock to prevent deadlocks due to lock
-	// order inversion when acquiring workMu.
-	e.mu.RUnlock()
-
-	if e.workMu.TryLock() {
 		// Do the work inline.
 		e.handleWrite()
 		e.workMu.Unlock()
 	} else {
+		if !opts.Atomic { // See above.
+			e.mu.RLock()
+			e.sndBufMu.Lock()
+
+			// Because we released the lock before copying, check state again
+			// to make sure the endpoint is still in a valid state for a write.
+			avail, err = e.isEndpointWritableLocked()
+			if err != nil {
+				e.sndBufMu.Unlock()
+				e.mu.RUnlock()
+				e.stats.WriteErrors.WriteClosed.Increment()
+				return 0, nil, err
+			}
+
+			// Discard any excess data copied in due to avail being reduced due
+			// to a simultaneous write call to the socket.
+			if avail < len(v) {
+				v = v[:avail]
+			}
+			// Add data to the send queue.
+			s := newSegmentFromView(&e.route, e.ID, v)
+			e.sndBufUsed += len(v)
+			e.sndBufInQueue += seqnum.Size(len(v))
+			e.sndQueue.PushBack(s)
+			e.sndBufMu.Unlock()
+			// Release the endpoint lock to prevent deadlocks due to lock
+			// order inversion when acquiring workMu.
+			e.mu.RUnlock()
+
+		}
 		// Let the protocol goroutine do the work.
 		e.sndWaker.Assert()
 	}
@@ -1091,7 +1203,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 
 	// The endpoint can be read if it's connected, or if it's already closed
 	// but has some pending unread data.
-	if s := e.state; !s.connected() && s != StateClose {
+	if s := e.EndpointState(); !s.connected() && s != StateClose {
 		if s == StateError {
 			return 0, tcpip.ControlMessages{}, e.HardError
 		}
@@ -1103,7 +1215,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	defer e.rcvListMu.Unlock()
 
 	if e.rcvBufUsed == 0 {
-		if e.rcvClosed || !e.state.connected() {
+		if e.rcvClosed || !e.EndpointState().connected() {
 			e.stats.ReadErrors.ReadClosed.Increment()
 			return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
 		}
@@ -1187,7 +1299,7 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		defer e.mu.Unlock()
 
 		// We only allow this to be set when we're in the initial state.
-		if e.state != StateInitial {
+		if e.EndpointState() != StateInitial {
 			return tcpip.ErrInvalidEndpointState
 		}
 
@@ -1402,14 +1514,14 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 				// Acquire the work mutex as we may need to
 				// reinitialize the congestion control state.
 				e.mu.Lock()
-				state := e.state
+				state := e.EndpointState()
 				e.cc = v
 				e.mu.Unlock()
 				switch state {
 				case StateEstablished:
 					e.workMu.Lock()
 					e.mu.Lock()
-					if e.state == state {
+					if e.EndpointState() == state {
 						e.snd.cc = e.snd.initCongestionControl(e.cc)
 					}
 					e.mu.Unlock()
@@ -1472,7 +1584,7 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
 	defer e.mu.RUnlock()
 
 	// The endpoint cannot be in listen state.
-	if e.state == StateListen {
+	if e.EndpointState() == StateListen {
 		return 0, tcpip.ErrInvalidEndpointState
 	}
 
@@ -1731,7 +1843,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 		return err
 	}
 
-	if e.state.connected() {
+	if e.EndpointState().connected() {
 		// The endpoint is already connected. If caller hasn't been
 		// notified yet, return success.
 		if !e.isConnectNotified {
@@ -1743,7 +1855,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	}
 
 	nicID := addr.NIC
-	switch e.state {
+	switch e.EndpointState() {
 	case StateBound:
 		// If we're already bound to a NIC but the caller is requesting
 		// that we use a different one now, we cannot proceed.
@@ -1850,7 +1962,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	}
 
 	e.isRegistered = true
-	e.state = StateConnecting
+	e.setEndpointState(StateConnecting)
 	e.route = r.Clone()
 	e.boundNICID = nicID
 	e.effectiveNetProtos = netProtos
@@ -1871,14 +1983,13 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 		}
 		e.segmentQueue.mu.Unlock()
 		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
-		e.state = StateEstablished
-		e.stack.Stats().TCP.CurrentEstablished.Increment()
+		e.setEndpointState(StateEstablished)
 	}
 
 	if run {
 		e.workerRunning = true
 		e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
-		go e.protocolMainLoop(handshake) // S/R-SAFE: will be drained before save.
+		go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save.
 	}
 
 	return tcpip.ErrConnectStarted
@@ -1896,7 +2007,7 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 	e.shutdownFlags |= flags
 	finQueued := false
 	switch {
-	case e.state.connected():
+	case e.EndpointState().connected():
 		// Close for read.
 		if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
 			// Mark read side as closed.
@@ -1908,8 +2019,23 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 			// If we're fully closed and we have unread data we need to abort
 			// the connection with a RST.
 			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
-				e.notifyProtocolGoroutine(notifyReset)
+				// Move the socket to error state immediately.
+				// This is done redundantly because in case of
+				// save/restore on a Shutdown/Close() the socket
+				// state needs to indicate the error otherwise
+				// save file will show the socket in established
+				// state even though snd/rcv are closed.
 				e.mu.Unlock()
+				// Try to send an active reset immediately if the
+				// work mutex is available.
+				if e.workMu.TryLock() {
+					e.mu.Lock()
+					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+					e.mu.Unlock()
+					e.workMu.Unlock()
+				} else {
+					e.notifyProtocolGoroutine(notifyReset)
+				}
 				return nil
 			}
 		}
@@ -1931,11 +2057,10 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 			finQueued = true
 			// Mark endpoint as closed.
 			e.sndClosed = true
-
 			e.sndBufMu.Unlock()
 		}
 
-	case e.state == StateListen:
+	case e.EndpointState() == StateListen:
 		// Tell protocolListenLoop to stop.
 		if flags&tcpip.ShutdownRead != 0 {
 			e.notifyProtocolGoroutine(notifyClose)
@@ -1976,7 +2101,7 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	// When the endpoint shuts down, it sets workerCleanup to true, and from
 	// that point onward, acceptedChan is the responsibility of the cleanup()
 	// method (and should not be touched anywhere else, including here).
-	if e.state == StateListen && !e.workerCleanup {
+	if e.EndpointState() == StateListen && !e.workerCleanup {
 		// Adjust the size of the channel iff we can fix existing
 		// pending connections into the new one.
 		if len(e.acceptedChan) > backlog {
@@ -1994,7 +2119,7 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 		return nil
 	}
 
-	if e.state == StateInitial {
+	if e.EndpointState() == StateInitial {
 		// The listen is called on an unbound socket, the socket is
 		// automatically bound to a random free port with the local
 		// address set to INADDR_ANY.
@@ -2004,7 +2129,7 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	}
 
 	// Endpoint must be bound before it can transition to listen mode.
-	if e.state != StateBound {
+	if e.EndpointState() != StateBound {
 		e.stats.ReadErrors.InvalidEndpointState.Increment()
 		return tcpip.ErrInvalidEndpointState
 	}
@@ -2015,24 +2140,27 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	}
 
 	e.isRegistered = true
-	e.state = StateListen
+	e.setEndpointState(StateListen)
+
 	if e.acceptedChan == nil {
 		e.acceptedChan = make(chan *endpoint, backlog)
 	}
 	e.workerRunning = true
-
 	go e.protocolListenLoop( // S/R-SAFE: drained on save.
 		seqnum.Size(e.receiveBufferAvailable()))
-
 	return nil
 }
 
 // startAcceptedLoop sets up required state and starts a goroutine with the
 // main loop for accepted connections.
 func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
+	e.mu.Lock()
 	e.waiterQueue = waiterQueue
 	e.workerRunning = true
-	go e.protocolMainLoop(false) // S/R-SAFE: drained on save.
+	e.mu.Unlock()
+	wakerInitDone := make(chan struct{})
+	go e.protocolMainLoop(false, wakerInitDone) // S/R-SAFE: drained on save.
+	<-wakerInitDone
 }
 
 // Accept returns a new endpoint if a peer has established a connection
@@ -2042,7 +2170,7 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	defer e.mu.RUnlock()
 
 	// Endpoint must be in listen state before it can accept connections.
-	if e.state != StateListen {
+	if e.EndpointState() != StateListen {
 		return nil, nil, tcpip.ErrInvalidEndpointState
 	}
 
@@ -2069,7 +2197,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 	// Don't allow binding once endpoint is not in the initial state
 	// anymore. This is because once the endpoint goes into a connected or
 	// listen state, it is already bound.
-	if e.state != StateInitial {
+	if e.EndpointState() != StateInitial {
 		return tcpip.ErrAlreadyBound
 	}
 
@@ -2131,7 +2259,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 	}
 
 	// Mark endpoint as bound.
-	e.state = StateBound
+	e.setEndpointState(StateBound)
 
 	return nil
 }
@@ -2153,7 +2281,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 
-	if !e.state.connected() {
+	if !e.EndpointState().connected() {
 		return tcpip.FullAddress{}, tcpip.ErrNotConnected
 	}
 
@@ -2164,45 +2292,22 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	}, nil
 }
 
-// HandlePacket is called by the stack when new packets arrive to this transport
-// endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
-	s := newSegment(r, id, pkt)
-	if !s.parse() {
-		e.stack.Stats().MalformedRcvdPackets.Increment()
-		e.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
-		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
-		s.decRef()
-		return
-	}
-
-	if !s.csumValid {
-		e.stack.Stats().MalformedRcvdPackets.Increment()
-		e.stack.Stats().TCP.ChecksumErrors.Increment()
-		e.stats.ReceiveErrors.ChecksumErrors.Increment()
-		s.decRef()
-		return
-	}
-
-	e.stack.Stats().TCP.ValidSegmentsReceived.Increment()
-	e.stats.SegmentsReceived.Increment()
-	if (s.flags & header.TCPFlagRst) != 0 {
-		e.stack.Stats().TCP.ResetsReceived.Increment()
-	}
-
-	e.enqueueSegment(s)
+	// TCP HandlePacket is not required anymore as inbound packets first
+	// land at the Dispatcher which then can either delivery using the
+	// worker go routine or directly do the invoke the tcp processing inline
+	// based on the state of the endpoint.
 }
 
-func (e *endpoint) enqueueSegment(s *segment) {
+func (e *endpoint) enqueueSegment(s *segment) bool {
 	// Send packet to worker goroutine.
-	if e.segmentQueue.enqueue(s) {
-		e.newSegmentWaker.Assert()
-	} else {
+	if !e.segmentQueue.enqueue(s) {
 		// The queue is full, so we drop the segment.
 		e.stack.Stats().DroppedPackets.Increment()
 		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
-		s.decRef()
+		return false
 	}
+	return true
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
@@ -2319,8 +2424,8 @@ func (e *endpoint) rcvWndScaleForHandshake() int {
 // updateRecentTimestamp updates the recent timestamp using the algorithm
 // described in https://tools.ietf.org/html/rfc7323#section-4.3
 func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
-	if e.sendTSOk && seqnum.Value(e.recentTS).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
-		e.recentTS = tsVal
+	if e.sendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
+		e.setRecentTimestamp(tsVal)
 	}
 }
 
@@ -2330,7 +2435,7 @@ func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value,
 func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) {
 	if synOpts.TS {
 		e.sendTSOk = true
-		e.recentTS = synOpts.TSVal
+		e.setRecentTimestamp(synOpts.TSVal)
 	}
 }
 
@@ -2419,7 +2524,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
 
 	// Endpoint TCP Option state.
 	s.SendTSOk = e.sendTSOk
-	s.RecentTS = e.recentTS
+	s.RecentTS = e.recentTimestamp()
 	s.TSOffset = e.tsOffset
 	s.SACKPermitted = e.sackPermitted
 	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
@@ -2526,9 +2631,7 @@ func (e *endpoint) initGSO() {
 // State implements tcpip.Endpoint.State. It exports the endpoint's protocol
 // state for diagnostics.
 func (e *endpoint) State() uint32 {
-	e.mu.Lock()
-	defer e.mu.Unlock()
-	return uint32(e.state)
+	return uint32(e.EndpointState())
 }
 
 // Info returns a copy of the endpoint info.
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 4b8d867bc..4a46f0ec5 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -16,6 +16,7 @@ package tcp
 
 import (
 	"fmt"
+	"sync/atomic"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/sync"
@@ -48,7 +49,7 @@ func (e *endpoint) beforeSave() {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
-	switch e.state {
+	switch e.EndpointState() {
 	case StateInitial, StateBound:
 		// TODO(b/138137272): this enumeration duplicates
 		// EndpointState.connected. remove it.
@@ -70,31 +71,30 @@ func (e *endpoint) beforeSave() {
 		fallthrough
 	case StateListen, StateConnecting:
 		e.drainSegmentLocked()
-		if e.state != StateClose && e.state != StateError {
+		if e.EndpointState() != StateClose && e.EndpointState() != StateError {
 			if !e.workerRunning {
 				panic("endpoint has no worker running in listen, connecting, or connected state")
 			}
 			break
 		}
-		fallthrough
 	case StateError, StateClose:
-		for (e.state == StateError || e.state == StateClose) && e.workerRunning {
+		for e.workerRunning {
 			e.mu.Unlock()
 			time.Sleep(100 * time.Millisecond)
 			e.mu.Lock()
 		}
 		if e.workerRunning {
-			panic("endpoint still has worker running in closed or error state")
+			panic(fmt.Sprintf("endpoint: %+v still has worker running in closed or error state", e.ID))
 		}
 	default:
-		panic(fmt.Sprintf("endpoint in unknown state %v", e.state))
+		panic(fmt.Sprintf("endpoint in unknown state %v", e.EndpointState()))
 	}
 
 	if e.waiterQueue != nil && !e.waiterQueue.IsEmpty() {
 		panic("endpoint still has waiters upon save")
 	}
 
-	if e.state != StateClose && !((e.state == StateBound || e.state == StateListen) == e.isPortReserved) {
+	if e.EndpointState() != StateClose && !((e.EndpointState() == StateBound || e.EndpointState() == StateListen) == e.isPortReserved) {
 		panic("endpoints which are not in the closed state must have a reserved port IFF they are in bound or listen state")
 	}
 }
@@ -135,7 +135,7 @@ func (e *endpoint) loadAcceptedChan(acceptedEndpoints []*endpoint) {
 
 // saveState is invoked by stateify.
 func (e *endpoint) saveState() EndpointState {
-	return e.state
+	return e.EndpointState()
 }
 
 // Endpoint loading must be done in the following ordering by their state, to
@@ -151,7 +151,8 @@ var connectingLoading sync.WaitGroup
 func (e *endpoint) loadState(state EndpointState) {
 	// This is to ensure that the loading wait groups include all applicable
 	// endpoints before any asynchronous calls to the Wait() methods.
-	if state.connected() {
+	// For restore purposes we treat TimeWait like a connected endpoint.
+	if state.connected() || state == StateTimeWait {
 		connectedLoading.Add(1)
 	}
 	switch state {
@@ -160,13 +161,14 @@ func (e *endpoint) loadState(state EndpointState) {
 	case StateConnecting, StateSynSent, StateSynRecv:
 		connectingLoading.Add(1)
 	}
-	e.state = state
+	// Directly update the state here rather than using e.setEndpointState
+	// as the endpoint is still being loaded and the stack reference to increment
+	// metrics is not yet initialized.
+	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
 }
 
 // afterLoad is invoked by stateify.
 func (e *endpoint) afterLoad() {
-	// Freeze segment queue before registering to prevent any segments
-	// from being delivered while it is being restored.
 	e.origEndpointState = e.state
 	// Restore the endpoint to InitialState as it will be moved to
 	// its origEndpointState during Resume.
@@ -180,7 +182,6 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	e.segmentQueue.setLimit(MaxUnprocessedSegments)
 	e.workMu.Init()
 	state := e.origEndpointState
-
 	switch state {
 	case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
 		var ss SendBufferSizeOption
@@ -276,7 +277,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 				listenLoading.Wait()
 				connectingLoading.Wait()
 				bind()
-				e.state = StateClose
+				e.setEndpointState(StateClose)
 				tcpip.AsyncLoading.Done()
 			}()
 		}
@@ -288,6 +289,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		e.stack.CompleteTransportEndpointCleanup(e)
 		tcpip.DeleteDanglingEndpoint(e)
 	}
+
 }
 
 // saveLastError is invoked by stateify.
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 9a8f64aa6..958c06fa7 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -21,6 +21,7 @@
 package tcp
 
 import (
+	"runtime"
 	"strings"
 	"time"
 
@@ -104,6 +105,7 @@ type protocol struct {
 	moderateReceiveBuffer      bool
 	tcpLingerTimeout           time.Duration
 	tcpTimeWaitTimeout         time.Duration
+	dispatcher                 *dispatcher
 }
 
 // Number returns the tcp protocol number.
@@ -134,6 +136,14 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 	return h.SourcePort(), h.DestinationPort(), nil
 }
 
+// QueuePacket queues packets targeted at an endpoint after hashing the packet
+// to a specific processing queue. Each queue is serviced by its own processor
+// goroutine which is responsible for dequeuing and doing full TCP dispatch of
+// the packet.
+func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+	p.dispatcher.queuePacket(r, ep, id, pkt)
+}
+
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
 //
@@ -330,5 +340,6 @@ func NewProtocol() stack.TransportProtocol {
 		availableCongestionControl: []string{ccReno, ccCubic},
 		tcpLingerTimeout:           DefaultTCPLingerTimeout,
 		tcpTimeWaitTimeout:         DefaultTCPTimeWaitTimeout,
+		dispatcher:                 newDispatcher(runtime.GOMAXPROCS(0)),
 	}
 }
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 05c8488f8..958f03ac1 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -169,19 +169,19 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		// We just received a FIN, our next state depends on whether we sent a
 		// FIN already or not.
 		r.ep.mu.Lock()
-		switch r.ep.state {
+		switch r.ep.EndpointState() {
 		case StateEstablished:
-			r.ep.state = StateCloseWait
+			r.ep.setEndpointState(StateCloseWait)
 		case StateFinWait1:
 			if s.flagIsSet(header.TCPFlagAck) {
 				// FIN-ACK, transition to TIME-WAIT.
-				r.ep.state = StateTimeWait
+				r.ep.setEndpointState(StateTimeWait)
 			} else {
 				// Simultaneous close, expecting a final ACK.
-				r.ep.state = StateClosing
+				r.ep.setEndpointState(StateClosing)
 			}
 		case StateFinWait2:
-			r.ep.state = StateTimeWait
+			r.ep.setEndpointState(StateTimeWait)
 		}
 		r.ep.mu.Unlock()
 
@@ -205,16 +205,16 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 	// shutdown states.
 	if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == r.ep.snd.sndNxt {
 		r.ep.mu.Lock()
-		switch r.ep.state {
+		switch r.ep.EndpointState() {
 		case StateFinWait1:
-			r.ep.state = StateFinWait2
+			r.ep.setEndpointState(StateFinWait2)
 			// Notify protocol goroutine that we have received an
 			// ACK to our FIN so that it can start the FIN_WAIT2
 			// timer to abort connection if the other side does
 			// not close within 2MSL.
 			r.ep.notifyProtocolGoroutine(notifyClose)
 		case StateClosing:
-			r.ep.state = StateTimeWait
+			r.ep.setEndpointState(StateTimeWait)
 		case StateLastAck:
 			r.ep.transitionToStateCloseLocked()
 		}
@@ -267,7 +267,6 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 	switch state {
 	case StateCloseWait, StateClosing, StateLastAck:
 		if !s.sequenceNumber.LessThanEq(r.rcvNxt) {
-			s.decRef()
 			// Just drop the segment as we have
 			// already received a FIN and this
 			// segment is after the sequence number
@@ -284,7 +283,6 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 		// trigger a RST.
 		endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
 		if rcvClosed && r.rcvNxt.LessThan(endDataSeq) {
-			s.decRef()
 			return true, tcpip.ErrConnectionAborted
 		}
 		if state == StateFinWait1 {
@@ -314,7 +312,6 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 		// the last actual data octet in a segment in
 		// which it occurs.
 		if closed && (!s.flagIsSet(header.TCPFlagFin) || s.sequenceNumber.Add(s.logicalLen()) != r.rcvNxt+1) {
-			s.decRef()
 			return true, tcpip.ErrConnectionAborted
 		}
 	}
@@ -336,7 +333,7 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 // r as they arrive. It is called by the protocol main loop.
 func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
 	r.ep.mu.RLock()
-	state := r.ep.state
+	state := r.ep.EndpointState()
 	closed := r.ep.closed
 	r.ep.mu.RUnlock()
 
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index fdff7ed81..b74b61e7d 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -705,17 +705,15 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 		}
 		seg.flags = header.TCPFlagAck | header.TCPFlagFin
 		segEnd = seg.sequenceNumber.Add(1)
-		// Transition to FIN-WAIT1 state since we're initiating an active close.
-		s.ep.mu.Lock()
-		switch s.ep.state {
+		// Update the state to reflect that we have now
+		// queued a FIN.
+		switch s.ep.EndpointState() {
 		case StateCloseWait:
-			// We've already received a FIN and are now sending our own. The
-			// sender is now awaiting a final ACK for this FIN.
-			s.ep.state = StateLastAck
+			s.ep.setEndpointState(StateLastAck)
 		default:
-			s.ep.state = StateFinWait1
+			s.ep.setEndpointState(StateFinWait1)
 		}
-		s.ep.mu.Unlock()
+
 	} else {
 		// We're sending a non-FIN segment.
 		if seg.flags&header.TCPFlagFin != 0 {
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 6edfa8dce..a9dfbe857 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -293,7 +293,6 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 		checker.SeqNum(uint32(c.IRS+1)),
 		checker.AckNum(uint32(iss)+1),
 		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
-
 	finHeaders := &context.Headers{
 		SrcPort: context.TestPort,
 		DstPort: context.StackPort,
@@ -459,6 +458,9 @@ func TestConnectResetAfterClose(t *testing.T) {
 		checker.IPv4(t, b,
 			checker.TCP(
 				checker.DstPort(context.TestPort),
+				// RST is always generated with sndNxt which if the FIN
+				// has been sent will be 1 higher than the sequence number
+				// of the FIN itself.
 				checker.SeqNum(uint32(c.IRS)+2),
 				checker.AckNum(0),
 				checker.TCPFlags(header.TCPFlagRst),
@@ -1500,6 +1502,9 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
 		checker.TCP(
 			checker.DstPort(context.TestPort),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+			// RST is always generated with sndNxt which if the FIN
+			// has been sent will be 1 higher than the sequence
+			// number of the FIN itself.
 			checker.SeqNum(uint32(c.IRS)+2),
 		))
 	// The RST puts the endpoint into an error state.
@@ -5441,6 +5446,7 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 		rawEP.SendPacketWithTS(b[start:start+mss], tsVal)
 		packetsSent++
 	}
+
 	// Resume the worker so that it only sees the packets once all of them
 	// are waiting to be read.
 	worker.ResumeWork()
@@ -5508,7 +5514,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 	stk := c.Stack()
 	// Set lower limits for auto-tuning tests. This is required because the
 	// test stops the worker which can cause packets to be dropped because
-	// the segment queue holding unprocessed packets is limited to 500.
+	// the segment queue holding unprocessed packets is limited to 300.
 	const receiveBufferSize = 80 << 10 // 80KB.
 	const maxReceiveBufferSize = receiveBufferSize * 10
 	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{1, receiveBufferSize, maxReceiveBufferSize}); err != nil {
@@ -5563,6 +5569,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 			totalSent += mss
 			packetsSent++
 		}
+
 		// Resume it so that it only sees the packets once all of them
 		// are waiting to be read.
 		worker.ResumeWork()
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 5d114d460..2f9821555 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -533,7 +533,7 @@ TEST_P(SocketInetLoopbackTest, TCPFinWait2Test_NoRandomSave) {
 
   // Sleep for a little over the linger timeout to reduce flakiness in
   // save/restore tests.
-  absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+  absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 2));
 
   ds.reset();
 
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 6b99c021d..33a5ac66c 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -814,6 +814,20 @@ TEST_P(TcpSocketTest, FullBuffer) {
   t_ = -1;
 }
 
+TEST_P(TcpSocketTest, PollAfterShutdown) {
+  ScopedThread client_thread([this]() {
+    EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceedsWithValue(0));
+    struct pollfd poll_fd = {s_, POLLIN | POLLERR | POLLHUP, 0};
+    EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+                SyscallSucceedsWithValue(1));
+  });
+
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceedsWithValue(0));
+  struct pollfd poll_fd = {t_, POLLIN | POLLERR | POLLHUP, 0};
+  EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+              SyscallSucceedsWithValue(1));
+}
+
 TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListener) {
   // Initialize address to the loopback one.
   sockaddr_storage addr =
-- 
cgit v1.2.3


From 95e9de31d20ee1c7262fe5870e10485a369e6497 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 14 Jan 2020 17:54:02 -0800
Subject: Address Nic's comments.

---
 pkg/tcpip/iptables/iptables.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 83d807a4d..605a71679 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -153,6 +153,8 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 			return false
 		case Stolen, Queue, Repeat, None, Jump, Return, Continue:
 			panic(fmt.Sprintf("Unimplemented verdict %v.", verdict))
+		default:
+			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
 		}
 	}
 
@@ -174,6 +176,8 @@ func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename stri
 			continue
 		case Stolen, Queue, Repeat, None, Jump, Return:
 			panic(fmt.Sprintf("Unimplemented verdict %v.", verdict))
+		default:
+			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
 		}
 	}
 
-- 
cgit v1.2.3


From 376a777c55680f8139313d87bd460248fd251941 Mon Sep 17 00:00:00 2001
From: "chris.zn" <chris.zn@antfin.com>
Date: Tue, 14 Jan 2020 18:59:33 +0800
Subject: Fix "unlock of unlocked mutex" crash when getting tty

This patch holds taskset.mu when getting tty. If we don't
do this, it may cause a "unlock of unlocked mutex" problem,
since signalHandlers may be replaced by CopyForExec() in
runSyscallAfterExecStop after the signalHandlers.mu has
been holded in TTY().

The problem is easy to reproduce with keeping to do "runsc ps".

The crash log is :

fatal error: sync: unlock of unlocked mutex

goroutine 5801304 [running]:
runtime.throw(0xfd019c, 0x1e)
        GOROOT/src/runtime/panic.go:774 +0x72 fp=0xc001ba47b0 sp=0xc001ba4780 pc=0x431702
sync.throw(0xfd019c, 0x1e)
        GOROOT/src/runtime/panic.go:760 +0x35 fp=0xc001ba47d0 sp=0xc001ba47b0 pc=0x431685
sync.(*Mutex).unlockSlow(0xc00cf94a30, 0xc0ffffffff)
        GOROOT/src/sync/mutex.go:196 +0xd6 fp=0xc001ba47f8 sp=0xc001ba47d0 pc=0x4707d6
sync.(*Mutex).Unlock(0xc00cf94a30)
        GOROOT/src/sync/mutex.go:190 +0x48 fp=0xc001ba4818 sp=0xc001ba47f8 pc=0x4706e8
gvisor.dev/gvisor/pkg/sentry/kernel.(*ThreadGroup).TTY(0xc011a9e800, 0x0)
        pkg/sentry/kernel/tty.go:38 +0x88 fp=0xc001ba4868 sp=0xc001ba4818 pc=0x835fa8
gvisor.dev/gvisor/pkg/sentry/control.Processes(0xc00025ae00, 0xc013e397c0, 0x40, 0xc0137b9800, 0x1, 0x7f292e9a4cc0)
        pkg/sentry/control/proc.go:366 +0x355 fp=0xc001ba49a0 sp=0xc001ba4868 pc=0x9ac4a5
gvisor.dev/gvisor/runsc/boot.(*containerManager).Processes(0xc0003b62c0, 0xc0051423d0, 0xc0137b9800, 0x0, 0x0)
        runsc/boot/controller.go:228 +0xdf fp=0xc001ba49e8 sp=0xc001ba49a0 pc=0xaf06cf

Signed-off-by: chris.zn <chris.zn@antfin.com>
---
 pkg/sentry/kernel/tty.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go
index 464d2306a..d0e0810e8 100644
--- a/pkg/sentry/kernel/tty.go
+++ b/pkg/sentry/kernel/tty.go
@@ -33,6 +33,8 @@ type TTY struct {
 // TTY returns the thread group's controlling terminal. If nil, there is no
 // controlling terminal.
 func (tg *ThreadGroup) TTY() *TTY {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
 	tg.signalHandlers.mu.Lock()
 	defer tg.signalHandlers.mu.Unlock()
 	return tg.tty
-- 
cgit v1.2.3


From ff78a721700f8b7d3c8dae14fc14c04f3a82b970 Mon Sep 17 00:00:00 2001
From: lubinszARM <34124929+lubinszARM@users.noreply.github.com>
Date: Tue, 14 Jan 2020 22:22:45 -0800
Subject: enable pkg/sentry/arch to support arm64 basically

Signed-off-by: Bin Lu <bin.lu@arm.com>
Change-Id: I9cce23db4e5caec82ce42b4970fdb7f7e8c08f1d
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/773 from lubinszARM:pr_arch_basic 3fe2fd8e6286766bbe489ef971dce204f924feba
PiperOrigin-RevId: 289795569
---
 pkg/sentry/arch/BUILD                 |   6 +
 pkg/sentry/arch/arch_aarch64.go       | 293 ++++++++++++++++++++++++++++++++++
 pkg/sentry/arch/arch_arm64.go         | 266 ++++++++++++++++++++++++++++++
 pkg/sentry/arch/arch_state_aarch64.go |  38 +++++
 pkg/sentry/arch/arch_state_x86.go     |   2 +
 pkg/sentry/arch/registers.proto       |  37 +++++
 pkg/sentry/arch/signal.go             | 250 +++++++++++++++++++++++++++++
 pkg/sentry/arch/signal_amd64.go       | 230 --------------------------
 pkg/sentry/arch/signal_arm64.go       | 126 +++++++++++++++
 pkg/sentry/arch/signal_stack.go       |   2 +-
 pkg/sentry/arch/syscalls_arm64.go     |  62 +++++++
 11 files changed, 1081 insertions(+), 231 deletions(-)
 create mode 100644 pkg/sentry/arch/arch_aarch64.go
 create mode 100644 pkg/sentry/arch/arch_arm64.go
 create mode 100644 pkg/sentry/arch/arch_state_aarch64.go
 create mode 100644 pkg/sentry/arch/signal.go
 create mode 100644 pkg/sentry/arch/signal_arm64.go
 create mode 100644 pkg/sentry/arch/syscalls_arm64.go

diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index ae3e364cd..65f22af2b 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -9,17 +9,23 @@ go_library(
     srcs = [
         "aligned.go",
         "arch.go",
+        "arch_aarch64.go",
         "arch_amd64.go",
         "arch_amd64.s",
+        "arch_arm64.go",
+        "arch_state_aarch64.go",
         "arch_state_x86.go",
         "arch_x86.go",
         "auxv.go",
+        "signal.go",
         "signal_act.go",
         "signal_amd64.go",
+        "signal_arm64.go",
         "signal_info.go",
         "signal_stack.go",
         "stack.go",
         "syscalls_amd64.go",
+        "syscalls_arm64.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/arch",
     visibility = ["//:sandbox"],
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
new file mode 100644
index 000000000..ea4dedbdf
--- /dev/null
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -0,0 +1,293 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package arch
+
+import (
+	"fmt"
+	"io"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/log"
+	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+	// SyscallWidth is the width of insturctions.
+	SyscallWidth = 4
+)
+
+// aarch64FPState is aarch64 floating point state.
+type aarch64FPState []byte
+
+// initAarch64FPState (defined in asm files) sets up initial state.
+func initAarch64FPState(data *FloatingPointData) {
+	// TODO(gvisor.dev/issue/1238): floating-point is not supported.
+}
+
+func newAarch64FPStateSlice() []byte {
+	return alignedBytes(4096, 32)[:4096]
+}
+
+// newAarch64FPState returns an initialized floating point state.
+//
+// The returned state is large enough to store all floating point state
+// supported by host, even if the app won't use much of it due to a restricted
+// FeatureSet. Since they may still be able to see state not advertised by
+// CPUID we must ensure it does not contain any sentry state.
+func newAarch64FPState() aarch64FPState {
+	f := aarch64FPState(newAarch64FPStateSlice())
+	initAarch64FPState(f.FloatingPointData())
+	return f
+}
+
+// fork creates and returns an identical copy of the aarch64 floating point state.
+func (f aarch64FPState) fork() aarch64FPState {
+	n := aarch64FPState(newAarch64FPStateSlice())
+	copy(n, f)
+	return n
+}
+
+// FloatingPointData returns the raw data pointer.
+func (f aarch64FPState) FloatingPointData() *FloatingPointData {
+	return (*FloatingPointData)(&f[0])
+}
+
+// NewFloatingPointData returns a new floating point data blob.
+//
+// This is primarily for use in tests.
+func NewFloatingPointData() *FloatingPointData {
+	return (*FloatingPointData)(&(newAarch64FPState()[0]))
+}
+
+// State contains the common architecture bits for aarch64 (the build tag of this
+// file ensures it's only built on aarch64).
+type State struct {
+	// The system registers.
+	Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"`
+
+	// Our floating point state.
+	aarch64FPState `state:"wait"`
+
+	// FeatureSet is a pointer to the currently active feature set.
+	FeatureSet *cpuid.FeatureSet
+}
+
+// Proto returns a protobuf representation of the system registers in State.
+func (s State) Proto() *rpb.Registers {
+	regs := &rpb.ARM64Registers{
+		R0:     s.Regs.Regs[0],
+		R1:     s.Regs.Regs[1],
+		R2:     s.Regs.Regs[2],
+		R3:     s.Regs.Regs[3],
+		R4:     s.Regs.Regs[4],
+		R5:     s.Regs.Regs[5],
+		R6:     s.Regs.Regs[6],
+		R7:     s.Regs.Regs[7],
+		R8:     s.Regs.Regs[8],
+		R9:     s.Regs.Regs[9],
+		R10:    s.Regs.Regs[10],
+		R11:    s.Regs.Regs[11],
+		R12:    s.Regs.Regs[12],
+		R13:    s.Regs.Regs[13],
+		R14:    s.Regs.Regs[14],
+		R15:    s.Regs.Regs[15],
+		R16:    s.Regs.Regs[16],
+		R17:    s.Regs.Regs[17],
+		R18:    s.Regs.Regs[18],
+		R19:    s.Regs.Regs[19],
+		R20:    s.Regs.Regs[20],
+		R21:    s.Regs.Regs[21],
+		R22:    s.Regs.Regs[22],
+		R23:    s.Regs.Regs[23],
+		R24:    s.Regs.Regs[24],
+		R25:    s.Regs.Regs[25],
+		R26:    s.Regs.Regs[26],
+		R27:    s.Regs.Regs[27],
+		R28:    s.Regs.Regs[28],
+		R29:    s.Regs.Regs[29],
+		R30:    s.Regs.Regs[30],
+		Sp:     s.Regs.Sp,
+		Pc:     s.Regs.Pc,
+		Pstate: s.Regs.Pstate,
+	}
+	return &rpb.Registers{Arch: &rpb.Registers_Arm64{Arm64: regs}}
+}
+
+// Fork creates and returns an identical copy of the state.
+func (s *State) Fork() State {
+	// TODO(gvisor.dev/issue/1238): floating-point is not supported.
+	return State{
+		Regs:       s.Regs,
+		FeatureSet: s.FeatureSet,
+	}
+}
+
+// StateData implements Context.StateData.
+func (s *State) StateData() *State {
+	return s
+}
+
+// CPUIDEmulate emulates a cpuid instruction.
+func (s *State) CPUIDEmulate(l log.Logger) {
+	// TODO(gvisor.dev/issue/1255): cpuid is not supported.
+}
+
+// SingleStep implements Context.SingleStep.
+func (s *State) SingleStep() bool {
+	return false
+}
+
+// SetSingleStep enables single stepping.
+func (s *State) SetSingleStep() {
+	// Set the trap flag.
+	// TODO(gvisor.dev/issue/1239): ptrace single-step is not supported.
+}
+
+// ClearSingleStep enables single stepping.
+func (s *State) ClearSingleStep() {
+	// Clear the trap flag.
+	// TODO(gvisor.dev/issue/1239): ptrace single-step is not supported.
+}
+
+// RegisterMap returns a map of all registers.
+func (s *State) RegisterMap() (map[string]uintptr, error) {
+	return map[string]uintptr{
+		"R0":     uintptr(s.Regs.Regs[0]),
+		"R1":     uintptr(s.Regs.Regs[1]),
+		"R2":     uintptr(s.Regs.Regs[2]),
+		"R3":     uintptr(s.Regs.Regs[3]),
+		"R4":     uintptr(s.Regs.Regs[4]),
+		"R5":     uintptr(s.Regs.Regs[5]),
+		"R6":     uintptr(s.Regs.Regs[6]),
+		"R7":     uintptr(s.Regs.Regs[7]),
+		"R8":     uintptr(s.Regs.Regs[8]),
+		"R9":     uintptr(s.Regs.Regs[9]),
+		"R10":    uintptr(s.Regs.Regs[10]),
+		"R11":    uintptr(s.Regs.Regs[11]),
+		"R12":    uintptr(s.Regs.Regs[12]),
+		"R13":    uintptr(s.Regs.Regs[13]),
+		"R14":    uintptr(s.Regs.Regs[14]),
+		"R15":    uintptr(s.Regs.Regs[15]),
+		"R16":    uintptr(s.Regs.Regs[16]),
+		"R17":    uintptr(s.Regs.Regs[17]),
+		"R18":    uintptr(s.Regs.Regs[18]),
+		"R19":    uintptr(s.Regs.Regs[19]),
+		"R20":    uintptr(s.Regs.Regs[20]),
+		"R21":    uintptr(s.Regs.Regs[21]),
+		"R22":    uintptr(s.Regs.Regs[22]),
+		"R23":    uintptr(s.Regs.Regs[23]),
+		"R24":    uintptr(s.Regs.Regs[24]),
+		"R25":    uintptr(s.Regs.Regs[25]),
+		"R26":    uintptr(s.Regs.Regs[26]),
+		"R27":    uintptr(s.Regs.Regs[27]),
+		"R28":    uintptr(s.Regs.Regs[28]),
+		"R29":    uintptr(s.Regs.Regs[29]),
+		"R30":    uintptr(s.Regs.Regs[30]),
+		"Sp":     uintptr(s.Regs.Sp),
+		"Pc":     uintptr(s.Regs.Pc),
+		"Pstate": uintptr(s.Regs.Pstate),
+	}, nil
+}
+
+// PtraceGetRegs implements Context.PtraceGetRegs.
+func (s *State) PtraceGetRegs(dst io.Writer) (int, error) {
+	return dst.Write(binary.Marshal(nil, usermem.ByteOrder, s.ptraceGetRegs()))
+}
+
+func (s *State) ptraceGetRegs() syscall.PtraceRegs {
+	return s.Regs
+}
+
+var ptraceRegsSize = int(binary.Size(syscall.PtraceRegs{}))
+
+// PtraceSetRegs implements Context.PtraceSetRegs.
+func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
+	var regs syscall.PtraceRegs
+	buf := make([]byte, ptraceRegsSize)
+	if _, err := io.ReadFull(src, buf); err != nil {
+		return 0, err
+	}
+	binary.Unmarshal(buf, usermem.ByteOrder, &regs)
+	s.Regs = regs
+	return ptraceRegsSize, nil
+}
+
+// PtraceGetFPRegs implements Context.PtraceGetFPRegs.
+func (s *State) PtraceGetFPRegs(dst io.Writer) (int, error) {
+	// TODO(gvisor.dev/issue/1238): floating-point is not supported.
+	return 0, nil
+}
+
+// PtraceSetFPRegs implements Context.PtraceSetFPRegs.
+func (s *State) PtraceSetFPRegs(src io.Reader) (int, error) {
+	// TODO(gvisor.dev/issue/1238): floating-point is not supported.
+	return 0, nil
+}
+
+// Register sets defined in include/uapi/linux/elf.h.
+const (
+	_NT_PRSTATUS = 1
+	_NT_PRFPREG  = 2
+)
+
+// PtraceGetRegSet implements Context.PtraceGetRegSet.
+func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
+	switch regset {
+	case _NT_PRSTATUS:
+		if maxlen < ptraceRegsSize {
+			return 0, syserror.EFAULT
+		}
+		return s.PtraceGetRegs(dst)
+	default:
+		return 0, syserror.EINVAL
+	}
+}
+
+// PtraceSetRegSet implements Context.PtraceSetRegSet.
+func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
+	switch regset {
+	case _NT_PRSTATUS:
+		if maxlen < ptraceRegsSize {
+			return 0, syserror.EFAULT
+		}
+		return s.PtraceSetRegs(src)
+	default:
+		return 0, syserror.EINVAL
+	}
+}
+
+// FullRestore indicates whether a full restore is required.
+func (s *State) FullRestore() bool {
+	return false
+}
+
+// New returns a new architecture context.
+func New(arch Arch, fs *cpuid.FeatureSet) Context {
+	switch arch {
+	case ARM64:
+		return &context64{
+			State{
+				FeatureSet: fs,
+			},
+		}
+	}
+	panic(fmt.Sprintf("unknown architecture %v", arch))
+}
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
new file mode 100644
index 000000000..0d5b7d317
--- /dev/null
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -0,0 +1,266 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+	"fmt"
+	"math/rand"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// Host specifies the host architecture.
+const Host = ARM64
+
+// These constants come directly from Linux.
+const (
+	// maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux
+	// for a 64-bit process.
+	maxAddr64 usermem.Addr = (1 << 48)
+
+	// maxStackRand64 is the maximum randomization to apply to the stack.
+	// It is defined by arch/arm64/mm/mmap.c:(STACK_RND_MASK << PAGE_SHIFT) in Linux.
+	maxStackRand64 = 0x3ffff << 12 // 16 GB
+
+	// maxMmapRand64 is the maximum randomization to apply to the mmap
+	// layout. It is defined by arch/arm64/mm/mmap.c:arch_mmap_rnd in Linux.
+	maxMmapRand64 = (1 << 33) * usermem.PageSize
+
+	// minGap64 is the minimum gap to leave at the top of the address space
+	// for the stack. It is defined by arch/arm64/mm/mmap.c:MIN_GAP in Linux.
+	minGap64 = (128 << 20) + maxStackRand64
+
+	// preferredPIELoadAddr is the standard Linux position-independent
+	// executable base load address. It is ELF_ET_DYN_BASE in Linux.
+	//
+	// The Platform {Min,Max}UserAddress() may preclude loading at this
+	// address. See other preferredFoo comments below.
+	preferredPIELoadAddr usermem.Addr = maxAddr64 / 6 * 5
+)
+
+// These constants are selected as heuristics to help make the Platform's
+// potentially limited address space conform as closely to Linux as possible.
+const (
+	preferredTopDownAllocMin usermem.Addr = 0x7e8000000000
+	preferredAllocationGap                = 128 << 30 // 128 GB
+	preferredTopDownBaseMin               = preferredTopDownAllocMin + preferredAllocationGap
+
+	// minMmapRand64 is the smallest we are willing to make the
+	// randomization to stay above preferredTopDownBaseMin.
+	minMmapRand64 = (1 << 18) * usermem.PageSize
+)
+
+// context64 represents an ARM64 context.
+type context64 struct {
+	State
+}
+
+// Arch implements Context.Arch.
+func (c *context64) Arch() Arch {
+	return ARM64
+}
+
+// Fork returns an exact copy of this context.
+func (c *context64) Fork() Context {
+	return &context64{
+		State: c.State.Fork(),
+	}
+}
+
+// General purpose registers usage on Arm64:
+// R0...R7: parameter/result registers.
+// R8: indirect result location register.
+// R9...R15: temporary rgisters.
+// R16: the first intra-procedure-call scratch register.
+// R17: the second intra-procedure-call scratch register.
+// R18: the platform register.
+// R19...R28: callee-saved registers.
+// R29: the frame pointer.
+// R30: the link register.
+
+// Return returns the current syscall return value.
+func (c *context64) Return() uintptr {
+	return uintptr(c.Regs.Regs[0])
+}
+
+// SetReturn sets the syscall return value.
+func (c *context64) SetReturn(value uintptr) {
+	c.Regs.Regs[0] = uint64(value)
+}
+
+// IP returns the current instruction pointer.
+func (c *context64) IP() uintptr {
+	return uintptr(c.Regs.Pc)
+}
+
+// SetIP sets the current instruction pointer.
+func (c *context64) SetIP(value uintptr) {
+	c.Regs.Pc = uint64(value)
+}
+
+// Stack returns the current stack pointer.
+func (c *context64) Stack() uintptr {
+	return uintptr(c.Regs.Sp)
+}
+
+// SetStack sets the current stack pointer.
+func (c *context64) SetStack(value uintptr) {
+	c.Regs.Sp = uint64(value)
+}
+
+// TLS returns the current TLS pointer.
+func (c *context64) TLS() uintptr {
+	// TODO(gvisor.dev/issue/1238): TLS is not supported.
+	// MRS_TPIDR_EL0
+	return 0
+}
+
+// SetTLS sets the current TLS pointer. Returns false if value is invalid.
+func (c *context64) SetTLS(value uintptr) bool {
+	// TODO(gvisor.dev/issue/1238): TLS is not supported.
+	// MSR_TPIDR_EL0
+	return false
+}
+
+// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP.
+func (c *context64) SetRSEQInterruptedIP(value uintptr) {
+	c.Regs.Regs[3] = uint64(value)
+}
+
+// Native returns the native type for the given val.
+func (c *context64) Native(val uintptr) interface{} {
+	v := uint64(val)
+	return &v
+}
+
+// Value returns the generic val for the given native type.
+func (c *context64) Value(val interface{}) uintptr {
+	return uintptr(*val.(*uint64))
+}
+
+// Width returns the byte width of this architecture.
+func (c *context64) Width() uint {
+	return 8
+}
+
+// FeatureSet returns the FeatureSet in use.
+func (c *context64) FeatureSet() *cpuid.FeatureSet {
+	return c.State.FeatureSet
+}
+
+// mmapRand returns a random adjustment for randomizing an mmap layout.
+func mmapRand(max uint64) usermem.Addr {
+	return usermem.Addr(rand.Int63n(int64(max))).RoundDown()
+}
+
+// NewMmapLayout implements Context.NewMmapLayout consistently with Linux.
+func (c *context64) NewMmapLayout(min, max usermem.Addr, r *limits.LimitSet) (MmapLayout, error) {
+	min, ok := min.RoundUp()
+	if !ok {
+		return MmapLayout{}, syscall.EINVAL
+	}
+	if max > maxAddr64 {
+		max = maxAddr64
+	}
+	max = max.RoundDown()
+
+	if min > max {
+		return MmapLayout{}, syscall.EINVAL
+	}
+
+	stackSize := r.Get(limits.Stack)
+
+	// MAX_GAP in Linux.
+	maxGap := (max / 6) * 5
+	gap := usermem.Addr(stackSize.Cur)
+	if gap < minGap64 {
+		gap = minGap64
+	}
+	if gap > maxGap {
+		gap = maxGap
+	}
+	defaultDir := MmapTopDown
+	if stackSize.Cur == limits.Infinity {
+		defaultDir = MmapBottomUp
+	}
+
+	topDownMin := max - gap - maxMmapRand64
+	maxRand := usermem.Addr(maxMmapRand64)
+	if topDownMin < preferredTopDownBaseMin {
+		// Try to keep TopDownBase above preferredTopDownBaseMin by
+		// shrinking maxRand.
+		maxAdjust := maxRand - minMmapRand64
+		needAdjust := preferredTopDownBaseMin - topDownMin
+		if needAdjust <= maxAdjust {
+			maxRand -= needAdjust
+		}
+	}
+
+	rnd := mmapRand(uint64(maxRand))
+	l := MmapLayout{
+		MinAddr: min,
+		MaxAddr: max,
+		// TASK_UNMAPPED_BASE in Linux.
+		BottomUpBase:     (max/3 + rnd).RoundDown(),
+		TopDownBase:      (max - gap - rnd).RoundDown(),
+		DefaultDirection: defaultDir,
+		// We may have reduced the maximum randomization to keep
+		// TopDownBase above preferredTopDownBaseMin while maintaining
+		// our stack gap. Stack allocations must use that max
+		// randomization to avoiding eating into the gap.
+		MaxStackRand: uint64(maxRand),
+	}
+
+	// Final sanity check on the layout.
+	if !l.Valid() {
+		panic(fmt.Sprintf("Invalid MmapLayout: %+v", l))
+	}
+
+	return l, nil
+}
+
+// PIELoadAddress implements Context.PIELoadAddress.
+func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr {
+	base := preferredPIELoadAddr
+	max, ok := base.AddLength(maxMmapRand64)
+	if !ok {
+		panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base))
+	}
+
+	if max > l.MaxAddr {
+		// preferredPIELoadAddr won't fit; fall back to the standard
+		// Linux behavior of 2/3 of TopDownBase. TSAN won't like this.
+		//
+		// Don't bother trying to shrink the randomization for now.
+		base = l.TopDownBase / 3 * 2
+	}
+
+	return base + mmapRand(maxMmapRand64)
+}
+
+// PtracePeekUser implements Context.PtracePeekUser.
+func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
+	// TODO(gvisor.dev/issue/1239): Full ptrace supporting for Arm64.
+	return c.Native(0), nil
+}
+
+// PtracePokeUser implements Context.PtracePokeUser.
+func (c *context64) PtracePokeUser(addr, data uintptr) error {
+	// TODO(gvisor.dev/issue/1239): Full ptrace supporting for Arm64.
+	return nil
+}
diff --git a/pkg/sentry/arch/arch_state_aarch64.go b/pkg/sentry/arch/arch_state_aarch64.go
new file mode 100644
index 000000000..0136a85ad
--- /dev/null
+++ b/pkg/sentry/arch/arch_state_aarch64.go
@@ -0,0 +1,38 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package arch
+
+import (
+	"syscall"
+)
+
+type syscallPtraceRegs struct {
+	Regs   [31]uint64
+	Sp     uint64
+	Pc     uint64
+	Pstate uint64
+}
+
+// saveRegs is invoked by stateify.
+func (s *State) saveRegs() syscallPtraceRegs {
+	return syscallPtraceRegs(s.Regs)
+}
+
+// loadRegs is invoked by stateify.
+func (s *State) loadRegs(r syscallPtraceRegs) {
+	s.Regs = syscall.PtraceRegs(r)
+}
diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go
index 9061fcc86..84f11b0d1 100644
--- a/pkg/sentry/arch/arch_state_x86.go
+++ b/pkg/sentry/arch/arch_state_x86.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build amd64 i386
+
 package arch
 
 import (
diff --git a/pkg/sentry/arch/registers.proto b/pkg/sentry/arch/registers.proto
index 9dc83e241..60c027aab 100644
--- a/pkg/sentry/arch/registers.proto
+++ b/pkg/sentry/arch/registers.proto
@@ -48,8 +48,45 @@ message AMD64Registers {
   uint64 gs_base = 27;
 }
 
+message ARM64Registers {
+  uint64 r0 = 1;
+  uint64 r1 = 2;
+  uint64 r2 = 3;
+  uint64 r3 = 4;
+  uint64 r4 = 5;
+  uint64 r5 = 6;
+  uint64 r6 = 7;
+  uint64 r7 = 8;
+  uint64 r8 = 9;
+  uint64 r9 = 10;
+  uint64 r10 = 11;
+  uint64 r11 = 12;
+  uint64 r12 = 13;
+  uint64 r13 = 14;
+  uint64 r14 = 15;
+  uint64 r15 = 16;
+  uint64 r16 = 17;
+  uint64 r17 = 18;
+  uint64 r18 = 19;
+  uint64 r19 = 20;
+  uint64 r20 = 21;
+  uint64 r21 = 22;
+  uint64 r22 = 23;
+  uint64 r23 = 24;
+  uint64 r24 = 25;
+  uint64 r25 = 26;
+  uint64 r26 = 27;
+  uint64 r27 = 28;
+  uint64 r28 = 29;
+  uint64 r29 = 30;
+  uint64 r30 = 31;
+  uint64 sp = 32;
+  uint64 pc = 33;
+  uint64 pstate = 34;
+}
 message Registers {
   oneof arch {
     AMD64Registers amd64 = 1;
+    ARM64Registers arm64 = 2;
   }
 }
diff --git a/pkg/sentry/arch/signal.go b/pkg/sentry/arch/signal.go
new file mode 100644
index 000000000..402e46025
--- /dev/null
+++ b/pkg/sentry/arch/signal.go
@@ -0,0 +1,250 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// SignalAct represents the action that should be taken when a signal is
+// delivered, and is equivalent to struct sigaction.
+//
+// +stateify savable
+type SignalAct struct {
+	Handler  uint64
+	Flags    uint64
+	Restorer uint64 // Only used on amd64.
+	Mask     linux.SignalSet
+}
+
+// SerializeFrom implements NativeSignalAct.SerializeFrom.
+func (s *SignalAct) SerializeFrom(other *SignalAct) {
+	*s = *other
+}
+
+// DeserializeTo implements NativeSignalAct.DeserializeTo.
+func (s *SignalAct) DeserializeTo(other *SignalAct) {
+	*other = *s
+}
+
+// SignalStack represents information about a user stack, and is equivalent to
+// stack_t.
+//
+// +stateify savable
+type SignalStack struct {
+	Addr  uint64
+	Flags uint32
+	_     uint32
+	Size  uint64
+}
+
+// SerializeFrom implements NativeSignalStack.SerializeFrom.
+func (s *SignalStack) SerializeFrom(other *SignalStack) {
+	*s = *other
+}
+
+// DeserializeTo implements NativeSignalStack.DeserializeTo.
+func (s *SignalStack) DeserializeTo(other *SignalStack) {
+	*other = *s
+}
+
+// SignalInfo represents information about a signal being delivered, and is
+// equivalent to struct siginfo in linux kernel(linux/include/uapi/asm-generic/siginfo.h).
+//
+// +stateify savable
+type SignalInfo struct {
+	Signo int32 // Signal number
+	Errno int32 // Errno value
+	Code  int32 // Signal code
+	_     uint32
+
+	// struct siginfo::_sifields is a union. In SignalInfo, fields in the union
+	// are accessed through methods.
+	//
+	// For reference, here is the definition of _sifields: (_sigfault._trapno,
+	// which does not exist on x86, omitted for clarity)
+	//
+	// union {
+	// 	int _pad[SI_PAD_SIZE];
+	//
+	// 	/* kill() */
+	// 	struct {
+	// 		__kernel_pid_t _pid;	/* sender's pid */
+	// 		__ARCH_SI_UID_T _uid;	/* sender's uid */
+	// 	} _kill;
+	//
+	// 	/* POSIX.1b timers */
+	// 	struct {
+	// 		__kernel_timer_t _tid;	/* timer id */
+	// 		int _overrun;		/* overrun count */
+	// 		char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)];
+	// 		sigval_t _sigval;	/* same as below */
+	// 		int _sys_private;       /* not to be passed to user */
+	// 	} _timer;
+	//
+	// 	/* POSIX.1b signals */
+	// 	struct {
+	// 		__kernel_pid_t _pid;	/* sender's pid */
+	// 		__ARCH_SI_UID_T _uid;	/* sender's uid */
+	// 		sigval_t _sigval;
+	// 	} _rt;
+	//
+	// 	/* SIGCHLD */
+	// 	struct {
+	// 		__kernel_pid_t _pid;	/* which child */
+	// 		__ARCH_SI_UID_T _uid;	/* sender's uid */
+	// 		int _status;		/* exit code */
+	// 		__ARCH_SI_CLOCK_T _utime;
+	// 		__ARCH_SI_CLOCK_T _stime;
+	// 	} _sigchld;
+	//
+	// 	/* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
+	// 	struct {
+	// 		void *_addr; /* faulting insn/memory ref. */
+	// 		short _addr_lsb; /* LSB of the reported address */
+	// 	} _sigfault;
+	//
+	// 	/* SIGPOLL */
+	// 	struct {
+	// 		__ARCH_SI_BAND_T _band;	/* POLL_IN, POLL_OUT, POLL_MSG */
+	// 		int _fd;
+	// 	} _sigpoll;
+	//
+	// 	/* SIGSYS */
+	// 	struct {
+	// 		void *_call_addr; /* calling user insn */
+	// 		int _syscall;	/* triggering system call number */
+	// 		unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
+	// 	} _sigsys;
+	// } _sifields;
+	//
+	// _sifields is padded so that the size of siginfo is SI_MAX_SIZE = 128
+	// bytes.
+	Fields [128 - 16]byte
+}
+
+// FixSignalCodeForUser fixes up si_code.
+//
+// The si_code we get from Linux may contain the kernel-specific code in the
+// top 16 bits if it's positive (e.g., from ptrace). Linux's
+// copy_siginfo_to_user does
+//     err |= __put_user((short)from->si_code, &to->si_code);
+// to mask out those bits and we need to do the same.
+func (s *SignalInfo) FixSignalCodeForUser() {
+	if s.Code > 0 {
+		s.Code &= 0x0000ffff
+	}
+}
+
+// Pid returns the si_pid field.
+func (s *SignalInfo) Pid() int32 {
+	return int32(usermem.ByteOrder.Uint32(s.Fields[0:4]))
+}
+
+// SetPid mutates the si_pid field.
+func (s *SignalInfo) SetPid(val int32) {
+	usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val))
+}
+
+// Uid returns the si_uid field.
+func (s *SignalInfo) Uid() int32 {
+	return int32(usermem.ByteOrder.Uint32(s.Fields[4:8]))
+}
+
+// SetUid mutates the si_uid field.
+func (s *SignalInfo) SetUid(val int32) {
+	usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val))
+}
+
+// Sigval returns the sigval field, which is aliased to both si_int and si_ptr.
+func (s *SignalInfo) Sigval() uint64 {
+	return usermem.ByteOrder.Uint64(s.Fields[8:16])
+}
+
+// SetSigval mutates the sigval field.
+func (s *SignalInfo) SetSigval(val uint64) {
+	usermem.ByteOrder.PutUint64(s.Fields[8:16], val)
+}
+
+// TimerID returns the si_timerid field.
+func (s *SignalInfo) TimerID() linux.TimerID {
+	return linux.TimerID(usermem.ByteOrder.Uint32(s.Fields[0:4]))
+}
+
+// SetTimerID sets the si_timerid field.
+func (s *SignalInfo) SetTimerID(val linux.TimerID) {
+	usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val))
+}
+
+// Overrun returns the si_overrun field.
+func (s *SignalInfo) Overrun() int32 {
+	return int32(usermem.ByteOrder.Uint32(s.Fields[4:8]))
+}
+
+// SetOverrun sets the si_overrun field.
+func (s *SignalInfo) SetOverrun(val int32) {
+	usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val))
+}
+
+// Addr returns the si_addr field.
+func (s *SignalInfo) Addr() uint64 {
+	return usermem.ByteOrder.Uint64(s.Fields[0:8])
+}
+
+// SetAddr sets the si_addr field.
+func (s *SignalInfo) SetAddr(val uint64) {
+	usermem.ByteOrder.PutUint64(s.Fields[0:8], val)
+}
+
+// Status returns the si_status field.
+func (s *SignalInfo) Status() int32 {
+	return int32(usermem.ByteOrder.Uint32(s.Fields[8:12]))
+}
+
+// SetStatus mutates the si_status field.
+func (s *SignalInfo) SetStatus(val int32) {
+	usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val))
+}
+
+// CallAddr returns the si_call_addr field.
+func (s *SignalInfo) CallAddr() uint64 {
+	return usermem.ByteOrder.Uint64(s.Fields[0:8])
+}
+
+// SetCallAddr mutates the si_call_addr field.
+func (s *SignalInfo) SetCallAddr(val uint64) {
+	usermem.ByteOrder.PutUint64(s.Fields[0:8], val)
+}
+
+// Syscall returns the si_syscall field.
+func (s *SignalInfo) Syscall() int32 {
+	return int32(usermem.ByteOrder.Uint32(s.Fields[8:12]))
+}
+
+// SetSyscall mutates the si_syscall field.
+func (s *SignalInfo) SetSyscall(val int32) {
+	usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val))
+}
+
+// Arch returns the si_arch field.
+func (s *SignalInfo) Arch() uint32 {
+	return usermem.ByteOrder.Uint32(s.Fields[12:16])
+}
+
+// SetArch mutates the si_arch field.
+func (s *SignalInfo) SetArch(val uint32) {
+	usermem.ByteOrder.PutUint32(s.Fields[12:16], val)
+}
diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go
index febd6f9b9..1e4f9c3c2 100644
--- a/pkg/sentry/arch/signal_amd64.go
+++ b/pkg/sentry/arch/signal_amd64.go
@@ -26,236 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
-// SignalAct represents the action that should be taken when a signal is
-// delivered, and is equivalent to struct sigaction on 64-bit x86.
-//
-// +stateify savable
-type SignalAct struct {
-	Handler  uint64
-	Flags    uint64
-	Restorer uint64
-	Mask     linux.SignalSet
-}
-
-// SerializeFrom implements NativeSignalAct.SerializeFrom.
-func (s *SignalAct) SerializeFrom(other *SignalAct) {
-	*s = *other
-}
-
-// DeserializeTo implements NativeSignalAct.DeserializeTo.
-func (s *SignalAct) DeserializeTo(other *SignalAct) {
-	*other = *s
-}
-
-// SignalStack represents information about a user stack, and is equivalent to
-// stack_t on 64-bit x86.
-//
-// +stateify savable
-type SignalStack struct {
-	Addr  uint64
-	Flags uint32
-	_     uint32
-	Size  uint64
-}
-
-// SerializeFrom implements NativeSignalStack.SerializeFrom.
-func (s *SignalStack) SerializeFrom(other *SignalStack) {
-	*s = *other
-}
-
-// DeserializeTo implements NativeSignalStack.DeserializeTo.
-func (s *SignalStack) DeserializeTo(other *SignalStack) {
-	*other = *s
-}
-
-// SignalInfo represents information about a signal being delivered, and is
-// equivalent to struct siginfo on 64-bit x86.
-//
-// +stateify savable
-type SignalInfo struct {
-	Signo int32 // Signal number
-	Errno int32 // Errno value
-	Code  int32 // Signal code
-	_     uint32
-
-	// struct siginfo::_sifields is a union. In SignalInfo, fields in the union
-	// are accessed through methods.
-	//
-	// For reference, here is the definition of _sifields: (_sigfault._trapno,
-	// which does not exist on x86, omitted for clarity)
-	//
-	// union {
-	// 	int _pad[SI_PAD_SIZE];
-	//
-	// 	/* kill() */
-	// 	struct {
-	// 		__kernel_pid_t _pid;	/* sender's pid */
-	// 		__ARCH_SI_UID_T _uid;	/* sender's uid */
-	// 	} _kill;
-	//
-	// 	/* POSIX.1b timers */
-	// 	struct {
-	// 		__kernel_timer_t _tid;	/* timer id */
-	// 		int _overrun;		/* overrun count */
-	// 		char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)];
-	// 		sigval_t _sigval;	/* same as below */
-	// 		int _sys_private;       /* not to be passed to user */
-	// 	} _timer;
-	//
-	// 	/* POSIX.1b signals */
-	// 	struct {
-	// 		__kernel_pid_t _pid;	/* sender's pid */
-	// 		__ARCH_SI_UID_T _uid;	/* sender's uid */
-	// 		sigval_t _sigval;
-	// 	} _rt;
-	//
-	// 	/* SIGCHLD */
-	// 	struct {
-	// 		__kernel_pid_t _pid;	/* which child */
-	// 		__ARCH_SI_UID_T _uid;	/* sender's uid */
-	// 		int _status;		/* exit code */
-	// 		__ARCH_SI_CLOCK_T _utime;
-	// 		__ARCH_SI_CLOCK_T _stime;
-	// 	} _sigchld;
-	//
-	// 	/* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
-	// 	struct {
-	// 		void *_addr; /* faulting insn/memory ref. */
-	// 		short _addr_lsb; /* LSB of the reported address */
-	// 	} _sigfault;
-	//
-	// 	/* SIGPOLL */
-	// 	struct {
-	// 		__ARCH_SI_BAND_T _band;	/* POLL_IN, POLL_OUT, POLL_MSG */
-	// 		int _fd;
-	// 	} _sigpoll;
-	//
-	// 	/* SIGSYS */
-	// 	struct {
-	// 		void *_call_addr; /* calling user insn */
-	// 		int _syscall;	/* triggering system call number */
-	// 		unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
-	// 	} _sigsys;
-	// } _sifields;
-	//
-	// _sifields is padded so that the size of siginfo is SI_MAX_SIZE = 128
-	// bytes.
-	Fields [128 - 16]byte
-}
-
-// FixSignalCodeForUser fixes up si_code.
-//
-// The si_code we get from Linux may contain the kernel-specific code in the
-// top 16 bits if it's positive (e.g., from ptrace). Linux's
-// copy_siginfo_to_user does
-//     err |= __put_user((short)from->si_code, &to->si_code);
-// to mask out those bits and we need to do the same.
-func (s *SignalInfo) FixSignalCodeForUser() {
-	if s.Code > 0 {
-		s.Code &= 0x0000ffff
-	}
-}
-
-// Pid returns the si_pid field.
-func (s *SignalInfo) Pid() int32 {
-	return int32(usermem.ByteOrder.Uint32(s.Fields[0:4]))
-}
-
-// SetPid mutates the si_pid field.
-func (s *SignalInfo) SetPid(val int32) {
-	usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val))
-}
-
-// Uid returns the si_uid field.
-func (s *SignalInfo) Uid() int32 {
-	return int32(usermem.ByteOrder.Uint32(s.Fields[4:8]))
-}
-
-// SetUid mutates the si_uid field.
-func (s *SignalInfo) SetUid(val int32) {
-	usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val))
-}
-
-// Sigval returns the sigval field, which is aliased to both si_int and si_ptr.
-func (s *SignalInfo) Sigval() uint64 {
-	return usermem.ByteOrder.Uint64(s.Fields[8:16])
-}
-
-// SetSigval mutates the sigval field.
-func (s *SignalInfo) SetSigval(val uint64) {
-	usermem.ByteOrder.PutUint64(s.Fields[8:16], val)
-}
-
-// TimerID returns the si_timerid field.
-func (s *SignalInfo) TimerID() linux.TimerID {
-	return linux.TimerID(usermem.ByteOrder.Uint32(s.Fields[0:4]))
-}
-
-// SetTimerID sets the si_timerid field.
-func (s *SignalInfo) SetTimerID(val linux.TimerID) {
-	usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val))
-}
-
-// Overrun returns the si_overrun field.
-func (s *SignalInfo) Overrun() int32 {
-	return int32(usermem.ByteOrder.Uint32(s.Fields[4:8]))
-}
-
-// SetOverrun sets the si_overrun field.
-func (s *SignalInfo) SetOverrun(val int32) {
-	usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val))
-}
-
-// Addr returns the si_addr field.
-func (s *SignalInfo) Addr() uint64 {
-	return usermem.ByteOrder.Uint64(s.Fields[0:8])
-}
-
-// SetAddr sets the si_addr field.
-func (s *SignalInfo) SetAddr(val uint64) {
-	usermem.ByteOrder.PutUint64(s.Fields[0:8], val)
-}
-
-// Status returns the si_status field.
-func (s *SignalInfo) Status() int32 {
-	return int32(usermem.ByteOrder.Uint32(s.Fields[8:12]))
-}
-
-// SetStatus mutates the si_status field.
-func (s *SignalInfo) SetStatus(val int32) {
-	usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val))
-}
-
-// CallAddr returns the si_call_addr field.
-func (s *SignalInfo) CallAddr() uint64 {
-	return usermem.ByteOrder.Uint64(s.Fields[0:8])
-}
-
-// SetCallAddr mutates the si_call_addr field.
-func (s *SignalInfo) SetCallAddr(val uint64) {
-	usermem.ByteOrder.PutUint64(s.Fields[0:8], val)
-}
-
-// Syscall returns the si_syscall field.
-func (s *SignalInfo) Syscall() int32 {
-	return int32(usermem.ByteOrder.Uint32(s.Fields[8:12]))
-}
-
-// SetSyscall mutates the si_syscall field.
-func (s *SignalInfo) SetSyscall(val int32) {
-	usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val))
-}
-
-// Arch returns the si_arch field.
-func (s *SignalInfo) Arch() uint32 {
-	return usermem.ByteOrder.Uint32(s.Fields[12:16])
-}
-
-// SetArch mutates the si_arch field.
-func (s *SignalInfo) SetArch(val uint32) {
-	usermem.ByteOrder.PutUint32(s.Fields[12:16], val)
-}
-
 // SignalContext64 is equivalent to struct sigcontext, the type passed as the
 // second argument to signal handlers set by signal(2).
 type SignalContext64 struct {
diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go
new file mode 100644
index 000000000..7d0e98935
--- /dev/null
+++ b/pkg/sentry/arch/signal_arm64.go
@@ -0,0 +1,126 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+	"encoding/binary"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// SignalContext64 is equivalent to struct sigcontext, the type passed as the
+// second argument to signal handlers set by signal(2).
+type SignalContext64 struct {
+	FaultAddr uint64
+	Regs      [31]uint64
+	Sp        uint64
+	Pc        uint64
+	Pstate    uint64
+	_pad      [8]byte // __attribute__((__aligned__(16)))
+	Reserved  [4096]uint8
+}
+
+// UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h).
+type UContext64 struct {
+	Flags  uint64
+	Link   *UContext64
+	Stack  SignalStack
+	Sigset linux.SignalSet
+	// glibc uses a 1024-bit sigset_t
+	_pad [(1024 - 64) / 8]byte
+	// sigcontext must be aligned to 16-byte
+	_pad2 [8]byte
+	// last for future expansion
+	MContext SignalContext64
+}
+
+// NewSignalAct implements Context.NewSignalAct.
+func (c *context64) NewSignalAct() NativeSignalAct {
+	return &SignalAct{}
+}
+
+// NewSignalStack implements Context.NewSignalStack.
+func (c *context64) NewSignalStack() NativeSignalStack {
+	return &SignalStack{}
+}
+
+// SignalSetup implements Context.SignalSetup.
+func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt *SignalStack, sigset linux.SignalSet) error {
+	sp := st.Bottom
+
+	if !(alt.IsEnabled() && sp == alt.Top()) {
+		sp -= 128
+	}
+
+	// Construct the UContext64 now since we need its size.
+	uc := &UContext64{
+		Flags: 0,
+		Stack: *alt,
+		MContext: SignalContext64{
+			Regs:   c.Regs.Regs,
+			Sp:     c.Regs.Sp,
+			Pc:     c.Regs.Pc,
+			Pstate: c.Regs.Pstate,
+		},
+		Sigset: sigset,
+	}
+
+	ucSize := binary.Size(uc)
+	if ucSize < 0 {
+		panic("can't get size of UContext64")
+	}
+	// st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128.
+	frameSize := int(st.Arch.Width()) + ucSize + 128
+	frameBottom := (sp-usermem.Addr(frameSize)) & ^usermem.Addr(15) - 8
+	sp = frameBottom + usermem.Addr(frameSize)
+	st.Bottom = sp
+
+	// Prior to proceeding, figure out if the frame will exhaust the range
+	// for the signal stack. This is not allowed, and should immediately
+	// force signal delivery (reverting to the default handler).
+	if act.IsOnStack() && alt.IsEnabled() && !alt.Contains(frameBottom) {
+		return syscall.EFAULT
+	}
+
+	// Adjust the code.
+	info.FixSignalCodeForUser()
+
+	// Set up the stack frame.
+	infoAddr, err := st.Push(info)
+	if err != nil {
+		return err
+	}
+	ucAddr, err := st.Push(uc)
+	if err != nil {
+		return err
+	}
+
+	// Set up registers.
+	c.Regs.Sp = uint64(st.Bottom)
+	c.Regs.Pc = act.Handler
+	c.Regs.Regs[0] = uint64(info.Signo)
+	c.Regs.Regs[1] = uint64(infoAddr)
+	c.Regs.Regs[2] = uint64(ucAddr)
+
+	return nil
+}
+
+// SignalRestore implements Context.SignalRestore.
+// Only used on intel.
+func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) {
+	return 0, SignalStack{}, nil
+}
diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go
index 5a3228113..d324da705 100644
--- a/pkg/sentry/arch/signal_stack.go
+++ b/pkg/sentry/arch/signal_stack.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
+// +build i386 amd64 arm64
 
 package arch
 
diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go
new file mode 100644
index 000000000..00d5ef461
--- /dev/null
+++ b/pkg/sentry/arch/syscalls_arm64.go
@@ -0,0 +1,62 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package arch
+
+const restartSyscallNr = uintptr(128)
+
+// SyscallNo returns the syscall number according to the 64-bit convention.
+func (c *context64) SyscallNo() uintptr {
+	return uintptr(c.Regs.Regs[8])
+}
+
+// SyscallArgs provides syscall arguments according to the 64-bit convention.
+//
+// Due to the way addresses are mapped for the sentry this binary *must* be
+// built in 64-bit mode. So we can just assume the syscall numbers that come
+// back match the expected host system call numbers.
+// General purpose registers usage on Arm64:
+// R0...R7: parameter/result registers.
+// R8: indirect result location register.
+// R9...R15: temporary registers.
+// R16: the first intra-procedure-call scratch register.
+// R17: the second intra-procedure-call scratch register.
+// R18: the platform register.
+// R19...R28: callee-saved registers.
+// R29: the frame pointer.
+// R30: the link register.
+func (c *context64) SyscallArgs() SyscallArguments {
+	return SyscallArguments{
+		SyscallArgument{Value: uintptr(c.Regs.Regs[0])},
+		SyscallArgument{Value: uintptr(c.Regs.Regs[1])},
+		SyscallArgument{Value: uintptr(c.Regs.Regs[2])},
+		SyscallArgument{Value: uintptr(c.Regs.Regs[3])},
+		SyscallArgument{Value: uintptr(c.Regs.Regs[4])},
+		SyscallArgument{Value: uintptr(c.Regs.Regs[5])},
+	}
+}
+
+// RestartSyscall implements Context.RestartSyscall.
+func (c *context64) RestartSyscall() {
+	c.Regs.Pc -= SyscallWidth
+	c.Regs.Regs[8] = uint64(restartSyscallNr)
+}
+
+// RestartSyscallWithRestartBlock implements Context.RestartSyscallWithRestartBlock.
+func (c *context64) RestartSyscallWithRestartBlock() {
+	c.Regs.Pc -= SyscallWidth
+	c.Regs.Regs[8] = uint64(restartSyscallNr)
+}
-- 
cgit v1.2.3


From f874723e64bd8a2e747bb336e0b6b8f0da1f044a Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 15 Jan 2020 11:17:25 -0800
Subject: Bump SO_SNDBUF for fdbased endpoint used by runsc.

Updates #231

PiperOrigin-RevId: 289897881
---
 benchmarks/tcp/tcp_proxy.go | 14 +++++++-------
 runsc/sandbox/network.go    | 23 ++++++++++++++---------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go
index dc96add66..72ada5700 100644
--- a/benchmarks/tcp/tcp_proxy.go
+++ b/benchmarks/tcp/tcp_proxy.go
@@ -84,8 +84,8 @@ func (netImpl) printStats() {
 }
 
 const (
-	nicID      = 1       // Fixed.
-	rcvBufSize = 4 << 20 // 1MB.
+	nicID   = 1       // Fixed.
+	bufSize = 4 << 20 // 4MB.
 )
 
 type netstackImpl struct {
@@ -125,13 +125,13 @@ func setupNetwork(ifaceName string, numChannels int) (fds []int, err error) {
 			}
 
 			// RAW Sockets by default have a very small SO_RCVBUF of 256KB,
-			// up it to at least 1MB to reduce packet drops.
-			if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, rcvBufSize); err != nil {
-				return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", rcvBufSize, err)
+			// up it to at least 4MB to reduce packet drops.
+			if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, bufSize); err != nil {
+				return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", bufSize, err)
 			}
 
-			if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, rcvBufSize); err != nil {
-				return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", rcvBufSize, err)
+			if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, bufSize); err != nil {
+				return nil, fmt.Errorf("setsockopt(..., SO_SNDBUF, %v,..) = %v", bufSize, err)
 			}
 
 			if !*swgso && *gso != 0 {
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index be8b72b3e..ff48f5646 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -321,16 +321,21 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (
 		}
 	}
 
-	// Use SO_RCVBUFFORCE because on linux the receive buffer for an
-	// AF_PACKET socket is capped by "net.core.rmem_max". rmem_max
-	// defaults to a unusually low value of 208KB. This is too low
-	// for gVisor to be able to receive packets at high throughputs
-	// without incurring packet drops.
-	const rcvBufSize = 4 << 20 // 4MB.
-
-	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
-		return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err)
+	// Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because on linux the receive/send buffer
+	// for an AF_PACKET socket is capped by "net.core.rmem_max/wmem_max".
+	// wmem_max/rmem_max default to a unusually low value of 208KB. This is too low
+	// for gVisor to be able to receive packets at high throughputs without
+	// incurring packet drops.
+	const bufSize = 4 << 20 // 4MB.
+
+	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, bufSize); err != nil {
+		return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", bufSize, err)
 	}
+
+	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUFFORCE, bufSize); err != nil {
+		return nil, fmt.Errorf("failed to increase socket snd buffer to %d: %v", bufSize, err)
+	}
+
 	return &socketEntry{deviceFile, gsoMaxSize}, nil
 }
 
-- 
cgit v1.2.3


From 275ac8ce1debf89a22eb1150df3bf9ba7a0bc9ba Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 15 Jan 2020 13:20:14 -0800
Subject: Bugfix to terminate the protocol loop on StateError.

The change to introduce worker goroutines can cause the endpoint
to transition to StateError and we should terminate the loop rather
than let the endpoint transition to a CLOSED state as we do
in case the endpoint enters TIME-WAIT/CLOSED. Moving to a closed
state would cause the actual error to not be propagated to
any read() calls etc.

PiperOrigin-RevId: 289923568
---
 pkg/tcpip/transport/tcp/connect.go    | 50 +++++++++++++++++++++++------------
 pkg/tcpip/transport/tcp/dispatcher.go |  6 +++++
 pkg/tcpip/transport/tcp/endpoint.go   |  7 +----
 3 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index f3896715b..a2f384384 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -1019,10 +1019,6 @@ func (e *endpoint) handleSegments(fastPath bool) *tcpip.Error {
 		cont, err := e.handleSegment(s)
 		if err != nil {
 			s.decRef()
-			e.mu.Lock()
-			e.setEndpointState(StateError)
-			e.HardError = err
-			e.mu.Unlock()
 			return err
 		}
 		if !cont {
@@ -1414,30 +1410,50 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 
 	// Main loop. Handle segments until both send and receive ends of the
 	// connection have completed.
+	cleanupOnError := func(err *tcpip.Error) {
+		e.mu.Lock()
+		e.workerCleanup = true
+		if err != nil {
+			e.resetConnectionLocked(err)
+		}
+		// Lock released below.
+		epilogue()
+	}
 
+loop:
 	for e.EndpointState() != StateTimeWait && e.EndpointState() != StateClose && e.EndpointState() != StateError {
 		e.mu.Unlock()
 		e.workMu.Unlock()
 		v, _ := s.Fetch(true)
 		e.workMu.Lock()
-		// We need to double check here because the notification
-		// maybe stale by the time we got around to processing it.
+
+		// We need to double check here because the notification maybe
+		// stale by the time we got around to processing it.
+		//
 		// NOTE: since we now hold the workMu the processors cannot
-		// change the state of the endpoint so it' safe to proceed
+		// change the state of the endpoint so it's safe to proceed
 		// after this check.
-		if e.EndpointState() == StateTimeWait || e.EndpointState() == StateClose || e.EndpointState() == StateError {
+		switch e.EndpointState() {
+		case StateError:
+			// If the endpoint has already transitioned to an ERROR
+			// state just pass nil here as any reset that may need
+			// to be sent etc should already have been done and we
+			// just want to terminate the loop and cleanup the
+			// endpoint.
+			cleanupOnError(nil)
+			return nil
+		case StateTimeWait:
+			fallthrough
+		case StateClose:
 			e.mu.Lock()
-			break
-		}
-		if err := funcs[v].f(); err != nil {
+			break loop
+		default:
+			if err := funcs[v].f(); err != nil {
+				cleanupOnError(err)
+				return nil
+			}
 			e.mu.Lock()
-			e.workerCleanup = true
-			e.resetConnectionLocked(err)
-			// Lock released below.
-			epilogue()
-			return nil
 		}
-		e.mu.Lock()
 	}
 
 	state := e.EndpointState()
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
index a72f0c379..e18012ac0 100644
--- a/pkg/tcpip/transport/tcp/dispatcher.go
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -119,6 +119,12 @@ func (p *processor) handleSegments() {
 			// direct delivery to ensure low latency and avoid
 			// scheduler interactions.
 			if err := ep.handleSegments(true /* fastPath */); err != nil || ep.EndpointState() == StateClose {
+				// Send any active resets if required.
+				if err != nil {
+					ep.mu.Lock()
+					ep.resetConnectionLocked(err)
+					ep.mu.Unlock()
+				}
 				ep.notifyProtocolGoroutine(notifyTickleWorker)
 				ep.workMu.Unlock()
 				continue
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 1799c6e10..4797f11d1 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2019,18 +2019,13 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 			// If we're fully closed and we have unread data we need to abort
 			// the connection with a RST.
 			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
-				// Move the socket to error state immediately.
-				// This is done redundantly because in case of
-				// save/restore on a Shutdown/Close() the socket
-				// state needs to indicate the error otherwise
-				// save file will show the socket in established
-				// state even though snd/rcv are closed.
 				e.mu.Unlock()
 				// Try to send an active reset immediately if the
 				// work mutex is available.
 				if e.workMu.TryLock() {
 					e.mu.Lock()
 					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+					e.notifyProtocolGoroutine(notifyTickleWorker)
 					e.mu.Unlock()
 					e.workMu.Unlock()
 				} else {
-- 
cgit v1.2.3


From 7b7ce29af326ccd247ee5225e9b5b55a9d0330ce Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Wed, 15 Jan 2020 14:24:55 -0800
Subject: Update commandline and get local runs working.

PiperOrigin-RevId: 289937063
---
 benchmarks/README.md                               | 126 ++++++++++-----------
 benchmarks/harness/__init__.py                     |   2 +-
 benchmarks/harness/machine.py                      |   8 +-
 .../harness/machine_producers/machine_producer.py  |  21 ++++
 benchmarks/runner/BUILD                            |  10 ++
 benchmarks/runner/__init__.py                      |  59 +++-------
 benchmarks/runner/commands.py                      |  84 ++++++++++++++
 benchmarks/runner/runner_test.py                   |   2 +-
 benchmarks/suites/http.py                          |   2 +-
 benchmarks/workloads/BUILD                         |  40 +++----
 benchmarks/workloads/ab/BUILD                      |   5 +-
 benchmarks/workloads/absl/BUILD                    |   5 +-
 benchmarks/workloads/curl/BUILD                    |   6 +-
 benchmarks/workloads/ffmpeg/BUILD                  |   6 +-
 benchmarks/workloads/fio/BUILD                     |   5 +-
 benchmarks/workloads/httpd/BUILD                   |   6 +-
 benchmarks/workloads/iperf/BUILD                   |   5 +-
 benchmarks/workloads/netcat/BUILD                  |   6 +-
 benchmarks/workloads/nginx/BUILD                   |   6 +-
 benchmarks/workloads/node/BUILD                    |   6 +-
 benchmarks/workloads/node_template/BUILD           |   6 +-
 benchmarks/workloads/redis/BUILD                   |   6 +-
 benchmarks/workloads/redisbenchmark/BUILD          |   5 +-
 benchmarks/workloads/ruby/BUILD                    |  13 +++
 benchmarks/workloads/ruby_template/BUILD           |   7 +-
 benchmarks/workloads/sleep/BUILD                   |   6 +-
 benchmarks/workloads/sysbench/BUILD                |   5 +-
 benchmarks/workloads/syscall/BUILD                 |   5 +-
 benchmarks/workloads/tensorflow/BUILD              |   6 +-
 benchmarks/workloads/true/BUILD                    |   7 +-
 30 files changed, 303 insertions(+), 173 deletions(-)
 create mode 100644 benchmarks/runner/commands.py

diff --git a/benchmarks/README.md b/benchmarks/README.md
index ad44cd6ac..ff21614c5 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -6,66 +6,55 @@ These scripts are tools for collecting performance data for Docker-based tests.
 
 The scripts assume the following:
 
-*   You have a local machine with bazel installed.
-*   You have some machine(s) with docker installed. These machines will be
-    refered to as the "Environment".
-*   Environment machines have the runtime(s) under test installed, such that you
-    can run docker with a command like: `docker run --runtime=$RUNTIME
-    your/image`.
-*   You are able to login to machines in the environment with the local machine
-    via ssh and the user for ssh can run docker commands without using `sudo`.
+*   There are two sets of machines: one where the scripts will be run
+    (controller) and one or more machines on which docker containers will be run
+    (environment).
+*   The controller machine must have bazel installed along with this source
+    code. You should be able to run a command like `bazel run :benchmarks --
+    --list`
+*   Environment machines must have docker and the required runtimes installed.
+    More specifically, you should be able to run a command like: `docker run
+    --runtime=$RUNTIME your/image`.
+*   The controller has ssh private key which can be used to login to environment
+    machines and run docker commands without using `sudo`. This is not required
+    if running locally via the `run-local` command.
 *   The docker daemon on each of your environment machines is listening on
     `unix:///var/run/docker.sock` (docker's default).
 
 For configuring the environment manually, consult the
 [dockerd documentation][dockerd].
 
-## Environment
-
-All benchmarks require a user defined yaml file describe the environment. These
-files are of the form:
-
-```yaml
-machine1: local
-machine2:
-  hostname: 100.100.100.100
-  username: username
-  key_path: ~/private_keyfile
-  key_password: passphrase
-machine3:
-  hostname: 100.100.100.101
-  username: username
-  key_path: ~/private_keyfile
-  key_password: passphrase
-```
+## Running benchmarks
 
-The yaml file defines an environment with three machines named `machine1`,
-`machine2` and `machine3`. `machine1` is the local machine, `machine2` and
-`machine3` are remote machines. Both `machine2` and `machine3` should be
-reachable by `ssh`. For example, the command `ssh -i ~/private_keyfile
-username@100.100.100.100` (using the passphrase `passphrase`) should connect to
-`machine2`.
+Run the following from the benchmarks directory:
 
-The above is an example only. Machines should be uniform, since they are treated
-as such by the tests. Machines must also be accessible to each other via their
-default routes. Furthermore, some benchmarks will meaningless if running on the
-local machine, such as density.
+```bash
+bazel run :benchmarks -- run-local startup
 
-For remote machines, `hostname`, `key_path`, and `username` are required and
-others are optional. In addition key files must be generated
-[using the instrcutions below](#generating-ssh-keys).
+...
+method,metric,result
+startup.empty,startup_time_ms,652.5772
+startup.node,startup_time_ms,1654.4042000000002
+startup.ruby,startup_time_ms,1429.835
+```
 
-The above yaml file can be checked for correctness with the `validate` command
-in the top level perf.py script:
+The above command ran the startup benchmark locally, which consists of three
+benchmarks (empty, node, and ruby). Benchmark tools ran it on the default
+runtime, runc. Running on another installed runtime, like say runsc, is as
+simple as:
 
-`bazel run :benchmarks -- validate $PWD/examples/localhost.yaml`
+```bash
+bazel run :benchmakrs -- run-local startup --runtime=runsc
+```
 
-## Running benchmarks
+There is help: ``bash bash bazel run :benchmarks -- --help bazel
+run :benchmarks -- run-local --help` ``
 
 To list available benchmarks, use the `list` commmand:
 
 ```bash
 bazel run :benchmarks -- list
+ls
 
 ...
 Benchmark: sysbench.cpu
@@ -75,24 +64,44 @@ Metrics: events_per_second
     :param max_prime: The maximum prime number to search.
 ```
 
-To run benchmarks, use the `run` command. For example, to run the sysbench
-benchmark above:
+You can choose benchmarks by name or regex like:
 
 ```bash
-bazel run :benchmarks -- run --env $PWD/examples/localhost.yaml sysbench.cpu
+bazel run :benchmarks -- run-local startup.node
+...
+metric,result
+startup_time_ms,1671.7178000000001
+
+```
+
+or
+
+```bash
+bazel run :benchmarks -- run-local s
+...
+method,metric,result
+startup.empty,startup_time_ms,1792.8292
+startup.node,startup_time_ms,3113.5274
+startup.ruby,startup_time_ms,3025.2424
+sysbench.cpu,cpu_events_per_second,12661.47
+sysbench.memory,memory_ops_per_second,7228268.44
+sysbench.mutex,mutex_time,17.4835
+sysbench.mutex,mutex_latency,3496.7
+sysbench.mutex,mutex_deviation,0.04
+syscall.syscall,syscall_time_ns,2065.0
 ```
 
 You can run parameterized benchmarks, for example to run with different
 runtimes:
 
 ```bash
-bazel run :benchmarks -- run --env $PWD/examples/localhost.yaml --runtime=runc --runtime=runsc sysbench.cpu
+bazel run :benchmarks -- run-local --runtime=runc --runtime=runsc sysbench.cpu
 ```
 
 Or with different parameters:
 
 ```bash
-bazel run :benchmarks -- run --env $PWD/examples/localhost.yaml --max_prime=10 --max_prime=100 sysbench.cpu
+bazel run :benchmarks -- run-local --max_prime=10 --max_prime=100 sysbench.cpu
 ```
 
 ## Writing benchmarks
@@ -121,7 +130,7 @@ The harness requires workloads to run. These are all available in the
 
 In general, a workload consists of a Dockerfile to build it (while these are not
 hermetic, in general they should be as fixed and isolated as possible), some
-parses for output if required, parser tests and sample data. Provided the test
+parsers for output if required, parser tests and sample data. Provided the test
 is named after the workload package and contains a function named `sample`, this
 variable will be used to automatically mock workload output when the `--mock`
 flag is provided to the main tool.
@@ -149,24 +158,5 @@ To write a new benchmark, open a module in the `suites` directory and use the
 above signature. You should add a descriptive doc string to describe what your
 benchmark is and any test centric arguments.
 
-## Generating SSH Keys
-
-The scripts only support RSA Keys, and ssh library used in paramiko. Paramiko
-only supports RSA keys that look like the following (PEM format):
-
-```bash
-$ cat /path/to/ssh/key
-
------BEGIN RSA PRIVATE KEY-----
-...private key text...
------END RSA PRIVATE KEY-----
-
-```
-
-To generate ssh keys in PEM format, use the [`-t rsa -m PEM -b 4096`][RSA-keys].
-option.
-
 [dockerd]: https://docs.docker.com/engine/reference/commandline/dockerd/
 [docker-py]: https://docker-py.readthedocs.io/en/stable/
-[paramiko]: http://docs.paramiko.org/en/2.4/api/client.html
-[RSA-keys]: https://serverfault.com/questions/939909/ssh-keygen-does-not-create-rsa-private-key
diff --git a/benchmarks/harness/__init__.py b/benchmarks/harness/__init__.py
index a7f34da9e..7b96d1666 100644
--- a/benchmarks/harness/__init__.py
+++ b/benchmarks/harness/__init__.py
@@ -18,7 +18,7 @@ import os
 # LOCAL_WORKLOADS_PATH defines the path to use for local workloads. This is a
 # format string that accepts a single string parameter.
 LOCAL_WORKLOADS_PATH = os.path.join(
-    os.path.dirname(__file__), "../workloads/{}")
+    os.path.dirname(__file__), "../workloads/{}/tar.tar")
 
 # REMOTE_WORKLOADS_PATH defines the path to use for storing the workloads on the
 # remote host. This is a format string that accepts a single string parameter.
diff --git a/benchmarks/harness/machine.py b/benchmarks/harness/machine.py
index 66b719b63..af037dbcc 100644
--- a/benchmarks/harness/machine.py
+++ b/benchmarks/harness/machine.py
@@ -160,15 +160,17 @@ class LocalMachine(Machine):
     stdout, stderr = process.communicate()
     return stdout.decode("utf-8"), stderr.decode("utf-8")
 
-  def read(self, path: str) -> str:
+  def read(self, path: str) -> bytes:
     # Read the exact path locally.
     return open(path, "r").read()
 
   def pull(self, workload: str) -> str:
     # Run the docker build command locally.
     logging.info("Building %s@%s locally...", workload, self._name)
-    self.run("docker build --tag={} {}".format(
-        workload, harness.LOCAL_WORKLOADS_PATH.format(workload)))
+    with open(harness.LOCAL_WORKLOADS_PATH.format(workload),
+              "rb") as dockerfile:
+      self._docker_client.images.build(
+          fileobj=dockerfile, tag=workload, custom_context=True)
     return workload  # Workload is the tag.
 
   def container(self, image: str, **kwargs) -> container.Container:
diff --git a/benchmarks/harness/machine_producers/machine_producer.py b/benchmarks/harness/machine_producers/machine_producer.py
index 124ee14cc..f5591c026 100644
--- a/benchmarks/harness/machine_producers/machine_producer.py
+++ b/benchmarks/harness/machine_producers/machine_producer.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Abstract types."""
 
+import threading
 from typing import List
 
 from benchmarks.harness import machine
@@ -28,3 +29,23 @@ class MachineProducer:
   def release_machines(self, machine_list: List[machine.Machine]):
     """Releases the given set of machines."""
     raise NotImplementedError
+
+
+class LocalMachineProducer(MachineProducer):
+  """Produces Local Machines."""
+
+  def __init__(self, limit: int):
+    self.limit_sem = threading.Semaphore(value=limit)
+
+  def get_machines(self, num_machines: int) -> List[machine.Machine]:
+    """Returns the request number of MockMachines."""
+
+    self.limit_sem.acquire()
+    return [machine.LocalMachine("local") for _ in range(num_machines)]
+
+  def release_machines(self, machine_list: List[machine.MockMachine]):
+    """No-op."""
+    if not machine_list:
+      raise ValueError("Cannot release an empty list!")
+    self.limit_sem.release()
+    machine_list.clear()
diff --git a/benchmarks/runner/BUILD b/benchmarks/runner/BUILD
index de24824cc..e1b2ea550 100644
--- a/benchmarks/runner/BUILD
+++ b/benchmarks/runner/BUILD
@@ -10,7 +10,9 @@ py_library(
     ],
     visibility = ["//benchmarks:__pkg__"],
     deps = [
+        ":commands",
         "//benchmarks/harness:benchmark_driver",
+        "//benchmarks/harness/machine_producers:machine_producer",
         "//benchmarks/harness/machine_producers:mock_producer",
         "//benchmarks/harness/machine_producers:yaml_producer",
         "//benchmarks/suites",
@@ -30,6 +32,14 @@ py_library(
     ],
 )
 
+py_library(
+    name = "commands",
+    srcs = ["commands.py"],
+    deps = [
+        requirement("click", True),
+    ],
+)
+
 py_test(
     name = "runner_test",
     srcs = ["runner_test.py"],
diff --git a/benchmarks/runner/__init__.py b/benchmarks/runner/__init__.py
index 9bf9cfd65..6f56704d8 100644
--- a/benchmarks/runner/__init__.py
+++ b/benchmarks/runner/__init__.py
@@ -28,8 +28,10 @@ import click
 
 from benchmarks import suites
 from benchmarks.harness import benchmark_driver
+from benchmarks.harness.machine_producers import machine_producer
 from benchmarks.harness.machine_producers import mock_producer
 from benchmarks.harness.machine_producers import yaml_producer
+from benchmarks.runner import commands
 
 
 @click.group()
@@ -100,30 +102,22 @@ def list_all(method):
     print("\n")
 
 
-# pylint: disable=too-many-arguments
-# pylint: disable=too-many-branches
-# pylint: disable=too-many-locals
-@runner.command(
-    context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
+@runner.command("run-local", commands.LocalCommand)
 @click.pass_context
-@click.argument("method")
-@click.option("--mock/--no-mock", default=False, help="Mock the machines.")
-@click.option("--env", default=None, help="Specify a yaml file with machines.")
-@click.option(
-    "--runtime", default=["runc"], help="The runtime to use.", multiple=True)
-@click.option("--metric", help="The metric to extract.", multiple=True)
-@click.option(
-    "--runs", default=1, help="The number of times to run each benchmark.")
-@click.option(
-    "--stat",
-    default="median",
-    help="How to aggregate the data from all runs."
-    "\nmedian - returns the median of all runs (default)"
-    "\nall - returns all results comma separated"
-    "\nmeanstd - returns result as mean,std")
-# pylint: disable=too-many-statements
-def run(ctx, method: str, runs: int, env: str, mock: bool, runtime: List[str],
-        metric: List[str], stat: str, **kwargs):
+def run_local(ctx, limit: float, **kwargs):
+  """Runs benchmarks locally."""
+  run(ctx, machine_producer.LocalMachineProducer(limit=limit), **kwargs)
+
+
+@runner.command("run-mock", commands.RunCommand)
+@click.pass_context
+def run_mock(ctx, **kwargs):
+  """Runs benchmarks on Mock machines. Used for testing."""
+  run(ctx, mock_producer.MockMachineProducer(), **kwargs)
+
+
+def run(ctx, producer: machine_producer.MachineProducer, method: str, runs: int,
+        runtime: List[str], metric: List[str], stat: str, **kwargs):
   """Runs arbitrary benchmarks.
 
   All unknown command line flags are passed through to the underlying benchmark
@@ -139,16 +133,13 @@ def run(ctx, method: str, runs: int, env: str, mock: bool, runtime: List[str],
   All benchmarks are run in parallel where possible, but have exclusive
   ownership over the individual machines.
 
-  Exactly one of the --mock and --env flag must be specified.
-
   Every benchmark method will be run the times indicated by --runs.
 
   Args:
     ctx: Click context.
+    producer: A Machine Producer from which to get Machines.
     method: A regular expression for methods to be run.
     runs: Number of runs.
-    env: Environment to use.
-    mock: If true, use mocked environment (supercedes env).
     runtime: A list of runtimes to test.
     metric: A list of metrics to extract.
     stat: The class of statistics to extract.
@@ -218,20 +209,6 @@ def run(ctx, method: str, runs: int, env: str, mock: bool, runtime: List[str],
     sys.exit(1)
   fold("method", list(methods.keys()), allow_flatten=True)
 
-  # Construct the environment.
-  if mock and env:
-    # You can't provide both.
-    logging.error("both --mock and --env are set: which one is it?")
-    sys.exit(1)
-  elif mock:
-    producer = mock_producer.MockMachineProducer()
-  elif env:
-    producer = yaml_producer.YamlMachineProducer(env)
-  else:
-    # You must provide one of mock or env.
-    logging.error("no enviroment provided: use --mock or --env.")
-    sys.exit(1)
-
   # Spin up the drivers.
   #
   # We ensure that metric is the last entry, because we have special behavior.
diff --git a/benchmarks/runner/commands.py b/benchmarks/runner/commands.py
new file mode 100644
index 000000000..4973843b9
--- /dev/null
+++ b/benchmarks/runner/commands.py
@@ -0,0 +1,84 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module with the guts of `click` commands.
+
+Overrides of the click.core.Command. This is done so flags are inherited between
+similar commands (the run command). The classes below are meant to be used in
+click templates like so.
+
+@runner.command("run-mock", RunCommand)
+def run_mock(**kwargs):
+  # mock implementation
+
+"""
+import click
+
+
+class RunCommand(click.core.Command):
+  """Base Run Command with flags.
+
+  Attributes:
+    method: regex of which suite to choose (e.g. sysbench would run
+      sysbench.cpu, sysbench.memory, and sysbench.mutex) See list command for
+      details.
+    metric: metric(s) to extract. See list command for details.
+    runtime: the runtime(s) on which to run.
+    runs: the number of runs to do of each method.
+    stat: how to compile results in the case of multiple run (e.g. median).
+  """
+
+  def __init__(self, *args, **kwargs):
+    super().__init__(*args, **kwargs)
+    method = click.core.Argument(("method",))
+
+    metric = click.core.Option(("--metric",),
+                               help="The metric to extract.",
+                               multiple=True)
+
+    runtime = click.core.Option(("--runtime",),
+                                default=["runc"],
+                                help="The runtime to use.",
+                                multiple=True)
+    runs = click.core.Option(("--runs",),
+                             default=1,
+                             help="The number of times to run each benchmark.")
+    stat = click.core.Option(
+        ("--stat",),
+        default="median",
+        help="How to aggregate the data from all runs."
+        "\nmedian - returns the median of all runs (default)"
+        "\nall - returns all results comma separated"
+        "\nmeanstd - returns result as mean,std")
+    self.params.extend([method, runtime, runs, stat, metric])
+    self.ignore_unknown_options = True
+    self.allow_extra_args = True
+
+
+class LocalCommand(RunCommand):
+  """LocalCommand inherits all flags from RunCommand.
+
+  Attributes:
+    limit: limits the number of machines on which to run benchmarks. This limits
+      for local how many benchmarks may run at a time. e.g. "startup" requires
+      one machine -- passing two machines would limit two startup jobs at a
+      time. Default is infinity.
+  """
+
+  def __init__(self, *args, **kwargs):
+    super().__init__(*args, **kwargs)
+    self.params.append(
+        click.core.Option(
+            ("--limit",),
+            default=1,
+            help="Limit of number of benchmarks that can run at a given time."))
diff --git a/benchmarks/runner/runner_test.py b/benchmarks/runner/runner_test.py
index 5719c2838..7818d631a 100644
--- a/benchmarks/runner/runner_test.py
+++ b/benchmarks/runner/runner_test.py
@@ -49,7 +49,7 @@ def test_list():
 
 def test_run():
   cli_runner = testing.CliRunner()
-  result = cli_runner.invoke(runner.runner, ["run", "--mock", "."])
+  result = cli_runner.invoke(runner.runner, ["run-mock", "."])
   print(result.output)
   assert result.exit_code == 0
 
diff --git a/benchmarks/suites/http.py b/benchmarks/suites/http.py
index ea9024e43..6efea938c 100644
--- a/benchmarks/suites/http.py
+++ b/benchmarks/suites/http.py
@@ -92,7 +92,7 @@ def http_app(server: machine.Machine,
   redis = server.pull("redis")
   image = server.pull(workload)
   redis_port = 6379
-  redis_name = "redis_server"
+  redis_name = "{workload}_redis_server".format(workload=workload)
 
   with server.container(redis, name=redis_name).detach():
     server.container(server_netcat, links={redis_name: redis_name})\
diff --git a/benchmarks/workloads/BUILD b/benchmarks/workloads/BUILD
index 643806105..ccb86af5b 100644
--- a/benchmarks/workloads/BUILD
+++ b/benchmarks/workloads/BUILD
@@ -11,25 +11,25 @@ py_library(
 filegroup(
     name = "files",
     srcs = [
-        "//benchmarks/workloads/ab:files",
-        "//benchmarks/workloads/absl:files",
-        "//benchmarks/workloads/curl:files",
-        "//benchmarks/workloads/ffmpeg:files",
-        "//benchmarks/workloads/fio:files",
-        "//benchmarks/workloads/httpd:files",
-        "//benchmarks/workloads/iperf:files",
-        "//benchmarks/workloads/netcat:files",
-        "//benchmarks/workloads/nginx:files",
-        "//benchmarks/workloads/node:files",
-        "//benchmarks/workloads/node_template:files",
-        "//benchmarks/workloads/redis:files",
-        "//benchmarks/workloads/redisbenchmark:files",
-        "//benchmarks/workloads/ruby:files",
-        "//benchmarks/workloads/ruby_template:files",
-        "//benchmarks/workloads/sleep:files",
-        "//benchmarks/workloads/sysbench:files",
-        "//benchmarks/workloads/syscall:files",
-        "//benchmarks/workloads/tensorflow:files",
-        "//benchmarks/workloads/true:files",
+        "//benchmarks/workloads/ab:tar",
+        "//benchmarks/workloads/absl:tar",
+        "//benchmarks/workloads/curl:tar",
+        "//benchmarks/workloads/ffmpeg:tar",
+        "//benchmarks/workloads/fio:tar",
+        "//benchmarks/workloads/httpd:tar",
+        "//benchmarks/workloads/iperf:tar",
+        "//benchmarks/workloads/netcat:tar",
+        "//benchmarks/workloads/nginx:tar",
+        "//benchmarks/workloads/node:tar",
+        "//benchmarks/workloads/node_template:tar",
+        "//benchmarks/workloads/redis:tar",
+        "//benchmarks/workloads/redisbenchmark:tar",
+        "//benchmarks/workloads/ruby:tar",
+        "//benchmarks/workloads/ruby_template:tar",
+        "//benchmarks/workloads/sleep:tar",
+        "//benchmarks/workloads/sysbench:tar",
+        "//benchmarks/workloads/syscall:tar",
+        "//benchmarks/workloads/tensorflow:tar",
+        "//benchmarks/workloads/true:tar",
     ],
 )
diff --git a/benchmarks/workloads/ab/BUILD b/benchmarks/workloads/ab/BUILD
index e99a8d674..4fc0ab735 100644
--- a/benchmarks/workloads/ab/BUILD
+++ b/benchmarks/workloads/ab/BUILD
@@ -1,4 +1,5 @@
 load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -27,8 +28,8 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/absl/BUILD b/benchmarks/workloads/absl/BUILD
index bb499620e..61e010096 100644
--- a/benchmarks/workloads/absl/BUILD
+++ b/benchmarks/workloads/absl/BUILD
@@ -1,4 +1,5 @@
 load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -27,8 +28,8 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/curl/BUILD b/benchmarks/workloads/curl/BUILD
index 83f3c71a0..eb0fb6165 100644
--- a/benchmarks/workloads/curl/BUILD
+++ b/benchmarks/workloads/curl/BUILD
@@ -1,10 +1,12 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
     licenses = ["notice"],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/ffmpeg/BUILD b/benchmarks/workloads/ffmpeg/BUILD
index c1f2afc40..be472dfb2 100644
--- a/benchmarks/workloads/ffmpeg/BUILD
+++ b/benchmarks/workloads/ffmpeg/BUILD
@@ -1,3 +1,5 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
     licenses = ["notice"],
@@ -8,8 +10,8 @@ py_library(
     srcs = ["__init__.py"],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/fio/BUILD b/benchmarks/workloads/fio/BUILD
index 7fc96cfa5..de257adad 100644
--- a/benchmarks/workloads/fio/BUILD
+++ b/benchmarks/workloads/fio/BUILD
@@ -1,4 +1,5 @@
 load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -27,8 +28,8 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/httpd/BUILD b/benchmarks/workloads/httpd/BUILD
index 83f3c71a0..eb0fb6165 100644
--- a/benchmarks/workloads/httpd/BUILD
+++ b/benchmarks/workloads/httpd/BUILD
@@ -1,10 +1,12 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
     licenses = ["notice"],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/iperf/BUILD b/benchmarks/workloads/iperf/BUILD
index fe0acbfce..8832a996c 100644
--- a/benchmarks/workloads/iperf/BUILD
+++ b/benchmarks/workloads/iperf/BUILD
@@ -1,4 +1,5 @@
 load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -27,8 +28,8 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/netcat/BUILD b/benchmarks/workloads/netcat/BUILD
index 83f3c71a0..eb0fb6165 100644
--- a/benchmarks/workloads/netcat/BUILD
+++ b/benchmarks/workloads/netcat/BUILD
@@ -1,10 +1,12 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
     licenses = ["notice"],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/nginx/BUILD b/benchmarks/workloads/nginx/BUILD
index 83f3c71a0..eb0fb6165 100644
--- a/benchmarks/workloads/nginx/BUILD
+++ b/benchmarks/workloads/nginx/BUILD
@@ -1,10 +1,12 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
     licenses = ["notice"],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/node/BUILD b/benchmarks/workloads/node/BUILD
index 59460d02f..71cd9f519 100644
--- a/benchmarks/workloads/node/BUILD
+++ b/benchmarks/workloads/node/BUILD
@@ -1,10 +1,12 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
     licenses = ["notice"],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
         "index.js",
diff --git a/benchmarks/workloads/node_template/BUILD b/benchmarks/workloads/node_template/BUILD
index ae7f121d3..ca996f068 100644
--- a/benchmarks/workloads/node_template/BUILD
+++ b/benchmarks/workloads/node_template/BUILD
@@ -1,10 +1,12 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
     licenses = ["notice"],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
         "index.hbs",
diff --git a/benchmarks/workloads/redis/BUILD b/benchmarks/workloads/redis/BUILD
index 83f3c71a0..eb0fb6165 100644
--- a/benchmarks/workloads/redis/BUILD
+++ b/benchmarks/workloads/redis/BUILD
@@ -1,10 +1,12 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
     licenses = ["notice"],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/redisbenchmark/BUILD b/benchmarks/workloads/redisbenchmark/BUILD
index d40e75a3a..f5994a815 100644
--- a/benchmarks/workloads/redisbenchmark/BUILD
+++ b/benchmarks/workloads/redisbenchmark/BUILD
@@ -1,4 +1,5 @@
 load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -27,8 +28,8 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/ruby/BUILD b/benchmarks/workloads/ruby/BUILD
index 9846c7e70..e37d77804 100644
--- a/benchmarks/workloads/ruby/BUILD
+++ b/benchmarks/workloads/ruby/BUILD
@@ -1,3 +1,5 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
     licenses = ["notice"],
@@ -13,3 +15,14 @@ filegroup(
         "index.rb",
     ],
 )
+
+pkg_tar(
+    name = "tar",
+    srcs = [
+        "Dockerfile",
+        "Gemfile",
+        "Gemfile.lock",
+        "config.ru",
+        "index.rb",
+    ],
+)
diff --git a/benchmarks/workloads/ruby_template/BUILD b/benchmarks/workloads/ruby_template/BUILD
index 2b99892af..27f7c0c46 100644
--- a/benchmarks/workloads/ruby_template/BUILD
+++ b/benchmarks/workloads/ruby_template/BUILD
@@ -1,10 +1,12 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
     licenses = ["notice"],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
         "Gemfile",
@@ -13,4 +15,5 @@ filegroup(
         "index.erb",
         "main.rb",
     ],
+    strip_prefix = "third_party/gvisor/benchmarks/workloads/ruby_template",
 )
diff --git a/benchmarks/workloads/sleep/BUILD b/benchmarks/workloads/sleep/BUILD
index 83f3c71a0..eb0fb6165 100644
--- a/benchmarks/workloads/sleep/BUILD
+++ b/benchmarks/workloads/sleep/BUILD
@@ -1,10 +1,12 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
     licenses = ["notice"],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/sysbench/BUILD b/benchmarks/workloads/sysbench/BUILD
index 35f4d460b..fd2f8f03d 100644
--- a/benchmarks/workloads/sysbench/BUILD
+++ b/benchmarks/workloads/sysbench/BUILD
@@ -1,4 +1,5 @@
 load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -27,8 +28,8 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/syscall/BUILD b/benchmarks/workloads/syscall/BUILD
index e1ff3059b..5100cbb21 100644
--- a/benchmarks/workloads/syscall/BUILD
+++ b/benchmarks/workloads/syscall/BUILD
@@ -1,4 +1,5 @@
 load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -27,8 +28,8 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
         "syscall.c",
diff --git a/benchmarks/workloads/tensorflow/BUILD b/benchmarks/workloads/tensorflow/BUILD
index 17f1f8ebb..026c3b316 100644
--- a/benchmarks/workloads/tensorflow/BUILD
+++ b/benchmarks/workloads/tensorflow/BUILD
@@ -1,3 +1,5 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
     licenses = ["notice"],
@@ -8,8 +10,8 @@ py_library(
     srcs = ["__init__.py"],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
diff --git a/benchmarks/workloads/true/BUILD b/benchmarks/workloads/true/BUILD
index 83f3c71a0..221c4b9a7 100644
--- a/benchmarks/workloads/true/BUILD
+++ b/benchmarks/workloads/true/BUILD
@@ -1,11 +1,14 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
     licenses = ["notice"],
 )
 
-filegroup(
-    name = "files",
+pkg_tar(
+    name = "tar",
     srcs = [
         "Dockerfile",
     ],
+    extension = "tar",
 )
-- 
cgit v1.2.3


From d6fb1ec6c7c76040dd20e915b32f9ed795ae7077 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Wed, 15 Jan 2020 16:31:24 -0800
Subject: Add timestamps to VFS2 tmpfs, and implement some of SetStat.

PiperOrigin-RevId: 289962040
---
 pkg/abi/linux/time.go                        |  13 ++
 pkg/sentry/fsimpl/tmpfs/BUILD                |   2 +
 pkg/sentry/fsimpl/tmpfs/filesystem.go        |  16 +-
 pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 129 ++++++++++++---
 pkg/sentry/fsimpl/tmpfs/stat_test.go         | 232 +++++++++++++++++++++++++++
 pkg/sentry/fsimpl/tmpfs/tmpfs.go             |  72 +++++++--
 6 files changed, 425 insertions(+), 39 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/tmpfs/stat_test.go

diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go
index 546668bca..5c5a58cd4 100644
--- a/pkg/abi/linux/time.go
+++ b/pkg/abi/linux/time.go
@@ -234,6 +234,19 @@ type StatxTimestamp struct {
 	_    int32
 }
 
+// ToNsec returns the nanosecond representation.
+func (sxts StatxTimestamp) ToNsec() int64 {
+	return int64(sxts.Sec)*1e9 + int64(sxts.Nsec)
+}
+
+// ToNsecCapped returns the safe nanosecond representation.
+func (sxts StatxTimestamp) ToNsecCapped() int64 {
+	if sxts.Sec > maxSecInDuration {
+		return math.MaxInt64
+	}
+	return sxts.ToNsec()
+}
+
 // NsecToStatxTimestamp translates nanoseconds to StatxTimestamp.
 func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) {
 	return StatxTimestamp{
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 82f5c2f41..7601c7c04 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -40,6 +40,7 @@ go_library(
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
@@ -77,6 +78,7 @@ go_test(
     srcs = [
         "pipe_test.go",
         "regular_file_test.go",
+        "stat_test.go",
     ],
     embed = [":tmpfs"],
     deps = [
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 26979729e..4cd7e9aea 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -56,7 +56,8 @@ afterSymlink:
 	}
 	next := nextVFSD.Impl().(*dentry)
 	if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
-		// TODO: symlink traversals update access time
+		// TODO(gvisor.dev/issues/1197): Symlink traversals updates
+		// access time.
 		if err := rp.HandleSymlink(symlink.target); err != nil {
 			return nil, err
 		}
@@ -501,7 +502,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		oldParent.inode.decLinksLocked()
 		newParent.inode.incLinksLocked()
 	}
-	// TODO: update timestamps and parent directory sizes
+	// TODO(gvisor.dev/issues/1197): Update timestamps and parent directory
+	// sizes.
 	vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
 	return nil
 }
@@ -555,15 +557,11 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return err
 	}
-	if opts.Stat.Mask == 0 {
-		return nil
-	}
-	// TODO: implement inode.setStat
-	return syserror.EPERM
+	return d.inode.setStat(opts.Stat)
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
@@ -587,7 +585,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 	if err != nil {
 		return linux.Statfs{}, err
 	}
-	// TODO: actually implement statfs
+	// TODO(gvisor.dev/issues/1197): Actually implement statfs.
 	return linux.Statfs{}, syserror.ENOSYS
 }
 
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index 3731c5b6f..7b0a962f0 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -18,6 +18,7 @@ import (
 	"bytes"
 	"fmt"
 	"io"
+	"sync/atomic"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -29,10 +30,12 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
-// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If
-// the returned err is not nil, then cleanup should be called when the FD is no
-// longer needed.
-func newFileFD(ctx context.Context, filename string) (*vfs.FileDescription, func(), error) {
+// nextFileID is used to generate unique file names.
+var nextFileID int64
+
+// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error
+// is not nil, then cleanup should be called when the root is no longer needed.
+func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) {
 	creds := auth.CredentialsFromContext(ctx)
 
 	vfsObj := vfs.New()
@@ -41,36 +44,124 @@ func newFileFD(ctx context.Context, filename string) (*vfs.FileDescription, func
 	})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
 	if err != nil {
-		return nil, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
+		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
 	}
 	root := mntns.Root()
+	return vfsObj, root, func() {
+		root.DecRef()
+		mntns.DecRef(vfsObj)
+	}, nil
+}
+
+// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If
+// the returned err is not nil, then cleanup should be called when the FD is no
+// longer needed.
+func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+	vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1))
 
 	// Create the file that will be write/read.
 	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Path:               fspath.Parse(filename),
-		FollowFinalSymlink: true,
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(filename),
 	}, &vfs.OpenOptions{
 		Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
-		Mode:  0644,
+		Mode:  linux.ModeRegular | mode,
 	})
 	if err != nil {
-		root.DecRef()
-		mntns.DecRef(vfsObj)
+		cleanup()
 		return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err)
 	}
 
-	return fd, func() {
-		root.DecRef()
-		mntns.DecRef(vfsObj)
-	}, nil
+	return fd, cleanup, nil
+}
+
+// newDirFD is like newFileFD, but for directories.
+func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+	vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1))
+
+	// Create the dir.
+	if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(dirname),
+	}, &vfs.MkdirOptions{
+		Mode: linux.ModeDirectory | mode,
+	}); err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err)
+	}
+
+	// Open the dir and return it.
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(dirname),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+	})
+	if err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err)
+	}
+
+	return fd, cleanup, nil
+}
+
+// newPipeFD is like newFileFD, but for pipes.
+func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+	vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	pipename := fmt.Sprintf("tmpfs-test-pipe-%d", atomic.AddInt64(&nextFileID, 1))
+
+	// Create the pipe.
+	if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(pipename),
+	}, &vfs.MknodOptions{
+		Mode: linux.ModeNamedPipe | mode,
+	}); err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to create pipe %q: %v", pipename, err)
+	}
+
+	// Open the pipe and return it.
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(pipename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to open pipe %q: %v", pipename, err)
+	}
+
+	return fd, cleanup, nil
 }
 
 // Test that we can write some data to a file and read it back.`
 func TestSimpleWriteRead(t *testing.T) {
 	ctx := contexttest.Context(t)
-	fd, cleanup, err := newFileFD(ctx, "simpleReadWrite")
+	fd, cleanup, err := newFileFD(ctx, 0644)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -116,7 +207,7 @@ func TestSimpleWriteRead(t *testing.T) {
 
 func TestPWrite(t *testing.T) {
 	ctx := contexttest.Context(t)
-	fd, cleanup, err := newFileFD(ctx, "PRead")
+	fd, cleanup, err := newFileFD(ctx, 0644)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -171,7 +262,7 @@ func TestPWrite(t *testing.T) {
 
 func TestPRead(t *testing.T) {
 	ctx := contexttest.Context(t)
-	fd, cleanup, err := newFileFD(ctx, "PRead")
+	fd, cleanup, err := newFileFD(ctx, 0644)
 	if err != nil {
 		t.Fatal(err)
 	}
diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go
new file mode 100644
index 000000000..ebe035dee
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go
@@ -0,0 +1,232 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"fmt"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func TestStatAfterCreate(t *testing.T) {
+	ctx := contexttest.Context(t)
+	mode := linux.FileMode(0644)
+
+	// Run with different file types.
+	// TODO(gvisor.dev/issues/1197): Also test symlinks and sockets.
+	for _, typ := range []string{"file", "dir", "pipe"} {
+		t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
+			var (
+				fd      *vfs.FileDescription
+				cleanup func()
+				err     error
+			)
+			switch typ {
+			case "file":
+				fd, cleanup, err = newFileFD(ctx, mode)
+			case "dir":
+				fd, cleanup, err = newDirFD(ctx, mode)
+			case "pipe":
+				fd, cleanup, err = newPipeFD(ctx, mode)
+			default:
+				panic(fmt.Sprintf("unknown typ %q", typ))
+			}
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer cleanup()
+
+			got, err := fd.Stat(ctx, vfs.StatOptions{})
+			if err != nil {
+				t.Fatalf("Stat failed: %v", err)
+			}
+
+			// Atime, Ctime, Mtime should all be current time (non-zero).
+			atime, ctime, mtime := got.Atime.ToNsec(), got.Ctime.ToNsec(), got.Mtime.ToNsec()
+			if atime != ctime || ctime != mtime {
+				t.Errorf("got atime=%d ctime=%d mtime=%d, wanted equal values", atime, ctime, mtime)
+			}
+			if atime == 0 {
+				t.Errorf("got atime=%d, want non-zero", atime)
+			}
+
+			// Btime should be 0, as it is not set by tmpfs.
+			if btime := got.Btime.ToNsec(); btime != 0 {
+				t.Errorf("got btime %d, want 0", got.Btime.ToNsec())
+			}
+
+			// Size should be 0.
+			if got.Size != 0 {
+				t.Errorf("got size %d, want 0", got.Size)
+			}
+
+			// Nlink should be 1 for files, 2 for dirs.
+			wantNlink := uint32(1)
+			if typ == "dir" {
+				wantNlink = 2
+			}
+			if got.Nlink != wantNlink {
+				t.Errorf("got nlink %d, want %d", got.Nlink, wantNlink)
+			}
+
+			// UID and GID are set from context creds.
+			creds := auth.CredentialsFromContext(ctx)
+			if got.UID != uint32(creds.EffectiveKUID) {
+				t.Errorf("got uid %d, want %d", got.UID, uint32(creds.EffectiveKUID))
+			}
+			if got.GID != uint32(creds.EffectiveKGID) {
+				t.Errorf("got gid %d, want %d", got.GID, uint32(creds.EffectiveKGID))
+			}
+
+			// Mode.
+			wantMode := uint16(mode)
+			switch typ {
+			case "file":
+				wantMode |= linux.S_IFREG
+			case "dir":
+				wantMode |= linux.S_IFDIR
+			case "pipe":
+				wantMode |= linux.S_IFIFO
+			default:
+				panic(fmt.Sprintf("unknown typ %q", typ))
+			}
+
+			if got.Mode != wantMode {
+				t.Errorf("got mode %x, want %x", got.Mode, wantMode)
+			}
+
+			// Ino.
+			if got.Ino == 0 {
+				t.Errorf("got ino %d, want not 0", got.Ino)
+			}
+		})
+	}
+}
+
+func TestSetStatAtime(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, 0644)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	allStatOptions := vfs.StatOptions{Mask: linux.STATX_ALL}
+
+	// Get initial stat.
+	initialStat, err := fd.Stat(ctx, allStatOptions)
+	if err != nil {
+		t.Fatalf("Stat failed: %v", err)
+	}
+
+	// Set atime, but without the mask.
+	if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: linux.Statx{
+		Mask:  0,
+		Atime: linux.NsecToStatxTimestamp(100),
+	}}); err != nil {
+		t.Errorf("SetStat atime without mask failed: %v")
+	}
+	// Atime should be unchanged.
+	if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
+		t.Errorf("Stat got error: %v", err)
+	} else if gotStat.Atime != initialStat.Atime {
+		t.Errorf("Stat got atime %d, want %d", gotStat.Atime, initialStat.Atime)
+	}
+
+	// Set atime, this time included in the mask.
+	setStat := linux.Statx{
+		Mask:  linux.STATX_ATIME,
+		Atime: linux.NsecToStatxTimestamp(100),
+	}
+	if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil {
+		t.Errorf("SetStat atime with mask failed: %v")
+	}
+	if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
+		t.Errorf("Stat got error: %v", err)
+	} else if gotStat.Atime != setStat.Atime {
+		t.Errorf("Stat got atime %d, want %d", gotStat.Atime, setStat.Atime)
+	}
+}
+
+func TestSetStat(t *testing.T) {
+	ctx := contexttest.Context(t)
+	mode := linux.FileMode(0644)
+
+	// Run with different file types.
+	// TODO(gvisor.dev/issues/1197): Also test symlinks and sockets.
+	for _, typ := range []string{"file", "dir", "pipe"} {
+		t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
+			var (
+				fd      *vfs.FileDescription
+				cleanup func()
+				err     error
+			)
+			switch typ {
+			case "file":
+				fd, cleanup, err = newFileFD(ctx, mode)
+			case "dir":
+				fd, cleanup, err = newDirFD(ctx, mode)
+			case "pipe":
+				fd, cleanup, err = newPipeFD(ctx, mode)
+			default:
+				panic(fmt.Sprintf("unknown typ %q", typ))
+			}
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer cleanup()
+
+			allStatOptions := vfs.StatOptions{Mask: linux.STATX_ALL}
+
+			// Get initial stat.
+			initialStat, err := fd.Stat(ctx, allStatOptions)
+			if err != nil {
+				t.Fatalf("Stat failed: %v", err)
+			}
+
+			// Set atime, but without the mask.
+			if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: linux.Statx{
+				Mask:  0,
+				Atime: linux.NsecToStatxTimestamp(100),
+			}}); err != nil {
+				t.Errorf("SetStat atime without mask failed: %v")
+			}
+			// Atime should be unchanged.
+			if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
+				t.Errorf("Stat got error: %v", err)
+			} else if gotStat.Atime != initialStat.Atime {
+				t.Errorf("Stat got atime %d, want %d", gotStat.Atime, initialStat.Atime)
+			}
+
+			// Set atime, this time included in the mask.
+			setStat := linux.Statx{
+				Mask:  linux.STATX_ATIME,
+				Atime: linux.NsecToStatxTimestamp(100),
+			}
+			if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil {
+				t.Errorf("SetStat atime with mask failed: %v")
+			}
+			if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
+				t.Errorf("Stat got error: %v", err)
+			} else if gotStat.Atime != setStat.Atime {
+				t.Errorf("Stat got atime %d, want %d", gotStat.Atime, setStat.Atime)
+			}
+		})
+	}
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 701826f90..d6960ee47 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -31,10 +31,10 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // FilesystemType implements vfs.FilesystemType.
@@ -47,6 +47,9 @@ type filesystem struct {
 	// memFile is used to allocate pages to for regular files.
 	memFile *pgalloc.MemoryFile
 
+	// clock is a realtime clock used to set timestamps in file operations.
+	clock time.Clock
+
 	// mu serializes changes to the Dentry tree.
 	mu sync.RWMutex
 
@@ -59,8 +62,10 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	if memFileProvider == nil {
 		panic("MemoryFileProviderFromContext returned nil")
 	}
+	clock := time.RealtimeClockFromContext(ctx)
 	fs := filesystem{
 		memFile: memFileProvider.MemoryFile(),
+		clock:   clock,
 	}
 	fs.vfsfs.Init(vfsObj, &fs)
 	root := fs.newDentry(fs.newDirectory(creds, 01777))
@@ -126,26 +131,36 @@ type inode struct {
 	// filesystem.RmdirAt() drops the reference.
 	refs int64
 
-	// Inode metadata; protected by mu and accessed using atomic memory
-	// operations unless otherwise specified.
-	mu    sync.RWMutex
+	// Inode metadata. Writing multiple fields atomically requires holding
+	// mu, othewise atomic operations can be used.
+	mu    sync.Mutex
 	mode  uint32 // excluding file type bits, which are based on impl
 	nlink uint32 // protected by filesystem.mu instead of inode.mu
 	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
 	gid   uint32 // auth.KGID, but ...
 	ino   uint64 // immutable
 
+	// Linux's tmpfs has no concept of btime.
+	atime int64 // nanoseconds
+	ctime int64 // nanoseconds
+	mtime int64 // nanoseconds
+
 	impl interface{} // immutable
 }
 
 const maxLinks = math.MaxUint32
 
 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
+	now := fs.clock.Now().Nanoseconds()
 	i.refs = 1
 	i.mode = uint32(mode)
 	i.uid = uint32(creds.EffectiveKUID)
 	i.gid = uint32(creds.EffectiveKGID)
 	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
+	// Tmpfs creation sets atime, ctime, and mtime to current time.
+	i.atime = now
+	i.ctime = now
+	i.mtime = now
 	// i.nlink initialized by caller
 	i.impl = impl
 }
@@ -213,15 +228,24 @@ func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, i
 // Go won't inline this function, and returning linux.Statx (which is quite
 // big) means spending a lot of time in runtime.duffcopy(), so instead it's an
 // output parameter.
+//
+// Note that Linux does not guarantee to return consistent data (in the case of
+// a concurrent modification), so we do not require holding inode.mu.
 func (i *inode) statTo(stat *linux.Statx) {
-	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
+		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_ATIME |
+		linux.STATX_BTIME | linux.STATX_CTIME | linux.STATX_MTIME
 	stat.Blksize = 1 // usermem.PageSize in tmpfs
 	stat.Nlink = atomic.LoadUint32(&i.nlink)
 	stat.UID = atomic.LoadUint32(&i.uid)
 	stat.GID = atomic.LoadUint32(&i.gid)
 	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
 	stat.Ino = i.ino
-	// TODO: device number
+	// Linux's tmpfs has no concept of btime, so zero-value is returned.
+	stat.Atime = linux.NsecToStatxTimestamp(i.atime)
+	stat.Ctime = linux.NsecToStatxTimestamp(i.ctime)
+	stat.Mtime = linux.NsecToStatxTimestamp(i.mtime)
+	// TODO(gvisor.dev/issues/1197): Device number.
 	switch impl := i.impl.(type) {
 	case *regularFile:
 		stat.Mode |= linux.S_IFREG
@@ -245,6 +269,36 @@ func (i *inode) statTo(stat *linux.Statx) {
 	}
 }
 
+func (i *inode) setStat(stat linux.Statx) error {
+	// TODO(gvisor.dev/issues/1197): Handle stat.Size by growing/shrinking
+	// the file.
+	if stat.Mask == 0 {
+		return nil
+	}
+	i.mu.Lock()
+	mask := stat.Mask
+	if mask&linux.STATX_MODE != 0 {
+		atomic.StoreUint32(&i.mode, uint32(stat.Mode))
+	}
+	if mask&linux.STATX_UID != 0 {
+		atomic.StoreUint32(&i.uid, stat.UID)
+	}
+	if mask&linux.STATX_GID != 0 {
+		atomic.StoreUint32(&i.gid, stat.GID)
+	}
+	if mask&linux.STATX_ATIME != 0 {
+		atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped())
+	}
+	if mask&linux.STATX_CTIME != 0 {
+		atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped())
+	}
+	if mask&linux.STATX_MTIME != 0 {
+		atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped())
+	}
+	i.mu.Unlock()
+	return nil
+}
+
 // allocatedBlocksForSize returns the number of 512B blocks needed to
 // accommodate the given size in bytes, as appropriate for struct
 // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
@@ -291,9 +345,5 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	if opts.Stat.Mask == 0 {
-		return nil
-	}
-	// TODO: implement inode.setStat
-	return syserror.EPERM
+	return fd.inode().setStat(opts.Stat)
 }
-- 
cgit v1.2.3


From 815df2959a76e4a19f5882e40402b9bbca9e70be Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 15 Jan 2020 16:43:36 -0800
Subject: Solicit IPv6 routers when a NIC becomes enabled as a host

This change adds support to send NDP Router Solicitation messages when a NIC
becomes enabled as a host, as per RFC 4861 section 6.3.7.

Note, Router Solicitations will only be sent when the stack has forwarding
disabled.

Tests: Unittests to make sure that the initial Router Solicitations are sent
as configured. The tests also validate the sent Router Solicitations' fields.
PiperOrigin-RevId: 289964095
---
 pkg/tcpip/checker/checker.go           |   6 +
 pkg/tcpip/header/BUILD                 |   1 +
 pkg/tcpip/header/ipv6.go               |   7 ++
 pkg/tcpip/header/ndp_router_solicit.go |  36 ++++++
 pkg/tcpip/stack/BUILD                  |   2 +-
 pkg/tcpip/stack/ndp.go                 | 175 +++++++++++++++++++++++---
 pkg/tcpip/stack/ndp_test.go            | 224 +++++++++++++++++++++++++++++++++
 pkg/tcpip/stack/nic.go                 |  73 +++++++----
 pkg/tcpip/stack/stack.go               |   8 +-
 9 files changed, 486 insertions(+), 46 deletions(-)
 create mode 100644 pkg/tcpip/header/ndp_router_solicit.go

diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 542abc99d..885d773b0 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -770,3 +770,9 @@ func NDPNSTargetAddress(want tcpip.Address) TransportChecker {
 		}
 	}
 }
+
+// NDPRS creates a checker that checks that the packet contains a valid NDP
+// Router Solicitation message (as per the raw wire format).
+func NDPRS() NetworkChecker {
+	return NDP(header.ICMPv6RouterSolicit, header.NDPRSMinimumSize)
+}
diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index f2061c778..cd747d100 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -20,6 +20,7 @@ go_library(
         "ndp_neighbor_solicit.go",
         "ndp_options.go",
         "ndp_router_advert.go",
+        "ndp_router_solicit.go",
         "tcp.go",
         "udp.go",
     ],
diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index 83425c614..70e6ce095 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -84,6 +84,13 @@ const (
 	// The address is ff02::1.
 	IPv6AllNodesMulticastAddress tcpip.Address = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
 
+	// IPv6AllRoutersMulticastAddress is a link-local multicast group that
+	// all IPv6 routers MUST join, as per RFC 4291, section 2.8. Packets
+	// destined to this address will reach all routers on a link.
+	//
+	// The address is ff02::2.
+	IPv6AllRoutersMulticastAddress tcpip.Address = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+
 	// IPv6MinimumMTU is the minimum MTU required by IPv6, per RFC 2460,
 	// section 5.
 	IPv6MinimumMTU = 1280
diff --git a/pkg/tcpip/header/ndp_router_solicit.go b/pkg/tcpip/header/ndp_router_solicit.go
new file mode 100644
index 000000000..9e67ba95d
--- /dev/null
+++ b/pkg/tcpip/header/ndp_router_solicit.go
@@ -0,0 +1,36 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+// NDPRouterSolicit is an NDP Router Solicitation message. It will only contain
+// the body of an ICMPv6 packet.
+//
+// See RFC 4861 section 4.1 for more details.
+type NDPRouterSolicit []byte
+
+const (
+	// NDPRSMinimumSize is the minimum size of a valid NDP Router
+	// Solicitation message (body of an ICMPv6 packet).
+	NDPRSMinimumSize = 4
+
+	// ndpRSOptionsOffset is the start of the NDP options in an
+	// NDPRouterSolicit.
+	ndpRSOptionsOffset = 4
+)
+
+// Options returns an NDPOptions of the the options body.
+func (b NDPRouterSolicit) Options() NDPOptions {
+	return NDPOptions(b[ndpRSOptionsOffset:])
+}
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 705e984c1..783351a69 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -51,7 +51,7 @@ go_library(
 
 go_test(
     name = "stack_x_test",
-    size = "small",
+    size = "medium",
     srcs = [
         "ndp_test.go",
         "stack_test.go",
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index a9dd322db..acefc356a 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -17,6 +17,7 @@ package stack
 import (
 	"fmt"
 	"log"
+	"math/rand"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -38,24 +39,36 @@ const (
 	// Default = 1s (from RFC 4861 section 10).
 	defaultRetransmitTimer = time.Second
 
+	// defaultMaxRtrSolicitations is the default number of Router
+	// Solicitation messages to send when a NIC becomes enabled.
+	//
+	// Default = 3 (from RFC 4861 section 10).
+	defaultMaxRtrSolicitations = 3
+
+	// defaultRtrSolicitationInterval is the default amount of time between
+	// sending Router Solicitation messages.
+	//
+	// Default = 4s (from 4861 section 10).
+	defaultRtrSolicitationInterval = 4 * time.Second
+
+	// defaultMaxRtrSolicitationDelay is the default maximum amount of time
+	// to wait before sending the first Router Solicitation message.
+	//
+	// Default = 1s (from 4861 section 10).
+	defaultMaxRtrSolicitationDelay = time.Second
+
 	// defaultHandleRAs is the default configuration for whether or not to
 	// handle incoming Router Advertisements as a host.
-	//
-	// Default = true.
 	defaultHandleRAs = true
 
 	// defaultDiscoverDefaultRouters is the default configuration for
 	// whether or not to discover default routers from incoming Router
 	// Advertisements, as a host.
-	//
-	// Default = true.
 	defaultDiscoverDefaultRouters = true
 
 	// defaultDiscoverOnLinkPrefixes is the default configuration for
 	// whether or not to discover on-link prefixes from incoming Router
 	// Advertisements' Prefix Information option, as a host.
-	//
-	// Default = true.
 	defaultDiscoverOnLinkPrefixes = true
 
 	// defaultAutoGenGlobalAddresses is the default configuration for
@@ -74,26 +87,31 @@ const (
 	// value of 0 means unspecified, so the smallest valid value is 1.
 	// Note, the unit of the RetransmitTimer field in the Router
 	// Advertisement is milliseconds.
-	//
-	// Min = 1ms.
 	minimumRetransmitTimer = time.Millisecond
 
+	// minimumRtrSolicitationInterval is the minimum amount of time to wait
+	// between sending Router Solicitation messages. This limit is imposed
+	// to make sure that Router Solicitation messages are not sent all at
+	// once, defeating the purpose of sending the initial few messages.
+	minimumRtrSolicitationInterval = 500 * time.Millisecond
+
+	// minimumMaxRtrSolicitationDelay is the minimum amount of time to wait
+	// before sending the first Router Solicitation message. It is 0 because
+	// we cannot have a negative delay.
+	minimumMaxRtrSolicitationDelay = 0
+
 	// MaxDiscoveredDefaultRouters is the maximum number of discovered
 	// default routers. The stack should stop discovering new routers after
 	// discovering MaxDiscoveredDefaultRouters routers.
 	//
 	// This value MUST be at minimum 2 as per RFC 4861 section 6.3.4, and
 	// SHOULD be more.
-	//
-	// Max = 10.
 	MaxDiscoveredDefaultRouters = 10
 
 	// MaxDiscoveredOnLinkPrefixes is the maximum number of discovered
 	// on-link prefixes. The stack should stop discovering new on-link
 	// prefixes after discovering MaxDiscoveredOnLinkPrefixes on-link
 	// prefixes.
-	//
-	// Max = 10.
 	MaxDiscoveredOnLinkPrefixes = 10
 
 	// validPrefixLenForAutoGen is the expected prefix length that an
@@ -245,9 +263,24 @@ type NDPConfigurations struct {
 	// The amount of time to wait between sending Neighbor solicitation
 	// messages.
 	//
-	// Must be greater than 0.5s.
+	// Must be greater than or equal to 1ms.
 	RetransmitTimer time.Duration
 
+	// The number of Router Solicitation messages to send when the NIC
+	// becomes enabled.
+	MaxRtrSolicitations uint8
+
+	// The amount of time between transmitting Router Solicitation messages.
+	//
+	// Must be greater than or equal to 0.5s.
+	RtrSolicitationInterval time.Duration
+
+	// The maximum amount of time before transmitting the first Router
+	// Solicitation message.
+	//
+	// Must be greater than or equal to 0s.
+	MaxRtrSolicitationDelay time.Duration
+
 	// HandleRAs determines whether or not Router Advertisements will be
 	// processed.
 	HandleRAs bool
@@ -278,12 +311,15 @@ type NDPConfigurations struct {
 // default values.
 func DefaultNDPConfigurations() NDPConfigurations {
 	return NDPConfigurations{
-		DupAddrDetectTransmits: defaultDupAddrDetectTransmits,
-		RetransmitTimer:        defaultRetransmitTimer,
-		HandleRAs:              defaultHandleRAs,
-		DiscoverDefaultRouters: defaultDiscoverDefaultRouters,
-		DiscoverOnLinkPrefixes: defaultDiscoverOnLinkPrefixes,
-		AutoGenGlobalAddresses: defaultAutoGenGlobalAddresses,
+		DupAddrDetectTransmits:  defaultDupAddrDetectTransmits,
+		RetransmitTimer:         defaultRetransmitTimer,
+		MaxRtrSolicitations:     defaultMaxRtrSolicitations,
+		RtrSolicitationInterval: defaultRtrSolicitationInterval,
+		MaxRtrSolicitationDelay: defaultMaxRtrSolicitationDelay,
+		HandleRAs:               defaultHandleRAs,
+		DiscoverDefaultRouters:  defaultDiscoverDefaultRouters,
+		DiscoverOnLinkPrefixes:  defaultDiscoverOnLinkPrefixes,
+		AutoGenGlobalAddresses:  defaultAutoGenGlobalAddresses,
 	}
 }
 
@@ -292,10 +328,24 @@ func DefaultNDPConfigurations() NDPConfigurations {
 //
 // If RetransmitTimer is less than minimumRetransmitTimer, then a value of
 // defaultRetransmitTimer will be used.
+//
+// If RtrSolicitationInterval is less than minimumRtrSolicitationInterval, then
+// a value of defaultRtrSolicitationInterval will be used.
+//
+// If MaxRtrSolicitationDelay is less than minimumMaxRtrSolicitationDelay, then
+// a value of defaultMaxRtrSolicitationDelay will be used.
 func (c *NDPConfigurations) validate() {
 	if c.RetransmitTimer < minimumRetransmitTimer {
 		c.RetransmitTimer = defaultRetransmitTimer
 	}
+
+	if c.RtrSolicitationInterval < minimumRtrSolicitationInterval {
+		c.RtrSolicitationInterval = defaultRtrSolicitationInterval
+	}
+
+	if c.MaxRtrSolicitationDelay < minimumMaxRtrSolicitationDelay {
+		c.MaxRtrSolicitationDelay = defaultMaxRtrSolicitationDelay
+	}
 }
 
 // ndpState is the per-interface NDP state.
@@ -316,6 +366,10 @@ type ndpState struct {
 	// Information option.
 	onLinkPrefixes map[tcpip.Subnet]onLinkPrefixState
 
+	// The timer used to send the next router solicitation message.
+	// If routers are being solicited, rtrSolicitTimer MUST NOT be nil.
+	rtrSolicitTimer *time.Timer
+
 	// The addresses generated by SLAAC.
 	autoGenAddresses map[tcpip.Address]autoGenAddressState
 
@@ -501,10 +555,12 @@ func (ndp *ndpState) doDuplicateAddressDetection(addr tcpip.Address, remaining u
 		// address.
 		panic(fmt.Sprintf("ndpdad: NIC(%d) is not in the solicited-node multicast group (%s) but it has addr %s", ndp.nic.ID(), snmc, addr))
 	}
+	snmcRef.incRef()
 
 	// Use the unspecified address as the source address when performing
 	// DAD.
 	r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), snmcRef, false, false)
+	defer r.Release()
 
 	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
 	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
@@ -1132,3 +1188,84 @@ func (ndp *ndpState) cleanupHostOnlyState() {
 		log.Fatalf("ndp: still have discovered default routers after cleaning up, found = %d", got)
 	}
 }
+
+// startSolicitingRouters starts soliciting routers, as per RFC 4861 section
+// 6.3.7. If routers are already being solicited, this function does nothing.
+//
+// The NIC ndp belongs to MUST be locked.
+func (ndp *ndpState) startSolicitingRouters() {
+	if ndp.rtrSolicitTimer != nil {
+		// We are already soliciting routers.
+		return
+	}
+
+	remaining := ndp.configs.MaxRtrSolicitations
+	if remaining == 0 {
+		return
+	}
+
+	// Calculate the random delay before sending our first RS, as per RFC
+	// 4861 section 6.3.7.
+	var delay time.Duration
+	if ndp.configs.MaxRtrSolicitationDelay > 0 {
+		delay = time.Duration(rand.Int63n(int64(ndp.configs.MaxRtrSolicitationDelay)))
+	}
+
+	ndp.rtrSolicitTimer = time.AfterFunc(delay, func() {
+		// Send an RS message with the unspecified source address.
+		ref := ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, true)
+		r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
+		defer r.Release()
+
+		payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize
+		hdr := buffer.NewPrependable(header.IPv6MinimumSize + payloadSize)
+		pkt := header.ICMPv6(hdr.Prepend(payloadSize))
+		pkt.SetType(header.ICMPv6RouterSolicit)
+		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+
+		sent := r.Stats().ICMP.V6PacketsSent
+		if err := r.WritePacket(nil,
+			NetworkHeaderParams{
+				Protocol: header.ICMPv6ProtocolNumber,
+				TTL:      header.NDPHopLimit,
+				TOS:      DefaultTOS,
+			}, tcpip.PacketBuffer{Header: hdr},
+		); err != nil {
+			sent.Dropped.Increment()
+			log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.nic.ID(), err)
+			// Don't send any more messages if we had an error.
+			remaining = 0
+		} else {
+			sent.RouterSolicit.Increment()
+			remaining--
+		}
+
+		ndp.nic.mu.Lock()
+		defer ndp.nic.mu.Unlock()
+		if remaining == 0 {
+			ndp.rtrSolicitTimer = nil
+		} else if ndp.rtrSolicitTimer != nil {
+			// Note, we need to explicitly check to make sure that
+			// the timer field is not nil because if it was nil but
+			// we still reached this point, then we know the NIC
+			// was requested to stop soliciting routers so we don't
+			// need to send the next Router Solicitation message.
+			ndp.rtrSolicitTimer.Reset(ndp.configs.RtrSolicitationInterval)
+		}
+	})
+
+}
+
+// stopSolicitingRouters stops soliciting routers. If routers are not currently
+// being solicited, this function does nothing.
+//
+// The NIC ndp belongs to MUST be locked.
+func (ndp *ndpState) stopSolicitingRouters() {
+	if ndp.rtrSolicitTimer == nil {
+		// Nothing to do.
+		return
+	}
+
+	ndp.rtrSolicitTimer.Stop()
+	ndp.rtrSolicitTimer = nil
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index d390c6312..7c68e8ed4 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -3098,3 +3098,227 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
 	expectNoDHCPv6Event()
 }
+
+// TestRouterSolicitation tests the initial Router Solicitations that are sent
+// when a NIC newly becomes enabled.
+func TestRouterSolicitation(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name                        string
+		maxRtrSolicit               uint8
+		rtrSolicitInt               time.Duration
+		effectiveRtrSolicitInt      time.Duration
+		maxRtrSolicitDelay          time.Duration
+		effectiveMaxRtrSolicitDelay time.Duration
+	}{
+		{
+			name:                        "Single RS with delay",
+			maxRtrSolicit:               1,
+			rtrSolicitInt:               time.Second,
+			effectiveRtrSolicitInt:      time.Second,
+			maxRtrSolicitDelay:          time.Second,
+			effectiveMaxRtrSolicitDelay: time.Second,
+		},
+		{
+			name:                        "Two RS with delay",
+			maxRtrSolicit:               2,
+			rtrSolicitInt:               time.Second,
+			effectiveRtrSolicitInt:      time.Second,
+			maxRtrSolicitDelay:          500 * time.Millisecond,
+			effectiveMaxRtrSolicitDelay: 500 * time.Millisecond,
+		},
+		{
+			name:                        "Single RS without delay",
+			maxRtrSolicit:               1,
+			rtrSolicitInt:               time.Second,
+			effectiveRtrSolicitInt:      time.Second,
+			maxRtrSolicitDelay:          0,
+			effectiveMaxRtrSolicitDelay: 0,
+		},
+		{
+			name:                        "Two RS without delay and invalid zero interval",
+			maxRtrSolicit:               2,
+			rtrSolicitInt:               0,
+			effectiveRtrSolicitInt:      4 * time.Second,
+			maxRtrSolicitDelay:          0,
+			effectiveMaxRtrSolicitDelay: 0,
+		},
+		{
+			name:                        "Three RS without delay",
+			maxRtrSolicit:               3,
+			rtrSolicitInt:               500 * time.Millisecond,
+			effectiveRtrSolicitInt:      500 * time.Millisecond,
+			maxRtrSolicitDelay:          0,
+			effectiveMaxRtrSolicitDelay: 0,
+		},
+		{
+			name:                        "Two RS with invalid negative delay",
+			maxRtrSolicit:               2,
+			rtrSolicitInt:               time.Second,
+			effectiveRtrSolicitInt:      time.Second,
+			maxRtrSolicitDelay:          -3 * time.Second,
+			effectiveMaxRtrSolicitDelay: time.Second,
+		},
+	}
+
+	// This Run will not return until the parallel tests finish.
+	//
+	// We need this because we need to do some teardown work after the
+	// parallel tests complete.
+	//
+	// See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+	// more details.
+	t.Run("group", func(t *testing.T) {
+		for _, test := range tests {
+			test := test
+
+			t.Run(test.name, func(t *testing.T) {
+				t.Parallel()
+				e := channel.New(int(test.maxRtrSolicit), 1280, linkAddr1)
+				waitForPkt := func(timeout time.Duration) {
+					t.Helper()
+					select {
+					case p := <-e.C:
+						if p.Proto != header.IPv6ProtocolNumber {
+							t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+						}
+						checker.IPv6(t,
+							p.Pkt.Header.View(),
+							checker.SrcAddr(header.IPv6Any),
+							checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+							checker.TTL(header.NDPHopLimit),
+							checker.NDPRS(),
+						)
+
+					case <-time.After(timeout):
+						t.Fatal("timed out waiting for packet")
+					}
+				}
+				waitForNothing := func(timeout time.Duration) {
+					t.Helper()
+					select {
+					case <-e.C:
+						t.Fatal("unexpectedly got a packet")
+					case <-time.After(timeout):
+					}
+				}
+				s := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+					NDPConfigs: stack.NDPConfigurations{
+						MaxRtrSolicitations:     test.maxRtrSolicit,
+						RtrSolicitationInterval: test.rtrSolicitInt,
+						MaxRtrSolicitationDelay: test.maxRtrSolicitDelay,
+					},
+				})
+				if err := s.CreateNIC(1, e); err != nil {
+					t.Fatalf("CreateNIC(1) = %s", err)
+				}
+
+				// Make sure each RS got sent at the right
+				// times.
+				remaining := test.maxRtrSolicit
+				if remaining > 0 {
+					waitForPkt(test.effectiveMaxRtrSolicitDelay + defaultTimeout)
+					remaining--
+				}
+				for ; remaining > 0; remaining-- {
+					waitForNothing(test.effectiveRtrSolicitInt - defaultTimeout)
+					waitForPkt(2 * defaultTimeout)
+				}
+
+				// Make sure no more RS.
+				if test.effectiveRtrSolicitInt > test.effectiveMaxRtrSolicitDelay {
+					waitForNothing(test.effectiveRtrSolicitInt + defaultTimeout)
+				} else {
+					waitForNothing(test.effectiveMaxRtrSolicitDelay + defaultTimeout)
+				}
+
+				// Make sure the counter got properly
+				// incremented.
+				if got, want := s.Stats().ICMP.V6PacketsSent.RouterSolicit.Value(), uint64(test.maxRtrSolicit); got != want {
+					t.Fatalf("got sent RouterSolicit = %d, want = %d", got, want)
+				}
+			})
+		}
+	})
+}
+
+// TestStopStartSolicitingRouters tests that when forwarding is enabled or
+// disabled, router solicitations are stopped or started, respecitively.
+func TestStopStartSolicitingRouters(t *testing.T) {
+	t.Parallel()
+
+	const interval = 500 * time.Millisecond
+	const delay = time.Second
+	const maxRtrSolicitations = 3
+	e := channel.New(maxRtrSolicitations, 1280, linkAddr1)
+	waitForPkt := func(timeout time.Duration) {
+		t.Helper()
+		select {
+		case p := <-e.C:
+			if p.Proto != header.IPv6ProtocolNumber {
+				t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+			}
+			checker.IPv6(t, p.Pkt.Header.View(),
+				checker.SrcAddr(header.IPv6Any),
+				checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+				checker.TTL(header.NDPHopLimit),
+				checker.NDPRS())
+
+		case <-time.After(timeout):
+			t.Fatal("timed out waiting for packet")
+		}
+	}
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			MaxRtrSolicitations:     maxRtrSolicitations,
+			RtrSolicitationInterval: interval,
+			MaxRtrSolicitationDelay: delay,
+		},
+	})
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// Enable forwarding which should stop router solicitations.
+	s.SetForwarding(true)
+	select {
+	case <-e.C:
+		// A single RS may have been sent before forwarding was enabled.
+		select {
+		case <-e.C:
+			t.Fatal("Should not have sent more than one RS message")
+		case <-time.After(interval + defaultTimeout):
+		}
+	case <-time.After(delay + defaultTimeout):
+	}
+
+	// Enabling forwarding again should do nothing.
+	s.SetForwarding(true)
+	select {
+	case <-e.C:
+		t.Fatal("unexpectedly got a packet after becoming a router")
+	case <-time.After(delay + defaultTimeout):
+	}
+
+	// Disable forwarding which should start router solicitations.
+	s.SetForwarding(false)
+	waitForPkt(delay + defaultTimeout)
+	waitForPkt(interval + defaultTimeout)
+	waitForPkt(interval + defaultTimeout)
+	select {
+	case <-e.C:
+		t.Fatal("unexpectedly got an extra packet after sending out the expected RSs")
+	case <-time.After(interval + defaultTimeout):
+	}
+
+	// Disabling forwarding again should do nothing.
+	s.SetForwarding(false)
+	select {
+	case <-e.C:
+		t.Fatal("unexpectedly got a packet after becoming a router")
+	case <-time.After(delay + defaultTimeout):
+	}
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 071221d5a..1089fdf35 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -177,49 +177,72 @@ func (n *NIC) enable() *tcpip.Error {
 	}
 
 	// Do not auto-generate an IPv6 link-local address for loopback devices.
-	if !n.stack.autoGenIPv6LinkLocal || n.isLoopback() {
-		return nil
-	}
+	if n.stack.autoGenIPv6LinkLocal && !n.isLoopback() {
+		var addr tcpip.Address
+		if oIID := n.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
+			addr = header.LinkLocalAddrWithOpaqueIID(oIID.NICNameFromID(n.ID(), n.name), 0, oIID.SecretKey)
+		} else {
+			l2addr := n.linkEP.LinkAddress()
 
-	var addr tcpip.Address
-	if oIID := n.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
-		addr = header.LinkLocalAddrWithOpaqueIID(oIID.NICNameFromID(n.ID(), n.name), 0, oIID.SecretKey)
-	} else {
-		l2addr := n.linkEP.LinkAddress()
+			// Only attempt to generate the link-local address if we have a valid MAC
+			// address.
+			//
+			// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
+			// LinkEndpoint.LinkAddress) before reaching this point.
+			if !header.IsValidUnicastEthernetAddress(l2addr) {
+				return nil
+			}
 
-		// Only attempt to generate the link-local address if we have a valid MAC
-		// address.
-		//
-		// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
-		// LinkEndpoint.LinkAddress) before reaching this point.
-		if !header.IsValidUnicastEthernetAddress(l2addr) {
-			return nil
+			addr = header.LinkLocalAddr(l2addr)
 		}
 
-		addr = header.LinkLocalAddr(l2addr)
+		if _, err := n.addPermanentAddressLocked(tcpip.ProtocolAddress{
+			Protocol: header.IPv6ProtocolNumber,
+			AddressWithPrefix: tcpip.AddressWithPrefix{
+				Address:   addr,
+				PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen,
+			},
+		}, CanBePrimaryEndpoint); err != nil {
+			return err
+		}
 	}
 
-	_, err := n.addPermanentAddressLocked(tcpip.ProtocolAddress{
-		Protocol: header.IPv6ProtocolNumber,
-		AddressWithPrefix: tcpip.AddressWithPrefix{
-			Address:   addr,
-			PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen,
-		},
-	}, CanBePrimaryEndpoint)
+	// If we are operating as a router, then do not solicit routers since we
+	// won't process the RAs anyways.
+	//
+	// Routers do not process Router Advertisements (RA) the same way a host
+	// does. That is, routers do not learn from RAs (e.g. on-link prefixes
+	// and default routers). Therefore, soliciting RAs from other routers on
+	// a link is unnecessary for routers.
+	if !n.stack.forwarding {
+		n.ndp.startSolicitingRouters()
+	}
 
-	return err
+	return nil
 }
 
 // becomeIPv6Router transitions n into an IPv6 router.
 //
 // When transitioning into an IPv6 router, host-only state (NDP discovered
 // routers, discovered on-link prefixes, and auto-generated addresses) will
-// be cleaned up/invalidated.
+// be cleaned up/invalidated and NDP router solicitations will be stopped.
 func (n *NIC) becomeIPv6Router() {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
 	n.ndp.cleanupHostOnlyState()
+	n.ndp.stopSolicitingRouters()
+}
+
+// becomeIPv6Host transitions n into an IPv6 host.
+//
+// When transitioning into an IPv6 host, NDP router solicitations will be
+// started.
+func (n *NIC) becomeIPv6Host() {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	n.ndp.startSolicitingRouters()
 }
 
 // attachLinkEndpoint attaches the NIC to the endpoint, which will enable it
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 386eb6eec..fc56a6d79 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -750,7 +750,9 @@ func (s *Stack) Stats() tcpip.Stats {
 // SetForwarding enables or disables the packet forwarding between NICs.
 //
 // When forwarding becomes enabled, any host-only state on all NICs will be
-// cleaned up.
+// cleaned up and if IPv6 is enabled, NDP Router Solicitations will be started.
+// When forwarding becomes disabled and if IPv6 is enabled, NDP Router
+// Solicitations will be stopped.
 func (s *Stack) SetForwarding(enable bool) {
 	// TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward.
 	s.mu.Lock()
@@ -772,6 +774,10 @@ func (s *Stack) SetForwarding(enable bool) {
 		for _, nic := range s.nics {
 			nic.becomeIPv6Router()
 		}
+	} else {
+		for _, nic := range s.nics {
+			nic.becomeIPv6Host()
+		}
 	}
 }
 
-- 
cgit v1.2.3


From a7a1f00425c6a742a0c953ae3cb6de513011d41b Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 15 Jan 2020 20:21:20 -0800
Subject: Support upgrading expired/removed IPv6 addresses to permanent SLAAC
 addresses

If a previously added IPv6 address (statically or via SLAAC) was removed, it
would be left in an expired state waiting to be cleaned up if any references to
it were still held. During this time, the same address could be regenerated via
SLAAC, which should be allowed. This change supports this scenario.

When upgrading an endpoint from temporary or permanentExpired to permanent,
respect the new configuration type (static or SLAAC) and deprecated status,
along with the new PrimaryEndpointBehavior (which was already supported).

Test: stack.TestAutoGenAddrAfterRemoval
PiperOrigin-RevId: 289990168
---
 pkg/tcpip/stack/ndp.go      |   2 +-
 pkg/tcpip/stack/ndp_test.go | 125 +++++++++++++++++++++++++++++++++++++++++---
 pkg/tcpip/stack/nic.go      |  30 ++++++++---
 3 files changed, 142 insertions(+), 15 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index acefc356a..c99d387d5 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -994,7 +994,7 @@ func (ndp *ndpState) newAutoGenAddress(prefix tcpip.Subnet, pl, vl time.Duration
 	// If the preferred lifetime is zero, then the address should be considered
 	// deprecated.
 	deprecated := pl == 0
-	ref, err := ndp.nic.addAddressLocked(protocolAddr, FirstPrimaryEndpoint, permanent, slaac, deprecated)
+	ref, err := ndp.nic.addPermanentAddressLocked(protocolAddr, FirstPrimaryEndpoint, slaac, deprecated)
 	if err != nil {
 		log.Fatalf("ndp: error when adding address %s: %s", protocolAddr, err)
 	}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 7c68e8ed4..1a52e0e68 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -35,12 +35,12 @@ import (
 )
 
 const (
-	addr1          = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
-	addr2          = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
-	addr3          = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"
-	linkAddr1      = "\x02\x02\x03\x04\x05\x06"
-	linkAddr2      = "\x02\x02\x03\x04\x05\x07"
-	linkAddr3      = "\x02\x02\x03\x04\x05\x08"
+	addr1          = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	addr2          = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+	addr3          = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03")
+	linkAddr1      = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+	linkAddr2      = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x07")
+	linkAddr3      = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x08")
 	defaultTimeout = 100 * time.Millisecond
 )
 
@@ -2445,6 +2445,119 @@ func TestAutoGenAddrRemoval(t *testing.T) {
 	}
 }
 
+// TestAutoGenAddrAfterRemoval tests adding a SLAAC address that was previously
+// assigned to the NIC but is in the permanentExpired state.
+func TestAutoGenAddrAfterRemoval(t *testing.T) {
+	t.Parallel()
+
+	const nicID = 1
+
+	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+	ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+		t.Helper()
+
+		if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+			t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+		} else if got != addr {
+			t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+		}
+
+		if got := addrForNewConnection(t, s); got != addr.Address {
+			t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+		}
+	}
+
+	// Receive a PI to auto-generate addr1 with a large valid and preferred
+	// lifetime.
+	const largeLifetimeSeconds = 999
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix1, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+	expectAutoGenAddrEvent(addr1, newAddr)
+	expectPrimaryAddr(addr1)
+
+	// Add addr2 as a static address.
+	protoAddr2 := tcpip.ProtocolAddress{
+		Protocol:          header.IPv6ProtocolNumber,
+		AddressWithPrefix: addr2,
+	}
+	if err := s.AddProtocolAddressWithOptions(nicID, protoAddr2, stack.FirstPrimaryEndpoint); err != nil {
+		t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d, %s) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err)
+	}
+	// addr2 should be more preferred now since it is at the front of the primary
+	// list.
+	expectPrimaryAddr(addr2)
+
+	// Get a route using addr2 to increment its reference count then remove it
+	// to leave it in the permanentExpired state.
+	r, err := s.FindRoute(nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, false)
+	if err != nil {
+		t.Fatalf("FindRoute(%d, %s, %s, %d, false): %s", nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, err)
+	}
+	defer r.Release()
+	if err := s.RemoveAddress(nicID, addr2.Address); err != nil {
+		t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, addr2.Address, err)
+	}
+	// addr1 should be preferred again since addr2 is in the expired state.
+	expectPrimaryAddr(addr1)
+
+	// Receive a PI to auto-generate addr2 as valid and preferred.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+	expectAutoGenAddrEvent(addr2, newAddr)
+	// addr2 should be more preferred now that it is closer to the front of the
+	// primary list and not deprecated.
+	expectPrimaryAddr(addr2)
+
+	// Removing the address should result in an invalidation event immediately.
+	// It should still be in the permanentExpired state because r is still held.
+	//
+	// We remove addr2 here to make sure addr2 was marked as a SLAAC address
+	// (it was previously marked as a static address).
+	if err := s.RemoveAddress(1, addr2.Address); err != nil {
+		t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err)
+	}
+	expectAutoGenAddrEvent(addr2, invalidatedAddr)
+	// addr1 should be more preferred since addr2 is in the expired state.
+	expectPrimaryAddr(addr1)
+
+	// Receive a PI to auto-generate addr2 as valid and deprecated.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, 0))
+	expectAutoGenAddrEvent(addr2, newAddr)
+	// addr1 should still be more preferred since addr2 is deprecated, even though
+	// it is closer to the front of the primary list.
+	expectPrimaryAddr(addr1)
+
+	// Receive a PI to refresh addr2's preferred lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto gen addr event")
+	default:
+	}
+	// addr2 should be more preferred now that it is not deprecated.
+	expectPrimaryAddr(addr2)
+
+	if err := s.RemoveAddress(1, addr2.Address); err != nil {
+		t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err)
+	}
+	expectAutoGenAddrEvent(addr2, invalidatedAddr)
+	expectPrimaryAddr(addr1)
+}
+
 // TestAutoGenAddrStaticConflict tests that if SLAAC generates an address that
 // is already assigned to the NIC, the static address remains.
 func TestAutoGenAddrStaticConflict(t *testing.T) {
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 1089fdf35..4452a1302 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -202,7 +202,7 @@ func (n *NIC) enable() *tcpip.Error {
 				Address:   addr,
 				PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen,
 			},
-		}, CanBePrimaryEndpoint); err != nil {
+		}, CanBePrimaryEndpoint, static, false /* deprecated */); err != nil {
 			return err
 		}
 	}
@@ -533,7 +533,12 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 	return ref
 }
 
-func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) (*referencedNetworkEndpoint, *tcpip.Error) {
+// addPermanentAddressLocked adds a permanent address to n.
+//
+// If n already has the address in a non-permanent state,
+// addPermanentAddressLocked will promote it to permanent and update the
+// endpoint with the properties provided.
+func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, configType networkEndpointConfigType, deprecated bool) (*referencedNetworkEndpoint, *tcpip.Error) {
 	id := NetworkEndpointID{protocolAddress.AddressWithPrefix.Address}
 	if ref, ok := n.endpoints[id]; ok {
 		switch ref.getKind() {
@@ -541,10 +546,14 @@ func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, p
 			// The NIC already have a permanent endpoint with that address.
 			return nil, tcpip.ErrDuplicateAddress
 		case permanentExpired, temporary:
-			// Promote the endpoint to become permanent and respect
-			// the new peb.
+			// Promote the endpoint to become permanent and respect the new peb,
+			// configType and deprecated status.
 			if ref.tryIncRef() {
+				// TODO(b/147748385): Perform Duplicate Address Detection when promoting
+				// an IPv6 endpoint to permanent.
 				ref.setKind(permanent)
+				ref.deprecated = deprecated
+				ref.configType = configType
 
 				refs := n.primary[ref.protocol]
 				for i, r := range refs {
@@ -576,9 +585,13 @@ func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, p
 		}
 	}
 
-	return n.addAddressLocked(protocolAddress, peb, permanent, static, false)
+	return n.addAddressLocked(protocolAddress, peb, permanent, configType, deprecated)
 }
 
+// addAddressLocked adds a new protocolAddress to n.
+//
+// If the address is already known by n (irrespective of the state it is in),
+// addAddressLocked does nothing and returns tcpip.ErrDuplicateAddress.
 func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind, configType networkEndpointConfigType, deprecated bool) (*referencedNetworkEndpoint, *tcpip.Error) {
 	// TODO(b/141022673): Validate IP address before adding them.
 
@@ -653,7 +666,7 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 func (n *NIC) AddAddress(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) *tcpip.Error {
 	// Add the endpoint.
 	n.mu.Lock()
-	_, err := n.addPermanentAddressLocked(protocolAddress, peb)
+	_, err := n.addPermanentAddressLocked(protocolAddress, peb, static, false /* deprecated */)
 	n.mu.Unlock()
 
 	return err
@@ -935,7 +948,7 @@ func (n *NIC) joinGroupLocked(protocol tcpip.NetworkProtocolNumber, addr tcpip.A
 				Address:   addr,
 				PrefixLen: netProto.DefaultPrefixLen(),
 			},
-		}, NeverPrimaryEndpoint); err != nil {
+		}, NeverPrimaryEndpoint, static, false /* deprecated */); err != nil {
 			return err
 		}
 	}
@@ -1313,7 +1326,8 @@ type referencedNetworkEndpoint struct {
 	kind networkEndpointKind
 
 	// configType is the method that was used to configure this endpoint.
-	// This must never change after the endpoint is added to a NIC.
+	// This must never change except during endpoint creation and promotion to
+	// permanent.
 	configType networkEndpointConfigType
 
 	// deprecated indicates whether or not the endpoint should be considered
-- 
cgit v1.2.3


From 420d335fc9495ec18a20f710869770d0708d9a49 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Thu, 16 Jan 2020 10:26:23 -0800
Subject: Enable clone syscall support on arm64.

sys_clone has many flavors in Linux, and amd64 chose
a different one from x86(different arguments order).
Ref kernel/fork.c for more info.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I6c8cbc685f4a6e786b171715ab68292fc95cbf48
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/1545 from xiaobo55x:clone 156bd2dfbc63ef5291627b0578ddea77997393b2
PiperOrigin-RevId: 290093953
---
 pkg/sentry/syscalls/linux/BUILD              |  2 ++
 pkg/sentry/syscalls/linux/sys_clone_amd64.go | 35 ++++++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/sys_clone_arm64.go | 35 ++++++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/sys_thread.go      | 13 -----------
 4 files changed, 72 insertions(+), 13 deletions(-)
 create mode 100644 pkg/sentry/syscalls/linux/sys_clone_amd64.go
 create mode 100644 pkg/sentry/syscalls/linux/sys_clone_arm64.go

diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index aa05e208a..430d796ba 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -13,6 +13,8 @@ go_library(
         "sigset.go",
         "sys_aio.go",
         "sys_capability.go",
+        "sys_clone_amd64.go",
+        "sys_clone_arm64.go",
         "sys_epoll.go",
         "sys_eventfd.go",
         "sys_file.go",
diff --git a/pkg/sentry/syscalls/linux/sys_clone_amd64.go b/pkg/sentry/syscalls/linux/sys_clone_amd64.go
new file mode 100644
index 000000000..dd43cf18d
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_clone_amd64.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// Clone implements linux syscall clone(2).
+// sys_clone has so many flavors. We implement the default one in linux 3.11
+// x86_64:
+//    sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val)
+func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := int(args[0].Int())
+	stack := args[1].Pointer()
+	parentTID := args[2].Pointer()
+	childTID := args[3].Pointer()
+	tls := args[4].Pointer()
+	return clone(t, flags, stack, parentTID, childTID, tls)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_clone_arm64.go b/pkg/sentry/syscalls/linux/sys_clone_arm64.go
new file mode 100644
index 000000000..cf68a8949
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_clone_arm64.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// Clone implements linux syscall clone(2).
+// sys_clone has so many flavors, and we implement the default one in linux 3.11
+// arm64(kernel/fork.c with CONFIG_CLONE_BACKWARDS defined in the config file):
+//    sys_clone(clone_flags, newsp, parent_tidptr, tls_val, child_tidptr)
+func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := int(args[0].Int())
+	stack := args[1].Pointer()
+	parentTID := args[2].Pointer()
+	tls := args[3].Pointer()
+	childTID := args[4].Pointer()
+	return clone(t, flags, stack, parentTID, childTID, tls)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 4115116ff..b47c3b5c4 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -220,19 +220,6 @@ func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr
 	return uintptr(ntid), ctrl, err
 }
 
-// Clone implements linux syscall clone(2).
-// sys_clone has so many flavors. We implement the default one in linux 3.11
-// x86_64:
-//    sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val)
-func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	flags := int(args[0].Int())
-	stack := args[1].Pointer()
-	parentTID := args[2].Pointer()
-	childTID := args[3].Pointer()
-	tls := args[4].Pointer()
-	return clone(t, flags, stack, parentTID, childTID, tls)
-}
-
 // Fork implements Linux syscall fork(2).
 func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	// "A call to fork() is equivalent to a call to clone(2) specifying flags
-- 
cgit v1.2.3


From 7b7c31820b83abcfe43f7170eff1f7953f3f27e2 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 16 Jan 2020 12:28:44 -0800
Subject: Add remaining /proc/* and /proc/sys/* files

Except for one under /proc/sys/net/ipv4/tcp_sack.
/proc/pid/* is still incomplete.

Updates #1195

PiperOrigin-RevId: 290120438
---
 pkg/cpuid/cpuid.go                          |  44 ++--
 pkg/sentry/fs/proc/cpuinfo.go               |   8 +-
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go |  40 ++++
 pkg/sentry/fsimpl/proc/BUILD                |  11 +-
 pkg/sentry/fsimpl/proc/filesystem.go        |  11 +
 pkg/sentry/fsimpl/proc/loadavg.go           |  42 ----
 pkg/sentry/fsimpl/proc/meminfo.go           |  79 -------
 pkg/sentry/fsimpl/proc/net.go               | 338 ----------------------------
 pkg/sentry/fsimpl/proc/net_test.go          |  78 -------
 pkg/sentry/fsimpl/proc/stat.go              | 129 -----------
 pkg/sentry/fsimpl/proc/sys.go               |  51 -----
 pkg/sentry/fsimpl/proc/task.go              |  12 +-
 pkg/sentry/fsimpl/proc/tasks.go             |  41 +++-
 pkg/sentry/fsimpl/proc/tasks_files.go       | 245 ++++++++++++++++++++
 pkg/sentry/fsimpl/proc/tasks_net.go         | 337 +++++++++++++++++++++++++++
 pkg/sentry/fsimpl/proc/tasks_sys.go         | 143 ++++++++++++
 pkg/sentry/fsimpl/proc/tasks_sys_test.go    |  78 +++++++
 pkg/sentry/fsimpl/proc/tasks_test.go        |   3 +
 pkg/sentry/fsimpl/proc/version.go           |  70 ------
 19 files changed, 922 insertions(+), 838 deletions(-)
 delete mode 100644 pkg/sentry/fsimpl/proc/loadavg.go
 delete mode 100644 pkg/sentry/fsimpl/proc/meminfo.go
 delete mode 100644 pkg/sentry/fsimpl/proc/net.go
 delete mode 100644 pkg/sentry/fsimpl/proc/net_test.go
 delete mode 100644 pkg/sentry/fsimpl/proc/stat.go
 delete mode 100644 pkg/sentry/fsimpl/proc/sys.go
 create mode 100644 pkg/sentry/fsimpl/proc/tasks_net.go
 create mode 100644 pkg/sentry/fsimpl/proc/tasks_sys.go
 create mode 100644 pkg/sentry/fsimpl/proc/tasks_sys_test.go
 delete mode 100644 pkg/sentry/fsimpl/proc/version.go

diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go
index d37047368..cf50ee53f 100644
--- a/pkg/cpuid/cpuid.go
+++ b/pkg/cpuid/cpuid.go
@@ -657,30 +657,28 @@ func (fs *FeatureSet) FlagsString(cpuinfoOnly bool) string {
 	return strings.Join(s, " ")
 }
 
-// CPUInfo is to generate a section of one cpu in /proc/cpuinfo. This is a
-// minimal /proc/cpuinfo, it is missing some fields like "microcode" that are
+// WriteCPUInfoTo is to generate a section of one cpu in /proc/cpuinfo. This is
+// a minimal /proc/cpuinfo, it is missing some fields like "microcode" that are
 // not always printed in Linux. The bogomips field is simply made up.
-func (fs FeatureSet) CPUInfo(cpu uint) string {
-	var b bytes.Buffer
-	fmt.Fprintf(&b, "processor\t: %d\n", cpu)
-	fmt.Fprintf(&b, "vendor_id\t: %s\n", fs.VendorID)
-	fmt.Fprintf(&b, "cpu family\t: %d\n", ((fs.ExtendedFamily<<4)&0xff)|fs.Family)
-	fmt.Fprintf(&b, "model\t\t: %d\n", ((fs.ExtendedModel<<4)&0xff)|fs.Model)
-	fmt.Fprintf(&b, "model name\t: %s\n", "unknown") // Unknown for now.
-	fmt.Fprintf(&b, "stepping\t: %s\n", "unknown")   // Unknown for now.
-	fmt.Fprintf(&b, "cpu MHz\t\t: %.3f\n", cpuFreqMHz)
-	fmt.Fprintln(&b, "fpu\t\t: yes")
-	fmt.Fprintln(&b, "fpu_exception\t: yes")
-	fmt.Fprintf(&b, "cpuid level\t: %d\n", uint32(xSaveInfo)) // Same as ax in vendorID.
-	fmt.Fprintln(&b, "wp\t\t: yes")
-	fmt.Fprintf(&b, "flags\t\t: %s\n", fs.FlagsString(true))
-	fmt.Fprintf(&b, "bogomips\t: %.02f\n", cpuFreqMHz) // It's bogus anyway.
-	fmt.Fprintf(&b, "clflush size\t: %d\n", fs.CacheLine)
-	fmt.Fprintf(&b, "cache_alignment\t: %d\n", fs.CacheLine)
-	fmt.Fprintf(&b, "address sizes\t: %d bits physical, %d bits virtual\n", 46, 48)
-	fmt.Fprintln(&b, "power management:") // This is always here, but can be blank.
-	fmt.Fprintln(&b, "")                  // The /proc/cpuinfo file ends with an extra newline.
-	return b.String()
+func (fs FeatureSet) WriteCPUInfoTo(cpu uint, b *bytes.Buffer) {
+	fmt.Fprintf(b, "processor\t: %d\n", cpu)
+	fmt.Fprintf(b, "vendor_id\t: %s\n", fs.VendorID)
+	fmt.Fprintf(b, "cpu family\t: %d\n", ((fs.ExtendedFamily<<4)&0xff)|fs.Family)
+	fmt.Fprintf(b, "model\t\t: %d\n", ((fs.ExtendedModel<<4)&0xff)|fs.Model)
+	fmt.Fprintf(b, "model name\t: %s\n", "unknown") // Unknown for now.
+	fmt.Fprintf(b, "stepping\t: %s\n", "unknown")   // Unknown for now.
+	fmt.Fprintf(b, "cpu MHz\t\t: %.3f\n", cpuFreqMHz)
+	fmt.Fprintln(b, "fpu\t\t: yes")
+	fmt.Fprintln(b, "fpu_exception\t: yes")
+	fmt.Fprintf(b, "cpuid level\t: %d\n", uint32(xSaveInfo)) // Same as ax in vendorID.
+	fmt.Fprintln(b, "wp\t\t: yes")
+	fmt.Fprintf(b, "flags\t\t: %s\n", fs.FlagsString(true))
+	fmt.Fprintf(b, "bogomips\t: %.02f\n", cpuFreqMHz) // It's bogus anyway.
+	fmt.Fprintf(b, "clflush size\t: %d\n", fs.CacheLine)
+	fmt.Fprintf(b, "cache_alignment\t: %d\n", fs.CacheLine)
+	fmt.Fprintf(b, "address sizes\t: %d bits physical, %d bits virtual\n", 46, 48)
+	fmt.Fprintln(b, "power management:") // This is always here, but can be blank.
+	fmt.Fprintln(b, "")                  // The /proc/cpuinfo file ends with an extra newline.
 }
 
 const (
diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go
index 3edf36780..6330337eb 100644
--- a/pkg/sentry/fs/proc/cpuinfo.go
+++ b/pkg/sentry/fs/proc/cpuinfo.go
@@ -15,6 +15,8 @@
 package proc
 
 import (
+	"bytes"
+
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -27,9 +29,9 @@ func newCPUInfo(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		// Kernel is always initialized with a FeatureSet.
 		panic("cpuinfo read with nil FeatureSet")
 	}
-	contents := make([]byte, 0, 1024)
+	var buf bytes.Buffer
 	for i, max := uint(0), k.ApplicationCores(); i < max; i++ {
-		contents = append(contents, []byte(features.CPUInfo(i))...)
+		features.WriteCPUInfoTo(i, &buf)
 	}
-	return newStaticProcInode(ctx, msrc, contents)
+	return newStaticProcInode(ctx, msrc, buf.Bytes())
 }
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 1d469a0db..6aff3d39a 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -510,3 +510,43 @@ type InodeSymlink struct {
 func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
 	return nil, syserror.ELOOP
 }
+
+// StaticDirectory is a standard implementation of a directory with static
+// contents.
+//
+// +stateify savable
+type StaticDirectory struct {
+	InodeNotSymlink
+	InodeDirectoryNoNewChildren
+	InodeAttrs
+	InodeNoDynamicLookup
+	OrderedChildren
+}
+
+var _ Inode = (*StaticDirectory)(nil)
+
+// NewStaticDir creates a new static directory and returns its dentry.
+func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry {
+	if perm&^linux.PermissionsMask != 0 {
+		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
+	}
+
+	inode := &StaticDirectory{}
+	inode.InodeAttrs.Init(creds, ino, linux.ModeDirectory|perm)
+
+	dentry := &Dentry{}
+	dentry.Init(inode)
+
+	inode.OrderedChildren.Init(OrderedChildrenOptions{})
+	links := inode.OrderedChildren.Populate(dentry, children)
+	inode.IncLinks(links)
+
+	return dentry
+}
+
+// Open implements kernfs.Inode.
+func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	fd := &GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, flags)
+	return fd.VFSFileDescription(), nil
+}
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 1f44b3217..6cd18cec8 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -7,17 +7,13 @@ go_library(
     name = "proc",
     srcs = [
         "filesystem.go",
-        "loadavg.go",
-        "meminfo.go",
         "mounts.go",
-        "net.go",
-        "stat.go",
-        "sys.go",
         "task.go",
         "task_files.go",
         "tasks.go",
         "tasks_files.go",
-        "version.go",
+        "tasks_net.go",
+        "tasks_sys.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc",
     deps = [
@@ -30,6 +26,7 @@ go_library(
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
         "//pkg/sentry/limits",
         "//pkg/sentry/mm",
         "//pkg/sentry/socket",
@@ -47,7 +44,7 @@ go_test(
     size = "small",
     srcs = [
         "boot_test.go",
-        "net_test.go",
+        "tasks_sys_test.go",
         "tasks_test.go",
     ],
     embed = [":proc"],
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index d09182c77..e9cb7895f 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -67,3 +67,14 @@ func newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode d
 	d.Init(inode)
 	return d
 }
+
+type staticFile struct {
+	kernfs.DynamicBytesFile
+	vfs.StaticData
+}
+
+var _ dynamicInode = (*staticFile)(nil)
+
+func newStaticFile(data string) *staticFile {
+	return &staticFile{StaticData: vfs.StaticData{Data: data}}
+}
diff --git a/pkg/sentry/fsimpl/proc/loadavg.go b/pkg/sentry/fsimpl/proc/loadavg.go
deleted file mode 100644
index 5351d86e8..000000000
--- a/pkg/sentry/fsimpl/proc/loadavg.go
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
-)
-
-// loadavgData backs /proc/loadavg.
-//
-// +stateify savable
-type loadavgData struct {
-	kernfs.DynamicBytesFile
-}
-
-var _ dynamicInode = (*loadavgData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	// TODO(b/62345059): Include real data in fields.
-	// Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods.
-	// Column 4-5: currently running processes and the total number of processes.
-	// Column 6: the last process ID used.
-	fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0)
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/meminfo.go b/pkg/sentry/fsimpl/proc/meminfo.go
deleted file mode 100644
index cbdd4f3fc..000000000
--- a/pkg/sentry/fsimpl/proc/meminfo.go
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-)
-
-// meminfoData implements vfs.DynamicBytesSource for /proc/meminfo.
-//
-// +stateify savable
-type meminfoData struct {
-	kernfs.DynamicBytesFile
-
-	// k is the owning Kernel.
-	k *kernel.Kernel
-}
-
-var _ dynamicInode = (*meminfoData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	mf := d.k.MemoryFile()
-	mf.UpdateUsage()
-	snapshot, totalUsage := usage.MemoryAccounting.Copy()
-	totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage)
-	anon := snapshot.Anonymous + snapshot.Tmpfs
-	file := snapshot.PageCache + snapshot.Mapped
-	// We don't actually have active/inactive LRUs, so just make up numbers.
-	activeFile := (file / 2) &^ (usermem.PageSize - 1)
-	inactiveFile := file - activeFile
-
-	fmt.Fprintf(buf, "MemTotal:       %8d kB\n", totalSize/1024)
-	memFree := (totalSize - totalUsage) / 1024
-	// We use MemFree as MemAvailable because we don't swap.
-	// TODO(rahat): When reclaim is implemented the value of MemAvailable
-	// should change.
-	fmt.Fprintf(buf, "MemFree:        %8d kB\n", memFree)
-	fmt.Fprintf(buf, "MemAvailable:   %8d kB\n", memFree)
-	fmt.Fprintf(buf, "Buffers:               0 kB\n") // memory usage by block devices
-	fmt.Fprintf(buf, "Cached:         %8d kB\n", (file+snapshot.Tmpfs)/1024)
-	// Emulate a system with no swap, which disables inactivation of anon pages.
-	fmt.Fprintf(buf, "SwapCache:             0 kB\n")
-	fmt.Fprintf(buf, "Active:         %8d kB\n", (anon+activeFile)/1024)
-	fmt.Fprintf(buf, "Inactive:       %8d kB\n", inactiveFile/1024)
-	fmt.Fprintf(buf, "Active(anon):   %8d kB\n", anon/1024)
-	fmt.Fprintf(buf, "Inactive(anon):        0 kB\n")
-	fmt.Fprintf(buf, "Active(file):   %8d kB\n", activeFile/1024)
-	fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024)
-	fmt.Fprintf(buf, "Unevictable:           0 kB\n") // TODO(b/31823263)
-	fmt.Fprintf(buf, "Mlocked:               0 kB\n") // TODO(b/31823263)
-	fmt.Fprintf(buf, "SwapTotal:             0 kB\n")
-	fmt.Fprintf(buf, "SwapFree:              0 kB\n")
-	fmt.Fprintf(buf, "Dirty:                 0 kB\n")
-	fmt.Fprintf(buf, "Writeback:             0 kB\n")
-	fmt.Fprintf(buf, "AnonPages:      %8d kB\n", anon/1024)
-	fmt.Fprintf(buf, "Mapped:         %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know
-	fmt.Fprintf(buf, "Shmem:          %8d kB\n", snapshot.Tmpfs/1024)
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/net.go b/pkg/sentry/fsimpl/proc/net.go
deleted file mode 100644
index fd46eebf8..000000000
--- a/pkg/sentry/fsimpl/proc/net.go
+++ /dev/null
@@ -1,338 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/inet"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/socket"
-	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
-	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-)
-
-// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6.
-//
-// +stateify savable
-type ifinet6 struct {
-	s inet.Stack
-}
-
-var _ vfs.DynamicBytesSource = (*ifinet6)(nil)
-
-func (n *ifinet6) contents() []string {
-	var lines []string
-	nics := n.s.Interfaces()
-	for id, naddrs := range n.s.InterfaceAddrs() {
-		nic, ok := nics[id]
-		if !ok {
-			// NIC was added after NICNames was called. We'll just
-			// ignore it.
-			continue
-		}
-
-		for _, a := range naddrs {
-			// IPv6 only.
-			if a.Family != linux.AF_INET6 {
-				continue
-			}
-
-			// Fields:
-			// IPv6 address displayed in 32 hexadecimal chars without colons
-			// Netlink device number (interface index) in hexadecimal (use nic id)
-			// Prefix length in hexadecimal
-			// Scope value (use 0)
-			// Interface flags
-			// Device name
-			lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name))
-		}
-	}
-	return lines
-}
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	for _, l := range n.contents() {
-		buf.WriteString(l)
-	}
-	return nil
-}
-
-// netDev implements vfs.DynamicBytesSource for /proc/net/dev.
-//
-// +stateify savable
-type netDev struct {
-	s inet.Stack
-}
-
-var _ vfs.DynamicBytesSource = (*netDev)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	interfaces := n.s.Interfaces()
-	buf.WriteString("Inter-|   Receive                                                |  Transmit\n")
-	buf.WriteString(" face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed\n")
-
-	for _, i := range interfaces {
-		// Implements the same format as
-		// net/core/net-procfs.c:dev_seq_printf_stats.
-		var stats inet.StatDev
-		if err := n.s.Statistics(&stats, i.Name); err != nil {
-			log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err)
-			continue
-		}
-		fmt.Fprintf(
-			buf,
-			"%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n",
-			i.Name,
-			// Received
-			stats[0], // bytes
-			stats[1], // packets
-			stats[2], // errors
-			stats[3], // dropped
-			stats[4], // fifo
-			stats[5], // frame
-			stats[6], // compressed
-			stats[7], // multicast
-			// Transmitted
-			stats[8],  // bytes
-			stats[9],  // packets
-			stats[10], // errors
-			stats[11], // dropped
-			stats[12], // fifo
-			stats[13], // frame
-			stats[14], // compressed
-			stats[15], // multicast
-		)
-	}
-
-	return nil
-}
-
-// netUnix implements vfs.DynamicBytesSource for /proc/net/unix.
-//
-// +stateify savable
-type netUnix struct {
-	k *kernel.Kernel
-}
-
-var _ vfs.DynamicBytesSource = (*netUnix)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	buf.WriteString("Num       RefCount Protocol Flags    Type St Inode Path\n")
-	for _, se := range n.k.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
-			continue
-		}
-		sfile := s.(*fs.File)
-		if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
-			s.DecRef()
-			// Not a unix socket.
-			continue
-		}
-		sops := sfile.FileOperations.(*unix.SocketOperations)
-
-		addr, err := sops.Endpoint().GetLocalAddress()
-		if err != nil {
-			log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err)
-			addr.Addr = "<unknown>"
-		}
-
-		sockFlags := 0
-		if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok {
-			if ce.Listening() {
-				// For unix domain sockets, linux reports a single flag
-				// value if the socket is listening, of __SO_ACCEPTCON.
-				sockFlags = linux.SO_ACCEPTCON
-			}
-		}
-
-		// In the socket entry below, the value for the 'Num' field requires
-		// some consideration. Linux prints the address to the struct
-		// unix_sock representing a socket in the kernel, but may redact the
-		// value for unprivileged users depending on the kptr_restrict
-		// sysctl.
-		//
-		// One use for this field is to allow a privileged user to
-		// introspect into the kernel memory to determine information about
-		// a socket not available through procfs, such as the socket's peer.
-		//
-		// In gvisor, returning a pointer to our internal structures would
-		// be pointless, as it wouldn't match the memory layout for struct
-		// unix_sock, making introspection difficult. We could populate a
-		// struct unix_sock with the appropriate data, but even that
-		// requires consideration for which kernel version to emulate, as
-		// the definition of this struct changes over time.
-		//
-		// For now, we always redact this pointer.
-		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d",
-			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
-			sfile.ReadRefs()-1,            // RefCount, don't count our own ref.
-			0,                             // Protocol, always 0 for UDS.
-			sockFlags,                     // Flags.
-			sops.Endpoint().Type(),        // Type.
-			sops.State(),                  // State.
-			sfile.InodeID(),               // Inode.
-		)
-
-		// Path
-		if len(addr.Addr) != 0 {
-			if addr.Addr[0] == 0 {
-				// Abstract path.
-				fmt.Fprintf(buf, " @%s", string(addr.Addr[1:]))
-			} else {
-				fmt.Fprintf(buf, " %s", string(addr.Addr))
-			}
-		}
-		fmt.Fprintf(buf, "\n")
-
-		s.DecRef()
-	}
-	return nil
-}
-
-// netTCP implements vfs.DynamicBytesSource for /proc/net/tcp.
-//
-// +stateify savable
-type netTCP struct {
-	k *kernel.Kernel
-}
-
-var _ vfs.DynamicBytesSource = (*netTCP)(nil)
-
-func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	t := kernel.TaskFromContext(ctx)
-	buf.WriteString("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode                                                     \n")
-	for _, se := range n.k.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock)
-			continue
-		}
-		sfile := s.(*fs.File)
-		sops, ok := sfile.FileOperations.(socket.Socket)
-		if !ok {
-			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
-		}
-		if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) {
-			s.DecRef()
-			// Not tcp4 sockets.
-			continue
-		}
-
-		// Linux's documentation for the fields below can be found at
-		// https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt.
-		// For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock().
-		// Note that the header doesn't contain labels for all the fields.
-
-		// Field: sl; entry number.
-		fmt.Fprintf(buf, "%4d: ", se.ID)
-
-		portBuf := make([]byte, 2)
-
-		// Field: local_adddress.
-		var localAddr linux.SockAddrInet
-		if local, _, err := sops.GetSockName(t); err == nil {
-			localAddr = *local.(*linux.SockAddrInet)
-		}
-		binary.LittleEndian.PutUint16(portBuf, localAddr.Port)
-		fmt.Fprintf(buf, "%08X:%04X ",
-			binary.LittleEndian.Uint32(localAddr.Addr[:]),
-			portBuf)
-
-		// Field: rem_address.
-		var remoteAddr linux.SockAddrInet
-		if remote, _, err := sops.GetPeerName(t); err == nil {
-			remoteAddr = *remote.(*linux.SockAddrInet)
-		}
-		binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port)
-		fmt.Fprintf(buf, "%08X:%04X ",
-			binary.LittleEndian.Uint32(remoteAddr.Addr[:]),
-			portBuf)
-
-		// Field: state; socket state.
-		fmt.Fprintf(buf, "%02X ", sops.State())
-
-		// Field: tx_queue, rx_queue; number of packets in the transmit and
-		// receive queue. Unimplemented.
-		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
-
-		// Field: tr, tm->when; timer active state and number of jiffies
-		// until timer expires. Unimplemented.
-		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
-
-		// Field: retrnsmt; number of unrecovered RTO timeouts.
-		// Unimplemented.
-		fmt.Fprintf(buf, "%08X ", 0)
-
-		// Field: uid.
-		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
-		if err != nil {
-			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
-			fmt.Fprintf(buf, "%5d ", 0)
-		} else {
-			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
-		}
-
-		// Field: timeout; number of unanswered 0-window probes.
-		// Unimplemented.
-		fmt.Fprintf(buf, "%8d ", 0)
-
-		// Field: inode.
-		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
-
-		// Field: refcount. Don't count the ref we obtain while deferencing
-		// the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
-
-		// Field: Socket struct address. Redacted due to the same reason as
-		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
-		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
-
-		// Field: retransmit timeout. Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: predicted tick of soft clock (delayed ACK control data).
-		// Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: (ack.quick<<1)|ack.pingpong, Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: sending congestion window, Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: Slow start size threshold, -1 if threshold >= 0xFFFF.
-		// Unimplemented, report as large threshold.
-		fmt.Fprintf(buf, "%d", -1)
-
-		fmt.Fprintf(buf, "\n")
-
-		s.DecRef()
-	}
-
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/net_test.go b/pkg/sentry/fsimpl/proc/net_test.go
deleted file mode 100644
index 20a77a8ca..000000000
--- a/pkg/sentry/fsimpl/proc/net_test.go
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"reflect"
-	"testing"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/inet"
-)
-
-func newIPv6TestStack() *inet.TestStack {
-	s := inet.NewTestStack()
-	s.SupportsIPv6Flag = true
-	return s
-}
-
-func TestIfinet6NoAddresses(t *testing.T) {
-	n := &ifinet6{s: newIPv6TestStack()}
-	var buf bytes.Buffer
-	n.Generate(contexttest.Context(t), &buf)
-	if buf.Len() > 0 {
-		t.Errorf("n.Generate() generated = %v, want = %v", buf.Bytes(), []byte{})
-	}
-}
-
-func TestIfinet6(t *testing.T) {
-	s := newIPv6TestStack()
-	s.InterfacesMap[1] = inet.Interface{Name: "eth0"}
-	s.InterfaceAddrsMap[1] = []inet.InterfaceAddr{
-		{
-			Family:    linux.AF_INET6,
-			PrefixLen: 128,
-			Addr:      []byte("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"),
-		},
-	}
-	s.InterfacesMap[2] = inet.Interface{Name: "eth1"}
-	s.InterfaceAddrsMap[2] = []inet.InterfaceAddr{
-		{
-			Family:    linux.AF_INET6,
-			PrefixLen: 128,
-			Addr:      []byte("\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),
-		},
-	}
-	want := map[string]struct{}{
-		"000102030405060708090a0b0c0d0e0f 01 80 00 00     eth0\n": {},
-		"101112131415161718191a1b1c1d1e1f 02 80 00 00     eth1\n": {},
-	}
-
-	n := &ifinet6{s: s}
-	contents := n.contents()
-	if len(contents) != len(want) {
-		t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want))
-	}
-	got := map[string]struct{}{}
-	for _, l := range contents {
-		got[l] = struct{}{}
-	}
-
-	if !reflect.DeepEqual(got, want) {
-		t.Errorf("Got n.contents() = %v, want = %v", got, want)
-	}
-}
diff --git a/pkg/sentry/fsimpl/proc/stat.go b/pkg/sentry/fsimpl/proc/stat.go
deleted file mode 100644
index 50894a534..000000000
--- a/pkg/sentry/fsimpl/proc/stat.go
+++ /dev/null
@@ -1,129 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-)
-
-// cpuStats contains the breakdown of CPU time for /proc/stat.
-type cpuStats struct {
-	// user is time spent in userspace tasks with non-positive niceness.
-	user uint64
-
-	// nice is time spent in userspace tasks with positive niceness.
-	nice uint64
-
-	// system is time spent in non-interrupt kernel context.
-	system uint64
-
-	// idle is time spent idle.
-	idle uint64
-
-	// ioWait is time spent waiting for IO.
-	ioWait uint64
-
-	// irq is time spent in interrupt context.
-	irq uint64
-
-	// softirq is time spent in software interrupt context.
-	softirq uint64
-
-	// steal is involuntary wait time.
-	steal uint64
-
-	// guest is time spent in guests with non-positive niceness.
-	guest uint64
-
-	// guestNice is time spent in guests with positive niceness.
-	guestNice uint64
-}
-
-// String implements fmt.Stringer.
-func (c cpuStats) String() string {
-	return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice)
-}
-
-// statData implements vfs.DynamicBytesSource for /proc/stat.
-//
-// +stateify savable
-type statData struct {
-	kernfs.DynamicBytesFile
-
-	// k is the owning Kernel.
-	k *kernel.Kernel
-}
-
-var _ dynamicInode = (*statData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	// TODO(b/37226836): We currently export only zero CPU stats. We could
-	// at least provide some aggregate stats.
-	var cpu cpuStats
-	fmt.Fprintf(buf, "cpu  %s\n", cpu)
-
-	for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ {
-		fmt.Fprintf(buf, "cpu%d %s\n", c, cpu)
-	}
-
-	// The total number of interrupts is dependent on the CPUs and PCI
-	// devices on the system. See arch_probe_nr_irqs.
-	//
-	// Since we don't report real interrupt stats, just choose an arbitrary
-	// value from a representative VM.
-	const numInterrupts = 256
-
-	// The Kernel doesn't handle real interrupts, so report all zeroes.
-	// TODO(b/37226836): We could count page faults as #PF.
-	fmt.Fprintf(buf, "intr 0") // total
-	for i := 0; i < numInterrupts; i++ {
-		fmt.Fprintf(buf, " 0")
-	}
-	fmt.Fprintf(buf, "\n")
-
-	// Total number of context switches.
-	// TODO(b/37226836): Count this.
-	fmt.Fprintf(buf, "ctxt 0\n")
-
-	// CLOCK_REALTIME timestamp from boot, in seconds.
-	fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds())
-
-	// Total number of clones.
-	// TODO(b/37226836): Count this.
-	fmt.Fprintf(buf, "processes 0\n")
-
-	// Number of runnable tasks.
-	// TODO(b/37226836): Count this.
-	fmt.Fprintf(buf, "procs_running 0\n")
-
-	// Number of tasks waiting on IO.
-	// TODO(b/37226836): Count this.
-	fmt.Fprintf(buf, "procs_blocked 0\n")
-
-	// Number of each softirq handled.
-	fmt.Fprintf(buf, "softirq 0") // total
-	for i := 0; i < linux.NumSoftIRQ; i++ {
-		fmt.Fprintf(buf, " 0")
-	}
-	fmt.Fprintf(buf, "\n")
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/sys.go b/pkg/sentry/fsimpl/proc/sys.go
deleted file mode 100644
index b88256e12..000000000
--- a/pkg/sentry/fsimpl/proc/sys.go
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-)
-
-// mmapMinAddrData implements vfs.DynamicBytesSource for
-// /proc/sys/vm/mmap_min_addr.
-//
-// +stateify savable
-type mmapMinAddrData struct {
-	k *kernel.Kernel
-}
-
-var _ vfs.DynamicBytesSource = (*mmapMinAddrData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress())
-	return nil
-}
-
-// +stateify savable
-type overcommitMemory struct{}
-
-var _ vfs.DynamicBytesSource = (*overcommitMemory)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (d *overcommitMemory) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "0\n")
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 11a64c777..5a384817f 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -50,15 +50,15 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames
 		//"fd":        newFdDir(t, msrc),
 		//"fdinfo":    newFdInfoDir(t, msrc),
 		//"gid_map":   newGIDMap(t, msrc),
-		"io":   newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, newIO(task, isThreadGroup)),
-		"maps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &mapsData{task: task}),
+		"io":   newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)),
+		"maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}),
 		//"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
 		//"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
 		//"ns":        newNamespaceDir(t, msrc),
-		"smaps":  newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &smapsData{task: task}),
-		"stat":   newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}),
-		"statm":  newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statmData{t: task}),
-		"status": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statusData{t: task, pidns: pidns}),
+		"smaps":  newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}),
+		"stat":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}),
+		"statm":  newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{t: task}),
+		"status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{t: task, pidns: pidns}),
 		//"uid_map":   newUIDMap(t, msrc),
 	}
 	if isThreadGroup {
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index d8f92d52f..72315d25c 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -15,6 +15,7 @@
 package proc
 
 import (
+	"bytes"
 	"sort"
 	"strconv"
 
@@ -28,9 +29,8 @@ import (
 )
 
 const (
-	defaultPermission = 0444
-	selfName          = "self"
-	threadSelfName    = "thread-self"
+	selfName       = "self"
+	threadSelfName = "thread-self"
 )
 
 // InoGenerator generates unique inode numbers for a given filesystem.
@@ -61,15 +61,15 @@ var _ kernfs.Inode = (*tasksInode)(nil)
 func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace) (*tasksInode, *kernfs.Dentry) {
 	root := auth.NewRootCredentials(pidns.UserNamespace())
 	contents := map[string]*kernfs.Dentry{
-		//"cpuinfo":     newCPUInfo(ctx, msrc),
-		//"filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc),
-		"loadavg": newDentry(root, inoGen.NextIno(), defaultPermission, &loadavgData{}),
-		"meminfo": newDentry(root, inoGen.NextIno(), defaultPermission, &meminfoData{k: k}),
-		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), defaultPermission, "self/mounts"),
-		"stat":    newDentry(root, inoGen.NextIno(), defaultPermission, &statData{k: k}),
-		//"uptime":      newUptime(ctx, msrc),
-		//"version": newVersionData(root, inoGen.NextIno(), k),
-		"version": newDentry(root, inoGen.NextIno(), defaultPermission, &versionData{k: k}),
+		"cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))),
+		//"filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}),
+		"loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}),
+		"sys":     newSysDir(root, inoGen),
+		"meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}),
+		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), 0777, "self/mounts"),
+		"stat":    newDentry(root, inoGen.NextIno(), 0444, &statData{}),
+		"uptime":  newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}),
+		"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}),
 	}
 
 	inode := &tasksInode{
@@ -216,3 +216,20 @@ func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx {
 
 	return stat
 }
+
+func cpuInfoData(k *kernel.Kernel) string {
+	features := k.FeatureSet()
+	if features == nil {
+		// Kernel is always initialized with a FeatureSet.
+		panic("cpuinfo read with nil FeatureSet")
+	}
+	var buf bytes.Buffer
+	for i, max := uint(0), k.ApplicationCores(); i < max; i++ {
+		features.WriteCPUInfoTo(i, &buf)
+	}
+	return buf.String()
+}
+
+func shmData(v uint64) dynamicInode {
+	return newStaticFile(strconv.FormatUint(v, 10))
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 91f30a798..ad3760e39 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -15,6 +15,7 @@
 package proc
 
 import (
+	"bytes"
 	"fmt"
 	"strconv"
 
@@ -23,6 +24,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -90,3 +94,244 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
 	}
 	return fmt.Sprintf("%d/task/%d", tgid, tid), nil
 }
+
+// cpuStats contains the breakdown of CPU time for /proc/stat.
+type cpuStats struct {
+	// user is time spent in userspace tasks with non-positive niceness.
+	user uint64
+
+	// nice is time spent in userspace tasks with positive niceness.
+	nice uint64
+
+	// system is time spent in non-interrupt kernel context.
+	system uint64
+
+	// idle is time spent idle.
+	idle uint64
+
+	// ioWait is time spent waiting for IO.
+	ioWait uint64
+
+	// irq is time spent in interrupt context.
+	irq uint64
+
+	// softirq is time spent in software interrupt context.
+	softirq uint64
+
+	// steal is involuntary wait time.
+	steal uint64
+
+	// guest is time spent in guests with non-positive niceness.
+	guest uint64
+
+	// guestNice is time spent in guests with positive niceness.
+	guestNice uint64
+}
+
+// String implements fmt.Stringer.
+func (c cpuStats) String() string {
+	return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice)
+}
+
+// statData implements vfs.DynamicBytesSource for /proc/stat.
+//
+// +stateify savable
+type statData struct {
+	kernfs.DynamicBytesFile
+
+	// k is the owning Kernel.
+	k *kernel.Kernel
+}
+
+var _ dynamicInode = (*statData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	// TODO(b/37226836): We currently export only zero CPU stats. We could
+	// at least provide some aggregate stats.
+	var cpu cpuStats
+	fmt.Fprintf(buf, "cpu  %s\n", cpu)
+
+	for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ {
+		fmt.Fprintf(buf, "cpu%d %s\n", c, cpu)
+	}
+
+	// The total number of interrupts is dependent on the CPUs and PCI
+	// devices on the system. See arch_probe_nr_irqs.
+	//
+	// Since we don't report real interrupt stats, just choose an arbitrary
+	// value from a representative VM.
+	const numInterrupts = 256
+
+	// The Kernel doesn't handle real interrupts, so report all zeroes.
+	// TODO(b/37226836): We could count page faults as #PF.
+	fmt.Fprintf(buf, "intr 0") // total
+	for i := 0; i < numInterrupts; i++ {
+		fmt.Fprintf(buf, " 0")
+	}
+	fmt.Fprintf(buf, "\n")
+
+	// Total number of context switches.
+	// TODO(b/37226836): Count this.
+	fmt.Fprintf(buf, "ctxt 0\n")
+
+	// CLOCK_REALTIME timestamp from boot, in seconds.
+	fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds())
+
+	// Total number of clones.
+	// TODO(b/37226836): Count this.
+	fmt.Fprintf(buf, "processes 0\n")
+
+	// Number of runnable tasks.
+	// TODO(b/37226836): Count this.
+	fmt.Fprintf(buf, "procs_running 0\n")
+
+	// Number of tasks waiting on IO.
+	// TODO(b/37226836): Count this.
+	fmt.Fprintf(buf, "procs_blocked 0\n")
+
+	// Number of each softirq handled.
+	fmt.Fprintf(buf, "softirq 0") // total
+	for i := 0; i < linux.NumSoftIRQ; i++ {
+		fmt.Fprintf(buf, " 0")
+	}
+	fmt.Fprintf(buf, "\n")
+	return nil
+}
+
+// loadavgData backs /proc/loadavg.
+//
+// +stateify savable
+type loadavgData struct {
+	kernfs.DynamicBytesFile
+}
+
+var _ dynamicInode = (*loadavgData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	// TODO(b/62345059): Include real data in fields.
+	// Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods.
+	// Column 4-5: currently running processes and the total number of processes.
+	// Column 6: the last process ID used.
+	fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0)
+	return nil
+}
+
+// meminfoData implements vfs.DynamicBytesSource for /proc/meminfo.
+//
+// +stateify savable
+type meminfoData struct {
+	kernfs.DynamicBytesFile
+
+	// k is the owning Kernel.
+	k *kernel.Kernel
+}
+
+var _ dynamicInode = (*meminfoData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	mf := d.k.MemoryFile()
+	mf.UpdateUsage()
+	snapshot, totalUsage := usage.MemoryAccounting.Copy()
+	totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage)
+	anon := snapshot.Anonymous + snapshot.Tmpfs
+	file := snapshot.PageCache + snapshot.Mapped
+	// We don't actually have active/inactive LRUs, so just make up numbers.
+	activeFile := (file / 2) &^ (usermem.PageSize - 1)
+	inactiveFile := file - activeFile
+
+	fmt.Fprintf(buf, "MemTotal:       %8d kB\n", totalSize/1024)
+	memFree := (totalSize - totalUsage) / 1024
+	// We use MemFree as MemAvailable because we don't swap.
+	// TODO(rahat): When reclaim is implemented the value of MemAvailable
+	// should change.
+	fmt.Fprintf(buf, "MemFree:        %8d kB\n", memFree)
+	fmt.Fprintf(buf, "MemAvailable:   %8d kB\n", memFree)
+	fmt.Fprintf(buf, "Buffers:               0 kB\n") // memory usage by block devices
+	fmt.Fprintf(buf, "Cached:         %8d kB\n", (file+snapshot.Tmpfs)/1024)
+	// Emulate a system with no swap, which disables inactivation of anon pages.
+	fmt.Fprintf(buf, "SwapCache:             0 kB\n")
+	fmt.Fprintf(buf, "Active:         %8d kB\n", (anon+activeFile)/1024)
+	fmt.Fprintf(buf, "Inactive:       %8d kB\n", inactiveFile/1024)
+	fmt.Fprintf(buf, "Active(anon):   %8d kB\n", anon/1024)
+	fmt.Fprintf(buf, "Inactive(anon):        0 kB\n")
+	fmt.Fprintf(buf, "Active(file):   %8d kB\n", activeFile/1024)
+	fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024)
+	fmt.Fprintf(buf, "Unevictable:           0 kB\n") // TODO(b/31823263)
+	fmt.Fprintf(buf, "Mlocked:               0 kB\n") // TODO(b/31823263)
+	fmt.Fprintf(buf, "SwapTotal:             0 kB\n")
+	fmt.Fprintf(buf, "SwapFree:              0 kB\n")
+	fmt.Fprintf(buf, "Dirty:                 0 kB\n")
+	fmt.Fprintf(buf, "Writeback:             0 kB\n")
+	fmt.Fprintf(buf, "AnonPages:      %8d kB\n", anon/1024)
+	fmt.Fprintf(buf, "Mapped:         %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know
+	fmt.Fprintf(buf, "Shmem:          %8d kB\n", snapshot.Tmpfs/1024)
+	return nil
+}
+
+// uptimeData implements vfs.DynamicBytesSource for /proc/uptime.
+//
+// +stateify savable
+type uptimeData struct {
+	kernfs.DynamicBytesFile
+}
+
+var _ dynamicInode = (*uptimeData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	k := kernel.KernelFromContext(ctx)
+	now := time.NowFromContext(ctx)
+
+	// Pretend that we've spent zero time sleeping (second number).
+	fmt.Fprintf(buf, "%.2f 0.00\n", now.Sub(k.Timekeeper().BootTime()).Seconds())
+	return nil
+}
+
+// versionData implements vfs.DynamicBytesSource for /proc/version.
+//
+// +stateify savable
+type versionData struct {
+	kernfs.DynamicBytesFile
+
+	// k is the owning Kernel.
+	k *kernel.Kernel
+}
+
+var _ dynamicInode = (*versionData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	init := v.k.GlobalInit()
+	if init == nil {
+		// Attempted to read before the init Task is created. This can
+		// only occur during startup, which should never need to read
+		// this file.
+		panic("Attempted to read version before initial Task is available")
+	}
+
+	// /proc/version takes the form:
+	//
+	// "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST)
+	// (COMPILER_VERSION) VERSION"
+	//
+	// where:
+	// - SYSNAME, RELEASE, and VERSION are the same as returned by
+	// sys_utsname
+	// - COMPILE_USER is the user that build the kernel
+	// - COMPILE_HOST is the hostname of the machine on which the kernel
+	// was built
+	// - COMPILER_VERSION is the version reported by the building compiler
+	//
+	// Since we don't really want to expose build information to
+	// applications, those fields are omitted.
+	//
+	// FIXME(mpratt): Using Version from the init task SyscallTable
+	// disregards the different version a task may have (e.g., in a uts
+	// namespace).
+	ver := init.Leader().SyscallTable().Version
+	fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go
new file mode 100644
index 000000000..06dc43c26
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks_net.go
@@ -0,0 +1,337 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6.
+//
+// +stateify savable
+type ifinet6 struct {
+	s inet.Stack
+}
+
+var _ vfs.DynamicBytesSource = (*ifinet6)(nil)
+
+func (n *ifinet6) contents() []string {
+	var lines []string
+	nics := n.s.Interfaces()
+	for id, naddrs := range n.s.InterfaceAddrs() {
+		nic, ok := nics[id]
+		if !ok {
+			// NIC was added after NICNames was called. We'll just ignore it.
+			continue
+		}
+
+		for _, a := range naddrs {
+			// IPv6 only.
+			if a.Family != linux.AF_INET6 {
+				continue
+			}
+
+			// Fields:
+			// IPv6 address displayed in 32 hexadecimal chars without colons
+			// Netlink device number (interface index) in hexadecimal (use nic id)
+			// Prefix length in hexadecimal
+			// Scope value (use 0)
+			// Interface flags
+			// Device name
+			lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name))
+		}
+	}
+	return lines
+}
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	for _, l := range n.contents() {
+		buf.WriteString(l)
+	}
+	return nil
+}
+
+// netDev implements vfs.DynamicBytesSource for /proc/net/dev.
+//
+// +stateify savable
+type netDev struct {
+	s inet.Stack
+}
+
+var _ vfs.DynamicBytesSource = (*netDev)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	interfaces := n.s.Interfaces()
+	buf.WriteString("Inter-|   Receive                                                |  Transmit\n")
+	buf.WriteString(" face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed\n")
+
+	for _, i := range interfaces {
+		// Implements the same format as
+		// net/core/net-procfs.c:dev_seq_printf_stats.
+		var stats inet.StatDev
+		if err := n.s.Statistics(&stats, i.Name); err != nil {
+			log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err)
+			continue
+		}
+		fmt.Fprintf(
+			buf,
+			"%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n",
+			i.Name,
+			// Received
+			stats[0], // bytes
+			stats[1], // packets
+			stats[2], // errors
+			stats[3], // dropped
+			stats[4], // fifo
+			stats[5], // frame
+			stats[6], // compressed
+			stats[7], // multicast
+			// Transmitted
+			stats[8],  // bytes
+			stats[9],  // packets
+			stats[10], // errors
+			stats[11], // dropped
+			stats[12], // fifo
+			stats[13], // frame
+			stats[14], // compressed
+			stats[15], // multicast
+		)
+	}
+
+	return nil
+}
+
+// netUnix implements vfs.DynamicBytesSource for /proc/net/unix.
+//
+// +stateify savable
+type netUnix struct {
+	k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*netUnix)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("Num       RefCount Protocol Flags    Type St Inode Path\n")
+	for _, se := range n.k.ListSockets() {
+		s := se.Sock.Get()
+		if s == nil {
+			log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
+			continue
+		}
+		sfile := s.(*fs.File)
+		if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
+			s.DecRef()
+			// Not a unix socket.
+			continue
+		}
+		sops := sfile.FileOperations.(*unix.SocketOperations)
+
+		addr, err := sops.Endpoint().GetLocalAddress()
+		if err != nil {
+			log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err)
+			addr.Addr = "<unknown>"
+		}
+
+		sockFlags := 0
+		if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok {
+			if ce.Listening() {
+				// For unix domain sockets, linux reports a single flag
+				// value if the socket is listening, of __SO_ACCEPTCON.
+				sockFlags = linux.SO_ACCEPTCON
+			}
+		}
+
+		// In the socket entry below, the value for the 'Num' field requires
+		// some consideration. Linux prints the address to the struct
+		// unix_sock representing a socket in the kernel, but may redact the
+		// value for unprivileged users depending on the kptr_restrict
+		// sysctl.
+		//
+		// One use for this field is to allow a privileged user to
+		// introspect into the kernel memory to determine information about
+		// a socket not available through procfs, such as the socket's peer.
+		//
+		// In gvisor, returning a pointer to our internal structures would
+		// be pointless, as it wouldn't match the memory layout for struct
+		// unix_sock, making introspection difficult. We could populate a
+		// struct unix_sock with the appropriate data, but even that
+		// requires consideration for which kernel version to emulate, as
+		// the definition of this struct changes over time.
+		//
+		// For now, we always redact this pointer.
+		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d",
+			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
+			sfile.ReadRefs()-1,            // RefCount, don't count our own ref.
+			0,                             // Protocol, always 0 for UDS.
+			sockFlags,                     // Flags.
+			sops.Endpoint().Type(),        // Type.
+			sops.State(),                  // State.
+			sfile.InodeID(),               // Inode.
+		)
+
+		// Path
+		if len(addr.Addr) != 0 {
+			if addr.Addr[0] == 0 {
+				// Abstract path.
+				fmt.Fprintf(buf, " @%s", string(addr.Addr[1:]))
+			} else {
+				fmt.Fprintf(buf, " %s", string(addr.Addr))
+			}
+		}
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef()
+	}
+	return nil
+}
+
+// netTCP implements vfs.DynamicBytesSource for /proc/net/tcp.
+//
+// +stateify savable
+type netTCP struct {
+	k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*netTCP)(nil)
+
+func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	t := kernel.TaskFromContext(ctx)
+	buf.WriteString("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode                                                     \n")
+	for _, se := range n.k.ListSockets() {
+		s := se.Sock.Get()
+		if s == nil {
+			log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock)
+			continue
+		}
+		sfile := s.(*fs.File)
+		sops, ok := sfile.FileOperations.(socket.Socket)
+		if !ok {
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+		}
+		if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) {
+			s.DecRef()
+			// Not tcp4 sockets.
+			continue
+		}
+
+		// Linux's documentation for the fields below can be found at
+		// https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt.
+		// For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock().
+		// Note that the header doesn't contain labels for all the fields.
+
+		// Field: sl; entry number.
+		fmt.Fprintf(buf, "%4d: ", se.ID)
+
+		portBuf := make([]byte, 2)
+
+		// Field: local_adddress.
+		var localAddr linux.SockAddrInet
+		if local, _, err := sops.GetSockName(t); err == nil {
+			localAddr = *local.(*linux.SockAddrInet)
+		}
+		binary.LittleEndian.PutUint16(portBuf, localAddr.Port)
+		fmt.Fprintf(buf, "%08X:%04X ",
+			binary.LittleEndian.Uint32(localAddr.Addr[:]),
+			portBuf)
+
+		// Field: rem_address.
+		var remoteAddr linux.SockAddrInet
+		if remote, _, err := sops.GetPeerName(t); err == nil {
+			remoteAddr = *remote.(*linux.SockAddrInet)
+		}
+		binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port)
+		fmt.Fprintf(buf, "%08X:%04X ",
+			binary.LittleEndian.Uint32(remoteAddr.Addr[:]),
+			portBuf)
+
+		// Field: state; socket state.
+		fmt.Fprintf(buf, "%02X ", sops.State())
+
+		// Field: tx_queue, rx_queue; number of packets in the transmit and
+		// receive queue. Unimplemented.
+		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
+
+		// Field: tr, tm->when; timer active state and number of jiffies
+		// until timer expires. Unimplemented.
+		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
+
+		// Field: retrnsmt; number of unrecovered RTO timeouts.
+		// Unimplemented.
+		fmt.Fprintf(buf, "%08X ", 0)
+
+		// Field: uid.
+		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
+		if err != nil {
+			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+			fmt.Fprintf(buf, "%5d ", 0)
+		} else {
+			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
+		}
+
+		// Field: timeout; number of unanswered 0-window probes.
+		// Unimplemented.
+		fmt.Fprintf(buf, "%8d ", 0)
+
+		// Field: inode.
+		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+
+		// Field: refcount. Don't count the ref we obtain while deferencing
+		// the weakref to this socket.
+		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+
+		// Field: Socket struct address. Redacted due to the same reason as
+		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
+		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
+
+		// Field: retransmit timeout. Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: predicted tick of soft clock (delayed ACK control data).
+		// Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: (ack.quick<<1)|ack.pingpong, Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: sending congestion window, Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: Slow start size threshold, -1 if threshold >= 0xFFFF.
+		// Unimplemented, report as large threshold.
+		fmt.Fprintf(buf, "%d", -1)
+
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef()
+	}
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
new file mode 100644
index 000000000..aabf2bf0c
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -0,0 +1,143 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// newSysDir returns the dentry corresponding to /proc/sys directory.
+func newSysDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry {
+	return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
+		"kernel": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
+			"hostname": newDentry(root, inoGen.NextIno(), 0444, &hostnameData{}),
+			"shmall":   newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMALL)),
+			"shmmax":   newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMAX)),
+			"shmmni":   newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMNI)),
+		}),
+		"vm": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
+			"mmap_min_addr":     newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{}),
+			"overcommit_memory": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0\n")),
+		}),
+		"net": newSysNetDir(root, inoGen),
+	})
+}
+
+// newSysNetDir returns the dentry corresponding to /proc/sys/net directory.
+func newSysNetDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry {
+	return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
+		"net": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
+			"ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
+				// Add tcp_sack.
+				// TODO(gvisor.dev/issue/1195): tcp_sack allows write(2)
+				// "tcp_sack": newTCPSackInode(ctx, msrc, s),
+
+				// The following files are simple stubs until they are implemented in
+				// netstack, most of these files are configuration related. We use the
+				// value closest to the actual netstack behavior or any empty file, all
+				// of these files will have mode 0444 (read-only for all users).
+				"ip_local_port_range":     newDentry(root, inoGen.NextIno(), 0444, newStaticFile("16000   65535")),
+				"ip_local_reserved_ports": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")),
+				"ipfrag_time":             newDentry(root, inoGen.NextIno(), 0444, newStaticFile("30")),
+				"ip_nonlocal_bind":        newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
+				"ip_no_pmtu_disc":         newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")),
+
+				// tcp_allowed_congestion_control tell the user what they are able to
+				// do as an unprivledged process so we leave it empty.
+				"tcp_allowed_congestion_control":   newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")),
+				"tcp_available_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")),
+				"tcp_congestion_control":           newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")),
+
+				// Many of the following stub files are features netstack doesn't
+				// support. The unsupported features return "0" to indicate they are
+				// disabled.
+				"tcp_base_mss":              newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1280")),
+				"tcp_dsack":                 newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
+				"tcp_early_retrans":         newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
+				"tcp_fack":                  newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
+				"tcp_fastopen":              newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
+				"tcp_fastopen_key":          newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")),
+				"tcp_invalid_ratelimit":     newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
+				"tcp_keepalive_intvl":       newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
+				"tcp_keepalive_probes":      newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
+				"tcp_keepalive_time":        newDentry(root, inoGen.NextIno(), 0444, newStaticFile("7200")),
+				"tcp_mtu_probing":           newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
+				"tcp_no_metrics_save":       newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")),
+				"tcp_probe_interval":        newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
+				"tcp_probe_threshold":       newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
+				"tcp_retries1":              newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")),
+				"tcp_retries2":              newDentry(root, inoGen.NextIno(), 0444, newStaticFile("15")),
+				"tcp_rfc1337":               newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")),
+				"tcp_slow_start_after_idle": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")),
+				"tcp_synack_retries":        newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")),
+				"tcp_syn_retries":           newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")),
+				"tcp_timestamps":            newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")),
+			}),
+			"core": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
+				"default_qdisc": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("pfifo_fast")),
+				"message_burst": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("10")),
+				"message_cost":  newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")),
+				"optmem_max":    newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
+				"rmem_default":  newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")),
+				"rmem_max":      newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")),
+				"somaxconn":     newDentry(root, inoGen.NextIno(), 0444, newStaticFile("128")),
+				"wmem_default":  newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")),
+				"wmem_max":      newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")),
+			}),
+		}),
+	})
+}
+
+// mmapMinAddrData implements vfs.DynamicBytesSource for
+// /proc/sys/vm/mmap_min_addr.
+//
+// +stateify savable
+type mmapMinAddrData struct {
+	kernfs.DynamicBytesFile
+
+	k *kernel.Kernel
+}
+
+var _ dynamicInode = (*mmapMinAddrData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress())
+	return nil
+}
+
+// hostnameData implements vfs.DynamicBytesSource for /proc/sys/kernel/hostname.
+//
+// +stateify savable
+type hostnameData struct {
+	kernfs.DynamicBytesFile
+}
+
+var _ dynamicInode = (*hostnameData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	utsns := kernel.UTSNamespaceFromContext(ctx)
+	buf.WriteString(utsns.HostName())
+	buf.WriteString("\n")
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
new file mode 100644
index 000000000..20a77a8ca
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
@@ -0,0 +1,78 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"reflect"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+)
+
+func newIPv6TestStack() *inet.TestStack {
+	s := inet.NewTestStack()
+	s.SupportsIPv6Flag = true
+	return s
+}
+
+func TestIfinet6NoAddresses(t *testing.T) {
+	n := &ifinet6{s: newIPv6TestStack()}
+	var buf bytes.Buffer
+	n.Generate(contexttest.Context(t), &buf)
+	if buf.Len() > 0 {
+		t.Errorf("n.Generate() generated = %v, want = %v", buf.Bytes(), []byte{})
+	}
+}
+
+func TestIfinet6(t *testing.T) {
+	s := newIPv6TestStack()
+	s.InterfacesMap[1] = inet.Interface{Name: "eth0"}
+	s.InterfaceAddrsMap[1] = []inet.InterfaceAddr{
+		{
+			Family:    linux.AF_INET6,
+			PrefixLen: 128,
+			Addr:      []byte("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"),
+		},
+	}
+	s.InterfacesMap[2] = inet.Interface{Name: "eth1"}
+	s.InterfaceAddrsMap[2] = []inet.InterfaceAddr{
+		{
+			Family:    linux.AF_INET6,
+			PrefixLen: 128,
+			Addr:      []byte("\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),
+		},
+	}
+	want := map[string]struct{}{
+		"000102030405060708090a0b0c0d0e0f 01 80 00 00     eth0\n": {},
+		"101112131415161718191a1b1c1d1e1f 02 80 00 00     eth1\n": {},
+	}
+
+	n := &ifinet6{s: s}
+	contents := n.contents()
+	if len(contents) != len(want) {
+		t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want))
+	}
+	got := map[string]struct{}{}
+	for _, l := range contents {
+		got[l] = struct{}{}
+	}
+
+	if !reflect.DeepEqual(got, want) {
+		t.Errorf("Got n.contents() = %v, want = %v", got, want)
+	}
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index ca8c87ec2..76eafe593 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -69,12 +69,15 @@ func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) {
 
 func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
 	wants := map[string]vfs.Dirent{
+		"cpuinfo":     {Type: linux.DT_REG},
 		"loadavg":     {Type: linux.DT_REG},
 		"meminfo":     {Type: linux.DT_REG},
 		"mounts":      {Type: linux.DT_LNK},
 		"self":        selfLink,
 		"stat":        {Type: linux.DT_REG},
+		"sys":         {Type: linux.DT_DIR},
 		"thread-self": threadSelfLink,
+		"uptime":      {Type: linux.DT_REG},
 		"version":     {Type: linux.DT_REG},
 	}
 	return checkFiles(gots, wants)
diff --git a/pkg/sentry/fsimpl/proc/version.go b/pkg/sentry/fsimpl/proc/version.go
deleted file mode 100644
index 367f2396b..000000000
--- a/pkg/sentry/fsimpl/proc/version.go
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-)
-
-// versionData implements vfs.DynamicBytesSource for /proc/version.
-//
-// +stateify savable
-type versionData struct {
-	kernfs.DynamicBytesFile
-
-	// k is the owning Kernel.
-	k *kernel.Kernel
-}
-
-var _ dynamicInode = (*versionData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	init := v.k.GlobalInit()
-	if init == nil {
-		// Attempted to read before the init Task is created. This can
-		// only occur during startup, which should never need to read
-		// this file.
-		panic("Attempted to read version before initial Task is available")
-	}
-
-	// /proc/version takes the form:
-	//
-	// "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST)
-	// (COMPILER_VERSION) VERSION"
-	//
-	// where:
-	// - SYSNAME, RELEASE, and VERSION are the same as returned by
-	// sys_utsname
-	// - COMPILE_USER is the user that build the kernel
-	// - COMPILE_HOST is the hostname of the machine on which the kernel
-	// was built
-	// - COMPILER_VERSION is the version reported by the building compiler
-	//
-	// Since we don't really want to expose build information to
-	// applications, those fields are omitted.
-	//
-	// FIXME(mpratt): Using Version from the init task SyscallTable
-	// disregards the different version a task may have (e.g., in a uts
-	// namespace).
-	ver := init.Leader().SyscallTable().Version
-	fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version)
-	return nil
-}
-- 
cgit v1.2.3


From 07f258497932e53f4651b80a086117ffda843fe3 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 16 Jan 2020 12:33:07 -0800
Subject: Plumb getting/setting xattrs through InodeOperations and 9p gofer
 interfaces.

There was a very bare get/setxattr in the InodeOperations interface. Add
context.Context to both, size to getxattr, and flags to setxattr.
Note that extended attributes are passed around as strings in this
implementation, so size is automatically encoded into the value. Size is
added in getxattr so that implementations can return ERANGE if a value is larger
than can fit in the user-allocated buffer. This prevents us from unnecessarily
passing around an arbitrarily large xattr when the user buffer is actually too
small.

Don't use the existing xattrwalk and xattrcreate messages and define our
own, mainly for the sake of simplicity.

Extended attributes will be implemented in future commits.

PiperOrigin-RevId: 290121300
---
 pkg/p9/client_file.go                      |  23 +++++
 pkg/p9/file.go                             |  16 ++++
 pkg/p9/handlers.go                         |  29 +++++++
 pkg/p9/messages.go                         | 129 +++++++++++++++++++++++++++++
 pkg/p9/messages_test.go                    |  15 ++++
 pkg/p9/p9.go                               |   4 +
 pkg/sentry/fs/copy_up.go                   |   9 +-
 pkg/sentry/fs/file_overlay.go              |   2 +-
 pkg/sentry/fs/fsutil/inode.go              |  41 +++++----
 pkg/sentry/fs/gofer/context_file.go        |  14 ++++
 pkg/sentry/fs/gofer/inode.go               |  18 +++-
 pkg/sentry/fs/inode.go                     |  24 +++---
 pkg/sentry/fs/inode_operations.go          |  25 ++++--
 pkg/sentry/fs/inode_overlay.go             |  30 +++----
 pkg/sentry/fs/inode_overlay_test.go        |   4 +-
 pkg/sentry/fs/tmpfs/tmpfs.go               |  18 ++--
 pkg/sentry/syscalls/linux/linux64_amd64.go |   4 +-
 pkg/sentry/syscalls/linux/linux64_arm64.go |   4 +-
 pkg/sentry/syscalls/linux/sys_xattr.go     |  48 ++++++-----
 runsc/fsgofer/fsgofer.go                   |  10 +++
 20 files changed, 374 insertions(+), 93 deletions(-)

diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go
index de9357389..04b584383 100644
--- a/pkg/p9/client_file.go
+++ b/pkg/p9/client_file.go
@@ -165,6 +165,29 @@ func (c *clientFile) SetAttr(valid SetAttrMask, attr SetAttr) error {
 	return c.client.sendRecv(&Tsetattr{FID: c.fid, Valid: valid, SetAttr: attr}, &Rsetattr{})
 }
 
+// GetXattr implements File.GetXattr.
+func (c *clientFile) GetXattr(name string, size uint64) (string, error) {
+	if atomic.LoadUint32(&c.closed) != 0 {
+		return "", syscall.EBADF
+	}
+
+	rgetxattr := Rgetxattr{}
+	if err := c.client.sendRecv(&Tgetxattr{FID: c.fid, Name: name, Size: size}, &rgetxattr); err != nil {
+		return "", err
+	}
+
+	return rgetxattr.Value, nil
+}
+
+// SetXattr implements File.SetXattr.
+func (c *clientFile) SetXattr(name, value string, flags uint32) error {
+	if atomic.LoadUint32(&c.closed) != 0 {
+		return syscall.EBADF
+	}
+
+	return c.client.sendRecv(&Tsetxattr{FID: c.fid, Name: name, Value: value, Flags: flags}, &Rsetxattr{})
+}
+
 // Allocate implements File.Allocate.
 func (c *clientFile) Allocate(mode AllocateMode, offset, length uint64) error {
 	if atomic.LoadUint32(&c.closed) != 0 {
diff --git a/pkg/p9/file.go b/pkg/p9/file.go
index 96d1f2a8e..4607cfcdf 100644
--- a/pkg/p9/file.go
+++ b/pkg/p9/file.go
@@ -89,6 +89,22 @@ type File interface {
 	// On the server, SetAttr has a write concurrency guarantee.
 	SetAttr(valid SetAttrMask, attr SetAttr) error
 
+	// GetXattr returns extended attributes of this node.
+	//
+	// Size indicates the size of the buffer that has been allocated to hold the
+	// attribute value. If the value is larger than size, implementations may
+	// return ERANGE to indicate that the buffer is too small, but they are also
+	// free to ignore the hint entirely (i.e. the value returned may be larger
+	// than size). All size checking is done independently at the syscall layer.
+	//
+	// TODO(b/127675828): Determine concurrency guarantees once implemented.
+	GetXattr(name string, size uint64) (string, error)
+
+	// SetXattr sets extended attributes on this node.
+	//
+	// TODO(b/127675828): Determine concurrency guarantees once implemented.
+	SetXattr(name, value string, flags uint32) error
+
 	// Allocate allows the caller to directly manipulate the allocated disk space
 	// for the file. See fallocate(2) for more details.
 	Allocate(mode AllocateMode, offset, length uint64) error
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index b9582c07f..7d6653a07 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -912,6 +912,35 @@ func (t *Txattrcreate) handle(cs *connState) message {
 	return newErr(syscall.ENOSYS)
 }
 
+// handle implements handler.handle.
+func (t *Tgetxattr) handle(cs *connState) message {
+	ref, ok := cs.LookupFID(t.FID)
+	if !ok {
+		return newErr(syscall.EBADF)
+	}
+	defer ref.DecRef()
+
+	val, err := ref.file.GetXattr(t.Name, t.Size)
+	if err != nil {
+		return newErr(err)
+	}
+	return &Rgetxattr{Value: val}
+}
+
+// handle implements handler.handle.
+func (t *Tsetxattr) handle(cs *connState) message {
+	ref, ok := cs.LookupFID(t.FID)
+	if !ok {
+		return newErr(syscall.EBADF)
+	}
+	defer ref.DecRef()
+
+	if err := ref.file.SetXattr(t.Name, t.Value, t.Flags); err != nil {
+		return newErr(err)
+	}
+	return &Rsetxattr{}
+}
+
 // handle implements handler.handle.
 func (t *Treaddir) handle(cs *connState) message {
 	ref, ok := cs.LookupFID(t.Directory)
diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go
index ffdd7e8c6..ceb723d86 100644
--- a/pkg/p9/messages.go
+++ b/pkg/p9/messages.go
@@ -1611,6 +1611,131 @@ func (r *Rxattrcreate) String() string {
 	return fmt.Sprintf("Rxattrcreate{}")
 }
 
+// Tgetxattr is a getxattr request.
+type Tgetxattr struct {
+	// FID refers to the file for which to get xattrs.
+	FID FID
+
+	// Name is the xattr to get.
+	Name string
+
+	// Size is the buffer size for the xattr to get.
+	Size uint64
+}
+
+// Decode implements encoder.Decode.
+func (t *Tgetxattr) Decode(b *buffer) {
+	t.FID = b.ReadFID()
+	t.Name = b.ReadString()
+	t.Size = b.Read64()
+}
+
+// Encode implements encoder.Encode.
+func (t *Tgetxattr) Encode(b *buffer) {
+	b.WriteFID(t.FID)
+	b.WriteString(t.Name)
+	b.Write64(t.Size)
+}
+
+// Type implements message.Type.
+func (*Tgetxattr) Type() MsgType {
+	return MsgTgetxattr
+}
+
+// String implements fmt.Stringer.
+func (t *Tgetxattr) String() string {
+	return fmt.Sprintf("Tgetxattr{FID: %d, Name: %s, Size: %d}", t.FID, t.Name, t.Size)
+}
+
+// Rgetxattr is a getxattr response.
+type Rgetxattr struct {
+	// Value is the extended attribute value.
+	Value string
+}
+
+// Decode implements encoder.Decode.
+func (r *Rgetxattr) Decode(b *buffer) {
+	r.Value = b.ReadString()
+}
+
+// Encode implements encoder.Encode.
+func (r *Rgetxattr) Encode(b *buffer) {
+	b.WriteString(r.Value)
+}
+
+// Type implements message.Type.
+func (*Rgetxattr) Type() MsgType {
+	return MsgRgetxattr
+}
+
+// String implements fmt.Stringer.
+func (r *Rgetxattr) String() string {
+	return fmt.Sprintf("Rgetxattr{Value: %s}", r.Value)
+}
+
+// Tsetxattr sets extended attributes.
+type Tsetxattr struct {
+	// FID refers to the file on which to set xattrs.
+	FID FID
+
+	// Name is the attribute name.
+	Name string
+
+	// Value is the attribute value.
+	Value string
+
+	// Linux setxattr(2) flags.
+	Flags uint32
+}
+
+// Decode implements encoder.Decode.
+func (t *Tsetxattr) Decode(b *buffer) {
+	t.FID = b.ReadFID()
+	t.Name = b.ReadString()
+	t.Value = b.ReadString()
+	t.Flags = b.Read32()
+}
+
+// Encode implements encoder.Encode.
+func (t *Tsetxattr) Encode(b *buffer) {
+	b.WriteFID(t.FID)
+	b.WriteString(t.Name)
+	b.WriteString(t.Value)
+	b.Write32(t.Flags)
+}
+
+// Type implements message.Type.
+func (*Tsetxattr) Type() MsgType {
+	return MsgTsetxattr
+}
+
+// String implements fmt.Stringer.
+func (t *Tsetxattr) String() string {
+	return fmt.Sprintf("Tsetxattr{FID: %d, Name: %s, Value: %s, Flags: %d}", t.FID, t.Name, t.Value, t.Flags)
+}
+
+// Rsetxattr is a setxattr response.
+type Rsetxattr struct {
+}
+
+// Decode implements encoder.Decode.
+func (r *Rsetxattr) Decode(b *buffer) {
+}
+
+// Encode implements encoder.Encode.
+func (r *Rsetxattr) Encode(b *buffer) {
+}
+
+// Type implements message.Type.
+func (*Rsetxattr) Type() MsgType {
+	return MsgRsetxattr
+}
+
+// String implements fmt.Stringer.
+func (r *Rsetxattr) String() string {
+	return fmt.Sprintf("Rsetxattr{}")
+}
+
 // Treaddir is a readdir request.
 type Treaddir struct {
 	// Directory is the directory FID to read.
@@ -2363,6 +2488,10 @@ func init() {
 	msgRegistry.register(MsgRxattrwalk, func() message { return &Rxattrwalk{} })
 	msgRegistry.register(MsgTxattrcreate, func() message { return &Txattrcreate{} })
 	msgRegistry.register(MsgRxattrcreate, func() message { return &Rxattrcreate{} })
+	msgRegistry.register(MsgTgetxattr, func() message { return &Tgetxattr{} })
+	msgRegistry.register(MsgRgetxattr, func() message { return &Rgetxattr{} })
+	msgRegistry.register(MsgTsetxattr, func() message { return &Tsetxattr{} })
+	msgRegistry.register(MsgRsetxattr, func() message { return &Rsetxattr{} })
 	msgRegistry.register(MsgTreaddir, func() message { return &Treaddir{} })
 	msgRegistry.register(MsgRreaddir, func() message { return &Rreaddir{} })
 	msgRegistry.register(MsgTfsync, func() message { return &Tfsync{} })
diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go
index 6ba6a1654..825c939da 100644
--- a/pkg/p9/messages_test.go
+++ b/pkg/p9/messages_test.go
@@ -194,6 +194,21 @@ func TestEncodeDecode(t *testing.T) {
 			Flags:    3,
 		},
 		&Rxattrcreate{},
+		&Tgetxattr{
+			FID:  1,
+			Name: "abc",
+			Size: 2,
+		},
+		&Rgetxattr{
+			Value: "xyz",
+		},
+		&Tsetxattr{
+			FID:   1,
+			Name:  "abc",
+			Value: "xyz",
+			Flags: 2,
+		},
+		&Rsetxattr{},
 		&Treaddir{
 			Directory: 1,
 			Offset:    2,
diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go
index d3090535a..5ab00d625 100644
--- a/pkg/p9/p9.go
+++ b/pkg/p9/p9.go
@@ -339,6 +339,10 @@ const (
 	MsgRxattrwalk           = 31
 	MsgTxattrcreate         = 32
 	MsgRxattrcreate         = 33
+	MsgTgetxattr            = 34
+	MsgRgetxattr            = 35
+	MsgTsetxattr            = 36
+	MsgRsetxattr            = 37
 	MsgTreaddir             = 40
 	MsgRreaddir             = 41
 	MsgTfsync               = 50
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index 734177e90..e03e3e417 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -18,6 +18,7 @@ import (
 	"fmt"
 	"io"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
@@ -395,12 +396,12 @@ func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size in
 // Size and permissions are set on upper when the file content is copied
 // and when the file is created respectively.
 func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error {
-	// Extract attributes fro the lower filesystem.
+	// Extract attributes from the lower filesystem.
 	lowerAttr, err := lower.UnstableAttr(ctx)
 	if err != nil {
 		return err
 	}
-	lowerXattr, err := lower.Listxattr()
+	lowerXattr, err := lower.ListXattr(ctx)
 	if err != nil && err != syserror.EOPNOTSUPP {
 		return err
 	}
@@ -421,11 +422,11 @@ func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error
 		if isXattrOverlay(name) {
 			continue
 		}
-		value, err := lower.Getxattr(name)
+		value, err := lower.GetXattr(ctx, name, linux.XATTR_SIZE_MAX)
 		if err != nil {
 			return err
 		}
-		if err := upper.InodeOperations.Setxattr(upper, name, value); err != nil {
+		if err := upper.InodeOperations.SetXattr(ctx, upper, name, value, 0 /* flags */); err != nil {
 			return err
 		}
 	}
diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go
index 8a633b1ba..8991207b4 100644
--- a/pkg/sentry/fs/file_overlay.go
+++ b/pkg/sentry/fs/file_overlay.go
@@ -475,7 +475,7 @@ func readdirEntries(ctx context.Context, o *overlayEntry) (*SortedDentryMap, err
 			// Skip this name if it is a negative entry in the
 			// upper or there exists a whiteout for it.
 			if o.upper != nil {
-				if overlayHasWhiteout(o.upper, name) {
+				if overlayHasWhiteout(ctx, o.upper, name) {
 					continue
 				}
 			}
diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go
index adf5ec69c..df7b74855 100644
--- a/pkg/sentry/fs/fsutil/inode.go
+++ b/pkg/sentry/fs/fsutil/inode.go
@@ -15,6 +15,7 @@
 package fsutil
 
 import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -202,7 +203,7 @@ func (i *InodeSimpleAttributes) NotifyModificationAndStatusChange(ctx context.Co
 }
 
 // InodeSimpleExtendedAttributes implements
-// fs.InodeOperations.{Get,Set,List}xattr.
+// fs.InodeOperations.{Get,Set,List}Xattr.
 //
 // +stateify savable
 type InodeSimpleExtendedAttributes struct {
@@ -211,8 +212,8 @@ type InodeSimpleExtendedAttributes struct {
 	xattrs map[string]string
 }
 
-// Getxattr implements fs.InodeOperations.Getxattr.
-func (i *InodeSimpleExtendedAttributes) Getxattr(_ *fs.Inode, name string) (string, error) {
+// GetXattr implements fs.InodeOperations.GetXattr.
+func (i *InodeSimpleExtendedAttributes) GetXattr(_ context.Context, _ *fs.Inode, name string, _ uint64) (string, error) {
 	i.mu.RLock()
 	value, ok := i.xattrs[name]
 	i.mu.RUnlock()
@@ -222,19 +223,31 @@ func (i *InodeSimpleExtendedAttributes) Getxattr(_ *fs.Inode, name string) (stri
 	return value, nil
 }
 
-// Setxattr implements fs.InodeOperations.Setxattr.
-func (i *InodeSimpleExtendedAttributes) Setxattr(_ *fs.Inode, name, value string) error {
+// SetXattr implements fs.InodeOperations.SetXattr.
+func (i *InodeSimpleExtendedAttributes) SetXattr(_ context.Context, _ *fs.Inode, name, value string, flags uint32) error {
 	i.mu.Lock()
+	defer i.mu.Unlock()
 	if i.xattrs == nil {
+		if flags&linux.XATTR_REPLACE != 0 {
+			return syserror.ENODATA
+		}
 		i.xattrs = make(map[string]string)
 	}
+
+	_, ok := i.xattrs[name]
+	if ok && flags&linux.XATTR_CREATE != 0 {
+		return syserror.EEXIST
+	}
+	if !ok && flags&linux.XATTR_REPLACE != 0 {
+		return syserror.ENODATA
+	}
+
 	i.xattrs[name] = value
-	i.mu.Unlock()
 	return nil
 }
 
-// Listxattr implements fs.InodeOperations.Listxattr.
-func (i *InodeSimpleExtendedAttributes) Listxattr(_ *fs.Inode) (map[string]struct{}, error) {
+// ListXattr implements fs.InodeOperations.ListXattr.
+func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
 	i.mu.RLock()
 	names := make(map[string]struct{}, len(i.xattrs))
 	for name := range i.xattrs {
@@ -436,18 +449,18 @@ func (InodeNotSymlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) {
 // extended attributes.
 type InodeNoExtendedAttributes struct{}
 
-// Getxattr implements fs.InodeOperations.Getxattr.
-func (InodeNoExtendedAttributes) Getxattr(*fs.Inode, string) (string, error) {
+// GetXattr implements fs.InodeOperations.GetXattr.
+func (InodeNoExtendedAttributes) GetXattr(context.Context, *fs.Inode, string, uint64) (string, error) {
 	return "", syserror.EOPNOTSUPP
 }
 
-// Setxattr implements fs.InodeOperations.Setxattr.
-func (InodeNoExtendedAttributes) Setxattr(*fs.Inode, string, string) error {
+// SetXattr implements fs.InodeOperations.SetXattr.
+func (InodeNoExtendedAttributes) SetXattr(context.Context, *fs.Inode, string, string, uint32) error {
 	return syserror.EOPNOTSUPP
 }
 
-// Listxattr implements fs.InodeOperations.Listxattr.
-func (InodeNoExtendedAttributes) Listxattr(*fs.Inode) (map[string]struct{}, error) {
+// ListXattr implements fs.InodeOperations.ListXattr.
+func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
 	return nil, syserror.EOPNOTSUPP
 }
 
diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go
index 44b72582a..2125dafef 100644
--- a/pkg/sentry/fs/gofer/context_file.go
+++ b/pkg/sentry/fs/gofer/context_file.go
@@ -59,6 +59,20 @@ func (c *contextFile) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9
 	return err
 }
 
+func (c *contextFile) getXattr(ctx context.Context, name string, size uint64) (string, error) {
+	ctx.UninterruptibleSleepStart(false)
+	val, err := c.file.GetXattr(name, size)
+	ctx.UninterruptibleSleepFinish(false)
+	return val, err
+}
+
+func (c *contextFile) setXattr(ctx context.Context, name, value string, flags uint32) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := c.file.SetXattr(name, value, flags)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
 func (c *contextFile) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
 	ctx.UninterruptibleSleepStart(false)
 	err := c.file.Allocate(mode, offset, length)
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index 245fe2ef1..98d1a8a48 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -38,8 +38,7 @@ import (
 //
 // +stateify savable
 type inodeOperations struct {
-	fsutil.InodeNotVirtual           `state:"nosave"`
-	fsutil.InodeNoExtendedAttributes `state:"nosave"`
+	fsutil.InodeNotVirtual `state:"nosave"`
 
 	// fileState implements fs.CachedFileObject. It exists
 	// to break a circular load dependency between inodeOperations
@@ -604,6 +603,21 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length
 	return i.fileState.file.setAttr(ctx, p9.SetAttrMask{Size: true}, p9.SetAttr{Size: uint64(length)})
 }
 
+// GetXattr implements fs.InodeOperations.GetXattr.
+func (i *inodeOperations) GetXattr(ctx context.Context, inode *fs.Inode, name string, size uint64) (string, error) {
+	return i.fileState.file.getXattr(ctx, name, size)
+}
+
+// SetXattr implements fs.InodeOperations.SetXattr.
+func (i *inodeOperations) SetXattr(ctx context.Context, inode *fs.Inode, name string, value string, flags uint32) error {
+	return i.fileState.file.setXattr(ctx, name, value, flags)
+}
+
+// ListXattr implements fs.InodeOperations.ListXattr.
+func (i *inodeOperations) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
+	return nil, syscall.EOPNOTSUPP
+}
+
 // Allocate implements fs.InodeOperations.Allocate.
 func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset, length int64) error {
 	// This can only be called for files anyway.
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index 468043df0..ee9d301ef 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -261,28 +261,28 @@ func (i *Inode) UnstableAttr(ctx context.Context) (UnstableAttr, error) {
 	return i.InodeOperations.UnstableAttr(ctx, i)
 }
 
-// Getxattr calls i.InodeOperations.Getxattr with i as the Inode.
-func (i *Inode) Getxattr(name string) (string, error) {
+// GetXattr calls i.InodeOperations.GetXattr with i as the Inode.
+func (i *Inode) GetXattr(ctx context.Context, name string, size uint64) (string, error) {
 	if i.overlay != nil {
-		return overlayGetxattr(i.overlay, name)
+		return overlayGetXattr(ctx, i.overlay, name, size)
 	}
-	return i.InodeOperations.Getxattr(i, name)
+	return i.InodeOperations.GetXattr(ctx, i, name, size)
 }
 
-// Setxattr calls i.InodeOperations.Setxattr with i as the Inode.
-func (i *Inode) Setxattr(name, value string) error {
+// SetXattr calls i.InodeOperations.SetXattr with i as the Inode.
+func (i *Inode) SetXattr(ctx context.Context, name, value string, flags uint32) error {
 	if i.overlay != nil {
-		return overlaySetxattr(i.overlay, name, value)
+		return overlaySetxattr(ctx, i.overlay, name, value, flags)
 	}
-	return i.InodeOperations.Setxattr(i, name, value)
+	return i.InodeOperations.SetXattr(ctx, i, name, value, flags)
 }
 
-// Listxattr calls i.InodeOperations.Listxattr with i as the Inode.
-func (i *Inode) Listxattr() (map[string]struct{}, error) {
+// ListXattr calls i.InodeOperations.ListXattr with i as the Inode.
+func (i *Inode) ListXattr(ctx context.Context) (map[string]struct{}, error) {
 	if i.overlay != nil {
-		return overlayListxattr(i.overlay)
+		return overlayListXattr(ctx, i.overlay)
 	}
-	return i.InodeOperations.Listxattr(i)
+	return i.InodeOperations.ListXattr(ctx, i)
 }
 
 // CheckPermission will check if the caller may access this file in the
diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go
index 5cde9d215..13261cb81 100644
--- a/pkg/sentry/fs/inode_operations.go
+++ b/pkg/sentry/fs/inode_operations.go
@@ -170,20 +170,27 @@ type InodeOperations interface {
 	// file system events.
 	UnstableAttr(ctx context.Context, inode *Inode) (UnstableAttr, error)
 
-	// Getxattr retrieves the value of extended attribute name. Inodes that
-	// do not support extended attributes return EOPNOTSUPP. Inodes that
-	// support extended attributes but don't have a value at name return
+	// GetXattr retrieves the value of extended attribute specified by name.
+	// Inodes that do not support extended attributes return EOPNOTSUPP. Inodes
+	// that support extended attributes but don't have a value at name return
 	// ENODATA.
-	Getxattr(inode *Inode, name string) (string, error)
+	//
+	// If this is called through the getxattr(2) syscall, size indicates the
+	// size of the buffer that the application has allocated to hold the
+	// attribute value. If the value is larger than size, implementations may
+	// return ERANGE to indicate that the buffer is too small, but they are also
+	// free to ignore the hint entirely (i.e. the value returned may be larger
+	// than size). All size checking is done independently at the syscall layer.
+	GetXattr(ctx context.Context, inode *Inode, name string, size uint64) (string, error)
 
-	// Setxattr sets the value of extended attribute name. Inodes that
-	// do not support extended attributes return EOPNOTSUPP.
-	Setxattr(inode *Inode, name, value string) error
+	// SetXattr sets the value of extended attribute specified by name. Inodes
+	// that do not support extended attributes return EOPNOTSUPP.
+	SetXattr(ctx context.Context, inode *Inode, name, value string, flags uint32) error
 
-	// Listxattr returns the set of all extended attributes names that
+	// ListXattr returns the set of all extended attributes names that
 	// have values. Inodes that do not support extended attributes return
 	// EOPNOTSUPP.
-	Listxattr(inode *Inode) (map[string]struct{}, error)
+	ListXattr(ctx context.Context, inode *Inode) (map[string]struct{}, error)
 
 	// Check determines whether an Inode can be accessed with the
 	// requested permission mask using the context (which gives access
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index 13d11e001..b90da20d0 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -25,13 +25,13 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-func overlayHasWhiteout(parent *Inode, name string) bool {
-	s, err := parent.Getxattr(XattrOverlayWhiteout(name))
+func overlayHasWhiteout(ctx context.Context, parent *Inode, name string) bool {
+	s, err := parent.GetXattr(ctx, XattrOverlayWhiteout(name), 1)
 	return err == nil && s == "y"
 }
 
-func overlayCreateWhiteout(parent *Inode, name string) error {
-	return parent.InodeOperations.Setxattr(parent, XattrOverlayWhiteout(name), "y")
+func overlayCreateWhiteout(ctx context.Context, parent *Inode, name string) error {
+	return parent.InodeOperations.SetXattr(ctx, parent, XattrOverlayWhiteout(name), "y", 0 /* flags */)
 }
 
 func overlayWriteOut(ctx context.Context, o *overlayEntry) error {
@@ -89,7 +89,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name
 		}
 
 		// Are we done?
-		if overlayHasWhiteout(parent.upper, name) {
+		if overlayHasWhiteout(ctx, parent.upper, name) {
 			if upperInode == nil {
 				parent.copyMu.RUnlock()
 				if negativeUpperChild {
@@ -345,7 +345,7 @@ func overlayRemove(ctx context.Context, o *overlayEntry, parent *Dirent, child *
 		}
 	}
 	if child.Inode.overlay.lowerExists {
-		if err := overlayCreateWhiteout(o.upper, child.name); err != nil {
+		if err := overlayCreateWhiteout(ctx, o.upper, child.name); err != nil {
 			return err
 		}
 	}
@@ -426,7 +426,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena
 		return err
 	}
 	if renamed.Inode.overlay.lowerExists {
-		if err := overlayCreateWhiteout(oldParent.Inode.overlay.upper, oldName); err != nil {
+		if err := overlayCreateWhiteout(ctx, oldParent.Inode.overlay.upper, oldName); err != nil {
 			return err
 		}
 	}
@@ -528,7 +528,7 @@ func overlayUnstableAttr(ctx context.Context, o *overlayEntry) (UnstableAttr, er
 	return attr, err
 }
 
-func overlayGetxattr(o *overlayEntry, name string) (string, error) {
+func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uint64) (string, error) {
 	// Hot path. This is how the overlay checks for whiteout files.
 	// Avoid defers.
 	var (
@@ -544,31 +544,31 @@ func overlayGetxattr(o *overlayEntry, name string) (string, error) {
 
 	o.copyMu.RLock()
 	if o.upper != nil {
-		s, err = o.upper.Getxattr(name)
+		s, err = o.upper.GetXattr(ctx, name, size)
 	} else {
-		s, err = o.lower.Getxattr(name)
+		s, err = o.lower.GetXattr(ctx, name, size)
 	}
 	o.copyMu.RUnlock()
 	return s, err
 }
 
 // TODO(b/146028302): Support setxattr for overlayfs.
-func overlaySetxattr(o *overlayEntry, name, value string) error {
+func overlaySetxattr(ctx context.Context, o *overlayEntry, name, value string, flags uint32) error {
 	return syserror.EOPNOTSUPP
 }
 
-func overlayListxattr(o *overlayEntry) (map[string]struct{}, error) {
+func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}, error) {
 	o.copyMu.RLock()
 	defer o.copyMu.RUnlock()
 	var names map[string]struct{}
 	var err error
 	if o.upper != nil {
-		names, err = o.upper.Listxattr()
+		names, err = o.upper.ListXattr(ctx)
 	} else {
-		names, err = o.lower.Listxattr()
+		names, err = o.lower.ListXattr(ctx)
 	}
 	for name := range names {
-		// Same as overlayGetxattr, we shouldn't forward along
+		// Same as overlayGetXattr, we shouldn't forward along
 		// overlay attributes.
 		if strings.HasPrefix(XattrOverlayPrefix, name) {
 			delete(names, name)
diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go
index 8935aad65..493d98c36 100644
--- a/pkg/sentry/fs/inode_overlay_test.go
+++ b/pkg/sentry/fs/inode_overlay_test.go
@@ -382,8 +382,8 @@ type dir struct {
 	ReaddirCalled bool
 }
 
-// Getxattr implements InodeOperations.Getxattr.
-func (d *dir) Getxattr(inode *fs.Inode, name string) (string, error) {
+// GetXattr implements InodeOperations.GetXattr.
+func (d *dir) GetXattr(_ context.Context, _ *fs.Inode, name string, _ uint64) (string, error) {
 	for _, n := range d.negative {
 		if name == fs.XattrOverlayWhiteout(n) {
 			return "y", nil
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index 69089c8a8..0f718e236 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -148,19 +148,19 @@ func (d *Dir) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perms
 	return d.ramfsDir.CreateFifo(ctx, dir, name, perms)
 }
 
-// Getxattr implements fs.InodeOperations.Getxattr.
-func (d *Dir) Getxattr(i *fs.Inode, name string) (string, error) {
-	return d.ramfsDir.Getxattr(i, name)
+// GetXattr implements fs.InodeOperations.GetXattr.
+func (d *Dir) GetXattr(ctx context.Context, i *fs.Inode, name string, size uint64) (string, error) {
+	return d.ramfsDir.GetXattr(ctx, i, name, size)
 }
 
-// Setxattr implements fs.InodeOperations.Setxattr.
-func (d *Dir) Setxattr(i *fs.Inode, name, value string) error {
-	return d.ramfsDir.Setxattr(i, name, value)
+// SetXattr implements fs.InodeOperations.SetXattr.
+func (d *Dir) SetXattr(ctx context.Context, i *fs.Inode, name, value string, flags uint32) error {
+	return d.ramfsDir.SetXattr(ctx, i, name, value, flags)
 }
 
-// Listxattr implements fs.InodeOperations.Listxattr.
-func (d *Dir) Listxattr(i *fs.Inode) (map[string]struct{}, error) {
-	return d.ramfsDir.Listxattr(i)
+// ListXattr implements fs.InodeOperations.ListXattr.
+func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode) (map[string]struct{}, error) {
+	return d.ramfsDir.ListXattr(ctx, i)
 }
 
 // Lookup implements fs.InodeOperations.Lookup.
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 479c5f6ff..6b2920900 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -228,10 +228,10 @@ var AMD64 = &kernel.SyscallTable{
 		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
-		188: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil),
+		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
 		189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		191: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil),
+		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
 		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index d3f61f5e8..8c1b20911 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -41,10 +41,10 @@ var ARM64 = &kernel.SyscallTable{
 		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		5:   syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil),
+		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
 		6:   syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		7:   syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		8:   syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil),
+		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
 		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 97d9a65ea..816352218 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -25,12 +25,12 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// Getxattr implements linux syscall getxattr(2).
-func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// GetXattr implements linux syscall getxattr(2).
+func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	pathAddr := args[0].Pointer()
 	nameAddr := args[1].Pointer()
 	valueAddr := args[2].Pointer()
-	size := args[3].SizeT()
+	size := uint64(args[3].SizeT())
 
 	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
 	if err != nil {
@@ -39,22 +39,28 @@ func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 
 	valueLen := 0
 	err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
-		value, err := getxattr(t, d, dirPath, nameAddr)
+		// If getxattr(2) is called with size 0, the size of the value will be
+		// returned successfully even if it is nonzero. In that case, we need to
+		// retrieve the entire attribute value so we can return the correct size.
+		requestedSize := size
+		if size == 0 || size > linux.XATTR_SIZE_MAX {
+			requestedSize = linux.XATTR_SIZE_MAX
+		}
+
+		value, err := getXattr(t, d, dirPath, nameAddr, uint64(requestedSize))
 		if err != nil {
 			return err
 		}
 
 		valueLen = len(value)
-		if size == 0 {
-			return nil
-		}
-		if size > linux.XATTR_SIZE_MAX {
-			size = linux.XATTR_SIZE_MAX
-		}
-		if valueLen > int(size) {
+		if uint64(valueLen) > requestedSize {
 			return syserror.ERANGE
 		}
 
+		// Skip copying out the attribute value if size is 0.
+		if size == 0 {
+			return nil
+		}
 		_, err = t.CopyOutBytes(valueAddr, []byte(value))
 		return err
 	})
@@ -64,8 +70,8 @@ func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	return uintptr(valueLen), nil, nil
 }
 
-// getxattr implements getxattr from the given *fs.Dirent.
-func getxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr) (string, error) {
+// getXattr implements getxattr(2) from the given *fs.Dirent.
+func getXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr, size uint64) (string, error) {
 	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
 		return "", syserror.ENOTDIR
 	}
@@ -83,15 +89,15 @@ func getxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr)
 		return "", syserror.EOPNOTSUPP
 	}
 
-	return d.Inode.Getxattr(name)
+	return d.Inode.GetXattr(t, name, size)
 }
 
-// Setxattr implements linux syscall setxattr(2).
-func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// SetXattr implements linux syscall setxattr(2).
+func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	pathAddr := args[0].Pointer()
 	nameAddr := args[1].Pointer()
 	valueAddr := args[2].Pointer()
-	size := args[3].SizeT()
+	size := uint64(args[3].SizeT())
 	flags := args[4].Uint()
 
 	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
@@ -104,12 +110,12 @@ func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 
 	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
-		return setxattr(t, d, dirPath, nameAddr, valueAddr, size, flags)
+		return setXattr(t, d, dirPath, nameAddr, valueAddr, uint64(size), flags)
 	})
 }
 
-// setxattr implements setxattr from the given *fs.Dirent.
-func setxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint, flags uint32) error {
+// setXattr implements setxattr(2) from the given *fs.Dirent.
+func setXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error {
 	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
 		return syserror.ENOTDIR
 	}
@@ -136,7 +142,7 @@ func setxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr us
 		return syserror.EOPNOTSUPP
 	}
 
-	return d.Inode.Setxattr(name, value)
+	return d.Inode.SetXattr(t, name, value, flags)
 }
 
 func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 93606d051..4d84ad999 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -767,6 +767,16 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 	return err
 }
 
+// TODO(b/127675828): support getxattr.
+func (l *localFile) GetXattr(name string, size uint64) (string, error) {
+	return "", syscall.EOPNOTSUPP
+}
+
+// TODO(b/127675828): support setxattr.
+func (l *localFile) SetXattr(name, value string, flags uint32) error {
+	return syscall.EOPNOTSUPP
+}
+
 // Allocate implements p9.File.
 func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error {
 	if !l.isOpen() {
-- 
cgit v1.2.3


From c50efc8c700fa2628f1415daeeb3b382009eb1bb Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 16 Jan 2020 12:48:05 -0800
Subject: Disable xattr tests.

These can remain disabled until we actually support extended attributes.

The following modifications were also made:
1. Disable save/restore on tests that change file permissions. Restore will not
work properly for these tests, since it will try to open the file with
read-write after it has been read- or write-only.
2. Change user.abc to user.test.

PiperOrigin-RevId: 290123941
---
 test/syscalls/BUILD          |   5 --
 test/syscalls/linux/xattr.cc | 152 ++++++++++++-------------------------------
 2 files changed, 42 insertions(+), 115 deletions(-)

diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index a3a85917d..829693e8e 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -717,11 +717,6 @@ syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test")
 
 syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
 
-syscall_test(
-    add_overlay = True,
-    test = "//test/syscalls/linux:xattr_test",
-)
-
 go_binary(
     name = "syscall_test_runner",
     testonly = 1,
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index 75740238c..b3bc3463e 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -59,7 +59,8 @@ TEST_F(XattrTest, XattrLargeName) {
   std::string name = "user.";
   name += std::string(XATTR_NAME_MAX - name.length(), 'a');
 
-  // TODO(b/127675828): Support setxattr and getxattr.
+  // An xattr should be whitelisted before it can be accessed--do not allow
+  // arbitrary xattrs to be read/written in gVisor.
   if (!IsRunningOnGvisor()) {
     EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
                 SyscallSucceeds());
@@ -83,59 +84,53 @@ TEST_F(XattrTest, XattrInvalidPrefix) {
               SyscallFailsWithErrno(EOPNOTSUPP));
 }
 
-TEST_F(XattrTest, XattrReadOnly) {
+// Do not allow save/restore cycles after making the test file read-only, as
+// the restore will fail to open it with r/w permissions.
+TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
 
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   size_t size = sizeof(val);
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
-                SyscallSucceeds());
-  }
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
 
+  DisableSave ds;
   ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IRUSR));
 
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
               SyscallFailsWithErrno(EACCES));
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    char buf = '-';
-    EXPECT_THAT(getxattr(path, name, &buf, size),
-                SyscallSucceedsWithValue(size));
-    EXPECT_EQ(buf, val);
-  }
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
 }
 
-TEST_F(XattrTest, XattrWriteOnly) {
+// Do not allow save/restore cycles after making the test file write-only, as
+// the restore will fail to open it with r/w permissions.
+TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
 
+  DisableSave ds;
   ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IWUSR));
 
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   size_t size = sizeof(val);
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
-                SyscallSucceeds());
-  }
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
 
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
 }
 
 TEST_F(XattrTest, XattrTrustedWithNonadmin) {
-  // TODO(b/127675828): Support setxattr and getxattr.
+  // TODO(b/127675828): Support setxattr and getxattr with "trusted" prefix.
   SKIP_IF(IsRunningOnGvisor());
   SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
 
@@ -147,11 +142,8 @@ TEST_F(XattrTest, XattrTrustedWithNonadmin) {
 }
 
 TEST_F(XattrTest, XattrOnDirectory) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
               SyscallSucceeds());
   EXPECT_THAT(getxattr(dir.path().c_str(), name, NULL, 0),
@@ -159,13 +151,10 @@ TEST_F(XattrTest, XattrOnDirectory) {
 }
 
 TEST_F(XattrTest, XattrOnSymlink) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(link.path().c_str(), name, NULL, 0, /*flags=*/0),
               SyscallSucceeds());
   EXPECT_THAT(getxattr(link.path().c_str(), name, NULL, 0),
@@ -173,7 +162,7 @@ TEST_F(XattrTest, XattrOnSymlink) {
 }
 
 TEST_F(XattrTest, XattrOnInvalidFileTypes) {
-  char name[] = "user.abc";
+  const char name[] = "user.test";
 
   char char_device[] = "/dev/zero";
   EXPECT_THAT(setxattr(char_device, name, NULL, 0, /*flags=*/0),
@@ -191,11 +180,8 @@ TEST_F(XattrTest, XattrOnInvalidFileTypes) {
 }
 
 TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
   size_t size = 1;
   EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
@@ -209,11 +195,8 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, SetxattrZeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds());
 
@@ -225,7 +208,7 @@ TEST_F(XattrTest, SetxattrZeroSize) {
 
 TEST_F(XattrTest, SetxattrSizeTooLarge) {
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
 
   // Note that each particular fs implementation may stipulate a lower size
   // limit, in which case we actually may fail (e.g. error with ENOSPC) for
@@ -235,43 +218,29 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
   EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
               SyscallFailsWithErrno(E2BIG));
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    EXPECT_THAT(getxattr(path, name, nullptr, 0),
-                SyscallFailsWithErrno(ENODATA));
-  }
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
               SyscallFailsWithErrno(EFAULT));
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    EXPECT_THAT(getxattr(path, name, nullptr, 0),
-                SyscallFailsWithErrno(ENODATA));
-  }
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
 
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
 }
 
 TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val(XATTR_SIZE_MAX + 1);
   std::fill(val.begin(), val.end(), 'a');
   size_t size = 1;
@@ -286,11 +255,8 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
   EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0),
               SyscallSucceeds());
@@ -304,11 +270,8 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithLarger) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
   EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0),
               SyscallSucceeds());
@@ -321,11 +284,8 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) {
 }
 
 TEST_F(XattrTest, SetxattrCreateFlag) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
               SyscallSucceeds());
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
@@ -335,11 +295,8 @@ TEST_F(XattrTest, SetxattrCreateFlag) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceFlag) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
               SyscallFailsWithErrno(ENODATA));
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -357,11 +314,8 @@ TEST_F(XattrTest, SetxattrInvalidFlags) {
 }
 
 TEST_F(XattrTest, Getxattr) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   int val = 1234;
   size_t size = sizeof(val);
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
@@ -372,11 +326,8 @@ TEST_F(XattrTest, Getxattr) {
 }
 
 TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
   size_t size = val.size();
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
@@ -387,11 +338,8 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   EXPECT_THAT(setxattr(path, name, &val, 1, /*flags=*/0), SyscallSucceeds());
 
@@ -405,11 +353,8 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrZeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
               SyscallSucceeds());
@@ -421,11 +366,8 @@ TEST_F(XattrTest, GetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrSizeTooLarge) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
               SyscallSucceeds());
@@ -440,11 +382,8 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, GetxattrNullValue) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   size_t size = sizeof(val);
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
@@ -454,11 +393,8 @@ TEST_F(XattrTest, GetxattrNullValue) {
 }
 
 TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   size_t size = sizeof(val);
   // Set value with zero size.
@@ -473,13 +409,9 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrNonexistentName) {
-  // TODO(b/127675828): Support getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  std::string name = "user.nonexistent";
-  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
-              SyscallFailsWithErrno(ENODATA));
+  const char name[] = "user.test";
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 }  // namespace
-- 
cgit v1.2.3


From 94be30a18dc7c75dc70716ce1ede74a7fb1352fb Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Thu, 16 Jan 2020 13:00:58 -0800
Subject: Add run-gcp command.

Add command to run benchmarks on GCP backed machines
using the gcloud producer.

Run with:
`bazel run :benchmarks -- run-gcp [BENCHMARK_NAME]`

Tested with the startup benchmark.

PiperOrigin-RevId: 290126444
---
 benchmarks/harness/__init__.py                     |  7 +++
 benchmarks/harness/machine.py                      |  3 +
 .../harness/machine_producers/gcloud_producer.py   | 70 ++++++++++++++--------
 benchmarks/harness/ssh_connection.py               |  9 +--
 benchmarks/runner/__init__.py                      | 60 +++++++++++++++++++
 benchmarks/runner/commands.py                      | 51 ++++++++++++++++
 6 files changed, 168 insertions(+), 32 deletions(-)

diff --git a/benchmarks/harness/__init__.py b/benchmarks/harness/__init__.py
index 7b96d1666..61fd25f73 100644
--- a/benchmarks/harness/__init__.py
+++ b/benchmarks/harness/__init__.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Core benchmark utilities."""
 
+import getpass
 import os
 
 # LOCAL_WORKLOADS_PATH defines the path to use for local workloads. This is a
@@ -23,3 +24,9 @@ LOCAL_WORKLOADS_PATH = os.path.join(
 # REMOTE_WORKLOADS_PATH defines the path to use for storing the workloads on the
 # remote host. This is a format string that accepts a single string parameter.
 REMOTE_WORKLOADS_PATH = "workloads/{}"
+
+# DEFAULT_USER is the default user running this script.
+DEFAULT_USER = getpass.getuser()
+
+# DEFAULT_USER_HOME is the home directory of the user running the script.
+DEFAULT_USER_HOME = os.environ["HOME"] if "HOME" in os.environ else ""
diff --git a/benchmarks/harness/machine.py b/benchmarks/harness/machine.py
index af037dbcc..2df4c9e31 100644
--- a/benchmarks/harness/machine.py
+++ b/benchmarks/harness/machine.py
@@ -214,6 +214,9 @@ class RemoteMachine(Machine):
     # Push to the remote machine and build.
     logging.info("Building %s@%s remotely...", workload, self._name)
     remote_path = self._ssh_connection.send_workload(workload)
+    # Workloads are all tarballs.
+    self.run("tar -xvf {remote_path}/tar.tar -C {remote_path}".format(
+        remote_path=remote_path))
     self.run("docker build --tag={} {}".format(workload, remote_path))
     return workload  # Workload is the tag.
 
diff --git a/benchmarks/harness/machine_producers/gcloud_producer.py b/benchmarks/harness/machine_producers/gcloud_producer.py
index 4693dd8a2..e0b77d52b 100644
--- a/benchmarks/harness/machine_producers/gcloud_producer.py
+++ b/benchmarks/harness/machine_producers/gcloud_producer.py
@@ -29,7 +29,6 @@ collisions with user instances shouldn't happen.
   producer.release_machines(NUM_MACHINES)
 """
 import datetime
-import getpass
 import json
 import subprocess
 import threading
@@ -40,8 +39,6 @@ from benchmarks.harness import machine
 from benchmarks.harness.machine_producers import gcloud_mock_recorder
 from benchmarks.harness.machine_producers import machine_producer
 
-DEFAULT_USER = getpass.getuser()
-
 
 class GCloudProducer(machine_producer.MachineProducer):
   """Implementation of MachineProducer backed by GCP.
@@ -50,9 +47,10 @@ class GCloudProducer(machine_producer.MachineProducer):
 
   Attributes:
     project: The GCP project name under which to create the machines.
-    ssh_key_path: path to a valid ssh key. See README on vaild ssh keys.
+    ssh_key_file: path to a valid ssh private key. See README on vaild ssh keys.
     image: image name as a string.
     image_project: image project as a string.
+    machine_type: type of GCP to create. e.g. n1-standard-4
     zone: string to a valid GCP zone.
     ssh_user: string of user name for ssh_key
     ssh_password: string of password for ssh key
@@ -63,18 +61,22 @@ class GCloudProducer(machine_producer.MachineProducer):
 
   def __init__(self,
                project: str,
-               ssh_key_path: str,
+               ssh_key_file: str,
                image: str,
                image_project: str,
+               machine_type: str,
                zone: str,
                ssh_user: str,
+               ssh_password: str,
                mock: gcloud_mock_recorder.MockPrinter = None):
     self.project = project
-    self.ssh_key_path = ssh_key_path
+    self.ssh_key_file = ssh_key_file
     self.image = image
     self.image_project = image_project
+    self.machine_type = machine_type
     self.zone = zone
-    self.ssh_user = ssh_user if ssh_user else DEFAULT_USER
+    self.ssh_user = ssh_user
+    self.ssh_password = ssh_password
     self.mock = mock
     self.condition = threading.Condition()
 
@@ -86,20 +88,19 @@ class GCloudProducer(machine_producer.MachineProducer):
     with self.condition:
       names = self._get_unique_names(num_machines)
       self._build_instances(names)
-      instances = self._start_command(names)
-      self._add_ssh_key_to_instances(names)
-      return self._machines_from_instances(instances)
+    instances = self._start_command(names)
+    self._add_ssh_key_to_instances(names)
+    return self._machines_from_instances(instances)
 
   def release_machines(self, machine_list: List[machine.Machine]):
     """Releases the requested number of machines, deleting the instances."""
     if not machine_list:
       return
-    with self.condition:
-      cmd = "gcloud compute instances delete --quiet".split(" ")
-      names = [str(m) for m in machine_list]
-      cmd.extend(names)
-      cmd.append("--zone={zone}".format(zone=self.zone))
-      self._run_command(cmd)
+    cmd = "gcloud compute instances delete --quiet".split(" ")
+    names = [str(m) for m in machine_list]
+    cmd.extend(names)
+    cmd.append("--zone={zone}".format(zone=self.zone))
+    self._run_command(cmd, detach=True)
 
   def _machines_from_instances(
       self, instances: List[Dict[str, Any]]) -> List[machine.Machine]:
@@ -111,9 +112,11 @@ class GCloudProducer(machine_producer.MachineProducer):
           "hostname":
               instance["networkInterfaces"][0]["accessConfigs"][0]["natIP"],
           "key_path":
-              self.ssh_key_path,
+              self.ssh_key_file,
           "username":
-              self.ssh_user
+              self.ssh_user,
+          "key_password":
+              self.ssh_password
       }
       machines.append(machine.RemoteMachine(name=name, **kwargs))
     return machines
@@ -148,12 +151,15 @@ class GCloudProducer(machine_producer.MachineProducer):
           "_build_instances cannot create instances without names.")
     cmd = "gcloud compute instances create".split(" ")
     cmd.extend(names)
-    cmd.extend("--preemptible --image={image} --zone={zone}".format(
-        image=self.image, zone=self.zone).split(" "))
+    cmd.extend(
+        "--preemptible --image={image} --zone={zone} --machine-type={machine_type}"
+        .format(
+            image=self.image, zone=self.zone,
+            machine_type=self.machine_type).split(" "))
     if self.image_project:
       cmd.append("--image-project={project}".format(project=self.image_project))
-      res = self._run_command(cmd)
-      return json.loads(res.stdout)
+    res = self._run_command(cmd)
+    return json.loads(res.stdout)
 
   def _start_command(self, names):
     """Starts instances using gcloud command.
@@ -184,7 +190,7 @@ class GCloudProducer(machine_producer.MachineProducer):
 
     Args:
       names: list of machine names to which to add the ssh-key
-        self.ssh_key_path.
+        self.ssh_key_file.
 
     Raises:
       subprocess.CalledProcessError: when underlying subprocess call returns an
@@ -193,7 +199,7 @@ class GCloudProducer(machine_producer.MachineProducer):
     """
     for name in names:
       cmd = "gcloud compute ssh {name}".format(name=name).split(" ")
-      cmd.append("--ssh-key-file={key}".format(key=self.ssh_key_path))
+      cmd.append("--ssh-key-file={key}".format(key=self.ssh_key_file))
       cmd.append("--zone={zone}".format(zone=self.zone))
       cmd.append("--command=uname")
       timeout = datetime.timedelta(seconds=5 * 60)
@@ -221,7 +227,9 @@ class GCloudProducer(machine_producer.MachineProducer):
     res = self._run_command(cmd)
     return json.loads(res.stdout)
 
-  def _run_command(self, cmd: List[str]) -> subprocess.CompletedProcess:
+  def _run_command(self,
+                   cmd: List[str],
+                   detach: bool = False) -> [None, subprocess.CompletedProcess]:
     """Runs command as a subprocess.
 
     Runs command as subprocess and returns the result.
@@ -230,14 +238,24 @@ class GCloudProducer(machine_producer.MachineProducer):
 
     Args:
       cmd: command to be run as a list of strings.
+      detach: if True, run the child process and don't wait for it to return.
 
     Returns:
-      Completed process object to be parsed by caller.
+      Completed process object to be parsed by caller or None if detach=True.
 
     Raises:
       CalledProcessError: if subprocess.run returns an error.
     """
     cmd = cmd + ["--format=json"]
+    if detach:
+      p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+      if self.mock:
+        out, _ = p.communicate()
+        self.mock.record(
+            subprocess.CompletedProcess(
+                returncode=p.returncode, stdout=out, args=p.args))
+      return
+
     res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     if self.mock:
       self.mock.record(res)
diff --git a/benchmarks/harness/ssh_connection.py b/benchmarks/harness/ssh_connection.py
index fcbfbcdb2..e0bf258f1 100644
--- a/benchmarks/harness/ssh_connection.py
+++ b/benchmarks/harness/ssh_connection.py
@@ -94,7 +94,7 @@ class SSHConnection:
     return stdout, stderr
 
   def send_workload(self, name: str) -> str:
-    """Sends a workload to the remote machine.
+    """Sends a workload tarball to the remote machine.
 
     Args:
       name: The workload name.
@@ -103,9 +103,6 @@ class SSHConnection:
       The remote path.
     """
     with self._client() as client:
-      for dirpath, _, filenames in os.walk(
-          harness.LOCAL_WORKLOADS_PATH.format(name)):
-        for filename in filenames:
-          send_one_file(client, os.path.join(dirpath, filename),
-                        harness.REMOTE_WORKLOADS_PATH.format(name))
+      send_one_file(client, harness.LOCAL_WORKLOADS_PATH.format(name),
+                    harness.REMOTE_WORKLOADS_PATH.format(name))
     return harness.REMOTE_WORKLOADS_PATH.format(name)
diff --git a/benchmarks/runner/__init__.py b/benchmarks/runner/__init__.py
index 6f56704d8..ba80d83d7 100644
--- a/benchmarks/runner/__init__.py
+++ b/benchmarks/runner/__init__.py
@@ -15,10 +15,13 @@
 
 import copy
 import csv
+import json
 import logging
+import os
 import pkgutil
 import pydoc
 import re
+import subprocess
 import sys
 import types
 from typing import List
@@ -26,8 +29,10 @@ from typing import Tuple
 
 import click
 
+from benchmarks import harness
 from benchmarks import suites
 from benchmarks.harness import benchmark_driver
+from benchmarks.harness.machine_producers import gcloud_producer
 from benchmarks.harness.machine_producers import machine_producer
 from benchmarks.harness.machine_producers import mock_producer
 from benchmarks.harness.machine_producers import yaml_producer
@@ -116,6 +121,61 @@ def run_mock(ctx, **kwargs):
   run(ctx, mock_producer.MockMachineProducer(), **kwargs)
 
 
+@runner.command("run-gcp", commands.GCPCommand)
+@click.pass_context
+def run_gcp(ctx, project: str, ssh_key_file: str, image: str,
+            image_project: str, machine_type: str, zone: str, ssh_user: str,
+            ssh_password: str, **kwargs):
+  """Runs all benchmarks on GCP instances."""
+
+  if not ssh_user:
+    ssh_user = harness.DEFAULT_USER
+
+  # Get the default project if one was not provided.
+  if not project:
+    sub = subprocess.run(
+        "gcloud config get-value project".split(" "), stdout=subprocess.PIPE)
+    if sub.returncode:
+      raise ValueError(
+          "Cannot get default project from gcloud. Is it configured>")
+    project = sub.stdout.decode("utf-8").strip("\n")
+
+  if not image_project:
+    image_project = project
+
+  # Check that the ssh-key exists and is readable.
+  if not os.access(ssh_key_file, os.R_OK):
+    raise ValueError(
+        "ssh key given `{ssh_key}` is does not exist or is not readable."
+        .format(ssh_key=ssh_key_file))
+
+  # Check that the image exists.
+  sub = subprocess.run(
+      "gcloud compute images describe {image} --project {image_project} --format=json"
+      .format(image=image, image_project=image_project).split(" "),
+      stdout=subprocess.PIPE)
+  if sub.returncode or "READY" not in json.loads(sub.stdout)["status"]:
+    raise ValueError(
+        "given image was not found or is not ready: {image} {image_project}."
+        .format(image=image, image_project=image_project))
+
+  # Check and set zone to default.
+  if not zone:
+    sub = subprocess.run(
+        "gcloud config get-value compute/zone".split(" "),
+        stdout=subprocess.PIPE)
+    if sub.returncode:
+      raise ValueError(
+          "Default zone is not set in gcloud. Set one or pass a zone with the --zone flag."
+      )
+    zone = sub.stdout.decode("utf-8").strip("\n")
+
+  producer = gcloud_producer.GCloudProducer(project, ssh_key_file, image,
+                                            image_project, machine_type, zone,
+                                            ssh_user, ssh_password)
+  run(ctx, producer, **kwargs)
+
+
 def run(ctx, producer: machine_producer.MachineProducer, method: str, runs: int,
         runtime: List[str], metric: List[str], stat: str, **kwargs):
   """Runs arbitrary benchmarks.
diff --git a/benchmarks/runner/commands.py b/benchmarks/runner/commands.py
index 4973843b9..7ab12fac6 100644
--- a/benchmarks/runner/commands.py
+++ b/benchmarks/runner/commands.py
@@ -24,6 +24,8 @@ def run_mock(**kwargs):
 """
 import click
 
+from benchmarks import harness
+
 
 class RunCommand(click.core.Command):
   """Base Run Command with flags.
@@ -82,3 +84,52 @@ class LocalCommand(RunCommand):
             ("--limit",),
             default=1,
             help="Limit of number of benchmarks that can run at a given time."))
+
+
+class GCPCommand(RunCommand):
+  """GCPCommand inherits all flags from RunCommand and adds flags for run_gcp method.
+
+  Attributes:
+    project: GCP project
+    ssh_key_path: path to the ssh-key to use for the run
+    image: name of the image to build machines from
+    image_project: GCP project under which to find image
+    zone: a GCP zone (e.g. us-west1-b)
+    ssh_user: username to use for the ssh-key
+    ssh_password: password to use for the ssh-key
+  """
+
+  def __init__(self, *args, **kwargs):
+    super().__init__(*args, **kwargs)
+
+    project = click.core.Option(
+        ("--project",),
+        help="Project to run on if not default value given by 'gcloud config get-value project'."
+    )
+    ssh_key_path = click.core.Option(
+        ("--ssh-key-file",),
+        help="Path to a valid ssh private key to use. See README on generating a valid ssh key. Set to ~/.ssh/benchmark-tools by default.",
+        default=harness.DEFAULT_USER_HOME + "/.ssh/benchmark-tools")
+    image = click.core.Option(("--image",),
+                              help="The image on which to build VMs.",
+                              default="bm-tools-testing")
+    image_project = click.core.Option(
+        ("--image_project",),
+        help="The project under which the image to be used is listed.",
+        default="")
+    machine_type = click.core.Option(("--machine_type",),
+                                     help="Type to make all machines.",
+                                     default="n1-standard-4")
+    zone = click.core.Option(("--zone",),
+                             help="The GCP zone to run on.",
+                             default="")
+    ssh_user = click.core.Option(("--ssh-user",),
+                                 help="User for the ssh key.",
+                                 default=harness.DEFAULT_USER)
+    ssh_password = click.core.Option(("--ssh-password",),
+                                     help="Password for the ssh key.",
+                                     default="")
+    self.params.extend([
+        project, ssh_key_path, image, image_project, machine_type, zone,
+        ssh_user, ssh_password
+    ])
-- 
cgit v1.2.3


From 3dd3275da7b665cf2ca297e4bf566fcc77025af8 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 16 Jan 2020 13:13:22 -0800
Subject: Add more files to /proc/[pid]/*

Files not implemented require VFSv2 plumbing into the kernel.
Also, cgroup is not implemented yet.

Updates #1195

PiperOrigin-RevId: 290129176
---
 pkg/sentry/fsimpl/kernfs/filesystem.go      |  63 ++----
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go |  16 +-
 pkg/sentry/fsimpl/kernfs/kernfs.go          |   2 +-
 pkg/sentry/fsimpl/kernfs/symlink.go         |  21 +-
 pkg/sentry/fsimpl/proc/BUILD                |   3 +-
 pkg/sentry/fsimpl/proc/mounts.go            |  33 ---
 pkg/sentry/fsimpl/proc/subtasks.go          | 126 +++++++++++
 pkg/sentry/fsimpl/proc/task.go              |  69 ++++--
 pkg/sentry/fsimpl/proc/task_files.go        | 315 +++++++++++++++++++++++++---
 pkg/sentry/fsimpl/proc/tasks.go             |   2 +-
 pkg/sentry/fsimpl/proc/tasks_test.go        |  20 +-
 pkg/sentry/vfs/permissions.go               |  24 ++-
 12 files changed, 549 insertions(+), 145 deletions(-)
 delete mode 100644 pkg/sentry/fsimpl/proc/mounts.go
 create mode 100644 pkg/sentry/fsimpl/proc/subtasks.go

diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 79759e0fc..a4600ad47 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -22,7 +22,6 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -40,7 +39,7 @@ func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingP
 		return nil, syserror.ENOTDIR
 	}
 	// Directory searchable?
-	if err := d.inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+	if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 afterSymlink:
@@ -182,8 +181,8 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
 //
 // Preconditions: Filesystem.mu must be locked for at least reading. parentInode
 // == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true.
-func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
-	if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
+	if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return "", err
 	}
 	pc := rp.Component()
@@ -206,7 +205,7 @@ func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInod
 // checkDeleteLocked checks that the file represented by vfsd may be deleted.
 //
 // Preconditions: Filesystem.mu must be locked for at least reading.
-func checkDeleteLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
+func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
 	parentVFSD := vfsd.Parent()
 	if parentVFSD == nil {
 		return syserror.EBUSY
@@ -214,36 +213,12 @@ func checkDeleteLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
 	if parentVFSD.IsDisowned() {
 		return syserror.ENOENT
 	}
-	if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+	if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	return nil
 }
 
-// checkRenameLocked checks that a rename operation may be performed on the
-// target dentry across the given set of parent directories. The target dentry
-// may be nil.
-//
-// Precondition: isDir(dstInode) == true.
-func checkRenameLocked(creds *auth.Credentials, src, dstDir *vfs.Dentry, dstInode Inode) error {
-	srcDir := src.Parent()
-	if srcDir == nil {
-		return syserror.EBUSY
-	}
-	if srcDir.IsDisowned() {
-		return syserror.ENOENT
-	}
-	if dstDir.IsDisowned() {
-		return syserror.ENOENT
-	}
-	// Check for creation permissions on dst dir.
-	if err := dstInode.CheckPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
-		return err
-	}
-
-	return nil
-}
-
 // Release implements vfs.FilesystemImpl.Release.
 func (fs *Filesystem) Release() {
 }
@@ -269,7 +244,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 		if !d.isDir() {
 			return nil, syserror.ENOTDIR
 		}
-		if err := inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
 			return nil, err
 		}
 	}
@@ -302,7 +277,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 	if err != nil {
 		return err
 	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
 	if err != nil {
 		return err
 	}
@@ -339,7 +314,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 	if err != nil {
 		return err
 	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
 	if err != nil {
 		return err
 	}
@@ -367,7 +342,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 	if err != nil {
 		return err
 	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
 	if err != nil {
 		return err
 	}
@@ -401,7 +376,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if err != nil {
 			return nil, err
 		}
-		if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil {
+		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
 		return inode.Open(rp, vfsd, opts.Flags)
@@ -420,7 +395,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if mustCreate {
 			return nil, syserror.EEXIST
 		}
-		if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil {
+		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
 		return inode.Open(rp, vfsd, opts.Flags)
@@ -432,7 +407,7 @@ afterTrailingSymlink:
 		return nil, err
 	}
 	// Check for search permission in the parent directory.
-	if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+	if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 	// Reject attempts to open directories with O_CREAT.
@@ -450,7 +425,7 @@ afterTrailingSymlink:
 	}
 	if childVFSD == nil {
 		// Already checked for searchability above; now check for writability.
-		if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
 			return nil, err
 		}
 		if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -485,7 +460,7 @@ afterTrailingSymlink:
 			goto afterTrailingSymlink
 		}
 	}
-	if err := childInode.CheckPermissions(rp.Credentials(), ats); err != nil {
+	if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
 	return childInode.Open(rp, childVFSD, opts.Flags)
@@ -545,13 +520,13 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	srcVFSD := &src.vfsd
 
 	// Can we remove the src dentry?
-	if err := checkDeleteLocked(rp, srcVFSD); err != nil {
+	if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil {
 		return err
 	}
 
 	// Can we create the dst dentry?
 	var dstVFSD *vfs.Dentry
-	pc, err := checkCreateLocked(rp, dstDirVFSD, dstDirInode)
+	pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode)
 	switch err {
 	case nil:
 		// Ok, continue with rename as replacement.
@@ -607,7 +582,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(rp, vfsd); err != nil {
+	if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
 		return err
 	}
 	if !vfsd.Impl().(*Dentry).isDir() {
@@ -683,7 +658,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 	if err != nil {
 		return err
 	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
 	if err != nil {
 		return err
 	}
@@ -712,7 +687,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(rp, vfsd); err != nil {
+	if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
 		return err
 	}
 	if vfsd.Impl().(*Dentry).isDir() {
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 6aff3d39a..1700fffd9 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -262,7 +262,7 @@ func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
 }
 
 // CheckPermissions implements Inode.CheckPermissions.
-func (a *InodeAttrs) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
 	mode := a.Mode()
 	return vfs.GenericCheckPermissions(
 		creds,
@@ -527,12 +527,8 @@ var _ Inode = (*StaticDirectory)(nil)
 
 // NewStaticDir creates a new static directory and returns its dentry.
 func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry {
-	if perm&^linux.PermissionsMask != 0 {
-		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
-	}
-
 	inode := &StaticDirectory{}
-	inode.InodeAttrs.Init(creds, ino, linux.ModeDirectory|perm)
+	inode.Init(creds, ino, perm)
 
 	dentry := &Dentry{}
 	dentry.Init(inode)
@@ -544,6 +540,14 @@ func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, chil
 	return dentry
 }
 
+// Init initializes StaticDirectory.
+func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.FileMode) {
+	if perm&^linux.PermissionsMask != 0 {
+		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
+	}
+	s.InodeAttrs.Init(creds, ino, linux.ModeDirectory|perm)
+}
+
 // Open implements kernfs.Inode.
 func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
 	fd := &GenericDirectoryFD{}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index bb12f39a2..85bcdcc57 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -320,7 +320,7 @@ type inodeMetadata interface {
 	// CheckPermissions checks that creds may access this inode for the
 	// requested access type, per the the rules of
 	// fs/namei.c:generic_permission().
-	CheckPermissions(creds *auth.Credentials, atx vfs.AccessTypes) error
+	CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error
 
 	// Mode returns the (struct stat)::st_mode value for this inode. This is
 	// separated from Stat for performance.
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 068063f4e..f19f12854 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -20,7 +20,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
-type staticSymlink struct {
+// StaticSymlink provides an Inode implementation for symlinks that point to
+// a immutable target.
+type StaticSymlink struct {
 	InodeAttrs
 	InodeNoopRefCount
 	InodeSymlink
@@ -28,18 +30,25 @@ type staticSymlink struct {
 	target string
 }
 
-var _ Inode = (*staticSymlink)(nil)
+var _ Inode = (*StaticSymlink)(nil)
 
 // NewStaticSymlink creates a new symlink file pointing to 'target'.
-func NewStaticSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, target string) *Dentry {
-	inode := &staticSymlink{target: target}
-	inode.Init(creds, ino, linux.ModeSymlink|perm)
+func NewStaticSymlink(creds *auth.Credentials, ino uint64, target string) *Dentry {
+	inode := &StaticSymlink{}
+	inode.Init(creds, ino, target)
 
 	d := &Dentry{}
 	d.Init(inode)
 	return d
 }
 
-func (s *staticSymlink) Readlink(_ context.Context) (string, error) {
+// Init initializes the instance.
+func (s *StaticSymlink) Init(creds *auth.Credentials, ino uint64, target string) {
+	s.target = target
+	s.InodeAttrs.Init(creds, ino, linux.ModeSymlink|0777)
+}
+
+// Readlink implements Inode.
+func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
 	return s.target, nil
 }
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 6cd18cec8..e92564b5d 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -7,7 +7,7 @@ go_library(
     name = "proc",
     srcs = [
         "filesystem.go",
-        "mounts.go",
+        "subtasks.go",
         "task.go",
         "task_files.go",
         "tasks.go",
@@ -29,6 +29,7 @@ go_library(
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/limits",
         "//pkg/sentry/mm",
+        "//pkg/sentry/safemem",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fsimpl/proc/mounts.go b/pkg/sentry/fsimpl/proc/mounts.go
deleted file mode 100644
index 8683cf677..000000000
--- a/pkg/sentry/fsimpl/proc/mounts.go
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import "gvisor.dev/gvisor/pkg/sentry/kernel"
-
-// TODO(gvisor.dev/issue/1195): Implement mountInfoFile and mountsFile.
-
-// mountInfoFile implements vfs.DynamicBytesSource for /proc/[pid]/mountinfo.
-//
-// +stateify savable
-type mountInfoFile struct {
-	t *kernel.Task
-}
-
-// mountsFile implements vfs.DynamicBytesSource for /proc/[pid]/mounts.
-//
-// +stateify savable
-type mountsFile struct {
-	t *kernel.Task
-}
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
new file mode 100644
index 000000000..8892c5a11
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -0,0 +1,126 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"sort"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// subtasksInode represents the inode for /proc/[pid]/task/ directory.
+//
+// +stateify savable
+type subtasksInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
+
+	task   *kernel.Task
+	pidns  *kernel.PIDNamespace
+	inoGen InoGenerator
+}
+
+var _ kernfs.Inode = (*subtasksInode)(nil)
+
+func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenerator) *kernfs.Dentry {
+	subInode := &subtasksInode{
+		task:   task,
+		pidns:  pidns,
+		inoGen: inoGen,
+	}
+	// Note: credentials are overridden by taskOwnedInode.
+	subInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555)
+	subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+
+	inode := &taskOwnedInode{Inode: subInode, owner: task}
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
+
+	return dentry
+}
+
+// Valid implements kernfs.inodeDynamicLookup.
+func (i *subtasksInode) Valid(ctx context.Context) bool {
+	return true
+}
+
+// Lookup implements kernfs.inodeDynamicLookup.
+func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	tid, err := strconv.ParseUint(name, 10, 32)
+	if err != nil {
+		return nil, syserror.ENOENT
+	}
+
+	subTask := i.pidns.TaskWithID(kernel.ThreadID(tid))
+	if subTask == nil {
+		return nil, syserror.ENOENT
+	}
+	if subTask.ThreadGroup() != i.task.ThreadGroup() {
+		return nil, syserror.ENOENT
+	}
+
+	subTaskDentry := newTaskInode(i.inoGen, subTask, i.pidns, false)
+	return subTaskDentry.VFSDentry(), nil
+}
+
+// IterDirents implements kernfs.inodeDynamicLookup.
+func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	tasks := i.task.ThreadGroup().MemberIDs(i.pidns)
+	if len(tasks) == 0 {
+		return offset, syserror.ENOENT
+	}
+
+	tids := make([]int, 0, len(tasks))
+	for _, tid := range tasks {
+		tids = append(tids, int(tid))
+	}
+
+	sort.Ints(tids)
+	for _, tid := range tids[relOffset:] {
+		dirent := vfs.Dirent{
+			Name:    strconv.FormatUint(uint64(tid), 10),
+			Type:    linux.DT_DIR,
+			Ino:     i.inoGen.NextIno(),
+			NextOff: offset + 1,
+		}
+		if !cb.Handle(dirent) {
+			return offset, nil
+		}
+		offset++
+	}
+	return offset, nil
+}
+
+// Open implements kernfs.Inode.
+func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	fd := &kernfs.GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+	return fd.VFSFileDescription(), nil
+}
+
+// Stat implements kernfs.Inode.
+func (i *subtasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx {
+	stat := i.InodeAttrs.Stat(vsfs)
+	stat.Nlink += uint32(i.task.ThreadGroup().Count())
+	return stat
+}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 5a384817f..621c17cfe 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -15,6 +15,8 @@
 package proc
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
@@ -42,27 +44,31 @@ var _ kernfs.Inode = (*taskInode)(nil)
 
 func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool) *kernfs.Dentry {
 	contents := map[string]*kernfs.Dentry{
-		//"auxv":      newAuxvec(t, msrc),
-		//"cmdline":   newExecArgInode(t, msrc, cmdlineExecArg),
-		//"comm":      newComm(t, msrc),
-		//"environ":   newExecArgInode(t, msrc, environExecArg),
+		"auxv":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}),
+		"cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
+		"comm":    newComm(task, inoGen.NextIno(), 0444),
+		"environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
 		//"exe":       newExe(t, msrc),
 		//"fd":        newFdDir(t, msrc),
 		//"fdinfo":    newFdInfoDir(t, msrc),
-		//"gid_map":   newGIDMap(t, msrc),
-		"io":   newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)),
-		"maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}),
+		"gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}),
+		"io":      newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)),
+		"maps":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}),
 		//"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
 		//"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
-		//"ns":        newNamespaceDir(t, msrc),
-		"smaps":  newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}),
-		"stat":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}),
-		"statm":  newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{t: task}),
-		"status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{t: task, pidns: pidns}),
-		//"uid_map":   newUIDMap(t, msrc),
+		"ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{
+			"net":  newNamespaceSymlink(task, inoGen.NextIno(), "net"),
+			"pid":  newNamespaceSymlink(task, inoGen.NextIno(), "pid"),
+			"user": newNamespaceSymlink(task, inoGen.NextIno(), "user"),
+		}),
+		"smaps":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}),
+		"stat":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
+		"statm":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}),
+		"status":  newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
+		"uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}),
 	}
 	if isThreadGroup {
-		//contents["task"] = p.newSubtasks(t, msrc)
+		contents["task"] = newSubtasks(task, pidns, inoGen)
 	}
 	//if len(p.cgroupControllers) > 0 {
 	//	contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers)
@@ -127,6 +133,23 @@ func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode
 	return d
 }
 
+func newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry {
+	dir := &kernfs.StaticDirectory{}
+
+	// Note: credentials are overridden by taskOwnedInode.
+	dir.Init(task.Credentials(), ino, perm)
+
+	inode := &taskOwnedInode{Inode: dir, owner: task}
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+
+	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	links := dir.OrderedChildren.Populate(d, children)
+	dir.IncLinks(links)
+
+	return d
+}
+
 // Stat implements kernfs.Inode.
 func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx {
 	stat := i.Inode.Stat(fs)
@@ -137,7 +160,7 @@ func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx {
 }
 
 // CheckPermissions implements kernfs.Inode.
-func (i *taskOwnedInode) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
 	mode := i.Mode()
 	uid, gid := i.getOwner(mode)
 	return vfs.GenericCheckPermissions(
@@ -188,3 +211,19 @@ func newIO(t *kernel.Task, isThreadGroup bool) *ioData {
 	}
 	return &ioData{ioUsage: t}
 }
+
+func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
+	// Namespace symlinks should contain the namespace name and the inode number
+	// for the namespace instance, so for example user:[123456]. We currently fake
+	// the inode number by sticking the symlink inode in its place.
+	target := fmt.Sprintf("%s:[%d]", ns, ino)
+
+	inode := &kernfs.StaticSymlink{}
+	// Note: credentials are overridden by taskOwnedInode.
+	inode.Init(task.Credentials(), ino, target)
+
+	taskInode := &taskOwnedInode{Inode: inode, owner: task}
+	d := &kernfs.Dentry{}
+	d.Init(taskInode)
+	return d
+}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 93f0e1aa8..7bc352ae9 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -17,15 +17,20 @@ package proc
 import (
 	"bytes"
 	"fmt"
+	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // mm gets the kernel task's MemoryManager. No additional reference is taken on
@@ -41,6 +46,256 @@ func getMM(task *kernel.Task) *mm.MemoryManager {
 	return tmm
 }
 
+// getMMIncRef returns t's MemoryManager. If getMMIncRef succeeds, the
+// MemoryManager's users count is incremented, and must be decremented by the
+// caller when it is no longer in use.
+func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) {
+	if task.ExitState() == kernel.TaskExitDead {
+		return nil, syserror.ESRCH
+	}
+	var m *mm.MemoryManager
+	task.WithMuLocked(func(t *kernel.Task) {
+		m = t.MemoryManager()
+	})
+	if m == nil || !m.IncUsers() {
+		return nil, io.EOF
+	}
+	return m, nil
+}
+
+type bufferWriter struct {
+	buf *bytes.Buffer
+}
+
+// WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns
+// the number of bytes written. It may return a partial write without an
+// error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not
+// return a full write with an error (i.e. srcs.NumBytes(), err) where err
+// != nil).
+func (w *bufferWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	written := srcs.NumBytes()
+	for !srcs.IsEmpty() {
+		w.buf.Write(srcs.Head().ToSlice())
+		srcs = srcs.Tail()
+	}
+	return written, nil
+}
+
+// auxvData implements vfs.DynamicBytesSource for /proc/[pid]/auxv.
+//
+// +stateify savable
+type auxvData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*auxvData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	m, err := getMMIncRef(d.task)
+	if err != nil {
+		return err
+	}
+	defer m.DecUsers(ctx)
+
+	// Space for buffer with AT_NULL (0) terminator at the end.
+	auxv := m.Auxv()
+	buf.Grow((len(auxv) + 1) * 16)
+	for _, e := range auxv {
+		var tmp [8]byte
+		usermem.ByteOrder.PutUint64(tmp[:], e.Key)
+		buf.Write(tmp[:])
+
+		usermem.ByteOrder.PutUint64(tmp[:], uint64(e.Value))
+		buf.Write(tmp[:])
+	}
+	return nil
+}
+
+// execArgType enumerates the types of exec arguments that are exposed through
+// proc.
+type execArgType int
+
+const (
+	cmdlineDataArg execArgType = iota
+	environDataArg
+)
+
+// cmdlineData implements vfs.DynamicBytesSource for /proc/[pid]/cmdline.
+//
+// +stateify savable
+type cmdlineData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+
+	// arg is the type of exec argument this file contains.
+	arg execArgType
+}
+
+var _ dynamicInode = (*cmdlineData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	m, err := getMMIncRef(d.task)
+	if err != nil {
+		return err
+	}
+	defer m.DecUsers(ctx)
+
+	// Figure out the bounds of the exec arg we are trying to read.
+	var ar usermem.AddrRange
+	switch d.arg {
+	case cmdlineDataArg:
+		ar = usermem.AddrRange{
+			Start: m.ArgvStart(),
+			End:   m.ArgvEnd(),
+		}
+	case environDataArg:
+		ar = usermem.AddrRange{
+			Start: m.EnvvStart(),
+			End:   m.EnvvEnd(),
+		}
+	default:
+		panic(fmt.Sprintf("unknown exec arg type %v", d.arg))
+	}
+	if ar.Start == 0 || ar.End == 0 {
+		// Don't attempt to read before the start/end are set up.
+		return io.EOF
+	}
+
+	// N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true
+	// until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading
+	// cmdline and environment").
+	writer := &bufferWriter{buf: buf}
+	if n, err := m.CopyInTo(ctx, usermem.AddrRangeSeqOf(ar), writer, usermem.IOOpts{}); n == 0 || err != nil {
+		// Nothing to copy or something went wrong.
+		return err
+	}
+
+	// On Linux, if the NULL byte at the end of the argument vector has been
+	// overwritten, it continues reading the environment vector as part of
+	// the argument vector.
+	if d.arg == cmdlineDataArg && buf.Bytes()[buf.Len()-1] != 0 {
+		if end := bytes.IndexByte(buf.Bytes(), 0); end != -1 {
+			// If we found a NULL character somewhere else in argv, truncate the
+			// return up to the NULL terminator (including it).
+			buf.Truncate(end)
+			return nil
+		}
+
+		// There is no NULL terminator in the string, return into envp.
+		arEnvv := usermem.AddrRange{
+			Start: m.EnvvStart(),
+			End:   m.EnvvEnd(),
+		}
+
+		// Upstream limits the returned amount to one page of slop.
+		// https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208
+		// we'll return one page total between argv and envp because of the
+		// above page restrictions.
+		if buf.Len() >= usermem.PageSize {
+			// Returned at least one page already, nothing else to add.
+			return nil
+		}
+		remaining := usermem.PageSize - buf.Len()
+		if int(arEnvv.Length()) > remaining {
+			end, ok := arEnvv.Start.AddLength(uint64(remaining))
+			if !ok {
+				return syserror.EFAULT
+			}
+			arEnvv.End = end
+		}
+		if _, err := m.CopyInTo(ctx, usermem.AddrRangeSeqOf(arEnvv), writer, usermem.IOOpts{}); err != nil {
+			return err
+		}
+
+		// Linux will return envp up to and including the first NULL character,
+		// so find it.
+		if end := bytes.IndexByte(buf.Bytes()[ar.Length():], 0); end != -1 {
+			buf.Truncate(end)
+		}
+	}
+
+	return nil
+}
+
+// +stateify savable
+type commInode struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+func newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry {
+	inode := &commInode{task: task}
+	inode.DynamicBytesFile.Init(task.Credentials(), ino, &commData{task: task}, perm)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	// This file can always be read or written by members of the same thread
+	// group. See fs/proc/base.c:proc_tid_comm_permission.
+	//
+	// N.B. This check is currently a no-op as we don't yet support writing and
+	// this file is world-readable anyways.
+	t := kernel.TaskFromContext(ctx)
+	if t != nil && t.ThreadGroup() == i.task.ThreadGroup() && !ats.MayExec() {
+		return nil
+	}
+
+	return i.DynamicBytesFile.CheckPermissions(ctx, creds, ats)
+}
+
+// commData implements vfs.DynamicBytesSource for /proc/[pid]/comm.
+//
+// +stateify savable
+type commData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*commData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString(d.task.Name())
+	buf.WriteString("\n")
+	return nil
+}
+
+// idMapData implements vfs.DynamicBytesSource for /proc/[pid]/{gid_map|uid_map}.
+//
+// +stateify savable
+type idMapData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+	gids bool
+}
+
+var _ dynamicInode = (*idMapData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	var entries []auth.IDMapEntry
+	if d.gids {
+		entries = d.task.UserNamespace().GIDMap()
+	} else {
+		entries = d.task.UserNamespace().UIDMap()
+	}
+	for _, e := range entries {
+		fmt.Fprintf(buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length)
+	}
+	return nil
+}
+
 // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
 //
 // +stateify savable
@@ -83,7 +338,7 @@ func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 type taskStatData struct {
 	kernfs.DynamicBytesFile
 
-	t *kernel.Task
+	task *kernel.Task
 
 	// If tgstats is true, accumulate fault stats (not implemented) and CPU
 	// time across all tasks in t's thread group.
@@ -98,40 +353,40 @@ var _ dynamicInode = (*taskStatData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t))
-	fmt.Fprintf(buf, "(%s) ", s.t.Name())
-	fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0])
+	fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.task))
+	fmt.Fprintf(buf, "(%s) ", s.task.Name())
+	fmt.Fprintf(buf, "%c ", s.task.StateStatus()[0])
 	ppid := kernel.ThreadID(0)
-	if parent := s.t.Parent(); parent != nil {
+	if parent := s.task.Parent(); parent != nil {
 		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
 	}
 	fmt.Fprintf(buf, "%d ", ppid)
-	fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup()))
-	fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session()))
+	fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.task.ThreadGroup().ProcessGroup()))
+	fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.task.ThreadGroup().Session()))
 	fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
 	fmt.Fprintf(buf, "0 " /* flags */)
 	fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
 	var cputime usage.CPUStats
 	if s.tgstats {
-		cputime = s.t.ThreadGroup().CPUStats()
+		cputime = s.task.ThreadGroup().CPUStats()
 	} else {
-		cputime = s.t.CPUStats()
+		cputime = s.task.CPUStats()
 	}
 	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
-	cputime = s.t.ThreadGroup().JoinedChildCPUStats()
+	cputime = s.task.ThreadGroup().JoinedChildCPUStats()
 	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
-	fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness())
-	fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count())
+	fmt.Fprintf(buf, "%d %d ", s.task.Priority(), s.task.Niceness())
+	fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Count())
 
 	// itrealvalue. Since kernel 2.6.17, this field is no longer
 	// maintained, and is hard coded as 0.
 	fmt.Fprintf(buf, "0 ")
 
 	// Start time is relative to boot time, expressed in clock ticks.
-	fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime())))
+	fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime())))
 
 	var vss, rss uint64
-	s.t.WithMuLocked(func(t *kernel.Task) {
+	s.task.WithMuLocked(func(t *kernel.Task) {
 		if mm := t.MemoryManager(); mm != nil {
 			vss = mm.VirtualMemorySize()
 			rss = mm.ResidentSetSize()
@@ -140,14 +395,14 @@ func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize)
 
 	// rsslim.
-	fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur)
+	fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Limits().Get(limits.Rss).Cur)
 
 	fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
 	fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
 	fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
 	terminationSignal := linux.Signal(0)
-	if s.t == s.t.ThreadGroup().Leader() {
-		terminationSignal = s.t.ThreadGroup().TerminationSignal()
+	if s.task == s.task.ThreadGroup().Leader() {
+		terminationSignal = s.task.ThreadGroup().TerminationSignal()
 	}
 	fmt.Fprintf(buf, "%d ", terminationSignal)
 	fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
@@ -164,7 +419,7 @@ func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 type statmData struct {
 	kernfs.DynamicBytesFile
 
-	t *kernel.Task
+	task *kernel.Task
 }
 
 var _ dynamicInode = (*statmData)(nil)
@@ -172,7 +427,7 @@ var _ dynamicInode = (*statmData)(nil)
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	var vss, rss uint64
-	s.t.WithMuLocked(func(t *kernel.Task) {
+	s.task.WithMuLocked(func(t *kernel.Task) {
 		if mm := t.MemoryManager(); mm != nil {
 			vss = mm.VirtualMemorySize()
 			rss = mm.ResidentSetSize()
@@ -189,7 +444,7 @@ func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 type statusData struct {
 	kernfs.DynamicBytesFile
 
-	t     *kernel.Task
+	task  *kernel.Task
 	pidns *kernel.PIDNamespace
 }
 
@@ -197,23 +452,23 @@ var _ dynamicInode = (*statusData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name())
-	fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus())
-	fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup()))
-	fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t))
+	fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name())
+	fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus())
+	fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup()))
+	fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task))
 	ppid := kernel.ThreadID(0)
-	if parent := s.t.Parent(); parent != nil {
+	if parent := s.task.Parent(); parent != nil {
 		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
 	}
 	fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
 	tpid := kernel.ThreadID(0)
-	if tracer := s.t.Tracer(); tracer != nil {
+	if tracer := s.task.Tracer(); tracer != nil {
 		tpid = s.pidns.IDOfTask(tracer)
 	}
 	fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
 	var fds int
 	var vss, rss, data uint64
-	s.t.WithMuLocked(func(t *kernel.Task) {
+	s.task.WithMuLocked(func(t *kernel.Task) {
 		if fdTable := t.FDTable(); fdTable != nil {
 			fds = fdTable.Size()
 		}
@@ -227,13 +482,13 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10)
 	fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
 	fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
-	fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count())
-	creds := s.t.Credentials()
+	fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count())
+	creds := s.task.Credentials()
 	fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
 	fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
 	fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
 	fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
-	fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode())
+	fmt.Fprintf(buf, "Seccomp:\t%d\n", s.task.SeccompMode())
 	// We unconditionally report a single NUMA node. See
 	// pkg/sentry/syscalls/linux/sys_mempolicy.go.
 	fmt.Fprintf(buf, "Mems_allowed:\t1\n")
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 72315d25c..a97b1753a 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -66,7 +66,7 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames
 		"loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}),
 		"sys":     newSysDir(root, inoGen),
 		"meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}),
-		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), 0777, "self/mounts"),
+		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"),
 		"stat":    newDentry(root, inoGen.NextIno(), 0444, &statData{}),
 		"uptime":  newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}),
 		"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}),
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 76eafe593..6b58c16b9 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -85,12 +85,20 @@ func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
 
 func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
 	wants := map[string]vfs.Dirent{
-		"io":     {Type: linux.DT_REG},
-		"maps":   {Type: linux.DT_REG},
-		"smaps":  {Type: linux.DT_REG},
-		"stat":   {Type: linux.DT_REG},
-		"statm":  {Type: linux.DT_REG},
-		"status": {Type: linux.DT_REG},
+		"auxv":    {Type: linux.DT_REG},
+		"cmdline": {Type: linux.DT_REG},
+		"comm":    {Type: linux.DT_REG},
+		"environ": {Type: linux.DT_REG},
+		"gid_map": {Type: linux.DT_REG},
+		"io":      {Type: linux.DT_REG},
+		"maps":    {Type: linux.DT_REG},
+		"ns":      {Type: linux.DT_DIR},
+		"smaps":   {Type: linux.DT_REG},
+		"stat":    {Type: linux.DT_REG},
+		"statm":   {Type: linux.DT_REG},
+		"status":  {Type: linux.DT_REG},
+		"task":    {Type: linux.DT_DIR},
+		"uid_map": {Type: linux.DT_REG},
 	}
 	return checkFiles(gots, wants)
 }
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index f1edb0680..d279d05ca 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -30,6 +30,26 @@ const (
 	MayExec              = 1
 )
 
+// OnlyRead returns true if access _only_ allows read.
+func (a AccessTypes) OnlyRead() bool {
+	return a == MayRead
+}
+
+// MayRead returns true if access allows read.
+func (a AccessTypes) MayRead() bool {
+	return a&MayRead != 0
+}
+
+// MayWrite returns true if access allows write.
+func (a AccessTypes) MayWrite() bool {
+	return a&MayWrite != 0
+}
+
+// MayExec returns true if access allows exec.
+func (a AccessTypes) MayExec() bool {
+	return a&MayExec != 0
+}
+
 // GenericCheckPermissions checks that creds has the given access rights on a
 // file with the given permissions, UID, and GID, subject to the rules of
 // fs/namei.c:generic_permission(). isDir is true if the file is a directory.
@@ -53,7 +73,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
 	}
 	// CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary
 	// directories, and read arbitrary non-directory files.
-	if (isDir && (ats&MayWrite == 0)) || ats == MayRead {
+	if (isDir && !ats.MayWrite()) || ats.OnlyRead() {
 		if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) {
 			return nil
 		}
@@ -61,7 +81,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
 	// CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write
 	// access to non-directory files, and execute access to non-directory files
 	// for which at least one execute bit is set.
-	if isDir || (ats&MayExec == 0) || (mode&0111 != 0) {
+	if isDir || !ats.MayExec() || (mode&0111 != 0) {
 		if creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
 			return nil
 		}
-- 
cgit v1.2.3


From 70d7c52bd7583393d39177a7935cca57372d67f1 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Thu, 16 Jan 2020 13:58:25 -0800
Subject: Implement tmpfs.SetStat with a size argument.

This is similar to 'Truncate' in vfs1.

Updates https://github.com/google/gvisor/issues/1197

PiperOrigin-RevId: 290139140
---
 pkg/sentry/fsimpl/tmpfs/regular_file.go      |  35 ++++++++
 pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 121 +++++++++++++++++++++++++++
 pkg/sentry/fsimpl/tmpfs/tmpfs.go             |  54 ++++++++++--
 3 files changed, 205 insertions(+), 5 deletions(-)

diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index f200e767d..5fa70cc6d 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -63,6 +63,41 @@ func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMod
 	return &file.inode
 }
 
+// truncate grows or shrinks the file to the given size. It returns true if the
+// file size was updated.
+func (rf *regularFile) truncate(size uint64) (bool, error) {
+	rf.mu.Lock()
+	defer rf.mu.Unlock()
+
+	if size == rf.size {
+		// Nothing to do.
+		return false, nil
+	}
+
+	if size > rf.size {
+		// Growing the file.
+		if rf.seals&linux.F_SEAL_GROW != 0 {
+			// Seal does not allow growth.
+			return false, syserror.EPERM
+		}
+		rf.size = size
+		return true, nil
+	}
+
+	// Shrinking the file
+	if rf.seals&linux.F_SEAL_SHRINK != 0 {
+		// Seal does not allow shrink.
+		return false, syserror.EPERM
+	}
+
+	// TODO(gvisor.dev/issues/1197): Invalidate mappings once we have
+	// mappings.
+
+	rf.data.Truncate(size, rf.memFile)
+	rf.size = size
+	return true, nil
+}
+
 type regularFileFD struct {
 	fileDescription
 
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index 7b0a962f0..034a29fdb 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -313,3 +313,124 @@ func TestPRead(t *testing.T) {
 		}
 	}
 }
+
+func TestTruncate(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, 0644)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Fill the file with some data.
+	data := bytes.Repeat([]byte("gVisor is awsome"), 100)
+	written, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+
+	// Size should be same as written.
+	sizeStatOpts := vfs.StatOptions{Mask: linux.STATX_SIZE}
+	stat, err := fd.Stat(ctx, sizeStatOpts)
+	if err != nil {
+		t.Fatalf("fd.Stat failed: %v", err)
+	}
+	if got, want := int64(stat.Size), written; got != want {
+		t.Errorf("fd.Stat got size %d, want %d", got, want)
+	}
+
+	// Truncate down.
+	newSize := uint64(10)
+	if err := fd.SetStat(ctx, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: newSize,
+		},
+	}); err != nil {
+		t.Errorf("fd.Truncate failed: %v", err)
+	}
+	// Size should be updated.
+	statAfterTruncateDown, err := fd.Stat(ctx, sizeStatOpts)
+	if err != nil {
+		t.Fatalf("fd.Stat failed: %v", err)
+	}
+	if got, want := statAfterTruncateDown.Size, newSize; got != want {
+		t.Errorf("fd.Stat got size %d, want %d", got, want)
+	}
+	// We should only read newSize worth of data.
+	buf := make([]byte, 1000)
+	if n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0, vfs.ReadOptions{}); err != nil && err != io.EOF {
+		t.Fatalf("fd.PRead failed: %v", err)
+	} else if uint64(n) != newSize {
+		t.Errorf("fd.PRead got size %d, want %d", n, newSize)
+	}
+	// Mtime and Ctime should be bumped.
+	if got := statAfterTruncateDown.Mtime.ToNsec(); got <= stat.Mtime.ToNsec() {
+		t.Errorf("fd.Stat got Mtime %v, want > %v", got, stat.Mtime)
+	}
+	if got := statAfterTruncateDown.Ctime.ToNsec(); got <= stat.Ctime.ToNsec() {
+		t.Errorf("fd.Stat got Ctime %v, want > %v", got, stat.Ctime)
+	}
+
+	// Truncate up.
+	newSize = 100
+	if err := fd.SetStat(ctx, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: newSize,
+		},
+	}); err != nil {
+		t.Errorf("fd.Truncate failed: %v", err)
+	}
+	// Size should be updated.
+	statAfterTruncateUp, err := fd.Stat(ctx, sizeStatOpts)
+	if err != nil {
+		t.Fatalf("fd.Stat failed: %v", err)
+	}
+	if got, want := statAfterTruncateUp.Size, newSize; got != want {
+		t.Errorf("fd.Stat got size %d, want %d", got, want)
+	}
+	// We should read newSize worth of data.
+	buf = make([]byte, 1000)
+	if n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0, vfs.ReadOptions{}); err != nil && err != io.EOF {
+		t.Fatalf("fd.PRead failed: %v", err)
+	} else if uint64(n) != newSize {
+		t.Errorf("fd.PRead got size %d, want %d", n, newSize)
+	}
+	// Bytes should be null after 10, since we previously truncated to 10.
+	for i := uint64(10); i < newSize; i++ {
+		if buf[i] != 0 {
+			t.Errorf("fd.PRead got byte %d=%x, want 0", i, buf[i])
+			break
+		}
+	}
+	// Mtime and Ctime should be bumped.
+	if got := statAfterTruncateUp.Mtime.ToNsec(); got <= statAfterTruncateDown.Mtime.ToNsec() {
+		t.Errorf("fd.Stat got Mtime %v, want > %v", got, statAfterTruncateDown.Mtime)
+	}
+	if got := statAfterTruncateUp.Ctime.ToNsec(); got <= statAfterTruncateDown.Ctime.ToNsec() {
+		t.Errorf("fd.Stat got Ctime %v, want > %v", got, stat.Ctime)
+	}
+
+	// Truncate to the current size.
+	newSize = statAfterTruncateUp.Size
+	if err := fd.SetStat(ctx, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: newSize,
+		},
+	}); err != nil {
+		t.Errorf("fd.Truncate failed: %v", err)
+	}
+	statAfterTruncateNoop, err := fd.Stat(ctx, sizeStatOpts)
+	if err != nil {
+		t.Fatalf("fd.Stat failed: %v", err)
+	}
+	// Mtime and Ctime should not be bumped, since operation is a noop.
+	if got := statAfterTruncateNoop.Mtime.ToNsec(); got != statAfterTruncateUp.Mtime.ToNsec() {
+		t.Errorf("fd.Stat got Mtime %v, want %v", got, statAfterTruncateUp.Mtime)
+	}
+	if got := statAfterTruncateNoop.Ctime.ToNsec(); got != statAfterTruncateUp.Ctime.ToNsec() {
+		t.Errorf("fd.Stat got Ctime %v, want %v", got, statAfterTruncateUp.Ctime)
+	}
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index d6960ee47..1d4889c89 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -35,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // FilesystemType implements vfs.FilesystemType.
@@ -121,6 +122,9 @@ func (d *dentry) DecRef() {
 
 // inode represents a filesystem object.
 type inode struct {
+	// clock is a realtime clock used to set timestamps in file operations.
+	clock time.Clock
+
 	// refs is a reference count. refs is accessed using atomic memory
 	// operations.
 	//
@@ -151,13 +155,14 @@ type inode struct {
 const maxLinks = math.MaxUint32
 
 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
-	now := fs.clock.Now().Nanoseconds()
+	i.clock = fs.clock
 	i.refs = 1
 	i.mode = uint32(mode)
 	i.uid = uint32(creds.EffectiveKUID)
 	i.gid = uint32(creds.EffectiveKGID)
 	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
 	// Tmpfs creation sets atime, ctime, and mtime to current time.
+	now := i.clock.Now().Nanoseconds()
 	i.atime = now
 	i.ctime = now
 	i.mtime = now
@@ -270,30 +275,69 @@ func (i *inode) statTo(stat *linux.Statx) {
 }
 
 func (i *inode) setStat(stat linux.Statx) error {
-	// TODO(gvisor.dev/issues/1197): Handle stat.Size by growing/shrinking
-	// the file.
 	if stat.Mask == 0 {
 		return nil
 	}
 	i.mu.Lock()
+	var (
+		needsMtimeBump bool
+		needsCtimeBump bool
+	)
 	mask := stat.Mask
 	if mask&linux.STATX_MODE != 0 {
 		atomic.StoreUint32(&i.mode, uint32(stat.Mode))
+		needsCtimeBump = true
 	}
 	if mask&linux.STATX_UID != 0 {
 		atomic.StoreUint32(&i.uid, stat.UID)
+		needsCtimeBump = true
 	}
 	if mask&linux.STATX_GID != 0 {
 		atomic.StoreUint32(&i.gid, stat.GID)
+		needsCtimeBump = true
+	}
+	if mask&linux.STATX_SIZE != 0 {
+		switch impl := i.impl.(type) {
+		case *regularFile:
+			updated, err := impl.truncate(stat.Size)
+			if err != nil {
+				return err
+			}
+			if updated {
+				needsMtimeBump = true
+				needsCtimeBump = true
+			}
+		case *directory:
+			return syserror.EISDIR
+		case *symlink:
+			return syserror.EINVAL
+		case *namedPipe:
+			// Nothing.
+		default:
+			panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+		}
 	}
 	if mask&linux.STATX_ATIME != 0 {
 		atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped())
+		needsCtimeBump = true
+	}
+	if mask&linux.STATX_MTIME != 0 {
+		atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped())
+		needsCtimeBump = true
+		// Ignore the mtime bump, since we just set it ourselves.
+		needsMtimeBump = false
 	}
 	if mask&linux.STATX_CTIME != 0 {
 		atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped())
+		// Ignore the ctime bump, since we just set it ourselves.
+		needsCtimeBump = false
 	}
-	if mask&linux.STATX_MTIME != 0 {
-		atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped())
+	now := i.clock.Now().Nanoseconds()
+	if needsMtimeBump {
+		atomic.StoreInt64(&i.mtime, now)
+	}
+	if needsCtimeBump {
+		atomic.StoreInt64(&i.ctime, now)
 	}
 	i.mu.Unlock()
 	return nil
-- 
cgit v1.2.3


From ab48112e41427579ecf585f6280be1e2d58acf06 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 16 Jan 2020 13:58:50 -0800
Subject: Add IfChange/ThenChange reminders in fs/proc

There is a lot of code duplication for VFSv2 and this
serves as remind to keep the copies in sync.

Updates #1195

PiperOrigin-RevId: 290139234
---
 pkg/sentry/fs/proc/cgroup.go      | 4 ++++
 pkg/sentry/fs/proc/cpuinfo.go     | 4 ++++
 pkg/sentry/fs/proc/exec_args.go   | 4 ++++
 pkg/sentry/fs/proc/fds.go         | 4 ++++
 pkg/sentry/fs/proc/filesystems.go | 4 ++++
 pkg/sentry/fs/proc/fs.go          | 4 ++++
 pkg/sentry/fs/proc/inode.go       | 4 ++++
 pkg/sentry/fs/proc/loadavg.go     | 4 ++++
 pkg/sentry/fs/proc/meminfo.go     | 4 ++++
 pkg/sentry/fs/proc/mounts.go      | 4 ++++
 pkg/sentry/fs/proc/net.go         | 4 ++++
 pkg/sentry/fs/proc/proc.go        | 4 ++++
 pkg/sentry/fs/proc/stat.go        | 4 ++++
 pkg/sentry/fs/proc/sys.go         | 4 ++++
 pkg/sentry/fs/proc/sys_net.go     | 4 ++++
 pkg/sentry/fs/proc/task.go        | 4 ++++
 pkg/sentry/fs/proc/uid_gid_map.go | 4 ++++
 pkg/sentry/fs/proc/uptime.go      | 4 ++++
 pkg/sentry/fs/proc/version.go     | 4 ++++
 19 files changed, 76 insertions(+)

diff --git a/pkg/sentry/fs/proc/cgroup.go b/pkg/sentry/fs/proc/cgroup.go
index 05e31c55d..c4abe319d 100644
--- a/pkg/sentry/fs/proc/cgroup.go
+++ b/pkg/sentry/fs/proc/cgroup.go
@@ -21,6 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
+// LINT.IfChange
+
 func newCGroupInode(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string]string) *fs.Inode {
 	// From man 7 cgroups: "For each cgroup hierarchy of which the process
 	// is a member, there is one entry containing three colon-separated
@@ -39,3 +41,5 @@ func newCGroupInode(ctx context.Context, msrc *fs.MountSource, cgroupControllers
 
 	return newStaticProcInode(ctx, msrc, []byte(data))
 }
+
+// LINT.ThenChange(../../fsimpl/proc/tasks_files.go)
diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go
index 6330337eb..df0c4e3a7 100644
--- a/pkg/sentry/fs/proc/cpuinfo.go
+++ b/pkg/sentry/fs/proc/cpuinfo.go
@@ -22,6 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 )
 
+// LINT.IfChange
+
 func newCPUInfo(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 	k := kernel.KernelFromContext(ctx)
 	features := k.FeatureSet()
@@ -35,3 +37,5 @@ func newCPUInfo(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 	}
 	return newStaticProcInode(ctx, msrc, buf.Bytes())
 }
+
+// LINT.ThenChange(../../fsimpl/proc/tasks_files.go)
diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go
index 1d3a2d426..9aaeb780b 100644
--- a/pkg/sentry/fs/proc/exec_args.go
+++ b/pkg/sentry/fs/proc/exec_args.go
@@ -29,6 +29,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // execArgType enumerates the types of exec arguments that are exposed through
 // proc.
 type execArgType int
@@ -201,3 +203,5 @@ func (f *execArgFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequen
 	}
 	return int64(n), err
 }
+
+// LINT.ThenChange(../../fsimpl/proc/task.go)
diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go
index bee421d76..2fa3cfa7d 100644
--- a/pkg/sentry/fs/proc/fds.go
+++ b/pkg/sentry/fs/proc/fds.go
@@ -28,6 +28,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// LINT.IfChange
+
 // walkDescriptors finds the descriptor (file-flag pair) for the fd identified
 // by p, and calls the toInodeOperations callback with that descriptor.  This is a helper
 // method for implementing fs.InodeOperations.Lookup.
@@ -277,3 +279,5 @@ func (fdid *fdInfoDir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.
 	}
 	return fs.NewFile(ctx, dirent, flags, fops), nil
 }
+
+// LINT.ThenChange(../../fsimpl/proc/task_files.go)
diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go
index e9250c51c..7b3b974ab 100644
--- a/pkg/sentry/fs/proc/filesystems.go
+++ b/pkg/sentry/fs/proc/filesystems.go
@@ -23,6 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 )
 
+// LINT.IfChange
+
 // filesystemsData backs /proc/filesystems.
 //
 // +stateify savable
@@ -59,3 +61,5 @@ func (*filesystemsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle
 	// Return the SeqData and advance the generation counter.
 	return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*filesystemsData)(nil)}}, 1
 }
+
+// LINT.ThenChange(../../fsimpl/proc/filesystem.go)
diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go
index f14833805..761d24462 100644
--- a/pkg/sentry/fs/proc/fs.go
+++ b/pkg/sentry/fs/proc/fs.go
@@ -21,6 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
+// LINT.IfChange
+
 // filesystem is a procfs.
 //
 // +stateify savable
@@ -79,3 +81,5 @@ func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSou
 	// never want them cached.
 	return New(ctx, fs.NewNonCachingMountSource(ctx, f, flags), cgroups)
 }
+
+// LINT.ThenChange(../../fsimpl/proc/filesystem.go)
diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go
index 0c04f81fa..723f6b661 100644
--- a/pkg/sentry/fs/proc/inode.go
+++ b/pkg/sentry/fs/proc/inode.go
@@ -26,6 +26,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
+// LINT.IfChange
+
 // taskOwnedInodeOps wraps an fs.InodeOperations and overrides the UnstableAttr
 // method to return either the task or root as the owner, depending on the
 // task's dumpability.
@@ -131,3 +133,5 @@ func newProcInode(ctx context.Context, iops fs.InodeOperations, msrc *fs.MountSo
 	}
 	return fs.NewInode(ctx, iops, msrc, sattr)
 }
+
+// LINT.ThenChange(../../fsimpl/proc/tasks.go)
diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go
index 8602b7426..d7d2afcb7 100644
--- a/pkg/sentry/fs/proc/loadavg.go
+++ b/pkg/sentry/fs/proc/loadavg.go
@@ -22,6 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 )
 
+// LINT.IfChange
+
 // loadavgData backs /proc/loadavg.
 //
 // +stateify savable
@@ -53,3 +55,5 @@ func (d *loadavgData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle)
 		},
 	}, 0
 }
+
+// LINT.ThenChange(../../fsimpl/proc/tasks_files.go)
diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go
index 495f3e3ba..313c6a32b 100644
--- a/pkg/sentry/fs/proc/meminfo.go
+++ b/pkg/sentry/fs/proc/meminfo.go
@@ -25,6 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
+// LINT.IfChange
+
 // meminfoData backs /proc/meminfo.
 //
 // +stateify savable
@@ -83,3 +85,5 @@ func (d *meminfoData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle)
 	fmt.Fprintf(&buf, "Shmem:          %8d kB\n", snapshot.Tmpfs/1024)
 	return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*meminfoData)(nil)}}, 0
 }
+
+// LINT.ThenChange(../../fsimpl/proc/tasks_files.go)
diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go
index e33c4a460..5aedae799 100644
--- a/pkg/sentry/fs/proc/mounts.go
+++ b/pkg/sentry/fs/proc/mounts.go
@@ -25,6 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 )
 
+// LINT.IfChange
+
 // forEachMountSource runs f for the process root mount and  each mount that is a
 // descendant of the root.
 func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) {
@@ -195,3 +197,5 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan
 
 	return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountsFile)(nil)}}, 0
 }
+
+// LINT.ThenChange(../../fsimpl/proc/tasks_files.go)
diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 402919924..3f17e98ea 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -38,6 +38,8 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
+// LINT.IfChange
+
 // newNet creates a new proc net entry.
 func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode {
 	var contents map[string]*fs.Inode
@@ -831,3 +833,5 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
 	}
 	return data, 0
 }
+
+// LINT.ThenChange(../../fsimpl/proc/tasks_net.go)
diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go
index 56e92721e..da9341e4e 100644
--- a/pkg/sentry/fs/proc/proc.go
+++ b/pkg/sentry/fs/proc/proc.go
@@ -31,6 +31,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// LINT.IfChange
+
 // proc is a root proc node.
 //
 // +stateify savable
@@ -249,3 +251,5 @@ func (rpf *rootProcFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dent
 	}
 	return offset, nil
 }
+
+// LINT.ThenChange(../../fsimpl/proc/tasks.go)
diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go
index b641effbb..bc5b2bc7b 100644
--- a/pkg/sentry/fs/proc/stat.go
+++ b/pkg/sentry/fs/proc/stat.go
@@ -24,6 +24,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 )
 
+// LINT.IfChange
+
 // statData backs /proc/stat.
 //
 // +stateify savable
@@ -140,3 +142,5 @@ func (s *statData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]
 		},
 	}, 0
 }
+
+// LINT.ThenChange(../../fsimpl/proc/task_files.go)
diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go
index cd37776c8..1062bd852 100644
--- a/pkg/sentry/fs/proc/sys.go
+++ b/pkg/sentry/fs/proc/sys.go
@@ -31,6 +31,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // mmapMinAddrData backs /proc/sys/vm/mmap_min_addr.
 //
 // +stateify savable
@@ -160,3 +162,5 @@ func (hf *hostnameFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequ
 }
 
 var _ fs.FileOperations = (*hostnameFile)(nil)
+
+// LINT.ThenChange(../../fsimpl/proc/tasks_sys.go)
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index a37e1fa06..b9e8ef35f 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -30,6 +30,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 type tcpMemDir int
 
 const (
@@ -364,3 +366,5 @@ func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode
 	d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
 	return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
 }
+
+// LINT.ThenChange(../../fsimpl/proc/tasks_sys.go)
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 9bf4b4527..7358d6ef9 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -37,6 +37,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // getTaskMM returns t's MemoryManager. If getTaskMM succeeds, the MemoryManager's
 // users count is incremented, and must be decremented by the caller when it is
 // no longer in use.
@@ -800,3 +802,5 @@ func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc
 	n, err := dst.CopyOut(ctx, buf[offset:])
 	return int64(n), err
 }
+
+// LINT.ThenChange(../../fsimpl/proc/task.go|../../fsimpl/proc/task_files.go)
diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go
index eea37d15c..3eacc9265 100644
--- a/pkg/sentry/fs/proc/uid_gid_map.go
+++ b/pkg/sentry/fs/proc/uid_gid_map.go
@@ -30,6 +30,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // idMapInodeOperations implements fs.InodeOperations for
 // /proc/[pid]/{uid,gid}_map.
 //
@@ -177,3 +179,5 @@ func (imfo *idMapFileOperations) Write(ctx context.Context, file *fs.File, src u
 	// count, even if fewer bytes were used.
 	return int64(srclen), nil
 }
+
+// LINT.ThenChange(../../fsimpl/proc/task_files.go)
diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go
index 4e903917a..adfe58adb 100644
--- a/pkg/sentry/fs/proc/uptime.go
+++ b/pkg/sentry/fs/proc/uptime.go
@@ -28,6 +28,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // uptime is a file containing the system uptime.
 //
 // +stateify savable
@@ -85,3 +87,5 @@ func (f *uptimeFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc
 	n, err := dst.CopyOut(ctx, s[offset:])
 	return int64(n), err
 }
+
+// LINT.ThenChange(../../fsimpl/proc/tasks_files.go)
diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go
index a6d2c3cd3..27fd5b1cb 100644
--- a/pkg/sentry/fs/proc/version.go
+++ b/pkg/sentry/fs/proc/version.go
@@ -22,6 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 )
 
+// LINT.IfChange
+
 // versionData backs /proc/version.
 //
 // +stateify savable
@@ -76,3 +78,5 @@ func (v *versionData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle)
 		},
 	}, 0
 }
+
+// LINT.ThenChange(../../fsimpl/proc/task_files.go)
-- 
cgit v1.2.3


From 1e7f0c822b3a7c643d532d40a14ab79eb1df85c6 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 16 Jan 2020 14:25:56 -0800
Subject: Bump p9 version, adding corresponding checks to client_file.go.

PiperOrigin-RevId: 290145451
---
 pkg/p9/client_file.go | 6 ++++++
 pkg/p9/version.go     | 8 +++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go
index 04b584383..0254e4ccc 100644
--- a/pkg/p9/client_file.go
+++ b/pkg/p9/client_file.go
@@ -170,6 +170,9 @@ func (c *clientFile) GetXattr(name string, size uint64) (string, error) {
 	if atomic.LoadUint32(&c.closed) != 0 {
 		return "", syscall.EBADF
 	}
+	if !versionSupportsGetSetXattr(c.client.version) {
+		return "", syscall.EOPNOTSUPP
+	}
 
 	rgetxattr := Rgetxattr{}
 	if err := c.client.sendRecv(&Tgetxattr{FID: c.fid, Name: name, Size: size}, &rgetxattr); err != nil {
@@ -184,6 +187,9 @@ func (c *clientFile) SetXattr(name, value string, flags uint32) error {
 	if atomic.LoadUint32(&c.closed) != 0 {
 		return syscall.EBADF
 	}
+	if !versionSupportsGetSetXattr(c.client.version) {
+		return syscall.EOPNOTSUPP
+	}
 
 	return c.client.sendRecv(&Tsetxattr{FID: c.fid, Name: name, Value: value, Flags: flags}, &Rsetxattr{})
 }
diff --git a/pkg/p9/version.go b/pkg/p9/version.go
index 36a694c58..34a15eb55 100644
--- a/pkg/p9/version.go
+++ b/pkg/p9/version.go
@@ -26,7 +26,7 @@ const (
 	//
 	// Clients are expected to start requesting this version number and
 	// to continuously decrement it until a Tversion request succeeds.
-	highestSupportedVersion uint32 = 9
+	highestSupportedVersion uint32 = 10
 
 	// lowestSupportedVersion is the lowest supported version X in a
 	// version string of the format 9P2000.L.Google.X.
@@ -161,3 +161,9 @@ func versionSupportsFlipcall(v uint32) bool {
 func VersionSupportsOpenTruncateFlag(v uint32) bool {
 	return v >= 9
 }
+
+// versionSupportsGetSetXattr returns true if version v supports
+// the Tgetxattr and Tsetxattr messages.
+func versionSupportsGetSetXattr(v uint32) bool {
+	return v >= 10
+}
-- 
cgit v1.2.3


From 7a45ae7e67438697296fc12345202e3c76304096 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 16 Jan 2020 18:13:27 -0800
Subject: Implement setxattr for overlays.

PiperOrigin-RevId: 290186303
---
 pkg/sentry/fs/inode.go                 |  4 ++--
 pkg/sentry/fs/inode_overlay.go         | 13 ++++++++++---
 pkg/sentry/syscalls/linux/sys_xattr.go |  2 +-
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index ee9d301ef..e4cf5a570 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -270,9 +270,9 @@ func (i *Inode) GetXattr(ctx context.Context, name string, size uint64) (string,
 }
 
 // SetXattr calls i.InodeOperations.SetXattr with i as the Inode.
-func (i *Inode) SetXattr(ctx context.Context, name, value string, flags uint32) error {
+func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, flags uint32) error {
 	if i.overlay != nil {
-		return overlaySetxattr(ctx, i.overlay, name, value, flags)
+		return overlaySetxattr(ctx, i.overlay, d, name, value, flags)
 	}
 	return i.InodeOperations.SetXattr(ctx, i, name, value, flags)
 }
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index b90da20d0..c477de837 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -552,9 +552,16 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin
 	return s, err
 }
 
-// TODO(b/146028302): Support setxattr for overlayfs.
-func overlaySetxattr(ctx context.Context, o *overlayEntry, name, value string, flags uint32) error {
-	return syserror.EOPNOTSUPP
+func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error {
+	// Don't allow changes to overlay xattrs through a setxattr syscall.
+	if strings.HasPrefix(XattrOverlayPrefix, name) {
+		return syserror.EPERM
+	}
+
+	if err := copyUp(ctx, d); err != nil {
+		return err
+	}
+	return o.upper.SetXattr(ctx, d, name, value, flags)
 }
 
 func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}, error) {
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 816352218..23d20da6f 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -142,7 +142,7 @@ func setXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr us
 		return syserror.EOPNOTSUPP
 	}
 
-	return d.Inode.SetXattr(t, name, value, flags)
+	return d.Inode.SetXattr(t, d, name, value, flags)
 }
 
 func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
-- 
cgit v1.2.3


From 19b4653147c8ec1cd2cf6e2a8f9bfc7865a5f850 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 16 Jan 2020 20:19:40 -0800
Subject: Remove unused rpcinet.

PiperOrigin-RevId: 290198756
---
 pkg/sentry/fs/proc/BUILD                       |   2 -
 pkg/sentry/fs/proc/proc.go                     |   9 +-
 pkg/sentry/fs/proc/rpcinet_proc.go             | 217 ------
 pkg/sentry/fs/proc/sys.go                      |   9 +-
 pkg/sentry/socket/rpcinet/BUILD                |  69 --
 pkg/sentry/socket/rpcinet/conn/BUILD           |  18 -
 pkg/sentry/socket/rpcinet/conn/conn.go         | 187 -----
 pkg/sentry/socket/rpcinet/device.go            |  19 -
 pkg/sentry/socket/rpcinet/notifier/BUILD       |  17 -
 pkg/sentry/socket/rpcinet/notifier/notifier.go | 231 -------
 pkg/sentry/socket/rpcinet/rpcinet.go           |  16 -
 pkg/sentry/socket/rpcinet/socket.go            | 909 -------------------------
 pkg/sentry/socket/rpcinet/stack.go             | 177 -----
 pkg/sentry/socket/rpcinet/stack_unsafe.go      | 193 ------
 pkg/sentry/socket/rpcinet/syscall_rpc.proto    | 352 ----------
 15 files changed, 2 insertions(+), 2423 deletions(-)
 delete mode 100644 pkg/sentry/fs/proc/rpcinet_proc.go
 delete mode 100644 pkg/sentry/socket/rpcinet/BUILD
 delete mode 100644 pkg/sentry/socket/rpcinet/conn/BUILD
 delete mode 100644 pkg/sentry/socket/rpcinet/conn/conn.go
 delete mode 100644 pkg/sentry/socket/rpcinet/device.go
 delete mode 100644 pkg/sentry/socket/rpcinet/notifier/BUILD
 delete mode 100644 pkg/sentry/socket/rpcinet/notifier/notifier.go
 delete mode 100644 pkg/sentry/socket/rpcinet/rpcinet.go
 delete mode 100644 pkg/sentry/socket/rpcinet/socket.go
 delete mode 100644 pkg/sentry/socket/rpcinet/stack.go
 delete mode 100644 pkg/sentry/socket/rpcinet/stack_unsafe.go
 delete mode 100644 pkg/sentry/socket/rpcinet/syscall_rpc.proto

diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index 94d46ab1b..cb37c6c6b 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -18,7 +18,6 @@ go_library(
         "mounts.go",
         "net.go",
         "proc.go",
-        "rpcinet_proc.go",
         "stat.go",
         "sys.go",
         "sys_net.go",
@@ -46,7 +45,6 @@ go_library(
         "//pkg/sentry/limits",
         "//pkg/sentry/mm",
         "//pkg/sentry/socket",
-        "//pkg/sentry/socket/rpcinet",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go
index da9341e4e..29867dc3a 100644
--- a/pkg/sentry/fs/proc/proc.go
+++ b/pkg/sentry/fs/proc/proc.go
@@ -27,7 +27,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -87,15 +86,9 @@ func New(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string
 	}
 
 	// Add more contents that need proc to be initialized.
+	p.AddChild(ctx, "net", p.newNetDir(ctx, k, msrc))
 	p.AddChild(ctx, "sys", p.newSysDir(ctx, msrc))
 
-	// If we're using rpcinet we will let it manage /proc/net.
-	if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok {
-		p.AddChild(ctx, "net", newRPCInetProcNet(ctx, msrc))
-	} else {
-		p.AddChild(ctx, "net", p.newNetDir(ctx, k, msrc))
-	}
-
 	return newProcInode(ctx, p, msrc, fs.SpecialDirectory, nil), nil
 }
 
diff --git a/pkg/sentry/fs/proc/rpcinet_proc.go b/pkg/sentry/fs/proc/rpcinet_proc.go
deleted file mode 100644
index 01ac97530..000000000
--- a/pkg/sentry/fs/proc/rpcinet_proc.go
+++ /dev/null
@@ -1,217 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"io"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/waiter"
-)
-
-// rpcInetInode implements fs.InodeOperations.
-type rpcInetInode struct {
-	fsutil.SimpleFileInode
-
-	// filepath is the full path of this rpcInetInode.
-	filepath string
-
-	k *kernel.Kernel
-}
-
-func newRPCInetInode(ctx context.Context, msrc *fs.MountSource, filepath string, mode linux.FileMode) *fs.Inode {
-	f := &rpcInetInode{
-		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(mode), linux.PROC_SUPER_MAGIC),
-		filepath:        filepath,
-		k:               kernel.KernelFromContext(ctx),
-	}
-	return newProcInode(ctx, f, msrc, fs.SpecialFile, nil)
-}
-
-// GetFile implements fs.InodeOperations.GetFile.
-func (i *rpcInetInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
-	flags.Pread = true
-	flags.Pwrite = true
-	fops := &rpcInetFile{
-		inode: i,
-	}
-	return fs.NewFile(ctx, dirent, flags, fops), nil
-}
-
-// rpcInetFile implements fs.FileOperations as RPCs.
-type rpcInetFile struct {
-	fsutil.FileGenericSeek          `state:"nosave"`
-	fsutil.FileNoIoctl              `state:"nosave"`
-	fsutil.FileNoMMap               `state:"nosave"`
-	fsutil.FileNoSplice             `state:"nosave"`
-	fsutil.FileNoopFlush            `state:"nosave"`
-	fsutil.FileNoopFsync            `state:"nosave"`
-	fsutil.FileNoopRelease          `state:"nosave"`
-	fsutil.FileNotDirReaddir        `state:"nosave"`
-	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
-	waiter.AlwaysReady              `state:"nosave"`
-
-	inode *rpcInetInode
-}
-
-// Read implements fs.FileOperations.Read.
-//
-// This method can panic if an rpcInetInode was created without an rpcinet
-// stack.
-func (f *rpcInetFile) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
-	if offset < 0 {
-		return 0, syserror.EINVAL
-	}
-	s, ok := f.inode.k.NetworkStack().(*rpcinet.Stack)
-	if !ok {
-		panic("Network stack is not a rpcinet.")
-	}
-
-	contents, se := s.RPCReadFile(f.inode.filepath)
-	if se != nil || offset >= int64(len(contents)) {
-		return 0, io.EOF
-	}
-
-	n, err := dst.CopyOut(ctx, contents[offset:])
-	return int64(n), err
-}
-
-// Write implements fs.FileOperations.Write.
-//
-// This method can panic if an rpcInetInode was created without an rpcInet
-// stack.
-func (f *rpcInetFile) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
-	s, ok := f.inode.k.NetworkStack().(*rpcinet.Stack)
-	if !ok {
-		panic("Network stack is not a rpcinet.")
-	}
-
-	if src.NumBytes() == 0 {
-		return 0, nil
-	}
-
-	b := make([]byte, src.NumBytes(), src.NumBytes())
-	n, err := src.CopyIn(ctx, b)
-	if err != nil {
-		return int64(n), err
-	}
-
-	written, se := s.RPCWriteFile(f.inode.filepath, b)
-	return int64(written), se.ToError()
-}
-
-// newRPCInetProcNet will build an inode for /proc/net.
-func newRPCInetProcNet(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
-	contents := map[string]*fs.Inode{
-		"arp":        newRPCInetInode(ctx, msrc, "/proc/net/arp", 0444),
-		"dev":        newRPCInetInode(ctx, msrc, "/proc/net/dev", 0444),
-		"if_inet6":   newRPCInetInode(ctx, msrc, "/proc/net/if_inet6", 0444),
-		"ipv6_route": newRPCInetInode(ctx, msrc, "/proc/net/ipv6_route", 0444),
-		"netlink":    newRPCInetInode(ctx, msrc, "/proc/net/netlink", 0444),
-		"netstat":    newRPCInetInode(ctx, msrc, "/proc/net/netstat", 0444),
-		"packet":     newRPCInetInode(ctx, msrc, "/proc/net/packet", 0444),
-		"protocols":  newRPCInetInode(ctx, msrc, "/proc/net/protocols", 0444),
-		"psched":     newRPCInetInode(ctx, msrc, "/proc/net/psched", 0444),
-		"ptype":      newRPCInetInode(ctx, msrc, "/proc/net/ptype", 0444),
-		"route":      newRPCInetInode(ctx, msrc, "/proc/net/route", 0444),
-		"tcp":        newRPCInetInode(ctx, msrc, "/proc/net/tcp", 0444),
-		"tcp6":       newRPCInetInode(ctx, msrc, "/proc/net/tcp6", 0444),
-		"udp":        newRPCInetInode(ctx, msrc, "/proc/net/udp", 0444),
-		"udp6":       newRPCInetInode(ctx, msrc, "/proc/net/udp6", 0444),
-	}
-
-	d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
-	return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
-}
-
-// newRPCInetProcSysNet will build an inode for /proc/sys/net.
-func newRPCInetProcSysNet(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
-	contents := map[string]*fs.Inode{
-		"ipv4": newRPCInetSysNetIPv4Dir(ctx, msrc),
-		"core": newRPCInetSysNetCore(ctx, msrc),
-	}
-
-	d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
-	return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
-}
-
-// newRPCInetSysNetCore builds the /proc/sys/net/core directory.
-func newRPCInetSysNetCore(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
-	contents := map[string]*fs.Inode{
-		"default_qdisc": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/default_qdisc", 0444),
-		"message_burst": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/message_burst", 0444),
-		"message_cost":  newRPCInetInode(ctx, msrc, "/proc/sys/net/core/message_cost", 0444),
-		"optmem_max":    newRPCInetInode(ctx, msrc, "/proc/sys/net/core/optmem_max", 0444),
-		"rmem_default":  newRPCInetInode(ctx, msrc, "/proc/sys/net/core/rmem_default", 0444),
-		"rmem_max":      newRPCInetInode(ctx, msrc, "/proc/sys/net/core/rmem_max", 0444),
-		"somaxconn":     newRPCInetInode(ctx, msrc, "/proc/sys/net/core/somaxconn", 0444),
-		"wmem_default":  newRPCInetInode(ctx, msrc, "/proc/sys/net/core/wmem_default", 0444),
-		"wmem_max":      newRPCInetInode(ctx, msrc, "/proc/sys/net/core/wmem_max", 0444),
-	}
-
-	d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
-	return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
-}
-
-// newRPCInetSysNetIPv4Dir builds the /proc/sys/net/ipv4 directory.
-func newRPCInetSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
-	contents := map[string]*fs.Inode{
-		"ip_local_port_range":              newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_local_port_range", 0444),
-		"ip_local_reserved_ports":          newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_local_reserved_ports", 0444),
-		"ipfrag_time":                      newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ipfrag_time", 0444),
-		"ip_nonlocal_bind":                 newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_nonlocal_bind", 0444),
-		"ip_no_pmtu_disc":                  newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_no_pmtu_disc", 0444),
-		"tcp_allowed_congestion_control":   newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_allowed_congestion_control", 0444),
-		"tcp_available_congestion_control": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_available_congestion_control", 0444),
-		"tcp_base_mss":                     newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_base_mss", 0444),
-		"tcp_congestion_control":           newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_congestion_control", 0644),
-		"tcp_dsack":                        newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_dsack", 0644),
-		"tcp_early_retrans":                newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_early_retrans", 0644),
-		"tcp_fack":                         newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fack", 0644),
-		"tcp_fastopen":                     newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen", 0644),
-		"tcp_fastopen_key":                 newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen_key", 0444),
-		"tcp_fin_timeout":                  newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fin_timeout", 0644),
-		"tcp_invalid_ratelimit":            newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_invalid_ratelimit", 0444),
-		"tcp_keepalive_intvl":              newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_intvl", 0644),
-		"tcp_keepalive_probes":             newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_probes", 0644),
-		"tcp_keepalive_time":               newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_time", 0644),
-		"tcp_mem":                          newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_mem", 0444),
-		"tcp_mtu_probing":                  newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_mtu_probing", 0644),
-		"tcp_no_metrics_save":              newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_no_metrics_save", 0444),
-		"tcp_probe_interval":               newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_interval", 0444),
-		"tcp_probe_threshold":              newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_threshold", 0444),
-		"tcp_retries1":                     newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries1", 0644),
-		"tcp_retries2":                     newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries2", 0644),
-		"tcp_rfc1337":                      newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_rfc1337", 0444),
-		"tcp_rmem":                         newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_rmem", 0444),
-		"tcp_sack":                         newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_sack", 0644),
-		"tcp_slow_start_after_idle":        newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_slow_start_after_idle", 0644),
-		"tcp_synack_retries":               newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_synack_retries", 0644),
-		"tcp_syn_retries":                  newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_syn_retries", 0644),
-		"tcp_timestamps":                   newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_timestamps", 0644),
-		"tcp_wmem":                         newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_wmem", 0444),
-	}
-
-	d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
-	return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
-}
diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go
index 1062bd852..2bdcf5f70 100644
--- a/pkg/sentry/fs/proc/sys.go
+++ b/pkg/sentry/fs/proc/sys.go
@@ -26,7 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -106,16 +105,10 @@ func (p *proc) newVMDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 func (p *proc) newSysDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 	children := map[string]*fs.Inode{
 		"kernel": p.newKernelDir(ctx, msrc),
+		"net":    p.newSysNetDir(ctx, msrc),
 		"vm":     p.newVMDir(ctx, msrc),
 	}
 
-	// If we're using rpcinet we will let it manage /proc/sys/net.
-	if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok {
-		children["net"] = newRPCInetProcSysNet(ctx, msrc)
-	} else {
-		children["net"] = p.newSysNetDir(ctx, msrc)
-	}
-
 	d := ramfs.NewDir(ctx, children, fs.RootOwner, fs.FilePermsFromMode(0555))
 	return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
 }
diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD
deleted file mode 100644
index 4668b87d1..000000000
--- a/pkg/sentry/socket/rpcinet/BUILD
+++ /dev/null
@@ -1,69 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@rules_cc//cc:defs.bzl", "cc_proto_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "rpcinet",
-    srcs = [
-        "device.go",
-        "rpcinet.go",
-        "socket.go",
-        "stack.go",
-        "stack_unsafe.go",
-    ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet",
-    visibility = ["//pkg/sentry:internal"],
-    deps = [
-        ":syscall_rpc_go_proto",
-        "//pkg/abi/linux",
-        "//pkg/binary",
-        "//pkg/sentry/arch",
-        "//pkg/sentry/context",
-        "//pkg/sentry/device",
-        "//pkg/sentry/fs",
-        "//pkg/sentry/fs/fsutil",
-        "//pkg/sentry/inet",
-        "//pkg/sentry/kernel",
-        "//pkg/sentry/kernel/time",
-        "//pkg/sentry/socket",
-        "//pkg/sentry/socket/hostinet",
-        "//pkg/sentry/socket/rpcinet/conn",
-        "//pkg/sentry/socket/rpcinet/notifier",
-        "//pkg/sentry/unimpl",
-        "//pkg/sentry/usermem",
-        "//pkg/syserr",
-        "//pkg/syserror",
-        "//pkg/tcpip",
-        "//pkg/tcpip/buffer",
-        "//pkg/tcpip/stack",
-        "//pkg/unet",
-        "//pkg/waiter",
-    ],
-)
-
-proto_library(
-    name = "syscall_rpc_proto",
-    srcs = ["syscall_rpc.proto"],
-    visibility = [
-        "//visibility:public",
-    ],
-)
-
-cc_proto_library(
-    name = "syscall_rpc_cc_proto",
-    visibility = [
-        "//visibility:public",
-    ],
-    deps = [":syscall_rpc_proto"],
-)
-
-go_proto_library(
-    name = "syscall_rpc_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto",
-    proto = ":syscall_rpc_proto",
-    visibility = [
-        "//visibility:public",
-    ],
-)
diff --git a/pkg/sentry/socket/rpcinet/conn/BUILD b/pkg/sentry/socket/rpcinet/conn/BUILD
deleted file mode 100644
index b2677c659..000000000
--- a/pkg/sentry/socket/rpcinet/conn/BUILD
+++ /dev/null
@@ -1,18 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "conn",
-    srcs = ["conn.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn",
-    visibility = ["//pkg/sentry:internal"],
-    deps = [
-        "//pkg/binary",
-        "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto",
-        "//pkg/sync",
-        "//pkg/syserr",
-        "//pkg/unet",
-        "@com_github_golang_protobuf//proto:go_default_library",
-    ],
-)
diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go
deleted file mode 100644
index 02f39c767..000000000
--- a/pkg/sentry/socket/rpcinet/conn/conn.go
+++ /dev/null
@@ -1,187 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package conn is an RPC connection to a syscall RPC server.
-package conn
-
-import (
-	"fmt"
-	"sync/atomic"
-	"syscall"
-
-	"github.com/golang/protobuf/proto"
-	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/pkg/syserr"
-	"gvisor.dev/gvisor/pkg/unet"
-
-	pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto"
-)
-
-type request struct {
-	response     []byte
-	ready        chan struct{}
-	ignoreResult bool
-}
-
-// RPCConnection represents a single RPC connection to a syscall gofer.
-type RPCConnection struct {
-	// reqID is the ID of the last request and must be accessed atomically.
-	reqID uint64
-
-	sendMu sync.Mutex
-	socket *unet.Socket
-
-	reqMu    sync.Mutex
-	requests map[uint64]request
-}
-
-// NewRPCConnection initializes a RPC connection to a socket gofer.
-func NewRPCConnection(s *unet.Socket) *RPCConnection {
-	conn := &RPCConnection{socket: s, requests: map[uint64]request{}}
-	go func() { // S/R-FIXME(b/77962828)
-		var nums [16]byte
-		for {
-			for n := 0; n < len(nums); {
-				nn, err := conn.socket.Read(nums[n:])
-				if err != nil {
-					panic(fmt.Sprint("error reading length from socket rpc gofer: ", err))
-				}
-				n += nn
-			}
-
-			b := make([]byte, binary.LittleEndian.Uint64(nums[:8]))
-			id := binary.LittleEndian.Uint64(nums[8:])
-
-			for n := 0; n < len(b); {
-				nn, err := conn.socket.Read(b[n:])
-				if err != nil {
-					panic(fmt.Sprint("error reading request from socket rpc gofer: ", err))
-				}
-				n += nn
-			}
-
-			conn.reqMu.Lock()
-			r := conn.requests[id]
-			if r.ignoreResult {
-				delete(conn.requests, id)
-			} else {
-				r.response = b
-				conn.requests[id] = r
-			}
-			conn.reqMu.Unlock()
-			close(r.ready)
-		}
-	}()
-	return conn
-}
-
-// NewRequest makes a request to the RPC gofer and returns the request ID and a
-// channel which will be closed once the request completes.
-func (c *RPCConnection) NewRequest(req pb.SyscallRequest, ignoreResult bool) (uint64, chan struct{}) {
-	b, err := proto.Marshal(&req)
-	if err != nil {
-		panic(fmt.Sprint("invalid proto: ", err))
-	}
-
-	id := atomic.AddUint64(&c.reqID, 1)
-	ch := make(chan struct{})
-
-	c.reqMu.Lock()
-	c.requests[id] = request{ready: ch, ignoreResult: ignoreResult}
-	c.reqMu.Unlock()
-
-	c.sendMu.Lock()
-	defer c.sendMu.Unlock()
-
-	var nums [16]byte
-	binary.LittleEndian.PutUint64(nums[:8], uint64(len(b)))
-	binary.LittleEndian.PutUint64(nums[8:], id)
-	for n := 0; n < len(nums); {
-		nn, err := c.socket.Write(nums[n:])
-		if err != nil {
-			panic(fmt.Sprint("error writing length and ID to socket gofer: ", err))
-		}
-		n += nn
-	}
-
-	for n := 0; n < len(b); {
-		nn, err := c.socket.Write(b[n:])
-		if err != nil {
-			panic(fmt.Sprint("error writing request to socket gofer: ", err))
-		}
-		n += nn
-	}
-
-	return id, ch
-}
-
-// RPCReadFile will execute the ReadFile helper RPC method which avoids the
-// common pattern of open(2), read(2), close(2) by doing all three operations
-// as a single RPC. It will read the entire file or return EFBIG if the file
-// was too large.
-func (c *RPCConnection) RPCReadFile(path string) ([]byte, *syserr.Error) {
-	req := &pb.SyscallRequest_ReadFile{&pb.ReadFileRequest{
-		Path: path,
-	}}
-
-	id, ch := c.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */)
-	<-ch
-
-	res := c.Request(id).Result.(*pb.SyscallResponse_ReadFile).ReadFile.Result
-	if e, ok := res.(*pb.ReadFileResponse_ErrorNumber); ok {
-		return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber))
-	}
-
-	return res.(*pb.ReadFileResponse_Data).Data, nil
-}
-
-// RPCWriteFile will execute the WriteFile helper RPC method which avoids the
-// common pattern of open(2), write(2), write(2), close(2) by doing all
-// operations as a single RPC.
-func (c *RPCConnection) RPCWriteFile(path string, data []byte) (int64, *syserr.Error) {
-	req := &pb.SyscallRequest_WriteFile{&pb.WriteFileRequest{
-		Path:    path,
-		Content: data,
-	}}
-
-	id, ch := c.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */)
-	<-ch
-
-	res := c.Request(id).Result.(*pb.SyscallResponse_WriteFile).WriteFile
-	if e := res.ErrorNumber; e != 0 {
-		return int64(res.Written), syserr.FromHost(syscall.Errno(e))
-	}
-
-	return int64(res.Written), nil
-}
-
-// Request retrieves the request corresponding to the given request ID.
-//
-// The channel returned by NewRequest must have been closed before Request can
-// be called. This will happen automatically, do not manually close the
-// channel.
-func (c *RPCConnection) Request(id uint64) pb.SyscallResponse {
-	c.reqMu.Lock()
-	r := c.requests[id]
-	delete(c.requests, id)
-	c.reqMu.Unlock()
-
-	var resp pb.SyscallResponse
-	if err := proto.Unmarshal(r.response, &resp); err != nil {
-		panic(fmt.Sprint("invalid proto: ", err))
-	}
-
-	return resp
-}
diff --git a/pkg/sentry/socket/rpcinet/device.go b/pkg/sentry/socket/rpcinet/device.go
deleted file mode 100644
index 8cfd5f6e5..000000000
--- a/pkg/sentry/socket/rpcinet/device.go
+++ /dev/null
@@ -1,19 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package rpcinet
-
-import "gvisor.dev/gvisor/pkg/sentry/device"
-
-var socketDevice = device.NewAnonDevice()
diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD
deleted file mode 100644
index a5954f22b..000000000
--- a/pkg/sentry/socket/rpcinet/notifier/BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "notifier",
-    srcs = ["notifier.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/notifier",
-    visibility = ["//:sandbox"],
-    deps = [
-        "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto",
-        "//pkg/sentry/socket/rpcinet/conn",
-        "//pkg/sync",
-        "//pkg/waiter",
-        "@org_golang_x_sys//unix:go_default_library",
-    ],
-)
diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go
deleted file mode 100644
index 82b75d6dd..000000000
--- a/pkg/sentry/socket/rpcinet/notifier/notifier.go
+++ /dev/null
@@ -1,231 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package notifier implements an FD notifier implementation over RPC.
-package notifier
-
-import (
-	"fmt"
-	"syscall"
-
-	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn"
-	pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto"
-	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/pkg/waiter"
-)
-
-type fdInfo struct {
-	queue   *waiter.Queue
-	waiting bool
-}
-
-// Notifier holds all the state necessary to issue notifications when IO events
-// occur in the observed FDs.
-type Notifier struct {
-	// rpcConn is the connection that is used for sending RPCs.
-	rpcConn *conn.RPCConnection
-
-	// epFD is the epoll file descriptor used to register for io
-	// notifications.
-	epFD uint32
-
-	// mu protects fdMap.
-	mu sync.Mutex
-
-	// fdMap maps file descriptors to their notification queues and waiting
-	// status.
-	fdMap map[uint32]*fdInfo
-}
-
-// NewRPCNotifier creates a new notifier object.
-func NewRPCNotifier(cn *conn.RPCConnection) (*Notifier, error) {
-	id, c := cn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCreate1{&pb.EpollCreate1Request{}}}, false /* ignoreResult */)
-	<-c
-
-	res := cn.Request(id).Result.(*pb.SyscallResponse_EpollCreate1).EpollCreate1.Result
-	if e, ok := res.(*pb.EpollCreate1Response_ErrorNumber); ok {
-		return nil, syscall.Errno(e.ErrorNumber)
-	}
-
-	w := &Notifier{
-		rpcConn: cn,
-		epFD:    res.(*pb.EpollCreate1Response_Fd).Fd,
-		fdMap:   make(map[uint32]*fdInfo),
-	}
-
-	go w.waitAndNotify() // S/R-FIXME(b/77962828)
-
-	return w, nil
-}
-
-// waitFD waits on mask for fd. The fdMap mutex must be hold.
-func (n *Notifier) waitFD(fd uint32, fi *fdInfo, mask waiter.EventMask) error {
-	if !fi.waiting && mask == 0 {
-		return nil
-	}
-
-	e := pb.EpollEvent{
-		Events: mask.ToLinux() | unix.EPOLLET,
-		Fd:     fd,
-	}
-
-	switch {
-	case !fi.waiting && mask != 0:
-		id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCtl{&pb.EpollCtlRequest{Epfd: n.epFD, Op: syscall.EPOLL_CTL_ADD, Fd: fd, Event: &e}}}, false /* ignoreResult */)
-		<-c
-
-		e := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollCtl).EpollCtl.ErrorNumber
-		if e != 0 {
-			return syscall.Errno(e)
-		}
-
-		fi.waiting = true
-	case fi.waiting && mask == 0:
-		id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCtl{&pb.EpollCtlRequest{Epfd: n.epFD, Op: syscall.EPOLL_CTL_DEL, Fd: fd}}}, false /* ignoreResult */)
-		<-c
-		n.rpcConn.Request(id)
-
-		fi.waiting = false
-	case fi.waiting && mask != 0:
-		id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCtl{&pb.EpollCtlRequest{Epfd: n.epFD, Op: syscall.EPOLL_CTL_MOD, Fd: fd, Event: &e}}}, false /* ignoreResult */)
-		<-c
-
-		e := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollCtl).EpollCtl.ErrorNumber
-		if e != 0 {
-			return syscall.Errno(e)
-		}
-	}
-
-	return nil
-}
-
-// addFD adds an FD to the list of FDs observed by n.
-func (n *Notifier) addFD(fd uint32, queue *waiter.Queue) {
-	n.mu.Lock()
-	defer n.mu.Unlock()
-
-	// Panic if we're already notifying on this FD.
-	if _, ok := n.fdMap[fd]; ok {
-		panic(fmt.Sprintf("File descriptor %d added twice", fd))
-	}
-
-	// We have nothing to wait for at the moment. Just add it to the map.
-	n.fdMap[fd] = &fdInfo{queue: queue}
-}
-
-// updateFD updates the set of events the FD needs to be notified on.
-func (n *Notifier) updateFD(fd uint32) error {
-	n.mu.Lock()
-	defer n.mu.Unlock()
-
-	if fi, ok := n.fdMap[fd]; ok {
-		return n.waitFD(fd, fi, fi.queue.Events())
-	}
-
-	return nil
-}
-
-// RemoveFD removes an FD from the list of FDs observed by n.
-func (n *Notifier) removeFD(fd uint32) {
-	n.mu.Lock()
-	defer n.mu.Unlock()
-
-	// Remove from map, then from epoll object.
-	n.waitFD(fd, n.fdMap[fd], 0)
-	delete(n.fdMap, fd)
-}
-
-// hasFD returns true if the FD is in the list of observed FDs.
-func (n *Notifier) hasFD(fd uint32) bool {
-	n.mu.Lock()
-	defer n.mu.Unlock()
-
-	_, ok := n.fdMap[fd]
-	return ok
-}
-
-// waitAndNotify loops waiting for io event notifications from the epoll
-// object. Once notifications arrive, they are dispatched to the
-// registered queue.
-func (n *Notifier) waitAndNotify() error {
-	for {
-		id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollWait{&pb.EpollWaitRequest{Fd: n.epFD, NumEvents: 100, Msec: -1}}}, false /* ignoreResult */)
-		<-c
-
-		res := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollWait).EpollWait.Result
-		if e, ok := res.(*pb.EpollWaitResponse_ErrorNumber); ok {
-			err := syscall.Errno(e.ErrorNumber)
-			// NOTE(magi): I don't think epoll_wait can return EAGAIN but I'm being
-			// conseratively careful here since exiting the notification thread
-			// would be really bad.
-			if err == syscall.EINTR || err == syscall.EAGAIN {
-				continue
-			}
-			return err
-		}
-
-		n.mu.Lock()
-		for _, e := range res.(*pb.EpollWaitResponse_Events).Events.Events {
-			if fi, ok := n.fdMap[e.Fd]; ok {
-				fi.queue.Notify(waiter.EventMaskFromLinux(e.Events))
-			}
-		}
-		n.mu.Unlock()
-	}
-}
-
-// AddFD adds an FD to the list of observed FDs.
-func (n *Notifier) AddFD(fd uint32, queue *waiter.Queue) error {
-	n.addFD(fd, queue)
-	return nil
-}
-
-// UpdateFD updates the set of events the FD needs to be notified on.
-func (n *Notifier) UpdateFD(fd uint32) error {
-	return n.updateFD(fd)
-}
-
-// RemoveFD removes an FD from the list of observed FDs.
-func (n *Notifier) RemoveFD(fd uint32) {
-	n.removeFD(fd)
-}
-
-// HasFD returns true if the FD is in the list of observed FDs.
-//
-// This should only be used by tests to assert that FDs are correctly
-// registered.
-func (n *Notifier) HasFD(fd uint32) bool {
-	return n.hasFD(fd)
-}
-
-// NonBlockingPoll polls the given fd in non-blocking fashion. It is used just
-// to query the FD's current state; this method will block on the RPC response
-// although the syscall is non-blocking.
-func (n *Notifier) NonBlockingPoll(fd uint32, mask waiter.EventMask) waiter.EventMask {
-	for {
-		id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Poll{&pb.PollRequest{Fd: fd, Events: mask.ToLinux()}}}, false /* ignoreResult */)
-		<-c
-
-		res := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_Poll).Poll.Result
-		if e, ok := res.(*pb.PollResponse_ErrorNumber); ok {
-			if syscall.Errno(e.ErrorNumber) == syscall.EINTR {
-				continue
-			}
-			return mask
-		}
-
-		return waiter.EventMaskFromLinux(res.(*pb.PollResponse_Events).Events)
-	}
-}
diff --git a/pkg/sentry/socket/rpcinet/rpcinet.go b/pkg/sentry/socket/rpcinet/rpcinet.go
deleted file mode 100644
index 5d4fd4dac..000000000
--- a/pkg/sentry/socket/rpcinet/rpcinet.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package rpcinet implements sockets using an RPC for each syscall.
-package rpcinet
diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go
deleted file mode 100644
index ddb76d9d4..000000000
--- a/pkg/sentry/socket/rpcinet/socket.go
+++ /dev/null
@@ -1,909 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package rpcinet
-
-import (
-	"sync/atomic"
-	"syscall"
-	"time"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/socket"
-	"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn"
-	"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/notifier"
-	pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto"
-	"gvisor.dev/gvisor/pkg/sentry/unimpl"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/syserr"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
-	"gvisor.dev/gvisor/pkg/waiter"
-)
-
-// socketOperations implements fs.FileOperations and socket.Socket for a socket
-// implemented using a host socket.
-type socketOperations struct {
-	fsutil.FilePipeSeek             `state:"nosave"`
-	fsutil.FileNotDirReaddir        `state:"nosave"`
-	fsutil.FileNoFsync              `state:"nosave"`
-	fsutil.FileNoMMap               `state:"nosave"`
-	fsutil.FileNoSplice             `state:"nosave"`
-	fsutil.FileNoopFlush            `state:"nosave"`
-	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
-	socket.SendReceiveTimeout
-
-	family   int            // Read-only.
-	stype    linux.SockType // Read-only.
-	protocol int            // Read-only.
-
-	fd       uint32 // must be O_NONBLOCK
-	wq       *waiter.Queue
-	rpcConn  *conn.RPCConnection
-	notifier *notifier.Notifier
-
-	// shState is the state of the connection with respect to shutdown. Because
-	// we're mixing non-blocking semantics on the other side we have to adapt for
-	// some strange differences between blocking and non-blocking sockets.
-	shState int32
-}
-
-// Verify that we actually implement socket.Socket.
-var _ = socket.Socket(&socketOperations{})
-
-// New creates a new RPC socket.
-func newSocketFile(ctx context.Context, stack *Stack, family int, skType linux.SockType, protocol int) (*fs.File, *syserr.Error) {
-	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(family), Type: int64(skType | syscall.SOCK_NONBLOCK), Protocol: int64(protocol)}}}, false /* ignoreResult */)
-	<-c
-
-	res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Socket).Socket.Result
-	if e, ok := res.(*pb.SocketResponse_ErrorNumber); ok {
-		return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber))
-	}
-	fd := res.(*pb.SocketResponse_Fd).Fd
-
-	var wq waiter.Queue
-	stack.notifier.AddFD(fd, &wq)
-
-	dirent := socket.NewDirent(ctx, socketDevice)
-	defer dirent.DecRef()
-	return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &socketOperations{
-		family:   family,
-		stype:    skType,
-		protocol: protocol,
-		wq:       &wq,
-		fd:       fd,
-		rpcConn:  stack.rpcConn,
-		notifier: stack.notifier,
-	}), nil
-}
-
-func isBlockingErrno(err error) bool {
-	return err == syscall.EAGAIN || err == syscall.EWOULDBLOCK
-}
-
-func translateIOSyscallError(err error) error {
-	if isBlockingErrno(err) {
-		return syserror.ErrWouldBlock
-	}
-	return err
-}
-
-// setShutdownFlags will set the shutdown flag so we can handle blocking reads
-// after a read shutdown.
-func (s *socketOperations) setShutdownFlags(how int) {
-	var f tcpip.ShutdownFlags
-	switch how {
-	case linux.SHUT_RD:
-		f = tcpip.ShutdownRead
-	case linux.SHUT_WR:
-		f = tcpip.ShutdownWrite
-	case linux.SHUT_RDWR:
-		f = tcpip.ShutdownWrite | tcpip.ShutdownRead
-	}
-
-	// Atomically update the flags.
-	for {
-		old := atomic.LoadInt32(&s.shState)
-		if atomic.CompareAndSwapInt32(&s.shState, old, old|int32(f)) {
-			break
-		}
-	}
-}
-
-func (s *socketOperations) resetShutdownFlags() {
-	atomic.StoreInt32(&s.shState, 0)
-}
-
-func (s *socketOperations) isShutRdSet() bool {
-	return atomic.LoadInt32(&s.shState)&int32(tcpip.ShutdownRead) != 0
-}
-
-func (s *socketOperations) isShutWrSet() bool {
-	return atomic.LoadInt32(&s.shState)&int32(tcpip.ShutdownWrite) != 0
-}
-
-// Release implements fs.FileOperations.Release.
-func (s *socketOperations) Release() {
-	s.notifier.RemoveFD(s.fd)
-
-	// We always need to close the FD.
-	_, _ = s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Close{&pb.CloseRequest{Fd: s.fd}}}, true /* ignoreResult */)
-}
-
-// Readiness implements waiter.Waitable.Readiness.
-func (s *socketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
-	return s.notifier.NonBlockingPoll(s.fd, mask)
-}
-
-// EventRegister implements waiter.Waitable.EventRegister.
-func (s *socketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
-	s.wq.EventRegister(e, mask)
-	s.notifier.UpdateFD(s.fd)
-}
-
-// EventUnregister implements waiter.Waitable.EventUnregister.
-func (s *socketOperations) EventUnregister(e *waiter.Entry) {
-	s.wq.EventUnregister(e)
-	s.notifier.UpdateFD(s.fd)
-}
-
-func rpcRead(t *kernel.Task, req *pb.SyscallRequest_Read) (*pb.ReadResponse_Data, *syserr.Error) {
-	s := t.NetworkContext().(*Stack)
-	id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */)
-	<-c
-
-	res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Read).Read.Result
-	if e, ok := res.(*pb.ReadResponse_ErrorNumber); ok {
-		return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber))
-	}
-
-	return res.(*pb.ReadResponse_Data), nil
-}
-
-// Read implements fs.FileOperations.Read.
-func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
-	req := &pb.SyscallRequest_Read{&pb.ReadRequest{
-		Fd:     s.fd,
-		Length: uint32(dst.NumBytes()),
-	}}
-
-	res, se := rpcRead(ctx.(*kernel.Task), req)
-	if se == nil {
-		n, e := dst.CopyOut(ctx, res.Data)
-		return int64(n), e
-	}
-
-	return 0, se.ToError()
-}
-
-func rpcWrite(t *kernel.Task, req *pb.SyscallRequest_Write) (uint32, *syserr.Error) {
-	s := t.NetworkContext().(*Stack)
-	id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */)
-	<-c
-
-	res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Write).Write.Result
-	if e, ok := res.(*pb.WriteResponse_ErrorNumber); ok {
-		return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber))
-	}
-
-	return res.(*pb.WriteResponse_Length).Length, nil
-}
-
-// Write implements fs.FileOperations.Write.
-func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
-	t := ctx.(*kernel.Task)
-	v := buffer.NewView(int(src.NumBytes()))
-
-	// Copy all the data into the buffer.
-	if _, err := src.CopyIn(t, v); err != nil {
-		return 0, err
-	}
-
-	n, err := rpcWrite(t, &pb.SyscallRequest_Write{&pb.WriteRequest{Fd: s.fd, Data: v}})
-	if n > 0 && n < uint32(src.NumBytes()) {
-		// The FileOperations.Write interface expects us to return ErrWouldBlock in
-		// the event of a partial write.
-		return int64(n), syserror.ErrWouldBlock
-	}
-	return int64(n), err.ToError()
-}
-
-func rpcConnect(t *kernel.Task, fd uint32, sockaddr []byte) *syserr.Error {
-	s := t.NetworkContext().(*Stack)
-	id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Connect{&pb.ConnectRequest{Fd: uint32(fd), Address: sockaddr}}}, false /* ignoreResult */)
-	<-c
-
-	if e := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Connect).Connect.ErrorNumber; e != 0 {
-		return syserr.FromHost(syscall.Errno(e))
-	}
-	return nil
-}
-
-// Connect implements socket.Socket.Connect.
-func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
-	if !blocking {
-		e := rpcConnect(t, s.fd, sockaddr)
-		if e == nil {
-			// Reset the shutdown state on new connects.
-			s.resetShutdownFlags()
-		}
-		return e
-	}
-
-	// Register for notification when the endpoint becomes writable, then
-	// initiate the connection.
-	e, ch := waiter.NewChannelEntry(nil)
-	s.EventRegister(&e, waiter.EventOut|waiter.EventIn|waiter.EventHUp)
-	defer s.EventUnregister(&e)
-	for {
-		if err := rpcConnect(t, s.fd, sockaddr); err == nil || err != syserr.ErrInProgress && err != syserr.ErrAlreadyInProgress {
-			if err == nil {
-				// Reset the shutdown state on new connects.
-				s.resetShutdownFlags()
-			}
-			return err
-		}
-
-		// It's pending, so we have to wait for a notification, and fetch the
-		// result once the wait completes.
-		if err := t.Block(ch); err != nil {
-			return syserr.FromError(err)
-		}
-	}
-}
-
-func rpcAccept(t *kernel.Task, fd uint32, peer bool) (*pb.AcceptResponse_ResultPayload, *syserr.Error) {
-	stack := t.NetworkContext().(*Stack)
-	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Accept{&pb.AcceptRequest{Fd: fd, Peer: peer, Flags: syscall.SOCK_NONBLOCK}}}, false /* ignoreResult */)
-	<-c
-
-	res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Accept).Accept.Result
-	if e, ok := res.(*pb.AcceptResponse_ErrorNumber); ok {
-		return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber))
-	}
-	return res.(*pb.AcceptResponse_Payload).Payload, nil
-}
-
-// Accept implements socket.Socket.Accept.
-func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
-	payload, se := rpcAccept(t, s.fd, peerRequested)
-
-	// Check if we need to block.
-	if blocking && se == syserr.ErrTryAgain {
-		// Register for notifications.
-		e, ch := waiter.NewChannelEntry(nil)
-		// FIXME(b/119878986): This waiter.EventHUp is a partial
-		// measure, need to figure out how to translate linux events to
-		// internal events.
-		s.EventRegister(&e, waiter.EventIn|waiter.EventHUp)
-		defer s.EventUnregister(&e)
-
-		// Try to accept the connection again; if it fails, then wait until we
-		// get a notification.
-		for {
-			if payload, se = rpcAccept(t, s.fd, peerRequested); se != syserr.ErrTryAgain {
-				break
-			}
-
-			if err := t.Block(ch); err != nil {
-				return 0, nil, 0, syserr.FromError(err)
-			}
-		}
-	}
-
-	// Handle any error from accept.
-	if se != nil {
-		return 0, nil, 0, se
-	}
-
-	var wq waiter.Queue
-	s.notifier.AddFD(payload.Fd, &wq)
-
-	dirent := socket.NewDirent(t, socketDevice)
-	defer dirent.DecRef()
-	fileFlags := fs.FileFlags{
-		Read:        true,
-		Write:       true,
-		NonSeekable: true,
-		NonBlocking: flags&linux.SOCK_NONBLOCK != 0,
-	}
-	file := fs.NewFile(t, dirent, fileFlags, &socketOperations{
-		family:   s.family,
-		stype:    s.stype,
-		protocol: s.protocol,
-		wq:       &wq,
-		fd:       payload.Fd,
-		rpcConn:  s.rpcConn,
-		notifier: s.notifier,
-	})
-	defer file.DecRef()
-
-	fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
-		CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
-	})
-	if err != nil {
-		return 0, nil, 0, syserr.FromError(err)
-	}
-	t.Kernel().RecordSocket(file)
-
-	if peerRequested {
-		return fd, socket.UnmarshalSockAddr(s.family, payload.Address.Address), payload.Address.Length, nil
-	}
-
-	return fd, nil, 0, nil
-}
-
-// Bind implements socket.Socket.Bind.
-func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
-	stack := t.NetworkContext().(*Stack)
-	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Bind{&pb.BindRequest{Fd: s.fd, Address: sockaddr}}}, false /* ignoreResult */)
-	<-c
-
-	if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Bind).Bind.ErrorNumber; e != 0 {
-		return syserr.FromHost(syscall.Errno(e))
-	}
-	return nil
-}
-
-// Listen implements socket.Socket.Listen.
-func (s *socketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error {
-	stack := t.NetworkContext().(*Stack)
-	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Listen{&pb.ListenRequest{Fd: s.fd, Backlog: int64(backlog)}}}, false /* ignoreResult */)
-	<-c
-
-	if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Listen).Listen.ErrorNumber; e != 0 {
-		return syserr.FromHost(syscall.Errno(e))
-	}
-	return nil
-}
-
-// Shutdown implements socket.Socket.Shutdown.
-func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
-	// We save the shutdown state because of strange differences on linux
-	// related to recvs on blocking vs. non-blocking sockets after a SHUT_RD.
-	// We need to emulate that behavior on the blocking side.
-	// TODO(b/120096741): There is a possible race that can exist with loopback,
-	// where data could possibly be lost.
-	s.setShutdownFlags(how)
-
-	stack := t.NetworkContext().(*Stack)
-	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Shutdown{&pb.ShutdownRequest{Fd: s.fd, How: int64(how)}}}, false /* ignoreResult */)
-	<-c
-
-	if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Shutdown).Shutdown.ErrorNumber; e != 0 {
-		return syserr.FromHost(syscall.Errno(e))
-	}
-
-	return nil
-}
-
-// GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
-	// SO_RCVTIMEO and SO_SNDTIMEO are special because blocking is performed
-	// within the sentry.
-	if level == linux.SOL_SOCKET && name == linux.SO_RCVTIMEO {
-		if outLen < linux.SizeOfTimeval {
-			return nil, syserr.ErrInvalidArgument
-		}
-
-		return linux.NsecToTimeval(s.RecvTimeout()), nil
-	}
-	if level == linux.SOL_SOCKET && name == linux.SO_SNDTIMEO {
-		if outLen < linux.SizeOfTimeval {
-			return nil, syserr.ErrInvalidArgument
-		}
-
-		return linux.NsecToTimeval(s.SendTimeout()), nil
-	}
-
-	stack := t.NetworkContext().(*Stack)
-	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetSockOpt{&pb.GetSockOptRequest{Fd: s.fd, Level: int64(level), Name: int64(name), Length: uint32(outLen)}}}, false /* ignoreResult */)
-	<-c
-
-	res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_GetSockOpt).GetSockOpt.Result
-	if e, ok := res.(*pb.GetSockOptResponse_ErrorNumber); ok {
-		return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber))
-	}
-
-	return res.(*pb.GetSockOptResponse_Opt).Opt, nil
-}
-
-// SetSockOpt implements socket.Socket.SetSockOpt.
-func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
-	// Because blocking actually happens within the sentry we need to inspect
-	// this socket option to determine if it's a SO_RCVTIMEO or SO_SNDTIMEO,
-	// and if so, we will save it and use it as the deadline for recv(2)
-	// or send(2) related syscalls.
-	if level == linux.SOL_SOCKET && name == linux.SO_RCVTIMEO {
-		if len(opt) < linux.SizeOfTimeval {
-			return syserr.ErrInvalidArgument
-		}
-
-		var v linux.Timeval
-		binary.Unmarshal(opt[:linux.SizeOfTimeval], usermem.ByteOrder, &v)
-		if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
-			return syserr.ErrDomain
-		}
-		s.SetRecvTimeout(v.ToNsecCapped())
-		return nil
-	}
-	if level == linux.SOL_SOCKET && name == linux.SO_SNDTIMEO {
-		if len(opt) < linux.SizeOfTimeval {
-			return syserr.ErrInvalidArgument
-		}
-
-		var v linux.Timeval
-		binary.Unmarshal(opt[:linux.SizeOfTimeval], usermem.ByteOrder, &v)
-		if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
-			return syserr.ErrDomain
-		}
-		s.SetSendTimeout(v.ToNsecCapped())
-		return nil
-	}
-
-	stack := t.NetworkContext().(*Stack)
-	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_SetSockOpt{&pb.SetSockOptRequest{Fd: s.fd, Level: int64(level), Name: int64(name), Opt: opt}}}, false /* ignoreResult */)
-	<-c
-
-	if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_SetSockOpt).SetSockOpt.ErrorNumber; e != 0 {
-		return syserr.FromHost(syscall.Errno(e))
-	}
-	return nil
-}
-
-// GetPeerName implements socket.Socket.GetPeerName.
-func (s *socketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
-	stack := t.NetworkContext().(*Stack)
-	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetPeerName{&pb.GetPeerNameRequest{Fd: s.fd}}}, false /* ignoreResult */)
-	<-c
-
-	res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_GetPeerName).GetPeerName.Result
-	if e, ok := res.(*pb.GetPeerNameResponse_ErrorNumber); ok {
-		return nil, 0, syserr.FromHost(syscall.Errno(e.ErrorNumber))
-	}
-
-	addr := res.(*pb.GetPeerNameResponse_Address).Address
-	return socket.UnmarshalSockAddr(s.family, addr.Address), addr.Length, nil
-}
-
-// GetSockName implements socket.Socket.GetSockName.
-func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
-	stack := t.NetworkContext().(*Stack)
-	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetSockName{&pb.GetSockNameRequest{Fd: s.fd}}}, false /* ignoreResult */)
-	<-c
-
-	res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_GetSockName).GetSockName.Result
-	if e, ok := res.(*pb.GetSockNameResponse_ErrorNumber); ok {
-		return nil, 0, syserr.FromHost(syscall.Errno(e.ErrorNumber))
-	}
-
-	addr := res.(*pb.GetSockNameResponse_Address).Address
-	return socket.UnmarshalSockAddr(s.family, addr.Address), addr.Length, nil
-}
-
-func rpcIoctl(t *kernel.Task, fd, cmd uint32, arg []byte) ([]byte, error) {
-	stack := t.NetworkContext().(*Stack)
-
-	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Ioctl{&pb.IOCtlRequest{Fd: fd, Cmd: cmd, Arg: arg}}}, false /* ignoreResult */)
-	<-c
-
-	res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Ioctl).Ioctl.Result
-	if e, ok := res.(*pb.IOCtlResponse_ErrorNumber); ok {
-		return nil, syscall.Errno(e.ErrorNumber)
-	}
-
-	return res.(*pb.IOCtlResponse_Value).Value, nil
-}
-
-// ifconfIoctlFromStack populates a struct ifconf for the SIOCGIFCONF ioctl.
-func ifconfIoctlFromStack(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error {
-	// If Ptr is NULL, return the necessary buffer size via Len.
-	// Otherwise, write up to Len bytes starting at Ptr containing ifreq
-	// structs.
-	t := ctx.(*kernel.Task)
-	s := t.NetworkContext().(*Stack)
-	if s == nil {
-		return syserr.ErrNoDevice.ToError()
-	}
-
-	if ifc.Ptr == 0 {
-		ifc.Len = int32(len(s.Interfaces())) * int32(linux.SizeOfIFReq)
-		return nil
-	}
-
-	max := ifc.Len
-	ifc.Len = 0
-	for key, ifaceAddrs := range s.InterfaceAddrs() {
-		iface := s.Interfaces()[key]
-		for _, ifaceAddr := range ifaceAddrs {
-			// Don't write past the end of the buffer.
-			if ifc.Len+int32(linux.SizeOfIFReq) > max {
-				break
-			}
-			if ifaceAddr.Family != linux.AF_INET {
-				continue
-			}
-
-			// Populate ifr.ifr_addr.
-			ifr := linux.IFReq{}
-			ifr.SetName(iface.Name)
-			usermem.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family))
-			usermem.ByteOrder.PutUint16(ifr.Data[2:4], 0)
-			copy(ifr.Data[4:8], ifaceAddr.Addr[:4])
-
-			// Copy the ifr to userspace.
-			dst := uintptr(ifc.Ptr) + uintptr(ifc.Len)
-			ifc.Len += int32(linux.SizeOfIFReq)
-			if _, err := usermem.CopyObjectOut(ctx, io, usermem.Addr(dst), ifr, usermem.IOOpts{
-				AddressSpaceActive: true,
-			}); err != nil {
-				return err
-			}
-		}
-	}
-	return nil
-}
-
-// Ioctl implements fs.FileOperations.Ioctl.
-func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
-	t := ctx.(*kernel.Task)
-
-	cmd := uint32(args[1].Int())
-	arg := args[2].Pointer()
-
-	var buf []byte
-	switch cmd {
-	// The following ioctls take 4 byte argument parameters.
-	case syscall.TIOCINQ,
-		syscall.TIOCOUTQ:
-		buf = make([]byte, 4)
-	// The following ioctls have args which are sizeof(struct ifreq).
-	case syscall.SIOCGIFADDR,
-		syscall.SIOCGIFBRDADDR,
-		syscall.SIOCGIFDSTADDR,
-		syscall.SIOCGIFFLAGS,
-		syscall.SIOCGIFHWADDR,
-		syscall.SIOCGIFINDEX,
-		syscall.SIOCGIFMAP,
-		syscall.SIOCGIFMETRIC,
-		syscall.SIOCGIFMTU,
-		syscall.SIOCGIFNAME,
-		syscall.SIOCGIFNETMASK,
-		syscall.SIOCGIFTXQLEN:
-		buf = make([]byte, linux.SizeOfIFReq)
-	case syscall.SIOCGIFCONF:
-		// SIOCGIFCONF has slightly different behavior than the others, in that it
-		// will need to populate the array of ifreqs.
-		var ifc linux.IFConf
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifc, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
-			return 0, err
-		}
-
-		if err := ifconfIoctlFromStack(ctx, io, &ifc); err != nil {
-			return 0, err
-		}
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ifc, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
-
-		return 0, err
-
-	case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG:
-		unimpl.EmitUnimplementedEvent(ctx)
-
-	default:
-		return 0, syserror.ENOTTY
-	}
-
-	_, err := io.CopyIn(ctx, arg, buf, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
-
-	if err != nil {
-		return 0, err
-	}
-
-	v, err := rpcIoctl(t, s.fd, cmd, buf)
-	if err != nil {
-		return 0, err
-	}
-
-	if len(v) != len(buf) {
-		return 0, syserror.EINVAL
-	}
-
-	_, err = io.CopyOut(ctx, arg, v, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
-	return 0, err
-}
-
-func rpcRecvMsg(t *kernel.Task, req *pb.SyscallRequest_Recvmsg) (*pb.RecvmsgResponse_ResultPayload, *syserr.Error) {
-	s := t.NetworkContext().(*Stack)
-	id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */)
-	<-c
-
-	res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Recvmsg).Recvmsg.Result
-	if e, ok := res.(*pb.RecvmsgResponse_ErrorNumber); ok {
-		return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber))
-	}
-
-	return res.(*pb.RecvmsgResponse_Payload).Payload, nil
-}
-
-// Because we only support SO_TIMESTAMP we will search control messages for
-// that value and set it if so, all other control messages will be ignored.
-func (s *socketOperations) extractControlMessages(payload *pb.RecvmsgResponse_ResultPayload) socket.ControlMessages {
-	c := socket.ControlMessages{}
-	if len(payload.GetCmsgData()) > 0 {
-		// Parse the control messages looking for SO_TIMESTAMP.
-		msgs, e := syscall.ParseSocketControlMessage(payload.GetCmsgData())
-		if e != nil {
-			return socket.ControlMessages{}
-		}
-		for _, m := range msgs {
-			if m.Header.Level != linux.SOL_SOCKET || m.Header.Type != linux.SO_TIMESTAMP {
-				continue
-			}
-
-			// Let's parse the time stamp and set it.
-			if len(m.Data) < linux.SizeOfTimeval {
-				// Give up on locating the SO_TIMESTAMP option.
-				return socket.ControlMessages{}
-			}
-
-			var v linux.Timeval
-			binary.Unmarshal(m.Data[:linux.SizeOfTimeval], usermem.ByteOrder, &v)
-			c.IP.HasTimestamp = true
-			c.IP.Timestamp = v.ToNsecCapped()
-			break
-		}
-	}
-	return c
-}
-
-// RecvMsg implements socket.Socket.RecvMsg.
-func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
-	req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{
-		Fd:         s.fd,
-		Length:     uint32(dst.NumBytes()),
-		Sender:     senderRequested,
-		Trunc:      flags&linux.MSG_TRUNC != 0,
-		Peek:       flags&linux.MSG_PEEK != 0,
-		CmsgLength: uint32(controlDataLen),
-	}}
-
-	res, err := rpcRecvMsg(t, req)
-	if err == nil {
-		var e error
-		var n int
-		if len(res.Data) > 0 {
-			n, e = dst.CopyOut(t, res.Data)
-			if e == nil && n != len(res.Data) {
-				panic("CopyOut failed to copy full buffer")
-			}
-		}
-		c := s.extractControlMessages(res)
-		return int(res.Length), 0, socket.UnmarshalSockAddr(s.family, res.Address.GetAddress()), res.Address.GetLength(), c, syserr.FromError(e)
-	}
-	if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain || flags&linux.MSG_DONTWAIT != 0 {
-		return 0, 0, nil, 0, socket.ControlMessages{}, err
-	}
-
-	// We'll have to block. Register for notifications and keep trying to
-	// send all the data.
-	e, ch := waiter.NewChannelEntry(nil)
-	s.EventRegister(&e, waiter.EventIn)
-	defer s.EventUnregister(&e)
-
-	for {
-		res, err := rpcRecvMsg(t, req)
-		if err == nil {
-			var e error
-			var n int
-			if len(res.Data) > 0 {
-				n, e = dst.CopyOut(t, res.Data)
-				if e == nil && n != len(res.Data) {
-					panic("CopyOut failed to copy full buffer")
-				}
-			}
-			c := s.extractControlMessages(res)
-			return int(res.Length), 0, socket.UnmarshalSockAddr(s.family, res.Address.GetAddress()), res.Address.GetLength(), c, syserr.FromError(e)
-		}
-		if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain {
-			return 0, 0, nil, 0, socket.ControlMessages{}, err
-		}
-
-		if s.isShutRdSet() {
-			// Blocking would have caused us to block indefinitely so we return 0,
-			// this is the same behavior as Linux.
-			return 0, 0, nil, 0, socket.ControlMessages{}, nil
-		}
-
-		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
-			if err == syserror.ETIMEDOUT {
-				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
-			}
-			return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
-		}
-	}
-}
-
-func rpcSendMsg(t *kernel.Task, req *pb.SyscallRequest_Sendmsg) (uint32, *syserr.Error) {
-	s := t.NetworkContext().(*Stack)
-	id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */)
-	<-c
-
-	res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Sendmsg).Sendmsg.Result
-	if e, ok := res.(*pb.SendmsgResponse_ErrorNumber); ok {
-		return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber))
-	}
-
-	return res.(*pb.SendmsgResponse_Length).Length, nil
-}
-
-// SendMsg implements socket.Socket.SendMsg.
-func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
-	// Whitelist flags.
-	if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 {
-		return 0, syserr.ErrInvalidArgument
-	}
-
-	// Reject Unix control messages.
-	if !controlMessages.Unix.Empty() {
-		return 0, syserr.ErrInvalidArgument
-	}
-
-	v := buffer.NewView(int(src.NumBytes()))
-
-	// Copy all the data into the buffer.
-	if _, err := src.CopyIn(t, v); err != nil {
-		return 0, syserr.FromError(err)
-	}
-
-	// TODO(bgeffon): this needs to change to map directly to a SendMsg syscall
-	// in the RPC.
-	totalWritten := 0
-	n, err := rpcSendMsg(t, &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{
-		Fd:          uint32(s.fd),
-		Data:        v,
-		Address:     to,
-		More:        flags&linux.MSG_MORE != 0,
-		EndOfRecord: flags&linux.MSG_EOR != 0,
-	}})
-
-	if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain || flags&linux.MSG_DONTWAIT != 0 {
-		return int(n), err
-	}
-
-	if n > 0 {
-		totalWritten += int(n)
-		v.TrimFront(int(n))
-	}
-
-	// We'll have to block. Register for notification and keep trying to
-	// send all the data.
-	e, ch := waiter.NewChannelEntry(nil)
-	s.EventRegister(&e, waiter.EventOut)
-	defer s.EventUnregister(&e)
-
-	for {
-		n, err := rpcSendMsg(t, &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{
-			Fd:          uint32(s.fd),
-			Data:        v,
-			Address:     to,
-			More:        flags&linux.MSG_MORE != 0,
-			EndOfRecord: flags&linux.MSG_EOR != 0,
-		}})
-
-		if n > 0 {
-			totalWritten += int(n)
-			v.TrimFront(int(n))
-
-			if err == nil && totalWritten < int(src.NumBytes()) {
-				continue
-			}
-		}
-
-		if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain {
-			// We eat the error in this situation.
-			return int(totalWritten), nil
-		}
-
-		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
-			if err == syserror.ETIMEDOUT {
-				return int(totalWritten), syserr.ErrTryAgain
-			}
-			return int(totalWritten), syserr.FromError(err)
-		}
-	}
-}
-
-// State implements socket.Socket.State.
-func (s *socketOperations) State() uint32 {
-	// TODO(b/127845868): Define a new rpc to query the socket state.
-	return 0
-}
-
-// Type implements socket.Socket.Type.
-func (s *socketOperations) Type() (family int, skType linux.SockType, protocol int) {
-	return s.family, s.stype, s.protocol
-}
-
-type socketProvider struct {
-	family int
-}
-
-// Socket implements socket.Provider.Socket.
-func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) {
-	// Check that we are using the RPC network stack.
-	stack := t.NetworkContext()
-	if stack == nil {
-		return nil, nil
-	}
-
-	s, ok := stack.(*Stack)
-	if !ok {
-		return nil, nil
-	}
-
-	// Only accept TCP and UDP.
-	//
-	// Try to restrict the flags we will accept to minimize backwards
-	// incompatibility with netstack.
-	stype := stypeflags & linux.SOCK_TYPE_MASK
-	switch stype {
-	case syscall.SOCK_STREAM:
-		switch protocol {
-		case 0, syscall.IPPROTO_TCP:
-			// ok
-		default:
-			return nil, nil
-		}
-	case syscall.SOCK_DGRAM:
-		switch protocol {
-		case 0, syscall.IPPROTO_UDP:
-			// ok
-		default:
-			return nil, nil
-		}
-	default:
-		return nil, nil
-	}
-
-	return newSocketFile(t, s, p.family, stype, protocol)
-}
-
-// Pair implements socket.Provider.Pair.
-func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
-	// Not supported by AF_INET/AF_INET6.
-	return nil, nil, nil
-}
-
-func init() {
-	for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} {
-		socket.RegisterProvider(family, &socketProvider{family})
-	}
-}
diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go
deleted file mode 100644
index f7878a760..000000000
--- a/pkg/sentry/socket/rpcinet/stack.go
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package rpcinet
-
-import (
-	"fmt"
-	"syscall"
-
-	"gvisor.dev/gvisor/pkg/sentry/inet"
-	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
-	"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn"
-	"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/notifier"
-	"gvisor.dev/gvisor/pkg/syserr"
-	"gvisor.dev/gvisor/pkg/tcpip/stack"
-	"gvisor.dev/gvisor/pkg/unet"
-)
-
-// Stack implements inet.Stack for RPC backed sockets.
-type Stack struct {
-	interfaces     map[int32]inet.Interface
-	interfaceAddrs map[int32][]inet.InterfaceAddr
-	routes         []inet.Route
-	rpcConn        *conn.RPCConnection
-	notifier       *notifier.Notifier
-}
-
-// NewStack returns a Stack containing the current state of the host network
-// stack.
-func NewStack(fd int32) (*Stack, error) {
-	sock, err := unet.NewSocket(int(fd))
-	if err != nil {
-		return nil, err
-	}
-
-	stack := &Stack{
-		interfaces:     make(map[int32]inet.Interface),
-		interfaceAddrs: make(map[int32][]inet.InterfaceAddr),
-		rpcConn:        conn.NewRPCConnection(sock),
-	}
-
-	var e error
-	stack.notifier, e = notifier.NewRPCNotifier(stack.rpcConn)
-	if e != nil {
-		return nil, e
-	}
-
-	links, err := stack.DoNetlinkRouteRequest(syscall.RTM_GETLINK)
-	if err != nil {
-		return nil, fmt.Errorf("RTM_GETLINK failed: %v", err)
-	}
-
-	addrs, err := stack.DoNetlinkRouteRequest(syscall.RTM_GETADDR)
-	if err != nil {
-		return nil, fmt.Errorf("RTM_GETADDR failed: %v", err)
-	}
-
-	e = hostinet.ExtractHostInterfaces(links, addrs, stack.interfaces, stack.interfaceAddrs)
-	if e != nil {
-		return nil, e
-	}
-
-	routes, err := stack.DoNetlinkRouteRequest(syscall.RTM_GETROUTE)
-	if err != nil {
-		return nil, fmt.Errorf("RTM_GETROUTE failed: %v", err)
-	}
-
-	stack.routes, e = hostinet.ExtractHostRoutes(routes)
-	if e != nil {
-		return nil, e
-	}
-
-	return stack, nil
-}
-
-// RPCReadFile will execute the ReadFile helper RPC method which avoids the
-// common pattern of open(2), read(2), close(2) by doing all three operations
-// as a single RPC. It will read the entire file or return EFBIG if the file
-// was too large.
-func (s *Stack) RPCReadFile(path string) ([]byte, *syserr.Error) {
-	return s.rpcConn.RPCReadFile(path)
-}
-
-// RPCWriteFile will execute the WriteFile helper RPC method which avoids the
-// common pattern of open(2), write(2), write(2), close(2) by doing all
-// operations as a single RPC.
-func (s *Stack) RPCWriteFile(path string, data []byte) (int64, *syserr.Error) {
-	return s.rpcConn.RPCWriteFile(path, data)
-}
-
-// Interfaces implements inet.Stack.Interfaces.
-func (s *Stack) Interfaces() map[int32]inet.Interface {
-	interfaces := make(map[int32]inet.Interface)
-	for k, v := range s.interfaces {
-		interfaces[k] = v
-	}
-	return interfaces
-}
-
-// InterfaceAddrs implements inet.Stack.InterfaceAddrs.
-func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
-	addrs := make(map[int32][]inet.InterfaceAddr)
-	for k, v := range s.interfaceAddrs {
-		addrs[k] = append([]inet.InterfaceAddr(nil), v...)
-	}
-	return addrs
-}
-
-// SupportsIPv6 implements inet.Stack.SupportsIPv6.
-func (s *Stack) SupportsIPv6() bool {
-	panic("rpcinet handles procfs directly this method should not be called")
-}
-
-// TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize.
-func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
-	panic("rpcinet handles procfs directly this method should not be called")
-}
-
-// SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize.
-func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error {
-	panic("rpcinet handles procfs directly this method should not be called")
-
-}
-
-// TCPSendBufferSize implements inet.Stack.TCPSendBufferSize.
-func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) {
-	panic("rpcinet handles procfs directly this method should not be called")
-
-}
-
-// SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize.
-func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error {
-	panic("rpcinet handles procfs directly this method should not be called")
-}
-
-// TCPSACKEnabled implements inet.Stack.TCPSACKEnabled.
-func (s *Stack) TCPSACKEnabled() (bool, error) {
-	panic("rpcinet handles procfs directly this method should not be called")
-}
-
-// SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled.
-func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
-	panic("rpcinet handles procfs directly this method should not be called")
-}
-
-// Statistics implements inet.Stack.Statistics.
-func (s *Stack) Statistics(stat interface{}, arg string) error {
-	return syserr.ErrEndpointOperation.ToError()
-}
-
-// RouteTable implements inet.Stack.RouteTable.
-func (s *Stack) RouteTable() []inet.Route {
-	return append([]inet.Route(nil), s.routes...)
-}
-
-// Resume implements inet.Stack.Resume.
-func (s *Stack) Resume() {}
-
-// RegisteredEndpoints implements inet.Stack.RegisteredEndpoints.
-func (s *Stack) RegisteredEndpoints() []stack.TransportEndpoint { return nil }
-
-// CleanupEndpoints implements inet.Stack.CleanupEndpoints.
-func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil }
-
-// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
-func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {}
diff --git a/pkg/sentry/socket/rpcinet/stack_unsafe.go b/pkg/sentry/socket/rpcinet/stack_unsafe.go
deleted file mode 100644
index a94bdad83..000000000
--- a/pkg/sentry/socket/rpcinet/stack_unsafe.go
+++ /dev/null
@@ -1,193 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package rpcinet
-
-import (
-	"syscall"
-	"unsafe"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
-	pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/syserr"
-)
-
-// NewNetlinkRouteRequest builds a netlink message for getting the RIB,
-// the routing information base.
-func newNetlinkRouteRequest(proto, seq, family int) []byte {
-	rr := &syscall.NetlinkRouteRequest{}
-	rr.Header.Len = uint32(syscall.NLMSG_HDRLEN + syscall.SizeofRtGenmsg)
-	rr.Header.Type = uint16(proto)
-	rr.Header.Flags = syscall.NLM_F_DUMP | syscall.NLM_F_REQUEST
-	rr.Header.Seq = uint32(seq)
-	rr.Data.Family = uint8(family)
-	return netlinkRRtoWireFormat(rr)
-}
-
-func netlinkRRtoWireFormat(rr *syscall.NetlinkRouteRequest) []byte {
-	b := make([]byte, rr.Header.Len)
-	*(*uint32)(unsafe.Pointer(&b[0:4][0])) = rr.Header.Len
-	*(*uint16)(unsafe.Pointer(&b[4:6][0])) = rr.Header.Type
-	*(*uint16)(unsafe.Pointer(&b[6:8][0])) = rr.Header.Flags
-	*(*uint32)(unsafe.Pointer(&b[8:12][0])) = rr.Header.Seq
-	*(*uint32)(unsafe.Pointer(&b[12:16][0])) = rr.Header.Pid
-	b[16] = byte(rr.Data.Family)
-	return b
-}
-
-func (s *Stack) getNetlinkFd() (uint32, *syserr.Error) {
-	id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(syscall.AF_NETLINK), Type: int64(syscall.SOCK_RAW | syscall.SOCK_NONBLOCK), Protocol: int64(syscall.NETLINK_ROUTE)}}}, false /* ignoreResult */)
-	<-c
-
-	res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Socket).Socket.Result
-	if e, ok := res.(*pb.SocketResponse_ErrorNumber); ok {
-		return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber))
-	}
-	return res.(*pb.SocketResponse_Fd).Fd, nil
-}
-
-func (s *Stack) bindNetlinkFd(fd uint32, sockaddr []byte) *syserr.Error {
-	id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Bind{&pb.BindRequest{Fd: fd, Address: sockaddr}}}, false /* ignoreResult */)
-	<-c
-
-	if e := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Bind).Bind.ErrorNumber; e != 0 {
-		return syserr.FromHost(syscall.Errno(e))
-	}
-	return nil
-}
-
-func (s *Stack) closeNetlinkFd(fd uint32) {
-	_, _ = s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Close{&pb.CloseRequest{Fd: fd}}}, true /* ignoreResult */)
-}
-
-func (s *Stack) rpcSendMsg(req *pb.SyscallRequest_Sendmsg) (uint32, *syserr.Error) {
-	id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */)
-	<-c
-
-	res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Sendmsg).Sendmsg.Result
-	if e, ok := res.(*pb.SendmsgResponse_ErrorNumber); ok {
-		return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber))
-	}
-
-	return res.(*pb.SendmsgResponse_Length).Length, nil
-}
-
-func (s *Stack) sendMsg(fd uint32, buf []byte, to []byte, flags int) (int, *syserr.Error) {
-	// Whitelist flags.
-	if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 {
-		return 0, syserr.ErrInvalidArgument
-	}
-
-	req := &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{
-		Fd:          fd,
-		Data:        buf,
-		Address:     to,
-		More:        flags&linux.MSG_MORE != 0,
-		EndOfRecord: flags&linux.MSG_EOR != 0,
-	}}
-
-	n, err := s.rpcSendMsg(req)
-	return int(n), err
-}
-
-func (s *Stack) rpcRecvMsg(req *pb.SyscallRequest_Recvmsg) (*pb.RecvmsgResponse_ResultPayload, *syserr.Error) {
-	id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */)
-	<-c
-
-	res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Recvmsg).Recvmsg.Result
-	if e, ok := res.(*pb.RecvmsgResponse_ErrorNumber); ok {
-		return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber))
-	}
-
-	return res.(*pb.RecvmsgResponse_Payload).Payload, nil
-}
-
-func (s *Stack) recvMsg(fd, l, flags uint32) ([]byte, *syserr.Error) {
-	req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{
-		Fd:     fd,
-		Length: l,
-		Sender: false,
-		Trunc:  flags&linux.MSG_TRUNC != 0,
-		Peek:   flags&linux.MSG_PEEK != 0,
-	}}
-
-	res, err := s.rpcRecvMsg(req)
-	if err != nil {
-		return nil, err
-	}
-	return res.Data, nil
-}
-
-func (s *Stack) netlinkRequest(proto, family int) ([]byte, error) {
-	fd, err := s.getNetlinkFd()
-	if err != nil {
-		return nil, err.ToError()
-	}
-	defer s.closeNetlinkFd(fd)
-
-	lsa := syscall.SockaddrNetlink{Family: syscall.AF_NETLINK}
-	b := binary.Marshal(nil, usermem.ByteOrder, &lsa)
-	if err := s.bindNetlinkFd(fd, b); err != nil {
-		return nil, err.ToError()
-	}
-
-	wb := newNetlinkRouteRequest(proto, 1, family)
-	_, err = s.sendMsg(fd, wb, b, 0)
-	if err != nil {
-		return nil, err.ToError()
-	}
-
-	var tab []byte
-done:
-	for {
-		rb, err := s.recvMsg(fd, uint32(syscall.Getpagesize()), 0)
-		nr := len(rb)
-		if err != nil {
-			return nil, err.ToError()
-		}
-
-		if nr < syscall.NLMSG_HDRLEN {
-			return nil, syserr.ErrInvalidArgument.ToError()
-		}
-
-		tab = append(tab, rb...)
-		msgs, e := syscall.ParseNetlinkMessage(rb)
-		if e != nil {
-			return nil, e
-		}
-
-		for _, m := range msgs {
-			if m.Header.Type == syscall.NLMSG_DONE {
-				break done
-			}
-			if m.Header.Type == syscall.NLMSG_ERROR {
-				return nil, syserr.ErrInvalidArgument.ToError()
-			}
-		}
-	}
-
-	return tab, nil
-}
-
-// DoNetlinkRouteRequest returns routing information base, also known as RIB,
-// which consists of network facility information, states and parameters.
-func (s *Stack) DoNetlinkRouteRequest(req int) ([]syscall.NetlinkMessage, error) {
-	data, err := s.netlinkRequest(req, syscall.AF_UNSPEC)
-	if err != nil {
-		return nil, err
-	}
-	return syscall.ParseNetlinkMessage(data)
-}
diff --git a/pkg/sentry/socket/rpcinet/syscall_rpc.proto b/pkg/sentry/socket/rpcinet/syscall_rpc.proto
deleted file mode 100644
index b677e9eb3..000000000
--- a/pkg/sentry/socket/rpcinet/syscall_rpc.proto
+++ /dev/null
@@ -1,352 +0,0 @@
-syntax = "proto3";
-
-// package syscall_rpc is a set of networking related system calls that can be
-// forwarded to a socket gofer.
-//
-package syscall_rpc;
-
-message SendmsgRequest {
-  uint32 fd = 1;
-  bytes data = 2 [ctype = CORD];
-  bytes address = 3;
-  bool more = 4;
-  bool end_of_record = 5;
-}
-
-message SendmsgResponse {
-  oneof result {
-    uint32 error_number = 1;
-    uint32 length = 2;
-  }
-}
-
-message IOCtlRequest {
-  uint32 fd = 1;
-  uint32 cmd = 2;
-  bytes arg = 3;
-}
-
-message IOCtlResponse {
-  oneof result {
-    uint32 error_number = 1;
-    bytes value = 2;
-  }
-}
-
-message RecvmsgRequest {
-  uint32 fd = 1;
-  uint32 length = 2;
-  bool sender = 3;
-  bool peek = 4;
-  bool trunc = 5;
-  uint32 cmsg_length = 6;
-}
-
-message OpenRequest {
-  bytes path = 1;
-  uint32 flags = 2;
-  uint32 mode = 3;
-}
-
-message OpenResponse {
-  oneof result {
-    uint32 error_number = 1;
-    uint32 fd = 2;
-  }
-}
-
-message ReadRequest {
-  uint32 fd = 1;
-  uint32 length = 2;
-}
-
-message ReadResponse {
-  oneof result {
-    uint32 error_number = 1;
-    bytes data = 2 [ctype = CORD];
-  }
-}
-
-message ReadFileRequest {
-  string path = 1;
-}
-
-message ReadFileResponse {
-  oneof result {
-    uint32 error_number = 1;
-    bytes data = 2 [ctype = CORD];
-  }
-}
-
-message WriteRequest {
-  uint32 fd = 1;
-  bytes data = 2 [ctype = CORD];
-}
-
-message WriteResponse {
-  oneof result {
-    uint32 error_number = 1;
-    uint32 length = 2;
-  }
-}
-
-message WriteFileRequest {
-  string path = 1;
-  bytes content = 2;
-}
-
-message WriteFileResponse {
-  uint32 error_number = 1;
-  uint32 written = 2;
-}
-
-message AddressResponse {
-  bytes address = 1;
-  uint32 length = 2;
-}
-
-message RecvmsgResponse {
-  message ResultPayload {
-    bytes data = 1 [ctype = CORD];
-    AddressResponse address = 2;
-    uint32 length = 3;
-    bytes cmsg_data = 4;
-  }
-  oneof result {
-    uint32 error_number = 1;
-    ResultPayload payload = 2;
-  }
-}
-
-message BindRequest {
-  uint32 fd = 1;
-  bytes address = 2;
-}
-
-message BindResponse {
-  uint32 error_number = 1;
-}
-
-message AcceptRequest {
-  uint32 fd = 1;
-  bool peer = 2;
-  int64 flags = 3;
-}
-
-message AcceptResponse {
-  message ResultPayload {
-    uint32 fd = 1;
-    AddressResponse address = 2;
-  }
-  oneof result {
-    uint32 error_number = 1;
-    ResultPayload payload = 2;
-  }
-}
-
-message ConnectRequest {
-  uint32 fd = 1;
-  bytes address = 2;
-}
-
-message ConnectResponse {
-  uint32 error_number = 1;
-}
-
-message ListenRequest {
-  uint32 fd = 1;
-  int64 backlog = 2;
-}
-
-message ListenResponse {
-  uint32 error_number = 1;
-}
-
-message ShutdownRequest {
-  uint32 fd = 1;
-  int64 how = 2;
-}
-
-message ShutdownResponse {
-  uint32 error_number = 1;
-}
-
-message CloseRequest {
-  uint32 fd = 1;
-}
-
-message CloseResponse {
-  uint32 error_number = 1;
-}
-
-message GetSockOptRequest {
-  uint32 fd = 1;
-  int64 level = 2;
-  int64 name = 3;
-  uint32 length = 4;
-}
-
-message GetSockOptResponse {
-  oneof result {
-    uint32 error_number = 1;
-    bytes opt = 2;
-  }
-}
-
-message SetSockOptRequest {
-  uint32 fd = 1;
-  int64 level = 2;
-  int64 name = 3;
-  bytes opt = 4;
-}
-
-message SetSockOptResponse {
-  uint32 error_number = 1;
-}
-
-message GetSockNameRequest {
-  uint32 fd = 1;
-}
-
-message GetSockNameResponse {
-  oneof result {
-    uint32 error_number = 1;
-    AddressResponse address = 2;
-  }
-}
-
-message GetPeerNameRequest {
-  uint32 fd = 1;
-}
-
-message GetPeerNameResponse {
-  oneof result {
-    uint32 error_number = 1;
-    AddressResponse address = 2;
-  }
-}
-
-message SocketRequest {
-  int64 family = 1;
-  int64 type = 2;
-  int64 protocol = 3;
-}
-
-message SocketResponse {
-  oneof result {
-    uint32 error_number = 1;
-    uint32 fd = 2;
-  }
-}
-
-message EpollWaitRequest {
-  uint32 fd = 1;
-  uint32 num_events = 2;
-  sint64 msec = 3;
-}
-
-message EpollEvent {
-  uint32 fd = 1;
-  uint32 events = 2;
-}
-
-message EpollEvents {
-  repeated EpollEvent events = 1;
-}
-
-message EpollWaitResponse {
-  oneof result {
-    uint32 error_number = 1;
-    EpollEvents events = 2;
-  }
-}
-
-message EpollCtlRequest {
-  uint32 epfd = 1;
-  int64 op = 2;
-  uint32 fd = 3;
-  EpollEvent event = 4;
-}
-
-message EpollCtlResponse {
-  uint32 error_number = 1;
-}
-
-message EpollCreate1Request {
-  int64 flag = 1;
-}
-
-message EpollCreate1Response {
-  oneof result {
-    uint32 error_number = 1;
-    uint32 fd = 2;
-  }
-}
-
-message PollRequest {
-  uint32 fd = 1;
-  uint32 events = 2;
-}
-
-message PollResponse {
-  oneof result {
-    uint32 error_number = 1;
-    uint32 events = 2;
-  }
-}
-
-message SyscallRequest {
-  oneof args {
-    SocketRequest socket = 1;
-    SendmsgRequest sendmsg = 2;
-    RecvmsgRequest recvmsg = 3;
-    BindRequest bind = 4;
-    AcceptRequest accept = 5;
-    ConnectRequest connect = 6;
-    ListenRequest listen = 7;
-    ShutdownRequest shutdown = 8;
-    CloseRequest close = 9;
-    GetSockOptRequest get_sock_opt = 10;
-    SetSockOptRequest set_sock_opt = 11;
-    GetSockNameRequest get_sock_name = 12;
-    GetPeerNameRequest get_peer_name = 13;
-    EpollWaitRequest epoll_wait = 14;
-    EpollCtlRequest epoll_ctl = 15;
-    EpollCreate1Request epoll_create1 = 16;
-    PollRequest poll = 17;
-    ReadRequest read = 18;
-    WriteRequest write = 19;
-    OpenRequest open = 20;
-    IOCtlRequest ioctl = 21;
-    WriteFileRequest write_file = 22;
-    ReadFileRequest read_file = 23;
-  }
-}
-
-message SyscallResponse {
-  oneof result {
-    SocketResponse socket = 1;
-    SendmsgResponse sendmsg = 2;
-    RecvmsgResponse recvmsg = 3;
-    BindResponse bind = 4;
-    AcceptResponse accept = 5;
-    ConnectResponse connect = 6;
-    ListenResponse listen = 7;
-    ShutdownResponse shutdown = 8;
-    CloseResponse close = 9;
-    GetSockOptResponse get_sock_opt = 10;
-    SetSockOptResponse set_sock_opt = 11;
-    GetSockNameResponse get_sock_name = 12;
-    GetPeerNameResponse get_peer_name = 13;
-    EpollWaitResponse epoll_wait = 14;
-    EpollCtlResponse epoll_ctl = 15;
-    EpollCreate1Response epoll_create1 = 16;
-    PollResponse poll = 17;
-    ReadResponse read = 18;
-    WriteResponse write = 19;
-    OpenResponse open = 20;
-    IOCtlResponse ioctl = 21;
-    WriteFileResponse write_file = 22;
-    ReadFileResponse read_file = 23;
-  }
-}
-- 
cgit v1.2.3


From 82ae857877fdf3492f40bca87657a07892c3f59b Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 6 Dec 2019 06:29:24 +0000
Subject: Enable build of test/syscall tests on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I277d6c708bbf5c3edd7c3568941cfd01dc122e17
---
 test/syscalls/linux/BUILD       | 55 ++++++++++++++++++++++++++++++++++-------
 test/syscalls/linux/bad.cc      |  3 ++-
 test/syscalls/linux/chroot.cc   |  2 +-
 test/syscalls/linux/fork.cc     |  3 +++
 test/syscalls/linux/getdents.cc | 10 +++++++-
 test/syscalls/linux/preadv2.cc  |  2 ++
 test/syscalls/linux/proc.cc     |  2 +-
 test/syscalls/linux/pwritev2.cc |  2 ++
 test/syscalls/linux/seccomp.cc  |  5 ++++
 test/syscalls/linux/stat.cc     |  2 ++
 test/util/signal_util.h         | 14 +++++++++++
 test/util/test_util.cc          |  2 +-
 12 files changed, 88 insertions(+), 14 deletions(-)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 064ce8429..68dcc598b 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -19,6 +19,16 @@ exports_files(
     visibility = ["//:sandbox"],
 )
 
+config_setting(
+    name = "x86_64",
+    constraint_values = ["@bazel_tools//platforms:x86_64"],
+)
+
+config_setting(
+    name = "aarch64",
+    constraint_values = ["@bazel_tools//platforms:aarch64"],
+)
+
 cc_binary(
     name = "sigaltstack_check",
     testonly = 1,
@@ -197,7 +207,10 @@ cc_binary(
 cc_binary(
     name = "32bit_test",
     testonly = 1,
-    srcs = ["32bit.cc"],
+    srcs = select({
+	":x86_64": ["32bit.cc"],
+	":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:memory_util",
@@ -584,7 +597,10 @@ cc_binary(
 cc_binary(
     name = "exceptions_test",
     testonly = 1,
-    srcs = ["exceptions.cc"],
+    srcs = select({
+        ":x86_64": ["exceptions.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:logging",
@@ -640,7 +656,10 @@ cc_binary(
 cc_binary(
     name = "exec_binary_test",
     testonly = 1,
-    srcs = ["exec_binary.cc"],
+    srcs = select({
+        ":x86_64": ["exec_binary.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
@@ -811,7 +830,10 @@ cc_binary(
 cc_binary(
     name = "fpsig_fork_test",
     testonly = 1,
-    srcs = ["fpsig_fork.cc"],
+    srcs = select({
+        ":x86_64": ["fpsig_fork.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:logging",
@@ -825,7 +847,10 @@ cc_binary(
 cc_binary(
     name = "fpsig_nested_test",
     testonly = 1,
-    srcs = ["fpsig_nested.cc"],
+    srcs = select({
+        ":x86_64": ["fpsig_nested.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:test_main",
@@ -1440,7 +1465,10 @@ cc_binary(
 cc_binary(
     name = "arch_prctl_test",
     testonly = 1,
-    srcs = ["arch_prctl.cc"],
+    srcs = select({
+        ":x86_64": ["arch_prctl.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:test_main",
@@ -2035,7 +2063,10 @@ cc_binary(
 cc_binary(
     name = "sigiret_test",
     testonly = 1,
-    srcs = ["sigiret.cc"],
+    srcs = select({
+        ":x86_64": ["sigiret.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:logging",
@@ -2043,7 +2074,10 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:timer_util",
         "@com_google_googletest//:gtest",
-    ],
+    ] + select({
+        ":x86_64": [],
+        ":aarch64": ["//test/util:test_main"],
+	}),
 )
 
 cc_binary(
@@ -3260,7 +3294,10 @@ cc_binary(
 cc_binary(
     name = "sysret_test",
     testonly = 1,
-    srcs = ["sysret.cc"],
+    srcs = select({
+        ":x86_64": ["sysret.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:logging",
diff --git a/test/syscalls/linux/bad.cc b/test/syscalls/linux/bad.cc
index f246a799e..9e4d8ea57 100644
--- a/test/syscalls/linux/bad.cc
+++ b/test/syscalls/linux/bad.cc
@@ -22,12 +22,13 @@ namespace gvisor {
 namespace testing {
 
 namespace {
-
+#if defined(__x86_64__)
 TEST(BadSyscallTest, NotImplemented) {
   // get_kernel_syms is not supported in Linux > 2.6, and not implemented in
   // gVisor.
   EXPECT_THAT(syscall(SYS_get_kernel_syms), SyscallFailsWithErrno(ENOSYS));
 }
+#endif // defined(__x86_64__)
 
 TEST(BadSyscallTest, NegativeOne) {
   EXPECT_THAT(syscall(-1), SyscallFailsWithErrno(ENOSYS));
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index 04bc2d7b9..0a2d44a2c 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -162,7 +162,7 @@ TEST(ChrootTest, DotDotFromOpenFD) {
 
   // getdents on fd should not error.
   char buf[1024];
-  ASSERT_THAT(syscall(SYS_getdents, fd.get(), buf, sizeof(buf)),
+  ASSERT_THAT(syscall(SYS_getdents64, fd.get(), buf, sizeof(buf)),
               SyscallSucceeds());
 }
 
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index 371890110..906f3358d 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -215,6 +215,8 @@ TEST_F(ForkTest, PrivateMapping) {
   EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
 }
 
+// CPUID is x86 specific.
+#ifdef __x86_64__
 // Test that cpuid works after a fork.
 TEST_F(ForkTest, Cpuid) {
   pid_t child = Fork();
@@ -227,6 +229,7 @@ TEST_F(ForkTest, Cpuid) {
   }
   EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
 }
+#endif
 
 TEST_F(ForkTest, Mmap) {
   pid_t child = Fork();
diff --git a/test/syscalls/linux/getdents.cc b/test/syscalls/linux/getdents.cc
index ad2dbacb8..bfd18d4ff 100644
--- a/test/syscalls/linux/getdents.cc
+++ b/test/syscalls/linux/getdents.cc
@@ -228,19 +228,27 @@ class GetdentsTest : public ::testing::Test {
 
 // Multiple template parameters are not allowed, so we must use explicit
 // template specialization to set the syscall number.
+#ifdef __x86_64__
 template <>
 int GetdentsTest<struct linux_dirent>::SyscallNum() {
   return SYS_getdents;
 }
+#endif
 
 template <>
 int GetdentsTest<struct linux_dirent64>::SyscallNum() {
   return SYS_getdents64;
 }
 
-// Test both legacy getdents and getdents64.
+#ifdef __x86_64__
+// Test both legacy getdents and getdents64 on x86_64.
 typedef ::testing::Types<struct linux_dirent, struct linux_dirent64>
     GetdentsTypes;
+#elif __aarch64__
+// Test only getdents64 on arm64.
+typedef ::testing::Types<struct linux_dirent64>
+    GetdentsTypes;
+#endif
 TYPED_TEST_SUITE(GetdentsTest, GetdentsTypes);
 
 // N.B. TYPED_TESTs require explicitly using this-> to access members of
diff --git a/test/syscalls/linux/preadv2.cc b/test/syscalls/linux/preadv2.cc
index c9246367d..3eeaf6ad8 100644
--- a/test/syscalls/linux/preadv2.cc
+++ b/test/syscalls/linux/preadv2.cc
@@ -35,6 +35,8 @@ namespace {
 #ifndef SYS_preadv2
 #if defined(__x86_64__)
 #define SYS_preadv2 327
+#elif defined(__aarch64__)
+#define SYS_preadv2 286
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 8cf08991b..5b4f29cd9 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1986,7 +1986,7 @@ TEST(Proc, GetdentsEnoent) {
       },
       nullptr, nullptr));
   char buf[1024];
-  ASSERT_THAT(syscall(SYS_getdents, fd.get(), buf, sizeof(buf)),
+  ASSERT_THAT(syscall(SYS_getdents64, fd.get(), buf, sizeof(buf)),
               SyscallFailsWithErrno(ENOENT));
 }
 
diff --git a/test/syscalls/linux/pwritev2.cc b/test/syscalls/linux/pwritev2.cc
index 1dbc0d6df..3fe5a600f 100644
--- a/test/syscalls/linux/pwritev2.cc
+++ b/test/syscalls/linux/pwritev2.cc
@@ -34,6 +34,8 @@ namespace {
 #ifndef SYS_pwritev2
 #if defined(__x86_64__)
 #define SYS_pwritev2 328
+#elif defined(__aarch64__)
+#define SYS_pwritev2 287
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 7e41fe7d8..6d7e543b9 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -49,7 +49,12 @@ namespace testing {
 namespace {
 
 // A syscall not implemented by Linux that we don't expect to be called.
+#ifdef __x86_64__
 constexpr uint32_t kFilteredSyscall = SYS_vserver;
+#elif __aarch64__
+// Using arch_specific_syscalls which are not implemented on arm64.
+constexpr uint32_t kFilteredSyscall = SYS_arch_specific_syscall+15;
+#endif
 
 // Applies a seccomp-bpf filter that returns `filtered_result` for
 // `sysno` and allows all other syscalls. Async-signal-safe.
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index 30de2f8ff..7a99f2636 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -557,6 +557,8 @@ TEST(SimpleStatTest, AnonDeviceAllocatesUniqueInodesAcrossSaveRestore) {
 #ifndef SYS_statx
 #if defined(__x86_64__)
 #define SYS_statx 332
+#elif defined(__aarch64__)
+#define SYS_statx 291
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/util/signal_util.h b/test/util/signal_util.h
index bcf85c337..e7b66aa51 100644
--- a/test/util/signal_util.h
+++ b/test/util/signal_util.h
@@ -85,6 +85,20 @@ inline void FixupFault(ucontext_t* ctx) {
   // The encoding is 0x48 0xab 0x00.
   ctx->uc_mcontext.gregs[REG_RIP] += 3;
 }
+#elif __aarch64__
+inline void Fault() {
+  // Zero and dereference x0.
+  asm("mov xzr, x0\r\n"
+      "str xzr, [x0]\r\n"
+      :
+      :
+      : "x0");
+}
+
+inline void FixupFault(ucontext_t* ctx) {
+  // Skip the bad instruction above.
+  ctx->uc_mcontext.pc += 4;
+}
 #endif
 
 }  // namespace testing
diff --git a/test/util/test_util.cc b/test/util/test_util.cc
index 848504c88..a4f78eec2 100644
--- a/test/util/test_util.cc
+++ b/test/util/test_util.cc
@@ -76,7 +76,6 @@ bool IsRunningWithHostinet() {
       "xchg %%rdi, %%rbx\n"                \
       : "=a"(a), "=D"(b), "=c"(c), "=d"(d) \
       : "a"(a_inp), "2"(c_inp))
-#endif  // defined(__x86_64__)
 
 CPUVendor GetCPUVendor() {
   uint32_t eax, ebx, ecx, edx;
@@ -93,6 +92,7 @@ CPUVendor GetCPUVendor() {
   }
   return CPUVendor::kUnknownVendor;
 }
+#endif  // defined(__x86_64__)
 
 bool operator==(const KernelVersion& first, const KernelVersion& second) {
   return first.major == second.major && first.minor == second.minor &&
-- 
cgit v1.2.3


From 345df7cab48ac79bccf2620900cd972b3026296d Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 17 Jan 2020 08:10:14 -0800
Subject: Add explanation for implementation of BSD full file locks.

PiperOrigin-RevId: 290272560
---
 pkg/sentry/fs/lock/lock.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go
index 41b040818..926538d90 100644
--- a/pkg/sentry/fs/lock/lock.go
+++ b/pkg/sentry/fs/lock/lock.go
@@ -78,6 +78,9 @@ const (
 )
 
 // LockEOF is the maximal possible end of a regional file lock.
+//
+// A BSD-style full file lock can be represented as a regional file lock from
+// offset 0 to LockEOF.
 const LockEOF = math.MaxUint64
 
 // Lock is a regional file lock.  It consists of either a single writer
-- 
cgit v1.2.3


From acf2d6dcc34501d2573f9c3f2b6da80308f3267e Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 17 Jan 2020 08:22:21 -0800
Subject: Enable stat syscall support on arm64.

x86 and arm64 use a different stat struct in Linux
kernel, so the stat() syscall implementation has
to handle the file stat data separately.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: If3986e915a667362257a54e7fbbcc1fe18951015
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/1493 from xiaobo55x:stat f15a216d9297eb9a96d2c483d396a9919145d7fa
PiperOrigin-RevId: 290274287
---
 pkg/sentry/syscalls/linux/BUILD             |  2 +
 pkg/sentry/syscalls/linux/linux64_arm64.go  |  8 +++
 pkg/sentry/syscalls/linux/sys_stat.go       | 51 -------------------
 pkg/sentry/syscalls/linux/sys_stat_amd64.go | 75 ++++++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/sys_stat_arm64.go | 77 +++++++++++++++++++++++++++++
 5 files changed, 162 insertions(+), 51 deletions(-)
 create mode 100644 pkg/sentry/syscalls/linux/sys_stat_amd64.go
 create mode 100644 pkg/sentry/syscalls/linux/sys_stat_arm64.go

diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 430d796ba..917f74e07 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -42,6 +42,8 @@ go_library(
         "sys_socket.go",
         "sys_splice.go",
         "sys_stat.go",
+        "sys_stat_amd64.go",
+        "sys_stat_arm64.go",
         "sys_sync.go",
         "sys_sysinfo.go",
         "sys_syslog.go",
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 8c1b20911..c9629f6f3 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -61,6 +61,7 @@ var ARM64 = &kernel.SyscallTable{
 		22:  syscalls.Supported("epoll_pwait", EpollPwait),
 		23:  syscalls.Supported("dup", Dup),
 		24:  syscalls.Supported("dup3", Dup3),
+		25:  syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
 		26:  syscalls.Supported("inotify_init1", InotifyInit1),
 		27:  syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
 		28:  syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
@@ -78,7 +79,9 @@ var ARM64 = &kernel.SyscallTable{
 		40:  syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil),
 		41:  syscalls.Error("pivot_root", syserror.EPERM, "", nil),
 		42:  syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil),
+		43:  syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil),
 		44:  syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil),
+		45:  syscalls.Supported("truncate", Truncate),
 		46:  syscalls.Supported("ftruncate", Ftruncate),
 		47:  syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
 		48:  syscalls.Supported("faccessat", Faccessat),
@@ -112,6 +115,7 @@ var ARM64 = &kernel.SyscallTable{
 		76:  syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
 		77:  syscalls.Supported("tee", Tee),
 		78:  syscalls.Supported("readlinkat", Readlinkat),
+		79:  syscalls.Supported("fstatat", Fstatat),
 		80:  syscalls.Supported("fstat", Fstat),
 		81:  syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil),
 		82:  syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil),
@@ -254,6 +258,8 @@ var ARM64 = &kernel.SyscallTable{
 		219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
 		220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil),
 		221: syscalls.Supported("execve", Execve),
+		222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil),
+		223: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
 		224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
 		225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
 		226: syscalls.Supported("mprotect", Mprotect),
@@ -299,6 +305,8 @@ var ARM64 = &kernel.SyscallTable{
 		282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
 		283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
 		284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+
+		// Syscalls after 284 are "backports" from versions of Linux after 4.4.
 		285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
 		286: syscalls.Supported("preadv2", Preadv2),
 		287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 5556bc276..69b17b799 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -16,7 +16,6 @@ package linux
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -125,56 +124,6 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error {
 	return copyOutStat(t, statAddr, f.Dirent.Inode.StableAttr, uattr)
 }
 
-// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
-// address dst in t's address space. It encodes the stat struct to bytes
-// manually, as stat() is a very common syscall for many applications, and
-// t.CopyObjectOut has noticeable performance impact due to its many slice
-// allocations and use of reflection.
-func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
-	b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
-
-	// Dev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
-	// Ino (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
-	// Nlink (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links)
-	// Mode (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
-	// UID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
-	// GID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
-	// Padding (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, 0)
-	// Rdev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
-	// Size (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
-	// Blksize (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.BlockSize))
-	// Blocks (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
-
-	// ATime
-	atime := uattr.AccessTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
-
-	// MTime
-	mtime := uattr.ModificationTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
-
-	// CTime
-	ctime := uattr.StatusChangeTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
-
-	_, err := t.CopyOutBytes(dst, b)
-	return err
-}
-
 // Statx implements linux syscall statx(2).
 func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
new file mode 100644
index 000000000..58afb4a9a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
@@ -0,0 +1,75 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//+build amd64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
+// address dst in t's address space. It encodes the stat struct to bytes
+// manually, as stat() is a very common syscall for many applications, and
+// t.CopyObjectOut has noticeable performance impact due to its many slice
+// allocations and use of reflection.
+func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
+	b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
+
+	// Dev (uint64)
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
+	// Ino (uint64)
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
+	// Nlink (uint64)
+	b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links)
+	// Mode (uint32)
+	b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
+	// UID (uint32)
+	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
+	// GID (uint32)
+	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
+	// Padding (uint32)
+	b = binary.AppendUint32(b, usermem.ByteOrder, 0)
+	// Rdev (uint64)
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
+	// Size (uint64)
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
+	// Blksize (uint64)
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.BlockSize))
+	// Blocks (uint64)
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
+
+	// ATime
+	atime := uattr.AccessTime.Timespec()
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
+
+	// MTime
+	mtime := uattr.ModificationTime.Timespec()
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
+
+	// CTime
+	ctime := uattr.StatusChangeTime.Timespec()
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
+
+	_, err := t.CopyOutBytes(dst, b)
+	return err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
new file mode 100644
index 000000000..3e1251e0b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
@@ -0,0 +1,77 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//+build arm64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
+// address dst in t's address space. It encodes the stat struct to bytes
+// manually, as stat() is a very common syscall for many applications, and
+// t.CopyObjectOut has noticeable performance impact due to its many slice
+// allocations and use of reflection.
+func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
+	b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
+
+	// Dev (uint64)
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
+	// Ino (uint64)
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
+	// Mode (uint32)
+	b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
+	// Nlink (uint32)
+	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Links))
+	// UID (uint32)
+	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
+	// GID (uint32)
+	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
+	// Rdev (uint64)
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
+	// Padding (uint64)
+	b = binary.AppendUint64(b, usermem.ByteOrder, 0)
+	// Size (uint64)
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
+	// Blksize (uint32)
+	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(sattr.BlockSize))
+	// Padding (uint32)
+	b = binary.AppendUint32(b, usermem.ByteOrder, 0)
+	// Blocks (uint64)
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
+
+	// ATime
+	atime := uattr.AccessTime.Timespec()
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
+
+	// MTime
+	mtime := uattr.ModificationTime.Timespec()
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
+
+	// CTime
+	ctime := uattr.StatusChangeTime.Timespec()
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
+	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
+
+	_, err := t.CopyOutBytes(dst, b)
+	return err
+}
-- 
cgit v1.2.3


From ff9960985848a48863c01f91acd5b34d3e83a9c5 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 17 Jan 2020 09:33:14 -0800
Subject: Add /proc/net/* files

Updates #1195

PiperOrigin-RevId: 290285420
---
 pkg/sentry/fsimpl/proc/BUILD             |   2 +-
 pkg/sentry/fsimpl/proc/tasks.go          |   1 +
 pkg/sentry/fsimpl/proc/tasks_net.go      | 541 ++++++++++++++++++++++++++++---
 pkg/sentry/fsimpl/proc/tasks_sys_test.go |   4 +-
 pkg/sentry/fsimpl/proc/tasks_test.go     |   1 +
 5 files changed, 499 insertions(+), 50 deletions(-)

diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index e92564b5d..f69aa19c4 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -18,7 +18,6 @@ go_library(
     importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc",
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/log",
         "//pkg/sentry/context",
         "//pkg/sentry/fs",
@@ -37,6 +36,7 @@ go_library(
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
+        "//pkg/tcpip/header",
     ],
 )
 
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index a97b1753a..5646c602a 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -67,6 +67,7 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames
 		"sys":     newSysDir(root, inoGen),
 		"meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}),
 		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"),
+		"net":     newNetDir(root, inoGen, k),
 		"stat":    newDentry(root, inoGen.NextIno(), 0444, &statData{}),
 		"uptime":  newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}),
 		"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}),
diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go
index 06dc43c26..3dbf3ba41 100644
--- a/pkg/sentry/fsimpl/proc/tasks_net.go
+++ b/pkg/sentry/fsimpl/proc/tasks_net.go
@@ -17,33 +17,88 @@ package proc
 import (
 	"bytes"
 	"fmt"
+	"io"
+	"reflect"
+	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
+func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
+	var contents map[string]*kernfs.Dentry
+	if stack := k.NetworkStack(); stack != nil {
+		const (
+			arp       = "IP address       HW type     Flags       HW address            Mask     Device"
+			netlink   = "sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode"
+			packet    = "sk       RefCnt Type Proto  Iface R Rmem   User   Inode"
+			protocols = "protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em"
+			ptype     = "Type Device      Function"
+			upd6      = "  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode"
+		)
+		psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond))
+
+		contents = map[string]*kernfs.Dentry{
+			"dev":  newDentry(root, inoGen.NextIno(), 0444, &netDevData{stack: stack}),
+			"snmp": newDentry(root, inoGen.NextIno(), 0444, &netSnmpData{stack: stack}),
+
+			// The following files are simple stubs until they are implemented in
+			// netstack, if the file contains a header the stub is just the header
+			// otherwise it is an empty file.
+			"arp":       newDentry(root, inoGen.NextIno(), 0444, newStaticFile(arp)),
+			"netlink":   newDentry(root, inoGen.NextIno(), 0444, newStaticFile(netlink)),
+			"netstat":   newDentry(root, inoGen.NextIno(), 0444, &netStatData{}),
+			"packet":    newDentry(root, inoGen.NextIno(), 0444, newStaticFile(packet)),
+			"protocols": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(protocols)),
+
+			// Linux sets psched values to: nsec per usec, psched tick in ns, 1000000,
+			// high res timer ticks per sec (ClockGetres returns 1ns resolution).
+			"psched": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(psched)),
+			"ptype":  newDentry(root, inoGen.NextIno(), 0444, newStaticFile(ptype)),
+			"route":  newDentry(root, inoGen.NextIno(), 0444, &netRouteData{stack: stack}),
+			"tcp":    newDentry(root, inoGen.NextIno(), 0444, &netTCPData{kernel: k}),
+			"udp":    newDentry(root, inoGen.NextIno(), 0444, &netUDPData{kernel: k}),
+			"unix":   newDentry(root, inoGen.NextIno(), 0444, &netUnixData{kernel: k}),
+		}
+
+		if stack.SupportsIPv6() {
+			contents["if_inet6"] = newDentry(root, inoGen.NextIno(), 0444, &ifinet6{stack: stack})
+			contents["ipv6_route"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(""))
+			contents["tcp6"] = newDentry(root, inoGen.NextIno(), 0444, &netTCP6Data{kernel: k})
+			contents["udp6"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(upd6))
+		}
+	}
+
+	return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents)
+}
+
 // ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6.
 //
 // +stateify savable
 type ifinet6 struct {
-	s inet.Stack
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
 }
 
-var _ vfs.DynamicBytesSource = (*ifinet6)(nil)
+var _ dynamicInode = (*ifinet6)(nil)
 
 func (n *ifinet6) contents() []string {
 	var lines []string
-	nics := n.s.Interfaces()
-	for id, naddrs := range n.s.InterfaceAddrs() {
+	nics := n.stack.Interfaces()
+	for id, naddrs := range n.stack.InterfaceAddrs() {
 		nic, ok := nics[id]
 		if !ok {
 			// NIC was added after NICNames was called. We'll just ignore it.
@@ -77,18 +132,20 @@ func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
-// netDev implements vfs.DynamicBytesSource for /proc/net/dev.
+// netDevData implements vfs.DynamicBytesSource for /proc/net/dev.
 //
 // +stateify savable
-type netDev struct {
-	s inet.Stack
+type netDevData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
 }
 
-var _ vfs.DynamicBytesSource = (*netDev)(nil)
+var _ dynamicInode = (*netDevData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
-func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	interfaces := n.s.Interfaces()
+func (n *netDevData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	interfaces := n.stack.Interfaces()
 	buf.WriteString("Inter-|   Receive                                                |  Transmit\n")
 	buf.WriteString(" face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed\n")
 
@@ -96,7 +153,7 @@ func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		// Implements the same format as
 		// net/core/net-procfs.c:dev_seq_printf_stats.
 		var stats inet.StatDev
-		if err := n.s.Statistics(&stats, i.Name); err != nil {
+		if err := n.stack.Statistics(&stats, i.Name); err != nil {
 			log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err)
 			continue
 		}
@@ -128,19 +185,21 @@ func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
-// netUnix implements vfs.DynamicBytesSource for /proc/net/unix.
+// netUnixData implements vfs.DynamicBytesSource for /proc/net/unix.
 //
 // +stateify savable
-type netUnix struct {
-	k *kernel.Kernel
+type netUnixData struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
 }
 
-var _ vfs.DynamicBytesSource = (*netUnix)(nil)
+var _ dynamicInode = (*netUnixData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
-func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error {
+func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	buf.WriteString("Num       RefCount Protocol Flags    Type St Inode Path\n")
-	for _, se := range n.k.ListSockets() {
+	for _, se := range n.kernel.ListSockets() {
 		s := se.Sock.Get()
 		if s == nil {
 			log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
@@ -213,22 +272,72 @@ func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
-// netTCP implements vfs.DynamicBytesSource for /proc/net/tcp.
-//
-// +stateify savable
-type netTCP struct {
-	k *kernel.Kernel
+func networkToHost16(n uint16) uint16 {
+	// n is in network byte order, so is big-endian. The most-significant byte
+	// should be stored in the lower address.
+	//
+	// We manually inline binary.BigEndian.Uint16() because Go does not support
+	// non-primitive consts, so binary.BigEndian is a (mutable) var, so calls to
+	// binary.BigEndian.Uint16() require a read of binary.BigEndian and an
+	// interface method call, defeating inlining.
+	buf := [2]byte{byte(n >> 8 & 0xff), byte(n & 0xff)}
+	return usermem.ByteOrder.Uint16(buf[:])
 }
 
-var _ vfs.DynamicBytesSource = (*netTCP)(nil)
+func writeInetAddr(w io.Writer, family int, i linux.SockAddr) {
+	switch family {
+	case linux.AF_INET:
+		var a linux.SockAddrInet
+		if i != nil {
+			a = *i.(*linux.SockAddrInet)
+		}
+
+		// linux.SockAddrInet.Port is stored in the network byte order and is
+		// printed like a number in host byte order. Note that all numbers in host
+		// byte order are printed with the most-significant byte first when
+		// formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux.
+		port := networkToHost16(a.Port)
+
+		// linux.SockAddrInet.Addr is stored as a byte slice in big-endian order
+		// (i.e. most-significant byte in index 0). Linux represents this as a
+		// __be32 which is a typedef for an unsigned int, and is printed with
+		// %X. This means that for a little-endian machine, Linux prints the
+		// least-significant byte of the address first. To emulate this, we first
+		// invert the byte order for the address using usermem.ByteOrder.Uint32,
+		// which makes it have the equivalent encoding to a __be32 on a little
+		// endian machine. Note that this operation is a no-op on a big endian
+		// machine. Then similar to Linux, we format it with %X, which will print
+		// the most-significant byte of the __be32 address first, which is now
+		// actually the least-significant byte of the original address in
+		// linux.SockAddrInet.Addr on little endian machines, due to the conversion.
+		addr := usermem.ByteOrder.Uint32(a.Addr[:])
+
+		fmt.Fprintf(w, "%08X:%04X ", addr, port)
+	case linux.AF_INET6:
+		var a linux.SockAddrInet6
+		if i != nil {
+			a = *i.(*linux.SockAddrInet6)
+		}
 
-func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error {
+		port := networkToHost16(a.Port)
+		addr0 := usermem.ByteOrder.Uint32(a.Addr[0:4])
+		addr1 := usermem.ByteOrder.Uint32(a.Addr[4:8])
+		addr2 := usermem.ByteOrder.Uint32(a.Addr[8:12])
+		addr3 := usermem.ByteOrder.Uint32(a.Addr[12:16])
+		fmt.Fprintf(w, "%08X%08X%08X%08X:%04X ", addr0, addr1, addr2, addr3, port)
+	}
+}
+
+func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, family int) error {
+	// t may be nil here if our caller is not part of a task goroutine. This can
+	// happen for example if we're here for "sentryctl cat". When t is nil,
+	// degrade gracefully and retrieve what we can.
 	t := kernel.TaskFromContext(ctx)
-	buf.WriteString("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode                                                     \n")
-	for _, se := range n.k.ListSockets() {
+
+	for _, se := range k.ListSockets() {
 		s := se.Sock.Get()
 		if s == nil {
-			log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock)
+			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
 			continue
 		}
 		sfile := s.(*fs.File)
@@ -236,7 +345,7 @@ func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		if !ok {
 			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
 		}
-		if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) {
+		if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
 			s.DecRef()
 			// Not tcp4 sockets.
 			continue
@@ -250,27 +359,23 @@ func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		// Field: sl; entry number.
 		fmt.Fprintf(buf, "%4d: ", se.ID)
 
-		portBuf := make([]byte, 2)
-
 		// Field: local_adddress.
-		var localAddr linux.SockAddrInet
-		if local, _, err := sops.GetSockName(t); err == nil {
-			localAddr = *local.(*linux.SockAddrInet)
+		var localAddr linux.SockAddr
+		if t != nil {
+			if local, _, err := sops.GetSockName(t); err == nil {
+				localAddr = local
+			}
 		}
-		binary.LittleEndian.PutUint16(portBuf, localAddr.Port)
-		fmt.Fprintf(buf, "%08X:%04X ",
-			binary.LittleEndian.Uint32(localAddr.Addr[:]),
-			portBuf)
+		writeInetAddr(buf, family, localAddr)
 
 		// Field: rem_address.
-		var remoteAddr linux.SockAddrInet
-		if remote, _, err := sops.GetPeerName(t); err == nil {
-			remoteAddr = *remote.(*linux.SockAddrInet)
+		var remoteAddr linux.SockAddr
+		if t != nil {
+			if remote, _, err := sops.GetPeerName(t); err == nil {
+				remoteAddr = remote
+			}
 		}
-		binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port)
-		fmt.Fprintf(buf, "%08X:%04X ",
-			binary.LittleEndian.Uint32(remoteAddr.Addr[:]),
-			portBuf)
+		writeInetAddr(buf, family, remoteAddr)
 
 		// Field: state; socket state.
 		fmt.Fprintf(buf, "%02X ", sops.State())
@@ -293,7 +398,8 @@ func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error {
 			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
 			fmt.Fprintf(buf, "%5d ", 0)
 		} else {
-			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
+			creds := auth.CredentialsFromContext(ctx)
+			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
 		}
 
 		// Field: timeout; number of unanswered 0-window probes.
@@ -335,3 +441,344 @@ func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error {
 
 	return nil
 }
+
+// netTCPData implements vfs.DynamicBytesSource for /proc/net/tcp.
+//
+// +stateify savable
+type netTCPData struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netTCPData)(nil)
+
+func (d *netTCPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode                                                     \n")
+	return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET)
+}
+
+// netTCP6Data implements vfs.DynamicBytesSource for /proc/net/tcp6.
+//
+// +stateify savable
+type netTCP6Data struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netTCP6Data)(nil)
+
+func (d *netTCP6Data) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n")
+	return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET6)
+}
+
+// netUDPData implements vfs.DynamicBytesSource for /proc/net/udp.
+//
+// +stateify savable
+type netUDPData struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netUDPData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	// t may be nil here if our caller is not part of a task goroutine. This can
+	// happen for example if we're here for "sentryctl cat". When t is nil,
+	// degrade gracefully and retrieve what we can.
+	t := kernel.TaskFromContext(ctx)
+
+	for _, se := range d.kernel.ListSockets() {
+		s := se.Sock.Get()
+		if s == nil {
+			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
+			continue
+		}
+		sfile := s.(*fs.File)
+		sops, ok := sfile.FileOperations.(socket.Socket)
+		if !ok {
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+		}
+		if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
+			s.DecRef()
+			// Not udp4 socket.
+			continue
+		}
+
+		// For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock().
+
+		// Field: sl; entry number.
+		fmt.Fprintf(buf, "%5d: ", se.ID)
+
+		// Field: local_adddress.
+		var localAddr linux.SockAddrInet
+		if t != nil {
+			if local, _, err := sops.GetSockName(t); err == nil {
+				localAddr = *local.(*linux.SockAddrInet)
+			}
+		}
+		writeInetAddr(buf, linux.AF_INET, &localAddr)
+
+		// Field: rem_address.
+		var remoteAddr linux.SockAddrInet
+		if t != nil {
+			if remote, _, err := sops.GetPeerName(t); err == nil {
+				remoteAddr = *remote.(*linux.SockAddrInet)
+			}
+		}
+		writeInetAddr(buf, linux.AF_INET, &remoteAddr)
+
+		// Field: state; socket state.
+		fmt.Fprintf(buf, "%02X ", sops.State())
+
+		// Field: tx_queue, rx_queue; number of packets in the transmit and
+		// receive queue. Unimplemented.
+		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
+
+		// Field: tr, tm->when. Always 0 for UDP.
+		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
+
+		// Field: retrnsmt. Always 0 for UDP.
+		fmt.Fprintf(buf, "%08X ", 0)
+
+		// Field: uid.
+		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
+		if err != nil {
+			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+			fmt.Fprintf(buf, "%5d ", 0)
+		} else {
+			creds := auth.CredentialsFromContext(ctx)
+			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
+		}
+
+		// Field: timeout. Always 0 for UDP.
+		fmt.Fprintf(buf, "%8d ", 0)
+
+		// Field: inode.
+		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+
+		// Field: ref; reference count on the socket inode. Don't count the ref
+		// we obtain while deferencing the weakref to this socket.
+		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+
+		// Field: Socket struct address. Redacted due to the same reason as
+		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
+		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
+
+		// Field: drops; number of dropped packets. Unimplemented.
+		fmt.Fprintf(buf, "%d", 0)
+
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef()
+	}
+	return nil
+}
+
+// netSnmpData implements vfs.DynamicBytesSource for /proc/net/snmp.
+//
+// +stateify savable
+type netSnmpData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netSnmpData)(nil)
+
+type snmpLine struct {
+	prefix string
+	header string
+}
+
+var snmp = []snmpLine{
+	{
+		prefix: "Ip",
+		header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates",
+	},
+	{
+		prefix: "Icmp",
+		header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps",
+	},
+	{
+		prefix: "IcmpMsg",
+	},
+	{
+		prefix: "Tcp",
+		header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors",
+	},
+	{
+		prefix: "Udp",
+		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
+	},
+	{
+		prefix: "UdpLite",
+		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
+	},
+}
+
+func toSlice(a interface{}) []uint64 {
+	v := reflect.Indirect(reflect.ValueOf(a))
+	return v.Slice(0, v.Len()).Interface().([]uint64)
+}
+
+func sprintSlice(s []uint64) string {
+	if len(s) == 0 {
+		return ""
+	}
+	r := fmt.Sprint(s)
+	return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice.
+}
+
+// Generate implements vfs.DynamicBytesSource.
+func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	types := []interface{}{
+		&inet.StatSNMPIP{},
+		&inet.StatSNMPICMP{},
+		nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats.
+		&inet.StatSNMPTCP{},
+		&inet.StatSNMPUDP{},
+		&inet.StatSNMPUDPLite{},
+	}
+	for i, stat := range types {
+		line := snmp[i]
+		if stat == nil {
+			fmt.Fprintf(buf, "%s:\n", line.prefix)
+			fmt.Fprintf(buf, "%s:\n", line.prefix)
+			continue
+		}
+		if err := d.stack.Statistics(stat, line.prefix); err != nil {
+			if err == syserror.EOPNOTSUPP {
+				log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
+			} else {
+				log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
+			}
+		}
+
+		fmt.Fprintf(buf, "%s: %s\n", line.prefix, line.header)
+
+		if line.prefix == "Tcp" {
+			tcp := stat.(*inet.StatSNMPTCP)
+			// "Tcp" needs special processing because MaxConn is signed. RFC 2012.
+			fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:]))
+		} else {
+			fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat)))
+		}
+	}
+	return nil
+}
+
+// netRouteData implements vfs.DynamicBytesSource for /proc/net/route.
+//
+// +stateify savable
+type netRouteData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netRouteData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
+func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT")
+
+	interfaces := d.stack.Interfaces()
+	for _, rt := range d.stack.RouteTable() {
+		// /proc/net/route only includes ipv4 routes.
+		if rt.Family != linux.AF_INET {
+			continue
+		}
+
+		// /proc/net/route does not include broadcast or multicast routes.
+		if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST {
+			continue
+		}
+
+		iface, ok := interfaces[rt.OutputInterface]
+		if !ok || iface.Name == "lo" {
+			continue
+		}
+
+		var (
+			gw     uint32
+			prefix uint32
+			flags  = linux.RTF_UP
+		)
+		if len(rt.GatewayAddr) == header.IPv4AddressSize {
+			flags |= linux.RTF_GATEWAY
+			gw = usermem.ByteOrder.Uint32(rt.GatewayAddr)
+		}
+		if len(rt.DstAddr) == header.IPv4AddressSize {
+			prefix = usermem.ByteOrder.Uint32(rt.DstAddr)
+		}
+		l := fmt.Sprintf(
+			"%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d",
+			iface.Name,
+			prefix,
+			gw,
+			flags,
+			0, // RefCnt.
+			0, // Use.
+			0, // Metric.
+			(uint32(1)<<rt.DstLen)-1,
+			0, // MTU.
+			0, // Window.
+			0, // RTT.
+		)
+		fmt.Fprintf(buf, "%-127s\n", l)
+	}
+	return nil
+}
+
+// netStatData implements vfs.DynamicBytesSource for /proc/net/netstat.
+//
+// +stateify savable
+type netStatData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netStatData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
+func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " +
+		"EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps " +
+		"LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive " +
+		"PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost " +
+		"ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog " +
+		"TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser " +
+		"TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging " +
+		"TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo " +
+		"TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit " +
+		"TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans " +
+		"TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes " +
+		"TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail " +
+		"TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent " +
+		"TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose " +
+		"TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed " +
+		"TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld " +
+		"TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected " +
+		"TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback " +
+		"TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter " +
+		"TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail " +
+		"TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK " +
+		"TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail " +
+		"TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow " +
+		"TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets " +
+		"TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv " +
+		"TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect " +
+		"TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd " +
+		"TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq " +
+		"TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge " +
+		"TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess")
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
index 20a77a8ca..0a1d3f34b 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
@@ -31,7 +31,7 @@ func newIPv6TestStack() *inet.TestStack {
 }
 
 func TestIfinet6NoAddresses(t *testing.T) {
-	n := &ifinet6{s: newIPv6TestStack()}
+	n := &ifinet6{stack: newIPv6TestStack()}
 	var buf bytes.Buffer
 	n.Generate(contexttest.Context(t), &buf)
 	if buf.Len() > 0 {
@@ -62,7 +62,7 @@ func TestIfinet6(t *testing.T) {
 		"101112131415161718191a1b1c1d1e1f 02 80 00 00     eth1\n": {},
 	}
 
-	n := &ifinet6{s: s}
+	n := &ifinet6{stack: s}
 	contents := n.contents()
 	if len(contents) != len(want) {
 		t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want))
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 6b58c16b9..8eddf95e0 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -73,6 +73,7 @@ func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
 		"loadavg":     {Type: linux.DT_REG},
 		"meminfo":     {Type: linux.DT_REG},
 		"mounts":      {Type: linux.DT_LNK},
+		"net":         {Type: linux.DT_DIR},
 		"self":        selfLink,
 		"stat":        {Type: linux.DT_REG},
 		"sys":         {Type: linux.DT_DIR},
-- 
cgit v1.2.3


From 8e8d0f96f651ce161dfe6003d738dbda28f7cb0e Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 17 Jan 2020 10:39:24 -0800
Subject: Add /proc/[pid]/cgroups file

Updates #1195

PiperOrigin-RevId: 290298266
---
 pkg/sentry/fsimpl/proc/filesystem.go | 13 ++++++++++++-
 pkg/sentry/fsimpl/proc/subtasks.go   | 18 ++++++++++--------
 pkg/sentry/fsimpl/proc/task.go       | 30 +++++++++++++++++++++++++-----
 pkg/sentry/fsimpl/proc/tasks.go      | 10 ++++++++--
 pkg/sentry/fsimpl/proc/tasks_test.go | 11 ++++++++++-
 5 files changed, 65 insertions(+), 17 deletions(-)

diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index e9cb7895f..f49819187 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -47,7 +47,12 @@ func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile
 	procfs := &kernfs.Filesystem{}
 	procfs.VFSFilesystem().Init(vfsObj, procfs)
 
-	_, dentry := newTasksInode(procfs, k, pidns)
+	var data *InternalData
+	if opts.InternalData != nil {
+		data = opts.InternalData.(*InternalData)
+	}
+
+	_, dentry := newTasksInode(procfs, k, pidns, data.Cgroups)
 	return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
 }
 
@@ -78,3 +83,9 @@ var _ dynamicInode = (*staticFile)(nil)
 func newStaticFile(data string) *staticFile {
 	return &staticFile{StaticData: vfs.StaticData{Data: data}}
 }
+
+// InternalData contains internal data passed in to the procfs mount via
+// vfs.GetFilesystemOptions.InternalData.
+type InternalData struct {
+	Cgroups map[string]string
+}
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 8892c5a11..91eded415 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -35,18 +35,20 @@ type subtasksInode struct {
 	kernfs.InodeAttrs
 	kernfs.OrderedChildren
 
-	task   *kernel.Task
-	pidns  *kernel.PIDNamespace
-	inoGen InoGenerator
+	task              *kernel.Task
+	pidns             *kernel.PIDNamespace
+	inoGen            InoGenerator
+	cgroupControllers map[string]string
 }
 
 var _ kernfs.Inode = (*subtasksInode)(nil)
 
-func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenerator) *kernfs.Dentry {
+func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenerator, cgroupControllers map[string]string) *kernfs.Dentry {
 	subInode := &subtasksInode{
-		task:   task,
-		pidns:  pidns,
-		inoGen: inoGen,
+		task:              task,
+		pidns:             pidns,
+		inoGen:            inoGen,
+		cgroupControllers: cgroupControllers,
 	}
 	// Note: credentials are overridden by taskOwnedInode.
 	subInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555)
@@ -79,7 +81,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e
 		return nil, syserror.ENOENT
 	}
 
-	subTaskDentry := newTaskInode(i.inoGen, subTask, i.pidns, false)
+	subTaskDentry := newTaskInode(i.inoGen, subTask, i.pidns, false, i.cgroupControllers)
 	return subTaskDentry.VFSDentry(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 621c17cfe..a0580f20d 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -15,6 +15,7 @@
 package proc
 
 import (
+	"bytes"
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -42,7 +43,7 @@ type taskInode struct {
 
 var _ kernfs.Inode = (*taskInode)(nil)
 
-func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool) *kernfs.Dentry {
+func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry {
 	contents := map[string]*kernfs.Dentry{
 		"auxv":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}),
 		"cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
@@ -68,11 +69,11 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames
 		"uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}),
 	}
 	if isThreadGroup {
-		contents["task"] = newSubtasks(task, pidns, inoGen)
+		contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers)
+	}
+	if len(cgroupControllers) > 0 {
+		contents["cgroup"] = newTaskOwnedFile(task, inoGen.NextIno(), 0444, newCgroupData(cgroupControllers))
 	}
-	//if len(p.cgroupControllers) > 0 {
-	//	contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers)
-	//}
 
 	taskInode := &taskInode{task: task}
 	// Note: credentials are overridden by taskOwnedInode.
@@ -227,3 +228,22 @@ func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentr
 	d.Init(taskInode)
 	return d
 }
+
+// newCgroupData creates inode that shows cgroup information.
+// From man 7 cgroups: "For each cgroup hierarchy of which the process is a
+// member, there is one entry containing three colon-separated fields:
+//   hierarchy-ID:controller-list:cgroup-path"
+func newCgroupData(controllers map[string]string) dynamicInode {
+	buf := bytes.Buffer{}
+
+	// The hierarchy ids must be positive integers (for cgroup v1), but the
+	// exact number does not matter, so long as they are unique. We can
+	// just use a counter, but since linux sorts this file in descending
+	// order, we must count down to preserve this behavior.
+	i := len(controllers)
+	for name, dir := range controllers {
+		fmt.Fprintf(&buf, "%d:%s:%s\n", i, name, dir)
+		i--
+	}
+	return newStaticFile(buf.String())
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 5646c602a..51f634716 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -54,11 +54,16 @@ type tasksInode struct {
 	// Linux. So handle them outside of OrderedChildren.
 	selfSymlink       *vfs.Dentry
 	threadSelfSymlink *vfs.Dentry
+
+	// cgroupControllers is a map of controller name to directory in the
+	// cgroup hierarchy. These controllers are immutable and will be listed
+	// in /proc/pid/cgroup if not nil.
+	cgroupControllers map[string]string
 }
 
 var _ kernfs.Inode = (*tasksInode)(nil)
 
-func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace) (*tasksInode, *kernfs.Dentry) {
+func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) {
 	root := auth.NewRootCredentials(pidns.UserNamespace())
 	contents := map[string]*kernfs.Dentry{
 		"cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))),
@@ -78,6 +83,7 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames
 		inoGen:            inoGen,
 		selfSymlink:       newSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(),
 		threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(),
+		cgroupControllers: cgroupControllers,
 	}
 	inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555)
 
@@ -111,7 +117,7 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 		return nil, syserror.ENOENT
 	}
 
-	taskDentry := newTaskInode(i.inoGen, task, i.pidns, true)
+	taskDentry := newTaskInode(i.inoGen, task, i.pidns, true, i.cgroupControllers)
 	return taskDentry.VFSDentry(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 8eddf95e0..002d2f73b 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -87,6 +87,7 @@ func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
 func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
 	wants := map[string]vfs.Dirent{
 		"auxv":    {Type: linux.DT_REG},
+		"cgroup":  {Type: linux.DT_REG},
 		"cmdline": {Type: linux.DT_REG},
 		"comm":    {Type: linux.DT_REG},
 		"environ": {Type: linux.DT_REG},
@@ -145,7 +146,15 @@ func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error)
 	vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &vfs.GetFilesystemOptions{})
+	fsOpts := vfs.GetFilesystemOptions{
+		InternalData: &InternalData{
+			Cgroups: map[string]string{
+				"cpuset": "/foo/cpuset",
+				"memory": "/foo/memory",
+			},
+		},
+	}
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &fsOpts)
 	if err != nil {
 		return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err)
 	}
-- 
cgit v1.2.3


From 80d0f9304484897e4307c9701ddbfaacb925715d Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 17 Jan 2020 11:20:29 -0800
Subject: Fix data race in tty.queue.readableSize.

We were setting queue.readable without holding the lock.

PiperOrigin-RevId: 290306922
---
 pkg/sentry/fs/tty/line_discipline.go |  4 +++-
 pkg/sentry/fs/tty/queue.go           | 11 ++---------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go
index 894964260..9fe02657e 100644
--- a/pkg/sentry/fs/tty/line_discipline.go
+++ b/pkg/sentry/fs/tty/line_discipline.go
@@ -140,8 +140,10 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc
 	// buffer to its read buffer. Anything already in the read buffer is
 	// now readable.
 	if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) {
-		l.inQueue.pushWaitBuf(l)
+		l.inQueue.mu.Lock()
+		l.inQueue.pushWaitBufLocked(l)
 		l.inQueue.readable = true
+		l.inQueue.mu.Unlock()
 		l.slaveWaiter.Notify(waiter.EventIn)
 	}
 
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go
index 8b5d4699a..21ccc6f32 100644
--- a/pkg/sentry/fs/tty/queue.go
+++ b/pkg/sentry/fs/tty/queue.go
@@ -197,16 +197,9 @@ func (q *queue) writeBytes(b []byte, l *lineDiscipline) {
 	q.pushWaitBufLocked(l)
 }
 
-// pushWaitBuf fills the queue's read buffer with data from the wait buffer.
+// pushWaitBufLocked fills the queue's read buffer with data from the wait
+// buffer.
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
-func (q *queue) pushWaitBuf(l *lineDiscipline) int {
-	q.mu.Lock()
-	defer q.mu.Unlock()
-	return q.pushWaitBufLocked(l)
-}
-
 // Preconditions:
 // * l.termiosMu must be held for reading.
 // * q.mu must be locked.
-- 
cgit v1.2.3


From 23fa847910eeee05babeea4f712b905115eeb865 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Fri, 17 Jan 2020 11:40:51 -0800
Subject: Remove addPermanentAddressLocked

It was possible to use this function incorrectly, and its separation
wasn't buying us anything.

PiperOrigin-RevId: 290311100
---
 pkg/tcpip/stack/ndp.go | 21 ++++++++++-----------
 pkg/tcpip/stack/nic.go | 46 ++++++++++++++++++----------------------------
 2 files changed, 28 insertions(+), 39 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index c99d387d5..7d4b41dfa 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -432,13 +432,12 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 	// Should not attempt to perform DAD on an address that is currently in
 	// the DAD process.
 	if _, ok := ndp.dad[addr]; ok {
-		// Should never happen because we should only ever call this
-		// function for newly created addresses. If we attemped to
-		// "add" an address that already existed, we would returned an
-		// error since we attempted to add a duplicate address, or its
-		// reference count would have been increased without doing the
-		// work that would have been done for an address that was brand
-		// new. See NIC.addPermanentAddressLocked.
+		// Should never happen because we should only ever call this function for
+		// newly created addresses. If we attemped to "add" an address that already
+		// existed, we would get an error since we attempted to add a duplicate
+		// address, or its reference count would have been increased without doing
+		// the work that would have been done for an address that was brand new.
+		// See NIC.addAddressLocked.
 		panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID()))
 	}
 
@@ -994,7 +993,7 @@ func (ndp *ndpState) newAutoGenAddress(prefix tcpip.Subnet, pl, vl time.Duration
 	// If the preferred lifetime is zero, then the address should be considered
 	// deprecated.
 	deprecated := pl == 0
-	ref, err := ndp.nic.addPermanentAddressLocked(protocolAddr, FirstPrimaryEndpoint, slaac, deprecated)
+	ref, err := ndp.nic.addAddressLocked(protocolAddr, FirstPrimaryEndpoint, permanent, slaac, deprecated)
 	if err != nil {
 		log.Fatalf("ndp: error when adding address %s: %s", protocolAddr, err)
 	}
@@ -1164,7 +1163,7 @@ func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bo
 //
 // The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) cleanupHostOnlyState() {
-	for addr, _ := range ndp.autoGenAddresses {
+	for addr := range ndp.autoGenAddresses {
 		ndp.invalidateAutoGenAddress(addr)
 	}
 
@@ -1172,7 +1171,7 @@ func (ndp *ndpState) cleanupHostOnlyState() {
 		log.Fatalf("ndp: still have auto-generated addresses after cleaning up, found = %d", got)
 	}
 
-	for prefix, _ := range ndp.onLinkPrefixes {
+	for prefix := range ndp.onLinkPrefixes {
 		ndp.invalidateOnLinkPrefix(prefix)
 	}
 
@@ -1180,7 +1179,7 @@ func (ndp *ndpState) cleanupHostOnlyState() {
 		log.Fatalf("ndp: still have discovered on-link prefixes after cleaning up, found = %d", got)
 	}
 
-	for router, _ := range ndp.defaultRouters {
+	for router := range ndp.defaultRouters {
 		ndp.invalidateDefaultRouter(router)
 	}
 
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 4452a1302..53abf29e5 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -196,13 +196,13 @@ func (n *NIC) enable() *tcpip.Error {
 			addr = header.LinkLocalAddr(l2addr)
 		}
 
-		if _, err := n.addPermanentAddressLocked(tcpip.ProtocolAddress{
+		if _, err := n.addAddressLocked(tcpip.ProtocolAddress{
 			Protocol: header.IPv6ProtocolNumber,
 			AddressWithPrefix: tcpip.AddressWithPrefix{
 				Address:   addr,
 				PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen,
 			},
-		}, CanBePrimaryEndpoint, static, false /* deprecated */); err != nil {
+		}, CanBePrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
 			return err
 		}
 	}
@@ -533,14 +533,21 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 	return ref
 }
 
-// addPermanentAddressLocked adds a permanent address to n.
+// addAddressLocked adds a new protocolAddress to n.
 //
-// If n already has the address in a non-permanent state,
-// addPermanentAddressLocked will promote it to permanent and update the
-// endpoint with the properties provided.
-func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, configType networkEndpointConfigType, deprecated bool) (*referencedNetworkEndpoint, *tcpip.Error) {
-	id := NetworkEndpointID{protocolAddress.AddressWithPrefix.Address}
+// If n already has the address in a non-permanent state, and the kind given is
+// permanent, that address will be promoted in place and its properties set to
+// the properties provided. Otherwise, it returns tcpip.ErrDuplicateAddress.
+func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind, configType networkEndpointConfigType, deprecated bool) (*referencedNetworkEndpoint, *tcpip.Error) {
+	// TODO(b/141022673): Validate IP addresses before adding them.
+
+	// Sanity check.
+	id := NetworkEndpointID{LocalAddress: protocolAddress.AddressWithPrefix.Address}
 	if ref, ok := n.endpoints[id]; ok {
+		// Endpoint already exists.
+		if kind != permanent {
+			return nil, tcpip.ErrDuplicateAddress
+		}
 		switch ref.getKind() {
 		case permanentTentative, permanent:
 			// The NIC already have a permanent endpoint with that address.
@@ -585,23 +592,6 @@ func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, p
 		}
 	}
 
-	return n.addAddressLocked(protocolAddress, peb, permanent, configType, deprecated)
-}
-
-// addAddressLocked adds a new protocolAddress to n.
-//
-// If the address is already known by n (irrespective of the state it is in),
-// addAddressLocked does nothing and returns tcpip.ErrDuplicateAddress.
-func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind, configType networkEndpointConfigType, deprecated bool) (*referencedNetworkEndpoint, *tcpip.Error) {
-	// TODO(b/141022673): Validate IP address before adding them.
-
-	// Sanity check.
-	id := NetworkEndpointID{protocolAddress.AddressWithPrefix.Address}
-	if _, ok := n.endpoints[id]; ok {
-		// Endpoint already exists.
-		return nil, tcpip.ErrDuplicateAddress
-	}
-
 	netProto, ok := n.stack.networkProtocols[protocolAddress.Protocol]
 	if !ok {
 		return nil, tcpip.ErrUnknownProtocol
@@ -666,7 +656,7 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 func (n *NIC) AddAddress(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) *tcpip.Error {
 	// Add the endpoint.
 	n.mu.Lock()
-	_, err := n.addPermanentAddressLocked(protocolAddress, peb, static, false /* deprecated */)
+	_, err := n.addAddressLocked(protocolAddress, peb, permanent, static, false /* deprecated */)
 	n.mu.Unlock()
 
 	return err
@@ -942,13 +932,13 @@ func (n *NIC) joinGroupLocked(protocol tcpip.NetworkProtocolNumber, addr tcpip.A
 		if !ok {
 			return tcpip.ErrUnknownProtocol
 		}
-		if _, err := n.addPermanentAddressLocked(tcpip.ProtocolAddress{
+		if _, err := n.addAddressLocked(tcpip.ProtocolAddress{
 			Protocol: protocol,
 			AddressWithPrefix: tcpip.AddressWithPrefix{
 				Address:   addr,
 				PrefixLen: netProto.DefaultPrefixLen(),
 			},
-		}, NeverPrimaryEndpoint, static, false /* deprecated */); err != nil {
+		}, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
 			return err
 		}
 	}
-- 
cgit v1.2.3


From 9073521098ee52cdda74a193565b7bbe75d8c35a Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 17 Jan 2020 13:31:26 -0800
Subject: Convert EventMask to uint64

It is used for signalfd where the maximum signal is 64.

PiperOrigin-RevId: 290331008
---
 pkg/waiter/waiter.go            |   2 +-
 test/syscalls/BUILD             |   2 +
 test/syscalls/linux/signalfd.cc | 118 ++++++++++++++++++++++++----------------
 3 files changed, 74 insertions(+), 48 deletions(-)

diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go
index f708e95fa..707eb085b 100644
--- a/pkg/waiter/waiter.go
+++ b/pkg/waiter/waiter.go
@@ -62,7 +62,7 @@ import (
 )
 
 // EventMask represents io events as used in the poll() syscall.
-type EventMask uint16
+type EventMask uint64
 
 // Events that waiters can wait on. The meaning is the same as those in the
 // poll() syscall.
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 829693e8e..90d52e73b 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -380,6 +380,8 @@ syscall_test(test = "//test/syscalls/linux:rseq_test")
 
 syscall_test(test = "//test/syscalls/linux:rtsignal_test")
 
+syscall_test(test = "//test/syscalls/linux:signalfd_test")
+
 syscall_test(test = "//test/syscalls/linux:sched_test")
 
 syscall_test(test = "//test/syscalls/linux:sched_yield_test")
diff --git a/test/syscalls/linux/signalfd.cc b/test/syscalls/linux/signalfd.cc
index 09ecad34a..95be4b66c 100644
--- a/test/syscalls/linux/signalfd.cc
+++ b/test/syscalls/linux/signalfd.cc
@@ -39,6 +39,7 @@ namespace testing {
 namespace {
 
 constexpr int kSigno = SIGUSR1;
+constexpr int kSignoMax = 64;  // SIGRTMAX
 constexpr int kSignoAlt = SIGUSR2;
 
 // Returns a new signalfd.
@@ -51,41 +52,45 @@ inline PosixErrorOr<FileDescriptor> NewSignalFD(sigset_t* mask, int flags = 0) {
   return FileDescriptor(fd);
 }
 
-TEST(Signalfd, Basic) {
+class SignalfdTest : public ::testing::TestWithParam<int> {};
+
+TEST_P(SignalfdTest, Basic) {
+  int signo = GetParam();
   // Create the signalfd.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
 
   // Deliver the blocked signal.
   const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
-  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+  ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
 
   // We should now read the signal.
   struct signalfd_siginfo rbuf;
   ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallSucceedsWithValue(sizeof(rbuf)));
-  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf.ssi_signo, signo);
 }
 
-TEST(Signalfd, MaskWorks) {
+TEST_P(SignalfdTest, MaskWorks) {
+  int signo = GetParam();
   // Create two signalfds with different masks.
   sigset_t mask1, mask2;
   sigemptyset(&mask1);
   sigemptyset(&mask2);
-  sigaddset(&mask1, kSigno);
+  sigaddset(&mask1, signo);
   sigaddset(&mask2, kSignoAlt);
   FileDescriptor fd1 = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask1, 0));
   FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask2, 0));
 
   // Deliver the two signals.
   const auto scoped_sigmask1 =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
   const auto scoped_sigmask2 =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSignoAlt));
-  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+  ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
   ASSERT_THAT(tgkill(getpid(), gettid(), kSignoAlt), SyscallSucceeds());
 
   // We should see the signals on the appropriate signalfds.
@@ -98,7 +103,7 @@ TEST(Signalfd, MaskWorks) {
   EXPECT_EQ(rbuf2.ssi_signo, kSignoAlt);
   ASSERT_THAT(read(fd1.get(), &rbuf1, sizeof(rbuf1)),
               SyscallSucceedsWithValue(sizeof(rbuf1)));
-  EXPECT_EQ(rbuf1.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf1.ssi_signo, signo);
 }
 
 TEST(Signalfd, Cloexec) {
@@ -111,11 +116,12 @@ TEST(Signalfd, Cloexec) {
   EXPECT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
 }
 
-TEST(Signalfd, Blocking) {
+TEST_P(SignalfdTest, Blocking) {
+  int signo = GetParam();
   // Create the signalfd in blocking mode.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
 
   // Shared tid variable.
@@ -136,7 +142,7 @@ TEST(Signalfd, Blocking) {
     struct signalfd_siginfo rbuf;
     ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
                 SyscallSucceedsWithValue(sizeof(rbuf)));
-    EXPECT_EQ(rbuf.ssi_signo, kSigno);
+    EXPECT_EQ(rbuf.ssi_signo, signo);
   });
 
   // Wait until blocked.
@@ -149,20 +155,21 @@ TEST(Signalfd, Blocking) {
   //
   // See gvisor.dev/issue/139.
   if (IsRunningOnGvisor()) {
-    ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+    ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
   } else {
-    ASSERT_THAT(tgkill(getpid(), tid, kSigno), SyscallSucceeds());
+    ASSERT_THAT(tgkill(getpid(), tid, signo), SyscallSucceeds());
   }
 
   // Ensure that it was received.
   t.Join();
 }
 
-TEST(Signalfd, ThreadGroup) {
+TEST_P(SignalfdTest, ThreadGroup) {
+  int signo = GetParam();
   // Create the signalfd in blocking mode.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
 
   // Shared variable.
@@ -176,7 +183,7 @@ TEST(Signalfd, ThreadGroup) {
     struct signalfd_siginfo rbuf;
     ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
                 SyscallSucceedsWithValue(sizeof(rbuf)));
-    EXPECT_EQ(rbuf.ssi_signo, kSigno);
+    EXPECT_EQ(rbuf.ssi_signo, signo);
 
     // Wait for the other thread.
     absl::MutexLock ml(&mu);
@@ -185,7 +192,7 @@ TEST(Signalfd, ThreadGroup) {
   });
 
   // Deliver the signal to the threadgroup.
-  ASSERT_THAT(kill(getpid(), kSigno), SyscallSucceeds());
+  ASSERT_THAT(kill(getpid(), signo), SyscallSucceeds());
 
   // Wait for the first thread to process.
   {
@@ -194,13 +201,13 @@ TEST(Signalfd, ThreadGroup) {
   }
 
   // Deliver to the thread group again (other thread still exists).
-  ASSERT_THAT(kill(getpid(), kSigno), SyscallSucceeds());
+  ASSERT_THAT(kill(getpid(), signo), SyscallSucceeds());
 
   // Ensure that we can also receive it.
   struct signalfd_siginfo rbuf;
   ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallSucceedsWithValue(sizeof(rbuf)));
-  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf.ssi_signo, signo);
 
   // Mark the test as done.
   {
@@ -212,11 +219,12 @@ TEST(Signalfd, ThreadGroup) {
   t.Join();
 }
 
-TEST(Signalfd, Nonblock) {
+TEST_P(SignalfdTest, Nonblock) {
+  int signo = GetParam();
   // Create the signalfd in non-blocking mode.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_NONBLOCK));
 
@@ -227,20 +235,21 @@ TEST(Signalfd, Nonblock) {
 
   // Block and deliver the signal.
   const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
-  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+  ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
 
   // Ensure that a read actually works.
   ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallSucceedsWithValue(sizeof(rbuf)));
-  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf.ssi_signo, signo);
 
   // Should block again.
   EXPECT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallFailsWithErrno(EWOULDBLOCK));
 }
 
-TEST(Signalfd, SetMask) {
+TEST_P(SignalfdTest, SetMask) {
+  int signo = GetParam();
   // Create the signalfd matching nothing.
   sigset_t mask;
   sigemptyset(&mask);
@@ -249,8 +258,8 @@ TEST(Signalfd, SetMask) {
 
   // Block and deliver a signal.
   const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
-  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+  ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
 
   // We should have nothing.
   struct signalfd_siginfo rbuf;
@@ -258,29 +267,30 @@ TEST(Signalfd, SetMask) {
               SyscallFailsWithErrno(EWOULDBLOCK));
 
   // Change the signal mask.
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   ASSERT_THAT(signalfd(fd.get(), &mask, 0), SyscallSucceeds());
 
   // We should now have the signal.
   ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallSucceedsWithValue(sizeof(rbuf)));
-  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf.ssi_signo, signo);
 }
 
-TEST(Signalfd, Poll) {
+TEST_P(SignalfdTest, Poll) {
+  int signo = GetParam();
   // Create the signalfd.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
 
   // Block the signal, and start a thread to deliver it.
   const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
   pid_t orig_tid = gettid();
   ScopedThread t([&] {
     absl::SleepFor(absl::Seconds(5));
-    ASSERT_THAT(tgkill(getpid(), orig_tid, kSigno), SyscallSucceeds());
+    ASSERT_THAT(tgkill(getpid(), orig_tid, signo), SyscallSucceeds());
   });
 
   // Start polling for the signal. We expect that it is not available at the
@@ -297,19 +307,18 @@ TEST(Signalfd, Poll) {
               SyscallSucceedsWithValue(sizeof(rbuf)));
 }
 
-TEST(Signalfd, KillStillKills) {
-  sigset_t mask;
-  sigemptyset(&mask);
-  sigaddset(&mask, SIGKILL);
-  FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_CLOEXEC));
-
-  // Just because there is a signalfd, we shouldn't see any change in behavior
-  // for unblockable signals. It's easier to test this with SIGKILL.
-  const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, SIGKILL));
-  EXPECT_EXIT(tgkill(getpid(), gettid(), SIGKILL), KilledBySignal(SIGKILL), "");
+std::string PrintSigno(::testing::TestParamInfo<int> info) {
+  switch (info.param) {
+    case kSigno:
+      return "kSigno";
+    case kSignoMax:
+      return "kSignoMax";
+    default:
+      return absl::StrCat(info.param);
+  }
 }
+INSTANTIATE_TEST_SUITE_P(Signalfd, SignalfdTest,
+                         ::testing::Values(kSigno, kSignoMax), PrintSigno);
 
 TEST(Signalfd, Ppoll) {
   sigset_t mask;
@@ -328,6 +337,20 @@ TEST(Signalfd, Ppoll) {
               SyscallSucceedsWithValue(0));
 }
 
+TEST(Signalfd, KillStillKills) {
+  sigset_t mask;
+  sigemptyset(&mask);
+  sigaddset(&mask, SIGKILL);
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_CLOEXEC));
+
+  // Just because there is a signalfd, we shouldn't see any change in behavior
+  // for unblockable signals. It's easier to test this with SIGKILL.
+  const auto scoped_sigmask =
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, SIGKILL));
+  EXPECT_EXIT(tgkill(getpid(), gettid(), SIGKILL), KilledBySignal(SIGKILL), "");
+}
+
 }  // namespace
 
 }  // namespace testing
@@ -340,6 +363,7 @@ int main(int argc, char** argv) {
   sigset_t set;
   sigemptyset(&set);
   sigaddset(&set, gvisor::testing::kSigno);
+  sigaddset(&set, gvisor::testing::kSignoMax);
   sigaddset(&set, gvisor::testing::kSignoAlt);
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
-- 
cgit v1.2.3


From f1a5178c589dbd9a1fe4f1b9fb943fbe64791b58 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 17 Jan 2020 14:20:00 -0800
Subject: Fix data race in MountNamespace.resolve.

We must hold fs.renameMu to access Dirent.parent.

PiperOrigin-RevId: 290340804
---
 pkg/sentry/fs/mounts.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index db3dfd096..a9627a9d1 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -609,8 +609,11 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema
 		}
 
 		// Find the node; we resolve relative to the current symlink's parent.
+		renameMu.RLock()
+		parent := node.parent
+		renameMu.RUnlock()
 		*remainingTraversals--
-		d, err := mns.FindInode(ctx, root, node.parent, targetPath, remainingTraversals)
+		d, err := mns.FindInode(ctx, root, parent, targetPath, remainingTraversals)
 		if err != nil {
 			return nil, err
 		}
-- 
cgit v1.2.3


From 47d85257d3d015f0b9f7739c81af0ee9f510aaf5 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Fri, 17 Jan 2020 18:24:39 -0800
Subject: Filter out received packets with a local source IP address.

CERT Advisory CA-96.21 III. Solution advises that devices drop packets which
could not have correctly arrived on the wire, such as receiving a packet where
the source IP address is owned by the device that sent it.

Fixes #1507

PiperOrigin-RevId: 290378240
---
 pkg/sentry/socket/netstack/netstack.go | 15 +++++-----
 pkg/sentry/socket/netstack/stack.go    | 38 ++++++++++++-------------
 pkg/tcpip/stack/nic.go                 | 14 +++++++--
 pkg/tcpip/tcpip.go                     | 10 +++++--
 pkg/tcpip/transport/udp/udp_test.go    | 52 ++++++++++++++++++++++++++++++++--
 5 files changed, 95 insertions(+), 34 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index d2f7e987d..fec575357 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -138,13 +138,14 @@ var Metrics = tcpip.Stats{
 		},
 	},
 	IP: tcpip.IPStats{
-		PacketsReceived:            mustCreateMetric("/netstack/ip/packets_received", "Total number of IP packets received from the link layer in nic.DeliverNetworkPacket."),
-		InvalidAddressesReceived:   mustCreateMetric("/netstack/ip/invalid_addresses_received", "Total number of IP packets received with an unknown or invalid destination address."),
-		PacketsDelivered:           mustCreateMetric("/netstack/ip/packets_delivered", "Total number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."),
-		PacketsSent:                mustCreateMetric("/netstack/ip/packets_sent", "Total number of IP packets sent via WritePacket."),
-		OutgoingPacketErrors:       mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."),
-		MalformedPacketsReceived:   mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."),
-		MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."),
+		PacketsReceived:                     mustCreateMetric("/netstack/ip/packets_received", "Total number of IP packets received from the link layer in nic.DeliverNetworkPacket."),
+		InvalidDestinationAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Total number of IP packets received with an unknown or invalid destination address."),
+		InvalidSourceAddressesReceived:      mustCreateMetric("/netstack/ip/invalid_source_addresses_received", "Total number of IP packets received with an unknown or invalid source address."),
+		PacketsDelivered:                    mustCreateMetric("/netstack/ip/packets_delivered", "Total number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."),
+		PacketsSent:                         mustCreateMetric("/netstack/ip/packets_sent", "Total number of IP packets sent via WritePacket."),
+		OutgoingPacketErrors:                mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."),
+		MalformedPacketsReceived:            mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."),
+		MalformedFragmentsReceived:          mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."),
 	},
 	TCP: tcpip.TCPStats{
 		ActiveConnectionOpenings:           mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index a0db2d4fd..31ea66eca 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -148,25 +148,25 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 	case *inet.StatSNMPIP:
 		ip := Metrics.IP
 		*stats = inet.StatSNMPIP{
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/Forwarding.
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/DefaultTTL.
-			ip.PacketsReceived.Value(),          // InReceives.
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/InHdrErrors.
-			ip.InvalidAddressesReceived.Value(), // InAddrErrors.
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ForwDatagrams.
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/InUnknownProtos.
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/InDiscards.
-			ip.PacketsDelivered.Value(),         // InDelivers.
-			ip.PacketsSent.Value(),              // OutRequests.
-			ip.OutgoingPacketErrors.Value(),     // OutDiscards.
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/OutNoRoutes.
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ReasmTimeout.
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ReasmReqds.
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ReasmOKs.
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ReasmFails.
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/FragOKs.
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/FragFails.
-			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/FragCreates.
+			0,                          // TODO(gvisor.dev/issue/969): Support Ip/Forwarding.
+			0,                          // TODO(gvisor.dev/issue/969): Support Ip/DefaultTTL.
+			ip.PacketsReceived.Value(), // InReceives.
+			0,                          // TODO(gvisor.dev/issue/969): Support Ip/InHdrErrors.
+			ip.InvalidDestinationAddressesReceived.Value(), // InAddrErrors.
+			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ForwDatagrams.
+			0,                               // TODO(gvisor.dev/issue/969): Support Ip/InUnknownProtos.
+			0,                               // TODO(gvisor.dev/issue/969): Support Ip/InDiscards.
+			ip.PacketsDelivered.Value(),     // InDelivers.
+			ip.PacketsSent.Value(),          // OutRequests.
+			ip.OutgoingPacketErrors.Value(), // OutDiscards.
+			0,                               // TODO(gvisor.dev/issue/969): Support Ip/OutNoRoutes.
+			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmTimeout.
+			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmReqds.
+			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmOKs.
+			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmFails.
+			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragOKs.
+			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragFails.
+			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragCreates.
 		}
 	case *inet.StatSNMPICMP:
 		in := Metrics.ICMP.V4PacketsReceived.ICMPv4PacketStats
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 53abf29e5..4afe7b744 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -984,7 +984,7 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address,
 
 // DeliverNetworkPacket finds the appropriate network protocol endpoint and
 // hands the packet over for further processing. This function is called when
-// the NIC receives a packet from the physical interface.
+// the NIC receives a packet from the link endpoint.
 // Note that the ownership of the slice backing vv is retained by the caller.
 // This rule applies only to the slice itself, not to the items of the slice;
 // the ownership of the items is not retained by the caller.
@@ -1029,6 +1029,14 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 
 	src, dst := netProto.ParseAddresses(pkt.Data.First())
 
+	if n.stack.handleLocal && !n.isLoopback() && n.getRef(protocol, src) != nil {
+		// The source address is one of our own, so we never should have gotten a
+		// packet like this unless handleLocal is false. Loopback also calls this
+		// function even though the packets didn't come from the physical interface
+		// so don't drop those.
+		n.stack.stats.IP.InvalidSourceAddressesReceived.Increment()
+		return
+	}
 	if ref := n.getRef(protocol, dst); ref != nil {
 		handlePacket(protocol, dst, src, linkEP.LinkAddress(), remote, ref, pkt)
 		return
@@ -1041,7 +1049,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	if n.stack.Forwarding() {
 		r, err := n.stack.FindRoute(0, "", dst, protocol, false /* multicastLoop */)
 		if err != nil {
-			n.stack.stats.IP.InvalidAddressesReceived.Increment()
+			n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment()
 			return
 		}
 		defer r.Release()
@@ -1079,7 +1087,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 
 	// If a packet socket handled the packet, don't treat it as invalid.
 	if len(packetEPs) == 0 {
-		n.stack.stats.IP.InvalidAddressesReceived.Increment()
+		n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment()
 	}
 }
 
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index b7813cbc0..6243762e3 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -903,9 +903,13 @@ type IPStats struct {
 	// link layer in nic.DeliverNetworkPacket.
 	PacketsReceived *StatCounter
 
-	// InvalidAddressesReceived is the total number of IP packets received
-	// with an unknown or invalid destination address.
-	InvalidAddressesReceived *StatCounter
+	// InvalidDestinationAddressesReceived is the total number of IP packets
+	// received with an unknown or invalid destination address.
+	InvalidDestinationAddressesReceived *StatCounter
+
+	// InvalidSourceAddressesReceived is the total number of IP packets received
+	// with a source address that should never have been received on the wire.
+	InvalidSourceAddressesReceived *StatCounter
 
 	// PacketsDelivered is the total number of incoming IP packets that
 	// are successfully delivered to the transport layer via HandlePacket.
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index ee9d10555..51bb61167 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -274,11 +274,16 @@ type testContext struct {
 
 func newDualTestContext(t *testing.T, mtu uint32) *testContext {
 	t.Helper()
-
-	s := stack.New(stack.Options{
+	return newDualTestContextWithOptions(t, mtu, stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
 		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
 	})
+}
+
+func newDualTestContextWithOptions(t *testing.T, mtu uint32, options stack.Options) *testContext {
+	t.Helper()
+
+	s := stack.New(options)
 	ep := channel.New(256, mtu, "")
 	wep := stack.LinkEndpoint(ep)
 
@@ -763,6 +768,49 @@ func TestV6ReadOnV6(t *testing.T) {
 	testRead(c, unicastV6)
 }
 
+// TestV4ReadSelfSource checks that packets coming from a local IP address are
+// correctly dropped when handleLocal is true and not otherwise.
+func TestV4ReadSelfSource(t *testing.T) {
+	for _, tt := range []struct {
+		name              string
+		handleLocal       bool
+		wantErr           *tcpip.Error
+		wantInvalidSource uint64
+	}{
+		{"HandleLocal", false, nil, 0},
+		{"NoHandleLocal", true, tcpip.ErrWouldBlock, 1},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			c := newDualTestContextWithOptions(t, defaultMTU, stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				HandleLocal:        tt.handleLocal,
+			})
+			defer c.cleanup()
+
+			c.createEndpointForFlow(unicastV4)
+
+			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+				t.Fatalf("Bind failed: %s", err)
+			}
+
+			payload := newPayload()
+			h := unicastV4.header4Tuple(incoming)
+			h.srcAddr = h.dstAddr
+
+			c.injectV4Packet(payload, &h, true /* valid */)
+
+			if got := c.s.Stats().IP.InvalidSourceAddressesReceived.Value(); got != tt.wantInvalidSource {
+				t.Errorf("c.s.Stats().IP.InvalidSourceAddressesReceived got %d, want %d", got, tt.wantInvalidSource)
+			}
+
+			if _, _, err := c.ep.Read(nil); err != tt.wantErr {
+				t.Errorf("c.ep.Read() got error %v, want %v", err, tt.wantErr)
+			}
+		})
+	}
+}
+
 func TestV4ReadOnV4(t *testing.T) {
 	c := newDualTestContext(t, defaultMTU)
 	defer c.cleanup()
-- 
cgit v1.2.3


From 10401599e104d90644a220c1cce3e4c2f224f0b3 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Sat, 18 Jan 2020 09:32:39 -0800
Subject: Include the cgroup name in the superblock options in
 /proc/self/mountinfo.

Java 11 parses /proc/self/mountinfo for cgroup information. Java 11.0.4 uses
the mount path to determine what cgroups existed, but Java 11.0.5 reads the
cgroup names from the superblock options.

This CL adds the cgroup name to the superblock options if the filesystem type
is "cgroup". Since gVisor doesn't actually support cgroups yet, we just infer
the cgroup name from the path.

PiperOrigin-RevId: 290434323
---
 pkg/sentry/fs/proc/mounts.go | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go
index 5aedae799..d4efc86e0 100644
--- a/pkg/sentry/fs/proc/mounts.go
+++ b/pkg/sentry/fs/proc/mounts.go
@@ -18,6 +18,7 @@ import (
 	"bytes"
 	"fmt"
 	"sort"
+	"strings"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -146,14 +147,35 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se
 		// (10) Mount source: filesystem-specific information or "none".
 		fmt.Fprintf(&buf, "none ")
 
-		// (11) Superblock options. Only "ro/rw" is supported for now,
-		// and is the same as the filesystem option.
-		fmt.Fprintf(&buf, "%s\n", opts)
+		// (11) Superblock options, and final newline.
+		fmt.Fprintf(&buf, "%s\n", superBlockOpts(mountPath, mroot.Inode.MountSource))
 	})
 
 	return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountInfoFile)(nil)}}, 0
 }
 
+func superBlockOpts(mountPath string, msrc *fs.MountSource) string {
+	// gVisor doesn't (yet) have a concept of super block options, so we
+	// use the ro/rw bit from the mount flag.
+	opts := "rw"
+	if msrc.Flags.ReadOnly {
+		opts = "ro"
+	}
+
+	// NOTE(b/147673608): If the mount is a cgroup, we also need to include
+	// the cgroup name in the options. For now we just read that from the
+	// path.
+	// TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we
+	// should get this value from the cgroup itself, and not rely on the
+	// path.
+	if msrc.FilesystemType == "cgroup" {
+		splitPath := strings.Split(mountPath, "/")
+		cgroupType := splitPath[len(splitPath)-1]
+		opts += "," + cgroupType
+	}
+	return opts
+}
+
 // mountsFile is used to implement /proc/[pid]/mounts.
 //
 // +stateify savable
-- 
cgit v1.2.3


From c0e39a8271198f10407009ec1994e5d9efac796c Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 20 Dec 2019 06:00:30 +0000
Subject: Enable uname syscall support on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I206f38416a64d7c6a8531d8eb305c6ea239616b8
---
 pkg/sentry/syscalls/linux/sys_utsname.go | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go
index 748e8dd8d..a393e28c1 100644
--- a/pkg/sentry/syscalls/linux/sys_utsname.go
+++ b/pkg/sentry/syscalls/linux/sys_utsname.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build amd64
+// +build amd64 arm64
 
 package linux
 
@@ -35,7 +35,15 @@ func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	copy(u.Nodename[:], uts.HostName())
 	copy(u.Release[:], version.Release)
 	copy(u.Version[:], version.Version)
-	copy(u.Machine[:], "x86_64") // build tag above.
+	// build tag above.
+	switch t.SyscallTable().Arch {
+	case arch.AMD64:
+		copy(u.Machine[:], "x86_64")
+	case arch.ARM64:
+		copy(u.Machine[:], "aarch64")
+	default:
+		copy(u.Machine[:], "unknown")
+	}
 	copy(u.Domainname[:], uts.DomainName())
 
 	// Copy out the result.
-- 
cgit v1.2.3


From 2ba6198851dc1e293295d7cadf8c0ae456b68beb Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 21 Jan 2020 12:41:50 -0800
Subject: Add syscalls for lgetxattr, fgetxattr, lsetxattr, and fsetxattr.

Note that these simply will use the same logic as getxattr and setxattr, which
is not yet implemented for most filesystems.

PiperOrigin-RevId: 290800960
---
 pkg/sentry/syscalls/linux/linux64_amd64.go |   8 +-
 pkg/sentry/syscalls/linux/linux64_arm64.go |   8 +-
 pkg/sentry/syscalls/linux/sys_xattr.go     | 136 +++++++++++++++++++++--------
 test/syscalls/linux/xattr.cc               |  41 +++++++++
 4 files changed, 150 insertions(+), 43 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 6b2920900..c76771a54 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -229,11 +229,11 @@ var AMD64 = &kernel.SyscallTable{
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
 		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
 		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
 		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index c9629f6f3..d3587fda6 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -42,11 +42,11 @@ var ARM64 = &kernel.SyscallTable{
 		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		6:   syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		7:   syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
 		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
 		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		12:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 23d20da6f..e35c077d6 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -27,6 +27,40 @@ import (
 
 // GetXattr implements linux syscall getxattr(2).
 func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getXattrFromPath(t, args, true)
+}
+
+// LGetXattr implements linux syscall lgetxattr(2).
+func LGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getXattrFromPath(t, args, false)
+}
+
+// FGetXattr implements linux syscall fgetxattr(2).
+func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := uint64(args[3].SizeT())
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	n, value, err := getXattr(t, f.Dirent, nameAddr, size)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if _, err := t.CopyOutBytes(valueAddr, []byte(value)); err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
 	pathAddr := args[0].Pointer()
 	nameAddr := args[1].Pointer()
 	valueAddr := args[2].Pointer()
@@ -38,29 +72,17 @@ func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 
 	valueLen := 0
-	err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
-		// If getxattr(2) is called with size 0, the size of the value will be
-		// returned successfully even if it is nonzero. In that case, we need to
-		// retrieve the entire attribute value so we can return the correct size.
-		requestedSize := size
-		if size == 0 || size > linux.XATTR_SIZE_MAX {
-			requestedSize = linux.XATTR_SIZE_MAX
+	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
 		}
 
-		value, err := getXattr(t, d, dirPath, nameAddr, uint64(requestedSize))
+		n, value, err := getXattr(t, d, nameAddr, size)
+		valueLen = n
 		if err != nil {
 			return err
 		}
 
-		valueLen = len(value)
-		if uint64(valueLen) > requestedSize {
-			return syserror.ERANGE
-		}
-
-		// Skip copying out the attribute value if size is 0.
-		if size == 0 {
-			return nil
-		}
 		_, err = t.CopyOutBytes(valueAddr, []byte(value))
 		return err
 	})
@@ -71,29 +93,73 @@ func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 }
 
 // getXattr implements getxattr(2) from the given *fs.Dirent.
-func getXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr, size uint64) (string, error) {
-	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
-		return "", syserror.ENOTDIR
-	}
-
+func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr, size uint64) (int, string, error) {
 	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
-		return "", err
+		return 0, "", err
 	}
 
 	name, err := copyInXattrName(t, nameAddr)
 	if err != nil {
-		return "", err
+		return 0, "", err
 	}
 
 	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
-		return "", syserror.EOPNOTSUPP
+		return 0, "", syserror.EOPNOTSUPP
 	}
 
-	return d.Inode.GetXattr(t, name, size)
+	// If getxattr(2) is called with size 0, the size of the value will be
+	// returned successfully even if it is nonzero. In that case, we need to
+	// retrieve the entire attribute value so we can return the correct size.
+	requestedSize := size
+	if size == 0 || size > linux.XATTR_SIZE_MAX {
+		requestedSize = linux.XATTR_SIZE_MAX
+	}
+
+	value, err := d.Inode.GetXattr(t, name, requestedSize)
+	if err != nil {
+		return 0, "", err
+	}
+	n := len(value)
+	if uint64(n) > requestedSize {
+		return 0, "", syserror.ERANGE
+	}
+
+	// Don't copy out the attribute value if size is 0.
+	if size == 0 {
+		return n, "", nil
+	}
+	return n, value, nil
 }
 
 // SetXattr implements linux syscall setxattr(2).
 func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return setXattrFromPath(t, args, true)
+}
+
+// LSetXattr implements linux syscall lsetxattr(2).
+func LSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return setXattrFromPath(t, args, false)
+}
+
+// FSetXattr implements linux syscall fsetxattr(2).
+func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := uint64(args[3].SizeT())
+	flags := args[4].Uint()
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	return 0, nil, setXattr(t, f.Dirent, nameAddr, valueAddr, uint64(size), flags)
+}
+
+func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
 	pathAddr := args[0].Pointer()
 	nameAddr := args[1].Pointer()
 	valueAddr := args[2].Pointer()
@@ -105,19 +171,19 @@ func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		return 0, nil, err
 	}
 
-	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
-		return 0, nil, syserror.EINVAL
-	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
 
-	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
-		return setXattr(t, d, dirPath, nameAddr, valueAddr, uint64(size), flags)
+		return setXattr(t, d, nameAddr, valueAddr, uint64(size), flags)
 	})
 }
 
 // setXattr implements setxattr(2) from the given *fs.Dirent.
-func setXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error {
-	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
-		return syserror.ENOTDIR
+func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error {
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return syserror.EINVAL
 	}
 
 	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
@@ -133,7 +199,7 @@ func setXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr us
 		return syserror.E2BIG
 	}
 	buf := make([]byte, size)
-	if _, err = t.CopyInBytes(valueAddr, buf); err != nil {
+	if _, err := t.CopyInBytes(valueAddr, buf); err != nil {
 		return err
 	}
 	value := string(buf)
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index b3bc3463e..e77c355d7 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -26,6 +26,7 @@
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
@@ -414,6 +415,46 @@ TEST_F(XattrTest, GetxattrNonexistentName) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
+TEST_F(XattrTest, LGetSetxattrOnSymlink) {
+  TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
+
+  EXPECT_THAT(lsetxattr(link.path().c_str(), nullptr, nullptr, 0, 0),
+              SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(lgetxattr(link.path().c_str(), nullptr, nullptr, 0),
+              SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  int val = 1234;
+  size_t size = sizeof(val);
+  EXPECT_THAT(lsetxattr(path, name, &val, size, /*flags=*/0),
+              SyscallSucceeds());
+
+  int buf = 0;
+  EXPECT_THAT(lgetxattr(path, name, &buf, size),
+              SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, FGetSetxattr) {
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0));
+  const char name[] = "user.test";
+  int val = 1234;
+  size_t size = sizeof(val);
+  EXPECT_THAT(fsetxattr(fd.get(), name, &val, size, /*flags=*/0),
+              SyscallSucceeds());
+
+  int buf = 0;
+  EXPECT_THAT(fgetxattr(fd.get(), name, &buf, size),
+              SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From d46c397a1cd38f1e2aa5c864c1bb8594fb87bb63 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 21 Jan 2020 13:26:50 -0800
Subject: Add line break to /proc/net files

Some files were missing the last line break.

PiperOrigin-RevId: 290808898
---
 pkg/sentry/fs/proc/net.go           | 14 +++++++-------
 pkg/sentry/fsimpl/proc/tasks_net.go | 14 +++++++-------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 3f17e98ea..bad445f3f 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -52,17 +52,17 @@ func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSo
 			// implemented in netstack, if the file contains a
 			// header the stub is just the header otherwise it is
 			// an empty file.
-			"arp": newStaticProcInode(ctx, msrc, []byte("IP address       HW type     Flags       HW address            Mask     Device")),
+			"arp": newStaticProcInode(ctx, msrc, []byte("IP address       HW type     Flags       HW address            Mask     Device\n")),
 
-			"netlink":   newStaticProcInode(ctx, msrc, []byte("sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode")),
-			"netstat":   newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess")),
-			"packet":    newStaticProcInode(ctx, msrc, []byte("sk       RefCnt Type Proto  Iface R Rmem   User   Inode")),
-			"protocols": newStaticProcInode(ctx, msrc, []byte("protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em")),
+			"netlink":   newStaticProcInode(ctx, msrc, []byte("sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n")),
+			"netstat":   newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")),
+			"packet":    newStaticProcInode(ctx, msrc, []byte("sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n")),
+			"protocols": newStaticProcInode(ctx, msrc, []byte("protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")),
 			// Linux sets psched values to: nsec per usec, psched
 			// tick in ns, 1000000, high res timer ticks per sec
 			// (ClockGetres returns 1ns resolution).
 			"psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))),
-			"ptype":  newStaticProcInode(ctx, msrc, []byte("Type Device      Function")),
+			"ptype":  newStaticProcInode(ctx, msrc, []byte("Type Device      Function\n")),
 			"route":  seqfile.NewSeqFileInode(ctx, &netRoute{s: s}, msrc),
 			"tcp":    seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc),
 			"udp":    seqfile.NewSeqFileInode(ctx, &netUDP{k: k}, msrc),
@@ -73,7 +73,7 @@ func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSo
 			contents["if_inet6"] = seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc)
 			contents["ipv6_route"] = newStaticProcInode(ctx, msrc, []byte(""))
 			contents["tcp6"] = seqfile.NewSeqFileInode(ctx, &netTCP6{k: k}, msrc)
-			contents["udp6"] = newStaticProcInode(ctx, msrc, []byte("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode"))
+			contents["udp6"] = newStaticProcInode(ctx, msrc, []byte("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n"))
 		}
 	}
 	d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go
index 3dbf3ba41..4aaf23e97 100644
--- a/pkg/sentry/fsimpl/proc/tasks_net.go
+++ b/pkg/sentry/fsimpl/proc/tasks_net.go
@@ -41,12 +41,12 @@ func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k
 	var contents map[string]*kernfs.Dentry
 	if stack := k.NetworkStack(); stack != nil {
 		const (
-			arp       = "IP address       HW type     Flags       HW address            Mask     Device"
-			netlink   = "sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode"
-			packet    = "sk       RefCnt Type Proto  Iface R Rmem   User   Inode"
-			protocols = "protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em"
-			ptype     = "Type Device      Function"
-			upd6      = "  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode"
+			arp       = "IP address       HW type     Flags       HW address            Mask     Device\n"
+			netlink   = "sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n"
+			packet    = "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n"
+			protocols = "protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"
+			ptype     = "Type Device      Function\n"
+			upd6      = "  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n"
 		)
 		psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond))
 
@@ -779,6 +779,6 @@ func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		"TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd " +
 		"TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq " +
 		"TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge " +
-		"TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess")
+		"TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")
 	return nil
 }
-- 
cgit v1.2.3


From 47bc7550c0b8fcde7b3452bf536082e955882026 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Jan 2020 13:37:25 -0800
Subject: Fixing stuff

---
 pkg/sentry/socket/netfilter/netfilter.go |  3 ++
 pkg/tcpip/iptables/types.go              | 21 ++++----------
 pkg/tcpip/packet_buffer.go               | 25 +----------------
 test/iptables/iptables_test.go           | 47 ++++++++++++++++----------------
 4 files changed, 34 insertions(+), 62 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 09a3276c7..4ef8123ac 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -164,6 +164,9 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 		// Each rule corresponds to an entry.
 		entry := linux.KernelIPTEntry{
 			IPTEntry: linux.IPTEntry{
+				IP: linux.IPTIP{
+					Protocol: uint16(rule.Filter.Protocol),
+				},
 				NextOffset:   linux.SizeOfIPTEntry,
 				TargetOffset: linux.SizeOfIPTEntry,
 			},
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 4f2a4d65e..a0bfc8b41 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -14,7 +14,9 @@
 
 package iptables
 
-import "gvisor.dev/gvisor/pkg/tcpip"
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
 
 // A Hook specifies one of the hooks built into the network stack.
 //
@@ -161,21 +163,10 @@ type Rule struct {
 	Target Target
 }
 
-// TODO: This is gross.
-// TODO: Save this in SetEntries.
-// TODO: Utilize this when traversing tables.
+// IPHeaderFilter holds basic IP filtering data common to every rule.
 type IPHeaderFilter struct {
-	Source              [4]byte
-	Destination         [4]byte
-	SourceMask          [4]byte
-	DestinationMask     [4]byte
-	OutputInterface     string
-	InputInterface      string
-	OutputInterfaceMask string
-	InputInterfaceMask  string
-	Protocol            tcpip.TransportProtocolNumber
-	Flags               uint8
-	InverseFlags        uint8
+	// Protocol matches the transport protocol.
+	Protocol tcpip.TransportProtocolNumber
 }
 
 // A Matcher is the interface for matching packets.
diff --git a/pkg/tcpip/packet_buffer.go b/pkg/tcpip/packet_buffer.go
index 7a036b93c..ab24372e7 100644
--- a/pkg/tcpip/packet_buffer.go
+++ b/pkg/tcpip/packet_buffer.go
@@ -13,9 +13,7 @@
 
 package tcpip
 
-import (
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
-)
+import "gvisor.dev/gvisor/pkg/tcpip/buffer"
 
 // A PacketBuffer contains all the data of a network packet.
 //
@@ -67,24 +65,3 @@ func (pk PacketBuffer) Clone() PacketBuffer {
 	pk.Data = pk.Data.Clone(nil)
 	return pk
 }
-
-//// TransportProtocol returns the transport protocol of pk.
-////
-//// Precondition: pk.NetworkHeader is set.
-//func (pk PacketBuffer) TransportProtocolIPv4() uint16 {
-//	if pk.NetworkHeader == nil {
-//		panic("This should only be called when pk.NetworkHeader is set.")
-//	}
-//	return header.IPv4(pk.NetworkHeader).TransportProtocol()
-//}
-
-// func (pk Packet) findNetHeader() header.IPv4 {
-// 	// Inbound:
-// 	// Data holds everything, but may have had some headers shaved off.
-// 	// Figure out whether it's set or still somewhere in data and return
-// 	// appropriately.
-
-// 	// Outbound:
-// 	// NetworkHeader will be set if we've added one. Otherwise there's no
-// 	// header.
-// }
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 63e691af6..150b44e42 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -23,6 +23,7 @@ import (
 	"time"
 
 	"flag"
+
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/dockerutil"
 	"gvisor.dev/gvisor/runsc/testutil"
@@ -160,29 +161,29 @@ func logContainer(output string, err error) {
 	log.Infof(msg)
 }
 
-// func TestFilterInputDropUDP(t *testing.T) {
-// 	if err := singleTest(FilterInputDropUDP{}); err != nil {
-// 		t.Fatal(err)
-// 	}
-// }
-
-// func TestFilterInputDropUDPPort(t *testing.T) {
-// 	if err := singleTest(FilterInputDropUDPPort{}); err != nil {
-// 		t.Fatal(err)
-// 	}
-// }
-
-// func TestFilterInputDropDifferentUDPPort(t *testing.T) {
-// 	if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil {
-// 		t.Fatal(err)
-// 	}
-// }
-
-// func TestFilterInputDropAll(t *testing.T) {
-// 	if err := singleTest(FilterInputDropAll{}); err != nil {
-// 		t.Fatal(err)
-// 	}
-// }
+func TestFilterInputDropUDP(t *testing.T) {
+	if err := singleTest(FilterInputDropUDP{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDropUDPPort(t *testing.T) {
+	if err := singleTest(FilterInputDropUDPPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDropDifferentUDPPort(t *testing.T) {
+	if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDropAll(t *testing.T) {
+	if err := singleTest(FilterInputDropAll{}); err != nil {
+		t.Fatal(err)
+	}
+}
 
 func TestFilterInputDropOnlyUDP(t *testing.T) {
 	if err := singleTest(FilterInputDropOnlyUDP{}); err != nil {
-- 
cgit v1.2.3


From 9f736ac6a7747917f690596ac9b072c108b5670c Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Jan 2020 13:39:48 -0800
Subject: More little fixes.

---
 pkg/sentry/socket/netfilter/netfilter.go | 4 ++--
 test/iptables/iptables_test.go           | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 4ef8123ac..e1f2bacce 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -325,8 +325,8 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 			return syserr.ErrInvalidArgument
 		}
 
-		// TODO(gvisor.dev/issue/170): We should support IPTIP
-		// filtering. We reject any nonzero IPTIP values for now.
+		// TODO(gvisor.dev/issue/170): We should support more IPTIP
+		// filtering fields.
 		filter, err := filterFromIPTIP(entry.IP)
 		if err != nil {
 			return err
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 150b44e42..679a29bef 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -15,6 +15,7 @@
 package iptables
 
 import (
+	"flag"
 	"fmt"
 	"net"
 	"os"
@@ -22,8 +23,6 @@ import (
 	"testing"
 	"time"
 
-	"flag"
-
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/dockerutil"
 	"gvisor.dev/gvisor/runsc/testutil"
-- 
cgit v1.2.3


From cbc0a92276b75e744511a43a9c0b78fc64946ec6 Mon Sep 17 00:00:00 2001
From: Ryan Heacock <rheacock@google.com>
Date: Tue, 21 Jan 2020 14:15:01 -0800
Subject: Correct todos referencing IPV6_RECVTCLASS

Bug 68320120 was revived because TODOs referenced the IP_RECVTOS bug instead
of the IPV6_RECVTCLASS bug.

PiperOrigin-RevId: 290820178
---
 test/syscalls/linux/udp_socket_test_cases.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 68e0a8109..a2f6ef8cc 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,7 +1349,7 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
   SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
           !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
@@ -1422,7 +1422,7 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
   // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From 9143fcd7fd38243dd40f927dafaeb75f6ef8ef49 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Jan 2020 14:47:17 -0800
Subject: Add UDP matchers.

---
 pkg/abi/linux/netfilter.go               |  92 +++++++++++++++++
 pkg/sentry/socket/netfilter/netfilter.go | 164 ++++++++++++++++++++++++++-----
 pkg/tcpip/iptables/BUILD                 |   2 +
 pkg/tcpip/iptables/iptables.go           |   3 +
 pkg/tcpip/iptables/tcp_matcher.go        | 122 +++++++++++++++++++++++
 pkg/tcpip/iptables/types.go              |  17 ++++
 pkg/tcpip/iptables/udp_matcher.go        | 127 ++++++++++++++++++++++++
 pkg/tcpip/network/ipv4/ipv4.go           |  10 +-
 test/iptables/filter_input.go            |  46 +++++++++
 9 files changed, 555 insertions(+), 28 deletions(-)
 create mode 100644 pkg/tcpip/iptables/tcp_matcher.go
 create mode 100644 pkg/tcpip/iptables/udp_matcher.go

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index 33fcc6c95..fb4588272 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -340,3 +340,95 @@ func goString(cstring []byte) string {
 	}
 	return string(cstring)
 }
+
+// XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp
+// in include/uapi/linux/netfilter/xt_tcpudp.h.
+type XTTCP struct {
+	// SourcePortStart specifies the inclusive start of the range of source
+	// ports to which the matcher applies.
+	SourcePortStart uint16
+
+	// SourcePortEnd specifies the inclusive end of the range of source ports
+	// to which the matcher applies.
+	SourcePortEnd uint16
+
+	// DestinationPortStart specifies the start of the destination port
+	// range to which the matcher applies.
+	DestinationPortStart uint16
+
+	// DestinationPortEnd specifies the start of the destination port
+	// range to which the matcher applies.
+	DestinationPortEnd uint16
+
+	// Option specifies that a particular TCP option must be set.
+	Option uint8
+
+	// FlagMask masks the FlagCompare byte when comparing to the TCP flag
+	// fields.
+	FlagMask uint8
+
+	// FlagCompare is binary and-ed with the TCP flag fields.
+	FlagCompare uint8
+
+	// InverseFlags flips the meaning of certain fields. See the
+	// TX_TCP_INV_* flags.
+	InverseFlags uint8
+}
+
+// SizeOfXTTCP is the size of an XTTCP.
+const SizeOfXTTCP = 12
+
+// Flags in XTTCP.InverseFlags. Corresponding constants are in
+// include/uapi/linux/netfilter/xt_tcpudp.h.
+const (
+	// Invert the meaning of SourcePortStart/End.
+	XT_TCP_INV_SRCPT = 0x01
+	// Invert the meaning of DestinationPortStart/End.
+	XT_TCP_INV_DSTPT = 0x02
+	// Invert the meaning of FlagCompare.
+	XT_TCP_INV_FLAGS = 0x04
+	// Invert the meaning of Option.
+	XT_TCP_INV_OPTION = 0x08
+	// Enable all flags.
+	XT_TCP_INV_MASK = 0x0F
+)
+
+// XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp
+// in include/uapi/linux/netfilter/xt_tcpudp.h.
+type XTUDP struct {
+	// SourcePortStart specifies the inclusive start of the range of source
+	// ports to which the matcher applies.
+	SourcePortStart uint16
+
+	// SourcePortEnd specifies the inclusive end of the range of source ports
+	// to which the matcher applies.
+	SourcePortEnd uint16
+
+	// DestinationPortStart specifies the start of the destination port
+	// range to which the matcher applies.
+	DestinationPortStart uint16
+
+	// DestinationPortEnd specifies the start of the destination port
+	// range to which the matcher applies.
+	DestinationPortEnd uint16
+
+	// InverseFlags flips the meaning of certain fields. See the
+	// TX_UDP_INV_* flags.
+	InverseFlags uint8
+
+	_ uint8
+}
+
+// SizeOfXTUDP is the size of an XTUDP.
+const SizeOfXTUDP = 10
+
+// Flags in XTUDP.InverseFlags. Corresponding constants are in
+// include/uapi/linux/netfilter/xt_tcpudp.h.
+const (
+	// Invert the meaning of SourcePortStart/End.
+	XT_UDP_INV_SRCPT = 0x01
+	// Invert the meaning of DestinationPortStart/End.
+	XT_UDP_INV_DSTPT = 0x02
+	// Enable all flags.
+	XT_UDP_INV_MASK = 0x03
+)
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index e1f2bacce..45296b339 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -131,6 +131,7 @@ func FillDefaultIPTables(stack *stack.Stack) {
 	stack.SetIPTables(ipt)
 }
 
+// TODO: Return proto.
 // convertNetstackToBinary converts the iptables as stored in netstack to the
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
@@ -318,10 +319,12 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 		var entry linux.IPTEntry
 		buf := optVal[:linux.SizeOfIPTEntry]
-		optVal = optVal[linux.SizeOfIPTEntry:]
 		binary.Unmarshal(buf, usermem.ByteOrder, &entry)
-		if entry.TargetOffset != linux.SizeOfIPTEntry {
-			// TODO(gvisor.dev/issue/170): Support matchers.
+		initialOptValLen := len(optVal)
+		optVal = optVal[linux.SizeOfIPTEntry:]
+
+		if entry.TargetOffset < linux.SizeOfIPTEntry {
+			log.Warningf("netfilter: entry has too-small target offset %d", entry.TargetOffset)
 			return syserr.ErrInvalidArgument
 		}
 
@@ -332,19 +335,41 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 			return err
 		}
 
+		// TODO: Matchers (and maybe targets) can specify that they only work for certiain protocols, hooks, tables.
+		// Get matchers.
+		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
+		if len(optVal) < int(matchersSize) {
+			log.Warningf("netfilter: entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+		}
+		matchers, err := parseMatchers(filter, optVal[:matchersSize])
+		if err != nil {
+			log.Warningf("netfilter: failed to parse matchers: %v", err)
+			return err
+		}
+		optVal = optVal[matchersSize:]
+
 		// Get the target of the rule.
-		target, consumed, err := parseTarget(optVal)
+		targetSize := entry.NextOffset - entry.TargetOffset
+		if len(optVal) < int(targetSize) {
+			log.Warningf("netfilter: entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+		}
+		target, err := parseTarget(optVal[:targetSize])
 		if err != nil {
 			return err
 		}
-		optVal = optVal[consumed:]
+		optVal = optVal[targetSize:]
 
 		table.Rules = append(table.Rules, iptables.Rule{
-			Filter: filter,
-			Target: target,
+			Filter:   filter,
+			Target:   target,
+			Matchers: matchers,
 		})
 		offsets = append(offsets, offset)
-		offset += linux.SizeOfIPTEntry + consumed
+		offset += uint32(entry.NextOffset)
+
+		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
+			log.Warningf("netfilter: entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+		}
 	}
 
 	// Go through the list of supported hooks for this table and, for each
@@ -401,12 +426,105 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	return nil
 }
 
-// parseTarget parses a target from the start of optVal and returns the target
-// along with the number of bytes it occupies in optVal.
-func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
+// parseMatchers parses 0 or more matchers from optVal. optVal should contain
+// only the matchers.
+func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) {
+	var matchers []iptables.Matcher
+	for len(optVal) > 0 {
+		log.Infof("parseMatchers: optVal has len %d", len(optVal))
+		// Get the XTEntryMatch.
+		if len(optVal) < linux.SizeOfXTEntryMatch {
+			log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+		var match linux.XTEntryMatch
+		buf := optVal[:linux.SizeOfXTEntryMatch]
+		binary.Unmarshal(buf, usermem.ByteOrder, &match)
+		log.Infof("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match)
+
+		// Check some invariants.
+		if match.MatchSize < linux.SizeOfXTEntryMatch {
+			log.Warningf("netfilter: match size is too small, must be at least %d", linux.SizeOfXTEntryMatch)
+			return nil, syserr.ErrInvalidArgument
+		}
+		if len(optVal) < int(match.MatchSize) {
+			log.Warningf("netfilter: optVal has insufficient size for match: %d", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		buf = optVal[linux.SizeOfXTEntryMatch:match.MatchSize]
+		var matcher iptables.Matcher
+		var err error
+		switch match.Name.String() {
+		case "tcp":
+			if len(buf) < linux.SizeOfXTTCP {
+				log.Warningf("netfilter: optVal has insufficient size for TCP match: %d", len(optVal))
+				return nil, syserr.ErrInvalidArgument
+			}
+			var matchData linux.XTTCP
+			// For alignment reasons, the match's total size may exceed what's
+			// strictly necessary to hold matchData.
+			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
+			log.Infof("parseMatchers: parsed XTTCP: %+v", matchData)
+			matcher, err = iptables.NewTCPMatcher(filter, iptables.TCPMatcherData{
+				SourcePortStart:      matchData.SourcePortStart,
+				SourcePortEnd:        matchData.SourcePortEnd,
+				DestinationPortStart: matchData.DestinationPortStart,
+				DestinationPortEnd:   matchData.DestinationPortEnd,
+				Option:               matchData.Option,
+				FlagMask:             matchData.FlagMask,
+				FlagCompare:          matchData.FlagCompare,
+				InverseFlags:         matchData.InverseFlags,
+			})
+			if err != nil {
+				log.Warningf("netfilter: failed to create TCP matcher: %v", err)
+				return nil, syserr.ErrInvalidArgument
+			}
+
+		case "udp":
+			if len(buf) < linux.SizeOfXTUDP {
+				log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal))
+				return nil, syserr.ErrInvalidArgument
+			}
+			var matchData linux.XTUDP
+			// For alignment reasons, the match's total size may exceed what's
+			// strictly necessary to hold matchData.
+			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
+			log.Infof("parseMatchers: parsed XTUDP: %+v", matchData)
+			matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherData{
+				SourcePortStart:      matchData.SourcePortStart,
+				SourcePortEnd:        matchData.SourcePortEnd,
+				DestinationPortStart: matchData.DestinationPortStart,
+				DestinationPortEnd:   matchData.DestinationPortEnd,
+				InverseFlags:         matchData.InverseFlags,
+			})
+			if err != nil {
+				log.Warningf("netfilter: failed to create UDP matcher: %v", err)
+				return nil, syserr.ErrInvalidArgument
+			}
+
+		default:
+			log.Warningf("netfilter: unsupported matcher with name %q", match.Name.String())
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		matchers = append(matchers, matcher)
+
+		// TODO: Support revision.
+		// TODO: Support proto -- matchers usually specify which proto(s) they work with.
+		optVal = optVal[match.MatchSize:]
+	}
+
+	// TODO: Check that optVal is exhausted.
+	return matchers, nil
+}
+
+// parseTarget parses a target from optVal. optVal should contain only the
+// target.
+func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 	if len(optVal) < linux.SizeOfXTEntryTarget {
 		log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
-		return nil, 0, syserr.ErrInvalidArgument
+		return nil, syserr.ErrInvalidArgument
 	}
 	var target linux.XTEntryTarget
 	buf := optVal[:linux.SizeOfXTEntryTarget]
@@ -414,9 +532,9 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 	switch target.Name.String() {
 	case "":
 		// Standard target.
-		if len(optVal) < linux.SizeOfXTStandardTarget {
-			log.Warningf("netfilter.SetEntries: optVal has insufficient size for standard target %d", len(optVal))
-			return nil, 0, syserr.ErrInvalidArgument
+		if len(optVal) != linux.SizeOfXTStandardTarget {
+			log.Warningf("netfilter.SetEntries: optVal has wrong size for standard target %d", len(optVal))
+			return nil, syserr.ErrInvalidArgument
 		}
 		var standardTarget linux.XTStandardTarget
 		buf = optVal[:linux.SizeOfXTStandardTarget]
@@ -424,22 +542,22 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 
 		verdict, err := translateToStandardVerdict(standardTarget.Verdict)
 		if err != nil {
-			return nil, 0, err
+			return nil, err
 		}
 		switch verdict {
 		case iptables.Accept:
-			return iptables.UnconditionalAcceptTarget{}, linux.SizeOfXTStandardTarget, nil
+			return iptables.UnconditionalAcceptTarget{}, nil
 		case iptables.Drop:
-			return iptables.UnconditionalDropTarget{}, linux.SizeOfXTStandardTarget, nil
+			return iptables.UnconditionalDropTarget{}, nil
 		default:
 			panic(fmt.Sprintf("Unknown verdict: %v", verdict))
 		}
 
 	case errorTargetName:
 		// Error target.
-		if len(optVal) < linux.SizeOfXTErrorTarget {
+		if len(optVal) != linux.SizeOfXTErrorTarget {
 			log.Infof("netfilter.SetEntries: optVal has insufficient size for error target %d", len(optVal))
-			return nil, 0, syserr.ErrInvalidArgument
+			return nil, syserr.ErrInvalidArgument
 		}
 		var errorTarget linux.XTErrorTarget
 		buf = optVal[:linux.SizeOfXTErrorTarget]
@@ -454,16 +572,16 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 		//   rules have an error with the name of the chain.
 		switch errorTarget.Name.String() {
 		case errorTargetName:
-			return iptables.ErrorTarget{}, linux.SizeOfXTErrorTarget, nil
+			return iptables.ErrorTarget{}, nil
 		default:
 			log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
-			return nil, 0, syserr.ErrInvalidArgument
+			return nil, syserr.ErrInvalidArgument
 		}
 	}
 
 	// Unknown target.
 	log.Infof("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
-	return nil, 0, syserr.ErrInvalidArgument
+	return nil, syserr.ErrInvalidArgument
 }
 
 func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, *syserr.Error) {
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index 297eaccaf..ff4e3c932 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -7,7 +7,9 @@ go_library(
     srcs = [
         "iptables.go",
         "targets.go",
+        "tcp_matcher.go",
         "types.go",
+        "udp_matcher.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/iptables",
     visibility = ["//visibility:public"],
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index fc06b5b87..accedba1e 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -138,6 +138,8 @@ func EmptyFilterTable() Table {
 // Check runs pkt through the rules for hook. It returns true when the packet
 // should continue traversing the network stack and false when it should be
 // dropped.
+//
+// Precondition: pkt.NetworkHeader is set.
 func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 	// TODO(gvisor.dev/issue/170): A lot of this is uncomplicated because
 	// we're missing features. Jumps, the call stack, etc. aren't checked
@@ -163,6 +165,7 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 	return true
 }
 
+// Precondition: pkt.NetworkHeader is set.
 func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) Verdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
diff --git a/pkg/tcpip/iptables/tcp_matcher.go b/pkg/tcpip/iptables/tcp_matcher.go
new file mode 100644
index 000000000..6acbd6eb9
--- /dev/null
+++ b/pkg/tcpip/iptables/tcp_matcher.go
@@ -0,0 +1,122 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+type TCPMatcher struct {
+	data TCPMatcherData
+
+	// tablename string
+	// unsigned int matchsize;
+	// unsigned int usersize;
+	// #ifdef CONFIG_COMPAT
+	// unsigned int compatsize;
+	// #endif
+	// unsigned int hooks;
+	// unsigned short proto;
+	// unsigned short family;
+}
+
+// TODO: Delete?
+// MatchCheckEntryParams
+
+type TCPMatcherData struct {
+	// Filter IPHeaderFilter
+
+	SourcePortStart      uint16
+	SourcePortEnd        uint16
+	DestinationPortStart uint16
+	DestinationPortEnd   uint16
+	Option               uint8
+	FlagMask             uint8
+	FlagCompare          uint8
+	InverseFlags         uint8
+}
+
+func NewTCPMatcher(filter IPHeaderFilter, data TCPMatcherData) (Matcher, error) {
+	// TODO: We currently only support source port and destination port.
+	log.Infof("Adding rule with TCPMatcherData: %+v", data)
+
+	if data.Option != 0 ||
+		data.FlagMask != 0 ||
+		data.FlagCompare != 0 ||
+		data.InverseFlags != 0 {
+		return nil, fmt.Errorf("unsupported TCP matcher flags set")
+	}
+
+	if filter.Protocol != header.TCPProtocolNumber {
+		log.Warningf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber)
+	}
+
+	return &TCPMatcher{data: data}, nil
+}
+
+// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments).
+func (tm *TCPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+	netHeader := header.IPv4(pkt.NetworkHeader)
+
+	// TODO: Do we check proto here or elsewhere? I think elsewhere (check
+	// codesearch).
+	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
+		return false, false
+	}
+
+	// We dont't match fragments.
+	if frag := netHeader.FragmentOffset(); frag != 0 {
+		if frag == 1 {
+			log.Warningf("Dropping TCP packet: malicious packet with fragment with fragment offest of 1.")
+			return false, true
+		}
+		return false, false
+	}
+
+	// Now we need the transport header. However, this may not have been set
+	// yet.
+	// TODO
+	var tcpHeader header.TCP
+	if pkt.TransportHeader != nil {
+		tcpHeader = header.TCP(pkt.TransportHeader)
+	} else {
+		// The TCP header hasn't been parsed yet. We have to do it here.
+		if len(pkt.Data.First()) < header.TCPMinimumSize {
+			// There's no valid TCP header here, so we hotdrop the
+			// packet.
+			// TODO: Stats.
+			log.Warningf("Dropping TCP packet: size to small.")
+			return false, true
+		}
+		tcpHeader = header.TCP(pkt.Data.First())
+	}
+
+	// Check whether the source and destination ports are within the
+	// matching range.
+	sourcePort := tcpHeader.SourcePort()
+	destinationPort := tcpHeader.DestinationPort()
+	if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort {
+		return false, false
+	}
+	if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort {
+		return false, false
+	}
+
+	return true, false
+}
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index a0bfc8b41..54e66f09a 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -169,12 +169,29 @@ type IPHeaderFilter struct {
 	Protocol tcpip.TransportProtocolNumber
 }
 
+// TODO: Should these be able to marshal/unmarshal themselves?
+// TODO: Something has to map the name to the matcher.
 // A Matcher is the interface for matching packets.
 type Matcher interface {
 	// Match returns whether the packet matches and whether the packet
 	// should be "hotdropped", i.e. dropped immediately. This is usually
 	// used for suspicious packets.
+	//
+	// Precondition: packet.NetworkHeader is set.
 	Match(hook Hook, packet tcpip.PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
+
+	// TODO: Make this typesafe by having each Matcher have their own, typed CheckEntry?
+	// CheckEntry(params MatchCheckEntryParams) bool
+}
+
+// TODO: Unused?
+type MatchCheckEntryParams struct {
+	Table  string // TODO: Tables should be an enum...
+	Filter IPHeaderFilter
+	Info   interface{} // TODO: Type unsafe.
+	// HookMask       uint8
+	// Family         uint8
+	// NFTCompat      bool
 }
 
 // A Target is the interface for taking an action for a packet.
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
new file mode 100644
index 000000000..ce4368a3d
--- /dev/null
+++ b/pkg/tcpip/iptables/udp_matcher.go
@@ -0,0 +1,127 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+	"fmt"
+	"runtime/debug"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+type UDPMatcher struct {
+	data UDPMatcherData
+
+	// tablename string
+	// unsigned int matchsize;
+	// unsigned int usersize;
+	// #ifdef CONFIG_COMPAT
+	// unsigned int compatsize;
+	// #endif
+	// unsigned int hooks;
+	// unsigned short proto;
+	// unsigned short family;
+}
+
+// TODO: Delete?
+// MatchCheckEntryParams
+
+type UDPMatcherData struct {
+	// Filter IPHeaderFilter
+
+	SourcePortStart      uint16
+	SourcePortEnd        uint16
+	DestinationPortStart uint16
+	DestinationPortEnd   uint16
+	InverseFlags         uint8
+}
+
+func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error) {
+	// TODO: We currently only support source port and destination port.
+	log.Infof("Adding rule with UDPMatcherData: %+v", data)
+
+	if data.InverseFlags != 0 {
+		return nil, fmt.Errorf("unsupported UDP matcher flags set")
+	}
+
+	if filter.Protocol != header.UDPProtocolNumber {
+		log.Warningf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
+	}
+
+	return &UDPMatcher{data: data}, nil
+}
+
+// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments).
+func (tm *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+	log.Infof("UDPMatcher called from: %s", string(debug.Stack()))
+	netHeader := header.IPv4(pkt.NetworkHeader)
+
+	// TODO: Do we check proto here or elsewhere? I think elsewhere (check
+	// codesearch).
+	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
+		log.Infof("UDPMatcher: wrong protocol number")
+		return false, false
+	}
+
+	// We dont't match fragments.
+	if frag := netHeader.FragmentOffset(); frag != 0 {
+		log.Infof("UDPMatcher: it's a fragment")
+		if frag == 1 {
+			return false, true
+		}
+		log.Warningf("Dropping UDP packet: malicious fragmented packet.")
+		return false, false
+	}
+
+	// Now we need the transport header. However, this may not have been set
+	// yet.
+	// TODO
+	var udpHeader header.UDP
+	if pkt.TransportHeader != nil {
+		log.Infof("UDPMatcher: transport header is not nil")
+		udpHeader = header.UDP(pkt.TransportHeader)
+	} else {
+		log.Infof("UDPMatcher: transport header is nil")
+		log.Infof("UDPMatcher: is network header nil: %t", pkt.NetworkHeader == nil)
+		// The UDP header hasn't been parsed yet. We have to do it here.
+		if len(pkt.Data.First()) < header.UDPMinimumSize {
+			// There's no valid UDP header here, so we hotdrop the
+			// packet.
+			// TODO: Stats.
+			log.Warningf("Dropping UDP packet: size to small.")
+			return false, true
+		}
+		udpHeader = header.UDP(pkt.Data.First())
+	}
+
+	// Check whether the source and destination ports are within the
+	// matching range.
+	sourcePort := udpHeader.SourcePort()
+	destinationPort := udpHeader.DestinationPort()
+	log.Infof("UDPMatcher: sport and dport are %d and %d. sports and dport start and end are (%d, %d) and (%d, %d)",
+		udpHeader.SourcePort(), udpHeader.DestinationPort(),
+		tm.data.SourcePortStart, tm.data.SourcePortEnd,
+		tm.data.DestinationPortStart, tm.data.DestinationPortEnd)
+	if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort {
+		return false, false
+	}
+	if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort {
+		return false, false
+	}
+
+	return true, false
+}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 85512f9b2..6597e6781 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -353,6 +353,11 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 	}
 	pkt.NetworkHeader = headerView[:h.HeaderLength()]
 
+	hlen := int(h.HeaderLength())
+	tlen := int(h.TotalLength())
+	pkt.Data.TrimFront(hlen)
+	pkt.Data.CapLength(tlen - hlen)
+
 	// iptables filtering. All packets that reach here are intended for
 	// this machine and will not be forwarded.
 	ipt := e.stack.IPTables()
@@ -361,11 +366,6 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 		return
 	}
 
-	hlen := int(h.HeaderLength())
-	tlen := int(h.TotalLength())
-	pkt.Data.TrimFront(hlen)
-	pkt.Data.CapLength(tlen - hlen)
-
 	more := (h.Flags() & header.IPv4FlagMoreFragments) != 0
 	if more || h.FragmentOffset() != 0 {
 		if pkt.Data.Size() == 0 {
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index fd02ff2ff..bc963d40e 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -15,6 +15,7 @@
 package iptables
 
 import (
+	"errors"
 	"fmt"
 	"net"
 	"time"
@@ -248,3 +249,48 @@ func (FilterInputDropAll) ContainerAction(ip net.IP) error {
 func (FilterInputDropAll) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, dropPort, sendloopDuration)
 }
+
+// FilterInputMultiUDPRules verifies that multiple UDP rules are applied
+// correctly. This has the added benefit of testing whether we're serializing
+// rules correctly -- if we do it incorrectly, the iptables tool will
+// misunderstand and save the wrong tables.
+type FilterInputMultiUDPRules struct{}
+
+func (FilterInputMultiUDPRules) Name() string {
+	return "FilterInputMultiUDPRules"
+}
+
+func (FilterInputMultiUDPRules) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+		return err
+	}
+	// if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"); err != nil {
+	// 	return err
+	// }
+	return filterTable("-L")
+}
+
+func (FilterInputMultiUDPRules) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// FilterInputRequireProtocolUDP checks that "-m udp" requires "-p udp" to be
+// specified.
+type FilterInputRequireProtocolUDP struct{}
+
+func (FilterInputRequireProtocolUDP) Name() string {
+	return "FilterInputRequireProtocolUDP"
+}
+
+func (FilterInputRequireProtocolUDP) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err == nil {
+		return errors.New("expected iptables to fail with out \"-p udp\", but succeeded")
+	}
+	return nil
+}
+
+func (FilterInputRequireProtocolUDP) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
-- 
cgit v1.2.3


From dc9989720562e5df0131986eed3e3d681616db57 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 21 Jan 2020 14:23:12 -0800
Subject: Add missing verb

PiperOrigin-RevId: 290821997
---
 pkg/abi/linux/netlink_route.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go
index 3898d2314..0e3582ab6 100644
--- a/pkg/abi/linux/netlink_route.go
+++ b/pkg/abi/linux/netlink_route.go
@@ -190,7 +190,7 @@ const (
 	ARPHRD_LOOPBACK = 772
 )
 
-// RouteMessage struct rtmsg, from uapi/linux/rtnetlink.h.
+// RouteMessage is struct rtmsg, from uapi/linux/rtnetlink.h.
 type RouteMessage struct {
 	Family uint8
 	DstLen uint8
-- 
cgit v1.2.3


From 2661101ad470548cb15dce0afc694296668d780a Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Jan 2020 14:51:28 -0800
Subject: Removed TCP work (saved in ipt-tcp-match).

---
 pkg/abi/linux/netfilter.go               |  52 -------------
 pkg/sentry/socket/netfilter/netfilter.go |  26 -------
 pkg/tcpip/iptables/tcp_matcher.go        | 122 -------------------------------
 3 files changed, 200 deletions(-)
 delete mode 100644 pkg/tcpip/iptables/tcp_matcher.go

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index fb4588272..f0e544f9c 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -341,58 +341,6 @@ func goString(cstring []byte) string {
 	return string(cstring)
 }
 
-// XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp
-// in include/uapi/linux/netfilter/xt_tcpudp.h.
-type XTTCP struct {
-	// SourcePortStart specifies the inclusive start of the range of source
-	// ports to which the matcher applies.
-	SourcePortStart uint16
-
-	// SourcePortEnd specifies the inclusive end of the range of source ports
-	// to which the matcher applies.
-	SourcePortEnd uint16
-
-	// DestinationPortStart specifies the start of the destination port
-	// range to which the matcher applies.
-	DestinationPortStart uint16
-
-	// DestinationPortEnd specifies the start of the destination port
-	// range to which the matcher applies.
-	DestinationPortEnd uint16
-
-	// Option specifies that a particular TCP option must be set.
-	Option uint8
-
-	// FlagMask masks the FlagCompare byte when comparing to the TCP flag
-	// fields.
-	FlagMask uint8
-
-	// FlagCompare is binary and-ed with the TCP flag fields.
-	FlagCompare uint8
-
-	// InverseFlags flips the meaning of certain fields. See the
-	// TX_TCP_INV_* flags.
-	InverseFlags uint8
-}
-
-// SizeOfXTTCP is the size of an XTTCP.
-const SizeOfXTTCP = 12
-
-// Flags in XTTCP.InverseFlags. Corresponding constants are in
-// include/uapi/linux/netfilter/xt_tcpudp.h.
-const (
-	// Invert the meaning of SourcePortStart/End.
-	XT_TCP_INV_SRCPT = 0x01
-	// Invert the meaning of DestinationPortStart/End.
-	XT_TCP_INV_DSTPT = 0x02
-	// Invert the meaning of FlagCompare.
-	XT_TCP_INV_FLAGS = 0x04
-	// Invert the meaning of Option.
-	XT_TCP_INV_OPTION = 0x08
-	// Enable all flags.
-	XT_TCP_INV_MASK = 0x0F
-)
-
 // XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp
 // in include/uapi/linux/netfilter/xt_tcpudp.h.
 type XTUDP struct {
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 45296b339..f8ed1acbc 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -131,7 +131,6 @@ func FillDefaultIPTables(stack *stack.Stack) {
 	stack.SetIPTables(ipt)
 }
 
-// TODO: Return proto.
 // convertNetstackToBinary converts the iptables as stored in netstack to the
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
@@ -456,31 +455,6 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 		var matcher iptables.Matcher
 		var err error
 		switch match.Name.String() {
-		case "tcp":
-			if len(buf) < linux.SizeOfXTTCP {
-				log.Warningf("netfilter: optVal has insufficient size for TCP match: %d", len(optVal))
-				return nil, syserr.ErrInvalidArgument
-			}
-			var matchData linux.XTTCP
-			// For alignment reasons, the match's total size may exceed what's
-			// strictly necessary to hold matchData.
-			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
-			log.Infof("parseMatchers: parsed XTTCP: %+v", matchData)
-			matcher, err = iptables.NewTCPMatcher(filter, iptables.TCPMatcherData{
-				SourcePortStart:      matchData.SourcePortStart,
-				SourcePortEnd:        matchData.SourcePortEnd,
-				DestinationPortStart: matchData.DestinationPortStart,
-				DestinationPortEnd:   matchData.DestinationPortEnd,
-				Option:               matchData.Option,
-				FlagMask:             matchData.FlagMask,
-				FlagCompare:          matchData.FlagCompare,
-				InverseFlags:         matchData.InverseFlags,
-			})
-			if err != nil {
-				log.Warningf("netfilter: failed to create TCP matcher: %v", err)
-				return nil, syserr.ErrInvalidArgument
-			}
-
 		case "udp":
 			if len(buf) < linux.SizeOfXTUDP {
 				log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal))
diff --git a/pkg/tcpip/iptables/tcp_matcher.go b/pkg/tcpip/iptables/tcp_matcher.go
deleted file mode 100644
index 6acbd6eb9..000000000
--- a/pkg/tcpip/iptables/tcp_matcher.go
+++ /dev/null
@@ -1,122 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package iptables
-
-import (
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-)
-
-type TCPMatcher struct {
-	data TCPMatcherData
-
-	// tablename string
-	// unsigned int matchsize;
-	// unsigned int usersize;
-	// #ifdef CONFIG_COMPAT
-	// unsigned int compatsize;
-	// #endif
-	// unsigned int hooks;
-	// unsigned short proto;
-	// unsigned short family;
-}
-
-// TODO: Delete?
-// MatchCheckEntryParams
-
-type TCPMatcherData struct {
-	// Filter IPHeaderFilter
-
-	SourcePortStart      uint16
-	SourcePortEnd        uint16
-	DestinationPortStart uint16
-	DestinationPortEnd   uint16
-	Option               uint8
-	FlagMask             uint8
-	FlagCompare          uint8
-	InverseFlags         uint8
-}
-
-func NewTCPMatcher(filter IPHeaderFilter, data TCPMatcherData) (Matcher, error) {
-	// TODO: We currently only support source port and destination port.
-	log.Infof("Adding rule with TCPMatcherData: %+v", data)
-
-	if data.Option != 0 ||
-		data.FlagMask != 0 ||
-		data.FlagCompare != 0 ||
-		data.InverseFlags != 0 {
-		return nil, fmt.Errorf("unsupported TCP matcher flags set")
-	}
-
-	if filter.Protocol != header.TCPProtocolNumber {
-		log.Warningf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber)
-	}
-
-	return &TCPMatcher{data: data}, nil
-}
-
-// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments).
-func (tm *TCPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
-	netHeader := header.IPv4(pkt.NetworkHeader)
-
-	// TODO: Do we check proto here or elsewhere? I think elsewhere (check
-	// codesearch).
-	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
-		return false, false
-	}
-
-	// We dont't match fragments.
-	if frag := netHeader.FragmentOffset(); frag != 0 {
-		if frag == 1 {
-			log.Warningf("Dropping TCP packet: malicious packet with fragment with fragment offest of 1.")
-			return false, true
-		}
-		return false, false
-	}
-
-	// Now we need the transport header. However, this may not have been set
-	// yet.
-	// TODO
-	var tcpHeader header.TCP
-	if pkt.TransportHeader != nil {
-		tcpHeader = header.TCP(pkt.TransportHeader)
-	} else {
-		// The TCP header hasn't been parsed yet. We have to do it here.
-		if len(pkt.Data.First()) < header.TCPMinimumSize {
-			// There's no valid TCP header here, so we hotdrop the
-			// packet.
-			// TODO: Stats.
-			log.Warningf("Dropping TCP packet: size to small.")
-			return false, true
-		}
-		tcpHeader = header.TCP(pkt.Data.First())
-	}
-
-	// Check whether the source and destination ports are within the
-	// matching range.
-	sourcePort := tcpHeader.SourcePort()
-	destinationPort := tcpHeader.DestinationPort()
-	if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort {
-		return false, false
-	}
-	if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort {
-		return false, false
-	}
-
-	return true, false
-}
-- 
cgit v1.2.3


From 421b6ff18154f80ea8cbbfd8340042ab458bf813 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Jan 2020 14:54:39 -0800
Subject: Passes all filter table UDP tests.

---
 pkg/tcpip/iptables/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index ff4e3c932..e41c645ed 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -7,7 +7,6 @@ go_library(
     srcs = [
         "iptables.go",
         "targets.go",
-        "tcp_matcher.go",
         "types.go",
         "udp_matcher.go",
     ],
-- 
cgit v1.2.3


From ad1968ed5665c7541d6920edbd7c7492b7db3046 Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Tue, 21 Jan 2020 14:25:14 -0800
Subject: Implement sysfs.

PiperOrigin-RevId: 290822487
---
 pkg/abi/linux/file.go                   |  13 +++
 pkg/sentry/fsimpl/kernfs/BUILD          |   3 +-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go | 175 ++++++-------------------------
 pkg/sentry/fsimpl/proc/BUILD            |  13 +--
 pkg/sentry/fsimpl/proc/boot_test.go     | 149 --------------------------
 pkg/sentry/fsimpl/proc/tasks_test.go    |  13 +--
 pkg/sentry/fsimpl/sys/BUILD             |  35 +++++++
 pkg/sentry/fsimpl/sys/sys.go            | 124 ++++++++++++++++++++++
 pkg/sentry/fsimpl/sys/sys_test.go       |  90 ++++++++++++++++
 pkg/sentry/fsimpl/testutil/BUILD        |  36 +++++++
 pkg/sentry/fsimpl/testutil/kernel.go    | 149 ++++++++++++++++++++++++++
 pkg/sentry/fsimpl/testutil/testutil.go  | 178 ++++++++++++++++++++++++++++++++
 12 files changed, 663 insertions(+), 315 deletions(-)
 delete mode 100644 pkg/sentry/fsimpl/proc/boot_test.go
 create mode 100644 pkg/sentry/fsimpl/sys/BUILD
 create mode 100644 pkg/sentry/fsimpl/sys/sys.go
 create mode 100644 pkg/sentry/fsimpl/sys/sys_test.go
 create mode 100644 pkg/sentry/fsimpl/testutil/BUILD
 create mode 100644 pkg/sentry/fsimpl/testutil/kernel.go
 create mode 100644 pkg/sentry/fsimpl/testutil/testutil.go

diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index 16791d03e..6fbdd668d 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -180,6 +180,19 @@ const (
 	DT_WHT     = 14
 )
 
+// DirentType are the friendly strings for linux_dirent64.d_type.
+var DirentType = abi.ValueSet{
+	DT_UNKNOWN: "DT_UNKNOWN",
+	DT_FIFO:    "DT_FIFO",
+	DT_CHR:     "DT_CHR",
+	DT_DIR:     "DT_DIR",
+	DT_BLK:     "DT_BLK",
+	DT_REG:     "DT_REG",
+	DT_LNK:     "DT_LNK",
+	DT_SOCK:    "DT_SOCK",
+	DT_WHT:     "DT_WHT",
+}
+
 // Values for preadv2/pwritev2.
 const (
 	// Note: gVisor does not implement the RWF_HIPRI feature, but the flag is
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 809178250..66d409785 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -51,13 +51,12 @@ go_test(
     deps = [
         ":kernfs",
         "//pkg/abi/linux",
-        "//pkg/fspath",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/fsimpl/testutil",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
-        "//pkg/sync",
         "//pkg/syserror",
         "@com_github_google_go-cmp//cmp:go_default_library",
     ],
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 5c9d580e1..a5fdfbde5 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -17,20 +17,17 @@ package kernfs_test
 import (
 	"bytes"
 	"fmt"
-	"io"
-	"runtime"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -41,21 +38,11 @@ const staticFileContent = "This is sample content for a static test file."
 // filesystem. See newTestSystem.
 type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
 
-// TestSystem represents the context for a single test.
-type TestSystem struct {
-	t     *testing.T
-	ctx   context.Context
-	creds *auth.Credentials
-	vfs   *vfs.VirtualFilesystem
-	mns   *vfs.MountNamespace
-	root  vfs.VirtualDentry
-}
-
 // newTestSystem sets up a minimal environment for running a test, including an
 // instance of a test filesystem. Tests can control the contents of the
 // filesystem by providing an appropriate rootFn, which should return a
 // pre-populated root dentry.
-func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem {
+func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
 	ctx := contexttest.Context(t)
 	creds := auth.CredentialsFromContext(ctx)
 	v := vfs.New()
@@ -66,57 +53,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem {
 	if err != nil {
 		t.Fatalf("Failed to create testfs root mount: %v", err)
 	}
-
-	s := &TestSystem{
-		t:     t,
-		ctx:   ctx,
-		creds: creds,
-		vfs:   v,
-		mns:   mns,
-		root:  mns.Root(),
-	}
-	runtime.SetFinalizer(s, func(s *TestSystem) { s.root.DecRef() })
-	return s
-}
-
-// PathOpAtRoot constructs a vfs.PathOperation for a path from the
-// root of the test filesystem.
-//
-// Precondition: path should be relative path.
-func (s *TestSystem) PathOpAtRoot(path string) vfs.PathOperation {
-	return vfs.PathOperation{
-		Root:  s.root,
-		Start: s.root,
-		Path:  fspath.Parse(path),
-	}
-}
-
-// GetDentryOrDie attempts to resolve a dentry referred to by the
-// provided path operation. If unsuccessful, the test fails.
-func (s *TestSystem) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry {
-	vd, err := s.vfs.GetDentryAt(s.ctx, s.creds, &pop, &vfs.GetDentryOptions{})
-	if err != nil {
-		s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err)
-	}
-	return vd
-}
-
-func (s *TestSystem) ReadToEnd(fd *vfs.FileDescription) (string, error) {
-	buf := make([]byte, usermem.PageSize)
-	bufIOSeq := usermem.BytesIOSequence(buf)
-	opts := vfs.ReadOptions{}
-
-	var content bytes.Buffer
-	for {
-		n, err := fd.Impl().Read(s.ctx, bufIOSeq, opts)
-		if n == 0 || err != nil {
-			if err == io.EOF {
-				err = nil
-			}
-			return content.String(), err
-		}
-		content.Write(buf[:n])
-	}
+	return testutil.NewSystem(ctx, t, v, mns)
 }
 
 type fsType struct {
@@ -260,6 +197,7 @@ func TestBasic(t *testing.T) {
 			"file1": fs.newFile(creds, staticFileContent),
 		})
 	})
+	defer sys.Destroy()
 	sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef()
 }
 
@@ -269,9 +207,10 @@ func TestMkdirGetDentry(t *testing.T) {
 			"dir1": fs.newDir(creds, 0755, nil),
 		})
 	})
+	defer sys.Destroy()
 
 	pop := sys.PathOpAtRoot("dir1/a new directory")
-	if err := sys.vfs.MkdirAt(sys.ctx, sys.creds, &pop, &vfs.MkdirOptions{Mode: 0755}); err != nil {
+	if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, &pop, &vfs.MkdirOptions{Mode: 0755}); err != nil {
 		t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err)
 	}
 	sys.GetDentryOrDie(pop).DecRef()
@@ -283,20 +222,21 @@ func TestReadStaticFile(t *testing.T) {
 			"file1": fs.newFile(creds, staticFileContent),
 		})
 	})
+	defer sys.Destroy()
 
 	pop := sys.PathOpAtRoot("file1")
-	fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{})
 	if err != nil {
-		sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+		t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
 	}
 	defer fd.DecRef()
 
 	content, err := sys.ReadToEnd(fd)
 	if err != nil {
-		sys.t.Fatalf("Read failed: %v", err)
+		t.Fatalf("Read failed: %v", err)
 	}
 	if diff := cmp.Diff(staticFileContent, content); diff != "" {
-		sys.t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff)
+		t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff)
 	}
 }
 
@@ -306,83 +246,44 @@ func TestCreateNewFileInStaticDir(t *testing.T) {
 			"dir1": fs.newDir(creds, 0755, nil),
 		})
 	})
+	defer sys.Destroy()
 
 	pop := sys.PathOpAtRoot("dir1/newfile")
 	opts := &vfs.OpenOptions{Flags: linux.O_CREAT | linux.O_EXCL, Mode: defaultMode}
-	fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, opts)
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, opts)
 	if err != nil {
-		sys.t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err)
+		t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err)
 	}
 
 	// Close the file. The file should persist.
 	fd.DecRef()
 
-	fd, err = sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+	fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{})
 	if err != nil {
-		sys.t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
+		t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
 	}
 	fd.DecRef()
 }
 
-// direntCollector provides an implementation for vfs.IterDirentsCallback for
-// testing. It simply iterates to the end of a given directory FD and collects
-// all dirents emitted by the callback.
-type direntCollector struct {
-	mu      sync.Mutex
-	dirents map[string]vfs.Dirent
-}
-
-// Handle implements vfs.IterDirentsCallback.Handle.
-func (d *direntCollector) Handle(dirent vfs.Dirent) bool {
-	d.mu.Lock()
-	if d.dirents == nil {
-		d.dirents = make(map[string]vfs.Dirent)
-	}
-	d.dirents[dirent.Name] = dirent
-	d.mu.Unlock()
-	return true
-}
-
-// count returns the number of dirents currently in the collector.
-func (d *direntCollector) count() int {
-	d.mu.Lock()
-	defer d.mu.Unlock()
-	return len(d.dirents)
-}
-
-// contains checks whether the collector has a dirent with the given name and
-// type.
-func (d *direntCollector) contains(name string, typ uint8) error {
-	d.mu.Lock()
-	defer d.mu.Unlock()
-	dirent, ok := d.dirents[name]
-	if !ok {
-		return fmt.Errorf("No dirent named %q found", name)
-	}
-	if dirent.Type != typ {
-		return fmt.Errorf("Dirent named %q found, but was expecting type %d, got: %+v", name, typ, dirent)
-	}
-	return nil
-}
-
 func TestDirFDReadWrite(t *testing.T) {
 	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
 		return fs.newReadonlyDir(creds, 0755, nil)
 	})
+	defer sys.Destroy()
 
 	pop := sys.PathOpAtRoot("/")
-	fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{})
 	if err != nil {
-		sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+		t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
 	}
 	defer fd.DecRef()
 
 	// Read/Write should fail for directory FDs.
-	if _, err := fd.Read(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR {
-		sys.t.Fatalf("Read for directory FD failed with unexpected error: %v", err)
+	if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR {
+		t.Fatalf("Read for directory FD failed with unexpected error: %v", err)
 	}
-	if _, err := fd.Write(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR {
-		sys.t.Fatalf("Wrire for directory FD failed with unexpected error: %v", err)
+	if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR {
+		t.Fatalf("Write for directory FD failed with unexpected error: %v", err)
 	}
 }
 
@@ -397,30 +298,12 @@ func TestDirFDIterDirents(t *testing.T) {
 			"file1": fs.newFile(creds, staticFileContent),
 		})
 	})
+	defer sys.Destroy()
 
 	pop := sys.PathOpAtRoot("/")
-	fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
-	if err != nil {
-		sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
-	}
-	defer fd.DecRef()
-
-	collector := &direntCollector{}
-	if err := fd.IterDirents(sys.ctx, collector); err != nil {
-		sys.t.Fatalf("IterDirent failed: %v", err)
-	}
-
-	// Root directory should contain ".", ".." and 3 children:
-	if collector.count() != 5 {
-		sys.t.Fatalf("IterDirent returned too many dirents")
-	}
-	for _, dirName := range []string{".", "..", "dir1", "dir2"} {
-		if err := collector.contains(dirName, linux.DT_DIR); err != nil {
-			sys.t.Fatalf("IterDirent had unexpected results: %v", err)
-		}
-	}
-	if err := collector.contains("file1", linux.DT_REG); err != nil {
-		sys.t.Fatalf("IterDirent had unexpected results: %v", err)
-	}
-
+	sys.AssertDirectoryContains(&pop, map[string]testutil.DirentType{
+		"dir1":  linux.DT_DIR,
+		"dir2":  linux.DT_DIR,
+		"file1": linux.DT_REG,
+	})
 }
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index f69aa19c4..c5b79fb38 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -44,30 +44,19 @@ go_test(
     name = "proc_test",
     size = "small",
     srcs = [
-        "boot_test.go",
         "tasks_sys_test.go",
         "tasks_test.go",
     ],
     embed = [":proc"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/cpuid",
         "//pkg/fspath",
-        "//pkg/memutil",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
-        "//pkg/sentry/fs",
+        "//pkg/sentry/fsimpl/testutil",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/kernel/sched",
-        "//pkg/sentry/limits",
-        "//pkg/sentry/loader",
-        "//pkg/sentry/pgalloc",
-        "//pkg/sentry/platform",
-        "//pkg/sentry/platform/kvm",
-        "//pkg/sentry/platform/ptrace",
-        "//pkg/sentry/time",
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
diff --git a/pkg/sentry/fsimpl/proc/boot_test.go b/pkg/sentry/fsimpl/proc/boot_test.go
deleted file mode 100644
index 84a93ee56..000000000
--- a/pkg/sentry/fsimpl/proc/boot_test.go
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"flag"
-	"fmt"
-	"os"
-	"runtime"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/cpuid"
-	"gvisor.dev/gvisor/pkg/memutil"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
-	"gvisor.dev/gvisor/pkg/sentry/limits"
-	"gvisor.dev/gvisor/pkg/sentry/loader"
-	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/time"
-
-	// Platforms are plugable.
-	_ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
-	_ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
-)
-
-var (
-	platformFlag = flag.String("platform", "ptrace", "specify which platform to use")
-)
-
-// boot initializes a new bare bones kernel for test.
-func boot() (*kernel.Kernel, error) {
-	platformCtr, err := platform.Lookup(*platformFlag)
-	if err != nil {
-		return nil, fmt.Errorf("platform not found: %v", err)
-	}
-	deviceFile, err := platformCtr.OpenDevice()
-	if err != nil {
-		return nil, fmt.Errorf("creating platform: %v", err)
-	}
-	plat, err := platformCtr.New(deviceFile)
-	if err != nil {
-		return nil, fmt.Errorf("creating platform: %v", err)
-	}
-
-	k := &kernel.Kernel{
-		Platform: plat,
-	}
-
-	mf, err := createMemoryFile()
-	if err != nil {
-		return nil, err
-	}
-	k.SetMemoryFile(mf)
-
-	// Pass k as the platform since it is savable, unlike the actual platform.
-	vdso, err := loader.PrepareVDSO(nil, k)
-	if err != nil {
-		return nil, fmt.Errorf("creating vdso: %v", err)
-	}
-
-	// Create timekeeper.
-	tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
-	if err != nil {
-		return nil, fmt.Errorf("creating timekeeper: %v", err)
-	}
-	tk.SetClocks(time.NewCalibratedClocks())
-
-	creds := auth.NewRootCredentials(auth.NewRootUserNamespace())
-
-	// Initiate the Kernel object, which is required by the Context passed
-	// to createVFS in order to mount (among other things) procfs.
-	if err = k.Init(kernel.InitKernelArgs{
-		ApplicationCores:            uint(runtime.GOMAXPROCS(-1)),
-		FeatureSet:                  cpuid.HostFeatureSet(),
-		Timekeeper:                  tk,
-		RootUserNamespace:           creds.UserNamespace,
-		Vdso:                        vdso,
-		RootUTSNamespace:            kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace),
-		RootIPCNamespace:            kernel.NewIPCNamespace(creds.UserNamespace),
-		RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
-		PIDNamespace:                kernel.NewRootPIDNamespace(creds.UserNamespace),
-	}); err != nil {
-		return nil, fmt.Errorf("initializing kernel: %v", err)
-	}
-
-	ctx := k.SupervisorContext()
-
-	// Create mount namespace without root as it's the minimum required to create
-	// the global thread group.
-	mntns, err := fs.NewMountNamespace(ctx, nil)
-	if err != nil {
-		return nil, err
-	}
-	ls, err := limits.NewLinuxLimitSet()
-	if err != nil {
-		return nil, err
-	}
-	tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
-	k.TestOnly_SetGlobalInit(tg)
-
-	return k, nil
-}
-
-// createTask creates a new bare bones task for tests.
-func createTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) {
-	k := kernel.KernelFromContext(ctx)
-	config := &kernel.TaskConfig{
-		Kernel:                  k,
-		ThreadGroup:             tc,
-		TaskContext:             &kernel.TaskContext{Name: name},
-		Credentials:             auth.CredentialsFromContext(ctx),
-		AllowedCPUMask:          sched.NewFullCPUSet(k.ApplicationCores()),
-		UTSNamespace:            kernel.UTSNamespaceFromContext(ctx),
-		IPCNamespace:            kernel.IPCNamespaceFromContext(ctx),
-		AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
-	}
-	return k.TaskSet().NewTask(config)
-}
-
-func createMemoryFile() (*pgalloc.MemoryFile, error) {
-	const memfileName = "test-memory"
-	memfd, err := memutil.CreateMemFD(memfileName, 0)
-	if err != nil {
-		return nil, fmt.Errorf("error creating memfd: %v", err)
-	}
-	memfile := os.NewFile(uintptr(memfd), memfileName)
-	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
-	if err != nil {
-		memfile.Close()
-		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
-	}
-	return mf, nil
-}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 002d2f73b..41977d816 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -24,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -134,7 +135,7 @@ func checkFiles(gots []vfs.Dirent, wants map[string]vfs.Dirent) ([]vfs.Dirent, e
 }
 
 func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) {
-	k, err := boot()
+	k, err := testutil.Boot()
 	if err != nil {
 		return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("creating kernel: %v", err)
 	}
@@ -206,7 +207,7 @@ func TestTasks(t *testing.T) {
 	var tasks []*kernel.Task
 	for i := 0; i < 5; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		task, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc)
+		task, err := testutil.CreateTask(ctx, fmt.Sprintf("name-%d", i), tc)
 		if err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
@@ -298,7 +299,7 @@ func TestTasksOffset(t *testing.T) {
 	k := kernel.KernelFromContext(ctx)
 	for i := 0; i < 3; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		if _, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc); err != nil {
+		if _, err := testutil.CreateTask(ctx, fmt.Sprintf("name-%d", i), tc); err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
 	}
@@ -417,7 +418,7 @@ func TestTask(t *testing.T) {
 
 	k := kernel.KernelFromContext(ctx)
 	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-	_, err = createTask(ctx, "name", tc)
+	_, err = testutil.CreateTask(ctx, "name", tc)
 	if err != nil {
 		t.Fatalf("CreateTask(): %v", err)
 	}
@@ -458,7 +459,7 @@ func TestProcSelf(t *testing.T) {
 
 	k := kernel.KernelFromContext(ctx)
 	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-	task, err := createTask(ctx, "name", tc)
+	task, err := testutil.CreateTask(ctx, "name", tc)
 	if err != nil {
 		t.Fatalf("CreateTask(): %v", err)
 	}
@@ -555,7 +556,7 @@ func TestTree(t *testing.T) {
 	var tasks []*kernel.Task
 	for i := 0; i < 5; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		task, err := createTask(uberCtx, fmt.Sprintf("name-%d", i), tc)
+		task, err := testutil.CreateTask(uberCtx, fmt.Sprintf("name-%d", i), tc)
 		if err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
new file mode 100644
index 000000000..ee3c842bd
--- /dev/null
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -0,0 +1,35 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "sys",
+    srcs = [
+        "sys.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/context",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
+
+go_test(
+    name = "sys_test",
+    srcs = ["sys_test.go"],
+    deps = [
+        ":sys",
+        "//pkg/abi/linux",
+        "//pkg/sentry/fsimpl/testutil",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
new file mode 100644
index 000000000..1305ad01d
--- /dev/null
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -0,0 +1,124 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sys implements sysfs.
+package sys
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	kernfs.Filesystem
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	fs := &filesystem{}
+	fs.Filesystem.Init(vfsObj)
+	k := kernel.KernelFromContext(ctx)
+	maxCPUCores := k.ApplicationCores()
+	defaultSysDirMode := linux.FileMode(0755)
+
+	root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+		"block": fs.newDir(creds, defaultSysDirMode, nil),
+		"bus":   fs.newDir(creds, defaultSysDirMode, nil),
+		"class": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+			"power_supply": fs.newDir(creds, defaultSysDirMode, nil),
+		}),
+		"dev": fs.newDir(creds, defaultSysDirMode, nil),
+		"devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+			"system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+				"cpu": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+					"online":   fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
+					"possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
+					"present":  fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
+				}),
+			}),
+		}),
+		"firmware": fs.newDir(creds, defaultSysDirMode, nil),
+		"fs":       fs.newDir(creds, defaultSysDirMode, nil),
+		"kernel":   fs.newDir(creds, defaultSysDirMode, nil),
+		"module":   fs.newDir(creds, defaultSysDirMode, nil),
+		"power":    fs.newDir(creds, defaultSysDirMode, nil),
+	})
+	return fs.VFSFilesystem(), root.VFSDentry(), nil
+}
+
+// dir implements kernfs.Inode.
+type dir struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoDynamicLookup
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+
+	kernfs.OrderedChildren
+	dentry kernfs.Dentry
+}
+
+func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+	d := &dir{}
+	d.InodeAttrs.Init(creds, fs.NextIno(), linux.ModeDirectory|0755)
+	d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	d.dentry.Init(d)
+
+	d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents))
+
+	return &d.dentry
+}
+
+// SetStat implements kernfs.Inode.SetStat.
+func (d *dir) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+// Open implements kernfs.Inode.Open.
+func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	fd := &kernfs.GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+	return fd.VFSFileDescription(), nil
+}
+
+// cpuFile implements kernfs.Inode.
+type cpuFile struct {
+	kernfs.DynamicBytesFile
+	maxCores uint
+}
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "0-%d", c.maxCores-1)
+	return nil
+}
+
+func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry {
+	c := &cpuFile{maxCores: maxCores}
+	c.DynamicBytesFile.Init(creds, fs.NextIno(), c, mode)
+	d := &kernfs.Dentry{}
+	d.Init(c)
+	return d
+}
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
new file mode 100644
index 000000000..60a1634a9
--- /dev/null
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -0,0 +1,90 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sys_test
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func newTestSystem(t *testing.T) *testutil.System {
+	k, err := testutil.Boot()
+	if err != nil {
+		t.Fatalf("Failed to create test kernel: %v", err)
+	}
+	ctx := k.SupervisorContext()
+	creds := auth.CredentialsFromContext(ctx)
+	v := vfs.New()
+	v.MustRegisterFilesystemType("sysfs", sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+
+	mns, err := v.NewMountNamespace(ctx, creds, "", "sysfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("Failed to create new mount namespace: %v", err)
+	}
+	return testutil.NewSystem(ctx, t, v, mns)
+}
+
+func TestReadCPUFile(t *testing.T) {
+	s := newTestSystem(t)
+	defer s.Destroy()
+	k := kernel.KernelFromContext(s.Ctx)
+	maxCPUCores := k.ApplicationCores()
+
+	expected := fmt.Sprintf("0-%d", maxCPUCores-1)
+
+	for _, fname := range []string{"online", "possible", "present"} {
+		pop := s.PathOpAtRoot(fmt.Sprintf("devices/system/cpu/%s", fname))
+		fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, &pop, &vfs.OpenOptions{})
+		if err != nil {
+			t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
+		}
+		defer fd.DecRef()
+		content, err := s.ReadToEnd(fd)
+		if err != nil {
+			t.Fatalf("Read failed: %v", err)
+		}
+		if diff := cmp.Diff(expected, content); diff != "" {
+			t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff)
+		}
+	}
+}
+
+func TestSysRootContainsExpectedEntries(t *testing.T) {
+	s := newTestSystem(t)
+	defer s.Destroy()
+	pop := s.PathOpAtRoot("/")
+	s.AssertDirectoryContains(&pop, map[string]testutil.DirentType{
+		"block":    linux.DT_DIR,
+		"bus":      linux.DT_DIR,
+		"class":    linux.DT_DIR,
+		"dev":      linux.DT_DIR,
+		"devices":  linux.DT_DIR,
+		"firmware": linux.DT_DIR,
+		"fs":       linux.DT_DIR,
+		"kernel":   linux.DT_DIR,
+		"module":   linux.DT_DIR,
+		"power":    linux.DT_DIR,
+	})
+}
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
new file mode 100644
index 000000000..4e70d84a7
--- /dev/null
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -0,0 +1,36 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "testutil",
+    testonly = 1,
+    srcs = [
+        "kernel.go",
+        "testutil.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil",
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/cpuid",
+        "//pkg/fspath",
+        "//pkg/memutil",
+        "//pkg/sentry/context",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/sched",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/platform/kvm",
+        "//pkg/sentry/platform/ptrace",
+        "//pkg/sentry/time",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
new file mode 100644
index 000000000..295da2d52
--- /dev/null
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -0,0 +1,149 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testutil
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"runtime"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/memutil"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/time"
+
+	// Platforms are plugable.
+	_ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
+	_ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
+)
+
+var (
+	platformFlag = flag.String("platform", "ptrace", "specify which platform to use")
+)
+
+// Boot initializes a new bare bones kernel for test.
+func Boot() (*kernel.Kernel, error) {
+	platformCtr, err := platform.Lookup(*platformFlag)
+	if err != nil {
+		return nil, fmt.Errorf("platform not found: %v", err)
+	}
+	deviceFile, err := platformCtr.OpenDevice()
+	if err != nil {
+		return nil, fmt.Errorf("creating platform: %v", err)
+	}
+	plat, err := platformCtr.New(deviceFile)
+	if err != nil {
+		return nil, fmt.Errorf("creating platform: %v", err)
+	}
+
+	k := &kernel.Kernel{
+		Platform: plat,
+	}
+
+	mf, err := createMemoryFile()
+	if err != nil {
+		return nil, err
+	}
+	k.SetMemoryFile(mf)
+
+	// Pass k as the platform since it is savable, unlike the actual platform.
+	vdso, err := loader.PrepareVDSO(nil, k)
+	if err != nil {
+		return nil, fmt.Errorf("creating vdso: %v", err)
+	}
+
+	// Create timekeeper.
+	tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
+	if err != nil {
+		return nil, fmt.Errorf("creating timekeeper: %v", err)
+	}
+	tk.SetClocks(time.NewCalibratedClocks())
+
+	creds := auth.NewRootCredentials(auth.NewRootUserNamespace())
+
+	// Initiate the Kernel object, which is required by the Context passed
+	// to createVFS in order to mount (among other things) procfs.
+	if err = k.Init(kernel.InitKernelArgs{
+		ApplicationCores:            uint(runtime.GOMAXPROCS(-1)),
+		FeatureSet:                  cpuid.HostFeatureSet(),
+		Timekeeper:                  tk,
+		RootUserNamespace:           creds.UserNamespace,
+		Vdso:                        vdso,
+		RootUTSNamespace:            kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace),
+		RootIPCNamespace:            kernel.NewIPCNamespace(creds.UserNamespace),
+		RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+		PIDNamespace:                kernel.NewRootPIDNamespace(creds.UserNamespace),
+	}); err != nil {
+		return nil, fmt.Errorf("initializing kernel: %v", err)
+	}
+
+	ctx := k.SupervisorContext()
+
+	// Create mount namespace without root as it's the minimum required to create
+	// the global thread group.
+	mntns, err := fs.NewMountNamespace(ctx, nil)
+	if err != nil {
+		return nil, err
+	}
+	ls, err := limits.NewLinuxLimitSet()
+	if err != nil {
+		return nil, err
+	}
+	tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
+	k.TestOnly_SetGlobalInit(tg)
+
+	return k, nil
+}
+
+// CreateTask creates a new bare bones task for tests.
+func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) {
+	k := kernel.KernelFromContext(ctx)
+	config := &kernel.TaskConfig{
+		Kernel:                  k,
+		ThreadGroup:             tc,
+		TaskContext:             &kernel.TaskContext{Name: name},
+		Credentials:             auth.CredentialsFromContext(ctx),
+		AllowedCPUMask:          sched.NewFullCPUSet(k.ApplicationCores()),
+		UTSNamespace:            kernel.UTSNamespaceFromContext(ctx),
+		IPCNamespace:            kernel.IPCNamespaceFromContext(ctx),
+		AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+	}
+	return k.TaskSet().NewTask(config)
+}
+
+func createMemoryFile() (*pgalloc.MemoryFile, error) {
+	const memfileName = "test-memory"
+	memfd, err := memutil.CreateMemFD(memfileName, 0)
+	if err != nil {
+		return nil, fmt.Errorf("error creating memfd: %v", err)
+	}
+	memfile := os.NewFile(uintptr(memfd), memfileName)
+	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
+	if err != nil {
+		memfile.Close()
+		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
+	}
+	return mf, nil
+}
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
new file mode 100644
index 000000000..eada31d94
--- /dev/null
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -0,0 +1,178 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package testutil provides common test utilities for kernfs-based
+// filesystems.
+package testutil
+
+import (
+	"fmt"
+	"io"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// System represents the context for a single test.
+//
+// Test systems must be explicitly destroyed with System.Destroy.
+type System struct {
+	t     *testing.T
+	Ctx   context.Context
+	Creds *auth.Credentials
+	VFS   *vfs.VirtualFilesystem
+	mns   *vfs.MountNamespace
+	root  vfs.VirtualDentry
+}
+
+// NewSystem constructs a System.
+//
+// Precondition: Caller must hold a reference on mns, whose ownership
+// is transferred to the new System.
+func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System {
+	s := &System{
+		t:     t,
+		Ctx:   ctx,
+		Creds: auth.CredentialsFromContext(ctx),
+		VFS:   v,
+		mns:   mns,
+		root:  mns.Root(),
+	}
+	return s
+}
+
+// Destroy release resources associated with a test system.
+func (s *System) Destroy() {
+	s.root.DecRef()
+	s.mns.DecRef(s.VFS) // Reference on mns passed to NewSystem.
+}
+
+// ReadToEnd reads the contents of fd until EOF to a string.
+func (s *System) ReadToEnd(fd *vfs.FileDescription) (string, error) {
+	buf := make([]byte, usermem.PageSize)
+	bufIOSeq := usermem.BytesIOSequence(buf)
+	opts := vfs.ReadOptions{}
+
+	var content strings.Builder
+	for {
+		n, err := fd.Read(s.Ctx, bufIOSeq, opts)
+		if n == 0 || err != nil {
+			if err == io.EOF {
+				err = nil
+			}
+			return content.String(), err
+		}
+		content.Write(buf[:n])
+	}
+}
+
+// PathOpAtRoot constructs a PathOperation with the given path from
+// the root of the filesystem.
+func (s *System) PathOpAtRoot(path string) vfs.PathOperation {
+	return vfs.PathOperation{
+		Root:  s.root,
+		Start: s.root,
+		Path:  fspath.Parse(path),
+	}
+}
+
+// GetDentryOrDie attempts to resolve a dentry referred to by the
+// provided path operation. If unsuccessful, the test fails.
+func (s *System) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry {
+	vd, err := s.VFS.GetDentryAt(s.Ctx, s.Creds, &pop, &vfs.GetDentryOptions{})
+	if err != nil {
+		s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err)
+	}
+	return vd
+}
+
+// DirentType is an alias for values for linux_dirent64.d_type.
+type DirentType = uint8
+
+// AssertDirectoryContains verifies that a directory at pop contains the entries
+// specified. AssertDirectoryContains implicitly checks for "." and "..", these
+// need not be included in entries.
+func (s *System) AssertDirectoryContains(pop *vfs.PathOperation, entries map[string]DirentType) {
+	// Also implicitly check for "." and "..".
+	entries["."] = linux.DT_DIR
+	entries[".."] = linux.DT_DIR
+
+	fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, &vfs.OpenOptions{Flags: linux.O_RDONLY})
+	if err != nil {
+		s.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+	}
+	defer fd.DecRef()
+
+	collector := &DirentCollector{}
+	if err := fd.IterDirents(s.Ctx, collector); err != nil {
+		s.t.Fatalf("IterDirent failed: %v", err)
+	}
+
+	collectedEntries := make(map[string]DirentType)
+	for _, dirent := range collector.dirents {
+		collectedEntries[dirent.Name] = dirent.Type
+	}
+	if diff := cmp.Diff(entries, collectedEntries); diff != "" {
+		s.t.Fatalf("IterDirent had unexpected results:\n--- want\n+++ got\n%v", diff)
+	}
+}
+
+// DirentCollector provides an implementation for vfs.IterDirentsCallback for
+// testing. It simply iterates to the end of a given directory FD and collects
+// all dirents emitted by the callback.
+type DirentCollector struct {
+	mu      sync.Mutex
+	dirents map[string]vfs.Dirent
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (d *DirentCollector) Handle(dirent vfs.Dirent) bool {
+	d.mu.Lock()
+	if d.dirents == nil {
+		d.dirents = make(map[string]vfs.Dirent)
+	}
+	d.dirents[dirent.Name] = dirent
+	d.mu.Unlock()
+	return true
+}
+
+// Count returns the number of dirents currently in the collector.
+func (d *DirentCollector) Count() int {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	return len(d.dirents)
+}
+
+// Contains checks whether the collector has a dirent with the given name and
+// type.
+func (d *DirentCollector) Contains(name string, typ uint8) error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	dirent, ok := d.dirents[name]
+	if !ok {
+		return fmt.Errorf("No dirent named %q found", name)
+	}
+	if dirent.Type != typ {
+		return fmt.Errorf("Dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent)
+	}
+	return nil
+}
-- 
cgit v1.2.3


From 7e6fbc6afe797752efe066a8aa86f9eca973f3a4 Mon Sep 17 00:00:00 2001
From: Mithun Iyer <iyerm@google.com>
Date: Tue, 21 Jan 2020 14:47:04 -0800
Subject: Add a new TCP stat for current open connections.

Such a stat accounts for all connections that are currently
established and not yet transitioned to close state.
Also fix bug in double increment of CurrentEstablished stat.

Fixes #1579

PiperOrigin-RevId: 290827365
---
 pkg/sentry/socket/netstack/netstack.go |  3 +-
 pkg/tcpip/tcpip.go                     |  6 ++-
 pkg/tcpip/transport/tcp/accept.go      |  1 -
 pkg/tcpip/transport/tcp/connect.go     |  1 +
 pkg/tcpip/transport/tcp/endpoint.go    |  1 +
 pkg/tcpip/transport/tcp/tcp_test.go    | 83 ++++++++++++++++++++++++++++++++++
 6 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 2662fbc0f..318acbeff 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -150,7 +150,8 @@ var Metrics = tcpip.Stats{
 	TCP: tcpip.TCPStats{
 		ActiveConnectionOpenings:           mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
 		PassiveConnectionOpenings:          mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
-		CurrentEstablished:                 mustCreateMetric("/netstack/tcp/current_established", "Number of connections in either ESTABLISHED or CLOSE-WAIT state now."),
+		CurrentEstablished:                 mustCreateMetric("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."),
+		CurrentConnected:                   mustCreateMetric("/netstack/tcp/current_open", "Number of connections that are in connected state."),
 		EstablishedResets:                  mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"),
 		EstablishedClosed:                  mustCreateMetric("/netstack/tcp/established_closed", "number of times established TCP connections made a transition to CLOSED state."),
 		EstablishedTimedout:                mustCreateMetric("/netstack/tcp/established_timedout", "Number of times  an established connection was reset because of keep-alive time out."),
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 3fc823a36..59c9b3fb0 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -938,9 +938,13 @@ type TCPStats struct {
 	PassiveConnectionOpenings *StatCounter
 
 	// CurrentEstablished is the number of TCP connections for which the
-	// current state is either ESTABLISHED or CLOSE-WAIT.
+	// current state is ESTABLISHED.
 	CurrentEstablished *StatCounter
 
+	// CurrentConnected is the number of TCP connections that
+	// are in connected state.
+	CurrentConnected *StatCounter
+
 	// EstablishedResets is the number of times TCP connections have made
 	// a direct transition to the CLOSED state from either the
 	// ESTABLISHED state or the CLOSE-WAIT state.
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 1a2e3efa9..d469758eb 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -562,7 +562,6 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// Switch state to connected.
 		// We do not use transitionToStateEstablishedLocked here as there is
 		// no handshake state available when doing a SYN cookie based accept.
-		n.stack.Stats().TCP.CurrentEstablished.Increment()
 		n.isConnectNotified = true
 		n.setEndpointState(StateEstablished)
 
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index a2f384384..4e3c5419c 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -934,6 +934,7 @@ func (e *endpoint) transitionToStateCloseLocked() {
 	// Mark the endpoint as fully closed for reads/writes.
 	e.cleanupLocked()
 	e.setEndpointState(StateClose)
+	e.stack.Stats().TCP.CurrentConnected.Decrement()
 	e.stack.Stats().TCP.EstablishedClosed.Increment()
 }
 
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 4797f11d1..13718ff55 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -594,6 +594,7 @@ func (e *endpoint) setEndpointState(state EndpointState) {
 	switch state {
 	case StateEstablished:
 		e.stack.Stats().TCP.CurrentEstablished.Increment()
+		e.stack.Stats().TCP.CurrentConnected.Increment()
 	case StateError:
 		fallthrough
 	case StateClose:
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index a9dfbe857..df2fb1071 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -470,6 +470,89 @@ func TestConnectResetAfterClose(t *testing.T) {
 	}
 }
 
+// TestCurrentConnectedIncrement tests increment of the current
+// established and connected counters.
+func TestCurrentConnectedIncrement(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Set TCPTimeWaitTimeout to 1 seconds so that sockets are marked closed
+	// after 1 second in TIME_WAIT state.
+	tcpTimeWaitTimeout := 1 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
+		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeout(%d) failed: %s", tcpTimeWaitTimeout, err)
+	}
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+	ep := c.EP
+	c.EP = nil
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 1 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 1", got)
+	}
+	gotConnected := c.Stack().Stats().TCP.CurrentConnected.Value()
+	if gotConnected != 1 {
+		t.Errorf("got stats.TCP.CurrentConnected.Value() = %v, want = 1", gotConnected)
+	}
+
+	ep.Close()
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+		),
+	)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+	}
+	if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != gotConnected {
+		t.Errorf("got stats.TCP.CurrentConnected.Value() = %v, want = %v", got, gotConnected)
+	}
+
+	// Ack and send FIN as well.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// Check that the stack acks the FIN.
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+2),
+			checker.AckNum(791),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+
+	// Wait for the TIME-WAIT state to transition to CLOSED.
+	time.Sleep(1 * time.Second)
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+	}
+	if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentConnected.Value() = %v, want = 0", got)
+	}
+}
+
 // TestClosingWithEnqueuedSegments tests handling of still enqueued segments
 // when the endpoint transitions to StateClose. The in-flight segments would be
 // re-enqueued to a any listening endpoint.
-- 
cgit v1.2.3


From 538053538dfb378aa8bc512d484ea305177e617b Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Jan 2020 16:51:17 -0800
Subject: Adding serialization.

---
 pkg/sentry/socket/netfilter/netfilter.go | 29 ++++++++++++++++++++++++++++-
 pkg/tcpip/iptables/udp_matcher.go        | 14 +++++++-------
 2 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index f8ed1acbc..3caabca9a 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -196,7 +196,9 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 }
 
 func marshalMatcher(matcher iptables.Matcher) []byte {
-	switch matcher.(type) {
+	switch m := matcher.(type) {
+	case *iptables.UDPMatcher:
+		return marshalUDPMatcher(m)
 	default:
 		// TODO(gvisor.dev/issue/170): We don't support any matchers
 		// yet, so any call to marshalMatcher will panic.
@@ -204,6 +206,31 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
 	}
 }
 
+func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
+	type udpMatch struct {
+		linux.XTEntryMatch
+		linux.XTUDP
+	}
+	linuxMatcher := udpMatch{
+		XTEntryMatch: linux.XTEntryMatch{
+			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP,
+			// Name:      "udp",
+		},
+		XTUDP: linux.XTUDP{
+			SourcePortStart:      matcher.Data.SourcePortStart,
+			SourcePortEnd:        matcher.Data.SourcePortEnd,
+			DestinationPortStart: matcher.Data.DestinationPortStart,
+			DestinationPortEnd:   matcher.Data.DestinationPortEnd,
+			InverseFlags:         matcher.Data.InverseFlags,
+		},
+	}
+	copy(linuxMatcher.Name[:], "udp")
+
+	var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP]byte
+	binary.Marshal(buf[:], usermem.ByteOrder, linuxMatcher)
+	return buf[:]
+}
+
 func marshalTarget(target iptables.Target) []byte {
 	switch target.(type) {
 	case iptables.UnconditionalAcceptTarget:
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
index ce4368a3d..fca457199 100644
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ b/pkg/tcpip/iptables/udp_matcher.go
@@ -24,7 +24,7 @@ import (
 )
 
 type UDPMatcher struct {
-	data UDPMatcherData
+	Data UDPMatcherData
 
 	// tablename string
 	// unsigned int matchsize;
@@ -62,11 +62,11 @@ func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error)
 		log.Warningf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
 	}
 
-	return &UDPMatcher{data: data}, nil
+	return &UDPMatcher{Data: data}, nil
 }
 
 // TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments).
-func (tm *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
 	log.Infof("UDPMatcher called from: %s", string(debug.Stack()))
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
@@ -114,12 +114,12 @@ func (tm *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str
 	destinationPort := udpHeader.DestinationPort()
 	log.Infof("UDPMatcher: sport and dport are %d and %d. sports and dport start and end are (%d, %d) and (%d, %d)",
 		udpHeader.SourcePort(), udpHeader.DestinationPort(),
-		tm.data.SourcePortStart, tm.data.SourcePortEnd,
-		tm.data.DestinationPortStart, tm.data.DestinationPortEnd)
-	if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort {
+		um.Data.SourcePortStart, um.Data.SourcePortEnd,
+		um.Data.DestinationPortStart, um.Data.DestinationPortEnd)
+	if sourcePort < um.Data.SourcePortStart || um.Data.SourcePortEnd < sourcePort {
 		return false, false
 	}
-	if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort {
+	if destinationPort < um.Data.DestinationPortStart || um.Data.DestinationPortEnd < destinationPort {
 		return false, false
 	}
 
-- 
cgit v1.2.3


From 2296b4734462b6eeef383ea58e2b1b0b1a214d76 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 21 Jan 2020 16:16:51 -0800
Subject: Change to standard types.

PiperOrigin-RevId: 290846481
---
 test/syscalls/linux/aio.cc                         |  2 +-
 test/syscalls/linux/chown.cc                       |  6 +-
 test/syscalls/linux/chroot.cc                      |  2 +-
 test/syscalls/linux/clock_gettime.cc               | 12 ++--
 test/syscalls/linux/eventfd.cc                     | 22 ++++----
 test/syscalls/linux/exceptions.cc                  | 66 +++++++++++-----------
 test/syscalls/linux/exec.cc                        | 10 ++--
 test/syscalls/linux/exec_binary.cc                 | 18 +++---
 test/syscalls/linux/fcntl.cc                       | 22 ++++----
 test/syscalls/linux/fork.cc                        |  2 +-
 test/syscalls/linux/futex.cc                       |  2 +-
 test/syscalls/linux/inotify.cc                     | 24 ++++----
 test/syscalls/linux/ip_socket_test_util.cc         |  4 +-
 test/syscalls/linux/ip_socket_test_util.h          |  4 +-
 test/syscalls/linux/itimer.cc                      |  4 +-
 test/syscalls/linux/kill.cc                        |  4 +-
 test/syscalls/linux/link.cc                        |  5 +-
 test/syscalls/linux/memfd.cc                       |  2 +-
 test/syscalls/linux/memory_accounting.cc           | 14 ++---
 test/syscalls/linux/mempolicy.cc                   | 28 ++++-----
 test/syscalls/linux/mmap.cc                        | 16 +++---
 test/syscalls/linux/open.cc                        |  2 +-
 test/syscalls/linux/partial_bad_buffer.cc          |  2 +-
 test/syscalls/linux/prctl_setuid.cc                |  2 +-
 test/syscalls/linux/proc.cc                        | 42 +++++++-------
 test/syscalls/linux/proc_net_tcp.cc                | 62 ++++++++++----------
 test/syscalls/linux/proc_net_udp.cc                | 32 +++++------
 test/syscalls/linux/proc_net_unix.cc               | 12 ++--
 test/syscalls/linux/proc_pid_uid_gid_map.cc        | 26 ++++-----
 test/syscalls/linux/ptrace.cc                      |  4 +-
 test/syscalls/linux/pty.cc                         | 14 ++---
 test/syscalls/linux/pwrite64.cc                    |  4 +-
 test/syscalls/linux/raw_socket_hdrincl.cc          |  4 +-
 test/syscalls/linux/rseq.cc                        |  2 +-
 test/syscalls/linux/rseq/critical.h                |  2 +-
 test/syscalls/linux/rseq/rseq.cc                   | 50 ++++++++--------
 test/syscalls/linux/rseq/types.h                   | 16 +++---
 test/syscalls/linux/seccomp.cc                     | 14 ++---
 test/syscalls/linux/semaphore.cc                   | 10 ++--
 test/syscalls/linux/shm.cc                         | 10 ++--
 test/syscalls/linux/sigaltstack.cc                 |  2 +-
 test/syscalls/linux/sigiret.cc                     | 14 ++---
 .../linux/socket_bind_to_device_distribution.cc    | 14 ++---
 test/syscalls/linux/socket_generic.cc              |  2 +-
 test/syscalls/linux/socket_inet_loopback.cc        | 56 +++++++++---------
 test/syscalls/linux/socket_ip_unbound.cc           |  8 +--
 test/syscalls/linux/socket_netdevice.cc            |  8 +--
 test/syscalls/linux/socket_netlink_route.cc        | 30 +++++-----
 test/syscalls/linux/socket_netlink_util.cc         |  4 +-
 test/syscalls/linux/socket_netlink_util.h          |  2 +-
 test/syscalls/linux/socket_test_util.cc            |  2 +-
 test/syscalls/linux/splice.cc                      |  2 +-
 test/syscalls/linux/stat.cc                        | 40 ++++++-------
 test/syscalls/linux/sticky.cc                      |  4 +-
 test/syscalls/linux/sysret.cc                      |  8 +--
 test/syscalls/linux/tcp_socket.cc                  |  2 +-
 test/syscalls/linux/time.cc                        |  4 +-
 test/syscalls/linux/timerfd.cc                     | 48 ++++++++--------
 test/syscalls/linux/udp_socket_test_cases.cc       | 14 ++---
 test/syscalls/linux/uidgid.cc                      |  8 +--
 test/syscalls/linux/utimes.cc                      | 25 ++++----
 test/syscalls/linux/vfork.cc                       | 14 ++---
 test/syscalls/linux/vsyscall.cc                    |  2 +-
 test/syscalls/linux/wait.cc                        | 18 +++---
 test/util/mount_util.h                             |  6 +-
 test/util/multiprocess_util.cc                     |  2 +-
 test/util/multiprocess_util.h                      |  5 +-
 test/util/proc_util.cc                             |  2 +-
 test/util/temp_path.cc                             |  2 +-
 test/util/test_util.cc                             | 20 +++----
 test/util/test_util.h                              | 12 ++--
 test/util/test_util_test.cc                        |  4 +-
 72 files changed, 483 insertions(+), 480 deletions(-)

diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc
index 28592bc8f..a33daff17 100644
--- a/test/syscalls/linux/aio.cc
+++ b/test/syscalls/linux/aio.cc
@@ -183,7 +183,7 @@ TEST_F(AIOTest, BadWrite) {
 
   // Verify that it fails with the right error code.
   EXPECT_EQ(events[0].data, 0x123);
-  EXPECT_EQ(events[0].obj, reinterpret_cast<uint64>(&cb));
+  EXPECT_EQ(events[0].obj, reinterpret_cast<uint64_t>(&cb));
   EXPECT_LT(events[0].res, 0);
 }
 
diff --git a/test/syscalls/linux/chown.cc b/test/syscalls/linux/chown.cc
index 1c00e2731..7a28b674d 100644
--- a/test/syscalls/linux/chown.cc
+++ b/test/syscalls/linux/chown.cc
@@ -31,9 +31,9 @@
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
-ABSL_FLAG(int32, scratch_uid1, 65534, "first scratch UID");
-ABSL_FLAG(int32, scratch_uid2, 65533, "second scratch UID");
-ABSL_FLAG(int32, scratch_gid, 65534, "first scratch GID");
+ABSL_FLAG(int32_t, scratch_uid1, 65534, "first scratch UID");
+ABSL_FLAG(int32_t, scratch_uid2, 65533, "second scratch UID");
+ABSL_FLAG(int32_t, scratch_gid, 65534, "first scratch GID");
 
 namespace gvisor {
 namespace testing {
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index 27e057086..04bc2d7b9 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -253,7 +253,7 @@ TEST(ChrootTest, ProcMemSelfMapsNoEscapeProcOpen) {
   // Mmap the newly created file.
   void* foo_map = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
                        foo.get(), 0);
-  ASSERT_THAT(reinterpret_cast<int64>(foo_map), SyscallSucceeds());
+  ASSERT_THAT(reinterpret_cast<int64_t>(foo_map), SyscallSucceeds());
 
   // Always unmap.
   auto cleanup_map = Cleanup(
diff --git a/test/syscalls/linux/clock_gettime.cc b/test/syscalls/linux/clock_gettime.cc
index 1d5b5af94..7f6015049 100644
--- a/test/syscalls/linux/clock_gettime.cc
+++ b/test/syscalls/linux/clock_gettime.cc
@@ -34,7 +34,7 @@ namespace testing {
 
 namespace {
 
-int64 clock_gettime_nsecs(clockid_t id) {
+int64_t clock_gettime_nsecs(clockid_t id) {
   struct timespec ts;
   TEST_PCHECK(clock_gettime(id, &ts) == 0);
   return (ts.tv_sec * 1000000000 + ts.tv_nsec);
@@ -42,9 +42,9 @@ int64 clock_gettime_nsecs(clockid_t id) {
 
 // Spin on the CPU for at least ns nanoseconds, based on
 // CLOCK_THREAD_CPUTIME_ID.
-void spin_ns(int64 ns) {
-  int64 start = clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID);
-  int64 end = start + ns;
+void spin_ns(int64_t ns) {
+  int64_t start = clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID);
+  int64_t end = start + ns;
 
   do {
     constexpr int kLoopCount = 1000000;  // large and arbitrary
@@ -64,7 +64,7 @@ TEST(ClockGettime, CputimeId) {
   // the workers. Note that we test CLOCK_PROCESS_CPUTIME_ID by having the
   // workers execute in parallel and verifying that CLOCK_PROCESS_CPUTIME_ID
   // accumulates the runtime of all threads.
-  int64 start = clock_gettime_nsecs(CLOCK_PROCESS_CPUTIME_ID);
+  int64_t start = clock_gettime_nsecs(CLOCK_PROCESS_CPUTIME_ID);
 
   // Create a kNumThreads threads.
   std::list<ScopedThread> threads;
@@ -76,7 +76,7 @@ TEST(ClockGettime, CputimeId) {
     t.Join();
   }
 
-  int64 end = clock_gettime_nsecs(CLOCK_PROCESS_CPUTIME_ID);
+  int64_t end = clock_gettime_nsecs(CLOCK_PROCESS_CPUTIME_ID);
 
   // The aggregate time spent in the worker threads must be at least
   // 'kNumThreads' times the time each thread spun.
diff --git a/test/syscalls/linux/eventfd.cc b/test/syscalls/linux/eventfd.cc
index fed67a56e..367682c3d 100644
--- a/test/syscalls/linux/eventfd.cc
+++ b/test/syscalls/linux/eventfd.cc
@@ -37,7 +37,7 @@ TEST(EventfdTest, Nonblock) {
   FileDescriptor efd =
       ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
 
-  uint64 l;
+  uint64_t l;
   ASSERT_THAT(read(efd.get(), &l, sizeof(l)), SyscallFailsWithErrno(EAGAIN));
 
   l = 1;
@@ -52,7 +52,7 @@ TEST(EventfdTest, Nonblock) {
 
 void* read_three_times(void* arg) {
   int efd = *reinterpret_cast<int*>(arg);
-  uint64 l;
+  uint64_t l;
   EXPECT_THAT(read(efd, &l, sizeof(l)), SyscallSucceedsWithValue(sizeof(l)));
   EXPECT_THAT(read(efd, &l, sizeof(l)), SyscallSucceedsWithValue(sizeof(l)));
   EXPECT_THAT(read(efd, &l, sizeof(l)), SyscallSucceedsWithValue(sizeof(l)));
@@ -68,7 +68,7 @@ TEST(EventfdTest, BlockingWrite) {
                              reinterpret_cast<void*>(&efd)),
               SyscallSucceeds());
 
-  uint64 l = 1;
+  uint64_t l = 1;
   ASSERT_THAT(write(efd, &l, sizeof(l)), SyscallSucceeds());
   EXPECT_EQ(l, 1);
 
@@ -85,7 +85,7 @@ TEST(EventfdTest, SmallWrite) {
   FileDescriptor efd =
       ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
 
-  uint64 l = 16;
+  uint64_t l = 16;
   ASSERT_THAT(write(efd.get(), &l, 4), SyscallFailsWithErrno(EINVAL));
 }
 
@@ -93,7 +93,7 @@ TEST(EventfdTest, SmallRead) {
   FileDescriptor efd =
       ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
 
-  uint64 l = 1;
+  uint64_t l = 1;
   ASSERT_THAT(write(efd.get(), &l, sizeof(l)), SyscallSucceeds());
 
   l = 0;
@@ -104,7 +104,7 @@ TEST(EventfdTest, BigWrite) {
   FileDescriptor efd =
       ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
 
-  uint64 big[16];
+  uint64_t big[16];
   big[0] = 16;
   ASSERT_THAT(write(efd.get(), big, sizeof(big)), SyscallSucceeds());
 }
@@ -113,10 +113,10 @@ TEST(EventfdTest, BigRead) {
   FileDescriptor efd =
       ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
 
-  uint64 l = 1;
+  uint64_t l = 1;
   ASSERT_THAT(write(efd.get(), &l, sizeof(l)), SyscallSucceeds());
 
-  uint64 big[16];
+  uint64_t big[16];
   ASSERT_THAT(read(efd.get(), big, sizeof(big)), SyscallSucceeds());
   EXPECT_EQ(big[0], 1);
 }
@@ -125,7 +125,7 @@ TEST(EventfdTest, BigWriteBigRead) {
   FileDescriptor efd =
       ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
 
-  uint64 l[16];
+  uint64_t l[16];
   l[0] = 16;
   ASSERT_THAT(write(efd.get(), l, sizeof(l)), SyscallSucceeds());
   ASSERT_THAT(read(efd.get(), l, sizeof(l)), SyscallSucceeds());
@@ -150,7 +150,7 @@ TEST(EventfdTest, NotifyNonZero_NoRandomSave) {
   int wait_out = epoll_wait(epollfd.get(), &out_ev, 1, kEpollTimeoutMs);
   EXPECT_EQ(wait_out, 1);
   EXPECT_EQ(efd.get(), out_ev.data.fd);
-  uint64 val = 0;
+  uint64_t val = 0;
   ASSERT_THAT(read(efd.get(), &val, sizeof(val)), SyscallSucceeds());
   EXPECT_EQ(val, 1);
 
@@ -159,7 +159,7 @@ TEST(EventfdTest, NotifyNonZero_NoRandomSave) {
   // epoll_wait times out.
   ScopedThread t([&efd] {
     sleep(5);
-    uint64 val = 1;
+    uint64_t val = 1;
     EXPECT_THAT(write(efd.get(), &val, sizeof(val)),
                 SyscallSucceedsWithValue(sizeof(val)));
   });
diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc
index 0b67eb0ad..3d564e720 100644
--- a/test/syscalls/linux/exceptions.cc
+++ b/test/syscalls/linux/exceptions.cc
@@ -24,20 +24,20 @@ namespace testing {
 
 // Default value for the x87 FPU control word. See Intel SDM Vol 1, Ch 8.1.5
 // "x87 FPU Control Word".
-constexpr uint16 kX87ControlWordDefault = 0x37f;
+constexpr uint16_t kX87ControlWordDefault = 0x37f;
 
 // Mask for the divide-by-zero exception.
-constexpr uint16 kX87ControlWordDiv0Mask = 1 << 2;
+constexpr uint16_t kX87ControlWordDiv0Mask = 1 << 2;
 
 // Default value for the SSE control register (MXCSR). See Intel SDM Vol 1, Ch
 // 11.6.4 "Initialization of SSE/SSE3 Extensions".
-constexpr uint32 kMXCSRDefault = 0x1f80;
+constexpr uint32_t kMXCSRDefault = 0x1f80;
 
 // Mask for the divide-by-zero exception.
-constexpr uint32 kMXCSRDiv0Mask = 1 << 9;
+constexpr uint32_t kMXCSRDiv0Mask = 1 << 9;
 
 // Flag for a pending divide-by-zero exception.
-constexpr uint32 kMXCSRDiv0Flag = 1 << 2;
+constexpr uint32_t kMXCSRDiv0Flag = 1 << 2;
 
 void inline Halt() { asm("hlt\r\n"); }
 
@@ -112,10 +112,10 @@ TEST(ExceptionTest, DivideByZero) {
 
   EXPECT_EXIT(
       {
-        uint32 remainder;
-        uint32 quotient;
-        uint32 divisor = 0;
-        uint64 value = 1;
+        uint32_t remainder;
+        uint32_t quotient;
+        uint32_t divisor = 0;
+        uint64_t value = 1;
         asm("divl 0(%2)\r\n"
             : "=d"(remainder), "=a"(quotient)
             : "r"(&divisor), "d"(value >> 32), "a"(value));
@@ -126,9 +126,9 @@ TEST(ExceptionTest, DivideByZero) {
 
 // By default, x87 exceptions are masked and simply return a default value.
 TEST(ExceptionTest, X87DivideByZeroMasked) {
-  int32 quotient;
-  int32 value = 1;
-  int32 divisor = 0;
+  int32_t quotient;
+  int32_t value = 1;
+  int32_t divisor = 0;
   asm("fildl %[value]\r\n"
       "fidivl %[divisor]\r\n"
       "fistpl %[quotient]\r\n"
@@ -148,12 +148,12 @@ TEST(ExceptionTest, X87DivideByZeroUnmasked) {
   EXPECT_EXIT(
       {
         // Clear the divide by zero exception mask.
-        constexpr uint16 kControlWord =
+        constexpr uint16_t kControlWord =
             kX87ControlWordDefault & ~kX87ControlWordDiv0Mask;
 
-        int32 quotient;
-        int32 value = 1;
-        int32 divisor = 0;
+        int32_t quotient;
+        int32_t value = 1;
+        int32_t divisor = 0;
         asm volatile(
             "fldcw %[cw]\r\n"
             "fildl %[value]\r\n"
@@ -176,12 +176,12 @@ TEST(ExceptionTest, X87StatusClobber) {
   EXPECT_EXIT(
       {
         // Clear the divide by zero exception mask.
-        constexpr uint16 kControlWord =
+        constexpr uint16_t kControlWord =
             kX87ControlWordDefault & ~kX87ControlWordDiv0Mask;
 
-        int32 quotient;
-        int32 value = 1;
-        int32 divisor = 0;
+        int32_t quotient;
+        int32_t value = 1;
+        int32_t divisor = 0;
         asm volatile(
             "fildl %[value]\r\n"
             "fidivl %[divisor]\r\n"
@@ -208,10 +208,10 @@ TEST(ExceptionTest, X87StatusClobber) {
 
 // By default, SSE exceptions are masked and simply return a default value.
 TEST(ExceptionTest, SSEDivideByZeroMasked) {
-  uint32 status;
-  int32 quotient;
-  int32 value = 1;
-  int32 divisor = 0;
+  uint32_t status;
+  int32_t quotient;
+  int32_t value = 1;
+  int32_t divisor = 0;
   asm("cvtsi2ssl %[value], %%xmm0\r\n"
       "cvtsi2ssl %[divisor], %%xmm1\r\n"
       "divss %%xmm1, %%xmm0\r\n"
@@ -233,11 +233,11 @@ TEST(ExceptionTest, SSEDivideByZeroUnmasked) {
   EXPECT_EXIT(
       {
         // Clear the divide by zero exception mask.
-        constexpr uint32 kMXCSR = kMXCSRDefault & ~kMXCSRDiv0Mask;
+        constexpr uint32_t kMXCSR = kMXCSRDefault & ~kMXCSRDiv0Mask;
 
-        int32 quotient;
-        int32 value = 1;
-        int32 divisor = 0;
+        int32_t quotient;
+        int32_t value = 1;
+        int32_t divisor = 0;
         asm volatile(
             "ldmxcsr %[mxcsr]\r\n"
             "cvtsi2ssl %[value], %%xmm0\r\n"
@@ -254,10 +254,10 @@ TEST(ExceptionTest, SSEDivideByZeroUnmasked) {
 
 // Pending exceptions in the SSE status register are not clobbered by syscalls.
 TEST(ExceptionTest, SSEStatusClobber) {
-  uint32 mxcsr;
-  int32 quotient;
-  int32 value = 1;
-  int32 divisor = 0;
+  uint32_t mxcsr;
+  int32_t quotient;
+  int32_t value = 1;
+  int32_t divisor = 0;
   asm("cvtsi2ssl %[value], %%xmm0\r\n"
       "cvtsi2ssl %[divisor], %%xmm1\r\n"
       "divss %%xmm1, %%xmm0\r\n"
@@ -336,7 +336,7 @@ TEST(ExceptionTest, AlignmentCheck) {
         SetAlignmentCheck();
         for (int i = 0; i < 8; i++) {
           // At least 7/8 offsets will be unaligned here.
-          uint64* ptr = reinterpret_cast<uint64*>(&array[i]);
+          uint64_t* ptr = reinterpret_cast<uint64_t*>(&array[i]);
           asm("mov %0, 0(%0)\r\n" : : "r"(ptr) : "ax");
         }
       },
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 9c5a11206..b5e0a512b 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -62,7 +62,7 @@ constexpr char kExecFromThread[] = "--exec_exec_from_thread";
 
 // Runs file specified by dirfd and pathname with argv and checks that the exit
 // status is expect_status and that stderr contains expect_stderr.
-void CheckExecHelper(const absl::optional<int32> dirfd,
+void CheckExecHelper(const absl::optional<int32_t> dirfd,
                      const std::string& pathname, const ExecveArray& argv,
                      const ExecveArray& envv, const int flags,
                      int expect_status, const std::string& expect_stderr) {
@@ -143,15 +143,15 @@ void CheckExecHelper(const absl::optional<int32> dirfd,
 void CheckExec(const std::string& filename, const ExecveArray& argv,
                const ExecveArray& envv, int expect_status,
                const std::string& expect_stderr) {
-  CheckExecHelper(/*dirfd=*/absl::optional<int32>(), filename, argv, envv,
+  CheckExecHelper(/*dirfd=*/absl::optional<int32_t>(), filename, argv, envv,
                   /*flags=*/0, expect_status, expect_stderr);
 }
 
-void CheckExecveat(const int32 dirfd, const std::string& pathname,
+void CheckExecveat(const int32_t dirfd, const std::string& pathname,
                    const ExecveArray& argv, const ExecveArray& envv,
                    const int flags, int expect_status,
                    const std::string& expect_stderr) {
-  CheckExecHelper(absl::optional<int32>(dirfd), pathname, argv, envv, flags,
+  CheckExecHelper(absl::optional<int32_t>(dirfd), pathname, argv, envv, flags,
                   expect_status, expect_stderr);
 }
 
@@ -603,7 +603,7 @@ TEST(ExecveatTest, AbsolutePathWithFDCWD) {
 TEST(ExecveatTest, AbsolutePath) {
   std::string path = RunfilePath(kBasicWorkload);
   // File descriptor should be ignored when an absolute path is given.
-  const int32 badFD = -1;
+  const int32_t badFD = -1;
   CheckExecveat(badFD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0,
                 absl::StrCat(path, "\n"));
 }
diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc
index 144bf45cf..736452b0c 100644
--- a/test/syscalls/linux/exec_binary.cc
+++ b/test/syscalls/linux/exec_binary.cc
@@ -700,7 +700,7 @@ TEST(ElfTest, PIE) {
 
   // The first segment really needs to start at 0 for a normal PIE binary, and
   // thus includes the headers.
-  const uint64 offset = elf.phdrs[1].p_offset;
+  const uint64_t offset = elf.phdrs[1].p_offset;
   elf.phdrs[1].p_offset = 0x0;
   elf.phdrs[1].p_vaddr = 0x0;
   elf.phdrs[1].p_filesz += offset;
@@ -720,7 +720,7 @@ TEST(ElfTest, PIE) {
   struct user_regs_struct regs;
   ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
 
-  const uint64 load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t load_addr = regs.rip & ~(kPageSize - 1);
 
   EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
                          // text page.
@@ -789,7 +789,7 @@ TEST(ElfTest, PIENonZeroStart) {
   struct user_regs_struct regs;
   ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
 
-  const uint64 load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t load_addr = regs.rip & ~(kPageSize - 1);
 
   // The ELF is loaded at an arbitrary address, not the first PT_LOAD vaddr.
   //
@@ -859,7 +859,7 @@ TEST(ElfTest, ELFInterpreter) {
 
   // The first segment really needs to start at 0 for a normal PIE binary, and
   // thus includes the headers.
-  uint64 const offset = interpreter.phdrs[1].p_offset;
+  uint64_t const offset = interpreter.phdrs[1].p_offset;
   // N.B. Since Linux 4.10 (0036d1f7eb95b "binfmt_elf: fix calculations for bss
   // padding"), Linux unconditionally zeroes the remainder of the highest mapped
   // page in an interpreter, failing if the protections don't allow write. Thus
@@ -912,7 +912,7 @@ TEST(ElfTest, ELFInterpreter) {
   struct user_regs_struct regs;
   ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
 
-  const uint64 interp_load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t interp_load_addr = regs.rip & ~(kPageSize - 1);
 
   EXPECT_THAT(
       child, ContainsMappings(std::vector<ProcMapsEntry>({
@@ -1047,7 +1047,7 @@ TEST(ElfTest, ELFInterpreterRelative) {
 
   // The first segment really needs to start at 0 for a normal PIE binary, and
   // thus includes the headers.
-  uint64 const offset = interpreter.phdrs[1].p_offset;
+  uint64_t const offset = interpreter.phdrs[1].p_offset;
   // See comment in ElfTest.ELFInterpreter.
   interpreter.phdrs[1].p_flags = PF_R | PF_W | PF_X;
   interpreter.phdrs[1].p_offset = 0x0;
@@ -1086,7 +1086,7 @@ TEST(ElfTest, ELFInterpreterRelative) {
   struct user_regs_struct regs;
   ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
 
-  const uint64 interp_load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t interp_load_addr = regs.rip & ~(kPageSize - 1);
 
   EXPECT_THAT(
       child, ContainsMappings(std::vector<ProcMapsEntry>({
@@ -1109,7 +1109,7 @@ TEST(ElfTest, ELFInterpreterWrongArch) {
 
   // The first segment really needs to start at 0 for a normal PIE binary, and
   // thus includes the headers.
-  uint64 const offset = interpreter.phdrs[1].p_offset;
+  uint64_t const offset = interpreter.phdrs[1].p_offset;
   // See comment in ElfTest.ELFInterpreter.
   interpreter.phdrs[1].p_flags = PF_R | PF_W | PF_X;
   interpreter.phdrs[1].p_offset = 0x0;
@@ -1190,7 +1190,7 @@ TEST(ElfTest, ElfInterpreterNoExecute) {
 
   // The first segment really needs to start at 0 for a normal PIE binary, and
   // thus includes the headers.
-  uint64 const offset = interpreter.phdrs[1].p_offset;
+  uint64_t const offset = interpreter.phdrs[1].p_offset;
   // See comment in ElfTest.ELFInterpreter.
   interpreter.phdrs[1].p_flags = PF_R | PF_W | PF_X;
   interpreter.phdrs[1].p_offset = 0x0;
diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 6eb597eae..4f3aa81d6 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -46,9 +46,9 @@ ABSL_FLAG(bool, blocking, false,
           "Whether to set a blocking lock (otherwise non-blocking).");
 ABSL_FLAG(bool, retry_eintr, false,
           "Whether to retry in the subprocess on EINTR.");
-ABSL_FLAG(uint64, child_setlock_start, 0, "The value of struct flock start");
-ABSL_FLAG(uint64, child_setlock_len, 0, "The value of struct flock len");
-ABSL_FLAG(int32, socket_fd, -1,
+ABSL_FLAG(uint64_t, child_setlock_start, 0, "The value of struct flock start");
+ABSL_FLAG(uint64_t, child_setlock_len, 0, "The value of struct flock len");
+ABSL_FLAG(int32_t, socket_fd, -1,
           "A socket to use for communicating more state back "
           "to the parent.");
 
@@ -71,8 +71,8 @@ class FcntlLockTest : public ::testing::Test {
     EXPECT_THAT(close(fds_[1]), SyscallSucceeds());
   }
 
-  int64 GetSubprocessFcntlTimeInUsec() {
-    int64 ret = 0;
+  int64_t GetSubprocessFcntlTimeInUsec() {
+    int64_t ret = 0;
     EXPECT_THAT(ReadFd(fds_[0], reinterpret_cast<void*>(&ret), sizeof(ret)),
                 SyscallSucceedsWithValue(sizeof(ret)));
     return ret;
@@ -676,7 +676,7 @@ TEST_F(FcntlLockTest, SetWriteLockThenBlockingWriteLock) {
   // We will wait kHoldLockForSec before we release our lock allowing the
   // subprocess to obtain it.
   constexpr absl::Duration kHoldLockFor = absl::Seconds(5);
-  const int64 kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
+  const int64_t kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
 
   absl::SleepFor(kHoldLockFor);
 
@@ -685,7 +685,7 @@ TEST_F(FcntlLockTest, SetWriteLockThenBlockingWriteLock) {
   ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
 
   // Read the blocked time from the subprocess socket.
-  int64 subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
+  int64_t subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
 
   // We must have been waiting at least kMinBlockTime.
   EXPECT_GT(subprocess_blocked_time_usec, kMinBlockTimeUsec);
@@ -729,7 +729,7 @@ TEST_F(FcntlLockTest, SetReadLockThenBlockingWriteLock) {
   // subprocess to obtain it.
   constexpr absl::Duration kHoldLockFor = absl::Seconds(5);
 
-  const int64 kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
+  const int64_t kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
 
   absl::SleepFor(kHoldLockFor);
 
@@ -738,7 +738,7 @@ TEST_F(FcntlLockTest, SetReadLockThenBlockingWriteLock) {
   ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
 
   // Read the blocked time from the subprocess socket.
-  int64 subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
+  int64_t subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
 
   // We must have been waiting at least kMinBlockTime.
   EXPECT_GT(subprocess_blocked_time_usec, kMinBlockTimeUsec);
@@ -782,7 +782,7 @@ TEST_F(FcntlLockTest, SetWriteLockThenBlockingReadLock) {
   // subprocess to obtain it.
   constexpr absl::Duration kHoldLockFor = absl::Seconds(5);
 
-  const int64 kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
+  const int64_t kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
 
   absl::SleepFor(kHoldLockFor);
 
@@ -791,7 +791,7 @@ TEST_F(FcntlLockTest, SetWriteLockThenBlockingReadLock) {
   ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
 
   // Read the blocked time from the subprocess socket.
-  int64 subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
+  int64_t subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
 
   // We must have been waiting at least kMinBlockTime.
   EXPECT_GT(subprocess_blocked_time_usec, kMinBlockTimeUsec);
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index 486189697..371890110 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -270,7 +270,7 @@ TEST_F(ForkTest, Alarm) {
 
 // Child cannot affect parent private memory.
 TEST_F(ForkTest, PrivateMemory) {
-  std::atomic<uint32> local(0);
+  std::atomic<uint32_t> local(0);
 
   pid_t child1 = Fork();
   if (child1 == 0) {
diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc
index b4a7cc8d6..40c80a6e1 100644
--- a/test/syscalls/linux/futex.cc
+++ b/test/syscalls/linux/futex.cc
@@ -112,7 +112,7 @@ int futex_wake_bitset(bool priv, std::atomic<int>* uaddr, int count,
 }
 
 int futex_wake_op(bool priv, std::atomic<int>* uaddr1, std::atomic<int>* uaddr2,
-                  int nwake1, int nwake2, uint32 sub_op) {
+                  int nwake1, int nwake2, uint32_t sub_op) {
   int op = FUTEX_WAKE_OP;
   if (priv) {
     op |= FUTEX_PRIVATE_FLAG;
diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index 182d676d5..fdef646eb 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -48,26 +48,26 @@ constexpr int kBufSize = 1024;
 
 // C++-friendly version of struct inotify_event.
 struct Event {
-  int32 wd;
-  uint32 mask;
-  uint32 cookie;
-  uint32 len;
+  int32_t wd;
+  uint32_t mask;
+  uint32_t cookie;
+  uint32_t len;
   std::string name;
 
-  Event(uint32 mask, int32 wd, absl::string_view name, uint32 cookie)
+  Event(uint32_t mask, int32_t wd, absl::string_view name, uint32_t cookie)
       : wd(wd),
         mask(mask),
         cookie(cookie),
         len(name.size()),
         name(std::string(name)) {}
-  Event(uint32 mask, int32 wd, absl::string_view name)
+  Event(uint32_t mask, int32_t wd, absl::string_view name)
       : Event(mask, wd, name, 0) {}
-  Event(uint32 mask, int32 wd) : Event(mask, wd, "", 0) {}
+  Event(uint32_t mask, int32_t wd) : Event(mask, wd, "", 0) {}
   Event() : Event(0, 0, "", 0) {}
 };
 
 // Prints the symbolic name for a struct inotify_event's 'mask' field.
-std::string FlagString(uint32 flags) {
+std::string FlagString(uint32_t flags) {
   std::vector<std::string> names;
 
 #define EMIT(target)          \
@@ -320,7 +320,7 @@ PosixErrorOr<FileDescriptor> InotifyInit1(int flags) {
 }
 
 PosixErrorOr<int> InotifyAddWatch(int fd, const std::string& path,
-                                  uint32 mask) {
+                                  uint32_t mask) {
   int wd;
   EXPECT_THAT(wd = inotify_add_watch(fd, path.c_str(), mask),
               SyscallSucceeds());
@@ -647,7 +647,7 @@ TEST(Inotify, MoveGeneratesEvents) {
            Event(IN_MOVED_TO, root_wd, Basename(newpath), events[1].cookie)}));
   EXPECT_NE(events[0].cookie, 0);
   EXPECT_EQ(events[0].cookie, events[1].cookie);
-  uint32 last_cookie = events[0].cookie;
+  uint32_t last_cookie = events[0].cookie;
 
   // Test move from root -> root/dir1.
   newpath = NewTempAbsPathInDir(dir1.path());
@@ -841,7 +841,7 @@ TEST(Inotify, ConcurrentThreadsGeneratingEvents) {
   }
 
   auto test_thread = [&files]() {
-    uint32 seed = time(nullptr);
+    uint32_t seed = time(nullptr);
     for (int i = 0; i < 20; i++) {
       const TempPath& file = files[rand_r(&seed) % files.size()];
       const FileDescriptor file_fd =
@@ -960,7 +960,7 @@ TEST(Inotify, BlockingReadOnInotifyFd) {
   t.Join();
 
   // Make sure the event we got back is sane.
-  uint32 event_mask;
+  uint32_t event_mask;
   memcpy(&event_mask, buf.data() + offsetof(struct inotify_event, mask),
          sizeof(event_mask));
   EXPECT_EQ(event_mask, IN_ACCESS);
diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index f694a6360..6b472eb2f 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -24,12 +24,12 @@
 namespace gvisor {
 namespace testing {
 
-uint32 IPFromInetSockaddr(const struct sockaddr* addr) {
+uint32_t IPFromInetSockaddr(const struct sockaddr* addr) {
   auto* in_addr = reinterpret_cast<const struct sockaddr_in*>(addr);
   return in_addr->sin_addr.s_addr;
 }
 
-uint16 PortFromInetSockaddr(const struct sockaddr* addr) {
+uint16_t PortFromInetSockaddr(const struct sockaddr* addr) {
   auto* in_addr = reinterpret_cast<const struct sockaddr_in*>(addr);
   return ntohs(in_addr->sin_port);
 }
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 0eeca30dd..0f58e0f77 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -27,10 +27,10 @@ namespace gvisor {
 namespace testing {
 
 // Extracts the IP address from an inet sockaddr in network byte order.
-uint32 IPFromInetSockaddr(const struct sockaddr* addr);
+uint32_t IPFromInetSockaddr(const struct sockaddr* addr);
 
 // Extracts the port from an inet sockaddr in host byte order.
-uint16 PortFromInetSockaddr(const struct sockaddr* addr);
+uint16_t PortFromInetSockaddr(const struct sockaddr* addr);
 
 // InterfaceIndex returns the index of the named interface.
 PosixErrorOr<int> InterfaceIndex(std::string name);
diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc
index 52ffbe89d..b77e4cbd1 100644
--- a/test/syscalls/linux/itimer.cc
+++ b/test/syscalls/linux/itimer.cc
@@ -177,8 +177,8 @@ SignalTestResult ItimerSignalTest(int id, clock_t main_clock,
   SignalTestResult result;
 
   // Wait for the workers to be done and collect their sample counts.
-  result.worker_samples.push_back(reinterpret_cast<int64>(th1.Join()));
-  result.worker_samples.push_back(reinterpret_cast<int64>(th2.Join()));
+  result.worker_samples.push_back(reinterpret_cast<int64_t>(th1.Join()));
+  result.worker_samples.push_back(reinterpret_cast<int64_t>(th2.Join()));
   cleanup_itimer.Release()();
   result.expected_total = (Now(main_clock) - start) / kPeriod;
   result.main_thread_samples = signal_test_num_samples.load();
diff --git a/test/syscalls/linux/kill.cc b/test/syscalls/linux/kill.cc
index a2247fdeb..db29bd59c 100644
--- a/test/syscalls/linux/kill.cc
+++ b/test/syscalls/linux/kill.cc
@@ -32,8 +32,8 @@
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
-ABSL_FLAG(int32, scratch_uid, 65534, "scratch UID");
-ABSL_FLAG(int32, scratch_gid, 65534, "scratch GID");
+ABSL_FLAG(int32_t, scratch_uid, 65534, "scratch UID");
+ABSL_FLAG(int32_t, scratch_gid, 65534, "scratch GID");
 
 using ::testing::Ge;
 
diff --git a/test/syscalls/linux/link.cc b/test/syscalls/linux/link.cc
index 108a0c23e..e74fa2ed5 100644
--- a/test/syscalls/linux/link.cc
+++ b/test/syscalls/linux/link.cc
@@ -32,7 +32,7 @@
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
-ABSL_FLAG(int32, scratch_uid, 65534, "scratch UID");
+ABSL_FLAG(int32_t, scratch_uid, 65534, "scratch UID");
 
 namespace gvisor {
 namespace testing {
@@ -55,7 +55,8 @@ TEST(LinkTest, CanCreateLinkFile) {
   const std::string newname = NewTempAbsPath();
 
   // Get the initial link count.
-  uint64 initial_link_count = ASSERT_NO_ERRNO_AND_VALUE(Links(oldfile.path()));
+  uint64_t initial_link_count =
+      ASSERT_NO_ERRNO_AND_VALUE(Links(oldfile.path()));
 
   EXPECT_THAT(link(oldfile.path().c_str(), newname.c_str()), SyscallSucceeds());
 
diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc
index e10f250d1..e57b49a4a 100644
--- a/test/syscalls/linux/memfd.cc
+++ b/test/syscalls/linux/memfd.cc
@@ -61,7 +61,7 @@ int memfd_create(const std::string& name, unsigned int flags) {
 }
 
 PosixErrorOr<FileDescriptor> MemfdCreate(const std::string& name,
-                                         uint32 flags) {
+                                         uint32_t flags) {
   int fd = memfd_create(name, flags);
   if (fd < 0) {
     return PosixError(
diff --git a/test/syscalls/linux/memory_accounting.cc b/test/syscalls/linux/memory_accounting.cc
index 987dbd151..94aea4077 100644
--- a/test/syscalls/linux/memory_accounting.cc
+++ b/test/syscalls/linux/memory_accounting.cc
@@ -33,7 +33,7 @@ using ::absl::StrFormat;
 
 // AnonUsageFromMeminfo scrapes the current anonymous memory usage from
 // /proc/meminfo and returns it in bytes.
-PosixErrorOr<uint64> AnonUsageFromMeminfo() {
+PosixErrorOr<uint64_t> AnonUsageFromMeminfo() {
   ASSIGN_OR_RETURN_ERRNO(auto meminfo, GetContents("/proc/meminfo"));
   std::vector<std::string> lines(absl::StrSplit(meminfo, '\n'));
 
@@ -47,7 +47,7 @@ PosixErrorOr<uint64> AnonUsageFromMeminfo() {
         absl::StrSplit(line, ' ', absl::SkipEmpty()));
     if (parts.size() == 3) {
       // The size is the second field, let's try to parse it as a number.
-      ASSIGN_OR_RETURN_ERRNO(auto anon_kb, Atoi<uint64>(parts[1]));
+      ASSIGN_OR_RETURN_ERRNO(auto anon_kb, Atoi<uint64_t>(parts[1]));
       return anon_kb * 1024;
     }
 
@@ -65,10 +65,10 @@ TEST(MemoryAccounting, AnonAccountingPreservedOnSaveRestore) {
   // the test.
   SKIP_IF(!IsRunningOnGvisor());
 
-  uint64 anon_initial = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
+  uint64_t anon_initial = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
 
   // Cause some anonymous memory usage.
-  uint64 map_bytes = Megabytes(512);
+  uint64_t map_bytes = Megabytes(512);
   char* mem =
       static_cast<char*>(mmap(nullptr, map_bytes, PROT_READ | PROT_WRITE,
                               MAP_POPULATE | MAP_ANON | MAP_PRIVATE, -1, 0));
@@ -77,11 +77,11 @@ TEST(MemoryAccounting, AnonAccountingPreservedOnSaveRestore) {
 
   // Write something to each page to prevent them from being decommited on
   // S/R. Zero pages are dropped on save.
-  for (uint64 i = 0; i < map_bytes; i += kPageSize) {
+  for (uint64_t i = 0; i < map_bytes; i += kPageSize) {
     mem[i] = 'a';
   }
 
-  uint64 anon_after_alloc = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
+  uint64_t anon_after_alloc = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
   EXPECT_THAT(anon_after_alloc,
               EquivalentWithin(anon_initial + map_bytes, 0.03));
 
@@ -90,7 +90,7 @@ TEST(MemoryAccounting, AnonAccountingPreservedOnSaveRestore) {
   MaybeSave();
 
   // Usage should remain the same across S/R.
-  uint64 anon_after_sr = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
+  uint64_t anon_after_sr = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
   EXPECT_THAT(anon_after_sr, EquivalentWithin(anon_after_alloc, 0.03));
 }
 
diff --git a/test/syscalls/linux/mempolicy.cc b/test/syscalls/linux/mempolicy.cc
index 46bbbc923..9d5f47651 100644
--- a/test/syscalls/linux/mempolicy.cc
+++ b/test/syscalls/linux/mempolicy.cc
@@ -43,12 +43,12 @@ namespace {
 #define MPOL_MF_MOVE (1 << 1)
 #define MPOL_MF_MOVE_ALL (1 << 2)
 
-int get_mempolicy(int *policy, uint64 *nmask, uint64 maxnode, void *addr,
+int get_mempolicy(int *policy, uint64_t *nmask, uint64_t maxnode, void *addr,
                   int flags) {
   return syscall(SYS_get_mempolicy, policy, nmask, maxnode, addr, flags);
 }
 
-int set_mempolicy(int mode, uint64 *nmask, uint64 maxnode) {
+int set_mempolicy(int mode, uint64_t *nmask, uint64_t maxnode) {
   return syscall(SYS_set_mempolicy, mode, nmask, maxnode);
 }
 
@@ -68,8 +68,8 @@ Cleanup ScopedMempolicy() {
 
 // Temporarily change the memory policy for the calling thread within the
 // caller's scope.
-PosixErrorOr<Cleanup> ScopedSetMempolicy(int mode, uint64 *nmask,
-                                         uint64 maxnode) {
+PosixErrorOr<Cleanup> ScopedSetMempolicy(int mode, uint64_t *nmask,
+                                         uint64_t maxnode) {
   if (set_mempolicy(mode, nmask, maxnode)) {
     return PosixError(errno, "set_mempolicy");
   }
@@ -78,7 +78,7 @@ PosixErrorOr<Cleanup> ScopedSetMempolicy(int mode, uint64 *nmask,
 
 TEST(MempolicyTest, CheckDefaultPolicy) {
   int mode = 0;
-  uint64 nodemask = 0;
+  uint64_t nodemask = 0;
   ASSERT_THAT(get_mempolicy(&mode, &nodemask, sizeof(nodemask) * BITS_PER_BYTE,
                             nullptr, 0),
               SyscallSucceeds());
@@ -88,12 +88,12 @@ TEST(MempolicyTest, CheckDefaultPolicy) {
 }
 
 TEST(MempolicyTest, PolicyPreservedAfterSetMempolicy) {
-  uint64 nodemask = 0x1;
+  uint64_t nodemask = 0x1;
   auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSetMempolicy(
       MPOL_BIND, &nodemask, sizeof(nodemask) * BITS_PER_BYTE));
 
   int mode = 0;
-  uint64 nodemask_after = 0x0;
+  uint64_t nodemask_after = 0x0;
   ASSERT_THAT(get_mempolicy(&mode, &nodemask_after,
                             sizeof(nodemask_after) * BITS_PER_BYTE, nullptr, 0),
               SyscallSucceeds());
@@ -118,7 +118,7 @@ TEST(MempolicyTest, PolicyPreservedAfterSetMempolicy) {
 
 TEST(MempolicyTest, SetMempolicyRejectsInvalidInputs) {
   auto cleanup = ScopedMempolicy();
-  uint64 nodemask;
+  uint64_t nodemask;
 
   if (IsRunningOnGvisor()) {
     // Invalid nodemask, we only support a single node on gvisor.
@@ -165,7 +165,7 @@ TEST(MempolicyTest, EmptyNodemaskOnSet) {
               SyscallFailsWithErrno(EINVAL));
   EXPECT_THAT(set_mempolicy(MPOL_PREFERRED, nullptr, 1), SyscallSucceeds());
 
-  uint64 nodemask = 0x1;
+  uint64_t nodemask = 0x1;
   EXPECT_THAT(set_mempolicy(MPOL_DEFAULT, &nodemask, 0),
               SyscallFailsWithErrno(EINVAL));
   EXPECT_THAT(set_mempolicy(MPOL_BIND, &nodemask, 0),
@@ -175,7 +175,7 @@ TEST(MempolicyTest, EmptyNodemaskOnSet) {
 }
 
 TEST(MempolicyTest, QueryAvailableNodes) {
-  uint64 nodemask = 0;
+  uint64_t nodemask = 0;
   ASSERT_THAT(
       get_mempolicy(nullptr, &nodemask, sizeof(nodemask) * BITS_PER_BYTE,
                     nullptr, MPOL_F_MEMS_ALLOWED),
@@ -197,8 +197,8 @@ TEST(MempolicyTest, QueryAvailableNodes) {
 }
 
 TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) {
-  uint64 dummy_stack_address;
-  auto dummy_heap_address = absl::make_unique<uint64>();
+  uint64_t dummy_stack_address;
+  auto dummy_heap_address = absl::make_unique<uint64_t>();
   int mode;
 
   for (auto ptr : {&dummy_stack_address, dummy_heap_address.get()}) {
@@ -228,7 +228,7 @@ TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) {
 
 TEST(MempolicyTest, GetMempolicyCanOmitPointers) {
   int mode;
-  uint64 nodemask;
+  uint64_t nodemask;
 
   // Omit nodemask pointer.
   ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, nullptr, 0), SyscallSucceeds());
@@ -249,7 +249,7 @@ TEST(MempolicyTest, GetMempolicyNextInterleaveNode) {
               SyscallFailsWithErrno(EINVAL));
 
   // Set default policy for thread to MPOL_INTERLEAVE.
-  uint64 nodemask = 0x1;
+  uint64_t nodemask = 0x1;
   auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSetMempolicy(
       MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * BITS_PER_BYTE));
 
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index 9b2270c8d..1c4d9f1c7 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -50,13 +50,13 @@ namespace testing {
 
 namespace {
 
-PosixErrorOr<int64> VirtualMemorySize() {
+PosixErrorOr<int64_t> VirtualMemorySize() {
   ASSIGN_OR_RETURN_ERRNO(auto contents, GetContents("/proc/self/statm"));
   std::vector<std::string> parts = absl::StrSplit(contents, ' ');
   if (parts.empty()) {
     return PosixError(EINVAL, "Unable to parse /proc/self/statm");
   }
-  ASSIGN_OR_RETURN_ERRNO(auto pages, Atoi<int64>(parts[0]));
+  ASSIGN_OR_RETURN_ERRNO(auto pages, Atoi<int64_t>(parts[0]));
   return pages * getpagesize();
 }
 
@@ -245,7 +245,7 @@ TEST_F(MMapTest, MapDevZeroSharedFdNoPersistence) {
   // Create a second mapping via the same fd.
   void* psec_map = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
                         dev_zero.get(), 0);
-  ASSERT_THAT(reinterpret_cast<int64>(psec_map), SyscallSucceeds());
+  ASSERT_THAT(reinterpret_cast<int64_t>(psec_map), SyscallSucceeds());
 
   // Always unmap.
   auto cleanup_psec_map = Cleanup(
@@ -690,10 +690,10 @@ TEST_F(MMapTest, ExceedLimitDataPrlimitPID) {
 }
 
 TEST_F(MMapTest, NoExceedLimitAS) {
-  constexpr uint64 kAllocBytes = 200 << 20;
+  constexpr uint64_t kAllocBytes = 200 << 20;
   // Add some headroom to the AS limit in case of e.g. unexpected stack
   // expansion.
-  constexpr uint64 kExtraASBytes = kAllocBytes + (20 << 20);
+  constexpr uint64_t kExtraASBytes = kAllocBytes + (20 << 20);
   static_assert(kAllocBytes < kExtraASBytes,
                 "test depends on allocation not exceeding AS limit");
 
@@ -708,10 +708,10 @@ TEST_F(MMapTest, NoExceedLimitAS) {
 }
 
 TEST_F(MMapTest, ExceedLimitAS) {
-  constexpr uint64 kAllocBytes = 200 << 20;
+  constexpr uint64_t kAllocBytes = 200 << 20;
   // Add some headroom to the AS limit in case of e.g. unexpected stack
   // expansion.
-  constexpr uint64 kExtraASBytes = 20 << 20;
+  constexpr uint64_t kExtraASBytes = 20 << 20;
   static_assert(kAllocBytes > kExtraASBytes,
                 "test depends on allocation exceeding AS limit");
 
@@ -1469,7 +1469,7 @@ TEST_F(MMapFileTest, InternalSigBusZeroing) {
               SyscallFailsWithErrno(EFAULT));
 }
 
-// Checks that mmaps with a length of uint64(-PAGE_SIZE + 1) or greater do not
+// Checks that mmaps with a length of uint64_t(-PAGE_SIZE + 1) or greater do not
 // induce a sentry panic (due to "rounding up" to 0).
 TEST_F(MMapTest, HugeLength) {
   EXPECT_THAT(Map(0, static_cast<uint64_t>(-kPageSize + 1), PROT_NONE,
diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc
index a5e790729..267ae19f6 100644
--- a/test/syscalls/linux/open.cc
+++ b/test/syscalls/linux/open.cc
@@ -193,7 +193,7 @@ TEST_F(OpenTest, Fault) {
 
 TEST_F(OpenTest, AppendOnly) {
   // First write some data to the fresh file.
-  const int64 kBufSize = 1024;
+  const int64_t kBufSize = 1024;
   std::vector<char> buf(kBufSize, 'a');
 
   FileDescriptor fd0 = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
diff --git a/test/syscalls/linux/partial_bad_buffer.cc b/test/syscalls/linux/partial_bad_buffer.cc
index 55eb9361f..df7129acc 100644
--- a/test/syscalls/linux/partial_bad_buffer.cc
+++ b/test/syscalls/linux/partial_bad_buffer.cc
@@ -363,7 +363,7 @@ TEST_F(PartialBadBufferTest, SendMsgTCP) {
   // byte past the valid page and check that it triggers an EFAULT
   // correctly. Otherwise in gVisor the sendmsg call will just return with no
   // error with kPageSize bytes written successfully.
-  const uint32 buf_size = kPageSize + 1;
+  const uint32_t buf_size = kPageSize + 1;
   ASSERT_THAT(setsockopt(send_socket.get(), SOL_SOCKET, SO_SNDBUF, &buf_size,
                          sizeof(buf_size)),
               SyscallSucceedsWithValue(0));
diff --git a/test/syscalls/linux/prctl_setuid.cc b/test/syscalls/linux/prctl_setuid.cc
index ad39a8463..30f0d75b3 100644
--- a/test/syscalls/linux/prctl_setuid.cc
+++ b/test/syscalls/linux/prctl_setuid.cc
@@ -26,7 +26,7 @@
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
-ABSL_FLAG(int32, scratch_uid, 65534, "scratch UID");
+ABSL_FLAG(int32_t, scratch_uid, 65534, "scratch UID");
 // This flag is used to verify that after an exec PR_GET_KEEPCAPS
 // returns 0, the return code will be offset by kPrGetKeepCapsExitBase.
 ABSL_FLAG(bool, prctl_pr_get_keepcaps, false,
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 0d5899ec9..bf9bb45d3 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -463,12 +463,12 @@ std::string AnonymousMapsEntryForMapping(const Mapping& m, int prot) {
   return AnonymousMapsEntry(m.addr(), m.len(), prot);
 }
 
-PosixErrorOr<std::map<uint64, uint64>> ReadProcSelfAuxv() {
+PosixErrorOr<std::map<uint64_t, uint64_t>> ReadProcSelfAuxv() {
   std::string auxv_file;
   RETURN_IF_ERRNO(GetContents("/proc/self/auxv", &auxv_file));
   const Elf64_auxv_t* auxv_data =
       reinterpret_cast<const Elf64_auxv_t*>(auxv_file.data());
-  std::map<uint64, uint64> auxv_entries;
+  std::map<uint64_t, uint64_t> auxv_entries;
   for (int i = 0; auxv_data[i].a_type != AT_NULL; i++) {
     auto a_type = auxv_data[i].a_type;
     EXPECT_EQ(0, auxv_entries.count(a_type)) << "a_type: " << a_type;
@@ -877,7 +877,7 @@ TEST(ProcStat, Fields) {
 
     // All fields besides itime are valid base 10 numbers.
     for (size_t i = 1; i < fields.size(); i++) {
-      uint64 val;
+      uint64_t val;
       EXPECT_TRUE(absl::SimpleAtoi(fields[i], &val)) << proc_stat;
     }
   }
@@ -904,7 +904,7 @@ TEST(ProcLoadavg, Fields) {
   EXPECT_EQ(fields.size(), 6) << proc_loadvg;
 
   double val;
-  uint64 val2;
+  uint64_t val2;
   // First three fields are floating point numbers.
   EXPECT_TRUE(absl::SimpleAtod(fields[0], &val)) << proc_loadvg;
   EXPECT_TRUE(absl::SimpleAtod(fields[1], &val)) << proc_loadvg;
@@ -936,19 +936,19 @@ TEST_P(ProcPidStatTest, HasBasicFields) {
   // boot time will be very close, and the proc starttime field (which is the
   // delta of the two times) will be 0.  For that unfortunate reason, we can
   // only check that starttime >= 0, and not that it is strictly > 0.
-  uint64 starttime;
+  uint64_t starttime;
   ASSERT_TRUE(absl::SimpleAtoi(fields[21], &starttime));
   EXPECT_GE(starttime, 0);
 
-  uint64 vss;
+  uint64_t vss;
   ASSERT_TRUE(absl::SimpleAtoi(fields[22], &vss));
   EXPECT_GT(vss, 0);
 
-  uint64 rss;
+  uint64_t rss;
   ASSERT_TRUE(absl::SimpleAtoi(fields[23], &rss));
   EXPECT_GT(rss, 0);
 
-  uint64 rsslim;
+  uint64_t rsslim;
   ASSERT_TRUE(absl::SimpleAtoi(fields[24], &rsslim));
   EXPECT_GT(rsslim, 0);
 }
@@ -965,11 +965,11 @@ TEST_P(ProcPidStatmTest, HasBasicFields) {
   std::vector<std::string> fields = absl::StrSplit(proc_pid_statm, ' ');
   ASSERT_GE(fields.size(), 7);
 
-  uint64 vss;
+  uint64_t vss;
   ASSERT_TRUE(absl::SimpleAtoi(fields[0], &vss));
   EXPECT_GT(vss, 0);
 
-  uint64 rss;
+  uint64_t rss;
   ASSERT_TRUE(absl::SimpleAtoi(fields[1], &rss));
   EXPECT_GT(rss, 0);
 }
@@ -977,7 +977,7 @@ TEST_P(ProcPidStatmTest, HasBasicFields) {
 INSTANTIATE_TEST_SUITE_P(SelfAndNumericPid, ProcPidStatmTest,
                          ::testing::Values("self", absl::StrCat(getpid())));
 
-PosixErrorOr<uint64> CurrentRSS() {
+PosixErrorOr<uint64_t> CurrentRSS() {
   ASSIGN_OR_RETURN_ERRNO(auto proc_self_stat, GetContents("/proc/self/stat"));
   if (proc_self_stat.empty()) {
     return PosixError(EINVAL, "empty /proc/self/stat");
@@ -990,7 +990,7 @@ PosixErrorOr<uint64> CurrentRSS() {
         absl::StrCat("/proc/self/stat has too few fields: ", proc_self_stat));
   }
 
-  uint64 rss;
+  uint64_t rss;
   if (!absl::SimpleAtoi(fields[23], &rss)) {
     return PosixError(
         EINVAL, absl::StrCat("/proc/self/stat RSS field is not a number: ",
@@ -1002,14 +1002,14 @@ PosixErrorOr<uint64> CurrentRSS() {
 }
 
 // The size of mapping created by MapPopulateRSS.
-constexpr uint64 kMappingSize = 100 << 20;
+constexpr uint64_t kMappingSize = 100 << 20;
 
 // Tolerance on RSS comparisons to account for background thread mappings,
 // reclaimed pages, newly faulted pages, etc.
-constexpr uint64 kRSSTolerance = 5 << 20;
+constexpr uint64_t kRSSTolerance = 5 << 20;
 
 // Capture RSS before and after an anonymous mapping with passed prot.
-void MapPopulateRSS(int prot, uint64* before, uint64* after) {
+void MapPopulateRSS(int prot, uint64_t* before, uint64_t* after) {
   *before = ASSERT_NO_ERRNO_AND_VALUE(CurrentRSS());
 
   // N.B. The kernel asynchronously accumulates per-task RSS counters into the
@@ -1040,7 +1040,7 @@ void MapPopulateRSS(int prot, uint64* before, uint64* after) {
 
 // PROT_WRITE + MAP_POPULATE anonymous mappings are always committed.
 TEST(ProcSelfStat, PopulateWriteRSS) {
-  uint64 before, after;
+  uint64_t before, after;
   MapPopulateRSS(PROT_READ | PROT_WRITE, &before, &after);
 
   // Mapping is committed.
@@ -1049,7 +1049,7 @@ TEST(ProcSelfStat, PopulateWriteRSS) {
 
 // PROT_NONE + MAP_POPULATE anonymous mappings are never committed.
 TEST(ProcSelfStat, PopulateNoneRSS) {
-  uint64 before, after;
+  uint64_t before, after;
   MapPopulateRSS(PROT_NONE, &before, &after);
 
   // Mapping not committed.
@@ -1766,7 +1766,7 @@ TEST(ProcTask, VerifyTaskDirNlinks) {
 
   // Once we reach the test body, we can count on the thread count being stable
   // unless we spawn a new one.
-  uint64 initial_links = ASSERT_NO_ERRNO_AND_VALUE(Links("/proc/self/task"));
+  uint64_t initial_links = ASSERT_NO_ERRNO_AND_VALUE(Links("/proc/self/task"));
   ASSERT_GE(initial_links, 3);
 
   // For each new subtask, we should gain a new link.
@@ -1864,9 +1864,9 @@ TEST(ProcFilesystems, Bug65172365) {
 }
 
 TEST(ProcFilesystems, PresenceOfShmMaxMniAll) {
-  uint64 shmmax = 0;
-  uint64 shmall = 0;
-  uint64 shmmni = 0;
+  uint64_t shmmax = 0;
+  uint64_t shmall = 0;
+  uint64_t shmmni = 0;
   std::string proc_file;
   proc_file = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/shmmax"));
   ASSERT_FALSE(proc_file.empty());
diff --git a/test/syscalls/linux/proc_net_tcp.cc b/test/syscalls/linux/proc_net_tcp.cc
index 77183420b..5b6e3e3cd 100644
--- a/test/syscalls/linux/proc_net_tcp.cc
+++ b/test/syscalls/linux/proc_net_tcp.cc
@@ -40,15 +40,15 @@ constexpr char kProcNetTCPHeader[] =
 
 // TCPEntry represents a single entry from /proc/net/tcp.
 struct TCPEntry {
-  uint32 local_addr;
-  uint16 local_port;
+  uint32_t local_addr;
+  uint16_t local_port;
 
-  uint32 remote_addr;
-  uint16 remote_port;
+  uint32_t remote_addr;
+  uint16_t remote_port;
 
-  uint64 state;
-  uint64 uid;
-  uint64 inode;
+  uint64_t state;
+  uint64_t uid;
+  uint64_t inode;
 };
 
 // Finds the first entry in 'entries' for which 'predicate' returns true.
@@ -69,8 +69,8 @@ bool FindBy(const std::vector<TCPEntry>& entries, TCPEntry* match,
 
 bool FindByLocalAddr(const std::vector<TCPEntry>& entries, TCPEntry* match,
                      const struct sockaddr* addr) {
-  uint32 host = IPFromInetSockaddr(addr);
-  uint16 port = PortFromInetSockaddr(addr);
+  uint32_t host = IPFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy(entries, match, [host, port](const TCPEntry& e) {
     return (e.local_addr == host && e.local_port == port);
   });
@@ -78,8 +78,8 @@ bool FindByLocalAddr(const std::vector<TCPEntry>& entries, TCPEntry* match,
 
 bool FindByRemoteAddr(const std::vector<TCPEntry>& entries, TCPEntry* match,
                       const struct sockaddr* addr) {
-  uint32 host = IPFromInetSockaddr(addr);
-  uint16 port = PortFromInetSockaddr(addr);
+  uint32_t host = IPFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy(entries, match, [host, port](const TCPEntry& e) {
     return (e.remote_addr == host && e.remote_port == port);
   });
@@ -131,8 +131,8 @@ PosixErrorOr<std::vector<TCPEntry>> ProcNetTCPEntries() {
     ASSIGN_OR_RETURN_ERRNO(entry.remote_port, AtoiBase(fields[4], 16));
 
     ASSIGN_OR_RETURN_ERRNO(entry.state, AtoiBase(fields[5], 16));
-    ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64>(fields[11]));
-    ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64>(fields[13]));
+    ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64_t>(fields[11]));
+    ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64_t>(fields[13]));
 
     entries.push_back(entry);
   }
@@ -234,8 +234,8 @@ TEST(ProcNetTCP, State) {
   FileDescriptor accepted =
       ASSERT_NO_ERRNO_AND_VALUE(Accept(server->get(), nullptr, nullptr));
 
-  const uint32 accepted_local_host = IPFromInetSockaddr(&addr);
-  const uint16 accepted_local_port = PortFromInetSockaddr(&addr);
+  const uint32_t accepted_local_host = IPFromInetSockaddr(&addr);
+  const uint16_t accepted_local_port = PortFromInetSockaddr(&addr);
 
   entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries());
   TCPEntry accepted_entry;
@@ -258,14 +258,14 @@ constexpr char kProcNetTCP6Header[] =
 // TCP6Entry represents a single entry from /proc/net/tcp6.
 struct TCP6Entry {
   struct in6_addr local_addr;
-  uint16 local_port;
+  uint16_t local_port;
 
   struct in6_addr remote_addr;
-  uint16 remote_port;
+  uint16_t remote_port;
 
-  uint64 state;
-  uint64 uid;
-  uint64 inode;
+  uint64_t state;
+  uint64_t uid;
+  uint64_t inode;
 };
 
 bool IPv6AddrEqual(const struct in6_addr* a1, const struct in6_addr* a2) {
@@ -296,7 +296,7 @@ const struct in6_addr* IP6FromInetSockaddr(const struct sockaddr* addr) {
 bool FindByLocalAddr6(const std::vector<TCP6Entry>& entries, TCP6Entry* match,
                       const struct sockaddr* addr) {
   const struct in6_addr* local = IP6FromInetSockaddr(addr);
-  uint16 port = PortFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy6(entries, match, [local, port](const TCP6Entry& e) {
     return (IPv6AddrEqual(&e.local_addr, local) && e.local_port == port);
   });
@@ -305,22 +305,22 @@ bool FindByLocalAddr6(const std::vector<TCP6Entry>& entries, TCP6Entry* match,
 bool FindByRemoteAddr6(const std::vector<TCP6Entry>& entries, TCP6Entry* match,
                        const struct sockaddr* addr) {
   const struct in6_addr* remote = IP6FromInetSockaddr(addr);
-  uint16 port = PortFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy6(entries, match, [remote, port](const TCP6Entry& e) {
     return (IPv6AddrEqual(&e.remote_addr, remote) && e.remote_port == port);
   });
 }
 
 void ReadIPv6Address(std::string s, struct in6_addr* addr) {
-  uint32 a0, a1, a2, a3;
+  uint32_t a0, a1, a2, a3;
   const char* fmt = "%08X%08X%08X%08X";
   EXPECT_EQ(sscanf(s.c_str(), fmt, &a0, &a1, &a2, &a3), 4);
 
-  uint8* b = addr->s6_addr;
-  *((uint32*)&b[0]) = a0;
-  *((uint32*)&b[4]) = a1;
-  *((uint32*)&b[8]) = a2;
-  *((uint32*)&b[12]) = a3;
+  uint8_t* b = addr->s6_addr;
+  *((uint32_t*)&b[0]) = a0;
+  *((uint32_t*)&b[4]) = a1;
+  *((uint32_t*)&b[8]) = a2;
+  *((uint32_t*)&b[12]) = a3;
 }
 
 // Returns a parsed representation of /proc/net/tcp6 entries.
@@ -367,8 +367,8 @@ PosixErrorOr<std::vector<TCP6Entry>> ProcNetTCP6Entries() {
     ReadIPv6Address(fields[3], &entry.remote_addr);
     ASSIGN_OR_RETURN_ERRNO(entry.remote_port, AtoiBase(fields[4], 16));
     ASSIGN_OR_RETURN_ERRNO(entry.state, AtoiBase(fields[5], 16));
-    ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64>(fields[11]));
-    ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64>(fields[13]));
+    ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64_t>(fields[11]));
+    ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64_t>(fields[13]));
 
     entries.push_back(entry);
   }
@@ -476,7 +476,7 @@ TEST(ProcNetTCP6, State) {
       ASSERT_NO_ERRNO_AND_VALUE(Accept(server->get(), nullptr, nullptr));
 
   const struct in6_addr* local = IP6FromInetSockaddr(addr);
-  const uint16 accepted_local_port = PortFromInetSockaddr(addr);
+  const uint16_t accepted_local_port = PortFromInetSockaddr(addr);
 
   entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCP6Entries());
   TCP6Entry accepted_entry;
diff --git a/test/syscalls/linux/proc_net_udp.cc b/test/syscalls/linux/proc_net_udp.cc
index 98c1e0cf1..786b4b4af 100644
--- a/test/syscalls/linux/proc_net_udp.cc
+++ b/test/syscalls/linux/proc_net_udp.cc
@@ -40,15 +40,15 @@ constexpr char kProcNetUDPHeader[] =
 
 // UDPEntry represents a single entry from /proc/net/udp.
 struct UDPEntry {
-  uint32 local_addr;
-  uint16 local_port;
+  uint32_t local_addr;
+  uint16_t local_port;
 
-  uint32 remote_addr;
-  uint16 remote_port;
+  uint32_t remote_addr;
+  uint16_t remote_port;
 
-  uint64 state;
-  uint64 uid;
-  uint64 inode;
+  uint64_t state;
+  uint64_t uid;
+  uint64_t inode;
 };
 
 std::string DescribeFirstInetSocket(const SocketPair& sockets) {
@@ -81,8 +81,8 @@ bool FindBy(const std::vector<UDPEntry>& entries, UDPEntry* match,
 
 bool FindByLocalAddr(const std::vector<UDPEntry>& entries, UDPEntry* match,
                      const struct sockaddr* addr) {
-  uint32 host = IPFromInetSockaddr(addr);
-  uint16 port = PortFromInetSockaddr(addr);
+  uint32_t host = IPFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy(entries, match, [host, port](const UDPEntry& e) {
     return (e.local_addr == host && e.local_port == port);
   });
@@ -90,14 +90,14 @@ bool FindByLocalAddr(const std::vector<UDPEntry>& entries, UDPEntry* match,
 
 bool FindByRemoteAddr(const std::vector<UDPEntry>& entries, UDPEntry* match,
                       const struct sockaddr* addr) {
-  uint32 host = IPFromInetSockaddr(addr);
-  uint16 port = PortFromInetSockaddr(addr);
+  uint32_t host = IPFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy(entries, match, [host, port](const UDPEntry& e) {
     return (e.remote_addr == host && e.remote_port == port);
   });
 }
 
-PosixErrorOr<uint64> InodeFromSocketFD(int fd) {
+PosixErrorOr<uint64_t> InodeFromSocketFD(int fd) {
   ASSIGN_OR_RETURN_ERRNO(struct stat s, Fstat(fd));
   if (!S_ISSOCK(s.st_mode)) {
     return PosixError(EINVAL, StrFormat("FD %d is not a socket", fd));
@@ -107,7 +107,7 @@ PosixErrorOr<uint64> InodeFromSocketFD(int fd) {
 
 PosixErrorOr<bool> FindByFD(const std::vector<UDPEntry>& entries,
                             UDPEntry* match, int fd) {
-  ASSIGN_OR_RETURN_ERRNO(uint64 inode, InodeFromSocketFD(fd));
+  ASSIGN_OR_RETURN_ERRNO(uint64_t inode, InodeFromSocketFD(fd));
   return FindBy(entries, match,
                 [inode](const UDPEntry& e) { return (e.inode == inode); });
 }
@@ -158,8 +158,8 @@ PosixErrorOr<std::vector<UDPEntry>> ProcNetUDPEntries() {
     ASSIGN_OR_RETURN_ERRNO(entry.remote_port, AtoiBase(fields[4], 16));
 
     ASSIGN_OR_RETURN_ERRNO(entry.state, AtoiBase(fields[5], 16));
-    ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64>(fields[11]));
-    ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64>(fields[13]));
+    ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64_t>(fields[11]));
+    ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64_t>(fields[13]));
 
     // Linux shares internal data structures between TCP and UDP sockets. The
     // proc entries for UDP sockets share some fields with TCP sockets, but
@@ -267,7 +267,7 @@ TEST(ProcNetUDP, BoundEntry) {
   struct sockaddr addr;
   socklen_t len = sizeof(addr);
   ASSERT_THAT(getsockname(socket->get(), &addr, &len), SyscallSucceeds());
-  uint16 port = PortFromInetSockaddr(&addr);
+  uint16_t port = PortFromInetSockaddr(&addr);
 
   std::vector<UDPEntry> entries =
       ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc
index 2fe63f215..66db0acaa 100644
--- a/test/syscalls/linux/proc_net_unix.cc
+++ b/test/syscalls/linux/proc_net_unix.cc
@@ -46,12 +46,12 @@ enum {
 // UnixEntry represents a single entry from /proc/net/unix.
 struct UnixEntry {
   uintptr_t addr;
-  uint64 refs;
-  uint64 protocol;
-  uint64 flags;
-  uint64 type;
-  uint64 state;
-  uint64 inode;
+  uint64_t refs;
+  uint64_t protocol;
+  uint64_t flags;
+  uint64_t type;
+  uint64_t state;
+  uint64_t inode;
   std::string path;
 };
 
diff --git a/test/syscalls/linux/proc_pid_uid_gid_map.cc b/test/syscalls/linux/proc_pid_uid_gid_map.cc
index 8e268ebd1..748f7be58 100644
--- a/test/syscalls/linux/proc_pid_uid_gid_map.cc
+++ b/test/syscalls/linux/proc_pid_uid_gid_map.cc
@@ -117,13 +117,13 @@ void DenyPidSetgroups(pid_t pid) {
 }
 
 // Returns a valid UID/GID that isn't id.
-uint32 another_id(uint32 id) { return (id + 1) % 65535; }
+uint32_t another_id(uint32_t id) { return (id + 1) % 65535; }
 
 struct TestParam {
   std::string desc;
   int cap;
   std::function<std::string(absl::string_view)> get_map_filename;
-  std::function<uint32()> get_current_id;
+  std::function<uint32_t()> get_current_id;
 };
 
 std::string DescribeTestParam(const ::testing::TestParamInfo<TestParam>& info) {
@@ -135,17 +135,17 @@ std::vector<TestParam> UidGidMapTestParams() {
                     [](absl::string_view pid) {
                       return absl::StrCat("/proc/", pid, "/uid_map");
                     },
-                    []() -> uint32 { return getuid(); }},
+                    []() -> uint32_t { return getuid(); }},
           TestParam{"GID", CAP_SETGID,
                     [](absl::string_view pid) {
                       return absl::StrCat("/proc/", pid, "/gid_map");
                     },
-                    []() -> uint32 { return getgid(); }}};
+                    []() -> uint32_t { return getgid(); }}};
 }
 
 class ProcUidGidMapTest : public ::testing::TestWithParam<TestParam> {
  protected:
-  uint32 CurrentID() { return GetParam().get_current_id(); }
+  uint32_t CurrentID() { return GetParam().get_current_id(); }
 };
 
 class ProcSelfUidGidMapTest : public ProcUidGidMapTest {
@@ -198,7 +198,7 @@ TEST_P(ProcSelfUidGidMapTest, IsInitiallyEmpty) {
 
 TEST_P(ProcSelfUidGidMapTest, IdentityMapOwnID) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
-  uint32 id = CurrentID();
+  uint32_t id = CurrentID();
   std::string line = absl::StrCat(id, " ", id, " 1");
   EXPECT_THAT(
       InNewUserNamespaceWithMapFD([&](int fd) {
@@ -213,7 +213,7 @@ TEST_P(ProcSelfUidGidMapTest, TrailingNewlineAndNULIgnored) {
   // and an invalid (incomplete) map entry are appended to the valid entry. The
   // newline should be accepted, and everything after the NUL should be ignored.
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
-  uint32 id = CurrentID();
+  uint32_t id = CurrentID();
   std::string line = absl::StrCat(id, " ", id, " 1\n\0 4 3");
   EXPECT_THAT(
       InNewUserNamespaceWithMapFD([&](int fd) {
@@ -227,8 +227,8 @@ TEST_P(ProcSelfUidGidMapTest, TrailingNewlineAndNULIgnored) {
 
 TEST_P(ProcSelfUidGidMapTest, NonIdentityMapOwnID) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
-  uint32 id = CurrentID();
-  uint32 id2 = another_id(id);
+  uint32_t id = CurrentID();
+  uint32_t id2 = another_id(id);
   std::string line = absl::StrCat(id2, " ", id, " 1");
   EXPECT_THAT(
       InNewUserNamespaceWithMapFD([&](int fd) {
@@ -243,8 +243,8 @@ TEST_P(ProcSelfUidGidMapTest, MapOtherID) {
   // Whether or not we have CAP_SET*ID is irrelevant: the process running in the
   // new (child) user namespace won't have any capabilities in the current
   // (parent) user namespace, which is needed.
-  uint32 id = CurrentID();
-  uint32 id2 = another_id(id);
+  uint32_t id = CurrentID();
+  uint32_t id2 = another_id(id);
   std::string line = absl::StrCat(id, " ", id2, " 1");
   EXPECT_THAT(InNewUserNamespaceWithMapFD([&](int fd) {
                 DenySelfSetgroups();
@@ -270,8 +270,8 @@ TEST_P(ProcPidUidGidMapTest, MapOtherIDPrivileged) {
   std::tie(child_pid, cleanup_child) =
       ASSERT_NO_ERRNO_AND_VALUE(CreateProcessInNewUserNamespace());
 
-  uint32 id = CurrentID();
-  uint32 id2 = another_id(id);
+  uint32_t id = CurrentID();
+  uint32_t id2 = another_id(id);
   std::string line = absl::StrCat(id, " ", id2, " 1");
   DenyPidSetgroups(child_pid);
   auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenMapFile(child_pid));
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index 37dabb1ad..8f3800380 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -574,7 +574,7 @@ TEST_P(PtraceExecveTest, Execve_GetRegs_PeekUser_SIGKILL_TraceClone_TraceExit) {
 #ifdef __x86_64__
   {
     // CS should be 0x33, indicating an 64-bit binary.
-    constexpr uint64 kAMD64UserCS = 0x33;
+    constexpr uint64_t kAMD64UserCS = 0x33;
     EXPECT_THAT(ptrace(PTRACE_PEEKUSER, leader_tid,
                        offsetof(struct user_regs_struct, cs), 0),
                 SyscallSucceedsWithValue(kAMD64UserCS));
@@ -862,7 +862,7 @@ TEST(PtraceTest, Int3) {
 
 TEST(PtraceTest, Sysemu_PokeUser) {
   constexpr int kSysemuHelperFirstExitCode = 126;
-  constexpr uint64 kSysemuInjectedExitGroupReturn = 42;
+  constexpr uint64_t kSysemuInjectedExitGroupReturn = 42;
 
   pid_t const child_pid = fork();
   if (child_pid == 0) {
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
index 5020372c1..dafe64d20 100644
--- a/test/syscalls/linux/pty.cc
+++ b/test/syscalls/linux/pty.cc
@@ -109,13 +109,13 @@ constexpr bool IsControlCharacter(char c) { return c <= 31; }
 
 struct Field {
   const char* name;
-  uint64 mask;
-  uint64 value;
+  uint64_t mask;
+  uint64_t value;
 };
 
 // ParseFields returns a string representation of value, using the names in
 // fields.
-std::string ParseFields(const Field* fields, size_t len, uint64 value) {
+std::string ParseFields(const Field* fields, size_t len, uint64_t value) {
   bool first = true;
   std::string s;
   for (size_t i = 0; i < len; i++) {
@@ -1213,8 +1213,8 @@ TEST_F(PtyTest, GetWindowSize) {
 }
 
 TEST_F(PtyTest, SetSlaveWindowSize) {
-  constexpr uint16 kRows = 343;
-  constexpr uint16 kCols = 2401;
+  constexpr uint16_t kRows = 343;
+  constexpr uint16_t kCols = 2401;
   struct winsize ws = {.ws_row = kRows, .ws_col = kCols};
   ASSERT_THAT(ioctl(slave_.get(), TIOCSWINSZ, &ws), SyscallSucceeds());
 
@@ -1226,8 +1226,8 @@ TEST_F(PtyTest, SetSlaveWindowSize) {
 }
 
 TEST_F(PtyTest, SetMasterWindowSize) {
-  constexpr uint16 kRows = 343;
-  constexpr uint16 kCols = 2401;
+  constexpr uint16_t kRows = 343;
+  constexpr uint16_t kCols = 2401;
   struct winsize ws = {.ws_row = kRows, .ws_col = kCols};
   ASSERT_THAT(ioctl(master_.get(), TIOCSWINSZ, &ws), SyscallSucceeds());
 
diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc
index 18f847929..b48fe540d 100644
--- a/test/syscalls/linux/pwrite64.cc
+++ b/test/syscalls/linux/pwrite64.cc
@@ -52,7 +52,7 @@ class Pwrite64 : public ::testing::Test {
 TEST_F(Pwrite64, AppendOnly) {
   int fd;
   ASSERT_THAT(fd = open(name_.c_str(), O_APPEND | O_RDWR), SyscallSucceeds());
-  constexpr int64 kBufSize = 1024;
+  constexpr int64_t kBufSize = 1024;
   std::vector<char> buf(kBufSize);
   std::fill(buf.begin(), buf.end(), 'a');
   EXPECT_THAT(PwriteFd(fd, buf.data(), buf.size(), 0),
@@ -64,7 +64,7 @@ TEST_F(Pwrite64, AppendOnly) {
 TEST_F(Pwrite64, InvalidArgs) {
   int fd;
   ASSERT_THAT(fd = open(name_.c_str(), O_APPEND | O_RDWR), SyscallSucceeds());
-  constexpr int64 kBufSize = 1024;
+  constexpr int64_t kBufSize = 1024;
   std::vector<char> buf(kBufSize);
   std::fill(buf.begin(), buf.end(), 'a');
   EXPECT_THAT(PwriteFd(fd, buf.data(), buf.size(), -1),
diff --git a/test/syscalls/linux/raw_socket_hdrincl.cc b/test/syscalls/linux/raw_socket_hdrincl.cc
index 0c04b974e..0a27506aa 100644
--- a/test/syscalls/linux/raw_socket_hdrincl.cc
+++ b/test/syscalls/linux/raw_socket_hdrincl.cc
@@ -53,7 +53,7 @@ class RawHDRINCL : public ::testing::Test {
   // Fills in buf with an IP header, UDP header, and payload. Returns false if
   // buf_size isn't large enough to hold everything.
   bool FillPacket(char* buf, size_t buf_size, int port, const char* payload,
-                  uint16 payload_size);
+                  uint16_t payload_size);
 
   // The socket used for both reading and writing.
   int socket_;
@@ -104,7 +104,7 @@ struct iphdr RawHDRINCL::LoopbackHeader() {
 }
 
 bool RawHDRINCL::FillPacket(char* buf, size_t buf_size, int port,
-                            const char* payload, uint16 payload_size) {
+                            const char* payload, uint16_t payload_size) {
   if (buf_size < sizeof(struct iphdr) + sizeof(struct udphdr) + payload_size) {
     return false;
   }
diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc
index 9b2a76b91..106c045e3 100644
--- a/test/syscalls/linux/rseq.cc
+++ b/test/syscalls/linux/rseq.cc
@@ -43,7 +43,7 @@ namespace {
 // only be cleared by execve (or knowing the old rseq address), and glibc (based
 // on the current unmerged patches) register rseq before calling main()).
 
-int RSeq(struct rseq* rseq, uint32 rseq_len, int flags, uint32 sig) {
+int RSeq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
   return syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
 }
 
diff --git a/test/syscalls/linux/rseq/critical.h b/test/syscalls/linux/rseq/critical.h
index 238143fd0..ac987a25e 100644
--- a/test/syscalls/linux/rseq/critical.h
+++ b/test/syscalls/linux/rseq/critical.h
@@ -18,7 +18,7 @@
 #include "test/syscalls/linux/rseq/types.h"
 #include "test/syscalls/linux/rseq/uapi.h"
 
-constexpr uint32 kRseqSignature = 0x90909090;
+constexpr uint32_t kRseqSignature = 0x90909090;
 
 extern "C" {
 
diff --git a/test/syscalls/linux/rseq/rseq.cc b/test/syscalls/linux/rseq/rseq.cc
index 4fe7c5ecf..f036db26d 100644
--- a/test/syscalls/linux/rseq/rseq.cc
+++ b/test/syscalls/linux/rseq/rseq.cc
@@ -49,7 +49,7 @@ int strcmp(const char* s1, const char* s2) {
   return static_cast<int>(*p1) - static_cast<int>(*p2);
 }
 
-int sys_rseq(struct rseq* rseq, uint32 rseq_len, int flags, uint32 sig) {
+int sys_rseq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
   return raw_syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
 }
 
@@ -176,10 +176,10 @@ int TestAbort() {
   struct rseq_cs cs = {};
   cs.version = 0;
   cs.flags = 0;
-  cs.start_ip = reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.post_commit_offset = reinterpret_cast<uint64>(&rseq_loop_post_commit) -
-                          reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.abort_ip = reinterpret_cast<uint64>(&rseq_loop_abort);
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
 
   // Loops until abort. If this returns then abort occurred.
   rseq_loop(&r, &cs);
@@ -198,10 +198,10 @@ int TestAbortBefore() {
   struct rseq_cs cs = {};
   cs.version = 0;
   cs.flags = 0;
-  cs.start_ip = reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.post_commit_offset = reinterpret_cast<uint64>(&rseq_loop_post_commit) -
-                          reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.abort_ip = reinterpret_cast<uint64>(&rseq_loop_early_abort);
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_early_abort);
 
   // Loops until abort. If this returns then abort occurred.
   rseq_loop(&r, &cs);
@@ -220,10 +220,10 @@ int TestAbortSignature() {
   struct rseq_cs cs = {};
   cs.version = 0;
   cs.flags = 0;
-  cs.start_ip = reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.post_commit_offset = reinterpret_cast<uint64>(&rseq_loop_post_commit) -
-                          reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.abort_ip = reinterpret_cast<uint64>(&rseq_loop_abort);
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
 
   // Loops until abort. This should SIGSEGV on abort.
   rseq_loop(&r, &cs);
@@ -242,10 +242,10 @@ int TestAbortPreCommit() {
   struct rseq_cs cs = {};
   cs.version = 0;
   cs.flags = 0;
-  cs.start_ip = reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.post_commit_offset = reinterpret_cast<uint64>(&rseq_loop_post_commit) -
-                          reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.abort_ip = reinterpret_cast<uint64>(&rseq_loop_pre_commit);
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_pre_commit);
 
   // Loops until abort. This should SIGSEGV on abort.
   rseq_loop(&r, &cs);
@@ -264,10 +264,10 @@ int TestAbortClearsCS() {
   struct rseq_cs cs = {};
   cs.version = 0;
   cs.flags = 0;
-  cs.start_ip = reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.post_commit_offset = reinterpret_cast<uint64>(&rseq_loop_post_commit) -
-                          reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.abort_ip = reinterpret_cast<uint64>(&rseq_loop_abort);
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
 
   // Loops until abort. If this returns then abort occurred.
   rseq_loop(&r, &cs);
@@ -290,10 +290,10 @@ int TestInvalidAbortClearsCS() {
   struct rseq_cs cs = {};
   cs.version = 0;
   cs.flags = 0;
-  cs.start_ip = reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.post_commit_offset = reinterpret_cast<uint64>(&rseq_loop_post_commit) -
-                          reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.abort_ip = reinterpret_cast<uint64>(&rseq_loop_abort);
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
 
   __atomic_store_n(&r.rseq_cs, &cs, __ATOMIC_RELAXED);
 
diff --git a/test/syscalls/linux/rseq/types.h b/test/syscalls/linux/rseq/types.h
index 7f1e0c5c2..b6afe9817 100644
--- a/test/syscalls/linux/rseq/types.h
+++ b/test/syscalls/linux/rseq/types.h
@@ -18,14 +18,14 @@
 using size_t = __SIZE_TYPE__;
 using uintptr_t = __UINTPTR_TYPE__;
 
-using uint8 = __UINT8_TYPE__;
-using uint16 = __UINT16_TYPE__;
-using uint32 = __UINT32_TYPE__;
-using uint64 = __UINT64_TYPE__;
+using uint8_t = __UINT8_TYPE__;
+using uint16_t = __UINT16_TYPE__;
+using uint32_t = __UINT32_TYPE__;
+using uint64_t = __UINT64_TYPE__;
 
-using int8 = __INT8_TYPE__;
-using int16 = __INT16_TYPE__;
-using int32 = __INT32_TYPE__;
-using int64 = __INT64_TYPE__;
+using int8_t = __INT8_TYPE__;
+using int16_t = __INT16_TYPE__;
+using int32_t = __INT32_TYPE__;
+using int64_t = __INT64_TYPE__;
 
 #endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 7a2c1191a..7e41fe7d8 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -49,12 +49,12 @@ namespace testing {
 namespace {
 
 // A syscall not implemented by Linux that we don't expect to be called.
-constexpr uint32 kFilteredSyscall = SYS_vserver;
+constexpr uint32_t kFilteredSyscall = SYS_vserver;
 
 // Applies a seccomp-bpf filter that returns `filtered_result` for
 // `sysno` and allows all other syscalls. Async-signal-safe.
-void ApplySeccompFilter(uint32 sysno, uint32 filtered_result,
-                        uint32 flags = 0) {
+void ApplySeccompFilter(uint32_t sysno, uint32_t filtered_result,
+                        uint32_t flags = 0) {
   // "Prior to [PR_SET_SECCOMP], the task must call prctl(PR_SET_NO_NEW_PRIVS,
   // 1) or run with CAP_SYS_ADMIN privileges in its namespace." -
   // Documentation/prctl/seccomp_filter.txt
@@ -162,7 +162,7 @@ TEST(SeccompTest, RetKillOnlyKillsOneThread) {
 TEST(SeccompTest, RetTrapCausesSIGSYS) {
   pid_t const pid = fork();
   if (pid == 0) {
-    constexpr uint16 kTrapValue = 0xdead;
+    constexpr uint16_t kTrapValue = 0xdead;
     RegisterSignalHandler(
         SIGSYS, +[](int signo, siginfo_t* info, void* ucv) {
           ucontext_t* uc = static_cast<ucontext_t*>(ucv);
@@ -191,7 +191,7 @@ TEST(SeccompTest, RetTrapCausesSIGSYS) {
 
 #ifdef __x86_64__
 
-constexpr uint64 kVsyscallTimeEntry = 0xffffffffff600400;
+constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
 
 time_t vsyscall_time(time_t* t) {
   return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
@@ -202,7 +202,7 @@ TEST(SeccompTest, SeccompAppliesToVsyscall) {
 
   pid_t const pid = fork();
   if (pid == 0) {
-    constexpr uint16 kTrapValue = 0xdead;
+    constexpr uint16_t kTrapValue = 0xdead;
     RegisterSignalHandler(
         SIGSYS, +[](int signo, siginfo_t* info, void* ucv) {
           ucontext_t* uc = static_cast<ucontext_t*>(ucv);
@@ -335,7 +335,7 @@ TEST(SeccompTest, TsyncAppliesToAllThreads) {
 
 // This test will validate that seccomp(2) rejects unsupported flags.
 TEST(SeccompTest, SeccompRejectsUnknownFlags) {
-  constexpr uint32 kInvalidFlag = 123;
+  constexpr uint32_t kInvalidFlag = 123;
   ASSERT_THAT(
       syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, kInvalidFlag, nullptr),
       SyscallFailsWithErrno(EINVAL));
diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc
index a9e8a44c1..e9b131ca9 100644
--- a/test/syscalls/linux/semaphore.cc
+++ b/test/syscalls/linux/semaphore.cc
@@ -274,7 +274,7 @@ TEST(SemaphoreTest, SemOpRandom) {
 
   // Protects the seed below.
   absl::Mutex mutex;
-  uint32 seed = time(nullptr);
+  uint32_t seed = time(nullptr);
 
   int count = 0;      // Tracks semaphore value.
   bool done = false;  // Tells waiters to stop after signal threads are done.
@@ -284,7 +284,7 @@ TEST(SemaphoreTest, SemOpRandom) {
   for (auto& dec : decs) {
     dec = absl::make_unique<ScopedThread>([&sem, &mutex, &count, &seed, &done] {
       for (size_t i = 0; i < 500; ++i) {
-        int16 val;
+        int16_t val;
         {
           absl::MutexLock l(&mutex);
           if (done) {
@@ -325,7 +325,7 @@ TEST(SemaphoreTest, SemOpRandom) {
   for (auto& inc : incs) {
     inc = absl::make_unique<ScopedThread>([&sem, &mutex, &count, &seed] {
       for (size_t i = 0; i < 500; ++i) {
-        int16 val;
+        int16_t val;
         {
           absl::MutexLock l(&mutex);
           val = (rand_r(&seed) % 10 + 1);  // Rand between 1 and 10.
@@ -415,14 +415,14 @@ TEST(SemaphoreTest, SemCtlValAll) {
   ASSERT_THAT(sem.get(), SyscallSucceeds());
 
   // Semaphores must start with 0.
-  uint16 get[3] = {10, 10, 10};
+  uint16_t get[3] = {10, 10, 10};
   EXPECT_THAT(semctl(sem.get(), 1, GETALL, get), SyscallSucceedsWithValue(0));
   for (auto v : get) {
     EXPECT_EQ(v, 0);
   }
 
   // SetAll and check that they were set.
-  uint16 vals[3] = {0, 10, 20};
+  uint16_t vals[3] = {0, 10, 20};
   EXPECT_THAT(semctl(sem.get(), 1, SETALL, vals), SyscallSucceedsWithValue(0));
   EXPECT_THAT(semctl(sem.get(), 1, GETALL, get), SyscallSucceedsWithValue(0));
   for (size_t i = 0; i < ABSL_ARRAYSIZE(vals); ++i) {
diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc
index 80700615f..7ba752599 100644
--- a/test/syscalls/linux/shm.cc
+++ b/test/syscalls/linux/shm.cc
@@ -30,7 +30,7 @@ namespace {
 
 using ::testing::_;
 
-const uint64 kAllocSize = kPageSize * 128ULL;
+const uint64_t kAllocSize = kPageSize * 128ULL;
 
 PosixErrorOr<char*> Shmat(int shmid, const void* shmaddr, int shmflg) {
   const intptr_t addr =
@@ -320,11 +320,11 @@ TEST(ShmTest, RemovedSegmentsAreDestroyed) {
       Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
   const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
 
-  const uint64 alloc_pages = kAllocSize / kPageSize;
+  const uint64_t alloc_pages = kAllocSize / kPageSize;
 
   struct shm_info info;
   ASSERT_NO_ERRNO(Shmctl(0 /*ignored*/, SHM_INFO, &info));
-  const uint64 before = info.shm_tot;
+  const uint64_t before = info.shm_tot;
 
   ASSERT_NO_ERRNO(shm.Rmid());
   ASSERT_NO_ERRNO(Shmdt(addr));
@@ -400,7 +400,7 @@ TEST(ShmDeathTest, SegmentNotAccessibleAfterDetach) {
 TEST(ShmTest, RequestingSegmentSmallerThanSHMMINFails) {
   struct shminfo info;
   ASSERT_NO_ERRNO(Shmctl(0, IPC_INFO, &info));
-  const uint64 size = info.shmmin - 1;
+  const uint64_t size = info.shmmin - 1;
   EXPECT_THAT(Shmget(IPC_PRIVATE, size, IPC_CREAT | 0777),
               PosixErrorIs(EINVAL, _));
 }
@@ -408,7 +408,7 @@ TEST(ShmTest, RequestingSegmentSmallerThanSHMMINFails) {
 TEST(ShmTest, RequestingSegmentLargerThanSHMMAXFails) {
   struct shminfo info;
   ASSERT_NO_ERRNO(Shmctl(0, IPC_INFO, &info));
-  const uint64 size = info.shmmax + kPageSize;
+  const uint64_t size = info.shmmax + kPageSize;
   EXPECT_THAT(Shmget(IPC_PRIVATE, size, IPC_CREAT | 0777),
               PosixErrorIs(EINVAL, _));
 }
diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc
index 9a0816e10..62b04ef1d 100644
--- a/test/syscalls/linux/sigaltstack.cc
+++ b/test/syscalls/linux/sigaltstack.cc
@@ -114,7 +114,7 @@ TEST(SigaltstackTest, ResetByExecve) {
 
 volatile bool badhandler_on_sigaltstack = true;      // Set by the handler.
 char* volatile badhandler_low_water_mark = nullptr;  // Set by the handler.
-volatile uint8 badhandler_recursive_faults = 0;      // Consumed by the handler.
+volatile uint8_t badhandler_recursive_faults = 0;    // Consumed by the handler.
 
 void badhandler(int sig, siginfo_t* siginfo, void* arg) {
   char stack_var = 0;
diff --git a/test/syscalls/linux/sigiret.cc b/test/syscalls/linux/sigiret.cc
index 207506569..a47c781ea 100644
--- a/test/syscalls/linux/sigiret.cc
+++ b/test/syscalls/linux/sigiret.cc
@@ -28,8 +28,8 @@ namespace testing {
 
 namespace {
 
-constexpr uint64 kOrigRcx = 0xdeadbeeffacefeed;
-constexpr uint64 kOrigR11 = 0xfacefeedbaad1dea;
+constexpr uint64_t kOrigRcx = 0xdeadbeeffacefeed;
+constexpr uint64_t kOrigR11 = 0xfacefeedbaad1dea;
 
 volatile int gotvtalrm, ready;
 
@@ -40,8 +40,8 @@ void sigvtalrm(int sig, siginfo_t* siginfo, void* _uc) {
   // - test is in the busy-wait loop waiting for signal.
   // - %rcx and %r11 values in mcontext_t match kOrigRcx and kOrigR11.
   if (ready &&
-      static_cast<uint64>(uc->uc_mcontext.gregs[REG_RCX]) == kOrigRcx &&
-      static_cast<uint64>(uc->uc_mcontext.gregs[REG_R11]) == kOrigR11) {
+      static_cast<uint64_t>(uc->uc_mcontext.gregs[REG_RCX]) == kOrigRcx &&
+      static_cast<uint64_t>(uc->uc_mcontext.gregs[REG_R11]) == kOrigR11) {
     // Modify the values %rcx and %r11 in the ucontext. These are the
     // values seen by the application after the signal handler returns.
     uc->uc_mcontext.gregs[REG_RCX] = ~kOrigRcx;
@@ -69,8 +69,8 @@ TEST(SigIretTest, CheckRcxR11) {
       ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_VIRTUAL, itimer));
 
   // Initialize %rcx and %r11 and spin until the signal handler returns.
-  uint64 rcx = kOrigRcx;
-  uint64 r11 = kOrigR11;
+  uint64_t rcx = kOrigRcx;
+  uint64_t r11 = kOrigR11;
   asm volatile(
       "movq %[rcx], %%rcx;"                      // %rcx = rcx
       "movq %[r11], %%r11;"                      // %r11 = r11
@@ -91,7 +91,7 @@ TEST(SigIretTest, CheckRcxR11) {
   EXPECT_EQ(r11, ~kOrigR11);
 }
 
-constexpr uint64 kNonCanonicalRip = 0xCCCC000000000000;
+constexpr uint64_t kNonCanonicalRip = 0xCCCC000000000000;
 
 // Test that a non-canonical signal handler faults as expected.
 TEST(SigIretTest, BadHandler) {
diff --git a/test/syscalls/linux/socket_bind_to_device_distribution.cc b/test/syscalls/linux/socket_bind_to_device_distribution.cc
index c705da1b4..5ed57625c 100644
--- a/test/syscalls/linux/socket_bind_to_device_distribution.cc
+++ b/test/syscalls/linux/socket_bind_to_device_distribution.cc
@@ -77,13 +77,13 @@ class BindToDeviceDistributionTest
   }
 };
 
-PosixErrorOr<uint16> AddrPort(int family, sockaddr_storage const& addr) {
+PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) {
   switch (family) {
     case AF_INET:
-      return static_cast<uint16>(
+      return static_cast<uint16_t>(
           reinterpret_cast<sockaddr_in const*>(&addr)->sin_port);
     case AF_INET6:
-      return static_cast<uint16>(
+      return static_cast<uint16_t>(
           reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port);
     default:
       return PosixError(EINVAL,
@@ -91,7 +91,7 @@ PosixErrorOr<uint16> AddrPort(int family, sockaddr_storage const& addr) {
   }
 }
 
-PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16 port) {
+PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) {
   switch (family) {
     case AF_INET:
       reinterpret_cast<sockaddr_in*>(addr)->sin_port = port;
@@ -157,7 +157,7 @@ TEST_P(BindToDeviceDistributionTest, Tcp) {
         getsockname(listener_fds[0].get(),
                     reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
         SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
     ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
   }
@@ -190,7 +190,7 @@ TEST_P(BindToDeviceDistributionTest, Tcp) {
             // cause the test to use absurd amounts of memory.
             //
             // See: https://tools.ietf.org/html/rfc2525#page-50 section 2.17
-            uint16 data;
+            uint16_t data;
             EXPECT_THAT(
                 RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0),
                 SyscallSucceedsWithValue(sizeof(data)));
@@ -296,7 +296,7 @@ TEST_P(BindToDeviceDistributionTest, Udp) {
         getsockname(listener_fds[0].get(),
                     reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
         SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
     ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
     ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc
index ee9856f7f..e8f24a59e 100644
--- a/test/syscalls/linux/socket_generic.cc
+++ b/test/syscalls/linux/socket_generic.cc
@@ -507,7 +507,7 @@ TEST_P(AllSocketPairTest, SoRcvTimeoIsSetLargerArg) {
 
   struct timeval_with_extra {
     struct timeval tv;
-    int64 extra_data;
+    int64_t extra_data;
   } ABSL_ATTRIBUTE_PACKED;
 
   timeval_with_extra tv_extra;
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 12df2b35a..2f9821555 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -47,13 +47,13 @@ namespace {
 
 using ::testing::Gt;
 
-PosixErrorOr<uint16> AddrPort(int family, sockaddr_storage const& addr) {
+PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) {
   switch (family) {
     case AF_INET:
-      return static_cast<uint16>(
+      return static_cast<uint16_t>(
           reinterpret_cast<sockaddr_in const*>(&addr)->sin_port);
     case AF_INET6:
-      return static_cast<uint16>(
+      return static_cast<uint16_t>(
           reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port);
     default:
       return PosixError(EINVAL,
@@ -61,7 +61,7 @@ PosixErrorOr<uint16> AddrPort(int family, sockaddr_storage const& addr) {
   }
 }
 
-PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16 port) {
+PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) {
   switch (family) {
     case AF_INET:
       reinterpret_cast<sockaddr_in*>(addr)->sin_port = port;
@@ -276,7 +276,7 @@ void tcpSimpleConnectTest(TestAddress const& listener,
   ASSERT_THAT(getsockname(listen_fd.get(),
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   // Connect to the listening socket.
@@ -339,7 +339,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   ASSERT_THAT(getsockname(listen_fd.get(),
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   DisableSave ds;  // Too many system calls.
@@ -400,7 +400,7 @@ TEST_P(SocketInetLoopbackTest, TCPbacklog) {
   ASSERT_THAT(getsockname(listen_fd.get(),
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
   int i = 0;
   while (1) {
@@ -468,7 +468,7 @@ TEST_P(SocketInetLoopbackTest, TCPFinWait2Test_NoRandomSave) {
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
 
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   // Connect to the listening socket.
@@ -576,7 +576,7 @@ TEST_P(SocketInetLoopbackTest, TCPLinger2TimeoutAfterClose_NoRandomSave) {
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
 
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   // Connect to the listening socket.
@@ -650,7 +650,7 @@ TEST_P(SocketInetLoopbackTest, TCPResetAfterClose) {
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
 
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   // Connect to the listening socket.
@@ -717,7 +717,7 @@ TEST_P(SocketInetLoopbackTest, TCPTimeWaitTest_NoRandomSave) {
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
 
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   // Connect to the listening socket.
@@ -794,7 +794,7 @@ TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
 
-  const uint16 port =
+  const uint16_t port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   // Set the userTimeout on the listening socket.
@@ -898,7 +898,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
         getsockname(listener_fds[0].get(),
                     reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
         SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
     ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
     ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
@@ -935,7 +935,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
             // cause the test to use absurd amounts of memory.
             //
             // See: https://tools.ietf.org/html/rfc2525#page-50 section 2.17
-            uint16 data;
+            uint16_t data;
             EXPECT_THAT(
                 RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0),
                 SyscallSucceedsWithValue(sizeof(data)));
@@ -1022,7 +1022,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) {
         getsockname(listener_fds[0].get(),
                     reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
         SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
     ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
     ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
@@ -1138,7 +1138,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
         getsockname(listener_fds[0].get(),
                     reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
         SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
     ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
     ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
@@ -1174,7 +1174,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
     pollfds[i].events = POLLIN;
   }
 
-  std::map<uint16, int> portToFD;
+  std::map<uint16_t, int> portToFD;
 
   int received = 0;
   while (received < kConnectAttempts * 2) {
@@ -1196,7 +1196,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
                       fd, &data, sizeof(data), 0,
                       reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
                   SyscallSucceedsWithValue(sizeof(data)));
-      uint16 const port =
+      uint16_t const port =
           ASSERT_NO_ERRNO_AND_VALUE(AddrPort(connector.family(), addr));
       auto prev_port = portToFD.find(port);
       // Check that all packets from one client have been delivered to the
@@ -1257,7 +1257,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedLoopbackOnlyReservesV4) {
     ASSERT_THAT(getsockname(fd_dual.get(),
                             reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
                 SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
 
     // Verify that we can still bind the v6 loopback on the same port.
@@ -1309,7 +1309,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedAnyOnlyReservesV4) {
     ASSERT_THAT(getsockname(fd_dual.get(),
                             reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
                 SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
 
     // Verify that we can still bind the v6 loopback on the same port.
@@ -1360,7 +1360,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, DualStackV6AnyReservesEverything) {
   ASSERT_THAT(getsockname(fd_dual.get(),
                           reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
               SyscallSucceeds());
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
 
   // Verify that binding the v6 loopback with the same port fails.
@@ -1419,7 +1419,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6OnlyV6AnyReservesV6) {
     ASSERT_THAT(getsockname(fd_dual.get(),
                             reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
                 SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
 
     // Verify that binding the v6 loopback with the same port fails.
@@ -1498,7 +1498,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) {
                             reinterpret_cast<sockaddr*>(&connected_addr),
                             &connected_addr_len),
                 SyscallSucceeds());
-    uint16 const ephemeral_port =
+    uint16_t const ephemeral_port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
 
     // Verify that we actually got an ephemeral port.
@@ -1603,7 +1603,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReservedReuseAddr) {
                           reinterpret_cast<sockaddr*>(&connected_addr),
                           &connected_addr_len),
               SyscallSucceeds());
-  uint16 const ephemeral_port =
+  uint16_t const ephemeral_port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
 
   // Verify that we actually got an ephemeral port.
@@ -1665,7 +1665,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) {
                             reinterpret_cast<sockaddr*>(&connected_addr),
                             &connected_addr_len),
                 SyscallSucceeds());
-    uint16 const ephemeral_port =
+    uint16_t const ephemeral_port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
 
     // Verify that we actually got an ephemeral port.
@@ -1794,7 +1794,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest,
                           reinterpret_cast<sockaddr*>(&connected_addr),
                           &connected_addr_len),
               SyscallSucceeds());
-  uint16 const ephemeral_port =
+  uint16_t const ephemeral_port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
 
   // Verify that we actually got an ephemeral port.
@@ -1856,7 +1856,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) {
                             reinterpret_cast<sockaddr*>(&connected_addr),
                             &connected_addr_len),
                 SyscallSucceeds());
-    uint16 const ephemeral_port =
+    uint16_t const ephemeral_port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
 
     // Verify that we actually got an ephemeral port.
@@ -1988,7 +1988,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReservedReuseAddr) {
                           reinterpret_cast<sockaddr*>(&connected_addr),
                           &connected_addr_len),
               SyscallSucceeds());
-  uint16 const ephemeral_port =
+  uint16_t const ephemeral_port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
 
   // Verify that we actually got an ephemeral port.
diff --git a/test/syscalls/linux/socket_ip_unbound.cc b/test/syscalls/linux/socket_ip_unbound.cc
index 4a8337159..ca597e267 100644
--- a/test/syscalls/linux/socket_ip_unbound.cc
+++ b/test/syscalls/linux/socket_ip_unbound.cc
@@ -223,7 +223,7 @@ TEST_P(IPUnboundSocketTest, CheckSkipECN) {
   TOSOption t = GetTOSOption(GetParam().domain);
   EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
               SyscallSucceedsWithValue(0));
-  int expect = static_cast<uint8>(set);
+  int expect = static_cast<uint8_t>(set);
   if (GetParam().protocol == IPPROTO_TCP) {
     expect &= ~INET_ECN_MASK;
   }
@@ -267,7 +267,7 @@ TEST_P(IPUnboundSocketTest, SmallTOSOptionSize) {
       EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, i),
                   SyscallSucceedsWithValue(0));
       expect_tos = set;
-      expect_sz = sizeof(uint8);
+      expect_sz = sizeof(uint8_t);
     } else {
       EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, i),
                   SyscallFailsWithErrno(EINVAL));
@@ -314,7 +314,7 @@ TEST_P(IPUnboundSocketTest, NegativeTOS) {
               SyscallSucceedsWithValue(0));
   int expect;
   if (GetParam().domain == AF_INET) {
-    expect = static_cast<uint8>(set);
+    expect = static_cast<uint8_t>(set);
     if (GetParam().protocol == IPPROTO_TCP) {
       expect &= ~INET_ECN_MASK;
     }
@@ -340,7 +340,7 @@ TEST_P(IPUnboundSocketTest, InvalidNegativeTOS) {
   if (GetParam().domain == AF_INET) {
     EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
                 SyscallSucceedsWithValue(0));
-    expect = static_cast<uint8>(set);
+    expect = static_cast<uint8_t>(set);
     if (GetParam().protocol == IPPROTO_TCP) {
       expect &= ~INET_ECN_MASK;
     }
diff --git a/test/syscalls/linux/socket_netdevice.cc b/test/syscalls/linux/socket_netdevice.cc
index 689014a59..405dbbd73 100644
--- a/test/syscalls/linux/socket_netdevice.cc
+++ b/test/syscalls/linux/socket_netdevice.cc
@@ -70,14 +70,14 @@ TEST(NetdeviceTest, Netmask) {
   // netmask obtained via ioctl.
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
-  uint32 port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+  uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
     struct nlmsghdr hdr;
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
@@ -109,7 +109,7 @@ TEST(NetdeviceTest, Netmask) {
 
         struct ifaddrmsg *ifaddrmsg =
             reinterpret_cast<struct ifaddrmsg *>(NLMSG_DATA(hdr));
-        if (ifaddrmsg->ifa_index == static_cast<uint32>(ifr.ifr_ifindex) &&
+        if (ifaddrmsg->ifa_index == static_cast<uint32_t>(ifr.ifr_ifindex) &&
             ifaddrmsg->ifa_family == AF_INET) {
           prefixlen = ifaddrmsg->ifa_prefixlen;
         }
@@ -120,7 +120,7 @@ TEST(NetdeviceTest, Netmask) {
 
   // Netmask is stored big endian in struct sockaddr_in, so we do the same for
   // comparison.
-  uint32 mask = 0xffffffff << (32 - prefixlen);
+  uint32_t mask = 0xffffffff << (32 - prefixlen);
   mask = absl::gbswap_32(mask);
 
   // Check that the loopback interface has the correct subnet mask.
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index 5612f1a13..ef567f512 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -116,14 +116,14 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
 TEST(NetlinkRouteTest, GetLinkDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
-  uint32 port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+  uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
     struct nlmsghdr hdr;
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -164,7 +164,7 @@ TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -198,7 +198,7 @@ TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -238,7 +238,7 @@ TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -274,7 +274,7 @@ TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
 TEST(NetlinkRouteTest, ControlMessageIgnored) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
-  uint32 port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+  uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
     struct nlmsghdr control_hdr;
@@ -282,7 +282,7 @@ TEST(NetlinkRouteTest, ControlMessageIgnored) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req = {};
 
@@ -310,14 +310,14 @@ TEST(NetlinkRouteTest, ControlMessageIgnored) {
 TEST(NetlinkRouteTest, GetAddrDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
-  uint32 port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+  uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
     struct nlmsghdr hdr;
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
@@ -371,14 +371,14 @@ TEST(NetlinkRouteTest, LookupAll) {
 TEST(NetlinkRouteTest, GetRouteDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
-  uint32 port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+  uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
     struct nlmsghdr hdr;
     struct rtmsg rtm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -454,7 +454,7 @@ TEST(NetlinkRouteTest, RecvmsgTrunc) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
@@ -531,7 +531,7 @@ TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
@@ -611,7 +611,7 @@ TEST(NetlinkRouteTest, NoPasscredNoCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
@@ -659,7 +659,7 @@ TEST(NetlinkRouteTest, PasscredCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index 17f99c238..723f5d728 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -40,7 +40,7 @@ PosixErrorOr<FileDescriptor> NetlinkBoundSocket(int protocol) {
   return std::move(fd);
 }
 
-PosixErrorOr<uint32> NetlinkPortID(int fd) {
+PosixErrorOr<uint32_t> NetlinkPortID(int fd) {
   struct sockaddr_nl addr;
   socklen_t addrlen = sizeof(addr);
 
@@ -48,7 +48,7 @@ PosixErrorOr<uint32> NetlinkPortID(int fd) {
       getsockname(fd, reinterpret_cast<struct sockaddr*>(&addr), &addrlen));
   MaybeSave();
 
-  return static_cast<uint32>(addr.nl_pid);
+  return static_cast<uint32_t>(addr.nl_pid);
 }
 
 PosixError NetlinkRequestResponse(
diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index bd0c1d79b..76e772c48 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -30,7 +30,7 @@ namespace testing {
 PosixErrorOr<FileDescriptor> NetlinkBoundSocket(int protocol);
 
 // Returns the port ID of the passed socket.
-PosixErrorOr<uint32> NetlinkPortID(int fd);
+PosixErrorOr<uint32_t> NetlinkPortID(int fd);
 
 // Send the passed request and call fn will all response netlink messages.
 PosixError NetlinkRequestResponse(
diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
index 2169ff1c6..eff7d577e 100644
--- a/test/syscalls/linux/socket_test_util.cc
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -507,7 +507,7 @@ void TransferTest(int fd1, int fd2) {
 
 // Initializes the given buffer with random data.
 void RandomizeBuffer(char* ptr, size_t len) {
-  uint32 seed = time(nullptr);
+  uint32_t seed = time(nullptr);
   for (size_t i = 0; i < len; ++i) {
     ptr[i] = static_cast<char>(rand_r(&seed));
   }
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index 562b6a8d4..85232cb1f 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -139,7 +139,7 @@ TEST(SpliceTest, PipeOffsets) {
 // Event FDs may be used with splice without an offset.
 TEST(SpliceTest, FromEventFD) {
   // Open the input eventfd with an initial value so that it is readable.
-  constexpr uint64 kEventFDValue = 1;
+  constexpr uint64_t kEventFDValue = 1;
   int efd;
   ASSERT_THAT(efd = eventfd(kEventFDValue, 0), SyscallSucceeds());
   const FileDescriptor in_fd(efd);
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index 6b259cb89..30de2f8ff 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -568,35 +568,35 @@ TEST(SimpleStatTest, AnonDeviceAllocatesUniqueInodesAcrossSaveRestore) {
 
 // struct kernel_statx_timestamp is a Linux statx_timestamp struct.
 struct kernel_statx_timestamp {
-  int64 tv_sec;
-  uint32 tv_nsec;
-  int32 __reserved;
+  int64_t tv_sec;
+  uint32_t tv_nsec;
+  int32_t __reserved;
 };
 
 // struct kernel_statx is a Linux statx struct. Old versions of glibc do not
 // expose it. See include/uapi/linux/stat.h
 struct kernel_statx {
-  uint32 stx_mask;
-  uint32 stx_blksize;
-  uint64 stx_attributes;
-  uint32 stx_nlink;
-  uint32 stx_uid;
-  uint32 stx_gid;
-  uint16 stx_mode;
-  uint16 __spare0[1];
-  uint64 stx_ino;
-  uint64 stx_size;
-  uint64 stx_blocks;
-  uint64 stx_attributes_mask;
+  uint32_t stx_mask;
+  uint32_t stx_blksize;
+  uint64_t stx_attributes;
+  uint32_t stx_nlink;
+  uint32_t stx_uid;
+  uint32_t stx_gid;
+  uint16_t stx_mode;
+  uint16_t __spare0[1];
+  uint64_t stx_ino;
+  uint64_t stx_size;
+  uint64_t stx_blocks;
+  uint64_t stx_attributes_mask;
   struct kernel_statx_timestamp stx_atime;
   struct kernel_statx_timestamp stx_btime;
   struct kernel_statx_timestamp stx_ctime;
   struct kernel_statx_timestamp stx_mtime;
-  uint32 stx_rdev_major;
-  uint32 stx_rdev_minor;
-  uint32 stx_dev_major;
-  uint32 stx_dev_minor;
-  uint64 __spare2[14];
+  uint32_t stx_rdev_major;
+  uint32_t stx_rdev_minor;
+  uint32_t stx_dev_major;
+  uint32_t stx_dev_minor;
+  uint64_t __spare2[14];
 };
 
 int statx(int dirfd, const char *pathname, int flags, unsigned int mask,
diff --git a/test/syscalls/linux/sticky.cc b/test/syscalls/linux/sticky.cc
index abcabaffb..7e73325bf 100644
--- a/test/syscalls/linux/sticky.cc
+++ b/test/syscalls/linux/sticky.cc
@@ -29,8 +29,8 @@
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
-ABSL_FLAG(int32, scratch_uid, 65534, "first scratch UID");
-ABSL_FLAG(int32, scratch_gid, 65534, "first scratch GID");
+ABSL_FLAG(int32_t, scratch_uid, 65534, "first scratch UID");
+ABSL_FLAG(int32_t, scratch_gid, 65534, "first scratch GID");
 
 namespace gvisor {
 namespace testing {
diff --git a/test/syscalls/linux/sysret.cc b/test/syscalls/linux/sysret.cc
index d98d6be91..819fa655a 100644
--- a/test/syscalls/linux/sysret.cc
+++ b/test/syscalls/linux/sysret.cc
@@ -26,8 +26,8 @@ namespace testing {
 
 namespace {
 
-constexpr uint64 kNonCanonicalRip = 0xCCCC000000000000;
-constexpr uint64 kNonCanonicalRsp = 0xFFFF000000000000;
+constexpr uint64_t kNonCanonicalRip = 0xCCCC000000000000;
+constexpr uint64_t kNonCanonicalRsp = 0xFFFF000000000000;
 
 class SysretTest : public ::testing::Test {
  protected:
@@ -60,12 +60,12 @@ class SysretTest : public ::testing::Test {
     ASSERT_THAT(ptrace(PTRACE_DETACH, child_, 0, 0), SyscallSucceeds());
   }
 
-  void SetRip(uint64 newrip) {
+  void SetRip(uint64_t newrip) {
     regs_.rip = newrip;
     ASSERT_THAT(ptrace(PTRACE_SETREGS, child_, 0, &regs_), SyscallSucceeds());
   }
 
-  void SetRsp(uint64 newrsp) {
+  void SetRsp(uint64_t newrsp) {
     regs_.rsp = newrsp;
     ASSERT_THAT(ptrace(PTRACE_SETREGS, child_, 0, &regs_), SyscallSucceeds());
   }
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index cb304d6f5..33a5ac66c 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -640,7 +640,7 @@ TEST_P(TcpSocketTest, Tiocinq) {
   size_t size = sizeof(buf);
   ASSERT_THAT(RetryEINTR(write)(s_, buf, size), SyscallSucceedsWithValue(size));
 
-  uint32 seed = time(nullptr);
+  uint32_t seed = time(nullptr);
   const size_t max_chunk = size / 10;
   while (size > 0) {
     size_t chunk = (rand_r(&seed) % max_chunk) + 1;
diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc
index 03e028f50..c7eead17e 100644
--- a/test/syscalls/linux/time.cc
+++ b/test/syscalls/linux/time.cc
@@ -28,7 +28,7 @@ constexpr long kFudgeSeconds = 5;
 
 // Mimics the time(2) wrapper from glibc prior to 2.15.
 time_t vsyscall_time(time_t* t) {
-  constexpr uint64 kVsyscallTimeEntry = 0xffffffffff600400;
+  constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
   return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
 }
 
@@ -63,7 +63,7 @@ TEST(TimeTest, VsyscallTime_InvalidAddressSIGSEGV) {
 }
 
 int vsyscall_gettimeofday(struct timeval* tv, struct timezone* tz) {
-  constexpr uint64 kVsyscallGettimeofdayEntry = 0xffffffffff600000;
+  constexpr uint64_t kVsyscallGettimeofdayEntry = 0xffffffffff600000;
   return reinterpret_cast<int (*)(struct timeval*, struct timezone*)>(
       kVsyscallGettimeofdayEntry)(tv, tz);
 }
diff --git a/test/syscalls/linux/timerfd.cc b/test/syscalls/linux/timerfd.cc
index d87dbc666..86ed87b7c 100644
--- a/test/syscalls/linux/timerfd.cc
+++ b/test/syscalls/linux/timerfd.cc
@@ -69,9 +69,9 @@ TEST_P(TimerfdTest, SingleShot) {
 
   // The timer should fire exactly once since the interval is zero.
   absl::SleepFor(kDelay + TimerSlack());
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
-              SyscallSucceedsWithValue(sizeof(uint64)));
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+              SyscallSucceedsWithValue(sizeof(uint64_t)));
   EXPECT_EQ(1, val);
 }
 
@@ -89,9 +89,9 @@ TEST_P(TimerfdTest, Periodic) {
   // Expect to see at least kPeriods expirations. More may occur due to the
   // timer slack, or due to delays from scheduling or save/restore.
   absl::SleepFor(kPeriods * kDelay + TimerSlack());
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
-              SyscallSucceedsWithValue(sizeof(uint64)));
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+              SyscallSucceedsWithValue(sizeof(uint64_t)));
   EXPECT_GE(val, kPeriods);
 }
 
@@ -106,9 +106,9 @@ TEST_P(TimerfdTest, BlockingRead) {
               SyscallSucceeds());
 
   // read should block until the timer fires.
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
-              SyscallSucceedsWithValue(sizeof(uint64)));
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+              SyscallSucceedsWithValue(sizeof(uint64_t)));
   auto const end_time = absl::Now();
   EXPECT_EQ(1, val);
   EXPECT_GE((end_time - start_time) + TimerSlack(), kDelay);
@@ -122,8 +122,8 @@ TEST_P(TimerfdTest, NonblockingRead_NoRandomSave) {
 
   // Since the timer is initially disabled and has never fired, read should
   // return EAGAIN.
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
               SyscallFailsWithErrno(EAGAIN));
 
   DisableSave ds;  // Timing-sensitive.
@@ -135,19 +135,19 @@ TEST_P(TimerfdTest, NonblockingRead_NoRandomSave) {
               SyscallSucceeds());
 
   // Since the timer has not yet fired, read should return EAGAIN.
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
               SyscallFailsWithErrno(EAGAIN));
 
   ds.reset();  // No longer timing-sensitive.
 
   // After the timer fires, read should indicate 1 expiration.
   absl::SleepFor(kDelay + TimerSlack());
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
-              SyscallSucceedsWithValue(sizeof(uint64)));
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+              SyscallSucceedsWithValue(sizeof(uint64_t)));
   EXPECT_EQ(1, val);
 
   // The successful read should have reset the number of expirations.
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
               SyscallFailsWithErrno(EAGAIN));
 }
 
@@ -179,8 +179,8 @@ TEST_P(TimerfdTest, BlockingPoll_SetTimeResetsExpirations) {
   its.it_value.tv_sec = 0;
   ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr),
               SyscallSucceeds());
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
               SyscallFailsWithErrno(EAGAIN));
 }
 
@@ -198,16 +198,16 @@ TEST_P(TimerfdTest, SetAbsoluteTime) {
               SyscallSucceeds());
 
   absl::SleepFor(kDelay + TimerSlack());
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
-              SyscallSucceedsWithValue(sizeof(uint64)));
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+              SyscallSucceedsWithValue(sizeof(uint64_t)));
   EXPECT_EQ(1, val);
 }
 
 TEST_P(TimerfdTest, IllegalReadWrite) {
   auto const tfd =
       ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(GetParam(), TFD_NONBLOCK));
-  uint64 val = 0;
+  uint64_t val = 0;
   EXPECT_THAT(PreadFd(tfd.get(), &val, sizeof(val), 0),
               SyscallFailsWithErrno(ESPIPE));
   EXPECT_THAT(WriteFd(tfd.get(), &val, sizeof(val)),
@@ -244,9 +244,9 @@ TEST(TimerfdClockRealtimeTest, ClockRealtime) {
   ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr),
               SyscallSucceeds());
 
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
-              SyscallSucceedsWithValue(sizeof(uint64)));
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+              SyscallSucceedsWithValue(sizeof(uint64_t)));
   EXPECT_EQ(1, val);
 }
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index af94d7baa..a2f6ef8cc 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -34,7 +34,7 @@ namespace gvisor {
 namespace testing {
 
 // Gets a pointer to the port component of the given address.
-uint16* Port(struct sockaddr_storage* addr) {
+uint16_t* Port(struct sockaddr_storage* addr) {
   switch (addr->ss_family) {
     case AF_INET: {
       auto sin = reinterpret_cast<struct sockaddr_in*>(addr);
@@ -331,7 +331,7 @@ TEST_P(UdpSocketTest, Connect) {
   EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
 }
 
-void ConnectAny(AddressFamily family, int sockfd, uint16 port) {
+void ConnectAny(AddressFamily family, int sockfd, uint16_t port) {
   struct sockaddr_storage addr = {};
 
   // Precondition check.
@@ -1398,7 +1398,7 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
   received_iov.iov_len = kDataLength;
   received_msg.msg_iov = &received_iov;
   received_msg.msg_iovlen = 1;
-  size_t cmsg_data_len = sizeof(int8);
+  size_t cmsg_data_len = sizeof(int8_t);
   if (sent_type == IPV6_TCLASS) {
     cmsg_data_len = sizeof(int);
   }
@@ -1413,7 +1413,7 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
   EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
   EXPECT_EQ(cmsg->cmsg_level, sent_level);
   EXPECT_EQ(cmsg->cmsg_type, sent_type);
-  int8 received_tos = 0;
+  int8_t received_tos = 0;
   memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
   EXPECT_EQ(received_tos, sent_tos);
 }
@@ -1453,7 +1453,7 @@ TEST_P(UdpSocketTest, SendAndReceiveTOS) {
   sent_iov.iov_len = kDataLength;
   sent_msg.msg_iov = &sent_iov;
   sent_msg.msg_iovlen = 1;
-  size_t cmsg_data_len = sizeof(int8);
+  size_t cmsg_data_len = sizeof(int8_t);
   if (sent_level == SOL_IPV6) {
     sent_type = IPV6_TCLASS;
     cmsg_data_len = sizeof(int);
@@ -1467,7 +1467,7 @@ TEST_P(UdpSocketTest, SendAndReceiveTOS) {
   sent_cmsg->cmsg_len = CMSG_LEN(cmsg_data_len);
   sent_cmsg->cmsg_level = sent_level;
   sent_cmsg->cmsg_type = sent_type;
-  *(int8*)CMSG_DATA(sent_cmsg) = sent_tos;
+  *(int8_t*)CMSG_DATA(sent_cmsg) = sent_tos;
 
   ASSERT_THAT(RetryEINTR(sendmsg)(t_, &sent_msg, 0),
               SyscallSucceedsWithValue(kDataLength));
@@ -1491,7 +1491,7 @@ TEST_P(UdpSocketTest, SendAndReceiveTOS) {
   EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
   EXPECT_EQ(cmsg->cmsg_level, sent_level);
   EXPECT_EQ(cmsg->cmsg_type, sent_type);
-  int8 received_tos = 0;
+  int8_t received_tos = 0;
   memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
   EXPECT_EQ(received_tos, sent_tos);
 }
diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc
index e0e39e5e3..6218fbce1 100644
--- a/test/syscalls/linux/uidgid.cc
+++ b/test/syscalls/linux/uidgid.cc
@@ -27,10 +27,10 @@
 #include "test/util/thread_util.h"
 #include "test/util/uid_util.h"
 
-ABSL_FLAG(int32, scratch_uid1, 65534, "first scratch UID");
-ABSL_FLAG(int32, scratch_uid2, 65533, "second scratch UID");
-ABSL_FLAG(int32, scratch_gid1, 65534, "first scratch GID");
-ABSL_FLAG(int32, scratch_gid2, 65533, "second scratch GID");
+ABSL_FLAG(int32_t, scratch_uid1, 65534, "first scratch UID");
+ABSL_FLAG(int32_t, scratch_uid2, 65533, "second scratch UID");
+ABSL_FLAG(int32_t, scratch_gid1, 65534, "first scratch GID");
+ABSL_FLAG(int32_t, scratch_gid2, 65533, "second scratch GID");
 
 using ::testing::UnorderedElementsAreArray;
 
diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc
index e7bae9c07..3a927a430 100644
--- a/test/syscalls/linux/utimes.cc
+++ b/test/syscalls/linux/utimes.cc
@@ -163,12 +163,12 @@ TEST(FutimesatTest, OnRelPath) {
 TEST(FutimesatTest, InvalidNsec) {
   auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
   struct timeval times[4][2] = {{
-                                    {0, 1},                       // Valid
-                                    {1, static_cast<int64>(1e7)}  // Invalid
+                                    {0, 1},                         // Valid
+                                    {1, static_cast<int64_t>(1e7)}  // Invalid
                                 },
                                 {
-                                    {1, static_cast<int64>(1e7)},  // Invalid
-                                    {0, 1}                         // Valid
+                                    {1, static_cast<int64_t>(1e7)},  // Invalid
+                                    {0, 1}                           // Valid
                                 },
                                 {
                                     {0, 1},  // Valid
@@ -288,14 +288,15 @@ TEST(UtimeTest, ZeroAtimeandMtime) {
 
 TEST(UtimensatTest, InvalidNsec) {
   auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
-  struct timespec times[2][2] = {{
-                                     {0, UTIME_OMIT},               // Valid
-                                     {2, static_cast<int64>(1e10)}  // Invalid
-                                 },
-                                 {
-                                     {2, static_cast<int64>(1e10)},  // Invalid
-                                     {0, UTIME_OMIT}                 // Valid
-                                 }};
+  struct timespec times[2][2] = {
+      {
+          {0, UTIME_OMIT},                 // Valid
+          {2, static_cast<int64_t>(1e10)}  // Invalid
+      },
+      {
+          {2, static_cast<int64_t>(1e10)},  // Invalid
+          {0, UTIME_OMIT}                   // Valid
+      }};
 
   for (unsigned int i = 0; i < sizeof(times) / sizeof(times[0]); i++) {
     std::cout << "test:" << i << "\n";
diff --git a/test/syscalls/linux/vfork.cc b/test/syscalls/linux/vfork.cc
index 153b3bd69..0aaba482d 100644
--- a/test/syscalls/linux/vfork.cc
+++ b/test/syscalls/linux/vfork.cc
@@ -51,7 +51,7 @@ constexpr absl::Duration kChildDelay = absl::Seconds(10);
 // errno, so kChildExitCode is chosen to be an unlikely errno:
 constexpr int kChildExitCode = 118;  // ENOTNAM: Not a XENIX named type file
 
-int64 MonotonicNow() {
+int64_t MonotonicNow() {
   struct timespec now;
   TEST_PCHECK(clock_gettime(CLOCK_MONOTONIC, &now) == 0);
   return now.tv_sec * 1000000000ll + now.tv_nsec;
@@ -62,7 +62,7 @@ TEST(VforkTest, ParentStopsUntilChildExits) {
     // N.B. Run the test in a single-threaded subprocess because
     // vfork is not safe in a multi-threaded process.
 
-    const int64 start = MonotonicNow();
+    const int64_t start = MonotonicNow();
 
     pid_t pid = vfork();
     if (pid == 0) {
@@ -72,7 +72,7 @@ TEST(VforkTest, ParentStopsUntilChildExits) {
     TEST_PCHECK_MSG(pid > 0, "vfork failed");
     MaybeSave();
 
-    const int64 end = MonotonicNow();
+    const int64_t end = MonotonicNow();
 
     absl::Duration dur = absl::Nanoseconds(end - start);
 
@@ -92,7 +92,7 @@ TEST(VforkTest, ParentStopsUntilChildExecves_NoRandomSave) {
   char* const* const child_argv = owned_child_argv.get();
 
   const auto test = [&] {
-    const int64 start = MonotonicNow();
+    const int64_t start = MonotonicNow();
 
     pid_t pid = vfork();
     if (pid == 0) {
@@ -104,7 +104,7 @@ TEST(VforkTest, ParentStopsUntilChildExecves_NoRandomSave) {
     // since the test expects an upper bound on the time spent
     // stopped.
     int saved_errno = errno;
-    const int64 end = MonotonicNow();
+    const int64_t end = MonotonicNow();
     errno = saved_errno;
     TEST_PCHECK_MSG(pid > 0, "vfork failed");
     MaybeSave();
@@ -143,7 +143,7 @@ TEST(VforkTest, ExecedChildExitDoesntUnstopParent_NoRandomSave) {
     // pid1 exec'd and is now sleeping.
     SleepSafe(kChildDelay / 2);
 
-    const int64 start = MonotonicNow();
+    const int64_t start = MonotonicNow();
 
     pid_t pid2 = vfork();
     if (pid2 == 0) {
@@ -153,7 +153,7 @@ TEST(VforkTest, ExecedChildExitDoesntUnstopParent_NoRandomSave) {
     TEST_PCHECK_MSG(pid2 > 0, "vfork failed");
     MaybeSave();
 
-    const int64 end = MonotonicNow();
+    const int64_t end = MonotonicNow();
 
     absl::Duration dur = absl::Nanoseconds(end - start);
 
diff --git a/test/syscalls/linux/vsyscall.cc b/test/syscalls/linux/vsyscall.cc
index 99e8c6cea..2c2303358 100644
--- a/test/syscalls/linux/vsyscall.cc
+++ b/test/syscalls/linux/vsyscall.cc
@@ -25,7 +25,7 @@ namespace testing {
 namespace {
 
 time_t vsyscall_time(time_t* t) {
-  constexpr uint64 kVsyscallTimeEntry = 0xffffffffff600400;
+  constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
   return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
 }
 
diff --git a/test/syscalls/linux/wait.cc b/test/syscalls/linux/wait.cc
index 709b87a21..944149d5e 100644
--- a/test/syscalls/linux/wait.cc
+++ b/test/syscalls/linux/wait.cc
@@ -64,7 +64,7 @@ static const size_t kStackSize = 2 * kPageSize;
 // The child thread created in CloneAndExit runs this function.
 // This child does not have the TLS setup, so it must not use glibc functions.
 int CloneChild(void* priv) {
-  int64 sleep = reinterpret_cast<int64>(priv);
+  int64_t sleep = reinterpret_cast<int64_t>(priv);
   SleepSafe(absl::Seconds(sleep));
 
   // glibc's _exit(2) function wrapper will helpfully call exit_group(2),
@@ -75,7 +75,7 @@ int CloneChild(void* priv) {
 
 // ForkAndExit forks a child process which exits with exit_code, after
 // sleeping for the specified duration (seconds).
-pid_t ForkAndExit(int exit_code, int64 sleep) {
+pid_t ForkAndExit(int exit_code, int64_t sleep) {
   pid_t child = fork();
   if (child == 0) {
     SleepSafe(absl::Seconds(sleep));
@@ -84,16 +84,16 @@ pid_t ForkAndExit(int exit_code, int64 sleep) {
   return child;
 }
 
-int64 clock_gettime_nsecs(clockid_t id) {
+int64_t clock_gettime_nsecs(clockid_t id) {
   struct timespec ts;
   TEST_PCHECK(clock_gettime(id, &ts) == 0);
   return (ts.tv_sec * 1000000000 + ts.tv_nsec);
 }
 
-void spin(int64 sec) {
-  int64 ns = sec * 1000000000;
-  int64 start = clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID);
-  int64 end = start + ns;
+void spin(int64_t sec) {
+  int64_t ns = sec * 1000000000;
+  int64_t start = clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID);
+  int64_t end = start + ns;
 
   do {
     constexpr int kLoopCount = 1000000;  // large and arbitrary
@@ -105,7 +105,7 @@ void spin(int64 sec) {
 
 // ForkSpinAndExit forks a child process which exits with exit_code, after
 // spinning for the specified duration (seconds).
-pid_t ForkSpinAndExit(int exit_code, int64 spintime) {
+pid_t ForkSpinAndExit(int exit_code, int64_t spintime) {
   pid_t child = fork();
   if (child == 0) {
     spin(spintime);
@@ -141,7 +141,7 @@ int FreeStack(uintptr_t addr) {
 // CloneAndExit clones a child thread, which exits with 0 after sleeping for
 // the specified duration (must be in seconds). extra_flags are ORed against
 // the standard clone(2) flags.
-int CloneAndExit(int64 sleep, uintptr_t stack, int extra_flags) {
+int CloneAndExit(int64_t sleep, uintptr_t stack, int extra_flags) {
   return clone(CloneChild, reinterpret_cast<void*>(stack),
                CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_VM | extra_flags,
                reinterpret_cast<void*>(sleep));
diff --git a/test/util/mount_util.h b/test/util/mount_util.h
index 51119f22f..23eea51a2 100644
--- a/test/util/mount_util.h
+++ b/test/util/mount_util.h
@@ -33,9 +33,9 @@ namespace testing {
 // destroyed.
 inline PosixErrorOr<Cleanup> Mount(const std::string &source,
                                    const std::string &target,
-                                   const std::string &fstype, uint64 mountflags,
-                                   const std::string &data,
-                                   uint64 umountflags) {
+                                   const std::string &fstype,
+                                   uint64_t mountflags, const std::string &data,
+                                   uint64_t umountflags) {
   if (mount(source.c_str(), target.c_str(), fstype.c_str(), mountflags,
             data.c_str()) == -1) {
     return PosixError(errno, "mount failed");
diff --git a/test/util/multiprocess_util.cc b/test/util/multiprocess_util.cc
index ba601f300..8b676751b 100644
--- a/test/util/multiprocess_util.cc
+++ b/test/util/multiprocess_util.cc
@@ -135,7 +135,7 @@ PosixErrorOr<Cleanup> ForkAndExec(const std::string& filename,
   return ForkAndExecHelper(exec_fn, fn, child, execve_errno);
 }
 
-PosixErrorOr<Cleanup> ForkAndExecveat(const int32 dirfd,
+PosixErrorOr<Cleanup> ForkAndExecveat(const int32_t dirfd,
                                       const std::string& pathname,
                                       const ExecveArray& argv,
                                       const ExecveArray& envv, const int flags,
diff --git a/test/util/multiprocess_util.h b/test/util/multiprocess_util.h
index 342e73a52..3e736261b 100644
--- a/test/util/multiprocess_util.h
+++ b/test/util/multiprocess_util.h
@@ -103,13 +103,14 @@ inline PosixErrorOr<Cleanup> ForkAndExec(const std::string& filename,
 }
 
 // Equivalent to ForkAndExec, except using dirfd and flags with execveat.
-PosixErrorOr<Cleanup> ForkAndExecveat(int32 dirfd, const std::string& pathname,
+PosixErrorOr<Cleanup> ForkAndExecveat(int32_t dirfd,
+                                      const std::string& pathname,
                                       const ExecveArray& argv,
                                       const ExecveArray& envv, int flags,
                                       const std::function<void()>& fn,
                                       pid_t* child, int* execve_errno);
 
-inline PosixErrorOr<Cleanup> ForkAndExecveat(int32 dirfd,
+inline PosixErrorOr<Cleanup> ForkAndExecveat(int32_t dirfd,
                                              const std::string& pathname,
                                              const ExecveArray& argv,
                                              const ExecveArray& envv, int flags,
diff --git a/test/util/proc_util.cc b/test/util/proc_util.cc
index c81f363ef..34d636ba9 100644
--- a/test/util/proc_util.cc
+++ b/test/util/proc_util.cc
@@ -72,7 +72,7 @@ PosixErrorOr<ProcMapsEntry> ParseProcMapsLine(absl::string_view line) {
   ASSIGN_OR_RETURN_ERRNO(map_entry.major, AtoiBase(device[0], 16));
   ASSIGN_OR_RETURN_ERRNO(map_entry.minor, AtoiBase(device[1], 16));
 
-  ASSIGN_OR_RETURN_ERRNO(map_entry.inode, Atoi<int64>(parts[4]));
+  ASSIGN_OR_RETURN_ERRNO(map_entry.inode, Atoi<int64_t>(parts[4]));
   if (parts.size() == 6) {
     // A filename is present. However, absl::StrSplit retained the whitespace
     // between the inode number and the filename.
diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc
index f5096dd53..35aacb172 100644
--- a/test/util/temp_path.cc
+++ b/test/util/temp_path.cc
@@ -32,7 +32,7 @@ namespace testing {
 
 namespace {
 
-std::atomic<uint64> global_temp_file_number = ATOMIC_VAR_INIT(1);
+std::atomic<uint64_t> global_temp_file_number = ATOMIC_VAR_INIT(1);
 
 // Return a new temp filename, intended to be unique system-wide.
 //
diff --git a/test/util/test_util.cc b/test/util/test_util.cc
index 51f4b4539..848504c88 100644
--- a/test/util/test_util.cc
+++ b/test/util/test_util.cc
@@ -79,7 +79,7 @@ bool IsRunningWithHostinet() {
 #endif  // defined(__x86_64__)
 
 CPUVendor GetCPUVendor() {
-  uint32 eax, ebx, ecx, edx;
+  uint32_t eax, ebx, ecx, edx;
   std::string vendor_str;
   // Get vendor string (issue CPUID with eax = 0)
   GETCPUID(eax, ebx, ecx, edx, 0, 0);
@@ -179,36 +179,36 @@ PosixErrorOr<std::vector<OpenFd>> GetOpenFDs() {
   return ret_fds;
 }
 
-PosixErrorOr<uint64> Links(const std::string& path) {
+PosixErrorOr<uint64_t> Links(const std::string& path) {
   struct stat st;
   if (stat(path.c_str(), &st)) {
     return PosixError(errno, absl::StrCat("Failed to stat ", path));
   }
-  return static_cast<uint64>(st.st_nlink);
+  return static_cast<uint64_t>(st.st_nlink);
 }
 
 void RandomizeBuffer(void* buffer, size_t len) {
   struct timespec ts = {};
   clock_gettime(CLOCK_MONOTONIC, &ts);
-  uint32 seed = static_cast<uint32>(ts.tv_nsec);
+  uint32_t seed = static_cast<uint32_t>(ts.tv_nsec);
   char* const buf = static_cast<char*>(buffer);
   for (size_t i = 0; i < len; i++) {
     buf[i] = rand_r(&seed) % 255;
   }
 }
 
-std::vector<std::vector<struct iovec>> GenerateIovecs(uint64 total_size,
+std::vector<std::vector<struct iovec>> GenerateIovecs(uint64_t total_size,
                                                       void* buf,
                                                       size_t buflen) {
   std::vector<std::vector<struct iovec>> result;
-  for (uint64 offset = 0; offset < total_size;) {
+  for (uint64_t offset = 0; offset < total_size;) {
     auto& iovec_array = *result.emplace(result.end());
 
     for (; offset < total_size && iovec_array.size() < IOV_MAX;
          offset += buflen) {
       struct iovec iov = {};
       iov.iov_base = buf;
-      iov.iov_len = std::min<uint64>(total_size - offset, buflen);
+      iov.iov_len = std::min<uint64_t>(total_size - offset, buflen);
       iovec_array.push_back(iov);
     }
   }
@@ -216,15 +216,15 @@ std::vector<std::vector<struct iovec>> GenerateIovecs(uint64 total_size,
   return result;
 }
 
-uint64 Megabytes(uint64 n) {
+uint64_t Megabytes(uint64_t n) {
   // Overflow check, upper 20 bits in n shouldn't be set.
   TEST_CHECK(!(0xfffff00000000000 & n));
   return n << 20;
 }
 
-bool Equivalent(uint64 current, uint64 target, double tolerance) {
+bool Equivalent(uint64_t current, uint64_t target, double tolerance) {
   auto abs_diff = target > current ? target - current : current - target;
-  return abs_diff <= static_cast<uint64>(tolerance * target);
+  return abs_diff <= static_cast<uint64_t>(tolerance * target);
 }
 
 }  // namespace testing
diff --git a/test/util/test_util.h b/test/util/test_util.h
index 6eb46ac76..b3235c7e3 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -264,7 +264,7 @@ std::ostream& operator<<(std::ostream& out, OpenFd const& ofd);
 PosixErrorOr<std::vector<OpenFd>> GetOpenFDs();
 
 // Returns the number of hard links to a path.
-PosixErrorOr<uint64> Links(const std::string& path);
+PosixErrorOr<uint64_t> Links(const std::string& path);
 
 namespace internal {
 
@@ -706,7 +706,7 @@ inline PosixErrorOr<T> Atoi(absl::string_view str) {
   return ret;
 }
 
-inline PosixErrorOr<uint64> AtoiBase(absl::string_view str, int base) {
+inline PosixErrorOr<uint64_t> AtoiBase(absl::string_view str, int base) {
   if (base > 255 || base < 2) {
     return PosixError(EINVAL, "Invalid Base");
   }
@@ -737,16 +737,16 @@ inline PosixErrorOr<float> Atof(absl::string_view str) {
 
 // Return the smallest number of iovec arrays that can be used to write
 // "total_bytes" number of bytes, each iovec writing one "buf".
-std::vector<std::vector<struct iovec>> GenerateIovecs(uint64 total_size,
+std::vector<std::vector<struct iovec>> GenerateIovecs(uint64_t total_size,
                                                       void* buf, size_t buflen);
 
 // Returns bytes in 'n' megabytes. Used for readability.
-uint64 Megabytes(uint64 n);
+uint64_t Megabytes(uint64_t n);
 
 // Predicate for checking that a value is within some tolerance of another
 // value. Returns true iff current is in the range [target * (1 - tolerance),
 // target * (1 + tolerance)].
-bool Equivalent(uint64 current, uint64 target, double tolerance);
+bool Equivalent(uint64_t current, uint64_t target, double tolerance);
 
 // Matcher wrapping the Equivalent predicate.
 MATCHER_P2(EquivalentWithin, target, tolerance,
@@ -756,7 +756,7 @@ MATCHER_P2(EquivalentWithin, target, tolerance,
   if (target == 0) {
     *result_listener << ::absl::StreamFormat("difference of infinity%%");
   } else {
-    int64 delta = static_cast<int64>(arg) - static_cast<int64>(target);
+    int64_t delta = static_cast<int64_t>(arg) - static_cast<int64_t>(target);
     double delta_percent =
         static_cast<double>(delta) / static_cast<double>(target) * 100;
     *result_listener << ::absl::StreamFormat("difference of %.2f%%",
diff --git a/test/util/test_util_test.cc b/test/util/test_util_test.cc
index 024304535..f42100374 100644
--- a/test/util/test_util_test.cc
+++ b/test/util/test_util_test.cc
@@ -171,7 +171,7 @@ MATCHER_P(IovecsListEq, expected, "") {
     return false;
   }
 
-  for (uint64 i = 0; i < expected.size(); ++i) {
+  for (uint64_t i = 0; i < expected.size(); ++i) {
     const std::vector<struct iovec>& actual_iovecs = arg[i];
     const std::vector<struct iovec>& expected_iovecs = expected[i];
     if (actual_iovecs.size() != expected_iovecs.size()) {
@@ -181,7 +181,7 @@ MATCHER_P(IovecsListEq, expected, "") {
       return false;
     }
 
-    for (uint64 j = 0; j < expected_iovecs.size(); ++j) {
+    for (uint64_t j = 0; j < expected_iovecs.size(); ++j) {
       const struct iovec& actual_iov = actual_iovecs[j];
       const struct iovec& expected_iov = expected_iovecs[j];
       if (actual_iov.iov_base != expected_iov.iov_base) {
-- 
cgit v1.2.3


From a944fcd94626bb278d5edd5453c5be16c72b7ee5 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Tue, 21 Jan 2020 16:38:42 -0800
Subject: Install Bazel 2.0.0 on kokoro images.

PiperOrigin-RevId: 290850738
---
 kokoro/ubuntu1604/20_bazel.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kokoro/ubuntu1604/20_bazel.sh b/kokoro/ubuntu1604/20_bazel.sh
index b9a894024..b33e1656c 100755
--- a/kokoro/ubuntu1604/20_bazel.sh
+++ b/kokoro/ubuntu1604/20_bazel.sh
@@ -16,7 +16,7 @@
 
 set -xeo pipefail
 
-declare -r BAZEL_VERSION=0.29.1
+declare -r BAZEL_VERSION=2.0.0
 
 # Install bazel dependencies.
 apt-get update && apt-get install -y openjdk-8-jdk-headless unzip
-- 
cgit v1.2.3


From 1effdc091b441c4b1ada4327c1422cd360f80f98 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 21 Jan 2020 16:59:24 -0800
Subject: TMutex based on sync.Mutex.

Updates #231

PiperOrigin-RevId: 290854399
---
 pkg/sync/BUILD            |  2 ++
 pkg/sync/tmutex_test.go   | 71 +++++++++++++++++++++++++++++++++++++++++++++++
 pkg/sync/tmutex_unsafe.go | 49 ++++++++++++++++++++++++++++++++
 3 files changed, 122 insertions(+)
 create mode 100644 pkg/sync/tmutex_test.go
 create mode 100644 pkg/sync/tmutex_unsafe.go

diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD
index e8cd16b8f..97c4b3b1e 100644
--- a/pkg/sync/BUILD
+++ b/pkg/sync/BUILD
@@ -38,6 +38,7 @@ go_library(
         "race_unsafe.go",
         "seqcount.go",
         "syncutil.go",
+        "tmutex_unsafe.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sync",
 )
@@ -48,6 +49,7 @@ go_test(
     srcs = [
         "downgradable_rwmutex_test.go",
         "seqcount_test.go",
+        "tmutex_test.go",
     ],
     embed = [":sync"],
 )
diff --git a/pkg/sync/tmutex_test.go b/pkg/sync/tmutex_test.go
new file mode 100644
index 000000000..c640bae23
--- /dev/null
+++ b/pkg/sync/tmutex_test.go
@@ -0,0 +1,71 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync
+
+import (
+	"sync"
+	"testing"
+	"unsafe"
+)
+
+// TestStructSize verifies that syncMutex's size hasn't drifted from the
+// standard library's version.
+//
+// The correctness of this package relies on these remaining in sync.
+func TestStructSize(t *testing.T) {
+	const (
+		got  = unsafe.Sizeof(syncMutex{})
+		want = unsafe.Sizeof(sync.Mutex{})
+	)
+	if got != want {
+		t.Errorf("got sizeof(syncMutex) = %d, want = sizeof(sync.Mutex) = %d", got, want)
+	}
+}
+
+// TestFieldValues verifies that the semantics of syncMutex.state from the
+// standard library's implementation.
+//
+// The correctness of this package relies on these remaining in sync.
+func TestFieldValues(t *testing.T) {
+	var m TMutex
+	m.Lock()
+	if got := *m.state(); got != mutexLocked {
+		t.Errorf("got locked sync.Mutex.state = %d, want = %d", got, mutexLocked)
+	}
+	m.Unlock()
+	if got := *m.state(); got != mutexUnlocked {
+		t.Errorf("got unlocked sync.Mutex.state = %d, want = %d", got, mutexUnlocked)
+	}
+}
+
+func TestDoubleTryLock(t *testing.T) {
+	var m TMutex
+	if !m.TryLock() {
+		t.Fatal("failed to aquire lock")
+	}
+	if m.TryLock() {
+		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
+	}
+}
+
+func TestTryLockAfterLock(t *testing.T) {
+	var m TMutex
+	m.Lock()
+	if m.TryLock() {
+		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
+	}
+}
+
+func TestTryLockUnlock(t *testing.T) {
+	var m TMutex
+	if !m.TryLock() {
+		t.Fatal("failed to aquire lock")
+	}
+	m.Unlock()
+	if !m.TryLock() {
+		t.Fatal("failed to aquire lock after unlock")
+	}
+}
diff --git a/pkg/sync/tmutex_unsafe.go b/pkg/sync/tmutex_unsafe.go
new file mode 100644
index 000000000..3c32f8371
--- /dev/null
+++ b/pkg/sync/tmutex_unsafe.go
@@ -0,0 +1,49 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.13
+// +build !go1.15
+
+// When updating the build constraint (above), check that syncMutex matches the
+// standard library sync.Mutex definition.
+
+package sync
+
+import (
+	"sync"
+	"sync/atomic"
+	"unsafe"
+)
+
+// TMutex is a try lock.
+type TMutex struct {
+	sync.Mutex
+}
+
+type syncMutex struct {
+	state int32
+	sema  uint32
+}
+
+func (m *TMutex) state() *int32 {
+	return &(*syncMutex)(unsafe.Pointer(&m.Mutex)).state
+}
+
+const (
+	mutexUnlocked = 0
+	mutexLocked   = 1
+)
+
+// TryLock tries to aquire the mutex. It returns true if it succeeds and false
+// otherwise. TryLock does not block.
+func (m *TMutex) TryLock() bool {
+	if atomic.CompareAndSwapInt32(m.state(), mutexUnlocked, mutexLocked) {
+		if RaceEnabled {
+			RaceAcquire(unsafe.Pointer(&m.Mutex))
+		}
+		return true
+	}
+	return false
+}
-- 
cgit v1.2.3


From d0e75f2bef4e16356693987db6ae6bbdce749618 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 21 Jan 2020 18:34:24 -0800
Subject: Add trylock support to DowngradableRWMutex.

Updates #231

PiperOrigin-RevId: 290868875
---
 pkg/sync/downgradable_rwmutex_test.go   | 55 ++++++++++++++++++++++++++
 pkg/sync/downgradable_rwmutex_unsafe.go | 68 +++++++++++++++++++++++++++++----
 2 files changed, 115 insertions(+), 8 deletions(-)

diff --git a/pkg/sync/downgradable_rwmutex_test.go b/pkg/sync/downgradable_rwmutex_test.go
index f04496bc5..b5cb28ec0 100644
--- a/pkg/sync/downgradable_rwmutex_test.go
+++ b/pkg/sync/downgradable_rwmutex_test.go
@@ -148,3 +148,58 @@ func TestDowngradableRWMutex(t *testing.T) {
 	HammerDowngradableRWMutex(10, 10, n)
 	HammerDowngradableRWMutex(10, 5, n)
 }
+
+func TestRWDoubleTryLock(t *testing.T) {
+	var m DowngradableRWMutex
+	if !m.TryLock() {
+		t.Fatal("failed to aquire lock")
+	}
+	if m.TryLock() {
+		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
+	}
+}
+
+func TestRWTryLockAfterLock(t *testing.T) {
+	var m DowngradableRWMutex
+	m.Lock()
+	if m.TryLock() {
+		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
+	}
+}
+
+func TestRWTryLockUnlock(t *testing.T) {
+	var m DowngradableRWMutex
+	if !m.TryLock() {
+		t.Fatal("failed to aquire lock")
+	}
+	m.Unlock()
+	if !m.TryLock() {
+		t.Fatal("failed to aquire lock after unlock")
+	}
+}
+
+func TestTryRLockAfterLock(t *testing.T) {
+	var m DowngradableRWMutex
+	m.Lock()
+	if m.TryRLock() {
+		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
+	}
+}
+
+func TestTryLockAfterRLock(t *testing.T) {
+	var m DowngradableRWMutex
+	m.RLock()
+	if m.TryLock() {
+		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
+	}
+}
+
+func TestDoubleTryRLock(t *testing.T) {
+	var m DowngradableRWMutex
+	if !m.TryRLock() {
+		t.Fatal("failed to aquire lock")
+	}
+	if !m.TryRLock() {
+		t.Fatal("failed to read aquire read locked lock")
+	}
+}
diff --git a/pkg/sync/downgradable_rwmutex_unsafe.go b/pkg/sync/downgradable_rwmutex_unsafe.go
index 9bb55cd3a..0d321f5e3 100644
--- a/pkg/sync/downgradable_rwmutex_unsafe.go
+++ b/pkg/sync/downgradable_rwmutex_unsafe.go
@@ -19,7 +19,6 @@
 package sync
 
 import (
-	"sync"
 	"sync/atomic"
 	"unsafe"
 )
@@ -30,18 +29,43 @@ func runtimeSemacquire(s *uint32)
 //go:linkname runtimeSemrelease sync.runtime_Semrelease
 func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
 
-// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock
-// method.
+// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock,
+// TryLock and TryRLock methods.
 type DowngradableRWMutex struct {
-	w           sync.Mutex // held if there are pending writers
-	writerSem   uint32     // semaphore for writers to wait for completing readers
-	readerSem   uint32     // semaphore for readers to wait for completing writers
-	readerCount int32      // number of pending readers
-	readerWait  int32      // number of departing readers
+	w           TMutex // held if there are pending writers
+	writerSem   uint32 // semaphore for writers to wait for completing readers
+	readerSem   uint32 // semaphore for readers to wait for completing writers
+	readerCount int32  // number of pending readers
+	readerWait  int32  // number of departing readers
 }
 
 const rwmutexMaxReaders = 1 << 30
 
+// TryRLock locks rw for reading. It returns true if it succeeds and false
+// otherwise. It does not block.
+func (rw *DowngradableRWMutex) TryRLock() bool {
+	if RaceEnabled {
+		RaceDisable()
+	}
+	for {
+		rc := atomic.LoadInt32(&rw.readerCount)
+		if rc < 0 {
+			if RaceEnabled {
+				RaceEnable()
+			}
+			return false
+		}
+		if !atomic.CompareAndSwapInt32(&rw.readerCount, rc, rc+1) {
+			continue
+		}
+		if RaceEnabled {
+			RaceEnable()
+			RaceAcquire(unsafe.Pointer(&rw.readerSem))
+		}
+		return true
+	}
+}
+
 // RLock locks rw for reading.
 func (rw *DowngradableRWMutex) RLock() {
 	if RaceEnabled {
@@ -78,6 +102,34 @@ func (rw *DowngradableRWMutex) RUnlock() {
 	}
 }
 
+// TryLock locks rw for writing. It returns true if it succeeds and false
+// otherwise. It does not block.
+func (rw *DowngradableRWMutex) TryLock() bool {
+	if RaceEnabled {
+		RaceDisable()
+	}
+	// First, resolve competition with other writers.
+	if !rw.w.TryLock() {
+		if RaceEnabled {
+			RaceEnable()
+		}
+		return false
+	}
+	// Only proceed if there are no readers.
+	if !atomic.CompareAndSwapInt32(&rw.readerCount, 0, -rwmutexMaxReaders) {
+		rw.w.Unlock()
+		if RaceEnabled {
+			RaceEnable()
+		}
+		return false
+	}
+	if RaceEnabled {
+		RaceEnable()
+		RaceAcquire(unsafe.Pointer(&rw.writerSem))
+	}
+	return true
+}
+
 // Lock locks rw for writing.
 func (rw *DowngradableRWMutex) Lock() {
 	if RaceEnabled {
-- 
cgit v1.2.3


From 6a59e7f510a7b12f8b3bd768dfe569033ef07d30 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 21 Jan 2020 19:23:26 -0800
Subject: Rename DowngradableRWMutex to RWmutex.

Also renames TMutex to Mutex.

These custom mutexes aren't any worse than the standard library versions (same
code), so having both seems redundant.

PiperOrigin-RevId: 290873587
---
 pkg/sentry/fs/overlay.go                |  2 +-
 pkg/sentry/mm/mm.go                     |  4 +--
 pkg/sync/aliases.go                     |  6 ----
 pkg/sync/downgradable_rwmutex_test.go   | 50 ++++++++++++++++-----------------
 pkg/sync/downgradable_rwmutex_unsafe.go | 26 ++++++++---------
 pkg/sync/tmutex_test.go                 |  8 +++---
 pkg/sync/tmutex_unsafe.go               |  8 +++---
 7 files changed, 49 insertions(+), 55 deletions(-)

diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go
index 4cad55327..f7702f8f4 100644
--- a/pkg/sentry/fs/overlay.go
+++ b/pkg/sentry/fs/overlay.go
@@ -198,7 +198,7 @@ type overlayEntry struct {
 	upper *Inode
 
 	// dirCacheMu protects dirCache.
-	dirCacheMu sync.DowngradableRWMutex `state:"nosave"`
+	dirCacheMu sync.RWMutex `state:"nosave"`
 
 	// dirCache is cache of DentAttrs from upper and lower Inodes.
 	dirCache *SortedDentryMap
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index fa86ebced..78cc9e6e4 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -80,7 +80,7 @@ type MemoryManager struct {
 	users int32
 
 	// mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
-	mappingMu sync.DowngradableRWMutex `state:"nosave"`
+	mappingMu sync.RWMutex `state:"nosave"`
 
 	// vmas stores virtual memory areas. Since vmas are stored by value,
 	// clients should usually use vmaIterator.ValuePtr() instead of
@@ -123,7 +123,7 @@ type MemoryManager struct {
 
 	// activeMu is loosely analogous to Linux's struct
 	// mm_struct::page_table_lock.
-	activeMu sync.DowngradableRWMutex `state:"nosave"`
+	activeMu sync.RWMutex `state:"nosave"`
 
 	// pmas stores platform mapping areas used to implement vmas. Since pmas
 	// are stored by value, clients should usually use pmaIterator.ValuePtr()
diff --git a/pkg/sync/aliases.go b/pkg/sync/aliases.go
index 20c7ca041..d2d7132fa 100644
--- a/pkg/sync/aliases.go
+++ b/pkg/sync/aliases.go
@@ -11,12 +11,6 @@ import (
 
 // Aliases of standard library types.
 type (
-	// Mutex is an alias of sync.Mutex.
-	Mutex = sync.Mutex
-
-	// RWMutex is an alias of sync.RWMutex.
-	RWMutex = sync.RWMutex
-
 	// Cond is an alias of sync.Cond.
 	Cond = sync.Cond
 
diff --git a/pkg/sync/downgradable_rwmutex_test.go b/pkg/sync/downgradable_rwmutex_test.go
index b5cb28ec0..ce667e825 100644
--- a/pkg/sync/downgradable_rwmutex_test.go
+++ b/pkg/sync/downgradable_rwmutex_test.go
@@ -18,7 +18,7 @@ import (
 	"testing"
 )
 
-func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) {
+func parallelReader(m *RWMutex, clocked, cunlock, cdone chan bool) {
 	m.RLock()
 	clocked <- true
 	<-cunlock
@@ -28,7 +28,7 @@ func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) {
 
 func doTestParallelReaders(numReaders, gomaxprocs int) {
 	runtime.GOMAXPROCS(gomaxprocs)
-	var m DowngradableRWMutex
+	var m RWMutex
 	clocked := make(chan bool)
 	cunlock := make(chan bool)
 	cdone := make(chan bool)
@@ -55,7 +55,7 @@ func TestParallelReaders(t *testing.T) {
 	doTestParallelReaders(4, 2)
 }
 
-func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
+func reader(rwm *RWMutex, numIterations int, activity *int32, cdone chan bool) {
 	for i := 0; i < numIterations; i++ {
 		rwm.RLock()
 		n := atomic.AddInt32(activity, 1)
@@ -70,7 +70,7 @@ func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone
 	cdone <- true
 }
 
-func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
+func writer(rwm *RWMutex, numIterations int, activity *int32, cdone chan bool) {
 	for i := 0; i < numIterations; i++ {
 		rwm.Lock()
 		n := atomic.AddInt32(activity, 10000)
@@ -85,7 +85,7 @@ func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone
 	cdone <- true
 }
 
-func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
+func downgradingWriter(rwm *RWMutex, numIterations int, activity *int32, cdone chan bool) {
 	for i := 0; i < numIterations; i++ {
 		rwm.Lock()
 		n := atomic.AddInt32(activity, 10000)
@@ -112,7 +112,7 @@ func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) {
 	runtime.GOMAXPROCS(gomaxprocs)
 	// Number of active readers + 10000 * number of active writers.
 	var activity int32
-	var rwm DowngradableRWMutex
+	var rwm RWMutex
 	cdone := make(chan bool)
 	go writer(&rwm, numIterations, &activity, cdone)
 	go downgradingWriter(&rwm, numIterations, &activity, cdone)
@@ -150,56 +150,56 @@ func TestDowngradableRWMutex(t *testing.T) {
 }
 
 func TestRWDoubleTryLock(t *testing.T) {
-	var m DowngradableRWMutex
-	if !m.TryLock() {
+	var rwm RWMutex
+	if !rwm.TryLock() {
 		t.Fatal("failed to aquire lock")
 	}
-	if m.TryLock() {
+	if rwm.TryLock() {
 		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
 	}
 }
 
 func TestRWTryLockAfterLock(t *testing.T) {
-	var m DowngradableRWMutex
-	m.Lock()
-	if m.TryLock() {
+	var rwm RWMutex
+	rwm.Lock()
+	if rwm.TryLock() {
 		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
 	}
 }
 
 func TestRWTryLockUnlock(t *testing.T) {
-	var m DowngradableRWMutex
-	if !m.TryLock() {
+	var rwm RWMutex
+	if !rwm.TryLock() {
 		t.Fatal("failed to aquire lock")
 	}
-	m.Unlock()
-	if !m.TryLock() {
+	rwm.Unlock()
+	if !rwm.TryLock() {
 		t.Fatal("failed to aquire lock after unlock")
 	}
 }
 
 func TestTryRLockAfterLock(t *testing.T) {
-	var m DowngradableRWMutex
-	m.Lock()
-	if m.TryRLock() {
+	var rwm RWMutex
+	rwm.Lock()
+	if rwm.TryRLock() {
 		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
 	}
 }
 
 func TestTryLockAfterRLock(t *testing.T) {
-	var m DowngradableRWMutex
-	m.RLock()
-	if m.TryLock() {
+	var rwm RWMutex
+	rwm.RLock()
+	if rwm.TryLock() {
 		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
 	}
 }
 
 func TestDoubleTryRLock(t *testing.T) {
-	var m DowngradableRWMutex
-	if !m.TryRLock() {
+	var rwm RWMutex
+	if !rwm.TryRLock() {
 		t.Fatal("failed to aquire lock")
 	}
-	if !m.TryRLock() {
+	if !rwm.TryRLock() {
 		t.Fatal("failed to read aquire read locked lock")
 	}
 }
diff --git a/pkg/sync/downgradable_rwmutex_unsafe.go b/pkg/sync/downgradable_rwmutex_unsafe.go
index 0d321f5e3..ea6cdc447 100644
--- a/pkg/sync/downgradable_rwmutex_unsafe.go
+++ b/pkg/sync/downgradable_rwmutex_unsafe.go
@@ -29,10 +29,10 @@ func runtimeSemacquire(s *uint32)
 //go:linkname runtimeSemrelease sync.runtime_Semrelease
 func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
 
-// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock,
+// RWMutex is identical to sync.RWMutex, but adds the DowngradeLock,
 // TryLock and TryRLock methods.
-type DowngradableRWMutex struct {
-	w           TMutex // held if there are pending writers
+type RWMutex struct {
+	w           Mutex  // held if there are pending writers
 	writerSem   uint32 // semaphore for writers to wait for completing readers
 	readerSem   uint32 // semaphore for readers to wait for completing writers
 	readerCount int32  // number of pending readers
@@ -43,7 +43,7 @@ const rwmutexMaxReaders = 1 << 30
 
 // TryRLock locks rw for reading. It returns true if it succeeds and false
 // otherwise. It does not block.
-func (rw *DowngradableRWMutex) TryRLock() bool {
+func (rw *RWMutex) TryRLock() bool {
 	if RaceEnabled {
 		RaceDisable()
 	}
@@ -67,7 +67,7 @@ func (rw *DowngradableRWMutex) TryRLock() bool {
 }
 
 // RLock locks rw for reading.
-func (rw *DowngradableRWMutex) RLock() {
+func (rw *RWMutex) RLock() {
 	if RaceEnabled {
 		RaceDisable()
 	}
@@ -82,14 +82,14 @@ func (rw *DowngradableRWMutex) RLock() {
 }
 
 // RUnlock undoes a single RLock call.
-func (rw *DowngradableRWMutex) RUnlock() {
+func (rw *RWMutex) RUnlock() {
 	if RaceEnabled {
 		RaceReleaseMerge(unsafe.Pointer(&rw.writerSem))
 		RaceDisable()
 	}
 	if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 {
 		if r+1 == 0 || r+1 == -rwmutexMaxReaders {
-			panic("RUnlock of unlocked DowngradableRWMutex")
+			panic("RUnlock of unlocked RWMutex")
 		}
 		// A writer is pending.
 		if atomic.AddInt32(&rw.readerWait, -1) == 0 {
@@ -104,7 +104,7 @@ func (rw *DowngradableRWMutex) RUnlock() {
 
 // TryLock locks rw for writing. It returns true if it succeeds and false
 // otherwise. It does not block.
-func (rw *DowngradableRWMutex) TryLock() bool {
+func (rw *RWMutex) TryLock() bool {
 	if RaceEnabled {
 		RaceDisable()
 	}
@@ -131,7 +131,7 @@ func (rw *DowngradableRWMutex) TryLock() bool {
 }
 
 // Lock locks rw for writing.
-func (rw *DowngradableRWMutex) Lock() {
+func (rw *RWMutex) Lock() {
 	if RaceEnabled {
 		RaceDisable()
 	}
@@ -150,7 +150,7 @@ func (rw *DowngradableRWMutex) Lock() {
 }
 
 // Unlock unlocks rw for writing.
-func (rw *DowngradableRWMutex) Unlock() {
+func (rw *RWMutex) Unlock() {
 	if RaceEnabled {
 		RaceRelease(unsafe.Pointer(&rw.writerSem))
 		RaceRelease(unsafe.Pointer(&rw.readerSem))
@@ -159,7 +159,7 @@ func (rw *DowngradableRWMutex) Unlock() {
 	// Announce to readers there is no active writer.
 	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders)
 	if r >= rwmutexMaxReaders {
-		panic("Unlock of unlocked DowngradableRWMutex")
+		panic("Unlock of unlocked RWMutex")
 	}
 	// Unblock blocked readers, if any.
 	for i := 0; i < int(r); i++ {
@@ -173,7 +173,7 @@ func (rw *DowngradableRWMutex) Unlock() {
 }
 
 // DowngradeLock atomically unlocks rw for writing and locks it for reading.
-func (rw *DowngradableRWMutex) DowngradeLock() {
+func (rw *RWMutex) DowngradeLock() {
 	if RaceEnabled {
 		RaceRelease(unsafe.Pointer(&rw.readerSem))
 		RaceDisable()
@@ -181,7 +181,7 @@ func (rw *DowngradableRWMutex) DowngradeLock() {
 	// Announce to readers there is no active writer and one additional reader.
 	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1)
 	if r >= rwmutexMaxReaders+1 {
-		panic("DowngradeLock of unlocked DowngradableRWMutex")
+		panic("DowngradeLock of unlocked RWMutex")
 	}
 	// Unblock blocked readers, if any. Note that this loop starts as 1 since r
 	// includes this goroutine.
diff --git a/pkg/sync/tmutex_test.go b/pkg/sync/tmutex_test.go
index c640bae23..0838248b4 100644
--- a/pkg/sync/tmutex_test.go
+++ b/pkg/sync/tmutex_test.go
@@ -30,7 +30,7 @@ func TestStructSize(t *testing.T) {
 //
 // The correctness of this package relies on these remaining in sync.
 func TestFieldValues(t *testing.T) {
-	var m TMutex
+	var m Mutex
 	m.Lock()
 	if got := *m.state(); got != mutexLocked {
 		t.Errorf("got locked sync.Mutex.state = %d, want = %d", got, mutexLocked)
@@ -42,7 +42,7 @@ func TestFieldValues(t *testing.T) {
 }
 
 func TestDoubleTryLock(t *testing.T) {
-	var m TMutex
+	var m Mutex
 	if !m.TryLock() {
 		t.Fatal("failed to aquire lock")
 	}
@@ -52,7 +52,7 @@ func TestDoubleTryLock(t *testing.T) {
 }
 
 func TestTryLockAfterLock(t *testing.T) {
-	var m TMutex
+	var m Mutex
 	m.Lock()
 	if m.TryLock() {
 		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
@@ -60,7 +60,7 @@ func TestTryLockAfterLock(t *testing.T) {
 }
 
 func TestTryLockUnlock(t *testing.T) {
-	var m TMutex
+	var m Mutex
 	if !m.TryLock() {
 		t.Fatal("failed to aquire lock")
 	}
diff --git a/pkg/sync/tmutex_unsafe.go b/pkg/sync/tmutex_unsafe.go
index 3c32f8371..3dd15578b 100644
--- a/pkg/sync/tmutex_unsafe.go
+++ b/pkg/sync/tmutex_unsafe.go
@@ -17,8 +17,8 @@ import (
 	"unsafe"
 )
 
-// TMutex is a try lock.
-type TMutex struct {
+// Mutex is a try lock.
+type Mutex struct {
 	sync.Mutex
 }
 
@@ -27,7 +27,7 @@ type syncMutex struct {
 	sema  uint32
 }
 
-func (m *TMutex) state() *int32 {
+func (m *Mutex) state() *int32 {
 	return &(*syncMutex)(unsafe.Pointer(&m.Mutex)).state
 }
 
@@ -38,7 +38,7 @@ const (
 
 // TryLock tries to aquire the mutex. It returns true if it succeeds and false
 // otherwise. TryLock does not block.
-func (m *TMutex) TryLock() bool {
+func (m *Mutex) TryLock() bool {
 	if atomic.CompareAndSwapInt32(m.state(), mutexUnlocked, mutexLocked) {
 		if RaceEnabled {
 			RaceAcquire(unsafe.Pointer(&m.Mutex))
-- 
cgit v1.2.3


From d59a3cc959cb14b0bed14b62e33ee4178b89b346 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Wed, 22 Jan 2020 05:51:57 +0000
Subject: Enable fault() syscall test on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I9b2b2e0d84946c10cf136abeef6c60642fa3b6ec
---
 test/syscalls/linux/fault.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/syscalls/linux/fault.cc b/test/syscalls/linux/fault.cc
index f6e19026f..a85750382 100644
--- a/test/syscalls/linux/fault.cc
+++ b/test/syscalls/linux/fault.cc
@@ -37,6 +37,9 @@ int GetPcFromUcontext(ucontext_t* uc, uintptr_t* pc) {
 #elif defined(__i386__)
   *pc = uc->uc_mcontext.gregs[REG_EIP];
   return 1;
+#elif defined(__aarch64__)
+  *pc = uc->uc_mcontext.pc;
+  return 1;
 #else
   return 0;
 #endif
-- 
cgit v1.2.3


From 38fe05eb699550dc1ae5f6773f0190cad83d5ae8 Mon Sep 17 00:00:00 2001
From: Marek Majkowski <marek@cloudflare.com>
Date: Wed, 22 Jan 2020 11:34:29 +0000
Subject: gonet PacketConn.RemoteAddr() incorrectly returns *net.TCPAddr,
 should be *net.UDPAddr

PacketConn.LocalAddr() already returns *net.UDPAddr correctly.
---
 pkg/tcpip/adapters/gonet/gonet.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go
index a2f44b496..3bba4028b 100644
--- a/pkg/tcpip/adapters/gonet/gonet.go
+++ b/pkg/tcpip/adapters/gonet/gonet.go
@@ -622,7 +622,7 @@ func (c *PacketConn) RemoteAddr() net.Addr {
 	if err != nil {
 		return nil
 	}
-	return fullToTCPAddr(a)
+	return fullToUDPAddr(a)
 }
 
 // Read implements net.Conn.Read
-- 
cgit v1.2.3


From 747137c120bca27aeb259817d30ef60e01521621 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 22 Jan 2020 10:23:44 -0800
Subject: Address GitHub comments.

---
 pkg/tcpip/iptables/types.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index a0bfc8b41..a8b972f1b 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -153,7 +153,7 @@ func (table *Table) SetMetadata(metadata interface{}) {
 // packets this rule applies to. If there are no matchers in the rule, it
 // applies to any packet.
 type Rule struct {
-	// IPHeaderFilter holds basic IP filtering fields common to every rule.
+	// Filter holds basic IP filtering fields common to every rule.
 	Filter IPHeaderFilter
 
 	// Matchers is the list of matchers for this rule.
-- 
cgit v1.2.3


From cb3906ae00575859a6910b8edc62ab9d531d1c85 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 22 Jan 2020 10:38:03 -0800
Subject: Add tools for generating images.

This formalizes the adhoc scripts previously in kokoro. The image targets can
be used by e.g. benchmarks in order to automated image prepation.

PiperOrigin-RevId: 290982744
---
 kokoro/ubuntu1604/10_core.sh             |  30 ------
 kokoro/ubuntu1604/20_bazel.sh            |  28 -----
 kokoro/ubuntu1604/25_docker.sh           |  35 ------
 kokoro/ubuntu1604/30_containerd.sh       |  76 -------------
 kokoro/ubuntu1604/40_kokoro.sh           |  57 ----------
 kokoro/ubuntu1604/README.md              |  34 ------
 kokoro/ubuntu1604/build.sh               |  20 ----
 kokoro/ubuntu1804/10_core.sh             |   1 -
 kokoro/ubuntu1804/20_bazel.sh            |   1 -
 kokoro/ubuntu1804/25_docker.sh           |   1 -
 kokoro/ubuntu1804/30_containerd.sh       |   1 -
 kokoro/ubuntu1804/40_kokoro.sh           |   1 -
 kokoro/ubuntu1804/build.sh               |  20 ----
 tools/images/BUILD                       |  68 ++++++++++++
 tools/images/build.sh                    | 101 ++++++++++++++++++
 tools/images/defs.bzl                    | 178 +++++++++++++++++++++++++++++++
 tools/images/execute.sh                  | 152 ++++++++++++++++++++++++++
 tools/images/test.cc                     |  23 ++++
 tools/images/ubuntu1604/10_core.sh       |  30 ++++++
 tools/images/ubuntu1604/20_bazel.sh      |  28 +++++
 tools/images/ubuntu1604/25_docker.sh     |  35 ++++++
 tools/images/ubuntu1604/30_containerd.sh |  76 +++++++++++++
 tools/images/ubuntu1604/40_kokoro.sh     |  57 ++++++++++
 tools/images/ubuntu1604/BUILD            |   7 ++
 tools/images/ubuntu1804/BUILD            |   7 ++
 tools/installers/BUILD                   |  22 ++++
 tools/installers/head.sh                 |  21 ++++
 tools/installers/master.sh               |  20 ++++
 tools/installers/shim.sh                 |  24 +++++
 29 files changed, 849 insertions(+), 305 deletions(-)
 delete mode 100755 kokoro/ubuntu1604/10_core.sh
 delete mode 100755 kokoro/ubuntu1604/20_bazel.sh
 delete mode 100755 kokoro/ubuntu1604/25_docker.sh
 delete mode 100755 kokoro/ubuntu1604/30_containerd.sh
 delete mode 100755 kokoro/ubuntu1604/40_kokoro.sh
 delete mode 100644 kokoro/ubuntu1604/README.md
 delete mode 100755 kokoro/ubuntu1604/build.sh
 delete mode 120000 kokoro/ubuntu1804/10_core.sh
 delete mode 120000 kokoro/ubuntu1804/20_bazel.sh
 delete mode 120000 kokoro/ubuntu1804/25_docker.sh
 delete mode 120000 kokoro/ubuntu1804/30_containerd.sh
 delete mode 120000 kokoro/ubuntu1804/40_kokoro.sh
 delete mode 100755 kokoro/ubuntu1804/build.sh
 create mode 100644 tools/images/BUILD
 create mode 100755 tools/images/build.sh
 create mode 100644 tools/images/defs.bzl
 create mode 100755 tools/images/execute.sh
 create mode 100644 tools/images/test.cc
 create mode 100755 tools/images/ubuntu1604/10_core.sh
 create mode 100755 tools/images/ubuntu1604/20_bazel.sh
 create mode 100755 tools/images/ubuntu1604/25_docker.sh
 create mode 100755 tools/images/ubuntu1604/30_containerd.sh
 create mode 100755 tools/images/ubuntu1604/40_kokoro.sh
 create mode 100644 tools/images/ubuntu1604/BUILD
 create mode 100644 tools/images/ubuntu1804/BUILD
 create mode 100644 tools/installers/BUILD
 create mode 100755 tools/installers/head.sh
 create mode 100755 tools/installers/master.sh
 create mode 100755 tools/installers/shim.sh

diff --git a/kokoro/ubuntu1604/10_core.sh b/kokoro/ubuntu1604/10_core.sh
deleted file mode 100755
index 46dda6bb1..000000000
--- a/kokoro/ubuntu1604/10_core.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -xeo pipefail
-
-# Install all essential build tools.
-apt-get update && apt-get -y install make git-core build-essential linux-headers-$(uname -r) pkg-config
-
-# Install a recent go toolchain.
-if ! [[ -d /usr/local/go ]]; then
-    wget https://dl.google.com/go/go1.13.5.linux-amd64.tar.gz
-    tar -xvf go1.13.5.linux-amd64.tar.gz
-    mv go /usr/local
-fi
-
-# Link the Go binary from /usr/bin; replacing anything there.
-(cd /usr/bin && rm -f go && sudo ln -fs /usr/local/go/bin/go go)
diff --git a/kokoro/ubuntu1604/20_bazel.sh b/kokoro/ubuntu1604/20_bazel.sh
deleted file mode 100755
index b33e1656c..000000000
--- a/kokoro/ubuntu1604/20_bazel.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -xeo pipefail
-
-declare -r BAZEL_VERSION=2.0.0
-
-# Install bazel dependencies.
-apt-get update && apt-get install -y openjdk-8-jdk-headless unzip
-
-# Use the release installer.
-curl -L -o bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
-chmod a+x bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
-./bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
-rm -f bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
diff --git a/kokoro/ubuntu1604/25_docker.sh b/kokoro/ubuntu1604/25_docker.sh
deleted file mode 100755
index 1d3defcd3..000000000
--- a/kokoro/ubuntu1604/25_docker.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Add dependencies.
-apt-get update && apt-get -y install \
-    apt-transport-https \
-    ca-certificates \
-    curl \
-    gnupg-agent \
-    software-properties-common
-
-# Install the key.
-curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
-
-# Add the repository.
-add-apt-repository \
-   "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
-   $(lsb_release -cs) \
-   stable"
-
-# Install docker.
-apt-get update && apt-get install -y docker-ce docker-ce-cli containerd.io
diff --git a/kokoro/ubuntu1604/30_containerd.sh b/kokoro/ubuntu1604/30_containerd.sh
deleted file mode 100755
index a7472bd1c..000000000
--- a/kokoro/ubuntu1604/30_containerd.sh
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -xeo pipefail
-
-# Helper for Go packages below.
-install_helper() {
-  PACKAGE="${1}"
-  TAG="${2}"
-  GOPATH="${3}"
-
-  # Clone the repository.
-  mkdir -p "${GOPATH}"/src/$(dirname "${PACKAGE}") && \
-     git clone https://"${PACKAGE}" "${GOPATH}"/src/"${PACKAGE}"
-
-  # Checkout and build the repository.
-  (cd "${GOPATH}"/src/"${PACKAGE}" && \
-      git checkout "${TAG}" && \
-      GOPATH="${GOPATH}" make && \
-      GOPATH="${GOPATH}" make install)
-}
-
-# Install dependencies for the crictl tests.
-apt-get install -y btrfs-tools libseccomp-dev
-
-# Install containerd & cri-tools.
-GOPATH=$(mktemp -d --tmpdir gopathXXXXX)
-install_helper github.com/containerd/containerd v1.2.2 "${GOPATH}"
-install_helper github.com/kubernetes-sigs/cri-tools v1.11.0 "${GOPATH}"
-
-# Install gvisor-containerd-shim.
-declare -r base="https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim"
-declare -r latest=$(mktemp --tmpdir gvisor-containerd-shim-latest.XXXXXX)
-declare -r shim_path=$(mktemp --tmpdir gvisor-containerd-shim.XXXXXX)
-wget --no-verbose "${base}"/latest -O ${latest}
-wget --no-verbose "${base}"/gvisor-containerd-shim-$(cat ${latest}) -O ${shim_path}
-chmod +x ${shim_path}
-mv ${shim_path} /usr/local/bin
-
-# Configure containerd-shim.
-declare -r shim_config_path=/etc/containerd
-declare -r shim_config_tmp_path=$(mktemp --tmpdir gvisor-containerd-shim.XXXXXX.toml)
-mkdir -p ${shim_config_path}
-cat > ${shim_config_tmp_path} <<-EOF
-    runc_shim = "/usr/local/bin/containerd-shim"
-
-[runsc_config]
-    debug = "true"
-    debug-log = "/tmp/runsc-logs/"
-    strace = "true"
-    file-access = "shared"
-EOF
-mv ${shim_config_tmp_path} ${shim_config_path}
-
-# Configure CNI.
-(cd "${GOPATH}" && GOPATH="${GOPATH}" \
-    src/github.com/containerd/containerd/script/setup/install-cni)
-
-# Cleanup the above.
-rm -rf "${GOPATH}"
-rm -rf "${latest}"
-rm -rf "${shim_path}"
-rm -rf "${shim_config_tmp_path}"
diff --git a/kokoro/ubuntu1604/40_kokoro.sh b/kokoro/ubuntu1604/40_kokoro.sh
deleted file mode 100755
index 5f2dfc858..000000000
--- a/kokoro/ubuntu1604/40_kokoro.sh
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -xeo pipefail
-
-# Declare kokoro's required public keys.
-declare -r ssh_public_keys=(
-    "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDg7L/ZaEauETWrPklUTky3kvxqQfe2Ax/2CsSqhNIGNMnK/8d79CHlmY9+dE1FFQ/RzKNCaltgy7XcN/fCYiCZr5jm2ZtnLuGNOTzupMNhaYiPL419qmL+5rZXt4/dWTrsHbFRACxT8j51PcRMO5wgbL0Bg2XXimbx8kDFaurL2gqduQYqlu4lxWCaJqOL71WogcimeL63Nq/yeH5PJPWpqE4P9VUQSwAzBWFK/hLeds/AiP3MgVS65qHBnhq0JsHy8JQsqjZbG7Iidt/Ll0+gqzEbi62gDIcczG4KC0iOVzDDP/1BxDtt1lKeA23ll769Fcm3rJyoBMYxjvdw1TDx sabujp@trigger.mtv.corp.google.com"
-    "ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBNgGK/hCdjmulHfRE3hp4rZs38NCR8yAh0eDsztxqGcuXnuSnL7jOlRrbcQpremJ84omD4eKrIpwJUs+YokMdv4= sabujp@trigger.svl.corp.google.com"
-)
-
-# Install dependencies.
-apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm python-pip python3-pip zip
-
-# junitparser is used to merge junit xml files.
-pip install junitparser
-
-# We need a kbuilder user.
-if useradd -c "kbuilder user" -m -s /bin/bash kbuilder; then
-    # User was added successfully; we add the relevant SSH keys here.
-    mkdir -p ~kbuilder/.ssh
-    (IFS=$'\n'; echo "${ssh_public_keys[*]}") > ~kbuilder/.ssh/authorized_keys
-    chmod 0600 ~kbuilder/.ssh/authorized_keys
-    chown -R kbuilder ~kbuilder/.ssh
-fi
-
-# Give passwordless sudo access.
-cat > /etc/sudoers.d/kokoro <<EOF
-kbuilder ALL=(ALL) NOPASSWD:ALL
-EOF
-
-# Ensure we can run Docker without sudo.
-usermod -aG docker kbuilder
-
-# Ensure that we can access kvm.
-usermod -aG kvm kbuilder
-
-# Ensure that /tmpfs exists and is writable by kokoro.
-#
-# Note that kokoro will typically attach a second disk (sdb) to the instance
-# that is used for the /tmpfs volume. In the future we could setup an init
-# script that formats and mounts this here; however, we don't expect our build
-# artifacts to be that large.
-mkdir -p /tmpfs && chmod 0777 /tmpfs && touch /tmpfs/READY
diff --git a/kokoro/ubuntu1604/README.md b/kokoro/ubuntu1604/README.md
deleted file mode 100644
index 64f913b9a..000000000
--- a/kokoro/ubuntu1604/README.md
+++ /dev/null
@@ -1,34 +0,0 @@
-## Image Update
-
-After making changes to files in the directory, you must run the following
-commands to update the image Kokoro uses:
-
-```shell
-gcloud config set project gvisor-kokoro-testing
-third_party/gvisor/kokoro/ubuntu1604/build.sh
-third_party/gvisor/kokoro/ubuntu1804/build.sh
-```
-
-Note: the command above will change your default project for `gcloud`. Run
-`gcloud config set project` again to revert back to your default project.
-
-Note: Files in `third_party/gvisor/kokoro/ubuntu1804/` as symlinks to
-`ubuntu1604`, therefore both images must be updated.
-
-After the script finishes, the last few lines of the output will container the
-image name. If the output was lost, you can run `build.sh` again to print the
-image name.
-
-```
-NAME                    PROJECT                FAMILY  DEPRECATED  STATUS
-image-6777fa4666a968c8  gvisor-kokoro-testing                      READY
-+ cleanup
-+ gcloud compute instances delete --quiet build-tlfrdv
-Deleted [https://www.googleapis.com/compute/v1/projects/gvisor-kokoro-testing/zones/us-central1-f/instances/build-tlfrdv].
-```
-
-To setup Kokoro to use the new image, copy the image names to their
-corresponding file below:
-
-*   //devtools/kokoro/config/gcp/gvisor/ubuntu1604.gcl
-*   //devtools/kokoro/config/gcp/gvisor/ubuntu1804.gcl
diff --git a/kokoro/ubuntu1604/build.sh b/kokoro/ubuntu1604/build.sh
deleted file mode 100755
index d664a3a76..000000000
--- a/kokoro/ubuntu1604/build.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -xeo pipefail
-
-# Run the image_build.sh script with appropriate parameters.
-IMAGE_PROJECT=ubuntu-os-cloud IMAGE_FAMILY=ubuntu-1604-lts $(dirname $0)/../../tools/image_build.sh $(dirname $0)/??_*.sh
diff --git a/kokoro/ubuntu1804/10_core.sh b/kokoro/ubuntu1804/10_core.sh
deleted file mode 120000
index 6facceeee..000000000
--- a/kokoro/ubuntu1804/10_core.sh
+++ /dev/null
@@ -1 +0,0 @@
-../ubuntu1604/10_core.sh
\ No newline at end of file
diff --git a/kokoro/ubuntu1804/20_bazel.sh b/kokoro/ubuntu1804/20_bazel.sh
deleted file mode 120000
index 39194c0f5..000000000
--- a/kokoro/ubuntu1804/20_bazel.sh
+++ /dev/null
@@ -1 +0,0 @@
-../ubuntu1604/20_bazel.sh
\ No newline at end of file
diff --git a/kokoro/ubuntu1804/25_docker.sh b/kokoro/ubuntu1804/25_docker.sh
deleted file mode 120000
index 63269bd83..000000000
--- a/kokoro/ubuntu1804/25_docker.sh
+++ /dev/null
@@ -1 +0,0 @@
-../ubuntu1604/25_docker.sh
\ No newline at end of file
diff --git a/kokoro/ubuntu1804/30_containerd.sh b/kokoro/ubuntu1804/30_containerd.sh
deleted file mode 120000
index 6ac2377ed..000000000
--- a/kokoro/ubuntu1804/30_containerd.sh
+++ /dev/null
@@ -1 +0,0 @@
-../ubuntu1604/30_containerd.sh
\ No newline at end of file
diff --git a/kokoro/ubuntu1804/40_kokoro.sh b/kokoro/ubuntu1804/40_kokoro.sh
deleted file mode 120000
index e861fb5e1..000000000
--- a/kokoro/ubuntu1804/40_kokoro.sh
+++ /dev/null
@@ -1 +0,0 @@
-../ubuntu1604/40_kokoro.sh
\ No newline at end of file
diff --git a/kokoro/ubuntu1804/build.sh b/kokoro/ubuntu1804/build.sh
deleted file mode 100755
index 2b5c9a6f2..000000000
--- a/kokoro/ubuntu1804/build.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -xeo pipefail
-
-# Run the image_build.sh script with appropriate parameters.
-IMAGE_PROJECT=ubuntu-os-cloud IMAGE_FAMILY=ubuntu-1804-lts $(dirname $0)/../../tools/image_build.sh $(dirname $0)/??_*.sh
diff --git a/tools/images/BUILD b/tools/images/BUILD
new file mode 100644
index 000000000..2b77c2737
--- /dev/null
+++ b/tools/images/BUILD
@@ -0,0 +1,68 @@
+load("@rules_cc//cc:defs.bzl", "cc_binary")
+load("//tools/images:defs.bzl", "vm_image", "vm_test")
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+genrule(
+    name = "zone",
+    outs = ["zone.txt"],
+    cmd = "gcloud config get-value compute/zone > $@",
+    tags = [
+        "local",
+        "manual",
+    ],
+)
+
+sh_binary(
+    name = "builder",
+    srcs = ["build.sh"],
+)
+
+sh_binary(
+    name = "executer",
+    srcs = ["execute.sh"],
+)
+
+cc_binary(
+    name = "test",
+    testonly = 1,
+    srcs = ["test.cc"],
+    linkstatic = 1,
+    deps = [
+        "//test/util:test_main",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+vm_image(
+    name = "ubuntu1604",
+    family = "ubuntu-1604-lts",
+    project = "ubuntu-os-cloud",
+    scripts = [
+        "//tools/images/ubuntu1604",
+    ],
+)
+
+vm_test(
+    name = "ubuntu1604_test",
+    image = ":ubuntu1604",
+    targets = [":test"],
+)
+
+vm_image(
+    name = "ubuntu1804",
+    family = "ubuntu-1804-lts",
+    project = "ubuntu-os-cloud",
+    scripts = [
+        "//tools/images/ubuntu1804",
+    ],
+)
+
+vm_test(
+    name = "ubuntu1804_test",
+    image = ":ubuntu1804",
+    targets = [":test"],
+)
diff --git a/tools/images/build.sh b/tools/images/build.sh
new file mode 100755
index 000000000..be462d556
--- /dev/null
+++ b/tools/images/build.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script is responsible for building a new GCP image that: 1) has nested
+# virtualization enabled, and 2) has been completely set up with the
+# image_setup.sh script. This script should be idempotent, as we memoize the
+# setup script with a hash and check for that name.
+
+set -xeou pipefail
+
+# Parameters.
+declare -r USERNAME=${USERNAME:-test}
+declare -r IMAGE_PROJECT=${IMAGE_PROJECT:-ubuntu-os-cloud}
+declare -r IMAGE_FAMILY=${IMAGE_FAMILY:-ubuntu-1604-lts}
+declare -r ZONE=${ZONE:-us-central1-f}
+
+# Random names.
+declare -r DISK_NAME=$(mktemp -u disk-XXXXXX | tr A-Z a-z)
+declare -r SNAPSHOT_NAME=$(mktemp -u snapshot-XXXXXX | tr A-Z a-z)
+declare -r INSTANCE_NAME=$(mktemp -u build-XXXXXX | tr A-Z a-z)
+
+# Hash inputs in order to memoize the produced image.
+declare -r SETUP_HASH=$( (echo ${USERNAME} ${IMAGE_PROJECT} ${IMAGE_FAMILY} && cat "$@") | sha256sum - | cut -d' ' -f1 | cut -c 1-16)
+declare -r IMAGE_NAME=${IMAGE_FAMILY:-image-}${SETUP_HASH}
+
+# Does the image already exist? Skip the build.
+declare -r existing=$(gcloud compute images list --filter="name=(${IMAGE_NAME})" --format="value(name)")
+if ! [[ -z "${existing}" ]]; then
+  echo "${existing}"
+  exit 0
+fi
+
+# gcloud has path errors; is this a result of being a genrule?
+export PATH=${PATH:-/bin:/usr/bin:/usr/local/bin}
+
+# Start a unique instance. Note that this instance will have a unique persistent
+# disk as it's boot disk with the same name as the instance.
+gcloud compute instances create \
+    --quiet \
+    --image-project "${IMAGE_PROJECT}" \
+    --image-family "${IMAGE_FAMILY}" \
+    --boot-disk-size "200GB" \
+    --zone "${ZONE}" \
+    "${INSTANCE_NAME}" >/dev/null
+function cleanup {
+    gcloud compute instances delete --quiet --zone "${ZONE}" "${INSTANCE_NAME}"
+}
+trap cleanup EXIT
+
+# Wait for the instance to become available (up to 5 minutes).
+declare timeout=300
+declare success=0
+declare -r start=$(date +%s)
+declare -r end=$((${start}+${timeout}))
+while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do
+  if gcloud compute ssh --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- env - true 2>/dev/null; then
+    success=$((${success}+1))
+  fi
+done
+if [[ "${success}" -eq "0" ]]; then
+  echo "connect timed out after ${timeout} seconds."
+  exit 1
+fi
+
+# Run the install scripts provided.
+for arg; do
+  gcloud compute ssh --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- sudo bash - <"${arg}" >/dev/null
+done
+
+# Stop the instance; required before creating an image.
+gcloud compute instances stop --quiet --zone "${ZONE}" "${INSTANCE_NAME}" >/dev/null
+
+# Create a snapshot of the instance disk.
+gcloud compute disks snapshot \
+    --quiet \
+    --zone "${ZONE}" \
+    --snapshot-names="${SNAPSHOT_NAME}" \
+    "${INSTANCE_NAME}" >/dev/null
+
+# Create the disk image.
+gcloud compute images create \
+    --quiet \
+    --source-snapshot="${SNAPSHOT_NAME}" \
+    --licenses="https://www.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx" \
+    "${IMAGE_NAME}" >/dev/null
+
+# Finish up.
+echo "${IMAGE_NAME}"
diff --git a/tools/images/defs.bzl b/tools/images/defs.bzl
new file mode 100644
index 000000000..d8e422a5d
--- /dev/null
+++ b/tools/images/defs.bzl
@@ -0,0 +1,178 @@
+"""Image configuration.
+
+Images can be generated by using the vm_image rule. For example,
+
+  vm_image(
+      name = "ubuntu",
+      project = "...",
+      family = "...",
+      scripts = [
+          "script.sh",
+          "other.sh",
+      ],
+  )
+
+This will always create an vm_image in the current default gcloud project. The
+rule has a text file as its output containing the image name. This will enforce
+serialization for all dependent rules.
+
+Images are always named per the hash of all the hermetic input scripts. This
+allows images to be memoized quickly and easily.
+
+The vm_test rule can be used to execute a command remotely. For example,
+
+  vm_test(
+      name = "mycommand",
+      image = ":myimage",
+      targets = [":test"],
+  )
+"""
+
+def _vm_image_impl(ctx):
+    script_paths = []
+    for script in ctx.files.scripts:
+        script_paths.append(script.short_path)
+
+    resolved_inputs, argv, runfiles_manifests = ctx.resolve_command(
+        command = "USERNAME=%s ZONE=$(cat %s) IMAGE_PROJECT=%s IMAGE_FAMILY=%s %s %s > %s" %
+                  (
+                      ctx.attr.username,
+                      ctx.files.zone[0].path,
+                      ctx.attr.project,
+                      ctx.attr.family,
+                      ctx.executable.builder.path,
+                      " ".join(script_paths),
+                      ctx.outputs.out.path,
+                  ),
+        tools = [ctx.attr.builder] + ctx.attr.scripts,
+    )
+
+    ctx.actions.run_shell(
+        tools = resolved_inputs,
+        outputs = [ctx.outputs.out],
+        progress_message = "Building image...",
+        execution_requirements = {"local": "true"},
+        command = argv,
+        input_manifests = runfiles_manifests,
+    )
+    return [DefaultInfo(files = depset([ctx.outputs.out]))]
+
+_vm_image = rule(
+    attrs = {
+        "builder": attr.label(
+            executable = True,
+            default = "//tools/images:builder",
+            cfg = "host",
+        ),
+        "username": attr.string(default = "$(whoami)"),
+        "zone": attr.label(
+            default = "//tools/images:zone",
+            cfg = "host",
+        ),
+        "family": attr.string(mandatory = True),
+        "project": attr.string(mandatory = True),
+        "scripts": attr.label_list(allow_files = True),
+    },
+    outputs = {
+        "out": "%{name}.txt",
+    },
+    implementation = _vm_image_impl,
+)
+
+def vm_image(**kwargs):
+    _vm_image(
+        tags = [
+            "local",
+            "manual",
+        ],
+        **kwargs
+    )
+
+def _vm_test_impl(ctx):
+    runner = ctx.actions.declare_file("%s-executer" % ctx.label.name)
+
+    # Note that the remote execution case must actually generate an
+    # intermediate target in order to collect all the relevant runfiles so that
+    # they can be copied over for remote execution.
+    runner_content = "\n".join([
+        "#!/bin/bash",
+        "export ZONE=$(cat %s)" % ctx.files.zone[0].short_path,
+        "export USERNAME=%s" % ctx.attr.username,
+        "export IMAGE=$(cat %s)" % ctx.files.image[0].short_path,
+        "export SUDO=%s" % "true" if ctx.attr.sudo else "false",
+        "%s %s" % (
+            ctx.executable.executer.short_path,
+            " ".join([
+                target.files_to_run.executable.short_path
+                for target in ctx.attr.targets
+            ]),
+        ),
+        "",
+    ])
+    ctx.actions.write(runner, runner_content, is_executable = True)
+
+    # Return with all transitive files.
+    runfiles = ctx.runfiles(
+        transitive_files = depset(transitive = [
+            depset(target.data_runfiles.files)
+            for target in ctx.attr.targets
+            if hasattr(target, "data_runfiles")
+        ]),
+        files = ctx.files.executer + ctx.files.zone + ctx.files.image +
+                ctx.files.targets,
+        collect_default = True,
+        collect_data = True,
+    )
+    return [DefaultInfo(executable = runner, runfiles = runfiles)]
+
+_vm_test = rule(
+    attrs = {
+        "image": attr.label(
+            mandatory = True,
+            cfg = "host",
+        ),
+        "executer": attr.label(
+            executable = True,
+            default = "//tools/images:executer",
+            cfg = "host",
+        ),
+        "username": attr.string(default = "$(whoami)"),
+        "zone": attr.label(
+            default = "//tools/images:zone",
+            cfg = "host",
+        ),
+        "sudo": attr.bool(default = True),
+        "machine": attr.string(default = "n1-standard-1"),
+        "targets": attr.label_list(
+            mandatory = True,
+            allow_empty = False,
+            cfg = "target",
+        ),
+    },
+    test = True,
+    implementation = _vm_test_impl,
+)
+
+def vm_test(
+        installer = "//tools/installers:head",
+        **kwargs):
+    """Runs the given targets as a remote test.
+
+    Args:
+      installer: Script to run before all targets.
+      **kwargs: All test arguments. Should include targets and image.
+    """
+    targets = kwargs.pop("targets", [])
+    if installer:
+        targets = [installer] + targets
+    targets = [
+    ] + targets
+    _vm_test(
+        tags = [
+            "local",
+            "manual",
+        ],
+        targets = targets,
+        local = 1,
+        **kwargs
+    )
diff --git a/tools/images/execute.sh b/tools/images/execute.sh
new file mode 100755
index 000000000..ba4b1ac0e
--- /dev/null
+++ b/tools/images/execute.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeo pipefail
+
+# Required input.
+if ! [[ -v IMAGE ]]; then
+  echo "no image provided: set IMAGE."
+  exit 1
+fi
+
+# Parameters.
+declare -r USERNAME=${USERNAME:-test}
+declare -r KEYNAME=$(mktemp --tmpdir -u key-XXXXXX)
+declare -r SSHKEYS=$(mktemp --tmpdir -u sshkeys-XXXXXX)
+declare -r INSTANCE_NAME=$(mktemp -u test-XXXXXX | tr A-Z a-z)
+declare -r MACHINE=${MACHINE:-n1-standard-1}
+declare -r ZONE=${ZONE:-us-central1-f}
+declare -r SUDO=${SUDO:-false}
+
+# This script is executed as a test rule, which will reset the value of HOME.
+# Unfortunately, it is needed to load the gconfig credentials. We will reset
+# HOME when we actually execute in the remote environment, defined below.
+export HOME=$(eval echo ~$(whoami))
+
+# Generate unique keys for this test.
+[[ -f "${KEYNAME}" ]] || ssh-keygen -t rsa -N "" -f "${KEYNAME}" -C "${USERNAME}"
+cat > "${SSHKEYS}" <<EOF
+${USERNAME}:$(cat ${KEYNAME}.pub)
+EOF
+
+# Start a unique instance. This means that we first generate a unique set of ssh
+# keys to ensure that only we have access to this instance. Note that we must
+# constrain ourselves to Haswell or greater in order to have nested
+# virtualization available.
+gcloud compute instances create \
+    --min-cpu-platform "Intel Haswell" \
+    --preemptible \
+    --no-scopes \
+    --metadata block-project-ssh-keys=TRUE \
+    --metadata-from-file ssh-keys="${SSHKEYS}" \
+    --machine-type "${MACHINE}" \
+    --image "${IMAGE}" \
+    --zone "${ZONE}" \
+    "${INSTANCE_NAME}"
+function cleanup {
+    gcloud compute instances delete --quiet --zone "${ZONE}" "${INSTANCE_NAME}"
+}
+trap cleanup EXIT
+
+# Wait for the instance to become available (up to 5 minutes).
+declare timeout=300
+declare success=0
+declare -r start=$(date +%s)
+declare -r end=$((${start}+${timeout}))
+while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do
+  if gcloud compute ssh --ssh-key-file="${KEYNAME}" --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then
+    success=$((${success}+1))
+  fi
+done
+if [[ "${success}" -eq "0" ]]; then
+  echo "connect timed out after ${timeout} seconds."
+  exit 1
+fi
+
+# Copy the local directory over.
+tar czf - --dereference --exclude=.git . |
+    gcloud compute ssh \
+        --ssh-key-file="${KEYNAME}" \
+        --zone "${ZONE}" \
+        "${USERNAME}"@"${INSTANCE_NAME}" -- tar xzf -
+
+# Execute the command remotely.
+for cmd; do
+  # Setup relevant environment.
+  #
+  # N.B. This is not a complete test environment, but is complete enough to
+  # provide rudimentary sharding and test output support.
+  declare -a PREFIX=( "env" )
+  if [[ -v TEST_SHARD_INDEX ]]; then
+    PREFIX+=( "TEST_SHARD_INDEX=${TEST_SHARD_INDEX}" )
+  fi
+  if [[ -v TEST_SHARD_STATUS_FILE ]]; then
+    SHARD_STATUS_FILE=$(mktemp -u test-shard-status-XXXXXX)
+    PREFIX+=( "TEST_SHARD_STATUS_FILE=/tmp/${SHARD_STATUS_FILE}" )
+  fi
+  if [[ -v TEST_TOTAL_SHARDS ]]; then
+    PREFIX+=( "TEST_TOTAL_SHARDS=${TEST_TOTAL_SHARDS}" )
+  fi
+  if [[ -v TEST_TMPDIR ]]; then
+    REMOTE_TMPDIR=$(mktemp -u test-XXXXXX)
+    PREFIX+=( "TEST_TMPDIR=/tmp/${REMOTE_TMPDIR}" )
+    # Create remotely.
+    gcloud compute ssh \
+      --ssh-key-file="${KEYNAME}" \
+      --zone "${ZONE}" \
+      "${USERNAME}"@"${INSTANCE_NAME}" -- \
+      mkdir -p "/tmp/${REMOTE_TMPDIR}"
+  fi
+  if [[ -v XML_OUTPUT_FILE ]]; then
+    TEST_XML_OUTPUT=$(mktemp -u xml-output-XXXXXX)
+    PREFIX+=( "XML_OUTPUT_FILE=/tmp/${TEST_XML_OUTPUT}" )
+  fi
+  if [[ "${SUDO}" == "true" ]]; then
+    PREFIX+=( "sudo" "-E" )
+  fi
+
+  # Execute the command.
+  gcloud compute ssh \
+    --ssh-key-file="${KEYNAME}" \
+    --zone "${ZONE}" \
+    "${USERNAME}"@"${INSTANCE_NAME}" -- \
+    "${PREFIX[@]}" "${cmd}"
+
+  # Collect relevant results.
+  if [[ -v TEST_SHARD_STATUS_FILE ]]; then
+    gcloud compute scp \
+        --ssh-key-file="${KEYNAME}" \
+        --zone "${ZONE}" \
+        "${USERNAME}"@"${INSTANCE_NAME}":/tmp/"${SHARD_STATUS_FILE}" \
+        "${TEST_SHARD_STATUS_FILE}" 2>/dev/null || true # Allowed to fail.
+  fi
+  if [[ -v XML_OUTPUT_FILE ]]; then
+    gcloud compute scp \
+        --ssh-key-file="${KEYNAME}" \
+        --zone "${ZONE}" \
+        "${USERNAME}"@"${INSTANCE_NAME}":/tmp/"${TEST_XML_OUTPUT}" \
+        "${XML_OUTPUT_FILE}" 2>/dev/null || true # Allowed to fail.
+  fi
+
+  # Clean up the temporary directory.
+  if [[ -v TEST_TMPDIR ]]; then
+    gcloud compute ssh \
+      --ssh-key-file="${KEYNAME}" \
+      --zone "${ZONE}" \
+      "${USERNAME}"@"${INSTANCE_NAME}" -- \
+      rm -rf "/tmp/${REMOTE_TMPDIR}"
+  fi
+done
diff --git a/tools/images/test.cc b/tools/images/test.cc
new file mode 100644
index 000000000..4f31d93c5
--- /dev/null
+++ b/tools/images/test.cc
@@ -0,0 +1,23 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+
+namespace {
+
+TEST(Image, Sanity) {
+  // Do nothing.
+}
+
+}  // namespace
diff --git a/tools/images/ubuntu1604/10_core.sh b/tools/images/ubuntu1604/10_core.sh
new file mode 100755
index 000000000..46dda6bb1
--- /dev/null
+++ b/tools/images/ubuntu1604/10_core.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeo pipefail
+
+# Install all essential build tools.
+apt-get update && apt-get -y install make git-core build-essential linux-headers-$(uname -r) pkg-config
+
+# Install a recent go toolchain.
+if ! [[ -d /usr/local/go ]]; then
+    wget https://dl.google.com/go/go1.13.5.linux-amd64.tar.gz
+    tar -xvf go1.13.5.linux-amd64.tar.gz
+    mv go /usr/local
+fi
+
+# Link the Go binary from /usr/bin; replacing anything there.
+(cd /usr/bin && rm -f go && sudo ln -fs /usr/local/go/bin/go go)
diff --git a/tools/images/ubuntu1604/20_bazel.sh b/tools/images/ubuntu1604/20_bazel.sh
new file mode 100755
index 000000000..b33e1656c
--- /dev/null
+++ b/tools/images/ubuntu1604/20_bazel.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeo pipefail
+
+declare -r BAZEL_VERSION=2.0.0
+
+# Install bazel dependencies.
+apt-get update && apt-get install -y openjdk-8-jdk-headless unzip
+
+# Use the release installer.
+curl -L -o bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
+chmod a+x bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
+./bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
+rm -f bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
diff --git a/tools/images/ubuntu1604/25_docker.sh b/tools/images/ubuntu1604/25_docker.sh
new file mode 100755
index 000000000..1d3defcd3
--- /dev/null
+++ b/tools/images/ubuntu1604/25_docker.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Add dependencies.
+apt-get update && apt-get -y install \
+    apt-transport-https \
+    ca-certificates \
+    curl \
+    gnupg-agent \
+    software-properties-common
+
+# Install the key.
+curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
+
+# Add the repository.
+add-apt-repository \
+   "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
+   $(lsb_release -cs) \
+   stable"
+
+# Install docker.
+apt-get update && apt-get install -y docker-ce docker-ce-cli containerd.io
diff --git a/tools/images/ubuntu1604/30_containerd.sh b/tools/images/ubuntu1604/30_containerd.sh
new file mode 100755
index 000000000..a7472bd1c
--- /dev/null
+++ b/tools/images/ubuntu1604/30_containerd.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeo pipefail
+
+# Helper for Go packages below.
+install_helper() {
+  PACKAGE="${1}"
+  TAG="${2}"
+  GOPATH="${3}"
+
+  # Clone the repository.
+  mkdir -p "${GOPATH}"/src/$(dirname "${PACKAGE}") && \
+     git clone https://"${PACKAGE}" "${GOPATH}"/src/"${PACKAGE}"
+
+  # Checkout and build the repository.
+  (cd "${GOPATH}"/src/"${PACKAGE}" && \
+      git checkout "${TAG}" && \
+      GOPATH="${GOPATH}" make && \
+      GOPATH="${GOPATH}" make install)
+}
+
+# Install dependencies for the crictl tests.
+apt-get install -y btrfs-tools libseccomp-dev
+
+# Install containerd & cri-tools.
+GOPATH=$(mktemp -d --tmpdir gopathXXXXX)
+install_helper github.com/containerd/containerd v1.2.2 "${GOPATH}"
+install_helper github.com/kubernetes-sigs/cri-tools v1.11.0 "${GOPATH}"
+
+# Install gvisor-containerd-shim.
+declare -r base="https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim"
+declare -r latest=$(mktemp --tmpdir gvisor-containerd-shim-latest.XXXXXX)
+declare -r shim_path=$(mktemp --tmpdir gvisor-containerd-shim.XXXXXX)
+wget --no-verbose "${base}"/latest -O ${latest}
+wget --no-verbose "${base}"/gvisor-containerd-shim-$(cat ${latest}) -O ${shim_path}
+chmod +x ${shim_path}
+mv ${shim_path} /usr/local/bin
+
+# Configure containerd-shim.
+declare -r shim_config_path=/etc/containerd
+declare -r shim_config_tmp_path=$(mktemp --tmpdir gvisor-containerd-shim.XXXXXX.toml)
+mkdir -p ${shim_config_path}
+cat > ${shim_config_tmp_path} <<-EOF
+    runc_shim = "/usr/local/bin/containerd-shim"
+
+[runsc_config]
+    debug = "true"
+    debug-log = "/tmp/runsc-logs/"
+    strace = "true"
+    file-access = "shared"
+EOF
+mv ${shim_config_tmp_path} ${shim_config_path}
+
+# Configure CNI.
+(cd "${GOPATH}" && GOPATH="${GOPATH}" \
+    src/github.com/containerd/containerd/script/setup/install-cni)
+
+# Cleanup the above.
+rm -rf "${GOPATH}"
+rm -rf "${latest}"
+rm -rf "${shim_path}"
+rm -rf "${shim_config_tmp_path}"
diff --git a/tools/images/ubuntu1604/40_kokoro.sh b/tools/images/ubuntu1604/40_kokoro.sh
new file mode 100755
index 000000000..5f2dfc858
--- /dev/null
+++ b/tools/images/ubuntu1604/40_kokoro.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeo pipefail
+
+# Declare kokoro's required public keys.
+declare -r ssh_public_keys=(
+    "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDg7L/ZaEauETWrPklUTky3kvxqQfe2Ax/2CsSqhNIGNMnK/8d79CHlmY9+dE1FFQ/RzKNCaltgy7XcN/fCYiCZr5jm2ZtnLuGNOTzupMNhaYiPL419qmL+5rZXt4/dWTrsHbFRACxT8j51PcRMO5wgbL0Bg2XXimbx8kDFaurL2gqduQYqlu4lxWCaJqOL71WogcimeL63Nq/yeH5PJPWpqE4P9VUQSwAzBWFK/hLeds/AiP3MgVS65qHBnhq0JsHy8JQsqjZbG7Iidt/Ll0+gqzEbi62gDIcczG4KC0iOVzDDP/1BxDtt1lKeA23ll769Fcm3rJyoBMYxjvdw1TDx sabujp@trigger.mtv.corp.google.com"
+    "ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBNgGK/hCdjmulHfRE3hp4rZs38NCR8yAh0eDsztxqGcuXnuSnL7jOlRrbcQpremJ84omD4eKrIpwJUs+YokMdv4= sabujp@trigger.svl.corp.google.com"
+)
+
+# Install dependencies.
+apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm python-pip python3-pip zip
+
+# junitparser is used to merge junit xml files.
+pip install junitparser
+
+# We need a kbuilder user.
+if useradd -c "kbuilder user" -m -s /bin/bash kbuilder; then
+    # User was added successfully; we add the relevant SSH keys here.
+    mkdir -p ~kbuilder/.ssh
+    (IFS=$'\n'; echo "${ssh_public_keys[*]}") > ~kbuilder/.ssh/authorized_keys
+    chmod 0600 ~kbuilder/.ssh/authorized_keys
+    chown -R kbuilder ~kbuilder/.ssh
+fi
+
+# Give passwordless sudo access.
+cat > /etc/sudoers.d/kokoro <<EOF
+kbuilder ALL=(ALL) NOPASSWD:ALL
+EOF
+
+# Ensure we can run Docker without sudo.
+usermod -aG docker kbuilder
+
+# Ensure that we can access kvm.
+usermod -aG kvm kbuilder
+
+# Ensure that /tmpfs exists and is writable by kokoro.
+#
+# Note that kokoro will typically attach a second disk (sdb) to the instance
+# that is used for the /tmpfs volume. In the future we could setup an init
+# script that formats and mounts this here; however, we don't expect our build
+# artifacts to be that large.
+mkdir -p /tmpfs && chmod 0777 /tmpfs && touch /tmpfs/READY
diff --git a/tools/images/ubuntu1604/BUILD b/tools/images/ubuntu1604/BUILD
new file mode 100644
index 000000000..ab1df0c4c
--- /dev/null
+++ b/tools/images/ubuntu1604/BUILD
@@ -0,0 +1,7 @@
+package(licenses = ["notice"])
+
+filegroup(
+    name = "ubuntu1604",
+    srcs = glob(["*.sh"]),
+    visibility = ["//:sandbox"],
+)
diff --git a/tools/images/ubuntu1804/BUILD b/tools/images/ubuntu1804/BUILD
new file mode 100644
index 000000000..7aa1ecdf7
--- /dev/null
+++ b/tools/images/ubuntu1804/BUILD
@@ -0,0 +1,7 @@
+package(licenses = ["notice"])
+
+alias(
+    name = "ubuntu1804",
+    actual = "//tools/images/ubuntu1604",
+    visibility = ["//:sandbox"],
+)
diff --git a/tools/installers/BUILD b/tools/installers/BUILD
new file mode 100644
index 000000000..01bc4de8c
--- /dev/null
+++ b/tools/installers/BUILD
@@ -0,0 +1,22 @@
+# Installers for use by the tools/vm_test rules.
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+sh_binary(
+    name = "head",
+    srcs = ["head.sh"],
+    data = ["//runsc"],
+)
+
+sh_binary(
+    name = "master",
+    srcs = ["master.sh"],
+)
+
+sh_binary(
+    name = "shim",
+    srcs = ["shim.sh"],
+)
diff --git a/tools/installers/head.sh b/tools/installers/head.sh
new file mode 100755
index 000000000..4435cb27a
--- /dev/null
+++ b/tools/installers/head.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Install our runtime.
+third_party/gvisor/runsc/runsc install
+
+# Restart docker.
+service docker restart || true
diff --git a/tools/installers/master.sh b/tools/installers/master.sh
new file mode 100755
index 000000000..7b1956454
--- /dev/null
+++ b/tools/installers/master.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Install runsc from the master branch.
+curl -fsSL https://gvisor.dev/archive.key | sudo apt-key add -
+add-apt-repository "deb https://storage.googleapis.com/gvisor/releases release main"
+apt-get update && apt-get install -y runsc
diff --git a/tools/installers/shim.sh b/tools/installers/shim.sh
new file mode 100755
index 000000000..f7dd790a1
--- /dev/null
+++ b/tools/installers/shim.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Reinstall the latest containerd shim.
+declare -r base="https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim"
+declare -r latest=$(mktemp --tmpdir gvisor-containerd-shim-latest.XXXXXX)
+declare -r shim_path=$(mktemp --tmpdir gvisor-containerd-shim.XXXXXX)
+wget --no-verbose "${base}"/latest -O ${latest}
+wget --no-verbose "${base}"/gvisor-containerd-shim-$(cat ${latest}) -O ${shim_path}
+chmod +x ${shim_path}
+mv ${shim_path} /usr/local/bin/gvisor-containerd-shim
-- 
cgit v1.2.3


From 159992300ddb2924cfbf1de57591a78ea27a3a4b Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Wed, 22 Jan 2020 11:50:53 -0800
Subject: Toolchain version bumps.

- bazel_toolchain to 2.0.2
- rules_go to 0.21.0
- Go toolchain to 1.13.6
- Use new proto lib archive.

PiperOrigin-RevId: 290999410
---
 WORKSPACE  | 33 ++++++++++++++++-----------------
 test/BUILD |  4 ++--
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index e2afc073c..5d2fc36f9 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -4,10 +4,10 @@ load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
 # Load go bazel rules and gazelle.
 http_archive(
     name = "io_bazel_rules_go",
-    sha256 = "b9aa86ec08a292b97ec4591cf578e020b35f98e12173bbd4a921f84f583aebd9",
+    sha256 = "b27e55d2dcc9e6020e17614ae6e0374818a3e3ce6f2024036e688ada24110444",
     urls = [
-        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/rules_go/releases/download/v0.20.2/rules_go-v0.20.2.tar.gz",
-        "https://github.com/bazelbuild/rules_go/releases/download/v0.20.2/rules_go-v0.20.2.tar.gz",
+        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/rules_go/releases/download/v0.21.0/rules_go-v0.21.0.tar.gz",
+        "https://github.com/bazelbuild/rules_go/releases/download/v0.21.0/rules_go-v0.21.0.tar.gz",
     ],
 )
 
@@ -25,7 +25,7 @@ load("@io_bazel_rules_go//go:deps.bzl", "go_rules_dependencies", "go_register_to
 go_rules_dependencies()
 
 go_register_toolchains(
-    go_version = "1.13.4",
+    go_version = "1.13.6",
     nogo = "@//:nogo",
 )
 
@@ -46,18 +46,17 @@ http_archive(
 
 # Load protobuf dependencies.
 http_archive(
-    name = "com_google_protobuf",
-    sha256 = "532d2575d8c0992065bb19ec5fba13aa3683499726f6055c11b474f91a00bb0c",
-    strip_prefix = "protobuf-7f520092d9050d96fb4b707ad11a51701af4ce49",
+    name = "rules_proto",
+    sha256 = "602e7161d9195e50246177e7c55b2f39950a9cf7366f74ed5f22fd45750cd208",
+    strip_prefix = "rules_proto-97d8af4dc474595af3900dd85cb3a29ad28cc313",
     urls = [
-        "https://mirror.bazel.build/github.com/protocolbuffers/protobuf/archive/7f520092d9050d96fb4b707ad11a51701af4ce49.zip",
-        "https://github.com/protocolbuffers/protobuf/archive/7f520092d9050d96fb4b707ad11a51701af4ce49.zip",
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_proto/archive/97d8af4dc474595af3900dd85cb3a29ad28cc313.tar.gz",
+        "https://github.com/bazelbuild/rules_proto/archive/97d8af4dc474595af3900dd85cb3a29ad28cc313.tar.gz",
     ],
 )
-
-load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps")
-
-protobuf_deps()
+load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains")
+rules_proto_dependencies()
+rules_proto_toolchains()
 
 # Load python dependencies.
 git_repository(
@@ -83,11 +82,11 @@ pip_install()
 # See releases at https://releases.bazel.build/bazel-toolchains.html
 http_archive(
     name = "bazel_toolchains",
-    sha256 = "a019fbd579ce5aed0239de865b2d8281dbb809efd537bf42e0d366783e8dec65",
-    strip_prefix = "bazel-toolchains-0.29.2",
+    sha256 = "a653c9d318e42b14c0ccd7ac50c4a2a276c0db1e39743ab88b5aa2f0bc9cf607",
+    strip_prefix = "bazel-toolchains-2.0.2",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/0.29.2.tar.gz",
-        "https://github.com/bazelbuild/bazel-toolchains/archive/0.29.2.tar.gz",
+        "https://github.com/bazelbuild/bazel-toolchains/releases/download/2.0.2/bazel-toolchains-2.0.2.tar.gz",
+        "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/2.0.2.tar.gz",
     ],
 )
 
diff --git a/test/BUILD b/test/BUILD
index 01fa01f2e..bf834d994 100644
--- a/test/BUILD
+++ b/test/BUILD
@@ -20,7 +20,7 @@ platform(
     remote_execution_properties = """
         properties: {
           name: "container-image"
-          value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:69c9f1652941d64a46f6f7358a44c1718f25caa5cb1ced4a58ccc5281cd183b5"
+          value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50"
         }
         properties: {
           name: "dockerAddCapabilities"
@@ -39,6 +39,6 @@ toolchain(
     ],
     target_compatible_with = [
     ],
-    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/9.0.0/bazel_0.28.0/cc:cc-compiler-k8",
+    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8",
     toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
 )
-- 
cgit v1.2.3


From 5ab1213a6c405071546c783d6d93b4e9af52842e Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 22 Jan 2020 12:27:16 -0800
Subject: Move VFS2 handling of FD readability/writability to
 vfs.FileDescription.

PiperOrigin-RevId: 291006713
---
 pkg/sentry/fsimpl/ext/inode.go                 |  8 +++-
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 11 +++--
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go       | 11 ++++-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go        | 18 +++++--
 pkg/sentry/fsimpl/tmpfs/filesystem.go          | 15 ++----
 pkg/sentry/fsimpl/tmpfs/named_pipe.go          |  5 +-
 pkg/sentry/fsimpl/tmpfs/regular_file.go        | 14 +-----
 pkg/sentry/kernel/pipe/vfs.go                  | 12 ++---
 pkg/sentry/vfs/file_description.go             | 66 ++++++++++++++++++++++++--
 pkg/sentry/vfs/permissions.go                  |  5 +-
 10 files changed, 111 insertions(+), 54 deletions(-)

diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 8608805bf..191b39970 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -157,7 +157,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
+		if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			return nil, err
+		}
 		return &fd.vfsfd, nil
 	case *directory:
 		// Can't open directories writably. This check is not necessary for a read
@@ -166,7 +168,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
+		if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			return nil, err
+		}
 		return &fd.vfsfd, nil
 	case *symlink:
 		if flags&linux.O_PATH == 0 {
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 606ca692d..75624e0b1 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -55,7 +55,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.Dy
 // Open implements Inode.Open.
 func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
 	fd := &DynamicBytesFD{}
-	fd.Init(rp.Mount(), vfsd, f.data, flags)
+	if err := fd.Init(rp.Mount(), vfsd, f.data, flags); err != nil {
+		return nil, err
+	}
 	return &fd.vfsfd, nil
 }
 
@@ -80,10 +82,13 @@ type DynamicBytesFD struct {
 }
 
 // Init initializes a DynamicBytesFD.
-func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) {
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) error {
+	if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+		return err
+	}
 	fd.inode = d.Impl().(*Dentry).inode
 	fd.SetDataSource(data)
-	fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
+	return nil
 }
 
 // Seek implements vfs.FileDescriptionImpl.Seek.
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index bcf069b5f..5fa1fa67b 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -43,9 +43,16 @@ type GenericDirectoryFD struct {
 }
 
 // Init initializes a GenericDirectoryFD.
-func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) {
+func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) error {
+	if vfs.AccessTypesForOpenFlags(flags)&vfs.MayWrite != 0 {
+		// Can't open directories for writing.
+		return syserror.EISDIR
+	}
+	if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+		return err
+	}
 	fd.children = children
-	fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
+	return nil
 }
 
 // VFSFileDescription returns a pointer to the vfs.FileDescription representing
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index a5fdfbde5..aa3fe76ee 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -115,7 +115,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
 
 func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+	if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags); err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
@@ -225,7 +227,9 @@ func TestReadStaticFile(t *testing.T) {
 	defer sys.Destroy()
 
 	pop := sys.PathOpAtRoot("file1")
-	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{})
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
 	if err != nil {
 		t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
 	}
@@ -258,7 +262,9 @@ func TestCreateNewFileInStaticDir(t *testing.T) {
 	// Close the file. The file should persist.
 	fd.DecRef()
 
-	fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{})
+	fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
 	if err != nil {
 		t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
 	}
@@ -272,7 +278,9 @@ func TestDirFDReadWrite(t *testing.T) {
 	defer sys.Destroy()
 
 	pop := sys.PathOpAtRoot("/")
-	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{})
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
 	if err != nil {
 		t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
 	}
@@ -282,7 +290,7 @@ func TestDirFDReadWrite(t *testing.T) {
 	if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR {
 		t.Fatalf("Read for directory FD failed with unexpected error: %v", err)
 	}
-	if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR {
+	if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EBADF {
 		t.Fatalf("Write for directory FD failed with unexpected error: %v", err)
 	}
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 4cd7e9aea..a9f66a42a 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -337,19 +337,12 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32,
 			return nil, err
 		}
 	}
-	mnt := rp.Mount()
 	switch impl := d.inode.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
-		fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
-		if fd.writable {
-			if err := mnt.CheckBeginWrite(); err != nil {
-				return nil, err
-			}
-			// mnt.EndWrite() is called by regularFileFD.Release().
+		if err := fd.vfsfd.Init(&fd, flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			return nil, err
 		}
-		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
 		if flags&linux.O_TRUNC != 0 {
 			impl.mu.Lock()
 			impl.data.Truncate(0, impl.memFile)
@@ -363,7 +356,9 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32,
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
+		if err := fd.vfsfd.Init(&fd, flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			return nil, err
+		}
 		return &fd.vfsfd, nil
 	case *symlink:
 		// Can't open symlinks without O_PATH (which is unimplemented).
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 40bde54de..482aabd52 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -50,11 +50,10 @@ type namedPipeFD struct {
 func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
 	var err error
 	var fd namedPipeFD
-	fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags)
+	fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, vfsd, &fd.vfsfd, flags)
 	if err != nil {
 		return nil, err
 	}
-	mnt := rp.Mount()
-	fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
+	fd.vfsfd.Init(&fd, flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{})
 	return &fd.vfsfd, nil
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 5fa70cc6d..7c633c1b0 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -101,10 +101,6 @@ func (rf *regularFile) truncate(size uint64) (bool, error) {
 type regularFileFD struct {
 	fileDescription
 
-	// These are immutable.
-	readable bool
-	writable bool
-
 	// off is the file offset. off is accessed using atomic memory operations.
 	// offMu serializes operations that may mutate off.
 	off   int64
@@ -113,16 +109,11 @@ type regularFileFD struct {
 
 // Release implements vfs.FileDescriptionImpl.Release.
 func (fd *regularFileFD) Release() {
-	if fd.writable {
-		fd.vfsfd.VirtualDentry().Mount().EndWrite()
-	}
+	// noop
 }
 
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
-	if !fd.readable {
-		return 0, syserror.EINVAL
-	}
 	if offset < 0 {
 		return 0, syserror.EINVAL
 	}
@@ -147,9 +138,6 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
 
 // PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	if !fd.writable {
-		return 0, syserror.EINVAL
-	}
 	if offset < 0 {
 		return 0, syserror.EINVAL
 	}
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index bf7461cbb..6f83e3cee 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -66,7 +66,7 @@ func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe {
 // for read and write will succeed both in blocking and nonblocking mode. POSIX
 // leaves this behavior undefined. This can be used to open a FIFO for writing
 // while there are no readers available." - fifo(7)
-func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
+func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
 	vp.mu.Lock()
 	defer vp.mu.Unlock()
 
@@ -76,7 +76,7 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd
 		return nil, syserror.EINVAL
 	}
 
-	vfd, err := vp.open(rp, vfsd, vfsfd, flags)
+	vfd, err := vp.open(vfsd, vfsfd, flags)
 	if err != nil {
 		return nil, err
 	}
@@ -118,19 +118,13 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd
 }
 
 // Preconditions: vp.mu must be held.
-func (vp *VFSPipe) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
+func (vp *VFSPipe) open(vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
 	var fd VFSPipeFD
 	fd.flags = flags
 	fd.readable = vfs.MayReadFileWithOpenFlags(flags)
 	fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
 	fd.vfsfd = vfsfd
 	fd.pipe = &vp.pipe
-	if fd.writable {
-		// The corresponding Mount.EndWrite() is in VFSPipe.Release().
-		if err := rp.Mount().CheckBeginWrite(); err != nil {
-			return nil, err
-		}
-	}
 
 	switch {
 	case fd.readable && fd.writable:
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 6afe280bc..51c95c2d9 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -49,8 +49,23 @@ type FileDescription struct {
 	// A reference is held on vd. vd is immutable.
 	vd VirtualDentry
 
+	// opts contains options passed to FileDescription.Init(). opts is
+	// immutable.
 	opts FileDescriptionOptions
 
+	// readable is MayReadFileWithOpenFlags(statusFlags). readable is
+	// immutable.
+	//
+	// readable is analogous to Linux's FMODE_READ.
+	readable bool
+
+	// writable is MayWriteFileWithOpenFlags(statusFlags). If writable is true,
+	// the FileDescription holds a write count on vd.mount. writable is
+	// immutable.
+	//
+	// writable is analogous to Linux's FMODE_WRITE.
+	writable bool
+
 	// impl is the FileDescriptionImpl associated with this Filesystem. impl is
 	// immutable. This should be the last field in FileDescription.
 	impl FileDescriptionImpl
@@ -77,10 +92,17 @@ type FileDescriptionOptions struct {
 	UseDentryMetadata bool
 }
 
-// Init must be called before first use of fd. It takes references on mnt and
-// d. statusFlags is the initial file description status flags, which is
-// usually the full set of flags passed to open(2).
-func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) {
+// Init must be called before first use of fd. If it succeeds, it takes
+// references on mnt and d. statusFlags is the initial file description status
+// flags, which is usually the full set of flags passed to open(2).
+func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
+	writable := MayWriteFileWithOpenFlags(statusFlags)
+	if writable {
+		if err := mnt.CheckBeginWrite(); err != nil {
+			return err
+		}
+	}
+
 	fd.refs = 1
 	fd.statusFlags = statusFlags | linux.O_LARGEFILE
 	fd.vd = VirtualDentry{
@@ -89,7 +111,10 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn
 	}
 	fd.vd.IncRef()
 	fd.opts = *opts
+	fd.readable = MayReadFileWithOpenFlags(statusFlags)
+	fd.writable = writable
 	fd.impl = impl
+	return nil
 }
 
 // IncRef increments fd's reference count.
@@ -117,6 +142,9 @@ func (fd *FileDescription) TryIncRef() bool {
 func (fd *FileDescription) DecRef() {
 	if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
 		fd.impl.Release()
+		if fd.writable {
+			fd.vd.mount.EndWrite()
+		}
 		fd.vd.DecRef()
 	} else if refs < 0 {
 		panic("FileDescription.DecRef() called without holding a reference")
@@ -194,6 +222,16 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede
 	return nil
 }
 
+// IsReadable returns true if fd was opened for reading.
+func (fd *FileDescription) IsReadable() bool {
+	return fd.readable
+}
+
+// IsWritable returns true if fd was opened for writing.
+func (fd *FileDescription) IsWritable() bool {
+	return fd.writable
+}
+
 // Impl returns the FileDescriptionImpl associated with fd.
 func (fd *FileDescription) Impl() FileDescriptionImpl {
 	return fd.impl
@@ -241,6 +279,8 @@ type FileDescriptionImpl interface {
 	// Errors:
 	//
 	// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
+	//
+	// Preconditions: The FileDescription was opened for reading.
 	PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
 
 	// Read is similar to PRead, but does not specify an offset.
@@ -254,6 +294,8 @@ type FileDescriptionImpl interface {
 	// Errors:
 	//
 	// - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP.
+	//
+	// Preconditions: The FileDescription was opened for reading.
 	Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)
 
 	// PWrite writes src to the file, starting at the given offset, and returns
@@ -268,6 +310,8 @@ type FileDescriptionImpl interface {
 	//
 	// - If opts.Flags specifies unsupported options, PWrite returns
 	// EOPNOTSUPP.
+	//
+	// Preconditions: The FileDescription was opened for writing.
 	PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
 
 	// Write is similar to PWrite, but does not specify an offset, which is
@@ -281,6 +325,8 @@ type FileDescriptionImpl interface {
 	// Errors:
 	//
 	// - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP.
+	//
+	// Preconditions: The FileDescription was opened for writing.
 	Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error)
 
 	// IterDirents invokes cb on each entry in the directory represented by the
@@ -411,11 +457,17 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
 // offset, and returns the number of bytes read. PRead is permitted to return
 // partial reads with a nil error.
 func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	if !fd.readable {
+		return 0, syserror.EBADF
+	}
 	return fd.impl.PRead(ctx, dst, offset, opts)
 }
 
 // Read is similar to PRead, but does not specify an offset.
 func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	if !fd.readable {
+		return 0, syserror.EBADF
+	}
 	return fd.impl.Read(ctx, dst, opts)
 }
 
@@ -423,11 +475,17 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt
 // offset, and returns the number of bytes written. PWrite is permitted to
 // return partial writes with a nil error.
 func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	if !fd.writable {
+		return 0, syserror.EBADF
+	}
 	return fd.impl.PWrite(ctx, src, offset, opts)
 }
 
 // Write is similar to PWrite, but does not specify an offset.
 func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	if !fd.writable {
+		return 0, syserror.EBADF
+	}
 	return fd.impl.Write(ctx, src, opts)
 }
 
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index d279d05ca..f664581f4 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -94,14 +94,13 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
 // the set of accesses permitted for the opened file:
 //
 // - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it
-// mutates the file), but does not permit the opened to write to the file
+// mutates the file), but does not permit writing to the open file description
 // thereafter.
 //
 // - "Linux reserves the special, nonstandard access mode 3 (binary 11) in
 // flags to mean: check for read and write permission on the file and return a
 // file descriptor that can't be used for reading or writing." - open(2). Thus
-// AccessTypesForOpenFlags returns MayRead|MayWrite in this case, but
-// filesystems are responsible for ensuring that access is denied.
+// AccessTypesForOpenFlags returns MayRead|MayWrite in this case.
 //
 // Use May{Read,Write}FileWithOpenFlags() for these checks instead.
 func AccessTypesForOpenFlags(flags uint32) AccessTypes {
-- 
cgit v1.2.3


From b7853f688b4bcd3465c0c3087fcbd8d53bdf26ae Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 22 Jan 2020 14:46:15 -0800
Subject: Error marshalling the matcher.

The iptables binary is looking for libxt_.so when it should be looking
for libxt_udp.so, so it's having an issue reading the data in
xt_match_entry. I think it may be an alignment issue.

Trying to fix this is leading to me fighting with the metadata struct,
so I'm gonna go kill that.
---
 pkg/abi/linux/netfilter.go               |  5 +++++
 pkg/sentry/socket/netfilter/netfilter.go | 35 ++++++++++++++++++++------------
 pkg/tcpip/iptables/udp_matcher.go        |  2 +-
 3 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index f0e544f9c..effed7976 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -198,6 +198,11 @@ type XTEntryMatch struct {
 // SizeOfXTEntryMatch is the size of an XTEntryMatch.
 const SizeOfXTEntryMatch = 32
 
+type KernelXTEntryMatch struct {
+	XTEntryMatch
+	Data []byte
+}
+
 // XTEntryTarget holds a target for a rule. For example, it can specify that
 // packets matching the rule should DROP, ACCEPT, or use an extension target.
 // iptables-extension(8) has a list of possible targets.
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3caabca9a..b49fe5b3e 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -207,26 +207,34 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
 }
 
 func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
-	type udpMatch struct {
-		linux.XTEntryMatch
-		linux.XTUDP
-	}
-	linuxMatcher := udpMatch{
+	linuxMatcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
 			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP,
 			// Name:      "udp",
 		},
-		XTUDP: linux.XTUDP{
-			SourcePortStart:      matcher.Data.SourcePortStart,
-			SourcePortEnd:        matcher.Data.SourcePortEnd,
-			DestinationPortStart: matcher.Data.DestinationPortStart,
-			DestinationPortEnd:   matcher.Data.DestinationPortEnd,
-			InverseFlags:         matcher.Data.InverseFlags,
-		},
+		Data: make([]byte, linux.SizeOfXTUDP+22),
 	}
+	// copy(linuxMatcher.Name[:], "udp")
 	copy(linuxMatcher.Name[:], "udp")
 
-	var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP]byte
+	// TODO: Must be aligned.
+	xtudp := linux.XTUDP{
+		SourcePortStart:      matcher.Data.SourcePortStart,
+		SourcePortEnd:        matcher.Data.SourcePortEnd,
+		DestinationPortStart: matcher.Data.DestinationPortStart,
+		DestinationPortEnd:   matcher.Data.DestinationPortEnd,
+		InverseFlags:         matcher.Data.InverseFlags,
+	}
+	binary.Marshal(linuxMatcher.Data[:linux.SizeOfXTUDP], usermem.ByteOrder, xtudp)
+
+	if binary.Size(linuxMatcher)%64 != 0 {
+		panic(fmt.Sprintf("size is actually: %d", binary.Size(linuxMatcher)))
+	}
+
+	var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 22]byte
+	if len(buf)%64 != 0 {
+		panic(fmt.Sprintf("len is actually: %d", len(buf)))
+	}
 	binary.Marshal(buf[:], usermem.ByteOrder, linuxMatcher)
 	return buf[:]
 }
@@ -245,6 +253,7 @@ func marshalTarget(target iptables.Target) []byte {
 }
 
 func marshalStandardTarget(verdict iptables.Verdict) []byte {
+	// TODO: Must be aligned.
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
 		Target: linux.XTEntryTarget{
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
index fca457199..65ae7f9e0 100644
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ b/pkg/tcpip/iptables/udp_matcher.go
@@ -59,7 +59,7 @@ func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error)
 	}
 
 	if filter.Protocol != header.UDPProtocolNumber {
-		log.Warningf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
+		return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
 	}
 
 	return &UDPMatcher{Data: data}, nil
-- 
cgit v1.2.3


From 1d97adaa6d73dd897bb4e89d4533936a95003951 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 22 Jan 2020 14:50:32 -0800
Subject: Use embedded mutex pattern for stack.NIC

- Wrap NIC's fields that should only be accessed while holding the mutex in
  an anonymous struct with the embedded mutex.
- Make sure NIC's spoofing and promiscuous mode flags are only read while
  holding the NIC's mutex.
- Use the correct endpoint when sending DAD messages.
- Do not hold the NIC's lock when sending DAD messages.

This change does not introduce any behaviour changes.

Tests: Existing tests continue to pass.
PiperOrigin-RevId: 291036251
---
 pkg/tcpip/stack/ndp.go      | 181 +++++++++++++++------------------
 pkg/tcpip/stack/ndp_test.go |  45 ++++-----
 pkg/tcpip/stack/nic.go      | 236 +++++++++++++++++++++++++-------------------
 3 files changed, 234 insertions(+), 228 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 7d4b41dfa..d983ac390 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -15,7 +15,6 @@
 package stack
 
 import (
-	"fmt"
 	"log"
 	"math/rand"
 	"time"
@@ -429,8 +428,13 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		return tcpip.ErrAddressFamilyNotSupported
 	}
 
-	// Should not attempt to perform DAD on an address that is currently in
-	// the DAD process.
+	if ref.getKind() != permanentTentative {
+		// The endpoint should be marked as tentative since we are starting DAD.
+		log.Fatalf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.nic.ID())
+	}
+
+	// Should not attempt to perform DAD on an address that is currently in the
+	// DAD process.
 	if _, ok := ndp.dad[addr]; ok {
 		// Should never happen because we should only ever call this function for
 		// newly created addresses. If we attemped to "add" an address that already
@@ -438,77 +442,79 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		// address, or its reference count would have been increased without doing
 		// the work that would have been done for an address that was brand new.
 		// See NIC.addAddressLocked.
-		panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID()))
+		log.Fatalf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID())
 	}
 
 	remaining := ndp.configs.DupAddrDetectTransmits
-
-	{
-		done, err := ndp.doDuplicateAddressDetection(addr, remaining, ref)
-		if err != nil {
-			return err
-		}
-		if done {
-			return nil
-		}
+	if remaining == 0 {
+		ref.setKind(permanent)
+		return nil
 	}
 
-	remaining--
-
 	var done bool
 	var timer *time.Timer
-	timer = time.AfterFunc(ndp.configs.RetransmitTimer, func() {
-		var d bool
-		var err *tcpip.Error
-
-		// doDadIteration does a single iteration of the DAD loop.
-		//
-		// Returns true if the integrator needs to be informed of DAD
-		// completing.
-		doDadIteration := func() bool {
-			ndp.nic.mu.Lock()
-			defer ndp.nic.mu.Unlock()
-
-			if done {
-				// If we reach this point, it means that the DAD
-				// timer fired after another goroutine already
-				// obtained the NIC lock and stopped DAD before
-				// this function obtained the NIC lock. Simply
-				// return here and do nothing further.
-				return false
-			}
+	// We initially start a timer to fire immediately because some of the DAD work
+	// cannot be done while holding the NIC's lock. This is effectively the same
+	// as starting a goroutine but we use a timer that fires immediately so we can
+	// reset it for the next DAD iteration.
+	timer = time.AfterFunc(0, func() {
+		ndp.nic.mu.RLock()
+		if done {
+			// If we reach this point, it means that the DAD timer fired after
+			// another goroutine already obtained the NIC lock and stopped DAD
+			// before this function obtained the NIC lock. Simply return here and do
+			// nothing further.
+			ndp.nic.mu.RUnlock()
+			return
+		}
 
-			ref, ok := ndp.nic.endpoints[NetworkEndpointID{addr}]
-			if !ok {
-				// This should never happen.
-				// We should have an endpoint for addr since we
-				// are still performing DAD on it. If the
-				// endpoint does not exist, but we are doing DAD
-				// on it, then we started DAD at some point, but
-				// forgot to stop it when the endpoint was
-				// deleted.
-				panic(fmt.Sprintf("ndpdad: unrecognized addr %s for NIC(%d)", addr, ndp.nic.ID()))
-			}
+		if ref.getKind() != permanentTentative {
+			// The endpoint should still be marked as tentative since we are still
+			// performing DAD on it.
+			log.Fatalf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.nic.ID())
+		}
 
-			d, err = ndp.doDuplicateAddressDetection(addr, remaining, ref)
-			if err != nil || d {
-				delete(ndp.dad, addr)
+		dadDone := remaining == 0
+		ndp.nic.mu.RUnlock()
 
-				if err != nil {
-					log.Printf("ndpdad: Error occured during DAD iteration for addr (%s) on NIC(%d); err = %s", addr, ndp.nic.ID(), err)
-				}
+		var err *tcpip.Error
+		if !dadDone {
+			err = ndp.sendDADPacket(addr)
+		}
 
-				// Let the integrator know DAD has completed.
-				return true
-			}
+		ndp.nic.mu.Lock()
+		if done {
+			// If we reach this point, it means that DAD was stopped after we released
+			// the NIC's read lock and before we obtained the write lock.
+			ndp.nic.mu.Unlock()
+			return
+		}
 
+		if dadDone {
+			// DAD has resolved.
+			ref.setKind(permanent)
+		} else if err == nil {
+			// DAD is not done and we had no errors when sending the last NDP NS,
+			// schedule the next DAD timer.
 			remaining--
 			timer.Reset(ndp.nic.stack.ndpConfigs.RetransmitTimer)
-			return false
+
+			ndp.nic.mu.Unlock()
+			return
+		}
+
+		// At this point we know that either DAD is done or we hit an error sending
+		// the last NDP NS. Either way, clean up addr's DAD state and let the
+		// integrator know DAD has completed.
+		delete(ndp.dad, addr)
+		ndp.nic.mu.Unlock()
+
+		if err != nil {
+			log.Printf("ndpdad: error occured during DAD iteration for addr (%s) on NIC(%d); err = %s", addr, ndp.nic.ID(), err)
 		}
 
-		if doDadIteration() && ndp.nic.stack.ndpDisp != nil {
-			ndp.nic.stack.ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, d, err)
+		if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+			ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, dadDone, err)
 		}
 	})
 
@@ -520,45 +526,16 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 	return nil
 }
 
-// doDuplicateAddressDetection is called on every iteration of the timer, and
-// when DAD starts.
-//
-// It handles resolving the address (if there are no more NS to send), or
-// sending the next NS if there are more NS to send.
-//
-// This function must only be called by IPv6 addresses that are currently
-// tentative.
-//
-// The NIC that ndp belongs to (n) MUST be locked.
+// sendDADPacket sends a NS message to see if any nodes on ndp's NIC's link owns
+// addr.
 //
-// Returns true if DAD has resolved; false if DAD is still ongoing.
-func (ndp *ndpState) doDuplicateAddressDetection(addr tcpip.Address, remaining uint8, ref *referencedNetworkEndpoint) (bool, *tcpip.Error) {
-	if ref.getKind() != permanentTentative {
-		// The endpoint should still be marked as tentative
-		// since we are still performing DAD on it.
-		panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.nic.ID()))
-	}
-
-	if remaining == 0 {
-		// DAD has resolved.
-		ref.setKind(permanent)
-		return true, nil
-	}
-
-	// Send a new NS.
+// addr must be a tentative IPv6 address on ndp's NIC.
+func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error {
 	snmc := header.SolicitedNodeAddr(addr)
-	snmcRef, ok := ndp.nic.endpoints[NetworkEndpointID{snmc}]
-	if !ok {
-		// This should never happen as if we have the
-		// address, we should have the solicited-node
-		// address.
-		panic(fmt.Sprintf("ndpdad: NIC(%d) is not in the solicited-node multicast group (%s) but it has addr %s", ndp.nic.ID(), snmc, addr))
-	}
-	snmcRef.incRef()
 
-	// Use the unspecified address as the source address when performing
-	// DAD.
-	r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), snmcRef, false, false)
+	// Use the unspecified address as the source address when performing DAD.
+	ref := ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, forceSpoofing)
+	r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 	defer r.Release()
 
 	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
@@ -569,15 +546,19 @@ func (ndp *ndpState) doDuplicateAddressDetection(addr tcpip.Address, remaining u
 	pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
 	sent := r.Stats().ICMP.V6PacketsSent
-	if err := r.WritePacket(nil, NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: DefaultTOS}, tcpip.PacketBuffer{
-		Header: hdr,
-	}); err != nil {
+	if err := r.WritePacket(nil,
+		NetworkHeaderParams{
+			Protocol: header.ICMPv6ProtocolNumber,
+			TTL:      header.NDPHopLimit,
+			TOS:      DefaultTOS,
+		}, tcpip.PacketBuffer{Header: hdr},
+	); err != nil {
 		sent.Dropped.Increment()
-		return false, err
+		return err
 	}
 	sent.NeighborSolicit.Increment()
 
-	return false, nil
+	return nil
 }
 
 // stopDuplicateAddressDetection ends a running Duplicate Address Detection
@@ -1212,7 +1193,7 @@ func (ndp *ndpState) startSolicitingRouters() {
 
 	ndp.rtrSolicitTimer = time.AfterFunc(delay, func() {
 		// Send an RS message with the unspecified source address.
-		ref := ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, true)
+		ref := ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, forceSpoofing)
 		r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 		defer r.Release()
 
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 1a52e0e68..376681b30 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -301,6 +301,8 @@ func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration s
 // Included in the subtests is a test to make sure that an invalid
 // RetransmitTimer (<1ms) values get fixed to the default RetransmitTimer of 1s.
 func TestDADResolve(t *testing.T) {
+	const nicID = 1
+
 	tests := []struct {
 		name                    string
 		dupAddrDetectTransmits  uint8
@@ -331,44 +333,36 @@ func TestDADResolve(t *testing.T) {
 			opts.NDPConfigs.RetransmitTimer = test.retransTimer
 			opts.NDPConfigs.DupAddrDetectTransmits = test.dupAddrDetectTransmits
 
-			e := channel.New(10, 1280, linkAddr1)
+			e := channel.New(int(test.dupAddrDetectTransmits), 1280, linkAddr1)
 			s := stack.New(opts)
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
 
-			if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
-				t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
-			}
-
-			stat := s.Stats().ICMP.V6PacketsSent.NeighborSolicit
-
-			// Should have sent an NDP NS immediately.
-			if got := stat.Value(); got != 1 {
-				t.Fatalf("got NeighborSolicit = %d, want = 1", got)
-
+			if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
 			}
 
 			// Address should not be considered bound to the NIC yet
 			// (DAD ongoing).
-			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 			}
 			if want := (tcpip.AddressWithPrefix{}); addr != want {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 			}
 
 			// Wait for the remaining time - some delta (500ms), to
 			// make sure the address is still not resolved.
 			const delta = 500 * time.Millisecond
 			time.Sleep(test.expectedRetransmitTimer*time.Duration(test.dupAddrDetectTransmits) - delta)
-			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 			}
 			if want := (tcpip.AddressWithPrefix{}); addr != want {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 			}
 
 			// Wait for DAD to resolve.
@@ -385,8 +379,8 @@ func TestDADResolve(t *testing.T) {
 				if e.err != nil {
 					t.Fatal("got DAD error: ", e.err)
 				}
-				if e.nicID != 1 {
-					t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
+				if e.nicID != nicID {
+					t.Fatalf("got DAD event w/ nicID = %d, want = %d", e.nicID, nicID)
 				}
 				if e.addr != addr1 {
 					t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
@@ -395,16 +389,16 @@ func TestDADResolve(t *testing.T) {
 					t.Fatal("got DAD event w/ resolved = false, want = true")
 				}
 			}
-			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 			}
 			if addr.Address != addr1 {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, addr1)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, addr, addr1)
 			}
 
 			// Should not have sent any more NS messages.
-			if got := stat.Value(); got != uint64(test.dupAddrDetectTransmits) {
+			if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != uint64(test.dupAddrDetectTransmits) {
 				t.Fatalf("got NeighborSolicit = %d, want = %d", got, test.dupAddrDetectTransmits)
 			}
 
@@ -425,7 +419,6 @@ func TestDADResolve(t *testing.T) {
 			}
 		})
 	}
-
 }
 
 // TestDADFail tests to make sure that the DAD process fails if another node is
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index de88c0bfa..79556a36f 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -35,24 +35,21 @@ type NIC struct {
 	linkEP  LinkEndpoint
 	context NICContext
 
-	mu            sync.RWMutex
-	spoofing      bool
-	promiscuous   bool
-	primary       map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint
-	endpoints     map[NetworkEndpointID]*referencedNetworkEndpoint
-	addressRanges []tcpip.Subnet
-	mcastJoins    map[NetworkEndpointID]int32
-	// packetEPs is protected by mu, but the contained PacketEndpoint
-	// values are not.
-	packetEPs map[tcpip.NetworkProtocolNumber][]PacketEndpoint
-
 	stats NICStats
 
-	// ndp is the NDP related state for NIC.
-	//
-	// Note, read and write operations on ndp require that the NIC is
-	// appropriately locked.
-	ndp ndpState
+	mu struct {
+		sync.RWMutex
+		spoofing      bool
+		promiscuous   bool
+		primary       map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint
+		endpoints     map[NetworkEndpointID]*referencedNetworkEndpoint
+		addressRanges []tcpip.Subnet
+		mcastJoins    map[NetworkEndpointID]int32
+		// packetEPs is protected by mu, but the contained PacketEndpoint
+		// values are not.
+		packetEPs map[tcpip.NetworkProtocolNumber][]PacketEndpoint
+		ndp       ndpState
+	}
 }
 
 // NICStats includes transmitted and received stats.
@@ -97,15 +94,11 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 	// of IPv6 is supported on this endpoint's LinkEndpoint.
 
 	nic := &NIC{
-		stack:      stack,
-		id:         id,
-		name:       name,
-		linkEP:     ep,
-		context:    ctx,
-		primary:    make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint),
-		endpoints:  make(map[NetworkEndpointID]*referencedNetworkEndpoint),
-		mcastJoins: make(map[NetworkEndpointID]int32),
-		packetEPs:  make(map[tcpip.NetworkProtocolNumber][]PacketEndpoint),
+		stack:   stack,
+		id:      id,
+		name:    name,
+		linkEP:  ep,
+		context: ctx,
 		stats: NICStats{
 			Tx: DirectionStats{
 				Packets: &tcpip.StatCounter{},
@@ -116,22 +109,26 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 				Bytes:   &tcpip.StatCounter{},
 			},
 		},
-		ndp: ndpState{
-			configs:          stack.ndpConfigs,
-			dad:              make(map[tcpip.Address]dadState),
-			defaultRouters:   make(map[tcpip.Address]defaultRouterState),
-			onLinkPrefixes:   make(map[tcpip.Subnet]onLinkPrefixState),
-			autoGenAddresses: make(map[tcpip.Address]autoGenAddressState),
-		},
 	}
-	nic.ndp.nic = nic
+	nic.mu.primary = make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint)
+	nic.mu.endpoints = make(map[NetworkEndpointID]*referencedNetworkEndpoint)
+	nic.mu.mcastJoins = make(map[NetworkEndpointID]int32)
+	nic.mu.packetEPs = make(map[tcpip.NetworkProtocolNumber][]PacketEndpoint)
+	nic.mu.ndp = ndpState{
+		nic:              nic,
+		configs:          stack.ndpConfigs,
+		dad:              make(map[tcpip.Address]dadState),
+		defaultRouters:   make(map[tcpip.Address]defaultRouterState),
+		onLinkPrefixes:   make(map[tcpip.Subnet]onLinkPrefixState),
+		autoGenAddresses: make(map[tcpip.Address]autoGenAddressState),
+	}
 
 	// Register supported packet endpoint protocols.
 	for _, netProto := range header.Ethertypes {
-		nic.packetEPs[netProto] = []PacketEndpoint{}
+		nic.mu.packetEPs[netProto] = []PacketEndpoint{}
 	}
 	for _, netProto := range stack.networkProtocols {
-		nic.packetEPs[netProto.Number()] = []PacketEndpoint{}
+		nic.mu.packetEPs[netProto.Number()] = []PacketEndpoint{}
 	}
 
 	return nic
@@ -215,7 +212,7 @@ func (n *NIC) enable() *tcpip.Error {
 	// and default routers). Therefore, soliciting RAs from other routers on
 	// a link is unnecessary for routers.
 	if !n.stack.forwarding {
-		n.ndp.startSolicitingRouters()
+		n.mu.ndp.startSolicitingRouters()
 	}
 
 	return nil
@@ -230,8 +227,8 @@ func (n *NIC) becomeIPv6Router() {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	n.ndp.cleanupHostOnlyState()
-	n.ndp.stopSolicitingRouters()
+	n.mu.ndp.cleanupHostOnlyState()
+	n.mu.ndp.stopSolicitingRouters()
 }
 
 // becomeIPv6Host transitions n into an IPv6 host.
@@ -242,7 +239,7 @@ func (n *NIC) becomeIPv6Host() {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	n.ndp.startSolicitingRouters()
+	n.mu.ndp.startSolicitingRouters()
 }
 
 // attachLinkEndpoint attaches the NIC to the endpoint, which will enable it
@@ -254,13 +251,13 @@ func (n *NIC) attachLinkEndpoint() {
 // setPromiscuousMode enables or disables promiscuous mode.
 func (n *NIC) setPromiscuousMode(enable bool) {
 	n.mu.Lock()
-	n.promiscuous = enable
+	n.mu.promiscuous = enable
 	n.mu.Unlock()
 }
 
 func (n *NIC) isPromiscuousMode() bool {
 	n.mu.RLock()
-	rv := n.promiscuous
+	rv := n.mu.promiscuous
 	n.mu.RUnlock()
 	return rv
 }
@@ -272,7 +269,7 @@ func (n *NIC) isLoopback() bool {
 // setSpoofing enables or disables address spoofing.
 func (n *NIC) setSpoofing(enable bool) {
 	n.mu.Lock()
-	n.spoofing = enable
+	n.mu.spoofing = enable
 	n.mu.Unlock()
 }
 
@@ -291,8 +288,8 @@ func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber, remoteAddr t
 	defer n.mu.RUnlock()
 
 	var deprecatedEndpoint *referencedNetworkEndpoint
-	for _, r := range n.primary[protocol] {
-		if !r.isValidForOutgoing() {
+	for _, r := range n.mu.primary[protocol] {
+		if !r.isValidForOutgoingRLocked() {
 			continue
 		}
 
@@ -342,7 +339,7 @@ func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEn
 	n.mu.RLock()
 	defer n.mu.RUnlock()
 
-	primaryAddrs := n.primary[header.IPv6ProtocolNumber]
+	primaryAddrs := n.mu.primary[header.IPv6ProtocolNumber]
 
 	if len(primaryAddrs) == 0 {
 		return nil
@@ -425,7 +422,7 @@ func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEn
 // hasPermanentAddrLocked returns true if n has a permanent (including currently
 // tentative) address, addr.
 func (n *NIC) hasPermanentAddrLocked(addr tcpip.Address) bool {
-	ref, ok := n.endpoints[NetworkEndpointID{addr}]
+	ref, ok := n.mu.endpoints[NetworkEndpointID{addr}]
 
 	if !ok {
 		return false
@@ -436,24 +433,54 @@ func (n *NIC) hasPermanentAddrLocked(addr tcpip.Address) bool {
 	return kind == permanent || kind == permanentTentative
 }
 
+type getRefBehaviour int
+
+const (
+	// spoofing indicates that the NIC's spoofing flag should be observed when
+	// getting a NIC's referenced network endpoint.
+	spoofing getRefBehaviour = iota
+
+	// promiscuous indicates that the NIC's promiscuous flag should be observed
+	// when getting a NIC's referenced network endpoint.
+	promiscuous
+
+	// forceSpoofing indicates that the NIC should be assumed to be spoofing,
+	// regardless of what the NIC's spoofing flag is when getting a NIC's
+	// referenced network endpoint.
+	forceSpoofing
+)
+
 func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *referencedNetworkEndpoint {
-	return n.getRefOrCreateTemp(protocol, dst, CanBePrimaryEndpoint, n.promiscuous)
+	return n.getRefOrCreateTemp(protocol, dst, CanBePrimaryEndpoint, promiscuous)
 }
 
 // findEndpoint finds the endpoint, if any, with the given address.
 func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) *referencedNetworkEndpoint {
-	return n.getRefOrCreateTemp(protocol, address, peb, n.spoofing)
+	return n.getRefOrCreateTemp(protocol, address, peb, spoofing)
 }
 
 // getRefEpOrCreateTemp returns the referenced network endpoint for the given
-// protocol and address. If none exists a temporary one may be created if
-// we are in promiscuous mode or spoofing.
-func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior, spoofingOrPromiscuous bool) *referencedNetworkEndpoint {
+// protocol and address.
+//
+// If none exists a temporary one may be created if we are in promiscuous mode
+// or spoofing. Promiscuous mode will only be checked if promiscuous is true.
+// Similarly, spoofing will only be checked if spoofing is true.
+func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior, tempRef getRefBehaviour) *referencedNetworkEndpoint {
 	id := NetworkEndpointID{address}
 
 	n.mu.RLock()
 
-	if ref, ok := n.endpoints[id]; ok {
+	var spoofingOrPromiscuous bool
+	switch tempRef {
+	case spoofing:
+		spoofingOrPromiscuous = n.mu.spoofing
+	case promiscuous:
+		spoofingOrPromiscuous = n.mu.promiscuous
+	case forceSpoofing:
+		spoofingOrPromiscuous = true
+	}
+
+	if ref, ok := n.mu.endpoints[id]; ok {
 		// An endpoint with this id exists, check if it can be used and return it.
 		switch ref.getKind() {
 		case permanentExpired:
@@ -474,7 +501,7 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 	// the caller or if the address is found in the NIC's subnets.
 	createTempEP := spoofingOrPromiscuous
 	if !createTempEP {
-		for _, sn := range n.addressRanges {
+		for _, sn := range n.mu.addressRanges {
 			// Skip the subnet address.
 			if address == sn.ID() {
 				continue
@@ -502,7 +529,7 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 	// endpoint, create a new "temporary" endpoint. It will only exist while
 	// there's a route through it.
 	n.mu.Lock()
-	if ref, ok := n.endpoints[id]; ok {
+	if ref, ok := n.mu.endpoints[id]; ok {
 		// No need to check the type as we are ok with expired endpoints at this
 		// point.
 		if ref.tryIncRef() {
@@ -543,7 +570,7 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 
 	// Sanity check.
 	id := NetworkEndpointID{LocalAddress: protocolAddress.AddressWithPrefix.Address}
-	if ref, ok := n.endpoints[id]; ok {
+	if ref, ok := n.mu.endpoints[id]; ok {
 		// Endpoint already exists.
 		if kind != permanent {
 			return nil, tcpip.ErrDuplicateAddress
@@ -562,7 +589,7 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 				ref.deprecated = deprecated
 				ref.configType = configType
 
-				refs := n.primary[ref.protocol]
+				refs := n.mu.primary[ref.protocol]
 				for i, r := range refs {
 					if r == ref {
 						switch peb {
@@ -572,9 +599,9 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 							if i == 0 {
 								return ref, nil
 							}
-							n.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+							n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
 						case NeverPrimaryEndpoint:
-							n.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+							n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
 							return ref, nil
 						}
 					}
@@ -637,13 +664,13 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 		}
 	}
 
-	n.endpoints[id] = ref
+	n.mu.endpoints[id] = ref
 
 	n.insertPrimaryEndpointLocked(ref, peb)
 
 	// If we are adding a tentative IPv6 address, start DAD.
 	if isIPv6Unicast && kind == permanentTentative {
-		if err := n.ndp.startDuplicateAddressDetection(protocolAddress.AddressWithPrefix.Address, ref); err != nil {
+		if err := n.mu.ndp.startDuplicateAddressDetection(protocolAddress.AddressWithPrefix.Address, ref); err != nil {
 			return nil, err
 		}
 	}
@@ -668,8 +695,8 @@ func (n *NIC) AllAddresses() []tcpip.ProtocolAddress {
 	n.mu.RLock()
 	defer n.mu.RUnlock()
 
-	addrs := make([]tcpip.ProtocolAddress, 0, len(n.endpoints))
-	for nid, ref := range n.endpoints {
+	addrs := make([]tcpip.ProtocolAddress, 0, len(n.mu.endpoints))
+	for nid, ref := range n.mu.endpoints {
 		// Don't include tentative, expired or temporary endpoints to
 		// avoid confusion and prevent the caller from using those.
 		switch ref.getKind() {
@@ -695,7 +722,7 @@ func (n *NIC) PrimaryAddresses() []tcpip.ProtocolAddress {
 	defer n.mu.RUnlock()
 
 	var addrs []tcpip.ProtocolAddress
-	for proto, list := range n.primary {
+	for proto, list := range n.mu.primary {
 		for _, ref := range list {
 			// Don't include tentative, expired or tempory endpoints
 			// to avoid confusion and prevent the caller from using
@@ -726,7 +753,7 @@ func (n *NIC) primaryAddress(proto tcpip.NetworkProtocolNumber) tcpip.AddressWit
 	n.mu.RLock()
 	defer n.mu.RUnlock()
 
-	list, ok := n.primary[proto]
+	list, ok := n.mu.primary[proto]
 	if !ok {
 		return tcpip.AddressWithPrefix{}
 	}
@@ -769,7 +796,7 @@ func (n *NIC) primaryAddress(proto tcpip.NetworkProtocolNumber) tcpip.AddressWit
 // address.
 func (n *NIC) AddAddressRange(protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) {
 	n.mu.Lock()
-	n.addressRanges = append(n.addressRanges, subnet)
+	n.mu.addressRanges = append(n.mu.addressRanges, subnet)
 	n.mu.Unlock()
 }
 
@@ -778,13 +805,13 @@ func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) {
 	n.mu.Lock()
 
 	// Use the same underlying array.
-	tmp := n.addressRanges[:0]
-	for _, sub := range n.addressRanges {
+	tmp := n.mu.addressRanges[:0]
+	for _, sub := range n.mu.addressRanges {
 		if sub != subnet {
 			tmp = append(tmp, sub)
 		}
 	}
-	n.addressRanges = tmp
+	n.mu.addressRanges = tmp
 
 	n.mu.Unlock()
 }
@@ -793,8 +820,8 @@ func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) {
 func (n *NIC) AddressRanges() []tcpip.Subnet {
 	n.mu.RLock()
 	defer n.mu.RUnlock()
-	sns := make([]tcpip.Subnet, 0, len(n.addressRanges)+len(n.endpoints))
-	for nid := range n.endpoints {
+	sns := make([]tcpip.Subnet, 0, len(n.mu.addressRanges)+len(n.mu.endpoints))
+	for nid := range n.mu.endpoints {
 		sn, err := tcpip.NewSubnet(nid.LocalAddress, tcpip.AddressMask(strings.Repeat("\xff", len(nid.LocalAddress))))
 		if err != nil {
 			// This should never happen as the mask has been carefully crafted to
@@ -803,7 +830,7 @@ func (n *NIC) AddressRanges() []tcpip.Subnet {
 		}
 		sns = append(sns, sn)
 	}
-	return append(sns, n.addressRanges...)
+	return append(sns, n.mu.addressRanges...)
 }
 
 // insertPrimaryEndpointLocked adds r to n's primary endpoint list as required
@@ -813,9 +840,9 @@ func (n *NIC) AddressRanges() []tcpip.Subnet {
 func (n *NIC) insertPrimaryEndpointLocked(r *referencedNetworkEndpoint, peb PrimaryEndpointBehavior) {
 	switch peb {
 	case CanBePrimaryEndpoint:
-		n.primary[r.protocol] = append(n.primary[r.protocol], r)
+		n.mu.primary[r.protocol] = append(n.mu.primary[r.protocol], r)
 	case FirstPrimaryEndpoint:
-		n.primary[r.protocol] = append([]*referencedNetworkEndpoint{r}, n.primary[r.protocol]...)
+		n.mu.primary[r.protocol] = append([]*referencedNetworkEndpoint{r}, n.mu.primary[r.protocol]...)
 	}
 }
 
@@ -827,7 +854,7 @@ func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) {
 	// and was waiting (on the lock) to be removed and 2) the same address was
 	// re-added in the meantime by removing this endpoint from the list and
 	// adding a new one.
-	if n.endpoints[id] != r {
+	if n.mu.endpoints[id] != r {
 		return
 	}
 
@@ -835,11 +862,11 @@ func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) {
 		panic("Reference count dropped to zero before being removed")
 	}
 
-	delete(n.endpoints, id)
-	refs := n.primary[r.protocol]
+	delete(n.mu.endpoints, id)
+	refs := n.mu.primary[r.protocol]
 	for i, ref := range refs {
 		if ref == r {
-			n.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+			n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
 			break
 		}
 	}
@@ -854,7 +881,7 @@ func (n *NIC) removeEndpoint(r *referencedNetworkEndpoint) {
 }
 
 func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
-	r, ok := n.endpoints[NetworkEndpointID{addr}]
+	r, ok := n.mu.endpoints[NetworkEndpointID{addr}]
 	if !ok {
 		return tcpip.ErrBadLocalAddress
 	}
@@ -870,13 +897,13 @@ func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
 		// If we are removing a tentative IPv6 unicast address, stop
 		// DAD.
 		if kind == permanentTentative {
-			n.ndp.stopDuplicateAddressDetection(addr)
+			n.mu.ndp.stopDuplicateAddressDetection(addr)
 		}
 
 		// If we are removing an address generated via SLAAC, cleanup
 		// its SLAAC resources and notify the integrator.
 		if r.configType == slaac {
-			n.ndp.cleanupAutoGenAddrResourcesAndNotify(addr)
+			n.mu.ndp.cleanupAutoGenAddrResourcesAndNotify(addr)
 		}
 	}
 
@@ -926,7 +953,7 @@ func (n *NIC) joinGroupLocked(protocol tcpip.NetworkProtocolNumber, addr tcpip.A
 	// outlined in RFC 3810 section 5.
 
 	id := NetworkEndpointID{addr}
-	joins := n.mcastJoins[id]
+	joins := n.mu.mcastJoins[id]
 	if joins == 0 {
 		netProto, ok := n.stack.networkProtocols[protocol]
 		if !ok {
@@ -942,7 +969,7 @@ func (n *NIC) joinGroupLocked(protocol tcpip.NetworkProtocolNumber, addr tcpip.A
 			return err
 		}
 	}
-	n.mcastJoins[id] = joins + 1
+	n.mu.mcastJoins[id] = joins + 1
 	return nil
 }
 
@@ -960,7 +987,7 @@ func (n *NIC) leaveGroup(addr tcpip.Address) *tcpip.Error {
 // before leaveGroupLocked is called.
 func (n *NIC) leaveGroupLocked(addr tcpip.Address) *tcpip.Error {
 	id := NetworkEndpointID{addr}
-	joins := n.mcastJoins[id]
+	joins := n.mu.mcastJoins[id]
 	switch joins {
 	case 0:
 		// There are no joins with this address on this NIC.
@@ -971,7 +998,7 @@ func (n *NIC) leaveGroupLocked(addr tcpip.Address) *tcpip.Error {
 			return err
 		}
 	}
-	n.mcastJoins[id] = joins - 1
+	n.mu.mcastJoins[id] = joins - 1
 	return nil
 }
 
@@ -1006,12 +1033,12 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 
 	// Are any packet sockets listening for this network protocol?
 	n.mu.RLock()
-	packetEPs := n.packetEPs[protocol]
+	packetEPs := n.mu.packetEPs[protocol]
 	// Check whether there are packet sockets listening for every protocol.
 	// If we received a packet with protocol EthernetProtocolAll, then the
 	// previous for loop will have handled it.
 	if protocol != header.EthernetProtocolAll {
-		packetEPs = append(packetEPs, n.packetEPs[header.EthernetProtocolAll]...)
+		packetEPs = append(packetEPs, n.mu.packetEPs[header.EthernetProtocolAll]...)
 	}
 	n.mu.RUnlock()
 	for _, ep := range packetEPs {
@@ -1060,8 +1087,8 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 		// Found a NIC.
 		n := r.ref.nic
 		n.mu.RLock()
-		ref, ok := n.endpoints[NetworkEndpointID{dst}]
-		ok = ok && ref.isValidForOutgoing() && ref.tryIncRef()
+		ref, ok := n.mu.endpoints[NetworkEndpointID{dst}]
+		ok = ok && ref.isValidForOutgoingRLocked() && ref.tryIncRef()
 		n.mu.RUnlock()
 		if ok {
 			r.RemoteAddress = src
@@ -1181,7 +1208,7 @@ func (n *NIC) Stack() *Stack {
 // false. It will only return true if the address is associated with the NIC
 // AND it is tentative.
 func (n *NIC) isAddrTentative(addr tcpip.Address) bool {
-	ref, ok := n.endpoints[NetworkEndpointID{addr}]
+	ref, ok := n.mu.endpoints[NetworkEndpointID{addr}]
 	if !ok {
 		return false
 	}
@@ -1197,7 +1224,7 @@ func (n *NIC) dupTentativeAddrDetected(addr tcpip.Address) *tcpip.Error {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	ref, ok := n.endpoints[NetworkEndpointID{addr}]
+	ref, ok := n.mu.endpoints[NetworkEndpointID{addr}]
 	if !ok {
 		return tcpip.ErrBadAddress
 	}
@@ -1217,7 +1244,7 @@ func (n *NIC) setNDPConfigs(c NDPConfigurations) {
 	c.validate()
 
 	n.mu.Lock()
-	n.ndp.configs = c
+	n.mu.ndp.configs = c
 	n.mu.Unlock()
 }
 
@@ -1226,7 +1253,7 @@ func (n *NIC) handleNDPRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	n.ndp.handleRA(ip, ra)
+	n.mu.ndp.handleRA(ip, ra)
 }
 
 type networkEndpointKind int32
@@ -1268,11 +1295,11 @@ func (n *NIC) registerPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep Pa
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	eps, ok := n.packetEPs[netProto]
+	eps, ok := n.mu.packetEPs[netProto]
 	if !ok {
 		return tcpip.ErrNotSupported
 	}
-	n.packetEPs[netProto] = append(eps, ep)
+	n.mu.packetEPs[netProto] = append(eps, ep)
 
 	return nil
 }
@@ -1281,14 +1308,14 @@ func (n *NIC) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	eps, ok := n.packetEPs[netProto]
+	eps, ok := n.mu.packetEPs[netProto]
 	if !ok {
 		return
 	}
 
 	for i, epOther := range eps {
 		if epOther == ep {
-			n.packetEPs[netProto] = append(eps[:i], eps[i+1:]...)
+			n.mu.packetEPs[netProto] = append(eps[:i], eps[i+1:]...)
 			return
 		}
 	}
@@ -1346,14 +1373,19 @@ func (r *referencedNetworkEndpoint) setKind(kind networkEndpointKind) {
 // packet. It requires the endpoint to not be marked expired (i.e., its address
 // has been removed), or the NIC to be in spoofing mode.
 func (r *referencedNetworkEndpoint) isValidForOutgoing() bool {
-	return r.getKind() != permanentExpired || r.nic.spoofing
+	r.nic.mu.RLock()
+	defer r.nic.mu.RUnlock()
+
+	return r.isValidForOutgoingRLocked()
 }
 
-// isValidForIncoming returns true if the endpoint can accept an incoming
-// packet. It requires the endpoint to not be marked expired (i.e., its address
-// has been removed), or the NIC to be in promiscuous mode.
-func (r *referencedNetworkEndpoint) isValidForIncoming() bool {
-	return r.getKind() != permanentExpired || r.nic.promiscuous
+// isValidForOutgoingRLocked returns true if the endpoint can be used to send
+// out a packet. It requires the endpoint to not be marked expired (i.e., its
+// address has been removed), or the NIC to be in spoofing mode.
+//
+// r's NIC must be read locked.
+func (r *referencedNetworkEndpoint) isValidForOutgoingRLocked() bool {
+	return r.getKind() != permanentExpired || r.nic.mu.spoofing
 }
 
 // decRef decrements the ref count and cleans up the endpoint once it reaches
-- 
cgit v1.2.3


From 896bd654b6622d20cbaf8e82b4554a5375addf81 Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Wed, 22 Jan 2020 15:14:43 -0800
Subject: De-duplicate common test functionality for VFS2 filesystems.

PiperOrigin-RevId: 291041576
---
 pkg/sentry/fsimpl/kernfs/kernfs_test.go |  12 +-
 pkg/sentry/fsimpl/proc/tasks_test.go    | 411 +++++++++++---------------------
 pkg/sentry/fsimpl/sys/sys_test.go       |   4 +-
 pkg/sentry/fsimpl/testutil/testutil.go  | 149 ++++++++++--
 4 files changed, 278 insertions(+), 298 deletions(-)

diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index aa3fe76ee..fade59491 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -212,7 +212,7 @@ func TestMkdirGetDentry(t *testing.T) {
 	defer sys.Destroy()
 
 	pop := sys.PathOpAtRoot("dir1/a new directory")
-	if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, &pop, &vfs.MkdirOptions{Mode: 0755}); err != nil {
+	if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, pop, &vfs.MkdirOptions{Mode: 0755}); err != nil {
 		t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err)
 	}
 	sys.GetDentryOrDie(pop).DecRef()
@@ -227,7 +227,7 @@ func TestReadStaticFile(t *testing.T) {
 	defer sys.Destroy()
 
 	pop := sys.PathOpAtRoot("file1")
-	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{
 		Flags: linux.O_RDONLY,
 	})
 	if err != nil {
@@ -254,7 +254,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) {
 
 	pop := sys.PathOpAtRoot("dir1/newfile")
 	opts := &vfs.OpenOptions{Flags: linux.O_CREAT | linux.O_EXCL, Mode: defaultMode}
-	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, opts)
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, opts)
 	if err != nil {
 		t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err)
 	}
@@ -262,7 +262,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) {
 	// Close the file. The file should persist.
 	fd.DecRef()
 
-	fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{
+	fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{
 		Flags: linux.O_RDONLY,
 	})
 	if err != nil {
@@ -278,7 +278,7 @@ func TestDirFDReadWrite(t *testing.T) {
 	defer sys.Destroy()
 
 	pop := sys.PathOpAtRoot("/")
-	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{
 		Flags: linux.O_RDONLY,
 	})
 	if err != nil {
@@ -309,7 +309,7 @@ func TestDirFDIterDirents(t *testing.T) {
 	defer sys.Destroy()
 
 	pop := sys.PathOpAtRoot("/")
-	sys.AssertDirectoryContains(&pop, map[string]testutil.DirentType{
+	sys.AssertAllDirentTypes(sys.ListDirents(pop), map[string]testutil.DirentType{
 		"dir1":  linux.DT_DIR,
 		"dir2":  linux.DT_DIR,
 		"file1": linux.DT_REG,
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 41977d816..2c1635f33 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -44,100 +44,47 @@ var (
 	proc3 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 3 + 1}
 )
 
-type testIterDirentsCallback struct {
-	dirents []vfs.Dirent
-}
-
-func (t *testIterDirentsCallback) Handle(d vfs.Dirent) bool {
-	t.dirents = append(t.dirents, d)
-	return true
-}
-
-func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) {
-	if got := len(dirs); got < 2 {
-		return dirs, fmt.Errorf("wrong number of dirents, want at least: 2, got: %d: %v", got, dirs)
-	}
-	for i, want := range []string{".", ".."} {
-		if got := dirs[i].Name; got != want {
-			return dirs, fmt.Errorf("wrong name, want: %s, got: %s", want, got)
-		}
-		if got := dirs[i].Type; got != linux.DT_DIR {
-			return dirs, fmt.Errorf("wrong type, want: %d, got: %d", linux.DT_DIR, got)
-		}
-	}
-	return dirs[2:], nil
-}
-
-func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
-	wants := map[string]vfs.Dirent{
-		"cpuinfo":     {Type: linux.DT_REG},
-		"loadavg":     {Type: linux.DT_REG},
-		"meminfo":     {Type: linux.DT_REG},
-		"mounts":      {Type: linux.DT_LNK},
-		"net":         {Type: linux.DT_DIR},
-		"self":        selfLink,
-		"stat":        {Type: linux.DT_REG},
-		"sys":         {Type: linux.DT_DIR},
-		"thread-self": threadSelfLink,
-		"uptime":      {Type: linux.DT_REG},
-		"version":     {Type: linux.DT_REG},
-	}
-	return checkFiles(gots, wants)
-}
-
-func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
-	wants := map[string]vfs.Dirent{
-		"auxv":    {Type: linux.DT_REG},
-		"cgroup":  {Type: linux.DT_REG},
-		"cmdline": {Type: linux.DT_REG},
-		"comm":    {Type: linux.DT_REG},
-		"environ": {Type: linux.DT_REG},
-		"gid_map": {Type: linux.DT_REG},
-		"io":      {Type: linux.DT_REG},
-		"maps":    {Type: linux.DT_REG},
-		"ns":      {Type: linux.DT_DIR},
-		"smaps":   {Type: linux.DT_REG},
-		"stat":    {Type: linux.DT_REG},
-		"statm":   {Type: linux.DT_REG},
-		"status":  {Type: linux.DT_REG},
-		"task":    {Type: linux.DT_DIR},
-		"uid_map": {Type: linux.DT_REG},
-	}
-	return checkFiles(gots, wants)
-}
-
-func checkFiles(gots []vfs.Dirent, wants map[string]vfs.Dirent) ([]vfs.Dirent, error) {
-	// Go over all files, when there is a match, the file is removed from both
-	// 'gots' and 'wants'. wants is expected to reach 0, as all files must
-	// be present. Remaining files in 'gots', is returned to caller to decide
-	// whether this is valid or not.
-	for i := 0; i < len(gots); i++ {
-		got := gots[i]
-		want, ok := wants[got.Name]
-		if !ok {
-			continue
-		}
-		if want.Type != got.Type {
-			return gots, fmt.Errorf("wrong file type, want: %v, got: %v: %+v", want.Type, got.Type, got)
-		}
-		if want.NextOff != 0 && want.NextOff != got.NextOff {
-			return gots, fmt.Errorf("wrong dirent offset, want: %v, got: %v: %+v", want.NextOff, got.NextOff, got)
-		}
-
-		delete(wants, got.Name)
-		gots = append(gots[0:i], gots[i+1:]...)
-		i--
-	}
-	if len(wants) != 0 {
-		return gots, fmt.Errorf("not all files were found, missing: %+v", wants)
+var (
+	tasksStaticFiles = map[string]testutil.DirentType{
+		"cpuinfo":     linux.DT_REG,
+		"loadavg":     linux.DT_REG,
+		"meminfo":     linux.DT_REG,
+		"mounts":      linux.DT_LNK,
+		"net":         linux.DT_DIR,
+		"self":        linux.DT_LNK,
+		"stat":        linux.DT_REG,
+		"sys":         linux.DT_DIR,
+		"thread-self": linux.DT_LNK,
+		"uptime":      linux.DT_REG,
+		"version":     linux.DT_REG,
+	}
+	tasksStaticFilesNextOffs = map[string]int64{
+		"self":        selfLink.NextOff,
+		"thread-self": threadSelfLink.NextOff,
+	}
+	taskStaticFiles = map[string]testutil.DirentType{
+		"auxv":    linux.DT_REG,
+		"cgroup":  linux.DT_REG,
+		"cmdline": linux.DT_REG,
+		"comm":    linux.DT_REG,
+		"environ": linux.DT_REG,
+		"gid_map": linux.DT_REG,
+		"io":      linux.DT_REG,
+		"maps":    linux.DT_REG,
+		"ns":      linux.DT_DIR,
+		"smaps":   linux.DT_REG,
+		"stat":    linux.DT_REG,
+		"statm":   linux.DT_REG,
+		"status":  linux.DT_REG,
+		"task":    linux.DT_DIR,
+		"uid_map": linux.DT_REG,
 	}
-	return gots, nil
-}
+)
 
-func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) {
+func setup(t *testing.T) *testutil.System {
 	k, err := testutil.Boot()
 	if err != nil {
-		return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("creating kernel: %v", err)
+		t.Fatalf("Error creating kernel: %v", err)
 	}
 
 	ctx := k.SupervisorContext()
@@ -157,93 +104,60 @@ func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error)
 	}
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &fsOpts)
 	if err != nil {
-		return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err)
+		t.Fatalf("NewMountNamespace(): %v", err)
 	}
-	return ctx, vfsObj, mntns.Root(), nil
+	return testutil.NewSystem(ctx, t, vfsObj, mntns)
 }
 
 func TestTasksEmpty(t *testing.T) {
-	ctx, vfsObj, root, err := setup()
-	if err != nil {
-		t.Fatalf("Setup failed: %v", err)
-	}
-	defer root.DecRef()
-
-	fd, err := vfsObj.OpenAt(
-		ctx,
-		auth.CredentialsFromContext(ctx),
-		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
-		&vfs.OpenOptions{},
-	)
-	if err != nil {
-		t.Fatalf("vfsfs.OpenAt failed: %v", err)
-	}
+	s := setup(t)
+	defer s.Destroy()
 
-	cb := testIterDirentsCallback{}
-	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
-		t.Fatalf("IterDirents(): %v", err)
-	}
-	cb.dirents, err = checkDots(cb.dirents)
-	if err != nil {
-		t.Error(err.Error())
-	}
-	cb.dirents, err = checkTasksStaticFiles(cb.dirents)
-	if err != nil {
-		t.Error(err.Error())
-	}
-	if len(cb.dirents) != 0 {
-		t.Errorf("found more files than expected: %+v", cb.dirents)
-	}
+	collector := s.ListDirents(s.PathOpAtRoot("/"))
+	s.AssertAllDirentTypes(collector, tasksStaticFiles)
+	s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs)
 }
 
 func TestTasks(t *testing.T) {
-	ctx, vfsObj, root, err := setup()
-	if err != nil {
-		t.Fatalf("Setup failed: %v", err)
+	s := setup(t)
+	defer s.Destroy()
+
+	expectedDirents := make(map[string]testutil.DirentType)
+	for n, d := range tasksStaticFiles {
+		expectedDirents[n] = d
 	}
-	defer root.DecRef()
 
-	k := kernel.KernelFromContext(ctx)
+	k := kernel.KernelFromContext(s.Ctx)
 	var tasks []*kernel.Task
 	for i := 0; i < 5; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		task, err := testutil.CreateTask(ctx, fmt.Sprintf("name-%d", i), tc)
+		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc)
 		if err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
 		tasks = append(tasks, task)
+		expectedDirents[fmt.Sprintf("%d", i+1)] = linux.DT_DIR
 	}
 
-	fd, err := vfsObj.OpenAt(
-		ctx,
-		auth.CredentialsFromContext(ctx),
-		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
-		&vfs.OpenOptions{},
-	)
-	if err != nil {
-		t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
-	}
+	collector := s.ListDirents(s.PathOpAtRoot("/"))
+	s.AssertAllDirentTypes(collector, expectedDirents)
+	s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs)
 
-	cb := testIterDirentsCallback{}
-	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
-		t.Fatalf("IterDirents(): %v", err)
-	}
-	cb.dirents, err = checkDots(cb.dirents)
-	if err != nil {
-		t.Error(err.Error())
-	}
-	cb.dirents, err = checkTasksStaticFiles(cb.dirents)
-	if err != nil {
-		t.Error(err.Error())
-	}
 	lastPid := 0
-	for _, d := range cb.dirents {
+	dirents := collector.OrderedDirents()
+	doneSkippingNonTaskDirs := false
+	for _, d := range dirents {
 		pid, err := strconv.Atoi(d.Name)
 		if err != nil {
+			if !doneSkippingNonTaskDirs {
+				// We haven't gotten to the task dirs yet.
+				continue
+			}
 			t.Fatalf("Invalid process directory %q", d.Name)
 		}
+		doneSkippingNonTaskDirs = true
 		if lastPid > pid {
-			t.Errorf("pids not in order: %v", cb.dirents)
+			t.Errorf("pids not in order: %v", dirents)
 		}
 		found := false
 		for _, t := range tasks {
@@ -260,13 +174,16 @@ func TestTasks(t *testing.T) {
 			t.Errorf("Wrong dirent offset want: %d got: %d: %+v", want, d.NextOff, d)
 		}
 	}
+	if !doneSkippingNonTaskDirs {
+		t.Fatalf("Never found any process directories.")
+	}
 
 	// Test lookup.
 	for _, path := range []string{"/1", "/2"} {
-		fd, err := vfsObj.OpenAt(
-			ctx,
-			auth.CredentialsFromContext(ctx),
-			&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(path)},
+		fd, err := s.VFS.OpenAt(
+			s.Ctx,
+			s.Creds,
+			s.PathOpAtRoot(path),
 			&vfs.OpenOptions{},
 		)
 		if err != nil {
@@ -274,15 +191,15 @@ func TestTasks(t *testing.T) {
 		}
 		buf := make([]byte, 1)
 		bufIOSeq := usermem.BytesIOSequence(buf)
-		if _, err := fd.Read(ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR {
+		if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR {
 			t.Errorf("wrong error reading directory: %v", err)
 		}
 	}
 
-	if _, err := vfsObj.OpenAt(
-		ctx,
-		auth.CredentialsFromContext(ctx),
-		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/9999")},
+	if _, err := s.VFS.OpenAt(
+		s.Ctx,
+		s.Creds,
+		s.PathOpAtRoot("/9999"),
 		&vfs.OpenOptions{},
 	); err != syserror.ENOENT {
 		t.Fatalf("wrong error from vfsfs.OpenAt(/9999): %v", err)
@@ -290,16 +207,13 @@ func TestTasks(t *testing.T) {
 }
 
 func TestTasksOffset(t *testing.T) {
-	ctx, vfsObj, root, err := setup()
-	if err != nil {
-		t.Fatalf("Setup failed: %v", err)
-	}
-	defer root.DecRef()
+	s := setup(t)
+	defer s.Destroy()
 
-	k := kernel.KernelFromContext(ctx)
+	k := kernel.KernelFromContext(s.Ctx)
 	for i := 0; i < 3; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		if _, err := testutil.CreateTask(ctx, fmt.Sprintf("name-%d", i), tc); err != nil {
+		if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc); err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
 	}
@@ -382,134 +296,100 @@ func TestTasksOffset(t *testing.T) {
 		},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
-			fd, err := vfsObj.OpenAt(
-				ctx,
-				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+			s := s.WithSubtest(t)
+			fd, err := s.VFS.OpenAt(
+				s.Ctx,
+				s.Creds,
+				s.PathOpAtRoot("/"),
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
 				t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
 			}
-			if _, err := fd.Impl().Seek(ctx, tc.offset, linux.SEEK_SET); err != nil {
+			if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil {
 				t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err)
 			}
 
-			cb := testIterDirentsCallback{}
-			if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
-				t.Fatalf("IterDirents(): %v", err)
+			var collector testutil.DirentCollector
+			if err := fd.IterDirents(s.Ctx, &collector); err != nil {
+				t.Fatalf("IterDirent(): %v", err)
 			}
-			if cb.dirents, err = checkFiles(cb.dirents, tc.wants); err != nil {
-				t.Error(err.Error())
-			}
-			if len(cb.dirents) != 0 {
-				t.Errorf("found more files than expected: %+v", cb.dirents)
+
+			expectedTypes := make(map[string]testutil.DirentType)
+			expectedOffsets := make(map[string]int64)
+			for name, want := range tc.wants {
+				expectedTypes[name] = want.Type
+				if want.NextOff != 0 {
+					expectedOffsets[name] = want.NextOff
+				}
 			}
+
+			collector.SkipDotsChecks(true) // We seek()ed past the dots.
+			s.AssertAllDirentTypes(&collector, expectedTypes)
+			s.AssertDirentOffsets(&collector, expectedOffsets)
 		})
 	}
 }
 
 func TestTask(t *testing.T) {
-	ctx, vfsObj, root, err := setup()
-	if err != nil {
-		t.Fatalf("Setup failed: %v", err)
-	}
-	defer root.DecRef()
+	s := setup(t)
+	defer s.Destroy()
 
-	k := kernel.KernelFromContext(ctx)
+	k := kernel.KernelFromContext(s.Ctx)
 	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-	_, err = testutil.CreateTask(ctx, "name", tc)
+	_, err := testutil.CreateTask(s.Ctx, "name", tc)
 	if err != nil {
 		t.Fatalf("CreateTask(): %v", err)
 	}
 
-	fd, err := vfsObj.OpenAt(
-		ctx,
-		auth.CredentialsFromContext(ctx),
-		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/1")},
-		&vfs.OpenOptions{},
-	)
-	if err != nil {
-		t.Fatalf("vfsfs.OpenAt(/1) failed: %v", err)
-	}
-
-	cb := testIterDirentsCallback{}
-	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
-		t.Fatalf("IterDirents(): %v", err)
-	}
-	cb.dirents, err = checkDots(cb.dirents)
-	if err != nil {
-		t.Error(err.Error())
-	}
-	cb.dirents, err = checkTaskStaticFiles(cb.dirents)
-	if err != nil {
-		t.Error(err.Error())
-	}
-	if len(cb.dirents) != 0 {
-		t.Errorf("found more files than expected: %+v", cb.dirents)
-	}
+	collector := s.ListDirents(s.PathOpAtRoot("/1"))
+	s.AssertAllDirentTypes(collector, taskStaticFiles)
 }
 
 func TestProcSelf(t *testing.T) {
-	ctx, vfsObj, root, err := setup()
-	if err != nil {
-		t.Fatalf("Setup failed: %v", err)
-	}
-	defer root.DecRef()
+	s := setup(t)
+	defer s.Destroy()
 
-	k := kernel.KernelFromContext(ctx)
+	k := kernel.KernelFromContext(s.Ctx)
 	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-	task, err := testutil.CreateTask(ctx, "name", tc)
+	task, err := testutil.CreateTask(s.Ctx, "name", tc)
 	if err != nil {
 		t.Fatalf("CreateTask(): %v", err)
 	}
 
-	fd, err := vfsObj.OpenAt(
-		task,
-		auth.CredentialsFromContext(ctx),
-		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/self/"), FollowFinalSymlink: true},
-		&vfs.OpenOptions{},
-	)
-	if err != nil {
-		t.Fatalf("vfsfs.OpenAt(/self/) failed: %v", err)
-	}
-
-	cb := testIterDirentsCallback{}
-	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
-		t.Fatalf("IterDirents(): %v", err)
-	}
-	cb.dirents, err = checkDots(cb.dirents)
-	if err != nil {
-		t.Error(err.Error())
-	}
-	cb.dirents, err = checkTaskStaticFiles(cb.dirents)
-	if err != nil {
-		t.Error(err.Error())
-	}
-	if len(cb.dirents) != 0 {
-		t.Errorf("found more files than expected: %+v", cb.dirents)
-	}
+	collector := s.WithTemporaryContext(task).ListDirents(&vfs.PathOperation{
+		Root:               s.Root,
+		Start:              s.Root,
+		Path:               fspath.Parse("/self/"),
+		FollowFinalSymlink: true,
+	})
+	s.AssertAllDirentTypes(collector, taskStaticFiles)
 }
 
-func iterateDir(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, fd *vfs.FileDescription) {
+func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.FileDescription) {
 	t.Logf("Iterating: /proc%s", fd.MappedName(ctx))
 
-	cb := testIterDirentsCallback{}
-	if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+	var collector testutil.DirentCollector
+	if err := fd.IterDirents(ctx, &collector); err != nil {
 		t.Fatalf("IterDirents(): %v", err)
 	}
-	var err error
-	cb.dirents, err = checkDots(cb.dirents)
-	if err != nil {
+	if err := collector.Contains(".", linux.DT_DIR); err != nil {
 		t.Error(err.Error())
 	}
-	for _, d := range cb.dirents {
+	if err := collector.Contains("..", linux.DT_DIR); err != nil {
+		t.Error(err.Error())
+	}
+
+	for _, d := range collector.Dirents() {
+		if d.Name == "." || d.Name == ".." {
+			continue
+		}
 		childPath := path.Join(fd.MappedName(ctx), d.Name)
 		if d.Type == linux.DT_LNK {
-			link, err := vfsObj.ReadlinkAt(
+			link, err := s.VFS.ReadlinkAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)},
+				&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)},
 			)
 			if err != nil {
 				t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", childPath, err)
@@ -520,10 +400,10 @@ func iterateDir(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem
 		}
 
 		t.Logf("Opening: /proc%s", childPath)
-		child, err := vfsObj.OpenAt(
+		child, err := s.VFS.OpenAt(
 			ctx,
 			auth.CredentialsFromContext(ctx),
-			&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)},
+			&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)},
 			&vfs.OpenOptions{},
 		)
 		if err != nil {
@@ -539,24 +419,21 @@ func iterateDir(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem
 		}
 		if d.Type == linux.DT_DIR {
 			// Found another dir, let's do it again!
-			iterateDir(ctx, t, vfsObj, root, child)
+			iterateDir(ctx, t, s, child)
 		}
 	}
 }
 
 // TestTree iterates all directories and stats every file.
 func TestTree(t *testing.T) {
-	uberCtx, vfsObj, root, err := setup()
-	if err != nil {
-		t.Fatalf("Setup failed: %v", err)
-	}
-	defer root.DecRef()
+	s := setup(t)
+	defer s.Destroy()
 
-	k := kernel.KernelFromContext(uberCtx)
+	k := kernel.KernelFromContext(s.Ctx)
 	var tasks []*kernel.Task
 	for i := 0; i < 5; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		task, err := testutil.CreateTask(uberCtx, fmt.Sprintf("name-%d", i), tc)
+		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc)
 		if err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
@@ -564,14 +441,14 @@ func TestTree(t *testing.T) {
 	}
 
 	ctx := tasks[0]
-	fd, err := vfsObj.OpenAt(
+	fd, err := s.VFS.OpenAt(
 		ctx,
-		auth.CredentialsFromContext(uberCtx),
-		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+		auth.CredentialsFromContext(s.Ctx),
+		&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/")},
 		&vfs.OpenOptions{},
 	)
 	if err != nil {
 		t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
 	}
-	iterateDir(ctx, t, vfsObj, root, fd)
+	iterateDir(ctx, t, s, fd)
 }
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 60a1634a9..8b1cf0bd0 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -56,7 +56,7 @@ func TestReadCPUFile(t *testing.T) {
 
 	for _, fname := range []string{"online", "possible", "present"} {
 		pop := s.PathOpAtRoot(fmt.Sprintf("devices/system/cpu/%s", fname))
-		fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, &pop, &vfs.OpenOptions{})
+		fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, &vfs.OpenOptions{})
 		if err != nil {
 			t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
 		}
@@ -75,7 +75,7 @@ func TestSysRootContainsExpectedEntries(t *testing.T) {
 	s := newTestSystem(t)
 	defer s.Destroy()
 	pop := s.PathOpAtRoot("/")
-	s.AssertDirectoryContains(&pop, map[string]testutil.DirentType{
+	s.AssertAllDirentTypes(s.ListDirents(pop), map[string]testutil.DirentType{
 		"block":    linux.DT_DIR,
 		"bus":      linux.DT_DIR,
 		"class":    linux.DT_DIR,
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index eada31d94..2a723a89f 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -40,8 +40,8 @@ type System struct {
 	Ctx   context.Context
 	Creds *auth.Credentials
 	VFS   *vfs.VirtualFilesystem
+	Root  vfs.VirtualDentry
 	mns   *vfs.MountNamespace
-	root  vfs.VirtualDentry
 }
 
 // NewSystem constructs a System.
@@ -55,14 +55,49 @@ func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns
 		Creds: auth.CredentialsFromContext(ctx),
 		VFS:   v,
 		mns:   mns,
-		root:  mns.Root(),
+		Root:  mns.Root(),
 	}
 	return s
 }
 
+// WithSubtest creates a temporary test system with a new test harness,
+// referencing all other resources from the original system. This is useful when
+// a system is reused for multiple subtests, and the T needs to change for each
+// case. Note that this is safe when test cases run in parallel, as all
+// resources referenced by the system are immutable, or handle interior
+// mutations in a thread-safe manner.
+//
+// The returned system must not outlive the original and should not be destroyed
+// via System.Destroy.
+func (s *System) WithSubtest(t *testing.T) *System {
+	return &System{
+		t:     t,
+		Ctx:   s.Ctx,
+		Creds: s.Creds,
+		VFS:   s.VFS,
+		mns:   s.mns,
+		Root:  s.Root,
+	}
+}
+
+// WithTemporaryContext constructs a temporary test system with a new context
+// ctx. The temporary system borrows all resources and references from the
+// original system. The returned temporary system must not outlive the original
+// system, and should not be destroyed via System.Destroy.
+func (s *System) WithTemporaryContext(ctx context.Context) *System {
+	return &System{
+		t:     s.t,
+		Ctx:   ctx,
+		Creds: s.Creds,
+		VFS:   s.VFS,
+		mns:   s.mns,
+		Root:  s.Root,
+	}
+}
+
 // Destroy release resources associated with a test system.
 func (s *System) Destroy() {
-	s.root.DecRef()
+	s.Root.DecRef()
 	s.mns.DecRef(s.VFS) // Reference on mns passed to NewSystem.
 }
 
@@ -87,18 +122,18 @@ func (s *System) ReadToEnd(fd *vfs.FileDescription) (string, error) {
 
 // PathOpAtRoot constructs a PathOperation with the given path from
 // the root of the filesystem.
-func (s *System) PathOpAtRoot(path string) vfs.PathOperation {
-	return vfs.PathOperation{
-		Root:  s.root,
-		Start: s.root,
+func (s *System) PathOpAtRoot(path string) *vfs.PathOperation {
+	return &vfs.PathOperation{
+		Root:  s.Root,
+		Start: s.Root,
 		Path:  fspath.Parse(path),
 	}
 }
 
 // GetDentryOrDie attempts to resolve a dentry referred to by the
 // provided path operation. If unsuccessful, the test fails.
-func (s *System) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry {
-	vd, err := s.VFS.GetDentryAt(s.Ctx, s.Creds, &pop, &vfs.GetDentryOptions{})
+func (s *System) GetDentryOrDie(pop *vfs.PathOperation) vfs.VirtualDentry {
+	vd, err := s.VFS.GetDentryAt(s.Ctx, s.Creds, pop, &vfs.GetDentryOptions{})
 	if err != nil {
 		s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err)
 	}
@@ -108,14 +143,8 @@ func (s *System) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry {
 // DirentType is an alias for values for linux_dirent64.d_type.
 type DirentType = uint8
 
-// AssertDirectoryContains verifies that a directory at pop contains the entries
-// specified. AssertDirectoryContains implicitly checks for "." and "..", these
-// need not be included in entries.
-func (s *System) AssertDirectoryContains(pop *vfs.PathOperation, entries map[string]DirentType) {
-	// Also implicitly check for "." and "..".
-	entries["."] = linux.DT_DIR
-	entries[".."] = linux.DT_DIR
-
+// ListDirents lists the Dirents for a directory at pop.
+func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector {
 	fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, &vfs.OpenOptions{Flags: linux.O_RDONLY})
 	if err != nil {
 		s.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
@@ -126,12 +155,52 @@ func (s *System) AssertDirectoryContains(pop *vfs.PathOperation, entries map[str
 	if err := fd.IterDirents(s.Ctx, collector); err != nil {
 		s.t.Fatalf("IterDirent failed: %v", err)
 	}
+	return collector
+}
+
+// AssertAllDirentTypes verifies that the set of dirents in collector contains
+// exactly the specified set of expected entries. AssertAllDirentTypes respects
+// collector.skipDots, and implicitly checks for "." and ".." accordingly.
+func (s *System) AssertAllDirentTypes(collector *DirentCollector, expected map[string]DirentType) {
+	// Also implicitly check for "." and "..", if enabled.
+	if !collector.skipDots {
+		expected["."] = linux.DT_DIR
+		expected[".."] = linux.DT_DIR
+	}
 
-	collectedEntries := make(map[string]DirentType)
+	dentryTypes := make(map[string]DirentType)
+	collector.mu.Lock()
 	for _, dirent := range collector.dirents {
-		collectedEntries[dirent.Name] = dirent.Type
+		dentryTypes[dirent.Name] = dirent.Type
 	}
-	if diff := cmp.Diff(entries, collectedEntries); diff != "" {
+	collector.mu.Unlock()
+	if diff := cmp.Diff(expected, dentryTypes); diff != "" {
+		s.t.Fatalf("IterDirent had unexpected results:\n--- want\n+++ got\n%v", diff)
+	}
+}
+
+// AssertDirentOffsets verifies that collector contains at least the entries
+// specified in expected, with the given NextOff field. Entries specified in
+// expected but missing from collector result in failure. Extra entries in
+// collector are ignored. AssertDirentOffsets respects collector.skipDots, and
+// implicitly checks for "." and ".." accordingly.
+func (s *System) AssertDirentOffsets(collector *DirentCollector, expected map[string]int64) {
+	// Also implicitly check for "." and "..", if enabled.
+	if !collector.skipDots {
+		expected["."] = 1
+		expected[".."] = 2
+	}
+
+	dentryNextOffs := make(map[string]int64)
+	collector.mu.Lock()
+	for _, dirent := range collector.dirents {
+		// Ignore extra entries in dentries that are not in expected.
+		if _, ok := expected[dirent.Name]; ok {
+			dentryNextOffs[dirent.Name] = dirent.NextOff
+		}
+	}
+	collector.mu.Unlock()
+	if diff := cmp.Diff(expected, dentryNextOffs); diff != "" {
 		s.t.Fatalf("IterDirent had unexpected results:\n--- want\n+++ got\n%v", diff)
 	}
 }
@@ -141,16 +210,29 @@ func (s *System) AssertDirectoryContains(pop *vfs.PathOperation, entries map[str
 // all dirents emitted by the callback.
 type DirentCollector struct {
 	mu      sync.Mutex
-	dirents map[string]vfs.Dirent
+	order   []*vfs.Dirent
+	dirents map[string]*vfs.Dirent
+	// When the collector is used in various Assert* functions, should "." and
+	// ".." be implicitly checked?
+	skipDots bool
+}
+
+// SkipDotsChecks enables or disables the implicit checks on "." and ".." when
+// the collector is used in various Assert* functions. Note that "." and ".."
+// are still collected if passed to d.Handle, so the caller should only disable
+// the checks when they aren't expected.
+func (d *DirentCollector) SkipDotsChecks(value bool) {
+	d.skipDots = value
 }
 
 // Handle implements vfs.IterDirentsCallback.Handle.
 func (d *DirentCollector) Handle(dirent vfs.Dirent) bool {
 	d.mu.Lock()
 	if d.dirents == nil {
-		d.dirents = make(map[string]vfs.Dirent)
+		d.dirents = make(map[string]*vfs.Dirent)
 	}
-	d.dirents[dirent.Name] = dirent
+	d.order = append(d.order, &dirent)
+	d.dirents[dirent.Name] = &dirent
 	d.mu.Unlock()
 	return true
 }
@@ -176,3 +258,24 @@ func (d *DirentCollector) Contains(name string, typ uint8) error {
 	}
 	return nil
 }
+
+// Dirents returns all dirents discovered by this collector.
+func (d *DirentCollector) Dirents() map[string]*vfs.Dirent {
+	d.mu.Lock()
+	dirents := make(map[string]*vfs.Dirent)
+	for n, d := range d.dirents {
+		dirents[n] = d
+	}
+	d.mu.Unlock()
+	return dirents
+}
+
+// OrderedDirents returns an ordered list of dirents as discovered by this
+// collector.
+func (d *DirentCollector) OrderedDirents() []*vfs.Dirent {
+	d.mu.Lock()
+	dirents := make([]*vfs.Dirent, len(d.order))
+	copy(dirents, d.order)
+	d.mu.Unlock()
+	return dirents
+}
-- 
cgit v1.2.3


From 49e84b10e5ed7f94e6cbe003b9f7268e8235bb08 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Wed, 22 Jan 2020 06:22:18 +0000
Subject: Unify the kOLargeFile definition in syscall tests.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: Id9d6ae98305a4057d55d622ea4c3ac2228fea212
---
 test/syscalls/linux/fcntl.cc |  5 +----
 test/syscalls/linux/pipe.cc  |  6 +++---
 test/syscalls/linux/proc.cc  | 12 ------------
 test/util/fs_util.h          | 11 +++++++++++
 4 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 4f3aa81d6..421c15b87 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -31,6 +31,7 @@
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/cleanup.h"
 #include "test/util/eventfd_util.h"
+#include "test/util/fs_util.h"
 #include "test/util/multiprocess_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/save_util.h"
@@ -55,10 +56,6 @@ ABSL_FLAG(int32_t, socket_fd, -1,
 namespace gvisor {
 namespace testing {
 
-// O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
-// because "it isn't needed", even though Linux can return it via F_GETFL.
-constexpr int kOLargeFile = 00100000;
-
 class FcntlLockTest : public ::testing::Test {
  public:
   void SetUp() override {
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index ac9b21b24..d8e19e910 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -25,6 +25,7 @@
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
@@ -144,11 +145,10 @@ TEST_P(PipeTest, Flags) {
 
   if (IsNamedPipe()) {
     // May be stubbed to zero; define locally.
-    constexpr int kLargefile = 0100000;
     EXPECT_THAT(fcntl(rfd_.get(), F_GETFL),
-                SyscallSucceedsWithValue(kLargefile | O_RDONLY));
+                SyscallSucceedsWithValue(kOLargeFile | O_RDONLY));
     EXPECT_THAT(fcntl(wfd_.get(), F_GETFL),
-                SyscallSucceedsWithValue(kLargefile | O_WRONLY));
+                SyscallSucceedsWithValue(kOLargeFile | O_WRONLY));
   } else {
     EXPECT_THAT(fcntl(rfd_.get(), F_GETFL), SyscallSucceedsWithValue(O_RDONLY));
     EXPECT_THAT(fcntl(wfd_.get(), F_GETFL), SyscallSucceedsWithValue(O_WRONLY));
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index bf9bb45d3..a03c1e43d 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -100,18 +100,6 @@ namespace {
 #define SUID_DUMP_ROOT 2
 #endif /* SUID_DUMP_ROOT */
 
-// O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
-// because "it isn't needed", even though Linux can return it via F_GETFL.
-#if defined(__x86_64__) || defined(__i386__)
-constexpr int kOLargeFile = 00100000;
-#elif __aarch64__
-// The value originate from the Linux
-// kernel's arch/arm64/include/uapi/asm/fcntl.h.
-constexpr int kOLargeFile = 00400000;
-#else
-#error "Unknown architecture"
-#endif
-
 #if defined(__x86_64__) || defined(__i386__)
 // This list of "required" fields is taken from reading the file
 // arch/x86/kernel/cpu/proc.c and seeing which fields will be unconditionally
diff --git a/test/util/fs_util.h b/test/util/fs_util.h
index ee1b341d7..caf19b24d 100644
--- a/test/util/fs_util.h
+++ b/test/util/fs_util.h
@@ -26,6 +26,17 @@
 
 namespace gvisor {
 namespace testing {
+
+// O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
+// because "it isn't needed", even though Linux can return it via F_GETFL.
+#if defined(__x86_64__)
+constexpr int kOLargeFile = 00100000;
+#elif defined(__aarch64__)
+constexpr int kOLargeFile = 00400000;
+#else
+#error "Unknown architecture"
+#endif
+
 // Returns a status or the current working directory.
 PosixErrorOr<std::string> GetCWD();
 
-- 
cgit v1.2.3


From 04e3d56db1d8dee9f4fae51718dbef33559c4101 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 23 Jan 2020 10:43:59 -0800
Subject: Fix master build case.

Otherwise, this will be built when building a specific
release tag (typically in the past), causing the master
binary to be overwritten with something older.

We can generally assume that tags will be applied after
the commit has been integrated, and therefore that any
builds pointing to tags will use only the tags.

Another way to fix this would be to introduce something
akin to the KOKORO_BUILD_NIGHTLY environment variable,
but it doesn't seem strictly necessary.

PiperOrigin-RevId: 291198171
---
 scripts/build.sh | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/scripts/build.sh b/scripts/build.sh
index 8b2094cb0..4c042af6c 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -67,11 +67,7 @@ if [[ "${KOKORO_BUILD_NIGHTLY:-false}" == "true" ]]; then
   install_raw  "${KOKORO_ARTIFACTS_DIR}/nightly/${stamp}"
   install_repo "${KOKORO_ARTIFACTS_DIR}/dists/nightly"
 else
-  # We keep only the latest master raw release.
-  install_raw  "${KOKORO_ARTIFACTS_DIR}/master/latest"
-  install_repo "${KOKORO_ARTIFACTS_DIR}/dists/master"
-
-  # Is it a tagged release? Build that too.
+  # Is it a tagged release? Build that.
   tags="$(git tag --points-at HEAD)"
   if ! [[ -z "${tags}" ]]; then
     # Note that a given commit can match any number of tags. We have to iterate
@@ -80,8 +76,13 @@ else
       name=$(echo "${tag}" | cut -d'-' -f2)
       base=$(echo "${name}" | cut -d'.' -f1)
       install_raw  "${KOKORO_ARTIFACTS_DIR}/release/${name}"
+      install_raw  "${KOKORO_ARTIFACTS_DIR}/release/latest"
       install_repo "${KOKORO_ARTIFACTS_DIR}/dists/release"
       install_repo "${KOKORO_ARTIFACTS_DIR}/dists/${base}"
     done
+  else
+    # Otherwise, assume it is a raw master commit.
+    install_raw  "${KOKORO_ARTIFACTS_DIR}/master/latest"
+    install_repo "${KOKORO_ARTIFACTS_DIR}/dists/master"
   fi
 fi
-- 
cgit v1.2.3


From 98e83c444fa58669d45ecf162cf4bf48dce790d1 Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Thu, 23 Jan 2020 10:58:18 -0800
Subject: Try running kythe build on RBE.

Also add our RBE project/instance to the --config=remote defaults.

PiperOrigin-RevId: 291201426
---
 .bazelrc                       | 20 +++++++-------------
 kokoro/kythe/generate_xrefs.sh |  3 ++-
 scripts/common_bazel.sh        |  9 +--------
 3 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 7f87e94b1..9c35c5e7b 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -20,6 +20,13 @@ build --stamp --workspace_status_command tools/workspace_status.sh
 
 # Enable remote execution so actions are performed on the remote systems.
 build:remote --remote_executor=grpcs://remotebuildexecution.googleapis.com
+build:remote --project_id=gvisor-rbe
+build:remote --remote_instance_name=projects/gvisor-rbe/instances/default_instance
+# Enable authentication. This will pick up application default credentials by
+# default. You can use --google_credentials=some_file.json to use a service
+# account credential instead.
+build:remote --google_default_credentials=true
+build:remote --auth_scope="https://www.googleapis.com/auth/cloud-source-tools"
 
 # Add a custom platform and toolchain that builds in a privileged docker
 # container, which is required by our syscall tests.
@@ -27,25 +34,12 @@ build:remote --host_platform=//test:rbe_ubuntu1604
 build:remote --extra_toolchains=//test:cc-toolchain-clang-x86_64-default
 build:remote --extra_execution_platforms=//test:rbe_ubuntu1604
 build:remote --platforms=//test:rbe_ubuntu1604
-
-# Use default image for crosstool toolchain.
 build:remote --crosstool_top=@rbe_default//cc:toolchain
-
-# Default parallelism and timeout for remote jobs.
 build:remote --jobs=50
 build:remote --remote_timeout=3600
-
 # RBE requires a strong hash function, such as SHA256.
 startup --host_jvm_args=-Dbazel.DigestFunction=SHA256
 
-# Enable authentication. This will pick up application default credentials by
-# default. You can use --google_credentials=some_file.json to use a service
-# account credential instead.
-build:remote --google_default_credentials=true
-
-# Auth scope needed for authentication with RBE.
-build:remote --auth_scope="https://www.googleapis.com/auth/cloud-source-tools"
-
 # Set flags for uploading to BES in order to view results in the Bazel Build
 # Results UI.
 build:results --bes_backend="buildeventservice.googleapis.com"
diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
index 799467a34..d2ca95c68 100644
--- a/kokoro/kythe/generate_xrefs.sh
+++ b/kokoro/kythe/generate_xrefs.sh
@@ -25,7 +25,7 @@ bazel version
 
 python3 -V
 
-readonly KYTHE_VERSION='v0.0.37'
+readonly KYTHE_VERSION='v0.0.39'
 readonly WORKDIR="$(mktemp -d)"
 readonly KYTHE_DIR="${WORKDIR}/kythe-${KYTHE_VERSION}"
 if [[ -n "$KOKORO_GIT_COMMIT" ]]; then
@@ -47,6 +47,7 @@ bazel \
   --override_repository kythe_release="${KYTHE_DIR}" \
   --define=kythe_corpus=gvisor.dev \
   --cxxopt=-std=c++17 \
+  --config=remote \
   //...
 
 "${KYTHE_DIR}/tools/kzip" merge \
diff --git a/scripts/common_bazel.sh b/scripts/common_bazel.sh
index bbc1a038e..a473a88a4 100755
--- a/scripts/common_bazel.sh
+++ b/scripts/common_bazel.sh
@@ -32,18 +32,11 @@ declare -r BAZEL_FLAGS=(
   "--keep_going"
   "--verbose_failures=true"
 )
-if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]] || [[ -v RBE_PROJECT_ID ]]; then
-  declare -r RBE_PROJECT_ID="${RBE_PROJECT_ID:-gvisor-rbe}"
-  declare -r BAZEL_RBE_FLAGS=(
-    "--config=remote"
-    "--project_id=${RBE_PROJECT_ID}"
-    "--remote_instance_name=projects/${RBE_PROJECT_ID}/instances/default_instance"
-  )
-fi
 if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then
   declare -r BAZEL_RBE_AUTH_FLAGS=(
     "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
   )
+  declare -r BAZEL_RBE_FLAGS=("--config=remote")
 fi
 
 # Wrap bazel.
-- 
cgit v1.2.3


From 7a79715504e92be9fc9aebc12fbd65aa46049054 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Thu, 23 Jan 2020 11:47:30 -0800
Subject: Check for EINTR from KVM_CREATE_VM

The kernel may return EINTR from:

kvm_create_vm
  kvm_init_mmu_notifier
    mmu_notifier_register
      do_mmu_notifier_register
        mm_take_all_locks

Go 1.14's preemptive scheduling signals make hitting this much more likely.

PiperOrigin-RevId: 291212669
---
 pkg/sentry/platform/kvm/kvm.go | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index a7850faed..d337c5c7c 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -62,9 +62,19 @@ func New(deviceFile *os.File) (*KVM, error) {
 	}
 
 	// Create a new VM fd.
-	vm, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, fd, _KVM_CREATE_VM, 0)
-	if errno != 0 {
-		return nil, fmt.Errorf("creating VM: %v", errno)
+	var (
+		vm    uintptr
+		errno syscall.Errno
+	)
+	for {
+		vm, _, errno = syscall.Syscall(syscall.SYS_IOCTL, fd, _KVM_CREATE_VM, 0)
+		if errno == syscall.EINTR {
+			continue
+		}
+		if errno != 0 {
+			return nil, fmt.Errorf("creating VM: %v", errno)
+		}
+		break
 	}
 	// We are done with the device file.
 	deviceFile.Close()
-- 
cgit v1.2.3


From 14d2ed1ad7785a54b35ef7ee949d3cf89a87e66d Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Thu, 23 Jan 2020 14:12:40 -0800
Subject: Fix kythe build.

* Pass --auth_credentials now that we're using RBE
* Fix kzips not being uploaded to the root of the GCS bucket

PiperOrigin-RevId: 291241757
---
 kokoro/kythe/generate_xrefs.cfg | 3 ++-
 kokoro/kythe/generate_xrefs.sh  | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/kokoro/kythe/generate_xrefs.cfg b/kokoro/kythe/generate_xrefs.cfg
index 03e65c54e..ccf657983 100644
--- a/kokoro/kythe/generate_xrefs.cfg
+++ b/kokoro/kythe/generate_xrefs.cfg
@@ -23,6 +23,7 @@ bazel_setting {
 
 action {
   define_artifacts {
-    regex: "*.kzip"
+    regex: "**/*.kzip"
+    fail_if_no_artifacts: true
   }
 }
diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
index d2ca95c68..4c104afdb 100644
--- a/kokoro/kythe/generate_xrefs.sh
+++ b/kokoro/kythe/generate_xrefs.sh
@@ -48,6 +48,7 @@ bazel \
   --define=kythe_corpus=gvisor.dev \
   --cxxopt=-std=c++17 \
   --config=remote \
+  --auth_credentials="${KOKORO_BAZEL_AUTH_CREDENTIAL}" \
   //...
 
 "${KYTHE_DIR}/tools/kzip" merge \
-- 
cgit v1.2.3


From 3db317390b5cc491d680fc4a5fc7b8372890b4da Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Thu, 23 Jan 2020 16:17:50 -0800
Subject: Remove epoll entry from map when dropping it.

This pattern (delete from map when dropping) is also used in epoll.RemoveEntry,
and seems like generally a good idea.

PiperOrigin-RevId: 291268208
---
 pkg/sentry/kernel/epoll/epoll.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 430311cc0..e84742993 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -174,6 +174,7 @@ func (e *EventPoll) Release() {
 		entry.id.File.EventUnregister(&entry.waiter)
 		entry.file.Drop()
 	}
+	e.files = nil
 }
 
 // Read implements fs.FileOperations.Read.
-- 
cgit v1.2.3


From 24cfbf4b981a76e46cab47650ef514835990b72e Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Fri, 24 Jan 2020 11:44:31 -0800
Subject: Fix corpus_name to match our ingestion config[1].

PiperOrigin-RevId: 291412676
---
 kokoro/kythe/generate_xrefs.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
index 4c104afdb..7a0fbb3cd 100644
--- a/kokoro/kythe/generate_xrefs.sh
+++ b/kokoro/kythe/generate_xrefs.sh
@@ -16,8 +16,6 @@
 
 set -ex
 
-# Install the latest version of Bazel. The default on Kokoro images is out of
-# date.
 if command -v use_bazel.sh >/dev/null; then
   use_bazel.sh latest
 fi
@@ -45,7 +43,7 @@ bazel \
   --bazelrc="${KYTHE_DIR}/extractors.bazelrc" \
   build \
   --override_repository kythe_release="${KYTHE_DIR}" \
-  --define=kythe_corpus=gvisor.dev \
+  --define=kythe_corpus=github.com/google/gvisor \
   --cxxopt=-std=c++17 \
   --config=remote \
   --auth_credentials="${KOKORO_BAZEL_AUTH_CREDENTIAL}" \
-- 
cgit v1.2.3


From 390bb9c241c2b05c311579562d95cc39d899157b Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Fri, 24 Jan 2020 11:58:13 -0800
Subject: Ignore external SIGURG

Go 1.14+ sends SIGURG to Ms to attempt asynchronous preemption of a G. Since it
can't guarantee that a SIGURG is only related to preemption, it continues to
forward them to signal.Notify (see runtime.sighandler).

We should ignore these signals, as applications shouldn't receive them. Note
that this means that truly external SIGURG can no longer be sent to the
application (as with SIGCHLD).

PiperOrigin-RevId: 291415357
---
 pkg/sentry/kernel/signal.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go
index 02eede93d..e8cce37d0 100644
--- a/pkg/sentry/kernel/signal.go
+++ b/pkg/sentry/kernel/signal.go
@@ -38,6 +38,9 @@ const SignalPanic = linux.SIGUSR2
 // Preconditions: Kernel must have an init process.
 func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) {
 	switch linux.Signal(info.Signo) {
+	case linux.SIGURG:
+		// Sent by the Go 1.14+ runtime for asynchronous goroutine preemption.
+
 	case platform.SignalInterrupt:
 		// Assume that a call to platform.Context.Interrupt() misfired.
 
-- 
cgit v1.2.3


From fb80979e3fe2614414d2d23c27e41bdb9e7c8541 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 24 Jan 2020 12:29:13 -0800
Subject: Increase timeouts for NDP tests' async events

Increase the timeout to 1s when waiting for async NDP events to help
reduce flakiness. This will not significantly increase test times as the
async events continue to receive an event on a channel. The increased
timeout allows more time for an event to be sent on the channel as the
previous timeout of 100ms caused some flakes.

Test: Existing tests pass
PiperOrigin-RevId: 291420936
---
 pkg/tcpip/stack/ndp_test.go | 47 +++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 376681b30..f9460bd51 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -35,13 +35,14 @@ import (
 )
 
 const (
-	addr1          = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-	addr2          = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-	addr3          = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03")
-	linkAddr1      = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
-	linkAddr2      = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x07")
-	linkAddr3      = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x08")
-	defaultTimeout = 100 * time.Millisecond
+	addr1                    = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	addr2                    = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+	addr3                    = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03")
+	linkAddr1                = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+	linkAddr2                = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x07")
+	linkAddr3                = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x08")
+	defaultTimeout           = 100 * time.Millisecond
+	defaultAsyncEventTimeout = time.Second
 )
 
 var (
@@ -1086,7 +1087,7 @@ func TestRouterDiscovery(t *testing.T) {
 	// Wait for the normal lifetime plus an extra bit for the
 	// router to get invalidated. If we don't get an invalidation
 	// event after this time, then something is wrong.
-	expectAsyncRouterInvalidationEvent(llAddr2, l2LifetimeSeconds*time.Second+defaultTimeout)
+	expectAsyncRouterInvalidationEvent(llAddr2, l2LifetimeSeconds*time.Second+defaultAsyncEventTimeout)
 
 	// Rx an RA from lladdr2 with huge lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000))
@@ -1103,7 +1104,7 @@ func TestRouterDiscovery(t *testing.T) {
 	// Wait for the normal lifetime plus an extra bit for the
 	// router to get invalidated. If we don't get an invalidation
 	// event after this time, then something is wrong.
-	expectAsyncRouterInvalidationEvent(llAddr3, l3LifetimeSeconds*time.Second+defaultTimeout)
+	expectAsyncRouterInvalidationEvent(llAddr3, l3LifetimeSeconds*time.Second+defaultAsyncEventTimeout)
 }
 
 // TestRouterDiscoveryMaxRouters tests that only
@@ -1342,7 +1343,7 @@ func TestPrefixDiscovery(t *testing.T) {
 		if diff := checkPrefixEvent(e, subnet2, false); diff != "" {
 			t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
 		}
-	case <-time.After(time.Duration(lifetime)*time.Second + defaultTimeout):
+	case <-time.After(time.Duration(lifetime)*time.Second + defaultAsyncEventTimeout):
 		t.Fatal("timed out waiting for prefix discovery event")
 	}
 
@@ -1681,7 +1682,7 @@ func TestAutoGenAddr(t *testing.T) {
 		if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" {
 			t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
 		}
-	case <-time.After(newMinVLDuration + defaultTimeout):
+	case <-time.After(newMinVLDuration + defaultAsyncEventTimeout):
 		t.Fatal("timed out waiting for addr auto gen event")
 	}
 	if contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
@@ -1987,7 +1988,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	expectPrimaryAddr(addr1)
 
 	// Wait for addr of prefix1 to be deprecated.
-	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultTimeout)
+	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncEventTimeout)
 	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should not have %s in the list of addresses", addr1)
 	}
@@ -2027,7 +2028,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	expectPrimaryAddr(addr1)
 
 	// Wait for addr of prefix1 to be deprecated.
-	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultTimeout)
+	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncEventTimeout)
 	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should not have %s in the list of addresses", addr1)
 	}
@@ -2041,7 +2042,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	}
 
 	// Wait for addr of prefix1 to be invalidated.
-	expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultTimeout)
+	expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncEventTimeout)
 	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should not have %s in the list of addresses", addr1)
 	}
@@ -2073,7 +2074,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 				if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff != "" {
 					t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
 				}
-			case <-time.After(defaultTimeout):
+			case <-time.After(defaultAsyncEventTimeout):
 				t.Fatal("timed out waiting for addr auto gen event")
 			}
 		} else if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff == "" {
@@ -2088,7 +2089,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 			t.Fatalf("got unexpected auto-generated event")
 		}
 
-	case <-time.After(newMinVLDuration + defaultTimeout):
+	case <-time.After(newMinVLDuration + defaultAsyncEventTimeout):
 		t.Fatal("timed out waiting for addr auto gen event")
 	}
 	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
@@ -2213,7 +2214,7 @@ func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) {
 						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
 					}
 
-				case <-time.After(minVLSeconds*time.Second + defaultTimeout):
+				case <-time.After(minVLSeconds*time.Second + defaultAsyncEventTimeout):
 					t.Fatal("timeout waiting for addr auto gen event")
 				}
 			})
@@ -2701,7 +2702,7 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
 		if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" {
 			t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
 		}
-	case <-time.After(validLifetimeSecondPrefix1*time.Second + defaultTimeout):
+	case <-time.After(validLifetimeSecondPrefix1*time.Second + defaultAsyncEventTimeout):
 		t.Fatal("timed out waiting for addr auto gen event")
 	}
 	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
@@ -3325,12 +3326,12 @@ func TestRouterSolicitation(t *testing.T) {
 				// times.
 				remaining := test.maxRtrSolicit
 				if remaining > 0 {
-					waitForPkt(test.effectiveMaxRtrSolicitDelay + defaultTimeout)
+					waitForPkt(test.effectiveMaxRtrSolicitDelay + defaultAsyncEventTimeout)
 					remaining--
 				}
 				for ; remaining > 0; remaining-- {
 					waitForNothing(test.effectiveRtrSolicitInt - defaultTimeout)
-					waitForPkt(2 * defaultTimeout)
+					waitForPkt(defaultAsyncEventTimeout)
 				}
 
 				// Make sure no more RS.
@@ -3411,9 +3412,9 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 
 	// Disable forwarding which should start router solicitations.
 	s.SetForwarding(false)
-	waitForPkt(delay + defaultTimeout)
-	waitForPkt(interval + defaultTimeout)
-	waitForPkt(interval + defaultTimeout)
+	waitForPkt(delay + defaultAsyncEventTimeout)
+	waitForPkt(interval + defaultAsyncEventTimeout)
+	waitForPkt(interval + defaultAsyncEventTimeout)
 	select {
 	case <-e.C:
 		t.Fatal("unexpectedly got an extra packet after sending out the expected RSs")
-- 
cgit v1.2.3


From d135b5abf6eafa92d2745dc98d48ef39d2f90e75 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 24 Jan 2020 12:53:29 -0800
Subject: Add anonymous device number allocation to VFS2.

Note that in VFS2, filesystem device numbers are per-vfs.FilesystemImpl rather
than global, avoiding the need for a "registry" type to handle save/restore.
(This is more consistent with Linux anyway: compare e.g.
mm/shmem.c:shmem_mount() => fs/super.c:mount_nodev() => (indirectly)
set_anon_super().)

PiperOrigin-RevId: 291425193
---
 pkg/sentry/vfs/device.go | 29 +++++++++++++++++++++++++++++
 pkg/sentry/vfs/vfs.go    | 18 ++++++++++++++----
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
index cb672e36f..9f9d6e783 100644
--- a/pkg/sentry/vfs/device.go
+++ b/pkg/sentry/vfs/device.go
@@ -98,3 +98,32 @@ func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mo
 	}
 	return rd.dev.Open(ctx, mnt, d, *opts)
 }
+
+// GetAnonBlockDevMinor allocates and returns an unused minor device number for
+// an "anonymous" block device with major number 0.
+func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) {
+	vfs.anonBlockDevMinorMu.Lock()
+	defer vfs.anonBlockDevMinorMu.Unlock()
+	minor := vfs.anonBlockDevMinorNext
+	const maxDevMinor = (1 << 20) - 1
+	for minor < maxDevMinor {
+		if _, ok := vfs.anonBlockDevMinor[minor]; !ok {
+			vfs.anonBlockDevMinor[minor] = struct{}{}
+			vfs.anonBlockDevMinorNext = minor + 1
+			return minor, nil
+		}
+		minor++
+	}
+	return 0, syserror.EMFILE
+}
+
+// PutAnonBlockDevMinor deallocates a minor device number returned by a
+// previous call to GetAnonBlockDevMinor.
+func (vfs *VirtualFilesystem) PutAnonBlockDevMinor(minor uint32) {
+	vfs.anonBlockDevMinorMu.Lock()
+	defer vfs.anonBlockDevMinorMu.Unlock()
+	delete(vfs.anonBlockDevMinor, minor)
+	if minor < vfs.anonBlockDevMinorNext {
+		vfs.anonBlockDevMinorNext = minor
+	}
+}
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 1f21b0b31..1f6f56293 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -80,6 +80,14 @@ type VirtualFilesystem struct {
 	devicesMu sync.RWMutex
 	devices   map[devTuple]*registeredDevice
 
+	// anonBlockDevMinor contains all allocated anonymous block device minor
+	// numbers. anonBlockDevMinorNext is a lower bound for the smallest
+	// unallocated anonymous block device number. anonBlockDevMinorNext and
+	// anonBlockDevMinor are protected by anonBlockDevMinorMu.
+	anonBlockDevMinorMu   sync.Mutex
+	anonBlockDevMinorNext uint32
+	anonBlockDevMinor     map[uint32]struct{}
+
 	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
 	// fsTypesMu.
 	fsTypesMu sync.RWMutex
@@ -94,10 +102,12 @@ type VirtualFilesystem struct {
 // New returns a new VirtualFilesystem with no mounts or FilesystemTypes.
 func New() *VirtualFilesystem {
 	vfs := &VirtualFilesystem{
-		mountpoints: make(map[*Dentry]map[*Mount]struct{}),
-		devices:     make(map[devTuple]*registeredDevice),
-		fsTypes:     make(map[string]*registeredFilesystemType),
-		filesystems: make(map[*Filesystem]struct{}),
+		mountpoints:           make(map[*Dentry]map[*Mount]struct{}),
+		devices:               make(map[devTuple]*registeredDevice),
+		anonBlockDevMinorNext: 1,
+		anonBlockDevMinor:     make(map[uint32]struct{}),
+		fsTypes:               make(map[string]*registeredFilesystemType),
+		filesystems:           make(map[*Filesystem]struct{}),
 	}
 	vfs.mounts.Init()
 	return vfs
-- 
cgit v1.2.3


From 878bda6e19a0d55525ea6b1600f3413e0c5d6a84 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 24 Jan 2020 13:02:01 -0800
Subject: Lock the NIC when checking if an address is tentative

PiperOrigin-RevId: 291426657
---
 pkg/tcpip/stack/nic.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 79556a36f..7dad9a8cb 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1208,6 +1208,9 @@ func (n *NIC) Stack() *Stack {
 // false. It will only return true if the address is associated with the NIC
 // AND it is tentative.
 func (n *NIC) isAddrTentative(addr tcpip.Address) bool {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
 	ref, ok := n.mu.endpoints[NetworkEndpointID{addr}]
 	if !ok {
 		return false
-- 
cgit v1.2.3


From 18a7e1309decb9bc09879e337adbc00f81d420c5 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 24 Jan 2020 17:06:30 -0800
Subject: Add support for device special files to VFS2 tmpfs.

PiperOrigin-RevId: 291471892
---
 pkg/sentry/fsimpl/tmpfs/BUILD          |  1 +
 pkg/sentry/fsimpl/tmpfs/device_file.go | 39 ++++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/tmpfs/filesystem.go  | 43 +++++++++++++++++++---------------
 pkg/sentry/fsimpl/tmpfs/tmpfs.go       | 30 +++++++++++++++++++-----
 4 files changed, 88 insertions(+), 25 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/tmpfs/device_file.go

diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 7601c7c04..691476b4f 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -20,6 +20,7 @@ go_library(
     name = "tmpfs",
     srcs = [
         "dentry_list.go",
+        "device_file.go",
         "directory.go",
         "filesystem.go",
         "named_pipe.go",
diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
new file mode 100644
index 000000000..84b181b90
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -0,0 +1,39 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+type deviceFile struct {
+	inode inode
+	kind  vfs.DeviceKind
+	major uint32
+	minor uint32
+}
+
+func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
+	file := &deviceFile{
+		kind:  kind,
+		major: major,
+		minor: minor,
+	}
+	file.inode.init(file, fs, creds, mode)
+	file.inode.nlink = 1 // from parent directory
+	return &file.inode
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index a9f66a42a..d726f03c5 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -228,23 +228,26 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
 	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		var childInode *inode
 		switch opts.Mode.FileType() {
 		case 0, linux.S_IFREG:
-			child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
-			parent.vfsd.InsertChild(&child.vfsd, name)
-			parent.inode.impl.(*directory).childList.PushBack(child)
-			return nil
+			childInode = fs.newRegularFile(rp.Credentials(), opts.Mode)
 		case linux.S_IFIFO:
-			child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
-			parent.vfsd.InsertChild(&child.vfsd, name)
-			parent.inode.impl.(*directory).childList.PushBack(child)
-			return nil
-		case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK:
+			childInode = fs.newNamedPipe(rp.Credentials(), opts.Mode)
+		case linux.S_IFBLK:
+			childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
+		case linux.S_IFCHR:
+			childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
+		case linux.S_IFSOCK:
 			// Not yet supported.
 			return syserror.EPERM
 		default:
 			return syserror.EINVAL
 		}
+		child := fs.newDentry(childInode)
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
 	})
 }
 
@@ -264,7 +267,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if err != nil {
 			return nil, err
 		}
-		return d.open(ctx, rp, opts.Flags, false /* afterCreate */)
+		return d.open(ctx, rp, &opts, false /* afterCreate */)
 	}
 
 	mustCreate := opts.Flags&linux.O_EXCL != 0
@@ -279,7 +282,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if mustCreate {
 			return nil, syserror.EEXIST
 		}
-		return start.open(ctx, rp, opts.Flags, false /* afterCreate */)
+		return start.open(ctx, rp, &opts, false /* afterCreate */)
 	}
 afterTrailingSymlink:
 	parent, err := walkParentDirLocked(rp, start)
@@ -313,7 +316,7 @@ afterTrailingSymlink:
 		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
 		parent.vfsd.InsertChild(&child.vfsd, name)
 		parent.inode.impl.(*directory).childList.PushBack(child)
-		return child.open(ctx, rp, opts.Flags, true)
+		return child.open(ctx, rp, &opts, true)
 	}
 	if err != nil {
 		return nil, err
@@ -327,11 +330,11 @@ afterTrailingSymlink:
 	if mustCreate {
 		return nil, syserror.EEXIST
 	}
-	return child.open(ctx, rp, opts.Flags, false)
+	return child.open(ctx, rp, &opts, false)
 }
 
-func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
-	ats := vfs.AccessTypesForOpenFlags(flags)
+func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(opts.Flags)
 	if !afterCreate {
 		if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
 			return nil, err
@@ -340,10 +343,10 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32,
 	switch impl := d.inode.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		if err := fd.vfsfd.Init(&fd, flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
-		if flags&linux.O_TRUNC != 0 {
+		if opts.Flags&linux.O_TRUNC != 0 {
 			impl.mu.Lock()
 			impl.data.Truncate(0, impl.memFile)
 			atomic.StoreUint64(&impl.size, 0)
@@ -356,7 +359,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32,
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		if err := fd.vfsfd.Init(&fd, flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
 		return &fd.vfsfd, nil
@@ -364,7 +367,9 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32,
 		// Can't open symlinks without O_PATH (which is unimplemented).
 		return nil, syserror.ELOOP
 	case *namedPipe:
-		return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags)
+		return newNamedPipeFD(ctx, impl, rp, &d.vfsd, opts.Flags)
+	case *deviceFile:
+		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
 	}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 1d4889c89..515f033f2 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -149,6 +149,10 @@ type inode struct {
 	ctime int64 // nanoseconds
 	mtime int64 // nanoseconds
 
+	// Only meaningful for device special files.
+	rdevMajor uint32
+	rdevMinor uint32
+
 	impl interface{} // immutable
 }
 
@@ -269,6 +273,15 @@ func (i *inode) statTo(stat *linux.Statx) {
 		stat.Blocks = allocatedBlocksForSize(stat.Size)
 	case *namedPipe:
 		stat.Mode |= linux.S_IFIFO
+	case *deviceFile:
+		switch impl.kind {
+		case vfs.BlockDevice:
+			stat.Mode |= linux.S_IFBLK
+		case vfs.CharDevice:
+			stat.Mode |= linux.S_IFCHR
+		}
+		stat.RdevMajor = impl.major
+		stat.RdevMinor = impl.minor
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
 	}
@@ -309,12 +322,8 @@ func (i *inode) setStat(stat linux.Statx) error {
 			}
 		case *directory:
 			return syserror.EISDIR
-		case *symlink:
-			return syserror.EINVAL
-		case *namedPipe:
-			// Nothing.
 		default:
-			panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+			return syserror.EINVAL
 		}
 	}
 	if mask&linux.STATX_ATIME != 0 {
@@ -353,13 +362,22 @@ func allocatedBlocksForSize(size uint64) uint64 {
 }
 
 func (i *inode) direntType() uint8 {
-	switch i.impl.(type) {
+	switch impl := i.impl.(type) {
 	case *regularFile:
 		return linux.DT_REG
 	case *directory:
 		return linux.DT_DIR
 	case *symlink:
 		return linux.DT_LNK
+	case *deviceFile:
+		switch impl.kind {
+		case vfs.BlockDevice:
+			return linux.DT_BLK
+		case vfs.CharDevice:
+			return linux.DT_CHR
+		default:
+			panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind))
+		}
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
 	}
-- 
cgit v1.2.3


From 2946fe81627afa223853769ed736e2a56e0144b7 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 24 Jan 2020 17:12:03 -0800
Subject: We can now actually write out the udp matcher.

---
 pkg/sentry/socket/netfilter/netfilter.go | 78 ++++++++++++++++++++++++--------
 1 file changed, 58 insertions(+), 20 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3ca22932d..6c88a50a6 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -36,7 +36,7 @@ const errorTargetName = "ERROR"
 
 // metadata is opaque to netstack. It holds data that we need to translate
 // between Linux's and netstack's iptables representations.
-// TODO(gvisor.dev/issue/170): This might be removable.
+// TODO(gvisor.dev/issue/170): Use metadata to check correctness.
 type metadata struct {
 	HookEntry  [linux.NF_INET_NUMHOOKS]uint32
 	Underflow  [linux.NF_INET_NUMHOOKS]uint32
@@ -44,6 +44,14 @@ type metadata struct {
 	Size       uint32
 }
 
+const enableDebugLog = true
+
+func nflog(format string, args ...interface{}) {
+	if enableDebugLog {
+		log.Infof("netfilter: "+format, args...)
+	}
+}
+
 // GetInfo returns information about iptables.
 func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) {
 	// Read in the struct and table name.
@@ -72,6 +80,8 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 	info.NumEntries = metadata.NumEntries
 	info.Size = metadata.Size
 
+	nflog("GetInfo returning info: %+v", info)
+
 	return info, nil
 }
 
@@ -80,21 +90,26 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	// Read in the struct and table name.
 	var userEntries linux.IPTGetEntries
 	if _, err := t.CopyIn(outPtr, &userEntries); err != nil {
+		log.Warningf("netfilter: couldn't copy in entries %q", userEntries.Name)
 		return linux.KernelIPTGetEntries{}, syserr.FromError(err)
 	}
 
 	// Find the appropriate table.
 	table, err := findTable(stack, userEntries.Name)
 	if err != nil {
+		log.Warningf("netfilter: couldn't find table %q", userEntries.Name)
 		return linux.KernelIPTGetEntries{}, err
 	}
 
 	// Convert netstack's iptables rules to something that the iptables
 	// tool can understand.
-	entries, _, err := convertNetstackToBinary(userEntries.Name.String(), table)
+	entries, meta, err := convertNetstackToBinary(userEntries.Name.String(), table)
 	if err != nil {
 		return linux.KernelIPTGetEntries{}, err
 	}
+	if meta != table.Metadata().(metadata) {
+		panic(fmt.Sprintf("Table %q metadata changed between writing and reading. Was saved as %+v, but is now %+v", userEntries.Name.String(), table.Metadata().(metadata), meta))
+	}
 	if binary.Size(entries) > uintptr(outLen) {
 		log.Warningf("Insufficient GetEntries output size: %d", uintptr(outLen))
 		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
@@ -148,15 +163,19 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	copy(entries.Name[:], tablename)
 
 	for ruleIdx, rule := range table.Rules {
+		nflog("Current offset: %d", entries.Size)
+
 		// Is this a chain entry point?
 		for hook, hookRuleIdx := range table.BuiltinChains {
 			if hookRuleIdx == ruleIdx {
+				nflog("Found hook %d at offset %d", hook, entries.Size)
 				meta.HookEntry[hook] = entries.Size
 			}
 		}
 		// Is this a chain underflow point?
 		for underflow, underflowRuleIdx := range table.Underflows {
 			if underflowRuleIdx == ruleIdx {
+				nflog("Found underflow %d at offset %d", underflow, entries.Size)
 				meta.Underflow[underflow] = entries.Size
 			}
 		}
@@ -176,6 +195,10 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 			// Serialize the matcher and add it to the
 			// entry.
 			serialized := marshalMatcher(matcher)
+			nflog("matcher serialized as: %v", serialized)
+			if len(serialized)%8 != 0 {
+				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
+			}
 			entry.Elems = append(entry.Elems, serialized...)
 			entry.NextOffset += uint16(len(serialized))
 			entry.TargetOffset += uint16(len(serialized))
@@ -183,18 +206,25 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 
 		// Serialize and append the target.
 		serialized := marshalTarget(rule.Target)
+		if len(serialized)%8 != 0 {
+			panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
+		}
 		entry.Elems = append(entry.Elems, serialized...)
 		entry.NextOffset += uint16(len(serialized))
 
+		nflog("Adding entry: %+v", entry)
+
 		entries.Size += uint32(entry.NextOffset)
 		entries.Entrytable = append(entries.Entrytable, entry)
 		meta.NumEntries++
 	}
 
+	nflog("Finished with an marshalled size of %d", meta.Size)
 	meta.Size = entries.Size
 	return entries, meta, nil
 }
 
+// TODO: SOMEHOW THIS IS NOT GETTING APPENDED!
 func marshalMatcher(matcher iptables.Matcher) []byte {
 	switch m := matcher.(type) {
 	case *iptables.UDPMatcher:
@@ -207,17 +237,17 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
 }
 
 func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
+	nflog("Marshalling UDP matcher: %+v", matcher)
+
 	linuxMatcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
-			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP,
+			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6,
 			// Name:      "udp",
 		},
-		Data: make([]byte, linux.SizeOfXTUDP+22),
+		Data: make([]byte, 0, linux.SizeOfXTUDP),
 	}
-	// copy(linuxMatcher.Name[:], "udp")
 	copy(linuxMatcher.Name[:], "udp")
 
-	// TODO: Must be aligned.
 	xtudp := linux.XTUDP{
 		SourcePortStart:      matcher.Data.SourcePortStart,
 		SourcePortEnd:        matcher.Data.SourcePortEnd,
@@ -225,17 +255,17 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
 		DestinationPortEnd:   matcher.Data.DestinationPortEnd,
 		InverseFlags:         matcher.Data.InverseFlags,
 	}
-	binary.Marshal(linuxMatcher.Data[:linux.SizeOfXTUDP], usermem.ByteOrder, xtudp)
-
-	if binary.Size(linuxMatcher)%64 != 0 {
-		panic(fmt.Sprintf("size is actually: %d", binary.Size(linuxMatcher)))
-	}
-
-	var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 22]byte
-	if len(buf)%64 != 0 {
-		panic(fmt.Sprintf("len is actually: %d", len(buf)))
-	}
-	binary.Marshal(buf[:], usermem.ByteOrder, linuxMatcher)
+	nflog("marshalUDPMatcher: xtudp: %+v", xtudp)
+	linuxMatcher.Data = binary.Marshal(linuxMatcher.Data, usermem.ByteOrder, xtudp)
+	nflog("marshalUDPMatcher: linuxMatcher: %+v", linuxMatcher)
+
+	// We have to pad this struct size to a multiple of 8 bytes, so we make
+	// this a little longer than it needs to be.
+	buf := make([]byte, 0, linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP+6)
+	buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher)
+	buf = append(buf, []byte{0, 0, 0, 0, 0, 0}...)
+	nflog("Marshalled into matcher of size %d", len(buf))
+	nflog("marshalUDPMatcher: buf is: %v", buf)
 	return buf[:]
 }
 
@@ -253,6 +283,8 @@ func marshalTarget(target iptables.Target) []byte {
 }
 
 func marshalStandardTarget(verdict iptables.Verdict) []byte {
+	nflog("Marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
+
 	// TODO: Must be aligned.
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
@@ -321,7 +353,7 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
 // SetEntries sets iptables rules for a single table. See
 // net/ipv4/netfilter/ip_tables.c:translate_table for reference.
 func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
-	printReplace(optVal)
+	// printReplace(optVal)
 
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
@@ -343,10 +375,14 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		return syserr.ErrInvalidArgument
 	}
 
+	nflog("Setting entries in table %q", replace.Name.String())
+
 	// Convert input into a list of rules and their offsets.
 	var offset uint32
 	var offsets []uint32
 	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+		nflog("Processing entry at offset %d", offset)
+
 		// Get the struct ipt_entry.
 		if len(optVal) < linux.SizeOfIPTEntry {
 			log.Warningf("netfilter: optVal has insufficient size for entry %d", len(optVal))
@@ -464,9 +500,10 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
 // only the matchers.
 func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) {
+	nflog("Parsing matchers of size %d", len(optVal))
 	var matchers []iptables.Matcher
 	for len(optVal) > 0 {
-		log.Infof("parseMatchers: optVal has len %d", len(optVal))
+		nflog("parseMatchers: optVal has len %d", len(optVal))
 		// Get the XTEntryMatch.
 		if len(optVal) < linux.SizeOfXTEntryMatch {
 			log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal))
@@ -475,7 +512,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 		var match linux.XTEntryMatch
 		buf := optVal[:linux.SizeOfXTEntryMatch]
 		binary.Unmarshal(buf, usermem.ByteOrder, &match)
-		log.Infof("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match)
+		nflog("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match)
 
 		// Check some invariants.
 		if match.MatchSize < linux.SizeOfXTEntryMatch {
@@ -532,6 +569,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
 func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
+	nflog("Parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
 		log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
 		return nil, syserr.ErrInvalidArgument
-- 
cgit v1.2.3


From 68514d4ba3f7c06a89a8d0cd79327ede62dae65b Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Sun, 26 Jan 2020 18:32:52 -0800
Subject: Unroll checksum computation loop.

Checksum computation is one of the most expensive bits of
packet processing. Manual unrolling of the loop provides
significant improvement in checksum speed.

Updates #1656

BenchmarkChecksum/checksum_64-12                49834124                23.6 ns/op
BenchmarkChecksum/checksum_128-12               27111997                44.1 ns/op
BenchmarkChecksum/checksum_256-12               11416683                91.5 ns/op
BenchmarkChecksum/checksum_512-12                6375298               174 ns/op
BenchmarkChecksum/checksum_1024-12               3403852               338 ns/op
BenchmarkChecksum/checksum_1500-12               2343576               493 ns/op
BenchmarkChecksum/checksum_2048-12               1730521               656 ns/op
BenchmarkChecksum/checksum_4096-12                920469              1327 ns/op
BenchmarkChecksum/checksum_8192-12                445885              2637 ns/op
BenchmarkChecksum/checksum_16384-12               226342              5268 ns/op
BenchmarkChecksum/checksum_32767-12               114210             10503 ns/op
BenchmarkChecksum/checksum_32768-12                99138             10610 ns/op
BenchmarkChecksum/checksum_65535-12                53438             21158 ns/op
BenchmarkChecksum/checksum_65536-12                52993             21067 ns/op
BenchmarkUnrolledChecksum/checksum_64-12        61035639                19.1 ns/op
BenchmarkUnrolledChecksum/checksum_128-12               36067015                33.6 ns/op
BenchmarkUnrolledChecksum/checksum_256-12               19731220                60.4 ns/op
BenchmarkUnrolledChecksum/checksum_512-12                9091291               116 ns/op
BenchmarkUnrolledChecksum/checksum_1024-12               4976406               226 ns/op
BenchmarkUnrolledChecksum/checksum_1500-12               3685224               328 ns/op
BenchmarkUnrolledChecksum/checksum_2048-12               2579108               447 ns/op
BenchmarkUnrolledChecksum/checksum_4096-12               1350475               887 ns/op
BenchmarkUnrolledChecksum/checksum_8192-12                658248              1780 ns/op
BenchmarkUnrolledChecksum/checksum_16384-12               335869              3534 ns/op
BenchmarkUnrolledChecksum/checksum_32767-12               168650              7095 ns/op
BenchmarkUnrolledChecksum/checksum_32768-12               168075              7098 ns/op
BenchmarkUnrolledChecksum/checksum_65535-12                75085             14277 ns/op
BenchmarkUnrolledChecksum/checksum_65536-12                75921             14127 ns/op

PiperOrigin-RevId: 291643290
---
 pkg/tcpip/header/checksum.go      | 124 ++++++++++++++++++++++++++++++++++++++
 pkg/tcpip/header/checksum_test.go |  62 +++++++++++++++++++
 2 files changed, 186 insertions(+)

diff --git a/pkg/tcpip/header/checksum.go b/pkg/tcpip/header/checksum.go
index 9749c7f4d..ce57b581a 100644
--- a/pkg/tcpip/header/checksum.go
+++ b/pkg/tcpip/header/checksum.go
@@ -45,6 +45,121 @@ func calculateChecksum(buf []byte, odd bool, initial uint32) (uint16, bool) {
 	return ChecksumCombine(uint16(v), uint16(v>>16)), odd
 }
 
+func unrolledCalculateChecksum(buf []byte, odd bool, initial uint32) (uint16, bool) {
+	v := initial
+
+	if odd {
+		v += uint32(buf[0])
+		buf = buf[1:]
+	}
+
+	l := len(buf)
+	odd = l&1 != 0
+	if odd {
+		l--
+		v += uint32(buf[l]) << 8
+	}
+	for (l - 64) >= 0 {
+		i := 0
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		i += 16
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		i += 16
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		i += 16
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		buf = buf[64:]
+		l = l - 64
+	}
+	if (l - 32) >= 0 {
+		i := 0
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		i += 16
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		buf = buf[32:]
+		l = l - 32
+	}
+	if (l - 16) >= 0 {
+		i := 0
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		buf = buf[16:]
+		l = l - 16
+	}
+	if (l - 8) >= 0 {
+		i := 0
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		buf = buf[8:]
+		l = l - 8
+	}
+	if (l - 4) >= 0 {
+		i := 0
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		buf = buf[4:]
+		l = l - 4
+	}
+
+	// At this point since l was even before we started unrolling
+	// there can be only two bytes left to add.
+	if l != 0 {
+		v += (uint32(buf[0]) << 8) + uint32(buf[1])
+	}
+
+	return ChecksumCombine(uint16(v), uint16(v>>16)), odd
+}
+
 // Checksum calculates the checksum (as defined in RFC 1071) of the bytes in the
 // given byte array.
 //
@@ -54,6 +169,15 @@ func Checksum(buf []byte, initial uint16) uint16 {
 	return s
 }
 
+// UnrolledChecksum calculates the checksum (as defined in RFC 1071) of the
+// bytes in the given byte array.
+//
+// The initial checksum must have been computed on an even number of bytes.
+func UnrolledChecksum(buf []byte, initial uint16) uint16 {
+	s, _ := unrolledCalculateChecksum(buf, false, uint32(initial))
+	return s
+}
+
 // ChecksumVV calculates the checksum (as defined in RFC 1071) of the bytes in
 // the given VectorizedView.
 //
diff --git a/pkg/tcpip/header/checksum_test.go b/pkg/tcpip/header/checksum_test.go
index 86b466c1c..2fbd16a65 100644
--- a/pkg/tcpip/header/checksum_test.go
+++ b/pkg/tcpip/header/checksum_test.go
@@ -17,6 +17,8 @@
 package header_test
 
 import (
+	"fmt"
+	"math/rand"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -107,3 +109,63 @@ func TestChecksumVVWithOffset(t *testing.T) {
 		})
 	}
 }
+
+func TestChecksum(t *testing.T) {
+	var bufSizes = []int{0, 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 255, 256, 257, 1023, 1024}
+	type testCase struct {
+		buf      []byte
+		initial  uint16
+		csumOrig uint16
+		csumNew  uint16
+	}
+	testCases := make([]testCase, 100000)
+	// Ensure same buffer generation for test consistency.
+	rnd := rand.New(rand.NewSource(42))
+	for i := range testCases {
+		testCases[i].buf = make([]byte, bufSizes[i%len(bufSizes)])
+		testCases[i].initial = uint16(rnd.Intn(65536))
+		rnd.Read(testCases[i].buf)
+	}
+
+	for i := range testCases {
+		testCases[i].csumOrig = header.Checksum(testCases[i].buf, testCases[i].initial)
+		testCases[i].csumNew = header.UnrolledChecksum(testCases[i].buf, testCases[i].initial)
+		if got, want := testCases[i].csumNew, testCases[i].csumOrig; got != want {
+			t.Fatalf("new checksum for (buf = %x, initial = %d) does not match old got: %d, want: %d", testCases[i].buf, testCases[i].initial, got, want)
+		}
+	}
+}
+
+func BenchmarkChecksum(b *testing.B) {
+	var bufSizes = []int{64, 128, 256, 512, 1024, 1500, 2048, 4096, 8192, 16384, 32767, 32768, 65535, 65536}
+
+	checkSumImpls := []struct {
+		fn   func([]byte, uint16) uint16
+		name string
+	}{
+		{header.Checksum, fmt.Sprintf("checksum")},
+		{header.UnrolledChecksum, fmt.Sprintf("unrolled_checksum")},
+	}
+
+	for _, csumImpl := range checkSumImpls {
+		// Ensure same buffer generation for test consistency.
+		rnd := rand.New(rand.NewSource(42))
+		for _, bufSz := range bufSizes {
+			b.Run(fmt.Sprintf("%s_%d", csumImpl.name, bufSz), func(b *testing.B) {
+				tc := struct {
+					buf     []byte
+					initial uint16
+					csum    uint16
+				}{
+					buf:     make([]byte, bufSz),
+					initial: uint16(rnd.Intn(65536)),
+				}
+				rnd.Read(tc.buf)
+				b.ResetTimer()
+				for i := 0; i < b.N; i++ {
+					tc.csum = csumImpl.fn(tc.buf, tc.initial)
+				}
+			})
+		}
+	}
+}
-- 
cgit v1.2.3


From 6b43cf791a74a746443f70f98d859c1246f87e2a Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Mon, 27 Jan 2020 05:33:03 -0800
Subject: Replace calculateChecksum w/ the unrolled version.

Fixes #1656

PiperOrigin-RevId: 291703760
---
 pkg/tcpip/header/checksum.go      | 15 +++++++++------
 pkg/tcpip/header/checksum_test.go |  6 +++---
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/pkg/tcpip/header/checksum.go b/pkg/tcpip/header/checksum.go
index ce57b581a..204285576 100644
--- a/pkg/tcpip/header/checksum.go
+++ b/pkg/tcpip/header/checksum.go
@@ -160,20 +160,23 @@ func unrolledCalculateChecksum(buf []byte, odd bool, initial uint32) (uint16, bo
 	return ChecksumCombine(uint16(v), uint16(v>>16)), odd
 }
 
-// Checksum calculates the checksum (as defined in RFC 1071) of the bytes in the
-// given byte array.
+// ChecksumOld calculates the checksum (as defined in RFC 1071) of the bytes in
+// the given byte array. This function uses a non-optimized implementation. Its
+// only retained for reference and to use as a benchmark/test. Most code should
+// use the header.Checksum function.
 //
 // The initial checksum must have been computed on an even number of bytes.
-func Checksum(buf []byte, initial uint16) uint16 {
+func ChecksumOld(buf []byte, initial uint16) uint16 {
 	s, _ := calculateChecksum(buf, false, uint32(initial))
 	return s
 }
 
-// UnrolledChecksum calculates the checksum (as defined in RFC 1071) of the
-// bytes in the given byte array.
+// Checksum calculates the checksum (as defined in RFC 1071) of the bytes in the
+// given byte array. This function uses an optimized unrolled version of the
+// checksum algorithm.
 //
 // The initial checksum must have been computed on an even number of bytes.
-func UnrolledChecksum(buf []byte, initial uint16) uint16 {
+func Checksum(buf []byte, initial uint16) uint16 {
 	s, _ := unrolledCalculateChecksum(buf, false, uint32(initial))
 	return s
 }
diff --git a/pkg/tcpip/header/checksum_test.go b/pkg/tcpip/header/checksum_test.go
index 2fbd16a65..309403482 100644
--- a/pkg/tcpip/header/checksum_test.go
+++ b/pkg/tcpip/header/checksum_test.go
@@ -128,8 +128,8 @@ func TestChecksum(t *testing.T) {
 	}
 
 	for i := range testCases {
-		testCases[i].csumOrig = header.Checksum(testCases[i].buf, testCases[i].initial)
-		testCases[i].csumNew = header.UnrolledChecksum(testCases[i].buf, testCases[i].initial)
+		testCases[i].csumOrig = header.ChecksumOld(testCases[i].buf, testCases[i].initial)
+		testCases[i].csumNew = header.Checksum(testCases[i].buf, testCases[i].initial)
 		if got, want := testCases[i].csumNew, testCases[i].csumOrig; got != want {
 			t.Fatalf("new checksum for (buf = %x, initial = %d) does not match old got: %d, want: %d", testCases[i].buf, testCases[i].initial, got, want)
 		}
@@ -143,8 +143,8 @@ func BenchmarkChecksum(b *testing.B) {
 		fn   func([]byte, uint16) uint16
 		name string
 	}{
+		{header.ChecksumOld, fmt.Sprintf("checksum_old")},
 		{header.Checksum, fmt.Sprintf("checksum")},
-		{header.UnrolledChecksum, fmt.Sprintf("unrolled_checksum")},
 	}
 
 	for _, csumImpl := range checkSumImpls {
-- 
cgit v1.2.3


From 45398b160f4ccc3148644dde5eb5e4610e6a2d9b Mon Sep 17 00:00:00 2001
From: Marek Majkowski <marek@cloudflare.com>
Date: Wed, 22 Jan 2020 12:50:28 +0000
Subject: Expose gonet.NewPacketConn, for parity with gonet.NewConn API

gonet.Conn can be created with both gonet.NewConn and gonet.Dial.
gonet.PacketConn was created only by gonet.DialUDP. This prevented
us from being able to use PacketConn in udp.NewForwarder() context.

This simple constructor - NewPacketConn, allows user to create
correct structure from that context.
---
 pkg/tcpip/adapters/gonet/gonet.go | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go
index 3bba4028b..b659cfccf 100644
--- a/pkg/tcpip/adapters/gonet/gonet.go
+++ b/pkg/tcpip/adapters/gonet/gonet.go
@@ -556,6 +556,17 @@ type PacketConn struct {
 	wq    *waiter.Queue
 }
 
+// NewPacketConn creates a new PacketConn.
+func NewPacketConn(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *PacketConn {
+	c := &PacketConn{
+		stack: s,
+		ep:    ep,
+		wq:    wq,
+	}
+	c.deadlineTimer.init()
+	return c
+}
+
 // DialUDP creates a new PacketConn.
 //
 // If laddr is nil, a local address is automatically chosen.
-- 
cgit v1.2.3


From d29e59af9fbd420e34378bcbf7ae543134070217 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 27 Jan 2020 10:04:07 -0800
Subject: Standardize on tools directory.

PiperOrigin-RevId: 291745021
---
 .bazelrc                                           |   8 +-
 BUILD                                              |  49 ++++++-
 benchmarks/defs.bzl                                |  18 ---
 benchmarks/harness/BUILD                           |  74 +++++-----
 benchmarks/harness/machine_producers/BUILD         |   4 +-
 benchmarks/runner/BUILD                            |  24 ++--
 benchmarks/tcp/BUILD                               |   3 +-
 benchmarks/workloads/ab/BUILD                      |  19 ++-
 benchmarks/workloads/absl/BUILD                    |  19 ++-
 benchmarks/workloads/curl/BUILD                    |   2 +-
 benchmarks/workloads/ffmpeg/BUILD                  |   2 +-
 benchmarks/workloads/fio/BUILD                     |  19 ++-
 benchmarks/workloads/httpd/BUILD                   |   2 +-
 benchmarks/workloads/iperf/BUILD                   |  19 ++-
 benchmarks/workloads/netcat/BUILD                  |   2 +-
 benchmarks/workloads/nginx/BUILD                   |   2 +-
 benchmarks/workloads/node/BUILD                    |   2 +-
 benchmarks/workloads/node_template/BUILD           |   2 +-
 benchmarks/workloads/redis/BUILD                   |   2 +-
 benchmarks/workloads/redisbenchmark/BUILD          |  19 ++-
 benchmarks/workloads/ruby/BUILD                    |   2 +-
 benchmarks/workloads/ruby_template/BUILD           |   2 +-
 benchmarks/workloads/sleep/BUILD                   |   2 +-
 benchmarks/workloads/sysbench/BUILD                |  19 ++-
 benchmarks/workloads/syscall/BUILD                 |  19 ++-
 benchmarks/workloads/tensorflow/BUILD              |   2 +-
 benchmarks/workloads/true/BUILD                    |   2 +-
 pkg/abi/BUILD                                      |   3 +-
 pkg/abi/linux/BUILD                                |   6 +-
 pkg/amutex/BUILD                                   |   6 +-
 pkg/atomicbitops/BUILD                             |   6 +-
 pkg/binary/BUILD                                   |   6 +-
 pkg/bits/BUILD                                     |   6 +-
 pkg/bpf/BUILD                                      |   6 +-
 pkg/compressio/BUILD                               |   6 +-
 pkg/control/client/BUILD                           |   3 +-
 pkg/control/server/BUILD                           |   3 +-
 pkg/cpuid/BUILD                                    |   8 +-
 pkg/eventchannel/BUILD                             |  16 +--
 pkg/fd/BUILD                                       |   6 +-
 pkg/fdchannel/BUILD                                |   8 +-
 pkg/fdnotifier/BUILD                               |   3 +-
 pkg/flipcall/BUILD                                 |   8 +-
 pkg/fspath/BUILD                                   |  13 +-
 pkg/gate/BUILD                                     |   4 +-
 pkg/goid/BUILD                                     |   6 +-
 pkg/ilist/BUILD                                    |   6 +-
 pkg/linewriter/BUILD                               |   6 +-
 pkg/log/BUILD                                      |   6 +-
 pkg/memutil/BUILD                                  |   3 +-
 pkg/metric/BUILD                                   |  23 +--
 pkg/p9/BUILD                                       |   6 +-
 pkg/p9/p9test/BUILD                                |   6 +-
 pkg/procid/BUILD                                   |   8 +-
 pkg/rand/BUILD                                     |   3 +-
 pkg/refs/BUILD                                     |   6 +-
 pkg/seccomp/BUILD                                  |   6 +-
 pkg/secio/BUILD                                    |   6 +-
 pkg/segment/test/BUILD                             |   6 +-
 pkg/sentry/BUILD                                   |   2 +
 pkg/sentry/arch/BUILD                              |  20 +--
 pkg/sentry/context/BUILD                           |   3 +-
 pkg/sentry/context/contexttest/BUILD               |   3 +-
 pkg/sentry/control/BUILD                           |   8 +-
 pkg/sentry/device/BUILD                            |   6 +-
 pkg/sentry/fs/BUILD                                |   6 +-
 pkg/sentry/fs/anon/BUILD                           |   3 +-
 pkg/sentry/fs/dev/BUILD                            |   3 +-
 pkg/sentry/fs/fdpipe/BUILD                         |   6 +-
 pkg/sentry/fs/filetest/BUILD                       |   3 +-
 pkg/sentry/fs/fsutil/BUILD                         |   6 +-
 pkg/sentry/fs/gofer/BUILD                          |   6 +-
 pkg/sentry/fs/host/BUILD                           |   6 +-
 pkg/sentry/fs/lock/BUILD                           |   6 +-
 pkg/sentry/fs/proc/BUILD                           |   6 +-
 pkg/sentry/fs/proc/device/BUILD                    |   3 +-
 pkg/sentry/fs/proc/seqfile/BUILD                   |   6 +-
 pkg/sentry/fs/ramfs/BUILD                          |   6 +-
 pkg/sentry/fs/sys/BUILD                            |   3 +-
 pkg/sentry/fs/timerfd/BUILD                        |   3 +-
 pkg/sentry/fs/tmpfs/BUILD                          |   6 +-
 pkg/sentry/fs/tty/BUILD                            |   6 +-
 pkg/sentry/fsimpl/ext/BUILD                        |   6 +-
 pkg/sentry/fsimpl/ext/benchmark/BUILD              |   2 +-
 pkg/sentry/fsimpl/ext/disklayout/BUILD             |   6 +-
 pkg/sentry/fsimpl/kernfs/BUILD                     |   6 +-
 pkg/sentry/fsimpl/proc/BUILD                       |   8 +-
 pkg/sentry/fsimpl/sys/BUILD                        |   6 +-
 pkg/sentry/fsimpl/testutil/BUILD                   |   5 +-
 pkg/sentry/fsimpl/tmpfs/BUILD                      |   8 +-
 pkg/sentry/hostcpu/BUILD                           |   6 +-
 pkg/sentry/hostmm/BUILD                            |   3 +-
 pkg/sentry/inet/BUILD                              |   3 +-
 pkg/sentry/kernel/BUILD                            |  24 +---
 pkg/sentry/kernel/auth/BUILD                       |   3 +-
 pkg/sentry/kernel/contexttest/BUILD                |   3 +-
 pkg/sentry/kernel/epoll/BUILD                      |   6 +-
 pkg/sentry/kernel/eventfd/BUILD                    |   6 +-
 pkg/sentry/kernel/fasync/BUILD                     |   3 +-
 pkg/sentry/kernel/futex/BUILD                      |   6 +-
 pkg/sentry/kernel/memevent/BUILD                   |  20 +--
 pkg/sentry/kernel/pipe/BUILD                       |   6 +-
 pkg/sentry/kernel/sched/BUILD                      |   6 +-
 pkg/sentry/kernel/semaphore/BUILD                  |   6 +-
 pkg/sentry/kernel/shm/BUILD                        |   3 +-
 pkg/sentry/kernel/signalfd/BUILD                   |   5 +-
 pkg/sentry/kernel/time/BUILD                       |   3 +-
 pkg/sentry/limits/BUILD                            |   6 +-
 pkg/sentry/loader/BUILD                            |   4 +-
 pkg/sentry/memmap/BUILD                            |   6 +-
 pkg/sentry/mm/BUILD                                |   6 +-
 pkg/sentry/pgalloc/BUILD                           |   6 +-
 pkg/sentry/platform/BUILD                          |   3 +-
 pkg/sentry/platform/interrupt/BUILD                |   6 +-
 pkg/sentry/platform/kvm/BUILD                      |   6 +-
 pkg/sentry/platform/kvm/testutil/BUILD             |   3 +-
 pkg/sentry/platform/ptrace/BUILD                   |   3 +-
 pkg/sentry/platform/ring0/BUILD                    |   3 +-
 pkg/sentry/platform/ring0/gen_offsets/BUILD        |   2 +-
 pkg/sentry/platform/ring0/pagetables/BUILD         |  16 +--
 pkg/sentry/platform/safecopy/BUILD                 |   6 +-
 pkg/sentry/safemem/BUILD                           |   6 +-
 pkg/sentry/sighandling/BUILD                       |   3 +-
 pkg/sentry/socket/BUILD                            |   3 +-
 pkg/sentry/socket/control/BUILD                    |   3 +-
 pkg/sentry/socket/hostinet/BUILD                   |   3 +-
 pkg/sentry/socket/netfilter/BUILD                  |   3 +-
 pkg/sentry/socket/netlink/BUILD                    |   3 +-
 pkg/sentry/socket/netlink/port/BUILD               |   6 +-
 pkg/sentry/socket/netlink/route/BUILD              |   3 +-
 pkg/sentry/socket/netlink/uevent/BUILD             |   3 +-
 pkg/sentry/socket/netstack/BUILD                   |   3 +-
 pkg/sentry/socket/unix/BUILD                       |   3 +-
 pkg/sentry/socket/unix/transport/BUILD             |   3 +-
 pkg/sentry/state/BUILD                             |   3 +-
 pkg/sentry/strace/BUILD                            |  20 +--
 pkg/sentry/syscalls/BUILD                          |   3 +-
 pkg/sentry/syscalls/linux/BUILD                    |   3 +-
 pkg/sentry/time/BUILD                              |   6 +-
 pkg/sentry/unimpl/BUILD                            |  21 +--
 pkg/sentry/uniqueid/BUILD                          |   3 +-
 pkg/sentry/usage/BUILD                             |   5 +-
 pkg/sentry/usermem/BUILD                           |   7 +-
 pkg/sentry/vfs/BUILD                               |   8 +-
 pkg/sentry/watchdog/BUILD                          |   3 +-
 pkg/sleep/BUILD                                    |   6 +-
 pkg/state/BUILD                                    |  17 +--
 pkg/state/statefile/BUILD                          |   6 +-
 pkg/sync/BUILD                                     |   6 +-
 pkg/sync/atomicptrtest/BUILD                       |   6 +-
 pkg/sync/seqatomictest/BUILD                       |   6 +-
 pkg/syserr/BUILD                                   |   3 +-
 pkg/syserror/BUILD                                 |   4 +-
 pkg/tcpip/BUILD                                    |   6 +-
 pkg/tcpip/adapters/gonet/BUILD                     |   6 +-
 pkg/tcpip/buffer/BUILD                             |   6 +-
 pkg/tcpip/checker/BUILD                            |   3 +-
 pkg/tcpip/hash/jenkins/BUILD                       |   6 +-
 pkg/tcpip/header/BUILD                             |   6 +-
 pkg/tcpip/iptables/BUILD                           |   3 +-
 pkg/tcpip/link/channel/BUILD                       |   3 +-
 pkg/tcpip/link/fdbased/BUILD                       |   6 +-
 pkg/tcpip/link/loopback/BUILD                      |   3 +-
 pkg/tcpip/link/muxed/BUILD                         |   6 +-
 pkg/tcpip/link/rawfile/BUILD                       |   3 +-
 pkg/tcpip/link/sharedmem/BUILD                     |   6 +-
 pkg/tcpip/link/sharedmem/pipe/BUILD                |   6 +-
 pkg/tcpip/link/sharedmem/queue/BUILD               |   6 +-
 pkg/tcpip/link/sniffer/BUILD                       |   3 +-
 pkg/tcpip/link/tun/BUILD                           |   3 +-
 pkg/tcpip/link/waitable/BUILD                      |   6 +-
 pkg/tcpip/network/BUILD                            |   2 +-
 pkg/tcpip/network/arp/BUILD                        |   4 +-
 pkg/tcpip/network/fragmentation/BUILD              |   6 +-
 pkg/tcpip/network/hash/BUILD                       |   3 +-
 pkg/tcpip/network/ipv4/BUILD                       |   4 +-
 pkg/tcpip/network/ipv6/BUILD                       |   6 +-
 pkg/tcpip/ports/BUILD                              |   6 +-
 pkg/tcpip/sample/tun_tcp_connect/BUILD             |   2 +-
 pkg/tcpip/sample/tun_tcp_echo/BUILD                |   2 +-
 pkg/tcpip/seqnum/BUILD                             |   3 +-
 pkg/tcpip/stack/BUILD                              |   6 +-
 pkg/tcpip/transport/icmp/BUILD                     |   3 +-
 pkg/tcpip/transport/packet/BUILD                   |   3 +-
 pkg/tcpip/transport/raw/BUILD                      |   3 +-
 pkg/tcpip/transport/tcp/BUILD                      |   4 +-
 pkg/tcpip/transport/tcp/testing/context/BUILD      |   3 +-
 pkg/tcpip/transport/tcpconntrack/BUILD             |   4 +-
 pkg/tcpip/transport/udp/BUILD                      |   4 +-
 pkg/tmutex/BUILD                                   |   6 +-
 pkg/unet/BUILD                                     |   6 +-
 pkg/urpc/BUILD                                     |   6 +-
 pkg/waiter/BUILD                                   |   6 +-
 runsc/BUILD                                        |  27 ++--
 runsc/boot/BUILD                                   |   5 +-
 runsc/boot/filter/BUILD                            |   3 +-
 runsc/boot/platforms/BUILD                         |   3 +-
 runsc/cgroup/BUILD                                 |   5 +-
 runsc/cmd/BUILD                                    |   5 +-
 runsc/console/BUILD                                |   3 +-
 runsc/container/BUILD                              |   5 +-
 runsc/container/test_app/BUILD                     |   4 +-
 runsc/criutil/BUILD                                |   3 +-
 runsc/dockerutil/BUILD                             |   3 +-
 runsc/fsgofer/BUILD                                |   9 +-
 runsc/fsgofer/filter/BUILD                         |   3 +-
 runsc/sandbox/BUILD                                |   3 +-
 runsc/specutils/BUILD                              |   5 +-
 runsc/testutil/BUILD                               |   3 +-
 runsc/version_test.sh                              |   2 +-
 scripts/common.sh                                  |   6 +-
 scripts/common_bazel.sh                            |  99 -------------
 scripts/common_build.sh                            |  99 +++++++++++++
 test/BUILD                                         |  45 +-----
 test/e2e/BUILD                                     |   5 +-
 test/image/BUILD                                   |   5 +-
 test/iptables/BUILD                                |   5 +-
 test/iptables/runner/BUILD                         |  12 +-
 test/root/BUILD                                    |   5 +-
 test/root/testdata/BUILD                           |   3 +-
 test/runtimes/BUILD                                |   4 +-
 test/runtimes/build_defs.bzl                       |   5 +-
 test/runtimes/images/proctor/BUILD                 |   4 +-
 test/syscalls/BUILD                                |   2 +-
 test/syscalls/build_defs.bzl                       |   6 +-
 test/syscalls/gtest/BUILD                          |   7 +-
 test/syscalls/linux/BUILD                          |  23 ++-
 test/syscalls/linux/arch_prctl.cc                  |   2 +
 test/syscalls/linux/rseq/BUILD                     |   5 +-
 .../linux/udp_socket_errqueue_test_case.cc         |   4 +
 test/uds/BUILD                                     |   3 +-
 test/util/BUILD                                    |  27 ++--
 test/util/save_util_linux.cc                       |   4 +
 test/util/save_util_other.cc                       |   4 +
 test/util/test_util_runfiles.cc                    |   4 +
 tools/BUILD                                        |   3 +
 tools/build/BUILD                                  |  10 ++
 tools/build/defs.bzl                               |  91 ++++++++++++
 tools/checkunsafe/BUILD                            |   3 +-
 tools/defs.bzl                                     | 154 +++++++++++++++++++++
 tools/go_generics/BUILD                            |   2 +-
 tools/go_generics/globals/BUILD                    |   4 +-
 tools/go_generics/go_merge/BUILD                   |   2 +-
 tools/go_generics/rules_tests/BUILD                |   2 +-
 tools/go_marshal/BUILD                             |   4 +-
 tools/go_marshal/README.md                         |  52 +------
 tools/go_marshal/analysis/BUILD                    |   5 +-
 tools/go_marshal/defs.bzl                          | 112 ++-------------
 tools/go_marshal/gomarshal/BUILD                   |   6 +-
 tools/go_marshal/gomarshal/generator.go            |  20 ++-
 tools/go_marshal/gomarshal/generator_tests.go      |   6 +-
 tools/go_marshal/main.go                           |  11 +-
 tools/go_marshal/marshal/BUILD                     |   5 +-
 tools/go_marshal/test/BUILD                        |   7 +-
 tools/go_marshal/test/external/BUILD               |   6 +-
 tools/go_stateify/BUILD                            |   2 +-
 tools/go_stateify/defs.bzl                         |  79 +----------
 tools/images/BUILD                                 |   2 +-
 tools/images/defs.bzl                              |   6 +-
 tools/issue_reviver/BUILD                          |   2 +-
 tools/issue_reviver/github/BUILD                   |   3 +-
 tools/issue_reviver/reviver/BUILD                  |   5 +-
 tools/workspace_status.sh                          |   2 +-
 vdso/BUILD                                         |  33 ++---
 264 files changed, 1012 insertions(+), 1380 deletions(-)
 delete mode 100644 benchmarks/defs.bzl
 delete mode 100755 scripts/common_bazel.sh
 create mode 100755 scripts/common_build.sh
 create mode 100644 tools/BUILD
 create mode 100644 tools/build/BUILD
 create mode 100644 tools/build/defs.bzl
 create mode 100644 tools/defs.bzl

diff --git a/.bazelrc b/.bazelrc
index 9c35c5e7b..ef214bcfa 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -30,10 +30,10 @@ build:remote --auth_scope="https://www.googleapis.com/auth/cloud-source-tools"
 
 # Add a custom platform and toolchain that builds in a privileged docker
 # container, which is required by our syscall tests.
-build:remote --host_platform=//test:rbe_ubuntu1604
-build:remote --extra_toolchains=//test:cc-toolchain-clang-x86_64-default
-build:remote --extra_execution_platforms=//test:rbe_ubuntu1604
-build:remote --platforms=//test:rbe_ubuntu1604
+build:remote --host_platform=//:rbe_ubuntu1604
+build:remote --extra_toolchains=//:cc-toolchain-clang-x86_64-default
+build:remote --extra_execution_platforms=//:rbe_ubuntu1604
+build:remote --platforms=//:rbe_ubuntu1604
 build:remote --crosstool_top=@rbe_default//cc:toolchain
 build:remote --jobs=50
 build:remote --remote_timeout=3600
diff --git a/BUILD b/BUILD
index 76286174f..5fd929378 100644
--- a/BUILD
+++ b/BUILD
@@ -1,8 +1,8 @@
-package(licenses = ["notice"])  # Apache 2.0
-
 load("@io_bazel_rules_go//go:def.bzl", "go_path", "nogo")
 load("@bazel_gazelle//:def.bzl", "gazelle")
 
+package(licenses = ["notice"])
+
 # The sandbox filegroup is used for sandbox-internal dependencies.
 package_group(
     name = "sandbox",
@@ -49,9 +49,52 @@ gazelle(name = "gazelle")
 # live in the tools subdirectory (unless they are standard).
 nogo(
     name = "nogo",
-    config = "tools/nogo.js",
+    config = "//tools:nogo.js",
     visibility = ["//visibility:public"],
     deps = [
         "//tools/checkunsafe",
     ],
 )
+
+# We need to define a bazel platform and toolchain to specify dockerPrivileged
+# and dockerRunAsRoot options, they are required to run tests on the RBE
+# cluster in Kokoro.
+alias(
+    name = "rbe_ubuntu1604",
+    actual = ":rbe_ubuntu1604_r346485",
+)
+
+platform(
+    name = "rbe_ubuntu1604_r346485",
+    constraint_values = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//tools/cpp:clang",
+        "@bazel_toolchains//constraints:xenial",
+        "@bazel_toolchains//constraints/sanitizers:support_msan",
+    ],
+    remote_execution_properties = """
+        properties: {
+          name: "container-image"
+          value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50"
+        }
+        properties: {
+          name: "dockerAddCapabilities"
+          value: "SYS_ADMIN"
+        }
+        properties: {
+          name: "dockerPrivileged"
+          value: "true"
+        }
+    """,
+)
+
+toolchain(
+    name = "cc-toolchain-clang-x86_64-default",
+    exec_compatible_with = [
+    ],
+    target_compatible_with = [
+    ],
+    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
diff --git a/benchmarks/defs.bzl b/benchmarks/defs.bzl
deleted file mode 100644
index 79e6cdbc8..000000000
--- a/benchmarks/defs.bzl
+++ /dev/null
@@ -1,18 +0,0 @@
-"""Provides python helper functions."""
-
-load("@pydeps//:requirements.bzl", _requirement = "requirement")
-
-def filter_deps(deps = None):
-    if deps == None:
-        deps = []
-    return [dep for dep in deps if dep]
-
-def py_library(deps = None, **kwargs):
-    return native.py_library(deps = filter_deps(deps), **kwargs)
-
-def py_test(deps = None, **kwargs):
-    return native.py_test(deps = filter_deps(deps), **kwargs)
-
-def requirement(name, direct = True):
-    """ requirement returns the required dependency. """
-    return _requirement(name)
diff --git a/benchmarks/harness/BUILD b/benchmarks/harness/BUILD
index 081a74243..52d4e42f8 100644
--- a/benchmarks/harness/BUILD
+++ b/benchmarks/harness/BUILD
@@ -1,4 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "requirement")
+load("//tools:defs.bzl", "py_library", "py_requirement")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -25,16 +25,16 @@ py_library(
     srcs = ["container.py"],
     deps = [
         "//benchmarks/workloads",
-        requirement("asn1crypto", False),
-        requirement("chardet", False),
-        requirement("certifi", False),
-        requirement("docker", True),
-        requirement("docker-pycreds", False),
-        requirement("idna", False),
-        requirement("ptyprocess", False),
-        requirement("requests", False),
-        requirement("urllib3", False),
-        requirement("websocket-client", False),
+        py_requirement("asn1crypto", False),
+        py_requirement("chardet", False),
+        py_requirement("certifi", False),
+        py_requirement("docker", True),
+        py_requirement("docker-pycreds", False),
+        py_requirement("idna", False),
+        py_requirement("ptyprocess", False),
+        py_requirement("requests", False),
+        py_requirement("urllib3", False),
+        py_requirement("websocket-client", False),
     ],
 )
 
@@ -47,17 +47,17 @@ py_library(
         "//benchmarks/harness:ssh_connection",
         "//benchmarks/harness:tunnel_dispatcher",
         "//benchmarks/harness/machine_mocks",
-        requirement("asn1crypto", False),
-        requirement("chardet", False),
-        requirement("certifi", False),
-        requirement("docker", True),
-        requirement("docker-pycreds", False),
-        requirement("idna", False),
-        requirement("ptyprocess", False),
-        requirement("requests", False),
-        requirement("six", False),
-        requirement("urllib3", False),
-        requirement("websocket-client", False),
+        py_requirement("asn1crypto", False),
+        py_requirement("chardet", False),
+        py_requirement("certifi", False),
+        py_requirement("docker", True),
+        py_requirement("docker-pycreds", False),
+        py_requirement("idna", False),
+        py_requirement("ptyprocess", False),
+        py_requirement("requests", False),
+        py_requirement("six", False),
+        py_requirement("urllib3", False),
+        py_requirement("websocket-client", False),
     ],
 )
 
@@ -66,10 +66,10 @@ py_library(
     srcs = ["ssh_connection.py"],
     deps = [
         "//benchmarks/harness",
-        requirement("bcrypt", False),
-        requirement("cffi", True),
-        requirement("paramiko", True),
-        requirement("cryptography", False),
+        py_requirement("bcrypt", False),
+        py_requirement("cffi", True),
+        py_requirement("paramiko", True),
+        py_requirement("cryptography", False),
     ],
 )
 
@@ -77,16 +77,16 @@ py_library(
     name = "tunnel_dispatcher",
     srcs = ["tunnel_dispatcher.py"],
     deps = [
-        requirement("asn1crypto", False),
-        requirement("chardet", False),
-        requirement("certifi", False),
-        requirement("docker", True),
-        requirement("docker-pycreds", False),
-        requirement("idna", False),
-        requirement("pexpect", True),
-        requirement("ptyprocess", False),
-        requirement("requests", False),
-        requirement("urllib3", False),
-        requirement("websocket-client", False),
+        py_requirement("asn1crypto", False),
+        py_requirement("chardet", False),
+        py_requirement("certifi", False),
+        py_requirement("docker", True),
+        py_requirement("docker-pycreds", False),
+        py_requirement("idna", False),
+        py_requirement("pexpect", True),
+        py_requirement("ptyprocess", False),
+        py_requirement("requests", False),
+        py_requirement("urllib3", False),
+        py_requirement("websocket-client", False),
     ],
 )
diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD
index c4e943882..48ea0ef39 100644
--- a/benchmarks/harness/machine_producers/BUILD
+++ b/benchmarks/harness/machine_producers/BUILD
@@ -1,4 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "requirement")
+load("//tools:defs.bzl", "py_library", "py_requirement")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -31,7 +31,7 @@ py_library(
     deps = [
         "//benchmarks/harness:machine",
         "//benchmarks/harness/machine_producers:machine_producer",
-        requirement("PyYAML", False),
+        py_requirement("PyYAML", False),
     ],
 )
 
diff --git a/benchmarks/runner/BUILD b/benchmarks/runner/BUILD
index e1b2ea550..fae0ca800 100644
--- a/benchmarks/runner/BUILD
+++ b/benchmarks/runner/BUILD
@@ -1,4 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("//tools:defs.bzl", "py_library", "py_requirement", "py_test")
 
 package(licenses = ["notice"])
 
@@ -28,7 +28,7 @@ py_library(
         "//benchmarks/suites:startup",
         "//benchmarks/suites:sysbench",
         "//benchmarks/suites:syscall",
-        requirement("click", True),
+        py_requirement("click", True),
     ],
 )
 
@@ -36,7 +36,7 @@ py_library(
     name = "commands",
     srcs = ["commands.py"],
     deps = [
-        requirement("click", True),
+        py_requirement("click", True),
     ],
 )
 
@@ -50,14 +50,14 @@ py_test(
     ],
     deps = [
         ":runner",
-        requirement("click", True),
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("click", True),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
diff --git a/benchmarks/tcp/BUILD b/benchmarks/tcp/BUILD
index 735d7127f..d5e401acc 100644
--- a/benchmarks/tcp/BUILD
+++ b/benchmarks/tcp/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
-load("@rules_cc//cc:defs.bzl", "cc_binary")
+load("//tools:defs.bzl", "cc_binary", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/benchmarks/workloads/ab/BUILD b/benchmarks/workloads/ab/BUILD
index 4fc0ab735..4dd91ceb3 100644
--- a/benchmarks/workloads/ab/BUILD
+++ b/benchmarks/workloads/ab/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":ab",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/absl/BUILD b/benchmarks/workloads/absl/BUILD
index 61e010096..55dae3baa 100644
--- a/benchmarks/workloads/absl/BUILD
+++ b/benchmarks/workloads/absl/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":absl",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/curl/BUILD b/benchmarks/workloads/curl/BUILD
index eb0fb6165..a70873065 100644
--- a/benchmarks/workloads/curl/BUILD
+++ b/benchmarks/workloads/curl/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/ffmpeg/BUILD b/benchmarks/workloads/ffmpeg/BUILD
index be472dfb2..7c41ba631 100644
--- a/benchmarks/workloads/ffmpeg/BUILD
+++ b/benchmarks/workloads/ffmpeg/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/fio/BUILD b/benchmarks/workloads/fio/BUILD
index de257adad..7b78e8e75 100644
--- a/benchmarks/workloads/fio/BUILD
+++ b/benchmarks/workloads/fio/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":fio",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/httpd/BUILD b/benchmarks/workloads/httpd/BUILD
index eb0fb6165..a70873065 100644
--- a/benchmarks/workloads/httpd/BUILD
+++ b/benchmarks/workloads/httpd/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/iperf/BUILD b/benchmarks/workloads/iperf/BUILD
index 8832a996c..570f40148 100644
--- a/benchmarks/workloads/iperf/BUILD
+++ b/benchmarks/workloads/iperf/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":iperf",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/netcat/BUILD b/benchmarks/workloads/netcat/BUILD
index eb0fb6165..a70873065 100644
--- a/benchmarks/workloads/netcat/BUILD
+++ b/benchmarks/workloads/netcat/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/nginx/BUILD b/benchmarks/workloads/nginx/BUILD
index eb0fb6165..a70873065 100644
--- a/benchmarks/workloads/nginx/BUILD
+++ b/benchmarks/workloads/nginx/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/node/BUILD b/benchmarks/workloads/node/BUILD
index 71cd9f519..bfcf78cf9 100644
--- a/benchmarks/workloads/node/BUILD
+++ b/benchmarks/workloads/node/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/node_template/BUILD b/benchmarks/workloads/node_template/BUILD
index ca996f068..e142f082a 100644
--- a/benchmarks/workloads/node_template/BUILD
+++ b/benchmarks/workloads/node_template/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/redis/BUILD b/benchmarks/workloads/redis/BUILD
index eb0fb6165..a70873065 100644
--- a/benchmarks/workloads/redis/BUILD
+++ b/benchmarks/workloads/redis/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/redisbenchmark/BUILD b/benchmarks/workloads/redisbenchmark/BUILD
index f5994a815..f472a4443 100644
--- a/benchmarks/workloads/redisbenchmark/BUILD
+++ b/benchmarks/workloads/redisbenchmark/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":redisbenchmark",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/ruby/BUILD b/benchmarks/workloads/ruby/BUILD
index e37d77804..a3be4fe92 100644
--- a/benchmarks/workloads/ruby/BUILD
+++ b/benchmarks/workloads/ruby/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/ruby_template/BUILD b/benchmarks/workloads/ruby_template/BUILD
index 27f7c0c46..59443b14a 100644
--- a/benchmarks/workloads/ruby_template/BUILD
+++ b/benchmarks/workloads/ruby_template/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/sleep/BUILD b/benchmarks/workloads/sleep/BUILD
index eb0fb6165..a70873065 100644
--- a/benchmarks/workloads/sleep/BUILD
+++ b/benchmarks/workloads/sleep/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/sysbench/BUILD b/benchmarks/workloads/sysbench/BUILD
index fd2f8f03d..3834af7ed 100644
--- a/benchmarks/workloads/sysbench/BUILD
+++ b/benchmarks/workloads/sysbench/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":sysbench",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/syscall/BUILD b/benchmarks/workloads/syscall/BUILD
index 5100cbb21..dba4bb1e7 100644
--- a/benchmarks/workloads/syscall/BUILD
+++ b/benchmarks/workloads/syscall/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":syscall",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/tensorflow/BUILD b/benchmarks/workloads/tensorflow/BUILD
index 026c3b316..a7b7742f4 100644
--- a/benchmarks/workloads/tensorflow/BUILD
+++ b/benchmarks/workloads/tensorflow/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/true/BUILD b/benchmarks/workloads/true/BUILD
index 221c4b9a7..eba23d325 100644
--- a/benchmarks/workloads/true/BUILD
+++ b/benchmarks/workloads/true/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD
index f5c08ea06..839f822eb 100644
--- a/pkg/abi/BUILD
+++ b/pkg/abi/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,6 +9,5 @@ go_library(
         "abi_linux.go",
         "flag.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/abi",
     visibility = ["//:sandbox"],
 )
diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 716ff22d2..1f3c0c687 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 # Package linux contains the constants and types needed to interface with a
 # Linux kernel. It should be used instead of syscall or golang.org/x/sys/unix
@@ -60,7 +59,6 @@ go_library(
         "wait.go",
         "xattr.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/abi/linux",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/abi",
@@ -73,7 +71,7 @@ go_test(
     name = "linux_test",
     size = "small",
     srcs = ["netfilter_test.go"],
-    embed = [":linux"],
+    library = ":linux",
     deps = [
         "//pkg/binary",
     ],
diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD
index d99e37b40..9612f072e 100644
--- a/pkg/amutex/BUILD
+++ b/pkg/amutex/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "amutex",
     srcs = ["amutex.go"],
-    importpath = "gvisor.dev/gvisor/pkg/amutex",
     visibility = ["//:sandbox"],
 )
 
@@ -14,6 +12,6 @@ go_test(
     name = "amutex_test",
     size = "small",
     srcs = ["amutex_test.go"],
-    embed = [":amutex"],
+    library = ":amutex",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD
index 6403c60c2..3948074ba 100644
--- a/pkg/atomicbitops/BUILD
+++ b/pkg/atomicbitops/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -11,7 +10,6 @@ go_library(
         "atomic_bitops_arm64.s",
         "atomic_bitops_common.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/atomicbitops",
     visibility = ["//:sandbox"],
 )
 
@@ -19,6 +17,6 @@ go_test(
     name = "atomicbitops_test",
     size = "small",
     srcs = ["atomic_bitops_test.go"],
-    embed = [":atomicbitops"],
+    library = ":atomicbitops",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/binary/BUILD b/pkg/binary/BUILD
index 543fb54bf..7ca2fda90 100644
--- a/pkg/binary/BUILD
+++ b/pkg/binary/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "binary",
     srcs = ["binary.go"],
-    importpath = "gvisor.dev/gvisor/pkg/binary",
     visibility = ["//:sandbox"],
 )
 
@@ -14,5 +12,5 @@ go_test(
     name = "binary_test",
     size = "small",
     srcs = ["binary_test.go"],
-    embed = [":binary"],
+    library = ":binary",
 )
diff --git a/pkg/bits/BUILD b/pkg/bits/BUILD
index 93b88a29a..63f4670d7 100644
--- a/pkg/bits/BUILD
+++ b/pkg/bits/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -15,7 +14,6 @@ go_library(
         "uint64_arch_arm64_asm.s",
         "uint64_arch_generic.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/bits",
     visibility = ["//:sandbox"],
 )
 
@@ -53,5 +51,5 @@ go_test(
     name = "bits_test",
     size = "small",
     srcs = ["uint64_test.go"],
-    embed = [":bits"],
+    library = ":bits",
 )
diff --git a/pkg/bpf/BUILD b/pkg/bpf/BUILD
index fba5643e8..2a6977f85 100644
--- a/pkg/bpf/BUILD
+++ b/pkg/bpf/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,7 +11,6 @@ go_library(
         "interpreter.go",
         "program_builder.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/bpf",
     visibility = ["//visibility:public"],
     deps = ["//pkg/abi/linux"],
 )
@@ -25,7 +23,7 @@ go_test(
         "interpreter_test.go",
         "program_builder_test.go",
     ],
-    embed = [":bpf"],
+    library = ":bpf",
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD
index 2bb581b18..1f75319a7 100644
--- a/pkg/compressio/BUILD
+++ b/pkg/compressio/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "compressio",
     srcs = ["compressio.go"],
-    importpath = "gvisor.dev/gvisor/pkg/compressio",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/binary",
@@ -18,5 +16,5 @@ go_test(
     name = "compressio_test",
     size = "medium",
     srcs = ["compressio_test.go"],
-    embed = [":compressio"],
+    library = ":compressio",
 )
diff --git a/pkg/control/client/BUILD b/pkg/control/client/BUILD
index 066d7b1a1..1b9e10ee7 100644
--- a/pkg/control/client/BUILD
+++ b/pkg/control/client/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -7,7 +7,6 @@ go_library(
     srcs = [
         "client.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/control/client",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/unet",
diff --git a/pkg/control/server/BUILD b/pkg/control/server/BUILD
index adbd1e3f8..002d2ef44 100644
--- a/pkg/control/server/BUILD
+++ b/pkg/control/server/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "server",
     srcs = ["server.go"],
-    importpath = "gvisor.dev/gvisor/pkg/control/server",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD
index ed111fd2a..43a432190 100644
--- a/pkg/cpuid/BUILD
+++ b/pkg/cpuid/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "cpu_amd64.s",
         "cpuid.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/cpuid",
     visibility = ["//:sandbox"],
     deps = ["//pkg/log"],
 )
@@ -18,7 +16,7 @@ go_test(
     name = "cpuid_test",
     size = "small",
     srcs = ["cpuid_test.go"],
-    embed = [":cpuid"],
+    library = ":cpuid",
 )
 
 go_test(
@@ -27,6 +25,6 @@ go_test(
     srcs = [
         "cpuid_parse_test.go",
     ],
-    embed = [":cpuid"],
+    library = ":cpuid",
     tags = ["manual"],
 )
diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD
index 9d68682c7..bee28b68d 100644
--- a/pkg/eventchannel/BUILD
+++ b/pkg/eventchannel/BUILD
@@ -1,6 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test", "proto_library")
 
 package(licenses = ["notice"])
 
@@ -10,7 +8,6 @@ go_library(
         "event.go",
         "rate.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/eventchannel",
     visibility = ["//:sandbox"],
     deps = [
         ":eventchannel_go_proto",
@@ -24,22 +21,15 @@ go_library(
 )
 
 proto_library(
-    name = "eventchannel_proto",
+    name = "eventchannel",
     srcs = ["event.proto"],
     visibility = ["//:sandbox"],
 )
 
-go_proto_library(
-    name = "eventchannel_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto",
-    proto = ":eventchannel_proto",
-    visibility = ["//:sandbox"],
-)
-
 go_test(
     name = "eventchannel_test",
     srcs = ["event_test.go"],
-    embed = [":eventchannel"],
+    library = ":eventchannel",
     deps = [
         "//pkg/sync",
         "@com_github_golang_protobuf//proto:go_default_library",
diff --git a/pkg/fd/BUILD b/pkg/fd/BUILD
index afa8f7659..872361546 100644
--- a/pkg/fd/BUILD
+++ b/pkg/fd/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "fd",
     srcs = ["fd.go"],
-    importpath = "gvisor.dev/gvisor/pkg/fd",
     visibility = ["//visibility:public"],
 )
 
@@ -14,5 +12,5 @@ go_test(
     name = "fd_test",
     size = "small",
     srcs = ["fd_test.go"],
-    embed = [":fd"],
+    library = ":fd",
 )
diff --git a/pkg/fdchannel/BUILD b/pkg/fdchannel/BUILD
index b0478c672..d9104ef02 100644
--- a/pkg/fdchannel/BUILD
+++ b/pkg/fdchannel/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "fdchannel",
     srcs = ["fdchannel_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/pkg/fdchannel",
     visibility = ["//visibility:public"],
 )
 
@@ -14,6 +12,6 @@ go_test(
     name = "fdchannel_test",
     size = "small",
     srcs = ["fdchannel_test.go"],
-    embed = [":fdchannel"],
+    library = ":fdchannel",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/fdnotifier/BUILD b/pkg/fdnotifier/BUILD
index 91a202a30..235dcc490 100644
--- a/pkg/fdnotifier/BUILD
+++ b/pkg/fdnotifier/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "fdnotifier.go",
         "poll_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/fdnotifier",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/sync",
diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD
index 85bd83af1..9c5ad500b 100644
--- a/pkg/flipcall/BUILD
+++ b/pkg/flipcall/BUILD
@@ -1,7 +1,6 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "flipcall",
@@ -13,7 +12,6 @@ go_library(
         "io.go",
         "packet_window_allocator.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/flipcall",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/abi/linux",
@@ -30,6 +28,6 @@ go_test(
         "flipcall_example_test.go",
         "flipcall_test.go",
     ],
-    embed = [":flipcall"],
+    library = ":flipcall",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD
index ca540363c..ee84471b2 100644
--- a/pkg/fspath/BUILD
+++ b/pkg/fspath/BUILD
@@ -1,10 +1,8 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(
-    default_visibility = ["//visibility:public"],
-    licenses = ["notice"],
-)
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])
 
 go_library(
     name = "fspath",
@@ -13,7 +11,6 @@ go_library(
         "builder_unsafe.go",
         "fspath.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/fspath",
 )
 
 go_test(
@@ -23,5 +20,5 @@ go_test(
         "builder_test.go",
         "fspath_test.go",
     ],
-    embed = [":fspath"],
+    library = ":fspath",
 )
diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD
index f22bd070d..dd3141143 100644
--- a/pkg/gate/BUILD
+++ b/pkg/gate/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -8,7 +7,6 @@ go_library(
     srcs = [
         "gate.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/gate",
     visibility = ["//visibility:public"],
 )
 
diff --git a/pkg/goid/BUILD b/pkg/goid/BUILD
index 5d31e5366..ea8d2422c 100644
--- a/pkg/goid/BUILD
+++ b/pkg/goid/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -11,7 +10,6 @@ go_library(
         "goid_race.go",
         "goid_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/goid",
     visibility = ["//visibility:public"],
 )
 
@@ -22,5 +20,5 @@ go_test(
         "empty_test.go",
         "goid_test.go",
     ],
-    embed = [":goid"],
+    library = ":goid",
 )
diff --git a/pkg/ilist/BUILD b/pkg/ilist/BUILD
index 34d2673ef..3f6eb07df 100644
--- a/pkg/ilist/BUILD
+++ b/pkg/ilist/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
     srcs = [
         "interface_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/ilist",
     visibility = ["//visibility:public"],
 )
 
@@ -41,7 +39,7 @@ go_test(
         "list_test.go",
         "test_list.go",
     ],
-    embed = [":ilist"],
+    library = ":ilist",
 )
 
 go_template(
diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD
index bcde6d308..41bf104d0 100644
--- a/pkg/linewriter/BUILD
+++ b/pkg/linewriter/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "linewriter",
     srcs = ["linewriter.go"],
-    importpath = "gvisor.dev/gvisor/pkg/linewriter",
     visibility = ["//visibility:public"],
     deps = ["//pkg/sync"],
 )
@@ -14,5 +12,5 @@ go_library(
 go_test(
     name = "linewriter_test",
     srcs = ["linewriter_test.go"],
-    embed = [":linewriter"],
+    library = ":linewriter",
 )
diff --git a/pkg/log/BUILD b/pkg/log/BUILD
index 0df0f2849..935d06963 100644
--- a/pkg/log/BUILD
+++ b/pkg/log/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,7 +11,6 @@ go_library(
         "json_k8s.go",
         "log.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/log",
     visibility = [
         "//visibility:public",
     ],
@@ -29,5 +27,5 @@ go_test(
         "json_test.go",
         "log_test.go",
     ],
-    embed = [":log"],
+    library = ":log",
 )
diff --git a/pkg/memutil/BUILD b/pkg/memutil/BUILD
index 7b50e2b28..9d07d98b4 100644
--- a/pkg/memutil/BUILD
+++ b/pkg/memutil/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "memutil",
     srcs = ["memutil_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/pkg/memutil",
     visibility = ["//visibility:public"],
     deps = ["@org_golang_x_sys//unix:go_default_library"],
 )
diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD
index 9145f3233..58305009d 100644
--- a/pkg/metric/BUILD
+++ b/pkg/metric/BUILD
@@ -1,14 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("@rules_cc//cc:defs.bzl", "cc_proto_library")
+load("//tools:defs.bzl", "go_library", "go_test", "proto_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "metric",
     srcs = ["metric.go"],
-    importpath = "gvisor.dev/gvisor/pkg/metric",
     visibility = ["//:sandbox"],
     deps = [
         ":metric_go_proto",
@@ -19,28 +15,15 @@ go_library(
 )
 
 proto_library(
-    name = "metric_proto",
+    name = "metric",
     srcs = ["metric.proto"],
     visibility = ["//:sandbox"],
 )
 
-cc_proto_library(
-    name = "metric_cc_proto",
-    visibility = ["//:sandbox"],
-    deps = [":metric_proto"],
-)
-
-go_proto_library(
-    name = "metric_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/metric/metric_go_proto",
-    proto = ":metric_proto",
-    visibility = ["//:sandbox"],
-)
-
 go_test(
     name = "metric_test",
     srcs = ["metric_test.go"],
-    embed = [":metric"],
+    library = ":metric",
     deps = [
         ":metric_go_proto",
         "//pkg/eventchannel",
diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD
index a3e05c96d..4ccc1de86 100644
--- a/pkg/p9/BUILD
+++ b/pkg/p9/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -23,7 +22,6 @@ go_library(
         "transport_flipcall.go",
         "version.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/p9",
     deps = [
         "//pkg/fd",
         "//pkg/fdchannel",
@@ -47,7 +45,7 @@ go_test(
         "transport_test.go",
         "version_test.go",
     ],
-    embed = [":p9"],
+    library = ":p9",
     deps = [
         "//pkg/fd",
         "//pkg/unet",
diff --git a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD
index f4edd68b2..7ca67cb19 100644
--- a/pkg/p9/p9test/BUILD
+++ b/pkg/p9/p9test/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_test")
+load("//tools:defs.bzl", "go_binary", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -64,7 +63,6 @@ go_library(
         "mocks.go",
         "p9test.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/p9/p9test",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/fd",
@@ -80,7 +78,7 @@ go_test(
     name = "client_test",
     size = "medium",
     srcs = ["client_test.go"],
-    embed = [":p9test"],
+    library = ":p9test",
     deps = [
         "//pkg/fd",
         "//pkg/p9",
diff --git a/pkg/procid/BUILD b/pkg/procid/BUILD
index b506813f0..aa3e3ac0b 100644
--- a/pkg/procid/BUILD
+++ b/pkg/procid/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +9,6 @@ go_library(
         "procid_amd64.s",
         "procid_arm64.s",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/procid",
     visibility = ["//visibility:public"],
 )
 
@@ -20,7 +18,7 @@ go_test(
     srcs = [
         "procid_test.go",
     ],
-    embed = [":procid"],
+    library = ":procid",
     deps = ["//pkg/sync"],
 )
 
@@ -31,6 +29,6 @@ go_test(
         "procid_net_test.go",
         "procid_test.go",
     ],
-    embed = [":procid"],
+    library = ":procid",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/rand/BUILD b/pkg/rand/BUILD
index 9d5b4859b..80b8ceb02 100644
--- a/pkg/rand/BUILD
+++ b/pkg/rand/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "rand.go",
         "rand_linux.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/rand",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/sync",
diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD
index 974d9af9b..74affc887 100644
--- a/pkg/refs/BUILD
+++ b/pkg/refs/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -23,7 +22,6 @@ go_library(
         "refcounter_state.go",
         "weak_ref_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/refs",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
@@ -35,6 +33,6 @@ go_test(
     name = "refs_test",
     size = "small",
     srcs = ["refcounter_test.go"],
-    embed = [":refs"],
+    library = ":refs",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD
index af94e944d..742c8b79b 100644
--- a/pkg/seccomp/BUILD
+++ b/pkg/seccomp/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_embed_data", "go_test")
+load("//tools:defs.bzl", "go_binary", "go_embed_data", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -27,7 +26,6 @@ go_library(
         "seccomp_rules.go",
         "seccomp_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/seccomp",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/abi/linux",
@@ -43,7 +41,7 @@ go_test(
         "seccomp_test.go",
         ":victim_data",
     ],
-    embed = [":seccomp"],
+    library = ":seccomp",
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
diff --git a/pkg/secio/BUILD b/pkg/secio/BUILD
index 22abdc69f..60f63c7a6 100644
--- a/pkg/secio/BUILD
+++ b/pkg/secio/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "full_reader.go",
         "secio.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/secio",
     visibility = ["//pkg/sentry:internal"],
 )
 
@@ -17,5 +15,5 @@ go_test(
     name = "secio_test",
     size = "small",
     srcs = ["secio_test.go"],
-    embed = [":secio"],
+    library = ":secio",
 )
diff --git a/pkg/segment/test/BUILD b/pkg/segment/test/BUILD
index a27c35e21..f2d8462d8 100644
--- a/pkg/segment/test/BUILD
+++ b/pkg/segment/test/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(
@@ -38,7 +37,6 @@ go_library(
         "int_set.go",
         "set_functions.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/segment/segment",
     deps = [
         "//pkg/state",
     ],
@@ -48,5 +46,5 @@ go_test(
     name = "segment_test",
     size = "small",
     srcs = ["segment_test.go"],
-    embed = [":segment"],
+    library = ":segment",
 )
diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD
index 2d6379c86..e8b794179 100644
--- a/pkg/sentry/BUILD
+++ b/pkg/sentry/BUILD
@@ -6,6 +6,8 @@ package(licenses = ["notice"])
 package_group(
     name = "internal",
     packages = [
+        "//cloud/gvisor/gopkg/sentry/...",
+        "//cloud/gvisor/sentry/...",
         "//pkg/sentry/...",
         "//runsc/...",
         # Code generated by go_marshal relies on go_marshal libraries.
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index 65f22af2b..51ca09b24 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -1,6 +1,4 @@
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@rules_cc//cc:defs.bzl", "cc_proto_library")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "proto_library")
 
 package(licenses = ["notice"])
 
@@ -27,7 +25,6 @@ go_library(
         "syscalls_amd64.go",
         "syscalls_arm64.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/arch",
     visibility = ["//:sandbox"],
     deps = [
         ":registers_go_proto",
@@ -44,20 +41,7 @@ go_library(
 )
 
 proto_library(
-    name = "registers_proto",
+    name = "registers",
     srcs = ["registers.proto"],
     visibility = ["//visibility:public"],
 )
-
-cc_proto_library(
-    name = "registers_cc_proto",
-    visibility = ["//visibility:public"],
-    deps = [":registers_proto"],
-)
-
-go_proto_library(
-    name = "registers_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto",
-    proto = ":registers_proto",
-    visibility = ["//visibility:public"],
-)
diff --git a/pkg/sentry/context/BUILD b/pkg/sentry/context/BUILD
index 8dc1a77b1..e13a9ce20 100644
--- a/pkg/sentry/context/BUILD
+++ b/pkg/sentry/context/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "context",
     srcs = ["context.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/context",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/amutex",
diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD
index 581e7aa96..f91a6d4ed 100644
--- a/pkg/sentry/context/contexttest/BUILD
+++ b/pkg/sentry/context/contexttest/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "contexttest",
     testonly = 1,
     srcs = ["contexttest.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/context/contexttest",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/memutil",
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index 2561a6109..e69496477 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,9 +11,8 @@ go_library(
         "proc.go",
         "state.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/control",
     visibility = [
-        "//pkg/sentry:internal",
+        "//:sandbox",
     ],
     deps = [
         "//pkg/abi/linux",
@@ -40,7 +38,7 @@ go_test(
     name = "control_test",
     size = "small",
     srcs = ["proc_test.go"],
-    embed = [":control"],
+    library = ":control",
     deps = [
         "//pkg/log",
         "//pkg/sentry/kernel/time",
diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD
index 97fa1512c..e403cbd8b 100644
--- a/pkg/sentry/device/BUILD
+++ b/pkg/sentry/device/BUILD
@@ -1,12 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "device",
     srcs = ["device.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/device",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -18,5 +16,5 @@ go_test(
     name = "device_test",
     size = "small",
     srcs = ["device_test.go"],
-    embed = [":device"],
+    library = ":device",
 )
diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD
index 7d5d72d5a..605d61dbe 100644
--- a/pkg/sentry/fs/BUILD
+++ b/pkg/sentry/fs/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -44,7 +43,6 @@ go_library(
         "splice.go",
         "sync.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -129,7 +127,7 @@ go_test(
         "mount_test.go",
         "path_test.go",
     ],
-    embed = [":fs"],
+    library = ":fs",
     deps = [
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD
index ae1c9cf76..c14e5405e 100644
--- a/pkg/sentry/fs/anon/BUILD
+++ b/pkg/sentry/fs/anon/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "anon.go",
         "device.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/anon",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index a0d9e8496..0c7247bd7 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -13,7 +13,6 @@ go_library(
         "random.go",
         "tty.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/dev",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD
index cc43de69d..25ef96299 100644
--- a/pkg/sentry/fs/fdpipe/BUILD
+++ b/pkg/sentry/fs/fdpipe/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +9,6 @@ go_library(
         "pipe_opener.go",
         "pipe_state.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/fdpipe",
     imports = ["gvisor.dev/gvisor/pkg/sentry/fs"],
     visibility = ["//pkg/sentry:internal"],
     deps = [
@@ -36,7 +34,7 @@ go_test(
         "pipe_opener_test.go",
         "pipe_test.go",
     ],
-    embed = [":fdpipe"],
+    library = ":fdpipe",
     deps = [
         "//pkg/fd",
         "//pkg/fdnotifier",
diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD
index 358dc2be3..9a7608cae 100644
--- a/pkg/sentry/fs/filetest/BUILD
+++ b/pkg/sentry/fs/filetest/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "filetest",
     testonly = 1,
     srcs = ["filetest.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/filetest",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/sentry/context",
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index 945b6270d..9142f5bdf 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -75,7 +74,6 @@ go_library(
         "inode.go",
         "inode_cached.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/fsutil",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -106,7 +104,7 @@ go_test(
         "dirty_set_test.go",
         "inode_cached_test.go",
     ],
-    embed = [":fsutil"],
+    library = ":fsutil",
     deps = [
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD
index fd870e8e1..cf48e7c03 100644
--- a/pkg/sentry/fs/gofer/BUILD
+++ b/pkg/sentry/fs/gofer/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -22,7 +21,6 @@ go_library(
         "socket.go",
         "util.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/gofer",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -56,7 +54,7 @@ go_test(
     name = "gofer_test",
     size = "small",
     srcs = ["gofer_test.go"],
-    embed = [":gofer"],
+    library = ":gofer",
     deps = [
         "//pkg/p9",
         "//pkg/p9/p9test",
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index 2b581aa69..f586f47c1 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -25,7 +24,6 @@ go_library(
         "util_arm64_unsafe.go",
         "util_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/host",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -69,7 +67,7 @@ go_test(
         "socket_test.go",
         "wait_test.go",
     ],
-    embed = [":host"],
+    library = ":host",
     deps = [
         "//pkg/fd",
         "//pkg/fdnotifier",
diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD
index 2c332a82a..ae3331737 100644
--- a/pkg/sentry/fs/lock/BUILD
+++ b/pkg/sentry/fs/lock/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -40,7 +39,6 @@ go_library(
         "lock_set.go",
         "lock_set_functions.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/lock",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/log",
@@ -56,5 +54,5 @@ go_test(
         "lock_range_test.go",
         "lock_test.go",
     ],
-    embed = [":lock"],
+    library = ":lock",
 )
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index cb37c6c6b..b06bead41 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -27,7 +26,6 @@ go_library(
         "uptime.go",
         "version.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/proc",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -63,7 +61,7 @@ go_test(
         "net_test.go",
         "sys_net_test.go",
     ],
-    embed = [":proc"],
+    library = ":proc",
     deps = [
         "//pkg/abi/linux",
         "//pkg/sentry/context",
diff --git a/pkg/sentry/fs/proc/device/BUILD b/pkg/sentry/fs/proc/device/BUILD
index 0394451d4..52c9aa93d 100644
--- a/pkg/sentry/fs/proc/device/BUILD
+++ b/pkg/sentry/fs/proc/device/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "device",
     srcs = ["device.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/proc/device",
     visibility = ["//pkg/sentry:internal"],
     deps = ["//pkg/sentry/device"],
 )
diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD
index 38b246dff..310d8dd52 100644
--- a/pkg/sentry/fs/proc/seqfile/BUILD
+++ b/pkg/sentry/fs/proc/seqfile/BUILD
@@ -1,12 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "seqfile",
     srcs = ["seqfile.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -26,7 +24,7 @@ go_test(
     name = "seqfile_test",
     size = "small",
     srcs = ["seqfile_test.go"],
-    embed = [":seqfile"],
+    library = ":seqfile",
     deps = [
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD
index 3fb7b0633..39c4b84f8 100644
--- a/pkg/sentry/fs/ramfs/BUILD
+++ b/pkg/sentry/fs/ramfs/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -11,7 +10,6 @@ go_library(
         "symlink.go",
         "tree.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ramfs",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -31,7 +29,7 @@ go_test(
     name = "ramfs_test",
     size = "small",
     srcs = ["tree_test.go"],
-    embed = [":ramfs"],
+    library = ":ramfs",
     deps = [
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD
index 25f0f124e..cc6b3bfbf 100644
--- a/pkg/sentry/fs/sys/BUILD
+++ b/pkg/sentry/fs/sys/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -10,7 +10,6 @@ go_library(
         "fs.go",
         "sys.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/sys",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD
index a215c1b95..092668e8d 100644
--- a/pkg/sentry/fs/timerfd/BUILD
+++ b/pkg/sentry/fs/timerfd/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "timerfd",
     srcs = ["timerfd.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/timerfd",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/sentry/context",
diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD
index 3400b940c..04776555f 100644
--- a/pkg/sentry/fs/tmpfs/BUILD
+++ b/pkg/sentry/fs/tmpfs/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,7 +11,6 @@ go_library(
         "inode_file.go",
         "tmpfs.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -41,7 +39,7 @@ go_test(
     name = "tmpfs_test",
     size = "small",
     srcs = ["file_test.go"],
-    embed = [":tmpfs"],
+    library = ":tmpfs",
     deps = [
         "//pkg/sentry/context",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index f6f60d0cf..29f804c6c 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -14,7 +13,6 @@ go_library(
         "slave.go",
         "terminal.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/tty",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -40,7 +38,7 @@ go_test(
     name = "tty_test",
     size = "small",
     srcs = ["tty_test.go"],
-    embed = [":tty"],
+    library = ":tty",
     deps = [
         "//pkg/abi/linux",
         "//pkg/sentry/context/contexttest",
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 903874141..a718920d5 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -32,7 +31,6 @@ go_library(
         "symlink.go",
         "utils.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -71,7 +69,7 @@ go_test(
         "//pkg/sentry/fsimpl/ext:assets/tiny.ext3",
         "//pkg/sentry/fsimpl/ext:assets/tiny.ext4",
     ],
-    embed = [":ext"],
+    library = ":ext",
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD
index 4fc8296ef..12f3990c1 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/BUILD
+++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD
index fcfaf5c3e..9bd9c76c0 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/BUILD
+++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -23,7 +22,6 @@ go_library(
         "superblock_old.go",
         "test_utils.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -44,6 +42,6 @@ go_test(
         "inode_test.go",
         "superblock_test.go",
     ],
-    embed = [":disklayout"],
+    library = ":disklayout",
     deps = ["//pkg/sentry/kernel/time"],
 )
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 66d409785..7bf83ccba 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -1,8 +1,7 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_template_instance(
     name = "slot_list",
@@ -27,7 +26,6 @@ go_library(
         "slot_list.go",
         "symlink.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index c5b79fb38..3768f55b2 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -1,7 +1,6 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "proc",
@@ -15,7 +14,6 @@ go_library(
         "tasks_net.go",
         "tasks_sys.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc",
     deps = [
         "//pkg/abi/linux",
         "//pkg/log",
@@ -47,7 +45,7 @@ go_test(
         "tasks_sys_test.go",
         "tasks_test.go",
     ],
-    embed = [":proc"],
+    library = ":proc",
     deps = [
         "//pkg/abi/linux",
         "//pkg/fspath",
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index ee3c842bd..beda141f1 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -1,14 +1,12 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "sys",
     srcs = [
         "sys.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys",
     deps = [
         "//pkg/abi/linux",
         "//pkg/sentry/context",
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
index 4e70d84a7..12053a5b6 100644
--- a/pkg/sentry/fsimpl/testutil/BUILD
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -1,6 +1,6 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "testutil",
@@ -9,7 +9,6 @@ go_library(
         "kernel.go",
         "testutil.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 691476b4f..857e98bc5 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -1,8 +1,7 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_template_instance(
     name = "dentry_list",
@@ -28,7 +27,6 @@ go_library(
         "symlink.go",
         "tmpfs.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs",
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
@@ -81,7 +79,7 @@ go_test(
         "regular_file_test.go",
         "stat_test.go",
     ],
-    embed = [":tmpfs"],
+    library = ":tmpfs",
     deps = [
         "//pkg/abi/linux",
         "//pkg/fspath",
diff --git a/pkg/sentry/hostcpu/BUILD b/pkg/sentry/hostcpu/BUILD
index 359468ccc..e6933aa70 100644
--- a/pkg/sentry/hostcpu/BUILD
+++ b/pkg/sentry/hostcpu/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +9,6 @@ go_library(
         "getcpu_arm64.s",
         "hostcpu.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/hostcpu",
     visibility = ["//:sandbox"],
 )
 
@@ -18,5 +16,5 @@ go_test(
     name = "hostcpu_test",
     size = "small",
     srcs = ["hostcpu_test.go"],
-    embed = [":hostcpu"],
+    library = ":hostcpu",
 )
diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD
index 67831d5a1..a145a5ca3 100644
--- a/pkg/sentry/hostmm/BUILD
+++ b/pkg/sentry/hostmm/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "cgroup.go",
         "hostmm.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/hostmm",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/fd",
diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index 8d60ad4ad..aa621b724 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -12,7 +12,6 @@ go_library(
         "inet.go",
         "test_stack.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/inet",
     deps = [
         "//pkg/sentry/context",
         "//pkg/tcpip/stack",
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index ac85ba0c8..cebaccd92 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -1,8 +1,5 @@
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("@rules_cc//cc:defs.bzl", "cc_proto_library")
+load("//tools:defs.bzl", "go_library", "go_test", "proto_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -78,26 +75,12 @@ go_template_instance(
 )
 
 proto_library(
-    name = "uncaught_signal_proto",
+    name = "uncaught_signal",
     srcs = ["uncaught_signal.proto"],
     visibility = ["//visibility:public"],
     deps = ["//pkg/sentry/arch:registers_proto"],
 )
 
-cc_proto_library(
-    name = "uncaught_signal_cc_proto",
-    visibility = ["//visibility:public"],
-    deps = [":uncaught_signal_proto"],
-)
-
-go_proto_library(
-    name = "uncaught_signal_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto",
-    proto = ":uncaught_signal_proto",
-    visibility = ["//visibility:public"],
-    deps = ["//pkg/sentry/arch:registers_go_proto"],
-)
-
 go_library(
     name = "kernel",
     srcs = [
@@ -156,7 +139,6 @@ go_library(
         "vdso.go",
         "version.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel",
     imports = [
         "gvisor.dev/gvisor/pkg/bpf",
         "gvisor.dev/gvisor/pkg/sentry/device",
@@ -227,7 +209,7 @@ go_test(
         "task_test.go",
         "timekeeper_test.go",
     ],
-    embed = [":kernel"],
+    library = ":kernel",
     deps = [
         "//pkg/abi",
         "//pkg/sentry/arch",
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 1aa72fa47..64537c9be 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -1,5 +1,5 @@
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -57,7 +57,6 @@ go_library(
         "id_map_set.go",
         "user_namespace.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/auth",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD
index 3a88a585c..daff608d7 100644
--- a/pkg/sentry/kernel/contexttest/BUILD
+++ b/pkg/sentry/kernel/contexttest/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "contexttest",
     testonly = 1,
     srcs = ["contexttest.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/sentry/context",
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD
index c47f6b6fc..19e16ab3a 100644
--- a/pkg/sentry/kernel/epoll/BUILD
+++ b/pkg/sentry/kernel/epoll/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -23,7 +22,6 @@ go_library(
         "epoll_list.go",
         "epoll_state.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/epoll",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/refs",
@@ -43,7 +41,7 @@ go_test(
     srcs = [
         "epoll_test.go",
     ],
-    embed = [":epoll"],
+    library = ":epoll",
     deps = [
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fs/filetest",
diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD
index c831fbab2..ee2d74864 100644
--- a/pkg/sentry/kernel/eventfd/BUILD
+++ b/pkg/sentry/kernel/eventfd/BUILD
@@ -1,12 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "eventfd",
     srcs = ["eventfd.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -26,7 +24,7 @@ go_test(
     name = "eventfd_test",
     size = "small",
     srcs = ["eventfd_test.go"],
-    embed = [":eventfd"],
+    library = ":eventfd",
     deps = [
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/usermem",
diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD
index 6b36bc63e..b9126e946 100644
--- a/pkg/sentry/kernel/fasync/BUILD
+++ b/pkg/sentry/kernel/fasync/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "fasync",
     srcs = ["fasync.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/fasync",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index 50db443ce..f413d8ae2 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -34,7 +33,6 @@ go_library(
         "futex.go",
         "waiter_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/futex",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -51,7 +49,7 @@ go_test(
     name = "futex_test",
     size = "small",
     srcs = ["futex_test.go"],
-    embed = [":futex"],
+    library = ":futex",
     deps = [
         "//pkg/sentry/usermem",
         "//pkg/sync",
diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD
index 7f36252a9..4486848d2 100644
--- a/pkg/sentry/kernel/memevent/BUILD
+++ b/pkg/sentry/kernel/memevent/BUILD
@@ -1,13 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@rules_cc//cc:defs.bzl", "cc_proto_library")
+load("//tools:defs.bzl", "go_library", "proto_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "memevent",
     srcs = ["memory_events.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/memevent",
     visibility = ["//:sandbox"],
     deps = [
         ":memory_events_go_proto",
@@ -21,20 +18,7 @@ go_library(
 )
 
 proto_library(
-    name = "memory_events_proto",
+    name = "memory_events",
     srcs = ["memory_events.proto"],
     visibility = ["//visibility:public"],
 )
-
-cc_proto_library(
-    name = "memory_events_cc_proto",
-    visibility = ["//visibility:public"],
-    deps = [":memory_events_proto"],
-)
-
-go_proto_library(
-    name = "memory_events_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto",
-    proto = ":memory_events_proto",
-    visibility = ["//visibility:public"],
-)
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 5eeaeff66..2c7b6206f 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -30,7 +29,6 @@ go_library(
         "vfs.go",
         "writer.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/pipe",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -57,7 +55,7 @@ go_test(
         "node_test.go",
         "pipe_test.go",
     ],
-    embed = [":pipe"],
+    library = ":pipe",
     deps = [
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD
index 98ea7a0d8..1b82e087b 100644
--- a/pkg/sentry/kernel/sched/BUILD
+++ b/pkg/sentry/kernel/sched/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "cpuset.go",
         "sched.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/sched",
     visibility = ["//pkg/sentry:internal"],
 )
 
@@ -17,5 +15,5 @@ go_test(
     name = "sched_test",
     size = "small",
     srcs = ["cpuset_test.go"],
-    embed = [":sched"],
+    library = ":sched",
 )
diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD
index 13a961594..76e19b551 100644
--- a/pkg/sentry/kernel/semaphore/BUILD
+++ b/pkg/sentry/kernel/semaphore/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -22,7 +21,6 @@ go_library(
         "semaphore.go",
         "waiter_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -40,7 +38,7 @@ go_test(
     name = "semaphore_test",
     size = "small",
     srcs = ["semaphore_test.go"],
-    embed = [":semaphore"],
+    library = ":semaphore",
     deps = [
         "//pkg/abi/linux",
         "//pkg/sentry/context",
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index 7321b22ed..5547c5abf 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "device.go",
         "shm.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/shm",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD
index 89e4d84b1..5d44773d4 100644
--- a/pkg/sentry/kernel/signalfd/BUILD
+++ b/pkg/sentry/kernel/signalfd/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "signalfd",
     srcs = ["signalfd.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD
index 4e4de0512..d49594d9f 100644
--- a/pkg/sentry/kernel/time/BUILD
+++ b/pkg/sentry/kernel/time/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "context.go",
         "time.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/time",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD
index 9fa841e8b..67869757f 100644
--- a/pkg/sentry/limits/BUILD
+++ b/pkg/sentry/limits/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +9,6 @@ go_library(
         "limits.go",
         "linux.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/limits",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
@@ -25,5 +23,5 @@ go_test(
     srcs = [
         "limits_test.go",
     ],
-    embed = [":limits"],
+    library = ":limits",
 )
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
index 2890393bd..d4ad2bd6c 100644
--- a/pkg/sentry/loader/BUILD
+++ b/pkg/sentry/loader/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_embed_data")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_embed_data", "go_library")
 
 package(licenses = ["notice"])
 
@@ -20,7 +19,6 @@ go_library(
         "vdso_state.go",
         ":vdso_bin",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/loader",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi",
diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD
index 112794e9c..f9a65f086 100644
--- a/pkg/sentry/memmap/BUILD
+++ b/pkg/sentry/memmap/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -37,7 +36,6 @@ go_library(
         "mapping_set_impl.go",
         "memmap.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/memmap",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/log",
@@ -52,6 +50,6 @@ go_test(
     name = "memmap_test",
     size = "small",
     srcs = ["mapping_set_test.go"],
-    embed = [":memmap"],
+    library = ":memmap",
     deps = ["//pkg/sentry/usermem"],
 )
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index 83e248431..bd6399fa2 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -96,7 +95,6 @@ go_library(
         "vma.go",
         "vma_set.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/mm",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -128,7 +126,7 @@ go_test(
     name = "mm_test",
     size = "small",
     srcs = ["mm_test.go"],
-    embed = [":mm"],
+    library = ":mm",
     deps = [
         "//pkg/sentry/arch",
         "//pkg/sentry/context",
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index a9a2642c5..02385a3ce 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -60,7 +59,6 @@ go_library(
         "save_restore.go",
         "usage_set.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/pgalloc",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/log",
@@ -82,6 +80,6 @@ go_test(
     name = "pgalloc_test",
     size = "small",
     srcs = ["pgalloc_test.go"],
-    embed = [":pgalloc"],
+    library = ":pgalloc",
     deps = ["//pkg/sentry/usermem"],
 )
diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD
index 157bffa81..006450b2d 100644
--- a/pkg/sentry/platform/BUILD
+++ b/pkg/sentry/platform/BUILD
@@ -1,5 +1,5 @@
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -22,7 +22,6 @@ go_library(
         "mmap_min_addr.go",
         "platform.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD
index 85e882df9..83b385f14 100644
--- a/pkg/sentry/platform/interrupt/BUILD
+++ b/pkg/sentry/platform/interrupt/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -8,7 +7,6 @@ go_library(
     srcs = [
         "interrupt.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/interrupt",
     visibility = ["//pkg/sentry:internal"],
     deps = ["//pkg/sync"],
 )
@@ -17,5 +15,5 @@ go_test(
     name = "interrupt_test",
     size = "small",
     srcs = ["interrupt_test.go"],
-    embed = [":interrupt"],
+    library = ":interrupt",
 )
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 6a358d1d4..a4532a766 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -38,7 +37,6 @@ go_library(
         "physical_map_arm64.go",
         "virtual_map.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/kvm",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -65,7 +63,7 @@ go_test(
         "kvm_test.go",
         "virtual_map_test.go",
     ],
-    embed = [":kvm"],
+    library = ":kvm",
     tags = [
         "manual",
         "nogotsan",
diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD
index b0e45f159..f7605df8a 100644
--- a/pkg/sentry/platform/kvm/testutil/BUILD
+++ b/pkg/sentry/platform/kvm/testutil/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -12,6 +12,5 @@ go_library(
         "testutil_arm64.go",
         "testutil_arm64.s",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil",
     visibility = ["//pkg/sentry/platform/kvm:__pkg__"],
 )
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index cd13390c3..3bcc5e040 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -20,7 +20,6 @@ go_library(
         "subprocess_linux_unsafe.go",
         "subprocess_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ptrace",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD
index 87f4552b5..6dee8fcc5 100644
--- a/pkg/sentry/platform/ring0/BUILD
+++ b/pkg/sentry/platform/ring0/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -74,7 +74,6 @@ go_library(
         "lib_arm64.s",
         "ring0.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/cpuid",
diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD
index 42076fb04..147311ed3 100644
--- a/pkg/sentry/platform/ring0/gen_offsets/BUILD
+++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 387a7f6c3..8b5cdd6c1 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -1,17 +1,14 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test", "select_arch")
 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
 
-config_setting(
-    name = "aarch64",
-    constraint_values = ["@bazel_tools//platforms:aarch64"],
-)
-
 go_template(
     name = "generic_walker",
-    srcs = ["walker_amd64.go"],
+    srcs = select_arch(
+        amd64 = ["walker_amd64.go"],
+        arm64 = ["walker_amd64.go"],
+    ),
     opt_types = [
         "Visitor",
     ],
@@ -91,7 +88,6 @@ go_library(
         "walker_map.go",
         "walker_unmap.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables",
     visibility = [
         "//pkg/sentry/platform/kvm:__subpackages__",
         "//pkg/sentry/platform/ring0:__subpackages__",
@@ -111,6 +107,6 @@ go_test(
         "pagetables_test.go",
         "walker_check.go",
     ],
-    embed = [":pagetables"],
+    library = ":pagetables",
     deps = ["//pkg/sentry/usermem"],
 )
diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD
index 6769cd0a5..b8747585b 100644
--- a/pkg/sentry/platform/safecopy/BUILD
+++ b/pkg/sentry/platform/safecopy/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -17,7 +16,6 @@ go_library(
         "sighandler_amd64.s",
         "sighandler_arm64.s",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/safecopy",
     visibility = ["//pkg/sentry:internal"],
     deps = ["//pkg/syserror"],
 )
@@ -27,5 +25,5 @@ go_test(
     srcs = [
         "safecopy_test.go",
     ],
-    embed = [":safecopy"],
+    library = ":safecopy",
 )
diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD
index 884020f7b..3ab76da97 100644
--- a/pkg/sentry/safemem/BUILD
+++ b/pkg/sentry/safemem/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -11,7 +10,6 @@ go_library(
         "safemem.go",
         "seq_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/safemem",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/sentry/platform/safecopy",
@@ -25,5 +23,5 @@ go_test(
         "io_test.go",
         "seq_test.go",
     ],
-    embed = [":safemem"],
+    library = ":safemem",
 )
diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sentry/sighandling/BUILD
index f561670c7..6c38a3f44 100644
--- a/pkg/sentry/sighandling/BUILD
+++ b/pkg/sentry/sighandling/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "sighandling.go",
         "sighandling_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/sighandling",
     visibility = ["//pkg/sentry:internal"],
     deps = ["//pkg/abi/linux"],
 )
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index 26176b10d..8e2b97afb 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "socket",
     srcs = ["socket.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 357517ed4..3850f6345 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "control",
     srcs = ["control.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/control",
     imports = [
         "gvisor.dev/gvisor/pkg/sentry/fs",
     ],
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index 4c44c7c0f..42bf7be6a 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -12,7 +12,6 @@ go_library(
         "socket_unsafe.go",
         "stack.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/hostinet",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index b70047d81..ed34a8308 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -7,7 +7,6 @@ go_library(
     srcs = [
         "netfilter.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netfilter",
     # This target depends on netstack and should only be used by epsocket,
     # which is allowed to depend on netstack.
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 103933144..baaac13c6 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,6 @@ go_library(
         "provider.go",
         "socket.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD
index 2d9f4ba9b..3a22923d8 100644
--- a/pkg/sentry/socket/netlink/port/BUILD
+++ b/pkg/sentry/socket/netlink/port/BUILD
@@ -1,12 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "port",
     srcs = ["port.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port",
     visibility = ["//pkg/sentry:internal"],
     deps = ["//pkg/sync"],
 )
@@ -14,5 +12,5 @@ go_library(
 go_test(
     name = "port_test",
     srcs = ["port_test.go"],
-    embed = [":port"],
+    library = ":port",
 )
diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD
index 1d4912753..2137c7aeb 100644
--- a/pkg/sentry/socket/netlink/route/BUILD
+++ b/pkg/sentry/socket/netlink/route/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "route",
     srcs = ["protocol.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/socket/netlink/uevent/BUILD b/pkg/sentry/socket/netlink/uevent/BUILD
index 0777f3baf..73fbdf1eb 100644
--- a/pkg/sentry/socket/netlink/uevent/BUILD
+++ b/pkg/sentry/socket/netlink/uevent/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "uevent",
     srcs = ["protocol.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index f78784569..e3d1f90cb 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -11,7 +11,6 @@ go_library(
         "save_restore.go",
         "stack.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netstack",
     visibility = [
         "//pkg/sentry:internal",
     ],
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index 5b6a154f6..bade18686 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,6 @@ go_library(
         "io.go",
         "unix.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/unix",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index d7ba95dff..4bdfc9208 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -25,7 +25,6 @@ go_library(
         "transport_message_list.go",
         "unix.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD
index 88765f4d6..0ea4aab8b 100644
--- a/pkg/sentry/state/BUILD
+++ b/pkg/sentry/state/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,6 @@ go_library(
         "state_metadata.go",
         "state_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/state",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index aa1ac720c..ff6fafa63 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -1,6 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@rules_cc//cc:defs.bzl", "cc_proto_library")
+load("//tools:defs.bzl", "go_library", "proto_library")
 
 package(licenses = ["notice"])
 
@@ -21,7 +19,6 @@ go_library(
         "strace.go",
         "syscalls.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/strace",
     visibility = ["//:sandbox"],
     deps = [
         ":strace_go_proto",
@@ -42,20 +39,7 @@ go_library(
 )
 
 proto_library(
-    name = "strace_proto",
+    name = "strace",
     srcs = ["strace.proto"],
     visibility = ["//visibility:public"],
 )
-
-cc_proto_library(
-    name = "strace_cc_proto",
-    visibility = ["//visibility:public"],
-    deps = [":strace_proto"],
-)
-
-go_proto_library(
-    name = "strace_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto",
-    proto = ":strace_proto",
-    visibility = ["//visibility:public"],
-)
diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD
index 79d972202..b8d1bd415 100644
--- a/pkg/sentry/syscalls/BUILD
+++ b/pkg/sentry/syscalls/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "epoll.go",
         "syscalls.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 917f74e07..7d74e0f70 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -57,7 +57,6 @@ go_library(
         "sys_xattr.go",
         "timespec.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls/linux",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi",
diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD
index 3cde3a0be..04f81a35b 100644
--- a/pkg/sentry/time/BUILD
+++ b/pkg/sentry/time/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -31,7 +30,6 @@ go_library(
         "tsc_amd64.s",
         "tsc_arm64.s",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/time",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
@@ -48,5 +46,5 @@ go_test(
         "parameters_test.go",
         "sampler_test.go",
     ],
-    embed = [":time"],
+    library = ":time",
 )
diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD
index fc7614fff..370fa6ec5 100644
--- a/pkg/sentry/unimpl/BUILD
+++ b/pkg/sentry/unimpl/BUILD
@@ -1,34 +1,17 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@rules_cc//cc:defs.bzl", "cc_proto_library")
+load("//tools:defs.bzl", "go_library", "proto_library")
 
 package(licenses = ["notice"])
 
 proto_library(
-    name = "unimplemented_syscall_proto",
+    name = "unimplemented_syscall",
     srcs = ["unimplemented_syscall.proto"],
     visibility = ["//visibility:public"],
     deps = ["//pkg/sentry/arch:registers_proto"],
 )
 
-cc_proto_library(
-    name = "unimplemented_syscall_cc_proto",
-    visibility = ["//visibility:public"],
-    deps = [":unimplemented_syscall_proto"],
-)
-
-go_proto_library(
-    name = "unimplemented_syscall_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto",
-    proto = ":unimplemented_syscall_proto",
-    visibility = ["//visibility:public"],
-    deps = ["//pkg/sentry/arch:registers_go_proto"],
-)
-
 go_library(
     name = "unimpl",
     srcs = ["events.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/unimpl",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD
index 86a87edd4..e9c18f170 100644
--- a/pkg/sentry/uniqueid/BUILD
+++ b/pkg/sentry/uniqueid/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "uniqueid",
     srcs = ["context.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/uniqueid",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/sentry/context",
diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD
index 5518ac3d0..099315613 100644
--- a/pkg/sentry/usage/BUILD
+++ b/pkg/sentry/usage/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -11,9 +11,8 @@ go_library(
         "memory_unsafe.go",
         "usage.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/usage",
     visibility = [
-        "//pkg/sentry:internal",
+        "//:sandbox",
     ],
     deps = [
         "//pkg/bits",
diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD
index 684f59a6b..c8322e29e 100644
--- a/pkg/sentry/usermem/BUILD
+++ b/pkg/sentry/usermem/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -29,7 +28,6 @@ go_library(
         "usermem_unsafe.go",
         "usermem_x86.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/usermem",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/atomicbitops",
@@ -38,7 +36,6 @@ go_library(
         "//pkg/sentry/context",
         "//pkg/sentry/safemem",
         "//pkg/syserror",
-        "//pkg/tcpip/buffer",
     ],
 )
 
@@ -49,7 +46,7 @@ go_test(
         "addr_range_seq_test.go",
         "usermem_test.go",
     ],
-    embed = [":usermem"],
+    library = ":usermem",
     deps = [
         "//pkg/sentry/context",
         "//pkg/sentry/safemem",
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 35c7be259..51acdc4e9 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -1,7 +1,6 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "vfs",
@@ -24,7 +23,6 @@ go_library(
         "testutil.go",
         "vfs.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/vfs",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -47,7 +45,7 @@ go_test(
         "file_description_impl_util_test.go",
         "mount_test.go",
     ],
-    embed = [":vfs"],
+    library = ":vfs",
     deps = [
         "//pkg/abi/linux",
         "//pkg/sentry/context",
diff --git a/pkg/sentry/watchdog/BUILD b/pkg/sentry/watchdog/BUILD
index 28f21f13d..1c5a1c9b6 100644
--- a/pkg/sentry/watchdog/BUILD
+++ b/pkg/sentry/watchdog/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "watchdog",
     srcs = ["watchdog.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/watchdog",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sleep/BUILD b/pkg/sleep/BUILD
index a23c86fb1..e131455f7 100644
--- a/pkg/sleep/BUILD
+++ b/pkg/sleep/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,7 +11,6 @@ go_library(
         "commit_noasm.go",
         "sleep_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sleep",
     visibility = ["//:sandbox"],
 )
 
@@ -22,5 +20,5 @@ go_test(
     srcs = [
         "sleep_test.go",
     ],
-    embed = [":sleep"],
+    library = ":sleep",
 )
diff --git a/pkg/state/BUILD b/pkg/state/BUILD
index be93750bf..921af9d63 100644
--- a/pkg/state/BUILD
+++ b/pkg/state/BUILD
@@ -1,6 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test", "proto_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -49,7 +47,7 @@ go_library(
         "state.go",
         "stats.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/state",
+    stateify = False,
     visibility = ["//:sandbox"],
     deps = [
         ":object_go_proto",
@@ -58,21 +56,14 @@ go_library(
 )
 
 proto_library(
-    name = "object_proto",
+    name = "object",
     srcs = ["object.proto"],
     visibility = ["//:sandbox"],
 )
 
-go_proto_library(
-    name = "object_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/state/object_go_proto",
-    proto = ":object_proto",
-    visibility = ["//:sandbox"],
-)
-
 go_test(
     name = "state_test",
     timeout = "long",
     srcs = ["state_test.go"],
-    embed = [":state"],
+    library = ":state",
 )
diff --git a/pkg/state/statefile/BUILD b/pkg/state/statefile/BUILD
index 8a865d229..e7581c09b 100644
--- a/pkg/state/statefile/BUILD
+++ b/pkg/state/statefile/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "statefile",
     srcs = ["statefile.go"],
-    importpath = "gvisor.dev/gvisor/pkg/state/statefile",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/binary",
@@ -18,6 +16,6 @@ go_test(
     name = "statefile_test",
     size = "small",
     srcs = ["statefile_test.go"],
-    embed = [":statefile"],
+    library = ":statefile",
     deps = ["//pkg/compressio"],
 )
diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD
index 97c4b3b1e..5340cf0d6 100644
--- a/pkg/sync/BUILD
+++ b/pkg/sync/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template")
 
 package(
@@ -40,7 +39,6 @@ go_library(
         "syncutil.go",
         "tmutex_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sync",
 )
 
 go_test(
@@ -51,5 +49,5 @@ go_test(
         "seqcount_test.go",
         "tmutex_test.go",
     ],
-    embed = [":sync"],
+    library = ":sync",
 )
diff --git a/pkg/sync/atomicptrtest/BUILD b/pkg/sync/atomicptrtest/BUILD
index 418eda29c..e97553254 100644
--- a/pkg/sync/atomicptrtest/BUILD
+++ b/pkg/sync/atomicptrtest/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -18,12 +17,11 @@ go_template_instance(
 go_library(
     name = "atomicptr",
     srcs = ["atomicptr_int_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sync/atomicptr",
 )
 
 go_test(
     name = "atomicptr_test",
     size = "small",
     srcs = ["atomicptr_test.go"],
-    embed = [":atomicptr"],
+    library = ":atomicptr",
 )
diff --git a/pkg/sync/seqatomictest/BUILD b/pkg/sync/seqatomictest/BUILD
index eba21518d..5c38c783e 100644
--- a/pkg/sync/seqatomictest/BUILD
+++ b/pkg/sync/seqatomictest/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -18,7 +17,6 @@ go_template_instance(
 go_library(
     name = "seqatomic",
     srcs = ["seqatomic_int_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sync/seqatomic",
     deps = [
         "//pkg/sync",
     ],
@@ -28,6 +26,6 @@ go_test(
     name = "seqatomic_test",
     size = "small",
     srcs = ["seqatomic_test.go"],
-    embed = [":seqatomic"],
+    library = ":seqatomic",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/syserr/BUILD b/pkg/syserr/BUILD
index 5665ad4ee..7d760344a 100644
--- a/pkg/syserr/BUILD
+++ b/pkg/syserr/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,6 @@ go_library(
         "netstack.go",
         "syserr.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/syserr",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/syserror/BUILD b/pkg/syserror/BUILD
index bd3f9fd28..b13c15d9b 100644
--- a/pkg/syserror/BUILD
+++ b/pkg/syserror/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "syserror",
     srcs = ["syserror.go"],
-    importpath = "gvisor.dev/gvisor/pkg/syserror",
     visibility = ["//visibility:public"],
 )
 
diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
index 23e4b09e7..26f7ba86b 100644
--- a/pkg/tcpip/BUILD
+++ b/pkg/tcpip/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,7 +11,6 @@ go_library(
         "time_unsafe.go",
         "timer.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/sync",
@@ -25,7 +23,7 @@ go_test(
     name = "tcpip_test",
     size = "small",
     srcs = ["tcpip_test.go"],
-    embed = [":tcpip"],
+    library = ":tcpip",
 )
 
 go_test(
diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD
index 3df7d18d3..a984f1712 100644
--- a/pkg/tcpip/adapters/gonet/BUILD
+++ b/pkg/tcpip/adapters/gonet/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "gonet",
     srcs = ["gonet.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/sync",
@@ -23,7 +21,7 @@ go_test(
     name = "gonet_test",
     size = "small",
     srcs = ["gonet_test.go"],
-    embed = [":gonet"],
+    library = ":gonet",
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/header",
diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD
index d6c31bfa2..563bc78ea 100644
--- a/pkg/tcpip/buffer/BUILD
+++ b/pkg/tcpip/buffer/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "prependable.go",
         "view.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/buffer",
     visibility = ["//visibility:public"],
 )
 
@@ -17,5 +15,5 @@ go_test(
     name = "buffer_test",
     size = "small",
     srcs = ["view_test.go"],
-    embed = [":buffer"],
+    library = ":buffer",
 )
diff --git a/pkg/tcpip/checker/BUILD b/pkg/tcpip/checker/BUILD
index b6fa6fc37..ed434807f 100644
--- a/pkg/tcpip/checker/BUILD
+++ b/pkg/tcpip/checker/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "checker",
     testonly = 1,
     srcs = ["checker.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/checker",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
diff --git a/pkg/tcpip/hash/jenkins/BUILD b/pkg/tcpip/hash/jenkins/BUILD
index e648efa71..ff2719291 100644
--- a/pkg/tcpip/hash/jenkins/BUILD
+++ b/pkg/tcpip/hash/jenkins/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "jenkins",
     srcs = ["jenkins.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins",
     visibility = ["//visibility:public"],
 )
 
@@ -16,5 +14,5 @@ go_test(
     srcs = [
         "jenkins_test.go",
     ],
-    embed = [":jenkins"],
+    library = ":jenkins",
 )
diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index cd747d100..9da0d71f8 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -24,7 +23,6 @@ go_library(
         "tcp.go",
         "udp.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/header",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
@@ -59,7 +57,7 @@ go_test(
         "eth_test.go",
         "ndp_test.go",
     ],
-    embed = [":header"],
+    library = ":header",
     deps = [
         "//pkg/tcpip",
         "@com_github_google_go-cmp//cmp:go_default_library",
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index 297eaccaf..d1b73cfdf 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,6 @@ go_library(
         "targets.go",
         "types.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/iptables",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD
index 7dbc05754..3974c464e 100644
--- a/pkg/tcpip/link/channel/BUILD
+++ b/pkg/tcpip/link/channel/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "channel",
     srcs = ["channel.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/channel",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD
index 66cc53ed4..abe725548 100644
--- a/pkg/tcpip/link/fdbased/BUILD
+++ b/pkg/tcpip/link/fdbased/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -13,7 +12,6 @@ go_library(
         "mmap_unsafe.go",
         "packet_dispatchers.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/fdbased",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/sync",
@@ -30,7 +28,7 @@ go_test(
     name = "fdbased_test",
     size = "small",
     srcs = ["endpoint_test.go"],
-    embed = [":fdbased"],
+    library = ":fdbased",
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD
index f35fcdff4..6bf3805b7 100644
--- a/pkg/tcpip/link/loopback/BUILD
+++ b/pkg/tcpip/link/loopback/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "loopback",
     srcs = ["loopback.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/loopback",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
diff --git a/pkg/tcpip/link/muxed/BUILD b/pkg/tcpip/link/muxed/BUILD
index 1ac7948b6..82b441b79 100644
--- a/pkg/tcpip/link/muxed/BUILD
+++ b/pkg/tcpip/link/muxed/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "muxed",
     srcs = ["injectable.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/muxed",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
@@ -19,7 +17,7 @@ go_test(
     name = "muxed_test",
     size = "small",
     srcs = ["injectable_test.go"],
-    embed = [":muxed"],
+    library = ":muxed",
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD
index d8211e93d..14b527bc2 100644
--- a/pkg/tcpip/link/rawfile/BUILD
+++ b/pkg/tcpip/link/rawfile/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -12,7 +12,6 @@ go_library(
         "errors.go",
         "rawfile_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/rawfile",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD
index 09165dd4c..13243ebbb 100644
--- a/pkg/tcpip/link/sharedmem/BUILD
+++ b/pkg/tcpip/link/sharedmem/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -11,7 +10,6 @@ go_library(
         "sharedmem_unsafe.go",
         "tx.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
@@ -30,7 +28,7 @@ go_test(
     srcs = [
         "sharedmem_test.go",
     ],
-    embed = [":sharedmem"],
+    library = ":sharedmem",
     deps = [
         "//pkg/sync",
         "//pkg/tcpip",
diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD
index a0d4ad0be..87020ec08 100644
--- a/pkg/tcpip/link/sharedmem/pipe/BUILD
+++ b/pkg/tcpip/link/sharedmem/pipe/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -11,7 +10,6 @@ go_library(
         "rx.go",
         "tx.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe",
     visibility = ["//visibility:public"],
 )
 
@@ -20,6 +18,6 @@ go_test(
     srcs = [
         "pipe_test.go",
     ],
-    embed = [":pipe"],
+    library = ":pipe",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/tcpip/link/sharedmem/queue/BUILD b/pkg/tcpip/link/sharedmem/queue/BUILD
index 8c9234d54..3ba06af73 100644
--- a/pkg/tcpip/link/sharedmem/queue/BUILD
+++ b/pkg/tcpip/link/sharedmem/queue/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "rx.go",
         "tx.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
@@ -22,7 +20,7 @@ go_test(
     srcs = [
         "queue_test.go",
     ],
-    embed = [":queue"],
+    library = ":queue",
     deps = [
         "//pkg/tcpip/link/sharedmem/pipe",
     ],
diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD
index d6ae0368a..230a8d53a 100644
--- a/pkg/tcpip/link/sniffer/BUILD
+++ b/pkg/tcpip/link/sniffer/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "pcap.go",
         "sniffer.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sniffer",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD
index a71a493fc..e5096ea38 100644
--- a/pkg/tcpip/link/tun/BUILD
+++ b/pkg/tcpip/link/tun/BUILD
@@ -1,10 +1,9 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "tun",
     srcs = ["tun_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/tun",
     visibility = ["//visibility:public"],
 )
diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD
index 134837943..0956d2c65 100644
--- a/pkg/tcpip/link/waitable/BUILD
+++ b/pkg/tcpip/link/waitable/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -8,7 +7,6 @@ go_library(
     srcs = [
         "waitable.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/waitable",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/gate",
@@ -23,7 +21,7 @@ go_test(
     srcs = [
         "waitable_test.go",
     ],
-    embed = [":waitable"],
+    library = ":waitable",
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD
index 9d16ff8c9..6a4839fb8 100644
--- a/pkg/tcpip/network/BUILD
+++ b/pkg/tcpip/network/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD
index e7617229b..eddf7b725 100644
--- a/pkg/tcpip/network/arp/BUILD
+++ b/pkg/tcpip/network/arp/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "arp",
     srcs = ["arp.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/network/arp",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD
index ed16076fd..d1c728ccf 100644
--- a/pkg/tcpip/network/fragmentation/BUILD
+++ b/pkg/tcpip/network/fragmentation/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -24,7 +23,6 @@ go_library(
         "reassembler.go",
         "reassembler_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/network/fragmentation",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
@@ -42,6 +40,6 @@ go_test(
         "fragmentation_test.go",
         "reassembler_test.go",
     ],
-    embed = [":fragmentation"],
+    library = ":fragmentation",
     deps = ["//pkg/tcpip/buffer"],
 )
diff --git a/pkg/tcpip/network/hash/BUILD b/pkg/tcpip/network/hash/BUILD
index e6db5c0b0..872165866 100644
--- a/pkg/tcpip/network/hash/BUILD
+++ b/pkg/tcpip/network/hash/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "hash",
     srcs = ["hash.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/network/hash",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/rand",
diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD
index 4e2aae9a3..0fef2b1f1 100644
--- a/pkg/tcpip/network/ipv4/BUILD
+++ b/pkg/tcpip/network/ipv4/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "icmp.go",
         "ipv4.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/network/ipv4",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD
index e4e273460..fb11874c6 100644
--- a/pkg/tcpip/network/ipv6/BUILD
+++ b/pkg/tcpip/network/ipv6/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "icmp.go",
         "ipv6.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/network/ipv6",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
@@ -27,7 +25,7 @@ go_test(
         "ipv6_test.go",
         "ndp_test.go",
     ],
-    embed = [":ipv6"],
+    library = ":ipv6",
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD
index a6ef3bdcc..2bad05a2e 100644
--- a/pkg/tcpip/ports/BUILD
+++ b/pkg/tcpip/ports/BUILD
@@ -1,12 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "ports",
     srcs = ["ports.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/ports",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/sync",
@@ -17,7 +15,7 @@ go_library(
 go_test(
     name = "ports_test",
     srcs = ["ports_test.go"],
-    embed = [":ports"],
+    library = ":ports",
     deps = [
         "//pkg/tcpip",
     ],
diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD
index d7496fde6..cf0a5fefe 100644
--- a/pkg/tcpip/sample/tun_tcp_connect/BUILD
+++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/sample/tun_tcp_echo/BUILD b/pkg/tcpip/sample/tun_tcp_echo/BUILD
index 875561566..43264b76d 100644
--- a/pkg/tcpip/sample/tun_tcp_echo/BUILD
+++ b/pkg/tcpip/sample/tun_tcp_echo/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/seqnum/BUILD b/pkg/tcpip/seqnum/BUILD
index b31ddba2f..45f503845 100644
--- a/pkg/tcpip/seqnum/BUILD
+++ b/pkg/tcpip/seqnum/BUILD
@@ -1,10 +1,9 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "seqnum",
     srcs = ["seqnum.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/seqnum",
     visibility = ["//visibility:public"],
 )
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 783351a69..f5b750046 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -30,7 +29,6 @@ go_library(
         "stack_global_state.go",
         "transport_demuxer.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/stack",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/ilist",
@@ -81,7 +79,7 @@ go_test(
     name = "stack_test",
     size = "small",
     srcs = ["linkaddrcache_test.go"],
-    embed = [":stack"],
+    library = ":stack",
     deps = [
         "//pkg/sleep",
         "//pkg/sync",
diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD
index 3aa23d529..ac18ec5b1 100644
--- a/pkg/tcpip/transport/icmp/BUILD
+++ b/pkg/tcpip/transport/icmp/BUILD
@@ -1,5 +1,5 @@
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -23,7 +23,6 @@ go_library(
         "icmp_packet_list.go",
         "protocol.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/icmp",
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD
index 4858d150c..d22de6b26 100644
--- a/pkg/tcpip/transport/packet/BUILD
+++ b/pkg/tcpip/transport/packet/BUILD
@@ -1,5 +1,5 @@
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -22,7 +22,6 @@ go_library(
         "endpoint_state.go",
         "packet_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/packet",
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD
index 2f2131ff7..c9baf4600 100644
--- a/pkg/tcpip/transport/raw/BUILD
+++ b/pkg/tcpip/transport/raw/BUILD
@@ -1,5 +1,5 @@
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -23,7 +23,6 @@ go_library(
         "protocol.go",
         "raw_packet_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/raw",
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 0e3ab05ad..4acd9fb9a 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -55,7 +54,6 @@ go_library(
         "tcp_segment_list.go",
         "timer.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/tcp",
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/tcpip/transport/tcp/testing/context/BUILD b/pkg/tcpip/transport/tcp/testing/context/BUILD
index b33ec2087..ce6a2c31d 100644
--- a/pkg/tcpip/transport/tcp/testing/context/BUILD
+++ b/pkg/tcpip/transport/tcp/testing/context/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "context",
     testonly = 1,
     srcs = ["context.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context",
     visibility = [
         "//visibility:public",
     ],
diff --git a/pkg/tcpip/transport/tcpconntrack/BUILD b/pkg/tcpip/transport/tcpconntrack/BUILD
index 43fcc27f0..3ad6994a7 100644
--- a/pkg/tcpip/transport/tcpconntrack/BUILD
+++ b/pkg/tcpip/transport/tcpconntrack/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "tcpconntrack",
     srcs = ["tcp_conntrack.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip/header",
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index 57ff123e3..adc908e24 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -25,7 +24,6 @@ go_library(
         "protocol.go",
         "udp_packet_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/udp",
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD
index 07778e4f7..2dcba84ae 100644
--- a/pkg/tmutex/BUILD
+++ b/pkg/tmutex/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "tmutex",
     srcs = ["tmutex.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tmutex",
     visibility = ["//:sandbox"],
 )
 
@@ -14,6 +12,6 @@ go_test(
     name = "tmutex_test",
     size = "medium",
     srcs = ["tmutex_test.go"],
-    embed = [":tmutex"],
+    library = ":tmutex",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD
index d1885ae66..a86501fa2 100644
--- a/pkg/unet/BUILD
+++ b/pkg/unet/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "unet.go",
         "unet_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/unet",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/gate",
@@ -23,6 +21,6 @@ go_test(
     srcs = [
         "unet_test.go",
     ],
-    embed = [":unet"],
+    library = ":unet",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/urpc/BUILD b/pkg/urpc/BUILD
index b8fdc3125..850c34ed0 100644
--- a/pkg/urpc/BUILD
+++ b/pkg/urpc/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "urpc",
     srcs = ["urpc.go"],
-    importpath = "gvisor.dev/gvisor/pkg/urpc",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/fd",
@@ -20,6 +18,6 @@ go_test(
     name = "urpc_test",
     size = "small",
     srcs = ["urpc_test.go"],
-    embed = [":urpc"],
+    library = ":urpc",
     deps = ["//pkg/unet"],
 )
diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD
index 1c6890e52..852480a09 100644
--- a/pkg/waiter/BUILD
+++ b/pkg/waiter/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -22,7 +21,6 @@ go_library(
         "waiter.go",
         "waiter_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/waiter",
     visibility = ["//visibility:public"],
     deps = ["//pkg/sync"],
 )
@@ -33,5 +31,5 @@ go_test(
     srcs = [
         "waiter_test.go",
     ],
-    embed = [":waiter"],
+    library = ":waiter",
 )
diff --git a/runsc/BUILD b/runsc/BUILD
index e5587421d..b35b41d81 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -1,7 +1,6 @@
-package(licenses = ["notice"])  # Apache 2.0
+load("//tools:defs.bzl", "go_binary", "pkg_deb", "pkg_tar")
 
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
-load("@rules_pkg//:pkg.bzl", "pkg_deb", "pkg_tar")
+package(licenses = ["notice"])
 
 go_binary(
     name = "runsc",
@@ -9,7 +8,7 @@ go_binary(
         "main.go",
         "version.go",
     ],
-    pure = "on",
+    pure = True,
     visibility = [
         "//visibility:public",
     ],
@@ -26,10 +25,12 @@ go_binary(
 )
 
 # The runsc-race target is a race-compatible BUILD target. This must be built
-# via "bazel build --features=race //runsc:runsc-race", since the race feature
-# must apply to all dependencies due a bug in gazelle file selection.  The pure
-# attribute must be off because the race detector requires linking with non-Go
-# components, although we still require a static binary.
+# via: bazel build --features=race //runsc:runsc-race
+#
+# This is neccessary because the race feature must apply to all dependencies
+# due a bug in gazelle file selection.  The pure attribute must be off because
+# the race detector requires linking with non-Go components, although we still
+# require a static binary.
 #
 # Note that in the future this might be convertible to a compatible target by
 # using the pure and static attributes within a select function, but select is
@@ -42,7 +43,7 @@ go_binary(
         "main.go",
         "version.go",
     ],
-    static = "on",
+    static = True,
     visibility = [
         "//visibility:public",
     ],
@@ -82,7 +83,12 @@ genrule(
     # because they are assumes to be hermetic).
     srcs = [":runsc"],
     outs = ["version.txt"],
-    cmd = "$(location :runsc) -version | grep 'runsc version' | sed 's/^[^0-9]*//' > $@",
+    # Note that the little dance here is necessary because files in the $(SRCS)
+    # attribute are not executable by default, and we can't touch in place.
+    cmd = "cp $(location :runsc) $(@D)/runsc && \
+        chmod a+x $(@D)/runsc && \
+        $(@D)/runsc -version | grep version | sed 's/^[^0-9]*//' > $@ && \
+        rm -f $(@D)/runsc",
     stamp = 1,
 )
 
@@ -109,5 +115,6 @@ sh_test(
     name = "version_test",
     size = "small",
     srcs = ["version_test.sh"],
+    args = ["$(location :runsc)"],
     data = [":runsc"],
 )
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 3e20f8f2f..f3ebc0231 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -23,7 +23,6 @@ go_library(
         "strace.go",
         "user.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/boot",
     visibility = [
         "//runsc:__subpackages__",
         "//test:__subpackages__",
@@ -107,7 +106,7 @@ go_test(
         "loader_test.go",
         "user_test.go",
     ],
-    embed = [":boot"],
+    library = ":boot",
     deps = [
         "//pkg/control/server",
         "//pkg/log",
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
index 3a9dcfc04..ce30f6c53 100644
--- a/runsc/boot/filter/BUILD
+++ b/runsc/boot/filter/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -13,7 +13,6 @@ go_library(
         "extra_filters_race.go",
         "filter.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/boot/filter",
     visibility = [
         "//runsc/boot:__subpackages__",
     ],
diff --git a/runsc/boot/platforms/BUILD b/runsc/boot/platforms/BUILD
index 03391cdca..77774f43c 100644
--- a/runsc/boot/platforms/BUILD
+++ b/runsc/boot/platforms/BUILD
@@ -1,11 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "platforms",
     srcs = ["platforms.go"],
-    importpath = "gvisor.dev/gvisor/runsc/boot/platforms",
     visibility = [
         "//runsc:__subpackages__",
     ],
diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD
index d6165f9e5..d4c7bdfbb 100644
--- a/runsc/cgroup/BUILD
+++ b/runsc/cgroup/BUILD
@@ -1,11 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "cgroup",
     srcs = ["cgroup.go"],
-    importpath = "gvisor.dev/gvisor/runsc/cgroup",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
@@ -19,6 +18,6 @@ go_test(
     name = "cgroup_test",
     size = "small",
     srcs = ["cgroup_test.go"],
-    embed = [":cgroup"],
+    library = ":cgroup",
     tags = ["local"],
 )
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index b94bc4fa0..09aa46434 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -34,7 +34,6 @@ go_library(
         "syscalls.go",
         "wait.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/cmd",
     visibility = [
         "//runsc:__subpackages__",
     ],
@@ -73,7 +72,7 @@ go_test(
     data = [
         "//runsc",
     ],
-    embed = [":cmd"],
+    library = ":cmd",
     deps = [
         "//pkg/abi/linux",
         "//pkg/log",
diff --git a/runsc/console/BUILD b/runsc/console/BUILD
index e623c1a0f..06924bccd 100644
--- a/runsc/console/BUILD
+++ b/runsc/console/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -7,7 +7,6 @@ go_library(
     srcs = [
         "console.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/console",
     visibility = [
         "//runsc:__subpackages__",
     ],
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 6dea179e4..e21431e4c 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +10,6 @@ go_library(
         "state_file.go",
         "status.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/container",
     visibility = [
         "//runsc:__subpackages__",
         "//test:__subpackages__",
@@ -42,7 +41,7 @@ go_test(
         "//runsc",
         "//runsc/container/test_app",
     ],
-    embed = [":container"],
+    library = ":container",
     shard_count = 5,
     tags = [
         "requires-kvm",
diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD
index bfd338bb6..e200bafd9 100644
--- a/runsc/container/test_app/BUILD
+++ b/runsc/container/test_app/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,7 @@ go_binary(
         "fds.go",
         "test_app.go",
     ],
-    pure = "on",
+    pure = True,
     visibility = ["//runsc/container:__pkg__"],
     deps = [
         "//pkg/unet",
diff --git a/runsc/criutil/BUILD b/runsc/criutil/BUILD
index 558133a0e..8a571a000 100644
--- a/runsc/criutil/BUILD
+++ b/runsc/criutil/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "criutil",
     testonly = 1,
     srcs = ["criutil.go"],
-    importpath = "gvisor.dev/gvisor/runsc/criutil",
     visibility = ["//:sandbox"],
     deps = ["//runsc/testutil"],
 )
diff --git a/runsc/dockerutil/BUILD b/runsc/dockerutil/BUILD
index 0e0423504..8621af901 100644
--- a/runsc/dockerutil/BUILD
+++ b/runsc/dockerutil/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "dockerutil",
     testonly = 1,
     srcs = ["dockerutil.go"],
-    importpath = "gvisor.dev/gvisor/runsc/dockerutil",
     visibility = ["//:sandbox"],
     deps = [
         "//runsc/testutil",
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index a9582d92b..64a406ae2 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,10 +10,7 @@ go_library(
         "fsgofer_arm64_unsafe.go",
         "fsgofer_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/fsgofer",
-    visibility = [
-        "//runsc:__subpackages__",
-    ],
+    visibility = ["//runsc:__subpackages__"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/fd",
@@ -30,7 +27,7 @@ go_test(
     name = "fsgofer_test",
     size = "small",
     srcs = ["fsgofer_test.go"],
-    embed = [":fsgofer"],
+    library = ":fsgofer",
     deps = [
         "//pkg/log",
         "//pkg/p9",
diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD
index bac73f89d..82b48ef32 100644
--- a/runsc/fsgofer/filter/BUILD
+++ b/runsc/fsgofer/filter/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -13,7 +13,6 @@ go_library(
         "extra_filters_race.go",
         "filter.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/fsgofer/filter",
     visibility = [
         "//runsc:__subpackages__",
     ],
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index ddbc37456..c95d50294 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,6 @@ go_library(
         "network_unsafe.go",
         "sandbox.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/sandbox",
     visibility = [
         "//runsc:__subpackages__",
     ],
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
index 205638803..4ccd77f63 100644
--- a/runsc/specutils/BUILD
+++ b/runsc/specutils/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +10,6 @@ go_library(
         "namespace.go",
         "specutils.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/specutils",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
@@ -28,6 +27,6 @@ go_test(
     name = "specutils_test",
     size = "small",
     srcs = ["specutils_test.go"],
-    embed = [":specutils"],
+    library = ":specutils",
     deps = ["@com_github_opencontainers_runtime-spec//specs-go:go_default_library"],
 )
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
index 3c3027cb5..f845120b0 100644
--- a/runsc/testutil/BUILD
+++ b/runsc/testutil/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "testutil",
     testonly = 1,
     srcs = ["testutil.go"],
-    importpath = "gvisor.dev/gvisor/runsc/testutil",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
diff --git a/runsc/version_test.sh b/runsc/version_test.sh
index cc0ca3f05..747350654 100755
--- a/runsc/version_test.sh
+++ b/runsc/version_test.sh
@@ -16,7 +16,7 @@
 
 set -euf -x -o pipefail
 
-readonly runsc="${TEST_SRCDIR}/__main__/runsc/linux_amd64_pure_stripped/runsc"
+readonly runsc="$1"
 readonly version=$($runsc --version)
 
 # Version should should not match VERSION, which is the default and which will
diff --git a/scripts/common.sh b/scripts/common.sh
index fdb1aa142..cd91b9f8e 100755
--- a/scripts/common.sh
+++ b/scripts/common.sh
@@ -16,11 +16,7 @@
 
 set -xeou pipefail
 
-if [[ -f $(dirname $0)/common_google.sh ]]; then
-  source $(dirname $0)/common_google.sh
-else
-  source $(dirname $0)/common_bazel.sh
-fi
+source $(dirname $0)/common_build.sh
 
 # Ensure it attempts to collect logs in all cases.
 trap collect_logs EXIT
diff --git a/scripts/common_bazel.sh b/scripts/common_bazel.sh
deleted file mode 100755
index a473a88a4..000000000
--- a/scripts/common_bazel.sh
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Install the latest version of Bazel and log the version.
-(which use_bazel.sh && use_bazel.sh latest) || which bazel
-bazel version
-
-# Switch into the workspace; only necessary if run with kokoro.
-if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then
-  cd git/repo
-elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then
-  cd github/repo
-fi
-
-# Set the standard bazel flags.
-declare -r BAZEL_FLAGS=(
-  "--show_timestamps"
-  "--test_output=errors"
-  "--keep_going"
-  "--verbose_failures=true"
-)
-if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then
-  declare -r BAZEL_RBE_AUTH_FLAGS=(
-    "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
-  )
-  declare -r BAZEL_RBE_FLAGS=("--config=remote")
-fi
-
-# Wrap bazel.
-function build() {
-  bazel build "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" 2>&1 |
-    tee /dev/fd/2 | grep -E '^  bazel-bin/' | awk '{ print $1; }'
-}
-
-function test() {
-  bazel test "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@"
-}
-
-function run() {
-  local binary=$1
-  shift
-  bazel run "${binary}" -- "$@"
-}
-
-function run_as_root() {
-  local binary=$1
-  shift
-  bazel run --run_under="sudo" "${binary}" -- "$@"
-}
-
-function collect_logs() {
-  # Zip out everything into a convenient form.
-  if [[ -v KOKORO_ARTIFACTS_DIR ]] && [[ -e bazel-testlogs ]]; then
-    # Merge results files of all shards for each test suite.
-    for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do
-      junitparser merge `find $d -name test.xml` $d/test.xml
-      cat $d/shard_*_of_*/test.log > $d/test.log
-      ls -l $d/shard_*_of_*/test.outputs/outputs.zip && zip -r -1 $d/outputs.zip $d/shard_*_of_*/test.outputs/outputs.zip
-    done
-    find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf
-    # Move test logs to Kokoro directory. tar is used to conveniently perform
-    # renames while moving files.
-    find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" |
-      tar --create --files-from - --transform 's/test\./sponge_log./' |
-      tar --extract --directory ${KOKORO_ARTIFACTS_DIR}
-
-    # Collect sentry logs, if any.
-    if [[ -v RUNSC_LOGS_DIR ]] && [[ -d "${RUNSC_LOGS_DIR}" ]]; then
-      # Check if the directory is empty or not (only the first line it needed).
-      local -r logs=$(ls "${RUNSC_LOGS_DIR}" | head -n1)
-      if [[ "${logs}" ]]; then
-        local -r archive=runsc_logs_"${RUNTIME}".tar.gz
-        if [[ -v KOKORO_BUILD_ARTIFACTS_SUBDIR ]]; then
-          echo "runsc logs will be uploaded to:"
-          echo "    gsutil cp gs://gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive} /tmp"
-          echo "    https://storage.cloud.google.com/gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive}"
-        fi
-        tar --create --gzip --file="${KOKORO_ARTIFACTS_DIR}/${archive}" -C "${RUNSC_LOGS_DIR}" .
-      fi
-    fi
-  fi
-}
-
-function find_branch_name() {
-  git branch --show-current || git rev-parse HEAD || bazel info workspace | xargs basename
-}
diff --git a/scripts/common_build.sh b/scripts/common_build.sh
new file mode 100755
index 000000000..a473a88a4
--- /dev/null
+++ b/scripts/common_build.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Install the latest version of Bazel and log the version.
+(which use_bazel.sh && use_bazel.sh latest) || which bazel
+bazel version
+
+# Switch into the workspace; only necessary if run with kokoro.
+if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then
+  cd git/repo
+elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then
+  cd github/repo
+fi
+
+# Set the standard bazel flags.
+declare -r BAZEL_FLAGS=(
+  "--show_timestamps"
+  "--test_output=errors"
+  "--keep_going"
+  "--verbose_failures=true"
+)
+if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then
+  declare -r BAZEL_RBE_AUTH_FLAGS=(
+    "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
+  )
+  declare -r BAZEL_RBE_FLAGS=("--config=remote")
+fi
+
+# Wrap bazel.
+function build() {
+  bazel build "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" 2>&1 |
+    tee /dev/fd/2 | grep -E '^  bazel-bin/' | awk '{ print $1; }'
+}
+
+function test() {
+  bazel test "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@"
+}
+
+function run() {
+  local binary=$1
+  shift
+  bazel run "${binary}" -- "$@"
+}
+
+function run_as_root() {
+  local binary=$1
+  shift
+  bazel run --run_under="sudo" "${binary}" -- "$@"
+}
+
+function collect_logs() {
+  # Zip out everything into a convenient form.
+  if [[ -v KOKORO_ARTIFACTS_DIR ]] && [[ -e bazel-testlogs ]]; then
+    # Merge results files of all shards for each test suite.
+    for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do
+      junitparser merge `find $d -name test.xml` $d/test.xml
+      cat $d/shard_*_of_*/test.log > $d/test.log
+      ls -l $d/shard_*_of_*/test.outputs/outputs.zip && zip -r -1 $d/outputs.zip $d/shard_*_of_*/test.outputs/outputs.zip
+    done
+    find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf
+    # Move test logs to Kokoro directory. tar is used to conveniently perform
+    # renames while moving files.
+    find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" |
+      tar --create --files-from - --transform 's/test\./sponge_log./' |
+      tar --extract --directory ${KOKORO_ARTIFACTS_DIR}
+
+    # Collect sentry logs, if any.
+    if [[ -v RUNSC_LOGS_DIR ]] && [[ -d "${RUNSC_LOGS_DIR}" ]]; then
+      # Check if the directory is empty or not (only the first line it needed).
+      local -r logs=$(ls "${RUNSC_LOGS_DIR}" | head -n1)
+      if [[ "${logs}" ]]; then
+        local -r archive=runsc_logs_"${RUNTIME}".tar.gz
+        if [[ -v KOKORO_BUILD_ARTIFACTS_SUBDIR ]]; then
+          echo "runsc logs will be uploaded to:"
+          echo "    gsutil cp gs://gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive} /tmp"
+          echo "    https://storage.cloud.google.com/gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive}"
+        fi
+        tar --create --gzip --file="${KOKORO_ARTIFACTS_DIR}/${archive}" -C "${RUNSC_LOGS_DIR}" .
+      fi
+    fi
+  fi
+}
+
+function find_branch_name() {
+  git branch --show-current || git rev-parse HEAD || bazel info workspace | xargs basename
+}
diff --git a/test/BUILD b/test/BUILD
index bf834d994..34b950644 100644
--- a/test/BUILD
+++ b/test/BUILD
@@ -1,44 +1 @@
-package(licenses = ["notice"])  # Apache 2.0
-
-# We need to define a bazel platform and toolchain to specify dockerPrivileged
-# and dockerRunAsRoot options, they are required to run tests on the RBE
-# cluster in Kokoro.
-alias(
-    name = "rbe_ubuntu1604",
-    actual = ":rbe_ubuntu1604_r346485",
-)
-
-platform(
-    name = "rbe_ubuntu1604_r346485",
-    constraint_values = [
-        "@bazel_tools//platforms:x86_64",
-        "@bazel_tools//platforms:linux",
-        "@bazel_tools//tools/cpp:clang",
-        "@bazel_toolchains//constraints:xenial",
-        "@bazel_toolchains//constraints/sanitizers:support_msan",
-    ],
-    remote_execution_properties = """
-        properties: {
-          name: "container-image"
-          value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50"
-        }
-        properties: {
-          name: "dockerAddCapabilities"
-          value: "SYS_ADMIN"
-        }
-        properties: {
-          name: "dockerPrivileged"
-          value: "true"
-        }
-    """,
-)
-
-toolchain(
-    name = "cc-toolchain-clang-x86_64-default",
-    exec_compatible_with = [
-    ],
-    target_compatible_with = [
-    ],
-    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8",
-    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
-)
+package(licenses = ["notice"])
diff --git a/test/e2e/BUILD b/test/e2e/BUILD
index 4fe03a220..76e04f878 100644
--- a/test/e2e/BUILD
+++ b/test/e2e/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +10,7 @@ go_test(
         "integration_test.go",
         "regression_test.go",
     ],
-    embed = [":integration"],
+    library = ":integration",
     tags = [
         # Requires docker and runsc to be configured before the test runs.
         "manual",
@@ -29,5 +29,4 @@ go_test(
 go_library(
     name = "integration",
     srcs = ["integration.go"],
-    importpath = "gvisor.dev/gvisor/test/integration",
 )
diff --git a/test/image/BUILD b/test/image/BUILD
index 09b0a0ad5..7392ac54e 100644
--- a/test/image/BUILD
+++ b/test/image/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -14,7 +14,7 @@ go_test(
         "ruby.rb",
         "ruby.sh",
     ],
-    embed = [":image"],
+    library = ":image",
     tags = [
         # Requires docker and runsc to be configured before the test runs.
         "manual",
@@ -30,5 +30,4 @@ go_test(
 go_library(
     name = "image",
     srcs = ["image.go"],
-    importpath = "gvisor.dev/gvisor/test/image",
 )
diff --git a/test/iptables/BUILD b/test/iptables/BUILD
index 22f470092..6bb3b82b5 100644
--- a/test/iptables/BUILD
+++ b/test/iptables/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,7 +12,6 @@ go_library(
         "iptables_util.go",
         "nat.go",
     ],
-    importpath = "gvisor.dev/gvisor/test/iptables",
     visibility = ["//test/iptables:__subpackages__"],
     deps = [
         "//runsc/testutil",
@@ -24,7 +23,7 @@ go_test(
     srcs = [
         "iptables_test.go",
     ],
-    embed = [":iptables"],
+    library = ":iptables",
     tags = [
         "local",
         "manual",
diff --git a/test/iptables/runner/BUILD b/test/iptables/runner/BUILD
index a5b6f082c..b9199387a 100644
--- a/test/iptables/runner/BUILD
+++ b/test/iptables/runner/BUILD
@@ -1,15 +1,21 @@
-load("@io_bazel_rules_docker//go:image.bzl", "go_image")
-load("@io_bazel_rules_docker//container:container.bzl", "container_image")
+load("//tools:defs.bzl", "container_image", "go_binary", "go_image")
 
 package(licenses = ["notice"])
 
+go_binary(
+    name = "runner",
+    testonly = 1,
+    srcs = ["main.go"],
+    deps = ["//test/iptables"],
+)
+
 container_image(
     name = "iptables-base",
     base = "@iptables-test//image",
 )
 
 go_image(
-    name = "runner",
+    name = "runner-image",
     testonly = 1,
     srcs = ["main.go"],
     base = ":iptables-base",
diff --git a/test/root/BUILD b/test/root/BUILD
index d5dd9bca2..23ce2a70f 100644
--- a/test/root/BUILD
+++ b/test/root/BUILD
@@ -1,11 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "root",
     srcs = ["root.go"],
-    importpath = "gvisor.dev/gvisor/test/root",
 )
 
 go_test(
@@ -21,7 +20,7 @@ go_test(
     data = [
         "//runsc",
     ],
-    embed = [":root"],
+    library = ":root",
     tags = [
         # Requires docker and runsc to be configured before the test runs.
         # Also test only runs as root.
diff --git a/test/root/testdata/BUILD b/test/root/testdata/BUILD
index 125633680..bca5f9cab 100644
--- a/test/root/testdata/BUILD
+++ b/test/root/testdata/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -12,7 +12,6 @@ go_library(
         "sandbox.go",
         "simple.go",
     ],
-    importpath = "gvisor.dev/gvisor/test/root/testdata",
     visibility = [
         "//visibility:public",
     ],
diff --git a/test/runtimes/BUILD b/test/runtimes/BUILD
index 367295206..2c472bf8d 100644
--- a/test/runtimes/BUILD
+++ b/test/runtimes/BUILD
@@ -1,6 +1,6 @@
 # These packages are used to run language runtime tests inside gVisor sandboxes.
 
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_test")
+load("//tools:defs.bzl", "go_binary", "go_test")
 load("//test/runtimes:build_defs.bzl", "runtime_test")
 
 package(licenses = ["notice"])
@@ -49,5 +49,5 @@ go_test(
     name = "blacklist_test",
     size = "small",
     srcs = ["blacklist_test.go"],
-    embed = [":runner"],
+    library = ":runner",
 )
diff --git a/test/runtimes/build_defs.bzl b/test/runtimes/build_defs.bzl
index 6f84ca852..92e275a76 100644
--- a/test/runtimes/build_defs.bzl
+++ b/test/runtimes/build_defs.bzl
@@ -1,6 +1,6 @@
 """Defines a rule for runtime test targets."""
 
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_test", "loopback")
 
 def runtime_test(
         name,
@@ -34,6 +34,7 @@ def runtime_test(
     ]
     data = [
         ":runner",
+        loopback,
     ]
     if blacklist_file:
         args += ["--blacklist_file", "test/runtimes/" + blacklist_file]
@@ -61,7 +62,7 @@ def blacklist_test(name, blacklist_file):
     """Test that a blacklist parses correctly."""
     go_test(
         name = name + "_blacklist_test",
-        embed = [":runner"],
+        library = ":runner",
         srcs = ["blacklist_test.go"],
         args = ["--blacklist_file", "test/runtimes/" + blacklist_file],
         data = [blacklist_file],
diff --git a/test/runtimes/images/proctor/BUILD b/test/runtimes/images/proctor/BUILD
index 09dc6c42f..85e004c45 100644
--- a/test/runtimes/images/proctor/BUILD
+++ b/test/runtimes/images/proctor/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_test")
+load("//tools:defs.bzl", "go_binary", "go_test")
 
 package(licenses = ["notice"])
 
@@ -19,7 +19,7 @@ go_test(
     name = "proctor_test",
     size = "small",
     srcs = ["proctor_test.go"],
-    embed = [":proctor"],
+    library = ":proctor",
     deps = [
         "//runsc/testutil",
     ],
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 90d52e73b..40e974314 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 load("//test/syscalls:build_defs.bzl", "syscall_test")
 
 package(licenses = ["notice"])
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index aaf77c65b..1df761dd0 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -1,5 +1,7 @@
 """Defines a rule for syscall test targets."""
 
+load("//tools:defs.bzl", "loopback")
+
 # syscall_test is a macro that will create targets to run the given test target
 # on the host (native) and runsc.
 def syscall_test(
@@ -135,6 +137,7 @@ def _syscall_test(
         name = name,
         data = [
             ":syscall_test_runner",
+            loopback,
             test,
         ],
         args = args,
@@ -148,6 +151,3 @@ def sh_test(**kwargs):
     native.sh_test(
         **kwargs
     )
-
-def select_for_linux(for_linux, for_others = []):
-    return for_linux
diff --git a/test/syscalls/gtest/BUILD b/test/syscalls/gtest/BUILD
index 9293f25cb..de4b2727c 100644
--- a/test/syscalls/gtest/BUILD
+++ b/test/syscalls/gtest/BUILD
@@ -1,12 +1,9 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "gtest",
     srcs = ["gtest.go"],
-    importpath = "gvisor.dev/gvisor/test/syscalls/gtest",
-    visibility = [
-        "//test:__subpackages__",
-    ],
+    visibility = ["//:sandbox"],
 )
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 4c7ec3f06..c2ef50c1d 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1,5 +1,4 @@
-load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
-load("//test/syscalls:build_defs.bzl", "select_for_linux")
+load("//tools:defs.bzl", "cc_binary", "cc_library", "default_net_util", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -126,13 +125,11 @@ cc_library(
     testonly = 1,
     srcs = [
         "socket_test_util.cc",
-    ] + select_for_linux(
-        [
-            "socket_test_util_impl.cc",
-        ],
-    ),
+        "socket_test_util_impl.cc",
+    ],
     hdrs = ["socket_test_util.h"],
-    deps = [
+    defines = select_system(),
+    deps = default_net_util() + [
         "@com_google_googletest//:gtest",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -143,8 +140,7 @@ cc_library(
         "//test/util:temp_path",
         "//test/util:test_util",
         "//test/util:thread_util",
-    ] + select_for_linux([
-    ]),
+    ],
 )
 
 cc_library(
@@ -1443,6 +1439,7 @@ cc_binary(
     srcs = ["arch_prctl.cc"],
     linkstatic = 1,
     deps = [
+        "//test/util:file_descriptor",
         "//test/util:test_main",
         "//test/util:test_util",
         "@com_google_googletest//:gtest",
@@ -3383,11 +3380,11 @@ cc_library(
     name = "udp_socket_test_cases",
     testonly = 1,
     srcs = [
-        "udp_socket_test_cases.cc",
-    ] + select_for_linux([
         "udp_socket_errqueue_test_case.cc",
-    ]),
+        "udp_socket_test_cases.cc",
+    ],
     hdrs = ["udp_socket_test_cases.h"],
+    defines = select_system(),
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
diff --git a/test/syscalls/linux/arch_prctl.cc b/test/syscalls/linux/arch_prctl.cc
index 81bf5a775..3a901faf5 100644
--- a/test/syscalls/linux/arch_prctl.cc
+++ b/test/syscalls/linux/arch_prctl.cc
@@ -14,8 +14,10 @@
 
 #include <asm/prctl.h>
 #include <sys/prctl.h>
+#include <sys/syscall.h>
 
 #include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
 #include "test/util/test_util.h"
 
 // glibc does not provide a prototype for arch_prctl() so declare it here.
diff --git a/test/syscalls/linux/rseq/BUILD b/test/syscalls/linux/rseq/BUILD
index 5cfe4e56f..ed488dbc2 100644
--- a/test/syscalls/linux/rseq/BUILD
+++ b/test/syscalls/linux/rseq/BUILD
@@ -1,8 +1,7 @@
 # This package contains a standalone rseq test binary. This binary must not
 # depend on libc, which might use rseq itself.
 
-load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", "cc_flags_supplier")
-load("@rules_cc//cc:defs.bzl", "cc_library")
+load("//tools:defs.bzl", "cc_flags_supplier", "cc_library", "cc_toolchain")
 
 package(licenses = ["notice"])
 
@@ -37,8 +36,8 @@ genrule(
         "$(location start.S)",
     ]),
     toolchains = [
+        cc_toolchain,
         ":no_pie_cc_flags",
-        "@bazel_tools//tools/cpp:current_cc_toolchain",
     ],
     visibility = ["//:sandbox"],
 )
diff --git a/test/syscalls/linux/udp_socket_errqueue_test_case.cc b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
index 147978f46..9a24e1df0 100644
--- a/test/syscalls/linux/udp_socket_errqueue_test_case.cc
+++ b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifndef __fuchsia__
+
 #include "test/syscalls/linux/udp_socket_test_cases.h"
 
 #include <arpa/inet.h>
@@ -52,3 +54,5 @@ TEST_P(UdpSocketTest, ErrorQueue) {
 
 }  // namespace testing
 }  // namespace gvisor
+
+#endif  // __fuchsia__
diff --git a/test/uds/BUILD b/test/uds/BUILD
index a3843e699..51e2c7ce8 100644
--- a/test/uds/BUILD
+++ b/test/uds/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -9,7 +9,6 @@ go_library(
     name = "uds",
     testonly = 1,
     srcs = ["uds.go"],
-    importpath = "gvisor.dev/gvisor/test/uds",
     deps = [
         "//pkg/log",
         "//pkg/unet",
diff --git a/test/util/BUILD b/test/util/BUILD
index cbc728159..3c732be62 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -1,5 +1,4 @@
-load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
-load("//test/syscalls:build_defs.bzl", "select_for_linux")
+load("//tools:defs.bzl", "cc_library", "cc_test", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -142,12 +141,13 @@ cc_library(
 cc_library(
     name = "save_util",
     testonly = 1,
-    srcs = ["save_util.cc"] +
-           select_for_linux(
-               ["save_util_linux.cc"],
-               ["save_util_other.cc"],
-           ),
+    srcs = [
+        "save_util.cc",
+        "save_util_linux.cc",
+        "save_util_other.cc",
+    ],
     hdrs = ["save_util.h"],
+    defines = select_system(),
 )
 
 cc_library(
@@ -234,13 +234,16 @@ cc_library(
     testonly = 1,
     srcs = [
         "test_util.cc",
-    ] + select_for_linux(
-        [
-            "test_util_impl.cc",
-            "test_util_runfiles.cc",
+        "test_util_impl.cc",
+        "test_util_runfiles.cc",
+    ],
+    hdrs = ["test_util.h"],
+    defines = select_system(
+        fuchsia = [
+            "__opensource__",
+            "__fuchsia__",
         ],
     ),
-    hdrs = ["test_util.h"],
     deps = [
         ":fs_util",
         ":logging",
diff --git a/test/util/save_util_linux.cc b/test/util/save_util_linux.cc
index cd56118c0..d0aea8e6a 100644
--- a/test/util/save_util_linux.cc
+++ b/test/util/save_util_linux.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifdef __linux__
+
 #include <errno.h>
 #include <sys/syscall.h>
 #include <unistd.h>
@@ -43,3 +45,5 @@ void MaybeSave() {
 
 }  // namespace testing
 }  // namespace gvisor
+
+#endif
diff --git a/test/util/save_util_other.cc b/test/util/save_util_other.cc
index 1aca663b7..931af2c29 100644
--- a/test/util/save_util_other.cc
+++ b/test/util/save_util_other.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifndef __linux__
+
 namespace gvisor {
 namespace testing {
 
@@ -21,3 +23,5 @@ void MaybeSave() {
 
 }  // namespace testing
 }  // namespace gvisor
+
+#endif
diff --git a/test/util/test_util_runfiles.cc b/test/util/test_util_runfiles.cc
index 7210094eb..694d21692 100644
--- a/test/util/test_util_runfiles.cc
+++ b/test/util/test_util_runfiles.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifndef __fuchsia__
+
 #include <iostream>
 #include <string>
 
@@ -44,3 +46,5 @@ std::string RunfilePath(std::string path) {
 
 }  // namespace testing
 }  // namespace gvisor
+
+#endif  // __fuchsia__
diff --git a/tools/BUILD b/tools/BUILD
new file mode 100644
index 000000000..e73a9c885
--- /dev/null
+++ b/tools/BUILD
@@ -0,0 +1,3 @@
+package(licenses = ["notice"])
+
+exports_files(["nogo.js"])
diff --git a/tools/build/BUILD b/tools/build/BUILD
new file mode 100644
index 000000000..0c0ce3f4d
--- /dev/null
+++ b/tools/build/BUILD
@@ -0,0 +1,10 @@
+package(licenses = ["notice"])
+
+# In bazel, no special support is required for loopback networking. This is
+# just a dummy data target that does not change the test environment.
+genrule(
+    name = "loopback",
+    outs = ["loopback.txt"],
+    cmd = "touch $@",
+    visibility = ["//visibility:public"],
+)
diff --git a/tools/build/defs.bzl b/tools/build/defs.bzl
new file mode 100644
index 000000000..d0556abd1
--- /dev/null
+++ b/tools/build/defs.bzl
@@ -0,0 +1,91 @@
+"""Bazel implementations of standard rules."""
+
+load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", _cc_flags_supplier = "cc_flags_supplier")
+load("@io_bazel_rules_go//go:def.bzl", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_test = "go_test", _go_tool_library = "go_tool_library")
+load("@io_bazel_rules_go//proto:def.bzl", _go_proto_library = "go_proto_library")
+load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test")
+load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar")
+load("@io_bazel_rules_docker//go:image.bzl", _go_image = "go_image")
+load("@io_bazel_rules_docker//container:container.bzl", _container_image = "container_image")
+load("@pydeps//:requirements.bzl", _py_requirement = "requirement")
+
+container_image = _container_image
+cc_binary = _cc_binary
+cc_library = _cc_library
+cc_flags_supplier = _cc_flags_supplier
+cc_proto_library = _cc_proto_library
+cc_test = _cc_test
+cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
+go_image = _go_image
+go_embed_data = _go_embed_data
+loopback = "//tools/build:loopback"
+proto_library = native.proto_library
+pkg_deb = _pkg_deb
+pkg_tar = _pkg_tar
+py_library = native.py_library
+py_binary = native.py_binary
+py_test = native.py_test
+
+def go_binary(name, static = False, pure = False, **kwargs):
+    if static:
+        kwargs["static"] = "on"
+    if pure:
+        kwargs["pure"] = "on"
+    _go_binary(
+        name = name,
+        **kwargs
+    )
+
+def go_library(name, **kwargs):
+    _go_library(
+        name = name,
+        importpath = "gvisor.dev/gvisor/" + native.package_name(),
+        **kwargs
+    )
+
+def go_tool_library(name, **kwargs):
+    _go_tool_library(
+        name = name,
+        importpath = "gvisor.dev/gvisor/" + native.package_name(),
+        **kwargs
+    )
+
+def go_proto_library(name, proto, **kwargs):
+    deps = kwargs.pop("deps", [])
+    _go_proto_library(
+        name = name,
+        importpath = "gvisor.dev/gvisor/" + native.package_name() + "/" + name,
+        proto = proto,
+        deps = [dep.replace("_proto", "_go_proto") for dep in deps],
+        **kwargs
+    )
+
+def go_test(name, **kwargs):
+    library = kwargs.pop("library", None)
+    if library:
+        kwargs["embed"] = [library]
+    _go_test(
+        name = name,
+        **kwargs
+    )
+
+def py_requirement(name, direct = False):
+    return _py_requirement(name)
+
+def select_arch(amd64 = "amd64", arm64 = "arm64", default = None, **kwargs):
+    values = {
+        "@bazel_tools//src/conditions:linux_x86_64": amd64,
+        "@bazel_tools//src/conditions:linux_aarch64": arm64,
+    }
+    if default:
+        values["//conditions:default"] = default
+    return select(values, **kwargs)
+
+def select_system(linux = ["__linux__"], **kwargs):
+    return linux  # Only Linux supported.
+
+def default_installer():
+    return None
+
+def default_net_util():
+    return []  # Nothing needed.
diff --git a/tools/checkunsafe/BUILD b/tools/checkunsafe/BUILD
index d85c56131..92ba8ab06 100644
--- a/tools/checkunsafe/BUILD
+++ b/tools/checkunsafe/BUILD
@@ -1,11 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_tool_library")
+load("//tools:defs.bzl", "go_tool_library")
 
 package(licenses = ["notice"])
 
 go_tool_library(
     name = "checkunsafe",
     srcs = ["check_unsafe.go"],
-    importpath = "checkunsafe",
     visibility = ["//visibility:public"],
     deps = [
         "@org_golang_x_tools//go/analysis:go_tool_library",
diff --git a/tools/defs.bzl b/tools/defs.bzl
new file mode 100644
index 000000000..819f12b0d
--- /dev/null
+++ b/tools/defs.bzl
@@ -0,0 +1,154 @@
+"""Wrappers for common build rules.
+
+These wrappers apply common BUILD configurations (e.g., proto_library
+automagically creating cc_ and go_ proto targets) and act as a single point of
+change for Google-internal and bazel-compatible rules.
+"""
+
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
+load("//tools/build:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+
+# Delegate directly.
+cc_binary = _cc_binary
+cc_library = _cc_library
+cc_test = _cc_test
+cc_toolchain = _cc_toolchain
+cc_flags_supplier = _cc_flags_supplier
+container_image = _container_image
+go_embed_data = _go_embed_data
+go_image = _go_image
+go_test = _go_test
+go_tool_library = _go_tool_library
+pkg_deb = _pkg_deb
+pkg_tar = _pkg_tar
+py_library = _py_library
+py_binary = _py_binary
+py_test = _py_test
+py_requirement = _py_requirement
+select_arch = _select_arch
+select_system = _select_system
+loopback = _loopback
+default_installer = _default_installer
+default_net_util = _default_net_util
+
+def go_binary(name, **kwargs):
+    """Wraps the standard go_binary.
+
+    Args:
+      name: the rule name.
+      **kwargs: standard go_binary arguments.
+    """
+    _go_binary(
+        name = name,
+        **kwargs
+    )
+
+def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs):
+    """Wraps the standard go_library and does stateification and marshalling.
+
+    The recommended way is to use this rule with mostly identical configuration as the native
+    go_library rule.
+
+    These definitions provide additional flags (stateify, marshal) that can be used
+    with the generators to automatically supplement the library code.
+
+    load("//tools:defs.bzl", "go_library")
+
+    go_library(
+        name = "foo",
+        srcs = ["foo.go"],
+    )
+
+    Args:
+      name: the rule name.
+      srcs: the library sources.
+      deps: the library dependencies.
+      imports: imports required for stateify.
+      stateify: whether statify is enabled (default: true).
+      marshal: whether marshal is enabled (default: false).
+      **kwargs: standard go_library arguments.
+    """
+    if stateify:
+        # Only do stateification for non-state packages without manual autogen.
+        go_stateify(
+            name = name + "_state_autogen",
+            srcs = [src for src in srcs if src.endswith(".go")],
+            imports = imports,
+            package = name,
+            arch = select_arch(),
+            out = name + "_state_autogen.go",
+        )
+        all_srcs = srcs + [name + "_state_autogen.go"]
+        if "//pkg/state" not in deps:
+            all_deps = deps + ["//pkg/state"]
+        else:
+            all_deps = deps
+    else:
+        all_deps = deps
+        all_srcs = srcs
+    if marshal:
+        go_marshal(
+            name = name + "_abi_autogen",
+            srcs = [src for src in srcs if src.endswith(".go")],
+            debug = False,
+            imports = imports,
+            package = name,
+        )
+        extra_deps = [
+            dep
+            for dep in marshal_deps
+            if not dep in all_deps
+        ]
+        all_deps = all_deps + extra_deps
+        all_srcs = srcs + [name + "_abi_autogen_unsafe.go"]
+
+    _go_library(
+        name = name,
+        srcs = all_srcs,
+        deps = all_deps,
+        **kwargs
+    )
+
+    if marshal:
+        # Ignore importpath for go_test.
+        kwargs.pop("importpath", None)
+
+        _go_test(
+            name = name + "_abi_autogen_test",
+            srcs = [name + "_abi_autogen_test.go"],
+            library = ":" + name,
+            deps = marshal_test_deps,
+            **kwargs
+        )
+
+def proto_library(name, srcs, **kwargs):
+    """Wraps the standard proto_library.
+
+    Given a proto_library named "foo", this produces three different targets:
+    - foo_proto: proto_library rule.
+    - foo_go_proto: go_proto_library rule.
+    - foo_cc_proto: cc_proto_library rule.
+
+    Args:
+      srcs: the proto sources.
+      **kwargs: standard proto_library arguments.
+    """
+    deps = kwargs.pop("deps", [])
+    _proto_library(
+        name = name + "_proto",
+        srcs = srcs,
+        deps = deps,
+        **kwargs
+    )
+    _go_proto_library(
+        name = name + "_go_proto",
+        proto = ":" + name + "_proto",
+        deps = deps,
+        **kwargs
+    )
+    _cc_proto_library(
+        name = name + "_cc_proto",
+        deps = [":" + name + "_proto"],
+        **kwargs
+    )
diff --git a/tools/go_generics/BUILD b/tools/go_generics/BUILD
index 39318b877..069df3856 100644
--- a/tools/go_generics/BUILD
+++ b/tools/go_generics/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/tools/go_generics/globals/BUILD b/tools/go_generics/globals/BUILD
index 74853c7d2..38caa3ce7 100644
--- a/tools/go_generics/globals/BUILD
+++ b/tools/go_generics/globals/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,6 +8,6 @@ go_library(
         "globals_visitor.go",
         "scope.go",
     ],
-    importpath = "gvisor.dev/gvisor/tools/go_generics/globals",
+    stateify = False,
     visibility = ["//tools/go_generics:__pkg__"],
 )
diff --git a/tools/go_generics/go_merge/BUILD b/tools/go_generics/go_merge/BUILD
index 02b09120e..b7d35e272 100644
--- a/tools/go_generics/go_merge/BUILD
+++ b/tools/go_generics/go_merge/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/tools/go_generics/rules_tests/BUILD b/tools/go_generics/rules_tests/BUILD
index 9d26a88b7..8a329dfc6 100644
--- a/tools/go_generics/rules_tests/BUILD
+++ b/tools/go_generics/rules_tests/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
diff --git a/tools/go_marshal/BUILD b/tools/go_marshal/BUILD
index c862b277c..80d9c0504 100644
--- a/tools/go_marshal/BUILD
+++ b/tools/go_marshal/BUILD
@@ -1,6 +1,6 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_binary(
     name = "go_marshal",
diff --git a/tools/go_marshal/README.md b/tools/go_marshal/README.md
index 481575bd3..4886efddf 100644
--- a/tools/go_marshal/README.md
+++ b/tools/go_marshal/README.md
@@ -20,19 +20,7 @@ comment `// +marshal`.
 
 # Usage
 
-See `defs.bzl`: two new rules are provided, `go_marshal` and `go_library`.
-
-The recommended way to generate a go library with marshalling is to use the
-`go_library` with mostly identical configuration as the native go_library rule.
-
-```
-load("<PKGPATH>/gvisor/tools/go_marshal:defs.bzl", "go_library")
-
-go_library(
-    name = "foo",
-    srcs = ["foo.go"],
-)
-```
+See `defs.bzl`: a new rule is provided, `go_marshal`.
 
 Under the hood, the `go_marshal` rule is used to generate a file that will
 appear in a Go target; the output file should appear explicitly in a srcs list.
@@ -54,11 +42,7 @@ go_library(
         "foo.go",
         "foo_abi.go",
     ],
-    deps = [
-        "<PKGPATH>/gvisor/pkg/abi",
-        "<PKGPATH>/gvisor/pkg/sentry/safemem/safemem",
-        "<PKGPATH>/gvisor/pkg/sentry/usermem/usermem",
-    ],
+    ...
 )
 ```
 
@@ -69,22 +53,6 @@ These tests use reflection to verify properties of the ABI struct, and should be
 considered part of the generated interfaces (but are too expensive to execute at
 runtime). Ensure these tests run at some point.
 
-```
-$ cat BUILD
-load("<PKGPATH>/gvisor/tools/go_marshal:defs.bzl", "go_library")
-
-go_library(
-    name = "foo",
-    srcs = ["foo.go"],
-)
-$ blaze build :foo
-$ blaze query ...
-<path-to-dir>:foo_abi_autogen
-<path-to-dir>:foo_abi_autogen_test
-$ blaze test :foo_abi_autogen_test
-<test-output>
-```
-
 # Restrictions
 
 Not all valid go type definitions can be used with `go_marshal`. `go_marshal` is
@@ -131,22 +99,6 @@ for embedded structs that are not aligned.
 Because of this, it's generally best to avoid using `marshal:"unaligned"` and
 insert explicit padding fields instead.
 
-## Debugging go_marshal
-
-To enable debugging output from the go marshal tool, pass the `-debug` flag to
-the tool. When using the build rules from above, add a `debug = True` field to
-the build rule like this:
-
-```
-load("<PKGPATH>/gvisor/tools/go_marshal:defs.bzl", "go_library")
-
-go_library(
-    name = "foo",
-    srcs = ["foo.go"],
-    debug = True,
-)
-```
-
 ## Modifying the `go_marshal` Tool
 
 The following are some guidelines for modifying the `go_marshal` tool:
diff --git a/tools/go_marshal/analysis/BUILD b/tools/go_marshal/analysis/BUILD
index c859ced77..c2a4d45c4 100644
--- a/tools/go_marshal/analysis/BUILD
+++ b/tools/go_marshal/analysis/BUILD
@@ -1,12 +1,11 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "analysis",
     testonly = 1,
     srcs = ["analysis_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/tools/go_marshal/analysis",
     visibility = [
         "//:sandbox",
     ],
diff --git a/tools/go_marshal/defs.bzl b/tools/go_marshal/defs.bzl
index c32eb559f..2918ceffe 100644
--- a/tools/go_marshal/defs.bzl
+++ b/tools/go_marshal/defs.bzl
@@ -1,57 +1,14 @@
-"""Marshal is a tool for generating marshalling interfaces for Go types.
-
-The recommended way is to use the go_library rule defined below with mostly
-identical configuration as the native go_library rule.
-
-load("//tools/go_marshal:defs.bzl", "go_library")
-
-go_library(
-    name = "foo",
-    srcs = ["foo.go"],
-)
-
-Under the hood, the go_marshal rule is used to generate a file that will
-appear in a Go target; the output file should appear explicitly in a srcs list.
-For example (the above is still the preferred way):
-
-load("//tools/go_marshal:defs.bzl", "go_marshal")
-
-go_marshal(
-    name = "foo_abi",
-    srcs = ["foo.go"],
-    out = "foo_abi.go",
-    package = "foo",
-)
-
-go_library(
-    name = "foo",
-    srcs = [
-        "foo.go",
-        "foo_abi.go",
-    ],
-    deps = [
-       "//tools/go_marshal:marshal",
-       "//pkg/sentry/platform/safecopy",
-       "//pkg/sentry/usermem",
-    ],
-)
-"""
-
-load("@io_bazel_rules_go//go:def.bzl", _go_library = "go_library", _go_test = "go_test")
+"""Marshal is a tool for generating marshalling interfaces for Go types."""
 
 def _go_marshal_impl(ctx):
     """Execute the go_marshal tool."""
     output = ctx.outputs.lib
     output_test = ctx.outputs.test
-    (build_dir, _, _) = ctx.build_file_path.rpartition("/BUILD")
-
-    decl = "/".join(["gvisor.dev/gvisor", build_dir])
 
     # Run the marshal command.
     args = ["-output=%s" % output.path]
     args += ["-pkg=%s" % ctx.attr.package]
     args += ["-output_test=%s" % output_test.path]
-    args += ["-declarationPkg=%s" % decl]
 
     if ctx.attr.debug:
         args += ["-debug"]
@@ -83,7 +40,6 @@ go_marshal = rule(
     implementation = _go_marshal_impl,
     attrs = {
         "srcs": attr.label_list(mandatory = True, allow_files = True),
-        "libname": attr.string(mandatory = True),
         "imports": attr.string_list(mandatory = False),
         "package": attr.string(mandatory = True),
         "debug": attr.bool(doc = "enable debugging output from the go_marshal tool"),
@@ -95,58 +51,14 @@ go_marshal = rule(
     },
 )
 
-def go_library(name, srcs, deps = [], imports = [], debug = False, **kwargs):
-    """wraps the standard go_library and does mashalling interface generation.
-
-    Args:
-      name: Same as native go_library.
-      srcs: Same as native go_library.
-      deps: Same as native go_library.
-      imports: Extra import paths to pass to the go_marshal tool.
-      debug: Enables debugging output from the go_marshal tool.
-      **kwargs: Remaining args to pass to the native go_library rule unmodified.
-    """
-    go_marshal(
-        name = name + "_abi_autogen",
-        libname = name,
-        srcs = [src for src in srcs if src.endswith(".go")],
-        debug = debug,
-        imports = imports,
-        package = name,
-    )
-
-    extra_deps = [
-        "//tools/go_marshal/marshal",
-        "//pkg/sentry/platform/safecopy",
-        "//pkg/sentry/usermem",
-    ]
-
-    all_srcs = srcs + [name + "_abi_autogen_unsafe.go"]
-    all_deps = deps + []  #  + extra_deps
-
-    for extra in extra_deps:
-        if extra not in deps:
-            all_deps.append(extra)
-
-    _go_library(
-        name = name,
-        srcs = all_srcs,
-        deps = all_deps,
-        **kwargs
-    )
-
-    # Don't pass importpath arg to go_test.
-    kwargs.pop("importpath", "")
-
-    _go_test(
-        name = name + "_abi_autogen_test",
-        srcs = [name + "_abi_autogen_test.go"],
-        # Generated test has a fixed set of dependencies since we generate these
-        # tests. They should only depend on the library generated above, and the
-        # Marshallable interface.
-        deps = [
-            ":" + name,
-            "//tools/go_marshal/analysis",
-        ],
-        **kwargs
-    )
+# marshal_deps are the dependencies requied by generated code.
+marshal_deps = [
+    "//tools/go_marshal/marshal",
+    "//pkg/sentry/platform/safecopy",
+    "//pkg/sentry/usermem",
+]
+
+# marshal_test_deps are required by test targets.
+marshal_test_deps = [
+    "//tools/go_marshal/analysis",
+]
diff --git a/tools/go_marshal/gomarshal/BUILD b/tools/go_marshal/gomarshal/BUILD
index a0eae6492..c92b59dd6 100644
--- a/tools/go_marshal/gomarshal/BUILD
+++ b/tools/go_marshal/gomarshal/BUILD
@@ -1,6 +1,6 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "gomarshal",
@@ -10,7 +10,7 @@ go_library(
         "generator_tests.go",
         "util.go",
     ],
-    importpath = "gvisor.dev/gvisor/tools/go_marshal/gomarshal",
+    stateify = False,
     visibility = [
         "//:sandbox",
     ],
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 641ccd938..8392f3f6d 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -62,15 +62,12 @@ type Generator struct {
 	outputTest *os.File
 	// Package name for the generated file.
 	pkg string
-	// Go import path for package we're processing. This package should directly
-	// declare the type we're generating code for.
-	declaration string
 	// Set of extra packages to import in the generated file.
 	imports *importTable
 }
 
 // NewGenerator creates a new code Generator.
-func NewGenerator(srcs []string, out, outTest, pkg, declaration string, imports []string) (*Generator, error) {
+func NewGenerator(srcs []string, out, outTest, pkg string, imports []string) (*Generator, error) {
 	f, err := os.OpenFile(out, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
 	if err != nil {
 		return nil, fmt.Errorf("Couldn't open output file %q: %v", out, err)
@@ -80,12 +77,11 @@ func NewGenerator(srcs []string, out, outTest, pkg, declaration string, imports
 		return nil, fmt.Errorf("Couldn't open test output file %q: %v", out, err)
 	}
 	g := Generator{
-		inputs:      srcs,
-		output:      f,
-		outputTest:  fTest,
-		pkg:         pkg,
-		declaration: declaration,
-		imports:     newImportTable(),
+		inputs:     srcs,
+		output:     f,
+		outputTest: fTest,
+		pkg:        pkg,
+		imports:    newImportTable(),
 	}
 	for _, i := range imports {
 		// All imports on the extra imports list are unconditionally marked as
@@ -264,7 +260,7 @@ func (g *Generator) generateOne(t *ast.TypeSpec, fset *token.FileSet) *interface
 // generateOneTestSuite generates a test suite for the automatically generated
 // implementations type t.
 func (g *Generator) generateOneTestSuite(t *ast.TypeSpec) *testGenerator {
-	i := newTestGenerator(t, g.declaration)
+	i := newTestGenerator(t)
 	i.emitTests()
 	return i
 }
@@ -359,7 +355,7 @@ func (g *Generator) Run() error {
 // source file.
 func (g *Generator) writeTests(ts []*testGenerator) error {
 	var b sourceBuffer
-	b.emit("package %s_test\n\n", g.pkg)
+	b.emit("package %s\n\n", g.pkg)
 	if err := b.write(g.outputTest); err != nil {
 		return err
 	}
diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go
index df25cb5b2..bcda17c3b 100644
--- a/tools/go_marshal/gomarshal/generator_tests.go
+++ b/tools/go_marshal/gomarshal/generator_tests.go
@@ -46,7 +46,7 @@ type testGenerator struct {
 	decl *importStmt
 }
 
-func newTestGenerator(t *ast.TypeSpec, declaration string) *testGenerator {
+func newTestGenerator(t *ast.TypeSpec) *testGenerator {
 	if _, ok := t.Type.(*ast.StructType); !ok {
 		panic(fmt.Sprintf("Attempting to generate code for a not struct type %v", t))
 	}
@@ -59,14 +59,12 @@ func newTestGenerator(t *ast.TypeSpec, declaration string) *testGenerator {
 	for _, i := range standardImports {
 		g.imports.add(i).markUsed()
 	}
-	g.decl = g.imports.add(declaration)
-	g.decl.markUsed()
 
 	return g
 }
 
 func (g *testGenerator) typeName() string {
-	return fmt.Sprintf("%s.%s", g.decl.name, g.t.Name.Name)
+	return g.t.Name.Name
 }
 
 func (g *testGenerator) forEachField(fn func(f *ast.Field)) {
diff --git a/tools/go_marshal/main.go b/tools/go_marshal/main.go
index 3d12eb93c..e1a97b311 100644
--- a/tools/go_marshal/main.go
+++ b/tools/go_marshal/main.go
@@ -31,11 +31,10 @@ import (
 )
 
 var (
-	pkg            = flag.String("pkg", "", "output package")
-	output         = flag.String("output", "", "output file")
-	outputTest     = flag.String("output_test", "", "output file for tests")
-	imports        = flag.String("imports", "", "comma-separated list of extra packages to import in generated code")
-	declarationPkg = flag.String("declarationPkg", "", "import path of target declaring the types we're generating on")
+	pkg        = flag.String("pkg", "", "output package")
+	output     = flag.String("output", "", "output file")
+	outputTest = flag.String("output_test", "", "output file for tests")
+	imports    = flag.String("imports", "", "comma-separated list of extra packages to import in generated code")
 )
 
 func main() {
@@ -62,7 +61,7 @@ func main() {
 		// as an import.
 		extraImports = strings.Split(*imports, ",")
 	}
-	g, err := gomarshal.NewGenerator(flag.Args(), *output, *outputTest, *pkg, *declarationPkg, extraImports)
+	g, err := gomarshal.NewGenerator(flag.Args(), *output, *outputTest, *pkg, extraImports)
 	if err != nil {
 		panic(err)
 	}
diff --git a/tools/go_marshal/marshal/BUILD b/tools/go_marshal/marshal/BUILD
index 47dda97a1..ad508c72f 100644
--- a/tools/go_marshal/marshal/BUILD
+++ b/tools/go_marshal/marshal/BUILD
@@ -1,13 +1,12 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "marshal",
     srcs = [
         "marshal.go",
     ],
-    importpath = "gvisor.dev/gvisor/tools/go_marshal/marshal",
     visibility = [
         "//:sandbox",
     ],
diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD
index d412e1ccf..38ba49fed 100644
--- a/tools/go_marshal/test/BUILD
+++ b/tools/go_marshal/test/BUILD
@@ -1,7 +1,6 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_marshal:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 package_group(
     name = "gomarshal_test",
@@ -25,6 +24,6 @@ go_library(
     name = "test",
     testonly = 1,
     srcs = ["test.go"],
-    importpath = "gvisor.dev/gvisor/tools/go_marshal/test",
+    marshal = True,
     deps = ["//tools/go_marshal/test/external"],
 )
diff --git a/tools/go_marshal/test/external/BUILD b/tools/go_marshal/test/external/BUILD
index 9bb89e1da..0cf6da603 100644
--- a/tools/go_marshal/test/external/BUILD
+++ b/tools/go_marshal/test/external/BUILD
@@ -1,11 +1,11 @@
-load("//tools/go_marshal:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "external",
     testonly = 1,
     srcs = ["external.go"],
-    importpath = "gvisor.dev/gvisor/tools/go_marshal/test/external",
+    marshal = True,
     visibility = ["//tools/go_marshal/test:gomarshal_test"],
 )
diff --git a/tools/go_stateify/BUILD b/tools/go_stateify/BUILD
index bb53f8ae9..a133d6f8b 100644
--- a/tools/go_stateify/BUILD
+++ b/tools/go_stateify/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/tools/go_stateify/defs.bzl b/tools/go_stateify/defs.bzl
index 33267c074..0f261d89f 100644
--- a/tools/go_stateify/defs.bzl
+++ b/tools/go_stateify/defs.bzl
@@ -1,41 +1,4 @@
-"""Stateify is a tool for generating state wrappers for Go types.
-
-The recommended way is to use the go_library rule defined below with mostly
-identical configuration as the native go_library rule.
-
-load("//tools/go_stateify:defs.bzl", "go_library")
-
-go_library(
-    name = "foo",
-    srcs = ["foo.go"],
-)
-
-Under the hood, the go_stateify rule is used to generate a file that will
-appear in a Go target; the output file should appear explicitly in a srcs list.
-For example (the above is still the preferred way):
-
-load("//tools/go_stateify:defs.bzl", "go_stateify")
-
-go_stateify(
-    name = "foo_state",
-    srcs = ["foo.go"],
-    out = "foo_state.go",
-    package = "foo",
-)
-
-go_library(
-    name = "foo",
-    srcs = [
-        "foo.go",
-        "foo_state.go",
-    ],
-    deps = [
-        "//pkg/state",
-    ],
-)
-"""
-
-load("@io_bazel_rules_go//go:def.bzl", _go_library = "go_library")
+"""Stateify is a tool for generating state wrappers for Go types."""
 
 def _go_stateify_impl(ctx):
     """Implementation for the stateify tool."""
@@ -103,43 +66,3 @@ files and must be added to the srcs of the relevant go_library.
         "_statepkg": attr.string(default = "gvisor.dev/gvisor/pkg/state"),
     },
 )
-
-def go_library(name, srcs, deps = [], imports = [], **kwargs):
-    """Standard go_library wrapped which generates state source files.
-
-    Args:
-      name: the name of the go_library rule.
-      srcs: sources of the go_library. Each will be processed for stateify
-            annotations.
-      deps: dependencies for the go_library.
-      imports: an optional list of extra non-aliased, Go-style absolute import
-               paths required for stateified types.
-      **kwargs: passed to go_library.
-    """
-    if "encode_unsafe.go" not in srcs and (name + "_state_autogen.go") not in srcs:
-        # Only do stateification for non-state packages without manual autogen.
-        go_stateify(
-            name = name + "_state_autogen",
-            srcs = [src for src in srcs if src.endswith(".go")],
-            imports = imports,
-            package = name,
-            arch = select({
-                "@bazel_tools//src/conditions:linux_aarch64": "arm64",
-                "//conditions:default": "amd64",
-            }),
-            out = name + "_state_autogen.go",
-        )
-        all_srcs = srcs + [name + "_state_autogen.go"]
-        if "//pkg/state" not in deps:
-            all_deps = deps + ["//pkg/state"]
-        else:
-            all_deps = deps
-    else:
-        all_deps = deps
-        all_srcs = srcs
-    _go_library(
-        name = name,
-        srcs = all_srcs,
-        deps = all_deps,
-        **kwargs
-    )
diff --git a/tools/images/BUILD b/tools/images/BUILD
index 2b77c2737..f1699b184 100644
--- a/tools/images/BUILD
+++ b/tools/images/BUILD
@@ -1,4 +1,4 @@
-load("@rules_cc//cc:defs.bzl", "cc_binary")
+load("//tools:defs.bzl", "cc_binary")
 load("//tools/images:defs.bzl", "vm_image", "vm_test")
 
 package(
diff --git a/tools/images/defs.bzl b/tools/images/defs.bzl
index d8e422a5d..32235813a 100644
--- a/tools/images/defs.bzl
+++ b/tools/images/defs.bzl
@@ -28,6 +28,8 @@ The vm_test rule can be used to execute a command remotely. For example,
   )
 """
 
+load("//tools:defs.bzl", "default_installer")
+
 def _vm_image_impl(ctx):
     script_paths = []
     for script in ctx.files.scripts:
@@ -165,8 +167,8 @@ def vm_test(
     targets = kwargs.pop("targets", [])
     if installer:
         targets = [installer] + targets
-    targets = [
-    ] + targets
+    if default_installer():
+        targets = [default_installer()] + targets
     _vm_test(
         tags = [
             "local",
diff --git a/tools/issue_reviver/BUILD b/tools/issue_reviver/BUILD
index ee7ea11fd..4ef1a3124 100644
--- a/tools/issue_reviver/BUILD
+++ b/tools/issue_reviver/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/tools/issue_reviver/github/BUILD b/tools/issue_reviver/github/BUILD
index 6da22ba1c..da4133472 100644
--- a/tools/issue_reviver/github/BUILD
+++ b/tools/issue_reviver/github/BUILD
@@ -1,11 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "github",
     srcs = ["github.go"],
-    importpath = "gvisor.dev/gvisor/tools/issue_reviver/github",
     visibility = [
         "//tools/issue_reviver:__subpackages__",
     ],
diff --git a/tools/issue_reviver/reviver/BUILD b/tools/issue_reviver/reviver/BUILD
index 2c3675977..d262932bd 100644
--- a/tools/issue_reviver/reviver/BUILD
+++ b/tools/issue_reviver/reviver/BUILD
@@ -1,11 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "reviver",
     srcs = ["reviver.go"],
-    importpath = "gvisor.dev/gvisor/tools/issue_reviver/reviver",
     visibility = [
         "//tools/issue_reviver:__subpackages__",
     ],
@@ -15,5 +14,5 @@ go_test(
     name = "reviver_test",
     size = "small",
     srcs = ["reviver_test.go"],
-    embed = [":reviver"],
+    library = ":reviver",
 )
diff --git a/tools/workspace_status.sh b/tools/workspace_status.sh
index fb09ff331..a22c8c9f2 100755
--- a/tools/workspace_status.sh
+++ b/tools/workspace_status.sh
@@ -15,4 +15,4 @@
 # limitations under the License.
 
 # The STABLE_ prefix will trigger a re-link if it changes.
-echo STABLE_VERSION $(git describe --always --tags --abbrev=12 --dirty)
+echo STABLE_VERSION $(git describe --always --tags --abbrev=12 --dirty || echo 0.0.0)
diff --git a/vdso/BUILD b/vdso/BUILD
index 2b6744c26..d37d4266d 100644
--- a/vdso/BUILD
+++ b/vdso/BUILD
@@ -3,20 +3,10 @@
 #   normal system VDSO (time, gettimeofday, clock_gettimeofday) but which uses
 #   timekeeping parameters managed by the sandbox kernel.
 
-load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", "cc_flags_supplier")
+load("//tools:defs.bzl", "cc_flags_supplier", "cc_toolchain", "select_arch")
 
 package(licenses = ["notice"])
 
-config_setting(
-    name = "x86_64",
-    constraint_values = ["@bazel_tools//platforms:x86_64"],
-)
-
-config_setting(
-    name = "aarch64",
-    constraint_values = ["@bazel_tools//platforms:aarch64"],
-)
-
 genrule(
     name = "vdso",
     srcs = [
@@ -39,14 +29,15 @@ genrule(
           "-O2 " +
           "-std=c++11 " +
           "-fPIC " +
+          "-fno-sanitize=all " +
           # Some toolchains enable stack protector by default. Disable it, the
           # VDSO has no hooks to handle failures.
           "-fno-stack-protector " +
           "-fuse-ld=gold " +
-          select({
-              ":x86_64": "-m64 ",
-              "//conditions:default": "",
-          }) +
+          select_arch(
+              amd64 = "-m64 ",
+              arm64 = "",
+          ) +
           "-shared " +
           "-nostdlib " +
           "-Wl,-soname=linux-vdso.so.1 " +
@@ -55,12 +46,10 @@ genrule(
           "-Wl,-Bsymbolic " +
           "-Wl,-z,max-page-size=4096 " +
           "-Wl,-z,common-page-size=4096 " +
-          select(
-              {
-                  ":x86_64": "-Wl,-T$(location vdso_amd64.lds) ",
-                  ":aarch64": "-Wl,-T$(location vdso_arm64.lds) ",
-              },
-              no_match_error = "Unsupported architecture",
+          select_arch(
+              amd64 = "-Wl,-T$(location vdso_amd64.lds) ",
+              arm64 = "-Wl,-T$(location vdso_arm64.lds) ",
+              no_match_error = "unsupported architecture",
           ) +
           "-o $(location vdso.so) " +
           "$(location vdso.cc) " +
@@ -73,7 +62,7 @@ genrule(
     ],
     features = ["-pie"],
     toolchains = [
-        "@bazel_tools//tools/cpp:current_cc_toolchain",
+        cc_toolchain,
         ":no_pie_cc_flags",
     ],
     visibility = ["//:sandbox"],
-- 
cgit v1.2.3


From 29316e66adfc49c158425554761e34c12338f1d9 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 27 Jan 2020 12:27:04 -0800
Subject: Cleanup for GH review.

---
 pkg/abi/linux/netfilter.go               |  14 +--
 pkg/sentry/socket/netfilter/netfilter.go | 144 ++++++++++++-------------------
 pkg/tcpip/iptables/types.go              |  15 ----
 pkg/tcpip/iptables/udp_matcher.go        |  62 ++++++-------
 test/iptables/filter_input.go            |   6 +-
 5 files changed, 88 insertions(+), 153 deletions(-)

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index effed7976..8e40bcc62 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -198,6 +198,8 @@ type XTEntryMatch struct {
 // SizeOfXTEntryMatch is the size of an XTEntryMatch.
 const SizeOfXTEntryMatch = 32
 
+// KernelXTEntryMatch is identical to XTEntryMatch, but contains
+// variable-length Data field.
 type KernelXTEntryMatch struct {
 	XTEntryMatch
 	Data []byte
@@ -349,19 +351,19 @@ func goString(cstring []byte) string {
 // XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp
 // in include/uapi/linux/netfilter/xt_tcpudp.h.
 type XTUDP struct {
-	// SourcePortStart specifies the inclusive start of the range of source
-	// ports to which the matcher applies.
+	// SourcePortStart is the inclusive start of the range of source ports
+	// to which the matcher applies.
 	SourcePortStart uint16
 
-	// SourcePortEnd specifies the inclusive end of the range of source ports
-	// to which the matcher applies.
+	// SourcePortEnd is the inclusive end of the range of source ports to
+	// which the matcher applies.
 	SourcePortEnd uint16
 
-	// DestinationPortStart specifies the start of the destination port
+	// DestinationPortStart is the inclusive start of the destination port
 	// range to which the matcher applies.
 	DestinationPortStart uint16
 
-	// DestinationPortEnd specifies the start of the destination port
+	// DestinationPortEnd is the inclusive end of the destination port
 	// range to which the matcher applies.
 	DestinationPortEnd uint16
 
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 6c88a50a6..b8848f08a 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -34,9 +34,16 @@ import (
 // shouldn't be reached - an error has occurred if we fall through to one.
 const errorTargetName = "ERROR"
 
-// metadata is opaque to netstack. It holds data that we need to translate
-// between Linux's and netstack's iptables representations.
-// TODO(gvisor.dev/issue/170): Use metadata to check correctness.
+const (
+	matcherNameUDP = "udp"
+)
+
+// Metadata is used to verify that we are correctly serializing and
+// deserializing iptables into structs consumable by the iptables tool. We save
+// a metadata struct when the tables are written, and when they are read out we
+// verify that certain fields are the same.
+//
+// metadata is opaque to netstack.
 type metadata struct {
 	HookEntry  [linux.NF_INET_NUMHOOKS]uint32
 	Underflow  [linux.NF_INET_NUMHOOKS]uint32
@@ -44,10 +51,12 @@ type metadata struct {
 	Size       uint32
 }
 
-const enableDebugLog = true
+const enableDebug = false
 
+// nflog logs messages related to the writing and reading of iptables, but only
+// when enableDebug is true.
 func nflog(format string, args ...interface{}) {
-	if enableDebugLog {
+	if enableDebug {
 		log.Infof("netfilter: "+format, args...)
 	}
 }
@@ -80,7 +89,7 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 	info.NumEntries = metadata.NumEntries
 	info.Size = metadata.Size
 
-	nflog("GetInfo returning info: %+v", info)
+	nflog("returning info: %+v", info)
 
 	return info, nil
 }
@@ -163,19 +172,19 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	copy(entries.Name[:], tablename)
 
 	for ruleIdx, rule := range table.Rules {
-		nflog("Current offset: %d", entries.Size)
+		nflog("convert to binary: current offset: %d", entries.Size)
 
 		// Is this a chain entry point?
 		for hook, hookRuleIdx := range table.BuiltinChains {
 			if hookRuleIdx == ruleIdx {
-				nflog("Found hook %d at offset %d", hook, entries.Size)
+				nflog("convert to binary: found hook %d at offset %d", hook, entries.Size)
 				meta.HookEntry[hook] = entries.Size
 			}
 		}
 		// Is this a chain underflow point?
 		for underflow, underflowRuleIdx := range table.Underflows {
 			if underflowRuleIdx == ruleIdx {
-				nflog("Found underflow %d at offset %d", underflow, entries.Size)
+				nflog("convert to binary: found underflow %d at offset %d", underflow, entries.Size)
 				meta.Underflow[underflow] = entries.Size
 			}
 		}
@@ -195,7 +204,7 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 			// Serialize the matcher and add it to the
 			// entry.
 			serialized := marshalMatcher(matcher)
-			nflog("matcher serialized as: %v", serialized)
+			nflog("convert to binary: matcher serialized as: %v", serialized)
 			if len(serialized)%8 != 0 {
 				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
 			}
@@ -212,14 +221,14 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 		entry.Elems = append(entry.Elems, serialized...)
 		entry.NextOffset += uint16(len(serialized))
 
-		nflog("Adding entry: %+v", entry)
+		nflog("convert to binary: adding entry: %+v", entry)
 
 		entries.Size += uint32(entry.NextOffset)
 		entries.Entrytable = append(entries.Entrytable, entry)
 		meta.NumEntries++
 	}
 
-	nflog("Finished with an marshalled size of %d", meta.Size)
+	nflog("convert to binary: finished with an marshalled size of %d", meta.Size)
 	meta.Size = entries.Size
 	return entries, meta, nil
 }
@@ -237,16 +246,18 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
 }
 
 func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
-	nflog("Marshalling UDP matcher: %+v", matcher)
+	nflog("convert to binary: marshalling UDP matcher: %+v", matcher)
+
+	// We have to pad this struct size to a multiple of 8 bytes.
+	const size = linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6
 
 	linuxMatcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
-			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6,
-			// Name:      "udp",
+			MatchSize: size,
 		},
 		Data: make([]byte, 0, linux.SizeOfXTUDP),
 	}
-	copy(linuxMatcher.Name[:], "udp")
+	copy(linuxMatcher.Name[:], matcherNameUDP)
 
 	xtudp := linux.XTUDP{
 		SourcePortStart:      matcher.Data.SourcePortStart,
@@ -255,17 +266,12 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
 		DestinationPortEnd:   matcher.Data.DestinationPortEnd,
 		InverseFlags:         matcher.Data.InverseFlags,
 	}
-	nflog("marshalUDPMatcher: xtudp: %+v", xtudp)
 	linuxMatcher.Data = binary.Marshal(linuxMatcher.Data, usermem.ByteOrder, xtudp)
-	nflog("marshalUDPMatcher: linuxMatcher: %+v", linuxMatcher)
 
-	// We have to pad this struct size to a multiple of 8 bytes, so we make
-	// this a little longer than it needs to be.
-	buf := make([]byte, 0, linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP+6)
+	buf := make([]byte, 0, size)
 	buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher)
 	buf = append(buf, []byte{0, 0, 0, 0, 0, 0}...)
-	nflog("Marshalled into matcher of size %d", len(buf))
-	nflog("marshalUDPMatcher: buf is: %v", buf)
+	nflog("convert to binary: marshalled UDP matcher into %v", buf)
 	return buf[:]
 }
 
@@ -283,9 +289,8 @@ func marshalTarget(target iptables.Target) []byte {
 }
 
 func marshalStandardTarget(verdict iptables.Verdict) []byte {
-	nflog("Marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
+	nflog("convert to binary: marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
 
-	// TODO: Must be aligned.
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
 		Target: linux.XTEntryTarget{
@@ -353,8 +358,6 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
 // SetEntries sets iptables rules for a single table. See
 // net/ipv4/netfilter/ip_tables.c:translate_table for reference.
 func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
-	// printReplace(optVal)
-
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
 		log.Warningf("netfilter.SetEntries: optVal has insufficient size for replace %d", len(optVal))
@@ -375,13 +378,13 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		return syserr.ErrInvalidArgument
 	}
 
-	nflog("Setting entries in table %q", replace.Name.String())
+	nflog("set entries: setting entries in table %q", replace.Name.String())
 
 	// Convert input into a list of rules and their offsets.
 	var offset uint32
 	var offsets []uint32
 	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
-		nflog("Processing entry at offset %d", offset)
+		nflog("set entries: processing entry at offset %d", offset)
 
 		// Get the struct ipt_entry.
 		if len(optVal) < linux.SizeOfIPTEntry {
@@ -406,11 +409,13 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 			return err
 		}
 
-		// TODO: Matchers (and maybe targets) can specify that they only work for certiain protocols, hooks, tables.
+		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
+		// that they only work for certiain protocols, hooks, tables.
 		// Get matchers.
 		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
 		if len(optVal) < int(matchersSize) {
 			log.Warningf("netfilter: entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+			return syserr.ErrInvalidArgument
 		}
 		matchers, err := parseMatchers(filter, optVal[:matchersSize])
 		if err != nil {
@@ -423,6 +428,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		targetSize := entry.NextOffset - entry.TargetOffset
 		if len(optVal) < int(targetSize) {
 			log.Warningf("netfilter: entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+			return syserr.ErrInvalidArgument
 		}
 		target, err := parseTarget(optVal[:targetSize])
 		if err != nil {
@@ -500,10 +506,11 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
 // only the matchers.
 func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) {
-	nflog("Parsing matchers of size %d", len(optVal))
+	nflog("set entries: parsing matchers of size %d", len(optVal))
 	var matchers []iptables.Matcher
 	for len(optVal) > 0 {
-		nflog("parseMatchers: optVal has len %d", len(optVal))
+		nflog("set entries: optVal has len %d", len(optVal))
+
 		// Get the XTEntryMatch.
 		if len(optVal) < linux.SizeOfXTEntryMatch {
 			log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal))
@@ -512,7 +519,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 		var match linux.XTEntryMatch
 		buf := optVal[:linux.SizeOfXTEntryMatch]
 		binary.Unmarshal(buf, usermem.ByteOrder, &match)
-		nflog("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match)
+		nflog("set entries: parsed entry match %q: %+v", match.Name.String(), match)
 
 		// Check some invariants.
 		if match.MatchSize < linux.SizeOfXTEntryMatch {
@@ -528,17 +535,17 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 		var matcher iptables.Matcher
 		var err error
 		switch match.Name.String() {
-		case "udp":
+		case matcherNameUDP:
 			if len(buf) < linux.SizeOfXTUDP {
 				log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal))
 				return nil, syserr.ErrInvalidArgument
 			}
+			// For alignment reasons, the match's total size may
+			// exceed what's strictly necessary to hold matchData.
 			var matchData linux.XTUDP
-			// For alignment reasons, the match's total size may exceed what's
-			// strictly necessary to hold matchData.
 			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
 			log.Infof("parseMatchers: parsed XTUDP: %+v", matchData)
-			matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherData{
+			matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherParams{
 				SourcePortStart:      matchData.SourcePortStart,
 				SourcePortEnd:        matchData.SourcePortEnd,
 				DestinationPortStart: matchData.DestinationPortStart,
@@ -557,19 +564,22 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 		matchers = append(matchers, matcher)
 
-		// TODO: Support revision.
-		// TODO: Support proto -- matchers usually specify which proto(s) they work with.
+		// TODO(gvisor.dev/issue/170): Check the revision field.
 		optVal = optVal[match.MatchSize:]
 	}
 
-	// TODO: Check that optVal is exhausted.
+	if len(optVal) != 0 {
+		log.Warningf("netfilter: optVal should be exhausted after parsing matchers")
+		return nil, syserr.ErrInvalidArgument
+	}
+
 	return matchers, nil
 }
 
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
 func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
-	nflog("Parsing target of size %d", len(optVal))
+	nflog("set entries: parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
 		log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
 		return nil, syserr.ErrInvalidArgument
@@ -598,7 +608,8 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 		case iptables.Drop:
 			return iptables.UnconditionalDropTarget{}, nil
 		default:
-			panic(fmt.Sprintf("Unknown verdict: %v", verdict))
+			log.Warningf("Unknown verdict: %v", verdict)
+			return nil, syserr.ErrInvalidArgument
 		}
 
 	case errorTargetName:
@@ -673,52 +684,3 @@ func hookFromLinux(hook int) iptables.Hook {
 	}
 	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
 }
-
-// printReplace prints information about the struct ipt_replace in optVal. It
-// is only for debugging.
-func printReplace(optVal []byte) {
-	// Basic replace info.
-	var replace linux.IPTReplace
-	replaceBuf := optVal[:linux.SizeOfIPTReplace]
-	optVal = optVal[linux.SizeOfIPTReplace:]
-	binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace)
-	log.Infof("Replacing table %q: %+v", replace.Name.String(), replace)
-
-	// Read in the list of entries at the end of replace.
-	var totalOffset uint16
-	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
-		var entry linux.IPTEntry
-		entryBuf := optVal[:linux.SizeOfIPTEntry]
-		binary.Unmarshal(entryBuf, usermem.ByteOrder, &entry)
-		log.Infof("Entry %d (total offset %d): %+v", entryIdx, totalOffset, entry)
-
-		totalOffset += entry.NextOffset
-		if entry.TargetOffset == linux.SizeOfIPTEntry {
-			log.Infof("Entry has no matches.")
-		} else {
-			log.Infof("Entry has matches.")
-		}
-
-		var target linux.XTEntryTarget
-		targetBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTEntryTarget]
-		binary.Unmarshal(targetBuf, usermem.ByteOrder, &target)
-		log.Infof("Target named %q: %+v", target.Name.String(), target)
-
-		switch target.Name.String() {
-		case "":
-			var standardTarget linux.XTStandardTarget
-			stBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTStandardTarget]
-			binary.Unmarshal(stBuf, usermem.ByteOrder, &standardTarget)
-			log.Infof("Standard target with verdict %q (%d).", linux.VerdictStrings[standardTarget.Verdict], standardTarget.Verdict)
-		case errorTargetName:
-			var errorTarget linux.XTErrorTarget
-			etBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTErrorTarget]
-			binary.Unmarshal(etBuf, usermem.ByteOrder, &errorTarget)
-			log.Infof("Error target with name %q.", errorTarget.Name.String())
-		default:
-			log.Infof("Unknown target type.")
-		}
-
-		optVal = optVal[entry.NextOffset:]
-	}
-}
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index d47447d40..ba5ed75b4 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -169,8 +169,6 @@ type IPHeaderFilter struct {
 	Protocol tcpip.TransportProtocolNumber
 }
 
-// TODO: Should these be able to marshal/unmarshal themselves?
-// TODO: Something has to map the name to the matcher.
 // A Matcher is the interface for matching packets.
 type Matcher interface {
 	// Match returns whether the packet matches and whether the packet
@@ -179,19 +177,6 @@ type Matcher interface {
 	//
 	// Precondition: packet.NetworkHeader is set.
 	Match(hook Hook, packet tcpip.PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
-
-	// TODO: Make this typesafe by having each Matcher have their own, typed CheckEntry?
-	// CheckEntry(params MatchCheckEntryParams) bool
-}
-
-// TODO: Unused?
-type MatchCheckEntryParams struct {
-	Table  string // TODO: Tables should be an enum...
-	Filter IPHeaderFilter
-	Info   interface{} // TODO: Type unsafe.
-	// HookMask       uint8
-	// Family         uint8
-	// NFTCompat      bool
 }
 
 // A Target is the interface for taking an action for a packet.
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
index 65ae7f9e0..f59ca2027 100644
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ b/pkg/tcpip/iptables/udp_matcher.go
@@ -16,33 +16,28 @@ package iptables
 
 import (
 	"fmt"
-	"runtime/debug"
 
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
+// TODO(gvisor.dev/issue/170): The following per-matcher params should be
+// supported:
+// - Table name
+// - Match size
+// - User size
+// - Hooks
+// - Proto
+// - Family
+
+// UDPMatcher matches UDP packets and their headers. It implements Matcher.
 type UDPMatcher struct {
-	Data UDPMatcherData
-
-	// tablename string
-	// unsigned int matchsize;
-	// unsigned int usersize;
-	// #ifdef CONFIG_COMPAT
-	// unsigned int compatsize;
-	// #endif
-	// unsigned int hooks;
-	// unsigned short proto;
-	// unsigned short family;
+	Data UDPMatcherParams
 }
 
-// TODO: Delete?
-// MatchCheckEntryParams
-
-type UDPMatcherData struct {
-	// Filter IPHeaderFilter
-
+// UDPMatcherParams are the parameters used to create a UDPMatcher.
+type UDPMatcherParams struct {
 	SourcePortStart      uint16
 	SourcePortEnd        uint16
 	DestinationPortStart uint16
@@ -50,12 +45,12 @@ type UDPMatcherData struct {
 	InverseFlags         uint8
 }
 
-func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error) {
-	// TODO: We currently only support source port and destination port.
-	log.Infof("Adding rule with UDPMatcherData: %+v", data)
+// NewUDPMatcher returns a new instance of UDPMatcher.
+func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherParams) (Matcher, error) {
+	log.Infof("Adding rule with UDPMatcherParams: %+v", data)
 
 	if data.InverseFlags != 0 {
-		return nil, fmt.Errorf("unsupported UDP matcher flags set")
+		return nil, fmt.Errorf("unsupported UDP matcher inverse flags set")
 	}
 
 	if filter.Protocol != header.UDPProtocolNumber {
@@ -65,21 +60,18 @@ func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error)
 	return &UDPMatcher{Data: data}, nil
 }
 
-// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments).
+// Match implements Matcher.Match.
 func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
-	log.Infof("UDPMatcher called from: %s", string(debug.Stack()))
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
-	// TODO: Do we check proto here or elsewhere? I think elsewhere (check
-	// codesearch).
+	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
+	// into the iptables.Check codepath as matchers are added.
 	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
-		log.Infof("UDPMatcher: wrong protocol number")
 		return false, false
 	}
 
 	// We dont't match fragments.
 	if frag := netHeader.FragmentOffset(); frag != 0 {
-		log.Infof("UDPMatcher: it's a fragment")
 		if frag == 1 {
 			return false, true
 		}
@@ -89,20 +81,18 @@ func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str
 
 	// Now we need the transport header. However, this may not have been set
 	// yet.
-	// TODO
+	// TODO(gvisor.dev/issue/170): Parsing the transport header should
+	// ultimately be moved into the iptables.Check codepath as matchers are
+	// added.
 	var udpHeader header.UDP
 	if pkt.TransportHeader != nil {
-		log.Infof("UDPMatcher: transport header is not nil")
 		udpHeader = header.UDP(pkt.TransportHeader)
 	} else {
-		log.Infof("UDPMatcher: transport header is nil")
-		log.Infof("UDPMatcher: is network header nil: %t", pkt.NetworkHeader == nil)
 		// The UDP header hasn't been parsed yet. We have to do it here.
 		if len(pkt.Data.First()) < header.UDPMinimumSize {
 			// There's no valid UDP header here, so we hotdrop the
 			// packet.
-			// TODO: Stats.
-			log.Warningf("Dropping UDP packet: size to small.")
+			log.Warningf("Dropping UDP packet: size too small.")
 			return false, true
 		}
 		udpHeader = header.UDP(pkt.Data.First())
@@ -112,10 +102,6 @@ func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str
 	// matching range.
 	sourcePort := udpHeader.SourcePort()
 	destinationPort := udpHeader.DestinationPort()
-	log.Infof("UDPMatcher: sport and dport are %d and %d. sports and dport start and end are (%d, %d) and (%d, %d)",
-		udpHeader.SourcePort(), udpHeader.DestinationPort(),
-		um.Data.SourcePortStart, um.Data.SourcePortEnd,
-		um.Data.DestinationPortStart, um.Data.DestinationPortEnd)
 	if sourcePort < um.Data.SourcePortStart || um.Data.SourcePortEnd < sourcePort {
 		return false, false
 	}
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index bc963d40e..e9f0978eb 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -264,9 +264,9 @@ func (FilterInputMultiUDPRules) ContainerAction(ip net.IP) error {
 	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
 		return err
 	}
-	// if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"); err != nil {
-	// 	return err
-	// }
+	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"); err != nil {
+		return err
+	}
 	return filterTable("-L")
 }
 
-- 
cgit v1.2.3


From e889f95671a9b7b1c6f65cb6fbc1b865a896e827 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 27 Jan 2020 12:30:06 -0800
Subject: More cleanup.

---
 pkg/tcpip/iptables/udp_matcher.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
index f59ca2027..3bb076f9c 100644
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ b/pkg/tcpip/iptables/udp_matcher.go
@@ -73,9 +73,9 @@ func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str
 	// We dont't match fragments.
 	if frag := netHeader.FragmentOffset(); frag != 0 {
 		if frag == 1 {
+			log.Warningf("Dropping UDP packet: malicious fragmented packet.")
 			return false, true
 		}
-		log.Warningf("Dropping UDP packet: malicious fragmented packet.")
 		return false, false
 	}
 
-- 
cgit v1.2.3


From 6b14be4246e8ed3779bf69dbd59e669caf3f5704 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Mon, 27 Jan 2020 10:08:18 -0800
Subject: Refactor to hide C from channel.Endpoint.

This is to aid later implementation for /dev/net/tun device.

PiperOrigin-RevId: 291746025
---
 pkg/tcpip/link/channel/channel.go                  |  43 ++++-
 pkg/tcpip/network/arp/arp_test.go                  |  16 +-
 pkg/tcpip/network/ipv6/icmp_test.go                |   7 +-
 pkg/tcpip/stack/ndp_test.go                        |  87 +++++-----
 pkg/tcpip/stack/stack_test.go                      |   4 +-
 pkg/tcpip/stack/transport_test.go                  |   6 +-
 pkg/tcpip/transport/tcp/testing/context/context.go |  86 +++++-----
 pkg/tcpip/transport/udp/udp_test.go                | 182 +++++++++++----------
 8 files changed, 229 insertions(+), 202 deletions(-)

diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 70188551f..71b9da797 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -18,6 +18,8 @@
 package channel
 
 import (
+	"context"
+
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -38,25 +40,52 @@ type Endpoint struct {
 	linkAddr   tcpip.LinkAddress
 	GSO        bool
 
-	// C is where outbound packets are queued.
-	C chan PacketInfo
+	// c is where outbound packets are queued.
+	c chan PacketInfo
 }
 
 // New creates a new channel endpoint.
 func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
 	return &Endpoint{
-		C:        make(chan PacketInfo, size),
+		c:        make(chan PacketInfo, size),
 		mtu:      mtu,
 		linkAddr: linkAddr,
 	}
 }
 
+// Close closes e. Further packet injections will panic. Reads continue to
+// succeed until all packets are read.
+func (e *Endpoint) Close() {
+	close(e.c)
+}
+
+// Read does non-blocking read for one packet from the outbound packet queue.
+func (e *Endpoint) Read() (PacketInfo, bool) {
+	select {
+	case pkt := <-e.c:
+		return pkt, true
+	default:
+		return PacketInfo{}, false
+	}
+}
+
+// ReadContext does blocking read for one packet from the outbound packet queue.
+// It can be cancelled by ctx, and in this case, it returns false.
+func (e *Endpoint) ReadContext(ctx context.Context) (PacketInfo, bool) {
+	select {
+	case pkt := <-e.c:
+		return pkt, true
+	case <-ctx.Done():
+		return PacketInfo{}, false
+	}
+}
+
 // Drain removes all outbound packets from the channel and counts them.
 func (e *Endpoint) Drain() int {
 	c := 0
 	for {
 		select {
-		case <-e.C:
+		case <-e.c:
 			c++
 		default:
 			return c
@@ -125,7 +154,7 @@ func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 	}
 
 	select {
-	case e.C <- p:
+	case e.c <- p:
 	default:
 	}
 
@@ -150,7 +179,7 @@ packetLoop:
 		}
 
 		select {
-		case e.C <- p:
+		case e.c <- p:
 			n++
 		default:
 			break packetLoop
@@ -169,7 +198,7 @@ func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	}
 
 	select {
-	case e.C <- p:
+	case e.c <- p:
 	default:
 	}
 
diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
index 8e6048a21..03cf03b6d 100644
--- a/pkg/tcpip/network/arp/arp_test.go
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -15,6 +15,7 @@
 package arp_test
 
 import (
+	"context"
 	"strconv"
 	"testing"
 	"time"
@@ -83,7 +84,7 @@ func newTestContext(t *testing.T) *testContext {
 }
 
 func (c *testContext) cleanup() {
-	close(c.linkEP.C)
+	c.linkEP.Close()
 }
 
 func TestDirectRequest(t *testing.T) {
@@ -110,7 +111,7 @@ func TestDirectRequest(t *testing.T) {
 	for i, address := range []tcpip.Address{stackAddr1, stackAddr2} {
 		t.Run(strconv.Itoa(i), func(t *testing.T) {
 			inject(address)
-			pi := <-c.linkEP.C
+			pi, _ := c.linkEP.ReadContext(context.Background())
 			if pi.Proto != arp.ProtocolNumber {
 				t.Fatalf("expected ARP response, got network protocol number %d", pi.Proto)
 			}
@@ -134,12 +135,11 @@ func TestDirectRequest(t *testing.T) {
 	}
 
 	inject(stackAddrBad)
-	select {
-	case pkt := <-c.linkEP.C:
+	// Sleep tests are gross, but this will only potentially flake
+	// if there's a bug. If there is no bug this will reliably
+	// succeed.
+	ctx, _ := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	if pkt, ok := c.linkEP.ReadContext(ctx); ok {
 		t.Errorf("stackAddrBad: unexpected packet sent, Proto=%v", pkt.Proto)
-	case <-time.After(100 * time.Millisecond):
-		// Sleep tests are gross, but this will only potentially flake
-		// if there's a bug. If there is no bug this will reliably
-		// succeed.
 	}
 }
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index a2fdc5dcd..7a6820643 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -15,6 +15,7 @@
 package ipv6
 
 import (
+	"context"
 	"reflect"
 	"strings"
 	"testing"
@@ -264,8 +265,8 @@ func newTestContext(t *testing.T) *testContext {
 }
 
 func (c *testContext) cleanup() {
-	close(c.linkEP0.C)
-	close(c.linkEP1.C)
+	c.linkEP0.Close()
+	c.linkEP1.Close()
 }
 
 type routeArgs struct {
@@ -276,7 +277,7 @@ type routeArgs struct {
 func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.ICMPv6)) {
 	t.Helper()
 
-	pi := <-args.src.C
+	pi, _ := args.src.ReadContext(context.Background())
 
 	{
 		views := []buffer.View{pi.Pkt.Header.View(), pi.Pkt.Data.ToView()}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index f9460bd51..ad2c6f601 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -15,6 +15,7 @@
 package stack_test
 
 import (
+	"context"
 	"encoding/binary"
 	"fmt"
 	"testing"
@@ -405,7 +406,7 @@ func TestDADResolve(t *testing.T) {
 
 			// Validate the sent Neighbor Solicitation messages.
 			for i := uint8(0); i < test.dupAddrDetectTransmits; i++ {
-				p := <-e.C
+				p, _ := e.ReadContext(context.Background())
 
 				// Make sure its an IPv6 packet.
 				if p.Proto != header.IPv6ProtocolNumber {
@@ -3285,29 +3286,29 @@ func TestRouterSolicitation(t *testing.T) {
 				e := channel.New(int(test.maxRtrSolicit), 1280, linkAddr1)
 				waitForPkt := func(timeout time.Duration) {
 					t.Helper()
-					select {
-					case p := <-e.C:
-						if p.Proto != header.IPv6ProtocolNumber {
-							t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
-						}
-						checker.IPv6(t,
-							p.Pkt.Header.View(),
-							checker.SrcAddr(header.IPv6Any),
-							checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
-							checker.TTL(header.NDPHopLimit),
-							checker.NDPRS(),
-						)
-
-					case <-time.After(timeout):
+					ctx, _ := context.WithTimeout(context.Background(), timeout)
+					p, ok := e.ReadContext(ctx)
+					if !ok {
 						t.Fatal("timed out waiting for packet")
+						return
 					}
+
+					if p.Proto != header.IPv6ProtocolNumber {
+						t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+					}
+					checker.IPv6(t,
+						p.Pkt.Header.View(),
+						checker.SrcAddr(header.IPv6Any),
+						checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+						checker.TTL(header.NDPHopLimit),
+						checker.NDPRS(),
+					)
 				}
 				waitForNothing := func(timeout time.Duration) {
 					t.Helper()
-					select {
-					case <-e.C:
+					ctx, _ := context.WithTimeout(context.Background(), timeout)
+					if _, ok := e.ReadContext(ctx); ok {
 						t.Fatal("unexpectedly got a packet")
-					case <-time.After(timeout):
 					}
 				}
 				s := stack.New(stack.Options{
@@ -3362,20 +3363,21 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 	e := channel.New(maxRtrSolicitations, 1280, linkAddr1)
 	waitForPkt := func(timeout time.Duration) {
 		t.Helper()
-		select {
-		case p := <-e.C:
-			if p.Proto != header.IPv6ProtocolNumber {
-				t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
-			}
-			checker.IPv6(t, p.Pkt.Header.View(),
-				checker.SrcAddr(header.IPv6Any),
-				checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
-				checker.TTL(header.NDPHopLimit),
-				checker.NDPRS())
-
-		case <-time.After(timeout):
+		ctx, _ := context.WithTimeout(context.Background(), timeout)
+		p, ok := e.ReadContext(ctx)
+		if !ok {
 			t.Fatal("timed out waiting for packet")
+			return
 		}
+
+		if p.Proto != header.IPv6ProtocolNumber {
+			t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+		}
+		checker.IPv6(t, p.Pkt.Header.View(),
+			checker.SrcAddr(header.IPv6Any),
+			checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+			checker.TTL(header.NDPHopLimit),
+			checker.NDPRS())
 	}
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
@@ -3391,23 +3393,20 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 
 	// Enable forwarding which should stop router solicitations.
 	s.SetForwarding(true)
-	select {
-	case <-e.C:
+	ctx, _ := context.WithTimeout(context.Background(), delay+defaultTimeout)
+	if _, ok := e.ReadContext(ctx); ok {
 		// A single RS may have been sent before forwarding was enabled.
-		select {
-		case <-e.C:
+		ctx, _ = context.WithTimeout(context.Background(), interval+defaultTimeout)
+		if _, ok = e.ReadContext(ctx); ok {
 			t.Fatal("Should not have sent more than one RS message")
-		case <-time.After(interval + defaultTimeout):
 		}
-	case <-time.After(delay + defaultTimeout):
 	}
 
 	// Enabling forwarding again should do nothing.
 	s.SetForwarding(true)
-	select {
-	case <-e.C:
+	ctx, _ = context.WithTimeout(context.Background(), delay+defaultTimeout)
+	if _, ok := e.ReadContext(ctx); ok {
 		t.Fatal("unexpectedly got a packet after becoming a router")
-	case <-time.After(delay + defaultTimeout):
 	}
 
 	// Disable forwarding which should start router solicitations.
@@ -3415,17 +3414,15 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 	waitForPkt(delay + defaultAsyncEventTimeout)
 	waitForPkt(interval + defaultAsyncEventTimeout)
 	waitForPkt(interval + defaultAsyncEventTimeout)
-	select {
-	case <-e.C:
+	ctx, _ = context.WithTimeout(context.Background(), interval+defaultTimeout)
+	if _, ok := e.ReadContext(ctx); ok {
 		t.Fatal("unexpectedly got an extra packet after sending out the expected RSs")
-	case <-time.After(interval + defaultTimeout):
 	}
 
 	// Disabling forwarding again should do nothing.
 	s.SetForwarding(false)
-	select {
-	case <-e.C:
+	ctx, _ = context.WithTimeout(context.Background(), delay+defaultTimeout)
+	if _, ok := e.ReadContext(ctx); ok {
 		t.Fatal("unexpectedly got a packet after becoming a router")
-	case <-time.After(delay + defaultTimeout):
 	}
 }
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index dad288642..834fe9487 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -1880,9 +1880,7 @@ func TestNICForwarding(t *testing.T) {
 		Data: buf.ToVectorisedView(),
 	})
 
-	select {
-	case <-ep2.C:
-	default:
+	if _, ok := ep2.Read(); !ok {
 		t.Fatal("Packet not forwarded")
 	}
 
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index f50604a8a..869c69a6d 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -623,10 +623,8 @@ func TestTransportForwarding(t *testing.T) {
 		t.Fatalf("Write failed: %v", err)
 	}
 
-	var p channel.PacketInfo
-	select {
-	case p = <-ep2.C:
-	default:
+	p, ok := ep2.Read()
+	if !ok {
 		t.Fatal("Response packet not forwarded")
 	}
 
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 822907998..730ac4292 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -18,6 +18,7 @@ package context
 
 import (
 	"bytes"
+	"context"
 	"testing"
 	"time"
 
@@ -215,11 +216,9 @@ func (c *Context) Stack() *stack.Stack {
 func (c *Context) CheckNoPacketTimeout(errMsg string, wait time.Duration) {
 	c.t.Helper()
 
-	select {
-	case <-c.linkEP.C:
+	ctx, _ := context.WithTimeout(context.Background(), wait)
+	if _, ok := c.linkEP.ReadContext(ctx); ok {
 		c.t.Fatal(errMsg)
-
-	case <-time.After(wait):
 	}
 }
 
@@ -234,27 +233,27 @@ func (c *Context) CheckNoPacket(errMsg string) {
 // 2 seconds.
 func (c *Context) GetPacket() []byte {
 	c.t.Helper()
-	select {
-	case p := <-c.linkEP.C:
-		if p.Proto != ipv4.ProtocolNumber {
-			c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber)
-		}
 
-		hdr := p.Pkt.Header.View()
-		b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...)
+	ctx, _ := context.WithTimeout(context.Background(), 2*time.Second)
+	p, ok := c.linkEP.ReadContext(ctx)
+	if !ok {
+		c.t.Fatalf("Packet wasn't written out")
+		return nil
+	}
 
-		if p.GSO != nil && p.GSO.L3HdrLen != header.IPv4MinimumSize {
-			c.t.Errorf("L3HdrLen %v (expected %v)", p.GSO.L3HdrLen, header.IPv4MinimumSize)
-		}
+	if p.Proto != ipv4.ProtocolNumber {
+		c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber)
+	}
 
-		checker.IPv4(c.t, b, checker.SrcAddr(StackAddr), checker.DstAddr(TestAddr))
-		return b
+	hdr := p.Pkt.Header.View()
+	b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...)
 
-	case <-time.After(2 * time.Second):
-		c.t.Fatalf("Packet wasn't written out")
+	if p.GSO != nil && p.GSO.L3HdrLen != header.IPv4MinimumSize {
+		c.t.Errorf("L3HdrLen %v (expected %v)", p.GSO.L3HdrLen, header.IPv4MinimumSize)
 	}
 
-	return nil
+	checker.IPv4(c.t, b, checker.SrcAddr(StackAddr), checker.DstAddr(TestAddr))
+	return b
 }
 
 // GetPacketNonBlocking reads a packet from the link layer endpoint
@@ -263,20 +262,21 @@ func (c *Context) GetPacket() []byte {
 // nil immediately.
 func (c *Context) GetPacketNonBlocking() []byte {
 	c.t.Helper()
-	select {
-	case p := <-c.linkEP.C:
-		if p.Proto != ipv4.ProtocolNumber {
-			c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber)
-		}
 
-		hdr := p.Pkt.Header.View()
-		b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...)
-
-		checker.IPv4(c.t, b, checker.SrcAddr(StackAddr), checker.DstAddr(TestAddr))
-		return b
-	default:
+	p, ok := c.linkEP.Read()
+	if !ok {
 		return nil
 	}
+
+	if p.Proto != ipv4.ProtocolNumber {
+		c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber)
+	}
+
+	hdr := p.Pkt.Header.View()
+	b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...)
+
+	checker.IPv4(c.t, b, checker.SrcAddr(StackAddr), checker.DstAddr(TestAddr))
+	return b
 }
 
 // SendICMPPacket builds and sends an ICMPv4 packet via the link layer endpoint.
@@ -484,23 +484,23 @@ func (c *Context) CreateV6Endpoint(v6only bool) {
 // and asserts that it is an IPv6 Packet with the expected src/dest addresses.
 func (c *Context) GetV6Packet() []byte {
 	c.t.Helper()
-	select {
-	case p := <-c.linkEP.C:
-		if p.Proto != ipv6.ProtocolNumber {
-			c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv6.ProtocolNumber)
-		}
-		b := make([]byte, p.Pkt.Header.UsedLength()+p.Pkt.Data.Size())
-		copy(b, p.Pkt.Header.View())
-		copy(b[p.Pkt.Header.UsedLength():], p.Pkt.Data.ToView())
-
-		checker.IPv6(c.t, b, checker.SrcAddr(StackV6Addr), checker.DstAddr(TestV6Addr))
-		return b
 
-	case <-time.After(2 * time.Second):
+	ctx, _ := context.WithTimeout(context.Background(), 2*time.Second)
+	p, ok := c.linkEP.ReadContext(ctx)
+	if !ok {
 		c.t.Fatalf("Packet wasn't written out")
+		return nil
+	}
+
+	if p.Proto != ipv6.ProtocolNumber {
+		c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv6.ProtocolNumber)
 	}
+	b := make([]byte, p.Pkt.Header.UsedLength()+p.Pkt.Data.Size())
+	copy(b, p.Pkt.Header.View())
+	copy(b[p.Pkt.Header.UsedLength():], p.Pkt.Data.ToView())
 
-	return nil
+	checker.IPv6(c.t, b, checker.SrcAddr(StackV6Addr), checker.DstAddr(TestV6Addr))
+	return b
 }
 
 // SendV6Packet builds and sends an IPv6 Packet via the link layer endpoint of
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index c6927cfe3..f0ff3fe71 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -16,6 +16,7 @@ package udp_test
 
 import (
 	"bytes"
+	"context"
 	"fmt"
 	"math/rand"
 	"testing"
@@ -357,30 +358,29 @@ func (c *testContext) createEndpointForFlow(flow testFlow) {
 func (c *testContext) getPacketAndVerify(flow testFlow, checkers ...checker.NetworkChecker) []byte {
 	c.t.Helper()
 
-	select {
-	case p := <-c.linkEP.C:
-		if p.Proto != flow.netProto() {
-			c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, flow.netProto())
-		}
-
-		hdr := p.Pkt.Header.View()
-		b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...)
-
-		h := flow.header4Tuple(outgoing)
-		checkers := append(
-			checkers,
-			checker.SrcAddr(h.srcAddr.Addr),
-			checker.DstAddr(h.dstAddr.Addr),
-			checker.UDP(checker.DstPort(h.dstAddr.Port)),
-		)
-		flow.checkerFn()(c.t, b, checkers...)
-		return b
-
-	case <-time.After(2 * time.Second):
+	ctx, _ := context.WithTimeout(context.Background(), 2*time.Second)
+	p, ok := c.linkEP.ReadContext(ctx)
+	if !ok {
 		c.t.Fatalf("Packet wasn't written out")
+		return nil
 	}
 
-	return nil
+	if p.Proto != flow.netProto() {
+		c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, flow.netProto())
+	}
+
+	hdr := p.Pkt.Header.View()
+	b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...)
+
+	h := flow.header4Tuple(outgoing)
+	checkers = append(
+		checkers,
+		checker.SrcAddr(h.srcAddr.Addr),
+		checker.DstAddr(h.dstAddr.Addr),
+		checker.UDP(checker.DstPort(h.dstAddr.Port)),
+	)
+	flow.checkerFn()(c.t, b, checkers...)
+	return b
 }
 
 // injectPacket creates a packet of the given flow and with the given payload,
@@ -1541,48 +1541,50 @@ func TestV4UnknownDestination(t *testing.T) {
 			}
 			c.injectPacket(tc.flow, payload)
 			if !tc.icmpRequired {
-				select {
-				case p := <-c.linkEP.C:
+				ctx, _ := context.WithTimeout(context.Background(), time.Second)
+				if p, ok := c.linkEP.ReadContext(ctx); ok {
 					t.Fatalf("unexpected packet received: %+v", p)
-				case <-time.After(1 * time.Second):
-					return
 				}
+				return
 			}
 
-			select {
-			case p := <-c.linkEP.C:
-				var pkt []byte
-				pkt = append(pkt, p.Pkt.Header.View()...)
-				pkt = append(pkt, p.Pkt.Data.ToView()...)
-				if got, want := len(pkt), header.IPv4MinimumProcessableDatagramSize; got > want {
-					t.Fatalf("got an ICMP packet of size: %d, want: sz <= %d", got, want)
-				}
+			// ICMP required.
+			ctx, _ := context.WithTimeout(context.Background(), time.Second)
+			p, ok := c.linkEP.ReadContext(ctx)
+			if !ok {
+				t.Fatalf("packet wasn't written out")
+				return
+			}
 
-				hdr := header.IPv4(pkt)
-				checker.IPv4(t, hdr, checker.ICMPv4(
-					checker.ICMPv4Type(header.ICMPv4DstUnreachable),
-					checker.ICMPv4Code(header.ICMPv4PortUnreachable)))
+			var pkt []byte
+			pkt = append(pkt, p.Pkt.Header.View()...)
+			pkt = append(pkt, p.Pkt.Data.ToView()...)
+			if got, want := len(pkt), header.IPv4MinimumProcessableDatagramSize; got > want {
+				t.Fatalf("got an ICMP packet of size: %d, want: sz <= %d", got, want)
+			}
 
-				icmpPkt := header.ICMPv4(hdr.Payload())
-				payloadIPHeader := header.IPv4(icmpPkt.Payload())
-				wantLen := len(payload)
-				if tc.largePayload {
-					wantLen = header.IPv4MinimumProcessableDatagramSize - header.IPv4MinimumSize*2 - header.ICMPv4MinimumSize - header.UDPMinimumSize
-				}
+			hdr := header.IPv4(pkt)
+			checker.IPv4(t, hdr, checker.ICMPv4(
+				checker.ICMPv4Type(header.ICMPv4DstUnreachable),
+				checker.ICMPv4Code(header.ICMPv4PortUnreachable)))
 
-				// In case of large payloads the IP packet may be truncated. Update
-				// the length field before retrieving the udp datagram payload.
-				payloadIPHeader.SetTotalLength(uint16(wantLen + header.UDPMinimumSize + header.IPv4MinimumSize))
+			icmpPkt := header.ICMPv4(hdr.Payload())
+			payloadIPHeader := header.IPv4(icmpPkt.Payload())
+			wantLen := len(payload)
+			if tc.largePayload {
+				wantLen = header.IPv4MinimumProcessableDatagramSize - header.IPv4MinimumSize*2 - header.ICMPv4MinimumSize - header.UDPMinimumSize
+			}
 
-				origDgram := header.UDP(payloadIPHeader.Payload())
-				if got, want := len(origDgram.Payload()), wantLen; got != want {
-					t.Fatalf("unexpected payload length got: %d, want: %d", got, want)
-				}
-				if got, want := origDgram.Payload(), payload[:wantLen]; !bytes.Equal(got, want) {
-					t.Fatalf("unexpected payload got: %d, want: %d", got, want)
-				}
-			case <-time.After(1 * time.Second):
-				t.Fatalf("packet wasn't written out")
+			// In case of large payloads the IP packet may be truncated. Update
+			// the length field before retrieving the udp datagram payload.
+			payloadIPHeader.SetTotalLength(uint16(wantLen + header.UDPMinimumSize + header.IPv4MinimumSize))
+
+			origDgram := header.UDP(payloadIPHeader.Payload())
+			if got, want := len(origDgram.Payload()), wantLen; got != want {
+				t.Fatalf("unexpected payload length got: %d, want: %d", got, want)
+			}
+			if got, want := origDgram.Payload(), payload[:wantLen]; !bytes.Equal(got, want) {
+				t.Fatalf("unexpected payload got: %d, want: %d", got, want)
 			}
 		})
 	}
@@ -1615,47 +1617,49 @@ func TestV6UnknownDestination(t *testing.T) {
 			}
 			c.injectPacket(tc.flow, payload)
 			if !tc.icmpRequired {
-				select {
-				case p := <-c.linkEP.C:
+				ctx, _ := context.WithTimeout(context.Background(), time.Second)
+				if p, ok := c.linkEP.ReadContext(ctx); ok {
 					t.Fatalf("unexpected packet received: %+v", p)
-				case <-time.After(1 * time.Second):
-					return
 				}
+				return
 			}
 
-			select {
-			case p := <-c.linkEP.C:
-				var pkt []byte
-				pkt = append(pkt, p.Pkt.Header.View()...)
-				pkt = append(pkt, p.Pkt.Data.ToView()...)
-				if got, want := len(pkt), header.IPv6MinimumMTU; got > want {
-					t.Fatalf("got an ICMP packet of size: %d, want: sz <= %d", got, want)
-				}
+			// ICMP required.
+			ctx, _ := context.WithTimeout(context.Background(), time.Second)
+			p, ok := c.linkEP.ReadContext(ctx)
+			if !ok {
+				t.Fatalf("packet wasn't written out")
+				return
+			}
+
+			var pkt []byte
+			pkt = append(pkt, p.Pkt.Header.View()...)
+			pkt = append(pkt, p.Pkt.Data.ToView()...)
+			if got, want := len(pkt), header.IPv6MinimumMTU; got > want {
+				t.Fatalf("got an ICMP packet of size: %d, want: sz <= %d", got, want)
+			}
 
-				hdr := header.IPv6(pkt)
-				checker.IPv6(t, hdr, checker.ICMPv6(
-					checker.ICMPv6Type(header.ICMPv6DstUnreachable),
-					checker.ICMPv6Code(header.ICMPv6PortUnreachable)))
+			hdr := header.IPv6(pkt)
+			checker.IPv6(t, hdr, checker.ICMPv6(
+				checker.ICMPv6Type(header.ICMPv6DstUnreachable),
+				checker.ICMPv6Code(header.ICMPv6PortUnreachable)))
 
-				icmpPkt := header.ICMPv6(hdr.Payload())
-				payloadIPHeader := header.IPv6(icmpPkt.Payload())
-				wantLen := len(payload)
-				if tc.largePayload {
-					wantLen = header.IPv6MinimumMTU - header.IPv6MinimumSize*2 - header.ICMPv6MinimumSize - header.UDPMinimumSize
-				}
-				// In case of large payloads the IP packet may be truncated. Update
-				// the length field before retrieving the udp datagram payload.
-				payloadIPHeader.SetPayloadLength(uint16(wantLen + header.UDPMinimumSize))
+			icmpPkt := header.ICMPv6(hdr.Payload())
+			payloadIPHeader := header.IPv6(icmpPkt.Payload())
+			wantLen := len(payload)
+			if tc.largePayload {
+				wantLen = header.IPv6MinimumMTU - header.IPv6MinimumSize*2 - header.ICMPv6MinimumSize - header.UDPMinimumSize
+			}
+			// In case of large payloads the IP packet may be truncated. Update
+			// the length field before retrieving the udp datagram payload.
+			payloadIPHeader.SetPayloadLength(uint16(wantLen + header.UDPMinimumSize))
 
-				origDgram := header.UDP(payloadIPHeader.Payload())
-				if got, want := len(origDgram.Payload()), wantLen; got != want {
-					t.Fatalf("unexpected payload length got: %d, want: %d", got, want)
-				}
-				if got, want := origDgram.Payload(), payload[:wantLen]; !bytes.Equal(got, want) {
-					t.Fatalf("unexpected payload got: %v, want: %v", got, want)
-				}
-			case <-time.After(1 * time.Second):
-				t.Fatalf("packet wasn't written out")
+			origDgram := header.UDP(payloadIPHeader.Payload())
+			if got, want := len(origDgram.Payload()), wantLen; got != want {
+				t.Fatalf("unexpected payload length got: %d, want: %d", got, want)
+			}
+			if got, want := origDgram.Payload(), payload[:wantLen]; !bytes.Equal(got, want) {
+				t.Fatalf("unexpected payload got: %v, want: %v", got, want)
 			}
 		})
 	}
-- 
cgit v1.2.3


From 13c1f38dfa215ab3e3cc70642721f55ab226d5b7 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 27 Jan 2020 12:19:20 -0800
Subject: Update bug number for supporting extended attribute namespaces.

PiperOrigin-RevId: 291774815
---
 pkg/sentry/syscalls/linux/sys_xattr.go | 1 +
 test/syscalls/linux/xattr.cc           | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index e35c077d6..77deb8980 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -103,6 +103,7 @@ func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr, size uint64)
 		return 0, "", err
 	}
 
+	// TODO(b/148380782): Support xattrs in namespaces other than "user".
 	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
 		return 0, "", syserror.EOPNOTSUPP
 	}
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index e77c355d7..ab21d68c6 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -131,7 +131,7 @@ TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
 }
 
 TEST_F(XattrTest, XattrTrustedWithNonadmin) {
-  // TODO(b/127675828): Support setxattr and getxattr with "trusted" prefix.
+  // TODO(b/148380782): Support setxattr and getxattr with "trusted" prefix.
   SKIP_IF(IsRunningOnGvisor());
   SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
 
-- 
cgit v1.2.3


From fbfcfcf5b03b4fddb4f00a3e8721cba07fc5343f Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Mon, 27 Jan 2020 12:32:07 -0800
Subject: Update ChecksumVVWithoffset to use unrolled version.

Fixes #1656

PiperOrigin-RevId: 291777279
---
 pkg/tcpip/header/checksum.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/tcpip/header/checksum.go b/pkg/tcpip/header/checksum.go
index 204285576..14a4b2b44 100644
--- a/pkg/tcpip/header/checksum.go
+++ b/pkg/tcpip/header/checksum.go
@@ -213,7 +213,7 @@ func ChecksumVVWithOffset(vv buffer.VectorisedView, initial uint16, off int, siz
 		}
 		v = v[:l]
 
-		sum, odd = calculateChecksum(v, odd, uint32(sum))
+		sum, odd = unrolledCalculateChecksum(v, odd, uint32(sum))
 
 		size -= len(v)
 		if size == 0 {
-- 
cgit v1.2.3


From 90ec5961667a1c4a21702e64adb383403af8ad25 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 27 Jan 2020 13:22:50 -0800
Subject: Fix licenses.

The preferred Copyright holder is "The gVisor Authors".

PiperOrigin-RevId: 291786657
---
 pkg/sentry/kernel/fd_table.go              | 2 +-
 pkg/sentry/kernel/fd_table_test.go         | 2 +-
 pkg/sentry/kernel/fd_table_unsafe.go       | 2 +-
 pkg/sentry/platform/ring0/entry_arm64.go   | 2 +-
 pkg/sentry/platform/ring0/kernel_arm64.go  | 2 +-
 pkg/sentry/platform/ring0/lib_arm64.go     | 2 +-
 pkg/sentry/platform/ring0/offsets_arm64.go | 2 +-
 pkg/tcpip/iptables/iptables.go             | 2 +-
 pkg/tcpip/iptables/types.go                | 2 +-
 runsc/cmd/help.go                          | 2 +-
 tools/go_marshal/main.go                   | 2 +-
 11 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index cd1501f85..0ad4135b3 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -1,4 +1,4 @@
-// Copyright 2018 Google LLC
+// Copyright 2018 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index eccb7d1e7..86164df49 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -1,4 +1,4 @@
-// Copyright 2018 Google LLC
+// Copyright 2018 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index e009df974..e9fdb0917 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -1,4 +1,4 @@
-// Copyright 2018 Google LLC
+// Copyright 2018 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.go b/pkg/sentry/platform/ring0/entry_arm64.go
index 0dfa42c36..62a93f3d6 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.go
+++ b/pkg/sentry/platform/ring0/entry_arm64.go
@@ -1,4 +1,4 @@
-// Copyright 2019 Google Inc.
+// Copyright 2019 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index ed82a131e..c3d341998 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -1,4 +1,4 @@
-// Copyright 2019 Google Inc.
+// Copyright 2019 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index 8bcfe1032..af075aae4 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -1,4 +1,4 @@
-// Copyright 2019 Google Inc.
+// Copyright 2019 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index cd2a65f97..8c960c749 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -1,4 +1,4 @@
-// Copyright 2019 Google Inc.
+// Copyright 2019 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index fc06b5b87..4bfb3149e 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -1,4 +1,4 @@
-// Copyright 2019 The gVisor authors.
+// Copyright 2019 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index a8b972f1b..50893cc55 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -1,4 +1,4 @@
-// Copyright 2019 The gVisor authors.
+// Copyright 2019 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go
index ff4f901cb..930e8454f 100644
--- a/runsc/cmd/help.go
+++ b/runsc/cmd/help.go
@@ -1,4 +1,4 @@
-// Copyright 2018 Google LLC
+// Copyright 2018 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/tools/go_marshal/main.go b/tools/go_marshal/main.go
index e1a97b311..f74be5c29 100644
--- a/tools/go_marshal/main.go
+++ b/tools/go_marshal/main.go
@@ -1,4 +1,4 @@
-// Copyright 2019 Google LLC
+// Copyright 2019 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
-- 
cgit v1.2.3


From 0e2f1b7abd219f39d67cc2cecd00c441a13eeb29 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 27 Jan 2020 15:17:58 -0800
Subject: Update package locations.

Because the abi will depend on the core types for marshalling (usermem,
context, safemem, safecopy), these need to be flattened from the sentry
directory. These packages contain no sentry-specific details.

PiperOrigin-RevId: 291811289
---
 pkg/abi/abi.go                                     |   4 +
 pkg/context/BUILD                                  |  13 +
 pkg/context/context.go                             | 141 +++++
 pkg/safecopy/BUILD                                 |  29 +
 pkg/safecopy/LICENSE                               |  27 +
 pkg/safecopy/atomic_amd64.s                        | 136 +++++
 pkg/safecopy/atomic_arm64.s                        | 126 +++++
 pkg/safecopy/memclr_amd64.s                        | 147 +++++
 pkg/safecopy/memclr_arm64.s                        |  74 +++
 pkg/safecopy/memcpy_amd64.s                        | 250 +++++++++
 pkg/safecopy/memcpy_arm64.s                        |  78 +++
 pkg/safecopy/safecopy.go                           | 144 +++++
 pkg/safecopy/safecopy_test.go                      | 617 +++++++++++++++++++++
 pkg/safecopy/safecopy_unsafe.go                    | 335 +++++++++++
 pkg/safecopy/sighandler_amd64.s                    | 133 +++++
 pkg/safecopy/sighandler_arm64.s                    | 143 +++++
 pkg/safemem/BUILD                                  |  27 +
 pkg/safemem/block_unsafe.go                        | 279 ++++++++++
 pkg/safemem/io.go                                  | 392 +++++++++++++
 pkg/safemem/io_test.go                             | 199 +++++++
 pkg/safemem/safemem.go                             |  16 +
 pkg/safemem/seq_test.go                            | 196 +++++++
 pkg/safemem/seq_unsafe.go                          | 299 ++++++++++
 pkg/sentry/arch/BUILD                              |   4 +-
 pkg/sentry/arch/arch.go                            |   2 +-
 pkg/sentry/arch/arch_aarch64.go                    |   2 +-
 pkg/sentry/arch/arch_amd64.go                      |   2 +-
 pkg/sentry/arch/arch_arm64.go                      |   2 +-
 pkg/sentry/arch/arch_state_x86.go                  |   2 +-
 pkg/sentry/arch/arch_x86.go                        |   2 +-
 pkg/sentry/arch/auxv.go                            |   2 +-
 pkg/sentry/arch/signal.go                          |   2 +-
 pkg/sentry/arch/signal_amd64.go                    |   2 +-
 pkg/sentry/arch/signal_arm64.go                    |   2 +-
 pkg/sentry/arch/signal_stack.go                    |   2 +-
 pkg/sentry/arch/stack.go                           |   4 +-
 pkg/sentry/context/BUILD                           |  13 -
 pkg/sentry/context/context.go                      | 141 -----
 pkg/sentry/context/contexttest/BUILD               |  21 -
 pkg/sentry/context/contexttest/contexttest.go      | 188 -------
 pkg/sentry/contexttest/BUILD                       |  21 +
 pkg/sentry/contexttest/contexttest.go              | 188 +++++++
 pkg/sentry/fs/BUILD                                |  12 +-
 pkg/sentry/fs/anon/BUILD                           |   4 +-
 pkg/sentry/fs/anon/anon.go                         |   4 +-
 pkg/sentry/fs/attr.go                              |   2 +-
 pkg/sentry/fs/context.go                           |   2 +-
 pkg/sentry/fs/copy_up.go                           |   4 +-
 pkg/sentry/fs/copy_up_test.go                      |   2 +-
 pkg/sentry/fs/dev/BUILD                            |   6 +-
 pkg/sentry/fs/dev/dev.go                           |   4 +-
 pkg/sentry/fs/dev/fs.go                            |   2 +-
 pkg/sentry/fs/dev/full.go                          |   4 +-
 pkg/sentry/fs/dev/null.go                          |   2 +-
 pkg/sentry/fs/dev/random.go                        |   6 +-
 pkg/sentry/fs/dev/tty.go                           |   2 +-
 pkg/sentry/fs/dirent.go                            |   2 +-
 pkg/sentry/fs/dirent_refs_test.go                  |   4 +-
 pkg/sentry/fs/fdpipe/BUILD                         |  12 +-
 pkg/sentry/fs/fdpipe/pipe.go                       |   6 +-
 pkg/sentry/fs/fdpipe/pipe_opener.go                |   2 +-
 pkg/sentry/fs/fdpipe/pipe_opener_test.go           |   6 +-
 pkg/sentry/fs/fdpipe/pipe_state.go                 |   2 +-
 pkg/sentry/fs/fdpipe/pipe_test.go                  |   4 +-
 pkg/sentry/fs/file.go                              |   4 +-
 pkg/sentry/fs/file_operations.go                   |   4 +-
 pkg/sentry/fs/file_overlay.go                      |   4 +-
 pkg/sentry/fs/file_overlay_test.go                 |   2 +-
 pkg/sentry/fs/filesystems.go                       |   2 +-
 pkg/sentry/fs/filetest/BUILD                       |   6 +-
 pkg/sentry/fs/filetest/filetest.go                 |   6 +-
 pkg/sentry/fs/fs.go                                |   2 +-
 pkg/sentry/fs/fsutil/BUILD                         |  14 +-
 pkg/sentry/fs/fsutil/dirty_set.go                  |   6 +-
 pkg/sentry/fs/fsutil/dirty_set_test.go             |   2 +-
 pkg/sentry/fs/fsutil/file.go                       |   4 +-
 pkg/sentry/fs/fsutil/file_range_set.go             |   6 +-
 pkg/sentry/fs/fsutil/host_file_mapper.go           |   4 +-
 pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go    |   2 +-
 pkg/sentry/fs/fsutil/host_mappable.go              |   6 +-
 pkg/sentry/fs/fsutil/inode.go                      |   2 +-
 pkg/sentry/fs/fsutil/inode_cached.go               |   6 +-
 pkg/sentry/fs/fsutil/inode_cached_test.go          |   8 +-
 pkg/sentry/fs/gofer/BUILD                          |  10 +-
 pkg/sentry/fs/gofer/attr.go                        |   4 +-
 pkg/sentry/fs/gofer/cache_policy.go                |   2 +-
 pkg/sentry/fs/gofer/context_file.go                |   2 +-
 pkg/sentry/fs/gofer/file.go                        |   4 +-
 pkg/sentry/fs/gofer/file_state.go                  |   2 +-
 pkg/sentry/fs/gofer/fs.go                          |   2 +-
 pkg/sentry/fs/gofer/gofer_test.go                  |   4 +-
 pkg/sentry/fs/gofer/handles.go                     |   4 +-
 pkg/sentry/fs/gofer/inode.go                       |   4 +-
 pkg/sentry/fs/gofer/inode_state.go                 |   2 +-
 pkg/sentry/fs/gofer/path.go                        |   2 +-
 pkg/sentry/fs/gofer/session.go                     |   2 +-
 pkg/sentry/fs/gofer/session_state.go               |   2 +-
 pkg/sentry/fs/gofer/socket.go                      |   2 +-
 pkg/sentry/fs/gofer/util.go                        |   2 +-
 pkg/sentry/fs/host/BUILD                           |  12 +-
 pkg/sentry/fs/host/control.go                      |   2 +-
 pkg/sentry/fs/host/file.go                         |   6 +-
 pkg/sentry/fs/host/fs.go                           |   2 +-
 pkg/sentry/fs/host/fs_test.go                      |   4 +-
 pkg/sentry/fs/host/inode.go                        |   4 +-
 pkg/sentry/fs/host/inode_state.go                  |   2 +-
 pkg/sentry/fs/host/inode_test.go                   |   2 +-
 pkg/sentry/fs/host/socket.go                       |   2 +-
 pkg/sentry/fs/host/socket_test.go                  |   4 +-
 pkg/sentry/fs/host/tty.go                          |   4 +-
 pkg/sentry/fs/host/wait_test.go                    |   2 +-
 pkg/sentry/fs/inode.go                             |   2 +-
 pkg/sentry/fs/inode_operations.go                  |   2 +-
 pkg/sentry/fs/inode_overlay.go                     |   2 +-
 pkg/sentry/fs/inode_overlay_test.go                |   2 +-
 pkg/sentry/fs/inotify.go                           |   4 +-
 pkg/sentry/fs/inotify_event.go                     |   4 +-
 pkg/sentry/fs/mock.go                              |   2 +-
 pkg/sentry/fs/mount.go                             |   2 +-
 pkg/sentry/fs/mount_overlay.go                     |   2 +-
 pkg/sentry/fs/mount_test.go                        |   2 +-
 pkg/sentry/fs/mounts.go                            |   2 +-
 pkg/sentry/fs/mounts_test.go                       |   2 +-
 pkg/sentry/fs/offset.go                            |   2 +-
 pkg/sentry/fs/overlay.go                           |   4 +-
 pkg/sentry/fs/proc/BUILD                           |   8 +-
 pkg/sentry/fs/proc/cgroup.go                       |   2 +-
 pkg/sentry/fs/proc/cpuinfo.go                      |   2 +-
 pkg/sentry/fs/proc/exec_args.go                    |   4 +-
 pkg/sentry/fs/proc/fds.go                          |   2 +-
 pkg/sentry/fs/proc/filesystems.go                  |   2 +-
 pkg/sentry/fs/proc/fs.go                           |   2 +-
 pkg/sentry/fs/proc/inode.go                        |   4 +-
 pkg/sentry/fs/proc/loadavg.go                      |   2 +-
 pkg/sentry/fs/proc/meminfo.go                      |   4 +-
 pkg/sentry/fs/proc/mounts.go                       |   2 +-
 pkg/sentry/fs/proc/net.go                          |   4 +-
 pkg/sentry/fs/proc/proc.go                         |   2 +-
 pkg/sentry/fs/proc/seqfile/BUILD                   |  10 +-
 pkg/sentry/fs/proc/seqfile/seqfile.go              |   4 +-
 pkg/sentry/fs/proc/seqfile/seqfile_test.go         |   6 +-
 pkg/sentry/fs/proc/stat.go                         |   2 +-
 pkg/sentry/fs/proc/sys.go                          |   4 +-
 pkg/sentry/fs/proc/sys_net.go                      |   4 +-
 pkg/sentry/fs/proc/sys_net_test.go                 |   4 +-
 pkg/sentry/fs/proc/task.go                         |   4 +-
 pkg/sentry/fs/proc/uid_gid_map.go                  |   4 +-
 pkg/sentry/fs/proc/uptime.go                       |   4 +-
 pkg/sentry/fs/proc/version.go                      |   2 +-
 pkg/sentry/fs/ramfs/BUILD                          |   6 +-
 pkg/sentry/fs/ramfs/dir.go                         |   2 +-
 pkg/sentry/fs/ramfs/socket.go                      |   2 +-
 pkg/sentry/fs/ramfs/symlink.go                     |   2 +-
 pkg/sentry/fs/ramfs/tree.go                        |   4 +-
 pkg/sentry/fs/ramfs/tree_test.go                   |   2 +-
 pkg/sentry/fs/splice.go                            |   2 +-
 pkg/sentry/fs/sys/BUILD                            |   4 +-
 pkg/sentry/fs/sys/devices.go                       |   2 +-
 pkg/sentry/fs/sys/fs.go                            |   2 +-
 pkg/sentry/fs/sys/sys.go                           |   4 +-
 pkg/sentry/fs/timerfd/BUILD                        |   4 +-
 pkg/sentry/fs/timerfd/timerfd.go                   |   4 +-
 pkg/sentry/fs/tmpfs/BUILD                          |  10 +-
 pkg/sentry/fs/tmpfs/file_regular.go                |   4 +-
 pkg/sentry/fs/tmpfs/file_test.go                   |   4 +-
 pkg/sentry/fs/tmpfs/fs.go                          |   2 +-
 pkg/sentry/fs/tmpfs/inode_file.go                  |   6 +-
 pkg/sentry/fs/tmpfs/tmpfs.go                       |   4 +-
 pkg/sentry/fs/tty/BUILD                            |  10 +-
 pkg/sentry/fs/tty/dir.go                           |   4 +-
 pkg/sentry/fs/tty/fs.go                            |   2 +-
 pkg/sentry/fs/tty/line_discipline.go               |   4 +-
 pkg/sentry/fs/tty/master.go                        |   4 +-
 pkg/sentry/fs/tty/queue.go                         |   6 +-
 pkg/sentry/fs/tty/slave.go                         |   4 +-
 pkg/sentry/fs/tty/terminal.go                      |   4 +-
 pkg/sentry/fs/tty/tty_test.go                      |   4 +-
 pkg/sentry/fsimpl/ext/BUILD                        |  12 +-
 pkg/sentry/fsimpl/ext/benchmark/BUILD              |   4 +-
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go  |   4 +-
 pkg/sentry/fsimpl/ext/directory.go                 |   2 +-
 pkg/sentry/fsimpl/ext/ext.go                       |   2 +-
 pkg/sentry/fsimpl/ext/ext_test.go                  |   6 +-
 pkg/sentry/fsimpl/ext/file_description.go          |   2 +-
 pkg/sentry/fsimpl/ext/filesystem.go                |   2 +-
 pkg/sentry/fsimpl/ext/regular_file.go              |   6 +-
 pkg/sentry/fsimpl/ext/symlink.go                   |   4 +-
 pkg/sentry/fsimpl/kernfs/BUILD                     |  10 +-
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go     |   4 +-
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go           |   4 +-
 pkg/sentry/fsimpl/kernfs/filesystem.go             |   2 +-
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go        |   2 +-
 pkg/sentry/fsimpl/kernfs/kernfs.go                 |   2 +-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go            |   6 +-
 pkg/sentry/fsimpl/kernfs/symlink.go                |   2 +-
 pkg/sentry/fsimpl/proc/BUILD                       |  12 +-
 pkg/sentry/fsimpl/proc/filesystem.go               |   2 +-
 pkg/sentry/fsimpl/proc/subtasks.go                 |   2 +-
 pkg/sentry/fsimpl/proc/task.go                     |   2 +-
 pkg/sentry/fsimpl/proc/task_files.go               |   6 +-
 pkg/sentry/fsimpl/proc/tasks.go                    |   2 +-
 pkg/sentry/fsimpl/proc/tasks_files.go              |   4 +-
 pkg/sentry/fsimpl/proc/tasks_net.go                |   4 +-
 pkg/sentry/fsimpl/proc/tasks_sys.go                |   2 +-
 pkg/sentry/fsimpl/proc/tasks_sys_test.go           |   2 +-
 pkg/sentry/fsimpl/proc/tasks_test.go               |   4 +-
 pkg/sentry/fsimpl/sys/BUILD                        |   2 +-
 pkg/sentry/fsimpl/sys/sys.go                       |   2 +-
 pkg/sentry/fsimpl/testutil/BUILD                   |   4 +-
 pkg/sentry/fsimpl/testutil/kernel.go               |   2 +-
 pkg/sentry/fsimpl/testutil/testutil.go             |   4 +-
 pkg/sentry/fsimpl/tmpfs/BUILD                      |  16 +-
 pkg/sentry/fsimpl/tmpfs/benchmark_test.go          |   4 +-
 pkg/sentry/fsimpl/tmpfs/directory.go               |   2 +-
 pkg/sentry/fsimpl/tmpfs/filesystem.go              |   2 +-
 pkg/sentry/fsimpl/tmpfs/named_pipe.go              |   4 +-
 pkg/sentry/fsimpl/tmpfs/pipe_test.go               |   6 +-
 pkg/sentry/fsimpl/tmpfs/regular_file.go            |   6 +-
 pkg/sentry/fsimpl/tmpfs/regular_file_test.go       |   4 +-
 pkg/sentry/fsimpl/tmpfs/tmpfs.go                   |   2 +-
 pkg/sentry/hostmm/BUILD                            |   2 +-
 pkg/sentry/hostmm/hostmm.go                        |   2 +-
 pkg/sentry/inet/BUILD                              |   2 +-
 pkg/sentry/inet/context.go                         |   2 +-
 pkg/sentry/kernel/BUILD                            |  12 +-
 pkg/sentry/kernel/auth/BUILD                       |   2 +-
 pkg/sentry/kernel/auth/context.go                  |   2 +-
 pkg/sentry/kernel/auth/id_map.go                   |   2 +-
 pkg/sentry/kernel/context.go                       |   2 +-
 pkg/sentry/kernel/contexttest/BUILD                |   4 +-
 pkg/sentry/kernel/contexttest/contexttest.go       |   4 +-
 pkg/sentry/kernel/epoll/BUILD                      |   6 +-
 pkg/sentry/kernel/epoll/epoll.go                   |   4 +-
 pkg/sentry/kernel/epoll/epoll_test.go              |   2 +-
 pkg/sentry/kernel/eventfd/BUILD                    |   8 +-
 pkg/sentry/kernel/eventfd/eventfd.go               |   4 +-
 pkg/sentry/kernel/eventfd/eventfd_test.go          |   4 +-
 pkg/sentry/kernel/fd_table.go                      |   2 +-
 pkg/sentry/kernel/fd_table_test.go                 |   4 +-
 pkg/sentry/kernel/futex/BUILD                      |   6 +-
 pkg/sentry/kernel/futex/futex.go                   |   2 +-
 pkg/sentry/kernel/futex/futex_test.go              |   2 +-
 pkg/sentry/kernel/kernel.go                        |   2 +-
 pkg/sentry/kernel/pipe/BUILD                       |  12 +-
 pkg/sentry/kernel/pipe/buffer.go                   |   2 +-
 pkg/sentry/kernel/pipe/buffer_test.go              |   2 +-
 pkg/sentry/kernel/pipe/node.go                     |   2 +-
 pkg/sentry/kernel/pipe/node_test.go                |   6 +-
 pkg/sentry/kernel/pipe/pipe.go                     |   2 +-
 pkg/sentry/kernel/pipe/pipe_test.go                |   4 +-
 pkg/sentry/kernel/pipe/pipe_util.go                |   4 +-
 pkg/sentry/kernel/pipe/reader_writer.go            |   4 +-
 pkg/sentry/kernel/pipe/vfs.go                      |   4 +-
 pkg/sentry/kernel/ptrace.go                        |   2 +-
 pkg/sentry/kernel/ptrace_amd64.go                  |   2 +-
 pkg/sentry/kernel/ptrace_arm64.go                  |   2 +-
 pkg/sentry/kernel/rseq.go                          |   2 +-
 pkg/sentry/kernel/seccomp.go                       |   2 +-
 pkg/sentry/kernel/semaphore/BUILD                  |   6 +-
 pkg/sentry/kernel/semaphore/semaphore.go           |   2 +-
 pkg/sentry/kernel/semaphore/semaphore_test.go      |   4 +-
 pkg/sentry/kernel/shm/BUILD                        |   4 +-
 pkg/sentry/kernel/shm/shm.go                       |   4 +-
 pkg/sentry/kernel/signalfd/BUILD                   |   4 +-
 pkg/sentry/kernel/signalfd/signalfd.go             |   4 +-
 pkg/sentry/kernel/syscalls.go                      |   2 +-
 pkg/sentry/kernel/task.go                          |   4 +-
 pkg/sentry/kernel/task_clone.go                    |   2 +-
 pkg/sentry/kernel/task_context.go                  |   4 +-
 pkg/sentry/kernel/task_futex.go                    |   2 +-
 pkg/sentry/kernel/task_log.go                      |   2 +-
 pkg/sentry/kernel/task_run.go                      |   2 +-
 pkg/sentry/kernel/task_signals.go                  |   2 +-
 pkg/sentry/kernel/task_start.go                    |   2 +-
 pkg/sentry/kernel/task_syscall.go                  |   2 +-
 pkg/sentry/kernel/task_usermem.go                  |   2 +-
 pkg/sentry/kernel/time/BUILD                       |   2 +-
 pkg/sentry/kernel/time/context.go                  |   2 +-
 pkg/sentry/kernel/timekeeper_test.go               |   4 +-
 pkg/sentry/kernel/vdso.go                          |   4 +-
 pkg/sentry/limits/BUILD                            |   2 +-
 pkg/sentry/limits/context.go                       |   2 +-
 pkg/sentry/loader/BUILD                            |   6 +-
 pkg/sentry/loader/elf.go                           |   4 +-
 pkg/sentry/loader/interpreter.go                   |   4 +-
 pkg/sentry/loader/loader.go                        |   4 +-
 pkg/sentry/loader/vdso.go                          |   6 +-
 pkg/sentry/memmap/BUILD                            |   6 +-
 pkg/sentry/memmap/mapping_set.go                   |   2 +-
 pkg/sentry/memmap/mapping_set_test.go              |   2 +-
 pkg/sentry/memmap/memmap.go                        |   4 +-
 pkg/sentry/mm/BUILD                                |  18 +-
 pkg/sentry/mm/address_space.go                     |   2 +-
 pkg/sentry/mm/aio_context.go                       |   4 +-
 pkg/sentry/mm/debug.go                             |   2 +-
 pkg/sentry/mm/io.go                                |   6 +-
 pkg/sentry/mm/lifecycle.go                         |   4 +-
 pkg/sentry/mm/metadata.go                          |   2 +-
 pkg/sentry/mm/mm.go                                |   4 +-
 pkg/sentry/mm/mm_test.go                           |   6 +-
 pkg/sentry/mm/pma.go                               |   8 +-
 pkg/sentry/mm/procfs.go                            |   4 +-
 pkg/sentry/mm/save_restore.go                      |   2 +-
 pkg/sentry/mm/shm.go                               |   4 +-
 pkg/sentry/mm/special_mappable.go                  |   4 +-
 pkg/sentry/mm/syscalls.go                          |   4 +-
 pkg/sentry/mm/vma.go                               |   4 +-
 pkg/sentry/pgalloc/BUILD                           |   8 +-
 pkg/sentry/pgalloc/context.go                      |   2 +-
 pkg/sentry/pgalloc/pgalloc.go                      |   6 +-
 pkg/sentry/pgalloc/pgalloc_test.go                 |   2 +-
 pkg/sentry/pgalloc/save_restore.go                 |   2 +-
 pkg/sentry/platform/BUILD                          |   8 +-
 pkg/sentry/platform/context.go                     |   2 +-
 pkg/sentry/platform/kvm/BUILD                      |   6 +-
 pkg/sentry/platform/kvm/address_space.go           |   2 +-
 pkg/sentry/platform/kvm/bluepill.go                |   2 +-
 pkg/sentry/platform/kvm/bluepill_fault.go          |   2 +-
 pkg/sentry/platform/kvm/context.go                 |   2 +-
 pkg/sentry/platform/kvm/kvm.go                     |   2 +-
 pkg/sentry/platform/kvm/kvm_test.go                |   2 +-
 pkg/sentry/platform/kvm/machine.go                 |   2 +-
 pkg/sentry/platform/kvm/machine_amd64.go           |   2 +-
 pkg/sentry/platform/kvm/machine_arm64.go           |   2 +-
 pkg/sentry/platform/kvm/machine_arm64_unsafe.go    |   2 +-
 pkg/sentry/platform/kvm/physical_map.go            |   2 +-
 pkg/sentry/platform/kvm/virtual_map.go             |   2 +-
 pkg/sentry/platform/kvm/virtual_map_test.go        |   2 +-
 pkg/sentry/platform/mmap_min_addr.go               |   2 +-
 pkg/sentry/platform/platform.go                    |   4 +-
 pkg/sentry/platform/ptrace/BUILD                   |   4 +-
 pkg/sentry/platform/ptrace/ptrace.go               |   2 +-
 pkg/sentry/platform/ptrace/ptrace_unsafe.go        |   2 +-
 pkg/sentry/platform/ptrace/stub_unsafe.go          |   4 +-
 pkg/sentry/platform/ptrace/subprocess.go           |   2 +-
 pkg/sentry/platform/ring0/BUILD                    |   2 +-
 pkg/sentry/platform/ring0/defs_amd64.go            |   2 +-
 pkg/sentry/platform/ring0/defs_arm64.go            |   2 +-
 pkg/sentry/platform/ring0/gen_offsets/BUILD        |   2 +-
 pkg/sentry/platform/ring0/pagetables/BUILD         |   4 +-
 .../platform/ring0/pagetables/allocator_unsafe.go  |   2 +-
 pkg/sentry/platform/ring0/pagetables/pagetables.go |   2 +-
 .../ring0/pagetables/pagetables_aarch64.go         |   2 +-
 .../ring0/pagetables/pagetables_amd64_test.go      |   2 +-
 .../ring0/pagetables/pagetables_arm64_test.go      |   2 +-
 .../platform/ring0/pagetables/pagetables_test.go   |   2 +-
 .../platform/ring0/pagetables/pagetables_x86.go    |   2 +-
 pkg/sentry/platform/safecopy/BUILD                 |  29 -
 pkg/sentry/platform/safecopy/LICENSE               |  27 -
 pkg/sentry/platform/safecopy/atomic_amd64.s        | 136 -----
 pkg/sentry/platform/safecopy/atomic_arm64.s        | 126 -----
 pkg/sentry/platform/safecopy/memclr_amd64.s        | 147 -----
 pkg/sentry/platform/safecopy/memclr_arm64.s        |  74 ---
 pkg/sentry/platform/safecopy/memcpy_amd64.s        | 250 ---------
 pkg/sentry/platform/safecopy/memcpy_arm64.s        |  78 ---
 pkg/sentry/platform/safecopy/safecopy.go           | 144 -----
 pkg/sentry/platform/safecopy/safecopy_test.go      | 617 ---------------------
 pkg/sentry/platform/safecopy/safecopy_unsafe.go    | 335 -----------
 pkg/sentry/platform/safecopy/sighandler_amd64.s    | 133 -----
 pkg/sentry/platform/safecopy/sighandler_arm64.s    | 143 -----
 pkg/sentry/safemem/BUILD                           |  27 -
 pkg/sentry/safemem/block_unsafe.go                 | 279 ----------
 pkg/sentry/safemem/io.go                           | 392 -------------
 pkg/sentry/safemem/io_test.go                      | 199 -------
 pkg/sentry/safemem/safemem.go                      |  16 -
 pkg/sentry/safemem/seq_test.go                     | 196 -------
 pkg/sentry/safemem/seq_unsafe.go                   | 299 ----------
 pkg/sentry/socket/BUILD                            |   4 +-
 pkg/sentry/socket/control/BUILD                    |   4 +-
 pkg/sentry/socket/control/control.go               |   4 +-
 pkg/sentry/socket/hostinet/BUILD                   |   6 +-
 pkg/sentry/socket/hostinet/socket.go               |   6 +-
 pkg/sentry/socket/hostinet/socket_unsafe.go        |   4 +-
 pkg/sentry/socket/hostinet/stack.go                |   4 +-
 pkg/sentry/socket/netfilter/BUILD                  |   2 +-
 pkg/sentry/socket/netfilter/netfilter.go           |   2 +-
 pkg/sentry/socket/netlink/BUILD                    |   4 +-
 pkg/sentry/socket/netlink/message.go               |   2 +-
 pkg/sentry/socket/netlink/provider.go              |   2 +-
 pkg/sentry/socket/netlink/route/BUILD              |   2 +-
 pkg/sentry/socket/netlink/route/protocol.go        |   2 +-
 pkg/sentry/socket/netlink/socket.go                |   4 +-
 pkg/sentry/socket/netlink/uevent/BUILD             |   2 +-
 pkg/sentry/socket/netlink/uevent/protocol.go       |   2 +-
 pkg/sentry/socket/netstack/BUILD                   |   6 +-
 pkg/sentry/socket/netstack/netstack.go             |   6 +-
 pkg/sentry/socket/netstack/provider.go             |   2 +-
 pkg/sentry/socket/socket.go                        |   4 +-
 pkg/sentry/socket/unix/BUILD                       |   6 +-
 pkg/sentry/socket/unix/io.go                       |   4 +-
 pkg/sentry/socket/unix/transport/BUILD             |   2 +-
 pkg/sentry/socket/unix/transport/connectioned.go   |   2 +-
 pkg/sentry/socket/unix/transport/connectionless.go |   2 +-
 pkg/sentry/socket/unix/transport/unix.go           |   2 +-
 pkg/sentry/socket/unix/unix.go                     |   4 +-
 pkg/sentry/strace/BUILD                            |   2 +-
 pkg/sentry/strace/poll.go                          |   2 +-
 pkg/sentry/strace/select.go                        |   2 +-
 pkg/sentry/strace/signal.go                        |   2 +-
 pkg/sentry/strace/socket.go                        |   2 +-
 pkg/sentry/strace/strace.go                        |   2 +-
 pkg/sentry/syscalls/linux/BUILD                    |   6 +-
 pkg/sentry/syscalls/linux/linux64_amd64.go         |   2 +-
 pkg/sentry/syscalls/linux/linux64_arm64.go         |   2 +-
 pkg/sentry/syscalls/linux/sigset.go                |   2 +-
 pkg/sentry/syscalls/linux/sys_aio.go               |   2 +-
 pkg/sentry/syscalls/linux/sys_epoll.go             |   2 +-
 pkg/sentry/syscalls/linux/sys_file.go              |   4 +-
 pkg/sentry/syscalls/linux/sys_futex.go             |   2 +-
 pkg/sentry/syscalls/linux/sys_getdents.go          |   2 +-
 pkg/sentry/syscalls/linux/sys_mempolicy.go         |   2 +-
 pkg/sentry/syscalls/linux/sys_mmap.go              |   2 +-
 pkg/sentry/syscalls/linux/sys_mount.go             |   2 +-
 pkg/sentry/syscalls/linux/sys_pipe.go              |   2 +-
 pkg/sentry/syscalls/linux/sys_poll.go              |   2 +-
 pkg/sentry/syscalls/linux/sys_random.go            |   4 +-
 pkg/sentry/syscalls/linux/sys_read.go              |   2 +-
 pkg/sentry/syscalls/linux/sys_rlimit.go            |   2 +-
 pkg/sentry/syscalls/linux/sys_seccomp.go           |   2 +-
 pkg/sentry/syscalls/linux/sys_sem.go               |   2 +-
 pkg/sentry/syscalls/linux/sys_signal.go            |   2 +-
 pkg/sentry/syscalls/linux/sys_socket.go            |   2 +-
 pkg/sentry/syscalls/linux/sys_stat.go              |   2 +-
 pkg/sentry/syscalls/linux/sys_stat_amd64.go        |   2 +-
 pkg/sentry/syscalls/linux/sys_stat_arm64.go        |   2 +-
 pkg/sentry/syscalls/linux/sys_thread.go            |   2 +-
 pkg/sentry/syscalls/linux/sys_time.go              |   2 +-
 pkg/sentry/syscalls/linux/sys_timer.go             |   2 +-
 pkg/sentry/syscalls/linux/sys_write.go             |   2 +-
 pkg/sentry/syscalls/linux/sys_xattr.go             |   2 +-
 pkg/sentry/syscalls/linux/timespec.go              |   2 +-
 pkg/sentry/unimpl/BUILD                            |   2 +-
 pkg/sentry/unimpl/events.go                        |   2 +-
 pkg/sentry/uniqueid/BUILD                          |   2 +-
 pkg/sentry/uniqueid/context.go                     |   2 +-
 pkg/sentry/usermem/BUILD                           |  55 --
 pkg/sentry/usermem/README.md                       |  31 --
 pkg/sentry/usermem/access_type.go                  | 128 -----
 pkg/sentry/usermem/addr.go                         | 108 ----
 pkg/sentry/usermem/addr_range_seq_test.go          | 197 -------
 pkg/sentry/usermem/addr_range_seq_unsafe.go        | 277 ---------
 pkg/sentry/usermem/bytes_io.go                     | 141 -----
 pkg/sentry/usermem/bytes_io_unsafe.go              |  47 --
 pkg/sentry/usermem/usermem.go                      | 597 --------------------
 pkg/sentry/usermem/usermem_arm64.go                |  53 --
 pkg/sentry/usermem/usermem_test.go                 | 424 --------------
 pkg/sentry/usermem/usermem_unsafe.go               |  27 -
 pkg/sentry/usermem/usermem_x86.go                  |  38 --
 pkg/sentry/vfs/BUILD                               |  10 +-
 pkg/sentry/vfs/context.go                          |   2 +-
 pkg/sentry/vfs/device.go                           |   2 +-
 pkg/sentry/vfs/file_description.go                 |   4 +-
 pkg/sentry/vfs/file_description_impl_util.go       |   4 +-
 pkg/sentry/vfs/file_description_impl_util_test.go  |   6 +-
 pkg/sentry/vfs/filesystem.go                       |   2 +-
 pkg/sentry/vfs/filesystem_type.go                  |   2 +-
 pkg/sentry/vfs/mount.go                            |   2 +-
 pkg/sentry/vfs/pathname.go                         |   2 +-
 pkg/sentry/vfs/testutil.go                         |   2 +-
 pkg/sentry/vfs/vfs.go                              |   2 +-
 pkg/usermem/BUILD                                  |  55 ++
 pkg/usermem/README.md                              |  31 ++
 pkg/usermem/access_type.go                         | 128 +++++
 pkg/usermem/addr.go                                | 108 ++++
 pkg/usermem/addr_range_seq_test.go                 | 197 +++++++
 pkg/usermem/addr_range_seq_unsafe.go               | 277 +++++++++
 pkg/usermem/bytes_io.go                            | 141 +++++
 pkg/usermem/bytes_io_unsafe.go                     |  47 ++
 pkg/usermem/usermem.go                             | 597 ++++++++++++++++++++
 pkg/usermem/usermem_arm64.go                       |  53 ++
 pkg/usermem/usermem_test.go                        | 424 ++++++++++++++
 pkg/usermem/usermem_unsafe.go                      |  27 +
 pkg/usermem/usermem_x86.go                         |  38 ++
 runsc/boot/BUILD                                   |   6 +-
 runsc/boot/fds.go                                  |   2 +-
 runsc/boot/fs.go                                   |   2 +-
 runsc/boot/loader_test.go                          |   2 +-
 runsc/boot/user.go                                 |   4 +-
 runsc/boot/user_test.go                            |   2 +-
 tools/go_marshal/defs.bzl                          |   4 +-
 tools/go_marshal/gomarshal/generator.go            |   4 +-
 tools/go_marshal/test/BUILD                        |   2 +-
 tools/go_marshal/test/benchmark_test.go            |   2 +-
 483 files changed, 6839 insertions(+), 6835 deletions(-)
 create mode 100644 pkg/context/BUILD
 create mode 100644 pkg/context/context.go
 create mode 100644 pkg/safecopy/BUILD
 create mode 100644 pkg/safecopy/LICENSE
 create mode 100644 pkg/safecopy/atomic_amd64.s
 create mode 100644 pkg/safecopy/atomic_arm64.s
 create mode 100644 pkg/safecopy/memclr_amd64.s
 create mode 100644 pkg/safecopy/memclr_arm64.s
 create mode 100644 pkg/safecopy/memcpy_amd64.s
 create mode 100644 pkg/safecopy/memcpy_arm64.s
 create mode 100644 pkg/safecopy/safecopy.go
 create mode 100644 pkg/safecopy/safecopy_test.go
 create mode 100644 pkg/safecopy/safecopy_unsafe.go
 create mode 100644 pkg/safecopy/sighandler_amd64.s
 create mode 100644 pkg/safecopy/sighandler_arm64.s
 create mode 100644 pkg/safemem/BUILD
 create mode 100644 pkg/safemem/block_unsafe.go
 create mode 100644 pkg/safemem/io.go
 create mode 100644 pkg/safemem/io_test.go
 create mode 100644 pkg/safemem/safemem.go
 create mode 100644 pkg/safemem/seq_test.go
 create mode 100644 pkg/safemem/seq_unsafe.go
 delete mode 100644 pkg/sentry/context/BUILD
 delete mode 100644 pkg/sentry/context/context.go
 delete mode 100644 pkg/sentry/context/contexttest/BUILD
 delete mode 100644 pkg/sentry/context/contexttest/contexttest.go
 create mode 100644 pkg/sentry/contexttest/BUILD
 create mode 100644 pkg/sentry/contexttest/contexttest.go
 delete mode 100644 pkg/sentry/platform/safecopy/BUILD
 delete mode 100644 pkg/sentry/platform/safecopy/LICENSE
 delete mode 100644 pkg/sentry/platform/safecopy/atomic_amd64.s
 delete mode 100644 pkg/sentry/platform/safecopy/atomic_arm64.s
 delete mode 100644 pkg/sentry/platform/safecopy/memclr_amd64.s
 delete mode 100644 pkg/sentry/platform/safecopy/memclr_arm64.s
 delete mode 100644 pkg/sentry/platform/safecopy/memcpy_amd64.s
 delete mode 100644 pkg/sentry/platform/safecopy/memcpy_arm64.s
 delete mode 100644 pkg/sentry/platform/safecopy/safecopy.go
 delete mode 100644 pkg/sentry/platform/safecopy/safecopy_test.go
 delete mode 100644 pkg/sentry/platform/safecopy/safecopy_unsafe.go
 delete mode 100644 pkg/sentry/platform/safecopy/sighandler_amd64.s
 delete mode 100644 pkg/sentry/platform/safecopy/sighandler_arm64.s
 delete mode 100644 pkg/sentry/safemem/BUILD
 delete mode 100644 pkg/sentry/safemem/block_unsafe.go
 delete mode 100644 pkg/sentry/safemem/io.go
 delete mode 100644 pkg/sentry/safemem/io_test.go
 delete mode 100644 pkg/sentry/safemem/safemem.go
 delete mode 100644 pkg/sentry/safemem/seq_test.go
 delete mode 100644 pkg/sentry/safemem/seq_unsafe.go
 delete mode 100644 pkg/sentry/usermem/BUILD
 delete mode 100644 pkg/sentry/usermem/README.md
 delete mode 100644 pkg/sentry/usermem/access_type.go
 delete mode 100644 pkg/sentry/usermem/addr.go
 delete mode 100644 pkg/sentry/usermem/addr_range_seq_test.go
 delete mode 100644 pkg/sentry/usermem/addr_range_seq_unsafe.go
 delete mode 100644 pkg/sentry/usermem/bytes_io.go
 delete mode 100644 pkg/sentry/usermem/bytes_io_unsafe.go
 delete mode 100644 pkg/sentry/usermem/usermem.go
 delete mode 100644 pkg/sentry/usermem/usermem_arm64.go
 delete mode 100644 pkg/sentry/usermem/usermem_test.go
 delete mode 100644 pkg/sentry/usermem/usermem_unsafe.go
 delete mode 100644 pkg/sentry/usermem/usermem_x86.go
 create mode 100644 pkg/usermem/BUILD
 create mode 100644 pkg/usermem/README.md
 create mode 100644 pkg/usermem/access_type.go
 create mode 100644 pkg/usermem/addr.go
 create mode 100644 pkg/usermem/addr_range_seq_test.go
 create mode 100644 pkg/usermem/addr_range_seq_unsafe.go
 create mode 100644 pkg/usermem/bytes_io.go
 create mode 100644 pkg/usermem/bytes_io_unsafe.go
 create mode 100644 pkg/usermem/usermem.go
 create mode 100644 pkg/usermem/usermem_arm64.go
 create mode 100644 pkg/usermem/usermem_test.go
 create mode 100644 pkg/usermem/usermem_unsafe.go
 create mode 100644 pkg/usermem/usermem_x86.go

diff --git a/pkg/abi/abi.go b/pkg/abi/abi.go
index d56c481c9..e6be93c3a 100644
--- a/pkg/abi/abi.go
+++ b/pkg/abi/abi.go
@@ -39,3 +39,7 @@ func (o OS) String() string {
 		return fmt.Sprintf("OS(%d)", o)
 	}
 }
+
+// ABI is an interface that defines OS-specific interactions.
+type ABI interface {
+}
diff --git a/pkg/context/BUILD b/pkg/context/BUILD
new file mode 100644
index 000000000..239f31149
--- /dev/null
+++ b/pkg/context/BUILD
@@ -0,0 +1,13 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "context",
+    srcs = ["context.go"],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/amutex",
+        "//pkg/log",
+    ],
+)
diff --git a/pkg/context/context.go b/pkg/context/context.go
new file mode 100644
index 000000000..23e009ef3
--- /dev/null
+++ b/pkg/context/context.go
@@ -0,0 +1,141 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package context defines an internal context type.
+//
+// The given Context conforms to the standard Go context, but mandates
+// additional methods that are specific to the kernel internals. Note however,
+// that the Context described by this package carries additional constraints
+// regarding concurrent access and retaining beyond the scope of a call.
+//
+// See the Context type for complete details.
+package context
+
+import (
+	"context"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/amutex"
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+type contextID int
+
+// Globally accessible values from a context. These keys are defined in the
+// context package to resolve dependency cycles by not requiring the caller to
+// import packages usually required to get these information.
+const (
+	// CtxThreadGroupID is the current thread group ID when a context represents
+	// a task context. The value is represented as an int32.
+	CtxThreadGroupID contextID = iota
+)
+
+// ThreadGroupIDFromContext returns the current thread group ID when ctx
+// represents a task context.
+func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) {
+	if tgid := ctx.Value(CtxThreadGroupID); tgid != nil {
+		return tgid.(int32), true
+	}
+	return 0, false
+}
+
+// A Context represents a thread of execution (hereafter "goroutine" to reflect
+// Go idiosyncrasy). It carries state associated with the goroutine across API
+// boundaries.
+//
+// While Context exists for essentially the same reasons as Go's standard
+// context.Context, the standard type represents the state of an operation
+// rather than that of a goroutine. This is a critical distinction:
+//
+// - Unlike context.Context, which "may be passed to functions running in
+// different goroutines", it is *not safe* to use the same Context in multiple
+// concurrent goroutines.
+//
+// - It is *not safe* to retain a Context passed to a function beyond the scope
+// of that function call.
+//
+// In both cases, values extracted from the Context should be used instead.
+type Context interface {
+	log.Logger
+	amutex.Sleeper
+	context.Context
+
+	// UninterruptibleSleepStart indicates the beginning of an uninterruptible
+	// sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). If deactivate
+	// is true and the Context represents a Task, the Task's AddressSpace is
+	// deactivated.
+	UninterruptibleSleepStart(deactivate bool)
+
+	// UninterruptibleSleepFinish indicates the end of an uninterruptible sleep
+	// state that was begun by a previous call to UninterruptibleSleepStart. If
+	// activate is true and the Context represents a Task, the Task's
+	// AddressSpace is activated. Normally activate is the same value as the
+	// deactivate parameter passed to UninterruptibleSleepStart.
+	UninterruptibleSleepFinish(activate bool)
+}
+
+// NoopSleeper is a noop implementation of amutex.Sleeper and UninterruptibleSleep
+// methods for anonymous embedding in other types that do not implement sleeps.
+type NoopSleeper struct {
+	amutex.NoopSleeper
+}
+
+// UninterruptibleSleepStart does nothing.
+func (NoopSleeper) UninterruptibleSleepStart(bool) {}
+
+// UninterruptibleSleepFinish does nothing.
+func (NoopSleeper) UninterruptibleSleepFinish(bool) {}
+
+// Deadline returns zero values, meaning no deadline.
+func (NoopSleeper) Deadline() (time.Time, bool) {
+	return time.Time{}, false
+}
+
+// Done returns nil.
+func (NoopSleeper) Done() <-chan struct{} {
+	return nil
+}
+
+// Err returns nil.
+func (NoopSleeper) Err() error {
+	return nil
+}
+
+// logContext implements basic logging.
+type logContext struct {
+	log.Logger
+	NoopSleeper
+}
+
+// Value implements Context.Value.
+func (logContext) Value(key interface{}) interface{} {
+	return nil
+}
+
+// bgContext is the context returned by context.Background.
+var bgContext = &logContext{Logger: log.Log()}
+
+// Background returns an empty context using the default logger.
+//
+// Users should be wary of using a Background context. Please tag any use with
+// FIXME(b/38173783) and a note to remove this use.
+//
+// Generally, one should use the Task as their context when available, or avoid
+// having to use a context in places where a Task is unavailable.
+//
+// Using a Background context for tests is fine, as long as no values are
+// needed from the context in the tested code paths.
+func Background() Context {
+	return bgContext
+}
diff --git a/pkg/safecopy/BUILD b/pkg/safecopy/BUILD
new file mode 100644
index 000000000..426ef30c9
--- /dev/null
+++ b/pkg/safecopy/BUILD
@@ -0,0 +1,29 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "safecopy",
+    srcs = [
+        "atomic_amd64.s",
+        "atomic_arm64.s",
+        "memclr_amd64.s",
+        "memclr_arm64.s",
+        "memcpy_amd64.s",
+        "memcpy_arm64.s",
+        "safecopy.go",
+        "safecopy_unsafe.go",
+        "sighandler_amd64.s",
+        "sighandler_arm64.s",
+    ],
+    visibility = ["//:sandbox"],
+    deps = ["//pkg/syserror"],
+)
+
+go_test(
+    name = "safecopy_test",
+    srcs = [
+        "safecopy_test.go",
+    ],
+    library = ":safecopy",
+)
diff --git a/pkg/safecopy/LICENSE b/pkg/safecopy/LICENSE
new file mode 100644
index 000000000..6a66aea5e
--- /dev/null
+++ b/pkg/safecopy/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/pkg/safecopy/atomic_amd64.s b/pkg/safecopy/atomic_amd64.s
new file mode 100644
index 000000000..a0cd78f33
--- /dev/null
+++ b/pkg/safecopy/atomic_amd64.s
@@ -0,0 +1,136 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// handleSwapUint32Fault returns the value stored in DI. Control is transferred
+// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in DI.
+//
+// It must have the same frame configuration as swapUint32 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24
+  MOVL DI, sig+20(FP)
+  RET
+
+// swapUint32 atomically stores new into *addr and returns (the previous *addr
+// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
+// value of old is unspecified, and sig is the number of the signal that was
+// received.
+//
+// Preconditions: addr must be aligned to a 4-byte boundary.
+//
+//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32)
+TEXT ·swapUint32(SB), NOSPLIT, $0-24
+  // Store 0 as the returned signal number. If we run to completion,
+  // this is the value the caller will see; if a signal is received,
+  // handleSwapUint32Fault will store a different value in this address.
+  MOVL $0, sig+20(FP)
+
+  MOVQ addr+0(FP), DI
+  MOVL new+8(FP), AX
+  XCHGL AX, 0(DI)
+  MOVL AX, old+16(FP)
+  RET
+
+// handleSwapUint64Fault returns the value stored in DI. Control is transferred
+// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in DI.
+//
+// It must have the same frame configuration as swapUint64 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28
+  MOVL DI, sig+24(FP)
+  RET
+
+// swapUint64 atomically stores new into *addr and returns (the previous *addr
+// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
+// value of old is unspecified, and sig is the number of the signal that was
+// received.
+//
+// Preconditions: addr must be aligned to a 8-byte boundary.
+//
+//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32)
+TEXT ·swapUint64(SB), NOSPLIT, $0-28
+  // Store 0 as the returned signal number. If we run to completion,
+  // this is the value the caller will see; if a signal is received,
+  // handleSwapUint64Fault will store a different value in this address.
+  MOVL $0, sig+24(FP)
+
+  MOVQ addr+0(FP), DI
+  MOVQ new+8(FP), AX
+  XCHGQ AX, 0(DI)
+  MOVQ AX, old+16(FP)
+  RET
+
+// handleCompareAndSwapUint32Fault returns the value stored in DI. Control is
+// transferred to it when swapUint64 below receives SIGSEGV or SIGBUS, with the
+// signal number stored in DI.
+//
+// It must have the same frame configuration as compareAndSwapUint32 so that it
+// can undo any potential call frame set up by the assembler.
+TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24
+  MOVL DI, sig+20(FP)
+  RET
+
+// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
+// (the value previously stored at addr, 0). If a SIGSEGV or SIGBUS signal is
+// received during the operation, the value of prev is unspecified, and sig is
+// the number of the signal that was received.
+//
+// Preconditions: addr must be aligned to a 4-byte boundary.
+//
+//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32)
+TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24
+  // Store 0 as the returned signal number. If we run to completion, this is
+  // the value the caller will see; if a signal is received,
+  // handleCompareAndSwapUint32Fault will store a different value in this
+  // address.
+  MOVL $0, sig+20(FP)
+
+  MOVQ addr+0(FP), DI
+  MOVL old+8(FP), AX
+  MOVL new+12(FP), DX
+  LOCK
+  CMPXCHGL DX, 0(DI)
+  MOVL AX, prev+16(FP)
+  RET
+
+// handleLoadUint32Fault returns the value stored in DI. Control is transferred
+// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in DI.
+//
+// It must have the same frame configuration as loadUint32 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16
+  MOVL DI, sig+12(FP)
+  RET
+
+// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS
+// signal is received, the value returned is unspecified, and sig is the number
+// of the signal that was received.
+//
+// Preconditions: addr must be aligned to a 4-byte boundary.
+//
+//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
+TEXT ·loadUint32(SB), NOSPLIT, $0-16
+  // Store 0 as the returned signal number. If we run to completion,
+  // this is the value the caller will see; if a signal is received,
+  // handleLoadUint32Fault will store a different value in this address.
+  MOVL $0, sig+12(FP)
+
+  MOVQ addr+0(FP), AX
+  MOVL (AX), BX
+  MOVL BX, val+8(FP)
+  RET
diff --git a/pkg/safecopy/atomic_arm64.s b/pkg/safecopy/atomic_arm64.s
new file mode 100644
index 000000000..d58ed71f7
--- /dev/null
+++ b/pkg/safecopy/atomic_arm64.s
@@ -0,0 +1,126 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// handleSwapUint32Fault returns the value stored in R1. Control is transferred
+// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in R1.
+//
+// It must have the same frame configuration as swapUint32 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24
+	MOVW R1, sig+20(FP)
+	RET
+
+// See the corresponding doc in safecopy_unsafe.go
+//
+// The code is derived from Go source runtime/internal/atomic.Xchg.
+//
+//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32)
+TEXT ·swapUint32(SB), NOSPLIT, $0-24
+	// Store 0 as the returned signal number. If we run to completion,
+	// this is the value the caller will see; if a signal is received,
+	// handleSwapUint32Fault will store a different value in this address.
+	MOVW $0, sig+20(FP)
+again:
+	MOVD addr+0(FP), R0
+	MOVW new+8(FP), R1
+	LDAXRW (R0), R2
+	STLXRW R1, (R0), R3
+	CBNZ R3, again
+	MOVW R2, old+16(FP)
+	RET
+
+// handleSwapUint64Fault returns the value stored in R1. Control is transferred
+// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in R1.
+//
+// It must have the same frame configuration as swapUint64 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28
+	MOVW R1, sig+24(FP)
+	RET
+
+// See the corresponding doc in safecopy_unsafe.go
+//
+// The code is derived from Go source runtime/internal/atomic.Xchg64.
+//
+//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32)
+TEXT ·swapUint64(SB), NOSPLIT, $0-28
+	// Store 0 as the returned signal number. If we run to completion,
+	// this is the value the caller will see; if a signal is received,
+	// handleSwapUint64Fault will store a different value in this address.
+	MOVW $0, sig+24(FP)
+again:
+	MOVD addr+0(FP), R0
+	MOVD new+8(FP), R1
+	LDAXR (R0), R2
+	STLXR R1, (R0), R3
+	CBNZ R3, again
+	MOVD R2, old+16(FP)
+	RET
+
+// handleCompareAndSwapUint32Fault returns the value stored in R1. Control is
+// transferred to it when compareAndSwapUint32 below receives SIGSEGV or SIGBUS,
+// with the signal number stored in R1.
+//
+// It must have the same frame configuration as compareAndSwapUint32 so that it
+// can undo any potential call frame set up by the assembler.
+TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24
+	MOVW R1, sig+20(FP)
+	RET
+
+// See the corresponding doc in safecopy_unsafe.go
+//
+// The code is derived from Go source runtime/internal/atomic.Cas.
+//
+//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32)
+TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24
+	// Store 0 as the returned signal number. If we run to completion, this is
+	// the value the caller will see; if a signal is received,
+	// handleCompareAndSwapUint32Fault will store a different value in this
+	// address.
+	MOVW $0, sig+20(FP)
+
+	MOVD addr+0(FP), R0
+	MOVW old+8(FP), R1
+	MOVW new+12(FP), R2
+again:
+	LDAXRW (R0), R3
+	CMPW R1, R3
+	BNE done
+	STLXRW R2, (R0), R4
+	CBNZ R4, again
+done:
+	MOVW R3, prev+16(FP)
+	RET
+
+// handleLoadUint32Fault returns the value stored in DI. Control is transferred
+// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in DI.
+//
+// It must have the same frame configuration as loadUint32 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16
+	MOVW R1, sig+12(FP)
+	RET
+
+// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS
+// signal is received, the value returned is unspecified, and sig is the number
+// of the signal that was received.
+//
+// Preconditions: addr must be aligned to a 4-byte boundary.
+//
+//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
+TEXT ·loadUint32(SB), NOSPLIT, $0-16
+	// Store 0 as the returned signal number. If we run to completion,
+	// this is the value the caller will see; if a signal is received,
+	// handleLoadUint32Fault will store a different value in this address.
+	MOVW $0, sig+12(FP)
+
+	MOVD addr+0(FP), R0
+	LDARW (R0), R1
+	MOVW R1, val+8(FP)
+	RET
diff --git a/pkg/safecopy/memclr_amd64.s b/pkg/safecopy/memclr_amd64.s
new file mode 100644
index 000000000..64cf32f05
--- /dev/null
+++ b/pkg/safecopy/memclr_amd64.s
@@ -0,0 +1,147 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// handleMemclrFault returns (the value stored in AX, the value stored in DI).
+// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS,
+// with the faulting address stored in AX and the signal number stored in DI.
+//
+// It must have the same frame configuration as memclr so that it can undo any
+// potential call frame set up by the assembler.
+TEXT handleMemclrFault(SB), NOSPLIT, $0-28
+	MOVQ	AX, addr+16(FP)
+	MOVL	DI, sig+24(FP)
+	RET
+
+// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS
+// signal is received during the write, it returns the address that caused the
+// fault and the number of the signal that was received. Otherwise, it returns
+// an unspecified address and a signal number of 0.
+//
+// Data is written in order, such that if a fault happens at address p, it is
+// safe to assume that all data before p-maxRegisterSize has already been
+// successfully written.
+//
+// The code is derived from runtime.memclrNoHeapPointers.
+//
+// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+TEXT ·memclr(SB), NOSPLIT, $0-28
+	// Store 0 as the returned signal number. If we run to completion,
+	// this is the value the caller will see; if a signal is received,
+	// handleMemclrFault will store a different value in this address.
+	MOVL	$0, sig+24(FP)
+
+	MOVQ	ptr+0(FP), DI
+	MOVQ	n+8(FP), BX
+	XORQ	AX, AX
+
+	// MOVOU seems always faster than REP STOSQ.
+tail:
+	TESTQ	BX, BX
+	JEQ	_0
+	CMPQ	BX, $2
+	JBE	_1or2
+	CMPQ	BX, $4
+	JBE	_3or4
+	CMPQ	BX, $8
+	JB	_5through7
+	JE	_8
+	CMPQ	BX, $16
+	JBE	_9through16
+	PXOR	X0, X0
+	CMPQ	BX, $32
+	JBE	_17through32
+	CMPQ	BX, $64
+	JBE	_33through64
+	CMPQ	BX, $128
+	JBE	_65through128
+	CMPQ	BX, $256
+	JBE	_129through256
+	// TODO: use branch table and BSR to make this just a single dispatch
+	// TODO: for really big clears, use MOVNTDQ, even without AVX2.
+
+loop:
+	MOVOU	X0, 0(DI)
+	MOVOU	X0, 16(DI)
+	MOVOU	X0, 32(DI)
+	MOVOU	X0, 48(DI)
+	MOVOU	X0, 64(DI)
+	MOVOU	X0, 80(DI)
+	MOVOU	X0, 96(DI)
+	MOVOU	X0, 112(DI)
+	MOVOU	X0, 128(DI)
+	MOVOU	X0, 144(DI)
+	MOVOU	X0, 160(DI)
+	MOVOU	X0, 176(DI)
+	MOVOU	X0, 192(DI)
+	MOVOU	X0, 208(DI)
+	MOVOU	X0, 224(DI)
+	MOVOU	X0, 240(DI)
+	SUBQ	$256, BX
+	ADDQ	$256, DI
+	CMPQ	BX, $256
+	JAE	loop
+	JMP	tail
+
+_1or2:
+	MOVB	AX, (DI)
+	MOVB	AX, -1(DI)(BX*1)
+	RET
+_0:
+	RET
+_3or4:
+	MOVW	AX, (DI)
+	MOVW	AX, -2(DI)(BX*1)
+	RET
+_5through7:
+	MOVL	AX, (DI)
+	MOVL	AX, -4(DI)(BX*1)
+	RET
+_8:
+	// We need a separate case for 8 to make sure we clear pointers atomically.
+	MOVQ	AX, (DI)
+	RET
+_9through16:
+	MOVQ	AX, (DI)
+	MOVQ	AX, -8(DI)(BX*1)
+	RET
+_17through32:
+	MOVOU	X0, (DI)
+	MOVOU	X0, -16(DI)(BX*1)
+	RET
+_33through64:
+	MOVOU	X0, (DI)
+	MOVOU	X0, 16(DI)
+	MOVOU	X0, -32(DI)(BX*1)
+	MOVOU	X0, -16(DI)(BX*1)
+	RET
+_65through128:
+	MOVOU	X0, (DI)
+	MOVOU	X0, 16(DI)
+	MOVOU	X0, 32(DI)
+	MOVOU	X0, 48(DI)
+	MOVOU	X0, -64(DI)(BX*1)
+	MOVOU	X0, -48(DI)(BX*1)
+	MOVOU	X0, -32(DI)(BX*1)
+	MOVOU	X0, -16(DI)(BX*1)
+	RET
+_129through256:
+	MOVOU	X0, (DI)
+	MOVOU	X0, 16(DI)
+	MOVOU	X0, 32(DI)
+	MOVOU	X0, 48(DI)
+	MOVOU	X0, 64(DI)
+	MOVOU	X0, 80(DI)
+	MOVOU	X0, 96(DI)
+	MOVOU	X0, 112(DI)
+	MOVOU	X0, -128(DI)(BX*1)
+	MOVOU	X0, -112(DI)(BX*1)
+	MOVOU	X0, -96(DI)(BX*1)
+	MOVOU	X0, -80(DI)(BX*1)
+	MOVOU	X0, -64(DI)(BX*1)
+	MOVOU	X0, -48(DI)(BX*1)
+	MOVOU	X0, -32(DI)(BX*1)
+	MOVOU	X0, -16(DI)(BX*1)
+	RET
diff --git a/pkg/safecopy/memclr_arm64.s b/pkg/safecopy/memclr_arm64.s
new file mode 100644
index 000000000..7361b9067
--- /dev/null
+++ b/pkg/safecopy/memclr_arm64.s
@@ -0,0 +1,74 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// handleMemclrFault returns (the value stored in R0, the value stored in R1).
+// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS,
+// with the faulting address stored in R0 and the signal number stored in R1.
+//
+// It must have the same frame configuration as memclr so that it can undo any
+// potential call frame set up by the assembler.
+TEXT handleMemclrFault(SB), NOSPLIT, $0-28
+	MOVD R0, addr+16(FP)
+	MOVW R1, sig+24(FP)
+	RET
+
+// See the corresponding doc in safecopy_unsafe.go
+//
+// The code is derived from runtime.memclrNoHeapPointers.
+//
+// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+TEXT ·memclr(SB), NOSPLIT, $0-28
+	// Store 0 as the returned signal number. If we run to completion,
+	// this is the value the caller will see; if a signal is received,
+	// handleMemclrFault will store a different value in this address.
+	MOVW $0, sig+24(FP)
+	MOVD ptr+0(FP), R0
+	MOVD n+8(FP), R1
+
+	// If size is less than 16 bytes, use tail_zero to zero what remains
+	CMP $16, R1
+	BLT tail_zero
+	// Get buffer offset into 16 byte aligned address for better performance
+	ANDS $15, R0, ZR
+	BNE unaligned_to_16
+aligned_to_16:
+	LSR $4, R1, R2
+zero_by_16:
+	STP.P (ZR, ZR), 16(R0) // Store pair with post index.
+	SUBS $1, R2, R2
+	BNE zero_by_16
+	ANDS $15, R1, R1
+	BEQ end
+
+	// Zero buffer with size=R1 < 16
+tail_zero:
+	TBZ $3, R1, tail_zero_4
+	MOVD.P ZR, 8(R0)
+tail_zero_4:
+	TBZ $2, R1, tail_zero_2
+	MOVW.P ZR, 4(R0)
+tail_zero_2:
+	TBZ $1, R1, tail_zero_1
+	MOVH.P ZR, 2(R0)
+tail_zero_1:
+	TBZ $0, R1, end
+	MOVB ZR, (R0)
+end:
+	RET
+
+unaligned_to_16:
+	MOVD R0, R2
+head_loop:
+	MOVBU.P ZR, 1(R0)
+	ANDS $15, R0, ZR
+	BNE head_loop
+	// Adjust length for what remains
+	SUB R2, R0, R3
+	SUB R3, R1
+	// If size is less than 16 bytes, use tail_zero to zero what remains
+	CMP $16, R1
+	BLT tail_zero
+	B aligned_to_16
diff --git a/pkg/safecopy/memcpy_amd64.s b/pkg/safecopy/memcpy_amd64.s
new file mode 100644
index 000000000..129691d68
--- /dev/null
+++ b/pkg/safecopy/memcpy_amd64.s
@@ -0,0 +1,250 @@
+// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
+// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
+// Portions Copyright 2009 The Go Authors. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "textflag.h"
+
+// handleMemcpyFault returns (the value stored in AX, the value stored in DI).
+// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS,
+// with the faulting address stored in AX and the signal number stored in DI.
+//
+// It must have the same frame configuration as memcpy so that it can undo any
+// potential call frame set up by the assembler.
+TEXT handleMemcpyFault(SB), NOSPLIT, $0-36
+	MOVQ	AX, addr+24(FP)
+	MOVL	DI, sig+32(FP)
+	RET
+
+// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received
+// during the copy, it returns the address that caused the fault and the number
+// of the signal that was received. Otherwise, it returns an unspecified address
+// and a signal number of 0.
+//
+// Data is copied in order, such that if a fault happens at address p, it is
+// safe to assume that all data before p-maxRegisterSize has already been
+// successfully copied.
+//
+// The code is derived from the forward copying part of runtime.memmove.
+//
+// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+TEXT ·memcpy(SB), NOSPLIT, $0-36
+	// Store 0 as the returned signal number. If we run to completion,
+	// this is the value the caller will see; if a signal is received,
+	// handleMemcpyFault will store a different value in this address.
+	MOVL	$0, sig+32(FP)
+
+	MOVQ	to+0(FP), DI
+	MOVQ	from+8(FP), SI
+	MOVQ	n+16(FP), BX
+
+	// REP instructions have a high startup cost, so we handle small sizes
+	// with some straightline code. The REP MOVSQ instruction is really fast
+	// for large sizes. The cutover is approximately 2K.
+tail:
+	// move_129through256 or smaller work whether or not the source and the
+	// destination memory regions overlap because they load all data into
+	// registers before writing it back.  move_256through2048 on the other
+	// hand can be used only when the memory regions don't overlap or the copy
+	// direction is forward.
+	TESTQ	BX, BX
+	JEQ	move_0
+	CMPQ	BX, $2
+	JBE	move_1or2
+	CMPQ	BX, $4
+	JBE	move_3or4
+	CMPQ	BX, $8
+	JB	move_5through7
+	JE	move_8
+	CMPQ	BX, $16
+	JBE	move_9through16
+	CMPQ	BX, $32
+	JBE	move_17through32
+	CMPQ	BX, $64
+	JBE	move_33through64
+	CMPQ	BX, $128
+	JBE	move_65through128
+	CMPQ	BX, $256
+	JBE	move_129through256
+	// TODO: use branch table and BSR to make this just a single dispatch
+
+/*
+ * forward copy loop
+ */
+	CMPQ	BX, $2048
+	JLS	move_256through2048
+
+	// Check alignment
+	MOVL	SI, AX
+	ORL	DI, AX
+	TESTL	$7, AX
+	JEQ	fwdBy8
+
+	// Do 1 byte at a time
+	MOVQ	BX, CX
+	REP;	MOVSB
+	RET
+
+fwdBy8:
+	// Do 8 bytes at a time
+	MOVQ	BX, CX
+	SHRQ	$3, CX
+	ANDQ	$7, BX
+	REP;	MOVSQ
+	JMP	tail
+
+move_1or2:
+	MOVB	(SI), AX
+	MOVB	AX, (DI)
+	MOVB	-1(SI)(BX*1), CX
+	MOVB	CX, -1(DI)(BX*1)
+	RET
+move_0:
+	RET
+move_3or4:
+	MOVW	(SI), AX
+	MOVW	AX, (DI)
+	MOVW	-2(SI)(BX*1), CX
+	MOVW	CX, -2(DI)(BX*1)
+	RET
+move_5through7:
+	MOVL	(SI), AX
+	MOVL	AX, (DI)
+	MOVL	-4(SI)(BX*1), CX
+	MOVL	CX, -4(DI)(BX*1)
+	RET
+move_8:
+	// We need a separate case for 8 to make sure we write pointers atomically.
+	MOVQ	(SI), AX
+	MOVQ	AX, (DI)
+	RET
+move_9through16:
+	MOVQ	(SI), AX
+	MOVQ	AX, (DI)
+	MOVQ	-8(SI)(BX*1), CX
+	MOVQ	CX, -8(DI)(BX*1)
+	RET
+move_17through32:
+	MOVOU	(SI), X0
+	MOVOU	X0, (DI)
+	MOVOU	-16(SI)(BX*1), X1
+	MOVOU	X1, -16(DI)(BX*1)
+	RET
+move_33through64:
+	MOVOU	(SI), X0
+	MOVOU	X0, (DI)
+	MOVOU	16(SI), X1
+	MOVOU	X1, 16(DI)
+	MOVOU	-32(SI)(BX*1), X2
+	MOVOU	X2, -32(DI)(BX*1)
+	MOVOU	-16(SI)(BX*1), X3
+	MOVOU	X3, -16(DI)(BX*1)
+	RET
+move_65through128:
+	MOVOU	(SI), X0
+	MOVOU	X0, (DI)
+	MOVOU	16(SI), X1
+	MOVOU	X1, 16(DI)
+	MOVOU	32(SI), X2
+	MOVOU	X2, 32(DI)
+	MOVOU	48(SI), X3
+	MOVOU	X3, 48(DI)
+	MOVOU	-64(SI)(BX*1), X4
+	MOVOU	X4, -64(DI)(BX*1)
+	MOVOU	-48(SI)(BX*1), X5
+	MOVOU	X5, -48(DI)(BX*1)
+	MOVOU	-32(SI)(BX*1), X6
+	MOVOU	X6, -32(DI)(BX*1)
+	MOVOU	-16(SI)(BX*1), X7
+	MOVOU	X7, -16(DI)(BX*1)
+	RET
+move_129through256:
+	MOVOU	(SI), X0
+	MOVOU	X0, (DI)
+	MOVOU	16(SI), X1
+	MOVOU	X1, 16(DI)
+	MOVOU	32(SI), X2
+	MOVOU	X2, 32(DI)
+	MOVOU	48(SI), X3
+	MOVOU	X3, 48(DI)
+	MOVOU	64(SI), X4
+	MOVOU	X4, 64(DI)
+	MOVOU	80(SI), X5
+	MOVOU	X5, 80(DI)
+	MOVOU	96(SI), X6
+	MOVOU	X6, 96(DI)
+	MOVOU	112(SI), X7
+	MOVOU	X7, 112(DI)
+	MOVOU	-128(SI)(BX*1), X8
+	MOVOU	X8, -128(DI)(BX*1)
+	MOVOU	-112(SI)(BX*1), X9
+	MOVOU	X9, -112(DI)(BX*1)
+	MOVOU	-96(SI)(BX*1), X10
+	MOVOU	X10, -96(DI)(BX*1)
+	MOVOU	-80(SI)(BX*1), X11
+	MOVOU	X11, -80(DI)(BX*1)
+	MOVOU	-64(SI)(BX*1), X12
+	MOVOU	X12, -64(DI)(BX*1)
+	MOVOU	-48(SI)(BX*1), X13
+	MOVOU	X13, -48(DI)(BX*1)
+	MOVOU	-32(SI)(BX*1), X14
+	MOVOU	X14, -32(DI)(BX*1)
+	MOVOU	-16(SI)(BX*1), X15
+	MOVOU	X15, -16(DI)(BX*1)
+	RET
+move_256through2048:
+	SUBQ	$256, BX
+	MOVOU	(SI), X0
+	MOVOU	X0, (DI)
+	MOVOU	16(SI), X1
+	MOVOU	X1, 16(DI)
+	MOVOU	32(SI), X2
+	MOVOU	X2, 32(DI)
+	MOVOU	48(SI), X3
+	MOVOU	X3, 48(DI)
+	MOVOU	64(SI), X4
+	MOVOU	X4, 64(DI)
+	MOVOU	80(SI), X5
+	MOVOU	X5, 80(DI)
+	MOVOU	96(SI), X6
+	MOVOU	X6, 96(DI)
+	MOVOU	112(SI), X7
+	MOVOU	X7, 112(DI)
+	MOVOU	128(SI), X8
+	MOVOU	X8, 128(DI)
+	MOVOU	144(SI), X9
+	MOVOU	X9, 144(DI)
+	MOVOU	160(SI), X10
+	MOVOU	X10, 160(DI)
+	MOVOU	176(SI), X11
+	MOVOU	X11, 176(DI)
+	MOVOU	192(SI), X12
+	MOVOU	X12, 192(DI)
+	MOVOU	208(SI), X13
+	MOVOU	X13, 208(DI)
+	MOVOU	224(SI), X14
+	MOVOU	X14, 224(DI)
+	MOVOU	240(SI), X15
+	MOVOU	X15, 240(DI)
+	CMPQ	BX, $256
+	LEAQ	256(SI), SI
+	LEAQ	256(DI), DI
+	JGE	move_256through2048
+	JMP	tail
diff --git a/pkg/safecopy/memcpy_arm64.s b/pkg/safecopy/memcpy_arm64.s
new file mode 100644
index 000000000..e7e541565
--- /dev/null
+++ b/pkg/safecopy/memcpy_arm64.s
@@ -0,0 +1,78 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// handleMemcpyFault returns (the value stored in R0, the value stored in R1).
+// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS,
+// with the faulting address stored in R0 and the signal number stored in R1.
+//
+// It must have the same frame configuration as memcpy so that it can undo any
+// potential call frame set up by the assembler.
+TEXT handleMemcpyFault(SB), NOSPLIT, $0-36
+	MOVD R0, addr+24(FP)
+	MOVW R1, sig+32(FP)
+	RET
+
+// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received
+// during the copy, it returns the address that caused the fault and the number
+// of the signal that was received. Otherwise, it returns an unspecified address
+// and a signal number of 0.
+//
+// Data is copied in order, such that if a fault happens at address p, it is
+// safe to assume that all data before p-maxRegisterSize has already been
+// successfully copied.
+//
+// The code is derived from the Go source runtime.memmove.
+//
+// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+TEXT ·memcpy(SB), NOSPLIT, $-8-36
+	// Store 0 as the returned signal number. If we run to completion,
+	// this is the value the caller will see; if a signal is received,
+	// handleMemcpyFault will store a different value in this address.
+	MOVW $0, sig+32(FP)
+
+	MOVD to+0(FP), R3
+	MOVD from+8(FP), R4
+	MOVD n+16(FP), R5
+	CMP $0, R5
+	BNE check
+	RET
+
+check:
+	AND $~7, R5, R7     // R7 is N&~7.
+	SUB R7, R5, R6      // R6 is N&7.
+
+	// Copying forward proceeds by copying R7/8 words then copying R6 bytes.
+	// R3 and R4 are advanced as we copy.
+
+	// (There may be implementations of armv8 where copying by bytes until
+	// at least one of source or dest is word aligned is a worthwhile
+	// optimization, but the on the one tested so far (xgene) it did not
+	// make a significance difference.)
+
+	CMP $0, R7          // Do we need to do any word-by-word copying?
+	BEQ noforwardlarge
+	ADD R3, R7, R9      // R9 points just past where we copy by word.
+
+forwardlargeloop:
+	MOVD.P 8(R4), R8       // R8 is just a scratch register.
+	MOVD.P R8, 8(R3)
+	CMP R3, R9
+	BNE forwardlargeloop
+
+noforwardlarge:
+	CMP $0, R6          // Do we need to do any byte-by-byte copying?
+	BNE forwardtail
+	RET
+
+forwardtail:
+	ADD R3, R6, R9      // R9 points just past the destination memory.
+
+forwardtailloop:
+	MOVBU.P 1(R4), R8
+	MOVBU.P R8, 1(R3)
+	CMP R3, R9
+	BNE forwardtailloop
+	RET
diff --git a/pkg/safecopy/safecopy.go b/pkg/safecopy/safecopy.go
new file mode 100644
index 000000000..2fb7e5809
--- /dev/null
+++ b/pkg/safecopy/safecopy.go
@@ -0,0 +1,144 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package safecopy provides an efficient implementation of functions to access
+// memory that may result in SIGSEGV or SIGBUS being sent to the accessor.
+package safecopy
+
+import (
+	"fmt"
+	"reflect"
+	"runtime"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// SegvError is returned when a safecopy function receives SIGSEGV.
+type SegvError struct {
+	// Addr is the address at which the SIGSEGV occurred.
+	Addr uintptr
+}
+
+// Error implements error.Error.
+func (e SegvError) Error() string {
+	return fmt.Sprintf("SIGSEGV at %#x", e.Addr)
+}
+
+// BusError is returned when a safecopy function receives SIGBUS.
+type BusError struct {
+	// Addr is the address at which the SIGBUS occurred.
+	Addr uintptr
+}
+
+// Error implements error.Error.
+func (e BusError) Error() string {
+	return fmt.Sprintf("SIGBUS at %#x", e.Addr)
+}
+
+// AlignmentError is returned when a safecopy function is passed an address
+// that does not meet alignment requirements.
+type AlignmentError struct {
+	// Addr is the invalid address.
+	Addr uintptr
+
+	// Alignment is the required alignment.
+	Alignment uintptr
+}
+
+// Error implements error.Error.
+func (e AlignmentError) Error() string {
+	return fmt.Sprintf("address %#x is not aligned to a %d-byte boundary", e.Addr, e.Alignment)
+}
+
+var (
+	// The begin and end addresses below are for the functions that are
+	// checked by the signal handler.
+	memcpyBegin               uintptr
+	memcpyEnd                 uintptr
+	memclrBegin               uintptr
+	memclrEnd                 uintptr
+	swapUint32Begin           uintptr
+	swapUint32End             uintptr
+	swapUint64Begin           uintptr
+	swapUint64End             uintptr
+	compareAndSwapUint32Begin uintptr
+	compareAndSwapUint32End   uintptr
+	loadUint32Begin           uintptr
+	loadUint32End             uintptr
+
+	// savedSigSegVHandler is a pointer to the SIGSEGV handler that was
+	// configured before we replaced it with our own. We still call into it
+	// when we get a SIGSEGV that is not interesting to us.
+	savedSigSegVHandler uintptr
+
+	// same a above, but for SIGBUS signals.
+	savedSigBusHandler uintptr
+)
+
+// signalHandler is our replacement signal handler for SIGSEGV and SIGBUS
+// signals.
+func signalHandler()
+
+// FindEndAddress returns the end address (one byte beyond the last) of the
+// function that contains the specified address (begin).
+func FindEndAddress(begin uintptr) uintptr {
+	f := runtime.FuncForPC(begin)
+	if f != nil {
+		for p := begin; ; p++ {
+			g := runtime.FuncForPC(p)
+			if f != g {
+				return p
+			}
+		}
+	}
+	return begin
+}
+
+// initializeAddresses initializes the addresses used by the signal handler.
+func initializeAddresses() {
+	// The following functions are written in assembly language, so they won't
+	// be inlined by the existing compiler/linker. Tests will fail if this
+	// assumption is violated.
+	memcpyBegin = reflect.ValueOf(memcpy).Pointer()
+	memcpyEnd = FindEndAddress(memcpyBegin)
+	memclrBegin = reflect.ValueOf(memclr).Pointer()
+	memclrEnd = FindEndAddress(memclrBegin)
+	swapUint32Begin = reflect.ValueOf(swapUint32).Pointer()
+	swapUint32End = FindEndAddress(swapUint32Begin)
+	swapUint64Begin = reflect.ValueOf(swapUint64).Pointer()
+	swapUint64End = FindEndAddress(swapUint64Begin)
+	compareAndSwapUint32Begin = reflect.ValueOf(compareAndSwapUint32).Pointer()
+	compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin)
+	loadUint32Begin = reflect.ValueOf(loadUint32).Pointer()
+	loadUint32End = FindEndAddress(loadUint32Begin)
+}
+
+func init() {
+	initializeAddresses()
+	if err := ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(signalHandler).Pointer(), &savedSigSegVHandler); err != nil {
+		panic(fmt.Sprintf("Unable to set handler for SIGSEGV: %v", err))
+	}
+	if err := ReplaceSignalHandler(syscall.SIGBUS, reflect.ValueOf(signalHandler).Pointer(), &savedSigBusHandler); err != nil {
+		panic(fmt.Sprintf("Unable to set handler for SIGBUS: %v", err))
+	}
+	syserror.AddErrorUnwrapper(func(e error) (syscall.Errno, bool) {
+		switch e.(type) {
+		case SegvError, BusError, AlignmentError:
+			return syscall.EFAULT, true
+		default:
+			return 0, false
+		}
+	})
+}
diff --git a/pkg/safecopy/safecopy_test.go b/pkg/safecopy/safecopy_test.go
new file mode 100644
index 000000000..5818f7f9b
--- /dev/null
+++ b/pkg/safecopy/safecopy_test.go
@@ -0,0 +1,617 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package safecopy
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"runtime/debug"
+	"syscall"
+	"testing"
+	"unsafe"
+)
+
+// Size of a page in bytes. Cloned from usermem.PageSize to avoid a circular
+// dependency.
+const pageSize = 4096
+
+func initRandom(b []byte) {
+	for i := range b {
+		b[i] = byte(rand.Intn(256))
+	}
+}
+
+func randBuf(size int) []byte {
+	b := make([]byte, size)
+	initRandom(b)
+	return b
+}
+
+func TestCopyInSuccess(t *testing.T) {
+	// Test that CopyIn does not return an error when all pages are accessible.
+	const bufLen = 8192
+	a := randBuf(bufLen)
+	b := make([]byte, bufLen)
+
+	n, err := CopyIn(b, unsafe.Pointer(&a[0]))
+	if n != bufLen {
+		t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen)
+	}
+	if err != nil {
+		t.Errorf("Unexpected error: %v", err)
+	}
+	if !bytes.Equal(a, b) {
+		t.Errorf("Buffers are not equal when they should be: %v %v", a, b)
+	}
+}
+
+func TestCopyOutSuccess(t *testing.T) {
+	// Test that CopyOut does not return an error when all pages are
+	// accessible.
+	const bufLen = 8192
+	a := randBuf(bufLen)
+	b := make([]byte, bufLen)
+
+	n, err := CopyOut(unsafe.Pointer(&b[0]), a)
+	if n != bufLen {
+		t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen)
+	}
+	if err != nil {
+		t.Errorf("Unexpected error: %v", err)
+	}
+	if !bytes.Equal(a, b) {
+		t.Errorf("Buffers are not equal when they should be: %v %v", a, b)
+	}
+}
+
+func TestCopySuccess(t *testing.T) {
+	// Test that Copy does not return an error when all pages are accessible.
+	const bufLen = 8192
+	a := randBuf(bufLen)
+	b := make([]byte, bufLen)
+
+	n, err := Copy(unsafe.Pointer(&b[0]), unsafe.Pointer(&a[0]), bufLen)
+	if n != bufLen {
+		t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen)
+	}
+	if err != nil {
+		t.Errorf("Unexpected error: %v", err)
+	}
+	if !bytes.Equal(a, b) {
+		t.Errorf("Buffers are not equal when they should be: %v %v", a, b)
+	}
+}
+
+func TestZeroOutSuccess(t *testing.T) {
+	// Test that ZeroOut does not return an error when all pages are
+	// accessible.
+	const bufLen = 8192
+	a := make([]byte, bufLen)
+	b := randBuf(bufLen)
+
+	n, err := ZeroOut(unsafe.Pointer(&b[0]), bufLen)
+	if n != bufLen {
+		t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen)
+	}
+	if err != nil {
+		t.Errorf("Unexpected error: %v", err)
+	}
+	if !bytes.Equal(a, b) {
+		t.Errorf("Buffers are not equal when they should be: %v %v", a, b)
+	}
+}
+
+func TestSwapUint32Success(t *testing.T) {
+	// Test that SwapUint32 does not return an error when the page is
+	// accessible.
+	before := uint32(rand.Int31())
+	after := uint32(rand.Int31())
+	val := before
+
+	old, err := SwapUint32(unsafe.Pointer(&val), after)
+	if err != nil {
+		t.Errorf("Unexpected error: %v", err)
+	}
+	if old != before {
+		t.Errorf("Unexpected old value: got %v, want %v", old, before)
+	}
+	if val != after {
+		t.Errorf("Unexpected new value: got %v, want %v", val, after)
+	}
+}
+
+func TestSwapUint32AlignmentError(t *testing.T) {
+	// Test that SwapUint32 returns an AlignmentError when passed an unaligned
+	// address.
+	data := new(struct{ val uint64 })
+	addr := uintptr(unsafe.Pointer(&data.val)) + 1
+	want := AlignmentError{Addr: addr, Alignment: 4}
+	if _, err := SwapUint32(unsafe.Pointer(addr), 1); err != want {
+		t.Errorf("Unexpected error: got %v, want %v", err, want)
+	}
+}
+
+func TestSwapUint64Success(t *testing.T) {
+	// Test that SwapUint64 does not return an error when the page is
+	// accessible.
+	before := uint64(rand.Int63())
+	after := uint64(rand.Int63())
+	// "The first word in ... an allocated struct or slice can be relied upon
+	// to be 64-bit aligned." - sync/atomic docs
+	data := new(struct{ val uint64 })
+	data.val = before
+
+	old, err := SwapUint64(unsafe.Pointer(&data.val), after)
+	if err != nil {
+		t.Errorf("Unexpected error: %v", err)
+	}
+	if old != before {
+		t.Errorf("Unexpected old value: got %v, want %v", old, before)
+	}
+	if data.val != after {
+		t.Errorf("Unexpected new value: got %v, want %v", data.val, after)
+	}
+}
+
+func TestSwapUint64AlignmentError(t *testing.T) {
+	// Test that SwapUint64 returns an AlignmentError when passed an unaligned
+	// address.
+	data := new(struct{ val1, val2 uint64 })
+	addr := uintptr(unsafe.Pointer(&data.val1)) + 1
+	want := AlignmentError{Addr: addr, Alignment: 8}
+	if _, err := SwapUint64(unsafe.Pointer(addr), 1); err != want {
+		t.Errorf("Unexpected error: got %v, want %v", err, want)
+	}
+}
+
+func TestCompareAndSwapUint32Success(t *testing.T) {
+	// Test that CompareAndSwapUint32 does not return an error when the page is
+	// accessible.
+	before := uint32(rand.Int31())
+	after := uint32(rand.Int31())
+	val := before
+
+	old, err := CompareAndSwapUint32(unsafe.Pointer(&val), before, after)
+	if err != nil {
+		t.Errorf("Unexpected error: %v", err)
+	}
+	if old != before {
+		t.Errorf("Unexpected old value: got %v, want %v", old, before)
+	}
+	if val != after {
+		t.Errorf("Unexpected new value: got %v, want %v", val, after)
+	}
+}
+
+func TestCompareAndSwapUint32AlignmentError(t *testing.T) {
+	// Test that CompareAndSwapUint32 returns an AlignmentError when passed an
+	// unaligned address.
+	data := new(struct{ val uint64 })
+	addr := uintptr(unsafe.Pointer(&data.val)) + 1
+	want := AlignmentError{Addr: addr, Alignment: 4}
+	if _, err := CompareAndSwapUint32(unsafe.Pointer(addr), 0, 1); err != want {
+		t.Errorf("Unexpected error: got %v, want %v", err, want)
+	}
+}
+
+// withSegvErrorTestMapping calls fn with a two-page mapping. The first page
+// contains random data, and the second page generates SIGSEGV when accessed.
+func withSegvErrorTestMapping(t *testing.T, fn func(m []byte)) {
+	mapping, err := syscall.Mmap(-1, 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE)
+	if err != nil {
+		t.Fatalf("Mmap failed: %v", err)
+	}
+	defer syscall.Munmap(mapping)
+	if err := syscall.Mprotect(mapping[pageSize:], syscall.PROT_NONE); err != nil {
+		t.Fatalf("Mprotect failed: %v", err)
+	}
+	initRandom(mapping[:pageSize])
+
+	fn(mapping)
+}
+
+// withBusErrorTestMapping calls fn with a two-page mapping. The first page
+// contains random data, and the second page generates SIGBUS when accessed.
+func withBusErrorTestMapping(t *testing.T, fn func(m []byte)) {
+	f, err := ioutil.TempFile("", "sigbus_test")
+	if err != nil {
+		t.Fatalf("TempFile failed: %v", err)
+	}
+	defer f.Close()
+	if err := f.Truncate(pageSize); err != nil {
+		t.Fatalf("Truncate failed: %v", err)
+	}
+	mapping, err := syscall.Mmap(int(f.Fd()), 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
+	if err != nil {
+		t.Fatalf("Mmap failed: %v", err)
+	}
+	defer syscall.Munmap(mapping)
+	initRandom(mapping[:pageSize])
+
+	fn(mapping)
+}
+
+func TestCopyInSegvError(t *testing.T) {
+	// Test that CopyIn returns a SegvError when reaching a page that signals
+	// SIGSEGV.
+	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
+		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
+			withSegvErrorTestMapping(t, func(mapping []byte) {
+				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				dst := randBuf(pageSize)
+				n, err := CopyIn(dst, src)
+				if n != bytesBeforeFault {
+					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
+				}
+				if want := (SegvError{secondPage}); err != want {
+					t.Errorf("Unexpected error: got %v, want %v", err, want)
+				}
+				if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) {
+					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
+				}
+			})
+		})
+	}
+}
+
+func TestCopyInBusError(t *testing.T) {
+	// Test that CopyIn returns a BusError when reaching a page that signals
+	// SIGBUS.
+	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
+		t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
+			withBusErrorTestMapping(t, func(mapping []byte) {
+				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				dst := randBuf(pageSize)
+				n, err := CopyIn(dst, src)
+				if n != bytesBeforeFault {
+					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
+				}
+				if want := (BusError{secondPage}); err != want {
+					t.Errorf("Unexpected error: got %v, want %v", err, want)
+				}
+				if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) {
+					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
+				}
+			})
+		})
+	}
+}
+
+func TestCopyOutSegvError(t *testing.T) {
+	// Test that CopyOut returns a SegvError when reaching a page that signals
+	// SIGSEGV.
+	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
+		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
+			withSegvErrorTestMapping(t, func(mapping []byte) {
+				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				src := randBuf(pageSize)
+				n, err := CopyOut(dst, src)
+				if n != bytesBeforeFault {
+					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
+				}
+				if want := (SegvError{secondPage}); err != want {
+					t.Errorf("Unexpected error: got %v, want %v", err, want)
+				}
+				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) {
+					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
+				}
+			})
+		})
+	}
+}
+
+func TestCopyOutBusError(t *testing.T) {
+	// Test that CopyOut returns a BusError when reaching a page that signals
+	// SIGBUS.
+	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
+		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
+			withBusErrorTestMapping(t, func(mapping []byte) {
+				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				src := randBuf(pageSize)
+				n, err := CopyOut(dst, src)
+				if n != bytesBeforeFault {
+					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
+				}
+				if want := (BusError{secondPage}); err != want {
+					t.Errorf("Unexpected error: got %v, want %v", err, want)
+				}
+				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) {
+					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
+				}
+			})
+		})
+	}
+}
+
+func TestCopySourceSegvError(t *testing.T) {
+	// Test that Copy returns a SegvError when copying from a page that signals
+	// SIGSEGV.
+	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
+		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
+			withSegvErrorTestMapping(t, func(mapping []byte) {
+				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				dst := randBuf(pageSize)
+				n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize)
+				if n != uintptr(bytesBeforeFault) {
+					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
+				}
+				if want := (SegvError{secondPage}); err != want {
+					t.Errorf("Unexpected error: got %v, want %v", err, want)
+				}
+				if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) {
+					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
+				}
+			})
+		})
+	}
+}
+
+func TestCopySourceBusError(t *testing.T) {
+	// Test that Copy returns a BusError when copying from a page that signals
+	// SIGBUS.
+	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
+		t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
+			withBusErrorTestMapping(t, func(mapping []byte) {
+				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				dst := randBuf(pageSize)
+				n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize)
+				if n != uintptr(bytesBeforeFault) {
+					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
+				}
+				if want := (BusError{secondPage}); err != want {
+					t.Errorf("Unexpected error: got %v, want %v", err, want)
+				}
+				if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) {
+					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
+				}
+			})
+		})
+	}
+}
+
+func TestCopyDestinationSegvError(t *testing.T) {
+	// Test that Copy returns a SegvError when copying to a page that signals
+	// SIGSEGV.
+	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
+		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
+			withSegvErrorTestMapping(t, func(mapping []byte) {
+				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				src := randBuf(pageSize)
+				n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize)
+				if n != uintptr(bytesBeforeFault) {
+					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
+				}
+				if want := (SegvError{secondPage}); err != want {
+					t.Errorf("Unexpected error: got %v, want %v", err, want)
+				}
+				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) {
+					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
+				}
+			})
+		})
+	}
+}
+
+func TestCopyDestinationBusError(t *testing.T) {
+	// Test that Copy returns a BusError when copying to a page that signals
+	// SIGBUS.
+	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
+		t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
+			withBusErrorTestMapping(t, func(mapping []byte) {
+				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				src := randBuf(pageSize)
+				n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize)
+				if n != uintptr(bytesBeforeFault) {
+					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
+				}
+				if want := (BusError{secondPage}); err != want {
+					t.Errorf("Unexpected error: got %v, want %v", err, want)
+				}
+				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) {
+					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
+				}
+			})
+		})
+	}
+}
+
+func TestZeroOutSegvError(t *testing.T) {
+	// Test that ZeroOut returns a SegvError when reaching a page that signals
+	// SIGSEGV.
+	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
+		t.Run(fmt.Sprintf("starting write %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
+			withSegvErrorTestMapping(t, func(mapping []byte) {
+				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				n, err := ZeroOut(dst, pageSize)
+				if n != uintptr(bytesBeforeFault) {
+					t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault)
+				}
+				if want := (SegvError{secondPage}); err != want {
+					t.Errorf("Unexpected error: got %v, want %v", err, want)
+				}
+				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) {
+					t.Errorf("Non-zero bytes in written part of mapping: %v", got)
+				}
+			})
+		})
+	}
+}
+
+func TestZeroOutBusError(t *testing.T) {
+	// Test that ZeroOut returns a BusError when reaching a page that signals
+	// SIGBUS.
+	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
+		t.Run(fmt.Sprintf("starting write %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
+			withBusErrorTestMapping(t, func(mapping []byte) {
+				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				n, err := ZeroOut(dst, pageSize)
+				if n != uintptr(bytesBeforeFault) {
+					t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault)
+				}
+				if want := (BusError{secondPage}); err != want {
+					t.Errorf("Unexpected error: got %v, want %v", err, want)
+				}
+				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) {
+					t.Errorf("Non-zero bytes in written part of mapping: %v", got)
+				}
+			})
+		})
+	}
+}
+
+func TestSwapUint32SegvError(t *testing.T) {
+	// Test that SwapUint32 returns a SegvError when reaching a page that
+	// signals SIGSEGV.
+	withSegvErrorTestMapping(t, func(mapping []byte) {
+		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		_, err := SwapUint32(unsafe.Pointer(secondPage), 1)
+		if want := (SegvError{secondPage}); err != want {
+			t.Errorf("Unexpected error: got %v, want %v", err, want)
+		}
+	})
+}
+
+func TestSwapUint32BusError(t *testing.T) {
+	// Test that SwapUint32 returns a BusError when reaching a page that
+	// signals SIGBUS.
+	withBusErrorTestMapping(t, func(mapping []byte) {
+		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		_, err := SwapUint32(unsafe.Pointer(secondPage), 1)
+		if want := (BusError{secondPage}); err != want {
+			t.Errorf("Unexpected error: got %v, want %v", err, want)
+		}
+	})
+}
+
+func TestSwapUint64SegvError(t *testing.T) {
+	// Test that SwapUint64 returns a SegvError when reaching a page that
+	// signals SIGSEGV.
+	withSegvErrorTestMapping(t, func(mapping []byte) {
+		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		_, err := SwapUint64(unsafe.Pointer(secondPage), 1)
+		if want := (SegvError{secondPage}); err != want {
+			t.Errorf("Unexpected error: got %v, want %v", err, want)
+		}
+	})
+}
+
+func TestSwapUint64BusError(t *testing.T) {
+	// Test that SwapUint64 returns a BusError when reaching a page that
+	// signals SIGBUS.
+	withBusErrorTestMapping(t, func(mapping []byte) {
+		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		_, err := SwapUint64(unsafe.Pointer(secondPage), 1)
+		if want := (BusError{secondPage}); err != want {
+			t.Errorf("Unexpected error: got %v, want %v", err, want)
+		}
+	})
+}
+
+func TestCompareAndSwapUint32SegvError(t *testing.T) {
+	// Test that CompareAndSwapUint32 returns a SegvError when reaching a page
+	// that signals SIGSEGV.
+	withSegvErrorTestMapping(t, func(mapping []byte) {
+		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		_, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1)
+		if want := (SegvError{secondPage}); err != want {
+			t.Errorf("Unexpected error: got %v, want %v", err, want)
+		}
+	})
+}
+
+func TestCompareAndSwapUint32BusError(t *testing.T) {
+	// Test that CompareAndSwapUint32 returns a BusError when reaching a page
+	// that signals SIGBUS.
+	withBusErrorTestMapping(t, func(mapping []byte) {
+		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		_, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1)
+		if want := (BusError{secondPage}); err != want {
+			t.Errorf("Unexpected error: got %v, want %v", err, want)
+		}
+	})
+}
+
+func testCopy(dst, src []byte) (panicked bool) {
+	defer func() {
+		if r := recover(); r != nil {
+			panicked = true
+		}
+	}()
+	debug.SetPanicOnFault(true)
+	copy(dst, src)
+	return
+}
+
+func TestSegVOnMemmove(t *testing.T) {
+	// Test that SIGSEGVs received by runtime.memmove when *not* doing
+	// CopyIn or CopyOut work gets propagated to the runtime.
+	const bufLen = pageSize
+	a, err := syscall.Mmap(-1, 0, bufLen, syscall.PROT_NONE, syscall.MAP_ANON|syscall.MAP_PRIVATE)
+	if err != nil {
+		t.Fatalf("Mmap failed: %v", err)
+
+	}
+	defer syscall.Munmap(a)
+	b := randBuf(bufLen)
+
+	if !testCopy(b, a) {
+		t.Fatalf("testCopy didn't panic when it should have")
+	}
+
+	if !testCopy(a, b) {
+		t.Fatalf("testCopy didn't panic when it should have")
+	}
+}
+
+func TestSigbusOnMemmove(t *testing.T) {
+	// Test that SIGBUS received by runtime.memmove when *not* doing
+	// CopyIn or CopyOut work gets propagated to the runtime.
+	const bufLen = pageSize
+	f, err := ioutil.TempFile("", "sigbus_test")
+	if err != nil {
+		t.Fatalf("TempFile failed: %v", err)
+	}
+	os.Remove(f.Name())
+	defer f.Close()
+
+	a, err := syscall.Mmap(int(f.Fd()), 0, bufLen, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
+	if err != nil {
+		t.Fatalf("Mmap failed: %v", err)
+
+	}
+	defer syscall.Munmap(a)
+	b := randBuf(bufLen)
+
+	if !testCopy(b, a) {
+		t.Fatalf("testCopy didn't panic when it should have")
+	}
+
+	if !testCopy(a, b) {
+		t.Fatalf("testCopy didn't panic when it should have")
+	}
+}
diff --git a/pkg/safecopy/safecopy_unsafe.go b/pkg/safecopy/safecopy_unsafe.go
new file mode 100644
index 000000000..eef028e68
--- /dev/null
+++ b/pkg/safecopy/safecopy_unsafe.go
@@ -0,0 +1,335 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package safecopy
+
+import (
+	"fmt"
+	"syscall"
+	"unsafe"
+)
+
+// maxRegisterSize is the maximum register size used in memcpy and memclr. It
+// is used to decide by how much to rewind the copy (for memcpy) or zeroing
+// (for memclr) before proceeding.
+const maxRegisterSize = 16
+
+// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received
+// during the copy, it returns the address that caused the fault and the number
+// of the signal that was received. Otherwise, it returns an unspecified address
+// and a signal number of 0.
+//
+// Data is copied in order, such that if a fault happens at address p, it is
+// safe to assume that all data before p-maxRegisterSize has already been
+// successfully copied.
+//
+//go:noescape
+func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+
+// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS
+// signal is received during the write, it returns the address that caused the
+// fault and the number of the signal that was received. Otherwise, it returns
+// an unspecified address and a signal number of 0.
+//
+// Data is written in order, such that if a fault happens at address p, it is
+// safe to assume that all data before p-maxRegisterSize has already been
+// successfully written.
+//
+//go:noescape
+func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+
+// swapUint32 atomically stores new into *ptr and returns (the previous *ptr
+// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
+// value of old is unspecified, and sig is the number of the signal that was
+// received.
+//
+// Preconditions: ptr must be aligned to a 4-byte boundary.
+//
+//go:noescape
+func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32)
+
+// swapUint64 atomically stores new into *ptr and returns (the previous *ptr
+// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
+// value of old is unspecified, and sig is the number of the signal that was
+// received.
+//
+// Preconditions: ptr must be aligned to a 8-byte boundary.
+//
+//go:noescape
+func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32)
+
+// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
+// (the value previously stored at ptr, 0). If a SIGSEGV or SIGBUS signal is
+// received during the operation, the value of prev is unspecified, and sig is
+// the number of the signal that was received.
+//
+// Preconditions: ptr must be aligned to a 4-byte boundary.
+//
+//go:noescape
+func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32)
+
+// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It
+// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr.
+//
+// Preconditions: ptr must be aligned to a 4-byte boundary.
+//
+//go:noescape
+func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
+
+// CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes
+// copied and an error if SIGSEGV or SIGBUS is received while reading from src.
+func CopyIn(dst []byte, src unsafe.Pointer) (int, error) {
+	toCopy := uintptr(len(dst))
+	if len(dst) == 0 {
+		return 0, nil
+	}
+
+	fault, sig := memcpy(unsafe.Pointer(&dst[0]), src, toCopy)
+	if sig == 0 {
+		return len(dst), nil
+	}
+
+	faultN, srcN := uintptr(fault), uintptr(src)
+	if faultN < srcN || faultN >= srcN+toCopy {
+		panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, faultN, srcN, srcN+toCopy))
+	}
+
+	// memcpy might have ended the copy up to maxRegisterSize bytes before
+	// fault, if an instruction caused a memory access that straddled two
+	// pages, and the second one faulted. Try to copy up to the fault.
+	var done int
+	if faultN-srcN > maxRegisterSize {
+		done = int(faultN - srcN - maxRegisterSize)
+	}
+	n, err := CopyIn(dst[done:int(faultN-srcN)], unsafe.Pointer(srcN+uintptr(done)))
+	done += n
+	if err != nil {
+		return done, err
+	}
+	return done, errorFromFaultSignal(fault, sig)
+}
+
+// CopyOut copies len(src) bytes from src to dst. If returns the number of
+// bytes done and an error if SIGSEGV or SIGBUS is received while writing to
+// dst.
+func CopyOut(dst unsafe.Pointer, src []byte) (int, error) {
+	toCopy := uintptr(len(src))
+	if toCopy == 0 {
+		return 0, nil
+	}
+
+	fault, sig := memcpy(dst, unsafe.Pointer(&src[0]), toCopy)
+	if sig == 0 {
+		return len(src), nil
+	}
+
+	faultN, dstN := uintptr(fault), uintptr(dst)
+	if faultN < dstN || faultN >= dstN+toCopy {
+		panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toCopy))
+	}
+
+	// memcpy might have ended the copy up to maxRegisterSize bytes before
+	// fault, if an instruction caused a memory access that straddled two
+	// pages, and the second one faulted. Try to copy up to the fault.
+	var done int
+	if faultN-dstN > maxRegisterSize {
+		done = int(faultN - dstN - maxRegisterSize)
+	}
+	n, err := CopyOut(unsafe.Pointer(dstN+uintptr(done)), src[done:int(faultN-dstN)])
+	done += n
+	if err != nil {
+		return done, err
+	}
+	return done, errorFromFaultSignal(fault, sig)
+}
+
+// Copy copies toCopy bytes from src to dst. It returns the number of bytes
+// copied and an error if SIGSEGV or SIGBUS is received while reading from src
+// or writing to dst.
+//
+// Data is copied in order; if [src, src+toCopy) and [dst, dst+toCopy) overlap,
+// the resulting contents of dst are unspecified.
+func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) {
+	if toCopy == 0 {
+		return 0, nil
+	}
+
+	fault, sig := memcpy(dst, src, toCopy)
+	if sig == 0 {
+		return toCopy, nil
+	}
+
+	// Did the fault occur while reading from src or writing to dst?
+	faultN, srcN, dstN := uintptr(fault), uintptr(src), uintptr(dst)
+	faultAfterSrc := ^uintptr(0)
+	if faultN >= srcN {
+		faultAfterSrc = faultN - srcN
+	}
+	faultAfterDst := ^uintptr(0)
+	if faultN >= dstN {
+		faultAfterDst = faultN - dstN
+	}
+	if faultAfterSrc >= toCopy && faultAfterDst >= toCopy {
+		panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, faultN, srcN, srcN+toCopy, dstN, dstN+toCopy))
+	}
+	faultedAfter := faultAfterSrc
+	if faultedAfter > faultAfterDst {
+		faultedAfter = faultAfterDst
+	}
+
+	// memcpy might have ended the copy up to maxRegisterSize bytes before
+	// fault, if an instruction caused a memory access that straddled two
+	// pages, and the second one faulted. Try to copy up to the fault.
+	var done uintptr
+	if faultedAfter > maxRegisterSize {
+		done = faultedAfter - maxRegisterSize
+	}
+	n, err := Copy(unsafe.Pointer(dstN+done), unsafe.Pointer(srcN+done), faultedAfter-done)
+	done += n
+	if err != nil {
+		return done, err
+	}
+	return done, errorFromFaultSignal(fault, sig)
+}
+
+// ZeroOut writes toZero zero bytes to dst. It returns the number of bytes
+// written and an error if SIGSEGV or SIGBUS is received while writing to dst.
+func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) {
+	if toZero == 0 {
+		return 0, nil
+	}
+
+	fault, sig := memclr(dst, toZero)
+	if sig == 0 {
+		return toZero, nil
+	}
+
+	faultN, dstN := uintptr(fault), uintptr(dst)
+	if faultN < dstN || faultN >= dstN+toZero {
+		panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toZero))
+	}
+
+	// memclr might have ended the write up to maxRegisterSize bytes before
+	// fault, if an instruction caused a memory access that straddled two
+	// pages, and the second one faulted. Try to write up to the fault.
+	var done uintptr
+	if faultN-dstN > maxRegisterSize {
+		done = faultN - dstN - maxRegisterSize
+	}
+	n, err := ZeroOut(unsafe.Pointer(dstN+done), faultN-dstN-done)
+	done += n
+	if err != nil {
+		return done, err
+	}
+	return done, errorFromFaultSignal(fault, sig)
+}
+
+// SwapUint32 is equivalent to sync/atomic.SwapUint32, except that it returns
+// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is
+// not aligned to a 4-byte boundary.
+func SwapUint32(ptr unsafe.Pointer, new uint32) (uint32, error) {
+	if addr := uintptr(ptr); addr&3 != 0 {
+		return 0, AlignmentError{addr, 4}
+	}
+	old, sig := swapUint32(ptr, new)
+	return old, errorFromFaultSignal(ptr, sig)
+}
+
+// SwapUint64 is equivalent to sync/atomic.SwapUint64, except that it returns
+// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is
+// not aligned to an 8-byte boundary.
+func SwapUint64(ptr unsafe.Pointer, new uint64) (uint64, error) {
+	if addr := uintptr(ptr); addr&7 != 0 {
+		return 0, AlignmentError{addr, 8}
+	}
+	old, sig := swapUint64(ptr, new)
+	return old, errorFromFaultSignal(ptr, sig)
+}
+
+// CompareAndSwapUint32 is equivalent to atomicbitops.CompareAndSwapUint32,
+// except that it returns an error if SIGSEGV or SIGBUS is received while
+// accessing ptr, or if ptr is not aligned to a 4-byte boundary.
+func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) {
+	if addr := uintptr(ptr); addr&3 != 0 {
+		return 0, AlignmentError{addr, 4}
+	}
+	prev, sig := compareAndSwapUint32(ptr, old, new)
+	return prev, errorFromFaultSignal(ptr, sig)
+}
+
+// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It
+// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr.
+//
+// Preconditions: ptr must be aligned to a 4-byte boundary.
+func LoadUint32(ptr unsafe.Pointer) (uint32, error) {
+	if addr := uintptr(ptr); addr&3 != 0 {
+		return 0, AlignmentError{addr, 4}
+	}
+	val, sig := loadUint32(ptr)
+	return val, errorFromFaultSignal(ptr, sig)
+}
+
+func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error {
+	switch sig {
+	case 0:
+		return nil
+	case int32(syscall.SIGSEGV):
+		return SegvError{uintptr(addr)}
+	case int32(syscall.SIGBUS):
+		return BusError{uintptr(addr)}
+	default:
+		panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr))
+	}
+}
+
+// ReplaceSignalHandler replaces the existing signal handler for the provided
+// signal with the one that handles faults in safecopy-protected functions.
+//
+// It stores the value of the previously set handler in previous.
+//
+// This function will be called on initialization in order to install safecopy
+// handlers for appropriate signals. These handlers will call the previous
+// handler however, and if this is function is being used externally then the
+// same courtesy is expected.
+func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr) error {
+	var sa struct {
+		handler  uintptr
+		flags    uint64
+		restorer uintptr
+		mask     uint64
+	}
+	const maskLen = 8
+
+	// Get the existing signal handler information, and save the current
+	// handler. Once we replace it, we will use this pointer to fall back to
+	// it when we receive other signals.
+	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), 0, uintptr(unsafe.Pointer(&sa)), maskLen, 0, 0); e != 0 {
+		return e
+	}
+
+	// Fail if there isn't a previous handler.
+	if sa.handler == 0 {
+		return fmt.Errorf("previous handler for signal %x isn't set", sig)
+	}
+
+	*previous = sa.handler
+
+	// Install our own handler.
+	sa.handler = handler
+	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 {
+		return e
+	}
+
+	return nil
+}
diff --git a/pkg/safecopy/sighandler_amd64.s b/pkg/safecopy/sighandler_amd64.s
new file mode 100644
index 000000000..475ae48e9
--- /dev/null
+++ b/pkg/safecopy/sighandler_amd64.s
@@ -0,0 +1,133 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// The signals handled by sigHandler.
+#define SIGBUS  7
+#define SIGSEGV 11
+
+// Offsets to the registers in context->uc_mcontext.gregs[].
+#define REG_RDI 0x68
+#define REG_RAX 0x90
+#define REG_IP  0xa8
+
+// Offset to the si_addr field of siginfo.
+#define SI_CODE 0x08
+#define SI_ADDR 0x10
+
+// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must
+// not be set up as a handler to any other signals.
+//
+// If the instruction causing the signal is within a safecopy-protected
+// function, the signal is handled such that execution resumes in the
+// appropriate fault handling stub with AX containing the faulting address and
+// DI containing the signal number. Otherwise control is transferred to the
+// previously configured signal handler (savedSigSegvHandler or
+// savedSigBusHandler).
+//
+// This function cannot be written in go because it runs whenever a signal is
+// received by the thread (preempting whatever was running), which includes when
+// garbage collector has stopped or isn't expecting any interactions (like
+// barriers).
+//
+// The arguments are the following:
+// DI - The signal number.
+// SI - Pointer to siginfo_t structure.
+// DX - Pointer to ucontext structure.
+TEXT ·signalHandler(SB),NOSPLIT,$0
+	// Check if the signal is from the kernel.
+	MOVQ $0x0, CX
+	CMPL CX, SI_CODE(SI)
+	JGE original_handler
+
+	// Check if RIP is within the area we care about.
+	MOVQ REG_IP(DX), CX
+	CMPQ CX, ·memcpyBegin(SB)
+	JB not_memcpy
+	CMPQ CX, ·memcpyEnd(SB)
+	JAE not_memcpy
+
+	// Modify the context such that execution will resume in the fault
+	// handler.
+	LEAQ handleMemcpyFault(SB), CX
+	JMP handle_fault
+
+not_memcpy:
+	CMPQ CX, ·memclrBegin(SB)
+	JB not_memclr
+	CMPQ CX, ·memclrEnd(SB)
+	JAE not_memclr
+
+	LEAQ handleMemclrFault(SB), CX
+	JMP handle_fault
+
+not_memclr:
+	CMPQ CX, ·swapUint32Begin(SB)
+	JB not_swapuint32
+	CMPQ CX, ·swapUint32End(SB)
+	JAE not_swapuint32
+
+	LEAQ handleSwapUint32Fault(SB), CX
+	JMP handle_fault
+
+not_swapuint32:
+	CMPQ CX, ·swapUint64Begin(SB)
+	JB not_swapuint64
+	CMPQ CX, ·swapUint64End(SB)
+	JAE not_swapuint64
+
+	LEAQ handleSwapUint64Fault(SB), CX
+	JMP handle_fault
+
+not_swapuint64:
+	CMPQ CX, ·compareAndSwapUint32Begin(SB)
+	JB not_casuint32
+	CMPQ CX, ·compareAndSwapUint32End(SB)
+	JAE not_casuint32
+
+	LEAQ handleCompareAndSwapUint32Fault(SB), CX
+	JMP handle_fault
+
+not_casuint32:
+	CMPQ CX, ·loadUint32Begin(SB)
+	JB not_loaduint32
+	CMPQ CX, ·loadUint32End(SB)
+	JAE not_loaduint32
+
+	LEAQ handleLoadUint32Fault(SB), CX
+	JMP handle_fault
+
+not_loaduint32:
+original_handler:
+	// Jump to the previous signal handler, which is likely the golang one.
+	XORQ CX, CX
+	MOVQ ·savedSigBusHandler(SB), AX
+	CMPL DI, $SIGSEGV
+	CMOVQEQ ·savedSigSegVHandler(SB), AX
+	JMP AX
+
+handle_fault:
+	// Entered with the address of the fault handler in RCX; store it in
+	// RIP.
+	MOVQ CX, REG_IP(DX)
+
+	// Store the faulting address in RAX.
+	MOVQ SI_ADDR(SI), CX
+	MOVQ CX, REG_RAX(DX)
+
+	// Store the signal number in EDI.
+	MOVL DI, REG_RDI(DX)
+
+	RET
diff --git a/pkg/safecopy/sighandler_arm64.s b/pkg/safecopy/sighandler_arm64.s
new file mode 100644
index 000000000..53e4ac2c1
--- /dev/null
+++ b/pkg/safecopy/sighandler_arm64.s
@@ -0,0 +1,143 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// The signals handled by sigHandler.
+#define SIGBUS 7
+#define SIGSEGV 11
+
+// Offsets to the registers in context->uc_mcontext.gregs[].
+#define REG_R0 0xB8
+#define REG_R1 0xC0
+#define REG_PC 0x1B8
+
+// Offset to the si_addr field of siginfo.
+#define SI_CODE 0x08
+#define SI_ADDR 0x10
+
+// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must
+// not be set up as a handler to any other signals.
+//
+// If the instruction causing the signal is within a safecopy-protected
+// function, the signal is handled such that execution resumes in the
+// appropriate fault handling stub with R0 containing the faulting address and
+// R1 containing the signal number. Otherwise control is transferred to the
+// previously configured signal handler (savedSigSegvHandler or
+// savedSigBusHandler).
+//
+// This function cannot be written in go because it runs whenever a signal is
+// received by the thread (preempting whatever was running), which includes when
+// garbage collector has stopped or isn't expecting any interactions (like
+// barriers).
+//
+// The arguments are the following:
+// R0 - The signal number.
+// R1 - Pointer to siginfo_t structure.
+// R2 - Pointer to ucontext structure.
+TEXT ·signalHandler(SB),NOSPLIT,$0
+	// Check if the signal is from the kernel, si_code > 0 means a kernel signal.
+	MOVD SI_CODE(R1), R7
+	CMPW $0x0, R7
+	BLE original_handler
+
+	// Check if PC is within the area we care about.
+	MOVD REG_PC(R2), R7
+	MOVD ·memcpyBegin(SB), R8
+	CMP R8, R7
+	BLO not_memcpy
+	MOVD ·memcpyEnd(SB), R8
+	CMP R8, R7
+	BHS not_memcpy
+
+	// Modify the context such that execution will resume in the fault handler.
+	MOVD $handleMemcpyFault(SB), R7
+	B handle_fault
+
+not_memcpy:
+	MOVD ·memclrBegin(SB), R8
+	CMP R8, R7
+	BLO not_memclr
+	MOVD ·memclrEnd(SB), R8
+	CMP R8, R7
+	BHS not_memclr
+
+	MOVD $handleMemclrFault(SB), R7
+	B handle_fault
+
+not_memclr:
+	MOVD ·swapUint32Begin(SB), R8
+	CMP R8, R7
+	BLO not_swapuint32
+	MOVD ·swapUint32End(SB), R8
+	CMP R8, R7
+	BHS not_swapuint32
+
+	MOVD $handleSwapUint32Fault(SB), R7
+	B handle_fault
+
+not_swapuint32:
+	MOVD ·swapUint64Begin(SB), R8
+	CMP R8, R7
+	BLO not_swapuint64
+	MOVD ·swapUint64End(SB), R8
+	CMP R8, R7
+	BHS not_swapuint64
+
+	MOVD $handleSwapUint64Fault(SB), R7
+	B handle_fault
+
+not_swapuint64:
+	MOVD ·compareAndSwapUint32Begin(SB), R8
+	CMP R8, R7
+	BLO not_casuint32
+	MOVD ·compareAndSwapUint32End(SB), R8
+	CMP R8, R7
+	BHS not_casuint32
+
+	MOVD $handleCompareAndSwapUint32Fault(SB), R7
+	B handle_fault
+
+not_casuint32:
+	MOVD ·loadUint32Begin(SB), R8
+	CMP R8, R7
+	BLO not_loaduint32
+	MOVD ·loadUint32End(SB), R8
+	CMP R8, R7
+	BHS not_loaduint32
+
+	MOVD $handleLoadUint32Fault(SB), R7
+	B handle_fault
+
+not_loaduint32:
+original_handler:
+	// Jump to the previous signal handler, which is likely the golang one.
+	MOVD ·savedSigBusHandler(SB), R7
+	MOVD ·savedSigSegVHandler(SB), R8
+	CMPW $SIGSEGV, R0
+	CSEL EQ, R8, R7, R7
+	B (R7)
+
+handle_fault:
+	// Entered with the address of the fault handler in R7; store it in PC.
+	MOVD R7, REG_PC(R2)
+
+	// Store the faulting address in R0.
+	MOVD SI_ADDR(R1), R7
+	MOVD R7, REG_R0(R2)
+
+	// Store the signal number in R1.
+	MOVW R0, REG_R1(R2)
+
+	RET
diff --git a/pkg/safemem/BUILD b/pkg/safemem/BUILD
new file mode 100644
index 000000000..ce30382ab
--- /dev/null
+++ b/pkg/safemem/BUILD
@@ -0,0 +1,27 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "safemem",
+    srcs = [
+        "block_unsafe.go",
+        "io.go",
+        "safemem.go",
+        "seq_unsafe.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/safecopy",
+    ],
+)
+
+go_test(
+    name = "safemem_test",
+    size = "small",
+    srcs = [
+        "io_test.go",
+        "seq_test.go",
+    ],
+    library = ":safemem",
+)
diff --git a/pkg/safemem/block_unsafe.go b/pkg/safemem/block_unsafe.go
new file mode 100644
index 000000000..e7fd30743
--- /dev/null
+++ b/pkg/safemem/block_unsafe.go
@@ -0,0 +1,279 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package safemem
+
+import (
+	"fmt"
+	"reflect"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/safecopy"
+)
+
+// A Block is a range of contiguous bytes, similar to []byte but with the
+// following differences:
+//
+// - The memory represented by a Block may require the use of safecopy to
+// access.
+//
+// - Block does not carry a capacity and cannot be expanded.
+//
+// Blocks are immutable and may be copied by value. The zero value of Block
+// represents an empty range, analogous to a nil []byte.
+type Block struct {
+	// [start, start+length) is the represented memory.
+	//
+	// start is an unsafe.Pointer to ensure that Block prevents the represented
+	// memory from being garbage-collected.
+	start  unsafe.Pointer
+	length int
+
+	// needSafecopy is true if accessing the represented memory requires the
+	// use of safecopy.
+	needSafecopy bool
+}
+
+// BlockFromSafeSlice returns a Block equivalent to slice, which is safe to
+// access without safecopy.
+func BlockFromSafeSlice(slice []byte) Block {
+	return blockFromSlice(slice, false)
+}
+
+// BlockFromUnsafeSlice returns a Block equivalent to bs, which is not safe to
+// access without safecopy.
+func BlockFromUnsafeSlice(slice []byte) Block {
+	return blockFromSlice(slice, true)
+}
+
+func blockFromSlice(slice []byte, needSafecopy bool) Block {
+	if len(slice) == 0 {
+		return Block{}
+	}
+	return Block{
+		start:        unsafe.Pointer(&slice[0]),
+		length:       len(slice),
+		needSafecopy: needSafecopy,
+	}
+}
+
+// BlockFromSafePointer returns a Block equivalent to [ptr, ptr+len), which is
+// safe to access without safecopy.
+//
+// Preconditions: ptr+len does not overflow.
+func BlockFromSafePointer(ptr unsafe.Pointer, len int) Block {
+	return blockFromPointer(ptr, len, false)
+}
+
+// BlockFromUnsafePointer returns a Block equivalent to [ptr, ptr+len), which
+// is not safe to access without safecopy.
+//
+// Preconditions: ptr+len does not overflow.
+func BlockFromUnsafePointer(ptr unsafe.Pointer, len int) Block {
+	return blockFromPointer(ptr, len, true)
+}
+
+func blockFromPointer(ptr unsafe.Pointer, len int, needSafecopy bool) Block {
+	if uptr := uintptr(ptr); uptr+uintptr(len) < uptr {
+		panic(fmt.Sprintf("ptr %#x + len %#x overflows", ptr, len))
+	}
+	return Block{
+		start:        ptr,
+		length:       len,
+		needSafecopy: needSafecopy,
+	}
+}
+
+// DropFirst returns a Block equivalent to b, but with the first n bytes
+// omitted. It is analogous to the [n:] operation on a slice, except that if n
+// > b.Len(), DropFirst returns an empty Block instead of panicking.
+//
+// Preconditions: n >= 0.
+func (b Block) DropFirst(n int) Block {
+	if n < 0 {
+		panic(fmt.Sprintf("invalid n: %d", n))
+	}
+	return b.DropFirst64(uint64(n))
+}
+
+// DropFirst64 is equivalent to DropFirst but takes a uint64.
+func (b Block) DropFirst64(n uint64) Block {
+	if n >= uint64(b.length) {
+		return Block{}
+	}
+	return Block{
+		start:        unsafe.Pointer(uintptr(b.start) + uintptr(n)),
+		length:       b.length - int(n),
+		needSafecopy: b.needSafecopy,
+	}
+}
+
+// TakeFirst returns a Block equivalent to the first n bytes of b. It is
+// analogous to the [:n] operation on a slice, except that if n > b.Len(),
+// TakeFirst returns a copy of b instead of panicking.
+//
+// Preconditions: n >= 0.
+func (b Block) TakeFirst(n int) Block {
+	if n < 0 {
+		panic(fmt.Sprintf("invalid n: %d", n))
+	}
+	return b.TakeFirst64(uint64(n))
+}
+
+// TakeFirst64 is equivalent to TakeFirst but takes a uint64.
+func (b Block) TakeFirst64(n uint64) Block {
+	if n == 0 {
+		return Block{}
+	}
+	if n >= uint64(b.length) {
+		return b
+	}
+	return Block{
+		start:        b.start,
+		length:       int(n),
+		needSafecopy: b.needSafecopy,
+	}
+}
+
+// ToSlice returns a []byte equivalent to b.
+func (b Block) ToSlice() []byte {
+	var bs []byte
+	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs))
+	hdr.Data = uintptr(b.start)
+	hdr.Len = b.length
+	hdr.Cap = b.length
+	return bs
+}
+
+// Addr returns b's start address as a uintptr. It returns uintptr instead of
+// unsafe.Pointer so that code using safemem cannot obtain unsafe.Pointers
+// without importing the unsafe package explicitly.
+//
+// Note that a uintptr is not recognized as a pointer by the garbage collector,
+// such that if there are no uses of b after a call to b.Addr() and the address
+// is to Go-managed memory, the returned uintptr does not prevent garbage
+// collection of the pointee.
+func (b Block) Addr() uintptr {
+	return uintptr(b.start)
+}
+
+// Len returns b's length in bytes.
+func (b Block) Len() int {
+	return b.length
+}
+
+// NeedSafecopy returns true if accessing b.ToSlice() requires the use of safecopy.
+func (b Block) NeedSafecopy() bool {
+	return b.needSafecopy
+}
+
+// String implements fmt.Stringer.String.
+func (b Block) String() string {
+	if uintptr(b.start) == 0 && b.length == 0 {
+		return "<nil>"
+	}
+	var suffix string
+	if b.needSafecopy {
+		suffix = "*"
+	}
+	return fmt.Sprintf("[%#x-%#x)%s", uintptr(b.start), uintptr(b.start)+uintptr(b.length), suffix)
+}
+
+// Copy copies src.Len() or dst.Len() bytes, whichever is less, from src
+// to dst and returns the number of bytes copied.
+//
+// If src and dst overlap, the data stored in dst is unspecified.
+func Copy(dst, src Block) (int, error) {
+	if !dst.needSafecopy && !src.needSafecopy {
+		return copy(dst.ToSlice(), src.ToSlice()), nil
+	}
+
+	n := dst.length
+	if n > src.length {
+		n = src.length
+	}
+	if n == 0 {
+		return 0, nil
+	}
+
+	switch {
+	case dst.needSafecopy && !src.needSafecopy:
+		return safecopy.CopyOut(dst.start, src.TakeFirst(n).ToSlice())
+	case !dst.needSafecopy && src.needSafecopy:
+		return safecopy.CopyIn(dst.TakeFirst(n).ToSlice(), src.start)
+	case dst.needSafecopy && src.needSafecopy:
+		n64, err := safecopy.Copy(dst.start, src.start, uintptr(n))
+		return int(n64), err
+	default:
+		panic("unreachable")
+	}
+}
+
+// Zero sets all bytes in dst to 0 and returns the number of bytes zeroed.
+func Zero(dst Block) (int, error) {
+	if !dst.needSafecopy {
+		bs := dst.ToSlice()
+		for i := range bs {
+			bs[i] = 0
+		}
+		return len(bs), nil
+	}
+
+	n64, err := safecopy.ZeroOut(dst.start, uintptr(dst.length))
+	return int(n64), err
+}
+
+// Safecopy atomics are no slower than non-safecopy atomics, so use the former
+// even when !b.needSafecopy to get consistent alignment checking.
+
+// SwapUint32 invokes safecopy.SwapUint32 on the first 4 bytes of b.
+//
+// Preconditions: b.Len() >= 4.
+func SwapUint32(b Block, new uint32) (uint32, error) {
+	if b.length < 4 {
+		panic(fmt.Sprintf("insufficient length: %d", b.length))
+	}
+	return safecopy.SwapUint32(b.start, new)
+}
+
+// SwapUint64 invokes safecopy.SwapUint64 on the first 8 bytes of b.
+//
+// Preconditions: b.Len() >= 8.
+func SwapUint64(b Block, new uint64) (uint64, error) {
+	if b.length < 8 {
+		panic(fmt.Sprintf("insufficient length: %d", b.length))
+	}
+	return safecopy.SwapUint64(b.start, new)
+}
+
+// CompareAndSwapUint32 invokes safecopy.CompareAndSwapUint32 on the first 4
+// bytes of b.
+//
+// Preconditions: b.Len() >= 4.
+func CompareAndSwapUint32(b Block, old, new uint32) (uint32, error) {
+	if b.length < 4 {
+		panic(fmt.Sprintf("insufficient length: %d", b.length))
+	}
+	return safecopy.CompareAndSwapUint32(b.start, old, new)
+}
+
+// LoadUint32 invokes safecopy.LoadUint32 on the first 4 bytes of b.
+//
+// Preconditions: b.Len() >= 4.
+func LoadUint32(b Block) (uint32, error) {
+	if b.length < 4 {
+		panic(fmt.Sprintf("insufficient length: %d", b.length))
+	}
+	return safecopy.LoadUint32(b.start)
+}
diff --git a/pkg/safemem/io.go b/pkg/safemem/io.go
new file mode 100644
index 000000000..f039a5c34
--- /dev/null
+++ b/pkg/safemem/io.go
@@ -0,0 +1,392 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package safemem
+
+import (
+	"errors"
+	"io"
+	"math"
+)
+
+// ErrEndOfBlockSeq is returned by BlockSeqWriter when attempting to write
+// beyond the end of the BlockSeq.
+var ErrEndOfBlockSeq = errors.New("write beyond end of BlockSeq")
+
+// Reader represents a streaming byte source like io.Reader.
+type Reader interface {
+	// ReadToBlocks reads up to dsts.NumBytes() bytes into dsts and returns the
+	// number of bytes read. It may return a partial read without an error
+	// (i.e. (n, nil) where 0 < n < dsts.NumBytes()). It should not return a
+	// full read with an error (i.e. (dsts.NumBytes(), err) where err != nil);
+	// note that this differs from io.Reader.Read (in particular, io.EOF should
+	// not be returned if ReadToBlocks successfully reads dsts.NumBytes()
+	// bytes.)
+	ReadToBlocks(dsts BlockSeq) (uint64, error)
+}
+
+// Writer represents a streaming byte sink like io.Writer.
+type Writer interface {
+	// WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns
+	// the number of bytes written. It may return a partial write without an
+	// error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not
+	// return a full write with an error (i.e. srcs.NumBytes(), err) where err
+	// != nil).
+	WriteFromBlocks(srcs BlockSeq) (uint64, error)
+}
+
+// ReadFullToBlocks repeatedly invokes r.ReadToBlocks until dsts.NumBytes()
+// bytes have been read or ReadToBlocks returns an error.
+func ReadFullToBlocks(r Reader, dsts BlockSeq) (uint64, error) {
+	var done uint64
+	for !dsts.IsEmpty() {
+		n, err := r.ReadToBlocks(dsts)
+		done += n
+		if err != nil {
+			return done, err
+		}
+		dsts = dsts.DropFirst64(n)
+	}
+	return done, nil
+}
+
+// WriteFullFromBlocks repeatedly invokes w.WriteFromBlocks until
+// srcs.NumBytes() bytes have been written or WriteFromBlocks returns an error.
+func WriteFullFromBlocks(w Writer, srcs BlockSeq) (uint64, error) {
+	var done uint64
+	for !srcs.IsEmpty() {
+		n, err := w.WriteFromBlocks(srcs)
+		done += n
+		if err != nil {
+			return done, err
+		}
+		srcs = srcs.DropFirst64(n)
+	}
+	return done, nil
+}
+
+// BlockSeqReader implements Reader by reading from a BlockSeq.
+type BlockSeqReader struct {
+	Blocks BlockSeq
+}
+
+// ReadToBlocks implements Reader.ReadToBlocks.
+func (r *BlockSeqReader) ReadToBlocks(dsts BlockSeq) (uint64, error) {
+	n, err := CopySeq(dsts, r.Blocks)
+	r.Blocks = r.Blocks.DropFirst64(n)
+	if err != nil {
+		return n, err
+	}
+	if n < dsts.NumBytes() {
+		return n, io.EOF
+	}
+	return n, nil
+}
+
+// BlockSeqWriter implements Writer by writing to a BlockSeq.
+type BlockSeqWriter struct {
+	Blocks BlockSeq
+}
+
+// WriteFromBlocks implements Writer.WriteFromBlocks.
+func (w *BlockSeqWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) {
+	n, err := CopySeq(w.Blocks, srcs)
+	w.Blocks = w.Blocks.DropFirst64(n)
+	if err != nil {
+		return n, err
+	}
+	if n < srcs.NumBytes() {
+		return n, ErrEndOfBlockSeq
+	}
+	return n, nil
+}
+
+// ReaderFunc implements Reader for a function with the semantics of
+// Reader.ReadToBlocks.
+type ReaderFunc func(dsts BlockSeq) (uint64, error)
+
+// ReadToBlocks implements Reader.ReadToBlocks.
+func (f ReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) {
+	return f(dsts)
+}
+
+// WriterFunc implements Writer for a function with the semantics of
+// Writer.WriteFromBlocks.
+type WriterFunc func(srcs BlockSeq) (uint64, error)
+
+// WriteFromBlocks implements Writer.WriteFromBlocks.
+func (f WriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) {
+	return f(srcs)
+}
+
+// ToIOReader implements io.Reader for a (safemem.)Reader.
+//
+// ToIOReader will return a successful partial read iff Reader.ReadToBlocks does
+// so.
+type ToIOReader struct {
+	Reader Reader
+}
+
+// Read implements io.Reader.Read.
+func (r ToIOReader) Read(dst []byte) (int, error) {
+	n, err := r.Reader.ReadToBlocks(BlockSeqOf(BlockFromSafeSlice(dst)))
+	return int(n), err
+}
+
+// ToIOWriter implements io.Writer for a (safemem.)Writer.
+type ToIOWriter struct {
+	Writer Writer
+}
+
+// Write implements io.Writer.Write.
+func (w ToIOWriter) Write(src []byte) (int, error) {
+	// io.Writer does not permit partial writes.
+	n, err := WriteFullFromBlocks(w.Writer, BlockSeqOf(BlockFromSafeSlice(src)))
+	return int(n), err
+}
+
+// FromIOReader implements Reader for an io.Reader by repeatedly invoking
+// io.Reader.Read until it returns an error or partial read. This is not
+// thread-safe.
+//
+// FromIOReader will return a successful partial read iff Reader.Read does so.
+type FromIOReader struct {
+	Reader io.Reader
+}
+
+// ReadToBlocks implements Reader.ReadToBlocks.
+func (r FromIOReader) ReadToBlocks(dsts BlockSeq) (uint64, error) {
+	var buf []byte
+	var done uint64
+	for !dsts.IsEmpty() {
+		dst := dsts.Head()
+		var n int
+		var err error
+		n, buf, err = r.readToBlock(dst, buf)
+		done += uint64(n)
+		if n != dst.Len() {
+			return done, err
+		}
+		dsts = dsts.Tail()
+		if err != nil {
+			if dsts.IsEmpty() && err == io.EOF {
+				return done, nil
+			}
+			return done, err
+		}
+	}
+	return done, nil
+}
+
+func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) {
+	// io.Reader isn't safecopy-aware, so we have to buffer Blocks that require
+	// safecopy.
+	if !dst.NeedSafecopy() {
+		n, err := r.Reader.Read(dst.ToSlice())
+		return n, buf, err
+	}
+	if len(buf) < dst.Len() {
+		buf = make([]byte, dst.Len())
+	}
+	rn, rerr := r.Reader.Read(buf[:dst.Len()])
+	wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn]))
+	if wberr != nil {
+		return wbn, buf, wberr
+	}
+	return wbn, buf, rerr
+}
+
+// FromIOReaderAt implements Reader for an io.ReaderAt. Does not repeatedly
+// invoke io.ReaderAt.ReadAt because ReadAt is more strict than Read. A partial
+// read indicates an error. This is not thread-safe.
+type FromIOReaderAt struct {
+	ReaderAt io.ReaderAt
+	Offset   int64
+}
+
+// ReadToBlocks implements Reader.ReadToBlocks.
+func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) {
+	var buf []byte
+	var done uint64
+	for !dsts.IsEmpty() {
+		dst := dsts.Head()
+		var n int
+		var err error
+		n, buf, err = r.readToBlock(dst, buf)
+		done += uint64(n)
+		if n != dst.Len() {
+			return done, err
+		}
+		dsts = dsts.Tail()
+		if err != nil {
+			if dsts.IsEmpty() && err == io.EOF {
+				return done, nil
+			}
+			return done, err
+		}
+	}
+	return done, nil
+}
+
+func (r FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) {
+	// io.Reader isn't safecopy-aware, so we have to buffer Blocks that require
+	// safecopy.
+	if !dst.NeedSafecopy() {
+		n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset)
+		r.Offset += int64(n)
+		return n, buf, err
+	}
+	if len(buf) < dst.Len() {
+		buf = make([]byte, dst.Len())
+	}
+	rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset)
+	r.Offset += int64(rn)
+	wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn]))
+	if wberr != nil {
+		return wbn, buf, wberr
+	}
+	return wbn, buf, rerr
+}
+
+// FromIOWriter implements Writer for an io.Writer by repeatedly invoking
+// io.Writer.Write until it returns an error or partial write.
+//
+// FromIOWriter will tolerate implementations of io.Writer.Write that return
+// partial writes with a nil error in contravention of io.Writer's
+// requirements, since Writer is permitted to do so. FromIOWriter will return a
+// successful partial write iff Writer.Write does so.
+type FromIOWriter struct {
+	Writer io.Writer
+}
+
+// WriteFromBlocks implements Writer.WriteFromBlocks.
+func (w FromIOWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) {
+	var buf []byte
+	var done uint64
+	for !srcs.IsEmpty() {
+		src := srcs.Head()
+		var n int
+		var err error
+		n, buf, err = w.writeFromBlock(src, buf)
+		done += uint64(n)
+		if n != src.Len() || err != nil {
+			return done, err
+		}
+		srcs = srcs.Tail()
+	}
+	return done, nil
+}
+
+func (w FromIOWriter) writeFromBlock(src Block, buf []byte) (int, []byte, error) {
+	// io.Writer isn't safecopy-aware, so we have to buffer Blocks that require
+	// safecopy.
+	if !src.NeedSafecopy() {
+		n, err := w.Writer.Write(src.ToSlice())
+		return n, buf, err
+	}
+	if len(buf) < src.Len() {
+		buf = make([]byte, src.Len())
+	}
+	bufn, buferr := Copy(BlockFromSafeSlice(buf[:src.Len()]), src)
+	wn, werr := w.Writer.Write(buf[:bufn])
+	if werr != nil {
+		return wn, buf, werr
+	}
+	return wn, buf, buferr
+}
+
+// FromVecReaderFunc implements Reader for a function that reads data into a
+// [][]byte and returns the number of bytes read as an int64.
+type FromVecReaderFunc struct {
+	ReadVec func(dsts [][]byte) (int64, error)
+}
+
+// ReadToBlocks implements Reader.ReadToBlocks.
+//
+// ReadToBlocks calls r.ReadVec at most once.
+func (r FromVecReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+	// Ensure that we don't pass a [][]byte with a total length > MaxInt64.
+	dsts = dsts.TakeFirst64(uint64(math.MaxInt64))
+	dstSlices := make([][]byte, 0, dsts.NumBlocks())
+	// Buffer Blocks that require safecopy.
+	for tmp := dsts; !tmp.IsEmpty(); tmp = tmp.Tail() {
+		dst := tmp.Head()
+		if dst.NeedSafecopy() {
+			dstSlices = append(dstSlices, make([]byte, dst.Len()))
+		} else {
+			dstSlices = append(dstSlices, dst.ToSlice())
+		}
+	}
+	rn, rerr := r.ReadVec(dstSlices)
+	dsts = dsts.TakeFirst64(uint64(rn))
+	var done uint64
+	var i int
+	for !dsts.IsEmpty() {
+		dst := dsts.Head()
+		if dst.NeedSafecopy() {
+			n, err := Copy(dst, BlockFromSafeSlice(dstSlices[i]))
+			done += uint64(n)
+			if err != nil {
+				return done, err
+			}
+		} else {
+			done += uint64(dst.Len())
+		}
+		dsts = dsts.Tail()
+		i++
+	}
+	return done, rerr
+}
+
+// FromVecWriterFunc implements Writer for a function that writes data from a
+// [][]byte and returns the number of bytes written.
+type FromVecWriterFunc struct {
+	WriteVec func(srcs [][]byte) (int64, error)
+}
+
+// WriteFromBlocks implements Writer.WriteFromBlocks.
+//
+// WriteFromBlocks calls w.WriteVec at most once.
+func (w FromVecWriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+	// Ensure that we don't pass a [][]byte with a total length > MaxInt64.
+	srcs = srcs.TakeFirst64(uint64(math.MaxInt64))
+	srcSlices := make([][]byte, 0, srcs.NumBlocks())
+	// Buffer Blocks that require safecopy.
+	var buferr error
+	for tmp := srcs; !tmp.IsEmpty(); tmp = tmp.Tail() {
+		src := tmp.Head()
+		if src.NeedSafecopy() {
+			slice := make([]byte, src.Len())
+			n, err := Copy(BlockFromSafeSlice(slice), src)
+			srcSlices = append(srcSlices, slice[:n])
+			if err != nil {
+				buferr = err
+				break
+			}
+		} else {
+			srcSlices = append(srcSlices, src.ToSlice())
+		}
+	}
+	n, err := w.WriteVec(srcSlices)
+	if err != nil {
+		return uint64(n), err
+	}
+	return uint64(n), buferr
+}
diff --git a/pkg/safemem/io_test.go b/pkg/safemem/io_test.go
new file mode 100644
index 000000000..629741bee
--- /dev/null
+++ b/pkg/safemem/io_test.go
@@ -0,0 +1,199 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package safemem
+
+import (
+	"bytes"
+	"io"
+	"testing"
+)
+
+func makeBlocks(slices ...[]byte) []Block {
+	blocks := make([]Block, 0, len(slices))
+	for _, s := range slices {
+		blocks = append(blocks, BlockFromSafeSlice(s))
+	}
+	return blocks
+}
+
+func TestFromIOReaderFullRead(t *testing.T) {
+	r := FromIOReader{bytes.NewBufferString("foobar")}
+	dsts := makeBlocks(make([]byte, 3), make([]byte, 3))
+	n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts))
+	if wantN := uint64(6); n != wantN || err != nil {
+		t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	for i, want := range [][]byte{[]byte("foo"), []byte("bar")} {
+		if got := dsts[i].ToSlice(); !bytes.Equal(got, want) {
+			t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want)
+		}
+	}
+}
+
+type eofHidingReader struct {
+	Reader io.Reader
+}
+
+func (r eofHidingReader) Read(dst []byte) (int, error) {
+	n, err := r.Reader.Read(dst)
+	if err == io.EOF {
+		return n, nil
+	}
+	return n, err
+}
+
+func TestFromIOReaderPartialRead(t *testing.T) {
+	r := FromIOReader{eofHidingReader{bytes.NewBufferString("foob")}}
+	dsts := makeBlocks(make([]byte, 3), make([]byte, 3))
+	n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts))
+	// FromIOReader should stop after the eofHidingReader returns (1, nil)
+	// for a 3-byte read.
+	if wantN := uint64(4); n != wantN || err != nil {
+		t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	for i, want := range [][]byte{[]byte("foo"), []byte("b\x00\x00")} {
+		if got := dsts[i].ToSlice(); !bytes.Equal(got, want) {
+			t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want)
+		}
+	}
+}
+
+type singleByteReader struct {
+	Reader io.Reader
+}
+
+func (r singleByteReader) Read(dst []byte) (int, error) {
+	if len(dst) == 0 {
+		return r.Reader.Read(dst)
+	}
+	return r.Reader.Read(dst[:1])
+}
+
+func TestSingleByteReader(t *testing.T) {
+	r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}}
+	dsts := makeBlocks(make([]byte, 3), make([]byte, 3))
+	n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts))
+	// FromIOReader should stop after the singleByteReader returns (1, nil)
+	// for a 3-byte read.
+	if wantN := uint64(1); n != wantN || err != nil {
+		t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	for i, want := range [][]byte{[]byte("f\x00\x00"), []byte("\x00\x00\x00")} {
+		if got := dsts[i].ToSlice(); !bytes.Equal(got, want) {
+			t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want)
+		}
+	}
+}
+
+func TestReadFullToBlocks(t *testing.T) {
+	r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}}
+	dsts := makeBlocks(make([]byte, 3), make([]byte, 3))
+	n, err := ReadFullToBlocks(r, BlockSeqFromSlice(dsts))
+	// ReadFullToBlocks should call into FromIOReader => singleByteReader
+	// repeatedly until dsts is exhausted.
+	if wantN := uint64(6); n != wantN || err != nil {
+		t.Errorf("ReadFullToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	for i, want := range [][]byte{[]byte("foo"), []byte("bar")} {
+		if got := dsts[i].ToSlice(); !bytes.Equal(got, want) {
+			t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want)
+		}
+	}
+}
+
+func TestFromIOWriterFullWrite(t *testing.T) {
+	srcs := makeBlocks([]byte("foo"), []byte("bar"))
+	var dst bytes.Buffer
+	w := FromIOWriter{&dst}
+	n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs))
+	if wantN := uint64(6); n != wantN || err != nil {
+		t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) {
+		t.Errorf("dst: got %q, wanted %q", got, want)
+	}
+}
+
+type limitedWriter struct {
+	Writer io.Writer
+	Done   int
+	Limit  int
+}
+
+func (w *limitedWriter) Write(src []byte) (int, error) {
+	count := len(src)
+	if count > (w.Limit - w.Done) {
+		count = w.Limit - w.Done
+	}
+	n, err := w.Writer.Write(src[:count])
+	w.Done += n
+	return n, err
+}
+
+func TestFromIOWriterPartialWrite(t *testing.T) {
+	srcs := makeBlocks([]byte("foo"), []byte("bar"))
+	var dst bytes.Buffer
+	w := FromIOWriter{&limitedWriter{&dst, 0, 4}}
+	n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs))
+	// FromIOWriter should stop after the limitedWriter returns (1, nil) for a
+	// 3-byte write.
+	if wantN := uint64(4); n != wantN || err != nil {
+		t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) {
+		t.Errorf("dst: got %q, wanted %q", got, want)
+	}
+}
+
+type singleByteWriter struct {
+	Writer io.Writer
+}
+
+func (w singleByteWriter) Write(src []byte) (int, error) {
+	if len(src) == 0 {
+		return w.Writer.Write(src)
+	}
+	return w.Writer.Write(src[:1])
+}
+
+func TestSingleByteWriter(t *testing.T) {
+	srcs := makeBlocks([]byte("foo"), []byte("bar"))
+	var dst bytes.Buffer
+	w := FromIOWriter{singleByteWriter{&dst}}
+	n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs))
+	// FromIOWriter should stop after the singleByteWriter returns (1, nil)
+	// for a 3-byte write.
+	if wantN := uint64(1); n != wantN || err != nil {
+		t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if got, want := dst.Bytes(), []byte("f"); !bytes.Equal(got, want) {
+		t.Errorf("dst: got %q, wanted %q", got, want)
+	}
+}
+
+func TestWriteFullToBlocks(t *testing.T) {
+	srcs := makeBlocks([]byte("foo"), []byte("bar"))
+	var dst bytes.Buffer
+	w := FromIOWriter{singleByteWriter{&dst}}
+	n, err := WriteFullFromBlocks(w, BlockSeqFromSlice(srcs))
+	// WriteFullToBlocks should call into FromIOWriter => singleByteWriter
+	// repeatedly until srcs is exhausted.
+	if wantN := uint64(6); n != wantN || err != nil {
+		t.Errorf("WriteFullFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) {
+		t.Errorf("dst: got %q, wanted %q", got, want)
+	}
+}
diff --git a/pkg/safemem/safemem.go b/pkg/safemem/safemem.go
new file mode 100644
index 000000000..3e70d33a2
--- /dev/null
+++ b/pkg/safemem/safemem.go
@@ -0,0 +1,16 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package safemem provides the Block and BlockSeq types.
+package safemem
diff --git a/pkg/safemem/seq_test.go b/pkg/safemem/seq_test.go
new file mode 100644
index 000000000..eba4bb535
--- /dev/null
+++ b/pkg/safemem/seq_test.go
@@ -0,0 +1,196 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package safemem
+
+import (
+	"bytes"
+	"reflect"
+	"testing"
+)
+
+type blockSeqTest struct {
+	desc string
+
+	pieces     []string
+	haveOffset bool
+	offset     uint64
+	haveLimit  bool
+	limit      uint64
+
+	want string
+}
+
+func (t blockSeqTest) NonEmptyByteSlices() [][]byte {
+	// t is a value, so we can mutate it freely.
+	slices := make([][]byte, 0, len(t.pieces))
+	for _, str := range t.pieces {
+		if t.haveOffset {
+			strOff := t.offset
+			if strOff > uint64(len(str)) {
+				strOff = uint64(len(str))
+			}
+			str = str[strOff:]
+			t.offset -= strOff
+		}
+		if t.haveLimit {
+			strLim := t.limit
+			if strLim > uint64(len(str)) {
+				strLim = uint64(len(str))
+			}
+			str = str[:strLim]
+			t.limit -= strLim
+		}
+		if len(str) != 0 {
+			slices = append(slices, []byte(str))
+		}
+	}
+	return slices
+}
+
+func (t blockSeqTest) BlockSeq() BlockSeq {
+	blocks := make([]Block, 0, len(t.pieces))
+	for _, str := range t.pieces {
+		blocks = append(blocks, BlockFromSafeSlice([]byte(str)))
+	}
+	bs := BlockSeqFromSlice(blocks)
+	if t.haveOffset {
+		bs = bs.DropFirst64(t.offset)
+	}
+	if t.haveLimit {
+		bs = bs.TakeFirst64(t.limit)
+	}
+	return bs
+}
+
+var blockSeqTests = []blockSeqTest{
+	{
+		desc: "Empty sequence",
+	},
+	{
+		desc:   "Sequence of length 1",
+		pieces: []string{"foobar"},
+		want:   "foobar",
+	},
+	{
+		desc:   "Sequence of length 2",
+		pieces: []string{"foo", "bar"},
+		want:   "foobar",
+	},
+	{
+		desc:   "Empty Blocks",
+		pieces: []string{"", "foo", "", "", "bar", ""},
+		want:   "foobar",
+	},
+	{
+		desc:       "Sequence with non-zero offset",
+		pieces:     []string{"foo", "bar"},
+		haveOffset: true,
+		offset:     2,
+		want:       "obar",
+	},
+	{
+		desc:      "Sequence with non-maximal limit",
+		pieces:    []string{"foo", "bar"},
+		haveLimit: true,
+		limit:     5,
+		want:      "fooba",
+	},
+	{
+		desc:       "Sequence with offset and limit",
+		pieces:     []string{"foo", "bar"},
+		haveOffset: true,
+		offset:     2,
+		haveLimit:  true,
+		limit:      3,
+		want:       "oba",
+	},
+}
+
+func TestBlockSeqNumBytes(t *testing.T) {
+	for _, test := range blockSeqTests {
+		t.Run(test.desc, func(t *testing.T) {
+			if got, want := test.BlockSeq().NumBytes(), uint64(len(test.want)); got != want {
+				t.Errorf("NumBytes: got %d, wanted %d", got, want)
+			}
+		})
+	}
+}
+
+func TestBlockSeqIterBlocks(t *testing.T) {
+	// Tests BlockSeq iteration using Head/Tail.
+	for _, test := range blockSeqTests {
+		t.Run(test.desc, func(t *testing.T) {
+			srcs := test.BlockSeq()
+			// "Note that a non-nil empty slice and a nil slice ... are not
+			// deeply equal." - reflect
+			slices := make([][]byte, 0, 0)
+			for !srcs.IsEmpty() {
+				src := srcs.Head()
+				slices = append(slices, src.ToSlice())
+				nextSrcs := srcs.Tail()
+				if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-uint64(src.Len()); got != want {
+					t.Fatalf("%v.Tail(): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want)
+				}
+				srcs = nextSrcs
+			}
+			if wantSlices := test.NonEmptyByteSlices(); !reflect.DeepEqual(slices, wantSlices) {
+				t.Errorf("Accumulated slices: got %v, wanted %v", slices, wantSlices)
+			}
+		})
+	}
+}
+
+func TestBlockSeqIterBytes(t *testing.T) {
+	// Tests BlockSeq iteration using Head/DropFirst.
+	for _, test := range blockSeqTests {
+		t.Run(test.desc, func(t *testing.T) {
+			srcs := test.BlockSeq()
+			var dst bytes.Buffer
+			for !srcs.IsEmpty() {
+				src := srcs.Head()
+				var b [1]byte
+				n, err := Copy(BlockFromSafeSlice(b[:]), src)
+				if n != 1 || err != nil {
+					t.Fatalf("Copy: got (%v, %v), wanted (1, nil)", n, err)
+				}
+				dst.WriteByte(b[0])
+				nextSrcs := srcs.DropFirst(1)
+				if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-1; got != want {
+					t.Fatalf("%v.DropFirst(1): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want)
+				}
+				srcs = nextSrcs
+			}
+			if got := string(dst.Bytes()); got != test.want {
+				t.Errorf("Copied string: got %q, wanted %q", got, test.want)
+			}
+		})
+	}
+}
+
+func TestBlockSeqDropBeyondLimit(t *testing.T) {
+	blocks := []Block{BlockFromSafeSlice([]byte("123")), BlockFromSafeSlice([]byte("4"))}
+	bs := BlockSeqFromSlice(blocks)
+	if got, want := bs.NumBytes(), uint64(4); got != want {
+		t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want)
+	}
+	bs = bs.TakeFirst(1)
+	if got, want := bs.NumBytes(), uint64(1); got != want {
+		t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want)
+	}
+	bs = bs.DropFirst(2)
+	if got, want := bs.NumBytes(), uint64(0); got != want {
+		t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want)
+	}
+}
diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go
new file mode 100644
index 000000000..354a95dde
--- /dev/null
+++ b/pkg/safemem/seq_unsafe.go
@@ -0,0 +1,299 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package safemem
+
+import (
+	"bytes"
+	"fmt"
+	"reflect"
+	"unsafe"
+)
+
+// A BlockSeq represents a sequence of Blocks, each of which has non-zero
+// length.
+//
+// BlockSeqs are immutable and may be copied by value. The zero value of
+// BlockSeq represents an empty sequence.
+type BlockSeq struct {
+	// If length is 0, then the BlockSeq is empty. Invariants: data == 0;
+	// offset == 0; limit == 0.
+	//
+	// If length is -1, then the BlockSeq represents the single Block{data,
+	// limit, false}. Invariants: offset == 0; limit > 0; limit does not
+	// overflow the range of an int.
+	//
+	// If length is -2, then the BlockSeq represents the single Block{data,
+	// limit, true}. Invariants: offset == 0; limit > 0; limit does not
+	// overflow the range of an int.
+	//
+	// Otherwise, length >= 2, and the BlockSeq represents the `length` Blocks
+	// in the array of Blocks starting at address `data`, starting at `offset`
+	// bytes into the first Block and limited to the following `limit` bytes.
+	// Invariants: data != 0; offset < len(data[0]); limit > 0; offset+limit <=
+	// the combined length of all Blocks in the array; the first Block in the
+	// array has non-zero length.
+	//
+	// length is never 1; sequences consisting of a single Block are always
+	// stored inline (with length < 0).
+	data   unsafe.Pointer
+	length int
+	offset int
+	limit  uint64
+}
+
+// BlockSeqOf returns a BlockSeq representing the single Block b.
+func BlockSeqOf(b Block) BlockSeq {
+	bs := BlockSeq{
+		data:   b.start,
+		length: -1,
+		limit:  uint64(b.length),
+	}
+	if b.needSafecopy {
+		bs.length = -2
+	}
+	return bs
+}
+
+// BlockSeqFromSlice returns a BlockSeq representing all Blocks in slice.
+// If slice contains Blocks with zero length, BlockSeq will skip them during
+// iteration.
+//
+// Whether the returned BlockSeq shares memory with slice is unspecified;
+// clients should avoid mutating slices passed to BlockSeqFromSlice.
+//
+// Preconditions: The combined length of all Blocks in slice <= math.MaxUint64.
+func BlockSeqFromSlice(slice []Block) BlockSeq {
+	slice = skipEmpty(slice)
+	var limit uint64
+	for _, b := range slice {
+		sum := limit + uint64(b.Len())
+		if sum < limit {
+			panic("BlockSeq length overflows uint64")
+		}
+		limit = sum
+	}
+	return blockSeqFromSliceLimited(slice, limit)
+}
+
+// Preconditions: The combined length of all Blocks in slice <= limit. If
+// len(slice) != 0, the first Block in slice has non-zero length, and limit >
+// 0.
+func blockSeqFromSliceLimited(slice []Block, limit uint64) BlockSeq {
+	switch len(slice) {
+	case 0:
+		return BlockSeq{}
+	case 1:
+		return BlockSeqOf(slice[0].TakeFirst64(limit))
+	default:
+		return BlockSeq{
+			data:   unsafe.Pointer(&slice[0]),
+			length: len(slice),
+			limit:  limit,
+		}
+	}
+}
+
+func skipEmpty(slice []Block) []Block {
+	for i, b := range slice {
+		if b.Len() != 0 {
+			return slice[i:]
+		}
+	}
+	return nil
+}
+
+// IsEmpty returns true if bs contains no Blocks.
+//
+// Invariants: bs.IsEmpty() == (bs.NumBlocks() == 0) == (bs.NumBytes() == 0).
+// (Of these, prefer to use bs.IsEmpty().)
+func (bs BlockSeq) IsEmpty() bool {
+	return bs.length == 0
+}
+
+// NumBlocks returns the number of Blocks in bs.
+func (bs BlockSeq) NumBlocks() int {
+	// In general, we have to count: if bs represents a windowed slice then the
+	// slice may contain Blocks with zero length, and bs.length may be larger
+	// than the actual number of Blocks due to bs.limit.
+	var n int
+	for !bs.IsEmpty() {
+		n++
+		bs = bs.Tail()
+	}
+	return n
+}
+
+// NumBytes returns the sum of Block.Len() for all Blocks in bs.
+func (bs BlockSeq) NumBytes() uint64 {
+	return bs.limit
+}
+
+// Head returns the first Block in bs.
+//
+// Preconditions: !bs.IsEmpty().
+func (bs BlockSeq) Head() Block {
+	if bs.length == 0 {
+		panic("empty BlockSeq")
+	}
+	if bs.length < 0 {
+		return bs.internalBlock()
+	}
+	return (*Block)(bs.data).DropFirst(bs.offset).TakeFirst64(bs.limit)
+}
+
+// Preconditions: bs.length < 0.
+func (bs BlockSeq) internalBlock() Block {
+	return Block{
+		start:        bs.data,
+		length:       int(bs.limit),
+		needSafecopy: bs.length == -2,
+	}
+}
+
+// Tail returns a BlockSeq consisting of all Blocks in bs after the first.
+//
+// Preconditions: !bs.IsEmpty().
+func (bs BlockSeq) Tail() BlockSeq {
+	if bs.length == 0 {
+		panic("empty BlockSeq")
+	}
+	if bs.length < 0 {
+		return BlockSeq{}
+	}
+	head := (*Block)(bs.data).DropFirst(bs.offset)
+	headLen := uint64(head.Len())
+	if headLen >= bs.limit {
+		// The head Block exhausts the limit, so the tail is empty.
+		return BlockSeq{}
+	}
+	var extSlice []Block
+	extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice))
+	extSliceHdr.Data = uintptr(bs.data)
+	extSliceHdr.Len = bs.length
+	extSliceHdr.Cap = bs.length
+	tailSlice := skipEmpty(extSlice[1:])
+	tailLimit := bs.limit - headLen
+	return blockSeqFromSliceLimited(tailSlice, tailLimit)
+}
+
+// DropFirst returns a BlockSeq equivalent to bs, but with the first n bytes
+// omitted. If n > bs.NumBytes(), DropFirst returns an empty BlockSeq.
+//
+// Preconditions: n >= 0.
+func (bs BlockSeq) DropFirst(n int) BlockSeq {
+	if n < 0 {
+		panic(fmt.Sprintf("invalid n: %d", n))
+	}
+	return bs.DropFirst64(uint64(n))
+}
+
+// DropFirst64 is equivalent to DropFirst but takes an uint64.
+func (bs BlockSeq) DropFirst64(n uint64) BlockSeq {
+	if n >= bs.limit {
+		return BlockSeq{}
+	}
+	for {
+		// Calling bs.Head() here is surprisingly expensive, so inline getting
+		// the head's length.
+		var headLen uint64
+		if bs.length < 0 {
+			headLen = bs.limit
+		} else {
+			headLen = uint64((*Block)(bs.data).Len() - bs.offset)
+		}
+		if n < headLen {
+			// Dropping ends partway through the head Block.
+			if bs.length < 0 {
+				return BlockSeqOf(bs.internalBlock().DropFirst64(n))
+			}
+			bs.offset += int(n)
+			bs.limit -= n
+			return bs
+		}
+		n -= headLen
+		bs = bs.Tail()
+	}
+}
+
+// TakeFirst returns a BlockSeq equivalent to the first n bytes of bs. If n >
+// bs.NumBytes(), TakeFirst returns a BlockSeq equivalent to bs.
+//
+// Preconditions: n >= 0.
+func (bs BlockSeq) TakeFirst(n int) BlockSeq {
+	if n < 0 {
+		panic(fmt.Sprintf("invalid n: %d", n))
+	}
+	return bs.TakeFirst64(uint64(n))
+}
+
+// TakeFirst64 is equivalent to TakeFirst but takes a uint64.
+func (bs BlockSeq) TakeFirst64(n uint64) BlockSeq {
+	if n == 0 {
+		return BlockSeq{}
+	}
+	if bs.limit > n {
+		bs.limit = n
+	}
+	return bs
+}
+
+// String implements fmt.Stringer.String.
+func (bs BlockSeq) String() string {
+	var buf bytes.Buffer
+	buf.WriteByte('[')
+	var sep string
+	for !bs.IsEmpty() {
+		buf.WriteString(sep)
+		sep = " "
+		buf.WriteString(bs.Head().String())
+		bs = bs.Tail()
+	}
+	buf.WriteByte(']')
+	return buf.String()
+}
+
+// CopySeq copies srcs.NumBytes() or dsts.NumBytes() bytes, whichever is less,
+// from srcs to dsts and returns the number of bytes copied.
+//
+// If srcs and dsts overlap, the data stored in dsts is unspecified.
+func CopySeq(dsts, srcs BlockSeq) (uint64, error) {
+	var done uint64
+	for !dsts.IsEmpty() && !srcs.IsEmpty() {
+		dst := dsts.Head()
+		src := srcs.Head()
+		n, err := Copy(dst, src)
+		done += uint64(n)
+		if err != nil {
+			return done, err
+		}
+		dsts = dsts.DropFirst(n)
+		srcs = srcs.DropFirst(n)
+	}
+	return done, nil
+}
+
+// ZeroSeq sets all bytes in dsts to 0 and returns the number of bytes zeroed.
+func ZeroSeq(dsts BlockSeq) (uint64, error) {
+	var done uint64
+	for !dsts.IsEmpty() {
+		n, err := Zero(dsts.Head())
+		done += uint64(n)
+		if err != nil {
+			return done, err
+		}
+		dsts = dsts.DropFirst(n)
+	}
+	return done, nil
+}
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index 51ca09b24..34c0a867d 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -30,13 +30,13 @@ go_library(
         ":registers_go_proto",
         "//pkg/abi/linux",
         "//pkg/binary",
+        "//pkg/context",
         "//pkg/cpuid",
         "//pkg/log",
-        "//pkg/sentry/context",
         "//pkg/sentry/limits",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
 
diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go
index 81ec98a77..1d11cc472 100644
--- a/pkg/sentry/arch/arch.go
+++ b/pkg/sentry/arch/arch.go
@@ -24,7 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Arch describes an architecture.
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index ea4dedbdf..3b6987665 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -25,8 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const (
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index 2aa08b1a9..85d6acc0f 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -25,7 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Host specifies the host architecture.
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index 0d5b7d317..94f1a808f 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -21,7 +21,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Host specifies the host architecture.
diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go
index 84f11b0d1..d388ee9cf 100644
--- a/pkg/sentry/arch/arch_state_x86.go
+++ b/pkg/sentry/arch/arch_state_x86.go
@@ -21,7 +21,7 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/cpuid"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // ErrFloatingPoint indicates a failed restore due to unusable floating point
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index 9f41e566f..a18093155 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -25,9 +25,9 @@ import (
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // System-related constants for x86.
diff --git a/pkg/sentry/arch/auxv.go b/pkg/sentry/arch/auxv.go
index 4546b2ef9..2b4c8f3fc 100644
--- a/pkg/sentry/arch/auxv.go
+++ b/pkg/sentry/arch/auxv.go
@@ -15,7 +15,7 @@
 package arch
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // An AuxEntry represents an entry in an ELF auxiliary vector.
diff --git a/pkg/sentry/arch/signal.go b/pkg/sentry/arch/signal.go
index 402e46025..8b03d0187 100644
--- a/pkg/sentry/arch/signal.go
+++ b/pkg/sentry/arch/signal.go
@@ -16,7 +16,7 @@ package arch
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // SignalAct represents the action that should be taken when a signal is
diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go
index 1e4f9c3c2..81b92bb43 100644
--- a/pkg/sentry/arch/signal_amd64.go
+++ b/pkg/sentry/arch/signal_amd64.go
@@ -23,7 +23,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // SignalContext64 is equivalent to struct sigcontext, the type passed as the
diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go
index 7d0e98935..4f4cc46a8 100644
--- a/pkg/sentry/arch/signal_arm64.go
+++ b/pkg/sentry/arch/signal_arm64.go
@@ -19,7 +19,7 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // SignalContext64 is equivalent to struct sigcontext, the type passed as the
diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go
index d324da705..1a6056171 100644
--- a/pkg/sentry/arch/signal_stack.go
+++ b/pkg/sentry/arch/signal_stack.go
@@ -17,7 +17,7 @@
 package arch
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const (
diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go
index 7472c3c61..09bceabc9 100644
--- a/pkg/sentry/arch/stack.go
+++ b/pkg/sentry/arch/stack.go
@@ -18,8 +18,8 @@ import (
 	"encoding/binary"
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Stack is a simple wrapper around a usermem.IO and an address.
diff --git a/pkg/sentry/context/BUILD b/pkg/sentry/context/BUILD
deleted file mode 100644
index e13a9ce20..000000000
--- a/pkg/sentry/context/BUILD
+++ /dev/null
@@ -1,13 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "context",
-    srcs = ["context.go"],
-    visibility = ["//pkg/sentry:internal"],
-    deps = [
-        "//pkg/amutex",
-        "//pkg/log",
-    ],
-)
diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go
deleted file mode 100644
index 23e009ef3..000000000
--- a/pkg/sentry/context/context.go
+++ /dev/null
@@ -1,141 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package context defines an internal context type.
-//
-// The given Context conforms to the standard Go context, but mandates
-// additional methods that are specific to the kernel internals. Note however,
-// that the Context described by this package carries additional constraints
-// regarding concurrent access and retaining beyond the scope of a call.
-//
-// See the Context type for complete details.
-package context
-
-import (
-	"context"
-	"time"
-
-	"gvisor.dev/gvisor/pkg/amutex"
-	"gvisor.dev/gvisor/pkg/log"
-)
-
-type contextID int
-
-// Globally accessible values from a context. These keys are defined in the
-// context package to resolve dependency cycles by not requiring the caller to
-// import packages usually required to get these information.
-const (
-	// CtxThreadGroupID is the current thread group ID when a context represents
-	// a task context. The value is represented as an int32.
-	CtxThreadGroupID contextID = iota
-)
-
-// ThreadGroupIDFromContext returns the current thread group ID when ctx
-// represents a task context.
-func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) {
-	if tgid := ctx.Value(CtxThreadGroupID); tgid != nil {
-		return tgid.(int32), true
-	}
-	return 0, false
-}
-
-// A Context represents a thread of execution (hereafter "goroutine" to reflect
-// Go idiosyncrasy). It carries state associated with the goroutine across API
-// boundaries.
-//
-// While Context exists for essentially the same reasons as Go's standard
-// context.Context, the standard type represents the state of an operation
-// rather than that of a goroutine. This is a critical distinction:
-//
-// - Unlike context.Context, which "may be passed to functions running in
-// different goroutines", it is *not safe* to use the same Context in multiple
-// concurrent goroutines.
-//
-// - It is *not safe* to retain a Context passed to a function beyond the scope
-// of that function call.
-//
-// In both cases, values extracted from the Context should be used instead.
-type Context interface {
-	log.Logger
-	amutex.Sleeper
-	context.Context
-
-	// UninterruptibleSleepStart indicates the beginning of an uninterruptible
-	// sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). If deactivate
-	// is true and the Context represents a Task, the Task's AddressSpace is
-	// deactivated.
-	UninterruptibleSleepStart(deactivate bool)
-
-	// UninterruptibleSleepFinish indicates the end of an uninterruptible sleep
-	// state that was begun by a previous call to UninterruptibleSleepStart. If
-	// activate is true and the Context represents a Task, the Task's
-	// AddressSpace is activated. Normally activate is the same value as the
-	// deactivate parameter passed to UninterruptibleSleepStart.
-	UninterruptibleSleepFinish(activate bool)
-}
-
-// NoopSleeper is a noop implementation of amutex.Sleeper and UninterruptibleSleep
-// methods for anonymous embedding in other types that do not implement sleeps.
-type NoopSleeper struct {
-	amutex.NoopSleeper
-}
-
-// UninterruptibleSleepStart does nothing.
-func (NoopSleeper) UninterruptibleSleepStart(bool) {}
-
-// UninterruptibleSleepFinish does nothing.
-func (NoopSleeper) UninterruptibleSleepFinish(bool) {}
-
-// Deadline returns zero values, meaning no deadline.
-func (NoopSleeper) Deadline() (time.Time, bool) {
-	return time.Time{}, false
-}
-
-// Done returns nil.
-func (NoopSleeper) Done() <-chan struct{} {
-	return nil
-}
-
-// Err returns nil.
-func (NoopSleeper) Err() error {
-	return nil
-}
-
-// logContext implements basic logging.
-type logContext struct {
-	log.Logger
-	NoopSleeper
-}
-
-// Value implements Context.Value.
-func (logContext) Value(key interface{}) interface{} {
-	return nil
-}
-
-// bgContext is the context returned by context.Background.
-var bgContext = &logContext{Logger: log.Log()}
-
-// Background returns an empty context using the default logger.
-//
-// Users should be wary of using a Background context. Please tag any use with
-// FIXME(b/38173783) and a note to remove this use.
-//
-// Generally, one should use the Task as their context when available, or avoid
-// having to use a context in places where a Task is unavailable.
-//
-// Using a Background context for tests is fine, as long as no values are
-// needed from the context in the tested code paths.
-func Background() Context {
-	return bgContext
-}
diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD
deleted file mode 100644
index f91a6d4ed..000000000
--- a/pkg/sentry/context/contexttest/BUILD
+++ /dev/null
@@ -1,21 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "contexttest",
-    testonly = 1,
-    srcs = ["contexttest.go"],
-    visibility = ["//pkg/sentry:internal"],
-    deps = [
-        "//pkg/memutil",
-        "//pkg/sentry/context",
-        "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/kernel/time",
-        "//pkg/sentry/limits",
-        "//pkg/sentry/pgalloc",
-        "//pkg/sentry/platform",
-        "//pkg/sentry/platform/ptrace",
-        "//pkg/sentry/uniqueid",
-    ],
-)
diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go
deleted file mode 100644
index 15cf086a9..000000000
--- a/pkg/sentry/context/contexttest/contexttest.go
+++ /dev/null
@@ -1,188 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package contexttest builds a test context.Context.
-package contexttest
-
-import (
-	"os"
-	"sync/atomic"
-	"testing"
-	"time"
-
-	"gvisor.dev/gvisor/pkg/memutil"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/limits"
-	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
-	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
-)
-
-// Context returns a Context that may be used in tests. Uses ptrace as the
-// platform.Platform.
-//
-// Note that some filesystems may require a minimal kernel for testing, which
-// this test context does not provide. For such tests, see kernel/contexttest.
-func Context(tb testing.TB) context.Context {
-	const memfileName = "contexttest-memory"
-	memfd, err := memutil.CreateMemFD(memfileName, 0)
-	if err != nil {
-		tb.Fatalf("error creating application memory file: %v", err)
-	}
-	memfile := os.NewFile(uintptr(memfd), memfileName)
-	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
-	if err != nil {
-		memfile.Close()
-		tb.Fatalf("error creating pgalloc.MemoryFile: %v", err)
-	}
-	p, err := ptrace.New()
-	if err != nil {
-		tb.Fatal(err)
-	}
-	// Test usage of context.Background is fine.
-	return &TestContext{
-		Context:     context.Background(),
-		l:           limits.NewLimitSet(),
-		mf:          mf,
-		platform:    p,
-		creds:       auth.NewAnonymousCredentials(),
-		otherValues: make(map[interface{}]interface{}),
-	}
-}
-
-// TestContext represents a context with minimal functionality suitable for
-// running tests.
-type TestContext struct {
-	context.Context
-	l           *limits.LimitSet
-	mf          *pgalloc.MemoryFile
-	platform    platform.Platform
-	creds       *auth.Credentials
-	otherValues map[interface{}]interface{}
-}
-
-// globalUniqueID tracks incremental unique identifiers for tests.
-var globalUniqueID uint64
-
-// globalUniqueIDProvider implements unix.UniqueIDProvider.
-type globalUniqueIDProvider struct{}
-
-// UniqueID implements unix.UniqueIDProvider.UniqueID.
-func (*globalUniqueIDProvider) UniqueID() uint64 {
-	return atomic.AddUint64(&globalUniqueID, 1)
-}
-
-// lastInotifyCookie is a monotonically increasing counter for generating unique
-// inotify cookies. Must be accessed using atomic ops.
-var lastInotifyCookie uint32
-
-// hostClock implements ktime.Clock.
-type hostClock struct {
-	ktime.WallRateClock
-	ktime.NoClockEvents
-}
-
-// Now implements ktime.Clock.Now.
-func (hostClock) Now() ktime.Time {
-	return ktime.FromNanoseconds(time.Now().UnixNano())
-}
-
-// RegisterValue registers additional values with this test context. Useful for
-// providing values from external packages that contexttest can't depend on.
-func (t *TestContext) RegisterValue(key, value interface{}) {
-	t.otherValues[key] = value
-}
-
-// Value implements context.Context.
-func (t *TestContext) Value(key interface{}) interface{} {
-	switch key {
-	case auth.CtxCredentials:
-		return t.creds
-	case limits.CtxLimits:
-		return t.l
-	case pgalloc.CtxMemoryFile:
-		return t.mf
-	case pgalloc.CtxMemoryFileProvider:
-		return t
-	case platform.CtxPlatform:
-		return t.platform
-	case uniqueid.CtxGlobalUniqueID:
-		return (*globalUniqueIDProvider).UniqueID(nil)
-	case uniqueid.CtxGlobalUniqueIDProvider:
-		return &globalUniqueIDProvider{}
-	case uniqueid.CtxInotifyCookie:
-		return atomic.AddUint32(&lastInotifyCookie, 1)
-	case ktime.CtxRealtimeClock:
-		return hostClock{}
-	default:
-		if val, ok := t.otherValues[key]; ok {
-			return val
-		}
-		return t.Context.Value(key)
-	}
-}
-
-// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile.
-func (t *TestContext) MemoryFile() *pgalloc.MemoryFile {
-	return t.mf
-}
-
-// RootContext returns a Context that may be used in tests that need root
-// credentials. Uses ptrace as the platform.Platform.
-func RootContext(tb testing.TB) context.Context {
-	return WithCreds(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace()))
-}
-
-// WithCreds returns a copy of ctx carrying creds.
-func WithCreds(ctx context.Context, creds *auth.Credentials) context.Context {
-	return &authContext{ctx, creds}
-}
-
-type authContext struct {
-	context.Context
-	creds *auth.Credentials
-}
-
-// Value implements context.Context.
-func (ac *authContext) Value(key interface{}) interface{} {
-	switch key {
-	case auth.CtxCredentials:
-		return ac.creds
-	default:
-		return ac.Context.Value(key)
-	}
-}
-
-// WithLimitSet returns a copy of ctx carrying l.
-func WithLimitSet(ctx context.Context, l *limits.LimitSet) context.Context {
-	return limitContext{ctx, l}
-}
-
-type limitContext struct {
-	context.Context
-	l *limits.LimitSet
-}
-
-// Value implements context.Context.
-func (lc limitContext) Value(key interface{}) interface{} {
-	switch key {
-	case limits.CtxLimits:
-		return lc.l
-	default:
-		return lc.Context.Value(key)
-	}
-}
diff --git a/pkg/sentry/contexttest/BUILD b/pkg/sentry/contexttest/BUILD
new file mode 100644
index 000000000..6f4c86684
--- /dev/null
+++ b/pkg/sentry/contexttest/BUILD
@@ -0,0 +1,21 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "contexttest",
+    testonly = 1,
+    srcs = ["contexttest.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/context",
+        "//pkg/memutil",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/platform/ptrace",
+        "//pkg/sentry/uniqueid",
+    ],
+)
diff --git a/pkg/sentry/contexttest/contexttest.go b/pkg/sentry/contexttest/contexttest.go
new file mode 100644
index 000000000..031fc64ec
--- /dev/null
+++ b/pkg/sentry/contexttest/contexttest.go
@@ -0,0 +1,188 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package contexttest builds a test context.Context.
+package contexttest
+
+import (
+	"os"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/memutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
+	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+)
+
+// Context returns a Context that may be used in tests. Uses ptrace as the
+// platform.Platform.
+//
+// Note that some filesystems may require a minimal kernel for testing, which
+// this test context does not provide. For such tests, see kernel/contexttest.
+func Context(tb testing.TB) context.Context {
+	const memfileName = "contexttest-memory"
+	memfd, err := memutil.CreateMemFD(memfileName, 0)
+	if err != nil {
+		tb.Fatalf("error creating application memory file: %v", err)
+	}
+	memfile := os.NewFile(uintptr(memfd), memfileName)
+	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
+	if err != nil {
+		memfile.Close()
+		tb.Fatalf("error creating pgalloc.MemoryFile: %v", err)
+	}
+	p, err := ptrace.New()
+	if err != nil {
+		tb.Fatal(err)
+	}
+	// Test usage of context.Background is fine.
+	return &TestContext{
+		Context:     context.Background(),
+		l:           limits.NewLimitSet(),
+		mf:          mf,
+		platform:    p,
+		creds:       auth.NewAnonymousCredentials(),
+		otherValues: make(map[interface{}]interface{}),
+	}
+}
+
+// TestContext represents a context with minimal functionality suitable for
+// running tests.
+type TestContext struct {
+	context.Context
+	l           *limits.LimitSet
+	mf          *pgalloc.MemoryFile
+	platform    platform.Platform
+	creds       *auth.Credentials
+	otherValues map[interface{}]interface{}
+}
+
+// globalUniqueID tracks incremental unique identifiers for tests.
+var globalUniqueID uint64
+
+// globalUniqueIDProvider implements unix.UniqueIDProvider.
+type globalUniqueIDProvider struct{}
+
+// UniqueID implements unix.UniqueIDProvider.UniqueID.
+func (*globalUniqueIDProvider) UniqueID() uint64 {
+	return atomic.AddUint64(&globalUniqueID, 1)
+}
+
+// lastInotifyCookie is a monotonically increasing counter for generating unique
+// inotify cookies. Must be accessed using atomic ops.
+var lastInotifyCookie uint32
+
+// hostClock implements ktime.Clock.
+type hostClock struct {
+	ktime.WallRateClock
+	ktime.NoClockEvents
+}
+
+// Now implements ktime.Clock.Now.
+func (hostClock) Now() ktime.Time {
+	return ktime.FromNanoseconds(time.Now().UnixNano())
+}
+
+// RegisterValue registers additional values with this test context. Useful for
+// providing values from external packages that contexttest can't depend on.
+func (t *TestContext) RegisterValue(key, value interface{}) {
+	t.otherValues[key] = value
+}
+
+// Value implements context.Context.
+func (t *TestContext) Value(key interface{}) interface{} {
+	switch key {
+	case auth.CtxCredentials:
+		return t.creds
+	case limits.CtxLimits:
+		return t.l
+	case pgalloc.CtxMemoryFile:
+		return t.mf
+	case pgalloc.CtxMemoryFileProvider:
+		return t
+	case platform.CtxPlatform:
+		return t.platform
+	case uniqueid.CtxGlobalUniqueID:
+		return (*globalUniqueIDProvider).UniqueID(nil)
+	case uniqueid.CtxGlobalUniqueIDProvider:
+		return &globalUniqueIDProvider{}
+	case uniqueid.CtxInotifyCookie:
+		return atomic.AddUint32(&lastInotifyCookie, 1)
+	case ktime.CtxRealtimeClock:
+		return hostClock{}
+	default:
+		if val, ok := t.otherValues[key]; ok {
+			return val
+		}
+		return t.Context.Value(key)
+	}
+}
+
+// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile.
+func (t *TestContext) MemoryFile() *pgalloc.MemoryFile {
+	return t.mf
+}
+
+// RootContext returns a Context that may be used in tests that need root
+// credentials. Uses ptrace as the platform.Platform.
+func RootContext(tb testing.TB) context.Context {
+	return WithCreds(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace()))
+}
+
+// WithCreds returns a copy of ctx carrying creds.
+func WithCreds(ctx context.Context, creds *auth.Credentials) context.Context {
+	return &authContext{ctx, creds}
+}
+
+type authContext struct {
+	context.Context
+	creds *auth.Credentials
+}
+
+// Value implements context.Context.
+func (ac *authContext) Value(key interface{}) interface{} {
+	switch key {
+	case auth.CtxCredentials:
+		return ac.creds
+	default:
+		return ac.Context.Value(key)
+	}
+}
+
+// WithLimitSet returns a copy of ctx carrying l.
+func WithLimitSet(ctx context.Context, l *limits.LimitSet) context.Context {
+	return limitContext{ctx, l}
+}
+
+type limitContext struct {
+	context.Context
+	l *limits.LimitSet
+}
+
+// Value implements context.Context.
+func (lc limitContext) Value(key interface{}) interface{} {
+	switch key {
+	case limits.CtxLimits:
+		return lc.l
+	default:
+		return lc.Context.Value(key)
+	}
+}
diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD
index 605d61dbe..ea85ab33c 100644
--- a/pkg/sentry/fs/BUILD
+++ b/pkg/sentry/fs/BUILD
@@ -47,13 +47,13 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
+        "//pkg/context",
         "//pkg/log",
         "//pkg/metric",
         "//pkg/p9",
         "//pkg/refs",
         "//pkg/secio",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
@@ -64,10 +64,10 @@ go_library(
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/uniqueid",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/state",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -107,14 +107,14 @@ go_test(
     ],
     deps = [
         ":fs",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/ramfs",
         "//pkg/sentry/fs/tmpfs",
         "//pkg/sentry/kernel/contexttest",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
 
@@ -129,7 +129,7 @@ go_test(
     ],
     library = ":fs",
     deps = [
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/context",
+        "//pkg/sentry/contexttest",
     ],
 )
diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD
index c14e5405e..aedcecfa1 100644
--- a/pkg/sentry/fs/anon/BUILD
+++ b/pkg/sentry/fs/anon/BUILD
@@ -11,10 +11,10 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
-        "//pkg/sentry/usermem",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fs/anon/anon.go b/pkg/sentry/fs/anon/anon.go
index 7323c7222..5c421f5fb 100644
--- a/pkg/sentry/fs/anon/anon.go
+++ b/pkg/sentry/fs/anon/anon.go
@@ -18,10 +18,10 @@ package anon
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // NewInode constructs an anonymous Inode that is not associated
diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go
index 4f3d6410e..fa9e7d517 100644
--- a/pkg/sentry/fs/attr.go
+++ b/pkg/sentry/fs/attr.go
@@ -20,8 +20,8 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 )
diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go
index dd427de5d..0fbd60056 100644
--- a/pkg/sentry/fs/context.go
+++ b/pkg/sentry/fs/context.go
@@ -16,7 +16,7 @@ package fs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index e03e3e417..f6c79e51b 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -19,12 +19,12 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // copyUp copies a file in an overlay from a lower filesystem to an
diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go
index 738580c5f..91792d9fe 100644
--- a/pkg/sentry/fs/copy_up_test.go
+++ b/pkg/sentry/fs/copy_up_test.go
@@ -24,8 +24,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const (
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 0c7247bd7..4c4b7d5cc 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -16,8 +16,9 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/rand",
-        "//pkg/sentry/context",
+        "//pkg/safemem",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
@@ -26,9 +27,8 @@ go_library(
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
         "//pkg/sentry/pgalloc",
-        "//pkg/sentry/safemem",
-        "//pkg/sentry/usermem",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go
index f739c476c..35bd23991 100644
--- a/pkg/sentry/fs/dev/dev.go
+++ b/pkg/sentry/fs/dev/dev.go
@@ -18,11 +18,11 @@ package dev
 import (
 	"math"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Memory device numbers are from Linux's drivers/char/mem.c
diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go
index 55f8af704..5e518fb63 100644
--- a/pkg/sentry/fs/dev/fs.go
+++ b/pkg/sentry/fs/dev/fs.go
@@ -15,7 +15,7 @@
 package dev
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go
index 07e0ea010..deb9c6ad8 100644
--- a/pkg/sentry/fs/dev/full.go
+++ b/pkg/sentry/fs/dev/full.go
@@ -16,11 +16,11 @@ package dev
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go
index 4404b97ef..aec33d0d9 100644
--- a/pkg/sentry/fs/dev/null.go
+++ b/pkg/sentry/fs/dev/null.go
@@ -16,7 +16,7 @@ package dev
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go
index 49cb92f6e..2a9bbeb18 100644
--- a/pkg/sentry/fs/dev/random.go
+++ b/pkg/sentry/fs/dev/random.go
@@ -16,12 +16,12 @@ package dev
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/rand"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/dev/tty.go b/pkg/sentry/fs/dev/tty.go
index 87d80e292..760ca563d 100644
--- a/pkg/sentry/fs/dev/tty.go
+++ b/pkg/sentry/fs/dev/tty.go
@@ -16,7 +16,7 @@ package dev
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/waiter"
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index 31fc4d87b..acab0411a 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -22,8 +22,8 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go
index 47bc72a88..98d69c6f2 100644
--- a/pkg/sentry/fs/dirent_refs_test.go
+++ b/pkg/sentry/fs/dirent_refs_test.go
@@ -18,8 +18,8 @@ import (
 	"syscall"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 )
 
 func newMockDirInode(ctx context.Context, cache *DirentCache) *Inode {
diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD
index 25ef96299..1d09e983c 100644
--- a/pkg/sentry/fs/fdpipe/BUILD
+++ b/pkg/sentry/fs/fdpipe/BUILD
@@ -12,17 +12,17 @@ go_library(
     imports = ["gvisor.dev/gvisor/pkg/sentry/fs"],
     visibility = ["//pkg/sentry:internal"],
     deps = [
+        "//pkg/context",
         "//pkg/fd",
         "//pkg/fdnotifier",
         "//pkg/log",
+        "//pkg/safemem",
         "//pkg/secio",
-        "//pkg/sentry/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
-        "//pkg/sentry/safemem",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -36,13 +36,13 @@ go_test(
     ],
     library = ":fdpipe",
     deps = [
+        "//pkg/context",
         "//pkg/fd",
         "//pkg/fdnotifier",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
-        "//pkg/sentry/usermem",
         "//pkg/syserror",
+        "//pkg/usermem",
         "@com_github_google_uuid//:go_default_library",
     ],
 )
diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go
index 5b6cfeb0a..9fce177ad 100644
--- a/pkg/sentry/fs/fdpipe/pipe.go
+++ b/pkg/sentry/fs/fdpipe/pipe.go
@@ -19,17 +19,17 @@ import (
 	"os"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/secio"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/fdpipe/pipe_opener.go b/pkg/sentry/fs/fdpipe/pipe_opener.go
index 64b558975..0c3595998 100644
--- a/pkg/sentry/fs/fdpipe/pipe_opener.go
+++ b/pkg/sentry/fs/fdpipe/pipe_opener.go
@@ -20,8 +20,8 @@ import (
 	"syscall"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go
index 577445148..e556da48a 100644
--- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go
+++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go
@@ -26,12 +26,12 @@ import (
 
 	"github.com/google/uuid"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type hostOpener struct {
diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go
index cee87f726..af8230a7d 100644
--- a/pkg/sentry/fs/fdpipe/pipe_state.go
+++ b/pkg/sentry/fs/fdpipe/pipe_state.go
@@ -18,7 +18,7 @@ import (
 	"fmt"
 	"io/ioutil"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sync"
 )
diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go
index 69abc1e71..5aff0cc95 100644
--- a/pkg/sentry/fs/fdpipe/pipe_test.go
+++ b/pkg/sentry/fs/fdpipe/pipe_test.go
@@ -23,10 +23,10 @@ import (
 
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func singlePipeFD() (int, error) {
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index 7c4586296..ca3466f4f 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -20,16 +20,16 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/amutex"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go
index b88303f17..beba0f771 100644
--- a/pkg/sentry/fs/file_operations.go
+++ b/pkg/sentry/fs/file_operations.go
@@ -17,10 +17,10 @@ package fs
 import (
 	"io"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go
index 8991207b4..dcc1df38f 100644
--- a/pkg/sentry/fs/file_overlay.go
+++ b/pkg/sentry/fs/file_overlay.go
@@ -17,13 +17,13 @@ package fs
 import (
 	"io"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go
index 2fb824d5c..02538bb4f 100644
--- a/pkg/sentry/fs/file_overlay_test.go
+++ b/pkg/sentry/fs/file_overlay_test.go
@@ -18,7 +18,7 @@ import (
 	"reflect"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go
index c5b51620a..084da2a8d 100644
--- a/pkg/sentry/fs/filesystems.go
+++ b/pkg/sentry/fs/filesystems.go
@@ -19,7 +19,7 @@ import (
 	"sort"
 	"strings"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD
index 9a7608cae..a8000e010 100644
--- a/pkg/sentry/fs/filetest/BUILD
+++ b/pkg/sentry/fs/filetest/BUILD
@@ -8,12 +8,12 @@ go_library(
     srcs = ["filetest.go"],
     visibility = ["//pkg/sentry:internal"],
     deps = [
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/context",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/fsutil",
-        "//pkg/sentry/usermem",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go
index 22270a494..8049538f2 100644
--- a/pkg/sentry/fs/filetest/filetest.go
+++ b/pkg/sentry/fs/filetest/filetest.go
@@ -19,12 +19,12 @@ import (
 	"fmt"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go
index 26abf49e2..bdba6efe5 100644
--- a/pkg/sentry/fs/fs.go
+++ b/pkg/sentry/fs/fs.go
@@ -54,8 +54,8 @@
 package fs
 
 import (
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index 9142f5bdf..4ab2a384f 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -77,22 +77,22 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/log",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/state",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -106,13 +106,13 @@ go_test(
     ],
     library = ":fsutil",
     deps = [
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/context",
+        "//pkg/safemem",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
-        "//pkg/sentry/safemem",
-        "//pkg/sentry/usermem",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go
index 12132680b..c6cd45087 100644
--- a/pkg/sentry/fs/fsutil/dirty_set.go
+++ b/pkg/sentry/fs/fsutil/dirty_set.go
@@ -17,11 +17,11 @@ package fsutil
 import (
 	"math"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // DirtySet maps offsets into a memmap.Mappable to DirtyInfo. It is used to
diff --git a/pkg/sentry/fs/fsutil/dirty_set_test.go b/pkg/sentry/fs/fsutil/dirty_set_test.go
index 75575d994..e3579c23c 100644
--- a/pkg/sentry/fs/fsutil/dirty_set_test.go
+++ b/pkg/sentry/fs/fsutil/dirty_set_test.go
@@ -19,7 +19,7 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func TestDirtySet(t *testing.T) {
diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go
index fc5b3b1a1..08695391c 100644
--- a/pkg/sentry/fs/fsutil/file.go
+++ b/pkg/sentry/fs/fsutil/file.go
@@ -17,12 +17,12 @@ package fsutil
 import (
 	"io"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
index f52d712e3..5643cdac9 100644
--- a/pkg/sentry/fs/fsutil/file_range_set.go
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -19,13 +19,13 @@ import (
 	"io"
 	"math"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // FileRangeSet maps offsets into a memmap.Mappable to offsets into a
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go
index 837fc70b5..67278aa86 100644
--- a/pkg/sentry/fs/fsutil/host_file_mapper.go
+++ b/pkg/sentry/fs/fsutil/host_file_mapper.go
@@ -19,11 +19,11 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // HostFileMapper caches mappings of an arbitrary host file descriptor. It is
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go
index ad11a0573..2d4778d64 100644
--- a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go
+++ b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go
@@ -17,7 +17,7 @@ package fsutil
 import (
 	"unsafe"
 
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
+	"gvisor.dev/gvisor/pkg/safemem"
 )
 
 func (*HostFileMapper) unsafeBlockFromChunkMapping(addr uintptr) safemem.Block {
diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go
index a625f0e26..78fec553e 100644
--- a/pkg/sentry/fs/fsutil/host_mappable.go
+++ b/pkg/sentry/fs/fsutil/host_mappable.go
@@ -17,13 +17,13 @@ package fsutil
 import (
 	"math"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // HostMappable implements memmap.Mappable and platform.File over a
diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go
index df7b74855..252830572 100644
--- a/pkg/sentry/fs/fsutil/inode.go
+++ b/pkg/sentry/fs/fsutil/inode.go
@@ -16,7 +16,7 @@ package fsutil
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index 20a014402..573b8586e 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -18,18 +18,18 @@ import (
 	"fmt"
 	"io"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Lock order (compare the lock order model in mm/mm.go):
diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go
index 129f314c8..1547584c5 100644
--- a/pkg/sentry/fs/fsutil/inode_cached_test.go
+++ b/pkg/sentry/fs/fsutil/inode_cached_test.go
@@ -19,14 +19,14 @@ import (
 	"io"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type noopBackingFile struct{}
diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD
index cf48e7c03..971d3718e 100644
--- a/pkg/sentry/fs/gofer/BUILD
+++ b/pkg/sentry/fs/gofer/BUILD
@@ -24,13 +24,14 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fd",
         "//pkg/log",
         "//pkg/metric",
         "//pkg/p9",
         "//pkg/refs",
+        "//pkg/safemem",
         "//pkg/secio",
-        "//pkg/sentry/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fdpipe",
@@ -39,13 +40,12 @@ go_library(
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket/unix/transport",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/unet",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -56,10 +56,10 @@ go_test(
     srcs = ["gofer_test.go"],
     library = ":gofer",
     deps = [
+        "//pkg/context",
         "//pkg/p9",
         "//pkg/p9/p9test",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
     ],
 )
diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go
index 4848e2374..71cccdc34 100644
--- a/pkg/sentry/fs/gofer/attr.go
+++ b/pkg/sentry/fs/gofer/attr.go
@@ -17,12 +17,12 @@ package gofer
 import (
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // getattr returns the 9p attributes of the p9.File. On success, Mode, Size, and RDev
diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go
index cc11c6339..ebea03c42 100644
--- a/pkg/sentry/fs/gofer/cache_policy.go
+++ b/pkg/sentry/fs/gofer/cache_policy.go
@@ -17,7 +17,7 @@ package gofer
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go
index 2125dafef..3da818aed 100644
--- a/pkg/sentry/fs/gofer/context_file.go
+++ b/pkg/sentry/fs/gofer/context_file.go
@@ -15,9 +15,9 @@
 package gofer
 
 import (
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/p9"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 )
 
 // contextFile is a wrapper around p9.File that notifies the context that
diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go
index 7960b9c7b..23296f246 100644
--- a/pkg/sentry/fs/gofer/file.go
+++ b/pkg/sentry/fs/gofer/file.go
@@ -19,16 +19,16 @@ import (
 	"syscall"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/p9"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go
index bb8312849..ff96b28ba 100644
--- a/pkg/sentry/fs/gofer/file_state.go
+++ b/pkg/sentry/fs/gofer/file_state.go
@@ -17,7 +17,7 @@ package gofer
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go
index cf96dd9fa..9d41fcbdb 100644
--- a/pkg/sentry/fs/gofer/fs.go
+++ b/pkg/sentry/fs/gofer/fs.go
@@ -20,8 +20,8 @@ import (
 	"fmt"
 	"strconv"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go
index 7fc3c32ae..0c2f89ae8 100644
--- a/pkg/sentry/fs/gofer/gofer_test.go
+++ b/pkg/sentry/fs/gofer/gofer_test.go
@@ -20,10 +20,10 @@ import (
 	"testing"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/p9/p9test"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go
index b86c49b39..9f7c3e89f 100644
--- a/pkg/sentry/fs/gofer/handles.go
+++ b/pkg/sentry/fs/gofer/handles.go
@@ -17,14 +17,14 @@ package gofer
 import (
 	"io"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/secio"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 )
 
 // handles are the open handles of a gofer file. They are reference counted to
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index 98d1a8a48..ac28174d2 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -19,17 +19,17 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fdpipe"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go
index 0b2eedb7c..238f7804c 100644
--- a/pkg/sentry/fs/gofer/inode_state.go
+++ b/pkg/sentry/fs/gofer/inode_state.go
@@ -20,8 +20,8 @@ import (
 	"path/filepath"
 	"strings"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go
index c09f3b71c..0c1be05ef 100644
--- a/pkg/sentry/fs/gofer/path.go
+++ b/pkg/sentry/fs/gofer/path.go
@@ -18,9 +18,9 @@ import (
 	"fmt"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index edc796ce0..498c4645a 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -17,9 +17,9 @@ package gofer
 import (
 	"fmt"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go
index d045e04ff..0285c5361 100644
--- a/pkg/sentry/fs/gofer/session_state.go
+++ b/pkg/sentry/fs/gofer/session_state.go
@@ -17,8 +17,8 @@ package gofer
 import (
 	"fmt"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/unet"
 )
diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go
index a45a8f36c..376cfce2c 100644
--- a/pkg/sentry/fs/gofer/socket.go
+++ b/pkg/sentry/fs/gofer/socket.go
@@ -16,9 +16,9 @@ package gofer
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go
index 848e6812b..2d8d3a2ea 100644
--- a/pkg/sentry/fs/gofer/util.go
+++ b/pkg/sentry/fs/gofer/util.go
@@ -17,8 +17,8 @@ package gofer
 import (
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index f586f47c1..21003ea45 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -27,13 +27,14 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fd",
         "//pkg/fdnotifier",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/safemem",
         "//pkg/secio",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
@@ -41,18 +42,17 @@ go_library(
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket/control",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/unimpl",
         "//pkg/sentry/uniqueid",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip",
         "//pkg/unet",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -69,17 +69,17 @@ go_test(
     ],
     library = ":host",
     deps = [
+        "//pkg/context",
         "//pkg/fd",
         "//pkg/fdnotifier",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix/transport",
-        "//pkg/sentry/usermem",
         "//pkg/syserr",
         "//pkg/tcpip",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go
index 5532ff5a0..1658979fc 100644
--- a/pkg/sentry/fs/host/control.go
+++ b/pkg/sentry/fs/host/control.go
@@ -17,7 +17,7 @@ package host
 import (
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/socket/control"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go
index f6c626f2c..e08f56d04 100644
--- a/pkg/sentry/fs/host/file.go
+++ b/pkg/sentry/fs/host/file.go
@@ -18,17 +18,17 @@ import (
 	"fmt"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/secio"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go
index 68d2697c0..d3e8e3a36 100644
--- a/pkg/sentry/fs/host/fs.go
+++ b/pkg/sentry/fs/host/fs.go
@@ -23,8 +23,8 @@ import (
 	"strconv"
 	"strings"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go
index c6852ee30..3111d2df9 100644
--- a/pkg/sentry/fs/host/fs_test.go
+++ b/pkg/sentry/fs/host/fs_test.go
@@ -23,8 +23,8 @@ import (
 	"sort"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index 873a1c52d..6fa39caab 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -18,14 +18,14 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/secio"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go
index b267ec305..299e0e0b0 100644
--- a/pkg/sentry/fs/host/inode_state.go
+++ b/pkg/sentry/fs/host/inode_state.go
@@ -18,7 +18,7 @@ import (
 	"fmt"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go
index 2d959f10d..7221bc825 100644
--- a/pkg/sentry/fs/host/inode_test.go
+++ b/pkg/sentry/fs/host/inode_test.go
@@ -21,7 +21,7 @@ import (
 	"syscall"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
index c076d5bdd..06fc2d80a 100644
--- a/pkg/sentry/fs/host/socket.go
+++ b/pkg/sentry/fs/host/socket.go
@@ -19,11 +19,11 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/socket/control"
 	unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix"
diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go
index 68b38fd1c..eb4afe520 100644
--- a/pkg/sentry/fs/host/socket_test.go
+++ b/pkg/sentry/fs/host/socket_test.go
@@ -21,13 +21,13 @@ import (
 
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index 753ef8cd6..3f218b4a7 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -16,14 +16,14 @@ package host
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // TTYFileOperations implements fs.FileOperations for a host file descriptor
diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go
index 88d24d693..d49c3a635 100644
--- a/pkg/sentry/fs/host/wait_test.go
+++ b/pkg/sentry/fs/host/wait_test.go
@@ -19,7 +19,7 @@ import (
 	"testing"
 	"time"
 
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index e4cf5a570..b66c091ab 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -16,10 +16,10 @@ package fs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go
index 13261cb81..70f2eae96 100644
--- a/pkg/sentry/fs/inode_operations.go
+++ b/pkg/sentry/fs/inode_operations.go
@@ -17,7 +17,7 @@ package fs
 import (
 	"errors"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index c477de837..4729b4aac 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -19,8 +19,8 @@ import (
 	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go
index 493d98c36..389c219d6 100644
--- a/pkg/sentry/fs/inode_overlay_test.go
+++ b/pkg/sentry/fs/inode_overlay_test.go
@@ -17,7 +17,7 @@ package fs_test
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go
index cc7dd1c92..928c90aa0 100644
--- a/pkg/sentry/fs/inotify.go
+++ b/pkg/sentry/fs/inotify.go
@@ -19,13 +19,13 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/inotify_event.go b/pkg/sentry/fs/inotify_event.go
index 9f70a3e82..686e1b1cd 100644
--- a/pkg/sentry/fs/inotify_event.go
+++ b/pkg/sentry/fs/inotify_event.go
@@ -18,8 +18,8 @@ import (
 	"bytes"
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // inotifyEventBaseSize is the base size of linux's struct inotify_event. This
diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go
index 7a24c6f1b..1d6ea5736 100644
--- a/pkg/sentry/fs/mock.go
+++ b/pkg/sentry/fs/mock.go
@@ -15,7 +15,7 @@
 package fs
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go
index 7a9692800..37bae6810 100644
--- a/pkg/sentry/fs/mount.go
+++ b/pkg/sentry/fs/mount.go
@@ -19,8 +19,8 @@ import (
 	"fmt"
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 )
 
 // DirentOperations provide file systems greater control over how long a Dirent
diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go
index 299712cd7..78e35b1e6 100644
--- a/pkg/sentry/fs/mount_overlay.go
+++ b/pkg/sentry/fs/mount_overlay.go
@@ -15,7 +15,7 @@
 package fs
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 )
 
 // overlayMountSourceOperations implements MountSourceOperations for an overlay
diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go
index 0b84732aa..e672a438c 100644
--- a/pkg/sentry/fs/mount_test.go
+++ b/pkg/sentry/fs/mount_test.go
@@ -18,7 +18,7 @@ import (
 	"fmt"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 )
 
 // cacheReallyContains iterates through the dirent cache to determine whether
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index a9627a9d1..574a2cc91 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -22,9 +22,9 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go
index c4c771f2c..a69b41468 100644
--- a/pkg/sentry/fs/mounts_test.go
+++ b/pkg/sentry/fs/mounts_test.go
@@ -17,7 +17,7 @@ package fs_test
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
diff --git a/pkg/sentry/fs/offset.go b/pkg/sentry/fs/offset.go
index f7d844ce7..53b5df175 100644
--- a/pkg/sentry/fs/offset.go
+++ b/pkg/sentry/fs/offset.go
@@ -17,7 +17,7 @@ package fs
 import (
 	"math"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // OffsetPageEnd returns the file offset rounded up to the nearest
diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go
index f7702f8f4..a8ae7d81d 100644
--- a/pkg/sentry/fs/overlay.go
+++ b/pkg/sentry/fs/overlay.go
@@ -18,12 +18,12 @@ import (
 	"fmt"
 	"strings"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // The virtual filesystem implements an overlay configuration. For a high-level
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index b06bead41..280093c5e 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -29,8 +29,8 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/log",
-        "//pkg/sentry/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/proc/device",
@@ -46,10 +46,10 @@ go_library(
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/tcpip/header",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -64,8 +64,8 @@ go_test(
     library = ":proc",
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/inet",
-        "//pkg/sentry/usermem",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fs/proc/cgroup.go b/pkg/sentry/fs/proc/cgroup.go
index c4abe319d..7c1d9e7e9 100644
--- a/pkg/sentry/fs/proc/cgroup.go
+++ b/pkg/sentry/fs/proc/cgroup.go
@@ -17,7 +17,7 @@ package proc
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go
index df0c4e3a7..c96533401 100644
--- a/pkg/sentry/fs/proc/cpuinfo.go
+++ b/pkg/sentry/fs/proc/cpuinfo.go
@@ -17,7 +17,7 @@ package proc
 import (
 	"bytes"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 )
diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go
index 9aaeb780b..8fe626e1c 100644
--- a/pkg/sentry/fs/proc/exec_args.go
+++ b/pkg/sentry/fs/proc/exec_args.go
@@ -20,12 +20,12 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go
index 2fa3cfa7d..35972e23c 100644
--- a/pkg/sentry/fs/proc/fds.go
+++ b/pkg/sentry/fs/proc/fds.go
@@ -19,7 +19,7 @@ import (
 	"sort"
 	"strconv"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go
index 7b3b974ab..0a58ac34c 100644
--- a/pkg/sentry/fs/proc/filesystems.go
+++ b/pkg/sentry/fs/proc/filesystems.go
@@ -18,7 +18,7 @@ import (
 	"bytes"
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 )
diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go
index 761d24462..daf1ba781 100644
--- a/pkg/sentry/fs/proc/fs.go
+++ b/pkg/sentry/fs/proc/fs.go
@@ -17,7 +17,7 @@ package proc
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go
index 723f6b661..d2859a4c2 100644
--- a/pkg/sentry/fs/proc/inode.go
+++ b/pkg/sentry/fs/proc/inode.go
@@ -16,14 +16,14 @@ package proc
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // LINT.IfChange
diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go
index d7d2afcb7..139d49c34 100644
--- a/pkg/sentry/fs/proc/loadavg.go
+++ b/pkg/sentry/fs/proc/loadavg.go
@@ -18,7 +18,7 @@ import (
 	"bytes"
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 )
 
diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go
index 313c6a32b..465b47da9 100644
--- a/pkg/sentry/fs/proc/meminfo.go
+++ b/pkg/sentry/fs/proc/meminfo.go
@@ -18,11 +18,11 @@ import (
 	"bytes"
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // LINT.IfChange
diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go
index d4efc86e0..c10888100 100644
--- a/pkg/sentry/fs/proc/mounts.go
+++ b/pkg/sentry/fs/proc/mounts.go
@@ -20,7 +20,7 @@ import (
 	"sort"
 	"strings"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index bad445f3f..6f2775344 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -22,8 +22,8 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
@@ -33,9 +33,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // LINT.IfChange
diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go
index 29867dc3a..c8abb5052 100644
--- a/pkg/sentry/fs/proc/proc.go
+++ b/pkg/sentry/fs/proc/proc.go
@@ -20,7 +20,7 @@ import (
 	"sort"
 	"strconv"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD
index 310d8dd52..21338d912 100644
--- a/pkg/sentry/fs/proc/seqfile/BUILD
+++ b/pkg/sentry/fs/proc/seqfile/BUILD
@@ -8,14 +8,14 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/proc/device",
         "//pkg/sentry/kernel/time",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -26,10 +26,10 @@ go_test(
     srcs = ["seqfile_test.go"],
     library = ":seqfile",
     deps = [
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/context",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/ramfs",
-        "//pkg/sentry/usermem",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go
index f9af191d5..6121f0e95 100644
--- a/pkg/sentry/fs/proc/seqfile/seqfile.go
+++ b/pkg/sentry/fs/proc/seqfile/seqfile.go
@@ -19,14 +19,14 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/proc/seqfile/seqfile_test.go b/pkg/sentry/fs/proc/seqfile/seqfile_test.go
index ebfeee835..98e394569 100644
--- a/pkg/sentry/fs/proc/seqfile/seqfile_test.go
+++ b/pkg/sentry/fs/proc/seqfile/seqfile_test.go
@@ -20,11 +20,11 @@ import (
 	"io"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type seqTest struct {
diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go
index bc5b2bc7b..d4fbd76ac 100644
--- a/pkg/sentry/fs/proc/stat.go
+++ b/pkg/sentry/fs/proc/stat.go
@@ -19,7 +19,7 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 )
diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go
index 2bdcf5f70..f8aad2dbd 100644
--- a/pkg/sentry/fs/proc/sys.go
+++ b/pkg/sentry/fs/proc/sys.go
@@ -20,13 +20,13 @@ import (
 	"strconv"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index b9e8ef35f..0772d4ae4 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -19,14 +19,14 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go
index 6abae7a60..355e83d47 100644
--- a/pkg/sentry/fs/proc/sys_net_test.go
+++ b/pkg/sentry/fs/proc/sys_net_test.go
@@ -17,9 +17,9 @@ package proc
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func TestQuerySendBufferSize(t *testing.T) {
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 7358d6ef9..ca020e11e 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -22,7 +22,7 @@ import (
 	"strconv"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
@@ -32,8 +32,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go
index 3eacc9265..8d9517b95 100644
--- a/pkg/sentry/fs/proc/uid_gid_map.go
+++ b/pkg/sentry/fs/proc/uid_gid_map.go
@@ -20,13 +20,13 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go
index adfe58adb..c0f6fb802 100644
--- a/pkg/sentry/fs/proc/uptime.go
+++ b/pkg/sentry/fs/proc/uptime.go
@@ -19,12 +19,12 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go
index 27fd5b1cb..35e258ff6 100644
--- a/pkg/sentry/fs/proc/version.go
+++ b/pkg/sentry/fs/proc/version.go
@@ -17,7 +17,7 @@ package proc
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 )
diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD
index 39c4b84f8..8ca823fb3 100644
--- a/pkg/sentry/fs/ramfs/BUILD
+++ b/pkg/sentry/fs/ramfs/BUILD
@@ -13,14 +13,14 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/socket/unix/transport",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -31,7 +31,7 @@ go_test(
     srcs = ["tree_test.go"],
     library = ":ramfs",
     deps = [
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
     ],
 )
diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go
index dcbb8eb2e..bfa304552 100644
--- a/pkg/sentry/fs/ramfs/dir.go
+++ b/pkg/sentry/fs/ramfs/dir.go
@@ -20,7 +20,7 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go
index a24fe2ea2..29ff004f2 100644
--- a/pkg/sentry/fs/ramfs/socket.go
+++ b/pkg/sentry/fs/ramfs/socket.go
@@ -16,7 +16,7 @@ package ramfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go
index fcfaa29aa..d988349aa 100644
--- a/pkg/sentry/fs/ramfs/symlink.go
+++ b/pkg/sentry/fs/ramfs/symlink.go
@@ -16,7 +16,7 @@ package ramfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/waiter"
diff --git a/pkg/sentry/fs/ramfs/tree.go b/pkg/sentry/fs/ramfs/tree.go
index 702cc4a1e..dfc9d3453 100644
--- a/pkg/sentry/fs/ramfs/tree.go
+++ b/pkg/sentry/fs/ramfs/tree.go
@@ -19,10 +19,10 @@ import (
 	"path"
 	"strings"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // MakeDirectoryTree constructs a ramfs tree of all directories containing
diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go
index 61a7e2900..a6ed8b2c5 100644
--- a/pkg/sentry/fs/ramfs/tree_test.go
+++ b/pkg/sentry/fs/ramfs/tree_test.go
@@ -17,7 +17,7 @@ package ramfs
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go
index 389c330a0..791d1526c 100644
--- a/pkg/sentry/fs/splice.go
+++ b/pkg/sentry/fs/splice.go
@@ -18,7 +18,7 @@ import (
 	"io"
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD
index cc6b3bfbf..f2e8b9932 100644
--- a/pkg/sentry/fs/sys/BUILD
+++ b/pkg/sentry/fs/sys/BUILD
@@ -13,12 +13,12 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/ramfs",
         "//pkg/sentry/kernel",
-        "//pkg/sentry/usermem",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go
index 4f78ca8d2..b67065956 100644
--- a/pkg/sentry/fs/sys/devices.go
+++ b/pkg/sentry/fs/sys/devices.go
@@ -18,7 +18,7 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go
index e60b63e75..fd03a4e38 100644
--- a/pkg/sentry/fs/sys/fs.go
+++ b/pkg/sentry/fs/sys/fs.go
@@ -15,7 +15,7 @@
 package sys
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go
index b14bf3f55..0891645e4 100644
--- a/pkg/sentry/fs/sys/sys.go
+++ b/pkg/sentry/fs/sys/sys.go
@@ -16,10 +16,10 @@
 package sys
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func newFile(ctx context.Context, node fs.InodeOperations, msrc *fs.MountSource) *fs.Inode {
diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD
index 092668e8d..d16cdb4df 100644
--- a/pkg/sentry/fs/timerfd/BUILD
+++ b/pkg/sentry/fs/timerfd/BUILD
@@ -7,13 +7,13 @@ go_library(
     srcs = ["timerfd.go"],
     visibility = ["//pkg/sentry:internal"],
     deps = [
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/kernel/time",
-        "//pkg/sentry/usermem",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go
index f8bf663bb..88c344089 100644
--- a/pkg/sentry/fs/timerfd/timerfd.go
+++ b/pkg/sentry/fs/timerfd/timerfd.go
@@ -19,13 +19,13 @@ package timerfd
 import (
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD
index 04776555f..aa7199014 100644
--- a/pkg/sentry/fs/tmpfs/BUILD
+++ b/pkg/sentry/fs/tmpfs/BUILD
@@ -14,8 +14,9 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/metric",
-        "//pkg/sentry/context",
+        "//pkg/safemem",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
@@ -25,12 +26,11 @@ go_library(
         "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -41,10 +41,10 @@ go_test(
     srcs = ["file_test.go"],
     library = ":tmpfs",
     deps = [
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/contexttest",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go
index 9a6943fe4..614f8f8a1 100644
--- a/pkg/sentry/fs/tmpfs/file_regular.go
+++ b/pkg/sentry/fs/tmpfs/file_regular.go
@@ -15,11 +15,11 @@
 package tmpfs
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go
index 0075ef023..aaba35502 100644
--- a/pkg/sentry/fs/tmpfs/file_test.go
+++ b/pkg/sentry/fs/tmpfs/file_test.go
@@ -18,11 +18,11 @@ import (
 	"bytes"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func newFileInode(ctx context.Context) *fs.Inode {
diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go
index be98ad751..d5be56c3f 100644
--- a/pkg/sentry/fs/tmpfs/fs.go
+++ b/pkg/sentry/fs/tmpfs/fs.go
@@ -19,7 +19,7 @@ import (
 	"strconv"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index f1c87fe41..dabc10662 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -20,18 +20,18 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/metric"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 var (
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index 0f718e236..c00cef0a5 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -17,7 +17,7 @@ package tmpfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
@@ -25,8 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 var fsInfo = fs.Info{
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index 29f804c6c..5cb0e0417 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -16,20 +16,20 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/refs",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/unimpl",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -41,7 +41,7 @@ go_test(
     library = ":tty",
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context/contexttest",
-        "//pkg/sentry/usermem",
+        "//pkg/sentry/contexttest",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go
index 88aa66b24..108654827 100644
--- a/pkg/sentry/fs/tty/dir.go
+++ b/pkg/sentry/fs/tty/dir.go
@@ -21,14 +21,14 @@ import (
 	"strconv"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go
index edee56c12..8fe05ebe5 100644
--- a/pkg/sentry/fs/tty/fs.go
+++ b/pkg/sentry/fs/tty/fs.go
@@ -15,7 +15,7 @@
 package tty
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/syserror"
diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go
index 9fe02657e..12b1c6097 100644
--- a/pkg/sentry/fs/tty/line_discipline.go
+++ b/pkg/sentry/fs/tty/line_discipline.go
@@ -19,11 +19,11 @@ import (
 	"unicode/utf8"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index 6b07f6bf2..f62da49bd 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -16,13 +16,13 @@ package tty
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go
index 21ccc6f32..1ca79c0b2 100644
--- a/pkg/sentry/fs/tty/queue.go
+++ b/pkg/sentry/fs/tty/queue.go
@@ -16,12 +16,12 @@ package tty
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index 2a51e6bab..db55cdc48 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -16,12 +16,12 @@ package tty
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index 917f90cc0..5883f26db 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -16,11 +16,11 @@ package tty
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Terminal is a pseudoterminal.
diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go
index 59f07ff8e..2cbc05678 100644
--- a/pkg/sentry/fs/tty/tty_test.go
+++ b/pkg/sentry/fs/tty/tty_test.go
@@ -18,8 +18,8 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func TestSimpleMasterToSlave(t *testing.T) {
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index a718920d5..6f78f478f 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -35,21 +35,21 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
+        "//pkg/context",
         "//pkg/fd",
         "//pkg/fspath",
         "//pkg/log",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fsimpl/ext/disklayout",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/syscalls/linux",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -73,14 +73,14 @@ go_test(
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
+        "//pkg/context",
         "//pkg/fspath",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/ext/disklayout",
         "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//runsc/testutil",
         "@com_github_google_go-cmp//cmp:go_default_library",
         "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD
index 12f3990c1..6c5a559fd 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/BUILD
+++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD
@@ -7,9 +7,9 @@ go_test(
     size = "small",
     srcs = ["benchmark_test.go"],
     deps = [
+        "//pkg/context",
         "//pkg/fspath",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/ext",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/vfs",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index a56b03711..d1436b943 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -24,9 +24,9 @@ import (
 	"strings"
 	"testing"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index 8944171c8..ebb72b75e 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -17,8 +17,8 @@ package ext
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
index 4b7d17dc6..373d23b74 100644
--- a/pkg/sentry/fsimpl/ext/ext.go
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -21,9 +21,9 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 6c14a1e2d..05f992826 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -25,14 +25,14 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 
 	"gvisor.dev/gvisor/runsc/testutil"
 )
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index 841274daf..92f7da40d 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -16,7 +16,7 @@ package ext
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 9afb1a84c..07bf58953 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -19,8 +19,8 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index d11153c90..30135ddb0 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -18,13 +18,13 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // regularFile represents a regular file's inode. This too follows the
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index bdf8705c1..1447a4dc1 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -15,11 +15,11 @@
 package ext
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // symlink represents a symlink inode.
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 7bf83ccba..e73f1f857 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -29,16 +29,16 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/refs",
-        "//pkg/sentry/context",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
 
@@ -49,13 +49,13 @@ go_test(
     deps = [
         ":kernfs",
         "//pkg/abi/linux",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/context",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/testutil",
         "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
+        "//pkg/usermem",
         "@com_github_google_go-cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 75624e0b1..373f801ff 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -18,11 +18,11 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // DynamicBytesFile implements kernfs.Inode and represents a read-only
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 5fa1fa67b..6104751c8 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -16,11 +16,11 @@ package kernfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index a4600ad47..9d65d0179 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -20,8 +20,8 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 1700fffd9..adca2313f 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -19,8 +19,8 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 85bcdcc57..79ebea8a5 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -56,8 +56,8 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index fade59491..ee65cf491 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -21,14 +21,14 @@ import (
 
 	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const defaultMode linux.FileMode = 01777
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index f19f12854..0ee7eb9b7 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -16,7 +16,7 @@ package kernfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 3768f55b2..12aac2e6a 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -16,8 +16,9 @@ go_library(
     ],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/log",
-        "//pkg/sentry/context",
+        "//pkg/safemem",
         "//pkg/sentry/fs",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/inet",
@@ -26,15 +27,14 @@ go_library(
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/limits",
         "//pkg/sentry/mm",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
         "//pkg/tcpip/header",
+        "//pkg/usermem",
     ],
 )
 
@@ -48,15 +48,15 @@ go_test(
     library = ":proc",
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fspath",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/testutil",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index f49819187..11477b6a9 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -19,7 +19,7 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 91eded415..353e37195 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -19,7 +19,7 @@ import (
 	"strconv"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index a0580f20d..eb5bc62c0 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -19,7 +19,7 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 7bc352ae9..efd3b3453 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -20,17 +20,17 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // mm gets the kernel task's MemoryManager. No additional reference is taken on
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 51f634716..e0cb9c47b 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -20,7 +20,7 @@ import (
 	"strconv"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index ad3760e39..434998910 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -20,14 +20,14 @@ import (
 	"strconv"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type selfSymlink struct {
diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go
index 4aaf23e97..608fec017 100644
--- a/pkg/sentry/fsimpl/proc/tasks_net.go
+++ b/pkg/sentry/fsimpl/proc/tasks_net.go
@@ -22,8 +22,8 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
@@ -32,9 +32,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index aabf2bf0c..ad963870b 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -19,7 +19,7 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
index 0a1d3f34b..be54897bb 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
@@ -20,7 +20,7 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 )
 
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 2c1635f33..6fc3524db 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -22,14 +22,14 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 var (
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index beda141f1..66c0d8bc8 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -9,7 +9,7 @@ go_library(
     ],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 1305ad01d..e35d52d17 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -20,7 +20,7 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
index 12053a5b6..efd5974c4 100644
--- a/pkg/sentry/fsimpl/testutil/BUILD
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -12,10 +12,10 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/cpuid",
         "//pkg/fspath",
         "//pkg/memutil",
-        "//pkg/sentry/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
@@ -27,9 +27,9 @@ go_library(
         "//pkg/sentry/platform/kvm",
         "//pkg/sentry/platform/ptrace",
         "//pkg/sentry/time",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/sync",
+        "//pkg/usermem",
         "@com_github_google_go-cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index 295da2d52..89f8c4915 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -21,9 +21,9 @@ import (
 	"runtime"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/memutil"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index 2a723a89f..1c98335c1 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -24,12 +24,12 @@ import (
 
 	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // System represents the context for a single test.
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 857e98bc5..fb436860c 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -30,10 +30,11 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
+        "//pkg/context",
         "//pkg/fspath",
         "//pkg/log",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/kernel",
@@ -43,12 +44,11 @@ go_library(
         "//pkg/sentry/memmap",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
 
@@ -59,10 +59,10 @@ go_test(
     deps = [
         ":tmpfs",
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fspath",
         "//pkg/refs",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/tmpfs",
         "//pkg/sentry/kernel/auth",
@@ -82,13 +82,13 @@ go_test(
     library = ":tmpfs",
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fspath",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/contexttest",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index d88c83499..54241c8e8 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -21,10 +21,10 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 887ca2619..dc0d27cf9 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -16,7 +16,7 @@ package tmpfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index d726f03c5..5ee9cf1e9 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -19,8 +19,8 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 482aabd52..0c57fdca3 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -16,11 +16,11 @@ package tmpfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type namedPipe struct {
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index 70b42a6ec..5ee7f2a72 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -19,13 +19,13 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const fileName = "mypipe"
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 7c633c1b0..e9e6faf67 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -20,17 +20,17 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type regularFile struct {
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index 034a29fdb..32552e261 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -22,12 +22,12 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // nextFileID is used to generate unique file names.
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 515f033f2..88dbd6e35 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -29,7 +29,7 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD
index a145a5ca3..61c78569d 100644
--- a/pkg/sentry/hostmm/BUILD
+++ b/pkg/sentry/hostmm/BUILD
@@ -12,6 +12,6 @@ go_library(
     deps = [
         "//pkg/fd",
         "//pkg/log",
-        "//pkg/sentry/usermem",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/hostmm/hostmm.go b/pkg/sentry/hostmm/hostmm.go
index 19335ca73..506c7864a 100644
--- a/pkg/sentry/hostmm/hostmm.go
+++ b/pkg/sentry/hostmm/hostmm.go
@@ -24,7 +24,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // NotifyCurrentMemcgPressureCallback requests that f is called whenever the
diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index aa621b724..334432abf 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -13,7 +13,7 @@ go_library(
         "test_stack.go",
     ],
     deps = [
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/tcpip/stack",
     ],
 )
diff --git a/pkg/sentry/inet/context.go b/pkg/sentry/inet/context.go
index 4eda7dd1f..e8cc1bffd 100644
--- a/pkg/sentry/inet/context.go
+++ b/pkg/sentry/inet/context.go
@@ -15,7 +15,7 @@
 package inet
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 )
 
 // contextID is the inet package's type for context.Context.Value keys.
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index cebaccd92..0738946d9 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -153,14 +153,15 @@ go_library(
         "//pkg/binary",
         "//pkg/bits",
         "//pkg/bpf",
+        "//pkg/context",
         "//pkg/cpuid",
         "//pkg/eventchannel",
         "//pkg/log",
         "//pkg/metric",
         "//pkg/refs",
+        "//pkg/safemem",
         "//pkg/secio",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/lock",
@@ -180,7 +181,6 @@ go_library(
         "//pkg/sentry/mm",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket/netlink/port",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/time",
@@ -188,7 +188,6 @@ go_library(
         "//pkg/sentry/unimpl:unimplemented_syscall_go_proto",
         "//pkg/sentry/uniqueid",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/state",
         "//pkg/state/statefile",
         "//pkg/sync",
@@ -196,6 +195,7 @@ go_library(
         "//pkg/syserror",
         "//pkg/tcpip",
         "//pkg/tcpip/stack",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -212,9 +212,9 @@ go_test(
     library = ":kernel",
     deps = [
         "//pkg/abi",
+        "//pkg/context",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/filetest",
         "//pkg/sentry/kernel/sched",
@@ -222,8 +222,8 @@ go_test(
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/time",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 64537c9be..2bc49483a 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -61,8 +61,8 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/bits",
+        "//pkg/context",
         "//pkg/log",
-        "//pkg/sentry/context",
         "//pkg/sync",
         "//pkg/syserror",
     ],
diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go
index 5c0e7d6b6..ef5723127 100644
--- a/pkg/sentry/kernel/auth/context.go
+++ b/pkg/sentry/kernel/auth/context.go
@@ -15,7 +15,7 @@
 package auth
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 )
 
 // contextID is the auth package's type for context.Context.Value keys.
diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go
index 3d74bc610..28cbe159d 100644
--- a/pkg/sentry/kernel/auth/id_map.go
+++ b/pkg/sentry/kernel/auth/id_map.go
@@ -16,7 +16,7 @@ package auth
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
index 3c9dceaba..0c40bf315 100644
--- a/pkg/sentry/kernel/context.go
+++ b/pkg/sentry/kernel/context.go
@@ -17,8 +17,8 @@ package kernel
 import (
 	"time"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 )
 
 // contextID is the kernel package's type for context.Context.Value keys.
diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD
index daff608d7..9d26392c0 100644
--- a/pkg/sentry/kernel/contexttest/BUILD
+++ b/pkg/sentry/kernel/contexttest/BUILD
@@ -8,8 +8,8 @@ go_library(
     srcs = ["contexttest.go"],
     visibility = ["//pkg/sentry:internal"],
     deps = [
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/context",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/kernel",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
diff --git a/pkg/sentry/kernel/contexttest/contexttest.go b/pkg/sentry/kernel/contexttest/contexttest.go
index 82f9d8922..22c340e56 100644
--- a/pkg/sentry/kernel/contexttest/contexttest.go
+++ b/pkg/sentry/kernel/contexttest/contexttest.go
@@ -19,8 +19,8 @@ package contexttest
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD
index 19e16ab3a..dedf0fa15 100644
--- a/pkg/sentry/kernel/epoll/BUILD
+++ b/pkg/sentry/kernel/epoll/BUILD
@@ -24,13 +24,13 @@ go_library(
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
+        "//pkg/context",
         "//pkg/refs",
-        "//pkg/sentry/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/fsutil",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -43,7 +43,7 @@ go_test(
     ],
     library = ":epoll",
     deps = [
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fs/filetest",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index e84742993..8bffb78fc 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -20,13 +20,13 @@ import (
 	"fmt"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go
index 4a20d4c82..22630e9c5 100644
--- a/pkg/sentry/kernel/epoll/epoll_test.go
+++ b/pkg/sentry/kernel/epoll/epoll_test.go
@@ -17,7 +17,7 @@ package epoll
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs/filetest"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD
index ee2d74864..9983a32e5 100644
--- a/pkg/sentry/kernel/eventfd/BUILD
+++ b/pkg/sentry/kernel/eventfd/BUILD
@@ -8,14 +8,14 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fdnotifier",
-        "//pkg/sentry/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/fsutil",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -26,8 +26,8 @@ go_test(
     srcs = ["eventfd_test.go"],
     library = ":eventfd",
     deps = [
-        "//pkg/sentry/context/contexttest",
-        "//pkg/sentry/usermem",
+        "//pkg/sentry/contexttest",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go
index 687690679..87951adeb 100644
--- a/pkg/sentry/kernel/eventfd/eventfd.go
+++ b/pkg/sentry/kernel/eventfd/eventfd.go
@@ -21,14 +21,14 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/kernel/eventfd/eventfd_test.go b/pkg/sentry/kernel/eventfd/eventfd_test.go
index 018c7f3ef..9b4892f74 100644
--- a/pkg/sentry/kernel/eventfd/eventfd_test.go
+++ b/pkg/sentry/kernel/eventfd/eventfd_test.go
@@ -17,8 +17,8 @@ package eventfd
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 0ad4135b3..9460bb235 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -22,8 +22,8 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index 86164df49..261b815f2 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -18,8 +18,8 @@ import (
 	"runtime"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/filetest"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index f413d8ae2..c5021f2db 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -36,12 +36,12 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/log",
-        "//pkg/sentry/context",
         "//pkg/sentry/memmap",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
 
@@ -51,7 +51,7 @@ go_test(
     srcs = ["futex_test.go"],
     library = ":futex",
     deps = [
-        "//pkg/sentry/usermem",
         "//pkg/sync",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
index d1931c8f4..732e66da4 100644
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -20,9 +20,9 @@ package futex
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // KeyKind indicates the type of a Key.
diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go
index c23126ca5..7c5c7665b 100644
--- a/pkg/sentry/kernel/futex/futex_test.go
+++ b/pkg/sentry/kernel/futex/futex_test.go
@@ -22,8 +22,8 @@ import (
 	"testing"
 	"unsafe"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // testData implements the Target interface, and allows us to
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index c85e97fef..7b90fac5a 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -40,12 +40,12 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/eventchannel"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 2c7b6206f..4c049d5b4 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -33,16 +33,16 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
+        "//pkg/context",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
-        "//pkg/sentry/safemem",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -57,11 +57,11 @@ go_test(
     ],
     library = ":pipe",
     deps = [
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/context",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
-        "//pkg/sentry/usermem",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go
index 1c0f34269..fe3be5dbd 100644
--- a/pkg/sentry/kernel/pipe/buffer.go
+++ b/pkg/sentry/kernel/pipe/buffer.go
@@ -17,7 +17,7 @@ package pipe
 import (
 	"io"
 
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
diff --git a/pkg/sentry/kernel/pipe/buffer_test.go b/pkg/sentry/kernel/pipe/buffer_test.go
index ee1b90115..4d54b8b8f 100644
--- a/pkg/sentry/kernel/pipe/buffer_test.go
+++ b/pkg/sentry/kernel/pipe/buffer_test.go
@@ -18,7 +18,7 @@ import (
 	"testing"
 	"unsafe"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func TestBufferSize(t *testing.T) {
diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go
index 716f589af..4b688c627 100644
--- a/pkg/sentry/kernel/pipe/node.go
+++ b/pkg/sentry/kernel/pipe/node.go
@@ -16,7 +16,7 @@ package pipe
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sync"
diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go
index 16fa80abe..ab75a87ff 100644
--- a/pkg/sentry/kernel/pipe/node_test.go
+++ b/pkg/sentry/kernel/pipe/node_test.go
@@ -18,11 +18,11 @@ import (
 	"testing"
 	"time"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type sleeper struct {
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index e4fd7d420..08410283f 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -20,7 +20,7 @@ import (
 	"sync/atomic"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go
index e3a14b665..bda739dbe 100644
--- a/pkg/sentry/kernel/pipe/pipe_test.go
+++ b/pkg/sentry/kernel/pipe/pipe_test.go
@@ -18,9 +18,9 @@ import (
 	"bytes"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index 8394eb78b..80158239e 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -21,10 +21,10 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/amutex"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go
index b4d29fc77..b2b5691ee 100644
--- a/pkg/sentry/kernel/pipe/reader_writer.go
+++ b/pkg/sentry/kernel/pipe/reader_writer.go
@@ -17,11 +17,11 @@ package pipe
 import (
 	"io"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // ReaderWriter satisfies the FileOperations interface and services both
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index 6f83e3cee..a5675bd70 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -16,12 +16,12 @@ package pipe
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 3be171cdc..35ad97d5d 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -20,8 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // ptraceOptions are the subset of options controlling a task's ptrace behavior
diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go
index 5514cf432..cef1276ec 100644
--- a/pkg/sentry/kernel/ptrace_amd64.go
+++ b/pkg/sentry/kernel/ptrace_amd64.go
@@ -18,8 +18,8 @@ package kernel
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // ptraceArch implements arch-specific ptrace commands.
diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go
index 61e412911..d971b96b3 100644
--- a/pkg/sentry/kernel/ptrace_arm64.go
+++ b/pkg/sentry/kernel/ptrace_arm64.go
@@ -17,8 +17,8 @@
 package kernel
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // ptraceArch implements arch-specific ptrace commands.
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index b14429854..efebfd872 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -19,8 +19,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Restartable sequences.
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
index 2347dcf36..c38c5a40c 100644
--- a/pkg/sentry/kernel/seccomp.go
+++ b/pkg/sentry/kernel/seccomp.go
@@ -21,8 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/bpf"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const maxSyscallFilterInstructions = 1 << 15
diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD
index 76e19b551..65e5427c1 100644
--- a/pkg/sentry/kernel/semaphore/BUILD
+++ b/pkg/sentry/kernel/semaphore/BUILD
@@ -24,8 +24,8 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/log",
-        "//pkg/sentry/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
@@ -41,8 +41,8 @@ go_test(
     library = ":semaphore",
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/context",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/kernel/auth",
         "//pkg/syserror",
     ],
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index 18299814e..1000f3287 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -19,8 +19,8 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go
index c235f6ca4..e47acefdf 100644
--- a/pkg/sentry/kernel/semaphore/semaphore_test.go
+++ b/pkg/sentry/kernel/semaphore/semaphore_test.go
@@ -18,8 +18,8 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index 5547c5abf..bfd779837 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -11,9 +11,9 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/log",
         "//pkg/refs",
-        "//pkg/sentry/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/auth",
@@ -22,8 +22,8 @@ go_library(
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 8ddef7eb8..208569057 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -37,9 +37,9 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -47,9 +47,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Key represents a shm segment key. Analogous to a file name.
diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD
index 5d44773d4..3eb78e91b 100644
--- a/pkg/sentry/kernel/signalfd/BUILD
+++ b/pkg/sentry/kernel/signalfd/BUILD
@@ -9,14 +9,14 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/kernel",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go
index 28be4a939..8243bb93e 100644
--- a/pkg/sentry/kernel/signalfd/signalfd.go
+++ b/pkg/sentry/kernel/signalfd/signalfd.go
@@ -18,14 +18,14 @@ package signalfd
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index d2d01add4..93c4fe969 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -21,8 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // maxSyscallNum is the highest supported syscall number.
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 978d66da8..95adf2778 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -21,8 +21,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bpf"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -35,8 +35,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 247bd4aba..53d4d211b 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -17,8 +17,8 @@ package kernel
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bpf"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // SharingOptions controls what resources are shared by a new task created by
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index bb5560acf..2d6e7733c 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -18,13 +18,13 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/loader"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC)
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
index c211b5b74..a53e77c9f 100644
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@@ -16,7 +16,7 @@ package kernel
 
 import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Futex returns t's futex manager.
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index 0fb3661de..41259210c 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -20,7 +20,7 @@ import (
 	"sort"
 
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const (
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 6357273d3..5568c91bc 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -26,7 +26,7 @@ import (
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // A taskRunState is a reified state in the task state machine. See README.md
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 39cd1340d..8802db142 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -26,8 +26,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 58af16ee2..de838beef 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -21,8 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // TaskConfig defines the configuration of a new Task (see below).
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index 3180f5560..d555d69a8 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -25,8 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index 518bfe1bd..2bf3ce8a8 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -18,8 +18,8 @@ import (
 	"math"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // MAX_RW_COUNT is the maximum size in bytes of a single read or write.
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD
index d49594d9f..7ba7dc50c 100644
--- a/pkg/sentry/kernel/time/BUILD
+++ b/pkg/sentry/kernel/time/BUILD
@@ -11,7 +11,7 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/waiter",
diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go
index 8ef483dd3..00b729d88 100644
--- a/pkg/sentry/kernel/time/context.go
+++ b/pkg/sentry/kernel/time/context.go
@@ -15,7 +15,7 @@
 package time
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 )
 
 // contextID is the time package's type for context.Context.Value keys.
diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go
index 849c5b646..cf2f7ca72 100644
--- a/pkg/sentry/kernel/timekeeper_test.go
+++ b/pkg/sentry/kernel/timekeeper_test.go
@@ -17,12 +17,12 @@ package kernel
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // mockClocks is a sentrytime.Clocks that simply returns the times in the
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
index fdd10c56c..f1b3c212c 100644
--- a/pkg/sentry/kernel/vdso.go
+++ b/pkg/sentry/kernel/vdso.go
@@ -18,10 +18,10 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // vdsoParams are the parameters exposed to the VDSO.
diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD
index 67869757f..cf591c4c1 100644
--- a/pkg/sentry/limits/BUILD
+++ b/pkg/sentry/limits/BUILD
@@ -12,7 +12,7 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sync",
     ],
 )
diff --git a/pkg/sentry/limits/context.go b/pkg/sentry/limits/context.go
index 6972749ed..77e1fe217 100644
--- a/pkg/sentry/limits/context.go
+++ b/pkg/sentry/limits/context.go
@@ -15,7 +15,7 @@
 package limits
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 )
 
 // contextID is the limit package's type for context.Context.Value keys.
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
index d4ad2bd6c..23790378a 100644
--- a/pkg/sentry/loader/BUILD
+++ b/pkg/sentry/loader/BUILD
@@ -24,11 +24,12 @@ go_library(
         "//pkg/abi",
         "//pkg/abi/linux",
         "//pkg/binary",
+        "//pkg/context",
         "//pkg/cpuid",
         "//pkg/log",
         "//pkg/rand",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/fsutil",
@@ -37,12 +38,11 @@ go_library(
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
         "//pkg/sentry/pgalloc",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/uniqueid",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/syserr",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 6299a3e2f..122ed05c2 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -23,16 +23,16 @@ import (
 	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const (
diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go
index ccf909cac..098a45d36 100644
--- a/pkg/sentry/loader/interpreter.go
+++ b/pkg/sentry/loader/interpreter.go
@@ -18,10 +18,10 @@ import (
 	"bytes"
 	"io"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const (
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index b03eeb005..9a613d6b7 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -24,16 +24,16 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // LoadArgs holds specifications for an executable file to be loaded.
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index df8a81907..52f446ed7 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -20,20 +20,20 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD
index f9a65f086..a98b66de1 100644
--- a/pkg/sentry/memmap/BUILD
+++ b/pkg/sentry/memmap/BUILD
@@ -38,11 +38,11 @@ go_library(
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
+        "//pkg/context",
         "//pkg/log",
-        "//pkg/sentry/context",
         "//pkg/sentry/platform",
-        "//pkg/sentry/usermem",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
 
@@ -51,5 +51,5 @@ go_test(
     size = "small",
     srcs = ["mapping_set_test.go"],
     library = ":memmap",
-    deps = ["//pkg/sentry/usermem"],
+    deps = ["//pkg/usermem"],
 )
diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go
index 0a5b7ce45..d609c1ae0 100644
--- a/pkg/sentry/memmap/mapping_set.go
+++ b/pkg/sentry/memmap/mapping_set.go
@@ -18,7 +18,7 @@ import (
 	"fmt"
 	"math"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // MappingSet maps offsets into a Mappable to mappings of those offsets. It is
diff --git a/pkg/sentry/memmap/mapping_set_test.go b/pkg/sentry/memmap/mapping_set_test.go
index f9b11a59c..d39efe38f 100644
--- a/pkg/sentry/memmap/mapping_set_test.go
+++ b/pkg/sentry/memmap/mapping_set_test.go
@@ -18,7 +18,7 @@ import (
 	"reflect"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type testMappingSpace struct {
diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go
index 16a722a13..c6db9fc8f 100644
--- a/pkg/sentry/memmap/memmap.go
+++ b/pkg/sentry/memmap/memmap.go
@@ -18,9 +18,9 @@ package memmap
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Mappable represents a memory-mappable object, a mutable mapping from uint64
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index bd6399fa2..e5729ced5 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -27,7 +27,7 @@ go_template_instance(
         "minDegree": "8",
     },
     imports = {
-        "usermem": "gvisor.dev/gvisor/pkg/sentry/usermem",
+        "usermem": "gvisor.dev/gvisor/pkg/usermem",
     },
     package = "mm",
     prefix = "vma",
@@ -47,7 +47,7 @@ go_template_instance(
         "minDegree": "8",
     },
     imports = {
-        "usermem": "gvisor.dev/gvisor/pkg/sentry/usermem",
+        "usermem": "gvisor.dev/gvisor/pkg/usermem",
     },
     package = "mm",
     prefix = "pma",
@@ -99,10 +99,12 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/atomicbitops",
+        "//pkg/context",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/safecopy",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/proc/seqfile",
         "//pkg/sentry/kernel/auth",
@@ -112,13 +114,11 @@ go_library(
         "//pkg/sentry/memmap",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
-        "//pkg/sentry/platform/safecopy",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/tcpip/buffer",
+        "//pkg/usermem",
     ],
 )
 
@@ -128,14 +128,14 @@ go_test(
     srcs = ["mm_test.go"],
     library = ":mm",
     deps = [
+        "//pkg/context",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/limits",
         "//pkg/sentry/memmap",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
-        "//pkg/sentry/usermem",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index cfebcfd42..e58a63deb 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -20,7 +20,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/atomicbitops"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // AddressSpace returns the platform.AddressSpace bound to mm.
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index 4b48866ad..cb29d94b0 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -16,15 +16,15 @@ package mm
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // aioManager creates and manages asynchronous I/O contexts.
diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go
index df9adf708..c273c982e 100644
--- a/pkg/sentry/mm/debug.go
+++ b/pkg/sentry/mm/debug.go
@@ -18,7 +18,7 @@ import (
 	"bytes"
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 )
 
 const (
diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go
index b03e7d020..fa776f9c6 100644
--- a/pkg/sentry/mm/io.go
+++ b/pkg/sentry/mm/io.go
@@ -15,11 +15,11 @@
 package mm
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // There are two supported ways to copy data to/from application virtual
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index 4e9ca1de6..47b8fbf43 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -19,13 +19,13 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/atomicbitops"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go
index d2a01d48a..f550acae0 100644
--- a/pkg/sentry/mm/metadata.go
+++ b/pkg/sentry/mm/metadata.go
@@ -17,7 +17,7 @@ package mm
 import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Dumpability describes if and how core dumps should be created.
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 78cc9e6e4..09e582dd3 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -35,14 +35,14 @@
 package mm
 
 import (
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // MemoryManager implements a virtual address space.
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go
index 4d2bfaaed..edacca741 100644
--- a/pkg/sentry/mm/mm_test.go
+++ b/pkg/sentry/mm/mm_test.go
@@ -17,15 +17,15 @@ package mm
 import (
 	"testing"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func testMemoryManager(ctx context.Context) *MemoryManager {
diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go
index c976c6f45..62e4c20af 100644
--- a/pkg/sentry/mm/pma.go
+++ b/pkg/sentry/mm/pma.go
@@ -17,14 +17,14 @@ package mm
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safecopy"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/platform/safecopy"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // existingPMAsLocked checks that pmas exist for all addresses in ar, and
diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go
index 79610acb7..1ab92f046 100644
--- a/pkg/sentry/mm/procfs.go
+++ b/pkg/sentry/mm/procfs.go
@@ -19,10 +19,10 @@ import (
 	"fmt"
 	"strings"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const (
diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go
index 93259c5a3..f56215d9a 100644
--- a/pkg/sentry/mm/save_restore.go
+++ b/pkg/sentry/mm/save_restore.go
@@ -17,7 +17,7 @@ package mm
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 )
 
 // InvalidateUnsavable invokes memmap.Mappable.InvalidateUnsavable on all
diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go
index b9f2d23e5..6432731d4 100644
--- a/pkg/sentry/mm/shm.go
+++ b/pkg/sentry/mm/shm.go
@@ -15,10 +15,10 @@
 package mm
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/shm"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // DetachShm unmaps a sysv shared memory segment.
diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go
index ea2d7af74..9ad52082d 100644
--- a/pkg/sentry/mm/special_mappable.go
+++ b/pkg/sentry/mm/special_mappable.go
@@ -15,14 +15,14 @@
 package mm
 
 import (
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index c2466c988..c5dfa5972 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -19,14 +19,14 @@ import (
 	mrand "math/rand"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // HandleUserFault handles an application page fault. sp is the faulting
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
index f2fd70799..9a14e69e6 100644
--- a/pkg/sentry/mm/vma.go
+++ b/pkg/sentry/mm/vma.go
@@ -18,13 +18,13 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Preconditions: mm.mappingMu must be locked for writing. opts must be valid
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 02385a3ce..1eeb9f317 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -61,18 +61,18 @@ go_library(
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
+        "//pkg/context",
         "//pkg/log",
         "//pkg/memutil",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/hostmm",
         "//pkg/sentry/platform",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/state",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
 
@@ -81,5 +81,5 @@ go_test(
     size = "small",
     srcs = ["pgalloc_test.go"],
     library = ":pgalloc",
-    deps = ["//pkg/sentry/usermem"],
+    deps = ["//pkg/usermem"],
 )
diff --git a/pkg/sentry/pgalloc/context.go b/pkg/sentry/pgalloc/context.go
index 11ccf897b..d25215418 100644
--- a/pkg/sentry/pgalloc/context.go
+++ b/pkg/sentry/pgalloc/context.go
@@ -15,7 +15,7 @@
 package pgalloc
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 )
 
 // contextID is this package's type for context.Context.Value keys.
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index c99e023d9..577e9306a 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -29,15 +29,15 @@ import (
 	"syscall"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/hostmm"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // MemoryFile is a platform.File whose pages may be allocated to arbitrary
diff --git a/pkg/sentry/pgalloc/pgalloc_test.go b/pkg/sentry/pgalloc/pgalloc_test.go
index 428e6a859..293f22c6b 100644
--- a/pkg/sentry/pgalloc/pgalloc_test.go
+++ b/pkg/sentry/pgalloc/pgalloc_test.go
@@ -17,7 +17,7 @@ package pgalloc
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const (
diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go
index aafce1d00..f8385c146 100644
--- a/pkg/sentry/pgalloc/save_restore.go
+++ b/pkg/sentry/pgalloc/save_restore.go
@@ -25,8 +25,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/state"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // SaveTo writes f's state to the given stream.
diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD
index 006450b2d..453241eca 100644
--- a/pkg/sentry/platform/BUILD
+++ b/pkg/sentry/platform/BUILD
@@ -26,14 +26,14 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/atomicbitops",
+        "//pkg/context",
         "//pkg/log",
+        "//pkg/safecopy",
+        "//pkg/safemem",
         "//pkg/seccomp",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
-        "//pkg/sentry/platform/safecopy",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/platform/context.go b/pkg/sentry/platform/context.go
index e29bc4485..6759cda65 100644
--- a/pkg/sentry/platform/context.go
+++ b/pkg/sentry/platform/context.go
@@ -15,7 +15,7 @@
 package platform
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 )
 
 // contextID is the auth package's type for context.Context.Value keys.
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index a4532a766..159f7eafd 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -44,16 +44,16 @@ go_library(
         "//pkg/cpuid",
         "//pkg/log",
         "//pkg/procid",
+        "//pkg/safecopy",
         "//pkg/seccomp",
         "//pkg/sentry/arch",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/interrupt",
         "//pkg/sentry/platform/ring0",
         "//pkg/sentry/platform/ring0/pagetables",
-        "//pkg/sentry/platform/safecopy",
         "//pkg/sentry/time",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
+        "//pkg/usermem",
     ],
 )
 
@@ -75,6 +75,6 @@ go_test(
         "//pkg/sentry/platform/kvm/testutil",
         "//pkg/sentry/platform/ring0",
         "//pkg/sentry/platform/ring0/pagetables",
-        "//pkg/sentry/usermem",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index a25f3c449..be213bfe8 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -20,8 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/atomicbitops"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // dirtySet tracks vCPUs for invalidation.
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
index 30dbb74d6..35cd55fef 100644
--- a/pkg/sentry/platform/kvm/bluepill.go
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -19,9 +19,9 @@ import (
 	"reflect"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/safecopy"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
-	"gvisor.dev/gvisor/pkg/sentry/platform/safecopy"
 )
 
 // bluepill enters guest mode.
diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go
index f6459cda9..e34f46aeb 100644
--- a/pkg/sentry/platform/kvm/bluepill_fault.go
+++ b/pkg/sentry/platform/kvm/bluepill_fault.go
@@ -18,7 +18,7 @@ import (
 	"sync/atomic"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const (
diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go
index 99450d22d..c769ac7b4 100644
--- a/pkg/sentry/platform/kvm/context.go
+++ b/pkg/sentry/platform/kvm/context.go
@@ -19,7 +19,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/interrupt"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // context is an implementation of the platform context.
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index d337c5c7c..972ba85c3 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -23,8 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // KVM represents a lightweight VM context.
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index 30df725d4..c42752d50 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -27,7 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 var dummyFPState = (*byte)(arch.NewFloatingPointData())
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index e6d912168..8076c7529 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -25,8 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/procid"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // machine contains state associated with the VM as a whole.
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 873e39dc7..923ce3909 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -26,7 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // initArchState initializes architecture-specific state.
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index 3b1f20219..09552837a 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -20,7 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type vCPUArchState struct {
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 3f2f97a6b..1c8384e6b 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -26,7 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // setMemoryRegion initializes a region.
diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go
index 91de5dab1..f7fa2f98d 100644
--- a/pkg/sentry/platform/kvm/physical_map.go
+++ b/pkg/sentry/platform/kvm/physical_map.go
@@ -21,7 +21,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type region struct {
diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go
index 2d68855ef..c8897d34f 100644
--- a/pkg/sentry/platform/kvm/virtual_map.go
+++ b/pkg/sentry/platform/kvm/virtual_map.go
@@ -22,7 +22,7 @@ import (
 	"regexp"
 	"strconv"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type virtualRegion struct {
diff --git a/pkg/sentry/platform/kvm/virtual_map_test.go b/pkg/sentry/platform/kvm/virtual_map_test.go
index 6a2f145be..327e2be4f 100644
--- a/pkg/sentry/platform/kvm/virtual_map_test.go
+++ b/pkg/sentry/platform/kvm/virtual_map_test.go
@@ -18,7 +18,7 @@ import (
 	"syscall"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type checker struct {
diff --git a/pkg/sentry/platform/mmap_min_addr.go b/pkg/sentry/platform/mmap_min_addr.go
index 999787462..091c2e365 100644
--- a/pkg/sentry/platform/mmap_min_addr.go
+++ b/pkg/sentry/platform/mmap_min_addr.go
@@ -20,7 +20,7 @@ import (
 	"strconv"
 	"strings"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // systemMMapMinAddrSource is the source file.
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go
index ec22dbf87..2ca696382 100644
--- a/pkg/sentry/platform/platform.go
+++ b/pkg/sentry/platform/platform.go
@@ -22,10 +22,10 @@ import (
 	"os"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/seccomp"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Platform provides abstractions for execution contexts (Context,
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index 3bcc5e040..95abd321e 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -25,14 +25,14 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/log",
         "//pkg/procid",
+        "//pkg/safecopy",
         "//pkg/seccomp",
         "//pkg/sentry/arch",
         "//pkg/sentry/hostcpu",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/interrupt",
-        "//pkg/sentry/platform/safecopy",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
+        "//pkg/usermem",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
index bb0e03880..03adb624b 100644
--- a/pkg/sentry/platform/ptrace/ptrace.go
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -51,8 +51,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/interrupt"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 var (
diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
index 72c7ec564..6c0ed7b3e 100644
--- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go
+++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
@@ -20,7 +20,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // getRegs gets the general purpose register set.
diff --git a/pkg/sentry/platform/ptrace/stub_unsafe.go b/pkg/sentry/platform/ptrace/stub_unsafe.go
index aa1b87237..341dde143 100644
--- a/pkg/sentry/platform/ptrace/stub_unsafe.go
+++ b/pkg/sentry/platform/ptrace/stub_unsafe.go
@@ -19,8 +19,8 @@ import (
 	"syscall"
 	"unsafe"
 
-	"gvisor.dev/gvisor/pkg/sentry/platform/safecopy"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/safecopy"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // stub is defined in arch-specific assembly.
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 15dc46a5b..31b7cec53 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -25,8 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/procid"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Linux kernel errnos which "should never be seen by user programs", but will
diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD
index 6dee8fcc5..934b6fbcd 100644
--- a/pkg/sentry/platform/ring0/BUILD
+++ b/pkg/sentry/platform/ring0/BUILD
@@ -78,6 +78,6 @@ go_library(
     deps = [
         "//pkg/cpuid",
         "//pkg/sentry/platform/ring0/pagetables",
-        "//pkg/sentry/usermem",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go
index 9dae0dccb..9c6c2cf5c 100644
--- a/pkg/sentry/platform/ring0/defs_amd64.go
+++ b/pkg/sentry/platform/ring0/defs_amd64.go
@@ -18,7 +18,7 @@ package ring0
 
 import (
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 var (
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go
index a850ce6cf..1583dda12 100644
--- a/pkg/sentry/platform/ring0/defs_arm64.go
+++ b/pkg/sentry/platform/ring0/defs_arm64.go
@@ -18,7 +18,7 @@ package ring0
 
 import (
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 var (
diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD
index 147311ed3..4cae10459 100644
--- a/pkg/sentry/platform/ring0/gen_offsets/BUILD
+++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD
@@ -28,6 +28,6 @@ go_binary(
     deps = [
         "//pkg/cpuid",
         "//pkg/sentry/platform/ring0/pagetables",
-        "//pkg/sentry/usermem",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 8b5cdd6c1..971eed7fa 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -93,8 +93,8 @@ go_library(
         "//pkg/sentry/platform/ring0:__subpackages__",
     ],
     deps = [
-        "//pkg/sentry/usermem",
         "//pkg/sync",
+        "//pkg/usermem",
     ],
 )
 
@@ -108,5 +108,5 @@ go_test(
         "walker_check.go",
     ],
     library = ":pagetables",
-    deps = ["//pkg/sentry/usermem"],
+    deps = ["//pkg/usermem"],
 )
diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go
index a90394a33..d08bfdeb3 100644
--- a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go
+++ b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go
@@ -17,7 +17,7 @@ package pagetables
 import (
 	"unsafe"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // newAlignedPTEs returns a set of aligned PTEs.
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
index 30c64a372..87e88e97d 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -21,7 +21,7 @@
 package pagetables
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // PageTables is a set of page tables.
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
index e78424766..78510ebed 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
@@ -19,7 +19,7 @@ package pagetables
 import (
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // archPageTables is architecture-specific data.
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go
index 35e917526..54e8e554f 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go
@@ -19,7 +19,7 @@ package pagetables
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func Test2MAnd4K(t *testing.T) {
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go
index 254116233..2f73d424f 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go
@@ -19,7 +19,7 @@ package pagetables
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func Test2MAnd4K(t *testing.T) {
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
index 6e95ad2b9..5c88d087d 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
@@ -17,7 +17,7 @@ package pagetables
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type mapping struct {
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
index 3e2383c5e..dcf061df9 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
@@ -19,7 +19,7 @@ package pagetables
 import (
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // archPageTables is architecture-specific data.
diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD
deleted file mode 100644
index b8747585b..000000000
--- a/pkg/sentry/platform/safecopy/BUILD
+++ /dev/null
@@ -1,29 +0,0 @@
-load("//tools:defs.bzl", "go_library", "go_test")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "safecopy",
-    srcs = [
-        "atomic_amd64.s",
-        "atomic_arm64.s",
-        "memclr_amd64.s",
-        "memclr_arm64.s",
-        "memcpy_amd64.s",
-        "memcpy_arm64.s",
-        "safecopy.go",
-        "safecopy_unsafe.go",
-        "sighandler_amd64.s",
-        "sighandler_arm64.s",
-    ],
-    visibility = ["//pkg/sentry:internal"],
-    deps = ["//pkg/syserror"],
-)
-
-go_test(
-    name = "safecopy_test",
-    srcs = [
-        "safecopy_test.go",
-    ],
-    library = ":safecopy",
-)
diff --git a/pkg/sentry/platform/safecopy/LICENSE b/pkg/sentry/platform/safecopy/LICENSE
deleted file mode 100644
index 6a66aea5e..000000000
--- a/pkg/sentry/platform/safecopy/LICENSE
+++ /dev/null
@@ -1,27 +0,0 @@
-Copyright (c) 2009 The Go Authors. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-   * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-   * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-   * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/pkg/sentry/platform/safecopy/atomic_amd64.s b/pkg/sentry/platform/safecopy/atomic_amd64.s
deleted file mode 100644
index a0cd78f33..000000000
--- a/pkg/sentry/platform/safecopy/atomic_amd64.s
+++ /dev/null
@@ -1,136 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "textflag.h"
-
-// handleSwapUint32Fault returns the value stored in DI. Control is transferred
-// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal
-// number stored in DI.
-//
-// It must have the same frame configuration as swapUint32 so that it can undo
-// any potential call frame set up by the assembler.
-TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24
-  MOVL DI, sig+20(FP)
-  RET
-
-// swapUint32 atomically stores new into *addr and returns (the previous *addr
-// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
-// value of old is unspecified, and sig is the number of the signal that was
-// received.
-//
-// Preconditions: addr must be aligned to a 4-byte boundary.
-//
-//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32)
-TEXT ·swapUint32(SB), NOSPLIT, $0-24
-  // Store 0 as the returned signal number. If we run to completion,
-  // this is the value the caller will see; if a signal is received,
-  // handleSwapUint32Fault will store a different value in this address.
-  MOVL $0, sig+20(FP)
-
-  MOVQ addr+0(FP), DI
-  MOVL new+8(FP), AX
-  XCHGL AX, 0(DI)
-  MOVL AX, old+16(FP)
-  RET
-
-// handleSwapUint64Fault returns the value stored in DI. Control is transferred
-// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal
-// number stored in DI.
-//
-// It must have the same frame configuration as swapUint64 so that it can undo
-// any potential call frame set up by the assembler.
-TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28
-  MOVL DI, sig+24(FP)
-  RET
-
-// swapUint64 atomically stores new into *addr and returns (the previous *addr
-// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
-// value of old is unspecified, and sig is the number of the signal that was
-// received.
-//
-// Preconditions: addr must be aligned to a 8-byte boundary.
-//
-//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32)
-TEXT ·swapUint64(SB), NOSPLIT, $0-28
-  // Store 0 as the returned signal number. If we run to completion,
-  // this is the value the caller will see; if a signal is received,
-  // handleSwapUint64Fault will store a different value in this address.
-  MOVL $0, sig+24(FP)
-
-  MOVQ addr+0(FP), DI
-  MOVQ new+8(FP), AX
-  XCHGQ AX, 0(DI)
-  MOVQ AX, old+16(FP)
-  RET
-
-// handleCompareAndSwapUint32Fault returns the value stored in DI. Control is
-// transferred to it when swapUint64 below receives SIGSEGV or SIGBUS, with the
-// signal number stored in DI.
-//
-// It must have the same frame configuration as compareAndSwapUint32 so that it
-// can undo any potential call frame set up by the assembler.
-TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24
-  MOVL DI, sig+20(FP)
-  RET
-
-// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
-// (the value previously stored at addr, 0). If a SIGSEGV or SIGBUS signal is
-// received during the operation, the value of prev is unspecified, and sig is
-// the number of the signal that was received.
-//
-// Preconditions: addr must be aligned to a 4-byte boundary.
-//
-//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32)
-TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24
-  // Store 0 as the returned signal number. If we run to completion, this is
-  // the value the caller will see; if a signal is received,
-  // handleCompareAndSwapUint32Fault will store a different value in this
-  // address.
-  MOVL $0, sig+20(FP)
-
-  MOVQ addr+0(FP), DI
-  MOVL old+8(FP), AX
-  MOVL new+12(FP), DX
-  LOCK
-  CMPXCHGL DX, 0(DI)
-  MOVL AX, prev+16(FP)
-  RET
-
-// handleLoadUint32Fault returns the value stored in DI. Control is transferred
-// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal
-// number stored in DI.
-//
-// It must have the same frame configuration as loadUint32 so that it can undo
-// any potential call frame set up by the assembler.
-TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16
-  MOVL DI, sig+12(FP)
-  RET
-
-// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS
-// signal is received, the value returned is unspecified, and sig is the number
-// of the signal that was received.
-//
-// Preconditions: addr must be aligned to a 4-byte boundary.
-//
-//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
-TEXT ·loadUint32(SB), NOSPLIT, $0-16
-  // Store 0 as the returned signal number. If we run to completion,
-  // this is the value the caller will see; if a signal is received,
-  // handleLoadUint32Fault will store a different value in this address.
-  MOVL $0, sig+12(FP)
-
-  MOVQ addr+0(FP), AX
-  MOVL (AX), BX
-  MOVL BX, val+8(FP)
-  RET
diff --git a/pkg/sentry/platform/safecopy/atomic_arm64.s b/pkg/sentry/platform/safecopy/atomic_arm64.s
deleted file mode 100644
index d58ed71f7..000000000
--- a/pkg/sentry/platform/safecopy/atomic_arm64.s
+++ /dev/null
@@ -1,126 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// handleSwapUint32Fault returns the value stored in R1. Control is transferred
-// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal
-// number stored in R1.
-//
-// It must have the same frame configuration as swapUint32 so that it can undo
-// any potential call frame set up by the assembler.
-TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24
-	MOVW R1, sig+20(FP)
-	RET
-
-// See the corresponding doc in safecopy_unsafe.go
-//
-// The code is derived from Go source runtime/internal/atomic.Xchg.
-//
-//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32)
-TEXT ·swapUint32(SB), NOSPLIT, $0-24
-	// Store 0 as the returned signal number. If we run to completion,
-	// this is the value the caller will see; if a signal is received,
-	// handleSwapUint32Fault will store a different value in this address.
-	MOVW $0, sig+20(FP)
-again:
-	MOVD addr+0(FP), R0
-	MOVW new+8(FP), R1
-	LDAXRW (R0), R2
-	STLXRW R1, (R0), R3
-	CBNZ R3, again
-	MOVW R2, old+16(FP)
-	RET
-
-// handleSwapUint64Fault returns the value stored in R1. Control is transferred
-// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal
-// number stored in R1.
-//
-// It must have the same frame configuration as swapUint64 so that it can undo
-// any potential call frame set up by the assembler.
-TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28
-	MOVW R1, sig+24(FP)
-	RET
-
-// See the corresponding doc in safecopy_unsafe.go
-//
-// The code is derived from Go source runtime/internal/atomic.Xchg64.
-//
-//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32)
-TEXT ·swapUint64(SB), NOSPLIT, $0-28
-	// Store 0 as the returned signal number. If we run to completion,
-	// this is the value the caller will see; if a signal is received,
-	// handleSwapUint64Fault will store a different value in this address.
-	MOVW $0, sig+24(FP)
-again:
-	MOVD addr+0(FP), R0
-	MOVD new+8(FP), R1
-	LDAXR (R0), R2
-	STLXR R1, (R0), R3
-	CBNZ R3, again
-	MOVD R2, old+16(FP)
-	RET
-
-// handleCompareAndSwapUint32Fault returns the value stored in R1. Control is
-// transferred to it when compareAndSwapUint32 below receives SIGSEGV or SIGBUS,
-// with the signal number stored in R1.
-//
-// It must have the same frame configuration as compareAndSwapUint32 so that it
-// can undo any potential call frame set up by the assembler.
-TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24
-	MOVW R1, sig+20(FP)
-	RET
-
-// See the corresponding doc in safecopy_unsafe.go
-//
-// The code is derived from Go source runtime/internal/atomic.Cas.
-//
-//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32)
-TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24
-	// Store 0 as the returned signal number. If we run to completion, this is
-	// the value the caller will see; if a signal is received,
-	// handleCompareAndSwapUint32Fault will store a different value in this
-	// address.
-	MOVW $0, sig+20(FP)
-
-	MOVD addr+0(FP), R0
-	MOVW old+8(FP), R1
-	MOVW new+12(FP), R2
-again:
-	LDAXRW (R0), R3
-	CMPW R1, R3
-	BNE done
-	STLXRW R2, (R0), R4
-	CBNZ R4, again
-done:
-	MOVW R3, prev+16(FP)
-	RET
-
-// handleLoadUint32Fault returns the value stored in DI. Control is transferred
-// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal
-// number stored in DI.
-//
-// It must have the same frame configuration as loadUint32 so that it can undo
-// any potential call frame set up by the assembler.
-TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16
-	MOVW R1, sig+12(FP)
-	RET
-
-// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS
-// signal is received, the value returned is unspecified, and sig is the number
-// of the signal that was received.
-//
-// Preconditions: addr must be aligned to a 4-byte boundary.
-//
-//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
-TEXT ·loadUint32(SB), NOSPLIT, $0-16
-	// Store 0 as the returned signal number. If we run to completion,
-	// this is the value the caller will see; if a signal is received,
-	// handleLoadUint32Fault will store a different value in this address.
-	MOVW $0, sig+12(FP)
-
-	MOVD addr+0(FP), R0
-	LDARW (R0), R1
-	MOVW R1, val+8(FP)
-	RET
diff --git a/pkg/sentry/platform/safecopy/memclr_amd64.s b/pkg/sentry/platform/safecopy/memclr_amd64.s
deleted file mode 100644
index 64cf32f05..000000000
--- a/pkg/sentry/platform/safecopy/memclr_amd64.s
+++ /dev/null
@@ -1,147 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// handleMemclrFault returns (the value stored in AX, the value stored in DI).
-// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS,
-// with the faulting address stored in AX and the signal number stored in DI.
-//
-// It must have the same frame configuration as memclr so that it can undo any
-// potential call frame set up by the assembler.
-TEXT handleMemclrFault(SB), NOSPLIT, $0-28
-	MOVQ	AX, addr+16(FP)
-	MOVL	DI, sig+24(FP)
-	RET
-
-// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS
-// signal is received during the write, it returns the address that caused the
-// fault and the number of the signal that was received. Otherwise, it returns
-// an unspecified address and a signal number of 0.
-//
-// Data is written in order, such that if a fault happens at address p, it is
-// safe to assume that all data before p-maxRegisterSize has already been
-// successfully written.
-//
-// The code is derived from runtime.memclrNoHeapPointers.
-//
-// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
-TEXT ·memclr(SB), NOSPLIT, $0-28
-	// Store 0 as the returned signal number. If we run to completion,
-	// this is the value the caller will see; if a signal is received,
-	// handleMemclrFault will store a different value in this address.
-	MOVL	$0, sig+24(FP)
-
-	MOVQ	ptr+0(FP), DI
-	MOVQ	n+8(FP), BX
-	XORQ	AX, AX
-
-	// MOVOU seems always faster than REP STOSQ.
-tail:
-	TESTQ	BX, BX
-	JEQ	_0
-	CMPQ	BX, $2
-	JBE	_1or2
-	CMPQ	BX, $4
-	JBE	_3or4
-	CMPQ	BX, $8
-	JB	_5through7
-	JE	_8
-	CMPQ	BX, $16
-	JBE	_9through16
-	PXOR	X0, X0
-	CMPQ	BX, $32
-	JBE	_17through32
-	CMPQ	BX, $64
-	JBE	_33through64
-	CMPQ	BX, $128
-	JBE	_65through128
-	CMPQ	BX, $256
-	JBE	_129through256
-	// TODO: use branch table and BSR to make this just a single dispatch
-	// TODO: for really big clears, use MOVNTDQ, even without AVX2.
-
-loop:
-	MOVOU	X0, 0(DI)
-	MOVOU	X0, 16(DI)
-	MOVOU	X0, 32(DI)
-	MOVOU	X0, 48(DI)
-	MOVOU	X0, 64(DI)
-	MOVOU	X0, 80(DI)
-	MOVOU	X0, 96(DI)
-	MOVOU	X0, 112(DI)
-	MOVOU	X0, 128(DI)
-	MOVOU	X0, 144(DI)
-	MOVOU	X0, 160(DI)
-	MOVOU	X0, 176(DI)
-	MOVOU	X0, 192(DI)
-	MOVOU	X0, 208(DI)
-	MOVOU	X0, 224(DI)
-	MOVOU	X0, 240(DI)
-	SUBQ	$256, BX
-	ADDQ	$256, DI
-	CMPQ	BX, $256
-	JAE	loop
-	JMP	tail
-
-_1or2:
-	MOVB	AX, (DI)
-	MOVB	AX, -1(DI)(BX*1)
-	RET
-_0:
-	RET
-_3or4:
-	MOVW	AX, (DI)
-	MOVW	AX, -2(DI)(BX*1)
-	RET
-_5through7:
-	MOVL	AX, (DI)
-	MOVL	AX, -4(DI)(BX*1)
-	RET
-_8:
-	// We need a separate case for 8 to make sure we clear pointers atomically.
-	MOVQ	AX, (DI)
-	RET
-_9through16:
-	MOVQ	AX, (DI)
-	MOVQ	AX, -8(DI)(BX*1)
-	RET
-_17through32:
-	MOVOU	X0, (DI)
-	MOVOU	X0, -16(DI)(BX*1)
-	RET
-_33through64:
-	MOVOU	X0, (DI)
-	MOVOU	X0, 16(DI)
-	MOVOU	X0, -32(DI)(BX*1)
-	MOVOU	X0, -16(DI)(BX*1)
-	RET
-_65through128:
-	MOVOU	X0, (DI)
-	MOVOU	X0, 16(DI)
-	MOVOU	X0, 32(DI)
-	MOVOU	X0, 48(DI)
-	MOVOU	X0, -64(DI)(BX*1)
-	MOVOU	X0, -48(DI)(BX*1)
-	MOVOU	X0, -32(DI)(BX*1)
-	MOVOU	X0, -16(DI)(BX*1)
-	RET
-_129through256:
-	MOVOU	X0, (DI)
-	MOVOU	X0, 16(DI)
-	MOVOU	X0, 32(DI)
-	MOVOU	X0, 48(DI)
-	MOVOU	X0, 64(DI)
-	MOVOU	X0, 80(DI)
-	MOVOU	X0, 96(DI)
-	MOVOU	X0, 112(DI)
-	MOVOU	X0, -128(DI)(BX*1)
-	MOVOU	X0, -112(DI)(BX*1)
-	MOVOU	X0, -96(DI)(BX*1)
-	MOVOU	X0, -80(DI)(BX*1)
-	MOVOU	X0, -64(DI)(BX*1)
-	MOVOU	X0, -48(DI)(BX*1)
-	MOVOU	X0, -32(DI)(BX*1)
-	MOVOU	X0, -16(DI)(BX*1)
-	RET
diff --git a/pkg/sentry/platform/safecopy/memclr_arm64.s b/pkg/sentry/platform/safecopy/memclr_arm64.s
deleted file mode 100644
index 7361b9067..000000000
--- a/pkg/sentry/platform/safecopy/memclr_arm64.s
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// handleMemclrFault returns (the value stored in R0, the value stored in R1).
-// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS,
-// with the faulting address stored in R0 and the signal number stored in R1.
-//
-// It must have the same frame configuration as memclr so that it can undo any
-// potential call frame set up by the assembler.
-TEXT handleMemclrFault(SB), NOSPLIT, $0-28
-	MOVD R0, addr+16(FP)
-	MOVW R1, sig+24(FP)
-	RET
-
-// See the corresponding doc in safecopy_unsafe.go
-//
-// The code is derived from runtime.memclrNoHeapPointers.
-//
-// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
-TEXT ·memclr(SB), NOSPLIT, $0-28
-	// Store 0 as the returned signal number. If we run to completion,
-	// this is the value the caller will see; if a signal is received,
-	// handleMemclrFault will store a different value in this address.
-	MOVW $0, sig+24(FP)
-	MOVD ptr+0(FP), R0
-	MOVD n+8(FP), R1
-
-	// If size is less than 16 bytes, use tail_zero to zero what remains
-	CMP $16, R1
-	BLT tail_zero
-	// Get buffer offset into 16 byte aligned address for better performance
-	ANDS $15, R0, ZR
-	BNE unaligned_to_16
-aligned_to_16:
-	LSR $4, R1, R2
-zero_by_16:
-	STP.P (ZR, ZR), 16(R0) // Store pair with post index.
-	SUBS $1, R2, R2
-	BNE zero_by_16
-	ANDS $15, R1, R1
-	BEQ end
-
-	// Zero buffer with size=R1 < 16
-tail_zero:
-	TBZ $3, R1, tail_zero_4
-	MOVD.P ZR, 8(R0)
-tail_zero_4:
-	TBZ $2, R1, tail_zero_2
-	MOVW.P ZR, 4(R0)
-tail_zero_2:
-	TBZ $1, R1, tail_zero_1
-	MOVH.P ZR, 2(R0)
-tail_zero_1:
-	TBZ $0, R1, end
-	MOVB ZR, (R0)
-end:
-	RET
-
-unaligned_to_16:
-	MOVD R0, R2
-head_loop:
-	MOVBU.P ZR, 1(R0)
-	ANDS $15, R0, ZR
-	BNE head_loop
-	// Adjust length for what remains
-	SUB R2, R0, R3
-	SUB R3, R1
-	// If size is less than 16 bytes, use tail_zero to zero what remains
-	CMP $16, R1
-	BLT tail_zero
-	B aligned_to_16
diff --git a/pkg/sentry/platform/safecopy/memcpy_amd64.s b/pkg/sentry/platform/safecopy/memcpy_amd64.s
deleted file mode 100644
index 129691d68..000000000
--- a/pkg/sentry/platform/safecopy/memcpy_amd64.s
+++ /dev/null
@@ -1,250 +0,0 @@
-// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
-// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
-// Portions Copyright 2009 The Go Authors. All rights reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-#include "textflag.h"
-
-// handleMemcpyFault returns (the value stored in AX, the value stored in DI).
-// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS,
-// with the faulting address stored in AX and the signal number stored in DI.
-//
-// It must have the same frame configuration as memcpy so that it can undo any
-// potential call frame set up by the assembler.
-TEXT handleMemcpyFault(SB), NOSPLIT, $0-36
-	MOVQ	AX, addr+24(FP)
-	MOVL	DI, sig+32(FP)
-	RET
-
-// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received
-// during the copy, it returns the address that caused the fault and the number
-// of the signal that was received. Otherwise, it returns an unspecified address
-// and a signal number of 0.
-//
-// Data is copied in order, such that if a fault happens at address p, it is
-// safe to assume that all data before p-maxRegisterSize has already been
-// successfully copied.
-//
-// The code is derived from the forward copying part of runtime.memmove.
-//
-// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
-TEXT ·memcpy(SB), NOSPLIT, $0-36
-	// Store 0 as the returned signal number. If we run to completion,
-	// this is the value the caller will see; if a signal is received,
-	// handleMemcpyFault will store a different value in this address.
-	MOVL	$0, sig+32(FP)
-
-	MOVQ	to+0(FP), DI
-	MOVQ	from+8(FP), SI
-	MOVQ	n+16(FP), BX
-
-	// REP instructions have a high startup cost, so we handle small sizes
-	// with some straightline code. The REP MOVSQ instruction is really fast
-	// for large sizes. The cutover is approximately 2K.
-tail:
-	// move_129through256 or smaller work whether or not the source and the
-	// destination memory regions overlap because they load all data into
-	// registers before writing it back.  move_256through2048 on the other
-	// hand can be used only when the memory regions don't overlap or the copy
-	// direction is forward.
-	TESTQ	BX, BX
-	JEQ	move_0
-	CMPQ	BX, $2
-	JBE	move_1or2
-	CMPQ	BX, $4
-	JBE	move_3or4
-	CMPQ	BX, $8
-	JB	move_5through7
-	JE	move_8
-	CMPQ	BX, $16
-	JBE	move_9through16
-	CMPQ	BX, $32
-	JBE	move_17through32
-	CMPQ	BX, $64
-	JBE	move_33through64
-	CMPQ	BX, $128
-	JBE	move_65through128
-	CMPQ	BX, $256
-	JBE	move_129through256
-	// TODO: use branch table and BSR to make this just a single dispatch
-
-/*
- * forward copy loop
- */
-	CMPQ	BX, $2048
-	JLS	move_256through2048
-
-	// Check alignment
-	MOVL	SI, AX
-	ORL	DI, AX
-	TESTL	$7, AX
-	JEQ	fwdBy8
-
-	// Do 1 byte at a time
-	MOVQ	BX, CX
-	REP;	MOVSB
-	RET
-
-fwdBy8:
-	// Do 8 bytes at a time
-	MOVQ	BX, CX
-	SHRQ	$3, CX
-	ANDQ	$7, BX
-	REP;	MOVSQ
-	JMP	tail
-
-move_1or2:
-	MOVB	(SI), AX
-	MOVB	AX, (DI)
-	MOVB	-1(SI)(BX*1), CX
-	MOVB	CX, -1(DI)(BX*1)
-	RET
-move_0:
-	RET
-move_3or4:
-	MOVW	(SI), AX
-	MOVW	AX, (DI)
-	MOVW	-2(SI)(BX*1), CX
-	MOVW	CX, -2(DI)(BX*1)
-	RET
-move_5through7:
-	MOVL	(SI), AX
-	MOVL	AX, (DI)
-	MOVL	-4(SI)(BX*1), CX
-	MOVL	CX, -4(DI)(BX*1)
-	RET
-move_8:
-	// We need a separate case for 8 to make sure we write pointers atomically.
-	MOVQ	(SI), AX
-	MOVQ	AX, (DI)
-	RET
-move_9through16:
-	MOVQ	(SI), AX
-	MOVQ	AX, (DI)
-	MOVQ	-8(SI)(BX*1), CX
-	MOVQ	CX, -8(DI)(BX*1)
-	RET
-move_17through32:
-	MOVOU	(SI), X0
-	MOVOU	X0, (DI)
-	MOVOU	-16(SI)(BX*1), X1
-	MOVOU	X1, -16(DI)(BX*1)
-	RET
-move_33through64:
-	MOVOU	(SI), X0
-	MOVOU	X0, (DI)
-	MOVOU	16(SI), X1
-	MOVOU	X1, 16(DI)
-	MOVOU	-32(SI)(BX*1), X2
-	MOVOU	X2, -32(DI)(BX*1)
-	MOVOU	-16(SI)(BX*1), X3
-	MOVOU	X3, -16(DI)(BX*1)
-	RET
-move_65through128:
-	MOVOU	(SI), X0
-	MOVOU	X0, (DI)
-	MOVOU	16(SI), X1
-	MOVOU	X1, 16(DI)
-	MOVOU	32(SI), X2
-	MOVOU	X2, 32(DI)
-	MOVOU	48(SI), X3
-	MOVOU	X3, 48(DI)
-	MOVOU	-64(SI)(BX*1), X4
-	MOVOU	X4, -64(DI)(BX*1)
-	MOVOU	-48(SI)(BX*1), X5
-	MOVOU	X5, -48(DI)(BX*1)
-	MOVOU	-32(SI)(BX*1), X6
-	MOVOU	X6, -32(DI)(BX*1)
-	MOVOU	-16(SI)(BX*1), X7
-	MOVOU	X7, -16(DI)(BX*1)
-	RET
-move_129through256:
-	MOVOU	(SI), X0
-	MOVOU	X0, (DI)
-	MOVOU	16(SI), X1
-	MOVOU	X1, 16(DI)
-	MOVOU	32(SI), X2
-	MOVOU	X2, 32(DI)
-	MOVOU	48(SI), X3
-	MOVOU	X3, 48(DI)
-	MOVOU	64(SI), X4
-	MOVOU	X4, 64(DI)
-	MOVOU	80(SI), X5
-	MOVOU	X5, 80(DI)
-	MOVOU	96(SI), X6
-	MOVOU	X6, 96(DI)
-	MOVOU	112(SI), X7
-	MOVOU	X7, 112(DI)
-	MOVOU	-128(SI)(BX*1), X8
-	MOVOU	X8, -128(DI)(BX*1)
-	MOVOU	-112(SI)(BX*1), X9
-	MOVOU	X9, -112(DI)(BX*1)
-	MOVOU	-96(SI)(BX*1), X10
-	MOVOU	X10, -96(DI)(BX*1)
-	MOVOU	-80(SI)(BX*1), X11
-	MOVOU	X11, -80(DI)(BX*1)
-	MOVOU	-64(SI)(BX*1), X12
-	MOVOU	X12, -64(DI)(BX*1)
-	MOVOU	-48(SI)(BX*1), X13
-	MOVOU	X13, -48(DI)(BX*1)
-	MOVOU	-32(SI)(BX*1), X14
-	MOVOU	X14, -32(DI)(BX*1)
-	MOVOU	-16(SI)(BX*1), X15
-	MOVOU	X15, -16(DI)(BX*1)
-	RET
-move_256through2048:
-	SUBQ	$256, BX
-	MOVOU	(SI), X0
-	MOVOU	X0, (DI)
-	MOVOU	16(SI), X1
-	MOVOU	X1, 16(DI)
-	MOVOU	32(SI), X2
-	MOVOU	X2, 32(DI)
-	MOVOU	48(SI), X3
-	MOVOU	X3, 48(DI)
-	MOVOU	64(SI), X4
-	MOVOU	X4, 64(DI)
-	MOVOU	80(SI), X5
-	MOVOU	X5, 80(DI)
-	MOVOU	96(SI), X6
-	MOVOU	X6, 96(DI)
-	MOVOU	112(SI), X7
-	MOVOU	X7, 112(DI)
-	MOVOU	128(SI), X8
-	MOVOU	X8, 128(DI)
-	MOVOU	144(SI), X9
-	MOVOU	X9, 144(DI)
-	MOVOU	160(SI), X10
-	MOVOU	X10, 160(DI)
-	MOVOU	176(SI), X11
-	MOVOU	X11, 176(DI)
-	MOVOU	192(SI), X12
-	MOVOU	X12, 192(DI)
-	MOVOU	208(SI), X13
-	MOVOU	X13, 208(DI)
-	MOVOU	224(SI), X14
-	MOVOU	X14, 224(DI)
-	MOVOU	240(SI), X15
-	MOVOU	X15, 240(DI)
-	CMPQ	BX, $256
-	LEAQ	256(SI), SI
-	LEAQ	256(DI), DI
-	JGE	move_256through2048
-	JMP	tail
diff --git a/pkg/sentry/platform/safecopy/memcpy_arm64.s b/pkg/sentry/platform/safecopy/memcpy_arm64.s
deleted file mode 100644
index e7e541565..000000000
--- a/pkg/sentry/platform/safecopy/memcpy_arm64.s
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// handleMemcpyFault returns (the value stored in R0, the value stored in R1).
-// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS,
-// with the faulting address stored in R0 and the signal number stored in R1.
-//
-// It must have the same frame configuration as memcpy so that it can undo any
-// potential call frame set up by the assembler.
-TEXT handleMemcpyFault(SB), NOSPLIT, $0-36
-	MOVD R0, addr+24(FP)
-	MOVW R1, sig+32(FP)
-	RET
-
-// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received
-// during the copy, it returns the address that caused the fault and the number
-// of the signal that was received. Otherwise, it returns an unspecified address
-// and a signal number of 0.
-//
-// Data is copied in order, such that if a fault happens at address p, it is
-// safe to assume that all data before p-maxRegisterSize has already been
-// successfully copied.
-//
-// The code is derived from the Go source runtime.memmove.
-//
-// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
-TEXT ·memcpy(SB), NOSPLIT, $-8-36
-	// Store 0 as the returned signal number. If we run to completion,
-	// this is the value the caller will see; if a signal is received,
-	// handleMemcpyFault will store a different value in this address.
-	MOVW $0, sig+32(FP)
-
-	MOVD to+0(FP), R3
-	MOVD from+8(FP), R4
-	MOVD n+16(FP), R5
-	CMP $0, R5
-	BNE check
-	RET
-
-check:
-	AND $~7, R5, R7     // R7 is N&~7.
-	SUB R7, R5, R6      // R6 is N&7.
-
-	// Copying forward proceeds by copying R7/8 words then copying R6 bytes.
-	// R3 and R4 are advanced as we copy.
-
-	// (There may be implementations of armv8 where copying by bytes until
-	// at least one of source or dest is word aligned is a worthwhile
-	// optimization, but the on the one tested so far (xgene) it did not
-	// make a significance difference.)
-
-	CMP $0, R7          // Do we need to do any word-by-word copying?
-	BEQ noforwardlarge
-	ADD R3, R7, R9      // R9 points just past where we copy by word.
-
-forwardlargeloop:
-	MOVD.P 8(R4), R8       // R8 is just a scratch register.
-	MOVD.P R8, 8(R3)
-	CMP R3, R9
-	BNE forwardlargeloop
-
-noforwardlarge:
-	CMP $0, R6          // Do we need to do any byte-by-byte copying?
-	BNE forwardtail
-	RET
-
-forwardtail:
-	ADD R3, R6, R9      // R9 points just past the destination memory.
-
-forwardtailloop:
-	MOVBU.P 1(R4), R8
-	MOVBU.P R8, 1(R3)
-	CMP R3, R9
-	BNE forwardtailloop
-	RET
diff --git a/pkg/sentry/platform/safecopy/safecopy.go b/pkg/sentry/platform/safecopy/safecopy.go
deleted file mode 100644
index 2fb7e5809..000000000
--- a/pkg/sentry/platform/safecopy/safecopy.go
+++ /dev/null
@@ -1,144 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package safecopy provides an efficient implementation of functions to access
-// memory that may result in SIGSEGV or SIGBUS being sent to the accessor.
-package safecopy
-
-import (
-	"fmt"
-	"reflect"
-	"runtime"
-	"syscall"
-
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// SegvError is returned when a safecopy function receives SIGSEGV.
-type SegvError struct {
-	// Addr is the address at which the SIGSEGV occurred.
-	Addr uintptr
-}
-
-// Error implements error.Error.
-func (e SegvError) Error() string {
-	return fmt.Sprintf("SIGSEGV at %#x", e.Addr)
-}
-
-// BusError is returned when a safecopy function receives SIGBUS.
-type BusError struct {
-	// Addr is the address at which the SIGBUS occurred.
-	Addr uintptr
-}
-
-// Error implements error.Error.
-func (e BusError) Error() string {
-	return fmt.Sprintf("SIGBUS at %#x", e.Addr)
-}
-
-// AlignmentError is returned when a safecopy function is passed an address
-// that does not meet alignment requirements.
-type AlignmentError struct {
-	// Addr is the invalid address.
-	Addr uintptr
-
-	// Alignment is the required alignment.
-	Alignment uintptr
-}
-
-// Error implements error.Error.
-func (e AlignmentError) Error() string {
-	return fmt.Sprintf("address %#x is not aligned to a %d-byte boundary", e.Addr, e.Alignment)
-}
-
-var (
-	// The begin and end addresses below are for the functions that are
-	// checked by the signal handler.
-	memcpyBegin               uintptr
-	memcpyEnd                 uintptr
-	memclrBegin               uintptr
-	memclrEnd                 uintptr
-	swapUint32Begin           uintptr
-	swapUint32End             uintptr
-	swapUint64Begin           uintptr
-	swapUint64End             uintptr
-	compareAndSwapUint32Begin uintptr
-	compareAndSwapUint32End   uintptr
-	loadUint32Begin           uintptr
-	loadUint32End             uintptr
-
-	// savedSigSegVHandler is a pointer to the SIGSEGV handler that was
-	// configured before we replaced it with our own. We still call into it
-	// when we get a SIGSEGV that is not interesting to us.
-	savedSigSegVHandler uintptr
-
-	// same a above, but for SIGBUS signals.
-	savedSigBusHandler uintptr
-)
-
-// signalHandler is our replacement signal handler for SIGSEGV and SIGBUS
-// signals.
-func signalHandler()
-
-// FindEndAddress returns the end address (one byte beyond the last) of the
-// function that contains the specified address (begin).
-func FindEndAddress(begin uintptr) uintptr {
-	f := runtime.FuncForPC(begin)
-	if f != nil {
-		for p := begin; ; p++ {
-			g := runtime.FuncForPC(p)
-			if f != g {
-				return p
-			}
-		}
-	}
-	return begin
-}
-
-// initializeAddresses initializes the addresses used by the signal handler.
-func initializeAddresses() {
-	// The following functions are written in assembly language, so they won't
-	// be inlined by the existing compiler/linker. Tests will fail if this
-	// assumption is violated.
-	memcpyBegin = reflect.ValueOf(memcpy).Pointer()
-	memcpyEnd = FindEndAddress(memcpyBegin)
-	memclrBegin = reflect.ValueOf(memclr).Pointer()
-	memclrEnd = FindEndAddress(memclrBegin)
-	swapUint32Begin = reflect.ValueOf(swapUint32).Pointer()
-	swapUint32End = FindEndAddress(swapUint32Begin)
-	swapUint64Begin = reflect.ValueOf(swapUint64).Pointer()
-	swapUint64End = FindEndAddress(swapUint64Begin)
-	compareAndSwapUint32Begin = reflect.ValueOf(compareAndSwapUint32).Pointer()
-	compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin)
-	loadUint32Begin = reflect.ValueOf(loadUint32).Pointer()
-	loadUint32End = FindEndAddress(loadUint32Begin)
-}
-
-func init() {
-	initializeAddresses()
-	if err := ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(signalHandler).Pointer(), &savedSigSegVHandler); err != nil {
-		panic(fmt.Sprintf("Unable to set handler for SIGSEGV: %v", err))
-	}
-	if err := ReplaceSignalHandler(syscall.SIGBUS, reflect.ValueOf(signalHandler).Pointer(), &savedSigBusHandler); err != nil {
-		panic(fmt.Sprintf("Unable to set handler for SIGBUS: %v", err))
-	}
-	syserror.AddErrorUnwrapper(func(e error) (syscall.Errno, bool) {
-		switch e.(type) {
-		case SegvError, BusError, AlignmentError:
-			return syscall.EFAULT, true
-		default:
-			return 0, false
-		}
-	})
-}
diff --git a/pkg/sentry/platform/safecopy/safecopy_test.go b/pkg/sentry/platform/safecopy/safecopy_test.go
deleted file mode 100644
index 5818f7f9b..000000000
--- a/pkg/sentry/platform/safecopy/safecopy_test.go
+++ /dev/null
@@ -1,617 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package safecopy
-
-import (
-	"bytes"
-	"fmt"
-	"io/ioutil"
-	"math/rand"
-	"os"
-	"runtime/debug"
-	"syscall"
-	"testing"
-	"unsafe"
-)
-
-// Size of a page in bytes. Cloned from usermem.PageSize to avoid a circular
-// dependency.
-const pageSize = 4096
-
-func initRandom(b []byte) {
-	for i := range b {
-		b[i] = byte(rand.Intn(256))
-	}
-}
-
-func randBuf(size int) []byte {
-	b := make([]byte, size)
-	initRandom(b)
-	return b
-}
-
-func TestCopyInSuccess(t *testing.T) {
-	// Test that CopyIn does not return an error when all pages are accessible.
-	const bufLen = 8192
-	a := randBuf(bufLen)
-	b := make([]byte, bufLen)
-
-	n, err := CopyIn(b, unsafe.Pointer(&a[0]))
-	if n != bufLen {
-		t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen)
-	}
-	if err != nil {
-		t.Errorf("Unexpected error: %v", err)
-	}
-	if !bytes.Equal(a, b) {
-		t.Errorf("Buffers are not equal when they should be: %v %v", a, b)
-	}
-}
-
-func TestCopyOutSuccess(t *testing.T) {
-	// Test that CopyOut does not return an error when all pages are
-	// accessible.
-	const bufLen = 8192
-	a := randBuf(bufLen)
-	b := make([]byte, bufLen)
-
-	n, err := CopyOut(unsafe.Pointer(&b[0]), a)
-	if n != bufLen {
-		t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen)
-	}
-	if err != nil {
-		t.Errorf("Unexpected error: %v", err)
-	}
-	if !bytes.Equal(a, b) {
-		t.Errorf("Buffers are not equal when they should be: %v %v", a, b)
-	}
-}
-
-func TestCopySuccess(t *testing.T) {
-	// Test that Copy does not return an error when all pages are accessible.
-	const bufLen = 8192
-	a := randBuf(bufLen)
-	b := make([]byte, bufLen)
-
-	n, err := Copy(unsafe.Pointer(&b[0]), unsafe.Pointer(&a[0]), bufLen)
-	if n != bufLen {
-		t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen)
-	}
-	if err != nil {
-		t.Errorf("Unexpected error: %v", err)
-	}
-	if !bytes.Equal(a, b) {
-		t.Errorf("Buffers are not equal when they should be: %v %v", a, b)
-	}
-}
-
-func TestZeroOutSuccess(t *testing.T) {
-	// Test that ZeroOut does not return an error when all pages are
-	// accessible.
-	const bufLen = 8192
-	a := make([]byte, bufLen)
-	b := randBuf(bufLen)
-
-	n, err := ZeroOut(unsafe.Pointer(&b[0]), bufLen)
-	if n != bufLen {
-		t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen)
-	}
-	if err != nil {
-		t.Errorf("Unexpected error: %v", err)
-	}
-	if !bytes.Equal(a, b) {
-		t.Errorf("Buffers are not equal when they should be: %v %v", a, b)
-	}
-}
-
-func TestSwapUint32Success(t *testing.T) {
-	// Test that SwapUint32 does not return an error when the page is
-	// accessible.
-	before := uint32(rand.Int31())
-	after := uint32(rand.Int31())
-	val := before
-
-	old, err := SwapUint32(unsafe.Pointer(&val), after)
-	if err != nil {
-		t.Errorf("Unexpected error: %v", err)
-	}
-	if old != before {
-		t.Errorf("Unexpected old value: got %v, want %v", old, before)
-	}
-	if val != after {
-		t.Errorf("Unexpected new value: got %v, want %v", val, after)
-	}
-}
-
-func TestSwapUint32AlignmentError(t *testing.T) {
-	// Test that SwapUint32 returns an AlignmentError when passed an unaligned
-	// address.
-	data := new(struct{ val uint64 })
-	addr := uintptr(unsafe.Pointer(&data.val)) + 1
-	want := AlignmentError{Addr: addr, Alignment: 4}
-	if _, err := SwapUint32(unsafe.Pointer(addr), 1); err != want {
-		t.Errorf("Unexpected error: got %v, want %v", err, want)
-	}
-}
-
-func TestSwapUint64Success(t *testing.T) {
-	// Test that SwapUint64 does not return an error when the page is
-	// accessible.
-	before := uint64(rand.Int63())
-	after := uint64(rand.Int63())
-	// "The first word in ... an allocated struct or slice can be relied upon
-	// to be 64-bit aligned." - sync/atomic docs
-	data := new(struct{ val uint64 })
-	data.val = before
-
-	old, err := SwapUint64(unsafe.Pointer(&data.val), after)
-	if err != nil {
-		t.Errorf("Unexpected error: %v", err)
-	}
-	if old != before {
-		t.Errorf("Unexpected old value: got %v, want %v", old, before)
-	}
-	if data.val != after {
-		t.Errorf("Unexpected new value: got %v, want %v", data.val, after)
-	}
-}
-
-func TestSwapUint64AlignmentError(t *testing.T) {
-	// Test that SwapUint64 returns an AlignmentError when passed an unaligned
-	// address.
-	data := new(struct{ val1, val2 uint64 })
-	addr := uintptr(unsafe.Pointer(&data.val1)) + 1
-	want := AlignmentError{Addr: addr, Alignment: 8}
-	if _, err := SwapUint64(unsafe.Pointer(addr), 1); err != want {
-		t.Errorf("Unexpected error: got %v, want %v", err, want)
-	}
-}
-
-func TestCompareAndSwapUint32Success(t *testing.T) {
-	// Test that CompareAndSwapUint32 does not return an error when the page is
-	// accessible.
-	before := uint32(rand.Int31())
-	after := uint32(rand.Int31())
-	val := before
-
-	old, err := CompareAndSwapUint32(unsafe.Pointer(&val), before, after)
-	if err != nil {
-		t.Errorf("Unexpected error: %v", err)
-	}
-	if old != before {
-		t.Errorf("Unexpected old value: got %v, want %v", old, before)
-	}
-	if val != after {
-		t.Errorf("Unexpected new value: got %v, want %v", val, after)
-	}
-}
-
-func TestCompareAndSwapUint32AlignmentError(t *testing.T) {
-	// Test that CompareAndSwapUint32 returns an AlignmentError when passed an
-	// unaligned address.
-	data := new(struct{ val uint64 })
-	addr := uintptr(unsafe.Pointer(&data.val)) + 1
-	want := AlignmentError{Addr: addr, Alignment: 4}
-	if _, err := CompareAndSwapUint32(unsafe.Pointer(addr), 0, 1); err != want {
-		t.Errorf("Unexpected error: got %v, want %v", err, want)
-	}
-}
-
-// withSegvErrorTestMapping calls fn with a two-page mapping. The first page
-// contains random data, and the second page generates SIGSEGV when accessed.
-func withSegvErrorTestMapping(t *testing.T, fn func(m []byte)) {
-	mapping, err := syscall.Mmap(-1, 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE)
-	if err != nil {
-		t.Fatalf("Mmap failed: %v", err)
-	}
-	defer syscall.Munmap(mapping)
-	if err := syscall.Mprotect(mapping[pageSize:], syscall.PROT_NONE); err != nil {
-		t.Fatalf("Mprotect failed: %v", err)
-	}
-	initRandom(mapping[:pageSize])
-
-	fn(mapping)
-}
-
-// withBusErrorTestMapping calls fn with a two-page mapping. The first page
-// contains random data, and the second page generates SIGBUS when accessed.
-func withBusErrorTestMapping(t *testing.T, fn func(m []byte)) {
-	f, err := ioutil.TempFile("", "sigbus_test")
-	if err != nil {
-		t.Fatalf("TempFile failed: %v", err)
-	}
-	defer f.Close()
-	if err := f.Truncate(pageSize); err != nil {
-		t.Fatalf("Truncate failed: %v", err)
-	}
-	mapping, err := syscall.Mmap(int(f.Fd()), 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
-	if err != nil {
-		t.Fatalf("Mmap failed: %v", err)
-	}
-	defer syscall.Munmap(mapping)
-	initRandom(mapping[:pageSize])
-
-	fn(mapping)
-}
-
-func TestCopyInSegvError(t *testing.T) {
-	// Test that CopyIn returns a SegvError when reaching a page that signals
-	// SIGSEGV.
-	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
-		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
-			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
-				dst := randBuf(pageSize)
-				n, err := CopyIn(dst, src)
-				if n != bytesBeforeFault {
-					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
-				}
-				if want := (SegvError{secondPage}); err != want {
-					t.Errorf("Unexpected error: got %v, want %v", err, want)
-				}
-				if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) {
-					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
-				}
-			})
-		})
-	}
-}
-
-func TestCopyInBusError(t *testing.T) {
-	// Test that CopyIn returns a BusError when reaching a page that signals
-	// SIGBUS.
-	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
-		t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
-			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
-				dst := randBuf(pageSize)
-				n, err := CopyIn(dst, src)
-				if n != bytesBeforeFault {
-					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
-				}
-				if want := (BusError{secondPage}); err != want {
-					t.Errorf("Unexpected error: got %v, want %v", err, want)
-				}
-				if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) {
-					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
-				}
-			})
-		})
-	}
-}
-
-func TestCopyOutSegvError(t *testing.T) {
-	// Test that CopyOut returns a SegvError when reaching a page that signals
-	// SIGSEGV.
-	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
-		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
-			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
-				src := randBuf(pageSize)
-				n, err := CopyOut(dst, src)
-				if n != bytesBeforeFault {
-					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
-				}
-				if want := (SegvError{secondPage}); err != want {
-					t.Errorf("Unexpected error: got %v, want %v", err, want)
-				}
-				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) {
-					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
-				}
-			})
-		})
-	}
-}
-
-func TestCopyOutBusError(t *testing.T) {
-	// Test that CopyOut returns a BusError when reaching a page that signals
-	// SIGBUS.
-	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
-		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
-			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
-				src := randBuf(pageSize)
-				n, err := CopyOut(dst, src)
-				if n != bytesBeforeFault {
-					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
-				}
-				if want := (BusError{secondPage}); err != want {
-					t.Errorf("Unexpected error: got %v, want %v", err, want)
-				}
-				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) {
-					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
-				}
-			})
-		})
-	}
-}
-
-func TestCopySourceSegvError(t *testing.T) {
-	// Test that Copy returns a SegvError when copying from a page that signals
-	// SIGSEGV.
-	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
-		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
-			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
-				dst := randBuf(pageSize)
-				n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize)
-				if n != uintptr(bytesBeforeFault) {
-					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
-				}
-				if want := (SegvError{secondPage}); err != want {
-					t.Errorf("Unexpected error: got %v, want %v", err, want)
-				}
-				if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) {
-					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
-				}
-			})
-		})
-	}
-}
-
-func TestCopySourceBusError(t *testing.T) {
-	// Test that Copy returns a BusError when copying from a page that signals
-	// SIGBUS.
-	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
-		t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
-			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
-				dst := randBuf(pageSize)
-				n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize)
-				if n != uintptr(bytesBeforeFault) {
-					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
-				}
-				if want := (BusError{secondPage}); err != want {
-					t.Errorf("Unexpected error: got %v, want %v", err, want)
-				}
-				if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) {
-					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
-				}
-			})
-		})
-	}
-}
-
-func TestCopyDestinationSegvError(t *testing.T) {
-	// Test that Copy returns a SegvError when copying to a page that signals
-	// SIGSEGV.
-	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
-		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
-			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
-				src := randBuf(pageSize)
-				n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize)
-				if n != uintptr(bytesBeforeFault) {
-					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
-				}
-				if want := (SegvError{secondPage}); err != want {
-					t.Errorf("Unexpected error: got %v, want %v", err, want)
-				}
-				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) {
-					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
-				}
-			})
-		})
-	}
-}
-
-func TestCopyDestinationBusError(t *testing.T) {
-	// Test that Copy returns a BusError when copying to a page that signals
-	// SIGBUS.
-	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
-		t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
-			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
-				src := randBuf(pageSize)
-				n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize)
-				if n != uintptr(bytesBeforeFault) {
-					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
-				}
-				if want := (BusError{secondPage}); err != want {
-					t.Errorf("Unexpected error: got %v, want %v", err, want)
-				}
-				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) {
-					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
-				}
-			})
-		})
-	}
-}
-
-func TestZeroOutSegvError(t *testing.T) {
-	// Test that ZeroOut returns a SegvError when reaching a page that signals
-	// SIGSEGV.
-	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
-		t.Run(fmt.Sprintf("starting write %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
-			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
-				n, err := ZeroOut(dst, pageSize)
-				if n != uintptr(bytesBeforeFault) {
-					t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault)
-				}
-				if want := (SegvError{secondPage}); err != want {
-					t.Errorf("Unexpected error: got %v, want %v", err, want)
-				}
-				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) {
-					t.Errorf("Non-zero bytes in written part of mapping: %v", got)
-				}
-			})
-		})
-	}
-}
-
-func TestZeroOutBusError(t *testing.T) {
-	// Test that ZeroOut returns a BusError when reaching a page that signals
-	// SIGBUS.
-	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
-		t.Run(fmt.Sprintf("starting write %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
-			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
-				n, err := ZeroOut(dst, pageSize)
-				if n != uintptr(bytesBeforeFault) {
-					t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault)
-				}
-				if want := (BusError{secondPage}); err != want {
-					t.Errorf("Unexpected error: got %v, want %v", err, want)
-				}
-				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) {
-					t.Errorf("Non-zero bytes in written part of mapping: %v", got)
-				}
-			})
-		})
-	}
-}
-
-func TestSwapUint32SegvError(t *testing.T) {
-	// Test that SwapUint32 returns a SegvError when reaching a page that
-	// signals SIGSEGV.
-	withSegvErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-		_, err := SwapUint32(unsafe.Pointer(secondPage), 1)
-		if want := (SegvError{secondPage}); err != want {
-			t.Errorf("Unexpected error: got %v, want %v", err, want)
-		}
-	})
-}
-
-func TestSwapUint32BusError(t *testing.T) {
-	// Test that SwapUint32 returns a BusError when reaching a page that
-	// signals SIGBUS.
-	withBusErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-		_, err := SwapUint32(unsafe.Pointer(secondPage), 1)
-		if want := (BusError{secondPage}); err != want {
-			t.Errorf("Unexpected error: got %v, want %v", err, want)
-		}
-	})
-}
-
-func TestSwapUint64SegvError(t *testing.T) {
-	// Test that SwapUint64 returns a SegvError when reaching a page that
-	// signals SIGSEGV.
-	withSegvErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-		_, err := SwapUint64(unsafe.Pointer(secondPage), 1)
-		if want := (SegvError{secondPage}); err != want {
-			t.Errorf("Unexpected error: got %v, want %v", err, want)
-		}
-	})
-}
-
-func TestSwapUint64BusError(t *testing.T) {
-	// Test that SwapUint64 returns a BusError when reaching a page that
-	// signals SIGBUS.
-	withBusErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-		_, err := SwapUint64(unsafe.Pointer(secondPage), 1)
-		if want := (BusError{secondPage}); err != want {
-			t.Errorf("Unexpected error: got %v, want %v", err, want)
-		}
-	})
-}
-
-func TestCompareAndSwapUint32SegvError(t *testing.T) {
-	// Test that CompareAndSwapUint32 returns a SegvError when reaching a page
-	// that signals SIGSEGV.
-	withSegvErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-		_, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1)
-		if want := (SegvError{secondPage}); err != want {
-			t.Errorf("Unexpected error: got %v, want %v", err, want)
-		}
-	})
-}
-
-func TestCompareAndSwapUint32BusError(t *testing.T) {
-	// Test that CompareAndSwapUint32 returns a BusError when reaching a page
-	// that signals SIGBUS.
-	withBusErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-		_, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1)
-		if want := (BusError{secondPage}); err != want {
-			t.Errorf("Unexpected error: got %v, want %v", err, want)
-		}
-	})
-}
-
-func testCopy(dst, src []byte) (panicked bool) {
-	defer func() {
-		if r := recover(); r != nil {
-			panicked = true
-		}
-	}()
-	debug.SetPanicOnFault(true)
-	copy(dst, src)
-	return
-}
-
-func TestSegVOnMemmove(t *testing.T) {
-	// Test that SIGSEGVs received by runtime.memmove when *not* doing
-	// CopyIn or CopyOut work gets propagated to the runtime.
-	const bufLen = pageSize
-	a, err := syscall.Mmap(-1, 0, bufLen, syscall.PROT_NONE, syscall.MAP_ANON|syscall.MAP_PRIVATE)
-	if err != nil {
-		t.Fatalf("Mmap failed: %v", err)
-
-	}
-	defer syscall.Munmap(a)
-	b := randBuf(bufLen)
-
-	if !testCopy(b, a) {
-		t.Fatalf("testCopy didn't panic when it should have")
-	}
-
-	if !testCopy(a, b) {
-		t.Fatalf("testCopy didn't panic when it should have")
-	}
-}
-
-func TestSigbusOnMemmove(t *testing.T) {
-	// Test that SIGBUS received by runtime.memmove when *not* doing
-	// CopyIn or CopyOut work gets propagated to the runtime.
-	const bufLen = pageSize
-	f, err := ioutil.TempFile("", "sigbus_test")
-	if err != nil {
-		t.Fatalf("TempFile failed: %v", err)
-	}
-	os.Remove(f.Name())
-	defer f.Close()
-
-	a, err := syscall.Mmap(int(f.Fd()), 0, bufLen, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
-	if err != nil {
-		t.Fatalf("Mmap failed: %v", err)
-
-	}
-	defer syscall.Munmap(a)
-	b := randBuf(bufLen)
-
-	if !testCopy(b, a) {
-		t.Fatalf("testCopy didn't panic when it should have")
-	}
-
-	if !testCopy(a, b) {
-		t.Fatalf("testCopy didn't panic when it should have")
-	}
-}
diff --git a/pkg/sentry/platform/safecopy/safecopy_unsafe.go b/pkg/sentry/platform/safecopy/safecopy_unsafe.go
deleted file mode 100644
index eef028e68..000000000
--- a/pkg/sentry/platform/safecopy/safecopy_unsafe.go
+++ /dev/null
@@ -1,335 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package safecopy
-
-import (
-	"fmt"
-	"syscall"
-	"unsafe"
-)
-
-// maxRegisterSize is the maximum register size used in memcpy and memclr. It
-// is used to decide by how much to rewind the copy (for memcpy) or zeroing
-// (for memclr) before proceeding.
-const maxRegisterSize = 16
-
-// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received
-// during the copy, it returns the address that caused the fault and the number
-// of the signal that was received. Otherwise, it returns an unspecified address
-// and a signal number of 0.
-//
-// Data is copied in order, such that if a fault happens at address p, it is
-// safe to assume that all data before p-maxRegisterSize has already been
-// successfully copied.
-//
-//go:noescape
-func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
-
-// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS
-// signal is received during the write, it returns the address that caused the
-// fault and the number of the signal that was received. Otherwise, it returns
-// an unspecified address and a signal number of 0.
-//
-// Data is written in order, such that if a fault happens at address p, it is
-// safe to assume that all data before p-maxRegisterSize has already been
-// successfully written.
-//
-//go:noescape
-func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
-
-// swapUint32 atomically stores new into *ptr and returns (the previous *ptr
-// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
-// value of old is unspecified, and sig is the number of the signal that was
-// received.
-//
-// Preconditions: ptr must be aligned to a 4-byte boundary.
-//
-//go:noescape
-func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32)
-
-// swapUint64 atomically stores new into *ptr and returns (the previous *ptr
-// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
-// value of old is unspecified, and sig is the number of the signal that was
-// received.
-//
-// Preconditions: ptr must be aligned to a 8-byte boundary.
-//
-//go:noescape
-func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32)
-
-// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
-// (the value previously stored at ptr, 0). If a SIGSEGV or SIGBUS signal is
-// received during the operation, the value of prev is unspecified, and sig is
-// the number of the signal that was received.
-//
-// Preconditions: ptr must be aligned to a 4-byte boundary.
-//
-//go:noescape
-func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32)
-
-// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It
-// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr.
-//
-// Preconditions: ptr must be aligned to a 4-byte boundary.
-//
-//go:noescape
-func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
-
-// CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes
-// copied and an error if SIGSEGV or SIGBUS is received while reading from src.
-func CopyIn(dst []byte, src unsafe.Pointer) (int, error) {
-	toCopy := uintptr(len(dst))
-	if len(dst) == 0 {
-		return 0, nil
-	}
-
-	fault, sig := memcpy(unsafe.Pointer(&dst[0]), src, toCopy)
-	if sig == 0 {
-		return len(dst), nil
-	}
-
-	faultN, srcN := uintptr(fault), uintptr(src)
-	if faultN < srcN || faultN >= srcN+toCopy {
-		panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, faultN, srcN, srcN+toCopy))
-	}
-
-	// memcpy might have ended the copy up to maxRegisterSize bytes before
-	// fault, if an instruction caused a memory access that straddled two
-	// pages, and the second one faulted. Try to copy up to the fault.
-	var done int
-	if faultN-srcN > maxRegisterSize {
-		done = int(faultN - srcN - maxRegisterSize)
-	}
-	n, err := CopyIn(dst[done:int(faultN-srcN)], unsafe.Pointer(srcN+uintptr(done)))
-	done += n
-	if err != nil {
-		return done, err
-	}
-	return done, errorFromFaultSignal(fault, sig)
-}
-
-// CopyOut copies len(src) bytes from src to dst. If returns the number of
-// bytes done and an error if SIGSEGV or SIGBUS is received while writing to
-// dst.
-func CopyOut(dst unsafe.Pointer, src []byte) (int, error) {
-	toCopy := uintptr(len(src))
-	if toCopy == 0 {
-		return 0, nil
-	}
-
-	fault, sig := memcpy(dst, unsafe.Pointer(&src[0]), toCopy)
-	if sig == 0 {
-		return len(src), nil
-	}
-
-	faultN, dstN := uintptr(fault), uintptr(dst)
-	if faultN < dstN || faultN >= dstN+toCopy {
-		panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toCopy))
-	}
-
-	// memcpy might have ended the copy up to maxRegisterSize bytes before
-	// fault, if an instruction caused a memory access that straddled two
-	// pages, and the second one faulted. Try to copy up to the fault.
-	var done int
-	if faultN-dstN > maxRegisterSize {
-		done = int(faultN - dstN - maxRegisterSize)
-	}
-	n, err := CopyOut(unsafe.Pointer(dstN+uintptr(done)), src[done:int(faultN-dstN)])
-	done += n
-	if err != nil {
-		return done, err
-	}
-	return done, errorFromFaultSignal(fault, sig)
-}
-
-// Copy copies toCopy bytes from src to dst. It returns the number of bytes
-// copied and an error if SIGSEGV or SIGBUS is received while reading from src
-// or writing to dst.
-//
-// Data is copied in order; if [src, src+toCopy) and [dst, dst+toCopy) overlap,
-// the resulting contents of dst are unspecified.
-func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) {
-	if toCopy == 0 {
-		return 0, nil
-	}
-
-	fault, sig := memcpy(dst, src, toCopy)
-	if sig == 0 {
-		return toCopy, nil
-	}
-
-	// Did the fault occur while reading from src or writing to dst?
-	faultN, srcN, dstN := uintptr(fault), uintptr(src), uintptr(dst)
-	faultAfterSrc := ^uintptr(0)
-	if faultN >= srcN {
-		faultAfterSrc = faultN - srcN
-	}
-	faultAfterDst := ^uintptr(0)
-	if faultN >= dstN {
-		faultAfterDst = faultN - dstN
-	}
-	if faultAfterSrc >= toCopy && faultAfterDst >= toCopy {
-		panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, faultN, srcN, srcN+toCopy, dstN, dstN+toCopy))
-	}
-	faultedAfter := faultAfterSrc
-	if faultedAfter > faultAfterDst {
-		faultedAfter = faultAfterDst
-	}
-
-	// memcpy might have ended the copy up to maxRegisterSize bytes before
-	// fault, if an instruction caused a memory access that straddled two
-	// pages, and the second one faulted. Try to copy up to the fault.
-	var done uintptr
-	if faultedAfter > maxRegisterSize {
-		done = faultedAfter - maxRegisterSize
-	}
-	n, err := Copy(unsafe.Pointer(dstN+done), unsafe.Pointer(srcN+done), faultedAfter-done)
-	done += n
-	if err != nil {
-		return done, err
-	}
-	return done, errorFromFaultSignal(fault, sig)
-}
-
-// ZeroOut writes toZero zero bytes to dst. It returns the number of bytes
-// written and an error if SIGSEGV or SIGBUS is received while writing to dst.
-func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) {
-	if toZero == 0 {
-		return 0, nil
-	}
-
-	fault, sig := memclr(dst, toZero)
-	if sig == 0 {
-		return toZero, nil
-	}
-
-	faultN, dstN := uintptr(fault), uintptr(dst)
-	if faultN < dstN || faultN >= dstN+toZero {
-		panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toZero))
-	}
-
-	// memclr might have ended the write up to maxRegisterSize bytes before
-	// fault, if an instruction caused a memory access that straddled two
-	// pages, and the second one faulted. Try to write up to the fault.
-	var done uintptr
-	if faultN-dstN > maxRegisterSize {
-		done = faultN - dstN - maxRegisterSize
-	}
-	n, err := ZeroOut(unsafe.Pointer(dstN+done), faultN-dstN-done)
-	done += n
-	if err != nil {
-		return done, err
-	}
-	return done, errorFromFaultSignal(fault, sig)
-}
-
-// SwapUint32 is equivalent to sync/atomic.SwapUint32, except that it returns
-// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is
-// not aligned to a 4-byte boundary.
-func SwapUint32(ptr unsafe.Pointer, new uint32) (uint32, error) {
-	if addr := uintptr(ptr); addr&3 != 0 {
-		return 0, AlignmentError{addr, 4}
-	}
-	old, sig := swapUint32(ptr, new)
-	return old, errorFromFaultSignal(ptr, sig)
-}
-
-// SwapUint64 is equivalent to sync/atomic.SwapUint64, except that it returns
-// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is
-// not aligned to an 8-byte boundary.
-func SwapUint64(ptr unsafe.Pointer, new uint64) (uint64, error) {
-	if addr := uintptr(ptr); addr&7 != 0 {
-		return 0, AlignmentError{addr, 8}
-	}
-	old, sig := swapUint64(ptr, new)
-	return old, errorFromFaultSignal(ptr, sig)
-}
-
-// CompareAndSwapUint32 is equivalent to atomicbitops.CompareAndSwapUint32,
-// except that it returns an error if SIGSEGV or SIGBUS is received while
-// accessing ptr, or if ptr is not aligned to a 4-byte boundary.
-func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) {
-	if addr := uintptr(ptr); addr&3 != 0 {
-		return 0, AlignmentError{addr, 4}
-	}
-	prev, sig := compareAndSwapUint32(ptr, old, new)
-	return prev, errorFromFaultSignal(ptr, sig)
-}
-
-// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It
-// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr.
-//
-// Preconditions: ptr must be aligned to a 4-byte boundary.
-func LoadUint32(ptr unsafe.Pointer) (uint32, error) {
-	if addr := uintptr(ptr); addr&3 != 0 {
-		return 0, AlignmentError{addr, 4}
-	}
-	val, sig := loadUint32(ptr)
-	return val, errorFromFaultSignal(ptr, sig)
-}
-
-func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error {
-	switch sig {
-	case 0:
-		return nil
-	case int32(syscall.SIGSEGV):
-		return SegvError{uintptr(addr)}
-	case int32(syscall.SIGBUS):
-		return BusError{uintptr(addr)}
-	default:
-		panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr))
-	}
-}
-
-// ReplaceSignalHandler replaces the existing signal handler for the provided
-// signal with the one that handles faults in safecopy-protected functions.
-//
-// It stores the value of the previously set handler in previous.
-//
-// This function will be called on initialization in order to install safecopy
-// handlers for appropriate signals. These handlers will call the previous
-// handler however, and if this is function is being used externally then the
-// same courtesy is expected.
-func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr) error {
-	var sa struct {
-		handler  uintptr
-		flags    uint64
-		restorer uintptr
-		mask     uint64
-	}
-	const maskLen = 8
-
-	// Get the existing signal handler information, and save the current
-	// handler. Once we replace it, we will use this pointer to fall back to
-	// it when we receive other signals.
-	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), 0, uintptr(unsafe.Pointer(&sa)), maskLen, 0, 0); e != 0 {
-		return e
-	}
-
-	// Fail if there isn't a previous handler.
-	if sa.handler == 0 {
-		return fmt.Errorf("previous handler for signal %x isn't set", sig)
-	}
-
-	*previous = sa.handler
-
-	// Install our own handler.
-	sa.handler = handler
-	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 {
-		return e
-	}
-
-	return nil
-}
diff --git a/pkg/sentry/platform/safecopy/sighandler_amd64.s b/pkg/sentry/platform/safecopy/sighandler_amd64.s
deleted file mode 100644
index 475ae48e9..000000000
--- a/pkg/sentry/platform/safecopy/sighandler_amd64.s
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "textflag.h"
-
-// The signals handled by sigHandler.
-#define SIGBUS  7
-#define SIGSEGV 11
-
-// Offsets to the registers in context->uc_mcontext.gregs[].
-#define REG_RDI 0x68
-#define REG_RAX 0x90
-#define REG_IP  0xa8
-
-// Offset to the si_addr field of siginfo.
-#define SI_CODE 0x08
-#define SI_ADDR 0x10
-
-// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must
-// not be set up as a handler to any other signals.
-//
-// If the instruction causing the signal is within a safecopy-protected
-// function, the signal is handled such that execution resumes in the
-// appropriate fault handling stub with AX containing the faulting address and
-// DI containing the signal number. Otherwise control is transferred to the
-// previously configured signal handler (savedSigSegvHandler or
-// savedSigBusHandler).
-//
-// This function cannot be written in go because it runs whenever a signal is
-// received by the thread (preempting whatever was running), which includes when
-// garbage collector has stopped or isn't expecting any interactions (like
-// barriers).
-//
-// The arguments are the following:
-// DI - The signal number.
-// SI - Pointer to siginfo_t structure.
-// DX - Pointer to ucontext structure.
-TEXT ·signalHandler(SB),NOSPLIT,$0
-	// Check if the signal is from the kernel.
-	MOVQ $0x0, CX
-	CMPL CX, SI_CODE(SI)
-	JGE original_handler
-
-	// Check if RIP is within the area we care about.
-	MOVQ REG_IP(DX), CX
-	CMPQ CX, ·memcpyBegin(SB)
-	JB not_memcpy
-	CMPQ CX, ·memcpyEnd(SB)
-	JAE not_memcpy
-
-	// Modify the context such that execution will resume in the fault
-	// handler.
-	LEAQ handleMemcpyFault(SB), CX
-	JMP handle_fault
-
-not_memcpy:
-	CMPQ CX, ·memclrBegin(SB)
-	JB not_memclr
-	CMPQ CX, ·memclrEnd(SB)
-	JAE not_memclr
-
-	LEAQ handleMemclrFault(SB), CX
-	JMP handle_fault
-
-not_memclr:
-	CMPQ CX, ·swapUint32Begin(SB)
-	JB not_swapuint32
-	CMPQ CX, ·swapUint32End(SB)
-	JAE not_swapuint32
-
-	LEAQ handleSwapUint32Fault(SB), CX
-	JMP handle_fault
-
-not_swapuint32:
-	CMPQ CX, ·swapUint64Begin(SB)
-	JB not_swapuint64
-	CMPQ CX, ·swapUint64End(SB)
-	JAE not_swapuint64
-
-	LEAQ handleSwapUint64Fault(SB), CX
-	JMP handle_fault
-
-not_swapuint64:
-	CMPQ CX, ·compareAndSwapUint32Begin(SB)
-	JB not_casuint32
-	CMPQ CX, ·compareAndSwapUint32End(SB)
-	JAE not_casuint32
-
-	LEAQ handleCompareAndSwapUint32Fault(SB), CX
-	JMP handle_fault
-
-not_casuint32:
-	CMPQ CX, ·loadUint32Begin(SB)
-	JB not_loaduint32
-	CMPQ CX, ·loadUint32End(SB)
-	JAE not_loaduint32
-
-	LEAQ handleLoadUint32Fault(SB), CX
-	JMP handle_fault
-
-not_loaduint32:
-original_handler:
-	// Jump to the previous signal handler, which is likely the golang one.
-	XORQ CX, CX
-	MOVQ ·savedSigBusHandler(SB), AX
-	CMPL DI, $SIGSEGV
-	CMOVQEQ ·savedSigSegVHandler(SB), AX
-	JMP AX
-
-handle_fault:
-	// Entered with the address of the fault handler in RCX; store it in
-	// RIP.
-	MOVQ CX, REG_IP(DX)
-
-	// Store the faulting address in RAX.
-	MOVQ SI_ADDR(SI), CX
-	MOVQ CX, REG_RAX(DX)
-
-	// Store the signal number in EDI.
-	MOVL DI, REG_RDI(DX)
-
-	RET
diff --git a/pkg/sentry/platform/safecopy/sighandler_arm64.s b/pkg/sentry/platform/safecopy/sighandler_arm64.s
deleted file mode 100644
index 53e4ac2c1..000000000
--- a/pkg/sentry/platform/safecopy/sighandler_arm64.s
+++ /dev/null
@@ -1,143 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "textflag.h"
-
-// The signals handled by sigHandler.
-#define SIGBUS 7
-#define SIGSEGV 11
-
-// Offsets to the registers in context->uc_mcontext.gregs[].
-#define REG_R0 0xB8
-#define REG_R1 0xC0
-#define REG_PC 0x1B8
-
-// Offset to the si_addr field of siginfo.
-#define SI_CODE 0x08
-#define SI_ADDR 0x10
-
-// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must
-// not be set up as a handler to any other signals.
-//
-// If the instruction causing the signal is within a safecopy-protected
-// function, the signal is handled such that execution resumes in the
-// appropriate fault handling stub with R0 containing the faulting address and
-// R1 containing the signal number. Otherwise control is transferred to the
-// previously configured signal handler (savedSigSegvHandler or
-// savedSigBusHandler).
-//
-// This function cannot be written in go because it runs whenever a signal is
-// received by the thread (preempting whatever was running), which includes when
-// garbage collector has stopped or isn't expecting any interactions (like
-// barriers).
-//
-// The arguments are the following:
-// R0 - The signal number.
-// R1 - Pointer to siginfo_t structure.
-// R2 - Pointer to ucontext structure.
-TEXT ·signalHandler(SB),NOSPLIT,$0
-	// Check if the signal is from the kernel, si_code > 0 means a kernel signal.
-	MOVD SI_CODE(R1), R7
-	CMPW $0x0, R7
-	BLE original_handler
-
-	// Check if PC is within the area we care about.
-	MOVD REG_PC(R2), R7
-	MOVD ·memcpyBegin(SB), R8
-	CMP R8, R7
-	BLO not_memcpy
-	MOVD ·memcpyEnd(SB), R8
-	CMP R8, R7
-	BHS not_memcpy
-
-	// Modify the context such that execution will resume in the fault handler.
-	MOVD $handleMemcpyFault(SB), R7
-	B handle_fault
-
-not_memcpy:
-	MOVD ·memclrBegin(SB), R8
-	CMP R8, R7
-	BLO not_memclr
-	MOVD ·memclrEnd(SB), R8
-	CMP R8, R7
-	BHS not_memclr
-
-	MOVD $handleMemclrFault(SB), R7
-	B handle_fault
-
-not_memclr:
-	MOVD ·swapUint32Begin(SB), R8
-	CMP R8, R7
-	BLO not_swapuint32
-	MOVD ·swapUint32End(SB), R8
-	CMP R8, R7
-	BHS not_swapuint32
-
-	MOVD $handleSwapUint32Fault(SB), R7
-	B handle_fault
-
-not_swapuint32:
-	MOVD ·swapUint64Begin(SB), R8
-	CMP R8, R7
-	BLO not_swapuint64
-	MOVD ·swapUint64End(SB), R8
-	CMP R8, R7
-	BHS not_swapuint64
-
-	MOVD $handleSwapUint64Fault(SB), R7
-	B handle_fault
-
-not_swapuint64:
-	MOVD ·compareAndSwapUint32Begin(SB), R8
-	CMP R8, R7
-	BLO not_casuint32
-	MOVD ·compareAndSwapUint32End(SB), R8
-	CMP R8, R7
-	BHS not_casuint32
-
-	MOVD $handleCompareAndSwapUint32Fault(SB), R7
-	B handle_fault
-
-not_casuint32:
-	MOVD ·loadUint32Begin(SB), R8
-	CMP R8, R7
-	BLO not_loaduint32
-	MOVD ·loadUint32End(SB), R8
-	CMP R8, R7
-	BHS not_loaduint32
-
-	MOVD $handleLoadUint32Fault(SB), R7
-	B handle_fault
-
-not_loaduint32:
-original_handler:
-	// Jump to the previous signal handler, which is likely the golang one.
-	MOVD ·savedSigBusHandler(SB), R7
-	MOVD ·savedSigSegVHandler(SB), R8
-	CMPW $SIGSEGV, R0
-	CSEL EQ, R8, R7, R7
-	B (R7)
-
-handle_fault:
-	// Entered with the address of the fault handler in R7; store it in PC.
-	MOVD R7, REG_PC(R2)
-
-	// Store the faulting address in R0.
-	MOVD SI_ADDR(R1), R7
-	MOVD R7, REG_R0(R2)
-
-	// Store the signal number in R1.
-	MOVW R0, REG_R1(R2)
-
-	RET
diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD
deleted file mode 100644
index 3ab76da97..000000000
--- a/pkg/sentry/safemem/BUILD
+++ /dev/null
@@ -1,27 +0,0 @@
-load("//tools:defs.bzl", "go_library", "go_test")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "safemem",
-    srcs = [
-        "block_unsafe.go",
-        "io.go",
-        "safemem.go",
-        "seq_unsafe.go",
-    ],
-    visibility = ["//pkg/sentry:internal"],
-    deps = [
-        "//pkg/sentry/platform/safecopy",
-    ],
-)
-
-go_test(
-    name = "safemem_test",
-    size = "small",
-    srcs = [
-        "io_test.go",
-        "seq_test.go",
-    ],
-    library = ":safemem",
-)
diff --git a/pkg/sentry/safemem/block_unsafe.go b/pkg/sentry/safemem/block_unsafe.go
deleted file mode 100644
index 6f03c94bf..000000000
--- a/pkg/sentry/safemem/block_unsafe.go
+++ /dev/null
@@ -1,279 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package safemem
-
-import (
-	"fmt"
-	"reflect"
-	"unsafe"
-
-	"gvisor.dev/gvisor/pkg/sentry/platform/safecopy"
-)
-
-// A Block is a range of contiguous bytes, similar to []byte but with the
-// following differences:
-//
-// - The memory represented by a Block may require the use of safecopy to
-// access.
-//
-// - Block does not carry a capacity and cannot be expanded.
-//
-// Blocks are immutable and may be copied by value. The zero value of Block
-// represents an empty range, analogous to a nil []byte.
-type Block struct {
-	// [start, start+length) is the represented memory.
-	//
-	// start is an unsafe.Pointer to ensure that Block prevents the represented
-	// memory from being garbage-collected.
-	start  unsafe.Pointer
-	length int
-
-	// needSafecopy is true if accessing the represented memory requires the
-	// use of safecopy.
-	needSafecopy bool
-}
-
-// BlockFromSafeSlice returns a Block equivalent to slice, which is safe to
-// access without safecopy.
-func BlockFromSafeSlice(slice []byte) Block {
-	return blockFromSlice(slice, false)
-}
-
-// BlockFromUnsafeSlice returns a Block equivalent to bs, which is not safe to
-// access without safecopy.
-func BlockFromUnsafeSlice(slice []byte) Block {
-	return blockFromSlice(slice, true)
-}
-
-func blockFromSlice(slice []byte, needSafecopy bool) Block {
-	if len(slice) == 0 {
-		return Block{}
-	}
-	return Block{
-		start:        unsafe.Pointer(&slice[0]),
-		length:       len(slice),
-		needSafecopy: needSafecopy,
-	}
-}
-
-// BlockFromSafePointer returns a Block equivalent to [ptr, ptr+len), which is
-// safe to access without safecopy.
-//
-// Preconditions: ptr+len does not overflow.
-func BlockFromSafePointer(ptr unsafe.Pointer, len int) Block {
-	return blockFromPointer(ptr, len, false)
-}
-
-// BlockFromUnsafePointer returns a Block equivalent to [ptr, ptr+len), which
-// is not safe to access without safecopy.
-//
-// Preconditions: ptr+len does not overflow.
-func BlockFromUnsafePointer(ptr unsafe.Pointer, len int) Block {
-	return blockFromPointer(ptr, len, true)
-}
-
-func blockFromPointer(ptr unsafe.Pointer, len int, needSafecopy bool) Block {
-	if uptr := uintptr(ptr); uptr+uintptr(len) < uptr {
-		panic(fmt.Sprintf("ptr %#x + len %#x overflows", ptr, len))
-	}
-	return Block{
-		start:        ptr,
-		length:       len,
-		needSafecopy: needSafecopy,
-	}
-}
-
-// DropFirst returns a Block equivalent to b, but with the first n bytes
-// omitted. It is analogous to the [n:] operation on a slice, except that if n
-// > b.Len(), DropFirst returns an empty Block instead of panicking.
-//
-// Preconditions: n >= 0.
-func (b Block) DropFirst(n int) Block {
-	if n < 0 {
-		panic(fmt.Sprintf("invalid n: %d", n))
-	}
-	return b.DropFirst64(uint64(n))
-}
-
-// DropFirst64 is equivalent to DropFirst but takes a uint64.
-func (b Block) DropFirst64(n uint64) Block {
-	if n >= uint64(b.length) {
-		return Block{}
-	}
-	return Block{
-		start:        unsafe.Pointer(uintptr(b.start) + uintptr(n)),
-		length:       b.length - int(n),
-		needSafecopy: b.needSafecopy,
-	}
-}
-
-// TakeFirst returns a Block equivalent to the first n bytes of b. It is
-// analogous to the [:n] operation on a slice, except that if n > b.Len(),
-// TakeFirst returns a copy of b instead of panicking.
-//
-// Preconditions: n >= 0.
-func (b Block) TakeFirst(n int) Block {
-	if n < 0 {
-		panic(fmt.Sprintf("invalid n: %d", n))
-	}
-	return b.TakeFirst64(uint64(n))
-}
-
-// TakeFirst64 is equivalent to TakeFirst but takes a uint64.
-func (b Block) TakeFirst64(n uint64) Block {
-	if n == 0 {
-		return Block{}
-	}
-	if n >= uint64(b.length) {
-		return b
-	}
-	return Block{
-		start:        b.start,
-		length:       int(n),
-		needSafecopy: b.needSafecopy,
-	}
-}
-
-// ToSlice returns a []byte equivalent to b.
-func (b Block) ToSlice() []byte {
-	var bs []byte
-	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs))
-	hdr.Data = uintptr(b.start)
-	hdr.Len = b.length
-	hdr.Cap = b.length
-	return bs
-}
-
-// Addr returns b's start address as a uintptr. It returns uintptr instead of
-// unsafe.Pointer so that code using safemem cannot obtain unsafe.Pointers
-// without importing the unsafe package explicitly.
-//
-// Note that a uintptr is not recognized as a pointer by the garbage collector,
-// such that if there are no uses of b after a call to b.Addr() and the address
-// is to Go-managed memory, the returned uintptr does not prevent garbage
-// collection of the pointee.
-func (b Block) Addr() uintptr {
-	return uintptr(b.start)
-}
-
-// Len returns b's length in bytes.
-func (b Block) Len() int {
-	return b.length
-}
-
-// NeedSafecopy returns true if accessing b.ToSlice() requires the use of safecopy.
-func (b Block) NeedSafecopy() bool {
-	return b.needSafecopy
-}
-
-// String implements fmt.Stringer.String.
-func (b Block) String() string {
-	if uintptr(b.start) == 0 && b.length == 0 {
-		return "<nil>"
-	}
-	var suffix string
-	if b.needSafecopy {
-		suffix = "*"
-	}
-	return fmt.Sprintf("[%#x-%#x)%s", uintptr(b.start), uintptr(b.start)+uintptr(b.length), suffix)
-}
-
-// Copy copies src.Len() or dst.Len() bytes, whichever is less, from src
-// to dst and returns the number of bytes copied.
-//
-// If src and dst overlap, the data stored in dst is unspecified.
-func Copy(dst, src Block) (int, error) {
-	if !dst.needSafecopy && !src.needSafecopy {
-		return copy(dst.ToSlice(), src.ToSlice()), nil
-	}
-
-	n := dst.length
-	if n > src.length {
-		n = src.length
-	}
-	if n == 0 {
-		return 0, nil
-	}
-
-	switch {
-	case dst.needSafecopy && !src.needSafecopy:
-		return safecopy.CopyOut(dst.start, src.TakeFirst(n).ToSlice())
-	case !dst.needSafecopy && src.needSafecopy:
-		return safecopy.CopyIn(dst.TakeFirst(n).ToSlice(), src.start)
-	case dst.needSafecopy && src.needSafecopy:
-		n64, err := safecopy.Copy(dst.start, src.start, uintptr(n))
-		return int(n64), err
-	default:
-		panic("unreachable")
-	}
-}
-
-// Zero sets all bytes in dst to 0 and returns the number of bytes zeroed.
-func Zero(dst Block) (int, error) {
-	if !dst.needSafecopy {
-		bs := dst.ToSlice()
-		for i := range bs {
-			bs[i] = 0
-		}
-		return len(bs), nil
-	}
-
-	n64, err := safecopy.ZeroOut(dst.start, uintptr(dst.length))
-	return int(n64), err
-}
-
-// Safecopy atomics are no slower than non-safecopy atomics, so use the former
-// even when !b.needSafecopy to get consistent alignment checking.
-
-// SwapUint32 invokes safecopy.SwapUint32 on the first 4 bytes of b.
-//
-// Preconditions: b.Len() >= 4.
-func SwapUint32(b Block, new uint32) (uint32, error) {
-	if b.length < 4 {
-		panic(fmt.Sprintf("insufficient length: %d", b.length))
-	}
-	return safecopy.SwapUint32(b.start, new)
-}
-
-// SwapUint64 invokes safecopy.SwapUint64 on the first 8 bytes of b.
-//
-// Preconditions: b.Len() >= 8.
-func SwapUint64(b Block, new uint64) (uint64, error) {
-	if b.length < 8 {
-		panic(fmt.Sprintf("insufficient length: %d", b.length))
-	}
-	return safecopy.SwapUint64(b.start, new)
-}
-
-// CompareAndSwapUint32 invokes safecopy.CompareAndSwapUint32 on the first 4
-// bytes of b.
-//
-// Preconditions: b.Len() >= 4.
-func CompareAndSwapUint32(b Block, old, new uint32) (uint32, error) {
-	if b.length < 4 {
-		panic(fmt.Sprintf("insufficient length: %d", b.length))
-	}
-	return safecopy.CompareAndSwapUint32(b.start, old, new)
-}
-
-// LoadUint32 invokes safecopy.LoadUint32 on the first 4 bytes of b.
-//
-// Preconditions: b.Len() >= 4.
-func LoadUint32(b Block) (uint32, error) {
-	if b.length < 4 {
-		panic(fmt.Sprintf("insufficient length: %d", b.length))
-	}
-	return safecopy.LoadUint32(b.start)
-}
diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go
deleted file mode 100644
index f039a5c34..000000000
--- a/pkg/sentry/safemem/io.go
+++ /dev/null
@@ -1,392 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package safemem
-
-import (
-	"errors"
-	"io"
-	"math"
-)
-
-// ErrEndOfBlockSeq is returned by BlockSeqWriter when attempting to write
-// beyond the end of the BlockSeq.
-var ErrEndOfBlockSeq = errors.New("write beyond end of BlockSeq")
-
-// Reader represents a streaming byte source like io.Reader.
-type Reader interface {
-	// ReadToBlocks reads up to dsts.NumBytes() bytes into dsts and returns the
-	// number of bytes read. It may return a partial read without an error
-	// (i.e. (n, nil) where 0 < n < dsts.NumBytes()). It should not return a
-	// full read with an error (i.e. (dsts.NumBytes(), err) where err != nil);
-	// note that this differs from io.Reader.Read (in particular, io.EOF should
-	// not be returned if ReadToBlocks successfully reads dsts.NumBytes()
-	// bytes.)
-	ReadToBlocks(dsts BlockSeq) (uint64, error)
-}
-
-// Writer represents a streaming byte sink like io.Writer.
-type Writer interface {
-	// WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns
-	// the number of bytes written. It may return a partial write without an
-	// error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not
-	// return a full write with an error (i.e. srcs.NumBytes(), err) where err
-	// != nil).
-	WriteFromBlocks(srcs BlockSeq) (uint64, error)
-}
-
-// ReadFullToBlocks repeatedly invokes r.ReadToBlocks until dsts.NumBytes()
-// bytes have been read or ReadToBlocks returns an error.
-func ReadFullToBlocks(r Reader, dsts BlockSeq) (uint64, error) {
-	var done uint64
-	for !dsts.IsEmpty() {
-		n, err := r.ReadToBlocks(dsts)
-		done += n
-		if err != nil {
-			return done, err
-		}
-		dsts = dsts.DropFirst64(n)
-	}
-	return done, nil
-}
-
-// WriteFullFromBlocks repeatedly invokes w.WriteFromBlocks until
-// srcs.NumBytes() bytes have been written or WriteFromBlocks returns an error.
-func WriteFullFromBlocks(w Writer, srcs BlockSeq) (uint64, error) {
-	var done uint64
-	for !srcs.IsEmpty() {
-		n, err := w.WriteFromBlocks(srcs)
-		done += n
-		if err != nil {
-			return done, err
-		}
-		srcs = srcs.DropFirst64(n)
-	}
-	return done, nil
-}
-
-// BlockSeqReader implements Reader by reading from a BlockSeq.
-type BlockSeqReader struct {
-	Blocks BlockSeq
-}
-
-// ReadToBlocks implements Reader.ReadToBlocks.
-func (r *BlockSeqReader) ReadToBlocks(dsts BlockSeq) (uint64, error) {
-	n, err := CopySeq(dsts, r.Blocks)
-	r.Blocks = r.Blocks.DropFirst64(n)
-	if err != nil {
-		return n, err
-	}
-	if n < dsts.NumBytes() {
-		return n, io.EOF
-	}
-	return n, nil
-}
-
-// BlockSeqWriter implements Writer by writing to a BlockSeq.
-type BlockSeqWriter struct {
-	Blocks BlockSeq
-}
-
-// WriteFromBlocks implements Writer.WriteFromBlocks.
-func (w *BlockSeqWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) {
-	n, err := CopySeq(w.Blocks, srcs)
-	w.Blocks = w.Blocks.DropFirst64(n)
-	if err != nil {
-		return n, err
-	}
-	if n < srcs.NumBytes() {
-		return n, ErrEndOfBlockSeq
-	}
-	return n, nil
-}
-
-// ReaderFunc implements Reader for a function with the semantics of
-// Reader.ReadToBlocks.
-type ReaderFunc func(dsts BlockSeq) (uint64, error)
-
-// ReadToBlocks implements Reader.ReadToBlocks.
-func (f ReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) {
-	return f(dsts)
-}
-
-// WriterFunc implements Writer for a function with the semantics of
-// Writer.WriteFromBlocks.
-type WriterFunc func(srcs BlockSeq) (uint64, error)
-
-// WriteFromBlocks implements Writer.WriteFromBlocks.
-func (f WriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) {
-	return f(srcs)
-}
-
-// ToIOReader implements io.Reader for a (safemem.)Reader.
-//
-// ToIOReader will return a successful partial read iff Reader.ReadToBlocks does
-// so.
-type ToIOReader struct {
-	Reader Reader
-}
-
-// Read implements io.Reader.Read.
-func (r ToIOReader) Read(dst []byte) (int, error) {
-	n, err := r.Reader.ReadToBlocks(BlockSeqOf(BlockFromSafeSlice(dst)))
-	return int(n), err
-}
-
-// ToIOWriter implements io.Writer for a (safemem.)Writer.
-type ToIOWriter struct {
-	Writer Writer
-}
-
-// Write implements io.Writer.Write.
-func (w ToIOWriter) Write(src []byte) (int, error) {
-	// io.Writer does not permit partial writes.
-	n, err := WriteFullFromBlocks(w.Writer, BlockSeqOf(BlockFromSafeSlice(src)))
-	return int(n), err
-}
-
-// FromIOReader implements Reader for an io.Reader by repeatedly invoking
-// io.Reader.Read until it returns an error or partial read. This is not
-// thread-safe.
-//
-// FromIOReader will return a successful partial read iff Reader.Read does so.
-type FromIOReader struct {
-	Reader io.Reader
-}
-
-// ReadToBlocks implements Reader.ReadToBlocks.
-func (r FromIOReader) ReadToBlocks(dsts BlockSeq) (uint64, error) {
-	var buf []byte
-	var done uint64
-	for !dsts.IsEmpty() {
-		dst := dsts.Head()
-		var n int
-		var err error
-		n, buf, err = r.readToBlock(dst, buf)
-		done += uint64(n)
-		if n != dst.Len() {
-			return done, err
-		}
-		dsts = dsts.Tail()
-		if err != nil {
-			if dsts.IsEmpty() && err == io.EOF {
-				return done, nil
-			}
-			return done, err
-		}
-	}
-	return done, nil
-}
-
-func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) {
-	// io.Reader isn't safecopy-aware, so we have to buffer Blocks that require
-	// safecopy.
-	if !dst.NeedSafecopy() {
-		n, err := r.Reader.Read(dst.ToSlice())
-		return n, buf, err
-	}
-	if len(buf) < dst.Len() {
-		buf = make([]byte, dst.Len())
-	}
-	rn, rerr := r.Reader.Read(buf[:dst.Len()])
-	wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn]))
-	if wberr != nil {
-		return wbn, buf, wberr
-	}
-	return wbn, buf, rerr
-}
-
-// FromIOReaderAt implements Reader for an io.ReaderAt. Does not repeatedly
-// invoke io.ReaderAt.ReadAt because ReadAt is more strict than Read. A partial
-// read indicates an error. This is not thread-safe.
-type FromIOReaderAt struct {
-	ReaderAt io.ReaderAt
-	Offset   int64
-}
-
-// ReadToBlocks implements Reader.ReadToBlocks.
-func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) {
-	var buf []byte
-	var done uint64
-	for !dsts.IsEmpty() {
-		dst := dsts.Head()
-		var n int
-		var err error
-		n, buf, err = r.readToBlock(dst, buf)
-		done += uint64(n)
-		if n != dst.Len() {
-			return done, err
-		}
-		dsts = dsts.Tail()
-		if err != nil {
-			if dsts.IsEmpty() && err == io.EOF {
-				return done, nil
-			}
-			return done, err
-		}
-	}
-	return done, nil
-}
-
-func (r FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) {
-	// io.Reader isn't safecopy-aware, so we have to buffer Blocks that require
-	// safecopy.
-	if !dst.NeedSafecopy() {
-		n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset)
-		r.Offset += int64(n)
-		return n, buf, err
-	}
-	if len(buf) < dst.Len() {
-		buf = make([]byte, dst.Len())
-	}
-	rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset)
-	r.Offset += int64(rn)
-	wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn]))
-	if wberr != nil {
-		return wbn, buf, wberr
-	}
-	return wbn, buf, rerr
-}
-
-// FromIOWriter implements Writer for an io.Writer by repeatedly invoking
-// io.Writer.Write until it returns an error or partial write.
-//
-// FromIOWriter will tolerate implementations of io.Writer.Write that return
-// partial writes with a nil error in contravention of io.Writer's
-// requirements, since Writer is permitted to do so. FromIOWriter will return a
-// successful partial write iff Writer.Write does so.
-type FromIOWriter struct {
-	Writer io.Writer
-}
-
-// WriteFromBlocks implements Writer.WriteFromBlocks.
-func (w FromIOWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) {
-	var buf []byte
-	var done uint64
-	for !srcs.IsEmpty() {
-		src := srcs.Head()
-		var n int
-		var err error
-		n, buf, err = w.writeFromBlock(src, buf)
-		done += uint64(n)
-		if n != src.Len() || err != nil {
-			return done, err
-		}
-		srcs = srcs.Tail()
-	}
-	return done, nil
-}
-
-func (w FromIOWriter) writeFromBlock(src Block, buf []byte) (int, []byte, error) {
-	// io.Writer isn't safecopy-aware, so we have to buffer Blocks that require
-	// safecopy.
-	if !src.NeedSafecopy() {
-		n, err := w.Writer.Write(src.ToSlice())
-		return n, buf, err
-	}
-	if len(buf) < src.Len() {
-		buf = make([]byte, src.Len())
-	}
-	bufn, buferr := Copy(BlockFromSafeSlice(buf[:src.Len()]), src)
-	wn, werr := w.Writer.Write(buf[:bufn])
-	if werr != nil {
-		return wn, buf, werr
-	}
-	return wn, buf, buferr
-}
-
-// FromVecReaderFunc implements Reader for a function that reads data into a
-// [][]byte and returns the number of bytes read as an int64.
-type FromVecReaderFunc struct {
-	ReadVec func(dsts [][]byte) (int64, error)
-}
-
-// ReadToBlocks implements Reader.ReadToBlocks.
-//
-// ReadToBlocks calls r.ReadVec at most once.
-func (r FromVecReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) {
-	if dsts.IsEmpty() {
-		return 0, nil
-	}
-	// Ensure that we don't pass a [][]byte with a total length > MaxInt64.
-	dsts = dsts.TakeFirst64(uint64(math.MaxInt64))
-	dstSlices := make([][]byte, 0, dsts.NumBlocks())
-	// Buffer Blocks that require safecopy.
-	for tmp := dsts; !tmp.IsEmpty(); tmp = tmp.Tail() {
-		dst := tmp.Head()
-		if dst.NeedSafecopy() {
-			dstSlices = append(dstSlices, make([]byte, dst.Len()))
-		} else {
-			dstSlices = append(dstSlices, dst.ToSlice())
-		}
-	}
-	rn, rerr := r.ReadVec(dstSlices)
-	dsts = dsts.TakeFirst64(uint64(rn))
-	var done uint64
-	var i int
-	for !dsts.IsEmpty() {
-		dst := dsts.Head()
-		if dst.NeedSafecopy() {
-			n, err := Copy(dst, BlockFromSafeSlice(dstSlices[i]))
-			done += uint64(n)
-			if err != nil {
-				return done, err
-			}
-		} else {
-			done += uint64(dst.Len())
-		}
-		dsts = dsts.Tail()
-		i++
-	}
-	return done, rerr
-}
-
-// FromVecWriterFunc implements Writer for a function that writes data from a
-// [][]byte and returns the number of bytes written.
-type FromVecWriterFunc struct {
-	WriteVec func(srcs [][]byte) (int64, error)
-}
-
-// WriteFromBlocks implements Writer.WriteFromBlocks.
-//
-// WriteFromBlocks calls w.WriteVec at most once.
-func (w FromVecWriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) {
-	if srcs.IsEmpty() {
-		return 0, nil
-	}
-	// Ensure that we don't pass a [][]byte with a total length > MaxInt64.
-	srcs = srcs.TakeFirst64(uint64(math.MaxInt64))
-	srcSlices := make([][]byte, 0, srcs.NumBlocks())
-	// Buffer Blocks that require safecopy.
-	var buferr error
-	for tmp := srcs; !tmp.IsEmpty(); tmp = tmp.Tail() {
-		src := tmp.Head()
-		if src.NeedSafecopy() {
-			slice := make([]byte, src.Len())
-			n, err := Copy(BlockFromSafeSlice(slice), src)
-			srcSlices = append(srcSlices, slice[:n])
-			if err != nil {
-				buferr = err
-				break
-			}
-		} else {
-			srcSlices = append(srcSlices, src.ToSlice())
-		}
-	}
-	n, err := w.WriteVec(srcSlices)
-	if err != nil {
-		return uint64(n), err
-	}
-	return uint64(n), buferr
-}
diff --git a/pkg/sentry/safemem/io_test.go b/pkg/sentry/safemem/io_test.go
deleted file mode 100644
index 629741bee..000000000
--- a/pkg/sentry/safemem/io_test.go
+++ /dev/null
@@ -1,199 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package safemem
-
-import (
-	"bytes"
-	"io"
-	"testing"
-)
-
-func makeBlocks(slices ...[]byte) []Block {
-	blocks := make([]Block, 0, len(slices))
-	for _, s := range slices {
-		blocks = append(blocks, BlockFromSafeSlice(s))
-	}
-	return blocks
-}
-
-func TestFromIOReaderFullRead(t *testing.T) {
-	r := FromIOReader{bytes.NewBufferString("foobar")}
-	dsts := makeBlocks(make([]byte, 3), make([]byte, 3))
-	n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts))
-	if wantN := uint64(6); n != wantN || err != nil {
-		t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	for i, want := range [][]byte{[]byte("foo"), []byte("bar")} {
-		if got := dsts[i].ToSlice(); !bytes.Equal(got, want) {
-			t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want)
-		}
-	}
-}
-
-type eofHidingReader struct {
-	Reader io.Reader
-}
-
-func (r eofHidingReader) Read(dst []byte) (int, error) {
-	n, err := r.Reader.Read(dst)
-	if err == io.EOF {
-		return n, nil
-	}
-	return n, err
-}
-
-func TestFromIOReaderPartialRead(t *testing.T) {
-	r := FromIOReader{eofHidingReader{bytes.NewBufferString("foob")}}
-	dsts := makeBlocks(make([]byte, 3), make([]byte, 3))
-	n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts))
-	// FromIOReader should stop after the eofHidingReader returns (1, nil)
-	// for a 3-byte read.
-	if wantN := uint64(4); n != wantN || err != nil {
-		t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	for i, want := range [][]byte{[]byte("foo"), []byte("b\x00\x00")} {
-		if got := dsts[i].ToSlice(); !bytes.Equal(got, want) {
-			t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want)
-		}
-	}
-}
-
-type singleByteReader struct {
-	Reader io.Reader
-}
-
-func (r singleByteReader) Read(dst []byte) (int, error) {
-	if len(dst) == 0 {
-		return r.Reader.Read(dst)
-	}
-	return r.Reader.Read(dst[:1])
-}
-
-func TestSingleByteReader(t *testing.T) {
-	r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}}
-	dsts := makeBlocks(make([]byte, 3), make([]byte, 3))
-	n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts))
-	// FromIOReader should stop after the singleByteReader returns (1, nil)
-	// for a 3-byte read.
-	if wantN := uint64(1); n != wantN || err != nil {
-		t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	for i, want := range [][]byte{[]byte("f\x00\x00"), []byte("\x00\x00\x00")} {
-		if got := dsts[i].ToSlice(); !bytes.Equal(got, want) {
-			t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want)
-		}
-	}
-}
-
-func TestReadFullToBlocks(t *testing.T) {
-	r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}}
-	dsts := makeBlocks(make([]byte, 3), make([]byte, 3))
-	n, err := ReadFullToBlocks(r, BlockSeqFromSlice(dsts))
-	// ReadFullToBlocks should call into FromIOReader => singleByteReader
-	// repeatedly until dsts is exhausted.
-	if wantN := uint64(6); n != wantN || err != nil {
-		t.Errorf("ReadFullToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	for i, want := range [][]byte{[]byte("foo"), []byte("bar")} {
-		if got := dsts[i].ToSlice(); !bytes.Equal(got, want) {
-			t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want)
-		}
-	}
-}
-
-func TestFromIOWriterFullWrite(t *testing.T) {
-	srcs := makeBlocks([]byte("foo"), []byte("bar"))
-	var dst bytes.Buffer
-	w := FromIOWriter{&dst}
-	n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs))
-	if wantN := uint64(6); n != wantN || err != nil {
-		t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) {
-		t.Errorf("dst: got %q, wanted %q", got, want)
-	}
-}
-
-type limitedWriter struct {
-	Writer io.Writer
-	Done   int
-	Limit  int
-}
-
-func (w *limitedWriter) Write(src []byte) (int, error) {
-	count := len(src)
-	if count > (w.Limit - w.Done) {
-		count = w.Limit - w.Done
-	}
-	n, err := w.Writer.Write(src[:count])
-	w.Done += n
-	return n, err
-}
-
-func TestFromIOWriterPartialWrite(t *testing.T) {
-	srcs := makeBlocks([]byte("foo"), []byte("bar"))
-	var dst bytes.Buffer
-	w := FromIOWriter{&limitedWriter{&dst, 0, 4}}
-	n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs))
-	// FromIOWriter should stop after the limitedWriter returns (1, nil) for a
-	// 3-byte write.
-	if wantN := uint64(4); n != wantN || err != nil {
-		t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) {
-		t.Errorf("dst: got %q, wanted %q", got, want)
-	}
-}
-
-type singleByteWriter struct {
-	Writer io.Writer
-}
-
-func (w singleByteWriter) Write(src []byte) (int, error) {
-	if len(src) == 0 {
-		return w.Writer.Write(src)
-	}
-	return w.Writer.Write(src[:1])
-}
-
-func TestSingleByteWriter(t *testing.T) {
-	srcs := makeBlocks([]byte("foo"), []byte("bar"))
-	var dst bytes.Buffer
-	w := FromIOWriter{singleByteWriter{&dst}}
-	n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs))
-	// FromIOWriter should stop after the singleByteWriter returns (1, nil)
-	// for a 3-byte write.
-	if wantN := uint64(1); n != wantN || err != nil {
-		t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if got, want := dst.Bytes(), []byte("f"); !bytes.Equal(got, want) {
-		t.Errorf("dst: got %q, wanted %q", got, want)
-	}
-}
-
-func TestWriteFullToBlocks(t *testing.T) {
-	srcs := makeBlocks([]byte("foo"), []byte("bar"))
-	var dst bytes.Buffer
-	w := FromIOWriter{singleByteWriter{&dst}}
-	n, err := WriteFullFromBlocks(w, BlockSeqFromSlice(srcs))
-	// WriteFullToBlocks should call into FromIOWriter => singleByteWriter
-	// repeatedly until srcs is exhausted.
-	if wantN := uint64(6); n != wantN || err != nil {
-		t.Errorf("WriteFullFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) {
-		t.Errorf("dst: got %q, wanted %q", got, want)
-	}
-}
diff --git a/pkg/sentry/safemem/safemem.go b/pkg/sentry/safemem/safemem.go
deleted file mode 100644
index 3e70d33a2..000000000
--- a/pkg/sentry/safemem/safemem.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package safemem provides the Block and BlockSeq types.
-package safemem
diff --git a/pkg/sentry/safemem/seq_test.go b/pkg/sentry/safemem/seq_test.go
deleted file mode 100644
index eba4bb535..000000000
--- a/pkg/sentry/safemem/seq_test.go
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package safemem
-
-import (
-	"bytes"
-	"reflect"
-	"testing"
-)
-
-type blockSeqTest struct {
-	desc string
-
-	pieces     []string
-	haveOffset bool
-	offset     uint64
-	haveLimit  bool
-	limit      uint64
-
-	want string
-}
-
-func (t blockSeqTest) NonEmptyByteSlices() [][]byte {
-	// t is a value, so we can mutate it freely.
-	slices := make([][]byte, 0, len(t.pieces))
-	for _, str := range t.pieces {
-		if t.haveOffset {
-			strOff := t.offset
-			if strOff > uint64(len(str)) {
-				strOff = uint64(len(str))
-			}
-			str = str[strOff:]
-			t.offset -= strOff
-		}
-		if t.haveLimit {
-			strLim := t.limit
-			if strLim > uint64(len(str)) {
-				strLim = uint64(len(str))
-			}
-			str = str[:strLim]
-			t.limit -= strLim
-		}
-		if len(str) != 0 {
-			slices = append(slices, []byte(str))
-		}
-	}
-	return slices
-}
-
-func (t blockSeqTest) BlockSeq() BlockSeq {
-	blocks := make([]Block, 0, len(t.pieces))
-	for _, str := range t.pieces {
-		blocks = append(blocks, BlockFromSafeSlice([]byte(str)))
-	}
-	bs := BlockSeqFromSlice(blocks)
-	if t.haveOffset {
-		bs = bs.DropFirst64(t.offset)
-	}
-	if t.haveLimit {
-		bs = bs.TakeFirst64(t.limit)
-	}
-	return bs
-}
-
-var blockSeqTests = []blockSeqTest{
-	{
-		desc: "Empty sequence",
-	},
-	{
-		desc:   "Sequence of length 1",
-		pieces: []string{"foobar"},
-		want:   "foobar",
-	},
-	{
-		desc:   "Sequence of length 2",
-		pieces: []string{"foo", "bar"},
-		want:   "foobar",
-	},
-	{
-		desc:   "Empty Blocks",
-		pieces: []string{"", "foo", "", "", "bar", ""},
-		want:   "foobar",
-	},
-	{
-		desc:       "Sequence with non-zero offset",
-		pieces:     []string{"foo", "bar"},
-		haveOffset: true,
-		offset:     2,
-		want:       "obar",
-	},
-	{
-		desc:      "Sequence with non-maximal limit",
-		pieces:    []string{"foo", "bar"},
-		haveLimit: true,
-		limit:     5,
-		want:      "fooba",
-	},
-	{
-		desc:       "Sequence with offset and limit",
-		pieces:     []string{"foo", "bar"},
-		haveOffset: true,
-		offset:     2,
-		haveLimit:  true,
-		limit:      3,
-		want:       "oba",
-	},
-}
-
-func TestBlockSeqNumBytes(t *testing.T) {
-	for _, test := range blockSeqTests {
-		t.Run(test.desc, func(t *testing.T) {
-			if got, want := test.BlockSeq().NumBytes(), uint64(len(test.want)); got != want {
-				t.Errorf("NumBytes: got %d, wanted %d", got, want)
-			}
-		})
-	}
-}
-
-func TestBlockSeqIterBlocks(t *testing.T) {
-	// Tests BlockSeq iteration using Head/Tail.
-	for _, test := range blockSeqTests {
-		t.Run(test.desc, func(t *testing.T) {
-			srcs := test.BlockSeq()
-			// "Note that a non-nil empty slice and a nil slice ... are not
-			// deeply equal." - reflect
-			slices := make([][]byte, 0, 0)
-			for !srcs.IsEmpty() {
-				src := srcs.Head()
-				slices = append(slices, src.ToSlice())
-				nextSrcs := srcs.Tail()
-				if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-uint64(src.Len()); got != want {
-					t.Fatalf("%v.Tail(): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want)
-				}
-				srcs = nextSrcs
-			}
-			if wantSlices := test.NonEmptyByteSlices(); !reflect.DeepEqual(slices, wantSlices) {
-				t.Errorf("Accumulated slices: got %v, wanted %v", slices, wantSlices)
-			}
-		})
-	}
-}
-
-func TestBlockSeqIterBytes(t *testing.T) {
-	// Tests BlockSeq iteration using Head/DropFirst.
-	for _, test := range blockSeqTests {
-		t.Run(test.desc, func(t *testing.T) {
-			srcs := test.BlockSeq()
-			var dst bytes.Buffer
-			for !srcs.IsEmpty() {
-				src := srcs.Head()
-				var b [1]byte
-				n, err := Copy(BlockFromSafeSlice(b[:]), src)
-				if n != 1 || err != nil {
-					t.Fatalf("Copy: got (%v, %v), wanted (1, nil)", n, err)
-				}
-				dst.WriteByte(b[0])
-				nextSrcs := srcs.DropFirst(1)
-				if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-1; got != want {
-					t.Fatalf("%v.DropFirst(1): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want)
-				}
-				srcs = nextSrcs
-			}
-			if got := string(dst.Bytes()); got != test.want {
-				t.Errorf("Copied string: got %q, wanted %q", got, test.want)
-			}
-		})
-	}
-}
-
-func TestBlockSeqDropBeyondLimit(t *testing.T) {
-	blocks := []Block{BlockFromSafeSlice([]byte("123")), BlockFromSafeSlice([]byte("4"))}
-	bs := BlockSeqFromSlice(blocks)
-	if got, want := bs.NumBytes(), uint64(4); got != want {
-		t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want)
-	}
-	bs = bs.TakeFirst(1)
-	if got, want := bs.NumBytes(), uint64(1); got != want {
-		t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want)
-	}
-	bs = bs.DropFirst(2)
-	if got, want := bs.NumBytes(), uint64(0); got != want {
-		t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want)
-	}
-}
diff --git a/pkg/sentry/safemem/seq_unsafe.go b/pkg/sentry/safemem/seq_unsafe.go
deleted file mode 100644
index 354a95dde..000000000
--- a/pkg/sentry/safemem/seq_unsafe.go
+++ /dev/null
@@ -1,299 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package safemem
-
-import (
-	"bytes"
-	"fmt"
-	"reflect"
-	"unsafe"
-)
-
-// A BlockSeq represents a sequence of Blocks, each of which has non-zero
-// length.
-//
-// BlockSeqs are immutable and may be copied by value. The zero value of
-// BlockSeq represents an empty sequence.
-type BlockSeq struct {
-	// If length is 0, then the BlockSeq is empty. Invariants: data == 0;
-	// offset == 0; limit == 0.
-	//
-	// If length is -1, then the BlockSeq represents the single Block{data,
-	// limit, false}. Invariants: offset == 0; limit > 0; limit does not
-	// overflow the range of an int.
-	//
-	// If length is -2, then the BlockSeq represents the single Block{data,
-	// limit, true}. Invariants: offset == 0; limit > 0; limit does not
-	// overflow the range of an int.
-	//
-	// Otherwise, length >= 2, and the BlockSeq represents the `length` Blocks
-	// in the array of Blocks starting at address `data`, starting at `offset`
-	// bytes into the first Block and limited to the following `limit` bytes.
-	// Invariants: data != 0; offset < len(data[0]); limit > 0; offset+limit <=
-	// the combined length of all Blocks in the array; the first Block in the
-	// array has non-zero length.
-	//
-	// length is never 1; sequences consisting of a single Block are always
-	// stored inline (with length < 0).
-	data   unsafe.Pointer
-	length int
-	offset int
-	limit  uint64
-}
-
-// BlockSeqOf returns a BlockSeq representing the single Block b.
-func BlockSeqOf(b Block) BlockSeq {
-	bs := BlockSeq{
-		data:   b.start,
-		length: -1,
-		limit:  uint64(b.length),
-	}
-	if b.needSafecopy {
-		bs.length = -2
-	}
-	return bs
-}
-
-// BlockSeqFromSlice returns a BlockSeq representing all Blocks in slice.
-// If slice contains Blocks with zero length, BlockSeq will skip them during
-// iteration.
-//
-// Whether the returned BlockSeq shares memory with slice is unspecified;
-// clients should avoid mutating slices passed to BlockSeqFromSlice.
-//
-// Preconditions: The combined length of all Blocks in slice <= math.MaxUint64.
-func BlockSeqFromSlice(slice []Block) BlockSeq {
-	slice = skipEmpty(slice)
-	var limit uint64
-	for _, b := range slice {
-		sum := limit + uint64(b.Len())
-		if sum < limit {
-			panic("BlockSeq length overflows uint64")
-		}
-		limit = sum
-	}
-	return blockSeqFromSliceLimited(slice, limit)
-}
-
-// Preconditions: The combined length of all Blocks in slice <= limit. If
-// len(slice) != 0, the first Block in slice has non-zero length, and limit >
-// 0.
-func blockSeqFromSliceLimited(slice []Block, limit uint64) BlockSeq {
-	switch len(slice) {
-	case 0:
-		return BlockSeq{}
-	case 1:
-		return BlockSeqOf(slice[0].TakeFirst64(limit))
-	default:
-		return BlockSeq{
-			data:   unsafe.Pointer(&slice[0]),
-			length: len(slice),
-			limit:  limit,
-		}
-	}
-}
-
-func skipEmpty(slice []Block) []Block {
-	for i, b := range slice {
-		if b.Len() != 0 {
-			return slice[i:]
-		}
-	}
-	return nil
-}
-
-// IsEmpty returns true if bs contains no Blocks.
-//
-// Invariants: bs.IsEmpty() == (bs.NumBlocks() == 0) == (bs.NumBytes() == 0).
-// (Of these, prefer to use bs.IsEmpty().)
-func (bs BlockSeq) IsEmpty() bool {
-	return bs.length == 0
-}
-
-// NumBlocks returns the number of Blocks in bs.
-func (bs BlockSeq) NumBlocks() int {
-	// In general, we have to count: if bs represents a windowed slice then the
-	// slice may contain Blocks with zero length, and bs.length may be larger
-	// than the actual number of Blocks due to bs.limit.
-	var n int
-	for !bs.IsEmpty() {
-		n++
-		bs = bs.Tail()
-	}
-	return n
-}
-
-// NumBytes returns the sum of Block.Len() for all Blocks in bs.
-func (bs BlockSeq) NumBytes() uint64 {
-	return bs.limit
-}
-
-// Head returns the first Block in bs.
-//
-// Preconditions: !bs.IsEmpty().
-func (bs BlockSeq) Head() Block {
-	if bs.length == 0 {
-		panic("empty BlockSeq")
-	}
-	if bs.length < 0 {
-		return bs.internalBlock()
-	}
-	return (*Block)(bs.data).DropFirst(bs.offset).TakeFirst64(bs.limit)
-}
-
-// Preconditions: bs.length < 0.
-func (bs BlockSeq) internalBlock() Block {
-	return Block{
-		start:        bs.data,
-		length:       int(bs.limit),
-		needSafecopy: bs.length == -2,
-	}
-}
-
-// Tail returns a BlockSeq consisting of all Blocks in bs after the first.
-//
-// Preconditions: !bs.IsEmpty().
-func (bs BlockSeq) Tail() BlockSeq {
-	if bs.length == 0 {
-		panic("empty BlockSeq")
-	}
-	if bs.length < 0 {
-		return BlockSeq{}
-	}
-	head := (*Block)(bs.data).DropFirst(bs.offset)
-	headLen := uint64(head.Len())
-	if headLen >= bs.limit {
-		// The head Block exhausts the limit, so the tail is empty.
-		return BlockSeq{}
-	}
-	var extSlice []Block
-	extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice))
-	extSliceHdr.Data = uintptr(bs.data)
-	extSliceHdr.Len = bs.length
-	extSliceHdr.Cap = bs.length
-	tailSlice := skipEmpty(extSlice[1:])
-	tailLimit := bs.limit - headLen
-	return blockSeqFromSliceLimited(tailSlice, tailLimit)
-}
-
-// DropFirst returns a BlockSeq equivalent to bs, but with the first n bytes
-// omitted. If n > bs.NumBytes(), DropFirst returns an empty BlockSeq.
-//
-// Preconditions: n >= 0.
-func (bs BlockSeq) DropFirst(n int) BlockSeq {
-	if n < 0 {
-		panic(fmt.Sprintf("invalid n: %d", n))
-	}
-	return bs.DropFirst64(uint64(n))
-}
-
-// DropFirst64 is equivalent to DropFirst but takes an uint64.
-func (bs BlockSeq) DropFirst64(n uint64) BlockSeq {
-	if n >= bs.limit {
-		return BlockSeq{}
-	}
-	for {
-		// Calling bs.Head() here is surprisingly expensive, so inline getting
-		// the head's length.
-		var headLen uint64
-		if bs.length < 0 {
-			headLen = bs.limit
-		} else {
-			headLen = uint64((*Block)(bs.data).Len() - bs.offset)
-		}
-		if n < headLen {
-			// Dropping ends partway through the head Block.
-			if bs.length < 0 {
-				return BlockSeqOf(bs.internalBlock().DropFirst64(n))
-			}
-			bs.offset += int(n)
-			bs.limit -= n
-			return bs
-		}
-		n -= headLen
-		bs = bs.Tail()
-	}
-}
-
-// TakeFirst returns a BlockSeq equivalent to the first n bytes of bs. If n >
-// bs.NumBytes(), TakeFirst returns a BlockSeq equivalent to bs.
-//
-// Preconditions: n >= 0.
-func (bs BlockSeq) TakeFirst(n int) BlockSeq {
-	if n < 0 {
-		panic(fmt.Sprintf("invalid n: %d", n))
-	}
-	return bs.TakeFirst64(uint64(n))
-}
-
-// TakeFirst64 is equivalent to TakeFirst but takes a uint64.
-func (bs BlockSeq) TakeFirst64(n uint64) BlockSeq {
-	if n == 0 {
-		return BlockSeq{}
-	}
-	if bs.limit > n {
-		bs.limit = n
-	}
-	return bs
-}
-
-// String implements fmt.Stringer.String.
-func (bs BlockSeq) String() string {
-	var buf bytes.Buffer
-	buf.WriteByte('[')
-	var sep string
-	for !bs.IsEmpty() {
-		buf.WriteString(sep)
-		sep = " "
-		buf.WriteString(bs.Head().String())
-		bs = bs.Tail()
-	}
-	buf.WriteByte(']')
-	return buf.String()
-}
-
-// CopySeq copies srcs.NumBytes() or dsts.NumBytes() bytes, whichever is less,
-// from srcs to dsts and returns the number of bytes copied.
-//
-// If srcs and dsts overlap, the data stored in dsts is unspecified.
-func CopySeq(dsts, srcs BlockSeq) (uint64, error) {
-	var done uint64
-	for !dsts.IsEmpty() && !srcs.IsEmpty() {
-		dst := dsts.Head()
-		src := srcs.Head()
-		n, err := Copy(dst, src)
-		done += uint64(n)
-		if err != nil {
-			return done, err
-		}
-		dsts = dsts.DropFirst(n)
-		srcs = srcs.DropFirst(n)
-	}
-	return done, nil
-}
-
-// ZeroSeq sets all bytes in dsts to 0 and returns the number of bytes zeroed.
-func ZeroSeq(dsts BlockSeq) (uint64, error) {
-	var done uint64
-	for !dsts.IsEmpty() {
-		n, err := Zero(dsts.Head())
-		done += uint64(n)
-		if err != nil {
-			return done, err
-		}
-		dsts = dsts.DropFirst(n)
-	}
-	return done, nil
-}
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index 8e2b97afb..611fa22c3 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -9,15 +9,15 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/socket/unix/transport",
-        "//pkg/sentry/usermem",
         "//pkg/syserr",
         "//pkg/tcpip",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 3850f6345..79e16d6e8 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -12,13 +12,13 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix/transport",
-        "//pkg/sentry/usermem",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 1684dfc24..00265f15b 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -19,14 +19,14 @@ package control
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const maxInt = int(^uint(0) >> 1)
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index 42bf7be6a..5a07d5d0e 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -16,23 +16,23 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
+        "//pkg/context",
         "//pkg/fdnotifier",
         "//pkg/log",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/time",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/control",
-        "//pkg/sentry/usermem",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip/stack",
+        "//pkg/usermem",
         "//pkg/waiter",
         "@org_golang_x_sys//unix:go_default_library",
     ],
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index c957b0f1d..bde4c7a1e 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -21,19 +21,19 @@ import (
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/control"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go
index e69ec38c2..cd67234d2 100644
--- a/pkg/sentry/socket/hostinet/socket_unsafe.go
+++ b/pkg/sentry/socket/hostinet/socket_unsafe.go
@@ -19,14 +19,14 @@ import (
 	"unsafe"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func firstBytePtr(bs []byte) unsafe.Pointer {
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index e67b46c9e..034eca676 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -25,13 +25,13 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 var defaultRecvBufSize = inet.TCPBufferSize{
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index ed34a8308..fa2a2cb66 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -15,10 +15,10 @@ go_library(
         "//pkg/binary",
         "//pkg/log",
         "//pkg/sentry/kernel",
-        "//pkg/sentry/usermem",
         "//pkg/syserr",
         "//pkg/tcpip",
         "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index c65c36081..6ef740463 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -23,11 +23,11 @@ import (
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // errorTargetName is used to mark targets as error targets. Error targets
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index baaac13c6..f8b8e467d 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -13,8 +13,8 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
+        "//pkg/context",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
@@ -25,11 +25,11 @@ go_library(
         "//pkg/sentry/socket/netlink/port",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/socket/unix/transport",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go
index ce0a1afd0..b21e0ca4b 100644
--- a/pkg/sentry/socket/netlink/message.go
+++ b/pkg/sentry/socket/netlink/message.go
@@ -20,7 +20,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // alignUp rounds a length up to an alignment.
diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go
index be005df24..07f860a49 100644
--- a/pkg/sentry/socket/netlink/provider.go
+++ b/pkg/sentry/socket/netlink/provider.go
@@ -18,7 +18,7 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD
index 2137c7aeb..0234aadde 100644
--- a/pkg/sentry/socket/netlink/route/BUILD
+++ b/pkg/sentry/socket/netlink/route/BUILD
@@ -8,7 +8,7 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index 6b4a0ecf4..80a15d6cb 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -19,7 +19,7 @@ import (
 	"bytes"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index cea56f4ed..c4b95debb 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -20,8 +20,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
@@ -32,11 +32,11 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/socket/netlink/uevent/BUILD b/pkg/sentry/socket/netlink/uevent/BUILD
index 73fbdf1eb..b6434923c 100644
--- a/pkg/sentry/socket/netlink/uevent/BUILD
+++ b/pkg/sentry/socket/netlink/uevent/BUILD
@@ -8,7 +8,7 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/kernel",
         "//pkg/sentry/socket/netlink",
         "//pkg/syserr",
diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go
index b5d7808d7..1ee4296bc 100644
--- a/pkg/sentry/socket/netlink/uevent/protocol.go
+++ b/pkg/sentry/socket/netlink/uevent/protocol.go
@@ -20,7 +20,7 @@ package uevent
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	"gvisor.dev/gvisor/pkg/syserr"
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index e3d1f90cb..ab01cb4fa 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -17,10 +17,11 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
+        "//pkg/context",
         "//pkg/log",
         "//pkg/metric",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
@@ -28,11 +29,9 @@ go_library(
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/netfilter",
         "//pkg/sentry/unimpl",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
@@ -45,6 +44,7 @@ go_library(
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/tcp",
         "//pkg/tcpip/transport/udp",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 318acbeff..8619cc506 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -34,20 +34,19 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/metric"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -57,6 +56,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go
index 2d2c1ba2a..5afff2564 100644
--- a/pkg/sentry/socket/netstack/provider.go
+++ b/pkg/sentry/socket/netstack/provider.go
@@ -18,7 +18,7 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 2389a9cdb..50d9744e6 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -24,16 +24,16 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // ControlMessages represents the union of unix control messages and tcpip
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index bade18686..08743deba 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -12,23 +12,23 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/refs",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/time",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/control",
         "//pkg/sentry/socket/netstack",
         "//pkg/sentry/socket/unix/transport",
-        "//pkg/sentry/usermem",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go
index 2447f24ef..129949990 100644
--- a/pkg/sentry/socket/unix/io.go
+++ b/pkg/sentry/socket/unix/io.go
@@ -15,8 +15,8 @@
 package unix
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index 4bdfc9208..74bcd6300 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -28,9 +28,9 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/ilist",
         "//pkg/refs",
-        "//pkg/sentry/context",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/tcpip",
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index 9e6fbc111..ce5b94ee7 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -16,7 +16,7 @@ package transport
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index 0322dec0b..4b06d63ac 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -16,7 +16,7 @@ package transport
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/waiter"
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index fcc0da332..dcbafe0e5 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -19,7 +19,7 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 7f49ba864..4d30aa714 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -22,9 +22,9 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -33,10 +33,10 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/control"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index ff6fafa63..762a946fe 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -34,7 +34,7 @@ go_library(
         "//pkg/sentry/socket/netlink",
         "//pkg/sentry/socket/netstack",
         "//pkg/sentry/syscalls/linux",
-        "//pkg/sentry/usermem",
+        "//pkg/usermem",
     ],
 )
 
diff --git a/pkg/sentry/strace/poll.go b/pkg/sentry/strace/poll.go
index 5187594a7..074e80f9b 100644
--- a/pkg/sentry/strace/poll.go
+++ b/pkg/sentry/strace/poll.go
@@ -22,7 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // PollEventSet is the set of poll(2) event flags.
diff --git a/pkg/sentry/strace/select.go b/pkg/sentry/strace/select.go
index c77d418e6..3a4c32aa0 100644
--- a/pkg/sentry/strace/select.go
+++ b/pkg/sentry/strace/select.go
@@ -19,7 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func fdsFromSet(t *kernel.Task, set []byte) []int {
diff --git a/pkg/sentry/strace/signal.go b/pkg/sentry/strace/signal.go
index 5656d53eb..c41f36e3f 100644
--- a/pkg/sentry/strace/signal.go
+++ b/pkg/sentry/strace/signal.go
@@ -21,7 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // signalNames contains the names of all named signals.
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index b6d7177f4..d2079c85f 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -26,7 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // SocketFamily are the possible socket(2) families.
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 629c1f308..3fc4a47fc 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -33,7 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	pb "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto"
 	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // DefaultLogMaximumSize is the default LogMaximumSize.
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 7d74e0f70..8d6c52850 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -63,11 +63,12 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/binary",
         "//pkg/bpf",
+        "//pkg/context",
         "//pkg/log",
         "//pkg/metric",
         "//pkg/rand",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/lock",
@@ -87,16 +88,15 @@ go_library(
         "//pkg/sentry/loader",
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/control",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/syscalls",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index c76771a54..7435b50bf 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -20,8 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/syscalls"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // AMD64 is a table of Linux amd64 syscall API with the corresponding syscall
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index d3587fda6..03a39fe65 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -20,8 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/syscalls"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // ARM64 is a table of Linux arm64 syscall API with the corresponding syscall
diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go
index 333013d8c..2ddb2b146 100644
--- a/pkg/sentry/syscalls/linux/sigset.go
+++ b/pkg/sentry/syscalls/linux/sigset.go
@@ -17,8 +17,8 @@ package linux
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // copyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index f56411bfe..b401978db 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -23,8 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/eventfd"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // I/O commands.
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index 65b4a227b..5f11b496c 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -20,8 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
 	"gvisor.dev/gvisor/pkg/sentry/syscalls"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 9bc2445a5..c54735148 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -18,8 +18,8 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
@@ -28,8 +28,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/fasync"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // fileOpAt performs an operation on the second last component in the path.
diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
index bde17a767..b68261f72 100644
--- a/pkg/sentry/syscalls/linux/sys_futex.go
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -21,8 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // futexWaitRestartBlock encapsulates the state required to restart futex(2)
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index 912cbe4ff..f66f4ffde 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -23,8 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Getdents implements linux syscall getdents(2) for 64bit systems.
diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go
index f5a519d8a..ac934dc6f 100644
--- a/pkg/sentry/syscalls/linux/sys_mempolicy.go
+++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go
@@ -20,8 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // We unconditionally report a single NUMA node. This also means that our
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 58a05b5bb..9959f6e61 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -22,8 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Brk implements linux syscall brk(2).
diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go
index 8c13e2d82..eb5ff48f5 100644
--- a/pkg/sentry/syscalls/linux/sys_mount.go
+++ b/pkg/sentry/syscalls/linux/sys_mount.go
@@ -19,8 +19,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Mount implements Linux syscall mount(2).
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
index 418d7fa5f..798344042 100644
--- a/pkg/sentry/syscalls/linux/sys_pipe.go
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -20,8 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // pipe2 implements the actual system call with flags.
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index 2b2df989a..4f8762d7d 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -23,8 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go
index bc4c588bf..c0aa0fd60 100644
--- a/pkg/sentry/syscalls/linux/sys_random.go
+++ b/pkg/sentry/syscalls/linux/sys_random.go
@@ -19,11 +19,11 @@ import (
 	"math"
 
 	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const (
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index cd31e0649..f9f594190 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -23,8 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go
index 51e3f836b..e08c333d6 100644
--- a/pkg/sentry/syscalls/linux/sys_rlimit.go
+++ b/pkg/sentry/syscalls/linux/sys_rlimit.go
@@ -19,8 +19,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // rlimit describes an implementation of 'struct rlimit', which may vary from
diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go
index 18510ead8..5b7a66f4d 100644
--- a/pkg/sentry/syscalls/linux/sys_seccomp.go
+++ b/pkg/sentry/syscalls/linux/sys_seccomp.go
@@ -19,8 +19,8 @@ import (
 	"gvisor.dev/gvisor/pkg/bpf"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // userSockFprog is equivalent to Linux's struct sock_fprog on amd64.
diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go
index cde3b54e7..5f54f2456 100644
--- a/pkg/sentry/syscalls/linux/sys_sem.go
+++ b/pkg/sentry/syscalls/linux/sys_sem.go
@@ -22,8 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const opsMax = 500 // SEMOPM
diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go
index fb6efd5d8..209be2990 100644
--- a/pkg/sentry/syscalls/linux/sys_signal.go
+++ b/pkg/sentry/syscalls/linux/sys_signal.go
@@ -23,8 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/signalfd"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // "For a process to have permission to send a signal it must
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index cda517a81..2919228d0 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -26,9 +26,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/control"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // minListenBacklog is the minimum reasonable backlog for listening sockets.
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 69b17b799..c841abccb 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -19,8 +19,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Stat implements linux syscall stat(2).
diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
index 58afb4a9a..75a567bd4 100644
--- a/pkg/sentry/syscalls/linux/sys_stat_amd64.go
+++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
@@ -21,7 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // copyOutStat copies the attributes (sattr, uattr) to the struct stat at
diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
index 3e1251e0b..80c98d05c 100644
--- a/pkg/sentry/syscalls/linux/sys_stat_arm64.go
+++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
@@ -21,7 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // copyOutStat copies the attributes (sattr, uattr) to the struct stat at
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index b47c3b5c4..0c9e2255d 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -24,8 +24,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/loader"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const (
diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go
index b887fa9d7..2d2aa0819 100644
--- a/pkg/sentry/syscalls/linux/sys_time.go
+++ b/pkg/sentry/syscalls/linux/sys_time.go
@@ -22,8 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // The most significant 29 bits hold either a pid or a file descriptor.
diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go
index d4134207b..432351917 100644
--- a/pkg/sentry/syscalls/linux/sys_timer.go
+++ b/pkg/sentry/syscalls/linux/sys_timer.go
@@ -20,8 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const nsecPerSec = int64(time.Second)
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index ad4b67806..aba892939 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -23,8 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 77deb8980..efb95555c 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -21,8 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // GetXattr implements linux syscall getxattr(2).
diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go
index 4ff8f9234..ddc3ee26e 100644
--- a/pkg/sentry/syscalls/linux/timespec.go
+++ b/pkg/sentry/syscalls/linux/timespec.go
@@ -19,8 +19,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // copyTimespecIn copies a Timespec from the untrusted app range to the kernel.
diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD
index 370fa6ec5..5d4aa3a63 100644
--- a/pkg/sentry/unimpl/BUILD
+++ b/pkg/sentry/unimpl/BUILD
@@ -14,7 +14,7 @@ go_library(
     srcs = ["events.go"],
     visibility = ["//:sandbox"],
     deps = [
+        "//pkg/context",
         "//pkg/log",
-        "//pkg/sentry/context",
     ],
 )
diff --git a/pkg/sentry/unimpl/events.go b/pkg/sentry/unimpl/events.go
index 79b5de9e4..73ed9372f 100644
--- a/pkg/sentry/unimpl/events.go
+++ b/pkg/sentry/unimpl/events.go
@@ -17,8 +17,8 @@
 package unimpl
 
 import (
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 )
 
 // contextID is the events package's type for context.Context.Value keys.
diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD
index e9c18f170..7467e6398 100644
--- a/pkg/sentry/uniqueid/BUILD
+++ b/pkg/sentry/uniqueid/BUILD
@@ -7,7 +7,7 @@ go_library(
     srcs = ["context.go"],
     visibility = ["//pkg/sentry:internal"],
     deps = [
-        "//pkg/sentry/context",
+        "//pkg/context",
         "//pkg/sentry/socket/unix/transport",
     ],
 )
diff --git a/pkg/sentry/uniqueid/context.go b/pkg/sentry/uniqueid/context.go
index 4e466d66d..1fb884a90 100644
--- a/pkg/sentry/uniqueid/context.go
+++ b/pkg/sentry/uniqueid/context.go
@@ -17,7 +17,7 @@
 package uniqueid
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 )
 
diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD
deleted file mode 100644
index c8322e29e..000000000
--- a/pkg/sentry/usermem/BUILD
+++ /dev/null
@@ -1,55 +0,0 @@
-load("//tools:defs.bzl", "go_library", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
-package(licenses = ["notice"])
-
-go_template_instance(
-    name = "addr_range",
-    out = "addr_range.go",
-    package = "usermem",
-    prefix = "Addr",
-    template = "//pkg/segment:generic_range",
-    types = {
-        "T": "Addr",
-    },
-)
-
-go_library(
-    name = "usermem",
-    srcs = [
-        "access_type.go",
-        "addr.go",
-        "addr_range.go",
-        "addr_range_seq_unsafe.go",
-        "bytes_io.go",
-        "bytes_io_unsafe.go",
-        "usermem.go",
-        "usermem_arm64.go",
-        "usermem_unsafe.go",
-        "usermem_x86.go",
-    ],
-    visibility = ["//pkg/sentry:internal"],
-    deps = [
-        "//pkg/atomicbitops",
-        "//pkg/binary",
-        "//pkg/log",
-        "//pkg/sentry/context",
-        "//pkg/sentry/safemem",
-        "//pkg/syserror",
-    ],
-)
-
-go_test(
-    name = "usermem_test",
-    size = "small",
-    srcs = [
-        "addr_range_seq_test.go",
-        "usermem_test.go",
-    ],
-    library = ":usermem",
-    deps = [
-        "//pkg/sentry/context",
-        "//pkg/sentry/safemem",
-        "//pkg/syserror",
-    ],
-)
diff --git a/pkg/sentry/usermem/README.md b/pkg/sentry/usermem/README.md
deleted file mode 100644
index f6d2137eb..000000000
--- a/pkg/sentry/usermem/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-This package defines primitives for sentry access to application memory.
-
-Major types:
-
--   The `IO` interface represents a virtual address space and provides I/O
-    methods on that address space. `IO` is the lowest-level primitive. The
-    primary implementation of the `IO` interface is `mm.MemoryManager`.
-
--   `IOSequence` represents a collection of individually-contiguous address
-    ranges in a `IO` that is operated on sequentially, analogous to Linux's
-    `struct iov_iter`.
-
-Major usage patterns:
-
--   Access to a task's virtual memory, subject to the application's memory
-    protections and while running on that task's goroutine, from a context that
-    is at or above the level of the `kernel` package (e.g. most syscall
-    implementations in `syscalls/linux`); use the `kernel.Task.Copy*` wrappers
-    defined in `kernel/task_usermem.go`.
-
--   Access to a task's virtual memory, from a context that is at or above the
-    level of the `kernel` package, but where any of the above constraints does
-    not hold (e.g. `PTRACE_POKEDATA`, which ignores application memory
-    protections); obtain the task's `mm.MemoryManager` by calling
-    `kernel.Task.MemoryManager`, and call its `IO` methods directly.
-
--   Access to a task's virtual memory, from a context that is below the level of
-    the `kernel` package (e.g. filesystem I/O); clients must pass I/O arguments
-    from higher layers, usually in the form of an `IOSequence`. The
-    `kernel.Task.SingleIOSequence` and `kernel.Task.IovecsIOSequence` functions
-    in `kernel/task_usermem.go` are convenience functions for doing so.
diff --git a/pkg/sentry/usermem/access_type.go b/pkg/sentry/usermem/access_type.go
deleted file mode 100644
index 9c1742a59..000000000
--- a/pkg/sentry/usermem/access_type.go
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package usermem
-
-import (
-	"syscall"
-)
-
-// AccessType specifies memory access types. This is used for
-// setting mapping permissions, as well as communicating faults.
-//
-// +stateify savable
-type AccessType struct {
-	// Read is read access.
-	Read bool
-
-	// Write is write access.
-	Write bool
-
-	// Execute is executable access.
-	Execute bool
-}
-
-// String returns a pretty representation of access. This looks like the
-// familiar r-x, rw-, etc. and can be relied on as such.
-func (a AccessType) String() string {
-	bits := [3]byte{'-', '-', '-'}
-	if a.Read {
-		bits[0] = 'r'
-	}
-	if a.Write {
-		bits[1] = 'w'
-	}
-	if a.Execute {
-		bits[2] = 'x'
-	}
-	return string(bits[:])
-}
-
-// Any returns true iff at least one of Read, Write or Execute is true.
-func (a AccessType) Any() bool {
-	return a.Read || a.Write || a.Execute
-}
-
-// Prot returns the system prot (syscall.PROT_READ, etc.) for this access.
-func (a AccessType) Prot() int {
-	var prot int
-	if a.Read {
-		prot |= syscall.PROT_READ
-	}
-	if a.Write {
-		prot |= syscall.PROT_WRITE
-	}
-	if a.Execute {
-		prot |= syscall.PROT_EXEC
-	}
-	return prot
-}
-
-// SupersetOf returns true iff the access types in a are a superset of the
-// access types in other.
-func (a AccessType) SupersetOf(other AccessType) bool {
-	if !a.Read && other.Read {
-		return false
-	}
-	if !a.Write && other.Write {
-		return false
-	}
-	if !a.Execute && other.Execute {
-		return false
-	}
-	return true
-}
-
-// Intersect returns the access types set in both a and other.
-func (a AccessType) Intersect(other AccessType) AccessType {
-	return AccessType{
-		Read:    a.Read && other.Read,
-		Write:   a.Write && other.Write,
-		Execute: a.Execute && other.Execute,
-	}
-}
-
-// Union returns the access types set in either a or other.
-func (a AccessType) Union(other AccessType) AccessType {
-	return AccessType{
-		Read:    a.Read || other.Read,
-		Write:   a.Write || other.Write,
-		Execute: a.Execute || other.Execute,
-	}
-}
-
-// Effective returns the set of effective access types allowed by a, even if
-// some types are not explicitly allowed.
-func (a AccessType) Effective() AccessType {
-	// In Linux, Write and Execute access generally imply Read access. See
-	// mm/mmap.c:protection_map.
-	//
-	// The notable exception is get_user_pages, which only checks against
-	// the original vma flags. That said, most user memory accesses do not
-	// use GUP.
-	if a.Write || a.Execute {
-		a.Read = true
-	}
-	return a
-}
-
-// Convenient access types.
-var (
-	NoAccess  = AccessType{}
-	Read      = AccessType{Read: true}
-	Write     = AccessType{Write: true}
-	Execute   = AccessType{Execute: true}
-	ReadWrite = AccessType{Read: true, Write: true}
-	AnyAccess = AccessType{Read: true, Write: true, Execute: true}
-)
diff --git a/pkg/sentry/usermem/addr.go b/pkg/sentry/usermem/addr.go
deleted file mode 100644
index e79210804..000000000
--- a/pkg/sentry/usermem/addr.go
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package usermem
-
-import (
-	"fmt"
-)
-
-// Addr represents a generic virtual address.
-//
-// +stateify savable
-type Addr uintptr
-
-// AddLength adds the given length to start and returns the result. ok is true
-// iff adding the length did not overflow the range of Addr.
-//
-// Note: This function is usually used to get the end of an address range
-// defined by its start address and length. Since the resulting end is
-// exclusive, end == 0 is technically valid, and corresponds to a range that
-// extends to the end of the address space, but ok will be false. This isn't
-// expected to ever come up in practice.
-func (v Addr) AddLength(length uint64) (end Addr, ok bool) {
-	end = v + Addr(length)
-	// The second half of the following check is needed in case uintptr is
-	// smaller than 64 bits.
-	ok = end >= v && length <= uint64(^Addr(0))
-	return
-}
-
-// RoundDown returns the address rounded down to the nearest page boundary.
-func (v Addr) RoundDown() Addr {
-	return v & ^Addr(PageSize-1)
-}
-
-// RoundUp returns the address rounded up to the nearest page boundary. ok is
-// true iff rounding up did not wrap around.
-func (v Addr) RoundUp() (addr Addr, ok bool) {
-	addr = Addr(v + PageSize - 1).RoundDown()
-	ok = addr >= v
-	return
-}
-
-// MustRoundUp is equivalent to RoundUp, but panics if rounding up wraps
-// around.
-func (v Addr) MustRoundUp() Addr {
-	addr, ok := v.RoundUp()
-	if !ok {
-		panic(fmt.Sprintf("usermem.Addr(%d).RoundUp() wraps", v))
-	}
-	return addr
-}
-
-// HugeRoundDown returns the address rounded down to the nearest huge page
-// boundary.
-func (v Addr) HugeRoundDown() Addr {
-	return v & ^Addr(HugePageSize-1)
-}
-
-// HugeRoundUp returns the address rounded up to the nearest huge page boundary.
-// ok is true iff rounding up did not wrap around.
-func (v Addr) HugeRoundUp() (addr Addr, ok bool) {
-	addr = Addr(v + HugePageSize - 1).HugeRoundDown()
-	ok = addr >= v
-	return
-}
-
-// PageOffset returns the offset of v into the current page.
-func (v Addr) PageOffset() uint64 {
-	return uint64(v & Addr(PageSize-1))
-}
-
-// IsPageAligned returns true if v.PageOffset() == 0.
-func (v Addr) IsPageAligned() bool {
-	return v.PageOffset() == 0
-}
-
-// AddrRange is a range of Addrs.
-//
-// type AddrRange <generated by go_generics>
-
-// ToRange returns [v, v+length).
-func (v Addr) ToRange(length uint64) (AddrRange, bool) {
-	end, ok := v.AddLength(length)
-	return AddrRange{v, end}, ok
-}
-
-// IsPageAligned returns true if ar.Start.IsPageAligned() and
-// ar.End.IsPageAligned().
-func (ar AddrRange) IsPageAligned() bool {
-	return ar.Start.IsPageAligned() && ar.End.IsPageAligned()
-}
-
-// String implements fmt.Stringer.String.
-func (ar AddrRange) String() string {
-	return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End)
-}
diff --git a/pkg/sentry/usermem/addr_range_seq_test.go b/pkg/sentry/usermem/addr_range_seq_test.go
deleted file mode 100644
index 82f735026..000000000
--- a/pkg/sentry/usermem/addr_range_seq_test.go
+++ /dev/null
@@ -1,197 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package usermem
-
-import (
-	"testing"
-)
-
-var addrRangeSeqTests = []struct {
-	desc   string
-	ranges []AddrRange
-}{
-	{
-		desc: "Empty sequence",
-	},
-	{
-		desc: "Single empty AddrRange",
-		ranges: []AddrRange{
-			{0x10, 0x10},
-		},
-	},
-	{
-		desc: "Single non-empty AddrRange of length 1",
-		ranges: []AddrRange{
-			{0x10, 0x11},
-		},
-	},
-	{
-		desc: "Single non-empty AddrRange of length 2",
-		ranges: []AddrRange{
-			{0x10, 0x12},
-		},
-	},
-	{
-		desc: "Multiple non-empty AddrRanges",
-		ranges: []AddrRange{
-			{0x10, 0x11},
-			{0x20, 0x22},
-		},
-	},
-	{
-		desc: "Multiple AddrRanges including empty AddrRanges",
-		ranges: []AddrRange{
-			{0x10, 0x10},
-			{0x20, 0x20},
-			{0x30, 0x33},
-			{0x40, 0x44},
-			{0x50, 0x50},
-			{0x60, 0x60},
-			{0x70, 0x77},
-			{0x80, 0x88},
-			{0x90, 0x90},
-			{0xa0, 0xa0},
-		},
-	},
-}
-
-func testAddrRangeSeqEqualityWithTailIteration(t *testing.T, ars AddrRangeSeq, wantRanges []AddrRange) {
-	var wantLen int64
-	for _, ar := range wantRanges {
-		wantLen += int64(ar.Length())
-	}
-
-	var i int
-	for !ars.IsEmpty() {
-		if gotLen := ars.NumBytes(); gotLen != wantLen {
-			t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen)
-		}
-		if gotN, wantN := ars.NumRanges(), len(wantRanges)-i; gotN != wantN {
-			t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted %d", i, ars, gotN, wantN)
-		}
-		got := ars.Head()
-		if i >= len(wantRanges) {
-			t.Errorf("Iteration %d: %v.Head(): got %s, wanted <end of sequence>", i, ars, got)
-		} else if want := wantRanges[i]; got != want {
-			t.Errorf("Iteration %d: %v.Head(): got %s, wanted %s", i, ars, got, want)
-		}
-		ars = ars.Tail()
-		wantLen -= int64(got.Length())
-		i++
-	}
-	if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 {
-		t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen)
-	}
-	if gotN := ars.NumRanges(); gotN != 0 {
-		t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted 0", i, ars, gotN)
-	}
-}
-
-func TestAddrRangeSeqTailIteration(t *testing.T) {
-	for _, test := range addrRangeSeqTests {
-		t.Run(test.desc, func(t *testing.T) {
-			testAddrRangeSeqEqualityWithTailIteration(t, AddrRangeSeqFromSlice(test.ranges), test.ranges)
-		})
-	}
-}
-
-func TestAddrRangeSeqDropFirstEmpty(t *testing.T) {
-	var ars AddrRangeSeq
-	if got, want := ars.DropFirst(1), ars; got != want {
-		t.Errorf("%v.DropFirst(1): got %v, wanted %v", ars, got, want)
-	}
-}
-
-func TestAddrRangeSeqDropSingleByteIteration(t *testing.T) {
-	// Tests AddrRangeSeq iteration using Head/DropFirst, simulating
-	// I/O-per-AddrRange.
-	for _, test := range addrRangeSeqTests {
-		t.Run(test.desc, func(t *testing.T) {
-			// Figure out what AddrRanges we expect to see.
-			var wantLen int64
-			var wantRanges []AddrRange
-			for _, ar := range test.ranges {
-				wantLen += int64(ar.Length())
-				wantRanges = append(wantRanges, ar)
-				if ar.Length() == 0 {
-					// We "do" 0 bytes of I/O and then call DropFirst(0),
-					// advancing to the next AddrRange.
-					continue
-				}
-				// Otherwise we "do" 1 byte of I/O and then call DropFirst(1),
-				// advancing the AddrRange by 1 byte, or to the next AddrRange
-				// if this one is exhausted.
-				for ar.Start++; ar.Length() != 0; ar.Start++ {
-					wantRanges = append(wantRanges, ar)
-				}
-			}
-			t.Logf("Expected AddrRanges: %s (%d bytes)", wantRanges, wantLen)
-
-			ars := AddrRangeSeqFromSlice(test.ranges)
-			var i int
-			for !ars.IsEmpty() {
-				if gotLen := ars.NumBytes(); gotLen != wantLen {
-					t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen)
-				}
-				got := ars.Head()
-				if i >= len(wantRanges) {
-					t.Errorf("Iteration %d: %v.Head(): got %s, wanted <end of sequence>", i, ars, got)
-				} else if want := wantRanges[i]; got != want {
-					t.Errorf("Iteration %d: %v.Head(): got %s, wanted %s", i, ars, got, want)
-				}
-				if got.Length() == 0 {
-					ars = ars.DropFirst(0)
-				} else {
-					ars = ars.DropFirst(1)
-					wantLen--
-				}
-				i++
-			}
-			if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 {
-				t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen)
-			}
-		})
-	}
-}
-
-func TestAddrRangeSeqTakeFirstEmpty(t *testing.T) {
-	var ars AddrRangeSeq
-	if got, want := ars.TakeFirst(1), ars; got != want {
-		t.Errorf("%v.TakeFirst(1): got %v, wanted %v", ars, got, want)
-	}
-}
-
-func TestAddrRangeSeqTakeFirst(t *testing.T) {
-	ranges := []AddrRange{
-		{0x10, 0x11},
-		{0x20, 0x22},
-		{0x30, 0x30},
-		{0x40, 0x44},
-		{0x50, 0x55},
-		{0x60, 0x60},
-		{0x70, 0x77},
-	}
-	ars := AddrRangeSeqFromSlice(ranges).TakeFirst(5)
-	want := []AddrRange{
-		{0x10, 0x11}, // +1 byte (total 1 byte), not truncated
-		{0x20, 0x22}, // +2 bytes (total 3 bytes), not truncated
-		{0x30, 0x30}, // +0 bytes (total 3 bytes), no change
-		{0x40, 0x42}, // +2 bytes (total 5 bytes), partially truncated
-		{0x50, 0x50}, // +0 bytes (total 5 bytes), fully truncated
-		{0x60, 0x60}, // +0 bytes (total 5 bytes), "fully truncated" (no change)
-		{0x70, 0x70}, // +0 bytes (total 5 bytes), fully truncated
-	}
-	testAddrRangeSeqEqualityWithTailIteration(t, ars, want)
-}
diff --git a/pkg/sentry/usermem/addr_range_seq_unsafe.go b/pkg/sentry/usermem/addr_range_seq_unsafe.go
deleted file mode 100644
index c09337c15..000000000
--- a/pkg/sentry/usermem/addr_range_seq_unsafe.go
+++ /dev/null
@@ -1,277 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package usermem
-
-import (
-	"bytes"
-	"fmt"
-	"reflect"
-	"unsafe"
-)
-
-// An AddrRangeSeq represents a sequence of AddrRanges.
-//
-// AddrRangeSeqs are immutable and may be copied by value. The zero value of
-// AddrRangeSeq represents an empty sequence.
-//
-// An AddrRangeSeq may contain AddrRanges with a length of 0. This is necessary
-// since zero-length AddrRanges are significant to MM bounds checks.
-type AddrRangeSeq struct {
-	// If length is 0, then the AddrRangeSeq represents no AddrRanges.
-	// Invariants: data == 0; offset == 0; limit == 0.
-	//
-	// If length is 1, then the AddrRangeSeq represents the single
-	// AddrRange{offset, offset+limit}. Invariants: data == 0.
-	//
-	// Otherwise, length >= 2, and the AddrRangeSeq represents the `length`
-	// AddrRanges in the array of AddrRanges starting at address `data`,
-	// starting at `offset` bytes into the first AddrRange and limited to the
-	// following `limit` bytes. (AddrRanges after `limit` are still iterated,
-	// but are truncated to a length of 0.) Invariants: data != 0; offset <=
-	// data[0].Length(); limit > 0; offset+limit <= the combined length of all
-	// AddrRanges in the array.
-	data   unsafe.Pointer
-	length int
-	offset Addr
-	limit  Addr
-}
-
-// AddrRangeSeqOf returns an AddrRangeSeq representing the single AddrRange ar.
-func AddrRangeSeqOf(ar AddrRange) AddrRangeSeq {
-	return AddrRangeSeq{
-		length: 1,
-		offset: ar.Start,
-		limit:  ar.Length(),
-	}
-}
-
-// AddrRangeSeqFromSlice returns an AddrRangeSeq representing all AddrRanges in
-// slice.
-//
-// Whether the returned AddrRangeSeq shares memory with slice is unspecified;
-// clients should avoid mutating slices passed to AddrRangeSeqFromSlice.
-//
-// Preconditions: The combined length of all AddrRanges in slice <=
-// math.MaxInt64.
-func AddrRangeSeqFromSlice(slice []AddrRange) AddrRangeSeq {
-	var limit int64
-	for _, ar := range slice {
-		len64 := int64(ar.Length())
-		if len64 < 0 {
-			panic(fmt.Sprintf("Length of AddrRange %v overflows int64", ar))
-		}
-		sum := limit + len64
-		if sum < limit {
-			panic(fmt.Sprintf("Total length of AddrRanges %v overflows int64", slice))
-		}
-		limit = sum
-	}
-	return addrRangeSeqFromSliceLimited(slice, limit)
-}
-
-// Preconditions: The combined length of all AddrRanges in slice <= limit.
-// limit >= 0. If len(slice) != 0, then limit > 0.
-func addrRangeSeqFromSliceLimited(slice []AddrRange, limit int64) AddrRangeSeq {
-	switch len(slice) {
-	case 0:
-		return AddrRangeSeq{}
-	case 1:
-		return AddrRangeSeq{
-			length: 1,
-			offset: slice[0].Start,
-			limit:  Addr(limit),
-		}
-	default:
-		return AddrRangeSeq{
-			data:   unsafe.Pointer(&slice[0]),
-			length: len(slice),
-			limit:  Addr(limit),
-		}
-	}
-}
-
-// IsEmpty returns true if ars.NumRanges() == 0.
-//
-// Note that since AddrRangeSeq may contain AddrRanges with a length of zero,
-// an AddrRange representing 0 bytes (AddrRangeSeq.NumBytes() == 0) is not
-// necessarily empty.
-func (ars AddrRangeSeq) IsEmpty() bool {
-	return ars.length == 0
-}
-
-// NumRanges returns the number of AddrRanges in ars.
-func (ars AddrRangeSeq) NumRanges() int {
-	return ars.length
-}
-
-// NumBytes returns the number of bytes represented by ars.
-func (ars AddrRangeSeq) NumBytes() int64 {
-	return int64(ars.limit)
-}
-
-// Head returns the first AddrRange in ars.
-//
-// Preconditions: !ars.IsEmpty().
-func (ars AddrRangeSeq) Head() AddrRange {
-	if ars.length == 0 {
-		panic("empty AddrRangeSeq")
-	}
-	if ars.length == 1 {
-		return AddrRange{ars.offset, ars.offset + ars.limit}
-	}
-	ar := *(*AddrRange)(ars.data)
-	ar.Start += ars.offset
-	if ar.Length() > ars.limit {
-		ar.End = ar.Start + ars.limit
-	}
-	return ar
-}
-
-// Tail returns an AddrRangeSeq consisting of all AddrRanges in ars after the
-// first.
-//
-// Preconditions: !ars.IsEmpty().
-func (ars AddrRangeSeq) Tail() AddrRangeSeq {
-	if ars.length == 0 {
-		panic("empty AddrRangeSeq")
-	}
-	if ars.length == 1 {
-		return AddrRangeSeq{}
-	}
-	return ars.externalTail()
-}
-
-// Preconditions: ars.length >= 2.
-func (ars AddrRangeSeq) externalTail() AddrRangeSeq {
-	headLen := (*AddrRange)(ars.data).Length() - ars.offset
-	var tailLimit int64
-	if ars.limit > headLen {
-		tailLimit = int64(ars.limit - headLen)
-	}
-	var extSlice []AddrRange
-	extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice))
-	extSliceHdr.Data = uintptr(ars.data)
-	extSliceHdr.Len = ars.length
-	extSliceHdr.Cap = ars.length
-	return addrRangeSeqFromSliceLimited(extSlice[1:], tailLimit)
-}
-
-// DropFirst returns an AddrRangeSeq equivalent to ars, but with the first n
-// bytes omitted. If n > ars.NumBytes(), DropFirst returns an empty
-// AddrRangeSeq.
-//
-// If !ars.IsEmpty() and ars.Head().Length() == 0, DropFirst will always omit
-// at least ars.Head(), even if n == 0. This guarantees that the basic pattern
-// of:
-//
-//     for !ars.IsEmpty() {
-//       n, err = doIOWith(ars.Head())
-//       if err != nil {
-//         return err
-//       }
-//       ars = ars.DropFirst(n)
-//     }
-//
-// works even in the presence of zero-length AddrRanges.
-//
-// Preconditions: n >= 0.
-func (ars AddrRangeSeq) DropFirst(n int) AddrRangeSeq {
-	if n < 0 {
-		panic(fmt.Sprintf("invalid n: %d", n))
-	}
-	return ars.DropFirst64(int64(n))
-}
-
-// DropFirst64 is equivalent to DropFirst but takes an int64.
-func (ars AddrRangeSeq) DropFirst64(n int64) AddrRangeSeq {
-	if n < 0 {
-		panic(fmt.Sprintf("invalid n: %d", n))
-	}
-	if Addr(n) > ars.limit {
-		return AddrRangeSeq{}
-	}
-	// Handle initial empty AddrRange.
-	switch ars.length {
-	case 0:
-		return AddrRangeSeq{}
-	case 1:
-		if ars.limit == 0 {
-			return AddrRangeSeq{}
-		}
-	default:
-		if rawHeadLen := (*AddrRange)(ars.data).Length(); ars.offset == rawHeadLen {
-			ars = ars.externalTail()
-		}
-	}
-	for n != 0 {
-		// Calling ars.Head() here is surprisingly expensive, so inline getting
-		// the head's length.
-		var headLen Addr
-		if ars.length == 1 {
-			headLen = ars.limit
-		} else {
-			headLen = (*AddrRange)(ars.data).Length() - ars.offset
-		}
-		if Addr(n) < headLen {
-			// Dropping ends partway through the head AddrRange.
-			ars.offset += Addr(n)
-			ars.limit -= Addr(n)
-			return ars
-		}
-		n -= int64(headLen)
-		ars = ars.Tail()
-	}
-	return ars
-}
-
-// TakeFirst returns an AddrRangeSeq equivalent to ars, but iterating at most n
-// bytes. TakeFirst never removes AddrRanges from ars; AddrRanges beyond the
-// first n bytes are reduced to a length of zero, but will still be iterated.
-//
-// Preconditions: n >= 0.
-func (ars AddrRangeSeq) TakeFirst(n int) AddrRangeSeq {
-	if n < 0 {
-		panic(fmt.Sprintf("invalid n: %d", n))
-	}
-	return ars.TakeFirst64(int64(n))
-}
-
-// TakeFirst64 is equivalent to TakeFirst but takes an int64.
-func (ars AddrRangeSeq) TakeFirst64(n int64) AddrRangeSeq {
-	if n < 0 {
-		panic(fmt.Sprintf("invalid n: %d", n))
-	}
-	if ars.limit > Addr(n) {
-		ars.limit = Addr(n)
-	}
-	return ars
-}
-
-// String implements fmt.Stringer.String.
-func (ars AddrRangeSeq) String() string {
-	// This is deliberately chosen to be the same as fmt's automatic stringer
-	// for []AddrRange.
-	var buf bytes.Buffer
-	buf.WriteByte('[')
-	var sep string
-	for !ars.IsEmpty() {
-		buf.WriteString(sep)
-		sep = " "
-		buf.WriteString(ars.Head().String())
-		ars = ars.Tail()
-	}
-	buf.WriteByte(']')
-	return buf.String()
-}
diff --git a/pkg/sentry/usermem/bytes_io.go b/pkg/sentry/usermem/bytes_io.go
deleted file mode 100644
index 7898851b3..000000000
--- a/pkg/sentry/usermem/bytes_io.go
+++ /dev/null
@@ -1,141 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package usermem
-
-import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-const maxInt = int(^uint(0) >> 1)
-
-// BytesIO implements IO using a byte slice. Addresses are interpreted as
-// offsets into the slice. Reads and writes beyond the end of the slice return
-// EFAULT.
-type BytesIO struct {
-	Bytes []byte
-}
-
-// CopyOut implements IO.CopyOut.
-func (b *BytesIO) CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error) {
-	rngN, rngErr := b.rangeCheck(addr, len(src))
-	if rngN == 0 {
-		return 0, rngErr
-	}
-	return copy(b.Bytes[int(addr):], src[:rngN]), rngErr
-}
-
-// CopyIn implements IO.CopyIn.
-func (b *BytesIO) CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error) {
-	rngN, rngErr := b.rangeCheck(addr, len(dst))
-	if rngN == 0 {
-		return 0, rngErr
-	}
-	return copy(dst[:rngN], b.Bytes[int(addr):]), rngErr
-}
-
-// ZeroOut implements IO.ZeroOut.
-func (b *BytesIO) ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error) {
-	if toZero > int64(maxInt) {
-		return 0, syserror.EINVAL
-	}
-	rngN, rngErr := b.rangeCheck(addr, int(toZero))
-	if rngN == 0 {
-		return 0, rngErr
-	}
-	zeroSlice := b.Bytes[int(addr) : int(addr)+rngN]
-	for i := range zeroSlice {
-		zeroSlice[i] = 0
-	}
-	return int64(rngN), rngErr
-}
-
-// CopyOutFrom implements IO.CopyOutFrom.
-func (b *BytesIO) CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) {
-	dsts, rngErr := b.blocksFromAddrRanges(ars)
-	n, err := src.ReadToBlocks(dsts)
-	if err != nil {
-		return int64(n), err
-	}
-	return int64(n), rngErr
-}
-
-// CopyInTo implements IO.CopyInTo.
-func (b *BytesIO) CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) {
-	srcs, rngErr := b.blocksFromAddrRanges(ars)
-	n, err := dst.WriteFromBlocks(srcs)
-	if err != nil {
-		return int64(n), err
-	}
-	return int64(n), rngErr
-}
-
-func (b *BytesIO) rangeCheck(addr Addr, length int) (int, error) {
-	if length == 0 {
-		return 0, nil
-	}
-	if length < 0 {
-		return 0, syserror.EINVAL
-	}
-	max := Addr(len(b.Bytes))
-	if addr >= max {
-		return 0, syserror.EFAULT
-	}
-	end, ok := addr.AddLength(uint64(length))
-	if !ok || end > max {
-		return int(max - addr), syserror.EFAULT
-	}
-	return length, nil
-}
-
-func (b *BytesIO) blocksFromAddrRanges(ars AddrRangeSeq) (safemem.BlockSeq, error) {
-	switch ars.NumRanges() {
-	case 0:
-		return safemem.BlockSeq{}, nil
-	case 1:
-		block, err := b.blockFromAddrRange(ars.Head())
-		return safemem.BlockSeqOf(block), err
-	default:
-		blocks := make([]safemem.Block, 0, ars.NumRanges())
-		for !ars.IsEmpty() {
-			block, err := b.blockFromAddrRange(ars.Head())
-			if block.Len() != 0 {
-				blocks = append(blocks, block)
-			}
-			if err != nil {
-				return safemem.BlockSeqFromSlice(blocks), err
-			}
-			ars = ars.Tail()
-		}
-		return safemem.BlockSeqFromSlice(blocks), nil
-	}
-}
-
-func (b *BytesIO) blockFromAddrRange(ar AddrRange) (safemem.Block, error) {
-	n, err := b.rangeCheck(ar.Start, int(ar.Length()))
-	if n == 0 {
-		return safemem.Block{}, err
-	}
-	return safemem.BlockFromSafeSlice(b.Bytes[int(ar.Start) : int(ar.Start)+n]), err
-}
-
-// BytesIOSequence returns an IOSequence representing the given byte slice.
-func BytesIOSequence(buf []byte) IOSequence {
-	return IOSequence{
-		IO:    &BytesIO{buf},
-		Addrs: AddrRangeSeqOf(AddrRange{0, Addr(len(buf))}),
-	}
-}
diff --git a/pkg/sentry/usermem/bytes_io_unsafe.go b/pkg/sentry/usermem/bytes_io_unsafe.go
deleted file mode 100644
index fca5952f4..000000000
--- a/pkg/sentry/usermem/bytes_io_unsafe.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package usermem
-
-import (
-	"sync/atomic"
-	"unsafe"
-
-	"gvisor.dev/gvisor/pkg/atomicbitops"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-)
-
-// SwapUint32 implements IO.SwapUint32.
-func (b *BytesIO) SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error) {
-	if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil {
-		return 0, rngErr
-	}
-	return atomic.SwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), new), nil
-}
-
-// CompareAndSwapUint32 implements IO.CompareAndSwapUint32.
-func (b *BytesIO) CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) {
-	if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil {
-		return 0, rngErr
-	}
-	return atomicbitops.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), old, new), nil
-}
-
-// LoadUint32 implements IO.LoadUint32.
-func (b *BytesIO) LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) {
-	if _, err := b.rangeCheck(addr, 4); err != nil {
-		return 0, err
-	}
-	return atomic.LoadUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)]))), nil
-}
diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go
deleted file mode 100644
index 7b1f312b1..000000000
--- a/pkg/sentry/usermem/usermem.go
+++ /dev/null
@@ -1,597 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package usermem governs access to user memory.
-package usermem
-
-import (
-	"bytes"
-	"errors"
-	"io"
-	"strconv"
-
-	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// IO provides access to the contents of a virtual memory space.
-//
-// FIXME(b/38173783): Implementations of IO cannot expect ctx to contain any
-// meaningful data.
-type IO interface {
-	// CopyOut copies len(src) bytes from src to the memory mapped at addr. It
-	// returns the number of bytes copied. If the number of bytes copied is <
-	// len(src), it returns a non-nil error explaining why.
-	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order.
-	//
-	// Postconditions: CopyOut does not retain src.
-	CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error)
-
-	// CopyIn copies len(dst) bytes from the memory mapped at addr to dst.
-	// It returns the number of bytes copied. If the number of bytes copied is
-	// < len(dst), it returns a non-nil error explaining why.
-	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order.
-	//
-	// Postconditions: CopyIn does not retain dst.
-	CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error)
-
-	// ZeroOut sets toZero bytes to 0, starting at addr. It returns the number
-	// of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a
-	// non-nil error explaining why.
-	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. toZero >= 0.
-	ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error)
-
-	// CopyOutFrom copies ars.NumBytes() bytes from src to the memory mapped at
-	// ars. It returns the number of bytes copied, which may be less than the
-	// number of bytes read from src if copying fails. CopyOutFrom may return a
-	// partial copy without an error iff src.ReadToBlocks returns a partial
-	// read without an error.
-	//
-	// CopyOutFrom calls src.ReadToBlocks at most once.
-	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. src.ReadToBlocks must not block
-	// on mm.MemoryManager.activeMu or any preceding locks in the lock order.
-	CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error)
-
-	// CopyInTo copies ars.NumBytes() bytes from the memory mapped at ars to
-	// dst. It returns the number of bytes copied. CopyInTo may return a
-	// partial copy without an error iff dst.WriteFromBlocks returns a partial
-	// write without an error.
-	//
-	// CopyInTo calls dst.WriteFromBlocks at most once.
-	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. dst.WriteFromBlocks must not
-	// block on mm.MemoryManager.activeMu or any preceding locks in the lock
-	// order.
-	CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error)
-
-	// TODO(jamieliu): The requirement that CopyOutFrom/CopyInTo call src/dst
-	// at most once, which is unnecessary in most cases, forces implementations
-	// to gather safemem.Blocks into a single slice to pass to src/dst. Add
-	// CopyOutFromIter/CopyInToIter, which relaxes this restriction, to avoid
-	// this allocation.
-
-	// SwapUint32 atomically sets the uint32 value at addr to new and
-	// returns the previous value.
-	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. addr must be aligned to a 4-byte
-	// boundary.
-	SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error)
-
-	// CompareAndSwapUint32 atomically compares the uint32 value at addr to
-	// old; if they are equal, the value in memory is replaced by new. In
-	// either case, the previous value stored in memory is returned.
-	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. addr must be aligned to a 4-byte
-	// boundary.
-	CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error)
-
-	// LoadUint32 atomically loads the uint32 value at addr and returns it.
-	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. addr must be aligned to a 4-byte
-	// boundary.
-	LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error)
-}
-
-// IOOpts contains options applicable to all IO methods.
-type IOOpts struct {
-	// If IgnorePermissions is true, application-defined memory protections set
-	// by mmap(2) or mprotect(2) will be ignored. (Memory protections required
-	// by the target of the mapping are never ignored.)
-	IgnorePermissions bool
-
-	// If AddressSpaceActive is true, the IO implementation may assume that it
-	// has an active AddressSpace and can therefore use AddressSpace copying
-	// without performing activation. See mm/io.go for details.
-	AddressSpaceActive bool
-}
-
-// IOReadWriter is an io.ReadWriter that reads from / writes to addresses
-// starting at addr in IO. The preconditions that apply to IO.CopyIn and
-// IO.CopyOut also apply to IOReadWriter.Read and IOReadWriter.Write
-// respectively.
-type IOReadWriter struct {
-	Ctx  context.Context
-	IO   IO
-	Addr Addr
-	Opts IOOpts
-}
-
-// Read implements io.Reader.Read.
-//
-// Note that an address space does not have an "end of file", so Read can only
-// return io.EOF if IO.CopyIn returns io.EOF. Attempts to read unmapped or
-// unreadable memory, or beyond the end of the address space, should return
-// EFAULT.
-func (rw *IOReadWriter) Read(dst []byte) (int, error) {
-	n, err := rw.IO.CopyIn(rw.Ctx, rw.Addr, dst, rw.Opts)
-	end, ok := rw.Addr.AddLength(uint64(n))
-	if ok {
-		rw.Addr = end
-	} else {
-		// Disallow wraparound.
-		rw.Addr = ^Addr(0)
-		if err != nil {
-			err = syserror.EFAULT
-		}
-	}
-	return n, err
-}
-
-// Writer implements io.Writer.Write.
-func (rw *IOReadWriter) Write(src []byte) (int, error) {
-	n, err := rw.IO.CopyOut(rw.Ctx, rw.Addr, src, rw.Opts)
-	end, ok := rw.Addr.AddLength(uint64(n))
-	if ok {
-		rw.Addr = end
-	} else {
-		// Disallow wraparound.
-		rw.Addr = ^Addr(0)
-		if err != nil {
-			err = syserror.EFAULT
-		}
-	}
-	return n, err
-}
-
-// CopyObjectOut copies a fixed-size value or slice of fixed-size values from
-// src to the memory mapped at addr in uio. It returns the number of bytes
-// copied.
-//
-// CopyObjectOut must use reflection to encode src; performance-sensitive
-// clients should do encoding manually and use uio.CopyOut directly.
-//
-// Preconditions: As for IO.CopyOut.
-func CopyObjectOut(ctx context.Context, uio IO, addr Addr, src interface{}, opts IOOpts) (int, error) {
-	w := &IOReadWriter{
-		Ctx:  ctx,
-		IO:   uio,
-		Addr: addr,
-		Opts: opts,
-	}
-	// Allocate a byte slice the size of the object being marshaled. This
-	// adds an extra reflection call, but avoids needing to grow the slice
-	// during encoding, which can result in many heap-allocated slices.
-	b := make([]byte, 0, binary.Size(src))
-	return w.Write(binary.Marshal(b, ByteOrder, src))
-}
-
-// CopyObjectIn copies a fixed-size value or slice of fixed-size values from
-// the memory mapped at addr in uio to dst. It returns the number of bytes
-// copied.
-//
-// CopyObjectIn must use reflection to decode dst; performance-sensitive
-// clients should use uio.CopyIn directly and do decoding manually.
-//
-// Preconditions: As for IO.CopyIn.
-func CopyObjectIn(ctx context.Context, uio IO, addr Addr, dst interface{}, opts IOOpts) (int, error) {
-	r := &IOReadWriter{
-		Ctx:  ctx,
-		IO:   uio,
-		Addr: addr,
-		Opts: opts,
-	}
-	buf := make([]byte, binary.Size(dst))
-	if _, err := io.ReadFull(r, buf); err != nil {
-		return 0, err
-	}
-	binary.Unmarshal(buf, ByteOrder, dst)
-	return int(r.Addr - addr), nil
-}
-
-// CopyStringIn tuning parameters, defined outside that function for tests.
-const (
-	copyStringIncrement     = 64
-	copyStringMaxInitBufLen = 256
-)
-
-// CopyStringIn copies a NUL-terminated string of unknown length from the
-// memory mapped at addr in uio and returns it as a string (not including the
-// trailing NUL). If the length of the string, including the terminating NUL,
-// would exceed maxlen, CopyStringIn returns the string truncated to maxlen and
-// ENAMETOOLONG.
-//
-// Preconditions: As for IO.CopyFromUser. maxlen >= 0.
-func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpts) (string, error) {
-	initLen := maxlen
-	if initLen > copyStringMaxInitBufLen {
-		initLen = copyStringMaxInitBufLen
-	}
-	buf := make([]byte, initLen)
-	var done int
-	for done < maxlen {
-		// Read up to copyStringIncrement bytes at a time.
-		readlen := copyStringIncrement
-		if readlen > maxlen-done {
-			readlen = maxlen - done
-		}
-		end, ok := addr.AddLength(uint64(readlen))
-		if !ok {
-			return stringFromImmutableBytes(buf[:done]), syserror.EFAULT
-		}
-		// Shorten the read to avoid crossing page boundaries, since faulting
-		// in a page unnecessarily is expensive. This also ensures that partial
-		// copies up to the end of application-mappable memory succeed.
-		if addr.RoundDown() != end.RoundDown() {
-			end = end.RoundDown()
-			readlen = int(end - addr)
-		}
-		// Ensure that our buffer is large enough to accommodate the read.
-		if done+readlen > len(buf) {
-			newBufLen := len(buf) * 2
-			if newBufLen > maxlen {
-				newBufLen = maxlen
-			}
-			buf = append(buf, make([]byte, newBufLen-len(buf))...)
-		}
-		n, err := uio.CopyIn(ctx, addr, buf[done:done+readlen], opts)
-		// Look for the terminating zero byte, which may have occurred before
-		// hitting err.
-		if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 {
-			return stringFromImmutableBytes(buf[:done+i]), nil
-		}
-
-		done += n
-		if err != nil {
-			return stringFromImmutableBytes(buf[:done]), err
-		}
-		addr = end
-	}
-	return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG
-}
-
-// CopyOutVec copies bytes from src to the memory mapped at ars in uio. The
-// maximum number of bytes copied is ars.NumBytes() or len(src), whichever is
-// less. CopyOutVec returns the number of bytes copied; if this is less than
-// the maximum, it returns a non-nil error explaining why.
-//
-// Preconditions: As for IO.CopyOut.
-func CopyOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, src []byte, opts IOOpts) (int, error) {
-	var done int
-	for !ars.IsEmpty() && done < len(src) {
-		ar := ars.Head()
-		cplen := len(src) - done
-		if Addr(cplen) >= ar.Length() {
-			cplen = int(ar.Length())
-		}
-		n, err := uio.CopyOut(ctx, ar.Start, src[done:done+cplen], opts)
-		done += n
-		if err != nil {
-			return done, err
-		}
-		ars = ars.DropFirst(n)
-	}
-	return done, nil
-}
-
-// CopyInVec copies bytes from the memory mapped at ars in uio to dst. The
-// maximum number of bytes copied is ars.NumBytes() or len(dst), whichever is
-// less. CopyInVec returns the number of bytes copied; if this is less than the
-// maximum, it returns a non-nil error explaining why.
-//
-// Preconditions: As for IO.CopyIn.
-func CopyInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst []byte, opts IOOpts) (int, error) {
-	var done int
-	for !ars.IsEmpty() && done < len(dst) {
-		ar := ars.Head()
-		cplen := len(dst) - done
-		if Addr(cplen) >= ar.Length() {
-			cplen = int(ar.Length())
-		}
-		n, err := uio.CopyIn(ctx, ar.Start, dst[done:done+cplen], opts)
-		done += n
-		if err != nil {
-			return done, err
-		}
-		ars = ars.DropFirst(n)
-	}
-	return done, nil
-}
-
-// ZeroOutVec writes zeroes to the memory mapped at ars in uio. The maximum
-// number of bytes written is ars.NumBytes() or toZero, whichever is less.
-// ZeroOutVec returns the number of bytes written; if this is less than the
-// maximum, it returns a non-nil error explaining why.
-//
-// Preconditions: As for IO.ZeroOut.
-func ZeroOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, toZero int64, opts IOOpts) (int64, error) {
-	var done int64
-	for !ars.IsEmpty() && done < toZero {
-		ar := ars.Head()
-		cplen := toZero - done
-		if Addr(cplen) >= ar.Length() {
-			cplen = int64(ar.Length())
-		}
-		n, err := uio.ZeroOut(ctx, ar.Start, cplen, opts)
-		done += n
-		if err != nil {
-			return done, err
-		}
-		ars = ars.DropFirst64(n)
-	}
-	return done, nil
-}
-
-func isASCIIWhitespace(b byte) bool {
-	// Compare Linux include/linux/ctype.h, lib/ctype.c.
-	//  9 => horizontal tab '\t'
-	// 10 => line feed '\n'
-	// 11 => vertical tab '\v'
-	// 12 => form feed '\c'
-	// 13 => carriage return '\r'
-	return b == ' ' || (b >= 9 && b <= 13)
-}
-
-// CopyInt32StringsInVec copies up to len(dsts) whitespace-separated decimal
-// strings from the memory mapped at ars in uio and converts them to int32
-// values in dsts. It returns the number of bytes read.
-//
-// CopyInt32StringsInVec shares the following properties with Linux's
-// kernel/sysctl.c:proc_dointvec(write=1):
-//
-// - If any read value overflows the range of int32, or any invalid characters
-// are encountered during the read, CopyInt32StringsInVec returns EINVAL.
-//
-// - If, upon reaching the end of ars, fewer than len(dsts) values have been
-// read, CopyInt32StringsInVec returns no error if at least 1 value was read
-// and EINVAL otherwise.
-//
-// - Trailing whitespace after the last successfully read value is counted in
-// the number of bytes read.
-//
-// Unlike proc_dointvec():
-//
-// - CopyInt32StringsInVec does not implicitly limit ars.NumBytes() to
-// PageSize-1; callers that require this must do so explicitly.
-//
-// - CopyInt32StringsInVec returns EINVAL if ars.NumBytes() == 0.
-//
-// Preconditions: As for CopyInVec.
-func CopyInt32StringsInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dsts []int32, opts IOOpts) (int64, error) {
-	if len(dsts) == 0 {
-		return 0, nil
-	}
-
-	buf := make([]byte, ars.NumBytes())
-	n, cperr := CopyInVec(ctx, uio, ars, buf, opts)
-	buf = buf[:n]
-
-	var i, j int
-	for ; j < len(dsts); j++ {
-		// Skip leading whitespace.
-		for i < len(buf) && isASCIIWhitespace(buf[i]) {
-			i++
-		}
-		if i == len(buf) {
-			break
-		}
-
-		// Find the end of the value to be parsed (next whitespace or end of string).
-		nextI := i + 1
-		for nextI < len(buf) && !isASCIIWhitespace(buf[nextI]) {
-			nextI++
-		}
-
-		// Parse a single value.
-		val, err := strconv.ParseInt(string(buf[i:nextI]), 10, 32)
-		if err != nil {
-			return int64(i), syserror.EINVAL
-		}
-		dsts[j] = int32(val)
-
-		i = nextI
-	}
-
-	// Skip trailing whitespace.
-	for i < len(buf) && isASCIIWhitespace(buf[i]) {
-		i++
-	}
-
-	if cperr != nil {
-		return int64(i), cperr
-	}
-	if j == 0 {
-		return int64(i), syserror.EINVAL
-	}
-	return int64(i), nil
-}
-
-// CopyInt32StringInVec is equivalent to CopyInt32StringsInVec, but copies at
-// most one int32.
-func CopyInt32StringInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst *int32, opts IOOpts) (int64, error) {
-	dsts := [1]int32{*dst}
-	n, err := CopyInt32StringsInVec(ctx, uio, ars, dsts[:], opts)
-	*dst = dsts[0]
-	return n, err
-}
-
-// IOSequence holds arguments to IO methods.
-type IOSequence struct {
-	IO    IO
-	Addrs AddrRangeSeq
-	Opts  IOOpts
-}
-
-// NumBytes returns s.Addrs.NumBytes().
-//
-// Note that NumBytes() may return 0 even if !s.Addrs.IsEmpty(), since
-// s.Addrs may contain a non-zero number of zero-length AddrRanges.
-// Many clients of
-// IOSequence currently do something like:
-//
-//     if ioseq.NumBytes() == 0 {
-//       return 0, nil
-//     }
-//     if f.availableBytes == 0 {
-//       return 0, syserror.ErrWouldBlock
-//     }
-//     return ioseq.CopyOutFrom(..., reader)
-//
-// In such cases, using s.Addrs.IsEmpty() will cause them to have the wrong
-// behavior for zero-length I/O. However, using s.NumBytes() == 0 instead means
-// that we will return success for zero-length I/O in cases where Linux would
-// return EFAULT due to a failed access_ok() check, so in the long term we
-// should move checks for ErrWouldBlock etc. into the body of
-// reader.ReadToBlocks and use s.Addrs.IsEmpty() instead.
-func (s IOSequence) NumBytes() int64 {
-	return s.Addrs.NumBytes()
-}
-
-// DropFirst returns a copy of s with s.Addrs.DropFirst(n).
-//
-// Preconditions: As for AddrRangeSeq.DropFirst.
-func (s IOSequence) DropFirst(n int) IOSequence {
-	return IOSequence{s.IO, s.Addrs.DropFirst(n), s.Opts}
-}
-
-// DropFirst64 returns a copy of s with s.Addrs.DropFirst64(n).
-//
-// Preconditions: As for AddrRangeSeq.DropFirst64.
-func (s IOSequence) DropFirst64(n int64) IOSequence {
-	return IOSequence{s.IO, s.Addrs.DropFirst64(n), s.Opts}
-}
-
-// TakeFirst returns a copy of s with s.Addrs.TakeFirst(n).
-//
-// Preconditions: As for AddrRangeSeq.TakeFirst.
-func (s IOSequence) TakeFirst(n int) IOSequence {
-	return IOSequence{s.IO, s.Addrs.TakeFirst(n), s.Opts}
-}
-
-// TakeFirst64 returns a copy of s with s.Addrs.TakeFirst64(n).
-//
-// Preconditions: As for AddrRangeSeq.TakeFirst64.
-func (s IOSequence) TakeFirst64(n int64) IOSequence {
-	return IOSequence{s.IO, s.Addrs.TakeFirst64(n), s.Opts}
-}
-
-// CopyOut invokes CopyOutVec over s.Addrs.
-//
-// As with CopyOutVec, if s.NumBytes() < len(src), the copy will be truncated
-// to s.NumBytes(), and a nil error will be returned.
-//
-// Preconditions: As for CopyOutVec.
-func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) {
-	return CopyOutVec(ctx, s.IO, s.Addrs, src, s.Opts)
-}
-
-// CopyIn invokes CopyInVec over s.Addrs.
-//
-// As with CopyInVec, if s.NumBytes() < len(dst), the copy will be truncated to
-// s.NumBytes(), and a nil error will be returned.
-//
-// Preconditions: As for CopyInVec.
-func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) {
-	return CopyInVec(ctx, s.IO, s.Addrs, dst, s.Opts)
-}
-
-// ZeroOut invokes ZeroOutVec over s.Addrs.
-//
-// As with ZeroOutVec, if s.NumBytes() < toZero, the write will be truncated
-// to s.NumBytes(), and a nil error will be returned.
-//
-// Preconditions: As for ZeroOutVec.
-func (s IOSequence) ZeroOut(ctx context.Context, toZero int64) (int64, error) {
-	return ZeroOutVec(ctx, s.IO, s.Addrs, toZero, s.Opts)
-}
-
-// CopyOutFrom invokes s.CopyOutFrom over s.Addrs.
-//
-// Preconditions: As for IO.CopyOutFrom.
-func (s IOSequence) CopyOutFrom(ctx context.Context, src safemem.Reader) (int64, error) {
-	return s.IO.CopyOutFrom(ctx, s.Addrs, src, s.Opts)
-}
-
-// CopyInTo invokes s.CopyInTo over s.Addrs.
-//
-// Preconditions: As for IO.CopyInTo.
-func (s IOSequence) CopyInTo(ctx context.Context, dst safemem.Writer) (int64, error) {
-	return s.IO.CopyInTo(ctx, s.Addrs, dst, s.Opts)
-}
-
-// Reader returns an io.Reader that reads from s. Reads beyond the end of s
-// return io.EOF. The preconditions that apply to s.CopyIn also apply to the
-// returned io.Reader.Read.
-func (s IOSequence) Reader(ctx context.Context) io.Reader {
-	return &ioSequenceReadWriter{ctx, s}
-}
-
-// Writer returns an io.Writer that writes to s. Writes beyond the end of s
-// return ErrEndOfIOSequence. The preconditions that apply to s.CopyOut also
-// apply to the returned io.Writer.Write.
-func (s IOSequence) Writer(ctx context.Context) io.Writer {
-	return &ioSequenceReadWriter{ctx, s}
-}
-
-// ErrEndOfIOSequence is returned by IOSequence.Writer().Write() when
-// attempting to write beyond the end of the IOSequence.
-var ErrEndOfIOSequence = errors.New("write beyond end of IOSequence")
-
-type ioSequenceReadWriter struct {
-	ctx context.Context
-	s   IOSequence
-}
-
-// Read implements io.Reader.Read.
-func (rw *ioSequenceReadWriter) Read(dst []byte) (int, error) {
-	n, err := rw.s.CopyIn(rw.ctx, dst)
-	rw.s = rw.s.DropFirst(n)
-	if err == nil && rw.s.NumBytes() == 0 {
-		err = io.EOF
-	}
-	return n, err
-}
-
-// Write implements io.Writer.Write.
-func (rw *ioSequenceReadWriter) Write(src []byte) (int, error) {
-	n, err := rw.s.CopyOut(rw.ctx, src)
-	rw.s = rw.s.DropFirst(n)
-	if err == nil && n < len(src) {
-		err = ErrEndOfIOSequence
-	}
-	return n, err
-}
diff --git a/pkg/sentry/usermem/usermem_arm64.go b/pkg/sentry/usermem/usermem_arm64.go
deleted file mode 100644
index fdfc30a66..000000000
--- a/pkg/sentry/usermem/usermem_arm64.go
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build arm64
-
-package usermem
-
-import (
-	"encoding/binary"
-	"syscall"
-)
-
-const (
-	// PageSize is the system page size.
-	// arm64 support 4K/16K/64K page size,
-	// which can be get by syscall.Getpagesize().
-	// Currently, only 4K page size is supported.
-	PageSize = 1 << PageShift
-
-	// HugePageSize is the system huge page size.
-	HugePageSize = 1 << HugePageShift
-
-	// PageShift is the binary log of the system page size.
-	PageShift = 12
-
-	// HugePageShift is the binary log of the system huge page size.
-	// Should be calculated by "PageShift + (PageShift - 3)"
-	// when multiple page size support is ready.
-	HugePageShift = 21
-)
-
-var (
-	// ByteOrder is the native byte order (little endian).
-	ByteOrder = binary.LittleEndian
-)
-
-func init() {
-	// Make sure the page size is 4K on arm64 platform.
-	if size := syscall.Getpagesize(); size != PageSize {
-		panic("Only 4K page size is supported on arm64!")
-	}
-}
diff --git a/pkg/sentry/usermem/usermem_test.go b/pkg/sentry/usermem/usermem_test.go
deleted file mode 100644
index 299f64754..000000000
--- a/pkg/sentry/usermem/usermem_test.go
+++ /dev/null
@@ -1,424 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package usermem
-
-import (
-	"bytes"
-	"encoding/binary"
-	"fmt"
-	"reflect"
-	"strings"
-	"testing"
-
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// newContext returns a context.Context that we can use in these tests (we
-// can't use contexttest because it depends on usermem).
-func newContext() context.Context {
-	return context.Background()
-}
-
-func newBytesIOString(s string) *BytesIO {
-	return &BytesIO{[]byte(s)}
-}
-
-func TestBytesIOCopyOutSuccess(t *testing.T) {
-	b := newBytesIOString("ABCDE")
-	n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{})
-	if wantN := 3; n != wantN || err != nil {
-		t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if got, want := b.Bytes, []byte("AfooE"); !bytes.Equal(got, want) {
-		t.Errorf("Bytes: got %q, wanted %q", got, want)
-	}
-}
-
-func TestBytesIOCopyOutFailure(t *testing.T) {
-	b := newBytesIOString("ABC")
-	n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{})
-	if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr {
-		t.Errorf("CopyOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr)
-	}
-	if got, want := b.Bytes, []byte("Afo"); !bytes.Equal(got, want) {
-		t.Errorf("Bytes: got %q, wanted %q", got, want)
-	}
-}
-
-func TestBytesIOCopyInSuccess(t *testing.T) {
-	b := newBytesIOString("AfooE")
-	var dst [3]byte
-	n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{})
-	if wantN := 3; n != wantN || err != nil {
-		t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) {
-		t.Errorf("dst: got %q, wanted %q", got, want)
-	}
-}
-
-func TestBytesIOCopyInFailure(t *testing.T) {
-	b := newBytesIOString("Afo")
-	var dst [3]byte
-	n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{})
-	if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr {
-		t.Errorf("CopyIn: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr)
-	}
-	if got, want := dst[:], []byte("fo\x00"); !bytes.Equal(got, want) {
-		t.Errorf("dst: got %q, wanted %q", got, want)
-	}
-}
-
-func TestBytesIOZeroOutSuccess(t *testing.T) {
-	b := newBytesIOString("ABCD")
-	n, err := b.ZeroOut(newContext(), 1, 2, IOOpts{})
-	if wantN := int64(2); n != wantN || err != nil {
-		t.Errorf("ZeroOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if got, want := b.Bytes, []byte("A\x00\x00D"); !bytes.Equal(got, want) {
-		t.Errorf("Bytes: got %q, wanted %q", got, want)
-	}
-}
-
-func TestBytesIOZeroOutFailure(t *testing.T) {
-	b := newBytesIOString("ABC")
-	n, err := b.ZeroOut(newContext(), 1, 3, IOOpts{})
-	if wantN, wantErr := int64(2), syserror.EFAULT; n != wantN || err != wantErr {
-		t.Errorf("ZeroOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr)
-	}
-	if got, want := b.Bytes, []byte("A\x00\x00"); !bytes.Equal(got, want) {
-		t.Errorf("Bytes: got %q, wanted %q", got, want)
-	}
-}
-
-func TestBytesIOCopyOutFromSuccess(t *testing.T) {
-	b := newBytesIOString("ABCDEFGH")
-	n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{
-		{Start: 4, End: 7},
-		{Start: 1, End: 4},
-	}), safemem.FromIOReader{bytes.NewBufferString("barfoo")}, IOOpts{})
-	if wantN := int64(6); n != wantN || err != nil {
-		t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if got, want := b.Bytes, []byte("AfoobarH"); !bytes.Equal(got, want) {
-		t.Errorf("Bytes: got %q, wanted %q", got, want)
-	}
-}
-
-func TestBytesIOCopyOutFromFailure(t *testing.T) {
-	b := newBytesIOString("ABCDE")
-	n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{
-		{Start: 1, End: 4},
-		{Start: 4, End: 7},
-	}), safemem.FromIOReader{bytes.NewBufferString("foobar")}, IOOpts{})
-	if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr {
-		t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr)
-	}
-	if got, want := b.Bytes, []byte("Afoob"); !bytes.Equal(got, want) {
-		t.Errorf("Bytes: got %q, wanted %q", got, want)
-	}
-}
-
-func TestBytesIOCopyInToSuccess(t *testing.T) {
-	b := newBytesIOString("AfoobarH")
-	var dst bytes.Buffer
-	n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{
-		{Start: 4, End: 7},
-		{Start: 1, End: 4},
-	}), safemem.FromIOWriter{&dst}, IOOpts{})
-	if wantN := int64(6); n != wantN || err != nil {
-		t.Errorf("CopyInTo: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if got, want := dst.Bytes(), []byte("barfoo"); !bytes.Equal(got, want) {
-		t.Errorf("dst.Bytes(): got %q, wanted %q", got, want)
-	}
-}
-
-func TestBytesIOCopyInToFailure(t *testing.T) {
-	b := newBytesIOString("Afoob")
-	var dst bytes.Buffer
-	n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{
-		{Start: 1, End: 4},
-		{Start: 4, End: 7},
-	}), safemem.FromIOWriter{&dst}, IOOpts{})
-	if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr {
-		t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr)
-	}
-	if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) {
-		t.Errorf("dst.Bytes(): got %q, wanted %q", got, want)
-	}
-}
-
-type testStruct struct {
-	Int8   int8
-	Uint8  uint8
-	Int16  int16
-	Uint16 uint16
-	Int32  int32
-	Uint32 uint32
-	Int64  int64
-	Uint64 uint64
-}
-
-func TestCopyObject(t *testing.T) {
-	wantObj := testStruct{1, 2, 3, 4, 5, 6, 7, 8}
-	wantN := binary.Size(wantObj)
-	b := &BytesIO{make([]byte, wantN)}
-	ctx := newContext()
-	if n, err := CopyObjectOut(ctx, b, 0, &wantObj, IOOpts{}); n != wantN || err != nil {
-		t.Fatalf("CopyObjectOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	var gotObj testStruct
-	if n, err := CopyObjectIn(ctx, b, 0, &gotObj, IOOpts{}); n != wantN || err != nil {
-		t.Errorf("CopyObjectIn: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if gotObj != wantObj {
-		t.Errorf("CopyObject round trip: got %+v, wanted %+v", gotObj, wantObj)
-	}
-}
-
-func TestCopyStringInShort(t *testing.T) {
-	// Tests for string length <= copyStringIncrement.
-	want := strings.Repeat("A", copyStringIncrement-2)
-	mem := want + "\x00"
-	if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil {
-		t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want)
-	}
-}
-
-func TestCopyStringInLong(t *testing.T) {
-	// Tests for copyStringIncrement < string length <= copyStringMaxInitBufLen
-	// (requiring multiple calls to IO.CopyIn()).
-	want := strings.Repeat("A", copyStringIncrement*3/4) + strings.Repeat("B", copyStringIncrement*3/4)
-	mem := want + "\x00"
-	if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil {
-		t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want)
-	}
-}
-
-func TestCopyStringInVeryLong(t *testing.T) {
-	// Tests for string length > copyStringMaxInitBufLen (requiring buffer
-	// reallocation).
-	want := strings.Repeat("A", copyStringMaxInitBufLen*3/4) + strings.Repeat("B", copyStringMaxInitBufLen*3/4)
-	mem := want + "\x00"
-	if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringMaxInitBufLen, IOOpts{}); got != want || err != nil {
-		t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want)
-	}
-}
-
-func TestCopyStringInNoTerminatingZeroByte(t *testing.T) {
-	want := strings.Repeat("A", copyStringIncrement-1)
-	got, err := CopyStringIn(newContext(), newBytesIOString(want), 0, 2*copyStringIncrement, IOOpts{})
-	if wantErr := syserror.EFAULT; got != want || err != wantErr {
-		t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr)
-	}
-}
-
-func TestCopyStringInTruncatedByMaxlen(t *testing.T) {
-	got, err := CopyStringIn(newContext(), newBytesIOString(strings.Repeat("A", 10)), 0, 5, IOOpts{})
-	if want, wantErr := strings.Repeat("A", 5), syserror.ENAMETOOLONG; got != want || err != wantErr {
-		t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr)
-	}
-}
-
-func TestCopyInt32StringsInVec(t *testing.T) {
-	for _, test := range []struct {
-		str     string
-		n       int
-		initial []int32
-		final   []int32
-	}{
-		{
-			str:     "100 200",
-			n:       len("100 200"),
-			initial: []int32{1, 2},
-			final:   []int32{100, 200},
-		},
-		{
-			// Fewer values ok
-			str:     "100",
-			n:       len("100"),
-			initial: []int32{1, 2},
-			final:   []int32{100, 2},
-		},
-		{
-			// Extra values ok
-			str:     "100 200 300",
-			n:       len("100 200 "),
-			initial: []int32{1, 2},
-			final:   []int32{100, 200},
-		},
-		{
-			// Leading and trailing whitespace ok
-			str:     " 100\t200\n",
-			n:       len(" 100\t200\n"),
-			initial: []int32{1, 2},
-			final:   []int32{100, 200},
-		},
-	} {
-		t.Run(fmt.Sprintf("%q", test.str), func(t *testing.T) {
-			src := BytesIOSequence([]byte(test.str))
-			dsts := append([]int32(nil), test.initial...)
-			if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); n != int64(test.n) || err != nil {
-				t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (%d, nil)", n, err, test.n)
-			}
-			if !reflect.DeepEqual(dsts, test.final) {
-				t.Errorf("dsts: got %v, wanted %v", dsts, test.final)
-			}
-		})
-	}
-}
-
-func TestCopyInt32StringsInVecRequiresOneValidValue(t *testing.T) {
-	for _, s := range []string{"", "\n", "a123"} {
-		t.Run(fmt.Sprintf("%q", s), func(t *testing.T) {
-			src := BytesIOSequence([]byte(s))
-			initial := []int32{1, 2}
-			dsts := append([]int32(nil), initial...)
-			if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); err != syserror.EINVAL {
-				t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (_, %v)", n, err, syserror.EINVAL)
-			}
-			if !reflect.DeepEqual(dsts, initial) {
-				t.Errorf("dsts: got %v, wanted %v", dsts, initial)
-			}
-		})
-	}
-}
-
-func TestIOSequenceCopyOut(t *testing.T) {
-	buf := []byte("ABCD")
-	s := BytesIOSequence(buf)
-
-	// CopyOut limited by len(src).
-	n, err := s.CopyOut(newContext(), []byte("fo"))
-	if wantN := 2; n != wantN || err != nil {
-		t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if want := []byte("foCD"); !bytes.Equal(buf, want) {
-		t.Errorf("buf: got %q, wanted %q", buf, want)
-	}
-	s = s.DropFirst(2)
-	if got, want := s.NumBytes(), int64(2); got != want {
-		t.Errorf("NumBytes: got %v, wanted %v", got, want)
-	}
-
-	// CopyOut limited by s.NumBytes().
-	n, err = s.CopyOut(newContext(), []byte("obar"))
-	if wantN := 2; n != wantN || err != nil {
-		t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if want := []byte("foob"); !bytes.Equal(buf, want) {
-		t.Errorf("buf: got %q, wanted %q", buf, want)
-	}
-	s = s.DropFirst(2)
-	if got, want := s.NumBytes(), int64(0); got != want {
-		t.Errorf("NumBytes: got %v, wanted %v", got, want)
-	}
-}
-
-func TestIOSequenceCopyIn(t *testing.T) {
-	s := BytesIOSequence([]byte("foob"))
-	dst := []byte("ABCDEF")
-
-	// CopyIn limited by len(dst).
-	n, err := s.CopyIn(newContext(), dst[:2])
-	if wantN := 2; n != wantN || err != nil {
-		t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if want := []byte("foCDEF"); !bytes.Equal(dst, want) {
-		t.Errorf("dst: got %q, wanted %q", dst, want)
-	}
-	s = s.DropFirst(2)
-	if got, want := s.NumBytes(), int64(2); got != want {
-		t.Errorf("NumBytes: got %v, wanted %v", got, want)
-	}
-
-	// CopyIn limited by s.Remaining().
-	n, err = s.CopyIn(newContext(), dst[2:])
-	if wantN := 2; n != wantN || err != nil {
-		t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if want := []byte("foobEF"); !bytes.Equal(dst, want) {
-		t.Errorf("dst: got %q, wanted %q", dst, want)
-	}
-	s = s.DropFirst(2)
-	if got, want := s.NumBytes(), int64(0); got != want {
-		t.Errorf("NumBytes: got %v, wanted %v", got, want)
-	}
-}
-
-func TestIOSequenceZeroOut(t *testing.T) {
-	buf := []byte("ABCD")
-	s := BytesIOSequence(buf)
-
-	// ZeroOut limited by toZero.
-	n, err := s.ZeroOut(newContext(), 2)
-	if wantN := int64(2); n != wantN || err != nil {
-		t.Errorf("ZeroOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if want := []byte("\x00\x00CD"); !bytes.Equal(buf, want) {
-		t.Errorf("buf: got %q, wanted %q", buf, want)
-	}
-	s = s.DropFirst(2)
-	if got, want := s.NumBytes(), int64(2); got != want {
-		t.Errorf("NumBytes: got %v, wanted %v", got, want)
-	}
-
-	// ZeroOut limited by s.NumBytes().
-	n, err = s.ZeroOut(newContext(), 4)
-	if wantN := int64(2); n != wantN || err != nil {
-		t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if want := []byte("\x00\x00\x00\x00"); !bytes.Equal(buf, want) {
-		t.Errorf("buf: got %q, wanted %q", buf, want)
-	}
-	s = s.DropFirst(2)
-	if got, want := s.NumBytes(), int64(0); got != want {
-		t.Errorf("NumBytes: got %v, wanted %v", got, want)
-	}
-}
-
-func TestIOSequenceTakeFirst(t *testing.T) {
-	s := BytesIOSequence([]byte("foobar"))
-	if got, want := s.NumBytes(), int64(6); got != want {
-		t.Errorf("NumBytes: got %v, wanted %v", got, want)
-	}
-
-	s = s.TakeFirst(3)
-	if got, want := s.NumBytes(), int64(3); got != want {
-		t.Errorf("NumBytes: got %v, wanted %v", got, want)
-	}
-
-	// TakeFirst(n) where n > s.NumBytes() is a no-op.
-	s = s.TakeFirst(9)
-	if got, want := s.NumBytes(), int64(3); got != want {
-		t.Errorf("NumBytes: got %v, wanted %v", got, want)
-	}
-
-	var dst [3]byte
-	n, err := s.CopyIn(newContext(), dst[:])
-	if wantN := 3; n != wantN || err != nil {
-		t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) {
-		t.Errorf("dst: got %q, wanted %q", got, want)
-	}
-	s = s.DropFirst(3)
-	if got, want := s.NumBytes(), int64(0); got != want {
-		t.Errorf("NumBytes: got %v, wanted %v", got, want)
-	}
-}
diff --git a/pkg/sentry/usermem/usermem_unsafe.go b/pkg/sentry/usermem/usermem_unsafe.go
deleted file mode 100644
index 876783e78..000000000
--- a/pkg/sentry/usermem/usermem_unsafe.go
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package usermem
-
-import (
-	"unsafe"
-)
-
-// stringFromImmutableBytes is equivalent to string(bs), except that it never
-// copies even if escape analysis can't prove that bs does not escape. This is
-// only valid if bs is never mutated after stringFromImmutableBytes returns.
-func stringFromImmutableBytes(bs []byte) string {
-	// Compare strings.Builder.String().
-	return *(*string)(unsafe.Pointer(&bs))
-}
diff --git a/pkg/sentry/usermem/usermem_x86.go b/pkg/sentry/usermem/usermem_x86.go
deleted file mode 100644
index 8059b72d2..000000000
--- a/pkg/sentry/usermem/usermem_x86.go
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build amd64 i386
-
-package usermem
-
-import "encoding/binary"
-
-const (
-	// PageSize is the system page size.
-	PageSize = 1 << PageShift
-
-	// HugePageSize is the system huge page size.
-	HugePageSize = 1 << HugePageShift
-
-	// PageShift is the binary log of the system page size.
-	PageShift = 12
-
-	// HugePageShift is the binary log of the system huge page size.
-	HugePageShift = 21
-)
-
-var (
-	// ByteOrder is the native byte order (little endian).
-	ByteOrder = binary.LittleEndian
-)
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 51acdc4e9..6b1009328 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -26,14 +26,14 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fspath",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -48,11 +48,11 @@ go_test(
     library = ":vfs",
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/context",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/usermem",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index 705194ebc..d97362b9a 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -15,7 +15,7 @@
 package vfs
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 )
 
 // contextID is this package's type for context.Context.Value keys.
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
index 9f9d6e783..3af2aa58d 100644
--- a/pkg/sentry/vfs/device.go
+++ b/pkg/sentry/vfs/device.go
@@ -17,7 +17,7 @@ package vfs
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 51c95c2d9..225024463 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -18,12 +18,12 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index c00b3c84b..fb9b87fdc 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -19,12 +19,12 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 9ed58512f..1720d325d 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -22,11 +22,11 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // fileDescription is the common fd struct which a filesystem implementation
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index ea78f555b..a06a6caf3 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -18,8 +18,8 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 )
 
 // A Filesystem is a tree of nodes represented by Dentries, which forms part of
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index 023301780..c58b70728 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -18,7 +18,7 @@ import (
 	"bytes"
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 00177b371..d39528051 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -19,7 +19,7 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
index cf80df90e..b318c681a 100644
--- a/pkg/sentry/vfs/pathname.go
+++ b/pkg/sentry/vfs/pathname.go
@@ -15,8 +15,8 @@
 package vfs
 
 import (
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
index ee5c8b9e2..392c7611e 100644
--- a/pkg/sentry/vfs/testutil.go
+++ b/pkg/sentry/vfs/testutil.go
@@ -18,8 +18,8 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 1f6f56293..b2bf48853 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -31,8 +31,8 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
diff --git a/pkg/usermem/BUILD b/pkg/usermem/BUILD
new file mode 100644
index 000000000..ff8b9e91a
--- /dev/null
+++ b/pkg/usermem/BUILD
@@ -0,0 +1,55 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "addr_range",
+    out = "addr_range.go",
+    package = "usermem",
+    prefix = "Addr",
+    template = "//pkg/segment:generic_range",
+    types = {
+        "T": "Addr",
+    },
+)
+
+go_library(
+    name = "usermem",
+    srcs = [
+        "access_type.go",
+        "addr.go",
+        "addr_range.go",
+        "addr_range_seq_unsafe.go",
+        "bytes_io.go",
+        "bytes_io_unsafe.go",
+        "usermem.go",
+        "usermem_arm64.go",
+        "usermem_unsafe.go",
+        "usermem_x86.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/atomicbitops",
+        "//pkg/binary",
+        "//pkg/context",
+        "//pkg/log",
+        "//pkg/safemem",
+        "//pkg/syserror",
+    ],
+)
+
+go_test(
+    name = "usermem_test",
+    size = "small",
+    srcs = [
+        "addr_range_seq_test.go",
+        "usermem_test.go",
+    ],
+    library = ":usermem",
+    deps = [
+        "//pkg/context",
+        "//pkg/safemem",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/usermem/README.md b/pkg/usermem/README.md
new file mode 100644
index 000000000..f6d2137eb
--- /dev/null
+++ b/pkg/usermem/README.md
@@ -0,0 +1,31 @@
+This package defines primitives for sentry access to application memory.
+
+Major types:
+
+-   The `IO` interface represents a virtual address space and provides I/O
+    methods on that address space. `IO` is the lowest-level primitive. The
+    primary implementation of the `IO` interface is `mm.MemoryManager`.
+
+-   `IOSequence` represents a collection of individually-contiguous address
+    ranges in a `IO` that is operated on sequentially, analogous to Linux's
+    `struct iov_iter`.
+
+Major usage patterns:
+
+-   Access to a task's virtual memory, subject to the application's memory
+    protections and while running on that task's goroutine, from a context that
+    is at or above the level of the `kernel` package (e.g. most syscall
+    implementations in `syscalls/linux`); use the `kernel.Task.Copy*` wrappers
+    defined in `kernel/task_usermem.go`.
+
+-   Access to a task's virtual memory, from a context that is at or above the
+    level of the `kernel` package, but where any of the above constraints does
+    not hold (e.g. `PTRACE_POKEDATA`, which ignores application memory
+    protections); obtain the task's `mm.MemoryManager` by calling
+    `kernel.Task.MemoryManager`, and call its `IO` methods directly.
+
+-   Access to a task's virtual memory, from a context that is below the level of
+    the `kernel` package (e.g. filesystem I/O); clients must pass I/O arguments
+    from higher layers, usually in the form of an `IOSequence`. The
+    `kernel.Task.SingleIOSequence` and `kernel.Task.IovecsIOSequence` functions
+    in `kernel/task_usermem.go` are convenience functions for doing so.
diff --git a/pkg/usermem/access_type.go b/pkg/usermem/access_type.go
new file mode 100644
index 000000000..9c1742a59
--- /dev/null
+++ b/pkg/usermem/access_type.go
@@ -0,0 +1,128 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package usermem
+
+import (
+	"syscall"
+)
+
+// AccessType specifies memory access types. This is used for
+// setting mapping permissions, as well as communicating faults.
+//
+// +stateify savable
+type AccessType struct {
+	// Read is read access.
+	Read bool
+
+	// Write is write access.
+	Write bool
+
+	// Execute is executable access.
+	Execute bool
+}
+
+// String returns a pretty representation of access. This looks like the
+// familiar r-x, rw-, etc. and can be relied on as such.
+func (a AccessType) String() string {
+	bits := [3]byte{'-', '-', '-'}
+	if a.Read {
+		bits[0] = 'r'
+	}
+	if a.Write {
+		bits[1] = 'w'
+	}
+	if a.Execute {
+		bits[2] = 'x'
+	}
+	return string(bits[:])
+}
+
+// Any returns true iff at least one of Read, Write or Execute is true.
+func (a AccessType) Any() bool {
+	return a.Read || a.Write || a.Execute
+}
+
+// Prot returns the system prot (syscall.PROT_READ, etc.) for this access.
+func (a AccessType) Prot() int {
+	var prot int
+	if a.Read {
+		prot |= syscall.PROT_READ
+	}
+	if a.Write {
+		prot |= syscall.PROT_WRITE
+	}
+	if a.Execute {
+		prot |= syscall.PROT_EXEC
+	}
+	return prot
+}
+
+// SupersetOf returns true iff the access types in a are a superset of the
+// access types in other.
+func (a AccessType) SupersetOf(other AccessType) bool {
+	if !a.Read && other.Read {
+		return false
+	}
+	if !a.Write && other.Write {
+		return false
+	}
+	if !a.Execute && other.Execute {
+		return false
+	}
+	return true
+}
+
+// Intersect returns the access types set in both a and other.
+func (a AccessType) Intersect(other AccessType) AccessType {
+	return AccessType{
+		Read:    a.Read && other.Read,
+		Write:   a.Write && other.Write,
+		Execute: a.Execute && other.Execute,
+	}
+}
+
+// Union returns the access types set in either a or other.
+func (a AccessType) Union(other AccessType) AccessType {
+	return AccessType{
+		Read:    a.Read || other.Read,
+		Write:   a.Write || other.Write,
+		Execute: a.Execute || other.Execute,
+	}
+}
+
+// Effective returns the set of effective access types allowed by a, even if
+// some types are not explicitly allowed.
+func (a AccessType) Effective() AccessType {
+	// In Linux, Write and Execute access generally imply Read access. See
+	// mm/mmap.c:protection_map.
+	//
+	// The notable exception is get_user_pages, which only checks against
+	// the original vma flags. That said, most user memory accesses do not
+	// use GUP.
+	if a.Write || a.Execute {
+		a.Read = true
+	}
+	return a
+}
+
+// Convenient access types.
+var (
+	NoAccess  = AccessType{}
+	Read      = AccessType{Read: true}
+	Write     = AccessType{Write: true}
+	Execute   = AccessType{Execute: true}
+	ReadWrite = AccessType{Read: true, Write: true}
+	AnyAccess = AccessType{Read: true, Write: true, Execute: true}
+)
diff --git a/pkg/usermem/addr.go b/pkg/usermem/addr.go
new file mode 100644
index 000000000..e79210804
--- /dev/null
+++ b/pkg/usermem/addr.go
@@ -0,0 +1,108 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package usermem
+
+import (
+	"fmt"
+)
+
+// Addr represents a generic virtual address.
+//
+// +stateify savable
+type Addr uintptr
+
+// AddLength adds the given length to start and returns the result. ok is true
+// iff adding the length did not overflow the range of Addr.
+//
+// Note: This function is usually used to get the end of an address range
+// defined by its start address and length. Since the resulting end is
+// exclusive, end == 0 is technically valid, and corresponds to a range that
+// extends to the end of the address space, but ok will be false. This isn't
+// expected to ever come up in practice.
+func (v Addr) AddLength(length uint64) (end Addr, ok bool) {
+	end = v + Addr(length)
+	// The second half of the following check is needed in case uintptr is
+	// smaller than 64 bits.
+	ok = end >= v && length <= uint64(^Addr(0))
+	return
+}
+
+// RoundDown returns the address rounded down to the nearest page boundary.
+func (v Addr) RoundDown() Addr {
+	return v & ^Addr(PageSize-1)
+}
+
+// RoundUp returns the address rounded up to the nearest page boundary. ok is
+// true iff rounding up did not wrap around.
+func (v Addr) RoundUp() (addr Addr, ok bool) {
+	addr = Addr(v + PageSize - 1).RoundDown()
+	ok = addr >= v
+	return
+}
+
+// MustRoundUp is equivalent to RoundUp, but panics if rounding up wraps
+// around.
+func (v Addr) MustRoundUp() Addr {
+	addr, ok := v.RoundUp()
+	if !ok {
+		panic(fmt.Sprintf("usermem.Addr(%d).RoundUp() wraps", v))
+	}
+	return addr
+}
+
+// HugeRoundDown returns the address rounded down to the nearest huge page
+// boundary.
+func (v Addr) HugeRoundDown() Addr {
+	return v & ^Addr(HugePageSize-1)
+}
+
+// HugeRoundUp returns the address rounded up to the nearest huge page boundary.
+// ok is true iff rounding up did not wrap around.
+func (v Addr) HugeRoundUp() (addr Addr, ok bool) {
+	addr = Addr(v + HugePageSize - 1).HugeRoundDown()
+	ok = addr >= v
+	return
+}
+
+// PageOffset returns the offset of v into the current page.
+func (v Addr) PageOffset() uint64 {
+	return uint64(v & Addr(PageSize-1))
+}
+
+// IsPageAligned returns true if v.PageOffset() == 0.
+func (v Addr) IsPageAligned() bool {
+	return v.PageOffset() == 0
+}
+
+// AddrRange is a range of Addrs.
+//
+// type AddrRange <generated by go_generics>
+
+// ToRange returns [v, v+length).
+func (v Addr) ToRange(length uint64) (AddrRange, bool) {
+	end, ok := v.AddLength(length)
+	return AddrRange{v, end}, ok
+}
+
+// IsPageAligned returns true if ar.Start.IsPageAligned() and
+// ar.End.IsPageAligned().
+func (ar AddrRange) IsPageAligned() bool {
+	return ar.Start.IsPageAligned() && ar.End.IsPageAligned()
+}
+
+// String implements fmt.Stringer.String.
+func (ar AddrRange) String() string {
+	return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End)
+}
diff --git a/pkg/usermem/addr_range_seq_test.go b/pkg/usermem/addr_range_seq_test.go
new file mode 100644
index 000000000..82f735026
--- /dev/null
+++ b/pkg/usermem/addr_range_seq_test.go
@@ -0,0 +1,197 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package usermem
+
+import (
+	"testing"
+)
+
+var addrRangeSeqTests = []struct {
+	desc   string
+	ranges []AddrRange
+}{
+	{
+		desc: "Empty sequence",
+	},
+	{
+		desc: "Single empty AddrRange",
+		ranges: []AddrRange{
+			{0x10, 0x10},
+		},
+	},
+	{
+		desc: "Single non-empty AddrRange of length 1",
+		ranges: []AddrRange{
+			{0x10, 0x11},
+		},
+	},
+	{
+		desc: "Single non-empty AddrRange of length 2",
+		ranges: []AddrRange{
+			{0x10, 0x12},
+		},
+	},
+	{
+		desc: "Multiple non-empty AddrRanges",
+		ranges: []AddrRange{
+			{0x10, 0x11},
+			{0x20, 0x22},
+		},
+	},
+	{
+		desc: "Multiple AddrRanges including empty AddrRanges",
+		ranges: []AddrRange{
+			{0x10, 0x10},
+			{0x20, 0x20},
+			{0x30, 0x33},
+			{0x40, 0x44},
+			{0x50, 0x50},
+			{0x60, 0x60},
+			{0x70, 0x77},
+			{0x80, 0x88},
+			{0x90, 0x90},
+			{0xa0, 0xa0},
+		},
+	},
+}
+
+func testAddrRangeSeqEqualityWithTailIteration(t *testing.T, ars AddrRangeSeq, wantRanges []AddrRange) {
+	var wantLen int64
+	for _, ar := range wantRanges {
+		wantLen += int64(ar.Length())
+	}
+
+	var i int
+	for !ars.IsEmpty() {
+		if gotLen := ars.NumBytes(); gotLen != wantLen {
+			t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen)
+		}
+		if gotN, wantN := ars.NumRanges(), len(wantRanges)-i; gotN != wantN {
+			t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted %d", i, ars, gotN, wantN)
+		}
+		got := ars.Head()
+		if i >= len(wantRanges) {
+			t.Errorf("Iteration %d: %v.Head(): got %s, wanted <end of sequence>", i, ars, got)
+		} else if want := wantRanges[i]; got != want {
+			t.Errorf("Iteration %d: %v.Head(): got %s, wanted %s", i, ars, got, want)
+		}
+		ars = ars.Tail()
+		wantLen -= int64(got.Length())
+		i++
+	}
+	if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 {
+		t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen)
+	}
+	if gotN := ars.NumRanges(); gotN != 0 {
+		t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted 0", i, ars, gotN)
+	}
+}
+
+func TestAddrRangeSeqTailIteration(t *testing.T) {
+	for _, test := range addrRangeSeqTests {
+		t.Run(test.desc, func(t *testing.T) {
+			testAddrRangeSeqEqualityWithTailIteration(t, AddrRangeSeqFromSlice(test.ranges), test.ranges)
+		})
+	}
+}
+
+func TestAddrRangeSeqDropFirstEmpty(t *testing.T) {
+	var ars AddrRangeSeq
+	if got, want := ars.DropFirst(1), ars; got != want {
+		t.Errorf("%v.DropFirst(1): got %v, wanted %v", ars, got, want)
+	}
+}
+
+func TestAddrRangeSeqDropSingleByteIteration(t *testing.T) {
+	// Tests AddrRangeSeq iteration using Head/DropFirst, simulating
+	// I/O-per-AddrRange.
+	for _, test := range addrRangeSeqTests {
+		t.Run(test.desc, func(t *testing.T) {
+			// Figure out what AddrRanges we expect to see.
+			var wantLen int64
+			var wantRanges []AddrRange
+			for _, ar := range test.ranges {
+				wantLen += int64(ar.Length())
+				wantRanges = append(wantRanges, ar)
+				if ar.Length() == 0 {
+					// We "do" 0 bytes of I/O and then call DropFirst(0),
+					// advancing to the next AddrRange.
+					continue
+				}
+				// Otherwise we "do" 1 byte of I/O and then call DropFirst(1),
+				// advancing the AddrRange by 1 byte, or to the next AddrRange
+				// if this one is exhausted.
+				for ar.Start++; ar.Length() != 0; ar.Start++ {
+					wantRanges = append(wantRanges, ar)
+				}
+			}
+			t.Logf("Expected AddrRanges: %s (%d bytes)", wantRanges, wantLen)
+
+			ars := AddrRangeSeqFromSlice(test.ranges)
+			var i int
+			for !ars.IsEmpty() {
+				if gotLen := ars.NumBytes(); gotLen != wantLen {
+					t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen)
+				}
+				got := ars.Head()
+				if i >= len(wantRanges) {
+					t.Errorf("Iteration %d: %v.Head(): got %s, wanted <end of sequence>", i, ars, got)
+				} else if want := wantRanges[i]; got != want {
+					t.Errorf("Iteration %d: %v.Head(): got %s, wanted %s", i, ars, got, want)
+				}
+				if got.Length() == 0 {
+					ars = ars.DropFirst(0)
+				} else {
+					ars = ars.DropFirst(1)
+					wantLen--
+				}
+				i++
+			}
+			if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 {
+				t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen)
+			}
+		})
+	}
+}
+
+func TestAddrRangeSeqTakeFirstEmpty(t *testing.T) {
+	var ars AddrRangeSeq
+	if got, want := ars.TakeFirst(1), ars; got != want {
+		t.Errorf("%v.TakeFirst(1): got %v, wanted %v", ars, got, want)
+	}
+}
+
+func TestAddrRangeSeqTakeFirst(t *testing.T) {
+	ranges := []AddrRange{
+		{0x10, 0x11},
+		{0x20, 0x22},
+		{0x30, 0x30},
+		{0x40, 0x44},
+		{0x50, 0x55},
+		{0x60, 0x60},
+		{0x70, 0x77},
+	}
+	ars := AddrRangeSeqFromSlice(ranges).TakeFirst(5)
+	want := []AddrRange{
+		{0x10, 0x11}, // +1 byte (total 1 byte), not truncated
+		{0x20, 0x22}, // +2 bytes (total 3 bytes), not truncated
+		{0x30, 0x30}, // +0 bytes (total 3 bytes), no change
+		{0x40, 0x42}, // +2 bytes (total 5 bytes), partially truncated
+		{0x50, 0x50}, // +0 bytes (total 5 bytes), fully truncated
+		{0x60, 0x60}, // +0 bytes (total 5 bytes), "fully truncated" (no change)
+		{0x70, 0x70}, // +0 bytes (total 5 bytes), fully truncated
+	}
+	testAddrRangeSeqEqualityWithTailIteration(t, ars, want)
+}
diff --git a/pkg/usermem/addr_range_seq_unsafe.go b/pkg/usermem/addr_range_seq_unsafe.go
new file mode 100644
index 000000000..c09337c15
--- /dev/null
+++ b/pkg/usermem/addr_range_seq_unsafe.go
@@ -0,0 +1,277 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package usermem
+
+import (
+	"bytes"
+	"fmt"
+	"reflect"
+	"unsafe"
+)
+
+// An AddrRangeSeq represents a sequence of AddrRanges.
+//
+// AddrRangeSeqs are immutable and may be copied by value. The zero value of
+// AddrRangeSeq represents an empty sequence.
+//
+// An AddrRangeSeq may contain AddrRanges with a length of 0. This is necessary
+// since zero-length AddrRanges are significant to MM bounds checks.
+type AddrRangeSeq struct {
+	// If length is 0, then the AddrRangeSeq represents no AddrRanges.
+	// Invariants: data == 0; offset == 0; limit == 0.
+	//
+	// If length is 1, then the AddrRangeSeq represents the single
+	// AddrRange{offset, offset+limit}. Invariants: data == 0.
+	//
+	// Otherwise, length >= 2, and the AddrRangeSeq represents the `length`
+	// AddrRanges in the array of AddrRanges starting at address `data`,
+	// starting at `offset` bytes into the first AddrRange and limited to the
+	// following `limit` bytes. (AddrRanges after `limit` are still iterated,
+	// but are truncated to a length of 0.) Invariants: data != 0; offset <=
+	// data[0].Length(); limit > 0; offset+limit <= the combined length of all
+	// AddrRanges in the array.
+	data   unsafe.Pointer
+	length int
+	offset Addr
+	limit  Addr
+}
+
+// AddrRangeSeqOf returns an AddrRangeSeq representing the single AddrRange ar.
+func AddrRangeSeqOf(ar AddrRange) AddrRangeSeq {
+	return AddrRangeSeq{
+		length: 1,
+		offset: ar.Start,
+		limit:  ar.Length(),
+	}
+}
+
+// AddrRangeSeqFromSlice returns an AddrRangeSeq representing all AddrRanges in
+// slice.
+//
+// Whether the returned AddrRangeSeq shares memory with slice is unspecified;
+// clients should avoid mutating slices passed to AddrRangeSeqFromSlice.
+//
+// Preconditions: The combined length of all AddrRanges in slice <=
+// math.MaxInt64.
+func AddrRangeSeqFromSlice(slice []AddrRange) AddrRangeSeq {
+	var limit int64
+	for _, ar := range slice {
+		len64 := int64(ar.Length())
+		if len64 < 0 {
+			panic(fmt.Sprintf("Length of AddrRange %v overflows int64", ar))
+		}
+		sum := limit + len64
+		if sum < limit {
+			panic(fmt.Sprintf("Total length of AddrRanges %v overflows int64", slice))
+		}
+		limit = sum
+	}
+	return addrRangeSeqFromSliceLimited(slice, limit)
+}
+
+// Preconditions: The combined length of all AddrRanges in slice <= limit.
+// limit >= 0. If len(slice) != 0, then limit > 0.
+func addrRangeSeqFromSliceLimited(slice []AddrRange, limit int64) AddrRangeSeq {
+	switch len(slice) {
+	case 0:
+		return AddrRangeSeq{}
+	case 1:
+		return AddrRangeSeq{
+			length: 1,
+			offset: slice[0].Start,
+			limit:  Addr(limit),
+		}
+	default:
+		return AddrRangeSeq{
+			data:   unsafe.Pointer(&slice[0]),
+			length: len(slice),
+			limit:  Addr(limit),
+		}
+	}
+}
+
+// IsEmpty returns true if ars.NumRanges() == 0.
+//
+// Note that since AddrRangeSeq may contain AddrRanges with a length of zero,
+// an AddrRange representing 0 bytes (AddrRangeSeq.NumBytes() == 0) is not
+// necessarily empty.
+func (ars AddrRangeSeq) IsEmpty() bool {
+	return ars.length == 0
+}
+
+// NumRanges returns the number of AddrRanges in ars.
+func (ars AddrRangeSeq) NumRanges() int {
+	return ars.length
+}
+
+// NumBytes returns the number of bytes represented by ars.
+func (ars AddrRangeSeq) NumBytes() int64 {
+	return int64(ars.limit)
+}
+
+// Head returns the first AddrRange in ars.
+//
+// Preconditions: !ars.IsEmpty().
+func (ars AddrRangeSeq) Head() AddrRange {
+	if ars.length == 0 {
+		panic("empty AddrRangeSeq")
+	}
+	if ars.length == 1 {
+		return AddrRange{ars.offset, ars.offset + ars.limit}
+	}
+	ar := *(*AddrRange)(ars.data)
+	ar.Start += ars.offset
+	if ar.Length() > ars.limit {
+		ar.End = ar.Start + ars.limit
+	}
+	return ar
+}
+
+// Tail returns an AddrRangeSeq consisting of all AddrRanges in ars after the
+// first.
+//
+// Preconditions: !ars.IsEmpty().
+func (ars AddrRangeSeq) Tail() AddrRangeSeq {
+	if ars.length == 0 {
+		panic("empty AddrRangeSeq")
+	}
+	if ars.length == 1 {
+		return AddrRangeSeq{}
+	}
+	return ars.externalTail()
+}
+
+// Preconditions: ars.length >= 2.
+func (ars AddrRangeSeq) externalTail() AddrRangeSeq {
+	headLen := (*AddrRange)(ars.data).Length() - ars.offset
+	var tailLimit int64
+	if ars.limit > headLen {
+		tailLimit = int64(ars.limit - headLen)
+	}
+	var extSlice []AddrRange
+	extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice))
+	extSliceHdr.Data = uintptr(ars.data)
+	extSliceHdr.Len = ars.length
+	extSliceHdr.Cap = ars.length
+	return addrRangeSeqFromSliceLimited(extSlice[1:], tailLimit)
+}
+
+// DropFirst returns an AddrRangeSeq equivalent to ars, but with the first n
+// bytes omitted. If n > ars.NumBytes(), DropFirst returns an empty
+// AddrRangeSeq.
+//
+// If !ars.IsEmpty() and ars.Head().Length() == 0, DropFirst will always omit
+// at least ars.Head(), even if n == 0. This guarantees that the basic pattern
+// of:
+//
+//     for !ars.IsEmpty() {
+//       n, err = doIOWith(ars.Head())
+//       if err != nil {
+//         return err
+//       }
+//       ars = ars.DropFirst(n)
+//     }
+//
+// works even in the presence of zero-length AddrRanges.
+//
+// Preconditions: n >= 0.
+func (ars AddrRangeSeq) DropFirst(n int) AddrRangeSeq {
+	if n < 0 {
+		panic(fmt.Sprintf("invalid n: %d", n))
+	}
+	return ars.DropFirst64(int64(n))
+}
+
+// DropFirst64 is equivalent to DropFirst but takes an int64.
+func (ars AddrRangeSeq) DropFirst64(n int64) AddrRangeSeq {
+	if n < 0 {
+		panic(fmt.Sprintf("invalid n: %d", n))
+	}
+	if Addr(n) > ars.limit {
+		return AddrRangeSeq{}
+	}
+	// Handle initial empty AddrRange.
+	switch ars.length {
+	case 0:
+		return AddrRangeSeq{}
+	case 1:
+		if ars.limit == 0 {
+			return AddrRangeSeq{}
+		}
+	default:
+		if rawHeadLen := (*AddrRange)(ars.data).Length(); ars.offset == rawHeadLen {
+			ars = ars.externalTail()
+		}
+	}
+	for n != 0 {
+		// Calling ars.Head() here is surprisingly expensive, so inline getting
+		// the head's length.
+		var headLen Addr
+		if ars.length == 1 {
+			headLen = ars.limit
+		} else {
+			headLen = (*AddrRange)(ars.data).Length() - ars.offset
+		}
+		if Addr(n) < headLen {
+			// Dropping ends partway through the head AddrRange.
+			ars.offset += Addr(n)
+			ars.limit -= Addr(n)
+			return ars
+		}
+		n -= int64(headLen)
+		ars = ars.Tail()
+	}
+	return ars
+}
+
+// TakeFirst returns an AddrRangeSeq equivalent to ars, but iterating at most n
+// bytes. TakeFirst never removes AddrRanges from ars; AddrRanges beyond the
+// first n bytes are reduced to a length of zero, but will still be iterated.
+//
+// Preconditions: n >= 0.
+func (ars AddrRangeSeq) TakeFirst(n int) AddrRangeSeq {
+	if n < 0 {
+		panic(fmt.Sprintf("invalid n: %d", n))
+	}
+	return ars.TakeFirst64(int64(n))
+}
+
+// TakeFirst64 is equivalent to TakeFirst but takes an int64.
+func (ars AddrRangeSeq) TakeFirst64(n int64) AddrRangeSeq {
+	if n < 0 {
+		panic(fmt.Sprintf("invalid n: %d", n))
+	}
+	if ars.limit > Addr(n) {
+		ars.limit = Addr(n)
+	}
+	return ars
+}
+
+// String implements fmt.Stringer.String.
+func (ars AddrRangeSeq) String() string {
+	// This is deliberately chosen to be the same as fmt's automatic stringer
+	// for []AddrRange.
+	var buf bytes.Buffer
+	buf.WriteByte('[')
+	var sep string
+	for !ars.IsEmpty() {
+		buf.WriteString(sep)
+		sep = " "
+		buf.WriteString(ars.Head().String())
+		ars = ars.Tail()
+	}
+	buf.WriteByte(']')
+	return buf.String()
+}
diff --git a/pkg/usermem/bytes_io.go b/pkg/usermem/bytes_io.go
new file mode 100644
index 000000000..e177d30eb
--- /dev/null
+++ b/pkg/usermem/bytes_io.go
@@ -0,0 +1,141 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package usermem
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const maxInt = int(^uint(0) >> 1)
+
+// BytesIO implements IO using a byte slice. Addresses are interpreted as
+// offsets into the slice. Reads and writes beyond the end of the slice return
+// EFAULT.
+type BytesIO struct {
+	Bytes []byte
+}
+
+// CopyOut implements IO.CopyOut.
+func (b *BytesIO) CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error) {
+	rngN, rngErr := b.rangeCheck(addr, len(src))
+	if rngN == 0 {
+		return 0, rngErr
+	}
+	return copy(b.Bytes[int(addr):], src[:rngN]), rngErr
+}
+
+// CopyIn implements IO.CopyIn.
+func (b *BytesIO) CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error) {
+	rngN, rngErr := b.rangeCheck(addr, len(dst))
+	if rngN == 0 {
+		return 0, rngErr
+	}
+	return copy(dst[:rngN], b.Bytes[int(addr):]), rngErr
+}
+
+// ZeroOut implements IO.ZeroOut.
+func (b *BytesIO) ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error) {
+	if toZero > int64(maxInt) {
+		return 0, syserror.EINVAL
+	}
+	rngN, rngErr := b.rangeCheck(addr, int(toZero))
+	if rngN == 0 {
+		return 0, rngErr
+	}
+	zeroSlice := b.Bytes[int(addr) : int(addr)+rngN]
+	for i := range zeroSlice {
+		zeroSlice[i] = 0
+	}
+	return int64(rngN), rngErr
+}
+
+// CopyOutFrom implements IO.CopyOutFrom.
+func (b *BytesIO) CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) {
+	dsts, rngErr := b.blocksFromAddrRanges(ars)
+	n, err := src.ReadToBlocks(dsts)
+	if err != nil {
+		return int64(n), err
+	}
+	return int64(n), rngErr
+}
+
+// CopyInTo implements IO.CopyInTo.
+func (b *BytesIO) CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) {
+	srcs, rngErr := b.blocksFromAddrRanges(ars)
+	n, err := dst.WriteFromBlocks(srcs)
+	if err != nil {
+		return int64(n), err
+	}
+	return int64(n), rngErr
+}
+
+func (b *BytesIO) rangeCheck(addr Addr, length int) (int, error) {
+	if length == 0 {
+		return 0, nil
+	}
+	if length < 0 {
+		return 0, syserror.EINVAL
+	}
+	max := Addr(len(b.Bytes))
+	if addr >= max {
+		return 0, syserror.EFAULT
+	}
+	end, ok := addr.AddLength(uint64(length))
+	if !ok || end > max {
+		return int(max - addr), syserror.EFAULT
+	}
+	return length, nil
+}
+
+func (b *BytesIO) blocksFromAddrRanges(ars AddrRangeSeq) (safemem.BlockSeq, error) {
+	switch ars.NumRanges() {
+	case 0:
+		return safemem.BlockSeq{}, nil
+	case 1:
+		block, err := b.blockFromAddrRange(ars.Head())
+		return safemem.BlockSeqOf(block), err
+	default:
+		blocks := make([]safemem.Block, 0, ars.NumRanges())
+		for !ars.IsEmpty() {
+			block, err := b.blockFromAddrRange(ars.Head())
+			if block.Len() != 0 {
+				blocks = append(blocks, block)
+			}
+			if err != nil {
+				return safemem.BlockSeqFromSlice(blocks), err
+			}
+			ars = ars.Tail()
+		}
+		return safemem.BlockSeqFromSlice(blocks), nil
+	}
+}
+
+func (b *BytesIO) blockFromAddrRange(ar AddrRange) (safemem.Block, error) {
+	n, err := b.rangeCheck(ar.Start, int(ar.Length()))
+	if n == 0 {
+		return safemem.Block{}, err
+	}
+	return safemem.BlockFromSafeSlice(b.Bytes[int(ar.Start) : int(ar.Start)+n]), err
+}
+
+// BytesIOSequence returns an IOSequence representing the given byte slice.
+func BytesIOSequence(buf []byte) IOSequence {
+	return IOSequence{
+		IO:    &BytesIO{buf},
+		Addrs: AddrRangeSeqOf(AddrRange{0, Addr(len(buf))}),
+	}
+}
diff --git a/pkg/usermem/bytes_io_unsafe.go b/pkg/usermem/bytes_io_unsafe.go
new file mode 100644
index 000000000..20de5037d
--- /dev/null
+++ b/pkg/usermem/bytes_io_unsafe.go
@@ -0,0 +1,47 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package usermem
+
+import (
+	"sync/atomic"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/atomicbitops"
+	"gvisor.dev/gvisor/pkg/context"
+)
+
+// SwapUint32 implements IO.SwapUint32.
+func (b *BytesIO) SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error) {
+	if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil {
+		return 0, rngErr
+	}
+	return atomic.SwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), new), nil
+}
+
+// CompareAndSwapUint32 implements IO.CompareAndSwapUint32.
+func (b *BytesIO) CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) {
+	if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil {
+		return 0, rngErr
+	}
+	return atomicbitops.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), old, new), nil
+}
+
+// LoadUint32 implements IO.LoadUint32.
+func (b *BytesIO) LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) {
+	if _, err := b.rangeCheck(addr, 4); err != nil {
+		return 0, err
+	}
+	return atomic.LoadUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)]))), nil
+}
diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go
new file mode 100644
index 000000000..71fd4e155
--- /dev/null
+++ b/pkg/usermem/usermem.go
@@ -0,0 +1,597 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package usermem governs access to user memory.
+package usermem
+
+import (
+	"bytes"
+	"errors"
+	"io"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// IO provides access to the contents of a virtual memory space.
+//
+// FIXME(b/38173783): Implementations of IO cannot expect ctx to contain any
+// meaningful data.
+type IO interface {
+	// CopyOut copies len(src) bytes from src to the memory mapped at addr. It
+	// returns the number of bytes copied. If the number of bytes copied is <
+	// len(src), it returns a non-nil error explaining why.
+	//
+	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
+	// any following locks in the lock order.
+	//
+	// Postconditions: CopyOut does not retain src.
+	CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error)
+
+	// CopyIn copies len(dst) bytes from the memory mapped at addr to dst.
+	// It returns the number of bytes copied. If the number of bytes copied is
+	// < len(dst), it returns a non-nil error explaining why.
+	//
+	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
+	// any following locks in the lock order.
+	//
+	// Postconditions: CopyIn does not retain dst.
+	CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error)
+
+	// ZeroOut sets toZero bytes to 0, starting at addr. It returns the number
+	// of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a
+	// non-nil error explaining why.
+	//
+	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
+	// any following locks in the lock order. toZero >= 0.
+	ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error)
+
+	// CopyOutFrom copies ars.NumBytes() bytes from src to the memory mapped at
+	// ars. It returns the number of bytes copied, which may be less than the
+	// number of bytes read from src if copying fails. CopyOutFrom may return a
+	// partial copy without an error iff src.ReadToBlocks returns a partial
+	// read without an error.
+	//
+	// CopyOutFrom calls src.ReadToBlocks at most once.
+	//
+	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
+	// any following locks in the lock order. src.ReadToBlocks must not block
+	// on mm.MemoryManager.activeMu or any preceding locks in the lock order.
+	CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error)
+
+	// CopyInTo copies ars.NumBytes() bytes from the memory mapped at ars to
+	// dst. It returns the number of bytes copied. CopyInTo may return a
+	// partial copy without an error iff dst.WriteFromBlocks returns a partial
+	// write without an error.
+	//
+	// CopyInTo calls dst.WriteFromBlocks at most once.
+	//
+	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
+	// any following locks in the lock order. dst.WriteFromBlocks must not
+	// block on mm.MemoryManager.activeMu or any preceding locks in the lock
+	// order.
+	CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error)
+
+	// TODO(jamieliu): The requirement that CopyOutFrom/CopyInTo call src/dst
+	// at most once, which is unnecessary in most cases, forces implementations
+	// to gather safemem.Blocks into a single slice to pass to src/dst. Add
+	// CopyOutFromIter/CopyInToIter, which relaxes this restriction, to avoid
+	// this allocation.
+
+	// SwapUint32 atomically sets the uint32 value at addr to new and
+	// returns the previous value.
+	//
+	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
+	// any following locks in the lock order. addr must be aligned to a 4-byte
+	// boundary.
+	SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error)
+
+	// CompareAndSwapUint32 atomically compares the uint32 value at addr to
+	// old; if they are equal, the value in memory is replaced by new. In
+	// either case, the previous value stored in memory is returned.
+	//
+	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
+	// any following locks in the lock order. addr must be aligned to a 4-byte
+	// boundary.
+	CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error)
+
+	// LoadUint32 atomically loads the uint32 value at addr and returns it.
+	//
+	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
+	// any following locks in the lock order. addr must be aligned to a 4-byte
+	// boundary.
+	LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error)
+}
+
+// IOOpts contains options applicable to all IO methods.
+type IOOpts struct {
+	// If IgnorePermissions is true, application-defined memory protections set
+	// by mmap(2) or mprotect(2) will be ignored. (Memory protections required
+	// by the target of the mapping are never ignored.)
+	IgnorePermissions bool
+
+	// If AddressSpaceActive is true, the IO implementation may assume that it
+	// has an active AddressSpace and can therefore use AddressSpace copying
+	// without performing activation. See mm/io.go for details.
+	AddressSpaceActive bool
+}
+
+// IOReadWriter is an io.ReadWriter that reads from / writes to addresses
+// starting at addr in IO. The preconditions that apply to IO.CopyIn and
+// IO.CopyOut also apply to IOReadWriter.Read and IOReadWriter.Write
+// respectively.
+type IOReadWriter struct {
+	Ctx  context.Context
+	IO   IO
+	Addr Addr
+	Opts IOOpts
+}
+
+// Read implements io.Reader.Read.
+//
+// Note that an address space does not have an "end of file", so Read can only
+// return io.EOF if IO.CopyIn returns io.EOF. Attempts to read unmapped or
+// unreadable memory, or beyond the end of the address space, should return
+// EFAULT.
+func (rw *IOReadWriter) Read(dst []byte) (int, error) {
+	n, err := rw.IO.CopyIn(rw.Ctx, rw.Addr, dst, rw.Opts)
+	end, ok := rw.Addr.AddLength(uint64(n))
+	if ok {
+		rw.Addr = end
+	} else {
+		// Disallow wraparound.
+		rw.Addr = ^Addr(0)
+		if err != nil {
+			err = syserror.EFAULT
+		}
+	}
+	return n, err
+}
+
+// Writer implements io.Writer.Write.
+func (rw *IOReadWriter) Write(src []byte) (int, error) {
+	n, err := rw.IO.CopyOut(rw.Ctx, rw.Addr, src, rw.Opts)
+	end, ok := rw.Addr.AddLength(uint64(n))
+	if ok {
+		rw.Addr = end
+	} else {
+		// Disallow wraparound.
+		rw.Addr = ^Addr(0)
+		if err != nil {
+			err = syserror.EFAULT
+		}
+	}
+	return n, err
+}
+
+// CopyObjectOut copies a fixed-size value or slice of fixed-size values from
+// src to the memory mapped at addr in uio. It returns the number of bytes
+// copied.
+//
+// CopyObjectOut must use reflection to encode src; performance-sensitive
+// clients should do encoding manually and use uio.CopyOut directly.
+//
+// Preconditions: As for IO.CopyOut.
+func CopyObjectOut(ctx context.Context, uio IO, addr Addr, src interface{}, opts IOOpts) (int, error) {
+	w := &IOReadWriter{
+		Ctx:  ctx,
+		IO:   uio,
+		Addr: addr,
+		Opts: opts,
+	}
+	// Allocate a byte slice the size of the object being marshaled. This
+	// adds an extra reflection call, but avoids needing to grow the slice
+	// during encoding, which can result in many heap-allocated slices.
+	b := make([]byte, 0, binary.Size(src))
+	return w.Write(binary.Marshal(b, ByteOrder, src))
+}
+
+// CopyObjectIn copies a fixed-size value or slice of fixed-size values from
+// the memory mapped at addr in uio to dst. It returns the number of bytes
+// copied.
+//
+// CopyObjectIn must use reflection to decode dst; performance-sensitive
+// clients should use uio.CopyIn directly and do decoding manually.
+//
+// Preconditions: As for IO.CopyIn.
+func CopyObjectIn(ctx context.Context, uio IO, addr Addr, dst interface{}, opts IOOpts) (int, error) {
+	r := &IOReadWriter{
+		Ctx:  ctx,
+		IO:   uio,
+		Addr: addr,
+		Opts: opts,
+	}
+	buf := make([]byte, binary.Size(dst))
+	if _, err := io.ReadFull(r, buf); err != nil {
+		return 0, err
+	}
+	binary.Unmarshal(buf, ByteOrder, dst)
+	return int(r.Addr - addr), nil
+}
+
+// CopyStringIn tuning parameters, defined outside that function for tests.
+const (
+	copyStringIncrement     = 64
+	copyStringMaxInitBufLen = 256
+)
+
+// CopyStringIn copies a NUL-terminated string of unknown length from the
+// memory mapped at addr in uio and returns it as a string (not including the
+// trailing NUL). If the length of the string, including the terminating NUL,
+// would exceed maxlen, CopyStringIn returns the string truncated to maxlen and
+// ENAMETOOLONG.
+//
+// Preconditions: As for IO.CopyFromUser. maxlen >= 0.
+func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpts) (string, error) {
+	initLen := maxlen
+	if initLen > copyStringMaxInitBufLen {
+		initLen = copyStringMaxInitBufLen
+	}
+	buf := make([]byte, initLen)
+	var done int
+	for done < maxlen {
+		// Read up to copyStringIncrement bytes at a time.
+		readlen := copyStringIncrement
+		if readlen > maxlen-done {
+			readlen = maxlen - done
+		}
+		end, ok := addr.AddLength(uint64(readlen))
+		if !ok {
+			return stringFromImmutableBytes(buf[:done]), syserror.EFAULT
+		}
+		// Shorten the read to avoid crossing page boundaries, since faulting
+		// in a page unnecessarily is expensive. This also ensures that partial
+		// copies up to the end of application-mappable memory succeed.
+		if addr.RoundDown() != end.RoundDown() {
+			end = end.RoundDown()
+			readlen = int(end - addr)
+		}
+		// Ensure that our buffer is large enough to accommodate the read.
+		if done+readlen > len(buf) {
+			newBufLen := len(buf) * 2
+			if newBufLen > maxlen {
+				newBufLen = maxlen
+			}
+			buf = append(buf, make([]byte, newBufLen-len(buf))...)
+		}
+		n, err := uio.CopyIn(ctx, addr, buf[done:done+readlen], opts)
+		// Look for the terminating zero byte, which may have occurred before
+		// hitting err.
+		if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 {
+			return stringFromImmutableBytes(buf[:done+i]), nil
+		}
+
+		done += n
+		if err != nil {
+			return stringFromImmutableBytes(buf[:done]), err
+		}
+		addr = end
+	}
+	return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG
+}
+
+// CopyOutVec copies bytes from src to the memory mapped at ars in uio. The
+// maximum number of bytes copied is ars.NumBytes() or len(src), whichever is
+// less. CopyOutVec returns the number of bytes copied; if this is less than
+// the maximum, it returns a non-nil error explaining why.
+//
+// Preconditions: As for IO.CopyOut.
+func CopyOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, src []byte, opts IOOpts) (int, error) {
+	var done int
+	for !ars.IsEmpty() && done < len(src) {
+		ar := ars.Head()
+		cplen := len(src) - done
+		if Addr(cplen) >= ar.Length() {
+			cplen = int(ar.Length())
+		}
+		n, err := uio.CopyOut(ctx, ar.Start, src[done:done+cplen], opts)
+		done += n
+		if err != nil {
+			return done, err
+		}
+		ars = ars.DropFirst(n)
+	}
+	return done, nil
+}
+
+// CopyInVec copies bytes from the memory mapped at ars in uio to dst. The
+// maximum number of bytes copied is ars.NumBytes() or len(dst), whichever is
+// less. CopyInVec returns the number of bytes copied; if this is less than the
+// maximum, it returns a non-nil error explaining why.
+//
+// Preconditions: As for IO.CopyIn.
+func CopyInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst []byte, opts IOOpts) (int, error) {
+	var done int
+	for !ars.IsEmpty() && done < len(dst) {
+		ar := ars.Head()
+		cplen := len(dst) - done
+		if Addr(cplen) >= ar.Length() {
+			cplen = int(ar.Length())
+		}
+		n, err := uio.CopyIn(ctx, ar.Start, dst[done:done+cplen], opts)
+		done += n
+		if err != nil {
+			return done, err
+		}
+		ars = ars.DropFirst(n)
+	}
+	return done, nil
+}
+
+// ZeroOutVec writes zeroes to the memory mapped at ars in uio. The maximum
+// number of bytes written is ars.NumBytes() or toZero, whichever is less.
+// ZeroOutVec returns the number of bytes written; if this is less than the
+// maximum, it returns a non-nil error explaining why.
+//
+// Preconditions: As for IO.ZeroOut.
+func ZeroOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, toZero int64, opts IOOpts) (int64, error) {
+	var done int64
+	for !ars.IsEmpty() && done < toZero {
+		ar := ars.Head()
+		cplen := toZero - done
+		if Addr(cplen) >= ar.Length() {
+			cplen = int64(ar.Length())
+		}
+		n, err := uio.ZeroOut(ctx, ar.Start, cplen, opts)
+		done += n
+		if err != nil {
+			return done, err
+		}
+		ars = ars.DropFirst64(n)
+	}
+	return done, nil
+}
+
+func isASCIIWhitespace(b byte) bool {
+	// Compare Linux include/linux/ctype.h, lib/ctype.c.
+	//  9 => horizontal tab '\t'
+	// 10 => line feed '\n'
+	// 11 => vertical tab '\v'
+	// 12 => form feed '\c'
+	// 13 => carriage return '\r'
+	return b == ' ' || (b >= 9 && b <= 13)
+}
+
+// CopyInt32StringsInVec copies up to len(dsts) whitespace-separated decimal
+// strings from the memory mapped at ars in uio and converts them to int32
+// values in dsts. It returns the number of bytes read.
+//
+// CopyInt32StringsInVec shares the following properties with Linux's
+// kernel/sysctl.c:proc_dointvec(write=1):
+//
+// - If any read value overflows the range of int32, or any invalid characters
+// are encountered during the read, CopyInt32StringsInVec returns EINVAL.
+//
+// - If, upon reaching the end of ars, fewer than len(dsts) values have been
+// read, CopyInt32StringsInVec returns no error if at least 1 value was read
+// and EINVAL otherwise.
+//
+// - Trailing whitespace after the last successfully read value is counted in
+// the number of bytes read.
+//
+// Unlike proc_dointvec():
+//
+// - CopyInt32StringsInVec does not implicitly limit ars.NumBytes() to
+// PageSize-1; callers that require this must do so explicitly.
+//
+// - CopyInt32StringsInVec returns EINVAL if ars.NumBytes() == 0.
+//
+// Preconditions: As for CopyInVec.
+func CopyInt32StringsInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dsts []int32, opts IOOpts) (int64, error) {
+	if len(dsts) == 0 {
+		return 0, nil
+	}
+
+	buf := make([]byte, ars.NumBytes())
+	n, cperr := CopyInVec(ctx, uio, ars, buf, opts)
+	buf = buf[:n]
+
+	var i, j int
+	for ; j < len(dsts); j++ {
+		// Skip leading whitespace.
+		for i < len(buf) && isASCIIWhitespace(buf[i]) {
+			i++
+		}
+		if i == len(buf) {
+			break
+		}
+
+		// Find the end of the value to be parsed (next whitespace or end of string).
+		nextI := i + 1
+		for nextI < len(buf) && !isASCIIWhitespace(buf[nextI]) {
+			nextI++
+		}
+
+		// Parse a single value.
+		val, err := strconv.ParseInt(string(buf[i:nextI]), 10, 32)
+		if err != nil {
+			return int64(i), syserror.EINVAL
+		}
+		dsts[j] = int32(val)
+
+		i = nextI
+	}
+
+	// Skip trailing whitespace.
+	for i < len(buf) && isASCIIWhitespace(buf[i]) {
+		i++
+	}
+
+	if cperr != nil {
+		return int64(i), cperr
+	}
+	if j == 0 {
+		return int64(i), syserror.EINVAL
+	}
+	return int64(i), nil
+}
+
+// CopyInt32StringInVec is equivalent to CopyInt32StringsInVec, but copies at
+// most one int32.
+func CopyInt32StringInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst *int32, opts IOOpts) (int64, error) {
+	dsts := [1]int32{*dst}
+	n, err := CopyInt32StringsInVec(ctx, uio, ars, dsts[:], opts)
+	*dst = dsts[0]
+	return n, err
+}
+
+// IOSequence holds arguments to IO methods.
+type IOSequence struct {
+	IO    IO
+	Addrs AddrRangeSeq
+	Opts  IOOpts
+}
+
+// NumBytes returns s.Addrs.NumBytes().
+//
+// Note that NumBytes() may return 0 even if !s.Addrs.IsEmpty(), since
+// s.Addrs may contain a non-zero number of zero-length AddrRanges.
+// Many clients of
+// IOSequence currently do something like:
+//
+//     if ioseq.NumBytes() == 0 {
+//       return 0, nil
+//     }
+//     if f.availableBytes == 0 {
+//       return 0, syserror.ErrWouldBlock
+//     }
+//     return ioseq.CopyOutFrom(..., reader)
+//
+// In such cases, using s.Addrs.IsEmpty() will cause them to have the wrong
+// behavior for zero-length I/O. However, using s.NumBytes() == 0 instead means
+// that we will return success for zero-length I/O in cases where Linux would
+// return EFAULT due to a failed access_ok() check, so in the long term we
+// should move checks for ErrWouldBlock etc. into the body of
+// reader.ReadToBlocks and use s.Addrs.IsEmpty() instead.
+func (s IOSequence) NumBytes() int64 {
+	return s.Addrs.NumBytes()
+}
+
+// DropFirst returns a copy of s with s.Addrs.DropFirst(n).
+//
+// Preconditions: As for AddrRangeSeq.DropFirst.
+func (s IOSequence) DropFirst(n int) IOSequence {
+	return IOSequence{s.IO, s.Addrs.DropFirst(n), s.Opts}
+}
+
+// DropFirst64 returns a copy of s with s.Addrs.DropFirst64(n).
+//
+// Preconditions: As for AddrRangeSeq.DropFirst64.
+func (s IOSequence) DropFirst64(n int64) IOSequence {
+	return IOSequence{s.IO, s.Addrs.DropFirst64(n), s.Opts}
+}
+
+// TakeFirst returns a copy of s with s.Addrs.TakeFirst(n).
+//
+// Preconditions: As for AddrRangeSeq.TakeFirst.
+func (s IOSequence) TakeFirst(n int) IOSequence {
+	return IOSequence{s.IO, s.Addrs.TakeFirst(n), s.Opts}
+}
+
+// TakeFirst64 returns a copy of s with s.Addrs.TakeFirst64(n).
+//
+// Preconditions: As for AddrRangeSeq.TakeFirst64.
+func (s IOSequence) TakeFirst64(n int64) IOSequence {
+	return IOSequence{s.IO, s.Addrs.TakeFirst64(n), s.Opts}
+}
+
+// CopyOut invokes CopyOutVec over s.Addrs.
+//
+// As with CopyOutVec, if s.NumBytes() < len(src), the copy will be truncated
+// to s.NumBytes(), and a nil error will be returned.
+//
+// Preconditions: As for CopyOutVec.
+func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) {
+	return CopyOutVec(ctx, s.IO, s.Addrs, src, s.Opts)
+}
+
+// CopyIn invokes CopyInVec over s.Addrs.
+//
+// As with CopyInVec, if s.NumBytes() < len(dst), the copy will be truncated to
+// s.NumBytes(), and a nil error will be returned.
+//
+// Preconditions: As for CopyInVec.
+func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) {
+	return CopyInVec(ctx, s.IO, s.Addrs, dst, s.Opts)
+}
+
+// ZeroOut invokes ZeroOutVec over s.Addrs.
+//
+// As with ZeroOutVec, if s.NumBytes() < toZero, the write will be truncated
+// to s.NumBytes(), and a nil error will be returned.
+//
+// Preconditions: As for ZeroOutVec.
+func (s IOSequence) ZeroOut(ctx context.Context, toZero int64) (int64, error) {
+	return ZeroOutVec(ctx, s.IO, s.Addrs, toZero, s.Opts)
+}
+
+// CopyOutFrom invokes s.CopyOutFrom over s.Addrs.
+//
+// Preconditions: As for IO.CopyOutFrom.
+func (s IOSequence) CopyOutFrom(ctx context.Context, src safemem.Reader) (int64, error) {
+	return s.IO.CopyOutFrom(ctx, s.Addrs, src, s.Opts)
+}
+
+// CopyInTo invokes s.CopyInTo over s.Addrs.
+//
+// Preconditions: As for IO.CopyInTo.
+func (s IOSequence) CopyInTo(ctx context.Context, dst safemem.Writer) (int64, error) {
+	return s.IO.CopyInTo(ctx, s.Addrs, dst, s.Opts)
+}
+
+// Reader returns an io.Reader that reads from s. Reads beyond the end of s
+// return io.EOF. The preconditions that apply to s.CopyIn also apply to the
+// returned io.Reader.Read.
+func (s IOSequence) Reader(ctx context.Context) io.Reader {
+	return &ioSequenceReadWriter{ctx, s}
+}
+
+// Writer returns an io.Writer that writes to s. Writes beyond the end of s
+// return ErrEndOfIOSequence. The preconditions that apply to s.CopyOut also
+// apply to the returned io.Writer.Write.
+func (s IOSequence) Writer(ctx context.Context) io.Writer {
+	return &ioSequenceReadWriter{ctx, s}
+}
+
+// ErrEndOfIOSequence is returned by IOSequence.Writer().Write() when
+// attempting to write beyond the end of the IOSequence.
+var ErrEndOfIOSequence = errors.New("write beyond end of IOSequence")
+
+type ioSequenceReadWriter struct {
+	ctx context.Context
+	s   IOSequence
+}
+
+// Read implements io.Reader.Read.
+func (rw *ioSequenceReadWriter) Read(dst []byte) (int, error) {
+	n, err := rw.s.CopyIn(rw.ctx, dst)
+	rw.s = rw.s.DropFirst(n)
+	if err == nil && rw.s.NumBytes() == 0 {
+		err = io.EOF
+	}
+	return n, err
+}
+
+// Write implements io.Writer.Write.
+func (rw *ioSequenceReadWriter) Write(src []byte) (int, error) {
+	n, err := rw.s.CopyOut(rw.ctx, src)
+	rw.s = rw.s.DropFirst(n)
+	if err == nil && n < len(src) {
+		err = ErrEndOfIOSequence
+	}
+	return n, err
+}
diff --git a/pkg/usermem/usermem_arm64.go b/pkg/usermem/usermem_arm64.go
new file mode 100644
index 000000000..fdfc30a66
--- /dev/null
+++ b/pkg/usermem/usermem_arm64.go
@@ -0,0 +1,53 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package usermem
+
+import (
+	"encoding/binary"
+	"syscall"
+)
+
+const (
+	// PageSize is the system page size.
+	// arm64 support 4K/16K/64K page size,
+	// which can be get by syscall.Getpagesize().
+	// Currently, only 4K page size is supported.
+	PageSize = 1 << PageShift
+
+	// HugePageSize is the system huge page size.
+	HugePageSize = 1 << HugePageShift
+
+	// PageShift is the binary log of the system page size.
+	PageShift = 12
+
+	// HugePageShift is the binary log of the system huge page size.
+	// Should be calculated by "PageShift + (PageShift - 3)"
+	// when multiple page size support is ready.
+	HugePageShift = 21
+)
+
+var (
+	// ByteOrder is the native byte order (little endian).
+	ByteOrder = binary.LittleEndian
+)
+
+func init() {
+	// Make sure the page size is 4K on arm64 platform.
+	if size := syscall.Getpagesize(); size != PageSize {
+		panic("Only 4K page size is supported on arm64!")
+	}
+}
diff --git a/pkg/usermem/usermem_test.go b/pkg/usermem/usermem_test.go
new file mode 100644
index 000000000..bf3c5df2b
--- /dev/null
+++ b/pkg/usermem/usermem_test.go
@@ -0,0 +1,424 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package usermem
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"reflect"
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// newContext returns a context.Context that we can use in these tests (we
+// can't use contexttest because it depends on usermem).
+func newContext() context.Context {
+	return context.Background()
+}
+
+func newBytesIOString(s string) *BytesIO {
+	return &BytesIO{[]byte(s)}
+}
+
+func TestBytesIOCopyOutSuccess(t *testing.T) {
+	b := newBytesIOString("ABCDE")
+	n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{})
+	if wantN := 3; n != wantN || err != nil {
+		t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if got, want := b.Bytes, []byte("AfooE"); !bytes.Equal(got, want) {
+		t.Errorf("Bytes: got %q, wanted %q", got, want)
+	}
+}
+
+func TestBytesIOCopyOutFailure(t *testing.T) {
+	b := newBytesIOString("ABC")
+	n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{})
+	if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr {
+		t.Errorf("CopyOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr)
+	}
+	if got, want := b.Bytes, []byte("Afo"); !bytes.Equal(got, want) {
+		t.Errorf("Bytes: got %q, wanted %q", got, want)
+	}
+}
+
+func TestBytesIOCopyInSuccess(t *testing.T) {
+	b := newBytesIOString("AfooE")
+	var dst [3]byte
+	n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{})
+	if wantN := 3; n != wantN || err != nil {
+		t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) {
+		t.Errorf("dst: got %q, wanted %q", got, want)
+	}
+}
+
+func TestBytesIOCopyInFailure(t *testing.T) {
+	b := newBytesIOString("Afo")
+	var dst [3]byte
+	n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{})
+	if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr {
+		t.Errorf("CopyIn: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr)
+	}
+	if got, want := dst[:], []byte("fo\x00"); !bytes.Equal(got, want) {
+		t.Errorf("dst: got %q, wanted %q", got, want)
+	}
+}
+
+func TestBytesIOZeroOutSuccess(t *testing.T) {
+	b := newBytesIOString("ABCD")
+	n, err := b.ZeroOut(newContext(), 1, 2, IOOpts{})
+	if wantN := int64(2); n != wantN || err != nil {
+		t.Errorf("ZeroOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if got, want := b.Bytes, []byte("A\x00\x00D"); !bytes.Equal(got, want) {
+		t.Errorf("Bytes: got %q, wanted %q", got, want)
+	}
+}
+
+func TestBytesIOZeroOutFailure(t *testing.T) {
+	b := newBytesIOString("ABC")
+	n, err := b.ZeroOut(newContext(), 1, 3, IOOpts{})
+	if wantN, wantErr := int64(2), syserror.EFAULT; n != wantN || err != wantErr {
+		t.Errorf("ZeroOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr)
+	}
+	if got, want := b.Bytes, []byte("A\x00\x00"); !bytes.Equal(got, want) {
+		t.Errorf("Bytes: got %q, wanted %q", got, want)
+	}
+}
+
+func TestBytesIOCopyOutFromSuccess(t *testing.T) {
+	b := newBytesIOString("ABCDEFGH")
+	n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{
+		{Start: 4, End: 7},
+		{Start: 1, End: 4},
+	}), safemem.FromIOReader{bytes.NewBufferString("barfoo")}, IOOpts{})
+	if wantN := int64(6); n != wantN || err != nil {
+		t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if got, want := b.Bytes, []byte("AfoobarH"); !bytes.Equal(got, want) {
+		t.Errorf("Bytes: got %q, wanted %q", got, want)
+	}
+}
+
+func TestBytesIOCopyOutFromFailure(t *testing.T) {
+	b := newBytesIOString("ABCDE")
+	n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{
+		{Start: 1, End: 4},
+		{Start: 4, End: 7},
+	}), safemem.FromIOReader{bytes.NewBufferString("foobar")}, IOOpts{})
+	if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr {
+		t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr)
+	}
+	if got, want := b.Bytes, []byte("Afoob"); !bytes.Equal(got, want) {
+		t.Errorf("Bytes: got %q, wanted %q", got, want)
+	}
+}
+
+func TestBytesIOCopyInToSuccess(t *testing.T) {
+	b := newBytesIOString("AfoobarH")
+	var dst bytes.Buffer
+	n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{
+		{Start: 4, End: 7},
+		{Start: 1, End: 4},
+	}), safemem.FromIOWriter{&dst}, IOOpts{})
+	if wantN := int64(6); n != wantN || err != nil {
+		t.Errorf("CopyInTo: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if got, want := dst.Bytes(), []byte("barfoo"); !bytes.Equal(got, want) {
+		t.Errorf("dst.Bytes(): got %q, wanted %q", got, want)
+	}
+}
+
+func TestBytesIOCopyInToFailure(t *testing.T) {
+	b := newBytesIOString("Afoob")
+	var dst bytes.Buffer
+	n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{
+		{Start: 1, End: 4},
+		{Start: 4, End: 7},
+	}), safemem.FromIOWriter{&dst}, IOOpts{})
+	if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr {
+		t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr)
+	}
+	if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) {
+		t.Errorf("dst.Bytes(): got %q, wanted %q", got, want)
+	}
+}
+
+type testStruct struct {
+	Int8   int8
+	Uint8  uint8
+	Int16  int16
+	Uint16 uint16
+	Int32  int32
+	Uint32 uint32
+	Int64  int64
+	Uint64 uint64
+}
+
+func TestCopyObject(t *testing.T) {
+	wantObj := testStruct{1, 2, 3, 4, 5, 6, 7, 8}
+	wantN := binary.Size(wantObj)
+	b := &BytesIO{make([]byte, wantN)}
+	ctx := newContext()
+	if n, err := CopyObjectOut(ctx, b, 0, &wantObj, IOOpts{}); n != wantN || err != nil {
+		t.Fatalf("CopyObjectOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	var gotObj testStruct
+	if n, err := CopyObjectIn(ctx, b, 0, &gotObj, IOOpts{}); n != wantN || err != nil {
+		t.Errorf("CopyObjectIn: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if gotObj != wantObj {
+		t.Errorf("CopyObject round trip: got %+v, wanted %+v", gotObj, wantObj)
+	}
+}
+
+func TestCopyStringInShort(t *testing.T) {
+	// Tests for string length <= copyStringIncrement.
+	want := strings.Repeat("A", copyStringIncrement-2)
+	mem := want + "\x00"
+	if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil {
+		t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want)
+	}
+}
+
+func TestCopyStringInLong(t *testing.T) {
+	// Tests for copyStringIncrement < string length <= copyStringMaxInitBufLen
+	// (requiring multiple calls to IO.CopyIn()).
+	want := strings.Repeat("A", copyStringIncrement*3/4) + strings.Repeat("B", copyStringIncrement*3/4)
+	mem := want + "\x00"
+	if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil {
+		t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want)
+	}
+}
+
+func TestCopyStringInVeryLong(t *testing.T) {
+	// Tests for string length > copyStringMaxInitBufLen (requiring buffer
+	// reallocation).
+	want := strings.Repeat("A", copyStringMaxInitBufLen*3/4) + strings.Repeat("B", copyStringMaxInitBufLen*3/4)
+	mem := want + "\x00"
+	if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringMaxInitBufLen, IOOpts{}); got != want || err != nil {
+		t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want)
+	}
+}
+
+func TestCopyStringInNoTerminatingZeroByte(t *testing.T) {
+	want := strings.Repeat("A", copyStringIncrement-1)
+	got, err := CopyStringIn(newContext(), newBytesIOString(want), 0, 2*copyStringIncrement, IOOpts{})
+	if wantErr := syserror.EFAULT; got != want || err != wantErr {
+		t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr)
+	}
+}
+
+func TestCopyStringInTruncatedByMaxlen(t *testing.T) {
+	got, err := CopyStringIn(newContext(), newBytesIOString(strings.Repeat("A", 10)), 0, 5, IOOpts{})
+	if want, wantErr := strings.Repeat("A", 5), syserror.ENAMETOOLONG; got != want || err != wantErr {
+		t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr)
+	}
+}
+
+func TestCopyInt32StringsInVec(t *testing.T) {
+	for _, test := range []struct {
+		str     string
+		n       int
+		initial []int32
+		final   []int32
+	}{
+		{
+			str:     "100 200",
+			n:       len("100 200"),
+			initial: []int32{1, 2},
+			final:   []int32{100, 200},
+		},
+		{
+			// Fewer values ok
+			str:     "100",
+			n:       len("100"),
+			initial: []int32{1, 2},
+			final:   []int32{100, 2},
+		},
+		{
+			// Extra values ok
+			str:     "100 200 300",
+			n:       len("100 200 "),
+			initial: []int32{1, 2},
+			final:   []int32{100, 200},
+		},
+		{
+			// Leading and trailing whitespace ok
+			str:     " 100\t200\n",
+			n:       len(" 100\t200\n"),
+			initial: []int32{1, 2},
+			final:   []int32{100, 200},
+		},
+	} {
+		t.Run(fmt.Sprintf("%q", test.str), func(t *testing.T) {
+			src := BytesIOSequence([]byte(test.str))
+			dsts := append([]int32(nil), test.initial...)
+			if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); n != int64(test.n) || err != nil {
+				t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (%d, nil)", n, err, test.n)
+			}
+			if !reflect.DeepEqual(dsts, test.final) {
+				t.Errorf("dsts: got %v, wanted %v", dsts, test.final)
+			}
+		})
+	}
+}
+
+func TestCopyInt32StringsInVecRequiresOneValidValue(t *testing.T) {
+	for _, s := range []string{"", "\n", "a123"} {
+		t.Run(fmt.Sprintf("%q", s), func(t *testing.T) {
+			src := BytesIOSequence([]byte(s))
+			initial := []int32{1, 2}
+			dsts := append([]int32(nil), initial...)
+			if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); err != syserror.EINVAL {
+				t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (_, %v)", n, err, syserror.EINVAL)
+			}
+			if !reflect.DeepEqual(dsts, initial) {
+				t.Errorf("dsts: got %v, wanted %v", dsts, initial)
+			}
+		})
+	}
+}
+
+func TestIOSequenceCopyOut(t *testing.T) {
+	buf := []byte("ABCD")
+	s := BytesIOSequence(buf)
+
+	// CopyOut limited by len(src).
+	n, err := s.CopyOut(newContext(), []byte("fo"))
+	if wantN := 2; n != wantN || err != nil {
+		t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if want := []byte("foCD"); !bytes.Equal(buf, want) {
+		t.Errorf("buf: got %q, wanted %q", buf, want)
+	}
+	s = s.DropFirst(2)
+	if got, want := s.NumBytes(), int64(2); got != want {
+		t.Errorf("NumBytes: got %v, wanted %v", got, want)
+	}
+
+	// CopyOut limited by s.NumBytes().
+	n, err = s.CopyOut(newContext(), []byte("obar"))
+	if wantN := 2; n != wantN || err != nil {
+		t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if want := []byte("foob"); !bytes.Equal(buf, want) {
+		t.Errorf("buf: got %q, wanted %q", buf, want)
+	}
+	s = s.DropFirst(2)
+	if got, want := s.NumBytes(), int64(0); got != want {
+		t.Errorf("NumBytes: got %v, wanted %v", got, want)
+	}
+}
+
+func TestIOSequenceCopyIn(t *testing.T) {
+	s := BytesIOSequence([]byte("foob"))
+	dst := []byte("ABCDEF")
+
+	// CopyIn limited by len(dst).
+	n, err := s.CopyIn(newContext(), dst[:2])
+	if wantN := 2; n != wantN || err != nil {
+		t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if want := []byte("foCDEF"); !bytes.Equal(dst, want) {
+		t.Errorf("dst: got %q, wanted %q", dst, want)
+	}
+	s = s.DropFirst(2)
+	if got, want := s.NumBytes(), int64(2); got != want {
+		t.Errorf("NumBytes: got %v, wanted %v", got, want)
+	}
+
+	// CopyIn limited by s.Remaining().
+	n, err = s.CopyIn(newContext(), dst[2:])
+	if wantN := 2; n != wantN || err != nil {
+		t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if want := []byte("foobEF"); !bytes.Equal(dst, want) {
+		t.Errorf("dst: got %q, wanted %q", dst, want)
+	}
+	s = s.DropFirst(2)
+	if got, want := s.NumBytes(), int64(0); got != want {
+		t.Errorf("NumBytes: got %v, wanted %v", got, want)
+	}
+}
+
+func TestIOSequenceZeroOut(t *testing.T) {
+	buf := []byte("ABCD")
+	s := BytesIOSequence(buf)
+
+	// ZeroOut limited by toZero.
+	n, err := s.ZeroOut(newContext(), 2)
+	if wantN := int64(2); n != wantN || err != nil {
+		t.Errorf("ZeroOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if want := []byte("\x00\x00CD"); !bytes.Equal(buf, want) {
+		t.Errorf("buf: got %q, wanted %q", buf, want)
+	}
+	s = s.DropFirst(2)
+	if got, want := s.NumBytes(), int64(2); got != want {
+		t.Errorf("NumBytes: got %v, wanted %v", got, want)
+	}
+
+	// ZeroOut limited by s.NumBytes().
+	n, err = s.ZeroOut(newContext(), 4)
+	if wantN := int64(2); n != wantN || err != nil {
+		t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if want := []byte("\x00\x00\x00\x00"); !bytes.Equal(buf, want) {
+		t.Errorf("buf: got %q, wanted %q", buf, want)
+	}
+	s = s.DropFirst(2)
+	if got, want := s.NumBytes(), int64(0); got != want {
+		t.Errorf("NumBytes: got %v, wanted %v", got, want)
+	}
+}
+
+func TestIOSequenceTakeFirst(t *testing.T) {
+	s := BytesIOSequence([]byte("foobar"))
+	if got, want := s.NumBytes(), int64(6); got != want {
+		t.Errorf("NumBytes: got %v, wanted %v", got, want)
+	}
+
+	s = s.TakeFirst(3)
+	if got, want := s.NumBytes(), int64(3); got != want {
+		t.Errorf("NumBytes: got %v, wanted %v", got, want)
+	}
+
+	// TakeFirst(n) where n > s.NumBytes() is a no-op.
+	s = s.TakeFirst(9)
+	if got, want := s.NumBytes(), int64(3); got != want {
+		t.Errorf("NumBytes: got %v, wanted %v", got, want)
+	}
+
+	var dst [3]byte
+	n, err := s.CopyIn(newContext(), dst[:])
+	if wantN := 3; n != wantN || err != nil {
+		t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN)
+	}
+	if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) {
+		t.Errorf("dst: got %q, wanted %q", got, want)
+	}
+	s = s.DropFirst(3)
+	if got, want := s.NumBytes(), int64(0); got != want {
+		t.Errorf("NumBytes: got %v, wanted %v", got, want)
+	}
+}
diff --git a/pkg/usermem/usermem_unsafe.go b/pkg/usermem/usermem_unsafe.go
new file mode 100644
index 000000000..876783e78
--- /dev/null
+++ b/pkg/usermem/usermem_unsafe.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package usermem
+
+import (
+	"unsafe"
+)
+
+// stringFromImmutableBytes is equivalent to string(bs), except that it never
+// copies even if escape analysis can't prove that bs does not escape. This is
+// only valid if bs is never mutated after stringFromImmutableBytes returns.
+func stringFromImmutableBytes(bs []byte) string {
+	// Compare strings.Builder.String().
+	return *(*string)(unsafe.Pointer(&bs))
+}
diff --git a/pkg/usermem/usermem_x86.go b/pkg/usermem/usermem_x86.go
new file mode 100644
index 000000000..8059b72d2
--- /dev/null
+++ b/pkg/usermem/usermem_x86.go
@@ -0,0 +1,38 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64 i386
+
+package usermem
+
+import "encoding/binary"
+
+const (
+	// PageSize is the system page size.
+	PageSize = 1 << PageShift
+
+	// HugePageSize is the system huge page size.
+	HugePageSize = 1 << HugePageShift
+
+	// PageShift is the binary log of the system page size.
+	PageShift = 12
+
+	// HugePageShift is the binary log of the system huge page size.
+	HugePageShift = 21
+)
+
+var (
+	// ByteOrder is the native byte order (little endian).
+	ByteOrder = binary.LittleEndian
+)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index f3ebc0231..a96c80261 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -30,6 +30,7 @@ go_library(
     deps = [
         "//pkg/abi",
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/control/server",
         "//pkg/cpuid",
         "//pkg/eventchannel",
@@ -39,7 +40,6 @@ go_library(
         "//pkg/refs",
         "//pkg/sentry/arch",
         "//pkg/sentry/arch:registers_go_proto",
-        "//pkg/sentry/context",
         "//pkg/sentry/control",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/dev",
@@ -71,7 +71,6 @@ go_library(
         "//pkg/sentry/time",
         "//pkg/sentry/unimpl:unimplemented_syscall_go_proto",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/watchdog",
         "//pkg/sync",
         "//pkg/syserror",
@@ -88,6 +87,7 @@ go_library(
         "//pkg/tcpip/transport/tcp",
         "//pkg/tcpip/transport/udp",
         "//pkg/urpc",
+        "//pkg/usermem",
         "//runsc/boot/filter",
         "//runsc/boot/platforms",
         "//runsc/specutils",
@@ -111,7 +111,7 @@ go_test(
         "//pkg/control/server",
         "//pkg/log",
         "//pkg/p9",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/auth",
         "//pkg/sync",
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
index e5de1f3d7..417d2d5fb 100644
--- a/runsc/boot/fds.go
+++ b/runsc/boot/fds.go
@@ -17,7 +17,7 @@ package boot
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 421ccd255..0f62842ea 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -32,8 +32,8 @@ import (
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index bec0dc292..44aa63196 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -27,7 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/control/server"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
diff --git a/runsc/boot/user.go b/runsc/boot/user.go
index 56cc12ee0..f0aa52135 100644
--- a/runsc/boot/user.go
+++ b/runsc/boot/user.go
@@ -22,10 +22,10 @@ import (
 	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type fileReader struct {
diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go
index 9aee2ad07..fb4e13dfb 100644
--- a/runsc/boot/user_test.go
+++ b/runsc/boot/user_test.go
@@ -23,7 +23,7 @@ import (
 	"testing"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
diff --git a/tools/go_marshal/defs.bzl b/tools/go_marshal/defs.bzl
index 2918ceffe..d79786a68 100644
--- a/tools/go_marshal/defs.bzl
+++ b/tools/go_marshal/defs.bzl
@@ -54,8 +54,8 @@ go_marshal = rule(
 # marshal_deps are the dependencies requied by generated code.
 marshal_deps = [
     "//tools/go_marshal/marshal",
-    "//pkg/sentry/platform/safecopy",
-    "//pkg/sentry/usermem",
+    "//pkg/safecopy",
+    "//pkg/usermem",
 ]
 
 # marshal_test_deps are required by test targets.
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 8392f3f6d..af90bdecb 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -27,8 +27,8 @@ import (
 
 const (
 	marshalImport  = "gvisor.dev/gvisor/tools/go_marshal/marshal"
-	usermemImport  = "gvisor.dev/gvisor/pkg/sentry/usermem"
-	safecopyImport = "gvisor.dev/gvisor/pkg/sentry/platform/safecopy"
+	safecopyImport = "gvisor.dev/gvisor/pkg/safecopy"
+	usermemImport  = "gvisor.dev/gvisor/pkg/usermem"
 )
 
 // List of identifiers we use in generated code, that may conflict a
diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD
index 38ba49fed..e345e3a8e 100644
--- a/tools/go_marshal/test/BUILD
+++ b/tools/go_marshal/test/BUILD
@@ -15,7 +15,7 @@ go_test(
     deps = [
         ":test",
         "//pkg/binary",
-        "//pkg/sentry/usermem",
+        "//pkg/usermem",
         "//tools/go_marshal/analysis",
     ],
 )
diff --git a/tools/go_marshal/test/benchmark_test.go b/tools/go_marshal/test/benchmark_test.go
index e70db06d8..e12403741 100644
--- a/tools/go_marshal/test/benchmark_test.go
+++ b/tools/go_marshal/test/benchmark_test.go
@@ -22,7 +22,7 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/tools/go_marshal/analysis"
 	test "gvisor.dev/gvisor/tools/go_marshal/test"
 )
-- 
cgit v1.2.3


From 253c9e666cf7d52352da97d764818e510f1387c0 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 27 Jan 2020 15:37:28 -0800
Subject: Cleanup glog and add real caller information.

In general, we've learned that logging must be avoided at all
costs in the hot path. It's unlikely that the optimizations
here were significant in any case, since buffer would certainly
escape.

This also adds a test to ensure that the caller identification
works as expected, and so that logging can be benchmarked.

Original:
BenchmarkGoogleLogging-6   	 1222255	       949 ns/op

With this change:
BenchmarkGoogleLogging-6   	  517323	      2346 ns/op

Fixes #184

PiperOrigin-RevId: 291815420
---
 pkg/log/BUILD          |   1 -
 pkg/log/glog.go        | 164 +++++++++++++------------------------------------
 pkg/log/glog_unsafe.go |  32 ----------
 pkg/log/json_k8s.go    |   2 +-
 pkg/log/log.go         |  20 ++++--
 pkg/log/log_test.go    |  35 +++++++++++
 runsc/boot/compat.go   |   2 +-
 runsc/main.go          |   4 +-
 8 files changed, 99 insertions(+), 161 deletions(-)
 delete mode 100644 pkg/log/glog_unsafe.go

diff --git a/pkg/log/BUILD b/pkg/log/BUILD
index 935d06963..a7c8f7bef 100644
--- a/pkg/log/BUILD
+++ b/pkg/log/BUILD
@@ -6,7 +6,6 @@ go_library(
     name = "log",
     srcs = [
         "glog.go",
-        "glog_unsafe.go",
         "json.go",
         "json_k8s.go",
         "log.go",
diff --git a/pkg/log/glog.go b/pkg/log/glog.go
index 5732785b4..cab5fae55 100644
--- a/pkg/log/glog.go
+++ b/pkg/log/glog.go
@@ -15,149 +15,73 @@
 package log
 
 import (
+	"fmt"
 	"os"
+	"runtime"
+	"strings"
 	"time"
 )
 
 // GoogleEmitter is a wrapper that emits logs in a format compatible with
 // package github.com/golang/glog.
 type GoogleEmitter struct {
-	// Emitter is the underlying emitter.
-	Emitter
-}
-
-// buffer is a simple inline buffer to avoid churn. The data slice is generally
-// kept to the local byte array, and we avoid having to allocate it on the heap.
-type buffer struct {
-	local [256]byte
-	data  []byte
-}
-
-func (b *buffer) start() {
-	b.data = b.local[:0]
-}
-
-func (b *buffer) String() string {
-	return unsafeString(b.data)
-}
-
-func (b *buffer) write(c byte) {
-	b.data = append(b.data, c)
-}
-
-func (b *buffer) writeAll(d []byte) {
-	b.data = append(b.data, d...)
-}
-
-func (b *buffer) writeOneDigit(d byte) {
-	b.write('0' + d)
-}
-
-func (b *buffer) writeTwoDigits(v int) {
-	v = v % 100
-	b.writeOneDigit(byte(v / 10))
-	b.writeOneDigit(byte(v % 10))
-}
-
-func (b *buffer) writeSixDigits(v int) {
-	v = v % 1000000
-	b.writeOneDigit(byte(v / 100000))
-	b.writeOneDigit(byte((v % 100000) / 10000))
-	b.writeOneDigit(byte((v % 10000) / 1000))
-	b.writeOneDigit(byte((v % 1000) / 100))
-	b.writeOneDigit(byte((v % 100) / 10))
-	b.writeOneDigit(byte(v % 10))
-}
-
-func calculateBytes(v int, pad int) []byte {
-	var d []byte
-	r := 1
-
-	for n := 10; v >= r; n = n * 10 {
-		d = append(d, '0'+byte((v%n)/r))
-		r = n
-	}
-
-	for i := len(d); i < pad; i++ {
-		d = append(d, ' ')
-	}
-
-	for i := 0; i < len(d)/2; i++ {
-		d[i], d[len(d)-(i+1)] = d[len(d)-(i+1)], d[i]
-	}
-	return d
+	Writer
 }
 
 // pid is used for the threadid component of the header.
-//
-// The glog package logger uses 7 spaces of padding. See
-// glob.loggingT.formatHeader.
-var pid = calculateBytes(os.Getpid(), 7)
-
-// caller is faked out as the caller. See FIXME below.
-var caller = []byte("x:0")
+var pid = os.Getpid()
 
 // Emit emits the message, google-style.
-func (g GoogleEmitter) Emit(level Level, timestamp time.Time, format string, args ...interface{}) {
-	var b buffer
-	b.start()
-
-	// Log lines have this form:
-	//   Lmmdd hh:mm:ss.uuuuuu threadid file:line] msg...
-	//
-	// where the fields are defined as follows:
-	//   L                A single character, representing the log level (eg 'I' for INFO)
-	//   mm               The month (zero padded; ie May is '05')
-	//   dd               The day (zero padded)
-	//   hh:mm:ss.uuuuuu  Time in hours, minutes and fractional seconds
-	//   threadid         The space-padded thread ID as returned by GetTID()
-	//   file             The file name
-	//   line             The line number
-	//   msg              The user-supplied message
-
+//
+// Log lines have this form:
+//   Lmmdd hh:mm:ss.uuuuuu threadid file:line] msg...
+//
+// where the fields are defined as follows:
+//   L                A single character, representing the log level (eg 'I' for INFO)
+//   mm               The month (zero padded; ie May is '05')
+//   dd               The day (zero padded)
+//   hh:mm:ss.uuuuuu  Time in hours, minutes and fractional seconds
+//   threadid         The space-padded thread ID as returned by GetTID()
+//   file             The file name
+//   line             The line number
+//   msg              The user-supplied message
+//
+func (g *GoogleEmitter) Emit(level Level, timestamp time.Time, format string, args ...interface{}) {
 	// Log level.
+	prefix := byte('?')
 	switch level {
 	case Debug:
-		b.write('D')
+		prefix = byte('D')
 	case Info:
-		b.write('I')
+		prefix = byte('I')
 	case Warning:
-		b.write('W')
+		prefix = byte('W')
 	}
 
 	// Timestamp.
 	_, month, day := timestamp.Date()
 	hour, minute, second := timestamp.Clock()
-	b.writeTwoDigits(int(month))
-	b.writeTwoDigits(int(day))
-	b.write(' ')
-	b.writeTwoDigits(int(hour))
-	b.write(':')
-	b.writeTwoDigits(int(minute))
-	b.write(':')
-	b.writeTwoDigits(int(second))
-	b.write('.')
-	b.writeSixDigits(int(timestamp.Nanosecond() / 1000))
-	b.write(' ')
-
-	// The pid.
-	b.writeAll(pid)
-	b.write(' ')
-
-	// FIXME(b/73383460): The caller, fabricated. This really sucks, but it
-	// is unacceptable to put runtime.Callers() in the hot path.
-	b.writeAll(caller)
-	b.write(']')
-	b.write(' ')
-
-	// User-provided format string, copied.
-	for i := 0; i < len(format); i++ {
-		b.write(format[i])
+	microsecond := int(timestamp.Nanosecond() / 1000)
+
+	// 0 = this frame.
+	// 1 = Debugf, etc.
+	// 2 = Caller.
+	_, file, line, ok := runtime.Caller(2)
+	if ok {
+		// Trim any directory path from the file.
+		slash := strings.LastIndexByte(file, byte('/'))
+		if slash >= 0 {
+			file = file[slash+1:]
+		}
+	} else {
+		// We don't have a filename.
+		file = "???"
+		line = 0
 	}
 
-	// End with a newline.
-	b.write('\n')
+	// Generate the message.
+	message := fmt.Sprintf(format, args...)
 
-	// Pass to the underlying routine.
-	g.Emitter.Emit(level, timestamp, b.String(), args...)
+	// Emit the formatted result.
+	fmt.Fprintf(&g.Writer, "%c%02d%02d %02d:%02d:%02d.%06d % 7d %s:%d] %s\n", prefix, int(month), day, hour, minute, second, microsecond, pid, file, line, message)
 }
diff --git a/pkg/log/glog_unsafe.go b/pkg/log/glog_unsafe.go
deleted file mode 100644
index ea17ae349..000000000
--- a/pkg/log/glog_unsafe.go
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package log
-
-import (
-	"reflect"
-	"unsafe"
-)
-
-// unsafeString returns a string that points to the given byte array.
-// The byte array must be preserved until the string is disposed.
-func unsafeString(data []byte) (s string) {
-	if len(data) == 0 {
-		return
-	}
-
-	(*reflect.StringHeader)(unsafe.Pointer(&s)).Data = uintptr(unsafe.Pointer(&data[0]))
-	(*reflect.StringHeader)(unsafe.Pointer(&s)).Len = len(data)
-	return
-}
diff --git a/pkg/log/json_k8s.go b/pkg/log/json_k8s.go
index c2c019915..cee6eb514 100644
--- a/pkg/log/json_k8s.go
+++ b/pkg/log/json_k8s.go
@@ -33,7 +33,7 @@ type K8sJSONEmitter struct {
 }
 
 // Emit implements Emitter.Emit.
-func (e K8sJSONEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (e *K8sJSONEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
 	j := k8sJSONLog{
 		Log:   fmt.Sprintf(format, v...),
 		Level: level,
diff --git a/pkg/log/log.go b/pkg/log/log.go
index 91a81b288..5056f17e6 100644
--- a/pkg/log/log.go
+++ b/pkg/log/log.go
@@ -17,6 +17,18 @@
 // This is separate from the standard logging package because logging may be a
 // high-impact activity, and therefore we wanted to provide as much flexibility
 // as possible in the underlying implementation.
+//
+// Note that logging should still be considered high-impact, and should not be
+// done in the hot path. If necessary, logging statements should be protected
+// with guards regarding the logging level. For example,
+//
+//	if log.IsLogging(log.Debug) {
+//		log.Debugf(...)
+//	}
+//
+// This is because the log.Debugf(...) statement alone will generate a
+// significant amount of garbage and churn in many cases, even if no log
+// message is ultimately emitted.
 package log
 
 import (
@@ -138,8 +150,8 @@ func (l *Writer) Emit(level Level, timestamp time.Time, format string, args ...i
 type MultiEmitter []Emitter
 
 // Emit emits to all emitters.
-func (m MultiEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
-	for _, e := range m {
+func (m *MultiEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+	for _, e := range *m {
 		e.Emit(level, timestamp, format, v...)
 	}
 }
@@ -155,7 +167,7 @@ type TestEmitter struct {
 }
 
 // Emit emits to the TestLogger.
-func (t TestEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (t *TestEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
 	t.Logf(format, v...)
 }
 
@@ -332,5 +344,5 @@ func CopyStandardLogTo(l Level) error {
 
 func init() {
 	// Store the initial value for the log.
-	log.Store(&BasicLogger{Level: Info, Emitter: GoogleEmitter{&Writer{Next: os.Stderr}}})
+	log.Store(&BasicLogger{Level: Info, Emitter: &GoogleEmitter{Writer{Next: os.Stderr}}})
 }
diff --git a/pkg/log/log_test.go b/pkg/log/log_test.go
index 0634e7c1f..402cc29ae 100644
--- a/pkg/log/log_test.go
+++ b/pkg/log/log_test.go
@@ -16,18 +16,23 @@ package log
 
 import (
 	"fmt"
+	"strings"
 	"testing"
 )
 
 type testWriter struct {
 	lines []string
 	fail  bool
+	limit int
 }
 
 func (w *testWriter) Write(bytes []byte) (int, error) {
 	if w.fail {
 		return 0, fmt.Errorf("simulated failure")
 	}
+	if w.limit > 0 && len(w.lines) >= w.limit {
+		return len(bytes), nil
+	}
 	w.lines = append(w.lines, string(bytes))
 	return len(bytes), nil
 }
@@ -68,3 +73,33 @@ func TestDropMessages(t *testing.T) {
 		}
 	}
 }
+
+func TestCaller(t *testing.T) {
+	tw := &testWriter{}
+	e := &GoogleEmitter{Writer: Writer{Next: tw}}
+	bl := &BasicLogger{
+		Emitter: e,
+		Level:   Debug,
+	}
+	bl.Debugf("testing...\n") // Just for file + line.
+	if len(tw.lines) != 1 {
+		t.Errorf("expected 1 line, got %d", len(tw.lines))
+	}
+	if !strings.Contains(tw.lines[0], "log_test.go") {
+		t.Errorf("expected log_test.go, got %q", tw.lines[0])
+	}
+}
+
+func BenchmarkGoogleLogging(b *testing.B) {
+	tw := &testWriter{
+		limit: 1, // Only record one message.
+	}
+	e := &GoogleEmitter{Writer: Writer{Next: tw}}
+	bl := &BasicLogger{
+		Emitter: e,
+		Level:   Debug,
+	}
+	for i := 0; i < b.N; i++ {
+		bl.Debugf("hello %d, %d, %d", 1, 2, 3)
+	}
+}
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 9c23b9553..8995d678e 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -65,7 +65,7 @@ func newCompatEmitter(logFD int) (*compatEmitter, error) {
 
 	if logFD > 0 {
 		f := os.NewFile(uintptr(logFD), "user log file")
-		target := log.MultiEmitter{c.sink, log.K8sJSONEmitter{log.Writer{Next: f}}}
+		target := &log.MultiEmitter{c.sink, &log.K8sJSONEmitter{log.Writer{Next: f}}}
 		c.sink = &log.BasicLogger{Level: log.Info, Emitter: target}
 	}
 	return c, nil
diff --git a/runsc/main.go b/runsc/main.go
index abf929511..c2b0d9a9e 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -288,7 +288,7 @@ func main() {
 	}
 
 	if *alsoLogToStderr {
-		e = log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
+		e = &log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
 	}
 
 	log.SetTarget(e)
@@ -333,7 +333,7 @@ func main() {
 func newEmitter(format string, logFile io.Writer) log.Emitter {
 	switch format {
 	case "text":
-		return &log.GoogleEmitter{&log.Writer{Next: logFile}}
+		return &log.GoogleEmitter{log.Writer{Next: logFile}}
 	case "json":
 		return &log.JSONEmitter{log.Writer{Next: logFile}}
 	case "json-k8s":
-- 
cgit v1.2.3


From d6a2e01d3e57e0837c7e5cfda3b56c4dcfbb4627 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 27 Jan 2020 16:40:46 -0800
Subject: Address GH comments.

---
 pkg/sentry/socket/netfilter/netfilter.go | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index b8848f08a..a06562743 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -43,7 +43,7 @@ const (
 // a metadata struct when the tables are written, and when they are read out we
 // verify that certain fields are the same.
 //
-// metadata is opaque to netstack.
+// metadata is used by this serialization/deserializing code, not netstack.
 type metadata struct {
 	HookEntry  [linux.NF_INET_NUMHOOKS]uint32
 	Underflow  [linux.NF_INET_NUMHOOKS]uint32
@@ -51,14 +51,10 @@ type metadata struct {
 	Size       uint32
 }
 
-const enableDebug = false
-
 // nflog logs messages related to the writing and reading of iptables, but only
 // when enableDebug is true.
 func nflog(format string, args ...interface{}) {
-	if enableDebug {
-		log.Infof("netfilter: "+format, args...)
-	}
+	log.Infof("netfilter: "+format, args...)
 }
 
 // GetInfo returns information about iptables.
@@ -233,14 +229,12 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	return entries, meta, nil
 }
 
-// TODO: SOMEHOW THIS IS NOT GETTING APPENDED!
 func marshalMatcher(matcher iptables.Matcher) []byte {
 	switch m := matcher.(type) {
 	case *iptables.UDPMatcher:
 		return marshalUDPMatcher(m)
 	default:
-		// TODO(gvisor.dev/issue/170): We don't support any matchers
-		// yet, so any call to marshalMatcher will panic.
+		// TODO(gvisor.dev/issue/170): Support other matchers.
 		panic(fmt.Errorf("unknown matcher of type %T", matcher))
 	}
 }
@@ -249,11 +243,11 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
 	nflog("convert to binary: marshalling UDP matcher: %+v", matcher)
 
 	// We have to pad this struct size to a multiple of 8 bytes.
-	const size = linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6
+	size := alignUp(linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP, 8)
 
 	linuxMatcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
-			MatchSize: size,
+			MatchSize: uint16(size),
 		},
 		Data: make([]byte, 0, linux.SizeOfXTUDP),
 	}
@@ -270,7 +264,7 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
 
 	buf := make([]byte, 0, size)
 	buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher)
-	buf = append(buf, []byte{0, 0, 0, 0, 0, 0}...)
+	buf = append(buf, make([]byte, size-len(buf))...)
 	nflog("convert to binary: marshalled UDP matcher into %v", buf)
 	return buf[:]
 }
@@ -410,7 +404,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 
 		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
-		// that they only work for certiain protocols, hooks, tables.
+		// that they only work for certain protocols, hooks, tables.
 		// Get matchers.
 		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
 		if len(optVal) < int(matchersSize) {
@@ -684,3 +678,8 @@ func hookFromLinux(hook int) iptables.Hook {
 	}
 	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
 }
+
+// alignUp rounds a length up to an alignment. align must be a power of 2.
+func alignUp(length int, align uint) int {
+	return (length + int(align) - 1) & ^(int(align) - 1)
+}
-- 
cgit v1.2.3


From 5776a7b6f6b52faf6e0735c3f4a892639c1bd773 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 27 Jan 2020 18:26:26 -0800
Subject: Fix header ordering and format all C++ code.

PiperOrigin-RevId: 291844200
---
 CONTRIBUTING.md                                    |  3 +-
 test/syscalls/linux/32bit.cc                       |  2 +-
 test/syscalls/linux/fpsig_fork.cc                  |  4 +-
 test/syscalls/linux/fpsig_nested.cc                |  8 +--
 test/syscalls/linux/madvise.cc                     |  4 +-
 test/syscalls/linux/mempolicy.cc                   |  6 +-
 test/syscalls/linux/mlock.cc                       |  1 -
 test/syscalls/linux/msync.cc                       |  4 +-
 test/syscalls/linux/ptrace.cc                      |  3 +-
 test/syscalls/linux/seccomp.cc                     |  9 ++-
 test/syscalls/linux/sigaltstack.cc                 |  4 +-
 test/syscalls/linux/sigiret.cc                     |  4 +-
 test/syscalls/linux/socket_stream_blocking.cc      | 64 +++++++++++-----------
 test/syscalls/linux/stat.cc                        |  2 +-
 .../linux/udp_socket_errqueue_test_case.cc         |  3 +-
 test/util/capability_util.cc                       |  8 +--
 test/util/fs_util.cc                               |  2 +-
 test/util/multiprocess_util.h                      |  3 +-
 18 files changed, 68 insertions(+), 66 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5d46168bc..55a1ad0d9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -36,7 +36,8 @@ directory tree.
 
 All Go code should conform to the [Go style guidelines][gostyle]. C++ code
 should conform to the [Google C++ Style Guide][cppstyle] and the guidelines
-described for [tests][teststyle].
+described for [tests][teststyle]. Note that code may be automatically formatted
+per the guidelines when merged.
 
 As a secure runtime, we need to maintain the safety of all of code included in
 gVisor. The following rules help mitigate issues.
diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index a7cbee06b..6a15d47e1 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -71,7 +71,7 @@ void ExitGroup32(const char instruction[2], int code) {
       "iretl\n"
       "int $3\n"
       :
-      : [code] "m"(code), [ip] "d"(m.ptr())
+      : [ code ] "m"(code), [ ip ] "d"(m.ptr())
       : "rax", "rbx", "rsp");
 }
 
diff --git a/test/syscalls/linux/fpsig_fork.cc b/test/syscalls/linux/fpsig_fork.cc
index e7e9f06a1..a346f1f00 100644
--- a/test/syscalls/linux/fpsig_fork.cc
+++ b/test/syscalls/linux/fpsig_fork.cc
@@ -76,8 +76,8 @@ TEST(FPSigTest, Fork) {
       "movl %[sig], %%edx;"
       "syscall;"
       :
-      : [killnr] "i"(__NR_tgkill), [parent] "rm"(parent),
-        [tid] "rm"(parent_tid), [sig] "i"(SIGUSR1)
+      : [ killnr ] "i"(__NR_tgkill), [ parent ] "rm"(parent),
+        [ tid ] "rm"(parent_tid), [ sig ] "i"(SIGUSR1)
       : "rax", "rdi", "rsi", "rdx",
         // Clobbered by syscall.
         "rcx", "r11");
diff --git a/test/syscalls/linux/fpsig_nested.cc b/test/syscalls/linux/fpsig_nested.cc
index 395463aed..c476a8e7a 100644
--- a/test/syscalls/linux/fpsig_nested.cc
+++ b/test/syscalls/linux/fpsig_nested.cc
@@ -61,8 +61,8 @@ void sigusr1(int s, siginfo_t* siginfo, void* _uc) {
       "movl %[sig], %%edx;"
       "syscall;"
       :
-      : [killnr] "i"(__NR_tgkill), [pid] "rm"(pid), [tid] "rm"(tid),
-        [sig] "i"(SIGUSR2)
+      : [ killnr ] "i"(__NR_tgkill), [ pid ] "rm"(pid), [ tid ] "rm"(tid),
+        [ sig ] "i"(SIGUSR2)
       : "rax", "rdi", "rsi", "rdx",
         // Clobbered by syscall.
         "rcx", "r11");
@@ -107,8 +107,8 @@ TEST(FPSigTest, NestedSignals) {
       "movl %[sig], %%edx;"
       "syscall;"
       :
-      : [killnr] "i"(__NR_tgkill), [pid] "rm"(pid), [tid] "rm"(tid),
-        [sig] "i"(SIGUSR1)
+      : [ killnr ] "i"(__NR_tgkill), [ pid ] "rm"(pid), [ tid ] "rm"(tid),
+        [ sig ] "i"(SIGUSR1)
       : "rax", "rdi", "rsi", "rdx",
         // Clobbered by syscall.
         "rcx", "r11");
diff --git a/test/syscalls/linux/madvise.cc b/test/syscalls/linux/madvise.cc
index 7fd0ea20c..dbd54ff2a 100644
--- a/test/syscalls/linux/madvise.cc
+++ b/test/syscalls/linux/madvise.cc
@@ -38,7 +38,7 @@ namespace testing {
 
 namespace {
 
-void ExpectAllMappingBytes(Mapping const& m, char c) {
+void ExpectAllMappingBytes(Mapping const &m, char c) {
   auto const v = m.view();
   for (size_t i = 0; i < kPageSize; i++) {
     ASSERT_EQ(v[i], c) << "at offset " << i;
@@ -47,7 +47,7 @@ void ExpectAllMappingBytes(Mapping const& m, char c) {
 
 // Equivalent to ExpectAllMappingBytes but async-signal-safe and with less
 // helpful failure messages.
-void CheckAllMappingBytes(Mapping const& m, char c) {
+void CheckAllMappingBytes(Mapping const &m, char c) {
   auto const v = m.view();
   for (size_t i = 0; i < kPageSize; i++) {
     TEST_CHECK_MSG(v[i] == c, "mapping contains wrong value");
diff --git a/test/syscalls/linux/mempolicy.cc b/test/syscalls/linux/mempolicy.cc
index 9d5f47651..d21093899 100644
--- a/test/syscalls/linux/mempolicy.cc
+++ b/test/syscalls/linux/mempolicy.cc
@@ -213,7 +213,7 @@ TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) {
     }
   }
 
-  void* invalid_address = reinterpret_cast<void*>(-1);
+  void *invalid_address = reinterpret_cast<void *>(-1);
 
   // Invalid address.
   ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, invalid_address,
@@ -221,8 +221,8 @@ TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) {
               SyscallFailsWithErrno(EFAULT));
 
   // Invalid mode pointer.
-  ASSERT_THAT(get_mempolicy(reinterpret_cast<int*>(invalid_address), nullptr, 0,
-                            &dummy_stack_address, MPOL_F_ADDR | MPOL_F_NODE),
+  ASSERT_THAT(get_mempolicy(reinterpret_cast<int *>(invalid_address), nullptr,
+                            0, &dummy_stack_address, MPOL_F_ADDR | MPOL_F_NODE),
               SyscallFailsWithErrno(EFAULT));
 }
 
diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc
index 620b4f8b4..367a90fe1 100644
--- a/test/syscalls/linux/mlock.cc
+++ b/test/syscalls/linux/mlock.cc
@@ -60,7 +60,6 @@ bool IsPageMlocked(uintptr_t addr) {
   return true;
 }
 
-
 TEST(MlockTest, Basic) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
   auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
diff --git a/test/syscalls/linux/msync.cc b/test/syscalls/linux/msync.cc
index ac7146017..2b2b6aef9 100644
--- a/test/syscalls/linux/msync.cc
+++ b/test/syscalls/linux/msync.cc
@@ -60,9 +60,7 @@ std::vector<std::function<PosixErrorOr<Mapping>()>> SyncableMappings() {
     for (int const mflags : {MAP_PRIVATE, MAP_SHARED}) {
       int const prot = PROT_READ | (writable ? PROT_WRITE : 0);
       int const oflags = O_CREAT | (writable ? O_RDWR : O_RDONLY);
-      funcs.push_back([=] {
-        return MmapAnon(kPageSize, prot, mflags);
-      });
+      funcs.push_back([=] { return MmapAnon(kPageSize, prot, mflags); });
       funcs.push_back([=]() -> PosixErrorOr<Mapping> {
         std::string const path = NewTempAbsPath();
         ASSIGN_OR_RETURN_ERRNO(auto fd, Open(path, oflags, 0644));
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index 8f3800380..ef67b747b 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -178,7 +178,8 @@ TEST(PtraceTest, GetSigMask) {
 
     // Install a signal handler for kBlockSignal to avoid termination and block
     // it.
-    TEST_PCHECK(signal(kBlockSignal, +[](int signo) {}) != SIG_ERR);
+    TEST_PCHECK(signal(
+                    kBlockSignal, +[](int signo) {}) != SIG_ERR);
     MaybeSave();
     TEST_PCHECK(sigprocmask(SIG_SETMASK, &blocked, nullptr) == 0);
     MaybeSave();
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 7e41fe7d8..294ee6808 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -113,7 +113,8 @@ TEST(SeccompTest, RetKillCausesDeathBySIGSYS) {
   pid_t const pid = fork();
   if (pid == 0) {
     // Register a signal handler for SIGSYS that we don't expect to be invoked.
-    RegisterSignalHandler(SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
+    RegisterSignalHandler(
+        SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
     ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
     syscall(kFilteredSyscall);
     TEST_CHECK_MSG(false, "Survived invocation of test syscall");
@@ -132,7 +133,8 @@ TEST(SeccompTest, RetKillOnlyKillsOneThread) {
   pid_t const pid = fork();
   if (pid == 0) {
     // Register a signal handler for SIGSYS that we don't expect to be invoked.
-    RegisterSignalHandler(SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
+    RegisterSignalHandler(
+        SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
     ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
     // Pass CLONE_VFORK to block the original thread in the child process until
     // the clone thread exits with SIGSYS.
@@ -346,7 +348,8 @@ TEST(SeccompTest, LeastPermissiveFilterReturnValueApplies) {
   // one that causes the kill that should be ignored.
   pid_t const pid = fork();
   if (pid == 0) {
-    RegisterSignalHandler(SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
+    RegisterSignalHandler(
+        SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
     ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRACE);
     ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
     ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM);
diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc
index 62b04ef1d..24e7c4960 100644
--- a/test/syscalls/linux/sigaltstack.cc
+++ b/test/syscalls/linux/sigaltstack.cc
@@ -168,8 +168,8 @@ TEST(SigaltstackTest, WalksOffBottom) {
 
   // Trigger a single fault.
   badhandler_low_water_mark =
-      static_cast<char*>(stack.ss_sp) + SIGSTKSZ;        // Expected top.
-  badhandler_recursive_faults = 0;                       // Disable refault.
+      static_cast<char*>(stack.ss_sp) + SIGSTKSZ;  // Expected top.
+  badhandler_recursive_faults = 0;                 // Disable refault.
   Fault();
   EXPECT_TRUE(badhandler_on_sigaltstack);
   EXPECT_THAT(sigaltstack(nullptr, &stack), SyscallSucceeds());
diff --git a/test/syscalls/linux/sigiret.cc b/test/syscalls/linux/sigiret.cc
index a47c781ea..4deb1ae95 100644
--- a/test/syscalls/linux/sigiret.cc
+++ b/test/syscalls/linux/sigiret.cc
@@ -78,8 +78,8 @@ TEST(SigIretTest, CheckRcxR11) {
       "1: pause; cmpl $0, %[gotvtalrm]; je 1b;"  // while (!gotvtalrm);
       "movq %%rcx, %[rcx];"                      // rcx = %rcx
       "movq %%r11, %[r11];"                      // r11 = %r11
-      : [ready] "=m"(ready), [rcx] "+m"(rcx), [r11] "+m"(r11)
-      : [gotvtalrm] "m"(gotvtalrm)
+      : [ ready ] "=m"(ready), [ rcx ] "+m"(rcx), [ r11 ] "+m"(r11)
+      : [ gotvtalrm ] "m"(gotvtalrm)
       : "cc", "memory", "rcx", "r11");
 
   // If sigreturn(2) returns via 'sysret' then %rcx and %r11 will be
diff --git a/test/syscalls/linux/socket_stream_blocking.cc b/test/syscalls/linux/socket_stream_blocking.cc
index e9cc082bf..538ee2268 100644
--- a/test/syscalls/linux/socket_stream_blocking.cc
+++ b/test/syscalls/linux/socket_stream_blocking.cc
@@ -32,38 +32,38 @@ namespace gvisor {
 namespace testing {
 
 TEST_P(BlockingStreamSocketPairTest, BlockPartialWriteClosed) {
-    // FIXME(b/35921550): gVisor doesn't support SO_SNDBUF on UDS, nor does it
-    // enforce any limit; it will write arbitrary amounts of data without
-    // blocking.
-    SKIP_IF(IsRunningOnGvisor());
-
-    auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-    int buffer_size;
-    socklen_t length = sizeof(buffer_size);
-    ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF,
-                           &buffer_size, &length),
-                SyscallSucceeds());
-
-    int wfd = sockets->first_fd();
-    ScopedThread t([wfd, buffer_size]() {
-      std::vector<char> buf(2 * buffer_size);
-      // Write more than fits in the buffer. Blocks then returns partial write
-      // when the other end is closed. The next call returns EPIPE.
-      //
-      // N.B. writes occur in chunks, so we may see less than buffer_size from
-      // the first call.
-      ASSERT_THAT(write(wfd, buf.data(), buf.size()),
-                  SyscallSucceedsWithValue(::testing::Gt(0)));
-      ASSERT_THAT(write(wfd, buf.data(), buf.size()),
-                  ::testing::AnyOf(SyscallFailsWithErrno(EPIPE),
-                                   SyscallFailsWithErrno(ECONNRESET)));
-    });
-
-    // Leave time for write to become blocked.
-    absl::SleepFor(absl::Seconds(1));
-
-    ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+  // FIXME(b/35921550): gVisor doesn't support SO_SNDBUF on UDS, nor does it
+  // enforce any limit; it will write arbitrary amounts of data without
+  // blocking.
+  SKIP_IF(IsRunningOnGvisor());
+
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int buffer_size;
+  socklen_t length = sizeof(buffer_size);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF,
+                         &buffer_size, &length),
+              SyscallSucceeds());
+
+  int wfd = sockets->first_fd();
+  ScopedThread t([wfd, buffer_size]() {
+    std::vector<char> buf(2 * buffer_size);
+    // Write more than fits in the buffer. Blocks then returns partial write
+    // when the other end is closed. The next call returns EPIPE.
+    //
+    // N.B. writes occur in chunks, so we may see less than buffer_size from
+    // the first call.
+    ASSERT_THAT(write(wfd, buf.data(), buf.size()),
+                SyscallSucceedsWithValue(::testing::Gt(0)));
+    ASSERT_THAT(write(wfd, buf.data(), buf.size()),
+                ::testing::AnyOf(SyscallFailsWithErrno(EPIPE),
+                                 SyscallFailsWithErrno(ECONNRESET)));
+  });
+
+  // Leave time for write to become blocked.
+  absl::SleepFor(absl::Seconds(1));
+
+  ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
 }
 
 // Random save may interrupt the call to sendmsg() in SendLargeSendMsg(),
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index 30de2f8ff..c1e45e10a 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -377,7 +377,7 @@ TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoRandomSave) {
   //
   // We need to support this because when a file is unlinked and we forward
   // the stat to the gofer it would return ENOENT.
-  const char* uncached_gofer = getenv("GVISOR_GOFER_UNCACHED");
+  const char *uncached_gofer = getenv("GVISOR_GOFER_UNCACHED");
   SKIP_IF(uncached_gofer != nullptr);
 
   // We don't support saving unlinked files.
diff --git a/test/syscalls/linux/udp_socket_errqueue_test_case.cc b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
index 9a24e1df0..fcdba7279 100644
--- a/test/syscalls/linux/udp_socket_errqueue_test_case.cc
+++ b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
@@ -14,8 +14,6 @@
 
 #ifndef __fuchsia__
 
-#include "test/syscalls/linux/udp_socket_test_cases.h"
-
 #include <arpa/inet.h>
 #include <fcntl.h>
 #include <linux/errqueue.h>
@@ -29,6 +27,7 @@
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/udp_socket_test_cases.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
diff --git a/test/util/capability_util.cc b/test/util/capability_util.cc
index 5d733887b..9fee52fbb 100644
--- a/test/util/capability_util.cc
+++ b/test/util/capability_util.cc
@@ -36,10 +36,10 @@ PosixErrorOr<bool> CanCreateUserNamespace() {
   ASSIGN_OR_RETURN_ERRNO(
       auto child_stack,
       MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
-  int const child_pid =
-      clone(+[](void*) { return 0; },
-            reinterpret_cast<void*>(child_stack.addr() + kPageSize),
-            CLONE_NEWUSER | SIGCHLD, /* arg = */ nullptr);
+  int const child_pid = clone(
+      +[](void*) { return 0; },
+      reinterpret_cast<void*>(child_stack.addr() + kPageSize),
+      CLONE_NEWUSER | SIGCHLD, /* arg = */ nullptr);
   if (child_pid > 0) {
     int status;
     int const ret = waitpid(child_pid, &status, /* options = */ 0);
diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc
index 042cec94a..052781445 100644
--- a/test/util/fs_util.cc
+++ b/test/util/fs_util.cc
@@ -452,7 +452,7 @@ PosixErrorOr<std::string> MakeAbsolute(absl::string_view filename,
 
 std::string CleanPath(const absl::string_view unclean_path) {
   std::string path = std::string(unclean_path);
-  const char *src = path.c_str();
+  const char* src = path.c_str();
   std::string::iterator dst = path.begin();
 
   // Check for absolute path and determine initial backtrack limit.
diff --git a/test/util/multiprocess_util.h b/test/util/multiprocess_util.h
index 3e736261b..2f3bf4a6f 100644
--- a/test/util/multiprocess_util.h
+++ b/test/util/multiprocess_util.h
@@ -99,7 +99,8 @@ inline PosixErrorOr<Cleanup> ForkAndExec(const std::string& filename,
                                          const ExecveArray& argv,
                                          const ExecveArray& envv, pid_t* child,
                                          int* execve_errno) {
-  return ForkAndExec(filename, argv, envv, [] {}, child, execve_errno);
+  return ForkAndExec(
+      filename, argv, envv, [] {}, child, execve_errno);
 }
 
 // Equivalent to ForkAndExec, except using dirfd and flags with execveat.
-- 
cgit v1.2.3


From 2a2da5be31ea3c32e66f0c0ff61ef189848f5258 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Mon, 27 Jan 2020 20:28:14 -0800
Subject: Add a type to represent the NDP Source Link Layer Address option

Tests:
- header.TestNDPSourceLinkLayerAddressOptionEthernetAddress
- header.TestNDPSourceLinkLayerAddressOptionSerialize
- header.TestNDPOptionsIterCheck
- header.TestNDPOptionsIter
PiperOrigin-RevId: 291856429
---
 pkg/tcpip/header/icmpv6.go      |   2 +-
 pkg/tcpip/header/ndp_options.go |  52 ++++++++++--
 pkg/tcpip/header/ndp_test.go    | 180 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 214 insertions(+), 20 deletions(-)

diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go
index b4037b6c8..b095dc0ab 100644
--- a/pkg/tcpip/header/icmpv6.go
+++ b/pkg/tcpip/header/icmpv6.go
@@ -52,7 +52,7 @@ const (
 	// ICMPv6NeighborAdvertSize is size of a neighbor advertisement
 	// including the NDP Target Link Layer option for an Ethernet
 	// address.
-	ICMPv6NeighborAdvertSize = ICMPv6HeaderSize + NDPNAMinimumSize + ndpTargetEthernetLinkLayerAddressSize
+	ICMPv6NeighborAdvertSize = ICMPv6HeaderSize + NDPNAMinimumSize + ndpLinkLayerAddressSize
 
 	// ICMPv6EchoMinimumSize is the minimum size of a valid ICMP echo packet.
 	ICMPv6EchoMinimumSize = 8
diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
index 06e0bace2..1e60f3d4f 100644
--- a/pkg/tcpip/header/ndp_options.go
+++ b/pkg/tcpip/header/ndp_options.go
@@ -24,13 +24,17 @@ import (
 )
 
 const (
-	// NDPTargetLinkLayerAddressOptionType is the type of the Target
-	// Link-Layer Address option, as per RFC 4861 section 4.6.1.
+	// NDPSourceLinkLayerAddressOptionType is the type of the Source Link Layer
+	// Address option, as per RFC 4861 section 4.6.1.
+	NDPSourceLinkLayerAddressOptionType = 1
+
+	// NDPTargetLinkLayerAddressOptionType is the type of the Target Link Layer
+	// Address option, as per RFC 4861 section 4.6.1.
 	NDPTargetLinkLayerAddressOptionType = 2
 
-	// ndpTargetEthernetLinkLayerAddressSize is the size of a Target
-	// Link Layer Option for an Ethernet address.
-	ndpTargetEthernetLinkLayerAddressSize = 8
+	// ndpLinkLayerAddressSize is the size of a Source or Target Link Layer
+	// Address option.
+	ndpLinkLayerAddressSize = 8
 
 	// NDPPrefixInformationType is the type of the Prefix Information
 	// option, as per RFC 4861 section 4.6.2.
@@ -189,6 +193,9 @@ func (i *NDPOptionIterator) Next() (NDPOption, bool, error) {
 		i.opts = i.opts[numBytes:]
 
 		switch t {
+		case NDPSourceLinkLayerAddressOptionType:
+			return NDPSourceLinkLayerAddressOption(body), false, nil
+
 		case NDPTargetLinkLayerAddressOptionType:
 			return NDPTargetLinkLayerAddressOption(body), false, nil
 
@@ -368,6 +375,41 @@ func (b NDPOptionsSerializer) Length() int {
 	return l
 }
 
+// NDPSourceLinkLayerAddressOption is the NDP Source Link Layer Option
+// as defined by RFC 4861 section 4.6.1.
+//
+// It is the first X bytes following the NDP option's Type and Length field
+// where X is the value in Length multiplied by lengthByteUnits - 2 bytes.
+type NDPSourceLinkLayerAddressOption tcpip.LinkAddress
+
+// Type implements NDPOption.Type.
+func (o NDPSourceLinkLayerAddressOption) Type() uint8 {
+	return NDPSourceLinkLayerAddressOptionType
+}
+
+// Length implements NDPOption.Length.
+func (o NDPSourceLinkLayerAddressOption) Length() int {
+	return len(o)
+}
+
+// serializeInto implements NDPOption.serializeInto.
+func (o NDPSourceLinkLayerAddressOption) serializeInto(b []byte) int {
+	return copy(b, o)
+}
+
+// EthernetAddress will return an ethernet (MAC) address if the
+// NDPSourceLinkLayerAddressOption's body has at minimum EthernetAddressSize
+// bytes. If the body has more than EthernetAddressSize bytes, only the first
+// EthernetAddressSize bytes are returned as that is all that is needed for an
+// Ethernet address.
+func (o NDPSourceLinkLayerAddressOption) EthernetAddress() tcpip.LinkAddress {
+	if len(o) >= EthernetAddressSize {
+		return tcpip.LinkAddress(o[:EthernetAddressSize])
+	}
+
+	return tcpip.LinkAddress([]byte(nil))
+}
+
 // NDPTargetLinkLayerAddressOption is the NDP Target Link Layer Option
 // as defined by RFC 4861 section 4.6.1.
 //
diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go
index 2c439d70c..1cb9f5dc8 100644
--- a/pkg/tcpip/header/ndp_test.go
+++ b/pkg/tcpip/header/ndp_test.go
@@ -153,6 +153,125 @@ func TestNDPRouterAdvert(t *testing.T) {
 	}
 }
 
+// TestNDPSourceLinkLayerAddressOptionEthernetAddress tests getting the
+// Ethernet address from an NDPSourceLinkLayerAddressOption.
+func TestNDPSourceLinkLayerAddressOptionEthernetAddress(t *testing.T) {
+	tests := []struct {
+		name     string
+		buf      []byte
+		expected tcpip.LinkAddress
+	}{
+		{
+			"ValidMAC",
+			[]byte{1, 2, 3, 4, 5, 6},
+			tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+		},
+		{
+			"SLLBodyTooShort",
+			[]byte{1, 2, 3, 4, 5},
+			tcpip.LinkAddress([]byte(nil)),
+		},
+		{
+			"SLLBodyLargerThanNeeded",
+			[]byte{1, 2, 3, 4, 5, 6, 7, 8},
+			tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			sll := NDPSourceLinkLayerAddressOption(test.buf)
+			if got := sll.EthernetAddress(); got != test.expected {
+				t.Errorf("got sll.EthernetAddress = %s, want = %s", got, test.expected)
+			}
+		})
+	}
+}
+
+// TestNDPSourceLinkLayerAddressOptionSerialize tests serializing a
+// NDPSourceLinkLayerAddressOption.
+func TestNDPSourceLinkLayerAddressOptionSerialize(t *testing.T) {
+	tests := []struct {
+		name        string
+		buf         []byte
+		expectedBuf []byte
+		addr        tcpip.LinkAddress
+	}{
+		{
+			"Ethernet",
+			make([]byte, 8),
+			[]byte{1, 1, 1, 2, 3, 4, 5, 6},
+			"\x01\x02\x03\x04\x05\x06",
+		},
+		{
+			"Padding",
+			[]byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+			[]byte{1, 2, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0},
+			"\x01\x02\x03\x04\x05\x06\x07\x08",
+		},
+		{
+			"Empty",
+			nil,
+			nil,
+			"",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opts := NDPOptions(test.buf)
+			serializer := NDPOptionsSerializer{
+				NDPSourceLinkLayerAddressOption(test.addr),
+			}
+			if got, want := int(serializer.Length()), len(test.expectedBuf); got != want {
+				t.Fatalf("got Length = %d, want = %d", got, want)
+			}
+			opts.Serialize(serializer)
+			if !bytes.Equal(test.buf, test.expectedBuf) {
+				t.Fatalf("got b = %d, want = %d", test.buf, test.expectedBuf)
+			}
+
+			it, err := opts.Iter(true)
+			if err != nil {
+				t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+			}
+
+			if len(test.expectedBuf) > 0 {
+				next, done, err := it.Next()
+				if err != nil {
+					t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+				}
+				if done {
+					t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+				}
+				if got := next.Type(); got != NDPSourceLinkLayerAddressOptionType {
+					t.Fatalf("got Type = %d, want = %d", got, NDPSourceLinkLayerAddressOptionType)
+				}
+				sll := next.(NDPSourceLinkLayerAddressOption)
+				if got, want := []byte(sll), test.expectedBuf[2:]; !bytes.Equal(got, want) {
+					t.Fatalf("got Next = (%x, _, _), want = (%x, _, _)", got, want)
+				}
+
+				if got, want := sll.EthernetAddress(), tcpip.LinkAddress(test.expectedBuf[2:][:EthernetAddressSize]); got != want {
+					t.Errorf("got sll.EthernetAddress = %s, want = %s", got, want)
+				}
+			}
+
+			// Iterator should not return anything else.
+			next, done, err := it.Next()
+			if err != nil {
+				t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+			}
+			if !done {
+				t.Error("got Next = (_, false, _), want = (_, true, _)")
+			}
+			if next != nil {
+				t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+			}
+		})
+	}
+}
+
 // TestNDPTargetLinkLayerAddressOptionEthernetAddress tests getting the
 // Ethernet address from an NDPTargetLinkLayerAddressOption.
 func TestNDPTargetLinkLayerAddressOptionEthernetAddress(t *testing.T) {
@@ -186,7 +305,6 @@ func TestNDPTargetLinkLayerAddressOptionEthernetAddress(t *testing.T) {
 			}
 		})
 	}
-
 }
 
 // TestNDPTargetLinkLayerAddressOptionSerialize tests serializing a
@@ -212,8 +330,8 @@ func TestNDPTargetLinkLayerAddressOptionSerialize(t *testing.T) {
 		},
 		{
 			"Empty",
-			[]byte{},
-			[]byte{},
+			nil,
+			nil,
 			"",
 		},
 	}
@@ -246,7 +364,7 @@ func TestNDPTargetLinkLayerAddressOptionSerialize(t *testing.T) {
 					t.Fatal("got Next = (_, true, _), want = (_, false, _)")
 				}
 				if got := next.Type(); got != NDPTargetLinkLayerAddressOptionType {
-					t.Fatalf("got Type %= %d, want = %d", got, NDPTargetLinkLayerAddressOptionType)
+					t.Fatalf("got Type = %d, want = %d", got, NDPTargetLinkLayerAddressOptionType)
 				}
 				tll := next.(NDPTargetLinkLayerAddressOption)
 				if got, want := []byte(tll), test.expectedBuf[2:]; !bytes.Equal(got, want) {
@@ -254,7 +372,7 @@ func TestNDPTargetLinkLayerAddressOptionSerialize(t *testing.T) {
 				}
 
 				if got, want := tll.EthernetAddress(), tcpip.LinkAddress(test.expectedBuf[2:][:EthernetAddressSize]); got != want {
-					t.Errorf("got tll.MACAddress = %s, want = %s", got, want)
+					t.Errorf("got tll.EthernetAddress = %s, want = %s", got, want)
 				}
 			}
 
@@ -510,7 +628,7 @@ func TestNDPRecursiveDNSServerOption(t *testing.T) {
 				t.Fatal("got Next = (_, true, _), want = (_, false, _)")
 			}
 			if got := next.Type(); got != NDPRecursiveDNSServerOptionType {
-				t.Fatalf("got Type %= %d, want = %d", got, NDPRecursiveDNSServerOptionType)
+				t.Fatalf("got Type = %d, want = %d", got, NDPRecursiveDNSServerOptionType)
 			}
 
 			opt, ok := next.(NDPRecursiveDNSServer)
@@ -552,6 +670,16 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 			[]byte{0, 0, 0, 0, 0, 0, 0, 0},
 			ErrNDPOptZeroLength,
 		},
+		{
+			"ValidSourceLinkLayerAddressOption",
+			[]byte{1, 1, 1, 2, 3, 4, 5, 6},
+			nil,
+		},
+		{
+			"TooSmallSourceLinkLayerAddressOption",
+			[]byte{1, 1, 1, 2, 3, 4, 5},
+			ErrNDPOptBufExhausted,
+		},
 		{
 			"ValidTargetLinkLayerAddressOption",
 			[]byte{2, 1, 1, 2, 3, 4, 5, 6},
@@ -603,10 +731,13 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 			ErrNDPOptMalformedBody,
 		},
 		{
-			"ValidTargetLinkLayerAddressWithPrefixInformation",
+			"ValidSourceAndTargetLinkLayerAddressWithPrefixInformation",
 			[]byte{
+				// Source Link-Layer Address.
+				1, 1, 1, 2, 3, 4, 5, 6,
+
 				// Target Link-Layer Address.
-				2, 1, 1, 2, 3, 4, 5, 6,
+				2, 1, 7, 8, 9, 10, 11, 12,
 
 				// Prefix information.
 				3, 4, 43, 64,
@@ -621,10 +752,13 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 			nil,
 		},
 		{
-			"ValidTargetLinkLayerAddressWithPrefixInformationWithUnrecognized",
+			"ValidSourceAndTargetLinkLayerAddressWithPrefixInformationWithUnrecognized",
 			[]byte{
+				// Source Link-Layer Address.
+				1, 1, 1, 2, 3, 4, 5, 6,
+
 				// Target Link-Layer Address.
-				2, 1, 1, 2, 3, 4, 5, 6,
+				2, 1, 7, 8, 9, 10, 11, 12,
 
 				// 255 is an unrecognized type. If 255 ends up
 				// being the type for some recognized type,
@@ -714,8 +848,11 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 // here.
 func TestNDPOptionsIter(t *testing.T) {
 	buf := []byte{
+		// Source Link-Layer Address.
+		1, 1, 1, 2, 3, 4, 5, 6,
+
 		// Target Link-Layer Address.
-		2, 1, 1, 2, 3, 4, 5, 6,
+		2, 1, 7, 8, 9, 10, 11, 12,
 
 		// 255 is an unrecognized type. If 255 ends up being the type
 		// for some recognized type, update 255 to some other
@@ -740,7 +877,7 @@ func TestNDPOptionsIter(t *testing.T) {
 		t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
 	}
 
-	// Test the first (Taret Link-Layer) option.
+	// Test the first (Source Link-Layer) option.
 	next, done, err := it.Next()
 	if err != nil {
 		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
@@ -748,7 +885,22 @@ func TestNDPOptionsIter(t *testing.T) {
 	if done {
 		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
 	}
-	if got, want := []byte(next.(NDPTargetLinkLayerAddressOption)), buf[2:][:6]; !bytes.Equal(got, want) {
+	if got, want := []byte(next.(NDPSourceLinkLayerAddressOption)), buf[2:][:6]; !bytes.Equal(got, want) {
+		t.Errorf("got Next = (%x, _, _), want = (%x, _, _)", got, want)
+	}
+	if got := next.Type(); got != NDPSourceLinkLayerAddressOptionType {
+		t.Errorf("got Type = %d, want = %d", got, NDPSourceLinkLayerAddressOptionType)
+	}
+
+	// Test the next (Target Link-Layer) option.
+	next, done, err = it.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got, want := []byte(next.(NDPTargetLinkLayerAddressOption)), buf[10:][:6]; !bytes.Equal(got, want) {
 		t.Errorf("got Next = (%x, _, _), want = (%x, _, _)", got, want)
 	}
 	if got := next.Type(); got != NDPTargetLinkLayerAddressOptionType {
@@ -764,7 +916,7 @@ func TestNDPOptionsIter(t *testing.T) {
 	if done {
 		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
 	}
-	if got, want := next.(NDPPrefixInformation), buf[26:][:30]; !bytes.Equal(got, want) {
+	if got, want := next.(NDPPrefixInformation), buf[34:][:30]; !bytes.Equal(got, want) {
 		t.Errorf("got Next = (%x, _, _), want = (%x, _, _)", got, want)
 	}
 	if got := next.Type(); got != NDPPrefixInformationType {
-- 
cgit v1.2.3


From 5d569408ef94c753b7aae9392b5e4ebf7e5ea50d Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 27 Jan 2020 22:27:57 -0800
Subject: Create platform_util for tests.

PiperOrigin-RevId: 291869423
---
 test/syscalls/linux/32bit.cc       | 130 +++++++++++++++++++++----------------
 test/syscalls/linux/BUILD          |   5 ++
 test/syscalls/linux/arch_prctl.cc  |   2 -
 test/syscalls/linux/concurrency.cc |   2 +
 test/syscalls/linux/exceptions.cc  |   2 +
 test/syscalls/linux/ptrace.cc      |  10 +--
 test/util/BUILD                    |  15 +++--
 test/util/platform_util.cc         |  49 ++++++++++++++
 test/util/platform_util.h          |  56 ++++++++++++++++
 test/util/test_util.cc             |  11 +---
 test/util/test_util.h              |  27 ++++----
 11 files changed, 216 insertions(+), 93 deletions(-)
 create mode 100644 test/util/platform_util.cc
 create mode 100644 test/util/platform_util.h

diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index 6a15d47e1..2751fb4e7 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -15,10 +15,12 @@
 #include <string.h>
 #include <sys/mman.h>
 
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
 #include "test/util/memory_util.h"
+#include "test/util/platform_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/test_util.h"
-#include "gtest/gtest.h"
 
 #ifndef __x86_64__
 #error "This test is x86-64 specific."
@@ -30,7 +32,6 @@ namespace testing {
 namespace {
 
 constexpr char kInt3 = '\xcc';
-
 constexpr char kInt80[2] = {'\xcd', '\x80'};
 constexpr char kSyscall[2] = {'\x0f', '\x05'};
 constexpr char kSysenter[2] = {'\x0f', '\x34'};
@@ -43,6 +44,7 @@ void ExitGroup32(const char instruction[2], int code) {
   // Fill with INT 3 in case we execute too far.
   memset(m.ptr(), kInt3, m.len());
 
+  // Copy in the actual instruction.
   memcpy(m.ptr(), instruction, 2);
 
   // We're playing *extremely* fast-and-loose with the various syscall ABIs
@@ -78,70 +80,87 @@ void ExitGroup32(const char instruction[2], int code) {
 constexpr int kExitCode = 42;
 
 TEST(Syscall32Bit, Int80) {
-  switch (GvisorPlatform()) {
-    case Platform::kKVM:
-      // TODO(b/111805002): 32-bit segments are broken (but not explictly
-      // disabled).
-      return;
-    case Platform::kPtrace:
-      // TODO(gvisor.dev/issue/167): The ptrace platform does not have a
-      // consistent story here.
-      return;
-    case Platform::kNative:
+  switch (PlatformSupport32Bit()) {
+    case PlatformSupport::NotSupported:
+      break;
+    case PlatformSupport::Segfault:
+      EXPECT_EXIT(ExitGroup32(kInt80, kExitCode),
+                  ::testing::KilledBySignal(SIGSEGV), "");
       break;
-  }
 
-  // Upstream Linux. 32-bit syscalls allowed.
-  EXPECT_EXIT(ExitGroup32(kInt80, kExitCode), ::testing::ExitedWithCode(42),
-              "");
-}
+    case PlatformSupport::Ignored:
+      // Since the call is ignored, we'll hit the int3 trap.
+      EXPECT_EXIT(ExitGroup32(kInt80, kExitCode),
+                  ::testing::KilledBySignal(SIGTRAP), "");
+      break;
 
-TEST(Syscall32Bit, Sysenter) {
-  switch (GvisorPlatform()) {
-    case Platform::kKVM:
-      // TODO(b/111805002): See above.
-      return;
-    case Platform::kPtrace:
-      // TODO(gvisor.dev/issue/167): See above.
-      return;
-    case Platform::kNative:
+    case PlatformSupport::Allowed:
+      EXPECT_EXIT(ExitGroup32(kInt80, kExitCode), ::testing::ExitedWithCode(42),
+                  "");
       break;
   }
+}
 
-  if (GetCPUVendor() == CPUVendor::kAMD) {
+TEST(Syscall32Bit, Sysenter) {
+  if (PlatformSupport32Bit() == PlatformSupport::Allowed &&
+      GetCPUVendor() == CPUVendor::kAMD) {
     // SYSENTER is an illegal instruction in compatibility mode on AMD.
     EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
                 ::testing::KilledBySignal(SIGILL), "");
     return;
   }
 
-  // Upstream Linux on !AMD, 32-bit syscalls allowed.
-  EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode), ::testing::ExitedWithCode(42),
-              "");
-}
+  switch (PlatformSupport32Bit()) {
+    case PlatformSupport::NotSupported:
+      break;
 
-TEST(Syscall32Bit, Syscall) {
-  switch (GvisorPlatform()) {
-    case Platform::kKVM:
-      // TODO(b/111805002): See above.
-      return;
-    case Platform::kPtrace:
-      // TODO(gvisor.dev/issue/167): See above.
-      return;
-    case Platform::kNative:
+    case PlatformSupport::Segfault:
+      EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
+                  ::testing::KilledBySignal(SIGSEGV), "");
+      break;
+
+    case PlatformSupport::Ignored:
+      // See above, except expected code is SIGSEGV.
+      EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
+                  ::testing::KilledBySignal(SIGSEGV), "");
+      break;
+
+    case PlatformSupport::Allowed:
+      EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
+                  ::testing::ExitedWithCode(42), "");
       break;
   }
+}
 
-  if (GetCPUVendor() == CPUVendor::kIntel) {
+TEST(Syscall32Bit, Syscall) {
+  if (PlatformSupport32Bit() == PlatformSupport::Allowed &&
+      GetCPUVendor() == CPUVendor::kIntel) {
     // SYSCALL is an illegal instruction in compatibility mode on Intel.
     EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
                 ::testing::KilledBySignal(SIGILL), "");
     return;
   }
 
-  // Upstream Linux on !Intel, 32-bit syscalls allowed.
-  EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode), ::testing::ExitedWithCode(42),
-              "");
+  switch (PlatformSupport32Bit()) {
+    case PlatformSupport::NotSupported:
+      break;
+
+    case PlatformSupport::Segfault:
+      EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
+                  ::testing::KilledBySignal(SIGSEGV), "");
+      break;
+
+    case PlatformSupport::Ignored:
+      // See above.
+      EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
+                  ::testing::KilledBySignal(SIGILL), "");
+      break;
+
+    case PlatformSupport::Allowed:
+      EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
+                  ::testing::ExitedWithCode(42), "");
+      break;
+  }
 }
 
 // Far call code called below.
@@ -205,19 +224,20 @@ void FarCall32() {
 }
 
 TEST(Call32Bit, Disallowed) {
-  switch (GvisorPlatform()) {
-    case Platform::kKVM:
-      // TODO(b/111805002): See above.
-      return;
-    case Platform::kPtrace:
-      // The ptrace platform cannot prevent switching to compatibility mode.
-      ABSL_FALLTHROUGH_INTENDED;
-    case Platform::kNative:
+  switch (PlatformSupport32Bit()) {
+    case PlatformSupport::NotSupported:
       break;
-  }
 
-  // Shouldn't crash.
-  FarCall32();
+    case PlatformSupport::Segfault:
+      EXPECT_EXIT(FarCall32(), ::testing::KilledBySignal(SIGSEGV), "");
+      break;
+
+    case PlatformSupport::Ignored:
+      ABSL_FALLTHROUGH_INTENDED;
+    case PlatformSupport::Allowed:
+      // Shouldn't crash.
+      FarCall32();
+  }
 }
 
 }  // namespace
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index c2ef50c1d..74bf068ec 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -197,9 +197,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:memory_util",
+        "//test/util:platform_util",
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -479,6 +481,7 @@ cc_binary(
     srcs = ["concurrency.cc"],
     linkstatic = 1,
     deps = [
+        "//test/util:platform_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
@@ -584,6 +587,7 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:logging",
+        "//test/util:platform_util",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
@@ -1658,6 +1662,7 @@ cc_binary(
     deps = [
         "//test/util:logging",
         "//test/util:multiprocess_util",
+        "//test/util:platform_util",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
diff --git a/test/syscalls/linux/arch_prctl.cc b/test/syscalls/linux/arch_prctl.cc
index 3a901faf5..81bf5a775 100644
--- a/test/syscalls/linux/arch_prctl.cc
+++ b/test/syscalls/linux/arch_prctl.cc
@@ -14,10 +14,8 @@
 
 #include <asm/prctl.h>
 #include <sys/prctl.h>
-#include <sys/syscall.h>
 
 #include "gtest/gtest.h"
-#include "test/util/file_descriptor.h"
 #include "test/util/test_util.h"
 
 // glibc does not provide a prototype for arch_prctl() so declare it here.
diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc
index 00b96b34a..f41f99900 100644
--- a/test/syscalls/linux/concurrency.cc
+++ b/test/syscalls/linux/concurrency.cc
@@ -20,6 +20,7 @@
 #include "absl/strings/string_view.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
+#include "test/util/platform_util.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
@@ -99,6 +100,7 @@ TEST(ConcurrencyTest, MultiProcessMultithreaded) {
 // Test that multiple processes can execute concurrently, even if one process
 // never yields.
 TEST(ConcurrencyTest, MultiProcessConcurrency) {
+  SKIP_IF(PlatformSupportMultiProcess() == PlatformSupport::NotSupported);
 
   pid_t child_pid = fork();
   if (child_pid == 0) {
diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc
index 3d564e720..420b9543f 100644
--- a/test/syscalls/linux/exceptions.cc
+++ b/test/syscalls/linux/exceptions.cc
@@ -16,6 +16,7 @@
 
 #include "gtest/gtest.h"
 #include "test/util/logging.h"
+#include "test/util/platform_util.h"
 #include "test/util/signal_util.h"
 #include "test/util/test_util.h"
 
@@ -324,6 +325,7 @@ TEST(ExceptionTest, AlignmentHalt) {
 }
 
 TEST(ExceptionTest, AlignmentCheck) {
+  SKIP_IF(PlatformSupportAlignmentCheck() != PlatformSupport::Allowed);
 
   // See above.
   struct sigaction sa = {};
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index ef67b747b..4dd5cf27b 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -32,6 +32,7 @@
 #include "absl/time/time.h"
 #include "test/util/logging.h"
 #include "test/util/multiprocess_util.h"
+#include "test/util/platform_util.h"
 #include "test/util/signal_util.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
@@ -824,13 +825,8 @@ TEST(PtraceTest,
 // These tests requires knowledge of architecture-specific syscall convention.
 #ifdef __x86_64__
 TEST(PtraceTest, Int3) {
-  switch (GvisorPlatform()) {
-    case Platform::kKVM:
-      // TODO(b/124248694): int3 isn't handled properly.
-      return;
-    default:
-      break;
-  }
+  SKIP_IF(PlatformSupportInt3() == PlatformSupport::NotSupported);
+
   pid_t const child_pid = fork();
   if (child_pid == 0) {
     // In child process.
diff --git a/test/util/BUILD b/test/util/BUILD
index 3c732be62..1ac8b3fd6 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -165,6 +165,14 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "platform_util",
+    testonly = 1,
+    srcs = ["platform_util.cc"],
+    hdrs = ["platform_util.h"],
+    deps = [":test_util"],
+)
+
 cc_library(
     name = "posix_error",
     testonly = 1,
@@ -238,12 +246,7 @@ cc_library(
         "test_util_runfiles.cc",
     ],
     hdrs = ["test_util.h"],
-    defines = select_system(
-        fuchsia = [
-            "__opensource__",
-            "__fuchsia__",
-        ],
-    ),
+    defines = select_system(),
     deps = [
         ":fs_util",
         ":logging",
diff --git a/test/util/platform_util.cc b/test/util/platform_util.cc
new file mode 100644
index 000000000..2724e63f3
--- /dev/null
+++ b/test/util/platform_util.cc
@@ -0,0 +1,49 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/util/platform_util.h"
+
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+PlatformSupport PlatformSupport32Bit() {
+  if (GvisorPlatform() == Platform::kPtrace) {
+    return PlatformSupport::NotSupported;
+  } else if (GvisorPlatform() == Platform::kKVM) {
+    return PlatformSupport::Segfault;
+  } else {
+    return PlatformSupport::Allowed;
+  }
+}
+
+PlatformSupport PlatformSupportAlignmentCheck() {
+  return PlatformSupport::Allowed;
+}
+
+PlatformSupport PlatformSupportMultiProcess() {
+  return PlatformSupport::Allowed;
+}
+
+PlatformSupport PlatformSupportInt3() {
+  if (GvisorPlatform() == Platform::kKVM) {
+    return PlatformSupport::NotSupported;
+  } else {
+    return PlatformSupport::Allowed;
+  }
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/util/platform_util.h b/test/util/platform_util.h
new file mode 100644
index 000000000..28cc92371
--- /dev/null
+++ b/test/util/platform_util.h
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_UTIL_PLATFORM_UTIL_H_
+#define GVISOR_TEST_UTIL_PLATFORM_UTIL_H_
+
+namespace gvisor {
+namespace testing {
+
+// PlatformSupport is a generic enumeration of classes of support.
+//
+// It is up to the individual functions and callers to agree on the precise
+// definition for each case. The document here generally refers to 32-bit
+// as an example. Many cases will use only NotSupported and Allowed.
+enum class PlatformSupport {
+  // The feature is not supported on the current platform.
+  //
+  // In the case of 32-bit, this means that calls will generally be interpreted
+  // as 64-bit calls, and there is no support for 32-bit binaries, long calls,
+  // etc. This usually means that the underlying implementation just pretends
+  // that 32-bit doesn't exist.
+  NotSupported,
+
+  // Calls will be ignored by the kernel with a fixed error.
+  Ignored,
+
+  // Calls will result in a SIGSEGV or similar fault.
+  Segfault,
+
+  // The feature is supported as expected.
+  //
+  // In the case of 32-bit, this means that the system call or far call will be
+  // handled properly.
+  Allowed,
+};
+
+PlatformSupport PlatformSupport32Bit();
+PlatformSupport PlatformSupportAlignmentCheck();
+PlatformSupport PlatformSupportMultiProcess();
+PlatformSupport PlatformSupportInt3();
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_UTIL_PLATFORM_UTL_H_
diff --git a/test/util/test_util.cc b/test/util/test_util.cc
index 848504c88..15cbc6da6 100644
--- a/test/util/test_util.cc
+++ b/test/util/test_util.cc
@@ -45,20 +45,13 @@ namespace testing {
 
 bool IsRunningOnGvisor() { return GvisorPlatform() != Platform::kNative; }
 
-Platform GvisorPlatform() {
+const std::string GvisorPlatform() {
   // Set by runner.go.
   char* env = getenv(TEST_ON_GVISOR);
   if (!env) {
     return Platform::kNative;
   }
-  if (strcmp(env, "ptrace") == 0) {
-    return Platform::kPtrace;
-  }
-  if (strcmp(env, "kvm") == 0) {
-    return Platform::kKVM;
-  }
-  std::cerr << "unknown platform " << env;
-  abort();
+  return std::string(env);
 }
 
 bool IsRunningWithHostinet() {
diff --git a/test/util/test_util.h b/test/util/test_util.h
index b3235c7e3..2d22b0eb8 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -26,16 +26,13 @@
 // IsRunningOnGvisor returns true if the test is known to be running on gVisor.
 // GvisorPlatform can be used to get more detail:
 //
-//   switch (GvisorPlatform()) {
-//     case Platform::kNative:
-//     case Platform::kGvisor:
-//       EXPECT_THAT(mmap(...), SyscallSucceeds());
-//       break;
-//     case Platform::kPtrace:
-//       EXPECT_THAT(mmap(...), SyscallFailsWithErrno(ENOSYS));
-//       break;
+//   if (GvisorPlatform() == Platform::kPtrace) {
+//       ...
 //   }
 //
+// SetupGvisorDeathTest ensures that signal handling does not interfere with
+/// tests that rely on fatal signals.
+//
 // Matchers
 // ========
 //
@@ -213,13 +210,15 @@ void TestInit(int* argc, char*** argv);
     if (expr) GTEST_SKIP() << #expr; \
   } while (0)
 
-enum class Platform {
-  kNative,
-  kKVM,
-  kPtrace,
-};
+// Platform contains platform names.
+namespace Platform {
+constexpr char kNative[] = "native";
+constexpr char kPtrace[] = "ptrace";
+constexpr char kKVM[] = "kvm";
+}  // namespace Platform
+
 bool IsRunningOnGvisor();
-Platform GvisorPlatform();
+const std::string GvisorPlatform();
 bool IsRunningWithHostinet();
 
 #ifdef __linux__
-- 
cgit v1.2.3


From 74e04506a430535b7f3461eb35f36c9398db735a Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 28 Jan 2020 11:06:24 -0800
Subject: Prefer Type& over Type &

And Type* over Type *. This is basically a whitespace only change.

gVisor code already prefers left-alignment of pointers and references, but
clang-format formats for consistency with the majority of a file, and some
files leaned the wrong way. This is a one-time pass to make us completely
conforming.

Autogenerated with:

$ find . \( -name "*.cc" -or -name "*.c" -or -name "*.h" \) \
    | xargs clang-format -i -style="{BasedOnStyle: Google,  \
        DerivePointerAlignment: false, PointerAlignment: Left}"

PiperOrigin-RevId: 291972421
---
 test/syscalls/linux/connect_external.cc              | 12 ++++++------
 test/syscalls/linux/getrusage.cc                     |  2 +-
 test/syscalls/linux/iptables.h                       |  2 +-
 test/syscalls/linux/madvise.cc                       | 16 ++++++++--------
 test/syscalls/linux/mempolicy.cc                     | 16 ++++++++--------
 test/syscalls/linux/proc_net.cc                      | 20 ++++++++++----------
 test/syscalls/linux/sendfile_socket.cc               |  8 ++++----
 .../syscalls/linux/socket_bind_to_device_sequence.cc | 10 +++++-----
 test/syscalls/linux/socket_netdevice.cc              | 10 +++++-----
 test/syscalls/linux/stat.cc                          |  6 +++---
 test/util/mount_util.h                               |  8 ++++----
 vdso/syscalls.h                                      |  8 ++++----
 12 files changed, 59 insertions(+), 59 deletions(-)

diff --git a/test/syscalls/linux/connect_external.cc b/test/syscalls/linux/connect_external.cc
index bfe1da82e..1edb50e47 100644
--- a/test/syscalls/linux/connect_external.cc
+++ b/test/syscalls/linux/connect_external.cc
@@ -56,7 +56,7 @@ TEST_P(GoferStreamSeqpacketTest, Echo) {
   ProtocolSocket proto;
   std::tie(env, proto) = GetParam();
 
-  char *val = getenv(env.c_str());
+  char* val = getenv(env.c_str());
   ASSERT_NE(val, nullptr);
   std::string root(val);
 
@@ -69,7 +69,7 @@ TEST_P(GoferStreamSeqpacketTest, Echo) {
   addr.sun_family = AF_UNIX;
   memcpy(addr.sun_path, socket_path.c_str(), socket_path.length());
 
-  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr *>(&addr),
+  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr*>(&addr),
                       sizeof(addr)),
               SyscallSucceeds());
 
@@ -92,7 +92,7 @@ TEST_P(GoferStreamSeqpacketTest, NonListening) {
   ProtocolSocket proto;
   std::tie(env, proto) = GetParam();
 
-  char *val = getenv(env.c_str());
+  char* val = getenv(env.c_str());
   ASSERT_NE(val, nullptr);
   std::string root(val);
 
@@ -105,7 +105,7 @@ TEST_P(GoferStreamSeqpacketTest, NonListening) {
   addr.sun_family = AF_UNIX;
   memcpy(addr.sun_path, socket_path.c_str(), socket_path.length());
 
-  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr *>(&addr),
+  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr*>(&addr),
                       sizeof(addr)),
               SyscallFailsWithErrno(ECONNREFUSED));
 }
@@ -127,7 +127,7 @@ using GoferDgramTest = ::testing::TestWithParam<std::string>;
 // unnamed. The server thus has no way to reply to us.
 TEST_P(GoferDgramTest, Null) {
   std::string env = GetParam();
-  char *val = getenv(env.c_str());
+  char* val = getenv(env.c_str());
   ASSERT_NE(val, nullptr);
   std::string root(val);
 
@@ -140,7 +140,7 @@ TEST_P(GoferDgramTest, Null) {
   addr.sun_family = AF_UNIX;
   memcpy(addr.sun_path, socket_path.c_str(), socket_path.length());
 
-  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr *>(&addr),
+  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr*>(&addr),
                       sizeof(addr)),
               SyscallSucceeds());
 
diff --git a/test/syscalls/linux/getrusage.cc b/test/syscalls/linux/getrusage.cc
index 9bdb1e4cd..0e51d42a8 100644
--- a/test/syscalls/linux/getrusage.cc
+++ b/test/syscalls/linux/getrusage.cc
@@ -67,7 +67,7 @@ TEST(GetrusageTest, Grandchild) {
     pid = fork();
     if (pid == 0) {
       int flags = MAP_ANONYMOUS | MAP_POPULATE | MAP_PRIVATE;
-      void *addr =
+      void* addr =
           mmap(nullptr, kGrandchildSizeKb * 1024, PROT_WRITE, flags, -1, 0);
       TEST_PCHECK(addr != MAP_FAILED);
     } else {
diff --git a/test/syscalls/linux/iptables.h b/test/syscalls/linux/iptables.h
index 616bea550..0719c60a4 100644
--- a/test/syscalls/linux/iptables.h
+++ b/test/syscalls/linux/iptables.h
@@ -188,7 +188,7 @@ struct ipt_replace {
   unsigned int num_counters;
 
   // The unchanged values from each ipt_entry's counters.
-  struct xt_counters *counters;
+  struct xt_counters* counters;
 
   // The entries to write to the table. This will run past the size defined by
   // sizeof(srtuct ipt_replace);
diff --git a/test/syscalls/linux/madvise.cc b/test/syscalls/linux/madvise.cc
index dbd54ff2a..5a1973f60 100644
--- a/test/syscalls/linux/madvise.cc
+++ b/test/syscalls/linux/madvise.cc
@@ -38,7 +38,7 @@ namespace testing {
 
 namespace {
 
-void ExpectAllMappingBytes(Mapping const &m, char c) {
+void ExpectAllMappingBytes(Mapping const& m, char c) {
   auto const v = m.view();
   for (size_t i = 0; i < kPageSize; i++) {
     ASSERT_EQ(v[i], c) << "at offset " << i;
@@ -47,7 +47,7 @@ void ExpectAllMappingBytes(Mapping const &m, char c) {
 
 // Equivalent to ExpectAllMappingBytes but async-signal-safe and with less
 // helpful failure messages.
-void CheckAllMappingBytes(Mapping const &m, char c) {
+void CheckAllMappingBytes(Mapping const& m, char c) {
   auto const v = m.view();
   for (size_t i = 0; i < kPageSize; i++) {
     TEST_CHECK_MSG(v[i] == c, "mapping contains wrong value");
@@ -139,7 +139,7 @@ TEST(MadviseDontneedTest, IgnoresPermissions) {
 TEST(MadviseDontforkTest, AddressLength) {
   auto m =
       ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE));
-  char *addr = static_cast<char *>(m.ptr());
+  char* addr = static_cast<char*>(m.ptr());
 
   // Address must be page aligned.
   EXPECT_THAT(madvise(addr + 1, kPageSize, MADV_DONTFORK),
@@ -168,9 +168,9 @@ TEST(MadviseDontforkTest, DontforkShared) {
   Mapping m = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
       nullptr, kPageSize * 2, PROT_READ | PROT_WRITE, MAP_SHARED, fd.get(), 0));
 
-  const Mapping ms1 = Mapping(reinterpret_cast<void *>(m.addr()), kPageSize);
+  const Mapping ms1 = Mapping(reinterpret_cast<void*>(m.addr()), kPageSize);
   const Mapping ms2 =
-      Mapping(reinterpret_cast<void *>(m.addr() + kPageSize), kPageSize);
+      Mapping(reinterpret_cast<void*>(m.addr() + kPageSize), kPageSize);
   m.release();
 
   ASSERT_THAT(madvise(ms2.ptr(), kPageSize, MADV_DONTFORK), SyscallSucceeds());
@@ -197,11 +197,11 @@ TEST(MadviseDontforkTest, DontforkAnonPrivate) {
   // Mmap three anonymous pages and MADV_DONTFORK the middle page.
   Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
       MmapAnon(kPageSize * 3, PROT_READ | PROT_WRITE, MAP_PRIVATE));
-  const Mapping mp1 = Mapping(reinterpret_cast<void *>(m.addr()), kPageSize);
+  const Mapping mp1 = Mapping(reinterpret_cast<void*>(m.addr()), kPageSize);
   const Mapping mp2 =
-      Mapping(reinterpret_cast<void *>(m.addr() + kPageSize), kPageSize);
+      Mapping(reinterpret_cast<void*>(m.addr() + kPageSize), kPageSize);
   const Mapping mp3 =
-      Mapping(reinterpret_cast<void *>(m.addr() + 2 * kPageSize), kPageSize);
+      Mapping(reinterpret_cast<void*>(m.addr() + 2 * kPageSize), kPageSize);
   m.release();
 
   ASSERT_THAT(madvise(mp2.ptr(), kPageSize, MADV_DONTFORK), SyscallSucceeds());
diff --git a/test/syscalls/linux/mempolicy.cc b/test/syscalls/linux/mempolicy.cc
index d21093899..059fad598 100644
--- a/test/syscalls/linux/mempolicy.cc
+++ b/test/syscalls/linux/mempolicy.cc
@@ -43,17 +43,17 @@ namespace {
 #define MPOL_MF_MOVE (1 << 1)
 #define MPOL_MF_MOVE_ALL (1 << 2)
 
-int get_mempolicy(int *policy, uint64_t *nmask, uint64_t maxnode, void *addr,
+int get_mempolicy(int* policy, uint64_t* nmask, uint64_t maxnode, void* addr,
                   int flags) {
   return syscall(SYS_get_mempolicy, policy, nmask, maxnode, addr, flags);
 }
 
-int set_mempolicy(int mode, uint64_t *nmask, uint64_t maxnode) {
+int set_mempolicy(int mode, uint64_t* nmask, uint64_t maxnode) {
   return syscall(SYS_set_mempolicy, mode, nmask, maxnode);
 }
 
-int mbind(void *addr, unsigned long len, int mode,
-          const unsigned long *nodemask, unsigned long maxnode,
+int mbind(void* addr, unsigned long len, int mode,
+          const unsigned long* nodemask, unsigned long maxnode,
           unsigned flags) {
   return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
 }
@@ -68,7 +68,7 @@ Cleanup ScopedMempolicy() {
 
 // Temporarily change the memory policy for the calling thread within the
 // caller's scope.
-PosixErrorOr<Cleanup> ScopedSetMempolicy(int mode, uint64_t *nmask,
+PosixErrorOr<Cleanup> ScopedSetMempolicy(int mode, uint64_t* nmask,
                                          uint64_t maxnode) {
   if (set_mempolicy(mode, nmask, maxnode)) {
     return PosixError(errno, "set_mempolicy");
@@ -213,7 +213,7 @@ TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) {
     }
   }
 
-  void *invalid_address = reinterpret_cast<void *>(-1);
+  void* invalid_address = reinterpret_cast<void*>(-1);
 
   // Invalid address.
   ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, invalid_address,
@@ -221,8 +221,8 @@ TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) {
               SyscallFailsWithErrno(EFAULT));
 
   // Invalid mode pointer.
-  ASSERT_THAT(get_mempolicy(reinterpret_cast<int *>(invalid_address), nullptr,
-                            0, &dummy_stack_address, MPOL_F_ADDR | MPOL_F_NODE),
+  ASSERT_THAT(get_mempolicy(reinterpret_cast<int*>(invalid_address), nullptr, 0,
+                            &dummy_stack_address, MPOL_F_ADDR | MPOL_F_NODE),
               SyscallFailsWithErrno(EFAULT));
 }
 
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index 65bad06d4..3a611a86f 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -68,8 +68,8 @@ TEST(ProcSysNetIpv4Sack, CanReadAndWrite) {
 }
 
 PosixErrorOr<uint64_t> GetSNMPMetricFromProc(const std::string snmp,
-                                             const std::string &type,
-                                             const std::string &item) {
+                                             const std::string& type,
+                                             const std::string& item) {
   std::vector<std::string> snmp_vec = absl::StrSplit(snmp, '\n');
 
   // /proc/net/snmp prints a line of headers followed by a line of metrics.
@@ -127,7 +127,7 @@ TEST(ProcNetSnmp, TcpReset_NoRandomSave) {
   };
 
   ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
-  ASSERT_THAT(connect(s.get(), (struct sockaddr *)&sin, sizeof(sin)),
+  ASSERT_THAT(connect(s.get(), (struct sockaddr*)&sin, sizeof(sin)),
               SyscallFailsWithErrno(ECONNREFUSED));
 
   uint64_t newAttemptFails;
@@ -172,19 +172,19 @@ TEST(ProcNetSnmp, TcpEstab_NoRandomSave) {
   };
 
   ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
-  ASSERT_THAT(bind(s_listen.get(), (struct sockaddr *)&sin, sizeof(sin)),
+  ASSERT_THAT(bind(s_listen.get(), (struct sockaddr*)&sin, sizeof(sin)),
               SyscallSucceeds());
   ASSERT_THAT(listen(s_listen.get(), 1), SyscallSucceeds());
 
   // Get the port bound by the listening socket.
   socklen_t addrlen = sizeof(sin);
   ASSERT_THAT(
-      getsockname(s_listen.get(), reinterpret_cast<sockaddr *>(&sin), &addrlen),
+      getsockname(s_listen.get(), reinterpret_cast<sockaddr*>(&sin), &addrlen),
       SyscallSucceeds());
 
   FileDescriptor s_connect =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
-  ASSERT_THAT(connect(s_connect.get(), (struct sockaddr *)&sin, sizeof(sin)),
+  ASSERT_THAT(connect(s_connect.get(), (struct sockaddr*)&sin, sizeof(sin)),
               SyscallSucceeds());
 
   auto s_accept =
@@ -260,7 +260,7 @@ TEST(ProcNetSnmp, UdpNoPorts_NoRandomSave) {
       .sin_port = htons(4444),
   };
   ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
-  ASSERT_THAT(sendto(s.get(), "a", 1, 0, (struct sockaddr *)&sin, sizeof(sin)),
+  ASSERT_THAT(sendto(s.get(), "a", 1, 0, (struct sockaddr*)&sin, sizeof(sin)),
               SyscallSucceedsWithValue(1));
 
   uint64_t newOutDatagrams;
@@ -295,18 +295,18 @@ TEST(ProcNetSnmp, UdpIn) {
       .sin_port = htons(0),
   };
   ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
-  ASSERT_THAT(bind(server.get(), (struct sockaddr *)&sin, sizeof(sin)),
+  ASSERT_THAT(bind(server.get(), (struct sockaddr*)&sin, sizeof(sin)),
               SyscallSucceeds());
   // Get the port bound by the server socket.
   socklen_t addrlen = sizeof(sin);
   ASSERT_THAT(
-      getsockname(server.get(), reinterpret_cast<sockaddr *>(&sin), &addrlen),
+      getsockname(server.get(), reinterpret_cast<sockaddr*>(&sin), &addrlen),
       SyscallSucceeds());
 
   FileDescriptor client =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
   ASSERT_THAT(
-      sendto(client.get(), "a", 1, 0, (struct sockaddr *)&sin, sizeof(sin)),
+      sendto(client.get(), "a", 1, 0, (struct sockaddr*)&sin, sizeof(sin)),
       SyscallSucceedsWithValue(1));
 
   char buf[128];
diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc
index 3331288b7..8f7ee4163 100644
--- a/test/syscalls/linux/sendfile_socket.cc
+++ b/test/syscalls/linux/sendfile_socket.cc
@@ -41,15 +41,15 @@ class SendFileTest : public ::testing::TestWithParam<int> {
     struct sockaddr server_addr = {};
     switch (family) {
       case AF_INET: {
-        struct sockaddr_in *server_addr_in =
-            reinterpret_cast<struct sockaddr_in *>(&server_addr);
+        struct sockaddr_in* server_addr_in =
+            reinterpret_cast<struct sockaddr_in*>(&server_addr);
         server_addr_in->sin_family = family;
         server_addr_in->sin_addr.s_addr = INADDR_ANY;
         break;
       }
       case AF_UNIX: {
-        struct sockaddr_un *server_addr_un =
-            reinterpret_cast<struct sockaddr_un *>(&server_addr);
+        struct sockaddr_un* server_addr_un =
+            reinterpret_cast<struct sockaddr_un*>(&server_addr);
         server_addr_un->sun_family = family;
         server_addr_un->sun_path[0] = '\0';
         break;
diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
index 34b1058a9..637d1151a 100644
--- a/test/syscalls/linux/socket_bind_to_device_sequence.cc
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -66,7 +66,7 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
   // Gets a device by device_id.  If the device_id has been seen before, returns
   // the previously returned device.  If not, finds or creates a new device.
   // Returns an empty string on failure.
-  void GetDevice(int device_id, string *device_name) {
+  void GetDevice(int device_id, string* device_name) {
     auto device = devices_.find(device_id);
     if (device != devices_.end()) {
       *device_name = device->second;
@@ -112,7 +112,7 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
   // Sets the socket_id to uniquely identify the socket bound if it is not
   // nullptr.
   void BindSocket(bool reuse_port, bool reuse_addr, int device_id = 0,
-                  int want = 0, int *socket_id = nullptr) {
+                  int want = 0, int* socket_id = nullptr) {
     next_socket_id_++;
     sockets_to_close_[next_socket_id_] = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
     auto socket_fd = sockets_to_close_[next_socket_id_]->get();
@@ -154,12 +154,12 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
     addr.sin_port = port_;
     if (want == 0) {
       ASSERT_THAT(
-          bind(socket_fd, reinterpret_cast<const struct sockaddr *>(&addr),
+          bind(socket_fd, reinterpret_cast<const struct sockaddr*>(&addr),
                sizeof(addr)),
           SyscallSucceeds());
     } else {
       ASSERT_THAT(
-          bind(socket_fd, reinterpret_cast<const struct sockaddr *>(&addr),
+          bind(socket_fd, reinterpret_cast<const struct sockaddr*>(&addr),
                sizeof(addr)),
           SyscallFailsWithErrno(want));
     }
@@ -169,7 +169,7 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
       // remember it for future commands.
       socklen_t addr_size = sizeof(addr);
       ASSERT_THAT(
-          getsockname(socket_fd, reinterpret_cast<struct sockaddr *>(&addr),
+          getsockname(socket_fd, reinterpret_cast<struct sockaddr*>(&addr),
                       &addr_size),
           SyscallSucceeds());
       port_ = addr.sin_port;
diff --git a/test/syscalls/linux/socket_netdevice.cc b/test/syscalls/linux/socket_netdevice.cc
index 405dbbd73..15d4b85a7 100644
--- a/test/syscalls/linux/socket_netdevice.cc
+++ b/test/syscalls/linux/socket_netdevice.cc
@@ -91,7 +91,7 @@ TEST(NetdeviceTest, Netmask) {
   int prefixlen = -1;
   ASSERT_NO_ERRNO(NetlinkRequestResponse(
       fd, &req, sizeof(req),
-      [&](const struct nlmsghdr *hdr) {
+      [&](const struct nlmsghdr* hdr) {
         EXPECT_THAT(hdr->nlmsg_type, AnyOf(Eq(RTM_NEWADDR), Eq(NLMSG_DONE)));
 
         EXPECT_TRUE((hdr->nlmsg_flags & NLM_F_MULTI) == NLM_F_MULTI)
@@ -107,8 +107,8 @@ TEST(NetdeviceTest, Netmask) {
         // RTM_NEWADDR contains at least the header and ifaddrmsg.
         EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct ifaddrmsg));
 
-        struct ifaddrmsg *ifaddrmsg =
-            reinterpret_cast<struct ifaddrmsg *>(NLMSG_DATA(hdr));
+        struct ifaddrmsg* ifaddrmsg =
+            reinterpret_cast<struct ifaddrmsg*>(NLMSG_DATA(hdr));
         if (ifaddrmsg->ifa_index == static_cast<uint32_t>(ifr.ifr_ifindex) &&
             ifaddrmsg->ifa_family == AF_INET) {
           prefixlen = ifaddrmsg->ifa_prefixlen;
@@ -127,8 +127,8 @@ TEST(NetdeviceTest, Netmask) {
   snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
   ASSERT_THAT(ioctl(sock.get(), SIOCGIFNETMASK, &ifr), SyscallSucceeds());
   EXPECT_EQ(ifr.ifr_netmask.sa_family, AF_INET);
-  struct sockaddr_in *sin =
-      reinterpret_cast<struct sockaddr_in *>(&ifr.ifr_netmask);
+  struct sockaddr_in* sin =
+      reinterpret_cast<struct sockaddr_in*>(&ifr.ifr_netmask);
   EXPECT_EQ(sin->sin_addr.s_addr, mask);
 }
 
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index c1e45e10a..388d75835 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -377,7 +377,7 @@ TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoRandomSave) {
   //
   // We need to support this because when a file is unlinked and we forward
   // the stat to the gofer it would return ENOENT.
-  const char *uncached_gofer = getenv("GVISOR_GOFER_UNCACHED");
+  const char* uncached_gofer = getenv("GVISOR_GOFER_UNCACHED");
   SKIP_IF(uncached_gofer != nullptr);
 
   // We don't support saving unlinked files.
@@ -599,8 +599,8 @@ struct kernel_statx {
   uint64_t __spare2[14];
 };
 
-int statx(int dirfd, const char *pathname, int flags, unsigned int mask,
-          struct kernel_statx *statxbuf) {
+int statx(int dirfd, const char* pathname, int flags, unsigned int mask,
+          struct kernel_statx* statxbuf) {
   return syscall(SYS_statx, dirfd, pathname, flags, mask, statxbuf);
 }
 
diff --git a/test/util/mount_util.h b/test/util/mount_util.h
index 23eea51a2..09e2281eb 100644
--- a/test/util/mount_util.h
+++ b/test/util/mount_util.h
@@ -31,10 +31,10 @@ namespace testing {
 
 // Mount mounts the filesystem, and unmounts when the returned reference is
 // destroyed.
-inline PosixErrorOr<Cleanup> Mount(const std::string &source,
-                                   const std::string &target,
-                                   const std::string &fstype,
-                                   uint64_t mountflags, const std::string &data,
+inline PosixErrorOr<Cleanup> Mount(const std::string& source,
+                                   const std::string& target,
+                                   const std::string& fstype,
+                                   uint64_t mountflags, const std::string& data,
                                    uint64_t umountflags) {
   if (mount(source.c_str(), target.c_str(), fstype.c_str(), mountflags,
             data.c_str()) == -1) {
diff --git a/vdso/syscalls.h b/vdso/syscalls.h
index f5865bb72..b6d15a7d3 100644
--- a/vdso/syscalls.h
+++ b/vdso/syscalls.h
@@ -65,8 +65,8 @@ static inline int sys_rt_sigreturn(void) {
   return num;
 }
 
-static inline int sys_clock_gettime(clockid_t _clkid, struct timespec *_ts) {
-  register struct timespec *ts asm("x1") = _ts;
+static inline int sys_clock_gettime(clockid_t _clkid, struct timespec* _ts) {
+  register struct timespec* ts asm("x1") = _ts;
   register clockid_t clkid asm("x0") = _clkid;
   register long ret asm("x0");
   register long nr asm("x8") = __NR_clock_gettime;
@@ -78,8 +78,8 @@ static inline int sys_clock_gettime(clockid_t _clkid, struct timespec *_ts) {
   return ret;
 }
 
-static inline int sys_clock_getres(clockid_t _clkid, struct timespec *_ts) {
-  register struct timespec *ts asm("x1") = _ts;
+static inline int sys_clock_getres(clockid_t _clkid, struct timespec* _ts) {
+  register struct timespec* ts asm("x1") = _ts;
   register clockid_t clkid asm("x0") = _clkid;
   register long ret asm("x0");
   register long nr asm("x8") = __NR_clock_getres;
-- 
cgit v1.2.3


From 76483b8b1ec4ee1fb6b6efb6bdcfaf6dba7be4ce Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 28 Jan 2020 11:12:01 -0800
Subject: Check sigsetsize in rt_sigaction

This isn't in the libc wrapper, but it is in the syscall itself.

Discovered by @xiaobo55x in #1625.

PiperOrigin-RevId: 291973931
---
 pkg/sentry/strace/linux64_amd64.go      |  2 +-
 pkg/sentry/strace/linux64_arm64.go      |  2 +-
 pkg/sentry/syscalls/linux/sys_signal.go |  5 ++++
 test/syscalls/linux/sigaction.cc        | 53 +++++++++++++++++++--------------
 4 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go
index 1e823b685..85ec66fd3 100644
--- a/pkg/sentry/strace/linux64_amd64.go
+++ b/pkg/sentry/strace/linux64_amd64.go
@@ -37,7 +37,7 @@ var linuxAMD64 = SyscallMap{
 	10:  makeSyscallInfo("mprotect", Hex, Hex, Hex),
 	11:  makeSyscallInfo("munmap", Hex, Hex),
 	12:  makeSyscallInfo("brk", Hex),
-	13:  makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction),
+	13:  makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction, Hex),
 	14:  makeSyscallInfo("rt_sigprocmask", SignalMaskAction, SigSet, PostSigSet, Hex),
 	15:  makeSyscallInfo("rt_sigreturn"),
 	16:  makeSyscallInfo("ioctl", FD, Hex, Hex),
diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go
index c3ac5248d..8bc38545f 100644
--- a/pkg/sentry/strace/linux64_arm64.go
+++ b/pkg/sentry/strace/linux64_arm64.go
@@ -158,7 +158,7 @@ var linuxARM64 = SyscallMap{
 	131: makeSyscallInfo("tgkill", Hex, Hex, Signal),
 	132: makeSyscallInfo("sigaltstack", Hex, Hex),
 	133: makeSyscallInfo("rt_sigsuspend", Hex),
-	134: makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction),
+	134: makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction, Hex),
 	135: makeSyscallInfo("rt_sigprocmask", SignalMaskAction, SigSet, PostSigSet, Hex),
 	136: makeSyscallInfo("rt_sigpending", Hex),
 	137: makeSyscallInfo("rt_sigtimedwait", SigSet, Hex, Timespec, Hex),
diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go
index 209be2990..7e1747a0c 100644
--- a/pkg/sentry/syscalls/linux/sys_signal.go
+++ b/pkg/sentry/syscalls/linux/sys_signal.go
@@ -245,6 +245,11 @@ func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 	sig := linux.Signal(args[0].Int())
 	newactarg := args[1].Pointer()
 	oldactarg := args[2].Pointer()
+	sigsetsize := args[3].SizeT()
+
+	if sigsetsize != linux.SignalSetSize {
+		return 0, nil, syserror.EINVAL
+	}
 
 	var newactptr *arch.SignalAct
 	if newactarg != 0 {
diff --git a/test/syscalls/linux/sigaction.cc b/test/syscalls/linux/sigaction.cc
index 9a53fd3e0..9d9dd57a8 100644
--- a/test/syscalls/linux/sigaction.cc
+++ b/test/syscalls/linux/sigaction.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <signal.h>
+#include <sys/syscall.h>
 
 #include "gtest/gtest.h"
 #include "test/util/test_util.h"
@@ -23,45 +24,53 @@ namespace testing {
 namespace {
 
 TEST(SigactionTest, GetLessThanOrEqualToZeroFails) {
-  struct sigaction act;
-  memset(&act, 0, sizeof(act));
-  ASSERT_THAT(sigaction(-1, NULL, &act), SyscallFailsWithErrno(EINVAL));
-  ASSERT_THAT(sigaction(0, NULL, &act), SyscallFailsWithErrno(EINVAL));
+  struct sigaction act = {};
+  ASSERT_THAT(sigaction(-1, nullptr, &act), SyscallFailsWithErrno(EINVAL));
+  ASSERT_THAT(sigaction(0, nullptr, &act), SyscallFailsWithErrno(EINVAL));
 }
 
 TEST(SigactionTest, SetLessThanOrEqualToZeroFails) {
-  struct sigaction act;
-  memset(&act, 0, sizeof(act));
-  ASSERT_THAT(sigaction(0, &act, NULL), SyscallFailsWithErrno(EINVAL));
-  ASSERT_THAT(sigaction(0, &act, NULL), SyscallFailsWithErrno(EINVAL));
+  struct sigaction act = {};
+  ASSERT_THAT(sigaction(0, &act, nullptr), SyscallFailsWithErrno(EINVAL));
+  ASSERT_THAT(sigaction(0, &act, nullptr), SyscallFailsWithErrno(EINVAL));
 }
 
 TEST(SigactionTest, GetGreaterThanMaxFails) {
-  struct sigaction act;
-  memset(&act, 0, sizeof(act));
-  ASSERT_THAT(sigaction(SIGRTMAX + 1, NULL, &act),
+  struct sigaction act = {};
+  ASSERT_THAT(sigaction(SIGRTMAX + 1, nullptr, &act),
               SyscallFailsWithErrno(EINVAL));
 }
 
 TEST(SigactionTest, SetGreaterThanMaxFails) {
-  struct sigaction act;
-  memset(&act, 0, sizeof(act));
-  ASSERT_THAT(sigaction(SIGRTMAX + 1, &act, NULL),
+  struct sigaction act = {};
+  ASSERT_THAT(sigaction(SIGRTMAX + 1, &act, nullptr),
               SyscallFailsWithErrno(EINVAL));
 }
 
 TEST(SigactionTest, SetSigkillFails) {
-  struct sigaction act;
-  memset(&act, 0, sizeof(act));
-  ASSERT_THAT(sigaction(SIGKILL, NULL, &act), SyscallSucceeds());
-  ASSERT_THAT(sigaction(SIGKILL, &act, NULL), SyscallFailsWithErrno(EINVAL));
+  struct sigaction act = {};
+  ASSERT_THAT(sigaction(SIGKILL, nullptr, &act), SyscallSucceeds());
+  ASSERT_THAT(sigaction(SIGKILL, &act, nullptr), SyscallFailsWithErrno(EINVAL));
 }
 
 TEST(SigactionTest, SetSigstopFails) {
-  struct sigaction act;
-  memset(&act, 0, sizeof(act));
-  ASSERT_THAT(sigaction(SIGSTOP, NULL, &act), SyscallSucceeds());
-  ASSERT_THAT(sigaction(SIGSTOP, &act, NULL), SyscallFailsWithErrno(EINVAL));
+  struct sigaction act = {};
+  ASSERT_THAT(sigaction(SIGSTOP, nullptr, &act), SyscallSucceeds());
+  ASSERT_THAT(sigaction(SIGSTOP, &act, nullptr), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SigactionTest, BadSigsetFails) {
+  constexpr size_t kWrongSigSetSize = 43;
+
+  struct sigaction act = {};
+
+  // The syscall itself (rather than the libc wrapper) takes the sigset_t size.
+  ASSERT_THAT(
+      syscall(SYS_rt_sigaction, SIGTERM, nullptr, &act, kWrongSigSetSize),
+      SyscallFailsWithErrno(EINVAL));
+  ASSERT_THAT(
+      syscall(SYS_rt_sigaction, SIGTERM, &act, nullptr, kWrongSigSetSize),
+      SyscallFailsWithErrno(EINVAL));
 }
 
 }  // namespace
-- 
cgit v1.2.3


From 1119644080ae57c206b9b0d8d127cf48423af7f2 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 28 Jan 2020 12:06:58 -0800
Subject: Implement an anon_inode equivalent for VFS2.

PiperOrigin-RevId: 291986033
---
 pkg/sentry/vfs/BUILD                              |   3 +-
 pkg/sentry/vfs/anonfs.go                          | 259 ++++++++++++++++++++++
 pkg/sentry/vfs/file_description_impl_util_test.go |  18 +-
 pkg/sentry/vfs/testutil.go                        | 173 ---------------
 pkg/sentry/vfs/vfs.go                             |  24 ++
 5 files changed, 289 insertions(+), 188 deletions(-)
 create mode 100644 pkg/sentry/vfs/anonfs.go
 delete mode 100644 pkg/sentry/vfs/testutil.go

diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 6b1009328..33516e6f7 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -5,6 +5,7 @@ licenses(["notice"])
 go_library(
     name = "vfs",
     srcs = [
+        "anonfs.go",
         "context.go",
         "debug.go",
         "dentry.go",
@@ -20,7 +21,6 @@ go_library(
         "pathname.go",
         "permissions.go",
         "resolving_path.go",
-        "testutil.go",
         "vfs.go",
     ],
     visibility = ["//pkg/sentry:internal"],
@@ -50,7 +50,6 @@ go_test(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/sentry/contexttest",
-        "//pkg/sentry/kernel/auth",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
new file mode 100644
index 000000000..2db25be49
--- /dev/null
+++ b/pkg/sentry/vfs/anonfs.go
@@ -0,0 +1,259 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// NewAnonVirtualDentry returns a VirtualDentry with the given synthetic name,
+// consistent with Linux's fs/anon_inodes.c:anon_inode_getfile(). References
+// are taken on the returned VirtualDentry.
+func (vfs *VirtualFilesystem) NewAnonVirtualDentry(name string) VirtualDentry {
+	d := anonDentry{
+		name: name,
+	}
+	d.vfsd.Init(&d)
+	vfs.anonMount.IncRef()
+	// anonDentry no-ops refcounting.
+	return VirtualDentry{
+		mount:  vfs.anonMount,
+		dentry: &d.vfsd,
+	}
+}
+
+const anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super()
+
+// anonFilesystem is the implementation of FilesystemImpl that backs
+// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
+//
+// Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl
+// methods that would require an anonDentry to be a directory return ENOTDIR.
+type anonFilesystem struct {
+	vfsfs Filesystem
+
+	devMinor uint32
+}
+
+type anonDentry struct {
+	vfsd Dentry
+
+	name string
+}
+
+// Release implements FilesystemImpl.Release.
+func (fs *anonFilesystem) Release() {
+}
+
+// Sync implements FilesystemImpl.Sync.
+func (fs *anonFilesystem) Sync(ctx context.Context) error {
+	return nil
+}
+
+// GetDentryAt implements FilesystemImpl.GetDentryAt.
+func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
+	if !rp.Done() {
+		return nil, syserror.ENOTDIR
+	}
+	if opts.CheckSearchable {
+		return nil, syserror.ENOTDIR
+	}
+	// anonDentry no-ops refcounting.
+	return rp.Start(), nil
+}
+
+// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt.
+func (fs *anonFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) {
+	if !rp.Final() {
+		return nil, syserror.ENOTDIR
+	}
+	// anonDentry no-ops refcounting.
+	return rp.Start(), nil
+}
+
+// LinkAt implements FilesystemImpl.LinkAt.
+func (fs *anonFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
+	if !rp.Final() {
+		return syserror.ENOTDIR
+	}
+	return syserror.EPERM
+}
+
+// MkdirAt implements FilesystemImpl.MkdirAt.
+func (fs *anonFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error {
+	if !rp.Final() {
+		return syserror.ENOTDIR
+	}
+	return syserror.EPERM
+}
+
+// MknodAt implements FilesystemImpl.MknodAt.
+func (fs *anonFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error {
+	if !rp.Final() {
+		return syserror.ENOTDIR
+	}
+	return syserror.EPERM
+}
+
+// OpenAt implements FilesystemImpl.OpenAt.
+func (fs *anonFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) {
+	if !rp.Done() {
+		return nil, syserror.ENOTDIR
+	}
+	return nil, syserror.ENODEV
+}
+
+// ReadlinkAt implements FilesystemImpl.ReadlinkAt.
+func (fs *anonFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) {
+	if !rp.Done() {
+		return "", syserror.ENOTDIR
+	}
+	return "", syserror.EINVAL
+}
+
+// RenameAt implements FilesystemImpl.RenameAt.
+func (fs *anonFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error {
+	if !rp.Final() {
+		return syserror.ENOTDIR
+	}
+	return syserror.EPERM
+}
+
+// RmdirAt implements FilesystemImpl.RmdirAt.
+func (fs *anonFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error {
+	if !rp.Final() {
+		return syserror.ENOTDIR
+	}
+	return syserror.EPERM
+}
+
+// SetStatAt implements FilesystemImpl.SetStatAt.
+func (fs *anonFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error {
+	if !rp.Done() {
+		return syserror.ENOTDIR
+	}
+	// Linux actually permits anon_inode_inode's metadata to be set, which is
+	// visible to all users of anon_inode_inode. We just silently ignore
+	// metadata changes.
+	return nil
+}
+
+// StatAt implements FilesystemImpl.StatAt.
+func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) {
+	if !rp.Done() {
+		return linux.Statx{}, syserror.ENOTDIR
+	}
+	// See fs/anon_inodes.c:anon_inode_init() => fs/libfs.c:alloc_anon_inode().
+	return linux.Statx{
+		Mask:     linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
+		Blksize:  anonfsBlockSize,
+		Nlink:    1,
+		UID:      uint32(auth.RootKUID),
+		GID:      uint32(auth.RootKGID),
+		Mode:     0600, // no type is correct
+		Ino:      1,
+		Size:     0,
+		Blocks:   0,
+		DevMajor: 0,
+		DevMinor: fs.devMinor,
+	}, nil
+}
+
+// StatFSAt implements FilesystemImpl.StatFSAt.
+func (fs *anonFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) {
+	if !rp.Done() {
+		return linux.Statfs{}, syserror.ENOTDIR
+	}
+	return linux.Statfs{
+		Type:      linux.ANON_INODE_FS_MAGIC,
+		BlockSize: anonfsBlockSize,
+	}, nil
+}
+
+// SymlinkAt implements FilesystemImpl.SymlinkAt.
+func (fs *anonFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error {
+	if !rp.Final() {
+		return syserror.ENOTDIR
+	}
+	return syserror.EPERM
+}
+
+// UnlinkAt implements FilesystemImpl.UnlinkAt.
+func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error {
+	if !rp.Final() {
+		return syserror.ENOTDIR
+	}
+	return syserror.EPERM
+}
+
+// ListxattrAt implements FilesystemImpl.ListxattrAt.
+func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
+	if !rp.Done() {
+		return nil, syserror.ENOTDIR
+	}
+	return nil, nil
+}
+
+// GetxattrAt implements FilesystemImpl.GetxattrAt.
+func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
+	if !rp.Done() {
+		return "", syserror.ENOTDIR
+	}
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements FilesystemImpl.SetxattrAt.
+func (fs *anonFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
+	if !rp.Done() {
+		return syserror.ENOTDIR
+	}
+	return syserror.EPERM
+}
+
+// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
+func (fs *anonFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
+	if !rp.Done() {
+		return syserror.ENOTDIR
+	}
+	return syserror.EPERM
+}
+
+// PrependPath implements FilesystemImpl.PrependPath.
+func (fs *anonFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
+	b.PrependComponent(fmt.Sprintf("anon_inode:%s", vd.dentry.impl.(*anonDentry).name))
+	return PrependPathSyntheticError{}
+}
+
+// IncRef implements DentryImpl.IncRef.
+func (d *anonDentry) IncRef() {
+	// no-op
+}
+
+// TryIncRef implements DentryImpl.TryIncRef.
+func (d *anonDentry) TryIncRef() bool {
+	return true
+}
+
+// DecRef implements DentryImpl.DecRef.
+func (d *anonDentry) DecRef() {
+	// no-op
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 1720d325d..0f44e7c8c 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -24,7 +24,6 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -46,9 +45,11 @@ type genCountFD struct {
 	count uint64 // accessed using atomic memory ops
 }
 
-func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription {
+func newGenCountFD(vfsObj *VirtualFilesystem) *FileDescription {
+	vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+	defer vd.DecRef()
 	var fd genCountFD
-	fd.vfsfd.Init(&fd, 0 /* statusFlags */, mnt, vfsd, &FileDescriptionOptions{})
+	fd.vfsfd.Init(&fd, 0 /* statusFlags */, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{})
 	fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd)
 	return &fd.vfsfd
 }
@@ -86,18 +87,9 @@ func (fd *genCountFD) Generate(ctx context.Context, buf *bytes.Buffer) error {
 
 func TestGenCountFD(t *testing.T) {
 	ctx := contexttest.Context(t)
-	creds := auth.CredentialsFromContext(ctx)
 
 	vfsObj := New() // vfs.New()
-	vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{}, &RegisterFilesystemTypeOptions{})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &GetFilesystemOptions{})
-	if err != nil {
-		t.Fatalf("failed to create testfs root mount: %v", err)
-	}
-	vd := mntns.Root()
-	defer vd.DecRef()
-
-	fd := newGenCountFD(vd.Mount(), vd.Dentry())
+	fd := newGenCountFD(vfsObj)
 	defer fd.DecRef()
 
 	// The first read causes Generate to be called to fill the FD's buffer.
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
deleted file mode 100644
index 392c7611e..000000000
--- a/pkg/sentry/vfs/testutil.go
+++ /dev/null
@@ -1,173 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// FDTestFilesystemType is a test-only FilesystemType that produces Filesystems
-// for which all FilesystemImpl methods taking a path return EPERM. It is used
-// to produce Mounts and Dentries for testing of FileDescriptionImpls that do
-// not depend on their originating Filesystem.
-type FDTestFilesystemType struct{}
-
-// FDTestFilesystem is a test-only FilesystemImpl produced by
-// FDTestFilesystemType.
-type FDTestFilesystem struct {
-	vfsfs Filesystem
-}
-
-// GetFilesystem implements FilesystemType.GetFilesystem.
-func (fstype FDTestFilesystemType) GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) {
-	var fs FDTestFilesystem
-	fs.vfsfs.Init(vfsObj, &fs)
-	return &fs.vfsfs, fs.NewDentry(), nil
-}
-
-// Release implements FilesystemImpl.Release.
-func (fs *FDTestFilesystem) Release() {
-}
-
-// Sync implements FilesystemImpl.Sync.
-func (fs *FDTestFilesystem) Sync(ctx context.Context) error {
-	return nil
-}
-
-// GetDentryAt implements FilesystemImpl.GetDentryAt.
-func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
-	return nil, syserror.EPERM
-}
-
-// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt.
-func (fs *FDTestFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) {
-	return nil, syserror.EPERM
-}
-
-// LinkAt implements FilesystemImpl.LinkAt.
-func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
-	return syserror.EPERM
-}
-
-// MkdirAt implements FilesystemImpl.MkdirAt.
-func (fs *FDTestFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error {
-	return syserror.EPERM
-}
-
-// MknodAt implements FilesystemImpl.MknodAt.
-func (fs *FDTestFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error {
-	return syserror.EPERM
-}
-
-// OpenAt implements FilesystemImpl.OpenAt.
-func (fs *FDTestFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) {
-	return nil, syserror.EPERM
-}
-
-// ReadlinkAt implements FilesystemImpl.ReadlinkAt.
-func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) {
-	return "", syserror.EPERM
-}
-
-// RenameAt implements FilesystemImpl.RenameAt.
-func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error {
-	return syserror.EPERM
-}
-
-// RmdirAt implements FilesystemImpl.RmdirAt.
-func (fs *FDTestFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error {
-	return syserror.EPERM
-}
-
-// SetStatAt implements FilesystemImpl.SetStatAt.
-func (fs *FDTestFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error {
-	return syserror.EPERM
-}
-
-// StatAt implements FilesystemImpl.StatAt.
-func (fs *FDTestFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) {
-	return linux.Statx{}, syserror.EPERM
-}
-
-// StatFSAt implements FilesystemImpl.StatFSAt.
-func (fs *FDTestFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) {
-	return linux.Statfs{}, syserror.EPERM
-}
-
-// SymlinkAt implements FilesystemImpl.SymlinkAt.
-func (fs *FDTestFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error {
-	return syserror.EPERM
-}
-
-// UnlinkAt implements FilesystemImpl.UnlinkAt.
-func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error {
-	return syserror.EPERM
-}
-
-// ListxattrAt implements FilesystemImpl.ListxattrAt.
-func (fs *FDTestFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
-	return nil, syserror.EPERM
-}
-
-// GetxattrAt implements FilesystemImpl.GetxattrAt.
-func (fs *FDTestFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
-	return "", syserror.EPERM
-}
-
-// SetxattrAt implements FilesystemImpl.SetxattrAt.
-func (fs *FDTestFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
-	return syserror.EPERM
-}
-
-// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
-func (fs *FDTestFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
-	return syserror.EPERM
-}
-
-// PrependPath implements FilesystemImpl.PrependPath.
-func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
-	b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry)))
-	return PrependPathSyntheticError{}
-}
-
-type fdTestDentry struct {
-	vfsd Dentry
-}
-
-// NewDentry returns a new Dentry.
-func (fs *FDTestFilesystem) NewDentry() *Dentry {
-	var d fdTestDentry
-	d.vfsd.Init(&d)
-	return &d.vfsd
-}
-
-// IncRef implements DentryImpl.IncRef.
-func (d *fdTestDentry) IncRef() {
-}
-
-// TryIncRef implements DentryImpl.TryIncRef.
-func (d *fdTestDentry) TryIncRef() bool {
-	return true
-}
-
-// DecRef implements DentryImpl.DecRef.
-func (d *fdTestDentry) DecRef() {
-}
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index b2bf48853..d730530b9 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -75,6 +75,14 @@ type VirtualFilesystem struct {
 	// mountpoints is analogous to Linux's mountpoint_hashtable.
 	mountpoints map[*Dentry]map[*Mount]struct{}
 
+	// anonMount is a Mount, not included in mounts or mountpoints,
+	// representing an anonFilesystem. anonMount is used to back
+	// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
+	// anonMount is immutable.
+	//
+	// anonMount is analogous to Linux's anon_inode_mnt.
+	anonMount *Mount
+
 	// devices contains all registered Devices. devices is protected by
 	// devicesMu.
 	devicesMu sync.RWMutex
@@ -110,6 +118,22 @@ func New() *VirtualFilesystem {
 		filesystems:           make(map[*Filesystem]struct{}),
 	}
 	vfs.mounts.Init()
+
+	// Construct vfs.anonMount.
+	anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
+	if err != nil {
+		panic(fmt.Sprintf("VirtualFilesystem.GetAnonBlockDevMinor() failed during VirtualFilesystem construction: %v", err))
+	}
+	anonfs := anonFilesystem{
+		devMinor: anonfsDevMinor,
+	}
+	anonfs.vfsfs.Init(vfs, &anonfs)
+	vfs.anonMount = &Mount{
+		vfs:  vfs,
+		fs:   &anonfs.vfsfs,
+		refs: 1,
+	}
+
 	return vfs
 }
 
-- 
cgit v1.2.3


From d99329e58492ef91b44a0bac346f757e8af2a7ec Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <henry.tjf@antfin.com>
Date: Tue, 28 Jan 2020 12:31:58 -0800
Subject: netlink: add support for RTM_F_LOOKUP_TABLE

Test command:
  $ ip route get 1.1.1.1

Fixes: #1099

Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/1121 from tanjianfeng:fix-1099 e6919f3d4ede5aa51a48b3d2be0d7a4b482dd53d
PiperOrigin-RevId: 291990716
---
 pkg/abi/linux/netlink_route.go              |  13 +++
 pkg/sentry/socket/netlink/route/BUILD       |   6 +-
 pkg/sentry/socket/netlink/route/protocol.go | 158 +++++++++++++++++++++++++---
 test/syscalls/linux/socket_netlink_route.cc |  84 +++++++++++++++
 test/syscalls/linux/socket_netlink_util.cc  |  38 +++++++
 test/syscalls/linux/socket_netlink_util.h   |  11 +-
 6 files changed, 295 insertions(+), 15 deletions(-)

diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go
index 0e3582ab6..40bec566c 100644
--- a/pkg/abi/linux/netlink_route.go
+++ b/pkg/abi/linux/netlink_route.go
@@ -205,6 +205,9 @@ type RouteMessage struct {
 	Flags uint32
 }
 
+// SizeOfRouteMessage is the size of RouteMessage.
+const SizeOfRouteMessage = 12
+
 // Route types, from uapi/linux/rtnetlink.h.
 const (
 	// RTN_UNSPEC represents an unspecified route type.
@@ -331,3 +334,13 @@ const (
 	RTF_GATEWAY = 0x2
 	RTF_UP      = 0x1
 )
+
+// RtAttr is the header of optional addition route information, as a netlink
+// attribute. From include/uapi/linux/rtnetlink.h.
+type RtAttr struct {
+	Len  uint16
+	Type uint16
+}
+
+// SizeOfRtAttr is the size of RtAttr.
+const SizeOfRtAttr = 4
diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD
index 0234aadde..622a1eafc 100644
--- a/pkg/sentry/socket/netlink/route/BUILD
+++ b/pkg/sentry/socket/netlink/route/BUILD
@@ -4,15 +4,19 @@ package(licenses = ["notice"])
 
 go_library(
     name = "route",
-    srcs = ["protocol.go"],
+    srcs = [
+        "protocol.go",
+    ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/binary",
         "//pkg/context",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket/netlink",
         "//pkg/syserr",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index 80a15d6cb..2b3c7f5b3 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -19,12 +19,14 @@ import (
 	"bytes"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // commandKind describes the operational class of a message type.
@@ -66,8 +68,14 @@ func (p *Protocol) CanSend() bool {
 	return true
 }
 
-// dumpLinks handles RTM_GETLINK + NLM_F_DUMP requests.
+// dumpLinks handles RTM_GETLINK dump requests.
 func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+	// TODO(b/68878065): Only the dump variant of the types below are
+	// supported.
+	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
+		return syserr.ErrNotSupported
+	}
+
 	// NLM_F_DUMP + RTM_GETLINK messages are supposed to include an
 	// ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some
 	// userspace applications (including glibc) still include rtgenmsg.
@@ -121,8 +129,14 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader
 	return nil
 }
 
-// dumpAddrs handles RTM_GETADDR + NLM_F_DUMP requests.
+// dumpAddrs handles RTM_GETADDR dump requests.
 func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+	// TODO(b/68878065): Only the dump variant of the types below are
+	// supported.
+	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
+		return syserr.ErrNotSupported
+	}
+
 	// RTM_GETADDR dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
@@ -163,22 +177,146 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader
 	return nil
 }
 
-// dumpRoutes handles RTM_GETROUTE + NLM_F_DUMP requests.
+// commonPrefixLen reports the length of the longest IP address prefix.
+// This is a simplied version from Golang's src/net/addrselect.go.
+func commonPrefixLen(a, b []byte) (cpl int) {
+	for len(a) > 0 {
+		if a[0] == b[0] {
+			cpl += 8
+			a = a[1:]
+			b = b[1:]
+			continue
+		}
+		bits := 8
+		ab, bb := a[0], b[0]
+		for {
+			ab >>= 1
+			bb >>= 1
+			bits--
+			if ab == bb {
+				cpl += bits
+				return
+			}
+		}
+	}
+	return
+}
+
+// fillRoute returns the Route using LPM algorithm. Refer to Linux's
+// net/ipv4/route.c:rt_fill_info().
+func fillRoute(routes []inet.Route, addr []byte) (inet.Route, *syserr.Error) {
+	family := uint8(linux.AF_INET)
+	if len(addr) != 4 {
+		family = linux.AF_INET6
+	}
+
+	idx := -1    // Index of the Route rule to be returned.
+	idxDef := -1 // Index of the default route rule.
+	prefix := 0  // Current longest prefix.
+	for i, route := range routes {
+		if route.Family != family {
+			continue
+		}
+
+		if len(route.GatewayAddr) > 0 && route.DstLen == 0 {
+			idxDef = i
+			continue
+		}
+
+		cpl := commonPrefixLen(addr, route.DstAddr)
+		if cpl < int(route.DstLen) {
+			continue
+		}
+		cpl = int(route.DstLen)
+		if cpl > prefix {
+			idx = i
+			prefix = cpl
+		}
+	}
+	if idx == -1 {
+		idx = idxDef
+	}
+	if idx == -1 {
+		return inet.Route{}, syserr.ErrNoRoute
+	}
+
+	route := routes[idx]
+	if family == linux.AF_INET {
+		route.DstLen = 32
+	} else {
+		route.DstLen = 128
+	}
+	route.DstAddr = addr
+	route.Flags |= linux.RTM_F_CLONED // This route is cloned.
+	return route, nil
+}
+
+// parseForDestination parses a message as format of RouteMessage-RtAttr-dst.
+func parseForDestination(data []byte) ([]byte, *syserr.Error) {
+	var rtMsg linux.RouteMessage
+	if len(data) < linux.SizeOfRouteMessage {
+		return nil, syserr.ErrInvalidArgument
+	}
+	binary.Unmarshal(data[:linux.SizeOfRouteMessage], usermem.ByteOrder, &rtMsg)
+	// iproute2 added the RTM_F_LOOKUP_TABLE flag in version v4.4.0. See
+	// commit bc234301af12. Note we don't check this flag for backward
+	// compatibility.
+	if rtMsg.Flags != 0 && rtMsg.Flags != linux.RTM_F_LOOKUP_TABLE {
+		return nil, syserr.ErrNotSupported
+	}
+
+	data = data[linux.SizeOfRouteMessage:]
+
+	// TODO(gvisor.dev/issue/1611): Add generic attribute parsing.
+	var rtAttr linux.RtAttr
+	if len(data) < linux.SizeOfRtAttr {
+		return nil, syserr.ErrInvalidArgument
+	}
+	binary.Unmarshal(data[:linux.SizeOfRtAttr], usermem.ByteOrder, &rtAttr)
+	if rtAttr.Type != linux.RTA_DST {
+		return nil, syserr.ErrInvalidArgument
+	}
+
+	if len(data) < int(rtAttr.Len) {
+		return nil, syserr.ErrInvalidArgument
+	}
+	return data[linux.SizeOfRtAttr:rtAttr.Len], nil
+}
+
+// dumpRoutes handles RTM_GETROUTE requests.
 func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
 	// RTM_GETROUTE dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
 
-	// We always send back an NLMSG_DONE.
-	ms.Multi = true
-
 	stack := inet.StackFromContext(ctx)
 	if stack == nil {
 		// No network routes.
 		return nil
 	}
 
-	for _, rt := range stack.RouteTable() {
+	routeTables := stack.RouteTable()
+
+	if hdr.Flags == linux.NLM_F_REQUEST {
+		dst, err := parseForDestination(data)
+		if err != nil {
+			return err
+		}
+		route, err := fillRoute(routeTables, dst)
+		if err != nil {
+			// TODO(gvisor.dev/issue/1237): return NLMSG_ERROR with ENETUNREACH.
+			return syserr.ErrNotSupported
+		}
+		routeTables = append([]inet.Route{}, route)
+	} else if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP {
+		// We always send back an NLMSG_DONE.
+		ms.Multi = true
+	} else {
+		// TODO(b/68878065): Only above cases are supported.
+		return syserr.ErrNotSupported
+	}
+
+	for _, rt := range routeTables {
 		m := ms.AddMessage(linux.NetlinkMessageHeader{
 			Type: linux.RTM_NEWROUTE,
 		})
@@ -236,12 +374,6 @@ func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageH
 		}
 	}
 
-	// TODO(b/68878065): Only the dump variant of the types below are
-	// supported.
-	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
-		return syserr.ErrNotSupported
-	}
-
 	switch hdr.Type {
 	case linux.RTM_GETLINK:
 		return p.dumpLinks(ctx, hdr, data, ms)
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index ef567f512..1e28e658d 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -442,6 +442,90 @@ TEST(NetlinkRouteTest, GetRouteDump) {
   EXPECT_TRUE(dstFound);
 }
 
+// GetRouteRequest tests a RTM_GETROUTE request with RTM_F_LOOKUP_TABLE flag.
+TEST(NetlinkRouteTest, GetRouteRequest) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+  uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+
+  struct __attribute__((__packed__)) request {
+    struct nlmsghdr hdr;
+    struct rtmsg rtm;
+    struct nlattr nla;
+    struct in_addr sin_addr;
+  };
+
+  constexpr uint32_t kSeq = 12345;
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETROUTE;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+
+  req.rtm.rtm_family = AF_INET;
+  req.rtm.rtm_dst_len = 32;
+  req.rtm.rtm_src_len = 0;
+  req.rtm.rtm_tos = 0;
+  req.rtm.rtm_table = RT_TABLE_UNSPEC;
+  req.rtm.rtm_protocol = RTPROT_UNSPEC;
+  req.rtm.rtm_scope = RT_SCOPE_UNIVERSE;
+  req.rtm.rtm_type = RTN_UNSPEC;
+  req.rtm.rtm_flags = RTM_F_LOOKUP_TABLE;
+
+  req.nla.nla_len = 8;
+  req.nla.nla_type = RTA_DST;
+  inet_aton("127.0.0.2", &req.sin_addr);
+
+  bool rtDstFound = false;
+  ASSERT_NO_ERRNO(NetlinkRequestResponseSingle(
+      fd, &req, sizeof(req), [&](const struct nlmsghdr* hdr) {
+        // Validate the reponse to RTM_GETROUTE request with RTM_F_LOOKUP_TABLE
+        // flag.
+        EXPECT_THAT(hdr->nlmsg_type, RTM_NEWROUTE);
+
+        EXPECT_TRUE(hdr->nlmsg_flags == 0) << std::hex << hdr->nlmsg_flags;
+
+        EXPECT_EQ(hdr->nlmsg_seq, kSeq);
+        EXPECT_EQ(hdr->nlmsg_pid, port);
+
+        // RTM_NEWROUTE contains at least the header and rtmsg.
+        ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct rtmsg)));
+        const struct rtmsg* msg =
+            reinterpret_cast<const struct rtmsg*>(NLMSG_DATA(hdr));
+
+        // NOTE: rtmsg fields are char fields.
+        std::cout << "Found route table=" << static_cast<int>(msg->rtm_table)
+                  << ", protocol=" << static_cast<int>(msg->rtm_protocol)
+                  << ", scope=" << static_cast<int>(msg->rtm_scope)
+                  << ", type=" << static_cast<int>(msg->rtm_type);
+
+        EXPECT_EQ(msg->rtm_family, AF_INET);
+        EXPECT_EQ(msg->rtm_dst_len, 32);
+        EXPECT_TRUE((msg->rtm_flags & RTM_F_CLONED) == RTM_F_CLONED)
+            << std::hex << msg->rtm_flags;
+
+        int len = RTM_PAYLOAD(hdr);
+        std::cout << ", len=" << len;
+        for (struct rtattr* attr = RTM_RTA(msg); RTA_OK(attr, len);
+             attr = RTA_NEXT(attr, len)) {
+          if (attr->rta_type == RTA_DST) {
+            char address[INET_ADDRSTRLEN] = {};
+            inet_ntop(AF_INET, RTA_DATA(attr), address, sizeof(address));
+            std::cout << ", dst=" << address;
+            rtDstFound = true;
+          } else if (attr->rta_type == RTA_OIF) {
+            const char* oif = reinterpret_cast<const char*>(RTA_DATA(attr));
+            std::cout << ", oif=" << oif;
+          }
+        }
+
+        std::cout << std::endl;
+      }));
+  // Found RTA_DST for RTM_F_LOOKUP_TABLE.
+  EXPECT_TRUE(rtDstFound);
+}
+
 // RecvmsgTrunc tests the recvmsg MSG_TRUNC flag with zero length output
 // buffer. MSG_TRUNC with a zero length buffer should consume subsequent
 // messages off the socket.
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index 723f5d728..cd2212a1a 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -108,5 +108,43 @@ PosixError NetlinkRequestResponse(
   return NoError();
 }
 
+PosixError NetlinkRequestResponseSingle(
+    const FileDescriptor& fd, void* request, size_t len,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+  struct iovec iov = {};
+  iov.iov_base = request;
+  iov.iov_len = len;
+
+  struct msghdr msg = {};
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+  // No destination required; it defaults to pid 0, the kernel.
+
+  RETURN_ERROR_IF_SYSCALL_FAIL(RetryEINTR(sendmsg)(fd.get(), &msg, 0));
+
+  constexpr size_t kBufferSize = 4096;
+  std::vector<char> buf(kBufferSize);
+  iov.iov_base = buf.data();
+  iov.iov_len = buf.size();
+
+  int ret;
+  RETURN_ERROR_IF_SYSCALL_FAIL(ret = RetryEINTR(recvmsg)(fd.get(), &msg, 0));
+
+  // We don't bother with the complexity of dealing with truncated messages.
+  // We must allocate a large enough buffer up front.
+  if ((msg.msg_flags & MSG_TRUNC) == MSG_TRUNC) {
+    return PosixError(
+        EIO,
+        absl::StrCat("Received truncated message with flags: ", msg.msg_flags));
+  }
+
+  for (struct nlmsghdr* hdr = reinterpret_cast<struct nlmsghdr*>(buf.data());
+       NLMSG_OK(hdr, ret); hdr = NLMSG_NEXT(hdr, ret)) {
+    fn(hdr);
+  }
+
+  return NoError();
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index 76e772c48..3678c0599 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -32,12 +32,21 @@ PosixErrorOr<FileDescriptor> NetlinkBoundSocket(int protocol);
 // Returns the port ID of the passed socket.
 PosixErrorOr<uint32_t> NetlinkPortID(int fd);
 
-// Send the passed request and call fn will all response netlink messages.
+// Send the passed request and call fn on all response netlink messages.
+//
+// To be used on requests with NLM_F_MULTI reponses.
 PosixError NetlinkRequestResponse(
     const FileDescriptor& fd, void* request, size_t len,
     const std::function<void(const struct nlmsghdr* hdr)>& fn,
     bool expect_nlmsgerr);
 
+// Send the passed request and call fn on all response netlink messages.
+//
+// To be used on requests without NLM_F_MULTI reponses.
+PosixError NetlinkRequestResponseSingle(
+    const FileDescriptor& fd, void* request, size_t len,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn);
+
 }  // namespace testing
 }  // namespace gvisor
 
-- 
cgit v1.2.3


From 34fbd8446c386fb0136dad31ab6b173f17049a58 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 28 Jan 2020 13:10:41 -0800
Subject: Add VFS2 support for epoll.

PiperOrigin-RevId: 291997879
---
 pkg/abi/linux/epoll.go             |  10 +-
 pkg/sentry/vfs/BUILD               |  15 ++
 pkg/sentry/vfs/epoll.go            | 377 +++++++++++++++++++++++++++++++++++++
 pkg/sentry/vfs/file_description.go |  38 ++++
 pkg/sentry/vfs/vfs.go              |  16 +-
 5 files changed, 448 insertions(+), 8 deletions(-)
 create mode 100644 pkg/sentry/vfs/epoll.go

diff --git a/pkg/abi/linux/epoll.go b/pkg/abi/linux/epoll.go
index 72083b604..0e881aa3c 100644
--- a/pkg/abi/linux/epoll.go
+++ b/pkg/abi/linux/epoll.go
@@ -38,8 +38,14 @@ const (
 
 // Per-file descriptor flags.
 const (
-	EPOLLET      = 0x80000000
-	EPOLLONESHOT = 0x40000000
+	EPOLLEXCLUSIVE = 1 << 28
+	EPOLLWAKEUP    = 1 << 29
+	EPOLLONESHOT   = 1 << 30
+	EPOLLET        = 1 << 31
+
+	// EP_PRIVATE_BITS is fs/eventpoll.c:EP_PRIVATE_BITS, the set of all bits
+	// in an epoll event mask that correspond to flags rather than I/O events.
+	EP_PRIVATE_BITS = EPOLLEXCLUSIVE | EPOLLWAKEUP | EPOLLONESHOT | EPOLLET
 )
 
 // Operation flags.
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 33516e6f7..ced9d07b1 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -1,7 +1,20 @@
 load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "epoll_interest_list",
+    out = "epoll_interest_list.go",
+    package = "vfs",
+    prefix = "epollInterest",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*epollInterest",
+        "Linker": "*epollInterest",
+    },
+)
+
 go_library(
     name = "vfs",
     srcs = [
@@ -10,6 +23,8 @@ go_library(
         "debug.go",
         "dentry.go",
         "device.go",
+        "epoll.go",
+        "epoll_interest_list.go",
         "file_description.go",
         "file_description_impl_util.go",
         "filesystem.go",
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
new file mode 100644
index 000000000..7c83f9a5a
--- /dev/null
+++ b/pkg/sentry/vfs/epoll.go
@@ -0,0 +1,377 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// epollCycleMu serializes attempts to register EpollInstances with other
+// EpollInstances in order to check for cycles.
+var epollCycleMu sync.Mutex
+
+// EpollInstance represents an epoll instance, as described by epoll(7).
+type EpollInstance struct {
+	vfsfd FileDescription
+	FileDescriptionDefaultImpl
+	DentryMetadataFileDescriptionImpl
+
+	// q holds waiters on this EpollInstance.
+	q waiter.Queue
+
+	// interest is the set of file descriptors that are registered with the
+	// EpollInstance for monitoring. interest is protected by interestMu.
+	interestMu sync.Mutex
+	interest   map[epollInterestKey]*epollInterest
+
+	// mu protects fields in registered epollInterests.
+	mu sync.Mutex
+
+	// ready is the set of file descriptors that may be "ready" for I/O. Note
+	// that this must be an ordered list, not a map: "If more than maxevents
+	// file descriptors are ready when epoll_wait() is called, then successive
+	// epoll_wait() calls will round robin through the set of ready file
+	// descriptors. This behavior helps avoid starvation scenarios, where a
+	// process fails to notice that additional file descriptors are ready
+	// because it focuses on a set of file descriptors that are already known
+	// to be ready." - epoll_wait(2)
+	ready epollInterestList
+}
+
+type epollInterestKey struct {
+	// file is the registered FileDescription. No reference is held on file;
+	// instead, when the last reference is dropped, FileDescription.DecRef()
+	// removes the FileDescription from all EpollInstances. file is immutable.
+	file *FileDescription
+
+	// num is the file descriptor number with which this entry was registered.
+	// num is immutable.
+	num int32
+}
+
+// epollInterest represents an EpollInstance's interest in a file descriptor.
+type epollInterest struct {
+	// epoll is the owning EpollInstance. epoll is immutable.
+	epoll *EpollInstance
+
+	// key is the file to which this epollInterest applies. key is immutable.
+	key epollInterestKey
+
+	// waiter is registered with key.file. entry is protected by epoll.mu.
+	waiter waiter.Entry
+
+	// mask is the event mask associated with this registration, including
+	// flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.mu.
+	mask uint32
+
+	// ready is true if epollInterestEntry is linked into epoll.ready. ready
+	// and epollInterestEntry are protected by epoll.mu.
+	ready bool
+	epollInterestEntry
+
+	// userData is the epoll_data_t associated with this epollInterest.
+	// userData is protected by epoll.mu.
+	userData [2]int32
+}
+
+// NewEpollInstanceFD returns a FileDescription representing a new epoll
+// instance. A reference is taken on the returned FileDescription.
+func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
+	vd := vfs.NewAnonVirtualDentry("[eventpoll]")
+	defer vd.DecRef()
+	ep := &EpollInstance{
+		interest: make(map[epollInterestKey]*epollInterest),
+	}
+	if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &ep.vfsfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release.
+func (ep *EpollInstance) Release() {
+	// Unregister all polled fds.
+	ep.interestMu.Lock()
+	defer ep.interestMu.Unlock()
+	for key, epi := range ep.interest {
+		file := key.file
+		file.epollMu.Lock()
+		delete(file.epolls, epi)
+		file.epollMu.Unlock()
+		file.EventUnregister(&epi.waiter)
+	}
+	ep.interest = nil
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask {
+	if mask&waiter.EventIn == 0 {
+		return 0
+	}
+	ep.mu.Lock()
+	for epi := ep.ready.Front(); epi != nil; epi = epi.Next() {
+		wmask := waiter.EventMaskFromLinux(epi.mask)
+		if epi.key.file.Readiness(wmask)&wmask != 0 {
+			ep.mu.Unlock()
+			return waiter.EventIn
+		}
+	}
+	ep.mu.Unlock()
+	return 0
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (ep *EpollInstance) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	ep.q.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (ep *EpollInstance) EventUnregister(e *waiter.Entry) {
+	ep.q.EventUnregister(e)
+}
+
+// Seek implements FileDescriptionImpl.Seek.
+func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	// Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek
+	return 0, nil
+}
+
+// AddInterest implements the semantics of EPOLL_CTL_ADD.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+	// Check for cyclic polling if necessary.
+	subep, _ := file.impl.(*EpollInstance)
+	if subep != nil {
+		epollCycleMu.Lock()
+		// epollCycleMu must be locked for the rest of AddInterest to ensure
+		// that cyclic polling is not introduced after the check.
+		defer epollCycleMu.Unlock()
+		if subep.mightPoll(ep) {
+			return syserror.ELOOP
+		}
+	}
+
+	ep.interestMu.Lock()
+	defer ep.interestMu.Unlock()
+
+	// Fail if the key is already registered.
+	key := epollInterestKey{
+		file: file,
+		num:  num,
+	}
+	if _, ok := ep.interest[key]; ok {
+		return syserror.EEXIST
+	}
+
+	// Register interest in file.
+	mask |= linux.EPOLLERR | linux.EPOLLRDHUP
+	epi := &epollInterest{
+		epoll:    ep,
+		key:      key,
+		mask:     mask,
+		userData: userData,
+	}
+	ep.interest[key] = epi
+	wmask := waiter.EventMaskFromLinux(mask)
+	file.EventRegister(&epi.waiter, wmask)
+
+	// Check if the file is already ready.
+	if file.Readiness(wmask)&wmask != 0 {
+		epi.Callback(nil)
+	}
+
+	// Add epi to file.epolls so that it is removed when the last
+	// FileDescription reference is dropped.
+	file.epollMu.Lock()
+	file.epolls[epi] = struct{}{}
+	file.epollMu.Unlock()
+
+	return nil
+}
+
+func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool {
+	return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS
+}
+
+func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool {
+	ep.interestMu.Lock()
+	defer ep.interestMu.Unlock()
+	for key := range ep.interest {
+		nextep, ok := key.file.impl.(*EpollInstance)
+		if !ok {
+			continue
+		}
+		if nextep == ep2 {
+			return true
+		}
+		if remainingRecursion == 0 {
+			return true
+		}
+		if nextep.mightPollRecursive(ep2, remainingRecursion-1) {
+			return true
+		}
+	}
+	return false
+}
+
+// ModifyInterest implements the semantics of EPOLL_CTL_MOD.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+	ep.interestMu.Lock()
+	defer ep.interestMu.Unlock()
+
+	// Fail if the key is not already registered.
+	epi, ok := ep.interest[epollInterestKey{
+		file: file,
+		num:  num,
+	}]
+	if !ok {
+		return syserror.ENOENT
+	}
+
+	// Update epi for the next call to ep.ReadEvents().
+	ep.mu.Lock()
+	epi.mask = mask
+	epi.userData = userData
+	ep.mu.Unlock()
+
+	// Re-register with the new mask.
+	mask |= linux.EPOLLERR | linux.EPOLLRDHUP
+	file.EventUnregister(&epi.waiter)
+	wmask := waiter.EventMaskFromLinux(mask)
+	file.EventRegister(&epi.waiter, wmask)
+
+	// Check if the file is already ready with the new mask.
+	if file.Readiness(wmask)&wmask != 0 {
+		epi.Callback(nil)
+	}
+
+	return nil
+}
+
+// DeleteInterest implements the semantics of EPOLL_CTL_DEL.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error {
+	ep.interestMu.Lock()
+	defer ep.interestMu.Unlock()
+
+	// Fail if the key is not already registered.
+	epi, ok := ep.interest[epollInterestKey{
+		file: file,
+		num:  num,
+	}]
+	if !ok {
+		return syserror.ENOENT
+	}
+
+	// Unregister from the file so that epi will no longer be readied.
+	file.EventUnregister(&epi.waiter)
+
+	// Forget about epi.
+	ep.removeLocked(epi)
+
+	file.epollMu.Lock()
+	delete(file.epolls, epi)
+	file.epollMu.Unlock()
+
+	return nil
+}
+
+// Callback implements waiter.EntryCallback.Callback.
+func (epi *epollInterest) Callback(*waiter.Entry) {
+	newReady := false
+	epi.epoll.mu.Lock()
+	if !epi.ready {
+		newReady = true
+		epi.ready = true
+		epi.epoll.ready.PushBack(epi)
+	}
+	epi.epoll.mu.Unlock()
+	if newReady {
+		epi.epoll.q.Notify(waiter.EventIn)
+	}
+}
+
+// Preconditions: ep.interestMu must be locked.
+func (ep *EpollInstance) removeLocked(epi *epollInterest) {
+	delete(ep.interest, epi.key)
+	ep.mu.Lock()
+	if epi.ready {
+		epi.ready = false
+		ep.ready.Remove(epi)
+	}
+	ep.mu.Unlock()
+}
+
+// ReadEvents reads up to len(events) ready events into events and returns the
+// number of events read.
+//
+// Preconditions: len(events) != 0.
+func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
+	i := 0
+	// Hot path: avoid defer.
+	ep.mu.Lock()
+	var next *epollInterest
+	var requeue epollInterestList
+	for epi := ep.ready.Front(); epi != nil; epi = next {
+		next = epi.Next()
+		// Regardless of what else happens, epi is initially removed from the
+		// ready list.
+		ep.ready.Remove(epi)
+		wmask := waiter.EventMaskFromLinux(epi.mask)
+		ievents := epi.key.file.Readiness(wmask) & wmask
+		if ievents == 0 {
+			// Leave epi off the ready list.
+			epi.ready = false
+			continue
+		}
+		// Determine what we should do with epi.
+		switch {
+		case epi.mask&linux.EPOLLONESHOT != 0:
+			// Clear all events from the mask; they must be re-added by
+			// EPOLL_CTL_MOD.
+			epi.mask &= linux.EP_PRIVATE_BITS
+			fallthrough
+		case epi.mask&linux.EPOLLET != 0:
+			// Leave epi off the ready list.
+			epi.ready = false
+		default:
+			// Queue epi to be moved to the end of the ready list.
+			requeue.PushBack(epi)
+		}
+		// Report ievents.
+		events[i] = linux.EpollEvent{
+			Events: ievents.ToLinux(),
+			Fd:     epi.userData[0],
+			Data:   epi.userData[1],
+		}
+		i++
+		if i == len(events) {
+			break
+		}
+	}
+	ep.ready.PushBackList(&requeue)
+	ep.mu.Unlock()
+	return i
+}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 225024463..badacb55e 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -45,6 +46,11 @@ type FileDescription struct {
 	// memory operations.
 	statusFlags uint32
 
+	// epolls is the set of epollInterests registered for this FileDescription.
+	// epolls is protected by epollMu.
+	epollMu sync.Mutex
+	epolls  map[*epollInterest]struct{}
+
 	// vd is the filesystem location at which this FileDescription was opened.
 	// A reference is held on vd. vd is immutable.
 	vd VirtualDentry
@@ -141,6 +147,23 @@ func (fd *FileDescription) TryIncRef() bool {
 // DecRef decrements fd's reference count.
 func (fd *FileDescription) DecRef() {
 	if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
+		// Unregister fd from all epoll instances.
+		fd.epollMu.Lock()
+		epolls := fd.epolls
+		fd.epolls = nil
+		fd.epollMu.Unlock()
+		for epi := range epolls {
+			ep := epi.epoll
+			ep.interestMu.Lock()
+			// Check that epi has not been concurrently unregistered by
+			// EpollInstance.DeleteInterest() or EpollInstance.Release().
+			if _, ok := ep.interest[epi.key]; ok {
+				fd.EventUnregister(&epi.waiter)
+				ep.removeLocked(epi)
+			}
+			ep.interestMu.Unlock()
+		}
+		// Release implementation resources.
 		fd.impl.Release()
 		if fd.writable {
 			fd.vd.mount.EndWrite()
@@ -453,6 +476,21 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
 	return fd.impl.StatFS(ctx)
 }
 
+// Readiness returns fd's I/O readiness.
+func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fd.impl.Readiness(mask)
+}
+
+// EventRegister registers e for I/O readiness events in mask.
+func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fd.impl.EventRegister(e, mask)
+}
+
+// EventUnregister unregisters e for I/O readiness events.
+func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
+	fd.impl.EventUnregister(e)
+}
+
 // PRead reads from the file represented by fd into dst, starting at the given
 // offset, and returns the number of bytes read. PRead is permitted to return
 // partial reads with a nil error.
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index d730530b9..908c69f91 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -16,15 +16,19 @@
 //
 // Lock order:
 //
-// FilesystemImpl/FileDescriptionImpl locks
-//   VirtualFilesystem.mountMu
-//     Dentry.mu
-//       Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
-//     VirtualFilesystem.filesystemsMu
+// EpollInstance.interestMu
+//   FileDescription.epollMu
+//     FilesystemImpl/FileDescriptionImpl locks
+//       VirtualFilesystem.mountMu
+//         Dentry.mu
+//           Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
+//         VirtualFilesystem.filesystemsMu
+//       EpollInstance.mu
 // VirtualFilesystem.fsTypesMu
 //
 // Locking Dentry.mu in multiple Dentries requires holding
-// VirtualFilesystem.mountMu.
+// VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple
+// EpollInstances requires holding epollCycleMu.
 package vfs
 
 import (
-- 
cgit v1.2.3


From f263801a74d4ccac042b068d0928c8738e40af5b Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 28 Jan 2020 13:36:16 -0800
Subject: fs/splice: don't report partial errors for special files

Special files can have additional requirements for granularity.
For example, read from eventfd returns EINVAL if a size is less 8 bytes.

Reported-by: syzbot+3905f5493bec08eb7b02@syzkaller.appspotmail.com
PiperOrigin-RevId: 292002926
---
 pkg/sentry/fs/attr.go                   |  5 +++++
 pkg/sentry/fs/file.go                   |  7 -------
 pkg/sentry/fs/splice.go                 |  5 -----
 pkg/sentry/syscalls/linux/sys_splice.go | 19 +++++++++++++++----
 test/syscalls/linux/eventfd.cc          | 25 +++++++++++++++++++++++++
 5 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go
index fa9e7d517..f60bd423d 100644
--- a/pkg/sentry/fs/attr.go
+++ b/pkg/sentry/fs/attr.go
@@ -206,6 +206,11 @@ func IsPipe(s StableAttr) bool {
 	return s.Type == Pipe
 }
 
+// IsAnonymous returns true if StableAttr.Type matches any type of anonymous.
+func IsAnonymous(s StableAttr) bool {
+	return s.Type == Anonymous
+}
+
 // IsSocket returns true if StableAttr.Type matches any type of socket.
 func IsSocket(s StableAttr) bool {
 	return s.Type == Socket
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index ca3466f4f..78100e448 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -555,10 +555,6 @@ type lockedWriter struct {
 	//
 	// This applies only to Write, not WriteAt.
 	Offset int64
-
-	// Err contains the first error encountered while copying. This is
-	// useful to determine whether Writer or Reader failed during io.Copy.
-	Err error
 }
 
 // Write implements io.Writer.Write.
@@ -594,8 +590,5 @@ func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) {
 			break
 		}
 	}
-	if w.Err == nil {
-		w.Err = err
-	}
 	return written, err
 }
diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go
index 791d1526c..33da82868 100644
--- a/pkg/sentry/fs/splice.go
+++ b/pkg/sentry/fs/splice.go
@@ -167,11 +167,6 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64,
 		if !srcPipe && !opts.SrcOffset {
 			atomic.StoreInt64(&src.offset, src.offset+n)
 		}
-
-		// Don't report any errors if we have some progress without data loss.
-		if w.Err == nil {
-			err = nil
-		}
 	}
 
 	// Drop locks.
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index dd3a5807f..f43d6c155 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -211,8 +211,10 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	opts := fs.SpliceOpts{
 		Length: count,
 	}
+	inFileAttr := inFile.Dirent.Inode.StableAttr
+	outFileAttr := outFile.Dirent.Inode.StableAttr
 	switch {
-	case fs.IsPipe(inFile.Dirent.Inode.StableAttr) && !fs.IsPipe(outFile.Dirent.Inode.StableAttr):
+	case fs.IsPipe(inFileAttr) && !fs.IsPipe(outFileAttr):
 		if inOffset != 0 {
 			return 0, nil, syserror.ESPIPE
 		}
@@ -229,7 +231,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			opts.DstOffset = true
 			opts.DstStart = offset
 		}
-	case !fs.IsPipe(inFile.Dirent.Inode.StableAttr) && fs.IsPipe(outFile.Dirent.Inode.StableAttr):
+	case !fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr):
 		if outOffset != 0 {
 			return 0, nil, syserror.ESPIPE
 		}
@@ -246,13 +248,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			opts.SrcOffset = true
 			opts.SrcStart = offset
 		}
-	case fs.IsPipe(inFile.Dirent.Inode.StableAttr) && fs.IsPipe(outFile.Dirent.Inode.StableAttr):
+	case fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr):
 		if inOffset != 0 || outOffset != 0 {
 			return 0, nil, syserror.ESPIPE
 		}
 
 		// We may not refer to the same pipe; otherwise it's a continuous loop.
-		if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID {
+		if inFileAttr.InodeID == outFileAttr.InodeID {
 			return 0, nil, syserror.EINVAL
 		}
 	default:
@@ -262,6 +264,15 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	// Splice data.
 	n, err := doSplice(t, outFile, inFile, opts, nonBlock)
 
+	// Special files can have additional requirements for granularity.  For
+	// example, read from eventfd returns EINVAL if a size is less 8 bytes.
+	// Inotify is another example. read will return EINVAL is a buffer is
+	// too small to return the next event, but a size of an event isn't
+	// fixed, it is sizeof(struct inotify_event) + {NAME_LEN} + 1.
+	if n != 0 && err != nil && (fs.IsAnonymous(inFileAttr) || fs.IsAnonymous(outFileAttr)) {
+		err = nil
+	}
+
 	// See above; inFile is chosen arbitrarily here.
 	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "splice", inFile)
 }
diff --git a/test/syscalls/linux/eventfd.cc b/test/syscalls/linux/eventfd.cc
index 367682c3d..927001eee 100644
--- a/test/syscalls/linux/eventfd.cc
+++ b/test/syscalls/linux/eventfd.cc
@@ -132,6 +132,31 @@ TEST(EventfdTest, BigWriteBigRead) {
   EXPECT_EQ(l[0], 1);
 }
 
+TEST(EventfdTest, SpliceFromPipePartialSucceeds) {
+  int pipes[2];
+  ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds());
+  const FileDescriptor pipe_rfd(pipes[0]);
+  const FileDescriptor pipe_wfd(pipes[1]);
+  constexpr uint64_t kVal{1};
+
+  FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK));
+
+  uint64_t event_array[2];
+  event_array[0] = kVal;
+  event_array[1] = kVal;
+  ASSERT_THAT(write(pipe_wfd.get(), event_array, sizeof(event_array)),
+              SyscallSucceedsWithValue(sizeof(event_array)));
+  EXPECT_THAT(splice(pipe_rfd.get(), /*__offin=*/nullptr, efd.get(),
+                     /*__offout=*/nullptr, sizeof(event_array[0]) + 1,
+                     SPLICE_F_NONBLOCK),
+              SyscallSucceedsWithValue(sizeof(event_array[0])));
+
+  uint64_t val;
+  ASSERT_THAT(read(efd.get(), &val, sizeof(val)),
+              SyscallSucceedsWithValue(sizeof(val)));
+  EXPECT_EQ(val, kVal);
+}
+
 // NotifyNonZero is inherently racy, so random save is disabled.
 TEST(EventfdTest, NotifyNonZero_NoRandomSave) {
   // Waits will time out at 10 seconds.
-- 
cgit v1.2.3


From ce0bac4be9d808877248c328fac07ff0d66b9607 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 28 Jan 2020 13:37:10 -0800
Subject: Include the NDP Source Link Layer option when sending DAD messages

Test: stack_test.TestDADResolve
PiperOrigin-RevId: 292003124
---
 pkg/tcpip/checker/checker.go    | 50 +++++++++++++++++++++++++++++++++++++++++
 pkg/tcpip/header/icmpv6.go      |  2 +-
 pkg/tcpip/header/ndp_options.go | 35 ++++++++++++++++++++++++++---
 pkg/tcpip/stack/ndp.go          | 22 ++++++++++++++++--
 pkg/tcpip/stack/ndp_test.go     |  6 ++++-
 5 files changed, 108 insertions(+), 7 deletions(-)

diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 885d773b0..4d6ae0871 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -771,6 +771,56 @@ func NDPNSTargetAddress(want tcpip.Address) TransportChecker {
 	}
 }
 
+// NDPNSOptions creates a checker that checks that the packet contains the
+// provided NDP options within an NDP Neighbor Solicitation message.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid NDPNS message as far as the size is concerned.
+func NDPNSOptions(opts []header.NDPOption) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmp := h.(header.ICMPv6)
+		ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+		it, err := ns.Options().Iter(true)
+		if err != nil {
+			t.Errorf("opts.Iter(true): %s", err)
+			return
+		}
+
+		i := 0
+		for {
+			opt, done, _ := it.Next()
+			if done {
+				break
+			}
+
+			if i >= len(opts) {
+				t.Errorf("got unexpected option: %s", opt)
+				continue
+			}
+
+			switch wantOpt := opts[i].(type) {
+			case header.NDPSourceLinkLayerAddressOption:
+				gotOpt, ok := opt.(header.NDPSourceLinkLayerAddressOption)
+				if !ok {
+					t.Errorf("got type = %T at index = %d; want = %T", opt, i, wantOpt)
+				} else if got, want := gotOpt.EthernetAddress(), wantOpt.EthernetAddress(); got != want {
+					t.Errorf("got EthernetAddress() = %s at index %d, want = %s", got, i, want)
+				}
+			default:
+				panic("not implemented")
+			}
+
+			i++
+		}
+
+		if missing := opts[i:]; len(missing) > 0 {
+			t.Errorf("missing options: %s", missing)
+		}
+	}
+}
+
 // NDPRS creates a checker that checks that the packet contains a valid NDP
 // Router Solicitation message (as per the raw wire format).
 func NDPRS() NetworkChecker {
diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go
index b095dc0ab..c7ee2de57 100644
--- a/pkg/tcpip/header/icmpv6.go
+++ b/pkg/tcpip/header/icmpv6.go
@@ -52,7 +52,7 @@ const (
 	// ICMPv6NeighborAdvertSize is size of a neighbor advertisement
 	// including the NDP Target Link Layer option for an Ethernet
 	// address.
-	ICMPv6NeighborAdvertSize = ICMPv6HeaderSize + NDPNAMinimumSize + ndpLinkLayerAddressSize
+	ICMPv6NeighborAdvertSize = ICMPv6HeaderSize + NDPNAMinimumSize + NDPLinkLayerAddressSize
 
 	// ICMPv6EchoMinimumSize is the minimum size of a valid ICMP echo packet.
 	ICMPv6EchoMinimumSize = 8
diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
index 1e60f3d4f..e6a6ad39b 100644
--- a/pkg/tcpip/header/ndp_options.go
+++ b/pkg/tcpip/header/ndp_options.go
@@ -17,6 +17,7 @@ package header
 import (
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"math"
 	"time"
 
@@ -32,9 +33,9 @@ const (
 	// Address option, as per RFC 4861 section 4.6.1.
 	NDPTargetLinkLayerAddressOptionType = 2
 
-	// ndpLinkLayerAddressSize is the size of a Source or Target Link Layer
-	// Address option.
-	ndpLinkLayerAddressSize = 8
+	// NDPLinkLayerAddressSize is the size of a Source or Target Link Layer
+	// Address option for an Ethernet address.
+	NDPLinkLayerAddressSize = 8
 
 	// NDPPrefixInformationType is the type of the Prefix Information
 	// option, as per RFC 4861 section 4.6.2.
@@ -300,6 +301,8 @@ func (b NDPOptions) Serialize(s NDPOptionsSerializer) int {
 
 // NDPOption is the set of functions to be implemented by all NDP option types.
 type NDPOption interface {
+	fmt.Stringer
+
 	// Type returns the type of the receiver.
 	Type() uint8
 
@@ -397,6 +400,11 @@ func (o NDPSourceLinkLayerAddressOption) serializeInto(b []byte) int {
 	return copy(b, o)
 }
 
+// String implements fmt.Stringer.String.
+func (o NDPSourceLinkLayerAddressOption) String() string {
+	return fmt.Sprintf("%T(%s)", o, tcpip.LinkAddress(o))
+}
+
 // EthernetAddress will return an ethernet (MAC) address if the
 // NDPSourceLinkLayerAddressOption's body has at minimum EthernetAddressSize
 // bytes. If the body has more than EthernetAddressSize bytes, only the first
@@ -432,6 +440,11 @@ func (o NDPTargetLinkLayerAddressOption) serializeInto(b []byte) int {
 	return copy(b, o)
 }
 
+// String implements fmt.Stringer.String.
+func (o NDPTargetLinkLayerAddressOption) String() string {
+	return fmt.Sprintf("%T(%s)", o, tcpip.LinkAddress(o))
+}
+
 // EthernetAddress will return an ethernet (MAC) address if the
 // NDPTargetLinkLayerAddressOption's body has at minimum EthernetAddressSize
 // bytes. If the body has more than EthernetAddressSize bytes, only the first
@@ -478,6 +491,17 @@ func (o NDPPrefixInformation) serializeInto(b []byte) int {
 	return used
 }
 
+// String implements fmt.Stringer.String.
+func (o NDPPrefixInformation) String() string {
+	return fmt.Sprintf("%T(O=%t, A=%t, PL=%s, VL=%s, Prefix=%s)",
+		o,
+		o.OnLinkFlag(),
+		o.AutonomousAddressConfigurationFlag(),
+		o.PreferredLifetime(),
+		o.ValidLifetime(),
+		o.Subnet())
+}
+
 // PrefixLength returns the value in the number of leading bits in the Prefix
 // that are valid.
 //
@@ -587,6 +611,11 @@ func (o NDPRecursiveDNSServer) serializeInto(b []byte) int {
 	return used
 }
 
+// String implements fmt.Stringer.String.
+func (o NDPRecursiveDNSServer) String() string {
+	return fmt.Sprintf("%T(%s valid for %s)", o, o.Addresses(), o.Lifetime())
+}
+
 // Lifetime returns the length of time that the DNS server addresses
 // in this option may be used for name resolution.
 //
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index d983ac390..245694118 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -538,11 +538,29 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error {
 	r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 	defer r.Release()
 
-	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
-	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
+	linkAddr := ndp.nic.linkEP.LinkAddress()
+	isValidLinkAddr := header.IsValidUnicastEthernetAddress(linkAddr)
+	ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize
+	if isValidLinkAddr {
+		// Only include a Source Link Layer Address option if the NIC has a valid
+		// link layer address.
+		//
+		// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
+		// LinkEndpoint.LinkAddress) before reaching this point.
+		ndpNSSize += header.NDPLinkLayerAddressSize
+	}
+
+	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + ndpNSSize)
+	pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
 	pkt.SetType(header.ICMPv6NeighborSolicit)
 	ns := header.NDPNeighborSolicit(pkt.NDPPayload())
 	ns.SetTargetAddress(addr)
+
+	if isValidLinkAddr {
+		ns.Options().Serialize(header.NDPOptionsSerializer{
+			header.NDPSourceLinkLayerAddressOption(linkAddr),
+		})
+	}
 	pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
 	sent := r.Stats().ICMP.V6PacketsSent
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index ad2c6f601..726468e41 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -417,7 +417,11 @@ func TestDADResolve(t *testing.T) {
 				checker.IPv6(t, p.Pkt.Header.View().ToVectorisedView().First(),
 					checker.TTL(header.NDPHopLimit),
 					checker.NDPNS(
-						checker.NDPNSTargetAddress(addr1)))
+						checker.NDPNSTargetAddress(addr1),
+						checker.NDPNSOptions([]header.NDPOption{
+							header.NDPSourceLinkLayerAddressOption(linkAddr1),
+						}),
+					))
 			}
 		})
 	}
-- 
cgit v1.2.3


From 2862b0b1be9ce821e86877802b9608aad3102916 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 28 Jan 2020 15:04:34 -0800
Subject: Add //pkg/sentry/fsimpl/devtmpfs.

PiperOrigin-RevId: 292021389
---
 pkg/sentry/fsimpl/devtmpfs/BUILD            |  33 +++++
 pkg/sentry/fsimpl/devtmpfs/devtmpfs.go      | 187 ++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go | 119 ++++++++++++++++++
 pkg/sentry/fsimpl/tmpfs/BUILD               |   1 +
 4 files changed, 340 insertions(+)
 create mode 100644 pkg/sentry/fsimpl/devtmpfs/BUILD
 create mode 100644 pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
 create mode 100644 pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go

diff --git a/pkg/sentry/fsimpl/devtmpfs/BUILD b/pkg/sentry/fsimpl/devtmpfs/BUILD
new file mode 100644
index 000000000..aa0c2ad8c
--- /dev/null
+++ b/pkg/sentry/fsimpl/devtmpfs/BUILD
@@ -0,0 +1,33 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+licenses(["notice"])
+
+go_library(
+    name = "devtmpfs",
+    srcs = ["devtmpfs.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fsimpl/tmpfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+    ],
+)
+
+go_test(
+    name = "devtmpfs_test",
+    size = "small",
+    srcs = ["devtmpfs_test.go"],
+    library = ":devtmpfs",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/fspath",
+        "//pkg/sentry/contexttest",
+        "//pkg/sentry/fsimpl/tmpfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
new file mode 100644
index 000000000..d36fa74fb
--- /dev/null
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -0,0 +1,187 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package devtmpfs provides an implementation of /dev based on tmpfs,
+// analogous to Linux's devtmpfs.
+package devtmpfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct {
+	initOnce sync.Once
+	initErr  error
+
+	// fs is the tmpfs filesystem that backs all mounts of this FilesystemType.
+	// root is fs' root. fs and root are immutable.
+	fs   *vfs.Filesystem
+	root *vfs.Dentry
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	fst.initOnce.Do(func() {
+		fs, root, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "" /* source */, vfs.GetFilesystemOptions{
+			Data: "mode=0755", // opts from drivers/base/devtmpfs.c:devtmpfs_init()
+		})
+		if err != nil {
+			fst.initErr = err
+			return
+		}
+		fst.fs = fs
+		fst.root = root
+	})
+	if fst.initErr != nil {
+		return nil, nil, fst.initErr
+	}
+	fst.fs.IncRef()
+	fst.root.IncRef()
+	return fst.fs, fst.root, nil
+}
+
+// Accessor allows devices to create device special files in devtmpfs.
+type Accessor struct {
+	vfsObj *vfs.VirtualFilesystem
+	mntns  *vfs.MountNamespace
+	root   vfs.VirtualDentry
+	creds  *auth.Credentials
+}
+
+// NewAccessor returns an Accessor that supports creation of device special
+// files in the devtmpfs instance registered with name fsTypeName in vfsObj.
+func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) {
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{})
+	if err != nil {
+		return nil, err
+	}
+	return &Accessor{
+		vfsObj: vfsObj,
+		mntns:  mntns,
+		root:   mntns.Root(),
+		creds:  creds,
+	}, nil
+}
+
+// Release must be called when a is no longer in use.
+func (a *Accessor) Release() {
+	a.root.DecRef()
+	a.mntns.DecRef(a.vfsObj)
+}
+
+// accessorContext implements context.Context by extending an existing
+// context.Context with an Accessor's values for VFS-relevant state.
+type accessorContext struct {
+	context.Context
+	a *Accessor
+}
+
+func (a *Accessor) wrapContext(ctx context.Context) *accessorContext {
+	return &accessorContext{
+		Context: ctx,
+		a:       a,
+	}
+}
+
+// Value implements context.Context.Value.
+func (ac *accessorContext) Value(key interface{}) interface{} {
+	switch key {
+	case vfs.CtxMountNamespace:
+		return ac.a.mntns
+	case vfs.CtxRoot:
+		ac.a.root.IncRef()
+		return ac.a.root
+	default:
+		return ac.Context.Value(key)
+	}
+}
+
+func (a *Accessor) pathOperationAt(pathname string) *vfs.PathOperation {
+	return &vfs.PathOperation{
+		Root:  a.root,
+		Start: a.root,
+		Path:  fspath.Parse(pathname),
+	}
+}
+
+// CreateDeviceFile creates a device special file at the given pathname in the
+// devtmpfs instance accessed by the Accessor.
+func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind vfs.DeviceKind, major, minor uint32, perms uint16) error {
+	mode := (linux.FileMode)(perms)
+	switch kind {
+	case vfs.BlockDevice:
+		mode |= linux.S_IFBLK
+	case vfs.CharDevice:
+		mode |= linux.S_IFCHR
+	default:
+		panic(fmt.Sprintf("invalid vfs.DeviceKind: %v", kind))
+	}
+	// NOTE: Linux's devtmpfs refuses to automatically delete files it didn't
+	// create, which it recognizes by storing a pointer to the kdevtmpfs struct
+	// thread in struct inode::i_private. Accessor doesn't yet support deletion
+	// of files at all, and probably won't as long as we don't need to support
+	// kernel modules, so this is moot for now.
+	return a.vfsObj.MknodAt(a.wrapContext(ctx), a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{
+		Mode:     mode,
+		DevMajor: major,
+		DevMinor: minor,
+	})
+}
+
+// UserspaceInit creates symbolic links and mount points in the devtmpfs
+// instance accessed by the Accessor that are created by userspace in Linux. It
+// does not create mounts.
+func (a *Accessor) UserspaceInit(ctx context.Context) error {
+	actx := a.wrapContext(ctx)
+
+	// systemd: src/shared/dev-setup.c:dev_setup()
+	for _, symlink := range []struct {
+		source string
+		target string
+	}{
+		// /proc/kcore is not implemented.
+		{source: "fd", target: "/proc/self/fd"},
+		{source: "stdin", target: "/proc/self/fd/0"},
+		{source: "stdout", target: "/proc/self/fd/1"},
+		{source: "stderr", target: "/proc/self/fd/2"},
+	} {
+		if err := a.vfsObj.SymlinkAt(actx, a.creds, a.pathOperationAt(symlink.source), symlink.target); err != nil {
+			return fmt.Errorf("failed to create symlink %q => %q: %v", symlink.source, symlink.target, err)
+		}
+	}
+
+	// systemd: src/core/mount-setup.c:mount_table
+	for _, dir := range []string{
+		"shm",
+		"pts",
+	} {
+		if err := a.vfsObj.MkdirAt(actx, a.creds, a.pathOperationAt(dir), &vfs.MkdirOptions{
+			// systemd: src/core/mount-setup.c:mount_one()
+			Mode: 0755,
+		}); err != nil {
+			return fmt.Errorf("failed to create directory %q: %v", dir, err)
+		}
+	}
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
new file mode 100644
index 000000000..82c58c900
--- /dev/null
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -0,0 +1,119 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devtmpfs
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func TestDevtmpfs(t *testing.T) {
+	ctx := contexttest.Context(t)
+	creds := auth.CredentialsFromContext(ctx)
+
+	vfsObj := vfs.New()
+	// Register tmpfs just so that we can have a root filesystem that isn't
+	// devtmpfs.
+	vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	vfsObj.MustRegisterFilesystemType("devtmpfs", &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+
+	// Create a test mount namespace with devtmpfs mounted at "/dev".
+	const devPath = "/dev"
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("failed to create tmpfs root mount: %v", err)
+	}
+	defer mntns.DecRef(vfsObj)
+	root := mntns.Root()
+	defer root.DecRef()
+	devpop := vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(devPath),
+	}
+	if err := vfsObj.MkdirAt(ctx, creds, &devpop, &vfs.MkdirOptions{
+		Mode: 0755,
+	}); err != nil {
+		t.Fatalf("failed to create mount point: %v", err)
+	}
+	if err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil {
+		t.Fatalf("failed to mount devtmpfs: %v", err)
+	}
+
+	a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs")
+	if err != nil {
+		t.Fatalf("failed to create devtmpfs.Accessor: %v", err)
+	}
+	defer a.Release()
+
+	// Create "userspace-initialized" files using a devtmpfs.Accessor.
+	if err := a.UserspaceInit(ctx); err != nil {
+		t.Fatalf("failed to userspace-initialize devtmpfs: %v", err)
+	}
+	// Created files should be visible in the test mount namespace.
+	abspath := devPath + "/fd"
+	target, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(abspath),
+	})
+	if want := "/proc/self/fd"; err != nil || target != want {
+		t.Fatalf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, target, err, want)
+	}
+
+	// Create a dummy device special file using a devtmpfs.Accessor.
+	const (
+		pathInDev = "dummy"
+		kind      = vfs.CharDevice
+		major     = 12
+		minor     = 34
+		perms     = 0600
+		wantMode  = linux.S_IFCHR | perms
+	)
+	if err := a.CreateDeviceFile(ctx, pathInDev, kind, major, minor, perms); err != nil {
+		t.Fatalf("failed to create device file: %v", err)
+	}
+	// The device special file should be visible in the test mount namespace.
+	abspath = devPath + "/" + pathInDev
+	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(abspath),
+	}, &vfs.StatOptions{
+		Mask: linux.STATX_TYPE | linux.STATX_MODE,
+	})
+	if err != nil {
+		t.Fatalf("failed to stat device file at %q: %v", abspath, err)
+	}
+	if stat.Mode != wantMode {
+		t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode)
+	}
+	if stat.RdevMajor != major {
+		t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, major)
+	}
+	if stat.RdevMinor != minor {
+		t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, minor)
+	}
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index fb436860c..c61366224 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -27,6 +27,7 @@ go_library(
         "symlink.go",
         "tmpfs.go",
     ],
+    visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
-- 
cgit v1.2.3


From 437c986c6a0ed0e1fccfbfb6706f43d2c801c444 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 28 Jan 2020 15:13:46 -0800
Subject: Add vfs.FileDescription to FD table

FD table now holds both VFS1 and VFS2 types and uses the correct
one based on what's set.

Parts of this CL are just initial changes (e.g. sys_read.go,
runsc/main.go) to serve as a template for the remaining changes.

Updates #1487
Updates #1623

PiperOrigin-RevId: 292023223
---
 pkg/sentry/kernel/BUILD                            |   1 +
 pkg/sentry/kernel/fd_table.go                      | 166 ++++++++++++++++-----
 pkg/sentry/kernel/fd_table_test.go                 |   4 +-
 pkg/sentry/kernel/fd_table_unsafe.go               |  98 ++++++++++--
 pkg/sentry/kernel/kernel.go                        |  31 ++--
 pkg/sentry/kernel/task.go                          |   9 ++
 pkg/sentry/kernel/task_exec.go                     |   3 +-
 pkg/sentry/syscalls/linux/BUILD                    |   1 +
 pkg/sentry/syscalls/linux/error.go                 |  72 ++++++---
 pkg/sentry/syscalls/linux/sys_file.go              |   2 +-
 pkg/sentry/syscalls/linux/vfs2/BUILD               |  24 +++
 pkg/sentry/syscalls/linux/vfs2/linux64.go          |  16 ++
 .../syscalls/linux/vfs2/linux64_override_amd64.go  |  25 ++++
 .../syscalls/linux/vfs2/linux64_override_arm64.go  |  25 ++++
 pkg/sentry/syscalls/linux/vfs2/sys_read.go         |  95 ++++++++++++
 runsc/boot/BUILD                                   |   1 +
 runsc/boot/config.go                               |   3 +
 runsc/boot/loader.go                               |   9 ++
 18 files changed, 496 insertions(+), 89 deletions(-)
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/BUILD
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/linux64.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/sys_read.go

diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 0738946d9..a27628c0a 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -188,6 +188,7 @@ go_library(
         "//pkg/sentry/unimpl:unimplemented_syscall_go_proto",
         "//pkg/sentry/uniqueid",
         "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
         "//pkg/state",
         "//pkg/state/statefile",
         "//pkg/sync",
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 9460bb235..56b70ce96 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -27,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
@@ -62,10 +63,14 @@ func (f FDFlags) ToLinuxFDFlags() (mask uint) {
 // Note that this is immutable and can only be changed via operations on the
 // descriptorTable.
 //
+// It contains both VFS1 and VFS2 file types, but only one of them can be set.
+//
 // +stateify savable
 type descriptor struct {
-	file  *fs.File
-	flags FDFlags
+	// TODO(gvisor.dev/issue/1624): Remove fs.File.
+	file     *fs.File
+	fileVFS2 *vfs.FileDescription
+	flags    FDFlags
 }
 
 // FDTable is used to manage File references and flags.
@@ -95,10 +100,11 @@ type FDTable struct {
 
 func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
 	m := make(map[int32]descriptor)
-	f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+	f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
 		m[fd] = descriptor{
-			file:  file,
-			flags: flags,
+			file:     file,
+			fileVFS2: fileVFS2,
+			flags:    flags,
 		}
 	})
 	return m
@@ -107,13 +113,17 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
 func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
 	f.init() // Initialize table.
 	for fd, d := range m {
-		f.set(fd, d.file, d.flags)
-
-		// Note that we do _not_ need to acquire a extra table
-		// reference here. The table reference will already be
-		// accounted for in the file, so we drop the reference taken by
-		// set above.
-		d.file.DecRef()
+		f.setAll(fd, d.file, d.fileVFS2, d.flags)
+
+		// Note that we do _not_ need to acquire a extra table reference here. The
+		// table reference will already be accounted for in the file, so we drop the
+		// reference taken by set above.
+		switch {
+		case d.file != nil:
+			d.file.DecRef()
+		case d.fileVFS2 != nil:
+			d.fileVFS2.DecRef()
+		}
 	}
 }
 
@@ -139,6 +149,15 @@ func (f *FDTable) drop(file *fs.File) {
 	file.DecRef()
 }
 
+// dropVFS2 drops the table reference.
+func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
+	// TODO(gvisor.dev/issue/1480): Release locks.
+	// TODO(gvisor.dev/issue/1479): Send inotify events.
+
+	// Drop the table reference.
+	file.DecRef()
+}
+
 // ID returns a unique identifier for this FDTable.
 func (f *FDTable) ID() uint64 {
 	return f.uid
@@ -156,7 +175,7 @@ func (k *Kernel) NewFDTable() *FDTable {
 
 // destroy removes all of the file descriptors from the map.
 func (f *FDTable) destroy() {
-	f.RemoveIf(func(*fs.File, FDFlags) bool {
+	f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool {
 		return true
 	})
 }
@@ -175,19 +194,26 @@ func (f *FDTable) Size() int {
 // forEach iterates over all non-nil files.
 //
 // It is the caller's responsibility to acquire an appropriate lock.
-func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) {
+func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
 	fd := int32(0)
 	for {
-		file, flags, ok := f.get(fd)
+		file, fileVFS2, flags, ok := f.getAll(fd)
 		if !ok {
 			break
 		}
-		if file != nil {
+		switch {
+		case file != nil:
 			if !file.TryIncRef() {
 				continue // Race caught.
 			}
-			fn(int32(fd), file, flags)
+			fn(fd, file, nil, flags)
 			file.DecRef()
+		case fileVFS2 != nil:
+			if !fileVFS2.TryIncRef() {
+				continue // Race caught.
+			}
+			fn(fd, nil, fileVFS2, flags)
+			fileVFS2.DecRef()
 		}
 		fd++
 	}
@@ -196,9 +222,21 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) {
 // String is a stringer for FDTable.
 func (f *FDTable) String() string {
 	var b bytes.Buffer
-	f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
-		n, _ := file.Dirent.FullName(nil /* root */)
-		b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n))
+	f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+		switch {
+		case file != nil:
+			n, _ := file.Dirent.FullName(nil /* root */)
+			b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n))
+
+		case fileVFS2 != nil:
+			fs := fileVFS2.VirtualDentry().Mount().Filesystem().VirtualFilesystem()
+			// TODO(gvisor.dev/issue/1623): We have no context nor root. Will this work?
+			name, err := fs.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
+			if err != nil {
+				b.WriteString(fmt.Sprintf("<err: %v>\n", err))
+			}
+			b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, name))
+		}
 	})
 	return b.String()
 }
@@ -262,6 +300,17 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 // reference for that FD, the ref count for that existing reference is
 // decremented.
 func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
+	return f.newFDAt(ctx, fd, file, nil, flags)
+}
+
+// NewFDAtVFS2 sets the file reference for the given FD. If there is an active
+// reference for that FD, the ref count for that existing reference is
+// decremented.
+func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error {
+	return f.newFDAt(ctx, fd, nil, file, flags)
+}
+
+func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error {
 	if fd < 0 {
 		// Don't accept negative FDs.
 		return syscall.EBADF
@@ -278,7 +327,7 @@ func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FD
 	}
 
 	// Install the entry.
-	f.set(fd, file, flags)
+	f.setAll(fd, file, fileVFS2, flags)
 	return nil
 }
 
@@ -330,10 +379,35 @@ func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) {
 	}
 }
 
+// GetVFS2 returns a reference to the file and the flags for the FD or nil if no
+// file is defined for the given fd.
+//
+// N.B. Callers are required to use DecRef when they are done.
+//
+//go:nosplit
+func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) {
+	if fd < 0 {
+		return nil, FDFlags{}
+	}
+
+	for {
+		file, flags, _ := f.getVFS2(fd)
+		if file != nil {
+			if !file.TryIncRef() {
+				continue // Race caught.
+			}
+			// Reference acquired.
+			return file, flags
+		}
+		// No file available.
+		return nil, FDFlags{}
+	}
+}
+
 // GetFDs returns a list of valid fds.
 func (f *FDTable) GetFDs() []int32 {
 	fds := make([]int32, 0, int(atomic.LoadInt32(&f.used)))
-	f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+	f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
 		fds = append(fds, fd)
 	})
 	return fds
@@ -344,7 +418,19 @@ func (f *FDTable) GetFDs() []int32 {
 // they're done using the slice.
 func (f *FDTable) GetRefs() []*fs.File {
 	files := make([]*fs.File, 0, f.Size())
-	f.forEach(func(_ int32, file *fs.File, flags FDFlags) {
+	f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+		file.IncRef() // Acquire a reference for caller.
+		files = append(files, file)
+	})
+	return files
+}
+
+// GetRefsVFS2 returns a stable slice of references to all files and bumps the
+// reference count on each. The caller must use DecRef on each reference when
+// they're done using the slice.
+func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription {
+	files := make([]*vfs.FileDescription, 0, f.Size())
+	f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) {
 		file.IncRef() // Acquire a reference for caller.
 		files = append(files, file)
 	})
@@ -355,10 +441,15 @@ func (f *FDTable) GetRefs() []*fs.File {
 func (f *FDTable) Fork() *FDTable {
 	clone := f.k.NewFDTable()
 
-	f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+	f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
 		// The set function here will acquire an appropriate table
 		// reference for the clone. We don't need anything else.
-		clone.set(fd, file, flags)
+		switch {
+		case file != nil:
+			clone.set(fd, file, flags)
+		case fileVFS2 != nil:
+			clone.setVFS2(fd, fileVFS2, flags)
+		}
 	})
 	return clone
 }
@@ -366,9 +457,9 @@ func (f *FDTable) Fork() *FDTable {
 // Remove removes an FD from and returns a non-file iff successful.
 //
 // N.B. Callers are required to use DecRef when they are done.
-func (f *FDTable) Remove(fd int32) *fs.File {
+func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
 	if fd < 0 {
-		return nil
+		return nil, nil
 	}
 
 	f.mu.Lock()
@@ -379,21 +470,26 @@ func (f *FDTable) Remove(fd int32) *fs.File {
 		f.next = fd
 	}
 
-	orig, _, _ := f.get(fd)
-	if orig != nil {
-		orig.IncRef()             // Reference for caller.
-		f.set(fd, nil, FDFlags{}) // Zap entry.
+	orig, orig2, _, _ := f.getAll(fd)
+
+	// Add reference for caller.
+	switch {
+	case orig != nil:
+		orig.IncRef()
+	case orig2 != nil:
+		orig2.IncRef()
 	}
-	return orig
+	f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+	return orig, orig2
 }
 
 // RemoveIf removes all FDs where cond is true.
-func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) {
+func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
-	f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
-		if cond(file, flags) {
+	f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+		if cond(file, fileVFS2, flags) {
 			f.set(fd, nil, FDFlags{}) // Clear from table.
 			// Update current available position.
 			if fd < f.next {
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index 261b815f2..29f95a2c4 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -150,13 +150,13 @@ func TestFDTable(t *testing.T) {
 			t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref)
 		}
 
-		ref := fdTable.Remove(1)
+		ref, _ := fdTable.Remove(1)
 		if ref == nil {
 			t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success")
 		}
 		ref.DecRef()
 
-		if ref := fdTable.Remove(1); ref != nil {
+		if ref, _ := fdTable.Remove(1); ref != nil {
 			t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
 		}
 	})
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index e9fdb0917..7fd97dc53 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -19,6 +19,7 @@ import (
 	"unsafe"
 
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
 type descriptorTable struct {
@@ -41,15 +42,38 @@ func (f *FDTable) init() {
 //
 //go:nosplit
 func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) {
+	file, _, flags, ok := f.getAll(fd)
+	return file, flags, ok
+}
+
+// getVFS2 gets a file entry.
+//
+// The boolean indicates whether this was in range.
+//
+//go:nosplit
+func (f *FDTable) getVFS2(fd int32) (*vfs.FileDescription, FDFlags, bool) {
+	_, file, flags, ok := f.getAll(fd)
+	return file, flags, ok
+}
+
+// getAll gets a file entry.
+//
+// The boolean indicates whether this was in range.
+//
+//go:nosplit
+func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, bool) {
 	slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
 	if fd >= int32(len(slice)) {
-		return nil, FDFlags{}, false
+		return nil, nil, FDFlags{}, false
 	}
 	d := (*descriptor)(atomic.LoadPointer(&slice[fd]))
 	if d == nil {
-		return nil, FDFlags{}, true
+		return nil, nil, FDFlags{}, true
 	}
-	return d.file, d.flags, true
+	if d.file != nil && d.fileVFS2 != nil {
+		panic("VFS1 and VFS2 files set")
+	}
+	return d.file, d.fileVFS2, d.flags, true
 }
 
 // set sets an entry.
@@ -59,6 +83,30 @@ func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) {
 //
 // Precondition: mu must be held.
 func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
+	f.setAll(fd, file, nil, flags)
+}
+
+// setVFS2 sets an entry.
+//
+// This handles accounting changes, as well as acquiring and releasing the
+// reference needed by the table iff the file is different.
+//
+// Precondition: mu must be held.
+func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) {
+	f.setAll(fd, nil, file, flags)
+}
+
+// setAll sets an entry.
+//
+// This handles accounting changes, as well as acquiring and releasing the
+// reference needed by the table iff the file is different.
+//
+// Precondition: mu must be held.
+func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+	if file != nil && fileVFS2 != nil {
+		panic("VFS1 and VFS2 files set")
+	}
+
 	slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
 
 	// Grow the table as required.
@@ -71,33 +119,51 @@ func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
 		atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
 	}
 
-	// Create the new element.
-	var d *descriptor
-	if file != nil {
-		d = &descriptor{
-			file:  file,
-			flags: flags,
+	var desc *descriptor
+	if file != nil || fileVFS2 != nil {
+		desc = &descriptor{
+			file:     file,
+			fileVFS2: fileVFS2,
+			flags:    flags,
 		}
 	}
 
 	// Update the single element.
-	orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(d)))
+	orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(desc)))
 
 	// Acquire a table reference.
-	if file != nil && (orig == nil || file != orig.file) {
-		file.IncRef()
+	if desc != nil {
+		switch {
+		case desc.file != nil:
+			if orig == nil || desc.file != orig.file {
+				desc.file.IncRef()
+			}
+		case desc.fileVFS2 != nil:
+			if orig == nil || desc.fileVFS2 != orig.fileVFS2 {
+				desc.fileVFS2.IncRef()
+			}
+		}
 	}
 
 	// Drop the table reference.
-	if orig != nil && file != orig.file {
-		f.drop(orig.file)
+	if orig != nil {
+		switch {
+		case orig.file != nil:
+			if desc == nil || desc.file != orig.file {
+				f.drop(orig.file)
+			}
+		case orig.fileVFS2 != nil:
+			if desc == nil || desc.fileVFS2 != orig.fileVFS2 {
+				f.dropVFS2(orig.fileVFS2)
+			}
+		}
 	}
 
 	// Adjust used.
 	switch {
-	case orig == nil && file != nil:
+	case orig == nil && desc != nil:
 		atomic.AddInt32(&f.used, 1)
-	case orig != nil && file == nil:
+	case orig != nil && desc == nil:
 		atomic.AddInt32(&f.used, -1)
 	}
 }
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 7b90fac5a..dcd6e91c4 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -65,6 +65,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/state"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -435,17 +436,17 @@ func (k *Kernel) flushMountSourceRefs() error {
 
 	// There may be some open FDs whose filesystems have been unmounted. We
 	// must flush those as well.
-	return k.tasks.forEachFDPaused(func(file *fs.File) error {
+	return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
 		file.Dirent.Inode.MountSource.FlushDirentRefs()
 		return nil
 	})
 }
 
-// forEachFDPaused applies the given function to each open file descriptor in each
-// task.
+// forEachFDPaused applies the given function to each open file descriptor in
+// each task.
 //
 // Precondition: Must be called with the kernel paused.
-func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) {
+func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) {
 	ts.mu.RLock()
 	defer ts.mu.RUnlock()
 	for t := range ts.Root.tids {
@@ -453,8 +454,8 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) {
 		if t.fdTable == nil {
 			continue
 		}
-		t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
-			if lastErr := f(file); lastErr != nil && err == nil {
+		t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) {
+			if lastErr := f(file, fileVFS2); lastErr != nil && err == nil {
 				err = lastErr
 			}
 		})
@@ -463,7 +464,8 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) {
 }
 
 func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
-	return ts.forEachFDPaused(func(file *fs.File) error {
+	// TODO(gvisor.dev/issues/1663): Add save support for VFS2.
+	return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
 		if flags := file.Flags(); !flags.Write {
 			return nil
 		}
@@ -474,12 +476,9 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
 		syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
 		if err := fs.SaveFileFsyncError(syncErr); err != nil {
 			name, _ := file.Dirent.FullName(nil /* root */)
-			// Wrap this error in ErrSaveRejection
-			// so that it will trigger a save
-			// error, rather than a panic. This
-			// also allows us to distinguish Fsync
-			// errors from state file errors in
-			// state.Save.
+			// Wrap this error in ErrSaveRejection so that it will trigger a save
+			// error, rather than a panic. This also allows us to distinguish Fsync
+			// errors from state file errors in state.Save.
 			return fs.ErrSaveRejection{
 				Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err),
 			}
@@ -519,7 +518,7 @@ func (ts *TaskSet) unregisterEpollWaiters() {
 	for t := range ts.Root.tids {
 		// We can skip locking Task.mu here since the kernel is paused.
 		if t.fdTable != nil {
-			t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+			t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
 				if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
 					e.UnregisterEpollWaiters()
 				}
@@ -921,7 +920,7 @@ func (k *Kernel) pauseTimeLocked() {
 		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
 		// but ktime.Timer.Pause is idempotent so this is harmless.
 		if t.fdTable != nil {
-			t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+			t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
 				if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
 					tfd.PauseTimer()
 				}
@@ -951,7 +950,7 @@ func (k *Kernel) resumeTimeLocked() {
 			}
 		}
 		if t.fdTable != nil {
-			t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+			t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
 				if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
 					tfd.ResumeTimer()
 				}
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 95adf2778..981e8c7fe 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -35,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -743,6 +744,14 @@ func (t *Task) GetFile(fd int32) *fs.File {
 	return f
 }
 
+// GetFileVFS2 is a convenience wrapper for t.FDTable().GetVFS2.
+//
+// Precondition: same as FDTable.Get.
+func (t *Task) GetFileVFS2(fd int32) *vfs.FileDescription {
+	f, _ := t.fdTable.GetVFS2(fd)
+	return f
+}
+
 // NewFDs is a convenience wrapper for t.FDTable().NewFDs.
 //
 // This automatically passes the task as the context.
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index fa6528386..8f57a34a6 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -69,6 +69,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -198,7 +199,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	t.tg.pidns.owner.mu.Unlock()
 
 	// Remove FDs with the CloseOnExec flag set.
-	t.fdTable.RemoveIf(func(file *fs.File, flags FDFlags) bool {
+	t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
 		return flags.CloseOnExec
 	})
 
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 8d6c52850..be16ee686 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -93,6 +93,7 @@ go_library(
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/syscalls",
         "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index 60469549d..64de56ac5 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -31,20 +32,58 @@ var (
 	partialResultOnce   sync.Once
 )
 
+// HandleIOErrorVFS2 handles special error cases for partial results. For some
+// errors, we may consume the error and return only the partial read/write.
+//
+// op and f are used only for panics.
+func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op string, f *vfs.FileDescription) error {
+	known, err := handleIOErrorImpl(t, partialResult, err, intr, op)
+	if err != nil {
+		return err
+	}
+	if !known {
+		// An unknown error is encountered with a partial read/write.
+		fs := f.Mount().Filesystem().VirtualFilesystem()
+		root := vfs.RootFromContext(t)
+		name, _ := fs.PathnameWithDeleted(t, root, f.VirtualDentry())
+		log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, err, err, op, name)
+		partialResultOnce.Do(partialResultMetric.Increment)
+	}
+	return nil
+}
+
 // handleIOError handles special error cases for partial results. For some
 // errors, we may consume the error and return only the partial read/write.
 //
 // op and f are used only for panics.
 func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error {
+	known, err := handleIOErrorImpl(t, partialResult, err, intr, op)
+	if err != nil {
+		return err
+	}
+	if !known {
+		// An unknown error is encountered with a partial read/write.
+		name, _ := f.Dirent.FullName(nil /* ignore chroot */)
+		log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations)
+		partialResultOnce.Do(partialResultMetric.Increment)
+	}
+	return nil
+}
+
+// handleIOError handles special error cases for partial results. For some
+// errors, we may consume the error and return only the partial read/write.
+//
+// Returns false if error is unknown.
+func handleIOErrorImpl(t *kernel.Task, partialResult bool, err, intr error, op string) (bool, error) {
 	switch err {
 	case nil:
 		// Typical successful syscall.
-		return nil
+		return true, nil
 	case io.EOF:
 		// EOF is always consumed. If this is a partial read/write
 		// (result != 0), the application will see that, otherwise
 		// they will see 0.
-		return nil
+		return true, nil
 	case syserror.ErrExceedsFileSizeLimit:
 		// Ignore partialResult because this error only applies to
 		// normal files, and for those files we cannot accumulate
@@ -53,20 +92,20 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
 		// Do not consume the error and return it as EFBIG.
 		// Simultaneously send a SIGXFSZ per setrlimit(2).
 		t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t))
-		return syserror.EFBIG
+		return true, syserror.EFBIG
 	case syserror.ErrInterrupted:
 		// The syscall was interrupted. Return nil if it completed
 		// partially, otherwise return the error code that the syscall
 		// needs (to indicate to the kernel what it should do).
 		if partialResult {
-			return nil
+			return true, nil
 		}
-		return intr
+		return true, intr
 	}
 
 	if !partialResult {
 		// Typical syscall error.
-		return err
+		return true, err
 	}
 
 	switch err {
@@ -75,14 +114,14 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
 		// read/write.  Like ErrWouldBlock, since we have a
 		// partial read/write, we consume the error and return
 		// the partial result.
-		return nil
+		return true, nil
 	case syserror.EFAULT:
 		// EFAULT is only shown the user if nothing was
 		// read/written. If we read something (this case), they see
 		// a partial read/write. They will then presumably try again
 		// with an incremented buffer, which will EFAULT with
 		// result == 0.
-		return nil
+		return true, nil
 	case syserror.EPIPE:
 		// Writes to a pipe or socket will return EPIPE if the other
 		// side is gone. The partial write is returned. EPIPE will be
@@ -90,32 +129,29 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
 		//
 		// TODO(gvisor.dev/issue/161): In some cases SIGPIPE should
 		// also be sent to the application.
-		return nil
+		return true, nil
 	case syserror.ENOSPC:
 		// Similar to EPIPE. Return what we wrote this time, and let
 		// ENOSPC be returned on the next call.
-		return nil
+		return true, nil
 	case syserror.ECONNRESET:
 		// For TCP sendfile connections, we may have a reset. But we
 		// should just return n as the result.
-		return nil
+		return true, nil
 	case syserror.ErrWouldBlock:
 		// Syscall would block, but completed a partial read/write.
 		// This case should only be returned by IssueIO for nonblocking
 		// files. Since we have a partial read/write, we consume
 		// ErrWouldBlock, returning the partial result.
-		return nil
+		return true, nil
 	}
 
 	switch err.(type) {
 	case kernel.SyscallRestartErrno:
 		// Identical to the EINTR case.
-		return nil
+		return true, nil
 	}
 
-	// An unknown error is encountered with a partial read/write.
-	name, _ := f.Dirent.FullName(nil /* ignore chroot */)
-	log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations)
-	partialResultOnce.Do(partialResultMetric.Increment)
-	return nil
+	// Error is unknown and cannot be properly handled.
+	return false, nil
 }
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index c54735148..421845ebb 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -767,7 +767,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	// Note that Remove provides a reference on the file that we may use to
 	// flush. It is still active until we drop the final reference below
 	// (and other reference-holding operations complete).
-	file := t.FDTable().Remove(fd)
+	file, _ := t.FDTable().Remove(fd)
 	if file == nil {
 		return 0, nil, syserror.EBADF
 	}
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
new file mode 100644
index 000000000..6b8a00b6e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -0,0 +1,24 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "vfs2",
+    srcs = [
+        "linux64.go",
+        "linux64_override_amd64.go",
+        "linux64_override_arm64.go",
+        "sys_read.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/sentry/arch",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/syscalls",
+        "//pkg/sentry/syscalls/linux",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64.go b/pkg/sentry/syscalls/linux/vfs2/linux64.go
new file mode 100644
index 000000000..19ee36081
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64.go
@@ -0,0 +1,16 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package vfs2 provides syscall implementations that use VFS2.
+package vfs2
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
new file mode 100644
index 000000000..c134714ee
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls"
+)
+
+// Override syscall table to add syscalls implementations from this package.
+func Override(table map[uintptr]kernel.Syscall) {
+	table[0] = syscalls.Supported("read", Read)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
new file mode 100644
index 000000000..6af5c400f
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls"
+)
+
+// Override syscall table to add syscalls implementations from this package.
+func Override(table map[uintptr]kernel.Syscall) {
+	table[63] = syscalls.Supported("read", Read)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go
new file mode 100644
index 000000000..b9fb58464
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/sys_read.go
@@ -0,0 +1,95 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Read implements linux syscall read(2).  Note that we try to get a buffer that
+// is exactly the size requested because some applications like qemu expect
+// they can do large reads all at once.  Bug for bug.  Same for other read
+// calls below.
+func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the file is readable.
+	if !file.IsReadable() {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := read(t, file, dst, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+}
+
+func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	n, err := file.Read(t, dst, opts)
+	if err != syserror.ErrWouldBlock {
+		return n, err
+	}
+
+	// Register for notifications.
+	_, ch := waiter.NewChannelEntry(nil)
+	// file.EventRegister(&w, EventMaskRead)
+
+	total := n
+	for {
+		// Shorten dst to reflect bytes previously read.
+		dst = dst.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err := file.Read(t, dst, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+		if err := t.Block(ch); err != nil {
+			break
+		}
+	}
+	//file.EventUnregister(&w)
+
+	return total, err
+}
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index a96c80261..ae4dd102a 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -68,6 +68,7 @@ go_library(
         "//pkg/sentry/state",
         "//pkg/sentry/strace",
         "//pkg/sentry/syscalls/linux",
+        "//pkg/sentry/syscalls/linux/vfs2",
         "//pkg/sentry/time",
         "//pkg/sentry/unimpl:unimplemented_syscall_go_proto",
         "//pkg/sentry/usage",
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index a878bc2ce..35391030f 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -256,6 +256,9 @@ type Config struct {
 	//
 	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
 	CPUNumFromQuota bool
+
+	// Enables VFS2 (not plumbled through yet).
+	VFS2 bool
 }
 
 // ToFlags returns a slice of flags that correspond to the given Config.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index fad72f4ab..9f0d5d7af 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -26,6 +26,7 @@ import (
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
@@ -42,6 +43,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/sighandling"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2"
 	"gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
@@ -184,6 +186,13 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("setting up memory usage: %v", err)
 	}
 
+	if args.Conf.VFS2 {
+		st, ok := kernel.LookupSyscallTable(abi.Linux, arch.Host)
+		if ok {
+			vfs2.Override(st.Table)
+		}
+	}
+
 	// Create kernel and platform.
 	p, err := createPlatform(args.Conf, args.Device)
 	if err != nil {
-- 
cgit v1.2.3


From 431ff52768c2300e15cba609c2be4f507fd30d5b Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 28 Jan 2020 15:39:48 -0800
Subject: Update link address for senders of Neighbor Solicitations

Update link address for senders of NDP Neighbor Solicitations when the NS
contains an NDP Source Link Layer Address option.

Tests:
- ipv6.TestNeighorSolicitationWithSourceLinkLayerOption
- ipv6.TestNeighorSolicitationWithInvalidSourceLinkLayerOption
PiperOrigin-RevId: 292028553
---
 pkg/tcpip/network/ipv6/icmp.go     |  41 ++++++-----
 pkg/tcpip/network/ipv6/ndp_test.go | 135 +++++++++++++++++++++++++++++++++++++
 2 files changed, 161 insertions(+), 15 deletions(-)

diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 1c3410618..dc20c0fd7 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -137,21 +137,24 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		}
 
 		ns := header.NDPNeighborSolicit(h.NDPPayload())
+		it, err := ns.Options().Iter(true)
+		if err != nil {
+			// If we have a malformed NDP NS option, drop the packet.
+			received.Invalid.Increment()
+			return
+		}
+
 		targetAddr := ns.TargetAddress()
 		s := r.Stack()
 		rxNICID := r.NICID()
-
-		isTentative, err := s.IsAddrTentative(rxNICID, targetAddr)
-		if err != nil {
+		if isTentative, err := s.IsAddrTentative(rxNICID, targetAddr); err != nil {
 			// We will only get an error if rxNICID is unrecognized,
 			// which should not happen. For now short-circuit this
 			// packet.
 			//
 			// TODO(b/141002840): Handle this better?
 			return
-		}
-
-		if isTentative {
+		} else if isTentative {
 			// If the target address is tentative and the source
 			// of the packet is a unicast (specified) address, then
 			// the source of the packet is attempting to perform
@@ -185,6 +188,23 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 			return
 		}
 
+		// If the NS message has the source link layer option, update the link
+		// address cache with the link address for the sender of the message.
+		//
+		// TODO(b/148429853): Properly process the NS message and do Neighbor
+		// Unreachability Detection.
+		for {
+			opt, done, _ := it.Next()
+			if done {
+				break
+			}
+
+			switch opt := opt.(type) {
+			case header.NDPSourceLinkLayerAddressOption:
+				e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, opt.EthernetAddress())
+			}
+		}
+
 		optsSerializer := header.NDPOptionsSerializer{
 			header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress[:]),
 		}
@@ -211,15 +231,6 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		r.LocalAddress = targetAddr
 		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
-		// TODO(tamird/ghanan): there exists an explicit NDP option that is
-		// used to update the neighbor table with link addresses for a
-		// neighbor from an NS (see the Source Link Layer option RFC
-		// 4861 section 4.6.1 and section 7.2.3).
-		//
-		// Furthermore, the entirety of NDP handling here seems to be
-		// contradicted by RFC 4861.
-		e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, r.RemoteLinkAddress)
-
 		// RFC 4861 Neighbor Discovery for IP version 6 (IPv6)
 		//
 		// 7.1.2. Validation of Neighbor Advertisements
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index fe895b376..bd732f93f 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -70,6 +70,141 @@ func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address) (*stack
 	return s, ep
 }
 
+// TestNeighorSolicitationWithSourceLinkLayerOption tests that receiving an
+// NDP NS message with the Source Link Layer Address option results in a
+// new entry in the link address cache for the sender of the message.
+func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+	})
+	e := channel.New(0, 1280, linkAddr0)
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+	if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
+	}
+
+	ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
+	hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
+	pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
+	pkt.SetType(header.ICMPv6NeighborSolicit)
+	ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+	ns.SetTargetAddress(lladdr0)
+	ns.Options().Serialize(header.NDPOptionsSerializer{
+		header.NDPSourceLinkLayerAddressOption(linkAddr1),
+	})
+	pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+	payloadLength := hdr.UsedLength()
+	ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+	ip.Encode(&header.IPv6Fields{
+		PayloadLength: uint16(payloadLength),
+		NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+		HopLimit:      255,
+		SrcAddr:       lladdr1,
+		DstAddr:       lladdr0,
+	})
+	e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+		Data: hdr.View().ToVectorisedView(),
+	})
+
+	linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil)
+	if err != nil {
+		t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err)
+	}
+	if c != nil {
+		t.Errorf("got unexpected channel")
+	}
+	if linkAddr != linkAddr1 {
+		t.Errorf("got link address = %s, want = %s", linkAddr, linkAddr1)
+	}
+}
+
+// TestNeighorSolicitationWithInvalidSourceLinkLayerOption tests that receiving
+// an NDP NS message with an invalid Source Link Layer Address option does not
+// result in a new entry in the link address cache for the sender of the
+// message.
+func TestNeighorSolicitationWithInvalidSourceLinkLayerOption(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name    string
+		optsBuf []byte
+	}{
+		{
+			name:    "Too Small",
+			optsBuf: []byte{1, 1, 1, 2, 3, 4, 5},
+		},
+		{
+			name:    "Invalid Length",
+			optsBuf: []byte{1, 2, 1, 2, 3, 4, 5, 6},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			e := channel.New(0, 1280, linkAddr0)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
+			}
+
+			ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + len(test.optsBuf)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
+			pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
+			pkt.SetType(header.ICMPv6NeighborSolicit)
+			ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+			ns.SetTargetAddress(lladdr0)
+			opts := ns.Options()
+			copy(opts, test.optsBuf)
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      255,
+				SrcAddr:       lladdr1,
+				DstAddr:       lladdr0,
+			})
+
+			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+
+			// Invalid count should initially be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+
+			e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			// Invalid count should have increased.
+			if got := invalid.Value(); got != 1 {
+				t.Fatalf("got invalid = %d, want = 1", got)
+			}
+
+			linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil)
+			if err != tcpip.ErrWouldBlock {
+				t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock)
+			}
+			if c == nil {
+				t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber)
+			}
+			if linkAddr != "" {
+				t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (%s, _, ), want = ('', _, _)", nicID, lladdr1, lladdr0, ProtocolNumber, linkAddr)
+			}
+		})
+	}
+}
+
 // TestHopLimitValidation is a test that makes sure that NDP packets are only
 // received if their IP header's hop limit is set to 255.
 func TestHopLimitValidation(t *testing.T) {
-- 
cgit v1.2.3


From 3d046fef06ece6ba20770fa62e0a21569226adaa Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 28 Jan 2020 16:42:05 -0800
Subject: Changes missing in last submit

Updates #1487
Updates #1623

PiperOrigin-RevId: 292040835
---
 pkg/sentry/kernel/fd_table.go              | 18 +++++++++---------
 pkg/sentry/syscalls/linux/sys_read.go      |  2 +-
 pkg/sentry/syscalls/linux/vfs2/sys_read.go | 16 ++++++++--------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 56b70ce96..23b88f7a6 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -15,9 +15,9 @@
 package kernel
 
 import (
-	"bytes"
 	"fmt"
 	"math"
+	"strings"
 	"sync/atomic"
 	"syscall"
 
@@ -221,24 +221,24 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes
 
 // String is a stringer for FDTable.
 func (f *FDTable) String() string {
-	var b bytes.Buffer
+	var buf strings.Builder
 	f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
 		switch {
 		case file != nil:
 			n, _ := file.Dirent.FullName(nil /* root */)
-			b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n))
+			fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, n)
 
 		case fileVFS2 != nil:
-			fs := fileVFS2.VirtualDentry().Mount().Filesystem().VirtualFilesystem()
-			// TODO(gvisor.dev/issue/1623): We have no context nor root. Will this work?
-			name, err := fs.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
+			vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem()
+			name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
 			if err != nil {
-				b.WriteString(fmt.Sprintf("<err: %v>\n", err))
+				fmt.Fprintf(&buf, "<err: %v>\n", err)
+				return
 			}
-			b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, name))
+			fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, name)
 		}
 	})
-	return b.String()
+	return buf.String()
 }
 
 // NewFDs allocates new FDs guaranteed to be the lowest number available
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index f9f594190..227692f06 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -29,7 +29,7 @@ import (
 )
 
 const (
-	// EventMaskRead contains events that can be triggerd on reads.
+	// EventMaskRead contains events that can be triggered on reads.
 	EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
 )
 
diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go
index b9fb58464..7667524c7 100644
--- a/pkg/sentry/syscalls/linux/vfs2/sys_read.go
+++ b/pkg/sentry/syscalls/linux/vfs2/sys_read.go
@@ -24,6 +24,11 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+const (
+	// EventMaskRead contains events that can be triggered on reads.
+	EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
+)
+
 // Read implements linux syscall read(2).  Note that we try to get a buffer that
 // is exactly the size requested because some applications like qemu expect
 // they can do large reads all at once.  Bug for bug.  Same for other read
@@ -39,11 +44,6 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 	}
 	defer file.DecRef()
 
-	// Check that the file is readable.
-	if !file.IsReadable() {
-		return 0, nil, syserror.EBADF
-	}
-
 	// Check that the size is legitimate.
 	si := int(size)
 	if si < 0 {
@@ -70,8 +70,8 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt
 	}
 
 	// Register for notifications.
-	_, ch := waiter.NewChannelEntry(nil)
-	// file.EventRegister(&w, EventMaskRead)
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, EventMaskRead)
 
 	total := n
 	for {
@@ -89,7 +89,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt
 			break
 		}
 	}
-	//file.EventUnregister(&w)
+	file.EventUnregister(&w)
 
 	return total, err
 }
-- 
cgit v1.2.3


From 396c574db276ae1424af7098b5cd917e2bed9921 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 28 Jan 2020 18:30:36 -0800
Subject: Add support for WritableSource in DynamicBytesFileDescriptionImpl

WritableSource is a convenience interface used for files that can
be written to, e.g. /proc/net/ipv4/tpc_sack. It reads max of 4KB
and only from offset 0 which should cover most cases. It can be
extended as neeed.

Updates #1195

PiperOrigin-RevId: 292056924
---
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go    |   4 +-
 pkg/sentry/fsimpl/proc/tasks.go                   |   2 +-
 pkg/sentry/fsimpl/proc/tasks_sys.go               |  84 +++++++++++--
 pkg/sentry/vfs/file_description_impl_util.go      |  76 +++++++++---
 pkg/sentry/vfs/file_description_impl_util_test.go | 138 +++++++++++++++++-----
 5 files changed, 250 insertions(+), 54 deletions(-)

diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 373f801ff..733792c78 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -108,12 +108,12 @@ func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, off
 
 // Write implements vfs.FileDescriptionImpl.Write.
 func (fd *DynamicBytesFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
-	return fd.FileDescriptionDefaultImpl.Write(ctx, src, opts)
+	return fd.DynamicBytesFileDescriptionImpl.Write(ctx, src, opts)
 }
 
 // PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	return fd.FileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
+	return fd.DynamicBytesFileDescriptionImpl.PWrite(ctx, src, offset, opts)
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index e0cb9c47b..14bd334e8 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -69,7 +69,7 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames
 		"cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))),
 		//"filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}),
 		"loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}),
-		"sys":     newSysDir(root, inoGen),
+		"sys":     newSysDir(root, inoGen, k),
 		"meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}),
 		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"),
 		"net":     newNetDir(root, inoGen, k),
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index ad963870b..c7ce74883 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -21,12 +21,16 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // newSysDir returns the dentry corresponding to /proc/sys directory.
-func newSysDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry {
+func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
 	return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
 		"kernel": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
 			"hostname": newDentry(root, inoGen.NextIno(), 0444, &hostnameData{}),
@@ -38,18 +42,18 @@ func newSysDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry {
 			"mmap_min_addr":     newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{}),
 			"overcommit_memory": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0\n")),
 		}),
-		"net": newSysNetDir(root, inoGen),
+		"net": newSysNetDir(root, inoGen, k),
 	})
 }
 
 // newSysNetDir returns the dentry corresponding to /proc/sys/net directory.
-func newSysNetDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry {
-	return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
-		"net": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
+func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
+	var contents map[string]*kernfs.Dentry
+
+	if stack := k.NetworkStack(); stack != nil {
+		contents = map[string]*kernfs.Dentry{
 			"ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
-				// Add tcp_sack.
-				// TODO(gvisor.dev/issue/1195): tcp_sack allows write(2)
-				// "tcp_sack": newTCPSackInode(ctx, msrc, s),
+				"tcp_sack": newDentry(root, inoGen.NextIno(), 0644, &tcpSackData{stack: stack}),
 
 				// The following files are simple stubs until they are implemented in
 				// netstack, most of these files are configuration related. We use the
@@ -103,7 +107,11 @@ func newSysNetDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry {
 				"wmem_default":  newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")),
 				"wmem_max":      newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")),
 			}),
-		}),
+		}
+	}
+
+	return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
+		"net": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents),
 	})
 }
 
@@ -141,3 +149,61 @@ func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	buf.WriteString("\n")
 	return nil
 }
+
+// tcpSackData implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/tcp_sack.
+//
+// +stateify savable
+type tcpSackData struct {
+	kernfs.DynamicBytesFile
+
+	stack   inet.Stack `state:"wait"`
+	enabled *bool
+}
+
+var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if d.enabled == nil {
+		sack, err := d.stack.TCPSACKEnabled()
+		if err != nil {
+			return err
+		}
+		d.enabled = &sack
+	}
+
+	val := "0\n"
+	if *d.enabled {
+		// Technically, this is not quite compatible with Linux. Linux stores these
+		// as an integer, so if you write "2" into tcp_sack, you should get 2 back.
+		// Tough luck.
+		val = "1\n"
+	}
+	buf.WriteString(val)
+	return nil
+}
+
+func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if offset != 0 {
+		// No need to handle partial writes thus far.
+		return 0, syserror.EINVAL
+	}
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit the amount of memory allocated.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return n, err
+	}
+	if d.enabled == nil {
+		d.enabled = new(bool)
+	}
+	*d.enabled = v != 0
+	return n, d.stack.SetTCPSACKEnabled(*d.enabled)
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index fb9b87fdc..a4900c170 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -192,21 +192,6 @@ func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetSt
 	panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat")
 }
 
-// DynamicBytesFileDescriptionImpl may be embedded by implementations of
-// FileDescriptionImpl that represent read-only regular files whose contents
-// are backed by a bytes.Buffer that is regenerated when necessary, consistent
-// with Linux's fs/seq_file.c:single_open().
-//
-// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first
-// use.
-type DynamicBytesFileDescriptionImpl struct {
-	data     DynamicBytesSource // immutable
-	mu       sync.Mutex         // protects the following fields
-	buf      bytes.Buffer
-	off      int64
-	lastRead int64 // offset at which the last Read, PRead, or Seek ended
-}
-
 // DynamicBytesSource represents a data source for a
 // DynamicBytesFileDescriptionImpl.
 type DynamicBytesSource interface {
@@ -225,6 +210,30 @@ func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
+// WritableDynamicBytesSource extends DynamicBytesSource to allow writes to the
+// underlying source.
+type WritableDynamicBytesSource interface {
+	DynamicBytesSource
+
+	// Write sends writes to the source.
+	Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error)
+}
+
+// DynamicBytesFileDescriptionImpl may be embedded by implementations of
+// FileDescriptionImpl that represent read-only regular files whose contents
+// are backed by a bytes.Buffer that is regenerated when necessary, consistent
+// with Linux's fs/seq_file.c:single_open().
+//
+// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first
+// use.
+type DynamicBytesFileDescriptionImpl struct {
+	data     DynamicBytesSource // immutable
+	mu       sync.Mutex         // protects the following fields
+	buf      bytes.Buffer
+	off      int64
+	lastRead int64 // offset at which the last Read, PRead, or Seek ended
+}
+
 // SetDataSource must be called exactly once on fd before first use.
 func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
 	fd.data = data
@@ -304,6 +313,43 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6
 	return offset, nil
 }
 
+// Preconditions: fd.mu must be locked.
+func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	writable, ok := fd.data.(WritableDynamicBytesSource)
+	if !ok {
+		return 0, syserror.EINVAL
+	}
+	n, err := writable.Write(ctx, src, offset)
+	if err != nil {
+		return 0, err
+	}
+
+	// Invalidate cached data that might exist prior to this call.
+	fd.buf.Reset()
+	return n, nil
+}
+
+// PWrite implements FileDescriptionImpl.PWrite.
+func (fd *DynamicBytesFileDescriptionImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	n, err := fd.pwriteLocked(ctx, src, offset, opts)
+	fd.mu.Unlock()
+	return n, err
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (fd *DynamicBytesFileDescriptionImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	n, err := fd.pwriteLocked(ctx, src, fd.off, opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
 // GenericConfigureMMap may be used by most implementations of
 // FileDescriptionImpl.ConfigureMMap.
 func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error {
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 0f44e7c8c..8fa26418e 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -35,61 +35,80 @@ type fileDescription struct {
 	FileDescriptionDefaultImpl
 }
 
-// genCountFD is a read-only FileDescriptionImpl representing a regular file
-// that contains the number of times its DynamicBytesSource.Generate()
+// genCount contains the number of times its DynamicBytesSource.Generate()
 // implementation has been called.
-type genCountFD struct {
+type genCount struct {
+	count uint64 // accessed using atomic memory ops
+}
+
+// Generate implements DynamicBytesSource.Generate.
+func (g *genCount) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%d", atomic.AddUint64(&g.count, 1))
+	return nil
+}
+
+type storeData struct {
+	data string
+}
+
+var _ WritableDynamicBytesSource = (*storeData)(nil)
+
+// Generate implements DynamicBytesSource.
+func (d *storeData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString(d.data)
+	return nil
+}
+
+// Generate implements WritableDynamicBytesSource.
+func (d *storeData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	buf := make([]byte, src.NumBytes())
+	n, err := src.CopyIn(ctx, buf)
+	if err != nil {
+		return 0, err
+	}
+
+	d.data = string(buf[:n])
+	return 0, nil
+}
+
+// testFD is a read-only FileDescriptionImpl representing a regular file.
+type testFD struct {
 	fileDescription
 	DynamicBytesFileDescriptionImpl
 
-	count uint64 // accessed using atomic memory ops
+	data DynamicBytesSource
 }
 
-func newGenCountFD(vfsObj *VirtualFilesystem) *FileDescription {
+func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription {
 	vd := vfsObj.NewAnonVirtualDentry("genCountFD")
 	defer vd.DecRef()
-	var fd genCountFD
-	fd.vfsfd.Init(&fd, 0 /* statusFlags */, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{})
-	fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd)
+	var fd testFD
+	fd.vfsfd.Init(&fd, statusFlags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{})
+	fd.DynamicBytesFileDescriptionImpl.SetDataSource(data)
 	return &fd.vfsfd
 }
 
 // Release implements FileDescriptionImpl.Release.
-func (fd *genCountFD) Release() {
-}
-
-// StatusFlags implements FileDescriptionImpl.StatusFlags.
-func (fd *genCountFD) StatusFlags(ctx context.Context) (uint32, error) {
-	return 0, nil
+func (fd *testFD) Release() {
 }
 
 // SetStatusFlags implements FileDescriptionImpl.SetStatusFlags.
-func (fd *genCountFD) SetStatusFlags(ctx context.Context, flags uint32) error {
-	return syserror.EPERM
-}
-
 // Stat implements FileDescriptionImpl.Stat.
-func (fd *genCountFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+func (fd *testFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
 	// Note that Statx.Mask == 0 in the return value.
 	return linux.Statx{}, nil
 }
 
 // SetStat implements FileDescriptionImpl.SetStat.
-func (fd *genCountFD) SetStat(ctx context.Context, opts SetStatOptions) error {
+func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error {
 	return syserror.EPERM
 }
 
-// Generate implements DynamicBytesSource.Generate.
-func (fd *genCountFD) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "%d", atomic.AddUint64(&fd.count, 1))
-	return nil
-}
-
 func TestGenCountFD(t *testing.T) {
 	ctx := contexttest.Context(t)
 
 	vfsObj := New() // vfs.New()
-	fd := newGenCountFD(vfsObj)
+	fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{})
 	defer fd.DecRef()
 
 	// The first read causes Generate to be called to fill the FD's buffer.
@@ -130,4 +149,69 @@ func TestGenCountFD(t *testing.T) {
 	if want := byte('3'); buf[0] != want {
 		t.Errorf("PRead: got byte %c, wanted %c", buf[0], want)
 	}
+
+	// Write and PWrite fails.
+	if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EINVAL {
+		t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+	}
+	if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EINVAL {
+		t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+	}
+}
+
+func TestWritable(t *testing.T) {
+	ctx := contexttest.Context(t)
+
+	vfsObj := New() // vfs.New()
+	fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"})
+	defer fd.DecRef()
+
+	buf := make([]byte, 10)
+	ioseq := usermem.BytesIOSequence(buf)
+	if n, err := fd.Read(ctx, ioseq, ReadOptions{}); n != 4 && err != io.EOF {
+		t.Fatalf("Read: got (%v, %v), wanted (4, EOF)", n, err)
+	}
+	if want := "init"; want == string(buf) {
+		t.Fatalf("Read: got %v, wanted %v", string(buf), want)
+	}
+
+	// Test PWrite.
+	want := "write"
+	writeIOSeq := usermem.BytesIOSequence([]byte(want))
+	if n, err := fd.PWrite(ctx, writeIOSeq, 0, WriteOptions{}); int(n) != len(want) && err != nil {
+		t.Errorf("PWrite: got err (%v, %v), wanted (%v, nil)", n, err, len(want))
+	}
+	if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF {
+		t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want))
+	}
+	if want == string(buf) {
+		t.Fatalf("PRead: got %v, wanted %v", string(buf), want)
+	}
+
+	// Test Seek to 0 followed by Write.
+	want = "write2"
+	writeIOSeq = usermem.BytesIOSequence([]byte(want))
+	if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 && err != nil {
+		t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err)
+	}
+	if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); int(n) != len(want) && err != nil {
+		t.Errorf("Write: got err (%v, %v), wanted (%v, nil)", n, err, len(want))
+	}
+	if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF {
+		t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want))
+	}
+	if want == string(buf) {
+		t.Fatalf("PRead: got %v, wanted %v", string(buf), want)
+	}
+
+	// Test failure if offset != 0.
+	if n, err := fd.Seek(ctx, 1, linux.SEEK_SET); n != 0 && err != nil {
+		t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err)
+	}
+	if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); n != 0 && err != syserror.EINVAL {
+		t.Errorf("Write: got err (%v, %v), wanted (0, EINVAL)", n, err)
+	}
+	if n, err := fd.PWrite(ctx, writeIOSeq, 2, WriteOptions{}); n != 0 && err != syserror.EINVAL {
+		t.Errorf("PWrite: got err (%v, %v), wanted (0, EINVAL)", n, err)
+	}
 }
-- 
cgit v1.2.3


From 4cb55a7a3b09c430fa2b7197fdc7b84b7e88a6ed Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 28 Jan 2020 18:43:24 -0800
Subject: Prevent arbitrary size allocation when sending UDS messages.

Currently, Send() will copy data into a new byte slice without regard to the
original size. Size checks should be performed before the allocation takes
place.

Note that for the sake of performance, we avoid putting the buffer
allocation into the critical section. As a result, the size checks need to be
performed again within Enqueue() in case the limit has changed.

PiperOrigin-RevId: 292058147
---
 pkg/sentry/socket/unix/transport/queue.go | 40 ++++++++++++++++++++++++-------
 pkg/sentry/socket/unix/transport/unix.go  | 29 +++++++---------------
 2 files changed, 40 insertions(+), 29 deletions(-)

diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go
index 5dcd3d95e..d8f3ad63d 100644
--- a/pkg/sentry/socket/unix/transport/queue.go
+++ b/pkg/sentry/socket/unix/transport/queue.go
@@ -18,6 +18,8 @@ import (
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -100,12 +102,16 @@ func (q *queue) IsWritable() bool {
 
 // Enqueue adds an entry to the data queue if room is available.
 //
+// If discardEmpty is true and there are zero bytes of data, the packet is
+// dropped.
+//
 // If truncate is true, Enqueue may truncate the message before enqueuing it.
-// Otherwise, the entire message must fit. If n < e.Length(), err indicates why.
+// Otherwise, the entire message must fit. If l is less than the size of data,
+// err indicates why.
 //
 // If notify is true, ReaderQueue.Notify must be called:
 // q.ReaderQueue.Notify(waiter.EventIn)
-func (q *queue) Enqueue(e *message, truncate bool) (l int64, notify bool, err *syserr.Error) {
+func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) {
 	q.mu.Lock()
 
 	if q.closed {
@@ -113,9 +119,16 @@ func (q *queue) Enqueue(e *message, truncate bool) (l int64, notify bool, err *s
 		return 0, false, syserr.ErrClosedForSend
 	}
 
-	free := q.limit - q.used
+	for _, d := range data {
+		l += int64(len(d))
+	}
+	if discardEmpty && l == 0 {
+		q.mu.Unlock()
+		c.Release()
+		return 0, false, nil
+	}
 
-	l = e.Length()
+	free := q.limit - q.used
 
 	if l > free && truncate {
 		if free == 0 {
@@ -124,8 +137,7 @@ func (q *queue) Enqueue(e *message, truncate bool) (l int64, notify bool, err *s
 			return 0, false, syserr.ErrWouldBlock
 		}
 
-		e.Truncate(free)
-		l = e.Length()
+		l = free
 		err = syserr.ErrWouldBlock
 	}
 
@@ -136,14 +148,26 @@ func (q *queue) Enqueue(e *message, truncate bool) (l int64, notify bool, err *s
 	}
 
 	if l > free {
-		// Message can't fit right now.
+		// Message can't fit right now, and could not be truncated.
 		q.mu.Unlock()
 		return 0, false, syserr.ErrWouldBlock
 	}
 
+	// Aggregate l bytes of data. This will truncate the data if l is less than
+	// the total bytes held in data.
+	v := make([]byte, l)
+	for i, b := 0, v; i < len(data) && len(b) > 0; i++ {
+		n := copy(b, data[i])
+		b = b[n:]
+	}
+
 	notify = q.dataList.Front() == nil
 	q.used += l
-	q.dataList.PushBack(e)
+	q.dataList.PushBack(&message{
+		Data:    buffer.View(v),
+		Control: c,
+		Address: from,
+	})
 
 	q.mu.Unlock()
 
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index dcbafe0e5..2ef654235 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -581,7 +581,7 @@ type ConnectedEndpoint interface {
 	//
 	// syserr.ErrWouldBlock can be returned along with a partial write if
 	// the caller should block to send the rest of the data.
-	Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error)
+	Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error)
 
 	// SendNotify notifies the ConnectedEndpoint of a successful Send. This
 	// must not be called while holding any endpoint locks.
@@ -653,35 +653,22 @@ func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
 }
 
 // Send implements ConnectedEndpoint.Send.
-func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
-	var l int64
-	for _, d := range data {
-		l += int64(len(d))
-	}
-
+func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
+	discardEmpty := false
 	truncate := false
 	if e.endpoint.Type() == linux.SOCK_STREAM {
-		// Since stream sockets don't preserve message boundaries, we
-		// can write only as much of the message as fits in the queue.
-		truncate = true
-
 		// Discard empty stream packets. Since stream sockets don't
 		// preserve message boundaries, sending zero bytes is a no-op.
 		// In Linux, the receiver actually uses a zero-length receive
 		// as an indication that the stream was closed.
-		if l == 0 {
-			controlMessages.Release()
-			return 0, false, nil
-		}
-	}
+		discardEmpty = true
 
-	v := make([]byte, 0, l)
-	for _, d := range data {
-		v = append(v, d...)
+		// Since stream sockets don't preserve message boundaries, we
+		// can write only as much of the message as fits in the queue.
+		truncate = true
 	}
 
-	l, notify, err := e.writeQueue.Enqueue(&message{Data: buffer.View(v), Control: controlMessages, Address: from}, truncate)
-	return int64(l), notify, err
+	return e.writeQueue.Enqueue(data, c, from, discardEmpty, truncate)
 }
 
 // SendNotify implements ConnectedEndpoint.SendNotify.
-- 
cgit v1.2.3


From 89957c6c87b5ad5c7bac68f93d9472388db57702 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Wed, 29 Jan 2020 04:45:59 -0500
Subject: Lazy-fpsimd support patch series#2: add fpsimd@Arm64 support to kvm
 module

Add fpsimd support to KVM module so that the test case "TestKernelFloatingPoint"
can be passed on Arm64 platform.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/kvm/bluepill_arm64.go        | 30 +++++++++++++++++
 pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go | 16 +++++++++
 pkg/sentry/platform/kvm/kvm_const_arm64.go       | 41 ++++++++++++------------
 pkg/sentry/platform/kvm/machine_arm64.go         |  4 +++
 pkg/sentry/platform/kvm/machine_arm64_unsafe.go  | 29 +++--------------
 pkg/sentry/platform/ring0/defs_arm64.go          |  6 ++++
 pkg/sentry/platform/ring0/lib_arm64.go           |  4 +--
 7 files changed, 83 insertions(+), 47 deletions(-)

diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
index 552341721..c215d443c 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -54,6 +54,14 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
 	context.Pstate = regs.Pstate
 	context.Pstate &^= uint64(ring0.UserFlagsClear)
 	context.Pstate |= ring0.UserFlagsSet
+
+	lazyVfp := c.GetLazyVFP()
+	if lazyVfp != 0 {
+		fpsimd := fpsimdPtr((*byte)(c.floatingPointState))
+		context.Fpsimd64.Fpsr = fpsimd.Fpsr
+		context.Fpsimd64.Fpcr = fpsimd.Fpcr
+		context.Fpsimd64.Vregs = fpsimd.Vregs
+	}
 }
 
 // KernelSyscall handles kernel syscalls.
@@ -64,6 +72,17 @@ func (c *vCPU) KernelSyscall() {
 	if regs.Regs[8] != ^uint64(0) {
 		regs.Pc -= 4 // Rewind.
 	}
+
+	vfpEnable := ring0.CPACREL1()
+	if vfpEnable != 0 {
+		fpsimd := fpsimdPtr((*byte)(c.floatingPointState))
+		fpcr := ring0.GetFPCR()
+		fpsr := ring0.GetFPSR()
+		fpsimd.Fpcr = uint32(fpcr)
+		fpsimd.Fpsr = uint32(fpsr)
+		ring0.SaveVRegs((*byte)(c.floatingPointState))
+	}
+
 	ring0.Halt()
 }
 
@@ -75,5 +94,16 @@ func (c *vCPU) KernelException(vector ring0.Vector) {
 	if vector == ring0.Vector(bounce) {
 		regs.Pc = 0
 	}
+
+	vfpEnable := ring0.CPACREL1()
+	if vfpEnable != 0 {
+		fpsimd := fpsimdPtr((*byte)(c.floatingPointState))
+		fpcr := ring0.GetFPCR()
+		fpsr := ring0.GetFPSR()
+		fpsimd.Fpcr = uint32(fpcr)
+		fpsimd.Fpsr = uint32(fpsr)
+		ring0.SaveVRegs((*byte)(c.floatingPointState))
+	}
+
 	ring0.Halt()
 }
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
index 2f02c03cf..af093c6ec 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -17,10 +17,26 @@
 package kvm
 
 import (
+	"unsafe"
+
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
+// fpsimdPtr returns a fpsimd64 for the given address.
+//
+//go:nosplit
+func fpsimdPtr(addr *byte) *arch.FpsimdContext {
+	return (*arch.FpsimdContext)(unsafe.Pointer(addr))
+}
+
 //go:nosplit
 func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
 	// TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64.
 }
+
+// bluepillArchFpContext returns the arch-specific fpsimd context.
+//
+//go:nosplit
+func bluepillArchFpContext(context unsafe.Pointer) *arch.FpsimdContext {
+	return &((*arch.SignalContext64)(context).Fpsimd64)
+}
diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go
index 5a74c6e36..531ae8b1e 100644
--- a/pkg/sentry/platform/kvm/kvm_const_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go
@@ -19,30 +19,31 @@ const (
 	_KVM_GET_ONE_REG = 0x4010aeab
 	_KVM_SET_ONE_REG = 0x4010aeac
 
-	_KVM_ARM_PREFERRED_TARGET = 0x8020aeaf
-	_KVM_ARM_VCPU_INIT        = 0x4020aeae
-	_KVM_ARM64_REGS_PSTATE    = 0x6030000000100042
-	_KVM_ARM64_REGS_SP_EL1    = 0x6030000000100044
-	_KVM_ARM64_REGS_R0        = 0x6030000000100000
-	_KVM_ARM64_REGS_R1        = 0x6030000000100002
-	_KVM_ARM64_REGS_R2        = 0x6030000000100004
-	_KVM_ARM64_REGS_R3        = 0x6030000000100006
-	_KVM_ARM64_REGS_R8        = 0x6030000000100010
-	_KVM_ARM64_REGS_R18       = 0x6030000000100024
-	_KVM_ARM64_REGS_PC        = 0x6030000000100040
-	_KVM_ARM64_REGS_MAIR_EL1  = 0x603000000013c510
-	_KVM_ARM64_REGS_TCR_EL1   = 0x603000000013c102
-	_KVM_ARM64_REGS_TTBR0_EL1 = 0x603000000013c100
-	_KVM_ARM64_REGS_TTBR1_EL1 = 0x603000000013c101
-	_KVM_ARM64_REGS_SCTLR_EL1 = 0x603000000013c080
-	_KVM_ARM64_REGS_CPACR_EL1 = 0x603000000013c082
-	_KVM_ARM64_REGS_VBAR_EL1  = 0x603000000013c600
+	_KVM_ARM_TARGET_GENERIC_V8 = 5
+	_KVM_ARM_PREFERRED_TARGET  = 0x8020aeaf
+	_KVM_ARM_VCPU_INIT         = 0x4020aeae
+	_KVM_ARM64_REGS_PSTATE     = 0x6030000000100042
+	_KVM_ARM64_REGS_SP_EL1     = 0x6030000000100044
+	_KVM_ARM64_REGS_R0         = 0x6030000000100000
+	_KVM_ARM64_REGS_R1         = 0x6030000000100002
+	_KVM_ARM64_REGS_R2         = 0x6030000000100004
+	_KVM_ARM64_REGS_R3         = 0x6030000000100006
+	_KVM_ARM64_REGS_R8         = 0x6030000000100010
+	_KVM_ARM64_REGS_R18        = 0x6030000000100024
+	_KVM_ARM64_REGS_PC         = 0x6030000000100040
+	_KVM_ARM64_REGS_MAIR_EL1   = 0x603000000013c510
+	_KVM_ARM64_REGS_TCR_EL1    = 0x603000000013c102
+	_KVM_ARM64_REGS_TTBR0_EL1  = 0x603000000013c100
+	_KVM_ARM64_REGS_TTBR1_EL1  = 0x603000000013c101
+	_KVM_ARM64_REGS_SCTLR_EL1  = 0x603000000013c080
+	_KVM_ARM64_REGS_CPACR_EL1  = 0x603000000013c082
+	_KVM_ARM64_REGS_VBAR_EL1   = 0x603000000013c600
 )
 
 // Arm64: Architectural Feature Access Control Register EL1.
 const (
-	_FPEN_NOTRAP = 0x3
-	_FPEN_SHIFT  = 0x20
+	_FPEN_NOTRAP = 3
+	_FPEN_SHIFT  = 20
 )
 
 // Arm64: System Control Register EL1.
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index 09552837a..e42505542 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -28,6 +28,10 @@ type vCPUArchState struct {
 	//
 	// This starts above fixedKernelPCID.
 	PCIDs *pagetables.PCIDs
+
+	// floatingPointState is the floating point state buffer used in guest
+	// to host transitions. See usage in bluepill_arm64.go.
+	floatingPointState *arch.FloatingPointData
 }
 
 const (
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 1c8384e6b..00801dee6 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -29,30 +29,6 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// setMemoryRegion initializes a region.
-//
-// This may be called from bluepillHandler, and therefore returns an errno
-// directly (instead of wrapping in an error) to avoid allocations.
-//
-//go:nosplit
-func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno {
-	userRegion := userMemoryRegion{
-		slot:          uint32(slot),
-		flags:         0,
-		guestPhysAddr: uint64(physical),
-		memorySize:    uint64(length),
-		userspaceAddr: uint64(virtual),
-	}
-
-	// Set the region.
-	_, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(m.fd),
-		_KVM_SET_USER_MEMORY_REGION,
-		uintptr(unsafe.Pointer(&userRegion)))
-	return errno
-}
-
 type kvmVcpuInit struct {
 	target   uint32
 	features [7]uint32
@@ -147,6 +123,7 @@ func (c *vCPU) initArchState() error {
 	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
 	regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer())
 
+	vcpuInit.target = _KVM_ARM_TARGET_GENERIC_V8
 	vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2)
 	if _, _, errno := syscall.RawSyscall(
 		syscall.SYS_IOCTL,
@@ -158,7 +135,8 @@ func (c *vCPU) initArchState() error {
 
 	// cpacr_el1
 	reg.id = _KVM_ARM64_REGS_CPACR_EL1
-	data = (_FPEN_NOTRAP << _FPEN_SHIFT)
+	// It is off by default, and it is turned on only when in use.
+	data = 0 // Disable fpsimd.
 	if err := c.setOneRegister(&reg); err != nil {
 		return err
 	}
@@ -250,6 +228,7 @@ func (c *vCPU) initArchState() error {
 		return err
 	}
 
+	c.floatingPointState = arch.NewFloatingPointData()
 	return nil
 }
 
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go
index 1583dda12..0e2ab716c 100644
--- a/pkg/sentry/platform/ring0/defs_arm64.go
+++ b/pkg/sentry/platform/ring0/defs_arm64.go
@@ -124,6 +124,12 @@ func (c *CPU) SetAppAddr(value uintptr) {
 	c.appAddr = value
 }
 
+// GetLazyVFP returns the value of cpacr_el1.
+//go:nosplit
+func (c *CPU) GetLazyVFP() (value uintptr) {
+	return c.lazyVFP
+}
+
 // SwitchArchOpts are embedded in SwitchOpts.
 type SwitchArchOpts struct {
 	// UserASID indicates that the application ASID to be used on switch,
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index af075aae4..80922f43d 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -20,13 +20,13 @@ package ring0
 func CPACREL1() (value uintptr)
 
 // FPCR returns the value of FPCR register.
-func FPCR() (value uintptr)
+func GetFPCR() (value uintptr)
 
 // SetFPCR writes the FPCR value.
 func SetFPCR(value uintptr)
 
 // FPSR returns the value of FPSR register.
-func FPSR() (value uintptr)
+func GetFPSR() (value uintptr)
 
 // SetFPSR writes the FPSR value.
 func SetFPSR(value uintptr)
-- 
cgit v1.2.3


From 6adbdfe232c3da42a7f6f3a7d882d140196e4068 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Wed, 29 Jan 2020 07:50:34 -0500
Subject: supporting sError in guest kernel on Arm64

For test case 'TestBounce', we use KVM_SET_VCPU_EVENTS to trigger sError
to leave guest.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/ring0/aarch64.go       |  6 +++---
 pkg/sentry/platform/ring0/entry_arm64.s    | 14 +++++++++++++-
 pkg/sentry/platform/ring0/offsets_arm64.go |  1 +
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go
index 6b078cd1e..f6da41c27 100644
--- a/pkg/sentry/platform/ring0/aarch64.go
+++ b/pkg/sentry/platform/ring0/aarch64.go
@@ -88,14 +88,14 @@ const (
 	El0Sync_undef
 	El0Sync_dbg
 	El0Sync_inv
-	VirtualizationException
 	_NR_INTERRUPTS
 )
 
 // System call vectors.
 const (
-	Syscall   Vector = El0Sync_svc
-	PageFault Vector = El0Sync_da
+	Syscall                 Vector = El0Sync_svc
+	PageFault               Vector = El0Sync_da
+	VirtualizationException Vector = El0Error
 )
 
 // VirtualAddressBits returns the number bits available for virtual addresses.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 679842288..baa6c4910 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -601,7 +601,19 @@ TEXT ·El0_fiq(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
 TEXT ·El0_error(SB),NOSPLIT,$0
-	B ·Shutdown(SB)
+	KERNEL_ENTRY_FROM_EL0
+	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
+	WORD $0xd538601a     //MRS   FAR_EL1, R26
+
+	MOVD R26, CPU_FAULT_ADDR(RSV_REG)
+
+	MOVD $1, R3
+	MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user.
+
+	MOVD $VirtualizationException, R3
+	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
+
+	B ·Halt(SB)
 
 TEXT ·El0_sync_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index 8c960c749..057fb5c69 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -85,6 +85,7 @@ func Emit(w io.Writer) {
 
 	fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault)
 	fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall)
+	fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException)
 
 	p := &syscall.PtraceRegs{}
 	fmt.Fprintf(w, "\n// Ptrace registers.\n")
-- 
cgit v1.2.3


From 8dcedc953a610b97efe9f68ac8fecf5e15a7e26b Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 29 Jan 2020 10:08:25 -0800
Subject: Add //pkg/sentry/devices/memdev.

PiperOrigin-RevId: 292165063
---
 pkg/abi/linux/dev.go                |  3 ++
 pkg/sentry/devices/memdev/BUILD     | 28 +++++++++++
 pkg/sentry/devices/memdev/full.go   | 75 ++++++++++++++++++++++++++++++
 pkg/sentry/devices/memdev/memdev.go | 59 ++++++++++++++++++++++++
 pkg/sentry/devices/memdev/null.go   | 76 ++++++++++++++++++++++++++++++
 pkg/sentry/devices/memdev/random.go | 92 +++++++++++++++++++++++++++++++++++++
 pkg/sentry/devices/memdev/zero.go   | 88 +++++++++++++++++++++++++++++++++++
 7 files changed, 421 insertions(+)
 create mode 100644 pkg/sentry/devices/memdev/BUILD
 create mode 100644 pkg/sentry/devices/memdev/full.go
 create mode 100644 pkg/sentry/devices/memdev/memdev.go
 create mode 100644 pkg/sentry/devices/memdev/null.go
 create mode 100644 pkg/sentry/devices/memdev/random.go
 create mode 100644 pkg/sentry/devices/memdev/zero.go

diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go
index 421e11256..89f9a793f 100644
--- a/pkg/abi/linux/dev.go
+++ b/pkg/abi/linux/dev.go
@@ -36,6 +36,9 @@ func DecodeDeviceID(rdev uint32) (uint16, uint32) {
 //
 // See Documentations/devices.txt and uapi/linux/major.h.
 const (
+	// MEM_MAJOR is the major device number for "memory" character devices.
+	MEM_MAJOR = 1
+
 	// TTYAUX_MAJOR is the major device number for alternate TTY devices.
 	TTYAUX_MAJOR = 5
 
diff --git a/pkg/sentry/devices/memdev/BUILD b/pkg/sentry/devices/memdev/BUILD
new file mode 100644
index 000000000..abe58f818
--- /dev/null
+++ b/pkg/sentry/devices/memdev/BUILD
@@ -0,0 +1,28 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "memdev",
+    srcs = [
+        "full.go",
+        "memdev.go",
+        "null.go",
+        "random.go",
+        "zero.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/rand",
+        "//pkg/safemem",
+        "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/mm",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go
new file mode 100644
index 000000000..c7e197691
--- /dev/null
+++ b/pkg/sentry/devices/memdev/full.go
@@ -0,0 +1,75 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memdev
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const fullDevMinor = 7
+
+// fullDevice implements vfs.Device for /dev/full.
+type fullDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (fullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &fullFD{}
+	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// fullFD implements vfs.FileDescriptionImpl for /dev/full.
+type fullFD struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *fullFD) Release() {
+	// noop
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *fullFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return dst.ZeroOut(ctx, dst.NumBytes())
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *fullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return dst.ZeroOut(ctx, dst.NumBytes())
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *fullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.ENOSPC
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *fullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.ENOSPC
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *fullFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return 0, nil
+}
diff --git a/pkg/sentry/devices/memdev/memdev.go b/pkg/sentry/devices/memdev/memdev.go
new file mode 100644
index 000000000..5759900c4
--- /dev/null
+++ b/pkg/sentry/devices/memdev/memdev.go
@@ -0,0 +1,59 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memdev implements "mem" character devices, as implemented in Linux
+// by drivers/char/mem.c and drivers/char/random.c.
+package memdev
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Register registers all devices implemented by this package in vfsObj.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+	for minor, dev := range map[uint32]vfs.Device{
+		nullDevMinor:    nullDevice{},
+		zeroDevMinor:    zeroDevice{},
+		fullDevMinor:    fullDevice{},
+		randomDevMinor:  randomDevice{},
+		urandomDevMinor: randomDevice{},
+	} {
+		if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MEM_MAJOR, minor, dev, &vfs.RegisterDeviceOptions{
+			GroupName: "mem",
+		}); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// CreateDevtmpfsFiles creates device special files in dev representing all
+// devices implemented by this package.
+func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error {
+	for minor, name := range map[uint32]string{
+		nullDevMinor:    "null",
+		zeroDevMinor:    "zero",
+		fullDevMinor:    "full",
+		randomDevMinor:  "random",
+		urandomDevMinor: "urandom",
+	} {
+		if err := dev.CreateDeviceFile(ctx, name, vfs.CharDevice, linux.MEM_MAJOR, minor, 0666 /* mode */); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go
new file mode 100644
index 000000000..33d060d02
--- /dev/null
+++ b/pkg/sentry/devices/memdev/null.go
@@ -0,0 +1,76 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memdev
+
+import (
+	"io"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const nullDevMinor = 3
+
+// nullDevice implements vfs.Device for /dev/null.
+type nullDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (nullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &nullFD{}
+	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// nullFD implements vfs.FileDescriptionImpl for /dev/null.
+type nullFD struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *nullFD) Release() {
+	// noop
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *nullFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return 0, io.EOF
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *nullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return 0, io.EOF
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *nullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return src.NumBytes(), nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *nullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return src.NumBytes(), nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *nullFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return 0, nil
+}
diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go
new file mode 100644
index 000000000..acfa23149
--- /dev/null
+++ b/pkg/sentry/devices/memdev/random.go
@@ -0,0 +1,92 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memdev
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+	randomDevMinor  = 8
+	urandomDevMinor = 9
+)
+
+// randomDevice implements vfs.Device for /dev/random and /dev/urandom.
+type randomDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (randomDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &randomFD{}
+	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// randomFD implements vfs.FileDescriptionImpl for /dev/random.
+type randomFD struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	// off is the "file offset". off is accessed using atomic memory
+	// operations.
+	off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *randomFD) Release() {
+	// noop
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *randomFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader})
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *randomFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader})
+	atomic.AddInt64(&fd.off, n)
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *randomFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	// In Linux, this mixes the written bytes into the entropy pool; we just
+	// throw them away.
+	return src.NumBytes(), nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *randomFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	atomic.AddInt64(&fd.off, src.NumBytes())
+	return src.NumBytes(), nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *randomFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	// Linux: drivers/char/random.c:random_fops.llseek == urandom_fops.llseek
+	// == noop_llseek
+	return atomic.LoadInt64(&fd.off), nil
+}
diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go
new file mode 100644
index 000000000..3b1372b9e
--- /dev/null
+++ b/pkg/sentry/devices/memdev/zero.go
@@ -0,0 +1,88 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memdev
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const zeroDevMinor = 5
+
+// zeroDevice implements vfs.Device for /dev/zero.
+type zeroDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (zeroDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &zeroFD{}
+	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// zeroFD implements vfs.FileDescriptionImpl for /dev/zero.
+type zeroFD struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *zeroFD) Release() {
+	// noop
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *zeroFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return dst.ZeroOut(ctx, dst.NumBytes())
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *zeroFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return dst.ZeroOut(ctx, dst.NumBytes())
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *zeroFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return src.NumBytes(), nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *zeroFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return src.NumBytes(), nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *zeroFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return 0, nil
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *zeroFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	m, err := mm.NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx))
+	if err != nil {
+		return err
+	}
+	opts.MappingIdentity = m
+	opts.Mappable = m
+	return nil
+}
-- 
cgit v1.2.3


From 37bb502670caefd4113da062495b4e318ea0f72e Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 29 Jan 2020 10:35:32 -0800
Subject: sentry: rename SetRSEQInterruptedIP to SetOldRSeqInterruptedIP for
 arm64

For amd64, this has been done on cl/288342928.

PiperOrigin-RevId: 292170856
---
 pkg/sentry/arch/arch_arm64.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index 94f1a808f..ac98897b5 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -137,8 +137,8 @@ func (c *context64) SetTLS(value uintptr) bool {
 	return false
 }
 
-// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP.
-func (c *context64) SetRSEQInterruptedIP(value uintptr) {
+// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
+func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
 	c.Regs.Regs[3] = uint64(value)
 }
 
-- 
cgit v1.2.3


From 148fda60e8dee29f2df85e3104e3d5de1a225bcf Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 29 Jan 2020 11:15:59 -0800
Subject: Add plumbing for file locks in VFS2.

Updates #1480

PiperOrigin-RevId: 292180192
---
 pkg/sentry/vfs/BUILD                         |  1 +
 pkg/sentry/vfs/file_description.go           | 21 ++++++++++++++++++++-
 pkg/sentry/vfs/file_description_impl_util.go | 21 +++++++++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index ced9d07b1..14b39eb9d 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -44,6 +44,7 @@ go_library(
         "//pkg/context",
         "//pkg/fspath",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
         "//pkg/sync",
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index badacb55e..5bac660c7 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -393,7 +394,25 @@ type FileDescriptionImpl interface {
 	// Removexattr removes the given extended attribute from the file.
 	Removexattr(ctx context.Context, name string) error
 
-	// TODO: file locking
+	// LockBSD tries to acquire a BSD-style advisory file lock.
+	//
+	// TODO(gvisor.dev/issue/1480): BSD-style file locking
+	LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
+
+	// LockBSD releases a BSD-style advisory file lock.
+	//
+	// TODO(gvisor.dev/issue/1480): BSD-style file locking
+	UnlockBSD(ctx context.Context, uid lock.UniqueID) error
+
+	// LockPOSIX tries to acquire a POSIX-style advisory file lock.
+	//
+	// TODO(gvisor.dev/issue/1480): POSIX-style file locking
+	LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error
+
+	// UnlockPOSIX releases a POSIX-style advisory file lock.
+	//
+	// TODO(gvisor.dev/issue/1480): POSIX-style file locking
+	UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error
 }
 
 // Dirent holds the information contained in struct linux_dirent64.
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index a4900c170..c2a52ec1b 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -152,6 +153,26 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string)
 	return syserror.ENOTSUP
 }
 
+// LockBSD implements FileDescriptionImpl.LockBSD.
+func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
+	return syserror.EBADF
+}
+
+// UnlockBSD implements FileDescriptionImpl.UnlockBSD.
+func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
+	return syserror.EBADF
+}
+
+// LockPOSIX implements FileDescriptionImpl.LockPOSIX.
+func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
+	return syserror.EBADF
+}
+
+// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX.
+func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
+	return syserror.EBADF
+}
+
 // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
 // FileDescriptionImpl that always represent directories to obtain
 // implementations of non-directory I/O methods that return EISDIR.
-- 
cgit v1.2.3


From 51b783505b1ec164b02b48a0fd234509fba01a73 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 29 Jan 2020 15:41:51 -0800
Subject: Add support for TCP_DEFER_ACCEPT.

PiperOrigin-RevId: 292233574
---
 pkg/sentry/socket/netstack/netstack.go      |  22 ++++
 pkg/tcpip/tcpip.go                          |   6 ++
 pkg/tcpip/transport/tcp/BUILD               |   1 +
 pkg/tcpip/transport/tcp/accept.go           |  25 ++---
 pkg/tcpip/transport/tcp/connect.go          |  53 +++++++++-
 pkg/tcpip/transport/tcp/endpoint.go         |  26 ++++-
 pkg/tcpip/transport/tcp/forwarder.go        |   4 +-
 pkg/tcpip/transport/tcp/tcp_test.go         | 126 ++++++++++++++++++++++
 test/syscalls/linux/socket_inet_loopback.cc | 158 ++++++++++++++++++++++++++++
 test/syscalls/linux/tcp_socket.cc           |  53 ++++++++++
 10 files changed, 451 insertions(+), 23 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 8619cc506..049d04bf2 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1260,6 +1260,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 
 		return int32(time.Duration(v) / time.Second), nil
 
+	case linux.TCP_DEFER_ACCEPT:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.TCPDeferAcceptOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		return int32(time.Duration(v) / time.Second), nil
+
 	default:
 		emitUnimplementedEventTCP(t, name)
 	}
@@ -1713,6 +1725,16 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		v := usermem.ByteOrder.Uint32(optVal)
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))))
 
+	case linux.TCP_DEFER_ACCEPT:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+		v := int32(usermem.ByteOrder.Uint32(optVal))
+		if v < 0 {
+			v = 0
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))))
+
 	case linux.TCP_REPAIR_OPTIONS:
 		t.Kernel().EmitUnimplementedEvent(t)
 
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 59c9b3fb0..0fa141d58 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -626,6 +626,12 @@ type TCPLingerTimeoutOption time.Duration
 // before being marked closed.
 type TCPTimeWaitTimeoutOption time.Duration
 
+// TCPDeferAcceptOption is used by SetSockOpt/GetSockOpt to allow a
+// accept to return a completed connection only when there is data to be
+// read. This usually means the listening socket will drop the final ACK
+// for a handshake till the specified timeout until a segment with data arrives.
+type TCPDeferAcceptOption time.Duration
+
 // MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
 // TTL value for multicast messages. The default is 1.
 type MulticastTTLOption uint8
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 4acd9fb9a..7b4a87a2d 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -57,6 +57,7 @@ go_library(
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/log",
         "//pkg/rand",
         "//pkg/sleep",
         "//pkg/sync",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index d469758eb..6101f2945 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -222,13 +222,13 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu
 
 // createConnectingEndpoint creates a new endpoint in a connecting state, with
 // the connection parameters given by the arguments.
-func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions) (*endpoint, *tcpip.Error) {
+func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create a new endpoint.
 	netProto := l.netProto
 	if netProto == 0 {
 		netProto = s.route.NetProto
 	}
-	n := newEndpoint(l.stack, netProto, nil)
+	n := newEndpoint(l.stack, netProto, queue)
 	n.v6only = l.v6only
 	n.ID = s.id
 	n.boundNICID = s.route.NICID()
@@ -273,16 +273,17 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 
 // createEndpoint creates a new endpoint in connected state and then performs
 // the TCP 3-way handshake.
-func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions) (*endpoint, *tcpip.Error) {
+func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create new endpoint.
 	irs := s.sequenceNumber
 	isn := generateSecureISN(s.id, l.stack.Seed())
-	ep, err := l.createConnectingEndpoint(s, isn, irs, opts)
+	ep, err := l.createConnectingEndpoint(s, isn, irs, opts, queue)
 	if err != nil {
 		return nil, err
 	}
 
 	// listenEP is nil when listenContext is used by tcp.Forwarder.
+	deferAccept := time.Duration(0)
 	if l.listenEP != nil {
 		l.listenEP.mu.Lock()
 		if l.listenEP.EndpointState() != StateListen {
@@ -290,13 +291,12 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 			return nil, tcpip.ErrConnectionAborted
 		}
 		l.addPendingEndpoint(ep)
+		deferAccept = l.listenEP.deferAccept
 		l.listenEP.mu.Unlock()
 	}
 
 	// Perform the 3-way handshake.
-	h := newHandshake(ep, seqnum.Size(ep.initialReceiveWindow()))
-
-	h.resetToSynRcvd(isn, irs, opts)
+	h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept)
 	if err := h.execute(); err != nil {
 		ep.Close()
 		if l.listenEP != nil {
@@ -377,16 +377,14 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 	defer e.decSynRcvdCount()
 	defer s.decRef()
 
-	n, err := ctx.createEndpointAndPerformHandshake(s, opts)
+	n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{})
 	if err != nil {
 		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 		e.stats.FailedConnectionAttempts.Increment()
 		return
 	}
 	ctx.removePendingEndpoint(n)
-	// Start the protocol goroutine.
-	wq := &waiter.Queue{}
-	n.startAcceptedLoop(wq)
+	n.startAcceptedLoop()
 	e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 
 	e.deliverAccepted(n)
@@ -546,7 +544,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr
 		}
 
-		n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions)
+		n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions, &waiter.Queue{})
 		if err != nil {
 			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 			e.stats.FailedConnectionAttempts.Increment()
@@ -576,8 +574,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// space available in the backlog.
 
 		// Start the protocol goroutine.
-		wq := &waiter.Queue{}
-		n.startAcceptedLoop(wq)
+		n.startAcceptedLoop()
 		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 		go e.deliverAccepted(n)
 	}
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 4e3c5419c..9ff7ac261 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -86,6 +86,19 @@ type handshake struct {
 
 	// rcvWndScale is the receive window scale, as defined in RFC 1323.
 	rcvWndScale int
+
+	// startTime is the time at which the first SYN/SYN-ACK was sent.
+	startTime time.Time
+
+	// deferAccept if non-zero will drop the final ACK for a passive
+	// handshake till an ACK segment with data is received or the timeout is
+	// hit.
+	deferAccept time.Duration
+
+	// acked is true if the the final ACK for a 3-way handshake has
+	// been received. This is required to stop retransmitting the
+	// original SYN-ACK when deferAccept is enabled.
+	acked bool
 }
 
 func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
@@ -112,6 +125,12 @@ func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
 	return h
 }
 
+func newPassiveHandshake(ep *endpoint, rcvWnd seqnum.Size, isn, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) handshake {
+	h := newHandshake(ep, rcvWnd)
+	h.resetToSynRcvd(isn, irs, opts, deferAccept)
+	return h
+}
+
 // FindWndScale determines the window scale to use for the given maximum window
 // size.
 func FindWndScale(wnd seqnum.Size) int {
@@ -181,7 +200,7 @@ func (h *handshake) effectiveRcvWndScale() uint8 {
 
 // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
 // state.
-func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions) {
+func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) {
 	h.active = false
 	h.state = handshakeSynRcvd
 	h.flags = header.TCPFlagSyn | header.TCPFlagAck
@@ -189,6 +208,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
 	h.ackNum = irs + 1
 	h.mss = opts.MSS
 	h.sndWndScale = opts.WS
+	h.deferAccept = deferAccept
 	h.ep.mu.Lock()
 	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
@@ -352,6 +372,14 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 	// We have previously received (and acknowledged) the peer's SYN. If the
 	// peer acknowledges our SYN, the handshake is completed.
 	if s.flagIsSet(header.TCPFlagAck) {
+		// If deferAccept is not zero and this is a bare ACK and the
+		// timeout is not hit then drop the ACK.
+		if h.deferAccept != 0 && s.data.Size() == 0 && time.Since(h.startTime) < h.deferAccept {
+			h.acked = true
+			h.ep.stack.Stats().DroppedPackets.Increment()
+			return nil
+		}
+
 		// If the timestamp option is negotiated and the segment does
 		// not carry a timestamp option then the segment must be dropped
 		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
@@ -365,10 +393,16 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
 		}
 		h.state = handshakeCompleted
+
 		h.ep.mu.Lock()
 		h.ep.transitionToStateEstablishedLocked(h)
+		// If the segment has data then requeue it for the receiver
+		// to process it again once main loop is started.
+		if s.data.Size() > 0 {
+			s.incRef()
+			h.ep.enqueueSegment(s)
+		}
 		h.ep.mu.Unlock()
-
 		return nil
 	}
 
@@ -471,6 +505,7 @@ func (h *handshake) execute() *tcpip.Error {
 		}
 	}
 
+	h.startTime = time.Now()
 	// Initialize the resend timer.
 	resendWaker := sleep.Waker{}
 	timeOut := time.Duration(time.Second)
@@ -524,11 +559,21 @@ func (h *handshake) execute() *tcpip.Error {
 		switch index, _ := s.Fetch(true); index {
 		case wakerForResend:
 			timeOut *= 2
-			if timeOut > 60*time.Second {
+			if timeOut > MaxRTO {
 				return tcpip.ErrTimeout
 			}
 			rt.Reset(timeOut)
-			h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+			// Resend the SYN/SYN-ACK only if the following conditions hold.
+			//  - It's an active handshake (deferAccept does not apply)
+			//  - It's a passive handshake and we have not yet got the final-ACK.
+			//  - It's a passive handshake and we got an ACK but deferAccept is
+			//    enabled and we are now past the deferAccept duration.
+			// The last is required to provide a way for the peer to complete
+			// the connection with another ACK or data (as ACKs are never
+			// retransmitted on their own).
+			if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept {
+				h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+			}
 
 		case wakerForNotification:
 			n := h.ep.fetchNotifications()
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 13718ff55..8d52414b7 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -498,6 +498,13 @@ type endpoint struct {
 	// without any data being acked.
 	userTimeout time.Duration
 
+	// deferAccept if non-zero specifies a user specified time during
+	// which the final ACK of a handshake will be dropped provided the
+	// ACK is a bare ACK and carries no data. If the timeout is crossed then
+	// the bare ACK is accepted and the connection is delivered to the
+	// listener.
+	deferAccept time.Duration
+
 	// pendingAccepted is a synchronization primitive used to track number
 	// of connections that are queued up to be delivered to the accepted
 	// channel. We use this to ensure that all goroutines blocked on writing
@@ -1574,6 +1581,15 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case tcpip.TCPDeferAcceptOption:
+		e.mu.Lock()
+		if time.Duration(v) > MaxRTO {
+			v = tcpip.TCPDeferAcceptOption(MaxRTO)
+		}
+		e.deferAccept = time.Duration(v)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return nil
 	}
@@ -1798,6 +1814,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case *tcpip.TCPDeferAcceptOption:
+		e.mu.Lock()
+		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -2149,9 +2171,8 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 
 // startAcceptedLoop sets up required state and starts a goroutine with the
 // main loop for accepted connections.
-func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
+func (e *endpoint) startAcceptedLoop() {
 	e.mu.Lock()
-	e.waiterQueue = waiterQueue
 	e.workerRunning = true
 	e.mu.Unlock()
 	wakerInitDone := make(chan struct{})
@@ -2177,7 +2198,6 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	default:
 		return nil, nil, tcpip.ErrWouldBlock
 	}
-
 	return n, n.waiterQueue, nil
 }
 
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index 7eb613be5..c9ee5bf06 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -157,13 +157,13 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint,
 		TSVal:         r.synOptions.TSVal,
 		TSEcr:         r.synOptions.TSEcr,
 		SACKPermitted: r.synOptions.SACKPermitted,
-	})
+	}, queue)
 	if err != nil {
 		return nil, err
 	}
 
 	// Start the protocol goroutine.
-	ep.startAcceptedLoop(queue)
+	ep.startAcceptedLoop()
 
 	return ep, nil
 }
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index df2fb1071..a12336d47 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -6787,3 +6787,129 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) {
 		),
 	)
 }
+
+func TestTCPDeferAccept(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	const tcpDeferAccept = 1 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %v", tcpDeferAccept, err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Send data. This should result in an acceptable endpoint.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+
+	// Give a bit of time for the socket to be delivered to the accept queue.
+	time.Sleep(50 * time.Millisecond)
+	aep, _, err := c.EP.Accept()
+	if err != nil {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err)
+	}
+
+	aep.Close()
+	// Closing aep without reading the data should trigger a RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
+
+func TestTCPDeferAcceptTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	const tcpDeferAccept = 1 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %v", tcpDeferAccept, err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Sleep for a little of the tcpDeferAccept timeout.
+	time.Sleep(tcpDeferAccept + 100*time.Millisecond)
+
+	// On timeout expiry we should get a SYN-ACK retransmission.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
+		checker.AckNum(uint32(irs)+1)))
+
+	// Send data. This should result in an acceptable endpoint.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+
+	// Give sometime for the endpoint to be delivered to the accept queue.
+	time.Sleep(50 * time.Millisecond)
+	aep, _, err := c.EP.Accept()
+	if err != nil {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err)
+	}
+
+	aep.Close()
+	// Closing aep without reading the data should trigger a RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 2f9821555..3bf7081b9 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -828,6 +828,164 @@ TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
   EXPECT_EQ(get, kUserTimeout);
 }
 
+// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+// saved. Enable S/R once issue is fixed.
+TEST_P(SocketInetLoopbackTest, TCPDeferAccept_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+  // saved. Enable S/R issue is fixed.
+  DisableSave ds;
+
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Set the TCP_DEFER_ACCEPT on the listening socket.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Set the listening socket to nonblock so that we can verify that there is no
+  // connection in queue despite the connect above succeeding since the peer has
+  // sent no data and TCP_DEFER_ACCEPT is set on the listening socket. Set the
+  // FD to O_NONBLOCK.
+  int opts;
+  ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds());
+  opts |= O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Set FD back to blocking.
+  opts &= ~O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Now write some data to the socket.
+  int data = 0;
+  ASSERT_THAT(RetryEINTR(write)(conn_fd.get(), &data, sizeof(data)),
+              SyscallSucceedsWithValue(sizeof(data)));
+
+  // This should now cause the connection to complete and be delivered to the
+  // accept socket.
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+  // Verify that the accepted socket returns the data written.
+  int get = -1;
+  ASSERT_THAT(RetryEINTR(recv)(accepted.get(), &get, sizeof(get), 0),
+              SyscallSucceedsWithValue(sizeof(get)));
+
+  EXPECT_EQ(get, data);
+}
+
+// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+// saved. Enable S/R once issue is fixed.
+TEST_P(SocketInetLoopbackTest, TCPDeferAcceptTimeout_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+  // saved. Enable S/R once issue is fixed.
+  DisableSave ds;
+
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Set the TCP_DEFER_ACCEPT on the listening socket.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Set the listening socket to nonblock so that we can verify that there is no
+  // connection in queue despite the connect above succeeding since the peer has
+  // sent no data and TCP_DEFER_ACCEPT is set on the listening socket. Set the
+  // FD to O_NONBLOCK.
+  int opts;
+  ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds());
+  opts |= O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Verify that there is no acceptable connection before TCP_DEFER_ACCEPT
+  // timeout is hit.
+  absl::SleepFor(absl::Seconds(kTCPDeferAccept - 1));
+  ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Set FD back to blocking.
+  opts &= ~O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Now sleep for a little over the TCP_DEFER_ACCEPT duration. When the timeout
+  // is hit a SYN-ACK should be retransmitted by the listener as a last ditch
+  // attempt to complete the connection with or without data.
+  absl::SleepFor(absl::Seconds(2));
+
+  // Verify that we have a connection that can be accepted even though no
+  // data was written.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     All, SocketInetLoopbackTest,
     ::testing::Values(
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 33a5ac66c..525ccbd88 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1286,6 +1286,59 @@ TEST_P(SimpleTcpSocketTest, SetTCPUserTimeout) {
   EXPECT_EQ(get, kTCPUserTimeout);
 }
 
+TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptNeg) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  // -ve TCP_DEFER_ACCEPT is same as setting it to zero.
+  constexpr int kNeg = -1;
+  EXPECT_THAT(
+      setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &kNeg, sizeof(kNeg)),
+      SyscallSucceeds());
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);
+}
+
+TEST_P(SimpleTcpSocketTest, GetTCPDeferAcceptDefault) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptGreaterThanZero) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  // kTCPDeferAccept is in seconds.
+  // NOTE: linux translates seconds to # of retries and back from
+  //   #of retries to seconds. Which means only certain values
+  //   translate back exactly. That's why we use 3 here, a value of
+  //   5 will result in us getting back 7 instead of 5 in the
+  //   getsockopt.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
+      SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kTCPDeferAccept);
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From 0ade523f061d25c2b4abeba9c74e879aae2ce376 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 29 Jan 2020 16:26:28 -0800
Subject: Fix iptables tests that were broken by rename.

The name of the runner binary target changed from "runner" to "runner-image",
causing iptables tests to fail.

PiperOrigin-RevId: 292242263
---
 scripts/iptables_tests.sh      | 4 ++--
 test/iptables/README.md        | 2 +-
 test/iptables/iptables_test.go | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/iptables_tests.sh b/scripts/iptables_tests.sh
index c47cbd675..3069d8628 100755
--- a/scripts/iptables_tests.sh
+++ b/scripts/iptables_tests.sh
@@ -19,9 +19,9 @@ source $(dirname $0)/common.sh
 install_runsc_for_test iptables
 
 # Build the docker image for the test.
-run //test/iptables/runner --norun
+run //test/iptables/runner-image --norun
 
 # TODO(gvisor.dev/issue/170): Also test this on runsc once iptables are better
 # supported
 test //test/iptables:iptables_test "--test_arg=--runtime=runc" \
-  "--test_arg=--image=bazel/test/iptables/runner:runner"
+  "--test_arg=--image=bazel/test/iptables/runner:runner-image"
diff --git a/test/iptables/README.md b/test/iptables/README.md
index 9f8e34420..8f61b4c41 100644
--- a/test/iptables/README.md
+++ b/test/iptables/README.md
@@ -28,7 +28,7 @@ Your test is now runnable with bazel!
 Build the testing Docker container:
 
 ```bash
-$ bazel run //test/iptables/runner -- --norun
+$ bazel run //test/iptables/runner-image -- --norun
 ```
 
 Run an individual test via:
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 679a29bef..41909582a 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -30,7 +30,7 @@ import (
 
 const timeout = 18 * time.Second
 
-var image = flag.String("image", "bazel/test/iptables/runner:runner", "image to run tests in")
+var image = flag.String("image", "bazel/test/iptables/runner:runner-image", "image to run tests in")
 
 type result struct {
 	output string
-- 
cgit v1.2.3


From 6f841c304d7bd9af6167d7d049bd5c594358a1b9 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 29 Jan 2020 19:54:11 -0800
Subject: Do not spawn a goroutine when calling stack.NDPDispatcher's methods

Do not start a new goroutine when calling
stack.NDPDispatcher.OnDuplicateAddressDetectionStatus.

PiperOrigin-RevId: 292268574
---
 pkg/tcpip/stack/ndp.go      | 8 ++++----
 pkg/tcpip/stack/ndp_test.go | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 245694118..281ae786d 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -167,8 +167,8 @@ type NDPDispatcher interface {
 	// reason, such as the address being removed). If an error occured
 	// during DAD, err will be set and resolved must be ignored.
 	//
-	// This function is permitted to block indefinitely without interfering
-	// with the stack's operation.
+	// This function is not permitted to block indefinitely. This function
+	// is also not permitted to call into the stack.
 	OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error)
 
 	// OnDefaultRouterDiscovered will be called when a new default router is
@@ -607,8 +607,8 @@ func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) {
 	delete(ndp.dad, addr)
 
 	// Let the integrator know DAD did not resolve.
-	if ndp.nic.stack.ndpDisp != nil {
-		go ndp.nic.stack.ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, false, nil)
+	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, false, nil)
 	}
 }
 
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 726468e41..8c76e80f2 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -497,7 +497,7 @@ func TestDADFail(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			ndpDisp := ndpDispatcher{
-				dadC: make(chan ndpDADEvent),
+				dadC: make(chan ndpDADEvent, 1),
 			}
 			ndpConfigs := stack.DefaultNDPConfigurations()
 			opts := stack.Options{
@@ -576,7 +576,7 @@ func TestDADFail(t *testing.T) {
 // removed.
 func TestDADStop(t *testing.T) {
 	ndpDisp := ndpDispatcher{
-		dadC: make(chan ndpDADEvent),
+		dadC: make(chan ndpDADEvent, 1),
 	}
 	ndpConfigs := stack.NDPConfigurations{
 		RetransmitTimer:        time.Second,
-- 
cgit v1.2.3


From ec0679737e8f9ab31ef6c7c3adb5a0005586b5a7 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 30 Jan 2020 07:12:04 -0800
Subject: Do not include the Source Link Layer option with an unspecified
 source address

When sending NDP messages with an unspecified source address, the Source
Link Layer address must not be included.

Test: stack_test.TestDADResolve
PiperOrigin-RevId: 292341334
---
 pkg/tcpip/stack/ndp.go      | 22 ++--------------------
 pkg/tcpip/stack/ndp_test.go | 12 ++++++++----
 2 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 281ae786d..31294345d 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -538,29 +538,11 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error {
 	r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 	defer r.Release()
 
-	linkAddr := ndp.nic.linkEP.LinkAddress()
-	isValidLinkAddr := header.IsValidUnicastEthernetAddress(linkAddr)
-	ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize
-	if isValidLinkAddr {
-		// Only include a Source Link Layer Address option if the NIC has a valid
-		// link layer address.
-		//
-		// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
-		// LinkEndpoint.LinkAddress) before reaching this point.
-		ndpNSSize += header.NDPLinkLayerAddressSize
-	}
-
-	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + ndpNSSize)
-	pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
+	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
+	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
 	pkt.SetType(header.ICMPv6NeighborSolicit)
 	ns := header.NDPNeighborSolicit(pkt.NDPPayload())
 	ns.SetTargetAddress(addr)
-
-	if isValidLinkAddr {
-		ns.Options().Serialize(header.NDPOptionsSerializer{
-			header.NDPSourceLinkLayerAddressOption(linkAddr),
-		})
-	}
 	pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
 	sent := r.Stats().ICMP.V6PacketsSent
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 8c76e80f2..bc7cfbcb4 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -413,14 +413,18 @@ func TestDADResolve(t *testing.T) {
 					t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
 				}
 
-				// Check NDP packet.
+				// Check NDP NS packet.
+				//
+				// As per RFC 4861 section 4.3, a possible option is the Source Link
+				// Layer option, but this option MUST NOT be included when the source
+				// address of the packet is the unspecified address.
 				checker.IPv6(t, p.Pkt.Header.View().ToVectorisedView().First(),
+					checker.SrcAddr(header.IPv6Any),
+					checker.DstAddr(header.SolicitedNodeAddr(addr1)),
 					checker.TTL(header.NDPHopLimit),
 					checker.NDPNS(
 						checker.NDPNSTargetAddress(addr1),
-						checker.NDPNSOptions([]header.NDPOption{
-							header.NDPSourceLinkLayerAddressOption(linkAddr1),
-						}),
+						checker.NDPNSOptions(nil),
 					))
 			}
 		})
-- 
cgit v1.2.3


From ede8dfab3760afc8063c3418f217e52f7ec70d42 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Thu, 30 Jan 2020 09:13:36 -0800
Subject: Enforce splice offset limits

Splice must not allow negative offsets. Writes also must not allow offset +
size to overflow int64. Reads are similarly broken, but not just in splice
(b/148095030).

Reported-by: syzbot+0e1ff0b95fb2859b4190@syzkaller.appspotmail.com
PiperOrigin-RevId: 292361208
---
 pkg/sentry/fs/tmpfs/inode_file.go       | 10 ++++--
 pkg/sentry/syscalls/linux/sys_splice.go | 16 ++++------
 test/syscalls/linux/splice.cc           | 56 +++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index dabc10662..25abbc151 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -17,6 +17,7 @@ package tmpfs
 import (
 	"fmt"
 	"io"
+	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -444,10 +445,15 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error)
 	defer rw.f.dataMu.Unlock()
 
 	// Compute the range to write.
-	end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes()))
-	if end == rw.offset { // srcs.NumBytes() == 0?
+	if srcs.NumBytes() == 0 {
+		// Nothing to do.
 		return 0, nil
 	}
+	end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes()))
+	if end == math.MaxInt64 {
+		// Overflow.
+		return 0, syserror.EINVAL
+	}
 
 	// Check if seals prevent either file growth or all writes.
 	switch {
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index f43d6c155..fd642834b 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -25,6 +25,10 @@ import (
 
 // doSplice implements a blocking splice operation.
 func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) {
+	if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 {
+		return 0, syserror.EINVAL
+	}
+
 	var (
 		total int64
 		n     int64
@@ -82,11 +86,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	offsetAddr := args[2].Pointer()
 	count := int64(args[3].SizeT())
 
-	// Don't send a negative number of bytes.
-	if count < 0 {
-		return 0, nil, syserror.EINVAL
-	}
-
 	// Get files.
 	inFile := t.GetFile(inFD)
 	if inFile == nil {
@@ -136,11 +135,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 			return 0, nil, err
 		}
 
-		// The offset must be valid.
-		if offset < 0 {
-			return 0, nil, syserror.EINVAL
-		}
-
 		// Do the splice.
 		n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{
 			Length:    count,
@@ -227,6 +221,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			if _, err := t.CopyIn(outOffset, &offset); err != nil {
 				return 0, nil, err
 			}
+
 			// Use the destination offset.
 			opts.DstOffset = true
 			opts.DstStart = offset
@@ -244,6 +239,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			if _, err := t.CopyIn(inOffset, &offset); err != nil {
 				return 0, nil, err
 			}
+
 			// Use the source offset.
 			opts.SrcOffset = true
 			opts.SrcStart = offset
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index 85232cb1f..faa1247f6 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -60,6 +60,62 @@ TEST(SpliceTest, TwoRegularFiles) {
               SyscallFailsWithErrno(EINVAL));
 }
 
+int memfd_create(const std::string& name, unsigned int flags) {
+  return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+TEST(SpliceTest, NegativeOffset) {
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Fill the pipe.
+  std::vector<char> buf(kPageSize);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(kPageSize));
+
+  // Open the output file as write only.
+  int fd;
+  EXPECT_THAT(fd = memfd_create("negative", 0), SyscallSucceeds());
+  const FileDescriptor out_fd(fd);
+
+  loff_t out_offset = 0xffffffffffffffffull;
+  constexpr int kSize = 2;
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kSize, 0),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+// Write offset + size overflows int64.
+//
+// This is a regression test for b/148041624.
+TEST(SpliceTest, WriteOverflow) {
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Fill the pipe.
+  std::vector<char> buf(kPageSize);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(kPageSize));
+
+  // Open the output file.
+  int fd;
+  EXPECT_THAT(fd = memfd_create("overflow", 0), SyscallSucceeds());
+  const FileDescriptor out_fd(fd);
+
+  // out_offset + kSize overflows INT64_MAX.
+  loff_t out_offset = 0x7ffffffffffffffeull;
+  constexpr int kSize = 3;
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kSize, 0),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(SpliceTest, SamePipe) {
   // Create a new pipe.
   int fds[2];
-- 
cgit v1.2.3


From 4ee64a248ec16fcc9e526a457a66648546611bfb Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 30 Jan 2020 11:48:36 -0800
Subject: Fix for panic in endpoint.Close().

When sending a RST on shutdown we need to double check the
state after acquiring the work mutex as the endpoint could
have transitioned out of a connected state from the time
we checked it and we acquired the workMutex.

I added two tests but sadly neither reproduce the panic. I am
going to leave the tests in as they are good to have anyway.

PiperOrigin-RevId: 292393800
---
 pkg/tcpip/transport/tcp/BUILD                |  1 +
 pkg/tcpip/transport/tcp/endpoint.go          | 10 ++++-
 pkg/tcpip/transport/tcp/tcp_test.go          | 55 ++++++++++++++++++++++++++++
 test/syscalls/linux/BUILD                    |  1 +
 test/syscalls/linux/socket_ip_tcp_generic.cc | 33 +++++++++++++++++
 5 files changed, 98 insertions(+), 2 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 7b4a87a2d..272e8f570 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -91,6 +91,7 @@ go_test(
     tags = ["flaky"],
     deps = [
         ":tcp",
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/checker",
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 8d52414b7..b5a8e15ee 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2047,8 +2047,14 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 				// work mutex is available.
 				if e.workMu.TryLock() {
 					e.mu.Lock()
-					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
-					e.notifyProtocolGoroutine(notifyTickleWorker)
+					// We need to double check here to make
+					// sure worker has not transitioned the
+					// endpoint out of a connected state
+					// before trying to send a reset.
+					if e.EndpointState().connected() {
+						e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+						e.notifyProtocolGoroutine(notifyTickleWorker)
+					}
 					e.mu.Unlock()
 					e.workMu.Unlock()
 				} else {
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index a12336d47..2c1505067 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -21,6 +21,7 @@ import (
 	"testing"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/checker"
@@ -6913,3 +6914,57 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
 		checker.SeqNum(uint32(iss+1)),
 		checker.AckNum(uint32(irs+5))))
 }
+
+func TestResetDuringClose(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	iss := seqnum.Value(789)
+	c.CreateConnected(iss, 30000, -1 /* epRecvBuf */)
+	// Send some data to make sure there is some unread
+	// data to trigger a reset on c.Close.
+	irs := c.IRS
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss.Add(1),
+		AckNum:  irs.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(irs.Add(1))),
+		checker.AckNum(uint32(iss.Add(5)))))
+
+	// Close in a separate goroutine so that we can trigger
+	// a race with the RST we send below. This should not
+	// panic due to the route being released depeding on
+	// whether Close() sends an active RST or the RST sent
+	// below is processed by the worker first.
+	var wg sync.WaitGroup
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		c.SendPacket(nil, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			SeqNum:  iss.Add(5),
+			AckNum:  c.IRS.Add(5),
+			RcvWnd:  30000,
+			Flags:   header.TCPFlagRst,
+		})
+	}()
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		c.EP.Close()
+	}()
+
+	wg.Wait()
+}
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 74bf068ec..7958fd0d7 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2173,6 +2173,7 @@ cc_library(
         ":socket_test_util",
         "//test/util:test_util",
         "//test/util:thread_util",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
     ],
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 57ce8e169..27779e47c 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -24,6 +24,7 @@
 #include <sys/un.h>
 
 #include "gtest/gtest.h"
+#include "absl/memory/memory.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/syscalls/linux/socket_test_util.h"
@@ -875,5 +876,37 @@ TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) {
   EXPECT_EQ(get, kAbove);
 }
 
+TEST_P(TCPSocketPairTest, TCPResetDuringClose_NoRandomSave) {
+  DisableSave ds;  // Too many syscalls.
+  constexpr int kThreadCount = 1000;
+  std::unique_ptr<ScopedThread> instances[kThreadCount];
+  for (int i = 0; i < kThreadCount; i++) {
+    instances[i] = absl::make_unique<ScopedThread>([&]() {
+      auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+      ScopedThread t([&]() {
+        // Close one end to trigger sending of a FIN.
+        struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0};
+        // Wait up to 20 seconds for the data.
+        constexpr int kPollTimeoutMs = 20000;
+        ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+                    SyscallSucceedsWithValue(1));
+        ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+      });
+
+      // Send some data then close.
+      constexpr char kStr[] = "abc";
+      ASSERT_THAT(write(sockets->first_fd(), kStr, 3),
+                  SyscallSucceedsWithValue(3));
+      absl::SleepFor(absl::Milliseconds(10));
+      ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+      t.Join();
+    });
+  }
+  for (int i = 0; i < kThreadCount; i++) {
+    instances[i]->Join();
+  }
+}
+
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 9988cf2eeff596ce519046d80c54d09166f7d84b Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Thu, 30 Jan 2020 14:06:54 -0800
Subject: Wrap all GetSocketPairs() in unnamed namespaces

This avoids conflicting definitions of GetSocketPairs() in outer namespace when
multiple such cc files are complied for one binary.

PiperOrigin-RevId: 292420885
---
 test/syscalls/linux/socket_abstract.cc                       | 2 ++
 test/syscalls/linux/socket_filesystem.cc                     | 2 ++
 test/syscalls/linux/socket_ip_tcp_generic_loopback.cc        | 2 ++
 test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc       | 2 ++
 test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc       | 2 ++
 test/syscalls/linux/socket_ip_udp_loopback.cc                | 2 ++
 test/syscalls/linux/socket_ip_udp_loopback_blocking.cc       | 2 ++
 test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc       | 2 ++
 test/syscalls/linux/socket_unix_abstract_nonblock.cc         | 2 ++
 test/syscalls/linux/socket_unix_blocking_local.cc            | 2 ++
 test/syscalls/linux/socket_unix_dgram_local.cc               | 2 ++
 test/syscalls/linux/socket_unix_domain.cc                    | 2 ++
 test/syscalls/linux/socket_unix_filesystem_nonblock.cc       | 2 ++
 test/syscalls/linux/socket_unix_non_stream_blocking_local.cc | 2 ++
 test/syscalls/linux/socket_unix_pair.cc                      | 2 ++
 test/syscalls/linux/socket_unix_pair_nonblock.cc             | 2 ++
 test/syscalls/linux/socket_unix_seqpacket_local.cc           | 2 ++
 test/syscalls/linux/socket_unix_stream_blocking_local.cc     | 2 ++
 test/syscalls/linux/socket_unix_stream_local.cc              | 2 ++
 test/syscalls/linux/socket_unix_stream_nonblock_local.cc     | 2 ++
 20 files changed, 40 insertions(+)

diff --git a/test/syscalls/linux/socket_abstract.cc b/test/syscalls/linux/socket_abstract.cc
index 715d87b76..00999f192 100644
--- a/test/syscalls/linux/socket_abstract.cc
+++ b/test/syscalls/linux/socket_abstract.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -43,5 +44,6 @@ INSTANTIATE_TEST_SUITE_P(
     AbstractUnixSockets, UnixSocketPairCmsgTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_filesystem.cc b/test/syscalls/linux/socket_filesystem.cc
index 74e262959..287359363 100644
--- a/test/syscalls/linux/socket_filesystem.cc
+++ b/test/syscalls/linux/socket_filesystem.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -43,5 +44,6 @@ INSTANTIATE_TEST_SUITE_P(
     FilesystemUnixSockets, UnixSocketPairCmsgTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
index d11f7cc23..4e79d21f4 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVecToVec<SocketPairKind>(
@@ -39,5 +40,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllTCPSockets, TCPSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
index fcd20102f..f996b93d2 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVecToVec<SocketPairKind>(
@@ -39,5 +40,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingTCPSockets, BlockingStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
index 63a05b799..ffa377210 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVecToVec<SocketPairKind>(
@@ -38,5 +39,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingTCPSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_loopback.cc b/test/syscalls/linux/socket_ip_udp_loopback.cc
index 1df74a348..c7fa44884 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -44,5 +45,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllUDPSockets, UDPSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
index 1e259efa7..d6925a8df 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingUDPSockets, BlockingNonStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
index 74cbd326d..d675eddc6 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingUDPSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_abstract_nonblock.cc b/test/syscalls/linux/socket_unix_abstract_nonblock.cc
index be31ab2a7..8bef76b67 100644
--- a/test/syscalls/linux/socket_unix_abstract_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_abstract_nonblock.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingAbstractUnixSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_blocking_local.cc b/test/syscalls/linux/socket_unix_blocking_local.cc
index 6f84221b2..77cb8c6d6 100644
--- a/test/syscalls/linux/socket_unix_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_blocking_local.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(
@@ -39,5 +40,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingUnixDomainSockets, BlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_dgram_local.cc b/test/syscalls/linux/socket_unix_dgram_local.cc
index 9134fcdf7..31d2d5216 100644
--- a/test/syscalls/linux/socket_unix_dgram_local.cc
+++ b/test/syscalls/linux/socket_unix_dgram_local.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(VecCat<SocketPairKind>(
@@ -52,5 +53,6 @@ INSTANTIATE_TEST_SUITE_P(
     DgramUnixSockets, NonStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_domain.cc b/test/syscalls/linux/socket_unix_domain.cc
index fa3efc7f8..f7dff8b4d 100644
--- a/test/syscalls/linux/socket_unix_domain.cc
+++ b/test/syscalls/linux/socket_unix_domain.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllUnixDomainSockets, AllSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
index 8ba7af971..6700b4d90 100644
--- a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingFilesystemUnixSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
index 8855d5001..fddcdf1c5 100644
--- a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(
@@ -36,5 +37,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingNonStreamUnixSockets, BlockingNonStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_pair.cc b/test/syscalls/linux/socket_unix_pair.cc
index 411fb4518..85999db04 100644
--- a/test/syscalls/linux/socket_unix_pair.cc
+++ b/test/syscalls/linux/socket_unix_pair.cc
@@ -22,6 +22,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(ApplyVec<SocketPairKind>(
@@ -38,5 +39,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllUnixDomainSockets, UnixSocketPairCmsgTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_pair_nonblock.cc b/test/syscalls/linux/socket_unix_pair_nonblock.cc
index 3135d325f..281410a9a 100644
--- a/test/syscalls/linux/socket_unix_pair_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_pair_nonblock.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingUnixSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_seqpacket_local.cc b/test/syscalls/linux/socket_unix_seqpacket_local.cc
index dff75a532..69a5f150d 100644
--- a/test/syscalls/linux/socket_unix_seqpacket_local.cc
+++ b/test/syscalls/linux/socket_unix_seqpacket_local.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(VecCat<SocketPairKind>(
@@ -52,5 +53,6 @@ INSTANTIATE_TEST_SUITE_P(
     SeqpacketUnixSockets, UnixNonStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
index 08e579ba7..8429bd429 100644
--- a/test/syscalls/linux/socket_unix_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -34,5 +35,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingStreamUnixSockets, BlockingStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_stream_local.cc b/test/syscalls/linux/socket_unix_stream_local.cc
index 65eef1a81..a7e3449a9 100644
--- a/test/syscalls/linux/socket_unix_stream_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_local.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(
@@ -42,5 +43,6 @@ INSTANTIATE_TEST_SUITE_P(
     StreamUnixSockets, StreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
index 1936aa135..4b763c8e2 100644
--- a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
@@ -20,6 +20,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingStreamUnixSockets, NonBlockingStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 14959250feb71df74dea13f3cb15dcbe8ce6b3f3 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 30 Jan 2020 17:37:17 -0800
Subject: Simplify testing link rules.

PiperOrigin-RevId: 292458933
---
 test/syscalls/linux/BUILD | 688 +++++++++++++++++++++++-----------------------
 test/util/BUILD           |  30 +-
 tools/build/defs.bzl      |   1 +
 tools/defs.bzl            |   3 +-
 tools/images/BUILD        |   4 +-
 5 files changed, 363 insertions(+), 363 deletions(-)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index ee7a8a673..e4ca5b6db 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_binary", "cc_library", "default_net_util", "select_arch", "select_system")
+load("//tools:defs.bzl", "cc_binary", "cc_library", "default_net_util", "gtest", "select_arch", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -82,14 +82,14 @@ cc_library(
     srcs = ["base_poll_test.cc"],
     hdrs = ["base_poll_test.h"],
     deps = [
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -99,11 +99,11 @@ cc_library(
     hdrs = ["file_base.h"],
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -130,7 +130,7 @@ cc_library(
     hdrs = ["socket_test_util.h"],
     defines = select_system(),
     deps = default_net_util() + [
-        "@com_google_googletest//:gtest",
+        gtest,
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -155,9 +155,9 @@ cc_library(
     hdrs = ["unix_domain_socket_test_util.h"],
     deps = [
         ":socket_test_util",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
     ],
 )
 
@@ -179,14 +179,14 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -199,13 +199,13 @@ cc_binary(
     ),
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:memory_util",
         "//test/util:platform_util",
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -218,9 +218,9 @@ cc_binary(
         ":socket_test_util",
         ":unix_domain_socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -233,9 +233,9 @@ cc_binary(
         ":socket_test_util",
         ":unix_domain_socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -247,10 +247,10 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -262,12 +262,12 @@ cc_binary(
     deps = [
         "//test/util:cleanup",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -280,12 +280,11 @@ cc_binary(
     ],
     linkstatic = 1,
     deps = [
-        # The heapchecker doesn't recognize that io_destroy munmaps.
-        "@com_google_googletest//:gtest",
-        "@com_google_absl//absl/strings",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:memory_util",
         "//test/util:posix_error",
         "//test/util:proc_util",
@@ -302,12 +301,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -320,9 +319,9 @@ cc_binary(
         "//:sandbox",
     ],
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -334,9 +333,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -348,9 +347,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -372,10 +371,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -388,10 +387,10 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -404,14 +403,14 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/synchronization",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -424,12 +423,12 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/flags:flag",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -443,12 +442,12 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:mount_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -458,9 +457,9 @@ cc_binary(
     srcs = ["clock_getres.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -470,11 +469,11 @@ cc_binary(
     srcs = ["clock_gettime.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -484,13 +483,13 @@ cc_binary(
     srcs = ["concurrency.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:platform_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -503,9 +502,9 @@ cc_binary(
         ":socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -516,10 +515,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -530,9 +529,9 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -544,11 +543,11 @@ cc_binary(
     deps = [
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -561,10 +560,10 @@ cc_binary(
         "//test/util:epoll_util",
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -576,10 +575,10 @@ cc_binary(
     deps = [
         "//test/util:epoll_util",
         "//test/util:eventfd_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -589,12 +588,12 @@ cc_binary(
     srcs = ["exceptions.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:logging",
         "//test/util:platform_util",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -604,10 +603,10 @@ cc_binary(
     srcs = ["getcpu.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -617,10 +616,10 @@ cc_binary(
     srcs = ["getcpu.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -630,13 +629,13 @@ cc_binary(
     srcs = ["getrusage.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -652,14 +651,14 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:proc_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -682,15 +681,15 @@ cc_binary(
     deps = [
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/types:optional",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/types:optional",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -701,11 +700,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -718,10 +717,10 @@ cc_binary(
         ":file_base",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -731,9 +730,9 @@ cc_binary(
     srcs = ["fault.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -744,10 +743,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -761,18 +760,18 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:eventfd_util",
         "//test/util:fs_util",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:save_util",
         "//test/util:temp_path",
         "//test/util:test_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -786,15 +785,15 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -805,13 +804,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -824,11 +823,11 @@ cc_binary(
     ),
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:logging",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -841,10 +840,10 @@ cc_binary(
     ),
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -855,10 +854,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -869,10 +868,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -884,6 +883,9 @@ cc_binary(
     deps = [
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:memory_util",
         "//test/util:save_util",
         "//test/util:temp_path",
@@ -892,9 +894,6 @@ cc_binary(
         "//test/util:thread_util",
         "//test/util:time_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -907,12 +906,12 @@ cc_binary(
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -922,9 +921,9 @@ cc_binary(
     srcs = ["getrandom.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -957,10 +956,10 @@ cc_binary(
         ":socket_test_util",
         ":unix_domain_socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -984,9 +983,9 @@ cc_binary(
         ":socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -997,6 +996,9 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
@@ -1004,9 +1006,6 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1018,15 +1017,15 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1039,14 +1038,14 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1057,10 +1056,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1071,6 +1070,7 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
@@ -1078,7 +1078,6 @@ cc_binary(
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1089,12 +1088,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        "@com_google_absl//absl/memory",
+        gtest,
         "//test/util:memory_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1104,11 +1103,11 @@ cc_binary(
     srcs = ["mincore.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:memory_util",
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1121,10 +1120,10 @@ cc_binary(
         ":temp_umask",
         "//test/util:capability_util",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1135,11 +1134,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1151,12 +1150,12 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:cleanup",
+        gtest,
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
         "//test/util:rlimit_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1169,13 +1168,13 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1188,6 +1187,9 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:mount_util",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
@@ -1195,9 +1197,6 @@ cc_binary(
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1207,10 +1206,9 @@ cc_binary(
     srcs = ["mremap.cc"],
     linkstatic = 1,
     deps = [
-        # The heap check fails due to MremapDeathTest
-        "@com_google_googletest//:gtest",
-        "@com_google_absl//absl/strings",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
@@ -1242,9 +1240,9 @@ cc_binary(
     srcs = ["munmap.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1261,14 +1259,14 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1282,10 +1280,10 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1299,11 +1297,11 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:endian",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -1317,11 +1315,11 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:endian",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -1333,16 +1331,16 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:pty_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1354,12 +1352,12 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:posix_error",
         "//test/util:pty_util",
         "//test/util:test_main",
         "//test/util:thread_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1372,12 +1370,12 @@ cc_binary(
         "//test/syscalls/linux:socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1387,13 +1385,13 @@ cc_binary(
     srcs = ["pause.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1405,15 +1403,15 @@ cc_binary(
     deps = [
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1426,13 +1424,13 @@ cc_binary(
         ":base_poll_test",
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1443,11 +1441,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         ":base_poll_test",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1458,9 +1456,9 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1472,12 +1470,12 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:cleanup",
+        "@com_google_absl//absl/flags:flag",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1488,13 +1486,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        "@com_google_absl//absl/flags:flag",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1505,10 +1503,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1519,6 +1517,8 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:temp_path",
@@ -1526,8 +1526,6 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1541,13 +1539,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1559,11 +1557,11 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1577,6 +1575,10 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:memory_util",
         "//test/util:posix_error",
         "//test/util:temp_path",
@@ -1584,10 +1586,6 @@ cc_binary(
         "//test/util:thread_util",
         "//test/util:time_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1601,11 +1599,11 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -1617,17 +1615,17 @@ cc_binary(
     deps = [
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:optional",
+        gtest,
         "//test/util:memory_util",
         "//test/util:posix_error",
         "//test/util:proc_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/types:optional",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1641,6 +1639,8 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
@@ -1648,8 +1648,6 @@ cc_binary(
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1660,11 +1658,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         ":base_poll_test",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1674,6 +1672,9 @@ cc_binary(
     srcs = ["ptrace.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:platform_util",
@@ -1681,9 +1682,6 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1693,10 +1691,10 @@ cc_binary(
     srcs = ["pwrite64.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1710,12 +1708,12 @@ cc_binary(
     deps = [
         ":file_base",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1729,11 +1727,11 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:endian",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -1747,10 +1745,10 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1764,10 +1762,10 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1778,10 +1776,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1792,10 +1790,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1811,13 +1809,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1832,12 +1830,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1851,11 +1849,11 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1879,11 +1877,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/syscalls/linux/rseq:lib",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1894,11 +1892,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        gtest,
         "//test/util:logging",
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1908,9 +1906,9 @@ cc_binary(
     srcs = ["sched.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1920,9 +1918,9 @@ cc_binary(
     srcs = ["sched_yield.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1932,6 +1930,8 @@ cc_binary(
     srcs = ["seccomp.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
@@ -1939,8 +1939,6 @@ cc_binary(
         "//test/util:proc_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1952,14 +1950,14 @@ cc_binary(
     deps = [
         ":base_poll_test",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:rlimit_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1971,13 +1969,13 @@ cc_binary(
     deps = [
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1989,12 +1987,12 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2005,13 +2003,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2021,9 +2019,9 @@ cc_binary(
     srcs = ["sigaction.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2038,13 +2036,13 @@ cc_binary(
     deps = [
         "//test/util:cleanup",
         "//test/util:fs_util",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2057,7 +2055,7 @@ cc_binary(
     ),
     linkstatic = 1,
     deps = [
-        "@com_google_googletest//:gtest",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_util",
@@ -2075,14 +2073,14 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/synchronization",
+        gtest,
         "//test/util:logging",
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2092,10 +2090,10 @@ cc_binary(
     srcs = ["sigprocmask.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2105,13 +2103,13 @@ cc_binary(
     srcs = ["sigstop.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2122,13 +2120,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2144,10 +2142,10 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
     ],
     alwayslink = 1,
 )
@@ -2160,8 +2158,8 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2174,8 +2172,8 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2191,11 +2189,11 @@ cc_library(
     ],
     deps = [
         ":socket_test_util",
-        "//test/util:test_util",
-        "//test/util:thread_util",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
+        "//test/util:thread_util",
     ],
     alwayslink = 1,
 )
@@ -2212,8 +2210,8 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2230,9 +2228,9 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:memory_util",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2250,8 +2248,8 @@ cc_library(
         ":ip_socket_test_util",
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2268,8 +2266,8 @@ cc_library(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2286,9 +2284,9 @@ cc_library(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
-        "//test/util:test_util",
         "@com_google_absl//absl/memory",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
     ],
     alwayslink = 1,
 )
@@ -2305,8 +2303,8 @@ cc_library(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2323,8 +2321,8 @@ cc_library(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2387,9 +2385,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2419,9 +2417,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2451,9 +2449,9 @@ cc_binary(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2551,10 +2549,10 @@ cc_binary(
         ":socket_bind_to_device_util",
         ":socket_test_util",
         "//test/util:capability_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2570,10 +2568,10 @@ cc_binary(
         ":socket_bind_to_device_util",
         ":socket_test_util",
         "//test/util:capability_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2589,10 +2587,10 @@ cc_binary(
         ":socket_bind_to_device_util",
         ":socket_test_util",
         "//test/util:capability_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2638,9 +2636,9 @@ cc_binary(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2719,15 +2717,15 @@ cc_binary(
         ":ip_socket_test_util",
         ":socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:save_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2739,9 +2737,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2755,10 +2753,10 @@ cc_binary(
         ":socket_test_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings:str_format",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2771,9 +2769,9 @@ cc_binary(
         ":socket_netlink_util",
         ":socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2791,9 +2789,9 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
-        "//test/util:test_util",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
     ],
     alwayslink = 1,
 )
@@ -2810,11 +2808,11 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2831,10 +2829,10 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2851,10 +2849,10 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2871,11 +2869,11 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2892,8 +2890,8 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2910,10 +2908,10 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -3007,9 +3005,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3021,9 +3019,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3035,9 +3033,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3052,9 +3050,9 @@ cc_binary(
         ":socket_blocking_test_cases",
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3069,9 +3067,9 @@ cc_binary(
         ":ip_socket_test_util",
         ":socket_blocking_test_cases",
         ":socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3086,9 +3084,9 @@ cc_binary(
         ":socket_non_stream_blocking_test_cases",
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3103,9 +3101,9 @@ cc_binary(
         ":ip_socket_test_util",
         ":socket_non_stream_blocking_test_cases",
         ":socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3121,9 +3119,9 @@ cc_binary(
         ":socket_unix_cmsg_test_cases",
         ":socket_unix_test_cases",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3135,9 +3133,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3149,9 +3147,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3164,10 +3162,10 @@ cc_binary(
         ":socket_netlink_util",
         ":socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:endian",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/base:endian",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3183,12 +3181,12 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3199,11 +3197,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3217,12 +3215,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3235,10 +3233,10 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3248,10 +3246,10 @@ cc_binary(
     srcs = ["sync.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3261,10 +3259,10 @@ cc_binary(
     srcs = ["sysinfo.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3274,9 +3272,9 @@ cc_binary(
     srcs = ["syslog.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3286,10 +3284,10 @@ cc_binary(
     srcs = ["sysret.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:logging",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3301,12 +3299,12 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3316,11 +3314,11 @@ cc_binary(
     srcs = ["tgkill.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3330,10 +3328,10 @@ cc_binary(
     srcs = ["time.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:proc_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3358,15 +3356,15 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3376,11 +3374,11 @@ cc_binary(
     srcs = ["tkill.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:logging",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3394,11 +3392,11 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3414,12 +3412,12 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -3442,9 +3440,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3455,14 +3453,14 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:uid_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3473,11 +3471,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3490,11 +3488,11 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3504,11 +3502,11 @@ cc_binary(
     srcs = ["unshare.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/synchronization",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3534,11 +3532,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:fs_util",
+        gtest,
         "//test/util:posix_error",
         "//test/util:proc_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3548,13 +3546,13 @@ cc_binary(
     srcs = ["vfork.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:test_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3566,6 +3564,10 @@ cc_binary(
     deps = [
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
@@ -3574,10 +3576,6 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3588,10 +3586,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3602,12 +3600,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3618,14 +3616,14 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
-        "//test/util:test_main",
-        "//test/util:test_util",
-        "//test/util:thread_util",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
     ],
 )
 
@@ -3651,10 +3649,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3664,11 +3662,11 @@ cc_binary(
     srcs = ["vdso_clock_gettime.cc"],
     linkstatic = 1,
     deps = [
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -3678,10 +3676,10 @@ cc_binary(
     srcs = ["vsyscall.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:proc_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3694,11 +3692,11 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -3710,12 +3708,12 @@ cc_binary(
     deps = [
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3727,10 +3725,10 @@ cc_binary(
     deps = [
         ":ip_socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3742,10 +3740,10 @@ cc_binary(
     deps = [
         ":ip_socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3761,11 +3759,11 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
diff --git a/test/util/BUILD b/test/util/BUILD
index 1ac8b3fd6..1f22ebe29 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_library", "cc_test", "select_system")
+load("//tools:defs.bzl", "cc_library", "cc_test", "gtest", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -41,7 +41,7 @@ cc_library(
         ":save_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -55,7 +55,7 @@ cc_library(
         ":posix_error",
         ":test_util",
         "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -67,7 +67,7 @@ cc_test(
         ":proc_util",
         ":test_main",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -87,7 +87,7 @@ cc_library(
         ":file_descriptor",
         ":posix_error",
         "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -101,7 +101,7 @@ cc_test(
         ":temp_path",
         ":test_main",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -134,7 +134,7 @@ cc_library(
         ":cleanup",
         ":posix_error",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -183,7 +183,7 @@ cc_library(
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:variant",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -194,7 +194,7 @@ cc_test(
     deps = [
         ":posix_error",
         ":test_main",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -218,7 +218,7 @@ cc_library(
         ":cleanup",
         ":posix_error",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -233,7 +233,7 @@ cc_library(
         ":test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -259,7 +259,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -291,7 +291,7 @@ cc_library(
         ":posix_error",
         ":test_util",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -302,7 +302,7 @@ cc_test(
     deps = [
         ":test_main",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -322,7 +322,7 @@ cc_library(
         ":file_descriptor",
         ":posix_error",
         ":save_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
diff --git a/tools/build/defs.bzl b/tools/build/defs.bzl
index d0556abd1..967c1f900 100644
--- a/tools/build/defs.bzl
+++ b/tools/build/defs.bzl
@@ -18,6 +18,7 @@ cc_test = _cc_test
 cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
 go_image = _go_image
 go_embed_data = _go_embed_data
+gtest = "@com_google_googletest//:gtest"
 loopback = "//tools/build:loopback"
 proto_library = native.proto_library
 pkg_deb = _pkg_deb
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 819f12b0d..ce677cbbf 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,7 +7,7 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/build:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/build:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
 
 # Delegate directly.
 cc_binary = _cc_binary
@@ -20,6 +20,7 @@ go_embed_data = _go_embed_data
 go_image = _go_image
 go_test = _go_test
 go_tool_library = _go_tool_library
+gtest = _gtest
 pkg_deb = _pkg_deb
 pkg_tar = _pkg_tar
 py_library = _py_library
diff --git a/tools/images/BUILD b/tools/images/BUILD
index f1699b184..fe11f08a3 100644
--- a/tools/images/BUILD
+++ b/tools/images/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_binary")
+load("//tools:defs.bzl", "cc_binary", "gtest")
 load("//tools/images:defs.bzl", "vm_image", "vm_test")
 
 package(
@@ -32,8 +32,8 @@ cc_binary(
     srcs = ["test.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
-        "@com_google_googletest//:gtest",
     ],
 )
 
-- 
cgit v1.2.3


From 7c118f7e192d403e716807c0f75f3f6d077a31ba Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 31 Jan 2020 09:55:51 -0800
Subject: KVM platform does not support 32bit.

Fixes: //test/syscalls:32bit_test_runsc_kvm
Ref change: 5d569408ef94c753b7aae9392b5e4ebf7e5ea50d
PiperOrigin-RevId: 292563926
---
 test/util/platform_util.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/util/platform_util.cc b/test/util/platform_util.cc
index 2724e63f3..c9200d381 100644
--- a/test/util/platform_util.cc
+++ b/test/util/platform_util.cc
@@ -20,10 +20,9 @@ namespace gvisor {
 namespace testing {
 
 PlatformSupport PlatformSupport32Bit() {
-  if (GvisorPlatform() == Platform::kPtrace) {
+  if (GvisorPlatform() == Platform::kPtrace ||
+      GvisorPlatform() == Platform::kKVM) {
     return PlatformSupport::NotSupported;
-  } else if (GvisorPlatform() == Platform::kKVM) {
-    return PlatformSupport::Segfault;
   } else {
     return PlatformSupport::Allowed;
   }
-- 
cgit v1.2.3


From bc3a24d62788bd6b881afeda230f1a5550a5709a Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 31 Jan 2020 11:49:37 -0800
Subject: Internal change.

PiperOrigin-RevId: 292587459
---
 kokoro/kythe/generate_xrefs.sh |  2 +-
 pkg/sentry/fs/g3doc/inotify.md | 16 ++++++++--------
 pkg/sentry/mm/README.md        |  8 ++++----
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
index 7a0fbb3cd..323b0f77b 100644
--- a/kokoro/kythe/generate_xrefs.sh
+++ b/kokoro/kythe/generate_xrefs.sh
@@ -23,7 +23,7 @@ bazel version
 
 python3 -V
 
-readonly KYTHE_VERSION='v0.0.39'
+readonly KYTHE_VERSION='v0.0.41'
 readonly WORKDIR="$(mktemp -d)"
 readonly KYTHE_DIR="${WORKDIR}/kythe-${KYTHE_VERSION}"
 if [[ -n "$KOKORO_GIT_COMMIT" ]]; then
diff --git a/pkg/sentry/fs/g3doc/inotify.md b/pkg/sentry/fs/g3doc/inotify.md
index 71a577d9d..85063d4e6 100644
--- a/pkg/sentry/fs/g3doc/inotify.md
+++ b/pkg/sentry/fs/g3doc/inotify.md
@@ -112,11 +112,11 @@ attempts to queue a new event, it is already holding `fs.Watches.mu`. If we used
 `Inotify.mu` to also protect the event queue, this would violate the above lock
 ordering.
 
-[dirent]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/dirent.go
-[event]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_event.go
-[fd_table]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_table.go
-[inode]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode.go
-[inode_watches]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode_inotify.go
-[inotify]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify.go
-[syscall_dir]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/syscalls/linux/
-[watch]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_watch.go
+[dirent]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/dirent.go
+[event]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inotify_event.go
+[fd_table]: https://github.com/google/gvisor/blob/master/pkg/sentry/kernel/fd_table.go
+[inode]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inode.go
+[inode_watches]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inode_inotify.go
+[inotify]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inotify.go
+[syscall_dir]: https://github.com/google/gvisor/blob/master/pkg/sentry/syscalls/linux/
+[watch]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inotify_watch.go
diff --git a/pkg/sentry/mm/README.md b/pkg/sentry/mm/README.md
index e1322e373..f4d43d927 100644
--- a/pkg/sentry/mm/README.md
+++ b/pkg/sentry/mm/README.md
@@ -274,7 +274,7 @@ In the sentry:
     methods
     [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform].
 
-[memmap]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/memmap/memmap.go
-[mm]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/mm/mm.go
-[pgalloc]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/pgalloc/pgalloc.go
-[platform]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/platform/platform.go
+[memmap]: https://github.com/google/gvisor/blob/master/pkg/sentry/memmap/memmap.go
+[mm]: https://github.com/google/gvisor/blob/master/pkg/sentry/mm/mm.go
+[pgalloc]: https://github.com/google/gvisor/blob/master/pkg/sentry/pgalloc/pgalloc.go
+[platform]: https://github.com/google/gvisor/blob/master/pkg/sentry/platform/platform.go
-- 
cgit v1.2.3


From 528dd1ec72fee1dd63c734fe92d1b972b5735b8f Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 31 Jan 2020 13:24:48 -0800
Subject: Extract multicast IP to Ethernet address mapping

Test: header.TestEthernetAddressFromMulticastIPAddress
PiperOrigin-RevId: 292604649
---
 pkg/tcpip/header/eth.go        | 41 +++++++++++++++++++++++++++++++++++++++++
 pkg/tcpip/header/eth_test.go   | 34 ++++++++++++++++++++++++++++++++++
 pkg/tcpip/network/arp/arp.go   | 19 ++-----------------
 pkg/tcpip/network/ipv6/icmp.go | 20 ++------------------
 4 files changed, 79 insertions(+), 35 deletions(-)

diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go
index f5d2c127f..b1e92d2d7 100644
--- a/pkg/tcpip/header/eth.go
+++ b/pkg/tcpip/header/eth.go
@@ -134,3 +134,44 @@ func IsValidUnicastEthernetAddress(addr tcpip.LinkAddress) bool {
 	// addr is a valid unicast ethernet address.
 	return true
 }
+
+// EthernetAddressFromMulticastIPv4Address returns a multicast Ethernet address
+// for a multicast IPv4 address.
+//
+// addr MUST be a multicast IPv4 address.
+func EthernetAddressFromMulticastIPv4Address(addr tcpip.Address) tcpip.LinkAddress {
+	var linkAddrBytes [EthernetAddressSize]byte
+	// RFC 1112 Host Extensions for IP Multicasting
+	//
+	// 6.4. Extensions to an Ethernet Local Network Module:
+	//
+	// An IP host group address is mapped to an Ethernet multicast
+	// address by placing the low-order 23-bits of the IP address
+	// into the low-order 23 bits of the Ethernet multicast address
+	// 01-00-5E-00-00-00 (hex).
+	linkAddrBytes[0] = 0x1
+	linkAddrBytes[2] = 0x5e
+	linkAddrBytes[3] = addr[1] & 0x7F
+	copy(linkAddrBytes[4:], addr[IPv4AddressSize-2:])
+	return tcpip.LinkAddress(linkAddrBytes[:])
+}
+
+// EthernetAddressFromMulticastIPv6Address returns a multicast Ethernet address
+// for a multicast IPv6 address.
+//
+// addr MUST be a multicast IPv6 address.
+func EthernetAddressFromMulticastIPv6Address(addr tcpip.Address) tcpip.LinkAddress {
+	// RFC 2464 Transmission of IPv6 Packets over Ethernet Networks
+	//
+	// 7. Address Mapping -- Multicast
+	//
+	// An IPv6 packet with a multicast destination address DST,
+	// consisting of the sixteen octets DST[1] through DST[16], is
+	// transmitted to the Ethernet multicast address whose first
+	// two octets are the value 3333 hexadecimal and whose last
+	// four octets are the last four octets of DST.
+	linkAddrBytes := []byte(addr[IPv6AddressSize-EthernetAddressSize:])
+	linkAddrBytes[0] = 0x33
+	linkAddrBytes[1] = 0x33
+	return tcpip.LinkAddress(linkAddrBytes[:])
+}
diff --git a/pkg/tcpip/header/eth_test.go b/pkg/tcpip/header/eth_test.go
index 6634c90f5..7a0014ad9 100644
--- a/pkg/tcpip/header/eth_test.go
+++ b/pkg/tcpip/header/eth_test.go
@@ -66,3 +66,37 @@ func TestIsValidUnicastEthernetAddress(t *testing.T) {
 		})
 	}
 }
+
+func TestEthernetAddressFromMulticastIPv4Address(t *testing.T) {
+	tests := []struct {
+		name             string
+		addr             tcpip.Address
+		expectedLinkAddr tcpip.LinkAddress
+	}{
+		{
+			name:             "IPv4 Multicast without 24th bit set",
+			addr:             "\xe0\x7e\xdc\xba",
+			expectedLinkAddr: "\x01\x00\x5e\x7e\xdc\xba",
+		},
+		{
+			name:             "IPv4 Multicast with 24th bit set",
+			addr:             "\xe0\xfe\xdc\xba",
+			expectedLinkAddr: "\x01\x00\x5e\x7e\xdc\xba",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if got := EthernetAddressFromMulticastIPv4Address(test.addr); got != test.expectedLinkAddr {
+				t.Fatalf("got EthernetAddressFromMulticastIPv4Address(%s) = %s, want = %s", got, test.expectedLinkAddr)
+			}
+		})
+	}
+}
+
+func TestEthernetAddressFromMulticastIPv6Address(t *testing.T) {
+	addr := tcpip.Address("\xff\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x1a")
+	if got, want := EthernetAddressFromMulticastIPv6Address(addr), tcpip.LinkAddress("\x33\x33\x0d\x0e\x0f\x1a"); got != want {
+		t.Fatalf("got EthernetAddressFromMulticastIPv6Address(%s) = %s, want = %s", addr, got, want)
+	}
+}
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 1ceaebfbd..4da13c5df 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -178,24 +178,9 @@ func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bo
 		return broadcastMAC, true
 	}
 	if header.IsV4MulticastAddress(addr) {
-		// RFC 1112 Host Extensions for IP Multicasting
-		//
-		// 6.4. Extensions to an Ethernet Local Network Module:
-		//
-		// An IP host group address is mapped to an Ethernet multicast
-		// address by placing the low-order 23-bits of the IP address
-		// into the low-order 23 bits of the Ethernet multicast address
-		// 01-00-5E-00-00-00 (hex).
-		return tcpip.LinkAddress([]byte{
-			0x01,
-			0x00,
-			0x5e,
-			addr[header.IPv4AddressSize-3] & 0x7f,
-			addr[header.IPv4AddressSize-2],
-			addr[header.IPv4AddressSize-1],
-		}), true
+		return header.EthernetAddressFromMulticastIPv4Address(addr), true
 	}
-	return "", false
+	return tcpip.LinkAddress([]byte(nil)), false
 }
 
 // SetOption implements NetworkProtocol.
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index dc20c0fd7..7491cfc41 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -441,23 +441,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 // ResolveStaticAddress implements stack.LinkAddressResolver.
 func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
 	if header.IsV6MulticastAddress(addr) {
-		// RFC 2464 Transmission of IPv6 Packets over Ethernet Networks
-		//
-		// 7. Address Mapping -- Multicast
-		//
-		// An IPv6 packet with a multicast destination address DST,
-		// consisting of the sixteen octets DST[1] through DST[16], is
-		// transmitted to the Ethernet multicast address whose first
-		// two octets are the value 3333 hexadecimal and whose last
-		// four octets are the last four octets of DST.
-		return tcpip.LinkAddress([]byte{
-			0x33,
-			0x33,
-			addr[header.IPv6AddressSize-4],
-			addr[header.IPv6AddressSize-3],
-			addr[header.IPv6AddressSize-2],
-			addr[header.IPv6AddressSize-1],
-		}), true
+		return header.EthernetAddressFromMulticastIPv6Address(addr), true
 	}
-	return "", false
+	return tcpip.LinkAddress([]byte(nil)), false
 }
-- 
cgit v1.2.3


From eba7bdc24d31388ca81eeab251ed2db108f785dc Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 31 Jan 2020 13:46:13 -0800
Subject: iptables: enable TCP matching with "-m tcp".

A couple other things that changed:

- There's a proper extension registration system for matchers. Anyone
  adding another matcher can use tcp_matcher.go or udp_matcher.go as a
  template.
- All logging and use of syserr.Error in the netfilter package happens at the
  highest possible level (public functions). Lower-level functions just
  return normal, descriptive golang errors.
---
 pkg/abi/linux/netfilter.go                 |  52 ++++++++
 pkg/sentry/socket/netfilter/BUILD          |   4 +
 pkg/sentry/socket/netfilter/extensions.go  |  98 +++++++++++++++
 pkg/sentry/socket/netfilter/netfilter.go   | 187 ++++++++---------------------
 pkg/sentry/socket/netfilter/tcp_matcher.go | 143 ++++++++++++++++++++++
 pkg/sentry/socket/netfilter/udp_matcher.go | 142 ++++++++++++++++++++++
 pkg/tcpip/iptables/BUILD                   |   1 -
 pkg/tcpip/iptables/types.go                |   3 +
 pkg/tcpip/iptables/udp_matcher.go          | 113 -----------------
 9 files changed, 495 insertions(+), 248 deletions(-)
 create mode 100644 pkg/sentry/socket/netfilter/extensions.go
 create mode 100644 pkg/sentry/socket/netfilter/tcp_matcher.go
 create mode 100644 pkg/sentry/socket/netfilter/udp_matcher.go
 delete mode 100644 pkg/tcpip/iptables/udp_matcher.go

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index 8e40bcc62..e4aabb6bb 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -348,6 +348,58 @@ func goString(cstring []byte) string {
 	return string(cstring)
 }
 
+// XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp
+// in include/uapi/linux/netfilter/xt_tcpudp.h.
+type XTTCP struct {
+	// SourcePortStart specifies the inclusive start of the range of source
+	// ports to which the matcher applies.
+	SourcePortStart uint16
+
+	// SourcePortEnd specifies the inclusive end of the range of source ports
+	// to which the matcher applies.
+	SourcePortEnd uint16
+
+	// DestinationPortStart specifies the start of the destination port
+	// range to which the matcher applies.
+	DestinationPortStart uint16
+
+	// DestinationPortEnd specifies the start of the destination port
+	// range to which the matcher applies.
+	DestinationPortEnd uint16
+
+	// Option specifies that a particular TCP option must be set.
+	Option uint8
+
+	// FlagMask masks the FlagCompare byte when comparing to the TCP flag
+	// fields.
+	FlagMask uint8
+
+	// FlagCompare is binary and-ed with the TCP flag fields.
+	FlagCompare uint8
+
+	// InverseFlags flips the meaning of certain fields. See the
+	// TX_TCP_INV_* flags.
+	InverseFlags uint8
+}
+
+// SizeOfXTTCP is the size of an XTTCP.
+const SizeOfXTTCP = 12
+
+// Flags in XTTCP.InverseFlags. Corresponding constants are in
+// include/uapi/linux/netfilter/xt_tcpudp.h.
+const (
+	// Invert the meaning of SourcePortStart/End.
+	XT_TCP_INV_SRCPT = 0x01
+	// Invert the meaning of DestinationPortStart/End.
+	XT_TCP_INV_DSTPT = 0x02
+	// Invert the meaning of FlagCompare.
+	XT_TCP_INV_FLAGS = 0x04
+	// Invert the meaning of Option.
+	XT_TCP_INV_OPTION = 0x08
+	// Enable all flags.
+	XT_TCP_INV_MASK = 0x0F
+)
+
 // XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp
 // in include/uapi/linux/netfilter/xt_tcpudp.h.
 type XTUDP struct {
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index fa2a2cb66..c91ec7494 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -5,7 +5,10 @@ package(licenses = ["notice"])
 go_library(
     name = "netfilter",
     srcs = [
+        "extensions.go",
         "netfilter.go",
+        "tcp_matcher.go",
+        "udp_matcher.go",
     ],
     # This target depends on netstack and should only be used by epsocket,
     # which is allowed to depend on netstack.
@@ -17,6 +20,7 @@ go_library(
         "//pkg/sentry/kernel",
         "//pkg/syserr",
         "//pkg/tcpip",
+        "//pkg/tcpip/header",
         "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/usermem",
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
new file mode 100644
index 000000000..5a4cac84c
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -0,0 +1,98 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// TODO(gvisor.dev/issue/170): The following per-matcher params should be
+// supported:
+// - Table name
+// - Match size
+// - User size
+// - Hooks
+// - Proto
+// - Family
+
+// matchMarshaler knows how to (un)marshal the matcher named name().
+type matchMarshaler interface {
+	// name is the matcher name as stored in the xt_entry_match struct.
+	name() string
+
+	// marshal converts from an iptables.Matcher to an ABI struct.
+	marshal(matcher iptables.Matcher) []byte
+
+	// unmarshal converts from the ABI matcher struct to an
+	// iptables.Matcher.
+	unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error)
+}
+
+var matchMarshalers = map[string]matchMarshaler{}
+
+// registerMatchMarshaler should be called by match extensions to register them
+// with the netfilter package.
+func registerMatchMarshaler(mm matchMarshaler) {
+	if _, ok := matchMarshalers[mm.name()]; ok {
+		panic(fmt.Sprintf("Multiple matches registered with name %q.", mm.name()))
+	}
+	matchMarshalers[mm.name()] = mm
+}
+
+func marshalMatcher(matcher iptables.Matcher) []byte {
+	matchMaker, ok := matchMarshalers[matcher.Name()]
+	if !ok {
+		panic(fmt.Errorf("Unknown matcher of type %T.", matcher))
+	}
+	return matchMaker.marshal(matcher)
+}
+
+// marshalEntryMatch creates a marshalled XTEntryMatch with the given name and
+// data appended at the end.
+func marshalEntryMatch(name string, data []byte) []byte {
+	nflog("marshaling matcher %q", name)
+
+	// We have to pad this struct size to a multiple of 8 bytes.
+	size := alignUp(linux.SizeOfXTEntryMatch+len(data), 8)
+	matcher := linux.KernelXTEntryMatch{
+		XTEntryMatch: linux.XTEntryMatch{
+			MatchSize: uint16(size),
+		},
+		Data: data,
+	}
+	copy(matcher.Name[:], name)
+
+	buf := make([]byte, 0, size)
+	buf = binary.Marshal(buf, usermem.ByteOrder, matcher)
+	return append(buf, make([]byte, size-len(buf))...)
+}
+
+func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter, buf []byte) (iptables.Matcher, error) {
+	matchMaker, ok := matchMarshalers[match.Name.String()]
+	if !ok {
+		return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String())
+	}
+	return matchMaker.unmarshal(buf, filter)
+}
+
+// alignUp rounds a length up to an alignment. align must be a power of 2.
+func alignUp(length int, align uint) int {
+	return (length + int(align) - 1) & ^(int(align) - 1)
+}
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3dda6c7a1..8f14643b0 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -17,6 +17,7 @@
 package netfilter
 
 import (
+	"errors"
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -34,10 +35,6 @@ import (
 // shouldn't be reached - an error has occurred if we fall through to one.
 const errorTargetName = "ERROR"
 
-const (
-	matcherNameUDP = "udp"
-)
-
 // Metadata is used to verify that we are correctly serializing and
 // deserializing iptables into structs consumable by the iptables tool. We save
 // a metadata struct when the tables are written, and when they are read out we
@@ -68,7 +65,8 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 	// Find the appropriate table.
 	table, err := findTable(stack, info.Name)
 	if err != nil {
-		return linux.IPTGetinfo{}, err
+		nflog("%v", err)
+		return linux.IPTGetinfo{}, syserr.ErrInvalidArgument
 	}
 
 	// Get the hooks that apply to this table.
@@ -95,39 +93,40 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	// Read in the struct and table name.
 	var userEntries linux.IPTGetEntries
 	if _, err := t.CopyIn(outPtr, &userEntries); err != nil {
-		log.Warningf("netfilter: couldn't copy in entries %q", userEntries.Name)
+		nflog("couldn't copy in entries %q", userEntries.Name)
 		return linux.KernelIPTGetEntries{}, syserr.FromError(err)
 	}
 
 	// Find the appropriate table.
 	table, err := findTable(stack, userEntries.Name)
 	if err != nil {
-		log.Warningf("netfilter: couldn't find table %q", userEntries.Name)
-		return linux.KernelIPTGetEntries{}, err
+		nflog("%v", err)
+		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
 	}
 
 	// Convert netstack's iptables rules to something that the iptables
 	// tool can understand.
 	entries, meta, err := convertNetstackToBinary(userEntries.Name.String(), table)
 	if err != nil {
-		return linux.KernelIPTGetEntries{}, err
+		nflog("couldn't read entries: %v", err)
+		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
 	}
 	if meta != table.Metadata().(metadata) {
 		panic(fmt.Sprintf("Table %q metadata changed between writing and reading. Was saved as %+v, but is now %+v", userEntries.Name.String(), table.Metadata().(metadata), meta))
 	}
 	if binary.Size(entries) > uintptr(outLen) {
-		log.Warningf("Insufficient GetEntries output size: %d", uintptr(outLen))
+		nflog("insufficient GetEntries output size: %d", uintptr(outLen))
 		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
 	}
 
 	return entries, nil
 }
 
-func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, *syserr.Error) {
+func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, error) {
 	ipt := stack.IPTables()
 	table, ok := ipt.Tables[tablename.String()]
 	if !ok {
-		return iptables.Table{}, syserr.ErrInvalidArgument
+		return iptables.Table{}, fmt.Errorf("couldn't find table %q", tablename)
 	}
 	return table, nil
 }
@@ -151,19 +150,19 @@ func FillDefaultIPTables(stack *stack.Stack) {
 	stack.SetIPTables(ipt)
 }
 
+// TODO: Return proto.
 // convertNetstackToBinary converts the iptables as stored in netstack to the
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
 // jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) {
+func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, error) {
 	// Return values.
 	var entries linux.KernelIPTGetEntries
 	var meta metadata
 
 	// The table name has to fit in the struct.
 	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
-		log.Warningf("Table name %q too long.", tablename)
-		return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
+		return linux.KernelIPTGetEntries{}, metadata{}, fmt.Errorf("Table name %q too long.", tablename)
 	}
 	copy(entries.Name[:], tablename)
 
@@ -229,46 +228,6 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	return entries, meta, nil
 }
 
-func marshalMatcher(matcher iptables.Matcher) []byte {
-	switch m := matcher.(type) {
-	case *iptables.UDPMatcher:
-		return marshalUDPMatcher(m)
-	default:
-		// TODO(gvisor.dev/issue/170): Support other matchers.
-		panic(fmt.Errorf("unknown matcher of type %T", matcher))
-	}
-}
-
-func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
-	nflog("convert to binary: marshalling UDP matcher: %+v", matcher)
-
-	// We have to pad this struct size to a multiple of 8 bytes.
-	size := alignUp(linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP, 8)
-
-	linuxMatcher := linux.KernelXTEntryMatch{
-		XTEntryMatch: linux.XTEntryMatch{
-			MatchSize: uint16(size),
-		},
-		Data: make([]byte, 0, linux.SizeOfXTUDP),
-	}
-	copy(linuxMatcher.Name[:], matcherNameUDP)
-
-	xtudp := linux.XTUDP{
-		SourcePortStart:      matcher.Data.SourcePortStart,
-		SourcePortEnd:        matcher.Data.SourcePortEnd,
-		DestinationPortStart: matcher.Data.DestinationPortStart,
-		DestinationPortEnd:   matcher.Data.DestinationPortEnd,
-		InverseFlags:         matcher.Data.InverseFlags,
-	}
-	linuxMatcher.Data = binary.Marshal(linuxMatcher.Data, usermem.ByteOrder, xtudp)
-
-	buf := make([]byte, 0, size)
-	buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher)
-	buf = append(buf, make([]byte, size-len(buf))...)
-	nflog("convert to binary: marshalled UDP matcher into %v", buf)
-	return buf[:]
-}
-
 func marshalTarget(target iptables.Target) []byte {
 	switch target.(type) {
 	case iptables.UnconditionalAcceptTarget:
@@ -332,7 +291,7 @@ func translateFromStandardVerdict(verdict iptables.Verdict) int32 {
 
 // translateToStandardVerdict translates from the value in a
 // linux.XTStandardTarget to an iptables.Verdict.
-func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
+func translateToStandardVerdict(val int32) (iptables.Verdict, error) {
 	// TODO(gvisor.dev/issue/170): Support other verdicts.
 	switch val {
 	case -linux.NF_ACCEPT - 1:
@@ -340,13 +299,12 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
 	case -linux.NF_DROP - 1:
 		return iptables.Drop, nil
 	case -linux.NF_QUEUE - 1:
-		log.Warningf("Unsupported iptables verdict QUEUE.")
+		return iptables.Invalid, errors.New("unsupported iptables verdict QUEUE")
 	case linux.NF_RETURN:
-		log.Warningf("Unsupported iptables verdict RETURN.")
+		return iptables.Invalid, errors.New("unsupported iptables verdict RETURN")
 	default:
-		log.Warningf("Unknown iptables verdict %d.", val)
+		return iptables.Invalid, fmt.Errorf("unknown iptables verdict %d.", val)
 	}
-	return iptables.Invalid, syserr.ErrInvalidArgument
 }
 
 // SetEntries sets iptables rules for a single table. See
@@ -354,7 +312,7 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
 func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
-		log.Warningf("netfilter.SetEntries: optVal has insufficient size for replace %d", len(optVal))
+		nflog("optVal has insufficient size for replace %d", len(optVal))
 		return syserr.ErrInvalidArgument
 	}
 	var replace linux.IPTReplace
@@ -368,7 +326,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	case iptables.TablenameFilter:
 		table = iptables.EmptyFilterTable()
 	default:
-		log.Warningf("We don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
+		nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
 		return syserr.ErrInvalidArgument
 	}
 
@@ -382,7 +340,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 		// Get the struct ipt_entry.
 		if len(optVal) < linux.SizeOfIPTEntry {
-			log.Warningf("netfilter: optVal has insufficient size for entry %d", len(optVal))
+			nflog("optVal has insufficient size for entry %d", len(optVal))
 			return syserr.ErrInvalidArgument
 		}
 		var entry linux.IPTEntry
@@ -392,7 +350,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		optVal = optVal[linux.SizeOfIPTEntry:]
 
 		if entry.TargetOffset < linux.SizeOfIPTEntry {
-			log.Warningf("netfilter: entry has too-small target offset %d", entry.TargetOffset)
+			nflog("entry has too-small target offset %d", entry.TargetOffset)
 			return syserr.ErrInvalidArgument
 		}
 
@@ -400,7 +358,8 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		// filtering fields.
 		filter, err := filterFromIPTIP(entry.IP)
 		if err != nil {
-			return err
+			nflog("bad iptip: %v", err)
+			return syserr.ErrInvalidArgument
 		}
 
 		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
@@ -408,25 +367,26 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		// Get matchers.
 		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
 		if len(optVal) < int(matchersSize) {
-			log.Warningf("netfilter: entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+			nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
 			return syserr.ErrInvalidArgument
 		}
 		matchers, err := parseMatchers(filter, optVal[:matchersSize])
 		if err != nil {
-			log.Warningf("netfilter: failed to parse matchers: %v", err)
-			return err
+			nflog("failed to parse matchers: %v", err)
+			return syserr.ErrInvalidArgument
 		}
 		optVal = optVal[matchersSize:]
 
 		// Get the target of the rule.
 		targetSize := entry.NextOffset - entry.TargetOffset
 		if len(optVal) < int(targetSize) {
-			log.Warningf("netfilter: entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+			nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
 			return syserr.ErrInvalidArgument
 		}
 		target, err := parseTarget(optVal[:targetSize])
 		if err != nil {
-			return err
+			nflog("failed to parse target: %v", err)
+			return syserr.ErrInvalidArgument
 		}
 		optVal = optVal[targetSize:]
 
@@ -439,7 +399,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		offset += uint32(entry.NextOffset)
 
 		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
-			log.Warningf("netfilter: entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
 		}
 	}
 
@@ -457,11 +417,11 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 				}
 			}
 			if ruleIdx := table.BuiltinChains[hk]; ruleIdx == iptables.HookUnset {
-				log.Warningf("Hook %v is unset.", hk)
+				nflog("hook %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
 			if ruleIdx := table.Underflows[hk]; ruleIdx == iptables.HookUnset {
-				log.Warningf("Underflow %v is unset.", hk)
+				nflog("underflow %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
 		}
@@ -473,7 +433,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	for hook, ruleIdx := range table.BuiltinChains {
 		if hook != iptables.Input {
 			if _, ok := table.Rules[ruleIdx].Target.(iptables.UnconditionalAcceptTarget); !ok {
-				log.Warningf("Hook %d is unsupported.", hook)
+				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
 			}
 		}
@@ -499,7 +459,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
 // only the matchers.
-func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) {
+func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, error) {
 	nflog("set entries: parsing matchers of size %d", len(optVal))
 	var matchers []iptables.Matcher
 	for len(optVal) > 0 {
@@ -507,8 +467,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 		// Get the XTEntryMatch.
 		if len(optVal) < linux.SizeOfXTEntryMatch {
-			log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal))
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("optVal has insufficient size for entry match: %d", len(optVal))
 		}
 		var match linux.XTEntryMatch
 		buf := optVal[:linux.SizeOfXTEntryMatch]
@@ -517,45 +476,18 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 		// Check some invariants.
 		if match.MatchSize < linux.SizeOfXTEntryMatch {
-			log.Warningf("netfilter: match size is too small, must be at least %d", linux.SizeOfXTEntryMatch)
-			return nil, syserr.ErrInvalidArgument
+
+			return nil, fmt.Errorf("match size is too small, must be at least %d", linux.SizeOfXTEntryMatch)
 		}
 		if len(optVal) < int(match.MatchSize) {
-			log.Warningf("netfilter: optVal has insufficient size for match: %d", len(optVal))
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("optVal has insufficient size for match: %d", len(optVal))
 		}
 
-		buf = optVal[linux.SizeOfXTEntryMatch:match.MatchSize]
-		var matcher iptables.Matcher
-		var err error
-		switch match.Name.String() {
-		case matcherNameUDP:
-			if len(buf) < linux.SizeOfXTUDP {
-				log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal))
-				return nil, syserr.ErrInvalidArgument
-			}
-			// For alignment reasons, the match's total size may
-			// exceed what's strictly necessary to hold matchData.
-			var matchData linux.XTUDP
-			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
-			log.Infof("parseMatchers: parsed XTUDP: %+v", matchData)
-			matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherParams{
-				SourcePortStart:      matchData.SourcePortStart,
-				SourcePortEnd:        matchData.SourcePortEnd,
-				DestinationPortStart: matchData.DestinationPortStart,
-				DestinationPortEnd:   matchData.DestinationPortEnd,
-				InverseFlags:         matchData.InverseFlags,
-			})
-			if err != nil {
-				log.Warningf("netfilter: failed to create UDP matcher: %v", err)
-				return nil, syserr.ErrInvalidArgument
-			}
-
-		default:
-			log.Warningf("netfilter: unsupported matcher with name %q", match.Name.String())
-			return nil, syserr.ErrInvalidArgument
+		// Parse the specific matcher.
+		matcher, err := unmarshalMatcher(match, filter, optVal[linux.SizeOfXTEntryMatch:match.MatchSize])
+		if err != nil {
+			return nil, fmt.Errorf("failed to create matcher: %v", err)
 		}
-
 		matchers = append(matchers, matcher)
 
 		// TODO(gvisor.dev/issue/170): Check the revision field.
@@ -563,8 +495,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 	}
 
 	if len(optVal) != 0 {
-		log.Warningf("netfilter: optVal should be exhausted after parsing matchers")
-		return nil, syserr.ErrInvalidArgument
+		return nil, errors.New("optVal should be exhausted after parsing matchers")
 	}
 
 	return matchers, nil
@@ -572,11 +503,10 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
-func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
+func parseTarget(optVal []byte) (iptables.Target, error) {
 	nflog("set entries: parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
-		log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
-		return nil, syserr.ErrInvalidArgument
+		return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal))
 	}
 	var target linux.XTEntryTarget
 	buf := optVal[:linux.SizeOfXTEntryTarget]
@@ -585,8 +515,7 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 	case "":
 		// Standard target.
 		if len(optVal) != linux.SizeOfXTStandardTarget {
-			log.Warningf("netfilter.SetEntries: optVal has wrong size for standard target %d", len(optVal))
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("optVal has wrong size for standard target %d", len(optVal))
 		}
 		var standardTarget linux.XTStandardTarget
 		buf = optVal[:linux.SizeOfXTStandardTarget]
@@ -602,15 +531,13 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 		case iptables.Drop:
 			return iptables.UnconditionalDropTarget{}, nil
 		default:
-			log.Warningf("Unknown verdict: %v", verdict)
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("Unknown verdict: %v", verdict)
 		}
 
 	case errorTargetName:
 		// Error target.
 		if len(optVal) != linux.SizeOfXTErrorTarget {
-			log.Infof("netfilter.SetEntries: optVal has insufficient size for error target %d", len(optVal))
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("optVal has insufficient size for error target %d", len(optVal))
 		}
 		var errorTarget linux.XTErrorTarget
 		buf = optVal[:linux.SizeOfXTErrorTarget]
@@ -627,20 +554,17 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 		case errorTargetName:
 			return iptables.ErrorTarget{}, nil
 		default:
-			log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
 		}
 	}
 
 	// Unknown target.
-	log.Infof("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
-	return nil, syserr.ErrInvalidArgument
+	return nil, fmt.Errorf("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
 }
 
-func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, *syserr.Error) {
+func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, error) {
 	if containsUnsupportedFields(iptip) {
-		log.Warningf("netfilter: unsupported fields in struct iptip: %+v", iptip)
-		return iptables.IPHeaderFilter{}, syserr.ErrInvalidArgument
+		return iptables.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
 	}
 	return iptables.IPHeaderFilter{
 		Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
@@ -678,8 +602,3 @@ func hookFromLinux(hook int) iptables.Hook {
 	}
 	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
 }
-
-// alignUp rounds a length up to an alignment. align must be a power of 2.
-func alignUp(length int, align uint) int {
-	return (length + int(align) - 1) & ^(int(align) - 1)
-}
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
new file mode 100644
index 000000000..1646d22f7
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -0,0 +1,143 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const matcherNameTCP = "tcp"
+
+func init() {
+	registerMatchMarshaler(tcpMarshaler{})
+}
+
+// tcpMarshaler implements matchMarshaler for TCP matching.
+type tcpMarshaler struct{}
+
+// name implements matchMarshaler.name.
+func (tcpMarshaler) name() string {
+	return matcherNameTCP
+}
+
+// marshal implements matchMarshaler.marshal.
+func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
+	matcher := mr.(*TCPMatcher)
+	xttcp := linux.XTTCP{
+		SourcePortStart:      matcher.sourcePortStart,
+		SourcePortEnd:        matcher.sourcePortEnd,
+		DestinationPortStart: matcher.destinationPortStart,
+		DestinationPortEnd:   matcher.destinationPortEnd,
+	}
+	buf := make([]byte, 0, linux.SizeOfXTUDP)
+	return marshalEntryMatch(matcherNameTCP, binary.Marshal(buf, usermem.ByteOrder, xttcp))
+}
+
+// unmarshal implements matchMarshaler.unmarshal.
+func (tcpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
+	if len(buf) < linux.SizeOfXTTCP {
+		return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf))
+	}
+
+	// For alignment reasons, the match's total size may
+	// exceed what's strictly necessary to hold matchData.
+	var matchData linux.XTTCP
+	binary.Unmarshal(buf[:linux.SizeOfXTTCP], usermem.ByteOrder, &matchData)
+	nflog("parseMatchers: parsed XTTCP: %+v", matchData)
+
+	if matchData.Option != 0 ||
+		matchData.FlagMask != 0 ||
+		matchData.FlagCompare != 0 ||
+		matchData.InverseFlags != 0 {
+		return nil, fmt.Errorf("unsupported TCP matcher flags set")
+	}
+
+	if filter.Protocol != header.TCPProtocolNumber {
+		return nil, fmt.Errorf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber)
+	}
+
+	return &TCPMatcher{
+		sourcePortStart:      matchData.SourcePortStart,
+		sourcePortEnd:        matchData.SourcePortEnd,
+		destinationPortStart: matchData.DestinationPortStart,
+		destinationPortEnd:   matchData.DestinationPortEnd,
+	}, nil
+}
+
+// TCPMatcher matches TCP packets and their headers. It implements Matcher.
+type TCPMatcher struct {
+	sourcePortStart      uint16
+	sourcePortEnd        uint16
+	destinationPortStart uint16
+	destinationPortEnd   uint16
+}
+
+// Name implements Matcher.Name.
+func (*TCPMatcher) Name() string {
+	return matcherNameTCP
+}
+
+// Match implements Matcher.Match.
+func (tm *TCPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+	netHeader := header.IPv4(pkt.NetworkHeader)
+
+	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
+		return false, false
+	}
+
+	// We dont't match fragments.
+	if frag := netHeader.FragmentOffset(); frag != 0 {
+		if frag == 1 {
+			return false, true
+		}
+		return false, false
+	}
+
+	// Now we need the transport header. However, this may not have been set
+	// yet.
+	// TODO(gvisor.dev/issue/170): Parsing the transport header should
+	// ultimately be moved into the iptables.Check codepath as matchers are
+	// added.
+	var tcpHeader header.TCP
+	if pkt.TransportHeader != nil {
+		tcpHeader = header.TCP(pkt.TransportHeader)
+	} else {
+		// The TCP header hasn't been parsed yet. We have to do it here.
+		if len(pkt.Data.First()) < header.TCPMinimumSize {
+			// There's no valid TCP header here, so we hotdrop the
+			// packet.
+			return false, true
+		}
+		tcpHeader = header.TCP(pkt.Data.First())
+	}
+
+	// Check whether the source and destination ports are within the
+	// matching range.
+	if sourcePort := tcpHeader.SourcePort(); sourcePort < tm.sourcePortStart || tm.sourcePortEnd < sourcePort {
+		return false, false
+	}
+	if destinationPort := tcpHeader.DestinationPort(); destinationPort < tm.destinationPortStart || tm.destinationPortEnd < destinationPort {
+		return false, false
+	}
+
+	return true, false
+}
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
new file mode 100644
index 000000000..b6e95bbc5
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -0,0 +1,142 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const matcherNameUDP = "udp"
+
+func init() {
+	registerMatchMarshaler(udpMarshaler{})
+}
+
+// udpMarshaler implements matchMarshaler for UDP matching.
+type udpMarshaler struct{}
+
+// name implements matchMarshaler.name.
+func (udpMarshaler) name() string {
+	return matcherNameUDP
+}
+
+// marshal implements matchMarshaler.marshal.
+func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
+	matcher := mr.(*UDPMatcher)
+	xtudp := linux.XTUDP{
+		SourcePortStart:      matcher.sourcePortStart,
+		SourcePortEnd:        matcher.sourcePortEnd,
+		DestinationPortStart: matcher.destinationPortStart,
+		DestinationPortEnd:   matcher.destinationPortEnd,
+	}
+	buf := make([]byte, 0, linux.SizeOfXTUDP)
+	return marshalEntryMatch(matcherNameUDP, binary.Marshal(buf, usermem.ByteOrder, xtudp))
+}
+
+// unmarshal implements matchMarshaler.unmarshal.
+func (udpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
+	if len(buf) < linux.SizeOfXTUDP {
+		return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf))
+	}
+
+	// For alignment reasons, the match's total size may exceed what's
+	// strictly necessary to hold matchData.
+	var matchData linux.XTUDP
+	binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
+	nflog("parseMatchers: parsed XTUDP: %+v", matchData)
+
+	if matchData.InverseFlags != 0 {
+		return nil, fmt.Errorf("unsupported UDP matcher inverse flags set")
+	}
+
+	if filter.Protocol != header.UDPProtocolNumber {
+		return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
+	}
+
+	return &UDPMatcher{
+		sourcePortStart:      matchData.SourcePortStart,
+		sourcePortEnd:        matchData.SourcePortEnd,
+		destinationPortStart: matchData.DestinationPortStart,
+		destinationPortEnd:   matchData.DestinationPortEnd,
+	}, nil
+}
+
+// UDPMatcher matches UDP packets and their headers. It implements Matcher.
+type UDPMatcher struct {
+	sourcePortStart      uint16
+	sourcePortEnd        uint16
+	destinationPortStart uint16
+	destinationPortEnd   uint16
+}
+
+// Name implements Matcher.Name.
+func (*UDPMatcher) Name() string {
+	return matcherNameUDP
+}
+
+// Match implements Matcher.Match.
+func (um *UDPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+	netHeader := header.IPv4(pkt.NetworkHeader)
+
+	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
+	// into the iptables.Check codepath as matchers are added.
+	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
+		return false, false
+	}
+
+	// We dont't match fragments.
+	if frag := netHeader.FragmentOffset(); frag != 0 {
+		if frag == 1 {
+			return false, true
+		}
+		return false, false
+	}
+
+	// Now we need the transport header. However, this may not have been set
+	// yet.
+	// TODO(gvisor.dev/issue/170): Parsing the transport header should
+	// ultimately be moved into the iptables.Check codepath as matchers are
+	// added.
+	var udpHeader header.UDP
+	if pkt.TransportHeader != nil {
+		udpHeader = header.UDP(pkt.TransportHeader)
+	} else {
+		// The UDP header hasn't been parsed yet. We have to do it here.
+		if len(pkt.Data.First()) < header.UDPMinimumSize {
+			// There's no valid UDP header here, so we hotdrop the
+			// packet.
+			return false, true
+		}
+		udpHeader = header.UDP(pkt.Data.First())
+	}
+
+	// Check whether the source and destination ports are within the
+	// matching range.
+	if sourcePort := udpHeader.SourcePort(); sourcePort < um.sourcePortStart || um.sourcePortEnd < sourcePort {
+		return false, false
+	}
+	if destinationPort := udpHeader.DestinationPort(); destinationPort < um.destinationPortStart || um.destinationPortEnd < destinationPort {
+		return false, false
+	}
+
+	return true, false
+}
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index bab26580b..d1b73cfdf 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -8,7 +8,6 @@ go_library(
         "iptables.go",
         "targets.go",
         "types.go",
-        "udp_matcher.go",
     ],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 7f77802a0..d660aab04 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -171,6 +171,9 @@ type IPHeaderFilter struct {
 
 // A Matcher is the interface for matching packets.
 type Matcher interface {
+	// Name returns the name of the Matcher.
+	Name() string
+
 	// Match returns whether the packet matches and whether the packet
 	// should be "hotdropped", i.e. dropped immediately. This is usually
 	// used for suspicious packets.
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
deleted file mode 100644
index 3bb076f9c..000000000
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ /dev/null
@@ -1,113 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package iptables
-
-import (
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-)
-
-// TODO(gvisor.dev/issue/170): The following per-matcher params should be
-// supported:
-// - Table name
-// - Match size
-// - User size
-// - Hooks
-// - Proto
-// - Family
-
-// UDPMatcher matches UDP packets and their headers. It implements Matcher.
-type UDPMatcher struct {
-	Data UDPMatcherParams
-}
-
-// UDPMatcherParams are the parameters used to create a UDPMatcher.
-type UDPMatcherParams struct {
-	SourcePortStart      uint16
-	SourcePortEnd        uint16
-	DestinationPortStart uint16
-	DestinationPortEnd   uint16
-	InverseFlags         uint8
-}
-
-// NewUDPMatcher returns a new instance of UDPMatcher.
-func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherParams) (Matcher, error) {
-	log.Infof("Adding rule with UDPMatcherParams: %+v", data)
-
-	if data.InverseFlags != 0 {
-		return nil, fmt.Errorf("unsupported UDP matcher inverse flags set")
-	}
-
-	if filter.Protocol != header.UDPProtocolNumber {
-		return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
-	}
-
-	return &UDPMatcher{Data: data}, nil
-}
-
-// Match implements Matcher.Match.
-func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
-	netHeader := header.IPv4(pkt.NetworkHeader)
-
-	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
-	// into the iptables.Check codepath as matchers are added.
-	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
-		return false, false
-	}
-
-	// We dont't match fragments.
-	if frag := netHeader.FragmentOffset(); frag != 0 {
-		if frag == 1 {
-			log.Warningf("Dropping UDP packet: malicious fragmented packet.")
-			return false, true
-		}
-		return false, false
-	}
-
-	// Now we need the transport header. However, this may not have been set
-	// yet.
-	// TODO(gvisor.dev/issue/170): Parsing the transport header should
-	// ultimately be moved into the iptables.Check codepath as matchers are
-	// added.
-	var udpHeader header.UDP
-	if pkt.TransportHeader != nil {
-		udpHeader = header.UDP(pkt.TransportHeader)
-	} else {
-		// The UDP header hasn't been parsed yet. We have to do it here.
-		if len(pkt.Data.First()) < header.UDPMinimumSize {
-			// There's no valid UDP header here, so we hotdrop the
-			// packet.
-			log.Warningf("Dropping UDP packet: size too small.")
-			return false, true
-		}
-		udpHeader = header.UDP(pkt.Data.First())
-	}
-
-	// Check whether the source and destination ports are within the
-	// matching range.
-	sourcePort := udpHeader.SourcePort()
-	destinationPort := udpHeader.DestinationPort()
-	if sourcePort < um.Data.SourcePortStart || um.Data.SourcePortEnd < sourcePort {
-		return false, false
-	}
-	if destinationPort < um.Data.DestinationPortStart || um.Data.DestinationPortEnd < destinationPort {
-		return false, false
-	}
-
-	return true, false
-}
-- 
cgit v1.2.3


From 29ad5762e4549d961f48c65292cfdeb7256524f6 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 31 Jan 2020 13:53:58 -0800
Subject: Spelling

---
 pkg/sentry/socket/netfilter/extensions.go  | 18 +++++++++---------
 pkg/sentry/socket/netfilter/tcp_matcher.go | 10 +++++-----
 pkg/sentry/socket/netfilter/udp_matcher.go | 10 +++++-----
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index 5a4cac84c..b5fbb52e4 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -32,8 +32,8 @@ import (
 // - Proto
 // - Family
 
-// matchMarshaler knows how to (un)marshal the matcher named name().
-type matchMarshaler interface {
+// matchMaker knows how to (un)marshal the matcher named name().
+type matchMaker interface {
 	// name is the matcher name as stored in the xt_entry_match struct.
 	name() string
 
@@ -45,19 +45,19 @@ type matchMarshaler interface {
 	unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error)
 }
 
-var matchMarshalers = map[string]matchMarshaler{}
+var matchMakers = map[string]matchMaker{}
 
-// registerMatchMarshaler should be called by match extensions to register them
+// registermatchMaker should be called by match extensions to register them
 // with the netfilter package.
-func registerMatchMarshaler(mm matchMarshaler) {
-	if _, ok := matchMarshalers[mm.name()]; ok {
+func registerMatchMaker(mm matchMaker) {
+	if _, ok := matchMakers[mm.name()]; ok {
 		panic(fmt.Sprintf("Multiple matches registered with name %q.", mm.name()))
 	}
-	matchMarshalers[mm.name()] = mm
+	matchMakers[mm.name()] = mm
 }
 
 func marshalMatcher(matcher iptables.Matcher) []byte {
-	matchMaker, ok := matchMarshalers[matcher.Name()]
+	matchMaker, ok := matchMakers[matcher.Name()]
 	if !ok {
 		panic(fmt.Errorf("Unknown matcher of type %T.", matcher))
 	}
@@ -85,7 +85,7 @@ func marshalEntryMatch(name string, data []byte) []byte {
 }
 
 func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter, buf []byte) (iptables.Matcher, error) {
-	matchMaker, ok := matchMarshalers[match.Name.String()]
+	matchMaker, ok := matchMakers[match.Name.String()]
 	if !ok {
 		return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String())
 	}
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index 1646d22f7..6b2f4c31a 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -28,18 +28,18 @@ import (
 const matcherNameTCP = "tcp"
 
 func init() {
-	registerMatchMarshaler(tcpMarshaler{})
+	registerMatchMaker(tcpMarshaler{})
 }
 
-// tcpMarshaler implements matchMarshaler for TCP matching.
+// tcpMarshaler implements matchMaker for TCP matching.
 type tcpMarshaler struct{}
 
-// name implements matchMarshaler.name.
+// name implements matchMaker.name.
 func (tcpMarshaler) name() string {
 	return matcherNameTCP
 }
 
-// marshal implements matchMarshaler.marshal.
+// marshal implements matchMaker.marshal.
 func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
 	matcher := mr.(*TCPMatcher)
 	xttcp := linux.XTTCP{
@@ -52,7 +52,7 @@ func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
 	return marshalEntryMatch(matcherNameTCP, binary.Marshal(buf, usermem.ByteOrder, xttcp))
 }
 
-// unmarshal implements matchMarshaler.unmarshal.
+// unmarshal implements matchMaker.unmarshal.
 func (tcpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
 	if len(buf) < linux.SizeOfXTTCP {
 		return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf))
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index b6e95bbc5..86aa11696 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -28,18 +28,18 @@ import (
 const matcherNameUDP = "udp"
 
 func init() {
-	registerMatchMarshaler(udpMarshaler{})
+	registerMatchMaker(udpMarshaler{})
 }
 
-// udpMarshaler implements matchMarshaler for UDP matching.
+// udpMarshaler implements matchMaker for UDP matching.
 type udpMarshaler struct{}
 
-// name implements matchMarshaler.name.
+// name implements matchMaker.name.
 func (udpMarshaler) name() string {
 	return matcherNameUDP
 }
 
-// marshal implements matchMarshaler.marshal.
+// marshal implements matchMaker.marshal.
 func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
 	matcher := mr.(*UDPMatcher)
 	xtudp := linux.XTUDP{
@@ -52,7 +52,7 @@ func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
 	return marshalEntryMatch(matcherNameUDP, binary.Marshal(buf, usermem.ByteOrder, xtudp))
 }
 
-// unmarshal implements matchMarshaler.unmarshal.
+// unmarshal implements matchMaker.unmarshal.
 func (udpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
 	if len(buf) < linux.SizeOfXTUDP {
 		return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf))
-- 
cgit v1.2.3


From 77bf586db75b3dbd9dcb14c349bde8372d26425c Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 31 Jan 2020 13:54:57 -0800
Subject: Use multicast Ethernet address for multicast NDP

As per RFC 2464 section 7, an IPv6 packet with a multicast destination
address is transmitted to the mapped Ethernet multicast address.

Test:
- ipv6.TestLinkResolution
- stack_test.TestDADResolve
- stack_test.TestRouterSolicitation
PiperOrigin-RevId: 292610529
---
 pkg/tcpip/header/ipv6_test.go                      | 29 ++++++++++++++++++++++
 pkg/tcpip/link/channel/channel.go                  | 29 ++++++++++++++--------
 pkg/tcpip/network/ipv6/icmp.go                     |  6 ++++-
 pkg/tcpip/network/ipv6/icmp_test.go                | 12 ++++++---
 pkg/tcpip/stack/ndp.go                             | 17 +++++++++++++
 pkg/tcpip/stack/ndp_test.go                        | 16 +++++++++++-
 pkg/tcpip/stack/route.go                           |  4 ++-
 pkg/tcpip/transport/tcp/testing/context/context.go |  6 ++++-
 8 files changed, 101 insertions(+), 18 deletions(-)

diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go
index 29f54bc57..c3ad503aa 100644
--- a/pkg/tcpip/header/ipv6_test.go
+++ b/pkg/tcpip/header/ipv6_test.go
@@ -17,6 +17,7 @@ package header_test
 import (
 	"bytes"
 	"crypto/sha256"
+	"fmt"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -300,3 +301,31 @@ func TestScopeForIPv6Address(t *testing.T) {
 		})
 	}
 }
+
+func TestSolicitedNodeAddr(t *testing.T) {
+	tests := []struct {
+		addr tcpip.Address
+		want tcpip.Address
+	}{
+		{
+			addr: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\xa0",
+			want: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff\x0e\x0f\xa0",
+		},
+		{
+			addr: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\xdd\x0e\x0f\xa0",
+			want: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff\x0e\x0f\xa0",
+		},
+		{
+			addr: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\xdd\x01\x02\x03",
+			want: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff\x01\x02\x03",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(fmt.Sprintf("%s", test.addr), func(t *testing.T) {
+			if got := header.SolicitedNodeAddr(test.addr); got != test.want {
+				t.Fatalf("got header.SolicitedNodeAddr(%s) = %s, want = %s", test.addr, got, test.want)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 71b9da797..78d447acd 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -30,15 +30,16 @@ type PacketInfo struct {
 	Pkt   tcpip.PacketBuffer
 	Proto tcpip.NetworkProtocolNumber
 	GSO   *stack.GSO
+	Route stack.Route
 }
 
 // Endpoint is link layer endpoint that stores outbound packets in a channel
 // and allows injection of inbound packets.
 type Endpoint struct {
-	dispatcher stack.NetworkDispatcher
-	mtu        uint32
-	linkAddr   tcpip.LinkAddress
-	GSO        bool
+	dispatcher         stack.NetworkDispatcher
+	mtu                uint32
+	linkAddr           tcpip.LinkAddress
+	LinkEPCapabilities stack.LinkEndpointCapabilities
 
 	// c is where outbound packets are queued.
 	c chan PacketInfo
@@ -122,11 +123,7 @@ func (e *Endpoint) MTU() uint32 {
 
 // Capabilities implements stack.LinkEndpoint.Capabilities.
 func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
-	caps := stack.LinkEndpointCapabilities(0)
-	if e.GSO {
-		caps |= stack.CapabilityHardwareGSO
-	}
-	return caps
+	return e.LinkEPCapabilities
 }
 
 // GSOMaxSize returns the maximum GSO packet size.
@@ -146,11 +143,16 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
 }
 
 // WritePacket stores outbound packets into the channel.
-func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+	// Clone r then release its resource so we only get the relevant fields from
+	// stack.Route without holding a reference to a NIC's endpoint.
+	route := r.Clone()
+	route.Release()
 	p := PacketInfo{
 		Pkt:   pkt,
 		Proto: protocol,
 		GSO:   gso,
+		Route: route,
 	}
 
 	select {
@@ -162,7 +164,11 @@ func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 }
 
 // WritePackets stores outbound packets into the channel.
-func (e *Endpoint) WritePackets(_ *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	// Clone r then release its resource so we only get the relevant fields from
+	// stack.Route without holding a reference to a NIC's endpoint.
+	route := r.Clone()
+	route.Release()
 	payloadView := pkts[0].Data.ToView()
 	n := 0
 packetLoop:
@@ -176,6 +182,7 @@ packetLoop:
 			},
 			Proto: protocol,
 			GSO:   gso,
+			Route: route,
 		}
 
 		select {
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 7491cfc41..60817d36d 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -408,10 +408,14 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
 // LinkAddressRequest implements stack.LinkAddressResolver.
 func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error {
 	snaddr := header.SolicitedNodeAddr(addr)
+
+	// TODO(b/148672031): Use stack.FindRoute instead of manually creating the
+	// route here. Note, we would need the nicID to do this properly so the right
+	// NIC (associated to linkEP) is used to send the NDP NS message.
 	r := &stack.Route{
 		LocalAddress:      localAddr,
 		RemoteAddress:     snaddr,
-		RemoteLinkAddress: broadcastMAC,
+		RemoteLinkAddress: header.EthernetAddressFromMulticastIPv6Address(snaddr),
 	}
 	hdr := buffer.NewPrependable(int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize)
 	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 7a6820643..d0e930e20 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -270,8 +270,9 @@ func (c *testContext) cleanup() {
 }
 
 type routeArgs struct {
-	src, dst *channel.Endpoint
-	typ      header.ICMPv6Type
+	src, dst       *channel.Endpoint
+	typ            header.ICMPv6Type
+	remoteLinkAddr tcpip.LinkAddress
 }
 
 func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.ICMPv6)) {
@@ -292,6 +293,11 @@ func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.
 		t.Errorf("unexpected protocol number %d", pi.Proto)
 		return
 	}
+
+	if len(args.remoteLinkAddr) != 0 && args.remoteLinkAddr != pi.Route.RemoteLinkAddress {
+		t.Errorf("got remote link address = %s, want = %s", pi.Route.RemoteLinkAddress, args.remoteLinkAddr)
+	}
+
 	ipv6 := header.IPv6(pi.Pkt.Header.View())
 	transProto := tcpip.TransportProtocolNumber(ipv6.NextHeader())
 	if transProto != header.ICMPv6ProtocolNumber {
@@ -339,7 +345,7 @@ func TestLinkResolution(t *testing.T) {
 				t.Fatalf("ep.Write(_) = _, <non-nil>, %s, want = _, <non-nil>, tcpip.ErrNoLinkAddress", err)
 			}
 			for _, args := range []routeArgs{
-				{src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6NeighborSolicit},
+				{src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6NeighborSolicit, remoteLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.SolicitedNodeAddr(lladdr1))},
 				{src: c.linkEP1, dst: c.linkEP0, typ: header.ICMPv6NeighborAdvert},
 			} {
 				routeICMPv6Packet(t, args, func(t *testing.T, icmpv6 header.ICMPv6) {
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 31294345d..6123fda33 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -538,6 +538,14 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error {
 	r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 	defer r.Release()
 
+	// Route should resolve immediately since snmc is a multicast address so a
+	// remote link address can be calculated without a resolution process.
+	if c, err := r.Resolve(nil); err != nil {
+		log.Fatalf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.nic.ID(), err)
+	} else if c != nil {
+		log.Fatalf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.nic.ID())
+	}
+
 	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
 	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
 	pkt.SetType(header.ICMPv6NeighborSolicit)
@@ -1197,6 +1205,15 @@ func (ndp *ndpState) startSolicitingRouters() {
 		r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 		defer r.Release()
 
+		// Route should resolve immediately since
+		// header.IPv6AllRoutersMulticastAddress is a multicast address so a
+		// remote link address can be calculated without a resolution process.
+		if c, err := r.Resolve(nil); err != nil {
+			log.Fatalf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID(), err)
+		} else if c != nil {
+			log.Fatalf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID())
+		}
+
 		payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize
 		hdr := buffer.NewPrependable(header.IPv6MinimumSize + payloadSize)
 		pkt := header.ICMPv6(hdr.Prepend(payloadSize))
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index bc7cfbcb4..8af8565f7 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -336,6 +336,7 @@ func TestDADResolve(t *testing.T) {
 			opts.NDPConfigs.DupAddrDetectTransmits = test.dupAddrDetectTransmits
 
 			e := channel.New(int(test.dupAddrDetectTransmits), 1280, linkAddr1)
+			e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 			s := stack.New(opts)
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -413,6 +414,12 @@ func TestDADResolve(t *testing.T) {
 					t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
 				}
 
+				// Make sure the right remote link address is used.
+				snmc := header.SolicitedNodeAddr(addr1)
+				if want := header.EthernetAddressFromMulticastIPv6Address(snmc); p.Route.RemoteLinkAddress != want {
+					t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want)
+				}
+
 				// Check NDP NS packet.
 				//
 				// As per RFC 4861 section 4.3, a possible option is the Source Link
@@ -420,7 +427,7 @@ func TestDADResolve(t *testing.T) {
 				// address of the packet is the unspecified address.
 				checker.IPv6(t, p.Pkt.Header.View().ToVectorisedView().First(),
 					checker.SrcAddr(header.IPv6Any),
-					checker.DstAddr(header.SolicitedNodeAddr(addr1)),
+					checker.DstAddr(snmc),
 					checker.TTL(header.NDPHopLimit),
 					checker.NDPNS(
 						checker.NDPNSTargetAddress(addr1),
@@ -3292,6 +3299,7 @@ func TestRouterSolicitation(t *testing.T) {
 			t.Run(test.name, func(t *testing.T) {
 				t.Parallel()
 				e := channel.New(int(test.maxRtrSolicit), 1280, linkAddr1)
+				e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 				waitForPkt := func(timeout time.Duration) {
 					t.Helper()
 					ctx, _ := context.WithTimeout(context.Background(), timeout)
@@ -3304,6 +3312,12 @@ func TestRouterSolicitation(t *testing.T) {
 					if p.Proto != header.IPv6ProtocolNumber {
 						t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
 					}
+
+					// Make sure the right remote link address is used.
+					if want := header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersMulticastAddress); p.Route.RemoteLinkAddress != want {
+						t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want)
+					}
+
 					checker.IPv6(t,
 						p.Pkt.Header.View(),
 						checker.SrcAddr(header.IPv6Any),
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 517f4b941..f565aafb2 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -225,7 +225,9 @@ func (r *Route) Release() {
 // Clone Clone a route such that the original one can be released and the new
 // one will remain valid.
 func (r *Route) Clone() Route {
-	r.ref.incRef()
+	if r.ref != nil {
+		r.ref.incRef()
+	}
 	return *r
 }
 
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 730ac4292..1e9a0dea3 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -1082,7 +1082,11 @@ func (c *Context) SACKEnabled() bool {
 
 // SetGSOEnabled enables or disables generic segmentation offload.
 func (c *Context) SetGSOEnabled(enable bool) {
-	c.linkEP.GSO = enable
+	if enable {
+		c.linkEP.LinkEPCapabilities |= stack.CapabilityHardwareGSO
+	} else {
+		c.linkEP.LinkEPCapabilities &^= stack.CapabilityHardwareGSO
+	}
 }
 
 // MSSWithoutOptions returns the value for the MSS used by the stack when no
-- 
cgit v1.2.3


From 6c3072243dfbf70062de5f610e14fd6ed2ce5f32 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 31 Jan 2020 14:14:52 -0800
Subject: Implement file locks for regular tmpfs files in VFSv2.

Add a file lock implementation that can be embedded into various filesystem
implementations.

Updates #1480

PiperOrigin-RevId: 292614758
---
 pkg/sentry/fsimpl/tmpfs/BUILD                |  3 ++
 pkg/sentry/fsimpl/tmpfs/regular_file.go      | 23 +++++++++
 pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 56 ++++++++++++++++++++++
 pkg/sentry/fsimpl/tmpfs/tmpfs.go             | 43 +++++++++++++++++
 pkg/sentry/vfs/lock/BUILD                    | 13 +++++
 pkg/sentry/vfs/lock/lock.go                  | 72 ++++++++++++++++++++++++++++
 6 files changed, 210 insertions(+)
 create mode 100644 pkg/sentry/vfs/lock/BUILD
 create mode 100644 pkg/sentry/vfs/lock/lock.go

diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index c61366224..57abd5583 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -38,6 +38,7 @@ go_library(
         "//pkg/sentry/arch",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/pipe",
@@ -47,6 +48,7 @@ go_library(
         "//pkg/sentry/platform",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
@@ -86,6 +88,7 @@ go_test(
         "//pkg/context",
         "//pkg/fspath",
         "//pkg/sentry/contexttest",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/contexttest",
         "//pkg/sentry/vfs",
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index e9e6faf67..dab346a41 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -192,6 +193,28 @@ func (fd *regularFileFD) Sync(ctx context.Context) error {
 	return nil
 }
 
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
+	return fd.inode().lockBSD(uid, t, block)
+}
+
+// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
+func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
+	fd.inode().unlockBSD(uid)
+	return nil
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
+	return fd.inode().lockPOSIX(uid, t, rng, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
+	fd.inode().unlockPOSIX(uid, rng)
+	return nil
+}
+
 // regularFileReadWriter implements safemem.Reader and Safemem.Writer.
 type regularFileReadWriter struct {
 	file *regularFile
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index 32552e261..2b52992ea 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -24,9 +24,11 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -260,6 +262,60 @@ func TestPWrite(t *testing.T) {
 	}
 }
 
+func TestLocks(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, 0644)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	var (
+		uid1 lock.UniqueID
+		uid2 lock.UniqueID
+		// Non-blocking.
+		block lock.Blocker
+	)
+
+	uid1 = 123
+	uid2 = 456
+
+	if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, block); err != nil {
+		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
+	}
+	if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, block); err != nil {
+		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
+	}
+	if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block), syserror.ErrWouldBlock; got != want {
+		t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want)
+	}
+	if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil {
+		t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err)
+	}
+	if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block); err != nil {
+		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
+	}
+
+	rng1 := lock.LockRange{0, 1}
+	rng2 := lock.LockRange{1, 2}
+
+	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, rng1, block); err != nil {
+		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
+	}
+	if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng2, block); err != nil {
+		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
+	}
+	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, rng1, block); err != nil {
+		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
+	}
+	if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng1, block), syserror.ErrWouldBlock; got != want {
+		t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want)
+	}
+	if err := fd.Impl().UnlockPOSIX(ctx, uid1, rng1); err != nil {
+		t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err)
+	}
+}
+
 func TestPRead(t *testing.T) {
 	ctx := contexttest.Context(t)
 	fd, cleanup, err := newFileFD(ctx, 0644)
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 88dbd6e35..2108d0f4d 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -30,10 +30,12 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -153,6 +155,9 @@ type inode struct {
 	rdevMajor uint32
 	rdevMinor uint32
 
+	// Advisory file locks, which lock at the inode level.
+	locks lock.FileLocks
+
 	impl interface{} // immutable
 }
 
@@ -352,6 +357,44 @@ func (i *inode) setStat(stat linux.Statx) error {
 	return nil
 }
 
+// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
+func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	switch i.impl.(type) {
+	case *regularFile:
+		return i.locks.LockBSD(uid, t, block)
+	}
+	return syserror.EBADF
+}
+
+// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
+func (i *inode) unlockBSD(uid fslock.UniqueID) error {
+	switch i.impl.(type) {
+	case *regularFile:
+		i.locks.UnlockBSD(uid)
+		return nil
+	}
+	return syserror.EBADF
+}
+
+// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
+func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
+	switch i.impl.(type) {
+	case *regularFile:
+		return i.locks.LockPOSIX(uid, t, rng, block)
+	}
+	return syserror.EBADF
+}
+
+// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
+func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error {
+	switch i.impl.(type) {
+	case *regularFile:
+		i.locks.UnlockPOSIX(uid, rng)
+		return nil
+	}
+	return syserror.EBADF
+}
+
 // allocatedBlocksForSize returns the number of 512B blocks needed to
 // accommodate the given size in bytes, as appropriate for struct
 // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD
new file mode 100644
index 000000000..d9ab063b7
--- /dev/null
+++ b/pkg/sentry/vfs/lock/BUILD
@@ -0,0 +1,13 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "lock",
+    srcs = ["lock.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/sentry/fs/lock",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock/lock.go
new file mode 100644
index 000000000..724dfe743
--- /dev/null
+++ b/pkg/sentry/vfs/lock/lock.go
@@ -0,0 +1,72 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package lock provides POSIX and BSD style file locking for VFS2 file
+// implementations.
+//
+// The actual implementations can be found in the lock package under
+// sentry/fs/lock.
+package lock
+
+import (
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2)
+// and flock(2) respectively in Linux. It can be embedded into various file
+// implementations for VFS2 that support locking.
+//
+// Note that in Linux these two types of locks are _not_ cooperative, because
+// race and deadlock conditions make merging them prohibitive. We do the same
+// and keep them oblivious to each other.
+type FileLocks struct {
+	// bsd is a set of BSD-style advisory file wide locks, see flock(2).
+	bsd fslock.Locks
+
+	// posix is a set of POSIX-style regional advisory locks, see fcntl(2).
+	posix fslock.Locks
+}
+
+// LockBSD tries to acquire a BSD-style lock on the entire file.
+func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) {
+		return nil
+	}
+	return syserror.ErrWouldBlock
+}
+
+// UnlockBSD releases a BSD-style lock on the entire file.
+//
+// This operation is always successful, even if there did not exist a lock on
+// the requested region held by uid in the first place.
+func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) {
+	fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF})
+}
+
+// LockPOSIX tries to acquire a POSIX-style lock on a file region.
+func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
+	if fl.posix.LockRegion(uid, t, rng, block) {
+		return nil
+	}
+	return syserror.ErrWouldBlock
+}
+
+// UnlockPOSIX releases a POSIX-style lock on a file region.
+//
+// This operation is always successful, even if there did not exist a lock on
+// the requested region held by uid in the first place.
+func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) {
+	fl.posix.UnlockRegion(uid, rng)
+}
-- 
cgit v1.2.3


From 04cccaaeeed22a28a42fc4c1406b43a966a5d886 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 31 Jan 2020 14:44:50 -0800
Subject: Fix logic around AMD/Intel cases.

If the support is Ignored, then the call is still executed. We
simply rely on it to fall through to the int3. Therefore, we
must also bail on the vendor check.

PiperOrigin-RevId: 292620558
---
 test/syscalls/linux/32bit.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index 2751fb4e7..9883aef61 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -102,7 +102,8 @@ TEST(Syscall32Bit, Int80) {
 }
 
 TEST(Syscall32Bit, Sysenter) {
-  if (PlatformSupport32Bit() == PlatformSupport::Allowed &&
+  if ((PlatformSupport32Bit() == PlatformSupport::Allowed ||
+       PlatformSupport32Bit() == PlatformSupport::Ignored) &&
       GetCPUVendor() == CPUVendor::kAMD) {
     // SYSENTER is an illegal instruction in compatibility mode on AMD.
     EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
@@ -133,7 +134,8 @@ TEST(Syscall32Bit, Sysenter) {
 }
 
 TEST(Syscall32Bit, Syscall) {
-  if (PlatformSupport32Bit() == PlatformSupport::Allowed &&
+  if ((PlatformSupport32Bit() == PlatformSupport::Allowed ||
+       PlatformSupport32Bit() == PlatformSupport::Ignored) &&
       GetCPUVendor() == CPUVendor::kIntel) {
     // SYSCALL is an illegal instruction in compatibility mode on Intel.
     EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
-- 
cgit v1.2.3


From 02997af5abd62d778fca4d01b047a6bdebab2090 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Fri, 31 Jan 2020 15:08:17 -0800
Subject: Fix method comment to match method name.

PiperOrigin-RevId: 292624867
---
 pkg/tcpip/transport/tcp/accept.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 6101f2945..08afb7c17 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -271,8 +271,8 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 	return n, nil
 }
 
-// createEndpoint creates a new endpoint in connected state and then performs
-// the TCP 3-way handshake.
+// createEndpointAndPerformHandshake creates a new endpoint in connected state
+// and then performs the TCP 3-way handshake.
 func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create new endpoint.
 	irs := s.sequenceNumber
-- 
cgit v1.2.3


From 4d1a648c7c5db8a51416bff647260a1be3b5c12e Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 3 Feb 2020 11:39:01 -0800
Subject: Allow mlock in system call filters

Go 1.14 has a workaround for a Linux 5.2-5.4 bug which requires mlock'ing the g
stack to prevent register corruption. We need to allow this syscall until it is
removed from Go.

PiperOrigin-RevId: 292967478
---
 runsc/boot/filter/config.go | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 4fb9adca6..f8d351c7b 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -174,6 +174,18 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_LSEEK:   {},
 	syscall.SYS_MADVISE: {},
 	syscall.SYS_MINCORE: {},
+	// Used by the Go runtime as a temporarily workaround for a Linux
+	// 5.2-5.4 bug.
+	//
+	// See src/runtime/os_linux_x86.go.
+	//
+	// TODO(b/148688965): Remove once this is gone from Go.
+	syscall.SYS_MLOCK: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(4096),
+		},
+	},
 	syscall.SYS_MMAP: []seccomp.Rule{
 		{
 			seccomp.AllowAny{},
-- 
cgit v1.2.3


From 9742daf3c201771d257c0d043347f4eebf3088e0 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Mon, 3 Feb 2020 12:03:38 -0800
Subject: Add packetdrill tests that use docker.

PiperOrigin-RevId: 292973224
---
 test/packetdrill/BUILD                 |   8 ++
 test/packetdrill/Dockerfile            |   9 ++
 test/packetdrill/defs.bzl              |  85 +++++++++++++
 test/packetdrill/fin_wait2_timeout.pkt |  23 ++++
 test/packetdrill/packetdrill_setup.sh  |  26 ++++
 test/packetdrill/packetdrill_test.sh   | 213 +++++++++++++++++++++++++++++++++
 6 files changed, 364 insertions(+)
 create mode 100644 test/packetdrill/BUILD
 create mode 100644 test/packetdrill/Dockerfile
 create mode 100644 test/packetdrill/defs.bzl
 create mode 100644 test/packetdrill/fin_wait2_timeout.pkt
 create mode 100755 test/packetdrill/packetdrill_setup.sh
 create mode 100755 test/packetdrill/packetdrill_test.sh

diff --git a/test/packetdrill/BUILD b/test/packetdrill/BUILD
new file mode 100644
index 000000000..d113555b1
--- /dev/null
+++ b/test/packetdrill/BUILD
@@ -0,0 +1,8 @@
+load("defs.bzl", "packetdrill_test")
+
+package(licenses = ["notice"])
+
+packetdrill_test(
+    name = "fin_wait2_timeout",
+    scripts = ["fin_wait2_timeout.pkt"],
+)
diff --git a/test/packetdrill/Dockerfile b/test/packetdrill/Dockerfile
new file mode 100644
index 000000000..bd4451355
--- /dev/null
+++ b/test/packetdrill/Dockerfile
@@ -0,0 +1,9 @@
+FROM ubuntu:bionic
+
+RUN apt-get update
+RUN apt-get install -y net-tools git iptables iputils-ping netcat tcpdump jq tar
+RUN hash -r
+RUN git clone --branch packetdrill-v2.0 \
+        https://github.com/google/packetdrill.git
+RUN cd packetdrill/gtests/net/packetdrill && ./configure && \
+        apt-get install -y bison flex make && make
diff --git a/test/packetdrill/defs.bzl b/test/packetdrill/defs.bzl
new file mode 100644
index 000000000..582f97e0c
--- /dev/null
+++ b/test/packetdrill/defs.bzl
@@ -0,0 +1,85 @@
+"""Defines a rule for packetdrill test targets."""
+
+def _packetdrill_test_impl(ctx):
+    test_runner = ctx.executable._test_runner
+    runner = ctx.actions.declare_file("%s-runner" % ctx.label.name)
+
+    script_paths = []
+    for script in ctx.files.scripts:
+        script_paths.append(script.short_path)
+    runner_content = "\n".join([
+        "#!/bin/bash",
+        # This test will run part in a distinct user namespace. This can cause
+        # permission problems, because all runfiles may not be owned by the
+        # current user, and no other users will be mapped in that namespace.
+        # Make sure that everything is readable here.
+        "find . -type f -exec chmod a+rx {} \\;",
+        "find . -type d -exec chmod a+rx {} \\;",
+        "%s %s --init_script %s -- %s\n" % (
+            test_runner.short_path,
+            " ".join(ctx.attr.flags),
+            ctx.files._init_script[0].short_path,
+            " ".join(script_paths),
+        ),
+    ])
+    ctx.actions.write(runner, runner_content, is_executable = True)
+
+    transitive_files = depset()
+    if hasattr(ctx.attr._test_runner, "data_runfiles"):
+        transitive_files = depset(ctx.attr._test_runner.data_runfiles.files)
+    runfiles = ctx.runfiles(
+        files = [test_runner] + ctx.files._init_script + ctx.files.scripts,
+        transitive_files = transitive_files,
+        collect_default = True,
+        collect_data = True,
+    )
+    return [DefaultInfo(executable = runner, runfiles = runfiles)]
+
+_packetdrill_test = rule(
+    attrs = {
+        "_test_runner": attr.label(
+            executable = True,
+            cfg = "host",
+            allow_files = True,
+            default = "packetdrill_test.sh",
+        ),
+        "_init_script": attr.label(
+            allow_single_file = True,
+            default = "packetdrill_setup.sh",
+        ),
+        "flags": attr.string_list(
+            mandatory = False,
+            default = [],
+        ),
+        "scripts": attr.label_list(
+            mandatory = True,
+            allow_files = True,
+        ),
+    },
+    test = True,
+    implementation = _packetdrill_test_impl,
+)
+
+_PACKETDRILL_TAGS = ["local", "manual"]
+
+def packetdrill_linux_test(name, **kwargs):
+    if "tags" not in kwargs:
+        kwargs["tags"] = _PACKETDRILL_TAGS
+    _packetdrill_test(
+        name = name + "_linux_test",
+        flags = ["--dut_platform", "linux"],
+        **kwargs
+    )
+
+def packetdrill_netstack_test(name, **kwargs):
+    if "tags" not in kwargs:
+        kwargs["tags"] = _PACKETDRILL_TAGS
+    _packetdrill_test(
+        name = name + "_netstack_test",
+        flags = ["--dut_platform", "netstack"],
+        **kwargs
+    )
+
+def packetdrill_test(**kwargs):
+    packetdrill_linux_test(**kwargs)
+    packetdrill_netstack_test(**kwargs)
diff --git a/test/packetdrill/fin_wait2_timeout.pkt b/test/packetdrill/fin_wait2_timeout.pkt
new file mode 100644
index 000000000..613f0bec9
--- /dev/null
+++ b/test/packetdrill/fin_wait2_timeout.pkt
@@ -0,0 +1,23 @@
+// Test that a socket in FIN_WAIT_2 eventually times out and a subsequent
+// packet generates a RST.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 bind(3, ..., ...) = 0
+
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <...>
++0 < P. 1:1(0) ack 1 win 257
+
++0.100 accept(3, ..., ...) = 4
+// set FIN_WAIT2 timeout to 1 seconds.
++0.100 setsockopt(4, SOL_TCP, TCP_LINGER2, [1], 4) = 0
++0 close(4) = 0
+
++0 > F. 1:1(0) ack 1 <...>
++0 < . 1:1(0) ack 2 win 257
+
++1.1 < . 1:1(0) ack 2 win 257
++0 > R  2:2(0) win 0
diff --git a/test/packetdrill/packetdrill_setup.sh b/test/packetdrill/packetdrill_setup.sh
new file mode 100755
index 000000000..b858072f0
--- /dev/null
+++ b/test/packetdrill/packetdrill_setup.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script runs both within the sentry context and natively. It should tweak
+# TCP parameters to match expectations found in the script files.
+sysctl -q net.ipv4.tcp_sack=1
+sysctl -q net.ipv4.tcp_rmem="4096 2097152 $((8*1024*1024))"
+sysctl -q net.ipv4.tcp_wmem="4096 2097152 $((8*1024*1024))"
+
+# There may be errors from the above, but they will show up in the test logs and
+# we always want to proceed from this point. It's possible that values were
+# already set correctly and the nodes were not available in the namespace.
+exit 0
diff --git a/test/packetdrill/packetdrill_test.sh b/test/packetdrill/packetdrill_test.sh
new file mode 100755
index 000000000..614d94d74
--- /dev/null
+++ b/test/packetdrill/packetdrill_test.sh
@@ -0,0 +1,213 @@
+#!/bin/bash
+
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Run a packetdrill test.  Two docker containers are made, one for the
+# Device-Under-Test (DUT) and one for the test runner.  Each is attached with
+# two networks, one for control packets that aid the test and one for test
+# packets which are sent as part of the test and observed for correctness.
+
+set -euxo pipefail
+
+function failure() {
+  local lineno=$1
+  local msg=$2
+  local filename="$0"
+  echo "FAIL: $filename:$lineno: $msg"
+}
+trap 'failure ${LINENO} "$BASH_COMMAND"' ERR
+
+declare -r LONGOPTS="dut_platform:,init_script:"
+
+# Don't use declare below so that the error from getopt will end the script.
+PARSED=$(getopt --options "" --longoptions=$LONGOPTS --name "$0" -- "$@")
+
+eval set -- "$PARSED"
+
+while true; do
+  case "$1" in
+    --dut_platform)
+      declare -r DUT_PLATFORM="$2"
+      shift 2
+      ;;
+    --init_script)
+      declare -r INIT_SCRIPT="$2"
+      shift 2
+      ;;
+    --)
+      shift
+      break
+      ;;
+    *)
+      echo "Programming error"
+      exit 3
+  esac
+done
+
+# All the other arguments are scripts.
+declare -r scripts="$@"
+
+# Check that the required flags are defined in a way that is safe for "set -u".
+if [[ "${DUT_PLATFORM-}" == "netstack" ]]; then
+  declare -r RUNTIME="--runtime runsc-d"
+elif [[ "${DUT_PLATFORM-}" == "linux" ]]; then
+  declare -r RUNTIME=""
+else
+  echo "FAIL: Bad or missing --dut_platform argument: ${DUT_PLATFORM-}"
+  exit 2
+fi
+if [[ ! -x "${INIT_SCRIPT-}" ]]; then
+  echo "FAIL: Bad or missing --init_script: ${INIT_SCRIPT-}"
+  exit 2
+fi
+
+# Variables specific to the control network and interface start with CTRL_.
+# Variables specific to the test network and interface start with TEST_.
+# Variables specific to the DUT start with DUT_.
+# Variables specific to the test runner start with TEST_RUNNER_.
+declare -r PACKETDRILL="/packetdrill/gtests/net/packetdrill/packetdrill"
+# Use random numbers so that test networks don't collide.
+declare -r CTRL_NET="ctrl_net-${RANDOM}${RANDOM}"
+declare -r TEST_NET="test_net-${RANDOM}${RANDOM}"
+declare -r tolerance_usecs=100000
+# On both DUT and test runner, testing packets are on the eth2 interface.
+declare -r TEST_DEVICE="eth2"
+# Number of bits in the *_NET_PREFIX variables.
+declare -r NET_MASK="24"
+function new_net_prefix() {
+  # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
+  echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)"
+}
+# Last bits of the DUT's IP address.
+declare -r DUT_NET_SUFFIX=".10"
+# Control port.
+declare -r CTRL_PORT="40000"
+# Last bits of the test runner's IP address.
+declare -r TEST_RUNNER_NET_SUFFIX=".20"
+declare -r TIMEOUT="60"
+declare -r IMAGE_TAG="gcr.io/gvisor-presubmit/packetdrill"
+
+# Make sure that docker is installed.
+docker --version
+
+function finish {
+  local cleanup_success=1
+  for net in "${CTRL_NET}" "${TEST_NET}"; do
+    # Kill all processes attached to ${net}.
+    for docker_command in "kill" "rm"; do
+      (docker network inspect "${net}" \
+        --format '{{range $key, $value := .Containers}}{{$key}} {{end}}' \
+        | xargs -r docker "${docker_command}") || \
+        cleanup_success=0
+    done
+    # Remove the network.
+    docker network rm "${net}" || \
+      cleanup_success=0
+  done
+
+  if ((!$cleanup_success)); then
+    echo "FAIL: Cleanup command failed"
+    exit 4
+  fi
+}
+trap finish EXIT
+
+# Subnet for control packets between test runner and DUT.
+declare CTRL_NET_PREFIX=$(new_net_prefix)
+while ! docker network create \
+  "--subnet=${CTRL_NET_PREFIX}.0/${NET_MASK}" "${CTRL_NET}"; do
+  sleep 0.1
+  declare CTRL_NET_PREFIX=$(new_net_prefix)
+done
+
+# Subnet for the packets that are part of the test.
+declare TEST_NET_PREFIX=$(new_net_prefix)
+while ! docker network create \
+  "--subnet=${TEST_NET_PREFIX}.0/${NET_MASK}" "${TEST_NET}"; do
+  sleep 0.1
+  declare TEST_NET_PREFIX=$(new_net_prefix)
+done
+
+docker pull "${IMAGE_TAG}"
+
+# Create the DUT container and connect to network.
+DUT=$(docker create ${RUNTIME} --privileged --rm \
+  --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG})
+docker network connect "${CTRL_NET}" \
+  --ip "${CTRL_NET_PREFIX}${DUT_NET_SUFFIX}" "${DUT}" \
+  || (docker kill ${DUT}; docker rm ${DUT}; false)
+docker network connect "${TEST_NET}" \
+  --ip "${TEST_NET_PREFIX}${DUT_NET_SUFFIX}" "${DUT}" \
+  || (docker kill ${DUT}; docker rm ${DUT}; false)
+docker start "${DUT}"
+
+# Create the test runner container and connect to network.
+TEST_RUNNER=$(docker create --privileged --rm \
+  --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG})
+docker network connect "${CTRL_NET}" \
+  --ip "${CTRL_NET_PREFIX}${TEST_RUNNER_NET_SUFFIX}" "${TEST_RUNNER}" \
+  || (docker kill ${TEST_RUNNER}; docker rm ${REST_RUNNER}; false)
+docker network connect "${TEST_NET}" \
+  --ip "${TEST_NET_PREFIX}${TEST_RUNNER_NET_SUFFIX}" "${TEST_RUNNER}" \
+  || (docker kill ${TEST_RUNNER}; docker rm ${REST_RUNNER}; false)
+docker start "${TEST_RUNNER}"
+
+# Run tcpdump in the test runner unbuffered, without dns resolution, just on the
+# interface with the test packets.
+docker exec -t ${TEST_RUNNER} tcpdump -U -n -i "${TEST_DEVICE}" &
+
+# Start a packetdrill server on the test_runner.  The packetdrill server sends
+# packets and asserts that they are received.
+docker exec -d "${TEST_RUNNER}" \
+  ${PACKETDRILL} --wire_server --wire_server_dev="${TEST_DEVICE}" \
+  --wire_server_ip="${CTRL_NET_PREFIX}${TEST_RUNNER_NET_SUFFIX}" \
+  --wire_server_port="${CTRL_PORT}" \
+  --local_ip="${TEST_NET_PREFIX}${TEST_RUNNER_NET_SUFFIX}" \
+  --remote_ip="${TEST_NET_PREFIX}${DUT_NET_SUFFIX}"
+
+# Because the Linux kernel receives the SYN-ACK but didn't send the SYN it will
+# issue a RST. To prevent this IPtables can be used to filter those out.
+docker exec "${TEST_RUNNER}" \
+  iptables -A OUTPUT -p tcp --tcp-flags RST RST -j DROP
+
+# Wait for the packetdrill server on the test runner to come.  Attempt to
+# connect to it from the DUT every 100 milliseconds until success.
+while ! docker exec "${DUT}" \
+  nc -zv "${CTRL_NET_PREFIX}${TEST_RUNNER_NET_SUFFIX}" "${CTRL_PORT}"; do
+  sleep 0.1
+done
+
+# Copy the packetdrill setup script to the DUT.
+docker cp -L "${INIT_SCRIPT}" "${DUT}:packetdrill_setup.sh"
+
+# Copy the packetdrill scripts to the DUT.
+declare -a dut_scripts
+for script in $scripts; do
+  docker cp -L "${script}" "${DUT}:$(basename ${script})"
+  dut_scripts+=("/$(basename ${script})")
+done
+
+# Start a packetdrill client on the DUT.  The packetdrill client runs POSIX
+# socket commands and also sends instructions to the server.
+docker exec -t "${DUT}" \
+  ${PACKETDRILL} --wire_client --wire_client_dev="${TEST_DEVICE}" \
+  --wire_server_ip="${CTRL_NET_PREFIX}${TEST_RUNNER_NET_SUFFIX}" \
+  --wire_server_port="${CTRL_PORT}" \
+  --local_ip="${TEST_NET_PREFIX}${DUT_NET_SUFFIX}" \
+  --remote_ip="${TEST_NET_PREFIX}${TEST_RUNNER_NET_SUFFIX}" \
+  --init_scripts=/packetdrill_setup.sh \
+  --tolerance_usecs="${tolerance_usecs}" "${dut_scripts[@]}"
+
+echo PASS: No errors.
-- 
cgit v1.2.3


From 80ce7f253783feac041ef197b144d920e57955ec Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Mon, 3 Feb 2020 12:08:46 -0800
Subject: Tag version_test as noguitar.

PiperOrigin-RevId: 292974323
---
 runsc/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/runsc/BUILD b/runsc/BUILD
index b35b41d81..375241921 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -117,4 +117,5 @@ sh_test(
     srcs = ["version_test.sh"],
     args = ["$(location :runsc)"],
     data = [":runsc"],
+    tags = ["noguitar"],
 )
-- 
cgit v1.2.3


From e7846e50f2df070a15dd33235b334e2223f715f3 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Mon, 3 Feb 2020 15:30:28 -0800
Subject: Reduce run time for
 //test/syscalls:socket_inet_loopback_test_runsc_ptrace.

* Tests are picked for a shard differently. It now picks one test from each
  block, instead of picking the whole block. This makes the same kind of tests
  spreads across different shards.

* Reduce the number of connect() calls in TCPListenClose.

PiperOrigin-RevId: 293019281
---
 runsc/testutil/testutil.go                  | 61 ++++++++++++++---------------
 test/runtimes/runner.go                     |  9 ++---
 test/syscalls/linux/socket_inet_loopback.cc | 13 +++---
 test/syscalls/syscall_test_runner.go        |  7 ++--
 4 files changed, 43 insertions(+), 47 deletions(-)

diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index fb22eae39..5d0b0ae54 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -434,43 +434,40 @@ func IsStatic(filename string) (bool, error) {
 	return true, nil
 }
 
-// TestBoundsForShard calculates the beginning and end indices for the test
-// based on the TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars. The
-// returned ints are the beginning (inclusive) and end (exclusive) of the
-// subslice corresponding to the shard. If either of the env vars are not
-// present, then the function will return bounds that include all tests. If
-// there are more shards than there are tests, then the returned list may be
-// empty.
-func TestBoundsForShard(numTests int) (int, int, error) {
+// TestIndicesForShard returns indices for this test shard based on the
+// TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars.
+//
+// If either of the env vars are not present, then the function will return all
+// tests. If there are more shards than there are tests, then the returned list
+// may be empty.
+func TestIndicesForShard(numTests int) ([]int, error) {
 	var (
-		begin = 0
-		end   = numTests
+		shardIndex = 0
+		shardTotal = 1
 	)
-	indexStr, totalStr := os.Getenv("TEST_SHARD_INDEX"), os.Getenv("TEST_TOTAL_SHARDS")
-	if indexStr == "" || totalStr == "" {
-		return begin, end, nil
-	}
 
-	// Parse index and total to ints.
-	shardIndex, err := strconv.Atoi(indexStr)
-	if err != nil {
-		return 0, 0, fmt.Errorf("invalid TEST_SHARD_INDEX %q: %v", indexStr, err)
-	}
-	shardTotal, err := strconv.Atoi(totalStr)
-	if err != nil {
-		return 0, 0, fmt.Errorf("invalid TEST_TOTAL_SHARDS %q: %v", totalStr, err)
+	indexStr, totalStr := os.Getenv("TEST_SHARD_INDEX"), os.Getenv("TEST_TOTAL_SHARDS")
+	if indexStr != "" && totalStr != "" {
+		// Parse index and total to ints.
+		var err error
+		shardIndex, err = strconv.Atoi(indexStr)
+		if err != nil {
+			return nil, fmt.Errorf("invalid TEST_SHARD_INDEX %q: %v", indexStr, err)
+		}
+		shardTotal, err = strconv.Atoi(totalStr)
+		if err != nil {
+			return nil, fmt.Errorf("invalid TEST_TOTAL_SHARDS %q: %v", totalStr, err)
+		}
 	}
 
 	// Calculate!
-	shardSize := int(math.Ceil(float64(numTests) / float64(shardTotal)))
-	begin = shardIndex * shardSize
-	end = ((shardIndex + 1) * shardSize)
-	if begin > numTests {
-		// Nothing to run.
-		return 0, 0, nil
-	}
-	if end > numTests {
-		end = numTests
+	var indices []int
+	numBlocks := int(math.Ceil(float64(numTests) / float64(shardTotal)))
+	for i := 0; i < numBlocks; i++ {
+		pick := i*shardTotal + shardIndex
+		if pick < numTests {
+			indices = append(indices, pick)
+		}
 	}
-	return begin, end, nil
+	return indices, nil
 }
diff --git a/test/runtimes/runner.go b/test/runtimes/runner.go
index bec37c69d..ddb890dbc 100644
--- a/test/runtimes/runner.go
+++ b/test/runtimes/runner.go
@@ -20,7 +20,6 @@ import (
 	"flag"
 	"fmt"
 	"io"
-	"log"
 	"os"
 	"sort"
 	"strings"
@@ -101,17 +100,15 @@ func getTests(d dockerutil.Docker, blacklist map[string]struct{}) ([]testing.Int
 	// shard.
 	tests := strings.Fields(list)
 	sort.Strings(tests)
-	begin, end, err := testutil.TestBoundsForShard(len(tests))
+	indices, err := testutil.TestIndicesForShard(len(tests))
 	if err != nil {
 		return nil, fmt.Errorf("TestsForShard() failed: %v", err)
 	}
-	log.Printf("Got bounds [%d:%d) for shard out of %d total tests", begin, end, len(tests))
-	tests = tests[begin:end]
 
 	var itests []testing.InternalTest
-	for _, tc := range tests {
+	for _, tci := range indices {
 		// Capture tc in this scope.
-		tc := tc
+		tc := tests[tci]
 		itests = append(itests, testing.InternalTest{
 			Name: tc,
 			F: func(t *testing.T) {
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 3bf7081b9..b24618a88 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -325,6 +325,12 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   TestAddress const& listener = param.listener;
   TestAddress const& connector = param.connector;
 
+  constexpr int kAcceptCount = 32;
+  constexpr int kBacklog = kAcceptCount * 2;
+  constexpr int kFDs = 128;
+  constexpr int kThreadCount = 4;
+  constexpr int kFDsPerThread = kFDs / kThreadCount;
+
   // Create the listening socket.
   FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
       Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
@@ -332,7 +338,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
                    listener.addr_len),
               SyscallSucceeds());
-  ASSERT_THAT(listen(listen_fd.get(), 1001), SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
 
   // Get the port bound by the listening socket.
   socklen_t addrlen = listener.addr_len;
@@ -345,9 +351,6 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   DisableSave ds;  // Too many system calls.
   sockaddr_storage conn_addr = connector.addr;
   ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
-  constexpr int kFDs = 2048;
-  constexpr int kThreadCount = 4;
-  constexpr int kFDsPerThread = kFDs / kThreadCount;
   FileDescriptor clients[kFDs];
   std::unique_ptr<ScopedThread> threads[kThreadCount];
   for (int i = 0; i < kFDs; i++) {
@@ -371,7 +374,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   for (int i = 0; i < kThreadCount; i++) {
     threads[i]->Join();
   }
-  for (int i = 0; i < 32; i++) {
+  for (int i = 0; i < kAcceptCount; i++) {
     auto accepted =
         ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
   }
diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
index b9fd885ff..ae342b68c 100644
--- a/test/syscalls/syscall_test_runner.go
+++ b/test/syscalls/syscall_test_runner.go
@@ -450,17 +450,16 @@ func main() {
 	}
 
 	// Get subset of tests corresponding to shard.
-	begin, end, err := testutil.TestBoundsForShard(len(testCases))
+	indices, err := testutil.TestIndicesForShard(len(testCases))
 	if err != nil {
 		fatalf("TestsForShard() failed: %v", err)
 	}
-	testCases = testCases[begin:end]
 
 	// Run the tests.
 	var tests []testing.InternalTest
-	for _, tc := range testCases {
+	for _, tci := range indices {
 		// Capture tc.
-		tc := tc
+		tc := testCases[tci]
 		testName := fmt.Sprintf("%s_%s", tc.Suite, tc.Name)
 		tests = append(tests, testing.InternalTest{
 			Name: testName,
-- 
cgit v1.2.3


From 6cd7901d7d5f9639e95fff3d8927ba8856a83f91 Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Mon, 3 Feb 2020 15:30:42 -0800
Subject: Add 1 Kokoro job per runtime test.

PiperOrigin-RevId: 293019326
---
 kokoro/runtime_tests/go1.12.cfg       |  6 ++++++
 kokoro/runtime_tests/java11.cfg       |  6 ++++++
 kokoro/runtime_tests/nodejs12.4.0.cfg |  6 ++++++
 kokoro/runtime_tests/php7.3.6.cfg     |  6 ++++++
 kokoro/runtime_tests/python3.7.3.cfg  |  6 ++++++
 kokoro/runtime_tests/runtime_tests.sh | 25 +++++++++++++++++++++++++
 scripts/runtime_tests.sh              | 25 -------------------------
 7 files changed, 55 insertions(+), 25 deletions(-)
 create mode 100644 kokoro/runtime_tests/go1.12.cfg
 create mode 100644 kokoro/runtime_tests/java11.cfg
 create mode 100644 kokoro/runtime_tests/nodejs12.4.0.cfg
 create mode 100644 kokoro/runtime_tests/php7.3.6.cfg
 create mode 100644 kokoro/runtime_tests/python3.7.3.cfg
 create mode 100755 kokoro/runtime_tests/runtime_tests.sh
 delete mode 100755 scripts/runtime_tests.sh

diff --git a/kokoro/runtime_tests/go1.12.cfg b/kokoro/runtime_tests/go1.12.cfg
new file mode 100644
index 000000000..024740ab2
--- /dev/null
+++ b/kokoro/runtime_tests/go1.12.cfg
@@ -0,0 +1,6 @@
+build_file: "github/kokoro/runtime_tests/runtime_tests.sh"
+
+env_vars {
+  key: "RUNTIME_TEST_NAME"
+  value: "go1.12"
+}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/java11.cfg b/kokoro/runtime_tests/java11.cfg
new file mode 100644
index 000000000..f01d26153
--- /dev/null
+++ b/kokoro/runtime_tests/java11.cfg
@@ -0,0 +1,6 @@
+build_file: "github/kokoro/runtime_tests/runtime_tests.sh"
+
+env_vars {
+  key: "RUNTIME_TEST_NAME"
+  value: "java11"
+}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/nodejs12.4.0.cfg b/kokoro/runtime_tests/nodejs12.4.0.cfg
new file mode 100644
index 000000000..d4861fb07
--- /dev/null
+++ b/kokoro/runtime_tests/nodejs12.4.0.cfg
@@ -0,0 +1,6 @@
+build_file: "github/kokoro/runtime_tests/runtime_tests.sh"
+
+env_vars {
+  key: "RUNTIME_TEST_NAME"
+  value: "nodejs12.4.0"
+}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/php7.3.6.cfg b/kokoro/runtime_tests/php7.3.6.cfg
new file mode 100644
index 000000000..b737ed9cb
--- /dev/null
+++ b/kokoro/runtime_tests/php7.3.6.cfg
@@ -0,0 +1,6 @@
+build_file: "github/kokoro/runtime_tests/runtime_tests.sh"
+
+env_vars {
+  key: "RUNTIME_TEST_NAME"
+  value: "php7.3.6"
+}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/python3.7.3.cfg b/kokoro/runtime_tests/python3.7.3.cfg
new file mode 100644
index 000000000..971fcba05
--- /dev/null
+++ b/kokoro/runtime_tests/python3.7.3.cfg
@@ -0,0 +1,6 @@
+build_file: "github/kokoro/runtime_tests/runtime_tests.sh"
+
+env_vars {
+  key: "RUNTIME_TEST_NAME"
+  value: "python3.7.3"
+}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/runtime_tests.sh b/kokoro/runtime_tests/runtime_tests.sh
new file mode 100755
index 000000000..9ee991e42
--- /dev/null
+++ b/kokoro/runtime_tests/runtime_tests.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+if [ ! -v RUNTIME_TEST_NAME ]; then
+  echo 'Must set $RUNTIME_TEST_NAME' >&2
+  exit 1
+fi
+
+install_runsc_for_test runtimes
+test_runsc "//test/runtimes:${RUNTIME_TEST_NAME}_test"
diff --git a/scripts/runtime_tests.sh b/scripts/runtime_tests.sh
deleted file mode 100755
index 9ee991e42..000000000
--- a/scripts/runtime_tests.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-if [ ! -v RUNTIME_TEST_NAME ]; then
-  echo 'Must set $RUNTIME_TEST_NAME' >&2
-  exit 1
-fi
-
-install_runsc_for_test runtimes
-test_runsc "//test/runtimes:${RUNTIME_TEST_NAME}_test"
-- 
cgit v1.2.3


From f37e913a358820ea98013772dd2880cc8a3c9218 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 3 Feb 2020 16:15:16 -0800
Subject: seccomp: allow to filter syscalls by instruction pointer

PiperOrigin-RevId: 293029446
---
 pkg/seccomp/seccomp.go       | 20 ++++++++++++++++----
 pkg/seccomp/seccomp_rules.go |  6 +++++-
 pkg/seccomp/seccomp_test.go  | 27 +++++++++++++++++++++++++++
 3 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go
index fc36efa23..55fd6967e 100644
--- a/pkg/seccomp/seccomp.go
+++ b/pkg/seccomp/seccomp.go
@@ -219,24 +219,36 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action linux.BPFAc
 				switch a := arg.(type) {
 				case AllowAny:
 				case AllowValue:
+					dataOffsetLow := seccompDataOffsetArgLow(i)
+					dataOffsetHigh := seccompDataOffsetArgHigh(i)
+					if i == RuleIP {
+						dataOffsetLow = seccompDataOffsetIPLow
+						dataOffsetHigh = seccompDataOffsetIPHigh
+					}
 					high, low := uint32(a>>32), uint32(a)
 					// assert arg_low == low
-					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i))
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					// assert arg_high == high
-					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i))
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					labelled = true
 				case GreaterThan:
+					dataOffsetLow := seccompDataOffsetArgLow(i)
+					dataOffsetHigh := seccompDataOffsetArgHigh(i)
+					if i == RuleIP {
+						dataOffsetLow = seccompDataOffsetIPLow
+						dataOffsetHigh = seccompDataOffsetIPHigh
+					}
 					labelGood := fmt.Sprintf("gt%v", i)
 					high, low := uint32(a>>32), uint32(a)
 					// assert arg_high < high
-					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i))
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					// arg_high > high
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
 					// arg_low < low
-					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i))
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jgt|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
 					labelled = true
diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go
index 84c841d7f..06308cd29 100644
--- a/pkg/seccomp/seccomp_rules.go
+++ b/pkg/seccomp/seccomp_rules.go
@@ -62,7 +62,11 @@ func (a AllowValue) String() (s string) {
 // rule := Rule {
 //       AllowValue(linux.ARCH_GET_FS | linux.ARCH_SET_FS), // arg0
 // }
-type Rule [6]interface{}
+type Rule [7]interface{} // 6 arguments + RIP
+
+// RuleIP indicates what rules in the Rule array have to be applied to
+// instruction pointer.
+const RuleIP = 6
 
 func (r Rule) String() (s string) {
 	if len(r) == 0 {
diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go
index abbee7051..da5a5e4b2 100644
--- a/pkg/seccomp/seccomp_test.go
+++ b/pkg/seccomp/seccomp_test.go
@@ -388,6 +388,33 @@ func TestBasic(t *testing.T) {
 				},
 			},
 		},
+		{
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								RuleIP: AllowValue(0x7aabbccdd),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			specs: []spec{
+				{
+					desc: "IP: Syscall instruction pointer allowed",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{}, instructionPointer: 0x7aabbccdd},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "IP: Syscall instruction pointer disallowed",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{}, instructionPointer: 0x711223344},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
 	} {
 		instrs, err := BuildProgram(test.ruleSets, test.defaultAction)
 		if err != nil {
-- 
cgit v1.2.3


From d7cd484091543827678f1548b8e5668a7a86e13f Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 4 Feb 2020 08:20:10 -0800
Subject: Add support for sentry internal pipe for gofer mounts

Internal pipes are supported similarly to how internal UDS is done.
It is also controlled by the same flag.

Fixes #1102

PiperOrigin-RevId: 293150045
---
 pkg/sentry/fs/gofer/BUILD            |   2 +
 pkg/sentry/fs/gofer/cache_policy.go  |   3 +
 pkg/sentry/fs/gofer/fifo.go          |  40 ++++++++
 pkg/sentry/fs/gofer/gofer_test.go    |   2 +-
 pkg/sentry/fs/gofer/path.go          | 183 +++++++++++++++++++++++------------
 pkg/sentry/fs/gofer/session.go       | 183 ++++++++++++++++++++++-------------
 pkg/sentry/fs/gofer/session_state.go |  12 +--
 pkg/sentry/fs/gofer/socket.go        |   8 +-
 test/syscalls/BUILD                  |   1 -
 9 files changed, 293 insertions(+), 141 deletions(-)
 create mode 100644 pkg/sentry/fs/gofer/fifo.go

diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD
index 971d3718e..fea135eea 100644
--- a/pkg/sentry/fs/gofer/BUILD
+++ b/pkg/sentry/fs/gofer/BUILD
@@ -9,6 +9,7 @@ go_library(
         "cache_policy.go",
         "context_file.go",
         "device.go",
+        "fifo.go",
         "file.go",
         "file_state.go",
         "fs.go",
@@ -38,6 +39,7 @@ go_library(
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/host",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
         "//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go
index ebea03c42..07a564e92 100644
--- a/pkg/sentry/fs/gofer/cache_policy.go
+++ b/pkg/sentry/fs/gofer/cache_policy.go
@@ -127,6 +127,9 @@ func (cp cachePolicy) revalidate(ctx context.Context, name string, parent, child
 
 	childIops, ok := child.InodeOperations.(*inodeOperations)
 	if !ok {
+		if _, ok := child.InodeOperations.(*fifo); ok {
+			return false
+		}
 		panic(fmt.Sprintf("revalidating inode operations of unknown type %T", child.InodeOperations))
 	}
 	parentIops, ok := parent.InodeOperations.(*inodeOperations)
diff --git a/pkg/sentry/fs/gofer/fifo.go b/pkg/sentry/fs/gofer/fifo.go
new file mode 100644
index 000000000..456557058
--- /dev/null
+++ b/pkg/sentry/fs/gofer/fifo.go
@@ -0,0 +1,40 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+)
+
+// +stateify savable
+type fifo struct {
+	fs.InodeOperations
+	fileIops *inodeOperations
+}
+
+var _ fs.InodeOperations = (*fifo)(nil)
+
+// Rename implements fs.InodeOperations. It forwards the call to the underlying
+// file inode to handle the file rename. Note that file key remains the same
+// after the rename to keep the endpoint mapping.
+func (i *fifo) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error {
+	return i.fileIops.Rename(ctx, inode, oldParent, oldName, newParent, newName, replacement)
+}
+
+// StatFS implements fs.InodeOperations.
+func (i *fifo) StatFS(ctx context.Context) (fs.Info, error) {
+	return i.fileIops.StatFS(ctx)
+}
diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go
index 0c2f89ae8..2df2fe889 100644
--- a/pkg/sentry/fs/gofer/gofer_test.go
+++ b/pkg/sentry/fs/gofer/gofer_test.go
@@ -61,7 +61,7 @@ func rootTest(t *testing.T, name string, cp cachePolicy, fn func(context.Context
 		ctx := contexttest.Context(t)
 		sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{
 			file: rootFile,
-		}, root.QID, p9.AttrMaskAll(), root.Attr, false /* socket */)
+		}, root.QID, p9.AttrMaskAll(), root.Attr)
 		m := fs.NewMountSource(ctx, s, &filesystem{}, fs.MountSourceFlags{})
 		rootInode := fs.NewInode(ctx, rootInodeOperations, m, sattr)
 
diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go
index 0c1be05ef..a35c3a23d 100644
--- a/pkg/sentry/fs/gofer/path.go
+++ b/pkg/sentry/fs/gofer/path.go
@@ -23,14 +23,24 @@ import (
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // maxFilenameLen is the maximum length of a filename. This is dictated by 9P's
 // encoding of strings, which uses 2 bytes for the length prefix.
 const maxFilenameLen = (1 << 16) - 1
 
+func changeType(mode p9.FileMode, newType p9.FileMode) p9.FileMode {
+	if newType&^p9.FileModeMask != 0 {
+		panic(fmt.Sprintf("newType contained more bits than just file mode: %x", newType))
+	}
+	clear := mode &^ p9.FileModeMask
+	return clear | newType
+}
+
 // Lookup loads an Inode at name into a Dirent based on the session's cache
 // policy.
 func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) {
@@ -69,8 +79,25 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string
 		return nil, err
 	}
 
+	if i.session().overrides != nil {
+		// Check if file belongs to a internal named pipe. Note that it doesn't need
+		// to check for sockets because it's done in newInodeOperations below.
+		deviceKey := device.MultiDeviceKey{
+			Device:          p9attr.RDev,
+			SecondaryDevice: i.session().connID,
+			Inode:           qids[0].Path,
+		}
+		unlock := i.session().overrides.lock()
+		if pipeInode := i.session().overrides.getPipe(deviceKey); pipeInode != nil {
+			unlock()
+			pipeInode.IncRef()
+			return fs.NewDirent(ctx, pipeInode, name), nil
+		}
+		unlock()
+	}
+
 	// Construct the Inode operations.
-	sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr, false)
+	sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr)
 
 	// Construct a positive Dirent.
 	return fs.NewDirent(ctx, fs.NewInode(ctx, node, dir.MountSource, sattr), name), nil
@@ -138,7 +165,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string
 	qid := qids[0]
 
 	// Construct the InodeOperations.
-	sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr, false)
+	sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr)
 
 	// Construct the positive Dirent.
 	d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name)
@@ -223,82 +250,115 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
 		return nil, syserror.ENAMETOOLONG
 	}
 
-	if i.session().endpoints == nil {
+	if i.session().overrides == nil {
 		return nil, syscall.EOPNOTSUPP
 	}
 
-	// Create replaces the directory fid with the newly created/opened
-	// file, so clone this directory so it doesn't change out from under
-	// this node.
-	_, newFile, err := i.fileState.file.walk(ctx, nil)
+	// Stabilize the override map while creation is in progress.
+	unlock := i.session().overrides.lock()
+	defer unlock()
+
+	sattr, iops, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeSocket)
 	if err != nil {
 		return nil, err
 	}
-	// We're not going to use newFile after return.
-	defer newFile.close(ctx)
 
-	// Stabilize the endpoint map while creation is in progress.
-	unlock := i.session().endpoints.lock()
-	defer unlock()
+	// Construct the positive Dirent.
+	childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name)
+	i.session().overrides.addBoundEndpoint(iops.fileState.key, childDir, ep)
+	return childDir, nil
+}
 
-	// Create a regular file in the gofer and then mark it as a socket by
-	// adding this inode key in the 'endpoints' map.
-	owner := fs.FileOwnerFromContext(ctx)
-	hostFile, err := newFile.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID))
-	if err != nil {
-		return nil, err
+// CreateFifo implements fs.InodeOperations.CreateFifo.
+func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
+	if len(name) > maxFilenameLen {
+		return syserror.ENAMETOOLONG
 	}
-	// We're not going to use this file.
-	hostFile.Close()
 
-	i.touchModificationAndStatusChangeTime(ctx, dir)
+	owner := fs.FileOwnerFromContext(ctx)
+	mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe
 
-	// Get the attributes of the file to create inode key.
-	qid, mask, attr, err := getattr(ctx, newFile)
-	if err != nil {
-		return nil, err
+	// N.B. FIFOs use major/minor numbers 0.
+	if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil {
+		if i.session().overrides == nil || err != syscall.EPERM {
+			return err
+		}
+		// If gofer doesn't support mknod, check if we can create an internal fifo.
+		return i.createInternalFifo(ctx, dir, name, owner, perm)
 	}
 
-	key := device.MultiDeviceKey{
-		Device:          attr.RDev,
-		SecondaryDevice: i.session().connID,
-		Inode:           qid.Path,
+	i.touchModificationAndStatusChangeTime(ctx, dir)
+	return nil
+}
+
+func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode, name string, owner fs.FileOwner, perm fs.FilePermissions) error {
+	if i.session().overrides == nil {
+		return syserror.EPERM
 	}
 
-	// Create child dirent.
+	// Stabilize the override map while creation is in progress.
+	unlock := i.session().overrides.lock()
+	defer unlock()
 
-	// Get an unopened p9.File for the file we created so that it can be
-	// cloned and re-opened multiple times after creation.
-	_, unopened, err := i.fileState.file.walk(ctx, []string{name})
+	sattr, fileOps, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeNamedPipe)
 	if err != nil {
-		return nil, err
+		return err
 	}
 
-	// Construct the InodeOperations.
-	sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, attr, true)
+	// First create a pipe.
+	p := pipe.NewPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)
+
+	// Wrap the fileOps with our Fifo.
+	iops := &fifo{
+		InodeOperations: pipe.NewInodeOperations(ctx, perm, p),
+		fileIops:        fileOps,
+	}
+	inode := fs.NewInode(ctx, iops, dir.MountSource, sattr)
 
 	// Construct the positive Dirent.
 	childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name)
-	i.session().endpoints.add(key, childDir, ep)
-	return childDir, nil
+	i.session().overrides.addPipe(fileOps.fileState.key, childDir, inode)
+	return nil
 }
 
-// CreateFifo implements fs.InodeOperations.CreateFifo.
-func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
-	if len(name) > maxFilenameLen {
-		return syserror.ENAMETOOLONG
+// Caller must hold Session.endpoint lock.
+func (i *inodeOperations) createEndpointFile(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions, fileType p9.FileMode) (fs.StableAttr, *inodeOperations, error) {
+	_, dirClone, err := i.fileState.file.walk(ctx, nil)
+	if err != nil {
+		return fs.StableAttr{}, nil, err
 	}
+	// We're not going to use dirClone after return.
+	defer dirClone.close(ctx)
 
+	// Create a regular file in the gofer and then mark it as a socket by
+	// adding this inode key in the 'overrides' map.
 	owner := fs.FileOwnerFromContext(ctx)
-	mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe
-
-	// N.B. FIFOs use major/minor numbers 0.
-	if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil {
-		return err
+	hostFile, err := dirClone.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID))
+	if err != nil {
+		return fs.StableAttr{}, nil, err
 	}
+	// We're not going to use this file.
+	hostFile.Close()
 
 	i.touchModificationAndStatusChangeTime(ctx, dir)
-	return nil
+
+	// Get the attributes of the file to create inode key.
+	qid, mask, attr, err := getattr(ctx, dirClone)
+	if err != nil {
+		return fs.StableAttr{}, nil, err
+	}
+
+	// Get an unopened p9.File for the file we created so that it can be
+	// cloned and re-opened multiple times after creation.
+	_, unopened, err := i.fileState.file.walk(ctx, []string{name})
+	if err != nil {
+		return fs.StableAttr{}, nil, err
+	}
+
+	// Construct new inode with file type overridden.
+	attr.Mode = changeType(attr.Mode, fileType)
+	sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, attr)
+	return sattr, iops, nil
 }
 
 // Remove implements InodeOperations.Remove.
@@ -307,20 +367,23 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string
 		return syserror.ENAMETOOLONG
 	}
 
-	var key device.MultiDeviceKey
-	removeSocket := false
-	if i.session().endpoints != nil {
-		// Find out if file being deleted is a socket that needs to be
+	var key *device.MultiDeviceKey
+	if i.session().overrides != nil {
+		// Find out if file being deleted is a socket or pipe that needs to be
 		// removed from endpoint map.
 		if d, err := i.Lookup(ctx, dir, name); err == nil {
 			defer d.DecRef()
-			if fs.IsSocket(d.Inode.StableAttr) {
-				child := d.Inode.InodeOperations.(*inodeOperations)
-				key = child.fileState.key
-				removeSocket = true
 
-				// Stabilize the endpoint map while deletion is in progress.
-				unlock := i.session().endpoints.lock()
+			if fs.IsSocket(d.Inode.StableAttr) || fs.IsPipe(d.Inode.StableAttr) {
+				switch iops := d.Inode.InodeOperations.(type) {
+				case *inodeOperations:
+					key = &iops.fileState.key
+				case *fifo:
+					key = &iops.fileIops.fileState.key
+				}
+
+				// Stabilize the override map while deletion is in progress.
+				unlock := i.session().overrides.lock()
 				defer unlock()
 			}
 		}
@@ -329,8 +392,8 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string
 	if err := i.fileState.file.unlinkAt(ctx, name, 0); err != nil {
 		return err
 	}
-	if removeSocket {
-		i.session().endpoints.remove(key)
+	if key != nil {
+		i.session().overrides.remove(*key)
 	}
 	i.touchModificationAndStatusChangeTime(ctx, dir)
 
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index 498c4645a..f6b3ef178 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -33,60 +33,107 @@ import (
 var DefaultDirentCacheSize uint64 = fs.DefaultDirentCacheSize
 
 // +stateify savable
-type endpointMaps struct {
-	// mu protexts the direntMap, the keyMap, and the pathMap below.
-	mu sync.RWMutex `state:"nosave"`
+type overrideInfo struct {
+	dirent *fs.Dirent
+
+	// endpoint is set when dirent points to a socket. inode must not be set.
+	endpoint transport.BoundEndpoint
+
+	// inode is set when dirent points to a pipe. endpoint must not be set.
+	inode *fs.Inode
+}
 
-	// direntMap links sockets to their dirents.
-	// It is filled concurrently with the keyMap and is stored upon save.
-	// Before saving, this map is used to populate the pathMap.
-	direntMap map[transport.BoundEndpoint]*fs.Dirent
+func (l *overrideInfo) inodeType() fs.InodeType {
+	switch {
+	case l.endpoint != nil:
+		return fs.Socket
+	case l.inode != nil:
+		return fs.Pipe
+	}
+	panic("endpoint or node must be set")
+}
 
-	// keyMap links MultiDeviceKeys (containing inode IDs) to their sockets.
+// +stateify savable
+type overrideMaps struct {
+	// mu protexts the keyMap, and the pathMap below.
+	mu sync.RWMutex `state:"nosave"`
+
+	// keyMap links MultiDeviceKeys (containing inode IDs) to their sockets/pipes.
 	// It is not stored during save because the inode ID may change upon restore.
-	keyMap map[device.MultiDeviceKey]transport.BoundEndpoint `state:"nosave"`
+	keyMap map[device.MultiDeviceKey]*overrideInfo `state:"nosave"`
 
-	// pathMap links the sockets to their paths.
+	// pathMap links the sockets/pipes to their paths.
 	// It is filled before saving from the direntMap and is stored upon save.
 	// Upon restore, this map is used to re-populate the keyMap.
-	pathMap map[transport.BoundEndpoint]string
+	pathMap map[*overrideInfo]string
+}
+
+// addBoundEndpoint adds the bound endpoint to the map.
+// A reference is taken on the dirent argument.
+//
+// Precondition: maps must have been locked with 'lock'.
+func (e *overrideMaps) addBoundEndpoint(key device.MultiDeviceKey, d *fs.Dirent, ep transport.BoundEndpoint) {
+	d.IncRef()
+	e.keyMap[key] = &overrideInfo{dirent: d, endpoint: ep}
 }
 
-// add adds the endpoint to the maps.
+// addPipe adds the pipe inode to the map.
 // A reference is taken on the dirent argument.
 //
 // Precondition: maps must have been locked with 'lock'.
-func (e *endpointMaps) add(key device.MultiDeviceKey, d *fs.Dirent, ep transport.BoundEndpoint) {
-	e.keyMap[key] = ep
+func (e *overrideMaps) addPipe(key device.MultiDeviceKey, d *fs.Dirent, inode *fs.Inode) {
 	d.IncRef()
-	e.direntMap[ep] = d
+	e.keyMap[key] = &overrideInfo{dirent: d, inode: inode}
 }
 
 // remove deletes the key from the maps.
 //
 // Precondition: maps must have been locked with 'lock'.
-func (e *endpointMaps) remove(key device.MultiDeviceKey) {
-	endpoint := e.get(key)
+func (e *overrideMaps) remove(key device.MultiDeviceKey) {
+	endpoint := e.keyMap[key]
 	delete(e.keyMap, key)
-
-	d := e.direntMap[endpoint]
-	d.DecRef()
-	delete(e.direntMap, endpoint)
+	endpoint.dirent.DecRef()
 }
 
 // lock blocks other addition and removal operations from happening while
 // the backing file is being created or deleted. Returns a function that unlocks
 // the endpoint map.
-func (e *endpointMaps) lock() func() {
+func (e *overrideMaps) lock() func() {
 	e.mu.Lock()
 	return func() { e.mu.Unlock() }
 }
 
-// get returns the endpoint mapped to the given key.
+// getBoundEndpoint returns the bound endpoint mapped to the given key.
 //
-// Precondition: maps must have been locked for reading.
-func (e *endpointMaps) get(key device.MultiDeviceKey) transport.BoundEndpoint {
-	return e.keyMap[key]
+// Precondition: maps must have been locked.
+func (e *overrideMaps) getBoundEndpoint(key device.MultiDeviceKey) transport.BoundEndpoint {
+	if v := e.keyMap[key]; v != nil {
+		return v.endpoint
+	}
+	return nil
+}
+
+// getPipe returns the pipe inode mapped to the given key.
+//
+// Precondition: maps must have been locked.
+func (e *overrideMaps) getPipe(key device.MultiDeviceKey) *fs.Inode {
+	if v := e.keyMap[key]; v != nil {
+		return v.inode
+	}
+	return nil
+}
+
+// getType returns the inode type if there is a corresponding endpoint for the
+// given key. Returns false otherwise.
+func (e *overrideMaps) getType(key device.MultiDeviceKey) (fs.InodeType, bool) {
+	e.mu.Lock()
+	v := e.keyMap[key]
+	e.mu.Unlock()
+
+	if v != nil {
+		return v.inodeType(), true
+	}
+	return 0, false
 }
 
 // session holds state for each 9p session established during sys_mount.
@@ -137,16 +184,16 @@ type session struct {
 	// mounter is the EUID/EGID that mounted this file system.
 	mounter fs.FileOwner `state:"wait"`
 
-	// endpoints is used to map inodes that represent socket files to their
-	// corresponding endpoint. Socket files are created as regular files in the
-	// gofer and their presence in this map indicate that they should indeed be
-	// socket files. This allows unix domain sockets to be used with paths that
-	// belong to a gofer.
+	// overrides is used to map inodes that represent socket/pipes files to their
+	// corresponding endpoint/iops. These files are created as regular files in
+	// the gofer and their presence in this map indicate that they should indeed
+	// be socket/pipe files. This allows unix domain sockets and named pipes to
+	// be used with paths that belong to a gofer.
 	//
 	// TODO(gvisor.dev/issue/1200): there are few possible races with someone
 	// stat'ing the file and another deleting it concurrently, where the file
 	// will not be reported as socket file.
-	endpoints *endpointMaps `state:"wait"`
+	overrides *overrideMaps `state:"wait"`
 }
 
 // Destroy tears down the session.
@@ -179,15 +226,21 @@ func (s *session) SaveInodeMapping(inode *fs.Inode, path string) {
 	// This is very unintuitive. We *CANNOT* trust the inode's StableAttrs,
 	// because overlay copyUp may have changed them out from under us.
 	// So much for "immutable".
-	sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr
-	s.inodeMappings[sattr.InodeID] = path
+	switch iops := inode.InodeOperations.(type) {
+	case *inodeOperations:
+		s.inodeMappings[iops.fileState.sattr.InodeID] = path
+	case *fifo:
+		s.inodeMappings[iops.fileIops.fileState.sattr.InodeID] = path
+	default:
+		panic(fmt.Sprintf("Invalid type: %T", iops))
+	}
 }
 
-// newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File and attributes
-// (p9.QID, p9.AttrMask, p9.Attr).
+// newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File
+// and attributes (p9.QID, p9.AttrMask, p9.Attr).
 //
 // Endpoints lock must not be held if socket == false.
-func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr, socket bool) (fs.StableAttr, *inodeOperations) {
+func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr) (fs.StableAttr, *inodeOperations) {
 	deviceKey := device.MultiDeviceKey{
 		Device:          attr.RDev,
 		SecondaryDevice: s.connID,
@@ -201,17 +254,11 @@ func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p
 		BlockSize: bsize(attr),
 	}
 
-	if s.endpoints != nil {
-		if socket {
-			sattr.Type = fs.Socket
-		} else {
-			// If unix sockets are allowed on this filesystem, check if this file is
-			// supposed to be a socket file.
-			unlock := s.endpoints.lock()
-			if s.endpoints.get(deviceKey) != nil {
-				sattr.Type = fs.Socket
-			}
-			unlock()
+	if s.overrides != nil && sattr.Type == fs.RegularFile {
+		// If overrides are allowed on this filesystem, check if this file is
+		// supposed to be of a different type, e.g. socket.
+		if t, ok := s.overrides.getType(deviceKey); ok {
+			sattr.Type = t
 		}
 	}
 
@@ -267,7 +314,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
 	s.EnableLeakCheck("gofer.session")
 
 	if o.privateunixsocket {
-		s.endpoints = newEndpointMaps()
+		s.overrides = newOverrideMaps()
 	}
 
 	// Construct the MountSource with the session and superBlockFlags.
@@ -305,26 +352,24 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
 		return nil, err
 	}
 
-	sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr, false)
+	sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr)
 	return fs.NewInode(ctx, iops, m, sattr), nil
 }
 
-// newEndpointMaps creates a new endpointMaps.
-func newEndpointMaps() *endpointMaps {
-	return &endpointMaps{
-		direntMap: make(map[transport.BoundEndpoint]*fs.Dirent),
-		keyMap:    make(map[device.MultiDeviceKey]transport.BoundEndpoint),
-		pathMap:   make(map[transport.BoundEndpoint]string),
+// newOverrideMaps creates a new overrideMaps.
+func newOverrideMaps() *overrideMaps {
+	return &overrideMaps{
+		keyMap:  make(map[device.MultiDeviceKey]*overrideInfo),
+		pathMap: make(map[*overrideInfo]string),
 	}
 }
 
-// fillKeyMap populates key and dirent maps upon restore from saved
-// pathmap.
+// fillKeyMap populates key and dirent maps upon restore from saved pathmap.
 func (s *session) fillKeyMap(ctx context.Context) error {
-	unlock := s.endpoints.lock()
+	unlock := s.overrides.lock()
 	defer unlock()
 
-	for ep, dirPath := range s.endpoints.pathMap {
+	for ep, dirPath := range s.overrides.pathMap {
 		_, file, err := s.attach.walk(ctx, splitAbsolutePath(dirPath))
 		if err != nil {
 			return fmt.Errorf("error filling endpointmaps, failed to walk to %q: %v", dirPath, err)
@@ -341,25 +386,25 @@ func (s *session) fillKeyMap(ctx context.Context) error {
 			Inode:           qid.Path,
 		}
 
-		s.endpoints.keyMap[key] = ep
+		s.overrides.keyMap[key] = ep
 	}
 	return nil
 }
 
-// fillPathMap populates paths for endpoints from dirents in direntMap
+// fillPathMap populates paths for overrides from dirents in direntMap
 // before save.
 func (s *session) fillPathMap() error {
-	unlock := s.endpoints.lock()
+	unlock := s.overrides.lock()
 	defer unlock()
 
-	for ep, dir := range s.endpoints.direntMap {
-		mountRoot := dir.MountRoot()
+	for _, endpoint := range s.overrides.keyMap {
+		mountRoot := endpoint.dirent.MountRoot()
 		defer mountRoot.DecRef()
-		dirPath, _ := dir.FullName(mountRoot)
+		dirPath, _ := endpoint.dirent.FullName(mountRoot)
 		if dirPath == "" {
 			return fmt.Errorf("error getting path from dirent")
 		}
-		s.endpoints.pathMap[ep] = dirPath
+		s.overrides.pathMap[endpoint] = dirPath
 	}
 	return nil
 }
@@ -368,7 +413,7 @@ func (s *session) fillPathMap() error {
 func (s *session) restoreEndpointMaps(ctx context.Context) error {
 	// When restoring, only need to create the keyMap because the dirent and path
 	// maps got stored through the save.
-	s.endpoints.keyMap = make(map[device.MultiDeviceKey]transport.BoundEndpoint)
+	s.overrides.keyMap = make(map[device.MultiDeviceKey]*overrideInfo)
 	if err := s.fillKeyMap(ctx); err != nil {
 		return fmt.Errorf("failed to insert sockets into endpoint map: %v", err)
 	}
@@ -376,6 +421,6 @@ func (s *session) restoreEndpointMaps(ctx context.Context) error {
 	// Re-create pathMap because it can no longer be trusted as socket paths can
 	// change while process continues to run. Empty pathMap will be re-filled upon
 	// next save.
-	s.endpoints.pathMap = make(map[transport.BoundEndpoint]string)
+	s.overrides.pathMap = make(map[*overrideInfo]string)
 	return nil
 }
diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go
index 0285c5361..111da59f9 100644
--- a/pkg/sentry/fs/gofer/session_state.go
+++ b/pkg/sentry/fs/gofer/session_state.go
@@ -25,9 +25,9 @@ import (
 
 // beforeSave is invoked by stateify.
 func (s *session) beforeSave() {
-	if s.endpoints != nil {
+	if s.overrides != nil {
 		if err := s.fillPathMap(); err != nil {
-			panic("failed to save paths to endpoint map before saving" + err.Error())
+			panic("failed to save paths to override map before saving" + err.Error())
 		}
 	}
 }
@@ -74,10 +74,10 @@ func (s *session) afterLoad() {
 		panic(fmt.Sprintf("new attach name %v, want %v", opts.aname, s.aname))
 	}
 
-	// Check if endpointMaps exist when uds sockets are enabled
-	// (only pathmap will actualy have been saved).
-	if opts.privateunixsocket != (s.endpoints != nil) {
-		panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.endpoints != nil))
+	// Check if overrideMaps exist when uds sockets are enabled (only pathmaps
+	// will actually have been saved).
+	if opts.privateunixsocket != (s.overrides != nil) {
+		panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.overrides != nil))
 	}
 	if args.Flags != s.superBlockFlags {
 		panic(fmt.Sprintf("new mount flags %v, want %v", args.Flags, s.superBlockFlags))
diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go
index 376cfce2c..10ba2f5f0 100644
--- a/pkg/sentry/fs/gofer/socket.go
+++ b/pkg/sentry/fs/gofer/socket.go
@@ -32,15 +32,15 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport.
 		return nil
 	}
 
-	if i.session().endpoints != nil {
-		unlock := i.session().endpoints.lock()
+	if i.session().overrides != nil {
+		unlock := i.session().overrides.lock()
 		defer unlock()
-		ep := i.session().endpoints.get(i.fileState.key)
+		ep := i.session().overrides.getBoundEndpoint(i.fileState.key)
 		if ep != nil {
 			return ep
 		}
 
-		// Not found in endpoints map, it may be a gofer backed unix socket...
+		// Not found in overrides map, it may be a gofer backed unix socket...
 	}
 
 	inode.IncRef()
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 40e974314..8f2b75a1c 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -225,7 +225,6 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:mknod_test",
-    use_tmpfs = True,  # mknod is not supported over gofer.
 )
 
 syscall_test(
-- 
cgit v1.2.3


From 492229d0176c1af2ab4ea4cf91bf211e940b5b12 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 4 Feb 2020 11:28:36 -0800
Subject: VFS2 gofer client

Updates #1198

Opening host pipes (by spinning in fdpipe) and host sockets is not yet
complete, and will be done in a future CL.

Major differences from VFS1 gofer client (sentry/fs/gofer), with varying levels
of backportability:

- "Cache policies" are replaced by InteropMode, which control the behavior of
  timestamps in addition to caching. Under InteropModeExclusive (analogous to
  cacheAll) and InteropModeWritethrough (analogous to cacheAllWritethrough),
  client timestamps are *not* written back to the server (it is not possible in
  9P or Linux for clients to set ctime, so writing back client-authoritative
  timestamps results in incoherence between atime/mtime and ctime). Under
  InteropModeShared (analogous to cacheRemoteRevalidating), client timestamps
  are not used at all (remote filesystem clocks are authoritative). cacheNone
  is translated to InteropModeShared + new option
  filesystemOptions.specialRegularFiles.

- Under InteropModeShared, "unstable attribute" reloading for permission
  checks, lookup, and revalidation are fused, which is feasible in VFS2 since
  gofer.filesystem controls path resolution. This results in a ~33% reduction
  in RPCs for filesystem operations compared to cacheRemoteRevalidating. For
  example, consider stat("/foo/bar/baz") where "/foo/bar/baz" fails
  revalidation, resulting in the instantiation of a new dentry:

  VFS1 RPCs:
  getattr("/")                          // fs.MountNamespace.FindLink() => fs.Inode.CheckPermission() => gofer.inodeOperations.check() => gofer.inodeOperations.UnstableAttr()
  walkgetattr("/", "foo") = fid1        // fs.Dirent.walk() => gofer.session.Revalidate() => gofer.cachePolicy.Revalidate()
  clunk(fid1)
  getattr("/foo")                       // CheckPermission
  walkgetattr("/foo", "bar") = fid2     // Revalidate
  clunk(fid2)
  getattr("/foo/bar")                   // CheckPermission
  walkgetattr("/foo/bar", "baz") = fid3 // Revalidate
  clunk(fid3)
  walkgetattr("/foo/bar", "baz") = fid4 // fs.Dirent.walk() => gofer.inodeOperations.Lookup
  getattr("/foo/bar/baz")               // linux.stat() => gofer.inodeOperations.UnstableAttr()

  VFS2 RPCs:
  getattr("/")                          // gofer.filesystem.walkExistingLocked()
  walkgetattr("/", "foo") = fid1        // gofer.filesystem.stepExistingLocked()
  clunk(fid1)
                                        // No getattr: walkgetattr already updated metadata for permission check
  walkgetattr("/foo", "bar") = fid2
  clunk(fid2)
  walkgetattr("/foo/bar", "baz") = fid3
                                        // No clunk: fid3 used for new gofer.dentry
                                        // No getattr: walkgetattr already updated metadata for stat()

- gofer.filesystem.unlinkAt() does not require instantiation of a dentry that
  represents the file to be deleted. Updates #898.

- gofer.regularFileFD.OnClose() skips Tflushf for regular files under
  InteropModeExclusive, as it's nonsensical to request a remote file flush
  without flushing locally-buffered writes to that remote file first.

- Symlink targets are cached when InteropModeShared is not in effect.

- p9.QID.Path (which is already required to be unique for each file within a
  server, and is accordingly already synthesized from device/inode numbers in
  all known gofers) is used as-is for inode numbers, rather than being mapped
  along with attr.RDev in the client to yet another synthetic inode number.

- Relevant parts of fsutil.CachingInodeOperations are inlined directly into
  gofer package code. This avoids having to duplicate part of its functionality
  in fsutil.HostMappable.

PiperOrigin-RevId: 293190213
---
 pkg/safemem/seq_unsafe.go                |   17 +
 pkg/sentry/fs/fsutil/BUILD               |    4 +-
 pkg/sentry/fs/fsutil/frame_ref_set.go    |   13 +-
 pkg/sentry/fs/fsutil/inode_cached.go     |    2 +-
 pkg/sentry/fsimpl/gofer/BUILD            |   55 ++
 pkg/sentry/fsimpl/gofer/directory.go     |  190 +++++
 pkg/sentry/fsimpl/gofer/filesystem.go    | 1087 ++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/gofer/gofer.go         | 1147 ++++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/gofer/handle.go        |  135 ++++
 pkg/sentry/fsimpl/gofer/handle_unsafe.go |   66 ++
 pkg/sentry/fsimpl/gofer/p9file.go        |  219 ++++++
 pkg/sentry/fsimpl/gofer/pagemath.go      |   31 +
 pkg/sentry/fsimpl/gofer/regular_file.go  |  860 ++++++++++++++++++++++
 pkg/sentry/fsimpl/gofer/special_file.go  |  159 +++++
 pkg/sentry/fsimpl/gofer/symlink.go       |   47 ++
 pkg/sentry/fsimpl/gofer/time.go          |   75 ++
 pkg/sentry/fsimpl/tmpfs/filesystem.go    |    2 +-
 pkg/sentry/socket/hostinet/socket.go     |   23 +-
 18 files changed, 4103 insertions(+), 29 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/gofer/BUILD
 create mode 100644 pkg/sentry/fsimpl/gofer/directory.go
 create mode 100644 pkg/sentry/fsimpl/gofer/filesystem.go
 create mode 100644 pkg/sentry/fsimpl/gofer/gofer.go
 create mode 100644 pkg/sentry/fsimpl/gofer/handle.go
 create mode 100644 pkg/sentry/fsimpl/gofer/handle_unsafe.go
 create mode 100644 pkg/sentry/fsimpl/gofer/p9file.go
 create mode 100644 pkg/sentry/fsimpl/gofer/pagemath.go
 create mode 100644 pkg/sentry/fsimpl/gofer/regular_file.go
 create mode 100644 pkg/sentry/fsimpl/gofer/special_file.go
 create mode 100644 pkg/sentry/fsimpl/gofer/symlink.go
 create mode 100644 pkg/sentry/fsimpl/gofer/time.go

diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go
index 354a95dde..dcdfc9600 100644
--- a/pkg/safemem/seq_unsafe.go
+++ b/pkg/safemem/seq_unsafe.go
@@ -18,6 +18,7 @@ import (
 	"bytes"
 	"fmt"
 	"reflect"
+	"syscall"
 	"unsafe"
 )
 
@@ -297,3 +298,19 @@ func ZeroSeq(dsts BlockSeq) (uint64, error) {
 	}
 	return done, nil
 }
+
+// IovecsFromBlockSeq returns a []syscall.Iovec representing seq.
+func IovecsFromBlockSeq(bs BlockSeq) []syscall.Iovec {
+	iovs := make([]syscall.Iovec, 0, bs.NumBlocks())
+	for ; !bs.IsEmpty(); bs = bs.Tail() {
+		b := bs.Head()
+		iovs = append(iovs, syscall.Iovec{
+			Base: &b.ToSlice()[0],
+			Len:  uint64(b.Len()),
+		})
+		// We don't need to care about b.NeedSafecopy(), because the host
+		// kernel will handle such address ranges just fine (by returning
+		// EFAULT).
+	}
+	return iovs
+}
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index 4ab2a384f..789369220 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -28,13 +28,13 @@ go_template_instance(
         "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
     },
     package = "fsutil",
-    prefix = "frameRef",
+    prefix = "FrameRef",
     template = "//pkg/segment:generic_set",
     types = {
         "Key": "uint64",
         "Range": "platform.FileRange",
         "Value": "uint64",
-        "Functions": "frameRefSetFunctions",
+        "Functions": "FrameRefSetFunctions",
     },
 )
 
diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go
index dd63db32b..6564fd0c6 100644
--- a/pkg/sentry/fs/fsutil/frame_ref_set.go
+++ b/pkg/sentry/fs/fsutil/frame_ref_set.go
@@ -20,24 +20,25 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 )
 
-type frameRefSetFunctions struct{}
+// FrameRefSetFunctions implements segment.Functions for FrameRefSet.
+type FrameRefSetFunctions struct{}
 
 // MinKey implements segment.Functions.MinKey.
-func (frameRefSetFunctions) MinKey() uint64 {
+func (FrameRefSetFunctions) MinKey() uint64 {
 	return 0
 }
 
 // MaxKey implements segment.Functions.MaxKey.
-func (frameRefSetFunctions) MaxKey() uint64 {
+func (FrameRefSetFunctions) MaxKey() uint64 {
 	return math.MaxUint64
 }
 
 // ClearValue implements segment.Functions.ClearValue.
-func (frameRefSetFunctions) ClearValue(val *uint64) {
+func (FrameRefSetFunctions) ClearValue(val *uint64) {
 }
 
 // Merge implements segment.Functions.Merge.
-func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) {
+func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) {
 	if val1 != val2 {
 		return 0, false
 	}
@@ -45,6 +46,6 @@ func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.
 }
 
 // Split implements segment.Functions.Split.
-func (frameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
+func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
 	return val, val
 }
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index 573b8586e..800c8b4e1 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -111,7 +111,7 @@ type CachingInodeOperations struct {
 	// refs tracks active references to data in the cache.
 	//
 	// refs is protected by dataMu.
-	refs frameRefSet
+	refs FrameRefSet
 }
 
 // CachingInodeOperationsOptions configures a CachingInodeOperations.
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
new file mode 100644
index 000000000..4ba76a1e8
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -0,0 +1,55 @@
+load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+    name = "dentry_list",
+    out = "dentry_list.go",
+    package = "gofer",
+    prefix = "dentry",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*dentry",
+        "Linker": "*dentry",
+    },
+)
+
+go_library(
+    name = "gofer",
+    srcs = [
+        "dentry_list.go",
+        "directory.go",
+        "filesystem.go",
+        "gofer.go",
+        "handle.go",
+        "handle_unsafe.go",
+        "p9file.go",
+        "pagemath.go",
+        "regular_file.go",
+        "special_file.go",
+        "symlink.go",
+        "time.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fd",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/p9",
+        "//pkg/safemem",
+        "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/unet",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
new file mode 100644
index 000000000..baa2cdd8e
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -0,0 +1,190 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func (d *dentry) isDir() bool {
+	return d.fileType() == linux.S_IFDIR
+}
+
+// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop !=
+// InteropModeShared.
+func (d *dentry) cacheNegativeChildLocked(name string) {
+	if d.negativeChildren == nil {
+		d.negativeChildren = make(map[string]struct{})
+	}
+	d.negativeChildren[name] = struct{}{}
+}
+
+type directoryFD struct {
+	fileDescription
+	vfs.DirectoryFileDescriptionDefaultImpl
+
+	mu      sync.Mutex
+	off     int64
+	dirents []vfs.Dirent
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release() {
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	if fd.dirents == nil {
+		ds, err := fd.dentry().getDirents(ctx)
+		if err != nil {
+			return err
+		}
+		fd.dirents = ds
+	}
+
+	for fd.off < int64(len(fd.dirents)) {
+		if !cb.Handle(fd.dirents[fd.off]) {
+			return nil
+		}
+		fd.off++
+	}
+	return nil
+}
+
+// Preconditions: d.isDir(). There exists at least one directoryFD representing d.
+func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
+	// 9P2000.L's readdir does not specify behavior in the presence of
+	// concurrent mutation of an iterated directory, so implementations may
+	// duplicate or omit entries in this case, which violates POSIX semantics.
+	// Thus we read all directory entries while holding d.dirMu to exclude
+	// directory mutations. (Note that it is impossible for the client to
+	// exclude concurrent mutation from other remote filesystem users. Since
+	// there is no way to detect if the server has incorrectly omitted
+	// directory entries, we simply assume that the server is well-behaved
+	// under InteropModeShared.) This is inconsistent with Linux (which appears
+	// to assume that directory fids have the correct semantics, and translates
+	// struct file_operations::readdir calls directly to readdir RPCs), but is
+	// consistent with VFS1.
+
+	d.fs.renameMu.RLock()
+	defer d.fs.renameMu.RUnlock()
+	d.dirMu.Lock()
+	defer d.dirMu.Unlock()
+	if d.dirents != nil {
+		return d.dirents, nil
+	}
+
+	// It's not clear if 9P2000.L's readdir is expected to return "." and "..",
+	// so we generate them here.
+	parent := d.vfsd.ParentOrSelf().Impl().(*dentry)
+	dirents := []vfs.Dirent{
+		{
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     d.ino,
+			NextOff: 1,
+		},
+		{
+			Name:    "..",
+			Type:    uint8(atomic.LoadUint32(&parent.mode) >> 12),
+			Ino:     parent.ino,
+			NextOff: 2,
+		},
+	}
+	off := uint64(0)
+	const count = 64 * 1024 // for consistency with the vfs1 client
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	if !d.handleReadable {
+		// This should not be possible because a readable handle should have
+		// been opened when the calling directoryFD was opened.
+		panic("gofer.dentry.getDirents called without a readable handle")
+	}
+	for {
+		p9ds, err := d.handle.file.readdir(ctx, off, count)
+		if err != nil {
+			return nil, err
+		}
+		if len(p9ds) == 0 {
+			// Cache dirents for future directoryFDs if permitted.
+			if d.fs.opts.interop != InteropModeShared {
+				d.dirents = dirents
+			}
+			return dirents, nil
+		}
+		for _, p9d := range p9ds {
+			if p9d.Name == "." || p9d.Name == ".." {
+				continue
+			}
+			dirent := vfs.Dirent{
+				Name:    p9d.Name,
+				Ino:     p9d.QID.Path,
+				NextOff: int64(len(dirents) + 1),
+			}
+			// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
+			// DMSOCKET.
+			switch p9d.Type {
+			case p9.TypeSymlink:
+				dirent.Type = linux.DT_LNK
+			case p9.TypeDir:
+				dirent.Type = linux.DT_DIR
+			default:
+				dirent.Type = linux.DT_REG
+			}
+			dirents = append(dirents, dirent)
+		}
+		off = p9ds[len(p9ds)-1].Offset
+	}
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		if offset < 0 {
+			return 0, syserror.EINVAL
+		}
+		if offset == 0 {
+			// Ensure that the next call to fd.IterDirents() calls
+			// fd.dentry().getDirents().
+			fd.dirents = nil
+		}
+		fd.off = offset
+		return fd.off, nil
+	case linux.SEEK_CUR:
+		offset += fd.off
+		if offset < 0 {
+			return 0, syserror.EINVAL
+		}
+		// Don't clear fd.dirents in this case, even if offset == 0.
+		fd.off = offset
+		return fd.off, nil
+	default:
+		return 0, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
new file mode 100644
index 000000000..8eb61debf
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -0,0 +1,1087 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	// Snapshot current dentries and special files.
+	fs.syncMu.Lock()
+	ds := make([]*dentry, 0, len(fs.dentries))
+	for d := range fs.dentries {
+		ds = append(ds, d)
+	}
+	sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs))
+	for sffd := range fs.specialFileFDs {
+		sffds = append(sffds, sffd)
+	}
+	fs.syncMu.Unlock()
+
+	// Return the first error we encounter, but sync everything we can
+	// regardless.
+	var retErr error
+
+	// Sync regular files.
+	for _, d := range ds {
+		if !d.TryIncRef() {
+			continue
+		}
+		err := d.syncSharedHandle(ctx)
+		d.DecRef()
+		if err != nil && retErr == nil {
+			retErr = err
+		}
+	}
+
+	// Sync special files, which may be writable but do not use dentry shared
+	// handles (so they won't be synced by the above).
+	for _, sffd := range sffds {
+		if !sffd.vfsfd.TryIncRef() {
+			continue
+		}
+		err := sffd.Sync(ctx)
+		sffd.vfsfd.DecRef()
+		if err != nil && retErr == nil {
+			retErr = err
+		}
+	}
+
+	return retErr
+}
+
+// maxFilenameLen is the maximum length of a filename. This is dictated by 9P's
+// encoding of strings, which uses 2 bytes for the length prefix.
+const maxFilenameLen = (1 << 16) - 1
+
+// dentrySlicePool is a pool of *[]*dentry used to store dentries for which
+// dentry.checkCachingLocked() must be called. The pool holds pointers to
+// slices because Go lacks generics, so sync.Pool operates on interface{}, so
+// every call to (what should be) sync.Pool<[]*dentry>.Put() allocates a copy
+// of the slice header on the heap.
+var dentrySlicePool = sync.Pool{
+	New: func() interface{} {
+		ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
+		return &ds
+	},
+}
+
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+	if ds == nil {
+		ds = dentrySlicePool.Get().(*[]*dentry)
+	}
+	*ds = append(*ds, d)
+	return ds
+}
+
+// Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+	// Allow dentries to be GC'd.
+	for i := range *ds {
+		(*ds)[i] = nil
+	}
+	*ds = (*ds)[:0]
+	dentrySlicePool.Put(ds)
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may become cached as a result of the traversal are appended
+// to *ds.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached
+// metadata must be up to date.
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		return nil, err
+	}
+afterSymlink:
+	name := rp.Component()
+	if name == "." {
+		rp.Advance()
+		return d, nil
+	}
+	if name == ".." {
+		parentVFSD, err := rp.ResolveParent(&d.vfsd)
+		if err != nil {
+			return nil, err
+		}
+		parent := parentVFSD.Impl().(*dentry)
+		if fs.opts.interop == InteropModeShared {
+			// We must assume that parentVFSD is correct, because if d has been
+			// moved elsewhere in the remote filesystem so that its parent has
+			// changed, we have no way of determining its new parent's location
+			// in the filesystem. Get updated metadata for parentVFSD.
+			_, attrMask, attr, err := parent.file.getAttr(ctx, dentryAttrMask())
+			if err != nil {
+				return nil, err
+			}
+			parent.updateFromP9Attrs(attrMask, &attr)
+		}
+		rp.Advance()
+		return parent, nil
+	}
+	childVFSD, err := rp.ResolveChild(&d.vfsd, name)
+	if err != nil {
+		return nil, err
+	}
+	// FIXME(jamieliu): Linux performs revalidation before mount lookup
+	// (fs/namei.c:lookup_fast() => __d_lookup_rcu(), d_revalidate(),
+	// __follow_mount_rcu()).
+	child, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, childVFSD, ds)
+	if err != nil {
+		return nil, err
+	}
+	if child == nil {
+		return nil, syserror.ENOENT
+	}
+	if child.isSymlink() && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx, rp.Mount())
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance()
+	return child, nil
+}
+
+// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
+// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
+// nil) to verify that the returned child (or lack thereof) is correct. If no file
+// exists at name, revalidateChildLocked returns (nil, nil).
+//
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// parent.isDir(). name is not "." or "..".
+//
+// Postconditions: If revalidateChildLocked returns a non-nil dentry, its
+// cached metadata is up to date.
+func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, childVFSD *vfs.Dentry, ds **[]*dentry) (*dentry, error) {
+	if childVFSD != nil && fs.opts.interop != InteropModeShared {
+		// We have a cached dentry that is assumed to be correct.
+		return childVFSD.Impl().(*dentry), nil
+	}
+	// We either don't have a cached dentry or need to verify that it's still
+	// correct, either of which requires a remote lookup. Check if this name is
+	// valid before performing the lookup.
+	if len(name) > maxFilenameLen {
+		return nil, syserror.ENAMETOOLONG
+	}
+	// Check if we've already cached this lookup with a negative result.
+	if _, ok := parent.negativeChildren[name]; ok {
+		return nil, nil
+	}
+	// Perform the remote lookup.
+	qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
+	if err != nil && err != syserror.ENOENT {
+		return nil, err
+	}
+	if childVFSD != nil {
+		child := childVFSD.Impl().(*dentry)
+		if !file.isNil() && qid.Path == child.ino {
+			// The file at this path hasn't changed. Just update cached
+			// metadata.
+			file.close(ctx)
+			child.updateFromP9Attrs(attrMask, &attr)
+			return child, nil
+		}
+		// The file at this path has changed or no longer exists. Remove
+		// the stale dentry from the tree, and re-evaluate its caching
+		// status (i.e. if it has 0 references, drop it).
+		vfsObj.ForceDeleteDentry(childVFSD)
+		*ds = appendDentry(*ds, child)
+		childVFSD = nil
+	}
+	if file.isNil() {
+		// No file exists at this path now. Cache the negative lookup if
+		// allowed.
+		if fs.opts.interop != InteropModeShared {
+			parent.cacheNegativeChildLocked(name)
+		}
+		return nil, nil
+	}
+	// Create a new dentry representing the file.
+	child, err := fs.newDentry(ctx, file, qid, attrMask, &attr)
+	if err != nil {
+		file.close(ctx)
+		return nil, err
+	}
+	parent.IncRef() // reference held by child on its parent
+	parent.vfsd.InsertChild(&child.vfsd, name)
+	// For now, child has 0 references, so our caller should call
+	// child.checkCachingLocked().
+	*ds = appendDentry(*ds, child)
+	return child, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop ==
+// InteropModeShared, then d's cached metadata must be up to date.
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+	for !rp.Final() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// Get updated metadata for rp.Start() as required by fs.stepLocked().
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return nil, err
+		}
+	}
+	for !rp.Done() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if rp.MustBeDir() && !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	start := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// Get updated metadata for start as required by
+		// fs.walkParentDirLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		return err
+	}
+	if parent.isDeleted() {
+		return syserror.ENOENT
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EEXIST
+	}
+	if len(name) > maxFilenameLen {
+		return syserror.ENAMETOOLONG
+	}
+	if !dir && rp.MustBeDir() {
+		return syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+	if fs.opts.interop == InteropModeShared {
+		// The existence of a dentry at name would be inconclusive because the
+		// file it represents may have been deleted from the remote filesystem,
+		// so we would need to make an RPC to revalidate the dentry. Just
+		// attempt the file creation RPC instead. If a file does exist, the RPC
+		// will fail with EEXIST like we would have. If the RPC succeeds, and a
+		// stale dentry exists, the dentry will fail revalidation next time
+		// it's used.
+		return create(parent, name)
+	}
+	if parent.vfsd.Child(name) != nil {
+		return syserror.EEXIST
+	}
+	// No cached dentry exists; however, there might still be an existing file
+	// at name. As above, we attempt the file creation RPC anyway.
+	if err := create(parent, name); err != nil {
+		return err
+	}
+	parent.touchCMtime(ctx)
+	delete(parent.negativeChildren, name)
+	parent.dirents = nil
+	return nil
+}
+
+// Preconditions: !rp.Done().
+func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	start := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// Get updated metadata for start as required by
+		// fs.walkParentDirLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+
+	name := rp.Component()
+	if dir {
+		if name == "." {
+			return syserror.EINVAL
+		}
+		if name == ".." {
+			return syserror.ENOTEMPTY
+		}
+	} else {
+		if name == "." || name == ".." {
+			return syserror.EISDIR
+		}
+	}
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+	childVFSD := parent.vfsd.Child(name)
+	var child *dentry
+	// We only need a dentry representing the file at name if it can be a mount
+	// point. If childVFSD is nil, then it can't be a mount point. If childVFSD
+	// is non-nil but stale, the actual file can't be a mount point either; we
+	// detect this case by just speculatively calling PrepareDeleteDentry and
+	// only revalidating the dentry if that fails (indicating that the existing
+	// dentry is a mount point).
+	if childVFSD != nil {
+		child = childVFSD.Impl().(*dentry)
+		if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
+			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, childVFSD, &ds)
+			if err != nil {
+				return err
+			}
+			if child != nil {
+				childVFSD = &child.vfsd
+				if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
+					return err
+				}
+			} else {
+				childVFSD = nil
+			}
+		}
+	} else if _, ok := parent.negativeChildren[name]; ok {
+		return syserror.ENOENT
+	}
+	flags := uint32(0)
+	if dir {
+		if child != nil && !child.isDir() {
+			return syserror.ENOTDIR
+		}
+		flags = linux.AT_REMOVEDIR
+	} else {
+		if child != nil && child.isDir() {
+			return syserror.EISDIR
+		}
+		if rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+	err = parent.file.unlinkAt(ctx, name, flags)
+	if err != nil {
+		if childVFSD != nil {
+			vfsObj.AbortDeleteDentry(childVFSD)
+		}
+		return err
+	}
+	if fs.opts.interop != InteropModeShared {
+		parent.touchCMtime(ctx)
+		parent.cacheNegativeChildLocked(name)
+		parent.dirents = nil
+	}
+	if child != nil {
+		child.setDeleted()
+		vfsObj.CommitDeleteDentry(childVFSD)
+		ds = appendDentry(ds, child)
+	}
+	return nil
+}
+
+// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls
+// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for
+// writing.
+//
+// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
+// but dentry slices are allocated lazily, and it's much easier to say "defer
+// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this.
+func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) {
+	fs.renameMu.RUnlock()
+	if *ds == nil {
+		return
+	}
+	if len(**ds) != 0 {
+		fs.renameMu.Lock()
+		for _, d := range **ds {
+			d.checkCachingLocked()
+		}
+		fs.renameMu.Unlock()
+	}
+	putDentrySlice(*ds)
+}
+
+func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) {
+	if *ds == nil {
+		fs.renameMu.Unlock()
+		return
+	}
+	for _, d := range **ds {
+		d.checkCachingLocked()
+	}
+	fs.renameMu.Unlock()
+	putDentrySlice(*ds)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable {
+		if !d.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+			return nil, err
+		}
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	start := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// Get updated metadata for start as required by
+		// fs.walkParentDirLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return nil, err
+		}
+	}
+	d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error {
+		if rp.Mount() != vd.Mount() {
+			return syserror.EXDEV
+		}
+		// 9P2000.L supports hard links, but we don't.
+		return syserror.EPERM
+	})
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
+		creds := rp.Credentials()
+		_, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+		return err
+	})
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+		creds := rp.Credentials()
+		_, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+		return err
+	})
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// Reject O_TMPFILE, which is not supported; supporting it correctly in the
+	// presence of other remote filesystem users requires remote filesystem
+	// support, and it isn't clear that there's any way to implement this in
+	// 9P.
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		return nil, syserror.EOPNOTSUPP
+	}
+	mayCreate := opts.Flags&linux.O_CREAT != 0
+	mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)
+
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+
+	start := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// Get updated metadata for start as required by fs.stepLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return nil, err
+		}
+	}
+	if rp.Done() {
+		return start.openLocked(ctx, rp, opts.Flags)
+	}
+
+afterTrailingSymlink:
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		return nil, err
+	}
+	// Determine whether or not we need to create a file.
+	parent.dirMu.Lock()
+	child, err := fs.stepLocked(ctx, rp, parent, &ds)
+	if err == syserror.ENOENT && mayCreate {
+		fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts)
+		parent.dirMu.Unlock()
+		return fd, err
+	}
+	if err != nil {
+		parent.dirMu.Unlock()
+		return nil, err
+	}
+	// Open existing child or follow symlink.
+	parent.dirMu.Unlock()
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	if child.isSymlink() && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx, rp.Mount())
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		start = parent
+		goto afterTrailingSymlink
+	}
+	return child.openLocked(ctx, rp, opts.Flags)
+}
+
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags uint32) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(flags)
+	if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil {
+		return nil, err
+	}
+	mnt := rp.Mount()
+	filetype := d.fileType()
+	switch {
+	case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD:
+		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0); err != nil {
+			return nil, err
+		}
+		fd := &regularFileFD{}
+		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+			AllowDirectIO: true,
+		}); err != nil {
+			return nil, err
+		}
+		return &fd.vfsfd, nil
+	case filetype == linux.S_IFDIR:
+		// Can't open directories with O_CREAT.
+		if flags&linux.O_CREAT != 0 {
+			return nil, syserror.EISDIR
+		}
+		// Can't open directories writably.
+		if ats&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		if flags&linux.O_DIRECT != 0 {
+			return nil, syserror.EINVAL
+		}
+		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
+			return nil, err
+		}
+		fd := &directoryFD{}
+		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			return nil, err
+		}
+		return &fd.vfsfd, nil
+	case filetype == linux.S_IFLNK:
+		// Can't open symlinks without O_PATH (which is unimplemented).
+		return nil, syserror.ELOOP
+	default:
+		if flags&linux.O_DIRECT != 0 {
+			return nil, syserror.EINVAL
+		}
+		h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0)
+		if err != nil {
+			return nil, err
+		}
+		fd := &specialFileFD{
+			handle: h,
+		}
+		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			h.close(ctx)
+			return nil, err
+		}
+		return &fd.vfsfd, nil
+	}
+}
+
+// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
+func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+		return nil, err
+	}
+	if d.isDeleted() {
+		return nil, syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return nil, err
+	}
+	defer mnt.EndWrite()
+
+	// 9P2000.L's lcreate takes a fid representing the parent directory, and
+	// converts it into an open fid representing the created file, so we need
+	// to duplicate the directory fid first.
+	_, dirfile, err := d.file.walk(ctx, nil)
+	if err != nil {
+		return nil, err
+	}
+	creds := rp.Credentials()
+	name := rp.Component()
+	fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, (p9.OpenFlags)(opts.Flags), (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+	if err != nil {
+		dirfile.close(ctx)
+		return nil, err
+	}
+	// Then we need to walk to the file we just created to get a non-open fid
+	// representing it, and to get its metadata. This must use d.file since, as
+	// explained above, dirfile was invalidated by dirfile.Create().
+	walkQID, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name)
+	if err != nil {
+		openFile.close(ctx)
+		if fdobj != nil {
+			fdobj.Close()
+		}
+		return nil, err
+	}
+	// Sanity-check that we walked to the file we created.
+	if createQID.Path != walkQID.Path {
+		// Probably due to concurrent remote filesystem mutation?
+		ctx.Warningf("gofer.dentry.createAndOpenChildLocked: created file has QID %v before walk, QID %v after (interop=%v)", createQID, walkQID, d.fs.opts.interop)
+		nonOpenFile.close(ctx)
+		openFile.close(ctx)
+		if fdobj != nil {
+			fdobj.Close()
+		}
+		return nil, syserror.EAGAIN
+	}
+
+	// Construct the new dentry.
+	child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr)
+	if err != nil {
+		nonOpenFile.close(ctx)
+		openFile.close(ctx)
+		if fdobj != nil {
+			fdobj.Close()
+		}
+		return nil, err
+	}
+	// Incorporate the fid that was opened by lcreate.
+	useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
+	if useRegularFileFD {
+		child.handleMu.Lock()
+		child.handle.file = openFile
+		if fdobj != nil {
+			child.handle.fd = int32(fdobj.Release())
+		}
+		child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags)
+		child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags)
+		child.handleMu.Unlock()
+	}
+	// Take a reference on the new dentry to be held by the new file
+	// description. (This reference also means that the new dentry is not
+	// eligible for caching yet, so we don't need to append to a dentry slice.)
+	child.refs = 1
+	// Insert the dentry into the tree.
+	d.IncRef() // reference held by child on its parent d
+	d.vfsd.InsertChild(&child.vfsd, name)
+	if d.fs.opts.interop != InteropModeShared {
+		d.touchCMtime(ctx)
+		delete(d.negativeChildren, name)
+		d.dirents = nil
+	}
+
+	// Finally, construct a file description representing the created file.
+	var childVFSFD *vfs.FileDescription
+	mnt.IncRef()
+	if useRegularFileFD {
+		fd := &regularFileFD{}
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
+			AllowDirectIO: true,
+		}); err != nil {
+			return nil, err
+		}
+		childVFSFD = &fd.vfsfd
+	} else {
+		fd := &specialFileFD{
+			handle: handle{
+				file: openFile,
+				fd:   -1,
+			},
+		}
+		if fdobj != nil {
+			fd.handle.fd = int32(fdobj.Release())
+		}
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			fd.handle.close(ctx)
+			return nil, err
+		}
+		childVFSFD = &fd.vfsfd
+	}
+	return childVFSFD, nil
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return "", err
+	}
+	if !d.isSymlink() {
+		return "", syserror.EINVAL
+	}
+	return d.readlink(ctx, rp.Mount())
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	if opts.Flags != 0 {
+		// Requires 9P support.
+		return syserror.EINVAL
+	}
+
+	var ds *[]*dentry
+	fs.renameMu.Lock()
+	defer fs.renameMuUnlockAndCheckCaching(&ds)
+	newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
+	if err != nil {
+		return err
+	}
+	newName := rp.Component()
+	if newName == "." || newName == ".." {
+		return syserror.EBUSY
+	}
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	oldParent := oldParentVD.Dentry().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		if err := oldParent.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		return err
+	}
+	vfsObj := rp.VirtualFilesystem()
+	// We need a dentry representing the renamed file since, if it's a
+	// directory, we need to check for write permission on it.
+	oldParent.dirMu.Lock()
+	defer oldParent.dirMu.Unlock()
+	renamed, err := fs.revalidateChildLocked(ctx, vfsObj, oldParent, oldName, oldParent.vfsd.Child(oldName), &ds)
+	if err != nil {
+		return err
+	}
+	if renamed == nil {
+		return syserror.ENOENT
+	}
+	if renamed.isDir() {
+		if renamed == newParent || renamed.vfsd.IsAncestorOf(&newParent.vfsd) {
+			return syserror.EINVAL
+		}
+		if oldParent != newParent {
+			if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+				return err
+			}
+		}
+	} else {
+		if opts.MustBeDir || rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+
+	if oldParent != newParent {
+		if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+			return err
+		}
+		newParent.dirMu.Lock()
+		defer newParent.dirMu.Unlock()
+	}
+	if newParent.isDeleted() {
+		return syserror.ENOENT
+	}
+	replacedVFSD := newParent.vfsd.Child(newName)
+	var replaced *dentry
+	// This is similar to unlinkAt, except:
+	//
+	// - We revalidate the replaced dentry unconditionally for simplicity.
+	//
+	// - If rp.MustBeDir(), then we need a dentry representing the replaced
+	// file regardless to confirm that it's a directory.
+	if replacedVFSD != nil || rp.MustBeDir() {
+		replaced, err = fs.revalidateChildLocked(ctx, vfsObj, newParent, newName, replacedVFSD, &ds)
+		if err != nil {
+			return err
+		}
+		if replaced != nil {
+			if replaced.isDir() {
+				if !renamed.isDir() {
+					return syserror.EISDIR
+				}
+			} else {
+				if rp.MustBeDir() || renamed.isDir() {
+					return syserror.ENOTDIR
+				}
+			}
+			replacedVFSD = &replaced.vfsd
+		} else {
+			replacedVFSD = nil
+		}
+	}
+
+	if oldParent == newParent && oldName == newName {
+		return nil
+	}
+	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), &renamed.vfsd, replacedVFSD); err != nil {
+		return err
+	}
+	if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
+		vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+		return err
+	}
+	if fs.opts.interop != InteropModeShared {
+		oldParent.cacheNegativeChildLocked(oldName)
+		oldParent.dirents = nil
+		delete(newParent.negativeChildren, newName)
+		newParent.dirents = nil
+	}
+	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD)
+	return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	return fs.unlinkAt(ctx, rp, true /* dir */)
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount())
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	// Since walking updates metadata for all traversed dentries under
+	// InteropModeShared, including the returned one, we can return cached
+	// metadata here regardless of fs.opts.interop.
+	var stat linux.Statx
+	d.statTo(&stat)
+	return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	fsstat, err := d.file.statFS(ctx)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	nameLen := uint64(fsstat.NameLength)
+	if nameLen > maxFilenameLen {
+		nameLen = maxFilenameLen
+	}
+	return linux.Statfs{
+		// This is primarily for distinguishing a gofer file system in
+		// tests. Testing is important, so instead of defining
+		// something completely random, use a standard value.
+		Type:            linux.V9FS_MAGIC,
+		BlockSize:       int64(fsstat.BlockSize),
+		Blocks:          fsstat.Blocks,
+		BlocksFree:      fsstat.BlocksFree,
+		BlocksAvailable: fsstat.BlocksAvailable,
+		Files:           fsstat.Files,
+		FilesFree:       fsstat.FilesFree,
+		NameLength:      nameLen,
+	}, nil
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+		creds := rp.Credentials()
+		_, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+		return err
+	})
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	return fs.unlinkAt(ctx, rp, false /* dir */)
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	return d.listxattr(ctx)
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return "", err
+	}
+	return d.getxattr(ctx, name)
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return d.setxattr(ctx, &opts)
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return d.removexattr(ctx, name)
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.renameMu.RLock()
+	defer fs.renameMu.RUnlock()
+	return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
new file mode 100644
index 000000000..d0552bd99
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -0,0 +1,1147 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gofer provides a filesystem implementation that is backed by a 9p
+// server, interchangably referred to as "gofers" throughout this package.
+//
+// Lock order:
+//   regularFileFD/directoryFD.mu
+//     filesystem.renameMu
+//       dentry.dirMu
+//         filesystem.syncMu
+//         dentry.metadataMu
+//           *** "memmap.Mappable locks" below this point
+//           dentry.mapsMu
+//             *** "memmap.Mappable locks taken by Translate" below this point
+//             dentry.handleMu
+//               dentry.dataMu
+//
+// Locking dentry.dirMu in multiple dentries requires holding
+// filesystem.renameMu for writing.
+package gofer
+
+import (
+	"fmt"
+	"strconv"
+	"sync"
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	vfsfs vfs.Filesystem
+
+	// mfp is used to allocate memory that caches regular file contents. mfp is
+	// immutable.
+	mfp pgalloc.MemoryFileProvider
+
+	// Immutable options.
+	opts filesystemOptions
+
+	// client is the client used by this filesystem. client is immutable.
+	client *p9.Client
+
+	// uid and gid are the effective KUID and KGID of the filesystem's creator,
+	// and are used as the owner and group for files that don't specify one.
+	// uid and gid are immutable.
+	uid auth.KUID
+	gid auth.KGID
+
+	// renameMu serves two purposes:
+	//
+	// - It synchronizes path resolution with renaming initiated by this
+	// client.
+	//
+	// - It is held by path resolution to ensure that reachable dentries remain
+	// valid. A dentry is reachable by path resolution if it has a non-zero
+	// reference count (such that it is usable as vfs.ResolvingPath.Start() or
+	// is reachable from its children), or if it is a child dentry (such that
+	// it is reachable from its parent).
+	renameMu sync.RWMutex
+
+	// cachedDentries contains all dentries with 0 references. (Due to race
+	// conditions, it may also contain dentries with non-zero references.)
+	// cachedDentriesLen is the number of dentries in cachedDentries. These
+	// fields are protected by renameMu.
+	cachedDentries    dentryList
+	cachedDentriesLen uint64
+
+	// dentries contains all dentries in this filesystem. specialFileFDs
+	// contains all open specialFileFDs. These fields are protected by syncMu.
+	syncMu         sync.Mutex
+	dentries       map[*dentry]struct{}
+	specialFileFDs map[*specialFileFD]struct{}
+}
+
+type filesystemOptions struct {
+	// "Standard" 9P options.
+	fd      int
+	aname   string
+	interop InteropMode // derived from the "cache" mount option
+	msize   uint32
+	version string
+
+	// maxCachedDentries is the maximum number of dentries with 0 references
+	// retained by the client.
+	maxCachedDentries uint64
+
+	// If forcePageCache is true, host FDs may not be used for application
+	// memory mappings even if available; instead, the client must perform its
+	// own caching of regular file pages. This is primarily useful for testing.
+	forcePageCache bool
+
+	// If limitHostFDTranslation is true, apply maxFillRange() constraints to
+	// host FD mappings returned by dentry.(memmap.Mappable).Translate(). This
+	// makes memory accounting behavior more consistent between cases where
+	// host FDs are / are not available, but may increase the frequency of
+	// sentry-handled page faults on files for which a host FD is available.
+	limitHostFDTranslation bool
+
+	// If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
+	// filesystem may not be coherent with writable host FDs opened later, so
+	// mappings of the former must be replaced by mappings of the latter. This
+	// is usually only the case when the remote filesystem is an overlayfs
+	// mount on Linux < 4.19.
+	overlayfsStaleRead bool
+
+	// If regularFilesUseSpecialFileFD is true, application FDs representing
+	// regular files will use distinct file handles for each FD, in the same
+	// way that application FDs representing "special files" such as sockets
+	// do. Note that this disables client caching and mmap for regular files.
+	regularFilesUseSpecialFileFD bool
+}
+
+// InteropMode controls the client's interaction with other remote filesystem
+// users.
+type InteropMode uint32
+
+const (
+	// InteropModeExclusive is appropriate when the filesystem client is the
+	// only user of the remote filesystem.
+	//
+	// - The client may cache arbitrary filesystem state (file data, metadata,
+	// filesystem structure, etc.).
+	//
+	// - Client changes to filesystem state may be sent to the remote
+	// filesystem asynchronously, except when server permission checks are
+	// necessary.
+	//
+	// - File timestamps are based on client clocks. This ensures that users of
+	// the client observe timestamps that are coherent with their own clocks
+	// and consistent with Linux's semantics. However, since it is not always
+	// possible for clients to set arbitrary atimes and mtimes, and never
+	// possible for clients to set arbitrary ctimes, file timestamp changes are
+	// stored in the client only and never sent to the remote filesystem.
+	InteropModeExclusive InteropMode = iota
+
+	// InteropModeWritethrough is appropriate when there are read-only users of
+	// the remote filesystem that expect to observe changes made by the
+	// filesystem client.
+	//
+	// - The client may cache arbitrary filesystem state.
+	//
+	// - Client changes to filesystem state must be sent to the remote
+	// filesystem synchronously.
+	//
+	// - File timestamps are based on client clocks. As a corollary, access
+	// timestamp changes from other remote filesystem users will not be visible
+	// to the client.
+	InteropModeWritethrough
+
+	// InteropModeShared is appropriate when there are users of the remote
+	// filesystem that may mutate its state other than the client.
+	//
+	// - The client must verify cached filesystem state before using it.
+	//
+	// - Client changes to filesystem state must be sent to the remote
+	// filesystem synchronously.
+	//
+	// - File timestamps are based on server clocks. This is necessary to
+	// ensure that timestamp changes are synchronized between remote filesystem
+	// users.
+	//
+	// Note that the correctness of InteropModeShared depends on the server
+	// correctly implementing 9P fids (i.e. each fid immutably represents a
+	// single filesystem object), even in the presence of remote filesystem
+	// mutations from other users. If this is violated, the behavior of the
+	// client is undefined.
+	InteropModeShared
+)
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
+	if mfp == nil {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider")
+		return nil, nil, syserror.EINVAL
+	}
+
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	var fsopts filesystemOptions
+
+	// Check that the transport is "fd".
+	trans, ok := mopts["trans"]
+	if !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'")
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "trans")
+	if trans != "fd" {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Check that read and write FDs are provided and identical.
+	rfdstr, ok := mopts["rfdno"]
+	if !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=<file descriptor>")
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "rfdno")
+	rfd, err := strconv.Atoi(rfdstr)
+	if err != nil {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr)
+		return nil, nil, syserror.EINVAL
+	}
+	wfdstr, ok := mopts["wfdno"]
+	if !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=<file descriptor>")
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "wfdno")
+	wfd, err := strconv.Atoi(wfdstr)
+	if err != nil {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr)
+		return nil, nil, syserror.EINVAL
+	}
+	if rfd != wfd {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
+		return nil, nil, syserror.EINVAL
+	}
+	fsopts.fd = rfd
+
+	// Get the attach name.
+	fsopts.aname = "/"
+	if aname, ok := mopts["aname"]; ok {
+		delete(mopts, "aname")
+		fsopts.aname = aname
+	}
+
+	// Parse the cache policy. For historical reasons, this defaults to the
+	// least generally-applicable option, InteropModeExclusive.
+	fsopts.interop = InteropModeExclusive
+	if cache, ok := mopts["cache"]; ok {
+		delete(mopts, "cache")
+		switch cache {
+		case "fscache":
+			fsopts.interop = InteropModeExclusive
+		case "fscache_writethrough":
+			fsopts.interop = InteropModeWritethrough
+		case "none":
+			fsopts.regularFilesUseSpecialFileFD = true
+			fallthrough
+		case "remote_revalidating":
+			fsopts.interop = InteropModeShared
+		default:
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: cache=%s", cache)
+			return nil, nil, syserror.EINVAL
+		}
+	}
+
+	// Parse the 9P message size.
+	fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M
+	if msizestr, ok := mopts["msize"]; ok {
+		delete(mopts, "msize")
+		msize, err := strconv.ParseUint(msizestr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: msize=%s", msizestr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.msize = uint32(msize)
+	}
+
+	// Parse the 9P protocol version.
+	fsopts.version = p9.HighestVersionString()
+	if version, ok := mopts["version"]; ok {
+		delete(mopts, "version")
+		fsopts.version = version
+	}
+
+	// Parse the dentry cache limit.
+	fsopts.maxCachedDentries = 1000
+	if str, ok := mopts["dentry_cache_limit"]; ok {
+		delete(mopts, "dentry_cache_limit")
+		maxCachedDentries, err := strconv.ParseUint(str, 10, 64)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.maxCachedDentries = maxCachedDentries
+	}
+
+	// Handle simple flags.
+	if _, ok := mopts["force_page_cache"]; ok {
+		delete(mopts, "force_page_cache")
+		fsopts.forcePageCache = true
+	}
+	if _, ok := mopts["limit_host_fd_translation"]; ok {
+		delete(mopts, "limit_host_fd_translation")
+		fsopts.limitHostFDTranslation = true
+	}
+	if _, ok := mopts["overlayfs_stale_read"]; ok {
+		delete(mopts, "overlayfs_stale_read")
+		fsopts.overlayfsStaleRead = true
+	}
+	// fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying
+	// "cache=none".
+
+	// Check for unparsed options.
+	if len(mopts) != 0 {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Establish a connection with the server.
+	conn, err := unet.NewSocket(fsopts.fd)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// Perform version negotiation with the server.
+	ctx.UninterruptibleSleepStart(false)
+	client, err := p9.NewClient(conn, fsopts.msize, fsopts.version)
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		conn.Close()
+		return nil, nil, err
+	}
+	// Ownership of conn has been transferred to client.
+
+	// Perform attach to obtain the filesystem root.
+	ctx.UninterruptibleSleepStart(false)
+	attached, err := client.Attach(fsopts.aname)
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		client.Close()
+		return nil, nil, err
+	}
+	attachFile := p9file{attached}
+	qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
+	if err != nil {
+		attachFile.close(ctx)
+		client.Close()
+		return nil, nil, err
+	}
+
+	// Construct the filesystem object.
+	fs := &filesystem{
+		mfp:            mfp,
+		opts:           fsopts,
+		uid:            creds.EffectiveKUID,
+		gid:            creds.EffectiveKGID,
+		client:         client,
+		dentries:       make(map[*dentry]struct{}),
+		specialFileFDs: make(map[*specialFileFD]struct{}),
+	}
+	fs.vfsfs.Init(vfsObj, fs)
+
+	// Construct the root dentry.
+	root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr)
+	if err != nil {
+		attachFile.close(ctx)
+		fs.vfsfs.DecRef()
+		return nil, nil, err
+	}
+	// Set the root's reference count to 2. One reference is returned to the
+	// caller, and the other is deliberately leaked to prevent the root from
+	// being "cached" and subsequently evicted. Its resources will still be
+	// cleaned up by fs.Release().
+	root.refs = 2
+
+	return &fs.vfsfs, &root.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {
+	ctx := context.Background()
+	mf := fs.mfp.MemoryFile()
+
+	fs.syncMu.Lock()
+	for d := range fs.dentries {
+		d.handleMu.Lock()
+		d.dataMu.Lock()
+		if d.handleWritable {
+			// Write dirty cached data to the remote file.
+			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt); err != nil {
+				log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
+			}
+			// TODO(jamieliu): Do we need to flushf/fsync d?
+		}
+		// Discard cached pages.
+		d.cache.DropAll(mf)
+		d.dirty.RemoveAll()
+		d.dataMu.Unlock()
+		// Close the host fd if one exists.
+		if d.handle.fd >= 0 {
+			syscall.Close(int(d.handle.fd))
+			d.handle.fd = -1
+		}
+		d.handleMu.Unlock()
+	}
+	// There can't be any specialFileFDs still using fs, since each such
+	// FileDescription would hold a reference on a Mount holding a reference on
+	// fs.
+	fs.syncMu.Unlock()
+
+	// Close the connection to the server. This implicitly clunks all fids.
+	fs.client.Close()
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+	vfsd vfs.Dentry
+
+	// refs is the reference count. Each dentry holds a reference on its
+	// parent, even if disowned. refs is accessed using atomic memory
+	// operations.
+	refs int64
+
+	// fs is the owning filesystem. fs is immutable.
+	fs *filesystem
+
+	// We don't support hard links, so each dentry maps 1:1 to an inode.
+
+	// file is the unopened p9.File that backs this dentry. file is immutable.
+	file p9file
+
+	// If deleted is non-zero, the file represented by this dentry has been
+	// deleted. deleted is accessed using atomic memory operations.
+	deleted uint32
+
+	// If cached is true, dentryEntry links dentry into
+	// filesystem.cachedDentries. cached and dentryEntry are protected by
+	// filesystem.renameMu.
+	cached bool
+	dentryEntry
+
+	dirMu sync.Mutex
+
+	// If this dentry represents a directory, and InteropModeShared is not in
+	// effect, negativeChildren is a set of child names in this directory that
+	// are known not to exist. negativeChildren is protected by dirMu.
+	negativeChildren map[string]struct{}
+
+	// If this dentry represents a directory, InteropModeShared is not in
+	// effect, and dirents is not nil, it is a cache of all entries in the
+	// directory, in the order they were returned by the server. dirents is
+	// protected by dirMu.
+	dirents []vfs.Dirent
+
+	// Cached metadata; protected by metadataMu and accessed using atomic
+	// memory operations unless otherwise specified.
+	metadataMu sync.Mutex
+	ino        uint64 // immutable
+	mode       uint32 // type is immutable, perms are mutable
+	uid        uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid        uint32 // auth.KGID, but ...
+	blockSize  uint32 // 0 if unknown
+	// Timestamps, all nsecs from the Unix epoch.
+	atime int64
+	mtime int64
+	ctime int64
+	btime int64
+	// File size, protected by both metadataMu and dataMu (i.e. both must be
+	// locked to mutate it).
+	size uint64
+
+	mapsMu sync.Mutex
+
+	// If this dentry represents a regular file, mappings tracks mappings of
+	// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
+	mappings memmap.MappingSet
+
+	// If this dentry represents a regular file or directory:
+	//
+	// - handle is the I/O handle used by all regularFileFDs/directoryFDs
+	// representing this dentry.
+	//
+	// - handleReadable is true if handle is readable.
+	//
+	// - handleWritable is true if handle is writable.
+	//
+	// Invariants:
+	//
+	// - If handleReadable == handleWritable == false, then handle.file == nil
+	// (i.e. there is no open handle). Conversely, if handleReadable ||
+	// handleWritable == true, then handle.file != nil (i.e. there is an open
+	// handle).
+	//
+	// - handleReadable and handleWritable cannot transition from true to false
+	// (i.e. handles may not be downgraded).
+	//
+	// These fields are protected by handleMu.
+	handleMu       sync.RWMutex
+	handle         handle
+	handleReadable bool
+	handleWritable bool
+
+	dataMu sync.RWMutex
+
+	// If this dentry represents a regular file that is client-cached, cache
+	// maps offsets into the cached file to offsets into
+	// filesystem.mfp.MemoryFile() that store the file's data. cache is
+	// protected by dataMu.
+	cache fsutil.FileRangeSet
+
+	// If this dentry represents a regular file that is client-cached, dirty
+	// tracks dirty segments in cache. dirty is protected by dataMu.
+	dirty fsutil.DirtySet
+
+	// pf implements platform.File for mappings of handle.fd.
+	pf dentryPlatformFile
+
+	// If this dentry represents a symbolic link, InteropModeShared is not in
+	// effect, and haveTarget is true, target is the symlink target. haveTarget
+	// and target are protected by dataMu.
+	haveTarget bool
+	target     string
+}
+
+// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
+// gofer client.
+func dentryAttrMask() p9.AttrMask {
+	return p9.AttrMask{
+		Mode:  true,
+		UID:   true,
+		GID:   true,
+		ATime: true,
+		MTime: true,
+		CTime: true,
+		Size:  true,
+		BTime: true,
+	}
+}
+
+// newDentry creates a new dentry representing the given file. The dentry
+// initially has no references, but is not cached; it is the caller's
+// responsibility to set the dentry's reference count and/or call
+// dentry.checkCachingLocked() as appropriate.
+func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) {
+	if !mask.Mode {
+		ctx.Warningf("can't create gofer.dentry without file type")
+		return nil, syserror.EIO
+	}
+	if attr.Mode.FileType() == p9.ModeRegular && !mask.Size {
+		ctx.Warningf("can't create regular file gofer.dentry without file size")
+		return nil, syserror.EIO
+	}
+
+	d := &dentry{
+		fs:        fs,
+		file:      file,
+		ino:       qid.Path,
+		mode:      uint32(attr.Mode),
+		uid:       uint32(fs.uid),
+		gid:       uint32(fs.gid),
+		blockSize: usermem.PageSize,
+		handle: handle{
+			fd: -1,
+		},
+	}
+	d.pf.dentry = d
+	if mask.UID {
+		d.uid = uint32(attr.UID)
+	}
+	if mask.GID {
+		d.gid = uint32(attr.GID)
+	}
+	if mask.Size {
+		d.size = attr.Size
+	}
+	if attr.BlockSize != 0 {
+		d.blockSize = uint32(attr.BlockSize)
+	}
+	if mask.ATime {
+		d.atime = dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)
+	}
+	if mask.MTime {
+		d.mtime = dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)
+	}
+	if mask.CTime {
+		d.ctime = dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)
+	}
+	if mask.BTime {
+		d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)
+	}
+	d.vfsd.Init(d)
+
+	fs.syncMu.Lock()
+	fs.dentries[d] = struct{}{}
+	fs.syncMu.Unlock()
+	return d, nil
+}
+
+// updateFromP9Attrs is called to update d's metadata after an update from the
+// remote filesystem.
+func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
+	d.metadataMu.Lock()
+	if mask.Mode {
+		if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want {
+			d.metadataMu.Unlock()
+			panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got))
+		}
+		atomic.StoreUint32(&d.mode, uint32(attr.Mode))
+	}
+	if mask.UID {
+		atomic.StoreUint32(&d.uid, uint32(attr.UID))
+	}
+	if mask.GID {
+		atomic.StoreUint32(&d.gid, uint32(attr.GID))
+	}
+	// There is no P9_GETATTR_* bit for I/O block size.
+	if attr.BlockSize != 0 {
+		atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize))
+	}
+	if mask.ATime {
+		atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds))
+	}
+	if mask.MTime {
+		atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds))
+	}
+	if mask.CTime {
+		atomic.StoreInt64(&d.ctime, dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds))
+	}
+	if mask.BTime {
+		atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds))
+	}
+	if mask.Size {
+		d.dataMu.Lock()
+		atomic.StoreUint64(&d.size, attr.Size)
+		d.dataMu.Unlock()
+	}
+	d.metadataMu.Unlock()
+}
+
+func (d *dentry) updateFromGetattr(ctx context.Context) error {
+	// Use d.handle.file, which represents a 9P fid that has been opened, in
+	// preference to d.file, which represents a 9P fid that has not. This may
+	// be significantly more efficient in some implementations.
+	var (
+		file            p9file
+		handleMuRLocked bool
+	)
+	d.handleMu.RLock()
+	if !d.handle.file.isNil() {
+		file = d.handle.file
+		handleMuRLocked = true
+	} else {
+		file = d.file
+		d.handleMu.RUnlock()
+	}
+	_, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask())
+	if handleMuRLocked {
+		d.handleMu.RUnlock()
+	}
+	if err != nil {
+		return err
+	}
+	d.updateFromP9Attrs(attrMask, &attr)
+	return nil
+}
+
+func (d *dentry) fileType() uint32 {
+	return atomic.LoadUint32(&d.mode) & linux.S_IFMT
+}
+
+func (d *dentry) statTo(stat *linux.Statx) {
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
+	stat.Blksize = atomic.LoadUint32(&d.blockSize)
+	stat.Nlink = 1
+	if d.isDir() {
+		stat.Nlink = 2
+	}
+	stat.UID = atomic.LoadUint32(&d.uid)
+	stat.GID = atomic.LoadUint32(&d.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
+	stat.Ino = d.ino
+	stat.Size = atomic.LoadUint64(&d.size)
+	// This is consistent with regularFileFD.Seek(), which treats regular files
+	// as having no holes.
+	stat.Blocks = (stat.Size + 511) / 512
+	stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime))
+	stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime))
+	stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime))
+	stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime))
+	// TODO(jamieliu): device number
+}
+
+func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error {
+	if stat.Mask == 0 {
+		return nil
+	}
+	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
+		return syserror.EPERM
+	}
+	if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+		return err
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	setLocalAtime := false
+	setLocalMtime := false
+	if d.fs.opts.interop != InteropModeShared {
+		// Timestamp updates will be handled locally.
+		setLocalAtime = stat.Mask&linux.STATX_ATIME != 0
+		setLocalMtime = stat.Mask&linux.STATX_MTIME != 0
+		stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME
+		if !setLocalMtime && (stat.Mask&linux.STATX_SIZE != 0) {
+			// Truncate updates mtime.
+			setLocalMtime = true
+			stat.Mtime.Nsec = linux.UTIME_NOW
+		}
+	}
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+	if stat.Mask != 0 {
+		if err := d.file.setAttr(ctx, p9.SetAttrMask{
+			Permissions:        stat.Mask&linux.STATX_MODE != 0,
+			UID:                stat.Mask&linux.STATX_UID != 0,
+			GID:                stat.Mask&linux.STATX_GID != 0,
+			Size:               stat.Mask&linux.STATX_SIZE != 0,
+			ATime:              stat.Mask&linux.STATX_ATIME != 0,
+			MTime:              stat.Mask&linux.STATX_MTIME != 0,
+			ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW,
+			MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW,
+		}, p9.SetAttr{
+			Permissions:      p9.FileMode(stat.Mode),
+			UID:              p9.UID(stat.UID),
+			GID:              p9.GID(stat.GID),
+			Size:             stat.Size,
+			ATimeSeconds:     uint64(stat.Atime.Sec),
+			ATimeNanoSeconds: uint64(stat.Atime.Nsec),
+			MTimeSeconds:     uint64(stat.Mtime.Sec),
+			MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
+		}); err != nil {
+			return err
+		}
+	}
+	if d.fs.opts.interop == InteropModeShared {
+		// There's no point to updating d's metadata in this case since it'll
+		// be overwritten by revalidation before the next time it's used
+		// anyway. (InteropModeShared inhibits client caching of regular file
+		// data, so there's no cache to truncate either.)
+		return nil
+	}
+	now, haveNow := nowFromContext(ctx)
+	if !haveNow {
+		ctx.Warningf("gofer.dentry.setStat: current time not available")
+	}
+	if stat.Mask&linux.STATX_MODE != 0 {
+		atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
+	}
+	if stat.Mask&linux.STATX_UID != 0 {
+		atomic.StoreUint32(&d.uid, stat.UID)
+	}
+	if stat.Mask&linux.STATX_GID != 0 {
+		atomic.StoreUint32(&d.gid, stat.GID)
+	}
+	if setLocalAtime {
+		if stat.Atime.Nsec == linux.UTIME_NOW {
+			if haveNow {
+				atomic.StoreInt64(&d.atime, now)
+			}
+		} else {
+			atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
+		}
+	}
+	if setLocalMtime {
+		if stat.Mtime.Nsec == linux.UTIME_NOW {
+			if haveNow {
+				atomic.StoreInt64(&d.mtime, now)
+			}
+		} else {
+			atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
+		}
+	}
+	if haveNow {
+		atomic.StoreInt64(&d.ctime, now)
+	}
+	if stat.Mask&linux.STATX_SIZE != 0 {
+		d.dataMu.Lock()
+		oldSize := d.size
+		d.size = stat.Size
+		// d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
+		// below. This allows concurrent calls to Read/Translate/etc. These
+		// functions synchronize with truncation by refusing to use cache
+		// contents beyond the new d.size. (We are still holding d.metadataMu,
+		// so we can't race with Write or another truncate.)
+		d.dataMu.Unlock()
+		if d.size < oldSize {
+			oldpgend := pageRoundUp(oldSize)
+			newpgend := pageRoundUp(d.size)
+			if oldpgend != newpgend {
+				d.mapsMu.Lock()
+				d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+					// Compare Linux's mm/truncate.c:truncate_setsize() =>
+					// truncate_pagecache() =>
+					// mm/memory.c:unmap_mapping_range(evencows=1).
+					InvalidatePrivate: true,
+				})
+				d.mapsMu.Unlock()
+			}
+			// We are now guaranteed that there are no translations of
+			// truncated pages, and can remove them from the cache. Since
+			// truncated pages have been removed from the remote file, they
+			// should be dropped without being written back.
+			d.dataMu.Lock()
+			d.cache.Truncate(d.size, d.fs.mfp.MemoryFile())
+			d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend})
+			d.dataMu.Unlock()
+		}
+	}
+	return nil
+}
+
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&d.mode))&0777, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+	// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
+	// d.checkCachingLocked().
+	atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&d.refs)
+		if refs == 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef() {
+	if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+		d.fs.renameMu.Lock()
+		d.checkCachingLocked()
+		d.fs.renameMu.Unlock()
+	} else if refs < 0 {
+		panic("gofer.dentry.DecRef() called without holding a reference")
+	}
+}
+
+// checkCachingLocked should be called after d's reference count becomes 0 or it
+// becomes disowned.
+//
+// Preconditions: d.fs.renameMu must be locked for writing.
+func (d *dentry) checkCachingLocked() {
+	// Dentries with a non-zero reference count must be retained. (The only way
+	// to obtain a reference on a dentry with zero references is via path
+	// resolution, which requires renameMu, so if d.refs is zero then it will
+	// remain zero while we hold renameMu for writing.)
+	if atomic.LoadInt64(&d.refs) != 0 {
+		if d.cached {
+			d.fs.cachedDentries.Remove(d)
+			d.fs.cachedDentriesLen--
+			d.cached = false
+		}
+		return
+	}
+	// Non-child dentries with zero references are no longer reachable by path
+	// resolution and should be dropped immediately.
+	if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() {
+		if d.cached {
+			d.fs.cachedDentries.Remove(d)
+			d.fs.cachedDentriesLen--
+			d.cached = false
+		}
+		d.destroyLocked()
+		return
+	}
+	// If d is already cached, just move it to the front of the LRU.
+	if d.cached {
+		d.fs.cachedDentries.Remove(d)
+		d.fs.cachedDentries.PushFront(d)
+		return
+	}
+	// Cache the dentry, then evict the least recently used cached dentry if
+	// the cache becomes over-full.
+	d.fs.cachedDentries.PushFront(d)
+	d.fs.cachedDentriesLen++
+	d.cached = true
+	if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries {
+		victim := d.fs.cachedDentries.Back()
+		d.fs.cachedDentries.Remove(victim)
+		d.fs.cachedDentriesLen--
+		victim.cached = false
+		// victim.refs may have become non-zero from an earlier path
+		// resolution since it was inserted into fs.cachedDentries; see
+		// dentry.incRefLocked(). Either way, we brought
+		// fs.cachedDentriesLen back down to fs.opts.maxCachedDentries, so
+		// we don't loop.
+		if atomic.LoadInt64(&victim.refs) == 0 {
+			if victimParentVFSD := victim.vfsd.Parent(); victimParentVFSD != nil {
+				victimParent := victimParentVFSD.Impl().(*dentry)
+				victimParent.dirMu.Lock()
+				if !victim.vfsd.IsDisowned() {
+					// victim can't be a mount point (in any mount
+					// namespace), since VFS holds references on mount
+					// points.
+					d.fs.vfsfs.VirtualFilesystem().ForceDeleteDentry(&victim.vfsd)
+					// We're only deleting the dentry, not the file it
+					// represents, so we don't need to update
+					// victimParent.dirents etc.
+				}
+				victimParent.dirMu.Unlock()
+			}
+			victim.destroyLocked()
+		}
+	}
+}
+
+// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is
+// not a child dentry.
+func (d *dentry) destroyLocked() {
+	ctx := context.Background()
+	d.handleMu.Lock()
+	if !d.handle.file.isNil() {
+		mf := d.fs.mfp.MemoryFile()
+		d.dataMu.Lock()
+		// Write dirty pages back to the remote filesystem.
+		if d.handleWritable {
+			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+				log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err)
+			}
+		}
+		// Discard cached data.
+		d.cache.DropAll(mf)
+		d.dirty.RemoveAll()
+		d.dataMu.Unlock()
+		// Clunk open fids and close open host FDs.
+		d.handle.close(ctx)
+	}
+	d.handleMu.Unlock()
+	d.file.close(ctx)
+	// Remove d from the set of all dentries.
+	d.fs.syncMu.Lock()
+	delete(d.fs.dentries, d)
+	d.fs.syncMu.Unlock()
+	// Drop the reference held by d on its parent.
+	if parentVFSD := d.vfsd.Parent(); parentVFSD != nil {
+		parent := parentVFSD.Impl().(*dentry)
+		// This is parent.DecRef() without recursive locking of d.fs.renameMu.
+		if refs := atomic.AddInt64(&parent.refs, -1); refs == 0 {
+			parent.checkCachingLocked()
+		} else if refs < 0 {
+			panic("gofer.dentry.DecRef() called without holding a reference")
+		}
+	}
+}
+
+func (d *dentry) isDeleted() bool {
+	return atomic.LoadUint32(&d.deleted) != 0
+}
+
+func (d *dentry) setDeleted() {
+	atomic.StoreUint32(&d.deleted, 1)
+}
+
+func (d *dentry) listxattr(ctx context.Context) ([]string, error) {
+	return nil, syserror.ENOTSUP
+}
+
+func (d *dentry) getxattr(ctx context.Context, name string) (string, error) {
+	// TODO(jamieliu): add vfs.GetxattrOptions.Size
+	return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX)
+}
+
+func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error {
+	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
+}
+
+func (d *dentry) removexattr(ctx context.Context, name string) error {
+	return syserror.ENOTSUP
+}
+
+// Preconditions: d.isRegularFile() || d.isDirectory().
+func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
+	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
+	// O_TRUNC).
+	if !trunc {
+		d.handleMu.RLock()
+		if (!read || d.handleReadable) && (!write || d.handleWritable) {
+			// The current handle is sufficient.
+			d.handleMu.RUnlock()
+			return nil
+		}
+		d.handleMu.RUnlock()
+	}
+
+	haveOldFD := false
+	d.handleMu.Lock()
+	if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc {
+		// Get a new handle.
+		wantReadable := d.handleReadable || read
+		wantWritable := d.handleWritable || write
+		h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc)
+		if err != nil {
+			d.handleMu.Unlock()
+			return err
+		}
+		if !d.handle.file.isNil() {
+			// Check that old and new handles are compatible: If the old handle
+			// includes a host file descriptor but the new one does not, or
+			// vice versa, old and new memory mappings may be incoherent.
+			haveOldFD = d.handle.fd >= 0
+			haveNewFD := h.fd >= 0
+			if haveOldFD != haveNewFD {
+				d.handleMu.Unlock()
+				ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD)
+				h.close(ctx)
+				return syserror.EIO
+			}
+			if haveOldFD {
+				// We may have raced with callers of d.pf.FD() that are now
+				// using the old file descriptor, preventing us from safely
+				// closing it. We could handle this by invalidating existing
+				// memmap.Translations, but this is expensive. Instead, use
+				// dup2() to make the old file descriptor refer to the new file
+				// description, then close the new file descriptor (which is no
+				// longer needed). Racing callers may use the old or new file
+				// description, but this doesn't matter since they refer to the
+				// same file (unless d.fs.opts.overlayfsStaleRead is true,
+				// which we handle separately).
+				if err := syscall.Dup2(int(h.fd), int(d.handle.fd)); err != nil {
+					d.handleMu.Unlock()
+					ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err)
+					h.close(ctx)
+					return err
+				}
+				syscall.Close(int(h.fd))
+				h.fd = d.handle.fd
+				if d.fs.opts.overlayfsStaleRead {
+					// Replace sentry mappings of the old FD with mappings of
+					// the new FD, since the two are not necessarily coherent.
+					if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
+						d.handleMu.Unlock()
+						ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
+						h.close(ctx)
+						return err
+					}
+				}
+				// Clunk the old fid before making the new handle visible (by
+				// unlocking d.handleMu).
+				d.handle.file.close(ctx)
+			}
+		}
+		// Switch to the new handle.
+		d.handle = h
+		d.handleReadable = wantReadable
+		d.handleWritable = wantWritable
+	}
+	d.handleMu.Unlock()
+
+	if d.fs.opts.overlayfsStaleRead && haveOldFD {
+		// Invalidate application mappings that may be using the old FD; they
+		// will be replaced with mappings using the new FD after future calls
+		// to d.Translate(). This requires holding d.mapsMu, which precedes
+		// d.handleMu in the lock order.
+		d.mapsMu.Lock()
+		d.mappings.InvalidateAll(memmap.InvalidateOpts{})
+		d.mapsMu.Unlock()
+	}
+
+	return nil
+}
+
+// fileDescription is embedded by gofer implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) dentry() *dentry {
+	return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	d := fd.dentry()
+	if d.fs.opts.interop == InteropModeShared && opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
+		// TODO(jamieliu): Use specialFileFD.handle.file for the getattr if
+		// available?
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return linux.Statx{}, err
+		}
+	}
+	var stat linux.Statx
+	d.statTo(&stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount())
+}
+
+// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
+func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) {
+	return fd.dentry().listxattr(ctx)
+}
+
+// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
+func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+	return fd.dentry().getxattr(ctx, name)
+}
+
+// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
+func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+	return fd.dentry().setxattr(ctx, &opts)
+}
+
+// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
+func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+	return fd.dentry().removexattr(ctx, name)
+}
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
new file mode 100644
index 000000000..cfe66f797
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -0,0 +1,135 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// handle represents a remote "open file descriptor", consisting of an opened
+// fid (p9.File) and optionally a host file descriptor.
+type handle struct {
+	file p9file
+	fd   int32 // -1 if unavailable
+}
+
+// Preconditions: read || write.
+func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (handle, error) {
+	_, newfile, err := file.walk(ctx, nil)
+	if err != nil {
+		return handle{fd: -1}, err
+	}
+	var flags p9.OpenFlags
+	switch {
+	case read && !write:
+		flags = p9.ReadOnly
+	case !read && write:
+		flags = p9.WriteOnly
+	case read && write:
+		flags = p9.ReadWrite
+	}
+	if trunc {
+		flags |= p9.OpenTruncate
+	}
+	fdobj, _, _, err := newfile.open(ctx, flags)
+	if err != nil {
+		newfile.close(ctx)
+		return handle{fd: -1}, err
+	}
+	fd := int32(-1)
+	if fdobj != nil {
+		fd = int32(fdobj.Release())
+	}
+	return handle{
+		file: newfile,
+		fd:   fd,
+	}, nil
+}
+
+func (h *handle) close(ctx context.Context) {
+	h.file.close(ctx)
+	h.file = p9file{}
+	if h.fd >= 0 {
+		syscall.Close(int(h.fd))
+		h.fd = -1
+	}
+}
+
+func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+	if h.fd >= 0 {
+		ctx.UninterruptibleSleepStart(false)
+		n, err := hostPreadv(h.fd, dsts, int64(offset))
+		ctx.UninterruptibleSleepFinish(false)
+		return n, err
+	}
+	if dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy() {
+		n, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset)
+		return uint64(n), err
+	}
+	// Buffer the read since p9.File.ReadAt() takes []byte.
+	buf := make([]byte, dsts.NumBytes())
+	n, err := h.file.readAt(ctx, buf, offset)
+	if n == 0 {
+		return 0, err
+	}
+	if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil {
+		return cp, cperr
+	}
+	return uint64(n), err
+}
+
+func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+	if h.fd >= 0 {
+		ctx.UninterruptibleSleepStart(false)
+		n, err := hostPwritev(h.fd, srcs, int64(offset))
+		ctx.UninterruptibleSleepFinish(false)
+		return n, err
+	}
+	if srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy() {
+		n, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset)
+		return uint64(n), err
+	}
+	// Buffer the write since p9.File.WriteAt() takes []byte.
+	buf := make([]byte, srcs.NumBytes())
+	cp, cperr := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), srcs)
+	if cp == 0 {
+		return 0, cperr
+	}
+	n, err := h.file.writeAt(ctx, buf[:cp], offset)
+	if err != nil {
+		return uint64(n), err
+	}
+	return cp, cperr
+}
+
+func (h *handle) sync(ctx context.Context) error {
+	if h.fd >= 0 {
+		ctx.UninterruptibleSleepStart(false)
+		err := syscall.Fsync(int(h.fd))
+		ctx.UninterruptibleSleepFinish(false)
+		return err
+	}
+	return h.file.fsync(ctx)
+}
diff --git a/pkg/sentry/fsimpl/gofer/handle_unsafe.go b/pkg/sentry/fsimpl/gofer/handle_unsafe.go
new file mode 100644
index 000000000..19560ab26
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/handle_unsafe.go
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// Preconditions: !dsts.IsEmpty().
+func hostPreadv(fd int32, dsts safemem.BlockSeq, off int64) (uint64, error) {
+	// No buffering is necessary regardless of safecopy; host syscalls will
+	// return EFAULT if appropriate, instead of raising SIGBUS.
+	if dsts.NumBlocks() == 1 {
+		// Use pread() instead of preadv() to avoid iovec allocation and
+		// copying.
+		dst := dsts.Head()
+		n, _, e := syscall.Syscall6(syscall.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(off), 0, 0)
+		if e != 0 {
+			return 0, e
+		}
+		return uint64(n), nil
+	}
+	iovs := safemem.IovecsFromBlockSeq(dsts)
+	n, _, e := syscall.Syscall6(syscall.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0)
+	if e != 0 {
+		return 0, e
+	}
+	return uint64(n), nil
+}
+
+// Preconditions: !srcs.IsEmpty().
+func hostPwritev(fd int32, srcs safemem.BlockSeq, off int64) (uint64, error) {
+	// No buffering is necessary regardless of safecopy; host syscalls will
+	// return EFAULT if appropriate, instead of raising SIGBUS.
+	if srcs.NumBlocks() == 1 {
+		// Use pwrite() instead of pwritev() to avoid iovec allocation and
+		// copying.
+		src := srcs.Head()
+		n, _, e := syscall.Syscall6(syscall.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(off), 0, 0)
+		if e != 0 {
+			return 0, e
+		}
+		return uint64(n), nil
+	}
+	iovs := safemem.IovecsFromBlockSeq(srcs)
+	n, _, e := syscall.Syscall6(syscall.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0)
+	if e != 0 {
+		return 0, e
+	}
+	return uint64(n), nil
+}
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
new file mode 100644
index 000000000..755ac2985
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -0,0 +1,219 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// p9file is a wrapper around p9.File that provides methods that are
+// Context-aware.
+type p9file struct {
+	file p9.File
+}
+
+func (f p9file) isNil() bool {
+	return f.file == nil
+}
+
+func (f p9file) walk(ctx context.Context, names []string) ([]p9.QID, p9file, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qids, newfile, err := f.file.Walk(names)
+	ctx.UninterruptibleSleepFinish(false)
+	return qids, p9file{newfile}, err
+}
+
+func (f p9file) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, p9file, p9.AttrMask, p9.Attr, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qids, newfile, attrMask, attr, err := f.file.WalkGetAttr(names)
+	ctx.UninterruptibleSleepFinish(false)
+	return qids, p9file{newfile}, attrMask, attr, err
+}
+
+// walkGetAttrOne is a wrapper around p9.File.WalkGetAttr that takes a single
+// path component and returns a single qid.
+func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file, p9.AttrMask, p9.Attr, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qids, newfile, attrMask, attr, err := f.file.WalkGetAttr([]string{name})
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, err
+	}
+	if len(qids) != 1 {
+		ctx.Warningf("p9.File.WalkGetAttr returned %d qids (%v), wanted 1", len(qids), qids)
+		if newfile != nil {
+			p9file{newfile}.close(ctx)
+		}
+		return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO
+	}
+	return qids[0], p9file{newfile}, attrMask, attr, nil
+}
+
+func (f p9file) statFS(ctx context.Context) (p9.FSStat, error) {
+	ctx.UninterruptibleSleepStart(false)
+	fsstat, err := f.file.StatFS()
+	ctx.UninterruptibleSleepFinish(false)
+	return fsstat, err
+}
+
+func (f p9file) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qid, attrMask, attr, err := f.file.GetAttr(req)
+	ctx.UninterruptibleSleepFinish(false)
+	return qid, attrMask, attr, err
+}
+
+func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.SetAttr(valid, attr)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) {
+	ctx.UninterruptibleSleepStart(false)
+	val, err := f.file.GetXattr(name, size)
+	ctx.UninterruptibleSleepFinish(false)
+	return val, err
+}
+
+func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.SetXattr(name, value, flags)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Allocate(mode, offset, length)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) close(ctx context.Context) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Close()
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
+	ctx.UninterruptibleSleepStart(false)
+	fdobj, qid, iounit, err := f.file.Open(flags)
+	ctx.UninterruptibleSleepFinish(false)
+	return fdobj, qid, iounit, err
+}
+
+func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) {
+	ctx.UninterruptibleSleepStart(false)
+	n, err := f.file.ReadAt(p, offset)
+	ctx.UninterruptibleSleepFinish(false)
+	return n, err
+}
+
+func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) {
+	ctx.UninterruptibleSleepStart(false)
+	n, err := f.file.WriteAt(p, offset)
+	ctx.UninterruptibleSleepFinish(false)
+	return n, err
+}
+
+func (f p9file) fsync(ctx context.Context) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.FSync()
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9file, p9.QID, uint32, error) {
+	ctx.UninterruptibleSleepStart(false)
+	fdobj, newfile, qid, iounit, err := f.file.Create(name, flags, permissions, uid, gid)
+	ctx.UninterruptibleSleepFinish(false)
+	return fdobj, p9file{newfile}, qid, iounit, err
+}
+
+func (f p9file) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qid, err := f.file.Mkdir(name, permissions, uid, gid)
+	ctx.UninterruptibleSleepFinish(false)
+	return qid, err
+}
+
+func (f p9file) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qid, err := f.file.Symlink(oldName, newName, uid, gid)
+	ctx.UninterruptibleSleepFinish(false)
+	return qid, err
+}
+
+func (f p9file) link(ctx context.Context, target p9file, newName string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Link(target.file, newName)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) mknod(ctx context.Context, name string, mode p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qid, err := f.file.Mknod(name, mode, major, minor, uid, gid)
+	ctx.UninterruptibleSleepFinish(false)
+	return qid, err
+}
+
+func (f p9file) rename(ctx context.Context, newDir p9file, newName string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Rename(newDir.file, newName)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) unlinkAt(ctx context.Context, name string, flags uint32) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.UnlinkAt(name, flags)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) {
+	ctx.UninterruptibleSleepStart(false)
+	dirents, err := f.file.Readdir(offset, count)
+	ctx.UninterruptibleSleepFinish(false)
+	return dirents, err
+}
+
+func (f p9file) readlink(ctx context.Context) (string, error) {
+	ctx.UninterruptibleSleepStart(false)
+	target, err := f.file.Readlink()
+	ctx.UninterruptibleSleepFinish(false)
+	return target, err
+}
+
+func (f p9file) flush(ctx context.Context) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Flush()
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) {
+	ctx.UninterruptibleSleepStart(false)
+	fdobj, err := f.file.Connect(flags)
+	ctx.UninterruptibleSleepFinish(false)
+	return fdobj, err
+}
diff --git a/pkg/sentry/fsimpl/gofer/pagemath.go b/pkg/sentry/fsimpl/gofer/pagemath.go
new file mode 100644
index 000000000..847cb0784
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/pagemath.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// This are equivalent to usermem.Addr.RoundDown/Up, but without the
+// potentially truncating conversion to usermem.Addr. This is necessary because
+// there is no way to define generic "PageRoundDown/Up" functions in Go.
+
+func pageRoundDown(x uint64) uint64 {
+	return x &^ (usermem.PageSize - 1)
+}
+
+func pageRoundUp(x uint64) uint64 {
+	return pageRoundDown(x + usermem.PageSize - 1)
+}
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
new file mode 100644
index 000000000..8e11e06b3
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -0,0 +1,860 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"fmt"
+	"io"
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (d *dentry) isRegularFile() bool {
+	return d.fileType() == linux.S_IFREG
+}
+
+type regularFileFD struct {
+	fileDescription
+
+	// off is the file offset. off is protected by mu.
+	mu  sync.Mutex
+	off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release() {
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *regularFileFD) OnClose(ctx context.Context) error {
+	if !fd.vfsfd.IsWritable() {
+		return nil
+	}
+	// Skip flushing if writes may be buffered by the client, since (as with
+	// the VFS1 client) we don't flush buffered writes on close anyway.
+	d := fd.dentry()
+	if d.fs.opts.interop == InteropModeExclusive {
+		return nil
+	}
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	return d.handle.file.flush(ctx)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	// Check for reading at EOF before calling into MM (but not under
+	// InteropModeShared, which makes d.size unreliable).
+	d := fd.dentry()
+	if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) {
+		return 0, io.EOF
+	}
+
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Lock d.metadataMu for the rest of the read to prevent d.size from
+		// changing.
+		d.metadataMu.Lock()
+		defer d.metadataMu.Unlock()
+		// Write dirty cached pages that will be touched by the read back to
+		// the remote file.
+		if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
+			return 0, err
+		}
+	}
+
+	rw := getDentryReadWriter(ctx, d, offset)
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Require the read to go to the remote file.
+		rw.direct = true
+	}
+	n, err := dst.CopyOutFrom(ctx, rw)
+	putDentryReadWriter(rw)
+	if d.fs.opts.interop != InteropModeShared {
+		// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+		d.touchAtime(ctx, fd.vfsfd.Mount())
+	}
+	return n, err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.mu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	d := fd.dentry()
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+	if d.fs.opts.interop != InteropModeShared {
+		// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
+		// file_update_time(). This is d.touchCMtime(), but without locking
+		// d.metadataMu (recursively).
+		if now, ok := nowFromContext(ctx); ok {
+			atomic.StoreInt64(&d.mtime, now)
+			atomic.StoreInt64(&d.ctime, now)
+		}
+	}
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Write dirty cached pages that will be touched by the write back to
+		// the remote file.
+		if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
+			return 0, err
+		}
+		// Remove touched pages from the cache.
+		pgstart := pageRoundDown(uint64(offset))
+		pgend := pageRoundUp(uint64(offset + src.NumBytes()))
+		if pgend < pgstart {
+			return 0, syserror.EINVAL
+		}
+		mr := memmap.MappableRange{pgstart, pgend}
+		var freed []platform.FileRange
+		d.dataMu.Lock()
+		cseg := d.cache.LowerBoundSegment(mr.Start)
+		for cseg.Ok() && cseg.Start() < mr.End {
+			cseg = d.cache.Isolate(cseg, mr)
+			freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
+			cseg = d.cache.Remove(cseg).NextSegment()
+		}
+		d.dataMu.Unlock()
+		// Invalidate mappings of removed pages.
+		d.mapsMu.Lock()
+		d.mappings.Invalidate(mr, memmap.InvalidateOpts{})
+		d.mapsMu.Unlock()
+		// Finally free pages removed from the cache.
+		mf := d.fs.mfp.MemoryFile()
+		for _, freedFR := range freed {
+			mf.DecRef(freedFR)
+		}
+	}
+	rw := getDentryReadWriter(ctx, d, offset)
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Require the write to go to the remote file.
+		rw.direct = true
+	}
+	n, err := src.CopyInTo(ctx, rw)
+	putDentryReadWriter(rw)
+	if n != 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+		// Write dirty cached pages touched by the write back to the remote
+		// file.
+		if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
+			return 0, err
+		}
+		// Request the remote filesystem to sync the remote file.
+		if err := d.handle.file.fsync(ctx); err != nil {
+			return 0, err
+		}
+	}
+	return n, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	n, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
+type dentryReadWriter struct {
+	ctx    context.Context
+	d      *dentry
+	off    uint64
+	direct bool
+}
+
+var dentryReadWriterPool = sync.Pool{
+	New: func() interface{} {
+		return &dentryReadWriter{}
+	},
+}
+
+func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter {
+	rw := dentryReadWriterPool.Get().(*dentryReadWriter)
+	rw.ctx = ctx
+	rw.d = d
+	rw.off = uint64(offset)
+	rw.direct = false
+	return rw
+}
+
+func putDentryReadWriter(rw *dentryReadWriter) {
+	rw.ctx = nil
+	rw.d = nil
+	dentryReadWriterPool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+
+	// If we have a mmappable host FD (which must be used here to ensure
+	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
+	// (which prevents us from caching file contents and makes dentry.size
+	// unreliable), or if the file was opened O_DIRECT, read directly from
+	// dentry.handle without locking dentry.dataMu.
+	rw.d.handleMu.RLock()
+	if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+		n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off)
+		rw.d.handleMu.RUnlock()
+		rw.off += n
+		return n, err
+	}
+
+	// Otherwise read from/through the cache.
+	mf := rw.d.fs.mfp.MemoryFile()
+	fillCache := mf.ShouldCacheEvictable()
+	var dataMuUnlock func()
+	if fillCache {
+		rw.d.dataMu.Lock()
+		dataMuUnlock = rw.d.dataMu.Unlock
+	} else {
+		rw.d.dataMu.RLock()
+		dataMuUnlock = rw.d.dataMu.RUnlock
+	}
+
+	// Compute the range to read (limited by file size and overflow-checked).
+	if rw.off >= rw.d.size {
+		dataMuUnlock()
+		rw.d.handleMu.RUnlock()
+		return 0, io.EOF
+	}
+	end := rw.d.size
+	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
+		end = rend
+	}
+
+	var done uint64
+	seg, gap := rw.d.cache.Find(rw.off)
+	for rw.off < end {
+		mr := memmap.MappableRange{rw.off, end}
+		switch {
+		case seg.Ok():
+			// Get internal mappings from the cache.
+			ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
+			if err != nil {
+				dataMuUnlock()
+				rw.d.handleMu.RUnlock()
+				return done, err
+			}
+
+			// Copy from internal mappings.
+			n, err := safemem.CopySeq(dsts, ims)
+			done += n
+			rw.off += n
+			dsts = dsts.DropFirst64(n)
+			if err != nil {
+				dataMuUnlock()
+				rw.d.handleMu.RUnlock()
+				return done, err
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			gapMR := gap.Range().Intersect(mr)
+			if fillCache {
+				// Read into the cache, then re-enter the loop to read from the
+				// cache.
+				reqMR := memmap.MappableRange{
+					Start: pageRoundDown(gapMR.Start),
+					End:   pageRoundUp(gapMR.End),
+				}
+				optMR := gap.Range()
+				err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt)
+				mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
+				seg, gap = rw.d.cache.Find(rw.off)
+				if !seg.Ok() {
+					dataMuUnlock()
+					rw.d.handleMu.RUnlock()
+					return done, err
+				}
+				// err might have occurred in part of gap.Range() outside
+				// gapMR. Forget about it for now; if the error matters and
+				// persists, we'll run into it again in a later iteration of
+				// this loop.
+			} else {
+				// Read directly from the file.
+				gapDsts := dsts.TakeFirst64(gapMR.Length())
+				n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
+				done += n
+				rw.off += n
+				dsts = dsts.DropFirst64(n)
+				// Partial reads are fine. But we must stop reading.
+				if n != gapDsts.NumBytes() || err != nil {
+					dataMuUnlock()
+					rw.d.handleMu.RUnlock()
+					return done, err
+				}
+
+				// Continue.
+				seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+			}
+		}
+	}
+	dataMuUnlock()
+	rw.d.handleMu.RUnlock()
+	return done, nil
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+//
+// Preconditions: rw.d.metadataMu must be locked.
+func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+
+	// If we have a mmappable host FD (which must be used here to ensure
+	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
+	// (which prevents us from caching file contents), or if the file was
+	// opened with O_DIRECT, write directly to dentry.handle without locking
+	// dentry.dataMu.
+	rw.d.handleMu.RLock()
+	if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+		n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off)
+		rw.d.handleMu.RUnlock()
+		rw.off += n
+		return n, err
+	}
+
+	// Otherwise write to/through the cache.
+	mf := rw.d.fs.mfp.MemoryFile()
+	rw.d.dataMu.Lock()
+
+	// Compute the range to write (overflow-checked).
+	start := rw.off
+	end := rw.off + srcs.NumBytes()
+	if end <= rw.off {
+		end = math.MaxInt64
+	}
+
+	var (
+		done   uint64
+		retErr error
+	)
+	seg, gap := rw.d.cache.Find(rw.off)
+	for rw.off < end {
+		mr := memmap.MappableRange{rw.off, end}
+		switch {
+		case seg.Ok():
+			// Get internal mappings from the cache.
+			segMR := seg.Range().Intersect(mr)
+			ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Copy to internal mappings.
+			n, err := safemem.CopySeq(ims, srcs)
+			done += n
+			rw.off += n
+			srcs = srcs.DropFirst64(n)
+			rw.d.dirty.MarkDirty(segMR)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			// Write directly to the file. At present, we never fill the cache
+			// when writing, since doing so can convert small writes into
+			// inefficient read-modify-write cycles, and we have no mechanism
+			// for detecting or avoiding this.
+			gapMR := gap.Range().Intersect(mr)
+			gapSrcs := srcs.TakeFirst64(gapMR.Length())
+			n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
+			done += n
+			rw.off += n
+			srcs = srcs.DropFirst64(n)
+			// Partial writes are fine. But we must stop writing.
+			if n != gapSrcs.NumBytes() || err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Continue.
+			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+		}
+	}
+exitLoop:
+	if rw.off > rw.d.size {
+		atomic.StoreUint64(&rw.d.size, rw.off)
+		// The remote file's size will implicitly be extended to the correct
+		// value when we write back to it.
+	}
+	// If InteropModeWritethrough is in effect, flush written data back to the
+	// remote filesystem.
+	if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 {
+		if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{
+			Start: start,
+			End:   rw.off,
+		}, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil {
+			// We have no idea how many bytes were actually flushed.
+			rw.off = start
+			done = 0
+			retErr = err
+		}
+	}
+	rw.d.dataMu.Unlock()
+	rw.d.handleMu.RUnlock()
+	return done, retErr
+}
+
+func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
+	if size == 0 {
+		return nil
+	}
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+	// Compute the range of valid bytes (overflow-checked).
+	if uint64(offset) >= d.size {
+		return nil
+	}
+	end := int64(d.size)
+	if rend := offset + size; rend > offset && rend < end {
+		end = rend
+	}
+	return fsutil.SyncDirty(ctx, memmap.MappableRange{
+		Start: uint64(offset),
+		End:   uint64(end),
+	}, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as specified.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE:
+		// Ensure file size is up to date.
+		d := fd.dentry()
+		if fd.filesystem().opts.interop == InteropModeShared {
+			if err := d.updateFromGetattr(ctx); err != nil {
+				return 0, err
+			}
+		}
+		size := int64(atomic.LoadUint64(&d.size))
+		// For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous
+		// block of data.
+		switch whence {
+		case linux.SEEK_END:
+			offset += size
+		case linux.SEEK_DATA:
+			if offset > size {
+				return 0, syserror.ENXIO
+			}
+			// Use offset as specified.
+		case linux.SEEK_HOLE:
+			if offset > size {
+				return 0, syserror.ENXIO
+			}
+			offset = size
+		}
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+	return fd.dentry().syncSharedHandle(ctx)
+}
+
+func (d *dentry) syncSharedHandle(ctx context.Context) error {
+	d.handleMu.RLock()
+	if !d.handleWritable {
+		d.handleMu.RUnlock()
+		return nil
+	}
+	d.dataMu.Lock()
+	// Write dirty cached data to the remote file.
+	err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
+	d.dataMu.Unlock()
+	if err == nil {
+		// Sync the remote file.
+		err = d.handle.sync(ctx)
+	}
+	d.handleMu.RUnlock()
+	return err
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	d := fd.dentry()
+	switch d.fs.opts.interop {
+	case InteropModeExclusive:
+		// Any mapping is fine.
+	case InteropModeWritethrough:
+		// Shared writable mappings require a host FD, since otherwise we can't
+		// synchronously flush memory-mapped writes to the remote file.
+		if opts.Private || !opts.MaxPerms.Write {
+			break
+		}
+		fallthrough
+	case InteropModeShared:
+		// All mappings require a host FD to be coherent with other filesystem
+		// users.
+		if d.fs.opts.forcePageCache {
+			// Whether or not we have a host FD, we're not allowed to use it.
+			return syserror.ENODEV
+		}
+		d.handleMu.RLock()
+		haveFD := d.handle.fd >= 0
+		d.handleMu.RUnlock()
+		if !haveFD {
+			return syserror.ENODEV
+		}
+	default:
+		panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop))
+	}
+	return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts)
+}
+
+func (d *dentry) mayCachePages() bool {
+	if d.fs.opts.interop == InteropModeShared {
+		return false
+	}
+	if d.fs.opts.forcePageCache {
+		return true
+	}
+	d.handleMu.RLock()
+	haveFD := d.handle.fd >= 0
+	d.handleMu.RUnlock()
+	return haveFD
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+	d.mapsMu.Lock()
+	mapped := d.mappings.AddMapping(ms, ar, offset, writable)
+	// Do this unconditionally since whether we have a host FD can change
+	// across save/restore.
+	for _, r := range mapped {
+		d.pf.hostFileMapper.IncRefOn(r)
+	}
+	if d.mayCachePages() {
+		// d.Evict() will refuse to evict memory-mapped pages, so tell the
+		// MemoryFile to not bother trying.
+		mf := d.fs.mfp.MemoryFile()
+		for _, r := range mapped {
+			mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End})
+		}
+	}
+	d.mapsMu.Unlock()
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+	d.mapsMu.Lock()
+	unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable)
+	for _, r := range unmapped {
+		d.pf.hostFileMapper.DecRefOn(r)
+	}
+	if d.mayCachePages() {
+		// Pages that are no longer referenced by any application memory
+		// mappings are now considered unused; allow MemoryFile to evict them
+		// when necessary.
+		mf := d.fs.mfp.MemoryFile()
+		d.dataMu.Lock()
+		for _, r := range unmapped {
+			// Since these pages are no longer mapped, they are no longer
+			// concurrently dirtyable by a writable memory mapping.
+			d.dirty.AllowClean(r)
+			mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End})
+		}
+		d.dataMu.Unlock()
+	}
+	d.mapsMu.Unlock()
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+	return d.AddMapping(ctx, ms, dstAR, offset, writable)
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	d.handleMu.RLock()
+	if d.handle.fd >= 0 && !d.fs.opts.forcePageCache {
+		d.handleMu.RUnlock()
+		mr := optional
+		if d.fs.opts.limitHostFDTranslation {
+			mr = maxFillRange(required, optional)
+		}
+		return []memmap.Translation{
+			{
+				Source: mr,
+				File:   &d.pf,
+				Offset: mr.Start,
+				Perms:  usermem.AnyAccess,
+			},
+		}, nil
+	}
+
+	d.dataMu.Lock()
+
+	// Constrain translations to d.size (rounded up) to prevent translation to
+	// pages that may be concurrently truncated.
+	pgend := pageRoundUp(d.size)
+	var beyondEOF bool
+	if required.End > pgend {
+		if required.Start >= pgend {
+			d.dataMu.Unlock()
+			d.handleMu.RUnlock()
+			return nil, &memmap.BusError{io.EOF}
+		}
+		beyondEOF = true
+		required.End = pgend
+	}
+	if optional.End > pgend {
+		optional.End = pgend
+	}
+
+	mf := d.fs.mfp.MemoryFile()
+	cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt)
+
+	var ts []memmap.Translation
+	var translatedEnd uint64
+	for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
+		segMR := seg.Range().Intersect(optional)
+		// TODO(jamieliu): Make Translations writable even if writability is
+		// not required if already kept-dirty by another writable translation.
+		perms := usermem.AccessType{
+			Read:    true,
+			Execute: true,
+		}
+		if at.Write {
+			// From this point forward, this memory can be dirtied through the
+			// mapping at any time.
+			d.dirty.KeepDirty(segMR)
+			perms.Write = true
+		}
+		ts = append(ts, memmap.Translation{
+			Source: segMR,
+			File:   mf,
+			Offset: seg.FileRangeOf(segMR).Start,
+			Perms:  perms,
+		})
+		translatedEnd = segMR.End
+	}
+
+	d.dataMu.Unlock()
+	d.handleMu.RUnlock()
+
+	// Don't return the error returned by c.cache.Fill if it occurred outside
+	// of required.
+	if translatedEnd < required.End && cerr != nil {
+		return ts, &memmap.BusError{cerr}
+	}
+	if beyondEOF {
+		return ts, &memmap.BusError{io.EOF}
+	}
+	return ts, nil
+}
+
+func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
+	const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
+	if required.Length() >= maxReadahead {
+		return required
+	}
+	if optional.Length() <= maxReadahead {
+		return optional
+	}
+	optional.Start = required.Start
+	if optional.Length() <= maxReadahead {
+		return optional
+	}
+	optional.End = optional.Start + maxReadahead
+	return optional
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
+	// Whether we have a host fd (and consequently what platform.File is
+	// mapped) can change across save/restore, so invalidate all translations
+	// unconditionally.
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	d.mappings.InvalidateAll(memmap.InvalidateOpts{})
+
+	// Write the cache's contents back to the remote file so that if we have a
+	// host fd after restore, the remote file's contents are coherent.
+	mf := d.fs.mfp.MemoryFile()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+	if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+		return err
+	}
+
+	// Discard the cache so that it's not stored in saved state. This is safe
+	// because per InvalidateUnsavable invariants, no new translations can have
+	// been returned after we invalidated all existing translations above.
+	d.cache.DropAll(mf)
+	d.dirty.RemoveAll()
+
+	return nil
+}
+
+// Evict implements pgalloc.EvictableMemoryUser.Evict.
+func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+
+	mr := memmap.MappableRange{er.Start, er.End}
+	mf := d.fs.mfp.MemoryFile()
+	// Only allow pages that are no longer memory-mapped to be evicted.
+	for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
+		mgapMR := mgap.Range().Intersect(mr)
+		if mgapMR.Length() == 0 {
+			continue
+		}
+		if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+			log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
+		}
+		d.cache.Drop(mgapMR, mf)
+		d.dirty.KeepClean(mgapMR)
+	}
+}
+
+// dentryPlatformFile implements platform.File. It exists solely because dentry
+// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef.
+//
+// dentryPlatformFile is only used when a host FD representing the remote file
+// is available (i.e. dentry.handle.fd >= 0), and that FD is used for
+// application memory mappings (i.e. !filesystem.opts.forcePageCache).
+type dentryPlatformFile struct {
+	*dentry
+
+	// fdRefs counts references on platform.File offsets. fdRefs is protected
+	// by dentry.dataMu.
+	fdRefs fsutil.FrameRefSet
+
+	// If this dentry represents a regular file, and handle.fd >= 0,
+	// hostFileMapper caches mappings of handle.fd.
+	hostFileMapper fsutil.HostFileMapper
+}
+
+// IncRef implements platform.File.IncRef.
+func (d *dentryPlatformFile) IncRef(fr platform.FileRange) {
+	d.dataMu.Lock()
+	seg, gap := d.fdRefs.Find(fr.Start)
+	for {
+		switch {
+		case seg.Ok() && seg.Start() < fr.End:
+			seg = d.fdRefs.Isolate(seg, fr)
+			seg.SetValue(seg.Value() + 1)
+			seg, gap = seg.NextNonEmpty()
+		case gap.Ok() && gap.Start() < fr.End:
+			newRange := gap.Range().Intersect(fr)
+			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
+			seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
+		default:
+			d.fdRefs.MergeAdjacent(fr)
+			d.dataMu.Unlock()
+			return
+		}
+	}
+}
+
+// DecRef implements platform.File.DecRef.
+func (d *dentryPlatformFile) DecRef(fr platform.FileRange) {
+	d.dataMu.Lock()
+	seg := d.fdRefs.FindSegment(fr.Start)
+
+	for seg.Ok() && seg.Start() < fr.End {
+		seg = d.fdRefs.Isolate(seg, fr)
+		if old := seg.Value(); old == 1 {
+			usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
+			seg = d.fdRefs.Remove(seg).NextSegment()
+		} else {
+			seg.SetValue(old - 1)
+			seg = seg.NextSegment()
+		}
+	}
+	d.fdRefs.MergeAdjacent(fr)
+	d.dataMu.Unlock()
+
+}
+
+// MapInternal implements platform.File.MapInternal.
+func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+	d.handleMu.RLock()
+	bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write)
+	d.handleMu.RUnlock()
+	return bs, err
+}
+
+// FD implements platform.File.FD.
+func (d *dentryPlatformFile) FD() int {
+	d.handleMu.RLock()
+	fd := d.handle.fd
+	d.handleMu.RUnlock()
+	return int(fd)
+}
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
new file mode 100644
index 000000000..08c691c47
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -0,0 +1,159 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// specialFileFD implements vfs.FileDescriptionImpl for files other than
+// regular files, directories, and symlinks: pipes, sockets, etc. It is also
+// used for regular files when filesystemOptions.specialRegularFiles is in
+// effect. specialFileFD differs from regularFileFD by using per-FD handles
+// instead of shared per-dentry handles, and never buffering I/O.
+type specialFileFD struct {
+	fileDescription
+
+	// handle is immutable.
+	handle handle
+
+	// off is the file offset. off is protected by mu. (POSIX 2.9.7 only
+	// requires operations using the file offset to be atomic for regular files
+	// and symlinks; however, since specialFileFD may be used for regular
+	// files, we apply this atomicity unconditionally.)
+	mu  sync.Mutex
+	off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *specialFileFD) Release() {
+	fd.handle.close(context.Background())
+	fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+	fs.syncMu.Lock()
+	delete(fs.specialFileFDs, fd)
+	fs.syncMu.Unlock()
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *specialFileFD) OnClose(ctx context.Context) error {
+	if !fd.vfsfd.IsWritable() {
+		return nil
+	}
+	return fd.handle.file.flush(ctx)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	// Going through dst.CopyOutFrom() holds MM locks around file operations of
+	// unknown duration. For regularFileFD, doing so is necessary to support
+	// mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't
+	// hold here since specialFileFD doesn't client-cache data. Just buffer the
+	// read instead.
+	if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
+		d.touchAtime(ctx, fd.vfsfd.Mount())
+	}
+	buf := make([]byte, dst.NumBytes())
+	n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	if n == 0 {
+		return 0, err
+	}
+	if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil {
+		return int64(cp), cperr
+	}
+	return int64(n), err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.mu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	// Do a buffered write. See rationale in PRead.
+	if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
+		d.touchCMtime(ctx)
+	}
+	buf := make([]byte, src.NumBytes())
+	// Don't do partial writes if we get a partial read from src.
+	if _, err := src.CopyIn(ctx, buf); err != nil {
+		return 0, err
+	}
+	n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	return int64(n), err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	n, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as given.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	default:
+		// SEEK_END, SEEK_DATA, and SEEK_HOLE aren't supported since it's not
+		// clear that file size is even meaningful for these files.
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *specialFileFD) Sync(ctx context.Context) error {
+	if !fd.vfsfd.IsWritable() {
+		return nil
+	}
+	return fd.handle.sync(ctx)
+}
diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go
new file mode 100644
index 000000000..adf43be60
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/symlink.go
@@ -0,0 +1,47 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func (d *dentry) isSymlink() bool {
+	return d.fileType() == linux.S_IFLNK
+}
+
+// Precondition: d.isSymlink().
+func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
+	if d.fs.opts.interop != InteropModeShared {
+		d.touchAtime(ctx, mnt)
+		d.dataMu.Lock()
+		if d.haveTarget {
+			target := d.target
+			d.dataMu.Unlock()
+			return target, nil
+		}
+	}
+	target, err := d.file.readlink(ctx)
+	if d.fs.opts.interop != InteropModeShared {
+		if err == nil {
+			d.haveTarget = true
+			d.target = target
+		}
+		d.dataMu.Unlock()
+	}
+	return target, err
+}
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
new file mode 100644
index 000000000..7598ec6a8
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -0,0 +1,75 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func dentryTimestampFromP9(s, ns uint64) int64 {
+	return int64(s*1e9 + ns)
+}
+
+func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 {
+	return ts.Sec*1e9 + int64(ts.Nsec)
+}
+
+func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
+	return linux.StatxTimestamp{
+		Sec:  ns / 1e9,
+		Nsec: uint32(ns % 1e9),
+	}
+}
+
+func nowFromContext(ctx context.Context) (int64, bool) {
+	if clock := ktime.RealtimeClockFromContext(ctx); clock != nil {
+		return clock.Now().Nanoseconds(), true
+	}
+	return 0, false
+}
+
+// Preconditions: fs.interop != InteropModeShared.
+func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) {
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return
+	}
+	now, ok := nowFromContext(ctx)
+	if !ok {
+		mnt.EndWrite()
+		return
+	}
+	d.metadataMu.Lock()
+	atomic.StoreInt64(&d.atime, now)
+	d.metadataMu.Unlock()
+	mnt.EndWrite()
+}
+
+// Preconditions: fs.interop != InteropModeShared. The caller has successfully
+// called vfs.Mount.CheckBeginWrite().
+func (d *dentry) touchCMtime(ctx context.Context) {
+	now, ok := nowFromContext(ctx)
+	if !ok {
+		return
+	}
+	d.metadataMu.Lock()
+	atomic.StoreInt64(&d.mtime, now)
+	atomic.StoreInt64(&d.ctime, now)
+	d.metadataMu.Unlock()
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 5ee9cf1e9..72bc15264 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -622,7 +622,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if child.inode.isDir() {
 		return syserror.EISDIR
 	}
-	if !rp.MustBeDir() {
+	if rp.MustBeDir() {
 		return syserror.ENOTDIR
 	}
 	mnt := rp.Mount()
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index bde4c7a1e..34f63986f 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -126,7 +126,7 @@ func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
 			}
 			return uint64(n), nil
 		}
-		return readv(s.fd, iovecsFromBlockSeq(dsts))
+		return readv(s.fd, safemem.IovecsFromBlockSeq(dsts))
 	}))
 	return int64(n), err
 }
@@ -149,7 +149,7 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO
 			}
 			return uint64(n), nil
 		}
-		return writev(s.fd, iovecsFromBlockSeq(srcs))
+		return writev(s.fd, safemem.IovecsFromBlockSeq(srcs))
 	}))
 	return int64(n), err
 }
@@ -402,7 +402,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 		// We always do a non-blocking recv*().
 		sysflags := flags | syscall.MSG_DONTWAIT
 
-		iovs := iovecsFromBlockSeq(dsts)
+		iovs := safemem.IovecsFromBlockSeq(dsts)
 		msg := syscall.Msghdr{
 			Iov:    &iovs[0],
 			Iovlen: uint64(len(iovs)),
@@ -522,7 +522,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 			return uint64(n), nil
 		}
 
-		iovs := iovecsFromBlockSeq(srcs)
+		iovs := safemem.IovecsFromBlockSeq(srcs)
 		msg := syscall.Msghdr{
 			Iov:    &iovs[0],
 			Iovlen: uint64(len(iovs)),
@@ -567,21 +567,6 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 	return int(n), syserr.FromError(err)
 }
 
-func iovecsFromBlockSeq(bs safemem.BlockSeq) []syscall.Iovec {
-	iovs := make([]syscall.Iovec, 0, bs.NumBlocks())
-	for ; !bs.IsEmpty(); bs = bs.Tail() {
-		b := bs.Head()
-		iovs = append(iovs, syscall.Iovec{
-			Base: &b.ToSlice()[0],
-			Len:  uint64(b.Len()),
-		})
-		// We don't need to care about b.NeedSafecopy(), because the host
-		// kernel will handle such address ranges just fine (by returning
-		// EFAULT).
-	}
-	return iovs
-}
-
 func translateIOSyscallError(err error) error {
 	if err == syscall.EAGAIN || err == syscall.EWOULDBLOCK {
 		return syserror.ErrWouldBlock
-- 
cgit v1.2.3


From dcffddf0cae026411e7e678744a1e39dc2b513cf Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 4 Feb 2020 11:47:41 -0800
Subject: Remove argument from vfs.MountNamespace.DecRef()

Updates #1035

PiperOrigin-RevId: 293194631
---
 pkg/sentry/fsimpl/devtmpfs/devtmpfs.go       | 2 +-
 pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go  | 2 +-
 pkg/sentry/fsimpl/testutil/testutil.go       | 2 +-
 pkg/sentry/fsimpl/tmpfs/benchmark_test.go    | 4 ++--
 pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 2 +-
 pkg/sentry/vfs/mount.go                      | 3 ++-
 6 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index d36fa74fb..e03a0c665 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -86,7 +86,7 @@ func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth
 // Release must be called when a is no longer in use.
 func (a *Accessor) Release() {
 	a.root.DecRef()
-	a.mntns.DecRef(a.vfsObj)
+	a.mntns.DecRef()
 }
 
 // accessorContext implements context.Context by extending an existing
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 82c58c900..73308a2b5 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -45,7 +45,7 @@ func TestDevtmpfs(t *testing.T) {
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
 	}
-	defer mntns.DecRef(vfsObj)
+	defer mntns.DecRef()
 	root := mntns.Root()
 	defer root.DecRef()
 	devpop := vfs.PathOperation{
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index 1c98335c1..69fd84ddd 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -98,7 +98,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System {
 // Destroy release resources associated with a test system.
 func (s *System) Destroy() {
 	s.Root.DecRef()
-	s.mns.DecRef(s.VFS) // Reference on mns passed to NewSystem.
+	s.mns.DecRef() // Reference on mns passed to NewSystem.
 }
 
 // ReadToEnd reads the contents of fd until EOF to a string.
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index 54241c8e8..9fce5e4b4 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -183,7 +183,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
-			defer mntns.DecRef(vfsObj)
+			defer mntns.DecRef()
 
 			var filePathBuilder strings.Builder
 			filePathBuilder.WriteByte('/')
@@ -374,7 +374,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
-			defer mntns.DecRef(vfsObj)
+			defer mntns.DecRef()
 
 			var filePathBuilder strings.Builder
 			filePathBuilder.WriteByte('/')
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index 2b52992ea..e9f71e334 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -51,7 +51,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr
 	root := mntns.Root()
 	return vfsObj, root, func() {
 		root.DecRef()
-		mntns.DecRef(vfsObj)
+		mntns.DecRef()
 	}, nil
 }
 
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index d39528051..1fbb420f9 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -423,7 +423,8 @@ func (mntns *MountNamespace) IncRef() {
 }
 
 // DecRef decrements mntns' reference count.
-func (mntns *MountNamespace) DecRef(vfs *VirtualFilesystem) {
+func (mntns *MountNamespace) DecRef() {
+	vfs := mntns.root.fs.VirtualFilesystem()
 	if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 {
 		vfs.mountMu.Lock()
 		vfs.mounts.seq.BeginWrite()
-- 
cgit v1.2.3


From f5072caaf85b9f067d737a874804c04e2b9039b8 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 4 Feb 2020 12:39:58 -0800
Subject: Fix safecopy test.

This is failing in Go1.14 due to new checkptr constraints. This version
should avoid hitting this constraints by doing "dangerous" pointer math
less dangerously?

PiperOrigin-RevId: 293205764
---
 pkg/safecopy/safecopy_test.go   | 88 ++++++++++++++++++++----------------
 pkg/safecopy/safecopy_unsafe.go | 98 ++++++++++++++++++++++++++---------------
 2 files changed, 112 insertions(+), 74 deletions(-)

diff --git a/pkg/safecopy/safecopy_test.go b/pkg/safecopy/safecopy_test.go
index 5818f7f9b..7f7f69d61 100644
--- a/pkg/safecopy/safecopy_test.go
+++ b/pkg/safecopy/safecopy_test.go
@@ -138,10 +138,14 @@ func TestSwapUint32Success(t *testing.T) {
 func TestSwapUint32AlignmentError(t *testing.T) {
 	// Test that SwapUint32 returns an AlignmentError when passed an unaligned
 	// address.
-	data := new(struct{ val uint64 })
-	addr := uintptr(unsafe.Pointer(&data.val)) + 1
-	want := AlignmentError{Addr: addr, Alignment: 4}
-	if _, err := SwapUint32(unsafe.Pointer(addr), 1); err != want {
+	data := make([]byte, 8) // 2 * sizeof(uint32).
+	alignedIndex := uintptr(0)
+	if offset := uintptr(unsafe.Pointer(&data[0])) % 4; offset != 0 {
+		alignedIndex = 4 - offset
+	}
+	ptr := unsafe.Pointer(&data[alignedIndex+1])
+	want := AlignmentError{Addr: uintptr(ptr), Alignment: 4}
+	if _, err := SwapUint32(ptr, 1); err != want {
 		t.Errorf("Unexpected error: got %v, want %v", err, want)
 	}
 }
@@ -171,10 +175,14 @@ func TestSwapUint64Success(t *testing.T) {
 func TestSwapUint64AlignmentError(t *testing.T) {
 	// Test that SwapUint64 returns an AlignmentError when passed an unaligned
 	// address.
-	data := new(struct{ val1, val2 uint64 })
-	addr := uintptr(unsafe.Pointer(&data.val1)) + 1
-	want := AlignmentError{Addr: addr, Alignment: 8}
-	if _, err := SwapUint64(unsafe.Pointer(addr), 1); err != want {
+	data := make([]byte, 16) // 2 * sizeof(uint64).
+	alignedIndex := uintptr(0)
+	if offset := uintptr(unsafe.Pointer(&data[0])) % 8; offset != 0 {
+		alignedIndex = 8 - offset
+	}
+	ptr := unsafe.Pointer(&data[alignedIndex+1])
+	want := AlignmentError{Addr: uintptr(ptr), Alignment: 8}
+	if _, err := SwapUint64(ptr, 1); err != want {
 		t.Errorf("Unexpected error: got %v, want %v", err, want)
 	}
 }
@@ -201,10 +209,14 @@ func TestCompareAndSwapUint32Success(t *testing.T) {
 func TestCompareAndSwapUint32AlignmentError(t *testing.T) {
 	// Test that CompareAndSwapUint32 returns an AlignmentError when passed an
 	// unaligned address.
-	data := new(struct{ val uint64 })
-	addr := uintptr(unsafe.Pointer(&data.val)) + 1
-	want := AlignmentError{Addr: addr, Alignment: 4}
-	if _, err := CompareAndSwapUint32(unsafe.Pointer(addr), 0, 1); err != want {
+	data := make([]byte, 8) // 2 * sizeof(uint32).
+	alignedIndex := uintptr(0)
+	if offset := uintptr(unsafe.Pointer(&data[0])) % 4; offset != 0 {
+		alignedIndex = 4 - offset
+	}
+	ptr := unsafe.Pointer(&data[alignedIndex+1])
+	want := AlignmentError{Addr: uintptr(ptr), Alignment: 4}
+	if _, err := CompareAndSwapUint32(ptr, 0, 1); err != want {
 		t.Errorf("Unexpected error: got %v, want %v", err, want)
 	}
 }
@@ -252,8 +264,8 @@ func TestCopyInSegvError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
 			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				src := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				dst := randBuf(pageSize)
 				n, err := CopyIn(dst, src)
 				if n != bytesBeforeFault {
@@ -276,8 +288,8 @@ func TestCopyInBusError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
 			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				src := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				dst := randBuf(pageSize)
 				n, err := CopyIn(dst, src)
 				if n != bytesBeforeFault {
@@ -300,8 +312,8 @@ func TestCopyOutSegvError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
 			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				src := randBuf(pageSize)
 				n, err := CopyOut(dst, src)
 				if n != bytesBeforeFault {
@@ -324,8 +336,8 @@ func TestCopyOutBusError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
 			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				src := randBuf(pageSize)
 				n, err := CopyOut(dst, src)
 				if n != bytesBeforeFault {
@@ -348,8 +360,8 @@ func TestCopySourceSegvError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
 			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				src := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				dst := randBuf(pageSize)
 				n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize)
 				if n != uintptr(bytesBeforeFault) {
@@ -372,8 +384,8 @@ func TestCopySourceBusError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
 			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				src := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				dst := randBuf(pageSize)
 				n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize)
 				if n != uintptr(bytesBeforeFault) {
@@ -396,8 +408,8 @@ func TestCopyDestinationSegvError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
 			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				src := randBuf(pageSize)
 				n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize)
 				if n != uintptr(bytesBeforeFault) {
@@ -420,8 +432,8 @@ func TestCopyDestinationBusError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
 			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				src := randBuf(pageSize)
 				n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize)
 				if n != uintptr(bytesBeforeFault) {
@@ -444,8 +456,8 @@ func TestZeroOutSegvError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting write %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
 			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				n, err := ZeroOut(dst, pageSize)
 				if n != uintptr(bytesBeforeFault) {
 					t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault)
@@ -467,8 +479,8 @@ func TestZeroOutBusError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting write %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
 			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				n, err := ZeroOut(dst, pageSize)
 				if n != uintptr(bytesBeforeFault) {
 					t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault)
@@ -488,7 +500,7 @@ func TestSwapUint32SegvError(t *testing.T) {
 	// Test that SwapUint32 returns a SegvError when reaching a page that
 	// signals SIGSEGV.
 	withSegvErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
 		_, err := SwapUint32(unsafe.Pointer(secondPage), 1)
 		if want := (SegvError{secondPage}); err != want {
 			t.Errorf("Unexpected error: got %v, want %v", err, want)
@@ -500,7 +512,7 @@ func TestSwapUint32BusError(t *testing.T) {
 	// Test that SwapUint32 returns a BusError when reaching a page that
 	// signals SIGBUS.
 	withBusErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
 		_, err := SwapUint32(unsafe.Pointer(secondPage), 1)
 		if want := (BusError{secondPage}); err != want {
 			t.Errorf("Unexpected error: got %v, want %v", err, want)
@@ -512,7 +524,7 @@ func TestSwapUint64SegvError(t *testing.T) {
 	// Test that SwapUint64 returns a SegvError when reaching a page that
 	// signals SIGSEGV.
 	withSegvErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
 		_, err := SwapUint64(unsafe.Pointer(secondPage), 1)
 		if want := (SegvError{secondPage}); err != want {
 			t.Errorf("Unexpected error: got %v, want %v", err, want)
@@ -524,7 +536,7 @@ func TestSwapUint64BusError(t *testing.T) {
 	// Test that SwapUint64 returns a BusError when reaching a page that
 	// signals SIGBUS.
 	withBusErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
 		_, err := SwapUint64(unsafe.Pointer(secondPage), 1)
 		if want := (BusError{secondPage}); err != want {
 			t.Errorf("Unexpected error: got %v, want %v", err, want)
@@ -536,7 +548,7 @@ func TestCompareAndSwapUint32SegvError(t *testing.T) {
 	// Test that CompareAndSwapUint32 returns a SegvError when reaching a page
 	// that signals SIGSEGV.
 	withSegvErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
 		_, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1)
 		if want := (SegvError{secondPage}); err != want {
 			t.Errorf("Unexpected error: got %v, want %v", err, want)
@@ -548,7 +560,7 @@ func TestCompareAndSwapUint32BusError(t *testing.T) {
 	// Test that CompareAndSwapUint32 returns a BusError when reaching a page
 	// that signals SIGBUS.
 	withBusErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
 		_, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1)
 		if want := (BusError{secondPage}); err != want {
 			t.Errorf("Unexpected error: got %v, want %v", err, want)
diff --git a/pkg/safecopy/safecopy_unsafe.go b/pkg/safecopy/safecopy_unsafe.go
index eef028e68..41dd567f3 100644
--- a/pkg/safecopy/safecopy_unsafe.go
+++ b/pkg/safecopy/safecopy_unsafe.go
@@ -16,6 +16,7 @@ package safecopy
 
 import (
 	"fmt"
+	"runtime"
 	"syscall"
 	"unsafe"
 )
@@ -35,7 +36,7 @@ const maxRegisterSize = 16
 // successfully copied.
 //
 //go:noescape
-func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+func memcpy(dst, src uintptr, n uintptr) (fault uintptr, sig int32)
 
 // memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS
 // signal is received during the write, it returns the address that caused the
@@ -47,7 +48,7 @@ func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32
 // successfully written.
 //
 //go:noescape
-func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+func memclr(ptr uintptr, n uintptr) (fault uintptr, sig int32)
 
 // swapUint32 atomically stores new into *ptr and returns (the previous *ptr
 // value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
@@ -90,29 +91,35 @@ func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
 // CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes
 // copied and an error if SIGSEGV or SIGBUS is received while reading from src.
 func CopyIn(dst []byte, src unsafe.Pointer) (int, error) {
+	n, err := copyIn(dst, uintptr(src))
+	runtime.KeepAlive(src)
+	return n, err
+}
+
+// copyIn is the underlying definition for CopyIn.
+func copyIn(dst []byte, src uintptr) (int, error) {
 	toCopy := uintptr(len(dst))
 	if len(dst) == 0 {
 		return 0, nil
 	}
 
-	fault, sig := memcpy(unsafe.Pointer(&dst[0]), src, toCopy)
+	fault, sig := memcpy(uintptr(unsafe.Pointer(&dst[0])), src, toCopy)
 	if sig == 0 {
 		return len(dst), nil
 	}
 
-	faultN, srcN := uintptr(fault), uintptr(src)
-	if faultN < srcN || faultN >= srcN+toCopy {
-		panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, faultN, srcN, srcN+toCopy))
+	if fault < src || fault >= src+toCopy {
+		panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, fault, src, src+toCopy))
 	}
 
 	// memcpy might have ended the copy up to maxRegisterSize bytes before
 	// fault, if an instruction caused a memory access that straddled two
 	// pages, and the second one faulted. Try to copy up to the fault.
 	var done int
-	if faultN-srcN > maxRegisterSize {
-		done = int(faultN - srcN - maxRegisterSize)
+	if fault-src > maxRegisterSize {
+		done = int(fault - src - maxRegisterSize)
 	}
-	n, err := CopyIn(dst[done:int(faultN-srcN)], unsafe.Pointer(srcN+uintptr(done)))
+	n, err := copyIn(dst[done:int(fault-src)], src+uintptr(done))
 	done += n
 	if err != nil {
 		return done, err
@@ -124,29 +131,35 @@ func CopyIn(dst []byte, src unsafe.Pointer) (int, error) {
 // bytes done and an error if SIGSEGV or SIGBUS is received while writing to
 // dst.
 func CopyOut(dst unsafe.Pointer, src []byte) (int, error) {
+	n, err := copyOut(uintptr(dst), src)
+	runtime.KeepAlive(dst)
+	return n, err
+}
+
+// copyOut is the underlying definition for CopyOut.
+func copyOut(dst uintptr, src []byte) (int, error) {
 	toCopy := uintptr(len(src))
 	if toCopy == 0 {
 		return 0, nil
 	}
 
-	fault, sig := memcpy(dst, unsafe.Pointer(&src[0]), toCopy)
+	fault, sig := memcpy(dst, uintptr(unsafe.Pointer(&src[0])), toCopy)
 	if sig == 0 {
 		return len(src), nil
 	}
 
-	faultN, dstN := uintptr(fault), uintptr(dst)
-	if faultN < dstN || faultN >= dstN+toCopy {
-		panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toCopy))
+	if fault < dst || fault >= dst+toCopy {
+		panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, fault, dst, dst+toCopy))
 	}
 
 	// memcpy might have ended the copy up to maxRegisterSize bytes before
 	// fault, if an instruction caused a memory access that straddled two
 	// pages, and the second one faulted. Try to copy up to the fault.
 	var done int
-	if faultN-dstN > maxRegisterSize {
-		done = int(faultN - dstN - maxRegisterSize)
+	if fault-dst > maxRegisterSize {
+		done = int(fault - dst - maxRegisterSize)
 	}
-	n, err := CopyOut(unsafe.Pointer(dstN+uintptr(done)), src[done:int(faultN-dstN)])
+	n, err := copyOut(dst+uintptr(done), src[done:int(fault-dst)])
 	done += n
 	if err != nil {
 		return done, err
@@ -161,6 +174,14 @@ func CopyOut(dst unsafe.Pointer, src []byte) (int, error) {
 // Data is copied in order; if [src, src+toCopy) and [dst, dst+toCopy) overlap,
 // the resulting contents of dst are unspecified.
 func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) {
+	n, err := copyN(uintptr(dst), uintptr(src), toCopy)
+	runtime.KeepAlive(dst)
+	runtime.KeepAlive(src)
+	return n, err
+}
+
+// copyN is the underlying definition for Copy.
+func copyN(dst, src uintptr, toCopy uintptr) (uintptr, error) {
 	if toCopy == 0 {
 		return 0, nil
 	}
@@ -171,17 +192,16 @@ func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) {
 	}
 
 	// Did the fault occur while reading from src or writing to dst?
-	faultN, srcN, dstN := uintptr(fault), uintptr(src), uintptr(dst)
 	faultAfterSrc := ^uintptr(0)
-	if faultN >= srcN {
-		faultAfterSrc = faultN - srcN
+	if fault >= src {
+		faultAfterSrc = fault - src
 	}
 	faultAfterDst := ^uintptr(0)
-	if faultN >= dstN {
-		faultAfterDst = faultN - dstN
+	if fault >= dst {
+		faultAfterDst = fault - dst
 	}
 	if faultAfterSrc >= toCopy && faultAfterDst >= toCopy {
-		panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, faultN, srcN, srcN+toCopy, dstN, dstN+toCopy))
+		panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, fault, src, src+toCopy, dst, dst+toCopy))
 	}
 	faultedAfter := faultAfterSrc
 	if faultedAfter > faultAfterDst {
@@ -195,7 +215,7 @@ func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) {
 	if faultedAfter > maxRegisterSize {
 		done = faultedAfter - maxRegisterSize
 	}
-	n, err := Copy(unsafe.Pointer(dstN+done), unsafe.Pointer(srcN+done), faultedAfter-done)
+	n, err := copyN(dst+done, src+done, faultedAfter-done)
 	done += n
 	if err != nil {
 		return done, err
@@ -206,6 +226,13 @@ func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) {
 // ZeroOut writes toZero zero bytes to dst. It returns the number of bytes
 // written and an error if SIGSEGV or SIGBUS is received while writing to dst.
 func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) {
+	n, err := zeroOut(uintptr(dst), toZero)
+	runtime.KeepAlive(dst)
+	return n, err
+}
+
+// zeroOut is the underlying definition for ZeroOut.
+func zeroOut(dst uintptr, toZero uintptr) (uintptr, error) {
 	if toZero == 0 {
 		return 0, nil
 	}
@@ -215,19 +242,18 @@ func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) {
 		return toZero, nil
 	}
 
-	faultN, dstN := uintptr(fault), uintptr(dst)
-	if faultN < dstN || faultN >= dstN+toZero {
-		panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toZero))
+	if fault < dst || fault >= dst+toZero {
+		panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, fault, dst, dst+toZero))
 	}
 
 	// memclr might have ended the write up to maxRegisterSize bytes before
 	// fault, if an instruction caused a memory access that straddled two
 	// pages, and the second one faulted. Try to write up to the fault.
 	var done uintptr
-	if faultN-dstN > maxRegisterSize {
-		done = faultN - dstN - maxRegisterSize
+	if fault-dst > maxRegisterSize {
+		done = fault - dst - maxRegisterSize
 	}
-	n, err := ZeroOut(unsafe.Pointer(dstN+done), faultN-dstN-done)
+	n, err := zeroOut(dst+done, fault-dst-done)
 	done += n
 	if err != nil {
 		return done, err
@@ -243,7 +269,7 @@ func SwapUint32(ptr unsafe.Pointer, new uint32) (uint32, error) {
 		return 0, AlignmentError{addr, 4}
 	}
 	old, sig := swapUint32(ptr, new)
-	return old, errorFromFaultSignal(ptr, sig)
+	return old, errorFromFaultSignal(uintptr(ptr), sig)
 }
 
 // SwapUint64 is equivalent to sync/atomic.SwapUint64, except that it returns
@@ -254,7 +280,7 @@ func SwapUint64(ptr unsafe.Pointer, new uint64) (uint64, error) {
 		return 0, AlignmentError{addr, 8}
 	}
 	old, sig := swapUint64(ptr, new)
-	return old, errorFromFaultSignal(ptr, sig)
+	return old, errorFromFaultSignal(uintptr(ptr), sig)
 }
 
 // CompareAndSwapUint32 is equivalent to atomicbitops.CompareAndSwapUint32,
@@ -265,7 +291,7 @@ func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) {
 		return 0, AlignmentError{addr, 4}
 	}
 	prev, sig := compareAndSwapUint32(ptr, old, new)
-	return prev, errorFromFaultSignal(ptr, sig)
+	return prev, errorFromFaultSignal(uintptr(ptr), sig)
 }
 
 // LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It
@@ -277,17 +303,17 @@ func LoadUint32(ptr unsafe.Pointer) (uint32, error) {
 		return 0, AlignmentError{addr, 4}
 	}
 	val, sig := loadUint32(ptr)
-	return val, errorFromFaultSignal(ptr, sig)
+	return val, errorFromFaultSignal(uintptr(ptr), sig)
 }
 
-func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error {
+func errorFromFaultSignal(addr uintptr, sig int32) error {
 	switch sig {
 	case 0:
 		return nil
 	case int32(syscall.SIGSEGV):
-		return SegvError{uintptr(addr)}
+		return SegvError{addr}
 	case int32(syscall.SIGBUS):
-		return BusError{uintptr(addr)}
+		return BusError{addr}
 	default:
 		panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr))
 	}
-- 
cgit v1.2.3


From c5d4041623ac6405135e966af6d06c178a86870d Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Tue, 4 Feb 2020 12:53:10 -0800
Subject: Include socket_ip_udp_loopback.cc in exportes_files

So it can be included in fuchsia's syscall tests

PiperOrigin-RevId: 293208306
---
 test/syscalls/linux/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index e4ca5b6db..737e2329f 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -11,6 +11,7 @@ exports_files(
         "socket_inet_loopback.cc",
         "socket_ip_loopback_blocking.cc",
         "socket_ip_tcp_loopback.cc",
+        "socket_ip_udp_loopback.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
         "tcp_socket.cc",
         "udp_socket.cc",
-- 
cgit v1.2.3


From 6823b5e244a5748032130574ae3a25a0a36bbbf5 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 4 Feb 2020 13:05:30 -0800
Subject: timer_create(2) should return 0 on success

The timer ID is copied out to the argument.

Fixes #1738

PiperOrigin-RevId: 293210801
---
 pkg/sentry/syscalls/linux/sys_timer.go |  2 +-
 test/syscalls/linux/timers.cc          | 18 +++++++++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go
index 432351917..a4c400f87 100644
--- a/pkg/sentry/syscalls/linux/sys_timer.go
+++ b/pkg/sentry/syscalls/linux/sys_timer.go
@@ -146,7 +146,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 		return 0, nil, err
 	}
 
-	return uintptr(id), nil, nil
+	return 0, nil, nil
 }
 
 // TimerSettime implements linux syscall timer_settime(2).
diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc
index 3db18d7ac..2f92c27da 100644
--- a/test/syscalls/linux/timers.cc
+++ b/test/syscalls/linux/timers.cc
@@ -297,9 +297,13 @@ class IntervalTimer {
 PosixErrorOr<IntervalTimer> TimerCreate(clockid_t clockid,
                                         const struct sigevent& sev) {
   int timerid;
-  if (syscall(SYS_timer_create, clockid, &sev, &timerid) < 0) {
+  int ret = syscall(SYS_timer_create, clockid, &sev, &timerid);
+  if (ret < 0) {
     return PosixError(errno, "timer_create");
   }
+  if (ret > 0) {
+    return PosixError(EINVAL, "timer_create should never return positive");
+  }
   MaybeSave();
   return IntervalTimer(timerid);
 }
@@ -317,6 +321,18 @@ TEST(IntervalTimerTest, IsInitiallyStopped) {
   EXPECT_EQ(0, its.it_value.tv_nsec);
 }
 
+// Kernel can create multiple timers without issue.
+//
+// Regression test for gvisor.dev/issue/1738.
+TEST(IntervalTimerTest, MultipleTimers) {
+  struct sigevent sev = {};
+  sev.sigev_notify = SIGEV_NONE;
+  const auto timer1 =
+      ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+  const auto timer2 =
+      ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+}
+
 TEST(IntervalTimerTest, SingleShotSilent) {
   struct sigevent sev = {};
   sev.sigev_notify = SIGEV_NONE;
-- 
cgit v1.2.3


From 6d8bf405bc5e887247534172713bf7d2f5252734 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 4 Feb 2020 13:15:05 -0800
Subject: Allow mlock in fsgofer system call filters

Go 1.14 has a workaround for a Linux 5.2-5.4 bug which requires mlock'ing the g
stack to prevent register corruption. We need to allow this syscall until it is
removed from Go.

PiperOrigin-RevId: 293212935
---
 runsc/fsgofer/filter/config.go | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
index a1792330f..1dce36965 100644
--- a/runsc/fsgofer/filter/config.go
+++ b/runsc/fsgofer/filter/config.go
@@ -128,6 +128,18 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_MADVISE:      {},
 	unix.SYS_MEMFD_CREATE:    {}, /// Used by flipcall.PacketWindowAllocator.Init().
 	syscall.SYS_MKDIRAT:      {},
+	// Used by the Go runtime as a temporarily workaround for a Linux
+	// 5.2-5.4 bug.
+	//
+	// See src/runtime/os_linux_x86.go.
+	//
+	// TODO(b/148688965): Remove once this is gone from Go.
+	syscall.SYS_MLOCK: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(4096),
+		},
+	},
 	syscall.SYS_MMAP: []seccomp.Rule{
 		{
 			seccomp.AllowAny{},
-- 
cgit v1.2.3


From 95ce8bb4c7ecb23e47e68c60b1de0b99ad8a856d Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 4 Feb 2020 14:36:43 -0800
Subject: Automatically propagate tags for stateify and marshal.

Note that files will need to be appropriately segmented in order for the
mechanism to work, in suffixes implying special tags. This only needs to happen
for cases where marshal or state structures are defined, which should be rare
and mostly architecture specific.

PiperOrigin-RevId: 293231579
---
 tools/build/defs.bzl                    |   2 +
 tools/build/tags.bzl                    |  36 ++++++++++
 tools/defs.bzl                          | 105 ++++++++++++++++++---------
 tools/go_marshal/gomarshal/BUILD        |   1 +
 tools/go_marshal/gomarshal/generator.go |  11 +++
 tools/go_stateify/BUILD                 |   1 +
 tools/go_stateify/defs.bzl              |  10 +--
 tools/go_stateify/main.go               | 122 +++-----------------------------
 tools/tags/BUILD                        |  11 +++
 tools/tags/tags.go                      |  89 +++++++++++++++++++++++
 10 files changed, 235 insertions(+), 153 deletions(-)
 create mode 100644 tools/build/tags.bzl
 create mode 100644 tools/tags/BUILD
 create mode 100644 tools/tags/tags.go

diff --git a/tools/build/defs.bzl b/tools/build/defs.bzl
index 967c1f900..1a1a0d825 100644
--- a/tools/build/defs.bzl
+++ b/tools/build/defs.bzl
@@ -8,6 +8,7 @@ load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar")
 load("@io_bazel_rules_docker//go:image.bzl", _go_image = "go_image")
 load("@io_bazel_rules_docker//container:container.bzl", _container_image = "container_image")
 load("@pydeps//:requirements.bzl", _py_requirement = "requirement")
+load("//tools/build:tags.bzl", _go_suffixes = "go_suffixes")
 
 container_image = _container_image
 cc_binary = _cc_binary
@@ -18,6 +19,7 @@ cc_test = _cc_test
 cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
 go_image = _go_image
 go_embed_data = _go_embed_data
+go_suffixes = _go_suffixes
 gtest = "@com_google_googletest//:gtest"
 loopback = "//tools/build:loopback"
 proto_library = native.proto_library
diff --git a/tools/build/tags.bzl b/tools/build/tags.bzl
new file mode 100644
index 000000000..e99c87f81
--- /dev/null
+++ b/tools/build/tags.bzl
@@ -0,0 +1,36 @@
+"""List of special Go suffixes."""
+
+go_suffixes = [
+    "_386",
+    "_386_unsafe",
+    "_amd64",
+    "_amd64_unsafe",
+    "_aarch64",
+    "_aarch64_unsafe",
+    "_arm",
+    "_arm_unsafe",
+    "_arm64",
+    "_arm64_unsafe",
+    "_mips",
+    "_mips_unsafe",
+    "_mipsle",
+    "_mipsle_unsafe",
+    "_mips64",
+    "_mips64_unsafe",
+    "_mips64le",
+    "_mips64le_unsafe",
+    "_ppc64",
+    "_ppc64_unsafe",
+    "_ppc64le",
+    "_ppc64le_unsafe",
+    "_riscv64",
+    "_riscv64_unsafe",
+    "_s390x",
+    "_s390x_unsafe",
+    "_sparc64",
+    "_sparc64_unsafe",
+    "_wasm",
+    "_wasm_unsafe",
+    "_linux",
+    "_linux_unsafe",
+]
diff --git a/tools/defs.bzl b/tools/defs.bzl
index ce677cbbf..5d5fa134a 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,7 +7,7 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/build:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/build:defs.bzl", "go_suffixes", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
 
 # Delegate directly.
 cc_binary = _cc_binary
@@ -45,6 +45,34 @@ def go_binary(name, **kwargs):
         **kwargs
     )
 
+def calculate_sets(srcs):
+    """Calculates special Go sets for templates.
+
+    Args:
+      srcs: the full set of Go sources.
+
+    Returns:
+      A dictionary of the form:
+
+      "": [src1.go, src2.go]
+      "suffix": [src3suffix.go, src4suffix.go]
+
+      Note that suffix will typically start with '_'.
+    """
+    result = dict()
+    for file in srcs:
+        if not file.endswith(".go"):
+            continue
+        target = ""
+        for suffix in go_suffixes:
+            if file.endswith(suffix + ".go"):
+                target = suffix
+        if not target in result:
+            result[target] = [file]
+        else:
+            result[target].append(file)
+    return result
+
 def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs):
     """Wraps the standard go_library and does stateification and marshalling.
 
@@ -70,39 +98,49 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
       marshal: whether marshal is enabled (default: false).
       **kwargs: standard go_library arguments.
     """
+    all_srcs = srcs
+    all_deps = deps
     if stateify:
         # Only do stateification for non-state packages without manual autogen.
-        go_stateify(
-            name = name + "_state_autogen",
-            srcs = [src for src in srcs if src.endswith(".go")],
-            imports = imports,
-            package = name,
-            arch = select_arch(),
-            out = name + "_state_autogen.go",
-        )
-        all_srcs = srcs + [name + "_state_autogen.go"]
-        if "//pkg/state" not in deps:
-            all_deps = deps + ["//pkg/state"]
-        else:
-            all_deps = deps
-    else:
-        all_deps = deps
-        all_srcs = srcs
+        # First, we need to segregate the input files via the special suffixes,
+        # and calculate the final output set.
+        state_sets = calculate_sets(srcs)
+        for (suffix, srcs) in state_sets.items():
+            go_stateify(
+                name = name + suffix + "_state_autogen",
+                srcs = srcs,
+                imports = imports,
+                package = name,
+                out = name + suffix + "_state_autogen.go",
+            )
+        all_srcs = all_srcs + [
+            name + suffix + "_state_autogen.go"
+            for suffix in state_sets.keys()
+        ]
+        if "//pkg/state" not in all_deps:
+            all_deps = all_deps + ["//pkg/state"]
+
     if marshal:
-        go_marshal(
-            name = name + "_abi_autogen",
-            srcs = [src for src in srcs if src.endswith(".go")],
-            debug = False,
-            imports = imports,
-            package = name,
-        )
+        # See above.
+        marshal_sets = calculate_sets(srcs)
+        for (suffix, srcs) in marshal_sets.items():
+            go_marshal(
+                name = name + suffix + "_abi_autogen",
+                srcs = srcs,
+                debug = False,
+                imports = imports,
+                package = name,
+            )
         extra_deps = [
             dep
             for dep in marshal_deps
             if not dep in all_deps
         ]
         all_deps = all_deps + extra_deps
-        all_srcs = srcs + [name + "_abi_autogen_unsafe.go"]
+        all_srcs = all_srcs + [
+            name + suffix + "_abi_autogen_unsafe.go"
+            for suffix in marshal_sets.keys()
+        ]
 
     _go_library(
         name = name,
@@ -115,13 +153,16 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
         # Ignore importpath for go_test.
         kwargs.pop("importpath", None)
 
-        _go_test(
-            name = name + "_abi_autogen_test",
-            srcs = [name + "_abi_autogen_test.go"],
-            library = ":" + name,
-            deps = marshal_test_deps,
-            **kwargs
-        )
+        # See above.
+        marshal_sets = calculate_sets(srcs)
+        for (suffix, srcs) in marshal_sets.items():
+            _go_test(
+                name = name + suffix + "_abi_autogen_test",
+                srcs = [name + suffix + "_abi_autogen_test.go"],
+                library = ":" + name + suffix,
+                deps = marshal_test_deps,
+                **kwargs
+            )
 
 def proto_library(name, srcs, **kwargs):
     """Wraps the standard proto_library.
diff --git a/tools/go_marshal/gomarshal/BUILD b/tools/go_marshal/gomarshal/BUILD
index c92b59dd6..b5d5a4487 100644
--- a/tools/go_marshal/gomarshal/BUILD
+++ b/tools/go_marshal/gomarshal/BUILD
@@ -14,4 +14,5 @@ go_library(
     visibility = [
         "//:sandbox",
     ],
+    deps = ["//tools/tags"],
 )
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index af90bdecb..0b3f600fe 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -23,6 +23,9 @@ import (
 	"go/token"
 	"os"
 	"sort"
+	"strings"
+
+	"gvisor.dev/gvisor/tools/tags"
 )
 
 const (
@@ -104,6 +107,14 @@ func NewGenerator(srcs []string, out, outTest, pkg string, imports []string) (*G
 func (g *Generator) writeHeader() error {
 	var b sourceBuffer
 	b.emit("// Automatically generated marshal implementation. See tools/go_marshal.\n\n")
+
+	// Emit build tags.
+	if t := tags.Aggregate(g.inputs); len(t) > 0 {
+		b.emit(strings.Join(t.Lines(), "\n"))
+		b.emit("\n")
+	}
+
+	// Package header.
 	b.emit("package %s\n\n", g.pkg)
 	if err := b.write(g.output); err != nil {
 		return err
diff --git a/tools/go_stateify/BUILD b/tools/go_stateify/BUILD
index a133d6f8b..6036faf7b 100644
--- a/tools/go_stateify/BUILD
+++ b/tools/go_stateify/BUILD
@@ -6,4 +6,5 @@ go_binary(
     name = "stateify",
     srcs = ["main.go"],
     visibility = ["//visibility:public"],
+    deps = ["//tools/tags"],
 )
diff --git a/tools/go_stateify/defs.bzl b/tools/go_stateify/defs.bzl
index 0f261d89f..bdb966362 100644
--- a/tools/go_stateify/defs.bzl
+++ b/tools/go_stateify/defs.bzl
@@ -7,7 +7,6 @@ def _go_stateify_impl(ctx):
     # Run the stateify command.
     args = ["-output=%s" % output.path]
     args.append("-pkg=%s" % ctx.attr.package)
-    args.append("-arch=%s" % ctx.attr.arch)
     if ctx.attr._statepkg:
         args.append("-statepkg=%s" % ctx.attr._statepkg)
     if ctx.attr.imports:
@@ -47,15 +46,8 @@ for statified types.
             doc = "The package name for the input sources.",
             mandatory = True,
         ),
-        "arch": attr.string(
-            doc = "Target platform.",
-            mandatory = True,
-        ),
         "out": attr.output(
-            doc = """
-The name of the generated file output. This must not conflict with any other
-files and must be added to the srcs of the relevant go_library.
-""",
+            doc = "Name of the generator output file.",
             mandatory = True,
         ),
         "_tool": attr.label(
diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go
index 7d5d291e6..aa9d4543e 100644
--- a/tools/go_stateify/main.go
+++ b/tools/go_stateify/main.go
@@ -22,12 +22,12 @@ import (
 	"go/ast"
 	"go/parser"
 	"go/token"
-	"io/ioutil"
 	"os"
-	"path/filepath"
 	"reflect"
 	"strings"
 	"sync"
+
+	"gvisor.dev/gvisor/tools/tags"
 )
 
 var (
@@ -35,113 +35,8 @@ var (
 	imports  = flag.String("imports", "", "extra imports for the output file")
 	output   = flag.String("output", "", "output file")
 	statePkg = flag.String("statepkg", "", "state import package; defaults to empty")
-	arch     = flag.String("arch", "", "specify the target platform")
 )
 
-// The known architectures.
-var okgoarch = []string{
-	"386",
-	"amd64",
-	"arm",
-	"arm64",
-	"mips",
-	"mipsle",
-	"mips64",
-	"mips64le",
-	"ppc64",
-	"ppc64le",
-	"riscv64",
-	"s390x",
-	"sparc64",
-	"wasm",
-}
-
-// readfile returns the content of the named file.
-func readfile(file string) string {
-	data, err := ioutil.ReadFile(file)
-	if err != nil {
-		panic(fmt.Sprintf("readfile err: %v", err))
-	}
-	return string(data)
-}
-
-// matchfield reports whether the field (x,y,z) matches this build.
-// all the elements in the field must be satisfied.
-func matchfield(f string, goarch string) bool {
-	for _, tag := range strings.Split(f, ",") {
-		if !matchtag(tag, goarch) {
-			return false
-		}
-	}
-	return true
-}
-
-// matchtag reports whether the tag (x or !x) matches this build.
-func matchtag(tag string, goarch string) bool {
-	if tag == "" {
-		return false
-	}
-	if tag[0] == '!' {
-		if len(tag) == 1 || tag[1] == '!' {
-			return false
-		}
-		return !matchtag(tag[1:], goarch)
-	}
-	return tag == goarch
-}
-
-// canBuild reports whether we can build this file for target platform by
-// checking file name and build tags. The code is derived from the Go source
-// cmd.dist.build.shouldbuild.
-func canBuild(file, goTargetArch string) bool {
-	name := filepath.Base(file)
-	excluded := func(list []string, ok string) bool {
-		for _, x := range list {
-			if x == ok || (ok == "android" && x == "linux") || (ok == "illumos" && x == "solaris") {
-				continue
-			}
-			i := strings.Index(name, x)
-			if i <= 0 || name[i-1] != '_' {
-				continue
-			}
-			i += len(x)
-			if i == len(name) || name[i] == '.' || name[i] == '_' {
-				return true
-			}
-		}
-		return false
-	}
-	if excluded(okgoarch, goTargetArch) {
-		return false
-	}
-
-	// Check file contents for // +build lines.
-	for _, p := range strings.Split(readfile(file), "\n") {
-		p = strings.TrimSpace(p)
-		if p == "" {
-			continue
-		}
-		if !strings.HasPrefix(p, "//") {
-			break
-		}
-		if !strings.Contains(p, "+build") {
-			continue
-		}
-		fields := strings.Fields(p[2:])
-		if len(fields) < 1 || fields[0] != "+build" {
-			continue
-		}
-		for _, p := range fields[1:] {
-			if matchfield(p, goTargetArch) {
-				goto fieldmatch
-			}
-		}
-		return false
-	fieldmatch:
-	}
-	return true
-}
-
 // resolveTypeName returns a qualified type name.
 func resolveTypeName(name string, typ ast.Expr) (field string, qualified string) {
 	for done := false; !done; {
@@ -329,8 +224,15 @@ func main() {
 		fmt.Fprintf(outputFile, "	m.Save(\"%s\", &x.%s)\n", name, name)
 	}
 
-	// Emit the package name.
+	// Automated warning.
 	fmt.Fprint(outputFile, "// automatically generated by stateify.\n\n")
+
+	// Emit build tags.
+	if t := tags.Aggregate(flag.Args()); len(t) > 0 {
+		fmt.Fprintf(outputFile, "%s\n\n", strings.Join(t.Lines(), "\n"))
+	}
+
+	// Emit the package name.
 	fmt.Fprintf(outputFile, "package %s\n\n", *pkg)
 
 	// Emit the imports lazily.
@@ -364,10 +266,6 @@ func main() {
 			os.Exit(1)
 		}
 
-		if !canBuild(filename, *arch) {
-			continue
-		}
-
 		files = append(files, f)
 	}
 
diff --git a/tools/tags/BUILD b/tools/tags/BUILD
new file mode 100644
index 000000000..1c02e2c89
--- /dev/null
+++ b/tools/tags/BUILD
@@ -0,0 +1,11 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "tags",
+    srcs = ["tags.go"],
+    marshal = False,
+    stateify = False,
+    visibility = ["//tools:__subpackages__"],
+)
diff --git a/tools/tags/tags.go b/tools/tags/tags.go
new file mode 100644
index 000000000..f35904e0a
--- /dev/null
+++ b/tools/tags/tags.go
@@ -0,0 +1,89 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package tags is a utility for parsing build tags.
+package tags
+
+import (
+	"fmt"
+	"io/ioutil"
+	"strings"
+)
+
+// OrSet is a set of tags on a single line.
+//
+// Note that tags may include ",", and we don't distinguish this case in the
+// logic below. Ideally, this constraints can be split into separate top-level
+// build tags in order to resolve any issues.
+type OrSet []string
+
+// Line returns the line for this or.
+func (or OrSet) Line() string {
+	return fmt.Sprintf("// +build %s", strings.Join([]string(or), " "))
+}
+
+// AndSet is the set of all OrSets.
+type AndSet []OrSet
+
+// Lines returns the lines to be printed.
+func (and AndSet) Lines() (ls []string) {
+	for _, or := range and {
+		ls = append(ls, or.Line())
+	}
+	return
+}
+
+// Join joins this AndSet with another.
+func (and AndSet) Join(other AndSet) AndSet {
+	return append(and, other...)
+}
+
+// Tags returns the unique set of +build tags.
+//
+// Derived form the runtime's canBuild.
+func Tags(file string) (tags AndSet) {
+	data, err := ioutil.ReadFile(file)
+	if err != nil {
+		return nil
+	}
+	// Check file contents for // +build lines.
+	for _, p := range strings.Split(string(data), "\n") {
+		p = strings.TrimSpace(p)
+		if p == "" {
+			continue
+		}
+		if !strings.HasPrefix(p, "//") {
+			break
+		}
+		if !strings.Contains(p, "+build") {
+			continue
+		}
+		fields := strings.Fields(p[2:])
+		if len(fields) < 1 || fields[0] != "+build" {
+			continue
+		}
+		tags = append(tags, OrSet(fields[1:]))
+	}
+	return tags
+}
+
+// Aggregate aggregates all tags from a set of files.
+//
+// Note that these may be in conflict, in which case the build will fail.
+func Aggregate(files []string) (tags AndSet) {
+	for _, file := range files {
+		tags = tags.Join(Tags(file))
+	}
+	return tags
+}
-- 
cgit v1.2.3


From 3f5642c5afdb8e633287ba10c2cb6b00f1849570 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 4 Feb 2020 15:15:43 -0800
Subject: Increase container_test size.

container_test was flaking because a small percentage of runs timed out. Tested
this fix with --runs_per_test=100.

PiperOrigin-RevId: 293240102
---
 runsc/container/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index e21431e4c..0aaeea3a8 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -30,7 +30,7 @@ go_library(
 
 go_test(
     name = "container_test",
-    size = "medium",
+    size = "large",
     srcs = [
         "console_test.go",
         "container_test.go",
-- 
cgit v1.2.3


From a26a954946ad2e7910d3ad7578960a93b73a1f9b Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 4 Feb 2020 15:20:30 -0800
Subject: Add socket connection stress test.

Tests 65k connection attempts on common types of sockets to check for port
leaks.

Also fixes a bug where dual-stack sockets wouldn't properly re-queue
segments received while closing.

PiperOrigin-RevId: 293241166
---
 pkg/tcpip/transport/tcp/connect.go           |   4 ++
 test/syscalls/BUILD                          |   9 +++
 test/syscalls/linux/BUILD                    |  17 +++++
 test/syscalls/linux/ip_socket_test_util.cc   |  27 +++++++
 test/syscalls/linux/ip_socket_test_util.h    |  15 ++++
 test/syscalls/linux/socket_generic_stress.cc |  83 ++++++++++++++++++++++
 test/syscalls/linux/socket_test_util.cc      | 101 ++++++++++++++++++++++++---
 test/syscalls/linux/socket_test_util.h       |   6 ++
 8 files changed, 251 insertions(+), 11 deletions(-)
 create mode 100644 test/syscalls/linux/socket_generic_stress.cc

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 9ff7ac261..5c5397823 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -989,6 +989,10 @@ func (e *endpoint) transitionToStateCloseLocked() {
 // to any other listening endpoint. We reply with RST if we cannot find one.
 func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 	ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.ID, &s.route)
+	if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.EndpointInfo.TransportEndpointInfo.ID.LocalAddress.To4() != "" {
+		// Dual-stack socket, try IPv4.
+		ep = e.stack.FindTransportEndpoint(header.IPv4ProtocolNumber, e.TransProto, e.ID, &s.route)
+	}
 	if ep == nil {
 		replyWithReset(s)
 		s.decRef()
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 8f2b75a1c..31d239c0e 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -45,6 +45,15 @@ syscall_test(test = "//test/syscalls/linux:brk_test")
 
 syscall_test(test = "//test/syscalls/linux:socket_test")
 
+syscall_test(
+    size = "large",
+    shard_count = 50,
+    # Takes too long for TSAN. Since this is kind of a stress test that doesn't
+    # involve much concurrency, TSAN's usefulness here is limited anyway.
+    tags = ["nogotsan"],
+    test = "//test/syscalls/linux:socket_stress_test",
+)
+
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:chdir_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 737e2329f..273b014d6 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -136,6 +136,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
+        "@com_google_absl//absl/types:optional",
         "//test/util:file_descriptor",
         "//test/util:posix_error",
         "//test/util:temp_path",
@@ -2151,6 +2152,22 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_binary(
+    name = "socket_stress_test",
+    testonly = 1,
+    srcs = [
+        "socket_generic_stress.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_test_util",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
 cc_library(
     name = "socket_unix_dgram_test_cases",
     testonly = 1,
diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index 6b472eb2f..bba022a41 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -79,6 +79,33 @@ SocketPairKind DualStackTCPAcceptBindSocketPair(int type) {
                                      /* dual_stack = */ true)};
 }
 
+SocketPairKind IPv6TCPAcceptBindPersistentListenerSocketPair(int type) {
+  std::string description =
+      absl::StrCat(DescribeSocketType(type), "connected IPv6 TCP socket");
+  return SocketPairKind{description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP,
+                        TCPAcceptBindPersistentListenerSocketPairCreator(
+                            AF_INET6, type | SOCK_STREAM, 0,
+                            /* dual_stack = */ false)};
+}
+
+SocketPairKind IPv4TCPAcceptBindPersistentListenerSocketPair(int type) {
+  std::string description =
+      absl::StrCat(DescribeSocketType(type), "connected IPv4 TCP socket");
+  return SocketPairKind{description, AF_INET, type | SOCK_STREAM, IPPROTO_TCP,
+                        TCPAcceptBindPersistentListenerSocketPairCreator(
+                            AF_INET, type | SOCK_STREAM, 0,
+                            /* dual_stack = */ false)};
+}
+
+SocketPairKind DualStackTCPAcceptBindPersistentListenerSocketPair(int type) {
+  std::string description =
+      absl::StrCat(DescribeSocketType(type), "connected dual stack TCP socket");
+  return SocketPairKind{description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP,
+                        TCPAcceptBindPersistentListenerSocketPairCreator(
+                            AF_INET6, type | SOCK_STREAM, 0,
+                            /* dual_stack = */ true)};
+}
+
 SocketPairKind IPv6UDPBidirectionalBindSocketPair(int type) {
   std::string description =
       absl::StrCat(DescribeSocketType(type), "connected IPv6 UDP socket");
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 0f58e0f77..083ebbcf0 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -50,6 +50,21 @@ SocketPairKind IPv4TCPAcceptBindSocketPair(int type);
 // given type bound to the IPv4 loopback.
 SocketPairKind DualStackTCPAcceptBindSocketPair(int type);
 
+// IPv6TCPAcceptBindPersistentListenerSocketPair is like
+// IPv6TCPAcceptBindSocketPair except it uses a persistent listening socket to
+// create all socket pairs.
+SocketPairKind IPv6TCPAcceptBindPersistentListenerSocketPair(int type);
+
+// IPv4TCPAcceptBindPersistentListenerSocketPair is like
+// IPv4TCPAcceptBindSocketPair except it uses a persistent listening socket to
+// create all socket pairs.
+SocketPairKind IPv4TCPAcceptBindPersistentListenerSocketPair(int type);
+
+// DualStackTCPAcceptBindPersistentListenerSocketPair is like
+// DualStackTCPAcceptBindSocketPair except it uses a persistent listening socket
+// to create all socket pairs.
+SocketPairKind DualStackTCPAcceptBindPersistentListenerSocketPair(int type);
+
 // IPv6UDPBidirectionalBindSocketPair returns a SocketPairKind that represents
 // SocketPairs created with bind() and connect() syscalls with AF_INET6 and the
 // given type bound to the IPv6 loopback.
diff --git a/test/syscalls/linux/socket_generic_stress.cc b/test/syscalls/linux/socket_generic_stress.cc
new file mode 100644
index 000000000..6a232238d
--- /dev/null
+++ b/test/syscalls/linux/socket_generic_stress.cc
@@ -0,0 +1,83 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected sockets.
+using ConnectStressTest = SocketPairTest;
+
+TEST_P(ConnectStressTest, Reset65kTimes) {
+  for (int i = 0; i < 1 << 16; ++i) {
+    auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+    // Send some data to ensure that the connection gets reset and the port gets
+    // released immediately. This avoids either end entering TIME-WAIT.
+    char sent_data[100] = {};
+    ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)),
+                SyscallSucceedsWithValue(sizeof(sent_data)));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllConnectedSockets, ConnectStressTest,
+    ::testing::Values(IPv6UDPBidirectionalBindSocketPair(0),
+                      IPv4UDPBidirectionalBindSocketPair(0),
+                      DualStackUDPBidirectionalBindSocketPair(0),
+
+                      // Without REUSEADDR, we get port exhaustion on Linux.
+                      SetSockOpt(SOL_SOCKET, SO_REUSEADDR,
+                                 &kSockOptOn)(IPv6TCPAcceptBindSocketPair(0)),
+                      SetSockOpt(SOL_SOCKET, SO_REUSEADDR,
+                                 &kSockOptOn)(IPv4TCPAcceptBindSocketPair(0)),
+                      SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+                          DualStackTCPAcceptBindSocketPair(0))));
+
+// Test fixture for tests that apply to pairs of connected sockets created with
+// a persistent listener (if applicable).
+using PersistentListenerConnectStressTest = SocketPairTest;
+
+TEST_P(PersistentListenerConnectStressTest, 65kTimes) {
+  for (int i = 0; i < 1 << 16; ++i) {
+    auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllConnectedSockets, PersistentListenerConnectStressTest,
+    ::testing::Values(
+        IPv6UDPBidirectionalBindSocketPair(0),
+        IPv4UDPBidirectionalBindSocketPair(0),
+        DualStackUDPBidirectionalBindSocketPair(0),
+
+        // Without REUSEADDR, we get port exhaustion on Linux.
+        SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+            IPv6TCPAcceptBindPersistentListenerSocketPair(0)),
+        SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+            IPv4TCPAcceptBindPersistentListenerSocketPair(0)),
+        SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+            DualStackTCPAcceptBindPersistentListenerSocketPair(0))));
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
index eff7d577e..c0c5ab3fe 100644
--- a/test/syscalls/linux/socket_test_util.cc
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -18,10 +18,13 @@
 #include <poll.h>
 #include <sys/socket.h>
 
+#include <memory>
+
 #include "gtest/gtest.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/time/clock.h"
+#include "absl/types/optional.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
@@ -109,7 +112,10 @@ Creator<SocketPair> AcceptBindSocketPairCreator(bool abstract, int domain,
       MaybeSave();  // Unlinked path.
     }
 
-    return absl::make_unique<AddrFDSocketPair>(connected, accepted, bind_addr,
+    // accepted is before connected to destruct connected before accepted.
+    // Destructors for nonstatic member objects are called in the reverse order
+    // in which they appear in the class declaration.
+    return absl::make_unique<AddrFDSocketPair>(accepted, connected, bind_addr,
                                                extra_addr);
   };
 }
@@ -311,11 +317,16 @@ PosixErrorOr<T> BindIP(int fd, bool dual_stack) {
 }
 
 template <typename T>
-PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateTCPAcceptBindSocketPair(
-    int bound, int connected, int type, bool dual_stack) {
-  ASSIGN_OR_RETURN_ERRNO(T bind_addr, BindIP<T>(bound, dual_stack));
-  RETURN_ERROR_IF_SYSCALL_FAIL(listen(bound, /* backlog = */ 5));
+PosixErrorOr<T> TCPBindAndListen(int fd, bool dual_stack) {
+  ASSIGN_OR_RETURN_ERRNO(T addr, BindIP<T>(fd, dual_stack));
+  RETURN_ERROR_IF_SYSCALL_FAIL(listen(fd, /* backlog = */ 5));
+  return addr;
+}
 
+template <typename T>
+PosixErrorOr<std::unique_ptr<AddrFDSocketPair>>
+CreateTCPConnectAcceptSocketPair(int bound, int connected, int type,
+                                 bool dual_stack, T bind_addr) {
   int connect_result = 0;
   RETURN_ERROR_IF_SYSCALL_FAIL(
       (connect_result = RetryEINTR(connect)(
@@ -358,16 +369,27 @@ PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateTCPAcceptBindSocketPair(
     absl::SleepFor(absl::Seconds(1));
   }
 
-  // Cleanup no longer needed resources.
-  RETURN_ERROR_IF_SYSCALL_FAIL(close(bound));
-  MaybeSave();  // Successful close.
-
   T extra_addr = {};
   LocalhostAddr(&extra_addr, dual_stack);
   return absl::make_unique<AddrFDSocketPair>(connected, accepted, bind_addr,
                                              extra_addr);
 }
 
+template <typename T>
+PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateTCPAcceptBindSocketPair(
+    int bound, int connected, int type, bool dual_stack) {
+  ASSIGN_OR_RETURN_ERRNO(T bind_addr, TCPBindAndListen<T>(bound, dual_stack));
+
+  auto result = CreateTCPConnectAcceptSocketPair(bound, connected, type,
+                                                 dual_stack, bind_addr);
+
+  // Cleanup no longer needed resources.
+  RETURN_ERROR_IF_SYSCALL_FAIL(close(bound));
+  MaybeSave();  // Successful close.
+
+  return result;
+}
+
 Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
                                                    int protocol,
                                                    bool dual_stack) {
@@ -389,6 +411,63 @@ Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
   };
 }
 
+Creator<SocketPair> TCPAcceptBindPersistentListenerSocketPairCreator(
+    int domain, int type, int protocol, bool dual_stack) {
+  // These are lazily initialized below, on the first call to the returned
+  // lambda. These values are private to each returned lambda, but shared across
+  // invocations of a specific lambda.
+  //
+  // The sharing allows pairs created with the same parameters to share a
+  // listener. This prevents future connects from failing if the connecting
+  // socket selects a port which had previously been used by a listening socket
+  // that still has some connections in TIME-WAIT.
+  //
+  // The lazy initialization is to avoid creating sockets during parameter
+  // enumeration. This is important because parameters are enumerated during the
+  // build process where networking may not be available.
+  auto listener = std::make_shared<absl::optional<int>>(absl::optional<int>());
+  auto addr4 = std::make_shared<absl::optional<sockaddr_in>>(
+      absl::optional<sockaddr_in>());
+  auto addr6 = std::make_shared<absl::optional<sockaddr_in6>>(
+      absl::optional<sockaddr_in6>());
+
+  return [=]() -> PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> {
+    int connected;
+    RETURN_ERROR_IF_SYSCALL_FAIL(connected = socket(domain, type, protocol));
+    MaybeSave();  // Successful socket creation.
+
+    // Share the listener across invocations.
+    if (!listener->has_value()) {
+      int fd = socket(domain, type, protocol);
+      if (fd < 0) {
+        return PosixError(errno, absl::StrCat("socket(", domain, ", ", type,
+                                              ", ", protocol, ")"));
+      }
+      listener->emplace(fd);
+      MaybeSave();  // Successful socket creation.
+    }
+
+    // Bind the listener once, but create a new connect/accept pair each
+    // time.
+    if (domain == AF_INET) {
+      if (!addr4->has_value()) {
+        addr4->emplace(
+            TCPBindAndListen<sockaddr_in>(listener->value(), dual_stack)
+                .ValueOrDie());
+      }
+      return CreateTCPConnectAcceptSocketPair(listener->value(), connected,
+                                              type, dual_stack, addr4->value());
+    }
+    if (!addr6->has_value()) {
+      addr6->emplace(
+          TCPBindAndListen<sockaddr_in6>(listener->value(), dual_stack)
+              .ValueOrDie());
+    }
+    return CreateTCPConnectAcceptSocketPair(listener->value(), connected, type,
+                                            dual_stack, addr6->value());
+  };
+}
+
 template <typename T>
 PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateUDPBoundSocketPair(
     int sock1, int sock2, int type, bool dual_stack) {
@@ -518,8 +597,8 @@ size_t CalculateUnixSockAddrLen(const char* sun_path) {
   if (sun_path[0] == 0) {
     return sizeof(sockaddr_un);
   }
-  // Filesystem addresses use the address length plus the 2 byte sun_family and
-  // null terminator.
+  // Filesystem addresses use the address length plus the 2 byte sun_family
+  // and null terminator.
   return strlen(sun_path) + 3;
 }
 
diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h
index 2dbb8bed3..bfaa6e397 100644
--- a/test/syscalls/linux/socket_test_util.h
+++ b/test/syscalls/linux/socket_test_util.h
@@ -273,6 +273,12 @@ Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
                                                    int protocol,
                                                    bool dual_stack);
 
+// TCPAcceptBindPersistentListenerSocketPairCreator is like
+// TCPAcceptBindSocketPairCreator, except it uses the same listening socket to
+// create all SocketPairs.
+Creator<SocketPair> TCPAcceptBindPersistentListenerSocketPairCreator(
+    int domain, int type, int protocol, bool dual_stack);
+
 // UDPBidirectionalBindSocketPairCreator returns a Creator<SocketPair> that
 // obtains file descriptors by invoking the bind() and connect() syscalls on UDP
 // sockets.
-- 
cgit v1.2.3


From 665b614e4a6e715bac25bea15c5c29184016e549 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Tue, 4 Feb 2020 18:04:26 -0800
Subject: Support RTM_NEWADDR and RTM_GETLINK in (rt)netlink.

PiperOrigin-RevId: 293271055
---
 pkg/sentry/inet/inet.go                      |   4 +
 pkg/sentry/inet/test_stack.go                |   6 +
 pkg/sentry/socket/hostinet/stack.go          |   5 +
 pkg/sentry/socket/netlink/BUILD              |  14 +-
 pkg/sentry/socket/netlink/message.go         | 129 +++++++++++
 pkg/sentry/socket/netlink/message_test.go    | 312 +++++++++++++++++++++++++++
 pkg/sentry/socket/netlink/provider.go        |   2 +-
 pkg/sentry/socket/netlink/route/BUILD        |   2 -
 pkg/sentry/socket/netlink/route/protocol.go  | 238 ++++++++++++++------
 pkg/sentry/socket/netlink/socket.go          |  54 ++---
 pkg/sentry/socket/netlink/uevent/protocol.go |   2 +-
 pkg/sentry/socket/netstack/stack.go          |  55 +++++
 pkg/tcpip/stack/stack.go                     |   9 +
 test/syscalls/linux/BUILD                    |   2 +
 test/syscalls/linux/socket_netlink_route.cc  | 296 ++++++++++++++++++++-----
 test/syscalls/linux/socket_netlink_util.cc   |  45 +++-
 test/syscalls/linux/socket_netlink_util.h    |   9 +
 17 files changed, 1022 insertions(+), 162 deletions(-)
 create mode 100644 pkg/sentry/socket/netlink/message_test.go

diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index a7dfb78a7..2916a0644 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -28,6 +28,10 @@ type Stack interface {
 	// interface indexes to a slice of associated interface address properties.
 	InterfaceAddrs() map[int32][]InterfaceAddr
 
+	// AddInterfaceAddr adds an address to the network interface identified by
+	// index.
+	AddInterfaceAddr(idx int32, addr InterfaceAddr) error
+
 	// SupportsIPv6 returns true if the stack supports IPv6 connectivity.
 	SupportsIPv6() bool
 
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index dcfcbd97e..d8961fc94 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -47,6 +47,12 @@ func (s *TestStack) InterfaceAddrs() map[int32][]InterfaceAddr {
 	return s.InterfaceAddrsMap
 }
 
+// AddInterfaceAddr implements Stack.AddInterfaceAddr.
+func (s *TestStack) AddInterfaceAddr(idx int32, addr InterfaceAddr) error {
+	s.InterfaceAddrsMap[idx] = append(s.InterfaceAddrsMap[idx], addr)
+	return nil
+}
+
 // SupportsIPv6 implements Stack.SupportsIPv6.
 func (s *TestStack) SupportsIPv6() bool {
 	return s.SupportsIPv6Flag
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index 034eca676..a48082631 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -310,6 +310,11 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
 	return addrs
 }
 
+// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
+func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+	return syserror.EACCES
+}
+
 // SupportsIPv6 implements inet.Stack.SupportsIPv6.
 func (s *Stack) SupportsIPv6() bool {
 	return s.supportsIPv6
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index f8b8e467d..1911cd9b8 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -33,3 +33,15 @@ go_library(
         "//pkg/waiter",
     ],
 )
+
+go_test(
+    name = "netlink_test",
+    size = "small",
+    srcs = [
+        "message_test.go",
+    ],
+    deps = [
+        ":netlink",
+        "//pkg/abi/linux",
+    ],
+)
diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go
index b21e0ca4b..4ea252ccb 100644
--- a/pkg/sentry/socket/netlink/message.go
+++ b/pkg/sentry/socket/netlink/message.go
@@ -30,8 +30,16 @@ func alignUp(length int, align uint) int {
 	return (length + int(align) - 1) &^ (int(align) - 1)
 }
 
+// alignPad returns the length of padding required for alignment.
+//
+// Preconditions: align is a power of two.
+func alignPad(length int, align uint) int {
+	return alignUp(length, align) - length
+}
+
 // Message contains a complete serialized netlink message.
 type Message struct {
+	hdr linux.NetlinkMessageHeader
 	buf []byte
 }
 
@@ -40,10 +48,86 @@ type Message struct {
 // The header length will be updated by Finalize.
 func NewMessage(hdr linux.NetlinkMessageHeader) *Message {
 	return &Message{
+		hdr: hdr,
 		buf: binary.Marshal(nil, usermem.ByteOrder, hdr),
 	}
 }
 
+// ParseMessage parses the first message seen at buf, returning the rest of the
+// buffer. If message is malformed, ok of false is returned. For last message,
+// padding check is loose, if there isn't enought padding, whole buf is consumed
+// and ok is set to true.
+func ParseMessage(buf []byte) (msg *Message, rest []byte, ok bool) {
+	b := BytesView(buf)
+
+	hdrBytes, ok := b.Extract(linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return
+	}
+	var hdr linux.NetlinkMessageHeader
+	binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr)
+
+	// Msg portion.
+	totalMsgLen := int(hdr.Length)
+	_, ok = b.Extract(totalMsgLen - linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return
+	}
+
+	// Padding.
+	numPad := alignPad(totalMsgLen, linux.NLMSG_ALIGNTO)
+	// Linux permits the last message not being aligned, just consume all of it.
+	// Ref: net/netlink/af_netlink.c:netlink_rcv_skb
+	if numPad > len(b) {
+		numPad = len(b)
+	}
+	_, ok = b.Extract(numPad)
+	if !ok {
+		return
+	}
+
+	return &Message{
+		hdr: hdr,
+		buf: buf[:totalMsgLen],
+	}, []byte(b), true
+}
+
+// Header returns the header of this message.
+func (m *Message) Header() linux.NetlinkMessageHeader {
+	return m.hdr
+}
+
+// GetData unmarshals the payload message header from this netlink message, and
+// returns the attributes portion.
+func (m *Message) GetData(msg interface{}) (AttrsView, bool) {
+	b := BytesView(m.buf)
+
+	_, ok := b.Extract(linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return nil, false
+	}
+
+	size := int(binary.Size(msg))
+	msgBytes, ok := b.Extract(size)
+	if !ok {
+		return nil, false
+	}
+	binary.Unmarshal(msgBytes, usermem.ByteOrder, msg)
+
+	numPad := alignPad(linux.NetlinkMessageHeaderSize+size, linux.NLMSG_ALIGNTO)
+	// Linux permits the last message not being aligned, just consume all of it.
+	// Ref: net/netlink/af_netlink.c:netlink_rcv_skb
+	if numPad > len(b) {
+		numPad = len(b)
+	}
+	_, ok = b.Extract(numPad)
+	if !ok {
+		return nil, false
+	}
+
+	return AttrsView(b), true
+}
+
 // Finalize returns the []byte containing the entire message, with the total
 // length set in the message header. The Message must not be modified after
 // calling Finalize.
@@ -157,3 +241,48 @@ func (ms *MessageSet) AddMessage(hdr linux.NetlinkMessageHeader) *Message {
 	ms.Messages = append(ms.Messages, m)
 	return m
 }
+
+// AttrsView is a view into the attributes portion of a netlink message.
+type AttrsView []byte
+
+// Empty returns whether there is no attribute left in v.
+func (v AttrsView) Empty() bool {
+	return len(v) == 0
+}
+
+// ParseFirst parses first netlink attribute at the beginning of v.
+func (v AttrsView) ParseFirst() (hdr linux.NetlinkAttrHeader, value []byte, rest AttrsView, ok bool) {
+	b := BytesView(v)
+
+	hdrBytes, ok := b.Extract(linux.NetlinkAttrHeaderSize)
+	if !ok {
+		return
+	}
+	binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr)
+
+	value, ok = b.Extract(int(hdr.Length) - linux.NetlinkAttrHeaderSize)
+	if !ok {
+		return
+	}
+
+	_, ok = b.Extract(alignPad(int(hdr.Length), linux.NLA_ALIGNTO))
+	if !ok {
+		return
+	}
+
+	return hdr, value, AttrsView(b), ok
+}
+
+// BytesView supports extracting data from a byte slice with bounds checking.
+type BytesView []byte
+
+// Extract removes the first n bytes from v and returns it. If n is out of
+// bounds, it returns false.
+func (v *BytesView) Extract(n int) ([]byte, bool) {
+	if n < 0 || n > len(*v) {
+		return nil, false
+	}
+	extracted := (*v)[:n]
+	*v = (*v)[n:]
+	return extracted, true
+}
diff --git a/pkg/sentry/socket/netlink/message_test.go b/pkg/sentry/socket/netlink/message_test.go
new file mode 100644
index 000000000..ef13d9386
--- /dev/null
+++ b/pkg/sentry/socket/netlink/message_test.go
@@ -0,0 +1,312 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package message_test
+
+import (
+	"bytes"
+	"reflect"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
+)
+
+type dummyNetlinkMsg struct {
+	Foo uint16
+}
+
+func TestParseMessage(t *testing.T) {
+	tests := []struct {
+		desc  string
+		input []byte
+
+		header  linux.NetlinkMessageHeader
+		dataMsg *dummyNetlinkMsg
+		restLen int
+		ok      bool
+	}{
+		{
+			desc: "valid",
+			input: []byte{
+				0x14, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 20,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "valid with next message",
+			input: []byte{
+				0x14, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+				0xFF, // Next message (rest)
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 20,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 1,
+			ok:      true,
+		},
+		{
+			desc: "valid for last message without padding",
+			input: []byte{
+				0x12, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, // Data message
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 18,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "valid for last message not to be aligned",
+			input: []byte{
+				0x13, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, // Data message
+				0x00, // Excessive 1 byte permitted at end
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 19,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "header.Length too short",
+			input: []byte{
+				0x04, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			ok: false,
+		},
+		{
+			desc: "header.Length too long",
+			input: []byte{
+				0xFF, 0xFF, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			ok: false,
+		},
+		{
+			desc: "header incomplete",
+			input: []byte{
+				0x04, 0x00, 0x00, 0x00, // Length
+			},
+			ok: false,
+		},
+		{
+			desc:  "empty message",
+			input: []byte{},
+			ok:    false,
+		},
+	}
+	for _, test := range tests {
+		msg, rest, ok := netlink.ParseMessage(test.input)
+		if ok != test.ok {
+			t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok)
+			continue
+		}
+		if !test.ok {
+			continue
+		}
+		if !reflect.DeepEqual(msg.Header(), test.header) {
+			t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, msg.Header(), test.header)
+		}
+
+		dataMsg := &dummyNetlinkMsg{}
+		_, dataOk := msg.GetData(dataMsg)
+		if !dataOk {
+			t.Errorf("%v: GetData.ok = %v, want = true", test.desc, dataOk)
+		} else if !reflect.DeepEqual(dataMsg, test.dataMsg) {
+			t.Errorf("%v: GetData.msg = %+v, want = %+v", test.desc, dataMsg, test.dataMsg)
+		}
+
+		if got, want := rest, test.input[len(test.input)-test.restLen:]; !bytes.Equal(got, want) {
+			t.Errorf("%v: got rest = %v, want = %v", test.desc, got, want)
+		}
+	}
+}
+
+func TestAttrView(t *testing.T) {
+	tests := []struct {
+		desc  string
+		input []byte
+
+		// Outputs for ParseFirst.
+		hdr     linux.NetlinkAttrHeader
+		value   []byte
+		restLen int
+		ok      bool
+
+		// Outputs for Empty.
+		isEmpty bool
+	}{
+		{
+			desc: "valid",
+			input: []byte{
+				0x06, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x00, 0x00, // Data with 2 bytes padding
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 6,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31},
+			restLen: 0,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "at alignment",
+			input: []byte{
+				0x08, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 8,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31, 0x32, 0x33},
+			restLen: 0,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "at alignment with rest data",
+			input: []byte{
+				0x08, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+				0xFF, 0xFE, // Rest data
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 8,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31, 0x32, 0x33},
+			restLen: 2,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "hdr.Length too long",
+			input: []byte{
+				0xFF, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			ok:      false,
+			isEmpty: false,
+		},
+		{
+			desc: "hdr.Length too short",
+			input: []byte{
+				0x01, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			ok:      false,
+			isEmpty: false,
+		},
+		{
+			desc:    "empty",
+			input:   []byte{},
+			ok:      false,
+			isEmpty: true,
+		},
+	}
+	for _, test := range tests {
+		attrs := netlink.AttrsView(test.input)
+
+		// Test ParseFirst().
+		hdr, value, rest, ok := attrs.ParseFirst()
+		if ok != test.ok {
+			t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok)
+		} else if test.ok {
+			if !reflect.DeepEqual(hdr, test.hdr) {
+				t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, hdr, test.hdr)
+			}
+			if !bytes.Equal(value, test.value) {
+				t.Errorf("%v: got value = %v, want = %v", test.desc, value, test.value)
+			}
+			if wantRest := test.input[len(test.input)-test.restLen:]; !bytes.Equal(rest, wantRest) {
+				t.Errorf("%v: got rest = %v, want = %v", test.desc, rest, wantRest)
+			}
+		}
+
+		// Test Empty().
+		if got, want := attrs.Empty(), test.isEmpty; got != want {
+			t.Errorf("%v: got empty = %v, want = %v", test.desc, got, want)
+		}
+	}
+}
diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go
index 07f860a49..b0dc70e5c 100644
--- a/pkg/sentry/socket/netlink/provider.go
+++ b/pkg/sentry/socket/netlink/provider.go
@@ -42,7 +42,7 @@ type Protocol interface {
 	// If err == nil, any messages added to ms will be sent back to the
 	// other end of the socket. Setting ms.Multi will cause an NLMSG_DONE
 	// message to be sent even if ms contains no messages.
-	ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *MessageSet) *syserr.Error
+	ProcessMessage(ctx context.Context, msg *Message, ms *MessageSet) *syserr.Error
 }
 
 // Provider is a function that creates a new Protocol for a specific netlink
diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD
index 622a1eafc..93127398d 100644
--- a/pkg/sentry/socket/netlink/route/BUILD
+++ b/pkg/sentry/socket/netlink/route/BUILD
@@ -10,13 +10,11 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/context",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket/netlink",
         "//pkg/syserr",
-        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index 2b3c7f5b3..c84d8bd7c 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -17,16 +17,15 @@ package route
 
 import (
 	"bytes"
+	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	"gvisor.dev/gvisor/pkg/syserr"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // commandKind describes the operational class of a message type.
@@ -69,13 +68,7 @@ func (p *Protocol) CanSend() bool {
 }
 
 // dumpLinks handles RTM_GETLINK dump requests.
-func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
-	// TODO(b/68878065): Only the dump variant of the types below are
-	// supported.
-	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
-		return syserr.ErrNotSupported
-	}
-
+func (p *Protocol) dumpLinks(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// NLM_F_DUMP + RTM_GETLINK messages are supposed to include an
 	// ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some
 	// userspace applications (including glibc) still include rtgenmsg.
@@ -99,44 +92,105 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader
 		return nil
 	}
 
-	for id, i := range stack.Interfaces() {
-		m := ms.AddMessage(linux.NetlinkMessageHeader{
-			Type: linux.RTM_NEWLINK,
-		})
+	for idx, i := range stack.Interfaces() {
+		addNewLinkMessage(ms, idx, i)
+	}
 
-		m.Put(linux.InterfaceInfoMessage{
-			Family: linux.AF_UNSPEC,
-			Type:   i.DeviceType,
-			Index:  id,
-			Flags:  i.Flags,
-		})
+	return nil
+}
 
-		m.PutAttrString(linux.IFLA_IFNAME, i.Name)
-		m.PutAttr(linux.IFLA_MTU, i.MTU)
+// getLinks handles RTM_GETLINK requests.
+func (p *Protocol) getLink(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	stack := inet.StackFromContext(ctx)
+	if stack == nil {
+		// No network devices.
+		return nil
+	}
 
-		mac := make([]byte, 6)
-		brd := mac
-		if len(i.Addr) > 0 {
-			mac = i.Addr
-			brd = bytes.Repeat([]byte{0xff}, len(i.Addr))
+	// Parse message.
+	var ifi linux.InterfaceInfoMessage
+	attrs, ok := msg.GetData(&ifi)
+	if !ok {
+		return syserr.ErrInvalidArgument
+	}
+
+	// Parse attributes.
+	var byName []byte
+	for !attrs.Empty() {
+		ahdr, value, rest, ok := attrs.ParseFirst()
+		if !ok {
+			return syserr.ErrInvalidArgument
 		}
-		m.PutAttr(linux.IFLA_ADDRESS, mac)
-		m.PutAttr(linux.IFLA_BROADCAST, brd)
+		attrs = rest
 
-		// TODO(gvisor.dev/issue/578): There are many more attributes.
+		switch ahdr.Type {
+		case linux.IFLA_IFNAME:
+			if len(value) < 1 {
+				return syserr.ErrInvalidArgument
+			}
+			byName = value[:len(value)-1]
+
+			// TODO(gvisor.dev/issue/578): Support IFLA_EXT_MASK.
+		}
 	}
 
+	found := false
+	for idx, i := range stack.Interfaces() {
+		switch {
+		case ifi.Index > 0:
+			if idx != ifi.Index {
+				continue
+			}
+		case byName != nil:
+			if string(byName) != i.Name {
+				continue
+			}
+		default:
+			// Criteria not specified.
+			return syserr.ErrInvalidArgument
+		}
+
+		addNewLinkMessage(ms, idx, i)
+		found = true
+		break
+	}
+	if !found {
+		return syserr.ErrNoDevice
+	}
 	return nil
 }
 
-// dumpAddrs handles RTM_GETADDR dump requests.
-func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
-	// TODO(b/68878065): Only the dump variant of the types below are
-	// supported.
-	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
-		return syserr.ErrNotSupported
+// addNewLinkMessage appends RTM_NEWLINK message for the given interface into
+// the message set.
+func addNewLinkMessage(ms *netlink.MessageSet, idx int32, i inet.Interface) {
+	m := ms.AddMessage(linux.NetlinkMessageHeader{
+		Type: linux.RTM_NEWLINK,
+	})
+
+	m.Put(linux.InterfaceInfoMessage{
+		Family: linux.AF_UNSPEC,
+		Type:   i.DeviceType,
+		Index:  idx,
+		Flags:  i.Flags,
+	})
+
+	m.PutAttrString(linux.IFLA_IFNAME, i.Name)
+	m.PutAttr(linux.IFLA_MTU, i.MTU)
+
+	mac := make([]byte, 6)
+	brd := mac
+	if len(i.Addr) > 0 {
+		mac = i.Addr
+		brd = bytes.Repeat([]byte{0xff}, len(i.Addr))
 	}
+	m.PutAttr(linux.IFLA_ADDRESS, mac)
+	m.PutAttr(linux.IFLA_BROADCAST, brd)
+
+	// TODO(gvisor.dev/issue/578): There are many more attributes.
+}
 
+// dumpAddrs handles RTM_GETADDR dump requests.
+func (p *Protocol) dumpAddrs(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// RTM_GETADDR dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
@@ -168,6 +222,7 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader
 				Index:     uint32(id),
 			})
 
+			m.PutAttr(linux.IFA_LOCAL, []byte(a.Addr))
 			m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr))
 
 			// TODO(gvisor.dev/issue/578): There are many more attributes.
@@ -252,12 +307,12 @@ func fillRoute(routes []inet.Route, addr []byte) (inet.Route, *syserr.Error) {
 }
 
 // parseForDestination parses a message as format of RouteMessage-RtAttr-dst.
-func parseForDestination(data []byte) ([]byte, *syserr.Error) {
+func parseForDestination(msg *netlink.Message) ([]byte, *syserr.Error) {
 	var rtMsg linux.RouteMessage
-	if len(data) < linux.SizeOfRouteMessage {
+	attrs, ok := msg.GetData(&rtMsg)
+	if !ok {
 		return nil, syserr.ErrInvalidArgument
 	}
-	binary.Unmarshal(data[:linux.SizeOfRouteMessage], usermem.ByteOrder, &rtMsg)
 	// iproute2 added the RTM_F_LOOKUP_TABLE flag in version v4.4.0. See
 	// commit bc234301af12. Note we don't check this flag for backward
 	// compatibility.
@@ -265,26 +320,15 @@ func parseForDestination(data []byte) ([]byte, *syserr.Error) {
 		return nil, syserr.ErrNotSupported
 	}
 
-	data = data[linux.SizeOfRouteMessage:]
-
-	// TODO(gvisor.dev/issue/1611): Add generic attribute parsing.
-	var rtAttr linux.RtAttr
-	if len(data) < linux.SizeOfRtAttr {
-		return nil, syserr.ErrInvalidArgument
+	// Expect first attribute is RTA_DST.
+	if hdr, value, _, ok := attrs.ParseFirst(); ok && hdr.Type == linux.RTA_DST {
+		return value, nil
 	}
-	binary.Unmarshal(data[:linux.SizeOfRtAttr], usermem.ByteOrder, &rtAttr)
-	if rtAttr.Type != linux.RTA_DST {
-		return nil, syserr.ErrInvalidArgument
-	}
-
-	if len(data) < int(rtAttr.Len) {
-		return nil, syserr.ErrInvalidArgument
-	}
-	return data[linux.SizeOfRtAttr:rtAttr.Len], nil
+	return nil, syserr.ErrInvalidArgument
 }
 
 // dumpRoutes handles RTM_GETROUTE requests.
-func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) dumpRoutes(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// RTM_GETROUTE dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
@@ -295,10 +339,11 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade
 		return nil
 	}
 
+	hdr := msg.Header()
 	routeTables := stack.RouteTable()
 
 	if hdr.Flags == linux.NLM_F_REQUEST {
-		dst, err := parseForDestination(data)
+		dst, err := parseForDestination(msg)
 		if err != nil {
 			return err
 		}
@@ -357,10 +402,55 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade
 	return nil
 }
 
+// newAddr handles RTM_NEWADDR requests.
+func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	stack := inet.StackFromContext(ctx)
+	if stack == nil {
+		// No network stack.
+		return syserr.ErrProtocolNotSupported
+	}
+
+	var ifa linux.InterfaceAddrMessage
+	attrs, ok := msg.GetData(&ifa)
+	if !ok {
+		return syserr.ErrInvalidArgument
+	}
+
+	for !attrs.Empty() {
+		ahdr, value, rest, ok := attrs.ParseFirst()
+		if !ok {
+			return syserr.ErrInvalidArgument
+		}
+		attrs = rest
+
+		switch ahdr.Type {
+		case linux.IFA_LOCAL:
+			err := stack.AddInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{
+				Family:    ifa.Family,
+				PrefixLen: ifa.PrefixLen,
+				Flags:     ifa.Flags,
+				Addr:      value,
+			})
+			if err == syscall.EEXIST {
+				flags := msg.Header().Flags
+				if flags&linux.NLM_F_EXCL != 0 {
+					return syserr.ErrExists
+				}
+			} else if err != nil {
+				return syserr.ErrInvalidArgument
+			}
+		}
+	}
+	return nil
+}
+
 // ProcessMessage implements netlink.Protocol.ProcessMessage.
-func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	hdr := msg.Header()
+
 	// All messages start with a 1 byte protocol family.
-	if len(data) < 1 {
+	var family uint8
+	if _, ok := msg.GetData(&family); !ok {
 		// Linux ignores messages missing the protocol family. See
 		// net/core/rtnetlink.c:rtnetlink_rcv_msg.
 		return nil
@@ -374,16 +464,32 @@ func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageH
 		}
 	}
 
-	switch hdr.Type {
-	case linux.RTM_GETLINK:
-		return p.dumpLinks(ctx, hdr, data, ms)
-	case linux.RTM_GETADDR:
-		return p.dumpAddrs(ctx, hdr, data, ms)
-	case linux.RTM_GETROUTE:
-		return p.dumpRoutes(ctx, hdr, data, ms)
-	default:
-		return syserr.ErrNotSupported
+	if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP {
+		// TODO(b/68878065): Only the dump variant of the types below are
+		// supported.
+		switch hdr.Type {
+		case linux.RTM_GETLINK:
+			return p.dumpLinks(ctx, msg, ms)
+		case linux.RTM_GETADDR:
+			return p.dumpAddrs(ctx, msg, ms)
+		case linux.RTM_GETROUTE:
+			return p.dumpRoutes(ctx, msg, ms)
+		default:
+			return syserr.ErrNotSupported
+		}
+	} else if hdr.Flags&linux.NLM_F_REQUEST == linux.NLM_F_REQUEST {
+		switch hdr.Type {
+		case linux.RTM_GETLINK:
+			return p.getLink(ctx, msg, ms)
+		case linux.RTM_GETROUTE:
+			return p.dumpRoutes(ctx, msg, ms)
+		case linux.RTM_NEWADDR:
+			return p.newAddr(ctx, msg, ms)
+		default:
+			return syserr.ErrNotSupported
+		}
 	}
+	return syserr.ErrNotSupported
 }
 
 // init registers the NETLINK_ROUTE provider.
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index c4b95debb..2ca02567d 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -644,47 +644,38 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error
 	return nil
 }
 
-func (s *Socket) dumpErrorMesage(ctx context.Context, hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) *syserr.Error {
+func dumpErrorMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) {
 	m := ms.AddMessage(linux.NetlinkMessageHeader{
 		Type: linux.NLMSG_ERROR,
 	})
-
 	m.Put(linux.NetlinkErrorMessage{
 		Error:  int32(-err.ToLinux().Number()),
 		Header: hdr,
 	})
-	return nil
+}
 
+func dumpAckMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet) {
+	m := ms.AddMessage(linux.NetlinkMessageHeader{
+		Type: linux.NLMSG_ERROR,
+	})
+	m.Put(linux.NetlinkErrorMessage{
+		Error:  0,
+		Header: hdr,
+	})
 }
 
 // processMessages handles each message in buf, passing it to the protocol
 // handler for final handling.
 func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error {
 	for len(buf) > 0 {
-		if len(buf) < linux.NetlinkMessageHeaderSize {
+		msg, rest, ok := ParseMessage(buf)
+		if !ok {
 			// Linux ignores messages that are too short. See
 			// net/netlink/af_netlink.c:netlink_rcv_skb.
 			break
 		}
-
-		var hdr linux.NetlinkMessageHeader
-		binary.Unmarshal(buf[:linux.NetlinkMessageHeaderSize], usermem.ByteOrder, &hdr)
-
-		if hdr.Length < linux.NetlinkMessageHeaderSize || uint64(hdr.Length) > uint64(len(buf)) {
-			// Linux ignores malformed messages. See
-			// net/netlink/af_netlink.c:netlink_rcv_skb.
-			break
-		}
-
-		// Data from this message.
-		data := buf[linux.NetlinkMessageHeaderSize:hdr.Length]
-
-		// Advance to the next message.
-		next := alignUp(int(hdr.Length), linux.NLMSG_ALIGNTO)
-		if next >= len(buf)-1 {
-			next = len(buf) - 1
-		}
-		buf = buf[next:]
+		buf = rest
+		hdr := msg.Header()
 
 		// Ignore control messages.
 		if hdr.Type < linux.NLMSG_MIN_TYPE {
@@ -692,19 +683,10 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error
 		}
 
 		ms := NewMessageSet(s.portID, hdr.Seq)
-		var err *syserr.Error
-		// TODO(b/68877377): ACKs not supported yet.
-		if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
-			err = syserr.ErrNotSupported
-		} else {
-
-			err = s.protocol.ProcessMessage(ctx, hdr, data, ms)
-		}
-		if err != nil {
-			ms = NewMessageSet(s.portID, hdr.Seq)
-			if err := s.dumpErrorMesage(ctx, hdr, ms, err); err != nil {
-				return err
-			}
+		if err := s.protocol.ProcessMessage(ctx, msg, ms); err != nil {
+			dumpErrorMesage(hdr, ms, err)
+		} else if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
+			dumpAckMesage(hdr, ms)
 		}
 
 		if err := s.sendResponse(ctx, ms); err != nil {
diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go
index 1ee4296bc..029ba21b5 100644
--- a/pkg/sentry/socket/netlink/uevent/protocol.go
+++ b/pkg/sentry/socket/netlink/uevent/protocol.go
@@ -49,7 +49,7 @@ func (p *Protocol) CanSend() bool {
 }
 
 // ProcessMessage implements netlink.Protocol.ProcessMessage.
-func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// Silently ignore all messages.
 	return nil
 }
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index 31ea66eca..0692482e9 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -20,6 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
 	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -88,6 +90,59 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
 	return nicAddrs
 }
 
+// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
+func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+	var (
+		protocol tcpip.NetworkProtocolNumber
+		address  tcpip.Address
+	)
+	switch addr.Family {
+	case linux.AF_INET:
+		if len(addr.Addr) < header.IPv4AddressSize {
+			return syserror.EINVAL
+		}
+		if addr.PrefixLen > header.IPv4AddressSize*8 {
+			return syserror.EINVAL
+		}
+		protocol = ipv4.ProtocolNumber
+		address = tcpip.Address(addr.Addr[:header.IPv4AddressSize])
+
+	case linux.AF_INET6:
+		if len(addr.Addr) < header.IPv6AddressSize {
+			return syserror.EINVAL
+		}
+		if addr.PrefixLen > header.IPv6AddressSize*8 {
+			return syserror.EINVAL
+		}
+		protocol = ipv6.ProtocolNumber
+		address = tcpip.Address(addr.Addr[:header.IPv6AddressSize])
+
+	default:
+		return syserror.ENOTSUP
+	}
+
+	protocolAddress := tcpip.ProtocolAddress{
+		Protocol: protocol,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   address,
+			PrefixLen: int(addr.PrefixLen),
+		},
+	}
+
+	// Attach address to interface.
+	if err := s.Stack.AddProtocolAddressWithOptions(tcpip.NICID(idx), protocolAddress, stack.CanBePrimaryEndpoint); err != nil {
+		return syserr.TranslateNetstackError(err).ToError()
+	}
+
+	// Add route for local network.
+	s.Stack.AddRoute(tcpip.Route{
+		Destination: protocolAddress.AddressWithPrefix.Subnet(),
+		Gateway:     "", // No gateway for local network.
+		NIC:         tcpip.NICID(idx),
+	})
+	return nil
+}
+
 // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize.
 func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
 	var rs tcp.ReceiveBufferSizeOption
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 7057b110e..b793f1d74 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -795,6 +795,8 @@ func (s *Stack) Forwarding() bool {
 
 // SetRouteTable assigns the route table to be used by this stack. It
 // specifies which NIC to use for given destination address ranges.
+//
+// This method takes ownership of the table.
 func (s *Stack) SetRouteTable(table []tcpip.Route) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -809,6 +811,13 @@ func (s *Stack) GetRouteTable() []tcpip.Route {
 	return append([]tcpip.Route(nil), s.routeTable...)
 }
 
+// AddRoute appends a route to the route table.
+func (s *Stack) AddRoute(route tcpip.Route) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.routeTable = append(s.routeTable, route)
+}
+
 // NewEndpoint creates a new transport layer endpoint of the given protocol.
 func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	t, ok := s.transportProtocols[transport]
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 273b014d6..f2e3c7072 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2769,9 +2769,11 @@ cc_binary(
     deps = [
         ":socket_netlink_util",
         ":socket_test_util",
+        "//test/util:capability_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:optional",
         gtest,
         "//test/util:test_main",
         "//test/util:test_util",
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index 1e28e658d..e5aed1eec 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -14,6 +14,7 @@
 
 #include <arpa/inet.h>
 #include <ifaddrs.h>
+#include <linux/if.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 #include <sys/socket.h>
@@ -25,8 +26,10 @@
 
 #include "gtest/gtest.h"
 #include "absl/strings/str_format.h"
+#include "absl/types/optional.h"
 #include "test/syscalls/linux/socket_netlink_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
 #include "test/util/cleanup.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/test_util.h"
@@ -38,6 +41,8 @@ namespace testing {
 
 namespace {
 
+constexpr uint32_t kSeq = 12345;
+
 using ::testing::AnyOf;
 using ::testing::Eq;
 
@@ -113,58 +118,224 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
   // TODO(mpratt): Check ifinfomsg contents and following attrs.
 }
 
+PosixError DumpLinks(
+    const FileDescriptor& fd, uint32_t seq,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_seq = seq;
+  req.ifm.ifi_family = AF_UNSPEC;
+
+  return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false);
+}
+
 TEST(NetlinkRouteTest, GetLinkDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
+  // Loopback is common among all tests, check that it's found.
+  bool loopbackFound = false;
+  ASSERT_NO_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    CheckGetLinkResponse(hdr, kSeq, port);
+    if (hdr->nlmsg_type != RTM_NEWLINK) {
+      return;
+    }
+    ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    std::cout << "Found interface idx=" << msg->ifi_index
+              << ", type=" << std::hex << msg->ifi_type;
+    if (msg->ifi_type == ARPHRD_LOOPBACK) {
+      loopbackFound = true;
+      EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
+    }
+  }));
+  EXPECT_TRUE(loopbackFound);
+}
+
+struct Link {
+  int index;
+  std::string name;
+};
+
+PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  absl::optional<Link> link;
+  RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    if (hdr->nlmsg_type != RTM_NEWLINK ||
+        hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
+      return;
+    }
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    if (msg->ifi_type == ARPHRD_LOOPBACK) {
+      const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+      if (rta == nullptr) {
+        // Ignore links that do not have a name.
+        return;
+      }
+
+      link = Link();
+      link->index = msg->ifi_index;
+      link->name = std::string(reinterpret_cast<const char*>(RTA_DATA(rta)));
+    }
+  }));
+  return link;
+}
+
+// CheckLinkMsg checks a netlink message against an expected link.
+void CheckLinkMsg(const struct nlmsghdr* hdr, const Link& link) {
+  ASSERT_THAT(hdr->nlmsg_type, Eq(RTM_NEWLINK));
+  ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
+  const struct ifinfomsg* msg =
+      reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+  EXPECT_EQ(msg->ifi_index, link.index);
+
+  const struct rtattr* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+  EXPECT_NE(nullptr, rta) << "IFLA_IFNAME not found in message.";
+  if (rta != nullptr) {
+    std::string name(reinterpret_cast<const char*>(RTA_DATA(rta)));
+    EXPECT_EQ(name, link.name);
+  }
+}
+
+TEST(NetlinkRouteTest, GetLinkByIndex) {
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
   struct request {
     struct nlmsghdr hdr;
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
-  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
+  req.ifm.ifi_index = loopback_link->index;
 
-  // Loopback is common among all tests, check that it's found.
-  bool loopbackFound = false;
+  bool found = false;
   ASSERT_NO_ERRNO(NetlinkRequestResponse(
       fd, &req, sizeof(req),
       [&](const struct nlmsghdr* hdr) {
-        CheckGetLinkResponse(hdr, kSeq, port);
-        if (hdr->nlmsg_type != RTM_NEWLINK) {
-          return;
-        }
-        ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
-        const struct ifinfomsg* msg =
-            reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
-        std::cout << "Found interface idx=" << msg->ifi_index
-                  << ", type=" << std::hex << msg->ifi_type;
-        if (msg->ifi_type == ARPHRD_LOOPBACK) {
-          loopbackFound = true;
-          EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
-        }
+        CheckLinkMsg(hdr, *loopback_link);
+        found = true;
       },
       false));
-  EXPECT_TRUE(loopbackFound);
+  EXPECT_TRUE(found) << "Netlink response does not contain any links.";
 }
 
-TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
+TEST(NetlinkRouteTest, GetLinkByName) {
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
     struct ifinfomsg ifm;
+    struct rtattr rtattr;
+    char ifname[IFNAMSIZ];
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
   };
 
-  constexpr uint32_t kSeq = 12345;
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.rtattr.rta_type = IFLA_IFNAME;
+  req.rtattr.rta_len = RTA_LENGTH(loopback_link->name.size() + 1);
+  strncpy(req.ifname, loopback_link->name.c_str(), sizeof(req.ifname));
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  bool found = false;
+  ASSERT_NO_ERRNO(NetlinkRequestResponse(
+      fd, &req, sizeof(req),
+      [&](const struct nlmsghdr* hdr) {
+        CheckLinkMsg(hdr, *loopback_link);
+        found = true;
+      },
+      false));
+  EXPECT_TRUE(found) << "Netlink response does not contain any links.";
+}
+
+TEST(NetlinkRouteTest, GetLinkByIndexNotFound) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.ifm.ifi_index = 1234590;
+
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(ENODEV, ::testing::_));
+}
+
+TEST(NetlinkRouteTest, GetLinkByNameNotFound) {
+  const std::string name = "nodevice?!";
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+    struct rtattr rtattr;
+    char ifname[IFNAMSIZ];
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.rtattr.rta_type = IFLA_IFNAME;
+  req.rtattr.rta_len = RTA_LENGTH(name.size() + 1);
+  strncpy(req.ifname, name.c_str(), sizeof(req.ifname));
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(ENODEV, ::testing::_));
+}
+
+TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -175,18 +346,8 @@ TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
 
-  ASSERT_NO_ERRNO(NetlinkRequestResponse(
-      fd, &req, sizeof(req),
-      [&](const struct nlmsghdr* hdr) {
-        EXPECT_THAT(hdr->nlmsg_type, Eq(NLMSG_ERROR));
-        EXPECT_EQ(hdr->nlmsg_seq, kSeq);
-        EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct nlmsgerr));
-
-        const struct nlmsgerr* msg =
-            reinterpret_cast<const struct nlmsgerr*>(NLMSG_DATA(hdr));
-        EXPECT_EQ(msg->error, -EOPNOTSUPP);
-      },
-      true));
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(EOPNOTSUPP, ::testing::_));
 }
 
 TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
@@ -198,8 +359,6 @@ TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
@@ -238,8 +397,6 @@ TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
@@ -282,8 +439,6 @@ TEST(NetlinkRouteTest, ControlMessageIgnored) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
 
   // This control message is ignored. We still receive a response for the
@@ -317,8 +472,6 @@ TEST(NetlinkRouteTest, GetAddrDump) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -367,6 +520,57 @@ TEST(NetlinkRouteTest, LookupAll) {
   ASSERT_GT(count, 0);
 }
 
+TEST(NetlinkRouteTest, AddAddr) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifaddrmsg ifa;
+    struct rtattr rtattr;
+    struct in_addr addr;
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_NEWADDR;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifa.ifa_family = AF_INET;
+  req.ifa.ifa_prefixlen = 24;
+  req.ifa.ifa_flags = 0;
+  req.ifa.ifa_scope = 0;
+  req.ifa.ifa_index = loopback_link->index;
+  req.rtattr.rta_type = IFA_LOCAL;
+  req.rtattr.rta_len = RTA_LENGTH(sizeof(req.addr));
+  inet_pton(AF_INET, "10.0.0.1", &req.addr);
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifa)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  // Create should succeed, as no such address in kernel.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
+  EXPECT_NO_ERRNO(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+
+  // Replace an existing address should succeed.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_REPLACE | NLM_F_ACK;
+  req.hdr.nlmsg_seq++;
+  EXPECT_NO_ERRNO(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+
+  // Create exclusive should fail, as we created the address above.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
+  req.hdr.nlmsg_seq++;
+  EXPECT_THAT(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len),
+      PosixErrorIs(EEXIST, ::testing::_));
+}
+
 // GetRouteDump tests a RTM_GETROUTE + NLM_F_DUMP request.
 TEST(NetlinkRouteTest, GetRouteDump) {
   FileDescriptor fd =
@@ -378,8 +582,6 @@ TEST(NetlinkRouteTest, GetRouteDump) {
     struct rtmsg rtm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETROUTE;
@@ -538,8 +740,6 @@ TEST(NetlinkRouteTest, RecvmsgTrunc) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -615,8 +815,6 @@ TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -695,8 +893,6 @@ TEST(NetlinkRouteTest, NoPasscredNoCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -743,8 +939,6 @@ TEST(NetlinkRouteTest, PasscredCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index cd2212a1a..952eecfe8 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -16,6 +16,7 @@
 
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
+#include <linux/rtnetlink.h>
 #include <sys/socket.h>
 
 #include <vector>
@@ -71,9 +72,10 @@ PosixError NetlinkRequestResponse(
   iov.iov_base = buf.data();
   iov.iov_len = buf.size();
 
-  // Response is a series of NLM_F_MULTI messages, ending with a NLMSG_DONE
-  // message.
+  // If NLM_F_MULTI is set, response is a series of messages that ends with a
+  // NLMSG_DONE message.
   int type = -1;
+  int flags = 0;
   do {
     int len;
     RETURN_ERROR_IF_SYSCALL_FAIL(len = RetryEINTR(recvmsg)(fd.get(), &msg, 0));
@@ -89,6 +91,7 @@ PosixError NetlinkRequestResponse(
     for (struct nlmsghdr* hdr = reinterpret_cast<struct nlmsghdr*>(buf.data());
          NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) {
       fn(hdr);
+      flags = hdr->nlmsg_flags;
       type = hdr->nlmsg_type;
       // Done should include an integer payload for dump_done_errno.
       // See net/netlink/af_netlink.c:netlink_dump
@@ -98,11 +101,11 @@ PosixError NetlinkRequestResponse(
         EXPECT_GE(hdr->nlmsg_len, NLMSG_LENGTH(sizeof(int)));
       }
     }
-  } while (type != NLMSG_DONE && type != NLMSG_ERROR);
+  } while ((flags & NLM_F_MULTI) && type != NLMSG_DONE && type != NLMSG_ERROR);
 
   if (expect_nlmsgerr) {
     EXPECT_EQ(type, NLMSG_ERROR);
-  } else {
+  } else if (flags & NLM_F_MULTI) {
     EXPECT_EQ(type, NLMSG_DONE);
   }
   return NoError();
@@ -146,5 +149,39 @@ PosixError NetlinkRequestResponseSingle(
   return NoError();
 }
 
+PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq,
+                                    void* request, size_t len) {
+  // Dummy negative number for no error message received.
+  // We won't get a negative error number so there will be no confusion.
+  int err = -42;
+  RETURN_IF_ERRNO(NetlinkRequestResponse(
+      fd, request, len,
+      [&](const struct nlmsghdr* hdr) {
+        EXPECT_EQ(NLMSG_ERROR, hdr->nlmsg_type);
+        EXPECT_EQ(hdr->nlmsg_seq, seq);
+        EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct nlmsgerr));
+
+        const struct nlmsgerr* msg =
+            reinterpret_cast<const struct nlmsgerr*>(NLMSG_DATA(hdr));
+        err = -msg->error;
+      },
+      true));
+  return PosixError(err);
+}
+
+const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr,
+                                const struct ifinfomsg* msg, int16_t attr) {
+  const int ifi_space = NLMSG_SPACE(sizeof(*msg));
+  int attrlen = hdr->nlmsg_len - ifi_space;
+  const struct rtattr* rta = reinterpret_cast<const struct rtattr*>(
+      reinterpret_cast<const uint8_t*>(hdr) + NLMSG_ALIGN(ifi_space));
+  for (; RTA_OK(rta, attrlen); rta = RTA_NEXT(rta, attrlen)) {
+    if (rta->rta_type == attr) {
+      return rta;
+    }
+  }
+  return nullptr;
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index 3678c0599..e13ead406 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -19,6 +19,7 @@
 // socket.h has to be included before if_arp.h.
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
+#include <linux/rtnetlink.h>
 
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
@@ -47,6 +48,14 @@ PosixError NetlinkRequestResponseSingle(
     const FileDescriptor& fd, void* request, size_t len,
     const std::function<void(const struct nlmsghdr* hdr)>& fn);
 
+// Send the passed request then expect and return an ack or error.
+PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq,
+                                    void* request, size_t len);
+
+// Find rtnetlink attribute in message.
+const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr,
+                                const struct ifinfomsg* msg, int16_t attr);
+
 }  // namespace testing
 }  // namespace gvisor
 
-- 
cgit v1.2.3


From 37abbbc547d9d78e0bf42f192403c8eca30593d8 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 5 Feb 2020 11:17:14 -0800
Subject: Add packetdrill tests to presubmit and CI testing

PiperOrigin-RevId: 293409718
---
 kokoro/packetdrill_tests.cfg         |  9 +++++++++
 scripts/packetdrill_tests.sh         | 20 ++++++++++++++++++++
 test/packetdrill/defs.bzl            |  6 ++++--
 test/packetdrill/packetdrill_test.sh | 20 ++++++++++++++++----
 4 files changed, 49 insertions(+), 6 deletions(-)
 create mode 100644 kokoro/packetdrill_tests.cfg
 create mode 100755 scripts/packetdrill_tests.sh

diff --git a/kokoro/packetdrill_tests.cfg b/kokoro/packetdrill_tests.cfg
new file mode 100644
index 000000000..258d7deb4
--- /dev/null
+++ b/kokoro/packetdrill_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/packetdrill_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/scripts/packetdrill_tests.sh b/scripts/packetdrill_tests.sh
new file mode 100755
index 000000000..fc6bef79c
--- /dev/null
+++ b/scripts/packetdrill_tests.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+install_runsc_for_test runsc-d
+test_runsc $(bazel query "attr(tags, manual, tests(//test/packetdrill/...))")
diff --git a/test/packetdrill/defs.bzl b/test/packetdrill/defs.bzl
index 582f97e0c..8623ce7b1 100644
--- a/test/packetdrill/defs.bzl
+++ b/test/packetdrill/defs.bzl
@@ -15,7 +15,7 @@ def _packetdrill_test_impl(ctx):
         # Make sure that everything is readable here.
         "find . -type f -exec chmod a+rx {} \\;",
         "find . -type d -exec chmod a+rx {} \\;",
-        "%s %s --init_script %s -- %s\n" % (
+        "%s %s --init_script %s $@ -- %s\n" % (
             test_runner.short_path,
             " ".join(ctx.attr.flags),
             ctx.files._init_script[0].short_path,
@@ -76,7 +76,9 @@ def packetdrill_netstack_test(name, **kwargs):
         kwargs["tags"] = _PACKETDRILL_TAGS
     _packetdrill_test(
         name = name + "_netstack_test",
-        flags = ["--dut_platform", "netstack"],
+        # This is the default runtime unless
+        # "--test_arg=--runtime=OTHER_RUNTIME" is used to override the value.
+        flags = ["--dut_platform", "netstack", "--runtime", "runsc-d"],
         **kwargs
     )
 
diff --git a/test/packetdrill/packetdrill_test.sh b/test/packetdrill/packetdrill_test.sh
index 614d94d74..0b22dfd5c 100755
--- a/test/packetdrill/packetdrill_test.sh
+++ b/test/packetdrill/packetdrill_test.sh
@@ -29,7 +29,7 @@ function failure() {
 }
 trap 'failure ${LINENO} "$BASH_COMMAND"' ERR
 
-declare -r LONGOPTS="dut_platform:,init_script:"
+declare -r LONGOPTS="dut_platform:,init_script:,runtime:"
 
 # Don't use declare below so that the error from getopt will end the script.
 PARSED=$(getopt --options "" --longoptions=$LONGOPTS --name "$0" -- "$@")
@@ -39,6 +39,7 @@ eval set -- "$PARSED"
 while true; do
   case "$1" in
     --dut_platform)
+      # Either "linux" or "netstack".
       declare -r DUT_PLATFORM="$2"
       shift 2
       ;;
@@ -46,6 +47,13 @@ while true; do
       declare -r INIT_SCRIPT="$2"
       shift 2
       ;;
+    --runtime)
+      # Not readonly because there might be multiple --runtime arguments and we
+      # want to use just the last one.  Only used if --dut_platform is
+      # "netstack".
+      declare RUNTIME="$2"
+      shift 2
+      ;;
     --)
       shift
       break
@@ -61,9 +69,13 @@ declare -r scripts="$@"
 
 # Check that the required flags are defined in a way that is safe for "set -u".
 if [[ "${DUT_PLATFORM-}" == "netstack" ]]; then
-  declare -r RUNTIME="--runtime runsc-d"
+  if [[ -z "${RUNTIME-}" ]]; then
+    echo "FAIL: Missing --runtime argument: ${RUNTIME-}"
+    exit 2
+  fi
+  declare -r RUNTIME_ARG="--runtime ${RUNTIME}"
 elif [[ "${DUT_PLATFORM-}" == "linux" ]]; then
-  declare -r RUNTIME=""
+  declare -r RUNTIME_ARG=""
 else
   echo "FAIL: Bad or missing --dut_platform argument: ${DUT_PLATFORM-}"
   exit 2
@@ -143,7 +155,7 @@ done
 docker pull "${IMAGE_TAG}"
 
 # Create the DUT container and connect to network.
-DUT=$(docker create ${RUNTIME} --privileged --rm \
+DUT=$(docker create ${RUNTIME_ARG} --privileged --rm \
   --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG})
 docker network connect "${CTRL_NET}" \
   --ip "${CTRL_NET_PREFIX}${DUT_NET_SUFFIX}" "${DUT}" \
-- 
cgit v1.2.3


From eea0eeee933ba8406ae688fce4348271f9513514 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Wed, 5 Feb 2020 11:25:10 -0800
Subject: Disable get/set xattrs until list/remove exist too.

PiperOrigin-RevId: 293411655
---
 pkg/sentry/syscalls/linux/linux64_amd64.go |  27 ++++---
 pkg/sentry/syscalls/linux/linux64_arm64.go |  37 +++++----
 test/syscalls/linux/xattr.cc               | 124 +++++++++++++++++++++++++++++
 3 files changed, 159 insertions(+), 29 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 7435b50bf..588f8b087 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -228,18 +228,21 @@ var AMD64 = &kernel.SyscallTable{
 		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
-		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
-		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
-		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
-		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
-		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		// TODO(b/148303075): Enable set/getxattr (in their various
+		// forms) once we also have list and removexattr. The JVM
+		// assumes that if get/set exist, then list and remove do too.
+		188: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		189: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		190: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
 		200: syscalls.Supported("tkill", Tkill),
 		201: syscalls.Supported("time", Time),
 		202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 03a39fe65..06e5ee401 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -36,23 +36,26 @@ var ARM64 = &kernel.SyscallTable{
 	},
 	AuditNumber: linux.AUDIT_ARCH_AARCH64,
 	Table: map[uintptr]kernel.Syscall{
-		0:   syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		1:   syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
-		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
-		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
-		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
-		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		12:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		14:  syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		15:  syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		16:  syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		// TODO(b/148303075): Enable set/getxattr (in their various
+		// forms) once we also have list and removexattr. The JVM
+		// assumes that if get/set exist, then list and remove do too.
+		5:   syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		6:   syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		7:   syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		8:   syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		13:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		14:  syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		15:  syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		16:  syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
 		17:  syscalls.Supported("getcwd", Getcwd),
 		18:  syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
 		19:  syscalls.Supported("eventfd2", Eventfd2),
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index ab21d68c6..85eb31847 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -39,6 +39,10 @@ namespace {
 class XattrTest : public FileTest {};
 
 TEST_F(XattrTest, XattrNullName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
@@ -48,6 +52,10 @@ TEST_F(XattrTest, XattrNullName) {
 }
 
 TEST_F(XattrTest, XattrEmptyName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
@@ -56,6 +64,10 @@ TEST_F(XattrTest, XattrEmptyName) {
 }
 
 TEST_F(XattrTest, XattrLargeName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   std::string name = "user.";
   name += std::string(XATTR_NAME_MAX - name.length(), 'a');
@@ -77,6 +89,10 @@ TEST_F(XattrTest, XattrLargeName) {
 }
 
 TEST_F(XattrTest, XattrInvalidPrefix) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   std::string name(XATTR_NAME_MAX, 'a');
   EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
@@ -88,6 +104,10 @@ TEST_F(XattrTest, XattrInvalidPrefix) {
 // Do not allow save/restore cycles after making the test file read-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -113,6 +133,10 @@ TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
 // Do not allow save/restore cycles after making the test file write-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -143,6 +167,10 @@ TEST_F(XattrTest, XattrTrustedWithNonadmin) {
 }
 
 TEST_F(XattrTest, XattrOnDirectory) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
@@ -152,6 +180,10 @@ TEST_F(XattrTest, XattrOnDirectory) {
 }
 
 TEST_F(XattrTest, XattrOnSymlink) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
@@ -163,6 +195,10 @@ TEST_F(XattrTest, XattrOnSymlink) {
 }
 
 TEST_F(XattrTest, XattrOnInvalidFileTypes) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char name[] = "user.test";
 
   char char_device[] = "/dev/zero";
@@ -181,6 +217,10 @@ TEST_F(XattrTest, XattrOnInvalidFileTypes) {
 }
 
 TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -196,6 +236,10 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, SetxattrZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -208,6 +252,10 @@ TEST_F(XattrTest, SetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrSizeTooLarge) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
 
@@ -223,6 +271,10 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
@@ -232,6 +284,10 @@ TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -240,6 +296,10 @@ TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val(XATTR_SIZE_MAX + 1);
@@ -256,6 +316,10 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -271,6 +335,10 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithLarger) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -285,6 +353,10 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) {
 }
 
 TEST_F(XattrTest, SetxattrCreateFlag) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
@@ -296,6 +368,10 @@ TEST_F(XattrTest, SetxattrCreateFlag) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceFlag) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
@@ -308,6 +384,10 @@ TEST_F(XattrTest, SetxattrReplaceFlag) {
 }
 
 TEST_F(XattrTest, SetxattrInvalidFlags) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   int invalid_flags = 0xff;
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
@@ -315,6 +395,10 @@ TEST_F(XattrTest, SetxattrInvalidFlags) {
 }
 
 TEST_F(XattrTest, Getxattr) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -327,6 +411,10 @@ TEST_F(XattrTest, Getxattr) {
 }
 
 TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -339,6 +427,10 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -354,6 +446,10 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -367,6 +463,10 @@ TEST_F(XattrTest, GetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrSizeTooLarge) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -383,6 +483,10 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, GetxattrNullValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -394,6 +498,10 @@ TEST_F(XattrTest, GetxattrNullValue) {
 }
 
 TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -410,12 +518,20 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrNonexistentName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, LGetSetxattrOnSymlink) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
@@ -427,6 +543,10 @@ TEST_F(XattrTest, LGetSetxattrOnSymlink) {
 }
 
 TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -441,6 +561,10 @@ TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
 }
 
 TEST_F(XattrTest, FGetSetxattr) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0));
   const char name[] = "user.test";
-- 
cgit v1.2.3


From f2d3efca1deded31a2929ea77c0eecf476764660 Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Wed, 5 Feb 2020 11:33:48 -0800
Subject: Fix undeclared variable error in common_build.sh.

PiperOrigin-RevId: 293413711
---
 scripts/common_build.sh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/common_build.sh b/scripts/common_build.sh
index a473a88a4..2c2a826c7 100755
--- a/scripts/common_build.sh
+++ b/scripts/common_build.sh
@@ -32,21 +32,21 @@ declare -r BAZEL_FLAGS=(
   "--keep_going"
   "--verbose_failures=true"
 )
+BAZEL_RBE_AUTH_FLAGS=""
+BAZEL_RBE_FLAGS=""
 if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then
-  declare -r BAZEL_RBE_AUTH_FLAGS=(
-    "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
-  )
-  declare -r BAZEL_RBE_FLAGS=("--config=remote")
+  declare -r BAZEL_RBE_AUTH_FLAGS="--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
+  declare -r BAZEL_RBE_FLAGS="--config=remote"
 fi
 
 # Wrap bazel.
 function build() {
-  bazel build "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" 2>&1 |
+  bazel build "${BAZEL_RBE_FLAGS}" "${BAZEL_RBE_AUTH_FLAGS}" "${BAZEL_FLAGS[@]}" "$@" 2>&1 |
     tee /dev/fd/2 | grep -E '^  bazel-bin/' | awk '{ print $1; }'
 }
 
 function test() {
-  bazel test "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@"
+  bazel test "${BAZEL_RBE_FLAGS}" "${BAZEL_RBE_AUTH_FLAGS}" "${BAZEL_FLAGS[@]}" "$@"
 }
 
 function run() {
-- 
cgit v1.2.3


From f3d95607036b8a502c65aa7b3e8145227274dbbc Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 5 Feb 2020 17:56:00 -0800
Subject: recv() on a closed TCP socket returns ENOTCONN

From RFC 793 s3.9 p58 Event Processing:

If RECEIVE Call arrives in CLOSED state and the user has access to such a
connection, the return should be "error: connection does not exist"

Fixes #1598

PiperOrigin-RevId: 293494287
---
 pkg/sentry/socket/netstack/netstack.go | 7 ++++++-
 pkg/tcpip/tcpip.go                     | 4 ++++
 pkg/tcpip/transport/tcp/endpoint.go    | 4 ++--
 pkg/tcpip/transport/tcp/tcp_test.go    | 9 ++++-----
 test/syscalls/linux/tcp_socket.cc      | 9 +++++++++
 5 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 049d04bf2..ed2fbcceb 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -2229,11 +2229,16 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq
 	var copied int
 
 	// Copy as many views as possible into the user-provided buffer.
-	for dst.NumBytes() != 0 {
+	for {
+		// Always do at least one fetchReadView, even if the number of bytes to
+		// read is 0.
 		err = s.fetchReadView()
 		if err != nil {
 			break
 		}
+		if dst.NumBytes() == 0 {
+			break
+		}
 
 		var n int
 		var e error
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 0fa141d58..d29d9a704 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -1124,6 +1124,10 @@ type ReadErrors struct {
 	// InvalidEndpointState is the number of times we found the endpoint state
 	// to be unexpected.
 	InvalidEndpointState StatCounter
+
+	// NotConnected is the number of times we tried to read but found that the
+	// endpoint was not connected.
+	NotConnected StatCounter
 }
 
 // WriteErrors collects packet write errors from an endpoint write call.
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index b5a8e15ee..e4a6b1b8b 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1003,8 +1003,8 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 		if s == StateError {
 			return buffer.View{}, tcpip.ControlMessages{}, he
 		}
-		e.stats.ReadErrors.InvalidEndpointState.Increment()
-		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
+		e.stats.ReadErrors.NotConnected.Increment()
+		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrNotConnected
 	}
 
 	v, err := e.readLocked()
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 2c1505067..cc118c993 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -5405,12 +5405,11 @@ func TestEndpointBindListenAcceptState(t *testing.T) {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
 	}
 
-	// Expect InvalidEndpointState errors on a read at this point.
-	if _, _, err := ep.Read(nil); err != tcpip.ErrInvalidEndpointState {
-		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrInvalidEndpointState)
+	if _, _, err := ep.Read(nil); err != tcpip.ErrNotConnected {
+		t.Errorf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrNotConnected)
 	}
-	if got := ep.Stats().(*tcp.Stats).ReadErrors.InvalidEndpointState.Value(); got != 1 {
-		t.Fatalf("got EP stats Stats.ReadErrors.InvalidEndpointState got %v want %v", got, 1)
+	if got := ep.Stats().(*tcp.Stats).ReadErrors.NotConnected.Value(); got != 1 {
+		t.Errorf("got EP stats Stats.ReadErrors.NotConnected got %v want %v", got, 1)
 	}
 
 	if err := ep.Listen(10); err != nil {
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 525ccbd88..8a8b68e75 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1339,6 +1339,15 @@ TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptGreaterThanZero) {
   EXPECT_EQ(get, kTCPDeferAccept);
 }
 
+TEST_P(SimpleTcpSocketTest, RecvOnClosedSocket) {
+  auto s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  char buf[1];
+  EXPECT_THAT(recv(s.get(), buf, 0, 0), SyscallFailsWithErrno(ENOTCONN));
+  EXPECT_THAT(recv(s.get(), buf, sizeof(buf), 0),
+              SyscallFailsWithErrno(ENOTCONN));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From 1b6a12a768216a99a5e0428c42ea4faf79cf3b50 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 5 Feb 2020 22:45:44 -0800
Subject: Add notes to relevant tests.

These were out-of-band notes that can help provide additional context
and simplify automated imports.

PiperOrigin-RevId: 293525915
---
 pkg/metric/metric.go                          |  1 -
 pkg/sentry/arch/arch_x86.go                   |  4 ++
 pkg/sentry/arch/signal_amd64.go               |  2 +-
 pkg/sentry/fs/file_overlay_test.go            |  1 +
 pkg/sentry/fs/proc/README.md                  |  4 ++
 pkg/sentry/kernel/BUILD                       |  1 +
 pkg/sentry/kernel/kernel.go                   |  3 ++
 pkg/sentry/kernel/kernel_opts.go              | 20 +++++++
 pkg/sentry/socket/hostinet/BUILD              |  1 +
 pkg/sentry/socket/hostinet/socket.go          |  5 +-
 pkg/sentry/socket/hostinet/sockopt_impl.go    | 27 ++++++++++
 pkg/tcpip/transport/tcp/endpoint.go           |  3 ++
 runsc/boot/filter/BUILD                       |  1 +
 runsc/boot/filter/config.go                   | 13 -----
 runsc/boot/filter/config_profile.go           | 34 ++++++++++++
 runsc/container/console_test.go               |  5 +-
 runsc/dockerutil/dockerutil.go                | 11 ++--
 runsc/testutil/BUILD                          |  5 +-
 runsc/testutil/testutil.go                    | 54 -------------------
 runsc/testutil/testutil_runfiles.go           | 75 +++++++++++++++++++++++++++
 test/image/image_test.go                      |  8 +--
 test/syscalls/build_defs.bzl                  | 35 +++++++++++--
 test/syscalls/linux/chroot.cc                 |  2 +-
 test/syscalls/linux/concurrency.cc            |  3 +-
 test/syscalls/linux/exec_proc_exe_workload.cc |  6 +++
 test/syscalls/linux/fork.cc                   |  5 +-
 test/syscalls/linux/mmap.cc                   |  8 +--
 test/syscalls/linux/open_create.cc            |  1 +
 test/syscalls/linux/preadv.cc                 |  1 +
 test/syscalls/linux/proc.cc                   | 46 +++++++++++++---
 test/syscalls/linux/readv.cc                  |  4 +-
 test/syscalls/linux/rseq.cc                   |  2 +-
 test/syscalls/linux/select.cc                 |  2 +-
 test/syscalls/linux/shm.cc                    |  2 +-
 test/syscalls/linux/sigprocmask.cc            |  2 +-
 test/syscalls/linux/socket_unix_non_stream.cc |  4 +-
 test/syscalls/linux/symlink.cc                |  2 +-
 test/syscalls/linux/tcp_socket.cc             |  3 +-
 test/syscalls/linux/time.cc                   |  1 +
 test/syscalls/linux/tkill.cc                  |  2 +-
 test/util/temp_path.cc                        |  1 +
 tools/build/tags.bzl                          |  4 ++
 tools/defs.bzl                                | 17 +++++-
 43 files changed, 318 insertions(+), 113 deletions(-)
 create mode 100644 pkg/sentry/kernel/kernel_opts.go
 create mode 100644 pkg/sentry/socket/hostinet/sockopt_impl.go
 create mode 100644 runsc/boot/filter/config_profile.go
 create mode 100644 runsc/testutil/testutil_runfiles.go

diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go
index 93d4f2b8c..006fcd9ab 100644
--- a/pkg/metric/metric.go
+++ b/pkg/metric/metric.go
@@ -46,7 +46,6 @@ var (
 //
 // TODO(b/67298402): Support non-cumulative metrics.
 // TODO(b/67298427): Support metric fields.
-//
 type Uint64Metric struct {
 	// value is the actual value of the metric. It must be accessed
 	// atomically.
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index a18093155..3db8bd34b 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -114,6 +114,10 @@ func newX86FPStateSlice() []byte {
 	size, align := cpuid.HostFeatureSet().ExtendedStateSize()
 	capacity := size
 	// Always use at least 4096 bytes.
+	//
+	// For the KVM platform, this state is a fixed 4096 bytes, so make sure
+	// that the underlying array is at _least_ that size otherwise we will
+	// corrupt random memory. This is not a pleasant thing to debug.
 	if capacity < 4096 {
 		capacity = 4096
 	}
diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go
index 81b92bb43..6fb756f0e 100644
--- a/pkg/sentry/arch/signal_amd64.go
+++ b/pkg/sentry/arch/signal_amd64.go
@@ -55,7 +55,7 @@ type SignalContext64 struct {
 	Trapno  uint64
 	Oldmask linux.SignalSet
 	Cr2     uint64
-	// Pointer to a struct _fpstate.
+	// Pointer to a struct _fpstate. See b/33003106#comment8.
 	Fpstate  uint64
 	Reserved [8]uint64
 }
diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go
index 02538bb4f..a76d87e3a 100644
--- a/pkg/sentry/fs/file_overlay_test.go
+++ b/pkg/sentry/fs/file_overlay_test.go
@@ -177,6 +177,7 @@ func TestReaddirRevalidation(t *testing.T) {
 
 // TestReaddirOverlayFrozen tests that calling Readdir on an overlay file with
 // a frozen dirent tree does not make Readdir calls to the underlying files.
+// This is a regression test for b/114808269.
 func TestReaddirOverlayFrozen(t *testing.T) {
 	ctx := contexttest.Context(t)
 
diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md
index 5d4ec6c7b..6667a0916 100644
--- a/pkg/sentry/fs/proc/README.md
+++ b/pkg/sentry/fs/proc/README.md
@@ -11,6 +11,8 @@ inconsistency, please file a bug.
 
 The following files are implemented:
 
+<!-- mdformat off(don't wrap the table) -->
+
 | File /proc/                 | Content                                               |
 | :------------------------   | :---------------------------------------------------- |
 | [cpuinfo](#cpuinfo)         | Info about the CPU                                    |
@@ -22,6 +24,8 @@ The following files are implemented:
 | [uptime](#uptime)           | Wall clock since boot, combined idle time of all cpus |
 | [version](#version)         | Kernel version                                        |
 
+<!-- mdformat on -->
+
 ### cpuinfo
 
 ```bash
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index a27628c0a..2231d6973 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -91,6 +91,7 @@ go_library(
         "fs_context.go",
         "ipc_namespace.go",
         "kernel.go",
+        "kernel_opts.go",
         "kernel_state.go",
         "pending_signals.go",
         "pending_signals_list.go",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index dcd6e91c4..3ee760ba2 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -235,6 +235,9 @@ type Kernel struct {
 	// events. This is initialized lazily on the first unimplemented
 	// syscall.
 	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`
+
+	// SpecialOpts contains special kernel options.
+	SpecialOpts
 }
 
 // InitKernelArgs holds arguments to Init.
diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go
new file mode 100644
index 000000000..2e66ec587
--- /dev/null
+++ b/pkg/sentry/kernel/kernel_opts.go
@@ -0,0 +1,20 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// SpecialOpts contains non-standard options for the kernel.
+//
+// +stateify savable
+type SpecialOpts struct{}
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index 5a07d5d0e..023bad156 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -10,6 +10,7 @@ go_library(
         "save_restore.go",
         "socket.go",
         "socket_unsafe.go",
+        "sockopt_impl.go",
         "stack.go",
     ],
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 34f63986f..de76388ac 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -285,7 +285,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 	}
 
 	// Whitelist options and constrain option length.
-	var optlen int
+	optlen := getSockOptLen(t, level, name)
 	switch level {
 	case linux.SOL_IP:
 		switch name {
@@ -330,7 +330,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 // SetSockOpt implements socket.Socket.SetSockOpt.
 func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
 	// Whitelist options and constrain option length.
-	var optlen int
+	optlen := setSockOptLen(t, level, name)
 	switch level {
 	case linux.SOL_IP:
 		switch name {
@@ -353,6 +353,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 			optlen = sizeofInt32
 		}
 	}
+
 	if optlen == 0 {
 		// Pretend to accept socket options we don't understand. This seems
 		// dangerous, but it's what netstack does...
diff --git a/pkg/sentry/socket/hostinet/sockopt_impl.go b/pkg/sentry/socket/hostinet/sockopt_impl.go
new file mode 100644
index 000000000..8a783712e
--- /dev/null
+++ b/pkg/sentry/socket/hostinet/sockopt_impl.go
@@ -0,0 +1,27 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostinet
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+func getSockOptLen(t *kernel.Task, level, name int) int {
+	return 0 // No custom options.
+}
+
+func setSockOptLen(t *kernel.Task, level, name int) int {
+	return 0 // No custom options.
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index e4a6b1b8b..f2be0e651 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2166,6 +2166,9 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	e.isRegistered = true
 	e.setEndpointState(StateListen)
 
+	// The channel may be non-nil when we're restoring the endpoint, and it
+	// may be pre-populated with some previously accepted (but not Accepted)
+	// endpoints.
 	if e.acceptedChan == nil {
 		e.acceptedChan = make(chan *endpoint, backlog)
 	}
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
index ce30f6c53..ed18f0047 100644
--- a/runsc/boot/filter/BUILD
+++ b/runsc/boot/filter/BUILD
@@ -8,6 +8,7 @@ go_library(
         "config.go",
         "config_amd64.go",
         "config_arm64.go",
+        "config_profile.go",
         "extra_filters.go",
         "extra_filters_msan.go",
         "extra_filters_race.go",
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index f8d351c7b..c69f4c602 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -536,16 +536,3 @@ func controlServerFilters(fd int) seccomp.SyscallRules {
 		},
 	}
 }
-
-// profileFilters returns extra syscalls made by runtime/pprof package.
-func profileFilters() seccomp.SyscallRules {
-	return seccomp.SyscallRules{
-		syscall.SYS_OPENAT: []seccomp.Rule{
-			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
-			},
-		},
-	}
-}
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
new file mode 100644
index 000000000..194952a7b
--- /dev/null
+++ b/runsc/boot/filter/config_profile.go
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// profileFilters returns extra syscalls made by runtime/pprof package.
+func profileFilters() seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_OPENAT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+			},
+		},
+	}
+}
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 060b63bf3..c2518d52b 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -196,7 +196,10 @@ func TestJobControlSignalExec(t *testing.T) {
 	defer ptyMaster.Close()
 	defer ptySlave.Close()
 
-	// Exec bash and attach a terminal.
+	// Exec bash and attach a terminal. Note that occasionally /bin/sh
+	// may be a different shell or have a different configuration (such
+	// as disabling interactive mode and job control). Since we want to
+	// explicitly test interactive mode, use /bin/bash. See b/116981926.
 	execArgs := &control.ExecArgs{
 		Filename: "/bin/bash",
 		// Don't let bash execute from profile or rc files, otherwise
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
index 9b6346ca2..1ff5e8cc3 100644
--- a/runsc/dockerutil/dockerutil.go
+++ b/runsc/dockerutil/dockerutil.go
@@ -143,8 +143,11 @@ func PrepareFiles(names ...string) (string, error) {
 		return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err)
 	}
 	for _, name := range names {
-		src := getLocalPath(name)
-		dst := path.Join(dir, name)
+		src, err := testutil.FindFile(name)
+		if err != nil {
+			return "", fmt.Errorf("testutil.Preparefiles(%q) failed: %v", name, err)
+		}
+		dst := path.Join(dir, path.Base(name))
 		if err := testutil.Copy(src, dst); err != nil {
 			return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err)
 		}
@@ -152,10 +155,6 @@ func PrepareFiles(names ...string) (string, error) {
 	return dir, nil
 }
 
-func getLocalPath(file string) string {
-	return path.Join(".", file)
-}
-
 // do executes docker command.
 func do(args ...string) (string, error) {
 	log.Printf("Running: docker %s\n", args)
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
index f845120b0..945405303 100644
--- a/runsc/testutil/BUILD
+++ b/runsc/testutil/BUILD
@@ -5,7 +5,10 @@ package(licenses = ["notice"])
 go_library(
     name = "testutil",
     testonly = 1,
-    srcs = ["testutil.go"],
+    srcs = [
+        "testutil.go",
+        "testutil_runfiles.go",
+    ],
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index edf2e809a..80c2c9680 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -79,60 +79,6 @@ func ConfigureExePath() error {
 	return nil
 }
 
-// FindFile searchs for a file inside the test run environment. It returns the
-// full path to the file. It fails if none or more than one file is found.
-func FindFile(path string) (string, error) {
-	wd, err := os.Getwd()
-	if err != nil {
-		return "", err
-	}
-
-	// The test root is demarcated by a path element called "__main__". Search for
-	// it backwards from the working directory.
-	root := wd
-	for {
-		dir, name := filepath.Split(root)
-		if name == "__main__" {
-			break
-		}
-		if len(dir) == 0 {
-			return "", fmt.Errorf("directory __main__ not found in %q", wd)
-		}
-		// Remove ending slash to loop around.
-		root = dir[:len(dir)-1]
-	}
-
-	// Annoyingly, bazel adds the build type to the directory path for go
-	// binaries, but not for c++ binaries. We use two different patterns to
-	// to find our file.
-	patterns := []string{
-		// Try the obvious path first.
-		filepath.Join(root, path),
-		// If it was a go binary, use a wildcard to match the build
-		// type. The pattern is: /test-path/__main__/directories/*/file.
-		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
-	}
-
-	for _, p := range patterns {
-		matches, err := filepath.Glob(p)
-		if err != nil {
-			// "The only possible returned error is ErrBadPattern,
-			// when pattern is malformed." -godoc
-			return "", fmt.Errorf("error globbing %q: %v", p, err)
-		}
-		switch len(matches) {
-		case 0:
-			// Try the next pattern.
-		case 1:
-			// We found it.
-			return matches[0], nil
-		default:
-			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
-		}
-	}
-	return "", fmt.Errorf("file %q not found", path)
-}
-
 // TestConfig returns the default configuration to use in tests. Note that
 // 'RootDir' must be set by caller if required.
 func TestConfig() *boot.Config {
diff --git a/runsc/testutil/testutil_runfiles.go b/runsc/testutil/testutil_runfiles.go
new file mode 100644
index 000000000..ece9ea9a1
--- /dev/null
+++ b/runsc/testutil/testutil_runfiles.go
@@ -0,0 +1,75 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testutil
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// FindFile searchs for a file inside the test run environment. It returns the
+// full path to the file. It fails if none or more than one file is found.
+func FindFile(path string) (string, error) {
+	wd, err := os.Getwd()
+	if err != nil {
+		return "", err
+	}
+
+	// The test root is demarcated by a path element called "__main__". Search for
+	// it backwards from the working directory.
+	root := wd
+	for {
+		dir, name := filepath.Split(root)
+		if name == "__main__" {
+			break
+		}
+		if len(dir) == 0 {
+			return "", fmt.Errorf("directory __main__ not found in %q", wd)
+		}
+		// Remove ending slash to loop around.
+		root = dir[:len(dir)-1]
+	}
+
+	// Annoyingly, bazel adds the build type to the directory path for go
+	// binaries, but not for c++ binaries. We use two different patterns to
+	// to find our file.
+	patterns := []string{
+		// Try the obvious path first.
+		filepath.Join(root, path),
+		// If it was a go binary, use a wildcard to match the build
+		// type. The pattern is: /test-path/__main__/directories/*/file.
+		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
+	}
+
+	for _, p := range patterns {
+		matches, err := filepath.Glob(p)
+		if err != nil {
+			// "The only possible returned error is ErrBadPattern,
+			// when pattern is malformed." -godoc
+			return "", fmt.Errorf("error globbing %q: %v", p, err)
+		}
+		switch len(matches) {
+		case 0:
+			// Try the next pattern.
+		case 1:
+			// We found it.
+			return matches[0], nil
+		default:
+			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
+		}
+	}
+	return "", fmt.Errorf("file %q not found", path)
+}
diff --git a/test/image/image_test.go b/test/image/image_test.go
index d0dcb1861..0a1e19d6f 100644
--- a/test/image/image_test.go
+++ b/test/image/image_test.go
@@ -107,7 +107,7 @@ func TestHttpd(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("http-test")
 
-	dir, err := dockerutil.PrepareFiles("latin10k.txt")
+	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -139,7 +139,7 @@ func TestNginx(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("net-test")
 
-	dir, err := dockerutil.PrepareFiles("latin10k.txt")
+	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -183,7 +183,7 @@ func TestMysql(t *testing.T) {
 	}
 
 	client := dockerutil.MakeDocker("mysql-client-test")
-	dir, err := dockerutil.PrepareFiles("mysql.sql")
+	dir, err := dockerutil.PrepareFiles("test/image/mysql.sql")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -283,7 +283,7 @@ func TestRuby(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("ruby-test")
 
-	dir, err := dockerutil.PrepareFiles("ruby.rb", "ruby.sh")
+	dir, err := dockerutil.PrepareFiles("test/image/ruby.rb", "test/image/ruby.sh")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index 1df761dd0..cbab85ef7 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -2,8 +2,6 @@
 
 load("//tools:defs.bzl", "loopback")
 
-# syscall_test is a macro that will create targets to run the given test target
-# on the host (native) and runsc.
 def syscall_test(
         test,
         shard_count = 5,
@@ -13,6 +11,19 @@ def syscall_test(
         add_uds_tree = False,
         add_hostinet = False,
         tags = None):
+    """syscall_test is a macro that will create targets for all platforms.
+
+    Args:
+      test: the test target.
+      shard_count: shards for defined tests.
+      size: the defined test size.
+      use_tmpfs: use tmpfs in the defined tests.
+      add_overlay: add an overlay test.
+      add_uds_tree: add a UDS test.
+      add_hostinet: add a hostinet test.
+      tags: starting test tags.
+    """
+
     _syscall_test(
         test = test,
         shard_count = shard_count,
@@ -111,6 +122,19 @@ def _syscall_test(
     # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared.
     tags += [full_platform, "file_" + file_access]
 
+    # Hash this target into one of 15 buckets. This can be used to
+    # randomly split targets between different workflows.
+    hash15 = hash(native.package_name() + name) % 15
+    tags.append("hash15:" + str(hash15))
+
+    # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until
+    # we figure out how to request ipv4 sockets on Guitar machines.
+    if network == "host":
+        tags.append("noguitar")
+
+    # Disable off-host networking.
+    tags.append("requires-net:loopback")
+
     # Add tag to prevent the tests from running in a Bazel sandbox.
     # TODO(b/120560048): Make the tests run without this tag.
     tags.append("no-sandbox")
@@ -118,8 +142,11 @@ def _syscall_test(
     # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is
     # more stable.
     if platform == "kvm":
-        tags += ["manual"]
-        tags += ["requires-kvm"]
+        tags.append("manual")
+        tags.append("requires-kvm")
+
+        # TODO(b/112165693): Remove when tests pass reliably.
+        tags.append("notap")
 
     args = [
         # Arguments are passed directly to syscall_test_runner binary.
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index 0a2d44a2c..85ec013d5 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -167,7 +167,7 @@ TEST(ChrootTest, DotDotFromOpenFD) {
 }
 
 // Test that link resolution in a chroot can escape the root by following an
-// open proc fd.
+// open proc fd. Regression test for b/32316719.
 TEST(ChrootTest, ProcFdLinkResolutionInChroot) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
 
diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc
index f41f99900..7cd6a75bd 100644
--- a/test/syscalls/linux/concurrency.cc
+++ b/test/syscalls/linux/concurrency.cc
@@ -46,7 +46,8 @@ TEST(ConcurrencyTest, SingleProcessMultithreaded) {
 }
 
 // Test that multiple threads in this process continue to execute in parallel,
-// even if an unrelated second process is spawned.
+// even if an unrelated second process is spawned. Regression test for
+// b/32119508.
 TEST(ConcurrencyTest, MultiProcessMultithreaded) {
   // In PID 1, start TIDs 1 and 2, and put both to sleep.
   //
diff --git a/test/syscalls/linux/exec_proc_exe_workload.cc b/test/syscalls/linux/exec_proc_exe_workload.cc
index b790fe5be..2989379b7 100644
--- a/test/syscalls/linux/exec_proc_exe_workload.cc
+++ b/test/syscalls/linux/exec_proc_exe_workload.cc
@@ -21,6 +21,12 @@
 #include "test/util/posix_error.h"
 
 int main(int argc, char** argv, char** envp) {
+  // This is annoying. Because remote build systems may put these binaries
+  // in a content-addressable-store, you may wind up with /proc/self/exe
+  // pointing to some random path (but with a sensible argv[0]).
+  //
+  // Therefore, this test simply checks that the /proc/self/exe
+  // is absolute and *doesn't* match argv[1].
   std::string exe =
       gvisor::testing::ProcessExePath(getpid()).ValueOrDie();
   if (exe[0] != '/') {
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index 906f3358d..ff8bdfeb0 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -271,7 +271,7 @@ TEST_F(ForkTest, Alarm) {
   EXPECT_EQ(0, alarmed);
 }
 
-// Child cannot affect parent private memory.
+// Child cannot affect parent private memory. Regression test for b/24137240.
 TEST_F(ForkTest, PrivateMemory) {
   std::atomic<uint32_t> local(0);
 
@@ -298,6 +298,9 @@ TEST_F(ForkTest, PrivateMemory) {
 }
 
 // Kernel-accessed buffers should remain coherent across COW.
+//
+// The buffer must be >= usermem.ZeroCopyMinBytes, as UnsafeAccess operates
+// differently. Regression test for b/33811887.
 TEST_F(ForkTest, COWSegment) {
   constexpr int kBufSize = 1024;
   char* read_buf = private_;
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index 1c4d9f1c7..11fb1b457 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -1418,7 +1418,7 @@ TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) {
 //
 // On most platforms this is trivial, but when the file is mapped via the sentry
 // page cache (which does not yet support writing to shared mappings), a bug
-// caused reads to fail unnecessarily on such mappings.
+// caused reads to fail unnecessarily on such mappings. See b/28913513.
 TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) {
   uintptr_t addr;
   size_t len = strlen(kFileContents);
@@ -1435,7 +1435,7 @@ TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) {
 
 // Tests that EFAULT is returned when invoking a syscall that requires the OS to
 // read past end of file (resulting in a fault in sentry context in the gVisor
-// case).
+// case). See b/28913513.
 TEST_F(MMapFileTest, InternalSigBus) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
@@ -1578,7 +1578,7 @@ TEST_F(MMapFileTest, Bug38498194) {
 }
 
 // Tests that reading from a file to a memory mapping of the same file does not
-// deadlock.
+// deadlock. See b/34813270.
 TEST_F(MMapFileTest, SelfRead) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
@@ -1590,7 +1590,7 @@ TEST_F(MMapFileTest, SelfRead) {
 }
 
 // Tests that writing to a file from a memory mapping of the same file does not
-// deadlock.
+// deadlock. Regression test for b/34813270.
 TEST_F(MMapFileTest, SelfWrite) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc
index 431733dbe..902d0a0dc 100644
--- a/test/syscalls/linux/open_create.cc
+++ b/test/syscalls/linux/open_create.cc
@@ -132,6 +132,7 @@ TEST(CreateTest, CreateFailsOnDirWithoutWritePerms) {
 }
 
 // A file originally created RW, but opened RO can later be opened RW.
+// Regression test for b/65385065.
 TEST(CreateTest, OpenCreateROThenRW) {
   TempPath file(NewTempAbsPath());
 
diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc
index f7ea44054..5b0743fe9 100644
--- a/test/syscalls/linux/preadv.cc
+++ b/test/syscalls/linux/preadv.cc
@@ -37,6 +37,7 @@ namespace testing {
 
 namespace {
 
+// Stress copy-on-write. Attempts to reproduce b/38430174.
 TEST(PreadvTest, MMConcurrencyStress) {
   // Fill a one-page file with zeroes (the contents don't really matter).
   const auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 169b723eb..a23fdb58d 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1352,13 +1352,19 @@ TEST(ProcPidSymlink, SubprocessZombied) {
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
-  // 4.17 & gVisor: Syscall succeeds and returns 1
+  //
+  // ~4.3: Syscall fails with EACCES.
+  // 4.17 & gVisor: Syscall succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(EACCES));
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
-  // 4.17 &  gVisor: Syscall succeeds and returns 1.
+  //
+  // ~4.3: Syscall fails with EACCES.
+  // 4.17 & gVisor: Syscall succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(EACCES));
 }
@@ -1431,8 +1437,12 @@ TEST(ProcPidFile, SubprocessRunning) {
 TEST(ProcPidFile, SubprocessZombie) {
   char buf[1];
 
-  // 4.17: Succeeds and returns 1
-  // gVisor: Succeeds and returns 0
+  // FIXME(gvisor.dev/issue/164): Loosen requirement due to inconsistent
+  // behavior on different kernels.
+  //
+  // ~4.3: Succeds and returns 0.
+  // 4.17: Succeeds and returns 1.
+  // gVisor: Succeeds and returns 0.
   EXPECT_THAT(ReadWhileZombied("auxv", buf, sizeof(buf)), SyscallSucceeds());
 
   EXPECT_THAT(ReadWhileZombied("cmdline", buf, sizeof(buf)),
@@ -1458,7 +1468,10 @@ TEST(ProcPidFile, SubprocessZombie) {
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
+  //
+  // ~4.3: Fails and returns EACCES.
   // gVisor & 4.17: Succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadWhileZombied("io", buf, sizeof(buf)),
   //          SyscallFailsWithErrno(EACCES));
 }
@@ -1467,9 +1480,12 @@ TEST(ProcPidFile, SubprocessZombie) {
 TEST(ProcPidFile, SubprocessExited) {
   char buf[1];
 
-  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels
+  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels.
+  //
+  // ~4.3: Fails and returns ESRCH.
   // gVisor: Fails with ESRCH.
   // 4.17: Succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadWhileExited("auxv", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(ESRCH));
 
@@ -1641,7 +1657,7 @@ TEST(ProcTask, KilledThreadsDisappear) {
   EXPECT_NO_ERRNO(DirContainsExactly("/proc/self/task",
                                      TaskFiles(initial, {child1.Tid()})));
 
-  // Stat child1's task file.
+  // Stat child1's task file. Regression test for b/32097707.
   struct stat statbuf;
   const std::string child1_task_file =
       absl::StrCat("/proc/self/task/", child1.Tid());
@@ -1669,7 +1685,7 @@ TEST(ProcTask, KilledThreadsDisappear) {
   EXPECT_NO_ERRNO(EventuallyDirContainsExactly(
       "/proc/self/task", TaskFiles(initial, {child3.Tid(), child5.Tid()})));
 
-  // Stat child1's task file again.  This time it should fail.
+  // Stat child1's task file again.  This time it should fail. See b/32097707.
   EXPECT_THAT(stat(child1_task_file.c_str(), &statbuf),
               SyscallFailsWithErrno(ENOENT));
 
@@ -1824,7 +1840,7 @@ TEST(ProcSysVmOvercommitMemory, HasNumericValue) {
 }
 
 // Check that link for proc fd entries point the target node, not the
-// symlink itself.
+// symlink itself. Regression test for b/31155070.
 TEST(ProcTaskFd, FstatatFollowsSymlink) {
   const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
   const FileDescriptor fd =
@@ -1883,6 +1899,20 @@ TEST(ProcMounts, IsSymlink) {
   EXPECT_EQ(link, "self/mounts");
 }
 
+TEST(ProcSelfMountinfo, RequiredFieldsArePresent) {
+  auto mountinfo =
+      ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mountinfo"));
+  EXPECT_THAT(
+      mountinfo,
+      AllOf(
+          // Root mount.
+          ContainsRegex(
+              R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / / (rw|ro).*- \S+ \S+ (rw|ro)\S*)"),
+          // Proc mount - always rw.
+          ContainsRegex(
+              R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / /proc rw.*- \S+ \S+ rw\S*)")));
+}
+
 // Check that /proc/self/mounts looks something like a real mounts file.
 TEST(ProcSelfMounts, RequiredFieldsArePresent) {
   auto mounts = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mounts"));
diff --git a/test/syscalls/linux/readv.cc b/test/syscalls/linux/readv.cc
index 4069cbc7e..baaf9f757 100644
--- a/test/syscalls/linux/readv.cc
+++ b/test/syscalls/linux/readv.cc
@@ -254,7 +254,9 @@ TEST_F(ReadvTest, IovecOutsideTaskAddressRangeInNonemptyArray) {
 // This test depends on the maximum extent of a single readv() syscall, so
 // we can't tolerate interruption from saving.
 TEST(ReadvTestNoFixture, TruncatedAtMax_NoRandomSave) {
-  // Ensure that we won't be interrupted by ITIMER_PROF.
+  // Ensure that we won't be interrupted by ITIMER_PROF. This is particularly
+  // important in environments where automated profiling tools may start
+  // ITIMER_PROF automatically.
   struct itimerval itv = {};
   auto const cleanup_itimer =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_PROF, itv));
diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc
index 106c045e3..4bfb1ff56 100644
--- a/test/syscalls/linux/rseq.cc
+++ b/test/syscalls/linux/rseq.cc
@@ -36,7 +36,7 @@ namespace {
 // We must be very careful about how these tests are written. Each thread may
 // only have one struct rseq registration, which may be done automatically at
 // thread start (as of 2019-11-13, glibc does *not* support rseq and thus does
-// not do so).
+// not do so, but other libraries do).
 //
 // Testing of rseq is thus done primarily in a child process with no
 // registration. This means exec'ing a nostdlib binary, as rseq registration can
diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc
index 424e2a67f..be2364fb8 100644
--- a/test/syscalls/linux/select.cc
+++ b/test/syscalls/linux/select.cc
@@ -146,7 +146,7 @@ TEST_F(SelectTest, IgnoreBitsAboveNfds) {
 
 // This test illustrates Linux's behavior of 'select' calls passing after
 // setrlimit RLIMIT_NOFILE is called. In particular, versions of sshd rely on
-// this behavior.
+// this behavior. See b/122318458.
 TEST_F(SelectTest, SetrlimitCallNOFILE) {
   fd_set read_set;
   FD_ZERO(&read_set);
diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc
index 7ba752599..c7fdbb924 100644
--- a/test/syscalls/linux/shm.cc
+++ b/test/syscalls/linux/shm.cc
@@ -473,7 +473,7 @@ TEST(ShmTest, PartialUnmap) {
 }
 
 // Check that sentry does not panic when asked for a zero-length private shm
-// segment.
+// segment. Regression test for b/110694797.
 TEST(ShmTest, GracefullyFailOnZeroLenSegmentCreation) {
   EXPECT_THAT(Shmget(IPC_PRIVATE, 0, 0), PosixErrorIs(EINVAL, _));
 }
diff --git a/test/syscalls/linux/sigprocmask.cc b/test/syscalls/linux/sigprocmask.cc
index 654c6a47f..a603fc1d1 100644
--- a/test/syscalls/linux/sigprocmask.cc
+++ b/test/syscalls/linux/sigprocmask.cc
@@ -237,7 +237,7 @@ TEST_F(SigProcMaskTest, SignalHandler) {
 }
 
 // Check that sigprocmask correctly handles aliasing of the set and oldset
-// pointers.
+// pointers. Regression test for b/30502311.
 TEST_F(SigProcMaskTest, AliasedSets) {
   sigset_t mask;
 
diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc
index 276a94eb8..884319e1d 100644
--- a/test/syscalls/linux/socket_unix_non_stream.cc
+++ b/test/syscalls/linux/socket_unix_non_stream.cc
@@ -109,7 +109,7 @@ PosixErrorOr<std::vector<Mapping>> CreateFragmentedRegion(const int size,
 }
 
 // A contiguous iov that is heavily fragmented in FileMem can still be sent
-// successfully.
+// successfully. See b/115833655.
 TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -165,7 +165,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) {
 }
 
 // A contiguous iov that is heavily fragmented in FileMem can still be received
-// into successfully.
+// into successfully. Regression test for b/115833655.
 TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc
index b249ff91f..03ee1250d 100644
--- a/test/syscalls/linux/symlink.cc
+++ b/test/syscalls/linux/symlink.cc
@@ -38,7 +38,7 @@ mode_t FilePermission(const std::string& path) {
 }
 
 // Test that name collisions are checked on the new link path, not the source
-// path.
+// path. Regression test for b/31782115.
 TEST(SymlinkTest, CanCreateSymlinkWithCachedSourceDirent) {
   const std::string srcname = NewTempAbsPath();
   const std::string newname = NewTempAbsPath();
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 8a8b68e75..c4591a3b9 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -244,7 +244,8 @@ TEST_P(TcpSocketTest, ZeroWriteAllowed) {
 }
 
 // Test that a non-blocking write with a buffer that is larger than the send
-// buffer size will not actually write the whole thing at once.
+// buffer size will not actually write the whole thing at once. Regression test
+// for b/64438887.
 TEST_P(TcpSocketTest, NonblockingLargeWrite) {
   // Set the FD to O_NONBLOCK.
   int opts;
diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc
index c7eead17e..1ccb95733 100644
--- a/test/syscalls/linux/time.cc
+++ b/test/syscalls/linux/time.cc
@@ -62,6 +62,7 @@ TEST(TimeTest, VsyscallTime_InvalidAddressSIGSEGV) {
               ::testing::KilledBySignal(SIGSEGV), "");
 }
 
+// Mimics the gettimeofday(2) wrapper from the Go runtime <= 1.2.
 int vsyscall_gettimeofday(struct timeval* tv, struct timezone* tz) {
   constexpr uint64_t kVsyscallGettimeofdayEntry = 0xffffffffff600000;
   return reinterpret_cast<int (*)(struct timeval*, struct timezone*)>(
diff --git a/test/syscalls/linux/tkill.cc b/test/syscalls/linux/tkill.cc
index bae377c69..8d8ebbb24 100644
--- a/test/syscalls/linux/tkill.cc
+++ b/test/syscalls/linux/tkill.cc
@@ -54,7 +54,7 @@ void SigHandler(int sig, siginfo_t* info, void* context) {
   TEST_CHECK(info->si_code == SI_TKILL);
 }
 
-// Test with a real signal.
+// Test with a real signal. Regression test for b/24790092.
 TEST(TkillTest, ValidTIDAndRealSignal) {
   struct sigaction sa;
   sa.sa_sigaction = SigHandler;
diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc
index 35aacb172..9c10b6674 100644
--- a/test/util/temp_path.cc
+++ b/test/util/temp_path.cc
@@ -77,6 +77,7 @@ std::string NewTempAbsPath() {
 std::string NewTempRelPath() { return NextTempBasename(); }
 
 std::string GetAbsoluteTestTmpdir() {
+  // Note that TEST_TMPDIR is guaranteed to be set.
   char* env_tmpdir = getenv("TEST_TMPDIR");
   std::string tmp_dir =
       env_tmpdir != nullptr ? std::string(env_tmpdir) : "/tmp";
diff --git a/tools/build/tags.bzl b/tools/build/tags.bzl
index e99c87f81..a6db44e47 100644
--- a/tools/build/tags.bzl
+++ b/tools/build/tags.bzl
@@ -33,4 +33,8 @@ go_suffixes = [
     "_wasm_unsafe",
     "_linux",
     "_linux_unsafe",
+    "_opts",
+    "_opts_unsafe",
+    "_impl",
+    "_impl_unsafe",
 ]
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 5d5fa134a..c03b557ae 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -73,6 +73,16 @@ def calculate_sets(srcs):
             result[target].append(file)
     return result
 
+def go_imports(name, src, out):
+    """Simplify a single Go source file by eliminating unused imports."""
+    native.genrule(
+        name = name,
+        srcs = [src],
+        outs = [out],
+        tools = ["@org_golang_x_tools//cmd/goimports:goimports"],
+        cmd = ("$(location @org_golang_x_tools//cmd/goimports:goimports) $(SRCS) > $@"),
+    )
+
 def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs):
     """Wraps the standard go_library and does stateification and marshalling.
 
@@ -107,10 +117,15 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
         state_sets = calculate_sets(srcs)
         for (suffix, srcs) in state_sets.items():
             go_stateify(
-                name = name + suffix + "_state_autogen",
+                name = name + suffix + "_state_autogen_with_imports",
                 srcs = srcs,
                 imports = imports,
                 package = name,
+                out = name + suffix + "_state_autogen_with_imports.go",
+            )
+            go_imports(
+                name = name + suffix + "_state_autogen",
+                src = name + suffix + "_state_autogen_with_imports.go",
                 out = name + suffix + "_state_autogen.go",
             )
         all_srcs = all_srcs + [
-- 
cgit v1.2.3


From 5ff780891e229dbde00d9a37c2f8b6681e592fdb Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Thu, 6 Feb 2020 10:06:56 -0800
Subject: Move p9.pool to a separate package

PiperOrigin-RevId: 293617493
---
 pkg/p9/BUILD          |  3 +--
 pkg/p9/client.go      |  9 ++++---
 pkg/p9/pool.go        | 68 ---------------------------------------------------
 pkg/p9/pool_test.go   | 64 ------------------------------------------------
 pkg/pool/BUILD        | 25 +++++++++++++++++++
 pkg/pool/pool.go      | 66 +++++++++++++++++++++++++++++++++++++++++++++++++
 pkg/pool/pool_test.go | 64 ++++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 161 insertions(+), 138 deletions(-)
 delete mode 100644 pkg/p9/pool.go
 delete mode 100644 pkg/p9/pool_test.go
 create mode 100644 pkg/pool/BUILD
 create mode 100644 pkg/pool/pool.go
 create mode 100644 pkg/pool/pool_test.go

diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD
index 4ccc1de86..8904afad9 100644
--- a/pkg/p9/BUILD
+++ b/pkg/p9/BUILD
@@ -16,7 +16,6 @@ go_library(
         "messages.go",
         "p9.go",
         "path_tree.go",
-        "pool.go",
         "server.go",
         "transport.go",
         "transport_flipcall.go",
@@ -27,6 +26,7 @@ go_library(
         "//pkg/fdchannel",
         "//pkg/flipcall",
         "//pkg/log",
+        "//pkg/pool",
         "//pkg/sync",
         "//pkg/unet",
         "@org_golang_x_sys//unix:go_default_library",
@@ -41,7 +41,6 @@ go_test(
         "client_test.go",
         "messages_test.go",
         "p9_test.go",
-        "pool_test.go",
         "transport_test.go",
         "version_test.go",
     ],
diff --git a/pkg/p9/client.go b/pkg/p9/client.go
index 4045e41fa..a6f493b82 100644
--- a/pkg/p9/client.go
+++ b/pkg/p9/client.go
@@ -22,6 +22,7 @@ import (
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/flipcall"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/pool"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 )
@@ -74,10 +75,10 @@ type Client struct {
 	socket *unet.Socket
 
 	// tagPool is the collection of available tags.
-	tagPool pool
+	tagPool pool.Pool
 
 	// fidPool is the collection of available fids.
-	fidPool pool
+	fidPool pool.Pool
 
 	// messageSize is the maximum total size of a message.
 	messageSize uint32
@@ -155,8 +156,8 @@ func NewClient(socket *unet.Socket, messageSize uint32, version string) (*Client
 	}
 	c := &Client{
 		socket:      socket,
-		tagPool:     pool{start: 1, limit: uint64(NoTag)},
-		fidPool:     pool{start: 1, limit: uint64(NoFID)},
+		tagPool:     pool.Pool{Start: 1, Limit: uint64(NoTag)},
+		fidPool:     pool.Pool{Start: 1, Limit: uint64(NoFID)},
 		pending:     make(map[Tag]*response),
 		recvr:       make(chan bool, 1),
 		messageSize: messageSize,
diff --git a/pkg/p9/pool.go b/pkg/p9/pool.go
deleted file mode 100644
index 2b14a5ce3..000000000
--- a/pkg/p9/pool.go
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package p9
-
-import (
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-// pool is a simple allocator.
-//
-// It is used for both tags and FIDs.
-type pool struct {
-	mu sync.Mutex
-
-	// cache is the set of returned values.
-	cache []uint64
-
-	// start is the starting value (if needed).
-	start uint64
-
-	// max is the current maximum issued.
-	max uint64
-
-	// limit is the upper limit.
-	limit uint64
-}
-
-// Get gets a value from the pool.
-func (p *pool) Get() (uint64, bool) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-
-	// Anything cached?
-	if len(p.cache) > 0 {
-		v := p.cache[len(p.cache)-1]
-		p.cache = p.cache[:len(p.cache)-1]
-		return v, true
-	}
-
-	// Over the limit?
-	if p.start == p.limit {
-		return 0, false
-	}
-
-	// Generate a new value.
-	v := p.start
-	p.start++
-	return v, true
-}
-
-// Put returns a value to the pool.
-func (p *pool) Put(v uint64) {
-	p.mu.Lock()
-	p.cache = append(p.cache, v)
-	p.mu.Unlock()
-}
diff --git a/pkg/p9/pool_test.go b/pkg/p9/pool_test.go
deleted file mode 100644
index e4746b8da..000000000
--- a/pkg/p9/pool_test.go
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package p9
-
-import (
-	"testing"
-)
-
-func TestPoolUnique(t *testing.T) {
-	p := pool{start: 1, limit: 3}
-	got := make(map[uint64]bool)
-
-	for {
-		n, ok := p.Get()
-		if !ok {
-			break
-		}
-
-		// Check unique.
-		if _, ok := got[n]; ok {
-			t.Errorf("pool spit out %v multiple times", n)
-		}
-
-		// Record.
-		got[n] = true
-	}
-}
-
-func TestExausted(t *testing.T) {
-	p := pool{start: 1, limit: 500}
-	for i := 0; i < 499; i++ {
-		_, ok := p.Get()
-		if !ok {
-			t.Fatalf("pool exhausted before 499 items")
-		}
-	}
-
-	_, ok := p.Get()
-	if ok {
-		t.Errorf("pool not exhausted when it should be")
-	}
-}
-
-func TestPoolRecycle(t *testing.T) {
-	p := pool{start: 1, limit: 500}
-	n1, _ := p.Get()
-	p.Put(n1)
-	n2, _ := p.Get()
-	if n1 != n2 {
-		t.Errorf("pool not recycling items")
-	}
-}
diff --git a/pkg/pool/BUILD b/pkg/pool/BUILD
new file mode 100644
index 000000000..7b1c6b75b
--- /dev/null
+++ b/pkg/pool/BUILD
@@ -0,0 +1,25 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(
+    default_visibility = ["//visibility:public"],
+    licenses = ["notice"],
+)
+
+go_library(
+    name = "pool",
+    srcs = [
+        "pool.go",
+    ],
+    deps = [
+        "//pkg/sync",
+    ],
+)
+
+go_test(
+    name = "pool_test",
+    size = "small",
+    srcs = [
+        "pool_test.go",
+    ],
+    library = ":pool",
+)
diff --git a/pkg/pool/pool.go b/pkg/pool/pool.go
new file mode 100644
index 000000000..a1b2e0cfe
--- /dev/null
+++ b/pkg/pool/pool.go
@@ -0,0 +1,66 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pool
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// Pool is a simple allocator.
+type Pool struct {
+	mu sync.Mutex
+
+	// cache is the set of returned values.
+	cache []uint64
+
+	// Start is the starting value (if needed).
+	Start uint64
+
+	// max is the current maximum issued.
+	max uint64
+
+	// Limit is the upper limit.
+	Limit uint64
+}
+
+// Get gets a value from the pool.
+func (p *Pool) Get() (uint64, bool) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	// Anything cached?
+	if len(p.cache) > 0 {
+		v := p.cache[len(p.cache)-1]
+		p.cache = p.cache[:len(p.cache)-1]
+		return v, true
+	}
+
+	// Over the limit?
+	if p.Start == p.Limit {
+		return 0, false
+	}
+
+	// Generate a new value.
+	v := p.Start
+	p.Start++
+	return v, true
+}
+
+// Put returns a value to the pool.
+func (p *Pool) Put(v uint64) {
+	p.mu.Lock()
+	p.cache = append(p.cache, v)
+	p.mu.Unlock()
+}
diff --git a/pkg/pool/pool_test.go b/pkg/pool/pool_test.go
new file mode 100644
index 000000000..d928439c1
--- /dev/null
+++ b/pkg/pool/pool_test.go
@@ -0,0 +1,64 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pool
+
+import (
+	"testing"
+)
+
+func TestPoolUnique(t *testing.T) {
+	p := Pool{Start: 1, Limit: 3}
+	got := make(map[uint64]bool)
+
+	for {
+		n, ok := p.Get()
+		if !ok {
+			break
+		}
+
+		// Check unique.
+		if _, ok := got[n]; ok {
+			t.Errorf("pool spit out %v multiple times", n)
+		}
+
+		// Record.
+		got[n] = true
+	}
+}
+
+func TestExausted(t *testing.T) {
+	p := Pool{Start: 1, Limit: 500}
+	for i := 0; i < 499; i++ {
+		_, ok := p.Get()
+		if !ok {
+			t.Fatalf("pool exhausted before 499 items")
+		}
+	}
+
+	_, ok := p.Get()
+	if ok {
+		t.Errorf("pool not exhausted when it should be")
+	}
+}
+
+func TestPoolRecycle(t *testing.T) {
+	p := Pool{Start: 1, Limit: 500}
+	n1, _ := p.Get()
+	p.Put(n1)
+	n2, _ := p.Get()
+	if n1 != n2 {
+		t.Errorf("pool not recycling items")
+	}
+}
-- 
cgit v1.2.3


From 0e96fcafd4404e1418c84b7830b9455867e174bb Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 6 Feb 2020 10:11:15 -0800
Subject: Fix test case on AMD.

When ignored, the trap should be executed which generates
a SIGSEGV as in the above case.

PiperOrigin-RevId: 293618489
---
 test/syscalls/linux/32bit.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index 9883aef61..c47a05181 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -155,7 +155,7 @@ TEST(Syscall32Bit, Syscall) {
     case PlatformSupport::Ignored:
       // See above.
       EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
-                  ::testing::KilledBySignal(SIGILL), "");
+                  ::testing::KilledBySignal(SIGSEGV), "");
       break;
 
     case PlatformSupport::Allowed:
-- 
cgit v1.2.3


From 6bd59b4e08893281468e8af5aebb5fab0f7a8c0d Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 6 Feb 2020 11:12:41 -0800
Subject: Update link address for targets of Neighbor Adverts

Get the link address for the target of an NDP Neighbor Advertisement
from the NDP Target Link Layer Address option.

Tests:
- ipv6.TestNeighorAdvertisementWithTargetLinkLayerOption
- ipv6.TestNeighorAdvertisementWithInvalidTargetLinkLayerOption
PiperOrigin-RevId: 293632609
---
 pkg/tcpip/network/ipv6/icmp.go      |  44 ++++--
 pkg/tcpip/network/ipv6/icmp_test.go | 186 +++++++++++++++---------
 pkg/tcpip/network/ipv6/ndp_test.go  | 278 +++++++++++++++++++++++++-----------
 pkg/tcpip/stack/ndp_test.go         |   8 +-
 4 files changed, 352 insertions(+), 164 deletions(-)

diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 60817d36d..45dc757c7 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -15,6 +15,8 @@
 package ipv6
 
 import (
+	"log"
+
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -194,7 +196,11 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		// TODO(b/148429853): Properly process the NS message and do Neighbor
 		// Unreachability Detection.
 		for {
-			opt, done, _ := it.Next()
+			opt, done, err := it.Next()
+			if err != nil {
+				// This should never happen as Iter(true) above did not return an error.
+				log.Fatalf("unexpected error when iterating over NDP options: %s", err)
+			}
 			if done {
 				break
 			}
@@ -253,21 +259,25 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		}
 
 		na := header.NDPNeighborAdvert(h.NDPPayload())
+		it, err := na.Options().Iter(true)
+		if err != nil {
+			// If we have a malformed NDP NA option, drop the packet.
+			received.Invalid.Increment()
+			return
+		}
+
 		targetAddr := na.TargetAddress()
 		stack := r.Stack()
 		rxNICID := r.NICID()
 
-		isTentative, err := stack.IsAddrTentative(rxNICID, targetAddr)
-		if err != nil {
+		if isTentative, err := stack.IsAddrTentative(rxNICID, targetAddr); err != nil {
 			// We will only get an error if rxNICID is unrecognized,
 			// which should not happen. For now short-circuit this
 			// packet.
 			//
 			// TODO(b/141002840): Handle this better?
 			return
-		}
-
-		if isTentative {
+		} else if isTentative {
 			// We just got an NA from a node that owns an address we
 			// are performing DAD on, implying the address is not
 			// unique. In this case we let the stack know so it can
@@ -283,13 +293,29 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		// scenario is beyond the scope of RFC 4862. As such, we simply
 		// ignore such a scenario for now and proceed as normal.
 		//
+		// If the NA message has the target link layer option, update the link
+		// address cache with the link address for the target of the message.
+		//
 		// TODO(b/143147598): Handle the scenario described above. Also
 		// inform the netstack integration that a duplicate address was
 		// detected outside of DAD.
+		//
+		// TODO(b/148429853): Properly process the NA message and do Neighbor
+		// Unreachability Detection.
+		for {
+			opt, done, err := it.Next()
+			if err != nil {
+				// This should never happen as Iter(true) above did not return an error.
+				log.Fatalf("unexpected error when iterating over NDP options: %s", err)
+			}
+			if done {
+				break
+			}
 
-		e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, r.RemoteLinkAddress)
-		if targetAddr != r.RemoteAddress {
-			e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, r.RemoteLinkAddress)
+			switch opt := opt.(type) {
+			case header.NDPTargetLinkLayerAddressOption:
+				e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, opt.EthernetAddress())
+			}
 		}
 
 	case header.ICMPv6EchoRequest:
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index d0e930e20..50c4b6474 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -121,21 +121,60 @@ func TestICMPCounts(t *testing.T) {
 	}
 	defer r.Release()
 
+	var tllData [header.NDPLinkLayerAddressSize]byte
+	header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
+		header.NDPTargetLinkLayerAddressOption(linkAddr1),
+	})
+
 	types := []struct {
-		typ  header.ICMPv6Type
-		size int
+		typ       header.ICMPv6Type
+		size      int
+		extraData []byte
 	}{
-		{header.ICMPv6DstUnreachable, header.ICMPv6DstUnreachableMinimumSize},
-		{header.ICMPv6PacketTooBig, header.ICMPv6PacketTooBigMinimumSize},
-		{header.ICMPv6TimeExceeded, header.ICMPv6MinimumSize},
-		{header.ICMPv6ParamProblem, header.ICMPv6MinimumSize},
-		{header.ICMPv6EchoRequest, header.ICMPv6EchoMinimumSize},
-		{header.ICMPv6EchoReply, header.ICMPv6EchoMinimumSize},
-		{header.ICMPv6RouterSolicit, header.ICMPv6MinimumSize},
-		{header.ICMPv6RouterAdvert, header.ICMPv6HeaderSize + header.NDPRAMinimumSize},
-		{header.ICMPv6NeighborSolicit, header.ICMPv6NeighborSolicitMinimumSize},
-		{header.ICMPv6NeighborAdvert, header.ICMPv6NeighborAdvertSize},
-		{header.ICMPv6RedirectMsg, header.ICMPv6MinimumSize},
+		{
+			typ:  header.ICMPv6DstUnreachable,
+			size: header.ICMPv6DstUnreachableMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6PacketTooBig,
+			size: header.ICMPv6PacketTooBigMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6TimeExceeded,
+			size: header.ICMPv6MinimumSize,
+		},
+		{
+			typ:  header.ICMPv6ParamProblem,
+			size: header.ICMPv6MinimumSize,
+		},
+		{
+			typ:  header.ICMPv6EchoRequest,
+			size: header.ICMPv6EchoMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6EchoReply,
+			size: header.ICMPv6EchoMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6RouterSolicit,
+			size: header.ICMPv6MinimumSize,
+		},
+		{
+			typ:  header.ICMPv6RouterAdvert,
+			size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6NeighborSolicit,
+			size: header.ICMPv6NeighborSolicitMinimumSize},
+		{
+			typ:       header.ICMPv6NeighborAdvert,
+			size:      header.ICMPv6NeighborAdvertMinimumSize,
+			extraData: tllData[:],
+		},
+		{
+			typ:  header.ICMPv6RedirectMsg,
+			size: header.ICMPv6MinimumSize,
+		},
 	}
 
 	handleIPv6Payload := func(hdr buffer.Prependable) {
@@ -154,10 +193,13 @@ func TestICMPCounts(t *testing.T) {
 	}
 
 	for _, typ := range types {
-		hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size)
+		extraDataLen := len(typ.extraData)
+		hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen)
+		extraData := buffer.View(hdr.Prepend(extraDataLen))
+		copy(extraData, typ.extraData)
 		pkt := header.ICMPv6(hdr.Prepend(typ.size))
 		pkt.SetType(typ.typ)
-		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView()))
 
 		handleIPv6Payload(hdr)
 	}
@@ -372,97 +414,104 @@ func TestLinkResolution(t *testing.T) {
 }
 
 func TestICMPChecksumValidationSimple(t *testing.T) {
+	var tllData [header.NDPLinkLayerAddressSize]byte
+	header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
+		header.NDPTargetLinkLayerAddressOption(linkAddr1),
+	})
+
 	types := []struct {
 		name        string
 		typ         header.ICMPv6Type
 		size        int
+		extraData   []byte
 		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
 	}{
 		{
-			"DstUnreachable",
-			header.ICMPv6DstUnreachable,
-			header.ICMPv6DstUnreachableMinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "DstUnreachable",
+			typ:  header.ICMPv6DstUnreachable,
+			size: header.ICMPv6DstUnreachableMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.DstUnreachable
 			},
 		},
 		{
-			"PacketTooBig",
-			header.ICMPv6PacketTooBig,
-			header.ICMPv6PacketTooBigMinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "PacketTooBig",
+			typ:  header.ICMPv6PacketTooBig,
+			size: header.ICMPv6PacketTooBigMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.PacketTooBig
 			},
 		},
 		{
-			"TimeExceeded",
-			header.ICMPv6TimeExceeded,
-			header.ICMPv6MinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "TimeExceeded",
+			typ:  header.ICMPv6TimeExceeded,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.TimeExceeded
 			},
 		},
 		{
-			"ParamProblem",
-			header.ICMPv6ParamProblem,
-			header.ICMPv6MinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "ParamProblem",
+			typ:  header.ICMPv6ParamProblem,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.ParamProblem
 			},
 		},
 		{
-			"EchoRequest",
-			header.ICMPv6EchoRequest,
-			header.ICMPv6EchoMinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "EchoRequest",
+			typ:  header.ICMPv6EchoRequest,
+			size: header.ICMPv6EchoMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.EchoRequest
 			},
 		},
 		{
-			"EchoReply",
-			header.ICMPv6EchoReply,
-			header.ICMPv6EchoMinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "EchoReply",
+			typ:  header.ICMPv6EchoReply,
+			size: header.ICMPv6EchoMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.EchoReply
 			},
 		},
 		{
-			"RouterSolicit",
-			header.ICMPv6RouterSolicit,
-			header.ICMPv6MinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "RouterSolicit",
+			typ:  header.ICMPv6RouterSolicit,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.RouterSolicit
 			},
 		},
 		{
-			"RouterAdvert",
-			header.ICMPv6RouterAdvert,
-			header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "RouterAdvert",
+			typ:  header.ICMPv6RouterAdvert,
+			size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.RouterAdvert
 			},
 		},
 		{
-			"NeighborSolicit",
-			header.ICMPv6NeighborSolicit,
-			header.ICMPv6NeighborSolicitMinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "NeighborSolicit",
+			typ:  header.ICMPv6NeighborSolicit,
+			size: header.ICMPv6NeighborSolicitMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.NeighborSolicit
 			},
 		},
 		{
-			"NeighborAdvert",
-			header.ICMPv6NeighborAdvert,
-			header.ICMPv6NeighborAdvertSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name:      "NeighborAdvert",
+			typ:       header.ICMPv6NeighborAdvert,
+			size:      header.ICMPv6NeighborAdvertMinimumSize,
+			extraData: tllData[:],
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.NeighborAdvert
 			},
 		},
 		{
-			"RedirectMsg",
-			header.ICMPv6RedirectMsg,
-			header.ICMPv6MinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "RedirectMsg",
+			typ:  header.ICMPv6RedirectMsg,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.RedirectMsg
 			},
 		},
@@ -494,16 +543,19 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 				)
 			}
 
-			handleIPv6Payload := func(typ header.ICMPv6Type, size int, checksum bool) {
-				hdr := buffer.NewPrependable(header.IPv6MinimumSize + size)
-				pkt := header.ICMPv6(hdr.Prepend(size))
-				pkt.SetType(typ)
+			handleIPv6Payload := func(checksum bool) {
+				extraDataLen := len(typ.extraData)
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen)
+				extraData := buffer.View(hdr.Prepend(extraDataLen))
+				copy(extraData, typ.extraData)
+				pkt := header.ICMPv6(hdr.Prepend(typ.size))
+				pkt.SetType(typ.typ)
 				if checksum {
-					pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+					pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, extraData.ToVectorisedView()))
 				}
 				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
 				ip.Encode(&header.IPv6Fields{
-					PayloadLength: uint16(size),
+					PayloadLength: uint16(typ.size + extraDataLen),
 					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
 					HopLimit:      header.NDPHopLimit,
 					SrcAddr:       lladdr1,
@@ -528,7 +580,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 
 			// Without setting checksum, the incoming packet should
 			// be invalid.
-			handleIPv6Payload(typ.typ, typ.size, false)
+			handleIPv6Payload(false)
 			if got := invalid.Value(); got != 1 {
 				t.Fatalf("got invalid = %d, want = 1", got)
 			}
@@ -538,7 +590,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 			}
 
 			// When checksum is set, it should be received.
-			handleIPv6Payload(typ.typ, typ.size, true)
+			handleIPv6Payload(true)
 			if got := typStat.Value(); got != 1 {
 				t.Fatalf("got %s = %d, want = 1", typ.name, got)
 			}
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index bd732f93f..c9395de52 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -70,76 +70,29 @@ func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address) (*stack
 	return s, ep
 }
 
-// TestNeighorSolicitationWithSourceLinkLayerOption tests that receiving an
-// NDP NS message with the Source Link Layer Address option results in a
+// TestNeighorSolicitationWithSourceLinkLayerOption tests that receiving a
+// valid NDP NS message with the Source Link Layer Address option results in a
 // new entry in the link address cache for the sender of the message.
 func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
 	const nicID = 1
 
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
-	})
-	e := channel.New(0, 1280, linkAddr0)
-	if err := s.CreateNIC(nicID, e); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
-	}
-	if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
-		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
-	}
-
-	ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
-	hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
-	pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
-	pkt.SetType(header.ICMPv6NeighborSolicit)
-	ns := header.NDPNeighborSolicit(pkt.NDPPayload())
-	ns.SetTargetAddress(lladdr0)
-	ns.Options().Serialize(header.NDPOptionsSerializer{
-		header.NDPSourceLinkLayerAddressOption(linkAddr1),
-	})
-	pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
-	payloadLength := hdr.UsedLength()
-	ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
-	ip.Encode(&header.IPv6Fields{
-		PayloadLength: uint16(payloadLength),
-		NextHeader:    uint8(header.ICMPv6ProtocolNumber),
-		HopLimit:      255,
-		SrcAddr:       lladdr1,
-		DstAddr:       lladdr0,
-	})
-	e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
-		Data: hdr.View().ToVectorisedView(),
-	})
-
-	linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil)
-	if err != nil {
-		t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err)
-	}
-	if c != nil {
-		t.Errorf("got unexpected channel")
-	}
-	if linkAddr != linkAddr1 {
-		t.Errorf("got link address = %s, want = %s", linkAddr, linkAddr1)
-	}
-}
-
-// TestNeighorSolicitationWithInvalidSourceLinkLayerOption tests that receiving
-// an NDP NS message with an invalid Source Link Layer Address option does not
-// result in a new entry in the link address cache for the sender of the
-// message.
-func TestNeighorSolicitationWithInvalidSourceLinkLayerOption(t *testing.T) {
-	const nicID = 1
-
 	tests := []struct {
-		name    string
-		optsBuf []byte
+		name             string
+		optsBuf          []byte
+		expectedLinkAddr tcpip.LinkAddress
 	}{
+		{
+			name:             "Valid",
+			optsBuf:          []byte{1, 1, 2, 3, 4, 5, 6, 7},
+			expectedLinkAddr: "\x02\x03\x04\x05\x06\x07",
+		},
 		{
 			name:    "Too Small",
-			optsBuf: []byte{1, 1, 1, 2, 3, 4, 5},
+			optsBuf: []byte{1, 1, 2, 3, 4, 5, 6},
 		},
 		{
 			name:    "Invalid Length",
-			optsBuf: []byte{1, 2, 1, 2, 3, 4, 5, 6},
+			optsBuf: []byte{1, 2, 2, 3, 4, 5, 6, 7},
 		},
 	}
 
@@ -186,20 +139,138 @@ func TestNeighorSolicitationWithInvalidSourceLinkLayerOption(t *testing.T) {
 				Data: hdr.View().ToVectorisedView(),
 			})
 
-			// Invalid count should have increased.
-			if got := invalid.Value(); got != 1 {
-				t.Fatalf("got invalid = %d, want = 1", got)
+			linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil)
+			if linkAddr != test.expectedLinkAddr {
+				t.Errorf("got link address = %s, want = %s", linkAddr, test.expectedLinkAddr)
 			}
 
-			linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil)
-			if err != tcpip.ErrWouldBlock {
-				t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock)
+			if test.expectedLinkAddr != "" {
+				if err != nil {
+					t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err)
+				}
+				if c != nil {
+					t.Errorf("got unexpected channel")
+				}
+
+				// Invalid count should not have increased.
+				if got := invalid.Value(); got != 0 {
+					t.Errorf("got invalid = %d, want = 0", got)
+				}
+			} else {
+				if err != tcpip.ErrWouldBlock {
+					t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock)
+				}
+				if c == nil {
+					t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber)
+				}
+
+				// Invalid count should have increased.
+				if got := invalid.Value(); got != 1 {
+					t.Errorf("got invalid = %d, want = 1", got)
+				}
+			}
+		})
+	}
+}
+
+// TestNeighorAdvertisementWithTargetLinkLayerOption tests that receiving a
+// valid NDP NA message with the Target Link Layer Address option results in a
+// new entry in the link address cache for the target of the message.
+func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name             string
+		optsBuf          []byte
+		expectedLinkAddr tcpip.LinkAddress
+	}{
+		{
+			name:             "Valid",
+			optsBuf:          []byte{2, 1, 2, 3, 4, 5, 6, 7},
+			expectedLinkAddr: "\x02\x03\x04\x05\x06\x07",
+		},
+		{
+			name:    "Too Small",
+			optsBuf: []byte{2, 1, 2, 3, 4, 5, 6},
+		},
+		{
+			name:    "Invalid Length",
+			optsBuf: []byte{2, 2, 2, 3, 4, 5, 6, 7},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			e := channel.New(0, 1280, linkAddr0)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
+			}
+
+			ndpNASize := header.ICMPv6NeighborAdvertMinimumSize + len(test.optsBuf)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize)
+			pkt := header.ICMPv6(hdr.Prepend(ndpNASize))
+			pkt.SetType(header.ICMPv6NeighborAdvert)
+			ns := header.NDPNeighborAdvert(pkt.NDPPayload())
+			ns.SetTargetAddress(lladdr1)
+			opts := ns.Options()
+			copy(opts, test.optsBuf)
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      255,
+				SrcAddr:       lladdr1,
+				DstAddr:       lladdr0,
+			})
+
+			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+
+			// Invalid count should initially be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
 			}
-			if c == nil {
-				t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber)
+
+			e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil)
+			if linkAddr != test.expectedLinkAddr {
+				t.Errorf("got link address = %s, want = %s", linkAddr, test.expectedLinkAddr)
 			}
-			if linkAddr != "" {
-				t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (%s, _, ), want = ('', _, _)", nicID, lladdr1, lladdr0, ProtocolNumber, linkAddr)
+
+			if test.expectedLinkAddr != "" {
+				if err != nil {
+					t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err)
+				}
+				if c != nil {
+					t.Errorf("got unexpected channel")
+				}
+
+				// Invalid count should not have increased.
+				if got := invalid.Value(); got != 0 {
+					t.Errorf("got invalid = %d, want = 0", got)
+				}
+			} else {
+				if err != tcpip.ErrWouldBlock {
+					t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock)
+				}
+				if c == nil {
+					t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber)
+				}
+
+				// Invalid count should have increased.
+				if got := invalid.Value(); got != 1 {
+					t.Errorf("got invalid = %d, want = 1", got)
+				}
 			}
 		})
 	}
@@ -238,27 +309,59 @@ func TestHopLimitValidation(t *testing.T) {
 		})
 	}
 
+	var tllData [header.NDPLinkLayerAddressSize]byte
+	header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
+		header.NDPTargetLinkLayerAddressOption(linkAddr1),
+	})
+
 	types := []struct {
 		name        string
 		typ         header.ICMPv6Type
 		size        int
+		extraData   []byte
 		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
 	}{
-		{"RouterSolicit", header.ICMPv6RouterSolicit, header.ICMPv6MinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-			return stats.RouterSolicit
-		}},
-		{"RouterAdvert", header.ICMPv6RouterAdvert, header.ICMPv6HeaderSize + header.NDPRAMinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-			return stats.RouterAdvert
-		}},
-		{"NeighborSolicit", header.ICMPv6NeighborSolicit, header.ICMPv6NeighborSolicitMinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-			return stats.NeighborSolicit
-		}},
-		{"NeighborAdvert", header.ICMPv6NeighborAdvert, header.ICMPv6NeighborAdvertSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-			return stats.NeighborAdvert
-		}},
-		{"RedirectMsg", header.ICMPv6RedirectMsg, header.ICMPv6MinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-			return stats.RedirectMsg
-		}},
+		{
+			name: "RouterSolicit",
+			typ:  header.ICMPv6RouterSolicit,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RouterSolicit
+			},
+		},
+		{
+			name: "RouterAdvert",
+			typ:  header.ICMPv6RouterAdvert,
+			size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RouterAdvert
+			},
+		},
+		{
+			name: "NeighborSolicit",
+			typ:  header.ICMPv6NeighborSolicit,
+			size: header.ICMPv6NeighborSolicitMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.NeighborSolicit
+			},
+		},
+		{
+			name:      "NeighborAdvert",
+			typ:       header.ICMPv6NeighborAdvert,
+			size:      header.ICMPv6NeighborAdvertMinimumSize,
+			extraData: tllData[:],
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.NeighborAdvert
+			},
+		},
+		{
+			name: "RedirectMsg",
+			typ:  header.ICMPv6RedirectMsg,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RedirectMsg
+			},
+		},
 	}
 
 	for _, typ := range types {
@@ -270,10 +373,13 @@ func TestHopLimitValidation(t *testing.T) {
 			invalid := stats.Invalid
 			typStat := typ.statCounter(stats)
 
-			hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size)
+			extraDataLen := len(typ.extraData)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen)
+			extraData := buffer.View(hdr.Prepend(extraDataLen))
+			copy(extraData, typ.extraData)
 			pkt := header.ICMPv6(hdr.Prepend(typ.size))
 			pkt.SetType(typ.typ)
-			pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView()))
 
 			// Invalid count should initially be 0.
 			if got := invalid.Value(); got != 0 {
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 8af8565f7..9a4607dcb 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -478,13 +478,17 @@ func TestDADFail(t *testing.T) {
 		{
 			"RxAdvert",
 			func(tgt tcpip.Address) buffer.Prependable {
-				hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize)
-				pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
+				naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + naSize)
+				pkt := header.ICMPv6(hdr.Prepend(naSize))
 				pkt.SetType(header.ICMPv6NeighborAdvert)
 				na := header.NDPNeighborAdvert(pkt.NDPPayload())
 				na.SetSolicitedFlag(true)
 				na.SetOverrideFlag(true)
 				na.SetTargetAddress(tgt)
+				na.Options().Serialize(header.NDPOptionsSerializer{
+					header.NDPTargetLinkLayerAddressOption(linkAddr1),
+				})
 				pkt.SetChecksum(header.ICMPv6Checksum(pkt, tgt, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
 				payloadLength := hdr.UsedLength()
 				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
-- 
cgit v1.2.3


From 615d66111214f5ae9b41fb2a89bb3549c03fc7af Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Thu, 6 Feb 2020 14:01:45 -0800
Subject: runsc/container_test: hide host /etc in test containers

The host /etc can contain config files which affect tests.

For example, bash reads /etc/passwd and if it is too big
a test can fail by timeout.

PiperOrigin-RevId: 293670637
---
 runsc/testutil/testutil.go | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index 80c2c9680..92d677e71 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -119,6 +119,13 @@ func NewSpecWithArgs(args ...string) *specs.Spec {
 			Capabilities: specutils.AllCapabilities(),
 		},
 		Mounts: []specs.Mount{
+			// Hide the host /etc to avoid any side-effects.
+			// For example, bash reads /etc/passwd and if it is
+			// very big, tests can fail by timeout.
+			{
+				Type:        "tmpfs",
+				Destination: "/etc",
+			},
 			// Root is readonly, but many tests want to write to tmpdir.
 			// This creates a writable mount inside the root. Also, when tmpdir points
 			// to "/tmp", it makes the the actual /tmp to be mounted and not a tmpfs
-- 
cgit v1.2.3


From 736775e0ac592c266acac0a7415f21d54715a54c Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Thu, 6 Feb 2020 14:03:49 -0800
Subject: Make gonet consistent both internally and with the net package.

The types gonet.Conn and gonet.PacketConn were confusingly named as both
implemented net.Conn. Further, gonet.Conn was perhaps unexpectedly
TCP-specific (net.Conn is not). This change renames them to gonet.TCPConn and
gonet.UDPConn.

Renames gonet.NewListener to gonet.ListenTCP and adds a new gonet.NewTCPListner
function to be consistent with both the gonet.DialXxx and gonet.NewXxxConn
functions as well as net.ListenTCP.

Updates #1632

PiperOrigin-RevId: 293671303
---
 benchmarks/tcp/tcp_proxy.go            |   2 +-
 pkg/tcpip/adapters/gonet/gonet.go      | 111 +++++++++++++++++----------------
 pkg/tcpip/adapters/gonet/gonet_test.go |  22 +++----
 3 files changed, 70 insertions(+), 65 deletions(-)

diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go
index 72ada5700..73b7c4f5b 100644
--- a/benchmarks/tcp/tcp_proxy.go
+++ b/benchmarks/tcp/tcp_proxy.go
@@ -274,7 +274,7 @@ func (n netstackImpl) listen(port int) (net.Listener, error) {
 		NIC:  nicID,
 		Port: uint16(port),
 	}
-	listener, err := gonet.NewListener(n.s, addr, ipv4.ProtocolNumber)
+	listener, err := gonet.ListenTCP(n.s, addr, ipv4.ProtocolNumber)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go
index 711969b9b..6e0db2741 100644
--- a/pkg/tcpip/adapters/gonet/gonet.go
+++ b/pkg/tcpip/adapters/gonet/gonet.go
@@ -43,18 +43,28 @@ func (e *timeoutError) Error() string   { return "i/o timeout" }
 func (e *timeoutError) Timeout() bool   { return true }
 func (e *timeoutError) Temporary() bool { return true }
 
-// A Listener is a wrapper around a tcpip endpoint that implements
+// A TCPListener is a wrapper around a TCP tcpip.Endpoint that implements
 // net.Listener.
-type Listener struct {
+type TCPListener struct {
 	stack  *stack.Stack
 	ep     tcpip.Endpoint
 	wq     *waiter.Queue
 	cancel chan struct{}
 }
 
-// NewListener creates a new Listener.
-func NewListener(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*Listener, error) {
-	// Create TCP endpoint, bind it, then start listening.
+// NewTCPListener creates a new TCPListener from a listening tcpip.Endpoint.
+func NewTCPListener(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *TCPListener {
+	return &TCPListener{
+		stack:  s,
+		ep:     ep,
+		wq:     wq,
+		cancel: make(chan struct{}),
+	}
+}
+
+// ListenTCP creates a new TCPListener.
+func ListenTCP(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPListener, error) {
+	// Create a TCP endpoint, bind it, then start listening.
 	var wq waiter.Queue
 	ep, err := s.NewEndpoint(tcp.ProtocolNumber, network, &wq)
 	if err != nil {
@@ -81,28 +91,23 @@ func NewListener(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkPr
 		}
 	}
 
-	return &Listener{
-		stack:  s,
-		ep:     ep,
-		wq:     &wq,
-		cancel: make(chan struct{}),
-	}, nil
+	return NewTCPListener(s, &wq, ep), nil
 }
 
 // Close implements net.Listener.Close.
-func (l *Listener) Close() error {
+func (l *TCPListener) Close() error {
 	l.ep.Close()
 	return nil
 }
 
 // Shutdown stops the HTTP server.
-func (l *Listener) Shutdown() {
+func (l *TCPListener) Shutdown() {
 	l.ep.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
 	close(l.cancel) // broadcast cancellation
 }
 
 // Addr implements net.Listener.Addr.
-func (l *Listener) Addr() net.Addr {
+func (l *TCPListener) Addr() net.Addr {
 	a, err := l.ep.GetLocalAddress()
 	if err != nil {
 		return nil
@@ -208,9 +213,9 @@ func (d *deadlineTimer) SetDeadline(t time.Time) error {
 	return nil
 }
 
-// A Conn is a wrapper around a tcpip.Endpoint that implements the net.Conn
+// A TCPConn is a wrapper around a TCP tcpip.Endpoint that implements the net.Conn
 // interface.
-type Conn struct {
+type TCPConn struct {
 	deadlineTimer
 
 	wq *waiter.Queue
@@ -228,9 +233,9 @@ type Conn struct {
 	read buffer.View
 }
 
-// NewConn creates a new Conn.
-func NewConn(wq *waiter.Queue, ep tcpip.Endpoint) *Conn {
-	c := &Conn{
+// NewTCPConn creates a new TCPConn.
+func NewTCPConn(wq *waiter.Queue, ep tcpip.Endpoint) *TCPConn {
+	c := &TCPConn{
 		wq: wq,
 		ep: ep,
 	}
@@ -239,7 +244,7 @@ func NewConn(wq *waiter.Queue, ep tcpip.Endpoint) *Conn {
 }
 
 // Accept implements net.Conn.Accept.
-func (l *Listener) Accept() (net.Conn, error) {
+func (l *TCPListener) Accept() (net.Conn, error) {
 	n, wq, err := l.ep.Accept()
 
 	if err == tcpip.ErrWouldBlock {
@@ -272,7 +277,7 @@ func (l *Listener) Accept() (net.Conn, error) {
 		}
 	}
 
-	return NewConn(wq, n), nil
+	return NewTCPConn(wq, n), nil
 }
 
 type opErrorer interface {
@@ -323,7 +328,7 @@ func commonRead(ep tcpip.Endpoint, wq *waiter.Queue, deadline <-chan struct{}, a
 }
 
 // Read implements net.Conn.Read.
-func (c *Conn) Read(b []byte) (int, error) {
+func (c *TCPConn) Read(b []byte) (int, error) {
 	c.readMu.Lock()
 	defer c.readMu.Unlock()
 
@@ -352,7 +357,7 @@ func (c *Conn) Read(b []byte) (int, error) {
 }
 
 // Write implements net.Conn.Write.
-func (c *Conn) Write(b []byte) (int, error) {
+func (c *TCPConn) Write(b []byte) (int, error) {
 	deadline := c.writeCancel()
 
 	// Check if deadlineTimer has already expired.
@@ -431,7 +436,7 @@ func (c *Conn) Write(b []byte) (int, error) {
 }
 
 // Close implements net.Conn.Close.
-func (c *Conn) Close() error {
+func (c *TCPConn) Close() error {
 	c.ep.Close()
 	return nil
 }
@@ -440,7 +445,7 @@ func (c *Conn) Close() error {
 // should just use Close.
 //
 // A TCP Half-Close is performed the same as CloseRead for *net.TCPConn.
-func (c *Conn) CloseRead() error {
+func (c *TCPConn) CloseRead() error {
 	if terr := c.ep.Shutdown(tcpip.ShutdownRead); terr != nil {
 		return c.newOpError("close", errors.New(terr.String()))
 	}
@@ -451,7 +456,7 @@ func (c *Conn) CloseRead() error {
 // should just use Close.
 //
 // A TCP Half-Close is performed the same as CloseWrite for *net.TCPConn.
-func (c *Conn) CloseWrite() error {
+func (c *TCPConn) CloseWrite() error {
 	if terr := c.ep.Shutdown(tcpip.ShutdownWrite); terr != nil {
 		return c.newOpError("close", errors.New(terr.String()))
 	}
@@ -459,7 +464,7 @@ func (c *Conn) CloseWrite() error {
 }
 
 // LocalAddr implements net.Conn.LocalAddr.
-func (c *Conn) LocalAddr() net.Addr {
+func (c *TCPConn) LocalAddr() net.Addr {
 	a, err := c.ep.GetLocalAddress()
 	if err != nil {
 		return nil
@@ -468,7 +473,7 @@ func (c *Conn) LocalAddr() net.Addr {
 }
 
 // RemoteAddr implements net.Conn.RemoteAddr.
-func (c *Conn) RemoteAddr() net.Addr {
+func (c *TCPConn) RemoteAddr() net.Addr {
 	a, err := c.ep.GetRemoteAddress()
 	if err != nil {
 		return nil
@@ -476,7 +481,7 @@ func (c *Conn) RemoteAddr() net.Addr {
 	return fullToTCPAddr(a)
 }
 
-func (c *Conn) newOpError(op string, err error) *net.OpError {
+func (c *TCPConn) newOpError(op string, err error) *net.OpError {
 	return &net.OpError{
 		Op:     op,
 		Net:    "tcp",
@@ -494,14 +499,14 @@ func fullToUDPAddr(addr tcpip.FullAddress) *net.UDPAddr {
 	return &net.UDPAddr{IP: net.IP(addr.Addr), Port: int(addr.Port)}
 }
 
-// DialTCP creates a new TCP Conn connected to the specified address.
-func DialTCP(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*Conn, error) {
+// DialTCP creates a new TCPConn connected to the specified address.
+func DialTCP(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPConn, error) {
 	return DialContextTCP(context.Background(), s, addr, network)
 }
 
-// DialContextTCP creates a new TCP Conn connected to the specified address
+// DialContextTCP creates a new TCPConn connected to the specified address
 // with the option of adding cancellation and timeouts.
-func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*Conn, error) {
+func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPConn, error) {
 	// Create TCP endpoint, then connect.
 	var wq waiter.Queue
 	ep, err := s.NewEndpoint(tcp.ProtocolNumber, network, &wq)
@@ -543,12 +548,12 @@ func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress,
 		}
 	}
 
-	return NewConn(&wq, ep), nil
+	return NewTCPConn(&wq, ep), nil
 }
 
-// A PacketConn is a wrapper around a tcpip endpoint that implements
-// net.PacketConn.
-type PacketConn struct {
+// A UDPConn is a wrapper around a UDP tcpip.Endpoint that implements
+// net.Conn and net.PacketConn.
+type UDPConn struct {
 	deadlineTimer
 
 	stack *stack.Stack
@@ -556,9 +561,9 @@ type PacketConn struct {
 	wq    *waiter.Queue
 }
 
-// NewPacketConn creates a new PacketConn.
-func NewPacketConn(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *PacketConn {
-	c := &PacketConn{
+// NewUDPConn creates a new UDPConn.
+func NewUDPConn(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *UDPConn {
+	c := &UDPConn{
 		stack: s,
 		ep:    ep,
 		wq:    wq,
@@ -567,12 +572,12 @@ func NewPacketConn(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *PacketC
 	return c
 }
 
-// DialUDP creates a new PacketConn.
+// DialUDP creates a new UDPConn.
 //
 // If laddr is nil, a local address is automatically chosen.
 //
-// If raddr is nil, the PacketConn is left unconnected.
-func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*PacketConn, error) {
+// If raddr is nil, the UDPConn is left unconnected.
+func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*UDPConn, error) {
 	var wq waiter.Queue
 	ep, err := s.NewEndpoint(udp.ProtocolNumber, network, &wq)
 	if err != nil {
@@ -591,7 +596,7 @@ func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.Netw
 		}
 	}
 
-	c := NewPacketConn(s, &wq, ep)
+	c := NewUDPConn(s, &wq, ep)
 
 	if raddr != nil {
 		if err := c.ep.Connect(*raddr); err != nil {
@@ -608,11 +613,11 @@ func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.Netw
 	return c, nil
 }
 
-func (c *PacketConn) newOpError(op string, err error) *net.OpError {
+func (c *UDPConn) newOpError(op string, err error) *net.OpError {
 	return c.newRemoteOpError(op, nil, err)
 }
 
-func (c *PacketConn) newRemoteOpError(op string, remote net.Addr, err error) *net.OpError {
+func (c *UDPConn) newRemoteOpError(op string, remote net.Addr, err error) *net.OpError {
 	return &net.OpError{
 		Op:     op,
 		Net:    "udp",
@@ -623,7 +628,7 @@ func (c *PacketConn) newRemoteOpError(op string, remote net.Addr, err error) *ne
 }
 
 // RemoteAddr implements net.Conn.RemoteAddr.
-func (c *PacketConn) RemoteAddr() net.Addr {
+func (c *UDPConn) RemoteAddr() net.Addr {
 	a, err := c.ep.GetRemoteAddress()
 	if err != nil {
 		return nil
@@ -632,13 +637,13 @@ func (c *PacketConn) RemoteAddr() net.Addr {
 }
 
 // Read implements net.Conn.Read
-func (c *PacketConn) Read(b []byte) (int, error) {
+func (c *UDPConn) Read(b []byte) (int, error) {
 	bytesRead, _, err := c.ReadFrom(b)
 	return bytesRead, err
 }
 
 // ReadFrom implements net.PacketConn.ReadFrom.
-func (c *PacketConn) ReadFrom(b []byte) (int, net.Addr, error) {
+func (c *UDPConn) ReadFrom(b []byte) (int, net.Addr, error) {
 	deadline := c.readCancel()
 
 	var addr tcpip.FullAddress
@@ -650,12 +655,12 @@ func (c *PacketConn) ReadFrom(b []byte) (int, net.Addr, error) {
 	return copy(b, read), fullToUDPAddr(addr), nil
 }
 
-func (c *PacketConn) Write(b []byte) (int, error) {
+func (c *UDPConn) Write(b []byte) (int, error) {
 	return c.WriteTo(b, nil)
 }
 
 // WriteTo implements net.PacketConn.WriteTo.
-func (c *PacketConn) WriteTo(b []byte, addr net.Addr) (int, error) {
+func (c *UDPConn) WriteTo(b []byte, addr net.Addr) (int, error) {
 	deadline := c.writeCancel()
 
 	// Check if deadline has already expired.
@@ -713,13 +718,13 @@ func (c *PacketConn) WriteTo(b []byte, addr net.Addr) (int, error) {
 }
 
 // Close implements net.PacketConn.Close.
-func (c *PacketConn) Close() error {
+func (c *UDPConn) Close() error {
 	c.ep.Close()
 	return nil
 }
 
 // LocalAddr implements net.PacketConn.LocalAddr.
-func (c *PacketConn) LocalAddr() net.Addr {
+func (c *UDPConn) LocalAddr() net.Addr {
 	a, err := c.ep.GetLocalAddress()
 	if err != nil {
 		return nil
diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go
index ee077ae83..ea0a0409a 100644
--- a/pkg/tcpip/adapters/gonet/gonet_test.go
+++ b/pkg/tcpip/adapters/gonet/gonet_test.go
@@ -41,7 +41,7 @@ const (
 )
 
 func TestTimeouts(t *testing.T) {
-	nc := NewConn(nil, nil)
+	nc := NewTCPConn(nil, nil)
 	dlfs := []struct {
 		name string
 		f    func(time.Time) error
@@ -132,7 +132,7 @@ func TestCloseReader(t *testing.T) {
 
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
 
-	l, e := NewListener(s, addr, ipv4.ProtocolNumber)
+	l, e := ListenTCP(s, addr, ipv4.ProtocolNumber)
 	if e != nil {
 		t.Fatalf("NewListener() = %v", e)
 	}
@@ -168,7 +168,7 @@ func TestCloseReader(t *testing.T) {
 	sender.close()
 }
 
-// TestCloseReaderWithForwarder tests that Conn.Close() wakes Conn.Read() when
+// TestCloseReaderWithForwarder tests that TCPConn.Close wakes TCPConn.Read when
 // using tcp.Forwarder.
 func TestCloseReaderWithForwarder(t *testing.T) {
 	s, err := newLoopbackStack()
@@ -192,7 +192,7 @@ func TestCloseReaderWithForwarder(t *testing.T) {
 		defer ep.Close()
 		r.Complete(false)
 
-		c := NewConn(&wq, ep)
+		c := NewTCPConn(&wq, ep)
 
 		// Give c.Read() a chance to block before closing the connection.
 		time.AfterFunc(time.Millisecond*50, func() {
@@ -238,7 +238,7 @@ func TestCloseRead(t *testing.T) {
 		defer ep.Close()
 		r.Complete(false)
 
-		c := NewConn(&wq, ep)
+		c := NewTCPConn(&wq, ep)
 
 		buf := make([]byte, 256)
 		n, e := c.Read(buf)
@@ -257,7 +257,7 @@ func TestCloseRead(t *testing.T) {
 	if terr != nil {
 		t.Fatalf("connect() = %v", terr)
 	}
-	c := NewConn(tc.wq, tc.ep)
+	c := NewTCPConn(tc.wq, tc.ep)
 
 	if err := c.CloseRead(); err != nil {
 		t.Errorf("c.CloseRead() = %v", err)
@@ -291,7 +291,7 @@ func TestCloseWrite(t *testing.T) {
 		defer ep.Close()
 		r.Complete(false)
 
-		c := NewConn(&wq, ep)
+		c := NewTCPConn(&wq, ep)
 
 		n, e := c.Read(make([]byte, 256))
 		if n != 0 || e != io.EOF {
@@ -309,7 +309,7 @@ func TestCloseWrite(t *testing.T) {
 	if terr != nil {
 		t.Fatalf("connect() = %v", terr)
 	}
-	c := NewConn(tc.wq, tc.ep)
+	c := NewTCPConn(tc.wq, tc.ep)
 
 	if err := c.CloseWrite(); err != nil {
 		t.Errorf("c.CloseWrite() = %v", err)
@@ -353,7 +353,7 @@ func TestUDPForwarder(t *testing.T) {
 		}
 		defer ep.Close()
 
-		c := NewConn(&wq, ep)
+		c := NewTCPConn(&wq, ep)
 
 		buf := make([]byte, 256)
 		n, e := c.Read(buf)
@@ -396,7 +396,7 @@ func TestDeadlineChange(t *testing.T) {
 
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
 
-	l, e := NewListener(s, addr, ipv4.ProtocolNumber)
+	l, e := ListenTCP(s, addr, ipv4.ProtocolNumber)
 	if e != nil {
 		t.Fatalf("NewListener() = %v", e)
 	}
@@ -541,7 +541,7 @@ func makePipe() (c1, c2 net.Conn, stop func(), err error) {
 	addr := tcpip.FullAddress{NICID, ip, 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, ip)
 
-	l, err := NewListener(s, addr, ipv4.ProtocolNumber)
+	l, err := ListenTCP(s, addr, ipv4.ProtocolNumber)
 	if err != nil {
 		return nil, nil, nil, fmt.Errorf("NewListener: %v", err)
 	}
-- 
cgit v1.2.3


From bfa4a235f401599492a2cf39471df62715f9f1cf Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 6 Feb 2020 14:26:41 -0800
Subject: Fix `bazel run` target in docs.

PiperOrigin-RevId: 293676954
---
 test/iptables/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/iptables/README.md b/test/iptables/README.md
index 8f61b4c41..c2b934e1f 100644
--- a/test/iptables/README.md
+++ b/test/iptables/README.md
@@ -28,7 +28,7 @@ Your test is now runnable with bazel!
 Build the testing Docker container:
 
 ```bash
-$ bazel run //test/iptables/runner-image -- --norun
+$ bazel run //test/iptables/runner:runner-image -- --norun
 ```
 
 Run an individual test via:
-- 
cgit v1.2.3


From 940d255971c38af9f91ceed1345fd973f8fdb41d Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 6 Feb 2020 15:57:34 -0800
Subject: Perform DAD on IPv6 addresses when enabling a NIC

Addresses may be added before a NIC is enabled. Make sure DAD is
performed on the permanent IPv6 addresses when they get enabled.

Test:
- stack_test.TestDoDADWhenNICEnabled
- stack.TestDisabledRxStatsWhenNICDisabled
PiperOrigin-RevId: 293697429
---
 pkg/tcpip/stack/BUILD         |   6 ++-
 pkg/tcpip/stack/ndp_test.go   |  74 +++++++++++++--------------
 pkg/tcpip/stack/nic.go        |  84 ++++++++++++++++++++++--------
 pkg/tcpip/stack/nic_test.go   |  62 +++++++++++++++++++++++
 pkg/tcpip/stack/stack_test.go | 115 ++++++++++++++++++++++++++++++++++++++++++
 pkg/tcpip/tcpip.go            |   8 +--
 6 files changed, 287 insertions(+), 62 deletions(-)
 create mode 100644 pkg/tcpip/stack/nic_test.go

diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index f5b750046..705cf01ee 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -78,11 +78,15 @@ go_test(
 go_test(
     name = "stack_test",
     size = "small",
-    srcs = ["linkaddrcache_test.go"],
+    srcs = [
+        "linkaddrcache_test.go",
+        "nic_test.go",
+    ],
     library = ":stack",
     deps = [
         "//pkg/sleep",
         "//pkg/sync",
         "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
     ],
 )
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 9a4607dcb..1e575bdaf 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -1539,7 +1539,7 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
 }
 
 // Checks to see if list contains an IPv6 address, item.
-func contains(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool {
+func containsV6Addr(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool {
 	protocolAddress := tcpip.ProtocolAddress{
 		Protocol:          header.IPv6ProtocolNumber,
 		AddressWithPrefix: item,
@@ -1665,7 +1665,7 @@ func TestAutoGenAddr(t *testing.T) {
 	// with non-zero lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
 	expectAutoGenAddrEvent(addr1, newAddr)
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
 	}
 
@@ -1681,10 +1681,10 @@ func TestAutoGenAddr(t *testing.T) {
 	// Receive an RA with prefix2 in a PI.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
 	expectAutoGenAddrEvent(addr2, newAddr)
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr2) {
 		t.Fatalf("Should have %s in the list of addresses", addr2)
 	}
 
@@ -1705,10 +1705,10 @@ func TestAutoGenAddr(t *testing.T) {
 	case <-time.After(newMinVLDuration + defaultAsyncEventTimeout):
 		t.Fatal("timed out waiting for addr auto gen event")
 	}
-	if contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+	if containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) {
 		t.Fatalf("Should not have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr2) {
 		t.Fatalf("Should have %s in the list of addresses", addr2)
 	}
 }
@@ -1853,7 +1853,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
 	// Receive PI for prefix1.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
 	expectAutoGenAddrEvent(addr1, newAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should have %s in the list of addresses", addr1)
 	}
 	expectPrimaryAddr(addr1)
@@ -1861,7 +1861,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
 	// Deprecate addr for prefix1 immedaitely.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
 	expectAutoGenAddrEvent(addr1, deprecatedAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should have %s in the list of addresses", addr1)
 	}
 	// addr should still be the primary endpoint as there are no other addresses.
@@ -1879,7 +1879,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
 	// Receive PI for prefix2.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
 	expectAutoGenAddrEvent(addr2, newAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	expectPrimaryAddr(addr2)
@@ -1887,7 +1887,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
 	// Deprecate addr for prefix2 immedaitely.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
 	expectAutoGenAddrEvent(addr2, deprecatedAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	// addr1 should be the primary endpoint now since addr2 is deprecated but
@@ -1982,7 +1982,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	// Receive PI for prefix2.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
 	expectAutoGenAddrEvent(addr2, newAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	expectPrimaryAddr(addr2)
@@ -1990,10 +1990,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	// Receive a PI for prefix1.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 90))
 	expectAutoGenAddrEvent(addr1, newAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	expectPrimaryAddr(addr1)
@@ -2009,10 +2009,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 
 	// Wait for addr of prefix1 to be deprecated.
 	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncEventTimeout)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should not have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	// addr2 should be the primary endpoint now since addr1 is deprecated but
@@ -2049,10 +2049,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 
 	// Wait for addr of prefix1 to be deprecated.
 	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncEventTimeout)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should not have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	// addr2 should be the primary endpoint now since it is not deprecated.
@@ -2063,10 +2063,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 
 	// Wait for addr of prefix1 to be invalidated.
 	expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncEventTimeout)
-	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should not have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	expectPrimaryAddr(addr2)
@@ -2112,10 +2112,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	case <-time.After(newMinVLDuration + defaultAsyncEventTimeout):
 		t.Fatal("timed out waiting for addr auto gen event")
 	}
-	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should not have %s in the list of addresses", addr1)
 	}
-	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should not have %s in the list of addresses", addr2)
 	}
 	// Should not have any primary endpoints.
@@ -2600,7 +2600,7 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 	if err := s.AddProtocolAddress(1, tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: addr}); err != nil {
 		t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr.Address, err)
 	}
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
 	}
 
@@ -2613,7 +2613,7 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 		t.Fatal("unexpectedly received an auto gen addr event for an address we already have statically")
 	default:
 	}
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
 	}
 
@@ -2624,7 +2624,7 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 		t.Fatal("unexpectedly received an auto gen addr event")
 	case <-time.After(lifetimeSeconds*time.Second + defaultTimeout):
 	}
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
 	}
 }
@@ -2702,17 +2702,17 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
 	const validLifetimeSecondPrefix1 = 1
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, validLifetimeSecondPrefix1, 0))
 	expectAutoGenAddrEvent(addr1, newAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should have %s in the list of addresses", addr1)
 	}
 
 	// Receive an RA with prefix2 in a PI with a large valid lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
 	expectAutoGenAddrEvent(addr2, newAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 
@@ -2725,10 +2725,10 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
 	case <-time.After(validLifetimeSecondPrefix1*time.Second + defaultAsyncEventTimeout):
 		t.Fatal("timed out waiting for addr auto gen event")
 	}
-	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should not have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 }
@@ -3014,16 +3014,16 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	nicinfo := s.NICInfo()
 	nic1Addrs := nicinfo[nicID1].ProtocolAddresses
 	nic2Addrs := nicinfo[nicID2].ProtocolAddresses
-	if !contains(nic1Addrs, e1Addr1) {
+	if !containsV6Addr(nic1Addrs, e1Addr1) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
 	}
-	if !contains(nic1Addrs, e1Addr2) {
+	if !containsV6Addr(nic1Addrs, e1Addr2) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
 	}
-	if !contains(nic2Addrs, e2Addr1) {
+	if !containsV6Addr(nic2Addrs, e2Addr1) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
 	}
-	if !contains(nic2Addrs, e2Addr2) {
+	if !containsV6Addr(nic2Addrs, e2Addr2) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
 	}
 
@@ -3102,16 +3102,16 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	nicinfo = s.NICInfo()
 	nic1Addrs = nicinfo[nicID1].ProtocolAddresses
 	nic2Addrs = nicinfo[nicID2].ProtocolAddresses
-	if contains(nic1Addrs, e1Addr1) {
+	if containsV6Addr(nic1Addrs, e1Addr1) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
 	}
-	if contains(nic1Addrs, e1Addr2) {
+	if containsV6Addr(nic1Addrs, e1Addr2) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
 	}
-	if contains(nic2Addrs, e2Addr1) {
+	if containsV6Addr(nic2Addrs, e2Addr1) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
 	}
-	if contains(nic2Addrs, e2Addr2) {
+	if containsV6Addr(nic2Addrs, e2Addr2) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
 	}
 
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 7dad9a8cb..682e9c416 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -16,6 +16,7 @@ package stack
 
 import (
 	"log"
+	"reflect"
 	"sort"
 	"strings"
 	"sync/atomic"
@@ -39,6 +40,7 @@ type NIC struct {
 
 	mu struct {
 		sync.RWMutex
+		enabled       bool
 		spoofing      bool
 		promiscuous   bool
 		primary       map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint
@@ -56,6 +58,14 @@ type NIC struct {
 type NICStats struct {
 	Tx DirectionStats
 	Rx DirectionStats
+
+	DisabledRx DirectionStats
+}
+
+func makeNICStats() NICStats {
+	var s NICStats
+	tcpip.InitStatCounters(reflect.ValueOf(&s).Elem())
+	return s
 }
 
 // DirectionStats includes packet and byte counts.
@@ -99,16 +109,7 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 		name:    name,
 		linkEP:  ep,
 		context: ctx,
-		stats: NICStats{
-			Tx: DirectionStats{
-				Packets: &tcpip.StatCounter{},
-				Bytes:   &tcpip.StatCounter{},
-			},
-			Rx: DirectionStats{
-				Packets: &tcpip.StatCounter{},
-				Bytes:   &tcpip.StatCounter{},
-			},
-		},
+		stats:   makeNICStats(),
 	}
 	nic.mu.primary = make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint)
 	nic.mu.endpoints = make(map[NetworkEndpointID]*referencedNetworkEndpoint)
@@ -137,14 +138,30 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 // enable enables the NIC. enable will attach the link to its LinkEndpoint and
 // join the IPv6 All-Nodes Multicast address (ff02::1).
 func (n *NIC) enable() *tcpip.Error {
+	n.mu.RLock()
+	enabled := n.mu.enabled
+	n.mu.RUnlock()
+	if enabled {
+		return nil
+	}
+
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	if n.mu.enabled {
+		return nil
+	}
+
+	n.mu.enabled = true
+
 	n.attachLinkEndpoint()
 
 	// Create an endpoint to receive broadcast packets on this interface.
 	if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok {
-		if err := n.AddAddress(tcpip.ProtocolAddress{
+		if _, err := n.addAddressLocked(tcpip.ProtocolAddress{
 			Protocol:          header.IPv4ProtocolNumber,
 			AddressWithPrefix: tcpip.AddressWithPrefix{header.IPv4Broadcast, 8 * header.IPv4AddressSize},
-		}, NeverPrimaryEndpoint); err != nil {
+		}, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
 			return err
 		}
 	}
@@ -166,8 +183,22 @@ func (n *NIC) enable() *tcpip.Error {
 		return nil
 	}
 
-	n.mu.Lock()
-	defer n.mu.Unlock()
+	// Perform DAD on the all the unicast IPv6 endpoints that are in the permanent
+	// state.
+	//
+	// Addresses may have aleady completed DAD but in the time since the NIC was
+	// last enabled, other devices may have acquired the same addresses.
+	for _, r := range n.mu.endpoints {
+		addr := r.ep.ID().LocalAddress
+		if k := r.getKind(); (k != permanent && k != permanentTentative) || !header.IsV6UnicastAddress(addr) {
+			continue
+		}
+
+		r.setKind(permanentTentative)
+		if err := n.mu.ndp.startDuplicateAddressDetection(addr, r); err != nil {
+			return err
+		}
+	}
 
 	if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil {
 		return err
@@ -633,7 +664,9 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 	isIPv6Unicast := protocolAddress.Protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(protocolAddress.AddressWithPrefix.Address)
 
 	// If the address is an IPv6 address and it is a permanent address,
-	// mark it as tentative so it goes through the DAD process.
+	// mark it as tentative so it goes through the DAD process if the NIC is
+	// enabled. If the NIC is not enabled, DAD will be started when the NIC is
+	// enabled.
 	if isIPv6Unicast && kind == permanent {
 		kind = permanentTentative
 	}
@@ -668,8 +701,8 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 
 	n.insertPrimaryEndpointLocked(ref, peb)
 
-	// If we are adding a tentative IPv6 address, start DAD.
-	if isIPv6Unicast && kind == permanentTentative {
+	// If we are adding a tentative IPv6 address, start DAD if the NIC is enabled.
+	if isIPv6Unicast && kind == permanentTentative && n.mu.enabled {
 		if err := n.mu.ndp.startDuplicateAddressDetection(protocolAddress.AddressWithPrefix.Address, ref); err != nil {
 			return nil, err
 		}
@@ -700,9 +733,7 @@ func (n *NIC) AllAddresses() []tcpip.ProtocolAddress {
 		// Don't include tentative, expired or temporary endpoints to
 		// avoid confusion and prevent the caller from using those.
 		switch ref.getKind() {
-		case permanentTentative, permanentExpired, temporary:
-			// TODO(b/140898488): Should tentative addresses be
-			//                    returned?
+		case permanentExpired, temporary:
 			continue
 		}
 		addrs = append(addrs, tcpip.ProtocolAddress{
@@ -1016,11 +1047,23 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address,
 // This rule applies only to the slice itself, not to the items of the slice;
 // the ownership of the items is not retained by the caller.
 func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+	n.mu.RLock()
+	enabled := n.mu.enabled
+	// If the NIC is not yet enabled, don't receive any packets.
+	if !enabled {
+		n.mu.RUnlock()
+
+		n.stats.DisabledRx.Packets.Increment()
+		n.stats.DisabledRx.Bytes.IncrementBy(uint64(pkt.Data.Size()))
+		return
+	}
+
 	n.stats.Rx.Packets.Increment()
 	n.stats.Rx.Bytes.IncrementBy(uint64(pkt.Data.Size()))
 
 	netProto, ok := n.stack.networkProtocols[protocol]
 	if !ok {
+		n.mu.RUnlock()
 		n.stack.stats.UnknownProtocolRcvdPackets.Increment()
 		return
 	}
@@ -1032,7 +1075,6 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	}
 
 	// Are any packet sockets listening for this network protocol?
-	n.mu.RLock()
 	packetEPs := n.mu.packetEPs[protocol]
 	// Check whether there are packet sockets listening for every protocol.
 	// If we received a packet with protocol EthernetProtocolAll, then the
diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go
new file mode 100644
index 000000000..edaee3b86
--- /dev/null
+++ b/pkg/tcpip/stack/nic_test.go
@@ -0,0 +1,62 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+func TestDisabledRxStatsWhenNICDisabled(t *testing.T) {
+	// When the NIC is disabled, the only field that matters is the stats field.
+	// This test is limited to stats counter checks.
+	nic := NIC{
+		stats: makeNICStats(),
+	}
+
+	if got := nic.stats.DisabledRx.Packets.Value(); got != 0 {
+		t.Errorf("got DisabledRx.Packets = %d, want = 0", got)
+	}
+	if got := nic.stats.DisabledRx.Bytes.Value(); got != 0 {
+		t.Errorf("got DisabledRx.Bytes = %d, want = 0", got)
+	}
+	if got := nic.stats.Rx.Packets.Value(); got != 0 {
+		t.Errorf("got Rx.Packets = %d, want = 0", got)
+	}
+	if got := nic.stats.Rx.Bytes.Value(); got != 0 {
+		t.Errorf("got Rx.Bytes = %d, want = 0", got)
+	}
+
+	if t.Failed() {
+		t.FailNow()
+	}
+
+	nic.DeliverNetworkPacket(nil, "", "", 0, tcpip.PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()})
+
+	if got := nic.stats.DisabledRx.Packets.Value(); got != 1 {
+		t.Errorf("got DisabledRx.Packets = %d, want = 1", got)
+	}
+	if got := nic.stats.DisabledRx.Bytes.Value(); got != 4 {
+		t.Errorf("got DisabledRx.Bytes = %d, want = 4", got)
+	}
+	if got := nic.stats.Rx.Packets.Value(); got != 0 {
+		t.Errorf("got Rx.Packets = %d, want = 0", got)
+	}
+	if got := nic.stats.Rx.Bytes.Value(); got != 0 {
+		t.Errorf("got Rx.Bytes = %d, want = 0", got)
+	}
+}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 834fe9487..243868f3a 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -2561,3 +2561,118 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 		})
 	}
 }
+
+// TestDoDADWhenNICEnabled tests that IPv6 endpoints that were added while a NIC
+// was disabled have DAD performed on them when the NIC is enabled.
+func TestDoDADWhenNICEnabled(t *testing.T) {
+	t.Parallel()
+
+	const dadTransmits = 1
+	const retransmitTimer = time.Second
+	const nicID = 1
+
+	ndpDisp := ndpDispatcher{
+		dadC: make(chan ndpDADEvent),
+	}
+	opts := stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			DupAddrDetectTransmits: dadTransmits,
+			RetransmitTimer:        retransmitTimer,
+		},
+		NDPDisp: &ndpDisp,
+	}
+
+	e := channel.New(dadTransmits, 1280, linkAddr1)
+	s := stack.New(opts)
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	addr := tcpip.ProtocolAddress{
+		Protocol: header.IPv6ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   llAddr1,
+			PrefixLen: 128,
+		},
+	}
+	if err := s.AddProtocolAddress(nicID, addr); err != nil {
+		t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, addr, err)
+	}
+
+	// Address should be in the list of all addresses.
+	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
+		t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr)
+	}
+
+	// Address should be tentative so it should not be a main address.
+	got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if want := (tcpip.AddressWithPrefix{}); got != want {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, want)
+	}
+
+	// Enabling the NIC should start DAD for the address.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
+		t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr)
+	}
+
+	// Address should not be considered bound to the NIC yet (DAD ongoing).
+	got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if want := (tcpip.AddressWithPrefix{}); got != want {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, want)
+	}
+
+	// Wait for DAD to resolve.
+	select {
+	case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout):
+		t.Fatal("timed out waiting for DAD resolution")
+	case e := <-ndpDisp.dadC:
+		if e.err != nil {
+			t.Fatal("got DAD error: ", e.err)
+		}
+		if e.nicID != nicID {
+			t.Fatalf("got DAD event w/ nicID = %d, want = %d", e.nicID, nicID)
+		}
+		if e.addr != addr.AddressWithPrefix.Address {
+			t.Fatalf("got DAD event w/ addr = %s, want = %s", e.addr, addr.AddressWithPrefix.Address)
+		}
+		if !e.resolved {
+			t.Fatal("got DAD event w/ resolved = false, want = true")
+		}
+	}
+	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
+		t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr)
+	}
+	got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if got != addr.AddressWithPrefix {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr.AddressWithPrefix)
+	}
+
+	// Enabling the NIC again should be a no-op.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
+		t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr)
+	}
+	got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if got != addr.AddressWithPrefix {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, addr.AddressWithPrefix)
+	}
+}
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index d29d9a704..0e944712f 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -1170,7 +1170,9 @@ type TransportEndpointStats struct {
 // marker interface.
 func (*TransportEndpointStats) IsEndpointStats() {}
 
-func fillIn(v reflect.Value) {
+// InitStatCounters initializes v's fields with nil StatCounter fields to new
+// StatCounters.
+func InitStatCounters(v reflect.Value) {
 	for i := 0; i < v.NumField(); i++ {
 		v := v.Field(i)
 		if s, ok := v.Addr().Interface().(**StatCounter); ok {
@@ -1178,14 +1180,14 @@ func fillIn(v reflect.Value) {
 				*s = new(StatCounter)
 			}
 		} else {
-			fillIn(v)
+			InitStatCounters(v)
 		}
 	}
 }
 
 // FillIn returns a copy of s with nil fields initialized to new StatCounters.
 func (s Stats) FillIn() Stats {
-	fillIn(reflect.ValueOf(&s).Elem())
+	InitStatCounters(reflect.ValueOf(&s).Elem())
 	return s
 }
 
-- 
cgit v1.2.3


From 16561e461e82f8d846ef1f3ada990270ef39ccc6 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Thu, 6 Feb 2020 15:59:44 -0800
Subject: Add logic to run from baked images.

Change adds the following:
- logic to run from "baked images". See [GVISOR_DIR]/tools/images
- installers which install modified files from a workspace. This
allows users to run benchmarks while modifying runsc.
- removes the --preemptible tag from built GCE instances. Preemptible
instances are much more likely to be preempted on startup, which
manifests for the user as a failed benchmark. I don't currently have
a way to detect if a VM has been preempted that will work for this
change.
https://cloud.google.com/compute/docs/instances/preemptible#preemption_process
https://cloud.google.com/compute/docs/instances/preemptible#preemption_selection

PiperOrigin-RevId: 293697949
---
 benchmarks/BUILD                                   |   8 ++
 benchmarks/README.md                               |  21 +++-
 benchmarks/harness/BUILD                           |  21 ++++
 benchmarks/harness/__init__.py                     |  36 ++++++-
 benchmarks/harness/machine.py                      |  43 +++++++-
 benchmarks/harness/machine_producers/BUILD         |   1 +
 .../harness/machine_producers/gcloud_producer.py   | 114 ++++++++-------------
 benchmarks/harness/ssh_connection.py               |  25 +++--
 benchmarks/runner/__init__.py                      |  75 ++++----------
 benchmarks/runner/commands.py                      |  70 ++++++-------
 tools/images/defs.bzl                              |   5 +-
 tools/installers/BUILD                             |   7 +-
 tools/installers/head.sh                           |   2 +-
 13 files changed, 250 insertions(+), 178 deletions(-)

diff --git a/benchmarks/BUILD b/benchmarks/BUILD
index 1455c6c5b..43614cf5d 100644
--- a/benchmarks/BUILD
+++ b/benchmarks/BUILD
@@ -3,8 +3,16 @@ package(licenses = ["notice"])
 py_binary(
     name = "benchmarks",
     srcs = ["run.py"],
+    data = [
+        "//tools/images:ubuntu1604",
+        "//tools/images:zone",
+    ],
     main = "run.py",
     python_version = "PY3",
     srcs_version = "PY3",
+    tags = [
+        "local",
+        "manual",
+    ],
     deps = ["//benchmarks/runner"],
 )
diff --git a/benchmarks/README.md b/benchmarks/README.md
index ff21614c5..975321c99 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -26,6 +26,8 @@ For configuring the environment manually, consult the
 
 ## Running benchmarks
 
+### Locally
+
 Run the following from the benchmarks directory:
 
 ```bash
@@ -44,7 +46,7 @@ runtime, runc. Running on another installed runtime, like say runsc, is as
 simple as:
 
 ```bash
-bazel run :benchmakrs -- run-local startup --runtime=runsc
+bazel run :benchmarks -- run-local startup --runtime=runsc
 ```
 
 There is help: ``bash bash bazel run :benchmarks -- --help bazel
@@ -104,6 +106,23 @@ Or with different parameters:
 bazel run :benchmarks -- run-local --max_prime=10 --max_prime=100 sysbench.cpu
 ```
 
+### On Google Compute Engine (GCE)
+
+Benchmarks may be run on GCE in an automated way. The default project configured
+for `gcloud` will be used.
+
+An additional parameter `installers` may be provided to ensure that the latest
+runtime is installed from the workspace. See the files in `tools/installers` for
+supported install targets.
+
+```bash
+bazel run :benchmarks -- run-gcp --installers=head --runtime=runsc sysbench.cpu
+```
+
+When running on GCE, the scripts generate a per run SSH key, which is added to
+your project. The key is set to expire in GCE after 60 minutes and is stored in
+a temporary directory on the local machine running the scripts.
+
 ## Writing benchmarks
 
 To write new benchmarks, you should familiarize yourself with the structure of
diff --git a/benchmarks/harness/BUILD b/benchmarks/harness/BUILD
index 52d4e42f8..4d03e3a06 100644
--- a/benchmarks/harness/BUILD
+++ b/benchmarks/harness/BUILD
@@ -1,3 +1,4 @@
+load("//tools:defs.bzl", "pkg_tar")
 load("//tools:defs.bzl", "py_library", "py_requirement")
 
 package(
@@ -5,9 +6,29 @@ package(
     licenses = ["notice"],
 )
 
+pkg_tar(
+    name = "installers",
+    srcs = [
+        "//tools/installers:head",
+        "//tools/installers:master",
+        "//tools/installers:runsc",
+    ],
+    mode = "0755",
+)
+
+filegroup(
+    name = "files",
+    srcs = [
+        ":installers",
+    ],
+)
+
 py_library(
     name = "harness",
     srcs = ["__init__.py"],
+    data = [
+        ":files",
+    ],
 )
 
 py_library(
diff --git a/benchmarks/harness/__init__.py b/benchmarks/harness/__init__.py
index 61fd25f73..15aa2a69a 100644
--- a/benchmarks/harness/__init__.py
+++ b/benchmarks/harness/__init__.py
@@ -1,5 +1,5 @@
 # python3
-# Copyright 2019 Google LLC
+# Copyright 2019 The gVisor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,18 +15,48 @@
 
 import getpass
 import os
+import subprocess
+import tempfile
 
 # LOCAL_WORKLOADS_PATH defines the path to use for local workloads. This is a
 # format string that accepts a single string parameter.
-LOCAL_WORKLOADS_PATH = os.path.join(
-    os.path.dirname(__file__), "../workloads/{}/tar.tar")
+LOCAL_WORKLOADS_PATH = os.path.dirname(__file__) + "/../workloads/{}/tar.tar"
 
 # REMOTE_WORKLOADS_PATH defines the path to use for storing the workloads on the
 # remote host. This is a format string that accepts a single string parameter.
 REMOTE_WORKLOADS_PATH = "workloads/{}"
 
+# INSTALLER_ROOT is the set of files that needs to be copied.
+INSTALLER_ARCHIVE = os.readlink(os.path.join(
+    os.path.dirname(__file__), "installers.tar"))
+
+# SSH_KEY_DIR holds SSH_PRIVATE_KEY for this run. bm-tools paramiko requires
+# keys generated with the '-t rsa -m PEM' options from ssh-keygen. This is
+# abstracted away from the user.
+SSH_KEY_DIR = tempfile.TemporaryDirectory()
+SSH_PRIVATE_KEY = "key"
+
 # DEFAULT_USER is the default user running this script.
 DEFAULT_USER = getpass.getuser()
 
 # DEFAULT_USER_HOME is the home directory of the user running the script.
 DEFAULT_USER_HOME = os.environ["HOME"] if "HOME" in os.environ else ""
+
+# Default directory to remotely installer "installer" targets.
+REMOTE_INSTALLERS_PATH = "installers"
+
+
+def make_key():
+  """Wraps a valid ssh key in a temporary directory."""
+  path = os.path.join(SSH_KEY_DIR.name, SSH_PRIVATE_KEY)
+  if not os.path.exists(path):
+    cmd = "ssh-keygen -t rsa -m PEM -b 4096 -f {key} -q -N".format(
+        key=path).split(" ")
+    cmd.append("")
+    subprocess.run(cmd, check=True)
+  return path
+
+
+def delete_key():
+  """Deletes temporary directory containing private key."""
+  SSH_KEY_DIR.cleanup()
diff --git a/benchmarks/harness/machine.py b/benchmarks/harness/machine.py
index 2df4c9e31..3d32d3dda 100644
--- a/benchmarks/harness/machine.py
+++ b/benchmarks/harness/machine.py
@@ -1,5 +1,5 @@
 # python3
-# Copyright 2019 Google LLC
+# Copyright 2019 The gVisor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -29,10 +29,11 @@ to run contianers.
 """
 
 import logging
+import os
 import re
 import subprocess
 import time
-from typing import Tuple
+from typing import List, Tuple
 
 import docker
 
@@ -201,6 +202,7 @@ class RemoteMachine(Machine):
     self._tunnel = tunnel_dispatcher.Tunnel(name, **kwargs)
     self._tunnel.connect()
     self._docker_client = self._tunnel.get_docker_client()
+    self._has_installers = False
 
   def run(self, cmd: str) -> Tuple[str, str]:
     return self._ssh_connection.run(cmd)
@@ -210,14 +212,45 @@ class RemoteMachine(Machine):
     stdout, stderr = self._ssh_connection.run("cat '{}'".format(path))
     return stdout + stderr
 
+  def install(self,
+              installer: str,
+              results: List[bool] = None,
+              index: int = -1):
+    """Method unique to RemoteMachine to handle installation of installers.
+
+    Handles installers, which install things that may change between runs (e.g.
+    runsc). Usually called from gcloud_producer, which expects this method to
+    to store results.
+
+    Args:
+      installer: the installer target to run.
+      results: Passed by the caller of where to store success.
+      index: Index for this method to store the result in the passed results
+        list.
+    """
+    # This generates a tarball of the full installer root (which will generate
+    # be the full bazel root directory) and sends it over.
+    if not self._has_installers:
+      archive = self._ssh_connection.send_installers()
+      self.run("tar -xvf {archive} -C {dir}".format(
+          archive=archive, dir=harness.REMOTE_INSTALLERS_PATH))
+      self._has_installers = True
+
+      # Execute the remote installer.
+      self.run("sudo {dir}/{file}".format(
+          dir=harness.REMOTE_INSTALLERS_PATH, file=installer))
+    if results:
+      results[index] = True
+
   def pull(self, workload: str) -> str:
     # Push to the remote machine and build.
     logging.info("Building %s@%s remotely...", workload, self._name)
     remote_path = self._ssh_connection.send_workload(workload)
+    remote_dir = os.path.dirname(remote_path)
     # Workloads are all tarballs.
-    self.run("tar -xvf {remote_path}/tar.tar -C {remote_path}".format(
-        remote_path=remote_path))
-    self.run("docker build --tag={} {}".format(workload, remote_path))
+    self.run("tar -xvf {remote_path} -C {remote_dir}".format(
+        remote_path=remote_path, remote_dir=remote_dir))
+    self.run("docker build --tag={} {}".format(workload, remote_dir))
     return workload  # Workload is the tag.
 
   def container(self, image: str, **kwargs) -> container.Container:
diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD
index 48ea0ef39..3711a397f 100644
--- a/benchmarks/harness/machine_producers/BUILD
+++ b/benchmarks/harness/machine_producers/BUILD
@@ -76,5 +76,6 @@ py_test(
     python_version = "PY3",
     tags = [
         "local",
+        "manual",
     ],
 )
diff --git a/benchmarks/harness/machine_producers/gcloud_producer.py b/benchmarks/harness/machine_producers/gcloud_producer.py
index e0b77d52b..513d16e4f 100644
--- a/benchmarks/harness/machine_producers/gcloud_producer.py
+++ b/benchmarks/harness/machine_producers/gcloud_producer.py
@@ -1,5 +1,5 @@
 # python3
-# Copyright 2019 Google LLC
+# Copyright 2019 The gVisor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -46,12 +46,11 @@ class GCloudProducer(machine_producer.MachineProducer):
   Produces Machine objects backed by GCP instances.
 
   Attributes:
-    project: The GCP project name under which to create the machines.
-    ssh_key_file: path to a valid ssh private key. See README on vaild ssh keys.
     image: image name as a string.
-    image_project: image project as a string.
-    machine_type: type of GCP to create. e.g. n1-standard-4
     zone: string to a valid GCP zone.
+    machine_type: type of GCP to create (e.g. n1-standard-4).
+    installers: list of installers post-boot.
+    ssh_key_file: path to a valid ssh private key. See README on vaild ssh keys.
     ssh_user: string of user name for ssh_key
     ssh_password: string of password for ssh key
     mock: a mock printer which will print mock data if required. Mock data is
@@ -60,21 +59,19 @@ class GCloudProducer(machine_producer.MachineProducer):
   """
 
   def __init__(self,
-               project: str,
-               ssh_key_file: str,
                image: str,
-               image_project: str,
-               machine_type: str,
                zone: str,
+               machine_type: str,
+               installers: List[str],
+               ssh_key_file: str,
                ssh_user: str,
                ssh_password: str,
                mock: gcloud_mock_recorder.MockPrinter = None):
-    self.project = project
-    self.ssh_key_file = ssh_key_file
     self.image = image
-    self.image_project = image_project
-    self.machine_type = machine_type
     self.zone = zone
+    self.machine_type = machine_type
+    self.installers = installers
+    self.ssh_key_file = ssh_key_file
     self.ssh_user = ssh_user
     self.ssh_password = ssh_password
     self.mock = mock
@@ -87,10 +84,34 @@ class GCloudProducer(machine_producer.MachineProducer):
           "Cannot ask for {num} machines!".format(num=num_machines))
     with self.condition:
       names = self._get_unique_names(num_machines)
-      self._build_instances(names)
-    instances = self._start_command(names)
+      instances = self._build_instances(names)
     self._add_ssh_key_to_instances(names)
-    return self._machines_from_instances(instances)
+    machines = self._machines_from_instances(instances)
+
+    # Install all bits in lock-step.
+    #
+    # This will perform paralell installations for however many machines we
+    # have, but it's easy to track errors because if installing (a, b, c), we
+    # won't install "c" until "b" is installed on all machines.
+    for installer in self.installers:
+      threads = [None] * len(machines)
+      results = [False] * len(machines)
+      for i in range(len(machines)):
+        threads[i] = threading.Thread(
+            target=machines[i].install, args=(installer, results, i))
+        threads[i].start()
+      for thread in threads:
+        thread.join()
+      for result in results:
+        if not result:
+          raise NotImplementedError(
+              "Installers failed on at least one machine!")
+
+    # Add this user to each machine's docker group.
+    for m in machines:
+      m.run("sudo setfacl -m user:$USER:rw /var/run/docker.sock")
+
+    return machines
 
   def release_machines(self, machine_list: List[machine.Machine]):
     """Releases the requested number of machines, deleting the instances."""
@@ -123,15 +144,7 @@ class GCloudProducer(machine_producer.MachineProducer):
 
   def _get_unique_names(self, num_names) -> List[str]:
     """Returns num_names unique names based on data from the GCP project."""
-    curr_machines = self._list_machines()
-    curr_names = set([machine["name"] for machine in curr_machines])
-    ret = []
-    while len(ret) < num_names:
-      new_name = "machine-" + str(uuid.uuid4())
-      if new_name not in curr_names:
-        ret.append(new_name)
-        curr_names.update(new_name)
-    return ret
+    return ["machine-" + str(uuid.uuid4()) for _ in range(0, num_names)]
 
   def _build_instances(self, names: List[str]) -> List[Dict[str, Any]]:
     """Creates instances using gcloud command.
@@ -151,34 +164,9 @@ class GCloudProducer(machine_producer.MachineProducer):
           "_build_instances cannot create instances without names.")
     cmd = "gcloud compute instances create".split(" ")
     cmd.extend(names)
-    cmd.extend(
-        "--preemptible --image={image} --zone={zone} --machine-type={machine_type}"
-        .format(
-            image=self.image, zone=self.zone,
-            machine_type=self.machine_type).split(" "))
-    if self.image_project:
-      cmd.append("--image-project={project}".format(project=self.image_project))
-    res = self._run_command(cmd)
-    return json.loads(res.stdout)
-
-  def _start_command(self, names):
-    """Starts instances using gcloud command.
-
-    Runs the command `gcloud compute instances start` on list of instances by
-    name and returns json data on started instances on success.
-
-    Args:
-      names: list of names of instances to start.
-
-    Returns:
-      List of json data describing started machines.
-    """
-    if not names:
-      raise ValueError("_start_command cannot start empty instance list.")
-    cmd = "gcloud compute instances start".split(" ")
-    cmd.extend(names)
-    cmd.append("--zone={zone}".format(zone=self.zone))
-    cmd.append("--project={project}".format(project=self.project))
+    cmd.append("--image=" + self.image)
+    cmd.append("--zone=" + self.zone)
+    cmd.append("--machine-type=" + self.machine_type)
     res = self._run_command(cmd)
     return json.loads(res.stdout)
 
@@ -186,7 +174,7 @@ class GCloudProducer(machine_producer.MachineProducer):
     """Adds ssh key to instances by calling gcloud ssh command.
 
     Runs the command `gcloud compute ssh instance_name` on list of images by
-    name. Tries to ssh into given instance
+    name. Tries to ssh into given instance.
 
     Args:
       names: list of machine names to which to add the ssh-key
@@ -202,30 +190,18 @@ class GCloudProducer(machine_producer.MachineProducer):
       cmd.append("--ssh-key-file={key}".format(key=self.ssh_key_file))
       cmd.append("--zone={zone}".format(zone=self.zone))
       cmd.append("--command=uname")
+      cmd.append("--ssh-key-expire-after=60m")
       timeout = datetime.timedelta(seconds=5 * 60)
       start = datetime.datetime.now()
       while datetime.datetime.now() <= timeout + start:
         try:
           self._run_command(cmd)
           break
-        except subprocess.CalledProcessError as e:
+        except subprocess.CalledProcessError:
           if datetime.datetime.now() > timeout + start:
             raise TimeoutError(
                 "Could not SSH into instance after 5 min: {name}".format(
                     name=name))
-          # 255 is the returncode for ssh connection refused.
-          elif e.returncode == 255:
-
-            continue
-          else:
-            raise e
-
-  def _list_machines(self) -> List[Dict[str, Any]]:
-    """Runs `list` gcloud command and returns list of Machine data."""
-    cmd = "gcloud compute instances list --project {project}".format(
-        project=self.project).split(" ")
-    res = self._run_command(cmd)
-    return json.loads(res.stdout)
 
   def _run_command(self,
                    cmd: List[str],
@@ -261,7 +237,7 @@ class GCloudProducer(machine_producer.MachineProducer):
       self.mock.record(res)
     if res.returncode != 0:
       raise subprocess.CalledProcessError(
-          cmd=res.args,
+          cmd=" ".join(res.args),
           output=res.stdout,
           stderr=res.stderr,
           returncode=res.returncode)
diff --git a/benchmarks/harness/ssh_connection.py b/benchmarks/harness/ssh_connection.py
index e0bf258f1..a50e34293 100644
--- a/benchmarks/harness/ssh_connection.py
+++ b/benchmarks/harness/ssh_connection.py
@@ -1,5 +1,5 @@
 # python3
-# Copyright 2019 Google LLC
+# Copyright 2019 The gVisor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
 # limitations under the License.
 """SSHConnection handles the details of SSH connections."""
 
+
 import os
 import warnings
 
@@ -24,18 +25,24 @@ from benchmarks import harness
 warnings.filterwarnings(action="ignore", module=".*paramiko.*")
 
 
-def send_one_file(client: paramiko.SSHClient, path: str, remote_dir: str):
+def send_one_file(client: paramiko.SSHClient, path: str,
+                  remote_dir: str) -> str:
   """Sends a single file via an SSH client.
 
   Args:
     client: The existing SSH client.
     path: The local path.
     remote_dir: The remote directory.
+
+  Returns:
+    :return: The remote path as a string.
   """
   filename = path.split("/").pop()
-  client.exec_command("mkdir -p " + remote_dir)
+  if remote_dir != ".":
+    client.exec_command("mkdir -p " + remote_dir)
   with client.open_sftp() as ftp_client:
     ftp_client.put(path, os.path.join(remote_dir, filename))
+  return os.path.join(remote_dir, filename)
 
 
 class SSHConnection:
@@ -103,6 +110,12 @@ class SSHConnection:
       The remote path.
     """
     with self._client() as client:
-      send_one_file(client, harness.LOCAL_WORKLOADS_PATH.format(name),
-                    harness.REMOTE_WORKLOADS_PATH.format(name))
-    return harness.REMOTE_WORKLOADS_PATH.format(name)
+      return send_one_file(client, harness.LOCAL_WORKLOADS_PATH.format(name),
+                           harness.REMOTE_WORKLOADS_PATH.format(name))
+
+  def send_installers(self) -> str:
+    with self._client() as client:
+      return send_one_file(
+          client,
+          path=harness.INSTALLER_ARCHIVE,
+          remote_dir=harness.REMOTE_INSTALLERS_PATH)
diff --git a/benchmarks/runner/__init__.py b/benchmarks/runner/__init__.py
index ba80d83d7..ba27dc69f 100644
--- a/benchmarks/runner/__init__.py
+++ b/benchmarks/runner/__init__.py
@@ -1,5 +1,5 @@
 # python3
-# Copyright 2019 Google LLC
+# Copyright 2019 The gVisor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,13 +15,10 @@
 
 import copy
 import csv
-import json
 import logging
-import os
 import pkgutil
 import pydoc
 import re
-import subprocess
 import sys
 import types
 from typing import List
@@ -123,57 +120,29 @@ def run_mock(ctx, **kwargs):
 
 @runner.command("run-gcp", commands.GCPCommand)
 @click.pass_context
-def run_gcp(ctx, project: str, ssh_key_file: str, image: str,
-            image_project: str, machine_type: str, zone: str, ssh_user: str,
-            ssh_password: str, **kwargs):
+def run_gcp(ctx, image_file: str, zone_file: str, machine_type: str,
+            installers: List[str], **kwargs):
   """Runs all benchmarks on GCP instances."""
 
-  if not ssh_user:
-    ssh_user = harness.DEFAULT_USER
-
-  # Get the default project if one was not provided.
-  if not project:
-    sub = subprocess.run(
-        "gcloud config get-value project".split(" "), stdout=subprocess.PIPE)
-    if sub.returncode:
-      raise ValueError(
-          "Cannot get default project from gcloud. Is it configured>")
-    project = sub.stdout.decode("utf-8").strip("\n")
-
-  if not image_project:
-    image_project = project
-
-  # Check that the ssh-key exists and is readable.
-  if not os.access(ssh_key_file, os.R_OK):
-    raise ValueError(
-        "ssh key given `{ssh_key}` is does not exist or is not readable."
-        .format(ssh_key=ssh_key_file))
-
-  # Check that the image exists.
-  sub = subprocess.run(
-      "gcloud compute images describe {image} --project {image_project} --format=json"
-      .format(image=image, image_project=image_project).split(" "),
-      stdout=subprocess.PIPE)
-  if sub.returncode or "READY" not in json.loads(sub.stdout)["status"]:
-    raise ValueError(
-        "given image was not found or is not ready: {image} {image_project}."
-        .format(image=image, image_project=image_project))
-
-  # Check and set zone to default.
-  if not zone:
-    sub = subprocess.run(
-        "gcloud config get-value compute/zone".split(" "),
-        stdout=subprocess.PIPE)
-    if sub.returncode:
-      raise ValueError(
-          "Default zone is not set in gcloud. Set one or pass a zone with the --zone flag."
-      )
-    zone = sub.stdout.decode("utf-8").strip("\n")
-
-  producer = gcloud_producer.GCloudProducer(project, ssh_key_file, image,
-                                            image_project, machine_type, zone,
-                                            ssh_user, ssh_password)
-  run(ctx, producer, **kwargs)
+  # Resolve all files.
+  image = open(image_file).read().rstrip()
+  zone = open(zone_file).read().rstrip()
+
+  key_file = harness.make_key()
+
+  producer = gcloud_producer.GCloudProducer(
+      image,
+      zone,
+      machine_type,
+      installers,
+      ssh_key_file=key_file,
+      ssh_user=harness.DEFAULT_USER,
+      ssh_password="")
+
+  try:
+    run(ctx, producer, **kwargs)
+  finally:
+    harness.delete_key()
 
 
 def run(ctx, producer: machine_producer.MachineProducer, method: str, runs: int,
diff --git a/benchmarks/runner/commands.py b/benchmarks/runner/commands.py
index 7ab12fac6..0fccb2fad 100644
--- a/benchmarks/runner/commands.py
+++ b/benchmarks/runner/commands.py
@@ -1,5 +1,5 @@
 # python3
-# Copyright 2019 Google LLC
+# Copyright 2019 The gVisor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,9 +22,9 @@ def run_mock(**kwargs):
   # mock implementation
 
 """
-import click
+import os
 
-from benchmarks import harness
+import click
 
 
 class RunCommand(click.core.Command):
@@ -90,46 +90,40 @@ class GCPCommand(RunCommand):
   """GCPCommand inherits all flags from RunCommand and adds flags for run_gcp method.
 
   Attributes:
-    project: GCP project
-    ssh_key_path: path to the ssh-key to use for the run
-    image: name of the image to build machines from
-    image_project: GCP project under which to find image
-    zone: a GCP zone (e.g. us-west1-b)
-    ssh_user: username to use for the ssh-key
-    ssh_password: password to use for the ssh-key
+    image_file: name of the image to build machines from
+    zone_file: a GCP zone (e.g. us-west1-b)
+    installers: named installers for post-create
+    machine_type: type of machine to create (e.g. n1-standard-4)
   """
 
   def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
 
-    project = click.core.Option(
-        ("--project",),
-        help="Project to run on if not default value given by 'gcloud config get-value project'."
+    image_file = click.core.Option(
+        ("--image_file",),
+        help="The file containing the image for VMs.",
+        default=os.path.join(
+            os.path.dirname(__file__), "../../tools/images/ubuntu1604.txt"),
+    )
+    zone_file = click.core.Option(
+        ("--zone_file",),
+        help="The file containing the GCP zone.",
+        default=os.path.join(
+            os.path.dirname(__file__), "../../tools/images/zone.txt"),
+    )
+    installers = click.core.Option(
+        ("--installers",),
+        help="The set of installers to use.",
+        multiple=True,
+    )
+    machine_type = click.core.Option(
+        ("--machine_type",),
+        help="Type to make all machines.",
+        default="n1-standard-4",
     )
-    ssh_key_path = click.core.Option(
-        ("--ssh-key-file",),
-        help="Path to a valid ssh private key to use. See README on generating a valid ssh key. Set to ~/.ssh/benchmark-tools by default.",
-        default=harness.DEFAULT_USER_HOME + "/.ssh/benchmark-tools")
-    image = click.core.Option(("--image",),
-                              help="The image on which to build VMs.",
-                              default="bm-tools-testing")
-    image_project = click.core.Option(
-        ("--image_project",),
-        help="The project under which the image to be used is listed.",
-        default="")
-    machine_type = click.core.Option(("--machine_type",),
-                                     help="Type to make all machines.",
-                                     default="n1-standard-4")
-    zone = click.core.Option(("--zone",),
-                             help="The GCP zone to run on.",
-                             default="")
-    ssh_user = click.core.Option(("--ssh-user",),
-                                 help="User for the ssh key.",
-                                 default=harness.DEFAULT_USER)
-    ssh_password = click.core.Option(("--ssh-password",),
-                                     help="Password for the ssh key.",
-                                     default="")
     self.params.extend([
-        project, ssh_key_path, image, image_project, machine_type, zone,
-        ssh_user, ssh_password
+        image_file,
+        zone_file,
+        machine_type,
+        installers,
     ])
diff --git a/tools/images/defs.bzl b/tools/images/defs.bzl
index 32235813a..de365d153 100644
--- a/tools/images/defs.bzl
+++ b/tools/images/defs.bzl
@@ -57,7 +57,10 @@ def _vm_image_impl(ctx):
         command = argv,
         input_manifests = runfiles_manifests,
     )
-    return [DefaultInfo(files = depset([ctx.outputs.out]))]
+    return [DefaultInfo(
+        files = depset([ctx.outputs.out]),
+        runfiles = ctx.runfiles(files = [ctx.outputs.out]),
+    )]
 
 _vm_image = rule(
     attrs = {
diff --git a/tools/installers/BUILD b/tools/installers/BUILD
index 01bc4de8c..d78a265ca 100644
--- a/tools/installers/BUILD
+++ b/tools/installers/BUILD
@@ -5,10 +5,15 @@ package(
     licenses = ["notice"],
 )
 
+filegroup(
+    name = "runsc",
+    srcs = ["//runsc"],
+)
+
 sh_binary(
     name = "head",
     srcs = ["head.sh"],
-    data = ["//runsc"],
+    data = [":runsc"],
 )
 
 sh_binary(
diff --git a/tools/installers/head.sh b/tools/installers/head.sh
index 4435cb27a..9de8f138c 100755
--- a/tools/installers/head.sh
+++ b/tools/installers/head.sh
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 # Install our runtime.
-third_party/gvisor/runsc/runsc install
+$(dirname $0)/runsc install
 
 # Restart docker.
 service docker restart || true
-- 
cgit v1.2.3


From 3700221b1f3ff0779a0f4479fd2bafa3312d5a23 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 6 Feb 2020 16:42:37 -0800
Subject: Auto-generate link-local address as a SLAAC address

Auto-generated link-local addresses should have the same lifecycle hooks
as global SLAAC addresses.

The Stack's NDP dispatcher should be notified when link-local addresses
are auto-generated and invalidated. They should also be removed when a
NIC is disabled (which will be supported in a later change).

Tests:
- stack_test.TestNICAutoGenAddrWithOpaque
- stack_test.TestNICAutoGenAddr
PiperOrigin-RevId: 293706760
---
 pkg/tcpip/stack/ndp.go        |  36 +++--
 pkg/tcpip/stack/ndp_test.go   |  85 +++++++-----
 pkg/tcpip/stack/nic.go        |  30 +----
 pkg/tcpip/stack/stack_test.go | 307 +++++++++++++++++++-----------------------
 4 files changed, 218 insertions(+), 240 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 6123fda33..fae5f5014 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -906,22 +906,21 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 		return
 	}
 
-	// We do not already have an address within the prefix, prefix. Do the
+	// We do not already have an address with the prefix prefix. Do the
 	// work as outlined by RFC 4862 section 5.5.3.d if n is configured
-	// to auto-generated global addresses by SLAAC.
-	ndp.newAutoGenAddress(prefix, pl, vl)
+	// to auto-generate global addresses by SLAAC.
+	if !ndp.configs.AutoGenGlobalAddresses {
+		return
+	}
+
+	ndp.doSLAAC(prefix, pl, vl)
 }
 
-// newAutoGenAddress generates a new SLAAC address with the provided lifetimes
+// doSLAAC generates a new SLAAC address with the provided lifetimes
 // for prefix.
 //
 // pl is the new preferred lifetime. vl is the new valid lifetime.
-func (ndp *ndpState) newAutoGenAddress(prefix tcpip.Subnet, pl, vl time.Duration) {
-	// Are we configured to auto-generate new global addresses?
-	if !ndp.configs.AutoGenGlobalAddresses {
-		return
-	}
-
+func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 	// If we do not already have an address for this prefix and the valid
 	// lifetime is 0, no need to do anything further, as per RFC 4862
 	// section 5.5.3.d.
@@ -1152,12 +1151,21 @@ func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bo
 //
 // The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) cleanupHostOnlyState() {
+	linkLocalSubnet := header.IPv6LinkLocalPrefix.Subnet()
+	linkLocalAddrs := 0
 	for addr := range ndp.autoGenAddresses {
+		// RFC 4862 section 5 states that routers are also expected to generate a
+		// link-local address so we do not invalidate them.
+		if linkLocalSubnet.Contains(addr) {
+			linkLocalAddrs++
+			continue
+		}
+
 		ndp.invalidateAutoGenAddress(addr)
 	}
 
-	if got := len(ndp.autoGenAddresses); got != 0 {
-		log.Fatalf("ndp: still have auto-generated addresses after cleaning up, found = %d", got)
+	if got := len(ndp.autoGenAddresses); got != linkLocalAddrs {
+		log.Fatalf("ndp: still have non-linklocal auto-generated addresses after cleaning up; found = %d prefixes, of which %d are link-local", got, linkLocalAddrs)
 	}
 
 	for prefix := range ndp.onLinkPrefixes {
@@ -1165,7 +1173,7 @@ func (ndp *ndpState) cleanupHostOnlyState() {
 	}
 
 	if got := len(ndp.onLinkPrefixes); got != 0 {
-		log.Fatalf("ndp: still have discovered on-link prefixes after cleaning up, found = %d", got)
+		log.Fatalf("ndp: still have discovered on-link prefixes after cleaning up; found = %d", got)
 	}
 
 	for router := range ndp.defaultRouters {
@@ -1173,7 +1181,7 @@ func (ndp *ndpState) cleanupHostOnlyState() {
 	}
 
 	if got := len(ndp.defaultRouters); got != 0 {
-		log.Fatalf("ndp: still have discovered default routers after cleaning up, found = %d", got)
+		log.Fatalf("ndp: still have discovered default routers after cleaning up; found = %d", got)
 	}
 }
 
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 1e575bdaf..e13509fbd 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -42,6 +42,7 @@ const (
 	linkAddr1                = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
 	linkAddr2                = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x07")
 	linkAddr3                = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x08")
+	linkAddr4                = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x09")
 	defaultTimeout           = 100 * time.Millisecond
 	defaultAsyncEventTimeout = time.Second
 )
@@ -50,6 +51,7 @@ var (
 	llAddr1 = header.LinkLocalAddr(linkAddr1)
 	llAddr2 = header.LinkLocalAddr(linkAddr2)
 	llAddr3 = header.LinkLocalAddr(linkAddr3)
+	llAddr4 = header.LinkLocalAddr(linkAddr4)
 	dstAddr = tcpip.FullAddress{
 		Addr: "\x0a\x0b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01",
 		Port: 25,
@@ -2882,8 +2884,8 @@ func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
 }
 
 // TestCleanupHostOnlyStateOnBecomingRouter tests that all discovered routers
-// and prefixes, and auto-generated addresses get invalidated when a NIC
-// becomes a router.
+// and prefixes, and non-linklocal auto-generated addresses are invalidated when
+// a NIC becomes a router.
 func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	t.Parallel()
 
@@ -2898,6 +2900,14 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	prefix2, subnet2, e1Addr2 := prefixSubnetAddr(1, linkAddr1)
 	e2Addr1 := addrForSubnet(subnet1, linkAddr2)
 	e2Addr2 := addrForSubnet(subnet2, linkAddr2)
+	llAddrWithPrefix1 := tcpip.AddressWithPrefix{
+		Address:   llAddr1,
+		PrefixLen: 64,
+	}
+	llAddrWithPrefix2 := tcpip.AddressWithPrefix{
+		Address:   llAddr2,
+		PrefixLen: 64,
+	}
 
 	ndpDisp := ndpDispatcher{
 		routerC:        make(chan ndpRouterEvent, maxEvents),
@@ -2907,7 +2917,8 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		autoGenAddrC:   make(chan ndpAutoGenAddrEvent, maxEvents),
 	}
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+		AutoGenIPv6LinkLocal: true,
 		NDPConfigs: stack.NDPConfigurations{
 			HandleRAs:              true,
 			DiscoverDefaultRouters: true,
@@ -2917,16 +2928,6 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		NDPDisp: &ndpDisp,
 	})
 
-	e1 := channel.New(0, 1280, linkAddr1)
-	if err := s.CreateNIC(nicID1, e1); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
-	}
-
-	e2 := channel.New(0, 1280, linkAddr2)
-	if err := s.CreateNIC(nicID2, e2); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
-	}
-
 	expectRouterEvent := func() (bool, ndpRouterEvent) {
 		select {
 		case e := <-ndpDisp.routerC:
@@ -2957,18 +2958,30 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		return false, ndpAutoGenAddrEvent{}
 	}
 
-	// Receive RAs on NIC(1) and NIC(2) from default routers (llAddr1 and
-	// llAddr2) w/ PI (for prefix1 in RA from llAddr1 and prefix2 in RA from
-	// llAddr2) to discover multiple routers and prefixes, and auto-gen
-	// multiple addresses.
-
-	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr1, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+	e1 := channel.New(0, 1280, linkAddr1)
+	if err := s.CreateNIC(nicID1, e1); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
+	}
 	// We have other tests that make sure we receive the *correct* events
 	// on normal discovery of routers/prefixes, and auto-generated
 	// addresses. Here we just make sure we get an event and let other tests
 	// handle the correctness check.
+	expectAutoGenAddrEvent()
+
+	e2 := channel.New(0, 1280, linkAddr2)
+	if err := s.CreateNIC(nicID2, e2); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
+	}
+	expectAutoGenAddrEvent()
+
+	// Receive RAs on NIC(1) and NIC(2) from default routers (llAddr3 and
+	// llAddr4) w/ PI (for prefix1 in RA from llAddr3 and prefix2 in RA from
+	// llAddr4) to discover multiple routers and prefixes, and auto-gen
+	// multiple addresses.
+
+	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
 	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr1, nicID1)
+		t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID1)
 	}
 	if ok, _ := expectPrefixEvent(); !ok {
 		t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1)
@@ -2977,9 +2990,9 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr1, nicID1)
 	}
 
-	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
 	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr2, nicID1)
+		t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID1)
 	}
 	if ok, _ := expectPrefixEvent(); !ok {
 		t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1)
@@ -2988,9 +3001,9 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID1)
 	}
 
-	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr1, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
 	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr1, nicID2)
+		t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID2)
 	}
 	if ok, _ := expectPrefixEvent(); !ok {
 		t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2)
@@ -2999,9 +3012,9 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID2)
 	}
 
-	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
 	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr2, nicID2)
+		t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID2)
 	}
 	if ok, _ := expectPrefixEvent(); !ok {
 		t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2)
@@ -3014,12 +3027,18 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	nicinfo := s.NICInfo()
 	nic1Addrs := nicinfo[nicID1].ProtocolAddresses
 	nic2Addrs := nicinfo[nicID2].ProtocolAddresses
+	if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
+		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+	}
 	if !containsV6Addr(nic1Addrs, e1Addr1) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
 	}
 	if !containsV6Addr(nic1Addrs, e1Addr2) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
 	}
+	if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
+		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+	}
 	if !containsV6Addr(nic2Addrs, e2Addr1) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
 	}
@@ -3071,10 +3090,10 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	}
 
 	expectedRouterEvents := map[ndpRouterEvent]int{
-		{nicID: nicID1, addr: llAddr1, discovered: false}: 1,
-		{nicID: nicID1, addr: llAddr2, discovered: false}: 1,
-		{nicID: nicID2, addr: llAddr1, discovered: false}: 1,
-		{nicID: nicID2, addr: llAddr2, discovered: false}: 1,
+		{nicID: nicID1, addr: llAddr3, discovered: false}: 1,
+		{nicID: nicID1, addr: llAddr4, discovered: false}: 1,
+		{nicID: nicID2, addr: llAddr3, discovered: false}: 1,
+		{nicID: nicID2, addr: llAddr4, discovered: false}: 1,
 	}
 	if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" {
 		t.Errorf("router events mismatch (-want +got):\n%s", diff)
@@ -3102,12 +3121,18 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	nicinfo = s.NICInfo()
 	nic1Addrs = nicinfo[nicID1].ProtocolAddresses
 	nic2Addrs = nicinfo[nicID2].ProtocolAddresses
+	if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
+		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+	}
 	if containsV6Addr(nic1Addrs, e1Addr1) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
 	}
 	if containsV6Addr(nic1Addrs, e1Addr2) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
 	}
+	if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
+		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+	}
 	if containsV6Addr(nic2Addrs, e2Addr1) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
 	}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 682e9c416..78d451cca 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -206,33 +206,9 @@ func (n *NIC) enable() *tcpip.Error {
 
 	// Do not auto-generate an IPv6 link-local address for loopback devices.
 	if n.stack.autoGenIPv6LinkLocal && !n.isLoopback() {
-		var addr tcpip.Address
-		if oIID := n.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
-			addr = header.LinkLocalAddrWithOpaqueIID(oIID.NICNameFromID(n.ID(), n.name), 0, oIID.SecretKey)
-		} else {
-			l2addr := n.linkEP.LinkAddress()
-
-			// Only attempt to generate the link-local address if we have a valid MAC
-			// address.
-			//
-			// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
-			// LinkEndpoint.LinkAddress) before reaching this point.
-			if !header.IsValidUnicastEthernetAddress(l2addr) {
-				return nil
-			}
-
-			addr = header.LinkLocalAddr(l2addr)
-		}
-
-		if _, err := n.addAddressLocked(tcpip.ProtocolAddress{
-			Protocol: header.IPv6ProtocolNumber,
-			AddressWithPrefix: tcpip.AddressWithPrefix{
-				Address:   addr,
-				PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen,
-			},
-		}, CanBePrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
-			return err
-		}
+		// The valid and preferred lifetime is infinite for the auto-generated
+		// link-local address.
+		n.mu.ndp.doSLAAC(header.IPv6LinkLocalPrefix.Subnet(), header.NDPInfiniteLifetime, header.NDPInfiniteLifetime)
 	}
 
 	// If we are operating as a router, then do not solicit routers since we
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 243868f3a..b2c1763bf 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -1894,112 +1894,6 @@ func TestNICForwarding(t *testing.T) {
 	}
 }
 
-// TestNICAutoGenAddr tests the auto-generation of IPv6 link-local addresses
-// using the modified EUI-64 of the NIC's MAC address (or lack there-of if
-// disabled (default)). Note, DAD will be disabled in these tests.
-func TestNICAutoGenAddr(t *testing.T) {
-	tests := []struct {
-		name      string
-		autoGen   bool
-		linkAddr  tcpip.LinkAddress
-		iidOpts   stack.OpaqueInterfaceIdentifierOptions
-		shouldGen bool
-	}{
-		{
-			"Disabled",
-			false,
-			linkAddr1,
-			stack.OpaqueInterfaceIdentifierOptions{
-				NICNameFromID: func(nicID tcpip.NICID, _ string) string {
-					return fmt.Sprintf("nic%d", nicID)
-				},
-			},
-			false,
-		},
-		{
-			"Enabled",
-			true,
-			linkAddr1,
-			stack.OpaqueInterfaceIdentifierOptions{},
-			true,
-		},
-		{
-			"Nil MAC",
-			true,
-			tcpip.LinkAddress([]byte(nil)),
-			stack.OpaqueInterfaceIdentifierOptions{},
-			false,
-		},
-		{
-			"Empty MAC",
-			true,
-			tcpip.LinkAddress(""),
-			stack.OpaqueInterfaceIdentifierOptions{},
-			false,
-		},
-		{
-			"Invalid MAC",
-			true,
-			tcpip.LinkAddress("\x01\x02\x03"),
-			stack.OpaqueInterfaceIdentifierOptions{},
-			false,
-		},
-		{
-			"Multicast MAC",
-			true,
-			tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
-			stack.OpaqueInterfaceIdentifierOptions{},
-			false,
-		},
-		{
-			"Unspecified MAC",
-			true,
-			tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"),
-			stack.OpaqueInterfaceIdentifierOptions{},
-			false,
-		},
-	}
-
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			opts := stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				OpaqueIIDOpts:    test.iidOpts,
-			}
-
-			if test.autoGen {
-				// Only set opts.AutoGenIPv6LinkLocal when test.autoGen is true because
-				// opts.AutoGenIPv6LinkLocal should be false by default.
-				opts.AutoGenIPv6LinkLocal = true
-			}
-
-			e := channel.New(10, 1280, test.linkAddr)
-			s := stack.New(opts)
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
-			}
-
-			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
-			if err != nil {
-				t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
-			}
-
-			if test.shouldGen {
-				// Should have auto-generated an address and resolved immediately (DAD
-				// is disabled).
-				if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddr(test.linkAddr), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
-					t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
-				}
-			} else {
-				// Should not have auto-generated an address.
-				if want := (tcpip.AddressWithPrefix{}); addr != want {
-					t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
-				}
-			}
-		})
-	}
-}
-
 // TestNICContextPreservation tests that you can read out via stack.NICInfo the
 // Context data you pass via NICContext.Context in stack.CreateNICWithOptions.
 func TestNICContextPreservation(t *testing.T) {
@@ -2040,11 +1934,9 @@ func TestNICContextPreservation(t *testing.T) {
 	}
 }
 
-// TestNICAutoGenAddrWithOpaque tests the auto-generation of IPv6 link-local
-// addresses with opaque interface identifiers. Link Local addresses should
-// always be generated with opaque IIDs if configured to use them, even if the
-// NIC has an invalid MAC address.
-func TestNICAutoGenAddrWithOpaque(t *testing.T) {
+// TestNICAutoGenLinkLocalAddr tests the auto-generation of IPv6 link-local
+// addresses.
+func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 	const nicID = 1
 
 	var secretKey [header.OpaqueIIDSecretKeyMinBytes]byte
@@ -2056,108 +1948,185 @@ func TestNICAutoGenAddrWithOpaque(t *testing.T) {
 		t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", header.OpaqueIIDSecretKeyMinBytes, n)
 	}
 
+	nicNameFunc := func(_ tcpip.NICID, name string) string {
+		return name
+	}
+
 	tests := []struct {
-		name      string
-		nicName   string
-		autoGen   bool
-		linkAddr  tcpip.LinkAddress
-		secretKey []byte
+		name         string
+		nicName      string
+		autoGen      bool
+		linkAddr     tcpip.LinkAddress
+		iidOpts      stack.OpaqueInterfaceIdentifierOptions
+		shouldGen    bool
+		expectedAddr tcpip.Address
 	}{
 		{
 			name:      "Disabled",
 			nicName:   "nic1",
 			autoGen:   false,
 			linkAddr:  linkAddr1,
-			secretKey: secretKey[:],
+			shouldGen: false,
 		},
 		{
-			name:      "Enabled",
-			nicName:   "nic1",
-			autoGen:   true,
-			linkAddr:  linkAddr1,
-			secretKey: secretKey[:],
+			name:     "Disabled without OIID options",
+			nicName:  "nic1",
+			autoGen:  false,
+			linkAddr: linkAddr1,
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:],
+			},
+			shouldGen: false,
 		},
-		// These are all cases where we would not have generated a
-		// link-local address if opaque IIDs were disabled.
+
+		// Tests for EUI64 based addresses.
 		{
-			name:      "Nil MAC and empty nicName",
-			nicName:   "",
+			name:         "EUI64 Enabled",
+			autoGen:      true,
+			linkAddr:     linkAddr1,
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddr(linkAddr1),
+		},
+		{
+			name:      "EUI64 Empty MAC",
 			autoGen:   true,
-			linkAddr:  tcpip.LinkAddress([]byte(nil)),
-			secretKey: secretKey[:1],
+			shouldGen: false,
 		},
 		{
-			name:      "Empty MAC and empty nicName",
+			name:      "EUI64 Invalid MAC",
 			autoGen:   true,
-			linkAddr:  tcpip.LinkAddress(""),
-			secretKey: secretKey[:2],
+			linkAddr:  "\x01\x02\x03",
+			shouldGen: false,
 		},
 		{
-			name:      "Invalid MAC",
-			nicName:   "test",
+			name:      "EUI64 Multicast MAC",
 			autoGen:   true,
-			linkAddr:  tcpip.LinkAddress("\x01\x02\x03"),
-			secretKey: secretKey[:3],
+			linkAddr:  "\x01\x02\x03\x04\x05\x06",
+			shouldGen: false,
 		},
 		{
-			name:      "Multicast MAC",
-			nicName:   "test2",
+			name:      "EUI64 Unspecified MAC",
 			autoGen:   true,
-			linkAddr:  tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
-			secretKey: secretKey[:4],
+			linkAddr:  "\x00\x00\x00\x00\x00\x00",
+			shouldGen: false,
 		},
+
+		// Tests for Opaque IID based addresses.
 		{
-			name:     "Unspecified MAC and nil SecretKey",
+			name:     "OIID Enabled",
+			nicName:  "nic1",
+			autoGen:  true,
+			linkAddr: linkAddr1,
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:],
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("nic1", 0, secretKey[:]),
+		},
+		// These are all cases where we would not have generated a
+		// link-local address if opaque IIDs were disabled.
+		{
+			name:    "OIID Empty MAC and empty nicName",
+			autoGen: true,
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:1],
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("", 0, secretKey[:1]),
+		},
+		{
+			name:     "OIID Invalid MAC",
+			nicName:  "test",
+			autoGen:  true,
+			linkAddr: "\x01\x02\x03",
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:2],
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("test", 0, secretKey[:2]),
+		},
+		{
+			name:     "OIID Multicast MAC",
+			nicName:  "test2",
+			autoGen:  true,
+			linkAddr: "\x01\x02\x03\x04\x05\x06",
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:3],
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("test2", 0, secretKey[:3]),
+		},
+		{
+			name:     "OIID Unspecified MAC and nil SecretKey",
 			nicName:  "test3",
 			autoGen:  true,
-			linkAddr: tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"),
+			linkAddr: "\x00\x00\x00\x00\x00\x00",
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("test3", 0, nil),
 		},
 	}
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			opts := stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
-					NICNameFromID: func(_ tcpip.NICID, nicName string) string {
-						return nicName
-					},
-					SecretKey: test.secretKey,
-				},
+			ndpDisp := ndpDispatcher{
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
 			}
-
-			if test.autoGen {
-				// Only set opts.AutoGenIPv6LinkLocal when
-				// test.autoGen is true because
-				// opts.AutoGenIPv6LinkLocal should be false by
-				// default.
-				opts.AutoGenIPv6LinkLocal = true
+			opts := stack.Options{
+				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+				AutoGenIPv6LinkLocal: test.autoGen,
+				NDPDisp:              &ndpDisp,
+				OpaqueIIDOpts:        test.iidOpts,
 			}
 
-			e := channel.New(10, 1280, test.linkAddr)
+			e := channel.New(0, 1280, test.linkAddr)
 			s := stack.New(opts)
 			nicOpts := stack.NICOptions{Name: test.nicName}
 			if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
 				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err)
 			}
 
-			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
-			if err != nil {
-				t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err)
-			}
+			var expectedMainAddr tcpip.AddressWithPrefix
+
+			if test.shouldGen {
+				expectedMainAddr = tcpip.AddressWithPrefix{
+					Address:   test.expectedAddr,
+					PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen,
+				}
 
-			if test.autoGen {
-				// Should have auto-generated an address and
-				// resolved immediately (DAD is disabled).
-				if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddrWithOpaqueIID(test.nicName, 0, test.secretKey), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
-					t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
+				// Should have auto-generated an address and resolved immediately (DAD
+				// is disabled).
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, expectedMainAddr, newAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
 				}
 			} else {
 				// Should not have auto-generated an address.
-				if want := (tcpip.AddressWithPrefix{}); addr != want {
-					t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				select {
+				case <-ndpDisp.autoGenAddrC:
+					t.Fatal("unexpectedly auto-generated an address")
+				default:
 				}
 			}
+
+			gotMainAddr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+			}
+			if gotMainAddr != expectedMainAddr {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", gotMainAddr, expectedMainAddr)
+			}
 		})
 	}
 }
@@ -2226,7 +2195,7 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) {
 		NDPDisp:              &ndpDisp,
 	}
 
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(int(ndpConfigs.DupAddrDetectTransmits), 1280, linkAddr1)
 	s := stack.New(opts)
 	if err := s.CreateNIC(1, e); err != nil {
 		t.Fatalf("CreateNIC(_) = %s", err)
-- 
cgit v1.2.3


From 386a1a1564e57c36726ea5a45d6e4f739847a658 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Thu, 6 Feb 2020 17:06:14 -0800
Subject: Fix TestPauseResume in container test failed with connection refused.

Sometimes we get this error under TSAN:
"""
error getting process data from container: connecting to control server at PID
XXXX: connection refused
"""

The theory is that the top "sleep 20" was too short for TSAN, and the container
already exited, so we get connected refused. This commit changes the test to
let container signaling it's running by touching a file repeatedly forever
during the test.

PiperOrigin-RevId: 293710957
---
 runsc/container/container_test.go | 201 ++++++++++++++++----------------------
 1 file changed, 85 insertions(+), 116 deletions(-)

diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index b54d8f712..04a7dc237 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -163,7 +163,7 @@ func createWriteableOutputFile(path string) (*os.File, error) {
 	return outputFile, nil
 }
 
-func waitForFile(f *os.File) error {
+func waitForFileNotEmpty(f *os.File) error {
 	op := func() error {
 		fi, err := f.Stat()
 		if err != nil {
@@ -178,6 +178,17 @@ func waitForFile(f *os.File) error {
 	return testutil.Poll(op, 30*time.Second)
 }
 
+func waitForFileExist(path string) error {
+	op := func() error {
+		if _, err := os.Stat(path); os.IsNotExist(err) {
+			return err
+		}
+		return nil
+	}
+
+	return testutil.Poll(op, 30*time.Second)
+}
+
 // readOutputNum reads a file at given filepath and returns the int at the
 // requested position.
 func readOutputNum(file string, position int) (int, error) {
@@ -187,7 +198,7 @@ func readOutputNum(file string, position int) (int, error) {
 	}
 
 	// Ensure that there is content in output file.
-	if err := waitForFile(f); err != nil {
+	if err := waitForFileNotEmpty(f); err != nil {
 		return 0, fmt.Errorf("error waiting for output file: %v", err)
 	}
 
@@ -801,7 +812,7 @@ func TestCheckpointRestore(t *testing.T) {
 		defer file.Close()
 
 		// Wait until application has ran.
-		if err := waitForFile(outputFile); err != nil {
+		if err := waitForFileNotEmpty(outputFile); err != nil {
 			t.Fatalf("Failed to wait for output file: %v", err)
 		}
 
@@ -843,7 +854,7 @@ func TestCheckpointRestore(t *testing.T) {
 		}
 
 		// Wait until application has ran.
-		if err := waitForFile(outputFile2); err != nil {
+		if err := waitForFileNotEmpty(outputFile2); err != nil {
 			t.Fatalf("Failed to wait for output file: %v", err)
 		}
 
@@ -887,7 +898,7 @@ func TestCheckpointRestore(t *testing.T) {
 		}
 
 		// Wait until application has ran.
-		if err := waitForFile(outputFile3); err != nil {
+		if err := waitForFileNotEmpty(outputFile3); err != nil {
 			t.Fatalf("Failed to wait for output file: %v", err)
 		}
 
@@ -981,7 +992,7 @@ func TestUnixDomainSockets(t *testing.T) {
 		defer os.RemoveAll(imagePath)
 
 		// Wait until application has ran.
-		if err := waitForFile(outputFile); err != nil {
+		if err := waitForFileNotEmpty(outputFile); err != nil {
 			t.Fatalf("Failed to wait for output file: %v", err)
 		}
 
@@ -1023,7 +1034,7 @@ func TestUnixDomainSockets(t *testing.T) {
 		}
 
 		// Wait until application has ran.
-		if err := waitForFile(outputFile2); err != nil {
+		if err := waitForFileNotEmpty(outputFile2); err != nil {
 			t.Fatalf("Failed to wait for output file: %v", err)
 		}
 
@@ -1042,126 +1053,84 @@ func TestUnixDomainSockets(t *testing.T) {
 }
 
 // TestPauseResume tests that we can successfully pause and resume a container.
-// It checks starts running sleep and executes another sleep. It pauses and checks
-// that both processes are still running: sleep will be paused and still exist.
-// It will then unpause and confirm that both processes are running. Then it will
-// wait until one sleep completes and check to make sure the other is running.
+// The container will keep touching a file to indicate it's running. The test
+// pauses the container, removes the file, and checks that it doesn't get
+// recreated. Then it resumes the container, verify that the file gets created
+// again.
 func TestPauseResume(t *testing.T) {
 	for _, conf := range configs(noOverlay...) {
-		t.Logf("Running test with conf: %+v", conf)
-		const uid = 343
-		spec := testutil.NewSpecWithArgs("sleep", "20")
-
-		lock, err := ioutil.TempFile(testutil.TmpDir(), "lock")
-		if err != nil {
-			t.Fatalf("error creating output file: %v", err)
-		}
-		defer lock.Close()
+		t.Run(fmt.Sprintf("conf: %+v", conf), func(t *testing.T) {
+			t.Logf("Running test with conf: %+v", conf)
 
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
-
-		// Create and start the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont.Destroy()
-		if err := cont.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
-
-		// expectedPL lists the expected process state of the container.
-		expectedPL := []*control.Process{
-			{
-				UID:     0,
-				PID:     1,
-				PPID:    0,
-				C:       0,
-				Cmd:     "sleep",
-				Threads: []kernel.ThreadID{1},
-			},
-			{
-				UID:     uid,
-				PID:     2,
-				PPID:    0,
-				C:       0,
-				Cmd:     "bash",
-				Threads: []kernel.ThreadID{2},
-			},
-		}
-
-		script := fmt.Sprintf("while [[ -f %q ]]; do sleep 0.1; done", lock.Name())
-		execArgs := &control.ExecArgs{
-			Filename:         "/bin/bash",
-			Argv:             []string{"bash", "-c", script},
-			WorkingDirectory: "/",
-			KUID:             uid,
-		}
+			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "lock")
+			if err != nil {
+				t.Fatalf("error creating temp dir: %v", err)
+			}
+			defer os.RemoveAll(tmpDir)
 
-		// First, start running exec.
-		_, err = cont.Execute(execArgs)
-		if err != nil {
-			t.Fatalf("error executing: %v", err)
-		}
+			running := path.Join(tmpDir, "running")
+			script := fmt.Sprintf("while [[ true ]]; do touch %q; sleep 0.1; done", running)
+			spec := testutil.NewSpecWithArgs("/bin/bash", "-c", script)
 
-		// Verify that "sleep 5" is running.
-		if err := waitForProcessList(cont, expectedPL); err != nil {
-			t.Fatal(err)
-		}
+			rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer os.RemoveAll(rootDir)
+			defer os.RemoveAll(bundleDir)
 
-		// Pause the running container.
-		if err := cont.Pause(); err != nil {
-			t.Errorf("error pausing container: %v", err)
-		}
-		if got, want := cont.Status, Paused; got != want {
-			t.Errorf("container status got %v, want %v", got, want)
-		}
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.UniqueContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		if err := os.Remove(lock.Name()); err != nil {
-			t.Fatalf("os.Remove(lock) failed: %v", err)
-		}
-		// Script loops and sleeps for 100ms. Give a bit a time for it to exit in
-		// case pause didn't work.
-		time.Sleep(200 * time.Millisecond)
+			// Wait until container starts running, observed by the existence of running
+			// file.
+			if err := waitForFileExist(running); err != nil {
+				t.Errorf("error waiting for container to start: %v", err)
+			}
 
-		// Verify that the two processes still exist.
-		if err := getAndCheckProcLists(cont, expectedPL); err != nil {
-			t.Fatal(err)
-		}
+			// Pause the running container.
+			if err := cont.Pause(); err != nil {
+				t.Errorf("error pausing container: %v", err)
+			}
+			if got, want := cont.Status, Paused; got != want {
+				t.Errorf("container status got %v, want %v", got, want)
+			}
 
-		// Resume the running container.
-		if err := cont.Resume(); err != nil {
-			t.Errorf("error pausing container: %v", err)
-		}
-		if got, want := cont.Status, Running; got != want {
-			t.Errorf("container status got %v, want %v", got, want)
-		}
+			if err := os.Remove(running); err != nil {
+				t.Fatalf("os.Remove(%q) failed: %v", running, err)
+			}
+			// Script touches the file every 100ms. Give a bit a time for it to run to
+			// catch the case that pause didn't work.
+			time.Sleep(200 * time.Millisecond)
+			if _, err := os.Stat(running); !os.IsNotExist(err) {
+				t.Fatalf("container did not pause: file exist check: %v", err)
+			}
 
-		expectedPL2 := []*control.Process{
-			{
-				UID:     0,
-				PID:     1,
-				PPID:    0,
-				C:       0,
-				Cmd:     "sleep",
-				Threads: []kernel.ThreadID{1},
-			},
-		}
+			// Resume the running container.
+			if err := cont.Resume(); err != nil {
+				t.Errorf("error pausing container: %v", err)
+			}
+			if got, want := cont.Status, Running; got != want {
+				t.Errorf("container status got %v, want %v", got, want)
+			}
 
-		// Verify that deleting the file triggered the process to exit.
-		if err := waitForProcessList(cont, expectedPL2); err != nil {
-			t.Fatal(err)
-		}
+			// Verify that the file is once again created by container.
+			if err := waitForFileExist(running); err != nil {
+				t.Fatalf("error resuming container: file exist check: %v", err)
+			}
+		})
 	}
 }
 
-- 
cgit v1.2.3


From ca30dfa065f5458228b06dcf5379ed4edf29c165 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 6 Feb 2020 19:49:30 -0800
Subject: Send DAD event when DAD resolves immediately

Previously, a DAD event would not be sent if DAD was disabled.

This allows integrators to do some work when an IPv6 address is bound to
a NIC without special logic that checks if DAD is enabled.

Without this change, integrators would need to check if a NIC has DAD
enabled when an address is auto-generated. If DAD is enabled, it would
need to delay the work until the DAD completion event; otherwise, it
would need to do the work in the address auto-generated event handler.

Test: stack_test.TestDADDisabled
PiperOrigin-RevId: 293732914
---
 pkg/tcpip/stack/ndp.go        |   7 ++
 pkg/tcpip/stack/ndp_test.go   | 263 +++++++++++++++++++++---------------------
 pkg/tcpip/stack/stack_test.go |  44 +++----
 3 files changed, 154 insertions(+), 160 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index fae5f5014..045409bda 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -448,6 +448,13 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 	remaining := ndp.configs.DupAddrDetectTransmits
 	if remaining == 0 {
 		ref.setKind(permanent)
+
+		// Consider DAD to have resolved even if no DAD messages were actually
+		// transmitted.
+		if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+			ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, true, nil)
+		}
+
 		return nil
 	}
 
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index e13509fbd..1f6f77439 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -86,39 +86,6 @@ func prefixSubnetAddr(offset uint8, linkAddr tcpip.LinkAddress) (tcpip.AddressWi
 	return prefix, subnet, addrForSubnet(subnet, linkAddr)
 }
 
-// TestDADDisabled tests that an address successfully resolves immediately
-// when DAD is not enabled (the default for an empty stack.Options).
-func TestDADDisabled(t *testing.T) {
-	opts := stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-	}
-
-	e := channel.New(0, 1280, linkAddr1)
-	s := stack.New(opts)
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(_) = %s", err)
-	}
-
-	if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
-		t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
-	}
-
-	// Should get the address immediately since we should not have performed
-	// DAD on it.
-	addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
-	if err != nil {
-		t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
-	}
-	if addr.Address != addr1 {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, addr1)
-	}
-
-	// We should not have sent any NDP NS messages.
-	if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != 0 {
-		t.Fatalf("got NeighborSolicit = %d, want = 0", got)
-	}
-}
-
 // ndpDADEvent is a set of parameters that was passed to
 // ndpDispatcher.OnDuplicateAddressDetectionStatus.
 type ndpDADEvent struct {
@@ -300,6 +267,58 @@ func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration s
 	}
 }
 
+// Check e to make sure that the event is for addr on nic with ID 1, and the
+// resolved flag set to resolved with the specified err.
+func checkDADEvent(e ndpDADEvent, nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) string {
+	return cmp.Diff(ndpDADEvent{nicID: nicID, addr: addr, resolved: resolved, err: err}, e, cmp.AllowUnexported(e))
+}
+
+// TestDADDisabled tests that an address successfully resolves immediately
+// when DAD is not enabled (the default for an empty stack.Options).
+func TestDADDisabled(t *testing.T) {
+	const nicID = 1
+	ndpDisp := ndpDispatcher{
+		dadC: make(chan ndpDADEvent, 1),
+	}
+	opts := stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPDisp:          &ndpDisp,
+	}
+
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(opts)
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
+	}
+
+	// Should get the address immediately since we should not have performed
+	// DAD on it.
+	select {
+	case e := <-ndpDisp.dadC:
+		if diff := checkDADEvent(e, nicID, addr1, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+		}
+	default:
+		t.Fatal("expected DAD event")
+	}
+	addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("stack.GetMainNICAddress(%d, %d) err = %s", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if addr.Address != addr1 {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, addr, addr1)
+	}
+
+	// We should not have sent any NDP NS messages.
+	if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != 0 {
+		t.Fatalf("got NeighborSolicit = %d, want = 0", got)
+	}
+}
+
 // TestDADResolve tests that an address successfully resolves after performing
 // DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
 // Included in the subtests is a test to make sure that an invalid
@@ -381,17 +400,8 @@ func TestDADResolve(t *testing.T) {
 				// means something is wrong.
 				t.Fatal("timed out waiting for DAD resolution")
 			case e := <-ndpDisp.dadC:
-				if e.err != nil {
-					t.Fatal("got DAD error: ", e.err)
-				}
-				if e.nicID != nicID {
-					t.Fatalf("got DAD event w/ nicID = %d, want = %d", e.nicID, nicID)
-				}
-				if e.addr != addr1 {
-					t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
-				}
-				if !e.resolved {
-					t.Fatal("got DAD event w/ resolved = false, want = true")
+				if diff := checkDADEvent(e, nicID, addr1, true, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
 				}
 			}
 			addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
@@ -445,6 +455,8 @@ func TestDADResolve(t *testing.T) {
 // a node doing DAD for the same address), or if another node is detected to own
 // the address already (receive an NA message for the tentative address).
 func TestDADFail(t *testing.T) {
+	const nicID = 1
+
 	tests := []struct {
 		name    string
 		makeBuf func(tgt tcpip.Address) buffer.Prependable
@@ -526,22 +538,22 @@ func TestDADFail(t *testing.T) {
 
 			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(opts)
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
 
-			if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
-				t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+			if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
 			}
 
 			// Address should not be considered bound to the NIC yet
 			// (DAD ongoing).
-			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 			}
 			if want := (tcpip.AddressWithPrefix{}); addr != want {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 			}
 
 			// Receive a packet to simulate multiple nodes owning or
@@ -565,25 +577,16 @@ func TestDADFail(t *testing.T) {
 				// something is wrong.
 				t.Fatal("timed out waiting for DAD failure")
 			case e := <-ndpDisp.dadC:
-				if e.err != nil {
-					t.Fatal("got DAD error: ", e.err)
-				}
-				if e.nicID != 1 {
-					t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
-				}
-				if e.addr != addr1 {
-					t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
-				}
-				if e.resolved {
-					t.Fatal("got DAD event w/ resolved = true, want = false")
+				if diff := checkDADEvent(e, nicID, addr1, false, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
 				}
 			}
-			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 			}
 			if want := (tcpip.AddressWithPrefix{}); addr != want {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 			}
 		})
 	}
@@ -592,6 +595,8 @@ func TestDADFail(t *testing.T) {
 // TestDADStop tests to make sure that the DAD process stops when an address is
 // removed.
 func TestDADStop(t *testing.T) {
+	const nicID = 1
+
 	ndpDisp := ndpDispatcher{
 		dadC: make(chan ndpDADEvent, 1),
 	}
@@ -607,26 +612,26 @@ func TestDADStop(t *testing.T) {
 
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(opts)
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(_) = %s", err)
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 	}
 
-	if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
-		t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+	if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
 	}
 
 	// Address should not be considered bound to the NIC yet (DAD ongoing).
-	addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 	}
 	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 	}
 
 	// Remove the address. This should stop DAD.
-	if err := s.RemoveAddress(1, addr1); err != nil {
-		t.Fatalf("RemoveAddress(_, %s) = %s", addr1, err)
+	if err := s.RemoveAddress(nicID, addr1); err != nil {
+		t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr1, err)
 	}
 
 	// Wait for DAD to fail (since the address was removed during DAD).
@@ -636,26 +641,16 @@ func TestDADStop(t *testing.T) {
 		// time + extra 1s buffer, something is wrong.
 		t.Fatal("timed out waiting for DAD failure")
 	case e := <-ndpDisp.dadC:
-		if e.err != nil {
-			t.Fatal("got DAD error: ", e.err)
-		}
-		if e.nicID != 1 {
-			t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
-		}
-		if e.addr != addr1 {
-			t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
+		if diff := checkDADEvent(e, nicID, addr1, false, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
 		}
-		if e.resolved {
-			t.Fatal("got DAD event w/ resolved = true, want = false")
-		}
-
 	}
-	addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 	}
 	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 	}
 
 	// Should not have sent more than 1 NS message.
@@ -681,6 +676,10 @@ func TestSetNDPConfigurationFailsForBadNICID(t *testing.T) {
 // configurations without affecting the default NDP configurations or other
 // interfaces' configurations.
 func TestSetNDPConfigurations(t *testing.T) {
+	const nicID1 = 1
+	const nicID2 = 2
+	const nicID3 = 3
+
 	tests := []struct {
 		name                    string
 		dupAddrDetectTransmits  uint8
@@ -704,7 +703,7 @@ func TestSetNDPConfigurations(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			ndpDisp := ndpDispatcher{
-				dadC: make(chan ndpDADEvent),
+				dadC: make(chan ndpDADEvent, 1),
 			}
 			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
@@ -712,17 +711,28 @@ func TestSetNDPConfigurations(t *testing.T) {
 				NDPDisp:          &ndpDisp,
 			})
 
+			expectDADEvent := func(nicID tcpip.NICID, addr tcpip.Address) {
+				select {
+				case e := <-ndpDisp.dadC:
+					if diff := checkDADEvent(e, nicID, addr, true, nil); diff != "" {
+						t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatalf("expected DAD event for %s", addr)
+				}
+			}
+
 			// This NIC(1)'s NDP configurations will be updated to
 			// be different from the default.
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(1) = %s", err)
+			if err := s.CreateNIC(nicID1, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
 			}
 
 			// Created before updating NIC(1)'s NDP configurations
 			// but updating NIC(1)'s NDP configurations should not
 			// affect other existing NICs.
-			if err := s.CreateNIC(2, e); err != nil {
-				t.Fatalf("CreateNIC(2) = %s", err)
+			if err := s.CreateNIC(nicID2, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
 			}
 
 			// Update the NDP configurations on NIC(1) to use DAD.
@@ -730,36 +740,38 @@ func TestSetNDPConfigurations(t *testing.T) {
 				DupAddrDetectTransmits: test.dupAddrDetectTransmits,
 				RetransmitTimer:        test.retransmitTimer,
 			}
-			if err := s.SetNDPConfigurations(1, configs); err != nil {
-				t.Fatalf("got SetNDPConfigurations(1, _) = %s", err)
+			if err := s.SetNDPConfigurations(nicID1, configs); err != nil {
+				t.Fatalf("got SetNDPConfigurations(%d, _) = %s", nicID1, err)
 			}
 
 			// Created after updating NIC(1)'s NDP configurations
 			// but the stack's default NDP configurations should not
 			// have been updated.
-			if err := s.CreateNIC(3, e); err != nil {
-				t.Fatalf("CreateNIC(3) = %s", err)
+			if err := s.CreateNIC(nicID3, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID3, err)
 			}
 
 			// Add addresses for each NIC.
-			if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
-				t.Fatalf("AddAddress(1, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+			if err := s.AddAddress(nicID1, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID1, header.IPv6ProtocolNumber, addr1, err)
 			}
-			if err := s.AddAddress(2, header.IPv6ProtocolNumber, addr2); err != nil {
-				t.Fatalf("AddAddress(2, %d, %s) = %s", header.IPv6ProtocolNumber, addr2, err)
+			if err := s.AddAddress(nicID2, header.IPv6ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID2, header.IPv6ProtocolNumber, addr2, err)
 			}
-			if err := s.AddAddress(3, header.IPv6ProtocolNumber, addr3); err != nil {
-				t.Fatalf("AddAddress(3, %d, %s) = %s", header.IPv6ProtocolNumber, addr3, err)
+			expectDADEvent(nicID2, addr2)
+			if err := s.AddAddress(nicID3, header.IPv6ProtocolNumber, addr3); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID3, header.IPv6ProtocolNumber, addr3, err)
 			}
+			expectDADEvent(nicID3, addr3)
 
 			// Address should not be considered bound to NIC(1) yet
 			// (DAD ongoing).
-			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err := s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err)
 			}
 			if want := (tcpip.AddressWithPrefix{}); addr != want {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID1, header.IPv6ProtocolNumber, addr, want)
 			}
 
 			// Should get the address on NIC(2) and NIC(3)
@@ -767,31 +779,31 @@ func TestSetNDPConfigurations(t *testing.T) {
 			// it as the stack was configured to not do DAD by
 			// default and we only updated the NDP configurations on
 			// NIC(1).
-			addr, err = s.GetMainNICAddress(2, header.IPv6ProtocolNumber)
+			addr, err = s.GetMainNICAddress(nicID2, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("stack.GetMainNICAddress(2, _) err = %s", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID2, header.IPv6ProtocolNumber, err)
 			}
 			if addr.Address != addr2 {
-				t.Fatalf("got stack.GetMainNICAddress(2, _) = %s, want = %s", addr, addr2)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID2, header.IPv6ProtocolNumber, addr, addr2)
 			}
-			addr, err = s.GetMainNICAddress(3, header.IPv6ProtocolNumber)
+			addr, err = s.GetMainNICAddress(nicID3, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("stack.GetMainNICAddress(3, _) err = %s", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID3, header.IPv6ProtocolNumber, err)
 			}
 			if addr.Address != addr3 {
-				t.Fatalf("got stack.GetMainNICAddress(3, _) = %s, want = %s", addr, addr3)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID3, header.IPv6ProtocolNumber, addr, addr3)
 			}
 
 			// Sleep until right (500ms before) before resolution to
 			// make sure the address didn't resolve on NIC(1) yet.
 			const delta = 500 * time.Millisecond
 			time.Sleep(time.Duration(test.dupAddrDetectTransmits)*test.expectedRetransmitTimer - delta)
-			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err = s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err)
 			}
 			if want := (tcpip.AddressWithPrefix{}); addr != want {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID1, header.IPv6ProtocolNumber, addr, want)
 			}
 
 			// Wait for DAD to resolve.
@@ -805,25 +817,16 @@ func TestSetNDPConfigurations(t *testing.T) {
 				// means something is wrong.
 				t.Fatal("timed out waiting for DAD resolution")
 			case e := <-ndpDisp.dadC:
-				if e.err != nil {
-					t.Fatal("got DAD error: ", e.err)
-				}
-				if e.nicID != 1 {
-					t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
-				}
-				if e.addr != addr1 {
-					t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
-				}
-				if !e.resolved {
-					t.Fatal("got DAD event w/ resolved = false, want = true")
+				if diff := checkDADEvent(e, nicID1, addr1, true, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
 				}
 			}
-			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err = s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("stack.GetMainNICAddress(1, _) err = %s", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err)
 			}
 			if addr.Address != addr1 {
-				t.Fatalf("got stack.GetMainNICAddress(1, _) = %s, want = %s", addr, addr1)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID1, header.IPv6ProtocolNumber, addr, addr1)
 			}
 		})
 	}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index b2c1763bf..24133e6f2 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -2184,6 +2184,8 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) {
 // TestNICAutoGenAddrDoesDAD tests that the successful auto-generation of IPv6
 // link-local addresses will only be assigned after the DAD process resolves.
 func TestNICAutoGenAddrDoesDAD(t *testing.T) {
+	const nicID = 1
+
 	ndpDisp := ndpDispatcher{
 		dadC: make(chan ndpDADEvent),
 	}
@@ -2197,18 +2199,18 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) {
 
 	e := channel.New(int(ndpConfigs.DupAddrDetectTransmits), 1280, linkAddr1)
 	s := stack.New(opts)
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(_) = %s", err)
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 	}
 
 	// Address should not be considered bound to the
 	// NIC yet (DAD ongoing).
-	addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 	}
 	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 	}
 
 	linkLocalAddr := header.LinkLocalAddr(linkAddr1)
@@ -2222,25 +2224,16 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) {
 		// means something is wrong.
 		t.Fatal("timed out waiting for DAD resolution")
 	case e := <-ndpDisp.dadC:
-		if e.err != nil {
-			t.Fatal("got DAD error: ", e.err)
-		}
-		if e.nicID != 1 {
-			t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
-		}
-		if e.addr != linkLocalAddr {
-			t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, linkLocalAddr)
-		}
-		if !e.resolved {
-			t.Fatal("got DAD event w/ resolved = false, want = true")
+		if diff := checkDADEvent(e, nicID, linkLocalAddr, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
 		}
 	}
-	addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 	if err != nil {
-		t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 	}
 	if want := (tcpip.AddressWithPrefix{Address: linkLocalAddr, PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 	}
 }
 
@@ -2606,17 +2599,8 @@ func TestDoDADWhenNICEnabled(t *testing.T) {
 	case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout):
 		t.Fatal("timed out waiting for DAD resolution")
 	case e := <-ndpDisp.dadC:
-		if e.err != nil {
-			t.Fatal("got DAD error: ", e.err)
-		}
-		if e.nicID != nicID {
-			t.Fatalf("got DAD event w/ nicID = %d, want = %d", e.nicID, nicID)
-		}
-		if e.addr != addr.AddressWithPrefix.Address {
-			t.Fatalf("got DAD event w/ addr = %s, want = %s", e.addr, addr.AddressWithPrefix.Address)
-		}
-		if !e.resolved {
-			t.Fatal("got DAD event w/ resolved = false, want = true")
+		if diff := checkDADEvent(e, nicID, addr.AddressWithPrefix.Address, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
 		}
 	}
 	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
-- 
cgit v1.2.3


From 6de49546cb32806896cec27d3ab76e96323ecac1 Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Fri, 7 Feb 2020 13:18:19 -0800
Subject: Refactor syscall tests

- Move shared helpers V4Multicast and V4Broadcast to socket_test_util
- Add unnamed namespace so socket_ipv4_tcp_unbound_external_networking_test.cc
  and socket_ipv4_udp_unbound_external_networking_test.cc can be compiled
  together
- Add test files to "exports_files" so they can be included by Fuchsia's syscall
  test setup

PiperOrigin-RevId: 293880429
---
 test/syscalls/linux/BUILD                           |  3 +++
 ...ket_ipv4_tcp_unbound_external_networking_test.cc |  3 +++
 test/syscalls/linux/socket_ipv4_udp_unbound.cc      | 21 ---------------------
 .../socket_ipv4_udp_unbound_external_networking.cc  | 20 --------------------
 ...ket_ipv4_udp_unbound_external_networking_test.cc |  3 +++
 test/syscalls/linux/socket_test_util.cc             | 18 ++++++++++++++++++
 test/syscalls/linux/socket_test_util.h              |  5 +++++
 7 files changed, 32 insertions(+), 41 deletions(-)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index f2e3c7072..12d389c3e 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -12,6 +12,9 @@ exports_files(
         "socket_ip_loopback_blocking.cc",
         "socket_ip_tcp_loopback.cc",
         "socket_ip_udp_loopback.cc",
+        "socket_ip_unbound.cc",
+        "socket_ipv4_tcp_unbound_external_networking_test.cc",
+        "socket_ipv4_udp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
         "tcp_socket.cc",
         "udp_socket.cc",
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
index 3ac790873..797c4174e 100644
--- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
@@ -22,6 +22,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketKind> GetSockets() {
   return ApplyVec<SocketKind>(
@@ -32,5 +33,7 @@ std::vector<SocketKind> GetSockets() {
 INSTANTIATE_TEST_SUITE_P(IPv4TCPUnboundSockets,
                          IPv4TCPUnboundExternalNetworkingSocketTest,
                          ::testing::ValuesIn(GetSockets()));
+
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index aa6fb4e3f..990ccf23c 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -30,27 +30,6 @@
 namespace gvisor {
 namespace testing {
 
-constexpr char kMulticastAddress[] = "224.0.2.1";
-constexpr char kBroadcastAddress[] = "255.255.255.255";
-
-TestAddress V4Multicast() {
-  TestAddress t("V4Multicast");
-  t.addr.ss_family = AF_INET;
-  t.addr_len = sizeof(sockaddr_in);
-  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
-      inet_addr(kMulticastAddress);
-  return t;
-}
-
-TestAddress V4Broadcast() {
-  TestAddress t("V4Broadcast");
-  t.addr.ss_family = AF_INET;
-  t.addr_len = sizeof(sockaddr_in);
-  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
-      inet_addr(kBroadcastAddress);
-  return t;
-}
-
 // Check that packets are not received without a group membership. Default send
 // interface configured by bind.
 TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNoGroup) {
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
index 98ae414f3..40e673625 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
@@ -41,26 +41,6 @@ TestAddress V4EmptyAddress() {
   return t;
 }
 
-constexpr char kMulticastAddress[] = "224.0.2.1";
-
-TestAddress V4Multicast() {
-  TestAddress t("V4Multicast");
-  t.addr.ss_family = AF_INET;
-  t.addr_len = sizeof(sockaddr_in);
-  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
-      inet_addr(kMulticastAddress);
-  return t;
-}
-
-TestAddress V4Broadcast() {
-  TestAddress t("V4Broadcast");
-  t.addr.ss_family = AF_INET;
-  t.addr_len = sizeof(sockaddr_in);
-  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
-      htonl(INADDR_BROADCAST);
-  return t;
-}
-
 void IPv4UDPUnboundExternalNetworkingSocketTest::SetUp() {
   got_if_infos_ = false;
 
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
index 8f47952b0..f6e64c157 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
@@ -22,6 +22,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketKind> GetSockets() {
   return ApplyVec<SocketKind>(
@@ -32,5 +33,7 @@ std::vector<SocketKind> GetSockets() {
 INSTANTIATE_TEST_SUITE_P(IPv4UDPUnboundSockets,
                          IPv4UDPUnboundExternalNetworkingSocketTest,
                          ::testing::ValuesIn(GetSockets()));
+
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
index c0c5ab3fe..5d3a39868 100644
--- a/test/syscalls/linux/socket_test_util.cc
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -805,6 +805,24 @@ TestAddress V4MappedLoopback() {
   return t;
 }
 
+TestAddress V4Multicast() {
+  TestAddress t("V4Multicast");
+  t.addr.ss_family = AF_INET;
+  t.addr_len = sizeof(sockaddr_in);
+  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
+      inet_addr(kMulticastAddress);
+  return t;
+}
+
+TestAddress V4Broadcast() {
+  TestAddress t("V4Broadcast");
+  t.addr.ss_family = AF_INET;
+  t.addr_len = sizeof(sockaddr_in);
+  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
+      htonl(INADDR_BROADCAST);
+  return t;
+}
+
 TestAddress V6Any() {
   TestAddress t("V6Any");
   t.addr.ss_family = AF_INET6;
diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h
index bfaa6e397..734b48b96 100644
--- a/test/syscalls/linux/socket_test_util.h
+++ b/test/syscalls/linux/socket_test_util.h
@@ -484,10 +484,15 @@ struct TestAddress {
       : description(std::move(description)), addr(), addr_len() {}
 };
 
+constexpr char kMulticastAddress[] = "224.0.2.1";
+constexpr char kBroadcastAddress[] = "255.255.255.255";
+
 TestAddress V4Any();
+TestAddress V4Broadcast();
 TestAddress V4Loopback();
 TestAddress V4MappedAny();
 TestAddress V4MappedLoopback();
+TestAddress V4Multicast();
 TestAddress V6Any();
 TestAddress V6Loopback();
 
-- 
cgit v1.2.3


From c141eb5f430dc50f6bf90232c369b7b3a542155e Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 7 Feb 2020 13:47:57 -0800
Subject: Address GH comments.

---
 pkg/abi/linux/netfilter.go                | 2 +-
 pkg/sentry/socket/netfilter/extensions.go | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index e4aabb6bb..7363185b7 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -363,7 +363,7 @@ type XTTCP struct {
 	// range to which the matcher applies.
 	DestinationPortStart uint16
 
-	// DestinationPortEnd specifies the start of the destination port
+	// DestinationPortEnd specifies the end of the destination port
 	// range to which the matcher applies.
 	DestinationPortEnd uint16
 
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index b5fbb52e4..3082976cd 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -1,4 +1,4 @@
-// Copyright 2019 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
-- 
cgit v1.2.3


From e1587a28876f8aac689a2cd1b7630f1637655b58 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Fri, 7 Feb 2020 14:00:50 -0800
Subject: Log level, optname, optval and optlen in getsockopt/setsockopt in
 strace.

Log 8, 16, and 32 int optvals and dump the memory of other sizes.

Updates #1782

PiperOrigin-RevId: 293889388
---
 pkg/sentry/strace/linux64_amd64.go |   4 +-
 pkg/sentry/strace/socket.go        | 215 +++++++++++++++++++++++++++++++++++++
 pkg/sentry/strace/strace.go        |  22 +++-
 pkg/sentry/strace/syscalls.go      |  22 +++-
 4 files changed, 258 insertions(+), 5 deletions(-)

diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go
index 85ec66fd3..a4de545e9 100644
--- a/pkg/sentry/strace/linux64_amd64.go
+++ b/pkg/sentry/strace/linux64_amd64.go
@@ -78,8 +78,8 @@ var linuxAMD64 = SyscallMap{
 	51:  makeSyscallInfo("getsockname", FD, PostSockAddr, SockLen),
 	52:  makeSyscallInfo("getpeername", FD, PostSockAddr, SockLen),
 	53:  makeSyscallInfo("socketpair", SockFamily, SockType, SockProtocol, Hex),
-	54:  makeSyscallInfo("setsockopt", FD, Hex, Hex, Hex, Hex),
-	55:  makeSyscallInfo("getsockopt", FD, Hex, Hex, Hex, Hex),
+	54:  makeSyscallInfo("setsockopt", FD, SockOptLevel, SockOptName, SetSockOptVal, Hex /* length by value, not a pointer */),
+	55:  makeSyscallInfo("getsockopt", FD, SockOptLevel, SockOptName, GetSockOptVal, SockLen),
 	56:  makeSyscallInfo("clone", CloneFlags, Hex, Hex, Hex, Hex),
 	57:  makeSyscallInfo("fork"),
 	58:  makeSyscallInfo("vfork"),
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index d2079c85f..f7ff4573e 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -419,3 +419,218 @@ func sockFlags(flags int32) string {
 	}
 	return SocketFlagSet.Parse(uint64(flags))
 }
+
+func getSockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, optLen usermem.Addr, maximumBlobSize uint, rval uintptr) string {
+	if int(rval) < 0 {
+		return hexNum(uint64(optVal))
+	}
+	if optVal == 0 {
+		return "null"
+	}
+	l, err := copySockLen(t, optLen)
+	if err != nil {
+		return fmt.Sprintf("%#x {error reading length: %v}", optLen, err)
+	}
+	return sockOptVal(t, level, optname, optVal, uint64(l), maximumBlobSize)
+}
+
+func sockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, optLen uint64, maximumBlobSize uint) string {
+	switch optLen {
+	case 1:
+		var v uint8
+		_, err := t.CopyIn(optVal, &v)
+		if err != nil {
+			return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
+		}
+		return fmt.Sprintf("%#x {value=%v}", optVal, v)
+	case 2:
+		var v uint16
+		_, err := t.CopyIn(optVal, &v)
+		if err != nil {
+			return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
+		}
+		return fmt.Sprintf("%#x {value=%v}", optVal, v)
+	case 4:
+		var v uint32
+		_, err := t.CopyIn(optVal, &v)
+		if err != nil {
+			return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
+		}
+		return fmt.Sprintf("%#x {value=%v}", optVal, v)
+	default:
+		return dump(t, optVal, uint(optLen), maximumBlobSize)
+	}
+}
+
+var sockOptLevels = abi.ValueSet{
+	linux.SOL_IP:      "SOL_IP",
+	linux.SOL_SOCKET:  "SOL_SOCKET",
+	linux.SOL_TCP:     "SOL_TCP",
+	linux.SOL_UDP:     "SOL_UDP",
+	linux.SOL_IPV6:    "SOL_IPV6",
+	linux.SOL_ICMPV6:  "SOL_ICMPV6",
+	linux.SOL_RAW:     "SOL_RAW",
+	linux.SOL_PACKET:  "SOL_PACKET",
+	linux.SOL_NETLINK: "SOL_NETLINK",
+}
+
+var sockOptNames = map[uint64]abi.ValueSet{
+	linux.SOL_IP: {
+		linux.IP_TTL:                    "IP_TTL",
+		linux.IP_MULTICAST_TTL:          "IP_MULTICAST_TTL",
+		linux.IP_MULTICAST_IF:           "IP_MULTICAST_IF",
+		linux.IP_MULTICAST_LOOP:         "IP_MULTICAST_LOOP",
+		linux.IP_TOS:                    "IP_TOS",
+		linux.IP_RECVTOS:                "IP_RECVTOS",
+		linux.IPT_SO_GET_INFO:           "IPT_SO_GET_INFO",
+		linux.IPT_SO_GET_ENTRIES:        "IPT_SO_GET_ENTRIES",
+		linux.IP_ADD_MEMBERSHIP:         "IP_ADD_MEMBERSHIP",
+		linux.IP_DROP_MEMBERSHIP:        "IP_DROP_MEMBERSHIP",
+		linux.MCAST_JOIN_GROUP:          "MCAST_JOIN_GROUP",
+		linux.IP_ADD_SOURCE_MEMBERSHIP:  "IP_ADD_SOURCE_MEMBERSHIP",
+		linux.IP_BIND_ADDRESS_NO_PORT:   "IP_BIND_ADDRESS_NO_PORT",
+		linux.IP_BLOCK_SOURCE:           "IP_BLOCK_SOURCE",
+		linux.IP_CHECKSUM:               "IP_CHECKSUM",
+		linux.IP_DROP_SOURCE_MEMBERSHIP: "IP_DROP_SOURCE_MEMBERSHIP",
+		linux.IP_FREEBIND:               "IP_FREEBIND",
+		linux.IP_HDRINCL:                "IP_HDRINCL",
+		linux.IP_IPSEC_POLICY:           "IP_IPSEC_POLICY",
+		linux.IP_MINTTL:                 "IP_MINTTL",
+		linux.IP_MSFILTER:               "IP_MSFILTER",
+		linux.IP_MTU_DISCOVER:           "IP_MTU_DISCOVER",
+		linux.IP_MULTICAST_ALL:          "IP_MULTICAST_ALL",
+		linux.IP_NODEFRAG:               "IP_NODEFRAG",
+		linux.IP_OPTIONS:                "IP_OPTIONS",
+		linux.IP_PASSSEC:                "IP_PASSSEC",
+		linux.IP_PKTINFO:                "IP_PKTINFO",
+		linux.IP_RECVERR:                "IP_RECVERR",
+		linux.IP_RECVFRAGSIZE:           "IP_RECVFRAGSIZE",
+		linux.IP_RECVOPTS:               "IP_RECVOPTS",
+		linux.IP_RECVORIGDSTADDR:        "IP_RECVORIGDSTADDR",
+		linux.IP_RECVTTL:                "IP_RECVTTL",
+		linux.IP_RETOPTS:                "IP_RETOPTS",
+		linux.IP_TRANSPARENT:            "IP_TRANSPARENT",
+		linux.IP_UNBLOCK_SOURCE:         "IP_UNBLOCK_SOURCE",
+		linux.IP_UNICAST_IF:             "IP_UNICAST_IF",
+		linux.IP_XFRM_POLICY:            "IP_XFRM_POLICY",
+		linux.MCAST_BLOCK_SOURCE:        "MCAST_BLOCK_SOURCE",
+		linux.MCAST_JOIN_SOURCE_GROUP:   "MCAST_JOIN_SOURCE_GROUP",
+		linux.MCAST_LEAVE_GROUP:         "MCAST_LEAVE_GROUP",
+		linux.MCAST_LEAVE_SOURCE_GROUP:  "MCAST_LEAVE_SOURCE_GROUP",
+		linux.MCAST_MSFILTER:            "MCAST_MSFILTER",
+		linux.MCAST_UNBLOCK_SOURCE:      "MCAST_UNBLOCK_SOURCE",
+		linux.IP_ROUTER_ALERT:           "IP_ROUTER_ALERT",
+		linux.IP_PKTOPTIONS:             "IP_PKTOPTIONS",
+		linux.IP_MTU:                    "IP_MTU",
+	},
+	linux.SOL_SOCKET: {
+		linux.SO_ERROR:        "SO_ERROR",
+		linux.SO_PEERCRED:     "SO_PEERCRED",
+		linux.SO_PASSCRED:     "SO_PASSCRED",
+		linux.SO_SNDBUF:       "SO_SNDBUF",
+		linux.SO_RCVBUF:       "SO_RCVBUF",
+		linux.SO_REUSEADDR:    "SO_REUSEADDR",
+		linux.SO_REUSEPORT:    "SO_REUSEPORT",
+		linux.SO_BINDTODEVICE: "SO_BINDTODEVICE",
+		linux.SO_BROADCAST:    "SO_BROADCAST",
+		linux.SO_KEEPALIVE:    "SO_KEEPALIVE",
+		linux.SO_LINGER:       "SO_LINGER",
+		linux.SO_SNDTIMEO:     "SO_SNDTIMEO",
+		linux.SO_RCVTIMEO:     "SO_RCVTIMEO",
+		linux.SO_OOBINLINE:    "SO_OOBINLINE",
+		linux.SO_TIMESTAMP:    "SO_TIMESTAMP",
+	},
+	linux.SOL_TCP: {
+		linux.TCP_NODELAY:              "TCP_NODELAY",
+		linux.TCP_CORK:                 "TCP_CORK",
+		linux.TCP_QUICKACK:             "TCP_QUICKACK",
+		linux.TCP_MAXSEG:               "TCP_MAXSEG",
+		linux.TCP_KEEPIDLE:             "TCP_KEEPIDLE",
+		linux.TCP_KEEPINTVL:            "TCP_KEEPINTVL",
+		linux.TCP_USER_TIMEOUT:         "TCP_USER_TIMEOUT",
+		linux.TCP_INFO:                 "TCP_INFO",
+		linux.TCP_CC_INFO:              "TCP_CC_INFO",
+		linux.TCP_NOTSENT_LOWAT:        "TCP_NOTSENT_LOWAT",
+		linux.TCP_ZEROCOPY_RECEIVE:     "TCP_ZEROCOPY_RECEIVE",
+		linux.TCP_CONGESTION:           "TCP_CONGESTION",
+		linux.TCP_LINGER2:              "TCP_LINGER2",
+		linux.TCP_DEFER_ACCEPT:         "TCP_DEFER_ACCEPT",
+		linux.TCP_REPAIR_OPTIONS:       "TCP_REPAIR_OPTIONS",
+		linux.TCP_INQ:                  "TCP_INQ",
+		linux.TCP_FASTOPEN:             "TCP_FASTOPEN",
+		linux.TCP_FASTOPEN_CONNECT:     "TCP_FASTOPEN_CONNECT",
+		linux.TCP_FASTOPEN_KEY:         "TCP_FASTOPEN_KEY",
+		linux.TCP_FASTOPEN_NO_COOKIE:   "TCP_FASTOPEN_NO_COOKIE",
+		linux.TCP_KEEPCNT:              "TCP_KEEPCNT",
+		linux.TCP_QUEUE_SEQ:            "TCP_QUEUE_SEQ",
+		linux.TCP_REPAIR:               "TCP_REPAIR",
+		linux.TCP_REPAIR_QUEUE:         "TCP_REPAIR_QUEUE",
+		linux.TCP_REPAIR_WINDOW:        "TCP_REPAIR_WINDOW",
+		linux.TCP_SAVED_SYN:            "TCP_SAVED_SYN",
+		linux.TCP_SAVE_SYN:             "TCP_SAVE_SYN",
+		linux.TCP_SYNCNT:               "TCP_SYNCNT",
+		linux.TCP_THIN_DUPACK:          "TCP_THIN_DUPACK",
+		linux.TCP_THIN_LINEAR_TIMEOUTS: "TCP_THIN_LINEAR_TIMEOUTS",
+		linux.TCP_TIMESTAMP:            "TCP_TIMESTAMP",
+		linux.TCP_ULP:                  "TCP_ULP",
+		linux.TCP_WINDOW_CLAMP:         "TCP_WINDOW_CLAMP",
+	},
+	linux.SOL_IPV6: {
+		linux.IPV6_V6ONLY:              "IPV6_V6ONLY",
+		linux.IPV6_PATHMTU:             "IPV6_PATHMTU",
+		linux.IPV6_TCLASS:              "IPV6_TCLASS",
+		linux.IPV6_ADD_MEMBERSHIP:      "IPV6_ADD_MEMBERSHIP",
+		linux.IPV6_DROP_MEMBERSHIP:     "IPV6_DROP_MEMBERSHIP",
+		linux.IPV6_IPSEC_POLICY:        "IPV6_IPSEC_POLICY",
+		linux.IPV6_JOIN_ANYCAST:        "IPV6_JOIN_ANYCAST",
+		linux.IPV6_LEAVE_ANYCAST:       "IPV6_LEAVE_ANYCAST",
+		linux.IPV6_PKTINFO:             "IPV6_PKTINFO",
+		linux.IPV6_ROUTER_ALERT:        "IPV6_ROUTER_ALERT",
+		linux.IPV6_XFRM_POLICY:         "IPV6_XFRM_POLICY",
+		linux.MCAST_BLOCK_SOURCE:       "MCAST_BLOCK_SOURCE",
+		linux.MCAST_JOIN_GROUP:         "MCAST_JOIN_GROUP",
+		linux.MCAST_JOIN_SOURCE_GROUP:  "MCAST_JOIN_SOURCE_GROUP",
+		linux.MCAST_LEAVE_GROUP:        "MCAST_LEAVE_GROUP",
+		linux.MCAST_LEAVE_SOURCE_GROUP: "MCAST_LEAVE_SOURCE_GROUP",
+		linux.MCAST_UNBLOCK_SOURCE:     "MCAST_UNBLOCK_SOURCE",
+		linux.IPV6_2292DSTOPTS:         "IPV6_2292DSTOPTS",
+		linux.IPV6_2292HOPLIMIT:        "IPV6_2292HOPLIMIT",
+		linux.IPV6_2292HOPOPTS:         "IPV6_2292HOPOPTS",
+		linux.IPV6_2292PKTINFO:         "IPV6_2292PKTINFO",
+		linux.IPV6_2292PKTOPTIONS:      "IPV6_2292PKTOPTIONS",
+		linux.IPV6_2292RTHDR:           "IPV6_2292RTHDR",
+		linux.IPV6_ADDR_PREFERENCES:    "IPV6_ADDR_PREFERENCES",
+		linux.IPV6_AUTOFLOWLABEL:       "IPV6_AUTOFLOWLABEL",
+		linux.IPV6_DONTFRAG:            "IPV6_DONTFRAG",
+		linux.IPV6_DSTOPTS:             "IPV6_DSTOPTS",
+		linux.IPV6_FLOWINFO:            "IPV6_FLOWINFO",
+		linux.IPV6_FLOWINFO_SEND:       "IPV6_FLOWINFO_SEND",
+		linux.IPV6_FLOWLABEL_MGR:       "IPV6_FLOWLABEL_MGR",
+		linux.IPV6_FREEBIND:            "IPV6_FREEBIND",
+		linux.IPV6_HOPOPTS:             "IPV6_HOPOPTS",
+		linux.IPV6_MINHOPCOUNT:         "IPV6_MINHOPCOUNT",
+		linux.IPV6_MTU:                 "IPV6_MTU",
+		linux.IPV6_MTU_DISCOVER:        "IPV6_MTU_DISCOVER",
+		linux.IPV6_MULTICAST_ALL:       "IPV6_MULTICAST_ALL",
+		linux.IPV6_MULTICAST_HOPS:      "IPV6_MULTICAST_HOPS",
+		linux.IPV6_MULTICAST_IF:        "IPV6_MULTICAST_IF",
+		linux.IPV6_MULTICAST_LOOP:      "IPV6_MULTICAST_LOOP",
+		linux.IPV6_RECVDSTOPTS:         "IPV6_RECVDSTOPTS",
+		linux.IPV6_RECVERR:             "IPV6_RECVERR",
+		linux.IPV6_RECVFRAGSIZE:        "IPV6_RECVFRAGSIZE",
+		linux.IPV6_RECVHOPLIMIT:        "IPV6_RECVHOPLIMIT",
+		linux.IPV6_RECVHOPOPTS:         "IPV6_RECVHOPOPTS",
+		linux.IPV6_RECVORIGDSTADDR:     "IPV6_RECVORIGDSTADDR",
+		linux.IPV6_RECVPATHMTU:         "IPV6_RECVPATHMTU",
+		linux.IPV6_RECVPKTINFO:         "IPV6_RECVPKTINFO",
+		linux.IPV6_RECVRTHDR:           "IPV6_RECVRTHDR",
+		linux.IPV6_RECVTCLASS:          "IPV6_RECVTCLASS",
+		linux.IPV6_RTHDR:               "IPV6_RTHDR",
+		linux.IPV6_RTHDRDSTOPTS:        "IPV6_RTHDRDSTOPTS",
+		linux.IPV6_TRANSPARENT:         "IPV6_TRANSPARENT",
+		linux.IPV6_UNICAST_HOPS:        "IPV6_UNICAST_HOPS",
+		linux.IPV6_UNICAST_IF:          "IPV6_UNICAST_IF",
+		linux.MCAST_MSFILTER:           "MCAST_MSFILTER",
+		linux.IPV6_ADDRFORM:            "IPV6_ADDRFORM",
+	},
+}
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 3fc4a47fc..a796b2396 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -55,6 +55,14 @@ var ItimerTypes = abi.ValueSet{
 	linux.ITIMER_PROF:    "ITIMER_PROF",
 }
 
+func hexNum(num uint64) string {
+	return "0x" + strconv.FormatUint(num, 16)
+}
+
+func hexArg(arg arch.SyscallArgument) string {
+	return hexNum(arg.Uint64())
+}
+
 func iovecs(t *kernel.Task, addr usermem.Addr, iovcnt int, printContent bool, maxBytes uint64) string {
 	if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV {
 		return fmt.Sprintf("%#x (error decoding iovecs: invalid iovcnt)", addr)
@@ -389,6 +397,12 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo
 			output = append(output, path(t, args[arg].Pointer()))
 		case ExecveStringVector:
 			output = append(output, stringVector(t, args[arg].Pointer()))
+		case SetSockOptVal:
+			output = append(output, sockOptVal(t, args[arg-2].Uint64() /* level */, args[arg-1].Uint64() /* optName */, args[arg].Pointer() /* optVal */, args[arg+1].Uint64() /* optLen */, maximumBlobSize))
+		case SockOptLevel:
+			output = append(output, sockOptLevels.Parse(args[arg].Uint64()))
+		case SockOptName:
+			output = append(output, sockOptNames[args[arg-1].Uint64() /* level */].Parse(args[arg].Uint64()))
 		case SockAddr:
 			output = append(output, sockAddr(t, args[arg].Pointer(), uint32(args[arg+1].Uint64())))
 		case SockLen:
@@ -446,7 +460,7 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo
 		case Hex:
 			fallthrough
 		default:
-			output = append(output, "0x"+strconv.FormatUint(args[arg].Uint64(), 16))
+			output = append(output, hexArg(args[arg]))
 		}
 	}
 
@@ -507,6 +521,12 @@ func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uint
 			output[arg] = capData(t, args[arg-1].Pointer(), args[arg].Pointer())
 		case PollFDs:
 			output[arg] = pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), true)
+		case GetSockOptVal:
+			output[arg] = getSockOptVal(t, args[arg-2].Uint64() /* level */, args[arg-1].Uint64() /* optName */, args[arg].Pointer() /* optVal */, args[arg+1].Pointer() /* optLen */, maximumBlobSize, rval)
+		case SetSockOptVal:
+			// No need to print the value again. While it usually
+			// isn't, the string version of this arg can be long.
+			output[arg] = hexArg(args[arg])
 		}
 	}
 }
diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go
index 24e29a2ba..446d1e0f6 100644
--- a/pkg/sentry/strace/syscalls.go
+++ b/pkg/sentry/strace/syscalls.go
@@ -207,9 +207,27 @@ const (
 	// array is in the next argument.
 	PollFDs
 
-	// SelectFDSet is an fd_set argument in select(2)/pselect(2). The number of
-	// fds represented must be the first argument.
+	// SelectFDSet is an fd_set argument in select(2)/pselect(2). The
+	// number of FDs represented must be the first argument.
 	SelectFDSet
+
+	// GetSockOptVal is the optval argument in getsockopt(2).
+	//
+	// Formatted after syscall execution.
+	GetSockOptVal
+
+	// SetSockOptVal is the optval argument in setsockopt(2).
+	//
+	// Contents omitted after syscall execution.
+	SetSockOptVal
+
+	// SockOptLevel is the level argument in getsockopt(2) and
+	// setsockopt(2).
+	SockOptLevel
+
+	// SockOptLevel is the optname argument in getsockopt(2) and
+	// setsockopt(2).
+	SockOptName
 )
 
 // defaultFormat is the syscall argument format to use if the actual format is
-- 
cgit v1.2.3


From 17b9f5e66238bde1e4ed3bd9e5fb67342c8b58ec Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 7 Feb 2020 14:46:24 -0800
Subject: Support listxattr and removexattr syscalls.

Note that these are only implemented for tmpfs, and other impls will still
return EOPNOTSUPP.

PiperOrigin-RevId: 293899385
---
 pkg/p9/client_file.go                      |  33 ++++
 pkg/p9/file.go                             |  16 ++
 pkg/p9/handlers.go                         |  33 ++++
 pkg/p9/messages.go                         | 199 +++++++++++++++----
 pkg/p9/p9.go                               |   4 +
 pkg/p9/version.go                          |   8 +-
 pkg/sentry/fs/copy_up.go                   |   2 +-
 pkg/sentry/fs/fsutil/inode.go              |  20 +-
 pkg/sentry/fs/gofer/context_file.go        |  14 ++
 pkg/sentry/fs/gofer/inode.go               |  13 +-
 pkg/sentry/fs/inode.go                     |  14 +-
 pkg/sentry/fs/inode_operations.go          |  13 +-
 pkg/sentry/fs/inode_overlay.go             |  18 +-
 pkg/sentry/fs/tmpfs/tmpfs.go               |   9 +-
 pkg/sentry/syscalls/linux/linux64_amd64.go |  27 ++-
 pkg/sentry/syscalls/linux/linux64_arm64.go |  37 ++--
 pkg/sentry/syscalls/linux/sys_xattr.go     | 200 +++++++++++++++++++-
 runsc/fsgofer/fsgofer.go                   |  16 +-
 test/syscalls/linux/BUILD                  |   1 +
 test/syscalls/linux/xattr.cc               | 294 ++++++++++++++++-------------
 20 files changed, 733 insertions(+), 238 deletions(-)

diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go
index 0254e4ccc..2ee07b664 100644
--- a/pkg/p9/client_file.go
+++ b/pkg/p9/client_file.go
@@ -194,6 +194,39 @@ func (c *clientFile) SetXattr(name, value string, flags uint32) error {
 	return c.client.sendRecv(&Tsetxattr{FID: c.fid, Name: name, Value: value, Flags: flags}, &Rsetxattr{})
 }
 
+// ListXattr implements File.ListXattr.
+func (c *clientFile) ListXattr(size uint64) (map[string]struct{}, error) {
+	if atomic.LoadUint32(&c.closed) != 0 {
+		return nil, syscall.EBADF
+	}
+	if !versionSupportsListRemoveXattr(c.client.version) {
+		return nil, syscall.EOPNOTSUPP
+	}
+
+	rlistxattr := Rlistxattr{}
+	if err := c.client.sendRecv(&Tlistxattr{FID: c.fid, Size: size}, &rlistxattr); err != nil {
+		return nil, err
+	}
+
+	xattrs := make(map[string]struct{}, len(rlistxattr.Xattrs))
+	for _, x := range rlistxattr.Xattrs {
+		xattrs[x] = struct{}{}
+	}
+	return xattrs, nil
+}
+
+// RemoveXattr implements File.RemoveXattr.
+func (c *clientFile) RemoveXattr(name string) error {
+	if atomic.LoadUint32(&c.closed) != 0 {
+		return syscall.EBADF
+	}
+	if !versionSupportsListRemoveXattr(c.client.version) {
+		return syscall.EOPNOTSUPP
+	}
+
+	return c.client.sendRecv(&Tremovexattr{FID: c.fid, Name: name}, &Rremovexattr{})
+}
+
 // Allocate implements File.Allocate.
 func (c *clientFile) Allocate(mode AllocateMode, offset, length uint64) error {
 	if atomic.LoadUint32(&c.closed) != 0 {
diff --git a/pkg/p9/file.go b/pkg/p9/file.go
index 4607cfcdf..d4ffbc8e3 100644
--- a/pkg/p9/file.go
+++ b/pkg/p9/file.go
@@ -105,6 +105,22 @@ type File interface {
 	// TODO(b/127675828): Determine concurrency guarantees once implemented.
 	SetXattr(name, value string, flags uint32) error
 
+	// ListXattr lists the names of the extended attributes on this node.
+	//
+	// Size indicates the size of the buffer that has been allocated to hold the
+	// attribute list. If the list would be larger than size, implementations may
+	// return ERANGE to indicate that the buffer is too small, but they are also
+	// free to ignore the hint entirely (i.e. the value returned may be larger
+	// than size). All size checking is done independently at the syscall layer.
+	//
+	// TODO(b/148303075): Determine concurrency guarantees once implemented.
+	ListXattr(size uint64) (map[string]struct{}, error)
+
+	// RemoveXattr removes extended attributes on this node.
+	//
+	// TODO(b/148303075): Determine concurrency guarantees once implemented.
+	RemoveXattr(name string) error
+
 	// Allocate allows the caller to directly manipulate the allocated disk space
 	// for the file. See fallocate(2) for more details.
 	Allocate(mode AllocateMode, offset, length uint64) error
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index 7d6653a07..2ac45eb80 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -941,6 +941,39 @@ func (t *Tsetxattr) handle(cs *connState) message {
 	return &Rsetxattr{}
 }
 
+// handle implements handler.handle.
+func (t *Tlistxattr) handle(cs *connState) message {
+	ref, ok := cs.LookupFID(t.FID)
+	if !ok {
+		return newErr(syscall.EBADF)
+	}
+	defer ref.DecRef()
+
+	xattrs, err := ref.file.ListXattr(t.Size)
+	if err != nil {
+		return newErr(err)
+	}
+	xattrList := make([]string, 0, len(xattrs))
+	for x := range xattrs {
+		xattrList = append(xattrList, x)
+	}
+	return &Rlistxattr{Xattrs: xattrList}
+}
+
+// handle implements handler.handle.
+func (t *Tremovexattr) handle(cs *connState) message {
+	ref, ok := cs.LookupFID(t.FID)
+	if !ok {
+		return newErr(syscall.EBADF)
+	}
+	defer ref.DecRef()
+
+	if err := ref.file.RemoveXattr(t.Name); err != nil {
+		return newErr(err)
+	}
+	return &Rremovexattr{}
+}
+
 // handle implements handler.handle.
 func (t *Treaddir) handle(cs *connState) message {
 	ref, ok := cs.LookupFID(t.Directory)
diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go
index ceb723d86..b1cede5f5 100644
--- a/pkg/p9/messages.go
+++ b/pkg/p9/messages.go
@@ -174,11 +174,11 @@ type Rflush struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rflush) Decode(b *buffer) {
+func (*Rflush) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rflush) Encode(b *buffer) {
+func (*Rflush) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -188,7 +188,7 @@ func (*Rflush) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rflush) String() string {
-	return fmt.Sprintf("RFlush{}")
+	return "RFlush{}"
 }
 
 // Twalk is a walk request.
@@ -300,11 +300,11 @@ type Rclunk struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rclunk) Decode(b *buffer) {
+func (*Rclunk) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rclunk) Encode(b *buffer) {
+func (*Rclunk) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -314,7 +314,7 @@ func (*Rclunk) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rclunk) String() string {
-	return fmt.Sprintf("Rclunk{}")
+	return "Rclunk{}"
 }
 
 // Tremove is a remove request.
@@ -350,11 +350,11 @@ type Rremove struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rremove) Decode(b *buffer) {
+func (*Rremove) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rremove) Encode(b *buffer) {
+func (*Rremove) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -364,7 +364,7 @@ func (*Rremove) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rremove) String() string {
-	return fmt.Sprintf("Rremove{}")
+	return "Rremove{}"
 }
 
 // Rlerror is an error response.
@@ -745,16 +745,16 @@ func (*Rlink) Type() MsgType {
 }
 
 // Decode implements encoder.Decode.
-func (*Rlink) Decode(b *buffer) {
+func (*Rlink) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rlink) Encode(b *buffer) {
+func (*Rlink) Encode(*buffer) {
 }
 
 // String implements fmt.Stringer.
 func (r *Rlink) String() string {
-	return fmt.Sprintf("Rlink{}")
+	return "Rlink{}"
 }
 
 // Trenameat is a rename request.
@@ -803,11 +803,11 @@ type Rrenameat struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rrenameat) Decode(b *buffer) {
+func (*Rrenameat) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rrenameat) Encode(b *buffer) {
+func (*Rrenameat) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -817,7 +817,7 @@ func (*Rrenameat) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rrenameat) String() string {
-	return fmt.Sprintf("Rrenameat{}")
+	return "Rrenameat{}"
 }
 
 // Tunlinkat is an unlink request.
@@ -861,11 +861,11 @@ type Runlinkat struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Runlinkat) Decode(b *buffer) {
+func (*Runlinkat) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Runlinkat) Encode(b *buffer) {
+func (*Runlinkat) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -875,7 +875,7 @@ func (*Runlinkat) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Runlinkat) String() string {
-	return fmt.Sprintf("Runlinkat{}")
+	return "Runlinkat{}"
 }
 
 // Trename is a rename request.
@@ -922,11 +922,11 @@ type Rrename struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rrename) Decode(b *buffer) {
+func (*Rrename) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rrename) Encode(b *buffer) {
+func (*Rrename) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -936,7 +936,7 @@ func (*Rrename) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rrename) String() string {
-	return fmt.Sprintf("Rrename{}")
+	return "Rrename{}"
 }
 
 // Treadlink is a readlink request.
@@ -1409,11 +1409,11 @@ type Rsetattr struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rsetattr) Decode(b *buffer) {
+func (*Rsetattr) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rsetattr) Encode(b *buffer) {
+func (*Rsetattr) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1423,7 +1423,7 @@ func (*Rsetattr) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rsetattr) String() string {
-	return fmt.Sprintf("Rsetattr{}")
+	return "Rsetattr{}"
 }
 
 // Tallocate is an allocate request. This is an extension to 9P protocol, not
@@ -1466,11 +1466,11 @@ type Rallocate struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rallocate) Decode(b *buffer) {
+func (*Rallocate) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rallocate) Encode(b *buffer) {
+func (*Rallocate) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1480,7 +1480,71 @@ func (*Rallocate) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rallocate) String() string {
-	return fmt.Sprintf("Rallocate{}")
+	return "Rallocate{}"
+}
+
+// Tlistxattr is a listxattr request.
+type Tlistxattr struct {
+	// FID refers to the file on which to list xattrs.
+	FID FID
+
+	// Size is the buffer size for the xattr list.
+	Size uint64
+}
+
+// Decode implements encoder.Decode.
+func (t *Tlistxattr) Decode(b *buffer) {
+	t.FID = b.ReadFID()
+	t.Size = b.Read64()
+}
+
+// Encode implements encoder.Encode.
+func (t *Tlistxattr) Encode(b *buffer) {
+	b.WriteFID(t.FID)
+	b.Write64(t.Size)
+}
+
+// Type implements message.Type.
+func (*Tlistxattr) Type() MsgType {
+	return MsgTlistxattr
+}
+
+// String implements fmt.Stringer.
+func (t *Tlistxattr) String() string {
+	return fmt.Sprintf("Tlistxattr{FID: %d, Size: %d}", t.FID, t.Size)
+}
+
+// Rlistxattr is a listxattr response.
+type Rlistxattr struct {
+	// Xattrs is a list of extended attribute names.
+	Xattrs []string
+}
+
+// Decode implements encoder.Decode.
+func (r *Rlistxattr) Decode(b *buffer) {
+	n := b.Read16()
+	r.Xattrs = r.Xattrs[:0]
+	for i := 0; i < int(n); i++ {
+		r.Xattrs = append(r.Xattrs, b.ReadString())
+	}
+}
+
+// Encode implements encoder.Encode.
+func (r *Rlistxattr) Encode(b *buffer) {
+	b.Write16(uint16(len(r.Xattrs)))
+	for _, x := range r.Xattrs {
+		b.WriteString(x)
+	}
+}
+
+// Type implements message.Type.
+func (*Rlistxattr) Type() MsgType {
+	return MsgRlistxattr
+}
+
+// String implements fmt.Stringer.
+func (r *Rlistxattr) String() string {
+	return fmt.Sprintf("Rlistxattr{Xattrs: %v}", r.Xattrs)
 }
 
 // Txattrwalk walks extended attributes.
@@ -1594,11 +1658,11 @@ type Rxattrcreate struct {
 }
 
 // Decode implements encoder.Decode.
-func (r *Rxattrcreate) Decode(b *buffer) {
+func (r *Rxattrcreate) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (r *Rxattrcreate) Encode(b *buffer) {
+func (r *Rxattrcreate) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1608,7 +1672,7 @@ func (*Rxattrcreate) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rxattrcreate) String() string {
-	return fmt.Sprintf("Rxattrcreate{}")
+	return "Rxattrcreate{}"
 }
 
 // Tgetxattr is a getxattr request.
@@ -1719,11 +1783,11 @@ type Rsetxattr struct {
 }
 
 // Decode implements encoder.Decode.
-func (r *Rsetxattr) Decode(b *buffer) {
+func (r *Rsetxattr) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (r *Rsetxattr) Encode(b *buffer) {
+func (r *Rsetxattr) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1733,7 +1797,60 @@ func (*Rsetxattr) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rsetxattr) String() string {
-	return fmt.Sprintf("Rsetxattr{}")
+	return "Rsetxattr{}"
+}
+
+// Tremovexattr is a removexattr request.
+type Tremovexattr struct {
+	// FID refers to the file on which to set xattrs.
+	FID FID
+
+	// Name is the attribute name.
+	Name string
+}
+
+// Decode implements encoder.Decode.
+func (t *Tremovexattr) Decode(b *buffer) {
+	t.FID = b.ReadFID()
+	t.Name = b.ReadString()
+}
+
+// Encode implements encoder.Encode.
+func (t *Tremovexattr) Encode(b *buffer) {
+	b.WriteFID(t.FID)
+	b.WriteString(t.Name)
+}
+
+// Type implements message.Type.
+func (*Tremovexattr) Type() MsgType {
+	return MsgTremovexattr
+}
+
+// String implements fmt.Stringer.
+func (t *Tremovexattr) String() string {
+	return fmt.Sprintf("Tremovexattr{FID: %d, Name: %s}", t.FID, t.Name)
+}
+
+// Rremovexattr is a removexattr response.
+type Rremovexattr struct {
+}
+
+// Decode implements encoder.Decode.
+func (r *Rremovexattr) Decode(*buffer) {
+}
+
+// Encode implements encoder.Encode.
+func (r *Rremovexattr) Encode(*buffer) {
+}
+
+// Type implements message.Type.
+func (*Rremovexattr) Type() MsgType {
+	return MsgRremovexattr
+}
+
+// String implements fmt.Stringer.
+func (r *Rremovexattr) String() string {
+	return "Rremovexattr{}"
 }
 
 // Treaddir is a readdir request.
@@ -1880,11 +1997,11 @@ type Rfsync struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rfsync) Decode(b *buffer) {
+func (*Rfsync) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rfsync) Encode(b *buffer) {
+func (*Rfsync) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1894,7 +2011,7 @@ func (*Rfsync) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rfsync) String() string {
-	return fmt.Sprintf("Rfsync{}")
+	return "Rfsync{}"
 }
 
 // Tstatfs is a stat request.
@@ -1980,11 +2097,11 @@ type Rflushf struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rflushf) Decode(b *buffer) {
+func (*Rflushf) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rflushf) Encode(b *buffer) {
+func (*Rflushf) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1994,7 +2111,7 @@ func (*Rflushf) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (*Rflushf) String() string {
-	return fmt.Sprintf("Rflushf{}")
+	return "Rflushf{}"
 }
 
 // Twalkgetattr is a walk request.
@@ -2484,6 +2601,8 @@ func init() {
 	msgRegistry.register(MsgRgetattr, func() message { return &Rgetattr{} })
 	msgRegistry.register(MsgTsetattr, func() message { return &Tsetattr{} })
 	msgRegistry.register(MsgRsetattr, func() message { return &Rsetattr{} })
+	msgRegistry.register(MsgTlistxattr, func() message { return &Tlistxattr{} })
+	msgRegistry.register(MsgRlistxattr, func() message { return &Rlistxattr{} })
 	msgRegistry.register(MsgTxattrwalk, func() message { return &Txattrwalk{} })
 	msgRegistry.register(MsgRxattrwalk, func() message { return &Rxattrwalk{} })
 	msgRegistry.register(MsgTxattrcreate, func() message { return &Txattrcreate{} })
@@ -2492,6 +2611,8 @@ func init() {
 	msgRegistry.register(MsgRgetxattr, func() message { return &Rgetxattr{} })
 	msgRegistry.register(MsgTsetxattr, func() message { return &Tsetxattr{} })
 	msgRegistry.register(MsgRsetxattr, func() message { return &Rsetxattr{} })
+	msgRegistry.register(MsgTremovexattr, func() message { return &Tremovexattr{} })
+	msgRegistry.register(MsgRremovexattr, func() message { return &Rremovexattr{} })
 	msgRegistry.register(MsgTreaddir, func() message { return &Treaddir{} })
 	msgRegistry.register(MsgRreaddir, func() message { return &Rreaddir{} })
 	msgRegistry.register(MsgTfsync, func() message { return &Tfsync{} })
diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go
index 5ab00d625..20ab31f7a 100644
--- a/pkg/p9/p9.go
+++ b/pkg/p9/p9.go
@@ -335,6 +335,8 @@ const (
 	MsgRgetattr             = 25
 	MsgTsetattr             = 26
 	MsgRsetattr             = 27
+	MsgTlistxattr           = 28
+	MsgRlistxattr           = 29
 	MsgTxattrwalk           = 30
 	MsgRxattrwalk           = 31
 	MsgTxattrcreate         = 32
@@ -343,6 +345,8 @@ const (
 	MsgRgetxattr            = 35
 	MsgTsetxattr            = 36
 	MsgRsetxattr            = 37
+	MsgTremovexattr         = 38
+	MsgRremovexattr         = 39
 	MsgTreaddir             = 40
 	MsgRreaddir             = 41
 	MsgTfsync               = 50
diff --git a/pkg/p9/version.go b/pkg/p9/version.go
index 34a15eb55..09cde9f5a 100644
--- a/pkg/p9/version.go
+++ b/pkg/p9/version.go
@@ -26,7 +26,7 @@ const (
 	//
 	// Clients are expected to start requesting this version number and
 	// to continuously decrement it until a Tversion request succeeds.
-	highestSupportedVersion uint32 = 10
+	highestSupportedVersion uint32 = 11
 
 	// lowestSupportedVersion is the lowest supported version X in a
 	// version string of the format 9P2000.L.Google.X.
@@ -167,3 +167,9 @@ func VersionSupportsOpenTruncateFlag(v uint32) bool {
 func versionSupportsGetSetXattr(v uint32) bool {
 	return v >= 10
 }
+
+// versionSupportsListRemoveXattr returns true if version v supports
+// the Tlistxattr and Tremovexattr messages.
+func versionSupportsListRemoveXattr(v uint32) bool {
+	return v >= 11
+}
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index f6c79e51b..b060a12ff 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -401,7 +401,7 @@ func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error
 	if err != nil {
 		return err
 	}
-	lowerXattr, err := lower.ListXattr(ctx)
+	lowerXattr, err := lower.ListXattr(ctx, linux.XATTR_SIZE_MAX)
 	if err != nil && err != syserror.EOPNOTSUPP {
 		return err
 	}
diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go
index 252830572..daecc4ffe 100644
--- a/pkg/sentry/fs/fsutil/inode.go
+++ b/pkg/sentry/fs/fsutil/inode.go
@@ -247,7 +247,7 @@ func (i *InodeSimpleExtendedAttributes) SetXattr(_ context.Context, _ *fs.Inode,
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
+func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) {
 	i.mu.RLock()
 	names := make(map[string]struct{}, len(i.xattrs))
 	for name := range i.xattrs {
@@ -257,6 +257,17 @@ func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (m
 	return names, nil
 }
 
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (i *InodeSimpleExtendedAttributes) RemoveXattr(_ context.Context, _ *fs.Inode, name string) error {
+	i.mu.RLock()
+	defer i.mu.RUnlock()
+	if _, ok := i.xattrs[name]; ok {
+		delete(i.xattrs, name)
+		return nil
+	}
+	return syserror.ENOATTR
+}
+
 // staticFile is a file with static contents. It is returned by
 // InodeStaticFileGetter.GetFile.
 //
@@ -460,10 +471,15 @@ func (InodeNoExtendedAttributes) SetXattr(context.Context, *fs.Inode, string, st
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
+func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) {
 	return nil, syserror.EOPNOTSUPP
 }
 
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (InodeNoExtendedAttributes) RemoveXattr(context.Context, *fs.Inode, string) error {
+	return syserror.EOPNOTSUPP
+}
+
 // InodeNoopRelease implements fs.InodeOperations.Release as a noop.
 type InodeNoopRelease struct{}
 
diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go
index 3da818aed..125907d70 100644
--- a/pkg/sentry/fs/gofer/context_file.go
+++ b/pkg/sentry/fs/gofer/context_file.go
@@ -73,6 +73,20 @@ func (c *contextFile) setXattr(ctx context.Context, name, value string, flags ui
 	return err
 }
 
+func (c *contextFile) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
+	ctx.UninterruptibleSleepStart(false)
+	xattrs, err := c.file.ListXattr(size)
+	ctx.UninterruptibleSleepFinish(false)
+	return xattrs, err
+}
+
+func (c *contextFile) removeXattr(ctx context.Context, name string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := c.file.RemoveXattr(name)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
 func (c *contextFile) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
 	ctx.UninterruptibleSleepStart(false)
 	err := c.file.Allocate(mode, offset, length)
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index ac28174d2..1c934981b 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -604,18 +604,23 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length
 }
 
 // GetXattr implements fs.InodeOperations.GetXattr.
-func (i *inodeOperations) GetXattr(ctx context.Context, inode *fs.Inode, name string, size uint64) (string, error) {
+func (i *inodeOperations) GetXattr(ctx context.Context, _ *fs.Inode, name string, size uint64) (string, error) {
 	return i.fileState.file.getXattr(ctx, name, size)
 }
 
 // SetXattr implements fs.InodeOperations.SetXattr.
-func (i *inodeOperations) SetXattr(ctx context.Context, inode *fs.Inode, name string, value string, flags uint32) error {
+func (i *inodeOperations) SetXattr(ctx context.Context, _ *fs.Inode, name string, value string, flags uint32) error {
 	return i.fileState.file.setXattr(ctx, name, value, flags)
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (i *inodeOperations) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
-	return nil, syscall.EOPNOTSUPP
+func (i *inodeOperations) ListXattr(ctx context.Context, _ *fs.Inode, size uint64) (map[string]struct{}, error) {
+	return i.fileState.file.listXattr(ctx, size)
+}
+
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (i *inodeOperations) RemoveXattr(ctx context.Context, _ *fs.Inode, name string) error {
+	return i.fileState.file.removeXattr(ctx, name)
 }
 
 // Allocate implements fs.InodeOperations.Allocate.
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index b66c091ab..55fb71c16 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -278,11 +278,19 @@ func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, fla
 }
 
 // ListXattr calls i.InodeOperations.ListXattr with i as the Inode.
-func (i *Inode) ListXattr(ctx context.Context) (map[string]struct{}, error) {
+func (i *Inode) ListXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
 	if i.overlay != nil {
-		return overlayListXattr(ctx, i.overlay)
+		return overlayListXattr(ctx, i.overlay, size)
 	}
-	return i.InodeOperations.ListXattr(ctx, i)
+	return i.InodeOperations.ListXattr(ctx, i, size)
+}
+
+// RemoveXattr calls i.InodeOperations.RemoveXattr with i as the Inode.
+func (i *Inode) RemoveXattr(ctx context.Context, d *Dirent, name string) error {
+	if i.overlay != nil {
+		return overlayRemoveXattr(ctx, i.overlay, d, name)
+	}
+	return i.InodeOperations.RemoveXattr(ctx, i, name)
 }
 
 // CheckPermission will check if the caller may access this file in the
diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go
index 70f2eae96..2bbfb72ef 100644
--- a/pkg/sentry/fs/inode_operations.go
+++ b/pkg/sentry/fs/inode_operations.go
@@ -190,7 +190,18 @@ type InodeOperations interface {
 	// ListXattr returns the set of all extended attributes names that
 	// have values. Inodes that do not support extended attributes return
 	// EOPNOTSUPP.
-	ListXattr(ctx context.Context, inode *Inode) (map[string]struct{}, error)
+	//
+	// If this is called through the listxattr(2) syscall, size indicates the
+	// size of the buffer that the application has allocated to hold the
+	// attribute list. If the list would be larger than size, implementations may
+	// return ERANGE to indicate that the buffer is too small, but they are also
+	// free to ignore the hint entirely. All size checking is done independently
+	// at the syscall layer.
+	ListXattr(ctx context.Context, inode *Inode, size uint64) (map[string]struct{}, error)
+
+	// RemoveXattr removes an extended attribute specified by name. Inodes that
+	// do not support extended attributes return EOPNOTSUPP.
+	RemoveXattr(ctx context.Context, inode *Inode, name string) error
 
 	// Check determines whether an Inode can be accessed with the
 	// requested permission mask using the context (which gives access
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index 4729b4aac..5ada33a32 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -564,15 +564,15 @@ func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, valu
 	return o.upper.SetXattr(ctx, d, name, value, flags)
 }
 
-func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}, error) {
+func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[string]struct{}, error) {
 	o.copyMu.RLock()
 	defer o.copyMu.RUnlock()
 	var names map[string]struct{}
 	var err error
 	if o.upper != nil {
-		names, err = o.upper.ListXattr(ctx)
+		names, err = o.upper.ListXattr(ctx, size)
 	} else {
-		names, err = o.lower.ListXattr(ctx)
+		names, err = o.lower.ListXattr(ctx, size)
 	}
 	for name := range names {
 		// Same as overlayGetXattr, we shouldn't forward along
@@ -584,6 +584,18 @@ func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}
 	return names, err
 }
 
+func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error {
+	// Don't allow changes to overlay xattrs through a removexattr syscall.
+	if strings.HasPrefix(XattrOverlayPrefix, name) {
+		return syserror.EPERM
+	}
+
+	if err := copyUp(ctx, d); err != nil {
+		return err
+	}
+	return o.upper.RemoveXattr(ctx, d, name)
+}
+
 func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error {
 	o.copyMu.RLock()
 	// Hot path. Avoid defers.
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index c00cef0a5..3c2b583ae 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -159,8 +159,13 @@ func (d *Dir) SetXattr(ctx context.Context, i *fs.Inode, name, value string, fla
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode) (map[string]struct{}, error) {
-	return d.ramfsDir.ListXattr(ctx, i)
+func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode, size uint64) (map[string]struct{}, error) {
+	return d.ramfsDir.ListXattr(ctx, i, size)
+}
+
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (d *Dir) RemoveXattr(ctx context.Context, i *fs.Inode, name string) error {
+	return d.ramfsDir.RemoveXattr(ctx, i, name)
 }
 
 // Lookup implements fs.InodeOperations.Lookup.
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 588f8b087..79066ad2a 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -228,21 +228,18 @@ var AMD64 = &kernel.SyscallTable{
 		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
-		// TODO(b/148303075): Enable set/getxattr (in their various
-		// forms) once we also have list and removexattr. The JVM
-		// assumes that if get/set exist, then list and remove do too.
-		188: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		189: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		190: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+		194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+		195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+		196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+		197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+		198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+		199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
 		200: syscalls.Supported("tkill", Tkill),
 		201: syscalls.Supported("time", Time),
 		202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 06e5ee401..7421619de 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -36,26 +36,23 @@ var ARM64 = &kernel.SyscallTable{
 	},
 	AuditNumber: linux.AUDIT_ARCH_AARCH64,
 	Table: map[uintptr]kernel.Syscall{
-		0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		// TODO(b/148303075): Enable set/getxattr (in their various
-		// forms) once we also have list and removexattr. The JVM
-		// assumes that if get/set exist, then list and remove do too.
-		5:   syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		6:   syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		7:   syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		8:   syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		13:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		14:  syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		15:  syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		16:  syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		0:   syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		1:   syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+		11:  syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+		12:  syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+		13:  syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+		14:  syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+		15:  syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+		16:  syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
 		17:  syscalls.Supported("getcwd", Getcwd),
 		18:  syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
 		19:  syscalls.Supported("eventfd2", Eventfd2),
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index efb95555c..342337726 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -72,7 +72,7 @@ func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink
 	}
 
 	valueLen := 0
-	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
 		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
 			return syserror.ENOTDIR
 		}
@@ -172,7 +172,7 @@ func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink
 		return 0, nil, err
 	}
 
-	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
 		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
 			return syserror.ENOTDIR
 		}
@@ -187,12 +187,12 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, si
 		return syserror.EINVAL
 	}
 
-	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
 		return err
 	}
 
-	name, err := copyInXattrName(t, nameAddr)
-	if err != nil {
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
 		return err
 	}
 
@@ -226,12 +226,18 @@ func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
 	return name, nil
 }
 
+// Restrict xattrs to regular files and directories.
+//
+// TODO(b/148380782): In Linux, this restriction technically only applies to
+// xattrs in the "user.*" namespace. Make file type checks specific to the
+// namespace once we allow other xattr prefixes.
+func xattrFileTypeOk(i *fs.Inode) bool {
+	return fs.IsRegular(i.StableAttr) || fs.IsDir(i.StableAttr)
+}
+
 func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error {
 	// Restrict xattrs to regular files and directories.
-	//
-	// In Linux, this restriction technically only applies to xattrs in the
-	// "user.*" namespace, but we don't allow any other xattr prefixes anyway.
-	if !fs.IsRegular(i.StableAttr) && !fs.IsDir(i.StableAttr) {
+	if !xattrFileTypeOk(i) {
 		if perms.Write {
 			return syserror.EPERM
 		}
@@ -240,3 +246,179 @@ func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error
 
 	return i.CheckPermission(t, perms)
 }
+
+// ListXattr implements linux syscall listxattr(2).
+func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listXattrFromPath(t, args, true)
+}
+
+// LListXattr implements linux syscall llistxattr(2).
+func LListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listXattrFromPath(t, args, false)
+}
+
+// FListXattr implements linux syscall flistxattr(2).
+func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	listAddr := args[1].Pointer()
+	size := uint64(args[2].SizeT())
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	n, err := listXattr(t, f.Dirent, listAddr, size)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(n), nil, nil
+}
+
+func listXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	listAddr := args[1].Pointer()
+	size := uint64(args[2].SizeT())
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n := 0
+	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		n, err = listXattr(t, d, listAddr, size)
+		return err
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(n), nil, nil
+}
+
+func listXattr(t *kernel.Task, d *fs.Dirent, addr usermem.Addr, size uint64) (int, error) {
+	if !xattrFileTypeOk(d.Inode) {
+		return 0, nil
+	}
+
+	// If listxattr(2) is called with size 0, the buffer size needed to contain
+	// the xattr list will be returned successfully even if it is nonzero. In
+	// that case, we need to retrieve the entire list so we can compute and
+	// return the correct size.
+	requestedSize := size
+	if size == 0 || size > linux.XATTR_SIZE_MAX {
+		requestedSize = linux.XATTR_SIZE_MAX
+	}
+	xattrs, err := d.Inode.ListXattr(t, requestedSize)
+	if err != nil {
+		return 0, err
+	}
+
+	// TODO(b/148380782): support namespaces other than "user".
+	for x := range xattrs {
+		if !strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
+			delete(xattrs, x)
+		}
+	}
+
+	listSize := xattrListSize(xattrs)
+	if listSize > linux.XATTR_SIZE_MAX {
+		return 0, syserror.E2BIG
+	}
+	if uint64(listSize) > requestedSize {
+		return 0, syserror.ERANGE
+	}
+
+	// Don't copy out the attributes if size is 0.
+	if size == 0 {
+		return listSize, nil
+	}
+
+	buf := make([]byte, 0, listSize)
+	for x := range xattrs {
+		buf = append(buf, []byte(x)...)
+		buf = append(buf, 0)
+	}
+	if _, err := t.CopyOutBytes(addr, buf); err != nil {
+		return 0, err
+	}
+
+	return len(buf), nil
+}
+
+func xattrListSize(xattrs map[string]struct{}) int {
+	size := 0
+	for x := range xattrs {
+		size += len(x) + 1
+	}
+	return size
+}
+
+// RemoveXattr implements linux syscall removexattr(2).
+func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return removeXattrFromPath(t, args, true)
+}
+
+// LRemoveXattr implements linux syscall lremovexattr(2).
+func LRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return removeXattrFromPath(t, args, false)
+}
+
+// FRemoveXattr implements linux syscall fremovexattr(2).
+func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	return 0, nil, removeXattr(t, f.Dirent, nameAddr)
+}
+
+func removeXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		return removeXattr(t, d, nameAddr)
+	})
+}
+
+// removeXattr implements removexattr(2) from the given *fs.Dirent.
+func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+		return err
+	}
+
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+
+	return d.Inode.RemoveXattr(t, d, name)
+}
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 4d84ad999..cadd83273 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -768,12 +768,22 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 }
 
 // TODO(b/127675828): support getxattr.
-func (l *localFile) GetXattr(name string, size uint64) (string, error) {
+func (*localFile) GetXattr(string, uint64) (string, error) {
 	return "", syscall.EOPNOTSUPP
 }
 
 // TODO(b/127675828): support setxattr.
-func (l *localFile) SetXattr(name, value string, flags uint32) error {
+func (*localFile) SetXattr(string, string, uint32) error {
+	return syscall.EOPNOTSUPP
+}
+
+// TODO(b/148303075): support listxattr.
+func (*localFile) ListXattr(uint64) (map[string]struct{}, error) {
+	return nil, syscall.EOPNOTSUPP
+}
+
+// TODO(b/148303075): support removexattr.
+func (*localFile) RemoveXattr(string) error {
 	return syscall.EOPNOTSUPP
 }
 
@@ -790,7 +800,7 @@ func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error
 }
 
 // Rename implements p9.File; this should never be called.
-func (l *localFile) Rename(p9.File, string) error {
+func (*localFile) Rename(p9.File, string) error {
 	panic("rename called directly")
 }
 
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 12d389c3e..ca1af209a 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3782,6 +3782,7 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         gtest,
         "//test/util:posix_error",
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index 85eb31847..8b00ef44c 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -24,6 +24,7 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
+#include "absl/container/flat_hash_set.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
@@ -38,36 +39,36 @@ namespace {
 
 class XattrTest : public FileTest {};
 
-TEST_F(XattrTest, XattrNullName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+TEST_F(XattrTest, XattrNonexistentFile) {
+  const char* path = "/does/not/exist";
+  EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
+              SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(listxattr(path, nullptr, 0), SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(ENOENT));
+}
 
+TEST_F(XattrTest, XattrNullName) {
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EFAULT));
   EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
               SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(EFAULT));
 }
 
 TEST_F(XattrTest, XattrEmptyName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(ERANGE));
   EXPECT_THAT(getxattr(path, "", nullptr, 0), SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(removexattr(path, ""), SyscallFailsWithErrno(ERANGE));
 }
 
 TEST_F(XattrTest, XattrLargeName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   std::string name = "user.";
   name += std::string(XATTR_NAME_MAX - name.length(), 'a');
@@ -86,28 +87,23 @@ TEST_F(XattrTest, XattrLargeName) {
               SyscallFailsWithErrno(ERANGE));
   EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
               SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(removexattr(path, name.c_str()), SyscallFailsWithErrno(ERANGE));
 }
 
 TEST_F(XattrTest, XattrInvalidPrefix) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   std::string name(XATTR_NAME_MAX, 'a');
   EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EOPNOTSUPP));
   EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
               SyscallFailsWithErrno(EOPNOTSUPP));
+  EXPECT_THAT(removexattr(path, name.c_str()),
+              SyscallFailsWithErrno(EOPNOTSUPP));
 }
 
 // Do not allow save/restore cycles after making the test file read-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -124,19 +120,21 @@ TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
 
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
               SyscallFailsWithErrno(EACCES));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EACCES));
 
   char buf = '-';
   EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
   EXPECT_EQ(buf, val);
+
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
 }
 
 // Do not allow save/restore cycles after making the test file write-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -152,6 +150,14 @@ TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
 
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
+
+  // listxattr will succeed even without read permissions.
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(removexattr(path, name), SyscallSucceeds());
 }
 
 TEST_F(XattrTest, XattrTrustedWithNonadmin) {
@@ -163,64 +169,66 @@ TEST_F(XattrTest, XattrTrustedWithNonadmin) {
   const char name[] = "trusted.abc";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, XattrOnDirectory) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const char name[] = "user.test";
-  EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(dir.path().c_str(), name, nullptr, 0, /*flags=*/0),
               SyscallSucceeds());
-  EXPECT_THAT(getxattr(dir.path().c_str(), name, NULL, 0),
+  EXPECT_THAT(getxattr(dir.path().c_str(), name, nullptr, 0),
               SyscallSucceedsWithValue(0));
+
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(dir.path().c_str(), list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(removexattr(dir.path().c_str(), name), SyscallSucceeds());
 }
 
 TEST_F(XattrTest, XattrOnSymlink) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
   const char name[] = "user.test";
-  EXPECT_THAT(setxattr(link.path().c_str(), name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(link.path().c_str(), name, nullptr, 0, /*flags=*/0),
               SyscallSucceeds());
-  EXPECT_THAT(getxattr(link.path().c_str(), name, NULL, 0),
+  EXPECT_THAT(getxattr(link.path().c_str(), name, nullptr, 0),
               SyscallSucceedsWithValue(0));
+
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(link.path().c_str(), list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(removexattr(link.path().c_str(), name), SyscallSucceeds());
 }
 
 TEST_F(XattrTest, XattrOnInvalidFileTypes) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char name[] = "user.test";
 
   char char_device[] = "/dev/zero";
-  EXPECT_THAT(setxattr(char_device, name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(char_device, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EPERM));
-  EXPECT_THAT(getxattr(char_device, name, NULL, 0),
+  EXPECT_THAT(getxattr(char_device, name, nullptr, 0),
               SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(listxattr(char_device, nullptr, 0), SyscallSucceedsWithValue(0));
 
   // Use tmpfs, where creation of named pipes is supported.
   const std::string fifo = NewTempAbsPathInDir("/dev/shm");
   const char* path = fifo.c_str();
   EXPECT_THAT(mknod(path, S_IFIFO | S_IRUSR | S_IWUSR, 0), SyscallSucceeds());
-  EXPECT_THAT(setxattr(path, name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EPERM));
-  EXPECT_THAT(getxattr(path, name, NULL, 0), SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(listxattr(path, nullptr, 0), SyscallSucceedsWithValue(0));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
 }
 
 TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -236,10 +244,6 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, SetxattrZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -252,10 +256,6 @@ TEST_F(XattrTest, SetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrSizeTooLarge) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
 
@@ -271,10 +271,6 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
@@ -284,10 +280,6 @@ TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -296,10 +288,6 @@ TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val(XATTR_SIZE_MAX + 1);
@@ -316,10 +304,6 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -335,10 +319,6 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithLarger) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -353,10 +333,6 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) {
 }
 
 TEST_F(XattrTest, SetxattrCreateFlag) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
@@ -368,10 +344,6 @@ TEST_F(XattrTest, SetxattrCreateFlag) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceFlag) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
@@ -384,10 +356,6 @@ TEST_F(XattrTest, SetxattrReplaceFlag) {
 }
 
 TEST_F(XattrTest, SetxattrInvalidFlags) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   int invalid_flags = 0xff;
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
@@ -395,10 +363,6 @@ TEST_F(XattrTest, SetxattrInvalidFlags) {
 }
 
 TEST_F(XattrTest, Getxattr) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -411,10 +375,6 @@ TEST_F(XattrTest, Getxattr) {
 }
 
 TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -427,10 +387,6 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -446,10 +402,6 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -463,10 +415,6 @@ TEST_F(XattrTest, GetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrSizeTooLarge) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -483,10 +431,6 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, GetxattrNullValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -498,10 +442,6 @@ TEST_F(XattrTest, GetxattrNullValue) {
 }
 
 TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -518,35 +458,109 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrNonexistentName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, Listxattr) {
+  const char* path = test_file_name_.c_str();
+  const std::string name = "user.test";
+  const std::string name2 = "user.test2";
+  const std::string name3 = "user.test3";
+  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name2.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name3.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
 
+  std::vector<char> list(name.size() + 1 + name2.size() + 1 + name3.size() + 1);
+  char* buf = list.data();
+  EXPECT_THAT(listxattr(path, buf, XATTR_SIZE_MAX),
+              SyscallSucceedsWithValue(list.size()));
+
+  absl::flat_hash_set<std::string> got = {};
+  for (char* p = buf; p < buf + list.size(); p += strlen(p) + 1) {
+    got.insert(std::string{p});
+  }
+
+  absl::flat_hash_set<std::string> expected = {name, name2, name3};
+  EXPECT_EQ(got, expected);
+}
+
+TEST_F(XattrTest, ListxattrNoXattrs) {
+  const char* path = test_file_name_.c_str();
+
+  std::vector<char> list, expected;
+  EXPECT_THAT(listxattr(path, list.data(), sizeof(list)),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(list, expected);
+
+  // Listxattr should succeed if there are no attributes, even if the buffer
+  // passed in is a nullptr.
+  EXPECT_THAT(listxattr(path, nullptr, sizeof(list)),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, ListxattrNullBuffer) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+  EXPECT_THAT(listxattr(path, nullptr, sizeof(name)),
+              SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, ListxattrSizeTooSmall) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+  char list[sizeof(name) - 1];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, ListxattrZeroSize) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+  EXPECT_THAT(listxattr(path, nullptr, 0),
+              SyscallSucceedsWithValue(sizeof(name)));
+}
+
+TEST_F(XattrTest, RemoveXattr) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+  EXPECT_THAT(removexattr(path, name), SyscallSucceeds());
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
-TEST_F(XattrTest, LGetSetxattrOnSymlink) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+TEST_F(XattrTest, RemoveXattrNonexistentName) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(ENODATA));
+}
 
+TEST_F(XattrTest, LXattrOnSymlink) {
+  const char name[] = "user.test";
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
 
-  EXPECT_THAT(lsetxattr(link.path().c_str(), nullptr, nullptr, 0, 0),
+  EXPECT_THAT(lsetxattr(link.path().c_str(), name, nullptr, 0, 0),
               SyscallFailsWithErrno(EPERM));
-  EXPECT_THAT(lgetxattr(link.path().c_str(), nullptr, nullptr, 0),
+  EXPECT_THAT(lgetxattr(link.path().c_str(), name, nullptr, 0),
               SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(llistxattr(link.path().c_str(), nullptr, 0),
+              SyscallSucceedsWithValue(0));
+  EXPECT_THAT(lremovexattr(link.path().c_str(), name),
+              SyscallFailsWithErrno(EPERM));
 }
 
-TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
+TEST_F(XattrTest, LXattrOnNonsymlink) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -558,13 +572,16 @@ TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
   EXPECT_THAT(lgetxattr(path, name, &buf, size),
               SyscallSucceedsWithValue(size));
   EXPECT_EQ(buf, val);
-}
 
-TEST_F(XattrTest, FGetSetxattr) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+  char list[sizeof(name)];
+  EXPECT_THAT(llistxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
 
+  EXPECT_THAT(lremovexattr(path, name), SyscallSucceeds());
+}
+
+TEST_F(XattrTest, XattrWithFD) {
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0));
   const char name[] = "user.test";
@@ -577,6 +594,13 @@ TEST_F(XattrTest, FGetSetxattr) {
   EXPECT_THAT(fgetxattr(fd.get(), name, &buf, size),
               SyscallSucceedsWithValue(size));
   EXPECT_EQ(buf, val);
+
+  char list[sizeof(name)];
+  EXPECT_THAT(flistxattr(fd.get(), list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(fremovexattr(fd.get(), name), SyscallSucceeds());
 }
 
 }  // namespace
-- 
cgit v1.2.3


From 9cbf5a3dcc71d85cd857f117d4b7189a101be9c1 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 25 Oct 2019 03:09:59 +0000
Subject: Enable pkg/cpuid support on arm64.

Fixes #1255

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I8614e6f3ee321c2989567e4e712aa8f28cc9db14
---
 pkg/cpuid/BUILD                   |    9 +-
 pkg/cpuid/cpuid.go                | 1098 +-----------------------------------
 pkg/cpuid/cpuid_arm64.go          |  461 ++++++++++++++++
 pkg/cpuid/cpuid_arm64_test.go     |   55 ++
 pkg/cpuid/cpuid_parse_test.go     |  142 -----
 pkg/cpuid/cpuid_parse_x86_test.go |  144 +++++
 pkg/cpuid/cpuid_test.go           |  241 --------
 pkg/cpuid/cpuid_x86.go            | 1100 +++++++++++++++++++++++++++++++++++++
 pkg/cpuid/cpuid_x86_test.go       |  243 ++++++++
 9 files changed, 2018 insertions(+), 1475 deletions(-)
 create mode 100644 pkg/cpuid/cpuid_arm64.go
 create mode 100644 pkg/cpuid/cpuid_arm64_test.go
 delete mode 100644 pkg/cpuid/cpuid_parse_test.go
 create mode 100644 pkg/cpuid/cpuid_parse_x86_test.go
 delete mode 100644 pkg/cpuid/cpuid_test.go
 create mode 100644 pkg/cpuid/cpuid_x86.go
 create mode 100644 pkg/cpuid/cpuid_x86_test.go

diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD
index 43a432190..d6cb1a549 100644
--- a/pkg/cpuid/BUILD
+++ b/pkg/cpuid/BUILD
@@ -7,6 +7,8 @@ go_library(
     srcs = [
         "cpu_amd64.s",
         "cpuid.go",
+        "cpuid_arm64.go",
+        "cpuid_x86.go",
     ],
     visibility = ["//:sandbox"],
     deps = ["//pkg/log"],
@@ -15,7 +17,10 @@ go_library(
 go_test(
     name = "cpuid_test",
     size = "small",
-    srcs = ["cpuid_test.go"],
+    srcs = [
+        "cpuid_arm64_test.go",
+        "cpuid_x86_test.go",
+    ],
     library = ":cpuid",
 )
 
@@ -23,7 +28,7 @@ go_test(
     name = "cpuid_parse_test",
     size = "small",
     srcs = [
-        "cpuid_parse_test.go",
+        "cpuid_parse_x86_test.go",
     ],
     library = ":cpuid",
     tags = ["manual"],
diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go
index cf50ee53f..f7f9dbf86 100644
--- a/pkg/cpuid/cpuid.go
+++ b/pkg/cpuid/cpuid.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2019 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
-
 // Package cpuid provides basic functionality for creating and adjusting CPU
 // feature sets.
 //
@@ -21,1100 +19,20 @@
 // known platform, or HostFeatureSet()) and then add, remove, and test for
 // features as desired.
 //
-// For example: Test for hardware extended state saving, and if we don't have
-// it, don't expose AVX, which cannot be saved with fxsave.
+// For example: on x86, test for hardware extended state saving, and if
+// we don't have it, don't expose AVX, which cannot be saved with fxsave.
 //
 //   if !HostFeatureSet().HasFeature(X86FeatureXSAVE) {
 //     exposedFeatures.Remove(X86FeatureAVX)
 //   }
 package cpuid
 
-import (
-	"bytes"
-	"fmt"
-	"io/ioutil"
-	"strconv"
-	"strings"
-
-	"gvisor.dev/gvisor/pkg/log"
-)
-
-// Common references for CPUID leaves and bits:
-//
-// Intel:
-//   * Intel SDM Volume 2, Chapter 3.2 "CPUID" (more up-to-date)
-//   * Intel Application Note 485 (more detailed)
-//
-// AMD:
-//   * AMD64 APM Volume 3, Appendix 3 "Obtaining Processor Information ..."
-
 // Feature is a unique identifier for a particular cpu feature. We just use an
-// int as a feature number on x86.
+// int as a feature number on x86 and arm64.
 //
-// Features are numbered according to "blocks". Each block is 32 bits, and
-// feature bits from the same source (cpuid leaf/level) are in the same block.
-type Feature int
-
-// block is a collection of 32 Feature bits.
-type block int
-
-const blockSize = 32
-
-// Feature bits are numbered according to "blocks". Each block is 32 bits, and
+// On x86, features are numbered according to "blocks". Each block is 32 bits, and
 // feature bits from the same source (cpuid leaf/level) are in the same block.
-func featureID(b block, bit int) Feature {
-	return Feature(32*int(b) + bit)
-}
-
-// Block 0 constants are all of the "basic" feature bits returned by a cpuid in
-// ecx with eax=1.
-const (
-	X86FeatureSSE3 Feature = iota
-	X86FeaturePCLMULDQ
-	X86FeatureDTES64
-	X86FeatureMONITOR
-	X86FeatureDSCPL
-	X86FeatureVMX
-	X86FeatureSMX
-	X86FeatureEST
-	X86FeatureTM2
-	X86FeatureSSSE3 // Not a typo, "supplemental" SSE3.
-	X86FeatureCNXTID
-	X86FeatureSDBG
-	X86FeatureFMA
-	X86FeatureCX16
-	X86FeatureXTPR
-	X86FeaturePDCM
-	_ // ecx bit 16 is reserved.
-	X86FeaturePCID
-	X86FeatureDCA
-	X86FeatureSSE4_1
-	X86FeatureSSE4_2
-	X86FeatureX2APIC
-	X86FeatureMOVBE
-	X86FeaturePOPCNT
-	X86FeatureTSCD
-	X86FeatureAES
-	X86FeatureXSAVE
-	X86FeatureOSXSAVE
-	X86FeatureAVX
-	X86FeatureF16C
-	X86FeatureRDRAND
-	_ // ecx bit 31 is reserved.
-)
-
-// Block 1 constants are all of the "basic" feature bits returned by a cpuid in
-// edx with eax=1.
-const (
-	X86FeatureFPU Feature = 32 + iota
-	X86FeatureVME
-	X86FeatureDE
-	X86FeaturePSE
-	X86FeatureTSC
-	X86FeatureMSR
-	X86FeaturePAE
-	X86FeatureMCE
-	X86FeatureCX8
-	X86FeatureAPIC
-	_ // edx bit 10 is reserved.
-	X86FeatureSEP
-	X86FeatureMTRR
-	X86FeaturePGE
-	X86FeatureMCA
-	X86FeatureCMOV
-	X86FeaturePAT
-	X86FeaturePSE36
-	X86FeaturePSN
-	X86FeatureCLFSH
-	_ // edx bit 20 is reserved.
-	X86FeatureDS
-	X86FeatureACPI
-	X86FeatureMMX
-	X86FeatureFXSR
-	X86FeatureSSE
-	X86FeatureSSE2
-	X86FeatureSS
-	X86FeatureHTT
-	X86FeatureTM
-	X86FeatureIA64
-	X86FeaturePBE
-)
-
-// Block 2 bits are the "structured extended" features returned in ebx for
-// eax=7, ecx=0.
-const (
-	X86FeatureFSGSBase Feature = 2*32 + iota
-	X86FeatureTSC_ADJUST
-	_ // ebx bit 2 is reserved.
-	X86FeatureBMI1
-	X86FeatureHLE
-	X86FeatureAVX2
-	X86FeatureFDP_EXCPTN_ONLY
-	X86FeatureSMEP
-	X86FeatureBMI2
-	X86FeatureERMS
-	X86FeatureINVPCID
-	X86FeatureRTM
-	X86FeatureCQM
-	X86FeatureFPCSDS
-	X86FeatureMPX
-	X86FeatureRDT
-	X86FeatureAVX512F
-	X86FeatureAVX512DQ
-	X86FeatureRDSEED
-	X86FeatureADX
-	X86FeatureSMAP
-	X86FeatureAVX512IFMA
-	X86FeaturePCOMMIT
-	X86FeatureCLFLUSHOPT
-	X86FeatureCLWB
-	X86FeatureIPT // Intel processor trace.
-	X86FeatureAVX512PF
-	X86FeatureAVX512ER
-	X86FeatureAVX512CD
-	X86FeatureSHA
-	X86FeatureAVX512BW
-	X86FeatureAVX512VL
-)
-
-// Block 3 bits are the "extended" features returned in ecx for eax=7, ecx=0.
-const (
-	X86FeaturePREFETCHWT1 Feature = 3*32 + iota
-	X86FeatureAVX512VBMI
-	X86FeatureUMIP
-	X86FeaturePKU
-	X86FeatureOSPKE
-	X86FeatureWAITPKG
-	X86FeatureAVX512_VBMI2
-	_ // ecx bit 7 is reserved
-	X86FeatureGFNI
-	X86FeatureVAES
-	X86FeatureVPCLMULQDQ
-	X86FeatureAVX512_VNNI
-	X86FeatureAVX512_BITALG
-	X86FeatureTME
-	X86FeatureAVX512_VPOPCNTDQ
-	_ // ecx bit 15 is reserved
-	X86FeatureLA57
-	// ecx bits 17-21 are reserved
-	_
-	_
-	_
-	_
-	_
-	X86FeatureRDPID
-	// ecx bits 23-24 are reserved
-	_
-	_
-	X86FeatureCLDEMOTE
-	_ // ecx bit 26 is reserved
-	X86FeatureMOVDIRI
-	X86FeatureMOVDIR64B
-)
-
-// Block 4 constants are for xsave capabilities in CPUID.(EAX=0DH,ECX=01H):EAX.
-// The CPUID leaf is available only if 'X86FeatureXSAVE' is present.
-const (
-	X86FeatureXSAVEOPT Feature = 4*32 + iota
-	X86FeatureXSAVEC
-	X86FeatureXGETBV1
-	X86FeatureXSAVES
-	// EAX[31:4] are reserved.
-)
-
-// Block 5 constants are the extended feature bits in
-// CPUID.(EAX=0x80000001):ECX.
-const (
-	X86FeatureLAHF64 Feature = 5*32 + iota
-	X86FeatureCMP_LEGACY
-	X86FeatureSVM
-	X86FeatureEXTAPIC
-	X86FeatureCR8_LEGACY
-	X86FeatureLZCNT
-	X86FeatureSSE4A
-	X86FeatureMISALIGNSSE
-	X86FeaturePREFETCHW
-	X86FeatureOSVW
-	X86FeatureIBS
-	X86FeatureXOP
-	X86FeatureSKINIT
-	X86FeatureWDT
-	_ // ecx bit 14 is reserved.
-	X86FeatureLWP
-	X86FeatureFMA4
-	X86FeatureTCE
-	_ // ecx bit 18 is reserved.
-	_ // ecx bit 19 is reserved.
-	_ // ecx bit 20 is reserved.
-	X86FeatureTBM
-	X86FeatureTOPOLOGY
-	X86FeaturePERFCTR_CORE
-	X86FeaturePERFCTR_NB
-	_ // ecx bit 25 is reserved.
-	X86FeatureBPEXT
-	X86FeaturePERFCTR_TSC
-	X86FeaturePERFCTR_LLC
-	X86FeatureMWAITX
-	// ECX[31:30] are reserved.
-)
-
-// Block 6 constants are the extended feature bits in
-// CPUID.(EAX=0x80000001):EDX.
 //
-// These are sparse, and so the bit positions are assigned manually.
-const (
-	// On AMD, EDX[24:23] | EDX[17:12] | EDX[9:0] are duplicate features
-	// also defined in block 1 (in identical bit positions). Those features
-	// are not listed here.
-	block6DuplicateMask = 0x183f3ff
-
-	X86FeatureSYSCALL  Feature = 6*32 + 11
-	X86FeatureNX       Feature = 6*32 + 20
-	X86FeatureMMXEXT   Feature = 6*32 + 22
-	X86FeatureFXSR_OPT Feature = 6*32 + 25
-	X86FeatureGBPAGES  Feature = 6*32 + 26
-	X86FeatureRDTSCP   Feature = 6*32 + 27
-	X86FeatureLM       Feature = 6*32 + 29
-	X86Feature3DNOWEXT Feature = 6*32 + 30
-	X86Feature3DNOW    Feature = 6*32 + 31
-)
-
-// linuxBlockOrder defines the order in which linux organizes the feature
-// blocks. Linux also tracks feature bits in 32-bit blocks, but in an order
-// which doesn't match well here, so for the /proc/cpuinfo generation we simply
-// re-map the blocks to Linux's ordering and then go through the bits in each
-// block.
-var linuxBlockOrder = []block{1, 6, 0, 5, 2, 4, 3}
-
-// To make emulation of /proc/cpuinfo easy, these names match the names of the
-// basic features in Linux defined in arch/x86/kernel/cpu/capflags.c.
-var x86FeatureStrings = map[Feature]string{
-	// Block 0.
-	X86FeatureSSE3:     "pni",
-	X86FeaturePCLMULDQ: "pclmulqdq",
-	X86FeatureDTES64:   "dtes64",
-	X86FeatureMONITOR:  "monitor",
-	X86FeatureDSCPL:    "ds_cpl",
-	X86FeatureVMX:      "vmx",
-	X86FeatureSMX:      "smx",
-	X86FeatureEST:      "est",
-	X86FeatureTM2:      "tm2",
-	X86FeatureSSSE3:    "ssse3",
-	X86FeatureCNXTID:   "cid",
-	X86FeatureSDBG:     "sdbg",
-	X86FeatureFMA:      "fma",
-	X86FeatureCX16:     "cx16",
-	X86FeatureXTPR:     "xtpr",
-	X86FeaturePDCM:     "pdcm",
-	X86FeaturePCID:     "pcid",
-	X86FeatureDCA:      "dca",
-	X86FeatureSSE4_1:   "sse4_1",
-	X86FeatureSSE4_2:   "sse4_2",
-	X86FeatureX2APIC:   "x2apic",
-	X86FeatureMOVBE:    "movbe",
-	X86FeaturePOPCNT:   "popcnt",
-	X86FeatureTSCD:     "tsc_deadline_timer",
-	X86FeatureAES:      "aes",
-	X86FeatureXSAVE:    "xsave",
-	X86FeatureAVX:      "avx",
-	X86FeatureF16C:     "f16c",
-	X86FeatureRDRAND:   "rdrand",
-
-	// Block 1.
-	X86FeatureFPU:   "fpu",
-	X86FeatureVME:   "vme",
-	X86FeatureDE:    "de",
-	X86FeaturePSE:   "pse",
-	X86FeatureTSC:   "tsc",
-	X86FeatureMSR:   "msr",
-	X86FeaturePAE:   "pae",
-	X86FeatureMCE:   "mce",
-	X86FeatureCX8:   "cx8",
-	X86FeatureAPIC:  "apic",
-	X86FeatureSEP:   "sep",
-	X86FeatureMTRR:  "mtrr",
-	X86FeaturePGE:   "pge",
-	X86FeatureMCA:   "mca",
-	X86FeatureCMOV:  "cmov",
-	X86FeaturePAT:   "pat",
-	X86FeaturePSE36: "pse36",
-	X86FeaturePSN:   "pn",
-	X86FeatureCLFSH: "clflush",
-	X86FeatureDS:    "dts",
-	X86FeatureACPI:  "acpi",
-	X86FeatureMMX:   "mmx",
-	X86FeatureFXSR:  "fxsr",
-	X86FeatureSSE:   "sse",
-	X86FeatureSSE2:  "sse2",
-	X86FeatureSS:    "ss",
-	X86FeatureHTT:   "ht",
-	X86FeatureTM:    "tm",
-	X86FeatureIA64:  "ia64",
-	X86FeaturePBE:   "pbe",
-
-	// Block 2.
-	X86FeatureFSGSBase:   "fsgsbase",
-	X86FeatureTSC_ADJUST: "tsc_adjust",
-	X86FeatureBMI1:       "bmi1",
-	X86FeatureHLE:        "hle",
-	X86FeatureAVX2:       "avx2",
-	X86FeatureSMEP:       "smep",
-	X86FeatureBMI2:       "bmi2",
-	X86FeatureERMS:       "erms",
-	X86FeatureINVPCID:    "invpcid",
-	X86FeatureRTM:        "rtm",
-	X86FeatureCQM:        "cqm",
-	X86FeatureMPX:        "mpx",
-	X86FeatureRDT:        "rdt_a",
-	X86FeatureAVX512F:    "avx512f",
-	X86FeatureAVX512DQ:   "avx512dq",
-	X86FeatureRDSEED:     "rdseed",
-	X86FeatureADX:        "adx",
-	X86FeatureSMAP:       "smap",
-	X86FeatureCLWB:       "clwb",
-	X86FeatureAVX512PF:   "avx512pf",
-	X86FeatureAVX512ER:   "avx512er",
-	X86FeatureAVX512CD:   "avx512cd",
-	X86FeatureSHA:        "sha_ni",
-	X86FeatureAVX512BW:   "avx512bw",
-	X86FeatureAVX512VL:   "avx512vl",
-
-	// Block 3.
-	X86FeatureAVX512VBMI:       "avx512vbmi",
-	X86FeatureUMIP:             "umip",
-	X86FeaturePKU:              "pku",
-	X86FeatureOSPKE:            "ospke",
-	X86FeatureWAITPKG:          "waitpkg",
-	X86FeatureAVX512_VBMI2:     "avx512_vbmi2",
-	X86FeatureGFNI:             "gfni",
-	X86FeatureVAES:             "vaes",
-	X86FeatureVPCLMULQDQ:       "vpclmulqdq",
-	X86FeatureAVX512_VNNI:      "avx512_vnni",
-	X86FeatureAVX512_BITALG:    "avx512_bitalg",
-	X86FeatureTME:              "tme",
-	X86FeatureAVX512_VPOPCNTDQ: "avx512_vpopcntdq",
-	X86FeatureLA57:             "la57",
-	X86FeatureRDPID:            "rdpid",
-	X86FeatureCLDEMOTE:         "cldemote",
-	X86FeatureMOVDIRI:          "movdiri",
-	X86FeatureMOVDIR64B:        "movdir64b",
-
-	// Block 4.
-	X86FeatureXSAVEOPT: "xsaveopt",
-	X86FeatureXSAVEC:   "xsavec",
-	X86FeatureXGETBV1:  "xgetbv1",
-	X86FeatureXSAVES:   "xsaves",
-
-	// Block 5.
-	X86FeatureLAHF64:       "lahf_lm", // LAHF/SAHF in long mode
-	X86FeatureCMP_LEGACY:   "cmp_legacy",
-	X86FeatureSVM:          "svm",
-	X86FeatureEXTAPIC:      "extapic",
-	X86FeatureCR8_LEGACY:   "cr8_legacy",
-	X86FeatureLZCNT:        "abm", // Advanced bit manipulation
-	X86FeatureSSE4A:        "sse4a",
-	X86FeatureMISALIGNSSE:  "misalignsse",
-	X86FeaturePREFETCHW:    "3dnowprefetch",
-	X86FeatureOSVW:         "osvw",
-	X86FeatureIBS:          "ibs",
-	X86FeatureXOP:          "xop",
-	X86FeatureSKINIT:       "skinit",
-	X86FeatureWDT:          "wdt",
-	X86FeatureLWP:          "lwp",
-	X86FeatureFMA4:         "fma4",
-	X86FeatureTCE:          "tce",
-	X86FeatureTBM:          "tbm",
-	X86FeatureTOPOLOGY:     "topoext",
-	X86FeaturePERFCTR_CORE: "perfctr_core",
-	X86FeaturePERFCTR_NB:   "perfctr_nb",
-	X86FeatureBPEXT:        "bpext",
-	X86FeaturePERFCTR_TSC:  "ptsc",
-	X86FeaturePERFCTR_LLC:  "perfctr_llc",
-	X86FeatureMWAITX:       "mwaitx",
-
-	// Block 6.
-	X86FeatureSYSCALL:  "syscall",
-	X86FeatureNX:       "nx",
-	X86FeatureMMXEXT:   "mmxext",
-	X86FeatureFXSR_OPT: "fxsr_opt",
-	X86FeatureGBPAGES:  "pdpe1gb",
-	X86FeatureRDTSCP:   "rdtscp",
-	X86FeatureLM:       "lm",
-	X86Feature3DNOWEXT: "3dnowext",
-	X86Feature3DNOW:    "3dnow",
-}
-
-// These flags are parse only---they can be used for setting / unsetting the
-// flags, but will not get printed out in /proc/cpuinfo.
-var x86FeatureParseOnlyStrings = map[Feature]string{
-	// Block 0.
-	X86FeatureOSXSAVE: "osxsave",
-
-	// Block 2.
-	X86FeatureFDP_EXCPTN_ONLY: "fdp_excptn_only",
-	X86FeatureFPCSDS:          "fpcsds",
-	X86FeatureIPT:             "pt",
-	X86FeatureCLFLUSHOPT:      "clfushopt",
-
-	// Block 3.
-	X86FeaturePREFETCHWT1: "prefetchwt1",
-}
-
-// intelCacheDescriptors describe the caches and TLBs on the system. They are
-// returned in the registers for eax=2. Intel only.
-type intelCacheDescriptor uint8
-
-// Valid cache/TLB descriptors. All descriptors can be found in Intel SDM Vol.
-// 2, Ch. 3.2, "CPUID", Table 3-12 "Encoding of CPUID Leaf 2 Descriptors".
-const (
-	intelNullDescriptor    intelCacheDescriptor = 0
-	intelNoTLBDescriptor   intelCacheDescriptor = 0xfe
-	intelNoCacheDescriptor intelCacheDescriptor = 0xff
-
-	// Most descriptors omitted for brevity as they are currently unused.
-)
-
-// CacheType describes the type of a cache, as returned in eax[4:0] for eax=4.
-type CacheType uint8
-
-const (
-	// cacheNull indicates that there are no more entries.
-	cacheNull CacheType = iota
-
-	// CacheData is a data cache.
-	CacheData
-
-	// CacheInstruction is an instruction cache.
-	CacheInstruction
-
-	// CacheUnified is a unified instruction and data cache.
-	CacheUnified
-)
-
-// Cache describes the parameters of a single cache on the system.
-//
-// +stateify savable
-type Cache struct {
-	// Level is the hierarchical level of this cache (L1, L2, etc).
-	Level uint32
-
-	// Type is the type of cache.
-	Type CacheType
-
-	// FullyAssociative indicates that entries may be placed in any block.
-	FullyAssociative bool
-
-	// Partitions is the number of physical partitions in the cache.
-	Partitions uint32
-
-	// Ways is the number of ways of associativity in the cache.
-	Ways uint32
-
-	// Sets is the number of sets in the cache.
-	Sets uint32
-
-	// InvalidateHierarchical indicates that WBINVD/INVD from threads
-	// sharing this cache acts upon lower level caches for threads sharing
-	// this cache.
-	InvalidateHierarchical bool
-
-	// Inclusive indicates that this cache is inclusive of lower cache
-	// levels.
-	Inclusive bool
-
-	// DirectMapped indicates that this cache is directly mapped from
-	// address, rather than using a hash function.
-	DirectMapped bool
-}
-
-// Just a way to wrap cpuid function numbers.
-type cpuidFunction uint32
-
-// The constants below are the lower or "standard" cpuid functions, ordered as
-// defined by the hardware.
-const (
-	vendorID                      cpuidFunction = iota // Returns vendor ID and largest standard function.
-	featureInfo                                        // Returns basic feature bits and processor signature.
-	intelCacheDescriptors                              // Returns list of cache descriptors. Intel only.
-	intelSerialNumber                                  // Returns processor serial number (obsolete on new hardware). Intel only.
-	intelDeterministicCacheParams                      // Returns deterministic cache information. Intel only.
-	monitorMwaitParams                                 // Returns information about monitor/mwait instructions.
-	powerParams                                        // Returns information about power management and thermal sensors.
-	extendedFeatureInfo                                // Returns extended feature bits.
-	_                                                  // Function 0x8 is reserved.
-	intelDCAParams                                     // Returns direct cache access information. Intel only.
-	intelPMCInfo                                       // Returns information about performance monitoring features. Intel only.
-	intelX2APICInfo                                    // Returns core/logical processor topology. Intel only.
-	_                                                  // Function 0xc is reserved.
-	xSaveInfo                                          // Returns information about extended state management.
-)
-
-// The "extended" functions start at 0x80000000.
-const (
-	extendedFunctionInfo cpuidFunction = 0x80000000 + iota // Returns highest available extended function in eax.
-	extendedFeatures                                       // Returns some extended feature bits in edx and ecx.
-)
-
-// These are the extended floating point state features. They are used to
-// enumerate floating point features in XCR0, XSTATE_BV, etc.
-const (
-	XSAVEFeatureX87         = 1 << 0
-	XSAVEFeatureSSE         = 1 << 1
-	XSAVEFeatureAVX         = 1 << 2
-	XSAVEFeatureBNDREGS     = 1 << 3
-	XSAVEFeatureBNDCSR      = 1 << 4
-	XSAVEFeatureAVX512op    = 1 << 5
-	XSAVEFeatureAVX512zmm0  = 1 << 6
-	XSAVEFeatureAVX512zmm16 = 1 << 7
-	XSAVEFeaturePKRU        = 1 << 9
-)
-
-var cpuFreqMHz float64
-
-// x86FeaturesFromString includes features from x86FeatureStrings and
-// x86FeatureParseOnlyStrings.
-var x86FeaturesFromString = make(map[string]Feature)
-
-// FeatureFromString returns the Feature associated with the given feature
-// string plus a bool to indicate if it could find the feature.
-func FeatureFromString(s string) (Feature, bool) {
-	f, b := x86FeaturesFromString[s]
-	return f, b
-}
-
-// String implements fmt.Stringer.
-func (f Feature) String() string {
-	if s := f.flagString(false); s != "" {
-		return s
-	}
-
-	block := int(f) / 32
-	bit := int(f) % 32
-	return fmt.Sprintf("<cpuflag %d; block %d bit %d>", f, block, bit)
-}
-
-func (f Feature) flagString(cpuinfoOnly bool) string {
-	if s, ok := x86FeatureStrings[f]; ok {
-		return s
-	}
-	if !cpuinfoOnly {
-		return x86FeatureParseOnlyStrings[f]
-	}
-	return ""
-}
-
-// FeatureSet is a set of Features for a CPU.
-//
-// +stateify savable
-type FeatureSet struct {
-	// Set is the set of features that are enabled in this FeatureSet.
-	Set map[Feature]bool
-
-	// VendorID is the 12-char string returned in ebx:edx:ecx for eax=0.
-	VendorID string
-
-	// ExtendedFamily is part of the processor signature.
-	ExtendedFamily uint8
-
-	// ExtendedModel is part of the processor signature.
-	ExtendedModel uint8
-
-	// ProcessorType is part of the processor signature.
-	ProcessorType uint8
-
-	// Family is part of the processor signature.
-	Family uint8
-
-	// Model is part of the processor signature.
-	Model uint8
-
-	// SteppingID is part of the processor signature.
-	SteppingID uint8
-
-	// Caches describes the caches on the CPU.
-	Caches []Cache
-
-	// CacheLine is the size of a cache line in bytes.
-	//
-	// All caches use the same line size. This is not enforced in the CPUID
-	// encoding, but is true on all known x86 processors.
-	CacheLine uint32
-}
-
-// FlagsString prints out supported CPU flags. If cpuinfoOnly is true, it is
-// equivalent to the "flags" field in /proc/cpuinfo.
-func (fs *FeatureSet) FlagsString(cpuinfoOnly bool) string {
-	var s []string
-	for _, b := range linuxBlockOrder {
-		for i := 0; i < blockSize; i++ {
-			if f := featureID(b, i); fs.Set[f] {
-				if fstr := f.flagString(cpuinfoOnly); fstr != "" {
-					s = append(s, fstr)
-				}
-			}
-		}
-	}
-	return strings.Join(s, " ")
-}
-
-// WriteCPUInfoTo is to generate a section of one cpu in /proc/cpuinfo. This is
-// a minimal /proc/cpuinfo, it is missing some fields like "microcode" that are
-// not always printed in Linux. The bogomips field is simply made up.
-func (fs FeatureSet) WriteCPUInfoTo(cpu uint, b *bytes.Buffer) {
-	fmt.Fprintf(b, "processor\t: %d\n", cpu)
-	fmt.Fprintf(b, "vendor_id\t: %s\n", fs.VendorID)
-	fmt.Fprintf(b, "cpu family\t: %d\n", ((fs.ExtendedFamily<<4)&0xff)|fs.Family)
-	fmt.Fprintf(b, "model\t\t: %d\n", ((fs.ExtendedModel<<4)&0xff)|fs.Model)
-	fmt.Fprintf(b, "model name\t: %s\n", "unknown") // Unknown for now.
-	fmt.Fprintf(b, "stepping\t: %s\n", "unknown")   // Unknown for now.
-	fmt.Fprintf(b, "cpu MHz\t\t: %.3f\n", cpuFreqMHz)
-	fmt.Fprintln(b, "fpu\t\t: yes")
-	fmt.Fprintln(b, "fpu_exception\t: yes")
-	fmt.Fprintf(b, "cpuid level\t: %d\n", uint32(xSaveInfo)) // Same as ax in vendorID.
-	fmt.Fprintln(b, "wp\t\t: yes")
-	fmt.Fprintf(b, "flags\t\t: %s\n", fs.FlagsString(true))
-	fmt.Fprintf(b, "bogomips\t: %.02f\n", cpuFreqMHz) // It's bogus anyway.
-	fmt.Fprintf(b, "clflush size\t: %d\n", fs.CacheLine)
-	fmt.Fprintf(b, "cache_alignment\t: %d\n", fs.CacheLine)
-	fmt.Fprintf(b, "address sizes\t: %d bits physical, %d bits virtual\n", 46, 48)
-	fmt.Fprintln(b, "power management:") // This is always here, but can be blank.
-	fmt.Fprintln(b, "")                  // The /proc/cpuinfo file ends with an extra newline.
-}
-
-const (
-	amdVendorID   = "AuthenticAMD"
-	intelVendorID = "GenuineIntel"
-)
-
-// AMD returns true if fs describes an AMD CPU.
-func (fs *FeatureSet) AMD() bool {
-	return fs.VendorID == amdVendorID
-}
-
-// Intel returns true if fs describes an Intel CPU.
-func (fs *FeatureSet) Intel() bool {
-	return fs.VendorID == intelVendorID
-}
-
-// ErrIncompatible is returned by FeatureSet.HostCompatible if fs is not a
-// subset of the host feature set.
-type ErrIncompatible struct {
-	message string
-}
-
-// Error implements error.
-func (e ErrIncompatible) Error() string {
-	return e.message
-}
-
-// CheckHostCompatible returns nil if fs is a subset of the host feature set.
-func (fs *FeatureSet) CheckHostCompatible() error {
-	hfs := HostFeatureSet()
-
-	if diff := fs.Subtract(hfs); diff != nil {
-		return ErrIncompatible{fmt.Sprintf("CPU feature set %v incompatible with host feature set %v (missing: %v)", fs.FlagsString(false), hfs.FlagsString(false), diff)}
-	}
-
-	// The size of a cache line must match, as it is critical to correctly
-	// utilizing CLFLUSH. Other cache properties are allowed to change, as
-	// they are not important to correctness.
-	if fs.CacheLine != hfs.CacheLine {
-		return ErrIncompatible{fmt.Sprintf("CPU cache line size %d incompatible with host cache line size %d", fs.CacheLine, hfs.CacheLine)}
-	}
-
-	return nil
-}
-
-// Helper to convert 3 regs into 12-byte vendor ID.
-func vendorIDFromRegs(bx, cx, dx uint32) string {
-	bytes := make([]byte, 0, 12)
-	for i := uint(0); i < 4; i++ {
-		b := byte(bx >> (i * 8))
-		bytes = append(bytes, b)
-	}
-
-	for i := uint(0); i < 4; i++ {
-		b := byte(dx >> (i * 8))
-		bytes = append(bytes, b)
-	}
-
-	for i := uint(0); i < 4; i++ {
-		b := byte(cx >> (i * 8))
-		bytes = append(bytes, b)
-	}
-	return string(bytes)
-}
-
-// ExtendedStateSize returns the number of bytes needed to save the "extended
-// state" for this processor and the boundary it must be aligned to. Extended
-// state includes floating point registers, and other cpu state that's not
-// associated with the normal task context.
-//
-// Note: We can save some space here with an optimization where we use a
-// smaller chunk of memory depending on features that are actually enabled.
-// Currently we just use the largest possible size for simplicity (which is
-// about 2.5K worst case, with avx512).
-func (fs *FeatureSet) ExtendedStateSize() (size, align uint) {
-	if fs.UseXsave() {
-		// Leaf 0 of xsaveinfo function returns the size for currently
-		// enabled xsave features in ebx, the maximum size if all valid
-		// features are saved with xsave in ecx, and valid XCR0 bits in
-		// edx:eax.
-		_, _, maxSize, _ := HostID(uint32(xSaveInfo), 0)
-		return uint(maxSize), 64
-	}
-
-	// If we don't support xsave, we fall back to fxsave, which requires
-	// 512 bytes aligned to 16 bytes.
-	return 512, 16
-}
-
-// ValidXCR0Mask returns the bits that may be set to 1 in control register
-// XCR0.
-func (fs *FeatureSet) ValidXCR0Mask() uint64 {
-	if !fs.UseXsave() {
-		return 0
-	}
-	eax, _, _, edx := HostID(uint32(xSaveInfo), 0)
-	return uint64(edx)<<32 | uint64(eax)
-}
-
-// vendorIDRegs returns the 3 register values used to construct the 12-byte
-// vendor ID string for eax=0.
-func (fs *FeatureSet) vendorIDRegs() (bx, dx, cx uint32) {
-	for i := uint(0); i < 4; i++ {
-		bx |= uint32(fs.VendorID[i]) << (i * 8)
-	}
-
-	for i := uint(0); i < 4; i++ {
-		dx |= uint32(fs.VendorID[i+4]) << (i * 8)
-	}
-
-	for i := uint(0); i < 4; i++ {
-		cx |= uint32(fs.VendorID[i+8]) << (i * 8)
-	}
-	return
-}
-
-// signature returns the signature dword that's returned in eax when eax=1.
-func (fs *FeatureSet) signature() uint32 {
-	var s uint32
-	s |= uint32(fs.SteppingID & 0xf)
-	s |= uint32(fs.Model&0xf) << 4
-	s |= uint32(fs.Family&0xf) << 8
-	s |= uint32(fs.ProcessorType&0x3) << 12
-	s |= uint32(fs.ExtendedModel&0xf) << 16
-	s |= uint32(fs.ExtendedFamily&0xff) << 20
-	return s
-}
-
-// Helper to deconstruct signature dword.
-func signatureSplit(v uint32) (ef, em, pt, f, m, sid uint8) {
-	sid = uint8(v & 0xf)
-	m = uint8(v>>4) & 0xf
-	f = uint8(v>>8) & 0xf
-	pt = uint8(v>>12) & 0x3
-	em = uint8(v>>16) & 0xf
-	ef = uint8(v >> 20)
-	return
-}
-
-// Helper to convert blockwise feature bit masks into a set of features. Masks
-// must be provided in order for each block, without skipping them. If a block
-// does not matter for this feature set, 0 is specified.
-func setFromBlockMasks(blocks ...uint32) map[Feature]bool {
-	s := make(map[Feature]bool)
-	for b, blockMask := range blocks {
-		for i := 0; i < blockSize; i++ {
-			if blockMask&1 != 0 {
-				s[featureID(block(b), i)] = true
-			}
-			blockMask >>= 1
-		}
-	}
-	return s
-}
-
-// blockMask returns the 32-bit mask associated with a block of features.
-func (fs *FeatureSet) blockMask(b block) uint32 {
-	var mask uint32
-	for i := 0; i < blockSize; i++ {
-		if fs.Set[featureID(b, i)] {
-			mask |= 1 << uint(i)
-		}
-	}
-	return mask
-}
-
-// Remove removes a Feature from a FeatureSet. It ignores features
-// that are not in the FeatureSet.
-func (fs *FeatureSet) Remove(feature Feature) {
-	delete(fs.Set, feature)
-}
-
-// Add adds a Feature to a FeatureSet. It ignores duplicate features.
-func (fs *FeatureSet) Add(feature Feature) {
-	fs.Set[feature] = true
-}
-
-// HasFeature tests whether or not a feature is in the given feature set.
-func (fs *FeatureSet) HasFeature(feature Feature) bool {
-	return fs.Set[feature]
-}
-
-// Subtract returns the features present in fs that are not present in other.
-// If all features in fs are present in other, Subtract returns nil.
-func (fs *FeatureSet) Subtract(other *FeatureSet) (diff map[Feature]bool) {
-	for f := range fs.Set {
-		if !other.Set[f] {
-			if diff == nil {
-				diff = make(map[Feature]bool)
-			}
-			diff[f] = true
-		}
-	}
-
-	return
-}
-
-// EmulateID emulates a cpuid instruction based on the feature set.
-func (fs *FeatureSet) EmulateID(origAx, origCx uint32) (ax, bx, cx, dx uint32) {
-	switch cpuidFunction(origAx) {
-	case vendorID:
-		ax = uint32(xSaveInfo) // 0xd (xSaveInfo) is the highest function we support.
-		bx, dx, cx = fs.vendorIDRegs()
-	case featureInfo:
-		// CLFLUSH line size is encoded in quadwords. Other fields in bx unsupported.
-		bx = (fs.CacheLine / 8) << 8
-		cx = fs.blockMask(block(0))
-		dx = fs.blockMask(block(1))
-		ax = fs.signature()
-	case intelCacheDescriptors:
-		if !fs.Intel() {
-			// Reserved on non-Intel.
-			return 0, 0, 0, 0
-		}
-
-		// "The least-significant byte in register EAX (register AL)
-		// will always return 01H. Software should ignore this value
-		// and not interpret it as an informational descriptor." - SDM
-		//
-		// We only support reporting cache parameters via
-		// intelDeterministicCacheParams; report as much here.
-		//
-		// We do not support exposing TLB information at all.
-		ax = 1 | (uint32(intelNoCacheDescriptor) << 8)
-	case intelDeterministicCacheParams:
-		if !fs.Intel() {
-			// Reserved on non-Intel.
-			return 0, 0, 0, 0
-		}
-
-		// cx is the index of the cache to describe.
-		if int(origCx) >= len(fs.Caches) {
-			return uint32(cacheNull), 0, 0, 0
-		}
-		c := fs.Caches[origCx]
-
-		ax = uint32(c.Type)
-		ax |= c.Level << 5
-		ax |= 1 << 8 // Always claim the cache is "self-initializing".
-		if c.FullyAssociative {
-			ax |= 1 << 9
-		}
-		// Processor topology not supported.
-
-		bx = fs.CacheLine - 1
-		bx |= (c.Partitions - 1) << 12
-		bx |= (c.Ways - 1) << 22
-
-		cx = c.Sets - 1
-
-		if !c.InvalidateHierarchical {
-			dx |= 1
-		}
-		if c.Inclusive {
-			dx |= 1 << 1
-		}
-		if !c.DirectMapped {
-			dx |= 1 << 2
-		}
-	case xSaveInfo:
-		if !fs.UseXsave() {
-			return 0, 0, 0, 0
-		}
-		return HostID(uint32(xSaveInfo), origCx)
-	case extendedFeatureInfo:
-		if origCx != 0 {
-			break // Only leaf 0 is supported.
-		}
-		bx = fs.blockMask(block(2))
-		cx = fs.blockMask(block(3))
-	case extendedFunctionInfo:
-		// We only support showing the extended features.
-		ax = uint32(extendedFeatures)
-		cx = 0
-	case extendedFeatures:
-		cx = fs.blockMask(block(5))
-		dx = fs.blockMask(block(6))
-		if fs.AMD() {
-			// AMD duplicates some block 1 features in block 6.
-			dx |= fs.blockMask(block(1)) & block6DuplicateMask
-		}
-	}
-
-	return
-}
-
-// UseXsave returns the choice of fp state saving instruction.
-func (fs *FeatureSet) UseXsave() bool {
-	return fs.HasFeature(X86FeatureXSAVE) && fs.HasFeature(X86FeatureOSXSAVE)
-}
-
-// UseXsaveopt returns true if 'fs' supports the "xsaveopt" instruction.
-func (fs *FeatureSet) UseXsaveopt() bool {
-	return fs.UseXsave() && fs.HasFeature(X86FeatureXSAVEOPT)
-}
-
-// HostID executes a native CPUID instruction.
-func HostID(axArg, cxArg uint32) (ax, bx, cx, dx uint32)
-
-// HostFeatureSet uses cpuid to get host values and construct a feature set
-// that matches that of the host machine. Note that there are several places
-// where there appear to be some unnecessary assignments between register names
-// (ax, bx, cx, or dx) and featureBlockN variables. This is to explicitly show
-// where the different feature blocks come from, to make the code easier to
-// inspect and read.
-func HostFeatureSet() *FeatureSet {
-	// eax=0 gets max supported feature and vendor ID.
-	_, bx, cx, dx := HostID(0, 0)
-	vendorID := vendorIDFromRegs(bx, cx, dx)
-
-	// eax=1 gets basic features in ecx:edx.
-	ax, bx, cx, dx := HostID(1, 0)
-	featureBlock0 := cx
-	featureBlock1 := dx
-	ef, em, pt, f, m, sid := signatureSplit(ax)
-	cacheLine := 8 * (bx >> 8) & 0xff
-
-	// eax=4, ecx=i gets details about cache index i. Only supported on Intel.
-	var caches []Cache
-	if vendorID == intelVendorID {
-		// ecx selects the cache index until a null type is returned.
-		for i := uint32(0); ; i++ {
-			ax, bx, cx, dx := HostID(4, i)
-			t := CacheType(ax & 0xf)
-			if t == cacheNull {
-				break
-			}
-
-			lineSize := (bx & 0xfff) + 1
-			if lineSize != cacheLine {
-				panic(fmt.Sprintf("Mismatched cache line size: %d vs %d", lineSize, cacheLine))
-			}
-
-			caches = append(caches, Cache{
-				Type:                   t,
-				Level:                  (ax >> 5) & 0x7,
-				FullyAssociative:       ((ax >> 9) & 1) == 1,
-				Partitions:             ((bx >> 12) & 0x3ff) + 1,
-				Ways:                   ((bx >> 22) & 0x3ff) + 1,
-				Sets:                   cx + 1,
-				InvalidateHierarchical: (dx & 1) == 0,
-				Inclusive:              ((dx >> 1) & 1) == 1,
-				DirectMapped:           ((dx >> 2) & 1) == 0,
-			})
-		}
-	}
-
-	// eax=7, ecx=0 gets extended features in ecx:ebx.
-	_, bx, cx, _ = HostID(7, 0)
-	featureBlock2 := bx
-	featureBlock3 := cx
-
-	// Leaf 0xd is supported only if CPUID.1:ECX.XSAVE[bit 26] is set.
-	var featureBlock4 uint32
-	if (featureBlock0 & (1 << 26)) != 0 {
-		featureBlock4, _, _, _ = HostID(uint32(xSaveInfo), 1)
-	}
-
-	// eax=0x80000000 gets supported extended levels. We use this to
-	// determine if there are any non-zero block 4 or block 6 bits to find.
-	var featureBlock5, featureBlock6 uint32
-	if ax, _, _, _ := HostID(uint32(extendedFunctionInfo), 0); ax >= uint32(extendedFeatures) {
-		// eax=0x80000001 gets AMD added feature bits.
-		_, _, cx, dx = HostID(uint32(extendedFeatures), 0)
-		featureBlock5 = cx
-		// Ignore features duplicated from block 1 on AMD. These bits
-		// are reserved on Intel.
-		featureBlock6 = dx &^ block6DuplicateMask
-	}
-
-	set := setFromBlockMasks(featureBlock0, featureBlock1, featureBlock2, featureBlock3, featureBlock4, featureBlock5, featureBlock6)
-	return &FeatureSet{
-		Set:            set,
-		VendorID:       vendorID,
-		ExtendedFamily: ef,
-		ExtendedModel:  em,
-		ProcessorType:  pt,
-		Family:         f,
-		Model:          m,
-		SteppingID:     sid,
-		CacheLine:      cacheLine,
-		Caches:         caches,
-	}
-}
-
-// Reads max cpu frequency from host /proc/cpuinfo. Must run before
-// whitelisting. This value is used to create the fake /proc/cpuinfo from a
-// FeatureSet.
-func initCPUFreq() {
-	cpuinfob, err := ioutil.ReadFile("/proc/cpuinfo")
-	if err != nil {
-		// Leave it as 0... The standalone VDSO bails out in the same
-		// way.
-		log.Warningf("Could not read /proc/cpuinfo: %v", err)
-		return
-	}
-	cpuinfo := string(cpuinfob)
-
-	// We get the value straight from host /proc/cpuinfo. On machines with
-	// frequency scaling enabled, this will only get the current value
-	// which will likely be inaccurate. This is fine on machines with
-	// frequency scaling disabled.
-	for _, line := range strings.Split(cpuinfo, "\n") {
-		if strings.Contains(line, "cpu MHz") {
-			splitMHz := strings.Split(line, ":")
-			if len(splitMHz) < 2 {
-				log.Warningf("Could not read /proc/cpuinfo: malformed cpu MHz line")
-				return
-			}
-
-			// If there was a problem, leave cpuFreqMHz as 0.
-			var err error
-			cpuFreqMHz, err = strconv.ParseFloat(strings.TrimSpace(splitMHz[1]), 64)
-			if err != nil {
-				log.Warningf("Could not parse cpu MHz value %v: %v", splitMHz[1], err)
-				cpuFreqMHz = 0
-				return
-			}
-			return
-		}
-	}
-	log.Warningf("Could not parse /proc/cpuinfo, it is empty or does not contain cpu MHz")
-}
-
-func initFeaturesFromString() {
-	for f, s := range x86FeatureStrings {
-		x86FeaturesFromString[s] = f
-	}
-	for f, s := range x86FeatureParseOnlyStrings {
-		x86FeaturesFromString[s] = f
-	}
-}
-
-func init() {
-	// initCpuFreq must be run before whitelists are enabled.
-	initCPUFreq()
-	initFeaturesFromString()
-}
+// On arm64, features are numbered according to the ELF HWCAP definition.
+// arch/arm64/include/uapi/asm/hwcap.h
+type Feature int
diff --git a/pkg/cpuid/cpuid_arm64.go b/pkg/cpuid/cpuid_arm64.go
new file mode 100644
index 000000000..6d71290c9
--- /dev/null
+++ b/pkg/cpuid/cpuid_arm64.go
@@ -0,0 +1,461 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package cpuid
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io/ioutil"
+	"strconv"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+// ARM64 doesn't have a 'cpuid' equivalent, which means it have no architected
+// discovery mechanism for hardware features available to userspace code at EL0.
+// The kernel exposes the presence of these features to userspace through a set
+// of flags(HWCAP/HWCAP2) bits, exposed in the auxilliary vector.
+// Ref Documentation/arm64/elf_hwcaps.rst for more info.
+//
+// Currently, only the HWCAP bits are supported.
+
+const (
+	// Single and double precision float point types.
+	ARM64FeatureFP Feature = iota
+
+	// Advanced SIMD with single and double precision
+	// float point arithmetic.
+	ARM64FeatureASIMD
+
+	// The generic timer is configured to generate
+	// events at a frequency of approximately 100KHz.
+	ARM64FeatureEVTSTRM
+
+	// AES instructions(AESE/AESD/AESMC/AESIMC).
+	ARM64FeatureAES
+
+	// AES instructions(PMULL/PMULL2).
+	ARM64FeaturePMULL
+
+	// SHA1 instructions(SHA1C/SHA1P/SHA1M etc).
+	ARM64FeatureSHA1
+
+	// SHA2 instructions(SHA256H/SHA256H2/SHA256SU0 etc).
+	ARM64FeatureSHA2
+
+	// CRC32 instructions(CRC32B/CRC32H/CRC32W etc).
+	ARM64FeatureCRC32
+
+	// Atomic instructions(LDADD/LDCLR/LDEOR/LDSET etc).
+	ARM64FeatureATOMICS
+
+	// Half precision float point arithmetic.
+	ARM64FeatureFPHP
+
+	// ASIMD with half precision float point arithmetic.
+	ARM64FeatureASIMDHP
+
+	// EL0 access to certain ID registers is available.
+	ARM64FeatureCPUID
+
+	// SQRDMLAH and SQRDMLSH instructions implemented.
+	ARM64FeatureASIMDRDM
+
+	// The FJCVTZS instruction is implemented.
+	ARM64FeatureJSCVT
+
+	// The FCMLA and FCADD instructions are implemented.
+	ARM64FeatureFCMA
+
+	// The LDAPRB/LDAPRH/LDAPR instructions are implemented.
+	ARM64FeatureLRCPC
+
+	// DC instruction(DC CVAP) supported.
+	ARM64FeatureDCPOP
+
+	// SHA3 instructions(EOR3/RAX1/XAR/BCAX) implemented.
+	ARM64FeatureSHA3
+
+	// SM3 instructions(SM3SS1/SM3TT1A/SM3TT1B) implemented.
+	ARM64FeatureSM3
+
+	// SM4 instructions(SM4E/SM4EKEY) implemented.
+	ARM64FeatureSM4
+
+	// Dot Product instructions(UDOT/SDOT) implemented.
+	ARM64FeatureASIMDDP
+
+	// SHA2 instructions(SHA512H/SHA512H2/SHA512SU0) implemented.
+	ARM64FeatureSHA512
+
+	// Scalable Vector Extension implemented.
+	ARM64FeatureSVE
+
+	// FMLAL and FMLSL instructions are implemented.
+	ARM64FeatureASIMDFHM
+)
+
+// ELF auxiliary vector tags
+const (
+	_AT_NULL   = 0  // End of vector
+	_AT_HWCAP  = 16 // hardware capability bit vector
+	_AT_HWCAP2 = 26 // hardware capability bit vector 2
+)
+
+// These should not be changed after they are initialized.
+var hwCap uint
+
+// To make emulation of /proc/cpuinfo easy, these names match the names of the
+// basic features in Linux defined in arch/arm64/kernel/cpuinfo.c.
+var arm64FeatureStrings = map[Feature]string{
+	ARM64FeatureFP:       "fp",
+	ARM64FeatureASIMD:    "asimd",
+	ARM64FeatureEVTSTRM:  "evtstrm",
+	ARM64FeatureAES:      "aes",
+	ARM64FeaturePMULL:    "pmull",
+	ARM64FeatureSHA1:     "sha1",
+	ARM64FeatureSHA2:     "sha2",
+	ARM64FeatureCRC32:    "crc32",
+	ARM64FeatureATOMICS:  "atomics",
+	ARM64FeatureFPHP:     "fphp",
+	ARM64FeatureASIMDHP:  "asimdhp",
+	ARM64FeatureCPUID:    "cpuid",
+	ARM64FeatureASIMDRDM: "asimdrdm",
+	ARM64FeatureJSCVT:    "jscvt",
+	ARM64FeatureFCMA:     "fcma",
+	ARM64FeatureLRCPC:    "lrcpc",
+	ARM64FeatureDCPOP:    "dcpop",
+	ARM64FeatureSHA3:     "sha3",
+	ARM64FeatureSM3:      "sm3",
+	ARM64FeatureSM4:      "sm4",
+	ARM64FeatureASIMDDP:  "asimddp",
+	ARM64FeatureSHA512:   "sha512",
+	ARM64FeatureSVE:      "sve",
+	ARM64FeatureASIMDFHM: "asimdfhm",
+}
+
+var (
+	cpuFreqMHz float64
+	cpuImplHex uint64
+	cpuArchDec uint64
+	cpuVarHex  uint64
+	cpuPartHex uint64
+	cpuRevDec  uint64
+)
+
+// arm64FeaturesFromString includes features from arm64FeatureStrings.
+var arm64FeaturesFromString = make(map[string]Feature)
+
+// FeatureFromString returns the Feature associated with the given feature
+// string plus a bool to indicate if it could find the feature.
+func FeatureFromString(s string) (Feature, bool) {
+	f, b := arm64FeaturesFromString[s]
+	return f, b
+}
+
+// String implements fmt.Stringer.
+func (f Feature) String() string {
+	if s := f.flagString(); s != "" {
+		return s
+	}
+
+	return fmt.Sprintf("<cpuflag %d>", f)
+}
+
+func (f Feature) flagString() string {
+	if s, ok := arm64FeatureStrings[f]; ok {
+		return s
+	}
+
+	return ""
+}
+
+// FeatureSet is a set of Features for a CPU.
+//
+// +stateify savable
+type FeatureSet struct {
+	// Set is the set of features that are enabled in this FeatureSet.
+	Set map[Feature]bool
+
+	// CPUImplementer is part of the processor signature.
+	CPUImplementer uint8
+
+	// CPUArchitecture is part of the processor signature.
+	CPUArchitecture uint8
+
+	// CPUVariant is part of the processor signature.
+	CPUVariant uint8
+
+	// CPUPartnum is part of the processor signature.
+	CPUPartnum uint16
+
+	// CPURevision is part of the processor signature.
+	CPURevision uint8
+}
+
+// CheckHostCompatible returns nil if fs is a subset of the host feature set.
+// Noop on arm64.
+func (fs *FeatureSet) CheckHostCompatible() error {
+	return nil
+}
+
+// ExtendedStateSize returns the number of bytes needed to save the "extended
+// state" for this processor and the boundary it must be aligned to. Extended
+// state includes floating point(NEON) registers, and other cpu state that's not
+// associated with the normal task context.
+func (fs *FeatureSet) ExtendedStateSize() (size, align uint) {
+	// ARMv8 provide 32x128bits NEON registers.
+	//
+	// Ref arch/arm64/include/uapi/asm/ptrace.h
+	// struct user_fpsimd_state {
+	//        __uint128_t     vregs[32];
+	//        __u32           fpsr;
+	//	  __u32           fpcr;
+	//	  __u32           __reserved[2];
+	// };
+	return 528, 16
+}
+
+// HasFeature tests whether or not a feature is in the given feature set.
+func (fs *FeatureSet) HasFeature(feature Feature) bool {
+	return fs.Set[feature]
+}
+
+// UseXsaveopt returns true if 'fs' supports the "xsaveopt" instruction.
+// Noop on arm64.
+func (fs *FeatureSet) UseXsave() bool {
+	return false
+}
+
+// FlagsString prints out supported CPU "flags" field in /proc/cpuinfo.
+func (fs *FeatureSet) FlagsString() string {
+	var s []string
+	for f, _ := range arm64FeatureStrings {
+		if fs.Set[f] {
+			if fstr := f.flagString(); fstr != "" {
+				s = append(s, fstr)
+			}
+		}
+	}
+	return strings.Join(s, " ")
+}
+
+// WriteCPUInfoTo is to generate a section of one cpu in /proc/cpuinfo. This is
+// a minimal /proc/cpuinfo, and the bogomips field is simply made up.
+func (fs FeatureSet) WriteCPUInfoTo(cpu uint, b *bytes.Buffer) {
+	fmt.Fprintf(b, "processor\t: %d\n", cpu)
+	fmt.Fprintf(b, "BogoMIPS\t: %.02f\n", cpuFreqMHz) // It's bogus anyway.
+	fmt.Fprintf(b, "Features\t\t: %s\n", fs.FlagsString())
+	fmt.Fprintf(b, "CPU implementer\t: 0x%x\n", cpuImplHex)
+	fmt.Fprintf(b, "CPU architecture\t: %d\n", cpuArchDec)
+	fmt.Fprintf(b, "CPU variant\t: 0x%x\n", cpuVarHex)
+	fmt.Fprintf(b, "CPU part\t: 0x%x\n", cpuPartHex)
+	fmt.Fprintf(b, "CPU revision\t: %d\n", cpuRevDec)
+	fmt.Fprintln(b, "") // The /proc/cpuinfo file ends with an extra newline.
+}
+
+// HostFeatureSet uses hwCap to get host values and construct a feature set
+// that matches that of the host machine.
+func HostFeatureSet() *FeatureSet {
+	s := make(map[Feature]bool)
+
+	for f, _ := range arm64FeatureStrings {
+		if hwCap&(1<<f) != 0 {
+			s[f] = true
+		}
+	}
+
+	return &FeatureSet{
+		Set:             s,
+		CPUImplementer:  uint8(cpuImplHex),
+		CPUArchitecture: uint8(cpuArchDec),
+		CPUVariant:      uint8(cpuVarHex),
+		CPUPartnum:      uint16(cpuPartHex),
+		CPURevision:     uint8(cpuRevDec),
+	}
+}
+
+// Reads bogomips from host /proc/cpuinfo. Must run before whitelisting.
+// This value is used to create the fake /proc/cpuinfo from a FeatureSet.
+func initCPUInfo() {
+	cpuinfob, err := ioutil.ReadFile("/proc/cpuinfo")
+	if err != nil {
+		// Leave it as 0. The standalone VDSO bails out in the same way.
+		log.Warningf("Could not read /proc/cpuinfo: %v", err)
+		return
+	}
+	cpuinfo := string(cpuinfob)
+
+	// We get the value straight from host /proc/cpuinfo.
+	for _, line := range strings.Split(cpuinfo, "\n") {
+		switch {
+		case strings.Contains(line, "BogoMIPS"):
+			{
+				splitMHz := strings.Split(line, ":")
+				if len(splitMHz) < 2 {
+					log.Warningf("Could not read /proc/cpuinfo: malformed BogoMIPS")
+					break
+				}
+
+				// If there was a problem, leave cpuFreqMHz as 0.
+				var err error
+				cpuFreqMHz, err = strconv.ParseFloat(strings.TrimSpace(splitMHz[1]), 64)
+				if err != nil {
+					log.Warningf("Could not parse BogoMIPS value %v: %v", splitMHz[1], err)
+					cpuFreqMHz = 0
+				}
+			}
+		case strings.Contains(line, "CPU implementer"):
+			{
+				splitImpl := strings.Split(line, ":")
+				if len(splitImpl) < 2 {
+					log.Warningf("Could not read /proc/cpuinfo: malformed CPU implementer")
+					break
+				}
+
+				// If there was a problem, leave cpuImplHex as 0.
+				var err error
+				cpuImplHex, err = strconv.ParseUint(strings.TrimSpace(splitImpl[1]), 0, 64)
+				if err != nil {
+					log.Warningf("Could not parse CPU implementer value %v: %v", splitImpl[1], err)
+					cpuImplHex = 0
+				}
+			}
+		case strings.Contains(line, "CPU architecture"):
+			{
+				splitArch := strings.Split(line, ":")
+				if len(splitArch) < 2 {
+					log.Warningf("Could not read /proc/cpuinfo: malformed CPU architecture")
+					break
+				}
+
+				// If there was a problem, leave cpuArchDec as 0.
+				var err error
+				cpuArchDec, err = strconv.ParseUint(strings.TrimSpace(splitArch[1]), 0, 64)
+				if err != nil {
+					log.Warningf("Could not parse CPU architecture value %v: %v", splitArch[1], err)
+					cpuArchDec = 0
+				}
+			}
+		case strings.Contains(line, "CPU variant"):
+			{
+				splitVar := strings.Split(line, ":")
+				if len(splitVar) < 2 {
+					log.Warningf("Could not read /proc/cpuinfo: malformed CPU variant")
+					break
+				}
+
+				// If there was a problem, leave cpuVarHex as 0.
+				var err error
+				cpuVarHex, err = strconv.ParseUint(strings.TrimSpace(splitVar[1]), 0, 64)
+				if err != nil {
+					log.Warningf("Could not parse CPU variant value %v: %v", splitVar[1], err)
+					cpuVarHex = 0
+				}
+			}
+		case strings.Contains(line, "CPU part"):
+			{
+				splitPart := strings.Split(line, ":")
+				if len(splitPart) < 2 {
+					log.Warningf("Could not read /proc/cpuinfo: malformed CPU part")
+					break
+				}
+
+				// If there was a problem, leave cpuPartHex as 0.
+				var err error
+				cpuPartHex, err = strconv.ParseUint(strings.TrimSpace(splitPart[1]), 0, 64)
+				if err != nil {
+					log.Warningf("Could not parse CPU part value %v: %v", splitPart[1], err)
+					cpuPartHex = 0
+				}
+			}
+		case strings.Contains(line, "CPU revision"):
+			{
+				splitRev := strings.Split(line, ":")
+				if len(splitRev) < 2 {
+					log.Warningf("Could not read /proc/cpuinfo: malformed CPU revision")
+					break
+				}
+
+				// If there was a problem, leave cpuRevDec as 0.
+				var err error
+				cpuRevDec, err = strconv.ParseUint(strings.TrimSpace(splitRev[1]), 0, 64)
+				if err != nil {
+					log.Warningf("Could not parse CPU revision value %v: %v", splitRev[1], err)
+					cpuRevDec = 0
+				}
+			}
+		}
+	}
+}
+
+// The auxiliary vector of a process on the Linux system can be read
+// from /proc/self/auxv, and tags and values are stored as 8-bytes
+// decimal key-value pairs on the 64-bit system.
+//
+// $ od -t d8 /proc/self/auxv
+//  0000000                   33      140734615224320
+//  0000020                   16           3219913727
+//  0000040                    6                 4096
+//  0000060                   17                  100
+//  0000100                    3       94665627353152
+//  0000120                    4                   56
+//  0000140                    5                    9
+//  0000160                    7      140425502162944
+//  0000200                    8                    0
+//  0000220                    9       94665627365760
+//  0000240                   11                 1000
+//  0000260                   12                 1000
+//  0000300                   13                 1000
+//  0000320                   14                 1000
+//  0000340                   23                    0
+//  0000360                   25      140734614619513
+//  0000400                   26                    0
+//  0000420                   31      140734614626284
+//  0000440                   15      140734614619529
+//  0000460                    0                    0
+func initHwCap() {
+	auxv, err := ioutil.ReadFile("/proc/self/auxv")
+	if err != nil {
+		log.Warningf("Could not read /proc/self/auxv: %v", err)
+		return
+	}
+
+	l := len(auxv) / 16
+	for i := 0; i < l; i++ {
+		tag := binary.LittleEndian.Uint64(auxv[i*16:])
+		val := binary.LittleEndian.Uint64(auxv[(i*16 + 8):])
+		if tag == _AT_HWCAP {
+			hwCap = uint(val)
+			break
+		}
+	}
+}
+
+func initFeaturesFromString() {
+	for f, s := range arm64FeatureStrings {
+		arm64FeaturesFromString[s] = f
+	}
+}
+
+func init() {
+	initCPUInfo()
+	initHwCap()
+	initFeaturesFromString()
+}
diff --git a/pkg/cpuid/cpuid_arm64_test.go b/pkg/cpuid/cpuid_arm64_test.go
new file mode 100644
index 000000000..a34f67779
--- /dev/null
+++ b/pkg/cpuid/cpuid_arm64_test.go
@@ -0,0 +1,55 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package cpuid
+
+import (
+	"testing"
+)
+
+var justFP = &FeatureSet{
+	Set: map[Feature]bool{
+		ARM64FeatureFP: true,
+	}}
+
+func TestHostFeatureSet(t *testing.T) {
+	hostFeatures := HostFeatureSet()
+	if len(hostFeatures.Set) == 0 {
+		t.Errorf("Got invalid feature set %v from HostFeatureSet()", hostFeatures)
+	}
+}
+
+func TestHasFeature(t *testing.T) {
+	if !justFP.HasFeature(ARM64FeatureFP) {
+		t.Errorf("HasFeature failed, %v should contain %v", justFP, ARM64FeatureFP)
+	}
+
+	if justFP.HasFeature(ARM64FeatureSM3) {
+		t.Errorf("HasFeature failed, %v should not contain %v", justFP, ARM64FeatureSM3)
+	}
+}
+
+func TestFeatureFromString(t *testing.T) {
+	f, ok := FeatureFromString("asimd")
+	if f != ARM64FeatureASIMD || !ok {
+		t.Errorf("got %v want asimd", f)
+	}
+
+	f, ok = FeatureFromString("bad")
+	if ok {
+		t.Errorf("got %v want nothing", f)
+	}
+}
diff --git a/pkg/cpuid/cpuid_parse_test.go b/pkg/cpuid/cpuid_parse_test.go
deleted file mode 100644
index dd9969db4..000000000
--- a/pkg/cpuid/cpuid_parse_test.go
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package cpuid
-
-import (
-	"fmt"
-	"io/ioutil"
-	"regexp"
-	"strconv"
-	"strings"
-	"syscall"
-	"testing"
-)
-
-func kernelVersion() (int, int, error) {
-	var u syscall.Utsname
-	if err := syscall.Uname(&u); err != nil {
-		return 0, 0, err
-	}
-
-	var r string
-	for _, b := range u.Release {
-		if b == 0 {
-			break
-		}
-		r += string(b)
-	}
-
-	s := strings.Split(r, ".")
-	if len(s) < 2 {
-		return 0, 0, fmt.Errorf("kernel release missing major and minor component: %s", r)
-	}
-
-	major, err := strconv.Atoi(s[0])
-	if err != nil {
-		return 0, 0, fmt.Errorf("error parsing major version %q in %q: %v", s[0], r, err)
-	}
-
-	minor, err := strconv.Atoi(s[1])
-	if err != nil {
-		return 0, 0, fmt.Errorf("error parsing minor version %q in %q: %v", s[1], r, err)
-	}
-
-	return major, minor, nil
-}
-
-// TestHostFeatureFlags tests that all features detected by HostFeatureSet are
-// on the host.
-//
-// It does *not* verify that all features reported by the host are detected by
-// HostFeatureSet.
-//
-// i.e., test that HostFeatureSet is a subset of the host features.
-func TestHostFeatureFlags(t *testing.T) {
-	cpuinfoBytes, _ := ioutil.ReadFile("/proc/cpuinfo")
-	cpuinfo := string(cpuinfoBytes)
-	t.Logf("Host cpu info:\n%s", cpuinfo)
-
-	major, minor, err := kernelVersion()
-	if err != nil {
-		t.Fatalf("Unable to parse kernel version: %v", err)
-	}
-
-	re := regexp.MustCompile(`(?m)^flags\s+: (.*)$`)
-	m := re.FindStringSubmatch(cpuinfo)
-	if len(m) != 2 {
-		t.Fatalf("Unable to extract flags from %q", cpuinfo)
-	}
-
-	cpuinfoFlags := make(map[string]struct{})
-	for _, f := range strings.Split(m[1], " ") {
-		cpuinfoFlags[f] = struct{}{}
-	}
-
-	fs := HostFeatureSet()
-
-	// All features have a string and appear in host cpuinfo.
-	for f := range fs.Set {
-		name := f.flagString(false)
-		if name == "" {
-			t.Errorf("Non-parsable feature: %v", f)
-		}
-
-		// Special cases not consistently visible. We don't mind if
-		// they are exposed in earlier versions.
-		switch {
-		// Block 0.
-		case f == X86FeatureSDBG && (major < 4 || major == 4 && minor < 3):
-			// SDBG only exposed in
-			// b1c599b8ff80ea79b9f8277a3f9f36a7b0cfedce (4.3).
-			continue
-		// Block 2.
-		case f == X86FeatureRDT && (major < 4 || major == 4 && minor < 10):
-			// RDT only exposed in
-			// 4ab1586488cb56ed8728e54c4157cc38646874d9 (4.10).
-			continue
-		// Block 3.
-		case f == X86FeatureAVX512VBMI && (major < 4 || major == 4 && minor < 10):
-			// AVX512VBMI only exposed in
-			// a8d9df5a509a232a959e4ef2e281f7ecd77810d6 (4.10).
-			continue
-		case f == X86FeatureUMIP && (major < 4 || major == 4 && minor < 15):
-			// UMIP only exposed in
-			// 3522c2a6a4f341058b8291326a945e2a2d2aaf55 (4.15).
-			continue
-		case f == X86FeaturePKU && (major < 4 || major == 4 && minor < 9):
-			// PKU only exposed in
-			// dfb4a70f20c5b3880da56ee4c9484bdb4e8f1e65 (4.9).
-			continue
-		// Block 4.
-		case f == X86FeatureXSAVES && (major < 4 || major == 4 && minor < 8):
-			// XSAVES only exposed in
-			// b8be15d588060a03569ac85dc4a0247460988f5b (4.8).
-			continue
-		// Block 5.
-		case f == X86FeaturePERFCTR_LLC && (major < 4 || major == 4 && minor < 14):
-			// PERFCTR_LLC renamed in
-			// 910448bbed066ab1082b510eef1ae61bb792d854 (4.14).
-			continue
-		}
-
-		hidden := f.flagString(true) == ""
-		_, ok := cpuinfoFlags[name]
-		if hidden && ok {
-			t.Errorf("Unexpectedly hidden flag: %v", f)
-		} else if !hidden && !ok {
-			t.Errorf("Non-native flag: %v", f)
-		}
-	}
-}
diff --git a/pkg/cpuid/cpuid_parse_x86_test.go b/pkg/cpuid/cpuid_parse_x86_test.go
new file mode 100644
index 000000000..d48418e69
--- /dev/null
+++ b/pkg/cpuid/cpuid_parse_x86_test.go
@@ -0,0 +1,144 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package cpuid
+
+import (
+	"fmt"
+	"io/ioutil"
+	"regexp"
+	"strconv"
+	"strings"
+	"syscall"
+	"testing"
+)
+
+func kernelVersion() (int, int, error) {
+	var u syscall.Utsname
+	if err := syscall.Uname(&u); err != nil {
+		return 0, 0, err
+	}
+
+	var r string
+	for _, b := range u.Release {
+		if b == 0 {
+			break
+		}
+		r += string(b)
+	}
+
+	s := strings.Split(r, ".")
+	if len(s) < 2 {
+		return 0, 0, fmt.Errorf("kernel release missing major and minor component: %s", r)
+	}
+
+	major, err := strconv.Atoi(s[0])
+	if err != nil {
+		return 0, 0, fmt.Errorf("error parsing major version %q in %q: %v", s[0], r, err)
+	}
+
+	minor, err := strconv.Atoi(s[1])
+	if err != nil {
+		return 0, 0, fmt.Errorf("error parsing minor version %q in %q: %v", s[1], r, err)
+	}
+
+	return major, minor, nil
+}
+
+// TestHostFeatureFlags tests that all features detected by HostFeatureSet are
+// on the host.
+//
+// It does *not* verify that all features reported by the host are detected by
+// HostFeatureSet.
+//
+// i.e., test that HostFeatureSet is a subset of the host features.
+func TestHostFeatureFlags(t *testing.T) {
+	cpuinfoBytes, _ := ioutil.ReadFile("/proc/cpuinfo")
+	cpuinfo := string(cpuinfoBytes)
+	t.Logf("Host cpu info:\n%s", cpuinfo)
+
+	major, minor, err := kernelVersion()
+	if err != nil {
+		t.Fatalf("Unable to parse kernel version: %v", err)
+	}
+
+	re := regexp.MustCompile(`(?m)^flags\s+: (.*)$`)
+	m := re.FindStringSubmatch(cpuinfo)
+	if len(m) != 2 {
+		t.Fatalf("Unable to extract flags from %q", cpuinfo)
+	}
+
+	cpuinfoFlags := make(map[string]struct{})
+	for _, f := range strings.Split(m[1], " ") {
+		cpuinfoFlags[f] = struct{}{}
+	}
+
+	fs := HostFeatureSet()
+
+	// All features have a string and appear in host cpuinfo.
+	for f := range fs.Set {
+		name := f.flagString(false)
+		if name == "" {
+			t.Errorf("Non-parsable feature: %v", f)
+		}
+
+		// Special cases not consistently visible. We don't mind if
+		// they are exposed in earlier versions.
+		switch {
+		// Block 0.
+		case f == X86FeatureSDBG && (major < 4 || major == 4 && minor < 3):
+			// SDBG only exposed in
+			// b1c599b8ff80ea79b9f8277a3f9f36a7b0cfedce (4.3).
+			continue
+		// Block 2.
+		case f == X86FeatureRDT && (major < 4 || major == 4 && minor < 10):
+			// RDT only exposed in
+			// 4ab1586488cb56ed8728e54c4157cc38646874d9 (4.10).
+			continue
+		// Block 3.
+		case f == X86FeatureAVX512VBMI && (major < 4 || major == 4 && minor < 10):
+			// AVX512VBMI only exposed in
+			// a8d9df5a509a232a959e4ef2e281f7ecd77810d6 (4.10).
+			continue
+		case f == X86FeatureUMIP && (major < 4 || major == 4 && minor < 15):
+			// UMIP only exposed in
+			// 3522c2a6a4f341058b8291326a945e2a2d2aaf55 (4.15).
+			continue
+		case f == X86FeaturePKU && (major < 4 || major == 4 && minor < 9):
+			// PKU only exposed in
+			// dfb4a70f20c5b3880da56ee4c9484bdb4e8f1e65 (4.9).
+			continue
+		// Block 4.
+		case f == X86FeatureXSAVES && (major < 4 || major == 4 && minor < 8):
+			// XSAVES only exposed in
+			// b8be15d588060a03569ac85dc4a0247460988f5b (4.8).
+			continue
+		// Block 5.
+		case f == X86FeaturePERFCTR_LLC && (major < 4 || major == 4 && minor < 14):
+			// PERFCTR_LLC renamed in
+			// 910448bbed066ab1082b510eef1ae61bb792d854 (4.14).
+			continue
+		}
+
+		hidden := f.flagString(true) == ""
+		_, ok := cpuinfoFlags[name]
+		if hidden && ok {
+			t.Errorf("Unexpectedly hidden flag: %v", f)
+		} else if !hidden && !ok {
+			t.Errorf("Non-native flag: %v", f)
+		}
+	}
+}
diff --git a/pkg/cpuid/cpuid_test.go b/pkg/cpuid/cpuid_test.go
deleted file mode 100644
index a707ebb55..000000000
--- a/pkg/cpuid/cpuid_test.go
+++ /dev/null
@@ -1,241 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package cpuid
-
-import (
-	"testing"
-)
-
-// These are the default values of various FeatureSet fields.
-const (
-	defaultVendorID = "GenuineIntel"
-
-	// These processor signature defaults are derived from the values
-	// listed in Intel Application Note 485 for i7/Xeon processors.
-	defaultExtFamily  uint8 = 0
-	defaultExtModel   uint8 = 1
-	defaultType       uint8 = 0
-	defaultFamily     uint8 = 0x06
-	defaultModel      uint8 = 0x0a
-	defaultSteppingID uint8 = 0
-)
-
-// newEmptyFeatureSet creates a new FeatureSet with a sensible default model and no features.
-func newEmptyFeatureSet() *FeatureSet {
-	return &FeatureSet{
-		Set:            make(map[Feature]bool),
-		VendorID:       defaultVendorID,
-		ExtendedFamily: defaultExtFamily,
-		ExtendedModel:  defaultExtModel,
-		ProcessorType:  defaultType,
-		Family:         defaultFamily,
-		Model:          defaultModel,
-		SteppingID:     defaultSteppingID,
-	}
-}
-
-var justFPU = &FeatureSet{
-	Set: map[Feature]bool{
-		X86FeatureFPU: true,
-	}}
-
-var justFPUandPAE = &FeatureSet{
-	Set: map[Feature]bool{
-		X86FeatureFPU: true,
-		X86FeaturePAE: true,
-	}}
-
-func TestSubtract(t *testing.T) {
-	if diff := justFPU.Subtract(justFPUandPAE); diff != nil {
-		t.Errorf("Got %v is not subset of %v, want diff (%v) to be nil", justFPU, justFPUandPAE, diff)
-	}
-
-	if justFPUandPAE.Subtract(justFPU) == nil {
-		t.Errorf("Got %v is a subset of %v, want diff to be nil", justFPU, justFPUandPAE)
-	}
-}
-
-// TODO(b/73346484): Run this test on a very old platform, and make sure more
-// bits are enabled than just FPU and PAE. This test currently may not detect
-// if HostFeatureSet gives back junk bits.
-func TestHostFeatureSet(t *testing.T) {
-	hostFeatures := HostFeatureSet()
-	if justFPUandPAE.Subtract(hostFeatures) != nil {
-		t.Errorf("Got invalid feature set %v from HostFeatureSet()", hostFeatures)
-	}
-}
-
-func TestHasFeature(t *testing.T) {
-	if !justFPU.HasFeature(X86FeatureFPU) {
-		t.Errorf("HasFeature failed, %v should contain %v", justFPU, X86FeatureFPU)
-	}
-
-	if justFPU.HasFeature(X86FeatureAVX) {
-		t.Errorf("HasFeature failed, %v should not contain %v", justFPU, X86FeatureAVX)
-	}
-}
-
-// Note: these tests are aware of and abuse internal details of FeatureSets.
-// Users of FeatureSets should not depend on this.
-func TestAdd(t *testing.T) {
-	// Test a basic insertion into the FeatureSet.
-	testFeatures := newEmptyFeatureSet()
-	testFeatures.Add(X86FeatureCLFSH)
-	if len(testFeatures.Set) != 1 {
-		t.Errorf("Got length %v want 1", len(testFeatures.Set))
-	}
-
-	if !testFeatures.HasFeature(X86FeatureCLFSH) {
-		t.Errorf("Add failed, got %v want set with %v", testFeatures, X86FeatureCLFSH)
-	}
-
-	// Test that duplicates are ignored.
-	testFeatures.Add(X86FeatureCLFSH)
-	if len(testFeatures.Set) != 1 {
-		t.Errorf("Got length %v, want 1", len(testFeatures.Set))
-	}
-}
-
-func TestRemove(t *testing.T) {
-	// Try removing the last feature.
-	testFeatures := newEmptyFeatureSet()
-	testFeatures.Add(X86FeatureFPU)
-	testFeatures.Add(X86FeaturePAE)
-	testFeatures.Remove(X86FeaturePAE)
-	if !testFeatures.HasFeature(X86FeatureFPU) || len(testFeatures.Set) != 1 || testFeatures.HasFeature(X86FeaturePAE) {
-		t.Errorf("Remove failed, got %v want %v", testFeatures, justFPU)
-	}
-
-	// Try removing a feature not in the set.
-	testFeatures.Remove(X86FeatureRDRAND)
-	if !testFeatures.HasFeature(X86FeatureFPU) || len(testFeatures.Set) != 1 {
-		t.Errorf("Remove failed, got %v want %v", testFeatures, justFPU)
-	}
-}
-
-func TestFeatureFromString(t *testing.T) {
-	f, ok := FeatureFromString("avx")
-	if f != X86FeatureAVX || !ok {
-		t.Errorf("got %v want avx", f)
-	}
-
-	f, ok = FeatureFromString("bad")
-	if ok {
-		t.Errorf("got %v want nothing", f)
-	}
-}
-
-// This tests function 0 (eax=0), which returns the vendor ID and highest cpuid
-// function reported to be available.
-func TestEmulateIDVendorAndLength(t *testing.T) {
-	testFeatures := newEmptyFeatureSet()
-
-	ax, bx, cx, dx := testFeatures.EmulateID(0, 0)
-	wantEax := uint32(0xd) // Highest supported cpuid function.
-
-	// These magical constants are the characters of "GenuineIntel".
-	// See Intel AN485 for a reference on why they are laid out like this.
-	wantEbx := uint32(0x756e6547)
-	wantEcx := uint32(0x6c65746e)
-	wantEdx := uint32(0x49656e69)
-	if wantEax != ax {
-		t.Errorf("highest function failed, got %x want %x", ax, wantEax)
-	}
-
-	if wantEbx != bx || wantEcx != cx || wantEdx != dx {
-		t.Errorf("vendor string emulation failed, bx:cx:dx, got %x:%x:%x want %x:%x:%x", bx, cx, dx, wantEbx, wantEcx, wantEdx)
-	}
-}
-
-func TestEmulateIDBasicFeatures(t *testing.T) {
-	// Make a minimal test feature set.
-	testFeatures := newEmptyFeatureSet()
-	testFeatures.Add(X86FeatureCLFSH)
-	testFeatures.Add(X86FeatureAVX)
-	testFeatures.CacheLine = 64
-
-	ax, bx, cx, dx := testFeatures.EmulateID(1, 0)
-	ECXAVXBit := uint32(1 << uint(X86FeatureAVX))
-	EDXCLFlushBit := uint32(1 << uint(X86FeatureCLFSH-32)) // We adjust by 32 since it's in block 1.
-
-	if EDXCLFlushBit&dx == 0 || dx&^EDXCLFlushBit != 0 {
-		t.Errorf("EmulateID failed, got feature bits %x want %x", dx, testFeatures.blockMask(1))
-	}
-
-	if ECXAVXBit&cx == 0 || cx&^ECXAVXBit != 0 {
-		t.Errorf("EmulateID failed, got feature bits %x want %x", cx, testFeatures.blockMask(0))
-	}
-
-	// Default signature bits, based on values for i7/Xeon.
-	// See Intel AN485 for information on stepping/model bits.
-	defaultSignature := uint32(0x000106a0)
-	if defaultSignature != ax {
-		t.Errorf("EmulateID stepping emulation failed, got %x want %x", ax, defaultSignature)
-	}
-
-	clflushSizeInfo := uint32(8 << 8)
-	if clflushSizeInfo != bx {
-		t.Errorf("EmulateID bx emulation failed, got %x want %x", bx, clflushSizeInfo)
-	}
-}
-
-func TestEmulateIDExtendedFeatures(t *testing.T) {
-	// Make a minimal test feature set, one bit in each extended feature word.
-	testFeatures := newEmptyFeatureSet()
-	testFeatures.Add(X86FeatureSMEP)
-	testFeatures.Add(X86FeatureAVX512VBMI)
-
-	ax, bx, cx, dx := testFeatures.EmulateID(7, 0)
-	EBXSMEPBit := uint32(1 << uint(X86FeatureSMEP-2*32))      // Adjust by 2*32 since SMEP is a block 2 feature.
-	ECXAVXBit := uint32(1 << uint(X86FeatureAVX512VBMI-3*32)) // We adjust by 3*32 since it's a block 3 feature.
-
-	// Test that the desired bit is set and no other bits are set.
-	if EBXSMEPBit&bx == 0 || bx&^EBXSMEPBit != 0 {
-		t.Errorf("extended feature emulation failed, got feature bits %x want %x", bx, testFeatures.blockMask(2))
-	}
-
-	if ECXAVXBit&cx == 0 || cx&^ECXAVXBit != 0 {
-		t.Errorf("extended feature emulation failed, got feature bits %x want %x", cx, testFeatures.blockMask(3))
-	}
-
-	if ax != 0 || dx != 0 {
-		t.Errorf("extended feature emulation failed, ax:dx, got %x:%x want 0:0", ax, dx)
-	}
-
-	// Check that no subleaves other than 0 do anything.
-	ax, bx, cx, dx = testFeatures.EmulateID(7, 1)
-	if ax != 0 || bx != 0 || cx != 0 || dx != 0 {
-		t.Errorf("extended feature emulation failed, got %x:%x:%x:%x want 0:0", ax, bx, cx, dx)
-	}
-
-}
-
-// Checks that the expected extended features are available via cpuid functions
-// 0x80000000 and up.
-func TestEmulateIDExtended(t *testing.T) {
-	testFeatures := newEmptyFeatureSet()
-	testFeatures.Add(X86FeatureSYSCALL)
-	EDXSYSCALLBit := uint32(1 << uint(X86FeatureSYSCALL-6*32)) // Adjust by 6*32 since SYSCALL is a block 6 feature.
-
-	ax, bx, cx, dx := testFeatures.EmulateID(0x80000000, 0)
-	if ax != 0x80000001 || bx != 0 || cx != 0 || dx != 0 {
-		t.Errorf("EmulateID extended emulation failed, ax:bx:cx:dx, got %x:%x:%x:%x want 0x80000001:0:0:0", ax, bx, cx, dx)
-	}
-
-	_, _, _, dx = testFeatures.EmulateID(0x80000001, 0)
-	if EDXSYSCALLBit&dx == 0 || dx&^EDXSYSCALLBit != 0 {
-		t.Errorf("extended feature emulation failed, got feature bits %x want %x", dx, testFeatures.blockMask(6))
-	}
-}
diff --git a/pkg/cpuid/cpuid_x86.go b/pkg/cpuid/cpuid_x86.go
new file mode 100644
index 000000000..333ca0a04
--- /dev/null
+++ b/pkg/cpuid/cpuid_x86.go
@@ -0,0 +1,1100 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package cpuid
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"strconv"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+// Common references for CPUID leaves and bits:
+//
+// Intel:
+//   * Intel SDM Volume 2, Chapter 3.2 "CPUID" (more up-to-date)
+//   * Intel Application Note 485 (more detailed)
+//
+// AMD:
+//   * AMD64 APM Volume 3, Appendix 3 "Obtaining Processor Information ..."
+
+// block is a collection of 32 Feature bits.
+type block int
+
+const blockSize = 32
+
+// Feature bits are numbered according to "blocks". Each block is 32 bits, and
+// feature bits from the same source (cpuid leaf/level) are in the same block.
+func featureID(b block, bit int) Feature {
+	return Feature(32*int(b) + bit)
+}
+
+// Block 0 constants are all of the "basic" feature bits returned by a cpuid in
+// ecx with eax=1.
+const (
+	X86FeatureSSE3 Feature = iota
+	X86FeaturePCLMULDQ
+	X86FeatureDTES64
+	X86FeatureMONITOR
+	X86FeatureDSCPL
+	X86FeatureVMX
+	X86FeatureSMX
+	X86FeatureEST
+	X86FeatureTM2
+	X86FeatureSSSE3 // Not a typo, "supplemental" SSE3.
+	X86FeatureCNXTID
+	X86FeatureSDBG
+	X86FeatureFMA
+	X86FeatureCX16
+	X86FeatureXTPR
+	X86FeaturePDCM
+	_ // ecx bit 16 is reserved.
+	X86FeaturePCID
+	X86FeatureDCA
+	X86FeatureSSE4_1
+	X86FeatureSSE4_2
+	X86FeatureX2APIC
+	X86FeatureMOVBE
+	X86FeaturePOPCNT
+	X86FeatureTSCD
+	X86FeatureAES
+	X86FeatureXSAVE
+	X86FeatureOSXSAVE
+	X86FeatureAVX
+	X86FeatureF16C
+	X86FeatureRDRAND
+	_ // ecx bit 31 is reserved.
+)
+
+// Block 1 constants are all of the "basic" feature bits returned by a cpuid in
+// edx with eax=1.
+const (
+	X86FeatureFPU Feature = 32 + iota
+	X86FeatureVME
+	X86FeatureDE
+	X86FeaturePSE
+	X86FeatureTSC
+	X86FeatureMSR
+	X86FeaturePAE
+	X86FeatureMCE
+	X86FeatureCX8
+	X86FeatureAPIC
+	_ // edx bit 10 is reserved.
+	X86FeatureSEP
+	X86FeatureMTRR
+	X86FeaturePGE
+	X86FeatureMCA
+	X86FeatureCMOV
+	X86FeaturePAT
+	X86FeaturePSE36
+	X86FeaturePSN
+	X86FeatureCLFSH
+	_ // edx bit 20 is reserved.
+	X86FeatureDS
+	X86FeatureACPI
+	X86FeatureMMX
+	X86FeatureFXSR
+	X86FeatureSSE
+	X86FeatureSSE2
+	X86FeatureSS
+	X86FeatureHTT
+	X86FeatureTM
+	X86FeatureIA64
+	X86FeaturePBE
+)
+
+// Block 2 bits are the "structured extended" features returned in ebx for
+// eax=7, ecx=0.
+const (
+	X86FeatureFSGSBase Feature = 2*32 + iota
+	X86FeatureTSC_ADJUST
+	_ // ebx bit 2 is reserved.
+	X86FeatureBMI1
+	X86FeatureHLE
+	X86FeatureAVX2
+	X86FeatureFDP_EXCPTN_ONLY
+	X86FeatureSMEP
+	X86FeatureBMI2
+	X86FeatureERMS
+	X86FeatureINVPCID
+	X86FeatureRTM
+	X86FeatureCQM
+	X86FeatureFPCSDS
+	X86FeatureMPX
+	X86FeatureRDT
+	X86FeatureAVX512F
+	X86FeatureAVX512DQ
+	X86FeatureRDSEED
+	X86FeatureADX
+	X86FeatureSMAP
+	X86FeatureAVX512IFMA
+	X86FeaturePCOMMIT
+	X86FeatureCLFLUSHOPT
+	X86FeatureCLWB
+	X86FeatureIPT // Intel processor trace.
+	X86FeatureAVX512PF
+	X86FeatureAVX512ER
+	X86FeatureAVX512CD
+	X86FeatureSHA
+	X86FeatureAVX512BW
+	X86FeatureAVX512VL
+)
+
+// Block 3 bits are the "extended" features returned in ecx for eax=7, ecx=0.
+const (
+	X86FeaturePREFETCHWT1 Feature = 3*32 + iota
+	X86FeatureAVX512VBMI
+	X86FeatureUMIP
+	X86FeaturePKU
+	X86FeatureOSPKE
+	X86FeatureWAITPKG
+	X86FeatureAVX512_VBMI2
+	_ // ecx bit 7 is reserved
+	X86FeatureGFNI
+	X86FeatureVAES
+	X86FeatureVPCLMULQDQ
+	X86FeatureAVX512_VNNI
+	X86FeatureAVX512_BITALG
+	X86FeatureTME
+	X86FeatureAVX512_VPOPCNTDQ
+	_ // ecx bit 15 is reserved
+	X86FeatureLA57
+	// ecx bits 17-21 are reserved
+	_
+	_
+	_
+	_
+	_
+	X86FeatureRDPID
+	// ecx bits 23-24 are reserved
+	_
+	_
+	X86FeatureCLDEMOTE
+	_ // ecx bit 26 is reserved
+	X86FeatureMOVDIRI
+	X86FeatureMOVDIR64B
+)
+
+// Block 4 constants are for xsave capabilities in CPUID.(EAX=0DH,ECX=01H):EAX.
+// The CPUID leaf is available only if 'X86FeatureXSAVE' is present.
+const (
+	X86FeatureXSAVEOPT Feature = 4*32 + iota
+	X86FeatureXSAVEC
+	X86FeatureXGETBV1
+	X86FeatureXSAVES
+	// EAX[31:4] are reserved.
+)
+
+// Block 5 constants are the extended feature bits in
+// CPUID.(EAX=0x80000001):ECX.
+const (
+	X86FeatureLAHF64 Feature = 5*32 + iota
+	X86FeatureCMP_LEGACY
+	X86FeatureSVM
+	X86FeatureEXTAPIC
+	X86FeatureCR8_LEGACY
+	X86FeatureLZCNT
+	X86FeatureSSE4A
+	X86FeatureMISALIGNSSE
+	X86FeaturePREFETCHW
+	X86FeatureOSVW
+	X86FeatureIBS
+	X86FeatureXOP
+	X86FeatureSKINIT
+	X86FeatureWDT
+	_ // ecx bit 14 is reserved.
+	X86FeatureLWP
+	X86FeatureFMA4
+	X86FeatureTCE
+	_ // ecx bit 18 is reserved.
+	_ // ecx bit 19 is reserved.
+	_ // ecx bit 20 is reserved.
+	X86FeatureTBM
+	X86FeatureTOPOLOGY
+	X86FeaturePERFCTR_CORE
+	X86FeaturePERFCTR_NB
+	_ // ecx bit 25 is reserved.
+	X86FeatureBPEXT
+	X86FeaturePERFCTR_TSC
+	X86FeaturePERFCTR_LLC
+	X86FeatureMWAITX
+	// ECX[31:30] are reserved.
+)
+
+// Block 6 constants are the extended feature bits in
+// CPUID.(EAX=0x80000001):EDX.
+//
+// These are sparse, and so the bit positions are assigned manually.
+const (
+	// On AMD, EDX[24:23] | EDX[17:12] | EDX[9:0] are duplicate features
+	// also defined in block 1 (in identical bit positions). Those features
+	// are not listed here.
+	block6DuplicateMask = 0x183f3ff
+
+	X86FeatureSYSCALL  Feature = 6*32 + 11
+	X86FeatureNX       Feature = 6*32 + 20
+	X86FeatureMMXEXT   Feature = 6*32 + 22
+	X86FeatureFXSR_OPT Feature = 6*32 + 25
+	X86FeatureGBPAGES  Feature = 6*32 + 26
+	X86FeatureRDTSCP   Feature = 6*32 + 27
+	X86FeatureLM       Feature = 6*32 + 29
+	X86Feature3DNOWEXT Feature = 6*32 + 30
+	X86Feature3DNOW    Feature = 6*32 + 31
+)
+
+// linuxBlockOrder defines the order in which linux organizes the feature
+// blocks. Linux also tracks feature bits in 32-bit blocks, but in an order
+// which doesn't match well here, so for the /proc/cpuinfo generation we simply
+// re-map the blocks to Linux's ordering and then go through the bits in each
+// block.
+var linuxBlockOrder = []block{1, 6, 0, 5, 2, 4, 3}
+
+// To make emulation of /proc/cpuinfo easy, these names match the names of the
+// basic features in Linux defined in arch/x86/kernel/cpu/capflags.c.
+var x86FeatureStrings = map[Feature]string{
+	// Block 0.
+	X86FeatureSSE3:     "pni",
+	X86FeaturePCLMULDQ: "pclmulqdq",
+	X86FeatureDTES64:   "dtes64",
+	X86FeatureMONITOR:  "monitor",
+	X86FeatureDSCPL:    "ds_cpl",
+	X86FeatureVMX:      "vmx",
+	X86FeatureSMX:      "smx",
+	X86FeatureEST:      "est",
+	X86FeatureTM2:      "tm2",
+	X86FeatureSSSE3:    "ssse3",
+	X86FeatureCNXTID:   "cid",
+	X86FeatureSDBG:     "sdbg",
+	X86FeatureFMA:      "fma",
+	X86FeatureCX16:     "cx16",
+	X86FeatureXTPR:     "xtpr",
+	X86FeaturePDCM:     "pdcm",
+	X86FeaturePCID:     "pcid",
+	X86FeatureDCA:      "dca",
+	X86FeatureSSE4_1:   "sse4_1",
+	X86FeatureSSE4_2:   "sse4_2",
+	X86FeatureX2APIC:   "x2apic",
+	X86FeatureMOVBE:    "movbe",
+	X86FeaturePOPCNT:   "popcnt",
+	X86FeatureTSCD:     "tsc_deadline_timer",
+	X86FeatureAES:      "aes",
+	X86FeatureXSAVE:    "xsave",
+	X86FeatureAVX:      "avx",
+	X86FeatureF16C:     "f16c",
+	X86FeatureRDRAND:   "rdrand",
+
+	// Block 1.
+	X86FeatureFPU:   "fpu",
+	X86FeatureVME:   "vme",
+	X86FeatureDE:    "de",
+	X86FeaturePSE:   "pse",
+	X86FeatureTSC:   "tsc",
+	X86FeatureMSR:   "msr",
+	X86FeaturePAE:   "pae",
+	X86FeatureMCE:   "mce",
+	X86FeatureCX8:   "cx8",
+	X86FeatureAPIC:  "apic",
+	X86FeatureSEP:   "sep",
+	X86FeatureMTRR:  "mtrr",
+	X86FeaturePGE:   "pge",
+	X86FeatureMCA:   "mca",
+	X86FeatureCMOV:  "cmov",
+	X86FeaturePAT:   "pat",
+	X86FeaturePSE36: "pse36",
+	X86FeaturePSN:   "pn",
+	X86FeatureCLFSH: "clflush",
+	X86FeatureDS:    "dts",
+	X86FeatureACPI:  "acpi",
+	X86FeatureMMX:   "mmx",
+	X86FeatureFXSR:  "fxsr",
+	X86FeatureSSE:   "sse",
+	X86FeatureSSE2:  "sse2",
+	X86FeatureSS:    "ss",
+	X86FeatureHTT:   "ht",
+	X86FeatureTM:    "tm",
+	X86FeatureIA64:  "ia64",
+	X86FeaturePBE:   "pbe",
+
+	// Block 2.
+	X86FeatureFSGSBase:   "fsgsbase",
+	X86FeatureTSC_ADJUST: "tsc_adjust",
+	X86FeatureBMI1:       "bmi1",
+	X86FeatureHLE:        "hle",
+	X86FeatureAVX2:       "avx2",
+	X86FeatureSMEP:       "smep",
+	X86FeatureBMI2:       "bmi2",
+	X86FeatureERMS:       "erms",
+	X86FeatureINVPCID:    "invpcid",
+	X86FeatureRTM:        "rtm",
+	X86FeatureCQM:        "cqm",
+	X86FeatureMPX:        "mpx",
+	X86FeatureRDT:        "rdt_a",
+	X86FeatureAVX512F:    "avx512f",
+	X86FeatureAVX512DQ:   "avx512dq",
+	X86FeatureRDSEED:     "rdseed",
+	X86FeatureADX:        "adx",
+	X86FeatureSMAP:       "smap",
+	X86FeatureCLWB:       "clwb",
+	X86FeatureAVX512PF:   "avx512pf",
+	X86FeatureAVX512ER:   "avx512er",
+	X86FeatureAVX512CD:   "avx512cd",
+	X86FeatureSHA:        "sha_ni",
+	X86FeatureAVX512BW:   "avx512bw",
+	X86FeatureAVX512VL:   "avx512vl",
+
+	// Block 3.
+	X86FeatureAVX512VBMI:       "avx512vbmi",
+	X86FeatureUMIP:             "umip",
+	X86FeaturePKU:              "pku",
+	X86FeatureOSPKE:            "ospke",
+	X86FeatureWAITPKG:          "waitpkg",
+	X86FeatureAVX512_VBMI2:     "avx512_vbmi2",
+	X86FeatureGFNI:             "gfni",
+	X86FeatureVAES:             "vaes",
+	X86FeatureVPCLMULQDQ:       "vpclmulqdq",
+	X86FeatureAVX512_VNNI:      "avx512_vnni",
+	X86FeatureAVX512_BITALG:    "avx512_bitalg",
+	X86FeatureTME:              "tme",
+	X86FeatureAVX512_VPOPCNTDQ: "avx512_vpopcntdq",
+	X86FeatureLA57:             "la57",
+	X86FeatureRDPID:            "rdpid",
+	X86FeatureCLDEMOTE:         "cldemote",
+	X86FeatureMOVDIRI:          "movdiri",
+	X86FeatureMOVDIR64B:        "movdir64b",
+
+	// Block 4.
+	X86FeatureXSAVEOPT: "xsaveopt",
+	X86FeatureXSAVEC:   "xsavec",
+	X86FeatureXGETBV1:  "xgetbv1",
+	X86FeatureXSAVES:   "xsaves",
+
+	// Block 5.
+	X86FeatureLAHF64:       "lahf_lm", // LAHF/SAHF in long mode
+	X86FeatureCMP_LEGACY:   "cmp_legacy",
+	X86FeatureSVM:          "svm",
+	X86FeatureEXTAPIC:      "extapic",
+	X86FeatureCR8_LEGACY:   "cr8_legacy",
+	X86FeatureLZCNT:        "abm", // Advanced bit manipulation
+	X86FeatureSSE4A:        "sse4a",
+	X86FeatureMISALIGNSSE:  "misalignsse",
+	X86FeaturePREFETCHW:    "3dnowprefetch",
+	X86FeatureOSVW:         "osvw",
+	X86FeatureIBS:          "ibs",
+	X86FeatureXOP:          "xop",
+	X86FeatureSKINIT:       "skinit",
+	X86FeatureWDT:          "wdt",
+	X86FeatureLWP:          "lwp",
+	X86FeatureFMA4:         "fma4",
+	X86FeatureTCE:          "tce",
+	X86FeatureTBM:          "tbm",
+	X86FeatureTOPOLOGY:     "topoext",
+	X86FeaturePERFCTR_CORE: "perfctr_core",
+	X86FeaturePERFCTR_NB:   "perfctr_nb",
+	X86FeatureBPEXT:        "bpext",
+	X86FeaturePERFCTR_TSC:  "ptsc",
+	X86FeaturePERFCTR_LLC:  "perfctr_llc",
+	X86FeatureMWAITX:       "mwaitx",
+
+	// Block 6.
+	X86FeatureSYSCALL:  "syscall",
+	X86FeatureNX:       "nx",
+	X86FeatureMMXEXT:   "mmxext",
+	X86FeatureFXSR_OPT: "fxsr_opt",
+	X86FeatureGBPAGES:  "pdpe1gb",
+	X86FeatureRDTSCP:   "rdtscp",
+	X86FeatureLM:       "lm",
+	X86Feature3DNOWEXT: "3dnowext",
+	X86Feature3DNOW:    "3dnow",
+}
+
+// These flags are parse only---they can be used for setting / unsetting the
+// flags, but will not get printed out in /proc/cpuinfo.
+var x86FeatureParseOnlyStrings = map[Feature]string{
+	// Block 0.
+	X86FeatureOSXSAVE: "osxsave",
+
+	// Block 2.
+	X86FeatureFDP_EXCPTN_ONLY: "fdp_excptn_only",
+	X86FeatureFPCSDS:          "fpcsds",
+	X86FeatureIPT:             "pt",
+	X86FeatureCLFLUSHOPT:      "clfushopt",
+
+	// Block 3.
+	X86FeaturePREFETCHWT1: "prefetchwt1",
+}
+
+// intelCacheDescriptors describe the caches and TLBs on the system. They are
+// returned in the registers for eax=2. Intel only.
+type intelCacheDescriptor uint8
+
+// Valid cache/TLB descriptors. All descriptors can be found in Intel SDM Vol.
+// 2, Ch. 3.2, "CPUID", Table 3-12 "Encoding of CPUID Leaf 2 Descriptors".
+const (
+	intelNullDescriptor    intelCacheDescriptor = 0
+	intelNoTLBDescriptor   intelCacheDescriptor = 0xfe
+	intelNoCacheDescriptor intelCacheDescriptor = 0xff
+
+	// Most descriptors omitted for brevity as they are currently unused.
+)
+
+// CacheType describes the type of a cache, as returned in eax[4:0] for eax=4.
+type CacheType uint8
+
+const (
+	// cacheNull indicates that there are no more entries.
+	cacheNull CacheType = iota
+
+	// CacheData is a data cache.
+	CacheData
+
+	// CacheInstruction is an instruction cache.
+	CacheInstruction
+
+	// CacheUnified is a unified instruction and data cache.
+	CacheUnified
+)
+
+// Cache describes the parameters of a single cache on the system.
+//
+// +stateify savable
+type Cache struct {
+	// Level is the hierarchical level of this cache (L1, L2, etc).
+	Level uint32
+
+	// Type is the type of cache.
+	Type CacheType
+
+	// FullyAssociative indicates that entries may be placed in any block.
+	FullyAssociative bool
+
+	// Partitions is the number of physical partitions in the cache.
+	Partitions uint32
+
+	// Ways is the number of ways of associativity in the cache.
+	Ways uint32
+
+	// Sets is the number of sets in the cache.
+	Sets uint32
+
+	// InvalidateHierarchical indicates that WBINVD/INVD from threads
+	// sharing this cache acts upon lower level caches for threads sharing
+	// this cache.
+	InvalidateHierarchical bool
+
+	// Inclusive indicates that this cache is inclusive of lower cache
+	// levels.
+	Inclusive bool
+
+	// DirectMapped indicates that this cache is directly mapped from
+	// address, rather than using a hash function.
+	DirectMapped bool
+}
+
+// Just a way to wrap cpuid function numbers.
+type cpuidFunction uint32
+
+// The constants below are the lower or "standard" cpuid functions, ordered as
+// defined by the hardware.
+const (
+	vendorID                      cpuidFunction = iota // Returns vendor ID and largest standard function.
+	featureInfo                                        // Returns basic feature bits and processor signature.
+	intelCacheDescriptors                              // Returns list of cache descriptors. Intel only.
+	intelSerialNumber                                  // Returns processor serial number (obsolete on new hardware). Intel only.
+	intelDeterministicCacheParams                      // Returns deterministic cache information. Intel only.
+	monitorMwaitParams                                 // Returns information about monitor/mwait instructions.
+	powerParams                                        // Returns information about power management and thermal sensors.
+	extendedFeatureInfo                                // Returns extended feature bits.
+	_                                                  // Function 0x8 is reserved.
+	intelDCAParams                                     // Returns direct cache access information. Intel only.
+	intelPMCInfo                                       // Returns information about performance monitoring features. Intel only.
+	intelX2APICInfo                                    // Returns core/logical processor topology. Intel only.
+	_                                                  // Function 0xc is reserved.
+	xSaveInfo                                          // Returns information about extended state management.
+)
+
+// The "extended" functions start at 0x80000000.
+const (
+	extendedFunctionInfo cpuidFunction = 0x80000000 + iota // Returns highest available extended function in eax.
+	extendedFeatures                                       // Returns some extended feature bits in edx and ecx.
+)
+
+// These are the extended floating point state features. They are used to
+// enumerate floating point features in XCR0, XSTATE_BV, etc.
+const (
+	XSAVEFeatureX87         = 1 << 0
+	XSAVEFeatureSSE         = 1 << 1
+	XSAVEFeatureAVX         = 1 << 2
+	XSAVEFeatureBNDREGS     = 1 << 3
+	XSAVEFeatureBNDCSR      = 1 << 4
+	XSAVEFeatureAVX512op    = 1 << 5
+	XSAVEFeatureAVX512zmm0  = 1 << 6
+	XSAVEFeatureAVX512zmm16 = 1 << 7
+	XSAVEFeaturePKRU        = 1 << 9
+)
+
+var cpuFreqMHz float64
+
+// x86FeaturesFromString includes features from x86FeatureStrings and
+// x86FeatureParseOnlyStrings.
+var x86FeaturesFromString = make(map[string]Feature)
+
+// FeatureFromString returns the Feature associated with the given feature
+// string plus a bool to indicate if it could find the feature.
+func FeatureFromString(s string) (Feature, bool) {
+	f, b := x86FeaturesFromString[s]
+	return f, b
+}
+
+// String implements fmt.Stringer.
+func (f Feature) String() string {
+	if s := f.flagString(false); s != "" {
+		return s
+	}
+
+	block := int(f) / 32
+	bit := int(f) % 32
+	return fmt.Sprintf("<cpuflag %d; block %d bit %d>", f, block, bit)
+}
+
+func (f Feature) flagString(cpuinfoOnly bool) string {
+	if s, ok := x86FeatureStrings[f]; ok {
+		return s
+	}
+	if !cpuinfoOnly {
+		return x86FeatureParseOnlyStrings[f]
+	}
+	return ""
+}
+
+// FeatureSet is a set of Features for a CPU.
+//
+// +stateify savable
+type FeatureSet struct {
+	// Set is the set of features that are enabled in this FeatureSet.
+	Set map[Feature]bool
+
+	// VendorID is the 12-char string returned in ebx:edx:ecx for eax=0.
+	VendorID string
+
+	// ExtendedFamily is part of the processor signature.
+	ExtendedFamily uint8
+
+	// ExtendedModel is part of the processor signature.
+	ExtendedModel uint8
+
+	// ProcessorType is part of the processor signature.
+	ProcessorType uint8
+
+	// Family is part of the processor signature.
+	Family uint8
+
+	// Model is part of the processor signature.
+	Model uint8
+
+	// SteppingID is part of the processor signature.
+	SteppingID uint8
+
+	// Caches describes the caches on the CPU.
+	Caches []Cache
+
+	// CacheLine is the size of a cache line in bytes.
+	//
+	// All caches use the same line size. This is not enforced in the CPUID
+	// encoding, but is true on all known x86 processors.
+	CacheLine uint32
+}
+
+// FlagsString prints out supported CPU flags. If cpuinfoOnly is true, it is
+// equivalent to the "flags" field in /proc/cpuinfo.
+func (fs *FeatureSet) FlagsString(cpuinfoOnly bool) string {
+	var s []string
+	for _, b := range linuxBlockOrder {
+		for i := 0; i < blockSize; i++ {
+			if f := featureID(b, i); fs.Set[f] {
+				if fstr := f.flagString(cpuinfoOnly); fstr != "" {
+					s = append(s, fstr)
+				}
+			}
+		}
+	}
+	return strings.Join(s, " ")
+}
+
+// WriteCPUInfoTo is to generate a section of one cpu in /proc/cpuinfo. This is
+// a minimal /proc/cpuinfo, it is missing some fields like "microcode" that are
+// not always printed in Linux. The bogomips field is simply made up.
+func (fs FeatureSet) WriteCPUInfoTo(cpu uint, b *bytes.Buffer) {
+	fmt.Fprintf(b, "processor\t: %d\n", cpu)
+	fmt.Fprintf(b, "vendor_id\t: %s\n", fs.VendorID)
+	fmt.Fprintf(b, "cpu family\t: %d\n", ((fs.ExtendedFamily<<4)&0xff)|fs.Family)
+	fmt.Fprintf(b, "model\t\t: %d\n", ((fs.ExtendedModel<<4)&0xff)|fs.Model)
+	fmt.Fprintf(b, "model name\t: %s\n", "unknown") // Unknown for now.
+	fmt.Fprintf(b, "stepping\t: %s\n", "unknown")   // Unknown for now.
+	fmt.Fprintf(b, "cpu MHz\t\t: %.3f\n", cpuFreqMHz)
+	fmt.Fprintln(b, "fpu\t\t: yes")
+	fmt.Fprintln(b, "fpu_exception\t: yes")
+	fmt.Fprintf(b, "cpuid level\t: %d\n", uint32(xSaveInfo)) // Same as ax in vendorID.
+	fmt.Fprintln(b, "wp\t\t: yes")
+	fmt.Fprintf(b, "flags\t\t: %s\n", fs.FlagsString(true))
+	fmt.Fprintf(b, "bogomips\t: %.02f\n", cpuFreqMHz) // It's bogus anyway.
+	fmt.Fprintf(b, "clflush size\t: %d\n", fs.CacheLine)
+	fmt.Fprintf(b, "cache_alignment\t: %d\n", fs.CacheLine)
+	fmt.Fprintf(b, "address sizes\t: %d bits physical, %d bits virtual\n", 46, 48)
+	fmt.Fprintln(b, "power management:") // This is always here, but can be blank.
+	fmt.Fprintln(b, "")                  // The /proc/cpuinfo file ends with an extra newline.
+}
+
+const (
+	amdVendorID   = "AuthenticAMD"
+	intelVendorID = "GenuineIntel"
+)
+
+// AMD returns true if fs describes an AMD CPU.
+func (fs *FeatureSet) AMD() bool {
+	return fs.VendorID == amdVendorID
+}
+
+// Intel returns true if fs describes an Intel CPU.
+func (fs *FeatureSet) Intel() bool {
+	return fs.VendorID == intelVendorID
+}
+
+// ErrIncompatible is returned by FeatureSet.HostCompatible if fs is not a
+// subset of the host feature set.
+type ErrIncompatible struct {
+	message string
+}
+
+// Error implements error.
+func (e ErrIncompatible) Error() string {
+	return e.message
+}
+
+// CheckHostCompatible returns nil if fs is a subset of the host feature set.
+func (fs *FeatureSet) CheckHostCompatible() error {
+	hfs := HostFeatureSet()
+
+	if diff := fs.Subtract(hfs); diff != nil {
+		return ErrIncompatible{fmt.Sprintf("CPU feature set %v incompatible with host feature set %v (missing: %v)", fs.FlagsString(false), hfs.FlagsString(false), diff)}
+	}
+
+	// The size of a cache line must match, as it is critical to correctly
+	// utilizing CLFLUSH. Other cache properties are allowed to change, as
+	// they are not important to correctness.
+	if fs.CacheLine != hfs.CacheLine {
+		return ErrIncompatible{fmt.Sprintf("CPU cache line size %d incompatible with host cache line size %d", fs.CacheLine, hfs.CacheLine)}
+	}
+
+	return nil
+}
+
+// Helper to convert 3 regs into 12-byte vendor ID.
+func vendorIDFromRegs(bx, cx, dx uint32) string {
+	bytes := make([]byte, 0, 12)
+	for i := uint(0); i < 4; i++ {
+		b := byte(bx >> (i * 8))
+		bytes = append(bytes, b)
+	}
+
+	for i := uint(0); i < 4; i++ {
+		b := byte(dx >> (i * 8))
+		bytes = append(bytes, b)
+	}
+
+	for i := uint(0); i < 4; i++ {
+		b := byte(cx >> (i * 8))
+		bytes = append(bytes, b)
+	}
+	return string(bytes)
+}
+
+// ExtendedStateSize returns the number of bytes needed to save the "extended
+// state" for this processor and the boundary it must be aligned to. Extended
+// state includes floating point registers, and other cpu state that's not
+// associated with the normal task context.
+//
+// Note: We can save some space here with an optimization where we use a
+// smaller chunk of memory depending on features that are actually enabled.
+// Currently we just use the largest possible size for simplicity (which is
+// about 2.5K worst case, with avx512).
+func (fs *FeatureSet) ExtendedStateSize() (size, align uint) {
+	if fs.UseXsave() {
+		// Leaf 0 of xsaveinfo function returns the size for currently
+		// enabled xsave features in ebx, the maximum size if all valid
+		// features are saved with xsave in ecx, and valid XCR0 bits in
+		// edx:eax.
+		_, _, maxSize, _ := HostID(uint32(xSaveInfo), 0)
+		return uint(maxSize), 64
+	}
+
+	// If we don't support xsave, we fall back to fxsave, which requires
+	// 512 bytes aligned to 16 bytes.
+	return 512, 16
+}
+
+// ValidXCR0Mask returns the bits that may be set to 1 in control register
+// XCR0.
+func (fs *FeatureSet) ValidXCR0Mask() uint64 {
+	if !fs.UseXsave() {
+		return 0
+	}
+	eax, _, _, edx := HostID(uint32(xSaveInfo), 0)
+	return uint64(edx)<<32 | uint64(eax)
+}
+
+// vendorIDRegs returns the 3 register values used to construct the 12-byte
+// vendor ID string for eax=0.
+func (fs *FeatureSet) vendorIDRegs() (bx, dx, cx uint32) {
+	for i := uint(0); i < 4; i++ {
+		bx |= uint32(fs.VendorID[i]) << (i * 8)
+	}
+
+	for i := uint(0); i < 4; i++ {
+		dx |= uint32(fs.VendorID[i+4]) << (i * 8)
+	}
+
+	for i := uint(0); i < 4; i++ {
+		cx |= uint32(fs.VendorID[i+8]) << (i * 8)
+	}
+	return
+}
+
+// signature returns the signature dword that's returned in eax when eax=1.
+func (fs *FeatureSet) signature() uint32 {
+	var s uint32
+	s |= uint32(fs.SteppingID & 0xf)
+	s |= uint32(fs.Model&0xf) << 4
+	s |= uint32(fs.Family&0xf) << 8
+	s |= uint32(fs.ProcessorType&0x3) << 12
+	s |= uint32(fs.ExtendedModel&0xf) << 16
+	s |= uint32(fs.ExtendedFamily&0xff) << 20
+	return s
+}
+
+// Helper to deconstruct signature dword.
+func signatureSplit(v uint32) (ef, em, pt, f, m, sid uint8) {
+	sid = uint8(v & 0xf)
+	m = uint8(v>>4) & 0xf
+	f = uint8(v>>8) & 0xf
+	pt = uint8(v>>12) & 0x3
+	em = uint8(v>>16) & 0xf
+	ef = uint8(v >> 20)
+	return
+}
+
+// Helper to convert blockwise feature bit masks into a set of features. Masks
+// must be provided in order for each block, without skipping them. If a block
+// does not matter for this feature set, 0 is specified.
+func setFromBlockMasks(blocks ...uint32) map[Feature]bool {
+	s := make(map[Feature]bool)
+	for b, blockMask := range blocks {
+		for i := 0; i < blockSize; i++ {
+			if blockMask&1 != 0 {
+				s[featureID(block(b), i)] = true
+			}
+			blockMask >>= 1
+		}
+	}
+	return s
+}
+
+// blockMask returns the 32-bit mask associated with a block of features.
+func (fs *FeatureSet) blockMask(b block) uint32 {
+	var mask uint32
+	for i := 0; i < blockSize; i++ {
+		if fs.Set[featureID(b, i)] {
+			mask |= 1 << uint(i)
+		}
+	}
+	return mask
+}
+
+// Remove removes a Feature from a FeatureSet. It ignores features
+// that are not in the FeatureSet.
+func (fs *FeatureSet) Remove(feature Feature) {
+	delete(fs.Set, feature)
+}
+
+// Add adds a Feature to a FeatureSet. It ignores duplicate features.
+func (fs *FeatureSet) Add(feature Feature) {
+	fs.Set[feature] = true
+}
+
+// HasFeature tests whether or not a feature is in the given feature set.
+func (fs *FeatureSet) HasFeature(feature Feature) bool {
+	return fs.Set[feature]
+}
+
+// Subtract returns the features present in fs that are not present in other.
+// If all features in fs are present in other, Subtract returns nil.
+func (fs *FeatureSet) Subtract(other *FeatureSet) (diff map[Feature]bool) {
+	for f := range fs.Set {
+		if !other.Set[f] {
+			if diff == nil {
+				diff = make(map[Feature]bool)
+			}
+			diff[f] = true
+		}
+	}
+
+	return
+}
+
+// EmulateID emulates a cpuid instruction based on the feature set.
+func (fs *FeatureSet) EmulateID(origAx, origCx uint32) (ax, bx, cx, dx uint32) {
+	switch cpuidFunction(origAx) {
+	case vendorID:
+		ax = uint32(xSaveInfo) // 0xd (xSaveInfo) is the highest function we support.
+		bx, dx, cx = fs.vendorIDRegs()
+	case featureInfo:
+		// CLFLUSH line size is encoded in quadwords. Other fields in bx unsupported.
+		bx = (fs.CacheLine / 8) << 8
+		cx = fs.blockMask(block(0))
+		dx = fs.blockMask(block(1))
+		ax = fs.signature()
+	case intelCacheDescriptors:
+		if !fs.Intel() {
+			// Reserved on non-Intel.
+			return 0, 0, 0, 0
+		}
+
+		// "The least-significant byte in register EAX (register AL)
+		// will always return 01H. Software should ignore this value
+		// and not interpret it as an informational descriptor." - SDM
+		//
+		// We only support reporting cache parameters via
+		// intelDeterministicCacheParams; report as much here.
+		//
+		// We do not support exposing TLB information at all.
+		ax = 1 | (uint32(intelNoCacheDescriptor) << 8)
+	case intelDeterministicCacheParams:
+		if !fs.Intel() {
+			// Reserved on non-Intel.
+			return 0, 0, 0, 0
+		}
+
+		// cx is the index of the cache to describe.
+		if int(origCx) >= len(fs.Caches) {
+			return uint32(cacheNull), 0, 0, 0
+		}
+		c := fs.Caches[origCx]
+
+		ax = uint32(c.Type)
+		ax |= c.Level << 5
+		ax |= 1 << 8 // Always claim the cache is "self-initializing".
+		if c.FullyAssociative {
+			ax |= 1 << 9
+		}
+		// Processor topology not supported.
+
+		bx = fs.CacheLine - 1
+		bx |= (c.Partitions - 1) << 12
+		bx |= (c.Ways - 1) << 22
+
+		cx = c.Sets - 1
+
+		if !c.InvalidateHierarchical {
+			dx |= 1
+		}
+		if c.Inclusive {
+			dx |= 1 << 1
+		}
+		if !c.DirectMapped {
+			dx |= 1 << 2
+		}
+	case xSaveInfo:
+		if !fs.UseXsave() {
+			return 0, 0, 0, 0
+		}
+		return HostID(uint32(xSaveInfo), origCx)
+	case extendedFeatureInfo:
+		if origCx != 0 {
+			break // Only leaf 0 is supported.
+		}
+		bx = fs.blockMask(block(2))
+		cx = fs.blockMask(block(3))
+	case extendedFunctionInfo:
+		// We only support showing the extended features.
+		ax = uint32(extendedFeatures)
+		cx = 0
+	case extendedFeatures:
+		cx = fs.blockMask(block(5))
+		dx = fs.blockMask(block(6))
+		if fs.AMD() {
+			// AMD duplicates some block 1 features in block 6.
+			dx |= fs.blockMask(block(1)) & block6DuplicateMask
+		}
+	}
+
+	return
+}
+
+// UseXsave returns the choice of fp state saving instruction.
+func (fs *FeatureSet) UseXsave() bool {
+	return fs.HasFeature(X86FeatureXSAVE) && fs.HasFeature(X86FeatureOSXSAVE)
+}
+
+// UseXsaveopt returns true if 'fs' supports the "xsaveopt" instruction.
+func (fs *FeatureSet) UseXsaveopt() bool {
+	return fs.UseXsave() && fs.HasFeature(X86FeatureXSAVEOPT)
+}
+
+// HostID executes a native CPUID instruction.
+func HostID(axArg, cxArg uint32) (ax, bx, cx, dx uint32)
+
+// HostFeatureSet uses cpuid to get host values and construct a feature set
+// that matches that of the host machine. Note that there are several places
+// where there appear to be some unnecessary assignments between register names
+// (ax, bx, cx, or dx) and featureBlockN variables. This is to explicitly show
+// where the different feature blocks come from, to make the code easier to
+// inspect and read.
+func HostFeatureSet() *FeatureSet {
+	// eax=0 gets max supported feature and vendor ID.
+	_, bx, cx, dx := HostID(0, 0)
+	vendorID := vendorIDFromRegs(bx, cx, dx)
+
+	// eax=1 gets basic features in ecx:edx.
+	ax, bx, cx, dx := HostID(1, 0)
+	featureBlock0 := cx
+	featureBlock1 := dx
+	ef, em, pt, f, m, sid := signatureSplit(ax)
+	cacheLine := 8 * (bx >> 8) & 0xff
+
+	// eax=4, ecx=i gets details about cache index i. Only supported on Intel.
+	var caches []Cache
+	if vendorID == intelVendorID {
+		// ecx selects the cache index until a null type is returned.
+		for i := uint32(0); ; i++ {
+			ax, bx, cx, dx := HostID(4, i)
+			t := CacheType(ax & 0xf)
+			if t == cacheNull {
+				break
+			}
+
+			lineSize := (bx & 0xfff) + 1
+			if lineSize != cacheLine {
+				panic(fmt.Sprintf("Mismatched cache line size: %d vs %d", lineSize, cacheLine))
+			}
+
+			caches = append(caches, Cache{
+				Type:                   t,
+				Level:                  (ax >> 5) & 0x7,
+				FullyAssociative:       ((ax >> 9) & 1) == 1,
+				Partitions:             ((bx >> 12) & 0x3ff) + 1,
+				Ways:                   ((bx >> 22) & 0x3ff) + 1,
+				Sets:                   cx + 1,
+				InvalidateHierarchical: (dx & 1) == 0,
+				Inclusive:              ((dx >> 1) & 1) == 1,
+				DirectMapped:           ((dx >> 2) & 1) == 0,
+			})
+		}
+	}
+
+	// eax=7, ecx=0 gets extended features in ecx:ebx.
+	_, bx, cx, _ = HostID(7, 0)
+	featureBlock2 := bx
+	featureBlock3 := cx
+
+	// Leaf 0xd is supported only if CPUID.1:ECX.XSAVE[bit 26] is set.
+	var featureBlock4 uint32
+	if (featureBlock0 & (1 << 26)) != 0 {
+		featureBlock4, _, _, _ = HostID(uint32(xSaveInfo), 1)
+	}
+
+	// eax=0x80000000 gets supported extended levels. We use this to
+	// determine if there are any non-zero block 4 or block 6 bits to find.
+	var featureBlock5, featureBlock6 uint32
+	if ax, _, _, _ := HostID(uint32(extendedFunctionInfo), 0); ax >= uint32(extendedFeatures) {
+		// eax=0x80000001 gets AMD added feature bits.
+		_, _, cx, dx = HostID(uint32(extendedFeatures), 0)
+		featureBlock5 = cx
+		// Ignore features duplicated from block 1 on AMD. These bits
+		// are reserved on Intel.
+		featureBlock6 = dx &^ block6DuplicateMask
+	}
+
+	set := setFromBlockMasks(featureBlock0, featureBlock1, featureBlock2, featureBlock3, featureBlock4, featureBlock5, featureBlock6)
+	return &FeatureSet{
+		Set:            set,
+		VendorID:       vendorID,
+		ExtendedFamily: ef,
+		ExtendedModel:  em,
+		ProcessorType:  pt,
+		Family:         f,
+		Model:          m,
+		SteppingID:     sid,
+		CacheLine:      cacheLine,
+		Caches:         caches,
+	}
+}
+
+// Reads max cpu frequency from host /proc/cpuinfo. Must run before
+// whitelisting. This value is used to create the fake /proc/cpuinfo from a
+// FeatureSet.
+func initCPUFreq() {
+	cpuinfob, err := ioutil.ReadFile("/proc/cpuinfo")
+	if err != nil {
+		// Leave it as 0... The standalone VDSO bails out in the same
+		// way.
+		log.Warningf("Could not read /proc/cpuinfo: %v", err)
+		return
+	}
+	cpuinfo := string(cpuinfob)
+
+	// We get the value straight from host /proc/cpuinfo. On machines with
+	// frequency scaling enabled, this will only get the current value
+	// which will likely be inaccurate. This is fine on machines with
+	// frequency scaling disabled.
+	for _, line := range strings.Split(cpuinfo, "\n") {
+		if strings.Contains(line, "cpu MHz") {
+			splitMHz := strings.Split(line, ":")
+			if len(splitMHz) < 2 {
+				log.Warningf("Could not read /proc/cpuinfo: malformed cpu MHz line")
+				return
+			}
+
+			// If there was a problem, leave cpuFreqMHz as 0.
+			var err error
+			cpuFreqMHz, err = strconv.ParseFloat(strings.TrimSpace(splitMHz[1]), 64)
+			if err != nil {
+				log.Warningf("Could not parse cpu MHz value %v: %v", splitMHz[1], err)
+				cpuFreqMHz = 0
+				return
+			}
+			return
+		}
+	}
+	log.Warningf("Could not parse /proc/cpuinfo, it is empty or does not contain cpu MHz")
+}
+
+func initFeaturesFromString() {
+	for f, s := range x86FeatureStrings {
+		x86FeaturesFromString[s] = f
+	}
+	for f, s := range x86FeatureParseOnlyStrings {
+		x86FeaturesFromString[s] = f
+	}
+}
+
+func init() {
+	// initCpuFreq must be run before whitelists are enabled.
+	initCPUFreq()
+	initFeaturesFromString()
+}
diff --git a/pkg/cpuid/cpuid_x86_test.go b/pkg/cpuid/cpuid_x86_test.go
new file mode 100644
index 000000000..0fe20c213
--- /dev/null
+++ b/pkg/cpuid/cpuid_x86_test.go
@@ -0,0 +1,243 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package cpuid
+
+import (
+	"testing"
+)
+
+// These are the default values of various FeatureSet fields.
+const (
+	defaultVendorID = "GenuineIntel"
+
+	// These processor signature defaults are derived from the values
+	// listed in Intel Application Note 485 for i7/Xeon processors.
+	defaultExtFamily  uint8 = 0
+	defaultExtModel   uint8 = 1
+	defaultType       uint8 = 0
+	defaultFamily     uint8 = 0x06
+	defaultModel      uint8 = 0x0a
+	defaultSteppingID uint8 = 0
+)
+
+// newEmptyFeatureSet creates a new FeatureSet with a sensible default model and no features.
+func newEmptyFeatureSet() *FeatureSet {
+	return &FeatureSet{
+		Set:            make(map[Feature]bool),
+		VendorID:       defaultVendorID,
+		ExtendedFamily: defaultExtFamily,
+		ExtendedModel:  defaultExtModel,
+		ProcessorType:  defaultType,
+		Family:         defaultFamily,
+		Model:          defaultModel,
+		SteppingID:     defaultSteppingID,
+	}
+}
+
+var justFPU = &FeatureSet{
+	Set: map[Feature]bool{
+		X86FeatureFPU: true,
+	}}
+
+var justFPUandPAE = &FeatureSet{
+	Set: map[Feature]bool{
+		X86FeatureFPU: true,
+		X86FeaturePAE: true,
+	}}
+
+func TestSubtract(t *testing.T) {
+	if diff := justFPU.Subtract(justFPUandPAE); diff != nil {
+		t.Errorf("Got %v is not subset of %v, want diff (%v) to be nil", justFPU, justFPUandPAE, diff)
+	}
+
+	if justFPUandPAE.Subtract(justFPU) == nil {
+		t.Errorf("Got %v is a subset of %v, want diff to be nil", justFPU, justFPUandPAE)
+	}
+}
+
+// TODO(b/73346484): Run this test on a very old platform, and make sure more
+// bits are enabled than just FPU and PAE. This test currently may not detect
+// if HostFeatureSet gives back junk bits.
+func TestHostFeatureSet(t *testing.T) {
+	hostFeatures := HostFeatureSet()
+	if justFPUandPAE.Subtract(hostFeatures) != nil {
+		t.Errorf("Got invalid feature set %v from HostFeatureSet()", hostFeatures)
+	}
+}
+
+func TestHasFeature(t *testing.T) {
+	if !justFPU.HasFeature(X86FeatureFPU) {
+		t.Errorf("HasFeature failed, %v should contain %v", justFPU, X86FeatureFPU)
+	}
+
+	if justFPU.HasFeature(X86FeatureAVX) {
+		t.Errorf("HasFeature failed, %v should not contain %v", justFPU, X86FeatureAVX)
+	}
+}
+
+// Note: these tests are aware of and abuse internal details of FeatureSets.
+// Users of FeatureSets should not depend on this.
+func TestAdd(t *testing.T) {
+	// Test a basic insertion into the FeatureSet.
+	testFeatures := newEmptyFeatureSet()
+	testFeatures.Add(X86FeatureCLFSH)
+	if len(testFeatures.Set) != 1 {
+		t.Errorf("Got length %v want 1", len(testFeatures.Set))
+	}
+
+	if !testFeatures.HasFeature(X86FeatureCLFSH) {
+		t.Errorf("Add failed, got %v want set with %v", testFeatures, X86FeatureCLFSH)
+	}
+
+	// Test that duplicates are ignored.
+	testFeatures.Add(X86FeatureCLFSH)
+	if len(testFeatures.Set) != 1 {
+		t.Errorf("Got length %v, want 1", len(testFeatures.Set))
+	}
+}
+
+func TestRemove(t *testing.T) {
+	// Try removing the last feature.
+	testFeatures := newEmptyFeatureSet()
+	testFeatures.Add(X86FeatureFPU)
+	testFeatures.Add(X86FeaturePAE)
+	testFeatures.Remove(X86FeaturePAE)
+	if !testFeatures.HasFeature(X86FeatureFPU) || len(testFeatures.Set) != 1 || testFeatures.HasFeature(X86FeaturePAE) {
+		t.Errorf("Remove failed, got %v want %v", testFeatures, justFPU)
+	}
+
+	// Try removing a feature not in the set.
+	testFeatures.Remove(X86FeatureRDRAND)
+	if !testFeatures.HasFeature(X86FeatureFPU) || len(testFeatures.Set) != 1 {
+		t.Errorf("Remove failed, got %v want %v", testFeatures, justFPU)
+	}
+}
+
+func TestFeatureFromString(t *testing.T) {
+	f, ok := FeatureFromString("avx")
+	if f != X86FeatureAVX || !ok {
+		t.Errorf("got %v want avx", f)
+	}
+
+	f, ok = FeatureFromString("bad")
+	if ok {
+		t.Errorf("got %v want nothing", f)
+	}
+}
+
+// This tests function 0 (eax=0), which returns the vendor ID and highest cpuid
+// function reported to be available.
+func TestEmulateIDVendorAndLength(t *testing.T) {
+	testFeatures := newEmptyFeatureSet()
+
+	ax, bx, cx, dx := testFeatures.EmulateID(0, 0)
+	wantEax := uint32(0xd) // Highest supported cpuid function.
+
+	// These magical constants are the characters of "GenuineIntel".
+	// See Intel AN485 for a reference on why they are laid out like this.
+	wantEbx := uint32(0x756e6547)
+	wantEcx := uint32(0x6c65746e)
+	wantEdx := uint32(0x49656e69)
+	if wantEax != ax {
+		t.Errorf("highest function failed, got %x want %x", ax, wantEax)
+	}
+
+	if wantEbx != bx || wantEcx != cx || wantEdx != dx {
+		t.Errorf("vendor string emulation failed, bx:cx:dx, got %x:%x:%x want %x:%x:%x", bx, cx, dx, wantEbx, wantEcx, wantEdx)
+	}
+}
+
+func TestEmulateIDBasicFeatures(t *testing.T) {
+	// Make a minimal test feature set.
+	testFeatures := newEmptyFeatureSet()
+	testFeatures.Add(X86FeatureCLFSH)
+	testFeatures.Add(X86FeatureAVX)
+	testFeatures.CacheLine = 64
+
+	ax, bx, cx, dx := testFeatures.EmulateID(1, 0)
+	ECXAVXBit := uint32(1 << uint(X86FeatureAVX))
+	EDXCLFlushBit := uint32(1 << uint(X86FeatureCLFSH-32)) // We adjust by 32 since it's in block 1.
+
+	if EDXCLFlushBit&dx == 0 || dx&^EDXCLFlushBit != 0 {
+		t.Errorf("EmulateID failed, got feature bits %x want %x", dx, testFeatures.blockMask(1))
+	}
+
+	if ECXAVXBit&cx == 0 || cx&^ECXAVXBit != 0 {
+		t.Errorf("EmulateID failed, got feature bits %x want %x", cx, testFeatures.blockMask(0))
+	}
+
+	// Default signature bits, based on values for i7/Xeon.
+	// See Intel AN485 for information on stepping/model bits.
+	defaultSignature := uint32(0x000106a0)
+	if defaultSignature != ax {
+		t.Errorf("EmulateID stepping emulation failed, got %x want %x", ax, defaultSignature)
+	}
+
+	clflushSizeInfo := uint32(8 << 8)
+	if clflushSizeInfo != bx {
+		t.Errorf("EmulateID bx emulation failed, got %x want %x", bx, clflushSizeInfo)
+	}
+}
+
+func TestEmulateIDExtendedFeatures(t *testing.T) {
+	// Make a minimal test feature set, one bit in each extended feature word.
+	testFeatures := newEmptyFeatureSet()
+	testFeatures.Add(X86FeatureSMEP)
+	testFeatures.Add(X86FeatureAVX512VBMI)
+
+	ax, bx, cx, dx := testFeatures.EmulateID(7, 0)
+	EBXSMEPBit := uint32(1 << uint(X86FeatureSMEP-2*32))      // Adjust by 2*32 since SMEP is a block 2 feature.
+	ECXAVXBit := uint32(1 << uint(X86FeatureAVX512VBMI-3*32)) // We adjust by 3*32 since it's a block 3 feature.
+
+	// Test that the desired bit is set and no other bits are set.
+	if EBXSMEPBit&bx == 0 || bx&^EBXSMEPBit != 0 {
+		t.Errorf("extended feature emulation failed, got feature bits %x want %x", bx, testFeatures.blockMask(2))
+	}
+
+	if ECXAVXBit&cx == 0 || cx&^ECXAVXBit != 0 {
+		t.Errorf("extended feature emulation failed, got feature bits %x want %x", cx, testFeatures.blockMask(3))
+	}
+
+	if ax != 0 || dx != 0 {
+		t.Errorf("extended feature emulation failed, ax:dx, got %x:%x want 0:0", ax, dx)
+	}
+
+	// Check that no subleaves other than 0 do anything.
+	ax, bx, cx, dx = testFeatures.EmulateID(7, 1)
+	if ax != 0 || bx != 0 || cx != 0 || dx != 0 {
+		t.Errorf("extended feature emulation failed, got %x:%x:%x:%x want 0:0", ax, bx, cx, dx)
+	}
+
+}
+
+// Checks that the expected extended features are available via cpuid functions
+// 0x80000000 and up.
+func TestEmulateIDExtended(t *testing.T) {
+	testFeatures := newEmptyFeatureSet()
+	testFeatures.Add(X86FeatureSYSCALL)
+	EDXSYSCALLBit := uint32(1 << uint(X86FeatureSYSCALL-6*32)) // Adjust by 6*32 since SYSCALL is a block 6 feature.
+
+	ax, bx, cx, dx := testFeatures.EmulateID(0x80000000, 0)
+	if ax != 0x80000001 || bx != 0 || cx != 0 || dx != 0 {
+		t.Errorf("EmulateID extended emulation failed, ax:bx:cx:dx, got %x:%x:%x:%x want 0x80000001:0:0:0", ax, bx, cx, dx)
+	}
+
+	_, _, _, dx = testFeatures.EmulateID(0x80000001, 0)
+	if EDXSYSCALLBit&dx == 0 || dx&^EDXSYSCALLBit != 0 {
+		t.Errorf("extended feature emulation failed, got feature bits %x want %x", dx, testFeatures.blockMask(6))
+	}
+}
-- 
cgit v1.2.3


From 724bafa094c70f1eb0d5eb6f1ef6744f761452ea Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Mon, 10 Feb 2020 08:28:00 -0800
Subject: Fix wrong path for ruby benchmark.

PiperOrigin-RevId: 294224689
---
 benchmarks/workloads/ruby_template/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/workloads/ruby_template/BUILD b/benchmarks/workloads/ruby_template/BUILD
index 59443b14a..72ed9403d 100644
--- a/benchmarks/workloads/ruby_template/BUILD
+++ b/benchmarks/workloads/ruby_template/BUILD
@@ -15,5 +15,4 @@ pkg_tar(
         "index.erb",
         "main.rb",
     ],
-    strip_prefix = "third_party/gvisor/benchmarks/workloads/ruby_template",
 )
-- 
cgit v1.2.3


From 75412ed9f5b6b327dec05ffff99d7fe6198d25a8 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Mon, 10 Feb 2020 10:28:56 -0800
Subject: Internal change.

PiperOrigin-RevId: 294250370
---
 test/syscalls/linux/inotify.cc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index fdef646eb..0e13ad190 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -1055,9 +1055,9 @@ TEST(Inotify, ChmodGeneratesAttribEvent_NoRandomSave) {
   const TempPath file1 =
       ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
 
-  const FileDescriptor root_fd =
+  FileDescriptor root_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(root.path(), O_RDONLY));
-  const FileDescriptor file1_fd =
+  FileDescriptor file1_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR));
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
 
@@ -1091,6 +1091,11 @@ TEST(Inotify, ChmodGeneratesAttribEvent_NoRandomSave) {
   ASSERT_THAT(fchmodat(root_fd.get(), file1_basename.c_str(), S_IWGRP, 0),
               SyscallSucceeds());
   verify_chmod_events();
+
+  // Make sure the chmod'ed file descriptors are destroyed before DisableSave
+  // is destructed.
+  root_fd.reset();
+  file1_fd.reset();
 }
 
 TEST(Inotify, TruncateGeneratesModifyEvent) {
-- 
cgit v1.2.3


From 223931da847f0ad25b44e18fb59da3391d028139 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Mon, 10 Feb 2020 10:40:33 -0800
Subject: Bump rules_go, gazelle, and go toolchain.

PiperOrigin-RevId: 294253155
---
 WORKSPACE | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index 5d2fc36f9..2827c3a26 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -4,19 +4,19 @@ load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
 # Load go bazel rules and gazelle.
 http_archive(
     name = "io_bazel_rules_go",
-    sha256 = "b27e55d2dcc9e6020e17614ae6e0374818a3e3ce6f2024036e688ada24110444",
+    sha256 = "f99a9d76e972e0c8f935b2fe6d0d9d778f67c760c6d2400e23fc2e469016e2bd",
     urls = [
-        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/rules_go/releases/download/v0.21.0/rules_go-v0.21.0.tar.gz",
-        "https://github.com/bazelbuild/rules_go/releases/download/v0.21.0/rules_go-v0.21.0.tar.gz",
+        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/rules_go/releases/download/v0.21.2/rules_go-v0.21.2.tar.gz",
+        "https://github.com/bazelbuild/rules_go/releases/download/v0.21.2/rules_go-v0.21.2.tar.gz",
     ],
 )
 
 http_archive(
     name = "bazel_gazelle",
-    sha256 = "86c6d481b3f7aedc1d60c1c211c6f76da282ae197c3b3160f54bd3a8f847896f",
+    sha256 = "d8c45ee70ec39a57e7a05e5027c32b1576cc7f16d9dd37135b0eddde45cf1b10",
     urls = [
-        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/bazel-gazelle/releases/download/v0.19.1/bazel-gazelle-v0.19.1.tar.gz",
-        "https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.19.1/bazel-gazelle-v0.19.1.tar.gz",
+        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/bazel-gazelle/releases/download/v0.20.0/bazel-gazelle-v0.20.0.tar.gz",
+        "https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.20.0/bazel-gazelle-v0.20.0.tar.gz",
     ],
 )
 
@@ -25,7 +25,7 @@ load("@io_bazel_rules_go//go:deps.bzl", "go_rules_dependencies", "go_register_to
 go_rules_dependencies()
 
 go_register_toolchains(
-    go_version = "1.13.6",
+    go_version = "1.13.7",
     nogo = "@//:nogo",
 )
 
-- 
cgit v1.2.3


From 31f2182cd3fc2a6fdb1aecf1c56f1302f16f6453 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 10 Feb 2020 11:08:24 -0800
Subject: iptables: add instructions for runsc building.

The readme didn't mention that users need to:
- `bazel build` when working on iptables tests
- enable raw sockets in /etc/docker/daemon.json.

PiperOrigin-RevId: 294260169
---
 test/iptables/README.md | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/test/iptables/README.md b/test/iptables/README.md
index c2b934e1f..cc8a2fcac 100644
--- a/test/iptables/README.md
+++ b/test/iptables/README.md
@@ -2,6 +2,9 @@
 
 iptables tests are run via `scripts/iptables_test.sh`.
 
+iptables requires raw socket support, so you must add the `--net-raw=true` flag
+to `/etc/docker/daemon.json` in order to use it.
+
 ## Test Structure
 
 Each test implements `TestCase`, providing (1) a function to run inside the
@@ -25,7 +28,14 @@ Your test is now runnable with bazel!
 
 ## Run individual tests
 
-Build the testing Docker container:
+Build and install `runsc`. Re-run this when you modify gVisor:
+
+```bash
+$ bazel build //runsc && sudo cp bazel-bin/runsc/linux_amd64_pure_stripped/runsc $(which runsc)
+```
+
+Build the testing Docker container. Re-run this when you modify the test code in
+this directory:
 
 ```bash
 $ bazel run //test/iptables/runner:runner-image -- --norun
-- 
cgit v1.2.3


From 0efa8168c7c04ec0a4bd62e2d2eb8718b5d72ea7 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 11:28:57 -0800
Subject: Update visibility.

PiperOrigin-RevId: 294265019
---
 pkg/seccomp/BUILD                | 2 +-
 test/root/testdata/BUILD         | 2 +-
 tools/build/BUILD                | 2 +-
 tools/checkunsafe/BUILD          | 2 +-
 tools/go_generics/BUILD          | 2 +-
 tools/go_generics/go_merge/BUILD | 2 +-
 tools/go_stateify/BUILD          | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD
index 742c8b79b..c5fca2ba3 100644
--- a/pkg/seccomp/BUILD
+++ b/pkg/seccomp/BUILD
@@ -26,7 +26,7 @@ go_library(
         "seccomp_rules.go",
         "seccomp_unsafe.go",
     ],
-    visibility = ["//visibility:public"],
+    visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/bpf",
diff --git a/test/root/testdata/BUILD b/test/root/testdata/BUILD
index bca5f9cab..6859541ad 100644
--- a/test/root/testdata/BUILD
+++ b/test/root/testdata/BUILD
@@ -13,6 +13,6 @@ go_library(
         "simple.go",
     ],
     visibility = [
-        "//visibility:public",
+        "//:sandbox",
     ],
 )
diff --git a/tools/build/BUILD b/tools/build/BUILD
index 0c0ce3f4d..00a467473 100644
--- a/tools/build/BUILD
+++ b/tools/build/BUILD
@@ -6,5 +6,5 @@ genrule(
     name = "loopback",
     outs = ["loopback.txt"],
     cmd = "touch $@",
-    visibility = ["//visibility:public"],
+    visibility = ["//:sandbox"],
 )
diff --git a/tools/checkunsafe/BUILD b/tools/checkunsafe/BUILD
index 92ba8ab06..4f1a31a6d 100644
--- a/tools/checkunsafe/BUILD
+++ b/tools/checkunsafe/BUILD
@@ -5,7 +5,7 @@ package(licenses = ["notice"])
 go_tool_library(
     name = "checkunsafe",
     srcs = ["check_unsafe.go"],
-    visibility = ["//visibility:public"],
+    visibility = ["//:sandbox"],
     deps = [
         "@org_golang_x_tools//go/analysis:go_tool_library",
     ],
diff --git a/tools/go_generics/BUILD b/tools/go_generics/BUILD
index 069df3856..32a949c93 100644
--- a/tools/go_generics/BUILD
+++ b/tools/go_generics/BUILD
@@ -9,7 +9,7 @@ go_binary(
         "imports.go",
         "remove.go",
     ],
-    visibility = ["//visibility:public"],
+    visibility = ["//:sandbox"],
     deps = ["//tools/go_generics/globals"],
 )
 
diff --git a/tools/go_generics/go_merge/BUILD b/tools/go_generics/go_merge/BUILD
index b7d35e272..2fd5a200d 100644
--- a/tools/go_generics/go_merge/BUILD
+++ b/tools/go_generics/go_merge/BUILD
@@ -5,5 +5,5 @@ package(licenses = ["notice"])
 go_binary(
     name = "go_merge",
     srcs = ["main.go"],
-    visibility = ["//visibility:public"],
+    visibility = ["//:sandbox"],
 )
diff --git a/tools/go_stateify/BUILD b/tools/go_stateify/BUILD
index 6036faf7b..503cdf2e5 100644
--- a/tools/go_stateify/BUILD
+++ b/tools/go_stateify/BUILD
@@ -5,6 +5,6 @@ package(licenses = ["notice"])
 go_binary(
     name = "stateify",
     srcs = ["main.go"],
-    visibility = ["//visibility:public"],
+    visibility = ["//:sandbox"],
     deps = ["//tools/tags"],
 )
-- 
cgit v1.2.3


From 20840bfec087d45853e81d1ac34940f3b2fb920a Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Mon, 10 Feb 2020 11:57:31 -0800
Subject: Move x86 state definition to its own file.

PiperOrigin-RevId: 294271541
---
 pkg/sentry/arch/BUILD             |  1 +
 pkg/sentry/arch/arch_state_x86.go |  4 ++--
 pkg/sentry/arch/arch_x86.go       | 15 --------------
 pkg/sentry/arch/arch_x86_impl.go  | 43 +++++++++++++++++++++++++++++++++++++++
 tools/build/tags.bzl              | 24 +++++++++++-----------
 5 files changed, 58 insertions(+), 29 deletions(-)
 create mode 100644 pkg/sentry/arch/arch_x86_impl.go

diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index 34c0a867d..e27f21e5e 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -14,6 +14,7 @@ go_library(
         "arch_state_aarch64.go",
         "arch_state_x86.go",
         "arch_x86.go",
+        "arch_x86_impl.go",
         "auxv.go",
         "signal.go",
         "signal_act.go",
diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go
index d388ee9cf..e35c9214a 100644
--- a/pkg/sentry/arch/arch_state_x86.go
+++ b/pkg/sentry/arch/arch_state_x86.go
@@ -43,8 +43,8 @@ func (e ErrFloatingPoint) Error() string {
 // and SSE state, so this is the equivalent XSTATE_BV value.
 const fxsaveBV uint64 = cpuid.XSAVEFeatureX87 | cpuid.XSAVEFeatureSSE
 
-// afterLoad is invoked by stateify.
-func (s *State) afterLoad() {
+// afterLoadFPState is invoked by afterLoad.
+func (s *State) afterLoadFPState() {
 	old := s.x86FPState
 
 	// Recreate the slice. This is done to ensure that it is aligned
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index 3db8bd34b..88b40a9d1 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -155,21 +155,6 @@ func NewFloatingPointData() *FloatingPointData {
 	return (*FloatingPointData)(&(newX86FPState()[0]))
 }
 
-// State contains the common architecture bits for X86 (the build tag of this
-// file ensures it's only built on x86).
-//
-// +stateify savable
-type State struct {
-	// The system registers.
-	Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"`
-
-	// Our floating point state.
-	x86FPState `state:"wait"`
-
-	// FeatureSet is a pointer to the currently active feature set.
-	FeatureSet *cpuid.FeatureSet
-}
-
 // Proto returns a protobuf representation of the system registers in State.
 func (s State) Proto() *rpb.Registers {
 	regs := &rpb.AMD64Registers{
diff --git a/pkg/sentry/arch/arch_x86_impl.go b/pkg/sentry/arch/arch_x86_impl.go
new file mode 100644
index 000000000..04ac283c6
--- /dev/null
+++ b/pkg/sentry/arch/arch_x86_impl.go
@@ -0,0 +1,43 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64 i386
+
+package arch
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/cpuid"
+)
+
+// State contains the common architecture bits for X86 (the build tag of this
+// file ensures it's only built on x86).
+//
+// +stateify savable
+type State struct {
+	// The system registers.
+	Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"`
+
+	// Our floating point state.
+	x86FPState `state:"wait"`
+
+	// FeatureSet is a pointer to the currently active feature set.
+	FeatureSet *cpuid.FeatureSet
+}
+
+// afterLoad is invoked by stateify.
+func (s *State) afterLoad() {
+	s.afterLoadFPState()
+}
diff --git a/tools/build/tags.bzl b/tools/build/tags.bzl
index a6db44e47..558fb53ae 100644
--- a/tools/build/tags.bzl
+++ b/tools/build/tags.bzl
@@ -3,22 +3,28 @@
 go_suffixes = [
     "_386",
     "_386_unsafe",
-    "_amd64",
-    "_amd64_unsafe",
     "_aarch64",
     "_aarch64_unsafe",
+    "_amd64",
+    "_amd64_unsafe",
     "_arm",
-    "_arm_unsafe",
     "_arm64",
     "_arm64_unsafe",
+    "_arm_unsafe",
+    "_impl",
+    "_impl_unsafe",
+    "_linux",
+    "_linux_unsafe",
     "_mips",
-    "_mips_unsafe",
-    "_mipsle",
-    "_mipsle_unsafe",
     "_mips64",
     "_mips64_unsafe",
     "_mips64le",
     "_mips64le_unsafe",
+    "_mips_unsafe",
+    "_mipsle",
+    "_mipsle_unsafe",
+    "_opts",
+    "_opts_unsafe",
     "_ppc64",
     "_ppc64_unsafe",
     "_ppc64le",
@@ -31,10 +37,4 @@ go_suffixes = [
     "_sparc64_unsafe",
     "_wasm",
     "_wasm_unsafe",
-    "_linux",
-    "_linux_unsafe",
-    "_opts",
-    "_opts_unsafe",
-    "_impl",
-    "_impl_unsafe",
 ]
-- 
cgit v1.2.3


From bfa0bba72abb69cbc7f4da27d3b4b116c3784495 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Mon, 10 Feb 2020 12:02:47 -0800
Subject: Redirect FIXME to gvisor.dev

PiperOrigin-RevId: 294272755
---
 pkg/sentry/fsimpl/kernfs/filesystem.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 9d65d0179..e49303c26 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -111,10 +111,10 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 		// Dentry isn't cached; it either doesn't exist or failed
 		// revalidation. Attempt to resolve it via Lookup.
 		//
-		// FIXME(b/144498111): Inode.Lookup() should return *(kernfs.)Dentry,
-		// not *vfs.Dentry, since (kernfs.)Filesystem assumes that all dentries
-		// in the filesystem are (kernfs.)Dentry and performs vfs.DentryImpl
-		// casts accordingly.
+		// FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return
+		// *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes
+		// that all dentries in the filesystem are (kernfs.)Dentry and performs
+		// vfs.DentryImpl casts accordingly.
 		var err error
 		childVFSD, err = parent.inode.Lookup(ctx, name)
 		if err != nil {
-- 
cgit v1.2.3


From c9a18b16ade6ec0bc90fc75d0a4ab0621f9d01d6 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 12:06:06 -0800
Subject: Document MinimumTotalMemoryBytes.

PiperOrigin-RevId: 294273559
---
 pkg/sentry/usage/memory.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go
index 538c645eb..4320ad17f 100644
--- a/pkg/sentry/usage/memory.go
+++ b/pkg/sentry/usage/memory.go
@@ -253,6 +253,10 @@ func (m *MemoryLocked) Copy() (MemoryStats, uint64) {
 }
 
 // MinimumTotalMemoryBytes is the minimum reported total system memory.
+//
+// This can be configured through options provided to the Sentry at start.
+// This number is purely synthetic. This is only set before the application
+// starts executing, and must not be modified.
 var MinimumTotalMemoryBytes uint64 = 2 << 30 // 2 GB
 
 // TotalMemory returns the "total usable memory" available.
-- 
cgit v1.2.3


From 4d4d47f0c0a21d3404d2edae527b187d62daa3c8 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 13:04:29 -0800
Subject: Add contextual note.

PiperOrigin-RevId: 294285723
---
 pkg/sentry/fsimpl/gofer/directory.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index baa2cdd8e..6d4ebc2bf 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -87,6 +87,10 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 	// to assume that directory fids have the correct semantics, and translates
 	// struct file_operations::readdir calls directly to readdir RPCs), but is
 	// consistent with VFS1.
+	//
+	// NOTE(b/135560623): In particular, some gofer implementations may not
+	// retain state between calls to Readdir, so may not provide a coherent
+	// directory stream across in the presence of mutation.
 
 	d.fs.renameMu.RLock()
 	defer d.fs.renameMu.RUnlock()
-- 
cgit v1.2.3


From bc504d52026790b7deed6506371ebc6d5d17c948 Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Mon, 10 Feb 2020 13:06:57 -0800
Subject: Fix build_file in runtimes_tests.

PiperOrigin-RevId: 294286242
---
 kokoro/runtime_tests/go1.12.cfg       | 4 ++--
 kokoro/runtime_tests/java11.cfg       | 4 ++--
 kokoro/runtime_tests/nodejs12.4.0.cfg | 4 ++--
 kokoro/runtime_tests/php7.3.6.cfg     | 4 ++--
 kokoro/runtime_tests/python3.7.3.cfg  | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/kokoro/runtime_tests/go1.12.cfg b/kokoro/runtime_tests/go1.12.cfg
index 024740ab2..164ddc18f 100644
--- a/kokoro/runtime_tests/go1.12.cfg
+++ b/kokoro/runtime_tests/go1.12.cfg
@@ -1,6 +1,6 @@
-build_file: "github/kokoro/runtime_tests/runtime_tests.sh"
+build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
 
 env_vars {
   key: "RUNTIME_TEST_NAME"
   value: "go1.12"
-}
\ No newline at end of file
+}
diff --git a/kokoro/runtime_tests/java11.cfg b/kokoro/runtime_tests/java11.cfg
index f01d26153..4957d4794 100644
--- a/kokoro/runtime_tests/java11.cfg
+++ b/kokoro/runtime_tests/java11.cfg
@@ -1,6 +1,6 @@
-build_file: "github/kokoro/runtime_tests/runtime_tests.sh"
+build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
 
 env_vars {
   key: "RUNTIME_TEST_NAME"
   value: "java11"
-}
\ No newline at end of file
+}
diff --git a/kokoro/runtime_tests/nodejs12.4.0.cfg b/kokoro/runtime_tests/nodejs12.4.0.cfg
index d4861fb07..1df343f95 100644
--- a/kokoro/runtime_tests/nodejs12.4.0.cfg
+++ b/kokoro/runtime_tests/nodejs12.4.0.cfg
@@ -1,6 +1,6 @@
-build_file: "github/kokoro/runtime_tests/runtime_tests.sh"
+build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
 
 env_vars {
   key: "RUNTIME_TEST_NAME"
   value: "nodejs12.4.0"
-}
\ No newline at end of file
+}
diff --git a/kokoro/runtime_tests/php7.3.6.cfg b/kokoro/runtime_tests/php7.3.6.cfg
index b737ed9cb..8e3667125 100644
--- a/kokoro/runtime_tests/php7.3.6.cfg
+++ b/kokoro/runtime_tests/php7.3.6.cfg
@@ -1,6 +1,6 @@
-build_file: "github/kokoro/runtime_tests/runtime_tests.sh"
+build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
 
 env_vars {
   key: "RUNTIME_TEST_NAME"
   value: "php7.3.6"
-}
\ No newline at end of file
+}
diff --git a/kokoro/runtime_tests/python3.7.3.cfg b/kokoro/runtime_tests/python3.7.3.cfg
index 971fcba05..0ca70d5bb 100644
--- a/kokoro/runtime_tests/python3.7.3.cfg
+++ b/kokoro/runtime_tests/python3.7.3.cfg
@@ -1,6 +1,6 @@
-build_file: "github/kokoro/runtime_tests/runtime_tests.sh"
+build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
 
 env_vars {
   key: "RUNTIME_TEST_NAME"
   value: "python3.7.3"
-}
\ No newline at end of file
+}
-- 
cgit v1.2.3


From bb22ebd7fbfc66556b38df669be5c6372daba018 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 13:20:44 -0800
Subject: Add contextual comment.

PiperOrigin-RevId: 294289066
---
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index d1436b943..2015a8871 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -15,6 +15,9 @@
 // These benchmarks emulate memfs benchmarks. Ext4 images must be created
 // before this benchmark is run using the `make_deep_ext4.sh` script at
 // /tmp/image-{depth}.ext4 for all the depths tested below.
+//
+// The benchmark itself cannot run the script because the script requires
+// sudo privileges to create the file system images.
 package benchmark_test
 
 import (
-- 
cgit v1.2.3


From a6f9361c2f7c5b46a200de1dc891a0ce059ad90e Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 13:51:26 -0800
Subject: Add context to comments.

PiperOrigin-RevId: 294295852
---
 pkg/sentry/fs/gofer/attr.go | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go
index 71cccdc34..6db4b762d 100644
--- a/pkg/sentry/fs/gofer/attr.go
+++ b/pkg/sentry/fs/gofer/attr.go
@@ -88,8 +88,9 @@ func bsize(pattr p9.Attr) int64 {
 	if pattr.BlockSize > 0 {
 		return int64(pattr.BlockSize)
 	}
-	// Some files may have no clue of their block size. Better not to report
-	// something misleading or buggy and have a safe default.
+	// Some files, particularly those that are not on a local file system,
+	// may have no clue of their block size. Better not to report something
+	// misleading or buggy and have a safe default.
 	return usermem.PageSize
 }
 
@@ -149,6 +150,7 @@ func links(valid p9.AttrMask, pattr p9.Attr) uint64 {
 	}
 
 	// This node is likely backed by a file system that doesn't support links.
+	//
 	// We could readdir() and count children directories to provide an accurate
 	// link count. However this may be expensive since the gofer may be backed by remote
 	// storage. Instead, simply return 2 links for directories and 1 for everything else
-- 
cgit v1.2.3


From 3e8b38d08bbe200a11909bc268e66e4ee1d27d79 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 13:56:17 -0800
Subject: Add flag package to limit visibility.

PiperOrigin-RevId: 294297004
---
 runsc/BUILD                          |  2 ++
 runsc/cmd/BUILD                      |  1 +
 runsc/cmd/boot.go                    |  2 +-
 runsc/cmd/checkpoint.go              |  2 +-
 runsc/cmd/create.go                  |  2 +-
 runsc/cmd/debug.go                   |  2 +-
 runsc/cmd/delete.go                  |  2 +-
 runsc/cmd/do.go                      |  2 +-
 runsc/cmd/events.go                  |  2 +-
 runsc/cmd/exec.go                    |  2 +-
 runsc/cmd/gofer.go                   |  2 +-
 runsc/cmd/help.go                    |  2 +-
 runsc/cmd/install.go                 |  2 +-
 runsc/cmd/kill.go                    |  2 +-
 runsc/cmd/list.go                    |  2 +-
 runsc/cmd/pause.go                   |  2 +-
 runsc/cmd/ps.go                      |  2 +-
 runsc/cmd/restore.go                 |  2 +-
 runsc/cmd/resume.go                  |  2 +-
 runsc/cmd/run.go                     |  2 +-
 runsc/cmd/spec.go                    |  2 +-
 runsc/cmd/start.go                   |  2 +-
 runsc/cmd/state.go                   |  2 +-
 runsc/cmd/syscalls.go                |  2 +-
 runsc/cmd/wait.go                    |  2 +-
 runsc/container/test_app/BUILD       |  1 +
 runsc/container/test_app/fds.go      |  2 +-
 runsc/container/test_app/test_app.go |  2 +-
 runsc/flag/BUILD                     |  9 +++++++++
 runsc/flag/flag.go                   | 33 +++++++++++++++++++++++++++++++++
 runsc/main.go                        |  3 +--
 31 files changed, 72 insertions(+), 27 deletions(-)
 create mode 100644 runsc/flag/BUILD
 create mode 100644 runsc/flag/flag.go

diff --git a/runsc/BUILD b/runsc/BUILD
index 375241921..02a56657a 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -19,6 +19,7 @@ go_binary(
         "//pkg/sentry/platform",
         "//runsc/boot",
         "//runsc/cmd",
+        "//runsc/flag",
         "//runsc/specutils",
         "@com_github_google_subcommands//:go_default_library",
     ],
@@ -54,6 +55,7 @@ go_binary(
         "//pkg/sentry/platform",
         "//runsc/boot",
         "//runsc/cmd",
+        "//runsc/flag",
         "//runsc/specutils",
         "@com_github_google_subcommands//:go_default_library",
     ],
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index 09aa46434..2a88b85a9 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -50,6 +50,7 @@ go_library(
         "//runsc/boot/platforms",
         "//runsc/console",
         "//runsc/container",
+        "//runsc/flag",
         "//runsc/fsgofer",
         "//runsc/fsgofer/filter",
         "//runsc/specutils",
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index b40fded5b..0f3da69a0 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -21,12 +21,12 @@ import (
 	"strings"
 	"syscall"
 
-	"flag"
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
+	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go
index d8b3a8573..8a29e521e 100644
--- a/runsc/cmd/checkpoint.go
+++ b/runsc/cmd/checkpoint.go
@@ -20,11 +20,11 @@ import (
 	"path/filepath"
 	"syscall"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
index 1815c93b9..910e97577 100644
--- a/runsc/cmd/create.go
+++ b/runsc/cmd/create.go
@@ -17,10 +17,10 @@ package cmd
 import (
 	"context"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index f37415810..79965460e 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -22,12 +22,12 @@ import (
 	"syscall"
 	"time"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // Debug implements subcommands.Command for the "debug" command.
diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go
index 30d8164b1..0e4863f50 100644
--- a/runsc/cmd/delete.go
+++ b/runsc/cmd/delete.go
@@ -19,11 +19,11 @@ import (
 	"fmt"
 	"os"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // Delete implements subcommands.Command for the "delete" command.
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
index 9a8a49054..b184bd402 100644
--- a/runsc/cmd/do.go
+++ b/runsc/cmd/do.go
@@ -27,12 +27,12 @@ import (
 	"strings"
 	"syscall"
 
-	"flag"
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
index 3972e9224..51f6a98ed 100644
--- a/runsc/cmd/events.go
+++ b/runsc/cmd/events.go
@@ -20,11 +20,11 @@ import (
 	"os"
 	"time"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // Events implements subcommands.Command for the "events" command.
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index d1e99243b..d9a94903e 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -27,7 +27,6 @@ import (
 	"syscall"
 	"time"
 
-	"flag"
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/log"
@@ -37,6 +36,7 @@ import (
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/console"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 7df7995f0..6e06f3c0f 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -23,7 +23,6 @@ import (
 	"strings"
 	"syscall"
 
-	"flag"
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
@@ -32,6 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/fsgofer"
 	"gvisor.dev/gvisor/runsc/fsgofer/filter"
 	"gvisor.dev/gvisor/runsc/specutils"
diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go
index 930e8454f..c7d210140 100644
--- a/runsc/cmd/help.go
+++ b/runsc/cmd/help.go
@@ -18,8 +18,8 @@ import (
 	"context"
 	"fmt"
 
-	"flag"
 	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // NewHelp returns a help command for the given commander.
diff --git a/runsc/cmd/install.go b/runsc/cmd/install.go
index 441c1db0d..2e223e3be 100644
--- a/runsc/cmd/install.go
+++ b/runsc/cmd/install.go
@@ -23,8 +23,8 @@ import (
 	"os"
 	"path"
 
-	"flag"
 	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // Install implements subcommands.Command.
diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go
index 6c1f197a6..8282ea0e0 100644
--- a/runsc/cmd/kill.go
+++ b/runsc/cmd/kill.go
@@ -21,11 +21,11 @@ import (
 	"strings"
 	"syscall"
 
-	"flag"
 	"github.com/google/subcommands"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // Kill implements subcommands.Command for the "kill" command.
diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go
index dd2d99a6b..d8d906fe3 100644
--- a/runsc/cmd/list.go
+++ b/runsc/cmd/list.go
@@ -22,11 +22,11 @@ import (
 	"text/tabwriter"
 	"time"
 
-	"flag"
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // List implements subcommands.Command for the "list" command for the "list" command.
diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go
index 9c0e92001..6f95a9837 100644
--- a/runsc/cmd/pause.go
+++ b/runsc/cmd/pause.go
@@ -17,10 +17,10 @@ package cmd
 import (
 	"context"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // Pause implements subcommands.Command for the "pause" command.
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
index 45c644f3f..7fb8041af 100644
--- a/runsc/cmd/ps.go
+++ b/runsc/cmd/ps.go
@@ -18,11 +18,11 @@ import (
 	"context"
 	"fmt"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // PS implements subcommands.Command for the "ps" command.
diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go
index 7be60cd7d..72584b326 100644
--- a/runsc/cmd/restore.go
+++ b/runsc/cmd/restore.go
@@ -19,10 +19,10 @@ import (
 	"path/filepath"
 	"syscall"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go
index b2df5c640..61a55a554 100644
--- a/runsc/cmd/resume.go
+++ b/runsc/cmd/resume.go
@@ -17,10 +17,10 @@ package cmd
 import (
 	"context"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // Resume implements subcommands.Command for the "resume" command.
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
index 33f4bc12b..cf41581ad 100644
--- a/runsc/cmd/run.go
+++ b/runsc/cmd/run.go
@@ -18,10 +18,10 @@ import (
 	"context"
 	"syscall"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go
index 344da13ba..8e2b36e85 100644
--- a/runsc/cmd/spec.go
+++ b/runsc/cmd/spec.go
@@ -20,8 +20,8 @@ import (
 	"os"
 	"path/filepath"
 
-	"flag"
 	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 var specTemplate = []byte(`{
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
index 5e9bc53ab..0205fd9f7 100644
--- a/runsc/cmd/start.go
+++ b/runsc/cmd/start.go
@@ -17,10 +17,10 @@ package cmd
 import (
 	"context"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // Start implements subcommands.Command for the "start" command.
diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go
index e9f41cbd8..cf2413deb 100644
--- a/runsc/cmd/state.go
+++ b/runsc/cmd/state.go
@@ -19,11 +19,11 @@ import (
 	"encoding/json"
 	"os"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // State implements subcommands.Command for the "state" command.
diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go
index fb6c1ab29..7072547be 100644
--- a/runsc/cmd/syscalls.go
+++ b/runsc/cmd/syscalls.go
@@ -25,9 +25,9 @@ import (
 	"strconv"
 	"text/tabwriter"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // Syscalls implements subcommands.Command for the "syscalls" command.
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
index 046489687..29c0a15f0 100644
--- a/runsc/cmd/wait.go
+++ b/runsc/cmd/wait.go
@@ -20,10 +20,10 @@ import (
 	"os"
 	"syscall"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
 )
 
 const (
diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD
index e200bafd9..0defbd9fc 100644
--- a/runsc/container/test_app/BUILD
+++ b/runsc/container/test_app/BUILD
@@ -13,6 +13,7 @@ go_binary(
     visibility = ["//runsc/container:__pkg__"],
     deps = [
         "//pkg/unet",
+        "//runsc/flag",
         "//runsc/testutil",
         "@com_github_google_subcommands//:go_default_library",
         "@com_github_kr_pty//:go_default_library",
diff --git a/runsc/container/test_app/fds.go b/runsc/container/test_app/fds.go
index a90cc1662..2a146a2c3 100644
--- a/runsc/container/test_app/fds.go
+++ b/runsc/container/test_app/fds.go
@@ -21,9 +21,9 @@ import (
 	"os"
 	"time"
 
-	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/testutil"
 )
 
diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go
index a1c8a741a..01c47c79f 100644
--- a/runsc/container/test_app/test_app.go
+++ b/runsc/container/test_app/test_app.go
@@ -30,9 +30,9 @@ import (
 	sys "syscall"
 	"time"
 
-	"flag"
 	"github.com/google/subcommands"
 	"github.com/kr/pty"
+	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/testutil"
 )
 
diff --git a/runsc/flag/BUILD b/runsc/flag/BUILD
new file mode 100644
index 000000000..5cb7604a8
--- /dev/null
+++ b/runsc/flag/BUILD
@@ -0,0 +1,9 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "flag",
+    srcs = ["flag.go"],
+    visibility = ["//:sandbox"],
+)
diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go
new file mode 100644
index 000000000..0ca4829d7
--- /dev/null
+++ b/runsc/flag/flag.go
@@ -0,0 +1,33 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package flag
+
+import (
+	"flag"
+)
+
+type FlagSet = flag.FlagSet
+
+var (
+	NewFlagSet  = flag.NewFlagSet
+	String      = flag.String
+	Bool        = flag.Bool
+	Int         = flag.Int
+	Uint        = flag.Uint
+	CommandLine = flag.CommandLine
+	Parse       = flag.Parse
+)
+
+const ContinueOnError = flag.ContinueOnError
diff --git a/runsc/main.go b/runsc/main.go
index c2b0d9a9e..762b0f801 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -28,14 +28,13 @@ import (
 	"syscall"
 	"time"
 
-	"flag"
-
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/cmd"
+	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
-- 
cgit v1.2.3


From 2889ffa84ec4737b651b14c3ce019c5005d0dd9c Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 14:09:14 -0800
Subject: Add context to note.

PiperOrigin-RevId: 294300040
---
 pkg/sentry/arch/arch_amd64.s | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/arch/arch_amd64.s b/pkg/sentry/arch/arch_amd64.s
index bd61402cf..6c10336e7 100644
--- a/pkg/sentry/arch/arch_amd64.s
+++ b/pkg/sentry/arch/arch_amd64.s
@@ -26,10 +26,11 @@
 //
 // func initX86FPState(data *FloatingPointData, useXsave bool)
 //
-// We need to clear out and initialize an empty fp state area since the sentry
-// may have left sensitive information in the floating point registers.
+// We need to clear out and initialize an empty fp state area since the sentry,
+// or any previous loader, may have left sensitive information in the floating
+// point registers.
 //
-// Preconditions: data is zeroed
+// Preconditions: data is zeroed.
 TEXT ·initX86FPState(SB), $24-16
 	// Save MXCSR (callee-save)
 	STMXCSR	mxcsr-8(SP)
-- 
cgit v1.2.3


From afcab8fe9f6fb3504ebdbb95d35299277c2d67ca Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 14:11:04 -0800
Subject: Clean-up comments in runsc/BUILD and CONTRIBUTING.md.

PiperOrigin-RevId: 294300437
---
 CONTRIBUTING.md | 2 +-
 runsc/BUILD     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 55a1ad0d9..71650a4b8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -47,7 +47,7 @@ Definitions for the rules below:
 `core`:
 
 *   `//pkg/sentry/...`
-*   Transitive dependencies in `//pkg/...`, `//third_party/...`.
+*   Transitive dependencies in `//pkg/...`, etc.
 
 `runsc`:
 
diff --git a/runsc/BUILD b/runsc/BUILD
index 02a56657a..757f6d44c 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -26,7 +26,7 @@ go_binary(
 )
 
 # The runsc-race target is a race-compatible BUILD target. This must be built
-# via: bazel build --features=race //runsc:runsc-race
+# via: bazel build --features=race :runsc-race
 #
 # This is neccessary because the race feature must apply to all dependencies
 # due a bug in gazelle file selection.  The pure attribute must be off because
-- 
cgit v1.2.3


From 475316e87dac806d69bcb06ea4065f3c138bb47e Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 10 Feb 2020 14:47:00 -0800
Subject: Refactor getxattr.

Put most of the logic for getxattr in one place for clarity. This simplifies
FGetXattr and getXattrFromPath, which are just wrappers for getXattr.

PiperOrigin-RevId: 294308332
---
 pkg/sentry/syscalls/linux/sys_xattr.go | 44 ++++++++++++++++------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 342337726..9d8140b8a 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -49,14 +49,11 @@ func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	}
 	defer f.DecRef()
 
-	n, value, err := getXattr(t, f.Dirent, nameAddr, size)
+	n, err := getXattr(t, f.Dirent, nameAddr, valueAddr, size)
 	if err != nil {
 		return 0, nil, err
 	}
 
-	if _, err := t.CopyOutBytes(valueAddr, []byte(value)); err != nil {
-		return 0, nil, err
-	}
 	return uintptr(n), nil, nil
 }
 
@@ -71,41 +68,36 @@ func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink
 		return 0, nil, err
 	}
 
-	valueLen := 0
+	n := 0
 	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
 		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
 			return syserror.ENOTDIR
 		}
 
-		n, value, err := getXattr(t, d, nameAddr, size)
-		valueLen = n
-		if err != nil {
-			return err
-		}
-
-		_, err = t.CopyOutBytes(valueAddr, []byte(value))
+		n, err = getXattr(t, d, nameAddr, valueAddr, size)
 		return err
 	})
 	if err != nil {
 		return 0, nil, err
 	}
-	return uintptr(valueLen), nil, nil
+
+	return uintptr(n), nil, nil
 }
 
 // getXattr implements getxattr(2) from the given *fs.Dirent.
-func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr, size uint64) (int, string, error) {
-	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
-		return 0, "", err
-	}
-
+func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64) (int, error) {
 	name, err := copyInXattrName(t, nameAddr)
 	if err != nil {
-		return 0, "", err
+		return 0, err
+	}
+
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
+		return 0, err
 	}
 
 	// TODO(b/148380782): Support xattrs in namespaces other than "user".
 	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
-		return 0, "", syserror.EOPNOTSUPP
+		return 0, syserror.EOPNOTSUPP
 	}
 
 	// If getxattr(2) is called with size 0, the size of the value will be
@@ -118,18 +110,22 @@ func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr, size uint64)
 
 	value, err := d.Inode.GetXattr(t, name, requestedSize)
 	if err != nil {
-		return 0, "", err
+		return 0, err
 	}
 	n := len(value)
 	if uint64(n) > requestedSize {
-		return 0, "", syserror.ERANGE
+		return 0, syserror.ERANGE
 	}
 
 	// Don't copy out the attribute value if size is 0.
 	if size == 0 {
-		return n, "", nil
+		return n, nil
+	}
+
+	if _, err = t.CopyOutBytes(valueAddr, []byte(value)); err != nil {
+		return 0, err
 	}
-	return n, value, nil
+	return n, nil
 }
 
 // SetXattr implements linux syscall setxattr(2).
-- 
cgit v1.2.3


From dc5a8e52d7004e3796feaadb0a0b0960f7289884 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 15:43:36 -0800
Subject: Rename build to builddefs and minor build clean-up.

The name 'bazel' also doesn't work because bazel will treat it specially.

Fixes #1807

PiperOrigin-RevId: 294321221
---
 tools/bazeldefs/BUILD    | 10 ++++++
 tools/bazeldefs/defs.bzl | 94 ++++++++++++++++++++++++++++++++++++++++++++++++
 tools/bazeldefs/tags.bzl | 40 +++++++++++++++++++++
 tools/build/BUILD        | 10 ------
 tools/build/defs.bzl     | 94 ------------------------------------------------
 tools/build/tags.bzl     | 40 ---------------------
 tools/defs.bzl           |  2 +-
 7 files changed, 145 insertions(+), 145 deletions(-)
 create mode 100644 tools/bazeldefs/BUILD
 create mode 100644 tools/bazeldefs/defs.bzl
 create mode 100644 tools/bazeldefs/tags.bzl
 delete mode 100644 tools/build/BUILD
 delete mode 100644 tools/build/defs.bzl
 delete mode 100644 tools/build/tags.bzl

diff --git a/tools/bazeldefs/BUILD b/tools/bazeldefs/BUILD
new file mode 100644
index 000000000..00a467473
--- /dev/null
+++ b/tools/bazeldefs/BUILD
@@ -0,0 +1,10 @@
+package(licenses = ["notice"])
+
+# In bazel, no special support is required for loopback networking. This is
+# just a dummy data target that does not change the test environment.
+genrule(
+    name = "loopback",
+    outs = ["loopback.txt"],
+    cmd = "touch $@",
+    visibility = ["//:sandbox"],
+)
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
new file mode 100644
index 000000000..08c29ff1c
--- /dev/null
+++ b/tools/bazeldefs/defs.bzl
@@ -0,0 +1,94 @@
+"""Bazel implementations of standard rules."""
+
+load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", _cc_flags_supplier = "cc_flags_supplier")
+load("@io_bazel_rules_go//go:def.bzl", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_test = "go_test", _go_tool_library = "go_tool_library")
+load("@io_bazel_rules_go//proto:def.bzl", _go_proto_library = "go_proto_library")
+load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test")
+load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar")
+load("@io_bazel_rules_docker//go:image.bzl", _go_image = "go_image")
+load("@io_bazel_rules_docker//container:container.bzl", _container_image = "container_image")
+load("@pydeps//:requirements.bzl", _py_requirement = "requirement")
+load("//tools/bazeldefs:tags.bzl", _go_suffixes = "go_suffixes")
+
+container_image = _container_image
+cc_binary = _cc_binary
+cc_library = _cc_library
+cc_flags_supplier = _cc_flags_supplier
+cc_proto_library = _cc_proto_library
+cc_test = _cc_test
+cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
+go_image = _go_image
+go_embed_data = _go_embed_data
+go_suffixes = _go_suffixes
+gtest = "@com_google_googletest//:gtest"
+loopback = "//tools/bazeldefs:loopback"
+proto_library = native.proto_library
+pkg_deb = _pkg_deb
+pkg_tar = _pkg_tar
+py_library = native.py_library
+py_binary = native.py_binary
+py_test = native.py_test
+
+def go_binary(name, static = False, pure = False, **kwargs):
+    if static:
+        kwargs["static"] = "on"
+    if pure:
+        kwargs["pure"] = "on"
+    _go_binary(
+        name = name,
+        **kwargs
+    )
+
+def go_library(name, **kwargs):
+    _go_library(
+        name = name,
+        importpath = "gvisor.dev/gvisor/" + native.package_name(),
+        **kwargs
+    )
+
+def go_tool_library(name, **kwargs):
+    _go_tool_library(
+        name = name,
+        importpath = "gvisor.dev/gvisor/" + native.package_name(),
+        **kwargs
+    )
+
+def go_proto_library(name, proto, **kwargs):
+    deps = kwargs.pop("deps", [])
+    _go_proto_library(
+        name = name,
+        importpath = "gvisor.dev/gvisor/" + native.package_name() + "/" + name,
+        proto = proto,
+        deps = [dep.replace("_proto", "_go_proto") for dep in deps],
+        **kwargs
+    )
+
+def go_test(name, **kwargs):
+    library = kwargs.pop("library", None)
+    if library:
+        kwargs["embed"] = [library]
+    _go_test(
+        name = name,
+        **kwargs
+    )
+
+def py_requirement(name, direct = False):
+    return _py_requirement(name)
+
+def select_arch(amd64 = "amd64", arm64 = "arm64", default = None, **kwargs):
+    values = {
+        "@bazel_tools//src/conditions:linux_x86_64": amd64,
+        "@bazel_tools//src/conditions:linux_aarch64": arm64,
+    }
+    if default:
+        values["//conditions:default"] = default
+    return select(values, **kwargs)
+
+def select_system(linux = ["__linux__"], **kwargs):
+    return linux  # Only Linux supported.
+
+def default_installer():
+    return None
+
+def default_net_util():
+    return []  # Nothing needed.
diff --git a/tools/bazeldefs/tags.bzl b/tools/bazeldefs/tags.bzl
new file mode 100644
index 000000000..558fb53ae
--- /dev/null
+++ b/tools/bazeldefs/tags.bzl
@@ -0,0 +1,40 @@
+"""List of special Go suffixes."""
+
+go_suffixes = [
+    "_386",
+    "_386_unsafe",
+    "_aarch64",
+    "_aarch64_unsafe",
+    "_amd64",
+    "_amd64_unsafe",
+    "_arm",
+    "_arm64",
+    "_arm64_unsafe",
+    "_arm_unsafe",
+    "_impl",
+    "_impl_unsafe",
+    "_linux",
+    "_linux_unsafe",
+    "_mips",
+    "_mips64",
+    "_mips64_unsafe",
+    "_mips64le",
+    "_mips64le_unsafe",
+    "_mips_unsafe",
+    "_mipsle",
+    "_mipsle_unsafe",
+    "_opts",
+    "_opts_unsafe",
+    "_ppc64",
+    "_ppc64_unsafe",
+    "_ppc64le",
+    "_ppc64le_unsafe",
+    "_riscv64",
+    "_riscv64_unsafe",
+    "_s390x",
+    "_s390x_unsafe",
+    "_sparc64",
+    "_sparc64_unsafe",
+    "_wasm",
+    "_wasm_unsafe",
+]
diff --git a/tools/build/BUILD b/tools/build/BUILD
deleted file mode 100644
index 00a467473..000000000
--- a/tools/build/BUILD
+++ /dev/null
@@ -1,10 +0,0 @@
-package(licenses = ["notice"])
-
-# In bazel, no special support is required for loopback networking. This is
-# just a dummy data target that does not change the test environment.
-genrule(
-    name = "loopback",
-    outs = ["loopback.txt"],
-    cmd = "touch $@",
-    visibility = ["//:sandbox"],
-)
diff --git a/tools/build/defs.bzl b/tools/build/defs.bzl
deleted file mode 100644
index 1a1a0d825..000000000
--- a/tools/build/defs.bzl
+++ /dev/null
@@ -1,94 +0,0 @@
-"""Bazel implementations of standard rules."""
-
-load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", _cc_flags_supplier = "cc_flags_supplier")
-load("@io_bazel_rules_go//go:def.bzl", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_test = "go_test", _go_tool_library = "go_tool_library")
-load("@io_bazel_rules_go//proto:def.bzl", _go_proto_library = "go_proto_library")
-load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test")
-load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar")
-load("@io_bazel_rules_docker//go:image.bzl", _go_image = "go_image")
-load("@io_bazel_rules_docker//container:container.bzl", _container_image = "container_image")
-load("@pydeps//:requirements.bzl", _py_requirement = "requirement")
-load("//tools/build:tags.bzl", _go_suffixes = "go_suffixes")
-
-container_image = _container_image
-cc_binary = _cc_binary
-cc_library = _cc_library
-cc_flags_supplier = _cc_flags_supplier
-cc_proto_library = _cc_proto_library
-cc_test = _cc_test
-cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
-go_image = _go_image
-go_embed_data = _go_embed_data
-go_suffixes = _go_suffixes
-gtest = "@com_google_googletest//:gtest"
-loopback = "//tools/build:loopback"
-proto_library = native.proto_library
-pkg_deb = _pkg_deb
-pkg_tar = _pkg_tar
-py_library = native.py_library
-py_binary = native.py_binary
-py_test = native.py_test
-
-def go_binary(name, static = False, pure = False, **kwargs):
-    if static:
-        kwargs["static"] = "on"
-    if pure:
-        kwargs["pure"] = "on"
-    _go_binary(
-        name = name,
-        **kwargs
-    )
-
-def go_library(name, **kwargs):
-    _go_library(
-        name = name,
-        importpath = "gvisor.dev/gvisor/" + native.package_name(),
-        **kwargs
-    )
-
-def go_tool_library(name, **kwargs):
-    _go_tool_library(
-        name = name,
-        importpath = "gvisor.dev/gvisor/" + native.package_name(),
-        **kwargs
-    )
-
-def go_proto_library(name, proto, **kwargs):
-    deps = kwargs.pop("deps", [])
-    _go_proto_library(
-        name = name,
-        importpath = "gvisor.dev/gvisor/" + native.package_name() + "/" + name,
-        proto = proto,
-        deps = [dep.replace("_proto", "_go_proto") for dep in deps],
-        **kwargs
-    )
-
-def go_test(name, **kwargs):
-    library = kwargs.pop("library", None)
-    if library:
-        kwargs["embed"] = [library]
-    _go_test(
-        name = name,
-        **kwargs
-    )
-
-def py_requirement(name, direct = False):
-    return _py_requirement(name)
-
-def select_arch(amd64 = "amd64", arm64 = "arm64", default = None, **kwargs):
-    values = {
-        "@bazel_tools//src/conditions:linux_x86_64": amd64,
-        "@bazel_tools//src/conditions:linux_aarch64": arm64,
-    }
-    if default:
-        values["//conditions:default"] = default
-    return select(values, **kwargs)
-
-def select_system(linux = ["__linux__"], **kwargs):
-    return linux  # Only Linux supported.
-
-def default_installer():
-    return None
-
-def default_net_util():
-    return []  # Nothing needed.
diff --git a/tools/build/tags.bzl b/tools/build/tags.bzl
deleted file mode 100644
index 558fb53ae..000000000
--- a/tools/build/tags.bzl
+++ /dev/null
@@ -1,40 +0,0 @@
-"""List of special Go suffixes."""
-
-go_suffixes = [
-    "_386",
-    "_386_unsafe",
-    "_aarch64",
-    "_aarch64_unsafe",
-    "_amd64",
-    "_amd64_unsafe",
-    "_arm",
-    "_arm64",
-    "_arm64_unsafe",
-    "_arm_unsafe",
-    "_impl",
-    "_impl_unsafe",
-    "_linux",
-    "_linux_unsafe",
-    "_mips",
-    "_mips64",
-    "_mips64_unsafe",
-    "_mips64le",
-    "_mips64le_unsafe",
-    "_mips_unsafe",
-    "_mipsle",
-    "_mipsle_unsafe",
-    "_opts",
-    "_opts_unsafe",
-    "_ppc64",
-    "_ppc64_unsafe",
-    "_ppc64le",
-    "_ppc64le_unsafe",
-    "_riscv64",
-    "_riscv64_unsafe",
-    "_s390x",
-    "_s390x_unsafe",
-    "_sparc64",
-    "_sparc64_unsafe",
-    "_wasm",
-    "_wasm_unsafe",
-]
diff --git a/tools/defs.bzl b/tools/defs.bzl
index c03b557ae..d4690cc1a 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,7 +7,7 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/build:defs.bzl", "go_suffixes", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/bazeldefs:defs.bzl", "go_suffixes", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
 
 # Delegate directly.
 cc_binary = _cc_binary
-- 
cgit v1.2.3


From 71af006b6fe4504fccb86f0222a8a1864d33fb7d Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 17:12:03 -0800
Subject: Cleanup internal package group.

PiperOrigin-RevId: 294339229
---
 pkg/sentry/BUILD | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD
index e8b794179..e759dc36f 100644
--- a/pkg/sentry/BUILD
+++ b/pkg/sentry/BUILD
@@ -1,13 +1,11 @@
-# This BUILD file defines a package_group that allows for interdependencies for
-# sentry-internal packages.
-
 package(licenses = ["notice"])
 
+# The "internal" package_group should be used as much as possible by packages
+# that should remain Sentry-internal (i.e. not be exposed directly to command
+# line tooling or APIs).
 package_group(
     name = "internal",
     packages = [
-        "//cloud/gvisor/gopkg/sentry/...",
-        "//cloud/gvisor/sentry/...",
         "//pkg/sentry/...",
         "//runsc/...",
         # Code generated by go_marshal relies on go_marshal libraries.
-- 
cgit v1.2.3


From 762e4761cc4edd92108f6836ad1933c7158b8be8 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 11 Feb 2020 11:08:28 -0800
Subject: Move Align{Up,Down} into binary package.

PiperOrigin-RevId: 294477647
---
 pkg/binary/binary.go                      | 10 ++++++++++
 pkg/sentry/socket/control/control.go      | 26 ++++++++------------------
 pkg/sentry/socket/netfilter/extensions.go |  7 +------
 pkg/sentry/socket/netlink/message.go      | 15 ++++-----------
 pkg/sentry/strace/BUILD                   |  1 -
 pkg/sentry/strace/socket.go               |  7 +++----
 6 files changed, 26 insertions(+), 40 deletions(-)

diff --git a/pkg/binary/binary.go b/pkg/binary/binary.go
index 631785f7b..25065aef9 100644
--- a/pkg/binary/binary.go
+++ b/pkg/binary/binary.go
@@ -254,3 +254,13 @@ func WriteUint64(w io.Writer, order binary.ByteOrder, num uint64) error {
 	_, err := w.Write(buf)
 	return err
 }
+
+// AlignUp rounds a length up to an alignment. align must be a power of 2.
+func AlignUp(length int, align uint) int {
+	return (length + int(align) - 1) & ^(int(align) - 1)
+}
+
+// AlignDown rounds a length down to an alignment. align must be a power of 2.
+func AlignDown(length int, align uint) int {
+	return length & ^(int(align) - 1)
+}
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 00265f15b..6145a7fc3 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -189,7 +189,7 @@ func putUint32(buf []byte, n uint32) []byte {
 // putCmsg writes a control message header and as much data as will fit into
 // the unused capacity of a buffer.
 func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([]byte, int) {
-	space := AlignDown(cap(buf)-len(buf), 4)
+	space := binary.AlignDown(cap(buf)-len(buf), 4)
 
 	// We can't write to space that doesn't exist, so if we are going to align
 	// the available space, we must align down.
@@ -282,19 +282,9 @@ func PackCredentials(t *kernel.Task, creds SCMCredentials, buf []byte, flags int
 	return putCmsg(buf, flags, linux.SCM_CREDENTIALS, align, c)
 }
 
-// AlignUp rounds a length up to an alignment. align must be a power of 2.
-func AlignUp(length int, align uint) int {
-	return (length + int(align) - 1) & ^(int(align) - 1)
-}
-
-// AlignDown rounds a down to an alignment. align must be a power of 2.
-func AlignDown(length int, align uint) int {
-	return length & ^(int(align) - 1)
-}
-
 // alignSlice extends a slice's length (up to the capacity) to align it.
 func alignSlice(buf []byte, align uint) []byte {
-	aligned := AlignUp(len(buf), align)
+	aligned := binary.AlignUp(len(buf), align)
 	if aligned > cap(buf) {
 		// Linux allows unaligned data if there isn't room for alignment.
 		// Since there isn't room for alignment, there isn't room for any
@@ -377,7 +367,7 @@ func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byt
 
 // cmsgSpace is equivalent to CMSG_SPACE in Linux.
 func cmsgSpace(t *kernel.Task, dataLen int) int {
-	return linux.SizeOfControlMessageHeader + AlignUp(dataLen, t.Arch().Width())
+	return linux.SizeOfControlMessageHeader + binary.AlignUp(dataLen, t.Arch().Width())
 }
 
 // CmsgsSpace returns the number of bytes needed to fit the control messages
@@ -437,7 +427,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 		case linux.SOL_SOCKET:
 			switch h.Type {
 			case linux.SCM_RIGHTS:
-				rightsSize := AlignDown(length, linux.SizeOfControlMessageRight)
+				rightsSize := binary.AlignDown(length, linux.SizeOfControlMessageRight)
 				numRights := rightsSize / linux.SizeOfControlMessageRight
 
 				if len(fds)+numRights > linux.SCM_MAX_FD {
@@ -448,7 +438,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 					fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight])))
 				}
 
-				i += AlignUp(length, width)
+				i += binary.AlignUp(length, width)
 
 			case linux.SCM_CREDENTIALS:
 				if length < linux.SizeOfControlMessageCredentials {
@@ -462,7 +452,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 					return socket.ControlMessages{}, err
 				}
 				cmsgs.Unix.Credentials = scmCreds
-				i += AlignUp(length, width)
+				i += binary.AlignUp(length, width)
 
 			default:
 				// Unknown message type.
@@ -476,7 +466,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 				}
 				cmsgs.IP.HasTOS = true
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS)
-				i += AlignUp(length, width)
+				i += binary.AlignUp(length, width)
 
 			default:
 				return socket.ControlMessages{}, syserror.EINVAL
@@ -489,7 +479,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 				}
 				cmsgs.IP.HasTClass = true
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], usermem.ByteOrder, &cmsgs.IP.TClass)
-				i += AlignUp(length, width)
+				i += binary.AlignUp(length, width)
 
 			default:
 				return socket.ControlMessages{}, syserror.EINVAL
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index 22fd0ebe7..b4b244abf 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -72,7 +72,7 @@ func marshalEntryMatch(name string, data []byte) []byte {
 	nflog("marshaling matcher %q", name)
 
 	// We have to pad this struct size to a multiple of 8 bytes.
-	size := alignUp(linux.SizeOfXTEntryMatch+len(data), 8)
+	size := binary.AlignUp(linux.SizeOfXTEntryMatch+len(data), 8)
 	matcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
 			MatchSize: uint16(size),
@@ -93,8 +93,3 @@ func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter,
 	}
 	return matchMaker.unmarshal(buf, filter)
 }
-
-// alignUp rounds a length up to an alignment. align must be a power of 2.
-func alignUp(length int, align uint) int {
-	return (length + int(align) - 1) & ^(int(align) - 1)
-}
diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go
index 4ea252ccb..0899c61d1 100644
--- a/pkg/sentry/socket/netlink/message.go
+++ b/pkg/sentry/socket/netlink/message.go
@@ -23,18 +23,11 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// alignUp rounds a length up to an alignment.
-//
-// Preconditions: align is a power of two.
-func alignUp(length int, align uint) int {
-	return (length + int(align) - 1) &^ (int(align) - 1)
-}
-
 // alignPad returns the length of padding required for alignment.
 //
 // Preconditions: align is a power of two.
 func alignPad(length int, align uint) int {
-	return alignUp(length, align) - length
+	return binary.AlignUp(length, align) - length
 }
 
 // Message contains a complete serialized netlink message.
@@ -138,7 +131,7 @@ func (m *Message) Finalize() []byte {
 	// Align the message. Note that the message length in the header (set
 	// above) is the useful length of the message, not the total aligned
 	// length. See net/netlink/af_netlink.c:__nlmsg_put.
-	aligned := alignUp(len(m.buf), linux.NLMSG_ALIGNTO)
+	aligned := binary.AlignUp(len(m.buf), linux.NLMSG_ALIGNTO)
 	m.putZeros(aligned - len(m.buf))
 	return m.buf
 }
@@ -173,7 +166,7 @@ func (m *Message) PutAttr(atype uint16, v interface{}) {
 	m.Put(v)
 
 	// Align the attribute.
-	aligned := alignUp(l, linux.NLA_ALIGNTO)
+	aligned := binary.AlignUp(l, linux.NLA_ALIGNTO)
 	m.putZeros(aligned - l)
 }
 
@@ -190,7 +183,7 @@ func (m *Message) PutAttrString(atype uint16, s string) {
 	m.putZeros(1)
 
 	// Align the attribute.
-	aligned := alignUp(l, linux.NLA_ALIGNTO)
+	aligned := binary.AlignUp(l, linux.NLA_ALIGNTO)
 	m.putZeros(aligned - l)
 }
 
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 762a946fe..2f39a6f2b 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -30,7 +30,6 @@ go_library(
         "//pkg/seccomp",
         "//pkg/sentry/arch",
         "//pkg/sentry/kernel",
-        "//pkg/sentry/socket/control",
         "//pkg/sentry/socket/netlink",
         "//pkg/sentry/socket/netstack",
         "//pkg/sentry/syscalls/linux",
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index f7ff4573e..51e6d81b2 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -22,7 +22,6 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/socket/control"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
@@ -220,13 +219,13 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
 
 		if skipData {
 			strs = append(strs, fmt.Sprintf("{level=%s, type=%s, length=%d}", level, typ, h.Length))
-			i += control.AlignUp(length, width)
+			i += binary.AlignUp(length, width)
 			continue
 		}
 
 		switch h.Type {
 		case linux.SCM_RIGHTS:
-			rightsSize := control.AlignDown(length, linux.SizeOfControlMessageRight)
+			rightsSize := binary.AlignDown(length, linux.SizeOfControlMessageRight)
 
 			numRights := rightsSize / linux.SizeOfControlMessageRight
 			fds := make(linux.ControlMessageRights, numRights)
@@ -295,7 +294,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
 		default:
 			panic("unreachable")
 		}
-		i += control.AlignUp(length, width)
+		i += binary.AlignUp(length, width)
 	}
 
 	return fmt.Sprintf("%#x %s", addr, strings.Join(strs, ", "))
-- 
cgit v1.2.3


From 115898e368e4afe5418a7290d9545fafc7f6f25e Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 11 Feb 2020 11:37:12 -0800
Subject: Prevent DATA RACE in UnstableAttr.

The slaveInodeOperations is currently copying the object when
truncate is called (which is a no-op). This may result in a
(unconsequential) data race when being modified concurrently.

PiperOrigin-RevId: 294484276
---
 pkg/sentry/fs/tty/slave.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index db55cdc48..6a2dbc576 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -73,7 +73,7 @@ func (si *slaveInodeOperations) Release(ctx context.Context) {
 }
 
 // Truncate implements fs.InodeOperations.Truncate.
-func (slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
 	return nil
 }
 
-- 
cgit v1.2.3


From 9be46e55c2aadcf40c9abd4b515c3fe899d9fa08 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 11 Feb 2020 11:40:51 -0800
Subject: Stateify: register types with full package names

This is to avoid conflicts with types that share the same
[short] package and type names, e.g. proc.smapsData exist
in pkg/sentry/fs/proc and pkg/sentry/fsimpl/proc.

Updates #1663

PiperOrigin-RevId: 294485146
---
 tools/defs.bzl             |  4 +++-
 tools/go_stateify/defs.bzl |  4 ++--
 tools/go_stateify/main.go  | 10 ++++++----
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/tools/defs.bzl b/tools/defs.bzl
index d4690cc1a..46249f9c4 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -110,6 +110,8 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
     """
     all_srcs = srcs
     all_deps = deps
+    dirname, _, _ = native.package_name().rpartition("/")
+    full_pkg = dirname + "/" + name
     if stateify:
         # Only do stateification for non-state packages without manual autogen.
         # First, we need to segregate the input files via the special suffixes,
@@ -120,7 +122,7 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
                 name = name + suffix + "_state_autogen_with_imports",
                 srcs = srcs,
                 imports = imports,
-                package = name,
+                package = full_pkg,
                 out = name + suffix + "_state_autogen_with_imports.go",
             )
             go_imports(
diff --git a/tools/go_stateify/defs.bzl b/tools/go_stateify/defs.bzl
index bdb966362..6a5e666f0 100644
--- a/tools/go_stateify/defs.bzl
+++ b/tools/go_stateify/defs.bzl
@@ -6,7 +6,7 @@ def _go_stateify_impl(ctx):
 
     # Run the stateify command.
     args = ["-output=%s" % output.path]
-    args.append("-pkg=%s" % ctx.attr.package)
+    args.append("-fullpkg=%s" % ctx.attr.package)
     if ctx.attr._statepkg:
         args.append("-statepkg=%s" % ctx.attr._statepkg)
     if ctx.attr.imports:
@@ -43,7 +43,7 @@ for statified types.
             mandatory = False,
         ),
         "package": attr.string(
-            doc = "The package name for the input sources.",
+            doc = "The fully qualified package name for the input sources.",
             mandatory = True,
         ),
         "out": attr.output(
diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go
index aa9d4543e..3437aa476 100644
--- a/tools/go_stateify/main.go
+++ b/tools/go_stateify/main.go
@@ -23,6 +23,7 @@ import (
 	"go/parser"
 	"go/token"
 	"os"
+	"path/filepath"
 	"reflect"
 	"strings"
 	"sync"
@@ -31,7 +32,7 @@ import (
 )
 
 var (
-	pkg      = flag.String("pkg", "", "output package")
+	fullPkg  = flag.String("fullpkg", "", "fully qualified output package")
 	imports  = flag.String("imports", "", "extra imports for the output file")
 	output   = flag.String("output", "", "output file")
 	statePkg = flag.String("statepkg", "", "state import package; defaults to empty")
@@ -170,7 +171,7 @@ func main() {
 		flag.Usage()
 		os.Exit(1)
 	}
-	if *pkg == "" {
+	if *fullPkg == "" {
 		fmt.Fprintf(os.Stderr, "Error: package required.")
 		os.Exit(1)
 	}
@@ -202,7 +203,7 @@ func main() {
 
 	// Declare our emission closures.
 	emitRegister := func(name string) {
-		initCalls = append(initCalls, fmt.Sprintf("%sRegister(\"%s.%s\", (*%s)(nil), state.Fns{Save: (*%s).save, Load: (*%s).load})", statePrefix, *pkg, name, name, name, name))
+		initCalls = append(initCalls, fmt.Sprintf("%sRegister(\"%s.%s\", (*%s)(nil), state.Fns{Save: (*%s).save, Load: (*%s).load})", statePrefix, *fullPkg, name, name, name, name))
 	}
 	emitZeroCheck := func(name string) {
 		fmt.Fprintf(outputFile, "	if !%sIsZeroValue(x.%s) { m.Failf(\"%s is %%v, expected zero\", x.%s) }\n", statePrefix, name, name, name)
@@ -233,7 +234,8 @@ func main() {
 	}
 
 	// Emit the package name.
-	fmt.Fprintf(outputFile, "package %s\n\n", *pkg)
+	_, pkg := filepath.Split(*fullPkg)
+	fmt.Fprintf(outputFile, "package %s\n\n", pkg)
 
 	// Emit the imports lazily.
 	var once sync.Once
-- 
cgit v1.2.3


From b8e22e241cab625d2809034c74d0ff808b948b4c Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 11 Feb 2020 12:58:13 -0800
Subject: Disallow duplicate NIC names.

PiperOrigin-RevId: 294500858
---
 pkg/tcpip/stack/nic.go        |  5 +++
 pkg/tcpip/stack/stack.go      |  9 +++++
 pkg/tcpip/stack/stack_test.go | 85 +++++++++++++++++++++++++++++++++++++++++++
 runsc/sandbox/network.go      | 30 +++++++--------
 4 files changed, 114 insertions(+), 15 deletions(-)

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 78d451cca..ca3a7a07e 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1215,6 +1215,11 @@ func (n *NIC) ID() tcpip.NICID {
 	return n.id
 }
 
+// Name returns the name of n.
+func (n *NIC) Name() string {
+	return n.name
+}
+
 // Stack returns the instance of the Stack that owns this NIC.
 func (n *NIC) Stack() *Stack {
 	return n.stack
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index b793f1d74..6eac16e16 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -890,6 +890,15 @@ func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOp
 		return tcpip.ErrDuplicateNICID
 	}
 
+	// Make sure name is unique, unless unnamed.
+	if opts.Name != "" {
+		for _, n := range s.nics {
+			if n.Name() == opts.Name {
+				return tcpip.ErrDuplicateNICID
+			}
+		}
+	}
+
 	n := newNIC(s, id, opts.Name, ep, opts.Context)
 
 	s.nics[id] = n
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 24133e6f2..7ba604442 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -1792,6 +1792,91 @@ func TestAddProtocolAddressWithOptions(t *testing.T) {
 	verifyAddresses(t, expectedAddresses, gotAddresses)
 }
 
+func TestCreateNICWithOptions(t *testing.T) {
+	type callArgsAndExpect struct {
+		nicID tcpip.NICID
+		opts  stack.NICOptions
+		err   *tcpip.Error
+	}
+
+	tests := []struct {
+		desc  string
+		calls []callArgsAndExpect
+	}{
+		{
+			desc: "DuplicateNICID",
+			calls: []callArgsAndExpect{
+				{
+					nicID: tcpip.NICID(1),
+					opts:  stack.NICOptions{Name: "eth1"},
+					err:   nil,
+				},
+				{
+					nicID: tcpip.NICID(1),
+					opts:  stack.NICOptions{Name: "eth2"},
+					err:   tcpip.ErrDuplicateNICID,
+				},
+			},
+		},
+		{
+			desc: "DuplicateName",
+			calls: []callArgsAndExpect{
+				{
+					nicID: tcpip.NICID(1),
+					opts:  stack.NICOptions{Name: "lo"},
+					err:   nil,
+				},
+				{
+					nicID: tcpip.NICID(2),
+					opts:  stack.NICOptions{Name: "lo"},
+					err:   tcpip.ErrDuplicateNICID,
+				},
+			},
+		},
+		{
+			desc: "Unnamed",
+			calls: []callArgsAndExpect{
+				{
+					nicID: tcpip.NICID(1),
+					opts:  stack.NICOptions{},
+					err:   nil,
+				},
+				{
+					nicID: tcpip.NICID(2),
+					opts:  stack.NICOptions{},
+					err:   nil,
+				},
+			},
+		},
+		{
+			desc: "UnnamedDuplicateNICID",
+			calls: []callArgsAndExpect{
+				{
+					nicID: tcpip.NICID(1),
+					opts:  stack.NICOptions{},
+					err:   nil,
+				},
+				{
+					nicID: tcpip.NICID(1),
+					opts:  stack.NICOptions{},
+					err:   tcpip.ErrDuplicateNICID,
+				},
+			},
+		},
+	}
+	for _, test := range tests {
+		t.Run(test.desc, func(t *testing.T) {
+			s := stack.New(stack.Options{})
+			ep := channel.New(0, 0, tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"))
+			for _, call := range test.calls {
+				if got, want := s.CreateNICWithOptions(call.nicID, ep, call.opts), call.err; got != want {
+					t.Fatalf("CreateNICWithOptions(%v, _, %+v) = %v, want %v", call.nicID, call.opts, got, want)
+				}
+			}
+		})
+	}
+}
+
 func TestNICStats(t *testing.T) {
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index ff48f5646..99e143696 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -174,13 +174,13 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 			return fmt.Errorf("fetching interface addresses for %q: %v", iface.Name, err)
 		}
 
-		// We build our own loopback devices.
+		// We build our own loopback device.
 		if iface.Flags&net.FlagLoopback != 0 {
-			links, err := loopbackLinks(iface, allAddrs)
+			link, err := loopbackLink(iface, allAddrs)
 			if err != nil {
-				return fmt.Errorf("getting loopback routes and links for iface %q: %v", iface.Name, err)
+				return fmt.Errorf("getting loopback link for iface %q: %v", iface.Name, err)
 			}
-			args.LoopbackLinks = append(args.LoopbackLinks, links...)
+			args.LoopbackLinks = append(args.LoopbackLinks, link)
 			continue
 		}
 
@@ -339,25 +339,25 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (
 	return &socketEntry{deviceFile, gsoMaxSize}, nil
 }
 
-// loopbackLinks collects the links for a loopback interface.
-func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) {
-	var links []boot.LoopbackLink
+// loopbackLink returns the link with addresses and routes for a loopback
+// interface.
+func loopbackLink(iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, error) {
+	link := boot.LoopbackLink{
+		Name: iface.Name,
+	}
 	for _, addr := range addrs {
 		ipNet, ok := addr.(*net.IPNet)
 		if !ok {
-			return nil, fmt.Errorf("address is not IPNet: %+v", addr)
+			return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr)
 		}
 		dst := *ipNet
 		dst.IP = dst.IP.Mask(dst.Mask)
-		links = append(links, boot.LoopbackLink{
-			Name:      iface.Name,
-			Addresses: []net.IP{ipNet.IP},
-			Routes: []boot.Route{{
-				Destination: dst,
-			}},
+		link.Addresses = append(link.Addresses, ipNet.IP)
+		link.Routes = append(link.Routes, boot.Route{
+			Destination: dst,
 		})
 	}
-	return links, nil
+	return link, nil
 }
 
 // routesForIface iterates over all routes for the given interface and converts
-- 
cgit v1.2.3


From e07eacc99f00c7318df789f24e7559a7fb941b8e Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 11 Feb 2020 13:37:40 -0800
Subject: Fix up test/runtimes/README.md.

In particular, explain how to push updates to the images.

PiperOrigin-RevId: 294508879
---
 test/runtimes/README.md | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/test/runtimes/README.md b/test/runtimes/README.md
index e41e78f77..42d722553 100644
--- a/test/runtimes/README.md
+++ b/test/runtimes/README.md
@@ -12,24 +12,39 @@ The following runtimes are currently supported:
 -   PHP 7.3
 -   Python 3.7
 
-#### Prerequisites:
+### Building and pushing the images:
 
-1) [Install and configure Docker](https://docs.docker.com/install/)
-
-2) Build each Docker container from the runtimes/images directory:
+The canonical source of images is the
+[gvisor-presubmit container registry](https://gcr.io/gvisor-presubmit/). You can
+build new images with the following command:
 
 ```bash
 $ cd images
 $ docker build -f Dockerfile_$LANG [-t $NAME] .
 ```
 
-### Testing:
+To push them to our container registry, set the tag in the command above to
+`gcr.io/gvisor-presubmit/$LANG`, then push them. (Note that you will need
+appropriate permissions to the `gvisor-presubmit` GCP project.)
+
+```bash
+gcloud docker -- push gcr.io/gvisor-presubmit/$LANG
+```
+
+#### Running in Docker locally:
+
+1) [Install and configure Docker](https://docs.docker.com/install/)
+
+2) Pull the image you want to run:
+
+```bash
+$ docker pull gcr.io/gvisor-presubmit/$LANG
+```
 
-If the prerequisites have been fulfilled, you can run the tests with the
-following command:
+3) Run docker with the image.
 
 ```bash
-$ docker run --rm -it $NAME [FLAG]
+$ docker run [--runtime=runsc] --rm -it $NAME [FLAG]
 ```
 
 Running the command with no flags will cause all the available tests to execute.
-- 
cgit v1.2.3


From 6dced977eab69401a114257e386addb9cb03a39d Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 11 Feb 2020 17:38:04 -0800
Subject: Ensure fsimpl/gofer.dentryPlatformFile.hostFileMapper is initialized.

Fixes #1812. (The more direct cause of the deadlock is panic unsafety because
the historically high cost of defer means that we avoid it in hot paths,
including much of MM; defer is much cheaper as of Go 1.14, but still a
measurable overhead.)

PiperOrigin-RevId: 294560316
---
 pkg/sentry/fs/fsutil/host_file_mapper.go | 17 +++++++++++------
 pkg/sentry/fsimpl/gofer/regular_file.go  |  5 +++++
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go
index 67278aa86..e82afd112 100644
--- a/pkg/sentry/fs/fsutil/host_file_mapper.go
+++ b/pkg/sentry/fs/fsutil/host_file_mapper.go
@@ -65,13 +65,18 @@ type mapping struct {
 	writable bool
 }
 
-// NewHostFileMapper returns a HostFileMapper with no references or cached
-// mappings.
+// Init must be called on zero-value HostFileMappers before first use.
+func (f *HostFileMapper) Init() {
+	f.refs = make(map[uint64]int32)
+	f.mappings = make(map[uint64]mapping)
+}
+
+// NewHostFileMapper returns an initialized HostFileMapper allocated on the
+// heap with no references or cached mappings.
 func NewHostFileMapper() *HostFileMapper {
-	return &HostFileMapper{
-		refs:     make(map[uint64]int32),
-		mappings: make(map[uint64]mapping),
-	}
+	f := &HostFileMapper{}
+	f.Init()
+	return f
 }
 
 // IncRefOn increments the reference count on all offsets in mr.
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 8e11e06b3..54c1031a7 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -571,6 +571,8 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt
 	default:
 		panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop))
 	}
+	// After this point, d may be used as a memmap.Mappable.
+	d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init)
 	return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts)
 }
 
@@ -799,6 +801,9 @@ type dentryPlatformFile struct {
 	// If this dentry represents a regular file, and handle.fd >= 0,
 	// hostFileMapper caches mappings of handle.fd.
 	hostFileMapper fsutil.HostFileMapper
+
+	// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
+	hostFileMapperInitOnce sync.Once
 }
 
 // IncRef implements platform.File.IncRef.
-- 
cgit v1.2.3


From 5205bc7e583f90e5a5855a69de26a0baa888cbdf Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 11 Feb 2020 20:35:55 -0800
Subject: Simplify atomic operations

PiperOrigin-RevId: 294582802
---
 pkg/sleep/commit_noasm.go | 13 ++-----------
 pkg/sleep/sleep_unsafe.go | 23 ++++++++++-------------
 2 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/pkg/sleep/commit_noasm.go b/pkg/sleep/commit_noasm.go
index 3af447fb9..f59061f37 100644
--- a/pkg/sleep/commit_noasm.go
+++ b/pkg/sleep/commit_noasm.go
@@ -28,15 +28,6 @@ import "sync/atomic"
 // It is written in assembly because it is called from g0, so it doesn't have
 // a race context.
 func commitSleep(g uintptr, waitingG *uintptr) bool {
-	for {
-		// Check if the wait was aborted.
-		if atomic.LoadUintptr(waitingG) == 0 {
-			return false
-		}
-
-		// Try to store the G so that wakers know who to wake.
-		if atomic.CompareAndSwapUintptr(waitingG, preparingG, g) {
-			return true
-		}
-	}
+	// Try to store the G so that wakers know who to wake.
+	return atomic.CompareAndSwapUintptr(waitingG, preparingG, g)
 }
diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go
index acbf0229b..65bfcf778 100644
--- a/pkg/sleep/sleep_unsafe.go
+++ b/pkg/sleep/sleep_unsafe.go
@@ -299,20 +299,17 @@ func (s *Sleeper) enqueueAssertedWaker(w *Waker) {
 		}
 	}
 
-	for {
-		// Nothing to do if there isn't a G waiting.
-		g := atomic.LoadUintptr(&s.waitingG)
-		if g == 0 {
-			return
-		}
+	// Nothing to do if there isn't a G waiting.
+	if atomic.LoadUintptr(&s.waitingG) == 0 {
+		return
+	}
 
-		// Signal to the sleeper that a waker has been asserted.
-		if atomic.CompareAndSwapUintptr(&s.waitingG, g, 0) {
-			if g != preparingG {
-				// We managed to get a G. Wake it up.
-				goready(g, 0)
-			}
-		}
+	// Signal to the sleeper that a waker has been asserted.
+	switch g := atomic.SwapUintptr(&s.waitingG, 0); g {
+	case 0, preparingG:
+	default:
+		// We managed to get a G. Wake it up.
+		goready(g, 0)
 	}
 }
 
-- 
cgit v1.2.3


From 46a36b64d5164d1ac887aa528d23bb2f2c74489e Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 12 Feb 2020 06:35:20 -0800
Subject: Include more test files in exports_files

So that they can be included by Fuchsia's syscall tests

PiperOrigin-RevId: 294654890
---
 test/syscalls/linux/BUILD | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index ca1af209a..e7c82adfc 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -10,13 +10,16 @@ exports_files(
         "socket.cc",
         "socket_inet_loopback.cc",
         "socket_ip_loopback_blocking.cc",
+        "socket_ip_tcp_generic_loopback.cc",
         "socket_ip_tcp_loopback.cc",
+        "socket_ip_tcp_udp_generic.cc",
         "socket_ip_udp_loopback.cc",
         "socket_ip_unbound.cc",
         "socket_ipv4_tcp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
         "tcp_socket.cc",
+        "udp_bind.cc",
         "udp_socket.cc",
     ],
     visibility = ["//:sandbox"],
-- 
cgit v1.2.3


From 6fdf2c53a1d084b70602170b660242036fd8fe4f Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 7 Feb 2020 11:21:07 -0800
Subject: iptables: User chains

- Adds creation of user chains via `-N <chainname>`
- Adds `-j RETURN` support for built-in chains, which triggers the
  chain's underflow rule (usually the default policy).
- Adds tests for chain creation, default policies, and `-j RETURN' from
  built-in chains.
---
 pkg/sentry/socket/netfilter/netfilter.go | 115 +++++++++++++++++++-----------
 pkg/tcpip/iptables/iptables.go           |  74 ++++++++++++-------
 pkg/tcpip/iptables/targets.go            |  41 ++++++++---
 pkg/tcpip/iptables/types.go              |  50 +++++--------
 test/iptables/filter_input.go            | 117 ++++++++++++++++++++++++++++++-
 test/iptables/iptables_test.go           |  24 +++++++
 6 files changed, 310 insertions(+), 111 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index ea02627de..3fc80e0de 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -50,7 +50,9 @@ type metadata struct {
 
 // nflog logs messages related to the writing and reading of iptables.
 func nflog(format string, args ...interface{}) {
-	log.Infof("netfilter: "+format, args...)
+	if log.IsLogging(log.Debug) {
+		log.Debugf("netfilter: "+format, args...)
+	}
 }
 
 // GetInfo returns information about iptables.
@@ -227,19 +229,23 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 }
 
 func marshalTarget(target iptables.Target) []byte {
-	switch target.(type) {
-	case iptables.UnconditionalAcceptTarget:
-		return marshalStandardTarget(iptables.Accept)
-	case iptables.UnconditionalDropTarget:
-		return marshalStandardTarget(iptables.Drop)
+	switch tg := target.(type) {
+	case iptables.AcceptTarget:
+		return marshalStandardTarget(iptables.RuleAccept)
+	case iptables.DropTarget:
+		return marshalStandardTarget(iptables.RuleDrop)
 	case iptables.ErrorTarget:
-		return marshalErrorTarget()
+		return marshalErrorTarget(errorTargetName)
+	case iptables.UserChainTarget:
+		return marshalErrorTarget(tg.Name)
+	case iptables.ReturnTarget:
+		return marshalStandardTarget(iptables.RuleReturn)
 	default:
 		panic(fmt.Errorf("unknown target of type %T", target))
 	}
 }
 
-func marshalStandardTarget(verdict iptables.Verdict) []byte {
+func marshalStandardTarget(verdict iptables.RuleVerdict) []byte {
 	nflog("convert to binary: marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
 
 	// The target's name will be the empty string.
@@ -254,14 +260,14 @@ func marshalStandardTarget(verdict iptables.Verdict) []byte {
 	return binary.Marshal(ret, usermem.ByteOrder, target)
 }
 
-func marshalErrorTarget() []byte {
+func marshalErrorTarget(errorName string) []byte {
 	// This is an error target named error
 	target := linux.XTErrorTarget{
 		Target: linux.XTEntryTarget{
 			TargetSize: linux.SizeOfXTErrorTarget,
 		},
 	}
-	copy(target.Name[:], errorTargetName)
+	copy(target.Name[:], errorName)
 	copy(target.Target.Name[:], errorTargetName)
 
 	ret := make([]byte, 0, linux.SizeOfXTErrorTarget)
@@ -270,38 +276,35 @@ func marshalErrorTarget() []byte {
 
 // translateFromStandardVerdict translates verdicts the same way as the iptables
 // tool.
-func translateFromStandardVerdict(verdict iptables.Verdict) int32 {
+func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 {
 	switch verdict {
-	case iptables.Accept:
+	case iptables.RuleAccept:
 		return -linux.NF_ACCEPT - 1
-	case iptables.Drop:
+	case iptables.RuleDrop:
 		return -linux.NF_DROP - 1
-	case iptables.Queue:
-		return -linux.NF_QUEUE - 1
-	case iptables.Return:
+	case iptables.RuleReturn:
 		return linux.NF_RETURN
-	case iptables.Jump:
+	default:
 		// TODO(gvisor.dev/issue/170): Support Jump.
-		panic("Jump isn't supported yet")
+		panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
 	}
-	panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
 }
 
-// translateToStandardVerdict translates from the value in a
+// translateToStandardTarget translates from the value in a
 // linux.XTStandardTarget to an iptables.Verdict.
-func translateToStandardVerdict(val int32) (iptables.Verdict, error) {
+func translateToStandardTarget(val int32) (iptables.Target, error) {
 	// TODO(gvisor.dev/issue/170): Support other verdicts.
 	switch val {
 	case -linux.NF_ACCEPT - 1:
-		return iptables.Accept, nil
+		return iptables.AcceptTarget{}, nil
 	case -linux.NF_DROP - 1:
-		return iptables.Drop, nil
+		return iptables.DropTarget{}, nil
 	case -linux.NF_QUEUE - 1:
-		return iptables.Invalid, errors.New("unsupported iptables verdict QUEUE")
+		return nil, errors.New("unsupported iptables verdict QUEUE")
 	case linux.NF_RETURN:
-		return iptables.Invalid, errors.New("unsupported iptables verdict RETURN")
+		return iptables.ReturnTarget{}, nil
 	default:
-		return iptables.Invalid, fmt.Errorf("unknown iptables verdict %d", val)
+		return nil, fmt.Errorf("unknown iptables verdict %d", val)
 	}
 }
 
@@ -411,6 +414,10 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 					table.BuiltinChains[hk] = ruleIdx
 				}
 				if offset == replace.Underflow[hook] {
+					if !validUnderflow(table.Rules[ruleIdx]) {
+						nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP.")
+						return syserr.ErrInvalidArgument
+					}
 					table.Underflows[hk] = ruleIdx
 				}
 			}
@@ -425,12 +432,34 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 	}
 
+	// Add the user chains.
+	for ruleIdx, rule := range table.Rules {
+		target, ok := rule.Target.(iptables.UserChainTarget)
+		if !ok {
+			continue
+		}
+
+		// We found a user chain. Before inserting it into the table,
+		// check that:
+		// - There's some other rule after it.
+		// - There are no matchers.
+		if ruleIdx == len(table.Rules)-1 {
+			nflog("user chain must have a rule or default policy.")
+			return syserr.ErrInvalidArgument
+		}
+		if len(table.Rules[ruleIdx].Matchers) != 0 {
+			nflog("user chain's first node must have no matcheres.")
+			return syserr.ErrInvalidArgument
+		}
+		table.UserChains[target.Name] = ruleIdx + 1
+	}
+
 	// TODO(gvisor.dev/issue/170): Support other chains.
 	// Since we only support modifying the INPUT chain right now, make sure
 	// all other chains point to ACCEPT rules.
 	for hook, ruleIdx := range table.BuiltinChains {
 		if hook != iptables.Input {
-			if _, ok := table.Rules[ruleIdx].Target.(iptables.UnconditionalAcceptTarget); !ok {
+			if _, ok := table.Rules[ruleIdx].Target.(iptables.AcceptTarget); !ok {
 				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
 			}
@@ -519,18 +548,7 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 		buf = optVal[:linux.SizeOfXTStandardTarget]
 		binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
 
-		verdict, err := translateToStandardVerdict(standardTarget.Verdict)
-		if err != nil {
-			return nil, err
-		}
-		switch verdict {
-		case iptables.Accept:
-			return iptables.UnconditionalAcceptTarget{}, nil
-		case iptables.Drop:
-			return iptables.UnconditionalDropTarget{}, nil
-		default:
-			return nil, fmt.Errorf("Unknown verdict: %v", verdict)
-		}
+		return translateToStandardTarget(standardTarget.Verdict)
 
 	case errorTargetName:
 		// Error target.
@@ -548,11 +566,14 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 		//   somehow fall through every rule.
 		// * To mark the start of a user defined chain. These
 		//   rules have an error with the name of the chain.
-		switch errorTarget.Name.String() {
+		switch name := errorTarget.Name.String(); name {
 		case errorTargetName:
+			nflog("set entries: error target")
 			return iptables.ErrorTarget{}, nil
 		default:
-			return nil, fmt.Errorf("unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
+			// User defined chain.
+			nflog("set entries: user-defined target %q", name)
+			return iptables.UserChainTarget{Name: name}, nil
 		}
 	}
 
@@ -585,6 +606,18 @@ func containsUnsupportedFields(iptip linux.IPTIP) bool {
 		iptip.InverseFlags != 0
 }
 
+func validUnderflow(rule iptables.Rule) bool {
+	if len(rule.Matchers) != 0 {
+		return false
+	}
+	switch rule.Target.(type) {
+	case iptables.AcceptTarget, iptables.DropTarget:
+		return true
+	default:
+		return false
+	}
+}
+
 func hookFromLinux(hook int) iptables.Hook {
 	switch hook {
 	case linux.NF_INET_PRE_ROUTING:
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 1b9485bbd..75a433a3b 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -52,10 +52,10 @@ func DefaultTables() IPTables {
 		Tables: map[string]Table{
 			TablenameNat: Table{
 				Rules: []Rule{
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
 					Rule{Target: ErrorTarget{}},
 				},
 				BuiltinChains: map[Hook]int{
@@ -74,8 +74,8 @@ func DefaultTables() IPTables {
 			},
 			TablenameMangle: Table{
 				Rules: []Rule{
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
 					Rule{Target: ErrorTarget{}},
 				},
 				BuiltinChains: map[Hook]int{
@@ -90,9 +90,9 @@ func DefaultTables() IPTables {
 			},
 			TablenameFilter: Table{
 				Rules: []Rule{
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
 					Rule{Target: ErrorTarget{}},
 				},
 				BuiltinChains: map[Hook]int{
@@ -149,13 +149,11 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 	for _, tablename := range it.Priorities[hook] {
 		switch verdict := it.checkTable(hook, pkt, tablename); verdict {
 		// If the table returns Accept, move on to the next table.
-		case Accept:
+		case TableAccept:
 			continue
 		// The Drop verdict is final.
-		case Drop:
+		case TableDrop:
 			return false
-		case Stolen, Queue, Repeat, None, Jump, Return, Continue:
-			panic(fmt.Sprintf("Unimplemented verdict %v.", verdict))
 		default:
 			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
 		}
@@ -166,36 +164,58 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 }
 
 // Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) Verdict {
+func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) TableVerdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
 	table := it.Tables[tablename]
 	for ruleIdx := table.BuiltinChains[hook]; ruleIdx < len(table.Rules); ruleIdx++ {
 		switch verdict := it.checkRule(hook, pkt, table, ruleIdx); verdict {
-		// In either of these cases, this table is done with the packet.
-		case Accept, Drop:
-			return verdict
-		// Continue traversing the rules of the table.
-		case Continue:
+		case RuleAccept:
+			return TableAccept
+
+		case RuleDrop:
+			return TableDrop
+
+		case RuleContinue:
 			continue
-		case Stolen, Queue, Repeat, None, Jump, Return:
-			panic(fmt.Sprintf("Unimplemented verdict %v.", verdict))
+
+		case RuleReturn:
+			// TODO(gvisor.dev/issue/170): We don't implement jump
+			// yet, so any Return is from a built-in chain. That
+			// means we have to to call the underflow.
+			underflow := table.Rules[table.Underflows[hook]]
+			// Underflow is guaranteed to be an unconditional
+			// ACCEPT or DROP.
+			switch v, _ := underflow.Target.Action(pkt); v {
+			case RuleAccept:
+				return TableAccept
+			case RuleDrop:
+				return TableDrop
+			case RuleContinue, RuleReturn:
+				panic("Underflows should only return RuleAccept or RuleDrop.")
+			default:
+				panic(fmt.Sprintf("Unknown verdict: %d", v))
+			}
+
 		default:
-			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
+			panic(fmt.Sprintf("Unknown verdict: %d", verdict))
 		}
+
 	}
 
-	panic(fmt.Sprintf("Traversed past the entire list of iptables rules in table %q.", tablename))
+	// We got through the entire table without a decision. Default to DROP
+	// for safety.
+	return TableDrop
 }
 
 // Precondition: pk.NetworkHeader is set.
-func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) Verdict {
+func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) RuleVerdict {
 	rule := table.Rules[ruleIdx]
 
 	// First check whether the packet matches the IP header filter.
 	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
 	if rule.Filter.Protocol != 0 && rule.Filter.Protocol != header.IPv4(pkt.NetworkHeader).TransportProtocol() {
-		return Continue
+		return RuleContinue
 	}
 
 	// Go through each rule matcher. If they all match, run
@@ -203,10 +223,10 @@ func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ru
 	for _, matcher := range rule.Matchers {
 		matches, hotdrop := matcher.Match(hook, pkt, "")
 		if hotdrop {
-			return Drop
+			return RuleDrop
 		}
 		if !matches {
-			return Continue
+			return RuleContinue
 		}
 	}
 
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 4dd281371..9fc60cfad 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -21,20 +21,20 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
-// UnconditionalAcceptTarget accepts all packets.
-type UnconditionalAcceptTarget struct{}
+// AcceptTarget accepts packets.
+type AcceptTarget struct{}
 
 // Action implements Target.Action.
-func (UnconditionalAcceptTarget) Action(packet tcpip.PacketBuffer) (Verdict, string) {
-	return Accept, ""
+func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+	return RuleAccept, ""
 }
 
-// UnconditionalDropTarget denies all packets.
-type UnconditionalDropTarget struct{}
+// DropTarget drops packets.
+type DropTarget struct{}
 
 // Action implements Target.Action.
-func (UnconditionalDropTarget) Action(packet tcpip.PacketBuffer) (Verdict, string) {
-	return Drop, ""
+func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+	return RuleDrop, ""
 }
 
 // ErrorTarget logs an error and drops the packet. It represents a target that
@@ -42,7 +42,26 @@ func (UnconditionalDropTarget) Action(packet tcpip.PacketBuffer) (Verdict, strin
 type ErrorTarget struct{}
 
 // Action implements Target.Action.
-func (ErrorTarget) Action(packet tcpip.PacketBuffer) (Verdict, string) {
-	log.Warningf("ErrorTarget triggered.")
-	return Drop, ""
+func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+	log.Debugf("ErrorTarget triggered.")
+	return RuleDrop, ""
+}
+
+// UserChainTarget marks a rule as the beginning of a user chain.
+type UserChainTarget struct {
+	Name string
+}
+
+// Action implements Target.Action.
+func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
+	panic("UserChainTarget should never be called.")
+}
+
+// ReturnTarget returns from the current chain. If the chain is a built-in, the
+// hook's underflow should be called.
+type ReturnTarget struct{}
+
+// Action implements Target.Action.
+func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
+	return RuleReturn, ""
 }
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 7d593c35c..5735d001b 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -56,44 +56,32 @@ const (
 	NumHooks
 )
 
-// A Verdict is returned by a rule's target to indicate how traversal of rules
-// should (or should not) continue.
-type Verdict int
+// A TableVerdict is what a table decides should be done with a packet.
+type TableVerdict int
 
 const (
-	// Invalid indicates an unkonwn or erroneous verdict.
-	Invalid Verdict = iota
+	// TableAccept indicates the packet should continue through netstack.
+	TableAccept TableVerdict = iota
 
-	// Accept indicates the packet should continue traversing netstack as
-	// normal.
-	Accept
-
-	// Drop inicates the packet should be dropped, stopping traversing
-	// netstack.
-	Drop
-
-	// Stolen indicates the packet was co-opted by the target and should
-	// stop traversing netstack.
-	Stolen
-
-	// Queue indicates the packet should be queued for userspace processing.
-	Queue
+	// TableAccept indicates the packet should be dropped.
+	TableDrop
+)
 
-	// Repeat indicates the packet should re-traverse the chains for the
-	// current hook.
-	Repeat
+// A RuleVerdict is what a rule decides should be done with a packet.
+type RuleVerdict int
 
-	// None indicates no verdict was reached.
-	None
+const (
+	// RuleAccept indicates the packet should continue through netstack.
+	RuleAccept RuleVerdict = iota
 
-	// Jump indicates a jump to another chain.
-	Jump
+	// RuleContinue indicates the packet should continue to the next rule.
+	RuleContinue
 
-	// Continue indicates that traversal should continue at the next rule.
-	Continue
+	// RuleDrop indicates the packet should be dropped.
+	RuleDrop
 
-	// Return indicates that traversal should return to the calling chain.
-	Return
+	// RuleReturn indicates the packet should return to the previous chain.
+	RuleReturn
 )
 
 // IPTables holds all the tables for a netstack.
@@ -187,5 +175,5 @@ type Target interface {
 	// Action takes an action on the packet and returns a verdict on how
 	// traversal should (or should not) continue. If the return value is
 	// Jump, it also returns the name of the chain to jump to.
-	Action(packet tcpip.PacketBuffer) (Verdict, string)
+	Action(packet tcpip.PacketBuffer) (RuleVerdict, string)
 }
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index bd6059921..e26d6a7d2 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -36,6 +36,10 @@ func init() {
 	RegisterTestCase(FilterInputDropTCPSrcPort{})
 	RegisterTestCase(FilterInputDropUDPPort{})
 	RegisterTestCase(FilterInputDropUDP{})
+	RegisterTestCase(FilterInputCreateUserChain{})
+	RegisterTestCase(FilterInputDefaultPolicyAccept{})
+	RegisterTestCase(FilterInputDefaultPolicyDrop{})
+	RegisterTestCase(FilterInputReturnUnderflow{})
 }
 
 // FilterInputDropUDP tests that we can drop UDP traffic.
@@ -295,8 +299,119 @@ func (FilterInputRequireProtocolUDP) ContainerAction(ip net.IP) error {
 	return nil
 }
 
-// LocalAction implements TestCase.LocalAction.
 func (FilterInputRequireProtocolUDP) LocalAction(ip net.IP) error {
 	// No-op.
 	return nil
 }
+
+// FilterInputCreateUserChain tests chain creation.
+type FilterInputCreateUserChain struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputCreateUserChain) Name() string {
+	return "FilterInputCreateUserChain"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputCreateUserChain) ContainerAction(ip net.IP) error {
+	// Create a chain.
+	const chainName = "foochain"
+	if err := filterTable("-N", chainName); err != nil {
+		return err
+	}
+
+	// Add a simple rule to the chain.
+	return filterTable("-A", chainName, "-j", "DROP")
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputCreateUserChain) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// FilterInputDefaultPolicyAccept tests the default ACCEPT policy.
+type FilterInputDefaultPolicyAccept struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDefaultPolicyAccept) Name() string {
+	return "FilterInputDefaultPolicyAccept"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDefaultPolicyAccept) ContainerAction(ip net.IP) error {
+	// Set the default policy to accept, then receive a packet.
+	if err := filterTable("-P", "INPUT", "ACCEPT"); err != nil {
+		return err
+	}
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDefaultPolicyAccept) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputDefaultPolicyDrop tests the default DROP policy.
+type FilterInputDefaultPolicyDrop struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDefaultPolicyDrop) Name() string {
+	return "FilterInputDefaultPolicyDrop"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDefaultPolicyDrop) ContainerAction(ip net.IP) error {
+	if err := filterTable("-P", "INPUT", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort)
+	} else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+		return fmt.Errorf("error reading: %v", err)
+	}
+
+	// At this point we know that reading timed out and never received a
+	// packet.
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDefaultPolicyDrop) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputReturnUnderflow tests that -j RETURN in a built-in chain causes
+// the underflow rule (i.e. default policy) to be executed.
+type FilterInputReturnUnderflow struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputReturnUnderflow) Name() string {
+	return "FilterInputReturnUnderflow"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputReturnUnderflow) ContainerAction(ip net.IP) error {
+	// Add a RETURN rule followed by an unconditional accept, and set the
+	// default policy to DROP.
+	if err := filterTable("-A", "INPUT", "-j", "RETURN"); err != nil {
+		return err
+	}
+	if err := filterTable("-A", "INPUT", "-j", "DROP"); err != nil {
+		return err
+	}
+	if err := filterTable("-P", "INPUT", "ACCEPT"); err != nil {
+		return err
+	}
+
+	// We should receive packets, as the RETURN rule will trigger the default
+	// ACCEPT policy.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputReturnUnderflow) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 41909582a..46a7c99b0 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -214,6 +214,30 @@ func TestFilterInputDropTCPSrcPort(t *testing.T) {
 	}
 }
 
+func TestFilterInputCreateUserChain(t *testing.T) {
+	if err := singleTest(FilterInputCreateUserChain{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDefaultPolicyAccept(t *testing.T) {
+	if err := singleTest(FilterInputDefaultPolicyAccept{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDefaultPolicyDrop(t *testing.T) {
+	if err := singleTest(FilterInputDefaultPolicyDrop{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputReturnUnderflow(t *testing.T) {
+	if err := singleTest(FilterInputReturnUnderflow{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
 func TestFilterOutputDropTCPDestPort(t *testing.T) {
 	if err := singleTest(FilterOutputDropTCPDestPort{}); err != nil {
 		t.Fatal(err)
-- 
cgit v1.2.3


From d30a884775556474f1a893fd30460a6edf0a3039 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Tue, 11 Feb 2020 00:52:23 +0000
Subject: Add definition of arch.ARMTrapFlag.

Fixes #1708

Signed-off-by: Haibo Xu haibo.xu@arm.com
Change-Id: Ib15768692ead17c81c06f7666ca3f0a14064c3a0
---
 pkg/sentry/arch/arch_aarch64.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index 3b6987665..d7794db63 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -34,6 +34,9 @@ const (
 	SyscallWidth = 4
 )
 
+// ARMTrapFlag is the mask for the trap flag.
+const ARMTrapFlag = uint64(1) << 21
+
 // aarch64FPState is aarch64 floating point state.
 type aarch64FPState []byte
 
-- 
cgit v1.2.3


From cf1e50a80976fae95eef4ab05d961200b04e2346 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 12 Feb 2020 16:26:08 -0800
Subject: Minor runtime test fixes.

* Allow scripts/common.sh to be sourced from outside the scripts/ directory
* Fix passing empty args to Bazel, which causes the tool to exit with a failure
  even if the command succeeds.

PiperOrigin-RevId: 294785456
---
 kokoro/runtime_tests/runtime_tests.sh |  6 +++++-
 scripts/common.sh                     | 12 +++++++++++-
 scripts/common_build.sh               | 27 ++++++++++++++++-----------
 3 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/kokoro/runtime_tests/runtime_tests.sh b/kokoro/runtime_tests/runtime_tests.sh
index 9ee991e42..73a58f806 100755
--- a/kokoro/runtime_tests/runtime_tests.sh
+++ b/kokoro/runtime_tests/runtime_tests.sh
@@ -14,7 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-source $(dirname $0)/common.sh
+# Run in the root of the repo.
+cd "$(dirname "$0")"
+cd "$(git rev-parse --show-toplevel)"
+
+source scripts/common.sh
 
 if [ ! -v RUNTIME_TEST_NAME ]; then
   echo 'Must set $RUNTIME_TEST_NAME' >&2
diff --git a/scripts/common.sh b/scripts/common.sh
index cd91b9f8e..3ca699e4a 100755
--- a/scripts/common.sh
+++ b/scripts/common.sh
@@ -16,7 +16,17 @@
 
 set -xeou pipefail
 
-source $(dirname $0)/common_build.sh
+# Get the path to the directory this script lives in.
+# If this script is being called with `source`, $0 will be the path of the
+# *sourcing* script, so we can't use `dirname $0` to find scripts in this
+# directory.
+if [[ -v BASH_SOURCE && "$0" != "$BASH_SOURCE" ]]; then
+  declare -r script_dir="$(dirname "$BASH_SOURCE")"
+else
+  declare -r script_dir="$(dirname "$0")"
+fi
+
+source "${script_dir}/common_build.sh"
 
 # Ensure it attempts to collect logs in all cases.
 trap collect_logs EXIT
diff --git a/scripts/common_build.sh b/scripts/common_build.sh
index 2c2a826c7..ae8b67383 100755
--- a/scripts/common_build.sh
+++ b/scripts/common_build.sh
@@ -14,8 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Install the latest version of Bazel and log the version.
-(which use_bazel.sh && use_bazel.sh latest) || which bazel
+which bazel
 bazel version
 
 # Switch into the workspace; only necessary if run with kokoro.
@@ -26,27 +25,30 @@ elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then
 fi
 
 # Set the standard bazel flags.
-declare -r BAZEL_FLAGS=(
+declare -a BAZEL_FLAGS=(
   "--show_timestamps"
   "--test_output=errors"
   "--keep_going"
   "--verbose_failures=true"
 )
-BAZEL_RBE_AUTH_FLAGS=""
-BAZEL_RBE_FLAGS=""
 if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then
-  declare -r BAZEL_RBE_AUTH_FLAGS="--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
-  declare -r BAZEL_RBE_FLAGS="--config=remote"
+  BAZEL_FLAGS+=(
+    "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
+    "--config=remote"
+  )
 fi
+declare -r BAZEL_FLAGS
 
 # Wrap bazel.
 function build() {
-  bazel build "${BAZEL_RBE_FLAGS}" "${BAZEL_RBE_AUTH_FLAGS}" "${BAZEL_FLAGS[@]}" "$@" 2>&1 |
-    tee /dev/fd/2 | grep -E '^  bazel-bin/' | awk '{ print $1; }'
+  bazel build "${BAZEL_FLAGS[@]}" "$@" 2>&1 \
+    | tee /dev/fd/2 \
+    | grep -E '^  bazel-bin/' \
+    | awk '{ print $1; }'
 }
 
 function test() {
-  bazel test "${BAZEL_RBE_FLAGS}" "${BAZEL_RBE_AUTH_FLAGS}" "${BAZEL_FLAGS[@]}" "$@"
+  bazel test "${BAZEL_FLAGS[@]}" "$@"
 }
 
 function run() {
@@ -95,5 +97,8 @@ function collect_logs() {
 }
 
 function find_branch_name() {
-  git branch --show-current || git rev-parse HEAD || bazel info workspace | xargs basename
+  git branch --show-current \
+    || git rev-parse HEAD \
+    || bazel info workspace \
+    | xargs basename
 }
-- 
cgit v1.2.3


From 3ad6d3056371b031fb0c16c4e365d5c7e60bdaf0 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 13 Feb 2020 09:20:30 -0800
Subject: Call py_requirement with named argument for optional kwarg.

PiperOrigin-RevId: 294930818
---
 benchmarks/defs.bzl                        |  14 +++
 benchmarks/harness/BUILD                   | 165 ++++++++++++++++++++++-------
 benchmarks/harness/machine_producers/BUILD |   5 +-
 benchmarks/runner/BUILD                    |  17 +--
 benchmarks/workloads/ab/BUILD              |  13 +--
 benchmarks/workloads/absl/BUILD            |  13 +--
 benchmarks/workloads/fio/BUILD             |  13 +--
 benchmarks/workloads/iperf/BUILD           |  13 +--
 benchmarks/workloads/redisbenchmark/BUILD  |  13 +--
 benchmarks/workloads/sysbench/BUILD        |  13 +--
 benchmarks/workloads/syscall/BUILD         |  13 +--
 tools/bazeldefs/defs.bzl                   |   2 +-
 12 files changed, 172 insertions(+), 122 deletions(-)
 create mode 100644 benchmarks/defs.bzl

diff --git a/benchmarks/defs.bzl b/benchmarks/defs.bzl
new file mode 100644
index 000000000..56d28223e
--- /dev/null
+++ b/benchmarks/defs.bzl
@@ -0,0 +1,14 @@
+"""Provides attributes common to many workload tests."""
+
+load("//tools:defs.bzl", "py_requirement")
+
+test_deps = [
+    py_requirement("attrs", direct = False),
+    py_requirement("atomicwrites", direct = False),
+    py_requirement("more-itertools", direct = False),
+    py_requirement("pathlib2", direct = False),
+    py_requirement("pluggy", direct = False),
+    py_requirement("py", direct = False),
+    py_requirement("pytest"),
+    py_requirement("six", direct = False),
+]
diff --git a/benchmarks/harness/BUILD b/benchmarks/harness/BUILD
index 4d03e3a06..48c548d59 100644
--- a/benchmarks/harness/BUILD
+++ b/benchmarks/harness/BUILD
@@ -1,5 +1,4 @@
-load("//tools:defs.bzl", "pkg_tar")
-load("//tools:defs.bzl", "py_library", "py_requirement")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -46,16 +45,43 @@ py_library(
     srcs = ["container.py"],
     deps = [
         "//benchmarks/workloads",
-        py_requirement("asn1crypto", False),
-        py_requirement("chardet", False),
-        py_requirement("certifi", False),
-        py_requirement("docker", True),
-        py_requirement("docker-pycreds", False),
-        py_requirement("idna", False),
-        py_requirement("ptyprocess", False),
-        py_requirement("requests", False),
-        py_requirement("urllib3", False),
-        py_requirement("websocket-client", False),
+        py_requirement(
+            "asn1crypto",
+            direct = False,
+        ),
+        py_requirement(
+            "chardet",
+            direct = False,
+        ),
+        py_requirement(
+            "certifi",
+            direct = False,
+        ),
+        py_requirement("docker"),
+        py_requirement(
+            "docker-pycreds",
+            direct = False,
+        ),
+        py_requirement(
+            "idna",
+            direct = False,
+        ),
+        py_requirement(
+            "ptyprocess",
+            direct = False,
+        ),
+        py_requirement(
+            "requests",
+            direct = False,
+        ),
+        py_requirement(
+            "urllib3",
+            direct = False,
+        ),
+        py_requirement(
+            "websocket-client",
+            direct = False,
+        ),
     ],
 )
 
@@ -68,17 +94,47 @@ py_library(
         "//benchmarks/harness:ssh_connection",
         "//benchmarks/harness:tunnel_dispatcher",
         "//benchmarks/harness/machine_mocks",
-        py_requirement("asn1crypto", False),
-        py_requirement("chardet", False),
-        py_requirement("certifi", False),
-        py_requirement("docker", True),
-        py_requirement("docker-pycreds", False),
-        py_requirement("idna", False),
-        py_requirement("ptyprocess", False),
-        py_requirement("requests", False),
-        py_requirement("six", False),
-        py_requirement("urllib3", False),
-        py_requirement("websocket-client", False),
+        py_requirement(
+            "asn1crypto",
+            direct = False,
+        ),
+        py_requirement(
+            "chardet",
+            direct = False,
+        ),
+        py_requirement(
+            "certifi",
+            direct = False,
+        ),
+        py_requirement("docker"),
+        py_requirement(
+            "docker-pycreds",
+            direct = False,
+        ),
+        py_requirement(
+            "idna",
+            direct = False,
+        ),
+        py_requirement(
+            "ptyprocess",
+            direct = False,
+        ),
+        py_requirement(
+            "requests",
+            direct = False,
+        ),
+        py_requirement(
+            "six",
+            direct = False,
+        ),
+        py_requirement(
+            "urllib3",
+            direct = False,
+        ),
+        py_requirement(
+            "websocket-client",
+            direct = False,
+        ),
     ],
 )
 
@@ -87,10 +143,16 @@ py_library(
     srcs = ["ssh_connection.py"],
     deps = [
         "//benchmarks/harness",
-        py_requirement("bcrypt", False),
-        py_requirement("cffi", True),
-        py_requirement("paramiko", True),
-        py_requirement("cryptography", False),
+        py_requirement(
+            "bcrypt",
+            direct = False,
+        ),
+        py_requirement("cffi"),
+        py_requirement("paramiko"),
+        py_requirement(
+            "cryptography",
+            direct = False,
+        ),
     ],
 )
 
@@ -98,16 +160,43 @@ py_library(
     name = "tunnel_dispatcher",
     srcs = ["tunnel_dispatcher.py"],
     deps = [
-        py_requirement("asn1crypto", False),
-        py_requirement("chardet", False),
-        py_requirement("certifi", False),
-        py_requirement("docker", True),
-        py_requirement("docker-pycreds", False),
-        py_requirement("idna", False),
-        py_requirement("pexpect", True),
-        py_requirement("ptyprocess", False),
-        py_requirement("requests", False),
-        py_requirement("urllib3", False),
-        py_requirement("websocket-client", False),
+        py_requirement(
+            "asn1crypto",
+            direct = False,
+        ),
+        py_requirement(
+            "chardet",
+            direct = False,
+        ),
+        py_requirement(
+            "certifi",
+            direct = False,
+        ),
+        py_requirement("docker"),
+        py_requirement(
+            "docker-pycreds",
+            direct = False,
+        ),
+        py_requirement(
+            "idna",
+            direct = False,
+        ),
+        py_requirement("pexpect"),
+        py_requirement(
+            "ptyprocess",
+            direct = False,
+        ),
+        py_requirement(
+            "requests",
+            direct = False,
+        ),
+        py_requirement(
+            "urllib3",
+            direct = False,
+        ),
+        py_requirement(
+            "websocket-client",
+            direct = False,
+        ),
     ],
 )
diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD
index 3711a397f..81f19bd08 100644
--- a/benchmarks/harness/machine_producers/BUILD
+++ b/benchmarks/harness/machine_producers/BUILD
@@ -31,7 +31,10 @@ py_library(
     deps = [
         "//benchmarks/harness:machine",
         "//benchmarks/harness/machine_producers:machine_producer",
-        py_requirement("PyYAML", False),
+        py_requirement(
+            "PyYAML",
+            direct = False,
+        ),
     ],
 )
 
diff --git a/benchmarks/runner/BUILD b/benchmarks/runner/BUILD
index fae0ca800..471debfdf 100644
--- a/benchmarks/runner/BUILD
+++ b/benchmarks/runner/BUILD
@@ -1,4 +1,5 @@
 load("//tools:defs.bzl", "py_library", "py_requirement", "py_test")
+load("//benchmarks:defs.bzl", "test_deps")
 
 package(licenses = ["notice"])
 
@@ -28,7 +29,7 @@ py_library(
         "//benchmarks/suites:startup",
         "//benchmarks/suites:sysbench",
         "//benchmarks/suites:syscall",
-        py_requirement("click", True),
+        py_requirement("click"),
     ],
 )
 
@@ -36,7 +37,7 @@ py_library(
     name = "commands",
     srcs = ["commands.py"],
     deps = [
-        py_requirement("click", True),
+        py_requirement("click"),
     ],
 )
 
@@ -48,16 +49,8 @@ py_test(
         "local",
         "manual",
     ],
-    deps = [
+    deps = test_deps + [
         ":runner",
-        py_requirement("click", True),
-        py_requirement("attrs", False),
-        py_requirement("atomicwrites", False),
-        py_requirement("more-itertools", False),
-        py_requirement("pathlib2", False),
-        py_requirement("pluggy", False),
-        py_requirement("py", False),
-        py_requirement("pytest", True),
-        py_requirement("six", False),
+        py_requirement("click"),
     ],
 )
diff --git a/benchmarks/workloads/ab/BUILD b/benchmarks/workloads/ab/BUILD
index 4dd91ceb3..945ac7026 100644
--- a/benchmarks/workloads/ab/BUILD
+++ b/benchmarks/workloads/ab/BUILD
@@ -1,4 +1,5 @@
-load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test")
+load("//benchmarks:defs.bzl", "test_deps")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -14,16 +15,8 @@ py_test(
     name = "ab_test",
     srcs = ["ab_test.py"],
     python_version = "PY3",
-    deps = [
+    deps = test_deps + [
         ":ab",
-        py_requirement("attrs", False),
-        py_requirement("atomicwrites", False),
-        py_requirement("more-itertools", False),
-        py_requirement("pathlib2", False),
-        py_requirement("pluggy", False),
-        py_requirement("py", False),
-        py_requirement("pytest", True),
-        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/absl/BUILD b/benchmarks/workloads/absl/BUILD
index 55dae3baa..bb1a308bf 100644
--- a/benchmarks/workloads/absl/BUILD
+++ b/benchmarks/workloads/absl/BUILD
@@ -1,4 +1,5 @@
-load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test")
+load("//benchmarks:defs.bzl", "test_deps")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -14,16 +15,8 @@ py_test(
     name = "absl_test",
     srcs = ["absl_test.py"],
     python_version = "PY3",
-    deps = [
+    deps = test_deps + [
         ":absl",
-        py_requirement("attrs", False),
-        py_requirement("atomicwrites", False),
-        py_requirement("more-itertools", False),
-        py_requirement("pathlib2", False),
-        py_requirement("pluggy", False),
-        py_requirement("py", False),
-        py_requirement("pytest", True),
-        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/fio/BUILD b/benchmarks/workloads/fio/BUILD
index 7b78e8e75..24d909c53 100644
--- a/benchmarks/workloads/fio/BUILD
+++ b/benchmarks/workloads/fio/BUILD
@@ -1,4 +1,5 @@
-load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test")
+load("//benchmarks:defs.bzl", "test_deps")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -14,16 +15,8 @@ py_test(
     name = "fio_test",
     srcs = ["fio_test.py"],
     python_version = "PY3",
-    deps = [
+    deps = test_deps + [
         ":fio",
-        py_requirement("attrs", False),
-        py_requirement("atomicwrites", False),
-        py_requirement("more-itertools", False),
-        py_requirement("pathlib2", False),
-        py_requirement("pluggy", False),
-        py_requirement("py", False),
-        py_requirement("pytest", True),
-        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/iperf/BUILD b/benchmarks/workloads/iperf/BUILD
index 570f40148..91b953718 100644
--- a/benchmarks/workloads/iperf/BUILD
+++ b/benchmarks/workloads/iperf/BUILD
@@ -1,4 +1,5 @@
-load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test")
+load("//benchmarks:defs.bzl", "test_deps")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -14,16 +15,8 @@ py_test(
     name = "iperf_test",
     srcs = ["iperf_test.py"],
     python_version = "PY3",
-    deps = [
+    deps = test_deps + [
         ":iperf",
-        py_requirement("attrs", False),
-        py_requirement("atomicwrites", False),
-        py_requirement("more-itertools", False),
-        py_requirement("pathlib2", False),
-        py_requirement("pluggy", False),
-        py_requirement("py", False),
-        py_requirement("pytest", True),
-        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/redisbenchmark/BUILD b/benchmarks/workloads/redisbenchmark/BUILD
index f472a4443..147cfedd2 100644
--- a/benchmarks/workloads/redisbenchmark/BUILD
+++ b/benchmarks/workloads/redisbenchmark/BUILD
@@ -1,4 +1,5 @@
-load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test")
+load("//benchmarks:defs.bzl", "test_deps")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -14,16 +15,8 @@ py_test(
     name = "redisbenchmark_test",
     srcs = ["redisbenchmark_test.py"],
     python_version = "PY3",
-    deps = [
+    deps = test_deps + [
         ":redisbenchmark",
-        py_requirement("attrs", False),
-        py_requirement("atomicwrites", False),
-        py_requirement("more-itertools", False),
-        py_requirement("pathlib2", False),
-        py_requirement("pluggy", False),
-        py_requirement("py", False),
-        py_requirement("pytest", True),
-        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/sysbench/BUILD b/benchmarks/workloads/sysbench/BUILD
index 3834af7ed..ab2556064 100644
--- a/benchmarks/workloads/sysbench/BUILD
+++ b/benchmarks/workloads/sysbench/BUILD
@@ -1,4 +1,5 @@
-load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test")
+load("//benchmarks:defs.bzl", "test_deps")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -14,16 +15,8 @@ py_test(
     name = "sysbench_test",
     srcs = ["sysbench_test.py"],
     python_version = "PY3",
-    deps = [
+    deps = test_deps + [
         ":sysbench",
-        py_requirement("attrs", False),
-        py_requirement("atomicwrites", False),
-        py_requirement("more-itertools", False),
-        py_requirement("pathlib2", False),
-        py_requirement("pluggy", False),
-        py_requirement("py", False),
-        py_requirement("pytest", True),
-        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/syscall/BUILD b/benchmarks/workloads/syscall/BUILD
index dba4bb1e7..f8c43bca1 100644
--- a/benchmarks/workloads/syscall/BUILD
+++ b/benchmarks/workloads/syscall/BUILD
@@ -1,4 +1,5 @@
-load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_test")
+load("//benchmarks:defs.bzl", "test_deps")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -14,16 +15,8 @@ py_test(
     name = "syscall_test",
     srcs = ["syscall_test.py"],
     python_version = "PY3",
-    deps = [
+    deps = test_deps + [
         ":syscall",
-        py_requirement("attrs", False),
-        py_requirement("atomicwrites", False),
-        py_requirement("more-itertools", False),
-        py_requirement("pathlib2", False),
-        py_requirement("pluggy", False),
-        py_requirement("py", False),
-        py_requirement("pytest", True),
-        py_requirement("six", False),
     ],
 )
 
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index 08c29ff1c..6798362dc 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -72,7 +72,7 @@ def go_test(name, **kwargs):
         **kwargs
     )
 
-def py_requirement(name, direct = False):
+def py_requirement(name, direct = True):
     return _py_requirement(name)
 
 def select_arch(amd64 = "amd64", arm64 = "arm64", default = None, **kwargs):
-- 
cgit v1.2.3


From 69bf39e8a47d3b4dcbbd04d2e8df476cdfab5e74 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 13 Feb 2020 10:58:47 -0800
Subject: Internal change.

PiperOrigin-RevId: 294952610
---
 pkg/abi/linux/socket.go                        | 13 ++++
 pkg/sentry/socket/control/BUILD                |  1 +
 pkg/sentry/socket/control/control.go           | 43 +++++++++++++
 pkg/sentry/socket/hostinet/socket.go           | 11 +++-
 pkg/sentry/socket/netstack/netstack.go         | 37 ++++++++++--
 pkg/tcpip/tcpip.go                             | 25 ++++++++
 pkg/tcpip/transport/udp/endpoint.go            | 26 ++++++++
 test/syscalls/linux/socket_ip_udp_generic.cc   | 44 ++++++++++++++
 test/syscalls/linux/socket_ipv4_udp_unbound.cc | 84 ++++++++++++++++++++++++++
 test/syscalls/linux/udp_socket_test_cases.cc   |  1 -
 10 files changed, 278 insertions(+), 7 deletions(-)

diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go
index 766ee4014..4a14ef691 100644
--- a/pkg/abi/linux/socket.go
+++ b/pkg/abi/linux/socket.go
@@ -411,6 +411,15 @@ type ControlMessageCredentials struct {
 	GID uint32
 }
 
+// A ControlMessageIPPacketInfo is IP_PKTINFO socket control message.
+//
+// ControlMessageIPPacketInfo represents struct in_pktinfo from linux/in.h.
+type ControlMessageIPPacketInfo struct {
+	NIC             int32
+	LocalAddr       InetAddr
+	DestinationAddr InetAddr
+}
+
 // SizeOfControlMessageCredentials is the binary size of a
 // ControlMessageCredentials struct.
 var SizeOfControlMessageCredentials = int(binary.Size(ControlMessageCredentials{}))
@@ -431,6 +440,10 @@ const SizeOfControlMessageTOS = 1
 // SizeOfControlMessageTClass is the size of an IPV6_TCLASS control message.
 const SizeOfControlMessageTClass = 4
 
+// SizeOfControlMessageIPPacketInfo is the size of an IP_PKTINFO
+// control message.
+const SizeOfControlMessageIPPacketInfo = 12
+
 // SCM_MAX_FD is the maximum number of FDs accepted in a single sendmsg call.
 // From net/scm.h.
 const SCM_MAX_FD = 253
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 79e16d6e8..4d42d29cb 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -19,6 +19,7 @@ go_library(
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/syserror",
+        "//pkg/tcpip",
         "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 6145a7fc3..4667373d2 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -338,6 +339,22 @@ func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
 	)
 }
 
+// PackIPPacketInfo packs an IP_PKTINFO socket control message.
+func PackIPPacketInfo(t *kernel.Task, packetInfo tcpip.IPPacketInfo, buf []byte) []byte {
+	var p linux.ControlMessageIPPacketInfo
+	p.NIC = int32(packetInfo.NIC)
+	copy(p.LocalAddr[:], []byte(packetInfo.LocalAddr))
+	copy(p.DestinationAddr[:], []byte(packetInfo.DestinationAddr))
+
+	return putCmsgStruct(
+		buf,
+		linux.SOL_IP,
+		linux.IP_PKTINFO,
+		t.Arch().Width(),
+		p,
+	)
+}
+
 // PackControlMessages packs control messages into the given buffer.
 //
 // We skip control messages specific to Unix domain sockets.
@@ -362,6 +379,10 @@ func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byt
 		buf = PackTClass(t, cmsgs.IP.TClass, buf)
 	}
 
+	if cmsgs.IP.HasIPPacketInfo {
+		buf = PackIPPacketInfo(t, cmsgs.IP.PacketInfo, buf)
+	}
+
 	return buf
 }
 
@@ -394,6 +415,16 @@ func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int {
 	return space
 }
 
+// NewIPPacketInfo returns the IPPacketInfo struct.
+func NewIPPacketInfo(packetInfo linux.ControlMessageIPPacketInfo) tcpip.IPPacketInfo {
+	var p tcpip.IPPacketInfo
+	p.NIC = tcpip.NICID(packetInfo.NIC)
+	copy([]byte(p.LocalAddr), packetInfo.LocalAddr[:])
+	copy([]byte(p.DestinationAddr), packetInfo.DestinationAddr[:])
+
+	return p
+}
+
 // Parse parses a raw socket control message into portable objects.
 func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.ControlMessages, error) {
 	var (
@@ -468,6 +499,18 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS)
 				i += binary.AlignUp(length, width)
 
+			case linux.IP_PKTINFO:
+				if length < linux.SizeOfControlMessageIPPacketInfo {
+					return socket.ControlMessages{}, syserror.EINVAL
+				}
+
+				cmsgs.IP.HasIPPacketInfo = true
+				var packetInfo linux.ControlMessageIPPacketInfo
+				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo)
+
+				cmsgs.IP.PacketInfo = NewIPPacketInfo(packetInfo)
+				i += binary.AlignUp(length, width)
+
 			default:
 				return socket.ControlMessages{}, syserror.EINVAL
 			}
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index de76388ac..22f78d2e2 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -289,7 +289,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 	switch level {
 	case linux.SOL_IP:
 		switch name {
-		case linux.IP_TOS, linux.IP_RECVTOS:
+		case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO:
 			optlen = sizeofInt32
 		}
 	case linux.SOL_IPV6:
@@ -336,6 +336,8 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 		switch name {
 		case linux.IP_TOS, linux.IP_RECVTOS:
 			optlen = sizeofInt32
+		case linux.IP_PKTINFO:
+			optlen = linux.SizeOfControlMessageIPPacketInfo
 		}
 	case linux.SOL_IPV6:
 		switch name {
@@ -473,7 +475,14 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 			case syscall.IP_TOS:
 				controlMessages.IP.HasTOS = true
 				binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTOS], usermem.ByteOrder, &controlMessages.IP.TOS)
+
+			case syscall.IP_PKTINFO:
+				controlMessages.IP.HasIPPacketInfo = true
+				var packetInfo linux.ControlMessageIPPacketInfo
+				binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo)
+				controlMessages.IP.PacketInfo = control.NewIPPacketInfo(packetInfo)
 			}
+
 		case syscall.SOL_IPV6:
 			switch unixCmsg.Header.Type {
 			case syscall.IPV6_TCLASS:
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index ed2fbcceb..9757fbfba 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1414,6 +1414,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		}
 		return o, nil
 
+	case linux.IP_PKTINFO:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveIPPacketInfoOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+		var o int32
+		if v {
+			o = 1
+		}
+		return o, nil
+
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1762,6 +1777,7 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		linux.IPV6_IPSEC_POLICY,
 		linux.IPV6_JOIN_ANYCAST,
 		linux.IPV6_LEAVE_ANYCAST,
+		// TODO(b/148887420): Add support for IPV6_PKTINFO.
 		linux.IPV6_PKTINFO,
 		linux.IPV6_ROUTER_ALERT,
 		linux.IPV6_XFRM_POLICY,
@@ -1949,6 +1965,16 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0))
 
+	case linux.IP_PKTINFO:
+		if len(optVal) == 0 {
+			return nil
+		}
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0))
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
@@ -1964,7 +1990,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		linux.IP_NODEFRAG,
 		linux.IP_OPTIONS,
 		linux.IP_PASSSEC,
-		linux.IP_PKTINFO,
 		linux.IP_RECVERR,
 		linux.IP_RECVFRAGSIZE,
 		linux.IP_RECVOPTS,
@@ -2395,10 +2420,12 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 func (s *SocketOperations) controlMessages() socket.ControlMessages {
 	return socket.ControlMessages{
 		IP: tcpip.ControlMessages{
-			HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp,
-			Timestamp:    s.readCM.Timestamp,
-			HasTOS:       s.readCM.HasTOS,
-			TOS:          s.readCM.TOS,
+			HasTimestamp:    s.readCM.HasTimestamp && s.sockOptTimestamp,
+			Timestamp:       s.readCM.Timestamp,
+			HasTOS:          s.readCM.HasTOS,
+			TOS:             s.readCM.TOS,
+			HasIPPacketInfo: s.readCM.HasIPPacketInfo,
+			PacketInfo:      s.readCM.PacketInfo,
 		},
 	}
 }
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 0e944712f..9ca39ce40 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -328,6 +328,12 @@ type ControlMessages struct {
 
 	// Tclass is the IPv6 traffic class of the associated packet.
 	TClass int32
+
+	// HasIPPacketInfo indicates whether PacketInfo is set.
+	HasIPPacketInfo bool
+
+	// PacketInfo holds interface and address data on an incoming packet.
+	PacketInfo IPPacketInfo
 }
 
 // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
@@ -503,6 +509,11 @@ const (
 	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
 	// socket is to be restricted to sending and receiving IPv6 packets only.
 	V6OnlyOption
+
+	// ReceiveIPPacketInfoOption is used by {G,S}etSockOptBool to specify
+	// if more inforamtion is provided with incoming packets such
+	// as interface index and address.
+	ReceiveIPPacketInfoOption
 )
 
 // SockOptInt represents socket options which values have the int type.
@@ -685,6 +696,20 @@ type IPv4TOSOption uint8
 // for all subsequent outgoing IPv6 packets from the endpoint.
 type IPv6TrafficClassOption uint8
 
+// IPPacketInfo is the message struture for IP_PKTINFO.
+//
+// +stateify savable
+type IPPacketInfo struct {
+	// NIC is the ID of the NIC to be used.
+	NIC NICID
+
+	// LocalAddr is the local address.
+	LocalAddr Address
+
+	// DestinationAddr is the destination address.
+	DestinationAddr Address
+}
+
 // Route is a row in the routing table. It specifies through which NIC (and
 // gateway) sets of packets should be routed. A row is considered viable if the
 // masked target address matches the destination address in the row.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index c9cbed8f4..3fe91cac2 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -29,6 +29,7 @@ import (
 type udpPacket struct {
 	udpPacketEntry
 	senderAddress tcpip.FullAddress
+	packetInfo    tcpip.IPPacketInfo
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
 	tos           uint8
@@ -118,6 +119,9 @@ type endpoint struct {
 	// as ancillary data to ControlMessages on Read.
 	receiveTOS bool
 
+	// receiveIPPacketInfo determines if the packet info is returned by Read.
+	receiveIPPacketInfo bool
+
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -254,11 +258,17 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 	}
 	e.mu.RLock()
 	receiveTOS := e.receiveTOS
+	receiveIPPacketInfo := e.receiveIPPacketInfo
 	e.mu.RUnlock()
 	if receiveTOS {
 		cm.HasTOS = true
 		cm.TOS = p.tos
 	}
+
+	if receiveIPPacketInfo {
+		cm.HasIPPacketInfo = true
+		cm.PacketInfo = p.packetInfo
+	}
 	return p.data.ToView(), cm, nil
 }
 
@@ -495,6 +505,13 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		}
 
 		e.v6only = v
+		return nil
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.Lock()
+		e.receiveIPPacketInfo = v
+		e.mu.Unlock()
+		return nil
 	}
 
 	return nil
@@ -703,6 +720,12 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		e.mu.RUnlock()
 
 		return v, nil
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.RLock()
+		v := e.receiveIPPacketInfo
+		e.mu.RUnlock()
+		return v, nil
 	}
 
 	return false, tcpip.ErrUnknownProtocolOption
@@ -1247,6 +1270,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	switch r.NetProto {
 	case header.IPv4ProtocolNumber:
 		packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
+		packet.packetInfo.LocalAddr = r.LocalAddress
+		packet.packetInfo.DestinationAddr = r.RemoteAddress
+		packet.packetInfo.NIC = r.NICID()
 	}
 
 	packet.timestamp = e.stack.NowNanoseconds()
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 53290bed7..db5663ecd 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -357,5 +357,49 @@ TEST_P(UDPSocketPairTest, SetReuseAddrReusePort) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
+// Test getsockopt for a socket which is not set with IP_PKTINFO option.
+TEST_P(UDPSocketPairTest, IPPKTINFODefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_IP, IP_PKTINFO, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test setsockopt and getsockopt for a socket with IP_PKTINFO option.
+TEST_P(UDPSocketPairTest, SetAndGetIPPKTINFO) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int level = SOL_IP;
+  int type = IP_PKTINFO;
+
+  // Check getsockopt before IP_PKTINFO is set.
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get, kSockOptOn);
+  EXPECT_EQ(get_len, sizeof(get));
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOff,
+                         sizeof(kSockOptOff)),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get, kSockOptOff);
+  EXPECT_EQ(get_len, sizeof(get));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index 990ccf23c..bc4b07a62 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -15,6 +15,7 @@
 #include "test/syscalls/linux/socket_ipv4_udp_unbound.h"
 
 #include <arpa/inet.h>
+#include <net/if.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -2128,5 +2129,88 @@ TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
               SyscallSucceedsWithValue(kMessageSize));
 }
 
+// Test that socket will receive packet info control message.
+TEST_P(IPv4UDPUnboundSocketTest, SetAndReceiveIPPKTINFO) {
+  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+  SKIP_IF((IsRunningWithHostinet()));
+
+  auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto sender_addr = V4Loopback();
+  int level = SOL_IP;
+  int type = IP_PKTINFO;
+
+  ASSERT_THAT(
+      bind(receiver->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+           sender_addr.addr_len),
+      SyscallSucceeds());
+  socklen_t sender_addr_len = sender_addr.addr_len;
+  ASSERT_THAT(getsockname(receiver->get(),
+                          reinterpret_cast<sockaddr*>(&sender_addr.addr),
+                          &sender_addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(sender_addr_len, sender_addr.addr_len);
+
+  auto receiver_addr = V4Loopback();
+  reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port =
+      reinterpret_cast<sockaddr_in*>(&sender_addr.addr)->sin_port;
+  ASSERT_THAT(
+      connect(sender->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+              receiver_addr.addr_len),
+      SyscallSucceeds());
+
+  // Allow socket to receive control message.
+  ASSERT_THAT(
+      setsockopt(receiver->get(), level, type, &kSockOptOn, sizeof(kSockOptOn)),
+      SyscallSucceeds());
+
+  // Prepare message to send.
+  constexpr size_t kDataLength = 1024;
+  msghdr sent_msg = {};
+  iovec sent_iov = {};
+  char sent_data[kDataLength];
+  sent_iov.iov_base = sent_data;
+  sent_iov.iov_len = kDataLength;
+  sent_msg.msg_iov = &sent_iov;
+  sent_msg.msg_iovlen = 1;
+  sent_msg.msg_flags = 0;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(sender->get(), &sent_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  msghdr received_msg = {};
+  iovec received_iov = {};
+  char received_data[kDataLength];
+  char received_cmsg_buf[CMSG_SPACE(sizeof(in_pktinfo))] = {};
+  size_t cmsg_data_len = sizeof(in_pktinfo);
+  received_iov.iov_base = received_data;
+  received_iov.iov_len = kDataLength;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+  received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+  received_msg.msg_control = received_cmsg_buf;
+
+  ASSERT_THAT(RetryEINTR(recvmsg)(receiver->get(), &received_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+  ASSERT_NE(cmsg, nullptr);
+  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+  EXPECT_EQ(cmsg->cmsg_level, level);
+  EXPECT_EQ(cmsg->cmsg_type, type);
+
+  // Get loopback index.
+  ifreq ifr = {};
+  absl::SNPrintF(ifr.ifr_name, IFNAMSIZ, "lo");
+  ASSERT_THAT(ioctl(sender->get(), SIOCGIFINDEX, &ifr), SyscallSucceeds());
+  ASSERT_NE(ifr.ifr_ifindex, 0);
+
+  // Check the data
+  in_pktinfo received_pktinfo = {};
+  memcpy(&received_pktinfo, CMSG_DATA(cmsg), sizeof(in_pktinfo));
+  EXPECT_EQ(received_pktinfo.ipi_ifindex, ifr.ifr_ifindex);
+  EXPECT_EQ(received_pktinfo.ipi_spec_dst.s_addr, htonl(INADDR_LOOPBACK));
+  EXPECT_EQ(received_pktinfo.ipi_addr.s_addr, htonl(INADDR_LOOPBACK));
+}
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index a2f6ef8cc..9f8de6b48 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1495,6 +1495,5 @@ TEST_P(UdpSocketTest, SendAndReceiveTOS) {
   memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
   EXPECT_EQ(received_tos, sent_tos);
 }
-
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 336f758d59a8a0411c745d744a1e5c3294eaf78a Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 13 Feb 2020 16:31:33 -0800
Subject: Ensure the marshalled object doesn't escape.

Add new Marshallable interface methods CopyIn/CopyOut, which can be directly
called on the marshalled object, avoiding an interface indirection. Such
indirections are problematic because they always cause the marshalled object to
escape.

PiperOrigin-RevId: 295028010
---
 tools/go_marshal/gomarshal/generator.go            | 39 +++++----
 tools/go_marshal/gomarshal/generator_interfaces.go | 98 ++++++++++++++++++++++
 tools/go_marshal/gomarshal/generator_tests.go      |  1 +
 tools/go_marshal/gomarshal/util.go                 |  5 ++
 tools/go_marshal/marshal/BUILD                     |  3 +
 tools/go_marshal/marshal/marshal.go                | 42 +++++++++-
 6 files changed, 169 insertions(+), 19 deletions(-)

diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 0b3f600fe..01be7c477 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -34,9 +34,9 @@ const (
 	usermemImport  = "gvisor.dev/gvisor/pkg/usermem"
 )
 
-// List of identifiers we use in generated code, that may conflict a
-// similarly-named source identifier. Avoid problems by refusing the generate
-// code when we see these.
+// List of identifiers we use in generated code that may conflict with a
+// similarly-named source identifier. Abort gracefully when we see these to
+// avoid potentially confusing compilation failures in generated code.
 //
 // This only applies to import aliases at the moment. All other identifiers
 // are qualified by a receiver argument, since they're struct fields.
@@ -44,10 +44,20 @@ const (
 // All recievers are single letters, so we don't allow import aliases to be a
 // single letter.
 var badIdents = []string{
-	"src", "srcs", "dst", "dsts", "blk", "buf", "err",
+	"addr", "blk", "buf", "dst", "dsts", "err", "hdr", "len", "ptr", "src", "srcs", "task", "val",
 	// All single-letter identifiers.
 }
 
+// Constructed fromt badIdents in init().
+var badIdentsMap map[string]struct{}
+
+func init() {
+	badIdentsMap = make(map[string]struct{})
+	for _, ident := range badIdents {
+		badIdentsMap[ident] = struct{}{}
+	}
+}
+
 // Generator drives code generation for a single invocation of the go_marshal
 // utility.
 //
@@ -88,16 +98,18 @@ func NewGenerator(srcs []string, out, outTest, pkg string, imports []string) (*G
 	}
 	for _, i := range imports {
 		// All imports on the extra imports list are unconditionally marked as
-		// used, so they're always added to the generated code.
+		// used, so that they're always added to the generated code.
 		g.imports.add(i).markUsed()
 	}
 	g.imports.add(marshalImport).markUsed()
-	// The follow imports may or may not be used by the generated
-	// code, depending what's required for the target types. Don't
-	// mark these imports as used by default.
-	g.imports.add(usermemImport)
+	// The following imports may or may not be used by the generated code,
+	// depending on what's required for the target types. Don't mark these as
+	// used by default.
+	g.imports.add("reflect")
+	g.imports.add("runtime")
 	g.imports.add(safecopyImport)
 	g.imports.add("unsafe")
+	g.imports.add(usermemImport)
 
 	return &g, nil
 }
@@ -229,11 +241,6 @@ func (g *Generator) collectMarshallabeTypes(a *ast.File, f *token.FileSet) []*as
 // identifiers in the generated code don't conflict with any imported package
 // names.
 func (g *Generator) collectImports(a *ast.File, f *token.FileSet) map[string]importStmt {
-	badImportNames := make(map[string]bool)
-	for _, i := range badIdents {
-		badImportNames[i] = true
-	}
-
 	is := make(map[string]importStmt)
 	for _, decl := range a.Decls {
 		gdecl, ok := decl.(*ast.GenDecl)
@@ -250,7 +257,7 @@ func (g *Generator) collectImports(a *ast.File, f *token.FileSet) map[string]imp
 			if len(i.name) == 1 {
 				abortAt(f.Position(spec.Pos()), fmt.Sprintf("Import has a single character local name '%s'; this may conflict with code generated by go_marshal, use a multi-character import alias", i.name))
 			}
-			if badImportNames[i.name] {
+			if _, ok := badIdentsMap[i.name]; ok {
 				abortAt(f.Position(spec.Pos()), fmt.Sprintf("Import name '%s' is likely to conflict with code generated by go_marshal, use a different import alias", i.name))
 			}
 		}
@@ -371,6 +378,7 @@ func (g *Generator) writeTests(ts []*testGenerator) error {
 		return err
 	}
 
+	// Collect and write test import statements.
 	imports := newImportTable()
 	for _, t := range ts {
 		imports.merge(t.imports)
@@ -380,6 +388,7 @@ func (g *Generator) writeTests(ts []*testGenerator) error {
 		return err
 	}
 
+	// Write test functions.
 	for _, t := range ts {
 		if err := t.write(g.outputTest); err != nil {
 			return err
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
index a712c14dc..f25331ac5 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -504,4 +504,102 @@ func (g *interfaceGenerator) emitMarshallable() {
 	})
 	g.emit("}\n\n")
 
+	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("usermem")
+	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
+			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
+			g.emit("%s.MarshalBytes(buf)\n", g.r)
+			g.emit("return task.CopyOutBytes(addr, buf)\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !%s {\n", cond)
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			// Fast serialization.
+			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+			g.emit("val := uintptr(ptr)\n")
+			g.emit("val = val^0\n\n")
+
+			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+			g.emit("var buf []byte\n")
+			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+			g.emit("hdr.Data = val\n")
+			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+			g.emit("len, err := task.CopyOutBytes(addr, buf)\n")
+			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+			g.emit("// must live until after the CopyOutBytes.\n")
+			g.emit("runtime.KeepAlive(%s)\n", g.r)
+			g.emit("return len, err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("usermem")
+	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
+			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
+			g.emit("n, err := task.CopyInBytes(addr, buf)\n")
+			g.emit("if err != nil {\n")
+			g.inIndent(func() {
+				g.emit("return n, err\n")
+			})
+			g.emit("}\n")
+
+			g.emit("%s.UnmarshalBytes(buf)\n", g.r)
+			g.emit("return n, nil\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !%s {\n", cond)
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			// Fast deserialization.
+			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+			g.emit("val := uintptr(ptr)\n")
+			g.emit("val = val^0\n\n")
+
+			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+			g.emit("var buf []byte\n")
+			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+			g.emit("hdr.Data = val\n")
+			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+			g.emit("len, err := task.CopyInBytes(addr, buf)\n")
+			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+			g.emit("// must live until after the CopyInBytes.\n")
+			g.emit("runtime.KeepAlive(%s)\n", g.r)
+			g.emit("return len, err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
 }
diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go
index bcda17c3b..cc760b6d0 100644
--- a/tools/go_marshal/gomarshal/generator_tests.go
+++ b/tools/go_marshal/gomarshal/generator_tests.go
@@ -25,6 +25,7 @@ var standardImports = []string{
 	"fmt",
 	"reflect",
 	"testing",
+
 	"gvisor.dev/gvisor/tools/go_marshal/analysis",
 }
 
diff --git a/tools/go_marshal/gomarshal/util.go b/tools/go_marshal/gomarshal/util.go
index 967537abf..3d86935b4 100644
--- a/tools/go_marshal/gomarshal/util.go
+++ b/tools/go_marshal/gomarshal/util.go
@@ -219,6 +219,11 @@ type sourceBuffer struct {
 	b bytes.Buffer
 }
 
+func (b *sourceBuffer) reset() {
+	b.indent = 0
+	b.b.Reset()
+}
+
 func (b *sourceBuffer) incIndent() {
 	b.indent++
 }
diff --git a/tools/go_marshal/marshal/BUILD b/tools/go_marshal/marshal/BUILD
index ad508c72f..bacfaa5a4 100644
--- a/tools/go_marshal/marshal/BUILD
+++ b/tools/go_marshal/marshal/BUILD
@@ -10,4 +10,7 @@ go_library(
     visibility = [
         "//:sandbox",
     ],
+    deps = [
+        "//pkg/usermem",
+    ],
 )
diff --git a/tools/go_marshal/marshal/marshal.go b/tools/go_marshal/marshal/marshal.go
index a313a27ed..10614ec4d 100644
--- a/tools/go_marshal/marshal/marshal.go
+++ b/tools/go_marshal/marshal/marshal.go
@@ -20,6 +20,26 @@
 // tools/go_marshal. See the go_marshal README for details.
 package marshal
 
+import (
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Task provides a subset of kernel.Task, used in marshalling. We don't import
+// the kernel package directly to avoid circular dependency.
+type Task interface {
+	// CopyScratchBuffer provides a task goroutine-local scratch buffer. See
+	// kernel.CopyScratchBuffer.
+	CopyScratchBuffer(size int) []byte
+
+	// CopyOutBytes writes the contents of b to the task's memory. See
+	// kernel.CopyOutBytes.
+	CopyOutBytes(addr usermem.Addr, b []byte) (int, error)
+
+	// CopyInBytes reads the contents of the task's memory to b. See
+	// kernel.CopyInBytes.
+	CopyInBytes(addr usermem.Addr, b []byte) (int, error)
+}
+
 // Marshallable represents a type that can be marshalled to and from memory.
 type Marshallable interface {
 	// SizeBytes is the size of the memory representation of a type in
@@ -48,13 +68,27 @@ type Marshallable interface {
 	// MarshalBytes.
 	MarshalUnsafe(dst []byte)
 
-	// UnmarshalUnsafe deserializes a type directly to the underlying memory
-	// allocated for the object by the runtime.
+	// UnmarshalUnsafe deserializes a type by directly copying to the underlying
+	// memory allocated for the object by the runtime.
 	//
 	// This allows much faster unmarshalling of types which have no implicit
 	// padding, see Marshallable.Packed. When Packed would return false,
 	// UnmarshalUnsafe should fall back to the safer but slower unmarshal
-	// mechanism implemented in UnmarshalBytes (usually by calling
-	// UnmarshalBytes directly).
+	// mechanism implemented in UnmarshalBytes.
 	UnmarshalUnsafe(src []byte)
+
+	// CopyIn deserializes a Marshallable type from a task's memory. This may
+	// only be called from a task goroutine. This is more efficient than calling
+	// UnmarshalUnsafe on Marshallable.Packed types, as the type being
+	// marshalled does not escape. The implementation should avoid creating
+	// extra copies in memory by directly deserializing to the object's
+	// underlying memory.
+	CopyIn(task Task, addr usermem.Addr) (int, error)
+
+	// CopyOut serializes a Marshallable type to a task's memory. This may only
+	// be called from a task goroutine. This is more efficient than calling
+	// MarshalUnsafe on Marshallable.Packed types, as the type being serialized
+	// does not escape. The implementation should avoid creating extra copies in
+	// memory by directly serializing from the object's underlying memory.
+	CopyOut(task Task, addr usermem.Addr) (int, error)
 }
-- 
cgit v1.2.3


From 6ef63cd7da107d487fda7c48af50fa9802913cd9 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 12 Feb 2020 16:19:06 -0800
Subject: We can now create and jump in iptables. For example:

$ iptables -N foochain
$ iptables -A INPUT -j foochain
---
 pkg/abi/linux/netfilter.go               |   9 +-
 pkg/sentry/socket/netfilter/BUILD        |   1 +
 pkg/sentry/socket/netfilter/netfilter.go |  62 +++++++--
 pkg/sentry/socket/netfilter/targets.go   |  35 +++++
 pkg/tcpip/iptables/iptables.go           | 103 +++++++++------
 pkg/tcpip/iptables/targets.go            |  20 ++-
 pkg/tcpip/iptables/types.go              |  21 +--
 test/iptables/filter_input.go            | 217 ++++++++++++++++++++++++++++---
 test/iptables/iptables_test.go           |  36 +++++
 test/iptables/iptables_util.go           |  10 ++
 10 files changed, 420 insertions(+), 94 deletions(-)
 create mode 100644 pkg/sentry/socket/netfilter/targets.go

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index bbc4df74c..bd2e13ba1 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -225,11 +225,14 @@ type XTEntryTarget struct {
 // SizeOfXTEntryTarget is the size of an XTEntryTarget.
 const SizeOfXTEntryTarget = 32
 
-// XTStandardTarget is a builtin target, one of ACCEPT, DROP, JUMP, QUEUE, or
-// RETURN. It corresponds to struct xt_standard_target in
+// XTStandardTarget is a built-in target, one of ACCEPT, DROP, JUMP, QUEUE,
+// RETURN, or jump. It corresponds to struct xt_standard_target in
 // include/uapi/linux/netfilter/x_tables.h.
 type XTStandardTarget struct {
-	Target  XTEntryTarget
+	Target XTEntryTarget
+	// A positive verdict indicates a jump, and is the offset from the
+	// start of the table to jump to. A negative value means one of the
+	// other built-in targets.
 	Verdict int32
 	_       [4]byte
 }
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index c91ec7494..7cd2ce55b 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "extensions.go",
         "netfilter.go",
+        "targets.go",
         "tcp_matcher.go",
         "udp_matcher.go",
     ],
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3fc80e0de..d322e4144 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -240,13 +240,15 @@ func marshalTarget(target iptables.Target) []byte {
 		return marshalErrorTarget(tg.Name)
 	case iptables.ReturnTarget:
 		return marshalStandardTarget(iptables.RuleReturn)
+	case JumpTarget:
+		return marshalJumpTarget(tg)
 	default:
 		panic(fmt.Errorf("unknown target of type %T", target))
 	}
 }
 
 func marshalStandardTarget(verdict iptables.RuleVerdict) []byte {
-	nflog("convert to binary: marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
+	nflog("convert to binary: marshalling standard target")
 
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
@@ -274,6 +276,23 @@ func marshalErrorTarget(errorName string) []byte {
 	return binary.Marshal(ret, usermem.ByteOrder, target)
 }
 
+func marshalJumpTarget(jt JumpTarget) []byte {
+	nflog("convert to binary: marshalling jump target")
+
+	// The target's name will be the empty string.
+	target := linux.XTStandardTarget{
+		Target: linux.XTEntryTarget{
+			TargetSize: linux.SizeOfXTStandardTarget,
+		},
+		// Verdict is overloaded by the ABI. When positive, it holds
+		// the jump offset from the start of the table.
+		Verdict: int32(jt.Offset),
+	}
+
+	ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
+	return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
 // translateFromStandardVerdict translates verdicts the same way as the iptables
 // tool.
 func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 {
@@ -335,7 +354,8 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 	// Convert input into a list of rules and their offsets.
 	var offset uint32
-	var offsets []uint32
+	// offsets maps rule byte offsets to their position in table.Rules.
+	offsets := map[uint32]int{}
 	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
 		nflog("set entries: processing entry at offset %d", offset)
 
@@ -396,11 +416,12 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 			Target:   target,
 			Matchers: matchers,
 		})
-		offsets = append(offsets, offset)
+		offsets[offset] = int(entryIdx)
 		offset += uint32(entry.NextOffset)
 
 		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
 			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+			return syserr.ErrInvalidArgument
 		}
 	}
 
@@ -409,13 +430,13 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	for hook, _ := range replace.HookEntry {
 		if table.ValidHooks()&(1<<hook) != 0 {
 			hk := hookFromLinux(hook)
-			for ruleIdx, offset := range offsets {
+			for offset, ruleIdx := range offsets {
 				if offset == replace.HookEntry[hook] {
 					table.BuiltinChains[hk] = ruleIdx
 				}
 				if offset == replace.Underflow[hook] {
 					if !validUnderflow(table.Rules[ruleIdx]) {
-						nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP.")
+						nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP")
 						return syserr.ErrInvalidArgument
 					}
 					table.Underflows[hk] = ruleIdx
@@ -444,16 +465,35 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		// - There's some other rule after it.
 		// - There are no matchers.
 		if ruleIdx == len(table.Rules)-1 {
-			nflog("user chain must have a rule or default policy.")
+			nflog("user chain must have a rule or default policy")
 			return syserr.ErrInvalidArgument
 		}
 		if len(table.Rules[ruleIdx].Matchers) != 0 {
-			nflog("user chain's first node must have no matcheres.")
+			nflog("user chain's first node must have no matchers")
 			return syserr.ErrInvalidArgument
 		}
 		table.UserChains[target.Name] = ruleIdx + 1
 	}
 
+	// Set each jump to point to the appropriate rule. Right now they hold byte
+	// offsets.
+	for ruleIdx, rule := range table.Rules {
+		jump, ok := rule.Target.(JumpTarget)
+		if !ok {
+			continue
+		}
+
+		// Find the rule corresponding to the jump rule offset.
+		jumpTo, ok := offsets[jump.Offset]
+		if !ok {
+			nflog("failed to find a rule to jump to")
+			return syserr.ErrInvalidArgument
+		}
+		jump.RuleNum = jumpTo
+		rule.Target = jump
+		table.Rules[ruleIdx] = rule
+	}
+
 	// TODO(gvisor.dev/issue/170): Support other chains.
 	// Since we only support modifying the INPUT chain right now, make sure
 	// all other chains point to ACCEPT rules.
@@ -548,7 +588,13 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 		buf = optVal[:linux.SizeOfXTStandardTarget]
 		binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
 
-		return translateToStandardTarget(standardTarget.Verdict)
+		if standardTarget.Verdict < 0 {
+			// A Verdict < 0 indicates a non-jump verdict.
+			return translateToStandardTarget(standardTarget.Verdict)
+		} else {
+			// A verdict >= 0 indicates a jump.
+			return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil
+		}
 
 	case errorTargetName:
 		// Error target.
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
new file mode 100644
index 000000000..c421b87cf
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+)
+
+// JumpTarget implements iptables.Target.
+type JumpTarget struct {
+	// Offset is the byte offset of the rule to jump to. It is used for
+	// marshaling and unmarshaling.
+	Offset uint32
+
+	// RuleNum is the rule to jump to.
+	RuleNum int
+}
+
+// Action implements iptables.Target.Action.
+func (jt JumpTarget) Action(tcpip.PacketBuffer) (iptables.RuleVerdict, int) {
+	return iptables.RuleJump, jt.RuleNum
+}
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 75a433a3b..dbaccbb36 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -135,25 +135,53 @@ func EmptyFilterTable() Table {
 	}
 }
 
+// A chainVerdict is what a table decides should be done with a packet.
+type chainVerdict int
+
+const (
+	// chainAccept indicates the packet should continue through netstack.
+	chainAccept chainVerdict = iota
+
+	// chainAccept indicates the packet should be dropped.
+	chainDrop
+
+	// chainReturn indicates the packet should return to the calling chain
+	// or the underflow rule of a builtin chain.
+	chainReturn
+)
+
 // Check runs pkt through the rules for hook. It returns true when the packet
 // should continue traversing the network stack and false when it should be
 // dropped.
 //
 // Precondition: pkt.NetworkHeader is set.
 func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
-	// TODO(gvisor.dev/issue/170): A lot of this is uncomplicated because
-	// we're missing features. Jumps, the call stack, etc. aren't checked
-	// for yet because we're yet to support them.
-
 	// Go through each table containing the hook.
 	for _, tablename := range it.Priorities[hook] {
-		switch verdict := it.checkTable(hook, pkt, tablename); verdict {
+		table := it.Tables[tablename]
+		ruleIdx := table.BuiltinChains[hook]
+		switch verdict := it.checkChain(hook, pkt, table, ruleIdx); verdict {
 		// If the table returns Accept, move on to the next table.
-		case TableAccept:
+		case chainAccept:
 			continue
 		// The Drop verdict is final.
-		case TableDrop:
+		case chainDrop:
 			return false
+		case chainReturn:
+			// Any Return from a built-in chain means we have to
+			// call the underflow.
+			underflow := table.Rules[table.Underflows[hook]]
+			switch v, _ := underflow.Target.Action(pkt); v {
+			case RuleAccept:
+				continue
+			case RuleDrop:
+				return false
+			case RuleJump, RuleReturn:
+				panic("Underflows should only return RuleAccept or RuleDrop.")
+			default:
+				panic(fmt.Sprintf("Unknown verdict: %d", v))
+			}
+
 		default:
 			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
 		}
@@ -164,37 +192,37 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 }
 
 // Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) TableVerdict {
+func (it *IPTables) checkChain(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) chainVerdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
-	table := it.Tables[tablename]
-	for ruleIdx := table.BuiltinChains[hook]; ruleIdx < len(table.Rules); ruleIdx++ {
-		switch verdict := it.checkRule(hook, pkt, table, ruleIdx); verdict {
+	for ruleIdx < len(table.Rules) {
+		switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx); verdict {
 		case RuleAccept:
-			return TableAccept
+			return chainAccept
 
 		case RuleDrop:
-			return TableDrop
-
-		case RuleContinue:
-			continue
+			return chainDrop
 
 		case RuleReturn:
-			// TODO(gvisor.dev/issue/170): We don't implement jump
-			// yet, so any Return is from a built-in chain. That
-			// means we have to to call the underflow.
-			underflow := table.Rules[table.Underflows[hook]]
-			// Underflow is guaranteed to be an unconditional
-			// ACCEPT or DROP.
-			switch v, _ := underflow.Target.Action(pkt); v {
-			case RuleAccept:
-				return TableAccept
-			case RuleDrop:
-				return TableDrop
-			case RuleContinue, RuleReturn:
-				panic("Underflows should only return RuleAccept or RuleDrop.")
+			return chainReturn
+
+		case RuleJump:
+			// "Jumping" to the next rule just means we're
+			// continuing on down the list.
+			if jumpTo == ruleIdx+1 {
+				ruleIdx++
+				continue
+			}
+			switch verdict := it.checkChain(hook, pkt, table, jumpTo); verdict {
+			case chainAccept:
+				return chainAccept
+			case chainDrop:
+				return chainDrop
+			case chainReturn:
+				ruleIdx++
+				continue
 			default:
-				panic(fmt.Sprintf("Unknown verdict: %d", v))
+				panic(fmt.Sprintf("Unknown verdict: %d", verdict))
 			}
 
 		default:
@@ -205,17 +233,18 @@ func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename stri
 
 	// We got through the entire table without a decision. Default to DROP
 	// for safety.
-	return TableDrop
+	return chainDrop
 }
 
 // Precondition: pk.NetworkHeader is set.
-func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) RuleVerdict {
+func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) (RuleVerdict, int) {
 	rule := table.Rules[ruleIdx]
 
 	// First check whether the packet matches the IP header filter.
 	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
 	if rule.Filter.Protocol != 0 && rule.Filter.Protocol != header.IPv4(pkt.NetworkHeader).TransportProtocol() {
-		return RuleContinue
+		// Continue on to the next rule.
+		return RuleJump, ruleIdx + 1
 	}
 
 	// Go through each rule matcher. If they all match, run
@@ -223,14 +252,14 @@ func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ru
 	for _, matcher := range rule.Matchers {
 		matches, hotdrop := matcher.Match(hook, pkt, "")
 		if hotdrop {
-			return RuleDrop
+			return RuleDrop, 0
 		}
 		if !matches {
-			return RuleContinue
+			// Continue on to the next rule.
+			return RuleJump, ruleIdx + 1
 		}
 	}
 
 	// All the matchers matched, so run the target.
-	verdict, _ := rule.Target.Action(pkt)
-	return verdict
+	return rule.Target.Action(pkt)
 }
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 9fc60cfad..81a2e39a2 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// This file contains various Targets.
-
 package iptables
 
 import (
@@ -25,16 +23,16 @@ import (
 type AcceptTarget struct{}
 
 // Action implements Target.Action.
-func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
-	return RuleAccept, ""
+func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
+	return RuleAccept, 0
 }
 
 // DropTarget drops packets.
 type DropTarget struct{}
 
 // Action implements Target.Action.
-func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
-	return RuleDrop, ""
+func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
+	return RuleDrop, 0
 }
 
 // ErrorTarget logs an error and drops the packet. It represents a target that
@@ -42,9 +40,9 @@ func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
 type ErrorTarget struct{}
 
 // Action implements Target.Action.
-func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
 	log.Debugf("ErrorTarget triggered.")
-	return RuleDrop, ""
+	return RuleDrop, 0
 }
 
 // UserChainTarget marks a rule as the beginning of a user chain.
@@ -53,7 +51,7 @@ type UserChainTarget struct {
 }
 
 // Action implements Target.Action.
-func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
+func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
 	panic("UserChainTarget should never be called.")
 }
 
@@ -62,6 +60,6 @@ func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
 type ReturnTarget struct{}
 
 // Action implements Target.Action.
-func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
-	return RuleReturn, ""
+func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
+	return RuleReturn, 0
 }
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 5735d001b..7d032fd23 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -56,17 +56,6 @@ const (
 	NumHooks
 )
 
-// A TableVerdict is what a table decides should be done with a packet.
-type TableVerdict int
-
-const (
-	// TableAccept indicates the packet should continue through netstack.
-	TableAccept TableVerdict = iota
-
-	// TableAccept indicates the packet should be dropped.
-	TableDrop
-)
-
 // A RuleVerdict is what a rule decides should be done with a packet.
 type RuleVerdict int
 
@@ -74,12 +63,12 @@ const (
 	// RuleAccept indicates the packet should continue through netstack.
 	RuleAccept RuleVerdict = iota
 
-	// RuleContinue indicates the packet should continue to the next rule.
-	RuleContinue
-
 	// RuleDrop indicates the packet should be dropped.
 	RuleDrop
 
+	// RuleJump indicates the packet should jump to another chain.
+	RuleJump
+
 	// RuleReturn indicates the packet should return to the previous chain.
 	RuleReturn
 )
@@ -174,6 +163,6 @@ type Matcher interface {
 type Target interface {
 	// Action takes an action on the packet and returns a verdict on how
 	// traversal should (or should not) continue. If the return value is
-	// Jump, it also returns the name of the chain to jump to.
-	Action(packet tcpip.PacketBuffer) (RuleVerdict, string)
+	// Jump, it also returns the index of the rule to jump to.
+	Action(packet tcpip.PacketBuffer) (RuleVerdict, int)
 }
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index e26d6a7d2..706c09cea 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -26,6 +26,7 @@ const (
 	acceptPort       = 2402
 	sendloopDuration = 2 * time.Second
 	network          = "udp4"
+	chainName        = "foochain"
 )
 
 func init() {
@@ -40,6 +41,12 @@ func init() {
 	RegisterTestCase(FilterInputDefaultPolicyAccept{})
 	RegisterTestCase(FilterInputDefaultPolicyDrop{})
 	RegisterTestCase(FilterInputReturnUnderflow{})
+	RegisterTestCase(FilterInputSerializeJump{})
+	RegisterTestCase(FilterInputJumpBasic{})
+	RegisterTestCase(FilterInputJumpReturn{})
+	RegisterTestCase(FilterInputJumpReturnDrop{})
+	RegisterTestCase(FilterInputJumpBuiltin{})
+	RegisterTestCase(FilterInputJumpTwice{})
 }
 
 // FilterInputDropUDP tests that we can drop UDP traffic.
@@ -267,13 +274,12 @@ func (FilterInputMultiUDPRules) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputMultiUDPRules) ContainerAction(ip net.IP) error {
-	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
-		return err
-	}
-	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"); err != nil {
-		return err
+	rules := [][]string{
+		{"-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"},
+		{"-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"},
+		{"-L"},
 	}
-	return filterTable("-L")
+	return filterTableRules(rules)
 }
 
 // LocalAction implements TestCase.LocalAction.
@@ -314,14 +320,13 @@ func (FilterInputCreateUserChain) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputCreateUserChain) ContainerAction(ip net.IP) error {
-	// Create a chain.
-	const chainName = "foochain"
-	if err := filterTable("-N", chainName); err != nil {
-		return err
+	rules := [][]string{
+		// Create a chain.
+		{"-N", chainName},
+		// Add a simple rule to the chain.
+		{"-A", chainName, "-j", "DROP"},
 	}
-
-	// Add a simple rule to the chain.
-	return filterTable("-A", chainName, "-j", "DROP")
+	return filterTableRules(rules)
 }
 
 // LocalAction implements TestCase.LocalAction.
@@ -396,13 +401,12 @@ func (FilterInputReturnUnderflow) Name() string {
 func (FilterInputReturnUnderflow) ContainerAction(ip net.IP) error {
 	// Add a RETURN rule followed by an unconditional accept, and set the
 	// default policy to DROP.
-	if err := filterTable("-A", "INPUT", "-j", "RETURN"); err != nil {
-		return err
+	rules := [][]string{
+		{"-A", "INPUT", "-j", "RETURN"},
+		{"-A", "INPUT", "-j", "DROP"},
+		{"-P", "INPUT", "ACCEPT"},
 	}
-	if err := filterTable("-A", "INPUT", "-j", "DROP"); err != nil {
-		return err
-	}
-	if err := filterTable("-P", "INPUT", "ACCEPT"); err != nil {
+	if err := filterTableRules(rules); err != nil {
 		return err
 	}
 
@@ -415,3 +419,178 @@ func (FilterInputReturnUnderflow) ContainerAction(ip net.IP) error {
 func (FilterInputReturnUnderflow) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
+
+// Verify that we can serialize jumps.
+type FilterInputSerializeJump struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputSerializeJump) Name() string {
+	return "FilterInputSerializeJump"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputSerializeJump) ContainerAction(ip net.IP) error {
+	// Write a JUMP rule, the serialize it with `-L`.
+	rules := [][]string{
+		{"-N", chainName},
+		{"-A", "INPUT", "-j", chainName},
+		{"-L"},
+	}
+	return filterTableRules(rules)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputSerializeJump) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// Jump to a chain and execute a rule there.
+type FilterInputJumpBasic struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpBasic) Name() string {
+	return "FilterInputJumpBasic"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpBasic) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-N", chainName},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", chainName, "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on acceptPort.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpBasic) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// Jump, return, and execute a rule.
+type FilterInputJumpReturn struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpReturn) Name() string {
+	return "FilterInputJumpReturn"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpReturn) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-N", chainName},
+		{"-P", "INPUT", "ACCEPT"},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", chainName, "-j", "RETURN"},
+		{"-A", chainName, "-j", "DROP"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on acceptPort.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpReturn) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+type FilterInputJumpReturnDrop struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpReturnDrop) Name() string {
+	return "FilterInputJumpReturnDrop"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpReturnDrop) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-N", chainName},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", "INPUT", "-j", "DROP"},
+		{"-A", chainName, "-j", "RETURN"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort)
+	} else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+		return fmt.Errorf("error reading: %v", err)
+	}
+
+	// At this point we know that reading timed out and never received a
+	// packet.
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpReturnDrop) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// Jumping to a top-levl chain is illegal.
+type FilterInputJumpBuiltin struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpBuiltin) Name() string {
+	return "FilterInputJumpBuiltin"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpBuiltin) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-j", "OUTPUT"); err == nil {
+		return fmt.Errorf("iptables should be unable to jump to a built-in chain")
+	}
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpBuiltin) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// Jump twice, then return twice and execute a rule.
+type FilterInputJumpTwice struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpTwice) Name() string {
+	return "FilterInputJumpTwice"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpTwice) ContainerAction(ip net.IP) error {
+	const chainName2 = chainName + "2"
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-N", chainName},
+		{"-N", chainName2},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", chainName, "-j", chainName2},
+		{"-A", "INPUT", "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// UDP packets should jump and return twice, eventually hitting the
+	// ACCEPT rule.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpTwice) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 46a7c99b0..0621861eb 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -249,3 +249,39 @@ func TestFilterOutputDropTCPSrcPort(t *testing.T) {
 		t.Fatal(err)
 	}
 }
+
+func TestJumpSerialize(t *testing.T) {
+	if err := singleTest(FilterInputSerializeJump{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpBasic(t *testing.T) {
+	if err := singleTest(FilterInputJumpBasic{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpReturn(t *testing.T) {
+	if err := singleTest(FilterInputJumpReturn{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpReturnDrop(t *testing.T) {
+	if err := singleTest(FilterInputJumpReturnDrop{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpBuiltin(t *testing.T) {
+	if err := singleTest(FilterInputJumpBuiltin{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpTwice(t *testing.T) {
+	if err := singleTest(FilterInputJumpTwice{}); err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index 043114c78..293c4e6ed 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -35,6 +35,16 @@ func filterTable(args ...string) error {
 	return nil
 }
 
+// filterTableRules is like filterTable, but runs multiple iptables commands.
+func filterTableRules(argsList [][]string) error {
+	for _, args := range argsList {
+		if err := filterTable(args...); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
 // listenUDP listens on a UDP port and returns the value of net.Conn.Read() for
 // the first read on that port.
 func listenUDP(port int, timeout time.Duration) error {
-- 
cgit v1.2.3


From a6024f7f5f6f438c11e30be0f93657b1956fd5ba Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 13 Feb 2020 17:56:34 -0800
Subject: Add FileExec flag to OpenOptions

This allow callers to say whether the file is being
opened to be executed, so that the proper checks can
be done from FilesystemImpl.OpenAt()

Updates #1623

PiperOrigin-RevId: 295042595
---
 pkg/sentry/fsimpl/ext/filesystem.go            |  2 +-
 pkg/sentry/fsimpl/ext/inode.go                 | 12 ++++++------
 pkg/sentry/fsimpl/gofer/filesystem.go          | 24 ++++++++++++------------
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go |  4 ++--
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go       |  6 +++---
 pkg/sentry/fsimpl/kernfs/filesystem.go         | 10 +++++-----
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go    |  6 +++---
 pkg/sentry/fsimpl/kernfs/kernfs.go             |  2 +-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go        |  8 ++++----
 pkg/sentry/fsimpl/proc/subtasks.go             |  4 ++--
 pkg/sentry/fsimpl/proc/task.go                 |  4 ++--
 pkg/sentry/fsimpl/proc/tasks.go                |  4 ++--
 pkg/sentry/fsimpl/sys/sys.go                   |  4 ++--
 pkg/sentry/fsimpl/tmpfs/filesystem.go          |  2 +-
 pkg/sentry/vfs/options.go                      |  5 +++++
 pkg/sentry/vfs/permissions.go                  | 19 ++++++++++++-------
 pkg/sentry/vfs/vfs.go                          | 19 +++++++++++++++++++
 17 files changed, 82 insertions(+), 53 deletions(-)

diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 07bf58953..e05429d41 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -296,7 +296,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 {
 		return nil, syserror.EROFS
 	}
-	return inode.open(rp, vfsd, opts.Flags)
+	return inode.open(rp, vfsd, &opts)
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 191b39970..6962083f5 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -148,8 +148,8 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 }
 
 // open creates and returns a file description for the dentry passed in.
-func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
-	ats := vfs.AccessTypesForOpenFlags(flags)
+func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(opts)
 	if err := in.checkPermissions(rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
@@ -157,7 +157,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
 		return &fd.vfsfd, nil
@@ -168,17 +168,17 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
 		return &fd.vfsfd, nil
 	case *symlink:
-		if flags&linux.O_PATH == 0 {
+		if opts.Flags&linux.O_PATH == 0 {
 			// Can't open symlinks without O_PATH.
 			return nil, syserror.ELOOP
 		}
 		var fd symlinkFD
-		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
+		fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", in.impl))
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 8eb61debf..138adb9f7 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -593,7 +593,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		}
 	}
 	if rp.Done() {
-		return start.openLocked(ctx, rp, opts.Flags)
+		return start.openLocked(ctx, rp, &opts)
 	}
 
 afterTrailingSymlink:
@@ -633,12 +633,12 @@ afterTrailingSymlink:
 		start = parent
 		goto afterTrailingSymlink
 	}
-	return child.openLocked(ctx, rp, opts.Flags)
+	return child.openLocked(ctx, rp, &opts)
 }
 
 // Preconditions: fs.renameMu must be locked.
-func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags uint32) (*vfs.FileDescription, error) {
-	ats := vfs.AccessTypesForOpenFlags(flags)
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(opts)
 	if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil {
 		return nil, err
 	}
@@ -646,11 +646,11 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui
 	filetype := d.fileType()
 	switch {
 	case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD:
-		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0); err != nil {
+		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil {
 			return nil, err
 		}
 		fd := &regularFileFD{}
-		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
 			AllowDirectIO: true,
 		}); err != nil {
 			return nil, err
@@ -658,21 +658,21 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui
 		return &fd.vfsfd, nil
 	case filetype == linux.S_IFDIR:
 		// Can't open directories with O_CREAT.
-		if flags&linux.O_CREAT != 0 {
+		if opts.Flags&linux.O_CREAT != 0 {
 			return nil, syserror.EISDIR
 		}
 		// Can't open directories writably.
 		if ats&vfs.MayWrite != 0 {
 			return nil, syserror.EISDIR
 		}
-		if flags&linux.O_DIRECT != 0 {
+		if opts.Flags&linux.O_DIRECT != 0 {
 			return nil, syserror.EINVAL
 		}
 		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
 			return nil, err
 		}
 		fd := &directoryFD{}
-		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
 		return &fd.vfsfd, nil
@@ -680,17 +680,17 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui
 		// Can't open symlinks without O_PATH (which is unimplemented).
 		return nil, syserror.ELOOP
 	default:
-		if flags&linux.O_DIRECT != 0 {
+		if opts.Flags&linux.O_DIRECT != 0 {
 			return nil, syserror.EINVAL
 		}
-		h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0)
+		h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0)
 		if err != nil {
 			return nil, err
 		}
 		fd := &specialFileFD{
 			handle: h,
 		}
-		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			h.close(ctx)
 			return nil, err
 		}
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 733792c78..d092ccb2a 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -53,9 +53,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.Dy
 }
 
 // Open implements Inode.Open.
-func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &DynamicBytesFD{}
-	if err := fd.Init(rp.Mount(), vfsd, f.data, flags); err != nil {
+	if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 6104751c8..eda781155 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -43,12 +43,12 @@ type GenericDirectoryFD struct {
 }
 
 // Init initializes a GenericDirectoryFD.
-func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) error {
-	if vfs.AccessTypesForOpenFlags(flags)&vfs.MayWrite != 0 {
+func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) error {
+	if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
 		// Can't open directories for writing.
 		return syserror.EISDIR
 	}
-	if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
 		return err
 	}
 	fd.children = children
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index e49303c26..ee98eb66a 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -365,7 +365,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	// appropriate bits in rp), but are returned by
 	// FileDescriptionImpl.StatusFlags().
 	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW
-	ats := vfs.AccessTypesForOpenFlags(opts.Flags)
+	ats := vfs.AccessTypesForOpenFlags(&opts)
 
 	// Do not create new file.
 	if opts.Flags&linux.O_CREAT == 0 {
@@ -379,7 +379,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
-		return inode.Open(rp, vfsd, opts.Flags)
+		return inode.Open(rp, vfsd, opts)
 	}
 
 	// May create new file.
@@ -398,7 +398,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
-		return inode.Open(rp, vfsd, opts.Flags)
+		return inode.Open(rp, vfsd, opts)
 	}
 afterTrailingSymlink:
 	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
@@ -438,7 +438,7 @@ afterTrailingSymlink:
 			return nil, err
 		}
 		parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
-		return child.Impl().(*Dentry).inode.Open(rp, child, opts.Flags)
+		return child.Impl().(*Dentry).inode.Open(rp, child, opts)
 	}
 	// Open existing file or follow symlink.
 	if mustCreate {
@@ -463,7 +463,7 @@ afterTrailingSymlink:
 	if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
-	return childInode.Open(rp, childVFSD, opts.Flags)
+	return childInode.Open(rp, childVFSD, opts)
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index adca2313f..099d70a16 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -507,7 +507,7 @@ type InodeSymlink struct {
 }
 
 // Open implements Inode.Open.
-func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	return nil, syserror.ELOOP
 }
 
@@ -549,8 +549,8 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.F
 }
 
 // Open implements kernfs.Inode.
-func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, flags)
+	fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
 	return fd.VFSFileDescription(), nil
 }
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 79ebea8a5..c74fa999b 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -303,7 +303,7 @@ type Inode interface {
 	// inode for its lifetime.
 	//
 	// Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry.
-	Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error)
+	Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
 }
 
 type inodeRefs interface {
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index ee65cf491..96a16e654 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -113,9 +113,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
 	return &dir.dentry
 }
 
-func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags); err != nil {
+	if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
@@ -143,9 +143,9 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 	return &dir.dentry
 }
 
-func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 353e37195..102af0e93 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -114,9 +114,9 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
 }
 
 // Open implements kernfs.Inode.
-func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index eb5bc62c0..2d814668a 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -98,9 +98,9 @@ func (i *taskInode) Valid(ctx context.Context) bool {
 }
 
 // Open implements kernfs.Inode.
-func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 14bd334e8..ebe21630c 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -205,9 +205,9 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 }
 
 // Open implements kernfs.Inode.
-func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index e35d52d17..d693fceae 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -97,9 +97,9 @@ func (d *dir) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error {
 }
 
 // Open implements kernfs.Inode.Open.
-func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 72bc15264..8785452b6 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -334,7 +334,7 @@ afterTrailingSymlink:
 }
 
 func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
-	ats := vfs.AccessTypesForOpenFlags(opts.Flags)
+	ats := vfs.AccessTypesForOpenFlags(opts)
 	if !afterCreate {
 		if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
 			return nil, err
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index b7774bf28..fdf8be157 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -72,6 +72,11 @@ type OpenOptions struct {
 	// If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the
 	// created file.
 	Mode linux.FileMode
+
+	// FileExec is set when the file is being opened to be executed.
+	// VirtualFilesystem.OpenAt() checks that the caller has execute permissions
+	// on the file, and that the file is a regular file.
+	FileExec bool
 }
 
 // ReadOptions contains options to FileDescription.PRead(),
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index f664581f4..8e250998a 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -103,17 +103,22 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
 // AccessTypesForOpenFlags returns MayRead|MayWrite in this case.
 //
 // Use May{Read,Write}FileWithOpenFlags() for these checks instead.
-func AccessTypesForOpenFlags(flags uint32) AccessTypes {
-	switch flags & linux.O_ACCMODE {
+func AccessTypesForOpenFlags(opts *OpenOptions) AccessTypes {
+	ats := AccessTypes(0)
+	if opts.FileExec {
+		ats |= MayExec
+	}
+
+	switch opts.Flags & linux.O_ACCMODE {
 	case linux.O_RDONLY:
-		if flags&linux.O_TRUNC != 0 {
-			return MayRead | MayWrite
+		if opts.Flags&linux.O_TRUNC != 0 {
+			return ats | MayRead | MayWrite
 		}
-		return MayRead
+		return ats | MayRead
 	case linux.O_WRONLY:
-		return MayWrite
+		return ats | MayWrite
 	default:
-		return MayRead | MayWrite
+		return ats | MayRead | MayWrite
 	}
 }
 
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 908c69f91..9629afee9 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -379,6 +379,25 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 		fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
 		if err == nil {
 			vfs.putResolvingPath(rp)
+
+			// TODO(gvisor.dev/issue/1193): Move inside fsimpl to avoid another call
+			// to FileDescription.Stat().
+			if opts.FileExec {
+				// Only a regular file can be executed.
+				stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
+				if err != nil {
+					return nil, err
+				}
+				if stat.Mask&linux.STATX_TYPE != 0 {
+					// This shouldn't happen, but if type can't be retrieved, file can't
+					// be executed.
+					return nil, syserror.EACCES
+				}
+				if linux.FileMode(stat.Mode).FileType() != linux.ModeRegular {
+					return nil, syserror.EACCES
+				}
+			}
+
 			return fd, nil
 		}
 		if !rp.handleError(err) {
-- 
cgit v1.2.3


From ebaf29abeb5e6e6cd87f4b0088719abb30e9d4cb Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Fri, 14 Feb 2020 02:04:41 -0500
Subject: passed the kvm test case of "TestKernelSyscall" on Arm64

For kvm test case "TestKernelSyscall",
redpill/syscall(-1) in guest kernel level will be trapped in el1_svc.
And in el1_svc, we use mmio_exit to leave the guest.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/ring0/entry_arm64.s   | 73 ++++++++++++++++++++++++++-----
 pkg/sentry/platform/ring0/kernel_arm64.go |  8 ++++
 2 files changed, 70 insertions(+), 11 deletions(-)

diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index baa6c4910..d42eda37b 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -25,10 +25,14 @@
 // not available for calls.
 //
 
+// ERET returns using the ELR and SPSR for the current exception level.
 #define ERET() \
   WORD $0xd69f03e0
 
+// RSV_REG is a register that holds el1 information temporarily.
 #define RSV_REG 	R18_PLATFORM
+
+// RSV_REG_APP is a register that holds el0 information temporarily.
 #define RSV_REG_APP 	R9
 
 #define FPEN_NOTRAP 	0x3
@@ -36,6 +40,12 @@
 
 #define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT)
 
+// Saves a register set.
+//
+// This is a macro because it may need to executed in contents where a stack is
+// not available for calls.
+//
+// The following registers are not saved: R9, R18.
 #define REGISTERS_SAVE(reg, offset) \
   MOVD R0, offset+PTRACE_R0(reg); \
   MOVD R1, offset+PTRACE_R1(reg); \
@@ -67,6 +77,12 @@
   MOVD R29, offset+PTRACE_R29(reg); \
   MOVD R30, offset+PTRACE_R30(reg);
 
+// Loads a register set.
+//
+// This is a macro because it may need to executed in contents where a stack is
+// not available for calls.
+//
+// The following registers are not loaded: R9, R18.
 #define REGISTERS_LOAD(reg, offset) \
   MOVD offset+PTRACE_R0(reg), R0; \
   MOVD offset+PTRACE_R1(reg), R1; \
@@ -98,7 +114,7 @@
   MOVD offset+PTRACE_R29(reg), R29; \
   MOVD offset+PTRACE_R30(reg), R30;
 
-//NOP
+// NOP-s
 #define nop31Instructions() \
         WORD $0xd503201f; \
         WORD $0xd503201f; \
@@ -254,6 +270,7 @@
 #define ESR_ELx_WFx_ISS_WFE	(UL(1) << 0)
 #define ESR_ELx_xVC_IMM_MASK	((1UL << 16) - 1)
 
+// LOAD_KERNEL_ADDRESS loads a kernel address.
 #define LOAD_KERNEL_ADDRESS(from, to) \
 	MOVD from, to; \
 	ORR $0xffff000000000000, to, to;
@@ -263,15 +280,18 @@
 	LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \
 	MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \
 	MOVD RSV_REG, RSP; \
+	WORD $0xd538d092; \   //MRS   TPIDR_EL1, R18
 	ISB $15; \
 	DSB $15;
 
+// SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application.
 #define SWITCH_TO_APP_PAGETABLE(from) \
 	MOVD CPU_TTBR0_APP(from), RSV_REG; \
 	WORD $0xd5182012; \	//        MSR R18, TTBR0_EL1
 	ISB $15; \
 	DSB $15;
 
+// SWITCH_TO_KVM_PAGETABLE sets the kvm pagetable.
 #define SWITCH_TO_KVM_PAGETABLE(from) \
 	MOVD CPU_TTBR0_KVM(from), RSV_REG; \
 	WORD $0xd5182012; \	//        MSR R18, TTBR0_EL1
@@ -294,6 +314,7 @@
 	WORD $0xd5181040; \ //MSR R0, CPACR_EL1
 	ISB $15;
 
+// KERNEL_ENTRY_FROM_EL0 is the entry code of the vcpu from el0 to el1.
 #define KERNEL_ENTRY_FROM_EL0 \
 	SUB $16, RSP, RSP; \		// step1, save r18, r9 into kernel temporary stack.
 	STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \
@@ -315,19 +336,22 @@
 	WORD $0xd5384103; \      //  MRS SP_EL0, R3
 	MOVD R3, PTRACE_SP(RSV_REG_APP);
 
+// KERNEL_ENTRY_FROM_EL1 is the entry code of the vcpu from el1 to el1.
 #define KERNEL_ENTRY_FROM_EL1 \
 	WORD $0xd538d092; \   //MRS   TPIDR_EL1, R18
-	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \	// save sentry context
+	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \	// Save sentry context.
 	MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG); \
 	WORD $0xd5384004; \    //    MRS SPSR_EL1, R4
 	MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \
 	MRS ELR_EL1, R4; \
 	MOVD R4, CPU_REGISTERS+PTRACE_PC(RSV_REG); \
 	MOVD RSP, R4; \
-	MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG);
+	MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \
+	LOAD_KERNEL_STACK(RSV_REG);  // Load the temporary stack.
 
+// Halt halts execution.
 TEXT ·Halt(SB),NOSPLIT,$0
-	// clear bluepill.
+	// Clear bluepill.
 	WORD $0xd538d092   //MRS   TPIDR_EL1, R18
 	CMP RSV_REG, R9
 	BNE mmio_exit
@@ -341,8 +365,22 @@ mmio_exit:
 	// MMIO_EXIT.
 	MOVD $0, R9
 	MOVD R0, 0xffff000000001000(R9)
-	B ·kernelExitToEl1(SB)
+	RET
+
+// HaltAndResume halts execution and point the pointer to the resume function.
+TEXT ·HaltAndResume(SB),NOSPLIT,$0
+	BL ·Halt(SB)
+	B ·kernelExitToEl1(SB) // Resume.
 
+// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume.
+TEXT ·HaltEl1SvcAndResume(SB),NOSPLIT,$0
+	WORD $0xd538d092            // MRS TPIDR_EL1, R18
+	MOVD CPU_SELF(RSV_REG), R3  // Load vCPU.
+	MOVD R3, 8(RSP)             // First argument (vCPU).
+	CALL ·kernelSyscall(SB)     // Call the trampoline.
+	B ·kernelExitToEl1(SB)      // Resume.
+
+// Shutdown stops the guest.
 TEXT ·Shutdown(SB),NOSPLIT,$0
 	// PSCI EVENT.
 	MOVD $0x84000009, R0
@@ -429,6 +467,7 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
 	ERET()
 
+// Start is the CPU entrypoint.
 TEXT ·Start(SB),NOSPLIT,$0
 	IRQ_DISABLE
 	MOVD R8, RSV_REG
@@ -437,18 +476,23 @@ TEXT ·Start(SB),NOSPLIT,$0
 
 	B ·kernelExitToEl1(SB)
 
+// El1_sync_invalid is the handler for an invalid EL1_sync.
 TEXT ·El1_sync_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_irq_invalid is the handler for an invalid El1_irq.
 TEXT ·El1_irq_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_fiq_invalid is the handler for an invalid El1_fiq.
 TEXT ·El1_fiq_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_error_invalid is the handler for an invalid El1_error.
 TEXT ·El1_error_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_sync is the handler for El1_sync.
 TEXT ·El1_sync(SB),NOSPLIT,$0
 	KERNEL_ENTRY_FROM_EL1
 	WORD $0xd5385219        // MRS ESR_EL1, R25
@@ -484,10 +528,10 @@ el1_da:
 	MOVD $PageFault, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el1_ia:
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el1_sp_pc:
 	B ·Shutdown(SB)
@@ -496,7 +540,9 @@ el1_undef:
 	B ·Shutdown(SB)
 
 el1_svc:
-	B ·Halt(SB)
+	MOVD $0, CPU_ERROR_CODE(RSV_REG)
+	MOVD $0, CPU_ERROR_TYPE(RSV_REG)
+	B ·HaltEl1SvcAndResume(SB)
 
 el1_dbg:
 	B ·Shutdown(SB)
@@ -508,15 +554,19 @@ el1_fpsimd_acc:
 el1_invalid:
 	B ·Shutdown(SB)
 
+// El1_irq is the handler for El1_irq.
 TEXT ·El1_irq(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_fiq is the handler for El1_fiq.
 TEXT ·El1_fiq(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_error is the handler for El1_error.
 TEXT ·El1_error(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El0_sync is the handler for El0_sync.
 TEXT ·El0_sync(SB),NOSPLIT,$0
 	KERNEL_ENTRY_FROM_EL0
 	WORD $0xd5385219	// MRS ESR_EL1, R25
@@ -554,7 +604,7 @@ el0_svc:
 	MOVD $Syscall, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el0_da:
 	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
@@ -568,7 +618,7 @@ el0_da:
 	MOVD $PageFault, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el0_ia:
 	B ·Shutdown(SB)
@@ -613,7 +663,7 @@ TEXT ·El0_error(SB),NOSPLIT,$0
 	MOVD $VirtualizationException, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 TEXT ·El0_sync_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
@@ -627,6 +677,7 @@ TEXT ·El0_fiq_invalid(SB),NOSPLIT,$0
 TEXT ·El0_error_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// Vectors implements exception vector table.
 TEXT ·Vectors(SB),NOSPLIT,$0
 	B ·El1_sync_invalid(SB)
 	nop31Instructions()
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index c3d341998..ccacaea6b 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -16,6 +16,14 @@
 
 package ring0
 
+// HaltAndResume halts execution and point the pointer to the resume function.
+//go:nosplit
+func HaltAndResume()
+
+// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume.
+//go:nosplit
+func HaltEl1SvcAndResume()
+
 // init initializes architecture-specific state.
 func (k *Kernel) init(opts KernelOpts) {
 	// Save the root page tables.
-- 
cgit v1.2.3


From b2e86906ea4f7bc43b8d2d3a4735a87eca779b33 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 03:26:42 -0800
Subject: Fix various issues related to enabling go-marshal.

- Add missing build tags to files in the abi package.

- Add the marshal package as a sentry dependency, allowed by deps_test.

- Fix an issue with our top-level go_library BUILD rule, which
  incorrectly shadows the variable containing the input set of source
  files. This caused the expansion for the go_marshal clause to
  silently omit input files.

- Fix formatting when copying build tags to gomarshal-generated files.

- Fix a bug with import statement collision detection in go-marshal.

PiperOrigin-RevId: 295112284
---
 pkg/abi/linux/file_amd64.go             |  2 ++
 pkg/abi/linux/file_arm64.go             |  2 ++
 tools/defs.bzl                          | 12 ++++++------
 tools/go_marshal/gomarshal/generator.go |  2 +-
 tools/go_marshal/gomarshal/util.go      | 25 ++++++++++++++++++-------
 5 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/pkg/abi/linux/file_amd64.go b/pkg/abi/linux/file_amd64.go
index 9d307e840..8693d49c8 100644
--- a/pkg/abi/linux/file_amd64.go
+++ b/pkg/abi/linux/file_amd64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build amd64
+
 package linux
 
 // Constants for open(2).
diff --git a/pkg/abi/linux/file_arm64.go b/pkg/abi/linux/file_arm64.go
index 26a54f416..ea3adc5f5 100644
--- a/pkg/abi/linux/file_arm64.go
+++ b/pkg/abi/linux/file_arm64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build arm64
+
 package linux
 
 // Constants for open(2).
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 46249f9c4..39f035f12 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -117,10 +117,10 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
         # First, we need to segregate the input files via the special suffixes,
         # and calculate the final output set.
         state_sets = calculate_sets(srcs)
-        for (suffix, srcs) in state_sets.items():
+        for (suffix, src_subset) in state_sets.items():
             go_stateify(
                 name = name + suffix + "_state_autogen_with_imports",
-                srcs = srcs,
+                srcs = src_subset,
                 imports = imports,
                 package = full_pkg,
                 out = name + suffix + "_state_autogen_with_imports.go",
@@ -140,10 +140,10 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
     if marshal:
         # See above.
         marshal_sets = calculate_sets(srcs)
-        for (suffix, srcs) in marshal_sets.items():
+        for (suffix, src_subset) in marshal_sets.items():
             go_marshal(
                 name = name + suffix + "_abi_autogen",
-                srcs = srcs,
+                srcs = src_subset,
                 debug = False,
                 imports = imports,
                 package = name,
@@ -172,11 +172,11 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
 
         # See above.
         marshal_sets = calculate_sets(srcs)
-        for (suffix, srcs) in marshal_sets.items():
+        for (suffix, _) in marshal_sets.items():
             _go_test(
                 name = name + suffix + "_abi_autogen_test",
                 srcs = [name + suffix + "_abi_autogen_test.go"],
-                library = ":" + name + suffix,
+                library = ":" + name,
                 deps = marshal_test_deps,
                 **kwargs
             )
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 01be7c477..fbec7bb9a 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -123,7 +123,7 @@ func (g *Generator) writeHeader() error {
 	// Emit build tags.
 	if t := tags.Aggregate(g.inputs); len(t) > 0 {
 		b.emit(strings.Join(t.Lines(), "\n"))
-		b.emit("\n")
+		b.emit("\n\n")
 	}
 
 	// Package header.
diff --git a/tools/go_marshal/gomarshal/util.go b/tools/go_marshal/gomarshal/util.go
index 3d86935b4..e2bca4e7c 100644
--- a/tools/go_marshal/gomarshal/util.go
+++ b/tools/go_marshal/gomarshal/util.go
@@ -310,7 +310,7 @@ func (i *importStmt) markUsed() {
 }
 
 func (i *importStmt) equivalent(other *importStmt) bool {
-	return i == other
+	return i.name == other.name && i.path == other.path && i.aliased == other.aliased
 }
 
 // importTable represents a collection of importStmts.
@@ -329,7 +329,7 @@ func newImportTable() *importTable {
 // result in a panic.
 func (i *importTable) merge(other *importTable) {
 	for name, im := range other.is {
-		if dup, ok := i.is[name]; ok && dup.equivalent(im) {
+		if dup, ok := i.is[name]; ok && !dup.equivalent(im) {
 			panic(fmt.Sprintf("Found colliding import statements: ours: %+v, other's: %+v", dup, im))
 		}
 
@@ -337,16 +337,27 @@ func (i *importTable) merge(other *importTable) {
 	}
 }
 
+func (i *importTable) addStmt(s *importStmt) *importStmt {
+	if old, ok := i.is[s.name]; ok && !old.equivalent(s) {
+		// A collision should always be between an import inserted by the
+		// go-marshal tool and an import from the original source file (assuming
+		// the original source file was valid). We could theoretically handle
+		// the collision by assigning a local name to our import. However, this
+		// would need to be plumbed throughout the generator. Given that
+		// collisions should be rare, simply panic on collision.
+		panic(fmt.Sprintf("Import collision: old: %s as %v; new: %v as %v", old.path, old.name, s.path, s.name))
+	}
+	i.is[s.name] = s
+	return s
+}
+
 func (i *importTable) add(s string) *importStmt {
 	n := newImport(s)
-	i.is[n.name] = n
-	return n
+	return i.addStmt(n)
 }
 
 func (i *importTable) addFromSpec(spec *ast.ImportSpec, f *token.FileSet) *importStmt {
-	n := newImportFromSpec(spec, f)
-	i.is[n.name] = n
-	return n
+	return i.addStmt(newImportFromSpec(spec, f))
 }
 
 // Marks the import named n as used. If no such import is in the table, returns
-- 
cgit v1.2.3


From 4075de11be44372c454aae7f9650cdc814c52229 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 11:11:55 -0800
Subject: Plumb VFS2 inside the Sentry

- Added fsbridge package with interface that can be used to open
  and read from VFS1 and VFS2 files.
- Converted ELF loader to use fsbridge
- Added VFS2 types to FSContext
- Added vfs.MountNamespace to ThreadGroup

Updates #1623

PiperOrigin-RevId: 295183950
---
 pkg/sentry/control/BUILD                           |   5 +
 pkg/sentry/control/proc.go                         | 127 +++++++++++++--
 pkg/sentry/fs/proc/BUILD                           |   1 +
 pkg/sentry/fs/proc/task.go                         |  17 +-
 pkg/sentry/fsbridge/BUILD                          |  24 +++
 pkg/sentry/fsbridge/bridge.go                      |  54 ++++++
 pkg/sentry/fsbridge/fs.go                          | 181 +++++++++++++++++++++
 pkg/sentry/fsbridge/vfs.go                         | 134 +++++++++++++++
 pkg/sentry/fsimpl/devtmpfs/devtmpfs.go             |   4 +
 pkg/sentry/fsimpl/gofer/filesystem.go              |   5 +-
 pkg/sentry/fsimpl/gofer/gofer.go                   |   3 +
 pkg/sentry/fsimpl/kernfs/filesystem.go             |  10 +-
 pkg/sentry/fsimpl/proc/BUILD                       |   1 +
 pkg/sentry/fsimpl/proc/filesystem.go               |  18 +-
 pkg/sentry/fsimpl/proc/tasks_test.go               |  17 +-
 pkg/sentry/fsimpl/sys/BUILD                        |   1 +
 pkg/sentry/fsimpl/sys/sys.go                       |   3 +
 pkg/sentry/fsimpl/sys/sys_test.go                  |   7 +-
 pkg/sentry/fsimpl/testutil/BUILD                   |   2 +-
 pkg/sentry/fsimpl/testutil/kernel.go               |  24 +--
 pkg/sentry/fsimpl/testutil/testutil.go             |  12 +-
 pkg/sentry/fsimpl/tmpfs/filesystem.go              |  12 +-
 pkg/sentry/fsimpl/tmpfs/tmpfs.go                   |   3 +
 pkg/sentry/kernel/BUILD                            |   2 +
 pkg/sentry/kernel/fs_context.go                    |  98 +++++++++--
 pkg/sentry/kernel/kernel.go                        | 145 +++++++++++++----
 pkg/sentry/kernel/task.go                          |  27 +++
 pkg/sentry/kernel/task_clone.go                    |  11 +-
 pkg/sentry/kernel/task_context.go                  |   2 +-
 pkg/sentry/kernel/task_exit.go                     |   7 +
 pkg/sentry/kernel/task_log.go                      |  15 +-
 pkg/sentry/kernel/task_start.go                    |  49 +++---
 pkg/sentry/kernel/thread_group.go                  |   6 +-
 pkg/sentry/loader/BUILD                            |   2 +
 pkg/sentry/loader/elf.go                           |  28 ++--
 pkg/sentry/loader/interpreter.go                   |   6 +-
 pkg/sentry/loader/loader.go                        | 179 ++++++--------------
 pkg/sentry/loader/vdso.go                          |   7 +-
 pkg/sentry/mm/BUILD                                |   2 +-
 pkg/sentry/mm/metadata.go                          |  10 +-
 pkg/sentry/mm/mm.go                                |   4 +-
 pkg/sentry/strace/strace.go                        |  28 ++++
 pkg/sentry/syscalls/linux/BUILD                    |   1 +
 pkg/sentry/syscalls/linux/sys_prctl.go             |   3 +-
 pkg/sentry/syscalls/linux/sys_thread.go            |  17 +-
 .../syscalls/linux/vfs2/linux64_override_amd64.go  | 106 ++++++++++++
 pkg/sentry/vfs/BUILD                               |   1 +
 pkg/sentry/vfs/context.go                          |   7 +-
 pkg/sentry/vfs/mount.go                            |  10 +-
 pkg/sentry/vfs/options.go                          |   2 +-
 pkg/sentry/vfs/vfs.go                              |   5 +-
 runsc/boot/loader.go                               |  11 +-
 52 files changed, 1134 insertions(+), 322 deletions(-)
 create mode 100644 pkg/sentry/fsbridge/BUILD
 create mode 100644 pkg/sentry/fsbridge/bridge.go
 create mode 100644 pkg/sentry/fsbridge/fs.go
 create mode 100644 pkg/sentry/fsbridge/vfs.go

diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index e69496477..d16d78aa5 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -16,10 +16,13 @@ go_library(
     ],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fd",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/host",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
@@ -27,8 +30,10 @@ go_library(
         "//pkg/sentry/state",
         "//pkg/sentry/strace",
         "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
         "//pkg/sentry/watchdog",
         "//pkg/sync",
+        "//pkg/syserror",
         "//pkg/tcpip/link/sniffer",
         "//pkg/urpc",
     ],
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index ced51c66c..8973754c8 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -18,19 +18,26 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"path"
 	"sort"
 	"strings"
 	"text/tabwriter"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
@@ -60,6 +67,12 @@ type ExecArgs struct {
 	// process's MountNamespace.
 	MountNamespace *fs.MountNamespace
 
+	// MountNamespaceVFS2 is the mount namespace to execute the new process in.
+	// A reference on MountNamespace must be held for the lifetime of the
+	// ExecArgs. If MountNamespace is nil, it will default to the init
+	// process's MountNamespace.
+	MountNamespaceVFS2 *vfs.MountNamespace
+
 	// WorkingDirectory defines the working directory for the new process.
 	WorkingDirectory string `json:"wd"`
 
@@ -150,6 +163,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		Envv:                    args.Envv,
 		WorkingDirectory:        args.WorkingDirectory,
 		MountNamespace:          args.MountNamespace,
+		MountNamespaceVFS2:      args.MountNamespaceVFS2,
 		Credentials:             creds,
 		FDTable:                 fdTable,
 		Umask:                   0022,
@@ -166,24 +180,53 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		// be donated to the new process in CreateProcess.
 		initArgs.MountNamespace.IncRef()
 	}
+	if initArgs.MountNamespaceVFS2 != nil {
+		// initArgs must hold a reference on MountNamespaceVFS2, which will
+		// be donated to the new process in CreateProcess.
+		initArgs.MountNamespaceVFS2.IncRef()
+	}
 	ctx := initArgs.NewContext(proc.Kernel)
 
 	if initArgs.Filename == "" {
-		// Get the full path to the filename from the PATH env variable.
-		paths := fs.GetPath(initArgs.Envv)
-		mns := initArgs.MountNamespace
-		if mns == nil {
-			mns = proc.Kernel.GlobalInit().Leader().MountNamespace()
-		}
-		f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
-		if err != nil {
-			return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
+		if kernel.VFS2Enabled {
+			// Get the full path to the filename from the PATH env variable.
+			if initArgs.MountNamespaceVFS2 == nil {
+				// Set initArgs so that 'ctx' returns the namespace.
+				//
+				// MountNamespaceVFS2 adds a reference to the namespace, which is
+				// transferred to the new process.
+				initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
+			}
+
+			paths := fs.GetPath(initArgs.Envv)
+			vfsObj := proc.Kernel.VFS
+			file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
+			if err != nil {
+				return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
+			}
+			initArgs.File = fsbridge.NewVFSFile(file)
+		} else {
+			// Get the full path to the filename from the PATH env variable.
+			paths := fs.GetPath(initArgs.Envv)
+			if initArgs.MountNamespace == nil {
+				// Set initArgs so that 'ctx' returns the namespace.
+				initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
+
+				// initArgs must hold a reference on MountNamespace, which will
+				// be donated to the new process in CreateProcess.
+				initArgs.MountNamespaceVFS2.IncRef()
+			}
+			f, err := initArgs.MountNamespace.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
+			if err != nil {
+				return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
+			}
+			initArgs.Filename = f
 		}
-		initArgs.Filename = f
 	}
 
 	mounter := fs.FileOwnerFromContext(ctx)
 
+	// TODO(gvisor.dev/issue/1623): Use host FD when supported in VFS2.
 	var ttyFile *fs.File
 	for appFD, hostFile := range args.FilePayload.Files {
 		var appFile *fs.File
@@ -411,3 +454,67 @@ func ttyName(tty *kernel.TTY) string {
 	}
 	return fmt.Sprintf("pts/%d", tty.Index)
 }
+
+// ResolveExecutablePath resolves the given executable name given a set of
+// paths that might contain it.
+func ResolveExecutablePath(ctx context.Context, vfsObj *vfs.VirtualFilesystem, wd, name string, paths []string) (*vfs.FileDescription, error) {
+	root := vfs.RootFromContext(ctx)
+	defer root.DecRef()
+	creds := auth.CredentialsFromContext(ctx)
+
+	// Absolute paths can be used directly.
+	if path.IsAbs(name) {
+		return openExecutable(ctx, vfsObj, creds, root, name)
+	}
+
+	// Paths with '/' in them should be joined to the working directory, or
+	// to the root if working directory is not set.
+	if strings.IndexByte(name, '/') > 0 {
+		if len(wd) == 0 {
+			wd = "/"
+		}
+		if !path.IsAbs(wd) {
+			return nil, fmt.Errorf("working directory %q must be absolute", wd)
+		}
+		return openExecutable(ctx, vfsObj, creds, root, path.Join(wd, name))
+	}
+
+	// Otherwise, we must lookup the name in the paths, starting from the
+	// calling context's root directory.
+	for _, p := range paths {
+		if !path.IsAbs(p) {
+			// Relative paths aren't safe, no one should be using them.
+			log.Warningf("Skipping relative path %q in $PATH", p)
+			continue
+		}
+
+		binPath := path.Join(p, name)
+		f, err := openExecutable(ctx, vfsObj, creds, root, binPath)
+		if err != nil {
+			return nil, err
+		}
+		if f == nil {
+			continue // Not found/no access.
+		}
+		return f, nil
+	}
+	return nil, syserror.ENOENT
+}
+
+func openExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry, path string) (*vfs.FileDescription, error) {
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root, // binPath is absolute, Start can be anything.
+		Path:               fspath.Parse(path),
+		FollowFinalSymlink: true,
+	}
+	opts := &vfs.OpenOptions{
+		Flags:    linux.O_RDONLY,
+		FileExec: true,
+	}
+	f, err := vfsObj.OpenAt(ctx, creds, &pop, opts)
+	if err == syserror.ENOENT || err == syserror.EACCES {
+		return nil, nil
+	}
+	return f, err
+}
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index 280093c5e..77c2c5c0e 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -36,6 +36,7 @@ go_library(
         "//pkg/sentry/fs/proc/device",
         "//pkg/sentry/fs/proc/seqfile",
         "//pkg/sentry/fs/ramfs",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index ca020e11e..8ab8d8a02 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -28,6 +28,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -249,7 +250,7 @@ func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
 	return newProcInode(t, exeSymlink, msrc, fs.Symlink, t)
 }
 
-func (e *exe) executable() (d *fs.Dirent, err error) {
+func (e *exe) executable() (file fsbridge.File, err error) {
 	e.t.WithMuLocked(func(t *kernel.Task) {
 		mm := t.MemoryManager()
 		if mm == nil {
@@ -262,8 +263,8 @@ func (e *exe) executable() (d *fs.Dirent, err error) {
 		// The MemoryManager may be destroyed, in which case
 		// MemoryManager.destroy will simply set the executable to nil
 		// (with locks held).
-		d = mm.Executable()
-		if d == nil {
+		file = mm.Executable()
+		if file == nil {
 			err = syserror.ENOENT
 		}
 	})
@@ -283,15 +284,7 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
 	}
 	defer exec.DecRef()
 
-	root := fs.RootFromContext(ctx)
-	if root == nil {
-		// This doesn't correspond to anything in Linux because the vfs is
-		// global there.
-		return "", syserror.EINVAL
-	}
-	defer root.DecRef()
-	n, _ := exec.FullName(root)
-	return n, nil
+	return exec.PathnameWithDeleted(ctx), nil
 }
 
 // namespaceSymlink represents a symlink in the namespacefs, such as the files
diff --git a/pkg/sentry/fsbridge/BUILD b/pkg/sentry/fsbridge/BUILD
new file mode 100644
index 000000000..6c798f0bd
--- /dev/null
+++ b/pkg/sentry/fsbridge/BUILD
@@ -0,0 +1,24 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "fsbridge",
+    srcs = [
+        "bridge.go",
+        "fs.go",
+        "vfs.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsbridge/bridge.go b/pkg/sentry/fsbridge/bridge.go
new file mode 100644
index 000000000..8e7590721
--- /dev/null
+++ b/pkg/sentry/fsbridge/bridge.go
@@ -0,0 +1,54 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fsbridge provides common interfaces to bridge between VFS1 and VFS2
+// files.
+package fsbridge
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// File provides a common interface to bridge between VFS1 and VFS2 files.
+type File interface {
+	// PathnameWithDeleted returns an absolute pathname to vd, consistent with
+	// Linux's d_path(). In particular, if vd.Dentry() has been disowned,
+	// PathnameWithDeleted appends " (deleted)" to the returned pathname.
+	PathnameWithDeleted(ctx context.Context) string
+
+	// ReadFull read all contents from the file.
+	ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error)
+
+	// ConfigureMMap mutates opts to implement mmap(2) for the file.
+	ConfigureMMap(context.Context, *memmap.MMapOpts) error
+
+	// Type returns the file type, e.g. linux.S_IFREG.
+	Type(context.Context) (linux.FileMode, error)
+
+	// IncRef increments reference.
+	IncRef()
+
+	// DecRef decrements reference.
+	DecRef()
+}
+
+// Lookup provides a common interface to open files.
+type Lookup interface {
+	// OpenPath opens a file.
+	OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error)
+}
diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go
new file mode 100644
index 000000000..093ce1fb3
--- /dev/null
+++ b/pkg/sentry/fsbridge/fs.go
@@ -0,0 +1,181 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsbridge
+
+import (
+	"io"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fsFile implements File interface over fs.File.
+//
+// +stateify savable
+type fsFile struct {
+	file *fs.File
+}
+
+var _ File = (*fsFile)(nil)
+
+// NewFSFile creates a new File over fs.File.
+func NewFSFile(file *fs.File) File {
+	return &fsFile{file: file}
+}
+
+// PathnameWithDeleted implements File.
+func (f *fsFile) PathnameWithDeleted(ctx context.Context) string {
+	root := fs.RootFromContext(ctx)
+	if root == nil {
+		// This doesn't correspond to anything in Linux because the vfs is
+		// global there.
+		return ""
+	}
+	defer root.DecRef()
+
+	name, _ := f.file.Dirent.FullName(root)
+	return name
+}
+
+// ReadFull implements File.
+func (f *fsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
+	var total int64
+	for dst.NumBytes() > 0 {
+		n, err := f.file.Preadv(ctx, dst, offset+total)
+		total += n
+		if err == io.EOF && total != 0 {
+			return total, io.ErrUnexpectedEOF
+		} else if err != nil {
+			return total, err
+		}
+		dst = dst.DropFirst64(n)
+	}
+	return total, nil
+}
+
+// ConfigureMMap implements File.
+func (f *fsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return f.file.ConfigureMMap(ctx, opts)
+}
+
+// Type implements File.
+func (f *fsFile) Type(context.Context) (linux.FileMode, error) {
+	return linux.FileMode(f.file.Dirent.Inode.StableAttr.Type.LinuxType()), nil
+}
+
+// IncRef implements File.
+func (f *fsFile) IncRef() {
+	f.file.IncRef()
+}
+
+// DecRef implements File.
+func (f *fsFile) DecRef() {
+	f.file.DecRef()
+}
+
+// fsLookup implements Lookup interface using fs.File.
+//
+// +stateify savable
+type fsLookup struct {
+	mntns *fs.MountNamespace
+
+	root       *fs.Dirent
+	workingDir *fs.Dirent
+}
+
+var _ Lookup = (*fsLookup)(nil)
+
+// NewFSLookup creates a new Lookup using VFS1.
+func NewFSLookup(mntns *fs.MountNamespace, root, workingDir *fs.Dirent) Lookup {
+	return &fsLookup{
+		mntns:      mntns,
+		root:       root,
+		workingDir: workingDir,
+	}
+}
+
+// OpenPath implements Lookup.
+func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error) {
+	var d *fs.Dirent
+	var err error
+	if resolveFinal {
+		d, err = l.mntns.FindInode(ctx, l.root, l.workingDir, path, remainingTraversals)
+	} else {
+		d, err = l.mntns.FindLink(ctx, l.root, l.workingDir, path, remainingTraversals)
+	}
+	if err != nil {
+		return nil, err
+	}
+	defer d.DecRef()
+
+	if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
+		return nil, syserror.ELOOP
+	}
+
+	fsPerm := openOptionsToPermMask(&opts)
+	if err := d.Inode.CheckPermission(ctx, fsPerm); err != nil {
+		return nil, err
+	}
+
+	// If they claim it's a directory, then make sure.
+	if strings.HasSuffix(path, "/") {
+		if d.Inode.StableAttr.Type != fs.Directory {
+			return nil, syserror.ENOTDIR
+		}
+	}
+
+	if opts.FileExec && d.Inode.StableAttr.Type != fs.RegularFile {
+		ctx.Infof("%q is not a regular file: %v", path, d.Inode.StableAttr.Type)
+		return nil, syserror.EACCES
+	}
+
+	f, err := d.Inode.GetFile(ctx, d, flagsToFileFlags(opts.Flags))
+	if err != nil {
+		return nil, err
+	}
+
+	return &fsFile{file: f}, nil
+}
+
+func openOptionsToPermMask(opts *vfs.OpenOptions) fs.PermMask {
+	mode := opts.Flags & linux.O_ACCMODE
+	return fs.PermMask{
+		Read:    mode == linux.O_RDONLY || mode == linux.O_RDWR,
+		Write:   mode == linux.O_WRONLY || mode == linux.O_RDWR,
+		Execute: opts.FileExec,
+	}
+}
+
+func flagsToFileFlags(flags uint32) fs.FileFlags {
+	return fs.FileFlags{
+		Direct:      flags&linux.O_DIRECT != 0,
+		DSync:       flags&(linux.O_DSYNC|linux.O_SYNC) != 0,
+		Sync:        flags&linux.O_SYNC != 0,
+		NonBlocking: flags&linux.O_NONBLOCK != 0,
+		Read:        (flags & linux.O_ACCMODE) != linux.O_WRONLY,
+		Write:       (flags & linux.O_ACCMODE) != linux.O_RDONLY,
+		Append:      flags&linux.O_APPEND != 0,
+		Directory:   flags&linux.O_DIRECTORY != 0,
+		Async:       flags&linux.O_ASYNC != 0,
+		LargeFile:   flags&linux.O_LARGEFILE != 0,
+		Truncate:    flags&linux.O_TRUNC != 0,
+	}
+}
diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go
new file mode 100644
index 000000000..e657c39bc
--- /dev/null
+++ b/pkg/sentry/fsbridge/vfs.go
@@ -0,0 +1,134 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsbridge
+
+import (
+	"io"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fsFile implements File interface over vfs.FileDescription.
+//
+// +stateify savable
+type vfsFile struct {
+	file *vfs.FileDescription
+}
+
+var _ File = (*vfsFile)(nil)
+
+// NewVFSFile creates a new File over fs.File.
+func NewVFSFile(file *vfs.FileDescription) File {
+	return &vfsFile{file: file}
+}
+
+// PathnameWithDeleted implements File.
+func (f *vfsFile) PathnameWithDeleted(ctx context.Context) string {
+	root := vfs.RootFromContext(ctx)
+	defer root.DecRef()
+
+	vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem()
+	name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry())
+	return name
+}
+
+// ReadFull implements File.
+func (f *vfsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
+	var total int64
+	for dst.NumBytes() > 0 {
+		n, err := f.file.PRead(ctx, dst, offset+total, vfs.ReadOptions{})
+		total += n
+		if err == io.EOF && total != 0 {
+			return total, io.ErrUnexpectedEOF
+		} else if err != nil {
+			return total, err
+		}
+		dst = dst.DropFirst64(n)
+	}
+	return total, nil
+}
+
+// ConfigureMMap implements File.
+func (f *vfsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return f.file.ConfigureMMap(ctx, opts)
+}
+
+// Type implements File.
+func (f *vfsFile) Type(ctx context.Context) (linux.FileMode, error) {
+	stat, err := f.file.Stat(ctx, vfs.StatOptions{})
+	if err != nil {
+		return 0, err
+	}
+	return linux.FileMode(stat.Mode).FileType(), nil
+}
+
+// IncRef implements File.
+func (f *vfsFile) IncRef() {
+	f.file.IncRef()
+}
+
+// DecRef implements File.
+func (f *vfsFile) DecRef() {
+	f.file.DecRef()
+}
+
+// fsLookup implements Lookup interface using fs.File.
+//
+// +stateify savable
+type vfsLookup struct {
+	mntns *vfs.MountNamespace
+
+	root       vfs.VirtualDentry
+	workingDir vfs.VirtualDentry
+}
+
+var _ Lookup = (*vfsLookup)(nil)
+
+// NewVFSLookup creates a new Lookup using VFS2.
+func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry) Lookup {
+	return &vfsLookup{
+		mntns:      mntns,
+		root:       root,
+		workingDir: workingDir,
+	}
+}
+
+// OpenPath implements Lookup.
+//
+// remainingTraversals is not configurable in VFS2, all callers are using the
+// default anyways.
+//
+// TODO(gvisor.dev/issue/1623): Check mount has read and exec permission.
+func (l *vfsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
+	vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem()
+	creds := auth.CredentialsFromContext(ctx)
+	pop := &vfs.PathOperation{
+		Root:               l.root,
+		Start:              l.root,
+		Path:               fspath.Parse(path),
+		FollowFinalSymlink: resolveFinal,
+	}
+	fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return &vfsFile{file: fd}, nil
+}
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index e03a0c665..abd4f24e7 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -28,6 +28,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
+// Name is the default filesystem name.
+const Name = "devtmpfs"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct {
 	initOnce sync.Once
@@ -107,6 +110,7 @@ func (a *Accessor) wrapContext(ctx context.Context) *accessorContext {
 func (ac *accessorContext) Value(key interface{}) interface{} {
 	switch key {
 	case vfs.CtxMountNamespace:
+		ac.a.mntns.IncRef()
 		return ac.a.mntns
 	case vfs.CtxRoot:
 		ac.a.root.IncRef()
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 138adb9f7..5cfb0dc4c 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -400,6 +400,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	}
 	vfsObj := rp.VirtualFilesystem()
 	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
 	parent.dirMu.Lock()
 	defer parent.dirMu.Unlock()
 	childVFSD := parent.vfsd.Child(name)
@@ -934,7 +935,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if oldParent == newParent && oldName == newName {
 		return nil
 	}
-	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), &renamed.vfsd, replacedVFSD); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
 		return err
 	}
 	if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index d0552bd99..d00850e25 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -52,6 +52,9 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// Name is the default filesystem name.
+const Name = "9p"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index ee98eb66a..292f58afd 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -544,6 +544,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	}
 
 	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
 	virtfs := rp.VirtualFilesystem()
 
 	srcDirDentry := srcDirVFSD.Impl().(*Dentry)
@@ -595,7 +596,10 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	parentDentry := vfsd.Parent().Impl().(*Dentry)
 	parentDentry.dirMu.Lock()
 	defer parentDentry.dirMu.Unlock()
-	if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
 		return err
 	}
 	if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
@@ -697,7 +701,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	parentDentry := vfsd.Parent().Impl().(*Dentry)
 	parentDentry.dirMu.Lock()
 	defer parentDentry.dirMu.Unlock()
-	if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
 		return err
 	}
 	if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 12aac2e6a..a83245866 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -14,6 +14,7 @@ go_library(
         "tasks_net.go",
         "tasks_sys.go",
     ],
+    visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 11477b6a9..5c19d5522 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -26,15 +26,18 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
-// procFSType is the factory class for procfs.
+// Name is the default filesystem name.
+const Name = "proc"
+
+// FilesystemType is the factory class for procfs.
 //
 // +stateify savable
-type procFSType struct{}
+type FilesystemType struct{}
 
-var _ vfs.FilesystemType = (*procFSType)(nil)
+var _ vfs.FilesystemType = (*FilesystemType)(nil)
 
 // GetFilesystem implements vfs.FilesystemType.
-func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (ft *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	k := kernel.KernelFromContext(ctx)
 	if k == nil {
 		return nil, nil, fmt.Errorf("procfs requires a kernel")
@@ -47,12 +50,13 @@ func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile
 	procfs := &kernfs.Filesystem{}
 	procfs.VFSFilesystem().Init(vfsObj, procfs)
 
-	var data *InternalData
+	var cgroups map[string]string
 	if opts.InternalData != nil {
-		data = opts.InternalData.(*InternalData)
+		data := opts.InternalData.(*InternalData)
+		cgroups = data.Cgroups
 	}
 
-	_, dentry := newTasksInode(procfs, k, pidns, data.Cgroups)
+	_, dentry := newTasksInode(procfs, k, pidns, cgroups)
 	return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 6fc3524db..96c72cbc9 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -90,8 +90,7 @@ func setup(t *testing.T) *testutil.System {
 	ctx := k.SupervisorContext()
 	creds := auth.CredentialsFromContext(ctx)
 
-	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{
+	k.VFS.MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
 	fsOpts := vfs.GetFilesystemOptions{
@@ -102,11 +101,11 @@ func setup(t *testing.T) *testutil.System {
 			},
 		},
 	}
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &fsOpts)
+	mntns, err := k.VFS.NewMountNamespace(ctx, creds, "", Name, &fsOpts)
 	if err != nil {
 		t.Fatalf("NewMountNamespace(): %v", err)
 	}
-	return testutil.NewSystem(ctx, t, vfsObj, mntns)
+	return testutil.NewSystem(ctx, t, k.VFS, mntns)
 }
 
 func TestTasksEmpty(t *testing.T) {
@@ -131,7 +130,7 @@ func TestTasks(t *testing.T) {
 	var tasks []*kernel.Task
 	for i := 0; i < 5; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc)
+		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root)
 		if err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
@@ -213,7 +212,7 @@ func TestTasksOffset(t *testing.T) {
 	k := kernel.KernelFromContext(s.Ctx)
 	for i := 0; i < 3; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc); err != nil {
+		if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root); err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
 	}
@@ -337,7 +336,7 @@ func TestTask(t *testing.T) {
 
 	k := kernel.KernelFromContext(s.Ctx)
 	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-	_, err := testutil.CreateTask(s.Ctx, "name", tc)
+	_, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root)
 	if err != nil {
 		t.Fatalf("CreateTask(): %v", err)
 	}
@@ -352,7 +351,7 @@ func TestProcSelf(t *testing.T) {
 
 	k := kernel.KernelFromContext(s.Ctx)
 	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-	task, err := testutil.CreateTask(s.Ctx, "name", tc)
+	task, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root)
 	if err != nil {
 		t.Fatalf("CreateTask(): %v", err)
 	}
@@ -433,7 +432,7 @@ func TestTree(t *testing.T) {
 	var tasks []*kernel.Task
 	for i := 0; i < 5; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc)
+		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root)
 		if err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index 66c0d8bc8..a741e2bb6 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "sys.go",
     ],
+    visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index d693fceae..c36c4fa11 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -28,6 +28,9 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// Name is the default filesystem name.
+const Name = "sysfs"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 8b1cf0bd0..5d1ba5867 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -34,16 +34,15 @@ func newTestSystem(t *testing.T) *testutil.System {
 	}
 	ctx := k.SupervisorContext()
 	creds := auth.CredentialsFromContext(ctx)
-	v := vfs.New()
-	v.MustRegisterFilesystemType("sysfs", sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+	k.VFS.MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
 
-	mns, err := v.NewMountNamespace(ctx, creds, "", "sysfs", &vfs.GetFilesystemOptions{})
+	mns, err := k.VFS.NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create new mount namespace: %v", err)
 	}
-	return testutil.NewSystem(ctx, t, v, mns)
+	return testutil.NewSystem(ctx, t, k.VFS, mns)
 }
 
 func TestReadCPUFile(t *testing.T) {
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
index efd5974c4..e4f36f4ae 100644
--- a/pkg/sentry/fsimpl/testutil/BUILD
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -16,7 +16,7 @@ go_library(
         "//pkg/cpuid",
         "//pkg/fspath",
         "//pkg/memutil",
-        "//pkg/sentry/fs",
+        "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/sched",
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index 89f8c4915..a91b3ec4d 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -24,7 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/memutil"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
@@ -33,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 
 	// Platforms are plugable.
 	_ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
@@ -99,26 +100,27 @@ func Boot() (*kernel.Kernel, error) {
 		return nil, fmt.Errorf("initializing kernel: %v", err)
 	}
 
-	ctx := k.SupervisorContext()
+	kernel.VFS2Enabled = true
+
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	k.VFS = vfsObj
 
-	// Create mount namespace without root as it's the minimum required to create
-	// the global thread group.
-	mntns, err := fs.NewMountNamespace(ctx, nil)
-	if err != nil {
-		return nil, err
-	}
 	ls, err := limits.NewLinuxLimitSet()
 	if err != nil {
 		return nil, err
 	}
-	tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
+	tg := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
 	k.TestOnly_SetGlobalInit(tg)
 
 	return k, nil
 }
 
 // CreateTask creates a new bare bones task for tests.
-func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) {
+func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) {
 	k := kernel.KernelFromContext(ctx)
 	config := &kernel.TaskConfig{
 		Kernel:                  k,
@@ -129,6 +131,8 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kern
 		UTSNamespace:            kernel.UTSNamespaceFromContext(ctx),
 		IPCNamespace:            kernel.IPCNamespaceFromContext(ctx),
 		AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+		MountNamespaceVFS2:      mntns,
+		FSContext:               kernel.NewFSContextVFS2(root, cwd, 0022),
 	}
 	return k.TaskSet().NewTask(config)
 }
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index 69fd84ddd..b97e3534a 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -41,12 +41,12 @@ type System struct {
 	Creds *auth.Credentials
 	VFS   *vfs.VirtualFilesystem
 	Root  vfs.VirtualDentry
-	mns   *vfs.MountNamespace
+	MntNs *vfs.MountNamespace
 }
 
 // NewSystem constructs a System.
 //
-// Precondition: Caller must hold a reference on mns, whose ownership
+// Precondition: Caller must hold a reference on MntNs, whose ownership
 // is transferred to the new System.
 func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System {
 	s := &System{
@@ -54,7 +54,7 @@ func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns
 		Ctx:   ctx,
 		Creds: auth.CredentialsFromContext(ctx),
 		VFS:   v,
-		mns:   mns,
+		MntNs: mns,
 		Root:  mns.Root(),
 	}
 	return s
@@ -75,7 +75,7 @@ func (s *System) WithSubtest(t *testing.T) *System {
 		Ctx:   s.Ctx,
 		Creds: s.Creds,
 		VFS:   s.VFS,
-		mns:   s.mns,
+		MntNs: s.MntNs,
 		Root:  s.Root,
 	}
 }
@@ -90,7 +90,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System {
 		Ctx:   ctx,
 		Creds: s.Creds,
 		VFS:   s.VFS,
-		mns:   s.mns,
+		MntNs: s.MntNs,
 		Root:  s.Root,
 	}
 }
@@ -98,7 +98,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System {
 // Destroy release resources associated with a test system.
 func (s *System) Destroy() {
 	s.Root.DecRef()
-	s.mns.DecRef() // Reference on mns passed to NewSystem.
+	s.MntNs.DecRef() // Reference on MntNs passed to NewSystem.
 }
 
 // ReadToEnd reads the contents of fd until EOF to a string.
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 8785452b6..7f7b791c4 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -486,7 +486,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	vfsObj := rp.VirtualFilesystem()
 	oldParentDir := oldParent.inode.impl.(*directory)
 	newParentDir := newParent.inode.impl.(*directory)
-	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := vfsObj.PrepareRenameDentry(mntns, renamedVFSD, replacedVFSD); err != nil {
 		return err
 	}
 	if replaced != nil {
@@ -543,7 +545,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	}
 	defer mnt.EndWrite()
 	vfsObj := rp.VirtualFilesystem()
-	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
 		return err
 	}
 	parent.inode.impl.(*directory).childList.Remove(child)
@@ -631,7 +635,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	}
 	defer mnt.EndWrite()
 	vfsObj := rp.VirtualFilesystem()
-	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
 		return err
 	}
 	parent.inode.impl.(*directory).childList.Remove(child)
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 2108d0f4d..c5bb17562 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -40,6 +40,9 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// Name is the default filesystem name.
+const Name = "tmpfs"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 2231d6973..46306945f 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -157,6 +157,7 @@ go_library(
         "//pkg/context",
         "//pkg/cpuid",
         "//pkg/eventchannel",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/metric",
         "//pkg/refs",
@@ -167,6 +168,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fs/timerfd",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/hostcpu",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 2448c1d99..7218aa24e 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -19,6 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
@@ -37,10 +38,16 @@ type FSContext struct {
 	// destroyed.
 	root *fs.Dirent
 
+	// rootVFS2 is the filesystem root.
+	rootVFS2 vfs.VirtualDentry
+
 	// cwd is the current working directory. Will be nil iff the FSContext
 	// has been destroyed.
 	cwd *fs.Dirent
 
+	// cwdVFS2 is the current working directory.
+	cwdVFS2 vfs.VirtualDentry
+
 	// umask is the current file mode creation mask. When a thread using this
 	// context invokes a syscall that creates a file, bits set in umask are
 	// removed from the permissions that the file is created with.
@@ -60,6 +67,19 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
 	return &f
 }
 
+// NewFSContextVFS2 returns a new filesystem context.
+func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
+	root.IncRef()
+	cwd.IncRef()
+	f := FSContext{
+		rootVFS2: root,
+		cwdVFS2:  cwd,
+		umask:    umask,
+	}
+	f.EnableLeakCheck("kernel.FSContext")
+	return &f
+}
+
 // destroy is the destructor for an FSContext.
 //
 // This will call DecRef on both root and cwd Dirents.  If either call to
@@ -75,11 +95,17 @@ func (f *FSContext) destroy() {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
-	f.root.DecRef()
-	f.root = nil
-
-	f.cwd.DecRef()
-	f.cwd = nil
+	if VFS2Enabled {
+		f.rootVFS2.DecRef()
+		f.rootVFS2 = vfs.VirtualDentry{}
+		f.cwdVFS2.DecRef()
+		f.cwdVFS2 = vfs.VirtualDentry{}
+	} else {
+		f.root.DecRef()
+		f.root = nil
+		f.cwd.DecRef()
+		f.cwd = nil
+	}
 }
 
 // DecRef implements RefCounter.DecRef with destructor f.destroy.
@@ -93,12 +119,21 @@ func (f *FSContext) DecRef() {
 func (f *FSContext) Fork() *FSContext {
 	f.mu.Lock()
 	defer f.mu.Unlock()
-	f.cwd.IncRef()
-	f.root.IncRef()
+
+	if VFS2Enabled {
+		f.cwdVFS2.IncRef()
+		f.rootVFS2.IncRef()
+	} else {
+		f.cwd.IncRef()
+		f.root.IncRef()
+	}
+
 	return &FSContext{
-		cwd:   f.cwd,
-		root:  f.root,
-		umask: f.umask,
+		cwd:      f.cwd,
+		root:     f.root,
+		cwdVFS2:  f.cwdVFS2,
+		rootVFS2: f.rootVFS2,
+		umask:    f.umask,
 	}
 }
 
@@ -109,12 +144,23 @@ func (f *FSContext) Fork() *FSContext {
 func (f *FSContext) WorkingDirectory() *fs.Dirent {
 	f.mu.Lock()
 	defer f.mu.Unlock()
-	if f.cwd != nil {
-		f.cwd.IncRef()
-	}
+
+	f.cwd.IncRef()
 	return f.cwd
 }
 
+// WorkingDirectoryVFS2 returns the current working directory.
+//
+// This will return nil if called after destroy(), otherwise it will return a
+// Dirent with a reference taken.
+func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	f.cwdVFS2.IncRef()
+	return f.cwdVFS2
+}
+
 // SetWorkingDirectory sets the current working directory.
 // This will take an extra reference on the Dirent.
 //
@@ -137,6 +183,20 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) {
 	old.DecRef()
 }
 
+// SetWorkingDirectoryVFS2 sets the current working directory.
+// This will take an extra reference on the VirtualDentry.
+//
+// This is not a valid call after destroy.
+func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	old := f.cwdVFS2
+	f.cwdVFS2 = d
+	d.IncRef()
+	old.DecRef()
+}
+
 // RootDirectory returns the current filesystem root.
 //
 // This will return nil if called after destroy(), otherwise it will return a
@@ -150,6 +210,18 @@ func (f *FSContext) RootDirectory() *fs.Dirent {
 	return f.root
 }
 
+// RootDirectoryVFS2 returns the current filesystem root.
+//
+// This will return nil if called after destroy(), otherwise it will return a
+// Dirent with a reference taken.
+func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	f.rootVFS2.IncRef()
+	return f.rootVFS2
+}
+
 // SetRootDirectory sets the root directory.
 // This will take an extra reference on the Dirent.
 //
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 3ee760ba2..2665f057c 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -43,11 +43,13 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/eventchannel"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -71,6 +73,10 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
+// VFS2Enabled is set to true when VFS2 is enabled. Added as a global for allow
+// easy access everywhere. To be removed once VFS2 becomes the default.
+var VFS2Enabled = false
+
 // Kernel represents an emulated Linux kernel. It must be initialized by calling
 // Init() or LoadFrom().
 //
@@ -238,6 +244,9 @@ type Kernel struct {
 
 	// SpecialOpts contains special kernel options.
 	SpecialOpts
+
+	// VFS keeps the filesystem state used across the kernel.
+	VFS *vfs.VirtualFilesystem
 }
 
 // InitKernelArgs holds arguments to Init.
@@ -624,7 +633,7 @@ type CreateProcessArgs struct {
 	// File is a passed host FD pointing to a file to load as the init binary.
 	//
 	// This is checked if and only if Filename is "".
-	File *fs.File
+	File fsbridge.File
 
 	// Argvv is a list of arguments.
 	Argv []string
@@ -673,6 +682,13 @@ type CreateProcessArgs struct {
 	// increment it).
 	MountNamespace *fs.MountNamespace
 
+	// MountNamespaceVFS2 optionally contains the mount namespace for this
+	// process. If nil, the init process's mount namespace is used.
+	//
+	// Anyone setting MountNamespaceVFS2 must donate a reference (i.e.
+	// increment it).
+	MountNamespaceVFS2 *vfs.MountNamespace
+
 	// ContainerID is the container that the process belongs to.
 	ContainerID string
 }
@@ -711,11 +727,22 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
 		return ctx.args.Credentials
 	case fs.CtxRoot:
 		if ctx.args.MountNamespace != nil {
-			// MountNamespace.Root() will take a reference on the root
-			// dirent for us.
+			// MountNamespace.Root() will take a reference on the root dirent for us.
 			return ctx.args.MountNamespace.Root()
 		}
 		return nil
+	case vfs.CtxRoot:
+		if ctx.args.MountNamespaceVFS2 == nil {
+			return nil
+		}
+		// MountNamespaceVFS2.Root() takes a reference on the root dirent for us.
+		return ctx.args.MountNamespaceVFS2.Root()
+	case vfs.CtxMountNamespace:
+		if ctx.k.globalInit == nil {
+			return nil
+		}
+		// MountNamespaceVFS2 takes a reference for us.
+		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
 	case ktime.CtxRealtimeClock:
@@ -757,34 +784,77 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	defer k.extMu.Unlock()
 	log.Infof("EXEC: %v", args.Argv)
 
-	// Grab the mount namespace.
-	mounts := args.MountNamespace
-	if mounts == nil {
-		mounts = k.GlobalInit().Leader().MountNamespace()
-		mounts.IncRef()
-	}
-
-	tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
 	ctx := args.NewContext(k)
 
-	// Get the root directory from the MountNamespace.
-	root := mounts.Root()
-	// The call to newFSContext below will take a reference on root, so we
-	// don't need to hold this one.
-	defer root.DecRef()
-
-	// Grab the working directory.
-	remainingTraversals := uint(args.MaxSymlinkTraversals)
-	wd := root // Default.
-	if args.WorkingDirectory != "" {
-		var err error
-		wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
-		if err != nil {
-			return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+	var (
+		opener    fsbridge.Lookup
+		fsContext *FSContext
+		mntns     *fs.MountNamespace
+	)
+
+	if VFS2Enabled {
+		mntnsVFS2 := args.MountNamespaceVFS2
+		if mntnsVFS2 == nil {
+			// MountNamespaceVFS2 adds a reference to the namespace, which is
+			// transferred to the new process.
+			mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2()
+		}
+		// Get the root directory from the MountNamespace.
+		root := args.MountNamespaceVFS2.Root()
+		// The call to newFSContext below will take a reference on root, so we
+		// don't need to hold this one.
+		defer root.DecRef()
+
+		// Grab the working directory.
+		wd := root // Default.
+		if args.WorkingDirectory != "" {
+			pop := vfs.PathOperation{
+				Root:               root,
+				Start:              wd,
+				Path:               fspath.Parse(args.WorkingDirectory),
+				FollowFinalSymlink: true,
+			}
+			var err error
+			wd, err = k.VFS.GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{
+				CheckSearchable: true,
+			})
+			if err != nil {
+				return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+			}
+			defer wd.DecRef()
+		}
+		opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd)
+		fsContext = NewFSContextVFS2(root, wd, args.Umask)
+
+	} else {
+		mntns = args.MountNamespace
+		if mntns == nil {
+			mntns = k.GlobalInit().Leader().MountNamespace()
+			mntns.IncRef()
 		}
-		defer wd.DecRef()
+		// Get the root directory from the MountNamespace.
+		root := mntns.Root()
+		// The call to newFSContext below will take a reference on root, so we
+		// don't need to hold this one.
+		defer root.DecRef()
+
+		// Grab the working directory.
+		remainingTraversals := args.MaxSymlinkTraversals
+		wd := root // Default.
+		if args.WorkingDirectory != "" {
+			var err error
+			wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
+			if err != nil {
+				return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+			}
+			defer wd.DecRef()
+		}
+		opener = fsbridge.NewFSLookup(mntns, root, wd)
+		fsContext = newFSContext(root, wd, args.Umask)
 	}
 
+	tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
+
 	// Check which file to start from.
 	switch {
 	case args.Filename != "":
@@ -805,11 +875,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	}
 
 	// Create a fresh task context.
-	remainingTraversals = uint(args.MaxSymlinkTraversals)
+	remainingTraversals := args.MaxSymlinkTraversals
 	loadArgs := loader.LoadArgs{
-		Mounts:              mounts,
-		Root:                root,
-		WorkingDirectory:    wd,
+		Opener:              opener,
 		RemainingTraversals: &remainingTraversals,
 		ResolveFinal:        true,
 		Filename:            args.Filename,
@@ -834,13 +902,14 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		Kernel:                  k,
 		ThreadGroup:             tg,
 		TaskContext:             tc,
-		FSContext:               newFSContext(root, wd, args.Umask),
+		FSContext:               fsContext,
 		FDTable:                 args.FDTable,
 		Credentials:             args.Credentials,
 		AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
 		UTSNamespace:            args.UTSNamespace,
 		IPCNamespace:            args.IPCNamespace,
 		AbstractSocketNamespace: args.AbstractSocketNamespace,
+		MountNamespaceVFS2:      args.MountNamespaceVFS2,
 		ContainerID:             args.ContainerID,
 	}
 	t, err := k.tasks.NewTask(config)
@@ -1378,6 +1447,20 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
 			return ctx.k.globalInit.mounts.Root()
 		}
 		return nil
+	case vfs.CtxRoot:
+		if ctx.k.globalInit == nil {
+			return vfs.VirtualDentry{}
+		}
+		mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+		defer mntns.DecRef()
+		// Root() takes a reference on the root dirent for us.
+		return mntns.Root()
+	case vfs.CtxMountNamespace:
+		if ctx.k.globalInit == nil {
+			return nil
+		}
+		// MountNamespaceVFS2() takes a reference for us.
+		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
 	case ktime.CtxRealtimeClock:
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 981e8c7fe..a3443ff21 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -424,6 +424,11 @@ type Task struct {
 	// abstractSockets is protected by mu.
 	abstractSockets *AbstractSocketNamespace
 
+	// mountNamespaceVFS2 is the task's mount namespace.
+	//
+	// It is protected by mu. It is owned by the task goroutine.
+	mountNamespaceVFS2 *vfs.MountNamespace
+
 	// parentDeathSignal is sent to this task's thread group when its parent exits.
 	//
 	// parentDeathSignal is protected by mu.
@@ -638,6 +643,11 @@ func (t *Task) Value(key interface{}) interface{} {
 		return int32(t.ThreadGroup().ID())
 	case fs.CtxRoot:
 		return t.fsContext.RootDirectory()
+	case vfs.CtxRoot:
+		return t.fsContext.RootDirectoryVFS2()
+	case vfs.CtxMountNamespace:
+		t.mountNamespaceVFS2.IncRef()
+		return t.mountNamespaceVFS2
 	case fs.CtxDirentCacheLimiter:
 		return t.k.DirentCacheLimiter
 	case inet.CtxStack:
@@ -701,6 +711,14 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
 // Preconditions: The caller must be running on the task goroutine, or t.mu
 // must be locked.
 func (t *Task) IsChrooted() bool {
+	if VFS2Enabled {
+		realRoot := t.mountNamespaceVFS2.Root()
+		defer realRoot.DecRef()
+		root := t.fsContext.RootDirectoryVFS2()
+		defer root.DecRef()
+		return root != realRoot
+	}
+
 	realRoot := t.tg.mounts.Root()
 	defer realRoot.DecRef()
 	root := t.fsContext.RootDirectory()
@@ -796,6 +814,15 @@ func (t *Task) MountNamespace() *fs.MountNamespace {
 	return t.tg.mounts
 }
 
+// MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the
+// returned mount namespace.
+func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.mountNamespaceVFS2.IncRef()
+	return t.mountNamespaceVFS2
+}
+
 // AbstractSockets returns t's AbstractSocketNamespace.
 func (t *Task) AbstractSockets() *AbstractSocketNamespace {
 	return t.abstractSockets
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 53d4d211b..ba74b4c1c 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -199,6 +199,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		ipcns = NewIPCNamespace(userns)
 	}
 
+	// TODO(b/63601033): Implement CLONE_NEWNS.
+	mntnsVFS2 := t.mountNamespaceVFS2
+	if mntnsVFS2 != nil {
+		mntnsVFS2.IncRef()
+	}
+
 	tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace)
 	if err != nil {
 		return 0, nil, err
@@ -241,7 +247,9 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	rseqAddr := usermem.Addr(0)
 	rseqSignature := uint32(0)
 	if opts.NewThreadGroup {
-		tg.mounts.IncRef()
+		if tg.mounts != nil {
+			tg.mounts.IncRef()
+		}
 		sh := t.tg.signalHandlers
 		if opts.NewSignalHandlers {
 			sh = sh.Fork()
@@ -265,6 +273,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		UTSNamespace:            utsns,
 		IPCNamespace:            ipcns,
 		AbstractSocketNamespace: t.abstractSockets,
+		MountNamespaceVFS2:      mntnsVFS2,
 		RSeqAddr:                rseqAddr,
 		RSeqSignature:           rseqSignature,
 		ContainerID:             t.ContainerID(),
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 2d6e7733c..2be982684 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -136,7 +136,7 @@ func (t *Task) Stack() *arch.Stack {
 func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) {
 	// If File is not nil, we should load that instead of resolving Filename.
 	if args.File != nil {
-		args.Filename = args.File.MappedName(ctx)
+		args.Filename = args.File.PathnameWithDeleted(ctx)
 	}
 
 	// Prepare a new user address space to load into.
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index 435761e5a..c4ade6e8e 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -269,6 +269,13 @@ func (*runExitMain) execute(t *Task) taskRunState {
 	t.fsContext.DecRef()
 	t.fdTable.DecRef()
 
+	t.mu.Lock()
+	if t.mountNamespaceVFS2 != nil {
+		t.mountNamespaceVFS2.DecRef()
+		t.mountNamespaceVFS2 = nil
+	}
+	t.mu.Unlock()
+
 	// If this is the last task to exit from the thread group, release the
 	// thread group's resources.
 	if lastExiter {
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index 41259210c..6d737d3e5 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -198,18 +198,11 @@ func (t *Task) traceExecEvent(tc *TaskContext) {
 	if !trace.IsEnabled() {
 		return
 	}
-	d := tc.MemoryManager.Executable()
-	if d == nil {
+	file := tc.MemoryManager.Executable()
+	if file == nil {
 		trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
 		return
 	}
-	defer d.DecRef()
-	root := t.fsContext.RootDirectory()
-	if root == nil {
-		trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>")
-		return
-	}
-	defer root.DecRef()
-	n, _ := d.FullName(root)
-	trace.Logf(t.traceContext, traceCategory, "exec: %s", n)
+	defer file.DecRef()
+	trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t))
 }
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index de838beef..f9236a842 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -80,6 +81,9 @@ type TaskConfig struct {
 	// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
 	AbstractSocketNamespace *AbstractSocketNamespace
 
+	// MountNamespaceVFS2 is the MountNamespace of the new task.
+	MountNamespaceVFS2 *vfs.MountNamespace
+
 	// RSeqAddr is a pointer to the the userspace linux.RSeq structure.
 	RSeqAddr usermem.Addr
 
@@ -116,28 +120,29 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 			parent:   cfg.Parent,
 			children: make(map[*Task]struct{}),
 		},
-		runState:        (*runApp)(nil),
-		interruptChan:   make(chan struct{}, 1),
-		signalMask:      cfg.SignalMask,
-		signalStack:     arch.SignalStack{Flags: arch.SignalStackFlagDisable},
-		tc:              *tc,
-		fsContext:       cfg.FSContext,
-		fdTable:         cfg.FDTable,
-		p:               cfg.Kernel.Platform.NewContext(),
-		k:               cfg.Kernel,
-		ptraceTracees:   make(map[*Task]struct{}),
-		allowedCPUMask:  cfg.AllowedCPUMask.Copy(),
-		ioUsage:         &usage.IO{},
-		niceness:        cfg.Niceness,
-		netns:           cfg.NetworkNamespaced,
-		utsns:           cfg.UTSNamespace,
-		ipcns:           cfg.IPCNamespace,
-		abstractSockets: cfg.AbstractSocketNamespace,
-		rseqCPU:         -1,
-		rseqAddr:        cfg.RSeqAddr,
-		rseqSignature:   cfg.RSeqSignature,
-		futexWaiter:     futex.NewWaiter(),
-		containerID:     cfg.ContainerID,
+		runState:           (*runApp)(nil),
+		interruptChan:      make(chan struct{}, 1),
+		signalMask:         cfg.SignalMask,
+		signalStack:        arch.SignalStack{Flags: arch.SignalStackFlagDisable},
+		tc:                 *tc,
+		fsContext:          cfg.FSContext,
+		fdTable:            cfg.FDTable,
+		p:                  cfg.Kernel.Platform.NewContext(),
+		k:                  cfg.Kernel,
+		ptraceTracees:      make(map[*Task]struct{}),
+		allowedCPUMask:     cfg.AllowedCPUMask.Copy(),
+		ioUsage:            &usage.IO{},
+		niceness:           cfg.Niceness,
+		netns:              cfg.NetworkNamespaced,
+		utsns:              cfg.UTSNamespace,
+		ipcns:              cfg.IPCNamespace,
+		abstractSockets:    cfg.AbstractSocketNamespace,
+		mountNamespaceVFS2: cfg.MountNamespaceVFS2,
+		rseqCPU:            -1,
+		rseqAddr:           cfg.RSeqAddr,
+		rseqSignature:      cfg.RSeqSignature,
+		futexWaiter:        futex.NewWaiter(),
+		containerID:        cfg.ContainerID,
 	}
 	t.creds.Store(cfg.Credentials)
 	t.endStopCond.L = &t.tg.signalHandlers.mu
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 768e958d2..268f62e9d 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -256,7 +256,7 @@ type ThreadGroup struct {
 	tty *TTY
 }
 
-// NewThreadGroup returns a new, empty thread group in PID namespace ns. The
+// NewThreadGroup returns a new, empty thread group in PID namespace pidns. The
 // thread group leader will send its parent terminationSignal when it exits.
 // The new thread group isn't visible to the system until a task has been
 // created inside of it by a successful call to TaskSet.NewTask.
@@ -317,7 +317,9 @@ func (tg *ThreadGroup) release() {
 	for _, it := range its {
 		it.DestroyTimer()
 	}
-	tg.mounts.DecRef()
+	if tg.mounts != nil {
+		tg.mounts.DecRef()
+	}
 }
 
 // forEachChildThreadGroupLocked indicates over all child ThreadGroups.
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
index 23790378a..c6aa65f28 100644
--- a/pkg/sentry/loader/BUILD
+++ b/pkg/sentry/loader/BUILD
@@ -33,6 +33,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/limits",
         "//pkg/sentry/memmap",
@@ -40,6 +41,7 @@ go_library(
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/uniqueid",
         "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 122ed05c2..616fafa2c 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -27,7 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -97,11 +97,11 @@ type elfInfo struct {
 // accepts from the ELF, and it doesn't parse unnecessary parts of the file.
 //
 // ctx may be nil if f does not need it.
-func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
+func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) {
 	// Check ident first; it will tell us the endianness of the rest of the
 	// structs.
 	var ident [elf.EI_NIDENT]byte
-	_, err := readFull(ctx, f, usermem.BytesIOSequence(ident[:]), 0)
+	_, err := f.ReadFull(ctx, usermem.BytesIOSequence(ident[:]), 0)
 	if err != nil {
 		log.Infof("Error reading ELF ident: %v", err)
 		// The entire ident array always exists.
@@ -137,7 +137,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
 
 	var hdr elf.Header64
 	hdrBuf := make([]byte, header64Size)
-	_, err = readFull(ctx, f, usermem.BytesIOSequence(hdrBuf), 0)
+	_, err = f.ReadFull(ctx, usermem.BytesIOSequence(hdrBuf), 0)
 	if err != nil {
 		log.Infof("Error reading ELF header: %v", err)
 		// The entire header always exists.
@@ -187,7 +187,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
 	}
 
 	phdrBuf := make([]byte, totalPhdrSize)
-	_, err = readFull(ctx, f, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff))
+	_, err = f.ReadFull(ctx, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff))
 	if err != nil {
 		log.Infof("Error reading ELF phdrs: %v", err)
 		// If phdrs were specified, they should all exist.
@@ -227,7 +227,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
 
 // mapSegment maps a phdr into the Task. offset is the offset to apply to
 // phdr.Vaddr.
-func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.ProgHeader, offset usermem.Addr) error {
+func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr *elf.ProgHeader, offset usermem.Addr) error {
 	// We must make a page-aligned mapping.
 	adjust := usermem.Addr(phdr.Vaddr).PageOffset()
 
@@ -395,7 +395,7 @@ type loadedELF struct {
 //
 // Preconditions:
 //  * f is an ELF file
-func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) {
+func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) {
 	first := true
 	var start, end usermem.Addr
 	var interpreter string
@@ -431,7 +431,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
 			}
 
 			path := make([]byte, phdr.Filesz)
-			_, err := readFull(ctx, f, usermem.BytesIOSequence(path), int64(phdr.Off))
+			_, err := f.ReadFull(ctx, usermem.BytesIOSequence(path), int64(phdr.Off))
 			if err != nil {
 				// If an interpreter was specified, it should exist.
 				ctx.Infof("Error reading PT_INTERP path: %v", err)
@@ -564,7 +564,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
 // Preconditions:
 //  * f is an ELF file
 //  * f is the first ELF loaded into m
-func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) {
+func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f fsbridge.File) (loadedELF, arch.Context, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
 		ctx.Infof("Failed to parse initial ELF: %v", err)
@@ -602,7 +602,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS
 //
 // Preconditions:
 //  * f is an ELF file
-func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, initial loadedELF) (loadedELF, error) {
+func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
 		if err == syserror.ENOEXEC {
@@ -649,16 +649,14 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error
 		// Refresh the traversal limit.
 		*args.RemainingTraversals = linux.MaxSymlinkTraversals
 		args.Filename = bin.interpreter
-		d, i, err := openPath(ctx, args)
+		intFile, err := openPath(ctx, args)
 		if err != nil {
 			ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err)
 			return loadedELF{}, nil, err
 		}
-		defer i.DecRef()
-		// We don't need the Dirent.
-		d.DecRef()
+		defer intFile.DecRef()
 
-		interp, err = loadInterpreterELF(ctx, args.MemoryManager, i, bin)
+		interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin)
 		if err != nil {
 			ctx.Infof("Error loading interpreter: %v", err)
 			return loadedELF{}, nil, err
diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go
index 098a45d36..3886b4d33 100644
--- a/pkg/sentry/loader/interpreter.go
+++ b/pkg/sentry/loader/interpreter.go
@@ -19,7 +19,7 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -37,9 +37,9 @@ const (
 )
 
 // parseInterpreterScript returns the interpreter path and argv.
-func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, argv []string) (newpath string, newargv []string, err error) {
+func parseInterpreterScript(ctx context.Context, filename string, f fsbridge.File, argv []string) (newpath string, newargv []string, err error) {
 	line := make([]byte, interpMaxLineLength)
-	n, err := readFull(ctx, f, usermem.BytesIOSequence(line), 0)
+	n, err := f.ReadFull(ctx, usermem.BytesIOSequence(line), 0)
 	// Short read is OK.
 	if err != nil && err != io.ErrUnexpectedEOF {
 		if err == io.EOF {
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 9a613d6b7..d6675b8f0 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -20,7 +20,6 @@ import (
 	"fmt"
 	"io"
 	"path"
-	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -29,8 +28,10 @@ import (
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -41,16 +42,6 @@ type LoadArgs struct {
 	// MemoryManager is the memory manager to load the executable into.
 	MemoryManager *mm.MemoryManager
 
-	// Mounts is the mount namespace in which to look up Filename.
-	Mounts *fs.MountNamespace
-
-	// Root is the root directory under which to look up Filename.
-	Root *fs.Dirent
-
-	// WorkingDirectory is the working directory under which to look up
-	// Filename.
-	WorkingDirectory *fs.Dirent
-
 	// RemainingTraversals is the maximum number of symlinks to follow to
 	// resolve Filename. This counter is passed by reference to keep it
 	// updated throughout the call stack.
@@ -65,7 +56,12 @@ type LoadArgs struct {
 
 	// File is an open fs.File object of the executable. If File is not
 	// nil, then File will be loaded and Filename will be ignored.
-	File *fs.File
+	//
+	// The caller is responsible for checking that the user can execute this file.
+	File fsbridge.File
+
+	// Opener is used to open the executable file when 'File' is nil.
+	Opener fsbridge.Lookup
 
 	// CloseOnExec indicates that the executable (or one of its parent
 	// directories) was opened with O_CLOEXEC. If the executable is an
@@ -106,103 +102,32 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in
 // installed in the Task FDTable. The caller takes ownership of both.
 //
 // args.Filename must be a readable, executable, regular file.
-func openPath(ctx context.Context, args LoadArgs) (*fs.Dirent, *fs.File, error) {
+func openPath(ctx context.Context, args LoadArgs) (fsbridge.File, error) {
 	if args.Filename == "" {
 		ctx.Infof("cannot open empty name")
-		return nil, nil, syserror.ENOENT
-	}
-
-	var d *fs.Dirent
-	var err error
-	if args.ResolveFinal {
-		d, err = args.Mounts.FindInode(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals)
-	} else {
-		d, err = args.Mounts.FindLink(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals)
-	}
-	if err != nil {
-		return nil, nil, err
-	}
-	// Defer a DecRef for the sake of failure cases.
-	defer d.DecRef()
-
-	if !args.ResolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
-		return nil, nil, syserror.ELOOP
-	}
-
-	if err := checkPermission(ctx, d); err != nil {
-		return nil, nil, err
-	}
-
-	// If they claim it's a directory, then make sure.
-	//
-	// N.B. we reject directories below, but we must first reject
-	// non-directories passed as directories.
-	if strings.HasSuffix(args.Filename, "/") && !fs.IsDir(d.Inode.StableAttr) {
-		return nil, nil, syserror.ENOTDIR
-	}
-
-	if err := checkIsRegularFile(ctx, d, args.Filename); err != nil {
-		return nil, nil, err
-	}
-
-	f, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
-	if err != nil {
-		return nil, nil, err
-	}
-	// Defer a DecRef for the sake of failure cases.
-	defer f.DecRef()
-
-	if err := checkPread(ctx, f, args.Filename); err != nil {
-		return nil, nil, err
-	}
-
-	d.IncRef()
-	f.IncRef()
-	return d, f, err
-}
-
-// checkFile performs checks on a file to be executed.
-func checkFile(ctx context.Context, f *fs.File, filename string) error {
-	if err := checkPermission(ctx, f.Dirent); err != nil {
-		return err
-	}
-
-	if err := checkIsRegularFile(ctx, f.Dirent, filename); err != nil {
-		return err
+		return nil, syserror.ENOENT
 	}
 
-	return checkPread(ctx, f, filename)
-}
-
-// checkPermission checks whether the file is readable and executable.
-func checkPermission(ctx context.Context, d *fs.Dirent) error {
-	perms := fs.PermMask{
-		// TODO(gvisor.dev/issue/160): Linux requires only execute
-		// permission, not read. However, our backing filesystems may
-		// prevent us from reading the file without read permission.
-		//
-		// Additionally, a task with a non-readable executable has
-		// additional constraints on access via ptrace and procfs.
-		Read:    true,
-		Execute: true,
+	// TODO(gvisor.dev/issue/160): Linux requires only execute permission,
+	// not read. However, our backing filesystems may prevent us from reading
+	// the file without read permission. Additionally, a task with a
+	// non-readable executable has additional constraints on access via
+	// ptrace and procfs.
+	opts := vfs.OpenOptions{
+		Flags:    linux.O_RDONLY,
+		FileExec: true,
 	}
-	return d.Inode.CheckPermission(ctx, perms)
+	return args.Opener.OpenPath(ctx, args.Filename, opts, args.RemainingTraversals, args.ResolveFinal)
 }
 
 // checkIsRegularFile prevents us from trying to execute a directory, pipe, etc.
-func checkIsRegularFile(ctx context.Context, d *fs.Dirent, filename string) error {
-	attr := d.Inode.StableAttr
-	if !fs.IsRegular(attr) {
-		ctx.Infof("%s is not regular: %v", filename, attr)
-		return syserror.EACCES
+func checkIsRegularFile(ctx context.Context, file fsbridge.File, filename string) error {
+	t, err := file.Type(ctx)
+	if err != nil {
+		return err
 	}
-	return nil
-}
-
-// checkPread checks whether we can read the file at arbitrary offsets.
-func checkPread(ctx context.Context, f *fs.File, filename string) error {
-	if !f.Flags().Pread {
-		ctx.Infof("%s cannot be read at an offset: %+v", filename, f.Flags())
+	if t != linux.ModeRegular {
+		ctx.Infof("%q is not a regular file: %v", filename, t)
 		return syserror.EACCES
 	}
 	return nil
@@ -224,8 +149,10 @@ const (
 	maxLoaderAttempts = 6
 )
 
-// loadExecutable loads an executable that is pointed to by args.File. If nil,
-// the path args.Filename is resolved and loaded. If the executable is an
+// loadExecutable loads an executable that is pointed to by args.File. The
+// caller is responsible for checking that the user can execute this file.
+// If nil, the path args.Filename is resolved and loaded (check that the user
+// can execute this file is done here in this case). If the executable is an
 // interpreter script rather than an ELF, the binary of the corresponding
 // interpreter will be loaded.
 //
@@ -234,37 +161,27 @@ const (
 //  * arch.Context matching the binary arch
 //  * fs.Dirent of the binary file
 //  * Possibly updated args.Argv
-func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
+func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, fsbridge.File, []string, error) {
 	for i := 0; i < maxLoaderAttempts; i++ {
-		var (
-			d   *fs.Dirent
-			err error
-		)
 		if args.File == nil {
-			d, args.File, err = openPath(ctx, args)
-			// We will return d in the successful case, but defer a DecRef for the
-			// sake of intermediate loops and failure cases.
-			if d != nil {
-				defer d.DecRef()
-			}
-			if args.File != nil {
-				defer args.File.DecRef()
+			var err error
+			args.File, err = openPath(ctx, args)
+			if err != nil {
+				ctx.Infof("Error opening %s: %v", args.Filename, err)
+				return loadedELF{}, nil, nil, nil, err
 			}
+			// Ensure file is release in case the code loops or errors out.
+			defer args.File.DecRef()
 		} else {
-			d = args.File.Dirent
-			d.IncRef()
-			defer d.DecRef()
-			err = checkFile(ctx, args.File, args.Filename)
-		}
-		if err != nil {
-			ctx.Infof("Error opening %s: %v", args.Filename, err)
-			return loadedELF{}, nil, nil, nil, err
+			if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil {
+				return loadedELF{}, nil, nil, nil, err
+			}
 		}
 
 		// Check the header. Is this an ELF or interpreter script?
 		var hdr [4]uint8
 		// N.B. We assume that reading from a regular file cannot block.
-		_, err = readFull(ctx, args.File, usermem.BytesIOSequence(hdr[:]), 0)
+		_, err := args.File.ReadFull(ctx, usermem.BytesIOSequence(hdr[:]), 0)
 		// Allow unexpected EOF, as a valid executable could be only three bytes
 		// (e.g., #!a).
 		if err != nil && err != io.ErrUnexpectedEOF {
@@ -281,9 +198,10 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 				ctx.Infof("Error loading ELF: %v", err)
 				return loadedELF{}, nil, nil, nil, err
 			}
-			// An ELF is always terminal. Hold on to d.
-			d.IncRef()
-			return loaded, ac, d, args.Argv, err
+			// An ELF is always terminal. Hold on to file.
+			args.File.IncRef()
+			return loaded, ac, args.File, args.Argv, err
+
 		case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)):
 			if args.CloseOnExec {
 				return loadedELF{}, nil, nil, nil, syserror.ENOENT
@@ -295,6 +213,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 			}
 			// Refresh the traversal limit for the interpreter.
 			*args.RemainingTraversals = linux.MaxSymlinkTraversals
+
 		default:
 			ctx.Infof("Unknown magic: %v", hdr)
 			return loadedELF{}, nil, nil, nil, syserror.ENOEXEC
@@ -317,11 +236,11 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 //  * Load is called on the Task goroutine.
 func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
 	// Load the executable itself.
-	loaded, ac, d, newArgv, err := loadExecutable(ctx, args)
+	loaded, ac, file, newArgv, err := loadExecutable(ctx, args)
 	if err != nil {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
 	}
-	defer d.DecRef()
+	defer file.DecRef()
 
 	// Load the VDSO.
 	vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded)
@@ -390,7 +309,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V
 	m.SetEnvvStart(sl.EnvvStart)
 	m.SetEnvvEnd(sl.EnvvEnd)
 	m.SetAuxv(auxv)
-	m.SetExecutable(d)
+	m.SetExecutable(file)
 
 	ac.SetIP(uintptr(loaded.entry))
 	ac.SetStack(uintptr(stack.Bottom))
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index 52f446ed7..161b28c2c 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -27,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -69,6 +70,8 @@ type byteReader struct {
 var _ fs.FileOperations = (*byteReader)(nil)
 
 // newByteReaderFile creates a fake file to read data from.
+//
+// TODO(gvisor.dev/issue/1623): Convert to VFS2.
 func newByteReaderFile(ctx context.Context, data []byte) *fs.File {
 	// Create a fake inode.
 	inode := fs.NewInode(
@@ -123,7 +126,7 @@ func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSeq
 // * PT_LOAD segments don't extend beyond the end of the file.
 //
 // ctx may be nil if f does not need it.
-func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) {
+func validateVDSO(ctx context.Context, f fsbridge.File, size uint64) (elfInfo, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
 		log.Infof("Unable to parse VDSO header: %v", err)
@@ -221,7 +224,7 @@ type VDSO struct {
 // PrepareVDSO validates the system VDSO and returns a VDSO, containing the
 // param page for updating by the kernel.
 func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
-	vdsoFile := newByteReaderFile(ctx, vdsoBin)
+	vdsoFile := fsbridge.NewFSFile(newByteReaderFile(ctx, vdsoBin))
 
 	// First make sure the VDSO is valid. vdsoFile does not use ctx, so a
 	// nil context can be passed.
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index e5729ced5..73591dab7 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -105,8 +105,8 @@ go_library(
         "//pkg/safecopy",
         "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/fs",
         "//pkg/sentry/fs/proc/seqfile",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/futex",
         "//pkg/sentry/kernel/shm",
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go
index f550acae0..6a49334f4 100644
--- a/pkg/sentry/mm/metadata.go
+++ b/pkg/sentry/mm/metadata.go
@@ -16,7 +16,7 @@ package mm
 
 import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -132,7 +132,7 @@ func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) {
 //
 // An additional reference will be taken in the case of a non-nil executable,
 // which must be released by the caller.
-func (mm *MemoryManager) Executable() *fs.Dirent {
+func (mm *MemoryManager) Executable() fsbridge.File {
 	mm.metadataMu.Lock()
 	defer mm.metadataMu.Unlock()
 
@@ -147,15 +147,15 @@ func (mm *MemoryManager) Executable() *fs.Dirent {
 // SetExecutable sets the executable.
 //
 // This takes a reference on d.
-func (mm *MemoryManager) SetExecutable(d *fs.Dirent) {
+func (mm *MemoryManager) SetExecutable(file fsbridge.File) {
 	mm.metadataMu.Lock()
 
 	// Grab a new reference.
-	d.IncRef()
+	file.IncRef()
 
 	// Set the executable.
 	orig := mm.executable
-	mm.executable = d
+	mm.executable = file
 
 	mm.metadataMu.Unlock()
 
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 09e582dd3..637383c7a 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -37,7 +37,7 @@ package mm
 import (
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
@@ -215,7 +215,7 @@ type MemoryManager struct {
 	// is not nil, it holds a reference on the Dirent.
 	//
 	// executable is protected by metadataMu.
-	executable *fs.Dirent
+	executable fsbridge.File
 
 	// dumpability describes if and how this MemoryManager may be dumped to
 	// userspace.
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index a796b2396..46cb2a1cc 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -141,6 +141,10 @@ func path(t *kernel.Task, addr usermem.Addr) string {
 }
 
 func fd(t *kernel.Task, fd int32) string {
+	if kernel.VFS2Enabled {
+		return fdVFS2(t, fd)
+	}
+
 	root := t.FSContext().RootDirectory()
 	if root != nil {
 		defer root.DecRef()
@@ -169,6 +173,30 @@ func fd(t *kernel.Task, fd int32) string {
 	return fmt.Sprintf("%#x %s", fd, name)
 }
 
+func fdVFS2(t *kernel.Task, fd int32) string {
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+
+	vfsObj := root.Mount().Filesystem().VirtualFilesystem()
+	if fd == linux.AT_FDCWD {
+		wd := t.FSContext().WorkingDirectoryVFS2()
+		defer wd.DecRef()
+
+		name, _ := vfsObj.PathnameWithDeleted(t, root, wd)
+		return fmt.Sprintf("AT_FDCWD %s", name)
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		// Cast FD to uint64 to avoid printing negative hex.
+		return fmt.Sprintf("%#x (bad FD)", uint64(fd))
+	}
+	defer file.DecRef()
+
+	name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry())
+	return fmt.Sprintf("%#x %s", fd, name)
+}
+
 func fdpair(t *kernel.Task, addr usermem.Addr) string {
 	var fds [2]int32
 	_, err := t.CopyIn(addr, &fds)
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index be16ee686..0d24fd3c4 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -74,6 +74,7 @@ go_library(
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fs/timerfd",
         "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/epoll",
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index 98db32d77..9c6728530 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -135,7 +136,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 			}
 
 			// Set the underlying executable.
-			t.MemoryManager().SetExecutable(file.Dirent)
+			t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file))
 
 		case linux.PR_SET_MM_AUXV,
 			linux.PR_SET_MM_START_CODE,
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 0c9e2255d..00915fdde 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/loader"
@@ -119,7 +120,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 	defer root.DecRef()
 
 	var wd *fs.Dirent
-	var executable *fs.File
+	var executable fsbridge.File
 	var closeOnExec bool
 	if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) {
 		// Even if the pathname is absolute, we may still need the wd
@@ -136,7 +137,15 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 		closeOnExec = fdFlags.CloseOnExec
 
 		if atEmptyPath && len(pathname) == 0 {
-			executable = f
+			// TODO(gvisor.dev/issue/160): Linux requires only execute permission,
+			// not read. However, our backing filesystems may prevent us from reading
+			// the file without read permission. Additionally, a task with a
+			// non-readable executable has additional constraints on access via
+			// ptrace and procfs.
+			if err := f.Dirent.Inode.CheckPermission(t, fs.PermMask{Read: true, Execute: true}); err != nil {
+				return 0, nil, err
+			}
+			executable = fsbridge.NewFSFile(f)
 		} else {
 			wd = f.Dirent
 			wd.IncRef()
@@ -152,9 +161,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 	// Load the new TaskContext.
 	remainingTraversals := uint(linux.MaxSymlinkTraversals)
 	loadArgs := loader.LoadArgs{
-		Mounts:              t.MountNamespace(),
-		Root:                root,
-		WorkingDirectory:    wd,
+		Opener:              fsbridge.NewFSLookup(t.MountNamespace(), root, wd),
 		RemainingTraversals: &remainingTraversals,
 		ResolveFinal:        resolveFinal,
 		Filename:            pathname,
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
index c134714ee..e0ac32b33 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -22,4 +22,110 @@ import (
 // Override syscall table to add syscalls implementations from this package.
 func Override(table map[uintptr]kernel.Syscall) {
 	table[0] = syscalls.Supported("read", Read)
+
+	// Remove syscalls that haven't been converted yet. It's better to get ENOSYS
+	// rather than a SIGSEGV deep in the stack.
+	delete(table, 1)   // write
+	delete(table, 2)   // open
+	delete(table, 3)   // close
+	delete(table, 4)   // stat
+	delete(table, 5)   // fstat
+	delete(table, 6)   // lstat
+	delete(table, 7)   // poll
+	delete(table, 8)   // lseek
+	delete(table, 9)   // mmap
+	delete(table, 16)  // ioctl
+	delete(table, 17)  // pread64
+	delete(table, 18)  // pwrite64
+	delete(table, 19)  // readv
+	delete(table, 20)  // writev
+	delete(table, 21)  // access
+	delete(table, 22)  // pipe
+	delete(table, 32)  // dup
+	delete(table, 33)  // dup2
+	delete(table, 40)  // sendfile
+	delete(table, 59)  // execve
+	delete(table, 72)  // fcntl
+	delete(table, 73)  // flock
+	delete(table, 74)  // fsync
+	delete(table, 75)  // fdatasync
+	delete(table, 76)  // truncate
+	delete(table, 77)  // ftruncate
+	delete(table, 78)  // getdents
+	delete(table, 79)  // getcwd
+	delete(table, 80)  // chdir
+	delete(table, 81)  // fchdir
+	delete(table, 82)  // rename
+	delete(table, 83)  // mkdir
+	delete(table, 84)  // rmdir
+	delete(table, 85)  // creat
+	delete(table, 86)  // link
+	delete(table, 87)  // unlink
+	delete(table, 88)  // symlink
+	delete(table, 89)  // readlink
+	delete(table, 90)  // chmod
+	delete(table, 91)  // fchmod
+	delete(table, 92)  // chown
+	delete(table, 93)  // fchown
+	delete(table, 94)  // lchown
+	delete(table, 133) // mknod
+	delete(table, 137) // statfs
+	delete(table, 138) // fstatfs
+	delete(table, 161) // chroot
+	delete(table, 162) // sync
+	delete(table, 165) // mount
+	delete(table, 166) // umount2
+	delete(table, 172) // iopl
+	delete(table, 173) // ioperm
+	delete(table, 187) // readahead
+	delete(table, 188) // setxattr
+	delete(table, 189) // lsetxattr
+	delete(table, 190) // fsetxattr
+	delete(table, 191) // getxattr
+	delete(table, 192) // lgetxattr
+	delete(table, 193) // fgetxattr
+	delete(table, 206) // io_setup
+	delete(table, 207) // io_destroy
+	delete(table, 208) // io_getevents
+	delete(table, 209) // io_submit
+	delete(table, 210) // io_cancel
+	delete(table, 213) // epoll_create
+	delete(table, 214) // epoll_ctl_old
+	delete(table, 215) // epoll_wait_old
+	delete(table, 216) // remap_file_pages
+	delete(table, 217) // getdents64
+	delete(table, 232) // epoll_wait
+	delete(table, 233) // epoll_ctl
+	delete(table, 253) // inotify_init
+	delete(table, 254) // inotify_add_watch
+	delete(table, 255) // inotify_rm_watch
+	delete(table, 257) // openat
+	delete(table, 258) // mkdirat
+	delete(table, 259) // mknodat
+	delete(table, 260) // fchownat
+	delete(table, 261) // futimesat
+	delete(table, 262) // fstatat
+	delete(table, 263) // unlinkat
+	delete(table, 264) // renameat
+	delete(table, 265) // linkat
+	delete(table, 266) // symlinkat
+	delete(table, 267) // readlinkat
+	delete(table, 268) // fchmodat
+	delete(table, 269) // faccessat
+	delete(table, 270) // pselect
+	delete(table, 271) // ppoll
+	delete(table, 285) // fallocate
+	delete(table, 291) // epoll_create1
+	delete(table, 292) // dup3
+	delete(table, 293) // pipe2
+	delete(table, 294) // inotify_init1
+	delete(table, 295) // preadv
+	delete(table, 296) // pwritev
+	delete(table, 306) // syncfs
+	delete(table, 316) // renameat2
+	delete(table, 319) // memfd_create
+	delete(table, 322) // execveat
+	delete(table, 327) // preadv2
+	delete(table, 328) // pwritev2
+	delete(table, 332) // statx
 }
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 14b39eb9d..0b4f18ab5 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -43,6 +43,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/log",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index d97362b9a..82781e6d3 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -29,9 +29,10 @@ const (
 	CtxRoot
 )
 
-// MountNamespaceFromContext returns the MountNamespace used by ctx. It does
-// not take a reference on the returned MountNamespace. If ctx is not
-// associated with a MountNamespace, MountNamespaceFromContext returns nil.
+// MountNamespaceFromContext returns the MountNamespace used by ctx. If ctx is
+// not associated with a MountNamespace, MountNamespaceFromContext returns nil.
+//
+// A reference is taken on the returned MountNamespace.
 func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
 	if v := ctx.Value(CtxMountNamespace); v != nil {
 		return v.(*MountNamespace)
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 1fbb420f9..ad2c9fcf4 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -114,6 +114,7 @@ type MountNamespace struct {
 func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
 	rft := vfs.getFilesystemType(fsTypeName)
 	if rft == nil {
+		ctx.Warningf("Unknown filesystem: %s", fsTypeName)
 		return nil, syserror.ENODEV
 	}
 	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
@@ -231,9 +232,12 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
 		return syserror.EINVAL
 	}
 	vfs.mountMu.Lock()
-	if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns {
-		vfs.mountMu.Unlock()
-		return syserror.EINVAL
+	if mntns := MountNamespaceFromContext(ctx); mntns != nil {
+		defer mntns.DecRef()
+		if mntns != vd.mount.ns {
+			vfs.mountMu.Unlock()
+			return syserror.EINVAL
+		}
 	}
 
 	// TODO(jamieliu): Linux special-cases umount of the caller's root, which
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index fdf8be157..6af7fdac1 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -61,7 +61,7 @@ type MountOptions struct {
 type OpenOptions struct {
 	// Flags contains access mode and flags as specified for open(2).
 	//
-	// FilesystemImpls is reponsible for implementing the following flags:
+	// FilesystemImpls are responsible for implementing the following flags:
 	// O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC,
 	// O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and
 	// O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 9629afee9..51deae313 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -393,7 +393,8 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 					// be executed.
 					return nil, syserror.EACCES
 				}
-				if linux.FileMode(stat.Mode).FileType() != linux.ModeRegular {
+				if t := linux.FileMode(stat.Mode).FileType(); t != linux.ModeRegular {
+					ctx.Infof("%q is not a regular file: %v", pop.Path, t)
 					return nil, syserror.EACCES
 				}
 			}
@@ -743,6 +744,8 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
 // VirtualDentry methods require that a reference is held on the VirtualDentry.
 //
 // VirtualDentry is analogous to Linux's struct path.
+//
+// +stateify savable
 type VirtualDentry struct {
 	mount  *Mount
 	dentry *Dentry
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 9f0d5d7af..239ca5302 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -795,16 +795,19 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		return 0, fmt.Errorf("container %q not started", args.ContainerID)
 	}
 
+	// TODO(gvisor.dev/issue/1623): Add VFS2 support
+
 	// Get the container MountNamespace from the Task.
 	tg.Leader().WithMuLocked(func(t *kernel.Task) {
-		// task.MountNamespace() does not take a ref, so we must do so
-		// ourselves.
+		// task.MountNamespace() does not take a ref, so we must do so ourselves.
 		args.MountNamespace = t.MountNamespace()
 		args.MountNamespace.IncRef()
 	})
-	defer args.MountNamespace.DecRef()
+	if args.MountNamespace != nil {
+		defer args.MountNamespace.DecRef()
+	}
 
-	// Add the HOME enviroment varible if it is not already set.
+	// Add the HOME environment variable if it is not already set.
 	root := args.MountNamespace.Root()
 	defer root.DecRef()
 	ctx := fs.WithRoot(l.k.SupervisorContext(), root)
-- 
cgit v1.2.3


From 3c26f5ecb0087337b1f194b6d429ce68f3af70eb Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 12:07:08 -0800
Subject: Enable automated marshalling for struct stat.

This requires fixing a few build issues for non-am64 platforms.

PiperOrigin-RevId: 295196922
---
 pkg/abi/linux/BUILD                         |  1 +
 pkg/abi/linux/file_amd64.go                 |  2 +
 pkg/abi/linux/file_arm64.go                 |  2 +
 pkg/abi/linux/time.go                       |  2 +
 pkg/sentry/kernel/BUILD                     |  1 +
 pkg/sentry/syscalls/linux/BUILD             |  2 -
 pkg/sentry/syscalls/linux/sys_stat.go       | 26 +++++++++-
 pkg/sentry/syscalls/linux/sys_stat_amd64.go | 75 ----------------------------
 pkg/sentry/syscalls/linux/sys_stat_arm64.go | 77 -----------------------------
 9 files changed, 32 insertions(+), 156 deletions(-)
 delete mode 100644 pkg/sentry/syscalls/linux/sys_stat_amd64.go
 delete mode 100644 pkg/sentry/syscalls/linux/sys_stat_arm64.go

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 1f3c0c687..b7015367b 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -59,6 +59,7 @@ go_library(
         "wait.go",
         "xattr.go",
     ],
+    marshal = True,
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/abi",
diff --git a/pkg/abi/linux/file_amd64.go b/pkg/abi/linux/file_amd64.go
index 8693d49c8..6b72364ea 100644
--- a/pkg/abi/linux/file_amd64.go
+++ b/pkg/abi/linux/file_amd64.go
@@ -25,6 +25,8 @@ const (
 )
 
 // Stat represents struct stat.
+//
+// +marshal
 type Stat struct {
 	Dev     uint64
 	Ino     uint64
diff --git a/pkg/abi/linux/file_arm64.go b/pkg/abi/linux/file_arm64.go
index ea3adc5f5..6492c9038 100644
--- a/pkg/abi/linux/file_arm64.go
+++ b/pkg/abi/linux/file_arm64.go
@@ -25,6 +25,8 @@ const (
 )
 
 // Stat represents struct stat.
+//
+// +marshal
 type Stat struct {
 	Dev     uint64
 	Ino     uint64
diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go
index 5c5a58cd4..e562b46d9 100644
--- a/pkg/abi/linux/time.go
+++ b/pkg/abi/linux/time.go
@@ -101,6 +101,8 @@ func NsecToTimeT(nsec int64) TimeT {
 }
 
 // Timespec represents struct timespec in <time.h>.
+//
+// +marshal
 type Timespec struct {
 	Sec  int64
 	Nsec int64
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 46306945f..beba29a09 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -201,6 +201,7 @@ go_library(
         "//pkg/tcpip/stack",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/marshal",
     ],
 )
 
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 0d24fd3c4..c7883e68e 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -42,8 +42,6 @@ go_library(
         "sys_socket.go",
         "sys_splice.go",
         "sys_stat.go",
-        "sys_stat_amd64.go",
-        "sys_stat_arm64.go",
         "sys_sync.go",
         "sys_sysinfo.go",
         "sys_syslog.go",
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index c841abccb..8b66a9006 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -23,6 +23,24 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+	return linux.Stat{
+		Dev:     sattr.DeviceID,
+		Ino:     sattr.InodeID,
+		Nlink:   uattr.Links,
+		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+		Size:    uattr.Size,
+		Blksize: sattr.BlockSize,
+		Blocks:  uattr.Usage / 512,
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	}
+}
+
 // Stat implements linux syscall stat(2).
 func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
@@ -112,7 +130,9 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err
 	if err != nil {
 		return err
 	}
-	return copyOutStat(t, statAddr, d.Inode.StableAttr, uattr)
+	s := statFromAttrs(t, d.Inode.StableAttr, uattr)
+	_, err = s.CopyOut(t, statAddr)
+	return err
 }
 
 // fstat implements fstat for the given *fs.File.
@@ -121,7 +141,9 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error {
 	if err != nil {
 		return err
 	}
-	return copyOutStat(t, statAddr, f.Dirent.Inode.StableAttr, uattr)
+	s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr)
+	_, err = s.CopyOut(t, statAddr)
+	return err
 }
 
 // Statx implements linux syscall statx(2).
diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
deleted file mode 100644
index 75a567bd4..000000000
--- a/pkg/sentry/syscalls/linux/sys_stat_amd64.go
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//+build amd64
-
-package linux
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
-// address dst in t's address space. It encodes the stat struct to bytes
-// manually, as stat() is a very common syscall for many applications, and
-// t.CopyObjectOut has noticeable performance impact due to its many slice
-// allocations and use of reflection.
-func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
-	b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
-
-	// Dev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
-	// Ino (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
-	// Nlink (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links)
-	// Mode (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
-	// UID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
-	// GID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
-	// Padding (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, 0)
-	// Rdev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
-	// Size (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
-	// Blksize (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.BlockSize))
-	// Blocks (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
-
-	// ATime
-	atime := uattr.AccessTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
-
-	// MTime
-	mtime := uattr.ModificationTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
-
-	// CTime
-	ctime := uattr.StatusChangeTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
-
-	_, err := t.CopyOutBytes(dst, b)
-	return err
-}
diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
deleted file mode 100644
index 80c98d05c..000000000
--- a/pkg/sentry/syscalls/linux/sys_stat_arm64.go
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//+build arm64
-
-package linux
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
-// address dst in t's address space. It encodes the stat struct to bytes
-// manually, as stat() is a very common syscall for many applications, and
-// t.CopyObjectOut has noticeable performance impact due to its many slice
-// allocations and use of reflection.
-func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
-	b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
-
-	// Dev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
-	// Ino (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
-	// Mode (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
-	// Nlink (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Links))
-	// UID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
-	// GID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
-	// Rdev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
-	// Padding (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, 0)
-	// Size (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
-	// Blksize (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(sattr.BlockSize))
-	// Padding (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, 0)
-	// Blocks (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
-
-	// ATime
-	atime := uattr.AccessTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
-
-	// MTime
-	mtime := uattr.ModificationTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
-
-	// CTime
-	ctime := uattr.StatusChangeTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
-
-	_, err := t.CopyOutBytes(dst, b)
-	return err
-}
-- 
cgit v1.2.3


From 50c493193b72997a6b09f353fd9217349941c494 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 12:22:25 -0800
Subject: Un-export p9 message encode/decode functions.

These are not used outside of the p9 package.

PiperOrigin-RevId: 295200052
---
 pkg/p9/buffer.go             |  10 +-
 pkg/p9/messages.go           | 672 +++++++++++++++++++++----------------------
 pkg/p9/messages_test.go      |   4 +-
 pkg/p9/p9.go                 |  68 ++---
 pkg/p9/transport.go          |   4 +-
 pkg/p9/transport_flipcall.go |   4 +-
 pkg/p9/transport_test.go     |   8 +-
 7 files changed, 385 insertions(+), 385 deletions(-)

diff --git a/pkg/p9/buffer.go b/pkg/p9/buffer.go
index 249536d8a..6a4951821 100644
--- a/pkg/p9/buffer.go
+++ b/pkg/p9/buffer.go
@@ -20,16 +20,16 @@ import (
 
 // encoder is used for messages and 9P primitives.
 type encoder interface {
-	// Decode decodes from the given buffer. Decode may be called more than once
+	// decode decodes from the given buffer. decode may be called more than once
 	// to reuse the instance. It must clear any previous state.
 	//
 	// This may not fail, exhaustion will be recorded in the buffer.
-	Decode(b *buffer)
+	decode(b *buffer)
 
-	// Encode encodes to the given buffer.
+	// encode encodes to the given buffer.
 	//
 	// This may not fail.
-	Encode(b *buffer)
+	encode(b *buffer)
 }
 
 // order is the byte order used for encoding.
@@ -39,7 +39,7 @@ var order = binary.LittleEndian
 //
 // This is passed to the encoder methods.
 type buffer struct {
-	// data is the underlying data. This may grow during Encode.
+	// data is the underlying data. This may grow during encode.
 	data []byte
 
 	// overflow indicates whether an overflow has occurred.
diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go
index b1cede5f5..3863ad1f5 100644
--- a/pkg/p9/messages.go
+++ b/pkg/p9/messages.go
@@ -51,7 +51,7 @@ type payloader interface {
 	// SetPayload returns the decoded message.
 	//
 	// This is going to be total message size - FixedSize. But this should
-	// be validated during Decode, which will be called after SetPayload.
+	// be validated during decode, which will be called after SetPayload.
 	SetPayload([]byte)
 }
 
@@ -90,14 +90,14 @@ type Tversion struct {
 	Version string
 }
 
-// Decode implements encoder.Decode.
-func (t *Tversion) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tversion) decode(b *buffer) {
 	t.MSize = b.Read32()
 	t.Version = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tversion) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tversion) encode(b *buffer) {
 	b.Write32(t.MSize)
 	b.WriteString(t.Version)
 }
@@ -121,14 +121,14 @@ type Rversion struct {
 	Version string
 }
 
-// Decode implements encoder.Decode.
-func (r *Rversion) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rversion) decode(b *buffer) {
 	r.MSize = b.Read32()
 	r.Version = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rversion) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rversion) encode(b *buffer) {
 	b.Write32(r.MSize)
 	b.WriteString(r.Version)
 }
@@ -149,13 +149,13 @@ type Tflush struct {
 	OldTag Tag
 }
 
-// Decode implements encoder.Decode.
-func (t *Tflush) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tflush) decode(b *buffer) {
 	t.OldTag = b.ReadTag()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tflush) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tflush) encode(b *buffer) {
 	b.WriteTag(t.OldTag)
 }
 
@@ -173,12 +173,12 @@ func (t *Tflush) String() string {
 type Rflush struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rflush) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rflush) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rflush) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rflush) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -203,8 +203,8 @@ type Twalk struct {
 	Names []string
 }
 
-// Decode implements encoder.Decode.
-func (t *Twalk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Twalk) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.NewFID = b.ReadFID()
 	n := b.Read16()
@@ -214,8 +214,8 @@ func (t *Twalk) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
-func (t *Twalk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Twalk) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteFID(t.NewFID)
 	b.Write16(uint16(len(t.Names)))
@@ -240,22 +240,22 @@ type Rwalk struct {
 	QIDs []QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rwalk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rwalk) decode(b *buffer) {
 	n := b.Read16()
 	r.QIDs = r.QIDs[:0]
 	for i := 0; i < int(n); i++ {
 		var q QID
-		q.Decode(b)
+		q.decode(b)
 		r.QIDs = append(r.QIDs, q)
 	}
 }
 
-// Encode implements encoder.Encode.
-func (r *Rwalk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rwalk) encode(b *buffer) {
 	b.Write16(uint16(len(r.QIDs)))
 	for _, q := range r.QIDs {
-		q.Encode(b)
+		q.encode(b)
 	}
 }
 
@@ -275,13 +275,13 @@ type Tclunk struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tclunk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tclunk) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tclunk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tclunk) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -299,12 +299,12 @@ func (t *Tclunk) String() string {
 type Rclunk struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rclunk) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rclunk) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rclunk) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rclunk) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -325,13 +325,13 @@ type Tremove struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tremove) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tremove) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tremove) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tremove) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -349,12 +349,12 @@ func (t *Tremove) String() string {
 type Rremove struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rremove) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rremove) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rremove) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rremove) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -374,13 +374,13 @@ type Rlerror struct {
 	Error uint32
 }
 
-// Decode implements encoder.Decode.
-func (r *Rlerror) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rlerror) decode(b *buffer) {
 	r.Error = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rlerror) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rlerror) encode(b *buffer) {
 	b.Write32(r.Error)
 }
 
@@ -409,16 +409,16 @@ type Tauth struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tauth) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tauth) decode(b *buffer) {
 	t.AuthenticationFID = b.ReadFID()
 	t.UserName = b.ReadString()
 	t.AttachName = b.ReadString()
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tauth) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tauth) encode(b *buffer) {
 	b.WriteFID(t.AuthenticationFID)
 	b.WriteString(t.UserName)
 	b.WriteString(t.AttachName)
@@ -437,7 +437,7 @@ func (t *Tauth) String() string {
 
 // Rauth is an authentication response.
 //
-// Encode, Decode and Length are inherited directly from QID.
+// encode and decode are inherited directly from QID.
 type Rauth struct {
 	QID
 }
@@ -463,16 +463,16 @@ type Tattach struct {
 	Auth Tauth
 }
 
-// Decode implements encoder.Decode.
-func (t *Tattach) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tattach) decode(b *buffer) {
 	t.FID = b.ReadFID()
-	t.Auth.Decode(b)
+	t.Auth.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (t *Tattach) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tattach) encode(b *buffer) {
 	b.WriteFID(t.FID)
-	t.Auth.Encode(b)
+	t.Auth.encode(b)
 }
 
 // Type implements message.Type.
@@ -509,14 +509,14 @@ type Tlopen struct {
 	Flags OpenFlags
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlopen) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlopen) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Flags = b.ReadOpenFlags()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlopen) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlopen) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteOpenFlags(t.Flags)
 }
@@ -542,15 +542,15 @@ type Rlopen struct {
 	filePayload
 }
 
-// Decode implements encoder.Decode.
-func (r *Rlopen) Decode(b *buffer) {
-	r.QID.Decode(b)
+// decode implements encoder.decode.
+func (r *Rlopen) decode(b *buffer) {
+	r.QID.decode(b)
 	r.IoUnit = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rlopen) Encode(b *buffer) {
-	r.QID.Encode(b)
+// encode implements encoder.encode.
+func (r *Rlopen) encode(b *buffer) {
+	r.QID.encode(b)
 	b.Write32(r.IoUnit)
 }
 
@@ -587,8 +587,8 @@ type Tlcreate struct {
 	GID GID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlcreate) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlcreate) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 	t.OpenFlags = b.ReadOpenFlags()
@@ -596,8 +596,8 @@ func (t *Tlcreate) Decode(b *buffer) {
 	t.GID = b.ReadGID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlcreate) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlcreate) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 	b.WriteOpenFlags(t.OpenFlags)
@@ -617,7 +617,7 @@ func (t *Tlcreate) String() string {
 
 // Rlcreate is a create response.
 //
-// The Encode, Decode, etc. methods are inherited from Rlopen.
+// The encode, decode, etc. methods are inherited from Rlopen.
 type Rlcreate struct {
 	Rlopen
 }
@@ -647,16 +647,16 @@ type Tsymlink struct {
 	GID GID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tsymlink) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tsymlink) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Target = b.ReadString()
 	t.GID = b.ReadGID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tsymlink) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tsymlink) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
 	b.WriteString(t.Target)
@@ -679,14 +679,14 @@ type Rsymlink struct {
 	QID QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rsymlink) Decode(b *buffer) {
-	r.QID.Decode(b)
+// decode implements encoder.decode.
+func (r *Rsymlink) decode(b *buffer) {
+	r.QID.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rsymlink) Encode(b *buffer) {
-	r.QID.Encode(b)
+// encode implements encoder.encode.
+func (r *Rsymlink) encode(b *buffer) {
+	r.QID.encode(b)
 }
 
 // Type implements message.Type.
@@ -711,15 +711,15 @@ type Tlink struct {
 	Name string
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlink) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlink) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Target = b.ReadFID()
 	t.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlink) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlink) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteFID(t.Target)
 	b.WriteString(t.Name)
@@ -744,12 +744,12 @@ func (*Rlink) Type() MsgType {
 	return MsgRlink
 }
 
-// Decode implements encoder.Decode.
-func (*Rlink) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rlink) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rlink) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rlink) encode(*buffer) {
 }
 
 // String implements fmt.Stringer.
@@ -772,16 +772,16 @@ type Trenameat struct {
 	NewName string
 }
 
-// Decode implements encoder.Decode.
-func (t *Trenameat) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Trenameat) decode(b *buffer) {
 	t.OldDirectory = b.ReadFID()
 	t.OldName = b.ReadString()
 	t.NewDirectory = b.ReadFID()
 	t.NewName = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Trenameat) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Trenameat) encode(b *buffer) {
 	b.WriteFID(t.OldDirectory)
 	b.WriteString(t.OldName)
 	b.WriteFID(t.NewDirectory)
@@ -802,12 +802,12 @@ func (t *Trenameat) String() string {
 type Rrenameat struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rrenameat) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rrenameat) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rrenameat) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rrenameat) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -832,15 +832,15 @@ type Tunlinkat struct {
 	Flags uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Tunlinkat) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tunlinkat) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Flags = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tunlinkat) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tunlinkat) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
 	b.Write32(t.Flags)
@@ -860,12 +860,12 @@ func (t *Tunlinkat) String() string {
 type Runlinkat struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Runlinkat) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Runlinkat) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Runlinkat) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Runlinkat) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -893,15 +893,15 @@ type Trename struct {
 	Name string
 }
 
-// Decode implements encoder.Decode.
-func (t *Trename) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Trename) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Trename) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Trename) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
@@ -921,12 +921,12 @@ func (t *Trename) String() string {
 type Rrename struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rrename) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rrename) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rrename) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rrename) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -945,13 +945,13 @@ type Treadlink struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Treadlink) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Treadlink) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Treadlink) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Treadlink) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -971,13 +971,13 @@ type Rreadlink struct {
 	Target string
 }
 
-// Decode implements encoder.Decode.
-func (r *Rreadlink) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rreadlink) decode(b *buffer) {
 	r.Target = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rreadlink) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rreadlink) encode(b *buffer) {
 	b.WriteString(r.Target)
 }
 
@@ -1003,15 +1003,15 @@ type Tread struct {
 	Count uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Tread) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tread) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Offset = b.Read64()
 	t.Count = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tread) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tread) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.Write64(t.Offset)
 	b.Write32(t.Count)
@@ -1033,20 +1033,20 @@ type Rread struct {
 	Data []byte
 }
 
-// Decode implements encoder.Decode.
+// decode implements encoder.decode.
 //
 // Data is automatically decoded via Payload.
-func (r *Rread) Decode(b *buffer) {
+func (r *Rread) decode(b *buffer) {
 	count := b.Read32()
 	if count != uint32(len(r.Data)) {
 		b.markOverrun()
 	}
 }
 
-// Encode implements encoder.Encode.
+// encode implements encoder.encode.
 //
 // Data is automatically encoded via Payload.
-func (r *Rread) Encode(b *buffer) {
+func (r *Rread) encode(b *buffer) {
 	b.Write32(uint32(len(r.Data)))
 }
 
@@ -1087,8 +1087,8 @@ type Twrite struct {
 	Data []byte
 }
 
-// Decode implements encoder.Decode.
-func (t *Twrite) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Twrite) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Offset = b.Read64()
 	count := b.Read32()
@@ -1097,10 +1097,10 @@ func (t *Twrite) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
+// encode implements encoder.encode.
 //
 // This uses the buffer payload to avoid a copy.
-func (t *Twrite) Encode(b *buffer) {
+func (t *Twrite) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.Write64(t.Offset)
 	b.Write32(uint32(len(t.Data)))
@@ -1137,13 +1137,13 @@ type Rwrite struct {
 	Count uint32
 }
 
-// Decode implements encoder.Decode.
-func (r *Rwrite) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rwrite) decode(b *buffer) {
 	r.Count = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rwrite) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rwrite) encode(b *buffer) {
 	b.Write32(r.Count)
 }
 
@@ -1178,8 +1178,8 @@ type Tmknod struct {
 	GID GID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tmknod) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tmknod) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Mode = b.ReadFileMode()
@@ -1188,8 +1188,8 @@ func (t *Tmknod) Decode(b *buffer) {
 	t.GID = b.ReadGID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tmknod) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tmknod) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
 	b.WriteFileMode(t.Mode)
@@ -1214,14 +1214,14 @@ type Rmknod struct {
 	QID QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rmknod) Decode(b *buffer) {
-	r.QID.Decode(b)
+// decode implements encoder.decode.
+func (r *Rmknod) decode(b *buffer) {
+	r.QID.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rmknod) Encode(b *buffer) {
-	r.QID.Encode(b)
+// encode implements encoder.encode.
+func (r *Rmknod) encode(b *buffer) {
+	r.QID.encode(b)
 }
 
 // Type implements message.Type.
@@ -1249,16 +1249,16 @@ type Tmkdir struct {
 	GID GID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tmkdir) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tmkdir) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Permissions = b.ReadPermissions()
 	t.GID = b.ReadGID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tmkdir) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tmkdir) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
 	b.WritePermissions(t.Permissions)
@@ -1281,14 +1281,14 @@ type Rmkdir struct {
 	QID QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rmkdir) Decode(b *buffer) {
-	r.QID.Decode(b)
+// decode implements encoder.decode.
+func (r *Rmkdir) decode(b *buffer) {
+	r.QID.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rmkdir) Encode(b *buffer) {
-	r.QID.Encode(b)
+// encode implements encoder.encode.
+func (r *Rmkdir) encode(b *buffer) {
+	r.QID.encode(b)
 }
 
 // Type implements message.Type.
@@ -1310,16 +1310,16 @@ type Tgetattr struct {
 	AttrMask AttrMask
 }
 
-// Decode implements encoder.Decode.
-func (t *Tgetattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tgetattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
-	t.AttrMask.Decode(b)
+	t.AttrMask.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (t *Tgetattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tgetattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
-	t.AttrMask.Encode(b)
+	t.AttrMask.encode(b)
 }
 
 // Type implements message.Type.
@@ -1344,18 +1344,18 @@ type Rgetattr struct {
 	Attr Attr
 }
 
-// Decode implements encoder.Decode.
-func (r *Rgetattr) Decode(b *buffer) {
-	r.Valid.Decode(b)
-	r.QID.Decode(b)
-	r.Attr.Decode(b)
+// decode implements encoder.decode.
+func (r *Rgetattr) decode(b *buffer) {
+	r.Valid.decode(b)
+	r.QID.decode(b)
+	r.Attr.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rgetattr) Encode(b *buffer) {
-	r.Valid.Encode(b)
-	r.QID.Encode(b)
-	r.Attr.Encode(b)
+// encode implements encoder.encode.
+func (r *Rgetattr) encode(b *buffer) {
+	r.Valid.encode(b)
+	r.QID.encode(b)
+	r.Attr.encode(b)
 }
 
 // Type implements message.Type.
@@ -1380,18 +1380,18 @@ type Tsetattr struct {
 	SetAttr SetAttr
 }
 
-// Decode implements encoder.Decode.
-func (t *Tsetattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tsetattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
-	t.Valid.Decode(b)
-	t.SetAttr.Decode(b)
+	t.Valid.decode(b)
+	t.SetAttr.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (t *Tsetattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tsetattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
-	t.Valid.Encode(b)
-	t.SetAttr.Encode(b)
+	t.Valid.encode(b)
+	t.SetAttr.encode(b)
 }
 
 // Type implements message.Type.
@@ -1408,12 +1408,12 @@ func (t *Tsetattr) String() string {
 type Rsetattr struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rsetattr) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rsetattr) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rsetattr) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rsetattr) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1435,18 +1435,18 @@ type Tallocate struct {
 	Length uint64
 }
 
-// Decode implements encoder.Decode.
-func (t *Tallocate) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tallocate) decode(b *buffer) {
 	t.FID = b.ReadFID()
-	t.Mode.Decode(b)
+	t.Mode.decode(b)
 	t.Offset = b.Read64()
 	t.Length = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tallocate) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tallocate) encode(b *buffer) {
 	b.WriteFID(t.FID)
-	t.Mode.Encode(b)
+	t.Mode.encode(b)
 	b.Write64(t.Offset)
 	b.Write64(t.Length)
 }
@@ -1465,12 +1465,12 @@ func (t *Tallocate) String() string {
 type Rallocate struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rallocate) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rallocate) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rallocate) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rallocate) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1492,14 +1492,14 @@ type Tlistxattr struct {
 	Size uint64
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlistxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlistxattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Size = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlistxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlistxattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.Write64(t.Size)
 }
@@ -1520,8 +1520,8 @@ type Rlistxattr struct {
 	Xattrs []string
 }
 
-// Decode implements encoder.Decode.
-func (r *Rlistxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rlistxattr) decode(b *buffer) {
 	n := b.Read16()
 	r.Xattrs = r.Xattrs[:0]
 	for i := 0; i < int(n); i++ {
@@ -1529,8 +1529,8 @@ func (r *Rlistxattr) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
-func (r *Rlistxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rlistxattr) encode(b *buffer) {
 	b.Write16(uint16(len(r.Xattrs)))
 	for _, x := range r.Xattrs {
 		b.WriteString(x)
@@ -1559,15 +1559,15 @@ type Txattrwalk struct {
 	Name string
 }
 
-// Decode implements encoder.Decode.
-func (t *Txattrwalk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Txattrwalk) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.NewFID = b.ReadFID()
 	t.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Txattrwalk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Txattrwalk) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteFID(t.NewFID)
 	b.WriteString(t.Name)
@@ -1589,13 +1589,13 @@ type Rxattrwalk struct {
 	Size uint64
 }
 
-// Decode implements encoder.Decode.
-func (r *Rxattrwalk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rxattrwalk) decode(b *buffer) {
 	r.Size = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rxattrwalk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rxattrwalk) encode(b *buffer) {
 	b.Write64(r.Size)
 }
 
@@ -1627,16 +1627,16 @@ type Txattrcreate struct {
 	Flags uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Txattrcreate) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Txattrcreate) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 	t.AttrSize = b.Read64()
 	t.Flags = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Txattrcreate) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Txattrcreate) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 	b.Write64(t.AttrSize)
@@ -1657,12 +1657,12 @@ func (t *Txattrcreate) String() string {
 type Rxattrcreate struct {
 }
 
-// Decode implements encoder.Decode.
-func (r *Rxattrcreate) Decode(*buffer) {
+// decode implements encoder.decode.
+func (r *Rxattrcreate) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (r *Rxattrcreate) Encode(*buffer) {
+// encode implements encoder.encode.
+func (r *Rxattrcreate) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1687,15 +1687,15 @@ type Tgetxattr struct {
 	Size uint64
 }
 
-// Decode implements encoder.Decode.
-func (t *Tgetxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tgetxattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Size = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tgetxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tgetxattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 	b.Write64(t.Size)
@@ -1717,13 +1717,13 @@ type Rgetxattr struct {
 	Value string
 }
 
-// Decode implements encoder.Decode.
-func (r *Rgetxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rgetxattr) decode(b *buffer) {
 	r.Value = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rgetxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rgetxattr) encode(b *buffer) {
 	b.WriteString(r.Value)
 }
 
@@ -1752,16 +1752,16 @@ type Tsetxattr struct {
 	Flags uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Tsetxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tsetxattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Value = b.ReadString()
 	t.Flags = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tsetxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tsetxattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 	b.WriteString(t.Value)
@@ -1782,12 +1782,12 @@ func (t *Tsetxattr) String() string {
 type Rsetxattr struct {
 }
 
-// Decode implements encoder.Decode.
-func (r *Rsetxattr) Decode(*buffer) {
+// decode implements encoder.decode.
+func (r *Rsetxattr) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (r *Rsetxattr) Encode(*buffer) {
+// encode implements encoder.encode.
+func (r *Rsetxattr) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1809,14 +1809,14 @@ type Tremovexattr struct {
 	Name string
 }
 
-// Decode implements encoder.Decode.
-func (t *Tremovexattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tremovexattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tremovexattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tremovexattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 }
@@ -1835,12 +1835,12 @@ func (t *Tremovexattr) String() string {
 type Rremovexattr struct {
 }
 
-// Decode implements encoder.Decode.
-func (r *Rremovexattr) Decode(*buffer) {
+// decode implements encoder.decode.
+func (r *Rremovexattr) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (r *Rremovexattr) Encode(*buffer) {
+// encode implements encoder.encode.
+func (r *Rremovexattr) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1865,15 +1865,15 @@ type Treaddir struct {
 	Count uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Treaddir) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Treaddir) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Offset = b.Read64()
 	t.Count = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Treaddir) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Treaddir) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.Write64(t.Offset)
 	b.Write32(t.Count)
@@ -1907,14 +1907,14 @@ type Rreaddir struct {
 	payload []byte
 }
 
-// Decode implements encoder.Decode.
-func (r *Rreaddir) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rreaddir) decode(b *buffer) {
 	r.Count = b.Read32()
 	entriesBuf := buffer{data: r.payload}
 	r.Entries = r.Entries[:0]
 	for {
 		var d Dirent
-		d.Decode(&entriesBuf)
+		d.decode(&entriesBuf)
 		if entriesBuf.isOverrun() {
 			// Couldn't decode a complete entry.
 			break
@@ -1923,11 +1923,11 @@ func (r *Rreaddir) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
-func (r *Rreaddir) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rreaddir) encode(b *buffer) {
 	entriesBuf := buffer{}
 	for _, d := range r.Entries {
-		d.Encode(&entriesBuf)
+		d.encode(&entriesBuf)
 		if len(entriesBuf.data) >= int(r.Count) {
 			break
 		}
@@ -1972,13 +1972,13 @@ type Tfsync struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tfsync) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tfsync) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tfsync) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tfsync) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -1996,12 +1996,12 @@ func (t *Tfsync) String() string {
 type Rfsync struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rfsync) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rfsync) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rfsync) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rfsync) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -2020,13 +2020,13 @@ type Tstatfs struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tstatfs) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tstatfs) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tstatfs) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tstatfs) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -2046,14 +2046,14 @@ type Rstatfs struct {
 	FSStat FSStat
 }
 
-// Decode implements encoder.Decode.
-func (r *Rstatfs) Decode(b *buffer) {
-	r.FSStat.Decode(b)
+// decode implements encoder.decode.
+func (r *Rstatfs) decode(b *buffer) {
+	r.FSStat.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rstatfs) Encode(b *buffer) {
-	r.FSStat.Encode(b)
+// encode implements encoder.encode.
+func (r *Rstatfs) encode(b *buffer) {
+	r.FSStat.encode(b)
 }
 
 // Type implements message.Type.
@@ -2072,13 +2072,13 @@ type Tflushf struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tflushf) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tflushf) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tflushf) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tflushf) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -2096,12 +2096,12 @@ func (t *Tflushf) String() string {
 type Rflushf struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rflushf) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rflushf) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rflushf) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rflushf) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -2126,8 +2126,8 @@ type Twalkgetattr struct {
 	Names []string
 }
 
-// Decode implements encoder.Decode.
-func (t *Twalkgetattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Twalkgetattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.NewFID = b.ReadFID()
 	n := b.Read16()
@@ -2137,8 +2137,8 @@ func (t *Twalkgetattr) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
-func (t *Twalkgetattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Twalkgetattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteFID(t.NewFID)
 	b.Write16(uint16(len(t.Names)))
@@ -2169,26 +2169,26 @@ type Rwalkgetattr struct {
 	QIDs []QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rwalkgetattr) Decode(b *buffer) {
-	r.Valid.Decode(b)
-	r.Attr.Decode(b)
+// decode implements encoder.decode.
+func (r *Rwalkgetattr) decode(b *buffer) {
+	r.Valid.decode(b)
+	r.Attr.decode(b)
 	n := b.Read16()
 	r.QIDs = r.QIDs[:0]
 	for i := 0; i < int(n); i++ {
 		var q QID
-		q.Decode(b)
+		q.decode(b)
 		r.QIDs = append(r.QIDs, q)
 	}
 }
 
-// Encode implements encoder.Encode.
-func (r *Rwalkgetattr) Encode(b *buffer) {
-	r.Valid.Encode(b)
-	r.Attr.Encode(b)
+// encode implements encoder.encode.
+func (r *Rwalkgetattr) encode(b *buffer) {
+	r.Valid.encode(b)
+	r.Attr.encode(b)
 	b.Write16(uint16(len(r.QIDs)))
 	for _, q := range r.QIDs {
-		q.Encode(b)
+		q.encode(b)
 	}
 }
 
@@ -2210,15 +2210,15 @@ type Tucreate struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tucreate) Decode(b *buffer) {
-	t.Tlcreate.Decode(b)
+// decode implements encoder.decode.
+func (t *Tucreate) decode(b *buffer) {
+	t.Tlcreate.decode(b)
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tucreate) Encode(b *buffer) {
-	t.Tlcreate.Encode(b)
+// encode implements encoder.encode.
+func (t *Tucreate) encode(b *buffer) {
+	t.Tlcreate.encode(b)
 	b.WriteUID(t.UID)
 }
 
@@ -2255,15 +2255,15 @@ type Tumkdir struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tumkdir) Decode(b *buffer) {
-	t.Tmkdir.Decode(b)
+// decode implements encoder.decode.
+func (t *Tumkdir) decode(b *buffer) {
+	t.Tmkdir.decode(b)
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tumkdir) Encode(b *buffer) {
-	t.Tmkdir.Encode(b)
+// encode implements encoder.encode.
+func (t *Tumkdir) encode(b *buffer) {
+	t.Tmkdir.encode(b)
 	b.WriteUID(t.UID)
 }
 
@@ -2300,15 +2300,15 @@ type Tumknod struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tumknod) Decode(b *buffer) {
-	t.Tmknod.Decode(b)
+// decode implements encoder.decode.
+func (t *Tumknod) decode(b *buffer) {
+	t.Tmknod.decode(b)
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tumknod) Encode(b *buffer) {
-	t.Tmknod.Encode(b)
+// encode implements encoder.encode.
+func (t *Tumknod) encode(b *buffer) {
+	t.Tmknod.encode(b)
 	b.WriteUID(t.UID)
 }
 
@@ -2345,15 +2345,15 @@ type Tusymlink struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tusymlink) Decode(b *buffer) {
-	t.Tsymlink.Decode(b)
+// decode implements encoder.decode.
+func (t *Tusymlink) decode(b *buffer) {
+	t.Tsymlink.decode(b)
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tusymlink) Encode(b *buffer) {
-	t.Tsymlink.Encode(b)
+// encode implements encoder.encode.
+func (t *Tusymlink) encode(b *buffer) {
+	t.Tsymlink.encode(b)
 	b.WriteUID(t.UID)
 }
 
@@ -2391,14 +2391,14 @@ type Tlconnect struct {
 	Flags ConnectFlags
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlconnect) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlconnect) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Flags = b.ReadConnectFlags()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlconnect) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlconnect) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteConnectFlags(t.Flags)
 }
@@ -2418,11 +2418,11 @@ type Rlconnect struct {
 	filePayload
 }
 
-// Decode implements encoder.Decode.
-func (r *Rlconnect) Decode(*buffer) {}
+// decode implements encoder.decode.
+func (r *Rlconnect) decode(*buffer) {}
 
-// Encode implements encoder.Encode.
-func (r *Rlconnect) Encode(*buffer) {}
+// encode implements encoder.encode.
+func (r *Rlconnect) encode(*buffer) {}
 
 // Type implements message.Type.
 func (*Rlconnect) Type() MsgType {
@@ -2445,14 +2445,14 @@ type Tchannel struct {
 	Control uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Tchannel) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tchannel) decode(b *buffer) {
 	t.ID = b.Read32()
 	t.Control = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tchannel) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tchannel) encode(b *buffer) {
 	b.Write32(t.ID)
 	b.Write32(t.Control)
 }
@@ -2474,14 +2474,14 @@ type Rchannel struct {
 	filePayload
 }
 
-// Decode implements encoder.Decode.
-func (r *Rchannel) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rchannel) decode(b *buffer) {
 	r.Offset = b.Read64()
 	r.Length = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rchannel) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rchannel) encode(b *buffer) {
 	b.Write64(r.Offset)
 	b.Write64(r.Length)
 }
@@ -2577,7 +2577,7 @@ func calculateSize(m message) uint32 {
 		return p.FixedSize()
 	}
 	var dataBuf buffer
-	m.Encode(&dataBuf)
+	m.encode(&dataBuf)
 	return uint32(len(dataBuf.data))
 }
 
diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go
index 825c939da..c20324404 100644
--- a/pkg/p9/messages_test.go
+++ b/pkg/p9/messages_test.go
@@ -382,7 +382,7 @@ func TestEncodeDecode(t *testing.T) {
 		// Encode the original.
 		data := make([]byte, initialBufferLength)
 		buf := buffer{data: data[:0]}
-		enc.Encode(&buf)
+		enc.encode(&buf)
 
 		// Create a new object, same as the first.
 		enc2 := reflect.New(reflect.ValueOf(enc).Elem().Type()).Interface().(encoder)
@@ -399,7 +399,7 @@ func TestEncodeDecode(t *testing.T) {
 		}
 
 		// Mark sure it was okay.
-		enc2.Decode(&buf2)
+		enc2.decode(&buf2)
 		if buf2.isOverrun() {
 			t.Errorf("object %#v->%#v got overrun on decode", enc, enc2)
 			continue
diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go
index 20ab31f7a..28d851ff5 100644
--- a/pkg/p9/p9.go
+++ b/pkg/p9/p9.go
@@ -450,15 +450,15 @@ func (q QID) String() string {
 	return fmt.Sprintf("QID{Type: %d, Version: %d, Path: %d}", q.Type, q.Version, q.Path)
 }
 
-// Decode implements encoder.Decode.
-func (q *QID) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (q *QID) decode(b *buffer) {
 	q.Type = b.ReadQIDType()
 	q.Version = b.Read32()
 	q.Path = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (q *QID) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (q *QID) encode(b *buffer) {
 	b.WriteQIDType(q.Type)
 	b.Write32(q.Version)
 	b.Write64(q.Path)
@@ -515,8 +515,8 @@ type FSStat struct {
 	NameLength uint32
 }
 
-// Decode implements encoder.Decode.
-func (f *FSStat) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (f *FSStat) decode(b *buffer) {
 	f.Type = b.Read32()
 	f.BlockSize = b.Read32()
 	f.Blocks = b.Read64()
@@ -528,8 +528,8 @@ func (f *FSStat) Decode(b *buffer) {
 	f.NameLength = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (f *FSStat) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (f *FSStat) encode(b *buffer) {
 	b.Write32(f.Type)
 	b.Write32(f.BlockSize)
 	b.Write64(f.Blocks)
@@ -679,8 +679,8 @@ func (a AttrMask) String() string {
 	return fmt.Sprintf("AttrMask{with: %s}", strings.Join(masks, " "))
 }
 
-// Decode implements encoder.Decode.
-func (a *AttrMask) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (a *AttrMask) decode(b *buffer) {
 	mask := b.Read64()
 	a.Mode = mask&0x00000001 != 0
 	a.NLink = mask&0x00000002 != 0
@@ -698,8 +698,8 @@ func (a *AttrMask) Decode(b *buffer) {
 	a.DataVersion = mask&0x00002000 != 0
 }
 
-// Encode implements encoder.Encode.
-func (a *AttrMask) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (a *AttrMask) encode(b *buffer) {
 	var mask uint64
 	if a.Mode {
 		mask |= 0x00000001
@@ -774,8 +774,8 @@ func (a Attr) String() string {
 		a.Mode, a.UID, a.GID, a.NLink, a.RDev, a.Size, a.BlockSize, a.Blocks, a.ATimeSeconds, a.ATimeNanoSeconds, a.MTimeSeconds, a.MTimeNanoSeconds, a.CTimeSeconds, a.CTimeNanoSeconds, a.BTimeSeconds, a.BTimeNanoSeconds, a.Gen, a.DataVersion)
 }
 
-// Encode implements encoder.Encode.
-func (a *Attr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (a *Attr) encode(b *buffer) {
 	b.WriteFileMode(a.Mode)
 	b.WriteUID(a.UID)
 	b.WriteGID(a.GID)
@@ -796,8 +796,8 @@ func (a *Attr) Encode(b *buffer) {
 	b.Write64(a.DataVersion)
 }
 
-// Decode implements encoder.Decode.
-func (a *Attr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (a *Attr) decode(b *buffer) {
 	a.Mode = b.ReadFileMode()
 	a.UID = b.ReadUID()
 	a.GID = b.ReadGID()
@@ -926,8 +926,8 @@ func (s SetAttrMask) Empty() bool {
 	return !s.Permissions && !s.UID && !s.GID && !s.Size && !s.ATime && !s.MTime && !s.CTime && !s.ATimeNotSystemTime && !s.MTimeNotSystemTime
 }
 
-// Decode implements encoder.Decode.
-func (s *SetAttrMask) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (s *SetAttrMask) decode(b *buffer) {
 	mask := b.Read32()
 	s.Permissions = mask&0x00000001 != 0
 	s.UID = mask&0x00000002 != 0
@@ -972,8 +972,8 @@ func (s SetAttrMask) bitmask() uint32 {
 	return mask
 }
 
-// Encode implements encoder.Encode.
-func (s *SetAttrMask) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (s *SetAttrMask) encode(b *buffer) {
 	b.Write32(s.bitmask())
 }
 
@@ -994,8 +994,8 @@ func (s SetAttr) String() string {
 	return fmt.Sprintf("SetAttr{Permissions: 0o%o, UID: %d, GID: %d, Size: %d, ATime: {Sec: %d, NanoSec: %d}, MTime: {Sec: %d, NanoSec: %d}}", s.Permissions, s.UID, s.GID, s.Size, s.ATimeSeconds, s.ATimeNanoSeconds, s.MTimeSeconds, s.MTimeNanoSeconds)
 }
 
-// Decode implements encoder.Decode.
-func (s *SetAttr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (s *SetAttr) decode(b *buffer) {
 	s.Permissions = b.ReadPermissions()
 	s.UID = b.ReadUID()
 	s.GID = b.ReadGID()
@@ -1006,8 +1006,8 @@ func (s *SetAttr) Decode(b *buffer) {
 	s.MTimeNanoSeconds = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (s *SetAttr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (s *SetAttr) encode(b *buffer) {
 	b.WritePermissions(s.Permissions)
 	b.WriteUID(s.UID)
 	b.WriteGID(s.GID)
@@ -1064,17 +1064,17 @@ func (d Dirent) String() string {
 	return fmt.Sprintf("Dirent{QID: %d, Offset: %d, Type: 0x%X, Name: %s}", d.QID, d.Offset, d.Type, d.Name)
 }
 
-// Decode implements encoder.Decode.
-func (d *Dirent) Decode(b *buffer) {
-	d.QID.Decode(b)
+// decode implements encoder.decode.
+func (d *Dirent) decode(b *buffer) {
+	d.QID.decode(b)
 	d.Offset = b.Read64()
 	d.Type = b.ReadQIDType()
 	d.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (d *Dirent) Encode(b *buffer) {
-	d.QID.Encode(b)
+// encode implements encoder.encode.
+func (d *Dirent) encode(b *buffer) {
+	d.QID.encode(b)
 	b.Write64(d.Offset)
 	b.WriteQIDType(d.Type)
 	b.WriteString(d.Name)
@@ -1118,8 +1118,8 @@ func (a *AllocateMode) ToLinux() uint32 {
 	return rv
 }
 
-// Decode implements encoder.Decode.
-func (a *AllocateMode) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (a *AllocateMode) decode(b *buffer) {
 	mask := b.Read32()
 	a.KeepSize = mask&0x01 != 0
 	a.PunchHole = mask&0x02 != 0
@@ -1130,8 +1130,8 @@ func (a *AllocateMode) Decode(b *buffer) {
 	a.Unshare = mask&0x40 != 0
 }
 
-// Encode implements encoder.Encode.
-func (a *AllocateMode) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (a *AllocateMode) encode(b *buffer) {
 	mask := uint32(0)
 	if a.KeepSize {
 		mask |= 0x01
diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go
index 9c11e28ce..7cec0e86d 100644
--- a/pkg/p9/transport.go
+++ b/pkg/p9/transport.go
@@ -80,7 +80,7 @@ func send(s *unet.Socket, tag Tag, m message) error {
 	}
 
 	// Encode the message. The buffer will grow automatically.
-	m.Encode(&dataBuf)
+	m.encode(&dataBuf)
 
 	// Get our vectors to send.
 	var hdr [headerLength]byte
@@ -316,7 +316,7 @@ func recv(s *unet.Socket, msize uint32, lookup lookupTagAndType) (Tag, message,
 	}
 
 	// Decode the message data.
-	m.Decode(&dataBuf)
+	m.decode(&dataBuf)
 	if dataBuf.isOverrun() {
 		// No need to drain the socket.
 		return NoTag, nil, ErrNoValidMessage
diff --git a/pkg/p9/transport_flipcall.go b/pkg/p9/transport_flipcall.go
index 233f825e3..a0d274f3b 100644
--- a/pkg/p9/transport_flipcall.go
+++ b/pkg/p9/transport_flipcall.go
@@ -151,7 +151,7 @@ func (ch *channel) send(m message) (uint32, error) {
 	} else {
 		ch.buf.Write8(0) // No incoming FD.
 	}
-	m.Encode(&ch.buf)
+	m.encode(&ch.buf)
 	ssz := uint32(len(ch.buf.data)) // Updated below.
 
 	// Is there a payload?
@@ -205,7 +205,7 @@ func (ch *channel) recv(r message, rsz uint32) (message, error) {
 		ch.buf.data = ch.buf.data[:fs]
 	}
 
-	r.Decode(&ch.buf)
+	r.decode(&ch.buf)
 	if ch.buf.isOverrun() {
 		// Nothing valid was available.
 		log.Debugf("recv [got %d bytes, needed more]", rsz)
diff --git a/pkg/p9/transport_test.go b/pkg/p9/transport_test.go
index 2f50ff3ea..3668fcad7 100644
--- a/pkg/p9/transport_test.go
+++ b/pkg/p9/transport_test.go
@@ -56,8 +56,8 @@ func TestSendRecv(t *testing.T) {
 // badDecode overruns on decode.
 type badDecode struct{}
 
-func (*badDecode) Decode(b *buffer) { b.markOverrun() }
-func (*badDecode) Encode(b *buffer) {}
+func (*badDecode) decode(b *buffer) { b.markOverrun() }
+func (*badDecode) encode(b *buffer) {}
 func (*badDecode) Type() MsgType    { return MsgTypeBadDecode }
 func (*badDecode) String() string   { return "badDecode{}" }
 
@@ -81,8 +81,8 @@ func TestRecvOverrun(t *testing.T) {
 // unregistered is not registered on decode.
 type unregistered struct{}
 
-func (*unregistered) Decode(b *buffer) {}
-func (*unregistered) Encode(b *buffer) {}
+func (*unregistered) decode(b *buffer) {}
+func (*unregistered) encode(b *buffer) {}
 func (*unregistered) Type() MsgType    { return MsgTypeUnregistered }
 func (*unregistered) String() string   { return "unregistered{}" }
 
-- 
cgit v1.2.3


From e4c7f3e6f6c19f3259820a4c41b69e85c0454379 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 13:39:51 -0800
Subject: Inline vfs.VirtualFilesystem in Kernel struct

This saves one pointer dereference per VFS access.

Updates #1623

PiperOrigin-RevId: 295216176
---
 pkg/sentry/control/proc.go                        |  2 +-
 pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go       |  5 +++-
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go |  5 +++-
 pkg/sentry/fsimpl/ext/ext_test.go                 |  5 +++-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go           |  5 +++-
 pkg/sentry/fsimpl/proc/tasks_test.go              |  6 ++--
 pkg/sentry/fsimpl/sys/sys_test.go                 |  6 ++--
 pkg/sentry/fsimpl/testutil/kernel.go              |  7 +++--
 pkg/sentry/fsimpl/tmpfs/benchmark_test.go         | 10 +++++--
 pkg/sentry/fsimpl/tmpfs/pipe_test.go              |  5 +++-
 pkg/sentry/fsimpl/tmpfs/regular_file_test.go      |  6 +++-
 pkg/sentry/kernel/kernel.go                       |  9 ++++--
 pkg/sentry/vfs/dentry.go                          |  4 ++-
 pkg/sentry/vfs/device.go                          |  3 ++
 pkg/sentry/vfs/file_description_impl_util_test.go | 10 +++++--
 pkg/sentry/vfs/filesystem.go                      |  2 ++
 pkg/sentry/vfs/filesystem_type.go                 |  1 +
 pkg/sentry/vfs/mount.go                           |  4 +++
 pkg/sentry/vfs/mount_unsafe.go                    |  8 ++++--
 pkg/sentry/vfs/vfs.go                             | 35 +++++++++++------------
 20 files changed, 94 insertions(+), 44 deletions(-)

diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 8973754c8..5457ba5e7 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -199,7 +199,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 			}
 
 			paths := fs.GetPath(initArgs.Envv)
-			vfsObj := proc.Kernel.VFS
+			vfsObj := proc.Kernel.VFS()
 			file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
 			if err != nil {
 				return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 73308a2b5..b6d52c015 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -29,7 +29,10 @@ func TestDevtmpfs(t *testing.T) {
 	ctx := contexttest.Context(t)
 	creds := auth.CredentialsFromContext(ctx)
 
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	// Register tmpfs just so that we can have a root filesystem that isn't
 	// devtmpfs.
 	vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 2015a8871..89caee3df 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -52,7 +52,10 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
 	creds := auth.CredentialsFromContext(ctx)
 
 	// Create VFS.
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		return nil, nil, nil, nil, err
+	}
 	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 05f992826..ef6127f3c 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -65,7 +65,10 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
 	creds := auth.CredentialsFromContext(ctx)
 
 	// Create VFS.
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 96a16e654..0459fb305 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -45,7 +45,10 @@ type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
 func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
 	ctx := contexttest.Context(t)
 	creds := auth.CredentialsFromContext(ctx)
-	v := vfs.New()
+	v := &vfs.VirtualFilesystem{}
+	if err := v.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 96c72cbc9..c5d531fe0 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -90,7 +90,7 @@ func setup(t *testing.T) *testutil.System {
 	ctx := k.SupervisorContext()
 	creds := auth.CredentialsFromContext(ctx)
 
-	k.VFS.MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+	k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
 	fsOpts := vfs.GetFilesystemOptions{
@@ -101,11 +101,11 @@ func setup(t *testing.T) *testutil.System {
 			},
 		},
 	}
-	mntns, err := k.VFS.NewMountNamespace(ctx, creds, "", Name, &fsOpts)
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", Name, &fsOpts)
 	if err != nil {
 		t.Fatalf("NewMountNamespace(): %v", err)
 	}
-	return testutil.NewSystem(ctx, t, k.VFS, mntns)
+	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
 }
 
 func TestTasksEmpty(t *testing.T) {
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 5d1ba5867..4b3602d47 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -34,15 +34,15 @@ func newTestSystem(t *testing.T) *testutil.System {
 	}
 	ctx := k.SupervisorContext()
 	creds := auth.CredentialsFromContext(ctx)
-	k.VFS.MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+	k.VFS().MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
 
-	mns, err := k.VFS.NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
+	mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create new mount namespace: %v", err)
 	}
-	return testutil.NewSystem(ctx, t, k.VFS, mns)
+	return testutil.NewSystem(ctx, t, k.VFS(), mns)
 }
 
 func TestReadCPUFile(t *testing.T) {
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index a91b3ec4d..d0be32e72 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -102,12 +102,13 @@ func Boot() (*kernel.Kernel, error) {
 
 	kernel.VFS2Enabled = true
 
-	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+	if err := k.VFS().Init(); err != nil {
+		return nil, fmt.Errorf("VFS init: %v", err)
+	}
+	k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 		AllowUserList:  true,
 	})
-	k.VFS = vfsObj
 
 	ls, err := limits.NewLinuxLimitSet()
 	if err != nil {
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index 9fce5e4b4..383133e44 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -175,7 +175,10 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			creds := auth.CredentialsFromContext(ctx)
 
 			// Create VFS.
-			vfsObj := vfs.New()
+			vfsObj := vfs.VirtualFilesystem{}
+			if err := vfsObj.Init(); err != nil {
+				b.Fatalf("VFS init: %v", err)
+			}
 			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 				AllowUserMount: true,
 			})
@@ -366,7 +369,10 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			creds := auth.CredentialsFromContext(ctx)
 
 			// Create VFS.
-			vfsObj := vfs.New()
+			vfsObj := vfs.VirtualFilesystem{}
+			if err := vfsObj.Init(); err != nil {
+				b.Fatalf("VFS init: %v", err)
+			}
 			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 				AllowUserMount: true,
 			})
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index 5ee7f2a72..1614f2c39 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -151,7 +151,10 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	creds := auth.CredentialsFromContext(ctx)
 
 	// Create VFS.
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index e9f71e334..0399725cf 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -40,7 +40,11 @@ var nextFileID int64
 func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) {
 	creds := auth.CredentialsFromContext(ctx)
 
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err)
+	}
+
 	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 2665f057c..ea21af33f 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -246,7 +246,7 @@ type Kernel struct {
 	SpecialOpts
 
 	// VFS keeps the filesystem state used across the kernel.
-	VFS *vfs.VirtualFilesystem
+	vfs vfs.VirtualFilesystem
 }
 
 // InitKernelArgs holds arguments to Init.
@@ -815,7 +815,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 				FollowFinalSymlink: true,
 			}
 			var err error
-			wd, err = k.VFS.GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{
+			wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{
 				CheckSearchable: true,
 			})
 			if err != nil {
@@ -1506,3 +1506,8 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
 		Registers: t.Arch().StateData().Proto(),
 	})
 }
+
+// VFS returns the virtual filesystem for the kernel.
+func (k *Kernel) VFS() *vfs.VirtualFilesystem {
+	return &k.vfs
+}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 486a76475..35b208721 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -71,6 +71,8 @@ import (
 // lifetime. Dentry reference counts only indicate the extent to which VFS
 // requires Dentries to exist; Filesystems may elect to cache or discard
 // Dentries with zero references.
+//
+// +stateify savable
 type Dentry struct {
 	// parent is this Dentry's parent in this Filesystem. If this Dentry is
 	// independent, parent is nil.
@@ -89,7 +91,7 @@ type Dentry struct {
 	children map[string]*Dentry
 
 	// mu synchronizes disowning and mounting over this Dentry.
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
 	// impl is the DentryImpl associated with this Dentry. impl is immutable.
 	// This should be the last field in Dentry.
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
index 3af2aa58d..bda5576fa 100644
--- a/pkg/sentry/vfs/device.go
+++ b/pkg/sentry/vfs/device.go
@@ -56,6 +56,7 @@ type Device interface {
 	Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error)
 }
 
+// +stateify savable
 type registeredDevice struct {
 	dev  Device
 	opts RegisterDeviceOptions
@@ -63,6 +64,8 @@ type registeredDevice struct {
 
 // RegisterDeviceOptions contains options to
 // VirtualFilesystem.RegisterDevice().
+//
+// +stateify savable
 type RegisterDeviceOptions struct {
 	// GroupName is the name shown for this device registration in
 	// /proc/devices. If GroupName is empty, this registration will not be
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 8fa26418e..3a75d4d62 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -107,7 +107,10 @@ func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error {
 func TestGenCountFD(t *testing.T) {
 	ctx := contexttest.Context(t)
 
-	vfsObj := New() // vfs.New()
+	vfsObj := &VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{})
 	defer fd.DecRef()
 
@@ -162,7 +165,10 @@ func TestGenCountFD(t *testing.T) {
 func TestWritable(t *testing.T) {
 	ctx := contexttest.Context(t)
 
-	vfsObj := New() // vfs.New()
+	vfsObj := &VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"})
 	defer fd.DecRef()
 
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index a06a6caf3..556976d0b 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -29,6 +29,8 @@ import (
 // Filesystem methods require that a reference is held.
 //
 // Filesystem is analogous to Linux's struct super_block.
+//
+// +stateify savable
 type Filesystem struct {
 	// refs is the reference count. refs is accessed using atomic memory
 	// operations.
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index c58b70728..bb9cada81 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -44,6 +44,7 @@ type GetFilesystemOptions struct {
 	InternalData interface{}
 }
 
+// +stateify savable
 type registeredFilesystemType struct {
 	fsType FilesystemType
 	opts   RegisterFilesystemTypeOptions
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index ad2c9fcf4..9912df799 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -38,6 +38,8 @@ import (
 //
 // Mount is analogous to Linux's struct mount. (gVisor does not distinguish
 // between struct mount and struct vfsmount.)
+//
+// +stateify savable
 type Mount struct {
 	// vfs, fs, and root are immutable. References are held on fs and root.
 	//
@@ -85,6 +87,8 @@ type Mount struct {
 // MountNamespace methods require that a reference is held.
 //
 // MountNamespace is analogous to Linux's struct mnt_namespace.
+//
+// +stateify savable
 type MountNamespace struct {
 	// root is the MountNamespace's root mount. root is immutable.
 	root *Mount
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index bd90d36c4..1fe766a44 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -64,6 +64,8 @@ func (mnt *Mount) storeKey(vd VirtualDentry) {
 // (provided mutation is sufficiently uncommon).
 //
 // mountTable.Init() must be called on new mountTables before use.
+//
+// +stateify savable
 type mountTable struct {
 	// mountTable is implemented as a seqcount-protected hash table that
 	// resolves collisions with linear probing, featuring Robin Hood insertion
@@ -75,8 +77,8 @@ type mountTable struct {
 	// intrinsics and inline assembly, limiting the performance of this
 	// approach.)
 
-	seq  sync.SeqCount
-	seed uint32 // for hashing keys
+	seq  sync.SeqCount `state:"nosave"`
+	seed uint32        // for hashing keys
 
 	// size holds both length (number of elements) and capacity (number of
 	// slots): capacity is stored as its base-2 log (referred to as order) in
@@ -89,7 +91,7 @@ type mountTable struct {
 	// length and cap in separate uint32s) for ~free.
 	size uint64
 
-	slots unsafe.Pointer // []mountSlot; never nil after Init
+	slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init
 }
 
 type mountSlot struct {
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 51deae313..8f29031b2 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -46,11 +46,13 @@ import (
 //
 // There is no analogue to the VirtualFilesystem type in Linux, as the
 // equivalent state in Linux is global.
+//
+// +stateify savable
 type VirtualFilesystem struct {
 	// mountMu serializes mount mutations.
 	//
 	// mountMu is analogous to Linux's namespace_sem.
-	mountMu sync.Mutex
+	mountMu sync.Mutex `state:"nosave"`
 
 	// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
 	// are uniquely namespaced, including mount parent in the key correctly
@@ -89,44 +91,42 @@ type VirtualFilesystem struct {
 
 	// devices contains all registered Devices. devices is protected by
 	// devicesMu.
-	devicesMu sync.RWMutex
+	devicesMu sync.RWMutex `state:"nosave"`
 	devices   map[devTuple]*registeredDevice
 
 	// anonBlockDevMinor contains all allocated anonymous block device minor
 	// numbers. anonBlockDevMinorNext is a lower bound for the smallest
 	// unallocated anonymous block device number. anonBlockDevMinorNext and
 	// anonBlockDevMinor are protected by anonBlockDevMinorMu.
-	anonBlockDevMinorMu   sync.Mutex
+	anonBlockDevMinorMu   sync.Mutex `state:"nosave"`
 	anonBlockDevMinorNext uint32
 	anonBlockDevMinor     map[uint32]struct{}
 
 	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
 	// fsTypesMu.
-	fsTypesMu sync.RWMutex
+	fsTypesMu sync.RWMutex `state:"nosave"`
 	fsTypes   map[string]*registeredFilesystemType
 
 	// filesystems contains all Filesystems. filesystems is protected by
 	// filesystemsMu.
-	filesystemsMu sync.Mutex
+	filesystemsMu sync.Mutex `state:"nosave"`
 	filesystems   map[*Filesystem]struct{}
 }
 
-// New returns a new VirtualFilesystem with no mounts or FilesystemTypes.
-func New() *VirtualFilesystem {
-	vfs := &VirtualFilesystem{
-		mountpoints:           make(map[*Dentry]map[*Mount]struct{}),
-		devices:               make(map[devTuple]*registeredDevice),
-		anonBlockDevMinorNext: 1,
-		anonBlockDevMinor:     make(map[uint32]struct{}),
-		fsTypes:               make(map[string]*registeredFilesystemType),
-		filesystems:           make(map[*Filesystem]struct{}),
-	}
+// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
+func (vfs *VirtualFilesystem) Init() error {
+	vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
+	vfs.devices = make(map[devTuple]*registeredDevice)
+	vfs.anonBlockDevMinorNext = 1
+	vfs.anonBlockDevMinor = make(map[uint32]struct{})
+	vfs.fsTypes = make(map[string]*registeredFilesystemType)
+	vfs.filesystems = make(map[*Filesystem]struct{})
 	vfs.mounts.Init()
 
 	// Construct vfs.anonMount.
 	anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
 	if err != nil {
-		panic(fmt.Sprintf("VirtualFilesystem.GetAnonBlockDevMinor() failed during VirtualFilesystem construction: %v", err))
+		return err
 	}
 	anonfs := anonFilesystem{
 		devMinor: anonfsDevMinor,
@@ -137,8 +137,7 @@ func New() *VirtualFilesystem {
 		fs:   &anonfs.vfsfs,
 		refs: 1,
 	}
-
-	return vfs
+	return nil
 }
 
 // PathOperation specifies the path operated on by a VFS method.
-- 
cgit v1.2.3


From 87bc2834c97a958d0762833fe8db749ccc6d5d50 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 14:23:35 -0800
Subject: Enable automated marshalling for RSeqCriticalSection.

PiperOrigin-RevId: 295226468
---
 pkg/sentry/kernel/rseq.go | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index efebfd872..18416643b 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -303,26 +303,14 @@ func (t *Task) rseqAddrInterrupt() {
 		return
 	}
 
-	buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection)
-	if _, err := t.CopyInBytes(critAddr, buf); err != nil {
+	var cs linux.RSeqCriticalSection
+	if _, err := cs.CopyIn(t, critAddr); err != nil {
 		t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
 		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
 		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
 		return
 	}
 
-	// Manually marshal RSeqCriticalSection as this is in the hot path when
-	// rseq is enabled. It must be as fast as possible.
-	//
-	// TODO(b/130243041): Replace with go_marshal.
-	cs := linux.RSeqCriticalSection{
-		Version:          usermem.ByteOrder.Uint32(buf[0:4]),
-		Flags:            usermem.ByteOrder.Uint32(buf[4:8]),
-		Start:            usermem.ByteOrder.Uint64(buf[8:16]),
-		PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]),
-		Abort:            usermem.ByteOrder.Uint64(buf[24:32]),
-	}
-
 	if cs.Version != 0 {
 		t.Debugf("Unknown version in %+v", cs)
 		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
-- 
cgit v1.2.3


From 3557b2665198b57c04924ad4be8dbf9e42cedf71 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 14:39:40 -0800
Subject: Allow vfs.IterDirentsCallback.Handle() to return an error.

This is easier than storing errors from e.g. CopyOut in the callback.

PiperOrigin-RevId: 295230021
---
 pkg/sentry/fsimpl/ext/directory.go       |  6 +++---
 pkg/sentry/fsimpl/ext/ext_test.go        |  4 ++--
 pkg/sentry/fsimpl/gofer/directory.go     |  4 ++--
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 12 ++++++------
 pkg/sentry/fsimpl/proc/subtasks.go       |  4 ++--
 pkg/sentry/fsimpl/proc/tasks.go          | 12 ++++++------
 pkg/sentry/fsimpl/testutil/testutil.go   |  4 ++--
 pkg/sentry/fsimpl/tmpfs/directory.go     | 18 +++++++++---------
 pkg/sentry/vfs/file_description.go       | 10 +++++-----
 9 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index ebb72b75e..bd6ede995 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -188,14 +188,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 				childType = fs.ToInodeType(childInode.diskInode.Mode().FileType())
 			}
 
-			if !cb.Handle(vfs.Dirent{
+			if err := cb.Handle(vfs.Dirent{
 				Name:    child.diskDirent.FileName(),
 				Type:    fs.ToDirentType(childType),
 				Ino:     uint64(child.diskDirent.Inode()),
 				NextOff: fd.off + 1,
-			}) {
+			}); err != nil {
 				dir.childList.InsertBefore(child, fd.iter)
-				return nil
+				return err
 			}
 			fd.off++
 		}
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index ef6127f3c..29bb73765 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -499,9 +499,9 @@ func newIterDirentCb() *iterDirentsCb {
 }
 
 // Handle implements vfs.IterDirentsCallback.Handle.
-func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) bool {
+func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) error {
 	cb.dirents = append(cb.dirents, dirent)
-	return true
+	return nil
 }
 
 // TestIterDirents tests the FileDescriptionImpl.IterDirents functionality.
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 6d4ebc2bf..5dbfc6250 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -65,8 +65,8 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	}
 
 	for fd.off < int64(len(fd.dirents)) {
-		if !cb.Handle(fd.dirents[fd.off]) {
-			return nil
+		if err := cb.Handle(fd.dirents[fd.off]); err != nil {
+			return err
 		}
 		fd.off++
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index eda781155..5650512e0 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -116,8 +116,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 			Ino:     stat.Ino,
 			NextOff: 1,
 		}
-		if !cb.Handle(dirent) {
-			return nil
+		if err := cb.Handle(dirent); err != nil {
+			return err
 		}
 		fd.off++
 	}
@@ -132,8 +132,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 			Ino:     stat.Ino,
 			NextOff: 2,
 		}
-		if !cb.Handle(dirent) {
-			return nil
+		if err := cb.Handle(dirent); err != nil {
+			return err
 		}
 		fd.off++
 	}
@@ -153,8 +153,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 			Ino:     stat.Ino,
 			NextOff: fd.off + 1,
 		}
-		if !cb.Handle(dirent) {
-			return nil
+		if err := cb.Handle(dirent); err != nil {
+			return err
 		}
 		fd.off++
 	}
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 102af0e93..f3f4e49b4 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -105,8 +105,8 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
 			Ino:     i.inoGen.NextIno(),
 			NextOff: offset + 1,
 		}
-		if !cb.Handle(dirent) {
-			return offset, nil
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
 		}
 		offset++
 	}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index ebe21630c..ce08a7d53 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -151,8 +151,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 			Ino:     i.inoGen.NextIno(),
 			NextOff: offset + 1,
 		}
-		if !cb.Handle(dirent) {
-			return offset, nil
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
 		}
 		offset++
 	}
@@ -163,8 +163,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 			Ino:     i.inoGen.NextIno(),
 			NextOff: offset + 1,
 		}
-		if !cb.Handle(dirent) {
-			return offset, nil
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
 		}
 		offset++
 	}
@@ -196,8 +196,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 			Ino:     i.inoGen.NextIno(),
 			NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1,
 		}
-		if !cb.Handle(dirent) {
-			return offset, nil
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
 		}
 		offset++
 	}
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index b97e3534a..e16808c63 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -226,7 +226,7 @@ func (d *DirentCollector) SkipDotsChecks(value bool) {
 }
 
 // Handle implements vfs.IterDirentsCallback.Handle.
-func (d *DirentCollector) Handle(dirent vfs.Dirent) bool {
+func (d *DirentCollector) Handle(dirent vfs.Dirent) error {
 	d.mu.Lock()
 	if d.dirents == nil {
 		d.dirents = make(map[string]*vfs.Dirent)
@@ -234,7 +234,7 @@ func (d *DirentCollector) Handle(dirent vfs.Dirent) bool {
 	d.order = append(d.order, &dirent)
 	d.dirents[dirent.Name] = &dirent
 	d.mu.Unlock()
-	return true
+	return nil
 }
 
 // Count returns the number of dirents currently in the collector.
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index dc0d27cf9..b4380af38 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -74,25 +74,25 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	defer fs.mu.Unlock()
 
 	if fd.off == 0 {
-		if !cb.Handle(vfs.Dirent{
+		if err := cb.Handle(vfs.Dirent{
 			Name:    ".",
 			Type:    linux.DT_DIR,
 			Ino:     vfsd.Impl().(*dentry).inode.ino,
 			NextOff: 1,
-		}) {
-			return nil
+		}); err != nil {
+			return err
 		}
 		fd.off++
 	}
 	if fd.off == 1 {
 		parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
-		if !cb.Handle(vfs.Dirent{
+		if err := cb.Handle(vfs.Dirent{
 			Name:    "..",
 			Type:    parentInode.direntType(),
 			Ino:     parentInode.ino,
 			NextOff: 2,
-		}) {
-			return nil
+		}); err != nil {
+			return err
 		}
 		fd.off++
 	}
@@ -111,14 +111,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	for child != nil {
 		// Skip other directoryFD iterators.
 		if child.inode != nil {
-			if !cb.Handle(vfs.Dirent{
+			if err := cb.Handle(vfs.Dirent{
 				Name:    child.vfsd.Name(),
 				Type:    child.inode.direntType(),
 				Ino:     child.inode.ino,
 				NextOff: fd.off + 1,
-			}) {
+			}); err != nil {
 				dir.childList.InsertBefore(child, fd.iter)
-				return nil
+				return err
 			}
 			fd.off++
 		}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 5bac660c7..9a1ad630c 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -435,11 +435,11 @@ type Dirent struct {
 
 // IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
 type IterDirentsCallback interface {
-	// Handle handles the given iterated Dirent. It returns true if iteration
-	// should continue, and false if FileDescriptionImpl.IterDirents should
-	// terminate now and restart with the same Dirent the next time it is
-	// called.
-	Handle(dirent Dirent) bool
+	// Handle handles the given iterated Dirent. If Handle returns a non-nil
+	// error, FileDescriptionImpl.IterDirents must stop iteration and return
+	// the error; the next call to FileDescriptionImpl.IterDirents should
+	// restart with the same Dirent.
+	Handle(dirent Dirent) error
 }
 
 // OnClose is called when a file descriptor representing the FileDescription is
-- 
cgit v1.2.3


From 48d9aa7ab371691d28a44533f67e495173554098 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 15:19:53 -0800
Subject: Add a minimal binary target for escape analysis on go-marshal.

Note that this is not an automated test.

PiperOrigin-RevId: 295238672
---
 tools/go_marshal/test/BUILD             |  14 +++-
 tools/go_marshal/test/benchmark_test.go |   2 +-
 tools/go_marshal/test/escape.go         | 114 ++++++++++++++++++++++++++++++++
 3 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 tools/go_marshal/test/escape.go

diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD
index e345e3a8e..f27c5ce52 100644
--- a/tools/go_marshal/test/BUILD
+++ b/tools/go_marshal/test/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_binary", "go_library", "go_test")
 
 licenses(["notice"])
 
@@ -27,3 +27,15 @@ go_library(
     marshal = True,
     deps = ["//tools/go_marshal/test/external"],
 )
+
+go_binary(
+    name = "escape",
+    testonly = 1,
+    srcs = ["escape.go"],
+    gc_goopts = ["-m"],
+    deps = [
+        ":test",
+        "//pkg/usermem",
+        "//tools/go_marshal/marshal",
+    ],
+)
diff --git a/tools/go_marshal/test/benchmark_test.go b/tools/go_marshal/test/benchmark_test.go
index e12403741..c79defe9e 100644
--- a/tools/go_marshal/test/benchmark_test.go
+++ b/tools/go_marshal/test/benchmark_test.go
@@ -24,7 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/tools/go_marshal/analysis"
-	test "gvisor.dev/gvisor/tools/go_marshal/test"
+	"gvisor.dev/gvisor/tools/go_marshal/test"
 )
 
 // Marshalling using the standard encoding/binary package.
diff --git a/tools/go_marshal/test/escape.go b/tools/go_marshal/test/escape.go
new file mode 100644
index 000000000..184f05ea3
--- /dev/null
+++ b/tools/go_marshal/test/escape.go
@@ -0,0 +1,114 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This binary provides a convienient target for analyzing how the go-marshal
+// API causes its various arguments to escape to the heap. To use, build and
+// observe the output from the go compiler's escape analysis:
+//
+// $ bazel build :escape
+// ...
+// escape.go:67:2: moved to heap: task
+// escape.go:77:31: make([]byte, size) escapes to heap
+// escape.go:87:31: make([]byte, size) escapes to heap
+// escape.go:96:6: moved to heap: stat
+// ...
+//
+// This is not an automated test, but simply a minimal binary for easy analysis.
+package main
+
+import (
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+	"gvisor.dev/gvisor/tools/go_marshal/test"
+)
+
+// dummyTask implements marshal.Task.
+type dummyTask struct {
+}
+
+func (*dummyTask) CopyScratchBuffer(size int) []byte {
+	return make([]byte, size)
+}
+
+func (*dummyTask) CopyOutBytes(addr usermem.Addr, b []byte) (int, error) {
+	return len(b), nil
+}
+
+func (*dummyTask) CopyInBytes(addr usermem.Addr, b []byte) (int, error) {
+	return len(b), nil
+}
+
+func (task *dummyTask) MarshalBytes(addr usermem.Addr, marshallable marshal.Marshallable) {
+	buf := task.CopyScratchBuffer(marshallable.SizeBytes())
+	marshallable.MarshalBytes(buf)
+	task.CopyOutBytes(addr, buf)
+}
+
+func (task *dummyTask) MarshalUnsafe(addr usermem.Addr, marshallable marshal.Marshallable) {
+	buf := task.CopyScratchBuffer(marshallable.SizeBytes())
+	marshallable.MarshalUnsafe(buf)
+	task.CopyOutBytes(addr, buf)
+}
+
+// Expected escapes:
+// - task: passed to marshal.Marshallable.CopyOut as the marshal.Task interface.
+func doCopyOut() {
+	task := dummyTask{}
+	var stat test.Stat
+	stat.CopyOut(&task, usermem.Addr(0xf000ba12))
+}
+
+// Expected escapes:
+// - buf: make allocates on the heap.
+func doMarshalBytesDirect() {
+	task := dummyTask{}
+	var stat test.Stat
+	buf := task.CopyScratchBuffer(stat.SizeBytes())
+	stat.MarshalBytes(buf)
+	task.CopyOutBytes(usermem.Addr(0xf000ba12), buf)
+}
+
+// Expected escapes:
+// - buf: make allocates on the heap.
+func doMarshalUnsafeDirect() {
+	task := dummyTask{}
+	var stat test.Stat
+	buf := task.CopyScratchBuffer(stat.SizeBytes())
+	stat.MarshalUnsafe(buf)
+	task.CopyOutBytes(usermem.Addr(0xf000ba12), buf)
+}
+
+// Expected escapes:
+// - stat: passed to dummyTask.MarshalBytes as the marshal.Marshallable interface.
+func doMarshalBytesViaMarshallable() {
+	task := dummyTask{}
+	var stat test.Stat
+	task.MarshalBytes(usermem.Addr(0xf000ba12), &stat)
+}
+
+// Expected escapes:
+// - stat: passed to dummyTask.MarshalUnsafe as the marshal.Marshallable interface.
+func doMarshalUnsafeViaMarshallable() {
+	task := dummyTask{}
+	var stat test.Stat
+	task.MarshalUnsafe(usermem.Addr(0xf000ba12), &stat)
+}
+
+func main() {
+	doCopyOut()
+	doMarshalBytesDirect()
+	doMarshalUnsafeDirect()
+	doMarshalBytesViaMarshallable()
+	doMarshalUnsafeViaMarshallable()
+}
-- 
cgit v1.2.3


From 5baf9dc2fbb459828b4102b0a1c5214879434c03 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 15:48:09 -0800
Subject: Synchronize signalling with S/R

This is to fix a data race between sending an external signal to
a ThreadGroup and kernel saving state for S/R.

PiperOrigin-RevId: 295244281
---
 pkg/sentry/kernel/kernel.go | 8 ++++++++
 runsc/boot/loader.go        | 8 ++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index ea21af33f..7da0368f1 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -1169,6 +1169,14 @@ func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) {
 	k.sendExternalSignal(info, context)
 }
 
+// SendExternalSignalThreadGroup injects a signal into an specific ThreadGroup.
+// This function doesn't skip signals like SendExternalSignal does.
+func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *arch.SignalInfo) error {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	return tg.SendSignal(info)
+}
+
 // SendContainerSignal sends the given signal to all processes inside the
 // namespace that match the given container ID.
 func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 239ca5302..eef43b9df 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -997,7 +997,7 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er
 	execTG, _, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
 	if err == nil {
 		// Send signal directly to the identified process.
-		return execTG.SendSignal(&arch.SignalInfo{Signo: signo})
+		return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo})
 	}
 
 	// The caller may be signaling a process not started directly via exec.
@@ -1014,7 +1014,7 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er
 	if tg.Leader().ContainerID() != cid {
 		return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
 	}
-	return tg.SendSignal(&arch.SignalInfo{Signo: signo})
+	return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
 }
 
 func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
@@ -1032,7 +1032,7 @@ func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, s
 		// No foreground process group has been set. Signal the
 		// original thread group.
 		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
-		return tg.SendSignal(&arch.SignalInfo{Signo: signo})
+		return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
 	}
 	// Send the signal to all processes in the process group.
 	var lastErr error
@@ -1040,7 +1040,7 @@ func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, s
 		if tg.ProcessGroup() != pg {
 			continue
 		}
-		if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil {
+		if err := l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}); err != nil {
 			lastErr = err
 		}
 	}
-- 
cgit v1.2.3


From a5069f820f22734b6c466068a02bbbe83ba091da Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 16:19:00 -0800
Subject: Remove linux.EpollEvent.Fd.

glibc defines struct epoll_event in such a way that epoll_event.data.fd exists.
However, the kernel's definition of struct epoll_event makes epoll_event.data
an opaque uint64, so naming half of it "fd" just introduces confusion. Remove
the Fd field, and make Data a [2]int32 to compensate.

Also add required padding to linux.EpollEvent on ARM64.

PiperOrigin-RevId: 295250424
---
 pkg/abi/linux/BUILD                    |  2 ++
 pkg/abi/linux/epoll.go                 |  7 -------
 pkg/abi/linux/epoll_amd64.go           | 25 +++++++++++++++++++++++++
 pkg/abi/linux/epoll_arm64.go           | 24 ++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/sys_epoll.go |  3 +--
 pkg/sentry/vfs/epoll.go                | 19 +++++++++----------
 6 files changed, 61 insertions(+), 19 deletions(-)
 create mode 100644 pkg/abi/linux/epoll_amd64.go
 create mode 100644 pkg/abi/linux/epoll_arm64.go

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index b7015367b..a89f34d4b 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -17,6 +17,8 @@ go_library(
         "dev.go",
         "elf.go",
         "epoll.go",
+        "epoll_amd64.go",
+        "epoll_arm64.go",
         "errors.go",
         "eventfd.go",
         "exec.go",
diff --git a/pkg/abi/linux/epoll.go b/pkg/abi/linux/epoll.go
index 0e881aa3c..6e4de69da 100644
--- a/pkg/abi/linux/epoll.go
+++ b/pkg/abi/linux/epoll.go
@@ -14,13 +14,6 @@
 
 package linux
 
-// EpollEvent is equivalent to struct epoll_event from epoll(2).
-type EpollEvent struct {
-	Events uint32
-	Fd     int32
-	Data   int32
-}
-
 // Event masks.
 const (
 	EPOLLIN     = 0x1
diff --git a/pkg/abi/linux/epoll_amd64.go b/pkg/abi/linux/epoll_amd64.go
new file mode 100644
index 000000000..57041491c
--- /dev/null
+++ b/pkg/abi/linux/epoll_amd64.go
@@ -0,0 +1,25 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// EpollEvent is equivalent to struct epoll_event from epoll(2).
+type EpollEvent struct {
+	Events uint32
+	// Linux makes struct epoll_event::data a __u64. We represent it as
+	// [2]int32 because, on amd64, Linux also makes struct epoll_event
+	// __attribute__((packed)), such that there is no padding between Events
+	// and Data.
+	Data [2]int32
+}
diff --git a/pkg/abi/linux/epoll_arm64.go b/pkg/abi/linux/epoll_arm64.go
new file mode 100644
index 000000000..62ef5821e
--- /dev/null
+++ b/pkg/abi/linux/epoll_arm64.go
@@ -0,0 +1,24 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// EpollEvent is equivalent to struct epoll_event from epoll(2).
+type EpollEvent struct {
+	Events uint32
+	// Linux makes struct epoll_event a __u64, necessitating 4 bytes of padding
+	// here.
+	_    int32
+	Data [2]int32
+}
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index 5f11b496c..fbef5b376 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -83,8 +83,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		}
 
 		mask = waiter.EventMaskFromLinux(e.Events)
-		data[0] = e.Fd
-		data[1] = e.Data
+		data = e.Data
 	}
 
 	// Perform the requested operations.
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 7c83f9a5a..eed41139b 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -85,8 +85,8 @@ type epollInterest struct {
 	ready bool
 	epollInterestEntry
 
-	// userData is the epoll_data_t associated with this epollInterest.
-	// userData is protected by epoll.mu.
+	// userData is the struct epoll_event::data associated with this
+	// epollInterest. userData is protected by epoll.mu.
 	userData [2]int32
 }
 
@@ -157,7 +157,7 @@ func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (
 // AddInterest implements the semantics of EPOLL_CTL_ADD.
 //
 // Preconditions: A reference must be held on file.
-func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
 	// Check for cyclic polling if necessary.
 	subep, _ := file.impl.(*EpollInstance)
 	if subep != nil {
@@ -183,12 +183,12 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint
 	}
 
 	// Register interest in file.
-	mask |= linux.EPOLLERR | linux.EPOLLRDHUP
+	mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
 	epi := &epollInterest{
 		epoll:    ep,
 		key:      key,
 		mask:     mask,
-		userData: userData,
+		userData: event.Data,
 	}
 	ep.interest[key] = epi
 	wmask := waiter.EventMaskFromLinux(mask)
@@ -236,7 +236,7 @@ func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursi
 // ModifyInterest implements the semantics of EPOLL_CTL_MOD.
 //
 // Preconditions: A reference must be held on file.
-func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
 	ep.interestMu.Lock()
 	defer ep.interestMu.Unlock()
 
@@ -250,13 +250,13 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask u
 	}
 
 	// Update epi for the next call to ep.ReadEvents().
+	mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
 	ep.mu.Lock()
 	epi.mask = mask
-	epi.userData = userData
+	epi.userData = event.Data
 	ep.mu.Unlock()
 
 	// Re-register with the new mask.
-	mask |= linux.EPOLLERR | linux.EPOLLRDHUP
 	file.EventUnregister(&epi.waiter)
 	wmask := waiter.EventMaskFromLinux(mask)
 	file.EventRegister(&epi.waiter, wmask)
@@ -363,8 +363,7 @@ func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
 		// Report ievents.
 		events[i] = linux.EpollEvent{
 			Events: ievents.ToLinux(),
-			Fd:     epi.userData[0],
-			Data:   epi.userData[1],
+			Data:   epi.userData,
 		}
 		i++
 		if i == len(events) {
-- 
cgit v1.2.3


From 3d32ad1367b4e84a0822808f44bd7b9f9351db71 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 18:31:55 -0800
Subject: Generate implementation of io.WriterTo via go-marshal.

PiperOrigin-RevId: 295269654
---
 tools/go_marshal/gomarshal/generator.go            |  6 ++-
 tools/go_marshal/gomarshal/generator_interfaces.go | 46 ++++++++++++++++++++++
 tools/go_marshal/gomarshal/generator_tests.go      | 34 ++++++++++++++--
 tools/go_marshal/marshal/marshal.go                |  4 ++
 4 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index fbec7bb9a..0294ba5ba 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -101,14 +101,16 @@ func NewGenerator(srcs []string, out, outTest, pkg string, imports []string) (*G
 		// used, so that they're always added to the generated code.
 		g.imports.add(i).markUsed()
 	}
-	g.imports.add(marshalImport).markUsed()
+
 	// The following imports may or may not be used by the generated code,
 	// depending on what's required for the target types. Don't mark these as
 	// used by default.
+	g.imports.add("io")
 	g.imports.add("reflect")
 	g.imports.add("runtime")
-	g.imports.add(safecopyImport)
 	g.imports.add("unsafe")
+	g.imports.add(marshalImport)
+	g.imports.add(safecopyImport)
 	g.imports.add(usermemImport)
 
 	return &g, nil
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
index f25331ac5..22aae0f6b 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -602,4 +602,50 @@ func (g *interfaceGenerator) emitMarshallable() {
 		}
 	})
 	g.emit("}\n\n")
+
+	g.emit("// WriteTo implements io.WriterTo.WriteTo.\n")
+	g.recordUsedImport("io")
+	g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
+			g.emit("buf := make([]byte, %s.SizeBytes())\n", g.r)
+			g.emit("%s.MarshalBytes(buf)\n", g.r)
+			g.emit("n, err := w.Write(buf)\n")
+			g.emit("return int64(n), err\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !%s {\n", cond)
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			// Fast serialization.
+			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+			g.emit("val := uintptr(ptr)\n")
+			g.emit("val = val^0\n\n")
+
+			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+			g.emit("var buf []byte\n")
+			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+			g.emit("hdr.Data = val\n")
+			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+			g.emit("len, err := w.Write(buf)\n")
+			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+			g.emit("// must live until after the Write.\n")
+			g.emit("runtime.KeepAlive(%s)\n", g.r)
+			g.emit("return int64(len), err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
 }
diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go
index cc760b6d0..5ad97af14 100644
--- a/tools/go_marshal/gomarshal/generator_tests.go
+++ b/tools/go_marshal/gomarshal/generator_tests.go
@@ -22,6 +22,7 @@ import (
 )
 
 var standardImports = []string{
+	"bytes",
 	"fmt",
 	"reflect",
 	"testing",
@@ -117,26 +118,50 @@ func (g *testGenerator) emitTestMarshalUnmarshalPreservesData() {
 		g.emit("y.UnmarshalBytes(buf)\n")
 		g.emit("if !reflect.DeepEqual(x, y) {\n")
 		g.inIndent(func() {
-			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across Marshal/Unmarshal cycle:\\nBefore: %%+v\\nAfter: %%+v\\n\", x, y))\n")
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalBytes/UnmarshalBytes cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, y))\n")
 		})
 		g.emit("}\n")
 		g.emit("yUnsafe.UnmarshalBytes(bufUnsafe)\n")
 		g.emit("if !reflect.DeepEqual(x, yUnsafe) {\n")
 		g.inIndent(func() {
-			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafe/Unmarshal cycle:\\nBefore: %%+v\\nAfter: %%+v\\n\", x, yUnsafe))\n")
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafe/UnmarshalBytes cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, yUnsafe))\n")
 		})
 		g.emit("}\n\n")
 
 		g.emit("z.UnmarshalUnsafe(buf)\n")
 		g.emit("if !reflect.DeepEqual(x, z) {\n")
 		g.inIndent(func() {
-			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across Marshal/UnmarshalUnsafe cycle:\\nBefore: %%+v\\nAfter: %%+v\\n\", x, z))\n")
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalBytes/UnmarshalUnsafe cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, z))\n")
 		})
 		g.emit("}\n")
 		g.emit("zUnsafe.UnmarshalUnsafe(bufUnsafe)\n")
 		g.emit("if !reflect.DeepEqual(x, zUnsafe) {\n")
 		g.inIndent(func() {
-			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafe/UnmarshalUnsafe cycle:\\nBefore: %%+v\\nAfter: %%+v\\n\", x, zUnsafe))\n")
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafe/UnmarshalUnsafe cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, zUnsafe))\n")
+		})
+		g.emit("}\n")
+	})
+}
+
+func (g *testGenerator) emitTestWriteToUnmarshalPreservesData() {
+	g.inTestFunction("TestWriteToUnmarshalPreservesData", func() {
+		g.emit("var x, y, yUnsafe %s\n", g.typeName())
+		g.emit("analysis.RandomizeValue(&x)\n\n")
+
+		g.emit("var buf bytes.Buffer\n\n")
+
+		g.emit("x.WriteTo(&buf)\n")
+		g.emit("y.UnmarshalBytes(buf.Bytes())\n\n")
+		g.emit("yUnsafe.UnmarshalUnsafe(buf.Bytes())\n\n")
+
+		g.emit("if !reflect.DeepEqual(x, y) {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across WriteTo/UnmarshalBytes cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, y))\n")
+		})
+		g.emit("}\n")
+		g.emit("if !reflect.DeepEqual(x, yUnsafe) {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across WriteTo/UnmarshalUnsafe cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, yUnsafe))\n")
 		})
 		g.emit("}\n")
 	})
@@ -146,6 +171,7 @@ func (g *testGenerator) emitTests() {
 	g.emitTestNonZeroSize()
 	g.emitTestSuspectAlignment()
 	g.emitTestMarshalUnmarshalPreservesData()
+	g.emitTestWriteToUnmarshalPreservesData()
 }
 
 func (g *testGenerator) write(out io.Writer) error {
diff --git a/tools/go_marshal/marshal/marshal.go b/tools/go_marshal/marshal/marshal.go
index 10614ec4d..e521b50bd 100644
--- a/tools/go_marshal/marshal/marshal.go
+++ b/tools/go_marshal/marshal/marshal.go
@@ -21,6 +21,8 @@
 package marshal
 
 import (
+	"io"
+
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -42,6 +44,8 @@ type Task interface {
 
 // Marshallable represents a type that can be marshalled to and from memory.
 type Marshallable interface {
+	io.WriterTo
+
 	// SizeBytes is the size of the memory representation of a type in
 	// marshalled form.
 	SizeBytes() int
-- 
cgit v1.2.3


From 5cc0bbbafb2dc7d248bc3141b4cfa022d420abd1 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Sat, 15 Feb 2020 00:00:04 -0800
Subject: Ensure Marshallable.SizeBytes() always works on a typed nil pointer.

This lets go-marshal replace various calls to binary.Size() throughout
the sentry without requiring concrete objects.

PiperOrigin-RevId: 295299965
---
 tools/go_marshal/gomarshal/generator_interfaces.go |  2 +-
 tools/go_marshal/gomarshal/generator_tests.go      | 15 +++++++++++++++
 tools/go_marshal/marshal/marshal.go                |  4 ++++
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
index 22aae0f6b..3aa299ccd 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -301,7 +301,7 @@ func (g *interfaceGenerator) emitMarshallable() {
 					primitiveSize += size
 				} else {
 					g.recordUsedMarshallable(t.Name)
-					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("%s.SizeBytes()", g.fieldAccessor(n)))
+					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", t.Name))
 				}
 			},
 			selector: func(n, tX, tSel *ast.Ident) {
diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go
index 5ad97af14..8c28b00d0 100644
--- a/tools/go_marshal/gomarshal/generator_tests.go
+++ b/tools/go_marshal/gomarshal/generator_tests.go
@@ -167,11 +167,26 @@ func (g *testGenerator) emitTestWriteToUnmarshalPreservesData() {
 	})
 }
 
+func (g *testGenerator) emitTestSizeBytesOnTypedNilPtr() {
+	g.inTestFunction("TestSizeBytesOnTypedNilPtr", func() {
+		g.emit("var x %s\n", g.typeName())
+		g.emit("sizeFromConcrete := x.SizeBytes()\n")
+		g.emit("sizeFromTypedNilPtr := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		g.emit("if sizeFromTypedNilPtr != sizeFromConcrete {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatalf(\"SizeBytes() on typed nil pointer (%v) doesn't match size returned by a concrete object (%v).\\n\", sizeFromTypedNilPtr, sizeFromConcrete)")
+		})
+		g.emit("}\n")
+	})
+}
+
 func (g *testGenerator) emitTests() {
 	g.emitTestNonZeroSize()
 	g.emitTestSuspectAlignment()
 	g.emitTestMarshalUnmarshalPreservesData()
 	g.emitTestWriteToUnmarshalPreservesData()
+	g.emitTestSizeBytesOnTypedNilPtr()
 }
 
 func (g *testGenerator) write(out io.Writer) error {
diff --git a/tools/go_marshal/marshal/marshal.go b/tools/go_marshal/marshal/marshal.go
index e521b50bd..20353850d 100644
--- a/tools/go_marshal/marshal/marshal.go
+++ b/tools/go_marshal/marshal/marshal.go
@@ -48,6 +48,10 @@ type Marshallable interface {
 
 	// SizeBytes is the size of the memory representation of a type in
 	// marshalled form.
+	//
+	// SizeBytes must handle a nil receiver. Practically, this means SizeBytes
+	// cannot deference any fields on the object implementing it (but will
+	// likely make use of the type of these fields).
 	SizeBytes() int
 
 	// MarshalBytes serializes a copy of a type to dst. dst must be at least
-- 
cgit v1.2.3


From fe92fb36acf0f325cb34e563839c4e7721af77b8 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Mon, 17 Feb 2020 15:01:28 -0800
Subject: Add flag to run without building baked images.

Adds flag to :benchmarks to allow running without
building "baked images", which depends on gcloud.

Users can skip gcloud workflows using the following:

blaze run --define gcloud=off :benchmarks -- my-command-without-gcloud

"run-gcp" will not work with the flag set, but all other commands will.

PiperOrigin-RevId: 295627718
---
 benchmarks/BUILD     | 18 ++++++++++++++----
 benchmarks/README.md | 23 ++++++++++++-----------
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/benchmarks/BUILD b/benchmarks/BUILD
index 43614cf5d..2a2d15d7e 100644
--- a/benchmarks/BUILD
+++ b/benchmarks/BUILD
@@ -1,12 +1,22 @@
 package(licenses = ["notice"])
 
+config_setting(
+    name = "gcloud_rule",
+    values = {
+        "define": "gcloud=off",
+    },
+)
+
 py_binary(
     name = "benchmarks",
     srcs = ["run.py"],
-    data = [
-        "//tools/images:ubuntu1604",
-        "//tools/images:zone",
-    ],
+    data = select({
+        ":gcloud_rule": [],
+        "//conditions:default": [
+            "//tools/images:ubuntu1604",
+            "//tools/images:zone",
+        ],
+    }),
     main = "run.py",
     python_version = "PY3",
     srcs_version = "PY3",
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 975321c99..6d1ea3ae2 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -28,10 +28,12 @@ For configuring the environment manually, consult the
 
 ### Locally
 
-Run the following from the benchmarks directory:
+The tool is built to, by default, use Google Cloud Platform to run benchmarks,
+but it does support GCP workflows. To run locally, run the following from the
+benchmarks directory:
 
 ```bash
-bazel run :benchmarks -- run-local startup
+bazel run --define gcloud=off :benchmarks -- run-local startup
 
 ...
 method,metric,result
@@ -46,17 +48,16 @@ runtime, runc. Running on another installed runtime, like say runsc, is as
 simple as:
 
 ```bash
-bazel run :benchmarks -- run-local startup --runtime=runsc
+bazel run  --define gcloud=off :benchmarks -- run-local startup --runtime=runsc
 ```
 
-There is help: ``bash bash bazel run :benchmarks -- --help bazel
-run :benchmarks -- run-local --help` ``
+There is help: `bash bazel run --define gcloud=off :benchmarks -- --help bazel
+run --define gcloud=off :benchmarks -- run-local --help`
 
 To list available benchmarks, use the `list` commmand:
 
 ```bash
-bazel run :benchmarks -- list
-ls
+bazel --define gcloud=off  run :benchmarks -- list
 
 ...
 Benchmark: sysbench.cpu
@@ -69,7 +70,7 @@ Metrics: events_per_second
 You can choose benchmarks by name or regex like:
 
 ```bash
-bazel run :benchmarks -- run-local startup.node
+bazel run --define gcloud=off :benchmarks -- run-local startup.node
 ...
 metric,result
 startup_time_ms,1671.7178000000001
@@ -79,7 +80,7 @@ startup_time_ms,1671.7178000000001
 or
 
 ```bash
-bazel run :benchmarks -- run-local s
+bazel run --define gcloud=off :benchmarks -- run-local s
 ...
 method,metric,result
 startup.empty,startup_time_ms,1792.8292
@@ -97,13 +98,13 @@ You can run parameterized benchmarks, for example to run with different
 runtimes:
 
 ```bash
-bazel run :benchmarks -- run-local --runtime=runc --runtime=runsc sysbench.cpu
+bazel run --define gcloud=off :benchmarks -- run-local --runtime=runc --runtime=runsc sysbench.cpu
 ```
 
 Or with different parameters:
 
 ```bash
-bazel run :benchmarks -- run-local --max_prime=10 --max_prime=100 sysbench.cpu
+bazel run --define gcloud=off :benchmarks -- run-local --max_prime=10 --max_prime=100 sysbench.cpu
 ```
 
 ### On Google Compute Engine (GCE)
-- 
cgit v1.2.3


From fae3de21af7f50266565643c6283912b087b0f5a Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 10:49:47 -0800
Subject: ring0/pagetables: fix typo

PiperOrigin-RevId: 295770717
---
 pkg/sentry/platform/ring0/pagetables/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 971eed7fa..4f2406ce3 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -7,7 +7,7 @@ go_template(
     name = "generic_walker",
     srcs = select_arch(
         amd64 = ["walker_amd64.go"],
-        arm64 = ["walker_amd64.go"],
+        arm64 = ["walker_arm64.go"],
     ),
     opt_types = [
         "Visitor",
-- 
cgit v1.2.3


From b30b7f3422202232ad1c385a7ac0d775151fee2f Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Tue, 18 Feb 2020 11:30:42 -0800
Subject: Add nat table support for iptables.

Add nat table support for Prerouting hook with Redirect option.
Add tests to check redirect of ports.
---
 pkg/abi/linux/netfilter.go               | 27 ++++++++++++++
 pkg/sentry/socket/netfilter/netfilter.go | 58 +++++++++++++++++++++++++++--
 pkg/tcpip/iptables/iptables.go           | 21 +++++++++++
 pkg/tcpip/iptables/targets.go            | 24 ++++++++++++
 pkg/tcpip/stack/nic.go                   | 23 ++++++++++++
 test/iptables/iptables_test.go           | 12 ++++++
 test/iptables/iptables_util.go           | 10 +++++
 test/iptables/nat.go                     | 64 +++++++++++++++++++++++++++++++-
 8 files changed, 234 insertions(+), 5 deletions(-)

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index bbc4df74c..ba4d84962 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -250,6 +250,33 @@ type XTErrorTarget struct {
 // SizeOfXTErrorTarget is the size of an XTErrorTarget.
 const SizeOfXTErrorTarget = 64
 
+// NfNATIPV4Range. It corresponds to struct nf_nat_ipv4_range
+// in include/uapi/linux/netfilter/nf_nat.h.
+type NfNATIPV4Range struct {
+	Flags   uint32
+	MinIP   [4]byte
+	MaxIP   [4]byte
+	MinPort uint16
+	MaxPort uint16
+}
+
+// NfNATIPV4MultiRangeCompat. It corresponds to struct
+// nf_nat_ipv4_multi_range_compat in include/uapi/linux/netfilter/nf_nat.h.
+type NfNATIPV4MultiRangeCompat struct {
+	Rangesize uint32
+	RangeIPV4 [1]NfNATIPV4Range
+}
+
+// XTRedirectTarget triggers a redirect when reached.
+type XTRedirectTarget struct {
+	Target  XTEntryTarget
+	NfRange NfNATIPV4MultiRangeCompat
+	_       [4]byte
+}
+
+// SizeOfXTRedirectTarget is the size of an XTRedirectTarget.
+const SizeOfXTRedirectTarget = 56
+
 // IPTGetinfo is the argument for the IPT_SO_GET_INFO sockopt. It corresponds
 // to struct ipt_getinfo in include/uapi/linux/netfilter_ipv4/ip_tables.h.
 type IPTGetinfo struct {
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3fc80e0de..512ad624a 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -35,6 +35,11 @@ import (
 // shouldn't be reached - an error has occurred if we fall through to one.
 const errorTargetName = "ERROR"
 
+// redirectTargetName is used to mark targets as redirect targets. Redirect
+// targets should be reached for only NAT and Mangle tables. These targets will
+// change the destination port/destination IP for packets.
+const redirectTargetName = "REDIRECT"
+
 // Metadata is used to verify that we are correctly serializing and
 // deserializing iptables into structs consumable by the iptables tool. We save
 // a metadata struct when the tables are written, and when they are read out we
@@ -240,6 +245,8 @@ func marshalTarget(target iptables.Target) []byte {
 		return marshalErrorTarget(tg.Name)
 	case iptables.ReturnTarget:
 		return marshalStandardTarget(iptables.RuleReturn)
+	case iptables.RedirectTarget:
+		return marshalRedirectTarget()
 	default:
 		panic(fmt.Errorf("unknown target of type %T", target))
 	}
@@ -274,6 +281,18 @@ func marshalErrorTarget(errorName string) []byte {
 	return binary.Marshal(ret, usermem.ByteOrder, target)
 }
 
+func marshalRedirectTarget() []byte {
+	// This is a redirect target named redirect
+	target := linux.XTRedirectTarget{
+		Target: linux.XTEntryTarget{
+			TargetSize: linux.SizeOfXTRedirectTarget,
+		},
+	}
+
+	ret := make([]byte, 0, linux.SizeOfXTRedirectTarget)
+	return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
 // translateFromStandardVerdict translates verdicts the same way as the iptables
 // tool.
 func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 {
@@ -326,6 +345,8 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	switch replace.Name.String() {
 	case iptables.TablenameFilter:
 		table = iptables.EmptyFilterTable()
+	case iptables.TablenameNat:
+		table = iptables.EmptyNatTable()
 	default:
 		nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
 		return syserr.ErrInvalidArgument
@@ -455,10 +476,11 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	}
 
 	// TODO(gvisor.dev/issue/170): Support other chains.
-	// Since we only support modifying the INPUT chain right now, make sure
-	// all other chains point to ACCEPT rules.
+	// Since we only support modifying the INPUT chain and redirect for
+	// PREROUTING chain right now, make sure all other chains point to
+	// ACCEPT rules.
 	for hook, ruleIdx := range table.BuiltinChains {
-		if hook != iptables.Input {
+		if hook != iptables.Input && hook != iptables.Prerouting {
 			if _, ok := table.Rules[ruleIdx].Target.(iptables.AcceptTarget); !ok {
 				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
@@ -575,6 +597,36 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 			nflog("set entries: user-defined target %q", name)
 			return iptables.UserChainTarget{Name: name}, nil
 		}
+
+	case redirectTargetName:
+		// Redirect target.
+		if len(optVal) < linux.SizeOfXTRedirectTarget {
+			return nil, fmt.Errorf("netfilter.SetEntries: optVal has insufficient size for redirect target %d", len(optVal))
+		}
+
+		var redirectTarget linux.XTRedirectTarget
+		buf = optVal[:linux.SizeOfXTRedirectTarget]
+		binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
+
+		// Copy linux.XTRedirectTarget to iptables.RedirectTarget.
+		var target iptables.RedirectTarget
+		nfRange := redirectTarget.NfRange
+
+		target.RangeSize = nfRange.Rangesize
+		target.Flags = nfRange.RangeIPV4[0].Flags
+
+		target.MinIP = tcpip.Address(nfRange.RangeIPV4[0].MinIP[:])
+		target.MaxIP = tcpip.Address(nfRange.RangeIPV4[0].MaxIP[:])
+
+		// Convert port from big endian to little endian.
+		port := make([]byte, 2)
+		binary.BigEndian.PutUint16(port, nfRange.RangeIPV4[0].MinPort)
+		target.MinPort = binary.LittleEndian.Uint16(port)
+
+		binary.BigEndian.PutUint16(port, nfRange.RangeIPV4[0].MaxPort)
+		target.MaxPort = binary.LittleEndian.Uint16(port)
+		return target, nil
+
 	}
 
 	// Unknown target.
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 75a433a3b..c00d012c0 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -135,6 +135,27 @@ func EmptyFilterTable() Table {
 	}
 }
 
+// EmptyNatTable returns a Table with no rules and the filter table chains
+// mapped to HookUnset.
+func EmptyNatTable() Table {
+	return Table{
+		Rules: []Rule{},
+		BuiltinChains: map[Hook]int{
+			Prerouting:  HookUnset,
+			Input:       HookUnset,
+			Output:      HookUnset,
+			Postrouting: HookUnset,
+		},
+		Underflows: map[Hook]int{
+			Prerouting:  HookUnset,
+			Input:       HookUnset,
+			Output:      HookUnset,
+			Postrouting: HookUnset,
+		},
+		UserChains: map[string]int{},
+	}
+}
+
 // Check runs pkt through the rules for hook. It returns true when the packet
 // should continue traversing the network stack and false when it should be
 // dropped.
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 9fc60cfad..06e65bece 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -19,6 +19,7 @@ package iptables
 import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
 // AcceptTarget accepts packets.
@@ -65,3 +66,26 @@ type ReturnTarget struct{}
 func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
 	return RuleReturn, ""
 }
+
+// RedirectTarget redirects the packet by modifying the destination port/IP.
+type RedirectTarget struct {
+	RangeSize uint32
+	Flags     uint32
+	MinIP     tcpip.Address
+	MaxIP     tcpip.Address
+	MinPort   uint16
+	MaxPort   uint16
+}
+
+// Action implements Target.Action.
+func (rt RedirectTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+	log.Infof("RedirectTarget triggered.")
+
+	// TODO(gvisor.dev/issue/170): Checking only for UDP protocol.
+	// We're yet to support for TCP protocol.
+	headerView := packet.Data.First()
+	h := header.UDP(headerView)
+	h.SetDestinationPort(rt.MinPort)
+
+	return RuleAccept, ""
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index ca3a7a07e..2028f5201 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 )
 
 // NIC represents a "network interface card" to which the networking stack is
@@ -1012,6 +1013,7 @@ func (n *NIC) leaveGroupLocked(addr tcpip.Address) *tcpip.Error {
 func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt tcpip.PacketBuffer) {
 	r := makeRoute(protocol, dst, src, localLinkAddr, ref, false /* handleLocal */, false /* multicastLoop */)
 	r.RemoteLinkAddress = remotelinkAddr
+
 	ref.ep.HandlePacket(&r, pkt)
 	ref.decRef()
 }
@@ -1082,6 +1084,27 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 		n.stack.stats.IP.InvalidSourceAddressesReceived.Increment()
 		return
 	}
+
+	// TODO(gvisor.dev/issue/170): Not supporting iptables for IPv6 yet.
+	if protocol == header.IPv4ProtocolNumber {
+		newPkt := pkt.Clone()
+
+		headerView := newPkt.Data.First()
+		h := header.IPv4(headerView)
+		newPkt.NetworkHeader = headerView[:h.HeaderLength()]
+
+		hlen := int(h.HeaderLength())
+		tlen := int(h.TotalLength())
+		newPkt.Data.TrimFront(hlen)
+		newPkt.Data.CapLength(tlen - hlen)
+
+		ipt := n.stack.IPTables()
+		if ok := ipt.Check(iptables.Prerouting, newPkt); !ok {
+			// iptables is telling us to drop the packet.
+			return
+		}
+	}
+
 	if ref := n.getRef(protocol, dst); ref != nil {
 		handlePacket(protocol, dst, src, linkEP.LinkAddress(), remote, ref, pkt)
 		return
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 46a7c99b0..7d061acba 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -196,12 +196,24 @@ func TestNATRedirectUDPPort(t *testing.T) {
 	}
 }
 
+func TestNATRedirectTCPPort(t *testing.T) {
+	if err := singleTest(NATRedirectTCPPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
 func TestNATDropUDP(t *testing.T) {
 	if err := singleTest(NATDropUDP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
+func TestNATAcceptAll(t *testing.T) {
+	if err := singleTest(NATAcceptAll{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
 func TestFilterInputDropTCPDestPort(t *testing.T) {
 	if err := singleTest(FilterInputDropTCPDestPort{}); err != nil {
 		t.Fatal(err)
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index 043114c78..5c9199abf 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -35,6 +35,16 @@ func filterTable(args ...string) error {
 	return nil
 }
 
+// natTable calls `iptables -t nat` with the given args.
+func natTable(args ...string) error {
+	args = append([]string{"-t", "nat"}, args...)
+	cmd := exec.Command(iptablesBinary, args...)
+	if out, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("error running iptables with args %v\nerror: %v\noutput: %s", args, err, string(out))
+	}
+	return nil
+}
+
 // listenUDP listens on a UDP port and returns the value of net.Conn.Read() for
 // the first read on that port.
 func listenUDP(port int, timeout time.Duration) error {
diff --git a/test/iptables/nat.go b/test/iptables/nat.go
index b5c6f927e..306cbd1b3 100644
--- a/test/iptables/nat.go
+++ b/test/iptables/nat.go
@@ -25,7 +25,9 @@ const (
 
 func init() {
 	RegisterTestCase(NATRedirectUDPPort{})
+	RegisterTestCase(NATRedirectTCPPort{})
 	RegisterTestCase(NATDropUDP{})
+	RegisterTestCase(NATAcceptAll{})
 }
 
 // NATRedirectUDPPort tests that packets are redirected to different port.
@@ -38,13 +40,14 @@ func (NATRedirectUDPPort) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (NATRedirectUDPPort) ContainerAction(ip net.IP) error {
-	if err := filterTable("-t", "nat", "-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
+	if err := natTable("-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
 		return err
 	}
 
 	if err := listenUDP(redirectPort, sendloopDuration); err != nil {
 		return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %v", redirectPort, err)
 	}
+
 	return nil
 }
 
@@ -53,6 +56,37 @@ func (NATRedirectUDPPort) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
 
+// NATRedirectTCPPort tests that connections are redirected on specified ports.
+type NATRedirectTCPPort struct{}
+
+// Name implements TestCase.Name.
+func (NATRedirectTCPPort) Name() string {
+	return "NATRedirectTCPPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATRedirectTCPPort) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "PREROUTING", "-p", "tcp", "-m", "tcp", "--dport", fmt.Sprintf("%d", dropPort), "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on redirect port.
+	if err := listenTCP(redirectPort, sendloopDuration); err != nil {
+		return fmt.Errorf("connection on port %d should be accepted, but got error %v", redirectPort, err)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATRedirectTCPPort) LocalAction(ip net.IP) error {
+	if err := connectTCP(ip, dropPort, acceptPort, sendloopDuration); err != nil {
+		return fmt.Errorf("connection destined to port %d should be accepted, but got error %v", dropPort, err)
+	}
+
+	return nil
+}
+
 // NATDropUDP tests that packets are not received in ports other than redirect port.
 type NATDropUDP struct{}
 
@@ -63,7 +97,7 @@ func (NATDropUDP) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (NATDropUDP) ContainerAction(ip net.IP) error {
-	if err := filterTable("-t", "nat", "-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
+	if err := natTable("-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
 		return err
 	}
 
@@ -78,3 +112,29 @@ func (NATDropUDP) ContainerAction(ip net.IP) error {
 func (NATDropUDP) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
+
+// NATAcceptAll tests that all UDP packets are accepted.
+type NATAcceptAll struct{}
+
+// Name implements TestCase.Name.
+func (NATAcceptAll) Name() string {
+	return "NATAcceptAll"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATAcceptAll) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "PREROUTING", "-p", "udp", "-j", "ACCEPT"); err != nil {
+		return err
+	}
+
+	if err := listenUDP(acceptPort, sendloopDuration); err != nil {
+		return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %v", acceptPort, err)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATAcceptAll) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
-- 
cgit v1.2.3


From c841373013ec8659b2954563796479f275b00bfa Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 12:00:58 -0800
Subject: Deflake fallocate syscall test.

- Retry if fallocate returns EINTR.

- If fallocate fails, don't try to fstat and confirm the result.

PiperOrigin-RevId: 295789790
---
 test/syscalls/linux/fallocate.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc
index 1c3d00287..7819f4ac3 100644
--- a/test/syscalls/linux/fallocate.cc
+++ b/test/syscalls/linux/fallocate.cc
@@ -33,7 +33,7 @@ namespace testing {
 namespace {
 
 int fallocate(int fd, int mode, off_t offset, off_t len) {
-  return syscall(__NR_fallocate, fd, mode, offset, len);
+  return RetryEINTR(syscall)(__NR_fallocate, fd, mode, offset, len);
 }
 
 class AllocateTest : public FileTest {
@@ -47,27 +47,27 @@ TEST_F(AllocateTest, Fallocate) {
   EXPECT_EQ(buf.st_size, 0);
 
   // Grow to ten bytes.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 10), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 10), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 10);
 
   // Allocate to a smaller size should be noop.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 5), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 5), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 10);
 
   // Grow again.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 20), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 20), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 20);
 
   // Grow with offset.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 10, 20), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 10, 20), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 30);
 
   // Grow with offset beyond EOF.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 39, 1), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 39, 1), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 40);
 }
-- 
cgit v1.2.3


From 906eb6295d54a05663a223f1dc379a16148de2d1 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 13:42:31 -0800
Subject: atomicbitops package cleanups

- Redocument memory ordering from "no ordering" to "acquire-release". (No
  functional change: both LOCK WHATEVER on x86, and LDAXR/STLXR loops on ARM64,
  already have this property.)

- Remove IncUnlessZeroInt32 and DecUnlessOneInt32, which were only faster than
  the equivalent loops using sync/atomic before the Go compiler inlined
  non-unsafe.Pointer atomics many releases ago.

PiperOrigin-RevId: 295811743
---
 pkg/atomicbitops/BUILD                   |   9 +-
 pkg/atomicbitops/atomic_bitops.go        |  60 -------
 pkg/atomicbitops/atomic_bitops_amd64.s   | 115 --------------
 pkg/atomicbitops/atomic_bitops_arm64.s   | 139 ----------------
 pkg/atomicbitops/atomic_bitops_common.go | 147 -----------------
 pkg/atomicbitops/atomic_bitops_test.go   | 262 -------------------------------
 pkg/atomicbitops/atomicbitops.go         |  47 ++++++
 pkg/atomicbitops/atomicbitops_amd64.s    |  77 +++++++++
 pkg/atomicbitops/atomicbitops_arm64.s    | 105 +++++++++++++
 pkg/atomicbitops/atomicbitops_noasm.go   | 105 +++++++++++++
 pkg/atomicbitops/atomicbitops_test.go    | 198 +++++++++++++++++++++++
 pkg/sentry/mm/address_space.go           |  23 ++-
 pkg/sentry/mm/lifecycle.go               |  11 +-
 13 files changed, 563 insertions(+), 735 deletions(-)
 delete mode 100644 pkg/atomicbitops/atomic_bitops.go
 delete mode 100644 pkg/atomicbitops/atomic_bitops_amd64.s
 delete mode 100644 pkg/atomicbitops/atomic_bitops_arm64.s
 delete mode 100644 pkg/atomicbitops/atomic_bitops_common.go
 delete mode 100644 pkg/atomicbitops/atomic_bitops_test.go
 create mode 100644 pkg/atomicbitops/atomicbitops.go
 create mode 100644 pkg/atomicbitops/atomicbitops_amd64.s
 create mode 100644 pkg/atomicbitops/atomicbitops_arm64.s
 create mode 100644 pkg/atomicbitops/atomicbitops_noasm.go
 create mode 100644 pkg/atomicbitops/atomicbitops_test.go

diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD
index 3948074ba..ba8b06071 100644
--- a/pkg/atomicbitops/BUILD
+++ b/pkg/atomicbitops/BUILD
@@ -5,10 +5,9 @@ package(licenses = ["notice"])
 go_library(
     name = "atomicbitops",
     srcs = [
-        "atomic_bitops.go",
-        "atomic_bitops_amd64.s",
-        "atomic_bitops_arm64.s",
-        "atomic_bitops_common.go",
+        "atomicbitops.go",
+        "atomicbitops_amd64.s",
+        "atomicbitops_noasm.go",
     ],
     visibility = ["//:sandbox"],
 )
@@ -16,7 +15,7 @@ go_library(
 go_test(
     name = "atomicbitops_test",
     size = "small",
-    srcs = ["atomic_bitops_test.go"],
+    srcs = ["atomicbitops_test.go"],
     library = ":atomicbitops",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/atomicbitops/atomic_bitops.go b/pkg/atomicbitops/atomic_bitops.go
deleted file mode 100644
index fcc41a9ea..000000000
--- a/pkg/atomicbitops/atomic_bitops.go
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build amd64 arm64
-
-// Package atomicbitops provides basic bitwise operations in an atomic way.
-// The implementation on amd64 leverages the LOCK prefix directly instead of
-// relying on the generic cas primitives, and the arm64 leverages the LDAXR
-// and STLXR pair primitives.
-//
-// WARNING: the bitwise ops provided in this package doesn't imply any memory
-// ordering. Using them to construct locks must employ proper memory barriers.
-package atomicbitops
-
-// AndUint32 atomically applies bitwise and operation to *addr with val.
-func AndUint32(addr *uint32, val uint32)
-
-// OrUint32 atomically applies bitwise or operation to *addr with val.
-func OrUint32(addr *uint32, val uint32)
-
-// XorUint32 atomically applies bitwise xor operation to *addr with val.
-func XorUint32(addr *uint32, val uint32)
-
-// CompareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
-// the value previously stored at addr.
-func CompareAndSwapUint32(addr *uint32, old, new uint32) uint32
-
-// AndUint64 atomically applies bitwise and operation to *addr with val.
-func AndUint64(addr *uint64, val uint64)
-
-// OrUint64 atomically applies bitwise or operation to *addr with val.
-func OrUint64(addr *uint64, val uint64)
-
-// XorUint64 atomically applies bitwise xor operation to *addr with val.
-func XorUint64(addr *uint64, val uint64)
-
-// CompareAndSwapUint64 is like sync/atomic.CompareAndSwapUint64, but returns
-// the value previously stored at addr.
-func CompareAndSwapUint64(addr *uint64, old, new uint64) uint64
-
-// IncUnlessZeroInt32 increments the value stored at the given address and
-// returns true; unless the value stored in the pointer is zero, in which case
-// it is left unmodified and false is returned.
-func IncUnlessZeroInt32(addr *int32) bool
-
-// DecUnlessOneInt32 decrements the value stored at the given address and
-// returns true; unless the value stored in the pointer is 1, in which case it
-// is left unmodified and false is returned.
-func DecUnlessOneInt32(addr *int32) bool
diff --git a/pkg/atomicbitops/atomic_bitops_amd64.s b/pkg/atomicbitops/atomic_bitops_amd64.s
deleted file mode 100644
index db0972001..000000000
--- a/pkg/atomicbitops/atomic_bitops_amd64.s
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build amd64
-
-#include "textflag.h"
-
-TEXT ·AndUint32(SB),$0-12
-  MOVQ  addr+0(FP), BP
-  MOVL  val+8(FP), AX
-  LOCK
-  ANDL   AX, 0(BP)
-  RET
-
-TEXT ·OrUint32(SB),$0-12
-  MOVQ  addr+0(FP), BP
-  MOVL  val+8(FP), AX
-  LOCK
-  ORL   AX, 0(BP)
-  RET
-
-TEXT ·XorUint32(SB),$0-12
-  MOVQ  addr+0(FP), BP
-  MOVL  val+8(FP), AX
-  LOCK
-  XORL   AX, 0(BP)
-  RET
-
-TEXT ·CompareAndSwapUint32(SB),$0-20
-  MOVQ  addr+0(FP), DI
-  MOVL  old+8(FP), AX
-  MOVL  new+12(FP), DX
-  LOCK
-  CMPXCHGL DX, 0(DI)
-  MOVL  AX, ret+16(FP)
-  RET
-
-TEXT ·AndUint64(SB),$0-16
-  MOVQ  addr+0(FP), BP
-  MOVQ  val+8(FP), AX
-  LOCK
-  ANDQ   AX, 0(BP)
-  RET
-
-TEXT ·OrUint64(SB),$0-16
-  MOVQ  addr+0(FP), BP
-  MOVQ  val+8(FP), AX
-  LOCK
-  ORQ   AX, 0(BP)
-  RET
-
-TEXT ·XorUint64(SB),$0-16
-  MOVQ  addr+0(FP), BP
-  MOVQ  val+8(FP), AX
-  LOCK
-  XORQ   AX, 0(BP)
-  RET
-
-TEXT ·CompareAndSwapUint64(SB),$0-32
-  MOVQ  addr+0(FP), DI
-  MOVQ  old+8(FP), AX
-  MOVQ  new+16(FP), DX
-  LOCK
-  CMPXCHGQ DX, 0(DI)
-  MOVQ  AX, ret+24(FP)
-  RET
-
-TEXT ·IncUnlessZeroInt32(SB),NOSPLIT,$0-9
-  MOVQ     addr+0(FP), DI
-  MOVL     0(DI), AX
-
-retry:
-  TESTL    AX, AX
-  JZ       fail
-  LEAL     1(AX), DX
-  LOCK
-  CMPXCHGL DX, 0(DI)
-  JNZ      retry
-
-  SETEQ    ret+8(FP)
-  RET
-
-fail:
-  MOVB     AX, ret+8(FP)
-  RET
-
-TEXT ·DecUnlessOneInt32(SB),NOSPLIT,$0-9
-  MOVQ     addr+0(FP), DI
-  MOVL     0(DI), AX
-
-retry:
-  LEAL     -1(AX), DX
-  TESTL    DX, DX
-  JZ       fail
-  LOCK
-  CMPXCHGL DX, 0(DI)
-  JNZ      retry
-
-  SETEQ    ret+8(FP)
-  RET
-
-fail:
-  MOVB     DX, ret+8(FP)
-  RET
diff --git a/pkg/atomicbitops/atomic_bitops_arm64.s b/pkg/atomicbitops/atomic_bitops_arm64.s
deleted file mode 100644
index 97f8808c1..000000000
--- a/pkg/atomicbitops/atomic_bitops_arm64.s
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build arm64
-
-#include "textflag.h"
-
-TEXT ·AndUint32(SB),$0-12
-  MOVD    ptr+0(FP), R0
-  MOVW    val+8(FP), R1
-again:
-  LDAXRW  (R0), R2
-  ANDW    R1, R2
-  STLXRW  R2, (R0), R3
-  CBNZ    R3, again
-  RET
-
-TEXT ·OrUint32(SB),$0-12
-  MOVD    ptr+0(FP), R0
-  MOVW    val+8(FP), R1
-again:
-  LDAXRW  (R0), R2
-  ORRW    R1, R2
-  STLXRW  R2, (R0), R3
-  CBNZ    R3, again
-  RET
-
-TEXT ·XorUint32(SB),$0-12
-  MOVD    ptr+0(FP), R0
-  MOVW    val+8(FP), R1
-again:
-  LDAXRW  (R0), R2
-  EORW    R1, R2
-  STLXRW  R2, (R0), R3
-  CBNZ    R3, again
-  RET
-
-TEXT ·CompareAndSwapUint32(SB),$0-20
-  MOVD addr+0(FP), R0
-  MOVW old+8(FP), R1
-  MOVW new+12(FP), R2
-
-again:
-  LDAXRW (R0), R3
-  CMPW R1, R3
-  BNE done
-  STLXRW R2, (R0), R4
-  CBNZ R4, again
-done:
-  MOVW R3, prev+16(FP)
-  RET
-
-TEXT ·AndUint64(SB),$0-16
-  MOVD    ptr+0(FP), R0
-  MOVD    val+8(FP), R1
-again:
-  LDAXR   (R0), R2
-  AND     R1, R2
-  STLXR   R2, (R0), R3
-  CBNZ    R3, again
-  RET
-
-TEXT ·OrUint64(SB),$0-16
-  MOVD    ptr+0(FP), R0
-  MOVD    val+8(FP), R1
-again:
-  LDAXR   (R0), R2
-  ORR     R1, R2
-  STLXR   R2, (R0), R3
-  CBNZ    R3, again
-  RET
-
-TEXT ·XorUint64(SB),$0-16
-  MOVD    ptr+0(FP), R0
-  MOVD    val+8(FP), R1
-again:
-  LDAXR   (R0), R2
-  EOR     R1, R2
-  STLXR   R2, (R0), R3
-  CBNZ    R3, again
-  RET
-
-TEXT ·CompareAndSwapUint64(SB),$0-32
-  MOVD addr+0(FP), R0
-  MOVD old+8(FP), R1
-  MOVD new+16(FP), R2
-
-again:
-  LDAXR (R0), R3
-  CMP R1, R3
-  BNE done
-  STLXR R2, (R0), R4
-  CBNZ R4, again
-done:
-  MOVD R3, prev+24(FP)
-  RET
-
-TEXT ·IncUnlessZeroInt32(SB),NOSPLIT,$0-9
-  MOVD addr+0(FP), R0
-
-again:
-  LDAXRW (R0), R1
-  CBZ R1, fail
-  ADDW $1, R1
-  STLXRW R1, (R0), R2
-  CBNZ R2, again
-  MOVW $1, R2
-  MOVB R2, ret+8(FP)
-  RET
-fail:
-  MOVB ZR, ret+8(FP)
-  RET
-
-TEXT ·DecUnlessOneInt32(SB),NOSPLIT,$0-9
-  MOVD addr+0(FP), R0
-
-again:
-  LDAXRW (R0), R1
-  SUBSW $1, R1, R1
-  BEQ fail
-  STLXRW R1, (R0), R2
-  CBNZ R2, again
-  MOVW $1, R2
-  MOVB R2, ret+8(FP)
-  RET
-fail:
-  MOVB ZR, ret+8(FP)
-  RET
diff --git a/pkg/atomicbitops/atomic_bitops_common.go b/pkg/atomicbitops/atomic_bitops_common.go
deleted file mode 100644
index 85163ad62..000000000
--- a/pkg/atomicbitops/atomic_bitops_common.go
+++ /dev/null
@@ -1,147 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build !amd64,!arm64
-
-package atomicbitops
-
-import (
-	"sync/atomic"
-)
-
-// AndUint32 atomically applies bitwise and operation to *addr with val.
-func AndUint32(addr *uint32, val uint32) {
-	for {
-		o := atomic.LoadUint32(addr)
-		n := o & val
-		if atomic.CompareAndSwapUint32(addr, o, n) {
-			break
-		}
-	}
-}
-
-// OrUint32 atomically applies bitwise or operation to *addr with val.
-func OrUint32(addr *uint32, val uint32) {
-	for {
-		o := atomic.LoadUint32(addr)
-		n := o | val
-		if atomic.CompareAndSwapUint32(addr, o, n) {
-			break
-		}
-	}
-}
-
-// XorUint32 atomically applies bitwise xor operation to *addr with val.
-func XorUint32(addr *uint32, val uint32) {
-	for {
-		o := atomic.LoadUint32(addr)
-		n := o ^ val
-		if atomic.CompareAndSwapUint32(addr, o, n) {
-			break
-		}
-	}
-}
-
-// CompareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
-// the value previously stored at addr.
-func CompareAndSwapUint32(addr *uint32, old, new uint32) (prev uint32) {
-	for {
-		prev = atomic.LoadUint32(addr)
-		if prev != old {
-			return
-		}
-		if atomic.CompareAndSwapUint32(addr, old, new) {
-			return
-		}
-	}
-}
-
-// AndUint64 atomically applies bitwise and operation to *addr with val.
-func AndUint64(addr *uint64, val uint64) {
-	for {
-		o := atomic.LoadUint64(addr)
-		n := o & val
-		if atomic.CompareAndSwapUint64(addr, o, n) {
-			break
-		}
-	}
-}
-
-// OrUint64 atomically applies bitwise or operation to *addr with val.
-func OrUint64(addr *uint64, val uint64) {
-	for {
-		o := atomic.LoadUint64(addr)
-		n := o | val
-		if atomic.CompareAndSwapUint64(addr, o, n) {
-			break
-		}
-	}
-}
-
-// XorUint64 atomically applies bitwise xor operation to *addr with val.
-func XorUint64(addr *uint64, val uint64) {
-	for {
-		o := atomic.LoadUint64(addr)
-		n := o ^ val
-		if atomic.CompareAndSwapUint64(addr, o, n) {
-			break
-		}
-	}
-}
-
-// CompareAndSwapUint64 is like sync/atomic.CompareAndSwapUint64, but returns
-// the value previously stored at addr.
-func CompareAndSwapUint64(addr *uint64, old, new uint64) (prev uint64) {
-	for {
-		prev = atomic.LoadUint64(addr)
-		if prev != old {
-			return
-		}
-		if atomic.CompareAndSwapUint64(addr, old, new) {
-			return
-		}
-	}
-}
-
-// IncUnlessZeroInt32 increments the value stored at the given address and
-// returns true; unless the value stored in the pointer is zero, in which case
-// it is left unmodified and false is returned.
-func IncUnlessZeroInt32(addr *int32) bool {
-	for {
-		v := atomic.LoadInt32(addr)
-		if v == 0 {
-			return false
-		}
-
-		if atomic.CompareAndSwapInt32(addr, v, v+1) {
-			return true
-		}
-	}
-}
-
-// DecUnlessOneInt32 decrements the value stored at the given address and
-// returns true; unless the value stored in the pointer is 1, in which case it
-// is left unmodified and false is returned.
-func DecUnlessOneInt32(addr *int32) bool {
-	for {
-		v := atomic.LoadInt32(addr)
-		if v == 1 {
-			return false
-		}
-
-		if atomic.CompareAndSwapInt32(addr, v, v-1) {
-			return true
-		}
-	}
-}
diff --git a/pkg/atomicbitops/atomic_bitops_test.go b/pkg/atomicbitops/atomic_bitops_test.go
deleted file mode 100644
index 9466d3e23..000000000
--- a/pkg/atomicbitops/atomic_bitops_test.go
+++ /dev/null
@@ -1,262 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package atomicbitops
-
-import (
-	"runtime"
-	"testing"
-
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-const iterations = 100
-
-func detectRaces32(val, target uint32, fn func(*uint32, uint32)) bool {
-	runtime.GOMAXPROCS(100)
-	for n := 0; n < iterations; n++ {
-		x := val
-		var wg sync.WaitGroup
-		for i := uint32(0); i < 32; i++ {
-			wg.Add(1)
-			go func(a *uint32, i uint32) {
-				defer wg.Done()
-				fn(a, uint32(1<<i))
-			}(&x, i)
-		}
-		wg.Wait()
-		if x != target {
-			return true
-		}
-	}
-	return false
-}
-
-func detectRaces64(val, target uint64, fn func(*uint64, uint64)) bool {
-	runtime.GOMAXPROCS(100)
-	for n := 0; n < iterations; n++ {
-		x := val
-		var wg sync.WaitGroup
-		for i := uint64(0); i < 64; i++ {
-			wg.Add(1)
-			go func(a *uint64, i uint64) {
-				defer wg.Done()
-				fn(a, uint64(1<<i))
-			}(&x, i)
-		}
-		wg.Wait()
-		if x != target {
-			return true
-		}
-	}
-	return false
-}
-
-func TestOrUint32(t *testing.T) {
-	if detectRaces32(0x0, 0xffffffff, OrUint32) {
-		t.Error("Data race detected!")
-	}
-}
-
-func TestAndUint32(t *testing.T) {
-	if detectRaces32(0xf0f0f0f0, 0x00000000, AndUint32) {
-		t.Error("Data race detected!")
-	}
-}
-
-func TestXorUint32(t *testing.T) {
-	if detectRaces32(0xf0f0f0f0, 0x0f0f0f0f, XorUint32) {
-		t.Error("Data race detected!")
-	}
-}
-
-func TestOrUint64(t *testing.T) {
-	if detectRaces64(0x0, 0xffffffffffffffff, OrUint64) {
-		t.Error("Data race detected!")
-	}
-}
-
-func TestAndUint64(t *testing.T) {
-	if detectRaces64(0xf0f0f0f0f0f0f0f0, 0x0, AndUint64) {
-		t.Error("Data race detected!")
-	}
-}
-
-func TestXorUint64(t *testing.T) {
-	if detectRaces64(0xf0f0f0f0f0f0f0f0, 0x0f0f0f0f0f0f0f0f, XorUint64) {
-		t.Error("Data race detected!")
-	}
-}
-
-func TestCompareAndSwapUint32(t *testing.T) {
-	tests := []struct {
-		name string
-		prev uint32
-		old  uint32
-		new  uint32
-		next uint32
-	}{
-		{
-			name: "Successful compare-and-swap with prev == new",
-			prev: 10,
-			old:  10,
-			new:  10,
-			next: 10,
-		},
-		{
-			name: "Successful compare-and-swap with prev != new",
-			prev: 20,
-			old:  20,
-			new:  22,
-			next: 22,
-		},
-		{
-			name: "Failed compare-and-swap with prev == new",
-			prev: 31,
-			old:  30,
-			new:  31,
-			next: 31,
-		},
-		{
-			name: "Failed compare-and-swap with prev != new",
-			prev: 41,
-			old:  40,
-			new:  42,
-			next: 41,
-		},
-	}
-	for _, test := range tests {
-		val := test.prev
-		prev := CompareAndSwapUint32(&val, test.old, test.new)
-		if got, want := prev, test.prev; got != want {
-			t.Errorf("%s: incorrect returned previous value: got %d, expected %d", test.name, got, want)
-		}
-		if got, want := val, test.next; got != want {
-			t.Errorf("%s: incorrect value stored in val: got %d, expected %d", test.name, got, want)
-		}
-	}
-}
-
-func TestCompareAndSwapUint64(t *testing.T) {
-	tests := []struct {
-		name string
-		prev uint64
-		old  uint64
-		new  uint64
-		next uint64
-	}{
-		{
-			name: "Successful compare-and-swap with prev == new",
-			prev: 0x100000000,
-			old:  0x100000000,
-			new:  0x100000000,
-			next: 0x100000000,
-		},
-		{
-			name: "Successful compare-and-swap with prev != new",
-			prev: 0x200000000,
-			old:  0x200000000,
-			new:  0x200000002,
-			next: 0x200000002,
-		},
-		{
-			name: "Failed compare-and-swap with prev == new",
-			prev: 0x300000001,
-			old:  0x300000000,
-			new:  0x300000001,
-			next: 0x300000001,
-		},
-		{
-			name: "Failed compare-and-swap with prev != new",
-			prev: 0x400000001,
-			old:  0x400000000,
-			new:  0x400000002,
-			next: 0x400000001,
-		},
-	}
-	for _, test := range tests {
-		val := test.prev
-		prev := CompareAndSwapUint64(&val, test.old, test.new)
-		if got, want := prev, test.prev; got != want {
-			t.Errorf("%s: incorrect returned previous value: got %d, expected %d", test.name, got, want)
-		}
-		if got, want := val, test.next; got != want {
-			t.Errorf("%s: incorrect value stored in val: got %d, expected %d", test.name, got, want)
-		}
-	}
-}
-
-func TestIncUnlessZeroInt32(t *testing.T) {
-	for _, test := range []struct {
-		initial int32
-		final   int32
-		ret     bool
-	}{
-		{
-			initial: 0,
-			final:   0,
-			ret:     false,
-		},
-		{
-			initial: 1,
-			final:   2,
-			ret:     true,
-		},
-		{
-			initial: 2,
-			final:   3,
-			ret:     true,
-		},
-	} {
-		val := test.initial
-		if got, want := IncUnlessZeroInt32(&val), test.ret; got != want {
-			t.Errorf("For initial value of %d: incorrect return value: got %v, wanted %v", test.initial, got, want)
-		}
-		if got, want := val, test.final; got != want {
-			t.Errorf("For initial value of %d: incorrect final value: got %d, wanted %d", test.initial, got, want)
-		}
-	}
-}
-
-func TestDecUnlessOneInt32(t *testing.T) {
-	for _, test := range []struct {
-		initial int32
-		final   int32
-		ret     bool
-	}{
-		{
-			initial: 0,
-			final:   -1,
-			ret:     true,
-		},
-		{
-			initial: 1,
-			final:   1,
-			ret:     false,
-		},
-		{
-			initial: 2,
-			final:   1,
-			ret:     true,
-		},
-	} {
-		val := test.initial
-		if got, want := DecUnlessOneInt32(&val), test.ret; got != want {
-			t.Errorf("For initial value of %d: incorrect return value: got %v, wanted %v", test.initial, got, want)
-		}
-		if got, want := val, test.final; got != want {
-			t.Errorf("For initial value of %d: incorrect final value: got %d, wanted %d", test.initial, got, want)
-		}
-	}
-}
diff --git a/pkg/atomicbitops/atomicbitops.go b/pkg/atomicbitops/atomicbitops.go
new file mode 100644
index 000000000..1be081719
--- /dev/null
+++ b/pkg/atomicbitops/atomicbitops.go
@@ -0,0 +1,47 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64 arm64
+
+// Package atomicbitops provides extensions to the sync/atomic package.
+//
+// All read-modify-write operations implemented by this package have
+// acquire-release memory ordering (like sync/atomic).
+package atomicbitops
+
+// AndUint32 atomically applies bitwise AND operation to *addr with val.
+func AndUint32(addr *uint32, val uint32)
+
+// OrUint32 atomically applies bitwise OR operation to *addr with val.
+func OrUint32(addr *uint32, val uint32)
+
+// XorUint32 atomically applies bitwise XOR operation to *addr with val.
+func XorUint32(addr *uint32, val uint32)
+
+// CompareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
+// the value previously stored at addr.
+func CompareAndSwapUint32(addr *uint32, old, new uint32) uint32
+
+// AndUint64 atomically applies bitwise AND operation to *addr with val.
+func AndUint64(addr *uint64, val uint64)
+
+// OrUint64 atomically applies bitwise OR operation to *addr with val.
+func OrUint64(addr *uint64, val uint64)
+
+// XorUint64 atomically applies bitwise XOR operation to *addr with val.
+func XorUint64(addr *uint64, val uint64)
+
+// CompareAndSwapUint64 is like sync/atomic.CompareAndSwapUint64, but returns
+// the value previously stored at addr.
+func CompareAndSwapUint64(addr *uint64, old, new uint64) uint64
diff --git a/pkg/atomicbitops/atomicbitops_amd64.s b/pkg/atomicbitops/atomicbitops_amd64.s
new file mode 100644
index 000000000..54c887ee5
--- /dev/null
+++ b/pkg/atomicbitops/atomicbitops_amd64.s
@@ -0,0 +1,77 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+#include "textflag.h"
+
+TEXT ·AndUint32(SB),$0-12
+  MOVQ  addr+0(FP), BP
+  MOVL  val+8(FP), AX
+  LOCK
+  ANDL   AX, 0(BP)
+  RET
+
+TEXT ·OrUint32(SB),$0-12
+  MOVQ  addr+0(FP), BP
+  MOVL  val+8(FP), AX
+  LOCK
+  ORL   AX, 0(BP)
+  RET
+
+TEXT ·XorUint32(SB),$0-12
+  MOVQ  addr+0(FP), BP
+  MOVL  val+8(FP), AX
+  LOCK
+  XORL   AX, 0(BP)
+  RET
+
+TEXT ·CompareAndSwapUint32(SB),$0-20
+  MOVQ  addr+0(FP), DI
+  MOVL  old+8(FP), AX
+  MOVL  new+12(FP), DX
+  LOCK
+  CMPXCHGL DX, 0(DI)
+  MOVL  AX, ret+16(FP)
+  RET
+
+TEXT ·AndUint64(SB),$0-16
+  MOVQ  addr+0(FP), BP
+  MOVQ  val+8(FP), AX
+  LOCK
+  ANDQ   AX, 0(BP)
+  RET
+
+TEXT ·OrUint64(SB),$0-16
+  MOVQ  addr+0(FP), BP
+  MOVQ  val+8(FP), AX
+  LOCK
+  ORQ   AX, 0(BP)
+  RET
+
+TEXT ·XorUint64(SB),$0-16
+  MOVQ  addr+0(FP), BP
+  MOVQ  val+8(FP), AX
+  LOCK
+  XORQ   AX, 0(BP)
+  RET
+
+TEXT ·CompareAndSwapUint64(SB),$0-32
+  MOVQ  addr+0(FP), DI
+  MOVQ  old+8(FP), AX
+  MOVQ  new+16(FP), DX
+  LOCK
+  CMPXCHGQ DX, 0(DI)
+  MOVQ  AX, ret+24(FP)
+  RET
diff --git a/pkg/atomicbitops/atomicbitops_arm64.s b/pkg/atomicbitops/atomicbitops_arm64.s
new file mode 100644
index 000000000..5c780851b
--- /dev/null
+++ b/pkg/atomicbitops/atomicbitops_arm64.s
@@ -0,0 +1,105 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+#include "textflag.h"
+
+TEXT ·AndUint32(SB),$0-12
+  MOVD    ptr+0(FP), R0
+  MOVW    val+8(FP), R1
+again:
+  LDAXRW  (R0), R2
+  ANDW    R1, R2
+  STLXRW  R2, (R0), R3
+  CBNZ    R3, again
+  RET
+
+TEXT ·OrUint32(SB),$0-12
+  MOVD    ptr+0(FP), R0
+  MOVW    val+8(FP), R1
+again:
+  LDAXRW  (R0), R2
+  ORRW    R1, R2
+  STLXRW  R2, (R0), R3
+  CBNZ    R3, again
+  RET
+
+TEXT ·XorUint32(SB),$0-12
+  MOVD    ptr+0(FP), R0
+  MOVW    val+8(FP), R1
+again:
+  LDAXRW  (R0), R2
+  EORW    R1, R2
+  STLXRW  R2, (R0), R3
+  CBNZ    R3, again
+  RET
+
+TEXT ·CompareAndSwapUint32(SB),$0-20
+  MOVD addr+0(FP), R0
+  MOVW old+8(FP), R1
+  MOVW new+12(FP), R2
+again:
+  LDAXRW (R0), R3
+  CMPW R1, R3
+  BNE done
+  STLXRW R2, (R0), R4
+  CBNZ R4, again
+done:
+  MOVW R3, prev+16(FP)
+  RET
+
+TEXT ·AndUint64(SB),$0-16
+  MOVD    ptr+0(FP), R0
+  MOVD    val+8(FP), R1
+again:
+  LDAXR   (R0), R2
+  AND     R1, R2
+  STLXR   R2, (R0), R3
+  CBNZ    R3, again
+  RET
+
+TEXT ·OrUint64(SB),$0-16
+  MOVD    ptr+0(FP), R0
+  MOVD    val+8(FP), R1
+again:
+  LDAXR   (R0), R2
+  ORR     R1, R2
+  STLXR   R2, (R0), R3
+  CBNZ    R3, again
+  RET
+
+TEXT ·XorUint64(SB),$0-16
+  MOVD    ptr+0(FP), R0
+  MOVD    val+8(FP), R1
+again:
+  LDAXR   (R0), R2
+  EOR     R1, R2
+  STLXR   R2, (R0), R3
+  CBNZ    R3, again
+  RET
+
+TEXT ·CompareAndSwapUint64(SB),$0-32
+  MOVD addr+0(FP), R0
+  MOVD old+8(FP), R1
+  MOVD new+16(FP), R2
+again:
+  LDAXR (R0), R3
+  CMP R1, R3
+  BNE done
+  STLXR R2, (R0), R4
+  CBNZ R4, again
+done:
+  MOVD R3, prev+24(FP)
+  RET
diff --git a/pkg/atomicbitops/atomicbitops_noasm.go b/pkg/atomicbitops/atomicbitops_noasm.go
new file mode 100644
index 000000000..3b2898256
--- /dev/null
+++ b/pkg/atomicbitops/atomicbitops_noasm.go
@@ -0,0 +1,105 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !amd64,!arm64
+
+package atomicbitops
+
+import (
+	"sync/atomic"
+)
+
+func AndUint32(addr *uint32, val uint32) {
+	for {
+		o := atomic.LoadUint32(addr)
+		n := o & val
+		if atomic.CompareAndSwapUint32(addr, o, n) {
+			break
+		}
+	}
+}
+
+func OrUint32(addr *uint32, val uint32) {
+	for {
+		o := atomic.LoadUint32(addr)
+		n := o | val
+		if atomic.CompareAndSwapUint32(addr, o, n) {
+			break
+		}
+	}
+}
+
+func XorUint32(addr *uint32, val uint32) {
+	for {
+		o := atomic.LoadUint32(addr)
+		n := o ^ val
+		if atomic.CompareAndSwapUint32(addr, o, n) {
+			break
+		}
+	}
+}
+
+func CompareAndSwapUint32(addr *uint32, old, new uint32) (prev uint32) {
+	for {
+		prev = atomic.LoadUint32(addr)
+		if prev != old {
+			return
+		}
+		if atomic.CompareAndSwapUint32(addr, old, new) {
+			return
+		}
+	}
+}
+
+func AndUint64(addr *uint64, val uint64) {
+	for {
+		o := atomic.LoadUint64(addr)
+		n := o & val
+		if atomic.CompareAndSwapUint64(addr, o, n) {
+			break
+		}
+	}
+}
+
+func OrUint64(addr *uint64, val uint64) {
+	for {
+		o := atomic.LoadUint64(addr)
+		n := o | val
+		if atomic.CompareAndSwapUint64(addr, o, n) {
+			break
+		}
+	}
+}
+
+func XorUint64(addr *uint64, val uint64) {
+	for {
+		o := atomic.LoadUint64(addr)
+		n := o ^ val
+		if atomic.CompareAndSwapUint64(addr, o, n) {
+			break
+		}
+	}
+}
+
+func CompareAndSwapUint64(addr *uint64, old, new uint64) (prev uint64) {
+	for {
+		prev = atomic.LoadUint64(addr)
+		if prev != old {
+			return
+		}
+		if atomic.CompareAndSwapUint64(addr, old, new) {
+			return
+		}
+	}
+}
diff --git a/pkg/atomicbitops/atomicbitops_test.go b/pkg/atomicbitops/atomicbitops_test.go
new file mode 100644
index 000000000..73af71bb4
--- /dev/null
+++ b/pkg/atomicbitops/atomicbitops_test.go
@@ -0,0 +1,198 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package atomicbitops
+
+import (
+	"runtime"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+const iterations = 100
+
+func detectRaces32(val, target uint32, fn func(*uint32, uint32)) bool {
+	runtime.GOMAXPROCS(100)
+	for n := 0; n < iterations; n++ {
+		x := val
+		var wg sync.WaitGroup
+		for i := uint32(0); i < 32; i++ {
+			wg.Add(1)
+			go func(a *uint32, i uint32) {
+				defer wg.Done()
+				fn(a, uint32(1<<i))
+			}(&x, i)
+		}
+		wg.Wait()
+		if x != target {
+			return true
+		}
+	}
+	return false
+}
+
+func detectRaces64(val, target uint64, fn func(*uint64, uint64)) bool {
+	runtime.GOMAXPROCS(100)
+	for n := 0; n < iterations; n++ {
+		x := val
+		var wg sync.WaitGroup
+		for i := uint64(0); i < 64; i++ {
+			wg.Add(1)
+			go func(a *uint64, i uint64) {
+				defer wg.Done()
+				fn(a, uint64(1<<i))
+			}(&x, i)
+		}
+		wg.Wait()
+		if x != target {
+			return true
+		}
+	}
+	return false
+}
+
+func TestOrUint32(t *testing.T) {
+	if detectRaces32(0x0, 0xffffffff, OrUint32) {
+		t.Error("Data race detected!")
+	}
+}
+
+func TestAndUint32(t *testing.T) {
+	if detectRaces32(0xf0f0f0f0, 0x00000000, AndUint32) {
+		t.Error("Data race detected!")
+	}
+}
+
+func TestXorUint32(t *testing.T) {
+	if detectRaces32(0xf0f0f0f0, 0x0f0f0f0f, XorUint32) {
+		t.Error("Data race detected!")
+	}
+}
+
+func TestOrUint64(t *testing.T) {
+	if detectRaces64(0x0, 0xffffffffffffffff, OrUint64) {
+		t.Error("Data race detected!")
+	}
+}
+
+func TestAndUint64(t *testing.T) {
+	if detectRaces64(0xf0f0f0f0f0f0f0f0, 0x0, AndUint64) {
+		t.Error("Data race detected!")
+	}
+}
+
+func TestXorUint64(t *testing.T) {
+	if detectRaces64(0xf0f0f0f0f0f0f0f0, 0x0f0f0f0f0f0f0f0f, XorUint64) {
+		t.Error("Data race detected!")
+	}
+}
+
+func TestCompareAndSwapUint32(t *testing.T) {
+	tests := []struct {
+		name string
+		prev uint32
+		old  uint32
+		new  uint32
+		next uint32
+	}{
+		{
+			name: "Successful compare-and-swap with prev == new",
+			prev: 10,
+			old:  10,
+			new:  10,
+			next: 10,
+		},
+		{
+			name: "Successful compare-and-swap with prev != new",
+			prev: 20,
+			old:  20,
+			new:  22,
+			next: 22,
+		},
+		{
+			name: "Failed compare-and-swap with prev == new",
+			prev: 31,
+			old:  30,
+			new:  31,
+			next: 31,
+		},
+		{
+			name: "Failed compare-and-swap with prev != new",
+			prev: 41,
+			old:  40,
+			new:  42,
+			next: 41,
+		},
+	}
+	for _, test := range tests {
+		val := test.prev
+		prev := CompareAndSwapUint32(&val, test.old, test.new)
+		if got, want := prev, test.prev; got != want {
+			t.Errorf("%s: incorrect returned previous value: got %d, expected %d", test.name, got, want)
+		}
+		if got, want := val, test.next; got != want {
+			t.Errorf("%s: incorrect value stored in val: got %d, expected %d", test.name, got, want)
+		}
+	}
+}
+
+func TestCompareAndSwapUint64(t *testing.T) {
+	tests := []struct {
+		name string
+		prev uint64
+		old  uint64
+		new  uint64
+		next uint64
+	}{
+		{
+			name: "Successful compare-and-swap with prev == new",
+			prev: 0x100000000,
+			old:  0x100000000,
+			new:  0x100000000,
+			next: 0x100000000,
+		},
+		{
+			name: "Successful compare-and-swap with prev != new",
+			prev: 0x200000000,
+			old:  0x200000000,
+			new:  0x200000002,
+			next: 0x200000002,
+		},
+		{
+			name: "Failed compare-and-swap with prev == new",
+			prev: 0x300000001,
+			old:  0x300000000,
+			new:  0x300000001,
+			next: 0x300000001,
+		},
+		{
+			name: "Failed compare-and-swap with prev != new",
+			prev: 0x400000001,
+			old:  0x400000000,
+			new:  0x400000002,
+			next: 0x400000001,
+		},
+	}
+	for _, test := range tests {
+		val := test.prev
+		prev := CompareAndSwapUint64(&val, test.old, test.new)
+		if got, want := prev, test.prev; got != want {
+			t.Errorf("%s: incorrect returned previous value: got %d, expected %d", test.name, got, want)
+		}
+		if got, want := val, test.next; got != want {
+			t.Errorf("%s: incorrect value stored in val: got %d, expected %d", test.name, got, want)
+		}
+	}
+}
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index e58a63deb..94d39af60 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -18,7 +18,6 @@ import (
 	"fmt"
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/atomicbitops"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -42,8 +41,15 @@ func (mm *MemoryManager) AddressSpace() platform.AddressSpace {
 func (mm *MemoryManager) Activate() error {
 	// Fast path: the MemoryManager already has an active
 	// platform.AddressSpace, and we just need to indicate that we need it too.
-	if atomicbitops.IncUnlessZeroInt32(&mm.active) {
-		return nil
+	for {
+		active := atomic.LoadInt32(&mm.active)
+		if active == 0 {
+			// Fall back to the slow path.
+			break
+		}
+		if atomic.CompareAndSwapInt32(&mm.active, active, active+1) {
+			return nil
+		}
 	}
 
 	for {
@@ -118,8 +124,15 @@ func (mm *MemoryManager) Activate() error {
 func (mm *MemoryManager) Deactivate() {
 	// Fast path: this is not the last goroutine to deactivate the
 	// MemoryManager.
-	if atomicbitops.DecUnlessOneInt32(&mm.active) {
-		return
+	for {
+		active := atomic.LoadInt32(&mm.active)
+		if active == 1 {
+			// Fall back to the slow path.
+			break
+		}
+		if atomic.CompareAndSwapInt32(&mm.active, active, active-1) {
+			return
+		}
 	}
 
 	mm.activeMu.Lock()
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index 47b8fbf43..3c263ebaa 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -18,7 +18,6 @@ import (
 	"fmt"
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/atomicbitops"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
@@ -229,7 +228,15 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
 // IncUsers increments mm's user count and returns true. If the user count is
 // already 0, IncUsers does nothing and returns false.
 func (mm *MemoryManager) IncUsers() bool {
-	return atomicbitops.IncUnlessZeroInt32(&mm.users)
+	for {
+		users := atomic.LoadInt32(&mm.users)
+		if users == 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt32(&mm.users, users, users+1) {
+			return true
+		}
+	}
 }
 
 // DecUsers decrements mm's user count. If the user count reaches 0, all
-- 
cgit v1.2.3


From a3582de6186edcc88e022af2b9f9c1cef90e44ed Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 13:49:17 -0800
Subject: cpuid: cache the maximum size of xsave state

perf shows that ExtendedStateSize cosumes more than 20% of cpu:

    23.61%    23.61%  [.] pkg/cpuid/cpuid.HostID

PiperOrigin-RevId: 295813263
---
 pkg/cpuid/cpuid_x86.go | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/pkg/cpuid/cpuid_x86.go b/pkg/cpuid/cpuid_x86.go
index 333ca0a04..a0bc55ea1 100644
--- a/pkg/cpuid/cpuid_x86.go
+++ b/pkg/cpuid/cpuid_x86.go
@@ -725,6 +725,18 @@ func vendorIDFromRegs(bx, cx, dx uint32) string {
 	return string(bytes)
 }
 
+var maxXsaveSize = func() uint32 {
+	// Leaf 0 of xsaveinfo function returns the size for currently
+	// enabled xsave features in ebx, the maximum size if all valid
+	// features are saved with xsave in ecx, and valid XCR0 bits in
+	// edx:eax.
+	//
+	// If xSaveInfo isn't supported, cpuid will not fault but will
+	// return bogus values.
+	_, _, maxXsaveSize, _ := HostID(uint32(xSaveInfo), 0)
+	return maxXsaveSize
+}()
+
 // ExtendedStateSize returns the number of bytes needed to save the "extended
 // state" for this processor and the boundary it must be aligned to. Extended
 // state includes floating point registers, and other cpu state that's not
@@ -736,12 +748,7 @@ func vendorIDFromRegs(bx, cx, dx uint32) string {
 // about 2.5K worst case, with avx512).
 func (fs *FeatureSet) ExtendedStateSize() (size, align uint) {
 	if fs.UseXsave() {
-		// Leaf 0 of xsaveinfo function returns the size for currently
-		// enabled xsave features in ebx, the maximum size if all valid
-		// features are saved with xsave in ecx, and valid XCR0 bits in
-		// edx:eax.
-		_, _, maxSize, _ := HostID(uint32(xSaveInfo), 0)
-		return uint(maxSize), 64
+		return uint(maxXsaveSize), 64
 	}
 
 	// If we don't support xsave, we fall back to fxsave, which requires
-- 
cgit v1.2.3


From 737a3d072ef6e3edf5099505e41deed49f9e5b5c Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 15:08:11 -0800
Subject: go-marshal: Stop complaining about files with no +marshal types.

Since we tag entire packages as marshallable, due to conditional
compiling for different architectures we can end up with sets of
source files that don't contain any marshallable types. It's safe to
silently ignore this scenario.

PiperOrigin-RevId: 295831871
---
 tools/go_marshal/gomarshal/generator.go | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 0294ba5ba..d3c2f72f5 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -338,17 +338,6 @@ func (g *Generator) Run() error {
 		}
 	}
 
-	// Tool was invoked with input files with no data structures marked for code
-	// generation. This is probably not what the user intended.
-	if len(impls) == 0 {
-		var buf bytes.Buffer
-		fmt.Fprintf(&buf, "go_marshal invoked on these files, but they don't contain any types requiring code generation. Perhaps mark some with \"// +marshal\"?:\n")
-		for _, i := range g.inputs {
-			fmt.Fprintf(&buf, "  %s\n", i)
-		}
-		abort(buf.String())
-	}
-
 	// Write output file header. These include things like package name and
 	// import statements.
 	if err := g.writeHeader(); err != nil {
-- 
cgit v1.2.3


From 55c553ae8c7937be4a7e10e0c7a727d132317e89 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 15:17:45 -0800
Subject: Add //pkg/syncevent.

Package syncevent is intended to subsume ~all uses of channels in the sentry
(including //pkg/waiter), as well as //pkg/sleep.

Compared to channels:

- Delivery of events to a syncevent.Receiver allows *synchronous* execution of
  an arbitrary callback, whereas delivery of events to a channel requires a
  goroutine to receive from that channel, resulting in substantial scheduling
  overhead. (This is also part of the motivation for the waiter package.)

- syncevent.Waiter can wait on multiple event sources without the high O(N)
  overhead of select. (This is the same motivation as for the sleep package.)

Compared to the waiter package:

- syncevent.Waiters are intended to be persistent (i.e. per-kernel.Task), and
  syncevent.Broadcaster (analogous to waiter.Queue) is a hash table rather than
  a linked list, such that blocking is (usually) allocation-free.

- syncevent.Source (analogous to waiter.Waitable) does not include an equivalent
  to waiter.Waitable.Readiness(), since this is inappropriate for transient
  events (see e.g. //pkg/sentry/kernel/time.ClockEventSource).

Compared to the sleep package:

- syncevent events are represented by bits in a bitmask rather than discrete
  sleep.Waker objects, reducing overhead and making it feasible to broadcast
  events to multiple syncevent.Receivers.

- syncevent.Receiver invokes an arbitrary callback, which is required by the
  sentry's epoll implementation. (syncevent.Waiter, which is analogous to
  sleep.Sleeper, pairs a syncevent.Receiver with a callback that wakes a
  waiting goroutine; the implementation of this aspect is nearly identical to
  that of sleep.Sleeper, except that it represents *runtime.g as unsafe.Pointer
  rather than uintptr.)

- syncevent.Waiter.Wait (analogous to sleep.Sleeper.Fetch(block=true)) does not
  automatically un-assert returned events. This is useful in cases where the
  path for handling an event is not the same as the path that observes it, such
  as for application signals (a la Linux's TIF_SIGPENDING).

- Unlike sleep.Sleeper, which Fetches Wakers in the order that they were
  Asserted, the event bitmasks used by syncevent.Receiver have no way of
  preserving event arrival order. (This is similar to select, which goes out of
  its way to randomize event ordering.)

The disadvantage of the syncevent package is that, since events are represented
by bits in a uint64 bitmask, each syncevent.Receiver can "only" multiplex
between 64 distinct events; this does not affect any known use case.

Benchmarks:

BenchmarkBroadcasterSubscribeUnsubscribe
BenchmarkBroadcasterSubscribeUnsubscribe-12         	45133884	        26.3 ns/op
BenchmarkMapSubscribeUnsubscribe
BenchmarkMapSubscribeUnsubscribe-12                 	28504662	        41.8 ns/op
BenchmarkQueueSubscribeUnsubscribe
BenchmarkQueueSubscribeUnsubscribe-12               	22747668	        45.6 ns/op
BenchmarkBroadcasterSubscribeUnsubscribeBatch
BenchmarkBroadcasterSubscribeUnsubscribeBatch-12    	31609177	        37.8 ns/op
BenchmarkMapSubscribeUnsubscribeBatch
BenchmarkMapSubscribeUnsubscribeBatch-12            	17563906	        62.1 ns/op
BenchmarkQueueSubscribeUnsubscribeBatch
BenchmarkQueueSubscribeUnsubscribeBatch-12          	26248838	        46.6 ns/op
BenchmarkBroadcasterBroadcastRedundant
BenchmarkBroadcasterBroadcastRedundant/0
BenchmarkBroadcasterBroadcastRedundant/0-12         	100907563	        11.8 ns/op
BenchmarkBroadcasterBroadcastRedundant/1
BenchmarkBroadcasterBroadcastRedundant/1-12         	85103068	        13.3 ns/op
BenchmarkBroadcasterBroadcastRedundant/4
BenchmarkBroadcasterBroadcastRedundant/4-12         	52716502	        22.3 ns/op
BenchmarkBroadcasterBroadcastRedundant/16
BenchmarkBroadcasterBroadcastRedundant/16-12        	20278165	        58.7 ns/op
BenchmarkBroadcasterBroadcastRedundant/64
BenchmarkBroadcasterBroadcastRedundant/64-12        	 5905428	       205 ns/op
BenchmarkMapBroadcastRedundant
BenchmarkMapBroadcastRedundant/0
BenchmarkMapBroadcastRedundant/0-12                 	87532734	        13.5 ns/op
BenchmarkMapBroadcastRedundant/1
BenchmarkMapBroadcastRedundant/1-12                 	28488411	        36.3 ns/op
BenchmarkMapBroadcastRedundant/4
BenchmarkMapBroadcastRedundant/4-12                 	19628920	        60.9 ns/op
BenchmarkMapBroadcastRedundant/16
BenchmarkMapBroadcastRedundant/16-12                	 6026980	       192 ns/op
BenchmarkMapBroadcastRedundant/64
BenchmarkMapBroadcastRedundant/64-12                	 1640858	       754 ns/op
BenchmarkQueueBroadcastRedundant
BenchmarkQueueBroadcastRedundant/0
BenchmarkQueueBroadcastRedundant/0-12               	96904807	        12.0 ns/op
BenchmarkQueueBroadcastRedundant/1
BenchmarkQueueBroadcastRedundant/1-12               	73521873	        16.3 ns/op
BenchmarkQueueBroadcastRedundant/4
BenchmarkQueueBroadcastRedundant/4-12               	39209468	        31.2 ns/op
BenchmarkQueueBroadcastRedundant/16
BenchmarkQueueBroadcastRedundant/16-12              	10810058	       105 ns/op
BenchmarkQueueBroadcastRedundant/64
BenchmarkQueueBroadcastRedundant/64-12              	 2998046	       376 ns/op
BenchmarkBroadcasterBroadcastAck
BenchmarkBroadcasterBroadcastAck/1
BenchmarkBroadcasterBroadcastAck/1-12               	44472397	        26.4 ns/op
BenchmarkBroadcasterBroadcastAck/4
BenchmarkBroadcasterBroadcastAck/4-12               	17653509	        69.7 ns/op
BenchmarkBroadcasterBroadcastAck/16
BenchmarkBroadcasterBroadcastAck/16-12              	 4082617	       260 ns/op
BenchmarkBroadcasterBroadcastAck/64
BenchmarkBroadcasterBroadcastAck/64-12              	 1220534	      1027 ns/op
BenchmarkMapBroadcastAck
BenchmarkMapBroadcastAck/1
BenchmarkMapBroadcastAck/1-12                       	26760705	        44.2 ns/op
BenchmarkMapBroadcastAck/4
BenchmarkMapBroadcastAck/4-12                       	11495636	       100 ns/op
BenchmarkMapBroadcastAck/16
BenchmarkMapBroadcastAck/16-12                      	 2937590	       343 ns/op
BenchmarkMapBroadcastAck/64
BenchmarkMapBroadcastAck/64-12                      	  861037	      1344 ns/op
BenchmarkQueueBroadcastAck
BenchmarkQueueBroadcastAck/1
BenchmarkQueueBroadcastAck/1-12                     	19832679	        55.0 ns/op
BenchmarkQueueBroadcastAck/4
BenchmarkQueueBroadcastAck/4-12                     	 5618214	       189 ns/op
BenchmarkQueueBroadcastAck/16
BenchmarkQueueBroadcastAck/16-12                    	 1569980	       713 ns/op
BenchmarkQueueBroadcastAck/64
BenchmarkQueueBroadcastAck/64-12                    	  437672	      2814 ns/op
BenchmarkWaiterNotifyRedundant
BenchmarkWaiterNotifyRedundant-12                   	650823090	         1.96 ns/op
BenchmarkSleeperNotifyRedundant
BenchmarkSleeperNotifyRedundant-12                  	619871544	         1.61 ns/op
BenchmarkChannelNotifyRedundant
BenchmarkChannelNotifyRedundant-12                  	298903778	         3.67 ns/op
BenchmarkWaiterNotifyWaitAck
BenchmarkWaiterNotifyWaitAck-12                     	68358360	        17.8 ns/op
BenchmarkSleeperNotifyWaitAck
BenchmarkSleeperNotifyWaitAck-12                    	25044883	        41.2 ns/op
BenchmarkChannelNotifyWaitAck
BenchmarkChannelNotifyWaitAck-12                    	29572404	        40.2 ns/op
BenchmarkSleeperMultiNotifyWaitAck
BenchmarkSleeperMultiNotifyWaitAck-12               	16122969	        73.8 ns/op
BenchmarkWaiterTempNotifyWaitAck
BenchmarkWaiterTempNotifyWaitAck-12                 	46111489	        25.8 ns/op
BenchmarkSleeperTempNotifyWaitAck
BenchmarkSleeperTempNotifyWaitAck-12                	15541882	        73.6 ns/op
BenchmarkWaiterNotifyWaitMultiAck
BenchmarkWaiterNotifyWaitMultiAck-12                	65878500	        18.2 ns/op
BenchmarkSleeperNotifyWaitMultiAck
BenchmarkSleeperNotifyWaitMultiAck-12               	28798623	        41.5 ns/op
BenchmarkChannelNotifyWaitMultiAck
BenchmarkChannelNotifyWaitMultiAck-12               	11308468	       101 ns/op
BenchmarkWaiterNotifyAsyncWaitAck
BenchmarkWaiterNotifyAsyncWaitAck-12                	 2475387	       492 ns/op
BenchmarkSleeperNotifyAsyncWaitAck
BenchmarkSleeperNotifyAsyncWaitAck-12               	 2184507	       518 ns/op
BenchmarkChannelNotifyAsyncWaitAck
BenchmarkChannelNotifyAsyncWaitAck-12               	 2120365	       562 ns/op
BenchmarkWaiterNotifyAsyncWaitMultiAck
BenchmarkWaiterNotifyAsyncWaitMultiAck-12           	 2351247	       494 ns/op
BenchmarkSleeperNotifyAsyncWaitMultiAck
BenchmarkSleeperNotifyAsyncWaitMultiAck-12          	 2205799	       522 ns/op
BenchmarkChannelNotifyAsyncWaitMultiAck
BenchmarkChannelNotifyAsyncWaitMultiAck-12          	 1238079	       928 ns/op

Updates #1074

PiperOrigin-RevId: 295834087
---
 pkg/syncevent/BUILD                     |  39 +++
 pkg/syncevent/broadcaster.go            | 218 +++++++++++++++++
 pkg/syncevent/broadcaster_test.go       | 376 +++++++++++++++++++++++++++++
 pkg/syncevent/receiver.go               | 103 ++++++++
 pkg/syncevent/source.go                 |  59 +++++
 pkg/syncevent/syncevent.go              |  32 +++
 pkg/syncevent/syncevent_example_test.go | 108 +++++++++
 pkg/syncevent/waiter_amd64.s            |  32 +++
 pkg/syncevent/waiter_arm64.s            |  34 +++
 pkg/syncevent/waiter_asm_unsafe.go      |  24 ++
 pkg/syncevent/waiter_noasm_unsafe.go    |  39 +++
 pkg/syncevent/waiter_test.go            | 414 ++++++++++++++++++++++++++++++++
 pkg/syncevent/waiter_unsafe.go          | 206 ++++++++++++++++
 13 files changed, 1684 insertions(+)
 create mode 100644 pkg/syncevent/BUILD
 create mode 100644 pkg/syncevent/broadcaster.go
 create mode 100644 pkg/syncevent/broadcaster_test.go
 create mode 100644 pkg/syncevent/receiver.go
 create mode 100644 pkg/syncevent/source.go
 create mode 100644 pkg/syncevent/syncevent.go
 create mode 100644 pkg/syncevent/syncevent_example_test.go
 create mode 100644 pkg/syncevent/waiter_amd64.s
 create mode 100644 pkg/syncevent/waiter_arm64.s
 create mode 100644 pkg/syncevent/waiter_asm_unsafe.go
 create mode 100644 pkg/syncevent/waiter_noasm_unsafe.go
 create mode 100644 pkg/syncevent/waiter_test.go
 create mode 100644 pkg/syncevent/waiter_unsafe.go

diff --git a/pkg/syncevent/BUILD b/pkg/syncevent/BUILD
new file mode 100644
index 000000000..0500a22cf
--- /dev/null
+++ b/pkg/syncevent/BUILD
@@ -0,0 +1,39 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+licenses(["notice"])
+
+go_library(
+    name = "syncevent",
+    srcs = [
+        "broadcaster.go",
+        "receiver.go",
+        "source.go",
+        "syncevent.go",
+        "waiter_amd64.s",
+        "waiter_arm64.s",
+        "waiter_asm_unsafe.go",
+        "waiter_noasm_unsafe.go",
+        "waiter_unsafe.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/atomicbitops",
+        "//pkg/sync",
+    ],
+)
+
+go_test(
+    name = "syncevent_test",
+    size = "small",
+    srcs = [
+        "broadcaster_test.go",
+        "syncevent_example_test.go",
+        "waiter_test.go",
+    ],
+    library = ":syncevent",
+    deps = [
+        "//pkg/sleep",
+        "//pkg/sync",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/syncevent/broadcaster.go b/pkg/syncevent/broadcaster.go
new file mode 100644
index 000000000..4bff59e7d
--- /dev/null
+++ b/pkg/syncevent/broadcaster.go
@@ -0,0 +1,218 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// Broadcaster is an implementation of Source that supports any number of
+// subscribed Receivers.
+//
+// The zero value of Broadcaster is valid and has no subscribed Receivers.
+// Broadcaster is not copyable by value.
+//
+// All Broadcaster methods may be called concurrently from multiple goroutines.
+type Broadcaster struct {
+	// Broadcaster is implemented as a hash table where keys are assigned by
+	// the Broadcaster and returned as SubscriptionIDs, making it safe to use
+	// the identity function for hashing. The hash table resolves collisions
+	// using linear probing and features Robin Hood insertion and backward
+	// shift deletion in order to support a relatively high load factor
+	// efficiently, which matters since the cost of Broadcast is linear in the
+	// size of the table.
+
+	// mu protects the following fields.
+	mu sync.Mutex
+
+	// Invariants: len(table) is 0 or a power of 2.
+	table []broadcasterSlot
+
+	// load is the number of entries in table with receiver != nil.
+	load int
+
+	lastID SubscriptionID
+}
+
+type broadcasterSlot struct {
+	// Invariants: If receiver == nil, then filter == NoEvents and id == 0.
+	// Otherwise, id != 0.
+	receiver *Receiver
+	filter   Set
+	id       SubscriptionID
+}
+
+const (
+	broadcasterMinNonZeroTableSize = 2 // must be a power of 2 > 1
+
+	broadcasterMaxLoadNum = 13
+	broadcasterMaxLoadDen = 16
+)
+
+// SubscribeEvents implements Source.SubscribeEvents.
+func (b *Broadcaster) SubscribeEvents(r *Receiver, filter Set) SubscriptionID {
+	b.mu.Lock()
+
+	// Assign an ID for this subscription.
+	b.lastID++
+	id := b.lastID
+
+	// Expand the table if over the maximum load factor:
+	//
+	//          load / len(b.table) > broadcasterMaxLoadNum / broadcasterMaxLoadDen
+	// load * broadcasterMaxLoadDen > broadcasterMaxLoadNum * len(b.table)
+	b.load++
+	if (b.load * broadcasterMaxLoadDen) > (broadcasterMaxLoadNum * len(b.table)) {
+		// Double the number of slots in the new table.
+		newlen := broadcasterMinNonZeroTableSize
+		if len(b.table) != 0 {
+			newlen = 2 * len(b.table)
+		}
+		if newlen <= cap(b.table) {
+			// Reuse excess capacity in the current table, moving entries not
+			// already in their first-probed positions to better ones.
+			newtable := b.table[:newlen]
+			newmask := uint64(newlen - 1)
+			for i := range b.table {
+				if b.table[i].receiver != nil && uint64(b.table[i].id)&newmask != uint64(i) {
+					entry := b.table[i]
+					b.table[i] = broadcasterSlot{}
+					broadcasterTableInsert(newtable, entry.id, entry.receiver, entry.filter)
+				}
+			}
+			b.table = newtable
+		} else {
+			newtable := make([]broadcasterSlot, newlen)
+			// Copy existing entries to the new table.
+			for i := range b.table {
+				if b.table[i].receiver != nil {
+					broadcasterTableInsert(newtable, b.table[i].id, b.table[i].receiver, b.table[i].filter)
+				}
+			}
+			// Switch to the new table.
+			b.table = newtable
+		}
+	}
+
+	broadcasterTableInsert(b.table, id, r, filter)
+	b.mu.Unlock()
+	return id
+}
+
+// Preconditions: table must not be full. len(table) is a power of 2.
+func broadcasterTableInsert(table []broadcasterSlot, id SubscriptionID, r *Receiver, filter Set) {
+	entry := broadcasterSlot{
+		receiver: r,
+		filter:   filter,
+		id:       id,
+	}
+	mask := uint64(len(table) - 1)
+	i := uint64(id) & mask
+	disp := uint64(0)
+	for {
+		if table[i].receiver == nil {
+			table[i] = entry
+			return
+		}
+		// If we've been displaced farther from our first-probed slot than the
+		// element stored in this one, swap elements and switch to inserting
+		// the replaced one. (This is Robin Hood insertion.)
+		slotDisp := (i - uint64(table[i].id)) & mask
+		if disp > slotDisp {
+			table[i], entry = entry, table[i]
+			disp = slotDisp
+		}
+		i = (i + 1) & mask
+		disp++
+	}
+}
+
+// UnsubscribeEvents implements Source.UnsubscribeEvents.
+func (b *Broadcaster) UnsubscribeEvents(id SubscriptionID) {
+	b.mu.Lock()
+
+	mask := uint64(len(b.table) - 1)
+	i := uint64(id) & mask
+	for {
+		if b.table[i].id == id {
+			// Found the element to remove. Move all subsequent elements
+			// backward until we either find an empty slot, or an element that
+			// is already in its first-probed slot. (This is backward shift
+			// deletion.)
+			for {
+				next := (i + 1) & mask
+				if b.table[next].receiver == nil {
+					break
+				}
+				if uint64(b.table[next].id)&mask == next {
+					break
+				}
+				b.table[i] = b.table[next]
+				i = next
+			}
+			b.table[i] = broadcasterSlot{}
+			break
+		}
+		i = (i + 1) & mask
+	}
+
+	// If a table 1/4 of the current size would still be at or under the
+	// maximum load factor (i.e. the current table size is at least two
+	// expansions bigger than necessary), halve the size of the table to reduce
+	// the cost of Broadcast. Since we are concerned with iteration time and
+	// not memory usage, reuse the existing slice to reduce future allocations
+	// from table re-expansion.
+	b.load--
+	if len(b.table) > broadcasterMinNonZeroTableSize && (b.load*(4*broadcasterMaxLoadDen)) <= (broadcasterMaxLoadNum*len(b.table)) {
+		newlen := len(b.table) / 2
+		newtable := b.table[:newlen]
+		for i := newlen; i < len(b.table); i++ {
+			if b.table[i].receiver != nil {
+				broadcasterTableInsert(newtable, b.table[i].id, b.table[i].receiver, b.table[i].filter)
+				b.table[i] = broadcasterSlot{}
+			}
+		}
+		b.table = newtable
+	}
+
+	b.mu.Unlock()
+}
+
+// Broadcast notifies all Receivers subscribed to the Broadcaster of the subset
+// of events to which they subscribed. The order in which Receivers are
+// notified is unspecified.
+func (b *Broadcaster) Broadcast(events Set) {
+	b.mu.Lock()
+	for i := range b.table {
+		if intersection := events & b.table[i].filter; intersection != 0 {
+			// We don't need to check if broadcasterSlot.receiver is nil, since
+			// if it is then broadcasterSlot.filter is 0.
+			b.table[i].receiver.Notify(intersection)
+		}
+	}
+	b.mu.Unlock()
+}
+
+// FilteredEvents returns the set of events for which Broadcast will notify at
+// least one Receiver, i.e. the union of filters for all subscribed Receivers.
+func (b *Broadcaster) FilteredEvents() Set {
+	var es Set
+	b.mu.Lock()
+	for i := range b.table {
+		es |= b.table[i].filter
+	}
+	b.mu.Unlock()
+	return es
+}
diff --git a/pkg/syncevent/broadcaster_test.go b/pkg/syncevent/broadcaster_test.go
new file mode 100644
index 000000000..e88779e23
--- /dev/null
+++ b/pkg/syncevent/broadcaster_test.go
@@ -0,0 +1,376 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"fmt"
+	"math/rand"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+func TestBroadcasterFilter(t *testing.T) {
+	const numReceivers = 2 * MaxEvents
+
+	var br Broadcaster
+	ws := make([]Waiter, numReceivers)
+	for i := range ws {
+		ws[i].Init()
+		br.SubscribeEvents(ws[i].Receiver(), 1<<(i%MaxEvents))
+	}
+	for ev := 0; ev < MaxEvents; ev++ {
+		br.Broadcast(1 << ev)
+		for i := range ws {
+			want := NoEvents
+			if i%MaxEvents == ev {
+				want = 1 << ev
+			}
+			if got := ws[i].Receiver().PendingAndAckAll(); got != want {
+				t.Errorf("after Broadcast of event %d: waiter %d has pending event set %#x, wanted %#x", ev, i, got, want)
+			}
+		}
+	}
+}
+
+// TestBroadcasterManySubscriptions tests that subscriptions are not lost by
+// table expansion/compaction.
+func TestBroadcasterManySubscriptions(t *testing.T) {
+	const numReceivers = 5000 // arbitrary
+
+	var br Broadcaster
+	ws := make([]Waiter, numReceivers)
+	for i := range ws {
+		ws[i].Init()
+	}
+
+	ids := make([]SubscriptionID, numReceivers)
+	for i := 0; i < numReceivers; i++ {
+		// Subscribe receiver i.
+		ids[i] = br.SubscribeEvents(ws[i].Receiver(), 1)
+		// Check that receivers [0, i] are subscribed.
+		br.Broadcast(1)
+		for j := 0; j <= i; j++ {
+			if ws[j].Pending() != 1 {
+				t.Errorf("receiver %d did not receive an event after subscription of receiver %d", j, i)
+			}
+			ws[j].Ack(1)
+		}
+	}
+
+	// Generate a random order for unsubscriptions.
+	unsub := rand.Perm(numReceivers)
+	for i := 0; i < numReceivers; i++ {
+		// Unsubscribe receiver unsub[i].
+		br.UnsubscribeEvents(ids[unsub[i]])
+		// Check that receivers [unsub[0], unsub[i]] are not subscribed, and that
+		// receivers (unsub[i], unsub[numReceivers]) are still subscribed.
+		br.Broadcast(1)
+		for j := 0; j <= i; j++ {
+			if ws[unsub[j]].Pending() != 0 {
+				t.Errorf("unsub iteration %d: receiver %d received an event after unsubscription of receiver %d", i, unsub[j], unsub[i])
+			}
+		}
+		for j := i + 1; j < numReceivers; j++ {
+			if ws[unsub[j]].Pending() != 1 {
+				t.Errorf("unsub iteration %d: receiver %d did not receive an event after unsubscription of receiver %d", i, unsub[j], unsub[i])
+			}
+			ws[unsub[j]].Ack(1)
+		}
+	}
+}
+
+var (
+	receiverCountsNonZero       = []int{1, 4, 16, 64}
+	receiverCountsIncludingZero = append([]int{0}, receiverCountsNonZero...)
+)
+
+// BenchmarkBroadcasterX, BenchmarkMapX, and BenchmarkQueueX benchmark usage
+// pattern X (described in terms of Broadcaster) with Broadcaster, a
+// Mutex-protected map[*Receiver]Set, and waiter.Queue respectively.
+
+// BenchmarkXxxSubscribeUnsubscribe measures the cost of a Subscribe/Unsubscribe
+// cycle.
+
+func BenchmarkBroadcasterSubscribeUnsubscribe(b *testing.B) {
+	var br Broadcaster
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		id := br.SubscribeEvents(w.Receiver(), 1)
+		br.UnsubscribeEvents(id)
+	}
+}
+
+func BenchmarkMapSubscribeUnsubscribe(b *testing.B) {
+	var mu sync.Mutex
+	m := make(map[*Receiver]Set)
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		mu.Lock()
+		m[w.Receiver()] = Set(1)
+		mu.Unlock()
+		mu.Lock()
+		delete(m, w.Receiver())
+		mu.Unlock()
+	}
+}
+
+func BenchmarkQueueSubscribeUnsubscribe(b *testing.B) {
+	var q waiter.Queue
+	e, _ := waiter.NewChannelEntry(nil)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		q.EventRegister(&e, 1)
+		q.EventUnregister(&e)
+	}
+}
+
+// BenchmarkXxxSubscribeUnsubscribeBatch is similar to
+// BenchmarkXxxSubscribeUnsubscribe, but subscribes and unsubscribes a large
+// number of Receivers at a time in order to measure the amortized overhead of
+// table expansion/compaction. (Since waiter.Queue is implemented using a
+// linked list, BenchmarkQueueSubscribeUnsubscribe and
+// BenchmarkQueueSubscribeUnsubscribeBatch should produce nearly the same
+// result.)
+
+const numBatchReceivers = 1000
+
+func BenchmarkBroadcasterSubscribeUnsubscribeBatch(b *testing.B) {
+	var br Broadcaster
+	ws := make([]Waiter, numBatchReceivers)
+	for i := range ws {
+		ws[i].Init()
+	}
+	ids := make([]SubscriptionID, numBatchReceivers)
+
+	// Generate a random order for unsubscriptions.
+	unsub := rand.Perm(numBatchReceivers)
+
+	b.ResetTimer()
+	for i := 0; i < b.N/numBatchReceivers; i++ {
+		for j := 0; j < numBatchReceivers; j++ {
+			ids[j] = br.SubscribeEvents(ws[j].Receiver(), 1)
+		}
+		for j := 0; j < numBatchReceivers; j++ {
+			br.UnsubscribeEvents(ids[unsub[j]])
+		}
+	}
+}
+
+func BenchmarkMapSubscribeUnsubscribeBatch(b *testing.B) {
+	var mu sync.Mutex
+	m := make(map[*Receiver]Set)
+	ws := make([]Waiter, numBatchReceivers)
+	for i := range ws {
+		ws[i].Init()
+	}
+
+	// Generate a random order for unsubscriptions.
+	unsub := rand.Perm(numBatchReceivers)
+
+	b.ResetTimer()
+	for i := 0; i < b.N/numBatchReceivers; i++ {
+		for j := 0; j < numBatchReceivers; j++ {
+			mu.Lock()
+			m[ws[j].Receiver()] = Set(1)
+			mu.Unlock()
+		}
+		for j := 0; j < numBatchReceivers; j++ {
+			mu.Lock()
+			delete(m, ws[unsub[j]].Receiver())
+			mu.Unlock()
+		}
+	}
+}
+
+func BenchmarkQueueSubscribeUnsubscribeBatch(b *testing.B) {
+	var q waiter.Queue
+	es := make([]waiter.Entry, numBatchReceivers)
+	for i := range es {
+		es[i], _ = waiter.NewChannelEntry(nil)
+	}
+
+	// Generate a random order for unsubscriptions.
+	unsub := rand.Perm(numBatchReceivers)
+
+	b.ResetTimer()
+	for i := 0; i < b.N/numBatchReceivers; i++ {
+		for j := 0; j < numBatchReceivers; j++ {
+			q.EventRegister(&es[j], 1)
+		}
+		for j := 0; j < numBatchReceivers; j++ {
+			q.EventUnregister(&es[unsub[j]])
+		}
+	}
+}
+
+// BenchmarkXxxBroadcastRedundant measures how long it takes to Broadcast
+// already-pending events to multiple Receivers.
+
+func BenchmarkBroadcasterBroadcastRedundant(b *testing.B) {
+	for _, n := range receiverCountsIncludingZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var br Broadcaster
+			ws := make([]Waiter, n)
+			for i := range ws {
+				ws[i].Init()
+				br.SubscribeEvents(ws[i].Receiver(), 1)
+			}
+			br.Broadcast(1)
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				br.Broadcast(1)
+			}
+		})
+	}
+}
+
+func BenchmarkMapBroadcastRedundant(b *testing.B) {
+	for _, n := range receiverCountsIncludingZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var mu sync.Mutex
+			m := make(map[*Receiver]Set)
+			ws := make([]Waiter, n)
+			for i := range ws {
+				ws[i].Init()
+				m[ws[i].Receiver()] = Set(1)
+			}
+			mu.Lock()
+			for r := range m {
+				r.Notify(1)
+			}
+			mu.Unlock()
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				mu.Lock()
+				for r := range m {
+					r.Notify(1)
+				}
+				mu.Unlock()
+			}
+		})
+	}
+}
+
+func BenchmarkQueueBroadcastRedundant(b *testing.B) {
+	for _, n := range receiverCountsIncludingZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var q waiter.Queue
+			for i := 0; i < n; i++ {
+				e, _ := waiter.NewChannelEntry(nil)
+				q.EventRegister(&e, 1)
+			}
+			q.Notify(1)
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				q.Notify(1)
+			}
+		})
+	}
+}
+
+// BenchmarkXxxBroadcastAck measures how long it takes to Broadcast events to
+// multiple Receivers, check that all Receivers have received the event, and
+// clear the event from all Receivers.
+
+func BenchmarkBroadcasterBroadcastAck(b *testing.B) {
+	for _, n := range receiverCountsNonZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var br Broadcaster
+			ws := make([]Waiter, n)
+			for i := range ws {
+				ws[i].Init()
+				br.SubscribeEvents(ws[i].Receiver(), 1)
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				br.Broadcast(1)
+				for j := range ws {
+					if got, want := ws[j].Pending(), Set(1); got != want {
+						b.Fatalf("Receiver.Pending(): got %#x, wanted %#x", got, want)
+					}
+					ws[j].Ack(1)
+				}
+			}
+		})
+	}
+}
+
+func BenchmarkMapBroadcastAck(b *testing.B) {
+	for _, n := range receiverCountsNonZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var mu sync.Mutex
+			m := make(map[*Receiver]Set)
+			ws := make([]Waiter, n)
+			for i := range ws {
+				ws[i].Init()
+				m[ws[i].Receiver()] = Set(1)
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				mu.Lock()
+				for r := range m {
+					r.Notify(1)
+				}
+				mu.Unlock()
+				for j := range ws {
+					if got, want := ws[j].Pending(), Set(1); got != want {
+						b.Fatalf("Receiver.Pending(): got %#x, wanted %#x", got, want)
+					}
+					ws[j].Ack(1)
+				}
+			}
+		})
+	}
+}
+
+func BenchmarkQueueBroadcastAck(b *testing.B) {
+	for _, n := range receiverCountsNonZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var q waiter.Queue
+			chs := make([]chan struct{}, n)
+			for i := range chs {
+				e, ch := waiter.NewChannelEntry(nil)
+				q.EventRegister(&e, 1)
+				chs[i] = ch
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				q.Notify(1)
+				for _, ch := range chs {
+					select {
+					case <-ch:
+					default:
+						b.Fatalf("channel did not receive event")
+					}
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/syncevent/receiver.go b/pkg/syncevent/receiver.go
new file mode 100644
index 000000000..5c86e5400
--- /dev/null
+++ b/pkg/syncevent/receiver.go
@@ -0,0 +1,103 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/atomicbitops"
+)
+
+// Receiver is an event sink that holds pending events and invokes a callback
+// whenever new events become pending. Receiver's methods may be called
+// concurrently from multiple goroutines.
+//
+// Receiver.Init() must be called before first use.
+type Receiver struct {
+	// pending is the set of pending events. pending is accessed using atomic
+	// memory operations.
+	pending uint64
+
+	// cb is notified when new events become pending. cb is immutable after
+	// Init().
+	cb ReceiverCallback
+}
+
+// ReceiverCallback receives callbacks from a Receiver.
+type ReceiverCallback interface {
+	// NotifyPending is called when the corresponding Receiver has new pending
+	// events.
+	//
+	// NotifyPending is called synchronously from Receiver.Notify(), so
+	// implementations must not take locks that may be held by callers of
+	// Receiver.Notify(). NotifyPending may be called concurrently from
+	// multiple goroutines.
+	NotifyPending()
+}
+
+// Init must be called before first use of r.
+func (r *Receiver) Init(cb ReceiverCallback) {
+	r.cb = cb
+}
+
+// Pending returns the set of pending events.
+func (r *Receiver) Pending() Set {
+	return Set(atomic.LoadUint64(&r.pending))
+}
+
+// Notify sets the given events as pending.
+func (r *Receiver) Notify(es Set) {
+	p := Set(atomic.LoadUint64(&r.pending))
+	// Optimization: Skip the atomic CAS on r.pending if all events are
+	// already pending.
+	if p&es == es {
+		return
+	}
+	// When this is uncontended (the common case), CAS is faster than
+	// atomic-OR because the former is inlined and the latter (which we
+	// implement in assembly ourselves) is not.
+	if !atomic.CompareAndSwapUint64(&r.pending, uint64(p), uint64(p|es)) {
+		// If the CAS fails, fall back to atomic-OR.
+		atomicbitops.OrUint64(&r.pending, uint64(es))
+	}
+	r.cb.NotifyPending()
+}
+
+// Ack unsets the given events as pending.
+func (r *Receiver) Ack(es Set) {
+	p := Set(atomic.LoadUint64(&r.pending))
+	// Optimization: Skip the atomic CAS on r.pending if all events are
+	// already not pending.
+	if p&es == 0 {
+		return
+	}
+	// When this is uncontended (the common case), CAS is faster than
+	// atomic-AND because the former is inlined and the latter (which we
+	// implement in assembly ourselves) is not.
+	if !atomic.CompareAndSwapUint64(&r.pending, uint64(p), uint64(p&^es)) {
+		// If the CAS fails, fall back to atomic-AND.
+		atomicbitops.AndUint64(&r.pending, ^uint64(es))
+	}
+}
+
+// PendingAndAckAll unsets all events as pending and returns the set of
+// previously-pending events.
+//
+// PendingAndAckAll should only be used in preference to a call to Pending
+// followed by a conditional call to Ack when the caller expects events to be
+// pending (e.g. after a call to ReceiverCallback.NotifyPending()).
+func (r *Receiver) PendingAndAckAll() Set {
+	return Set(atomic.SwapUint64(&r.pending, 0))
+}
diff --git a/pkg/syncevent/source.go b/pkg/syncevent/source.go
new file mode 100644
index 000000000..ddffb171a
--- /dev/null
+++ b/pkg/syncevent/source.go
@@ -0,0 +1,59 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+// Source represents an event source.
+type Source interface {
+	// SubscribeEvents causes the Source to notify the given Receiver of the
+	// given subset of events.
+	//
+	// Preconditions: r != nil. The ReceiverCallback for r must not take locks
+	// that are ordered prior to the Source; for example, it cannot call any
+	// Source methods.
+	SubscribeEvents(r *Receiver, filter Set) SubscriptionID
+
+	// UnsubscribeEvents causes the Source to stop notifying the Receiver
+	// subscribed by a previous call to SubscribeEvents that returned the given
+	// SubscriptionID.
+	//
+	// Preconditions: UnsubscribeEvents may be called at most once for any
+	// given SubscriptionID.
+	UnsubscribeEvents(id SubscriptionID)
+}
+
+// SubscriptionID identifies a call to Source.SubscribeEvents.
+type SubscriptionID uint64
+
+// UnsubscribeAndAck is a convenience function that unsubscribes r from the
+// given events from src and also clears them from r.
+func UnsubscribeAndAck(src Source, r *Receiver, filter Set, id SubscriptionID) {
+	src.UnsubscribeEvents(id)
+	r.Ack(filter)
+}
+
+// NoopSource implements Source by never sending events to subscribed
+// Receivers.
+type NoopSource struct{}
+
+// SubscribeEvents implements Source.SubscribeEvents.
+func (NoopSource) SubscribeEvents(*Receiver, Set) SubscriptionID {
+	return 0
+}
+
+// UnsubscribeEvents implements Source.UnsubscribeEvents.
+func (NoopSource) UnsubscribeEvents(SubscriptionID) {
+}
+
+// See Broadcaster for a non-noop implementations of Source.
diff --git a/pkg/syncevent/syncevent.go b/pkg/syncevent/syncevent.go
new file mode 100644
index 000000000..9fb6a06de
--- /dev/null
+++ b/pkg/syncevent/syncevent.go
@@ -0,0 +1,32 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package syncevent provides efficient primitives for goroutine
+// synchronization based on event bitmasks.
+package syncevent
+
+// Set is a bitmask where each bit represents a distinct user-defined event.
+// The event package does not treat any bits in Set specially.
+type Set uint64
+
+const (
+	// NoEvents is a Set containing no events.
+	NoEvents = Set(0)
+
+	// AllEvents is a Set containing all possible events.
+	AllEvents = ^Set(0)
+
+	// MaxEvents is the number of distinct events that can be represented by a Set.
+	MaxEvents = 64
+)
diff --git a/pkg/syncevent/syncevent_example_test.go b/pkg/syncevent/syncevent_example_test.go
new file mode 100644
index 000000000..bfb18e2ea
--- /dev/null
+++ b/pkg/syncevent/syncevent_example_test.go
@@ -0,0 +1,108 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"fmt"
+	"sync/atomic"
+	"time"
+)
+
+func Example_ioReadinessInterrputible() {
+	const (
+		evReady = Set(1 << iota)
+		evInterrupt
+	)
+	errNotReady := fmt.Errorf("not ready for I/O")
+
+	// State of some I/O object.
+	var (
+		br    Broadcaster
+		ready uint32
+	)
+	doIO := func() error {
+		if atomic.LoadUint32(&ready) == 0 {
+			return errNotReady
+		}
+		return nil
+	}
+	go func() {
+		// The I/O object eventually becomes ready for I/O.
+		time.Sleep(100 * time.Millisecond)
+		// When it does, it first ensures that future calls to isReady() return
+		// true, then broadcasts the readiness event to Receivers.
+		atomic.StoreUint32(&ready, 1)
+		br.Broadcast(evReady)
+	}()
+
+	// Each user of the I/O object owns a Waiter.
+	var w Waiter
+	w.Init()
+	// The Waiter may be asynchronously interruptible, e.g. for signal
+	// handling in the sentry.
+	go func() {
+		time.Sleep(200 * time.Millisecond)
+		w.Receiver().Notify(evInterrupt)
+	}()
+
+	// To use the I/O object:
+	//
+	// Optionally, if the I/O object is likely to be ready, attempt I/O first.
+	err := doIO()
+	if err == nil {
+		// Success, we're done.
+		return /* nil */
+	}
+	if err != errNotReady {
+		// Failure, I/O failed for some reason other than readiness.
+		return /* err */
+	}
+	// Subscribe for readiness events from the I/O object.
+	id := br.SubscribeEvents(w.Receiver(), evReady)
+	// When we are finished blocking, unsubscribe from readiness events and
+	// remove readiness events from the pending event set.
+	defer UnsubscribeAndAck(&br, w.Receiver(), evReady, id)
+	for {
+		// Attempt I/O again. This must be done after the call to SubscribeEvents,
+		// since the I/O object might have become ready between the previous call
+		// to doIO and the call to SubscribeEvents.
+		err = doIO()
+		if err == nil {
+			return /* nil */
+		}
+		if err != errNotReady {
+			return /* err */
+		}
+		// Block until either the I/O object indicates it is ready, or we are
+		// interrupted.
+		events := w.Wait()
+		if events&evInterrupt != 0 {
+			// In the specific case of sentry signal handling, signal delivery
+			// is handled by another system, so we aren't responsible for
+			// acknowledging evInterrupt.
+			return /* errInterrupted */
+		}
+		// Note that, in a concurrent context, the I/O object might become
+		// ready and then not ready again. To handle this:
+		//
+		// - evReady must be acknowledged before calling doIO() again (rather
+		// than after), so that if the I/O object becomes ready *again* after
+		// the call to doIO(), the readiness event is not lost.
+		//
+		// - We must loop instead of just calling doIO() once after receiving
+		// evReady.
+		w.Ack(evReady)
+	}
+}
diff --git a/pkg/syncevent/waiter_amd64.s b/pkg/syncevent/waiter_amd64.s
new file mode 100644
index 000000000..985b56ae5
--- /dev/null
+++ b/pkg/syncevent/waiter_amd64.s
@@ -0,0 +1,32 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// See waiter_noasm_unsafe.go for a description of waiterUnlock.
+//
+// func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool
+TEXT ·waiterUnlock(SB),NOSPLIT,$0-24
+	MOVQ g+0(FP), DI
+	MOVQ wg+8(FP), SI
+
+	MOVQ $·preparingG(SB), AX
+	LOCK
+	CMPXCHGQ DI, 0(SI)
+
+	SETEQ AX
+	MOVB AX, ret+16(FP)
+
+	RET
+
diff --git a/pkg/syncevent/waiter_arm64.s b/pkg/syncevent/waiter_arm64.s
new file mode 100644
index 000000000..20d7ac23b
--- /dev/null
+++ b/pkg/syncevent/waiter_arm64.s
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// See waiter_noasm_unsafe.go for a description of waiterUnlock.
+//
+// func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool
+TEXT ·waiterUnlock(SB),NOSPLIT,$0-24
+	MOVD wg+8(FP), R0
+	MOVD $·preparingG(SB), R1
+	MOVD g+0(FP), R2
+again:
+	LDAXR (R0), R3
+	CMP R1, R3
+	BNE ok
+	STLXR R2, (R0), R3
+	CBNZ R3, again
+ok:
+	CSET EQ, R0
+	MOVB R0, ret+16(FP)
+	RET
+
diff --git a/pkg/syncevent/waiter_asm_unsafe.go b/pkg/syncevent/waiter_asm_unsafe.go
new file mode 100644
index 000000000..0995e9053
--- /dev/null
+++ b/pkg/syncevent/waiter_asm_unsafe.go
@@ -0,0 +1,24 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64 arm64
+
+package syncevent
+
+import (
+	"unsafe"
+)
+
+// See waiter_noasm_unsafe.go for a description of waiterUnlock.
+func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool
diff --git a/pkg/syncevent/waiter_noasm_unsafe.go b/pkg/syncevent/waiter_noasm_unsafe.go
new file mode 100644
index 000000000..1c4b0e39a
--- /dev/null
+++ b/pkg/syncevent/waiter_noasm_unsafe.go
@@ -0,0 +1,39 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// waiterUnlock is called from g0, so when the race detector is enabled,
+// waiterUnlock must be implemented in assembly since no race context is
+// available.
+//
+// +build !race
+// +build !amd64,!arm64
+
+package syncevent
+
+import (
+	"sync/atomic"
+	"unsafe"
+)
+
+// waiterUnlock is the "unlock function" passed to runtime.gopark by
+// Waiter.Wait*. wg is &Waiter.g, and g is a pointer to the calling runtime.g.
+// waiterUnlock returns true if Waiter.Wait should sleep and false if sleeping
+// should be aborted.
+//
+//go:nosplit
+func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool {
+	// The only way this CAS can fail is if a call to Waiter.NotifyPending()
+	// has replaced *wg with nil, in which case we should not sleep.
+	return atomic.CompareAndSwapPointer(wg, (unsafe.Pointer)(&preparingG), g)
+}
diff --git a/pkg/syncevent/waiter_test.go b/pkg/syncevent/waiter_test.go
new file mode 100644
index 000000000..3c8cbcdd8
--- /dev/null
+++ b/pkg/syncevent/waiter_test.go
@@ -0,0 +1,414 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+func TestWaiterAlreadyPending(t *testing.T) {
+	var w Waiter
+	w.Init()
+	want := Set(1)
+	w.Notify(want)
+	if got := w.Wait(); got != want {
+		t.Errorf("Waiter.Wait: got %#x, wanted %#x", got, want)
+	}
+}
+
+func TestWaiterAsyncNotify(t *testing.T) {
+	var w Waiter
+	w.Init()
+	want := Set(1)
+	go func() {
+		time.Sleep(100 * time.Millisecond)
+		w.Notify(want)
+	}()
+	if got := w.Wait(); got != want {
+		t.Errorf("Waiter.Wait: got %#x, wanted %#x", got, want)
+	}
+}
+
+func TestWaiterWaitFor(t *testing.T) {
+	var w Waiter
+	w.Init()
+	evWaited := Set(1)
+	evOther := Set(2)
+	w.Notify(evOther)
+	notifiedEvent := uint32(0)
+	go func() {
+		time.Sleep(100 * time.Millisecond)
+		atomic.StoreUint32(&notifiedEvent, 1)
+		w.Notify(evWaited)
+	}()
+	if got, want := w.WaitFor(evWaited), evWaited|evOther; got != want {
+		t.Errorf("Waiter.WaitFor: got %#x, wanted %#x", got, want)
+	}
+	if atomic.LoadUint32(&notifiedEvent) == 0 {
+		t.Errorf("Waiter.WaitFor returned before goroutine notified waited-for event")
+	}
+}
+
+func TestWaiterWaitAndAckAll(t *testing.T) {
+	var w Waiter
+	w.Init()
+	w.Notify(AllEvents)
+	if got := w.WaitAndAckAll(); got != AllEvents {
+		t.Errorf("Waiter.WaitAndAckAll: got %#x, wanted %#x", got, AllEvents)
+	}
+	if got := w.Pending(); got != NoEvents {
+		t.Errorf("Waiter.WaitAndAckAll did not ack all events: got %#x, wanted 0", got)
+	}
+}
+
+// BenchmarkWaiterX, BenchmarkSleeperX, and BenchmarkChannelX benchmark usage
+// pattern X (described in terms of Waiter) with Waiter, sleep.Sleeper, and
+// buffered chan struct{} respectively. When the maximum number of event
+// sources is relevant, we use 3 event sources because this is representative
+// of the kernel.Task.block() use case: an interrupt source, a timeout source,
+// and the actual event source being waited on.
+
+// Event set used by most benchmarks.
+const evBench Set = 1
+
+// BenchmarkXxxNotifyRedundant measures how long it takes to notify a Waiter of
+// an event that is already pending.
+
+func BenchmarkWaiterNotifyRedundant(b *testing.B) {
+	var w Waiter
+	w.Init()
+	w.Notify(evBench)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Notify(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyRedundant(b *testing.B) {
+	var s sleep.Sleeper
+	var w sleep.Waker
+	s.AddWaker(&w, 0)
+	w.Assert()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Assert()
+	}
+}
+
+func BenchmarkChannelNotifyRedundant(b *testing.B) {
+	ch := make(chan struct{}, 1)
+	ch <- struct{}{}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		select {
+		case ch <- struct{}{}:
+		default:
+		}
+	}
+}
+
+// BenchmarkXxxNotifyWaitAck measures how long it takes to notify a Waiter an
+// event, return that event using a blocking check, and then unset the event as
+// pending.
+
+func BenchmarkWaiterNotifyWaitAck(b *testing.B) {
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Notify(evBench)
+		w.Wait()
+		w.Ack(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyWaitAck(b *testing.B) {
+	var s sleep.Sleeper
+	var w sleep.Waker
+	s.AddWaker(&w, 0)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Assert()
+		s.Fetch(true)
+	}
+}
+
+func BenchmarkChannelNotifyWaitAck(b *testing.B) {
+	ch := make(chan struct{}, 1)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		// notify
+		select {
+		case ch <- struct{}{}:
+		default:
+		}
+
+		// wait + ack
+		<-ch
+	}
+}
+
+// BenchmarkSleeperMultiNotifyWaitAck is equivalent to
+// BenchmarkSleeperNotifyWaitAck, but also includes allocation of a
+// temporary sleep.Waker. This is necessary when multiple goroutines may wait
+// for the same event, since each sleep.Waker can wake only a single
+// sleep.Sleeper.
+//
+// The syncevent package does not require a distinct object for each
+// waiter-waker relationship, so BenchmarkWaiterNotifyWaitAck and
+// BenchmarkWaiterMultiNotifyWaitAck would be identical. The analogous state
+// for channels, runtime.sudog, is inescapably runtime-allocated, so
+// BenchmarkChannelNotifyWaitAck and BenchmarkChannelMultiNotifyWaitAck would
+// also be identical.
+
+func BenchmarkSleeperMultiNotifyWaitAck(b *testing.B) {
+	var s sleep.Sleeper
+	// The sleep package doesn't provide sync.Pool allocation of Wakers;
+	// we do for a fairer comparison.
+	wakerPool := sync.Pool{
+		New: func() interface{} {
+			return &sleep.Waker{}
+		},
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w := wakerPool.Get().(*sleep.Waker)
+		s.AddWaker(w, 0)
+		w.Assert()
+		s.Fetch(true)
+		s.Done()
+		wakerPool.Put(w)
+	}
+}
+
+// BenchmarkXxxTempNotifyWaitAck is equivalent to NotifyWaitAck, but also
+// includes allocation of a temporary Waiter. This models the case where a
+// goroutine not already associated with a Waiter needs one in order to block.
+//
+// The analogous state for channels is built into runtime.g, so
+// BenchmarkChannelNotifyWaitAck and BenchmarkChannelTempNotifyWaitAck would be
+// identical.
+
+func BenchmarkWaiterTempNotifyWaitAck(b *testing.B) {
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w := GetWaiter()
+		w.Notify(evBench)
+		w.Wait()
+		w.Ack(evBench)
+		PutWaiter(w)
+	}
+}
+
+func BenchmarkSleeperTempNotifyWaitAck(b *testing.B) {
+	// The sleep package doesn't provide sync.Pool allocation of Sleepers;
+	// we do for a fairer comparison.
+	sleeperPool := sync.Pool{
+		New: func() interface{} {
+			return &sleep.Sleeper{}
+		},
+	}
+	var w sleep.Waker
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s := sleeperPool.Get().(*sleep.Sleeper)
+		s.AddWaker(&w, 0)
+		w.Assert()
+		s.Fetch(true)
+		s.Done()
+		sleeperPool.Put(s)
+	}
+}
+
+// BenchmarkXxxNotifyWaitMultiAck is equivalent to NotifyWaitAck, but allows
+// for multiple event sources.
+
+func BenchmarkWaiterNotifyWaitMultiAck(b *testing.B) {
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Notify(evBench)
+		if e := w.Wait(); e != evBench {
+			b.Fatalf("Wait: got %#x, wanted %#x", e, evBench)
+		}
+		w.Ack(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyWaitMultiAck(b *testing.B) {
+	var s sleep.Sleeper
+	var ws [3]sleep.Waker
+	for i := range ws {
+		s.AddWaker(&ws[i], i)
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		ws[0].Assert()
+		if id, _ := s.Fetch(true); id != 0 {
+			b.Fatalf("Fetch: got %d, wanted 0", id)
+		}
+	}
+}
+
+func BenchmarkChannelNotifyWaitMultiAck(b *testing.B) {
+	ch0 := make(chan struct{}, 1)
+	ch1 := make(chan struct{}, 1)
+	ch2 := make(chan struct{}, 1)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		// notify
+		select {
+		case ch0 <- struct{}{}:
+		default:
+		}
+
+		// wait + clear
+		select {
+		case <-ch0:
+			// ok
+		case <-ch1:
+			b.Fatalf("received from ch1")
+		case <-ch2:
+			b.Fatalf("received from ch2")
+		}
+	}
+}
+
+// BenchmarkXxxNotifyAsyncWaitAck measures how long it takes to wait for an
+// event while another goroutine signals the event. This assumes that a new
+// goroutine doesn't run immediately (i.e. the creator of a new goroutine is
+// allowed to go to sleep before the new goroutine has a chance to run).
+
+func BenchmarkWaiterNotifyAsyncWaitAck(b *testing.B) {
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			w.Notify(1)
+		}()
+		w.Wait()
+		w.Ack(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyAsyncWaitAck(b *testing.B) {
+	var s sleep.Sleeper
+	var w sleep.Waker
+	s.AddWaker(&w, 0)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			w.Assert()
+		}()
+		s.Fetch(true)
+	}
+}
+
+func BenchmarkChannelNotifyAsyncWaitAck(b *testing.B) {
+	ch := make(chan struct{}, 1)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			select {
+			case ch <- struct{}{}:
+			default:
+			}
+		}()
+		<-ch
+	}
+}
+
+// BenchmarkXxxNotifyAsyncWaitMultiAck is equivalent to NotifyAsyncWaitAck, but
+// allows for multiple event sources.
+
+func BenchmarkWaiterNotifyAsyncWaitMultiAck(b *testing.B) {
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			w.Notify(evBench)
+		}()
+		if e := w.Wait(); e != evBench {
+			b.Fatalf("Wait: got %#x, wanted %#x", e, evBench)
+		}
+		w.Ack(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyAsyncWaitMultiAck(b *testing.B) {
+	var s sleep.Sleeper
+	var ws [3]sleep.Waker
+	for i := range ws {
+		s.AddWaker(&ws[i], i)
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			ws[0].Assert()
+		}()
+		if id, _ := s.Fetch(true); id != 0 {
+			b.Fatalf("Fetch: got %d, expected 0", id)
+		}
+	}
+}
+
+func BenchmarkChannelNotifyAsyncWaitMultiAck(b *testing.B) {
+	ch0 := make(chan struct{}, 1)
+	ch1 := make(chan struct{}, 1)
+	ch2 := make(chan struct{}, 1)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			select {
+			case ch0 <- struct{}{}:
+			default:
+			}
+		}()
+
+		select {
+		case <-ch0:
+			// ok
+		case <-ch1:
+			b.Fatalf("received from ch1")
+		case <-ch2:
+			b.Fatalf("received from ch2")
+		}
+	}
+}
diff --git a/pkg/syncevent/waiter_unsafe.go b/pkg/syncevent/waiter_unsafe.go
new file mode 100644
index 000000000..112e0e604
--- /dev/null
+++ b/pkg/syncevent/waiter_unsafe.go
@@ -0,0 +1,206 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.11
+// +build !go1.15
+
+// Check go:linkname function signatures when updating Go version.
+
+package syncevent
+
+import (
+	"sync/atomic"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+//go:linkname gopark runtime.gopark
+func gopark(unlockf func(unsafe.Pointer, *unsafe.Pointer) bool, wg *unsafe.Pointer, reason uint8, traceEv byte, traceskip int)
+
+//go:linkname goready runtime.goready
+func goready(g unsafe.Pointer, traceskip int)
+
+const (
+	waitReasonSelect     = 9  // Go: src/runtime/runtime2.go
+	traceEvGoBlockSelect = 24 // Go: src/runtime/trace.go
+)
+
+// Waiter allows a goroutine to block on pending events received by a Receiver.
+//
+// Waiter.Init() must be called before first use.
+type Waiter struct {
+	r Receiver
+
+	// g is one of:
+	//
+	// - nil: No goroutine is blocking in Wait.
+	//
+	// - &preparingG: A goroutine is in Wait preparing to sleep, but hasn't yet
+	// completed waiterUnlock(). Thus the wait can only be interrupted by
+	// replacing the value of g with nil (the G may not be in state Gwaiting
+	// yet, so we can't call goready.)
+	//
+	// - Otherwise: g is a pointer to the runtime.g in state Gwaiting for the
+	// goroutine blocked in Wait, which can only be woken by calling goready.
+	g unsafe.Pointer `state:"zerovalue"`
+}
+
+// Sentinel object for Waiter.g.
+var preparingG struct{}
+
+// Init must be called before first use of w.
+func (w *Waiter) Init() {
+	w.r.Init(w)
+}
+
+// Receiver returns the Receiver that receives events that unblock calls to
+// w.Wait().
+func (w *Waiter) Receiver() *Receiver {
+	return &w.r
+}
+
+// Pending returns the set of pending events.
+func (w *Waiter) Pending() Set {
+	return w.r.Pending()
+}
+
+// Wait blocks until at least one event is pending, then returns the set of
+// pending events. It does not affect the set of pending events; callers must
+// call w.Ack() to do so, or use w.WaitAndAck() instead.
+//
+// Precondition: Only one goroutine may call any Wait* method at a time.
+func (w *Waiter) Wait() Set {
+	return w.WaitFor(AllEvents)
+}
+
+// WaitFor blocks until at least one event in es is pending, then returns the
+// set of pending events (including those not in es). It does not affect the
+// set of pending events; callers must call w.Ack() to do so.
+//
+// Precondition: Only one goroutine may call any Wait* method at a time.
+func (w *Waiter) WaitFor(es Set) Set {
+	for {
+		// Optimization: Skip the atomic store to w.g if an event is already
+		// pending.
+		if p := w.r.Pending(); p&es != NoEvents {
+			return p
+		}
+
+		// Indicate that we're preparing to go to sleep.
+		atomic.StorePointer(&w.g, (unsafe.Pointer)(&preparingG))
+
+		// If an event is pending, abort the sleep.
+		if p := w.r.Pending(); p&es != NoEvents {
+			atomic.StorePointer(&w.g, nil)
+			return p
+		}
+
+		// If w.g is still preparingG (i.e. w.NotifyPending() has not been
+		// called or has not reached atomic.SwapPointer()), go to sleep until
+		// w.NotifyPending() => goready().
+		gopark(waiterUnlock, &w.g, waitReasonSelect, traceEvGoBlockSelect, 0)
+	}
+}
+
+// Ack marks the given events as not pending.
+func (w *Waiter) Ack(es Set) {
+	w.r.Ack(es)
+}
+
+// WaitAndAckAll blocks until at least one event is pending, then marks all
+// events as not pending and returns the set of previously-pending events.
+//
+// Precondition: Only one goroutine may call any Wait* method at a time.
+func (w *Waiter) WaitAndAckAll() Set {
+	// Optimization: Skip the atomic store to w.g if an event is already
+	// pending. Call Pending() first since, in the common case that events are
+	// not yet pending, this skips an atomic swap on w.r.pending.
+	if w.r.Pending() != NoEvents {
+		if p := w.r.PendingAndAckAll(); p != NoEvents {
+			return p
+		}
+	}
+
+	for {
+		// Indicate that we're preparing to go to sleep.
+		atomic.StorePointer(&w.g, (unsafe.Pointer)(&preparingG))
+
+		// If an event is pending, abort the sleep.
+		if w.r.Pending() != NoEvents {
+			if p := w.r.PendingAndAckAll(); p != NoEvents {
+				atomic.StorePointer(&w.g, nil)
+				return p
+			}
+		}
+
+		// If w.g is still preparingG (i.e. w.NotifyPending() has not been
+		// called or has not reached atomic.SwapPointer()), go to sleep until
+		// w.NotifyPending() => goready().
+		gopark(waiterUnlock, &w.g, waitReasonSelect, traceEvGoBlockSelect, 0)
+
+		// Check for pending events. We call PendingAndAckAll() directly now since
+		// we only expect to be woken after events become pending.
+		if p := w.r.PendingAndAckAll(); p != NoEvents {
+			return p
+		}
+	}
+}
+
+// Notify marks the given events as pending, possibly unblocking concurrent
+// calls to w.Wait() or w.WaitFor().
+func (w *Waiter) Notify(es Set) {
+	w.r.Notify(es)
+}
+
+// NotifyPending implements ReceiverCallback.NotifyPending. Users of Waiter
+// should not call NotifyPending.
+func (w *Waiter) NotifyPending() {
+	// Optimization: Skip the atomic swap on w.g if there is no sleeping
+	// goroutine. NotifyPending is called after w.r.Pending() is updated, so
+	// concurrent and future calls to w.Wait() will observe pending events and
+	// abort sleeping.
+	if atomic.LoadPointer(&w.g) == nil {
+		return
+	}
+	// Wake a sleeping G, or prevent a G that is preparing to sleep from doing
+	// so. Swap is needed here to ensure that only one call to NotifyPending
+	// calls goready.
+	if g := atomic.SwapPointer(&w.g, nil); g != nil && g != (unsafe.Pointer)(&preparingG) {
+		goready(g, 0)
+	}
+}
+
+var waiterPool = sync.Pool{
+	New: func() interface{} {
+		w := &Waiter{}
+		w.Init()
+		return w
+	},
+}
+
+// GetWaiter returns an unused Waiter. PutWaiter should be called to release
+// the Waiter once it is no longer needed.
+//
+// Where possible, users should prefer to associate each goroutine that calls
+// Waiter.Wait() with a distinct pre-allocated Waiter to avoid allocation of
+// Waiters in hot paths.
+func GetWaiter() *Waiter {
+	return waiterPool.Get().(*Waiter)
+}
+
+// PutWaiter releases an unused Waiter previously returned by GetWaiter.
+func PutWaiter(w *Waiter) {
+	waiterPool.Put(w)
+}
-- 
cgit v1.2.3


From 247843bbc51d459b279db24a262f68b4dac1cc01 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 15:24:59 -0800
Subject: iptables: use "-t nat" for NAT tests

PiperOrigin-RevId: 295835807
---
 test/iptables/iptables_util.go | 11 ++++++++++-
 test/iptables/nat.go           |  4 ++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index 293c4e6ed..32cf5a417 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -27,7 +27,16 @@ const iptablesBinary = "iptables"
 
 // filterTable calls `iptables -t filter` with the given args.
 func filterTable(args ...string) error {
-	args = append([]string{"-t", "filter"}, args...)
+	return tableCmd("filter", args)
+}
+
+// natTable calls `iptables -t nat` with the given args.
+func natTable(args ...string) error {
+	return tableCmd("nat", args)
+}
+
+func tableCmd(table string, args []string) error {
+	args = append([]string{"-t", table}, args...)
 	cmd := exec.Command(iptablesBinary, args...)
 	if out, err := cmd.CombinedOutput(); err != nil {
 		return fmt.Errorf("error running iptables with args %v\nerror: %v\noutput: %s", args, err, string(out))
diff --git a/test/iptables/nat.go b/test/iptables/nat.go
index b5c6f927e..a01117ec8 100644
--- a/test/iptables/nat.go
+++ b/test/iptables/nat.go
@@ -38,7 +38,7 @@ func (NATRedirectUDPPort) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (NATRedirectUDPPort) ContainerAction(ip net.IP) error {
-	if err := filterTable("-t", "nat", "-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
+	if err := natTable("-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
 		return err
 	}
 
@@ -63,7 +63,7 @@ func (NATDropUDP) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (NATDropUDP) ContainerAction(ip net.IP) error {
-	if err := filterTable("-t", "nat", "-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
+	if err := natTable("-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
 		return err
 	}
 
-- 
cgit v1.2.3


From 56fd9504aab44a738d3df164cbee8e572b309f28 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 15:44:22 -0800
Subject: Enable IPV6_RECVTCLASS socket option for datagram sockets

Added the ability to get/set the IP_RECVTCLASS socket option on UDP endpoints.
If enabled, traffic class from the incoming Network Header passed as ancillary
data in the ControlMessages.

Adding Get/SetSockOptBool to decrease the overhead of getting/setting simple
options. (This was absorbed in a CL that will be landing before this one).

Test:
* Added unit test to udp_test.go that tests getting/setting as well as
verifying that we receive expected TOS from incoming packet.
* Added a syscall test for verifying getting/setting
* Removed test skip for existing syscall test to enable end to end test.
PiperOrigin-RevId: 295840218
---
 pkg/sentry/socket/control/control.go         |   2 +-
 pkg/sentry/socket/netstack/netstack.go       |  27 +++++-
 pkg/tcpip/checker/checker.go                 |  14 +++
 pkg/tcpip/tcpip.go                           |  15 ++-
 pkg/tcpip/transport/udp/endpoint.go          |  38 +++++++-
 pkg/tcpip/transport/udp/udp_test.go          | 120 ++++++++++++++----------
 test/syscalls/linux/ip_socket_test_util.h    |  16 ++--
 test/syscalls/linux/socket_ip_udp_generic.cc | 133 +++++++++++++++++++--------
 test/syscalls/linux/udp_socket_test_cases.cc |   4 -
 9 files changed, 260 insertions(+), 109 deletions(-)

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 4667373d2..8834a1e1a 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -329,7 +329,7 @@ func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
 }
 
 // PackTClass packs an IPV6_TCLASS socket control message.
-func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
+func PackTClass(t *kernel.Task, tClass uint32, buf []byte) []byte {
 	return putCmsgStruct(
 		buf,
 		linux.SOL_IPV6,
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 9757fbfba..e187276c5 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1318,6 +1318,22 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		}
 		return ib, nil
 
+	case linux.IPV6_RECVTCLASS:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveTClassOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		var o int32
+		if v {
+			o = 1
+		}
+		return o, nil
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
@@ -1803,6 +1819,14 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv6TrafficClassOption(v)))
 
+	case linux.IPV6_RECVTCLASS:
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0))
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
@@ -2086,7 +2110,6 @@ func emitUnimplementedEventIPv6(t *kernel.Task, name int) {
 		linux.IPV6_RECVPATHMTU,
 		linux.IPV6_RECVPKTINFO,
 		linux.IPV6_RECVRTHDR,
-		linux.IPV6_RECVTCLASS,
 		linux.IPV6_RTHDR,
 		linux.IPV6_RTHDRDSTOPTS,
 		linux.IPV6_TCLASS,
@@ -2424,6 +2447,8 @@ func (s *SocketOperations) controlMessages() socket.ControlMessages {
 			Timestamp:       s.readCM.Timestamp,
 			HasTOS:          s.readCM.HasTOS,
 			TOS:             s.readCM.TOS,
+			HasTClass:       s.readCM.HasTClass,
+			TClass:          s.readCM.TClass,
 			HasIPPacketInfo: s.readCM.HasIPPacketInfo,
 			PacketInfo:      s.readCM.PacketInfo,
 		},
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 4d6ae0871..c6c160dfc 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -161,6 +161,20 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	}
 }
 
+// ReceiveTClass creates a checker that checks the TCLASS field in
+// ControlMessages.
+func ReceiveTClass(want uint32) ControlMessagesChecker {
+	return func(t *testing.T, cm tcpip.ControlMessages) {
+		t.Helper()
+		if !cm.HasTClass {
+			t.Fatalf("got cm.HasTClass = %t, want cm.TClass = %d", cm.HasTClass, want)
+		}
+		if got := cm.TClass; got != want {
+			t.Fatalf("got cm.TClass = %d, want %d", got, want)
+		}
+	}
+}
+
 // ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
 func ReceiveTOS(want uint8) ControlMessagesChecker {
 	return func(t *testing.T, cm tcpip.ControlMessages) {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 9ca39ce40..ce5527391 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -323,11 +323,11 @@ type ControlMessages struct {
 	// TOS is the IPv4 type of service of the associated packet.
 	TOS uint8
 
-	// HasTClass indicates whether Tclass is valid/set.
+	// HasTClass indicates whether TClass is valid/set.
 	HasTClass bool
 
-	// Tclass is the IPv6 traffic class of the associated packet.
-	TClass int32
+	// TClass is the IPv6 traffic class of the associated packet.
+	TClass uint32
 
 	// HasIPPacketInfo indicates whether PacketInfo is set.
 	HasIPPacketInfo bool
@@ -502,9 +502,13 @@ type WriteOptions struct {
 type SockOptBool int
 
 const (
+	// ReceiveTClassOption is used by SetSockOpt/GetSockOpt to specify if the
+	// IPV6_TCLASS ancillary message is passed with incoming packets.
+	ReceiveTClassOption SockOptBool = iota
+
 	// ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
 	// ancillary message is passed with incoming packets.
-	ReceiveTOSOption SockOptBool = iota
+	ReceiveTOSOption
 
 	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
 	// socket is to be restricted to sending and receiving IPv6 packets only.
@@ -514,6 +518,9 @@ const (
 	// if more inforamtion is provided with incoming packets such
 	// as interface index and address.
 	ReceiveIPPacketInfoOption
+
+	// TODO(b/146901447): convert existing bool socket options to be handled via
+	// Get/SetSockOptBool
 )
 
 // SockOptInt represents socket options which values have the int type.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 3fe91cac2..eff7f3600 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -32,7 +32,8 @@ type udpPacket struct {
 	packetInfo    tcpip.IPPacketInfo
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
-	tos           uint8
+	// tos stores either the receiveTOS or receiveTClass value.
+	tos uint8
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -119,6 +120,10 @@ type endpoint struct {
 	// as ancillary data to ControlMessages on Read.
 	receiveTOS bool
 
+	// receiveTClass determines if the incoming IPv6 TClass header field is
+	// passed as ancillary data to ControlMessages on Read.
+	receiveTClass bool
+
 	// receiveIPPacketInfo determines if the packet info is returned by Read.
 	receiveIPPacketInfo bool
 
@@ -258,13 +263,18 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 	}
 	e.mu.RLock()
 	receiveTOS := e.receiveTOS
+	receiveTClass := e.receiveTClass
 	receiveIPPacketInfo := e.receiveIPPacketInfo
 	e.mu.RUnlock()
 	if receiveTOS {
 		cm.HasTOS = true
 		cm.TOS = p.tos
 	}
-
+	if receiveTClass {
+		cm.HasTClass = true
+		// Although TClass is an 8-bit value it's read in the CMsg as a uint32.
+		cm.TClass = uint32(p.tos)
+	}
 	if receiveIPPacketInfo {
 		cm.HasIPPacketInfo = true
 		cm.PacketInfo = p.packetInfo
@@ -490,6 +500,17 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return tcpip.ErrNotSupported
+		}
+
+		e.mu.Lock()
+		e.receiveTClass = v
+		e.mu.Unlock()
+		return nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -709,6 +730,17 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		e.mu.RUnlock()
 		return v, nil
 
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return false, tcpip.ErrNotSupported
+		}
+
+		e.mu.RLock()
+		v := e.receiveTClass
+		e.mu.RUnlock()
+		return v, nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -1273,6 +1305,8 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		packet.packetInfo.LocalAddr = r.LocalAddress
 		packet.packetInfo.DestinationAddr = r.RemoteAddress
 		packet.packetInfo.NIC = r.NICID()
+	case header.IPv6ProtocolNumber:
+		packet.tos, _ = header.IPv6(pkt.NetworkHeader).TOS()
 	}
 
 	packet.timestamp = e.stack.NowNanoseconds()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index f0ff3fe71..34b7c2360 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -409,6 +409,7 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 	// Initialize the IP header.
 	ip := header.IPv6(buf)
 	ip.Encode(&header.IPv6Fields{
+		TrafficClass:  testTOS,
 		PayloadLength: uint16(header.UDPMinimumSize + len(payload)),
 		NextHeader:    uint8(udp.ProtocolNumber),
 		HopLimit:      65,
@@ -1336,7 +1337,7 @@ func TestSetTTL(t *testing.T) {
 	}
 }
 
-func TestTOSV4(t *testing.T) {
+func TestSetTOS(t *testing.T) {
 	for _, flow := range []testFlow{unicastV4, multicastV4, broadcast} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
 			c := newDualTestContext(t, defaultMTU)
@@ -1347,23 +1348,23 @@ func TestTOSV4(t *testing.T) {
 			const tos = testTOS
 			var v tcpip.IPv4TOSOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, 0)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, 0)
 			}
 
 			if err := c.ep.SetSockOpt(tcpip.IPv4TOSOption(tos)); err != nil {
-				c.t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.IPv4TOSOption(tos), err)
+				c.t.Errorf("SetSockOpt(%T, 0x%x) failed: %s", v, tcpip.IPv4TOSOption(tos), err)
 			}
 
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 
 			if want := tcpip.IPv4TOSOption(tos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, want)
 			}
 
 			testWrite(c, flow, checker.TOS(tos, 0))
@@ -1371,7 +1372,7 @@ func TestTOSV4(t *testing.T) {
 	}
 }
 
-func TestTOSV6(t *testing.T) {
+func TestSetTClass(t *testing.T) {
 	for _, flow := range []testFlow{unicastV4in6, unicastV6, unicastV6Only, multicastV4in6, multicastV6, broadcastIn6} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
 			c := newDualTestContext(t, defaultMTU)
@@ -1379,71 +1380,92 @@ func TestTOSV6(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = testTOS
+			const tClass = testTOS
 			var v tcpip.IPv6TrafficClassOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, 0)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, 0)
 			}
 
-			if err := c.ep.SetSockOpt(tcpip.IPv6TrafficClassOption(tos)); err != nil {
-				c.t.Errorf("SetSockOpt failed: %s", err)
+			if err := c.ep.SetSockOpt(tcpip.IPv6TrafficClassOption(tClass)); err != nil {
+				c.t.Errorf("SetSockOpt(%T, 0x%x) failed: %s", v, tcpip.IPv6TrafficClassOption(tClass), err)
 			}
 
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 
-			if want := tcpip.IPv6TrafficClassOption(tos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+			if want := tcpip.IPv6TrafficClassOption(tClass); v != want {
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, want)
 			}
 
-			testWrite(c, flow, checker.TOS(tos, 0))
+			// The header getter for TClass is called TOS, so use that checker.
+			testWrite(c, flow, checker.TOS(tClass, 0))
 		})
 	}
 }
 
-func TestReceiveTOSV4(t *testing.T) {
-	for _, flow := range []testFlow{unicastV4, broadcast} {
-		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
-			c := newDualTestContext(t, defaultMTU)
-			defer c.cleanup()
+func TestReceiveTosTClass(t *testing.T) {
+	testCases := []struct {
+		name             string
+		getReceiveOption tcpip.SockOptBool
+		tests            []testFlow
+	}{
+		{"ReceiveTosOption", tcpip.ReceiveTOSOption, []testFlow{unicastV4, broadcast}},
+		{"ReceiveTClassOption", tcpip.ReceiveTClassOption, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}},
+	}
+	for _, testCase := range testCases {
+		for _, flow := range testCase.tests {
+			t.Run(fmt.Sprintf("%s:flow:%s", testCase.name, flow), func(t *testing.T) {
+				c := newDualTestContext(t, defaultMTU)
+				defer c.cleanup()
 
-			c.createEndpointForFlow(flow)
+				c.createEndpointForFlow(flow)
+				option := testCase.getReceiveOption
+				name := testCase.name
 
-			// Verify that setting and reading the option works.
-			v, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
-			if err != nil {
-				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
-			}
-			// Test for expected default value.
-			if v != false {
-				c.t.Errorf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", v, false)
-			}
+				// Verify that setting and reading the option works.
+				v, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockoptBool(%s) failed: %s", name, err)
+				}
+				// Test for expected default value.
+				if v != false {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, v, false)
+				}
 
-			want := true
-			if err := c.ep.SetSockOptBool(tcpip.ReceiveTOSOption, want); err != nil {
-				c.t.Fatalf("SetSockOptBool(tcpip.ReceiveTOSOption, %t) failed: %s", want, err)
-			}
+				want := true
+				if err := c.ep.SetSockOptBool(option, want); err != nil {
+					c.t.Fatalf("SetSockOptBool(%s, %t) failed: %s", name, want, err)
+				}
 
-			got, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
-			if err != nil {
-				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
-			}
-			if got != want {
-				c.t.Fatalf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", got, want)
-			}
+				got, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockoptBool(%s) failed: %s", name, err)
+				}
 
-			// Verify that the correct received TOS is handed through as
-			// ancillary data to the ControlMessages struct.
-			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-				c.t.Fatal("Bind failed:", err)
-			}
-			testRead(c, flow, checker.ReceiveTOS(testTOS))
-		})
+				if got != want {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, got, want)
+				}
+
+				// Verify that the correct received TOS or TClass is handed through as
+				// ancillary data to the ControlMessages struct.
+				if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+					c.t.Fatalf("Bind failed: %s", err)
+				}
+				switch option {
+				case tcpip.ReceiveTClassOption:
+					testRead(c, flow, checker.ReceiveTClass(testTOS))
+				case tcpip.ReceiveTOSOption:
+					testRead(c, flow, checker.ReceiveTOS(testTOS))
+				default:
+					t.Fatalf("unknown test variant: %s", name)
+				}
+			})
+		}
 	}
 }
 
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 083ebbcf0..39fd6709d 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -84,20 +84,20 @@ SocketPairKind DualStackUDPBidirectionalBindSocketPair(int type);
 // SocketPairs created with AF_INET and the given type.
 SocketPairKind IPv4UDPUnboundSocketPair(int type);
 
-// IPv4UDPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET, SOCK_DGRAM, and the given type.
+// IPv4UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_DGRAM, and the given type.
 SocketKind IPv4UDPUnboundSocket(int type);
 
-// IPv6UDPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET6, SOCK_DGRAM, and the given type.
+// IPv6UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_DGRAM, and the given type.
 SocketKind IPv6UDPUnboundSocket(int type);
 
-// IPv4TCPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET, SOCK_STREAM and the given type.
+// IPv4TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_STREAM and the given type.
 SocketKind IPv4TCPUnboundSocket(int type);
 
-// IPv6TCPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET6, SOCK_STREAM and the given type.
+// IPv6TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_STREAM and the given type.
 SocketKind IPv6TCPUnboundSocket(int type);
 
 // IfAddrHelper is a helper class that determines the local interfaces present
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index db5663ecd..1c533fdf2 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -14,6 +14,7 @@
 
 #include "test/syscalls/linux/socket_ip_udp_generic.h"
 
+#include <errno.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <poll.h>
@@ -209,46 +210,6 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
-// Ensure that Receiving TOS is off by default.
-TEST_P(UDPSocketPairTest, RecvTosDefault) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-}
-
-// Test that setting and getting IP_RECVTOS works as expected.
-TEST_P(UDPSocketPairTest, SetRecvTos) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOff, sizeof(kSockOptOff)),
-              SyscallSucceeds());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOn, sizeof(kSockOptOn)),
-              SyscallSucceeds());
-
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOn);
-}
-
 TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -401,5 +362,97 @@ TEST_P(UDPSocketPairTest, SetAndGetIPPKTINFO) {
   EXPECT_EQ(get_len, sizeof(get));
 }
 
+// Holds TOS or TClass information for IPv4 or IPv6 respectively.
+struct RecvTosOption {
+  int level;
+  int option;
+};
+
+RecvTosOption GetRecvTosOption(int domain) {
+  TEST_CHECK(domain == AF_INET || domain == AF_INET6);
+  RecvTosOption opt;
+  switch (domain) {
+    case AF_INET:
+      opt.level = IPPROTO_IP;
+      opt.option = IP_RECVTOS;
+      break;
+    case AF_INET6:
+      opt.level = IPPROTO_IPV6;
+      opt.option = IPV6_RECVTCLASS;
+      break;
+  }
+  return opt;
+}
+
+// Ensure that Receiving TOS or TCLASS is off by default.
+TEST_P(UDPSocketPairTest, RecvTosDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(GetParam().domain);
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that setting and getting IP_RECVTOS or IPV6_RECVTCLASS works as
+// expected.
+TEST_P(UDPSocketPairTest, SetRecvTos) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(GetParam().domain);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOff,
+                         sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+}
+
+// Test that any socket (including IPv6 only) accepts the IPv4 TOS option: this
+// mirrors behavior in linux.
+TEST_P(UDPSocketPairTest, TOSRecvMismatch) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(AF_INET);
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+}
+
+// Test that an IPv4 socket does not support the IPv6 TClass option.
+TEST_P(UDPSocketPairTest, TClassRecvMismatch) {
+  // This should only test AF_INET sockets for the mismatch behavior.
+  SKIP_IF(GetParam().domain != AF_INET);
+
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IPV6, IPV6_RECVTCLASS,
+                         &get, &get_len),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 9f8de6b48..57b1a357c 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,9 +1349,6 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
-  SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
-          !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1422,7 +1419,6 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
   // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From 8dae8a10f01b49d9b28c1ae72b67cb5b83238963 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 17:03:59 -0800
Subject: Internal change.

PiperOrigin-RevId: 295857142
---
 kokoro/benchmarks/benchmarks.cfg | 26 ++++++++++++++++++++++++++
 kokoro/benchmarks/benchmarks.sh  | 25 +++++++++++++++++++++++++
 2 files changed, 51 insertions(+)
 create mode 100644 kokoro/benchmarks/benchmarks.cfg
 create mode 100644 kokoro/benchmarks/benchmarks.sh

diff --git a/kokoro/benchmarks/benchmarks.cfg b/kokoro/benchmarks/benchmarks.cfg
new file mode 100644
index 000000000..9b9073e62
--- /dev/null
+++ b/kokoro/benchmarks/benchmarks.cfg
@@ -0,0 +1,26 @@
+build_file : 'github/github/kokoro/benchmarks/benchmarks.sh'
+
+
+before_action {
+  fetch_keystore {
+    keystore_resource {
+        keystore_config_id : 73898
+        keyname : 'kokoro-rbe-service-account'
+    },
+  }
+}
+
+env_vars {
+  key : 'PROJECT'
+  value : 'gvisor-kokoro-testing'
+}
+
+env_vars {
+  key : 'ZONE'
+  value : 'us-central1-b'
+}
+
+env_vars {
+  key : 'KOKORO_SERVICE_ACCOUNT'
+  value : '73898_kokoro-rbe-service-account'
+}
diff --git a/kokoro/benchmarks/benchmarks.sh b/kokoro/benchmarks/benchmarks.sh
new file mode 100644
index 000000000..a0317db02
--- /dev/null
+++ b/kokoro/benchmarks/benchmarks.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Run in the root of the repo.
+cd "$(dirname "$0")"
+
+KEY_PATH=${KEY_PATH:-"${KOKORO_KEYSTORE_DIR}/${KOKORO_SERVICE_ACCOUNT}"}
+
+gcloud auth activate-service-account --key-file "${KEY_PATH}"
+
+gcloud compute instances list
+
-- 
cgit v1.2.3


From 92d2d78876a938871327685e1104d7b4ff46986e Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 18 Feb 2020 21:20:41 -0800
Subject: Fix mis-named comment.

---
 pkg/tcpip/stack/registration.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index ec91f60dd..d83adf0ec 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -277,7 +277,7 @@ type NetworkProtocol interface {
 	// DefaultPrefixLen returns the protocol's default prefix length.
 	DefaultPrefixLen() int
 
-	// ParsePorts returns the source and destination addresses stored in a
+	// ParseAddresses returns the source and destination addresses stored in a
 	// packet of this protocol.
 	ParseAddresses(v buffer.View) (src, dst tcpip.Address)
 
-- 
cgit v1.2.3


From 90d0851c0e37146158e4f46f6aa1707d786fb48a Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 19 Feb 2020 11:35:16 -0800
Subject: Internal change.

PiperOrigin-RevId: 296016675
---
 kokoro/benchmark.sh              | 25 +++++++++++++++++++++++++
 kokoro/benchmark_tests.cfg       | 26 ++++++++++++++++++++++++++
 kokoro/benchmarks/benchmarks.cfg | 26 --------------------------
 kokoro/benchmarks/benchmarks.sh  | 25 -------------------------
 4 files changed, 51 insertions(+), 51 deletions(-)
 create mode 100644 kokoro/benchmark.sh
 create mode 100644 kokoro/benchmark_tests.cfg
 delete mode 100644 kokoro/benchmarks/benchmarks.cfg
 delete mode 100644 kokoro/benchmarks/benchmarks.sh

diff --git a/kokoro/benchmark.sh b/kokoro/benchmark.sh
new file mode 100644
index 000000000..a0317db02
--- /dev/null
+++ b/kokoro/benchmark.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Run in the root of the repo.
+cd "$(dirname "$0")"
+
+KEY_PATH=${KEY_PATH:-"${KOKORO_KEYSTORE_DIR}/${KOKORO_SERVICE_ACCOUNT}"}
+
+gcloud auth activate-service-account --key-file "${KEY_PATH}"
+
+gcloud compute instances list
+
diff --git a/kokoro/benchmark_tests.cfg b/kokoro/benchmark_tests.cfg
new file mode 100644
index 000000000..76ba30752
--- /dev/null
+++ b/kokoro/benchmark_tests.cfg
@@ -0,0 +1,26 @@
+build_file : 'github/github/kokoro/benchmarks.sh'
+
+
+before_action {
+  fetch_keystore {
+    keystore_resource {
+        keystore_config_id : 73898
+        keyname : 'kokoro-rbe-service-account'
+    },
+  }
+}
+
+env_vars {
+  key : 'PROJECT'
+  value : 'gvisor-kokoro-testing'
+}
+
+env_vars {
+  key : 'ZONE'
+  value : 'us-central1-b'
+}
+
+env_vars {
+  key : 'KOKORO_SERVICE_ACCOUNT'
+  value : '73898_kokoro-rbe-service-account'
+}
diff --git a/kokoro/benchmarks/benchmarks.cfg b/kokoro/benchmarks/benchmarks.cfg
deleted file mode 100644
index 9b9073e62..000000000
--- a/kokoro/benchmarks/benchmarks.cfg
+++ /dev/null
@@ -1,26 +0,0 @@
-build_file : 'github/github/kokoro/benchmarks/benchmarks.sh'
-
-
-before_action {
-  fetch_keystore {
-    keystore_resource {
-        keystore_config_id : 73898
-        keyname : 'kokoro-rbe-service-account'
-    },
-  }
-}
-
-env_vars {
-  key : 'PROJECT'
-  value : 'gvisor-kokoro-testing'
-}
-
-env_vars {
-  key : 'ZONE'
-  value : 'us-central1-b'
-}
-
-env_vars {
-  key : 'KOKORO_SERVICE_ACCOUNT'
-  value : '73898_kokoro-rbe-service-account'
-}
diff --git a/kokoro/benchmarks/benchmarks.sh b/kokoro/benchmarks/benchmarks.sh
deleted file mode 100644
index a0317db02..000000000
--- a/kokoro/benchmarks/benchmarks.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-# Copyright 2020 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Run in the root of the repo.
-cd "$(dirname "$0")"
-
-KEY_PATH=${KEY_PATH:-"${KOKORO_KEYSTORE_DIR}/${KOKORO_SERVICE_ACCOUNT}"}
-
-gcloud auth activate-service-account --key-file "${KEY_PATH}"
-
-gcloud compute instances list
-
-- 
cgit v1.2.3


From 55c99ce106e03c419729318947e0be477ed181d0 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 19 Feb 2020 12:31:43 -0800
Subject: Include more test files in exports_files

So that they can be included by Fuchsia's syscall tests

PiperOrigin-RevId: 296030383
---
 test/syscalls/linux/BUILD | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index e7c82adfc..05a818795 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -12,8 +12,12 @@ exports_files(
         "socket_ip_loopback_blocking.cc",
         "socket_ip_tcp_generic_loopback.cc",
         "socket_ip_tcp_loopback.cc",
+        "socket_ip_tcp_loopback_blocking.cc",
+        "socket_ip_tcp_loopback_nonblock.cc",
         "socket_ip_tcp_udp_generic.cc",
         "socket_ip_udp_loopback.cc",
+        "socket_ip_udp_loopback_blocking.cc",
+        "socket_ip_udp_loopback_nonblock.cc",
         "socket_ip_unbound.cc",
         "socket_ipv4_tcp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_external_networking_test.cc",
-- 
cgit v1.2.3


From ddc02e489cfe3990f76882463c0ad386710a9a94 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 19 Feb 2020 13:11:38 -0800
Subject: Internal change.

PiperOrigin-RevId: 296039442
---
 kokoro/benchmark_tests.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kokoro/benchmark_tests.cfg b/kokoro/benchmark_tests.cfg
index 76ba30752..6a5d9b114 100644
--- a/kokoro/benchmark_tests.cfg
+++ b/kokoro/benchmark_tests.cfg
@@ -1,4 +1,4 @@
-build_file : 'github/github/kokoro/benchmarks.sh'
+build_file : 'repo/kokoro/benchmarks.sh'
 
 
 before_action {
-- 
cgit v1.2.3


From 3a20eccf8b2d30eeff60f616a3e1d0d15c6ffac4 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 19 Feb 2020 14:17:12 -0800
Subject: Internal change.

PiperOrigin-RevId: 296055452
---
 kokoro/benchmark.sh        | 25 -------------------------
 kokoro/benchmark_tests.cfg |  2 +-
 scripts/benchmark.sh       | 25 +++++++++++++++++++++++++
 3 files changed, 26 insertions(+), 26 deletions(-)
 delete mode 100644 kokoro/benchmark.sh
 create mode 100644 scripts/benchmark.sh

diff --git a/kokoro/benchmark.sh b/kokoro/benchmark.sh
deleted file mode 100644
index a0317db02..000000000
--- a/kokoro/benchmark.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-# Copyright 2020 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Run in the root of the repo.
-cd "$(dirname "$0")"
-
-KEY_PATH=${KEY_PATH:-"${KOKORO_KEYSTORE_DIR}/${KOKORO_SERVICE_ACCOUNT}"}
-
-gcloud auth activate-service-account --key-file "${KEY_PATH}"
-
-gcloud compute instances list
-
diff --git a/kokoro/benchmark_tests.cfg b/kokoro/benchmark_tests.cfg
index 6a5d9b114..c48518a05 100644
--- a/kokoro/benchmark_tests.cfg
+++ b/kokoro/benchmark_tests.cfg
@@ -1,4 +1,4 @@
-build_file : 'repo/kokoro/benchmarks.sh'
+build_file : 'repo/scripts/benchmark.sh'
 
 
 before_action {
diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh
new file mode 100644
index 000000000..a0317db02
--- /dev/null
+++ b/scripts/benchmark.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Run in the root of the repo.
+cd "$(dirname "$0")"
+
+KEY_PATH=${KEY_PATH:-"${KOKORO_KEYSTORE_DIR}/${KOKORO_SERVICE_ACCOUNT}"}
+
+gcloud auth activate-service-account --key-file "${KEY_PATH}"
+
+gcloud compute instances list
+
-- 
cgit v1.2.3


From 660cfdff3f2ac771c6f0f18834921cfc043b2f3a Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 19 Feb 2020 15:41:22 -0800
Subject: Handle situations where go-marshal generates an empty test file.

This can happen due to conditional compilation, where a subset of the
source files contain no marshallable types. go-marshal is still
required to write an output file in these cases, since bazel defines
the output package before calling go-marshal.

PiperOrigin-RevId: 296074321
---
 tools/go_marshal/gomarshal/generator.go | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index d3c2f72f5..0fa868415 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -380,6 +380,26 @@ func (g *Generator) writeTests(ts []*testGenerator) error {
 	}
 
 	// Write test functions.
+
+	// If we didn't generate any Marshallable implementations, we can't just
+	// emit an empty test file, since that causes the build to fail with "no
+	// tests/benchmarks/examples found". Unfortunately we can't signal bazel to
+	// omit the entire package since the outputs are already defined before
+	// go-marshal is called. If we'd otherwise emit an empty test suite, emit an
+	// empty example instead.
+	if len(ts) == 0 {
+		b.reset()
+		b.emit("func ExampleEmptyTestSuite() {\n")
+		b.inIndent(func() {
+			b.emit("// This example is intentionally empty to ensure this file contains at least\n")
+			b.emit("// one testable entity. go-marshal is forced to emit a test file if a package\n")
+			b.emit("// is marked marshallable, but emitting a test file with no entities results\n")
+			b.emit("// in a build failure.\n")
+		})
+		b.emit("}\n")
+		return b.write(g.outputTest)
+	}
+
 	for _, t := range ts {
 		if err := t.write(g.outputTest); err != nil {
 			return err
-- 
cgit v1.2.3


From 2daa21e4d73f2297a8bca32c76100333e9ac4af4 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 19 Feb 2020 16:47:58 -0800
Subject: Internal change.

PiperOrigin-RevId: 296088213
---
 pkg/sentry/socket/netstack/provider.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go
index 5afff2564..5f181f017 100644
--- a/pkg/sentry/socket/netstack/provider.go
+++ b/pkg/sentry/socket/netstack/provider.go
@@ -75,6 +75,8 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in
 		switch protocol {
 		case syscall.IPPROTO_ICMP:
 			return header.ICMPv4ProtocolNumber, true, nil
+		case syscall.IPPROTO_ICMPV6:
+			return header.ICMPv6ProtocolNumber, true, nil
 		case syscall.IPPROTO_UDP:
 			return header.UDPProtocolNumber, true, nil
 		case syscall.IPPROTO_TCP:
-- 
cgit v1.2.3


From 30794512d3977ebb2b185e5e9cfb969d558a07a4 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 19 Feb 2020 18:20:52 -0800
Subject: Add basic microbenchmarks.

PiperOrigin-RevId: 296104390
---
 WORKSPACE                                  |  10 +
 test/perf/BUILD                            | 114 +++++++
 test/perf/linux/BUILD                      | 356 +++++++++++++++++++++
 test/perf/linux/clock_getres_benchmark.cc  |  39 +++
 test/perf/linux/clock_gettime_benchmark.cc |  60 ++++
 test/perf/linux/death_benchmark.cc         |  36 +++
 test/perf/linux/epoll_benchmark.cc         |  99 ++++++
 test/perf/linux/fork_benchmark.cc          | 350 +++++++++++++++++++++
 test/perf/linux/futex_benchmark.cc         | 248 +++++++++++++++
 test/perf/linux/getdents_benchmark.cc      | 149 +++++++++
 test/perf/linux/getpid_benchmark.cc        |  37 +++
 test/perf/linux/gettid_benchmark.cc        |  38 +++
 test/perf/linux/mapping_benchmark.cc       | 163 ++++++++++
 test/perf/linux/open_benchmark.cc          |  56 ++++
 test/perf/linux/pipe_benchmark.cc          |  66 ++++
 test/perf/linux/randread_benchmark.cc      | 100 ++++++
 test/perf/linux/read_benchmark.cc          |  53 ++++
 test/perf/linux/sched_yield_benchmark.cc   |  37 +++
 test/perf/linux/send_recv_benchmark.cc     | 372 ++++++++++++++++++++++
 test/perf/linux/seqwrite_benchmark.cc      |  66 ++++
 test/perf/linux/signal_benchmark.cc        |  59 ++++
 test/perf/linux/sleep_benchmark.cc         |  60 ++++
 test/perf/linux/stat_benchmark.cc          |  62 ++++
 test/perf/linux/unlink_benchmark.cc        |  66 ++++
 test/perf/linux/write_benchmark.cc         |  52 ++++
 test/runner/BUILD                          |  22 ++
 test/runner/defs.bzl                       | 218 +++++++++++++
 test/runner/gtest/BUILD                    |   9 +
 test/runner/gtest/gtest.go                 | 154 +++++++++
 test/runner/runner.go                      | 477 ++++++++++++++++++++++++++++
 test/syscalls/BUILD                        |  21 +-
 test/syscalls/build_defs.bzl               | 180 -----------
 test/syscalls/gtest/BUILD                  |   9 -
 test/syscalls/gtest/gtest.go               |  93 ------
 test/syscalls/linux/alarm.cc               |   3 +-
 test/syscalls/linux/exec.cc                |   3 +-
 test/syscalls/linux/fcntl.cc               |   2 +-
 test/syscalls/linux/itimer.cc              |   3 +-
 test/syscalls/linux/prctl.cc               |   2 +-
 test/syscalls/linux/prctl_setuid.cc        |   2 +-
 test/syscalls/linux/proc.cc                |   2 +-
 test/syscalls/linux/ptrace.cc              |   2 +-
 test/syscalls/linux/rtsignal.cc            |   3 +-
 test/syscalls/linux/seccomp.cc             |   2 +-
 test/syscalls/linux/sigiret.cc             |   3 +-
 test/syscalls/linux/signalfd.cc            |   2 +-
 test/syscalls/linux/sigstop.cc             |   2 +-
 test/syscalls/linux/sigtimedwait.cc        |   3 +-
 test/syscalls/linux/timers.cc              |   2 +-
 test/syscalls/linux/vfork.cc               |   2 +-
 test/syscalls/syscall_test_runner.go       | 482 -----------------------------
 test/syscalls/syscall_test_runner.sh       |  34 --
 test/util/BUILD                            |   3 +-
 test/util/test_main.cc                     |   2 +-
 test/util/test_util.h                      |   1 +
 test/util/test_util_impl.cc                |  14 +
 tools/bazeldefs/defs.bzl                   |   1 +
 tools/defs.bzl                             |   3 +-
 58 files changed, 3666 insertions(+), 843 deletions(-)
 create mode 100644 test/perf/BUILD
 create mode 100644 test/perf/linux/BUILD
 create mode 100644 test/perf/linux/clock_getres_benchmark.cc
 create mode 100644 test/perf/linux/clock_gettime_benchmark.cc
 create mode 100644 test/perf/linux/death_benchmark.cc
 create mode 100644 test/perf/linux/epoll_benchmark.cc
 create mode 100644 test/perf/linux/fork_benchmark.cc
 create mode 100644 test/perf/linux/futex_benchmark.cc
 create mode 100644 test/perf/linux/getdents_benchmark.cc
 create mode 100644 test/perf/linux/getpid_benchmark.cc
 create mode 100644 test/perf/linux/gettid_benchmark.cc
 create mode 100644 test/perf/linux/mapping_benchmark.cc
 create mode 100644 test/perf/linux/open_benchmark.cc
 create mode 100644 test/perf/linux/pipe_benchmark.cc
 create mode 100644 test/perf/linux/randread_benchmark.cc
 create mode 100644 test/perf/linux/read_benchmark.cc
 create mode 100644 test/perf/linux/sched_yield_benchmark.cc
 create mode 100644 test/perf/linux/send_recv_benchmark.cc
 create mode 100644 test/perf/linux/seqwrite_benchmark.cc
 create mode 100644 test/perf/linux/signal_benchmark.cc
 create mode 100644 test/perf/linux/sleep_benchmark.cc
 create mode 100644 test/perf/linux/stat_benchmark.cc
 create mode 100644 test/perf/linux/unlink_benchmark.cc
 create mode 100644 test/perf/linux/write_benchmark.cc
 create mode 100644 test/runner/BUILD
 create mode 100644 test/runner/defs.bzl
 create mode 100644 test/runner/gtest/BUILD
 create mode 100644 test/runner/gtest/gtest.go
 create mode 100644 test/runner/runner.go
 delete mode 100644 test/syscalls/build_defs.bzl
 delete mode 100644 test/syscalls/gtest/BUILD
 delete mode 100644 test/syscalls/gtest/gtest.go
 delete mode 100644 test/syscalls/syscall_test_runner.go
 delete mode 100755 test/syscalls/syscall_test_runner.sh

diff --git a/WORKSPACE b/WORKSPACE
index 2827c3a26..ff0196dc6 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -330,3 +330,13 @@ http_archive(
         "https://github.com/google/googletest/archive/565f1b848215b77c3732bca345fe76a0431d8b34.tar.gz",
     ],
 )
+
+http_archive(
+    name = "com_google_benchmark",
+    sha256 = "3c6a165b6ecc948967a1ead710d4a181d7b0fbcaa183ef7ea84604994966221a",
+    strip_prefix = "benchmark-1.5.0",
+    urls = [
+        "https://mirror.bazel.build/github.com/google/benchmark/archive/v1.5.0.tar.gz",
+        "https://github.com/google/benchmark/archive/v1.5.0.tar.gz",
+    ],
+)
diff --git a/test/perf/BUILD b/test/perf/BUILD
new file mode 100644
index 000000000..7a2bf10ed
--- /dev/null
+++ b/test/perf/BUILD
@@ -0,0 +1,114 @@
+load("//test/runner:defs.bzl", "syscall_test")
+
+package(licenses = ["notice"])
+
+syscall_test(
+    test = "//test/perf/linux:clock_getres_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:clock_gettime_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:death_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:epoll_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:fork_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:futex_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:getdents_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:getpid_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:gettid_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:mapping_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:open_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:pipe_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:randread_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:read_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:sched_yield_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:send_recv_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:seqwrite_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:signal_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:sleep_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:stat_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:unlink_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:write_benchmark",
+)
diff --git a/test/perf/linux/BUILD b/test/perf/linux/BUILD
new file mode 100644
index 000000000..b4e907826
--- /dev/null
+++ b/test/perf/linux/BUILD
@@ -0,0 +1,356 @@
+load("//tools:defs.bzl", "cc_binary", "gbenchmark", "gtest")
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+cc_binary(
+    name = "getpid_benchmark",
+    testonly = 1,
+    srcs = [
+        "getpid_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "send_recv_benchmark",
+    testonly = 1,
+    srcs = [
+        "send_recv_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/syscalls/linux:socket_test_util",
+        "//test/util:file_descriptor",
+        "//test/util:logging",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_binary(
+    name = "gettid_benchmark",
+    testonly = 1,
+    srcs = [
+        "gettid_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "sched_yield_benchmark",
+    testonly = 1,
+    srcs = [
+        "sched_yield_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "clock_getres_benchmark",
+    testonly = 1,
+    srcs = [
+        "clock_getres_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "clock_gettime_benchmark",
+    testonly = 1,
+    srcs = [
+        "clock_gettime_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+        "@com_google_absl//absl/time",
+    ],
+)
+
+cc_binary(
+    name = "open_benchmark",
+    testonly = 1,
+    srcs = [
+        "open_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "read_benchmark",
+    testonly = 1,
+    srcs = [
+        "read_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "randread_benchmark",
+    testonly = 1,
+    srcs = [
+        "randread_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:file_descriptor",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/random",
+    ],
+)
+
+cc_binary(
+    name = "write_benchmark",
+    testonly = 1,
+    srcs = [
+        "write_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "seqwrite_benchmark",
+    testonly = 1,
+    srcs = [
+        "seqwrite_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/random",
+    ],
+)
+
+cc_binary(
+    name = "pipe_benchmark",
+    testonly = 1,
+    srcs = [
+        "pipe_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+    ],
+)
+
+cc_binary(
+    name = "fork_benchmark",
+    testonly = 1,
+    srcs = [
+        "fork_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:cleanup",
+        "//test/util:file_descriptor",
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_binary(
+    name = "futex_benchmark",
+    testonly = 1,
+    srcs = [
+        "futex_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/time",
+    ],
+)
+
+cc_binary(
+    name = "epoll_benchmark",
+    testonly = 1,
+    srcs = [
+        "epoll_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:epoll_util",
+        "//test/util:file_descriptor",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/time",
+    ],
+)
+
+cc_binary(
+    name = "death_benchmark",
+    testonly = 1,
+    srcs = [
+        "death_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "mapping_benchmark",
+    testonly = 1,
+    srcs = [
+        "mapping_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:memory_util",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "signal_benchmark",
+    testonly = 1,
+    srcs = [
+        "signal_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "getdents_benchmark",
+    testonly = 1,
+    srcs = [
+        "getdents_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "sleep_benchmark",
+    testonly = 1,
+    srcs = [
+        "sleep_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "stat_benchmark",
+    testonly = 1,
+    srcs = [
+        "stat_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_binary(
+    name = "unlink_benchmark",
+    testonly = 1,
+    srcs = [
+        "unlink_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
diff --git a/test/perf/linux/clock_getres_benchmark.cc b/test/perf/linux/clock_getres_benchmark.cc
new file mode 100644
index 000000000..b051293ad
--- /dev/null
+++ b/test/perf/linux/clock_getres_benchmark.cc
@@ -0,0 +1,39 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <time.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// clock_getres(1) is very nearly a no-op syscall, but it does require copying
+// out to a userspace struct. It thus provides a nice small copy-out benchmark.
+void BM_ClockGetRes(benchmark::State& state) {
+  struct timespec ts;
+  for (auto _ : state) {
+    clock_getres(CLOCK_MONOTONIC, &ts);
+  }
+}
+
+BENCHMARK(BM_ClockGetRes);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/clock_gettime_benchmark.cc b/test/perf/linux/clock_gettime_benchmark.cc
new file mode 100644
index 000000000..6691bebd9
--- /dev/null
+++ b/test/perf/linux/clock_gettime_benchmark.cc
@@ -0,0 +1,60 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <pthread.h>
+#include <time.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_ClockGettimeThreadCPUTime(benchmark::State& state) {
+  clockid_t clockid;
+  ASSERT_EQ(0, pthread_getcpuclockid(pthread_self(), &clockid));
+  struct timespec tp;
+
+  for (auto _ : state) {
+    clock_gettime(clockid, &tp);
+  }
+}
+
+BENCHMARK(BM_ClockGettimeThreadCPUTime);
+
+void BM_VDSOClockGettime(benchmark::State& state) {
+  const clockid_t clock = state.range(0);
+  struct timespec tp;
+  absl::Time start = absl::Now();
+
+  // Don't benchmark the calibration phase.
+  while (absl::Now() < start + absl::Milliseconds(2100)) {
+    clock_gettime(clock, &tp);
+  }
+
+  for (auto _ : state) {
+    clock_gettime(clock, &tp);
+  }
+}
+
+BENCHMARK(BM_VDSOClockGettime)->Arg(CLOCK_MONOTONIC)->Arg(CLOCK_REALTIME);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/death_benchmark.cc b/test/perf/linux/death_benchmark.cc
new file mode 100644
index 000000000..cb2b6fd07
--- /dev/null
+++ b/test/perf/linux/death_benchmark.cc
@@ -0,0 +1,36 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// DeathTest is not so much a microbenchmark as a macrobenchmark. It is testing
+// the ability of gVisor (on whatever platform) to execute all the related
+// stack-dumping routines associated with EXPECT_EXIT / EXPECT_DEATH.
+TEST(DeathTest, ZeroEqualsOne) {
+  EXPECT_EXIT({ TEST_CHECK(0 == 1); }, ::testing::KilledBySignal(SIGABRT), "");
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/epoll_benchmark.cc b/test/perf/linux/epoll_benchmark.cc
new file mode 100644
index 000000000..0b121338a
--- /dev/null
+++ b/test/perf/linux/epoll_benchmark.cc
@@ -0,0 +1,99 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+
+#include <atomic>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <ctime>
+#include <memory>
+
+#include "gtest/gtest.h"
+#include "absl/time/time.h"
+#include "benchmark/benchmark.h"
+#include "test/util/epoll_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Returns a new eventfd.
+PosixErrorOr<FileDescriptor> NewEventFD() {
+  int fd = eventfd(0, /* flags = */ 0);
+  MaybeSave();
+  if (fd < 0) {
+    return PosixError(errno, "eventfd");
+  }
+  return FileDescriptor(fd);
+}
+
+// Also stolen from epoll.cc unit tests.
+void BM_EpollTimeout(benchmark::State& state) {
+  constexpr int kFDsPerEpoll = 3;
+  auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+
+  std::vector<FileDescriptor> eventfds;
+  for (int i = 0; i < kFDsPerEpoll; i++) {
+    eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+    ASSERT_NO_ERRNO(
+        RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN, 0));
+  }
+
+  struct epoll_event result[kFDsPerEpoll];
+  int timeout_ms = state.range(0);
+
+  for (auto _ : state) {
+    EXPECT_EQ(0, epoll_wait(epollfd.get(), result, kFDsPerEpoll, timeout_ms));
+  }
+}
+
+BENCHMARK(BM_EpollTimeout)->Range(0, 8);
+
+// Also stolen from epoll.cc unit tests.
+void BM_EpollAllEvents(benchmark::State& state) {
+  auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+  const int fds_per_epoll = state.range(0);
+  constexpr uint64_t kEventVal = 5;
+
+  std::vector<FileDescriptor> eventfds;
+  for (int i = 0; i < fds_per_epoll; i++) {
+    eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+    ASSERT_NO_ERRNO(
+        RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN, 0));
+
+    ASSERT_THAT(WriteFd(eventfds[i].get(), &kEventVal, sizeof(kEventVal)),
+                SyscallSucceedsWithValue(sizeof(kEventVal)));
+  }
+
+  std::vector<struct epoll_event> result(fds_per_epoll);
+
+  for (auto _ : state) {
+    EXPECT_EQ(fds_per_epoll,
+              epoll_wait(epollfd.get(), result.data(), fds_per_epoll, 0));
+  }
+}
+
+BENCHMARK(BM_EpollAllEvents)->Range(2, 1024);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/fork_benchmark.cc b/test/perf/linux/fork_benchmark.cc
new file mode 100644
index 000000000..84fdbc8a0
--- /dev/null
+++ b/test/perf/linux/fork_benchmark.cc
@@ -0,0 +1,350 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/synchronization/barrier.h"
+#include "benchmark/benchmark.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int kBusyMax = 250;
+
+// Do some CPU-bound busy-work.
+int busy(int max) {
+  // Prevent the compiler from optimizing this work away,
+  volatile int count = 0;
+
+  for (int i = 1; i < max; i++) {
+    for (int j = 2; j < i / 2; j++) {
+      if (i % j == 0) {
+        count++;
+      }
+    }
+  }
+
+  return count;
+}
+
+void BM_CPUBoundUniprocess(benchmark::State& state) {
+  for (auto _ : state) {
+    busy(kBusyMax);
+  }
+}
+
+BENCHMARK(BM_CPUBoundUniprocess);
+
+void BM_CPUBoundAsymmetric(benchmark::State& state) {
+  const size_t max = state.max_iterations;
+  pid_t child = fork();
+  if (child == 0) {
+    for (int i = 0; i < max; i++) {
+      busy(kBusyMax);
+    }
+    _exit(0);
+  }
+  ASSERT_THAT(child, SyscallSucceeds());
+  ASSERT_TRUE(state.KeepRunningBatch(max));
+
+  int status;
+  EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+  EXPECT_TRUE(WIFEXITED(status));
+  EXPECT_EQ(0, WEXITSTATUS(status));
+  ASSERT_FALSE(state.KeepRunning());
+}
+
+BENCHMARK(BM_CPUBoundAsymmetric)->UseRealTime();
+
+void BM_CPUBoundSymmetric(benchmark::State& state) {
+  std::vector<pid_t> children;
+  auto child_cleanup = Cleanup([&] {
+    for (const pid_t child : children) {
+      int status;
+      EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+      EXPECT_TRUE(WIFEXITED(status));
+      EXPECT_EQ(0, WEXITSTATUS(status));
+    }
+    ASSERT_FALSE(state.KeepRunning());
+  });
+
+  const int processes = state.range(0);
+  for (int i = 0; i < processes; i++) {
+    size_t cur = (state.max_iterations + (processes - 1)) / processes;
+    if ((state.iterations() + cur) >= state.max_iterations) {
+      cur = state.max_iterations - state.iterations();
+    }
+    pid_t child = fork();
+    if (child == 0) {
+      for (int i = 0; i < cur; i++) {
+        busy(kBusyMax);
+      }
+      _exit(0);
+    }
+    ASSERT_THAT(child, SyscallSucceeds());
+    if (cur > 0) {
+      // We can have a zero cur here, depending.
+      ASSERT_TRUE(state.KeepRunningBatch(cur));
+    }
+    children.push_back(child);
+  }
+}
+
+BENCHMARK(BM_CPUBoundSymmetric)->Range(2, 16)->UseRealTime();
+
+// Child routine for ProcessSwitch/ThreadSwitch.
+// Reads from readfd and writes the result to writefd.
+void SwitchChild(int readfd, int writefd) {
+  while (1) {
+    char buf;
+    int ret = ReadFd(readfd, &buf, 1);
+    if (ret == 0) {
+      break;
+    }
+    TEST_CHECK_MSG(ret == 1, "read failed");
+
+    ret = WriteFd(writefd, &buf, 1);
+    if (ret == -1) {
+      TEST_CHECK_MSG(errno == EPIPE, "unexpected write failure");
+      break;
+    }
+    TEST_CHECK_MSG(ret == 1, "write failed");
+  }
+}
+
+// Send bytes in a loop through a series of pipes, each passing through a
+// different process.
+//
+//  Proc 0        Proc 1
+//    * ----------> *
+//    ^   Pipe 1    |
+//    |             |
+//    | Pipe 0      | Pipe 2
+//    |             |
+//    |             |
+//    |   Pipe 3    v
+//    * <---------- *
+//  Proc 3        Proc 2
+//
+// This exercises context switching through multiple processes.
+void BM_ProcessSwitch(benchmark::State& state) {
+  // Code below assumes there are at least two processes.
+  const int num_processes = state.range(0);
+  ASSERT_GE(num_processes, 2);
+
+  std::vector<pid_t> children;
+  auto child_cleanup = Cleanup([&] {
+    for (const pid_t child : children) {
+      int status;
+      EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+      EXPECT_TRUE(WIFEXITED(status));
+      EXPECT_EQ(0, WEXITSTATUS(status));
+    }
+  });
+
+  // Must come after children, as the FDs must be closed before the children
+  // will exit.
+  std::vector<FileDescriptor> read_fds;
+  std::vector<FileDescriptor> write_fds;
+
+  for (int i = 0; i < num_processes; i++) {
+    int fds[2];
+    ASSERT_THAT(pipe(fds), SyscallSucceeds());
+    read_fds.emplace_back(fds[0]);
+    write_fds.emplace_back(fds[1]);
+  }
+
+  // This process is one of the processes in the loop. It will be considered
+  // index 0.
+  for (int i = 1; i < num_processes; i++) {
+    // Read from current pipe index, write to next.
+    const int read_index = i;
+    const int read_fd = read_fds[read_index].get();
+
+    const int write_index = (i + 1) % num_processes;
+    const int write_fd = write_fds[write_index].get();
+
+    // std::vector isn't safe to use from the fork child.
+    FileDescriptor* read_array = read_fds.data();
+    FileDescriptor* write_array = write_fds.data();
+
+    pid_t child = fork();
+    if (!child) {
+      // Close all other FDs.
+      for (int j = 0; j < num_processes; j++) {
+        if (j != read_index) {
+          read_array[j].reset();
+        }
+        if (j != write_index) {
+          write_array[j].reset();
+        }
+      }
+
+      SwitchChild(read_fd, write_fd);
+      _exit(0);
+    }
+    ASSERT_THAT(child, SyscallSucceeds());
+    children.push_back(child);
+  }
+
+  // Read from current pipe index (0), write to next (1).
+  const int read_index = 0;
+  const int read_fd = read_fds[read_index].get();
+
+  const int write_index = 1;
+  const int write_fd = write_fds[write_index].get();
+
+  // Kick start the loop.
+  char buf = 'a';
+  ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+
+  for (auto _ : state) {
+    ASSERT_THAT(ReadFd(read_fd, &buf, 1), SyscallSucceedsWithValue(1));
+    ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+  }
+}
+
+BENCHMARK(BM_ProcessSwitch)->Range(2, 16)->UseRealTime();
+
+// Equivalent to BM_ThreadSwitch using threads instead of processes.
+void BM_ThreadSwitch(benchmark::State& state) {
+  // Code below assumes there are at least two threads.
+  const int num_threads = state.range(0);
+  ASSERT_GE(num_threads, 2);
+
+  // Must come after threads, as the FDs must be closed before the children
+  // will exit.
+  std::vector<std::unique_ptr<ScopedThread>> threads;
+  std::vector<FileDescriptor> read_fds;
+  std::vector<FileDescriptor> write_fds;
+
+  for (int i = 0; i < num_threads; i++) {
+    int fds[2];
+    ASSERT_THAT(pipe(fds), SyscallSucceeds());
+    read_fds.emplace_back(fds[0]);
+    write_fds.emplace_back(fds[1]);
+  }
+
+  // This thread is one of the threads in the loop. It will be considered
+  // index 0.
+  for (int i = 1; i < num_threads; i++) {
+    // Read from current pipe index, write to next.
+    //
+    // Transfer ownership of the FDs to the thread.
+    const int read_index = i;
+    const int read_fd = read_fds[read_index].release();
+
+    const int write_index = (i + 1) % num_threads;
+    const int write_fd = write_fds[write_index].release();
+
+    threads.emplace_back(std::make_unique<ScopedThread>([read_fd, write_fd] {
+      FileDescriptor read(read_fd);
+      FileDescriptor write(write_fd);
+      SwitchChild(read.get(), write.get());
+    }));
+  }
+
+  // Read from current pipe index (0), write to next (1).
+  const int read_index = 0;
+  const int read_fd = read_fds[read_index].get();
+
+  const int write_index = 1;
+  const int write_fd = write_fds[write_index].get();
+
+  // Kick start the loop.
+  char buf = 'a';
+  ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+
+  for (auto _ : state) {
+    ASSERT_THAT(ReadFd(read_fd, &buf, 1), SyscallSucceedsWithValue(1));
+    ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+  }
+
+  // The two FDs still owned by this thread are closed, causing the next thread
+  // to exit its loop and close its FDs, and so on until all threads exit.
+}
+
+BENCHMARK(BM_ThreadSwitch)->Range(2, 16)->UseRealTime();
+
+void BM_ThreadStart(benchmark::State& state) {
+  const int num_threads = state.range(0);
+
+  for (auto _ : state) {
+    state.PauseTiming();
+
+    auto barrier = new absl::Barrier(num_threads + 1);
+    std::vector<std::unique_ptr<ScopedThread>> threads;
+
+    state.ResumeTiming();
+
+    for (size_t i = 0; i < num_threads; ++i) {
+      threads.emplace_back(std::make_unique<ScopedThread>([barrier] {
+        if (barrier->Block()) {
+          delete barrier;
+        }
+      }));
+    }
+
+    if (barrier->Block()) {
+      delete barrier;
+    }
+
+    state.PauseTiming();
+
+    for (const auto& thread : threads) {
+      thread->Join();
+    }
+
+    state.ResumeTiming();
+  }
+}
+
+BENCHMARK(BM_ThreadStart)->Range(1, 2048)->UseRealTime();
+
+// Benchmark the complete fork + exit + wait.
+void BM_ProcessLifecycle(benchmark::State& state) {
+  const int num_procs = state.range(0);
+
+  std::vector<pid_t> pids(num_procs);
+  for (auto _ : state) {
+    for (size_t i = 0; i < num_procs; ++i) {
+      int pid = fork();
+      if (pid == 0) {
+        _exit(0);
+      }
+      ASSERT_THAT(pid, SyscallSucceeds());
+      pids[i] = pid;
+    }
+
+    for (const int pid : pids) {
+      ASSERT_THAT(RetryEINTR(waitpid)(pid, nullptr, 0),
+                  SyscallSucceedsWithValue(pid));
+    }
+  }
+}
+
+BENCHMARK(BM_ProcessLifecycle)->Range(1, 512)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/futex_benchmark.cc b/test/perf/linux/futex_benchmark.cc
new file mode 100644
index 000000000..b349d50bf
--- /dev/null
+++ b/test/perf/linux/futex_benchmark.cc
@@ -0,0 +1,248 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/futex.h>
+
+#include <atomic>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <ctime>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+inline int FutexWait(std::atomic<int32_t>* v, int32_t val) {
+  return syscall(SYS_futex, v, FUTEX_BITSET_MATCH_ANY, nullptr);
+}
+
+inline int FutexWaitRelativeTimeout(std::atomic<int32_t>* v, int32_t val,
+                                    const struct timespec* reltime) {
+  return syscall(SYS_futex, v, FUTEX_WAIT_PRIVATE, reltime);
+}
+
+inline int FutexWaitAbsoluteTimeout(std::atomic<int32_t>* v, int32_t val,
+                                    const struct timespec* abstime) {
+  return syscall(SYS_futex, v, FUTEX_BITSET_MATCH_ANY, abstime);
+}
+
+inline int FutexWaitBitsetAbsoluteTimeout(std::atomic<int32_t>* v, int32_t val,
+                                          int32_t bits,
+                                          const struct timespec* abstime) {
+  return syscall(SYS_futex, v, FUTEX_WAIT_BITSET_PRIVATE | FUTEX_CLOCK_REALTIME,
+                 val, abstime, nullptr, bits);
+}
+
+inline int FutexWake(std::atomic<int32_t>* v, int32_t count) {
+  return syscall(SYS_futex, v, FUTEX_WAKE_PRIVATE, count);
+}
+
+// This just uses FUTEX_WAKE on an address with nothing waiting, very simple.
+void BM_FutexWakeNop(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+
+  for (auto _ : state) {
+    EXPECT_EQ(0, FutexWake(&v, 1));
+  }
+}
+
+BENCHMARK(BM_FutexWakeNop);
+
+// This just uses FUTEX_WAIT on an address whose value has changed, i.e., the
+// syscall won't wait.
+void BM_FutexWaitNop(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+
+  for (auto _ : state) {
+    EXPECT_EQ(-EAGAIN, FutexWait(&v, 1));
+  }
+}
+
+BENCHMARK(BM_FutexWaitNop);
+
+// This uses FUTEX_WAIT with a timeout on an address whose value never
+// changes, such that it always times out. Timeout overhead can be estimated by
+// timer overruns for short timeouts.
+void BM_FutexWaitTimeout(benchmark::State& state) {
+  const int timeout_ns = state.range(0);
+  std::atomic<int32_t> v(0);
+  auto ts = absl::ToTimespec(absl::Nanoseconds(timeout_ns));
+
+  for (auto _ : state) {
+    EXPECT_EQ(-ETIMEDOUT, FutexWaitRelativeTimeout(&v, 0, &ts));
+  }
+}
+
+BENCHMARK(BM_FutexWaitTimeout)
+    ->Arg(1)
+    ->Arg(10)
+    ->Arg(100)
+    ->Arg(1000)
+    ->Arg(10000);
+
+// This calls FUTEX_WAIT_BITSET with CLOCK_REALTIME.
+void BM_FutexWaitBitset(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+  int timeout_ns = state.range(0);
+  auto ts = absl::ToTimespec(absl::Nanoseconds(timeout_ns));
+  for (auto _ : state) {
+    EXPECT_EQ(-ETIMEDOUT, FutexWaitBitsetAbsoluteTimeout(&v, 0, 1, &ts));
+  }
+}
+
+BENCHMARK(BM_FutexWaitBitset)->Range(0, 100000);
+
+int64_t GetCurrentMonotonicTimeNanos() {
+  struct timespec ts;
+  TEST_CHECK(clock_gettime(CLOCK_MONOTONIC, &ts) != -1);
+  return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+}
+
+void SpinNanos(int64_t delay_ns) {
+  if (delay_ns <= 0) {
+    return;
+  }
+  const int64_t end = GetCurrentMonotonicTimeNanos() + delay_ns;
+  while (GetCurrentMonotonicTimeNanos() < end) {
+    // spin
+  }
+}
+
+// Each iteration of FutexRoundtripDelayed involves a thread sending a futex
+// wakeup to another thread, which spins for delay_us and then sends a futex
+// wakeup back. The time per iteration is 2*  (delay_us + kBeforeWakeDelayNs +
+// futex/scheduling overhead).
+void BM_FutexRoundtripDelayed(benchmark::State& state) {
+  const int delay_us = state.range(0);
+
+  const int64_t delay_ns = delay_us * 1000;
+  // Spin for an extra kBeforeWakeDelayNs before invoking FUTEX_WAKE to reduce
+  // the probability that the wakeup comes before the wait, preventing the wait
+  // from ever taking effect and causing the benchmark to underestimate the
+  // actual wakeup time.
+  constexpr int64_t kBeforeWakeDelayNs = 500;
+  std::atomic<int32_t> v(0);
+  ScopedThread t([&] {
+    for (int i = 0; i < state.max_iterations; i++) {
+      SpinNanos(delay_ns);
+      while (v.load(std::memory_order_acquire) == 0) {
+        FutexWait(&v, 0);
+      }
+      SpinNanos(kBeforeWakeDelayNs + delay_ns);
+      v.store(0, std::memory_order_release);
+      FutexWake(&v, 1);
+    }
+  });
+  for (auto _ : state) {
+    SpinNanos(kBeforeWakeDelayNs + delay_ns);
+    v.store(1, std::memory_order_release);
+    FutexWake(&v, 1);
+    SpinNanos(delay_ns);
+    while (v.load(std::memory_order_acquire) == 1) {
+      FutexWait(&v, 1);
+    }
+  }
+}
+
+BENCHMARK(BM_FutexRoundtripDelayed)
+    ->Arg(0)
+    ->Arg(10)
+    ->Arg(20)
+    ->Arg(50)
+    ->Arg(100);
+
+// FutexLock is a simple, dumb futex based lock implementation.
+// It will try to acquire the lock by atomically incrementing the
+// lock word. If it did not increment the lock from 0 to 1, someone
+// else has the lock, so it will FUTEX_WAIT until it is woken in
+// the unlock path.
+class FutexLock {
+ public:
+  FutexLock() : lock_word_(0) {}
+
+  void lock(struct timespec* deadline) {
+    int32_t val;
+    while ((val = lock_word_.fetch_add(1, std::memory_order_acquire) + 1) !=
+           1) {
+      // If we didn't get the lock by incrementing from 0 to 1,
+      // do a FUTEX_WAIT with the desired current value set to
+      // val. If val is no longer what the atomic increment returned,
+      // someone might have set it to 0 so we can try to acquire
+      // again.
+      int ret = FutexWaitAbsoluteTimeout(&lock_word_, val, deadline);
+      if (ret == 0 || ret == -EWOULDBLOCK || ret == -EINTR) {
+        continue;
+      } else {
+        FAIL() << "unexpected FUTEX_WAIT return: " << ret;
+      }
+    }
+  }
+
+  void unlock() {
+    // Store 0 into the lock word and wake one waiter. We intentionally
+    // ignore the return value of the FUTEX_WAKE here, since there may be
+    // no waiters to wake anyway.
+    lock_word_.store(0, std::memory_order_release);
+    (void)FutexWake(&lock_word_, 1);
+  }
+
+ private:
+  std::atomic<int32_t> lock_word_;
+};
+
+FutexLock* test_lock;  // Used below.
+
+void FutexContend(benchmark::State& state, int thread_index,
+                  struct timespec* deadline) {
+  int counter = 0;
+  if (thread_index == 0) {
+    test_lock = new FutexLock();
+  }
+  for (auto _ : state) {
+    test_lock->lock(deadline);
+    counter++;
+    test_lock->unlock();
+  }
+  if (thread_index == 0) {
+    delete test_lock;
+  }
+  state.SetItemsProcessed(state.iterations());
+}
+
+void BM_FutexContend(benchmark::State& state) {
+  FutexContend(state, state.thread_index, nullptr);
+}
+
+BENCHMARK(BM_FutexContend)->ThreadRange(1, 1024)->UseRealTime();
+
+void BM_FutexDeadlineContend(benchmark::State& state) {
+  auto deadline = absl::ToTimespec(absl::Now() + absl::Minutes(10));
+  FutexContend(state, state.thread_index, &deadline);
+}
+
+BENCHMARK(BM_FutexDeadlineContend)->ThreadRange(1, 1024)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/getdents_benchmark.cc b/test/perf/linux/getdents_benchmark.cc
new file mode 100644
index 000000000..0e03975b4
--- /dev/null
+++ b/test/perf/linux/getdents_benchmark.cc
@@ -0,0 +1,149 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+#ifndef SYS_getdents64
+#if defined(__x86_64__)
+#define SYS_getdents64 217
+#elif defined(__aarch64__)
+#define SYS_getdents64 217
+#else
+#error "Unknown architecture"
+#endif
+#endif  // SYS_getdents64
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int kBufferSize = 16384;
+
+PosixErrorOr<TempPath> CreateDirectory(int count,
+                                       std::vector<std::string>* files) {
+  ASSIGN_OR_RETURN_ERRNO(TempPath dir, TempPath::CreateDir());
+
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor dfd,
+                         Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+  for (int i = 0; i < count; i++) {
+    auto file = NewTempRelPath();
+    auto res = MknodAt(dfd, file, S_IFREG | 0644, 0);
+    RETURN_IF_ERRNO(res);
+    files->push_back(file);
+  }
+
+  return std::move(dir);
+}
+
+PosixError CleanupDirectory(const TempPath& dir,
+                            std::vector<std::string>* files) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor dfd,
+                         Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+  for (auto it = files->begin(); it != files->end(); ++it) {
+    auto res = UnlinkAt(dfd, *it, 0);
+    RETURN_IF_ERRNO(res);
+  }
+  return NoError();
+}
+
+// Creates a directory containing `files` files, and reads all the directory
+// entries from the directory using a single FD.
+void BM_GetdentsSameFD(benchmark::State& state) {
+  // Create directory with given files.
+  const int count = state.range(0);
+
+  // Keep a vector of all of the file TempPaths that is destroyed before dir.
+  //
+  // Normally, we'd simply allow dir to recursively clean up the contained
+  // files, but that recursive cleanup uses getdents, which may be very slow in
+  // extreme benchmarks.
+  TempPath dir;
+  std::vector<std::string> files;
+  dir = ASSERT_NO_ERRNO_AND_VALUE(CreateDirectory(count, &files));
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY));
+  char buffer[kBufferSize];
+
+  // We read all directory entries on each iteration, but report this as a
+  // "batch" iteration so that reported times are per file.
+  while (state.KeepRunningBatch(count)) {
+    ASSERT_THAT(lseek(fd.get(), 0, SEEK_SET), SyscallSucceeds());
+
+    int ret;
+    do {
+      ASSERT_THAT(ret = syscall(SYS_getdents64, fd.get(), buffer, kBufferSize),
+                  SyscallSucceeds());
+    } while (ret > 0);
+  }
+
+  ASSERT_NO_ERRNO(CleanupDirectory(dir, &files));
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_GetdentsSameFD)->Range(1, 1 << 16)->UseRealTime();
+
+// Creates a directory containing `files` files, and reads all the directory
+// entries from the directory using a new FD each time.
+void BM_GetdentsNewFD(benchmark::State& state) {
+  // Create directory with given files.
+  const int count = state.range(0);
+
+  // Keep a vector of all of the file TempPaths that is destroyed before dir.
+  //
+  // Normally, we'd simply allow dir to recursively clean up the contained
+  // files, but that recursive cleanup uses getdents, which may be very slow in
+  // extreme benchmarks.
+  TempPath dir;
+  std::vector<std::string> files;
+  dir = ASSERT_NO_ERRNO_AND_VALUE(CreateDirectory(count, &files));
+  char buffer[kBufferSize];
+
+  // We read all directory entries on each iteration, but report this as a
+  // "batch" iteration so that reported times are per file.
+  while (state.KeepRunningBatch(count)) {
+    FileDescriptor fd =
+        ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+    int ret;
+    do {
+      ASSERT_THAT(ret = syscall(SYS_getdents64, fd.get(), buffer, kBufferSize),
+                  SyscallSucceeds());
+    } while (ret > 0);
+  }
+
+  ASSERT_NO_ERRNO(CleanupDirectory(dir, &files));
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_GetdentsNewFD)->Range(1, 1 << 16)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/getpid_benchmark.cc b/test/perf/linux/getpid_benchmark.cc
new file mode 100644
index 000000000..db74cb264
--- /dev/null
+++ b/test/perf/linux/getpid_benchmark.cc
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Getpid(benchmark::State& state) {
+  for (auto _ : state) {
+    syscall(SYS_getpid);
+  }
+}
+
+BENCHMARK(BM_Getpid);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/gettid_benchmark.cc b/test/perf/linux/gettid_benchmark.cc
new file mode 100644
index 000000000..8f4961f5e
--- /dev/null
+++ b/test/perf/linux/gettid_benchmark.cc
@@ -0,0 +1,38 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Gettid(benchmark::State& state) {
+  for (auto _ : state) {
+    syscall(SYS_gettid);
+  }
+}
+
+BENCHMARK(BM_Gettid)->ThreadRange(1, 4000)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/mapping_benchmark.cc b/test/perf/linux/mapping_benchmark.cc
new file mode 100644
index 000000000..39c30fe69
--- /dev/null
+++ b/test/perf/linux/mapping_benchmark.cc
@@ -0,0 +1,163 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/memory_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Conservative value for /proc/sys/vm/max_map_count, which limits the number of
+// VMAs, minus a safety margin for VMAs that already exist for the test binary.
+// The default value for max_map_count is
+// include/linux/mm.h:DEFAULT_MAX_MAP_COUNT = 65530.
+constexpr size_t kMaxVMAs = 64001;
+
+// Map then unmap pages without touching them.
+void BM_MapUnmap(benchmark::State& state) {
+  // Number of pages to map.
+  const int pages = state.range(0);
+
+  while (state.KeepRunning()) {
+    void* addr = mmap(0, pages * kPageSize, PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    TEST_CHECK_MSG(addr != MAP_FAILED, "mmap failed");
+
+    int ret = munmap(addr, pages * kPageSize);
+    TEST_CHECK_MSG(ret == 0, "munmap failed");
+  }
+}
+
+BENCHMARK(BM_MapUnmap)->Range(1, 1 << 17)->UseRealTime();
+
+// Map, touch, then unmap pages.
+void BM_MapTouchUnmap(benchmark::State& state) {
+  // Number of pages to map.
+  const int pages = state.range(0);
+
+  while (state.KeepRunning()) {
+    void* addr = mmap(0, pages * kPageSize, PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    TEST_CHECK_MSG(addr != MAP_FAILED, "mmap failed");
+
+    char* c = reinterpret_cast<char*>(addr);
+    char* end = c + pages * kPageSize;
+    while (c < end) {
+      *c = 42;
+      c += kPageSize;
+    }
+
+    int ret = munmap(addr, pages * kPageSize);
+    TEST_CHECK_MSG(ret == 0, "munmap failed");
+  }
+}
+
+BENCHMARK(BM_MapTouchUnmap)->Range(1, 1 << 17)->UseRealTime();
+
+// Map and touch many pages, unmapping all at once.
+//
+// NOTE(b/111429208): This is a regression test to ensure performant mapping and
+// allocation even with tons of mappings.
+void BM_MapTouchMany(benchmark::State& state) {
+  // Number of pages to map.
+  const int page_count = state.range(0);
+
+  while (state.KeepRunning()) {
+    std::vector<void*> pages;
+
+    for (int i = 0; i < page_count; i++) {
+      void* addr = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+      TEST_CHECK_MSG(addr != MAP_FAILED, "mmap failed");
+
+      char* c = reinterpret_cast<char*>(addr);
+      *c = 42;
+
+      pages.push_back(addr);
+    }
+
+    for (void* addr : pages) {
+      int ret = munmap(addr, kPageSize);
+      TEST_CHECK_MSG(ret == 0, "munmap failed");
+    }
+  }
+
+  state.SetBytesProcessed(kPageSize * page_count * state.iterations());
+}
+
+BENCHMARK(BM_MapTouchMany)->Range(1, 1 << 12)->UseRealTime();
+
+void BM_PageFault(benchmark::State& state) {
+  // Map the region in which we will take page faults. To ensure that each page
+  // fault maps only a single page, each page we touch must correspond to a
+  // distinct VMA. Thus we need a 1-page gap between each 1-page VMA. However,
+  // each gap consists of a PROT_NONE VMA, instead of an unmapped hole, so that
+  // if there are background threads running, they can't inadvertently creating
+  // mappings in our gaps that are unmapped when the test ends.
+  size_t test_pages = kMaxVMAs;
+  // Ensure that test_pages is odd, since we want the test region to both
+  // begin and end with a mapped page.
+  if (test_pages % 2 == 0) {
+    test_pages--;
+  }
+  const size_t test_region_bytes = test_pages * kPageSize;
+  // Use MAP_SHARED here because madvise(MADV_DONTNEED) on private mappings on
+  // gVisor won't force future sentry page faults (by design). Use MAP_POPULATE
+  // so that Linux pre-allocates the shmem file used to back the mapping.
+  Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+      MmapAnon(test_region_bytes, PROT_READ, MAP_SHARED | MAP_POPULATE));
+  for (size_t i = 0; i < test_pages / 2; i++) {
+    ASSERT_THAT(
+        mprotect(reinterpret_cast<void*>(m.addr() + ((2 * i + 1) * kPageSize)),
+                 kPageSize, PROT_NONE),
+        SyscallSucceeds());
+  }
+
+  const size_t mapped_pages = test_pages / 2 + 1;
+  // "Start" at the end of the mapped region to force the mapped region to be
+  // reset, since we mapped it with MAP_POPULATE.
+  size_t cur_page = mapped_pages;
+  for (auto _ : state) {
+    if (cur_page >= mapped_pages) {
+      // We've reached the end of our mapped region and have to reset it to
+      // incur page faults again.
+      state.PauseTiming();
+      ASSERT_THAT(madvise(m.ptr(), test_region_bytes, MADV_DONTNEED),
+                  SyscallSucceeds());
+      cur_page = 0;
+      state.ResumeTiming();
+    }
+    const uintptr_t addr = m.addr() + (2 * cur_page * kPageSize);
+    const char c = *reinterpret_cast<volatile char*>(addr);
+    benchmark::DoNotOptimize(c);
+    cur_page++;
+  }
+}
+
+BENCHMARK(BM_PageFault)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/open_benchmark.cc b/test/perf/linux/open_benchmark.cc
new file mode 100644
index 000000000..68008f6d5
--- /dev/null
+++ b/test/perf/linux/open_benchmark.cc
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Open(benchmark::State& state) {
+  const int size = state.range(0);
+  std::vector<TempPath> cache;
+  for (int i = 0; i < size; i++) {
+    auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+    cache.emplace_back(std::move(path));
+  }
+
+  unsigned int seed = 1;
+  for (auto _ : state) {
+    const int chosen = rand_r(&seed) % size;
+    int fd = open(cache[chosen].path().c_str(), O_RDONLY);
+    TEST_CHECK(fd != -1);
+    close(fd);
+  }
+}
+
+BENCHMARK(BM_Open)->Range(1, 128)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/pipe_benchmark.cc b/test/perf/linux/pipe_benchmark.cc
new file mode 100644
index 000000000..8f5f6a2a3
--- /dev/null
+++ b/test/perf/linux/pipe_benchmark.cc
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cerrno>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Pipe(benchmark::State& state) {
+  int fds[2];
+  TEST_CHECK(pipe(fds) == 0);
+
+  const int size = state.range(0);
+  std::vector<char> wbuf(size);
+  std::vector<char> rbuf(size);
+  RandomizeBuffer(wbuf.data(), size);
+
+  ScopedThread t([&] {
+    auto const fd = fds[1];
+    for (int i = 0; i < state.max_iterations; i++) {
+      TEST_CHECK(WriteFd(fd, wbuf.data(), wbuf.size()) == size);
+    }
+  });
+
+  for (auto _ : state) {
+    TEST_CHECK(ReadFd(fds[0], rbuf.data(), rbuf.size()) == size);
+  }
+
+  t.Join();
+
+  close(fds[0]);
+  close(fds[1]);
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_Pipe)->Range(1, 1 << 20)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/randread_benchmark.cc b/test/perf/linux/randread_benchmark.cc
new file mode 100644
index 000000000..b0eb8c24e
--- /dev/null
+++ b/test/perf/linux/randread_benchmark.cc
@@ -0,0 +1,100 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Create a 1GB file that will be read from at random positions. This should
+// invalid any performance gains from caching.
+const uint64_t kFileSize = 1ULL << 30;
+
+// How many bytes to write at once to initialize the file used to read from.
+const uint32_t kWriteSize = 65536;
+
+// Largest benchmarked read unit.
+const uint32_t kMaxRead = 1UL << 26;
+
+TempPath CreateFile(uint64_t file_size) {
+  auto path = TempPath::CreateFile().ValueOrDie();
+  FileDescriptor fd = Open(path.path(), O_WRONLY).ValueOrDie();
+
+  // Try to minimize syscalls by using maximum size writev() requests.
+  std::vector<char> buffer(kWriteSize);
+  RandomizeBuffer(buffer.data(), buffer.size());
+  const std::vector<std::vector<struct iovec>> iovecs_list =
+      GenerateIovecs(file_size, buffer.data(), buffer.size());
+  for (const auto& iovecs : iovecs_list) {
+    TEST_CHECK(writev(fd.get(), iovecs.data(), iovecs.size()) >= 0);
+  }
+
+  return path;
+}
+
+// Global test state, initialized once per process lifetime.
+struct GlobalState {
+  const TempPath tmpfile;
+  explicit GlobalState(TempPath tfile) : tmpfile(std::move(tfile)) {}
+};
+
+GlobalState& GetGlobalState() {
+  // This gets created only once throughout the lifetime of the process.
+  // Use a dynamically allocated object (that is never deleted) to avoid order
+  // of destruction of static storage variables issues.
+  static GlobalState* const state =
+      // The actual file size is the maximum random seek range (kFileSize) + the
+      // maximum read size so we can read that number of bytes at the end of the
+      // file.
+      new GlobalState(CreateFile(kFileSize + kMaxRead));
+  return *state;
+}
+
+void BM_RandRead(benchmark::State& state) {
+  const int size = state.range(0);
+
+  GlobalState& global_state = GetGlobalState();
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(global_state.tmpfile.path(), O_RDONLY));
+  std::vector<char> buf(size);
+
+  unsigned int seed = 1;
+  for (auto _ : state) {
+    TEST_CHECK(PreadFd(fd.get(), buf.data(), buf.size(),
+                       rand_r(&seed) % kFileSize) == size);
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_RandRead)->Range(1, kMaxRead)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/read_benchmark.cc b/test/perf/linux/read_benchmark.cc
new file mode 100644
index 000000000..62445867d
--- /dev/null
+++ b/test/perf/linux/read_benchmark.cc
@@ -0,0 +1,53 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Read(benchmark::State& state) {
+  const int size = state.range(0);
+  const std::string contents(size, 0);
+  auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), contents, TempPath::kDefaultFileMode));
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDONLY));
+
+  std::vector<char> buf(size);
+  for (auto _ : state) {
+    TEST_CHECK(PreadFd(fd.get(), buf.data(), buf.size(), 0) == size);
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_Read)->Range(1, 1 << 26)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/sched_yield_benchmark.cc b/test/perf/linux/sched_yield_benchmark.cc
new file mode 100644
index 000000000..6756b5575
--- /dev/null
+++ b/test/perf/linux/sched_yield_benchmark.cc
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sched.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Sched_yield(benchmark::State& state) {
+  for (auto ignored : state) {
+    TEST_CHECK(sched_yield() == 0);
+  }
+}
+
+BENCHMARK(BM_Sched_yield)->ThreadRange(1, 2000)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/send_recv_benchmark.cc b/test/perf/linux/send_recv_benchmark.cc
new file mode 100644
index 000000000..d73e49523
--- /dev/null
+++ b/test/perf/linux/send_recv_benchmark.cc
@@ -0,0 +1,372 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+
+#include <cstring>
+
+#include "gtest/gtest.h"
+#include "absl/synchronization/notification.h"
+#include "benchmark/benchmark.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr ssize_t kMessageSize = 1024;
+
+class Message {
+ public:
+  explicit Message(int byte = 0) : Message(byte, kMessageSize, 0) {}
+
+  explicit Message(int byte, int sz) : Message(byte, sz, 0) {}
+
+  explicit Message(int byte, int sz, int cmsg_sz)
+      : buffer_(sz, byte), cmsg_buffer_(cmsg_sz, 0) {
+    iov_.iov_base = buffer_.data();
+    iov_.iov_len = sz;
+    hdr_.msg_iov = &iov_;
+    hdr_.msg_iovlen = 1;
+    hdr_.msg_control = cmsg_buffer_.data();
+    hdr_.msg_controllen = cmsg_sz;
+  }
+
+  struct msghdr* header() {
+    return &hdr_;
+  }
+
+ private:
+  std::vector<char> buffer_;
+  std::vector<char> cmsg_buffer_;
+  struct iovec iov_ = {};
+  struct msghdr hdr_ = {};
+};
+
+void BM_Recvmsg(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  Message send_msg('a'), recv_msg;
+
+  ScopedThread t([&send_msg, &send_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      sendmsg(send_socket.get(), send_msg.header(), 0);
+    }
+  });
+
+  int64_t bytes_received = 0;
+  for (auto ignored : state) {
+    int n = recvmsg(recv_socket.get(), recv_msg.header(), 0);
+    TEST_CHECK(n > 0);
+    bytes_received += n;
+  }
+
+  notification.Notify();
+  recv_socket.reset();
+
+  state.SetBytesProcessed(bytes_received);
+}
+
+BENCHMARK(BM_Recvmsg)->UseRealTime();
+
+void BM_Sendmsg(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  Message send_msg('a'), recv_msg;
+
+  ScopedThread t([&recv_msg, &recv_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      recvmsg(recv_socket.get(), recv_msg.header(), 0);
+    }
+  });
+
+  int64_t bytes_sent = 0;
+  for (auto ignored : state) {
+    int n = sendmsg(send_socket.get(), send_msg.header(), 0);
+    TEST_CHECK(n > 0);
+    bytes_sent += n;
+  }
+
+  notification.Notify();
+  send_socket.reset();
+
+  state.SetBytesProcessed(bytes_sent);
+}
+
+BENCHMARK(BM_Sendmsg)->UseRealTime();
+
+void BM_Recvfrom(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  char send_buffer[kMessageSize], recv_buffer[kMessageSize];
+
+  ScopedThread t([&send_socket, &send_buffer, &notification] {
+    while (!notification.HasBeenNotified()) {
+      sendto(send_socket.get(), send_buffer, kMessageSize, 0, nullptr, 0);
+    }
+  });
+
+  int bytes_received = 0;
+  for (auto ignored : state) {
+    int n = recvfrom(recv_socket.get(), recv_buffer, kMessageSize, 0, nullptr,
+                     nullptr);
+    TEST_CHECK(n > 0);
+    bytes_received += n;
+  }
+
+  notification.Notify();
+  recv_socket.reset();
+
+  state.SetBytesProcessed(bytes_received);
+}
+
+BENCHMARK(BM_Recvfrom)->UseRealTime();
+
+void BM_Sendto(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  char send_buffer[kMessageSize], recv_buffer[kMessageSize];
+
+  ScopedThread t([&recv_socket, &recv_buffer, &notification] {
+    while (!notification.HasBeenNotified()) {
+      recvfrom(recv_socket.get(), recv_buffer, kMessageSize, 0, nullptr,
+               nullptr);
+    }
+  });
+
+  int64_t bytes_sent = 0;
+  for (auto ignored : state) {
+    int n = sendto(send_socket.get(), send_buffer, kMessageSize, 0, nullptr, 0);
+    TEST_CHECK(n > 0);
+    bytes_sent += n;
+  }
+
+  notification.Notify();
+  send_socket.reset();
+
+  state.SetBytesProcessed(bytes_sent);
+}
+
+BENCHMARK(BM_Sendto)->UseRealTime();
+
+PosixErrorOr<sockaddr_storage> InetLoopbackAddr(int family) {
+  struct sockaddr_storage addr;
+  memset(&addr, 0, sizeof(addr));
+  addr.ss_family = family;
+  switch (family) {
+    case AF_INET:
+      reinterpret_cast<struct sockaddr_in*>(&addr)->sin_addr.s_addr =
+          htonl(INADDR_LOOPBACK);
+      break;
+    case AF_INET6:
+      reinterpret_cast<struct sockaddr_in6*>(&addr)->sin6_addr =
+          in6addr_loopback;
+      break;
+    default:
+      return PosixError(EINVAL,
+                        absl::StrCat("unknown socket family: ", family));
+  }
+  return addr;
+}
+
+// BM_RecvmsgWithControlBuf measures the performance of recvmsg when we allocate
+// space for control messages. Note that we do not expect to receive any.
+void BM_RecvmsgWithControlBuf(benchmark::State& state) {
+  auto listen_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP));
+
+  // Initialize address to the loopback one.
+  sockaddr_storage addr = ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(AF_INET6));
+  socklen_t addrlen = sizeof(addr);
+
+  // Bind to some port then start listening.
+  ASSERT_THAT(bind(listen_socket.get(),
+                   reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+              SyscallSucceeds());
+
+  ASSERT_THAT(listen(listen_socket.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the address we're listening on, then connect to it. We need to do this
+  // because we're allowing the stack to pick a port for us.
+  ASSERT_THAT(getsockname(listen_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  auto send_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP));
+
+  ASSERT_THAT(
+      RetryEINTR(connect)(send_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+      SyscallSucceeds());
+
+  // Accept the connection.
+  auto recv_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_socket.get(), nullptr, nullptr));
+
+  absl::Notification notification;
+  Message send_msg('a');
+  // Create a msghdr with a buffer allocated for control messages.
+  Message recv_msg(0, kMessageSize, /*cmsg_sz=*/24);
+
+  ScopedThread t([&send_msg, &send_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      sendmsg(send_socket.get(), send_msg.header(), 0);
+    }
+  });
+
+  int64_t bytes_received = 0;
+  for (auto ignored : state) {
+    int n = recvmsg(recv_socket.get(), recv_msg.header(), 0);
+    TEST_CHECK(n > 0);
+    bytes_received += n;
+  }
+
+  notification.Notify();
+  recv_socket.reset();
+
+  state.SetBytesProcessed(bytes_received);
+}
+
+BENCHMARK(BM_RecvmsgWithControlBuf)->UseRealTime();
+
+// BM_SendmsgTCP measures the sendmsg throughput with varying payload sizes.
+//
+// state.Args[0] indicates whether the underlying socket should be blocking or
+// non-blocking w/ 0 indicating non-blocking and 1 to indicate blocking.
+// state.Args[1] is the size of the payload to be used per sendmsg call.
+void BM_SendmsgTCP(benchmark::State& state) {
+  auto listen_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
+
+  // Initialize address to the loopback one.
+  sockaddr_storage addr = ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(AF_INET));
+  socklen_t addrlen = sizeof(addr);
+
+  // Bind to some port then start listening.
+  ASSERT_THAT(bind(listen_socket.get(),
+                   reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+              SyscallSucceeds());
+
+  ASSERT_THAT(listen(listen_socket.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the address we're listening on, then connect to it. We need to do this
+  // because we're allowing the stack to pick a port for us.
+  ASSERT_THAT(getsockname(listen_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  auto send_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
+
+  ASSERT_THAT(
+      RetryEINTR(connect)(send_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+      SyscallSucceeds());
+
+  // Accept the connection.
+  auto recv_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_socket.get(), nullptr, nullptr));
+
+  // Check if we want to run the test w/ a blocking send socket
+  // or non-blocking.
+  const int blocking = state.range(0);
+  if (!blocking) {
+    // Set the send FD to O_NONBLOCK.
+    int opts;
+    ASSERT_THAT(opts = fcntl(send_socket.get(), F_GETFL), SyscallSucceeds());
+    opts |= O_NONBLOCK;
+    ASSERT_THAT(fcntl(send_socket.get(), F_SETFL, opts), SyscallSucceeds());
+  }
+
+  absl::Notification notification;
+
+  // Get the buffer size we should use for this iteration of the test.
+  const int buf_size = state.range(1);
+  Message send_msg('a', buf_size), recv_msg(0, buf_size);
+
+  ScopedThread t([&recv_msg, &recv_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      TEST_CHECK(recvmsg(recv_socket.get(), recv_msg.header(), 0) >= 0);
+    }
+  });
+
+  int64_t bytes_sent = 0;
+  int ncalls = 0;
+  for (auto ignored : state) {
+    int sent = 0;
+    while (true) {
+      struct msghdr hdr = {};
+      struct iovec iov = {};
+      struct msghdr* snd_header = send_msg.header();
+      iov.iov_base = static_cast<char*>(snd_header->msg_iov->iov_base) + sent;
+      iov.iov_len = snd_header->msg_iov->iov_len - sent;
+      hdr.msg_iov = &iov;
+      hdr.msg_iovlen = 1;
+      int n = RetryEINTR(sendmsg)(send_socket.get(), &hdr, 0);
+      ncalls++;
+      if (n > 0) {
+        sent += n;
+        if (sent == buf_size) {
+          break;
+        }
+        // n can be > 0 but less than requested size. In which case we don't
+        // poll.
+        continue;
+      }
+      // Poll the fd for it to become writable.
+      struct pollfd poll_fd = {send_socket.get(), POLL_OUT, 0};
+      EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10),
+                  SyscallSucceedsWithValue(0));
+    }
+    bytes_sent += static_cast<int64_t>(sent);
+  }
+
+  notification.Notify();
+  send_socket.reset();
+  state.SetBytesProcessed(bytes_sent);
+}
+
+void Args(benchmark::internal::Benchmark* benchmark) {
+  for (int blocking = 0; blocking < 2; blocking++) {
+    for (int buf_size = 1024; buf_size <= 256 << 20; buf_size *= 2) {
+      benchmark->Args({blocking, buf_size});
+    }
+  }
+}
+
+BENCHMARK(BM_SendmsgTCP)->Apply(&Args)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/seqwrite_benchmark.cc b/test/perf/linux/seqwrite_benchmark.cc
new file mode 100644
index 000000000..af49e4477
--- /dev/null
+++ b/test/perf/linux/seqwrite_benchmark.cc
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// The maximum file size of the test file, when writes get beyond this point
+// they wrap around. This should be large enough to blow away caches.
+const uint64_t kMaxFile = 1 << 30;
+
+// Perform writes of various sizes sequentially to one file. Wraps around if it
+// goes above a certain maximum file size.
+void BM_SeqWrite(benchmark::State& state) {
+  auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_WRONLY));
+
+  const int size = state.range(0);
+  std::vector<char> buf(size);
+  RandomizeBuffer(buf.data(), buf.size());
+
+  // Start writes at offset 0.
+  uint64_t offset = 0;
+  for (auto _ : state) {
+    TEST_CHECK(PwriteFd(fd.get(), buf.data(), buf.size(), offset) ==
+               buf.size());
+    offset += buf.size();
+    // Wrap around if going above the maximum file size.
+    if (offset >= kMaxFile) {
+      offset = 0;
+    }
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_SeqWrite)->Range(1, 1 << 26)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/signal_benchmark.cc b/test/perf/linux/signal_benchmark.cc
new file mode 100644
index 000000000..a6928df58
--- /dev/null
+++ b/test/perf/linux/signal_benchmark.cc
@@ -0,0 +1,59 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <string.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void FixupHandler(int sig, siginfo_t* si, void* void_ctx) {
+  static unsigned int dataval = 0;
+
+  // Skip the offending instruction.
+  ucontext_t* ctx = reinterpret_cast<ucontext_t*>(void_ctx);
+  ctx->uc_mcontext.gregs[REG_RAX] = reinterpret_cast<greg_t>(&dataval);
+}
+
+void BM_FaultSignalFixup(benchmark::State& state) {
+  // Set up the signal handler.
+  struct sigaction sa = {};
+  sigemptyset(&sa.sa_mask);
+  sa.sa_sigaction = FixupHandler;
+  sa.sa_flags = SA_SIGINFO;
+  TEST_CHECK(sigaction(SIGSEGV, &sa, nullptr) == 0);
+
+  // Fault, fault, fault.
+  for (auto _ : state) {
+    register volatile unsigned int* ptr asm("rax");
+
+    // Trigger the segfault.
+    ptr = nullptr;
+    *ptr = 0;
+  }
+}
+
+BENCHMARK(BM_FaultSignalFixup)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/sleep_benchmark.cc b/test/perf/linux/sleep_benchmark.cc
new file mode 100644
index 000000000..99ef05117
--- /dev/null
+++ b/test/perf/linux/sleep_benchmark.cc
@@ -0,0 +1,60 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Sleep for 'param' nanoseconds.
+void BM_Sleep(benchmark::State& state) {
+  const int nanoseconds = state.range(0);
+
+  for (auto _ : state) {
+    struct timespec ts;
+    ts.tv_sec = 0;
+    ts.tv_nsec = nanoseconds;
+
+    int ret;
+    do {
+      ret = syscall(SYS_nanosleep, &ts, &ts);
+      if (ret < 0) {
+        TEST_CHECK(errno == EINTR);
+      }
+    } while (ret < 0);
+  }
+}
+
+BENCHMARK(BM_Sleep)
+    ->Arg(0)
+    ->Arg(1)
+    ->Arg(1000)              // 1us
+    ->Arg(1000 * 1000)       // 1ms
+    ->Arg(10 * 1000 * 1000)  // 10ms
+    ->Arg(50 * 1000 * 1000)  // 50ms
+    ->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/stat_benchmark.cc b/test/perf/linux/stat_benchmark.cc
new file mode 100644
index 000000000..f15424482
--- /dev/null
+++ b/test/perf/linux/stat_benchmark.cc
@@ -0,0 +1,62 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Creates a file in a nested directory hierarchy at least `depth` directories
+// deep, and stats that file multiple times.
+void BM_Stat(benchmark::State& state) {
+  // Create nested directories with given depth.
+  int depth = state.range(0);
+  const TempPath top_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  std::string dir_path = top_dir.path();
+
+  while (depth-- > 0) {
+    // Don't use TempPath because it will make paths too long to use.
+    //
+    // The top_dir destructor will clean up this whole tree.
+    dir_path = JoinPath(dir_path, absl::StrCat(depth));
+    ASSERT_NO_ERRNO(Mkdir(dir_path, 0755));
+  }
+
+  // Create the file that will be stat'd.
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir_path));
+
+  struct stat st;
+  for (auto _ : state) {
+    ASSERT_THAT(stat(file.path().c_str(), &st), SyscallSucceeds());
+  }
+}
+
+BENCHMARK(BM_Stat)->Range(1, 100)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/unlink_benchmark.cc b/test/perf/linux/unlink_benchmark.cc
new file mode 100644
index 000000000..92243a042
--- /dev/null
+++ b/test/perf/linux/unlink_benchmark.cc
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Creates a directory containing `files` files, and unlinks all the files.
+void BM_Unlink(benchmark::State& state) {
+  // Create directory with given files.
+  const int file_count = state.range(0);
+
+  // We unlink all files on each iteration, but report this as a "batch"
+  // iteration so that reported times are per file.
+  TempPath dir;
+  while (state.KeepRunningBatch(file_count)) {
+    state.PauseTiming();
+    // N.B. dir is declared outside the loop so that destruction of the previous
+    // iteration's directory occurs here, inside of PauseTiming.
+    dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+    std::vector<TempPath> files;
+    for (int i = 0; i < file_count; i++) {
+      TempPath file =
+          ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+      files.push_back(std::move(file));
+    }
+    state.ResumeTiming();
+
+    while (!files.empty()) {
+      // Destructor unlinks.
+      files.pop_back();
+    }
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_Unlink)->Range(1, 100 * 1000)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/write_benchmark.cc b/test/perf/linux/write_benchmark.cc
new file mode 100644
index 000000000..7b060c70e
--- /dev/null
+++ b/test/perf/linux/write_benchmark.cc
@@ -0,0 +1,52 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Write(benchmark::State& state) {
+  auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_WRONLY));
+
+  const int size = state.range(0);
+  std::vector<char> buf(size);
+  RandomizeBuffer(buf.data(), size);
+
+  for (auto _ : state) {
+    TEST_CHECK(PwriteFd(fd.get(), buf.data(), size, 0) == size);
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_Write)->Range(1, 1 << 26)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/runner/BUILD b/test/runner/BUILD
new file mode 100644
index 000000000..9959ef9b0
--- /dev/null
+++ b/test/runner/BUILD
@@ -0,0 +1,22 @@
+load("//tools:defs.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "runner",
+    testonly = 1,
+    srcs = ["runner.go"],
+    data = [
+        "//runsc",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/log",
+        "//runsc/specutils",
+        "//runsc/testutil",
+        "//test/runner/gtest",
+        "//test/uds",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl
new file mode 100644
index 000000000..5e97c1867
--- /dev/null
+++ b/test/runner/defs.bzl
@@ -0,0 +1,218 @@
+"""Defines a rule for syscall test targets."""
+
+load("//tools:defs.bzl", "loopback")
+
+def _runner_test_impl(ctx):
+    # Generate a runner binary.
+    runner = ctx.actions.declare_file("%s-runner" % ctx.label.name)
+    runner_content = "\n".join([
+        "#!/bin/bash",
+        "set -euf -x -o pipefail",
+        "if [[ -n \"${TEST_UNDECLARED_OUTPUTS_DIR}\" ]]; then",
+        "  mkdir -p \"${TEST_UNDECLARED_OUTPUTS_DIR}\"",
+        "  chmod a+rwx \"${TEST_UNDECLARED_OUTPUTS_DIR}\"",
+        "fi",
+        "exec %s %s %s\n" % (
+            ctx.files.runner[0].short_path,
+            " ".join(ctx.attr.runner_args),
+            ctx.files.test[0].short_path,
+        ),
+    ])
+    ctx.actions.write(runner, runner_content, is_executable = True)
+
+    # Return with all transitive files.
+    runfiles = ctx.runfiles(
+        transitive_files = depset(transitive = [
+            depset(target.data_runfiles.files)
+            for target in (ctx.attr.runner, ctx.attr.test)
+            if hasattr(target, "data_runfiles")
+        ]),
+        files = ctx.files.runner + ctx.files.test,
+        collect_default = True,
+        collect_data = True,
+    )
+    return [DefaultInfo(executable = runner, runfiles = runfiles)]
+
+_runner_test = rule(
+    attrs = {
+        "runner": attr.label(
+            default = "//test/runner:runner",
+        ),
+        "test": attr.label(
+            mandatory = True,
+        ),
+        "runner_args": attr.string_list(),
+        "data": attr.label_list(
+            allow_files = True,
+        ),
+    },
+    test = True,
+    implementation = _runner_test_impl,
+)
+
+def _syscall_test(
+        test,
+        shard_count,
+        size,
+        platform,
+        use_tmpfs,
+        tags,
+        network = "none",
+        file_access = "exclusive",
+        overlay = False,
+        add_uds_tree = False):
+    # Prepend "runsc" to non-native platform names.
+    full_platform = platform if platform == "native" else "runsc_" + platform
+
+    # Name the test appropriately.
+    name = test.split(":")[1] + "_" + full_platform
+    if file_access == "shared":
+        name += "_shared"
+    if overlay:
+        name += "_overlay"
+    if network != "none":
+        name += "_" + network + "net"
+
+    # Apply all tags.
+    if tags == None:
+        tags = []
+
+    # Add the full_platform and file access in a tag to make it easier to run
+    # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared.
+    tags += [full_platform, "file_" + file_access]
+
+    # Hash this target into one of 15 buckets. This can be used to
+    # randomly split targets between different workflows.
+    hash15 = hash(native.package_name() + name) % 15
+    tags.append("hash15:" + str(hash15))
+
+    # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until
+    # we figure out how to request ipv4 sockets on Guitar machines.
+    if network == "host":
+        tags.append("noguitar")
+
+    # Disable off-host networking.
+    tags.append("requires-net:loopback")
+
+    # Add tag to prevent the tests from running in a Bazel sandbox.
+    # TODO(b/120560048): Make the tests run without this tag.
+    tags.append("no-sandbox")
+
+    # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is
+    # more stable.
+    if platform == "kvm":
+        tags.append("manual")
+        tags.append("requires-kvm")
+
+        # TODO(b/112165693): Remove when tests pass reliably.
+        tags.append("notap")
+
+    runner_args = [
+        # Arguments are passed directly to runner binary.
+        "--platform=" + platform,
+        "--network=" + network,
+        "--use-tmpfs=" + str(use_tmpfs),
+        "--file-access=" + file_access,
+        "--overlay=" + str(overlay),
+        "--add-uds-tree=" + str(add_uds_tree),
+    ]
+
+    # Call the rule above.
+    _runner_test(
+        name = name,
+        test = test,
+        runner_args = runner_args,
+        data = [loopback],
+        size = size,
+        tags = tags,
+        shard_count = shard_count,
+    )
+
+def syscall_test(
+        test,
+        shard_count = 5,
+        size = "small",
+        use_tmpfs = False,
+        add_overlay = False,
+        add_uds_tree = False,
+        add_hostinet = False,
+        tags = None):
+    """syscall_test is a macro that will create targets for all platforms.
+
+    Args:
+      test: the test target.
+      shard_count: shards for defined tests.
+      size: the defined test size.
+      use_tmpfs: use tmpfs in the defined tests.
+      add_overlay: add an overlay test.
+      add_uds_tree: add a UDS test.
+      add_hostinet: add a hostinet test.
+      tags: starting test tags.
+    """
+
+    _syscall_test(
+        test = test,
+        shard_count = shard_count,
+        size = size,
+        platform = "native",
+        use_tmpfs = False,
+        add_uds_tree = add_uds_tree,
+        tags = tags,
+    )
+
+    _syscall_test(
+        test = test,
+        shard_count = shard_count,
+        size = size,
+        platform = "kvm",
+        use_tmpfs = use_tmpfs,
+        add_uds_tree = add_uds_tree,
+        tags = tags,
+    )
+
+    _syscall_test(
+        test = test,
+        shard_count = shard_count,
+        size = size,
+        platform = "ptrace",
+        use_tmpfs = use_tmpfs,
+        add_uds_tree = add_uds_tree,
+        tags = tags,
+    )
+
+    if add_overlay:
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = "ptrace",
+            use_tmpfs = False,  # overlay is adding a writable tmpfs on top of root.
+            add_uds_tree = add_uds_tree,
+            tags = tags,
+            overlay = True,
+        )
+
+    if not use_tmpfs:
+        # Also test shared gofer access.
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = "ptrace",
+            use_tmpfs = use_tmpfs,
+            add_uds_tree = add_uds_tree,
+            tags = tags,
+            file_access = "shared",
+        )
+
+    if add_hostinet:
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = "ptrace",
+            use_tmpfs = use_tmpfs,
+            network = "host",
+            add_uds_tree = add_uds_tree,
+            tags = tags,
+        )
diff --git a/test/runner/gtest/BUILD b/test/runner/gtest/BUILD
new file mode 100644
index 000000000..de4b2727c
--- /dev/null
+++ b/test/runner/gtest/BUILD
@@ -0,0 +1,9 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "gtest",
+    srcs = ["gtest.go"],
+    visibility = ["//:sandbox"],
+)
diff --git a/test/runner/gtest/gtest.go b/test/runner/gtest/gtest.go
new file mode 100644
index 000000000..23bf7b5f6
--- /dev/null
+++ b/test/runner/gtest/gtest.go
@@ -0,0 +1,154 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gtest contains helpers for running google-test tests from Go.
+package gtest
+
+import (
+	"fmt"
+	"os/exec"
+	"strings"
+)
+
+var (
+	// listTestFlag is the flag that will list tests in gtest binaries.
+	listTestFlag = "--gtest_list_tests"
+
+	// filterTestFlag is the flag that will filter tests in gtest binaries.
+	filterTestFlag = "--gtest_filter"
+
+	// listBechmarkFlag is the flag that will list benchmarks in gtest binaries.
+	listBenchmarkFlag = "--benchmark_list_tests"
+
+	// filterBenchmarkFlag is the flag that will run specified benchmarks.
+	filterBenchmarkFlag = "--benchmark_filter"
+)
+
+// TestCase is a single gtest test case.
+type TestCase struct {
+	// Suite is the suite for this test.
+	Suite string
+
+	// Name is the name of this individual test.
+	Name string
+
+	// benchmark indicates that this is a benchmark. In this case, the
+	// suite will be empty, and we will use the appropriate test and
+	// benchmark flags.
+	benchmark bool
+}
+
+// FullName returns the name of the test including the suite. It is suitable to
+// pass to "-gtest_filter".
+func (tc TestCase) FullName() string {
+	return fmt.Sprintf("%s.%s", tc.Suite, tc.Name)
+}
+
+// Args returns arguments to be passed when invoking the test.
+func (tc TestCase) Args() []string {
+	if tc.benchmark {
+		return []string{
+			fmt.Sprintf("%s=^$", filterTestFlag),
+			fmt.Sprintf("%s=^%s$", filterBenchmarkFlag, tc.Name),
+		}
+	}
+	return []string{
+		fmt.Sprintf("%s=^%s$", filterTestFlag, tc.FullName()),
+		fmt.Sprintf("%s=^$", filterBenchmarkFlag),
+	}
+}
+
+// ParseTestCases calls a gtest test binary to list its test and returns a
+// slice with the name and suite of each test.
+//
+// If benchmarks is true, then benchmarks will be included in the list of test
+// cases provided. Note that this requires the binary to support the
+// benchmarks_list_tests flag.
+func ParseTestCases(testBin string, benchmarks bool, extraArgs ...string) ([]TestCase, error) {
+	// Run to extract test cases.
+	args := append([]string{listTestFlag}, extraArgs...)
+	cmd := exec.Command(testBin, args...)
+	out, err := cmd.Output()
+	if err != nil {
+		exitErr, ok := err.(*exec.ExitError)
+		if !ok {
+			return nil, fmt.Errorf("could not enumerate gtest tests: %v", err)
+		}
+		return nil, fmt.Errorf("could not enumerate gtest tests: %v\nstderr:\n%s", err, exitErr.Stderr)
+	}
+
+	// Parse test output.
+	var t []TestCase
+	var suite string
+	for _, line := range strings.Split(string(out), "\n") {
+		// Strip comments.
+		line = strings.Split(line, "#")[0]
+
+		// New suite?
+		if !strings.HasPrefix(line, " ") {
+			suite = strings.TrimSuffix(strings.TrimSpace(line), ".")
+			continue
+		}
+
+		// Individual test.
+		name := strings.TrimSpace(line)
+
+		// Do we have a suite yet?
+		if suite == "" {
+			return nil, fmt.Errorf("test without a suite: %v", name)
+		}
+
+		// Add this individual test.
+		t = append(t, TestCase{
+			Suite: suite,
+			Name:  name,
+		})
+
+	}
+
+	// Finished?
+	if !benchmarks {
+		return t, nil
+	}
+
+	// Run again to extract benchmarks.
+	args = append([]string{listBenchmarkFlag}, extraArgs...)
+	cmd = exec.Command(testBin, args...)
+	out, err = cmd.Output()
+	if err != nil {
+		exitErr, ok := err.(*exec.ExitError)
+		if !ok {
+			return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v", err)
+		}
+		return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v\nstderr\n%s", err, exitErr.Stderr)
+	}
+
+	// Parse benchmark output.
+	for _, line := range strings.Split(string(out), "\n") {
+		// Strip comments.
+		line = strings.Split(line, "#")[0]
+
+		// Single benchmark.
+		name := strings.TrimSpace(line)
+
+		// Add the single benchmark.
+		t = append(t, TestCase{
+			Suite:     "Benchmarks",
+			Name:      name,
+			benchmark: true,
+		})
+	}
+
+	return t, nil
+}
diff --git a/test/runner/runner.go b/test/runner/runner.go
new file mode 100644
index 000000000..a78ef38e0
--- /dev/null
+++ b/test/runner/runner.go
@@ -0,0 +1,477 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary syscall_test_runner runs the syscall test suites in gVisor
+// containers and on the host platform.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"os/signal"
+	"path/filepath"
+	"strings"
+	"syscall"
+	"testing"
+	"time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/runsc/specutils"
+	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/test/runner/gtest"
+	"gvisor.dev/gvisor/test/uds"
+)
+
+var (
+	debug      = flag.Bool("debug", false, "enable debug logs")
+	strace     = flag.Bool("strace", false, "enable strace logs")
+	platform   = flag.String("platform", "ptrace", "platform to run on")
+	network    = flag.String("network", "none", "network stack to run on (sandbox, host, none)")
+	useTmpfs   = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp")
+	fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode")
+	overlay    = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
+	parallel   = flag.Bool("parallel", false, "run tests in parallel")
+	runscPath  = flag.String("runsc", "", "path to runsc binary")
+
+	addUDSTree = flag.Bool("add-uds-tree", false, "expose a tree of UDS utilities for use in tests")
+)
+
+// runTestCaseNative runs the test case directly on the host machine.
+func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) {
+	// These tests might be running in parallel, so make sure they have a
+	// unique test temp dir.
+	tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
+	if err != nil {
+		t.Fatalf("could not create temp dir: %v", err)
+	}
+	defer os.RemoveAll(tmpDir)
+
+	// Replace TEST_TMPDIR in the current environment with something
+	// unique.
+	env := os.Environ()
+	newEnvVar := "TEST_TMPDIR=" + tmpDir
+	var found bool
+	for i, kv := range env {
+		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
+			env[i] = newEnvVar
+			found = true
+			break
+		}
+	}
+	if !found {
+		env = append(env, newEnvVar)
+	}
+	// Remove env variables that cause the gunit binary to write output
+	// files, since they will stomp on eachother, and on the output files
+	// from this go test.
+	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
+
+	// Remove shard env variables so that the gunit binary does not try to
+	// intepret them.
+	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
+
+	if *addUDSTree {
+		socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
+		if err != nil {
+			t.Fatalf("failed to create socket tree: %v", err)
+		}
+		defer cleanup()
+
+		env = append(env, "TEST_UDS_TREE="+socketDir)
+		// On Linux, the concept of "attach" location doesn't exist.
+		// Just pass the same path to make these test identical.
+		env = append(env, "TEST_UDS_ATTACH_TREE="+socketDir)
+	}
+
+	cmd := exec.Command(testBin, tc.Args()...)
+	cmd.Env = env
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		ws := err.(*exec.ExitError).Sys().(syscall.WaitStatus)
+		t.Errorf("test %q exited with status %d, want 0", tc.FullName(), ws.ExitStatus())
+	}
+}
+
+// runRunsc runs spec in runsc in a standard test configuration.
+//
+// runsc logs will be saved to a path in TEST_UNDECLARED_OUTPUTS_DIR.
+//
+// Returns an error if the sandboxed application exits non-zero.
+func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
+	bundleDir, err := testutil.SetupBundleDir(spec)
+	if err != nil {
+		return fmt.Errorf("SetupBundleDir failed: %v", err)
+	}
+	defer os.RemoveAll(bundleDir)
+
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		return fmt.Errorf("SetupRootDir failed: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	name := tc.FullName()
+	id := testutil.UniqueContainerID()
+	log.Infof("Running test %q in container %q", name, id)
+	specutils.LogSpec(spec)
+
+	args := []string{
+		"-root", rootDir,
+		"-network", *network,
+		"-log-format=text",
+		"-TESTONLY-unsafe-nonroot=true",
+		"-net-raw=true",
+		fmt.Sprintf("-panic-signal=%d", syscall.SIGTERM),
+		"-watchdog-action=panic",
+		"-platform", *platform,
+		"-file-access", *fileAccess,
+	}
+	if *overlay {
+		args = append(args, "-overlay")
+	}
+	if *debug {
+		args = append(args, "-debug", "-log-packets=true")
+	}
+	if *strace {
+		args = append(args, "-strace")
+	}
+	if *addUDSTree {
+		args = append(args, "-fsgofer-host-uds")
+	}
+
+	if outDir, ok := syscall.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
+		tdir := filepath.Join(outDir, strings.Replace(name, "/", "_", -1))
+		if err := os.MkdirAll(tdir, 0755); err != nil {
+			return fmt.Errorf("could not create test dir: %v", err)
+		}
+		debugLogDir, err := ioutil.TempDir(tdir, "runsc")
+		if err != nil {
+			return fmt.Errorf("could not create temp dir: %v", err)
+		}
+		debugLogDir += "/"
+		log.Infof("runsc logs: %s", debugLogDir)
+		args = append(args, "-debug-log", debugLogDir)
+
+		// Default -log sends messages to stderr which makes reading the test log
+		// difficult. Instead, drop them when debug log is enabled given it's a
+		// better place for these messages.
+		args = append(args, "-log=/dev/null")
+	}
+
+	// Current process doesn't have CAP_SYS_ADMIN, create user namespace and run
+	// as root inside that namespace to get it.
+	rArgs := append(args, "run", "--bundle", bundleDir, id)
+	cmd := exec.Command(*runscPath, rArgs...)
+	cmd.SysProcAttr = &syscall.SysProcAttr{
+		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
+		// Set current user/group as root inside the namespace.
+		UidMappings: []syscall.SysProcIDMap{
+			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
+		},
+		GidMappings: []syscall.SysProcIDMap{
+			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
+		},
+		GidMappingsEnableSetgroups: false,
+		Credential: &syscall.Credential{
+			Uid: 0,
+			Gid: 0,
+		},
+	}
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	sig := make(chan os.Signal, 1)
+	signal.Notify(sig, syscall.SIGTERM)
+	go func() {
+		s, ok := <-sig
+		if !ok {
+			return
+		}
+		log.Warningf("%s: Got signal: %v", name, s)
+		done := make(chan bool)
+		dArgs := append([]string{}, args...)
+		dArgs = append(dArgs, "-alsologtostderr=true", "debug", "--stacks", id)
+		go func(dArgs []string) {
+			cmd := exec.Command(*runscPath, dArgs...)
+			cmd.Stdout = os.Stdout
+			cmd.Stderr = os.Stderr
+			cmd.Run()
+			done <- true
+		}(dArgs)
+
+		timeout := time.After(3 * time.Second)
+		select {
+		case <-timeout:
+			log.Infof("runsc debug --stacks is timeouted")
+		case <-done:
+		}
+
+		log.Warningf("Send SIGTERM to the sandbox process")
+		dArgs = append(args, "debug",
+			fmt.Sprintf("--signal=%d", syscall.SIGTERM),
+			id)
+		cmd := exec.Command(*runscPath, dArgs...)
+		cmd.Stdout = os.Stdout
+		cmd.Stderr = os.Stderr
+		cmd.Run()
+	}()
+
+	err = cmd.Run()
+
+	signal.Stop(sig)
+	close(sig)
+
+	return err
+}
+
+// setupUDSTree updates the spec to expose a UDS tree for gofer socket testing.
+func setupUDSTree(spec *specs.Spec) (cleanup func(), err error) {
+	socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
+	if err != nil {
+		return nil, fmt.Errorf("failed to create socket tree: %v", err)
+	}
+
+	// Standard access to entire tree.
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets",
+		Source:      socketDir,
+		Type:        "bind",
+	})
+
+	// Individial attach points for each socket to test mounts that attach
+	// directly to the sockets.
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/stream/echo",
+		Source:      filepath.Join(socketDir, "stream/echo"),
+		Type:        "bind",
+	})
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/stream/nonlistening",
+		Source:      filepath.Join(socketDir, "stream/nonlistening"),
+		Type:        "bind",
+	})
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/seqpacket/echo",
+		Source:      filepath.Join(socketDir, "seqpacket/echo"),
+		Type:        "bind",
+	})
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/seqpacket/nonlistening",
+		Source:      filepath.Join(socketDir, "seqpacket/nonlistening"),
+		Type:        "bind",
+	})
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/dgram/null",
+		Source:      filepath.Join(socketDir, "dgram/null"),
+		Type:        "bind",
+	})
+
+	spec.Process.Env = append(spec.Process.Env, "TEST_UDS_TREE=/tmp/sockets")
+	spec.Process.Env = append(spec.Process.Env, "TEST_UDS_ATTACH_TREE=/tmp/sockets-attach")
+
+	return cleanup, nil
+}
+
+// runsTestCaseRunsc runs the test case in runsc.
+func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
+	// Run a new container with the test executable and filter for the
+	// given test suite and name.
+	spec := testutil.NewSpecWithArgs(append([]string{testBin}, tc.Args()...)...)
+
+	// Mark the root as writeable, as some tests attempt to
+	// write to the rootfs, and expect EACCES, not EROFS.
+	spec.Root.Readonly = false
+
+	// Test spec comes with pre-defined mounts that we don't want. Reset it.
+	spec.Mounts = nil
+	if *useTmpfs {
+		// Forces '/tmp' to be mounted as tmpfs, otherwise test that rely on
+		// features only available in gVisor's internal tmpfs may fail.
+		spec.Mounts = append(spec.Mounts, specs.Mount{
+			Destination: "/tmp",
+			Type:        "tmpfs",
+		})
+	} else {
+		// Use a gofer-backed directory as '/tmp'.
+		//
+		// Tests might be running in parallel, so make sure each has a
+		// unique test temp dir.
+		//
+		// Some tests (e.g., sticky) access this mount from other
+		// users, so make sure it is world-accessible.
+		tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
+		if err != nil {
+			t.Fatalf("could not create temp dir: %v", err)
+		}
+		defer os.RemoveAll(tmpDir)
+
+		if err := os.Chmod(tmpDir, 0777); err != nil {
+			t.Fatalf("could not chmod temp dir: %v", err)
+		}
+
+		spec.Mounts = append(spec.Mounts, specs.Mount{
+			Type:        "bind",
+			Destination: "/tmp",
+			Source:      tmpDir,
+		})
+	}
+
+	// Set environment variables that indicate we are
+	// running in gVisor with the given platform and network.
+	platformVar := "TEST_ON_GVISOR"
+	networkVar := "GVISOR_NETWORK"
+	env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network)
+
+	// Remove env variables that cause the gunit binary to write output
+	// files, since they will stomp on eachother, and on the output files
+	// from this go test.
+	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
+
+	// Remove shard env variables so that the gunit binary does not try to
+	// intepret them.
+	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
+
+	// Set TEST_TMPDIR to /tmp, as some of the syscall tests require it to
+	// be backed by tmpfs.
+	for i, kv := range env {
+		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
+			env[i] = "TEST_TMPDIR=/tmp"
+			break
+		}
+	}
+
+	spec.Process.Env = env
+
+	if *addUDSTree {
+		cleanup, err := setupUDSTree(spec)
+		if err != nil {
+			t.Fatalf("error creating UDS tree: %v", err)
+		}
+		defer cleanup()
+	}
+
+	if err := runRunsc(tc, spec); err != nil {
+		t.Errorf("test %q failed with error %v, want nil", tc.FullName(), err)
+	}
+}
+
+// filterEnv returns an environment with the blacklisted variables removed.
+func filterEnv(env, blacklist []string) []string {
+	var out []string
+	for _, kv := range env {
+		ok := true
+		for _, k := range blacklist {
+			if strings.HasPrefix(kv, k+"=") {
+				ok = false
+				break
+			}
+		}
+		if ok {
+			out = append(out, kv)
+		}
+	}
+	return out
+}
+
+func fatalf(s string, args ...interface{}) {
+	fmt.Fprintf(os.Stderr, s+"\n", args...)
+	os.Exit(1)
+}
+
+func matchString(a, b string) (bool, error) {
+	return a == b, nil
+}
+
+func main() {
+	flag.Parse()
+	if flag.NArg() != 1 {
+		fatalf("test must be provided")
+	}
+	testBin := flag.Args()[0] // Only argument.
+
+	log.SetLevel(log.Info)
+	if *debug {
+		log.SetLevel(log.Debug)
+	}
+
+	if *platform != "native" && *runscPath == "" {
+		if err := testutil.ConfigureExePath(); err != nil {
+			panic(err.Error())
+		}
+		*runscPath = specutils.ExePath
+	}
+
+	// Make sure stdout and stderr are opened with O_APPEND, otherwise logs
+	// from outside the sandbox can (and will) stomp on logs from inside
+	// the sandbox.
+	for _, f := range []*os.File{os.Stdout, os.Stderr} {
+		flags, err := unix.FcntlInt(f.Fd(), unix.F_GETFL, 0)
+		if err != nil {
+			fatalf("error getting file flags for %v: %v", f, err)
+		}
+		if flags&unix.O_APPEND == 0 {
+			flags |= unix.O_APPEND
+			if _, err := unix.FcntlInt(f.Fd(), unix.F_SETFL, flags); err != nil {
+				fatalf("error setting file flags for %v: %v", f, err)
+			}
+		}
+	}
+
+	// Get all test cases in each binary.
+	testCases, err := gtest.ParseTestCases(testBin, true)
+	if err != nil {
+		fatalf("ParseTestCases(%q) failed: %v", testBin, err)
+	}
+
+	// Get subset of tests corresponding to shard.
+	indices, err := testutil.TestIndicesForShard(len(testCases))
+	if err != nil {
+		fatalf("TestsForShard() failed: %v", err)
+	}
+
+	// Resolve the absolute path for the binary.
+	testBin, err = filepath.Abs(testBin)
+	if err != nil {
+		fatalf("Abs() failed: %v", err)
+	}
+
+	// Run the tests.
+	var tests []testing.InternalTest
+	for _, tci := range indices {
+		// Capture tc.
+		tc := testCases[tci]
+		tests = append(tests, testing.InternalTest{
+			Name: fmt.Sprintf("%s_%s", tc.Suite, tc.Name),
+			F: func(t *testing.T) {
+				if *parallel {
+					t.Parallel()
+				}
+				if *platform == "native" {
+					// Run the test case on host.
+					runTestCaseNative(testBin, tc, t)
+				} else {
+					// Run the test case in runsc.
+					runTestCaseRunsc(testBin, tc, t)
+				}
+			},
+		})
+	}
+
+	testing.Main(matchString, tests, nil, nil)
+}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 31d239c0e..d69ac8356 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -1,5 +1,4 @@
-load("//tools:defs.bzl", "go_binary")
-load("//test/syscalls:build_defs.bzl", "syscall_test")
+load("//test/runner:defs.bzl", "syscall_test")
 
 package(licenses = ["notice"])
 
@@ -726,21 +725,3 @@ syscall_test(test = "//test/syscalls/linux:proc_net_unix_test")
 syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test")
 
 syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
-
-go_binary(
-    name = "syscall_test_runner",
-    testonly = 1,
-    srcs = ["syscall_test_runner.go"],
-    data = [
-        "//runsc",
-    ],
-    deps = [
-        "//pkg/log",
-        "//runsc/specutils",
-        "//runsc/testutil",
-        "//test/syscalls/gtest",
-        "//test/uds",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
-        "@org_golang_x_sys//unix:go_default_library",
-    ],
-)
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
deleted file mode 100644
index cbab85ef7..000000000
--- a/test/syscalls/build_defs.bzl
+++ /dev/null
@@ -1,180 +0,0 @@
-"""Defines a rule for syscall test targets."""
-
-load("//tools:defs.bzl", "loopback")
-
-def syscall_test(
-        test,
-        shard_count = 5,
-        size = "small",
-        use_tmpfs = False,
-        add_overlay = False,
-        add_uds_tree = False,
-        add_hostinet = False,
-        tags = None):
-    """syscall_test is a macro that will create targets for all platforms.
-
-    Args:
-      test: the test target.
-      shard_count: shards for defined tests.
-      size: the defined test size.
-      use_tmpfs: use tmpfs in the defined tests.
-      add_overlay: add an overlay test.
-      add_uds_tree: add a UDS test.
-      add_hostinet: add a hostinet test.
-      tags: starting test tags.
-    """
-
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "native",
-        use_tmpfs = False,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
-
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "kvm",
-        use_tmpfs = use_tmpfs,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
-
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "ptrace",
-        use_tmpfs = use_tmpfs,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
-
-    if add_overlay:
-        _syscall_test(
-            test = test,
-            shard_count = shard_count,
-            size = size,
-            platform = "ptrace",
-            use_tmpfs = False,  # overlay is adding a writable tmpfs on top of root.
-            add_uds_tree = add_uds_tree,
-            tags = tags,
-            overlay = True,
-        )
-
-    if not use_tmpfs:
-        # Also test shared gofer access.
-        _syscall_test(
-            test = test,
-            shard_count = shard_count,
-            size = size,
-            platform = "ptrace",
-            use_tmpfs = use_tmpfs,
-            add_uds_tree = add_uds_tree,
-            tags = tags,
-            file_access = "shared",
-        )
-
-    if add_hostinet:
-        _syscall_test(
-            test = test,
-            shard_count = shard_count,
-            size = size,
-            platform = "ptrace",
-            use_tmpfs = use_tmpfs,
-            network = "host",
-            add_uds_tree = add_uds_tree,
-            tags = tags,
-        )
-
-def _syscall_test(
-        test,
-        shard_count,
-        size,
-        platform,
-        use_tmpfs,
-        tags,
-        network = "none",
-        file_access = "exclusive",
-        overlay = False,
-        add_uds_tree = False):
-    test_name = test.split(":")[1]
-
-    # Prepend "runsc" to non-native platform names.
-    full_platform = platform if platform == "native" else "runsc_" + platform
-
-    name = test_name + "_" + full_platform
-    if file_access == "shared":
-        name += "_shared"
-    if overlay:
-        name += "_overlay"
-    if network != "none":
-        name += "_" + network + "net"
-
-    if tags == None:
-        tags = []
-
-    # Add the full_platform and file access in a tag to make it easier to run
-    # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared.
-    tags += [full_platform, "file_" + file_access]
-
-    # Hash this target into one of 15 buckets. This can be used to
-    # randomly split targets between different workflows.
-    hash15 = hash(native.package_name() + name) % 15
-    tags.append("hash15:" + str(hash15))
-
-    # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until
-    # we figure out how to request ipv4 sockets on Guitar machines.
-    if network == "host":
-        tags.append("noguitar")
-
-    # Disable off-host networking.
-    tags.append("requires-net:loopback")
-
-    # Add tag to prevent the tests from running in a Bazel sandbox.
-    # TODO(b/120560048): Make the tests run without this tag.
-    tags.append("no-sandbox")
-
-    # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is
-    # more stable.
-    if platform == "kvm":
-        tags.append("manual")
-        tags.append("requires-kvm")
-
-        # TODO(b/112165693): Remove when tests pass reliably.
-        tags.append("notap")
-
-    args = [
-        # Arguments are passed directly to syscall_test_runner binary.
-        "--test-name=" + test_name,
-        "--platform=" + platform,
-        "--network=" + network,
-        "--use-tmpfs=" + str(use_tmpfs),
-        "--file-access=" + file_access,
-        "--overlay=" + str(overlay),
-        "--add-uds-tree=" + str(add_uds_tree),
-    ]
-
-    sh_test(
-        srcs = ["syscall_test_runner.sh"],
-        name = name,
-        data = [
-            ":syscall_test_runner",
-            loopback,
-            test,
-        ],
-        args = args,
-        size = size,
-        tags = tags,
-        shard_count = shard_count,
-    )
-
-def sh_test(**kwargs):
-    """Wraps the standard sh_test."""
-    native.sh_test(
-        **kwargs
-    )
diff --git a/test/syscalls/gtest/BUILD b/test/syscalls/gtest/BUILD
deleted file mode 100644
index de4b2727c..000000000
--- a/test/syscalls/gtest/BUILD
+++ /dev/null
@@ -1,9 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "gtest",
-    srcs = ["gtest.go"],
-    visibility = ["//:sandbox"],
-)
diff --git a/test/syscalls/gtest/gtest.go b/test/syscalls/gtest/gtest.go
deleted file mode 100644
index bdec8eb07..000000000
--- a/test/syscalls/gtest/gtest.go
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package gtest contains helpers for running google-test tests from Go.
-package gtest
-
-import (
-	"fmt"
-	"os/exec"
-	"strings"
-)
-
-var (
-	// ListTestFlag is the flag that will list tests in gtest binaries.
-	ListTestFlag = "--gtest_list_tests"
-
-	// FilterTestFlag is the flag that will filter tests in gtest binaries.
-	FilterTestFlag = "--gtest_filter"
-)
-
-// TestCase is a single gtest test case.
-type TestCase struct {
-	// Suite is the suite for this test.
-	Suite string
-
-	// Name is the name of this individual test.
-	Name string
-}
-
-// FullName returns the name of the test including the suite. It is suitable to
-// pass to "-gtest_filter".
-func (tc TestCase) FullName() string {
-	return fmt.Sprintf("%s.%s", tc.Suite, tc.Name)
-}
-
-// ParseTestCases calls a gtest test binary to list its test and returns a
-// slice with the name and suite of each test.
-func ParseTestCases(testBin string, extraArgs ...string) ([]TestCase, error) {
-	args := append([]string{ListTestFlag}, extraArgs...)
-	cmd := exec.Command(testBin, args...)
-	out, err := cmd.Output()
-	if err != nil {
-		exitErr, ok := err.(*exec.ExitError)
-		if !ok {
-			return nil, fmt.Errorf("could not enumerate gtest tests: %v", err)
-		}
-		return nil, fmt.Errorf("could not enumerate gtest tests: %v\nstderr:\n%s", err, exitErr.Stderr)
-	}
-
-	var t []TestCase
-	var suite string
-	for _, line := range strings.Split(string(out), "\n") {
-		// Strip comments.
-		line = strings.Split(line, "#")[0]
-
-		// New suite?
-		if !strings.HasPrefix(line, " ") {
-			suite = strings.TrimSuffix(strings.TrimSpace(line), ".")
-			continue
-		}
-
-		// Individual test.
-		name := strings.TrimSpace(line)
-
-		// Do we have a suite yet?
-		if suite == "" {
-			return nil, fmt.Errorf("test without a suite: %v", name)
-		}
-
-		// Add this individual test.
-		t = append(t, TestCase{
-			Suite: suite,
-			Name:  name,
-		})
-
-	}
-
-	if len(t) == 0 {
-		return nil, fmt.Errorf("no tests parsed from %v", testBin)
-	}
-	return t, nil
-}
diff --git a/test/syscalls/linux/alarm.cc b/test/syscalls/linux/alarm.cc
index d89269985..940c97285 100644
--- a/test/syscalls/linux/alarm.cc
+++ b/test/syscalls/linux/alarm.cc
@@ -188,6 +188,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index b5e0a512b..07bd527e6 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -868,6 +868,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 421c15b87..c7cc5816e 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -1128,5 +1128,5 @@ int main(int argc, char** argv) {
     exit(err);
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc
index b77e4cbd1..8b48f0804 100644
--- a/test/syscalls/linux/itimer.cc
+++ b/test/syscalls/linux/itimer.cc
@@ -349,6 +349,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/prctl.cc b/test/syscalls/linux/prctl.cc
index d07571a5f..04c5161f5 100644
--- a/test/syscalls/linux/prctl.cc
+++ b/test/syscalls/linux/prctl.cc
@@ -226,5 +226,5 @@ int main(int argc, char** argv) {
          prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/prctl_setuid.cc b/test/syscalls/linux/prctl_setuid.cc
index 30f0d75b3..c4e9cf528 100644
--- a/test/syscalls/linux/prctl_setuid.cc
+++ b/test/syscalls/linux/prctl_setuid.cc
@@ -264,5 +264,5 @@ int main(int argc, char** argv) {
            prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0);
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index a23fdb58d..f91187e75 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -2076,5 +2076,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index 4dd5cf27b..bfe3e2603 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -1208,5 +1208,5 @@ int main(int argc, char** argv) {
     gvisor::testing::RunExecveChild();
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/rtsignal.cc b/test/syscalls/linux/rtsignal.cc
index 81d193ffd..ed27e2566 100644
--- a/test/syscalls/linux/rtsignal.cc
+++ b/test/syscalls/linux/rtsignal.cc
@@ -167,6 +167,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 2c947feb7..cf6499f8b 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -411,5 +411,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/sigiret.cc b/test/syscalls/linux/sigiret.cc
index 4deb1ae95..6227774a4 100644
--- a/test/syscalls/linux/sigiret.cc
+++ b/test/syscalls/linux/sigiret.cc
@@ -132,6 +132,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/signalfd.cc b/test/syscalls/linux/signalfd.cc
index 95be4b66c..389e5fca2 100644
--- a/test/syscalls/linux/signalfd.cc
+++ b/test/syscalls/linux/signalfd.cc
@@ -369,5 +369,5 @@ int main(int argc, char** argv) {
 
   gvisor::testing::TestInit(&argc, &argv);
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/sigstop.cc b/test/syscalls/linux/sigstop.cc
index 7db57d968..b2fcedd62 100644
--- a/test/syscalls/linux/sigstop.cc
+++ b/test/syscalls/linux/sigstop.cc
@@ -147,5 +147,5 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/sigtimedwait.cc b/test/syscalls/linux/sigtimedwait.cc
index 1e5bf5942..4f8afff15 100644
--- a/test/syscalls/linux/sigtimedwait.cc
+++ b/test/syscalls/linux/sigtimedwait.cc
@@ -319,6 +319,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc
index 2f92c27da..4b3c44527 100644
--- a/test/syscalls/linux/timers.cc
+++ b/test/syscalls/linux/timers.cc
@@ -658,5 +658,5 @@ int main(int argc, char** argv) {
     }
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/vfork.cc b/test/syscalls/linux/vfork.cc
index 0aaba482d..19d05998e 100644
--- a/test/syscalls/linux/vfork.cc
+++ b/test/syscalls/linux/vfork.cc
@@ -191,5 +191,5 @@ int main(int argc, char** argv) {
     return gvisor::testing::RunChild();
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
deleted file mode 100644
index ae342b68c..000000000
--- a/test/syscalls/syscall_test_runner.go
+++ /dev/null
@@ -1,482 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Binary syscall_test_runner runs the syscall test suites in gVisor
-// containers and on the host platform.
-package main
-
-import (
-	"flag"
-	"fmt"
-	"io/ioutil"
-	"os"
-	"os/exec"
-	"os/signal"
-	"path/filepath"
-	"strings"
-	"syscall"
-	"testing"
-	"time"
-
-	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
-	"gvisor.dev/gvisor/test/syscalls/gtest"
-	"gvisor.dev/gvisor/test/uds"
-)
-
-// Location of syscall tests, relative to the repo root.
-const testDir = "test/syscalls/linux"
-
-var (
-	testName   = flag.String("test-name", "", "name of test binary to run")
-	debug      = flag.Bool("debug", false, "enable debug logs")
-	strace     = flag.Bool("strace", false, "enable strace logs")
-	platform   = flag.String("platform", "ptrace", "platform to run on")
-	network    = flag.String("network", "none", "network stack to run on (sandbox, host, none)")
-	useTmpfs   = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp")
-	fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode")
-	overlay    = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
-	parallel   = flag.Bool("parallel", false, "run tests in parallel")
-	runscPath  = flag.String("runsc", "", "path to runsc binary")
-
-	addUDSTree = flag.Bool("add-uds-tree", false, "expose a tree of UDS utilities for use in tests")
-)
-
-// runTestCaseNative runs the test case directly on the host machine.
-func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) {
-	// These tests might be running in parallel, so make sure they have a
-	// unique test temp dir.
-	tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
-	if err != nil {
-		t.Fatalf("could not create temp dir: %v", err)
-	}
-	defer os.RemoveAll(tmpDir)
-
-	// Replace TEST_TMPDIR in the current environment with something
-	// unique.
-	env := os.Environ()
-	newEnvVar := "TEST_TMPDIR=" + tmpDir
-	var found bool
-	for i, kv := range env {
-		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
-			env[i] = newEnvVar
-			found = true
-			break
-		}
-	}
-	if !found {
-		env = append(env, newEnvVar)
-	}
-	// Remove env variables that cause the gunit binary to write output
-	// files, since they will stomp on eachother, and on the output files
-	// from this go test.
-	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
-
-	// Remove shard env variables so that the gunit binary does not try to
-	// intepret them.
-	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
-
-	if *addUDSTree {
-		socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
-		if err != nil {
-			t.Fatalf("failed to create socket tree: %v", err)
-		}
-		defer cleanup()
-
-		env = append(env, "TEST_UDS_TREE="+socketDir)
-		// On Linux, the concept of "attach" location doesn't exist.
-		// Just pass the same path to make these test identical.
-		env = append(env, "TEST_UDS_ATTACH_TREE="+socketDir)
-	}
-
-	cmd := exec.Command(testBin, gtest.FilterTestFlag+"="+tc.FullName())
-	cmd.Env = env
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-	if err := cmd.Run(); err != nil {
-		ws := err.(*exec.ExitError).Sys().(syscall.WaitStatus)
-		t.Errorf("test %q exited with status %d, want 0", tc.FullName(), ws.ExitStatus())
-	}
-}
-
-// runRunsc runs spec in runsc in a standard test configuration.
-//
-// runsc logs will be saved to a path in TEST_UNDECLARED_OUTPUTS_DIR.
-//
-// Returns an error if the sandboxed application exits non-zero.
-func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
-	bundleDir, err := testutil.SetupBundleDir(spec)
-	if err != nil {
-		return fmt.Errorf("SetupBundleDir failed: %v", err)
-	}
-	defer os.RemoveAll(bundleDir)
-
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		return fmt.Errorf("SetupRootDir failed: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
-	name := tc.FullName()
-	id := testutil.UniqueContainerID()
-	log.Infof("Running test %q in container %q", name, id)
-	specutils.LogSpec(spec)
-
-	args := []string{
-		"-root", rootDir,
-		"-network", *network,
-		"-log-format=text",
-		"-TESTONLY-unsafe-nonroot=true",
-		"-net-raw=true",
-		fmt.Sprintf("-panic-signal=%d", syscall.SIGTERM),
-		"-watchdog-action=panic",
-		"-platform", *platform,
-		"-file-access", *fileAccess,
-	}
-	if *overlay {
-		args = append(args, "-overlay")
-	}
-	if *debug {
-		args = append(args, "-debug", "-log-packets=true")
-	}
-	if *strace {
-		args = append(args, "-strace")
-	}
-	if *addUDSTree {
-		args = append(args, "-fsgofer-host-uds")
-	}
-
-	if outDir, ok := syscall.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
-		tdir := filepath.Join(outDir, strings.Replace(name, "/", "_", -1))
-		if err := os.MkdirAll(tdir, 0755); err != nil {
-			return fmt.Errorf("could not create test dir: %v", err)
-		}
-		debugLogDir, err := ioutil.TempDir(tdir, "runsc")
-		if err != nil {
-			return fmt.Errorf("could not create temp dir: %v", err)
-		}
-		debugLogDir += "/"
-		log.Infof("runsc logs: %s", debugLogDir)
-		args = append(args, "-debug-log", debugLogDir)
-
-		// Default -log sends messages to stderr which makes reading the test log
-		// difficult. Instead, drop them when debug log is enabled given it's a
-		// better place for these messages.
-		args = append(args, "-log=/dev/null")
-	}
-
-	// Current process doesn't have CAP_SYS_ADMIN, create user namespace and run
-	// as root inside that namespace to get it.
-	rArgs := append(args, "run", "--bundle", bundleDir, id)
-	cmd := exec.Command(*runscPath, rArgs...)
-	cmd.SysProcAttr = &syscall.SysProcAttr{
-		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
-		// Set current user/group as root inside the namespace.
-		UidMappings: []syscall.SysProcIDMap{
-			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
-		},
-		GidMappings: []syscall.SysProcIDMap{
-			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
-		},
-		GidMappingsEnableSetgroups: false,
-		Credential: &syscall.Credential{
-			Uid: 0,
-			Gid: 0,
-		},
-	}
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-	sig := make(chan os.Signal, 1)
-	signal.Notify(sig, syscall.SIGTERM)
-	go func() {
-		s, ok := <-sig
-		if !ok {
-			return
-		}
-		log.Warningf("%s: Got signal: %v", name, s)
-		done := make(chan bool)
-		dArgs := append([]string{}, args...)
-		dArgs = append(dArgs, "-alsologtostderr=true", "debug", "--stacks", id)
-		go func(dArgs []string) {
-			cmd := exec.Command(*runscPath, dArgs...)
-			cmd.Stdout = os.Stdout
-			cmd.Stderr = os.Stderr
-			cmd.Run()
-			done <- true
-		}(dArgs)
-
-		timeout := time.After(3 * time.Second)
-		select {
-		case <-timeout:
-			log.Infof("runsc debug --stacks is timeouted")
-		case <-done:
-		}
-
-		log.Warningf("Send SIGTERM to the sandbox process")
-		dArgs = append(args, "debug",
-			fmt.Sprintf("--signal=%d", syscall.SIGTERM),
-			id)
-		cmd := exec.Command(*runscPath, dArgs...)
-		cmd.Stdout = os.Stdout
-		cmd.Stderr = os.Stderr
-		cmd.Run()
-	}()
-
-	err = cmd.Run()
-
-	signal.Stop(sig)
-	close(sig)
-
-	return err
-}
-
-// setupUDSTree updates the spec to expose a UDS tree for gofer socket testing.
-func setupUDSTree(spec *specs.Spec) (cleanup func(), err error) {
-	socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
-	if err != nil {
-		return nil, fmt.Errorf("failed to create socket tree: %v", err)
-	}
-
-	// Standard access to entire tree.
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets",
-		Source:      socketDir,
-		Type:        "bind",
-	})
-
-	// Individial attach points for each socket to test mounts that attach
-	// directly to the sockets.
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/stream/echo",
-		Source:      filepath.Join(socketDir, "stream/echo"),
-		Type:        "bind",
-	})
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/stream/nonlistening",
-		Source:      filepath.Join(socketDir, "stream/nonlistening"),
-		Type:        "bind",
-	})
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/seqpacket/echo",
-		Source:      filepath.Join(socketDir, "seqpacket/echo"),
-		Type:        "bind",
-	})
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/seqpacket/nonlistening",
-		Source:      filepath.Join(socketDir, "seqpacket/nonlistening"),
-		Type:        "bind",
-	})
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/dgram/null",
-		Source:      filepath.Join(socketDir, "dgram/null"),
-		Type:        "bind",
-	})
-
-	spec.Process.Env = append(spec.Process.Env, "TEST_UDS_TREE=/tmp/sockets")
-	spec.Process.Env = append(spec.Process.Env, "TEST_UDS_ATTACH_TREE=/tmp/sockets-attach")
-
-	return cleanup, nil
-}
-
-// runsTestCaseRunsc runs the test case in runsc.
-func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
-	// Run a new container with the test executable and filter for the
-	// given test suite and name.
-	spec := testutil.NewSpecWithArgs(testBin, gtest.FilterTestFlag+"="+tc.FullName())
-
-	// Mark the root as writeable, as some tests attempt to
-	// write to the rootfs, and expect EACCES, not EROFS.
-	spec.Root.Readonly = false
-
-	// Test spec comes with pre-defined mounts that we don't want. Reset it.
-	spec.Mounts = nil
-	if *useTmpfs {
-		// Forces '/tmp' to be mounted as tmpfs, otherwise test that rely on
-		// features only available in gVisor's internal tmpfs may fail.
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Destination: "/tmp",
-			Type:        "tmpfs",
-		})
-	} else {
-		// Use a gofer-backed directory as '/tmp'.
-		//
-		// Tests might be running in parallel, so make sure each has a
-		// unique test temp dir.
-		//
-		// Some tests (e.g., sticky) access this mount from other
-		// users, so make sure it is world-accessible.
-		tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
-		if err != nil {
-			t.Fatalf("could not create temp dir: %v", err)
-		}
-		defer os.RemoveAll(tmpDir)
-
-		if err := os.Chmod(tmpDir, 0777); err != nil {
-			t.Fatalf("could not chmod temp dir: %v", err)
-		}
-
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Type:        "bind",
-			Destination: "/tmp",
-			Source:      tmpDir,
-		})
-	}
-
-	// Set environment variables that indicate we are
-	// running in gVisor with the given platform and network.
-	platformVar := "TEST_ON_GVISOR"
-	networkVar := "GVISOR_NETWORK"
-	env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network)
-
-	// Remove env variables that cause the gunit binary to write output
-	// files, since they will stomp on eachother, and on the output files
-	// from this go test.
-	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
-
-	// Remove shard env variables so that the gunit binary does not try to
-	// intepret them.
-	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
-
-	// Set TEST_TMPDIR to /tmp, as some of the syscall tests require it to
-	// be backed by tmpfs.
-	for i, kv := range env {
-		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
-			env[i] = "TEST_TMPDIR=/tmp"
-			break
-		}
-	}
-
-	spec.Process.Env = env
-
-	if *addUDSTree {
-		cleanup, err := setupUDSTree(spec)
-		if err != nil {
-			t.Fatalf("error creating UDS tree: %v", err)
-		}
-		defer cleanup()
-	}
-
-	if err := runRunsc(tc, spec); err != nil {
-		t.Errorf("test %q failed with error %v, want nil", tc.FullName(), err)
-	}
-}
-
-// filterEnv returns an environment with the blacklisted variables removed.
-func filterEnv(env, blacklist []string) []string {
-	var out []string
-	for _, kv := range env {
-		ok := true
-		for _, k := range blacklist {
-			if strings.HasPrefix(kv, k+"=") {
-				ok = false
-				break
-			}
-		}
-		if ok {
-			out = append(out, kv)
-		}
-	}
-	return out
-}
-
-func fatalf(s string, args ...interface{}) {
-	fmt.Fprintf(os.Stderr, s+"\n", args...)
-	os.Exit(1)
-}
-
-func matchString(a, b string) (bool, error) {
-	return a == b, nil
-}
-
-func main() {
-	flag.Parse()
-	if *testName == "" {
-		fatalf("test-name flag must be provided")
-	}
-
-	log.SetLevel(log.Info)
-	if *debug {
-		log.SetLevel(log.Debug)
-	}
-
-	if *platform != "native" && *runscPath == "" {
-		if err := testutil.ConfigureExePath(); err != nil {
-			panic(err.Error())
-		}
-		*runscPath = specutils.ExePath
-	}
-
-	// Make sure stdout and stderr are opened with O_APPEND, otherwise logs
-	// from outside the sandbox can (and will) stomp on logs from inside
-	// the sandbox.
-	for _, f := range []*os.File{os.Stdout, os.Stderr} {
-		flags, err := unix.FcntlInt(f.Fd(), unix.F_GETFL, 0)
-		if err != nil {
-			fatalf("error getting file flags for %v: %v", f, err)
-		}
-		if flags&unix.O_APPEND == 0 {
-			flags |= unix.O_APPEND
-			if _, err := unix.FcntlInt(f.Fd(), unix.F_SETFL, flags); err != nil {
-				fatalf("error setting file flags for %v: %v", f, err)
-			}
-		}
-	}
-
-	// Get path to test binary.
-	fullTestName := filepath.Join(testDir, *testName)
-	testBin, err := testutil.FindFile(fullTestName)
-	if err != nil {
-		fatalf("FindFile(%q) failed: %v", fullTestName, err)
-	}
-
-	// Get all test cases in each binary.
-	testCases, err := gtest.ParseTestCases(testBin)
-	if err != nil {
-		fatalf("ParseTestCases(%q) failed: %v", testBin, err)
-	}
-
-	// Get subset of tests corresponding to shard.
-	indices, err := testutil.TestIndicesForShard(len(testCases))
-	if err != nil {
-		fatalf("TestsForShard() failed: %v", err)
-	}
-
-	// Run the tests.
-	var tests []testing.InternalTest
-	for _, tci := range indices {
-		// Capture tc.
-		tc := testCases[tci]
-		testName := fmt.Sprintf("%s_%s", tc.Suite, tc.Name)
-		tests = append(tests, testing.InternalTest{
-			Name: testName,
-			F: func(t *testing.T) {
-				if *parallel {
-					t.Parallel()
-				}
-				if *platform == "native" {
-					// Run the test case on host.
-					runTestCaseNative(testBin, tc, t)
-				} else {
-					// Run the test case in runsc.
-					runTestCaseRunsc(testBin, tc, t)
-				}
-			},
-		})
-	}
-
-	testing.Main(matchString, tests, nil, nil)
-}
diff --git a/test/syscalls/syscall_test_runner.sh b/test/syscalls/syscall_test_runner.sh
deleted file mode 100755
index 864bb2de4..000000000
--- a/test/syscalls/syscall_test_runner.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# syscall_test_runner.sh is a simple wrapper around the go syscall test runner.
-# It exists so that we can build the syscall test runner once, and use it for
-# all syscall tests, rather than build it for each test run.
-
-set -euf -x -o pipefail
-
-echo -- "$@"
-
-if [[ -n "${TEST_UNDECLARED_OUTPUTS_DIR}" ]]; then
-  mkdir -p "${TEST_UNDECLARED_OUTPUTS_DIR}"
-  chmod a+rwx "${TEST_UNDECLARED_OUTPUTS_DIR}"
-fi
-
-# Get location of syscall_test_runner binary.
-readonly runner=$(find "${TEST_SRCDIR}" -name syscall_test_runner)
-
-# Pass the arguments of this script directly to the runner.
-exec "${runner}" "$@"
diff --git a/test/util/BUILD b/test/util/BUILD
index 1f22ebe29..8b5a0f25c 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_library", "cc_test", "gtest", "select_system")
+load("//tools:defs.bzl", "cc_library", "cc_test", "gbenchmark", "gtest", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -260,6 +260,7 @@ cc_library(
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
         gtest,
+        gbenchmark,
     ],
 )
 
diff --git a/test/util/test_main.cc b/test/util/test_main.cc
index 5c7ee0064..1f389e58f 100644
--- a/test/util/test_main.cc
+++ b/test/util/test_main.cc
@@ -16,5 +16,5 @@
 
 int main(int argc, char** argv) {
   gvisor::testing::TestInit(&argc, &argv);
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/util/test_util.h b/test/util/test_util.h
index 2d22b0eb8..c5cb9d6d6 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -771,6 +771,7 @@ std::string RunfilePath(std::string path);
 #endif
 
 void TestInit(int* argc, char*** argv);
+int RunAllTests(void);
 
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/util/test_util_impl.cc b/test/util/test_util_impl.cc
index ba7c0a85b..7e1ad9e66 100644
--- a/test/util/test_util_impl.cc
+++ b/test/util/test_util_impl.cc
@@ -17,8 +17,12 @@
 #include "gtest/gtest.h"
 #include "absl/flags/flag.h"
 #include "absl/flags/parse.h"
+#include "benchmark/benchmark.h"
 #include "test/util/logging.h"
 
+extern bool FLAGS_benchmark_list_tests;
+extern std::string FLAGS_benchmark_filter;
+
 namespace gvisor {
 namespace testing {
 
@@ -26,6 +30,7 @@ void SetupGvisorDeathTest() {}
 
 void TestInit(int* argc, char*** argv) {
   ::testing::InitGoogleTest(argc, *argv);
+  benchmark::Initialize(argc, *argv);
   ::absl::ParseCommandLine(*argc, *argv);
 
   // Always mask SIGPIPE as it's common and tests aren't expected to handle it.
@@ -34,5 +39,14 @@ void TestInit(int* argc, char*** argv) {
   TEST_CHECK(sigaction(SIGPIPE, &sa, nullptr) == 0);
 }
 
+int RunAllTests() {
+  if (FLAGS_benchmark_list_tests || FLAGS_benchmark_filter != ".") {
+    benchmark::RunSpecifiedBenchmarks();
+    return 0;
+  } else {
+    return RUN_ALL_TESTS();
+  }
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index 6798362dc..6f091d759 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -21,6 +21,7 @@ go_image = _go_image
 go_embed_data = _go_embed_data
 go_suffixes = _go_suffixes
 gtest = "@com_google_googletest//:gtest"
+gbenchmark = "@com_google_benchmark//:benchmark"
 loopback = "//tools/bazeldefs:loopback"
 proto_library = native.proto_library
 pkg_deb = _pkg_deb
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 39f035f12..4eece2d83 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,7 +7,7 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/bazeldefs:defs.bzl", "go_suffixes", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/bazeldefs:defs.bzl", "go_suffixes", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
 
 # Delegate directly.
 cc_binary = _cc_binary
@@ -21,6 +21,7 @@ go_image = _go_image
 go_test = _go_test
 go_tool_library = _go_tool_library
 gtest = _gtest
+gbenchmark = _gbenchmark
 pkg_deb = _pkg_deb
 pkg_tar = _pkg_tar
 py_library = _py_library
-- 
cgit v1.2.3


From ec5630527bc4473081048d2d13d1dcfadc6c7cdd Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 19 Feb 2020 18:27:48 -0800
Subject: Add statefile command to runsc.

PiperOrigin-RevId: 296105337
---
 runsc/cmd/BUILD        |   3 ++
 runsc/cmd/statefile.go | 143 +++++++++++++++++++++++++++++++++++++++++++++++++
 runsc/main.go          |   3 +-
 3 files changed, 148 insertions(+), 1 deletion(-)
 create mode 100644 runsc/cmd/statefile.go

diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index 2a88b85a9..d0bb4613a 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -31,6 +31,7 @@ go_library(
         "spec.go",
         "start.go",
         "state.go",
+        "statefile.go",
         "syscalls.go",
         "wait.go",
     ],
@@ -43,6 +44,8 @@ go_library(
         "//pkg/sentry/control",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/state",
+        "//pkg/state/statefile",
         "//pkg/sync",
         "//pkg/unet",
         "//pkg/urpc",
diff --git a/runsc/cmd/statefile.go b/runsc/cmd/statefile.go
new file mode 100644
index 000000000..e6f1907da
--- /dev/null
+++ b/runsc/cmd/statefile.go
@@ -0,0 +1,143 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"fmt"
+	"os"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/state"
+	"gvisor.dev/gvisor/pkg/state/statefile"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// Statefile implements subcommands.Command for the "statefile" command.
+type Statefile struct {
+	list   bool
+	get    string
+	key    string
+	output string
+	html   bool
+}
+
+// Name implements subcommands.Command.
+func (*Statefile) Name() string {
+	return "state"
+}
+
+// Synopsis implements subcommands.Command.
+func (*Statefile) Synopsis() string {
+	return "shows information about a statefile"
+}
+
+// Usage implements subcommands.Command.
+func (*Statefile) Usage() string {
+	return `statefile [flags] <statefile>`
+}
+
+// SetFlags implements subcommands.Command.
+func (s *Statefile) SetFlags(f *flag.FlagSet) {
+	f.BoolVar(&s.list, "list", false, "lists the metdata in the statefile.")
+	f.StringVar(&s.get, "get", "", "extracts the given metadata key.")
+	f.StringVar(&s.key, "key", "", "the integrity key for the file.")
+	f.StringVar(&s.output, "output", "", "target to write the result.")
+	f.BoolVar(&s.html, "html", false, "outputs in HTML format.")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (s *Statefile) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	// Check arguments.
+	if s.list && s.get != "" {
+		Fatalf("error: can't specify -list and -get simultaneously.")
+	}
+
+	// Setup output.
+	var output = os.Stdout // Default.
+	if s.output != "" {
+		f, err := os.OpenFile(s.output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0644)
+		if err != nil {
+			Fatalf("error opening output: %v", err)
+		}
+		defer func() {
+			if err := f.Close(); err != nil {
+				Fatalf("error flushing output: %v", err)
+			}
+		}()
+		output = f
+	}
+
+	// Open the file.
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+	input, err := os.Open(f.Arg(0))
+	if err != nil {
+		Fatalf("error opening input: %v\n", err)
+	}
+
+	if s.html {
+		fmt.Fprintf(output, "<html><body>\n")
+		defer fmt.Fprintf(output, "</body></html>\n")
+	}
+
+	// Dump the full file?
+	if !s.list && s.get == "" {
+		var key []byte
+		if s.key != "" {
+			key = []byte(s.key)
+		}
+		rc, _, err := statefile.NewReader(input, key)
+		if err != nil {
+			Fatalf("error parsing statefile: %v", err)
+		}
+		if err := state.PrettyPrint(output, rc, s.html); err != nil {
+			Fatalf("error printing state: %v", err)
+		}
+		return subcommands.ExitSuccess
+	}
+
+	// Load just the metadata.
+	metadata, err := statefile.MetadataUnsafe(input)
+	if err != nil {
+		Fatalf("error reading metadata: %v", err)
+	}
+
+	// Is it a single key?
+	if s.get != "" {
+		val, ok := metadata[s.get]
+		if !ok {
+			Fatalf("metadata key %s: not found", s.get)
+		}
+		fmt.Fprintf(output, "%s\n", val)
+		return subcommands.ExitSuccess
+	}
+
+	// List all keys.
+	if s.html {
+		fmt.Fprintf(output, " <ul>\n")
+		defer fmt.Fprintf(output, " </ul>\n")
+	}
+	for key := range metadata {
+		if s.html {
+			fmt.Fprintf(output, "  <li>%s</li>\n", key)
+		} else {
+			fmt.Fprintf(output, "%s\n", key)
+		}
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/main.go b/runsc/main.go
index 762b0f801..af73bed97 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -116,8 +116,8 @@ func main() {
 	subcommands.Register(new(cmd.Resume), "")
 	subcommands.Register(new(cmd.Run), "")
 	subcommands.Register(new(cmd.Spec), "")
-	subcommands.Register(new(cmd.Start), "")
 	subcommands.Register(new(cmd.State), "")
+	subcommands.Register(new(cmd.Start), "")
 	subcommands.Register(new(cmd.Wait), "")
 
 	// Register internal commands with the internal group name. This causes
@@ -127,6 +127,7 @@ func main() {
 	subcommands.Register(new(cmd.Boot), internalGroup)
 	subcommands.Register(new(cmd.Debug), internalGroup)
 	subcommands.Register(new(cmd.Gofer), internalGroup)
+	subcommands.Register(new(cmd.Statefile), internalGroup)
 
 	// All subcommands must be registered before flag parsing.
 	flag.Parse()
-- 
cgit v1.2.3


From de68e1d8c437e6234f5e413d7cad6f892a4452d3 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Thu, 20 Feb 2020 01:12:03 -0500
Subject: Code Clean:Move getUserRegisters into dieArchSetup() and other small
 changes.

Consistent with QEMU, getUserRegisters() should be an arch-specific
function. So, it should be called in dieArchSetup().

With this patch and the pagetable/pcid patch, the kvm modules on Arm64 can be
built successfully.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/kvm/bluepill.go              |  6 ------
 pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go |  6 ++++++
 pkg/sentry/platform/kvm/machine_arm64_unsafe.go  | 24 ------------------------
 3 files changed, 6 insertions(+), 30 deletions(-)

diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
index 35cd55fef..4b23f7803 100644
--- a/pkg/sentry/platform/kvm/bluepill.go
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -81,12 +81,6 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) {
 	// Save the death message, which will be thrown.
 	c.dieState.message = msg
 
-	// Reload all registers to have an accurate stack trace when we return
-	// to host mode. This means that the stack should be unwound correctly.
-	if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 {
-		throw(msg)
-	}
-
 	// Setup the trampoline.
 	dieArchSetup(c, context, &c.dieState.guestRegs)
 }
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
index a63a6a071..99cac665d 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -31,6 +31,12 @@ import (
 //
 //go:nosplit
 func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
+	// Reload all registers to have an accurate stack trace when we return
+	// to host mode. This means that the stack should be unwound correctly.
+	if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 {
+		throw(c.dieState.message)
+	}
+
 	// If the vCPU is in user mode, we set the stack to the stored stack
 	// value in the vCPU itself. We don't want to unwind the user stack.
 	if guestRegs.RFLAGS&ring0.UserFlagsSet == ring0.UserFlagsSet {
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 1c8384e6b..b531f2f85 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -29,30 +29,6 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// setMemoryRegion initializes a region.
-//
-// This may be called from bluepillHandler, and therefore returns an errno
-// directly (instead of wrapping in an error) to avoid allocations.
-//
-//go:nosplit
-func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno {
-	userRegion := userMemoryRegion{
-		slot:          uint32(slot),
-		flags:         0,
-		guestPhysAddr: uint64(physical),
-		memorySize:    uint64(length),
-		userspaceAddr: uint64(virtual),
-	}
-
-	// Set the region.
-	_, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(m.fd),
-		_KVM_SET_USER_MEMORY_REGION,
-		uintptr(unsafe.Pointer(&userRegion)))
-	return errno
-}
-
 type kvmVcpuInit struct {
 	target   uint32
 	features [7]uint32
-- 
cgit v1.2.3


From a369c88c0c4ece5239855000d28df045111c1be7 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Wed, 29 Jan 2020 04:33:06 -0500
Subject: Lazy-fpsimd support patch series#1: add Arm64-fpsimd support to arch
 module

This patch defines the structures and
adds the implementations for fpsimd initialization.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/arch/arch_aarch64.go | 30 +++++++++++++++++++-----------
 pkg/sentry/arch/arch_arm64.go   | 16 +++++++++++++---
 pkg/sentry/arch/signal_arm64.go | 19 ++++++++++++++++---
 3 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index 3b6987665..53039465d 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -32,29 +32,35 @@ import (
 const (
 	// SyscallWidth is the width of insturctions.
 	SyscallWidth = 4
+
+	// fpsimdMagic is the magic number which is used in fpsimd_context.
+	fpsimdMagic = 0x46508001
+
+	// fpsimdContextSize is the size of fpsimd_context.
+	fpsimdContextSize = 0x210
 )
 
 // aarch64FPState is aarch64 floating point state.
 type aarch64FPState []byte
 
-// initAarch64FPState (defined in asm files) sets up initial state.
-func initAarch64FPState(data *FloatingPointData) {
-	// TODO(gvisor.dev/issue/1238): floating-point is not supported.
+// initAarch64FPState sets up initial state.
+func initAarch64FPState(data aarch64FPState) {
+	binary.LittleEndian.PutUint32(data, fpsimdMagic)
+	binary.LittleEndian.PutUint32(data[4:], fpsimdContextSize)
 }
 
 func newAarch64FPStateSlice() []byte {
-	return alignedBytes(4096, 32)[:4096]
+	return alignedBytes(4096, 16)[:fpsimdContextSize]
 }
 
 // newAarch64FPState returns an initialized floating point state.
 //
 // The returned state is large enough to store all floating point state
 // supported by host, even if the app won't use much of it due to a restricted
-// FeatureSet. Since they may still be able to see state not advertised by
-// CPUID we must ensure it does not contain any sentry state.
+// FeatureSet.
 func newAarch64FPState() aarch64FPState {
 	f := aarch64FPState(newAarch64FPStateSlice())
-	initAarch64FPState(f.FloatingPointData())
+	initAarch64FPState(f)
 	return f
 }
 
@@ -133,10 +139,10 @@ func (s State) Proto() *rpb.Registers {
 
 // Fork creates and returns an identical copy of the state.
 func (s *State) Fork() State {
-	// TODO(gvisor.dev/issue/1238): floating-point is not supported.
 	return State{
-		Regs:       s.Regs,
-		FeatureSet: s.FeatureSet,
+		Regs:           s.Regs,
+		aarch64FPState: s.aarch64FPState.fork(),
+		FeatureSet:     s.FeatureSet,
 	}
 }
 
@@ -285,8 +291,10 @@ func New(arch Arch, fs *cpuid.FeatureSet) Context {
 	case ARM64:
 		return &context64{
 			State{
-				FeatureSet: fs,
+				aarch64FPState: newAarch64FPState(),
+				FeatureSet:     fs,
 			},
+			[]aarch64FPState(nil),
 		}
 	}
 	panic(fmt.Sprintf("unknown architecture %v", arch))
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index 94f1a808f..372b650b9 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -68,6 +68,7 @@ const (
 // context64 represents an ARM64 context.
 type context64 struct {
 	State
+	sigFPState []aarch64FPState // fpstate to be restored on sigreturn.
 }
 
 // Arch implements Context.Arch.
@@ -75,10 +76,19 @@ func (c *context64) Arch() Arch {
 	return ARM64
 }
 
+func (c *context64) copySigFPState() []aarch64FPState {
+	var sigfps []aarch64FPState
+	for _, s := range c.sigFPState {
+		sigfps = append(sigfps, s.fork())
+	}
+	return sigfps
+}
+
 // Fork returns an exact copy of this context.
 func (c *context64) Fork() Context {
 	return &context64{
-		State: c.State.Fork(),
+		State:      c.State.Fork(),
+		sigFPState: c.copySigFPState(),
 	}
 }
 
@@ -137,8 +147,8 @@ func (c *context64) SetTLS(value uintptr) bool {
 	return false
 }
 
-// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP.
-func (c *context64) SetRSEQInterruptedIP(value uintptr) {
+// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
+func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
 	c.Regs.Regs[3] = uint64(value)
 }
 
diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go
index 4f4cc46a8..28615f97f 100644
--- a/pkg/sentry/arch/signal_arm64.go
+++ b/pkg/sentry/arch/signal_arm64.go
@@ -30,14 +30,27 @@ type SignalContext64 struct {
 	Sp        uint64
 	Pc        uint64
 	Pstate    uint64
-	_pad      [8]byte // __attribute__((__aligned__(16)))
-	Reserved  [4096]uint8
+	_pad      [8]byte       // __attribute__((__aligned__(16)))
+	Fpsimd64  FpsimdContext // size = 528
+	Reserved  [3568]uint8
+}
+
+type aarch64Ctx struct {
+	Magic uint32
+	Size  uint32
+}
+
+type FpsimdContext struct {
+	Head  aarch64Ctx
+	Fpsr  uint32
+	Fpcr  uint32
+	Vregs [64]uint64 // actually [32]uint128
 }
 
 // UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h).
 type UContext64 struct {
 	Flags  uint64
-	Link   *UContext64
+	Link   uint64
 	Stack  SignalStack
 	Sigset linux.SignalSet
 	// glibc uses a 1024-bit sigset_t
-- 
cgit v1.2.3


From 10ed60e4778e01a2813eea6e3c201826f75982a5 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 09:57:06 -0800
Subject: VFS2: Support memory mapping in tmpfs.

tmpfs.fileDescription now implements ConfigureMMap. And tmpfs.regularFile
implement memmap.Mappable. The methods are mostly unchanged from VFS1 tmpfs.

PiperOrigin-RevId: 296234557
---
 pkg/sentry/fsimpl/tmpfs/filesystem.go   |   8 +-
 pkg/sentry/fsimpl/tmpfs/regular_file.go | 233 +++++++++++++++++++++++++++-----
 pkg/sentry/fsimpl/tmpfs/tmpfs.go        |  16 ++-
 3 files changed, 213 insertions(+), 44 deletions(-)

diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 7f7b791c4..e1b551422 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -16,7 +16,6 @@ package tmpfs
 
 import (
 	"fmt"
-	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -347,10 +346,9 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 			return nil, err
 		}
 		if opts.Flags&linux.O_TRUNC != 0 {
-			impl.mu.Lock()
-			impl.data.Truncate(0, impl.memFile)
-			atomic.StoreUint64(&impl.size, 0)
-			impl.mu.Unlock()
+			if _, err := impl.truncate(0); err != nil {
+				return nil, err
+			}
 		}
 		return &fd.vfsfd, nil
 	case *directory:
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index dab346a41..711442424 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -15,6 +15,7 @@
 package tmpfs
 
 import (
+	"fmt"
 	"io"
 	"math"
 	"sync/atomic"
@@ -22,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -34,25 +36,53 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// regularFile is a regular (=S_IFREG) tmpfs file.
 type regularFile struct {
 	inode inode
 
 	// memFile is a platform.File used to allocate pages to this regularFile.
 	memFile *pgalloc.MemoryFile
 
-	// mu protects the fields below.
-	mu sync.RWMutex
+	// mapsMu protects mappings.
+	mapsMu sync.Mutex `state:"nosave"`
+
+	// mappings tracks mappings of the file into memmap.MappingSpaces.
+	//
+	// Protected by mapsMu.
+	mappings memmap.MappingSet
+
+	// writableMappingPages tracks how many pages of virtual memory are mapped
+	// as potentially writable from this file. If a page has multiple mappings,
+	// each mapping is counted separately.
+	//
+	// This counter is susceptible to overflow as we can potentially count
+	// mappings from many VMAs. We count pages rather than bytes to slightly
+	// mitigate this.
+	//
+	// Protected by mapsMu.
+	writableMappingPages uint64
+
+	// dataMu protects the fields below.
+	dataMu sync.RWMutex
 
 	// data maps offsets into the file to offsets into memFile that store
 	// the file's data.
+	//
+	// Protected by dataMu.
 	data fsutil.FileRangeSet
 
-	// size is the size of data, but accessed using atomic memory
-	// operations to avoid locking in inode.stat().
-	size uint64
-
 	// seals represents file seals on this inode.
+	//
+	// Protected by dataMu.
 	seals uint32
+
+	// size is the size of data.
+	//
+	// Protected by both dataMu and inode.mu; reading it requires holding
+	// either mutex, while writing requires holding both AND using atomics.
+	// Readers that do not require consistency (like Stat) may read the
+	// value atomically without holding either lock.
+	size uint64
 }
 
 func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
@@ -66,39 +96,170 @@ func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMod
 
 // truncate grows or shrinks the file to the given size. It returns true if the
 // file size was updated.
-func (rf *regularFile) truncate(size uint64) (bool, error) {
-	rf.mu.Lock()
-	defer rf.mu.Unlock()
+func (rf *regularFile) truncate(newSize uint64) (bool, error) {
+	rf.inode.mu.Lock()
+	defer rf.inode.mu.Unlock()
+	return rf.truncateLocked(newSize)
+}
 
-	if size == rf.size {
+// Preconditions: rf.inode.mu must be held.
+func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) {
+	oldSize := rf.size
+	if newSize == oldSize {
 		// Nothing to do.
 		return false, nil
 	}
 
-	if size > rf.size {
-		// Growing the file.
+	// Need to hold inode.mu and dataMu while modifying size.
+	rf.dataMu.Lock()
+	if newSize > oldSize {
+		// Can we grow the file?
 		if rf.seals&linux.F_SEAL_GROW != 0 {
-			// Seal does not allow growth.
+			rf.dataMu.Unlock()
 			return false, syserror.EPERM
 		}
-		rf.size = size
+		// We only need to update the file size.
+		atomic.StoreUint64(&rf.size, newSize)
+		rf.dataMu.Unlock()
 		return true, nil
 	}
 
-	// Shrinking the file
+	// We are shrinking the file. First check if this is allowed.
 	if rf.seals&linux.F_SEAL_SHRINK != 0 {
-		// Seal does not allow shrink.
+		rf.dataMu.Unlock()
 		return false, syserror.EPERM
 	}
 
-	// TODO(gvisor.dev/issues/1197): Invalidate mappings once we have
-	// mappings.
+	// Update the file size.
+	atomic.StoreUint64(&rf.size, newSize)
+	rf.dataMu.Unlock()
+
+	// Invalidate past translations of truncated pages.
+	oldpgend := fs.OffsetPageEnd(int64(oldSize))
+	newpgend := fs.OffsetPageEnd(int64(newSize))
+	if newpgend < oldpgend {
+		rf.mapsMu.Lock()
+		rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+			// Compare Linux's mm/shmem.c:shmem_setattr() =>
+			// mm/memory.c:unmap_mapping_range(evencows=1).
+			InvalidatePrivate: true,
+		})
+		rf.mapsMu.Unlock()
+	}
 
-	rf.data.Truncate(size, rf.memFile)
-	rf.size = size
+	// We are now guaranteed that there are no translations of truncated pages,
+	// and can remove them.
+	rf.dataMu.Lock()
+	rf.data.Truncate(newSize, rf.memFile)
+	rf.dataMu.Unlock()
 	return true, nil
 }
 
+// AddMapping implements memmap.Mappable.AddMapping.
+func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+	rf.mapsMu.Lock()
+	defer rf.mapsMu.Unlock()
+	rf.dataMu.RLock()
+	defer rf.dataMu.RUnlock()
+
+	// Reject writable mapping if F_SEAL_WRITE is set.
+	if rf.seals&linux.F_SEAL_WRITE != 0 && writable {
+		return syserror.EPERM
+	}
+
+	rf.mappings.AddMapping(ms, ar, offset, writable)
+	if writable {
+		pagesBefore := rf.writableMappingPages
+
+		// ar is guaranteed to be page aligned per memmap.Mappable.
+		rf.writableMappingPages += uint64(ar.Length() / usermem.PageSize)
+
+		if rf.writableMappingPages < pagesBefore {
+			panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
+		}
+	}
+
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+	rf.mapsMu.Lock()
+	defer rf.mapsMu.Unlock()
+
+	rf.mappings.RemoveMapping(ms, ar, offset, writable)
+
+	if writable {
+		pagesBefore := rf.writableMappingPages
+
+		// ar is guaranteed to be page aligned per memmap.Mappable.
+		rf.writableMappingPages -= uint64(ar.Length() / usermem.PageSize)
+
+		if rf.writableMappingPages > pagesBefore {
+			panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
+		}
+	}
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+	return rf.AddMapping(ctx, ms, dstAR, offset, writable)
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	rf.dataMu.Lock()
+	defer rf.dataMu.Unlock()
+
+	// Constrain translations to f.attr.Size (rounded up) to prevent
+	// translation to pages that may be concurrently truncated.
+	pgend := fs.OffsetPageEnd(int64(rf.size))
+	var beyondEOF bool
+	if required.End > pgend {
+		if required.Start >= pgend {
+			return nil, &memmap.BusError{io.EOF}
+		}
+		beyondEOF = true
+		required.End = pgend
+	}
+	if optional.End > pgend {
+		optional.End = pgend
+	}
+
+	cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+		// Newly-allocated pages are zeroed, so we don't need to do anything.
+		return dsts.NumBytes(), nil
+	})
+
+	var ts []memmap.Translation
+	var translatedEnd uint64
+	for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
+		segMR := seg.Range().Intersect(optional)
+		ts = append(ts, memmap.Translation{
+			Source: segMR,
+			File:   rf.memFile,
+			Offset: seg.FileRangeOf(segMR).Start,
+			Perms:  usermem.AnyAccess,
+		})
+		translatedEnd = segMR.End
+	}
+
+	// Don't return the error returned by f.data.Fill if it occurred outside of
+	// required.
+	if translatedEnd < required.End && cerr != nil {
+		return ts, &memmap.BusError{cerr}
+	}
+	if beyondEOF {
+		return ts, &memmap.BusError{io.EOF}
+	}
+	return ts, nil
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (*regularFile) InvalidateUnsavable(context.Context) error {
+	return nil
+}
+
 type regularFileFD struct {
 	fileDescription
 
@@ -152,8 +313,10 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 		// Overflow.
 		return 0, syserror.EFBIG
 	}
+	f.inode.mu.Lock()
 	rw := getRegularFileReadWriter(f, offset)
 	n, err := src.CopyInTo(ctx, rw)
+	f.inode.mu.Unlock()
 	putRegularFileReadWriter(rw)
 	return n, err
 }
@@ -215,6 +378,12 @@ func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng
 	return nil
 }
 
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	file := fd.inode().impl.(*regularFile)
+	return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts)
+}
+
 // regularFileReadWriter implements safemem.Reader and Safemem.Writer.
 type regularFileReadWriter struct {
 	file *regularFile
@@ -244,14 +413,15 @@ func putRegularFileReadWriter(rw *regularFileReadWriter) {
 
 // ReadToBlocks implements safemem.Reader.ReadToBlocks.
 func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
-	rw.file.mu.RLock()
+	rw.file.dataMu.RLock()
+	defer rw.file.dataMu.RUnlock()
+	size := rw.file.size
 
 	// Compute the range to read (limited by file size and overflow-checked).
-	if rw.off >= rw.file.size {
-		rw.file.mu.RUnlock()
+	if rw.off >= size {
 		return 0, io.EOF
 	}
-	end := rw.file.size
+	end := size
 	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
 		end = rend
 	}
@@ -265,7 +435,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er
 			// Get internal mappings.
 			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
 			if err != nil {
-				rw.file.mu.RUnlock()
 				return done, err
 			}
 
@@ -275,7 +444,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er
 			rw.off += uint64(n)
 			dsts = dsts.DropFirst64(n)
 			if err != nil {
-				rw.file.mu.RUnlock()
 				return done, err
 			}
 
@@ -291,7 +459,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er
 			rw.off += uint64(n)
 			dsts = dsts.DropFirst64(n)
 			if err != nil {
-				rw.file.mu.RUnlock()
 				return done, err
 			}
 
@@ -299,13 +466,16 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er
 			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
 		}
 	}
-	rw.file.mu.RUnlock()
 	return done, nil
 }
 
 // WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+//
+// Preconditions: inode.mu must be held.
 func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
-	rw.file.mu.Lock()
+	// Hold dataMu so we can modify size.
+	rw.file.dataMu.Lock()
+	defer rw.file.dataMu.Unlock()
 
 	// Compute the range to write (overflow-checked).
 	end := rw.off + srcs.NumBytes()
@@ -316,7 +486,6 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64,
 	// Check if seals prevent either file growth or all writes.
 	switch {
 	case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
-		rw.file.mu.Unlock()
 		return 0, syserror.EPERM
 	case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
 		// When growth is sealed, Linux effectively allows writes which would
@@ -338,7 +507,6 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64,
 		}
 		if end <= rw.off {
 			// Truncation would result in no data being written.
-			rw.file.mu.Unlock()
 			return 0, syserror.EPERM
 		}
 	}
@@ -395,9 +563,8 @@ exitLoop:
 	// If the write ends beyond the file's previous size, it causes the
 	// file to grow.
 	if rw.off > rw.file.size {
-		atomic.StoreUint64(&rw.file.size, rw.off)
+		rw.file.size = rw.off
 	}
 
-	rw.file.mu.Unlock()
 	return done, retErr
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index c5bb17562..521206305 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -18,9 +18,10 @@
 // Lock order:
 //
 // filesystem.mu
-//   regularFileFD.offMu
-//     regularFile.mu
 //   inode.mu
+//     regularFileFD.offMu
+//       regularFile.mapsMu
+//         regularFile.dataMu
 package tmpfs
 
 import (
@@ -226,12 +227,15 @@ func (i *inode) tryIncRef() bool {
 
 func (i *inode) decRef() {
 	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
-		// This is unnecessary; it's mostly to simulate what tmpfs would do.
 		if regFile, ok := i.impl.(*regularFile); ok {
-			regFile.mu.Lock()
+			// Hold inode.mu and regFile.dataMu while mutating
+			// size.
+			i.mu.Lock()
+			regFile.dataMu.Lock()
 			regFile.data.DropAll(regFile.memFile)
 			atomic.StoreUint64(&regFile.size, 0)
-			regFile.mu.Unlock()
+			regFile.dataMu.Unlock()
+			i.mu.Unlock()
 		}
 	} else if refs < 0 {
 		panic("tmpfs.inode.decRef() called without holding a reference")
@@ -320,7 +324,7 @@ func (i *inode) setStat(stat linux.Statx) error {
 	if mask&linux.STATX_SIZE != 0 {
 		switch impl := i.impl.(type) {
 		case *regularFile:
-			updated, err := impl.truncate(stat.Size)
+			updated, err := impl.truncateLocked(stat.Size)
 			if err != nil {
 				return err
 			}
-- 
cgit v1.2.3


From 9a4e3e63ef3c771e9fab3d19ee8ad0a173c7c4eb Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 11:14:53 -0800
Subject: Re-add atomicbitops_arm64.s to BUILD.

This was inadverently dropped by cl/295811743.

PiperOrigin-RevId: 296254482
---
 pkg/atomicbitops/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD
index ba8b06071..1a30f6967 100644
--- a/pkg/atomicbitops/BUILD
+++ b/pkg/atomicbitops/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "atomicbitops.go",
         "atomicbitops_amd64.s",
+        "atomicbitops_arm64.s",
         "atomicbitops_noasm.go",
     ],
     visibility = ["//:sandbox"],
-- 
cgit v1.2.3


From 9bad87339a10545d267903e7739f8cd978fbd82a Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 11:29:59 -0800
Subject: Better strace logging for epoll syscalls.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Example:

epoll_ctl(0x3 anon_inode:[eventpoll], EPOLL_CTL_ADD, 0x6 anon_inode:[eventfd], 0x7efe2fd92a80 {events=EPOLLIN|EPOLLOUT data=0x10203040506070a}) = 0x0 (4.411µs)

epoll_wait(0x3 anon_inode:[eventpoll], 0x7efe2fd92b50 {{events=EPOLLOUT data=0x102030405060708}{events=EPOLLOUT data=0x102030405060708}{events=EPOLLOUT data=0x102030405060708}}, 0x3, 0xffffffff) = 0x3 (29.891µs)

PiperOrigin-RevId: 296258146
---
 pkg/abi/linux/epoll.go             |  7 +++
 pkg/sentry/strace/BUILD            |  1 +
 pkg/sentry/strace/epoll.go         | 89 ++++++++++++++++++++++++++++++++++++++
 pkg/sentry/strace/linux64_amd64.go |  6 +--
 pkg/sentry/strace/linux64_arm64.go |  4 +-
 pkg/sentry/strace/strace.go        |  8 ++++
 pkg/sentry/strace/syscalls.go      | 10 +++++
 7 files changed, 120 insertions(+), 5 deletions(-)
 create mode 100644 pkg/sentry/strace/epoll.go

diff --git a/pkg/abi/linux/epoll.go b/pkg/abi/linux/epoll.go
index 6e4de69da..1121a1a92 100644
--- a/pkg/abi/linux/epoll.go
+++ b/pkg/abi/linux/epoll.go
@@ -14,6 +14,10 @@
 
 package linux
 
+import (
+	"gvisor.dev/gvisor/pkg/binary"
+)
+
 // Event masks.
 const (
 	EPOLLIN     = 0x1
@@ -53,3 +57,6 @@ const (
 	EPOLL_CTL_DEL = 0x2
 	EPOLL_CTL_MOD = 0x3
 )
+
+// SizeOfEpollEvent is the size of EpollEvent struct.
+var SizeOfEpollEvent = int(binary.Size(EpollEvent{}))
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 2f39a6f2b..88d5db9fc 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "capability.go",
         "clone.go",
+        "epoll.go",
         "futex.go",
         "linux64_amd64.go",
         "linux64_arm64.go",
diff --git a/pkg/sentry/strace/epoll.go b/pkg/sentry/strace/epoll.go
new file mode 100644
index 000000000..a6e48b836
--- /dev/null
+++ b/pkg/sentry/strace/epoll.go
@@ -0,0 +1,89 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package strace
+
+import (
+	"fmt"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func epollEvent(t *kernel.Task, eventAddr usermem.Addr) string {
+	var e linux.EpollEvent
+	if _, err := t.CopyIn(eventAddr, &e); err != nil {
+		return fmt.Sprintf("%#x {error reading event: %v}", eventAddr, err)
+	}
+	var sb strings.Builder
+	fmt.Fprintf(&sb, "%#x ", eventAddr)
+	writeEpollEvent(&sb, e)
+	return sb.String()
+}
+
+func epollEvents(t *kernel.Task, eventsAddr usermem.Addr, numEvents, maxBytes uint64) string {
+	var sb strings.Builder
+	fmt.Fprintf(&sb, "%#x {", eventsAddr)
+	addr := eventsAddr
+	for i := uint64(0); i < numEvents; i++ {
+		var e linux.EpollEvent
+		if _, err := t.CopyIn(addr, &e); err != nil {
+			fmt.Fprintf(&sb, "{error reading event at %#x: %v}", addr, err)
+			continue
+		}
+		writeEpollEvent(&sb, e)
+		if uint64(sb.Len()) >= maxBytes {
+			sb.WriteString("...")
+			break
+		}
+		if _, ok := addr.AddLength(uint64(linux.SizeOfEpollEvent)); !ok {
+			fmt.Fprintf(&sb, "{error reading event at %#x: EFAULT}", addr)
+			continue
+		}
+	}
+	sb.WriteString("}")
+	return sb.String()
+}
+
+func writeEpollEvent(sb *strings.Builder, e linux.EpollEvent) {
+	events := epollEventEvents.Parse(uint64(e.Events))
+	fmt.Fprintf(sb, "{events=%s data=[%#x, %#x]}", events, e.Data[0], e.Data[1])
+}
+
+var epollCtlOps = abi.ValueSet{
+	linux.EPOLL_CTL_ADD: "EPOLL_CTL_ADD",
+	linux.EPOLL_CTL_DEL: "EPOLL_CTL_DEL",
+	linux.EPOLL_CTL_MOD: "EPOLL_CTL_MOD",
+}
+
+var epollEventEvents = abi.FlagSet{
+	{Flag: linux.EPOLLIN, Name: "EPOLLIN"},
+	{Flag: linux.EPOLLPRI, Name: "EPOLLPRI"},
+	{Flag: linux.EPOLLOUT, Name: "EPOLLOUT"},
+	{Flag: linux.EPOLLERR, Name: "EPOLLERR"},
+	{Flag: linux.EPOLLHUP, Name: "EPULLHUP"},
+	{Flag: linux.EPOLLRDNORM, Name: "EPOLLRDNORM"},
+	{Flag: linux.EPOLLRDBAND, Name: "EPOLLRDBAND"},
+	{Flag: linux.EPOLLWRNORM, Name: "EPOLLWRNORM"},
+	{Flag: linux.EPOLLWRBAND, Name: "EPOLLWRBAND"},
+	{Flag: linux.EPOLLMSG, Name: "EPOLLMSG"},
+	{Flag: linux.EPOLLRDHUP, Name: "EPOLLRDHUP"},
+	{Flag: linux.EPOLLEXCLUSIVE, Name: "EPOLLEXCLUSIVE"},
+	{Flag: linux.EPOLLWAKEUP, Name: "EPOLLWAKEUP"},
+	{Flag: linux.EPOLLONESHOT, Name: "EPOLLONESHOT"},
+	{Flag: linux.EPOLLET, Name: "EPOLLET"},
+}
diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go
index a4de545e9..71b92eaee 100644
--- a/pkg/sentry/strace/linux64_amd64.go
+++ b/pkg/sentry/strace/linux64_amd64.go
@@ -256,8 +256,8 @@ var linuxAMD64 = SyscallMap{
 	229: makeSyscallInfo("clock_getres", Hex, PostTimespec),
 	230: makeSyscallInfo("clock_nanosleep", Hex, Hex, Timespec, PostTimespec),
 	231: makeSyscallInfo("exit_group", Hex),
-	232: makeSyscallInfo("epoll_wait", Hex, Hex, Hex, Hex),
-	233: makeSyscallInfo("epoll_ctl", Hex, Hex, FD, Hex),
+	232: makeSyscallInfo("epoll_wait", FD, EpollEvents, Hex, Hex),
+	233: makeSyscallInfo("epoll_ctl", FD, EpollCtlOp, FD, EpollEvent),
 	234: makeSyscallInfo("tgkill", Hex, Hex, Signal),
 	235: makeSyscallInfo("utimes", Path, Timeval),
 	// 236: vserver (not implemented in the Linux kernel)
@@ -305,7 +305,7 @@ var linuxAMD64 = SyscallMap{
 	278: makeSyscallInfo("vmsplice", FD, Hex, Hex, Hex),
 	279: makeSyscallInfo("move_pages", Hex, Hex, Hex, Hex, Hex, Hex),
 	280: makeSyscallInfo("utimensat", FD, Path, UTimeTimespec, Hex),
-	281: makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex),
+	281: makeSyscallInfo("epoll_pwait", FD, EpollEvents, Hex, Hex, SigSet, Hex),
 	282: makeSyscallInfo("signalfd", Hex, Hex, Hex),
 	283: makeSyscallInfo("timerfd_create", Hex, Hex),
 	284: makeSyscallInfo("eventfd", Hex),
diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go
index 8bc38545f..bd7361a52 100644
--- a/pkg/sentry/strace/linux64_arm64.go
+++ b/pkg/sentry/strace/linux64_arm64.go
@@ -45,8 +45,8 @@ var linuxARM64 = SyscallMap{
 	18:  makeSyscallInfo("lookup_dcookie", Hex, Hex, Hex),
 	19:  makeSyscallInfo("eventfd2", Hex, Hex),
 	20:  makeSyscallInfo("epoll_create1", Hex),
-	21:  makeSyscallInfo("epoll_ctl", Hex, Hex, FD, Hex),
-	22:  makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex),
+	21:  makeSyscallInfo("epoll_ctl", FD, EpollCtlOp, FD, EpollEvent),
+	22:  makeSyscallInfo("epoll_pwait", FD, EpollEvents, Hex, Hex, SigSet, Hex),
 	23:  makeSyscallInfo("dup", FD),
 	24:  makeSyscallInfo("dup3", FD, FD, Hex),
 	25:  makeSyscallInfo("fcntl", FD, Hex, Hex),
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 46cb2a1cc..77655558e 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -481,6 +481,12 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo
 			output = append(output, capData(t, args[arg-1].Pointer(), args[arg].Pointer()))
 		case PollFDs:
 			output = append(output, pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), false))
+		case EpollCtlOp:
+			output = append(output, epollCtlOps.Parse(uint64(args[arg].Int())))
+		case EpollEvent:
+			output = append(output, epollEvent(t, args[arg].Pointer()))
+		case EpollEvents:
+			output = append(output, epollEvents(t, args[arg].Pointer(), 0 /* numEvents */, uint64(maximumBlobSize)))
 		case SelectFDSet:
 			output = append(output, fdSet(t, int(args[0].Int()), args[arg].Pointer()))
 		case Oct:
@@ -549,6 +555,8 @@ func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uint
 			output[arg] = capData(t, args[arg-1].Pointer(), args[arg].Pointer())
 		case PollFDs:
 			output[arg] = pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), true)
+		case EpollEvents:
+			output[arg] = epollEvents(t, args[arg].Pointer(), uint64(rval), uint64(maximumBlobSize))
 		case GetSockOptVal:
 			output[arg] = getSockOptVal(t, args[arg-2].Uint64() /* level */, args[arg-1].Uint64() /* optName */, args[arg].Pointer() /* optVal */, args[arg+1].Pointer() /* optLen */, maximumBlobSize, rval)
 		case SetSockOptVal:
diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go
index 446d1e0f6..7e69b9279 100644
--- a/pkg/sentry/strace/syscalls.go
+++ b/pkg/sentry/strace/syscalls.go
@@ -228,6 +228,16 @@ const (
 	// SockOptLevel is the optname argument in getsockopt(2) and
 	// setsockopt(2).
 	SockOptName
+
+	// EpollCtlOp is the op argument to epoll_ctl(2).
+	EpollCtlOp
+
+	// EpollEvent is the event argument in epoll_ctl(2).
+	EpollEvent
+
+	// EpollEvents is an array of struct epoll_event. It is the events
+	// argument in epoll_wait(2)/epoll_pwait(2).
+	EpollEvents
 )
 
 // defaultFormat is the syscall argument format to use if the actual format is
-- 
cgit v1.2.3


From 72187fa7a9e1f3ee9d021681f4465777f91c13fe Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 20 Feb 2020 12:32:31 -0800
Subject: Import tags.bzl directly from tools/defs.bzl.

This simplifies the script slightly.

PiperOrigin-RevId: 296272077
---
 tools/bazeldefs/defs.bzl | 2 --
 tools/defs.bzl           | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index 6f091d759..905b16d41 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -8,7 +8,6 @@ load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar")
 load("@io_bazel_rules_docker//go:image.bzl", _go_image = "go_image")
 load("@io_bazel_rules_docker//container:container.bzl", _container_image = "container_image")
 load("@pydeps//:requirements.bzl", _py_requirement = "requirement")
-load("//tools/bazeldefs:tags.bzl", _go_suffixes = "go_suffixes")
 
 container_image = _container_image
 cc_binary = _cc_binary
@@ -19,7 +18,6 @@ cc_test = _cc_test
 cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
 go_image = _go_image
 go_embed_data = _go_embed_data
-go_suffixes = _go_suffixes
 gtest = "@com_google_googletest//:gtest"
 gbenchmark = "@com_google_benchmark//:benchmark"
 loopback = "//tools/bazeldefs:loopback"
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 4eece2d83..ddefb72d0 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,7 +7,8 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/bazeldefs:defs.bzl", "go_suffixes", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/bazeldefs:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/bazeldefs:tags.bzl", "go_suffixes")
 
 # Delegate directly.
 cc_binary = _cc_binary
-- 
cgit v1.2.3


From 1bb0195079810773bd4457eecb1e7ac1890ddb74 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 13:06:21 -0800
Subject: Add placeholder .travis.yml for #1886

PiperOrigin-RevId: 296279095
---
 .travis.yml | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 .travis.yml

diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 000000000..e69de29bb
-- 
cgit v1.2.3


From d90d71474f4c82f742140fdf026821709845cece Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 14:28:31 -0800
Subject: Remove bytes read/written from marshal.Marshallable API.

Users of the API only care about whether the copy in/out succeeds in
their entirety, which is already signalled by the returned error.

PiperOrigin-RevId: 296297843
---
 pkg/sentry/kernel/rseq.go                          |  2 +-
 pkg/sentry/syscalls/linux/sys_stat.go              |  6 ++----
 tools/go_marshal/gomarshal/generator_interfaces.go | 21 +++++++++++----------
 tools/go_marshal/gomarshal/generator_tests.go      |  2 +-
 tools/go_marshal/marshal/marshal.go                |  4 ++--
 5 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index 18416643b..ded95f532 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -304,7 +304,7 @@ func (t *Task) rseqAddrInterrupt() {
 	}
 
 	var cs linux.RSeqCriticalSection
-	if _, err := cs.CopyIn(t, critAddr); err != nil {
+	if err := cs.CopyIn(t, critAddr); err != nil {
 		t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
 		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
 		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 8b66a9006..11f25e00d 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -131,8 +131,7 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err
 		return err
 	}
 	s := statFromAttrs(t, d.Inode.StableAttr, uattr)
-	_, err = s.CopyOut(t, statAddr)
-	return err
+	return s.CopyOut(t, statAddr)
 }
 
 // fstat implements fstat for the given *fs.File.
@@ -142,8 +141,7 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error {
 		return err
 	}
 	s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr)
-	_, err = s.CopyOut(t, statAddr)
-	return err
+	return s.CopyOut(t, statAddr)
 }
 
 // Statx implements linux syscall statx(2).
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
index 3aa299ccd..834c58cee 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -507,13 +507,14 @@ func (g *interfaceGenerator) emitMarshallable() {
 	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
 	g.recordUsedImport("marshal")
 	g.recordUsedImport("usermem")
-	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		fallback := func() {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
 			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
 			g.emit("%s.MarshalBytes(buf)\n", g.r)
-			g.emit("return task.CopyOutBytes(addr, buf)\n")
+			g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
+			g.emit("return err\n")
 		}
 		if thisPacked {
 			g.recordUsedImport("reflect")
@@ -539,11 +540,11 @@ func (g *interfaceGenerator) emitMarshallable() {
 			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
 			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
 
-			g.emit("len, err := task.CopyOutBytes(addr, buf)\n")
+			g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
 			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
 			g.emit("// must live until after the CopyOutBytes.\n")
 			g.emit("runtime.KeepAlive(%s)\n", g.r)
-			g.emit("return len, err\n")
+			g.emit("return err\n")
 		} else {
 			fallback()
 		}
@@ -553,20 +554,20 @@ func (g *interfaceGenerator) emitMarshallable() {
 	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
 	g.recordUsedImport("marshal")
 	g.recordUsedImport("usermem")
-	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		fallback := func() {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
 			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
-			g.emit("n, err := task.CopyInBytes(addr, buf)\n")
+			g.emit("_, err := task.CopyInBytes(addr, buf)\n")
 			g.emit("if err != nil {\n")
 			g.inIndent(func() {
-				g.emit("return n, err\n")
+				g.emit("return err\n")
 			})
 			g.emit("}\n")
 
 			g.emit("%s.UnmarshalBytes(buf)\n", g.r)
-			g.emit("return n, nil\n")
+			g.emit("return nil\n")
 		}
 		if thisPacked {
 			g.recordUsedImport("reflect")
@@ -592,11 +593,11 @@ func (g *interfaceGenerator) emitMarshallable() {
 			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
 			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
 
-			g.emit("len, err := task.CopyInBytes(addr, buf)\n")
+			g.emit("_, err := task.CopyInBytes(addr, buf)\n")
 			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
 			g.emit("// must live until after the CopyInBytes.\n")
 			g.emit("runtime.KeepAlive(%s)\n", g.r)
-			g.emit("return len, err\n")
+			g.emit("return err\n")
 		} else {
 			fallback()
 		}
diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go
index 8c28b00d0..2326e7a07 100644
--- a/tools/go_marshal/gomarshal/generator_tests.go
+++ b/tools/go_marshal/gomarshal/generator_tests.go
@@ -92,7 +92,7 @@ func (g *testGenerator) emitTestNonZeroSize() {
 		g.emit("x := &%s{}\n", g.typeName())
 		g.emit("if x.SizeBytes() == 0 {\n")
 		g.inIndent(func() {
-			g.emit("t.Fatal(\"Marshallable.Size() should not return zero\")\n")
+			g.emit("t.Fatal(\"Marshallable.SizeBytes() should not return zero\")\n")
 		})
 		g.emit("}\n")
 	})
diff --git a/tools/go_marshal/marshal/marshal.go b/tools/go_marshal/marshal/marshal.go
index 20353850d..f129788e0 100644
--- a/tools/go_marshal/marshal/marshal.go
+++ b/tools/go_marshal/marshal/marshal.go
@@ -91,12 +91,12 @@ type Marshallable interface {
 	// marshalled does not escape. The implementation should avoid creating
 	// extra copies in memory by directly deserializing to the object's
 	// underlying memory.
-	CopyIn(task Task, addr usermem.Addr) (int, error)
+	CopyIn(task Task, addr usermem.Addr) error
 
 	// CopyOut serializes a Marshallable type to a task's memory. This may only
 	// be called from a task goroutine. This is more efficient than calling
 	// MarshalUnsafe on Marshallable.Packed types, as the type being serialized
 	// does not escape. The implementation should avoid creating extra copies in
 	// memory by directly serializing from the object's underlying memory.
-	CopyOut(task Task, addr usermem.Addr) (int, error)
+	CopyOut(task Task, addr usermem.Addr) error
 }
-- 
cgit v1.2.3


From 67b615b86f2aa1d4ded3dcf2eb8aca4e7fec57a0 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 14:31:39 -0800
Subject: Support disabling a NIC

- Disabled NICs will have their associated NDP state cleared.
- Disabled NICs will not accept incoming packets.
- Writes through a Route with a disabled NIC will return an invalid
  endpoint state error.
- stack.Stack.FindRoute will not return a route with a disabled NIC.
- NIC's Running flag will report the NIC's enabled status.

Tests:
- stack_test.TestDisableUnknownNIC
- stack_test.TestDisabledNICsNICInfoAndCheckNIC
- stack_test.TestRoutesWithDisabledNIC
- stack_test.TestRouteWritePacketWithDisabledNIC
- stack_test.TestStopStartSolicitingRouters
- stack_test.TestCleanupNDPState
- stack_test.TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable
- stack_test.TestJoinLeaveAllNodesMulticastOnNICEnableDisable
PiperOrigin-RevId: 296298588
---
 pkg/tcpip/stack/ndp.go        |  23 +-
 pkg/tcpip/stack/ndp_test.go   | 822 ++++++++++++++++++++++++------------------
 pkg/tcpip/stack/nic.go        | 110 +++++-
 pkg/tcpip/stack/stack.go      |  45 ++-
 pkg/tcpip/stack/stack_test.go | 377 ++++++++++++++++++-
 5 files changed, 998 insertions(+), 379 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 045409bda..19bd05aa3 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -1148,22 +1148,27 @@ func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bo
 	return true
 }
 
-// cleanupHostOnlyState cleans up any state that is only useful for hosts.
+// cleanupState cleans up ndp's state.
 //
-// cleanupHostOnlyState MUST be called when ndp's NIC is transitioning from a
-// host to a router. This function will invalidate all discovered on-link
-// prefixes, discovered routers, and auto-generated addresses as routers do not
-// normally process Router Advertisements to discover default routers and
-// on-link prefixes, and auto-generate addresses via SLAAC.
+// If hostOnly is true, then only host-specific state will be cleaned up.
+//
+// cleanupState MUST be called with hostOnly set to true when ndp's NIC is
+// transitioning from a host to a router. This function will invalidate all
+// discovered on-link prefixes, discovered routers, and auto-generated
+// addresses.
+//
+// If hostOnly is true, then the link-local auto-generated address will not be
+// invalidated as routers are also expected to generate a link-local address.
 //
 // The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) cleanupHostOnlyState() {
+func (ndp *ndpState) cleanupState(hostOnly bool) {
 	linkLocalSubnet := header.IPv6LinkLocalPrefix.Subnet()
 	linkLocalAddrs := 0
 	for addr := range ndp.autoGenAddresses {
 		// RFC 4862 section 5 states that routers are also expected to generate a
-		// link-local address so we do not invalidate them.
-		if linkLocalSubnet.Contains(addr) {
+		// link-local address so we do not invalidate them if we are cleaning up
+		// host-only state.
+		if hostOnly && linkLocalSubnet.Contains(addr) {
 			linkLocalAddrs++
 			continue
 		}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 1f6f77439..f7b75b74e 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -592,70 +592,94 @@ func TestDADFail(t *testing.T) {
 	}
 }
 
-// TestDADStop tests to make sure that the DAD process stops when an address is
-// removed.
 func TestDADStop(t *testing.T) {
 	const nicID = 1
 
-	ndpDisp := ndpDispatcher{
-		dadC: make(chan ndpDADEvent, 1),
-	}
-	ndpConfigs := stack.NDPConfigurations{
-		RetransmitTimer:        time.Second,
-		DupAddrDetectTransmits: 2,
-	}
-	opts := stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPDisp:          &ndpDisp,
-		NDPConfigs:       ndpConfigs,
-	}
+	tests := []struct {
+		name   string
+		stopFn func(t *testing.T, s *stack.Stack)
+	}{
+		// Tests to make sure that DAD stops when an address is removed.
+		{
+			name: "Remove address",
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				if err := s.RemoveAddress(nicID, addr1); err != nil {
+					t.Fatalf("RemoveAddress(%d, %s): %s", nicID, addr1, err)
+				}
+			},
+		},
 
-	e := channel.New(0, 1280, linkAddr1)
-	s := stack.New(opts)
-	if err := s.CreateNIC(nicID, e); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+		// Tests to make sure that DAD stops when the NIC is disabled.
+		{
+			name: "Disable NIC",
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				if err := s.DisableNIC(nicID); err != nil {
+					t.Fatalf("DisableNIC(%d): %s", nicID, err)
+				}
+			},
+		},
 	}
 
-	if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
-		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
-	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				dadC: make(chan ndpDADEvent, 1),
+			}
+			ndpConfigs := stack.NDPConfigurations{
+				RetransmitTimer:        time.Second,
+				DupAddrDetectTransmits: 2,
+			}
+			opts := stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPDisp:          &ndpDisp,
+				NDPConfigs:       ndpConfigs,
+			}
 
-	// Address should not be considered bound to the NIC yet (DAD ongoing).
-	addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
-	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
-	}
-	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
-	}
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(opts)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+			}
 
-	// Remove the address. This should stop DAD.
-	if err := s.RemoveAddress(nicID, addr1); err != nil {
-		t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr1, err)
-	}
+			if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, header.IPv6ProtocolNumber, addr1, err)
+			}
 
-	// Wait for DAD to fail (since the address was removed during DAD).
-	select {
-	case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
-		// If we don't get a failure event after the expected resolution
-		// time + extra 1s buffer, something is wrong.
-		t.Fatal("timed out waiting for DAD failure")
-	case e := <-ndpDisp.dadC:
-		if diff := checkDADEvent(e, nicID, addr1, false, nil); diff != "" {
-			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
-		}
-	}
-	addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
-	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
-	}
-	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
-	}
+			// Address should not be considered bound to the NIC yet (DAD ongoing).
+			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+			}
+
+			test.stopFn(t, s)
+
+			// Wait for DAD to fail (since the address was removed during DAD).
+			select {
+			case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+				// If we don't get a failure event after the expected resolution
+				// time + extra 1s buffer, something is wrong.
+				t.Fatal("timed out waiting for DAD failure")
+			case e := <-ndpDisp.dadC:
+				if diff := checkDADEvent(e, nicID, addr1, false, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+				}
+			}
+			addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Errorf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+			}
 
-	// Should not have sent more than 1 NS message.
-	if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got > 1 {
-		t.Fatalf("got NeighborSolicit = %d, want <= 1", got)
+			// Should not have sent more than 1 NS message.
+			if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got > 1 {
+				t.Errorf("got NeighborSolicit = %d, want <= 1", got)
+			}
+		})
 	}
 }
 
@@ -2886,17 +2910,16 @@ func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
 	}
 }
 
-// TestCleanupHostOnlyStateOnBecomingRouter tests that all discovered routers
-// and prefixes, and non-linklocal auto-generated addresses are invalidated when
-// a NIC becomes a router.
-func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
+// TestCleanupNDPState tests that all discovered routers and prefixes, and
+// auto-generated addresses are invalidated when a NIC becomes a router.
+func TestCleanupNDPState(t *testing.T) {
 	t.Parallel()
 
 	const (
-		lifetimeSeconds = 5
-		maxEvents       = 4
-		nicID1          = 1
-		nicID2          = 2
+		lifetimeSeconds          = 5
+		maxRouterAndPrefixEvents = 4
+		nicID1                   = 1
+		nicID2                   = 2
 	)
 
 	prefix1, subnet1, e1Addr1 := prefixSubnetAddr(0, linkAddr1)
@@ -2912,254 +2935,308 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		PrefixLen: 64,
 	}
 
-	ndpDisp := ndpDispatcher{
-		routerC:        make(chan ndpRouterEvent, maxEvents),
-		rememberRouter: true,
-		prefixC:        make(chan ndpPrefixEvent, maxEvents),
-		rememberPrefix: true,
-		autoGenAddrC:   make(chan ndpAutoGenAddrEvent, maxEvents),
-	}
-	s := stack.New(stack.Options{
-		NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
-		AutoGenIPv6LinkLocal: true,
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			DiscoverDefaultRouters: true,
-			DiscoverOnLinkPrefixes: true,
-			AutoGenGlobalAddresses: true,
+	tests := []struct {
+		name                 string
+		cleanupFn            func(t *testing.T, s *stack.Stack)
+		keepAutoGenLinkLocal bool
+		maxAutoGenAddrEvents int
+	}{
+		// A NIC should still keep its auto-generated link-local address when
+		// becoming a router.
+		{
+			name: "Forwarding Enable",
+			cleanupFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+				s.SetForwarding(true)
+			},
+			keepAutoGenLinkLocal: true,
+			maxAutoGenAddrEvents: 4,
 		},
-		NDPDisp: &ndpDisp,
-	})
 
-	expectRouterEvent := func() (bool, ndpRouterEvent) {
-		select {
-		case e := <-ndpDisp.routerC:
-			return true, e
-		default:
-		}
+		// A NIC should cleanup all NDP state when it is disabled.
+		{
+			name: "NIC Disable",
+			cleanupFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
 
-		return false, ndpRouterEvent{}
+				if err := s.DisableNIC(nicID1); err != nil {
+					t.Fatalf("s.DisableNIC(%d): %s", nicID1, err)
+				}
+				if err := s.DisableNIC(nicID2); err != nil {
+					t.Fatalf("s.DisableNIC(%d): %s", nicID2, err)
+				}
+			},
+			keepAutoGenLinkLocal: false,
+			maxAutoGenAddrEvents: 6,
+		},
 	}
 
-	expectPrefixEvent := func() (bool, ndpPrefixEvent) {
-		select {
-		case e := <-ndpDisp.prefixC:
-			return true, e
-		default:
-		}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				routerC:        make(chan ndpRouterEvent, maxRouterAndPrefixEvents),
+				rememberRouter: true,
+				prefixC:        make(chan ndpPrefixEvent, maxRouterAndPrefixEvents),
+				rememberPrefix: true,
+				autoGenAddrC:   make(chan ndpAutoGenAddrEvent, test.maxAutoGenAddrEvents),
+			}
+			s := stack.New(stack.Options{
+				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+				AutoGenIPv6LinkLocal: true,
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:              true,
+					DiscoverDefaultRouters: true,
+					DiscoverOnLinkPrefixes: true,
+					AutoGenGlobalAddresses: true,
+				},
+				NDPDisp: &ndpDisp,
+			})
 
-		return false, ndpPrefixEvent{}
-	}
+			expectRouterEvent := func() (bool, ndpRouterEvent) {
+				select {
+				case e := <-ndpDisp.routerC:
+					return true, e
+				default:
+				}
 
-	expectAutoGenAddrEvent := func() (bool, ndpAutoGenAddrEvent) {
-		select {
-		case e := <-ndpDisp.autoGenAddrC:
-			return true, e
-		default:
-		}
+				return false, ndpRouterEvent{}
+			}
 
-		return false, ndpAutoGenAddrEvent{}
-	}
+			expectPrefixEvent := func() (bool, ndpPrefixEvent) {
+				select {
+				case e := <-ndpDisp.prefixC:
+					return true, e
+				default:
+				}
 
-	e1 := channel.New(0, 1280, linkAddr1)
-	if err := s.CreateNIC(nicID1, e1); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
-	}
-	// We have other tests that make sure we receive the *correct* events
-	// on normal discovery of routers/prefixes, and auto-generated
-	// addresses. Here we just make sure we get an event and let other tests
-	// handle the correctness check.
-	expectAutoGenAddrEvent()
+				return false, ndpPrefixEvent{}
+			}
 
-	e2 := channel.New(0, 1280, linkAddr2)
-	if err := s.CreateNIC(nicID2, e2); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
-	}
-	expectAutoGenAddrEvent()
+			expectAutoGenAddrEvent := func() (bool, ndpAutoGenAddrEvent) {
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					return true, e
+				default:
+				}
 
-	// Receive RAs on NIC(1) and NIC(2) from default routers (llAddr3 and
-	// llAddr4) w/ PI (for prefix1 in RA from llAddr3 and prefix2 in RA from
-	// llAddr4) to discover multiple routers and prefixes, and auto-gen
-	// multiple addresses.
+				return false, ndpAutoGenAddrEvent{}
+			}
 
-	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
-	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID1)
-	}
-	if ok, _ := expectPrefixEvent(); !ok {
-		t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1)
-	}
-	if ok, _ := expectAutoGenAddrEvent(); !ok {
-		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr1, nicID1)
-	}
+			e1 := channel.New(0, 1280, linkAddr1)
+			if err := s.CreateNIC(nicID1, e1); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
+			}
+			// We have other tests that make sure we receive the *correct* events
+			// on normal discovery of routers/prefixes, and auto-generated
+			// addresses. Here we just make sure we get an event and let other tests
+			// handle the correctness check.
+			expectAutoGenAddrEvent()
 
-	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
-	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID1)
-	}
-	if ok, _ := expectPrefixEvent(); !ok {
-		t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1)
-	}
-	if ok, _ := expectAutoGenAddrEvent(); !ok {
-		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID1)
-	}
+			e2 := channel.New(0, 1280, linkAddr2)
+			if err := s.CreateNIC(nicID2, e2); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
+			}
+			expectAutoGenAddrEvent()
 
-	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
-	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID2)
-	}
-	if ok, _ := expectPrefixEvent(); !ok {
-		t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2)
-	}
-	if ok, _ := expectAutoGenAddrEvent(); !ok {
-		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID2)
-	}
+			// Receive RAs on NIC(1) and NIC(2) from default routers (llAddr3 and
+			// llAddr4) w/ PI (for prefix1 in RA from llAddr3 and prefix2 in RA from
+			// llAddr4) to discover multiple routers and prefixes, and auto-gen
+			// multiple addresses.
 
-	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
-	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID2)
-	}
-	if ok, _ := expectPrefixEvent(); !ok {
-		t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2)
-	}
-	if ok, _ := expectAutoGenAddrEvent(); !ok {
-		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e2Addr2, nicID2)
-	}
+			e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID1)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr1, nicID1)
+			}
 
-	// We should have the auto-generated addresses added.
-	nicinfo := s.NICInfo()
-	nic1Addrs := nicinfo[nicID1].ProtocolAddresses
-	nic2Addrs := nicinfo[nicID2].ProtocolAddresses
-	if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
-	}
-	if !containsV6Addr(nic1Addrs, e1Addr1) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
-	}
-	if !containsV6Addr(nic1Addrs, e1Addr2) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
-	}
-	if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
-	}
-	if !containsV6Addr(nic2Addrs, e2Addr1) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
-	}
-	if !containsV6Addr(nic2Addrs, e2Addr2) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
-	}
+			e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID1)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID1)
+			}
 
-	// We can't proceed any further if we already failed the test (missing
-	// some discovery/auto-generated address events or addresses).
-	if t.Failed() {
-		t.FailNow()
-	}
+			e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID2)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID2)
+			}
 
-	s.SetForwarding(true)
+			e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID2)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e2Addr2, nicID2)
+			}
 
-	// Collect invalidation events after becoming a router
-	gotRouterEvents := make(map[ndpRouterEvent]int)
-	for i := 0; i < maxEvents; i++ {
-		ok, e := expectRouterEvent()
-		if !ok {
-			t.Errorf("expected %d router events after becoming a router; got = %d", maxEvents, i)
-			break
-		}
-		gotRouterEvents[e]++
-	}
-	gotPrefixEvents := make(map[ndpPrefixEvent]int)
-	for i := 0; i < maxEvents; i++ {
-		ok, e := expectPrefixEvent()
-		if !ok {
-			t.Errorf("expected %d prefix events after becoming a router; got = %d", maxEvents, i)
-			break
-		}
-		gotPrefixEvents[e]++
-	}
-	gotAutoGenAddrEvents := make(map[ndpAutoGenAddrEvent]int)
-	for i := 0; i < maxEvents; i++ {
-		ok, e := expectAutoGenAddrEvent()
-		if !ok {
-			t.Errorf("expected %d auto-generated address events after becoming a router; got = %d", maxEvents, i)
-			break
-		}
-		gotAutoGenAddrEvents[e]++
-	}
+			// We should have the auto-generated addresses added.
+			nicinfo := s.NICInfo()
+			nic1Addrs := nicinfo[nicID1].ProtocolAddresses
+			nic2Addrs := nicinfo[nicID2].ProtocolAddresses
+			if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+			}
+			if !containsV6Addr(nic1Addrs, e1Addr1) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
+			}
+			if !containsV6Addr(nic1Addrs, e1Addr2) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
+			}
+			if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+			}
+			if !containsV6Addr(nic2Addrs, e2Addr1) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
+			}
+			if !containsV6Addr(nic2Addrs, e2Addr2) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
+			}
 
-	// No need to proceed any further if we already failed the test (missing
-	// some invalidation events).
-	if t.Failed() {
-		t.FailNow()
-	}
+			// We can't proceed any further if we already failed the test (missing
+			// some discovery/auto-generated address events or addresses).
+			if t.Failed() {
+				t.FailNow()
+			}
 
-	expectedRouterEvents := map[ndpRouterEvent]int{
-		{nicID: nicID1, addr: llAddr3, discovered: false}: 1,
-		{nicID: nicID1, addr: llAddr4, discovered: false}: 1,
-		{nicID: nicID2, addr: llAddr3, discovered: false}: 1,
-		{nicID: nicID2, addr: llAddr4, discovered: false}: 1,
-	}
-	if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" {
-		t.Errorf("router events mismatch (-want +got):\n%s", diff)
-	}
-	expectedPrefixEvents := map[ndpPrefixEvent]int{
-		{nicID: nicID1, prefix: subnet1, discovered: false}: 1,
-		{nicID: nicID1, prefix: subnet2, discovered: false}: 1,
-		{nicID: nicID2, prefix: subnet1, discovered: false}: 1,
-		{nicID: nicID2, prefix: subnet2, discovered: false}: 1,
-	}
-	if diff := cmp.Diff(expectedPrefixEvents, gotPrefixEvents); diff != "" {
-		t.Errorf("prefix events mismatch (-want +got):\n%s", diff)
-	}
-	expectedAutoGenAddrEvents := map[ndpAutoGenAddrEvent]int{
-		{nicID: nicID1, addr: e1Addr1, eventType: invalidatedAddr}: 1,
-		{nicID: nicID1, addr: e1Addr2, eventType: invalidatedAddr}: 1,
-		{nicID: nicID2, addr: e2Addr1, eventType: invalidatedAddr}: 1,
-		{nicID: nicID2, addr: e2Addr2, eventType: invalidatedAddr}: 1,
-	}
-	if diff := cmp.Diff(expectedAutoGenAddrEvents, gotAutoGenAddrEvents); diff != "" {
-		t.Errorf("auto-generated address events mismatch (-want +got):\n%s", diff)
-	}
+			test.cleanupFn(t, s)
 
-	// Make sure the auto-generated addresses got removed.
-	nicinfo = s.NICInfo()
-	nic1Addrs = nicinfo[nicID1].ProtocolAddresses
-	nic2Addrs = nicinfo[nicID2].ProtocolAddresses
-	if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
-	}
-	if containsV6Addr(nic1Addrs, e1Addr1) {
-		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
-	}
-	if containsV6Addr(nic1Addrs, e1Addr2) {
-		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
-	}
-	if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
-	}
-	if containsV6Addr(nic2Addrs, e2Addr1) {
-		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
-	}
-	if containsV6Addr(nic2Addrs, e2Addr2) {
-		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
-	}
+			// Collect invalidation events after having NDP state cleaned up.
+			gotRouterEvents := make(map[ndpRouterEvent]int)
+			for i := 0; i < maxRouterAndPrefixEvents; i++ {
+				ok, e := expectRouterEvent()
+				if !ok {
+					t.Errorf("expected %d router events after becoming a router; got = %d", maxRouterAndPrefixEvents, i)
+					break
+				}
+				gotRouterEvents[e]++
+			}
+			gotPrefixEvents := make(map[ndpPrefixEvent]int)
+			for i := 0; i < maxRouterAndPrefixEvents; i++ {
+				ok, e := expectPrefixEvent()
+				if !ok {
+					t.Errorf("expected %d prefix events after becoming a router; got = %d", maxRouterAndPrefixEvents, i)
+					break
+				}
+				gotPrefixEvents[e]++
+			}
+			gotAutoGenAddrEvents := make(map[ndpAutoGenAddrEvent]int)
+			for i := 0; i < test.maxAutoGenAddrEvents; i++ {
+				ok, e := expectAutoGenAddrEvent()
+				if !ok {
+					t.Errorf("expected %d auto-generated address events after becoming a router; got = %d", test.maxAutoGenAddrEvents, i)
+					break
+				}
+				gotAutoGenAddrEvents[e]++
+			}
 
-	// Should not get any more events (invalidation timers should have been
-	// cancelled when we transitioned into a router).
-	time.Sleep(lifetimeSeconds*time.Second + defaultTimeout)
-	select {
-	case <-ndpDisp.routerC:
-		t.Error("unexpected router event")
-	default:
-	}
-	select {
-	case <-ndpDisp.prefixC:
-		t.Error("unexpected prefix event")
-	default:
-	}
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Error("unexpected auto-generated address event")
-	default:
+			// No need to proceed any further if we already failed the test (missing
+			// some invalidation events).
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			expectedRouterEvents := map[ndpRouterEvent]int{
+				{nicID: nicID1, addr: llAddr3, discovered: false}: 1,
+				{nicID: nicID1, addr: llAddr4, discovered: false}: 1,
+				{nicID: nicID2, addr: llAddr3, discovered: false}: 1,
+				{nicID: nicID2, addr: llAddr4, discovered: false}: 1,
+			}
+			if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" {
+				t.Errorf("router events mismatch (-want +got):\n%s", diff)
+			}
+			expectedPrefixEvents := map[ndpPrefixEvent]int{
+				{nicID: nicID1, prefix: subnet1, discovered: false}: 1,
+				{nicID: nicID1, prefix: subnet2, discovered: false}: 1,
+				{nicID: nicID2, prefix: subnet1, discovered: false}: 1,
+				{nicID: nicID2, prefix: subnet2, discovered: false}: 1,
+			}
+			if diff := cmp.Diff(expectedPrefixEvents, gotPrefixEvents); diff != "" {
+				t.Errorf("prefix events mismatch (-want +got):\n%s", diff)
+			}
+			expectedAutoGenAddrEvents := map[ndpAutoGenAddrEvent]int{
+				{nicID: nicID1, addr: e1Addr1, eventType: invalidatedAddr}: 1,
+				{nicID: nicID1, addr: e1Addr2, eventType: invalidatedAddr}: 1,
+				{nicID: nicID2, addr: e2Addr1, eventType: invalidatedAddr}: 1,
+				{nicID: nicID2, addr: e2Addr2, eventType: invalidatedAddr}: 1,
+			}
+
+			if !test.keepAutoGenLinkLocal {
+				expectedAutoGenAddrEvents[ndpAutoGenAddrEvent{nicID: nicID1, addr: llAddrWithPrefix1, eventType: invalidatedAddr}] = 1
+				expectedAutoGenAddrEvents[ndpAutoGenAddrEvent{nicID: nicID2, addr: llAddrWithPrefix2, eventType: invalidatedAddr}] = 1
+			}
+
+			if diff := cmp.Diff(expectedAutoGenAddrEvents, gotAutoGenAddrEvents); diff != "" {
+				t.Errorf("auto-generated address events mismatch (-want +got):\n%s", diff)
+			}
+
+			// Make sure the auto-generated addresses got removed.
+			nicinfo = s.NICInfo()
+			nic1Addrs = nicinfo[nicID1].ProtocolAddresses
+			nic2Addrs = nicinfo[nicID2].ProtocolAddresses
+			if containsV6Addr(nic1Addrs, llAddrWithPrefix1) != test.keepAutoGenLinkLocal {
+				if test.keepAutoGenLinkLocal {
+					t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+				} else {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+				}
+			}
+			if containsV6Addr(nic1Addrs, e1Addr1) {
+				t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
+			}
+			if containsV6Addr(nic1Addrs, e1Addr2) {
+				t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
+			}
+			if containsV6Addr(nic2Addrs, llAddrWithPrefix2) != test.keepAutoGenLinkLocal {
+				if test.keepAutoGenLinkLocal {
+					t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+				} else {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+				}
+			}
+			if containsV6Addr(nic2Addrs, e2Addr1) {
+				t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
+			}
+			if containsV6Addr(nic2Addrs, e2Addr2) {
+				t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
+			}
+
+			// Should not get any more events (invalidation timers should have been
+			// cancelled when the NDP state was cleaned up).
+			time.Sleep(lifetimeSeconds*time.Second + defaultTimeout)
+			select {
+			case <-ndpDisp.routerC:
+				t.Error("unexpected router event")
+			default:
+			}
+			select {
+			case <-ndpDisp.prefixC:
+				t.Error("unexpected prefix event")
+			default:
+			}
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Error("unexpected auto-generated address event")
+			default:
+			}
+		})
 	}
 }
 
@@ -3406,77 +3483,130 @@ func TestRouterSolicitation(t *testing.T) {
 	})
 }
 
-// TestStopStartSolicitingRouters tests that when forwarding is enabled or
-// disabled, router solicitations are stopped or started, respecitively.
 func TestStopStartSolicitingRouters(t *testing.T) {
 	t.Parallel()
 
+	const nicID = 1
 	const interval = 500 * time.Millisecond
 	const delay = time.Second
 	const maxRtrSolicitations = 3
-	e := channel.New(maxRtrSolicitations, 1280, linkAddr1)
-	waitForPkt := func(timeout time.Duration) {
-		t.Helper()
-		ctx, _ := context.WithTimeout(context.Background(), timeout)
-		p, ok := e.ReadContext(ctx)
-		if !ok {
-			t.Fatal("timed out waiting for packet")
-			return
-		}
 
-		if p.Proto != header.IPv6ProtocolNumber {
-			t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
-		}
-		checker.IPv6(t, p.Pkt.Header.View(),
-			checker.SrcAddr(header.IPv6Any),
-			checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
-			checker.TTL(header.NDPHopLimit),
-			checker.NDPRS())
-	}
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			MaxRtrSolicitations:     maxRtrSolicitations,
-			RtrSolicitationInterval: interval,
-			MaxRtrSolicitationDelay: delay,
+	tests := []struct {
+		name    string
+		startFn func(t *testing.T, s *stack.Stack)
+		stopFn  func(t *testing.T, s *stack.Stack)
+	}{
+		// Tests that when forwarding is enabled or disabled, router solicitations
+		// are stopped or started, respectively.
+		{
+			name: "Forwarding enabled and disabled",
+			startFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+				s.SetForwarding(false)
+			},
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+				s.SetForwarding(true)
+			},
 		},
-	})
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(1) = %s", err)
-	}
 
-	// Enable forwarding which should stop router solicitations.
-	s.SetForwarding(true)
-	ctx, _ := context.WithTimeout(context.Background(), delay+defaultTimeout)
-	if _, ok := e.ReadContext(ctx); ok {
-		// A single RS may have been sent before forwarding was enabled.
-		ctx, _ = context.WithTimeout(context.Background(), interval+defaultTimeout)
-		if _, ok = e.ReadContext(ctx); ok {
-			t.Fatal("Should not have sent more than one RS message")
-		}
-	}
+		// Tests that when a NIC is enabled or disabled, router solicitations
+		// are started or stopped, respectively.
+		{
+			name: "NIC disabled and enabled",
+			startFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
 
-	// Enabling forwarding again should do nothing.
-	s.SetForwarding(true)
-	ctx, _ = context.WithTimeout(context.Background(), delay+defaultTimeout)
-	if _, ok := e.ReadContext(ctx); ok {
-		t.Fatal("unexpectedly got a packet after becoming a router")
-	}
+				if err := s.EnableNIC(nicID); err != nil {
+					t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+				}
+			},
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
 
-	// Disable forwarding which should start router solicitations.
-	s.SetForwarding(false)
-	waitForPkt(delay + defaultAsyncEventTimeout)
-	waitForPkt(interval + defaultAsyncEventTimeout)
-	waitForPkt(interval + defaultAsyncEventTimeout)
-	ctx, _ = context.WithTimeout(context.Background(), interval+defaultTimeout)
-	if _, ok := e.ReadContext(ctx); ok {
-		t.Fatal("unexpectedly got an extra packet after sending out the expected RSs")
+				if err := s.DisableNIC(nicID); err != nil {
+					t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+				}
+			},
+		},
 	}
 
-	// Disabling forwarding again should do nothing.
-	s.SetForwarding(false)
-	ctx, _ = context.WithTimeout(context.Background(), delay+defaultTimeout)
-	if _, ok := e.ReadContext(ctx); ok {
-		t.Fatal("unexpectedly got a packet after becoming a router")
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e := channel.New(maxRtrSolicitations, 1280, linkAddr1)
+			waitForPkt := func(timeout time.Duration) {
+				t.Helper()
+
+				ctx, cancel := context.WithTimeout(context.Background(), timeout)
+				defer cancel()
+				p, ok := e.ReadContext(ctx)
+				if !ok {
+					t.Fatal("timed out waiting for packet")
+					return
+				}
+
+				if p.Proto != header.IPv6ProtocolNumber {
+					t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+				}
+				checker.IPv6(t, p.Pkt.Header.View(),
+					checker.SrcAddr(header.IPv6Any),
+					checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+					checker.TTL(header.NDPHopLimit),
+					checker.NDPRS())
+			}
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					MaxRtrSolicitations:     maxRtrSolicitations,
+					RtrSolicitationInterval: interval,
+					MaxRtrSolicitationDelay: delay,
+				},
+			})
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+
+			// Stop soliciting routers.
+			test.stopFn(t, s)
+			ctx, cancel := context.WithTimeout(context.Background(), delay+defaultTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				// A single RS may have been sent before forwarding was enabled.
+				ctx, cancel := context.WithTimeout(context.Background(), interval+defaultTimeout)
+				defer cancel()
+				if _, ok = e.ReadContext(ctx); ok {
+					t.Fatal("should not have sent more than one RS message")
+				}
+			}
+
+			// Stopping router solicitations after it has already been stopped should
+			// do nothing.
+			test.stopFn(t, s)
+			ctx, cancel = context.WithTimeout(context.Background(), delay+defaultTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				t.Fatal("unexpectedly got a packet after router solicitation has been stopepd")
+			}
+
+			// Start soliciting routers.
+			test.startFn(t, s)
+			waitForPkt(delay + defaultAsyncEventTimeout)
+			waitForPkt(interval + defaultAsyncEventTimeout)
+			waitForPkt(interval + defaultAsyncEventTimeout)
+			ctx, cancel = context.WithTimeout(context.Background(), interval+defaultTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				t.Fatal("unexpectedly got an extra packet after sending out the expected RSs")
+			}
+
+			// Starting router solicitations after it has already completed should do
+			// nothing.
+			test.startFn(t, s)
+			ctx, cancel = context.WithTimeout(context.Background(), delay+defaultTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				t.Fatal("unexpectedly got a packet after finishing router solicitations")
+			}
+		})
 	}
 }
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index ca3a7a07e..b2be18e47 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -27,6 +27,14 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
+var ipv4BroadcastAddr = tcpip.ProtocolAddress{
+	Protocol: header.IPv4ProtocolNumber,
+	AddressWithPrefix: tcpip.AddressWithPrefix{
+		Address:   header.IPv4Broadcast,
+		PrefixLen: 8 * header.IPv4AddressSize,
+	},
+}
+
 // NIC represents a "network interface card" to which the networking stack is
 // attached.
 type NIC struct {
@@ -36,7 +44,8 @@ type NIC struct {
 	linkEP  LinkEndpoint
 	context NICContext
 
-	stats NICStats
+	stats  NICStats
+	attach sync.Once
 
 	mu struct {
 		sync.RWMutex
@@ -135,7 +144,69 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 	return nic
 }
 
-// enable enables the NIC. enable will attach the link to its LinkEndpoint and
+// enabled returns true if n is enabled.
+func (n *NIC) enabled() bool {
+	n.mu.RLock()
+	enabled := n.mu.enabled
+	n.mu.RUnlock()
+	return enabled
+}
+
+// disable disables n.
+//
+// It undoes the work done by enable.
+func (n *NIC) disable() *tcpip.Error {
+	n.mu.RLock()
+	enabled := n.mu.enabled
+	n.mu.RUnlock()
+	if !enabled {
+		return nil
+	}
+
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	if !n.mu.enabled {
+		return nil
+	}
+
+	// TODO(b/147015577): Should Routes that are currently bound to n be
+	// invalidated? Currently, Routes will continue to work when a NIC is enabled
+	// again, and applications may not know that the underlying NIC was ever
+	// disabled.
+
+	if _, ok := n.stack.networkProtocols[header.IPv6ProtocolNumber]; ok {
+		n.mu.ndp.stopSolicitingRouters()
+		n.mu.ndp.cleanupState(false /* hostOnly */)
+
+		// Stop DAD for all the unicast IPv6 endpoints that are in the
+		// permanentTentative state.
+		for _, r := range n.mu.endpoints {
+			if addr := r.ep.ID().LocalAddress; r.getKind() == permanentTentative && header.IsV6UnicastAddress(addr) {
+				n.mu.ndp.stopDuplicateAddressDetection(addr)
+			}
+		}
+
+		// The NIC may have already left the multicast group.
+		if err := n.leaveGroupLocked(header.IPv6AllNodesMulticastAddress); err != nil && err != tcpip.ErrBadLocalAddress {
+			return err
+		}
+	}
+
+	if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok {
+		// The address may have already been removed.
+		if err := n.removePermanentAddressLocked(ipv4BroadcastAddr.AddressWithPrefix.Address); err != nil && err != tcpip.ErrBadLocalAddress {
+			return err
+		}
+	}
+
+	// TODO(b/147015577): Should n detach from its LinkEndpoint?
+
+	n.mu.enabled = false
+	return nil
+}
+
+// enable enables n. enable will attach the nic to its LinkEndpoint and
 // join the IPv6 All-Nodes Multicast address (ff02::1).
 func (n *NIC) enable() *tcpip.Error {
 	n.mu.RLock()
@@ -158,10 +229,7 @@ func (n *NIC) enable() *tcpip.Error {
 
 	// Create an endpoint to receive broadcast packets on this interface.
 	if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok {
-		if _, err := n.addAddressLocked(tcpip.ProtocolAddress{
-			Protocol:          header.IPv4ProtocolNumber,
-			AddressWithPrefix: tcpip.AddressWithPrefix{header.IPv4Broadcast, 8 * header.IPv4AddressSize},
-		}, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
+		if _, err := n.addAddressLocked(ipv4BroadcastAddr, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
 			return err
 		}
 	}
@@ -183,6 +251,14 @@ func (n *NIC) enable() *tcpip.Error {
 		return nil
 	}
 
+	// Join the All-Nodes multicast group before starting DAD as responses to DAD
+	// (NDP NS) messages may be sent to the All-Nodes multicast group if the
+	// source address of the NDP NS is the unspecified address, as per RFC 4861
+	// section 7.2.4.
+	if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil {
+		return err
+	}
+
 	// Perform DAD on the all the unicast IPv6 endpoints that are in the permanent
 	// state.
 	//
@@ -200,10 +276,6 @@ func (n *NIC) enable() *tcpip.Error {
 		}
 	}
 
-	if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil {
-		return err
-	}
-
 	// Do not auto-generate an IPv6 link-local address for loopback devices.
 	if n.stack.autoGenIPv6LinkLocal && !n.isLoopback() {
 		// The valid and preferred lifetime is infinite for the auto-generated
@@ -234,7 +306,7 @@ func (n *NIC) becomeIPv6Router() {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	n.mu.ndp.cleanupHostOnlyState()
+	n.mu.ndp.cleanupState(true /* hostOnly */)
 	n.mu.ndp.stopSolicitingRouters()
 }
 
@@ -252,7 +324,9 @@ func (n *NIC) becomeIPv6Host() {
 // attachLinkEndpoint attaches the NIC to the endpoint, which will enable it
 // to start delivering packets.
 func (n *NIC) attachLinkEndpoint() {
-	n.linkEP.Attach(n)
+	n.attach.Do(func() {
+		n.linkEP.Attach(n)
+	})
 }
 
 // setPromiscuousMode enables or disables promiscuous mode.
@@ -712,6 +786,7 @@ func (n *NIC) AllAddresses() []tcpip.ProtocolAddress {
 		case permanentExpired, temporary:
 			continue
 		}
+
 		addrs = append(addrs, tcpip.ProtocolAddress{
 			Protocol: ref.protocol,
 			AddressWithPrefix: tcpip.AddressWithPrefix{
@@ -1009,6 +1084,15 @@ func (n *NIC) leaveGroupLocked(addr tcpip.Address) *tcpip.Error {
 	return nil
 }
 
+// isInGroup returns true if n has joined the multicast group addr.
+func (n *NIC) isInGroup(addr tcpip.Address) bool {
+	n.mu.RLock()
+	joins := n.mu.mcastJoins[NetworkEndpointID{addr}]
+	n.mu.RUnlock()
+
+	return joins != 0
+}
+
 func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt tcpip.PacketBuffer) {
 	r := makeRoute(protocol, dst, src, localLinkAddr, ref, false /* handleLocal */, false /* multicastLoop */)
 	r.RemoteLinkAddress = remotelinkAddr
@@ -1411,7 +1495,7 @@ func (r *referencedNetworkEndpoint) isValidForOutgoing() bool {
 //
 // r's NIC must be read locked.
 func (r *referencedNetworkEndpoint) isValidForOutgoingRLocked() bool {
-	return r.getKind() != permanentExpired || r.nic.mu.spoofing
+	return r.nic.mu.enabled && (r.getKind() != permanentExpired || r.nic.mu.spoofing)
 }
 
 // decRef decrements the ref count and cleans up the endpoint once it reaches
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 6eac16e16..fabc976a7 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -921,23 +921,38 @@ func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
-	nic := s.nics[id]
-	if nic == nil {
+	nic, ok := s.nics[id]
+	if !ok {
 		return tcpip.ErrUnknownNICID
 	}
 
 	return nic.enable()
 }
 
+// DisableNIC disables the given NIC.
+func (s *Stack) DisableNIC(id tcpip.NICID) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nic, ok := s.nics[id]
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+
+	return nic.disable()
+}
+
 // CheckNIC checks if a NIC is usable.
 func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	s.mu.RLock()
+	defer s.mu.RUnlock()
+
 	nic, ok := s.nics[id]
-	s.mu.RUnlock()
-	if ok {
-		return nic.linkEP.IsAttached()
+	if !ok {
+		return false
 	}
-	return false
+
+	return nic.enabled()
 }
 
 // NICAddressRanges returns a map of NICIDs to their associated subnets.
@@ -989,7 +1004,7 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo {
 	for id, nic := range s.nics {
 		flags := NICStateFlags{
 			Up:          true, // Netstack interfaces are always up.
-			Running:     nic.linkEP.IsAttached(),
+			Running:     nic.enabled(),
 			Promiscuous: nic.isPromiscuousMode(),
 			Loopback:    nic.isLoopback(),
 		}
@@ -1151,7 +1166,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 	isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)
 	needRoute := !(isBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr))
 	if id != 0 && !needRoute {
-		if nic, ok := s.nics[id]; ok {
+		if nic, ok := s.nics[id]; ok && nic.enabled() {
 			if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
 				return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()), nil
 			}
@@ -1161,7 +1176,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 			if (id != 0 && id != route.NIC) || (len(remoteAddr) != 0 && !route.Destination.Contains(remoteAddr)) {
 				continue
 			}
-			if nic, ok := s.nics[route.NIC]; ok {
+			if nic, ok := s.nics[route.NIC]; ok && nic.enabled() {
 				if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
 					if len(remoteAddr) == 0 {
 						// If no remote address was provided, then the route
@@ -1614,6 +1629,18 @@ func (s *Stack) LeaveGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NIC
 	return tcpip.ErrUnknownNICID
 }
 
+// IsInGroup returns true if the NIC with ID nicID has joined the multicast
+// group multicastAddr.
+func (s *Stack) IsInGroup(nicID tcpip.NICID, multicastAddr tcpip.Address) (bool, *tcpip.Error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if nic, ok := s.nics[nicID]; ok {
+		return nic.isInGroup(multicastAddr), nil
+	}
+	return false, tcpip.ErrUnknownNICID
+}
+
 // IPTables returns the stack's iptables.
 func (s *Stack) IPTables() iptables.IPTables {
 	s.tablesMu.RLock()
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 7ba604442..eb6f7d1fc 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -33,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
@@ -509,6 +510,257 @@ func testNoRoute(t *testing.T, s *stack.Stack, nic tcpip.NICID, srcAddr, dstAddr
 	}
 }
 
+func TestDisableUnknownNIC(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	if err := s.DisableNIC(1); err != tcpip.ErrUnknownNICID {
+		t.Fatalf("got s.DisableNIC(1) = %v, want = %s", err, tcpip.ErrUnknownNICID)
+	}
+}
+
+func TestDisabledNICsNICInfoAndCheckNIC(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	e := loopback.New()
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	checkNIC := func(enabled bool) {
+		t.Helper()
+
+		allNICInfo := s.NICInfo()
+		nicInfo, ok := allNICInfo[nicID]
+		if !ok {
+			t.Errorf("entry for %d missing from allNICInfo = %+v", nicID, allNICInfo)
+		} else if nicInfo.Flags.Running != enabled {
+			t.Errorf("got nicInfo.Flags.Running = %t, want = %t", nicInfo.Flags.Running, enabled)
+		}
+
+		if got := s.CheckNIC(nicID); got != enabled {
+			t.Errorf("got s.CheckNIC(%d) = %t, want = %t", nicID, got, enabled)
+		}
+	}
+
+	// NIC should initially report itself as disabled.
+	checkNIC(false)
+
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	checkNIC(true)
+
+	// If the NIC is not reporting a correct enabled status, we cannot trust the
+	// next check so end the test here.
+	if t.Failed() {
+		t.FailNow()
+	}
+
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	checkNIC(false)
+}
+
+func TestRoutesWithDisabledNIC(t *testing.T) {
+	const unspecifiedNIC = 0
+	const nicID1 = 1
+	const nicID2 = 2
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	ep1 := channel.New(0, defaultMTU, "")
+	if err := s.CreateNIC(nicID1, ep1); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
+	}
+
+	addr1 := tcpip.Address("\x01")
+	if err := s.AddAddress(nicID1, fakeNetNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID1, fakeNetNumber, addr1, err)
+	}
+
+	ep2 := channel.New(0, defaultMTU, "")
+	if err := s.CreateNIC(nicID2, ep2); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
+	}
+
+	addr2 := tcpip.Address("\x02")
+	if err := s.AddAddress(nicID2, fakeNetNumber, addr2); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID2, fakeNetNumber, addr2, err)
+	}
+
+	// Set a route table that sends all packets with odd destination
+	// addresses through the first NIC, and all even destination address
+	// through the second one.
+	{
+		subnet0, err := tcpip.NewSubnet("\x00", "\x01")
+		if err != nil {
+			t.Fatal(err)
+		}
+		subnet1, err := tcpip.NewSubnet("\x01", "\x01")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{
+			{Destination: subnet1, Gateway: "\x00", NIC: nicID1},
+			{Destination: subnet0, Gateway: "\x00", NIC: nicID2},
+		})
+	}
+
+	// Test routes to odd address.
+	testRoute(t, s, unspecifiedNIC, "", "\x05", addr1)
+	testRoute(t, s, unspecifiedNIC, addr1, "\x05", addr1)
+	testRoute(t, s, nicID1, addr1, "\x05", addr1)
+
+	// Test routes to even address.
+	testRoute(t, s, unspecifiedNIC, "", "\x06", addr2)
+	testRoute(t, s, unspecifiedNIC, addr2, "\x06", addr2)
+	testRoute(t, s, nicID2, addr2, "\x06", addr2)
+
+	// Disabling NIC1 should result in no routes to odd addresses. Routes to even
+	// addresses should continue to be available as NIC2 is still enabled.
+	if err := s.DisableNIC(nicID1); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID1, err)
+	}
+	nic1Dst := tcpip.Address("\x05")
+	testNoRoute(t, s, unspecifiedNIC, "", nic1Dst)
+	testNoRoute(t, s, unspecifiedNIC, addr1, nic1Dst)
+	testNoRoute(t, s, nicID1, addr1, nic1Dst)
+	nic2Dst := tcpip.Address("\x06")
+	testRoute(t, s, unspecifiedNIC, "", nic2Dst, addr2)
+	testRoute(t, s, unspecifiedNIC, addr2, nic2Dst, addr2)
+	testRoute(t, s, nicID2, addr2, nic2Dst, addr2)
+
+	// Disabling NIC2 should result in no routes to even addresses. No route
+	// should be available to any address as routes to odd addresses were made
+	// unavailable by disabling NIC1 above.
+	if err := s.DisableNIC(nicID2); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID2, err)
+	}
+	testNoRoute(t, s, unspecifiedNIC, "", nic1Dst)
+	testNoRoute(t, s, unspecifiedNIC, addr1, nic1Dst)
+	testNoRoute(t, s, nicID1, addr1, nic1Dst)
+	testNoRoute(t, s, unspecifiedNIC, "", nic2Dst)
+	testNoRoute(t, s, unspecifiedNIC, addr2, nic2Dst)
+	testNoRoute(t, s, nicID2, addr2, nic2Dst)
+
+	// Enabling NIC1 should make routes to odd addresses available again. Routes
+	// to even addresses should continue to be unavailable as NIC2 is still
+	// disabled.
+	if err := s.EnableNIC(nicID1); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID1, err)
+	}
+	testRoute(t, s, unspecifiedNIC, "", nic1Dst, addr1)
+	testRoute(t, s, unspecifiedNIC, addr1, nic1Dst, addr1)
+	testRoute(t, s, nicID1, addr1, nic1Dst, addr1)
+	testNoRoute(t, s, unspecifiedNIC, "", nic2Dst)
+	testNoRoute(t, s, unspecifiedNIC, addr2, nic2Dst)
+	testNoRoute(t, s, nicID2, addr2, nic2Dst)
+}
+
+func TestRouteWritePacketWithDisabledNIC(t *testing.T) {
+	const unspecifiedNIC = 0
+	const nicID1 = 1
+	const nicID2 = 2
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	ep1 := channel.New(1, defaultMTU, "")
+	if err := s.CreateNIC(nicID1, ep1); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
+	}
+
+	addr1 := tcpip.Address("\x01")
+	if err := s.AddAddress(nicID1, fakeNetNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID1, fakeNetNumber, addr1, err)
+	}
+
+	ep2 := channel.New(1, defaultMTU, "")
+	if err := s.CreateNIC(nicID2, ep2); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
+	}
+
+	addr2 := tcpip.Address("\x02")
+	if err := s.AddAddress(nicID2, fakeNetNumber, addr2); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID2, fakeNetNumber, addr2, err)
+	}
+
+	// Set a route table that sends all packets with odd destination
+	// addresses through the first NIC, and all even destination address
+	// through the second one.
+	{
+		subnet0, err := tcpip.NewSubnet("\x00", "\x01")
+		if err != nil {
+			t.Fatal(err)
+		}
+		subnet1, err := tcpip.NewSubnet("\x01", "\x01")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{
+			{Destination: subnet1, Gateway: "\x00", NIC: nicID1},
+			{Destination: subnet0, Gateway: "\x00", NIC: nicID2},
+		})
+	}
+
+	nic1Dst := tcpip.Address("\x05")
+	r1, err := s.FindRoute(nicID1, addr1, nic1Dst, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Errorf("FindRoute(%d, %s, %s, %d, false): %s", nicID1, addr1, nic1Dst, fakeNetNumber, err)
+	}
+	defer r1.Release()
+
+	nic2Dst := tcpip.Address("\x06")
+	r2, err := s.FindRoute(nicID2, addr2, nic2Dst, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Errorf("FindRoute(%d, %s, %s, %d, false): %s", nicID2, addr2, nic2Dst, fakeNetNumber, err)
+	}
+	defer r2.Release()
+
+	// If we failed to get routes r1 or r2, we cannot proceed with the test.
+	if t.Failed() {
+		t.FailNow()
+	}
+
+	buf := buffer.View([]byte{1})
+	testSend(t, r1, ep1, buf)
+	testSend(t, r2, ep2, buf)
+
+	// Writes with Routes that use the disabled NIC1 should fail.
+	if err := s.DisableNIC(nicID1); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID1, err)
+	}
+	testFailingSend(t, r1, ep1, buf, tcpip.ErrInvalidEndpointState)
+	testSend(t, r2, ep2, buf)
+
+	// Writes with Routes that use the disabled NIC2 should fail.
+	if err := s.DisableNIC(nicID2); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID2, err)
+	}
+	testFailingSend(t, r1, ep1, buf, tcpip.ErrInvalidEndpointState)
+	testFailingSend(t, r2, ep2, buf, tcpip.ErrInvalidEndpointState)
+
+	// Writes with Routes that use the re-enabled NIC1 should succeed.
+	// TODO(b/147015577): Should we instead completely invalidate all Routes that
+	// were bound to a disabled NIC at some point?
+	if err := s.EnableNIC(nicID1); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID1, err)
+	}
+	testSend(t, r1, ep1, buf)
+	testFailingSend(t, r2, ep2, buf, tcpip.ErrInvalidEndpointState)
+}
+
 func TestRoutes(t *testing.T) {
 	// Create a stack with the fake network protocol, two nics, and two
 	// addresses per nic, the first nic has odd address, the second one has
@@ -2173,13 +2425,29 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 
 			e := channel.New(0, 1280, test.linkAddr)
 			s := stack.New(opts)
-			nicOpts := stack.NICOptions{Name: test.nicName}
+			nicOpts := stack.NICOptions{Name: test.nicName, Disabled: true}
 			if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
 				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err)
 			}
 
-			var expectedMainAddr tcpip.AddressWithPrefix
+			// A new disabled NIC should not have any address, even if auto generation
+			// was enabled.
+			allStackAddrs := s.AllAddresses()
+			allNICAddrs, ok := allStackAddrs[nicID]
+			if !ok {
+				t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+			}
+			if l := len(allNICAddrs); l != 0 {
+				t.Fatalf("got len(allNICAddrs) = %d, want = 0", l)
+			}
 
+			// Enabling the NIC should attempt auto-generation of a link-local
+			// address.
+			if err := s.EnableNIC(nicID); err != nil {
+				t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+			}
+
+			var expectedMainAddr tcpip.AddressWithPrefix
 			if test.shouldGen {
 				expectedMainAddr = tcpip.AddressWithPrefix{
 					Address:   test.expectedAddr,
@@ -2609,6 +2877,111 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 	}
 }
 
+func TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable(t *testing.T) {
+	const nicID = 1
+
+	e := loopback.New()
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()},
+	})
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	allStackAddrs := s.AllAddresses()
+	allNICAddrs, ok := allStackAddrs[nicID]
+	if !ok {
+		t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+	}
+	if l := len(allNICAddrs); l != 0 {
+		t.Fatalf("got len(allNICAddrs) = %d, want = 0", l)
+	}
+
+	// Enabling the NIC should add the IPv4 broadcast address.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	allStackAddrs = s.AllAddresses()
+	allNICAddrs, ok = allStackAddrs[nicID]
+	if !ok {
+		t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+	}
+	if l := len(allNICAddrs); l != 1 {
+		t.Fatalf("got len(allNICAddrs) = %d, want = 1", l)
+	}
+	want := tcpip.ProtocolAddress{
+		Protocol: header.IPv4ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   header.IPv4Broadcast,
+			PrefixLen: 32,
+		},
+	}
+	if allNICAddrs[0] != want {
+		t.Fatalf("got allNICAddrs[0] = %+v, want = %+v", allNICAddrs[0], want)
+	}
+
+	// Disabling the NIC should remove the IPv4 broadcast address.
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	allStackAddrs = s.AllAddresses()
+	allNICAddrs, ok = allStackAddrs[nicID]
+	if !ok {
+		t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+	}
+	if l := len(allNICAddrs); l != 0 {
+		t.Fatalf("got len(allNICAddrs) = %d, want = 0", l)
+	}
+}
+
+func TestJoinLeaveAllNodesMulticastOnNICEnableDisable(t *testing.T) {
+	const nicID = 1
+
+	e := loopback.New()
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+	})
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	// Should not be in the IPv6 all-nodes multicast group yet because the NIC has
+	// not been enabled yet.
+	isInGroup, err := s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err)
+	}
+	if isInGroup {
+		t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, header.IPv6AllNodesMulticastAddress)
+	}
+
+	// The all-nodes multicast group should be joined when the NIC is enabled.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	isInGroup, err = s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err)
+	}
+	if !isInGroup {
+		t.Fatalf("got IsInGroup(%d, %s) = false, want = true", nicID, header.IPv6AllNodesMulticastAddress)
+	}
+
+	// The all-nodes multicast group should be left when the NIC is disabled.
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	isInGroup, err = s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err)
+	}
+	if isInGroup {
+		t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, header.IPv6AllNodesMulticastAddress)
+	}
+}
+
 // TestDoDADWhenNICEnabled tests that IPv6 endpoints that were added while a NIC
 // was disabled have DAD performed on them when the NIC is enabled.
 func TestDoDADWhenNICEnabled(t *testing.T) {
-- 
cgit v1.2.3


From 4a73bae269ae9f52a962ae3b08a17ccaacf7ba80 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 15:19:40 -0800
Subject: Initial network namespace support.

TCP/IP will work with netstack networking. hostinet doesn't work, and sockets
will have the same behavior as it is now.

Before the userspace is able to create device, the default loopback device can
be used to test.

/proc/net and /sys/net will still be connected to the root network stack; this
is the same behavior now.

Issue #1833

PiperOrigin-RevId: 296309389
---
 pkg/sentry/fs/proc/net.go                |   5 +-
 pkg/sentry/fs/proc/sys_net.go            |   4 +-
 pkg/sentry/fsimpl/proc/tasks_net.go      |   5 +-
 pkg/sentry/fsimpl/proc/tasks_sys.go      |   4 +-
 pkg/sentry/fsimpl/testutil/kernel.go     |   1 +
 pkg/sentry/inet/BUILD                    |   1 +
 pkg/sentry/inet/namespace.go             |  99 +++++++++++++++++++++++++
 pkg/sentry/kernel/kernel.go              |  26 ++++---
 pkg/sentry/kernel/task.go                |   9 +--
 pkg/sentry/kernel/task_clone.go          |  16 ++--
 pkg/sentry/kernel/task_net.go            |  19 +++--
 pkg/sentry/kernel/task_start.go          |   8 +-
 pkg/tcpip/time_unsafe.go                 |   2 +
 runsc/boot/BUILD                         |   2 +-
 runsc/boot/controller.go                 |  11 +--
 runsc/boot/loader.go                     | 121 +++++++++++++++++++++----------
 runsc/boot/network.go                    |  27 +++++++
 runsc/boot/pprof.go                      |  18 -----
 runsc/boot/pprof/BUILD                   |  11 +++
 runsc/boot/pprof/pprof.go                |  20 +++++
 runsc/sandbox/network.go                 |  25 +------
 test/syscalls/BUILD                      |   2 +
 test/syscalls/linux/BUILD                |  17 +++++
 test/syscalls/linux/network_namespace.cc | 121 +++++++++++++++++++++++++++++++
 24 files changed, 451 insertions(+), 123 deletions(-)
 create mode 100644 pkg/sentry/inet/namespace.go
 delete mode 100644 runsc/boot/pprof.go
 create mode 100644 runsc/boot/pprof/BUILD
 create mode 100644 runsc/boot/pprof/pprof.go
 create mode 100644 test/syscalls/linux/network_namespace.cc

diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 6f2775344..95d5817ff 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -43,7 +43,10 @@ import (
 // newNet creates a new proc net entry.
 func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode {
 	var contents map[string]*fs.Inode
-	if s := p.k.NetworkStack(); s != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process. We should make this per-process,
+	// a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net.
+	if s := p.k.RootNetworkNamespace().Stack(); s != nil {
 		contents = map[string]*fs.Inode{
 			"dev":  seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc),
 			"snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc),
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index 0772d4ae4..d4c4b533d 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -357,7 +357,9 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine
 
 func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 	var contents map[string]*fs.Inode
-	if s := p.k.NetworkStack(); s != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process.
+	if s := p.k.RootNetworkNamespace().Stack(); s != nil {
 		contents = map[string]*fs.Inode{
 			"ipv4": p.newSysNetIPv4Dir(ctx, msrc, s),
 			"core": p.newSysNetCore(ctx, msrc, s),
diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go
index 608fec017..d4e1812d8 100644
--- a/pkg/sentry/fsimpl/proc/tasks_net.go
+++ b/pkg/sentry/fsimpl/proc/tasks_net.go
@@ -39,7 +39,10 @@ import (
 
 func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
 	var contents map[string]*kernfs.Dentry
-	if stack := k.NetworkStack(); stack != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process. We should make this per-process,
+	// a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net.
+	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
 		const (
 			arp       = "IP address       HW type     Flags       HW address            Mask     Device\n"
 			netlink   = "sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n"
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index c7ce74883..3d5dc463c 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -50,7 +50,9 @@ func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k
 func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
 	var contents map[string]*kernfs.Dentry
 
-	if stack := k.NetworkStack(); stack != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process.
+	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
 		contents = map[string]*kernfs.Dentry{
 			"ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
 				"tcp_sack": newDentry(root, inoGen.NextIno(), 0644, &tcpSackData{stack: stack}),
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index d0be32e72..488478e29 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -128,6 +128,7 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns
 		ThreadGroup:             tc,
 		TaskContext:             &kernel.TaskContext{Name: name},
 		Credentials:             auth.CredentialsFromContext(ctx),
+		NetworkNamespace:        k.RootNetworkNamespace(),
 		AllowedCPUMask:          sched.NewFullCPUSet(k.ApplicationCores()),
 		UTSNamespace:            kernel.UTSNamespaceFromContext(ctx),
 		IPCNamespace:            kernel.IPCNamespaceFromContext(ctx),
diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index 334432abf..07bf39fed 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -10,6 +10,7 @@ go_library(
     srcs = [
         "context.go",
         "inet.go",
+        "namespace.go",
         "test_stack.go",
     ],
     deps = [
diff --git a/pkg/sentry/inet/namespace.go b/pkg/sentry/inet/namespace.go
new file mode 100644
index 000000000..c16667e7f
--- /dev/null
+++ b/pkg/sentry/inet/namespace.go
@@ -0,0 +1,99 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package inet
+
+// Namespace represents a network namespace. See network_namespaces(7).
+//
+// +stateify savable
+type Namespace struct {
+	// stack is the network stack implementation of this network namespace.
+	stack Stack `state:"nosave"`
+
+	// creator allows kernel to create new network stack for network namespaces.
+	// If nil, no networking will function if network is namespaced.
+	creator NetworkStackCreator
+
+	// isRoot indicates whether this is the root network namespace.
+	isRoot bool
+}
+
+// NewRootNamespace creates the root network namespace, with creator
+// allowing new network namespaces to be created. If creator is nil, no
+// networking will function if the network is namespaced.
+func NewRootNamespace(stack Stack, creator NetworkStackCreator) *Namespace {
+	return &Namespace{
+		stack:   stack,
+		creator: creator,
+		isRoot:  true,
+	}
+}
+
+// NewNamespace creates a new network namespace from the root.
+func NewNamespace(root *Namespace) *Namespace {
+	n := &Namespace{
+		creator: root.creator,
+	}
+	n.init()
+	return n
+}
+
+// Stack returns the network stack of n. Stack may return nil if no network
+// stack is configured.
+func (n *Namespace) Stack() Stack {
+	return n.stack
+}
+
+// IsRoot returns whether n is the root network namespace.
+func (n *Namespace) IsRoot() bool {
+	return n.isRoot
+}
+
+// RestoreRootStack restores the root network namespace with stack. This should
+// only be called when restoring kernel.
+func (n *Namespace) RestoreRootStack(stack Stack) {
+	if !n.isRoot {
+		panic("RestoreRootStack can only be called on root network namespace")
+	}
+	if n.stack != nil {
+		panic("RestoreRootStack called after a stack has already been set")
+	}
+	n.stack = stack
+}
+
+func (n *Namespace) init() {
+	// Root network namespace will have stack assigned later.
+	if n.isRoot {
+		return
+	}
+	if n.creator != nil {
+		var err error
+		n.stack, err = n.creator.CreateStack()
+		if err != nil {
+			panic(err)
+		}
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (n *Namespace) afterLoad() {
+	n.init()
+}
+
+// NetworkStackCreator allows new instances of a network stack to be created. It
+// is used by the kernel to create new network namespaces when requested.
+type NetworkStackCreator interface {
+	// CreateStack creates a new network stack for a network namespace.
+	CreateStack() (Stack, error)
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 7da0368f1..c62fd6eb1 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -111,7 +111,7 @@ type Kernel struct {
 	timekeeper                  *Timekeeper
 	tasks                       *TaskSet
 	rootUserNamespace           *auth.UserNamespace
-	networkStack                inet.Stack `state:"nosave"`
+	rootNetworkNamespace        *inet.Namespace
 	applicationCores            uint
 	useHostCores                bool
 	extraAuxv                   []arch.AuxEntry
@@ -260,8 +260,9 @@ type InitKernelArgs struct {
 	// RootUserNamespace is the root user namespace.
 	RootUserNamespace *auth.UserNamespace
 
-	// NetworkStack is the TCP/IP network stack. NetworkStack may be nil.
-	NetworkStack inet.Stack
+	// RootNetworkNamespace is the root network namespace. If nil, no networking
+	// will be available.
+	RootNetworkNamespace *inet.Namespace
 
 	// ApplicationCores is the number of logical CPUs visible to sandboxed
 	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
@@ -320,7 +321,10 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	k.rootUTSNamespace = args.RootUTSNamespace
 	k.rootIPCNamespace = args.RootIPCNamespace
 	k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
-	k.networkStack = args.NetworkStack
+	k.rootNetworkNamespace = args.RootNetworkNamespace
+	if k.rootNetworkNamespace == nil {
+		k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil)
+	}
 	k.applicationCores = args.ApplicationCores
 	if args.UseHostCores {
 		k.useHostCores = true
@@ -543,8 +547,6 @@ func (ts *TaskSet) unregisterEpollWaiters() {
 func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
 	loadStart := time.Now()
 
-	k.networkStack = net
-
 	initAppCores := k.applicationCores
 
 	// Load the pre-saved CPUID FeatureSet.
@@ -575,6 +577,10 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
 	log.Infof("Kernel load stats: %s", &stats)
 	log.Infof("Kernel load took [%s].", time.Since(kernelStart))
 
+	// rootNetworkNamespace should be populated after loading the state file.
+	// Restore the root network stack.
+	k.rootNetworkNamespace.RestoreRootStack(net)
+
 	// Load the memory file's state.
 	memoryStart := time.Now()
 	if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil {
@@ -905,6 +911,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		FSContext:               fsContext,
 		FDTable:                 args.FDTable,
 		Credentials:             args.Credentials,
+		NetworkNamespace:        k.RootNetworkNamespace(),
 		AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
 		UTSNamespace:            args.UTSNamespace,
 		IPCNamespace:            args.IPCNamespace,
@@ -1255,10 +1262,9 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
 	return k.rootAbstractSocketNamespace
 }
 
-// NetworkStack returns the network stack. NetworkStack may return nil if no
-// network stack is available.
-func (k *Kernel) NetworkStack() inet.Stack {
-	return k.networkStack
+// RootNetworkNamespace returns the root network namespace, always non-nil.
+func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
+	return k.rootNetworkNamespace
 }
 
 // GlobalInit returns the thread group with ID 1 in the root PID namespace, or
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index a3443ff21..e37e23231 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -486,13 +486,10 @@ type Task struct {
 	numaPolicy   int32
 	numaNodeMask uint64
 
-	// If netns is true, the task is in a non-root network namespace. Network
-	// namespaces aren't currently implemented in full; being in a network
-	// namespace simply prevents the task from observing any network devices
-	// (including loopback) or using abstract socket addresses (see unix(7)).
+	// netns is the task's network namespace. netns is never nil.
 	//
-	// netns is protected by mu. netns is owned by the task goroutine.
-	netns bool
+	// netns is protected by mu.
+	netns *inet.Namespace
 
 	// If rseqPreempted is true, before the next call to p.Switch(),
 	// interrupt rseq critical regions as defined by rseqAddr and
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index ba74b4c1c..78866f280 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bpf"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -54,8 +55,7 @@ type SharingOptions struct {
 	NewUserNamespace bool
 
 	// If NewNetworkNamespace is true, the task should have an independent
-	// network namespace. (Note that network namespaces are not really
-	// implemented; see comment on Task.netns for details.)
+	// network namespace.
 	NewNetworkNamespace bool
 
 	// If NewFiles is true, the task should use an independent file descriptor
@@ -199,6 +199,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		ipcns = NewIPCNamespace(userns)
 	}
 
+	netns := t.NetworkNamespace()
+	if opts.NewNetworkNamespace {
+		netns = inet.NewNamespace(netns)
+	}
+
 	// TODO(b/63601033): Implement CLONE_NEWNS.
 	mntnsVFS2 := t.mountNamespaceVFS2
 	if mntnsVFS2 != nil {
@@ -268,7 +273,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		FDTable:                 fdTable,
 		Credentials:             creds,
 		Niceness:                t.Niceness(),
-		NetworkNamespaced:       t.netns,
+		NetworkNamespace:        netns,
 		AllowedCPUMask:          t.CPUMask(),
 		UTSNamespace:            utsns,
 		IPCNamespace:            ipcns,
@@ -283,9 +288,6 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	} else {
 		cfg.InheritParent = t
 	}
-	if opts.NewNetworkNamespace {
-		cfg.NetworkNamespaced = true
-	}
 	nt, err := t.tg.pidns.owner.NewTask(cfg)
 	if err != nil {
 		if opts.NewThreadGroup {
@@ -482,7 +484,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 			t.mu.Unlock()
 			return syserror.EPERM
 		}
-		t.netns = true
+		t.netns = inet.NewNamespace(t.netns)
 	}
 	if opts.NewUTSNamespace {
 		if !haveCapSysAdmin {
diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go
index 172a31e1d..f7711232c 100644
--- a/pkg/sentry/kernel/task_net.go
+++ b/pkg/sentry/kernel/task_net.go
@@ -22,14 +22,23 @@ import (
 func (t *Task) IsNetworkNamespaced() bool {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	return t.netns
+	return !t.netns.IsRoot()
 }
 
 // NetworkContext returns the network stack used by the task. NetworkContext
 // may return nil if no network stack is available.
+//
+// TODO(gvisor.dev/issue/1833): Migrate callers of this method to
+// NetworkNamespace().
 func (t *Task) NetworkContext() inet.Stack {
-	if t.IsNetworkNamespaced() {
-		return nil
-	}
-	return t.k.networkStack
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.netns.Stack()
+}
+
+// NetworkNamespace returns the network namespace observed by the task.
+func (t *Task) NetworkNamespace() *inet.Namespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.netns
 }
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index f9236a842..a5035bb7f 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
@@ -65,9 +66,8 @@ type TaskConfig struct {
 	// Niceness is the niceness of the new task.
 	Niceness int
 
-	// If NetworkNamespaced is true, the new task should observe a non-root
-	// network namespace.
-	NetworkNamespaced bool
+	// NetworkNamespace is the network namespace to be used for the new task.
+	NetworkNamespace *inet.Namespace
 
 	// AllowedCPUMask contains the cpus that this task can run on.
 	AllowedCPUMask sched.CPUSet
@@ -133,7 +133,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		allowedCPUMask:     cfg.AllowedCPUMask.Copy(),
 		ioUsage:            &usage.IO{},
 		niceness:           cfg.Niceness,
-		netns:              cfg.NetworkNamespaced,
+		netns:              cfg.NetworkNamespace,
 		utsns:              cfg.UTSNamespace,
 		ipcns:              cfg.IPCNamespace,
 		abstractSockets:    cfg.AbstractSocketNamespace,
diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go
index 48764b978..2f98a996f 100644
--- a/pkg/tcpip/time_unsafe.go
+++ b/pkg/tcpip/time_unsafe.go
@@ -25,6 +25,8 @@ import (
 )
 
 // StdClock implements Clock with the time package.
+//
+// +stateify savable
 type StdClock struct{}
 
 var _ Clock = (*StdClock)(nil)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index ae4dd102a..26f68fe3d 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -19,7 +19,6 @@ go_library(
         "loader_amd64.go",
         "loader_arm64.go",
         "network.go",
-        "pprof.go",
         "strace.go",
         "user.go",
     ],
@@ -91,6 +90,7 @@ go_library(
         "//pkg/usermem",
         "//runsc/boot/filter",
         "//runsc/boot/platforms",
+        "//runsc/boot/pprof",
         "//runsc/specutils",
         "@com_github_golang_protobuf//proto:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 9c9e94864..17e774e0c 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -32,6 +32,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/boot/pprof"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -142,7 +143,7 @@ func newController(fd int, l *Loader) (*controller, error) {
 	}
 	srv.Register(manager)
 
-	if eps, ok := l.k.NetworkStack().(*netstack.Stack); ok {
+	if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
 		net := &Network{
 			Stack: eps.Stack,
 		}
@@ -341,7 +342,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 		return fmt.Errorf("creating memory file: %v", err)
 	}
 	k.SetMemoryFile(mf)
-	networkStack := cm.l.k.NetworkStack()
+	networkStack := cm.l.k.RootNetworkNamespace().Stack()
 	cm.l.k = k
 
 	// Set up the restore environment.
@@ -365,9 +366,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	}
 
 	if cm.l.conf.ProfileEnable {
-		// initializePProf opens /proc/self/maps, so has to be
-		// called before installing seccomp filters.
-		initializePProf()
+		// pprof.Initialize opens /proc/self/maps, so has to be called before
+		// installing seccomp filters.
+		pprof.Initialize()
 	}
 
 	// Seccomp filters have to be applied before parsing the state file.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index eef43b9df..e7ca98134 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -49,6 +49,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -60,6 +61,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/runsc/boot/filter"
 	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
+	"gvisor.dev/gvisor/runsc/boot/pprof"
 	"gvisor.dev/gvisor/runsc/specutils"
 
 	// Include supported socket providers.
@@ -230,11 +232,8 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("enabling strace: %v", err)
 	}
 
-	// Create an empty network stack because the network namespace may be empty at
-	// this point. Netns is configured before Run() is called. Netstack is
-	// configured using a control uRPC message. Host network is configured inside
-	// Run().
-	networkStack, err := newEmptyNetworkStack(args.Conf, k, k)
+	// Create root network namespace/stack.
+	netns, err := newRootNetworkNamespace(args.Conf, k, k)
 	if err != nil {
 		return nil, fmt.Errorf("creating network: %v", err)
 	}
@@ -277,7 +276,7 @@ func New(args Args) (*Loader, error) {
 		FeatureSet:                  cpuid.HostFeatureSet(),
 		Timekeeper:                  tk,
 		RootUserNamespace:           creds.UserNamespace,
-		NetworkStack:                networkStack,
+		RootNetworkNamespace:        netns,
 		ApplicationCores:            uint(args.NumCPU),
 		Vdso:                        vdso,
 		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
@@ -466,7 +465,7 @@ func (l *Loader) run() error {
 		// Delay host network configuration to this point because network namespace
 		// is configured after the loader is created and before Run() is called.
 		log.Debugf("Configuring host network")
-		stack := l.k.NetworkStack().(*hostinet.Stack)
+		stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
 		if err := stack.Configure(); err != nil {
 			return err
 		}
@@ -485,7 +484,7 @@ func (l *Loader) run() error {
 	// l.restore is set by the container manager when a restore call is made.
 	if !l.restore {
 		if l.conf.ProfileEnable {
-			initializePProf()
+			pprof.Initialize()
 		}
 
 		// Finally done with all configuration. Setup filters before user code
@@ -908,48 +907,92 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newEmptyNetworkStack(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+	// Create an empty network stack because the network namespace may be empty at
+	// this point. Netns is configured before Run() is called. Netstack is
+	// configured using a control uRPC message. Host network is configured inside
+	// Run().
 	switch conf.Network {
 	case NetworkHost:
-		return hostinet.NewStack(), nil
+		// No network namespacing support for hostinet yet, hence creator is nil.
+		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
 
 	case NetworkNone, NetworkSandbox:
-		// NetworkNone sets up loopback using netstack.
-		netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
-		transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
-		s := netstack.Stack{stack.New(stack.Options{
-			NetworkProtocols:   netProtos,
-			TransportProtocols: transProtos,
-			Clock:              clock,
-			Stats:              netstack.Metrics,
-			HandleLocal:        true,
-			// Enable raw sockets for users with sufficient
-			// privileges.
-			RawFactory: raw.EndpointFactory{},
-			UniqueID:   uniqueID,
-		})}
-
-		// Enable SACK Recovery.
-		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
-			return nil, fmt.Errorf("failed to enable SACK: %v", err)
+		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
+		if err != nil {
+			return nil, err
 		}
+		creator := &sandboxNetstackCreator{
+			clock:    clock,
+			uniqueID: uniqueID,
+		}
+		return inet.NewRootNamespace(s, creator), nil
 
-		// Set default TTLs as required by socket/netstack.
-		s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
-		s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	default:
+		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	}
 
-		// Enable Receive Buffer Auto-Tuning.
-		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-			return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
-		}
+}
 
-		s.FillDefaultIPTables()
+func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
+	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+	s := netstack.Stack{stack.New(stack.Options{
+		NetworkProtocols:   netProtos,
+		TransportProtocols: transProtos,
+		Clock:              clock,
+		Stats:              netstack.Metrics,
+		HandleLocal:        true,
+		// Enable raw sockets for users with sufficient
+		// privileges.
+		RawFactory: raw.EndpointFactory{},
+		UniqueID:   uniqueID,
+	})}
 
-		return &s, nil
+	// Enable SACK Recovery.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
+		return nil, fmt.Errorf("failed to enable SACK: %v", err)
+	}
 
-	default:
-		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	// Set default TTLs as required by socket/netstack.
+	s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+
+	// Enable Receive Buffer Auto-Tuning.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
+		return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
+	}
+
+	s.FillDefaultIPTables()
+
+	return &s, nil
+}
+
+// sandboxNetstackCreator implements kernel.NetworkStackCreator.
+//
+// +stateify savable
+type sandboxNetstackCreator struct {
+	clock    tcpip.Clock
+	uniqueID stack.UniqueID
+}
+
+// CreateStack implements kernel.NetworkStackCreator.CreateStack.
+func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
+	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
+	if err != nil {
+		return nil, err
 	}
+
+	// Setup loopback.
+	n := &Network{Stack: s.(*netstack.Stack).Stack}
+	nicID := tcpip.NICID(f.uniqueID.UniqueID())
+	link := DefaultLoopbackLink
+	linkEP := loopback.New()
+	if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+		return nil, err
+	}
+
+	return s, nil
 }
 
 // signal sends a signal to one or more processes in a container. If PID is 0,
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 6a8765ec8..bee6ee336 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -17,6 +17,7 @@ package boot
 import (
 	"fmt"
 	"net"
+	"strings"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/log"
@@ -31,6 +32,32 @@ import (
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
+var (
+	// DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and
+	// "::1/8" on "lo" interface.
+	DefaultLoopbackLink = LoopbackLink{
+		Name: "lo",
+		Addresses: []net.IP{
+			net.IP("\x7f\x00\x00\x01"),
+			net.IPv6loopback,
+		},
+		Routes: []Route{
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv4(0x7f, 0, 0, 0),
+					Mask: net.IPv4Mask(0xff, 0, 0, 0),
+				},
+			},
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv6loopback,
+					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
+				},
+			},
+		},
+	}
+)
+
 // Network exposes methods that can be used to configure a network stack.
 type Network struct {
 	Stack *stack.Stack
diff --git a/runsc/boot/pprof.go b/runsc/boot/pprof.go
deleted file mode 100644
index 463362f02..000000000
--- a/runsc/boot/pprof.go
+++ /dev/null
@@ -1,18 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-func initializePProf() {
-}
diff --git a/runsc/boot/pprof/BUILD b/runsc/boot/pprof/BUILD
new file mode 100644
index 000000000..29cb42b2f
--- /dev/null
+++ b/runsc/boot/pprof/BUILD
@@ -0,0 +1,11 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "pprof",
+    srcs = ["pprof.go"],
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+)
diff --git a/runsc/boot/pprof/pprof.go b/runsc/boot/pprof/pprof.go
new file mode 100644
index 000000000..1ded20dee
--- /dev/null
+++ b/runsc/boot/pprof/pprof.go
@@ -0,0 +1,20 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pprof provides a stub to initialize custom profilers.
+package pprof
+
+// Initialize will be called at boot for initializing custom profilers.
+func Initialize() {
+}
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 99e143696..bc093fba5 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -21,7 +21,6 @@ import (
 	"path/filepath"
 	"runtime"
 	"strconv"
-	"strings"
 	"syscall"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
@@ -75,30 +74,8 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
 }
 
 func createDefaultLoopbackInterface(conn *urpc.Client) error {
-	link := boot.LoopbackLink{
-		Name: "lo",
-		Addresses: []net.IP{
-			net.IP("\x7f\x00\x00\x01"),
-			net.IPv6loopback,
-		},
-		Routes: []boot.Route{
-			{
-				Destination: net.IPNet{
-
-					IP:   net.IPv4(0x7f, 0, 0, 0),
-					Mask: net.IPv4Mask(0xff, 0, 0, 0),
-				},
-			},
-			{
-				Destination: net.IPNet{
-					IP:   net.IPv6loopback,
-					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
-				},
-			},
-		},
-	}
 	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{
-		LoopbackLinks: []boot.LoopbackLink{link},
+		LoopbackLinks: []boot.LoopbackLink{boot.DefaultLoopbackLink},
 	}, nil); err != nil {
 		return fmt.Errorf("creating loopback link and routes: %v", err)
 	}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index d69ac8356..d1977d4de 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -258,6 +258,8 @@ syscall_test(
 
 syscall_test(test = "//test/syscalls/linux:munmap_test")
 
+syscall_test(test = "//test/syscalls/linux:network_namespace_test")
+
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:open_create_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 05a818795..aa303af84 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3639,6 +3639,23 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "network_namespace_test",
+    testonly = 1,
+    srcs = ["network_namespace.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        gtest,
+        "//test/util:capability_util",
+        "//test/util:memory_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
 cc_binary(
     name = "semaphore_test",
     testonly = 1,
diff --git a/test/syscalls/linux/network_namespace.cc b/test/syscalls/linux/network_namespace.cc
new file mode 100644
index 000000000..6ea48c263
--- /dev/null
+++ b/test/syscalls/linux/network_namespace.cc
@@ -0,0 +1,121 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <net/if.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/synchronization/notification.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/memory_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using TestFunc = std::function<PosixError()>;
+using RunFunc = std::function<PosixError(TestFunc)>;
+
+struct NamespaceStrategy {
+  RunFunc run;
+
+  static NamespaceStrategy Of(RunFunc run) {
+    NamespaceStrategy s;
+    s.run = run;
+    return s;
+  }
+};
+
+PosixError RunWithUnshare(TestFunc fn) {
+  PosixError err = PosixError(-1, "function did not return a value");
+  ScopedThread t([&] {
+    if (unshare(CLONE_NEWNET) != 0) {
+      err = PosixError(errno);
+      return;
+    }
+    err = fn();
+  });
+  t.Join();
+  return err;
+}
+
+PosixError RunWithClone(TestFunc fn) {
+  struct Args {
+    absl::Notification n;
+    TestFunc fn;
+    PosixError err;
+  };
+  Args args;
+  args.fn = fn;
+  args.err = PosixError(-1, "function did not return a value");
+
+  ASSIGN_OR_RETURN_ERRNO(
+      Mapping child_stack,
+      MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+  pid_t child = clone(
+      +[](void *arg) {
+        Args *args = reinterpret_cast<Args *>(arg);
+        args->err = args->fn();
+        args->n.Notify();
+        syscall(SYS_exit, 0);  // Exit manually. No return address on stack.
+        return 0;
+      },
+      reinterpret_cast<void *>(child_stack.addr() + kPageSize),
+      CLONE_NEWNET | CLONE_THREAD | CLONE_SIGHAND | CLONE_VM, &args);
+  if (child < 0) {
+    return PosixError(errno, "clone() failed");
+  }
+  args.n.WaitForNotification();
+  return args.err;
+}
+
+class NetworkNamespaceTest
+    : public ::testing::TestWithParam<NamespaceStrategy> {};
+
+TEST_P(NetworkNamespaceTest, LoopbackExists) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  EXPECT_NO_ERRNO(GetParam().run([]() {
+    // TODO(gvisor.dev/issue/1833): Update this to test that only "lo" exists.
+    // Check loopback device exists.
+    int sock = socket(AF_INET, SOCK_DGRAM, 0);
+    if (sock < 0) {
+      return PosixError(errno, "socket() failed");
+    }
+    struct ifreq ifr;
+    snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
+    if (ioctl(sock, SIOCGIFINDEX, &ifr) < 0) {
+      return PosixError(errno, "ioctl() failed, lo cannot be found");
+    }
+    return NoError();
+  }));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllNetworkNamespaceTest, NetworkNamespaceTest,
+    ::testing::Values(NamespaceStrategy::Of(RunWithUnshare),
+                      NamespaceStrategy::Of(RunWithClone)));
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From f1b72752e5de2abc3c409a6b7447224620b7c11b Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 16:22:45 -0800
Subject: Implement automated marshalling for newtypes on primitives.

PiperOrigin-RevId: 296322954
---
 tools/defs.bzl                                     |   8 +-
 tools/go_marshal/BUILD                             |   5 +
 tools/go_marshal/gomarshal/generator.go            |  43 ++-
 tools/go_marshal/gomarshal/generator_interfaces.go | 296 ++++++++++++++++-----
 tools/go_marshal/gomarshal/generator_tests.go      |  15 +-
 tools/go_marshal/test/test.go                      |  10 +
 6 files changed, 286 insertions(+), 91 deletions(-)

diff --git a/tools/defs.bzl b/tools/defs.bzl
index ddefb72d0..45c065459 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -85,7 +85,7 @@ def go_imports(name, src, out):
         cmd = ("$(location @org_golang_x_tools//cmd/goimports:goimports) $(SRCS) > $@"),
     )
 
-def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs):
+def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, marshal_debug = False, **kwargs):
     """Wraps the standard go_library and does stateification and marshalling.
 
     The recommended way is to use this rule with mostly identical configuration as the native
@@ -108,6 +108,7 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
       imports: imports required for stateify.
       stateify: whether statify is enabled (default: true).
       marshal: whether marshal is enabled (default: false).
+      marshal_debug: whether the gomarshal tools emits debugging output (default: false).
       **kwargs: standard go_library arguments.
     """
     all_srcs = srcs
@@ -146,7 +147,10 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
             go_marshal(
                 name = name + suffix + "_abi_autogen",
                 srcs = src_subset,
-                debug = False,
+                debug = select({
+                    "//tools/go_marshal:marshal_config_verbose": True,
+                    "//conditions:default": marshal_debug,
+                }),
                 imports = imports,
                 package = name,
             )
diff --git a/tools/go_marshal/BUILD b/tools/go_marshal/BUILD
index 80d9c0504..be49cf9c8 100644
--- a/tools/go_marshal/BUILD
+++ b/tools/go_marshal/BUILD
@@ -12,3 +12,8 @@ go_binary(
         "//tools/go_marshal/gomarshal",
     ],
 )
+
+config_setting(
+    name = "marshal_config_verbose",
+    values = {"define": "gomarshal=verbose"},
+)
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 0fa868415..d365a1f3c 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -44,7 +44,8 @@ const (
 // All recievers are single letters, so we don't allow import aliases to be a
 // single letter.
 var badIdents = []string{
-	"addr", "blk", "buf", "dst", "dsts", "err", "hdr", "len", "ptr", "src", "srcs", "task", "val",
+	"addr", "blk", "buf", "dst", "dsts", "err", "hdr", "idx", "inner", "len",
+	"ptr", "src", "srcs", "task", "val",
 	// All single-letter identifiers.
 }
 
@@ -193,9 +194,9 @@ func (g *Generator) parse() ([]*ast.File, []*token.FileSet, error) {
 	return files, fsets, nil
 }
 
-// collectMarshallabeTypes walks the parsed AST and collects a list of type
+// collectMarshallableTypes walks the parsed AST and collects a list of type
 // declarations for which we need to generate the Marshallable interface.
-func (g *Generator) collectMarshallabeTypes(a *ast.File, f *token.FileSet) []*ast.TypeSpec {
+func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) []*ast.TypeSpec {
 	var types []*ast.TypeSpec
 	for _, decl := range a.Decls {
 		gdecl, ok := decl.(*ast.GenDecl)
@@ -222,14 +223,22 @@ func (g *Generator) collectMarshallabeTypes(a *ast.File, f *token.FileSet) []*as
 			continue
 		}
 		for _, spec := range gdecl.Specs {
-			// We already confirmed we're in a type declaration earlier.
+			// We already confirmed we're in a type declaration earlier, so this
+			// cast will succeed.
 			t := spec.(*ast.TypeSpec)
-			if _, ok := t.Type.(*ast.StructType); ok {
-				debugfAt(f.Position(t.Pos()), "Collected marshallable type %s.\n", t.Name.Name)
+			switch t.Type.(type) {
+			case *ast.StructType:
+				debugfAt(f.Position(t.Pos()), "Collected marshallable struct %s.\n", t.Name.Name)
+				types = append(types, t)
+				continue
+			case *ast.Ident: // Newtype on primitive.
+				debugfAt(f.Position(t.Pos()), "Collected marshallable newtype on primitive %s.\n", t.Name.Name)
 				types = append(types, t)
 				continue
 			}
-			debugf("Skipping declaration %v since it's not a struct declaration.\n", gdecl)
+			// A user specifically requested marshalling on this type, but we
+			// don't support it.
+			abortAt(f.Position(t.Pos()), fmt.Sprintf("Marshalling codegen was requested on type '%s', but go-marshal doesn't support this kind of declaration.\n", t.Name))
 		}
 	}
 	return types
@@ -269,12 +278,20 @@ func (g *Generator) collectImports(a *ast.File, f *token.FileSet) map[string]imp
 }
 
 func (g *Generator) generateOne(t *ast.TypeSpec, fset *token.FileSet) *interfaceGenerator {
-	// We're guaranteed to have only struct type specs by now. See
-	// Generator.collectMarshallabeTypes.
 	i := newInterfaceGenerator(t, fset)
-	i.validate()
-	i.emitMarshallable()
-	return i
+	switch ty := t.Type.(type) {
+	case *ast.StructType:
+		i.validateStruct()
+		i.emitMarshallableForStruct()
+		return i
+	case *ast.Ident:
+		i.validatePrimitiveNewtype(ty)
+		i.emitMarshallableForPrimitiveNewtype()
+		return i
+	default:
+		// This should've been filtered out by collectMarshallabeTypes.
+		panic(fmt.Sprintf("Unexpected type %+v", ty))
+	}
 }
 
 // generateOneTestSuite generates a test suite for the automatically generated
@@ -320,7 +337,7 @@ func (g *Generator) Run() error {
 	for i, a := range asts {
 		// Collect type declarations marked for code generation and generate
 		// Marshallable interfaces.
-		for _, t := range g.collectMarshallabeTypes(a, fsets[i]) {
+		for _, t := range g.collectMarshallableTypes(a, fsets[i]) {
 			impl := g.generateOne(t, fsets[i])
 			// Collect Marshallable types referenced by the generated code.
 			for ref, _ := range impl.ms {
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
index 834c58cee..ea1af998e 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -55,9 +55,6 @@ func (g *interfaceGenerator) typeName() string {
 
 // newinterfaceGenerator creates a new interface generator.
 func newInterfaceGenerator(t *ast.TypeSpec, fset *token.FileSet) *interfaceGenerator {
-	if _, ok := t.Type.(*ast.StructType); !ok {
-		panic(fmt.Sprintf("Attempting to generate code for a not struct type %v", t))
-	}
 	g := &interfaceGenerator{
 		t:  t,
 		r:  receiverName(t),
@@ -103,9 +100,31 @@ func (g *interfaceGenerator) abortAt(p token.Pos, msg string) {
 	abortAt(g.f.Position(p), msg)
 }
 
-// validate ensures the type we're working with can be marshalled. These checks
-// are done ahead of time and in one place so we can make assumptions later.
-func (g *interfaceGenerator) validate() {
+func (g *interfaceGenerator) validatePrimitiveNewtype(t *ast.Ident) {
+	switch t.Name {
+	case "int8", "uint8", "byte", "int16", "uint16", "int32", "uint32", "int64", "uint64":
+		// These are the only primitive types we're allow. Below, we provide
+		// suggestions for some disallowed types and reject them, then attempt
+		// to marshal any remaining types by invoking the marshal.Marshallable
+		// interface on them. If these types don't actually implement
+		// marshal.Marshallable, compilation of the generated code will fail
+		// with an appropriate error message.
+		return
+	case "int":
+		g.abortAt(t.Pos(), "Type 'int' has ambiguous width, use int32 or int64")
+	case "uint":
+		g.abortAt(t.Pos(), "Type 'uint' has ambiguous width, use uint32 or uint64")
+	case "string":
+		g.abortAt(t.Pos(), "Type 'string' is dynamically-sized and cannot be marshalled, use a fixed size byte array '[...]byte' instead")
+	default:
+		debugfAt(g.f.Position(t.Pos()), fmt.Sprintf("Found derived type '%s', will attempt dispatch via marshal.Marshallable.\n", t.Name))
+	}
+}
+
+// validateStruct ensures the type we're working with can be marshalled. These
+// checks are done ahead of time and in one place so we can make assumptions
+// later.
+func (g *interfaceGenerator) validateStruct() {
 	g.forEachField(func(f *ast.Field) {
 		if len(f.Names) == 0 {
 			g.abortAt(f.Pos(), "Cannot marshal structs with embedded fields, give the field a name; use '_' for anonymous fields such as padding fields")
@@ -115,25 +134,7 @@ func (g *interfaceGenerator) validate() {
 	g.forEachField(func(f *ast.Field) {
 		fieldDispatcher{
 			primitive: func(_, t *ast.Ident) {
-				switch t.Name {
-				case "int8", "uint8", "byte", "int16", "uint16", "int32", "uint32", "int64", "uint64":
-					// These are the only primitive types we're allow. Below, we
-					// provide suggestions for some disallowed types and reject
-					// them, then attempt to marshal any remaining types by
-					// invoking the marshal.Marshallable interface on them. If
-					// these types don't actually implement
-					// marshal.Marshallable, compilation of the generated code
-					// will fail with an appropriate error message.
-					return
-				case "int":
-					g.abortAt(f.Pos(), "Type 'int' has ambiguous width, use int32 or int64")
-				case "uint":
-					g.abortAt(f.Pos(), "Type 'uint' has ambiguous width, use uint32 or uint64")
-				case "string":
-					g.abortAt(f.Pos(), "Type 'string' is dynamically-sized and cannot be marshalled, use a fixed size byte array '[...]byte' instead")
-				default:
-					debugfAt(g.f.Position(f.Pos()), fmt.Sprintf("Found derived type '%s', will attempt dispatch via marshal.Marshallable.\n", t.Name))
-				}
+				g.validatePrimitiveNewtype(t)
 			},
 			selector: func(_, _, _ *ast.Ident) {
 				// No validation to perform on selector fields. However this
@@ -190,7 +191,8 @@ func (g *interfaceGenerator) shiftDynamic(bufVar, name string) {
 	g.emit("%s = %s[%s.SizeBytes():]\n", bufVar, bufVar, name)
 }
 
-func (g *interfaceGenerator) marshalScalar(accessor, typ string, bufVar string) {
+// marshalStructFieldScalar writes a single scalar field from a struct to a byte slice.
+func (g *interfaceGenerator) marshalStructFieldScalar(accessor, typ, bufVar string) {
 	switch typ {
 	case "int8", "uint8", "byte":
 		g.emit("%s[0] = byte(%s)\n", bufVar, accessor)
@@ -213,43 +215,27 @@ func (g *interfaceGenerator) marshalScalar(accessor, typ string, bufVar string)
 	}
 }
 
-func (g *interfaceGenerator) unmarshalScalar(accessor, typ string, bufVar string) {
+// unmarshalStructFieldScalar reads a single scalar field from a struct, from a
+// byte slice.
+func (g *interfaceGenerator) unmarshalStructFieldScalar(accessor, typ, bufVar string) {
 	switch typ {
-	case "int8":
-		g.emit("%s = int8(%s[0])\n", accessor, bufVar)
-		g.shift(bufVar, 1)
-	case "uint8":
-		g.emit("%s = uint8(%s[0])\n", accessor, bufVar)
-		g.shift(bufVar, 1)
 	case "byte":
 		g.emit("%s = %s[0]\n", accessor, bufVar)
 		g.shift(bufVar, 1)
-
-	case "int16":
-		g.recordUsedImport("usermem")
-		g.emit("%s = int16(usermem.ByteOrder.Uint16(%s[:2]))\n", accessor, bufVar)
-		g.shift(bufVar, 2)
-	case "uint16":
+	case "int8", "uint8":
+		g.emit("%s = %s(%s[0])\n", accessor, typ, bufVar)
+		g.shift(bufVar, 1)
+	case "int16", "uint16":
 		g.recordUsedImport("usermem")
-		g.emit("%s = usermem.ByteOrder.Uint16(%s[:2])\n", accessor, bufVar)
+		g.emit("%s = %s(usermem.ByteOrder.Uint16(%s[:2]))\n", accessor, typ, bufVar)
 		g.shift(bufVar, 2)
-
-	case "int32":
-		g.recordUsedImport("usermem")
-		g.emit("%s = int32(usermem.ByteOrder.Uint32(%s[:4]))\n", accessor, bufVar)
-		g.shift(bufVar, 4)
-	case "uint32":
+	case "int32", "uint32":
 		g.recordUsedImport("usermem")
-		g.emit("%s = usermem.ByteOrder.Uint32(%s[:4])\n", accessor, bufVar)
+		g.emit("%s = %s(usermem.ByteOrder.Uint32(%s[:4]))\n", accessor, typ, bufVar)
 		g.shift(bufVar, 4)
-
-	case "int64":
-		g.recordUsedImport("usermem")
-		g.emit("%s = int64(usermem.ByteOrder.Uint64(%s[:8]))\n", accessor, bufVar)
-		g.shift(bufVar, 8)
-	case "uint64":
+	case "int64", "uint64":
 		g.recordUsedImport("usermem")
-		g.emit("%s = usermem.ByteOrder.Uint64(%s[:8])\n", accessor, bufVar)
+		g.emit("%s = %s(usermem.ByteOrder.Uint64(%s[:8]))\n", accessor, typ, bufVar)
 		g.shift(bufVar, 8)
 	default:
 		g.emit("%s.UnmarshalBytes(%s[:%s.SizeBytes()])\n", accessor, bufVar, accessor)
@@ -258,6 +244,49 @@ func (g *interfaceGenerator) unmarshalScalar(accessor, typ string, bufVar string
 	}
 }
 
+// marshalPrimitiveScalar writes a single primitive variable to a byte slice.
+func (g *interfaceGenerator) marshalPrimitiveScalar(accessor, typ, bufVar string) {
+	switch typ {
+	case "int8", "uint8", "byte":
+		g.emit("%s[0] = byte(*%s)\n", bufVar, accessor)
+	case "int16", "uint16":
+		g.recordUsedImport("usermem")
+		g.emit("usermem.ByteOrder.PutUint16(%s[:2], uint16(*%s))\n", bufVar, accessor)
+	case "int32", "uint32":
+		g.recordUsedImport("usermem")
+		g.emit("usermem.ByteOrder.PutUint32(%s[:4], uint32(*%s))\n", bufVar, accessor)
+	case "int64", "uint64":
+		g.recordUsedImport("usermem")
+		g.emit("usermem.ByteOrder.PutUint64(%s[:8], uint64(*%s))\n", bufVar, accessor)
+	default:
+		g.emit("inner := (*%s)(%s)\n", typ, accessor)
+		g.emit("inner.MarshalBytes(%s[:%s.SizeBytes()])\n", bufVar, accessor)
+	}
+}
+
+// unmarshalPrimitiveScalar read a single primitive variable from a byte slice.
+func (g *interfaceGenerator) unmarshalPrimitiveScalar(accessor, typ, bufVar, typeCast string) {
+	switch typ {
+	case "byte":
+		g.emit("*%s = %s(%s[0])\n", accessor, typeCast, bufVar)
+	case "int8", "uint8":
+		g.emit("*%s = %s(%s(%s[0]))\n", accessor, typeCast, typ, bufVar)
+	case "int16", "uint16":
+		g.recordUsedImport("usermem")
+		g.emit("*%s = %s(%s(usermem.ByteOrder.Uint16(%s[:2])))\n", accessor, typeCast, typ, bufVar)
+	case "int32", "uint32":
+		g.recordUsedImport("usermem")
+		g.emit("*%s = %s(%s(usermem.ByteOrder.Uint32(%s[:4])))\n", accessor, typeCast, typ, bufVar)
+
+	case "int64", "uint64":
+		g.recordUsedImport("usermem")
+		g.emit("*%s = %s(%s(usermem.ByteOrder.Uint64(%s[:8])))\n", accessor, typeCast, typ, bufVar)
+	default:
+		g.emit("inner := (*%s)(%s)\n", typ, accessor)
+		g.emit("inner.UnmarshalBytes(%s[:%s.SizeBytes()])\n", bufVar, accessor)
+	}
+}
+
 // areFieldsPackedExpression returns a go expression checking whether g.t's fields are
 // packed. Returns "", false if g.t has no fields that may be potentially
 // packed, otherwise returns <clause>, true, where <clause> is an expression
@@ -274,7 +303,7 @@ func (g *interfaceGenerator) areFieldsPackedExpression() (string, bool) {
 	return strings.Join(cs, " && "), true
 }
 
-func (g *interfaceGenerator) emitMarshallable() {
+func (g *interfaceGenerator) emitMarshallableForStruct() {
 	// Is g.t a packed struct without consideing field types?
 	thisPacked := true
 	g.forEachField(func(f *ast.Field) {
@@ -357,10 +386,10 @@ func (g *interfaceGenerator) emitMarshallable() {
 					}
 					return
 				}
-				g.marshalScalar(g.fieldAccessor(n), t.Name, "dst")
+				g.marshalStructFieldScalar(g.fieldAccessor(n), t.Name, "dst")
 			},
 			selector: func(n, tX, tSel *ast.Ident) {
-				g.marshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "dst")
+				g.marshalStructFieldScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "dst")
 			},
 			array: func(n, t *ast.Ident, size int) {
 				if n.Name == "_" {
@@ -377,9 +406,9 @@ func (g *interfaceGenerator) emitMarshallable() {
 					return
 				}
 
-				g.emit("for i := 0; i < %d; i++ {\n", size)
+				g.emit("for idx := 0; idx < %d; idx++ {\n", size)
 				g.inIndent(func() {
-					g.marshalScalar(fmt.Sprintf("%s[i]", g.fieldAccessor(n)), t.Name, "dst")
+					g.marshalStructFieldScalar(fmt.Sprintf("%s[idx]", g.fieldAccessor(n)), t.Name, "dst")
 				})
 				g.emit("}\n")
 			},
@@ -406,10 +435,10 @@ func (g *interfaceGenerator) emitMarshallable() {
 					}
 					return
 				}
-				g.unmarshalScalar(g.fieldAccessor(n), t.Name, "src")
+				g.unmarshalStructFieldScalar(g.fieldAccessor(n), t.Name, "src")
 			},
 			selector: func(n, tX, tSel *ast.Ident) {
-				g.unmarshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "src")
+				g.unmarshalStructFieldScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "src")
 			},
 			array: func(n, t *ast.Ident, size int) {
 				if n.Name == "_" {
@@ -426,9 +455,9 @@ func (g *interfaceGenerator) emitMarshallable() {
 					return
 				}
 
-				g.emit("for i := 0; i < %d; i++ {\n", size)
+				g.emit("for idx := 0; idx < %d; idx++ {\n", size)
 				g.inIndent(func() {
-					g.unmarshalScalar(fmt.Sprintf("%s[i]", g.fieldAccessor(n)), t.Name, "src")
+					g.unmarshalStructFieldScalar(fmt.Sprintf("%s[idx]", g.fieldAccessor(n)), t.Name, "src")
 				})
 				g.emit("}\n")
 			},
@@ -650,3 +679,144 @@ func (g *interfaceGenerator) emitMarshallable() {
 	})
 	g.emit("}\n\n")
 }
+
+// emitMarshallableForPrimitiveNewtype outputs code to implement the
+// marshal.Marshallable interface for a newtype on a primitive. Primitive
+// newtypes are always packed, so we can omit the various fallbacks required for
+// non-packed structs.
+func (g *interfaceGenerator) emitMarshallableForPrimitiveNewtype() {
+	g.recordUsedImport("io")
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("reflect")
+	g.recordUsedImport("runtime")
+	g.recordUsedImport("safecopy")
+	g.recordUsedImport("unsafe")
+	g.recordUsedImport("usermem")
+
+	nt := g.t.Type.(*ast.Ident)
+
+	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
+	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		if size, dynamic := g.scalarSize(nt); !dynamic {
+			g.emit("return %d\n", size)
+		} else {
+			g.emit("return (*%s)(nil).SizeBytes()\n", nt.Name)
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n")
+	g.emit("func (%s *%s) MarshalBytes(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.marshalPrimitiveScalar(g.r, nt.Name, "dst")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n")
+	g.emit("func (%s *%s) UnmarshalBytes(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.unmarshalPrimitiveScalar(g.r, nt.Name, "src", g.typeName())
+	})
+	g.emit("}\n\n")
+
+	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
+	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("// Scalar newtypes are always packed.\n")
+		g.emit("return true\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n")
+	g.emit("func (%s *%s) MarshalUnsafe(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n")
+	g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
+	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		// Fast serialization.
+		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+		g.emit("val := uintptr(ptr)\n")
+		g.emit("val = val^0\n\n")
+
+		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+		g.emit("var buf []byte\n")
+		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+		g.emit("hdr.Data = val\n")
+		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+		g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
+		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+		g.emit("// must live until after the CopyOutBytes.\n")
+		g.emit("runtime.KeepAlive(%s)\n", g.r)
+		g.emit("return err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
+	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+		g.emit("val := uintptr(ptr)\n")
+		g.emit("val = val^0\n\n")
+
+		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+		g.emit("var buf []byte\n")
+		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+		g.emit("hdr.Data = val\n")
+		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+		g.emit("_, err := task.CopyInBytes(addr, buf)\n")
+		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+		g.emit("// must live until after the CopyInBytes.\n")
+		g.emit("runtime.KeepAlive(%s)\n", g.r)
+		g.emit("return err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// WriteTo implements io.WriterTo.WriteTo.\n")
+	g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+		g.emit("val := uintptr(ptr)\n")
+		g.emit("val = val^0\n\n")
+
+		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+		g.emit("var buf []byte\n")
+		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+		g.emit("hdr.Data = val\n")
+		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+		g.emit("len, err := w.Write(buf)\n")
+		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+		g.emit("// must live until after the Write.\n")
+		g.emit("runtime.KeepAlive(%s)\n", g.r)
+		g.emit("return int64(len), err\n")
+
+	})
+	g.emit("}\n\n")
+
+}
diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go
index 2326e7a07..8ba47eb67 100644
--- a/tools/go_marshal/gomarshal/generator_tests.go
+++ b/tools/go_marshal/gomarshal/generator_tests.go
@@ -49,9 +49,6 @@ type testGenerator struct {
 }
 
 func newTestGenerator(t *ast.TypeSpec) *testGenerator {
-	if _, ok := t.Type.(*ast.StructType); !ok {
-		panic(fmt.Sprintf("Attempting to generate code for a not struct type %v", t))
-	}
 	g := &testGenerator{
 		t:       t,
 		r:       receiverName(t),
@@ -69,14 +66,6 @@ func (g *testGenerator) typeName() string {
 	return g.t.Name.Name
 }
 
-func (g *testGenerator) forEachField(fn func(f *ast.Field)) {
-	// This is guaranteed to succeed because g.t is always a struct.
-	st := g.t.Type.(*ast.StructType)
-	for _, field := range st.Fields.List {
-		fn(field)
-	}
-}
-
 func (g *testGenerator) testFuncName(base string) string {
 	return fmt.Sprintf("%s%s", base, strings.Title(g.t.Name.Name))
 }
@@ -89,7 +78,7 @@ func (g *testGenerator) inTestFunction(name string, body func()) {
 
 func (g *testGenerator) emitTestNonZeroSize() {
 	g.inTestFunction("TestSizeNonZero", func() {
-		g.emit("x := &%s{}\n", g.typeName())
+		g.emit("var x %v\n", g.typeName())
 		g.emit("if x.SizeBytes() == 0 {\n")
 		g.inIndent(func() {
 			g.emit("t.Fatal(\"Marshallable.SizeBytes() should not return zero\")\n")
@@ -100,7 +89,7 @@ func (g *testGenerator) emitTestNonZeroSize() {
 
 func (g *testGenerator) emitTestSuspectAlignment() {
 	g.inTestFunction("TestSuspectAlignment", func() {
-		g.emit("x := %s{}\n", g.typeName())
+		g.emit("var x %v\n", g.typeName())
 		g.emit("analysis.AlignmentCheck(t, reflect.TypeOf(x))\n")
 	})
 }
diff --git a/tools/go_marshal/test/test.go b/tools/go_marshal/test/test.go
index 8de02d707..93229dedb 100644
--- a/tools/go_marshal/test/test.go
+++ b/tools/go_marshal/test/test.go
@@ -103,3 +103,13 @@ type Stat struct {
 	CTime   Timespec
 	_       [3]int64
 }
+
+// SignalSet is an example marshallable newtype on a primitive.
+//
+// +marshal
+type SignalSet uint64
+
+// SignalSetAlias is an example newtype on another marshallable type.
+//
+// +marshal
+type SignalSetAlias SignalSet
-- 
cgit v1.2.3


From 5d711c329a7973dae37b654528949d62a131319a Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Wed, 19 Feb 2020 08:09:16 +0000
Subject: Force downloading new version of org_golang_x_sys.

ARM64 PTRACE_SYSEMU support was added to Linux kernal from
v5.3 and the corresponding support in golang is also enabled
in the latest org.golang/x/sys repository.

Updates #1876

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I10750c4c8b68f6f68d0a4d828e266966434c92fe
---
 WORKSPACE | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index ff0196dc6..a15238a2e 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -33,6 +33,20 @@ load("@bazel_gazelle//:deps.bzl", "gazelle_dependencies", "go_repository")
 
 gazelle_dependencies()
 
+# TODO(gvisor.dev/issue/1876): Move the statement to "External repositories"
+# block below once 1876 is fixed.
+#
+# The com_google_protobuf repository below would trigger downloading a older
+# version of org_golang_x_sys. If putting this repository statment in a place
+# after that of the com_google_protobuf, this statement will not work as
+# expectd to download a new version of org_golang_x_sys.
+go_repository(
+    name = "org_golang_x_sys",
+    importpath = "golang.org/x/sys",
+    sum = "h1:72l8qCJ1nGxMGH26QVBVIxKd/D34cfGt0OvrPtpemyY=",
+    version = "v0.0.0-20191220220014-0732a990476f",
+)
+
 # Load C++ rules.
 http_archive(
     name = "rules_cc",
@@ -256,13 +270,6 @@ go_repository(
     version = "v0.0.0-20190423024810-112230192c58",
 )
 
-go_repository(
-    name = "org_golang_x_sys",
-    importpath = "golang.org/x/sys",
-    sum = "h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU=",
-    version = "v0.0.0-20190215142949-d0b11bdaac8a",
-)
-
 go_repository(
     name = "org_golang_x_time",
     commit = "c4c64cad1fd0a1a8dab2523e04e61d35308e131e",
-- 
cgit v1.2.3


From 97c07242c37e56f6cfdc52036d554052ba95f2ae Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 21 Feb 2020 09:53:56 -0800
Subject: Use Route.MaxHeaderLength when constructing NDP RS

Test: stack_test.TestRouterSolicitation
PiperOrigin-RevId: 296454766
---
 pkg/tcpip/stack/ndp.go      |  2 +-
 pkg/tcpip/stack/ndp_test.go | 78 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 19bd05aa3..f651871ce 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -1235,7 +1235,7 @@ func (ndp *ndpState) startSolicitingRouters() {
 		}
 
 		payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize
-		hdr := buffer.NewPrependable(header.IPv6MinimumSize + payloadSize)
+		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + payloadSize)
 		pkt := header.ICMPv6(hdr.Prepend(payloadSize))
 		pkt.SetType(header.ICMPv6RouterSolicit)
 		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index f7b75b74e..6e9306d09 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -267,6 +267,17 @@ func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration s
 	}
 }
 
+// channelLinkWithHeaderLength is a channel.Endpoint with a configurable
+// header length.
+type channelLinkWithHeaderLength struct {
+	*channel.Endpoint
+	headerLength uint16
+}
+
+func (l *channelLinkWithHeaderLength) MaxHeaderLength() uint16 {
+	return l.headerLength
+}
+
 // Check e to make sure that the event is for addr on nic with ID 1, and the
 // resolved flag set to resolved with the specified err.
 func checkDADEvent(e ndpDADEvent, nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) string {
@@ -323,21 +334,46 @@ func TestDADDisabled(t *testing.T) {
 // DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
 // Included in the subtests is a test to make sure that an invalid
 // RetransmitTimer (<1ms) values get fixed to the default RetransmitTimer of 1s.
+// This tests also validates the NDP NS packet that is transmitted.
 func TestDADResolve(t *testing.T) {
 	const nicID = 1
 
 	tests := []struct {
 		name                    string
+		linkHeaderLen           uint16
 		dupAddrDetectTransmits  uint8
 		retransTimer            time.Duration
 		expectedRetransmitTimer time.Duration
 	}{
-		{"1:1s:1s", 1, time.Second, time.Second},
-		{"2:1s:1s", 2, time.Second, time.Second},
-		{"1:2s:2s", 1, 2 * time.Second, 2 * time.Second},
+		{
+			name:                    "1:1s:1s",
+			dupAddrDetectTransmits:  1,
+			retransTimer:            time.Second,
+			expectedRetransmitTimer: time.Second,
+		},
+		{
+			name:                    "2:1s:1s",
+			linkHeaderLen:           1,
+			dupAddrDetectTransmits:  2,
+			retransTimer:            time.Second,
+			expectedRetransmitTimer: time.Second,
+		},
+		{
+			name:                    "1:2s:2s",
+			linkHeaderLen:           2,
+			dupAddrDetectTransmits:  1,
+			retransTimer:            2 * time.Second,
+			expectedRetransmitTimer: 2 * time.Second,
+		},
 		// 0s is an invalid RetransmitTimer timer and will be fixed to
 		// the default RetransmitTimer value of 1s.
-		{"1:0s:1s", 1, 0, time.Second},
+		{
+			name:                    "1:0s:1s",
+			linkHeaderLen:           3,
+			dupAddrDetectTransmits:  1,
+			retransTimer:            0,
+			expectedRetransmitTimer: time.Second,
+		},
 	}
 
 	for _, test := range tests {
@@ -356,10 +392,13 @@ func TestDADResolve(t *testing.T) {
 			opts.NDPConfigs.RetransmitTimer = test.retransTimer
 			opts.NDPConfigs.DupAddrDetectTransmits = test.dupAddrDetectTransmits
 
-			e := channel.New(int(test.dupAddrDetectTransmits), 1280, linkAddr1)
-			e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+			e := channelLinkWithHeaderLength{
+				Endpoint:     channel.New(int(test.dupAddrDetectTransmits), 1280, linkAddr1),
+				headerLength: test.linkHeaderLen,
+			}
+			e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 			s := stack.New(opts)
-			if err := s.CreateNIC(nicID, e); err != nil {
+			if err := s.CreateNIC(nicID, &e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
 
@@ -445,6 +484,10 @@ func TestDADResolve(t *testing.T) {
 						checker.NDPNSTargetAddress(addr1),
 						checker.NDPNSOptions(nil),
 					))
+
+				if l, want := p.Pkt.Header.AvailableLength(), int(test.linkHeaderLen); l != want {
+					t.Errorf("got p.Pkt.Header.AvailableLength() = %d; want = %d", l, want)
+				}
 			}
 		})
 	}
@@ -3336,8 +3379,11 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
 func TestRouterSolicitation(t *testing.T) {
 	t.Parallel()
 
+	const nicID = 1
+
 	tests := []struct {
 		name                        string
+		linkHeaderLen               uint16
 		maxRtrSolicit               uint8
 		rtrSolicitInt               time.Duration
 		effectiveRtrSolicitInt      time.Duration
@@ -3354,6 +3400,7 @@ func TestRouterSolicitation(t *testing.T) {
 		},
 		{
 			name:                        "Two RS with delay",
+			linkHeaderLen:               1,
 			maxRtrSolicit:               2,
 			rtrSolicitInt:               time.Second,
 			effectiveRtrSolicitInt:      time.Second,
@@ -3362,6 +3409,7 @@ func TestRouterSolicitation(t *testing.T) {
 		},
 		{
 			name:                        "Single RS without delay",
+			linkHeaderLen:               2,
 			maxRtrSolicit:               1,
 			rtrSolicitInt:               time.Second,
 			effectiveRtrSolicitInt:      time.Second,
@@ -3370,6 +3418,7 @@ func TestRouterSolicitation(t *testing.T) {
 		},
 		{
 			name:                        "Two RS without delay and invalid zero interval",
+			linkHeaderLen:               3,
 			maxRtrSolicit:               2,
 			rtrSolicitInt:               0,
 			effectiveRtrSolicitInt:      4 * time.Second,
@@ -3407,8 +3456,11 @@ func TestRouterSolicitation(t *testing.T) {
 
 			t.Run(test.name, func(t *testing.T) {
 				t.Parallel()
-				e := channel.New(int(test.maxRtrSolicit), 1280, linkAddr1)
-				e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+				e := channelLinkWithHeaderLength{
+					Endpoint:     channel.New(int(test.maxRtrSolicit), 1280, linkAddr1),
+					headerLength: test.linkHeaderLen,
+				}
+				e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 				waitForPkt := func(timeout time.Duration) {
 					t.Helper()
 					ctx, _ := context.WithTimeout(context.Background(), timeout)
@@ -3434,6 +3486,10 @@ func TestRouterSolicitation(t *testing.T) {
 						checker.TTL(header.NDPHopLimit),
 						checker.NDPRS(),
 					)
+
+					if l, want := p.Pkt.Header.AvailableLength(), int(test.linkHeaderLen); l != want {
+						t.Errorf("got p.Pkt.Header.AvailableLength() = %d; want = %d", l, want)
+					}
 				}
 				waitForNothing := func(timeout time.Duration) {
 					t.Helper()
@@ -3450,8 +3506,8 @@ func TestRouterSolicitation(t *testing.T) {
 						MaxRtrSolicitationDelay: test.maxRtrSolicitDelay,
 					},
 				})
-				if err := s.CreateNIC(1, e); err != nil {
-					t.Fatalf("CreateNIC(1) = %s", err)
+				if err := s.CreateNIC(nicID, &e); err != nil {
+					t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 				}
 
 				// Make sure each RS got sent at the right
-- 
cgit v1.2.3


From a155a23480abfafe096ff50f2c4aaf2c215b6c44 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 21 Feb 2020 11:20:15 -0800
Subject: Attach LinkEndpoint to NetworkDispatcher immediately

Tests: stack_test.TestAttachToLinkEndpointImmediately
PiperOrigin-RevId: 296474068
---
 pkg/tcpip/stack/nic.go        | 25 +++++++------------
 pkg/tcpip/stack/stack.go      |  5 ++--
 pkg/tcpip/stack/stack_test.go | 56 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 18 deletions(-)

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index b2be18e47..862954ab2 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -44,8 +44,7 @@ type NIC struct {
 	linkEP  LinkEndpoint
 	context NICContext
 
-	stats  NICStats
-	attach sync.Once
+	stats NICStats
 
 	mu struct {
 		sync.RWMutex
@@ -141,6 +140,8 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 		nic.mu.packetEPs[netProto.Number()] = []PacketEndpoint{}
 	}
 
+	nic.linkEP.Attach(nic)
+
 	return nic
 }
 
@@ -200,14 +201,16 @@ func (n *NIC) disable() *tcpip.Error {
 		}
 	}
 
-	// TODO(b/147015577): Should n detach from its LinkEndpoint?
-
 	n.mu.enabled = false
 	return nil
 }
 
-// enable enables n. enable will attach the nic to its LinkEndpoint and
-// join the IPv6 All-Nodes Multicast address (ff02::1).
+// enable enables n.
+//
+// If the stack has IPv6 enabled, enable will join the IPv6 All-Nodes Multicast
+// address (ff02::1), start DAD for permanent addresses, and start soliciting
+// routers if the stack is not operating as a router. If the stack is also
+// configured to auto-generate a link-local address, one will be generated.
 func (n *NIC) enable() *tcpip.Error {
 	n.mu.RLock()
 	enabled := n.mu.enabled
@@ -225,8 +228,6 @@ func (n *NIC) enable() *tcpip.Error {
 
 	n.mu.enabled = true
 
-	n.attachLinkEndpoint()
-
 	// Create an endpoint to receive broadcast packets on this interface.
 	if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok {
 		if _, err := n.addAddressLocked(ipv4BroadcastAddr, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
@@ -321,14 +322,6 @@ func (n *NIC) becomeIPv6Host() {
 	n.mu.ndp.startSolicitingRouters()
 }
 
-// attachLinkEndpoint attaches the NIC to the endpoint, which will enable it
-// to start delivering packets.
-func (n *NIC) attachLinkEndpoint() {
-	n.attach.Do(func() {
-		n.linkEP.Attach(n)
-	})
-}
-
 // setPromiscuousMode enables or disables promiscuous mode.
 func (n *NIC) setPromiscuousMode(enable bool) {
 	n.mu.Lock()
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index fabc976a7..f0ed76fbe 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -881,6 +881,8 @@ type NICOptions struct {
 // CreateNICWithOptions creates a NIC with the provided id, LinkEndpoint, and
 // NICOptions. See the documentation on type NICOptions for details on how
 // NICs can be configured.
+//
+// LinkEndpoint.Attach will be called to bind ep with a NetworkDispatcher.
 func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOptions) *tcpip.Error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -900,7 +902,6 @@ func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOp
 	}
 
 	n := newNIC(s, id, opts.Name, ep, opts.Context)
-
 	s.nics[id] = n
 	if !opts.Disabled {
 		return n.enable()
@@ -910,7 +911,7 @@ func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOp
 }
 
 // CreateNIC creates a NIC with the provided id and LinkEndpoint and calls
-// `LinkEndpoint.Attach` to start delivering packets to it.
+// LinkEndpoint.Attach to bind ep with a NetworkDispatcher.
 func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
 	return s.CreateNICWithOptions(id, ep, NICOptions{})
 }
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index eb6f7d1fc..18016e7db 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -239,6 +239,23 @@ func fakeNetFactory() stack.NetworkProtocol {
 	return &fakeNetworkProtocol{}
 }
 
+// linkEPWithMockedAttach is a stack.LinkEndpoint that tests can use to verify
+// that LinkEndpoint.Attach was called.
+type linkEPWithMockedAttach struct {
+	stack.LinkEndpoint
+	attached bool
+}
+
+// Attach implements stack.LinkEndpoint.Attach.
+func (l *linkEPWithMockedAttach) Attach(d stack.NetworkDispatcher) {
+	l.LinkEndpoint.Attach(d)
+	l.attached = true
+}
+
+func (l *linkEPWithMockedAttach) isAttached() bool {
+	return l.attached
+}
+
 func TestNetworkReceive(t *testing.T) {
 	// Create a stack with the fake network protocol, one nic, and two
 	// addresses attached to it: 1 & 2.
@@ -510,6 +527,45 @@ func testNoRoute(t *testing.T, s *stack.Stack, nic tcpip.NICID, srcAddr, dstAddr
 	}
 }
 
+// TestAttachToLinkEndpointImmediately tests that a LinkEndpoint is attached to
+// a NetworkDispatcher when the NIC is created.
+func TestAttachToLinkEndpointImmediately(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name    string
+		nicOpts stack.NICOptions
+	}{
+		{
+			name:    "Create enabled NIC",
+			nicOpts: stack.NICOptions{Disabled: false},
+		},
+		{
+			name:    "Create disabled NIC",
+			nicOpts: stack.NICOptions{Disabled: true},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+			})
+
+			e := linkEPWithMockedAttach{
+				LinkEndpoint: loopback.New(),
+			}
+
+			if err := s.CreateNICWithOptions(nicID, &e, test.nicOpts); err != nil {
+				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, test.nicOpts, err)
+			}
+			if !e.isAttached() {
+				t.Fatalf("link endpoint not attached to a network disatcher")
+			}
+		})
+	}
+}
+
 func TestDisableUnknownNIC(t *testing.T) {
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
-- 
cgit v1.2.3


From 3733499952c056cc8496beb01c72dcf53177048e Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Fri, 21 Feb 2020 13:17:44 -0800
Subject: Fix master installer.

Sometimes, when we start a new instance, the file
lock on "apt" is locked. Add a loop to the master
installer.

In addition, the "apt-get install" fails to register
runsc in docker, so run the appropriate scripts to
get that to happen.

Also, add some helpful log messages.

PiperOrigin-RevId: 296497357
---
 benchmarks/harness/machine.py        |  9 ++++++---
 benchmarks/harness/ssh_connection.py |  9 +++++++--
 tools/installers/master.sh           | 17 ++++++++++++++++-
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/benchmarks/harness/machine.py b/benchmarks/harness/machine.py
index 3d32d3dda..5bdc4aa85 100644
--- a/benchmarks/harness/machine.py
+++ b/benchmarks/harness/machine.py
@@ -43,6 +43,8 @@ from benchmarks.harness import machine_mocks
 from benchmarks.harness import ssh_connection
 from benchmarks.harness import tunnel_dispatcher
 
+log = logging.getLogger(__name__)
+
 
 class Machine(object):
   """The machine object is the primary object for benchmarks.
@@ -236,9 +238,10 @@ class RemoteMachine(Machine):
           archive=archive, dir=harness.REMOTE_INSTALLERS_PATH))
       self._has_installers = True
 
-      # Execute the remote installer.
-      self.run("sudo {dir}/{file}".format(
-          dir=harness.REMOTE_INSTALLERS_PATH, file=installer))
+    # Execute the remote installer.
+    self.run("sudo {dir}/{file}".format(
+        dir=harness.REMOTE_INSTALLERS_PATH, file=installer))
+
     if results:
       results[index] = True
 
diff --git a/benchmarks/harness/ssh_connection.py b/benchmarks/harness/ssh_connection.py
index a50e34293..b8c8e42d4 100644
--- a/benchmarks/harness/ssh_connection.py
+++ b/benchmarks/harness/ssh_connection.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 """SSHConnection handles the details of SSH connections."""
 
-
+import logging
 import os
 import warnings
 
@@ -24,6 +24,8 @@ from benchmarks import harness
 # Get rid of paramiko Cryptography Warnings.
 warnings.filterwarnings(action="ignore", module=".*paramiko.*")
 
+log = logging.getLogger(__name__)
+
 
 def send_one_file(client: paramiko.SSHClient, path: str,
                   remote_dir: str) -> str:
@@ -94,10 +96,13 @@ class SSHConnection:
       The contents of stdout and stderr.
     """
     with self._client() as client:
+      log.info("running command: %s", cmd)
       _, stdout, stderr = client.exec_command(command=cmd)
-      stdout.channel.recv_exit_status()
+      log.info("returned status: %d", stdout.channel.recv_exit_status())
       stdout = stdout.read().decode("utf-8")
       stderr = stderr.read().decode("utf-8")
+      log.info("stdout: %s", stdout)
+      log.info("stderr: %s", stderr)
     return stdout, stderr
 
   def send_workload(self, name: str) -> str:
diff --git a/tools/installers/master.sh b/tools/installers/master.sh
index 7b1956454..52f9734a6 100755
--- a/tools/installers/master.sh
+++ b/tools/installers/master.sh
@@ -15,6 +15,21 @@
 # limitations under the License.
 
 # Install runsc from the master branch.
+set -e
+
 curl -fsSL https://gvisor.dev/archive.key | sudo apt-key add -
 add-apt-repository "deb https://storage.googleapis.com/gvisor/releases release main"
-apt-get update && apt-get install -y runsc
+while true; do
+  if apt-get update; then
+    apt-get install -y runsc
+    break
+  fi
+  result=$?
+  # Check if apt update failed to aquire the file lock.
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
+runsc install
+service docker restart
+
-- 
cgit v1.2.3


From 10aa4d3b343255db45f5ca4ff7b51f21a309e10b Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 21 Feb 2020 15:05:20 -0800
Subject: Factor platform tags.

PiperOrigin-RevId: 296519566
---
 test/runner/defs.bzl          | 58 ++++++++++++++-----------------------------
 tools/bazeldefs/platforms.bzl | 17 +++++++++++++
 tools/defs.bzl                |  3 +++
 3 files changed, 39 insertions(+), 39 deletions(-)
 create mode 100644 tools/bazeldefs/platforms.bzl

diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl
index 5e97c1867..56743a526 100644
--- a/test/runner/defs.bzl
+++ b/test/runner/defs.bzl
@@ -1,6 +1,6 @@
 """Defines a rule for syscall test targets."""
 
-load("//tools:defs.bzl", "loopback")
+load("//tools:defs.bzl", "default_platform", "loopback", "platforms")
 
 def _runner_test_impl(ctx):
     # Generate a runner binary.
@@ -94,19 +94,6 @@ def _syscall_test(
     # Disable off-host networking.
     tags.append("requires-net:loopback")
 
-    # Add tag to prevent the tests from running in a Bazel sandbox.
-    # TODO(b/120560048): Make the tests run without this tag.
-    tags.append("no-sandbox")
-
-    # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is
-    # more stable.
-    if platform == "kvm":
-        tags.append("manual")
-        tags.append("requires-kvm")
-
-        # TODO(b/112165693): Remove when tests pass reliably.
-        tags.append("notap")
-
     runner_args = [
         # Arguments are passed directly to runner binary.
         "--platform=" + platform,
@@ -149,6 +136,8 @@ def syscall_test(
       add_hostinet: add a hostinet test.
       tags: starting test tags.
     """
+    if not tags:
+        tags = []
 
     _syscall_test(
         test = test,
@@ -160,35 +149,26 @@ def syscall_test(
         tags = tags,
     )
 
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "kvm",
-        use_tmpfs = use_tmpfs,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
-
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "ptrace",
-        use_tmpfs = use_tmpfs,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
+    for (platform, platform_tags) in platforms.items():
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = platform,
+            use_tmpfs = use_tmpfs,
+            add_uds_tree = add_uds_tree,
+            tags = platform_tags + tags,
+        )
 
     if add_overlay:
         _syscall_test(
             test = test,
             shard_count = shard_count,
             size = size,
-            platform = "ptrace",
+            platform = default_platform,
             use_tmpfs = False,  # overlay is adding a writable tmpfs on top of root.
             add_uds_tree = add_uds_tree,
-            tags = tags,
+            tags = platforms[default_platform] + tags,
             overlay = True,
         )
 
@@ -198,10 +178,10 @@ def syscall_test(
             test = test,
             shard_count = shard_count,
             size = size,
-            platform = "ptrace",
+            platform = default_platform,
             use_tmpfs = use_tmpfs,
             add_uds_tree = add_uds_tree,
-            tags = tags,
+            tags = platforms[default_platform] + tags,
             file_access = "shared",
         )
 
@@ -210,9 +190,9 @@ def syscall_test(
             test = test,
             shard_count = shard_count,
             size = size,
-            platform = "ptrace",
+            platform = default_platform,
             use_tmpfs = use_tmpfs,
             network = "host",
             add_uds_tree = add_uds_tree,
-            tags = tags,
+            tags = platforms[default_platform] + tags,
         )
diff --git a/tools/bazeldefs/platforms.bzl b/tools/bazeldefs/platforms.bzl
new file mode 100644
index 000000000..92b0b5fc0
--- /dev/null
+++ b/tools/bazeldefs/platforms.bzl
@@ -0,0 +1,17 @@
+"""List of platforms."""
+
+# Platform to associated tags.
+platforms = {
+    "ptrace": [
+        # TODO(b/120560048): Make the tests run without this tag.
+        "no-sandbox",
+    ],
+    "kvm": [
+        "manual",
+        "local",
+        # TODO(b/120560048): Make the tests run without this tag.
+        "no-sandbox",
+    ],
+}
+
+default_platform = "ptrace"
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 45c065459..15a310403 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -8,6 +8,7 @@ change for Google-internal and bazel-compatible rules.
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
 load("//tools/bazeldefs:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/bazeldefs:platforms.bzl", _default_platform = "default_platform", _platforms = "platforms")
 load("//tools/bazeldefs:tags.bzl", "go_suffixes")
 
 # Delegate directly.
@@ -34,6 +35,8 @@ select_system = _select_system
 loopback = _loopback
 default_installer = _default_installer
 default_net_util = _default_net_util
+platforms = _platforms
+default_platform = _default_platform
 
 def go_binary(name, **kwargs):
     """Wraps the standard go_binary.
-- 
cgit v1.2.3


From b8f56c79be40d9c75f4e2f279c9d821d1c1c3569 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 21 Feb 2020 15:41:56 -0800
Subject: Implement tap/tun device in vfs.

PiperOrigin-RevId: 296526279
---
 pkg/abi/linux/BUILD                              |   1 +
 pkg/abi/linux/ioctl.go                           |  26 ++
 pkg/abi/linux/ioctl_tun.go                       |  29 ++
 pkg/sentry/fs/dev/BUILD                          |   5 +
 pkg/sentry/fs/dev/dev.go                         |  10 +-
 pkg/sentry/fs/dev/net_tun.go                     | 170 +++++++++++
 pkg/syserror/syserror.go                         |   1 +
 pkg/tcpip/buffer/view.go                         |   6 +
 pkg/tcpip/link/channel/BUILD                     |   1 +
 pkg/tcpip/link/channel/channel.go                | 180 +++++++++---
 pkg/tcpip/link/tun/BUILD                         |  18 +-
 pkg/tcpip/link/tun/device.go                     | 352 +++++++++++++++++++++++
 pkg/tcpip/link/tun/protocol.go                   |  56 ++++
 pkg/tcpip/stack/nic.go                           |  32 +++
 pkg/tcpip/stack/stack.go                         |  39 +++
 test/syscalls/BUILD                              |   2 +
 test/syscalls/linux/BUILD                        |  30 ++
 test/syscalls/linux/dev.cc                       |   7 +
 test/syscalls/linux/socket_netlink_route_util.cc | 163 +++++++++++
 test/syscalls/linux/socket_netlink_route_util.h  |  55 ++++
 test/syscalls/linux/tuntap.cc                    | 346 ++++++++++++++++++++++
 21 files changed, 1490 insertions(+), 39 deletions(-)
 create mode 100644 pkg/abi/linux/ioctl_tun.go
 create mode 100644 pkg/sentry/fs/dev/net_tun.go
 create mode 100644 pkg/tcpip/link/tun/device.go
 create mode 100644 pkg/tcpip/link/tun/protocol.go
 create mode 100644 test/syscalls/linux/socket_netlink_route_util.cc
 create mode 100644 test/syscalls/linux/socket_netlink_route_util.h
 create mode 100644 test/syscalls/linux/tuntap.cc

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index a89f34d4b..322d1ccc4 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -30,6 +30,7 @@ go_library(
         "futex.go",
         "inotify.go",
         "ioctl.go",
+        "ioctl_tun.go",
         "ip.go",
         "ipc.go",
         "limits.go",
diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go
index 0e18db9ef..2062e6a4b 100644
--- a/pkg/abi/linux/ioctl.go
+++ b/pkg/abi/linux/ioctl.go
@@ -72,3 +72,29 @@ const (
 	SIOCGMIIPHY   = 0x8947
 	SIOCGMIIREG   = 0x8948
 )
+
+// ioctl(2) directions. Used to calculate requests number.
+// Constants from asm-generic/ioctl.h.
+const (
+	_IOC_NONE  = 0
+	_IOC_WRITE = 1
+	_IOC_READ  = 2
+)
+
+// Constants from asm-generic/ioctl.h.
+const (
+	_IOC_NRBITS   = 8
+	_IOC_TYPEBITS = 8
+	_IOC_SIZEBITS = 14
+	_IOC_DIRBITS  = 2
+
+	_IOC_NRSHIFT   = 0
+	_IOC_TYPESHIFT = _IOC_NRSHIFT + _IOC_NRBITS
+	_IOC_SIZESHIFT = _IOC_TYPESHIFT + _IOC_TYPEBITS
+	_IOC_DIRSHIFT  = _IOC_SIZESHIFT + _IOC_SIZEBITS
+)
+
+// IOC outputs the result of _IOC macro in asm-generic/ioctl.h.
+func IOC(dir, typ, nr, size uint32) uint32 {
+	return uint32(dir)<<_IOC_DIRSHIFT | typ<<_IOC_TYPESHIFT | nr<<_IOC_NRSHIFT | size<<_IOC_SIZESHIFT
+}
diff --git a/pkg/abi/linux/ioctl_tun.go b/pkg/abi/linux/ioctl_tun.go
new file mode 100644
index 000000000..c59c9c136
--- /dev/null
+++ b/pkg/abi/linux/ioctl_tun.go
@@ -0,0 +1,29 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// ioctl(2) request numbers from linux/if_tun.h
+var (
+	TUNSETIFF = IOC(_IOC_WRITE, 'T', 202, 4)
+	TUNGETIFF = IOC(_IOC_READ, 'T', 210, 4)
+)
+
+// Flags from net/if_tun.h
+const (
+	IFF_TUN      = 0x0001
+	IFF_TAP      = 0x0002
+	IFF_NO_PI    = 0x1000
+	IFF_NOFILTER = 0x1000
+)
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 4c4b7d5cc..9b6bb26d0 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -9,6 +9,7 @@ go_library(
         "device.go",
         "fs.go",
         "full.go",
+        "net_tun.go",
         "null.go",
         "random.go",
         "tty.go",
@@ -19,15 +20,19 @@ go_library(
         "//pkg/context",
         "//pkg/rand",
         "//pkg/safemem",
+        "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/ramfs",
         "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
         "//pkg/sentry/pgalloc",
+        "//pkg/sentry/socket/netstack",
         "//pkg/syserror",
+        "//pkg/tcpip/link/tun",
         "//pkg/usermem",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go
index 35bd23991..7e66c29b0 100644
--- a/pkg/sentry/fs/dev/dev.go
+++ b/pkg/sentry/fs/dev/dev.go
@@ -66,8 +66,8 @@ func newMemDevice(ctx context.Context, iops fs.InodeOperations, msrc *fs.MountSo
 	})
 }
 
-func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
-	iops := ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555))
+func newDirectory(ctx context.Context, contents map[string]*fs.Inode, msrc *fs.MountSource) *fs.Inode {
+	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
 	return fs.NewInode(ctx, iops, msrc, fs.StableAttr{
 		DeviceID:  devDevice.DeviceID(),
 		InodeID:   devDevice.NextIno(),
@@ -111,7 +111,7 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		// A devpts is typically mounted at /dev/pts to provide
 		// pseudoterminal support. Place an empty directory there for
 		// the devpts to be mounted over.
-		"pts": newDirectory(ctx, msrc),
+		"pts": newDirectory(ctx, nil, msrc),
 		// Similarly, applications expect a ptmx device at /dev/ptmx
 		// connected to the terminals provided by /dev/pts/. Rather
 		// than creating a device directly (which requires a hairy
@@ -124,6 +124,10 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		"ptmx": newSymlink(ctx, "pts/ptmx", msrc),
 
 		"tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor),
+
+		"net": newDirectory(ctx, map[string]*fs.Inode{
+			"tun": newCharacterDevice(ctx, newNetTunDevice(ctx, fs.RootOwner, 0666), msrc, netTunDevMajor, netTunDevMinor),
+		}, msrc),
 	}
 
 	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
new file mode 100644
index 000000000..755644488
--- /dev/null
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -0,0 +1,170 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package dev
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	netTunDevMajor = 10
+	netTunDevMinor = 200
+)
+
+// +stateify savable
+type netTunInodeOperations struct {
+	fsutil.InodeGenericChecker       `state:"nosave"`
+	fsutil.InodeNoExtendedAttributes `state:"nosave"`
+	fsutil.InodeNoopAllocate         `state:"nosave"`
+	fsutil.InodeNoopRelease          `state:"nosave"`
+	fsutil.InodeNoopTruncate         `state:"nosave"`
+	fsutil.InodeNoopWriteOut         `state:"nosave"`
+	fsutil.InodeNotDirectory         `state:"nosave"`
+	fsutil.InodeNotMappable          `state:"nosave"`
+	fsutil.InodeNotSocket            `state:"nosave"`
+	fsutil.InodeNotSymlink           `state:"nosave"`
+	fsutil.InodeVirtual              `state:"nosave"`
+
+	fsutil.InodeSimpleAttributes
+}
+
+var _ fs.InodeOperations = (*netTunInodeOperations)(nil)
+
+func newNetTunDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *netTunInodeOperations {
+	return &netTunInodeOperations{
+		InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC),
+	}
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (iops *netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return fs.NewFile(ctx, d, flags, &netTunFileOperations{}), nil
+}
+
+// +stateify savable
+type netTunFileOperations struct {
+	fsutil.FileNoSeek               `state:"nosave"`
+	fsutil.FileNoMMap               `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoopFlush            `state:"nosave"`
+	fsutil.FileNoopFsync            `state:"nosave"`
+	fsutil.FileNotDirReaddir        `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+	device tun.Device
+}
+
+var _ fs.FileOperations = (*netTunFileOperations)(nil)
+
+// Release implements fs.FileOperations.Release.
+func (fops *netTunFileOperations) Release() {
+	fops.device.Release()
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	request := args[1].Uint()
+	data := args[2].Pointer()
+
+	switch request {
+	case linux.TUNSETIFF:
+		t := kernel.TaskFromContext(ctx)
+		if t == nil {
+			panic("Ioctl should be called from a task context")
+		}
+		if !t.HasCapability(linux.CAP_NET_ADMIN) {
+			return 0, syserror.EPERM
+		}
+		stack, ok := t.NetworkContext().(*netstack.Stack)
+		if !ok {
+			return 0, syserror.EINVAL
+		}
+
+		var req linux.IFReq
+		if _, err := usermem.CopyObjectIn(ctx, io, data, &req, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		flags := usermem.ByteOrder.Uint16(req.Data[:])
+		return 0, fops.device.SetIff(stack.Stack, req.Name(), flags)
+
+	case linux.TUNGETIFF:
+		var req linux.IFReq
+
+		copy(req.IFName[:], fops.device.Name())
+
+		// Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when
+		// there is no sk_filter. See __tun_chr_ioctl() in net/drivers/tun.c.
+		flags := fops.device.Flags() | linux.IFF_NOFILTER
+		usermem.ByteOrder.PutUint16(req.Data[:], flags)
+
+		_, err := usermem.CopyObjectOut(ctx, io, data, &req, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// Write implements fs.FileOperations.Write.
+func (fops *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	data := make([]byte, src.NumBytes())
+	if _, err := src.CopyIn(ctx, data); err != nil {
+		return 0, err
+	}
+	return fops.device.Write(data)
+}
+
+// Read implements fs.FileOperations.Read.
+func (fops *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	data, err := fops.device.Read()
+	if err != nil {
+		return 0, err
+	}
+	n, err := dst.CopyOut(ctx, data)
+	if n > 0 && n < len(data) {
+		// Not an error for partial copying. Packet truncated.
+		err = nil
+	}
+	return int64(n), err
+}
+
+// Readiness implements watier.Waitable.Readiness.
+func (fops *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fops.device.Readiness(mask)
+}
+
+// EventRegister implements watier.Waitable.EventRegister.
+func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fops.device.EventRegister(e, mask)
+}
+
+// EventUnregister implements watier.Waitable.EventUnregister.
+func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) {
+	fops.device.EventUnregister(e)
+}
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go
index 2269f6237..4b5a0fca6 100644
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@@ -29,6 +29,7 @@ var (
 	EACCES       = error(syscall.EACCES)
 	EAGAIN       = error(syscall.EAGAIN)
 	EBADF        = error(syscall.EBADF)
+	EBADFD       = error(syscall.EBADFD)
 	EBUSY        = error(syscall.EBUSY)
 	ECHILD       = error(syscall.ECHILD)
 	ECONNREFUSED = error(syscall.ECONNREFUSED)
diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go
index 150310c11..17e94c562 100644
--- a/pkg/tcpip/buffer/view.go
+++ b/pkg/tcpip/buffer/view.go
@@ -156,3 +156,9 @@ func (vv *VectorisedView) Append(vv2 VectorisedView) {
 	vv.views = append(vv.views, vv2.views...)
 	vv.size += vv2.size
 }
+
+// AppendView appends the given view into this vectorised view.
+func (vv *VectorisedView) AppendView(v View) {
+	vv.views = append(vv.views, v)
+	vv.size += len(v)
+}
diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD
index 3974c464e..b8b93e78e 100644
--- a/pkg/tcpip/link/channel/BUILD
+++ b/pkg/tcpip/link/channel/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = ["channel.go"],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/stack",
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 78d447acd..5944ba190 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -20,6 +20,7 @@ package channel
 import (
 	"context"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -33,6 +34,118 @@ type PacketInfo struct {
 	Route stack.Route
 }
 
+// Notification is the interface for receiving notification from the packet
+// queue.
+type Notification interface {
+	// WriteNotify will be called when a write happens to the queue.
+	WriteNotify()
+}
+
+// NotificationHandle is an opaque handle to the registered notification target.
+// It can be used to unregister the notification when no longer interested.
+//
+// +stateify savable
+type NotificationHandle struct {
+	n Notification
+}
+
+type queue struct {
+	// mu protects fields below.
+	mu sync.RWMutex
+	// c is the outbound packet channel. Sending to c should hold mu.
+	c        chan PacketInfo
+	numWrite int
+	numRead  int
+	notify   []*NotificationHandle
+}
+
+func (q *queue) Close() {
+	close(q.c)
+}
+
+func (q *queue) Read() (PacketInfo, bool) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	select {
+	case p := <-q.c:
+		q.numRead++
+		return p, true
+	default:
+		return PacketInfo{}, false
+	}
+}
+
+func (q *queue) ReadContext(ctx context.Context) (PacketInfo, bool) {
+	// We have to receive from channel without holding the lock, since it can
+	// block indefinitely. This will cause a window that numWrite - numRead
+	// produces a larger number, but won't go to negative. numWrite >= numRead
+	// still holds.
+	select {
+	case pkt := <-q.c:
+		q.mu.Lock()
+		defer q.mu.Unlock()
+		q.numRead++
+		return pkt, true
+	case <-ctx.Done():
+		return PacketInfo{}, false
+	}
+}
+
+func (q *queue) Write(p PacketInfo) bool {
+	wrote := false
+
+	// It's important to make sure nobody can see numWrite until we increment it,
+	// so numWrite >= numRead holds.
+	q.mu.Lock()
+	select {
+	case q.c <- p:
+		wrote = true
+		q.numWrite++
+	default:
+	}
+	notify := q.notify
+	q.mu.Unlock()
+
+	if wrote {
+		// Send notification outside of lock.
+		for _, h := range notify {
+			h.n.WriteNotify()
+		}
+	}
+	return wrote
+}
+
+func (q *queue) Num() int {
+	q.mu.RLock()
+	defer q.mu.RUnlock()
+	n := q.numWrite - q.numRead
+	if n < 0 {
+		panic("numWrite < numRead")
+	}
+	return n
+}
+
+func (q *queue) AddNotify(notify Notification) *NotificationHandle {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	h := &NotificationHandle{n: notify}
+	q.notify = append(q.notify, h)
+	return h
+}
+
+func (q *queue) RemoveNotify(handle *NotificationHandle) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	// Make a copy, since we reads the array outside of lock when notifying.
+	notify := make([]*NotificationHandle, 0, len(q.notify))
+	for _, h := range q.notify {
+		if h != handle {
+			notify = append(notify, h)
+		}
+	}
+	q.notify = notify
+}
+
 // Endpoint is link layer endpoint that stores outbound packets in a channel
 // and allows injection of inbound packets.
 type Endpoint struct {
@@ -41,14 +154,16 @@ type Endpoint struct {
 	linkAddr           tcpip.LinkAddress
 	LinkEPCapabilities stack.LinkEndpointCapabilities
 
-	// c is where outbound packets are queued.
-	c chan PacketInfo
+	// Outbound packet queue.
+	q *queue
 }
 
 // New creates a new channel endpoint.
 func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
 	return &Endpoint{
-		c:        make(chan PacketInfo, size),
+		q: &queue{
+			c: make(chan PacketInfo, size),
+		},
 		mtu:      mtu,
 		linkAddr: linkAddr,
 	}
@@ -57,43 +172,36 @@ func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
 // Close closes e. Further packet injections will panic. Reads continue to
 // succeed until all packets are read.
 func (e *Endpoint) Close() {
-	close(e.c)
+	e.q.Close()
 }
 
-// Read does non-blocking read for one packet from the outbound packet queue.
+// Read does non-blocking read one packet from the outbound packet queue.
 func (e *Endpoint) Read() (PacketInfo, bool) {
-	select {
-	case pkt := <-e.c:
-		return pkt, true
-	default:
-		return PacketInfo{}, false
-	}
+	return e.q.Read()
 }
 
 // ReadContext does blocking read for one packet from the outbound packet queue.
 // It can be cancelled by ctx, and in this case, it returns false.
 func (e *Endpoint) ReadContext(ctx context.Context) (PacketInfo, bool) {
-	select {
-	case pkt := <-e.c:
-		return pkt, true
-	case <-ctx.Done():
-		return PacketInfo{}, false
-	}
+	return e.q.ReadContext(ctx)
 }
 
 // Drain removes all outbound packets from the channel and counts them.
 func (e *Endpoint) Drain() int {
 	c := 0
 	for {
-		select {
-		case <-e.c:
-			c++
-		default:
+		if _, ok := e.Read(); !ok {
 			return c
 		}
+		c++
 	}
 }
 
+// NumQueued returns the number of packet queued for outbound.
+func (e *Endpoint) NumQueued() int {
+	return e.q.Num()
+}
+
 // InjectInbound injects an inbound packet.
 func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	e.InjectLinkAddr(protocol, "", pkt)
@@ -155,10 +263,7 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 		Route: route,
 	}
 
-	select {
-	case e.c <- p:
-	default:
-	}
+	e.q.Write(p)
 
 	return nil
 }
@@ -171,7 +276,6 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 	route.Release()
 	payloadView := pkts[0].Data.ToView()
 	n := 0
-packetLoop:
 	for _, pkt := range pkts {
 		off := pkt.DataOffset
 		size := pkt.DataSize
@@ -185,12 +289,10 @@ packetLoop:
 			Route: route,
 		}
 
-		select {
-		case e.c <- p:
-			n++
-		default:
-			break packetLoop
+		if !e.q.Write(p) {
+			break
 		}
+		n++
 	}
 
 	return n, nil
@@ -204,13 +306,21 @@ func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 		GSO:   nil,
 	}
 
-	select {
-	case e.c <- p:
-	default:
-	}
+	e.q.Write(p)
 
 	return nil
 }
 
 // Wait implements stack.LinkEndpoint.Wait.
 func (*Endpoint) Wait() {}
+
+// AddNotify adds a notification target for receiving event about outgoing
+// packets.
+func (e *Endpoint) AddNotify(notify Notification) *NotificationHandle {
+	return e.q.AddNotify(notify)
+}
+
+// RemoveNotify removes handle from the list of notification targets.
+func (e *Endpoint) RemoveNotify(handle *NotificationHandle) {
+	e.q.RemoveNotify(handle)
+}
diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD
index e5096ea38..e0db6cf54 100644
--- a/pkg/tcpip/link/tun/BUILD
+++ b/pkg/tcpip/link/tun/BUILD
@@ -4,6 +4,22 @@ package(licenses = ["notice"])
 
 go_library(
     name = "tun",
-    srcs = ["tun_unsafe.go"],
+    srcs = [
+        "device.go",
+        "protocol.go",
+        "tun_unsafe.go",
+    ],
     visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/refs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/stack",
+        "//pkg/waiter",
+    ],
 )
diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
new file mode 100644
index 000000000..6ff47a742
--- /dev/null
+++ b/pkg/tcpip/link/tun/device.go
@@ -0,0 +1,352 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// drivers/net/tun.c:tun_net_init()
+	defaultDevMtu = 1500
+
+	// Queue length for outbound packet, arriving at fd side for read. Overflow
+	// causes packet drops. gVisor implementation-specific.
+	defaultDevOutQueueLen = 1024
+)
+
+var zeroMAC [6]byte
+
+// Device is an opened /dev/net/tun device.
+//
+// +stateify savable
+type Device struct {
+	waiter.Queue
+
+	mu           sync.RWMutex `state:"nosave"`
+	endpoint     *tunEndpoint
+	notifyHandle *channel.NotificationHandle
+	flags        uint16
+}
+
+// beforeSave is invoked by stateify.
+func (d *Device) beforeSave() {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	// TODO(b/110961832): Restore the device to stack. At this moment, the stack
+	// is not savable.
+	if d.endpoint != nil {
+		panic("/dev/net/tun does not support save/restore when a device is associated with it.")
+	}
+}
+
+// Release implements fs.FileOperations.Release.
+func (d *Device) Release() {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	// Decrease refcount if there is an endpoint associated with this file.
+	if d.endpoint != nil {
+		d.endpoint.RemoveNotify(d.notifyHandle)
+		d.endpoint.DecRef()
+		d.endpoint = nil
+	}
+}
+
+// SetIff services TUNSETIFF ioctl(2) request.
+func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	if d.endpoint != nil {
+		return syserror.EINVAL
+	}
+
+	// Input validations.
+	isTun := flags&linux.IFF_TUN != 0
+	isTap := flags&linux.IFF_TAP != 0
+	supportedFlags := uint16(linux.IFF_TUN | linux.IFF_TAP | linux.IFF_NO_PI)
+	if isTap && isTun || !isTap && !isTun || flags&^supportedFlags != 0 {
+		return syserror.EINVAL
+	}
+
+	prefix := "tun"
+	if isTap {
+		prefix = "tap"
+	}
+
+	endpoint, err := attachOrCreateNIC(s, name, prefix)
+	if err != nil {
+		return syserror.EINVAL
+	}
+
+	d.endpoint = endpoint
+	d.notifyHandle = d.endpoint.AddNotify(d)
+	d.flags = flags
+	return nil
+}
+
+func attachOrCreateNIC(s *stack.Stack, name, prefix string) (*tunEndpoint, error) {
+	for {
+		// 1. Try to attach to an existing NIC.
+		if name != "" {
+			if nic, found := s.GetNICByName(name); found {
+				endpoint, ok := nic.LinkEndpoint().(*tunEndpoint)
+				if !ok {
+					// Not a NIC created by tun device.
+					return nil, syserror.EOPNOTSUPP
+				}
+				if !endpoint.TryIncRef() {
+					// Race detected: NIC got deleted in between.
+					continue
+				}
+				return endpoint, nil
+			}
+		}
+
+		// 2. Creating a new NIC.
+		id := tcpip.NICID(s.UniqueID())
+		endpoint := &tunEndpoint{
+			Endpoint: channel.New(defaultDevOutQueueLen, defaultDevMtu, ""),
+			stack:    s,
+			nicID:    id,
+			name:     name,
+		}
+		if endpoint.name == "" {
+			endpoint.name = fmt.Sprintf("%s%d", prefix, id)
+		}
+		err := s.CreateNICWithOptions(endpoint.nicID, endpoint, stack.NICOptions{
+			Name: endpoint.name,
+		})
+		switch err {
+		case nil:
+			return endpoint, nil
+		case tcpip.ErrDuplicateNICID:
+			// Race detected: A NIC has been created in between.
+			continue
+		default:
+			return nil, syserror.EINVAL
+		}
+	}
+}
+
+// Write inject one inbound packet to the network interface.
+func (d *Device) Write(data []byte) (int64, error) {
+	d.mu.RLock()
+	endpoint := d.endpoint
+	d.mu.RUnlock()
+	if endpoint == nil {
+		return 0, syserror.EBADFD
+	}
+	if !endpoint.IsAttached() {
+		return 0, syserror.EIO
+	}
+
+	dataLen := int64(len(data))
+
+	// Packet information.
+	var pktInfoHdr PacketInfoHeader
+	if !d.hasFlags(linux.IFF_NO_PI) {
+		if len(data) < PacketInfoHeaderSize {
+			// Ignore bad packet.
+			return dataLen, nil
+		}
+		pktInfoHdr = PacketInfoHeader(data[:PacketInfoHeaderSize])
+		data = data[PacketInfoHeaderSize:]
+	}
+
+	// Ethernet header (TAP only).
+	var ethHdr header.Ethernet
+	if d.hasFlags(linux.IFF_TAP) {
+		if len(data) < header.EthernetMinimumSize {
+			// Ignore bad packet.
+			return dataLen, nil
+		}
+		ethHdr = header.Ethernet(data[:header.EthernetMinimumSize])
+		data = data[header.EthernetMinimumSize:]
+	}
+
+	// Try to determine network protocol number, default zero.
+	var protocol tcpip.NetworkProtocolNumber
+	switch {
+	case pktInfoHdr != nil:
+		protocol = pktInfoHdr.Protocol()
+	case ethHdr != nil:
+		protocol = ethHdr.Type()
+	}
+
+	// Try to determine remote link address, default zero.
+	var remote tcpip.LinkAddress
+	switch {
+	case ethHdr != nil:
+		remote = ethHdr.SourceAddress()
+	default:
+		remote = tcpip.LinkAddress(zeroMAC[:])
+	}
+
+	pkt := tcpip.PacketBuffer{
+		Data: buffer.View(data).ToVectorisedView(),
+	}
+	if ethHdr != nil {
+		pkt.LinkHeader = buffer.View(ethHdr)
+	}
+	endpoint.InjectLinkAddr(protocol, remote, pkt)
+	return dataLen, nil
+}
+
+// Read reads one outgoing packet from the network interface.
+func (d *Device) Read() ([]byte, error) {
+	d.mu.RLock()
+	endpoint := d.endpoint
+	d.mu.RUnlock()
+	if endpoint == nil {
+		return nil, syserror.EBADFD
+	}
+
+	for {
+		info, ok := endpoint.Read()
+		if !ok {
+			return nil, syserror.ErrWouldBlock
+		}
+
+		v, ok := d.encodePkt(&info)
+		if !ok {
+			// Ignore unsupported packet.
+			continue
+		}
+		return v, nil
+	}
+}
+
+// encodePkt encodes packet for fd side.
+func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) {
+	var vv buffer.VectorisedView
+
+	// Packet information.
+	if !d.hasFlags(linux.IFF_NO_PI) {
+		hdr := make(PacketInfoHeader, PacketInfoHeaderSize)
+		hdr.Encode(&PacketInfoFields{
+			Protocol: info.Proto,
+		})
+		vv.AppendView(buffer.View(hdr))
+	}
+
+	// If the packet does not already have link layer header, and the route
+	// does not exist, we can't compute it. This is possibly a raw packet, tun
+	// device doesn't support this at the moment.
+	if info.Pkt.LinkHeader == nil && info.Route.RemoteLinkAddress == "" {
+		return nil, false
+	}
+
+	// Ethernet header (TAP only).
+	if d.hasFlags(linux.IFF_TAP) {
+		// Add ethernet header if not provided.
+		if info.Pkt.LinkHeader == nil {
+			hdr := &header.EthernetFields{
+				SrcAddr: info.Route.LocalLinkAddress,
+				DstAddr: info.Route.RemoteLinkAddress,
+				Type:    info.Proto,
+			}
+			if hdr.SrcAddr == "" {
+				hdr.SrcAddr = d.endpoint.LinkAddress()
+			}
+
+			eth := make(header.Ethernet, header.EthernetMinimumSize)
+			eth.Encode(hdr)
+			vv.AppendView(buffer.View(eth))
+		} else {
+			vv.AppendView(info.Pkt.LinkHeader)
+		}
+	}
+
+	// Append upper headers.
+	vv.AppendView(buffer.View(info.Pkt.Header.View()[len(info.Pkt.LinkHeader):]))
+	// Append data payload.
+	vv.Append(info.Pkt.Data)
+
+	return vv.ToView(), true
+}
+
+// Name returns the name of the attached network interface. Empty string if
+// unattached.
+func (d *Device) Name() string {
+	d.mu.RLock()
+	defer d.mu.RUnlock()
+	if d.endpoint != nil {
+		return d.endpoint.name
+	}
+	return ""
+}
+
+// Flags returns the flags set for d. Zero value if unset.
+func (d *Device) Flags() uint16 {
+	d.mu.RLock()
+	defer d.mu.RUnlock()
+	return d.flags
+}
+
+func (d *Device) hasFlags(flags uint16) bool {
+	return d.flags&flags == flags
+}
+
+// Readiness implements watier.Waitable.Readiness.
+func (d *Device) Readiness(mask waiter.EventMask) waiter.EventMask {
+	if mask&waiter.EventIn != 0 {
+		d.mu.RLock()
+		endpoint := d.endpoint
+		d.mu.RUnlock()
+		if endpoint != nil && endpoint.NumQueued() == 0 {
+			mask &= ^waiter.EventIn
+		}
+	}
+	return mask & (waiter.EventIn | waiter.EventOut)
+}
+
+// WriteNotify implements channel.Notification.WriteNotify.
+func (d *Device) WriteNotify() {
+	d.Notify(waiter.EventIn)
+}
+
+// tunEndpoint is the link endpoint for the NIC created by the tun device.
+//
+// It is ref-counted as multiple opening files can attach to the same NIC.
+// The last owner is responsible for deleting the NIC.
+type tunEndpoint struct {
+	*channel.Endpoint
+
+	refs.AtomicRefCount
+
+	stack *stack.Stack
+	nicID tcpip.NICID
+	name  string
+}
+
+// DecRef decrements refcount of e, removes NIC if refcount goes to 0.
+func (e *tunEndpoint) DecRef() {
+	e.DecRefWithDestructor(func() {
+		e.stack.RemoveNIC(e.nicID)
+	})
+}
diff --git a/pkg/tcpip/link/tun/protocol.go b/pkg/tcpip/link/tun/protocol.go
new file mode 100644
index 000000000..89d9d91a9
--- /dev/null
+++ b/pkg/tcpip/link/tun/protocol.go
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	// PacketInfoHeaderSize is the size of the packet information header.
+	PacketInfoHeaderSize = 4
+
+	offsetFlags    = 0
+	offsetProtocol = 2
+)
+
+// PacketInfoFields contains fields sent through the wire if IFF_NO_PI flag is
+// not set.
+type PacketInfoFields struct {
+	Flags    uint16
+	Protocol tcpip.NetworkProtocolNumber
+}
+
+// PacketInfoHeader is the wire representation of the packet information sent if
+// IFF_NO_PI flag is not set.
+type PacketInfoHeader []byte
+
+// Encode encodes f into h.
+func (h PacketInfoHeader) Encode(f *PacketInfoFields) {
+	binary.BigEndian.PutUint16(h[offsetFlags:][:2], f.Flags)
+	binary.BigEndian.PutUint16(h[offsetProtocol:][:2], uint16(f.Protocol))
+}
+
+// Flags returns the flag field in h.
+func (h PacketInfoHeader) Flags() uint16 {
+	return binary.BigEndian.Uint16(h[offsetFlags:])
+}
+
+// Protocol returns the protocol field in h.
+func (h PacketInfoHeader) Protocol() tcpip.NetworkProtocolNumber {
+	return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(h[offsetProtocol:]))
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 862954ab2..46d3a6646 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -298,6 +298,33 @@ func (n *NIC) enable() *tcpip.Error {
 	return nil
 }
 
+// remove detaches NIC from the link endpoint, and marks existing referenced
+// network endpoints expired. This guarantees no packets between this NIC and
+// the network stack.
+func (n *NIC) remove() *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	// Detach from link endpoint, so no packet comes in.
+	n.linkEP.Attach(nil)
+
+	// Remove permanent and permanentTentative addresses, so no packet goes out.
+	var errs []*tcpip.Error
+	for nid, ref := range n.mu.endpoints {
+		switch ref.getKind() {
+		case permanentTentative, permanent:
+			if err := n.removePermanentAddressLocked(nid.LocalAddress); err != nil {
+				errs = append(errs, err)
+			}
+		}
+	}
+	if len(errs) > 0 {
+		return errs[0]
+	}
+
+	return nil
+}
+
 // becomeIPv6Router transitions n into an IPv6 router.
 //
 // When transitioning into an IPv6 router, host-only state (NDP discovered
@@ -1302,6 +1329,11 @@ func (n *NIC) Stack() *Stack {
 	return n.stack
 }
 
+// LinkEndpoint returns the link endpoint of n.
+func (n *NIC) LinkEndpoint() LinkEndpoint {
+	return n.linkEP
+}
+
 // isAddrTentative returns true if addr is tentative on n.
 //
 // Note that if addr is not associated with n, then this function will return
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index f0ed76fbe..900dd46c5 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -916,6 +916,18 @@ func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
 	return s.CreateNICWithOptions(id, ep, NICOptions{})
 }
 
+// GetNICByName gets the NIC specified by name.
+func (s *Stack) GetNICByName(name string) (*NIC, bool) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	for _, nic := range s.nics {
+		if nic.Name() == name {
+			return nic, true
+		}
+	}
+	return nil, false
+}
+
 // EnableNIC enables the given NIC so that the link-layer endpoint can start
 // delivering packets to it.
 func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error {
@@ -956,6 +968,33 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	return nic.enabled()
 }
 
+// RemoveNIC removes NIC and all related routes from the network stack.
+func (s *Stack) RemoveNIC(id tcpip.NICID) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	nic, ok := s.nics[id]
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+	delete(s.nics, id)
+
+	// Remove routes in-place. n tracks the number of routes written.
+	n := 0
+	for i, r := range s.routeTable {
+		if r.NIC != id {
+			// Keep this route.
+			if i > n {
+				s.routeTable[n] = r
+			}
+			n++
+		}
+	}
+	s.routeTable = s.routeTable[:n]
+
+	return nic.remove()
+}
+
 // NICAddressRanges returns a map of NICIDs to their associated subnets.
 func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
 	s.mu.RLock()
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index d1977d4de..3518e862d 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -678,6 +678,8 @@ syscall_test(
     test = "//test/syscalls/linux:truncate_test",
 )
 
+syscall_test(test = "//test/syscalls/linux:tuntap_test")
+
 syscall_test(test = "//test/syscalls/linux:udp_bind_test")
 
 syscall_test(
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index aa303af84..704bae17b 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -131,6 +131,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "socket_netlink_route_util",
+    testonly = 1,
+    srcs = ["socket_netlink_route_util.cc"],
+    hdrs = ["socket_netlink_route_util.h"],
+    deps = [
+        ":socket_netlink_util",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
 cc_library(
     name = "socket_test_util",
     testonly = 1,
@@ -3430,6 +3441,25 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "tuntap_test",
+    testonly = 1,
+    srcs = ["tuntap.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        gtest,
+        "//test/syscalls/linux:socket_netlink_route_util",
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "udp_socket_test_cases",
     testonly = 1,
diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc
index 4dd302eed..4e473268c 100644
--- a/test/syscalls/linux/dev.cc
+++ b/test/syscalls/linux/dev.cc
@@ -153,6 +153,13 @@ TEST(DevTest, TTYExists) {
   EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
 }
 
+TEST(DevTest, NetTunExists) {
+  struct stat statbuf = {};
+  ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallSucceeds());
+  // Check that it's a character device with rw-rw-rw- permissions.
+  EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
+}
+
 }  // namespace
 }  // namespace testing
 
diff --git a/test/syscalls/linux/socket_netlink_route_util.cc b/test/syscalls/linux/socket_netlink_route_util.cc
new file mode 100644
index 000000000..53eb3b6b2
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_route_util.cc
@@ -0,0 +1,163 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+
+#include <linux/if.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include "absl/types/optional.h"
+#include "test/syscalls/linux/socket_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr uint32_t kSeq = 12345;
+
+}  // namespace
+
+PosixError DumpLinks(
+    const FileDescriptor& fd, uint32_t seq,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_seq = seq;
+  req.ifm.ifi_family = AF_UNSPEC;
+
+  return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false);
+}
+
+PosixErrorOr<std::vector<Link>> DumpLinks() {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  std::vector<Link> links;
+  RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    if (hdr->nlmsg_type != RTM_NEWLINK ||
+        hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
+      return;
+    }
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+    if (rta == nullptr) {
+      // Ignore links that do not have a name.
+      return;
+    }
+
+    links.emplace_back();
+    links.back().index = msg->ifi_index;
+    links.back().type = msg->ifi_type;
+    links.back().name =
+        std::string(reinterpret_cast<const char*>(RTA_DATA(rta)));
+  }));
+  return links;
+}
+
+PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  for (const auto& link : links) {
+    if (link.type == ARPHRD_LOOPBACK) {
+      return absl::optional<Link>(link);
+    }
+  }
+  return absl::optional<Link>();
+}
+
+PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
+                            const void* addr, int addrlen) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifaddrmsg ifaddr;
+    char attrbuf[512];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifaddr));
+  req.hdr.nlmsg_type = RTM_NEWADDR;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifaddr.ifa_index = index;
+  req.ifaddr.ifa_family = family;
+  req.ifaddr.ifa_prefixlen = prefixlen;
+
+  struct rtattr* rta = reinterpret_cast<struct rtattr*>(
+      reinterpret_cast<int8_t*>(&req) + NLMSG_ALIGN(req.hdr.nlmsg_len));
+  rta->rta_type = IFA_LOCAL;
+  rta->rta_len = RTA_LENGTH(addrlen);
+  req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + RTA_LENGTH(addrlen);
+  memcpy(RTA_DATA(rta), addr, addrlen);
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifinfo;
+    char pad[NLMSG_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifinfo));
+  req.hdr.nlmsg_type = RTM_NEWLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifinfo.ifi_index = index;
+  req.ifinfo.ifi_flags = flags;
+  req.ifinfo.ifi_change = change;
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+PosixError LinkSetMacAddr(int index, const void* addr, int addrlen) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifinfo;
+    char attrbuf[512];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifinfo));
+  req.hdr.nlmsg_type = RTM_NEWLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifinfo.ifi_index = index;
+
+  struct rtattr* rta = reinterpret_cast<struct rtattr*>(
+      reinterpret_cast<int8_t*>(&req) + NLMSG_ALIGN(req.hdr.nlmsg_len));
+  rta->rta_type = IFLA_ADDRESS;
+  rta->rta_len = RTA_LENGTH(addrlen);
+  req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + RTA_LENGTH(addrlen);
+  memcpy(RTA_DATA(rta), addr, addrlen);
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_route_util.h b/test/syscalls/linux/socket_netlink_route_util.h
new file mode 100644
index 000000000..2c018e487
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_route_util.h
@@ -0,0 +1,55 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <vector>
+
+#include "absl/types/optional.h"
+#include "test/syscalls/linux/socket_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+
+struct Link {
+  int index;
+  int16_t type;
+  std::string name;
+};
+
+PosixError DumpLinks(const FileDescriptor& fd, uint32_t seq,
+                     const std::function<void(const struct nlmsghdr* hdr)>& fn);
+
+PosixErrorOr<std::vector<Link>> DumpLinks();
+
+PosixErrorOr<absl::optional<Link>> FindLoopbackLink();
+
+// LinkAddLocalAddr sets IFA_LOCAL attribute on the interface.
+PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
+                            const void* addr, int addrlen);
+
+// LinkChangeFlags changes interface flags. E.g. IFF_UP.
+PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change);
+
+// LinkSetMacAddr sets IFLA_ADDRESS attribute of the interface.
+PosixError LinkSetMacAddr(int index, const void* addr, int addrlen);
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
new file mode 100644
index 000000000..f6ac9d7b8
--- /dev/null
+++ b/test/syscalls/linux/tuntap.cc
@@ -0,0 +1,346 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <linux/capability.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_tun.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_split.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr int kIPLen = 4;
+
+constexpr const char kDevNetTun[] = "/dev/net/tun";
+constexpr const char kTapName[] = "tap0";
+
+constexpr const uint8_t kMacA[ETH_ALEN] = {0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA};
+constexpr const uint8_t kMacB[ETH_ALEN] = {0xBB, 0xBB, 0xBB, 0xBB, 0xBB, 0xBB};
+
+PosixErrorOr<std::set<std::string>> DumpLinkNames() {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  std::set<std::string> names;
+  for (const auto& link : links) {
+    names.emplace(link.name);
+  }
+  return names;
+}
+
+PosixErrorOr<absl::optional<Link>> GetLinkByName(const std::string& name) {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  for (const auto& link : links) {
+    if (link.name == name) {
+      return absl::optional<Link>(link);
+    }
+  }
+  return absl::optional<Link>();
+}
+
+struct pihdr {
+  uint16_t pi_flags;
+  uint16_t pi_protocol;
+} __attribute__((packed));
+
+struct ping_pkt {
+  pihdr pi;
+  struct ethhdr eth;
+  struct iphdr ip;
+  struct icmphdr icmp;
+  char payload[64];
+} __attribute__((packed));
+
+ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
+                          const uint8_t dstmac[ETH_ALEN], const char* dstip) {
+  ping_pkt pkt = {};
+
+  pkt.pi.pi_protocol = htons(ETH_P_IP);
+
+  memcpy(pkt.eth.h_dest, dstmac, sizeof(pkt.eth.h_dest));
+  memcpy(pkt.eth.h_source, srcmac, sizeof(pkt.eth.h_source));
+  pkt.eth.h_proto = htons(ETH_P_IP);
+
+  pkt.ip.ihl = 5;
+  pkt.ip.version = 4;
+  pkt.ip.tos = 0;
+  pkt.ip.tot_len = htons(sizeof(struct iphdr) + sizeof(struct icmphdr) +
+                         sizeof(pkt.payload));
+  pkt.ip.id = 1;
+  pkt.ip.frag_off = 1 << 6;  // Do not fragment
+  pkt.ip.ttl = 64;
+  pkt.ip.protocol = IPPROTO_ICMP;
+  inet_pton(AF_INET, dstip, &pkt.ip.daddr);
+  inet_pton(AF_INET, srcip, &pkt.ip.saddr);
+  pkt.ip.check = IPChecksum(pkt.ip);
+
+  pkt.icmp.type = ICMP_ECHO;
+  pkt.icmp.code = 0;
+  pkt.icmp.checksum = 0;
+  pkt.icmp.un.echo.sequence = 1;
+  pkt.icmp.un.echo.id = 1;
+
+  strncpy(pkt.payload, "abcd", sizeof(pkt.payload));
+  pkt.icmp.checksum = ICMPChecksum(pkt.icmp, pkt.payload, sizeof(pkt.payload));
+
+  return pkt;
+}
+
+struct arp_pkt {
+  pihdr pi;
+  struct ethhdr eth;
+  struct arphdr arp;
+  uint8_t arp_sha[ETH_ALEN];
+  uint8_t arp_spa[kIPLen];
+  uint8_t arp_tha[ETH_ALEN];
+  uint8_t arp_tpa[kIPLen];
+} __attribute__((packed));
+
+std::string CreateArpPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
+                            const uint8_t dstmac[ETH_ALEN], const char* dstip) {
+  std::string buffer;
+  buffer.resize(sizeof(arp_pkt));
+
+  arp_pkt* pkt = reinterpret_cast<arp_pkt*>(&buffer[0]);
+  {
+    pkt->pi.pi_protocol = htons(ETH_P_ARP);
+
+    memcpy(pkt->eth.h_dest, kMacA, sizeof(pkt->eth.h_dest));
+    memcpy(pkt->eth.h_source, kMacB, sizeof(pkt->eth.h_source));
+    pkt->eth.h_proto = htons(ETH_P_ARP);
+
+    pkt->arp.ar_hrd = htons(ARPHRD_ETHER);
+    pkt->arp.ar_pro = htons(ETH_P_IP);
+    pkt->arp.ar_hln = ETH_ALEN;
+    pkt->arp.ar_pln = kIPLen;
+    pkt->arp.ar_op = htons(ARPOP_REPLY);
+
+    memcpy(pkt->arp_sha, srcmac, sizeof(pkt->arp_sha));
+    inet_pton(AF_INET, srcip, pkt->arp_spa);
+    memcpy(pkt->arp_tha, dstmac, sizeof(pkt->arp_tha));
+    inet_pton(AF_INET, dstip, pkt->arp_tpa);
+  }
+  return buffer;
+}
+
+}  // namespace
+
+class TuntapTest : public ::testing::Test {
+ protected:
+  void TearDown() override {
+    if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))) {
+      // Bring back capability if we had dropped it in test case.
+      ASSERT_NO_ERRNO(SetCapability(CAP_NET_ADMIN, true));
+    }
+  }
+};
+
+TEST_F(TuntapTest, CreateInterfaceNoCap) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  ASSERT_NO_ERRNO(SetCapability(CAP_NET_ADMIN, false));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr = {};
+  ifr.ifr_flags = IFF_TAP;
+  strncpy(ifr.ifr_name, kTapName, IFNAMSIZ);
+
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallFailsWithErrno(EPERM));
+}
+
+TEST_F(TuntapTest, CreateFixedNameInterface) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr_set = {};
+  ifr_set.ifr_flags = IFF_TAP;
+  strncpy(ifr_set.ifr_name, kTapName, IFNAMSIZ);
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr_set),
+              SyscallSucceedsWithValue(0));
+
+  struct ifreq ifr_get = {};
+  EXPECT_THAT(ioctl(fd.get(), TUNGETIFF, &ifr_get),
+              SyscallSucceedsWithValue(0));
+
+  struct ifreq ifr_expect = ifr_set;
+  // See __tun_chr_ioctl() in net/drivers/tun.c.
+  ifr_expect.ifr_flags |= IFF_NOFILTER;
+
+  EXPECT_THAT(DumpLinkNames(),
+              IsPosixErrorOkAndHolds(::testing::Contains(kTapName)));
+  EXPECT_THAT(memcmp(&ifr_expect, &ifr_get, sizeof(ifr_get)), ::testing::Eq(0));
+}
+
+TEST_F(TuntapTest, CreateInterface) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr = {};
+  ifr.ifr_flags = IFF_TAP;
+  // Empty ifr.ifr_name. Let kernel assign.
+
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallSucceedsWithValue(0));
+
+  struct ifreq ifr_get = {};
+  EXPECT_THAT(ioctl(fd.get(), TUNGETIFF, &ifr_get),
+              SyscallSucceedsWithValue(0));
+
+  std::string ifname = ifr_get.ifr_name;
+  EXPECT_THAT(ifname, ::testing::StartsWith("tap"));
+  EXPECT_THAT(DumpLinkNames(),
+              IsPosixErrorOkAndHolds(::testing::Contains(ifname)));
+}
+
+TEST_F(TuntapTest, InvalidReadWrite) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  char buf[128] = {};
+  EXPECT_THAT(read(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EBADFD));
+  EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EBADFD));
+}
+
+TEST_F(TuntapTest, WriteToDownDevice) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // FIXME: gVisor always creates enabled/up'd interfaces.
+  SKIP_IF(IsRunningOnGvisor());
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  // Device created should be down by default.
+  struct ifreq ifr = {};
+  ifr.ifr_flags = IFF_TAP;
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallSucceedsWithValue(0));
+
+  char buf[128] = {};
+  EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EIO));
+}
+
+// This test sets up a TAP device and pings kernel by sending ICMP echo request.
+//
+// It works as the following:
+// * Open /dev/net/tun, and create kTapName interface.
+// * Use rtnetlink to do initial setup of the interface:
+//   * Assign IP address 10.0.0.1/24 to kernel.
+//   * MAC address: kMacA
+//   * Bring up the interface.
+// * Send an ICMP echo reqest (ping) packet from 10.0.0.2 (kMacB) to kernel.
+// * Loop to receive packets from TAP device/fd:
+//   * If packet is an ICMP echo reply, it stops and passes the test.
+//   * If packet is an ARP request, it responds with canned reply and resends
+//   the
+//     ICMP request packet.
+TEST_F(TuntapTest, PingKernel) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // Interface creation.
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr_set = {};
+  ifr_set.ifr_flags = IFF_TAP;
+  strncpy(ifr_set.ifr_name, kTapName, IFNAMSIZ);
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr_set),
+              SyscallSucceedsWithValue(0));
+
+  absl::optional<Link> link =
+      ASSERT_NO_ERRNO_AND_VALUE(GetLinkByName(kTapName));
+  ASSERT_TRUE(link.has_value());
+
+  // Interface setup.
+  struct in_addr addr;
+  inet_pton(AF_INET, "10.0.0.1", &addr);
+  EXPECT_NO_ERRNO(LinkAddLocalAddr(link->index, AF_INET, /*prefixlen=*/24,
+                                   &addr, sizeof(addr)));
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME: gVisor doesn't support setting MAC address on interfaces yet.
+    EXPECT_NO_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA)));
+
+    // FIXME: gVisor always creates enabled/up'd interfaces.
+    EXPECT_NO_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP));
+  }
+
+  ping_pkt ping_req = CreatePingPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
+  std::string arp_rep = CreateArpPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
+
+  // Send ping, this would trigger an ARP request on Linux.
+  EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)),
+              SyscallSucceedsWithValue(sizeof(ping_req)));
+
+  // Receive loop to process inbound packets.
+  struct inpkt {
+    union {
+      pihdr pi;
+      ping_pkt ping;
+      arp_pkt arp;
+    };
+  };
+  while (1) {
+    inpkt r = {};
+    int n = read(fd.get(), &r, sizeof(r));
+    EXPECT_THAT(n, SyscallSucceeds());
+
+    if (n < sizeof(pihdr)) {
+      std::cerr << "Ignored packet, protocol: " << r.pi.pi_protocol
+                << " len: " << n << std::endl;
+      continue;
+    }
+
+    // Process ARP packet.
+    if (n >= sizeof(arp_pkt) && r.pi.pi_protocol == htons(ETH_P_ARP)) {
+      // Respond with canned ARP reply.
+      EXPECT_THAT(write(fd.get(), arp_rep.data(), arp_rep.size()),
+                  SyscallSucceedsWithValue(arp_rep.size()));
+      // First ping request might have been dropped due to mac address not in
+      // ARP cache. Send it again.
+      EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)),
+                  SyscallSucceedsWithValue(sizeof(ping_req)));
+    }
+
+    // Process ping response packet.
+    if (n >= sizeof(ping_pkt) && r.pi.pi_protocol == ping_req.pi.pi_protocol &&
+        r.ping.ip.protocol == ping_req.ip.protocol &&
+        !memcmp(&r.ping.ip.saddr, &ping_req.ip.daddr, kIPLen) &&
+        !memcmp(&r.ping.ip.daddr, &ping_req.ip.saddr, kIPLen) &&
+        r.ping.icmp.type == 0 && r.ping.icmp.code == 0) {
+      // Ends and passes the test.
+      break;
+    }
+  }
+}
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From 75d7f76a6cd81d77f5ce70440c1d95c0296b15ba Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Mon, 11 Nov 2019 20:26:38 -0800
Subject: arm64: add a travis build ci

Build runsc and run "runsc do ls".

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 .travis.yml                                  | 19 ++++++++++++++++++
 Dockerfile                                   | 11 ++++++-----
 Makefile                                     |  5 ++++-
 test/syscalls/linux/32bit.cc                 |  2 +-
 test/syscalls/linux/rseq/uapi.h              | 29 ++++++++++++----------------
 test/syscalls/linux/udp_socket_test_cases.cc |  4 ++++
 6 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index e69de29bb..a2a260538 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -0,0 +1,19 @@
+language: minimal
+sudo: required
+dist: xenial
+cache:
+  directories:
+    - /home/travis/.cache/bazel/
+services:
+  - docker
+matrix:
+  include:
+   - os: linux
+     arch: amd64
+     env: RUNSC_PATH=./bazel-bin/runsc/linux_amd64_pure_stripped/runsc
+   - os: linux
+     arch: arm64
+     env: RUNSC_PATH=./bazel-bin/runsc/linux_arm64_pure_stripped/runsc
+script:
+   - uname -a
+   - make DOCKER_RUN_OPTIONS="" BAZEL_OPTIONS="build runsc:runsc" bazel && $RUNSC_PATH --alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do ls
diff --git a/Dockerfile b/Dockerfile
index 738623023..2bfdfec6c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,9 @@
-FROM ubuntu:bionic
+FROM fedora:31
 
-RUN apt-get update && apt-get install -y curl gnupg2 git python python3 python3-distutils python3-pip
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
-RUN apt-get update && apt-get install -y bazel && apt-get clean
+RUN  dnf install -y dnf-plugins-core && dnf copr enable -y vbatts/bazel
+
+RUN dnf install -y bazel2 git gcc make golang gcc-c++ glibc-devel python3 which python3-pip python3-devel libffi-devel openssl-devel pkg-config glibc-static
+
+RUN pip install pycparser
 
 WORKDIR /gvisor
diff --git a/Makefile b/Makefile
index a73bc0c36..d9531fbd5 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,9 @@ UID := $(shell id -u ${USER})
 GID := $(shell id -g ${USER})
 GVISOR_BAZEL_CACHE := $(shell readlink -f ~/.cache/bazel/)
 
+# The  --privileged is required to run tests.
+DOCKER_RUN_OPTIONS ?= --privileged
+
 all: runsc
 
 docker-build:
@@ -19,7 +22,7 @@ bazel-server-start: docker-build
 		-v "$(CURDIR):$(CURDIR)" \
 		--workdir "$(CURDIR)" \
 		--tmpfs /tmp:rw,exec \
-		--privileged \
+		$(DOCKER_RUN_OPTIONS) \
 		gvisor-bazel \
 		sh -c "while :; do sleep 100; done" && \
 	docker exec --user 0:0 -i gvisor-bazel sh -c "groupadd --gid $(GID) --non-unique gvisor && useradd --uid $(UID) --non-unique --gid $(GID) -d $(HOME) gvisor"
diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index c47a05181..3c825477c 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -74,7 +74,7 @@ void ExitGroup32(const char instruction[2], int code) {
       "int $3\n"
       :
       : [ code ] "m"(code), [ ip ] "d"(m.ptr())
-      : "rax", "rbx", "rsp");
+      : "rax", "rbx");
 }
 
 constexpr int kExitCode = 42;
diff --git a/test/syscalls/linux/rseq/uapi.h b/test/syscalls/linux/rseq/uapi.h
index e3ff0579a..ca1d67691 100644
--- a/test/syscalls/linux/rseq/uapi.h
+++ b/test/syscalls/linux/rseq/uapi.h
@@ -15,14 +15,9 @@
 #ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
 #define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
 
-// User-kernel ABI for restartable sequences.
+#include <stdint.h>
 
-// Standard types.
-//
-// N.B. This header will be included in targets that do have the standard
-// library, so we can't shadow the standard type names.
-using __u32 = __UINT32_TYPE__;
-using __u64 = __UINT64_TYPE__;
+// User-kernel ABI for restartable sequences.
 
 #ifdef __x86_64__
 // Syscall numbers.
@@ -32,20 +27,20 @@ constexpr int kRseqSyscall = 334;
 #endif  // __x86_64__
 
 struct rseq_cs {
-  __u32 version;
-  __u32 flags;
-  __u64 start_ip;
-  __u64 post_commit_offset;
-  __u64 abort_ip;
-} __attribute__((aligned(4 * sizeof(__u64))));
+  uint32_t version;
+  uint32_t flags;
+  uint64_t start_ip;
+  uint64_t post_commit_offset;
+  uint64_t abort_ip;
+} __attribute__((aligned(4 * sizeof(uint64_t))));
 
 // N.B. alignment is enforced by the kernel.
 struct rseq {
-  __u32 cpu_id_start;
-  __u32 cpu_id;
+  uint32_t cpu_id_start;
+  uint32_t cpu_id;
   struct rseq_cs* rseq_cs;
-  __u32 flags;
-} __attribute__((aligned(4 * sizeof(__u64))));
+  uint32_t flags;
+} __attribute__((aligned(4 * sizeof(uint64_t))));
 
 constexpr int kRseqFlagUnregister = 1 << 0;
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 57b1a357c..740c7986d 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -21,6 +21,10 @@
 #include <sys/socket.h>
 #include <sys/types.h>
 
+#ifndef SIOCGSTAMP
+#include <linux/sockios.h>
+#endif
+
 #include "gtest/gtest.h"
 #include "absl/base/macros.h"
 #include "absl/time/clock.h"
-- 
cgit v1.2.3


From c37b196455e8b3816298e3eea98e4ee2dab8d368 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Mon, 24 Feb 2020 10:31:01 -0800
Subject: Add support for tearing down protocol dispatchers and TIME_WAIT
 endpoints.

Protocol dispatchers were previously leaked. Bypassing TIME_WAIT is required to
test this change.

Also fix a race when a socket in SYN-RCVD is closed. This is also required to
test this change.

PiperOrigin-RevId: 296922548
---
 pkg/tcpip/adapters/gonet/gonet_test.go | 63 ++++++++++++++++++++++++++--------
 pkg/tcpip/network/arp/arp.go           | 20 +++++++----
 pkg/tcpip/network/ipv4/ipv4.go         |  6 ++++
 pkg/tcpip/network/ipv6/ipv6.go         |  6 ++++
 pkg/tcpip/stack/registration.go        | 23 ++++++++++---
 pkg/tcpip/stack/stack.go               | 14 +++++++-
 pkg/tcpip/stack/stack_test.go          |  6 ++++
 pkg/tcpip/stack/transport_demuxer.go   | 20 -----------
 pkg/tcpip/stack/transport_test.go      | 15 +++++++-
 pkg/tcpip/tcpip.go                     |  8 ++++-
 pkg/tcpip/transport/icmp/endpoint.go   |  5 +++
 pkg/tcpip/transport/icmp/protocol.go   | 16 ++++++---
 pkg/tcpip/transport/packet/endpoint.go |  5 +++
 pkg/tcpip/transport/raw/endpoint.go    |  5 +++
 pkg/tcpip/transport/tcp/accept.go      |  9 ++++-
 pkg/tcpip/transport/tcp/connect.go     |  4 +--
 pkg/tcpip/transport/tcp/dispatcher.go  | 31 ++++++++++++++++-
 pkg/tcpip/transport/tcp/endpoint.go    | 33 ++++++++++++++++--
 pkg/tcpip/transport/tcp/protocol.go    | 14 ++++++--
 pkg/tcpip/transport/udp/endpoint.go    |  5 +++
 pkg/tcpip/transport/udp/protocol.go    | 14 +++++---
 21 files changed, 256 insertions(+), 66 deletions(-)

diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go
index ea0a0409a..3c552988a 100644
--- a/pkg/tcpip/adapters/gonet/gonet_test.go
+++ b/pkg/tcpip/adapters/gonet/gonet_test.go
@@ -127,6 +127,10 @@ func TestCloseReader(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 
@@ -175,6 +179,10 @@ func TestCloseReaderWithForwarder(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
@@ -225,30 +233,21 @@ func TestCloseRead(t *testing.T) {
 	if terr != nil {
 		t.Fatalf("newLoopbackStack() = %v", terr)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
 
 	fwd := tcp.NewForwarder(s, 30000, 10, func(r *tcp.ForwarderRequest) {
 		var wq waiter.Queue
-		ep, err := r.CreateEndpoint(&wq)
+		_, err := r.CreateEndpoint(&wq)
 		if err != nil {
 			t.Fatalf("r.CreateEndpoint() = %v", err)
 		}
-		defer ep.Close()
-		r.Complete(false)
-
-		c := NewTCPConn(&wq, ep)
-
-		buf := make([]byte, 256)
-		n, e := c.Read(buf)
-		if e != nil || string(buf[:n]) != "abc123" {
-			t.Fatalf("c.Read() = (%d, %v), want (6, nil)", n, e)
-		}
-
-		if n, e = c.Write([]byte("abc123")); e != nil {
-			t.Errorf("c.Write() = (%d, %v), want (6, nil)", n, e)
-		}
+		// Endpoint will be closed in deferred s.Close (above).
 	})
 
 	s.SetTransportProtocolHandler(tcp.ProtocolNumber, fwd.HandlePacket)
@@ -278,6 +277,10 @@ func TestCloseWrite(t *testing.T) {
 	if terr != nil {
 		t.Fatalf("newLoopbackStack() = %v", terr)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
@@ -334,6 +337,10 @@ func TestUDPForwarder(t *testing.T) {
 	if terr != nil {
 		t.Fatalf("newLoopbackStack() = %v", terr)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	ip1 := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
 	addr1 := tcpip.FullAddress{NICID, ip1, 11211}
@@ -391,6 +398,10 @@ func TestDeadlineChange(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 
@@ -440,6 +451,10 @@ func TestPacketConnTransfer(t *testing.T) {
 	if e != nil {
 		t.Fatalf("newLoopbackStack() = %v", e)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	ip1 := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
 	addr1 := tcpip.FullAddress{NICID, ip1, 11211}
@@ -492,6 +507,10 @@ func TestConnectedPacketConnTransfer(t *testing.T) {
 	if e != nil {
 		t.Fatalf("newLoopbackStack() = %v", e)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	ip := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
 	addr := tcpip.FullAddress{NICID, ip, 11211}
@@ -562,6 +581,8 @@ func makePipe() (c1, c2 net.Conn, stop func(), err error) {
 	stop = func() {
 		c1.Close()
 		c2.Close()
+		s.Close()
+		s.Wait()
 	}
 
 	if err := l.Close(); err != nil {
@@ -624,6 +645,10 @@ func TestTCPDialError(t *testing.T) {
 	if e != nil {
 		t.Fatalf("newLoopbackStack() = %v", e)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	ip := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
 	addr := tcpip.FullAddress{NICID, ip, 11211}
@@ -641,6 +666,10 @@ func TestDialContextTCPCanceled(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
@@ -659,6 +688,10 @@ func TestDialContextTCPTimeout(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 4da13c5df..e9fcc89a8 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -148,12 +148,12 @@ func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWi
 	}, nil
 }
 
-// LinkAddressProtocol implements stack.LinkAddressResolver.
+// LinkAddressProtocol implements stack.LinkAddressResolver.LinkAddressProtocol.
 func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
 	return header.IPv4ProtocolNumber
 }
 
-// LinkAddressRequest implements stack.LinkAddressResolver.
+// LinkAddressRequest implements stack.LinkAddressResolver.LinkAddressRequest.
 func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error {
 	r := &stack.Route{
 		RemoteLinkAddress: broadcastMAC,
@@ -172,7 +172,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 	})
 }
 
-// ResolveStaticAddress implements stack.LinkAddressResolver.
+// ResolveStaticAddress implements stack.LinkAddressResolver.ResolveStaticAddress.
 func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
 	if addr == header.IPv4Broadcast {
 		return broadcastMAC, true
@@ -183,16 +183,22 @@ func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bo
 	return tcpip.LinkAddress([]byte(nil)), false
 }
 
-// SetOption implements NetworkProtocol.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+// SetOption implements stack.NetworkProtocol.SetOption.
+func (*protocol) SetOption(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
-// Option implements NetworkProtocol.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+// Option implements stack.NetworkProtocol.Option.
+func (*protocol) Option(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
 var broadcastMAC = tcpip.LinkAddress([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff})
 
 // NewProtocol returns an ARP network protocol.
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 6597e6781..4f1742938 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -473,6 +473,12 @@ func (p *protocol) DefaultTTL() uint8 {
 	return uint8(atomic.LoadUint32(&p.defaultTTL))
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
 // calculateMTU calculates the network-layer payload MTU based on the link-layer
 // payload mtu.
 func calculateMTU(mtu uint32) uint32 {
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 180a480fd..9aef5234b 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -265,6 +265,12 @@ func (p *protocol) DefaultTTL() uint8 {
 	return uint8(atomic.LoadUint32(&p.defaultTTL))
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
 // calculateMTU calculates the network-layer payload MTU based on the link-layer
 // payload mtu.
 func calculateMTU(mtu uint32) uint32 {
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index d83adf0ec..f9fd8f18f 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -74,10 +74,11 @@ type TransportEndpoint interface {
 	// HandleControlPacket takes ownership of pkt.
 	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt tcpip.PacketBuffer)
 
-	// Close puts the endpoint in a closed state and frees all resources
-	// associated with it. This cleanup may happen asynchronously. Wait can
-	// be used to block on this asynchronous cleanup.
-	Close()
+	// Abort initiates an expedited endpoint teardown. It puts the endpoint
+	// in a closed state and frees all resources associated with it. This
+	// cleanup may happen asynchronously. Wait can be used to block on this
+	// asynchronous cleanup.
+	Abort()
 
 	// Wait waits for any worker goroutines owned by the endpoint to stop.
 	//
@@ -160,6 +161,13 @@ type TransportProtocol interface {
 	// Option returns an error if the option is not supported or the
 	// provided option value is invalid.
 	Option(option interface{}) *tcpip.Error
+
+	// Close requests that any worker goroutines owned by the protocol
+	// stop.
+	Close()
+
+	// Wait waits for any worker goroutines owned by the protocol to stop.
+	Wait()
 }
 
 // TransportDispatcher contains the methods used by the network stack to deliver
@@ -293,6 +301,13 @@ type NetworkProtocol interface {
 	// Option returns an error if the option is not supported or the
 	// provided option value is invalid.
 	Option(option interface{}) *tcpip.Error
+
+	// Close requests that any worker goroutines owned by the protocol
+	// stop.
+	Close()
+
+	// Wait waits for any worker goroutines owned by the protocol to stop.
+	Wait()
 }
 
 // NetworkDispatcher contains the methods used by the network stack to deliver
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 900dd46c5..ebb6c5e3b 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -1446,7 +1446,13 @@ func (s *Stack) RestoreCleanupEndpoints(es []TransportEndpoint) {
 // Endpoints created or modified during this call may not get closed.
 func (s *Stack) Close() {
 	for _, e := range s.RegisteredEndpoints() {
-		e.Close()
+		e.Abort()
+	}
+	for _, p := range s.transportProtocols {
+		p.proto.Close()
+	}
+	for _, p := range s.networkProtocols {
+		p.Close()
 	}
 }
 
@@ -1464,6 +1470,12 @@ func (s *Stack) Wait() {
 	for _, e := range s.CleanupEndpoints() {
 		e.Wait()
 	}
+	for _, p := range s.transportProtocols {
+		p.proto.Wait()
+	}
+	for _, p := range s.networkProtocols {
+		p.Wait()
+	}
 
 	s.mu.RLock()
 	defer s.mu.RUnlock()
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 18016e7db..edf6bec52 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -235,6 +235,12 @@ func (f *fakeNetworkProtocol) Option(option interface{}) *tcpip.Error {
 	}
 }
 
+// Close implements TransportProtocol.Close.
+func (*fakeNetworkProtocol) Close() {}
+
+// Wait implements TransportProtocol.Wait.
+func (*fakeNetworkProtocol) Wait() {}
+
 func fakeNetFactory() stack.NetworkProtocol {
 	return &fakeNetworkProtocol{}
 }
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index d686e6eb8..778c0a4d6 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -306,26 +306,6 @@ func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, p
 	ep.mu.RUnlock() // Don't use defer for performance reasons.
 }
 
-// Close implements stack.TransportEndpoint.Close.
-func (ep *multiPortEndpoint) Close() {
-	ep.mu.RLock()
-	eps := append([]TransportEndpoint(nil), ep.endpointsArr...)
-	ep.mu.RUnlock()
-	for _, e := range eps {
-		e.Close()
-	}
-}
-
-// Wait implements stack.TransportEndpoint.Wait.
-func (ep *multiPortEndpoint) Wait() {
-	ep.mu.RLock()
-	eps := append([]TransportEndpoint(nil), ep.endpointsArr...)
-	ep.mu.RUnlock()
-	for _, e := range eps {
-		e.Wait()
-	}
-}
-
 // singleRegisterEndpoint tries to add an endpoint to the multiPortEndpoint
 // list. The list might be empty already.
 func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, reusePort bool) *tcpip.Error {
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 869c69a6d..5d1da2f8b 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -61,6 +61,10 @@ func newFakeTransportEndpoint(s *stack.Stack, proto *fakeTransportProtocol, netP
 	return &fakeTransportEndpoint{stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID}
 }
 
+func (f *fakeTransportEndpoint) Abort() {
+	f.Close()
+}
+
 func (f *fakeTransportEndpoint) Close() {
 	f.route.Release()
 }
@@ -272,7 +276,7 @@ func (f *fakeTransportProtocol) NewEndpoint(stack *stack.Stack, netProto tcpip.N
 	return newFakeTransportEndpoint(stack, f, netProto, stack.UniqueID()), nil
 }
 
-func (f *fakeTransportProtocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+func (*fakeTransportProtocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	return nil, tcpip.ErrUnknownProtocol
 }
 
@@ -310,6 +314,15 @@ func (f *fakeTransportProtocol) Option(option interface{}) *tcpip.Error {
 	}
 }
 
+// Abort implements TransportProtocol.Abort.
+func (*fakeTransportProtocol) Abort() {}
+
+// Close implements tcpip.Endpoint.Close.
+func (*fakeTransportProtocol) Close() {}
+
+// Wait implements TransportProtocol.Wait.
+func (*fakeTransportProtocol) Wait() {}
+
 func fakeTransFactory() stack.TransportProtocol {
 	return &fakeTransportProtocol{}
 }
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index ce5527391..3dc5d87d6 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -341,9 +341,15 @@ type ControlMessages struct {
 // networking stack.
 type Endpoint interface {
 	// Close puts the endpoint in a closed state and frees all resources
-	// associated with it.
+	// associated with it. Close initiates the teardown process, the
+	// Endpoint may not be fully closed when Close returns.
 	Close()
 
+	// Abort initiates an expedited endpoint teardown. As compared to
+	// Close, Abort prioritizes closing the Endpoint quickly over cleanly.
+	// Abort is best effort; implementing Abort with Close is acceptable.
+	Abort()
+
 	// Read reads data from the endpoint and optionally returns the sender.
 	//
 	// This method does not block if there is no data pending. It will also
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 42afb3f5b..426da1ee6 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -96,6 +96,11 @@ func (e *endpoint) UniqueID() uint64 {
 	return e.uniqueID
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
 // Close puts the endpoint in a closed state and frees all resources
 // associated with it.
 func (e *endpoint) Close() {
diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go
index 9ce500e80..113d92901 100644
--- a/pkg/tcpip/transport/icmp/protocol.go
+++ b/pkg/tcpip/transport/icmp/protocol.go
@@ -104,20 +104,26 @@ func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error)
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (p *protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
+func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
 	return true
 }
 
-// SetOption implements TransportProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+// SetOption implements stack.TransportProtocol.SetOption.
+func (*protocol) SetOption(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
-// Option implements TransportProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+// Option implements stack.TransportProtocol.Option.
+func (*protocol) Option(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
 // NewProtocol4 returns an ICMPv4 transport protocol.
 func NewProtocol4() stack.TransportProtocol {
 	return &protocol{ProtocolNumber4}
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index fc5bc69fa..5722815e9 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -98,6 +98,11 @@ func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumb
 	return ep, nil
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
 // Close implements tcpip.Endpoint.Close.
 func (ep *endpoint) Close() {
 	ep.mu.Lock()
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index ee9c4c58b..2ef5fac76 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -121,6 +121,11 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt
 	return e, nil
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
 // Close implements tcpip.Endpoint.Close.
 func (e *endpoint) Close() {
 	e.mu.Lock()
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 08afb7c17..13e383ffc 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -299,6 +299,13 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept)
 	if err := h.execute(); err != nil {
 		ep.Close()
+		// Wake up any waiters. This is strictly not required normally
+		// as a socket that was never accepted can't really have any
+		// registered waiters except when stack.Wait() is called which
+		// waits for all registered endpoints to stop and expects an
+		// EventHUp.
+		ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+
 		if l.listenEP != nil {
 			l.removePendingEndpoint(ep)
 		}
@@ -607,7 +614,7 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 		e.mu.Unlock()
 
 		// Notify waiters that the endpoint is shutdown.
-		e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut)
+		e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut | waiter.EventHUp | waiter.EventErr)
 	}()
 
 	s := sleep.Sleeper{}
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 5c5397823..7730e6445 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -1372,7 +1372,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 					e.snd.updateMaxPayloadSize(mtu, count)
 				}
 
-				if n&notifyReset != 0 {
+				if n&notifyReset != 0 || n&notifyAbort != 0 {
 					return tcpip.ErrConnectionAborted
 				}
 
@@ -1655,7 +1655,7 @@ func (e *endpoint) doTimeWait() (twReuse func()) {
 			}
 		case notification:
 			n := e.fetchNotifications()
-			if n&notifyClose != 0 {
+			if n&notifyClose != 0 || n&notifyAbort != 0 {
 				return nil
 			}
 			if n&notifyDrain != 0 {
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
index e18012ac0..d792b07d6 100644
--- a/pkg/tcpip/transport/tcp/dispatcher.go
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -68,17 +68,28 @@ func (q *epQueue) empty() bool {
 type processor struct {
 	epQ              epQueue
 	newEndpointWaker sleep.Waker
+	closeWaker       sleep.Waker
 	id               int
+	wg               sync.WaitGroup
 }
 
 func newProcessor(id int) *processor {
 	p := &processor{
 		id: id,
 	}
+	p.wg.Add(1)
 	go p.handleSegments()
 	return p
 }
 
+func (p *processor) close() {
+	p.closeWaker.Assert()
+}
+
+func (p *processor) wait() {
+	p.wg.Wait()
+}
+
 func (p *processor) queueEndpoint(ep *endpoint) {
 	// Queue an endpoint for processing by the processor goroutine.
 	p.epQ.enqueue(ep)
@@ -87,11 +98,17 @@ func (p *processor) queueEndpoint(ep *endpoint) {
 
 func (p *processor) handleSegments() {
 	const newEndpointWaker = 1
+	const closeWaker = 2
 	s := sleep.Sleeper{}
 	s.AddWaker(&p.newEndpointWaker, newEndpointWaker)
+	s.AddWaker(&p.closeWaker, closeWaker)
 	defer s.Done()
 	for {
-		s.Fetch(true)
+		id, ok := s.Fetch(true)
+		if ok && id == closeWaker {
+			p.wg.Done()
+			return
+		}
 		for ep := p.epQ.dequeue(); ep != nil; ep = p.epQ.dequeue() {
 			if ep.segmentQueue.empty() {
 				continue
@@ -160,6 +177,18 @@ func newDispatcher(nProcessors int) *dispatcher {
 	}
 }
 
+func (d *dispatcher) close() {
+	for _, p := range d.processors {
+		p.close()
+	}
+}
+
+func (d *dispatcher) wait() {
+	for _, p := range d.processors {
+		p.wait()
+	}
+}
+
 func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
 	ep := stackEP.(*endpoint)
 	s := newSegment(r, id, pkt)
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index f2be0e651..f1ad19dac 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -121,6 +121,8 @@ const (
 	notifyDrain
 	notifyReset
 	notifyResetByPeer
+	// notifyAbort is a request for an expedited teardown.
+	notifyAbort
 	notifyKeepaliveChanged
 	notifyMSSChanged
 	// notifyTickleWorker is used to tickle the protocol main loop during a
@@ -785,6 +787,24 @@ func (e *endpoint) notifyProtocolGoroutine(n uint32) {
 	}
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	// The abort notification is not processed synchronously, so no
+	// synchronization is needed.
+	//
+	// If the endpoint becomes connected after this check, we still close
+	// the endpoint. This worst case results in a slower abort.
+	//
+	// If the endpoint disconnected after the check, nothing needs to be
+	// done, so sending a notification which will potentially be ignored is
+	// fine.
+	if e.EndpointState().connected() {
+		e.notifyProtocolGoroutine(notifyAbort)
+		return
+	}
+	e.Close()
+}
+
 // Close puts the endpoint in a closed state and frees all resources associated
 // with it. It must be called only once and with no other concurrent calls to
 // the endpoint.
@@ -829,9 +849,18 @@ func (e *endpoint) closeNoShutdown() {
 	// Either perform the local cleanup or kick the worker to make sure it
 	// knows it needs to cleanup.
 	tcpip.AddDanglingEndpoint(e)
-	if !e.workerRunning {
+	switch e.EndpointState() {
+	// Sockets in StateSynRecv state(passive connections) are closed when
+	// the handshake fails or if the listening socket is closed while
+	// handshake was in progress. In such cases the handshake goroutine
+	// is already gone by the time Close is called and we need to cleanup
+	// here.
+	case StateInitial, StateBound, StateSynRecv:
 		e.cleanupLocked()
-	} else {
+		e.setEndpointState(StateClose)
+	case StateError, StateClose:
+		// do nothing.
+	default:
 		e.workerCleanup = true
 		e.notifyProtocolGoroutine(notifyClose)
 	}
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 958c06fa7..73098d904 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -194,7 +194,7 @@ func replyWithReset(s *segment) {
 	sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, flags, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */)
 }
 
-// SetOption implements TransportProtocol.SetOption.
+// SetOption implements stack.TransportProtocol.SetOption.
 func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 	switch v := option.(type) {
 	case SACKEnabled:
@@ -269,7 +269,7 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 	}
 }
 
-// Option implements TransportProtocol.Option.
+// Option implements stack.TransportProtocol.Option.
 func (p *protocol) Option(option interface{}) *tcpip.Error {
 	switch v := option.(type) {
 	case *SACKEnabled:
@@ -331,6 +331,16 @@ func (p *protocol) Option(option interface{}) *tcpip.Error {
 	}
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (p *protocol) Close() {
+	p.dispatcher.close()
+}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (p *protocol) Wait() {
+	p.dispatcher.wait()
+}
+
 // NewProtocol returns a TCP transport protocol.
 func NewProtocol() stack.TransportProtocol {
 	return &protocol{
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index eff7f3600..1c6a600b8 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -186,6 +186,11 @@ func (e *endpoint) UniqueID() uint64 {
 	return e.uniqueID
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
 // Close puts the endpoint in a closed state and frees all resources
 // associated with it.
 func (e *endpoint) Close() {
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 259c3072a..8df089d22 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -180,16 +180,22 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 	return true
 }
 
-// SetOption implements TransportProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+// SetOption implements stack.TransportProtocol.SetOption.
+func (*protocol) SetOption(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
-// Option implements TransportProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+// Option implements stack.TransportProtocol.Option.
+func (*protocol) Option(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
 // NewProtocol returns a UDP transport protocol.
 func NewProtocol() stack.TransportProtocol {
 	return &protocol{}
-- 
cgit v1.2.3


From ededa90d07e4df3eb3fe8a52a0afbcdaf82e8df5 Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Mon, 24 Feb 2020 14:21:18 -0800
Subject: Internal change.

PiperOrigin-RevId: 296972565
---
 kokoro/runtime_tests/go1.12.cfg       | 10 ++++++++++
 kokoro/runtime_tests/java11.cfg       | 10 ++++++++++
 kokoro/runtime_tests/nodejs12.4.0.cfg | 10 ++++++++++
 kokoro/runtime_tests/php7.3.6.cfg     | 10 ++++++++++
 kokoro/runtime_tests/python3.7.3.cfg  | 10 ++++++++++
 scripts/common_build.sh               | 12 ++++++++++--
 6 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/kokoro/runtime_tests/go1.12.cfg b/kokoro/runtime_tests/go1.12.cfg
index 164ddc18f..fd4911e88 100644
--- a/kokoro/runtime_tests/go1.12.cfg
+++ b/kokoro/runtime_tests/go1.12.cfg
@@ -4,3 +4,13 @@ env_vars {
   key: "RUNTIME_TEST_NAME"
   value: "go1.12"
 }
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+    regex: "**/runsc"
+    regex: "**/runsc.*"
+  }
+}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/java11.cfg b/kokoro/runtime_tests/java11.cfg
index 4957d4794..7f8611a08 100644
--- a/kokoro/runtime_tests/java11.cfg
+++ b/kokoro/runtime_tests/java11.cfg
@@ -4,3 +4,13 @@ env_vars {
   key: "RUNTIME_TEST_NAME"
   value: "java11"
 }
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+    regex: "**/runsc"
+    regex: "**/runsc.*"
+  }
+}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/nodejs12.4.0.cfg b/kokoro/runtime_tests/nodejs12.4.0.cfg
index 1df343f95..c67ad5567 100644
--- a/kokoro/runtime_tests/nodejs12.4.0.cfg
+++ b/kokoro/runtime_tests/nodejs12.4.0.cfg
@@ -4,3 +4,13 @@ env_vars {
   key: "RUNTIME_TEST_NAME"
   value: "nodejs12.4.0"
 }
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+    regex: "**/runsc"
+    regex: "**/runsc.*"
+  }
+}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/php7.3.6.cfg b/kokoro/runtime_tests/php7.3.6.cfg
index 8e3667125..f266c5e26 100644
--- a/kokoro/runtime_tests/php7.3.6.cfg
+++ b/kokoro/runtime_tests/php7.3.6.cfg
@@ -4,3 +4,13 @@ env_vars {
   key: "RUNTIME_TEST_NAME"
   value: "php7.3.6"
 }
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+    regex: "**/runsc"
+    regex: "**/runsc.*"
+  }
+}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/python3.7.3.cfg b/kokoro/runtime_tests/python3.7.3.cfg
index 0ca70d5bb..574add152 100644
--- a/kokoro/runtime_tests/python3.7.3.cfg
+++ b/kokoro/runtime_tests/python3.7.3.cfg
@@ -4,3 +4,13 @@ env_vars {
   key: "RUNTIME_TEST_NAME"
   value: "python3.7.3"
 }
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+    regex: "**/runsc"
+    regex: "**/runsc.*"
+  }
+}
\ No newline at end of file
diff --git a/scripts/common_build.sh b/scripts/common_build.sh
index ae8b67383..3be0bb21c 100755
--- a/scripts/common_build.sh
+++ b/scripts/common_build.sh
@@ -70,7 +70,9 @@ function collect_logs() {
     for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do
       junitparser merge `find $d -name test.xml` $d/test.xml
       cat $d/shard_*_of_*/test.log > $d/test.log
-      ls -l $d/shard_*_of_*/test.outputs/outputs.zip && zip -r -1 $d/outputs.zip $d/shard_*_of_*/test.outputs/outputs.zip
+      if ls -l $d/shard_*_of_*/test.outputs/outputs.zip 2>/dev/null; then
+        zip -r -1 "$d/outputs.zip" $d/shard_*_of_*/test.outputs/outputs.zip
+      fi
     done
     find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf
     # Move test logs to Kokoro directory. tar is used to conveniently perform
@@ -90,7 +92,13 @@ function collect_logs() {
           echo "    gsutil cp gs://gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive} /tmp"
           echo "    https://storage.cloud.google.com/gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive}"
         fi
-        tar --create --gzip --file="${KOKORO_ARTIFACTS_DIR}/${archive}" -C "${RUNSC_LOGS_DIR}" .
+        time tar \
+          --verbose \
+          --create \
+          --gzip \
+          --file="${KOKORO_ARTIFACTS_DIR}/${archive}" \
+          --directory "${RUNSC_LOGS_DIR}" \
+          .
       fi
     fi
   fi
-- 
cgit v1.2.3


From 160d5751ab6a06c22aed7d829a17c88344cc7cf2 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 24 Feb 2020 17:28:27 -0800
Subject: Add default behavior for gtest runner.

PiperOrigin-RevId: 297009116
---
 test/perf/BUILD                       |  8 ++++----
 test/perf/linux/getdents_benchmark.cc |  2 +-
 test/runner/gtest/gtest.go            | 25 +++++++++++++++++++------
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/test/perf/BUILD b/test/perf/BUILD
index 7a2bf10ed..346a28e16 100644
--- a/test/perf/BUILD
+++ b/test/perf/BUILD
@@ -29,7 +29,7 @@ syscall_test(
 )
 
 syscall_test(
-    size = "large",
+    size = "enormous",
     test = "//test/perf/linux:getdents_benchmark",
 )
 
@@ -39,7 +39,7 @@ syscall_test(
 )
 
 syscall_test(
-    size = "large",
+    size = "enormous",
     test = "//test/perf/linux:gettid_benchmark",
 )
 
@@ -87,7 +87,7 @@ syscall_test(
 )
 
 syscall_test(
-    size = "large",
+    size = "enormous",
     test = "//test/perf/linux:signal_benchmark",
 )
 
@@ -102,7 +102,7 @@ syscall_test(
 )
 
 syscall_test(
-    size = "large",
+    size = "enormous",
     add_overlay = True,
     test = "//test/perf/linux:unlink_benchmark",
 )
diff --git a/test/perf/linux/getdents_benchmark.cc b/test/perf/linux/getdents_benchmark.cc
index 0e03975b4..afc599ad2 100644
--- a/test/perf/linux/getdents_benchmark.cc
+++ b/test/perf/linux/getdents_benchmark.cc
@@ -141,7 +141,7 @@ void BM_GetdentsNewFD(benchmark::State& state) {
   state.SetItemsProcessed(state.iterations());
 }
 
-BENCHMARK(BM_GetdentsNewFD)->Range(1, 1 << 16)->UseRealTime();
+BENCHMARK(BM_GetdentsNewFD)->Range(1, 1 << 12)->UseRealTime();
 
 }  // namespace
 
diff --git a/test/runner/gtest/gtest.go b/test/runner/gtest/gtest.go
index 23bf7b5f6..f96e2415e 100644
--- a/test/runner/gtest/gtest.go
+++ b/test/runner/gtest/gtest.go
@@ -43,6 +43,10 @@ type TestCase struct {
 	// Name is the name of this individual test.
 	Name string
 
+	// all indicates that this will run without flags. This takes
+	// precendence over benchmark below.
+	all bool
+
 	// benchmark indicates that this is a benchmark. In this case, the
 	// suite will be empty, and we will use the appropriate test and
 	// benchmark flags.
@@ -57,6 +61,9 @@ func (tc TestCase) FullName() string {
 
 // Args returns arguments to be passed when invoking the test.
 func (tc TestCase) Args() []string {
+	if tc.all {
+		return []string{} // No arguments.
+	}
 	if tc.benchmark {
 		return []string{
 			fmt.Sprintf("%s=^$", filterTestFlag),
@@ -81,11 +88,16 @@ func ParseTestCases(testBin string, benchmarks bool, extraArgs ...string) ([]Tes
 	cmd := exec.Command(testBin, args...)
 	out, err := cmd.Output()
 	if err != nil {
-		exitErr, ok := err.(*exec.ExitError)
-		if !ok {
-			return nil, fmt.Errorf("could not enumerate gtest tests: %v", err)
-		}
-		return nil, fmt.Errorf("could not enumerate gtest tests: %v\nstderr:\n%s", err, exitErr.Stderr)
+		// We failed to list tests with the given flags. Just
+		// return something that will run the binary with no
+		// flags, which should execute all tests.
+		return []TestCase{
+			TestCase{
+				Suite: "Default",
+				Name:  "All",
+				all:   true,
+			},
+		}, nil
 	}
 
 	// Parse test output.
@@ -114,7 +126,6 @@ func ParseTestCases(testBin string, benchmarks bool, extraArgs ...string) ([]Tes
 			Suite: suite,
 			Name:  name,
 		})
-
 	}
 
 	// Finished?
@@ -127,6 +138,8 @@ func ParseTestCases(testBin string, benchmarks bool, extraArgs ...string) ([]Tes
 	cmd = exec.Command(testBin, args...)
 	out, err = cmd.Output()
 	if err != nil {
+		// We were able to enumerate tests above, but not benchmarks?
+		// We requested them, so we return an error in this case.
 		exitErr, ok := err.(*exec.ExitError)
 		if !ok {
 			return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v", err)
-- 
cgit v1.2.3


From 93e0c3752981b6a1c5b745faec6506c17480b84b Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 21 Feb 2020 10:28:16 +0000
Subject: Enable bluepill dieTrampoline operation on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I9e1bf2513c23bdd8c387e5b3c874c6ad3ca9aab0
---
 pkg/sentry/platform/kvm/bluepill_arm64.s         |  8 ++++---
 pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go | 23 +++++++++++++++++++-
 pkg/sentry/platform/ring0/aarch64.go             | 27 ++++++++++++------------
 3 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s
index c61700892..04efa0147 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.s
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.s
@@ -82,6 +82,8 @@ fallback:
 
 // dieTrampoline: see bluepill.go, bluepill_arm64_unsafe.go for documentation.
 TEXT ·dieTrampoline(SB),NOSPLIT,$0
-	// TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64.
-	MOVD	R9, 8(RSP)
-	BL	·dieHandler(SB)
+	// R0: Fake the old PC as caller
+	// R1: First argument (vCPU)
+	MOVD.P R1, 8(RSP) // R1: First argument (vCPU)
+	MOVD.P R0, 8(RSP) // R0: Fake the old PC as caller
+	B ·dieHandler(SB)
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
index 2f02c03cf..195331383 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -18,9 +18,30 @@ package kvm
 
 import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
 
+// dieArchSetup initialies the state for dieTrampoline.
+//
+// The arm64 dieTrampoline requires the vCPU to be set in R1, and the last PC
+// to be in R0. The trampoline then simulates a call to dieHandler from the
+// provided PC.
+//
 //go:nosplit
 func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
-	// TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64.
+	// If the vCPU is in user mode, we set the stack to the stored stack
+	// value in the vCPU itself. We don't want to unwind the user stack.
+	if guestRegs.Regs.Pstate&ring0.PSR_MODE_MASK == ring0.PSR_MODE_EL0t {
+		regs := c.CPU.Registers()
+		context.Regs[0] = regs.Regs[0]
+		context.Sp = regs.Sp
+		context.Regs[29] = regs.Regs[29] // stack base address
+	} else {
+		context.Regs[0] = guestRegs.Regs.Pc
+		context.Sp = guestRegs.Regs.Sp
+		context.Regs[29] = guestRegs.Regs.Regs[29]
+		context.Pstate = guestRegs.Regs.Pstate
+	}
+	context.Regs[1] = uint64(uintptr(unsafe.Pointer(c)))
+	context.Pc = uint64(dieTrampolineAddr)
 }
diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go
index f6da41c27..8122ac6e2 100644
--- a/pkg/sentry/platform/ring0/aarch64.go
+++ b/pkg/sentry/platform/ring0/aarch64.go
@@ -27,26 +27,27 @@ const (
 	_PTE_PGT_BASE = 0x7000
 	_PTE_PGT_SIZE = 0x1000
 
-	_PSR_MODE_EL0t = 0x0
-	_PSR_MODE_EL1t = 0x4
-	_PSR_MODE_EL1h = 0x5
-	_PSR_EL_MASK   = 0xf
-
-	_PSR_D_BIT = 0x200
-	_PSR_A_BIT = 0x100
-	_PSR_I_BIT = 0x80
-	_PSR_F_BIT = 0x40
+	_PSR_D_BIT = 0x00000200
+	_PSR_A_BIT = 0x00000100
+	_PSR_I_BIT = 0x00000080
+	_PSR_F_BIT = 0x00000040
 )
 
 const (
+	// PSR bits
+	PSR_MODE_EL0t = 0x00000000
+	PSR_MODE_EL1t = 0x00000004
+	PSR_MODE_EL1h = 0x00000005
+	PSR_MODE_MASK = 0x0000000f
+
 	// KernelFlagsSet should always be set in the kernel.
-	KernelFlagsSet = _PSR_MODE_EL1h
+	KernelFlagsSet = PSR_MODE_EL1h
 
 	// UserFlagsSet are always set in userspace.
-	UserFlagsSet = _PSR_MODE_EL0t
+	UserFlagsSet = PSR_MODE_EL0t
 
-	KernelFlagsClear = _PSR_EL_MASK
-	UserFlagsClear   = _PSR_EL_MASK
+	KernelFlagsClear = PSR_MODE_MASK
+	UserFlagsClear   = PSR_MODE_MASK
 
 	PsrDefaultSet = _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT
 )
-- 
cgit v1.2.3


From 4d7db46123f020df77cea5c00df4114b7b073845 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 25 Feb 2020 11:13:29 -0800
Subject: Add log during process wait in tests

TestMultiContainerKillAll timed out under --race. Without logging,
we cannot tell if the process list is still increasing, but slowly,
or is stuck.

PiperOrigin-RevId: 297158834
---
 runsc/container/container_test.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 04a7dc237..bdd65b498 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -71,6 +71,7 @@ func waitForProcessCount(cont *Container, want int) error {
 			return &backoff.PermanentError{Err: err}
 		}
 		if got := len(pss); got != want {
+			log.Infof("Waiting for process count to reach %d. Current: %d", want, got)
 			return fmt.Errorf("wrong process count, got: %d, want: %d", got, want)
 		}
 		return nil
-- 
cgit v1.2.3


From d7b73792515d1ac34c8d8c41ef5de379f22f002b Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 25 Feb 2020 11:18:28 -0800
Subject: Deflake TestCurrentConnectedIncrement.

TestCurrentConnectedIncrement fails consistently under gotsan due to the sleep
to check metrics is exactly the same as the TIME-WAIT duration. Under gotsan
things can be slow enough that the increment test is done before the protocol
goroutine is run after the TIME-WAIT timer expires and does its cleanup.

Increasing the sleep from 1s to 1.2s makes the test pass consistently.

PiperOrigin-RevId: 297160181
---
 pkg/tcpip/transport/tcp/tcp_test.go | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index cc118c993..5b2b16afa 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -543,8 +543,9 @@ func TestCurrentConnectedIncrement(t *testing.T) {
 		),
 	)
 
-	// Wait for the TIME-WAIT state to transition to CLOSED.
-	time.Sleep(1 * time.Second)
+	// Wait for a little more than the TIME-WAIT duration for the socket to
+	// transition to CLOSED state.
+	time.Sleep(1200 * time.Millisecond)
 
 	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
 		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
-- 
cgit v1.2.3


From 53504e29ca27b8dc9e098fbb88983fdbce90cca3 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 25 Feb 2020 12:16:43 -0800
Subject: Fix mount refcount issue.

Each mount is holds a reference on a root Dirent, but the mount itself may
live beyond it's own reference. This means that a call to Root() can come
after the associated reference has been dropped.

Instead of introducing a separate layer of references for mount objects,
we simply change the Root() method to use TryIncRef() and allow it to return
nil if the mount is already gone. This requires updating a small number of
callers and minimizes the change (since VFSv2 will replace this code shortly).

PiperOrigin-RevId: 297174230
---
 pkg/sentry/fs/mount_test.go  | 11 ++++++-----
 pkg/sentry/fs/mounts.go      | 10 +++++++---
 pkg/sentry/fs/proc/mounts.go | 16 ++++++++++++----
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go
index e672a438c..a3d10770b 100644
--- a/pkg/sentry/fs/mount_test.go
+++ b/pkg/sentry/fs/mount_test.go
@@ -36,11 +36,12 @@ func mountPathsAre(root *Dirent, got []*Mount, want ...string) error {
 	gotPaths := make(map[string]struct{}, len(got))
 	gotStr := make([]string, len(got))
 	for i, g := range got {
-		groot := g.Root()
-		name, _ := groot.FullName(root)
-		groot.DecRef()
-		gotStr[i] = name
-		gotPaths[name] = struct{}{}
+		if groot := g.Root(); groot != nil {
+			name, _ := groot.FullName(root)
+			groot.DecRef()
+			gotStr[i] = name
+			gotPaths[name] = struct{}{}
+		}
 	}
 	if len(got) != len(want) {
 		return fmt.Errorf("mount paths are different, got: %q, want: %q", gotStr, want)
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index 574a2cc91..c7981f66e 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -100,10 +100,14 @@ func newUndoMount(d *Dirent) *Mount {
 	}
 }
 
-// Root returns the root dirent of this mount. Callers must call DecRef on the
-// returned dirent.
+// Root returns the root dirent of this mount.
+//
+// This may return nil if the mount has already been free. Callers must handle this
+// case appropriately. If non-nil, callers must call DecRef on the returned *Dirent.
 func (m *Mount) Root() *Dirent {
-	m.root.IncRef()
+	if !m.root.TryIncRef() {
+		return nil
+	}
 	return m.root
 }
 
diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go
index c10888100..94deb553b 100644
--- a/pkg/sentry/fs/proc/mounts.go
+++ b/pkg/sentry/fs/proc/mounts.go
@@ -60,13 +60,15 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) {
 	})
 	for _, m := range ms {
 		mroot := m.Root()
+		if mroot == nil {
+			continue // No longer valid.
+		}
 		mountPath, desc := mroot.FullName(rootDir)
 		mroot.DecRef()
 		if !desc {
 			// MountSources that are not descendants of the chroot jail are ignored.
 			continue
 		}
-
 		fn(mountPath, m)
 	}
 }
@@ -91,6 +93,12 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se
 
 	var buf bytes.Buffer
 	forEachMount(mif.t, func(mountPath string, m *fs.Mount) {
+		mroot := m.Root()
+		if mroot == nil {
+			return // No longer valid.
+		}
+		defer mroot.DecRef()
+
 		// Format:
 		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
 		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
@@ -107,9 +115,6 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se
 
 		// (3) Major:Minor device ID. We don't have a superblock, so we
 		// just use the root inode device number.
-		mroot := m.Root()
-		defer mroot.DecRef()
-
 		sa := mroot.Inode.StableAttr
 		fmt.Fprintf(&buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor)
 
@@ -207,6 +212,9 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan
 		//
 		// The "needs dump"and fsck flags are always 0, which is allowed.
 		root := m.Root()
+		if root == nil {
+			return // No longer valid.
+		}
 		defer root.DecRef()
 
 		flags := root.Inode.MountSource.Flags
-- 
cgit v1.2.3


From 98b693e61b37a62f7b29ce1cab8b4c4c54fa044e Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 25 Feb 2020 12:21:27 -0800
Subject: Don't acquire contended lock with the OS thread locked.

Fixes #1049

PiperOrigin-RevId: 297175164
---
 pkg/sentry/platform/kvm/machine.go | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 8076c7529..f1afc74dc 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -329,10 +329,12 @@ func (m *machine) Destroy() {
 }
 
 // Get gets an available vCPU.
+//
+// This will return with the OS thread locked.
 func (m *machine) Get() *vCPU {
+	m.mu.RLock()
 	runtime.LockOSThread()
 	tid := procid.Current()
-	m.mu.RLock()
 
 	// Check for an exact match.
 	if c := m.vCPUs[tid]; c != nil {
@@ -343,8 +345,22 @@ func (m *machine) Get() *vCPU {
 
 	// The happy path failed. We now proceed to acquire an exclusive lock
 	// (because the vCPU map may change), and scan all available vCPUs.
+	// In this case, we first unlock the OS thread. Otherwise, if mu is
+	// not available, the current system thread will be parked and a new
+	// system thread spawned. We avoid this situation by simply refreshing
+	// tid after relocking the system thread.
 	m.mu.RUnlock()
+	runtime.UnlockOSThread()
 	m.mu.Lock()
+	runtime.LockOSThread()
+	tid = procid.Current()
+
+	// Recheck for an exact match.
+	if c := m.vCPUs[tid]; c != nil {
+		c.lock()
+		m.mu.Unlock()
+		return c
+	}
 
 	for {
 		// Scan for an available vCPU.
-- 
cgit v1.2.3


From 6def8ea6ac601daa9256a31f818db9f7eb532168 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 25 Feb 2020 12:22:09 -0800
Subject: Fix nested logging.

PiperOrigin-RevId: 297175316
---
 pkg/log/glog.go               |  6 ++---
 pkg/log/json.go               |  2 +-
 pkg/log/json_k8s.go           |  2 +-
 pkg/log/log.go                | 60 ++++++++++++++++++++++++++++++++-----------
 pkg/sentry/kernel/task_log.go |  6 ++---
 5 files changed, 52 insertions(+), 24 deletions(-)

diff --git a/pkg/log/glog.go b/pkg/log/glog.go
index cab5fae55..b4f7bb5a4 100644
--- a/pkg/log/glog.go
+++ b/pkg/log/glog.go
@@ -46,7 +46,7 @@ var pid = os.Getpid()
 //   line             The line number
 //   msg              The user-supplied message
 //
-func (g *GoogleEmitter) Emit(level Level, timestamp time.Time, format string, args ...interface{}) {
+func (g *GoogleEmitter) Emit(depth int, level Level, timestamp time.Time, format string, args ...interface{}) {
 	// Log level.
 	prefix := byte('?')
 	switch level {
@@ -64,9 +64,7 @@ func (g *GoogleEmitter) Emit(level Level, timestamp time.Time, format string, ar
 	microsecond := int(timestamp.Nanosecond() / 1000)
 
 	// 0 = this frame.
-	// 1 = Debugf, etc.
-	// 2 = Caller.
-	_, file, line, ok := runtime.Caller(2)
+	_, file, line, ok := runtime.Caller(depth + 1)
 	if ok {
 		// Trim any directory path from the file.
 		slash := strings.LastIndexByte(file, byte('/'))
diff --git a/pkg/log/json.go b/pkg/log/json.go
index a278c8fc8..0943db1cc 100644
--- a/pkg/log/json.go
+++ b/pkg/log/json.go
@@ -62,7 +62,7 @@ type JSONEmitter struct {
 }
 
 // Emit implements Emitter.Emit.
-func (e JSONEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (e JSONEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) {
 	j := jsonLog{
 		Msg:   fmt.Sprintf(format, v...),
 		Level: level,
diff --git a/pkg/log/json_k8s.go b/pkg/log/json_k8s.go
index cee6eb514..6c6fc8b6f 100644
--- a/pkg/log/json_k8s.go
+++ b/pkg/log/json_k8s.go
@@ -33,7 +33,7 @@ type K8sJSONEmitter struct {
 }
 
 // Emit implements Emitter.Emit.
-func (e *K8sJSONEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (e *K8sJSONEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) {
 	j := k8sJSONLog{
 		Log:   fmt.Sprintf(format, v...),
 		Level: level,
diff --git a/pkg/log/log.go b/pkg/log/log.go
index 5056f17e6..a794da1aa 100644
--- a/pkg/log/log.go
+++ b/pkg/log/log.go
@@ -79,7 +79,7 @@ func (l Level) String() string {
 type Emitter interface {
 	// Emit emits the given log statement. This allows for control over the
 	// timestamp used for logging.
-	Emit(level Level, timestamp time.Time, format string, v ...interface{})
+	Emit(depth int, level Level, timestamp time.Time, format string, v ...interface{})
 }
 
 // Writer writes the output to the given writer.
@@ -142,7 +142,7 @@ func (l *Writer) Write(data []byte) (int, error) {
 }
 
 // Emit emits the message.
-func (l *Writer) Emit(level Level, timestamp time.Time, format string, args ...interface{}) {
+func (l *Writer) Emit(_ int, _ Level, _ time.Time, format string, args ...interface{}) {
 	fmt.Fprintf(l, format, args...)
 }
 
@@ -150,9 +150,9 @@ func (l *Writer) Emit(level Level, timestamp time.Time, format string, args ...i
 type MultiEmitter []Emitter
 
 // Emit emits to all emitters.
-func (m *MultiEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (m *MultiEmitter) Emit(depth int, level Level, timestamp time.Time, format string, v ...interface{}) {
 	for _, e := range *m {
-		e.Emit(level, timestamp, format, v...)
+		e.Emit(1+depth, level, timestamp, format, v...)
 	}
 }
 
@@ -167,7 +167,7 @@ type TestEmitter struct {
 }
 
 // Emit emits to the TestLogger.
-func (t *TestEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (t *TestEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) {
 	t.Logf(format, v...)
 }
 
@@ -198,22 +198,37 @@ type BasicLogger struct {
 
 // Debugf implements logger.Debugf.
 func (l *BasicLogger) Debugf(format string, v ...interface{}) {
-	if l.IsLogging(Debug) {
-		l.Emit(Debug, time.Now(), format, v...)
-	}
+	l.DebugfAtDepth(1, format, v...)
 }
 
 // Infof implements logger.Infof.
 func (l *BasicLogger) Infof(format string, v ...interface{}) {
-	if l.IsLogging(Info) {
-		l.Emit(Info, time.Now(), format, v...)
-	}
+	l.InfofAtDepth(1, format, v...)
 }
 
 // Warningf implements logger.Warningf.
 func (l *BasicLogger) Warningf(format string, v ...interface{}) {
+	l.WarningfAtDepth(1, format, v...)
+}
+
+// DebugfAtDepth logs at a specific depth.
+func (l *BasicLogger) DebugfAtDepth(depth int, format string, v ...interface{}) {
+	if l.IsLogging(Debug) {
+		l.Emit(1+depth, Debug, time.Now(), format, v...)
+	}
+}
+
+// InfofAtDepth logs at a specific depth.
+func (l *BasicLogger) InfofAtDepth(depth int, format string, v ...interface{}) {
+	if l.IsLogging(Info) {
+		l.Emit(1+depth, Info, time.Now(), format, v...)
+	}
+}
+
+// WarningfAtDepth logs at a specific depth.
+func (l *BasicLogger) WarningfAtDepth(depth int, format string, v ...interface{}) {
 	if l.IsLogging(Warning) {
-		l.Emit(Warning, time.Now(), format, v...)
+		l.Emit(1+depth, Warning, time.Now(), format, v...)
 	}
 }
 
@@ -257,17 +272,32 @@ func SetLevel(newLevel Level) {
 
 // Debugf logs to the global logger.
 func Debugf(format string, v ...interface{}) {
-	Log().Debugf(format, v...)
+	Log().DebugfAtDepth(1, format, v...)
 }
 
 // Infof logs to the global logger.
 func Infof(format string, v ...interface{}) {
-	Log().Infof(format, v...)
+	Log().InfofAtDepth(1, format, v...)
 }
 
 // Warningf logs to the global logger.
 func Warningf(format string, v ...interface{}) {
-	Log().Warningf(format, v...)
+	Log().WarningfAtDepth(1, format, v...)
+}
+
+// DebugfAtDepth logs to the global logger.
+func DebugfAtDepth(depth int, format string, v ...interface{}) {
+	Log().DebugfAtDepth(1+depth, format, v...)
+}
+
+// InfofAtDepth logs to the global logger.
+func InfofAtDepth(depth int, format string, v ...interface{}) {
+	Log().InfofAtDepth(1+depth, format, v...)
+}
+
+// WarningfAtDepth logs to the global logger.
+func WarningfAtDepth(depth int, format string, v ...interface{}) {
+	Log().WarningfAtDepth(1+depth, format, v...)
 }
 
 // defaultStackSize is the default buffer size to allocate for stack traces.
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index 6d737d3e5..eeccaa197 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -32,21 +32,21 @@ const (
 // Infof logs an formatted info message by calling log.Infof.
 func (t *Task) Infof(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Info) {
-		log.Infof(t.logPrefix.Load().(string)+fmt, v...)
+		log.InfofAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
 // Warningf logs a warning string by calling log.Warningf.
 func (t *Task) Warningf(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Warning) {
-		log.Warningf(t.logPrefix.Load().(string)+fmt, v...)
+		log.WarningfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
 // Debugf creates a debug string that includes the task ID.
 func (t *Task) Debugf(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Debug) {
-		log.Debugf(t.logPrefix.Load().(string)+fmt, v...)
+		log.DebugfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
-- 
cgit v1.2.3


From 471b15b212831af31c2fe36cd42cea7ec7b7785b Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 25 Feb 2020 13:25:36 -0800
Subject: Port most syscalls to VFS2.

pipe and pipe2 aren't ported, pending a slight rework of pipe FDs for VFS2.
mount and umount2 aren't ported out of temporary laziness. access and faccessat
need additional FSImpl methods to implement properly, but are stubbed to
prevent googletest from CHECK-failing. Other syscalls require additional
plumbing.

Updates #1623

PiperOrigin-RevId: 297188448
---
 pkg/abi/linux/epoll_amd64.go                       |   2 +
 pkg/abi/linux/epoll_arm64.go                       |   2 +
 pkg/abi/linux/file.go                              |   2 +
 pkg/abi/linux/fs.go                                |   2 +
 pkg/abi/linux/signal.go                            |   2 +
 pkg/abi/linux/time.go                              |   6 +
 pkg/abi/linux/xattr.go                             |   1 +
 pkg/fspath/BUILD                                   |   4 +-
 pkg/fspath/builder.go                              |   8 +
 pkg/fspath/builder_unsafe.go                       |  27 -
 pkg/fspath/fspath.go                               |   3 +-
 pkg/gohacks/BUILD                                  |  11 +
 pkg/gohacks/gohacks_unsafe.go                      |  57 ++
 pkg/sentry/fsbridge/vfs.go                         |  10 +-
 pkg/sentry/fsimpl/proc/tasks.go                    |   4 +-
 pkg/sentry/kernel/fd_table.go                      |  49 +-
 pkg/sentry/kernel/fs_context.go                    |  22 +
 pkg/sentry/kernel/task.go                          |  18 +
 pkg/sentry/syscalls/linux/sys_epoll.go             |   4 +
 pkg/sentry/syscalls/linux/sys_file.go              |  40 ++
 pkg/sentry/syscalls/linux/sys_getdents.go          |   4 +
 pkg/sentry/syscalls/linux/sys_lseek.go             |   4 +
 pkg/sentry/syscalls/linux/sys_mmap.go              |   4 +
 pkg/sentry/syscalls/linux/sys_read.go              |   4 +
 pkg/sentry/syscalls/linux/sys_stat.go              |   4 +
 pkg/sentry/syscalls/linux/sys_sync.go              |   4 +
 pkg/sentry/syscalls/linux/sys_write.go             |   4 +
 pkg/sentry/syscalls/linux/sys_xattr.go             |   4 +
 pkg/sentry/syscalls/linux/vfs2/BUILD               |  28 +-
 pkg/sentry/syscalls/linux/vfs2/epoll.go            | 225 ++++++++
 pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go     |  44 ++
 pkg/sentry/syscalls/linux/vfs2/execve.go           | 137 +++++
 pkg/sentry/syscalls/linux/vfs2/fd.go               | 147 ++++++
 pkg/sentry/syscalls/linux/vfs2/filesystem.go       | 326 ++++++++++++
 pkg/sentry/syscalls/linux/vfs2/fscontext.go        | 131 +++++
 pkg/sentry/syscalls/linux/vfs2/getdents.go         | 149 ++++++
 pkg/sentry/syscalls/linux/vfs2/ioctl.go            |  35 ++
 .../syscalls/linux/vfs2/linux64_override_amd64.go  | 216 ++++----
 .../syscalls/linux/vfs2/linux64_override_arm64.go  |   2 +
 pkg/sentry/syscalls/linux/vfs2/mmap.go             |  92 ++++
 pkg/sentry/syscalls/linux/vfs2/path.go             |  94 ++++
 pkg/sentry/syscalls/linux/vfs2/poll.go             | 584 +++++++++++++++++++++
 pkg/sentry/syscalls/linux/vfs2/read_write.go       | 511 ++++++++++++++++++
 pkg/sentry/syscalls/linux/vfs2/setstat.go          | 380 ++++++++++++++
 pkg/sentry/syscalls/linux/vfs2/stat.go             | 346 ++++++++++++
 pkg/sentry/syscalls/linux/vfs2/sync.go             |  87 +++
 pkg/sentry/syscalls/linux/vfs2/sys_read.go         |  95 ----
 pkg/sentry/syscalls/linux/vfs2/xattr.go            | 353 +++++++++++++
 pkg/sentry/vfs/BUILD                               |   1 +
 pkg/sentry/vfs/epoll.go                            |   3 +
 pkg/sentry/vfs/mount_unsafe.go                     |  12 +-
 pkg/sentry/vfs/resolving_path.go                   |   2 +-
 pkg/sentry/vfs/vfs.go                              |  10 +-
 pkg/usermem/BUILD                                  |   2 +-
 pkg/usermem/usermem.go                             |   9 +-
 pkg/usermem/usermem_unsafe.go                      |  27 -
 runsc/boot/filter/config.go                        |   2 +
 57 files changed, 4082 insertions(+), 274 deletions(-)
 delete mode 100644 pkg/fspath/builder_unsafe.go
 create mode 100644 pkg/gohacks/BUILD
 create mode 100644 pkg/gohacks/gohacks_unsafe.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/epoll.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/execve.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/fd.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/filesystem.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/fscontext.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/getdents.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/ioctl.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/mmap.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/path.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/poll.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/read_write.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/setstat.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/stat.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/sync.go
 delete mode 100644 pkg/sentry/syscalls/linux/vfs2/sys_read.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/xattr.go
 delete mode 100644 pkg/usermem/usermem_unsafe.go

diff --git a/pkg/abi/linux/epoll_amd64.go b/pkg/abi/linux/epoll_amd64.go
index 57041491c..34ff18009 100644
--- a/pkg/abi/linux/epoll_amd64.go
+++ b/pkg/abi/linux/epoll_amd64.go
@@ -15,6 +15,8 @@
 package linux
 
 // EpollEvent is equivalent to struct epoll_event from epoll(2).
+//
+// +marshal
 type EpollEvent struct {
 	Events uint32
 	// Linux makes struct epoll_event::data a __u64. We represent it as
diff --git a/pkg/abi/linux/epoll_arm64.go b/pkg/abi/linux/epoll_arm64.go
index 62ef5821e..f86c35329 100644
--- a/pkg/abi/linux/epoll_arm64.go
+++ b/pkg/abi/linux/epoll_arm64.go
@@ -15,6 +15,8 @@
 package linux
 
 // EpollEvent is equivalent to struct epoll_event from epoll(2).
+//
+// +marshal
 type EpollEvent struct {
 	Events uint32
 	// Linux makes struct epoll_event a __u64, necessitating 4 bytes of padding
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index c3ab15a4f..e229ac21c 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -241,6 +241,8 @@ const (
 )
 
 // Statx represents struct statx.
+//
+// +marshal
 type Statx struct {
 	Mask           uint32
 	Blksize        uint32
diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go
index 2c652baa2..158d2db5b 100644
--- a/pkg/abi/linux/fs.go
+++ b/pkg/abi/linux/fs.go
@@ -38,6 +38,8 @@ const (
 )
 
 // Statfs is struct statfs, from uapi/asm-generic/statfs.h.
+//
+// +marshal
 type Statfs struct {
 	// Type is one of the filesystem magic values, defined above.
 	Type uint64
diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go
index c69b04ea9..1c330e763 100644
--- a/pkg/abi/linux/signal.go
+++ b/pkg/abi/linux/signal.go
@@ -115,6 +115,8 @@ const (
 )
 
 // SignalSet is a signal mask with a bit corresponding to each signal.
+//
+// +marshal
 type SignalSet uint64
 
 // SignalSetSize is the size in bytes of a SignalSet.
diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go
index e562b46d9..e6860ed49 100644
--- a/pkg/abi/linux/time.go
+++ b/pkg/abi/linux/time.go
@@ -157,6 +157,8 @@ func DurationToTimespec(dur time.Duration) Timespec {
 const SizeOfTimeval = 16
 
 // Timeval represents struct timeval in <time.h>.
+//
+// +marshal
 type Timeval struct {
 	Sec  int64
 	Usec int64
@@ -230,6 +232,8 @@ type Tms struct {
 type TimerID int32
 
 // StatxTimestamp represents struct statx_timestamp.
+//
+// +marshal
 type StatxTimestamp struct {
 	Sec  int64
 	Nsec uint32
@@ -258,6 +262,8 @@ func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) {
 }
 
 // Utime represents struct utimbuf used by utimes(2).
+//
+// +marshal
 type Utime struct {
 	Actime  int64
 	Modtime int64
diff --git a/pkg/abi/linux/xattr.go b/pkg/abi/linux/xattr.go
index a3b6406fa..99180b208 100644
--- a/pkg/abi/linux/xattr.go
+++ b/pkg/abi/linux/xattr.go
@@ -18,6 +18,7 @@ package linux
 const (
 	XATTR_NAME_MAX = 255
 	XATTR_SIZE_MAX = 65536
+	XATTR_LIST_MAX = 65536
 
 	XATTR_CREATE  = 1
 	XATTR_REPLACE = 2
diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD
index ee84471b2..67dd1e225 100644
--- a/pkg/fspath/BUILD
+++ b/pkg/fspath/BUILD
@@ -8,9 +8,11 @@ go_library(
     name = "fspath",
     srcs = [
         "builder.go",
-        "builder_unsafe.go",
         "fspath.go",
     ],
+    deps = [
+        "//pkg/gohacks",
+    ],
 )
 
 go_test(
diff --git a/pkg/fspath/builder.go b/pkg/fspath/builder.go
index 7ddb36826..6318d3874 100644
--- a/pkg/fspath/builder.go
+++ b/pkg/fspath/builder.go
@@ -16,6 +16,8 @@ package fspath
 
 import (
 	"fmt"
+
+	"gvisor.dev/gvisor/pkg/gohacks"
 )
 
 // Builder is similar to strings.Builder, but is used to produce pathnames
@@ -102,3 +104,9 @@ func (b *Builder) AppendString(str string) {
 	copy(b.buf[b.start:], b.buf[oldStart:])
 	copy(b.buf[len(b.buf)-len(str):], str)
 }
+
+// String returns the accumulated string. No other methods should be called
+// after String.
+func (b *Builder) String() string {
+	return gohacks.StringFromImmutableBytes(b.buf[b.start:])
+}
diff --git a/pkg/fspath/builder_unsafe.go b/pkg/fspath/builder_unsafe.go
deleted file mode 100644
index 75606808d..000000000
--- a/pkg/fspath/builder_unsafe.go
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package fspath
-
-import (
-	"unsafe"
-)
-
-// String returns the accumulated string. No other methods should be called
-// after String.
-func (b *Builder) String() string {
-	bs := b.buf[b.start:]
-	// Compare strings.Builder.String().
-	return *(*string)(unsafe.Pointer(&bs))
-}
diff --git a/pkg/fspath/fspath.go b/pkg/fspath/fspath.go
index 9fb3fee24..4c983d5fd 100644
--- a/pkg/fspath/fspath.go
+++ b/pkg/fspath/fspath.go
@@ -67,7 +67,8 @@ func Parse(pathname string) Path {
 
 // Path contains the information contained in a pathname string.
 //
-// Path is copyable by value.
+// Path is copyable by value. The zero value for Path is equivalent to
+// fspath.Parse(""), i.e. the empty path.
 type Path struct {
 	// Begin is an iterator to the first path component in the relative part of
 	// the path.
diff --git a/pkg/gohacks/BUILD b/pkg/gohacks/BUILD
new file mode 100644
index 000000000..798a65eca
--- /dev/null
+++ b/pkg/gohacks/BUILD
@@ -0,0 +1,11 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "gohacks",
+    srcs = [
+        "gohacks_unsafe.go",
+    ],
+    visibility = ["//:sandbox"],
+)
diff --git a/pkg/gohacks/gohacks_unsafe.go b/pkg/gohacks/gohacks_unsafe.go
new file mode 100644
index 000000000..aad675172
--- /dev/null
+++ b/pkg/gohacks/gohacks_unsafe.go
@@ -0,0 +1,57 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gohacks contains utilities for subverting the Go compiler.
+package gohacks
+
+import (
+	"reflect"
+	"unsafe"
+)
+
+// Noescape hides a pointer from escape analysis. Noescape is the identity
+// function but escape analysis doesn't think the output depends on the input.
+// Noescape is inlined and currently compiles down to zero instructions.
+// USE CAREFULLY!
+//
+// (Noescape is copy/pasted from Go's runtime/stubs.go:noescape().)
+//
+//go:nosplit
+func Noescape(p unsafe.Pointer) unsafe.Pointer {
+	x := uintptr(p)
+	return unsafe.Pointer(x ^ 0)
+}
+
+// ImmutableBytesFromString is equivalent to []byte(s), except that it uses the
+// same memory backing s instead of making a heap-allocated copy. This is only
+// valid if the returned slice is never mutated.
+func ImmutableBytesFromString(s string) []byte {
+	shdr := (*reflect.StringHeader)(unsafe.Pointer(&s))
+	var bs []byte
+	bshdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs))
+	bshdr.Data = shdr.Data
+	bshdr.Len = shdr.Len
+	bshdr.Cap = shdr.Len
+	return bs
+}
+
+// StringFromImmutableBytes is equivalent to string(bs), except that it uses
+// the same memory backing bs instead of making a heap-allocated copy. This is
+// only valid if bs is never mutated after StringFromImmutableBytes returns.
+func StringFromImmutableBytes(bs []byte) string {
+	// This is cheaper than messing with reflect.StringHeader and
+	// reflect.SliceHeader, which as of this writing produces many dead stores
+	// of zeroes. Compare strings.Builder.String().
+	return *(*string)(unsafe.Pointer(&bs))
+}
diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go
index e657c39bc..6aa17bfc1 100644
--- a/pkg/sentry/fsbridge/vfs.go
+++ b/pkg/sentry/fsbridge/vfs.go
@@ -117,15 +117,19 @@ func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry)
 // default anyways.
 //
 // TODO(gvisor.dev/issue/1623): Check mount has read and exec permission.
-func (l *vfsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
+func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
 	vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem()
 	creds := auth.CredentialsFromContext(ctx)
+	path := fspath.Parse(pathname)
 	pop := &vfs.PathOperation{
 		Root:               l.root,
-		Start:              l.root,
-		Path:               fspath.Parse(path),
+		Start:              l.workingDir,
+		Path:               path,
 		FollowFinalSymlink: resolveFinal,
 	}
+	if path.Absolute {
+		pop.Start = l.root
+	}
 	fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts)
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index ce08a7d53..10c08fa90 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -73,9 +73,9 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames
 		"meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}),
 		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"),
 		"net":     newNetDir(root, inoGen, k),
-		"stat":    newDentry(root, inoGen.NextIno(), 0444, &statData{}),
+		"stat":    newDentry(root, inoGen.NextIno(), 0444, &statData{k: k}),
 		"uptime":  newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}),
-		"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}),
+		"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{k: k}),
 	}
 
 	inode := &tasksInode{
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 23b88f7a6..58001d56c 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -296,6 +296,50 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 	return fds, nil
 }
 
+// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for
+// the given file description. If it succeeds, it takes a reference on file.
+func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
+	if minfd < 0 {
+		// Don't accept negative FDs.
+		return -1, syscall.EINVAL
+	}
+
+	// Default limit.
+	end := int32(math.MaxInt32)
+
+	// Ensure we don't get past the provided limit.
+	if limitSet := limits.FromContext(ctx); limitSet != nil {
+		lim := limitSet.Get(limits.NumberOfFiles)
+		if lim.Cur != limits.Infinity {
+			end = int32(lim.Cur)
+		}
+		if minfd >= end {
+			return -1, syscall.EMFILE
+		}
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	// From f.next to find available fd.
+	fd := minfd
+	if fd < f.next {
+		fd = f.next
+	}
+	for fd < end {
+		if d, _, _ := f.get(fd); d == nil {
+			f.setVFS2(fd, file, flags)
+			if fd == f.next {
+				// Update next search start position.
+				f.next = fd + 1
+			}
+			return fd, nil
+		}
+		fd++
+	}
+	return -1, syscall.EMFILE
+}
+
 // NewFDAt sets the file reference for the given FD. If there is an active
 // reference for that FD, the ref count for that existing reference is
 // decremented.
@@ -316,9 +360,6 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2
 		return syscall.EBADF
 	}
 
-	f.mu.Lock()
-	defer f.mu.Unlock()
-
 	// Check the limit for the provided file.
 	if limitSet := limits.FromContext(ctx); limitSet != nil {
 		if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
@@ -327,6 +368,8 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2
 	}
 
 	// Install the entry.
+	f.mu.Lock()
+	defer f.mu.Unlock()
 	f.setAll(fd, file, fileVFS2, flags)
 	return nil
 }
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 7218aa24e..47f78df9a 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -244,6 +244,28 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
 	old.DecRef()
 }
 
+// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd.
+//
+// This is not a valid call after free.
+func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) {
+	if !vd.Ok() {
+		panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry")
+	}
+
+	f.mu.Lock()
+
+	if !f.rootVFS2.Ok() {
+		f.mu.Unlock()
+		panic(fmt.Sprintf("FSContext.SetRootDirectoryVFS2(%v)) called after destroy", vd))
+	}
+
+	old := f.rootVFS2
+	vd.IncRef()
+	f.rootVFS2 = vd
+	f.mu.Unlock()
+	old.DecRef()
+}
+
 // Umask returns the current umask.
 func (f *FSContext) Umask() uint {
 	f.mu.Lock()
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index e37e23231..2cee2e6ed 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -789,6 +789,15 @@ func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error)
 	return fds[0], nil
 }
 
+// NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.Get.
+func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
+	return t.fdTable.NewFDVFS2(t, fd, file, flags)
+}
+
 // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
 //
 // This automatically passes the task as the context.
@@ -798,6 +807,15 @@ func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error {
 	return t.fdTable.NewFDAt(t, fd, file, flags)
 }
 
+// NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error {
+	return t.fdTable.NewFDAtVFS2(t, fd, file, flags)
+}
+
 // WithMuLocked executes f with t.mu locked.
 func (t *Task) WithMuLocked(f func(*Task)) {
 	t.mu.Lock()
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index fbef5b376..3ab93fbde 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -25,6 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // EpollCreate1 implements the epoll_create1(2) linux syscall.
 func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	flags := args[0].Int()
@@ -164,3 +166,5 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 
 	return EpollWait(t, args)
 }
+
+// LINT.ThenChange(vfs2/epoll.go)
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 421845ebb..c21f14dc0 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -130,6 +130,8 @@ func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string
 	return path, dirPath, nil
 }
 
+// LINT.IfChange
+
 func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) {
 	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -575,6 +577,10 @@ func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
 // Ioctl implements linux syscall ioctl(2).
 func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -650,6 +656,10 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 }
 
+// LINT.ThenChange(vfs2/ioctl.go)
+
+// LINT.IfChange
+
 // Getcwd implements the linux syscall getcwd(2).
 func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
@@ -760,6 +770,10 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	return 0, nil, nil
 }
 
+// LINT.ThenChange(vfs2/fscontext.go)
+
+// LINT.IfChange
+
 // Close implements linux syscall close(2).
 func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -1094,6 +1108,8 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 }
 
+// LINT.ThenChange(vfs2/fd.go)
+
 const (
 	_FADV_NORMAL     = 0
 	_FADV_RANDOM     = 1
@@ -1141,6 +1157,8 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, nil
 }
 
+// LINT.IfChange
+
 func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
 	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -1421,6 +1439,10 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
 func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
 	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -1480,6 +1502,10 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	return n, nil, err
 }
 
+// LINT.ThenChange(vfs2/stat.go)
+
+// LINT.IfChange
+
 func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -1516,6 +1542,10 @@ func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	return 0, nil, unlinkAt(t, dirFD, addr)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
 // Truncate implements linux syscall truncate(2).
 func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
@@ -1614,6 +1644,8 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, nil
 }
 
+// LINT.ThenChange(vfs2/setstat.go)
+
 // Umask implements linux syscall umask(2).
 func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	mask := args[0].ModeT()
@@ -1621,6 +1653,8 @@ func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	return uintptr(mask), nil, nil
 }
 
+// LINT.IfChange
+
 // Change ownership of a file.
 //
 // uid and gid may be -1, in which case they will not be changed.
@@ -1987,6 +2021,10 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true)
 }
 
+// LINT.ThenChange(vfs2/setstat.go)
+
+// LINT.IfChange
+
 func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error {
 	newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */)
 	if err != nil {
@@ -2042,6 +2080,8 @@ func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
 // Fallocate implements linux system call fallocate(2).
 func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index f66f4ffde..b126fecc0 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -27,6 +27,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // Getdents implements linux syscall getdents(2) for 64bit systems.
 func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -244,3 +246,5 @@ func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error {
 func (ds *direntSerializer) Written() int {
 	return ds.written
 }
+
+// LINT.ThenChange(vfs2/getdents.go)
diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go
index 297e920c4..3f7691eae 100644
--- a/pkg/sentry/syscalls/linux/sys_lseek.go
+++ b/pkg/sentry/syscalls/linux/sys_lseek.go
@@ -21,6 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// LINT.IfChange
+
 // Lseek implements linux syscall lseek(2).
 func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -52,3 +54,5 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 	return uintptr(offset), nil, err
 }
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 9959f6e61..91694d374 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -35,6 +35,8 @@ func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
 	return uintptr(addr), nil, nil
 }
 
+// LINT.IfChange
+
 // Mmap implements linux syscall mmap(2).
 func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	prot := args[2].Int()
@@ -104,6 +106,8 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 	return uintptr(rv), nil, err
 }
 
+// LINT.ThenChange(vfs2/mmap.go)
+
 // Munmap implements linux syscall munmap(2).
 func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64())
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index 227692f06..78a2cb750 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -28,6 +28,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 const (
 	// EventMaskRead contains events that can be triggered on reads.
 	EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
@@ -388,3 +390,5 @@ func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (i
 
 	return total, err
 }
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 11f25e00d..701b27b4a 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -23,6 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
 	return linux.Stat{
 		Dev:     sattr.DeviceID,
@@ -297,3 +299,5 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error {
 	_, err = t.CopyOut(addr, &statfs)
 	return err
 }
+
+// LINT.ThenChange(vfs2/stat.go)
diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go
index 3e55235bd..5ad465ae3 100644
--- a/pkg/sentry/syscalls/linux/sys_sync.go
+++ b/pkg/sentry/syscalls/linux/sys_sync.go
@@ -22,6 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// LINT.IfChange
+
 // Sync implements linux system call sync(2).
 func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	t.MountNamespace().SyncAll(t)
@@ -135,3 +137,5 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
 
 	return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
 }
+
+// LINT.ThenChange(vfs2/sync.go)
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index aba892939..506ee54ce 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -28,6 +28,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 const (
 	// EventMaskWrite contains events that can be triggered on writes.
 	//
@@ -358,3 +360,5 @@ func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) (
 
 	return total, err
 }
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 9d8140b8a..2de5e3422 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -25,6 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // GetXattr implements linux syscall getxattr(2).
 func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return getXattrFromPath(t, args, true)
@@ -418,3 +420,5 @@ func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
 
 	return d.Inode.RemoveXattr(t, d, name)
 }
+
+// LINT.ThenChange(vfs2/xattr.go)
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index 6b8a00b6e..f51761e81 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -5,18 +5,44 @@ package(licenses = ["notice"])
 go_library(
     name = "vfs2",
     srcs = [
+        "epoll.go",
+        "epoll_unsafe.go",
+        "execve.go",
+        "fd.go",
+        "filesystem.go",
+        "fscontext.go",
+        "getdents.go",
+        "ioctl.go",
         "linux64.go",
         "linux64_override_amd64.go",
         "linux64_override_arm64.go",
-        "sys_read.go",
+        "mmap.go",
+        "path.go",
+        "poll.go",
+        "read_write.go",
+        "setstat.go",
+        "stat.go",
+        "sync.go",
+        "xattr.go",
     ],
+    marshal = True,
     visibility = ["//:sandbox"],
     deps = [
+        "//pkg/abi/linux",
+        "//pkg/fspath",
+        "//pkg/gohacks",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
+        "//pkg/sentry/memmap",
         "//pkg/sentry/syscalls",
         "//pkg/sentry/syscalls/linux",
         "//pkg/sentry/vfs",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
new file mode 100644
index 000000000..d6cb0e79a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -0,0 +1,225 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"math"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// EpollCreate1 implements Linux syscall epoll_create1(2).
+func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+	if flags&^linux.EPOLL_CLOEXEC != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file, err := t.Kernel().VFS().NewEpollInstanceFD()
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// EpollCreate implements Linux syscall epoll_create(2).
+func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	size := args[0].Int()
+
+	// "Since Linux 2.6.8, the size argument is ignored, but must be greater
+	// than zero" - epoll_create(2)
+	if size <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file, err := t.Kernel().VFS().NewEpollInstanceFD()
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// EpollCtl implements Linux syscall epoll_ctl(2).
+func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := args[0].Int()
+	op := args[1].Int()
+	fd := args[2].Int()
+	eventAddr := args[3].Pointer()
+
+	epfile := t.GetFileVFS2(epfd)
+	if epfile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer epfile.DecRef()
+	ep, ok := epfile.Impl().(*vfs.EpollInstance)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+	if epfile == file {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var event linux.EpollEvent
+	switch op {
+	case linux.EPOLL_CTL_ADD:
+		if err := event.CopyIn(t, eventAddr); err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, ep.AddInterest(file, fd, event)
+	case linux.EPOLL_CTL_DEL:
+		return 0, nil, ep.DeleteInterest(file, fd)
+	case linux.EPOLL_CTL_MOD:
+		if err := event.CopyIn(t, eventAddr); err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, ep.ModifyInterest(file, fd, event)
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// EpollWait implements Linux syscall epoll_wait(2).
+func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := args[0].Int()
+	eventsAddr := args[1].Pointer()
+	maxEvents := int(args[2].Int())
+	timeout := int(args[3].Int())
+
+	const _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS
+	if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS {
+		return 0, nil, syserror.EINVAL
+	}
+
+	epfile := t.GetFileVFS2(epfd)
+	if epfile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer epfile.DecRef()
+	ep, ok := epfile.Impl().(*vfs.EpollInstance)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent,
+	// maxEvents), so that the buffer can be allocated on the stack.
+	var (
+		events       [16]linux.EpollEvent
+		total        int
+		ch           chan struct{}
+		haveDeadline bool
+		deadline     ktime.Time
+	)
+	for {
+		batchEvents := len(events)
+		if batchEvents > maxEvents {
+			batchEvents = maxEvents
+		}
+		n := ep.ReadEvents(events[:batchEvents])
+		maxEvents -= n
+		if n != 0 {
+			// Copy what we read out.
+			copiedEvents, err := copyOutEvents(t, eventsAddr, events[:n])
+			eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent)
+			total += copiedEvents
+			if err != nil {
+				if total != 0 {
+					return uintptr(total), nil, nil
+				}
+				return 0, nil, err
+			}
+			// If we've filled the application's event buffer, we're done.
+			if maxEvents == 0 {
+				return uintptr(total), nil, nil
+			}
+			// Loop if we read a full batch, under the expectation that there
+			// may be more events to read.
+			if n == batchEvents {
+				continue
+			}
+		}
+		// We get here if n != batchEvents. If we read any number of events
+		// (just now, or in a previous iteration of this loop), or if timeout
+		// is 0 (such that epoll_wait should be non-blocking), return the
+		// events we've read so far to the application.
+		if total != 0 || timeout == 0 {
+			return uintptr(total), nil, nil
+		}
+		// In the first iteration of this loop, register with the epoll
+		// instance for readability events, but then immediately continue the
+		// loop since we need to retry ReadEvents() before blocking. In all
+		// subsequent iterations, block until events are available, the timeout
+		// expires, or an interrupt arrives.
+		if ch == nil {
+			var w waiter.Entry
+			w, ch = waiter.NewChannelEntry(nil)
+			epfile.EventRegister(&w, waiter.EventIn)
+			defer epfile.EventUnregister(&w)
+		} else {
+			// Set up the timer if a timeout was specified.
+			if timeout > 0 && !haveDeadline {
+				timeoutDur := time.Duration(timeout) * time.Millisecond
+				deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur)
+				haveDeadline = true
+			}
+			if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
+				if err == syserror.ETIMEDOUT {
+					err = nil
+				}
+				// total must be 0 since otherwise we would have returned
+				// above.
+				return 0, nil, err
+			}
+		}
+	}
+}
+
+// EpollPwait implements Linux syscall epoll_pwait(2).
+func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	maskAddr := args[4].Pointer()
+	maskSize := uint(args[5].Uint())
+
+	if err := setTempSignalSet(t, maskAddr, maskSize); err != nil {
+		return 0, nil, err
+	}
+
+	return EpollWait(t, args)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go
new file mode 100644
index 000000000..825f325bf
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go
@@ -0,0 +1,44 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"reflect"
+	"runtime"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const sizeofEpollEvent = int(unsafe.Sizeof(linux.EpollEvent{}))
+
+func copyOutEvents(t *kernel.Task, addr usermem.Addr, events []linux.EpollEvent) (int, error) {
+	if len(events) == 0 {
+		return 0, nil
+	}
+	// Cast events to a byte slice for copying.
+	var eventBytes []byte
+	eventBytesHdr := (*reflect.SliceHeader)(unsafe.Pointer(&eventBytes))
+	eventBytesHdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(&events[0])))
+	eventBytesHdr.Len = len(events) * sizeofEpollEvent
+	eventBytesHdr.Cap = len(events) * sizeofEpollEvent
+	copiedBytes, err := t.CopyOutBytes(addr, eventBytes)
+	runtime.KeepAlive(events)
+	copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
+	return copiedEvents, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go
new file mode 100644
index 000000000..aef0078a8
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/execve.go
@@ -0,0 +1,137 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Execve implements linux syscall execve(2).
+func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathnameAddr := args[0].Pointer()
+	argvAddr := args[1].Pointer()
+	envvAddr := args[2].Pointer()
+	return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */)
+}
+
+// Execveat implements linux syscall execveat(2).
+func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathnameAddr := args[1].Pointer()
+	argvAddr := args[2].Pointer()
+	envvAddr := args[3].Pointer()
+	flags := args[4].Int()
+	return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags)
+}
+
+func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
+	if err != nil {
+		return 0, nil, err
+	}
+	var argv, envv []string
+	if argvAddr != 0 {
+		var err error
+		argv, err = t.CopyInVector(argvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+	if envvAddr != 0 {
+		var err error
+		envv, err = t.CopyInVector(envvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	var executable fsbridge.File
+	closeOnExec := false
+	if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute {
+		// We must open the executable ourselves since dirfd is used as the
+		// starting point while resolving path, but the task working directory
+		// is used as the starting point while resolving interpreters (Linux:
+		// fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() =>
+		// do_open_execat(fd=AT_FDCWD)), and the loader package is currently
+		// incapable of handling this correctly.
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return 0, nil, syserror.ENOENT
+		}
+		dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd)
+		if dirfile == nil {
+			return 0, nil, syserror.EBADF
+		}
+		start := dirfile.VirtualDentry()
+		start.IncRef()
+		dirfile.DecRef()
+		closeOnExec = dirfileFlags.CloseOnExec
+		file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{
+			Root:               root,
+			Start:              start,
+			Path:               path,
+			FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+		}, &vfs.OpenOptions{
+			Flags:    linux.O_RDONLY,
+			FileExec: true,
+		})
+		start.DecRef()
+		if err != nil {
+			return 0, nil, err
+		}
+		defer file.DecRef()
+		executable = fsbridge.NewVFSFile(file)
+	}
+
+	// Load the new TaskContext.
+	mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change
+	defer mntns.DecRef()
+	wd := t.FSContext().WorkingDirectoryVFS2()
+	defer wd.DecRef()
+	remainingTraversals := uint(linux.MaxSymlinkTraversals)
+	loadArgs := loader.LoadArgs{
+		Opener:              fsbridge.NewVFSLookup(mntns, root, wd),
+		RemainingTraversals: &remainingTraversals,
+		ResolveFinal:        flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+		Filename:            pathname,
+		File:                executable,
+		CloseOnExec:         closeOnExec,
+		Argv:                argv,
+		Envv:                envv,
+		Features:            t.Arch().FeatureSet(),
+	}
+
+	tc, se := t.Kernel().LoadTaskImage(t, loadArgs)
+	if se != nil {
+		return 0, nil, se.ToError()
+	}
+
+	ctrl, err := t.Execve(tc)
+	return 0, ctrl, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
new file mode 100644
index 000000000..3afcea665
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -0,0 +1,147 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Close implements Linux syscall close(2).
+func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	// Note that Remove provides a reference on the file that we may use to
+	// flush. It is still active until we drop the final reference below
+	// (and other reference-holding operations complete).
+	_, file := t.FDTable().Remove(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	err := file.OnClose(t)
+	return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file)
+}
+
+// Dup implements Linux syscall dup(2).
+func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
+	if err != nil {
+		return 0, nil, syserror.EMFILE
+	}
+	return uintptr(newFD), nil, nil
+}
+
+// Dup2 implements Linux syscall dup2(2).
+func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := args[0].Int()
+	newfd := args[1].Int()
+
+	if oldfd == newfd {
+		// As long as oldfd is valid, dup2() does nothing and returns newfd.
+		file := t.GetFileVFS2(oldfd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		file.DecRef()
+		return uintptr(newfd), nil, nil
+	}
+
+	return dup3(t, oldfd, newfd, 0)
+}
+
+// Dup3 implements Linux syscall dup3(2).
+func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := args[0].Int()
+	newfd := args[1].Int()
+	flags := args[2].Uint()
+
+	if oldfd == newfd {
+		return 0, nil, syserror.EINVAL
+	}
+
+	return dup3(t, oldfd, newfd, flags)
+}
+
+func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) {
+	if flags&^linux.O_CLOEXEC != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(oldfd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.O_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(newfd), nil, nil
+}
+
+// Fcntl implements linux syscall fcntl(2).
+func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	cmd := args[1].Int()
+
+	file, flags := t.FDTable().GetVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	switch cmd {
+	case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
+		minfd := args[2].Int()
+		fd, err := t.NewFDFromVFS2(minfd, file, kernel.FDFlags{
+			CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
+		})
+		if err != nil {
+			return 0, nil, err
+		}
+		return uintptr(fd), nil, nil
+	case linux.F_GETFD:
+		return uintptr(flags.ToLinuxFDFlags()), nil, nil
+	case linux.F_SETFD:
+		flags := args[2].Uint()
+		t.FDTable().SetFlags(fd, kernel.FDFlags{
+			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
+		})
+		return 0, nil, nil
+	case linux.F_GETFL:
+		return uintptr(file.StatusFlags()), nil, nil
+	case linux.F_SETFL:
+		return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())
+	default:
+		// TODO(gvisor.dev/issue/1623): Everything else is not yet supported.
+		return 0, nil, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
new file mode 100644
index 000000000..fc5ceea4c
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -0,0 +1,326 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Link implements Linux syscall link(2).
+func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldpathAddr := args[0].Pointer()
+	newpathAddr := args[1].Pointer()
+	return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
+}
+
+// Linkat implements Linux syscall linkat(2).
+func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	flags := args[4].Int()
+	return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
+}
+
+func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+	if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) {
+		return syserror.ENOENT
+	}
+
+	oldpath, err := copyInPath(t, oldpathAddr)
+	if err != nil {
+		return err
+	}
+	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0))
+	if err != nil {
+		return err
+	}
+	defer oldtpop.Release()
+
+	newpath, err := copyInPath(t, newpathAddr)
+	if err != nil {
+		return err
+	}
+	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer newtpop.Release()
+
+	return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop)
+}
+
+// Mkdir implements Linux syscall mkdir(2).
+func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+	return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode)
+}
+
+// Mkdirat implements Linux syscall mkdirat(2).
+func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+	return 0, nil, mkdirat(t, dirfd, addr, mode)
+}
+
+func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error {
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{
+		Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()),
+	})
+}
+
+// Mknod implements Linux syscall mknod(2).
+func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+	dev := args[2].Uint()
+	return 0, nil, mknodat(t, linux.AT_FDCWD, addr, mode, dev)
+}
+
+// Mknodat implements Linux syscall mknodat(2).
+func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+	dev := args[3].Uint()
+	return 0, nil, mknodat(t, dirfd, addr, mode, dev)
+}
+
+func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error {
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	major, minor := linux.DecodeDeviceID(dev)
+	return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{
+		Mode:     linux.FileMode(mode &^ t.FSContext().Umask()),
+		DevMajor: uint32(major),
+		DevMinor: minor,
+	})
+}
+
+// Open implements Linux syscall open(2).
+func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Uint()
+	mode := args[2].ModeT()
+	return openat(t, linux.AT_FDCWD, addr, flags, mode)
+}
+
+// Openat implements Linux syscall openat(2).
+func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	flags := args[2].Uint()
+	mode := args[3].ModeT()
+	return openat(t, dirfd, addr, flags, mode)
+}
+
+// Creat implements Linux syscall creat(2).
+func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+	return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode)
+}
+
+func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0))
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{
+		Flags: flags,
+		Mode:  linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()),
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.O_CLOEXEC != 0,
+	})
+	return uintptr(fd), nil, err
+}
+
+// Rename implements Linux syscall rename(2).
+func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldpathAddr := args[0].Pointer()
+	newpathAddr := args[1].Pointer()
+	return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
+}
+
+// Renameat implements Linux syscall renameat(2).
+func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */)
+}
+
+// Renameat2 implements Linux syscall renameat2(2).
+func Renameat2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	flags := args[4].Uint()
+	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
+}
+
+func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags uint32) error {
+	oldpath, err := copyInPath(t, oldpathAddr)
+	if err != nil {
+		return err
+	}
+	// "If oldpath refers to a symbolic link, the link is renamed" - rename(2)
+	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer oldtpop.Release()
+
+	newpath, err := copyInPath(t, newpathAddr)
+	if err != nil {
+		return err
+	}
+	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer newtpop.Release()
+
+	return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{
+		Flags: flags,
+	})
+}
+
+// Rmdir implements Linux syscall rmdir(2).
+func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr)
+}
+
+func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop)
+}
+
+// Unlink implements Linux syscall unlink(2).
+func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr)
+}
+
+func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop)
+}
+
+// Unlinkat implements Linux syscall unlinkat(2).
+func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if flags&^linux.AT_REMOVEDIR != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if flags&linux.AT_REMOVEDIR != 0 {
+		return 0, nil, rmdirat(t, dirfd, pathAddr)
+	}
+	return 0, nil, unlinkat(t, dirfd, pathAddr)
+}
+
+// Symlink implements Linux syscall symlink(2).
+func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	targetAddr := args[0].Pointer()
+	linkpathAddr := args[1].Pointer()
+	return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr)
+}
+
+// Symlinkat implements Linux syscall symlinkat(2).
+func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	targetAddr := args[0].Pointer()
+	newdirfd := args[1].Int()
+	linkpathAddr := args[2].Pointer()
+	return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr)
+}
+
+func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpathAddr usermem.Addr) error {
+	target, err := t.CopyInString(targetAddr, linux.PATH_MAX)
+	if err != nil {
+		return err
+	}
+	linkpath, err := copyInPath(t, linkpathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
new file mode 100644
index 000000000..317409a18
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Getcwd implements Linux syscall getcwd(2).
+func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	size := args[1].SizeT()
+
+	root := t.FSContext().RootDirectoryVFS2()
+	wd := t.FSContext().WorkingDirectoryVFS2()
+	s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd)
+	root.DecRef()
+	wd.DecRef()
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Note this is >= because we need a terminator.
+	if uint(len(s)) >= size {
+		return 0, nil, syserror.ERANGE
+	}
+
+	// Construct a byte slice containing a NUL terminator.
+	buf := t.CopyScratchBuffer(len(s) + 1)
+	copy(buf, s)
+	buf[len(buf)-1] = 0
+
+	// Write the pathname slice.
+	n, err := t.CopyOutBytes(addr, buf)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Chdir implements Linux syscall chdir(2).
+func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+		CheckSearchable: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	t.FSContext().SetWorkingDirectoryVFS2(vd)
+	vd.DecRef()
+	return 0, nil, nil
+}
+
+// Fchdir implements Linux syscall fchdir(2).
+func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+		CheckSearchable: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	t.FSContext().SetWorkingDirectoryVFS2(vd)
+	vd.DecRef()
+	return 0, nil, nil
+}
+
+// Chroot implements Linux syscall chroot(2).
+func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
+		return 0, nil, syserror.EPERM
+	}
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+		CheckSearchable: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	t.FSContext().SetRootDirectoryVFS2(vd)
+	vd.DecRef()
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
new file mode 100644
index 000000000..ddc140b65
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -0,0 +1,149 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Getdents implements Linux syscall getdents(2).
+func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getdents(t, args, false /* isGetdents64 */)
+}
+
+// Getdents64 implements Linux syscall getdents64(2).
+func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getdents(t, args, true /* isGetdents64 */)
+}
+
+func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := int(args[2].Uint())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	cb := getGetdentsCallback(t, addr, size, isGetdents64)
+	err := file.IterDirents(t, cb)
+	n := size - cb.remaining
+	putGetdentsCallback(cb)
+	if n == 0 {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+type getdentsCallback struct {
+	t            *kernel.Task
+	addr         usermem.Addr
+	remaining    int
+	isGetdents64 bool
+}
+
+var getdentsCallbackPool = sync.Pool{
+	New: func() interface{} {
+		return &getdentsCallback{}
+	},
+}
+
+func getGetdentsCallback(t *kernel.Task, addr usermem.Addr, size int, isGetdents64 bool) *getdentsCallback {
+	cb := getdentsCallbackPool.Get().(*getdentsCallback)
+	*cb = getdentsCallback{
+		t:            t,
+		addr:         addr,
+		remaining:    size,
+		isGetdents64: isGetdents64,
+	}
+	return cb
+}
+
+func putGetdentsCallback(cb *getdentsCallback) {
+	cb.t = nil
+	getdentsCallbackPool.Put(cb)
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
+	var buf []byte
+	if cb.isGetdents64 {
+		// struct linux_dirent64 {
+		//     ino64_t        d_ino;    /* 64-bit inode number */
+		//     off64_t        d_off;    /* 64-bit offset to next structure */
+		//     unsigned short d_reclen; /* Size of this dirent */
+		//     unsigned char  d_type;   /* File type */
+		//     char           d_name[]; /* Filename (null-terminated) */
+		// };
+		size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
+		if size < cb.remaining {
+			return syserror.EINVAL
+		}
+		buf = cb.t.CopyScratchBuffer(size)
+		usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
+		usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
+		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
+		buf[18] = dirent.Type
+		copy(buf[19:], dirent.Name)
+		buf[size-1] = 0 // NUL terminator
+	} else {
+		// struct linux_dirent {
+		//     unsigned long  d_ino;     /* Inode number */
+		//     unsigned long  d_off;     /* Offset to next linux_dirent */
+		//     unsigned short d_reclen;  /* Length of this linux_dirent */
+		//     char           d_name[];  /* Filename (null-terminated) */
+		//                       /* length is actually (d_reclen - 2 -
+		//                          offsetof(struct linux_dirent, d_name)) */
+		//     /*
+		//     char           pad;       // Zero padding byte
+		//     char           d_type;    // File type (only since Linux
+		//                               // 2.6.4); offset is (d_reclen - 1)
+		//     */
+		// };
+		if cb.t.Arch().Width() != 8 {
+			panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width()))
+		}
+		size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name)
+		if size < cb.remaining {
+			return syserror.EINVAL
+		}
+		buf = cb.t.CopyScratchBuffer(size)
+		usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
+		usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
+		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
+		copy(buf[18:], dirent.Name)
+		buf[size-3] = 0 // NUL terminator
+		buf[size-2] = 0 // zero padding byte
+		buf[size-1] = dirent.Type
+	}
+	n, err := cb.t.CopyOutBytes(cb.addr, buf)
+	if err != nil {
+		// Don't report partially-written dirents by advancing cb.addr or
+		// cb.remaining.
+		return err
+	}
+	cb.addr += usermem.Addr(n)
+	cb.remaining -= n
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
new file mode 100644
index 000000000..5a2418da9
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Ioctl implements Linux syscall ioctl(2).
+func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	ret, err := file.Ioctl(t, t.MemoryManager(), args)
+	return ret, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
index e0ac32b33..7d220bc20 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build amd64
+
 package vfs2
 
 import (
@@ -22,110 +24,142 @@ import (
 // Override syscall table to add syscalls implementations from this package.
 func Override(table map[uintptr]kernel.Syscall) {
 	table[0] = syscalls.Supported("read", Read)
-
-	// Remove syscalls that haven't been converted yet. It's better to get ENOSYS
-	// rather than a SIGSEGV deep in the stack.
-	delete(table, 1)   // write
-	delete(table, 2)   // open
-	delete(table, 3)   // close
-	delete(table, 4)   // stat
-	delete(table, 5)   // fstat
-	delete(table, 6)   // lstat
-	delete(table, 7)   // poll
-	delete(table, 8)   // lseek
-	delete(table, 9)   // mmap
-	delete(table, 16)  // ioctl
-	delete(table, 17)  // pread64
-	delete(table, 18)  // pwrite64
-	delete(table, 19)  // readv
-	delete(table, 20)  // writev
-	delete(table, 21)  // access
-	delete(table, 22)  // pipe
-	delete(table, 32)  // dup
-	delete(table, 33)  // dup2
-	delete(table, 40)  // sendfile
-	delete(table, 59)  // execve
-	delete(table, 72)  // fcntl
-	delete(table, 73)  // flock
-	delete(table, 74)  // fsync
-	delete(table, 75)  // fdatasync
-	delete(table, 76)  // truncate
-	delete(table, 77)  // ftruncate
-	delete(table, 78)  // getdents
-	delete(table, 79)  // getcwd
-	delete(table, 80)  // chdir
-	delete(table, 81)  // fchdir
-	delete(table, 82)  // rename
-	delete(table, 83)  // mkdir
-	delete(table, 84)  // rmdir
-	delete(table, 85)  // creat
-	delete(table, 86)  // link
-	delete(table, 87)  // unlink
-	delete(table, 88)  // symlink
-	delete(table, 89)  // readlink
-	delete(table, 90)  // chmod
-	delete(table, 91)  // fchmod
-	delete(table, 92)  // chown
-	delete(table, 93)  // fchown
-	delete(table, 94)  // lchown
-	delete(table, 133) // mknod
-	delete(table, 137) // statfs
-	delete(table, 138) // fstatfs
-	delete(table, 161) // chroot
-	delete(table, 162) // sync
+	table[1] = syscalls.Supported("write", Write)
+	table[2] = syscalls.Supported("open", Open)
+	table[3] = syscalls.Supported("close", Close)
+	table[4] = syscalls.Supported("stat", Stat)
+	table[5] = syscalls.Supported("fstat", Fstat)
+	table[6] = syscalls.Supported("lstat", Lstat)
+	table[7] = syscalls.Supported("poll", Poll)
+	table[8] = syscalls.Supported("lseek", Lseek)
+	table[9] = syscalls.Supported("mmap", Mmap)
+	table[16] = syscalls.Supported("ioctl", Ioctl)
+	table[17] = syscalls.Supported("pread64", Pread64)
+	table[18] = syscalls.Supported("pwrite64", Pwrite64)
+	table[19] = syscalls.Supported("readv", Readv)
+	table[20] = syscalls.Supported("writev", Writev)
+	table[21] = syscalls.Supported("access", Access)
+	delete(table, 22) // pipe
+	table[23] = syscalls.Supported("select", Select)
+	table[32] = syscalls.Supported("dup", Dup)
+	table[33] = syscalls.Supported("dup2", Dup2)
+	delete(table, 40) // sendfile
+	delete(table, 41) // socket
+	delete(table, 42) // connect
+	delete(table, 43) // accept
+	delete(table, 44) // sendto
+	delete(table, 45) // recvfrom
+	delete(table, 46) // sendmsg
+	delete(table, 47) // recvmsg
+	delete(table, 48) // shutdown
+	delete(table, 49) // bind
+	delete(table, 50) // listen
+	delete(table, 51) // getsockname
+	delete(table, 52) // getpeername
+	delete(table, 53) // socketpair
+	delete(table, 54) // setsockopt
+	delete(table, 55) // getsockopt
+	table[59] = syscalls.Supported("execve", Execve)
+	table[72] = syscalls.Supported("fcntl", Fcntl)
+	delete(table, 73) // flock
+	table[74] = syscalls.Supported("fsync", Fsync)
+	table[75] = syscalls.Supported("fdatasync", Fdatasync)
+	table[76] = syscalls.Supported("truncate", Truncate)
+	table[77] = syscalls.Supported("ftruncate", Ftruncate)
+	table[78] = syscalls.Supported("getdents", Getdents)
+	table[79] = syscalls.Supported("getcwd", Getcwd)
+	table[80] = syscalls.Supported("chdir", Chdir)
+	table[81] = syscalls.Supported("fchdir", Fchdir)
+	table[82] = syscalls.Supported("rename", Rename)
+	table[83] = syscalls.Supported("mkdir", Mkdir)
+	table[84] = syscalls.Supported("rmdir", Rmdir)
+	table[85] = syscalls.Supported("creat", Creat)
+	table[86] = syscalls.Supported("link", Link)
+	table[87] = syscalls.Supported("unlink", Unlink)
+	table[88] = syscalls.Supported("symlink", Symlink)
+	table[89] = syscalls.Supported("readlink", Readlink)
+	table[90] = syscalls.Supported("chmod", Chmod)
+	table[91] = syscalls.Supported("fchmod", Fchmod)
+	table[92] = syscalls.Supported("chown", Chown)
+	table[93] = syscalls.Supported("fchown", Fchown)
+	table[94] = syscalls.Supported("lchown", Lchown)
+	table[132] = syscalls.Supported("utime", Utime)
+	table[133] = syscalls.Supported("mknod", Mknod)
+	table[137] = syscalls.Supported("statfs", Statfs)
+	table[138] = syscalls.Supported("fstatfs", Fstatfs)
+	table[161] = syscalls.Supported("chroot", Chroot)
+	table[162] = syscalls.Supported("sync", Sync)
 	delete(table, 165) // mount
 	delete(table, 166) // umount2
-	delete(table, 172) // iopl
-	delete(table, 173) // ioperm
 	delete(table, 187) // readahead
-	delete(table, 188) // setxattr
-	delete(table, 189) // lsetxattr
-	delete(table, 190) // fsetxattr
-	delete(table, 191) // getxattr
-	delete(table, 192) // lgetxattr
-	delete(table, 193) // fgetxattr
+	table[188] = syscalls.Supported("setxattr", Setxattr)
+	table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
+	table[190] = syscalls.Supported("fsetxattr", Fsetxattr)
+	table[191] = syscalls.Supported("getxattr", Getxattr)
+	table[192] = syscalls.Supported("lgetxattr", Lgetxattr)
+	table[193] = syscalls.Supported("fgetxattr", Fgetxattr)
+	table[194] = syscalls.Supported("listxattr", Listxattr)
+	table[195] = syscalls.Supported("llistxattr", Llistxattr)
+	table[196] = syscalls.Supported("flistxattr", Flistxattr)
+	table[197] = syscalls.Supported("removexattr", Removexattr)
+	table[198] = syscalls.Supported("lremovexattr", Lremovexattr)
+	table[199] = syscalls.Supported("fremovexattr", Fremovexattr)
 	delete(table, 206) // io_setup
 	delete(table, 207) // io_destroy
 	delete(table, 208) // io_getevents
 	delete(table, 209) // io_submit
 	delete(table, 210) // io_cancel
-	delete(table, 213) // epoll_create
-	delete(table, 214) // epoll_ctl_old
-	delete(table, 215) // epoll_wait_old
-	delete(table, 216) // remap_file_pages
-	delete(table, 217) // getdents64
-	delete(table, 232) // epoll_wait
-	delete(table, 233) // epoll_ctl
+	table[213] = syscalls.Supported("epoll_create", EpollCreate)
+	table[217] = syscalls.Supported("getdents64", Getdents64)
+	delete(table, 221) // fdavise64
+	table[232] = syscalls.Supported("epoll_wait", EpollWait)
+	table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
+	table[235] = syscalls.Supported("utimes", Utimes)
 	delete(table, 253) // inotify_init
 	delete(table, 254) // inotify_add_watch
 	delete(table, 255) // inotify_rm_watch
-	delete(table, 257) // openat
-	delete(table, 258) // mkdirat
-	delete(table, 259) // mknodat
-	delete(table, 260) // fchownat
-	delete(table, 261) // futimesat
-	delete(table, 262) // fstatat
-	delete(table, 263) // unlinkat
-	delete(table, 264) // renameat
-	delete(table, 265) // linkat
-	delete(table, 266) // symlinkat
-	delete(table, 267) // readlinkat
-	delete(table, 268) // fchmodat
-	delete(table, 269) // faccessat
-	delete(table, 270) // pselect
-	delete(table, 271) // ppoll
+	table[257] = syscalls.Supported("openat", Openat)
+	table[258] = syscalls.Supported("mkdirat", Mkdirat)
+	table[259] = syscalls.Supported("mknodat", Mknodat)
+	table[260] = syscalls.Supported("fchownat", Fchownat)
+	table[261] = syscalls.Supported("futimens", Futimens)
+	table[262] = syscalls.Supported("newfstatat", Newfstatat)
+	table[263] = syscalls.Supported("unlinkat", Unlinkat)
+	table[264] = syscalls.Supported("renameat", Renameat)
+	table[265] = syscalls.Supported("linkat", Linkat)
+	table[266] = syscalls.Supported("symlinkat", Symlinkat)
+	table[267] = syscalls.Supported("readlinkat", Readlinkat)
+	table[268] = syscalls.Supported("fchmodat", Fchmodat)
+	table[269] = syscalls.Supported("faccessat", Faccessat)
+	table[270] = syscalls.Supported("pselect", Pselect)
+	table[271] = syscalls.Supported("ppoll", Ppoll)
+	delete(table, 275) // splice
+	delete(table, 276) // tee
+	table[277] = syscalls.Supported("sync_file_range", SyncFileRange)
+	table[280] = syscalls.Supported("utimensat", Utimensat)
+	table[281] = syscalls.Supported("epoll_pwait", EpollPwait)
+	delete(table, 282) // signalfd
+	delete(table, 283) // timerfd_create
+	delete(table, 284) // eventfd
 	delete(table, 285) // fallocate
-	delete(table, 291) // epoll_create1
-	delete(table, 292) // dup3
+	delete(table, 286) // timerfd_settime
+	delete(table, 287) // timerfd_gettime
+	delete(table, 288) // accept4
+	delete(table, 289) // signalfd4
+	delete(table, 290) // eventfd2
+	table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
+	table[292] = syscalls.Supported("dup3", Dup3)
 	delete(table, 293) // pipe2
 	delete(table, 294) // inotify_init1
-	delete(table, 295) // preadv
-	delete(table, 296) // pwritev
-	delete(table, 306) // syncfs
-	delete(table, 316) // renameat2
+	table[295] = syscalls.Supported("preadv", Preadv)
+	table[296] = syscalls.Supported("pwritev", Pwritev)
+	delete(table, 299) // recvmmsg
+	table[306] = syscalls.Supported("syncfs", Syncfs)
+	delete(table, 307) // sendmmsg
+	table[316] = syscalls.Supported("renameat2", Renameat2)
 	delete(table, 319) // memfd_create
-	delete(table, 322) // execveat
-	delete(table, 327) // preadv2
-	delete(table, 328) // pwritev2
-	delete(table, 332) // statx
+	table[322] = syscalls.Supported("execveat", Execveat)
+	table[327] = syscalls.Supported("preadv2", Preadv2)
+	table[328] = syscalls.Supported("pwritev2", Pwritev2)
+	table[332] = syscalls.Supported("statx", Statx)
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
index 6af5c400f..a6b367468 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build arm64
+
 package vfs2
 
 import (
diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go
new file mode 100644
index 000000000..60a43f0a0
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go
@@ -0,0 +1,92 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mmap implements Linux syscall mmap(2).
+func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	prot := args[2].Int()
+	flags := args[3].Int()
+	fd := args[4].Int()
+	fixed := flags&linux.MAP_FIXED != 0
+	private := flags&linux.MAP_PRIVATE != 0
+	shared := flags&linux.MAP_SHARED != 0
+	anon := flags&linux.MAP_ANONYMOUS != 0
+	map32bit := flags&linux.MAP_32BIT != 0
+
+	// Require exactly one of MAP_PRIVATE and MAP_SHARED.
+	if private == shared {
+		return 0, nil, syserror.EINVAL
+	}
+
+	opts := memmap.MMapOpts{
+		Length:   args[1].Uint64(),
+		Offset:   args[5].Uint64(),
+		Addr:     args[0].Pointer(),
+		Fixed:    fixed,
+		Unmap:    fixed,
+		Map32Bit: map32bit,
+		Private:  private,
+		Perms: usermem.AccessType{
+			Read:    linux.PROT_READ&prot != 0,
+			Write:   linux.PROT_WRITE&prot != 0,
+			Execute: linux.PROT_EXEC&prot != 0,
+		},
+		MaxPerms:  usermem.AnyAccess,
+		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
+		Precommit: linux.MAP_POPULATE&flags != 0,
+	}
+	if linux.MAP_LOCKED&flags != 0 {
+		opts.MLockMode = memmap.MLockEager
+	}
+	defer func() {
+		if opts.MappingIdentity != nil {
+			opts.MappingIdentity.DecRef()
+		}
+	}()
+
+	if !anon {
+		// Convert the passed FD to a file reference.
+		file := t.GetFileVFS2(fd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer file.DecRef()
+
+		// mmap unconditionally requires that the FD is readable.
+		if !file.IsReadable() {
+			return 0, nil, syserror.EACCES
+		}
+		// MAP_SHARED requires that the FD be writable for PROT_WRITE.
+		if shared && !file.IsWritable() {
+			opts.MaxPerms.Write = false
+		}
+
+		if err := file.ConfigureMMap(t, &opts); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	rv, err := t.MemoryManager().MMap(t, opts)
+	return uintptr(rv), nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go
new file mode 100644
index 000000000..97da6c647
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/path.go
@@ -0,0 +1,94 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func copyInPath(t *kernel.Task, addr usermem.Addr) (fspath.Path, error) {
+	pathname, err := t.CopyInString(addr, linux.PATH_MAX)
+	if err != nil {
+		return fspath.Path{}, err
+	}
+	return fspath.Parse(pathname), nil
+}
+
+type taskPathOperation struct {
+	pop          vfs.PathOperation
+	haveStartRef bool
+}
+
+func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink) (taskPathOperation, error) {
+	root := t.FSContext().RootDirectoryVFS2()
+	start := root
+	haveStartRef := false
+	if !path.Absolute {
+		if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
+			root.DecRef()
+			return taskPathOperation{}, syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			haveStartRef = true
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				root.DecRef()
+				return taskPathOperation{}, syserror.EBADF
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			haveStartRef = true
+			dirfile.DecRef()
+		}
+	}
+	return taskPathOperation{
+		pop: vfs.PathOperation{
+			Root:               root,
+			Start:              start,
+			Path:               path,
+			FollowFinalSymlink: bool(shouldFollowFinalSymlink),
+		},
+		haveStartRef: haveStartRef,
+	}, nil
+}
+
+func (tpop *taskPathOperation) Release() {
+	tpop.pop.Root.DecRef()
+	if tpop.haveStartRef {
+		tpop.pop.Start.DecRef()
+		tpop.haveStartRef = false
+	}
+}
+
+type shouldAllowEmptyPath bool
+
+const (
+	disallowEmptyPath shouldAllowEmptyPath = false
+	allowEmptyPath    shouldAllowEmptyPath = true
+)
+
+type shouldFollowFinalSymlink bool
+
+const (
+	nofollowFinalSymlink shouldFollowFinalSymlink = false
+	followFinalSymlink   shouldFollowFinalSymlink = true
+)
diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go
new file mode 100644
index 000000000..dbf4882da
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/poll.go
@@ -0,0 +1,584 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"fmt"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// fileCap is the maximum allowable files for poll & select. This has no
+// equivalent in Linux; it exists in gVisor since allocation failure in Go is
+// unrecoverable.
+const fileCap = 1024 * 1024
+
+// Masks for "readable", "writable", and "exceptional" events as defined by
+// select(2).
+const (
+	// selectReadEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLIN_SET.
+	selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR
+
+	// selectWriteEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLOUT_SET.
+	selectWriteEvents = linux.POLLOUT | linux.POLLERR
+
+	// selectExceptEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLEX_SET.
+	selectExceptEvents = linux.POLLPRI
+)
+
+// pollState tracks the associated file description and waiter of a PollFD.
+type pollState struct {
+	file   *vfs.FileDescription
+	waiter waiter.Entry
+}
+
+// initReadiness gets the current ready mask for the file represented by the FD
+// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is
+// used to register with the file for event notifications, and a reference to
+// the file is stored in "state".
+func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) {
+	if pfd.FD < 0 {
+		pfd.REvents = 0
+		return
+	}
+
+	file := t.GetFileVFS2(pfd.FD)
+	if file == nil {
+		pfd.REvents = linux.POLLNVAL
+		return
+	}
+
+	if ch == nil {
+		defer file.DecRef()
+	} else {
+		state.file = file
+		state.waiter, _ = waiter.NewChannelEntry(ch)
+		file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events)))
+	}
+
+	r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events)))
+	pfd.REvents = int16(r.ToLinux()) & pfd.Events
+}
+
+// releaseState releases all the pollState in "state".
+func releaseState(state []pollState) {
+	for i := range state {
+		if state[i].file != nil {
+			state[i].file.EventUnregister(&state[i].waiter)
+			state[i].file.DecRef()
+		}
+	}
+}
+
+// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout"
+// when "timeout" is greater than zero.
+//
+// pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or
+// positive if interrupted by a signal.
+func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) {
+	var ch chan struct{}
+	if timeout != 0 {
+		ch = make(chan struct{}, 1)
+	}
+
+	// Register for event notification in the files involved if we may
+	// block (timeout not zero). Once we find a file that has a non-zero
+	// result, we stop registering for events but still go through all files
+	// to get their ready masks.
+	state := make([]pollState, len(pfd))
+	defer releaseState(state)
+	n := uintptr(0)
+	for i := range pfd {
+		initReadiness(t, &pfd[i], &state[i], ch)
+		if pfd[i].REvents != 0 {
+			n++
+			ch = nil
+		}
+	}
+
+	if timeout == 0 {
+		return timeout, n, nil
+	}
+
+	haveTimeout := timeout >= 0
+
+	for n == 0 {
+		var err error
+		// Wait for a notification.
+		timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout)
+		if err != nil {
+			if err == syserror.ETIMEDOUT {
+				err = nil
+			}
+			return timeout, 0, err
+		}
+
+		// We got notified, count how many files are ready. If none,
+		// then this was a spurious notification, and we just go back
+		// to sleep with the remaining timeout.
+		for i := range state {
+			if state[i].file == nil {
+				continue
+			}
+
+			r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events)))
+			rl := int16(r.ToLinux()) & pfd[i].Events
+			if rl != 0 {
+				pfd[i].REvents = rl
+				n++
+			}
+		}
+	}
+
+	return timeout, n, nil
+}
+
+// copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max.
+func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) {
+	if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) {
+		return nil, syserror.EINVAL
+	}
+
+	pfd := make([]linux.PollFD, nfds)
+	if nfds > 0 {
+		if _, err := t.CopyIn(addr, &pfd); err != nil {
+			return nil, err
+		}
+	}
+
+	return pfd, nil
+}
+
+func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) {
+	pfd, err := copyInPollFDs(t, addr, nfds)
+	if err != nil {
+		return timeout, 0, err
+	}
+
+	// Compatibility warning: Linux adds POLLHUP and POLLERR just before
+	// polling, in fs/select.c:do_pollfd(). Since pfd is copied out after
+	// polling, changing event masks here is an application-visible difference.
+	// (Linux also doesn't copy out event masks at all, only revents.)
+	for i := range pfd {
+		pfd[i].Events |= linux.POLLHUP | linux.POLLERR
+	}
+	remainingTimeout, n, err := pollBlock(t, pfd, timeout)
+	err = syserror.ConvertIntr(err, syserror.EINTR)
+
+	// The poll entries are copied out regardless of whether
+	// any are set or not. This aligns with the Linux behavior.
+	if nfds > 0 && err == nil {
+		if _, err := t.CopyOut(addr, pfd); err != nil {
+			return remainingTimeout, 0, err
+		}
+	}
+
+	return remainingTimeout, n, err
+}
+
+// CopyInFDSet copies an fd set from select(2)/pselect(2).
+func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) {
+	set := make([]byte, nBytes)
+
+	if addr != 0 {
+		if _, err := t.CopyIn(addr, &set); err != nil {
+			return nil, err
+		}
+		// If we only use part of the last byte, mask out the extraneous bits.
+		//
+		// N.B. This only works on little-endian architectures.
+		if nBitsInLastPartialByte != 0 {
+			set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte
+		}
+	}
+	return set, nil
+}
+
+func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
+	if nfds < 0 || nfds > fileCap {
+		return 0, syserror.EINVAL
+	}
+
+	// Calculate the size of the fd sets (one bit per fd).
+	nBytes := (nfds + 7) / 8
+	nBitsInLastPartialByte := nfds % 8
+
+	// Capture all the provided input vectors.
+	r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+	w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+	e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+
+	// Count how many FDs are actually being requested so that we can build
+	// a PollFD array.
+	fdCount := 0
+	for i := 0; i < nBytes; i++ {
+		v := r[i] | w[i] | e[i]
+		for v != 0 {
+			v &= (v - 1)
+			fdCount++
+		}
+	}
+
+	// Build the PollFD array.
+	pfd := make([]linux.PollFD, 0, fdCount)
+	var fd int32
+	for i := 0; i < nBytes; i++ {
+		rV, wV, eV := r[i], w[i], e[i]
+		v := rV | wV | eV
+		m := byte(1)
+		for j := 0; j < 8; j++ {
+			if (v & m) != 0 {
+				// Make sure the fd is valid and decrement the reference
+				// immediately to ensure we don't leak. Note, another thread
+				// might be about to close fd. This is racy, but that's
+				// OK. Linux is racy in the same way.
+				file := t.GetFileVFS2(fd)
+				if file == nil {
+					return 0, syserror.EBADF
+				}
+				file.DecRef()
+
+				var mask int16
+				if (rV & m) != 0 {
+					mask |= selectReadEvents
+				}
+
+				if (wV & m) != 0 {
+					mask |= selectWriteEvents
+				}
+
+				if (eV & m) != 0 {
+					mask |= selectExceptEvents
+				}
+
+				pfd = append(pfd, linux.PollFD{
+					FD:     fd,
+					Events: mask,
+				})
+			}
+
+			fd++
+			m <<= 1
+		}
+	}
+
+	// Do the syscall, then count the number of bits set.
+	if _, _, err = pollBlock(t, pfd, timeout); err != nil {
+		return 0, syserror.ConvertIntr(err, syserror.EINTR)
+	}
+
+	// r, w, and e are currently event mask bitsets; unset bits corresponding
+	// to events that *didn't* occur.
+	bitSetCount := uintptr(0)
+	for idx := range pfd {
+		events := pfd[idx].REvents
+		i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8)
+		m := byte(1) << j
+		if r[i]&m != 0 {
+			if (events & selectReadEvents) != 0 {
+				bitSetCount++
+			} else {
+				r[i] &^= m
+			}
+		}
+		if w[i]&m != 0 {
+			if (events & selectWriteEvents) != 0 {
+				bitSetCount++
+			} else {
+				w[i] &^= m
+			}
+		}
+		if e[i]&m != 0 {
+			if (events & selectExceptEvents) != 0 {
+				bitSetCount++
+			} else {
+				e[i] &^= m
+			}
+		}
+	}
+
+	// Copy updated vectors back.
+	if readFDs != 0 {
+		if _, err := t.CopyOut(readFDs, r); err != nil {
+			return 0, err
+		}
+	}
+
+	if writeFDs != 0 {
+		if _, err := t.CopyOut(writeFDs, w); err != nil {
+			return 0, err
+		}
+	}
+
+	if exceptFDs != 0 {
+		if _, err := t.CopyOut(exceptFDs, e); err != nil {
+			return 0, err
+		}
+	}
+
+	return bitSetCount, nil
+}
+
+// timeoutRemaining returns the amount of time remaining for the specified
+// timeout or 0 if it has elapsed.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration {
+	now := t.Kernel().MonotonicClock().Now()
+	remaining := timeout - now.Sub(startNs)
+	if remaining < 0 {
+		remaining = 0
+	}
+	return remaining
+}
+
+// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	remaining := timeoutRemaining(t, startNs, timeout)
+	tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds())
+	return tsRemaining.CopyOut(t, timespecAddr)
+}
+
+// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	remaining := timeoutRemaining(t, startNs, timeout)
+	tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds())
+	return tvRemaining.CopyOut(t, timevalAddr)
+}
+
+// pollRestartBlock encapsulates the state required to restart poll(2) via
+// restart_syscall(2).
+//
+// +stateify savable
+type pollRestartBlock struct {
+	pfdAddr usermem.Addr
+	nfds    uint
+	timeout time.Duration
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart.
+func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+	return poll(t, p.pfdAddr, p.nfds, p.timeout)
+}
+
+func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) {
+	remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout)
+	// On an interrupt poll(2) is restarted with the remaining timeout.
+	if err == syserror.EINTR {
+		t.SetSyscallRestartBlock(&pollRestartBlock{
+			pfdAddr: pfdAddr,
+			nfds:    nfds,
+			timeout: remainingTimeout,
+		})
+		return 0, kernel.ERESTART_RESTARTBLOCK
+	}
+	return n, err
+}
+
+// Poll implements linux syscall poll(2).
+func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pfdAddr := args[0].Pointer()
+	nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+	timeout := time.Duration(args[2].Int()) * time.Millisecond
+	n, err := poll(t, pfdAddr, nfds, timeout)
+	return n, nil, err
+}
+
+// Ppoll implements linux syscall ppoll(2).
+func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pfdAddr := args[0].Pointer()
+	nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+	timespecAddr := args[2].Pointer()
+	maskAddr := args[3].Pointer()
+	maskSize := uint(args[4].Uint())
+
+	timeout, err := copyTimespecInToDuration(t, timespecAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var startNs ktime.Time
+	if timeout > 0 {
+		startNs = t.Kernel().MonotonicClock().Now()
+	}
+
+	if err := setTempSignalSet(t, maskAddr, maskSize); err != nil {
+		return 0, nil, err
+	}
+
+	_, n, err := doPoll(t, pfdAddr, nfds, timeout)
+	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+	// doPoll returns EINTR if interrupted, but ppoll is normally restartable
+	// if interrupted by something other than a signal handled by the
+	// application (i.e. returns ERESTARTNOHAND). However, if
+	// copyOutTimespecRemaining failed, then the restarted ppoll would use the
+	// wrong timeout, so the error should be left as EINTR.
+	//
+	// Note that this means that if err is nil but copyErr is not, copyErr is
+	// ignored. This is consistent with Linux.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// Select implements linux syscall select(2).
+func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nfds := int(args[0].Int()) // select(2) uses an int.
+	readFDs := args[1].Pointer()
+	writeFDs := args[2].Pointer()
+	exceptFDs := args[3].Pointer()
+	timevalAddr := args[4].Pointer()
+
+	// Use a negative Duration to indicate "no timeout".
+	timeout := time.Duration(-1)
+	if timevalAddr != 0 {
+		var timeval linux.Timeval
+		if err := timeval.CopyIn(t, timevalAddr); err != nil {
+			return 0, nil, err
+		}
+		if timeval.Sec < 0 || timeval.Usec < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		timeout = time.Duration(timeval.ToNsecCapped())
+	}
+	startNs := t.Kernel().MonotonicClock().Now()
+	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+	copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
+	// See comment in Ppoll.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// Pselect implements linux syscall pselect(2).
+func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nfds := int(args[0].Int()) // select(2) uses an int.
+	readFDs := args[1].Pointer()
+	writeFDs := args[2].Pointer()
+	exceptFDs := args[3].Pointer()
+	timespecAddr := args[4].Pointer()
+	maskWithSizeAddr := args[5].Pointer()
+
+	timeout, err := copyTimespecInToDuration(t, timespecAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var startNs ktime.Time
+	if timeout > 0 {
+		startNs = t.Kernel().MonotonicClock().Now()
+	}
+
+	if maskWithSizeAddr != 0 {
+		if t.Arch().Width() != 8 {
+			panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width()))
+		}
+		var maskStruct sigSetWithSize
+		if err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil {
+			return 0, nil, err
+		}
+		if err := setTempSignalSet(t, usermem.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+	// See comment in Ppoll.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// +marshal
+type sigSetWithSize struct {
+	sigsetAddr   uint64
+	sizeofSigset uint64
+}
+
+// copyTimespecInToDuration copies a Timespec from the untrusted app range,
+// validates it and converts it to a Duration.
+//
+// If the Timespec is larger than what can be represented in a Duration, the
+// returned value is the maximum that Duration will allow.
+//
+// If timespecAddr is NULL, the returned value is negative.
+func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) {
+	// Use a negative Duration to indicate "no timeout".
+	timeout := time.Duration(-1)
+	if timespecAddr != 0 {
+		var timespec linux.Timespec
+		if err := timespec.CopyIn(t, timespecAddr); err != nil {
+			return 0, err
+		}
+		if !timespec.Valid() {
+			return 0, syserror.EINVAL
+		}
+		timeout = time.Duration(timespec.ToNsecCapped())
+	}
+	return timeout, nil
+}
+
+func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) error {
+	if maskAddr == 0 {
+		return nil
+	}
+	if maskSize != linux.SignalSetSize {
+		return syserror.EINVAL
+	}
+	var mask linux.SignalSet
+	if err := mask.CopyIn(t, maskAddr); err != nil {
+		return err
+	}
+	mask &^= kernel.UnblockableSignals
+	oldmask := t.SignalMask()
+	t.SetSignalMask(mask)
+	t.SetSavedSignalMask(oldmask)
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
new file mode 100644
index 000000000..35f6308d6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -0,0 +1,511 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	eventMaskRead  = waiter.EventIn | waiter.EventHUp | waiter.EventErr
+	eventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr
+)
+
+// Read implements Linux syscall read(2).
+func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := read(t, file, dst, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+}
+
+// Readv implements Linux syscall readv(2).
+func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Get the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := read(t, file, dst, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "readv", file)
+}
+
+func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	n, err := file.Read(t, dst, opts)
+	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskRead)
+
+	total := n
+	for {
+		// Shorten dst to reflect bytes previously read.
+		dst = dst.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err := file.Read(t, dst, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+		if err := t.Block(ch); err != nil {
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	return total, err
+}
+
+// Pread64 implements Linux syscall pread64(2).
+func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file)
+}
+
+// Preadv implements Linux syscall preadv(2).
+func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file)
+}
+
+// Preadv2 implements Linux syscall preadv2(2).
+func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// While the glibc signature is
+	// preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+	// the actual syscall
+	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142)
+	// splits the offset argument into a high/low value for compatibility with
+	// 32-bit architectures. The flags argument is the 6th argument (index 5).
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+	flags := args[5].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < -1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.ReadOptions{
+		Flags: uint32(flags),
+	}
+	var n int64
+	if offset == -1 {
+		n, err = read(t, file, dst, opts)
+	} else {
+		n, err = pread(t, file, dst, offset, opts)
+	}
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+}
+
+func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	n, err := file.PRead(t, dst, offset, opts)
+	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskRead)
+
+	total := n
+	for {
+		// Shorten dst to reflect bytes previously read.
+		dst = dst.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err := file.PRead(t, dst, offset+total, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+		if err := t.Block(ch); err != nil {
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	return total, err
+}
+
+// Write implements Linux syscall write(2).
+func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := write(t, file, src, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file)
+}
+
+// Writev implements Linux syscall writev(2).
+func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Get the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := write(t, file, src, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "writev", file)
+}
+
+func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	n, err := file.Write(t, src, opts)
+	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskWrite)
+
+	total := n
+	for {
+		// Shorten src to reflect bytes previously written.
+		src = src.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err := file.Write(t, src, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+		if err := t.Block(ch); err != nil {
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	return total, err
+}
+
+// Pwrite64 implements Linux syscall pwrite64(2).
+func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file)
+}
+
+// Pwritev implements Linux syscall pwritev(2).
+func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file)
+}
+
+// Pwritev2 implements Linux syscall pwritev2(2).
+func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// While the glibc signature is
+	// pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+	// the actual syscall
+	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162)
+	// splits the offset argument into a high/low value for compatibility with
+	// 32-bit architectures. The flags argument is the 6th argument (index 5).
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+	flags := args[5].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < -1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.WriteOptions{
+		Flags: uint32(flags),
+	}
+	var n int64
+	if offset == -1 {
+		n, err = write(t, file, src, opts)
+	} else {
+		n, err = pwrite(t, file, src, offset, opts)
+	}
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+}
+
+func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, err := file.PWrite(t, src, offset, opts)
+	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskWrite)
+
+	total := n
+	for {
+		// Shorten src to reflect bytes previously written.
+		src = src.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err := file.PWrite(t, src, offset+total, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+		if err := t.Block(ch); err != nil {
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	return total, err
+}
+
+// Lseek implements Linux syscall lseek(2).
+func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	offset := args[1].Int64()
+	whence := args[2].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	newoff, err := file.Seek(t, offset, whence)
+	return uintptr(newoff), nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
new file mode 100644
index 000000000..9250659ff
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -0,0 +1,380 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
+
+// Chmod implements Linux syscall chmod(2).
+func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	mode := args[1].ModeT()
+	return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode)
+}
+
+// Fchmodat implements Linux syscall fchmodat(2).
+func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	mode := args[2].ModeT()
+	return 0, nil, fchmodat(t, dirfd, pathAddr, mode)
+}
+
+func fchmodat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_MODE,
+			Mode: uint16(mode & chmodMask),
+		},
+	})
+}
+
+// Fchmod implements Linux syscall fchmod(2).
+func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	mode := args[1].ModeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.SetStat(t, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_MODE,
+			Mode: uint16(mode & chmodMask),
+		},
+	})
+}
+
+// Chown implements Linux syscall chown(2).
+func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	owner := args[1].Int()
+	group := args[2].Int()
+	return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */)
+}
+
+// Lchown implements Linux syscall lchown(2).
+func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	owner := args[1].Int()
+	group := args[2].Int()
+	return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW)
+}
+
+// Fchownat implements Linux syscall fchownat(2).
+func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	owner := args[2].Int()
+	group := args[3].Int()
+	flags := args[4].Int()
+	return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags)
+}
+
+func fchownat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, owner, group, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
+		return err
+	}
+
+	return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts)
+}
+
+func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error {
+	userns := t.UserNamespace()
+	if owner != -1 {
+		kuid := userns.MapToKUID(auth.UID(owner))
+		if !kuid.Ok() {
+			return syserror.EINVAL
+		}
+		opts.Stat.Mask |= linux.STATX_UID
+		opts.Stat.UID = uint32(kuid)
+	}
+	if group != -1 {
+		kgid := userns.MapToKGID(auth.GID(group))
+		if !kgid.Ok() {
+			return syserror.EINVAL
+		}
+		opts.Stat.Mask |= linux.STATX_GID
+		opts.Stat.GID = uint32(kgid)
+	}
+	return nil
+}
+
+// Fchown implements Linux syscall fchown(2).
+func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	owner := args[1].Int()
+	group := args[2].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, file.SetStat(t, opts)
+}
+
+// Truncate implements Linux syscall truncate(2).
+func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].Int64()
+
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: uint64(length),
+		},
+	})
+}
+
+// Ftruncate implements Linux syscall ftruncate(2).
+func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	length := args[1].Int64()
+
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.SetStat(t, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: uint64(length),
+		},
+	})
+}
+
+// Utime implements Linux syscall utime(2).
+func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	timesAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_ATIME | linux.STATX_MTIME,
+		},
+	}
+	if timesAddr == 0 {
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+	} else {
+		var times linux.Utime
+		if err := times.CopyIn(t, timesAddr); err != nil {
+			return 0, nil, err
+		}
+		opts.Stat.Atime.Sec = times.Actime
+		opts.Stat.Mtime.Sec = times.Modtime
+	}
+
+	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Utimes implements Linux syscall utimes(2).
+func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	timesAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_ATIME | linux.STATX_MTIME,
+		},
+	}
+	if timesAddr == 0 {
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+	} else {
+		var times [2]linux.Timeval
+		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+			return 0, nil, err
+		}
+		opts.Stat.Atime = linux.StatxTimestamp{
+			Sec:  times[0].Sec,
+			Nsec: uint32(times[0].Usec * 1000),
+		}
+		opts.Stat.Mtime = linux.StatxTimestamp{
+			Sec:  times[1].Sec,
+			Nsec: uint32(times[1].Usec * 1000),
+		}
+	}
+
+	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Utimensat implements Linux syscall utimensat(2).
+func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	timesAddr := args[2].Pointer()
+	flags := args[3].Int()
+
+	if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Futimens implements Linux syscall futimens(2).
+func Futimens(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	timesAddr := args[1].Pointer()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, file.SetStat(t, opts)
+}
+
+func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error {
+	if timesAddr == 0 {
+		opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+		return nil
+	}
+	var times [2]linux.Timespec
+	if _, err := t.CopyIn(timesAddr, &times); err != nil {
+		return err
+	}
+	if times[0].Nsec != linux.UTIME_OMIT {
+		opts.Stat.Mask |= linux.STATX_ATIME
+		opts.Stat.Atime = linux.StatxTimestamp{
+			Sec:  times[0].Sec,
+			Nsec: uint32(times[0].Nsec),
+		}
+	}
+	if times[1].Nsec != linux.UTIME_OMIT {
+		opts.Stat.Mask |= linux.STATX_MTIME
+		opts.Stat.Mtime = linux.StatxTimestamp{
+			Sec:  times[1].Sec,
+			Nsec: uint32(times[1].Nsec),
+		}
+	}
+	return nil
+}
+
+func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
+			return syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.SetStat() instead of
+				// VirtualFilesystem.SetStatAt(), since the former may be able
+				// to use opened file state to expedite the SetStat.
+				err := dirfile.SetStat(t, *opts)
+				dirfile.DecRef()
+				return err
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+	return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: bool(shouldFollowFinalSymlink),
+	}, opts)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
new file mode 100644
index 000000000..dca8d7011
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -0,0 +1,346 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Stat implements Linux syscall stat(2).
+func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	statAddr := args[1].Pointer()
+	return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, 0 /* flags */)
+}
+
+// Lstat implements Linux syscall lstat(2).
+func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	statAddr := args[1].Pointer()
+	return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, linux.AT_SYMLINK_NOFOLLOW)
+}
+
+// Newfstatat implements Linux syscall newfstatat, which backs fstatat(2).
+func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	statAddr := args[2].Pointer()
+	flags := args[3].Int()
+	return 0, nil, fstatat(t, dirfd, pathAddr, statAddr, flags)
+}
+
+func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+
+	opts := vfs.StatOptions{
+		Mask: linux.STATX_BASIC_STATS,
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.Stat() instead of
+				// VirtualFilesystem.StatAt() for fstatat(fd, ""), since the
+				// former may be able to use opened file state to expedite the
+				// Stat.
+				statx, err := dirfile.Stat(t, opts)
+				dirfile.DecRef()
+				if err != nil {
+					return err
+				}
+				var stat linux.Stat
+				convertStatxToUserStat(t, &statx, &stat)
+				return stat.CopyOut(t, statAddr)
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+
+	statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+	}, &opts)
+	if err != nil {
+		return err
+	}
+	var stat linux.Stat
+	convertStatxToUserStat(t, &statx, &stat)
+	return stat.CopyOut(t, statAddr)
+}
+
+// This takes both input and output as pointer arguments to avoid copying large
+// structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+	// Linux just copies fields from struct kstat without regard to struct
+	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+	userns := t.UserNamespace()
+	*stat = linux.Stat{
+		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+		Ino:     statx.Ino,
+		Nlink:   uint64(statx.Nlink),
+		Mode:    uint32(statx.Mode),
+		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+		Size:    int64(statx.Size),
+		Blksize: int64(statx.Blksize),
+		Blocks:  int64(statx.Blocks),
+		ATime:   timespecFromStatxTimestamp(statx.Atime),
+		MTime:   timespecFromStatxTimestamp(statx.Mtime),
+		CTime:   timespecFromStatxTimestamp(statx.Ctime),
+	}
+}
+
+func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec {
+	return linux.Timespec{
+		Sec:  sxts.Sec,
+		Nsec: int64(sxts.Nsec),
+	}
+}
+
+// Fstat implements Linux syscall fstat(2).
+func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	statAddr := args[1].Pointer()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	statx, err := file.Stat(t, vfs.StatOptions{
+		Mask: linux.STATX_BASIC_STATS,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	var stat linux.Stat
+	convertStatxToUserStat(t, &statx, &stat)
+	return 0, nil, stat.CopyOut(t, statAddr)
+}
+
+// Statx implements Linux syscall statx(2).
+func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	flags := args[2].Int()
+	mask := args[3].Uint()
+	statxAddr := args[4].Pointer()
+
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	opts := vfs.StatOptions{
+		Mask: mask,
+		Sync: uint32(flags & linux.AT_STATX_SYNC_TYPE),
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return 0, nil, syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return 0, nil, syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.Stat() instead of
+				// VirtualFilesystem.StatAt() for statx(fd, ""), since the
+				// former may be able to use opened file state to expedite the
+				// Stat.
+				statx, err := dirfile.Stat(t, opts)
+				dirfile.DecRef()
+				if err != nil {
+					return 0, nil, err
+				}
+				userifyStatx(t, &statx)
+				return 0, nil, statx.CopyOut(t, statxAddr)
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+
+	statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+	}, &opts)
+	if err != nil {
+		return 0, nil, err
+	}
+	userifyStatx(t, &statx)
+	return 0, nil, statx.CopyOut(t, statxAddr)
+}
+
+func userifyStatx(t *kernel.Task, statx *linux.Statx) {
+	userns := t.UserNamespace()
+	statx.UID = uint32(auth.KUID(statx.UID).In(userns).OrOverflow())
+	statx.GID = uint32(auth.KGID(statx.GID).In(userns).OrOverflow())
+}
+
+// Readlink implements Linux syscall readlink(2).
+func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	bufAddr := args[1].Pointer()
+	size := args[2].SizeT()
+	return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size)
+}
+
+// Access implements Linux syscall access(2).
+func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// FIXME(jamieliu): actually implement
+	return 0, nil, nil
+}
+
+// Faccessat implements Linux syscall access(2).
+func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// FIXME(jamieliu): actually implement
+	return 0, nil, nil
+}
+
+// Readlinkat implements Linux syscall mknodat(2).
+func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	bufAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	return readlinkat(t, dirfd, pathAddr, bufAddr, size)
+}
+
+func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, size uint) (uintptr, *kernel.SyscallControl, error) {
+	if int(size) <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	// "Since Linux 2.6.39, pathname can be an empty string, in which case the
+	// call operates on the symbolic link referred to by dirfd ..." -
+	// readlinkat(2)
+	tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if len(target) > int(size) {
+		target = target[:size]
+	}
+	n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target))
+	if n == 0 {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Statfs implements Linux syscall statfs(2).
+func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	bufAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, statfs.CopyOut(t, bufAddr)
+}
+
+// Fstatfs implements Linux syscall fstatfs(2).
+func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufAddr := args[1].Pointer()
+
+	tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, statfs.CopyOut(t, bufAddr)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go
new file mode 100644
index 000000000..365250b0b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/sync.go
@@ -0,0 +1,87 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements Linux syscall sync(2).
+func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, t.Kernel().VFS().SyncAllFilesystems(t)
+}
+
+// Syncfs implements Linux syscall syncfs(2).
+func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.SyncFS(t)
+}
+
+// Fsync implements Linux syscall fsync(2).
+func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.Sync(t)
+}
+
+// Fdatasync implements Linux syscall fdatasync(2).
+func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// TODO(gvisor.dev/issue/1897): Avoid writeback of unnecessary metadata.
+	return Fsync(t, args)
+}
+
+// SyncFileRange implements Linux syscall sync_file_range(2).
+func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	offset := args[1].Int64()
+	nbytes := args[2].Int64()
+	flags := args[3].Uint()
+
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if nbytes < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of
+	// [offset, offset+nbytes).
+	return 0, nil, file.Sync(t)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go
deleted file mode 100644
index 7667524c7..000000000
--- a/pkg/sentry/syscalls/linux/vfs2/sys_read.go
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs2
-
-import (
-	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/pkg/waiter"
-)
-
-const (
-	// EventMaskRead contains events that can be triggered on reads.
-	EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
-)
-
-// Read implements linux syscall read(2).  Note that we try to get a buffer that
-// is exactly the size requested because some applications like qemu expect
-// they can do large reads all at once.  Bug for bug.  Same for other read
-// calls below.
-func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	fd := args[0].Int()
-	addr := args[1].Pointer()
-	size := args[2].SizeT()
-
-	file := t.GetFileVFS2(fd)
-	if file == nil {
-		return 0, nil, syserror.EBADF
-	}
-	defer file.DecRef()
-
-	// Check that the size is legitimate.
-	si := int(size)
-	if si < 0 {
-		return 0, nil, syserror.EINVAL
-	}
-
-	// Get the destination of the read.
-	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
-	if err != nil {
-		return 0, nil, err
-	}
-
-	n, err := read(t, file, dst, vfs.ReadOptions{})
-	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
-}
-
-func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	n, err := file.Read(t, dst, opts)
-	if err != syserror.ErrWouldBlock {
-		return n, err
-	}
-
-	// Register for notifications.
-	w, ch := waiter.NewChannelEntry(nil)
-	file.EventRegister(&w, EventMaskRead)
-
-	total := n
-	for {
-		// Shorten dst to reflect bytes previously read.
-		dst = dst.DropFirst(int(n))
-
-		// Issue the request and break out if it completes with anything other than
-		// "would block".
-		n, err := file.Read(t, dst, opts)
-		total += n
-		if err != syserror.ErrWouldBlock {
-			break
-		}
-		if err := t.Block(ch); err != nil {
-			break
-		}
-	}
-	file.EventUnregister(&w)
-
-	return total, err
-}
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
new file mode 100644
index 000000000..89e9ff4d7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -0,0 +1,353 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"bytes"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Listxattr implements Linux syscall listxattr(2).
+func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listxattr(t, args, followFinalSymlink)
+}
+
+// Llistxattr implements Linux syscall llistxattr(2).
+func Llistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listxattr(t, args, nofollowFinalSymlink)
+}
+
+func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	listAddr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrNameList(t, listAddr, size, names)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Flistxattr implements Linux syscall flistxattr(2).
+func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	listAddr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	names, err := file.Listxattr(t)
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrNameList(t, listAddr, size, names)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Getxattr implements Linux syscall getxattr(2).
+func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getxattr(t, args, followFinalSymlink)
+}
+
+// Lgetxattr implements Linux syscall lgetxattr(2).
+func Lgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getxattr(t, args, nofollowFinalSymlink)
+}
+
+func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, name)
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrValue(t, valueAddr, size, value)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Fgetxattr implements Linux syscall fgetxattr(2).
+func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	value, err := file.Getxattr(t, name)
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrValue(t, valueAddr, size, value)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Setxattr implements Linux syscall setxattr(2).
+func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, setxattr(t, args, followFinalSymlink)
+}
+
+// Lsetxattr implements Linux syscall lsetxattr(2).
+func Lsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, setxattr(t, args, nofollowFinalSymlink)
+}
+
+func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	flags := args[4].Int()
+
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+	value, err := copyInXattrValue(t, valueAddr, size)
+	if err != nil {
+		return err
+	}
+
+	return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{
+		Name:  name,
+		Value: value,
+		Flags: uint32(flags),
+	})
+}
+
+// Fsetxattr implements Linux syscall fsetxattr(2).
+func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	flags := args[4].Int()
+
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	value, err := copyInXattrValue(t, valueAddr, size)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, file.Setxattr(t, vfs.SetxattrOptions{
+		Name:  name,
+		Value: value,
+		Flags: uint32(flags),
+	})
+}
+
+// Removexattr implements Linux syscall removexattr(2).
+func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, removexattr(t, args, followFinalSymlink)
+}
+
+// Lremovexattr implements Linux syscall lremovexattr(2).
+func Lremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, removexattr(t, args, nofollowFinalSymlink)
+}
+
+func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+
+	return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name)
+}
+
+// Fremovexattr implements Linux syscall fremovexattr(2).
+func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, file.Removexattr(t, name)
+}
+
+func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
+	name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1)
+	if err != nil {
+		if err == syserror.ENAMETOOLONG {
+			return "", syserror.ERANGE
+		}
+		return "", err
+	}
+	if len(name) == 0 {
+		return "", syserror.ERANGE
+	}
+	return name, nil
+}
+
+func copyOutXattrNameList(t *kernel.Task, listAddr usermem.Addr, size uint, names []string) (int, error) {
+	if size > linux.XATTR_LIST_MAX {
+		size = linux.XATTR_LIST_MAX
+	}
+	var buf bytes.Buffer
+	for _, name := range names {
+		buf.WriteString(name)
+		buf.WriteByte(0)
+	}
+	if size == 0 {
+		// Return the size that would be required to accomodate the list.
+		return buf.Len(), nil
+	}
+	if buf.Len() > int(size) {
+		if size >= linux.XATTR_LIST_MAX {
+			return 0, syserror.E2BIG
+		}
+		return 0, syserror.ERANGE
+	}
+	return t.CopyOutBytes(listAddr, buf.Bytes())
+}
+
+func copyInXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint) (string, error) {
+	if size > linux.XATTR_SIZE_MAX {
+		return "", syserror.E2BIG
+	}
+	buf := make([]byte, size)
+	if _, err := t.CopyInBytes(valueAddr, buf); err != nil {
+		return "", err
+	}
+	return gohacks.StringFromImmutableBytes(buf), nil
+}
+
+func copyOutXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint, value string) (int, error) {
+	if size > linux.XATTR_SIZE_MAX {
+		size = linux.XATTR_SIZE_MAX
+	}
+	if size == 0 {
+		// Return the size that would be required to accomodate the value.
+		return len(value), nil
+	}
+	if len(value) > int(size) {
+		if size >= linux.XATTR_SIZE_MAX {
+			return 0, syserror.E2BIG
+		}
+		return 0, syserror.ERANGE
+	}
+	return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value))
+}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 0b4f18ab5..07c8383e6 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -43,6 +43,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/gohacks",
         "//pkg/log",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs/lock",
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index eed41139b..3da45d744 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -202,6 +202,9 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin
 	// Add epi to file.epolls so that it is removed when the last
 	// FileDescription reference is dropped.
 	file.epollMu.Lock()
+	if file.epolls == nil {
+		file.epolls = make(map[*epollInterest]struct{})
+	}
 	file.epolls[epi] = struct{}{}
 	file.epollMu.Unlock()
 
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index 1fe766a44..bc7581698 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -26,6 +26,7 @@ import (
 	"sync/atomic"
 	"unsafe"
 
+	"gvisor.dev/gvisor/pkg/gohacks"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
@@ -160,7 +161,7 @@ func newMountTableSlots(cap uintptr) unsafe.Pointer {
 // Lookup may be called even if there are concurrent mutators of mt.
 func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
 	key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
-	hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
+	hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
 
 loop:
 	for {
@@ -361,12 +362,3 @@ func memhash(p unsafe.Pointer, seed, s uintptr) uintptr
 
 //go:linkname rand32 runtime.fastrand
 func rand32() uint32
-
-// This is copy/pasted from runtime.noescape(), and is needed because arguments
-// apparently escape from all functions defined by linkname.
-//
-//go:nosplit
-func noescape(p unsafe.Pointer) unsafe.Pointer {
-	x := uintptr(p)
-	return unsafe.Pointer(x ^ 0)
-}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 8a0b382f6..eb4ebb511 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -228,7 +228,7 @@ func (rp *ResolvingPath) Advance() {
 		rp.pit = next
 	} else { // at end of path segment, continue with next one
 		rp.curPart--
-		rp.pit = rp.parts[rp.curPart-1]
+		rp.pit = rp.parts[rp.curPart]
 	}
 }
 
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 8f29031b2..73f8043be 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -385,15 +385,11 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 				// Only a regular file can be executed.
 				stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
 				if err != nil {
+					fd.DecRef()
 					return nil, err
 				}
-				if stat.Mask&linux.STATX_TYPE != 0 {
-					// This shouldn't happen, but if type can't be retrieved, file can't
-					// be executed.
-					return nil, syserror.EACCES
-				}
-				if t := linux.FileMode(stat.Mode).FileType(); t != linux.ModeRegular {
-					ctx.Infof("%q is not a regular file: %v", pop.Path, t)
+				if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG {
+					fd.DecRef()
 					return nil, syserror.EACCES
 				}
 			}
diff --git a/pkg/usermem/BUILD b/pkg/usermem/BUILD
index ff8b9e91a..6c9ada9c7 100644
--- a/pkg/usermem/BUILD
+++ b/pkg/usermem/BUILD
@@ -25,7 +25,6 @@ go_library(
         "bytes_io_unsafe.go",
         "usermem.go",
         "usermem_arm64.go",
-        "usermem_unsafe.go",
         "usermem_x86.go",
     ],
     visibility = ["//:sandbox"],
@@ -33,6 +32,7 @@ go_library(
         "//pkg/atomicbitops",
         "//pkg/binary",
         "//pkg/context",
+        "//pkg/gohacks",
         "//pkg/log",
         "//pkg/safemem",
         "//pkg/syserror",
diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go
index 71fd4e155..d2f4403b0 100644
--- a/pkg/usermem/usermem.go
+++ b/pkg/usermem/usermem.go
@@ -23,6 +23,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/gohacks"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -251,7 +252,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt
 		}
 		end, ok := addr.AddLength(uint64(readlen))
 		if !ok {
-			return stringFromImmutableBytes(buf[:done]), syserror.EFAULT
+			return gohacks.StringFromImmutableBytes(buf[:done]), syserror.EFAULT
 		}
 		// Shorten the read to avoid crossing page boundaries, since faulting
 		// in a page unnecessarily is expensive. This also ensures that partial
@@ -272,16 +273,16 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt
 		// Look for the terminating zero byte, which may have occurred before
 		// hitting err.
 		if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 {
-			return stringFromImmutableBytes(buf[:done+i]), nil
+			return gohacks.StringFromImmutableBytes(buf[:done+i]), nil
 		}
 
 		done += n
 		if err != nil {
-			return stringFromImmutableBytes(buf[:done]), err
+			return gohacks.StringFromImmutableBytes(buf[:done]), err
 		}
 		addr = end
 	}
-	return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG
+	return gohacks.StringFromImmutableBytes(buf), syserror.ENAMETOOLONG
 }
 
 // CopyOutVec copies bytes from src to the memory mapped at ars in uio. The
diff --git a/pkg/usermem/usermem_unsafe.go b/pkg/usermem/usermem_unsafe.go
deleted file mode 100644
index 876783e78..000000000
--- a/pkg/usermem/usermem_unsafe.go
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package usermem
-
-import (
-	"unsafe"
-)
-
-// stringFromImmutableBytes is equivalent to string(bs), except that it never
-// copies even if escape analysis can't prove that bs does not escape. This is
-// only valid if bs is never mutated after stringFromImmutableBytes returns.
-func stringFromImmutableBytes(bs []byte) string {
-	// Compare strings.Builder.String().
-	return *(*string)(unsafe.Pointer(&bs))
-}
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index c69f4c602..a4627905e 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -229,7 +229,9 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_NANOSLEEP: {},
 	syscall.SYS_PPOLL:     {},
 	syscall.SYS_PREAD64:   {},
+	syscall.SYS_PREADV:    {},
 	syscall.SYS_PWRITE64:  {},
+	syscall.SYS_PWRITEV:   {},
 	syscall.SYS_READ:      {},
 	syscall.SYS_RECVMSG: []seccomp.Rule{
 		{
-- 
cgit v1.2.3


From 72e3f3a3eef3a1dc02db0ff71f98a5d7fe89a6e3 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 25 Feb 2020 13:42:34 -0800
Subject: Add option to skip stuck tasks waiting for address space

PiperOrigin-RevId: 297192390
---
 pkg/sentry/kernel/kernel.go       |  4 ++++
 pkg/sentry/kernel/task_context.go |  2 +-
 pkg/sentry/kernel/task_exec.go    |  2 +-
 pkg/sentry/kernel/task_usermem.go |  2 +-
 pkg/sentry/mm/address_space.go    | 23 ++++++++++++++---------
 pkg/sentry/mm/lifecycle.go        | 26 ++++++++++++++------------
 pkg/sentry/mm/mm.go               |  5 +++++
 pkg/sentry/mm/mm_test.go          |  2 +-
 8 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index c62fd6eb1..8b76750e9 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -247,6 +247,10 @@ type Kernel struct {
 
 	// VFS keeps the filesystem state used across the kernel.
 	vfs vfs.VirtualFilesystem
+
+	// If set to true, report address space activation waits as if the task is in
+	// external wait so that the watchdog doesn't report the task stuck.
+	SleepForAddressSpaceActivation bool
 }
 
 // InitKernelArgs holds arguments to Init.
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 2be982684..0158b1788 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -140,7 +140,7 @@ func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*Task
 	}
 
 	// Prepare a new user address space to load into.
-	m := mm.NewMemoryManager(k, k)
+	m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
 	defer m.DecUsers(ctx)
 	args.MemoryManager = m
 
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 8f57a34a6..00c425cca 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -220,7 +220,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	t.mu.Unlock()
 	t.unstopVforkParent()
 	// NOTE(b/30316266): All locks must be dropped prior to calling Activate.
-	t.MemoryManager().Activate()
+	t.MemoryManager().Activate(t)
 
 	t.ptraceExec(oldTID)
 	return (*runSyscallExit)(nil)
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index 2bf3ce8a8..b02044ad2 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -30,7 +30,7 @@ var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown())
 // Activate ensures that the task has an active address space.
 func (t *Task) Activate() {
 	if mm := t.MemoryManager(); mm != nil {
-		if err := mm.Activate(); err != nil {
+		if err := mm.Activate(t); err != nil {
 			panic("unable to activate mm: " + err.Error())
 		}
 	}
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index 94d39af60..0332fc71c 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -18,6 +18,7 @@ import (
 	"fmt"
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -38,7 +39,7 @@ func (mm *MemoryManager) AddressSpace() platform.AddressSpace {
 //
 // When this MemoryManager is no longer needed by a task, it should call
 // Deactivate to release the reference.
-func (mm *MemoryManager) Activate() error {
+func (mm *MemoryManager) Activate(ctx context.Context) error {
 	// Fast path: the MemoryManager already has an active
 	// platform.AddressSpace, and we just need to indicate that we need it too.
 	for {
@@ -91,16 +92,20 @@ func (mm *MemoryManager) Activate() error {
 		if as == nil {
 			// AddressSpace is unavailable, we must wait.
 			//
-			// activeMu must not be held while waiting, as the user
-			// of the address space we are waiting on may attempt
-			// to take activeMu.
-			//
-			// Don't call UninterruptibleSleepStart to register the
-			// wait to allow the watchdog stuck task to trigger in
-			// case a process is starved waiting for the address
-			// space.
+			// activeMu must not be held while waiting, as the user of the address
+			// space we are waiting on may attempt to take activeMu.
 			mm.activeMu.Unlock()
+
+			sleep := mm.p.CooperativelySchedulesAddressSpace() && mm.sleepForActivation
+			if sleep {
+				// Mark this task sleeping while waiting for the address space to
+				// prevent the watchdog from reporting it as a stuck task.
+				ctx.UninterruptibleSleepStart(false)
+			}
 			<-c
+			if sleep {
+				ctx.UninterruptibleSleepFinish(false)
+			}
 			continue
 		}
 
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index 3c263ebaa..d8a5b9d29 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -28,16 +28,17 @@ import (
 )
 
 // NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
-func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *MemoryManager {
+func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager {
 	return &MemoryManager{
-		p:           p,
-		mfp:         mfp,
-		haveASIO:    p.SupportsAddressSpaceIO(),
-		privateRefs: &privateRefs{},
-		users:       1,
-		auxv:        arch.Auxv{},
-		dumpability: UserDumpable,
-		aioManager:  aioManager{contexts: make(map[uint64]*AIOContext)},
+		p:                  p,
+		mfp:                mfp,
+		haveASIO:           p.SupportsAddressSpaceIO(),
+		privateRefs:        &privateRefs{},
+		users:              1,
+		auxv:               arch.Auxv{},
+		dumpability:        UserDumpable,
+		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
+		sleepForActivation: sleepForActivation,
 	}
 }
 
@@ -79,9 +80,10 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
 		envv:                 mm.envv,
 		auxv:                 append(arch.Auxv(nil), mm.auxv...),
 		// IncRef'd below, once we know that there isn't an error.
-		executable:  mm.executable,
-		dumpability: mm.dumpability,
-		aioManager:  aioManager{contexts: make(map[uint64]*AIOContext)},
+		executable:         mm.executable,
+		dumpability:        mm.dumpability,
+		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
+		sleepForActivation: mm.sleepForActivation,
 	}
 
 	// Copy vmas.
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 637383c7a..c2195ae11 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -226,6 +226,11 @@ type MemoryManager struct {
 	// aioManager keeps track of AIOContexts used for async IOs. AIOManager
 	// must be cloned when CLONE_VM is used.
 	aioManager aioManager
+
+	// sleepForActivation indicates whether the task should report to be sleeping
+	// before trying to activate the address space. When set to true, delays in
+	// activation are not reported as stuck tasks by the watchdog.
+	sleepForActivation bool
 }
 
 // vma represents a virtual memory area.
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go
index edacca741..fdc308542 100644
--- a/pkg/sentry/mm/mm_test.go
+++ b/pkg/sentry/mm/mm_test.go
@@ -31,7 +31,7 @@ import (
 func testMemoryManager(ctx context.Context) *MemoryManager {
 	p := platform.FromContext(ctx)
 	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
-	mm := NewMemoryManager(p, mfp)
+	mm := NewMemoryManager(p, mfp, false)
 	mm.layout = arch.MmapLayout{
 		MinAddr:      p.MinUserAddress(),
 		MaxAddr:      p.MaxUserAddress(),
-- 
cgit v1.2.3


From acc405ba60834f5dce9ce04cd762d5cda02224cb Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Tue, 25 Feb 2020 15:03:51 -0800
Subject: Add nat table support for iptables.

- commit the changes for the comments.
---
 pkg/abi/linux/netfilter.go               | 23 +++++++++--
 pkg/sentry/socket/netfilter/netfilter.go | 33 +++++++++++-----
 pkg/tcpip/iptables/iptables.go           | 10 ++++-
 pkg/tcpip/iptables/targets.go            | 66 ++++++++++++++++++++++----------
 pkg/tcpip/iptables/types.go              |  4 +-
 pkg/tcpip/stack/nic.go                   | 13 +------
 test/iptables/nat.go                     | 12 +-----
 7 files changed, 103 insertions(+), 58 deletions(-)

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index ba4d84962..2179ac995 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -250,8 +250,24 @@ type XTErrorTarget struct {
 // SizeOfXTErrorTarget is the size of an XTErrorTarget.
 const SizeOfXTErrorTarget = 64
 
+// Flag values for NfNATIPV4Range. The values indicate whether to map
+// protocol specific part(ports) or IPs. It corresponds to values in
+// include/uapi/linux/netfilter/nf_nat.h.
+const (
+	NF_NAT_RANGE_MAP_IPS            = 1 << 0
+	NF_NAT_RANGE_PROTO_SPECIFIED    = 1 << 1
+	NF_NAT_RANGE_PROTO_RANDOM       = 1 << 2
+	NF_NAT_RANGE_PERSISTENT         = 1 << 3
+	NF_NAT_RANGE_PROTO_RANDOM_FULLY = 1 << 4
+	NF_NAT_RANGE_PROTO_RANDOM_ALL   = (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
+	NF_NAT_RANGE_MASK               = (NF_NAT_RANGE_MAP_IPS |
+		NF_NAT_RANGE_PROTO_SPECIFIED | NF_NAT_RANGE_PROTO_RANDOM |
+		NF_NAT_RANGE_PERSISTENT | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
+)
+
 // NfNATIPV4Range. It corresponds to struct nf_nat_ipv4_range
-// in include/uapi/linux/netfilter/nf_nat.h.
+// in include/uapi/linux/netfilter/nf_nat.h. The fields are in
+// network byte order.
 type NfNATIPV4Range struct {
 	Flags   uint32
 	MinIP   [4]byte
@@ -263,11 +279,12 @@ type NfNATIPV4Range struct {
 // NfNATIPV4MultiRangeCompat. It corresponds to struct
 // nf_nat_ipv4_multi_range_compat in include/uapi/linux/netfilter/nf_nat.h.
 type NfNATIPV4MultiRangeCompat struct {
-	Rangesize uint32
-	RangeIPV4 [1]NfNATIPV4Range
+	RangeSize uint32
+	RangeIPV4 NfNATIPV4Range
 }
 
 // XTRedirectTarget triggers a redirect when reached.
+// Adding 4 bytes of padding to make the struct 8 byte aligned.
 type XTRedirectTarget struct {
 	Target  XTEntryTarget
 	NfRange NfNATIPV4MultiRangeCompat
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 512ad624a..257cb485b 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -288,6 +289,7 @@ func marshalRedirectTarget() []byte {
 			TargetSize: linux.SizeOfXTRedirectTarget,
 		},
 	}
+	copy(target.Target.Name[:], redirectTargetName)
 
 	ret := make([]byte, 0, linux.SizeOfXTRedirectTarget)
 	return binary.Marshal(ret, usermem.ByteOrder, target)
@@ -405,7 +407,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 			nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
 			return syserr.ErrInvalidArgument
 		}
-		target, err := parseTarget(optVal[:targetSize])
+		target, err := parseTarget(filter, optVal[:targetSize])
 		if err != nil {
 			nflog("failed to parse target: %v", err)
 			return syserr.ErrInvalidArgument
@@ -552,7 +554,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
-func parseTarget(optVal []byte) (iptables.Target, error) {
+func parseTarget(filter iptables.IPHeaderFilter, optVal []byte) (iptables.Target, error) {
 	nflog("set entries: parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
 		return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal))
@@ -604,6 +606,10 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 			return nil, fmt.Errorf("netfilter.SetEntries: optVal has insufficient size for redirect target %d", len(optVal))
 		}
 
+		if filter.Protocol != header.TCPProtocolNumber && filter.Protocol != header.UDPProtocolNumber {
+			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
+		}
+
 		var redirectTarget linux.XTRedirectTarget
 		buf = optVal[:linux.SizeOfXTRedirectTarget]
 		binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
@@ -612,21 +618,30 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 		var target iptables.RedirectTarget
 		nfRange := redirectTarget.NfRange
 
-		target.RangeSize = nfRange.Rangesize
-		target.Flags = nfRange.RangeIPV4[0].Flags
+		// RangeSize should be 1.
+		if nfRange.RangeSize != 1 {
+			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
+		}
+
+		// TODO(gvisor.dev/issue/170): Check if the flags are valid.
+		// Also check if we need to map ports or IP.
+		// For now, redirect target only supports dest port change.
+		if nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED == 0 {
+			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument.")
+		}
+		target.Flags = nfRange.RangeIPV4.Flags
 
-		target.MinIP = tcpip.Address(nfRange.RangeIPV4[0].MinIP[:])
-		target.MaxIP = tcpip.Address(nfRange.RangeIPV4[0].MaxIP[:])
+		target.MinIP = tcpip.Address(nfRange.RangeIPV4.MinIP[:])
+		target.MaxIP = tcpip.Address(nfRange.RangeIPV4.MaxIP[:])
 
 		// Convert port from big endian to little endian.
 		port := make([]byte, 2)
-		binary.BigEndian.PutUint16(port, nfRange.RangeIPV4[0].MinPort)
+		binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MinPort)
 		target.MinPort = binary.LittleEndian.Uint16(port)
 
-		binary.BigEndian.PutUint16(port, nfRange.RangeIPV4[0].MaxPort)
+		binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MaxPort)
 		target.MaxPort = binary.LittleEndian.Uint16(port)
 		return target, nil
-
 	}
 
 	// Unknown target.
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index c00d012c0..f7dc4f720 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -207,7 +207,7 @@ func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename stri
 			underflow := table.Rules[table.Underflows[hook]]
 			// Underflow is guaranteed to be an unconditional
 			// ACCEPT or DROP.
-			switch v, _ := underflow.Target.Action(pkt); v {
+			switch v, _ := underflow.Target.Action(pkt, underflow.Filter); v {
 			case RuleAccept:
 				return TableAccept
 			case RuleDrop:
@@ -233,6 +233,12 @@ func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename stri
 func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) RuleVerdict {
 	rule := table.Rules[ruleIdx]
 
+	// If pkt.NetworkHeader hasn't been set yet, it will be contained in
+	// pkt.Data.First().
+	if pkt.NetworkHeader == nil {
+		pkt.NetworkHeader = pkt.Data.First()
+	}
+
 	// First check whether the packet matches the IP header filter.
 	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
 	if rule.Filter.Protocol != 0 && rule.Filter.Protocol != header.IPv4(pkt.NetworkHeader).TransportProtocol() {
@@ -252,6 +258,6 @@ func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ru
 	}
 
 	// All the matchers matched, so run the target.
-	verdict, _ := rule.Target.Action(pkt)
+	verdict, _ := rule.Target.Action(pkt, rule.Filter)
 	return verdict
 }
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 06e65bece..a75938da3 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -26,7 +26,7 @@ import (
 type AcceptTarget struct{}
 
 // Action implements Target.Action.
-func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+func (AcceptTarget) Action(packet tcpip.PacketBuffer, filter IPHeaderFilter) (RuleVerdict, string) {
 	return RuleAccept, ""
 }
 
@@ -34,7 +34,7 @@ func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
 type DropTarget struct{}
 
 // Action implements Target.Action.
-func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+func (DropTarget) Action(packet tcpip.PacketBuffer, filter IPHeaderFilter) (RuleVerdict, string) {
 	return RuleDrop, ""
 }
 
@@ -43,7 +43,7 @@ func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
 type ErrorTarget struct{}
 
 // Action implements Target.Action.
-func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+func (ErrorTarget) Action(packet tcpip.PacketBuffer, filter IPHeaderFilter) (RuleVerdict, string) {
 	log.Debugf("ErrorTarget triggered.")
 	return RuleDrop, ""
 }
@@ -54,7 +54,7 @@ type UserChainTarget struct {
 }
 
 // Action implements Target.Action.
-func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
+func (UserChainTarget) Action(tcpip.PacketBuffer, IPHeaderFilter) (RuleVerdict, string) {
 	panic("UserChainTarget should never be called.")
 }
 
@@ -63,29 +63,55 @@ func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
 type ReturnTarget struct{}
 
 // Action implements Target.Action.
-func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
+func (ReturnTarget) Action(tcpip.PacketBuffer, IPHeaderFilter) (RuleVerdict, string) {
 	return RuleReturn, ""
 }
 
 // RedirectTarget redirects the packet by modifying the destination port/IP.
+// Min and Max values for IP and Ports in the struct indicate the range of
+// values which can be used to redirect.
 type RedirectTarget struct {
-	RangeSize uint32
-	Flags     uint32
-	MinIP     tcpip.Address
-	MaxIP     tcpip.Address
-	MinPort   uint16
-	MaxPort   uint16
-}
+	// Flags to check if the redirect is for address or ports.
+	Flags uint32
 
-// Action implements Target.Action.
-func (rt RedirectTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
-	log.Infof("RedirectTarget triggered.")
+	// Min address used to redirect.
+	MinIP tcpip.Address
+
+	// Max address used to redirect.
+	MaxIP tcpip.Address
 
-	// TODO(gvisor.dev/issue/170): Checking only for UDP protocol.
-	// We're yet to support for TCP protocol.
-	headerView := packet.Data.First()
-	h := header.UDP(headerView)
-	h.SetDestinationPort(rt.MinPort)
+	// Min port used to redirect.
+	MinPort uint16
 
+	// Max port used to redirect.
+	MaxPort uint16
+}
+
+// Action implements Target.Action.
+func (rt RedirectTarget) Action(pkt tcpip.PacketBuffer, filter IPHeaderFilter) (RuleVerdict, string) {
+	headerView := pkt.Data.First()
+
+	// Network header should be set.
+	netHeader := header.IPv4(headerView)
+	if netHeader == nil {
+		return RuleDrop, ""
+	}
+
+	// TODO(gvisor.dev/issue/170): Check Flags in RedirectTarget if
+	// we need to change dest address (for OUTPUT chain) or ports.
+	hlen := int(netHeader.HeaderLength())
+
+	switch protocol := filter.Protocol; protocol {
+	case header.UDPProtocolNumber:
+		udp := header.UDP(headerView[hlen:])
+		udp.SetDestinationPort(rt.MinPort)
+	case header.TCPProtocolNumber:
+		// TODO(gvisor.dev/issue/170): Need to recompute checksum
+		// and implement nat connection tracking to support TCP.
+		tcp := header.TCP(headerView[hlen:])
+		tcp.SetDestinationPort(rt.MinPort)
+	default:
+		return RuleDrop, ""
+	}
 	return RuleAccept, ""
 }
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 5735d001b..0102831d0 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -63,7 +63,7 @@ const (
 	// TableAccept indicates the packet should continue through netstack.
 	TableAccept TableVerdict = iota
 
-	// TableAccept indicates the packet should be dropped.
+	// TableDrop indicates the packet should be dropped.
 	TableDrop
 )
 
@@ -175,5 +175,5 @@ type Target interface {
 	// Action takes an action on the packet and returns a verdict on how
 	// traversal should (or should not) continue. If the return value is
 	// Jump, it also returns the name of the chain to jump to.
-	Action(packet tcpip.PacketBuffer) (RuleVerdict, string)
+	Action(packet tcpip.PacketBuffer, filter IPHeaderFilter) (RuleVerdict, string)
 }
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 2028f5201..a75dc0322 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1087,19 +1087,8 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 
 	// TODO(gvisor.dev/issue/170): Not supporting iptables for IPv6 yet.
 	if protocol == header.IPv4ProtocolNumber {
-		newPkt := pkt.Clone()
-
-		headerView := newPkt.Data.First()
-		h := header.IPv4(headerView)
-		newPkt.NetworkHeader = headerView[:h.HeaderLength()]
-
-		hlen := int(h.HeaderLength())
-		tlen := int(h.TotalLength())
-		newPkt.Data.TrimFront(hlen)
-		newPkt.Data.CapLength(tlen - hlen)
-
 		ipt := n.stack.IPTables()
-		if ok := ipt.Check(iptables.Prerouting, newPkt); !ok {
+		if ok := ipt.Check(iptables.Prerouting, pkt); !ok {
 			// iptables is telling us to drop the packet.
 			return
 		}
diff --git a/test/iptables/nat.go b/test/iptables/nat.go
index 306cbd1b3..899d1c9d3 100644
--- a/test/iptables/nat.go
+++ b/test/iptables/nat.go
@@ -71,20 +71,12 @@ func (NATRedirectTCPPort) ContainerAction(ip net.IP) error {
 	}
 
 	// Listen for TCP packets on redirect port.
-	if err := listenTCP(redirectPort, sendloopDuration); err != nil {
-		return fmt.Errorf("connection on port %d should be accepted, but got error %v", redirectPort, err)
-	}
-
-	return nil
+	return listenTCP(redirectPort, sendloopDuration)
 }
 
 // LocalAction implements TestCase.LocalAction.
 func (NATRedirectTCPPort) LocalAction(ip net.IP) error {
-	if err := connectTCP(ip, dropPort, acceptPort, sendloopDuration); err != nil {
-		return fmt.Errorf("connection destined to port %d should be accepted, but got error %v", dropPort, err)
-	}
-
-	return nil
+	return connectTCP(ip, dropPort, acceptPort, sendloopDuration)
 }
 
 // NATDropUDP tests that packets are not received in ports other than redirect port.
-- 
cgit v1.2.3


From 5f1f9dd9d23d2b805c77b5c38d5900d13e6a29fe Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 25 Feb 2020 15:15:28 -0800
Subject: Use link-local source address for link-local multicast

Tests:
- header_test.TestIsV6LinkLocalMulticastAddress
- header_test.TestScopeForIPv6Address
- stack_test.TestIPv6SourceAddressSelectionScopeAndSameAddress
PiperOrigin-RevId: 297215576
---
 pkg/tcpip/header/ipv6.go      | 22 ++++++++++
 pkg/tcpip/header/ipv6_test.go | 98 ++++++++++++++++++++++++++++++++++++++++---
 pkg/tcpip/stack/stack_test.go | 27 ++++++++----
 3 files changed, 134 insertions(+), 13 deletions(-)

diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index 70e6ce095..76e88e9b3 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -115,6 +115,19 @@ const (
 	// for the secret key used to generate an opaque interface identifier as
 	// outlined by RFC 7217.
 	OpaqueIIDSecretKeyMinBytes = 16
+
+	// ipv6MulticastAddressScopeByteIdx is the byte where the scope (scop) field
+	// is located within a multicast IPv6 address, as per RFC 4291 section 2.7.
+	ipv6MulticastAddressScopeByteIdx = 1
+
+	// ipv6MulticastAddressScopeMask is the mask for the scope (scop) field,
+	// within the byte holding the field, as per RFC 4291 section 2.7.
+	ipv6MulticastAddressScopeMask = 0xF
+
+	// ipv6LinkLocalMulticastScope is the value of the scope (scop) field within
+	// a multicast IPv6 address that indicates the address has link-local scope,
+	// as per RFC 4291 section 2.7.
+	ipv6LinkLocalMulticastScope = 2
 )
 
 // IPv6EmptySubnet is the empty IPv6 subnet. It may also be known as the
@@ -340,6 +353,12 @@ func IsV6LinkLocalAddress(addr tcpip.Address) bool {
 	return addr[0] == 0xfe && (addr[1]&0xc0) == 0x80
 }
 
+// IsV6LinkLocalMulticastAddress determines if the provided address is an IPv6
+// link-local multicast address.
+func IsV6LinkLocalMulticastAddress(addr tcpip.Address) bool {
+	return IsV6MulticastAddress(addr) && addr[ipv6MulticastAddressScopeByteIdx]&ipv6MulticastAddressScopeMask == ipv6LinkLocalMulticastScope
+}
+
 // IsV6UniqueLocalAddress determines if the provided address is an IPv6
 // unique-local address (within the prefix FC00::/7).
 func IsV6UniqueLocalAddress(addr tcpip.Address) bool {
@@ -411,6 +430,9 @@ func ScopeForIPv6Address(addr tcpip.Address) (IPv6AddressScope, *tcpip.Error) {
 	}
 
 	switch {
+	case IsV6LinkLocalMulticastAddress(addr):
+		return LinkLocalScope, nil
+
 	case IsV6LinkLocalAddress(addr):
 		return LinkLocalScope, nil
 
diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go
index c3ad503aa..426a873b1 100644
--- a/pkg/tcpip/header/ipv6_test.go
+++ b/pkg/tcpip/header/ipv6_test.go
@@ -27,11 +27,12 @@ import (
 )
 
 const (
-	linkAddr         = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
-	linkLocalAddr    = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-	uniqueLocalAddr1 = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-	uniqueLocalAddr2 = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-	globalAddr       = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	linkAddr               = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+	linkLocalAddr          = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	linkLocalMulticastAddr = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	uniqueLocalAddr1       = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	uniqueLocalAddr2       = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+	globalAddr             = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
 )
 
 func TestEthernetAdddressToModifiedEUI64(t *testing.T) {
@@ -256,6 +257,85 @@ func TestIsV6UniqueLocalAddress(t *testing.T) {
 	}
 }
 
+func TestIsV6LinkLocalMulticastAddress(t *testing.T) {
+	tests := []struct {
+		name     string
+		addr     tcpip.Address
+		expected bool
+	}{
+		{
+			name:     "Valid Link Local Multicast",
+			addr:     linkLocalMulticastAddr,
+			expected: true,
+		},
+		{
+			name:     "Valid Link Local Multicast with flags",
+			addr:     "\xff\xf2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01",
+			expected: true,
+		},
+		{
+			name:     "Link Local Unicast",
+			addr:     linkLocalAddr,
+			expected: false,
+		},
+		{
+			name:     "IPv4 Multicast",
+			addr:     "\xe0\x00\x00\x01",
+			expected: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if got := header.IsV6LinkLocalMulticastAddress(test.addr); got != test.expected {
+				t.Errorf("got header.IsV6LinkLocalMulticastAddress(%s) = %t, want = %t", test.addr, got, test.expected)
+			}
+		})
+	}
+}
+
+func TestIsV6LinkLocalAddress(t *testing.T) {
+	tests := []struct {
+		name     string
+		addr     tcpip.Address
+		expected bool
+	}{
+		{
+			name:     "Valid Link Local Unicast",
+			addr:     linkLocalAddr,
+			expected: true,
+		},
+		{
+			name:     "Link Local Multicast",
+			addr:     linkLocalMulticastAddr,
+			expected: false,
+		},
+		{
+			name:     "Unique Local",
+			addr:     uniqueLocalAddr1,
+			expected: false,
+		},
+		{
+			name:     "Global",
+			addr:     globalAddr,
+			expected: false,
+		},
+		{
+			name:     "IPv4 Link Local",
+			addr:     "\xa9\xfe\x00\x01",
+			expected: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if got := header.IsV6LinkLocalAddress(test.addr); got != test.expected {
+				t.Errorf("got header.IsV6LinkLocalAddress(%s) = %t, want = %t", test.addr, got, test.expected)
+			}
+		})
+	}
+}
+
 func TestScopeForIPv6Address(t *testing.T) {
 	tests := []struct {
 		name  string
@@ -270,11 +350,17 @@ func TestScopeForIPv6Address(t *testing.T) {
 			err:   nil,
 		},
 		{
-			name:  "Link Local",
+			name:  "Link Local Unicast",
 			addr:  linkLocalAddr,
 			scope: header.LinkLocalScope,
 			err:   nil,
 		},
+		{
+			name:  "Link Local Multicast",
+			addr:  linkLocalMulticastAddr,
+			scope: header.LinkLocalScope,
+			err:   nil,
+		},
 		{
 			name:  "Global",
 			addr:  globalAddr,
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index edf6bec52..e15db40fb 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -2790,13 +2790,14 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 
 func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 	const (
-		linkLocalAddr1   = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-		linkLocalAddr2   = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-		uniqueLocalAddr1 = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-		uniqueLocalAddr2 = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-		globalAddr1      = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-		globalAddr2      = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-		nicID            = 1
+		linkLocalAddr1         = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		linkLocalAddr2         = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		linkLocalMulticastAddr = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		uniqueLocalAddr1       = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		uniqueLocalAddr2       = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		globalAddr1            = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		globalAddr2            = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		nicID                  = 1
 	)
 
 	// Rule 3 is not tested here, and is instead tested by NDP's AutoGenAddr test.
@@ -2869,6 +2870,18 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 			connectAddr:       linkLocalAddr2,
 			expectedLocalAddr: linkLocalAddr1,
 		},
+		{
+			name:              "Link Local most preferred for link local multicast (last address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, uniqueLocalAddr1, linkLocalAddr1},
+			connectAddr:       linkLocalMulticastAddr,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Link Local most preferred for link local multicast (first address)",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:       linkLocalMulticastAddr,
+			expectedLocalAddr: linkLocalAddr1,
+		},
 		{
 			name:              "Unique Local most preferred (last address)",
 			nicAddrs:          []tcpip.Address{uniqueLocalAddr1, globalAddr1, linkLocalAddr1},
-- 
cgit v1.2.3


From 87288b26a1c40776da31c1edcbe9b1f3a6f5c1ed Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 25 Feb 2020 15:34:20 -0800
Subject: Add netlink sockopt logging to strace.

PiperOrigin-RevId: 297220008
---
 pkg/sentry/strace/socket.go | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index 51e6d81b2..c0512de89 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -632,4 +632,13 @@ var sockOptNames = map[uint64]abi.ValueSet{
 		linux.MCAST_MSFILTER:           "MCAST_MSFILTER",
 		linux.IPV6_ADDRFORM:            "IPV6_ADDRFORM",
 	},
+	linux.SOL_NETLINK: {
+		linux.NETLINK_BROADCAST_ERROR:  "NETLINK_BROADCAST_ERROR",
+		linux.NETLINK_CAP_ACK:          "NETLINK_CAP_ACK",
+		linux.NETLINK_DUMP_STRICT_CHK:  "NETLINK_DUMP_STRICT_CHK",
+		linux.NETLINK_EXT_ACK:          "NETLINK_EXT_ACK",
+		linux.NETLINK_LIST_MEMBERSHIPS: "NETLINK_LIST_MEMBERSHIPS",
+		linux.NETLINK_NO_ENOBUFS:       "NETLINK_NO_ENOBUFS",
+		linux.NETLINK_PKTINFO:          "NETLINK_PKTINFO",
+	},
 }
-- 
cgit v1.2.3


From 73201f4c5700ce7af404b968698b86d80989ab1e Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Tue, 25 Feb 2020 07:30:12 +0000
Subject: Code Clean: Move arch independent codes to common file in kvm pkg.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: Iefbdf53e8e8d6d23ae75d8a2ff0d2a6e71f414d8
---
 pkg/sentry/platform/kvm/kvm.go       | 32 ++++++++++++++++++++++++++++++++
 pkg/sentry/platform/kvm/kvm_amd64.go | 32 --------------------------------
 pkg/sentry/platform/kvm/kvm_arm64.go | 32 --------------------------------
 3 files changed, 32 insertions(+), 64 deletions(-)

diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index 972ba85c3..a9b4af43e 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -27,6 +27,38 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// userMemoryRegion is a region of physical memory.
+//
+// This mirrors kvm_memory_region.
+type userMemoryRegion struct {
+	slot          uint32
+	flags         uint32
+	guestPhysAddr uint64
+	memorySize    uint64
+	userspaceAddr uint64
+}
+
+// runData is the run structure. This may be mapped for synchronous register
+// access (although that doesn't appear to be supported by my kernel at least).
+//
+// This mirrors kvm_run.
+type runData struct {
+	requestInterruptWindow uint8
+	_                      [7]uint8
+
+	exitReason                 uint32
+	readyForInterruptInjection uint8
+	ifFlag                     uint8
+	_                          [2]uint8
+
+	cr8      uint64
+	apicBase uint64
+
+	// This is the union data for exits. Interpretation depends entirely on
+	// the exitReason above (see vCPU code for more information).
+	data [32]uint64
+}
+
 // KVM represents a lightweight VM context.
 type KVM struct {
 	platform.NoCPUPreemptionDetection
diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go
index c5a6f9c7d..093497bc4 100644
--- a/pkg/sentry/platform/kvm/kvm_amd64.go
+++ b/pkg/sentry/platform/kvm/kvm_amd64.go
@@ -21,17 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
 
-// userMemoryRegion is a region of physical memory.
-//
-// This mirrors kvm_memory_region.
-type userMemoryRegion struct {
-	slot          uint32
-	flags         uint32
-	guestPhysAddr uint64
-	memorySize    uint64
-	userspaceAddr uint64
-}
-
 // userRegs represents KVM user registers.
 //
 // This mirrors kvm_regs.
@@ -169,27 +158,6 @@ type modelControlRegisters struct {
 	entries [16]modelControlRegister
 }
 
-// runData is the run structure. This may be mapped for synchronous register
-// access (although that doesn't appear to be supported by my kernel at least).
-//
-// This mirrors kvm_run.
-type runData struct {
-	requestInterruptWindow uint8
-	_                      [7]uint8
-
-	exitReason                 uint32
-	readyForInterruptInjection uint8
-	ifFlag                     uint8
-	_                          [2]uint8
-
-	cr8      uint64
-	apicBase uint64
-
-	// This is the union data for exits. Interpretation depends entirely on
-	// the exitReason above (see vCPU code for more information).
-	data [32]uint64
-}
-
 // cpuidEntry is a single CPUID entry.
 //
 // This mirrors kvm_cpuid_entry2.
diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go
index 2319c86d3..79045651e 100644
--- a/pkg/sentry/platform/kvm/kvm_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_arm64.go
@@ -20,17 +20,6 @@ import (
 	"syscall"
 )
 
-// userMemoryRegion is a region of physical memory.
-//
-// This mirrors kvm_memory_region.
-type userMemoryRegion struct {
-	slot          uint32
-	flags         uint32
-	guestPhysAddr uint64
-	memorySize    uint64
-	userspaceAddr uint64
-}
-
 type kvmOneReg struct {
 	id   uint64
 	addr uint64
@@ -53,27 +42,6 @@ type userRegs struct {
 	fpRegs  userFpsimdState
 }
 
-// runData is the run structure. This may be mapped for synchronous register
-// access (although that doesn't appear to be supported by my kernel at least).
-//
-// This mirrors kvm_run.
-type runData struct {
-	requestInterruptWindow uint8
-	_                      [7]uint8
-
-	exitReason                 uint32
-	readyForInterruptInjection uint8
-	ifFlag                     uint8
-	_                          [2]uint8
-
-	cr8      uint64
-	apicBase uint64
-
-	// This is the union data for exits. Interpretation depends entirely on
-	// the exitReason above (see vCPU code for more information).
-	data [32]uint64
-}
-
 // updateGlobalOnce does global initialization. It has to be called only once.
 func updateGlobalOnce(fd int) error {
 	physicalInit()
-- 
cgit v1.2.3


From fba479b3c78621cb122af20d1d677fe9193a971c Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 25 Feb 2020 19:03:23 -0800
Subject: Fix DATA RACE in fs.MayDelete.

MayDelete must lock the directory also, otherwise concurrent renames may
race. Note that this also changes the methods to be aligned with the actual
Remove and RemoveDirectory methods to minimize confusion when reading the
code. (It was hard to see that resolution was correct.)

PiperOrigin-RevId: 297258304
---
 pkg/sentry/fs/dirent.go               | 25 ++++++++++++++-----------
 pkg/sentry/syscalls/linux/sys_file.go |  4 ++--
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index acab0411a..e0b32e1c1 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -1438,8 +1438,8 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName
 	}, nil
 }
 
-func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error {
-	uattr, err := dir.Inode.UnstableAttr(ctx)
+func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error {
+	uattr, err := d.Inode.UnstableAttr(ctx)
 	if err != nil {
 		return syserror.EPERM
 	}
@@ -1465,30 +1465,33 @@ func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error {
 	return syserror.EPERM
 }
 
-// MayDelete determines whether `name`, a child of `dir`, can be deleted or
+// MayDelete determines whether `name`, a child of `d`, can be deleted or
 // renamed by `ctx`.
 //
 // Compare Linux kernel fs/namei.c:may_delete.
-func MayDelete(ctx context.Context, root, dir *Dirent, name string) error {
-	if err := dir.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
+func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error {
+	if err := d.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
 		return err
 	}
 
-	victim, err := dir.Walk(ctx, root, name)
+	unlock := d.lockDirectory()
+	defer unlock()
+
+	victim, err := d.walk(ctx, root, name, true /* may unlock */)
 	if err != nil {
 		return err
 	}
 	defer victim.DecRef()
 
-	return mayDelete(ctx, dir, victim)
+	return d.mayDelete(ctx, victim)
 }
 
 // mayDelete determines whether `victim`, a child of `dir`, can be deleted or
 // renamed by `ctx`.
 //
 // Preconditions: `dir` is writable and executable by `ctx`.
-func mayDelete(ctx context.Context, dir, victim *Dirent) error {
-	if err := checkSticky(ctx, dir, victim); err != nil {
+func (d *Dirent) mayDelete(ctx context.Context, victim *Dirent) error {
+	if err := d.checkSticky(ctx, victim); err != nil {
 		return err
 	}
 
@@ -1542,7 +1545,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
 	defer renamed.DecRef()
 
 	// Check that the renamed dirent is deletable.
-	if err := mayDelete(ctx, oldParent, renamed); err != nil {
+	if err := oldParent.mayDelete(ctx, renamed); err != nil {
 		return err
 	}
 
@@ -1580,7 +1583,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
 		// across the Rename, so must call DecRef manually (no defer).
 
 		// Check that we can delete replaced.
-		if err := mayDelete(ctx, newParent, replaced); err != nil {
+		if err := newParent.mayDelete(ctx, replaced); err != nil {
 			replaced.DecRef()
 			return err
 		}
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index c21f14dc0..d10a9bed8 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -1236,7 +1236,7 @@ func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 			return syserror.ENOTEMPTY
 		}
 
-		if err := fs.MayDelete(t, root, d, name); err != nil {
+		if err := d.MayDelete(t, root, name); err != nil {
 			return err
 		}
 
@@ -1517,7 +1517,7 @@ func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 			return syserror.ENOTDIR
 		}
 
-		if err := fs.MayDelete(t, root, d, name); err != nil {
+		if err := d.MayDelete(t, root, name); err != nil {
 			return err
 		}
 
-- 
cgit v1.2.3


From a92087f0f8fe82ce99414ec99ffe33e514cb21f6 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 25 Feb 2020 19:12:22 -0800
Subject: Add VFS.NewDisconnectedMount().

Analogous to Linux's kern_mount().

PiperOrigin-RevId: 297259580
---
 pkg/sentry/vfs/mount.go | 17 +++++++++++++++++
 pkg/sentry/vfs/vfs.go   | 16 +++++++++++-----
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 9912df799..31a4e5480 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -139,6 +139,23 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 	return mntns, nil
 }
 
+// NewDisconnectedMount returns a Mount representing fs with the given root
+// (which may be nil). The new Mount is not associated with any MountNamespace
+// and is not connected to any other Mounts. References are taken on fs and
+// root.
+func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) (*Mount, error) {
+	fs.IncRef()
+	if root != nil {
+		root.IncRef()
+	}
+	return &Mount{
+		vfs:  vfs,
+		fs:   fs,
+		root: root,
+		refs: 1,
+	}, nil
+}
+
 // MountAt creates and mounts a Filesystem configured by the given arguments.
 func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
 	rft := vfs.getFilesystemType(fsTypeName)
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 73f8043be..bde81e1ef 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -126,17 +126,23 @@ func (vfs *VirtualFilesystem) Init() error {
 	// Construct vfs.anonMount.
 	anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
 	if err != nil {
-		return err
+		// This shouldn't be possible since anonBlockDevMinorNext was
+		// initialized to 1 above (no device numbers have been allocated yet).
+		panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err))
 	}
 	anonfs := anonFilesystem{
 		devMinor: anonfsDevMinor,
 	}
 	anonfs.vfsfs.Init(vfs, &anonfs)
-	vfs.anonMount = &Mount{
-		vfs:  vfs,
-		fs:   &anonfs.vfsfs,
-		refs: 1,
+	defer anonfs.vfsfs.DecRef()
+	anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
+	if err != nil {
+		// We should not be passing any MountOptions that would cause
+		// construction of this mount to fail.
+		panic(fmt.Sprintf("VirtualFilesystem.Init: anonfs mount failed: %v", err))
 	}
+	vfs.anonMount = anonMount
+
 	return nil
 }
 
-- 
cgit v1.2.3


From d8ed78431162fcaed0b31b54d939c8a54d4736e7 Mon Sep 17 00:00:00 2001
From: moricho <ikeda.morito@gmail.com>
Date: Tue, 25 Feb 2020 16:49:08 +0900
Subject: add profile option

---
 pkg/sentry/control/pprof.go | 34 +++++++++++++++++++++---
 runsc/boot/controller.go    | 13 +++++----
 runsc/cmd/debug.go          | 64 +++++++++++++++++++++++++++++++++++++--------
 runsc/sandbox/sandbox.go    | 60 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 152 insertions(+), 19 deletions(-)

diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go
index 151808911..5d1907c0e 100644
--- a/pkg/sentry/control/pprof.go
+++ b/pkg/sentry/control/pprof.go
@@ -117,15 +117,43 @@ func (p *Profile) HeapProfile(o *ProfileOpts, _ *struct{}) error {
 	return nil
 }
 
-// Goroutine is an RPC stub which dumps out the stack trace for all running
+// GoroutineProfile is an RPC stub which dumps out the stack trace for all running
 // goroutines.
-func (p *Profile) Goroutine(o *ProfileOpts, _ *struct{}) error {
+func (p *Profile) GoroutineProfile(o *ProfileOpts, _ *struct{}) error {
 	if len(o.FilePayload.Files) < 1 {
 		return errNoOutput
 	}
 	output := o.FilePayload.Files[0]
 	defer output.Close()
-	if err := pprof.Lookup("goroutine").WriteTo(output, 2); err != nil {
+	if err := pprof.Lookup("goroutine").WriteTo(output, 0); err != nil {
+		return err
+	}
+	return nil
+}
+
+// BlockProfile is an RPC stub which dumps out the stack trace that led to
+// blocking on synchronization primitives.
+func (p *Profile) BlockProfile(o *ProfileOpts, _ *struct{}) error {
+	if len(o.FilePayload.Files) < 1 {
+		return errNoOutput
+	}
+	output := o.FilePayload.Files[0]
+	defer output.Close()
+	if err := pprof.Lookup("block").WriteTo(output, 0); err != nil {
+		return err
+	}
+	return nil
+}
+
+// MutexProfile is an RPC stub which dumps out the stack trace of holders of
+// contended mutexes.
+func (p *Profile) MutexProfile(o *ProfileOpts, _ *struct{}) error {
+	if len(o.FilePayload.Files) < 1 {
+		return errNoOutput
+	}
+	output := o.FilePayload.Files[0]
+	defer output.Close()
+	if err := pprof.Lookup("mutex").WriteTo(output, 0); err != nil {
 		return err
 	}
 	return nil
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 17e774e0c..8125d5061 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -101,11 +101,14 @@ const (
 
 // Profiling related commands (see pprof.go for more details).
 const (
-	StartCPUProfile = "Profile.StartCPUProfile"
-	StopCPUProfile  = "Profile.StopCPUProfile"
-	HeapProfile     = "Profile.HeapProfile"
-	StartTrace      = "Profile.StartTrace"
-	StopTrace       = "Profile.StopTrace"
+	StartCPUProfile  = "Profile.StartCPUProfile"
+	StopCPUProfile   = "Profile.StopCPUProfile"
+	HeapProfile      = "Profile.HeapProfile"
+	GoroutineProfile = "Profile.GoroutineProfile"
+	BlockProfile     = "Profile.BlockProfile"
+	MutexProfile     = "Profile.MutexProfile"
+	StartTrace       = "Profile.StartTrace"
+	StopTrace        = "Profile.StopTrace"
 )
 
 // Logging related commands (see logging.go for more details).
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 79965460e..b5de2588b 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -32,17 +32,20 @@ import (
 
 // Debug implements subcommands.Command for the "debug" command.
 type Debug struct {
-	pid         int
-	stacks      bool
-	signal      int
-	profileHeap string
-	profileCPU  string
-	trace       string
-	strace      string
-	logLevel    string
-	logPackets  string
-	duration    time.Duration
-	ps          bool
+	pid              int
+	stacks           bool
+	signal           int
+	profileHeap      string
+	profileCPU       string
+	profileGoroutine string
+	profileBlock     string
+	profileMutex     string
+	trace            string
+	strace           string
+	logLevel         string
+	logPackets       string
+	duration         time.Duration
+	ps               bool
 }
 
 // Name implements subcommands.Command.
@@ -66,6 +69,9 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
 	f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
 	f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
 	f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
+	f.StringVar(&d.profileGoroutine, "profile-goroutine", "", "writes goroutine profile to the given file.")
+	f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.")
+	f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.")
 	f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
 	f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.")
 	f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox")
@@ -147,6 +153,42 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		}
 		log.Infof("Heap profile written to %q", d.profileHeap)
 	}
+	if d.profileGoroutine != "" {
+		f, err := os.Create(d.profileGoroutine)
+		if err != nil {
+			return Errorf(err.Error())
+		}
+		defer f.Close()
+
+		if err := c.Sandbox.GoroutineProfile(f); err != nil {
+			return Errorf(err.Error())
+		}
+		log.Infof("Goroutine profile written to %q", d.profileGoroutine)
+	}
+	if d.profileBlock != "" {
+		f, err := os.Create(d.profileBlock)
+		if err != nil {
+			return Errorf(err.Error())
+		}
+		defer f.Close()
+
+		if err := c.Sandbox.BlockProfile(f); err != nil {
+			return Errorf(err.Error())
+		}
+		log.Infof("Block profile written to %q", d.profileBlock)
+	}
+	if d.profileMutex != "" {
+		f, err := os.Create(d.profileMutex)
+		if err != nil {
+			return Errorf(err.Error())
+		}
+		defer f.Close()
+
+		if err := c.Sandbox.MutexProfile(f); err != nil {
+			return Errorf(err.Error())
+		}
+		log.Infof("Mutex profile written to %q", d.profileMutex)
+	}
 
 	delay := false
 	if d.profileCPU != "" {
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index ec72bdbfd..2e0e2fd66 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -972,6 +972,66 @@ func (s *Sandbox) StopCPUProfile() error {
 	return nil
 }
 
+// GoroutineProfile writes a goroutine profile to the given file.
+func (s *Sandbox) GoroutineProfile(f *os.File) error {
+	log.Debugf("Goroutine profile %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	opts := control.ProfileOpts{
+		FilePayload: urpc.FilePayload{
+			Files: []*os.File{f},
+		},
+	}
+	if err := conn.Call(boot.GoroutineProfile, &opts, nil); err != nil {
+		return fmt.Errorf("getting sandbox %q goroutine profile: %v", s.ID, err)
+	}
+	return nil
+}
+
+// BlockProfile writes a block profile to the given file.
+func (s *Sandbox) BlockProfile(f *os.File) error {
+	log.Debugf("Block profile %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	opts := control.ProfileOpts{
+		FilePayload: urpc.FilePayload{
+			Files: []*os.File{f},
+		},
+	}
+	if err := conn.Call(boot.BlockProfile, &opts, nil); err != nil {
+		return fmt.Errorf("getting sandbox %q block profile: %v", s.ID, err)
+	}
+	return nil
+}
+
+// MutexProfile writes a mutex profile to the given file.
+func (s *Sandbox) MutexProfile(f *os.File) error {
+	log.Debugf("Mutex profile %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	opts := control.ProfileOpts{
+		FilePayload: urpc.FilePayload{
+			Files: []*os.File{f},
+		},
+	}
+	if err := conn.Call(boot.MutexProfile, &opts, nil); err != nil {
+		return fmt.Errorf("getting sandbox %q mutex profile: %v", s.ID, err)
+	}
+	return nil
+}
+
 // StartTrace start trace  writing to the given file.
 func (s *Sandbox) StartTrace(f *os.File) error {
 	log.Debugf("Trace start %q", s.ID)
-- 
cgit v1.2.3


From 408979e619c4b5df74503c7a887aaaa06fd0d730 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 14 Feb 2020 17:19:32 -0800
Subject: iptables: filter by IP address (and range)

Enables commands such as:
$ iptables -A INPUT -d 127.0.0.1 -j ACCEPT
$ iptables -t nat -A PREROUTING ! -d 127.0.0.1 -j REDIRECT

Also adds a bunch of REDIRECT+destination tests.
---
 pkg/abi/linux/netfilter.go               |  28 ++++-
 pkg/sentry/socket/netfilter/netfilter.go |  17 ++-
 pkg/tcpip/iptables/iptables.go           |  28 ++++-
 pkg/tcpip/iptables/types.go              |  12 ++
 test/iptables/filter_input.go            |  65 ++++++++++
 test/iptables/filter_output.go           |  56 +++++++++
 test/iptables/iptables_test.go           |  66 ++++++++++
 test/iptables/iptables_util.go           |  25 +++-
 test/iptables/nat.go                     | 201 ++++++++++++++++++++++++++++++-
 9 files changed, 487 insertions(+), 11 deletions(-)

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index bd2e13ba1..aa149afb5 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -158,10 +158,36 @@ type IPTIP struct {
 	// Flags define matching behavior for the IP header.
 	Flags uint8
 
-	// InverseFlags invert the meaning of fields in struct IPTIP.
+	// InverseFlags invert the meaning of fields in struct IPTIP. See the
+	// IPT_INV_* flags.
 	InverseFlags uint8
 }
 
+// Inverts the meaning of the Protocol field. Corresponds to a constant in
+// include/uapi/linux/netfilter/x_tables.h.
+const XT_INV_PROTO = 0x40
+
+// Flags in IPTIP.InverseFlags. Corresponding constants are in
+// include/uapi/linux/netfilter_ipv4/ip_tables.h.
+const (
+	// Invert the meaning of InputInterface.
+	IPT_INV_VIA_IN = 0x01
+	// Invert the meaning of OutputInterface.
+	IPT_INV_VIA_OUT = 0x02
+	// Unclear what this is, as no references to it exist in the kernel.
+	IPT_INV_TOS = 0x04
+	// Invert the meaning of Src.
+	IPT_INV_SRCIP = 0x08
+	// Invert the meaning of Dst.
+	IPT_INV_DSTIP = 0x10
+	// Invert the meaning of the IPT_F_FRAG flag.
+	IPT_INV_FRAG = 0x20
+	// Invert the meaning of the Protocol field.
+	IPT_INV_PROTO = XT_INV_PROTO
+	// Enable all flags.
+	IPT_INV_MASK = 0x7F
+)
+
 // SizeOfIPTIP is the size of an IPTIP.
 const SizeOfIPTIP = 84
 
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 2ec11f6ac..faa3e892a 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -630,8 +631,14 @@ func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, error) {
 	if containsUnsupportedFields(iptip) {
 		return iptables.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
 	}
+	if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
+		return iptables.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
+	}
 	return iptables.IPHeaderFilter{
-		Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
+		Protocol:  tcpip.TransportProtocolNumber(iptip.Protocol),
+		Dst:       tcpip.Address(iptip.Dst[:]),
+		DstMask:   tcpip.Address(iptip.DstMask[:]),
+		DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0,
 	}, nil
 }
 
@@ -639,16 +646,16 @@ func containsUnsupportedFields(iptip linux.IPTIP) bool {
 	// Currently we check that everything except protocol is zeroed.
 	var emptyInetAddr = linux.InetAddr{}
 	var emptyInterface = [linux.IFNAMSIZ]byte{}
-	return iptip.Dst != emptyInetAddr ||
-		iptip.Src != emptyInetAddr ||
+	// Disable any supported inverse flags.
+	inverseMask := uint8(linux.IPT_INV_DSTIP)
+	return iptip.Src != emptyInetAddr ||
 		iptip.SrcMask != emptyInetAddr ||
-		iptip.DstMask != emptyInetAddr ||
 		iptip.InputInterface != emptyInterface ||
 		iptip.OutputInterface != emptyInterface ||
 		iptip.InputInterfaceMask != emptyInterface ||
 		iptip.OutputInterfaceMask != emptyInterface ||
 		iptip.Flags != 0 ||
-		iptip.InverseFlags != 0
+		iptip.InverseFlags&^inverseMask != 0
 }
 
 func validUnderflow(rule iptables.Rule) bool {
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index dbaccbb36..262b6448d 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -240,9 +240,8 @@ func (it *IPTables) checkChain(hook Hook, pkt tcpip.PacketBuffer, table Table, r
 func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) (RuleVerdict, int) {
 	rule := table.Rules[ruleIdx]
 
-	// First check whether the packet matches the IP header filter.
-	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
-	if rule.Filter.Protocol != 0 && rule.Filter.Protocol != header.IPv4(pkt.NetworkHeader).TransportProtocol() {
+	// Check whether the packet matches the IP header filter.
+	if !filterMatch(rule.Filter, header.IPv4(pkt.NetworkHeader)) {
 		// Continue on to the next rule.
 		return RuleJump, ruleIdx + 1
 	}
@@ -263,3 +262,26 @@ func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ru
 	// All the matchers matched, so run the target.
 	return rule.Target.Action(pkt)
 }
+
+func filterMatch(filter IPHeaderFilter, hdr header.IPv4) bool {
+	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
+	// Check the transport protocol.
+	if filter.Protocol != 0 && filter.Protocol != hdr.TransportProtocol() {
+		return false
+	}
+
+	// Check the destination IP.
+	dest := hdr.DestinationAddress()
+	matches := true
+	for i := range filter.Dst {
+		if dest[i]&filter.DstMask[i] != filter.Dst[i] {
+			matches = false
+			break
+		}
+	}
+	if matches == filter.DstInvert {
+		return false
+	}
+
+	return true
+}
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 7d032fd23..e7fcf6bff 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -144,6 +144,18 @@ type Rule struct {
 type IPHeaderFilter struct {
 	// Protocol matches the transport protocol.
 	Protocol tcpip.TransportProtocolNumber
+
+	// Dst matches the destination IP address.
+	Dst tcpip.Address
+
+	// DstMask masks bits of the destination IP address when comparing with
+	// Dst.
+	DstMask tcpip.Address
+
+	// DstInvert inverts the meaning of the destination IP check, i.e. when
+	// true the filter will match packets that fail the destination
+	// comparison.
+	DstInvert bool
 }
 
 // A Matcher is the interface for matching packets.
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index b2fb6401a..0d3350d8a 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -47,6 +47,8 @@ func init() {
 	RegisterTestCase(FilterInputJumpReturnDrop{})
 	RegisterTestCase(FilterInputJumpBuiltin{})
 	RegisterTestCase(FilterInputJumpTwice{})
+	RegisterTestCase(FilterInputDestination{})
+	RegisterTestCase(FilterInputInvertDestination{})
 }
 
 // FilterInputDropUDP tests that we can drop UDP traffic.
@@ -595,3 +597,66 @@ func (FilterInputJumpTwice) ContainerAction(ip net.IP) error {
 func (FilterInputJumpTwice) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
+
+// FilterInputDestination verifies that we can filter packets via `-d
+// <ipaddr>`.
+type FilterInputDestination struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDestination) Name() string {
+	return "FilterInputDestination"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDestination) ContainerAction(ip net.IP) error {
+	addrs, err := localAddrs()
+	if err != nil {
+		return err
+	}
+
+	// Make INPUT's default action DROP, then ACCEPT all packets bound for
+	// this machine.
+	rules := [][]string{{"-P", "INPUT", "DROP"}}
+	for _, addr := range addrs {
+		rules = append(rules, []string{"-A", "INPUT", "-d", addr, "-j", "ACCEPT"})
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDestination) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputDestination verifies that we can filter packets via `! -d
+// <ipaddr>`.
+type FilterInputInvertDestination struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputInvertDestination) Name() string {
+	return "FilterInputInvertDestination"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputInvertDestination) ContainerAction(ip net.IP) error {
+	// Make INPUT's default action DROP, then ACCEPT all packets not bound
+	// for 127.0.0.1.
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-A", "INPUT", "!", "-d", localIP, "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputInvertDestination) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/filter_output.go b/test/iptables/filter_output.go
index ee2c49f9a..39e648e32 100644
--- a/test/iptables/filter_output.go
+++ b/test/iptables/filter_output.go
@@ -22,6 +22,8 @@ import (
 func init() {
 	RegisterTestCase(FilterOutputDropTCPDestPort{})
 	RegisterTestCase(FilterOutputDropTCPSrcPort{})
+	RegisterTestCase(FilterOutputDestination{})
+	RegisterTestCase(FilterOutputInvertDestination{})
 }
 
 // FilterOutputDropTCPDestPort tests that connections are not accepted on specified source ports.
@@ -85,3 +87,57 @@ func (FilterOutputDropTCPSrcPort) LocalAction(ip net.IP) error {
 
 	return nil
 }
+
+// FilterOutputDestination tests that we can selectively allow packets to
+// certain destinations.
+type FilterOutputDestination struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputDestination) Name() string {
+	return "FilterOutputDestination"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputDestination) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-A", "OUTPUT", "-d", ip.String(), "-j", "ACCEPT"},
+		{"-P", "OUTPUT", "DROP"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputDestination) LocalAction(ip net.IP) error {
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// FilterOutputInvertDestination tests that we can selectively allow packets
+// not headed for a particular destination.
+type FilterOutputInvertDestination struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputInvertDestination) Name() string {
+	return "FilterOutputInvertDestination"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputInvertDestination) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-A", "OUTPUT", "!", "-d", localIP, "-j", "ACCEPT"},
+		{"-P", "OUTPUT", "DROP"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputInvertDestination) LocalAction(ip net.IP) error {
+	return listenUDP(acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 0621861eb..5eabd2461 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -285,3 +285,69 @@ func TestJumpTwice(t *testing.T) {
 		t.Fatal(err)
 	}
 }
+
+func TestInputDestination(t *testing.T) {
+	if err := singleTest(FilterInputDestination{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestInputInvertDestination(t *testing.T) {
+	if err := singleTest(FilterInputInvertDestination{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestOutputDestination(t *testing.T) {
+	if err := singleTest(FilterOutputDestination{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestOutputInvertDestination(t *testing.T) {
+	if err := singleTest(FilterOutputInvertDestination{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestNATOutRedirectIP(t *testing.T) {
+	if err := singleTest(NATOutRedirectIP{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestNATOutDontRedirectIP(t *testing.T) {
+	if err := singleTest(NATOutDontRedirectIP{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestNATOutRedirectInvert(t *testing.T) {
+	if err := singleTest(NATOutRedirectInvert{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestNATPreRedirectIP(t *testing.T) {
+	if err := singleTest(NATPreRedirectIP{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestNATPreDontRedirectIP(t *testing.T) {
+	if err := singleTest(NATPreDontRedirectIP{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestNATPreRedirectInvert(t *testing.T) {
+	if err := singleTest(NATPreRedirectInvert{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestNATRedirectRequiresProtocol(t *testing.T) {
+	if err := singleTest(NATRedirectRequiresProtocol{}); err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index 32cf5a417..178a662e1 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -24,6 +24,7 @@ import (
 )
 
 const iptablesBinary = "iptables"
+const localIP = "127.0.0.1"
 
 // filterTable calls `iptables -t filter` with the given args.
 func filterTable(args ...string) error {
@@ -46,8 +47,17 @@ func tableCmd(table string, args []string) error {
 
 // filterTableRules is like filterTable, but runs multiple iptables commands.
 func filterTableRules(argsList [][]string) error {
+	return tableRules("filter", argsList)
+}
+
+// natTableRules is like natTable, but runs multiple iptables commands.
+func natTableRules(argsList [][]string) error {
+	return tableRules("nat", argsList)
+}
+
+func tableRules(table string, argsList [][]string) error {
 	for _, args := range argsList {
-		if err := filterTable(args...); err != nil {
+		if err := tableCmd(table, args); err != nil {
 			return err
 		}
 	}
@@ -149,3 +159,16 @@ func connectTCP(ip net.IP, remotePort, localPort int, timeout time.Duration) err
 
 	return nil
 }
+
+// localAddrs returns a list of local network interface addresses.
+func localAddrs() ([]string, error) {
+	addrs, err := net.InterfaceAddrs()
+	if err != nil {
+		return nil, err
+	}
+	addrStrs := make([]string, 0, len(addrs))
+	for _, addr := range addrs {
+		addrStrs = append(addrStrs, addr.String())
+	}
+	return addrStrs, nil
+}
diff --git a/test/iptables/nat.go b/test/iptables/nat.go
index a01117ec8..020c862ad 100644
--- a/test/iptables/nat.go
+++ b/test/iptables/nat.go
@@ -15,8 +15,10 @@
 package iptables
 
 import (
+	"errors"
 	"fmt"
 	"net"
+	"time"
 )
 
 const (
@@ -26,6 +28,13 @@ const (
 func init() {
 	RegisterTestCase(NATRedirectUDPPort{})
 	RegisterTestCase(NATDropUDP{})
+	RegisterTestCase(NATPreRedirectIP{})
+	RegisterTestCase(NATPreDontRedirectIP{})
+	RegisterTestCase(NATPreRedirectInvert{})
+	RegisterTestCase(NATOutRedirectIP{})
+	RegisterTestCase(NATOutDontRedirectIP{})
+	RegisterTestCase(NATOutRedirectInvert{})
+	RegisterTestCase(NATRedirectRequiresProtocol{})
 }
 
 // NATRedirectUDPPort tests that packets are redirected to different port.
@@ -53,7 +62,8 @@ func (NATRedirectUDPPort) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
 
-// NATDropUDP tests that packets are not received in ports other than redirect port.
+// NATDropUDP tests that packets are not received in ports other than redirect
+// port.
 type NATDropUDP struct{}
 
 // Name implements TestCase.Name.
@@ -78,3 +88,192 @@ func (NATDropUDP) ContainerAction(ip net.IP) error {
 func (NATDropUDP) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
+
+// NATOutRedirectIP uses iptables to select packets based on destination IP and
+// redirects them.
+type NATOutRedirectIP struct{}
+
+// Name implements TestCase.Name.
+func (NATOutRedirectIP) Name() string {
+	return "NATOutRedirectIP"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATOutRedirectIP) ContainerAction(ip net.IP) error {
+	// Redirect OUTPUT packets to a listening localhost port.
+	dest := net.IP([]byte{200, 0, 0, 2})
+	return loopbackTest(dest, "-A", "OUTPUT", "-d", dest.String(), "-p", "udp", "-j", "REDIRECT", "--to-port", fmt.Sprintf("%d", acceptPort))
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATOutRedirectIP) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// NATOutDontRedirectIP tests that iptables matching with "-d" does not match
+// packets it shouldn't.
+type NATOutDontRedirectIP struct{}
+
+// Name implements TestCase.Name.
+func (NATOutDontRedirectIP) Name() string {
+	return "NATOutDontRedirectIP"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATOutDontRedirectIP) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "OUTPUT", "-d", localIP, "-p", "udp", "-j", "REDIRECT", "--to-port", fmt.Sprintf("%d", dropPort)); err != nil {
+		return err
+	}
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATOutDontRedirectIP) LocalAction(ip net.IP) error {
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// NATOutRedirectInvert tests that iptables can match with "! -d".
+type NATOutRedirectInvert struct{}
+
+// Name implements TestCase.Name.
+func (NATOutRedirectInvert) Name() string {
+	return "NATOutRedirectInvert"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATOutRedirectInvert) ContainerAction(ip net.IP) error {
+	// Redirect OUTPUT packets to a listening localhost port.
+	dest := []byte{200, 0, 0, 3}
+	destStr := "200.0.0.2"
+	return loopbackTest(dest, "-A", "OUTPUT", "!", "-d", destStr, "-p", "udp", "-j", "REDIRECT", "--to-port", fmt.Sprintf("%d", acceptPort))
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATOutRedirectInvert) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// NATPreRedirectIP tests that we can use iptables to select packets based on
+// destination IP and redirect them.
+type NATPreRedirectIP struct{}
+
+// Name implements TestCase.Name.
+func (NATPreRedirectIP) Name() string {
+	return "NATPreRedirectIP"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATPreRedirectIP) ContainerAction(ip net.IP) error {
+	addrs, err := localAddrs()
+	if err != nil {
+		return err
+	}
+
+	var rules [][]string
+	for _, addr := range addrs {
+		rules = append(rules, []string{"-A", "PREROUTING", "-p", "udp", "-d", addr, "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", acceptPort)})
+	}
+	if err := natTableRules(rules); err != nil {
+		return err
+	}
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATPreRedirectIP) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// NATPreDontRedirectIP tests that iptables matching with "-d" does not match
+// packets it shouldn't.
+type NATPreDontRedirectIP struct{}
+
+// Name implements TestCase.Name.
+func (NATPreDontRedirectIP) Name() string {
+	return "NATPreDontRedirectIP"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATPreDontRedirectIP) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "PREROUTING", "-p", "udp", "-d", localIP, "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", dropPort)); err != nil {
+		return err
+	}
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATPreDontRedirectIP) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// NATPreRedirectInvert tests that iptables can match with "! -d".
+type NATPreRedirectInvert struct{}
+
+// Name implements TestCase.Name.
+func (NATPreRedirectInvert) Name() string {
+	return "NATPreRedirectInvert"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATPreRedirectInvert) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "PREROUTING", "-p", "udp", "!", "-d", localIP, "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", acceptPort)); err != nil {
+		return err
+	}
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATPreRedirectInvert) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// NATRedirectRequiresProtocol tests that use of the --to-ports flag requires a
+// protocol to be specified with -p.
+type NATRedirectRequiresProtocol struct{}
+
+// Name implements TestCase.Name.
+func (NATRedirectRequiresProtocol) Name() string {
+	return "NATRedirectRequiresProtocol"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATRedirectRequiresProtocol) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "PREROUTING", "-d", localIP, "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", acceptPort)); err == nil {
+		return errors.New("expected an error using REDIRECT --to-ports without a protocol")
+	}
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATRedirectRequiresProtocol) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// loopbackTests runs an iptables rule and ensures that packets sent to
+// dest:dropPort are received by localhost:acceptPort.
+func loopbackTest(dest net.IP, args ...string) error {
+	if err := natTable(args...); err != nil {
+		return err
+	}
+	sendCh := make(chan error)
+	listenCh := make(chan error)
+	go func() {
+		sendCh <- sendUDPLoop(dest, dropPort, sendloopDuration)
+	}()
+	go func() {
+		listenCh <- listenUDP(acceptPort, sendloopDuration)
+	}()
+	select {
+	case err := <-listenCh:
+		if err != nil {
+			return err
+		}
+	case <-time.After(sendloopDuration):
+		return errors.New("timed out")
+	}
+	// sendCh will always take the full sendloop time.
+	return <-sendCh
+}
-- 
cgit v1.2.3


From 9fccf98c0d990cb32666091855f3a396f762ce55 Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Wed, 26 Feb 2020 13:18:35 -0800
Subject: Fix merge conflicts.

---
 pkg/tcpip/iptables/iptables.go |  5 ++---
 pkg/tcpip/iptables/targets.go  | 14 +++++++-------
 pkg/tcpip/iptables/types.go    |  2 +-
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 80ddbd442..2ab9e0675 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -171,7 +171,6 @@ const (
 	chainReturn
 )
 
-
 // Check runs pkt through the rules for hook. It returns true when the packet
 // should continue traversing the network stack and false when it should be
 // dropped.
@@ -242,7 +241,7 @@ func (it *IPTables) checkChain(hook Hook, pkt tcpip.PacketBuffer, table Table, r
 				return chainDrop
 			case chainReturn:
 				ruleIdx++
-				continue   
+				continue
 			default:
 				panic(fmt.Sprintf("Unknown verdict: %d", verdict))
 			}
@@ -289,5 +288,5 @@ func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ru
 	}
 
 	// All the matchers matched, so run the target.
-  return rule.Target.Action(pkt, rule.Filter)
+	return rule.Target.Action(pkt)
 }
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 5dbb28145..96318118c 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -24,7 +24,7 @@ import (
 type AcceptTarget struct{}
 
 // Action implements Target.Action.
-func (AcceptTarget) Action(packet tcpip.PacketBuffer, filter IPHeaderFilter) (RuleVerdict, int) {
+func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
 	return RuleAccept, 0
 }
 
@@ -32,7 +32,7 @@ func (AcceptTarget) Action(packet tcpip.PacketBuffer, filter IPHeaderFilter) (Ru
 type DropTarget struct{}
 
 // Action implements Target.Action.
-func (DropTarget) Action(packet tcpip.PacketBuffer, filter IPHeaderFilter) (RuleVerdict, int) {
+func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
 	return RuleDrop, 0
 }
 
@@ -41,7 +41,7 @@ func (DropTarget) Action(packet tcpip.PacketBuffer, filter IPHeaderFilter) (Rule
 type ErrorTarget struct{}
 
 // Action implements Target.Action.
-func (ErrorTarget) Action(packet tcpip.PacketBuffer, filter IPHeaderFilter) (RuleVerdict, int) {
+func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
 	log.Debugf("ErrorTarget triggered.")
 	return RuleDrop, 0
 }
@@ -52,7 +52,7 @@ type UserChainTarget struct {
 }
 
 // Action implements Target.Action.
-func (UserChainTarget) Action(tcpip.PacketBuffer, IPHeaderFilter) (RuleVerdict, int) {
+func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
 	panic("UserChainTarget should never be called.")
 }
 
@@ -61,7 +61,7 @@ func (UserChainTarget) Action(tcpip.PacketBuffer, IPHeaderFilter) (RuleVerdict,
 type ReturnTarget struct{}
 
 // Action implements Target.Action.
-func (ReturnTarget) Action(tcpip.PacketBuffer, IPHeaderFilter) (RuleVerdict, int) {
+func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
 	return RuleReturn, 0
 }
 
@@ -86,7 +86,7 @@ type RedirectTarget struct {
 }
 
 // Action implements Target.Action.
-func (rt RedirectTarget) Action(pkt tcpip.PacketBuffer, filter IPHeaderFilter) (RuleVerdict, int) {
+func (rt RedirectTarget) Action(pkt tcpip.PacketBuffer) (RuleVerdict, int) {
 	headerView := pkt.Data.First()
 
 	// Network header should be set.
@@ -99,7 +99,7 @@ func (rt RedirectTarget) Action(pkt tcpip.PacketBuffer, filter IPHeaderFilter) (
 	// we need to change dest address (for OUTPUT chain) or ports.
 	hlen := int(netHeader.HeaderLength())
 
-	switch protocol := filter.Protocol; protocol {
+	switch protocol := netHeader.TransportProtocol(); protocol {
 	case header.UDPProtocolNumber:
 		udp := header.UDP(headerView[hlen:])
 		udp.SetDestinationPort(rt.MinPort)
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 8bd3a2c94..9c2ad2d46 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -164,5 +164,5 @@ type Target interface {
 	// Action takes an action on the packet and returns a verdict on how
 	// traversal should (or should not) continue. If the return value is
 	// Jump, it also returns the name of the chain to jump to.
-	Action(packet tcpip.PacketBuffer, filter IPHeaderFilter) (RuleVerdict, int)
+	Action(packet tcpip.PacketBuffer) (RuleVerdict, int)
 }
-- 
cgit v1.2.3


From de0b2ebf8635a75bfabfd0a8b48de7923017574e Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Wed, 26 Feb 2020 18:16:19 -0800
Subject: Add getsockopt tests for SO_SNDTIMEO and SO_RCVTIMEO

PiperOrigin-RevId: 297485310
---
 test/syscalls/linux/socket_generic.cc | 96 ++++++++++++++++++++++++++++++++---
 1 file changed, 88 insertions(+), 8 deletions(-)

diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc
index e8f24a59e..f7d6139f1 100644
--- a/test/syscalls/linux/socket_generic.cc
+++ b/test/syscalls/linux/socket_generic.cc
@@ -447,6 +447,60 @@ TEST_P(AllSocketPairTest, RecvTimeoutRecvmsgSucceeds) {
               SyscallFailsWithErrno(EAGAIN));
 }
 
+TEST_P(AllSocketPairTest, SendTimeoutDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  timeval actual_tv = {.tv_sec = -1, .tv_usec = -1};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv_usec, 0);
+}
+
+TEST_P(AllSocketPairTest, SetGetSendTimeout) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  timeval tv = {.tv_sec = 89, .tv_usec = 42000};
+  EXPECT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)),
+      SyscallSucceeds());
+
+  timeval actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 89);
+  EXPECT_EQ(actual_tv.tv_usec, 42000);
+}
+
+TEST_P(AllSocketPairTest, SetGetSendTimeoutLargerArg) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  struct timeval_with_extra {
+    struct timeval tv;
+    int64_t extra_data;
+  } ABSL_ATTRIBUTE_PACKED;
+
+  timeval_with_extra tv_extra = {
+      .tv = {.tv_sec = 0, .tv_usec = 123000},
+  };
+
+  EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+                         &tv_extra, sizeof(tv_extra)),
+              SyscallSucceeds());
+
+  timeval_with_extra actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv.tv_usec, 123000);
+}
+
 TEST_P(AllSocketPairTest, SendTimeoutAllowsWrite) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -491,18 +545,36 @@ TEST_P(AllSocketPairTest, SendTimeoutAllowsSendmsg) {
   ASSERT_NO_FATAL_FAILURE(SendNullCmsg(sockets->first_fd(), buf, sizeof(buf)));
 }
 
-TEST_P(AllSocketPairTest, SoRcvTimeoIsSet) {
+TEST_P(AllSocketPairTest, RecvTimeoutDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
-  struct timeval tv {
-    .tv_sec = 0, .tv_usec = 35
-  };
+  timeval actual_tv = {.tv_sec = -1, .tv_usec = -1};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv_usec, 0);
+}
+
+TEST_P(AllSocketPairTest, SetGetRecvTimeout) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  timeval tv = {.tv_sec = 123, .tv_usec = 456000};
   EXPECT_THAT(
       setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
       SyscallSucceeds());
+
+  timeval actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 123);
+  EXPECT_EQ(actual_tv.tv_usec, 456000);
 }
 
-TEST_P(AllSocketPairTest, SoRcvTimeoIsSetLargerArg) {
+TEST_P(AllSocketPairTest, SetGetRecvTimeoutLargerArg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
   struct timeval_with_extra {
@@ -510,13 +582,21 @@ TEST_P(AllSocketPairTest, SoRcvTimeoIsSetLargerArg) {
     int64_t extra_data;
   } ABSL_ATTRIBUTE_PACKED;
 
-  timeval_with_extra tv_extra;
-  tv_extra.tv.tv_sec = 0;
-  tv_extra.tv.tv_usec = 25;
+  timeval_with_extra tv_extra = {
+      .tv = {.tv_sec = 0, .tv_usec = 432000},
+  };
 
   EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
                          &tv_extra, sizeof(tv_extra)),
               SyscallSucceeds());
+
+  timeval_with_extra actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv.tv_usec, 432000);
 }
 
 TEST_P(AllSocketPairTest, RecvTimeoutRecvmsgOneSecondSucceeds) {
-- 
cgit v1.2.3


From 8fb84f78adfc0dba964ebe97edb51ebf8a80f752 Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Wed, 26 Feb 2020 19:28:20 -0800
Subject: Fix construct of linux.Stat for arm64.

PiperOrigin-RevId: 297494373
---
 pkg/sentry/syscalls/linux/BUILD              |  2 ++
 pkg/sentry/syscalls/linux/sys_stat.go        | 18 -----------
 pkg/sentry/syscalls/linux/sys_stat_amd64.go  | 45 +++++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/sys_stat_arm64.go  | 45 +++++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/vfs2/BUILD         |  2 ++
 pkg/sentry/syscalls/linux/vfs2/stat.go       | 23 --------------
 pkg/sentry/syscalls/linux/vfs2/stat_amd64.go | 46 ++++++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/vfs2/stat_arm64.go | 46 ++++++++++++++++++++++++++++
 8 files changed, 186 insertions(+), 41 deletions(-)
 create mode 100644 pkg/sentry/syscalls/linux/sys_stat_amd64.go
 create mode 100644 pkg/sentry/syscalls/linux/sys_stat_arm64.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/stat_arm64.go

diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index c7883e68e..0d24fd3c4 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -42,6 +42,8 @@ go_library(
         "sys_socket.go",
         "sys_splice.go",
         "sys_stat.go",
+        "sys_stat_amd64.go",
+        "sys_stat_arm64.go",
         "sys_sync.go",
         "sys_sysinfo.go",
         "sys_syslog.go",
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 701b27b4a..9bd2df104 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -25,24 +25,6 @@ import (
 
 // LINT.IfChange
 
-func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
-	return linux.Stat{
-		Dev:     sattr.DeviceID,
-		Ino:     sattr.InodeID,
-		Nlink:   uattr.Links,
-		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
-		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
-		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
-		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
-		Size:    uattr.Size,
-		Blksize: sattr.BlockSize,
-		Blocks:  uattr.Usage / 512,
-		ATime:   uattr.AccessTime.Timespec(),
-		MTime:   uattr.ModificationTime.Timespec(),
-		CTime:   uattr.StatusChangeTime.Timespec(),
-	}
-}
-
 // Stat implements linux syscall stat(2).
 func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
new file mode 100644
index 000000000..0a04a6113
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// LINT.IfChange
+
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+	return linux.Stat{
+		Dev:     sattr.DeviceID,
+		Ino:     sattr.InodeID,
+		Nlink:   uattr.Links,
+		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+		Size:    uattr.Size,
+		Blksize: sattr.BlockSize,
+		Blocks:  uattr.Usage / 512,
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	}
+}
+
+// LINT.ThenChange(vfs2/stat_amd64.go)
diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
new file mode 100644
index 000000000..5a3b1bfad
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// LINT.IfChange
+
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+	return linux.Stat{
+		Dev:     sattr.DeviceID,
+		Ino:     sattr.InodeID,
+		Nlink:   uint32(uattr.Links),
+		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+		Size:    uattr.Size,
+		Blksize: int32(sattr.BlockSize),
+		Blocks:  uattr.Usage / 512,
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	}
+}
+
+// LINT.ThenChange(vfs2/stat_arm64.go)
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index f51761e81..e7695e995 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -22,6 +22,8 @@ go_library(
         "read_write.go",
         "setstat.go",
         "stat.go",
+        "stat_amd64.go",
+        "stat_arm64.go",
         "sync.go",
         "xattr.go",
     ],
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
index dca8d7011..12c532310 100644
--- a/pkg/sentry/syscalls/linux/vfs2/stat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -113,29 +113,6 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
 	return stat.CopyOut(t, statAddr)
 }
 
-// This takes both input and output as pointer arguments to avoid copying large
-// structs.
-func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
-	// Linux just copies fields from struct kstat without regard to struct
-	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
-	userns := t.UserNamespace()
-	*stat = linux.Stat{
-		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
-		Ino:     statx.Ino,
-		Nlink:   uint64(statx.Nlink),
-		Mode:    uint32(statx.Mode),
-		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
-		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
-		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
-		Size:    int64(statx.Size),
-		Blksize: int64(statx.Blksize),
-		Blocks:  int64(statx.Blocks),
-		ATime:   timespecFromStatxTimestamp(statx.Atime),
-		MTime:   timespecFromStatxTimestamp(statx.Mtime),
-		CTime:   timespecFromStatxTimestamp(statx.Ctime),
-	}
-}
-
 func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec {
 	return linux.Timespec{
 		Sec:  sxts.Sec,
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
new file mode 100644
index 000000000..2da538fc6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
@@ -0,0 +1,46 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// This takes both input and output as pointer arguments to avoid copying large
+// structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+	// Linux just copies fields from struct kstat without regard to struct
+	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+	userns := t.UserNamespace()
+	*stat = linux.Stat{
+		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+		Ino:     statx.Ino,
+		Nlink:   uint64(statx.Nlink),
+		Mode:    uint32(statx.Mode),
+		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+		Size:    int64(statx.Size),
+		Blksize: int64(statx.Blksize),
+		Blocks:  int64(statx.Blocks),
+		ATime:   timespecFromStatxTimestamp(statx.Atime),
+		MTime:   timespecFromStatxTimestamp(statx.Mtime),
+		CTime:   timespecFromStatxTimestamp(statx.Ctime),
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go
new file mode 100644
index 000000000..88b9c7627
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go
@@ -0,0 +1,46 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// This takes both input and output as pointer arguments to avoid copying large
+// structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+	// Linux just copies fields from struct kstat without regard to struct
+	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+	userns := t.UserNamespace()
+	*stat = linux.Stat{
+		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+		Ino:     statx.Ino,
+		Nlink:   uint32(statx.Nlink),
+		Mode:    uint32(statx.Mode),
+		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+		Size:    int64(statx.Size),
+		Blksize: int32(statx.Blksize),
+		Blocks:  int64(statx.Blocks),
+		ATime:   timespecFromStatxTimestamp(statx.Atime),
+		MTime:   timespecFromStatxTimestamp(statx.Mtime),
+		CTime:   timespecFromStatxTimestamp(statx.Ctime),
+	}
+}
-- 
cgit v1.2.3


From 5f0e8e6239108c1b597e1cb8cb588bbf09d192b7 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Thu, 27 Feb 2020 01:19:28 -0500
Subject: Prepare the vcpu environment for sentry on Arm64

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/ring0/entry_arm64.s | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index d42eda37b..db6465663 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -394,6 +394,8 @@ TEXT ·Current(SB),NOSPLIT,$0-8
 
 #define STACK_FRAME_SIZE 16
 
+// kernelExitToEl0 is the entrypoint for application in guest_el0.
+// Prepare the vcpu environment for container application.
 TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 	// Step1, save sentry context into memory.
 	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS)
@@ -464,7 +466,23 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 
 	ERET()
 
+// kernelExitToEl1 is the entrypoint for sentry in guest_el1.
+// Prepare the vcpu environment for sentry.
 TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
+	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
+
+	MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R1
+	WORD $0xd5184001  //MSR R1, SPSR_EL1
+
+	MOVD CPU_REGISTERS+PTRACE_PC(RSV_REG), R1
+	MSR R1, ELR_EL1
+
+	MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R1
+	MOVD R1, RSP
+
+	REGISTERS_LOAD(RSV_REG, CPU_REGISTERS)
+	MOVD CPU_REGISTERS+PTRACE_R9(RSV_REG), RSV_REG_APP
+
 	ERET()
 
 // Start is the CPU entrypoint.
-- 
cgit v1.2.3


From 8e2b14fecf204b35fe258816792bdc03a1ca0912 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 27 Feb 2020 10:21:33 -0800
Subject: Use automated release notes, if available.

PiperOrigin-RevId: 297628615
---
 scripts/release.sh   | 13 ++++++++++++-
 tools/tag_release.sh | 12 +++++++++---
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/scripts/release.sh b/scripts/release.sh
index 091abf87f..e14ba04a7 100755
--- a/scripts/release.sh
+++ b/scripts/release.sh
@@ -25,6 +25,14 @@ if ! [[ -v KOKORO_RELEASE_TAG ]]; then
   echo "No KOKORO_RELEASE_TAG provided." >&2
   exit 1
 fi
+if ! [[ -v KOKORO_RELNOTES ]]; then
+  echo "No KOKORO_RELNOTES provided." >&2
+  exit 1
+fi
+if ! [[ -r "${KOKORO_ARTIFACTS_DIR}/${KOKORO_RELNOTES}" ]]; then
+  echo "The file '${KOKORO_ARTIFACTS_DIR}/${KOKORO_RELNOTES}' is not readable." >&2
+  exit 1
+fi
 
 # Unless an explicit releaser is provided, use the bot e-mail.
 declare -r KOKORO_RELEASE_AUTHOR=${KOKORO_RELEASE_AUTHOR:-gvisor-bot}
@@ -46,4 +54,7 @@ EOF
 fi
 
 # Run the release tool, which pushes to the origin repository.
-tools/tag_release.sh "${KOKORO_RELEASE_COMMIT}" "${KOKORO_RELEASE_TAG}"
+tools/tag_release.sh \
+    "${KOKORO_RELEASE_COMMIT}" \
+    "${KOKORO_RELEASE_TAG}" \
+    "${KOKORO_ARTIFACTS_DIR}/${KOKORO_RELNOTES}"
diff --git a/tools/tag_release.sh b/tools/tag_release.sh
index f33b902d6..4dbfe420a 100755
--- a/tools/tag_release.sh
+++ b/tools/tag_release.sh
@@ -21,13 +21,19 @@
 set -xeu
 
 # Check arguments.
-if [ "$#" -ne 2 ]; then
-  echo "usage: $0 <commit|revid> <release.rc>"
+if [ "$#" -ne 3 ]; then
+  echo "usage: $0 <commit|revid> <release.rc> <message-file>"
   exit 1
 fi
 
 declare -r target_commit="$1"
 declare -r release="$2"
+declare -r message_file="$3"
+
+if ! [[ -r "${message_file}" ]]; then
+  echo "error: message file '${message_file}' is not readable."
+  exit 1
+fi
 
 closest_commit() {
   while read line; do
@@ -64,6 +70,6 @@ fi
 
 # Tag the given commit (annotated, to record the committer).
 declare -r tag="release-${release}"
-(git tag -m "Release ${release}" -a "${tag}" "${commit}" && \
+(git tag -F "${message_file}" -a "${tag}" "${commit}" && \
   git push origin tag "${tag}") || \
   (git tag -d "${tag}" && false)
-- 
cgit v1.2.3


From abf7ebcd38e8c2750f4542f29115140bb2b44a9b Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Thu, 27 Feb 2020 10:59:32 -0800
Subject: Internal change.

PiperOrigin-RevId: 297638665
---
 pkg/sentry/socket/netstack/netstack.go |  40 +++++++++--
 pkg/tcpip/transport/packet/endpoint.go |  21 +++++-
 test/syscalls/linux/packet_socket.cc   | 124 ++++++++++++++++++++++++++++++---
 3 files changed, 167 insertions(+), 18 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index e187276c5..48c268bfa 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -712,14 +712,40 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
-	addr, family, err := AddressAndFamily(sockaddr)
-	if err != nil {
-		return err
-	}
-	if err := s.checkFamily(family, true /* exact */); err != nil {
-		return err
+	family := usermem.ByteOrder.Uint16(sockaddr)
+	var addr tcpip.FullAddress
+
+	// Bind for AF_PACKET requires only family, protocol and ifindex.
+	// In function AddressAndFamily, we check the address length which is
+	// not needed for AF_PACKET bind.
+	if family == linux.AF_PACKET {
+		var a linux.SockAddrLink
+		if len(sockaddr) < sockAddrLinkSize {
+			return syserr.ErrInvalidArgument
+		}
+		binary.Unmarshal(sockaddr[:sockAddrLinkSize], usermem.ByteOrder, &a)
+
+		if a.Protocol != uint16(s.protocol) {
+			return syserr.ErrInvalidArgument
+		}
+
+		addr = tcpip.FullAddress{
+			NIC:  tcpip.NICID(a.InterfaceIndex),
+			Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
+		}
+	} else {
+		var err *syserr.Error
+		addr, family, err = AddressAndFamily(sockaddr)
+		if err != nil {
+			return err
+		}
+
+		if err = s.checkFamily(family, true /* exact */); err != nil {
+			return err
+		}
+
+		addr = s.mapFamily(addr, family)
 	}
-	addr = s.mapFamily(addr, family)
 
 	// Issue the bind request to the endpoint.
 	return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 5722815e9..09a1cd436 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -76,6 +76,7 @@ type endpoint struct {
 	sndBufSize int
 	closed     bool
 	stats      tcpip.TransportEndpointStats `state:"nosave"`
+	bound      bool
 }
 
 // NewEndpoint returns a new packet endpoint.
@@ -125,6 +126,7 @@ func (ep *endpoint) Close() {
 	}
 
 	ep.closed = true
+	ep.bound = false
 	ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 }
 
@@ -216,7 +218,24 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
 	// sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex."
 	// - packet(7).
 
-	return tcpip.ErrNotSupported
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+
+	if ep.bound {
+		return tcpip.ErrAlreadyBound
+	}
+
+	// Unregister endpoint with all the nics.
+	ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep)
+
+	// Bind endpoint to receive packets from specific interface.
+	if err := ep.stack.RegisterPacketEndpoint(addr.NIC, ep.netProto, ep); err != nil {
+		return err
+	}
+
+	ep.bound = true
+
+	return nil
 }
 
 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index 92ae55eec..bc22de788 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <arpa/inet.h>
+#include <ifaddrs.h>
 #include <linux/capability.h>
 #include <linux/if_arp.h>
 #include <linux/if_packet.h>
@@ -163,16 +164,11 @@ int CookedPacketTest::GetLoopbackIndex() {
   return ifr.ifr_ifindex;
 }
 
-// Receive via a packet socket.
-TEST_P(CookedPacketTest, Receive) {
-  // Let's use a simple IP payload: a UDP datagram.
-  FileDescriptor udp_sock =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
-  SendUDPMessage(udp_sock.get());
-
+// Receive and verify the message via packet socket on interface.
+void ReceiveMessage(int sock, int ifindex) {
   // Wait for the socket to become readable.
   struct pollfd pfd = {};
-  pfd.fd = socket_;
+  pfd.fd = sock;
   pfd.events = POLLIN;
   EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 2000), SyscallSucceedsWithValue(1));
 
@@ -182,9 +178,10 @@ TEST_P(CookedPacketTest, Receive) {
   char buf[64];
   struct sockaddr_ll src = {};
   socklen_t src_len = sizeof(src);
-  ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0,
+  ASSERT_THAT(recvfrom(sock, buf, sizeof(buf), 0,
                        reinterpret_cast<struct sockaddr*>(&src), &src_len),
               SyscallSucceedsWithValue(packet_size));
+
   // sockaddr_ll ends with an 8 byte physical address field, but ethernet
   // addresses only use 6 bytes.  Linux used to return sizeof(sockaddr_ll)-2
   // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns
@@ -194,7 +191,7 @@ TEST_P(CookedPacketTest, Receive) {
   // TODO(b/129292371): Verify protocol once we return it.
   // Verify the source address.
   EXPECT_EQ(src.sll_family, AF_PACKET);
-  EXPECT_EQ(src.sll_ifindex, GetLoopbackIndex());
+  EXPECT_EQ(src.sll_ifindex, ifindex);
   EXPECT_EQ(src.sll_halen, ETH_ALEN);
   // This came from the loopback device, so the address is all 0s.
   for (int i = 0; i < src.sll_halen; i++) {
@@ -222,6 +219,18 @@ TEST_P(CookedPacketTest, Receive) {
   EXPECT_EQ(strncmp(payload, kMessage, sizeof(kMessage)), 0);
 }
 
+// Receive via a packet socket.
+TEST_P(CookedPacketTest, Receive) {
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  SendUDPMessage(udp_sock.get());
+
+  // Receive and verify the data.
+  int loopback_index = GetLoopbackIndex();
+  ReceiveMessage(socket_, loopback_index);
+}
+
 // Send via a packet socket.
 TEST_P(CookedPacketTest, Send) {
   // TODO(b/129292371): Remove once we support packet socket writing.
@@ -313,6 +322,101 @@ TEST_P(CookedPacketTest, Send) {
   EXPECT_EQ(src.sin_addr.s_addr, htonl(INADDR_LOOPBACK));
 }
 
+// Bind and receive via packet socket.
+TEST_P(CookedPacketTest, BindReceive) {
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = GetLoopbackIndex();
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  SendUDPMessage(udp_sock.get());
+
+  // Receive and verify the data.
+  ReceiveMessage(socket_, bind_addr.sll_ifindex);
+}
+
+// Double Bind socket.
+TEST_P(CookedPacketTest, DoubleBind) {
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = GetLoopbackIndex();
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Binding socket again should fail.
+  ASSERT_THAT(
+      bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+           sizeof(bind_addr)),
+      // Linux 4.09 returns EINVAL here, but some time before 4.19 it switched
+      // to EADDRINUSE.
+      AnyOf(SyscallFailsWithErrno(EADDRINUSE), SyscallFailsWithErrno(EINVAL)));
+}
+
+// Bind and verify we do not receive data on interface which is not bound
+TEST_P(CookedPacketTest, BindDrop) {
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+  struct ifaddrs* if_addr_list = nullptr;
+  auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); });
+
+  ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds());
+
+  // Get interface other than loopback.
+  struct ifreq ifr = {};
+  for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) {
+    if (strcmp(i->ifa_name, "lo") != 0) {
+      strncpy(ifr.ifr_name, i->ifa_name, sizeof(ifr.ifr_name));
+      break;
+    }
+  }
+
+  // Skip if no interface is available other than loopback.
+  if (strlen(ifr.ifr_name) == 0) {
+    GTEST_SKIP();
+  }
+
+  // Get interface index.
+  EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds());
+  EXPECT_NE(ifr.ifr_ifindex, 0);
+
+  // Bind to packet socket requires only family, protocol and ifindex.
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = ifr.ifr_ifindex;
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Send to loopback interface.
+  struct sockaddr_in dest = {};
+  dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+  dest.sin_family = AF_INET;
+  dest.sin_port = kPort;
+  EXPECT_THAT(sendto(udp_sock.get(), kMessage, sizeof(kMessage), 0,
+                     reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)),
+              SyscallSucceedsWithValue(sizeof(kMessage)));
+
+  // Wait and make sure the socket never receives any data.
+  struct pollfd pfd = {};
+  pfd.fd = socket_;
+  pfd.events = POLLIN;
+  EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest,
                          ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
-- 
cgit v1.2.3


From 246b34dabda51a7c8239fb4eb6d2e05756a92ad4 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 27 Feb 2020 20:26:02 +0000
Subject: Bump rack from 2.0.7 to 2.2.2 in /benchmarks/workloads/ruby

Bumps [rack](https://github.com/rack/rack) from 2.0.7 to 2.2.2.
- [Release notes](https://github.com/rack/rack/releases)
- [Changelog](https://github.com/rack/rack/blob/master/CHANGELOG.md)
- [Commits](https://github.com/rack/rack/compare/2.0.7...v2.2.2)

Signed-off-by: dependabot[bot] <support@github.com>
---
 benchmarks/workloads/ruby/Gemfile.lock | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/benchmarks/workloads/ruby/Gemfile.lock b/benchmarks/workloads/ruby/Gemfile.lock
index b44817bd3..da94aefc4 100644
--- a/benchmarks/workloads/ruby/Gemfile.lock
+++ b/benchmarks/workloads/ruby/Gemfile.lock
@@ -1,28 +1,41 @@
 GEM
   remote: https://rubygems.org/
   specs:
+    activemerchant (1.105.0)
+      activesupport (>= 4.2)
+      builder (>= 2.1.2, < 4.0.0)
+      i18n (>= 0.6.9)
+      nokogiri (~> 1.4)
     activesupport (5.2.3)
       concurrent-ruby (~> 1.0, >= 1.0.2)
       i18n (>= 0.7, < 2)
       minitest (~> 5.1)
       tzinfo (~> 1.1)
+    bcrypt (3.1.13)
+    builder (3.2.4)
     cassandra-driver (3.2.3)
       ione (~> 1.2)
     concurrent-ruby (1.1.5)
+    ffi (1.12.2)
     i18n (1.6.0)
       concurrent-ruby (~> 1.0)
     ione (1.2.4)
+    mini_portile2 (2.4.0)
     minitest (5.11.3)
     mustermann (1.0.3)
+    nokogiri (1.10.8)
+      mini_portile2 (~> 2.4.0)
     pdf-core (0.7.0)
     prawn (2.2.2)
       pdf-core (~> 0.7.0)
       ttfunk (~> 1.5)
     puma (3.12.1)
-    rack (2.0.7)
+    rack (2.2.2)
     rack-protection (2.0.5)
       rack
     rake (12.3.2)
+    rbnacl (7.1.1)
+      ffi
     redis (4.1.1)
     ruby-fann (1.2.6)
     sinatra (2.0.5)
@@ -43,9 +56,12 @@ PLATFORMS
   ruby
 
 DEPENDENCIES
+  activemerchant
+  bcrypt
   cassandra-driver
   puma
   rake
+  rbnacl
   redis
   ruby-fann
   sinatra
-- 
cgit v1.2.3


From 0eafb7eb278233e7cb3888cc7b674dd5879d4667 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 27 Feb 2020 12:46:04 -0800
Subject: Bump puma from 3.12.0 to 3.12.2 in
 /benchmarks/workloads/ruby_template

Bumps [puma](https://github.com/puma/puma) from 3.12.0 to 3.12.2.
<details>
<summary>Release notes</summary>

*Sourced from [puma's releases](https://github.com/puma/puma/releases).*

> v3.12.1
</details>
<details>
<summary>Changelog</summary>

*Sourced from [puma's changelog](https://github.com/puma/puma/blob/master/History.md).*

> ## 4.3.1 and 3.12.2 / 2019-12-05
>
> * Security
>   * Fix: a poorly-behaved client could use keepalive requests to monopolize Puma's reactor and create a denial of service attack. CVE-2019-16770.
>
> ## 4.3.0 / 2019-11-07
>
> * Features
>   * Strip whitespace at end of HTTP headers ([#2010](https://github-redirect.dependabot.com/puma/puma/issues/2010))
>   * Optimize HTTP parser for JRuby ([#2012](https://github-redirect.dependabot.com/puma/puma/issues/2012))
>   * Add SSL support for the control app and cli ([#2046](https://github-redirect.dependabot.com/puma/puma/issues/2046), [#2052](https://github-redirect.dependabot.com/puma/puma/issues/2052))
>
> * Bugfixes
>   * Fix Errno::EINVAL when SSL is enabled and browser rejects cert ([#1564](https://github-redirect.dependabot.com/puma/puma/issues/1564))
>   * Fix pumactl defaulting puma to development if an environment was not specified ([#2035](https://github-redirect.dependabot.com/puma/puma/issues/2035))
>   * Fix closing file stream when reading pid from pidfile ([#2048](https://github-redirect.dependabot.com/puma/puma/issues/2048))
>   * Fix a typo in configuration option `--extra_runtime_dependencies` ([#2050](https://github-redirect.dependabot.com/puma/puma/issues/2050))
>
> ## 4.2.1 / 2019-10-07
>
> * 3 bugfixes
>   * Fix socket activation of systemd (pre-existing) unix binder files ([#1842](https://github-redirect.dependabot.com/puma/puma/issues/1842), [#1988](https://github-redirect.dependabot.com/puma/puma/issues/1988))
>   * Deal with multiple calls to bind correctly ([#1986](https://github-redirect.dependabot.com/puma/puma/issues/1986), [#1994](https://github-redirect.dependabot.com/puma/puma/issues/1994), [#2006](https://github-redirect.dependabot.com/puma/puma/issues/2006))
>   * Accepts symbols for `verify_mode` ([#1222](https://github-redirect.dependabot.com/puma/puma/issues/1222))
>
> ## 4.2.0 / 2019-09-23
>
> * 6 features
>   * Pumactl has a new -e environment option and reads `config/puma/<environment>.rb` config files ([#1885](https://github-redirect.dependabot.com/puma/puma/issues/1885))
>   * Semicolons are now allowed in URL paths (MRI only), useful for Angular or Redmine ([#1934](https://github-redirect.dependabot.com/puma/puma/issues/1934))
>   * Allow extra dependencies to be defined when using prune_bundler ([#1105](https://github-redirect.dependabot.com/puma/puma/issues/1105))
>   * Puma now reports the correct port when binding to port 0, also reports other listeners when binding to localhost ([#1786](https://github-redirect.dependabot.com/puma/puma/issues/1786))
>   * Sending SIGINFO to any Puma worker now prints currently active threads and their backtraces ([#1320](https://github-redirect.dependabot.com/puma/puma/issues/1320))
>   * Puma threads all now have their name set on Ruby 2.3+ ([#1968](https://github-redirect.dependabot.com/puma/puma/issues/1968))
> * 4 bugfixes
>   * Fix some misbehavior with phased restart and externally SIGTERMed workers ([#1908](https://github-redirect.dependabot.com/puma/puma/issues/1908), [#1952](https://github-redirect.dependabot.com/puma/puma/issues/1952))
>   * Fix socket closing on error ([#1941](https://github-redirect.dependabot.com/puma/puma/issues/1941))
>   * Removed unnecessary SIGINT trap for JRuby that caused some race conditions ([#1961](https://github-redirect.dependabot.com/puma/puma/issues/1961))
>   * Fix socket files being left around after process stopped ([#1970](https://github-redirect.dependabot.com/puma/puma/issues/1970))
> * Absolutely thousands of lines of test improvements and fixes thanks to [@&#8203;MSP-Greg](https://github.com/MSP-Greg)
>
> ## 4.1.1 / 2019-09-05
>
> * 3 bugfixes
>   * Revert our attempt to not dup STDOUT/STDERR ([#1946](https://github-redirect.dependabot.com/puma/puma/issues/1946))
>   * Fix socket close on error ([#1941](https://github-redirect.dependabot.com/puma/puma/issues/1941))
>   * Fix workers not shutting down correctly ([#1908](https://github-redirect.dependabot.com/puma/puma/issues/1908))
>
> ## 4.1.0 / 2019-08-08
>
></tr></table> ... (truncated)
</details>
<details>
<summary>Commits</summary>

- [`bb29fc7`](https://github.com/puma/puma/commit/bb29fc7fe8f822d0f72706a1ae86e49af3476777) 3.12.2
- [`058df12`](https://github.com/puma/puma/commit/058df12b78e7d1ec661c3b8777f26a736c26675b) 4.3.1 and 4.2.1 release notes
- [`06053e6`](https://github.com/puma/puma/commit/06053e60908074bb38293d4449ea261cb009b53e) Merge pull request from GHSA-7xx3-m584-x994
- [`461c9e9`](https://github.com/puma/puma/commit/461c9e99783e5f69e632acedae83be55017d5fe4) Docs files
- [`7e2c88d`](https://github.com/puma/puma/commit/7e2c88d4131a1a70f551287e49b8f527d29d0469) v3.12.1
- [`36964ec`](https://github.com/puma/puma/commit/36964ec42982d7b3205760bc2bf9ccf3fec8af69) Merge pull request [#1700](https://github-redirect.dependabot.com/puma/puma/issues/1700) from schneems/schneems/fix-puma-rack-handler-config
- [`c24c0c8`](https://github.com/puma/puma/commit/c24c0c883496f581d9092bbe7f7431129eeb7190) Rack handler should use provided default host
- [`e5d566e`](https://github.com/puma/puma/commit/e5d566ed81f3663d70f0318f8bf3d858734cb74b) Merge pull request [#1682](https://github-redirect.dependabot.com/puma/puma/issues/1682) from MSP-Greg/update-travis-ruby
- [`cecc44a`](https://github.com/puma/puma/commit/cecc44aa0ae326e46031b48023253d08df706455) Merge pull request [#1701](https://github-redirect.dependabot.com/puma/puma/issues/1701) from schneems/schneems/m
- [`ce57cfb`](https://github.com/puma/puma/commit/ce57cfb8c3c8259cda13c322de32dd4ff07ec03a) Allow running individual tests via the `m` gem.
- Additional commits viewable in [compare view](https://github.com/puma/puma/compare/v3.12.0...v3.12.2)
</details>
<br />

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=puma&package-manager=bundler&previous-version=3.12.0&new-version=3.12.2)](https://help.github.com/articles/configuring-automated-security-fixes)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot ignore this [patch|minor|major] version` will close this PR and stop Dependabot creating any more for this minor/major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
- `@dependabot use these labels` will set the current labels as the default for future PRs for this repo and language
- `@dependabot use these reviewers` will set the current reviewers as the default for future PRs for this repo and language
- `@dependabot use these assignees` will set the current assignees as the default for future PRs for this repo and language
- `@dependabot use this milestone` will set the current milestone as the default for future PRs for this repo and language

You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/google/gvisor/network/alerts).

</details>

COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/1345 from google:dependabot/bundler/benchmarks/workloads/ruby_template/puma-3.12.2 2be8d923b4cf5452e763ce369803f2729876b209
PiperOrigin-RevId: 297664218
---
 benchmarks/workloads/ruby_template/Gemfile.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/workloads/ruby_template/Gemfile.lock b/benchmarks/workloads/ruby_template/Gemfile.lock
index dd8d56fb7..af03ed7fd 100644
--- a/benchmarks/workloads/ruby_template/Gemfile.lock
+++ b/benchmarks/workloads/ruby_template/Gemfile.lock
@@ -2,25 +2,25 @@ GEM
   remote: https://rubygems.org/
   specs:
     mustermann (1.0.3)
-    puma (3.12.0)
+    puma (3.12.2)
     rack (2.0.6)
     rack-protection (2.0.5)
       rack
+    redis (4.1.0)
     sinatra (2.0.5)
       mustermann (~> 1.0)
       rack (~> 2.0)
       rack-protection (= 2.0.5)
       tilt (~> 2.0)
     tilt (2.0.9)
-    redis (4.1.0)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
   puma
-  sinatra
   redis
+  sinatra
 
 BUNDLED WITH
    1.17.1
\ No newline at end of file
-- 
cgit v1.2.3


From ff681b174752c5778f60d5dcb3c846d018ee42b1 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 27 Feb 2020 20:52:41 +0000
Subject: Bump puma from 3.12.1 to 3.12.2 in /benchmarks/workloads/ruby

Bumps [puma](https://github.com/puma/puma) from 3.12.1 to 3.12.2.
- [Release notes](https://github.com/puma/puma/releases)
- [Changelog](https://github.com/puma/puma/blob/master/History.md)
- [Commits](https://github.com/puma/puma/compare/v3.12.1...v3.12.2)

Signed-off-by: dependabot[bot] <support@github.com>
---
 benchmarks/workloads/ruby/Gemfile.lock | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/workloads/ruby/Gemfile.lock b/benchmarks/workloads/ruby/Gemfile.lock
index da94aefc4..855edf17f 100644
--- a/benchmarks/workloads/ruby/Gemfile.lock
+++ b/benchmarks/workloads/ruby/Gemfile.lock
@@ -29,7 +29,7 @@ GEM
     prawn (2.2.2)
       pdf-core (~> 0.7.0)
       ttfunk (~> 1.5)
-    puma (3.12.1)
+    puma (3.12.2)
     rack (2.2.2)
     rack-protection (2.0.5)
       rack
-- 
cgit v1.2.3


From c6bdc6b05b4abfcb3c677013496c9a6b1d1365dd Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Thu, 27 Feb 2020 14:14:34 -0800
Subject: Fix a race in TCP endpoint teardown and teardown the stack in
 tcp_test.

Call stack.Close on stacks when we are done with them in tcp_test. This avoids
leaking resources and reduces the test's flakiness when race/gotsan is enabled.
It also provides test coverage for the race also fixed in this change, which
can be reliably triggered with the stack.Close change (and without the other
changes) when race/gotsan is enabled.

The race was possible when calling Abort (via stack.Close) on an endpoint
processing a SYN segment as part of a passive connect.

Updates #1564

PiperOrigin-RevId: 297685432
---
 pkg/tcpip/transport/tcp/accept.go                  |  1 +
 pkg/tcpip/transport/tcp/connect.go                 |  2 +-
 pkg/tcpip/transport/tcp/endpoint.go                | 16 +++++++++++++++-
 pkg/tcpip/transport/tcp/testing/context/context.go |  1 +
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 13e383ffc..85049e54e 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -236,6 +236,7 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 	n.effectiveNetProtos = []tcpip.NetworkProtocolNumber{s.route.NetProto}
 	n.rcvBufSize = int(l.rcvWnd)
 	n.amss = mssForRoute(&n.route)
+	n.setEndpointState(StateConnecting)
 
 	n.maybeEnableTimestamp(rcvdSynOpts)
 	n.maybeEnableSACKPermitted(rcvdSynOpts)
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 7730e6445..cd247f3e1 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -577,7 +577,7 @@ func (h *handshake) execute() *tcpip.Error {
 
 		case wakerForNotification:
 			n := h.ep.fetchNotifications()
-			if n&notifyClose != 0 {
+			if (n&notifyClose)|(n&notifyAbort) != 0 {
 				return tcpip.ErrAborted
 			}
 			if n&notifyDrain != 0 {
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index f1ad19dac..9e72730bd 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -798,7 +798,21 @@ func (e *endpoint) Abort() {
 	// If the endpoint disconnected after the check, nothing needs to be
 	// done, so sending a notification which will potentially be ignored is
 	// fine.
-	if e.EndpointState().connected() {
+	//
+	// If the endpoint connecting finishes after the check, the endpoint
+	// is either in a connected state (where we would notifyAbort anyway),
+	// SYN-RECV (where we would also notifyAbort anyway), or in an error
+	// state where nothing is required and the notification can be safely
+	// ignored.
+	//
+	// Endpoints where a Close during connecting or SYN-RECV state would be
+	// problematic are set to state connecting before being registered (and
+	// thus possible to be Aborted). They are never available in initial
+	// state.
+	//
+	// Endpoints transitioning from initial to connecting state may be
+	// safely either closed or sent notifyAbort.
+	if s := e.EndpointState(); s == StateConnecting || s == StateSynRecv || s.connected() {
 		e.notifyProtocolGoroutine(notifyAbort)
 		return
 	}
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 1e9a0dea3..8cea20fb5 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -204,6 +204,7 @@ func (c *Context) Cleanup() {
 	if c.EP != nil {
 		c.EP.Close()
 	}
+	c.Stack().Close()
 }
 
 // Stack returns a reference to the stack in the Context.
-- 
cgit v1.2.3


From aa9f8abaef5c6250bdcee8fd88b2420f20791c5d Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Thu, 27 Feb 2020 14:51:29 -0800
Subject: Implement automated marshalling for newtypes on arrays.

PiperOrigin-RevId: 297693838
---
 tools/go_marshal/gomarshal/BUILD                   |   3 +
 tools/go_marshal/gomarshal/generator.go            |  17 +-
 tools/go_marshal/gomarshal/generator_interfaces.go | 665 +--------------------
 .../generator_interfaces_array_newtype.go          | 183 ++++++
 .../generator_interfaces_primitive_newtype.go      | 229 +++++++
 .../gomarshal/generator_interfaces_struct.go       | 450 ++++++++++++++
 tools/go_marshal/gomarshal/generator_tests.go      |   2 +-
 tools/go_marshal/gomarshal/util.go                 |  41 +-
 tools/go_marshal/test/test.go                      |   5 +
 9 files changed, 915 insertions(+), 680 deletions(-)
 create mode 100644 tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
 create mode 100644 tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
 create mode 100644 tools/go_marshal/gomarshal/generator_interfaces_struct.go

diff --git a/tools/go_marshal/gomarshal/BUILD b/tools/go_marshal/gomarshal/BUILD
index b5d5a4487..44cb33ae4 100644
--- a/tools/go_marshal/gomarshal/BUILD
+++ b/tools/go_marshal/gomarshal/BUILD
@@ -7,6 +7,9 @@ go_library(
     srcs = [
         "generator.go",
         "generator_interfaces.go",
+        "generator_interfaces_array_newtype.go",
+        "generator_interfaces_primitive_newtype.go",
+        "generator_interfaces_struct.go",
         "generator_tests.go",
         "util.go",
     ],
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index d365a1f3c..729489de5 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -235,6 +235,10 @@ func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) []*a
 				debugfAt(f.Position(t.Pos()), "Collected marshallable newtype on primitive %s.\n", t.Name.Name)
 				types = append(types, t)
 				continue
+			case *ast.ArrayType: // Newtype on array.
+				debugfAt(f.Position(t.Pos()), "Collected marshallable newtype on array %s.\n", t.Name.Name)
+				types = append(types, t)
+				continue
 			}
 			// A user specifically requested marshalling on this type, but we
 			// don't support it.
@@ -281,17 +285,20 @@ func (g *Generator) generateOne(t *ast.TypeSpec, fset *token.FileSet) *interface
 	i := newInterfaceGenerator(t, fset)
 	switch ty := t.Type.(type) {
 	case *ast.StructType:
-		i.validateStruct()
-		i.emitMarshallableForStruct()
-		return i
+		i.validateStruct(t, ty)
+		i.emitMarshallableForStruct(ty)
 	case *ast.Ident:
 		i.validatePrimitiveNewtype(ty)
-		i.emitMarshallableForPrimitiveNewtype()
-		return i
+		i.emitMarshallableForPrimitiveNewtype(ty)
+	case *ast.ArrayType:
+		i.validateArrayNewtype(t.Name, ty)
+		// After validate, we can safely call arrayLen.
+		i.emitMarshallableForArrayNewtype(t.Name, ty.Elt.(*ast.Ident), arrayLen(ty))
 	default:
 		// This should've been filtered out by collectMarshallabeTypes.
 		panic(fmt.Sprintf("Unexpected type %+v", ty))
 	}
+	return i
 }
 
 // generateOneTestSuite generates a test suite for the automatically generated
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
index ea1af998e..8babf61d2 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -15,10 +15,8 @@
 package gomarshal
 
 import (
-	"fmt"
 	"go/ast"
 	"go/token"
-	"strings"
 )
 
 // interfaceGenerator generates marshalling interfaces for a single type.
@@ -81,18 +79,6 @@ func (g *interfaceGenerator) recordPotentiallyNonPackedField(fieldName string) {
 	g.as[fieldName] = struct{}{}
 }
 
-func (g *interfaceGenerator) forEachField(fn func(f *ast.Field)) {
-	// This is guaranteed to succeed because g.t is always a struct.
-	st := g.t.Type.(*ast.StructType)
-	for _, field := range st.Fields.List {
-		fn(field)
-	}
-}
-
-func (g *interfaceGenerator) fieldAccessor(n *ast.Ident) string {
-	return fmt.Sprintf("%s.%s", g.r, n.Name)
-}
-
 // abortAt aborts the go_marshal tool with the given error message, with a
 // reference position to the input source. Same as abortAt, but uses g to
 // resolve p to position.
@@ -100,71 +86,6 @@ func (g *interfaceGenerator) abortAt(p token.Pos, msg string) {
 	abortAt(g.f.Position(p), msg)
 }
 
-func (g *interfaceGenerator) validatePrimitiveNewtype(t *ast.Ident) {
-	switch t.Name {
-	case "int8", "uint8", "byte", "int16", "uint16", "int32", "uint32", "int64", "uint64":
-		// These are the only primitive types we're allow. Below, we provide
-		// suggestions for some disallowed types and reject them, then attempt
-		// to marshal any remaining types by invoking the marshal.Marshallable
-		// interface on them. If these types don't actually implement
-		// marshal.Marshallable, compilation of the generated code will fail
-		// with an appropriate error message.
-		return
-	case "int":
-		g.abortAt(t.Pos(), "Type 'int' has ambiguous width, use int32 or int64")
-	case "uint":
-		g.abortAt(t.Pos(), "Type 'uint' has ambiguous width, use uint32 or uint64")
-	case "string":
-		g.abortAt(t.Pos(), "Type 'string' is dynamically-sized and cannot be marshalled, use a fixed size byte array '[...]byte' instead")
-	default:
-		debugfAt(g.f.Position(t.Pos()), fmt.Sprintf("Found derived type '%s', will attempt dispatch via marshal.Marshallable.\n", t.Name))
-	}
-}
-
-// validateStruct ensures the type we're working with can be marshalled. These
-// checks are done ahead of time and in one place so we can make assumptions
-// later.
-func (g *interfaceGenerator) validateStruct() {
-	g.forEachField(func(f *ast.Field) {
-		if len(f.Names) == 0 {
-			g.abortAt(f.Pos(), "Cannot marshal structs with embedded fields, give the field a name; use '_' for anonymous fields such as padding fields")
-		}
-	})
-
-	g.forEachField(func(f *ast.Field) {
-		fieldDispatcher{
-			primitive: func(_, t *ast.Ident) {
-				g.validatePrimitiveNewtype(t)
-			},
-			selector: func(_, _, _ *ast.Ident) {
-				// No validation to perform on selector fields. However this
-				// callback must still be provided.
-			},
-			array: func(n, _ *ast.Ident, len int) {
-				a := f.Type.(*ast.ArrayType)
-				if a.Len == nil {
-					g.abortAt(f.Pos(), fmt.Sprintf("Dynamically sized slice '%s' cannot be marshalled, arrays must be statically sized", n.Name))
-				}
-
-				if _, ok := a.Len.(*ast.BasicLit); !ok {
-					g.abortAt(a.Len.Pos(), fmt.Sprintf("Array size must be a literal, don's use consts or expressions"))
-				}
-
-				if _, ok := a.Elt.(*ast.Ident); !ok {
-					g.abortAt(a.Elt.Pos(), fmt.Sprintf("Marshalling not supported for arrays with %s elements, array elements must be primitive types", kindString(a.Elt)))
-				}
-
-				if len <= 0 {
-					g.abortAt(a.Len.Pos(), fmt.Sprintf("Marshalling not supported for zero length arrays, why does an ABI struct have one?"))
-				}
-			},
-			unhandled: func(_ *ast.Ident) {
-				g.abortAt(f.Pos(), fmt.Sprintf("Marshalling not supported for %s fields", kindString(f.Type)))
-			},
-		}.dispatch(f)
-	})
-}
-
 // scalarSize returns the size of type identified by t. If t isn't a primitive
 // type, the size isn't known at code generation time, and must be resolved via
 // the marshal.Marshallable interface.
@@ -191,8 +112,8 @@ func (g *interfaceGenerator) shiftDynamic(bufVar, name string) {
 	g.emit("%s = %s[%s.SizeBytes():]\n", bufVar, bufVar, name)
 }
 
-// marshalStructFieldScalar writes a single scalar field from a struct to a byte slice.
-func (g *interfaceGenerator) marshalStructFieldScalar(accessor, typ, bufVar string) {
+// marshalScalar writes a single scalar to a byte slice.
+func (g *interfaceGenerator) marshalScalar(accessor, typ, bufVar string) {
 	switch typ {
 	case "int8", "uint8", "byte":
 		g.emit("%s[0] = byte(%s)\n", bufVar, accessor)
@@ -215,9 +136,8 @@ func (g *interfaceGenerator) marshalStructFieldScalar(accessor, typ, bufVar stri
 	}
 }
 
-// unmarshalStructFieldScalar reads a single scalar field from a struct, from a
-// byte slice.
-func (g *interfaceGenerator) unmarshalStructFieldScalar(accessor, typ, bufVar string) {
+// unmarshalScalar reads a single scalar from a byte slice.
+func (g *interfaceGenerator) unmarshalScalar(accessor, typ, bufVar string) {
 	switch typ {
 	case "byte":
 		g.emit("%s = %s[0]\n", accessor, bufVar)
@@ -243,580 +163,3 @@ func (g *interfaceGenerator) unmarshalStructFieldScalar(accessor, typ, bufVar st
 		g.recordPotentiallyNonPackedField(accessor)
 	}
 }
-
-// marshalPrimitiveScalar writes a single primitive variable to a byte slice.
-func (g *interfaceGenerator) marshalPrimitiveScalar(accessor, typ, bufVar string) {
-	switch typ {
-	case "int8", "uint8", "byte":
-		g.emit("%s[0] = byte(*%s)\n", bufVar, accessor)
-	case "int16", "uint16":
-		g.recordUsedImport("usermem")
-		g.emit("usermem.ByteOrder.PutUint16(%s[:2], uint16(*%s))\n", bufVar, accessor)
-	case "int32", "uint32":
-		g.recordUsedImport("usermem")
-		g.emit("usermem.ByteOrder.PutUint32(%s[:4], uint32(*%s))\n", bufVar, accessor)
-	case "int64", "uint64":
-		g.recordUsedImport("usermem")
-		g.emit("usermem.ByteOrder.PutUint64(%s[:8], uint64(*%s))\n", bufVar, accessor)
-	default:
-		g.emit("inner := (*%s)(%s)\n", typ, accessor)
-		g.emit("inner.MarshalBytes(%s[:%s.SizeBytes()])\n", bufVar, accessor)
-	}
-}
-
-// unmarshalPrimitiveScalar read a single primitive variable from a byte slice.
-func (g *interfaceGenerator) unmarshalPrimitiveScalar(accessor, typ, bufVar, typeCast string) {
-	switch typ {
-	case "byte":
-		g.emit("*%s = %s(%s[0])\n", accessor, typeCast, bufVar)
-	case "int8", "uint8":
-		g.emit("*%s = %s(%s(%s[0]))\n", accessor, typeCast, typ, bufVar)
-	case "int16", "uint16":
-		g.recordUsedImport("usermem")
-		g.emit("*%s = %s(%s(usermem.ByteOrder.Uint16(%s[:2])))\n", accessor, typeCast, typ, bufVar)
-	case "int32", "uint32":
-		g.recordUsedImport("usermem")
-		g.emit("*%s = %s(%s(usermem.ByteOrder.Uint32(%s[:4])))\n", accessor, typeCast, typ, bufVar)
-
-	case "int64", "uint64":
-		g.recordUsedImport("usermem")
-		g.emit("*%s = %s(%s(usermem.ByteOrder.Uint64(%s[:8])))\n", accessor, typeCast, typ, bufVar)
-	default:
-		g.emit("inner := (*%s)(%s)\n", typ, accessor)
-		g.emit("inner.UnmarshalBytes(%s[:%s.SizeBytes()])\n", bufVar, accessor)
-	}
-}
-
-// areFieldsPackedExpression returns a go expression checking whether g.t's fields are
-// packed. Returns "", false if g.t has no fields that may be potentially
-// packed, otherwise returns <clause>, true, where <clause> is an expression
-// like "t.a.Packed() && t.b.Packed() && t.c.Packed()".
-func (g *interfaceGenerator) areFieldsPackedExpression() (string, bool) {
-	if len(g.as) == 0 {
-		return "", false
-	}
-
-	cs := make([]string, 0, len(g.as))
-	for accessor, _ := range g.as {
-		cs = append(cs, fmt.Sprintf("%s.Packed()", accessor))
-	}
-	return strings.Join(cs, " && "), true
-}
-
-func (g *interfaceGenerator) emitMarshallableForStruct() {
-	// Is g.t a packed struct without consideing field types?
-	thisPacked := true
-	g.forEachField(func(f *ast.Field) {
-		if f.Tag != nil {
-			if f.Tag.Value == "`marshal:\"unaligned\"`" {
-				if thisPacked {
-					debugfAt(g.f.Position(g.t.Pos()),
-						fmt.Sprintf("Marking type '%s' as not packed due to tag `marshal:\"unaligned\"`.\n", g.t.Name))
-					thisPacked = false
-				}
-			}
-		}
-	})
-
-	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
-	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		primitiveSize := 0
-		var dynamicSizeTerms []string
-
-		g.forEachField(fieldDispatcher{
-			primitive: func(n, t *ast.Ident) {
-				if size, dynamic := g.scalarSize(t); !dynamic {
-					primitiveSize += size
-				} else {
-					g.recordUsedMarshallable(t.Name)
-					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", t.Name))
-				}
-			},
-			selector: func(n, tX, tSel *ast.Ident) {
-				tName := fmt.Sprintf("%s.%s", tX.Name, tSel.Name)
-				g.recordUsedImport(tX.Name)
-				g.recordUsedMarshallable(tName)
-				dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", tName))
-			},
-			array: func(n, t *ast.Ident, len int) {
-				if len < 1 {
-					// Zero-length arrays should've been rejected by validate().
-					panic("unreachable")
-				}
-				if size, dynamic := g.scalarSize(t); !dynamic {
-					primitiveSize += size * len
-				} else {
-					g.recordUsedMarshallable(t.Name)
-					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()*%d", t.Name, len))
-				}
-			},
-		}.dispatch)
-		g.emit("return %d", primitiveSize)
-		if len(dynamicSizeTerms) > 0 {
-			g.incIndent()
-		}
-		{
-			for _, d := range dynamicSizeTerms {
-				g.emitNoIndent(" +\n")
-				g.emit(d)
-			}
-		}
-		if len(dynamicSizeTerms) > 0 {
-			g.decIndent()
-		}
-	})
-	g.emit("\n}\n\n")
-
-	g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n")
-	g.emit("func (%s *%s) MarshalBytes(dst []byte) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		g.forEachField(fieldDispatcher{
-			primitive: func(n, t *ast.Ident) {
-				if n.Name == "_" {
-					g.emit("// Padding: dst[:sizeof(%s)] ~= %s(0)\n", t.Name, t.Name)
-					if len, dynamic := g.scalarSize(t); !dynamic {
-						g.shift("dst", len)
-					} else {
-						// We can't use shiftDynamic here because we don't have
-						// an instance of the dynamic type we can referece here
-						// (since the version in this struct is anonymous). Use
-						// a typed nil pointer to call SizeBytes() instead.
-						g.emit("dst = dst[(*%s)(nil).SizeBytes():]\n", t.Name)
-					}
-					return
-				}
-				g.marshalStructFieldScalar(g.fieldAccessor(n), t.Name, "dst")
-			},
-			selector: func(n, tX, tSel *ast.Ident) {
-				g.marshalStructFieldScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "dst")
-			},
-			array: func(n, t *ast.Ident, size int) {
-				if n.Name == "_" {
-					g.emit("// Padding: dst[:sizeof(%s)*%d] ~= [%d]%s{0}\n", t.Name, size, size, t.Name)
-					if len, dynamic := g.scalarSize(t); !dynamic {
-						g.shift("dst", len*size)
-					} else {
-						// We can't use shiftDynamic here because we don't have
-						// an instance of the dynamic type we can reference here
-						// (since the version in this struct is anonymous). Use
-						// a typed nil pointer to call SizeBytes() instead.
-						g.emit("dst = dst[(*%s)(nil).SizeBytes()*%d:]\n", t.Name, size)
-					}
-					return
-				}
-
-				g.emit("for idx := 0; idx < %d; idx++ {\n", size)
-				g.inIndent(func() {
-					g.marshalStructFieldScalar(fmt.Sprintf("%s[idx]", g.fieldAccessor(n)), t.Name, "dst")
-				})
-				g.emit("}\n")
-			},
-		}.dispatch)
-	})
-	g.emit("}\n\n")
-
-	g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n")
-	g.emit("func (%s *%s) UnmarshalBytes(src []byte) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		g.forEachField(fieldDispatcher{
-			primitive: func(n, t *ast.Ident) {
-				if n.Name == "_" {
-					g.emit("// Padding: var _ %s ~= src[:sizeof(%s)]\n", t.Name, t.Name)
-					if len, dynamic := g.scalarSize(t); !dynamic {
-						g.shift("src", len)
-					} else {
-						// We can't use shiftDynamic here because we don't have
-						// an instance of the dynamic type we can reference here
-						// (since the version in this struct is anonymous). Use
-						// a typed nil pointer to call SizeBytes() instead.
-						g.emit("src = src[(*%s)(nil).SizeBytes():]\n", t.Name)
-						g.recordPotentiallyNonPackedField(fmt.Sprintf("(*%s)(nil)", t.Name))
-					}
-					return
-				}
-				g.unmarshalStructFieldScalar(g.fieldAccessor(n), t.Name, "src")
-			},
-			selector: func(n, tX, tSel *ast.Ident) {
-				g.unmarshalStructFieldScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "src")
-			},
-			array: func(n, t *ast.Ident, size int) {
-				if n.Name == "_" {
-					g.emit("// Padding: ~ copy([%d]%s(%s), src[:sizeof(%s)*%d])\n", size, t.Name, g.fieldAccessor(n), t.Name, size)
-					if len, dynamic := g.scalarSize(t); !dynamic {
-						g.shift("src", len*size)
-					} else {
-						// We can't use shiftDynamic here because we don't have
-						// an instance of the dynamic type we can referece here
-						// (since the version in this struct is anonymous). Use
-						// a typed nil pointer to call SizeBytes() instead.
-						g.emit("src = src[(*%s)(nil).SizeBytes()*%d:]\n", t.Name, size)
-					}
-					return
-				}
-
-				g.emit("for idx := 0; idx < %d; idx++ {\n", size)
-				g.inIndent(func() {
-					g.unmarshalStructFieldScalar(fmt.Sprintf("%s[idx]", g.fieldAccessor(n)), t.Name, "src")
-				})
-				g.emit("}\n")
-			},
-		}.dispatch)
-	})
-	g.emit("}\n\n")
-
-	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
-	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		expr, fieldsMaybePacked := g.areFieldsPackedExpression()
-		switch {
-		case !thisPacked:
-			g.emit("return false\n")
-		case fieldsMaybePacked:
-			g.emit("return %s\n", expr)
-		default:
-			g.emit("return true\n")
-
-		}
-	})
-	g.emit("}\n\n")
-
-	g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n")
-	g.emit("func (%s *%s) MarshalUnsafe(dst []byte) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		if thisPacked {
-			g.recordUsedImport("safecopy")
-			g.recordUsedImport("unsafe")
-			if cond, ok := g.areFieldsPackedExpression(); ok {
-				g.emit("if %s {\n", cond)
-				g.inIndent(func() {
-					g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
-				})
-				g.emit("} else {\n")
-				g.inIndent(func() {
-					g.emit("%s.MarshalBytes(dst)\n", g.r)
-				})
-				g.emit("}\n")
-			} else {
-				g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
-			}
-		} else {
-			g.emit("// Type %s doesn't have a packed layout in memory, fallback to MarshalBytes.\n", g.typeName())
-			g.emit("%s.MarshalBytes(dst)\n", g.r)
-		}
-	})
-	g.emit("}\n\n")
-
-	g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n")
-	g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		if thisPacked {
-			g.recordUsedImport("safecopy")
-			g.recordUsedImport("unsafe")
-			if cond, ok := g.areFieldsPackedExpression(); ok {
-				g.emit("if %s {\n", cond)
-				g.inIndent(func() {
-					g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
-				})
-				g.emit("} else {\n")
-				g.inIndent(func() {
-					g.emit("%s.UnmarshalBytes(src)\n", g.r)
-				})
-				g.emit("}\n")
-			} else {
-				g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
-			}
-		} else {
-			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
-			g.emit("%s.UnmarshalBytes(src)\n", g.r)
-		}
-	})
-	g.emit("}\n\n")
-
-	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
-	g.recordUsedImport("marshal")
-	g.recordUsedImport("usermem")
-	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		fallback := func() {
-			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
-			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
-			g.emit("%s.MarshalBytes(buf)\n", g.r)
-			g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
-			g.emit("return err\n")
-		}
-		if thisPacked {
-			g.recordUsedImport("reflect")
-			g.recordUsedImport("runtime")
-			g.recordUsedImport("unsafe")
-			if cond, ok := g.areFieldsPackedExpression(); ok {
-				g.emit("if !%s {\n", cond)
-				g.inIndent(fallback)
-				g.emit("}\n\n")
-			}
-			// Fast serialization.
-			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-			g.emit("val := uintptr(ptr)\n")
-			g.emit("val = val^0\n\n")
-
-			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-			g.emit("var buf []byte\n")
-			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-			g.emit("hdr.Data = val\n")
-			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-			g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
-			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-			g.emit("// must live until after the CopyOutBytes.\n")
-			g.emit("runtime.KeepAlive(%s)\n", g.r)
-			g.emit("return err\n")
-		} else {
-			fallback()
-		}
-	})
-	g.emit("}\n\n")
-
-	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
-	g.recordUsedImport("marshal")
-	g.recordUsedImport("usermem")
-	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		fallback := func() {
-			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
-			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
-			g.emit("_, err := task.CopyInBytes(addr, buf)\n")
-			g.emit("if err != nil {\n")
-			g.inIndent(func() {
-				g.emit("return err\n")
-			})
-			g.emit("}\n")
-
-			g.emit("%s.UnmarshalBytes(buf)\n", g.r)
-			g.emit("return nil\n")
-		}
-		if thisPacked {
-			g.recordUsedImport("reflect")
-			g.recordUsedImport("runtime")
-			g.recordUsedImport("unsafe")
-			if cond, ok := g.areFieldsPackedExpression(); ok {
-				g.emit("if !%s {\n", cond)
-				g.inIndent(fallback)
-				g.emit("}\n\n")
-			}
-			// Fast deserialization.
-			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-			g.emit("val := uintptr(ptr)\n")
-			g.emit("val = val^0\n\n")
-
-			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-			g.emit("var buf []byte\n")
-			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-			g.emit("hdr.Data = val\n")
-			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-			g.emit("_, err := task.CopyInBytes(addr, buf)\n")
-			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-			g.emit("// must live until after the CopyInBytes.\n")
-			g.emit("runtime.KeepAlive(%s)\n", g.r)
-			g.emit("return err\n")
-		} else {
-			fallback()
-		}
-	})
-	g.emit("}\n\n")
-
-	g.emit("// WriteTo implements io.WriterTo.WriteTo.\n")
-	g.recordUsedImport("io")
-	g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		fallback := func() {
-			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
-			g.emit("buf := make([]byte, %s.SizeBytes())\n", g.r)
-			g.emit("%s.MarshalBytes(buf)\n", g.r)
-			g.emit("n, err := w.Write(buf)\n")
-			g.emit("return int64(n), err\n")
-		}
-		if thisPacked {
-			g.recordUsedImport("reflect")
-			g.recordUsedImport("runtime")
-			g.recordUsedImport("unsafe")
-			if cond, ok := g.areFieldsPackedExpression(); ok {
-				g.emit("if !%s {\n", cond)
-				g.inIndent(fallback)
-				g.emit("}\n\n")
-			}
-			// Fast serialization.
-			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-			g.emit("val := uintptr(ptr)\n")
-			g.emit("val = val^0\n\n")
-
-			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-			g.emit("var buf []byte\n")
-			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-			g.emit("hdr.Data = val\n")
-			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-			g.emit("len, err := w.Write(buf)\n")
-			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-			g.emit("// must live until after the Write.\n")
-			g.emit("runtime.KeepAlive(%s)\n", g.r)
-			g.emit("return int64(len), err\n")
-		} else {
-			fallback()
-		}
-	})
-	g.emit("}\n\n")
-}
-
-// emitMarshallableForPrimitiveNewtype outputs code to implement the
-// marshal.Marshallable interface for a newtype on a primitive. Primitive
-// newtypes are always packed, so we can omit the various fallbacks required for
-// non-packed structs.
-func (g *interfaceGenerator) emitMarshallableForPrimitiveNewtype() {
-	g.recordUsedImport("io")
-	g.recordUsedImport("marshal")
-	g.recordUsedImport("reflect")
-	g.recordUsedImport("runtime")
-	g.recordUsedImport("safecopy")
-	g.recordUsedImport("unsafe")
-	g.recordUsedImport("usermem")
-
-	nt := g.t.Type.(*ast.Ident)
-
-	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
-	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		if size, dynamic := g.scalarSize(nt); !dynamic {
-			g.emit("return %d\n", size)
-		} else {
-			g.emit("return (*%s)(nil).SizeBytes()\n", nt.Name)
-		}
-	})
-	g.emit("}\n\n")
-
-	g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n")
-	g.emit("func (%s *%s) MarshalBytes(dst []byte) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		g.marshalPrimitiveScalar(g.r, nt.Name, "dst")
-	})
-	g.emit("}\n\n")
-
-	g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n")
-	g.emit("func (%s *%s) UnmarshalBytes(src []byte) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		g.unmarshalPrimitiveScalar(g.r, nt.Name, "src", g.typeName())
-	})
-	g.emit("}\n\n")
-
-	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
-	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		g.emit("// Scalar newtypes are always packed.\n")
-		g.emit("return true\n")
-	})
-	g.emit("}\n\n")
-
-	g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n")
-	g.emit("func (%s *%s) MarshalUnsafe(dst []byte) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
-	})
-	g.emit("}\n\n")
-
-	g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n")
-	g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
-	})
-	g.emit("}\n\n")
-
-	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
-	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		// Fast serialization.
-		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-		g.emit("val := uintptr(ptr)\n")
-		g.emit("val = val^0\n\n")
-
-		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-		g.emit("var buf []byte\n")
-		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-		g.emit("hdr.Data = val\n")
-		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-		g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
-		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-		g.emit("// must live until after the CopyOutBytes.\n")
-		g.emit("runtime.KeepAlive(%s)\n", g.r)
-		g.emit("return err\n")
-	})
-	g.emit("}\n\n")
-
-	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
-	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-		g.emit("val := uintptr(ptr)\n")
-		g.emit("val = val^0\n\n")
-
-		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-		g.emit("var buf []byte\n")
-		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-		g.emit("hdr.Data = val\n")
-		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-		g.emit("_, err := task.CopyInBytes(addr, buf)\n")
-		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-		g.emit("// must live until after the CopyInBytes.\n")
-		g.emit("runtime.KeepAlive(%s)\n", g.r)
-		g.emit("return err\n")
-	})
-	g.emit("}\n\n")
-
-	g.emit("// WriteTo implements io.WriterTo.WriteTo.\n")
-	g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-		g.emit("val := uintptr(ptr)\n")
-		g.emit("val = val^0\n\n")
-
-		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-		g.emit("var buf []byte\n")
-		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-		g.emit("hdr.Data = val\n")
-		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-		g.emit("len, err := w.Write(buf)\n")
-		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-		g.emit("// must live until after the Write.\n")
-		g.emit("runtime.KeepAlive(%s)\n", g.r)
-		g.emit("return int64(len), err\n")
-
-	})
-	g.emit("}\n\n")
-
-}
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
new file mode 100644
index 000000000..da36d9305
--- /dev/null
+++ b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
@@ -0,0 +1,183 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains the bits of the code generator specific to marshalling
+// newtypes on arrays.
+
+package gomarshal
+
+import (
+	"fmt"
+	"go/ast"
+)
+
+func (g *interfaceGenerator) validateArrayNewtype(n *ast.Ident, a *ast.ArrayType) {
+	if a.Len == nil {
+		g.abortAt(a.Pos(), fmt.Sprintf("Dynamically sized slice '%s' cannot be marshalled, arrays must be statically sized", n.Name))
+	}
+
+	if _, ok := a.Len.(*ast.BasicLit); !ok {
+		g.abortAt(a.Len.Pos(), fmt.Sprintf("Array size must be a literal, don't use consts or expressions"))
+	}
+
+	if _, ok := a.Elt.(*ast.Ident); !ok {
+		g.abortAt(a.Elt.Pos(), fmt.Sprintf("Marshalling not supported for arrays with %s elements, array elements must be primitive types", kindString(a.Elt)))
+	}
+
+	if arrayLen(a) <= 0 {
+		g.abortAt(a.Len.Pos(), fmt.Sprintf("Marshalling not supported for zero length arrays, why does an ABI struct have one?"))
+	}
+}
+
+func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n, elt *ast.Ident, len int) {
+	g.recordUsedImport("io")
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("reflect")
+	g.recordUsedImport("runtime")
+	g.recordUsedImport("safecopy")
+	g.recordUsedImport("unsafe")
+	g.recordUsedImport("usermem")
+
+	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
+	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		if size, dynamic := g.scalarSize(elt); !dynamic {
+			g.emit("return %d\n", size*len)
+		} else {
+			g.emit("return (*%s)(nil).SizeBytes() * %d\n", n.Name, len)
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n")
+	g.emit("func (%s *%s) MarshalBytes(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("for idx := 0; idx < %d; idx++ {\n", len)
+		g.inIndent(func() {
+			g.marshalScalar(fmt.Sprintf("%s[idx]", g.r), elt.Name, "dst")
+		})
+		g.emit("}\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n")
+	g.emit("func (%s *%s) UnmarshalBytes(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("for idx := 0; idx < %d; idx++ {\n", len)
+		g.inIndent(func() {
+			g.unmarshalScalar(fmt.Sprintf("%s[idx]", g.r), elt.Name, "src")
+		})
+		g.emit("}\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
+	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("// Array newtypes are always packed.\n")
+		g.emit("return true\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n")
+	g.emit("func (%s *%s) MarshalUnsafe(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n")
+	g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
+	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		// Fast serialization.
+		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+		g.emit("val := uintptr(ptr)\n")
+		g.emit("val = val^0\n\n")
+
+		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+		g.emit("var buf []byte\n")
+		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+		g.emit("hdr.Data = val\n")
+		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+		g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
+		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+		g.emit("// must live until after the CopyOutBytes.\n")
+		g.emit("runtime.KeepAlive(%s)\n", g.r)
+		g.emit("return err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
+	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+		g.emit("val := uintptr(ptr)\n")
+		g.emit("val = val^0\n\n")
+
+		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+		g.emit("var buf []byte\n")
+		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+		g.emit("hdr.Data = val\n")
+		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+		g.emit("_, err := task.CopyInBytes(addr, buf)\n")
+		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+		g.emit("// must live until after the CopyInBytes.\n")
+		g.emit("runtime.KeepAlive(%s)\n", g.r)
+		g.emit("return err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// WriteTo implements io.WriterTo.WriteTo.\n")
+	g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+		g.emit("val := uintptr(ptr)\n")
+		g.emit("val = val^0\n\n")
+
+		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+		g.emit("var buf []byte\n")
+		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+		g.emit("hdr.Data = val\n")
+		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+		g.emit("len, err := w.Write(buf)\n")
+		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+		g.emit("// must live until after the Write.\n")
+		g.emit("runtime.KeepAlive(%s)\n", g.r)
+		g.emit("return int64(len), err\n")
+
+	})
+	g.emit("}\n\n")
+}
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go b/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
new file mode 100644
index 000000000..159397825
--- /dev/null
+++ b/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
@@ -0,0 +1,229 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains the bits of the code generator specific to marshalling
+// newtypes on primitives.
+
+package gomarshal
+
+import (
+	"fmt"
+	"go/ast"
+)
+
+// marshalPrimitiveScalar writes a single primitive variable to a byte
+// slice.
+func (g *interfaceGenerator) marshalPrimitiveScalar(accessor, typ, bufVar string) {
+	switch typ {
+	case "int8", "uint8", "byte":
+		g.emit("%s[0] = byte(*%s)\n", bufVar, accessor)
+	case "int16", "uint16":
+		g.recordUsedImport("usermem")
+		g.emit("usermem.ByteOrder.PutUint16(%s[:2], uint16(*%s))\n", bufVar, accessor)
+	case "int32", "uint32":
+		g.recordUsedImport("usermem")
+		g.emit("usermem.ByteOrder.PutUint32(%s[:4], uint32(*%s))\n", bufVar, accessor)
+	case "int64", "uint64":
+		g.recordUsedImport("usermem")
+		g.emit("usermem.ByteOrder.PutUint64(%s[:8], uint64(*%s))\n", bufVar, accessor)
+	default:
+		g.emit("// Explicilty cast to the underlying type before dispatching to\n")
+		g.emit("// MarshalBytes, so we don't recursively call %s.MarshalBytes\n", accessor)
+		g.emit("inner := (*%s)(%s)\n", typ, accessor)
+		g.emit("inner.MarshalBytes(%s[:%s.SizeBytes()])\n", bufVar, accessor)
+	}
+}
+
+// unmarshalPrimitiveScalar read a single primitive variable from a byte slice.
+func (g *interfaceGenerator) unmarshalPrimitiveScalar(accessor, typ, bufVar, typeCast string) {
+	switch typ {
+	case "byte":
+		g.emit("*%s = %s(%s[0])\n", accessor, typeCast, bufVar)
+	case "int8", "uint8":
+		g.emit("*%s = %s(%s(%s[0]))\n", accessor, typeCast, typ, bufVar)
+	case "int16", "uint16":
+		g.recordUsedImport("usermem")
+		g.emit("*%s = %s(%s(usermem.ByteOrder.Uint16(%s[:2])))\n", accessor, typeCast, typ, bufVar)
+	case "int32", "uint32":
+		g.recordUsedImport("usermem")
+		g.emit("*%s = %s(%s(usermem.ByteOrder.Uint32(%s[:4])))\n", accessor, typeCast, typ, bufVar)
+	case "int64", "uint64":
+		g.recordUsedImport("usermem")
+		g.emit("*%s = %s(%s(usermem.ByteOrder.Uint64(%s[:8])))\n", accessor, typeCast, typ, bufVar)
+	default:
+		g.emit("// Explicilty cast to the underlying type before dispatching to\n")
+		g.emit("// UnmarshalBytes, so we don't recursively call %s.UnmarshalBytes\n", accessor)
+		g.emit("inner := (*%s)(%s)\n", typ, accessor)
+		g.emit("inner.UnmarshalBytes(%s[:%s.SizeBytes()])\n", bufVar, accessor)
+	}
+}
+
+func (g *interfaceGenerator) validatePrimitiveNewtype(t *ast.Ident) {
+	switch t.Name {
+	case "int8", "uint8", "byte", "int16", "uint16", "int32", "uint32", "int64", "uint64":
+		// These are the only primitive types we're allow. Below, we provide
+		// suggestions for some disallowed types and reject them, then attempt
+		// to marshal any remaining types by invoking the marshal.Marshallable
+		// interface on them. If these types don't actually implement
+		// marshal.Marshallable, compilation of the generated code will fail
+		// with an appropriate error message.
+		return
+	case "int":
+		g.abortAt(t.Pos(), "Type 'int' has ambiguous width, use int32 or int64")
+	case "uint":
+		g.abortAt(t.Pos(), "Type 'uint' has ambiguous width, use uint32 or uint64")
+	case "string":
+		g.abortAt(t.Pos(), "Type 'string' is dynamically-sized and cannot be marshalled, use a fixed size byte array '[...]byte' instead")
+	default:
+		debugfAt(g.f.Position(t.Pos()), fmt.Sprintf("Found derived type '%s', will attempt dispatch via marshal.Marshallable.\n", t.Name))
+	}
+}
+
+// emitMarshallableForPrimitiveNewtype outputs code to implement the
+// marshal.Marshallable interface for a newtype on a primitive. Primitive
+// newtypes are always packed, so we can omit the various fallbacks required for
+// non-packed structs.
+func (g *interfaceGenerator) emitMarshallableForPrimitiveNewtype(nt *ast.Ident) {
+	g.recordUsedImport("io")
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("reflect")
+	g.recordUsedImport("runtime")
+	g.recordUsedImport("safecopy")
+	g.recordUsedImport("unsafe")
+	g.recordUsedImport("usermem")
+
+	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
+	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		if size, dynamic := g.scalarSize(nt); !dynamic {
+			g.emit("return %d\n", size)
+		} else {
+			g.emit("return (*%s)(nil).SizeBytes()\n", nt.Name)
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n")
+	g.emit("func (%s *%s) MarshalBytes(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.marshalPrimitiveScalar(g.r, nt.Name, "dst")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n")
+	g.emit("func (%s *%s) UnmarshalBytes(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.unmarshalPrimitiveScalar(g.r, nt.Name, "src", g.typeName())
+	})
+	g.emit("}\n\n")
+
+	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
+	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("// Scalar newtypes are always packed.\n")
+		g.emit("return true\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n")
+	g.emit("func (%s *%s) MarshalUnsafe(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n")
+	g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
+	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		// Fast serialization.
+		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+		g.emit("val := uintptr(ptr)\n")
+		g.emit("val = val^0\n\n")
+
+		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+		g.emit("var buf []byte\n")
+		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+		g.emit("hdr.Data = val\n")
+		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+		g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
+		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+		g.emit("// must live until after the CopyOutBytes.\n")
+		g.emit("runtime.KeepAlive(%s)\n", g.r)
+		g.emit("return err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
+	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+		g.emit("val := uintptr(ptr)\n")
+		g.emit("val = val^0\n\n")
+
+		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+		g.emit("var buf []byte\n")
+		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+		g.emit("hdr.Data = val\n")
+		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+		g.emit("_, err := task.CopyInBytes(addr, buf)\n")
+		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+		g.emit("// must live until after the CopyInBytes.\n")
+		g.emit("runtime.KeepAlive(%s)\n", g.r)
+		g.emit("return err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// WriteTo implements io.WriterTo.WriteTo.\n")
+	g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+		g.emit("val := uintptr(ptr)\n")
+		g.emit("val = val^0\n\n")
+
+		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+		g.emit("var buf []byte\n")
+		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+		g.emit("hdr.Data = val\n")
+		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+		g.emit("len, err := w.Write(buf)\n")
+		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+		g.emit("// must live until after the Write.\n")
+		g.emit("runtime.KeepAlive(%s)\n", g.r)
+		g.emit("return int64(len), err\n")
+
+	})
+	g.emit("}\n\n")
+}
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_struct.go b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
new file mode 100644
index 000000000..e66a38b2e
--- /dev/null
+++ b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
@@ -0,0 +1,450 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains the bits of the code generator specific to marshalling
+// structs.
+
+package gomarshal
+
+import (
+	"fmt"
+	"go/ast"
+	"strings"
+)
+
+func (g *interfaceGenerator) fieldAccessor(n *ast.Ident) string {
+	return fmt.Sprintf("%s.%s", g.r, n.Name)
+}
+
+// areFieldsPackedExpression returns a go expression checking whether g.t's fields are
+// packed. Returns "", false if g.t has no fields that may be potentially
+// packed, otherwise returns <clause>, true, where <clause> is an expression
+// like "t.a.Packed() && t.b.Packed() && t.c.Packed()".
+func (g *interfaceGenerator) areFieldsPackedExpression() (string, bool) {
+	if len(g.as) == 0 {
+		return "", false
+	}
+
+	cs := make([]string, 0, len(g.as))
+	for accessor, _ := range g.as {
+		cs = append(cs, fmt.Sprintf("%s.Packed()", accessor))
+	}
+	return strings.Join(cs, " && "), true
+}
+
+// validateStruct ensures the type we're working with can be marshalled. These
+// checks are done ahead of time and in one place so we can make assumptions
+// later.
+func (g *interfaceGenerator) validateStruct(ts *ast.TypeSpec, st *ast.StructType) {
+	forEachStructField(st, func(f *ast.Field) {
+		if len(f.Names) == 0 {
+			g.abortAt(f.Pos(), "Cannot marshal structs with embedded fields, give the field a name; use '_' for anonymous fields such as padding fields")
+		}
+	})
+
+	forEachStructField(st, func(f *ast.Field) {
+		fieldDispatcher{
+			primitive: func(_, t *ast.Ident) {
+				g.validatePrimitiveNewtype(t)
+			},
+			selector: func(_, _, _ *ast.Ident) {
+				// No validation to perform on selector fields. However this
+				// callback must still be provided.
+			},
+			array: func(n, _ *ast.Ident, len int) {
+				g.validateArrayNewtype(n, f.Type.(*ast.ArrayType))
+			},
+			unhandled: func(_ *ast.Ident) {
+				g.abortAt(f.Pos(), fmt.Sprintf("Marshalling not supported for %s fields", kindString(f.Type)))
+			},
+		}.dispatch(f)
+	})
+}
+
+func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
+	// Is g.t a packed struct without consideing field types?
+	thisPacked := true
+	forEachStructField(st, func(f *ast.Field) {
+		if f.Tag != nil {
+			if f.Tag.Value == "`marshal:\"unaligned\"`" {
+				if thisPacked {
+					debugfAt(g.f.Position(g.t.Pos()),
+						fmt.Sprintf("Marking type '%s' as not packed due to tag `marshal:\"unaligned\"`.\n", g.t.Name))
+					thisPacked = false
+				}
+			}
+		}
+	})
+
+	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
+	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		primitiveSize := 0
+		var dynamicSizeTerms []string
+
+		forEachStructField(st, fieldDispatcher{
+			primitive: func(n, t *ast.Ident) {
+				if size, dynamic := g.scalarSize(t); !dynamic {
+					primitiveSize += size
+				} else {
+					g.recordUsedMarshallable(t.Name)
+					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", t.Name))
+				}
+			},
+			selector: func(n, tX, tSel *ast.Ident) {
+				tName := fmt.Sprintf("%s.%s", tX.Name, tSel.Name)
+				g.recordUsedImport(tX.Name)
+				g.recordUsedMarshallable(tName)
+				dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", tName))
+			},
+			array: func(n, t *ast.Ident, len int) {
+				if len < 1 {
+					// Zero-length arrays should've been rejected by validate().
+					panic("unreachable")
+				}
+				if size, dynamic := g.scalarSize(t); !dynamic {
+					primitiveSize += size * len
+				} else {
+					g.recordUsedMarshallable(t.Name)
+					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()*%d", t.Name, len))
+				}
+			},
+		}.dispatch)
+		g.emit("return %d", primitiveSize)
+		if len(dynamicSizeTerms) > 0 {
+			g.incIndent()
+		}
+		{
+			for _, d := range dynamicSizeTerms {
+				g.emitNoIndent(" +\n")
+				g.emit(d)
+			}
+		}
+		if len(dynamicSizeTerms) > 0 {
+			g.decIndent()
+		}
+	})
+	g.emit("\n}\n\n")
+
+	g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n")
+	g.emit("func (%s *%s) MarshalBytes(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		forEachStructField(st, fieldDispatcher{
+			primitive: func(n, t *ast.Ident) {
+				if n.Name == "_" {
+					g.emit("// Padding: dst[:sizeof(%s)] ~= %s(0)\n", t.Name, t.Name)
+					if len, dynamic := g.scalarSize(t); !dynamic {
+						g.shift("dst", len)
+					} else {
+						// We can't use shiftDynamic here because we don't have
+						// an instance of the dynamic type we can referece here
+						// (since the version in this struct is anonymous). Use
+						// a typed nil pointer to call SizeBytes() instead.
+						g.emit("dst = dst[(*%s)(nil).SizeBytes():]\n", t.Name)
+					}
+					return
+				}
+				g.marshalScalar(g.fieldAccessor(n), t.Name, "dst")
+			},
+			selector: func(n, tX, tSel *ast.Ident) {
+				g.marshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "dst")
+			},
+			array: func(n, t *ast.Ident, size int) {
+				if n.Name == "_" {
+					g.emit("// Padding: dst[:sizeof(%s)*%d] ~= [%d]%s{0}\n", t.Name, size, size, t.Name)
+					if len, dynamic := g.scalarSize(t); !dynamic {
+						g.shift("dst", len*size)
+					} else {
+						// We can't use shiftDynamic here because we don't have
+						// an instance of the dynamic type we can reference here
+						// (since the version in this struct is anonymous). Use
+						// a typed nil pointer to call SizeBytes() instead.
+						g.emit("dst = dst[(*%s)(nil).SizeBytes()*%d:]\n", t.Name, size)
+					}
+					return
+				}
+
+				g.emit("for idx := 0; idx < %d; idx++ {\n", size)
+				g.inIndent(func() {
+					g.marshalScalar(fmt.Sprintf("%s[idx]", g.fieldAccessor(n)), t.Name, "dst")
+				})
+				g.emit("}\n")
+			},
+		}.dispatch)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n")
+	g.emit("func (%s *%s) UnmarshalBytes(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		forEachStructField(st, fieldDispatcher{
+			primitive: func(n, t *ast.Ident) {
+				if n.Name == "_" {
+					g.emit("// Padding: var _ %s ~= src[:sizeof(%s)]\n", t.Name, t.Name)
+					if len, dynamic := g.scalarSize(t); !dynamic {
+						g.shift("src", len)
+					} else {
+						// We can't use shiftDynamic here because we don't have
+						// an instance of the dynamic type we can reference here
+						// (since the version in this struct is anonymous). Use
+						// a typed nil pointer to call SizeBytes() instead.
+						g.emit("src = src[(*%s)(nil).SizeBytes():]\n", t.Name)
+						g.recordPotentiallyNonPackedField(fmt.Sprintf("(*%s)(nil)", t.Name))
+					}
+					return
+				}
+				g.unmarshalScalar(g.fieldAccessor(n), t.Name, "src")
+			},
+			selector: func(n, tX, tSel *ast.Ident) {
+				g.unmarshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "src")
+			},
+			array: func(n, t *ast.Ident, size int) {
+				if n.Name == "_" {
+					g.emit("// Padding: ~ copy([%d]%s(%s), src[:sizeof(%s)*%d])\n", size, t.Name, g.fieldAccessor(n), t.Name, size)
+					if len, dynamic := g.scalarSize(t); !dynamic {
+						g.shift("src", len*size)
+					} else {
+						// We can't use shiftDynamic here because we don't have
+						// an instance of the dynamic type we can referece here
+						// (since the version in this struct is anonymous). Use
+						// a typed nil pointer to call SizeBytes() instead.
+						g.emit("src = src[(*%s)(nil).SizeBytes()*%d:]\n", t.Name, size)
+					}
+					return
+				}
+
+				g.emit("for idx := 0; idx < %d; idx++ {\n", size)
+				g.inIndent(func() {
+					g.unmarshalScalar(fmt.Sprintf("%s[idx]", g.fieldAccessor(n)), t.Name, "src")
+				})
+				g.emit("}\n")
+			},
+		}.dispatch)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
+	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		expr, fieldsMaybePacked := g.areFieldsPackedExpression()
+		switch {
+		case !thisPacked:
+			g.emit("return false\n")
+		case fieldsMaybePacked:
+			g.emit("return %s\n", expr)
+		default:
+			g.emit("return true\n")
+
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n")
+	g.emit("func (%s *%s) MarshalUnsafe(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		if thisPacked {
+			g.recordUsedImport("safecopy")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if %s {\n", cond)
+				g.inIndent(func() {
+					g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
+				})
+				g.emit("} else {\n")
+				g.inIndent(func() {
+					g.emit("%s.MarshalBytes(dst)\n", g.r)
+				})
+				g.emit("}\n")
+			} else {
+				g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
+			}
+		} else {
+			g.emit("// Type %s doesn't have a packed layout in memory, fallback to MarshalBytes.\n", g.typeName())
+			g.emit("%s.MarshalBytes(dst)\n", g.r)
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n")
+	g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		if thisPacked {
+			g.recordUsedImport("safecopy")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if %s {\n", cond)
+				g.inIndent(func() {
+					g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
+				})
+				g.emit("} else {\n")
+				g.inIndent(func() {
+					g.emit("%s.UnmarshalBytes(src)\n", g.r)
+				})
+				g.emit("}\n")
+			} else {
+				g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
+			}
+		} else {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
+			g.emit("%s.UnmarshalBytes(src)\n", g.r)
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("usermem")
+	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
+			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
+			g.emit("%s.MarshalBytes(buf)\n", g.r)
+			g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
+			g.emit("return err\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !%s {\n", cond)
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			// Fast serialization.
+			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+			g.emit("val := uintptr(ptr)\n")
+			g.emit("val = val^0\n\n")
+
+			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+			g.emit("var buf []byte\n")
+			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+			g.emit("hdr.Data = val\n")
+			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+			g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
+			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+			g.emit("// must live until after the CopyOutBytes.\n")
+			g.emit("runtime.KeepAlive(%s)\n", g.r)
+			g.emit("return err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("usermem")
+	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
+			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
+			g.emit("_, err := task.CopyInBytes(addr, buf)\n")
+			g.emit("if err != nil {\n")
+			g.inIndent(func() {
+				g.emit("return err\n")
+			})
+			g.emit("}\n")
+
+			g.emit("%s.UnmarshalBytes(buf)\n", g.r)
+			g.emit("return nil\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !%s {\n", cond)
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			// Fast deserialization.
+			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+			g.emit("val := uintptr(ptr)\n")
+			g.emit("val = val^0\n\n")
+
+			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+			g.emit("var buf []byte\n")
+			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+			g.emit("hdr.Data = val\n")
+			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+			g.emit("_, err := task.CopyInBytes(addr, buf)\n")
+			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+			g.emit("// must live until after the CopyInBytes.\n")
+			g.emit("runtime.KeepAlive(%s)\n", g.r)
+			g.emit("return err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// WriteTo implements io.WriterTo.WriteTo.\n")
+	g.recordUsedImport("io")
+	g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
+			g.emit("buf := make([]byte, %s.SizeBytes())\n", g.r)
+			g.emit("%s.MarshalBytes(buf)\n", g.r)
+			g.emit("n, err := w.Write(buf)\n")
+			g.emit("return int64(n), err\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !%s {\n", cond)
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			// Fast serialization.
+			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
+			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
+			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
+			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
+			g.emit("val := uintptr(ptr)\n")
+			g.emit("val = val^0\n\n")
+
+			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
+			g.emit("var buf []byte\n")
+			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
+			g.emit("hdr.Data = val\n")
+			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
+			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+
+			g.emit("len, err := w.Write(buf)\n")
+			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
+			g.emit("// must live until after the Write.\n")
+			g.emit("runtime.KeepAlive(%s)\n", g.r)
+			g.emit("return int64(len), err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+}
diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go
index 8ba47eb67..fd992e44a 100644
--- a/tools/go_marshal/gomarshal/generator_tests.go
+++ b/tools/go_marshal/gomarshal/generator_tests.go
@@ -164,7 +164,7 @@ func (g *testGenerator) emitTestSizeBytesOnTypedNilPtr() {
 
 		g.emit("if sizeFromTypedNilPtr != sizeFromConcrete {\n")
 		g.inIndent(func() {
-			g.emit("t.Fatalf(\"SizeBytes() on typed nil pointer (%v) doesn't match size returned by a concrete object (%v).\\n\", sizeFromTypedNilPtr, sizeFromConcrete)")
+			g.emit("t.Fatalf(\"SizeBytes() on typed nil pointer (%v) doesn't match size returned by a concrete object (%v).\\n\", sizeFromTypedNilPtr, sizeFromConcrete)\n")
 		})
 		g.emit("}\n")
 	})
diff --git a/tools/go_marshal/gomarshal/util.go b/tools/go_marshal/gomarshal/util.go
index e2bca4e7c..a0936e013 100644
--- a/tools/go_marshal/gomarshal/util.go
+++ b/tools/go_marshal/gomarshal/util.go
@@ -64,6 +64,12 @@ func kindString(e ast.Expr) string {
 	}
 }
 
+func forEachStructField(st *ast.StructType, fn func(f *ast.Field)) {
+	for _, field := range st.Fields.List {
+		fn(field)
+	}
+}
+
 // fieldDispatcher is a collection of callbacks for handling different types of
 // fields in a struct declaration.
 type fieldDispatcher struct {
@@ -73,6 +79,25 @@ type fieldDispatcher struct {
 	unhandled func(n *ast.Ident)
 }
 
+// Precondition: a must have a literal for the array length. Consts and
+// expressions are not allowed as array lengths, and should be rejected by the
+// caller.
+func arrayLen(a *ast.ArrayType) int {
+	if a.Len == nil {
+		// Probably a slice? Must be handled by caller.
+		panic("Nil array length in array type")
+	}
+	lenLit, ok := a.Len.(*ast.BasicLit)
+	if !ok {
+		panic("Array has non-literal for length")
+	}
+	len, err := strconv.Atoi(lenLit.Value)
+	if err != nil {
+		panic(fmt.Sprintf("Failed to parse array length '%s' as number: %v", lenLit.Value, err))
+	}
+	return len
+}
+
 // Precondition: All dispatch callbacks that will be invoked must be
 // provided. Embedded fields are not allowed, len(f.Names) >= 1.
 func (fd fieldDispatcher) dispatch(f *ast.Field) {
@@ -96,22 +121,12 @@ func (fd fieldDispatcher) dispatch(f *ast.Field) {
 		case *ast.SelectorExpr:
 			fd.selector(name, v.X.(*ast.Ident), v.Sel)
 		case *ast.ArrayType:
-			len := 0
-			if v.Len != nil {
-				// Non-literal array length is handled by generatorInterfaces.validate().
-				if lenLit, ok := v.Len.(*ast.BasicLit); ok {
-					var err error
-					len, err = strconv.Atoi(lenLit.Value)
-					if err != nil {
-						panic(err)
-					}
-				}
-			}
 			switch t := v.Elt.(type) {
 			case *ast.Ident:
-				fd.array(name, t, len)
+				fd.array(name, t, arrayLen(v))
 			default:
-				fd.array(name, nil, len)
+				// Should be handled with a better error message during validate.
+				panic(fmt.Sprintf("Array element type is of unsupported kind. Expected *ast.Ident, got %v", t))
 			}
 		default:
 			fd.unhandled(name)
diff --git a/tools/go_marshal/test/test.go b/tools/go_marshal/test/test.go
index 93229dedb..c829db6da 100644
--- a/tools/go_marshal/test/test.go
+++ b/tools/go_marshal/test/test.go
@@ -104,6 +104,11 @@ type Stat struct {
 	_       [3]int64
 }
 
+// InetAddr is an example marshallable newtype on an array.
+//
+// +marshal
+type InetAddr [4]byte
+
 // SignalSet is an example marshallable newtype on a primitive.
 //
 // +marshal
-- 
cgit v1.2.3


From 88f73699225bd50102bbacb6f78052338f205cdd Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 27 Feb 2020 14:58:16 -0800
Subject: Log oom_score_adj value on error

Updates #1873

PiperOrigin-RevId: 297695241
---
 runsc/container/container.go | 39 ++++++++++++++++-----------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/runsc/container/container.go b/runsc/container/container.go
index 68782c4be..c9839044c 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -17,6 +17,7 @@ package container
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"io/ioutil"
 	"os"
@@ -1066,18 +1067,10 @@ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
 
 // adjustGoferOOMScoreAdj sets the oom_store_adj for the container's gofer.
 func (c *Container) adjustGoferOOMScoreAdj() error {
-	if c.GoferPid != 0 && c.Spec.Process.OOMScoreAdj != nil {
-		if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil {
-			// Ignore NotExist error because it can be returned when the sandbox
-			// exited while OOM score was being adjusted.
-			if !os.IsNotExist(err) {
-				return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
-			}
-			log.Warningf("Gofer process (%d) not found setting oom_score_adj", c.GoferPid)
-		}
+	if c.GoferPid == 0 || c.Spec.Process.OOMScoreAdj == nil {
+		return nil
 	}
-
-	return nil
+	return setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj)
 }
 
 // adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox.
@@ -1154,29 +1147,29 @@ func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool)
 	}
 
 	// Set the lowest of all containers oom_score_adj to the sandbox.
-	if err := setOOMScoreAdj(s.Pid, lowScore); err != nil {
-		// Ignore NotExist error because it can be returned when the sandbox
-		// exited while OOM score was being adjusted.
-		if !os.IsNotExist(err) {
-			return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", s.ID, err)
-		}
-		log.Warningf("Sandbox process (%d) not found setting oom_score_adj", s.Pid)
-	}
-
-	return nil
+	return setOOMScoreAdj(s.Pid, lowScore)
 }
 
 // setOOMScoreAdj sets oom_score_adj to the given value for the given PID.
 // /proc must be available and mounted read-write. scoreAdj should be between
-// -1000 and 1000.
+// -1000 and 1000. It's a noop if the process has already exited.
 func setOOMScoreAdj(pid int, scoreAdj int) error {
 	f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644)
 	if err != nil {
+		// Ignore NotExist errors because it can race with process exit.
+		if os.IsNotExist(err) {
+			log.Warningf("Process (%d) not found setting oom_score_adj", pid)
+			return nil
+		}
 		return err
 	}
 	defer f.Close()
 	if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil {
-		return err
+		if errors.Is(err, syscall.ESRCH) {
+			log.Warningf("Process (%d) exited while setting oom_score_adj", pid)
+			return nil
+		}
+		return fmt.Errorf("setting oom_score_adj to %q: %v", scoreAdj, err)
 	}
 	return nil
 }
-- 
cgit v1.2.3


From c96bb4d2ebc6a24b3111d986c5d40574ec8ff660 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 27 Feb 2020 15:35:19 -0800
Subject: Fix apt-get reliability issues.

This is frequently causing the core build scripts to fail. The core ubuntu
distribution will perform an auto-update at first start, which may cause the
lock file to be held. All apt-get commands may be done in a loop in order to
retry to avoid this issue. We may want to consider retrying other pieces, but
for now this should avoid the most frequent cause of build flakes.

PiperOrigin-RevId: 297704789
---
 scripts/build.sh                         |  2 +-
 scripts/common.sh                        | 14 ++++++++++++++
 tools/images/ubuntu1604/10_core.sh       | 15 ++++++++++++++-
 tools/images/ubuntu1604/20_bazel.sh      | 12 +++++++++++-
 tools/images/ubuntu1604/25_docker.sh     | 33 +++++++++++++++++++++++++-------
 tools/images/ubuntu1604/30_containerd.sh | 12 +++++++++++-
 tools/images/ubuntu1604/40_kokoro.sh     | 17 +++++++++++++++-
 tools/installers/master.sh               |  7 +++----
 8 files changed, 96 insertions(+), 16 deletions(-)

diff --git a/scripts/build.sh b/scripts/build.sh
index 4c042af6c..7c9c99800 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -17,7 +17,7 @@
 source $(dirname $0)/common.sh
 
 # Install required packages for make_repository.sh et al.
-sudo apt-get update && sudo apt-get install -y dpkg-sig coreutils apt-utils xz-utils
+apt_install dpkg-sig coreutils apt-utils xz-utils
 
 # Build runsc.
 runsc=$(build -c opt //runsc)
diff --git a/scripts/common.sh b/scripts/common.sh
index 3ca699e4a..735a383de 100755
--- a/scripts/common.sh
+++ b/scripts/common.sh
@@ -84,3 +84,17 @@ function install_runsc() {
   # Restart docker to pick up the new runtime configuration.
   sudo systemctl restart docker
 }
+
+# Installs the given packages. Note that the package names should be verified to
+# be correct, otherwise this may result in a loop that spins until time out.
+function apt_install() {
+  while true; do
+    if (sudo apt-get update && sudo apt-get install -y "$@"); then
+      break
+    fi
+    result=$?
+    if [[ $result -ne 100 ]]; then
+      return $result
+    fi
+  done
+}
diff --git a/tools/images/ubuntu1604/10_core.sh b/tools/images/ubuntu1604/10_core.sh
index 46dda6bb1..cd518d6ac 100755
--- a/tools/images/ubuntu1604/10_core.sh
+++ b/tools/images/ubuntu1604/10_core.sh
@@ -17,7 +17,20 @@
 set -xeo pipefail
 
 # Install all essential build tools.
-apt-get update && apt-get -y install make git-core build-essential linux-headers-$(uname -r) pkg-config
+while true; do
+  if (apt-get update && apt-get install -y \
+      make \
+      git-core \
+      build-essential \
+      linux-headers-$(uname -r) \
+      pkg-config); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
 
 # Install a recent go toolchain.
 if ! [[ -d /usr/local/go ]]; then
diff --git a/tools/images/ubuntu1604/20_bazel.sh b/tools/images/ubuntu1604/20_bazel.sh
index b33e1656c..bb7afa676 100755
--- a/tools/images/ubuntu1604/20_bazel.sh
+++ b/tools/images/ubuntu1604/20_bazel.sh
@@ -19,7 +19,17 @@ set -xeo pipefail
 declare -r BAZEL_VERSION=2.0.0
 
 # Install bazel dependencies.
-apt-get update && apt-get install -y openjdk-8-jdk-headless unzip
+while true; do
+  if (apt-get update && apt-get install -y \
+      openjdk-8-jdk-headless \
+      unzip); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
 
 # Use the release installer.
 curl -L -o bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
diff --git a/tools/images/ubuntu1604/25_docker.sh b/tools/images/ubuntu1604/25_docker.sh
index 1d3defcd3..11eea2d72 100755
--- a/tools/images/ubuntu1604/25_docker.sh
+++ b/tools/images/ubuntu1604/25_docker.sh
@@ -15,12 +15,20 @@
 # limitations under the License.
 
 # Add dependencies.
-apt-get update && apt-get -y install \
-    apt-transport-https \
-    ca-certificates \
-    curl \
-    gnupg-agent \
-    software-properties-common
+while true; do
+  if (apt-get update && apt-get install -y \
+      apt-transport-https \
+      ca-certificates \
+      curl \
+      gnupg-agent \
+      software-properties-common); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
 
 # Install the key.
 curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
@@ -32,4 +40,15 @@ add-apt-repository \
    stable"
 
 # Install docker.
-apt-get update && apt-get install -y docker-ce docker-ce-cli containerd.io
+while true; do
+  if (apt-get update && apt-get install -y \
+      docker-ce \
+      docker-ce-cli \
+      containerd.io); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
diff --git a/tools/images/ubuntu1604/30_containerd.sh b/tools/images/ubuntu1604/30_containerd.sh
index a7472bd1c..fb3699c12 100755
--- a/tools/images/ubuntu1604/30_containerd.sh
+++ b/tools/images/ubuntu1604/30_containerd.sh
@@ -34,7 +34,17 @@ install_helper() {
 }
 
 # Install dependencies for the crictl tests.
-apt-get install -y btrfs-tools libseccomp-dev
+while true; do
+  if (apt-get update && apt-get install -y \
+      btrfs-tools \
+      libseccomp-dev); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
 
 # Install containerd & cri-tools.
 GOPATH=$(mktemp -d --tmpdir gopathXXXXX)
diff --git a/tools/images/ubuntu1604/40_kokoro.sh b/tools/images/ubuntu1604/40_kokoro.sh
index 5f2dfc858..06a1e6c48 100755
--- a/tools/images/ubuntu1604/40_kokoro.sh
+++ b/tools/images/ubuntu1604/40_kokoro.sh
@@ -23,7 +23,22 @@ declare -r ssh_public_keys=(
 )
 
 # Install dependencies.
-apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm python-pip python3-pip zip
+while true; do
+  if (apt-get update && apt-get install -y \
+      rsync \
+      coreutils \
+      python-psutil \
+      qemu-kvm \
+      python-pip \
+      python3-pip \
+      zip); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
 
 # junitparser is used to merge junit xml files.
 pip install junitparser
diff --git a/tools/installers/master.sh b/tools/installers/master.sh
index 52f9734a6..2c6001c6c 100755
--- a/tools/installers/master.sh
+++ b/tools/installers/master.sh
@@ -19,17 +19,16 @@ set -e
 
 curl -fsSL https://gvisor.dev/archive.key | sudo apt-key add -
 add-apt-repository "deb https://storage.googleapis.com/gvisor/releases release main"
+
 while true; do
-  if apt-get update; then
-    apt-get install -y runsc
+  if (apt-get update && apt-get install -y runsc); then
     break
   fi
   result=$?
-  # Check if apt update failed to aquire the file lock.
   if [[ $result -ne 100 ]]; then
     exit $result
   fi
 done
+
 runsc install
 service docker restart
-
-- 
cgit v1.2.3


From dd1ed5c789ff72fd6bbacda0ff7c7acf9672d25a Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Fri, 28 Feb 2020 14:47:34 +0800
Subject: skip vsyscall test cases on Arm64

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 test/syscalls/linux/time.cc     | 2 ++
 test/syscalls/linux/vsyscall.cc | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc
index 1ccb95733..e75bba669 100644
--- a/test/syscalls/linux/time.cc
+++ b/test/syscalls/linux/time.cc
@@ -26,6 +26,7 @@ namespace {
 
 constexpr long kFudgeSeconds = 5;
 
+#if defined(__x86_64__) || defined(__i386__)
 // Mimics the time(2) wrapper from glibc prior to 2.15.
 time_t vsyscall_time(time_t* t) {
   constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
@@ -98,6 +99,7 @@ TEST(TimeTest, VsyscallGettimeofday_InvalidAddressSIGSEGV) {
                                     reinterpret_cast<struct timezone*>(0x1)),
               ::testing::KilledBySignal(SIGSEGV), "");
 }
+#endif
 
 }  // namespace
 
diff --git a/test/syscalls/linux/vsyscall.cc b/test/syscalls/linux/vsyscall.cc
index 2c2303358..ae4377108 100644
--- a/test/syscalls/linux/vsyscall.cc
+++ b/test/syscalls/linux/vsyscall.cc
@@ -24,6 +24,7 @@ namespace testing {
 
 namespace {
 
+#if defined(__x86_64__) || defined(__i386__)
 time_t vsyscall_time(time_t* t) {
   constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
   return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
@@ -37,6 +38,7 @@ TEST(VsyscallTest, VsyscallAlwaysAvailableOnGvisor) {
   time_t t;
   EXPECT_THAT(vsyscall_time(&t), SyscallSucceeds());
 }
+#endif
 
 }  // namespace
 
-- 
cgit v1.2.3


From af6fab651406c411ef848c360b017713de385880 Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Fri, 28 Feb 2020 10:00:38 -0800
Subject: Add nat table support for iptables.

- Fix review comments.
---
 pkg/sentry/socket/netfilter/netfilter.go | 14 +++++---
 pkg/tcpip/iptables/targets.go            | 55 ++++++++++++++++++++++++--------
 pkg/tcpip/iptables/types.go              |  2 +-
 3 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index f68a2260d..c8a9e67b8 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -292,9 +292,9 @@ func marshalRedirectTarget() []byte {
 		},
 	}
 	copy(target.Target.Name[:], redirectTargetName)
-  
+
 	ret := make([]byte, 0, linux.SizeOfXTRedirectTarget)
-  return binary.Marshal(ret, usermem.ByteOrder, target)
+	return binary.Marshal(ret, usermem.ByteOrder, target)
 }
 
 func marshalJumpTarget(jt JumpTarget) []byte {
@@ -670,15 +670,21 @@ func parseTarget(filter iptables.IPHeaderFilter, optVal []byte) (iptables.Target
 
 		// TODO(gvisor.dev/issue/170): Check if the flags are valid.
 		// Also check if we need to map ports or IP.
-		// For now, redirect target only supports dest port change.
+		// For now, redirect target only supports destination port change.
+		// Port range and IP range are not supported yet.
 		if nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED == 0 {
 			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument.")
 		}
-		target.Flags = nfRange.RangeIPV4.Flags
+		target.RangeProtoSpecified = true
 
 		target.MinIP = tcpip.Address(nfRange.RangeIPV4.MinIP[:])
 		target.MaxIP = tcpip.Address(nfRange.RangeIPV4.MaxIP[:])
 
+		// TODO(gvisor.dev/issue/170): Port range is not supported yet.
+		if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort {
+			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument.")
+		}
+
 		// Convert port from big endian to little endian.
 		port := make([]byte, 2)
 		binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MinPort)
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 96318118c..ae5af7c53 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -69,8 +69,11 @@ func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
 // Min and Max values for IP and Ports in the struct indicate the range of
 // values which can be used to redirect.
 type RedirectTarget struct {
-	// Flags to check if the redirect is for address or ports.
-	Flags uint32
+	// TODO(gvisor.dev/issue/170): Other flags need to be aded after
+	// we support them.
+	// RangeProtoSpecified flag indicates single port is specified to
+	// redirect.
+	RangeProtoSpecified bool
 
 	// Min address used to redirect.
 	MinIP tcpip.Address
@@ -86,30 +89,56 @@ type RedirectTarget struct {
 }
 
 // Action implements Target.Action.
+// TODO(gvisor.dev/issue/170): Parse headers without copying. The current
+// implementation only works for PREROUTING and calls pkt.Clone(), neither
+// of which should be the case.
 func (rt RedirectTarget) Action(pkt tcpip.PacketBuffer) (RuleVerdict, int) {
-	headerView := pkt.Data.First()
+	newPkt := pkt.Clone()
 
-	// Network header should be set.
+	// Set network header.
+	headerView := newPkt.Data.First()
 	netHeader := header.IPv4(headerView)
-	if netHeader == nil {
-		return RuleDrop, 0
-	}
+	newPkt.NetworkHeader = headerView[:netHeader.HeaderLength()]
 
-	// TODO(gvisor.dev/issue/170): Check Flags in RedirectTarget if
-	// we need to change dest address (for OUTPUT chain) or ports.
 	hlen := int(netHeader.HeaderLength())
+	tlen := int(netHeader.TotalLength())
+	newPkt.Data.TrimFront(hlen)
+	newPkt.Data.CapLength(tlen - hlen)
 
+	// TODO(gvisor.dev/issue/170): Change destination address to
+	// loopback or interface address on which the packet was
+	// received.
+
+	// TODO(gvisor.dev/issue/170): Check Flags in RedirectTarget if
+	// we need to change dest address (for OUTPUT chain) or ports.
 	switch protocol := netHeader.TransportProtocol(); protocol {
 	case header.UDPProtocolNumber:
-		udp := header.UDP(headerView[hlen:])
-		udp.SetDestinationPort(rt.MinPort)
+		var udpHeader header.UDP
+		if newPkt.TransportHeader != nil {
+			udpHeader = header.UDP(newPkt.TransportHeader)
+		} else {
+			if len(pkt.Data.First()) < header.UDPMinimumSize {
+				return RuleDrop, 0
+			}
+			udpHeader = header.UDP(newPkt.Data.First())
+		}
+		udpHeader.SetDestinationPort(rt.MinPort)
 	case header.TCPProtocolNumber:
+		var tcpHeader header.TCP
+		if newPkt.TransportHeader != nil {
+			tcpHeader = header.TCP(newPkt.TransportHeader)
+		} else {
+			if len(pkt.Data.First()) < header.TCPMinimumSize {
+				return RuleDrop, 0
+			}
+			tcpHeader = header.TCP(newPkt.TransportHeader)
+		}
 		// TODO(gvisor.dev/issue/170): Need to recompute checksum
 		// and implement nat connection tracking to support TCP.
-		tcp := header.TCP(headerView[hlen:])
-		tcp.SetDestinationPort(rt.MinPort)
+		tcpHeader.SetDestinationPort(rt.MinPort)
 	default:
 		return RuleDrop, 0
 	}
+
 	return RuleAccept, 0
 }
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 9c2ad2d46..7d032fd23 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -163,6 +163,6 @@ type Matcher interface {
 type Target interface {
 	// Action takes an action on the packet and returns a verdict on how
 	// traversal should (or should not) continue. If the return value is
-	// Jump, it also returns the name of the chain to jump to.
+	// Jump, it also returns the index of the rule to jump to.
 	Action(packet tcpip.PacketBuffer) (RuleVerdict, int)
 }
-- 
cgit v1.2.3


From 0f8a9e362337ee684042331c5bf24a3cb43d6fc4 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 28 Feb 2020 10:13:59 -0800
Subject: Change dup2 call to dup3

We changed syscalls to allow dup3 for ARM64.

Updates #1198

PiperOrigin-RevId: 297870816
---
 pkg/sentry/fsimpl/gofer/gofer.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index d00850e25..c4a8f0b38 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -1045,13 +1045,13 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 				// using the old file descriptor, preventing us from safely
 				// closing it. We could handle this by invalidating existing
 				// memmap.Translations, but this is expensive. Instead, use
-				// dup2() to make the old file descriptor refer to the new file
+				// dup3 to make the old file descriptor refer to the new file
 				// description, then close the new file descriptor (which is no
 				// longer needed). Racing callers may use the old or new file
 				// description, but this doesn't matter since they refer to the
 				// same file (unless d.fs.opts.overlayfsStaleRead is true,
 				// which we handle separately).
-				if err := syscall.Dup2(int(h.fd), int(d.handle.fd)); err != nil {
+				if err := syscall.Dup3(int(h.fd), int(d.handle.fd), 0); err != nil {
 					d.handleMu.Unlock()
 					ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err)
 					h.close(ctx)
-- 
cgit v1.2.3


From e5d9a4010bdbea10320348b022ee5b761c1eba07 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 11 Feb 2020 16:01:42 -0800
Subject: Add ability to execute go.mod in gopath context.

---
 CONTRIBUTING.md |  3 +++
 WORKSPACE       | 41 ++++++++++++++++++++++++++++++-----------
 go.mod          | 31 ++++++++++++++-----------------
 go.sum          | 29 +++++++++++++++++++----------
 tools/go_mod.sh | 29 +++++++++++++++++++++++++++++
 5 files changed, 95 insertions(+), 38 deletions(-)
 create mode 100755 tools/go_mod.sh

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 71650a4b8..ad8e710da 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -32,6 +32,9 @@ will need to be added to the appropriate `BUILD` files, and the `:gopath` target
 will need to be re-run to generate appropriate symlinks in the `GOPATH`
 directory tree.
 
+Dependencies can be added by using `go mod get`. In order to keep the
+`WORKSPACE` file in sync, run `tools/go_mod.sh` in place of `go mod`.
+
 ### Coding Guidelines
 
 All Go code should conform to the [Go style guidelines][gostyle]. C++ code
diff --git a/WORKSPACE b/WORKSPACE
index a15238a2e..995d2c7f1 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -20,7 +20,7 @@ http_archive(
     ],
 )
 
-load("@io_bazel_rules_go//go:deps.bzl", "go_rules_dependencies", "go_register_toolchains")
+load("@io_bazel_rules_go//go:deps.bzl", "go_register_toolchains", "go_rules_dependencies")
 
 go_rules_dependencies()
 
@@ -43,8 +43,8 @@ gazelle_dependencies()
 go_repository(
     name = "org_golang_x_sys",
     importpath = "golang.org/x/sys",
-    sum = "h1:72l8qCJ1nGxMGH26QVBVIxKd/D34cfGt0OvrPtpemyY=",
-    version = "v0.0.0-20191220220014-0732a990476f",
+    sum = "h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU=",
+    version = "v0.0.0-20190215142949-d0b11bdaac8a",
 )
 
 # Load C++ rules.
@@ -68,8 +68,11 @@ http_archive(
         "https://github.com/bazelbuild/rules_proto/archive/97d8af4dc474595af3900dd85cb3a29ad28cc313.tar.gz",
     ],
 )
+
 load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains")
+
 rules_proto_dependencies()
+
 rules_proto_toolchains()
 
 # Load python dependencies.
@@ -146,9 +149,9 @@ load(
 # This container is built from the Dockerfile in test/iptables/runner.
 container_pull(
     name = "iptables-test",
+    digest = "sha256:a137d692a2eb9fc7bf95c5f4a568da090e2c31098e93634421ed88f3a3f1db65",
     registry = "gcr.io",
     repository = "gvisor-presubmit/iptables-test",
-    digest = "sha256:a137d692a2eb9fc7bf95c5f4a568da090e2c31098e93634421ed88f3a3f1db65",
 )
 
 load(
@@ -201,6 +204,13 @@ go_repository(
     version = "v0.0.0-20171129191014-dec09d789f3d",
 )
 
+go_repository(
+    name = "com_github_kr_pretty",
+    importpath = "github.com/kr/pretty",
+    sum = "h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs=",
+    version = "v0.2.0",
+)
+
 go_repository(
     name = "com_github_kr_pty",
     importpath = "github.com/kr/pty",
@@ -208,6 +218,13 @@ go_repository(
     version = "v1.1.1",
 )
 
+go_repository(
+    name = "com_github_kr_text",
+    importpath = "github.com/kr/text",
+    sum = "h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=",
+    version = "v0.1.0",
+)
+
 go_repository(
     name = "com_github_opencontainers_runtime-spec",
     importpath = "github.com/opencontainers/runtime-spec",
@@ -236,6 +253,13 @@ go_repository(
     version = "v0.0.0-20171111001504-be1fbeda1936",
 )
 
+go_repository(
+    name = "in_gopkg_check_v1",
+    importpath = "gopkg.in/check.v1",
+    sum = "h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=",
+    version = "v1.0.0-20190902080502-41f04d3bba15",
+)
+
 go_repository(
     name = "org_golang_x_crypto",
     importpath = "golang.org/x/crypto",
@@ -257,12 +281,6 @@ go_repository(
     version = "v0.3.0",
 )
 
-go_repository(
-    name = "org_golang_x_tools",
-    commit = "36563e24a262",
-    importpath = "golang.org/x/tools",
-)
-
 go_repository(
     name = "org_golang_x_sync",
     importpath = "golang.org/x/sync",
@@ -272,8 +290,9 @@ go_repository(
 
 go_repository(
     name = "org_golang_x_time",
-    commit = "c4c64cad1fd0a1a8dab2523e04e61d35308e131e",
     importpath = "golang.org/x/time",
+    sum = "h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs=",
+    version = "v0.0.0-20191024005414-555d28b269f0",
 )
 
 go_repository(
diff --git a/go.mod b/go.mod
index c4687ed02..3a8b9288d 100644
--- a/go.mod
+++ b/go.mod
@@ -3,21 +3,18 @@ module gvisor.dev/gvisor
 go 1.13
 
 require (
-  github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
-  github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
-  github.com/golang/mock v1.3.1
-  github.com/golang/protobuf v1.3.1
-  github.com/google/btree v1.0.0
-  github.com/google/go-cmp v0.2.0
-  github.com/google/go-github/v28 v28.1.1
-  github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
-  github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
-  github.com/kr/pty v1.1.1
-  github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
-  github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
-  github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
-  github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
-  golang.org/x/net v0.0.0-20190311183353-d8887717615a
-  golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6
-  golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
+	github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
+	github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
+	github.com/golang/protobuf v1.3.1
+	github.com/google/btree v1.0.0
+	github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
+	github.com/kr/pretty v0.2.0 // indirect
+	github.com/kr/pty v1.1.1
+	github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
+	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
+	github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
+	github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 // indirect
+	golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
+	golang.org/x/time v0.0.0-20191024005414-555d28b269f0
+	gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
 )
diff --git a/go.sum b/go.sum
index 434770beb..f16a549fd 100644
--- a/go.sum
+++ b/go.sum
@@ -1,21 +1,30 @@
+github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422 h1:+FKjzBIdfBHYDvxCv+djmDJdes/AoDtg8gpcxowBlF8=
 github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422/go.mod h1:b6Nc7NRH5C4aCISLry0tLnTjcuTEvoiqcWDdsU0sOGM=
+github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079 h1:JFTFz3HZTGmgMz4E1TabNBNJljROSYgja1b4l50FNVs=
 github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079/go.mod h1:F1TvTiK9OcQqauNUHlbJvyl9Qa1QvF/gOUDKA14jxHU=
-github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y=
+github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
 github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo=
 github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
-github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
-github.com/google/go-github/v28 v28.1.1/go.mod h1:bsqJWQX05omyWVmc00nEUql9mhQyv38lDZ8kPZcQVoM=
+github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8 h1:GZGUPQiZfYrd9uOqyqwbQcHPkz/EZJVkZB1MkaO9UBI=
 github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk=
-github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs=
+github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
+github.com/kr/pty v1.1.1 h1:VkoXIwSboBpnk99O/KFauAEILuNHv5DVFKZMBN/gUgw=
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
+github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78 h1:d9F+LNYwMyi3BDN4GzZdaSiq4otb8duVEWyZjeUtOQI=
 github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
+github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2 h1:b6uOv7YOFK0TYG7HtkIgExQo+2RdLuwRft63jn2HWj8=
 github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
+github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e h1:/Tdc23Arz1OtdIsBY2utWepGRQ9fEAJlhkdoLzWMK8Q=
 github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e/go.mod h1:+SR5DhBJrl6ZM7CoCKvpw5BKroDKQ+PJqOg65H/2ktk=
+github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 h1:J9gO8RJCAFlln1jsvRba/CWVUnMHwObklfxxjErl1uk=
 github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936/go.mod h1:ZjcWmFBXmLKZu9Nxj3WKYEafiSqer2rnvPr0en9UNpI=
-golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
-golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
-golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
-golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
+golang.org/x/time v0.0.0-20191024005414-555d28b269f0 h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs=
+golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
+gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
+gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
diff --git a/tools/go_mod.sh b/tools/go_mod.sh
new file mode 100755
index 000000000..84b779d6d
--- /dev/null
+++ b/tools/go_mod.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -eo pipefail
+
+# Build the :gopath target.
+bazel build //:gopath
+declare -r gopathdir="bazel-bin/gopath/src/gvisor.dev/gvisor/"
+
+# Copy go.mod and execute the command.
+cp -a go.mod go.sum "${gopathdir}"
+(cd "${gopathdir}" && go mod "$@")
+cp -a "${gopathdir}/go.mod" "${gopathdir}/go.sum" .
+
+# Cleanup the WORKSPACE file.
+bazel run //:gazelle -- update-repos -from_file=go.mod
-- 
cgit v1.2.3


From 6b4d36e3253238dd72d0861ac1220d147e1de8dd Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 28 Feb 2020 10:37:52 -0800
Subject: Hide /dev/net/tun when using hostinet.

/dev/net/tun does not currently work with hostinet. This has caused some
program starts failing because it thinks the feature exists.

PiperOrigin-RevId: 297876196
---
 pkg/sentry/fs/dev/BUILD                |  1 +
 pkg/sentry/fs/dev/dev.go               |  7 +++++--
 pkg/sentry/fs/dev/net_tun.go           |  7 +++++++
 pkg/sentry/kernel/kernel.go            |  4 ++++
 test/syscalls/BUILD                    |  5 +++++
 test/syscalls/linux/BUILD              | 12 +++++++++++
 test/syscalls/linux/dev.cc             |  7 -------
 test/syscalls/linux/tuntap.cc          |  7 +++++++
 test/syscalls/linux/tuntap_hostinet.cc | 37 ++++++++++++++++++++++++++++++++++
 9 files changed, 78 insertions(+), 9 deletions(-)
 create mode 100644 test/syscalls/linux/tuntap_hostinet.cc

diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 9b6bb26d0..9379a4d7b 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -26,6 +26,7 @@ go_library(
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/ramfs",
         "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go
index 7e66c29b0..acbd401a0 100644
--- a/pkg/sentry/fs/dev/dev.go
+++ b/pkg/sentry/fs/dev/dev.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -124,10 +125,12 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		"ptmx": newSymlink(ctx, "pts/ptmx", msrc),
 
 		"tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor),
+	}
 
-		"net": newDirectory(ctx, map[string]*fs.Inode{
+	if isNetTunSupported(inet.StackFromContext(ctx)) {
+		contents["net"] = newDirectory(ctx, map[string]*fs.Inode{
 			"tun": newCharacterDevice(ctx, newNetTunDevice(ctx, fs.RootOwner, 0666), msrc, netTunDevMajor, netTunDevMinor),
-		}, msrc),
+		}, msrc)
 	}
 
 	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
index 755644488..dc7ad075a 100644
--- a/pkg/sentry/fs/dev/net_tun.go
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -168,3 +169,9 @@ func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.Eve
 func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) {
 	fops.device.EventUnregister(e)
 }
+
+// isNetTunSupported returns whether /dev/net/tun device is supported for s.
+func isNetTunSupported(s inet.Stack) bool {
+	_, ok := s.(*netstack.Stack)
+	return ok
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 8b76750e9..1d627564f 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -755,6 +755,8 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
 		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
@@ -1481,6 +1483,8 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
 		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 3518e862d..a69b0ce13 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -680,6 +680,11 @@ syscall_test(
 
 syscall_test(test = "//test/syscalls/linux:tuntap_test")
 
+syscall_test(
+    add_hostinet = True,
+    test = "//test/syscalls/linux:tuntap_hostinet_test",
+)
+
 syscall_test(test = "//test/syscalls/linux:udp_bind_test")
 
 syscall_test(
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 704bae17b..70c120e42 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3460,6 +3460,18 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "tuntap_hostinet_test",
+    testonly = 1,
+    srcs = ["tuntap_hostinet.cc"],
+    linkstatic = 1,
+    deps = [
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
 cc_library(
     name = "udp_socket_test_cases",
     testonly = 1,
diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc
index 4e473268c..4dd302eed 100644
--- a/test/syscalls/linux/dev.cc
+++ b/test/syscalls/linux/dev.cc
@@ -153,13 +153,6 @@ TEST(DevTest, TTYExists) {
   EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
 }
 
-TEST(DevTest, NetTunExists) {
-  struct stat statbuf = {};
-  ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallSucceeds());
-  // Check that it's a character device with rw-rw-rw- permissions.
-  EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
-}
-
 }  // namespace
 }  // namespace testing
 
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
index f6ac9d7b8..f734511d6 100644
--- a/test/syscalls/linux/tuntap.cc
+++ b/test/syscalls/linux/tuntap.cc
@@ -153,6 +153,13 @@ std::string CreateArpPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
 
 }  // namespace
 
+TEST(TuntapStaticTest, NetTunExists) {
+  struct stat statbuf;
+  ASSERT_THAT(stat(kDevNetTun, &statbuf), SyscallSucceeds());
+  // Check that it's a character device with rw-rw-rw- permissions.
+  EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
+}
+
 class TuntapTest : public ::testing::Test {
  protected:
   void TearDown() override {
diff --git a/test/syscalls/linux/tuntap_hostinet.cc b/test/syscalls/linux/tuntap_hostinet.cc
new file mode 100644
index 000000000..0c527419e
--- /dev/null
+++ b/test/syscalls/linux/tuntap_hostinet.cc
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(TuntapHostInetTest, NoNetTun) {
+  SKIP_IF(!IsRunningOnGvisor());
+
+  struct stat statbuf;
+  ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallFailsWithErrno(ENOENT));
+}
+
+}  // namespace
+}  // namespace testing
+
+}  // namespace gvisor
-- 
cgit v1.2.3


From 322dbfe06bfc3949b7b3a7e7add695c41213ddec Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 28 Feb 2020 11:23:00 -0800
Subject: Allow to specify a separate log for GO's runtime messages

GO's runtime calls the write system call twice to print "panic:"
and "the reason of this panic", so here is a race window when
other threads can print something to the log and we will see
something like this:

panic: log messages from another thread
The reason of the panic.

This confuses the syzkaller blacklist and dedup detection.

It also makes the logs generally difficult to read. e.g.,
data races often have one side of the race, followed by
a large "diagnosis" dump, finally followed by the other
side of the race.

PiperOrigin-RevId: 297887895
---
 runsc/boot/config.go     |  4 ++++
 runsc/main.go            | 37 +++++++++++++++++++++++--------------
 runsc/sandbox/sandbox.go | 18 ++++++++++++++++++
 3 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 35391030f..7ea5bfade 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -158,6 +158,9 @@ type Config struct {
 	// DebugLog is the path to log debug information to, if not empty.
 	DebugLog string
 
+	// PanicLog is the path to log GO's runtime messages, if not empty.
+	PanicLog string
+
 	// DebugLogFormat is the log format for debug.
 	DebugLogFormat string
 
@@ -269,6 +272,7 @@ func (c *Config) ToFlags() []string {
 		"--log=" + c.LogFilename,
 		"--log-format=" + c.LogFormat,
 		"--debug-log=" + c.DebugLog,
+		"--panic-log=" + c.PanicLog,
 		"--debug-log-format=" + c.DebugLogFormat,
 		"--file-access=" + c.FileAccess.String(),
 		"--overlay=" + strconv.FormatBool(c.Overlay),
diff --git a/runsc/main.go b/runsc/main.go
index af73bed97..62e184ec9 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -54,9 +54,11 @@ var (
 
 	// Debugging flags.
 	debugLog        = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
+	panicLog        = flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.")
 	logPackets      = flag.Bool("log-packets", false, "enable network packet logging.")
 	logFD           = flag.Int("log-fd", -1, "file descriptor to log to.  If set, the 'log' flag is ignored.")
 	debugLogFD      = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to.  If set, the 'debug-log-dir' flag is ignored.")
+	panicLogFD      = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.")
 	debugLogFormat  = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
 	alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr.")
 
@@ -206,6 +208,7 @@ func main() {
 		LogFilename:        *logFilename,
 		LogFormat:          *logFormat,
 		DebugLog:           *debugLog,
+		PanicLog:           *panicLog,
 		DebugLogFormat:     *debugLogFormat,
 		FileAccess:         fsAccess,
 		FSGoferHostUDS:     *fsGoferHostUDS,
@@ -258,20 +261,6 @@ func main() {
 	if *debugLogFD > -1 {
 		f := os.NewFile(uintptr(*debugLogFD), "debug log file")
 
-		// Quick sanity check to make sure no other commands get passed
-		// a log fd (they should use log dir instead).
-		if subcommand != "boot" && subcommand != "gofer" {
-			cmd.Fatalf("flag --debug-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand)
-		}
-
-		// If we are the boot process, then we own our stdio FDs and can do what we
-		// want with them. Since Docker and Containerd both eat boot's stderr, we
-		// dup our stderr to the provided log FD so that panics will appear in the
-		// logs, rather than just disappear.
-		if err := syscall.Dup3(int(f.Fd()), int(os.Stderr.Fd()), 0); err != nil {
-			cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err)
-		}
-
 		e = newEmitter(*debugLogFormat, f)
 
 	} else if *debugLog != "" {
@@ -287,6 +276,26 @@ func main() {
 		e = newEmitter("text", ioutil.Discard)
 	}
 
+	if *panicLogFD > -1 || *debugLogFD > -1 {
+		fd := *panicLogFD
+		if fd < 0 {
+			fd = *debugLogFD
+		}
+		// Quick sanity check to make sure no other commands get passed
+		// a log fd (they should use log dir instead).
+		if subcommand != "boot" && subcommand != "gofer" {
+			cmd.Fatalf("flags --debug-log-fd and --panic-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand)
+		}
+
+		// If we are the boot process, then we own our stdio FDs and can do what we
+		// want with them. Since Docker and Containerd both eat boot's stderr, we
+		// dup our stderr to the provided log FD so that panics will appear in the
+		// logs, rather than just disappear.
+		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil {
+			cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err)
+		}
+	}
+
 	if *alsoLogToStderr {
 		e = &log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
 	}
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index ec72bdbfd..67e27df4d 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -369,6 +369,24 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		cmd.Args = append(cmd.Args, "--debug-log-fd="+strconv.Itoa(nextFD))
 		nextFD++
 	}
+	if conf.PanicLog != "" {
+		test := ""
+		if len(conf.TestOnlyTestNameEnv) != 0 {
+			// Fetch test name if one is provided and the test only flag was set.
+			if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
+				test = t
+			}
+		}
+
+		panicLogFile, err := specutils.DebugLogFile(conf.PanicLog, "panic", test)
+		if err != nil {
+			return fmt.Errorf("opening debug log file in %q: %v", conf.PanicLog, err)
+		}
+		defer panicLogFile.Close()
+		cmd.ExtraFiles = append(cmd.ExtraFiles, panicLogFile)
+		cmd.Args = append(cmd.Args, "--panic-log-fd="+strconv.Itoa(nextFD))
+		nextFD++
+	}
 
 	cmd.Args = append(cmd.Args, "--panic-signal="+strconv.Itoa(int(syscall.SIGTERM)))
 
-- 
cgit v1.2.3


From 463f4217d109ded8af758fe51a5daf8670da9794 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 28 Feb 2020 12:28:10 -0800
Subject: Make pipe buffer implementation standard.

A follow-up change will convert the networking code to use this standard
pipe implementation.

PiperOrigin-RevId: 297903206
---
 pkg/buffer/BUILD                      |  39 ++++
 pkg/buffer/buffer.go                  |  67 ++++++
 pkg/buffer/safemem.go                 | 131 ++++++++++++
 pkg/buffer/view.go                    | 382 ++++++++++++++++++++++++++++++++++
 pkg/buffer/view_test.go               | 233 +++++++++++++++++++++
 pkg/buffer/view_unsafe.go             |  25 +++
 pkg/sentry/kernel/pipe/BUILD          |  18 +-
 pkg/sentry/kernel/pipe/buffer.go      | 115 ----------
 pkg/sentry/kernel/pipe/buffer_test.go |  32 ---
 pkg/sentry/kernel/pipe/pipe.go        | 118 ++---------
 pkg/sentry/kernel/pipe/pipe_util.go   |  25 +--
 11 files changed, 912 insertions(+), 273 deletions(-)
 create mode 100644 pkg/buffer/BUILD
 create mode 100644 pkg/buffer/buffer.go
 create mode 100644 pkg/buffer/safemem.go
 create mode 100644 pkg/buffer/view.go
 create mode 100644 pkg/buffer/view_test.go
 create mode 100644 pkg/buffer/view_unsafe.go
 delete mode 100644 pkg/sentry/kernel/pipe/buffer.go
 delete mode 100644 pkg/sentry/kernel/pipe/buffer_test.go

diff --git a/pkg/buffer/BUILD b/pkg/buffer/BUILD
new file mode 100644
index 000000000..a77a3beea
--- /dev/null
+++ b/pkg/buffer/BUILD
@@ -0,0 +1,39 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "buffer_list",
+    out = "buffer_list.go",
+    package = "buffer",
+    prefix = "buffer",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*Buffer",
+        "Linker": "*Buffer",
+    },
+)
+
+go_library(
+    name = "buffer",
+    srcs = [
+        "buffer.go",
+        "buffer_list.go",
+        "safemem.go",
+        "view.go",
+        "view_unsafe.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/log",
+        "//pkg/safemem",
+    ],
+)
+
+go_test(
+    name = "buffer_test",
+    size = "small",
+    srcs = ["view_test.go"],
+    library = ":buffer",
+)
diff --git a/pkg/buffer/buffer.go b/pkg/buffer/buffer.go
new file mode 100644
index 000000000..d5f64609b
--- /dev/null
+++ b/pkg/buffer/buffer.go
@@ -0,0 +1,67 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package buffer provides the implementation of a buffer view.
+package buffer
+
+import (
+	"sync"
+)
+
+const bufferSize = 8144 // See below.
+
+// Buffer encapsulates a queueable byte buffer.
+//
+// Note that the total size is slightly less than two pages. This is done
+// intentionally to ensure that the buffer object aligns with runtime
+// internals. We have no hard size or alignment requirements. This two page
+// size will effectively minimize internal fragmentation, but still have a
+// large enough chunk to limit excessive segmentation.
+//
+// +stateify savable
+type Buffer struct {
+	data  [bufferSize]byte
+	read  int
+	write int
+	bufferEntry
+}
+
+// Reset resets internal data.
+//
+// This must be called before use.
+func (b *Buffer) Reset() {
+	b.read = 0
+	b.write = 0
+}
+
+// Empty indicates the buffer is empty.
+//
+// This indicates there is no data left to read.
+func (b *Buffer) Empty() bool {
+	return b.read == b.write
+}
+
+// Full indicates the buffer is full.
+//
+// This indicates there is no capacity left to write.
+func (b *Buffer) Full() bool {
+	return b.write == len(b.data)
+}
+
+// bufferPool is a pool for buffers.
+var bufferPool = sync.Pool{
+	New: func() interface{} {
+		return new(Buffer)
+	},
+}
diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go
new file mode 100644
index 000000000..071aaa488
--- /dev/null
+++ b/pkg/buffer/safemem.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"io"
+
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// WriteBlock returns this buffer as a write Block.
+func (b *Buffer) WriteBlock() safemem.Block {
+	return safemem.BlockFromSafeSlice(b.data[b.write:])
+}
+
+// ReadBlock returns this buffer as a read Block.
+func (b *Buffer) ReadBlock() safemem.Block {
+	return safemem.BlockFromSafeSlice(b.data[b.read:b.write])
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+//
+// This will advance the write index.
+func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	need := int(srcs.NumBytes())
+	if need == 0 {
+		return 0, nil
+	}
+
+	var (
+		dst    safemem.BlockSeq
+		blocks []safemem.Block
+	)
+
+	// Need at least one buffer.
+	firstBuf := v.data.Back()
+	if firstBuf == nil {
+		firstBuf = bufferPool.Get().(*Buffer)
+		v.data.PushBack(firstBuf)
+	}
+
+	// Does the last block have sufficient capacity alone?
+	if l := len(firstBuf.data) - firstBuf.write; l >= need {
+		dst = safemem.BlockSeqOf(firstBuf.WriteBlock())
+	} else {
+		// Append blocks until sufficient.
+		need -= l
+		blocks = append(blocks, firstBuf.WriteBlock())
+		for need > 0 {
+			emptyBuf := bufferPool.Get().(*Buffer)
+			v.data.PushBack(emptyBuf)
+			need -= len(emptyBuf.data) // Full block.
+			blocks = append(blocks, emptyBuf.WriteBlock())
+		}
+		dst = safemem.BlockSeqFromSlice(blocks)
+	}
+
+	// Perform the copy.
+	n, err := safemem.CopySeq(dst, srcs)
+	v.size += int64(n)
+
+	// Update all indices.
+	for left := int(n); left > 0; firstBuf = firstBuf.Next() {
+		if l := len(firstBuf.data) - firstBuf.write; left >= l {
+			firstBuf.write += l // Whole block.
+			left -= l
+		} else {
+			firstBuf.write += left // Partial block.
+			left = 0
+		}
+	}
+
+	return n, err
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+//
+// This will not advance the read index; the caller should follow
+// this call with a call to TrimFront in order to remove the read
+// data from the buffer. This is done to support pipe sematics.
+func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	need := int(dsts.NumBytes())
+	if need == 0 {
+		return 0, nil
+	}
+
+	var (
+		src    safemem.BlockSeq
+		blocks []safemem.Block
+	)
+
+	firstBuf := v.data.Front()
+	if firstBuf == nil {
+		return 0, io.EOF
+	}
+
+	// Is all the data in a single block?
+	if l := firstBuf.write - firstBuf.read; l >= need {
+		src = safemem.BlockSeqOf(firstBuf.ReadBlock())
+	} else {
+		// Build a list of all the buffers.
+		need -= l
+		blocks = append(blocks, firstBuf.ReadBlock())
+		for buf := firstBuf.Next(); buf != nil && need > 0; buf = buf.Next() {
+			need -= buf.write - buf.read
+			blocks = append(blocks, buf.ReadBlock())
+		}
+		src = safemem.BlockSeqFromSlice(blocks)
+	}
+
+	// Perform the copy.
+	n, err := safemem.CopySeq(dsts, src)
+
+	// See above: we would normally advance the read index here, but we
+	// don't do that in order to support pipe semantics. We rely on a
+	// separate call to TrimFront() in this case.
+
+	return n, err
+}
diff --git a/pkg/buffer/view.go b/pkg/buffer/view.go
new file mode 100644
index 000000000..00fc11e9c
--- /dev/null
+++ b/pkg/buffer/view.go
@@ -0,0 +1,382 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"fmt"
+	"io"
+)
+
+// View is a non-linear buffer.
+//
+// All methods are thread compatible.
+//
+// +stateify savable
+type View struct {
+	data bufferList
+	size int64
+}
+
+// TrimFront removes the first count bytes from the buffer.
+func (v *View) TrimFront(count int64) {
+	if count >= v.size {
+		v.advanceRead(v.size)
+	} else {
+		v.advanceRead(count)
+	}
+}
+
+// Read implements io.Reader.Read.
+//
+// Note that reading does not advance the read index. This must be done
+// manually using TrimFront or other methods.
+func (v *View) Read(p []byte) (int, error) {
+	return v.ReadAt(p, 0)
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (v *View) ReadAt(p []byte, offset int64) (int, error) {
+	var (
+		skipped int64
+		done    int64
+	)
+	for buf := v.data.Front(); buf != nil && done < int64(len(p)); buf = buf.Next() {
+		needToSkip := int(offset - skipped)
+		if l := buf.write - buf.read; l <= needToSkip {
+			skipped += int64(l)
+			continue
+		}
+
+		// Actually read data.
+		n := copy(p[done:], buf.data[buf.read+needToSkip:buf.write])
+		skipped += int64(needToSkip)
+		done += int64(n)
+	}
+	if int(done) < len(p) {
+		return int(done), io.EOF
+	}
+	return int(done), nil
+}
+
+// Write implements io.Writer.Write.
+func (v *View) Write(p []byte) (int, error) {
+	v.Append(p) // Does not fail.
+	return len(p), nil
+}
+
+// advanceRead advances the view's read index.
+//
+// Precondition: there must be sufficient bytes in the buffer.
+func (v *View) advanceRead(count int64) {
+	for buf := v.data.Front(); buf != nil && count > 0; {
+		l := int64(buf.write - buf.read)
+		if l > count {
+			// There is still data for reading.
+			buf.read += int(count)
+			v.size -= count
+			count = 0
+			break
+		}
+
+		// Read from this buffer.
+		buf.read += int(l)
+		count -= l
+		v.size -= l
+
+		// When all data has been read from a buffer, we push
+		// it into the empty buffer pool for reuse.
+		oldBuf := buf
+		buf = buf.Next() // Iterate.
+		v.data.Remove(oldBuf)
+		oldBuf.Reset()
+		bufferPool.Put(oldBuf)
+	}
+	if count > 0 {
+		panic(fmt.Sprintf("advanceRead still has %d bytes remaining", count))
+	}
+}
+
+// Truncate truncates the view to the given bytes.
+func (v *View) Truncate(length int64) {
+	if length < 0 || length >= v.size {
+		return // Nothing to do.
+	}
+	for buf := v.data.Back(); buf != nil && v.size > length; buf = v.data.Back() {
+		l := int64(buf.write - buf.read) // Local bytes.
+		switch {
+		case v.size-l >= length:
+			// Drop the buffer completely; see above.
+			v.data.Remove(buf)
+			v.size -= l
+			buf.Reset()
+			bufferPool.Put(buf)
+
+		case v.size > length && v.size-l < length:
+			// Just truncate the buffer locally.
+			delta := (length - (v.size - l))
+			buf.write = buf.read + int(delta)
+			v.size = length
+
+		default:
+			// Should never happen.
+			panic("invalid buffer during truncation")
+		}
+	}
+	v.size = length // Save the new size.
+}
+
+// Grow grows the given view to the number of bytes. If zero
+// is true, all these bytes will be zero. If zero is false,
+// then this is the caller's responsibility.
+//
+// Precondition: length must be >= 0.
+func (v *View) Grow(length int64, zero bool) {
+	if length < 0 {
+		panic("negative length provided")
+	}
+	for v.size < length {
+		buf := v.data.Back()
+
+		// Is there at least one buffer?
+		if buf == nil || buf.Full() {
+			buf = bufferPool.Get().(*Buffer)
+			v.data.PushBack(buf)
+		}
+
+		// Write up to length bytes.
+		l := len(buf.data) - buf.write
+		if int64(l) > length-v.size {
+			l = int(length - v.size)
+		}
+
+		// Zero the written section; note that this pattern is
+		// specifically recognized and optimized by the compiler.
+		if zero {
+			for i := buf.write; i < buf.write+l; i++ {
+				buf.data[i] = 0
+			}
+		}
+
+		// Advance the index.
+		buf.write += l
+		v.size += int64(l)
+	}
+}
+
+// Prepend prepends the given data.
+func (v *View) Prepend(data []byte) {
+	// Is there any space in the first buffer?
+	if buf := v.data.Front(); buf != nil && buf.read > 0 {
+		// Fill up before the first write.
+		avail := buf.read
+		copy(buf.data[0:], data[len(data)-avail:])
+		data = data[:len(data)-avail]
+		v.size += int64(avail)
+	}
+
+	for len(data) > 0 {
+		// Do we need an empty buffer?
+		buf := bufferPool.Get().(*Buffer)
+		v.data.PushFront(buf)
+
+		// The buffer is empty; copy last chunk.
+		start := len(data) - len(buf.data)
+		if start < 0 {
+			start = 0 // Everything.
+		}
+
+		// We have to put the data at the end of the current
+		// buffer in order to ensure that the next prepend will
+		// correctly fill up the beginning of this buffer.
+		bStart := len(buf.data) - len(data[start:])
+		n := copy(buf.data[bStart:], data[start:])
+		buf.read = bStart
+		buf.write = len(buf.data)
+		data = data[:start]
+		v.size += int64(n)
+	}
+}
+
+// Append appends the given data.
+func (v *View) Append(data []byte) {
+	for done := 0; done < len(data); {
+		buf := v.data.Back()
+
+		// Find the first empty buffer.
+		if buf == nil || buf.Full() {
+			buf = bufferPool.Get().(*Buffer)
+			v.data.PushBack(buf)
+		}
+
+		// Copy in to the given buffer.
+		n := copy(buf.data[buf.write:], data[done:])
+		done += n
+		buf.write += n
+		v.size += int64(n)
+	}
+}
+
+// Flatten returns a flattened copy of this data.
+//
+// This method should not be used in any performance-sensitive paths. It may
+// allocate a fresh byte slice sufficiently large to contain all the data in
+// the buffer.
+//
+// N.B. Tee data still belongs to this view, as if there is a single buffer
+// present, then it will be returned directly. This should be used for
+// temporary use only, and a reference to the given slice should not be held.
+func (v *View) Flatten() []byte {
+	if buf := v.data.Front(); buf.Next() == nil {
+		return buf.data[buf.read:buf.write] // Only one buffer.
+	}
+	data := make([]byte, 0, v.size) // Need to flatten.
+	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+		// Copy to the allocated slice.
+		data = append(data, buf.data[buf.read:buf.write]...)
+	}
+	return data
+}
+
+// Size indicates the total amount of data available in this view.
+func (v *View) Size() (sz int64) {
+	sz = v.size // Pre-calculated.
+	return sz
+}
+
+// Copy makes a strict copy of this view.
+func (v *View) Copy() (other View) {
+	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+		other.Append(buf.data[buf.read:buf.write])
+	}
+	return other
+}
+
+// Apply applies the given function across all valid data.
+func (v *View) Apply(fn func([]byte)) {
+	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+		if l := int64(buf.write - buf.read); l > 0 {
+			fn(buf.data[buf.read:buf.write])
+		}
+	}
+}
+
+// Merge merges the provided View with this one.
+//
+// The other view will be empty after this operation.
+func (v *View) Merge(other *View) {
+	// Copy over all buffers.
+	for buf := other.data.Front(); buf != nil && !buf.Empty(); buf = other.data.Front() {
+		other.data.Remove(buf)
+		v.data.PushBack(buf)
+	}
+
+	// Adjust sizes.
+	v.size += other.size
+	other.size = 0
+}
+
+// WriteFromReader writes to the buffer from an io.Reader.
+func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) {
+	var (
+		done int64
+		n    int
+		err  error
+	)
+	for done < count {
+		buf := v.data.Back()
+
+		// Find the first empty buffer.
+		if buf == nil || buf.Full() {
+			buf = bufferPool.Get().(*Buffer)
+			v.data.PushBack(buf)
+		}
+
+		// Is this less than the minimum batch?
+		if len(buf.data[buf.write:]) < minBatch && (count-done) >= int64(minBatch) {
+			tmp := make([]byte, minBatch)
+			n, err = r.Read(tmp)
+			v.Write(tmp[:n])
+			done += int64(n)
+			if err != nil {
+				break
+			}
+			continue
+		}
+
+		// Limit the read, if necessary.
+		end := len(buf.data)
+		if int64(end-buf.write) > (count - done) {
+			end = buf.write + int(count-done)
+		}
+
+		// Pass the relevant portion of the buffer.
+		n, err = r.Read(buf.data[buf.write:end])
+		buf.write += n
+		done += int64(n)
+		v.size += int64(n)
+		if err == io.EOF {
+			err = nil // Short write allowed.
+			break
+		} else if err != nil {
+			break
+		}
+	}
+	return done, err
+}
+
+// ReadToWriter reads from the buffer into an io.Writer.
+//
+// N.B. This does not consume the bytes read. TrimFront should
+// be called appropriately after this call in order to do so.
+func (v *View) ReadToWriter(w io.Writer, count int64) (int64, error) {
+	var (
+		done int64
+		n    int
+		err  error
+	)
+	offset := 0 // Spill-over for batching.
+	for buf := v.data.Front(); buf != nil && done < count; buf = buf.Next() {
+		l := buf.write - buf.read - offset
+
+		// Is this less than the minimum batch?
+		if l < minBatch && (count-done) >= int64(minBatch) && (v.size-done) >= int64(minBatch) {
+			tmp := make([]byte, minBatch)
+			n, err = v.ReadAt(tmp, done)
+			w.Write(tmp[:n])
+			done += int64(n)
+			offset = n - l // Reset below.
+			if err != nil {
+				break
+			}
+			continue
+		}
+
+		// Limit the write if necessary.
+		if int64(l) >= (count - done) {
+			l = int(count - done)
+		}
+
+		// Perform the actual write.
+		n, err = w.Write(buf.data[buf.read+offset : buf.read+offset+l])
+		done += int64(n)
+		if err != nil {
+			break
+		}
+
+		// Reset spill-over.
+		offset = 0
+	}
+	return done, err
+}
diff --git a/pkg/buffer/view_test.go b/pkg/buffer/view_test.go
new file mode 100644
index 000000000..37e652f16
--- /dev/null
+++ b/pkg/buffer/view_test.go
@@ -0,0 +1,233 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"bytes"
+	"strings"
+	"testing"
+)
+
+func TestView(t *testing.T) {
+	testCases := []struct {
+		name   string
+		input  string
+		output string
+		ops    []func(*View)
+	}{
+		// Prepend.
+		{
+			name:  "prepend",
+			input: "world",
+			ops: []func(*View){
+				func(v *View) {
+					v.Prepend([]byte("hello "))
+				},
+			},
+			output: "hello world",
+		},
+		{
+			name:  "prepend fill",
+			input: strings.Repeat("1", bufferSize-1),
+			ops: []func(*View){
+				func(v *View) {
+					v.Prepend([]byte("0"))
+				},
+			},
+			output: "0" + strings.Repeat("1", bufferSize-1),
+		},
+		{
+			name:  "prepend overflow",
+			input: strings.Repeat("1", bufferSize),
+			ops: []func(*View){
+				func(v *View) {
+					v.Prepend([]byte("0"))
+				},
+			},
+			output: "0" + strings.Repeat("1", bufferSize),
+		},
+		{
+			name:  "prepend multiple buffers",
+			input: strings.Repeat("1", bufferSize-1),
+			ops: []func(*View){
+				func(v *View) {
+					v.Prepend([]byte(strings.Repeat("0", bufferSize*3)))
+				},
+			},
+			output: strings.Repeat("0", bufferSize*3) + strings.Repeat("1", bufferSize-1),
+		},
+
+		// Append.
+		{
+			name:  "append",
+			input: "hello",
+			ops: []func(*View){
+				func(v *View) {
+					v.Append([]byte(" world"))
+				},
+			},
+			output: "hello world",
+		},
+		{
+			name:  "append fill",
+			input: strings.Repeat("1", bufferSize-1),
+			ops: []func(*View){
+				func(v *View) {
+					v.Append([]byte("0"))
+				},
+			},
+			output: strings.Repeat("1", bufferSize-1) + "0",
+		},
+		{
+			name:  "append overflow",
+			input: strings.Repeat("1", bufferSize),
+			ops: []func(*View){
+				func(v *View) {
+					v.Append([]byte("0"))
+				},
+			},
+			output: strings.Repeat("1", bufferSize) + "0",
+		},
+		{
+			name:  "append multiple buffers",
+			input: strings.Repeat("1", bufferSize-1),
+			ops: []func(*View){
+				func(v *View) {
+					v.Append([]byte(strings.Repeat("0", bufferSize*3)))
+				},
+			},
+			output: strings.Repeat("1", bufferSize-1) + strings.Repeat("0", bufferSize*3),
+		},
+
+		// Truncate.
+		{
+			name:  "truncate",
+			input: "hello world",
+			ops: []func(*View){
+				func(v *View) {
+					v.Truncate(5)
+				},
+			},
+			output: "hello",
+		},
+		{
+			name:  "truncate multiple buffers",
+			input: strings.Repeat("1", bufferSize*2),
+			ops: []func(*View){
+				func(v *View) {
+					v.Truncate(bufferSize*2 - 1)
+				},
+			},
+			output: strings.Repeat("1", bufferSize*2-1),
+		},
+		{
+			name:  "truncate multiple buffers to one buffer",
+			input: strings.Repeat("1", bufferSize*2),
+			ops: []func(*View){
+				func(v *View) {
+					v.Truncate(5)
+				},
+			},
+			output: "11111",
+		},
+
+		// TrimFront.
+		{
+			name:  "trim",
+			input: "hello world",
+			ops: []func(*View){
+				func(v *View) {
+					v.TrimFront(6)
+				},
+			},
+			output: "world",
+		},
+		{
+			name:  "trim multiple buffers",
+			input: strings.Repeat("1", bufferSize*2),
+			ops: []func(*View){
+				func(v *View) {
+					v.TrimFront(1)
+				},
+			},
+			output: strings.Repeat("1", bufferSize*2-1),
+		},
+		{
+			name:  "trim multiple buffers to one buffer",
+			input: strings.Repeat("1", bufferSize*2),
+			ops: []func(*View){
+				func(v *View) {
+					v.TrimFront(bufferSize*2 - 1)
+				},
+			},
+			output: "1",
+		},
+
+		// Grow.
+		{
+			name:  "grow",
+			input: "hello world",
+			ops: []func(*View){
+				func(v *View) {
+					v.Grow(1, true)
+				},
+			},
+			output: "hello world",
+		},
+		{
+			name: "grow from zero",
+			ops: []func(*View){
+				func(v *View) {
+					v.Grow(1024, true)
+				},
+			},
+			output: strings.Repeat("\x00", 1024),
+		},
+		{
+			name:  "grow from non-zero",
+			input: strings.Repeat("1", bufferSize),
+			ops: []func(*View){
+				func(v *View) {
+					v.Grow(bufferSize*2, true)
+				},
+			},
+			output: strings.Repeat("1", bufferSize) + strings.Repeat("\x00", bufferSize),
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Construct the new view.
+			var view View
+			view.Append([]byte(tc.input))
+
+			// Run all operations.
+			for _, op := range tc.ops {
+				op(&view)
+			}
+
+			// Flatten and validate.
+			out := view.Flatten()
+			if !bytes.Equal([]byte(tc.output), out) {
+				t.Errorf("expected %q, got %q", tc.output, string(out))
+			}
+
+			// Ensure the size is correct.
+			if len(out) != int(view.Size()) {
+				t.Errorf("size is wrong: expected %d, got %d", len(out), view.Size())
+			}
+		})
+	}
+}
diff --git a/pkg/buffer/view_unsafe.go b/pkg/buffer/view_unsafe.go
new file mode 100644
index 000000000..d1ef39b26
--- /dev/null
+++ b/pkg/buffer/view_unsafe.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"unsafe"
+)
+
+// minBatch is the smallest Read or Write operation that the
+// WriteFromReader and ReadToWriter functions will use.
+//
+// This is defined as the size of a native pointer.
+const minBatch = int(unsafe.Sizeof(uintptr(0)))
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 4c049d5b4..f29dc0472 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -1,25 +1,10 @@
 load("//tools:defs.bzl", "go_library", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-go_template_instance(
-    name = "buffer_list",
-    out = "buffer_list.go",
-    package = "pipe",
-    prefix = "buffer",
-    template = "//pkg/ilist:generic_list",
-    types = {
-        "Element": "*buffer",
-        "Linker": "*buffer",
-    },
-)
-
 go_library(
     name = "pipe",
     srcs = [
-        "buffer.go",
-        "buffer_list.go",
         "device.go",
         "node.go",
         "pipe.go",
@@ -33,8 +18,8 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
+        "//pkg/buffer",
         "//pkg/context",
-        "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
@@ -51,7 +36,6 @@ go_test(
     name = "pipe_test",
     size = "small",
     srcs = [
-        "buffer_test.go",
         "node_test.go",
         "pipe_test.go",
     ],
diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go
deleted file mode 100644
index fe3be5dbd..000000000
--- a/pkg/sentry/kernel/pipe/buffer.go
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pipe
-
-import (
-	"io"
-
-	"gvisor.dev/gvisor/pkg/safemem"
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-// buffer encapsulates a queueable byte buffer.
-//
-// Note that the total size is slightly less than two pages. This
-// is done intentionally to ensure that the buffer object aligns
-// with runtime internals. We have no hard size or alignment
-// requirements. This two page size will effectively minimize
-// internal fragmentation, but still have a large enough chunk
-// to limit excessive segmentation.
-//
-// +stateify savable
-type buffer struct {
-	data  [8144]byte
-	read  int
-	write int
-	bufferEntry
-}
-
-// Reset resets internal data.
-//
-// This must be called before use.
-func (b *buffer) Reset() {
-	b.read = 0
-	b.write = 0
-}
-
-// Empty indicates the buffer is empty.
-//
-// This indicates there is no data left to read.
-func (b *buffer) Empty() bool {
-	return b.read == b.write
-}
-
-// Full indicates the buffer is full.
-//
-// This indicates there is no capacity left to write.
-func (b *buffer) Full() bool {
-	return b.write == len(b.data)
-}
-
-// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
-func (b *buffer) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
-	dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.write:]))
-	n, err := safemem.CopySeq(dst, srcs)
-	b.write += int(n)
-	return n, err
-}
-
-// WriteFromReader writes to the buffer from an io.Reader.
-func (b *buffer) WriteFromReader(r io.Reader, count int64) (int64, error) {
-	dst := b.data[b.write:]
-	if count < int64(len(dst)) {
-		dst = b.data[b.write:][:count]
-	}
-	n, err := r.Read(dst)
-	b.write += n
-	return int64(n), err
-}
-
-// ReadToBlocks implements safemem.Reader.ReadToBlocks.
-func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
-	src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.read:b.write]))
-	n, err := safemem.CopySeq(dsts, src)
-	b.read += int(n)
-	return n, err
-}
-
-// ReadToWriter reads from the buffer into an io.Writer.
-func (b *buffer) ReadToWriter(w io.Writer, count int64, dup bool) (int64, error) {
-	src := b.data[b.read:b.write]
-	if count < int64(len(src)) {
-		src = b.data[b.read:][:count]
-	}
-	n, err := w.Write(src)
-	if !dup {
-		b.read += n
-	}
-	return int64(n), err
-}
-
-// bufferPool is a pool for buffers.
-var bufferPool = sync.Pool{
-	New: func() interface{} {
-		return new(buffer)
-	},
-}
-
-// newBuffer grabs a new buffer from the pool.
-func newBuffer() *buffer {
-	b := bufferPool.Get().(*buffer)
-	b.Reset()
-	return b
-}
diff --git a/pkg/sentry/kernel/pipe/buffer_test.go b/pkg/sentry/kernel/pipe/buffer_test.go
deleted file mode 100644
index 4d54b8b8f..000000000
--- a/pkg/sentry/kernel/pipe/buffer_test.go
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pipe
-
-import (
-	"testing"
-	"unsafe"
-
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-func TestBufferSize(t *testing.T) {
-	bufferSize := unsafe.Sizeof(buffer{})
-	if bufferSize < usermem.PageSize {
-		t.Errorf("buffer is less than a page")
-	}
-	if bufferSize > (2 * usermem.PageSize) {
-		t.Errorf("buffer is greater than two pages")
-	}
-}
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 08410283f..725e9db7d 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -20,6 +20,7 @@ import (
 	"sync/atomic"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -70,10 +71,10 @@ type Pipe struct {
 	// mu protects all pipe internal state below.
 	mu sync.Mutex `state:"nosave"`
 
-	// data is the buffer queue of pipe contents.
+	// view is the underlying set of buffers.
 	//
 	// This is protected by mu.
-	data bufferList
+	view buffer.View
 
 	// max is the maximum size of the pipe in bytes. When this max has been
 	// reached, writers will get EWOULDBLOCK.
@@ -81,11 +82,6 @@ type Pipe struct {
 	// This is protected by mu.
 	max int64
 
-	// size is the current size of the pipe in bytes.
-	//
-	// This is protected by mu.
-	size int64
-
 	// hadWriter indicates if this pipe ever had a writer. Note that this
 	// does not necessarily indicate there is *currently* a writer, just
 	// that there has been a writer at some point since the pipe was
@@ -196,7 +192,7 @@ type readOps struct {
 	limit func(int64)
 
 	// read performs the actual read operation.
-	read func(*buffer) (int64, error)
+	read func(*buffer.View) (int64, error)
 }
 
 // read reads data from the pipe into dst and returns the number of bytes
@@ -213,7 +209,7 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
 	defer p.mu.Unlock()
 
 	// Is the pipe empty?
-	if p.size == 0 {
+	if p.view.Size() == 0 {
 		if !p.HasWriters() {
 			// There are no writers, return EOF.
 			return 0, nil
@@ -222,71 +218,13 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
 	}
 
 	// Limit how much we consume.
-	if ops.left() > p.size {
-		ops.limit(p.size)
+	if ops.left() > p.view.Size() {
+		ops.limit(p.view.Size())
 	}
 
-	done := int64(0)
-	for ops.left() > 0 {
-		// Pop the first buffer.
-		first := p.data.Front()
-		if first == nil {
-			break
-		}
-
-		// Copy user data.
-		n, err := ops.read(first)
-		done += int64(n)
-		p.size -= n
-
-		// Empty buffer?
-		if first.Empty() {
-			// Push to the free list.
-			p.data.Remove(first)
-			bufferPool.Put(first)
-		}
-
-		// Handle errors.
-		if err != nil {
-			return done, err
-		}
-	}
-
-	return done, nil
-}
-
-// dup duplicates all data from this pipe into the given writer.
-//
-// There is no blocking behavior implemented here. The writer may propagate
-// some blocking error. All the writes must be complete writes.
-func (p *Pipe) dup(ctx context.Context, ops readOps) (int64, error) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-
-	// Is the pipe empty?
-	if p.size == 0 {
-		if !p.HasWriters() {
-			// See above.
-			return 0, nil
-		}
-		return 0, syserror.ErrWouldBlock
-	}
-
-	// Limit how much we consume.
-	if ops.left() > p.size {
-		ops.limit(p.size)
-	}
-
-	done := int64(0)
-	for buf := p.data.Front(); buf != nil; buf = buf.Next() {
-		n, err := ops.read(buf)
-		done += n
-		if err != nil {
-			return done, err
-		}
-	}
-
-	return done, nil
+	// Copy user data; the read op is responsible for trimming.
+	done, err := ops.read(&p.view)
+	return done, err
 }
 
 type writeOps struct {
@@ -297,7 +235,7 @@ type writeOps struct {
 	limit func(int64)
 
 	// write should write to the provided buffer.
-	write func(*buffer) (int64, error)
+	write func(*buffer.View) (int64, error)
 }
 
 // write writes data from sv into the pipe and returns the number of bytes
@@ -317,33 +255,19 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
 	// POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
 	// atomic, but requires no atomicity for writes larger than this.
 	wanted := ops.left()
-	if avail := p.max - p.size; wanted > avail {
+	if avail := p.max - p.view.Size(); wanted > avail {
 		if wanted <= p.atomicIOBytes {
 			return 0, syserror.ErrWouldBlock
 		}
 		ops.limit(avail)
 	}
 
-	done := int64(0)
-	for ops.left() > 0 {
-		// Need a new buffer?
-		last := p.data.Back()
-		if last == nil || last.Full() {
-			// Add a new buffer to the data list.
-			last = newBuffer()
-			p.data.PushBack(last)
-		}
-
-		// Copy user data.
-		n, err := ops.write(last)
-		done += int64(n)
-		p.size += n
-
-		// Handle errors.
-		if err != nil {
-			return done, err
-		}
+	// Copy user data.
+	done, err := ops.write(&p.view)
+	if err != nil {
+		return done, err
 	}
+
 	if wanted > done {
 		// Partial write due to full pipe.
 		return done, syserror.ErrWouldBlock
@@ -396,7 +320,7 @@ func (p *Pipe) HasWriters() bool {
 // Precondition: mu must be held.
 func (p *Pipe) rReadinessLocked() waiter.EventMask {
 	ready := waiter.EventMask(0)
-	if p.HasReaders() && p.data.Front() != nil {
+	if p.HasReaders() && p.view.Size() != 0 {
 		ready |= waiter.EventIn
 	}
 	if !p.HasWriters() && p.hadWriter {
@@ -422,7 +346,7 @@ func (p *Pipe) rReadiness() waiter.EventMask {
 // Precondition: mu must be held.
 func (p *Pipe) wReadinessLocked() waiter.EventMask {
 	ready := waiter.EventMask(0)
-	if p.HasWriters() && p.size < p.max {
+	if p.HasWriters() && p.view.Size() < p.max {
 		ready |= waiter.EventOut
 	}
 	if !p.HasReaders() {
@@ -451,7 +375,7 @@ func (p *Pipe) rwReadiness() waiter.EventMask {
 func (p *Pipe) queued() int64 {
 	p.mu.Lock()
 	defer p.mu.Unlock()
-	return p.size
+	return p.view.Size()
 }
 
 // FifoSize implements fs.FifoSizer.FifoSize.
@@ -474,7 +398,7 @@ func (p *Pipe) SetFifoSize(size int64) (int64, error) {
 	}
 	p.mu.Lock()
 	defer p.mu.Unlock()
-	if size < p.size {
+	if size < p.view.Size() {
 		return 0, syserror.EBUSY
 	}
 	p.max = size
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index 80158239e..5a1d4fd57 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -21,6 +21,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/amutex"
+	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -49,9 +50,10 @@ func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error)
 		limit: func(l int64) {
 			dst = dst.TakeFirst64(l)
 		},
-		read: func(buf *buffer) (int64, error) {
-			n, err := dst.CopyOutFrom(ctx, buf)
+		read: func(view *buffer.View) (int64, error) {
+			n, err := dst.CopyOutFrom(ctx, view)
 			dst = dst.DropFirst64(n)
+			view.TrimFront(n)
 			return n, err
 		},
 	})
@@ -70,16 +72,15 @@ func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool)
 		limit: func(l int64) {
 			count = l
 		},
-		read: func(buf *buffer) (int64, error) {
-			n, err := buf.ReadToWriter(w, count, dup)
+		read: func(view *buffer.View) (int64, error) {
+			n, err := view.ReadToWriter(w, count)
+			if !dup {
+				view.TrimFront(n)
+			}
 			count -= n
 			return n, err
 		},
 	}
-	if dup {
-		// There is no notification for dup operations.
-		return p.dup(ctx, ops)
-	}
 	n, err := p.read(ctx, ops)
 	if n > 0 {
 		p.Notify(waiter.EventOut)
@@ -96,8 +97,8 @@ func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error)
 		limit: func(l int64) {
 			src = src.TakeFirst64(l)
 		},
-		write: func(buf *buffer) (int64, error) {
-			n, err := src.CopyInTo(ctx, buf)
+		write: func(view *buffer.View) (int64, error) {
+			n, err := src.CopyInTo(ctx, view)
 			src = src.DropFirst64(n)
 			return n, err
 		},
@@ -117,8 +118,8 @@ func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, e
 		limit: func(l int64) {
 			count = l
 		},
-		write: func(buf *buffer) (int64, error) {
-			n, err := buf.WriteFromReader(r, count)
+		write: func(view *buffer.View) (int64, error) {
+			n, err := view.WriteFromReader(r, count)
 			count -= n
 			return n, err
 		},
-- 
cgit v1.2.3


From df8740b8a7fb8fa05d7a0387749b61d57a74c06c Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Fri, 28 Feb 2020 13:21:33 -0800
Subject: Mark gettid and getdents as nogotsan

PiperOrigin-RevId: 297915815
---
 test/perf/BUILD | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/perf/BUILD b/test/perf/BUILD
index 346a28e16..0a0def6a3 100644
--- a/test/perf/BUILD
+++ b/test/perf/BUILD
@@ -30,6 +30,7 @@ syscall_test(
 
 syscall_test(
     size = "enormous",
+    tags = ["nogotsan"],
     test = "//test/perf/linux:getdents_benchmark",
 )
 
@@ -40,6 +41,7 @@ syscall_test(
 
 syscall_test(
     size = "enormous",
+    tags = ["nogotsan"],
     test = "//test/perf/linux:gettid_benchmark",
 )
 
-- 
cgit v1.2.3


From ccecf29f3ffd1fe2179d95d2c16c7c60d77a6b17 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 28 Feb 2020 13:21:56 -0800
Subject: Bump rules_go to 0.22.0 and go toolchain to 1.14.

PiperOrigin-RevId: 297915917
---
 WORKSPACE | 57 ++++++++++++++++++++++++++++++++++++---------------------
 go.mod    |  6 ++++--
 go.sum    | 16 +++++++++++++++-
 3 files changed, 55 insertions(+), 24 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index 995d2c7f1..3afd7e832 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -4,10 +4,10 @@ load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
 # Load go bazel rules and gazelle.
 http_archive(
     name = "io_bazel_rules_go",
-    sha256 = "f99a9d76e972e0c8f935b2fe6d0d9d778f67c760c6d2400e23fc2e469016e2bd",
+    sha256 = "94f90feaa65c9cdc840cd21f67d967870b5943d684966a47569da8073e42063d",
     urls = [
-        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/rules_go/releases/download/v0.21.2/rules_go-v0.21.2.tar.gz",
-        "https://github.com/bazelbuild/rules_go/releases/download/v0.21.2/rules_go-v0.21.2.tar.gz",
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.22.0/rules_go-v0.22.0.tar.gz",
+        "https://github.com/bazelbuild/rules_go/releases/download/v0.22.0/rules_go-v0.22.0.tar.gz",
     ],
 )
 
@@ -25,7 +25,7 @@ load("@io_bazel_rules_go//go:deps.bzl", "go_register_toolchains", "go_rules_depe
 go_rules_dependencies()
 
 go_register_toolchains(
-    go_version = "1.13.7",
+    go_version = "1.14",
     nogo = "@//:nogo",
 )
 
@@ -43,8 +43,8 @@ gazelle_dependencies()
 go_repository(
     name = "org_golang_x_sys",
     importpath = "golang.org/x/sys",
-    sum = "h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU=",
-    version = "v0.0.0-20190215142949-d0b11bdaac8a",
+    sum = "h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=",
+    version = "v0.0.0-20190412213103-97732733099d",
 )
 
 # Load C++ rules.
@@ -99,11 +99,11 @@ pip_install()
 # See releases at https://releases.bazel.build/bazel-toolchains.html
 http_archive(
     name = "bazel_toolchains",
-    sha256 = "a653c9d318e42b14c0ccd7ac50c4a2a276c0db1e39743ab88b5aa2f0bc9cf607",
-    strip_prefix = "bazel-toolchains-2.0.2",
+    sha256 = "b5a8039df7119d618402472f3adff8a1bd0ae9d5e253f53fcc4c47122e91a3d2",
+    strip_prefix = "bazel-toolchains-2.1.1",
     urls = [
-        "https://github.com/bazelbuild/bazel-toolchains/releases/download/2.0.2/bazel-toolchains-2.0.2.tar.gz",
-        "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/2.0.2.tar.gz",
+        "https://github.com/bazelbuild/bazel-toolchains/releases/download/2.1.1/bazel-toolchains-2.1.1.tar.gz",
+        "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/2.1.1.tar.gz",
     ],
 )
 
@@ -263,22 +263,22 @@ go_repository(
 go_repository(
     name = "org_golang_x_crypto",
     importpath = "golang.org/x/crypto",
-    sum = "h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M=",
-    version = "v0.0.0-20190308221718-c2843e01d9a2",
+    sum = "h1:ObdrDkeb4kJdCP557AjRjq69pTHfNouLtWZG7j9rPN8=",
+    version = "v0.0.0-20191011191535-87dc89f01550",
 )
 
 go_repository(
-    name = "org_golang_x_net",
-    importpath = "golang.org/x/net",
-    sum = "h1:oWX7TPOiFAMXLq8o0ikBYfCJVlRHBcsciT5bXOrH628=",
-    version = "v0.0.0-20190311183353-d8887717615a",
+    name = "org_golang_x_mod",
+    importpath = "golang.org/x/mod",
+    sum = "h1:p1YOIz9H/mGN8k1XkaV5VFAq9+zhN9Obefv439UwRhI=",
+    version = "v0.2.1-0.20200224194123-e5e73c1b9c72",
 )
 
 go_repository(
-    name = "org_golang_x_text",
-    importpath = "golang.org/x/text",
-    sum = "h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=",
-    version = "v0.3.0",
+    name = "org_golang_x_net",
+    importpath = "golang.org/x/net",
+    sum = "h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI=",
+    version = "v0.0.0-20190620200207-3b0461eec859",
 )
 
 go_repository(
@@ -288,6 +288,13 @@ go_repository(
     version = "v0.0.0-20190423024810-112230192c58",
 )
 
+go_repository(
+    name = "org_golang_x_text",
+    importpath = "golang.org/x/text",
+    sum = "h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=",
+    version = "v0.3.0",
+)
+
 go_repository(
     name = "org_golang_x_time",
     importpath = "golang.org/x/time",
@@ -297,8 +304,16 @@ go_repository(
 
 go_repository(
     name = "org_golang_x_tools",
-    commit = "aa82965741a9fecd12b026fbb3d3c6ed3231b8f8",
     importpath = "golang.org/x/tools",
+    sum = "h1:aZzprAO9/8oim3qStq3wc1Xuxx4QmAGriC4VU4ojemQ=",
+    version = "v0.0.0-20191119224855-298f0cb1881e",
+)
+
+go_repository(
+    name = "org_golang_x_xerrors",
+    importpath = "golang.org/x/xerrors",
+    sum = "h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=",
+    version = "v0.0.0-20191204190536-9bdfabe68543",
 )
 
 go_repository(
diff --git a/go.mod b/go.mod
index 3a8b9288d..e03aa56c1 100644
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,6 @@
 module gvisor.dev/gvisor
 
-go 1.13
+go 1.14
 
 require (
 	github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
@@ -14,7 +14,9 @@ require (
 	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
 	github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
 	github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 // indirect
-	golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
+	golang.org/x/mod v0.2.1-0.20200224194123-e5e73c1b9c72 // indirect
+	golang.org/x/sys v0.0.0-20190412213103-97732733099d
 	golang.org/x/time v0.0.0-20191024005414-555d28b269f0
+	golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 // indirect
 	gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
 )
diff --git a/go.sum b/go.sum
index f16a549fd..7d7a11c1d 100644
--- a/go.sum
+++ b/go.sum
@@ -22,9 +22,23 @@ github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e h1:/Tdc23Arz
 github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e/go.mod h1:+SR5DhBJrl6ZM7CoCKvpw5BKroDKQ+PJqOg65H/2ktk=
 github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 h1:J9gO8RJCAFlln1jsvRba/CWVUnMHwObklfxxjErl1uk=
 github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936/go.mod h1:ZjcWmFBXmLKZu9Nxj3WKYEafiSqer2rnvPr0en9UNpI=
-golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/mod v0.2.1-0.20200224194123-e5e73c1b9c72 h1:p1YOIz9H/mGN8k1XkaV5VFAq9+zhN9Obefv439UwRhI=
+golang.org/x/mod v0.2.1-0.20200224194123-e5e73c1b9c72/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0 h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
 gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-- 
cgit v1.2.3


From 837cf6255120fe4577a9fd758ecbe2f52f268ba8 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Fri, 14 Feb 2020 11:58:07 -0800
Subject: pcids.go isn't arch-specific

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 pkg/sentry/platform/ring0/pagetables/BUILD        |   2 +-
 pkg/sentry/platform/ring0/pagetables/pcids.go     | 107 +++++++++++++++++++++
 pkg/sentry/platform/ring0/pagetables/pcids_x86.go | 109 ----------------------
 3 files changed, 108 insertions(+), 110 deletions(-)
 create mode 100644 pkg/sentry/platform/ring0/pagetables/pcids.go
 delete mode 100644 pkg/sentry/platform/ring0/pagetables/pcids_x86.go

diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 4f2406ce3..581841555 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -80,7 +80,7 @@ go_library(
         "pagetables_amd64.go",
         "pagetables_arm64.go",
         "pagetables_x86.go",
-        "pcids_x86.go",
+        "pcids.go",
         "walker_amd64.go",
         "walker_arm64.go",
         "walker_empty.go",
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids.go b/pkg/sentry/platform/ring0/pagetables/pcids.go
new file mode 100644
index 000000000..9206030bf
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids.go
@@ -0,0 +1,107 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// limitPCID is the number of valid PCIDs.
+const limitPCID = 4096
+
+// PCIDs is a simple PCID database.
+//
+// This is not protected by locks and is thus suitable for use only with a
+// single CPU at a time.
+type PCIDs struct {
+	// mu protects below.
+	mu sync.Mutex
+
+	// cache are the assigned page tables.
+	cache map[*PageTables]uint16
+
+	// avail are available PCIDs.
+	avail []uint16
+}
+
+// NewPCIDs returns a new PCID database.
+//
+// start is the first index to assign. Typically this will be one, as the zero
+// pcid will always be flushed on transition (see pagetables_x86.go). This may
+// be more than one if specific PCIDs are reserved.
+//
+// Nil is returned iff the start and size are out of range.
+func NewPCIDs(start, size uint16) *PCIDs {
+	if start+uint16(size) >= limitPCID {
+		return nil // See comment.
+	}
+	p := &PCIDs{
+		cache: make(map[*PageTables]uint16),
+	}
+	for pcid := start; pcid < start+size; pcid++ {
+		p.avail = append(p.avail, pcid)
+	}
+	return p
+}
+
+// Assign assigns a PCID to the given PageTables.
+//
+// This may overwrite any previous assignment provided. If this in the case,
+// true is returned to indicate that the PCID should be flushed.
+func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) {
+	p.mu.Lock()
+	if pcid, ok := p.cache[pt]; ok {
+		p.mu.Unlock()
+		return pcid, false // No flush.
+	}
+
+	// Is there something available?
+	if len(p.avail) > 0 {
+		pcid := p.avail[len(p.avail)-1]
+		p.avail = p.avail[:len(p.avail)-1]
+		p.cache[pt] = pcid
+
+		// We need to flush because while this is in the available
+		// pool, it may have been used previously.
+		p.mu.Unlock()
+		return pcid, true
+	}
+
+	// Evict an existing table.
+	for old, pcid := range p.cache {
+		delete(p.cache, old)
+		p.cache[pt] = pcid
+
+		// A flush is definitely required in this case, these page
+		// tables may still be active. (They will just be assigned some
+		// other PCID if and when they hit the given CPU again.)
+		p.mu.Unlock()
+		return pcid, true
+	}
+
+	// No PCID.
+	p.mu.Unlock()
+	return 0, false
+}
+
+// Drop drops references to a set of page tables.
+func (p *PCIDs) Drop(pt *PageTables) {
+	p.mu.Lock()
+	if pcid, ok := p.cache[pt]; ok {
+		delete(p.cache, pt)
+		p.avail = append(p.avail, pcid)
+	}
+	p.mu.Unlock()
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
deleted file mode 100644
index e199bae18..000000000
--- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build i386 amd64
-
-package pagetables
-
-import (
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-// limitPCID is the number of valid PCIDs.
-const limitPCID = 4096
-
-// PCIDs is a simple PCID database.
-//
-// This is not protected by locks and is thus suitable for use only with a
-// single CPU at a time.
-type PCIDs struct {
-	// mu protects below.
-	mu sync.Mutex
-
-	// cache are the assigned page tables.
-	cache map[*PageTables]uint16
-
-	// avail are available PCIDs.
-	avail []uint16
-}
-
-// NewPCIDs returns a new PCID database.
-//
-// start is the first index to assign. Typically this will be one, as the zero
-// pcid will always be flushed on transition (see pagetables_x86.go). This may
-// be more than one if specific PCIDs are reserved.
-//
-// Nil is returned iff the start and size are out of range.
-func NewPCIDs(start, size uint16) *PCIDs {
-	if start+uint16(size) >= limitPCID {
-		return nil // See comment.
-	}
-	p := &PCIDs{
-		cache: make(map[*PageTables]uint16),
-	}
-	for pcid := start; pcid < start+size; pcid++ {
-		p.avail = append(p.avail, pcid)
-	}
-	return p
-}
-
-// Assign assigns a PCID to the given PageTables.
-//
-// This may overwrite any previous assignment provided. If this in the case,
-// true is returned to indicate that the PCID should be flushed.
-func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) {
-	p.mu.Lock()
-	if pcid, ok := p.cache[pt]; ok {
-		p.mu.Unlock()
-		return pcid, false // No flush.
-	}
-
-	// Is there something available?
-	if len(p.avail) > 0 {
-		pcid := p.avail[len(p.avail)-1]
-		p.avail = p.avail[:len(p.avail)-1]
-		p.cache[pt] = pcid
-
-		// We need to flush because while this is in the available
-		// pool, it may have been used previously.
-		p.mu.Unlock()
-		return pcid, true
-	}
-
-	// Evict an existing table.
-	for old, pcid := range p.cache {
-		delete(p.cache, old)
-		p.cache[pt] = pcid
-
-		// A flush is definitely required in this case, these page
-		// tables may still be active. (They will just be assigned some
-		// other PCID if and when they hit the given CPU again.)
-		p.mu.Unlock()
-		return pcid, true
-	}
-
-	// No PCID.
-	p.mu.Unlock()
-	return 0, false
-}
-
-// Drop drops references to a set of page tables.
-func (p *PCIDs) Drop(pt *PageTables) {
-	p.mu.Lock()
-	if pcid, ok := p.cache[pt]; ok {
-		delete(p.cache, pt)
-		p.avail = append(p.avail, pcid)
-	}
-	p.mu.Unlock()
-}
-- 
cgit v1.2.3


From ce4d1e45bb8822a1677c90c33fe211467febc1b6 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 28 Feb 2020 15:01:01 -0800
Subject: Run `./tools/go_mod.sh tidy`.

These dependencies do not need to be in our go.mod or go.sum files.

PiperOrigin-RevId: 297942163
---
 go.mod |  2 --
 go.sum | 14 --------------
 2 files changed, 16 deletions(-)

diff --git a/go.mod b/go.mod
index e03aa56c1..80d7b513a 100644
--- a/go.mod
+++ b/go.mod
@@ -14,9 +14,7 @@ require (
 	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
 	github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
 	github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 // indirect
-	golang.org/x/mod v0.2.1-0.20200224194123-e5e73c1b9c72 // indirect
 	golang.org/x/sys v0.0.0-20190412213103-97732733099d
 	golang.org/x/time v0.0.0-20191024005414-555d28b269f0
-	golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 // indirect
 	gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
 )
diff --git a/go.sum b/go.sum
index 7d7a11c1d..99f1e7dd4 100644
--- a/go.sum
+++ b/go.sum
@@ -22,23 +22,9 @@ github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e h1:/Tdc23Arz
 github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e/go.mod h1:+SR5DhBJrl6ZM7CoCKvpw5BKroDKQ+PJqOg65H/2ktk=
 github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 h1:J9gO8RJCAFlln1jsvRba/CWVUnMHwObklfxxjErl1uk=
 github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936/go.mod h1:ZjcWmFBXmLKZu9Nxj3WKYEafiSqer2rnvPr0en9UNpI=
-golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
-golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
-golang.org/x/mod v0.2.1-0.20200224194123-e5e73c1b9c72 h1:p1YOIz9H/mGN8k1XkaV5VFAq9+zhN9Obefv439UwRhI=
-golang.org/x/mod v0.2.1-0.20200224194123-e5e73c1b9c72/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
-golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
-golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
-golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0 h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
-golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
-golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
-golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
 gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-- 
cgit v1.2.3


From 413a9b7fdc14f8bff660e1988e3ef0355dd4e6c6 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Fri, 21 Feb 2020 11:00:11 -0800
Subject: Define CPUIDInstruction for arm64

There is no cpuid instruction on arm64, so we need to defined it
just to avoid a compile time error.

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 pkg/sentry/arch/arch_arm64.go |  5 +++++
 pkg/sentry/kernel/task_run.go | 41 ++++++++++++++++++++++++++++-------------
 2 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index 372b650b9..885115ae2 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -53,6 +53,11 @@ const (
 	preferredPIELoadAddr usermem.Addr = maxAddr64 / 6 * 5
 )
 
+var (
+	// CPUIDInstruction doesn't exist on ARM64.
+	CPUIDInstruction = []byte{}
+)
+
 // These constants are selected as heuristics to help make the Platform's
 // potentially limited address space conform as closely to Linux as possible.
 const (
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 5568c91bc..799cbcd93 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -126,13 +126,39 @@ func (t *Task) doStop() {
 	}
 }
 
+func (*runApp) handleCPUIDInstruction(t *Task) error {
+	if len(arch.CPUIDInstruction) == 0 {
+		// CPUID emulation isn't supported, but this code can be
+		// executed, because the ptrace platform returns
+		// ErrContextSignalCPUID on page faults too. Look at
+		// pkg/sentry/platform/ptrace/ptrace.go:context.Switch for more
+		// details.
+		return platform.ErrContextSignal
+	}
+	// Is this a CPUID instruction?
+	region := trace.StartRegion(t.traceContext, cpuidRegion)
+	expected := arch.CPUIDInstruction[:]
+	found := make([]byte, len(expected))
+	_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+	if err == nil && bytes.Equal(expected, found) {
+		// Skip the cpuid instruction.
+		t.Arch().CPUIDEmulate(t)
+		t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
+		region.End()
+
+		return nil
+	}
+	region.End() // Not an actual CPUID, but required copy-in.
+	return platform.ErrContextSignal
+}
+
 // The runApp state checks for interrupts before executing untrusted
 // application code.
 //
 // +stateify savable
 type runApp struct{}
 
-func (*runApp) execute(t *Task) taskRunState {
+func (app *runApp) execute(t *Task) taskRunState {
 	if t.interrupted() {
 		// Checkpointing instructs tasks to stop by sending an interrupt, so we
 		// must check for stops before entering runInterrupt (instead of
@@ -237,21 +263,10 @@ func (*runApp) execute(t *Task) taskRunState {
 		return (*runApp)(nil)
 
 	case platform.ErrContextSignalCPUID:
-		// Is this a CPUID instruction?
-		region := trace.StartRegion(t.traceContext, cpuidRegion)
-		expected := arch.CPUIDInstruction[:]
-		found := make([]byte, len(expected))
-		_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
-		if err == nil && bytes.Equal(expected, found) {
-			// Skip the cpuid instruction.
-			t.Arch().CPUIDEmulate(t)
-			t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
-			region.End()
-
+		if err := app.handleCPUIDInstruction(t); err == nil {
 			// Resume execution.
 			return (*runApp)(nil)
 		}
-		region.End() // Not an actual CPUID, but required copy-in.
 
 		// The instruction at the given RIP was not a CPUID, and we
 		// fallthrough to the default signal deliver behavior below.
-- 
cgit v1.2.3


From ab7ecdd66d2aa4835bbe655ba672bd30813a2a88 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 28 Feb 2020 17:53:25 -0800
Subject: watchdog: print panic error message before other messages

This is needed for syzkaller to proper classify issues.

Right now, all watchdog issues are duped to one with the
subject "panic: Sentry detected stuck task(s). See stack
trace and message above for more details".

PiperOrigin-RevId: 297975363
---
 pkg/sentry/watchdog/watchdog.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index bfb2fac26..f7d6009a0 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -221,7 +221,7 @@ func (w *Watchdog) waitForStart() {
 		return
 	}
 	var buf bytes.Buffer
-	buf.WriteString("Watchdog.Start() not called within %s:\n")
+	buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
 	w.doAction(w.StartupTimeoutAction, false, &buf)
 }
 
@@ -325,7 +325,7 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo
 
 func (w *Watchdog) reportStuckWatchdog() {
 	var buf bytes.Buffer
-	buf.WriteString("Watchdog goroutine is stuck:\n")
+	buf.WriteString("Watchdog goroutine is stuck:")
 	w.doAction(w.TaskTimeoutAction, false, &buf)
 }
 
@@ -359,7 +359,7 @@ func (w *Watchdog) doAction(action Action, skipStack bool, msg *bytes.Buffer) {
 		case <-metricsEmitted:
 		case <-time.After(1 * time.Second):
 		}
-		panic(fmt.Sprintf("Stack for running G's are skipped while panicking.\n%s", msg.String()))
+		panic(fmt.Sprintf("%s\nStack for running G's are skipped while panicking.", msg.String()))
 	default:
 		panic(fmt.Sprintf("Unknown watchdog action %v", action))
 
-- 
cgit v1.2.3


From 99e395e3b1379c12a382544d597dc12bd9293592 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Fri, 28 Feb 2020 14:52:01 +0800
Subject: passed the syscall test case 'clock_getres' on Arm64 platform

Test command:
	bazel test //test/syscalls:clock_getres_test_runsc_ptrace

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 vdso/vdso.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vdso/vdso.cc b/vdso/vdso.cc
index 8bb80a7a4..c2585d592 100644
--- a/vdso/vdso.cc
+++ b/vdso/vdso.cc
@@ -126,6 +126,10 @@ extern "C" int __kernel_clock_getres(clockid_t clock, struct timespec* res) {
     case CLOCK_REALTIME:
     case CLOCK_MONOTONIC:
     case CLOCK_BOOTTIME: {
+      if (res == nullptr) {
+        return 0;
+      }
+
       res->tv_sec = 0;
       res->tv_nsec = 1;
       break;
-- 
cgit v1.2.3


From 36b193b1db60cad3c1c65ce3abef03a6a0594e3e Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Mon, 2 Mar 2020 07:13:47 +0000
Subject: Fix syscall test build error on arm64.

The error was introduced in the merge of PR #1471.
Some codes are missing when adding bazel select_arch
command to the test/syscall/linux/BUILD file.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I8cae3f4ae78c2e14671f3ac6e7361dc2806d9305
---
 test/syscalls/linux/BUILD | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 70c120e42..9ab13ba07 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -608,7 +608,10 @@ cc_binary(
 cc_binary(
     name = "exceptions_test",
     testonly = 1,
-    srcs = ["exceptions.cc"],
+    srcs = select_arch(
+	amd64 = ["exceptions.cc"],
+        arm64 = [],
+    ),
     linkstatic = 1,
     deps = [
         gtest,
@@ -1475,7 +1478,10 @@ cc_binary(
 cc_binary(
     name = "arch_prctl_test",
     testonly = 1,
-    srcs = ["arch_prctl.cc"],
+    srcs = select_arch(
+        amd64 = ["arch_prctl.cc"],
+        arm64 = [],
+    ),
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
@@ -3322,7 +3328,10 @@ cc_binary(
 cc_binary(
     name = "sysret_test",
     testonly = 1,
-    srcs = ["sysret.cc"],
+    srcs = select_arch(
+        amd64 = ["sysret.cc"],
+        arm64 = [],
+    ),
     linkstatic = 1,
     deps = [
         gtest,
-- 
cgit v1.2.3


From 62bd3ca8a37543c060292ca86669e537b9ebf36d Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 2 Mar 2020 10:05:52 -0800
Subject: Take write lock when removing xattr

PiperOrigin-RevId: 298380654
---
 pkg/sentry/fs/fsutil/inode.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go
index daecc4ffe..1922ff08c 100644
--- a/pkg/sentry/fs/fsutil/inode.go
+++ b/pkg/sentry/fs/fsutil/inode.go
@@ -259,8 +259,8 @@ func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode, ui
 
 // RemoveXattr implements fs.InodeOperations.RemoveXattr.
 func (i *InodeSimpleExtendedAttributes) RemoveXattr(_ context.Context, _ *fs.Inode, name string) error {
-	i.mu.RLock()
-	defer i.mu.RUnlock()
+	i.mu.Lock()
+	defer i.mu.Unlock()
 	if _, ok := i.xattrs[name]; ok {
 		delete(i.xattrs, name)
 		return nil
-- 
cgit v1.2.3


From 42fb7d349137bd8847e7c3df6493fde3bc8e6e89 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 2 Mar 2020 10:32:20 -0800
Subject: socket: take readMu to access readView

DATA RACE in netstack.(*SocketOperations).fetchReadView

Write at 0x00c001dca138 by goroutine 1001:
  gvisor.dev/gvisor/pkg/sentry/socket/netstack.(*SocketOperations).fetchReadView()
      pkg/sentry/socket/netstack/netstack.go:418 +0x85
  gvisor.dev/gvisor/pkg/sentry/socket/netstack.(*SocketOperations).coalescingRead()
      pkg/sentry/socket/netstack/netstack.go:2309 +0x67
  gvisor.dev/gvisor/pkg/sentry/socket/netstack.(*SocketOperations).nonBlockingRead()
      pkg/sentry/socket/netstack/netstack.go:2378 +0x183d

Previous read at 0x00c001dca138 by goroutine 1111:
  gvisor.dev/gvisor/pkg/sentry/socket/netstack.(*SocketOperations).Ioctl()
      pkg/sentry/socket/netstack/netstack.go:2666 +0x533
  gvisor.dev/gvisor/pkg/sentry/syscalls/linux.Ioctl()

Reported-by: syzbot+d4c3885fcc346f08deb6@syzkaller.appspotmail.com
PiperOrigin-RevId: 298387377
---
 pkg/sentry/socket/netstack/netstack.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 48c268bfa..1eeb37446 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -2663,7 +2663,9 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
 		}
 
 		// Add bytes removed from the endpoint but not yet sent to the caller.
+		s.readMu.Lock()
 		v += len(s.readView)
+		s.readMu.Unlock()
 
 		if v > math.MaxInt32 {
 			v = math.MaxInt32
-- 
cgit v1.2.3


From 8821a7104f8c8f263b88def1a646d518ec3f5dd2 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Mon, 2 Mar 2020 13:14:12 -0800
Subject: Do not read-lock NIC recursively

A deadlock may occur if a write lock on a RWMutex is blocked between
nested read lock attempts as the inner read lock attempt will be
blocked in this scenario.

Example (T1 and T2 are differnt goroutines):
  T1: obtain read-lock
  T2: attempt write-lock (blocks)
  T1: attempt inner/nested read-lock (blocks)

Here we can see that T1 and T2 are deadlocked.

Tests: Existing tests pass.
PiperOrigin-RevId: 298426678
---
 pkg/tcpip/stack/nic.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 46d3a6646..3e6196aee 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -451,7 +451,7 @@ func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEn
 	cs := make([]ipv6AddrCandidate, 0, len(primaryAddrs))
 	for _, r := range primaryAddrs {
 		// If r is not valid for outgoing connections, it is not a valid endpoint.
-		if !r.isValidForOutgoing() {
+		if !r.isValidForOutgoingRLocked() {
 			continue
 		}
 
-- 
cgit v1.2.3


From 5fadbea3ed71d1d02d96289bd545871de59a4359 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Mon, 2 Mar 2020 14:38:15 -0800
Subject: Update golang.org/x/sys

It was downgraded by mistake in
e5d9a4010bdb ("Add ability to execute go.mod in gopath context.")

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 WORKSPACE | 4 ++--
 go.mod    | 2 +-
 go.sum    | 2 ++
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index 3afd7e832..d1c2d8a24 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -43,8 +43,8 @@ gazelle_dependencies()
 go_repository(
     name = "org_golang_x_sys",
     importpath = "golang.org/x/sys",
-    sum = "h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=",
-    version = "v0.0.0-20190412213103-97732733099d",
+    sum = "h1:uYVVQ9WP/Ds2ROhcaGPeIdVq0RIXVLwsHlnvJ+cT1So=",
+    version = "v0.0.0-20200302150141-5c8b2ff67527",
 )
 
 # Load C++ rules.
diff --git a/go.mod b/go.mod
index 80d7b513a..434fa713f 100644
--- a/go.mod
+++ b/go.mod
@@ -14,7 +14,7 @@ require (
 	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
 	github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
 	github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 // indirect
-	golang.org/x/sys v0.0.0-20190412213103-97732733099d
+	golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527
 	golang.org/x/time v0.0.0-20191024005414-555d28b269f0
 	gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
 )
diff --git a/go.sum b/go.sum
index 99f1e7dd4..c44a17c71 100644
--- a/go.sum
+++ b/go.sum
@@ -24,6 +24,8 @@ github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 h1:J9gO8RJCAFlln
 github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936/go.mod h1:ZjcWmFBXmLKZu9Nxj3WKYEafiSqer2rnvPr0en9UNpI=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527 h1:uYVVQ9WP/Ds2ROhcaGPeIdVq0RIXVLwsHlnvJ+cT1So=
+golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0 h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
-- 
cgit v1.2.3


From 33101752501fafea99d77f34bbd65f3e0083d22e Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Mon, 2 Mar 2020 14:43:52 -0800
Subject: Fix data-race when reading/writing e.amss.

PiperOrigin-RevId: 298451319
---
 pkg/tcpip/transport/tcp/connect.go  | 11 +++++++++--
 pkg/tcpip/transport/tcp/endpoint.go | 29 ++++++++++++++++++-----------
 test/syscalls/linux/tcp_socket.cc   | 15 +++++++++++++++
 3 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index cd247f3e1..ae4f3f3a9 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -295,6 +295,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	h.state = handshakeSynRcvd
 	h.ep.mu.Lock()
 	ttl := h.ep.ttl
+	amss := h.ep.amss
 	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
 	synOpts := header.TCPSynOptions{
@@ -307,7 +308,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 		// permits SACK. This is not explicitly defined in the RFC but
 		// this is the behaviour implemented by Linux.
 		SACKPermitted: rcvSynOpts.SACKPermitted,
-		MSS:           h.ep.amss,
+		MSS:           amss,
 	}
 	if ttl == 0 {
 		ttl = s.route.DefaultTTL()
@@ -356,6 +357,10 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			return tcpip.ErrInvalidEndpointState
 		}
 
+		h.ep.mu.RLock()
+		amss := h.ep.amss
+		h.ep.mu.RUnlock()
+
 		h.resetState()
 		synOpts := header.TCPSynOptions{
 			WS:            h.rcvWndScale,
@@ -363,7 +368,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			TSVal:         h.ep.timestamp(),
 			TSEcr:         h.ep.recentTimestamp(),
 			SACKPermitted: h.ep.sackPermitted,
-			MSS:           h.ep.amss,
+			MSS:           amss,
 		}
 		h.ep.sendSynTCP(&s.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
 		return nil
@@ -530,6 +535,7 @@ func (h *handshake) execute() *tcpip.Error {
 
 	// Send the initial SYN segment and loop until the handshake is
 	// completed.
+	h.ep.mu.Lock()
 	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
 
 	synOpts := header.TCPSynOptions{
@@ -540,6 +546,7 @@ func (h *handshake) execute() *tcpip.Error {
 		SACKPermitted: bool(sackEnabled),
 		MSS:           h.ep.amss,
 	}
+	h.ep.mu.Unlock()
 
 	// Execute is also called in a listen context so we want to make sure we
 	// only send the TS/SACK option when we received the TS/SACK in the
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 9e72730bd..8b9154e69 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -959,15 +959,18 @@ func (e *endpoint) initialReceiveWindow() int {
 // ModerateRecvBuf adjusts the receive buffer and the advertised window
 // based on the number of bytes copied to user space.
 func (e *endpoint) ModerateRecvBuf(copied int) {
+	e.mu.RLock()
 	e.rcvListMu.Lock()
 	if e.rcvAutoParams.disabled {
 		e.rcvListMu.Unlock()
+		e.mu.RUnlock()
 		return
 	}
 	now := time.Now()
 	if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt {
 		e.rcvAutoParams.copied += copied
 		e.rcvListMu.Unlock()
+		e.mu.RUnlock()
 		return
 	}
 	prevRTTCopied := e.rcvAutoParams.copied + copied
@@ -1008,7 +1011,7 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 			e.rcvBufSize = rcvWnd
 			availAfter := e.receiveBufferAvailableLocked()
 			mask := uint32(notifyReceiveWindowChanged)
-			if crossed, above := e.windowCrossedACKThreshold(availAfter - availBefore); crossed && above {
+			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
 				mask |= notifyNonZeroReceiveWindow
 			}
 			e.notifyProtocolGoroutine(mask)
@@ -1023,6 +1026,7 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 	e.rcvAutoParams.measureTime = now
 	e.rcvAutoParams.copied = 0
 	e.rcvListMu.Unlock()
+	e.mu.RUnlock()
 }
 
 // IPTables implements tcpip.Endpoint.IPTables.
@@ -1052,7 +1056,6 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 
 	v, err := e.readLocked()
 	e.rcvListMu.Unlock()
-
 	e.mu.RUnlock()
 
 	if err == tcpip.ErrClosedForReceive {
@@ -1085,7 +1088,7 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 	// enough buffer space, to either fit an aMSS or half a receive buffer
 	// (whichever smaller), then notify the protocol goroutine to send a
 	// window update.
-	if crossed, above := e.windowCrossedACKThreshold(len(v)); crossed && above {
+	if crossed, above := e.windowCrossedACKThresholdLocked(len(v)); crossed && above {
 		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
 	}
 
@@ -1303,9 +1306,9 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	return num, tcpip.ControlMessages{}, nil
 }
 
-// windowCrossedACKThreshold checks if the receive window to be announced now
-// would be under aMSS or under half receive buffer, whichever smaller. This is
-// useful as a receive side silly window syndrome prevention mechanism. If
+// windowCrossedACKThresholdLocked checks if the receive window to be announced
+// now would be under aMSS or under half receive buffer, whichever smaller. This
+// is useful as a receive side silly window syndrome prevention mechanism. If
 // window grows to reasonable value, we should send ACK to the sender to inform
 // the rx space is now large. We also want ensure a series of small read()'s
 // won't trigger a flood of spurious tiny ACK's.
@@ -1316,7 +1319,9 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 // crossed will be true if the window size crossed the ACK threshold.
 // above will be true if the new window is >= ACK threshold and false
 // otherwise.
-func (e *endpoint) windowCrossedACKThreshold(deltaBefore int) (crossed bool, above bool) {
+//
+// Precondition: e.mu and e.rcvListMu must be held.
+func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed bool, above bool) {
 	newAvail := e.receiveBufferAvailableLocked()
 	oldAvail := newAvail - deltaBefore
 	if oldAvail < 0 {
@@ -1379,6 +1384,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 
 		mask := uint32(notifyReceiveWindowChanged)
 
+		e.mu.RLock()
 		e.rcvListMu.Lock()
 
 		// Make sure the receive buffer size allows us to send a
@@ -1405,11 +1411,11 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		// Immediately send an ACK to uncork the sender silly window
 		// syndrome prevetion, when our available space grows above aMSS
 		// or half receive buffer, whichever smaller.
-		if crossed, above := e.windowCrossedACKThreshold(availAfter - availBefore); crossed && above {
+		if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
 			mask |= notifyNonZeroReceiveWindow
 		}
 		e.rcvListMu.Unlock()
-
+		e.mu.RUnlock()
 		e.notifyProtocolGoroutine(mask)
 		return nil
 
@@ -2414,13 +2420,14 @@ func (e *endpoint) updateSndBufferUsage(v int) {
 // to be read, or when the connection is closed for receiving (in which case
 // s will be nil).
 func (e *endpoint) readyToRead(s *segment) {
+	e.mu.RLock()
 	e.rcvListMu.Lock()
 	if s != nil {
 		s.incRef()
 		e.rcvBufUsed += s.data.Size()
 		// Increase counter if the receive window falls down below MSS
 		// or half receive buffer size, whichever smaller.
-		if crossed, above := e.windowCrossedACKThreshold(-s.data.Size()); crossed && !above {
+		if crossed, above := e.windowCrossedACKThresholdLocked(-s.data.Size()); crossed && !above {
 			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
 		}
 		e.rcvList.PushBack(s)
@@ -2428,7 +2435,7 @@ func (e *endpoint) readyToRead(s *segment) {
 		e.rcvClosed = true
 	}
 	e.rcvListMu.Unlock()
-
+	e.mu.RUnlock()
 	e.waiterQueue.Notify(waiter.EventIn)
 }
 
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index c4591a3b9..579463384 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1349,6 +1349,21 @@ TEST_P(SimpleTcpSocketTest, RecvOnClosedSocket) {
               SyscallFailsWithErrno(ENOTCONN));
 }
 
+TEST_P(SimpleTcpSocketTest, TCPConnectSoRcvBufRace) {
+  auto s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(GetParam(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+  sockaddr_storage addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t addrlen = sizeof(addr);
+
+  RetryEINTR(connect)(s.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                      addrlen);
+  int buf_sz = 1 << 18;
+  EXPECT_THAT(
+      setsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &buf_sz, sizeof(buf_sz)),
+      SyscallSucceedsWithValue(0));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From 43abb24657e737dee1108ff0d512b2e1b6d8a3f6 Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Mon, 2 Mar 2020 16:30:51 -0800
Subject: Fix panic caused by invalid address for Bind in packet sockets.

PiperOrigin-RevId: 298476533
---
 pkg/sentry/socket/netstack/netstack.go |  4 ++++
 test/syscalls/linux/packet_socket.cc   | 13 +++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 1eeb37446..13a9a60b4 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -712,6 +712,10 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
+	if len(sockaddr) < 2 {
+		return syserr.ErrInvalidArgument
+	}
+
 	family := usermem.ByteOrder.Uint16(sockaddr)
 	var addr tcpip.FullAddress
 
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index bc22de788..248762ca9 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -417,6 +417,19 @@ TEST_P(CookedPacketTest, BindDrop) {
   EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
 }
 
+// Bind with invalid address.
+TEST_P(CookedPacketTest, BindFail) {
+  // Null address.
+  ASSERT_THAT(bind(socket_, nullptr, sizeof(struct sockaddr)),
+              SyscallFailsWithErrno(EFAULT));
+
+  // Address of size 1.
+  uint8_t addr = 0;
+  ASSERT_THAT(
+      bind(socket_, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+      SyscallFailsWithErrno(EINVAL));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest,
                          ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
-- 
cgit v1.2.3


From fc3a09cd3c56ef20fd398a5f61a5e59111ed55b3 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Tue, 3 Mar 2020 17:45:10 +0800
Subject: code clean: minor changes to compatible with ubuntu18.04

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 test/syscalls/linux/bad.cc     | 2 +-
 test/syscalls/linux/seccomp.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/syscalls/linux/bad.cc b/test/syscalls/linux/bad.cc
index adfb149df..a26fc6af3 100644
--- a/test/syscalls/linux/bad.cc
+++ b/test/syscalls/linux/bad.cc
@@ -28,7 +28,7 @@ namespace {
 constexpr uint32_t kNotImplementedSyscall = SYS_get_kernel_syms;
 #elif __aarch64__
 // Use the last of arch_specific_syscalls which are not implemented on arm64.
-constexpr uint32_t kNotImplementedSyscall = SYS_arch_specific_syscall + 15;
+constexpr uint32_t kNotImplementedSyscall = __NR_arch_specific_syscall + 15;
 #endif
 
 TEST(BadSyscallTest, NotImplemented) {
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index cf6499f8b..8e0fc9acc 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -53,7 +53,7 @@ namespace {
 constexpr uint32_t kFilteredSyscall = SYS_vserver;
 #elif __aarch64__
 // Use the last of arch_specific_syscalls which are not implemented on arm64.
-constexpr uint32_t kFilteredSyscall = SYS_arch_specific_syscall + 15;
+constexpr uint32_t kFilteredSyscall = __NR_arch_specific_syscall + 15;
 #endif
 
 // Applies a seccomp-bpf filter that returns `filtered_result` for
-- 
cgit v1.2.3


From b3c549d8391e7cadd82a5ab9280bc63bb372aa97 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 3 Mar 2020 12:36:37 -0800
Subject: Move temp_umask to test/util.

PiperOrigin-RevId: 298667595
---
 test/syscalls/linux/BUILD          |  9 ++-------
 test/syscalls/linux/mkdir.cc       |  2 +-
 test/syscalls/linux/open_create.cc |  2 +-
 test/syscalls/linux/temp_umask.h   | 39 --------------------------------------
 test/util/BUILD                    |  6 ++++++
 test/util/temp_umask.h             | 39 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 49 insertions(+), 48 deletions(-)
 delete mode 100644 test/syscalls/linux/temp_umask.h
 create mode 100644 test/util/temp_umask.h

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 70c120e42..dae2b1077 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -166,11 +166,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "temp_umask",
-    hdrs = ["temp_umask.h"],
-)
-
 cc_library(
     name = "unix_domain_socket_test_util",
     testonly = 1,
@@ -1140,11 +1135,11 @@ cc_binary(
     srcs = ["mkdir.cc"],
     linkstatic = 1,
     deps = [
-        ":temp_umask",
         "//test/util:capability_util",
         "//test/util:fs_util",
         gtest,
         "//test/util:temp_path",
+        "//test/util:temp_umask",
         "//test/util:test_main",
         "//test/util:test_util",
     ],
@@ -1299,12 +1294,12 @@ cc_binary(
     srcs = ["open_create.cc"],
     linkstatic = 1,
     deps = [
-        ":temp_umask",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
         gtest,
         "//test/util:temp_path",
+        "//test/util:temp_umask",
         "//test/util:test_main",
         "//test/util:test_util",
     ],
diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc
index cf138d328..def4c50a4 100644
--- a/test/syscalls/linux/mkdir.cc
+++ b/test/syscalls/linux/mkdir.cc
@@ -18,10 +18,10 @@
 #include <unistd.h>
 
 #include "gtest/gtest.h"
-#include "test/syscalls/linux/temp_umask.h"
 #include "test/util/capability_util.h"
 #include "test/util/fs_util.h"
 #include "test/util/temp_path.h"
+#include "test/util/temp_umask.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc
index 902d0a0dc..51eacf3f2 100644
--- a/test/syscalls/linux/open_create.cc
+++ b/test/syscalls/linux/open_create.cc
@@ -19,11 +19,11 @@
 #include <unistd.h>
 
 #include "gtest/gtest.h"
-#include "test/syscalls/linux/temp_umask.h"
 #include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/fs_util.h"
 #include "test/util/temp_path.h"
+#include "test/util/temp_umask.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
diff --git a/test/syscalls/linux/temp_umask.h b/test/syscalls/linux/temp_umask.h
deleted file mode 100644
index 81a25440c..000000000
--- a/test/syscalls/linux/temp_umask.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_
-#define GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_
-
-#include <sys/stat.h>
-#include <sys/types.h>
-
-namespace gvisor {
-namespace testing {
-
-class TempUmask {
- public:
-  // Sets the process umask to `mask`.
-  explicit TempUmask(mode_t mask) : old_mask_(umask(mask)) {}
-
-  // Sets the process umask to its previous value.
-  ~TempUmask() { umask(old_mask_); }
-
- private:
-  mode_t old_mask_;
-};
-
-}  // namespace testing
-}  // namespace gvisor
-
-#endif  // GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_
diff --git a/test/util/BUILD b/test/util/BUILD
index 8b5a0f25c..2a17c33ee 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -350,3 +350,9 @@ cc_library(
         ":save_util",
     ],
 )
+
+cc_library(
+    name = "temp_umask",
+    testonly = 1,
+    hdrs = ["temp_umask.h"],
+)
diff --git a/test/util/temp_umask.h b/test/util/temp_umask.h
new file mode 100644
index 000000000..e7de84a54
--- /dev/null
+++ b/test/util/temp_umask.h
@@ -0,0 +1,39 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_UTIL_TEMP_UMASK_H_
+#define GVISOR_TEST_UTIL_TEMP_UMASK_H_
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+namespace gvisor {
+namespace testing {
+
+class TempUmask {
+ public:
+  // Sets the process umask to `mask`.
+  explicit TempUmask(mode_t mask) : old_mask_(umask(mask)) {}
+
+  // Sets the process umask to its previous value.
+  ~TempUmask() { umask(old_mask_); }
+
+ private:
+  mode_t old_mask_;
+};
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_UTIL_TEMP_UMASK_H_
-- 
cgit v1.2.3


From c15b8515eb4a07699e5f2401f0332286f0a51043 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 3 Mar 2020 13:40:59 -0800
Subject: Fix datarace on TransportEndpointInfo.ID and clean up semantics.

Ensures that all access to TransportEndpointInfo.ID is either:
* In a function ending in a Locked suffix.
* While holding the appropriate mutex.

This primary affects the checkV4Mapped method on affected endpoints, which has
been renamed to checkV4MappedLocked. Also document the method and change its
argument to be a value instead of a pointer which had caused some awkwardness.

This race was possible in the udp and icmp endpoints between Connect and uses
of TransportEndpointInfo.ID including in both itself and Bind.

The tcp endpoint did not suffer from this bug, but benefited from better
documentation.

Updates #357

PiperOrigin-RevId: 298682913
---
 pkg/tcpip/stack/stack.go                  | 12 +++++++-----
 pkg/tcpip/transport/icmp/endpoint.go      | 23 +++++++++++------------
 pkg/tcpip/transport/tcp/endpoint.go       | 15 ++++++++-------
 pkg/tcpip/transport/udp/endpoint.go       | 30 ++++++++++++++++--------------
 pkg/tcpip/transport/udp/endpoint_state.go |  3 +++
 5 files changed, 45 insertions(+), 38 deletions(-)

diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index ebb6c5e3b..13354d884 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -551,11 +551,13 @@ type TransportEndpointInfo struct {
 	RegisterNICID tcpip.NICID
 }
 
-// AddrNetProto unwraps the specified address if it is a V4-mapped V6 address
-// and returns the network protocol number to be used to communicate with the
-// specified address. It returns an error if the passed address is incompatible
-// with the receiver.
-func (e *TransportEndpointInfo) AddrNetProto(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+// AddrNetProtoLocked unwraps the specified address if it is a V4-mapped V6
+// address and returns the network protocol number to be used to communicate
+// with the specified address. It returns an error if the passed address is
+// incompatible with the receiver.
+//
+// Preconditon: the parent endpoint mu must be held while calling this method.
+func (e *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
 	netProto := e.NetProto
 	switch len(addr.Addr) {
 	case header.IPv4AddressSize:
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 426da1ee6..2a396e9bc 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -291,15 +291,13 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 			nicID = e.BindNICID
 		}
 
-		toCopy := *to
-		to = &toCopy
-		netProto, err := e.checkV4Mapped(to)
+		dst, netProto, err := e.checkV4MappedLocked(*to)
 		if err != nil {
 			return 0, nil, err
 		}
 
-		// Find the enpoint.
-		r, err := e.stack.FindRoute(nicID, e.BindAddr, to.Addr, netProto, false /* multicastLoop */)
+		// Find the endpoint.
+		r, err := e.stack.FindRoute(nicID, e.BindAddr, dst.Addr, netProto, false /* multicastLoop */)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -480,13 +478,14 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	})
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, false /* v6only */)
+// checkV4MappedLocked determines the effective network protocol and converts
+// addr to its canonical form.
+func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, false /* v6only */)
 	if err != nil {
-		return 0, err
+		return tcpip.FullAddress{}, 0, err
 	}
-	*addr = unwrapped
-	return netProto, nil
+	return unwrapped, netProto, nil
 }
 
 // Disconnect implements tcpip.Endpoint.Disconnect.
@@ -517,7 +516,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
@@ -630,7 +629,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 8b9154e69..40cc664c0 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1874,13 +1874,14 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	}
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+// checkV4MappedLocked determines the effective network protocol and converts
+// addr to its canonical form.
+func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only)
 	if err != nil {
-		return 0, err
+		return tcpip.FullAddress{}, 0, err
 	}
-	*addr = unwrapped
-	return netProto, nil
+	return unwrapped, netProto, nil
 }
 
 // Disconnect implements tcpip.Endpoint.Disconnect.
@@ -1910,7 +1911,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 
 	connectingAddr := addr.Addr
 
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
@@ -2276,7 +2277,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 	}
 
 	e.BindAddr = addr.Addr
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 1c6a600b8..0af4514e1 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -443,19 +443,19 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 			return 0, nil, tcpip.ErrBroadcastDisabled
 		}
 
-		netProto, err := e.checkV4Mapped(to)
+		dst, netProto, err := e.checkV4MappedLocked(*to)
 		if err != nil {
 			return 0, nil, err
 		}
 
-		r, _, err := e.connectRoute(nicID, *to, netProto)
+		r, _, err := e.connectRoute(nicID, dst, netProto)
 		if err != nil {
 			return 0, nil, err
 		}
 		defer r.Release()
 
 		route = &r
-		dstPort = to.Port
+		dstPort = dst.Port
 	}
 
 	if route.IsResolutionRequired() {
@@ -566,7 +566,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		defer e.mu.Unlock()
 
 		fa := tcpip.FullAddress{Addr: v.InterfaceAddr}
-		netProto, err := e.checkV4Mapped(&fa)
+		fa, netProto, err := e.checkV4MappedLocked(fa)
 		if err != nil {
 			return err
 		}
@@ -927,13 +927,14 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 	return nil
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+// checkV4MappedLocked determines the effective network protocol and converts
+// addr to its canonical form.
+func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only)
 	if err != nil {
-		return 0, err
+		return tcpip.FullAddress{}, 0, err
 	}
-	*addr = unwrapped
-	return netProto, nil
+	return unwrapped, netProto, nil
 }
 
 // Disconnect implements tcpip.Endpoint.Disconnect.
@@ -981,10 +982,6 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 
 // Connect connects the endpoint to its peer. Specifying a NIC is optional.
 func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
-	netProto, err := e.checkV4Mapped(&addr)
-	if err != nil {
-		return err
-	}
 	if addr.Port == 0 {
 		// We don't support connecting to port zero.
 		return tcpip.ErrInvalidEndpointState
@@ -1012,6 +1009,11 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
+	addr, netProto, err := e.checkV4MappedLocked(addr)
+	if err != nil {
+		return err
+	}
+
 	r, nicID, err := e.connectRoute(nicID, addr, netProto)
 	if err != nil {
 		return err
@@ -1139,7 +1141,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go
index 43fb047ed..466bd9381 100644
--- a/pkg/tcpip/transport/udp/endpoint_state.go
+++ b/pkg/tcpip/transport/udp/endpoint_state.go
@@ -69,6 +69,9 @@ func (e *endpoint) afterLoad() {
 
 // Resume implements tcpip.ResumableEndpoint.Resume.
 func (e *endpoint) Resume(s *stack.Stack) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
 	e.stack = s
 
 	for _, m := range e.multicastMemberships {
-- 
cgit v1.2.3


From 844e4d284cddf9795a0db7c38f926fe7b49bb873 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Tue, 3 Mar 2020 14:09:45 -0800
Subject: Extract local variables for readability

PiperOrigin-RevId: 298690552
---
 pkg/ilist/list.go | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go
index 019caadca..f3a609b57 100644
--- a/pkg/ilist/list.go
+++ b/pkg/ilist/list.go
@@ -88,8 +88,9 @@ func (l *List) Back() Element {
 
 // PushFront inserts the element e at the front of list l.
 func (l *List) PushFront(e Element) {
-	ElementMapper{}.linkerFor(e).SetNext(l.head)
-	ElementMapper{}.linkerFor(e).SetPrev(nil)
+	linker := ElementMapper{}.linkerFor(e)
+	linker.SetNext(l.head)
+	linker.SetPrev(nil)
 
 	if l.head != nil {
 		ElementMapper{}.linkerFor(l.head).SetPrev(e)
@@ -102,8 +103,9 @@ func (l *List) PushFront(e Element) {
 
 // PushBack inserts the element e at the back of list l.
 func (l *List) PushBack(e Element) {
-	ElementMapper{}.linkerFor(e).SetNext(nil)
-	ElementMapper{}.linkerFor(e).SetPrev(l.tail)
+	linker := ElementMapper{}.linkerFor(e)
+	linker.SetNext(nil)
+	linker.SetPrev(l.tail)
 
 	if l.tail != nil {
 		ElementMapper{}.linkerFor(l.tail).SetNext(e)
@@ -132,10 +134,14 @@ func (l *List) PushBackList(m *List) {
 
 // InsertAfter inserts e after b.
 func (l *List) InsertAfter(b, e Element) {
-	a := ElementMapper{}.linkerFor(b).Next()
-	ElementMapper{}.linkerFor(e).SetNext(a)
-	ElementMapper{}.linkerFor(e).SetPrev(b)
-	ElementMapper{}.linkerFor(b).SetNext(e)
+	bLinker := ElementMapper{}.linkerFor(b)
+	eLinker := ElementMapper{}.linkerFor(e)
+
+	a := bLinker.Next()
+
+	eLinker.SetNext(a)
+	eLinker.SetPrev(b)
+	bLinker.SetNext(e)
 
 	if a != nil {
 		ElementMapper{}.linkerFor(a).SetPrev(e)
@@ -146,10 +152,13 @@ func (l *List) InsertAfter(b, e Element) {
 
 // InsertBefore inserts e before a.
 func (l *List) InsertBefore(a, e Element) {
-	b := ElementMapper{}.linkerFor(a).Prev()
-	ElementMapper{}.linkerFor(e).SetNext(a)
-	ElementMapper{}.linkerFor(e).SetPrev(b)
-	ElementMapper{}.linkerFor(a).SetPrev(e)
+	aLinker := ElementMapper{}.linkerFor(a)
+	eLinker := ElementMapper{}.linkerFor(e)
+
+	b := aLinker.Prev()
+	eLinker.SetNext(a)
+	eLinker.SetPrev(b)
+	aLinker.SetPrev(e)
 
 	if b != nil {
 		ElementMapper{}.linkerFor(b).SetNext(e)
-- 
cgit v1.2.3


From 277a0d5a1fbe8272d4729c01ee4c6e374d047ebc Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 3 Mar 2020 14:34:36 -0800
Subject: platform/ptrace: don't call probeSeccomp on arm64

The support of PTRACE_SYSEMU on arm64 was added in the 5.3 kernel,
so we can be sure that the current version is higher that 5.3.

And this change moves vsyscall seccomp rules to the arch specific file,
because vsyscall isn't supported on arm64.

PiperOrigin-RevId: 298696493
---
 pkg/sentry/platform/ptrace/subprocess_amd64.go | 80 +++++++++++++++++++++++---
 pkg/sentry/platform/ptrace/subprocess_arm64.go | 11 +++-
 pkg/sentry/platform/ptrace/subprocess_linux.go | 65 +--------------------
 3 files changed, 84 insertions(+), 72 deletions(-)

diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index e99798c56..cd74945e7 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -21,6 +21,7 @@ import (
 	"strings"
 	"syscall"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/seccomp"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -183,13 +184,76 @@ func enableCpuidFault() {
 
 // appendArchSeccompRules append architecture specific seccomp rules when creating BPF program.
 // Ref attachedThread() for more detail.
-func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
-	return append(rules, seccomp.RuleSet{
-		Rules: seccomp.SyscallRules{
-			syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
-				{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet {
+	rules = append(rules,
+		// Rules for trapping vsyscall access.
+		seccomp.RuleSet{
+			Rules: seccomp.SyscallRules{
+				syscall.SYS_GETTIMEOFDAY: {},
+				syscall.SYS_TIME:         {},
+				unix.SYS_GETCPU:          {}, // SYS_GETCPU was not defined in package syscall on amd64.
 			},
-		},
-		Action: linux.SECCOMP_RET_ALLOW,
-	})
+			Action:   linux.SECCOMP_RET_TRAP,
+			Vsyscall: true,
+		})
+	if defaultAction != linux.SECCOMP_RET_ALLOW {
+		rules = append(rules,
+			seccomp.RuleSet{
+				Rules: seccomp.SyscallRules{
+					syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+						{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+					},
+				},
+				Action: linux.SECCOMP_RET_ALLOW,
+			})
+	}
+	return rules
+}
+
+// probeSeccomp returns true iff seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8. This check is dynamic
+// because kernels have be backported behavior.
+//
+// See createStub for more information.
+//
+// Precondition: the runtime OS thread must be locked.
+func probeSeccomp() bool {
+	// Create a completely new, destroyable process.
+	t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO)
+	if err != nil {
+		panic(fmt.Sprintf("seccomp probe failed: %v", err))
+	}
+	defer t.destroy()
+
+	// Set registers to the yield system call. This call is not allowed
+	// by the filters specified in the attachThread function.
+	regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
+	if err := t.setRegs(&regs); err != nil {
+		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
+	}
+
+	for {
+		// Attempt an emulation.
+		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
+			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+		}
+
+		sig := t.wait(stopped)
+		if sig == (syscallEvent | syscall.SIGTRAP) {
+			// Did the seccomp errno hook already run? This would
+			// indicate that seccomp is first in line and we're
+			// less than 4.8.
+			if err := t.getRegs(&regs); err != nil {
+				panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
+			}
+			if _, err := syscallReturnValue(&regs); err == nil {
+				// The seccomp errno mode ran first, and reset
+				// the error in the registers.
+				return false
+			}
+			// The seccomp hook did not run yet, and therefore it
+			// is safe to use RET_KILL mode for dispatched calls.
+			return true
+		}
+	}
 }
diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go
index 7b975137f..7f5c393f0 100644
--- a/pkg/sentry/platform/ptrace/subprocess_arm64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go
@@ -160,6 +160,15 @@ func enableCpuidFault() {
 
 // appendArchSeccompRules append architecture specific seccomp rules when creating BPF program.
 // Ref attachedThread() for more detail.
-func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
+func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet {
 	return rules
 }
+
+// probeSeccomp returns true if seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8.
+//
+// On arm64, the support of PTRACE_SYSEMU was added in the 5.3 kernel, so
+// probeSeccomp can always return true.
+func probeSeccomp() bool {
+	return true
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 74968dfdf..2ce528601 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -20,7 +20,6 @@ import (
 	"fmt"
 	"syscall"
 
-	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/procid"
@@ -30,54 +29,6 @@ import (
 
 const syscallEvent syscall.Signal = 0x80
 
-// probeSeccomp returns true iff seccomp is run after ptrace notifications,
-// which is generally the case for kernel version >= 4.8. This check is dynamic
-// because kernels have be backported behavior.
-//
-// See createStub for more information.
-//
-// Precondition: the runtime OS thread must be locked.
-func probeSeccomp() bool {
-	// Create a completely new, destroyable process.
-	t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO)
-	if err != nil {
-		panic(fmt.Sprintf("seccomp probe failed: %v", err))
-	}
-	defer t.destroy()
-
-	// Set registers to the yield system call. This call is not allowed
-	// by the filters specified in the attachThread function.
-	regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
-	if err := t.setRegs(&regs); err != nil {
-		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
-	}
-
-	for {
-		// Attempt an emulation.
-		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
-			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
-		}
-
-		sig := t.wait(stopped)
-		if sig == (syscallEvent | syscall.SIGTRAP) {
-			// Did the seccomp errno hook already run? This would
-			// indicate that seccomp is first in line and we're
-			// less than 4.8.
-			if err := t.getRegs(&regs); err != nil {
-				panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
-			}
-			if _, err := syscallReturnValue(&regs); err == nil {
-				// The seccomp errno mode ran first, and reset
-				// the error in the registers.
-				return false
-			}
-			// The seccomp hook did not run yet, and therefore it
-			// is safe to use RET_KILL mode for dispatched calls.
-			return true
-		}
-	}
-}
-
 // createStub creates a fresh stub processes.
 //
 // Precondition: the runtime OS thread must be locked.
@@ -123,18 +74,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 	// stub and all its children. This is used to create child stubs
 	// (below), so we must include the ability to fork, but otherwise lock
 	// down available calls only to what is needed.
-	rules := []seccomp.RuleSet{
-		// Rules for trapping vsyscall access.
-		{
-			Rules: seccomp.SyscallRules{
-				syscall.SYS_GETTIMEOFDAY: {},
-				syscall.SYS_TIME:         {},
-				unix.SYS_GETCPU:          {}, // SYS_GETCPU was not defined in package syscall on amd64.
-			},
-			Action:   linux.SECCOMP_RET_TRAP,
-			Vsyscall: true,
-		},
-	}
+	rules := []seccomp.RuleSet{}
 	if defaultAction != linux.SECCOMP_RET_ALLOW {
 		rules = append(rules, seccomp.RuleSet{
 			Rules: seccomp.SyscallRules{
@@ -173,9 +113,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 			},
 			Action: linux.SECCOMP_RET_ALLOW,
 		})
-
-		rules = appendArchSeccompRules(rules)
 	}
+	rules = appendArchSeccompRules(rules, defaultAction)
 	instrs, err := seccomp.BuildProgram(rules, defaultAction)
 	if err != nil {
 		return nil, err
-- 
cgit v1.2.3


From 371abe00f052dec37106f2dc22921bc84fb94818 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Tue, 3 Mar 2020 15:06:07 -0800
Subject: Avoid memory leaks

Properly discard segments from the segment heap.

PiperOrigin-RevId: 298704074
---
 pkg/tcpip/transport/tcp/rcv.go          | 4 ++++
 pkg/tcpip/transport/tcp/segment_heap.go | 1 +
 2 files changed, 5 insertions(+)

diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 958f03ac1..d80aff1b6 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -195,6 +195,10 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 
 		for i := first; i < len(r.pendingRcvdSegments); i++ {
 			r.pendingRcvdSegments[i].decRef()
+			// Note that slice truncation does not allow garbage collection of
+			// truncated items, thus truncated items must be set to nil to avoid
+			// memory leaks.
+			r.pendingRcvdSegments[i] = nil
 		}
 		r.pendingRcvdSegments = r.pendingRcvdSegments[:first]
 
diff --git a/pkg/tcpip/transport/tcp/segment_heap.go b/pkg/tcpip/transport/tcp/segment_heap.go
index 9fd061d7d..e28f213ba 100644
--- a/pkg/tcpip/transport/tcp/segment_heap.go
+++ b/pkg/tcpip/transport/tcp/segment_heap.go
@@ -41,6 +41,7 @@ func (h *segmentHeap) Pop() interface{} {
 	old := *h
 	n := len(old)
 	x := old[n-1]
+	old[n-1] = nil
 	*h = old[:n-1]
 	return x
 }
-- 
cgit v1.2.3


From 122d47aed17abf4301596e19fc8ac9cdad8118d9 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 3 Mar 2020 15:27:23 -0800
Subject: Update cached file size when cache is skipped

gofer.dentryReadWriter.WriteFromBlocks was not updating
gofer.dentry.size after a write operation that skips the
cache.

Updates #1198

PiperOrigin-RevId: 298708646
---
 pkg/sentry/fsimpl/gofer/regular_file.go | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 54c1031a7..e95209661 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -361,8 +361,15 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro
 	rw.d.handleMu.RLock()
 	if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
 		n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off)
-		rw.d.handleMu.RUnlock()
 		rw.off += n
+		rw.d.dataMu.Lock()
+		if rw.off > rw.d.size {
+			atomic.StoreUint64(&rw.d.size, rw.off)
+			// The remote file's size will implicitly be extended to the correct
+			// value when we write back to it.
+		}
+		rw.d.dataMu.Unlock()
+		rw.d.handleMu.RUnlock()
 		return n, err
 	}
 
-- 
cgit v1.2.3


From 9a4495fb87fda13be41207e0e3f95bb009debb65 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 4 Mar 2020 01:22:58 +0000
Subject: Bump rake from 12.3.2 to 12.3.3 in /benchmarks/workloads/ruby

Bumps [rake](https://github.com/ruby/rake) from 12.3.2 to 12.3.3.
- [Release notes](https://github.com/ruby/rake/releases)
- [Changelog](https://github.com/ruby/rake/blob/master/History.rdoc)
- [Commits](https://github.com/ruby/rake/compare/v12.3.2...v12.3.3)

Signed-off-by: dependabot[bot] <support@github.com>
---
 benchmarks/workloads/ruby/Gemfile.lock | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/workloads/ruby/Gemfile.lock b/benchmarks/workloads/ruby/Gemfile.lock
index 855edf17f..17ebcbec3 100644
--- a/benchmarks/workloads/ruby/Gemfile.lock
+++ b/benchmarks/workloads/ruby/Gemfile.lock
@@ -33,7 +33,7 @@ GEM
     rack (2.2.2)
     rack-protection (2.0.5)
       rack
-    rake (12.3.2)
+    rake (12.3.3)
     rbnacl (7.1.1)
       ffi
     redis (4.1.1)
-- 
cgit v1.2.3


From 9b26d2fa0d652cb08e13a1262faedc01be4805de Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 4 Mar 2020 01:22:58 +0000
Subject: Bump puma from 3.12.2 to 3.12.4 in
 /benchmarks/workloads/ruby_template

Bumps [puma](https://github.com/puma/puma) from 3.12.2 to 3.12.4.
- [Release notes](https://github.com/puma/puma/releases)
- [Changelog](https://github.com/puma/puma/blob/master/History.md)
- [Commits](https://github.com/puma/puma/compare/v3.12.2...v3.12.4)

Signed-off-by: dependabot[bot] <support@github.com>
---
 benchmarks/workloads/ruby_template/Gemfile.lock | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/workloads/ruby_template/Gemfile.lock b/benchmarks/workloads/ruby_template/Gemfile.lock
index af03ed7fd..f637b6081 100644
--- a/benchmarks/workloads/ruby_template/Gemfile.lock
+++ b/benchmarks/workloads/ruby_template/Gemfile.lock
@@ -2,7 +2,7 @@ GEM
   remote: https://rubygems.org/
   specs:
     mustermann (1.0.3)
-    puma (3.12.2)
+    puma (3.12.4)
     rack (2.0.6)
     rack-protection (2.0.5)
       rack
-- 
cgit v1.2.3


From ef1219c1451a75916693a54ddad39d04cf763d90 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 4 Mar 2020 00:14:41 -0800
Subject: Use shuf instead of $RANDOM everywhere.

$RANDOM can cause collisions but shuf uses /dev/urandom so it ought to cause
fewer.

PiperOrigin-RevId: 298786344
---
 test/packetdrill/packetdrill_test.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/packetdrill/packetdrill_test.sh b/test/packetdrill/packetdrill_test.sh
index 0b22dfd5c..c8268170f 100755
--- a/test/packetdrill/packetdrill_test.sh
+++ b/test/packetdrill/packetdrill_test.sh
@@ -91,8 +91,8 @@ fi
 # Variables specific to the test runner start with TEST_RUNNER_.
 declare -r PACKETDRILL="/packetdrill/gtests/net/packetdrill/packetdrill"
 # Use random numbers so that test networks don't collide.
-declare -r CTRL_NET="ctrl_net-${RANDOM}${RANDOM}"
-declare -r TEST_NET="test_net-${RANDOM}${RANDOM}"
+declare -r CTRL_NET="ctrl_net-$(shuf -i 0-99999999 -n 1)"
+declare -r TEST_NET="test_net-$(shuf -i 0-99999999 -n 1)"
 declare -r tolerance_usecs=100000
 # On both DUT and test runner, testing packets are on the eth2 interface.
 declare -r TEST_DEVICE="eth2"
-- 
cgit v1.2.3


From 504c9e14d61a9ca9fa3615290a05471684019ecc Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Tue, 3 Mar 2020 15:53:48 -0800
Subject: test/runner: use proper filters for test cases

The benchmark_filter options accepts regex-s, but
the gtest-filter option accepts shell-like wildcards.

Fixes #2034

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 test/runner/gtest/gtest.go             | 7 ++++---
 test/syscalls/linux/tuntap_hostinet.cc | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/runner/gtest/gtest.go b/test/runner/gtest/gtest.go
index f96e2415e..869169ad5 100644
--- a/test/runner/gtest/gtest.go
+++ b/test/runner/gtest/gtest.go
@@ -66,13 +66,12 @@ func (tc TestCase) Args() []string {
 	}
 	if tc.benchmark {
 		return []string{
-			fmt.Sprintf("%s=^$", filterTestFlag),
 			fmt.Sprintf("%s=^%s$", filterBenchmarkFlag, tc.Name),
+			fmt.Sprintf("%s=", filterTestFlag),
 		}
 	}
 	return []string{
-		fmt.Sprintf("%s=^%s$", filterTestFlag, tc.FullName()),
-		fmt.Sprintf("%s=^$", filterBenchmarkFlag),
+		fmt.Sprintf("%s=%s", filterTestFlag, tc.FullName()),
 	}
 }
 
@@ -147,6 +146,8 @@ func ParseTestCases(testBin string, benchmarks bool, extraArgs ...string) ([]Tes
 		return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v\nstderr\n%s", err, exitErr.Stderr)
 	}
 
+	out = []byte(strings.Trim(string(out), "\n"))
+
 	// Parse benchmark output.
 	for _, line := range strings.Split(string(out), "\n") {
 		// Strip comments.
diff --git a/test/syscalls/linux/tuntap_hostinet.cc b/test/syscalls/linux/tuntap_hostinet.cc
index 0c527419e..1513fb9d5 100644
--- a/test/syscalls/linux/tuntap_hostinet.cc
+++ b/test/syscalls/linux/tuntap_hostinet.cc
@@ -26,6 +26,7 @@ namespace {
 
 TEST(TuntapHostInetTest, NoNetTun) {
   SKIP_IF(!IsRunningOnGvisor());
+  SKIP_IF(!IsRunningWithHostinet());
 
   struct stat statbuf;
   ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallFailsWithErrno(ENOENT));
-- 
cgit v1.2.3


From a690b5762480ae80ef4264402fcfa8d84b57339a Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 4 Mar 2020 14:29:11 -0800
Subject: Ensure that safemem.BlockSeqOf(safemem.Block{}) produces an empty
 BlockSeq.

PiperOrigin-RevId: 298941855
---
 pkg/safemem/seq_test.go   | 21 +++++++++++++++++++++
 pkg/safemem/seq_unsafe.go |  3 +++
 2 files changed, 24 insertions(+)

diff --git a/pkg/safemem/seq_test.go b/pkg/safemem/seq_test.go
index eba4bb535..de34005e9 100644
--- a/pkg/safemem/seq_test.go
+++ b/pkg/safemem/seq_test.go
@@ -20,6 +20,27 @@ import (
 	"testing"
 )
 
+func TestBlockSeqOfEmptyBlock(t *testing.T) {
+	bs := BlockSeqOf(Block{})
+	if !bs.IsEmpty() {
+		t.Errorf("BlockSeqOf(Block{}).IsEmpty(): got false, wanted true; BlockSeq is %v", bs)
+	}
+}
+
+func TestBlockSeqOfNonemptyBlock(t *testing.T) {
+	b := BlockFromSafeSlice(make([]byte, 1))
+	bs := BlockSeqOf(b)
+	if bs.IsEmpty() {
+		t.Fatalf("BlockSeqOf(non-empty Block).IsEmpty(): got true, wanted false; BlockSeq is %v", bs)
+	}
+	if head := bs.Head(); head != b {
+		t.Fatalf("BlockSeqOf(non-empty Block).Head(): got %v, wanted %v", head, b)
+	}
+	if tail := bs.Tail(); !tail.IsEmpty() {
+		t.Fatalf("BlockSeqOf(non-empty Block).Tail().IsEmpty(): got false, wanted true: tail is %v", tail)
+	}
+}
+
 type blockSeqTest struct {
 	desc string
 
diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go
index dcdfc9600..f5f0574f8 100644
--- a/pkg/safemem/seq_unsafe.go
+++ b/pkg/safemem/seq_unsafe.go
@@ -56,6 +56,9 @@ type BlockSeq struct {
 
 // BlockSeqOf returns a BlockSeq representing the single Block b.
 func BlockSeqOf(b Block) BlockSeq {
+	if b.length == 0 {
+		return BlockSeq{}
+	}
 	bs := BlockSeq{
 		data:   b.start,
 		length: -1,
-- 
cgit v1.2.3


From 80b40bbb06f3c9ca23e3ad152b481ab222ec0e47 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 4 Mar 2020 16:15:22 -0800
Subject: tests: Don't print log messages on stdout

A parser of test results doesn't expect to see any extra messages.

PiperOrigin-RevId: 298966577
---
 pkg/seccomp/seccomp_test.go     | 2 +-
 runsc/container/console_test.go | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go
index da5a5e4b2..88766f33b 100644
--- a/pkg/seccomp/seccomp_test.go
+++ b/pkg/seccomp/seccomp_test.go
@@ -451,7 +451,7 @@ func TestRandom(t *testing.T) {
 		}
 	}
 
-	fmt.Printf("Testing filters: %v", syscallRules)
+	t.Logf("Testing filters: %v", syscallRules)
 	instrs, err := BuildProgram([]RuleSet{
 		RuleSet{
 			Rules:  syscallRules,
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index c2518d52b..651615d4c 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -333,13 +333,13 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	// file. Writes after a certain point will block unless we drain the
 	// PTY, so we must continually copy from it.
 	//
-	// We log the output to stdout for debugabilitly, and also to a buffer,
+	// We log the output to stderr for debugabilitly, and also to a buffer,
 	// since we wait on particular output from bash below. We use a custom
 	// blockingBuffer which is thread-safe and also blocks on Read calls,
 	// which makes this a suitable Reader for WaitUntilRead.
 	ptyBuf := newBlockingBuffer()
 	tee := io.TeeReader(ptyMaster, ptyBuf)
-	go io.Copy(os.Stdout, tee)
+	go io.Copy(os.Stderr, tee)
 
 	// Start the container.
 	if err := c.Start(conf); err != nil {
-- 
cgit v1.2.3


From 2cf974117d16f8750c40f6f37d555a91f9ecabc7 Mon Sep 17 00:00:00 2001
From: Andrew Dunham <andrew@du.nham.ca>
Date: Wed, 4 Mar 2020 22:53:47 -0800
Subject: WORKSPACE: bump rules_python to include reproducibility fix

---
 WORKSPACE | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index d1c2d8a24..d2bbadc63 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -78,9 +78,9 @@ rules_proto_toolchains()
 # Load python dependencies.
 git_repository(
     name = "rules_python",
-    commit = "94677401bc56ed5d756f50b441a6a5c7f735a6d4",
+    commit = "abc4869e02fe9b3866942e89f07b7341f830e805",
     remote = "https://github.com/bazelbuild/rules_python.git",
-    shallow_since = "1573842889 -0500",
+    shallow_since = "1583341286 -0500",
 )
 
 load("@rules_python//python:pip.bzl", "pip_import")
-- 
cgit v1.2.3


From 6ec669631fe41ad739e46fed4dfca68d53001f83 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Thu, 5 Mar 2020 13:07:04 -0800
Subject: tests: Don't print log messages on stdout

A parser of test results doesn't expect to see any extra messages.

PiperOrigin-RevId: 299174138
---
 runsc/container/container_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index bdd65b498..c7eea85b3 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -2092,7 +2092,7 @@ func TestOverlayfsStaleRead(t *testing.T) {
 	defer out.Close()
 
 	const want = "foobar"
-	cmd := fmt.Sprintf("cat %q && echo %q> %q && cp %q %q", in.Name(), want, in.Name(), in.Name(), out.Name())
+	cmd := fmt.Sprintf("cat %q >&2 && echo %q> %q && cp %q %q", in.Name(), want, in.Name(), in.Name(), out.Name())
 	spec := testutil.NewSpecWithArgs("/bin/bash", "-c", cmd)
 	if err := run(spec, conf); err != nil {
 		t.Fatalf("Error running container: %v", err)
-- 
cgit v1.2.3


From 9b3aad33c4470908953b7b548b12cba77799f342 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Thu, 5 Mar 2020 15:55:40 -0800
Subject: Use a pool of arrays to avoid slice headers from escaping in TCP
 options pool.

By putting slices into the pool, the slice header escapes. This can be avoided
by not putting the slice header into the pool.

This removes an allocation from the TCP segment send path.

PiperOrigin-RevId: 299215480
---
 pkg/tcpip/transport/tcp/BUILD             |  1 +
 pkg/tcpip/transport/tcp/connect.go        |  6 +++---
 pkg/tcpip/transport/tcp/connect_unsafe.go | 30 ++++++++++++++++++++++++++++++
 3 files changed, 34 insertions(+), 3 deletions(-)
 create mode 100644 pkg/tcpip/transport/tcp/connect_unsafe.go

diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 272e8f570..a32f9eacf 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -32,6 +32,7 @@ go_library(
     srcs = [
         "accept.go",
         "connect.go",
+        "connect_unsafe.go",
         "cubic.go",
         "cubic_state.go",
         "dispatcher.go",
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index ae4f3f3a9..c0f73ef16 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -624,17 +624,17 @@ func parseSynSegmentOptions(s *segment) header.TCPSynOptions {
 
 var optionPool = sync.Pool{
 	New: func() interface{} {
-		return make([]byte, maxOptionSize)
+		return &[maxOptionSize]byte{}
 	},
 }
 
 func getOptions() []byte {
-	return optionPool.Get().([]byte)
+	return (*optionPool.Get().(*[maxOptionSize]byte))[:]
 }
 
 func putOptions(options []byte) {
 	// Reslice to full capacity.
-	optionPool.Put(options[0:cap(options)])
+	optionPool.Put(optionsToArray(options))
 }
 
 func makeSynOptions(opts header.TCPSynOptions) []byte {
diff --git a/pkg/tcpip/transport/tcp/connect_unsafe.go b/pkg/tcpip/transport/tcp/connect_unsafe.go
new file mode 100644
index 000000000..cfc304616
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/connect_unsafe.go
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"reflect"
+	"unsafe"
+)
+
+// optionsToArray converts a slice of capacity >-= maxOptionSize to an array.
+//
+// optionsToArray panics if the capacity of options is smaller than
+// maxOptionSize.
+func optionsToArray(options []byte) *[maxOptionSize]byte {
+	// Reslice to full capacity.
+	options = options[0:maxOptionSize]
+	return (*[maxOptionSize]byte)(unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&options)).Data))
+}
-- 
cgit v1.2.3


From 9b64b658c1b9a4986bc5a4ebd9e5ddeb9f52dfa3 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Thu, 5 Mar 2020 17:39:11 -0800
Subject: Fix S/R on inet.Namespace.

PiperOrigin-RevId: 299238067
---
 pkg/sentry/inet/namespace.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/inet/namespace.go b/pkg/sentry/inet/namespace.go
index c16667e7f..029af3025 100644
--- a/pkg/sentry/inet/namespace.go
+++ b/pkg/sentry/inet/namespace.go
@@ -23,7 +23,10 @@ type Namespace struct {
 
 	// creator allows kernel to create new network stack for network namespaces.
 	// If nil, no networking will function if network is namespaced.
-	creator NetworkStackCreator
+	//
+	// At afterLoad(), creator will be used to create network stack. Stateify
+	// needs to wait for this field to be loaded before calling afterLoad().
+	creator NetworkStackCreator `state:"wait"`
 
 	// isRoot indicates whether this is the root network namespace.
 	isRoot bool
-- 
cgit v1.2.3


From da48fc6cca23a38faef51c5b5f8ae609940773a0 Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Thu, 5 Mar 2020 18:21:39 -0800
Subject: Stub oom_score_adj and oom_score.

Adds an oom_score_adj and oom_score proc file stub. oom_score_adj accepts
writes of values -1000 to 1000 and persists the value with the task. New tasks
inherit the parent's oom_score_adj.

oom_score is a read-only stub that always returns the value '0'.

Issue #202

PiperOrigin-RevId: 299245355
---
 pkg/sentry/fs/proc/task.go               | 126 ++++++++++++++++++++++++++-----
 pkg/sentry/fsimpl/proc/task.go           |  12 +--
 pkg/sentry/fsimpl/proc/task_files.go     |  43 +++++++++++
 pkg/sentry/fsimpl/proc/tasks_test.go     |  32 ++++----
 pkg/sentry/kernel/task.go                |  33 ++++++++
 pkg/sentry/kernel/task_clone.go          |   6 ++
 pkg/sentry/kernel/task_start.go          |   4 +
 test/syscalls/BUILD                      |   8 +-
 test/syscalls/linux/BUILD                |  13 ++++
 test/syscalls/linux/proc.cc              |  21 ++++++
 test/syscalls/linux/proc_pid_oomscore.cc |  72 ++++++++++++++++++
 11 files changed, 330 insertions(+), 40 deletions(-)
 create mode 100644 test/syscalls/linux/proc_pid_oomscore.cc

diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 8ab8d8a02..4e9b0fc00 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -72,24 +72,26 @@ var _ fs.InodeOperations = (*taskDir)(nil)
 // newTaskDir creates a new proc task entry.
 func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
 	contents := map[string]*fs.Inode{
-		"auxv":      newAuxvec(t, msrc),
-		"cmdline":   newExecArgInode(t, msrc, cmdlineExecArg),
-		"comm":      newComm(t, msrc),
-		"environ":   newExecArgInode(t, msrc, environExecArg),
-		"exe":       newExe(t, msrc),
-		"fd":        newFdDir(t, msrc),
-		"fdinfo":    newFdInfoDir(t, msrc),
-		"gid_map":   newGIDMap(t, msrc),
-		"io":        newIO(t, msrc, isThreadGroup),
-		"maps":      newMaps(t, msrc),
-		"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
-		"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
-		"ns":        newNamespaceDir(t, msrc),
-		"smaps":     newSmaps(t, msrc),
-		"stat":      newTaskStat(t, msrc, isThreadGroup, p.pidns),
-		"statm":     newStatm(t, msrc),
-		"status":    newStatus(t, msrc, p.pidns),
-		"uid_map":   newUIDMap(t, msrc),
+		"auxv":          newAuxvec(t, msrc),
+		"cmdline":       newExecArgInode(t, msrc, cmdlineExecArg),
+		"comm":          newComm(t, msrc),
+		"environ":       newExecArgInode(t, msrc, environExecArg),
+		"exe":           newExe(t, msrc),
+		"fd":            newFdDir(t, msrc),
+		"fdinfo":        newFdInfoDir(t, msrc),
+		"gid_map":       newGIDMap(t, msrc),
+		"io":            newIO(t, msrc, isThreadGroup),
+		"maps":          newMaps(t, msrc),
+		"mountinfo":     seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
+		"mounts":        seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
+		"ns":            newNamespaceDir(t, msrc),
+		"oom_score":     newOOMScore(t, msrc),
+		"oom_score_adj": newOOMScoreAdj(t, msrc),
+		"smaps":         newSmaps(t, msrc),
+		"stat":          newTaskStat(t, msrc, isThreadGroup, p.pidns),
+		"statm":         newStatm(t, msrc),
+		"status":        newStatus(t, msrc, p.pidns),
+		"uid_map":       newUIDMap(t, msrc),
 	}
 	if isThreadGroup {
 		contents["task"] = p.newSubtasks(t, msrc)
@@ -796,4 +798,92 @@ func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc
 	return int64(n), err
 }
 
+// newOOMScore returns a oom_score file. It is a stub that always returns 0.
+// TODO(gvisor.dev/issue/1967)
+func newOOMScore(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	return newStaticProcInode(t, msrc, []byte("0\n"))
+}
+
+// oomScoreAdj is a file containing the oom_score adjustment for a task.
+//
+// +stateify savable
+type oomScoreAdj struct {
+	fsutil.SimpleFileInode
+
+	t *kernel.Task
+}
+
+// +stateify savable
+type oomScoreAdjFile struct {
+	fsutil.FileGenericSeek          `state:"nosave"`
+	fsutil.FileNoIoctl              `state:"nosave"`
+	fsutil.FileNoMMap               `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoopFlush            `state:"nosave"`
+	fsutil.FileNoopFsync            `state:"nosave"`
+	fsutil.FileNoopRelease          `state:"nosave"`
+	fsutil.FileNotDirReaddir        `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+	waiter.AlwaysReady              `state:"nosave"`
+
+	t *kernel.Task
+}
+
+// newOOMScoreAdj returns a oom_score_adj file.
+func newOOMScoreAdj(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	i := &oomScoreAdj{
+		SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
+		t:               t,
+	}
+	return newProcInode(t, i, msrc, fs.SpecialFile, t)
+}
+
+// Truncate implements fs.InodeOperations.Truncate. Truncate is called when
+// O_TRUNC is specified for any kind of existing Dirent but is not called via
+// (f)truncate for proc files.
+func (*oomScoreAdj) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return fs.NewFile(ctx, dirent, flags, &oomScoreAdjFile{t: o.t}), nil
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	if offset != 0 {
+		return 0, io.EOF
+	}
+	adj, err := f.t.OOMScoreAdj()
+	if err != nil {
+		return 0, err
+	}
+	adjBytes := []byte(strconv.FormatInt(int64(adj), 10) + "\n")
+	n, err := dst.CopyOut(ctx, adjBytes)
+	return int64(n), err
+}
+
+// Write implements fs.FileOperations.Write.
+func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+
+	if err := f.t.SetOOMScoreAdj(v); err != nil {
+		return 0, err
+	}
+
+	return n, nil
+}
+
 // LINT.ThenChange(../../fsimpl/proc/task.go|../../fsimpl/proc/task_files.go)
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 2d814668a..18e5cd6f6 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -62,11 +62,13 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames
 			"pid":  newNamespaceSymlink(task, inoGen.NextIno(), "pid"),
 			"user": newNamespaceSymlink(task, inoGen.NextIno(), "user"),
 		}),
-		"smaps":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}),
-		"stat":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
-		"statm":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}),
-		"status":  newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
-		"uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}),
+		"oom_score":     newTaskOwnedFile(task, inoGen.NextIno(), 0444, newStaticFile("0\n")),
+		"oom_score_adj": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &oomScoreAdj{task: task}),
+		"smaps":         newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}),
+		"stat":          newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
+		"statm":         newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}),
+		"status":        newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
+		"uid_map":       newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}),
 	}
 	if isThreadGroup {
 		contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers)
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index efd3b3453..5a231ac86 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -525,3 +525,46 @@ func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
 	return nil
 }
+
+// oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file.
+//
+// +stateify savable
+type oomScoreAdj struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	adj, err := o.task.OOMScoreAdj()
+	if err != nil {
+		return err
+	}
+	fmt.Fprintf(buf, "%d\n", adj)
+	return nil
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+
+	if err := o.task.SetOOMScoreAdj(v); err != nil {
+		return 0, err
+	}
+
+	return n, nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index c5d531fe0..0eb401619 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -63,21 +63,23 @@ var (
 		"thread-self": threadSelfLink.NextOff,
 	}
 	taskStaticFiles = map[string]testutil.DirentType{
-		"auxv":    linux.DT_REG,
-		"cgroup":  linux.DT_REG,
-		"cmdline": linux.DT_REG,
-		"comm":    linux.DT_REG,
-		"environ": linux.DT_REG,
-		"gid_map": linux.DT_REG,
-		"io":      linux.DT_REG,
-		"maps":    linux.DT_REG,
-		"ns":      linux.DT_DIR,
-		"smaps":   linux.DT_REG,
-		"stat":    linux.DT_REG,
-		"statm":   linux.DT_REG,
-		"status":  linux.DT_REG,
-		"task":    linux.DT_DIR,
-		"uid_map": linux.DT_REG,
+		"auxv":          linux.DT_REG,
+		"cgroup":        linux.DT_REG,
+		"cmdline":       linux.DT_REG,
+		"comm":          linux.DT_REG,
+		"environ":       linux.DT_REG,
+		"gid_map":       linux.DT_REG,
+		"io":            linux.DT_REG,
+		"maps":          linux.DT_REG,
+		"ns":            linux.DT_DIR,
+		"oom_score":     linux.DT_REG,
+		"oom_score_adj": linux.DT_REG,
+		"smaps":         linux.DT_REG,
+		"stat":          linux.DT_REG,
+		"statm":         linux.DT_REG,
+		"status":        linux.DT_REG,
+		"task":          linux.DT_DIR,
+		"uid_map":       linux.DT_REG,
 	}
 )
 
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 2cee2e6ed..c0dbbe890 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -554,6 +555,13 @@ type Task struct {
 	//
 	// startTime is protected by mu.
 	startTime ktime.Time
+
+	// oomScoreAdj is the task's OOM score adjustment. This is currently not
+	// used but is maintained for consistency.
+	// TODO(gvisor.dev/issue/1967)
+	//
+	// oomScoreAdj is protected by mu, and is owned by the task goroutine.
+	oomScoreAdj int32
 }
 
 func (t *Task) savePtraceTracer() *Task {
@@ -847,3 +855,28 @@ func (t *Task) AbstractSockets() *AbstractSocketNamespace {
 func (t *Task) ContainerID() string {
 	return t.containerID
 }
+
+// OOMScoreAdj gets the task's OOM score adjustment.
+func (t *Task) OOMScoreAdj() (int32, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.ExitState() == TaskExitDead {
+		return 0, syserror.ESRCH
+	}
+	return t.oomScoreAdj, nil
+}
+
+// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be
+// between -1000 and 1000 inclusive.
+func (t *Task) SetOOMScoreAdj(adj int32) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.ExitState() == TaskExitDead {
+		return syserror.ESRCH
+	}
+	if adj > 1000 || adj < -1000 {
+		return syserror.EINVAL
+	}
+	t.oomScoreAdj = adj
+	return nil
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 78866f280..dda502bb8 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -264,6 +264,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		rseqSignature = t.rseqSignature
 	}
 
+	adj, err := t.OOMScoreAdj()
+	if err != nil {
+		return 0, nil, err
+	}
+
 	cfg := &TaskConfig{
 		Kernel:                  t.k,
 		ThreadGroup:             tg,
@@ -282,6 +287,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		RSeqAddr:                rseqAddr,
 		RSeqSignature:           rseqSignature,
 		ContainerID:             t.ContainerID(),
+		OOMScoreAdj:             adj,
 	}
 	if opts.NewThreadGroup {
 		cfg.Parent = t
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index a5035bb7f..2bbf48bb8 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -93,6 +93,9 @@ type TaskConfig struct {
 
 	// ContainerID is the container the new task belongs to.
 	ContainerID string
+
+	// oomScoreAdj is the task's OOM score adjustment.
+	OOMScoreAdj int32
 }
 
 // NewTask creates a new task defined by cfg.
@@ -143,6 +146,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		rseqSignature:      cfg.RSeqSignature,
 		futexWaiter:        futex.NewWaiter(),
 		containerID:        cfg.ContainerID,
+		oomScoreAdj:        cfg.OOMScoreAdj,
 	}
 	t.creds.Store(cfg.Credentials)
 	t.endStopCond.L = &t.tg.signalHandlers.mu
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index a69b0ce13..9800a0cdf 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -318,10 +318,14 @@ syscall_test(
     test = "//test/syscalls/linux:proc_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test")
-
 syscall_test(test = "//test/syscalls/linux:proc_net_test")
 
+syscall_test(test = "//test/syscalls/linux:proc_pid_oomscore_test")
+
+syscall_test(test = "//test/syscalls/linux:proc_pid_smaps_test")
+
+syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test")
+
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:pselect_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 0fbd556de..43455f1a3 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1631,6 +1631,19 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "proc_pid_oomscore_test",
+    testonly = 1,
+    srcs = ["proc_pid_oomscore.cc"],
+    linkstatic = 1,
+    deps = [
+        "//test/util:fs_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_binary(
     name = "proc_pid_smaps_test",
     testonly = 1,
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index f91187e75..5a70f6c3b 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1431,6 +1431,12 @@ TEST(ProcPidFile, SubprocessRunning) {
 
   EXPECT_THAT(ReadWhileRunning("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileRunning("oom_score", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileRunning("oom_score_adj", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
 }
 
 // Test whether /proc/PID/ files can be read for a zombie process.
@@ -1466,6 +1472,12 @@ TEST(ProcPidFile, SubprocessZombie) {
   EXPECT_THAT(ReadWhileZombied("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
 
+  EXPECT_THAT(ReadWhileZombied("oom_score", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileZombied("oom_score_adj", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
   //
@@ -1527,6 +1539,15 @@ TEST(ProcPidFile, SubprocessExited) {
 
   EXPECT_THAT(ReadWhileExited("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME(gvisor.dev/issue/164): Succeeds on gVisor.
+    EXPECT_THAT(ReadWhileExited("oom_score", buf, sizeof(buf)),
+                SyscallFailsWithErrno(ESRCH));
+  }
+
+  EXPECT_THAT(ReadWhileExited("oom_score_adj", buf, sizeof(buf)),
+              SyscallFailsWithErrno(ESRCH));
 }
 
 PosixError DirContainsImpl(absl::string_view path,
diff --git a/test/syscalls/linux/proc_pid_oomscore.cc b/test/syscalls/linux/proc_pid_oomscore.cc
new file mode 100644
index 000000000..707821a3f
--- /dev/null
+++ b/test/syscalls/linux/proc_pid_oomscore.cc
@@ -0,0 +1,72 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+
+#include <exception>
+#include <iostream>
+#include <string>
+
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+PosixErrorOr<int> ReadProcNumber(std::string path) {
+  ASSIGN_OR_RETURN_ERRNO(std::string contents, GetContents(path));
+  EXPECT_EQ(contents[contents.length() - 1], '\n');
+
+  int num;
+  if (!absl::SimpleAtoi(contents, &num)) {
+    return PosixError(EINVAL, "invalid value: " + contents);
+  }
+
+  return num;
+}
+
+TEST(ProcPidOomscoreTest, BasicRead) {
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score"));
+  EXPECT_LE(oom_score, 1000);
+  EXPECT_GE(oom_score, -1000);
+}
+
+TEST(ProcPidOomscoreAdjTest, BasicRead) {
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj"));
+
+  // oom_score_adj defaults to 0.
+  EXPECT_EQ(oom_score, 0);
+}
+
+TEST(ProcPidOomscoreAdjTest, BasicWrite) {
+  constexpr int test_value = 7;
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/oom_score_adj", O_WRONLY));
+  ASSERT_THAT(
+      RetryEINTR(write)(fd.get(), std::to_string(test_value).c_str(), 1),
+      SyscallSucceeds());
+
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj"));
+  EXPECT_EQ(oom_score, test_value);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From d6f5e71df2c8ff3d763cba703786af68af1f9841 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 6 Mar 2020 08:01:45 -0800
Subject: Get strings for stack.DHCPv6ConfigurationFromNDPRA

Useful for logs to print the string representation of the value
instead of the integer value.

PiperOrigin-RevId: 299356847
---
 pkg/tcpip/stack/BUILD                              |  1 +
 .../stack/dhcpv6configurationfromndpra_string.go   | 39 ++++++++++++++++++++++
 2 files changed, 40 insertions(+)
 create mode 100644 pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go

diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 705cf01ee..8febd54c8 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -18,6 +18,7 @@ go_template_instance(
 go_library(
     name = "stack",
     srcs = [
+        "dhcpv6configurationfromndpra_string.go",
         "icmp_rate_limit.go",
         "linkaddrcache.go",
         "linkaddrentry_list.go",
diff --git a/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go b/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go
new file mode 100644
index 000000000..8b4213eec
--- /dev/null
+++ b/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go
@@ -0,0 +1,39 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by "stringer -type=DHCPv6ConfigurationFromNDPRA"; DO NOT EDIT.
+
+package stack
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[DHCPv6NoConfiguration-0]
+	_ = x[DHCPv6ManagedAddress-1]
+	_ = x[DHCPv6OtherConfigurations-2]
+}
+
+const _DHCPv6ConfigurationFromNDPRA_name = "DHCPv6NoConfigurationDHCPv6ManagedAddressDHCPv6OtherConfigurations"
+
+var _DHCPv6ConfigurationFromNDPRA_index = [...]uint8{0, 21, 41, 66}
+
+func (i DHCPv6ConfigurationFromNDPRA) String() string {
+	if i < 0 || i >= DHCPv6ConfigurationFromNDPRA(len(_DHCPv6ConfigurationFromNDPRA_index)-1) {
+		return "DHCPv6ConfigurationFromNDPRA(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+	return _DHCPv6ConfigurationFromNDPRA_name[_DHCPv6ConfigurationFromNDPRA_index[i]:_DHCPv6ConfigurationFromNDPRA_index[i+1]]
+}
-- 
cgit v1.2.3


From 1e8c0bcedb265d3149e5d2ab1181628d013539c4 Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Fri, 6 Mar 2020 09:25:32 -0800
Subject: Add nat table support for iptables.

---
 pkg/tcpip/iptables/targets.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index ae5af7c53..e457f2349 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -69,7 +69,7 @@ func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
 // Min and Max values for IP and Ports in the struct indicate the range of
 // values which can be used to redirect.
 type RedirectTarget struct {
-	// TODO(gvisor.dev/issue/170): Other flags need to be aded after
+	// TODO(gvisor.dev/issue/170): Other flags need to be added after
 	// we support them.
 	// RangeProtoSpecified flag indicates single port is specified to
 	// redirect.
@@ -98,7 +98,7 @@ func (rt RedirectTarget) Action(pkt tcpip.PacketBuffer) (RuleVerdict, int) {
 	// Set network header.
 	headerView := newPkt.Data.First()
 	netHeader := header.IPv4(headerView)
-	newPkt.NetworkHeader = headerView[:netHeader.HeaderLength()]
+	newPkt.NetworkHeader = headerView[:header.IPv4MinimumSize]
 
 	hlen := int(netHeader.HeaderLength())
 	tlen := int(netHeader.TotalLength())
-- 
cgit v1.2.3


From 20170d4fd5c26def584664762f4e639f0b43ff6e Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Fri, 6 Mar 2020 11:22:55 -0800
Subject: Move packetdrill tests to open-source

PiperOrigin-RevId: 299396286
---
 test/packetdrill/BUILD                             | 44 +++++++++++++++++++-
 test/packetdrill/accept_ack_drop.pkt               | 27 ++++++++++++
 test/packetdrill/defs.bzl                          | 10 ++---
 test/packetdrill/linux/tcp_user_timeout.pkt        | 39 ++++++++++++++++++
 .../listen_close_before_handshake_complete.pkt     | 31 ++++++++++++++
 test/packetdrill/netstack/tcp_user_timeout.pkt     | 38 +++++++++++++++++
 test/packetdrill/no_rst_to_rst.pkt                 | 36 ++++++++++++++++
 .../reset_for_ack_when_no_syn_cookies_in_use.pkt   |  9 ++++
 test/packetdrill/sanity_test.pkt                   |  7 ++++
 test/packetdrill/tcp_defer_accept.pkt              | 48 ++++++++++++++++++++++
 test/packetdrill/tcp_defer_accept_timeout.pkt      | 48 ++++++++++++++++++++++
 11 files changed, 330 insertions(+), 7 deletions(-)
 create mode 100644 test/packetdrill/accept_ack_drop.pkt
 create mode 100644 test/packetdrill/linux/tcp_user_timeout.pkt
 create mode 100644 test/packetdrill/listen_close_before_handshake_complete.pkt
 create mode 100644 test/packetdrill/netstack/tcp_user_timeout.pkt
 create mode 100644 test/packetdrill/no_rst_to_rst.pkt
 create mode 100644 test/packetdrill/reset_for_ack_when_no_syn_cookies_in_use.pkt
 create mode 100644 test/packetdrill/sanity_test.pkt
 create mode 100644 test/packetdrill/tcp_defer_accept.pkt
 create mode 100644 test/packetdrill/tcp_defer_accept_timeout.pkt

diff --git a/test/packetdrill/BUILD b/test/packetdrill/BUILD
index d113555b1..fb0b2db41 100644
--- a/test/packetdrill/BUILD
+++ b/test/packetdrill/BUILD
@@ -1,8 +1,48 @@
-load("defs.bzl", "packetdrill_test")
+load("defs.bzl", "packetdrill_linux_test", "packetdrill_netstack_test", "packetdrill_test")
 
 package(licenses = ["notice"])
 
 packetdrill_test(
-    name = "fin_wait2_timeout",
+    name = "packetdrill_sanity_test",
+    scripts = ["sanity_test.pkt"],
+)
+
+packetdrill_test(
+    name = "accept_ack_drop_test",
+    scripts = ["accept_ack_drop.pkt"],
+)
+
+packetdrill_test(
+    name = "fin_wait2_timeout_test",
     scripts = ["fin_wait2_timeout.pkt"],
 )
+
+packetdrill_linux_test(
+    name = "tcp_user_timeout_test_linux_test",
+    scripts = ["linux/tcp_user_timeout.pkt"],
+)
+
+packetdrill_netstack_test(
+    name = "tcp_user_timeout_test_netstack_test",
+    scripts = ["netstack/tcp_user_timeout.pkt"],
+)
+
+packetdrill_test(
+    name = "listen_close_before_handshake_complete_test",
+    scripts = ["listen_close_before_handshake_complete.pkt"],
+)
+
+packetdrill_test(
+    name = "no_rst_to_rst_test",
+    scripts = ["no_rst_to_rst.pkt"],
+)
+
+packetdrill_test(
+    name = "tcp_defer_accept_test",
+    scripts = ["tcp_defer_accept.pkt"],
+)
+
+packetdrill_test(
+    name = "tcp_defer_accept_timeout_test",
+    scripts = ["tcp_defer_accept_timeout.pkt"],
+)
diff --git a/test/packetdrill/accept_ack_drop.pkt b/test/packetdrill/accept_ack_drop.pkt
new file mode 100644
index 000000000..76e638fd4
--- /dev/null
+++ b/test/packetdrill/accept_ack_drop.pkt
@@ -0,0 +1,27 @@
+// Test that the accept works if the final ACK is dropped and an ack with data
+// follows the dropped ack.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 bind(3, ..., ...) = 0
+
+// Set backlog to 1 so that we can easily test.
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0.0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0.0 > S. 0:0(0) ack 1    <...>
+
++0.0 < . 1:5(4) ack 1 win 257
++0.0 > . 1:1(0) ack 5 <...>
+
+// This should cause connection to transition to connected state.
++0.000 accept(3, ..., ...) = 4
++0.000 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+// Now read the data and we should get 4 bytes.
++0.000 read(4,..., 4) = 4
++0.000 close(4) = 0
+
++0.0 > F. 1:1(0) ack 5 <...>
++0.0 < F. 5:5(0) ack 2 win 257
++0.01 > . 2:2(0) ack 6 <...>
\ No newline at end of file
diff --git a/test/packetdrill/defs.bzl b/test/packetdrill/defs.bzl
index 8623ce7b1..f499c177b 100644
--- a/test/packetdrill/defs.bzl
+++ b/test/packetdrill/defs.bzl
@@ -66,7 +66,7 @@ def packetdrill_linux_test(name, **kwargs):
     if "tags" not in kwargs:
         kwargs["tags"] = _PACKETDRILL_TAGS
     _packetdrill_test(
-        name = name + "_linux_test",
+        name = name,
         flags = ["--dut_platform", "linux"],
         **kwargs
     )
@@ -75,13 +75,13 @@ def packetdrill_netstack_test(name, **kwargs):
     if "tags" not in kwargs:
         kwargs["tags"] = _PACKETDRILL_TAGS
     _packetdrill_test(
-        name = name + "_netstack_test",
+        name = name,
         # This is the default runtime unless
         # "--test_arg=--runtime=OTHER_RUNTIME" is used to override the value.
         flags = ["--dut_platform", "netstack", "--runtime", "runsc-d"],
         **kwargs
     )
 
-def packetdrill_test(**kwargs):
-    packetdrill_linux_test(**kwargs)
-    packetdrill_netstack_test(**kwargs)
+def packetdrill_test(name, **kwargs):
+    packetdrill_linux_test(name + "_linux_test", **kwargs)
+    packetdrill_netstack_test(name + "_netstack_test", **kwargs)
diff --git a/test/packetdrill/linux/tcp_user_timeout.pkt b/test/packetdrill/linux/tcp_user_timeout.pkt
new file mode 100644
index 000000000..38018cb42
--- /dev/null
+++ b/test/packetdrill/linux/tcp_user_timeout.pkt
@@ -0,0 +1,39 @@
+// Test that a socket w/ TCP_USER_TIMEOUT set aborts the connection
+// if there is pending unacked data after the user specified timeout.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 bind(3, ..., ...) = 0
+
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <...>
++0.1 < . 1:1(0) ack 1 win 32792
+
++0.100 accept(3, ..., ...) = 4
+
+// Okay, we received nothing, and decide to close this idle socket.
+// We set TCP_USER_TIMEOUT to 3 seconds because really it is not worth
+// trying hard to cleanly close this flow, at the price of keeping
+// a TCP structure in kernel for about 1 minute!
++2 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0
+
+// The write/ack is required mainly for netstack as netstack does
+// not update its RTO during the handshake.
++0 write(4, ..., 100) = 100
++0 > P. 1:101(100) ack 1 <...>
++0 < . 1:1(0) ack 101 win 32792
+
++0 close(4) = 0
+
++0 > F. 101:101(0) ack 1 <...>
++.3~+.400 > F. 101:101(0) ack 1 <...>
++.3~+.400 > F. 101:101(0) ack 1 <...>
++.6~+.800 > F. 101:101(0) ack 1 <...>
++1.2~+1.300 > F. 101:101(0) ack 1 <...>
+
+// We finally receive something from the peer, but it is way too late
+// Our socket vanished because TCP_USER_TIMEOUT was really small.
++.1 < . 1:2(1) ack 102 win 32792
++0 > R 102:102(0) win 0
diff --git a/test/packetdrill/listen_close_before_handshake_complete.pkt b/test/packetdrill/listen_close_before_handshake_complete.pkt
new file mode 100644
index 000000000..51c3f1a32
--- /dev/null
+++ b/test/packetdrill/listen_close_before_handshake_complete.pkt
@@ -0,0 +1,31 @@
+// Test that closing a listening socket closes any connections in SYN-RCVD
+// state and any packets bound for these connections generate a RESET.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 bind(3, ..., ...) = 0
+
+// Set backlog to 1 so that we can easily test.
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <...>
+
++0.100 close(3) = 0
++0.1 < P. 1:1(0) ack 1 win 257
+
+// Linux generates a reset with no ack number/bit set. This is contradictory to
+// what is specified in Rule 1 under Reset Generation in
+// https://tools.ietf.org/html/rfc793#section-3.4.
+//   "1. If the connection does not exist (CLOSED) then a reset is sent
+//    in response to any incoming segment except another reset.  In
+//    particular, SYNs addressed to a non-existent connection are rejected
+//    by this means.
+//
+//    If the incoming segment has an ACK field, the reset takes its
+//    sequence number from the ACK field of the segment, otherwise the
+//    reset has sequence number zero and the ACK field is set to the sum
+//    of the sequence number and segment length of the incoming segment.
+//    The connection remains in the CLOSED state."
+
++0.0  > R 1:1(0) win 0
\ No newline at end of file
diff --git a/test/packetdrill/netstack/tcp_user_timeout.pkt b/test/packetdrill/netstack/tcp_user_timeout.pkt
new file mode 100644
index 000000000..60103adba
--- /dev/null
+++ b/test/packetdrill/netstack/tcp_user_timeout.pkt
@@ -0,0 +1,38 @@
+// Test that a socket w/ TCP_USER_TIMEOUT set aborts the connection
+// if there is pending unacked data after the user specified timeout.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 bind(3, ..., ...) = 0
+
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <...>
++0.1 < . 1:1(0) ack 1 win 32792
+
++0.100 accept(3, ..., ...) = 4
+
+// Okay, we received nothing, and decide to close this idle socket.
+// We set TCP_USER_TIMEOUT to 3 seconds because really it is not worth
+// trying hard to cleanly close this flow, at the price of keeping
+// a TCP structure in kernel for about 1 minute!
++2 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0
+
+// The write/ack is required mainly for netstack as netstack does
+// not update its RTO during the handshake.
++0 write(4, ..., 100) = 100
++0 > P. 1:101(100) ack 1 <...>
++0 < . 1:1(0) ack 101 win 32792
+
++0 close(4) = 0
+
++0 > F. 101:101(0) ack 1 <...>
++.2~+.300 > F. 101:101(0) ack 1 <...>
++.4~+.500 > F. 101:101(0) ack 1 <...>
++.8~+.900 > F. 101:101(0) ack 1 <...>
+
+// We finally receive something from the peer, but it is way too late
+// Our socket vanished because TCP_USER_TIMEOUT was really small.
++1.61 < . 1:2(1) ack 102 win 32792
++0 > R 102:102(0) win 0
diff --git a/test/packetdrill/no_rst_to_rst.pkt b/test/packetdrill/no_rst_to_rst.pkt
new file mode 100644
index 000000000..612747827
--- /dev/null
+++ b/test/packetdrill/no_rst_to_rst.pkt
@@ -0,0 +1,36 @@
+// Test a RST is not generated in response to a RST and a RST is correctly
+// generated when an accepted endpoint is RST due to an incoming RST.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 bind(3, ..., ...) = 0
+
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <...>
++0 < P. 1:1(0) ack 1 win 257
+
++0.100 accept(3, ..., ...) = 4
+
++0.200 < R 1:1(0) win 0
+
++0.300 read(4,..., 4) = -1 ECONNRESET (Connection Reset by Peer)
+
++0.00 < . 1:1(0) ack 1 win 257
+
+// Linux generates a reset with no ack number/bit set. This is contradictory to
+// what is specified in Rule 1 under Reset Generation in
+// https://tools.ietf.org/html/rfc793#section-3.4.
+//   "1. If the connection does not exist (CLOSED) then a reset is sent
+//    in response to any incoming segment except another reset.  In
+//    particular, SYNs addressed to a non-existent connection are rejected
+//    by this means.
+//
+//    If the incoming segment has an ACK field, the reset takes its
+//    sequence number from the ACK field of the segment, otherwise the
+//    reset has sequence number zero and the ACK field is set to the sum
+//    of the sequence number and segment length of the incoming segment.
+//    The connection remains in the CLOSED state."
+
++0.00 > R 1:1(0) win 0
\ No newline at end of file
diff --git a/test/packetdrill/reset_for_ack_when_no_syn_cookies_in_use.pkt b/test/packetdrill/reset_for_ack_when_no_syn_cookies_in_use.pkt
new file mode 100644
index 000000000..a86b90ce6
--- /dev/null
+++ b/test/packetdrill/reset_for_ack_when_no_syn_cookies_in_use.pkt
@@ -0,0 +1,9 @@
+// Test that a listening socket generates a RST when it receives an
+// ACK and syn cookies are not in use.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 bind(3, ..., ...) = 0
+
++0 listen(3, 1) = 0
++0.1 < . 1:1(0) ack 1 win 32792
++0 > R 1:1(0) ack 0 win 0
\ No newline at end of file
diff --git a/test/packetdrill/sanity_test.pkt b/test/packetdrill/sanity_test.pkt
new file mode 100644
index 000000000..b3b58c366
--- /dev/null
+++ b/test/packetdrill/sanity_test.pkt
@@ -0,0 +1,7 @@
+// Basic sanity test. One system call.
+//
+// All of the plumbing has to be working however, and the packetdrill wire
+// client needs to be able to connect to the wire server and send the script,
+// probe local interfaces, run through the test w/ timings, etc.
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
diff --git a/test/packetdrill/tcp_defer_accept.pkt b/test/packetdrill/tcp_defer_accept.pkt
new file mode 100644
index 000000000..a17f946db
--- /dev/null
+++ b/test/packetdrill/tcp_defer_accept.pkt
@@ -0,0 +1,48 @@
+// Test that a bare ACK does not complete a connection when TCP_DEFER_ACCEPT
+// timeout is not hit but an ACK w/ data does complete and deliver the
+// connection to the accept queue.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_TCP, TCP_DEFER_ACCEPT, [5], 4) = 0
++0.000 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 bind(3, ..., ...) = 0
+
+// Set backlog to 1 so that we can easily test.
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0.0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0.0 > S. 0:0(0) ack 1    <...>
+
+// Send a bare ACK this should not complete the connection as we
+// set the TCP_DEFER_ACCEPT above.
++0.0 < . 1:1(0) ack 1 win 257
+
+// The bare ACK should be dropped and no connection should be delivered
+// to the accept queue.
++0.100 accept(3, ..., ...) = -1 EWOULDBLOCK (operation would block)
+
+// Send another bare ACK and it should still fail we set TCP_DEFER_ACCEPT
+// to 5 seconds above.
++2.5 < . 1:1(0) ack 1 win 257
++0.100 accept(3, ..., ...) = -1 EWOULDBLOCK (operation would block)
+
+// set accept socket back to blocking.
++0.000 fcntl(3, F_SETFL, O_RDWR) = 0
+
+// Now send an ACK w/ data. This should complete the connection
+// and deliver the socket to the accept queue.
++0.1 < . 1:5(4) ack 1 win 257
++0.0 > . 1:1(0) ack 5 <...>
+
+// This should cause connection to transition to connected state.
++0.000 accept(3, ..., ...) = 4
++0.000 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+// Now read the data and we should get 4 bytes.
++0.000 read(4,..., 4) = 4
++0.000 close(4) = 0
+
++0.0 > F. 1:1(0) ack 5 <...>
++0.0 < F. 5:5(0) ack 2 win 257
++0.01 > . 2:2(0) ack 6 <...>
\ No newline at end of file
diff --git a/test/packetdrill/tcp_defer_accept_timeout.pkt b/test/packetdrill/tcp_defer_accept_timeout.pkt
new file mode 100644
index 000000000..201fdeb14
--- /dev/null
+++ b/test/packetdrill/tcp_defer_accept_timeout.pkt
@@ -0,0 +1,48 @@
+// Test that a bare ACK is accepted after TCP_DEFER_ACCEPT timeout
+// is hit and a connection is delivered.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_TCP, TCP_DEFER_ACCEPT, [3], 4) = 0
++0.000 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 bind(3, ..., ...) = 0
+
+// Set backlog to 1 so that we can easily test.
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0.0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0.0 > S. 0:0(0) ack 1    <...>
+
+// Send a bare ACK this should not complete the connection as we
+// set the TCP_DEFER_ACCEPT above.
++0.0 < . 1:1(0) ack 1 win 257
+
+// The bare ACK should be dropped and no connection should be delivered
+// to the accept queue.
++0.100 accept(3, ..., ...) = -1 EWOULDBLOCK (operation would block)
+
+// Send another bare ACK and it should still fail we set TCP_DEFER_ACCEPT
+// to 5 seconds above.
++2.5 < . 1:1(0) ack 1 win 257
++0.100 accept(3, ..., ...) = -1 EWOULDBLOCK (operation would block)
+
+// set accept socket back to blocking.
++0.000 fcntl(3, F_SETFL, O_RDWR) = 0
+
+// We should see one more retransmit of the SYN-ACK as a last ditch
+// attempt when TCP_DEFER_ACCEPT timeout is hit to trigger another
+// ACK or a packet with data.
++.35~+2.35 > S. 0:0(0) ack 1 <...>
+
+// Now send another bare ACK after TCP_DEFER_ACCEPT time has been passed.
++0.0 < . 1:1(0) ack 1 win 257
+
+// The ACK above should cause connection to transition to connected state.
++0.000 accept(3, ..., ...) = 4
++0.000 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
++0.000 close(4) = 0
+
++0.0 > F. 1:1(0) ack 1 <...>
++0.0 < F. 1:1(0) ack 2 win 257
++0.01 > . 2:2(0) ack 2 <...>
-- 
cgit v1.2.3


From f50d9a31e9e734a02e0191f6bc91b387bb21f9ab Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 6 Mar 2020 11:32:13 -0800
Subject: Specify the source of outgoing NDP RS

If the NIC has a valid IPv6 address assigned, use it as the
source address for outgoing NDP Router Solicitation packets.

Test: stack_test.TestRouterSolicitation
PiperOrigin-RevId: 299398763
---
 pkg/tcpip/checker/checker.go | 106 +++++++++++++++++++++++++++----------------
 pkg/tcpip/stack/ndp.go       |  29 ++++++++++--
 pkg/tcpip/stack/ndp_test.go  |  39 +++++++++++++---
 3 files changed, 125 insertions(+), 49 deletions(-)

diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index c6c160dfc..8dc0f7c0e 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -785,6 +785,52 @@ func NDPNSTargetAddress(want tcpip.Address) TransportChecker {
 	}
 }
 
+// ndpOptions checks that optsBuf only contains opts.
+func ndpOptions(t *testing.T, optsBuf header.NDPOptions, opts []header.NDPOption) {
+	t.Helper()
+
+	it, err := optsBuf.Iter(true)
+	if err != nil {
+		t.Errorf("optsBuf.Iter(true): %s", err)
+		return
+	}
+
+	i := 0
+	for {
+		opt, done, err := it.Next()
+		if err != nil {
+			// This should never happen as Iter(true) above did not return an error.
+			t.Fatalf("unexpected error when iterating over NDP options: %s", err)
+		}
+		if done {
+			break
+		}
+
+		if i >= len(opts) {
+			t.Errorf("got unexpected option: %s", opt)
+			continue
+		}
+
+		switch wantOpt := opts[i].(type) {
+		case header.NDPSourceLinkLayerAddressOption:
+			gotOpt, ok := opt.(header.NDPSourceLinkLayerAddressOption)
+			if !ok {
+				t.Errorf("got type = %T at index = %d; want = %T", opt, i, wantOpt)
+			} else if got, want := gotOpt.EthernetAddress(), wantOpt.EthernetAddress(); got != want {
+				t.Errorf("got EthernetAddress() = %s at index %d, want = %s", got, i, want)
+			}
+		default:
+			t.Fatalf("checker not implemented for expected NDP option: %T", wantOpt)
+		}
+
+		i++
+	}
+
+	if missing := opts[i:]; len(missing) > 0 {
+		t.Errorf("missing options: %s", missing)
+	}
+}
+
 // NDPNSOptions creates a checker that checks that the packet contains the
 // provided NDP options within an NDP Neighbor Solicitation message.
 //
@@ -796,47 +842,31 @@ func NDPNSOptions(opts []header.NDPOption) TransportChecker {
 
 		icmp := h.(header.ICMPv6)
 		ns := header.NDPNeighborSolicit(icmp.NDPPayload())
-		it, err := ns.Options().Iter(true)
-		if err != nil {
-			t.Errorf("opts.Iter(true): %s", err)
-			return
-		}
-
-		i := 0
-		for {
-			opt, done, _ := it.Next()
-			if done {
-				break
-			}
-
-			if i >= len(opts) {
-				t.Errorf("got unexpected option: %s", opt)
-				continue
-			}
-
-			switch wantOpt := opts[i].(type) {
-			case header.NDPSourceLinkLayerAddressOption:
-				gotOpt, ok := opt.(header.NDPSourceLinkLayerAddressOption)
-				if !ok {
-					t.Errorf("got type = %T at index = %d; want = %T", opt, i, wantOpt)
-				} else if got, want := gotOpt.EthernetAddress(), wantOpt.EthernetAddress(); got != want {
-					t.Errorf("got EthernetAddress() = %s at index %d, want = %s", got, i, want)
-				}
-			default:
-				panic("not implemented")
-			}
-
-			i++
-		}
-
-		if missing := opts[i:]; len(missing) > 0 {
-			t.Errorf("missing options: %s", missing)
-		}
+		ndpOptions(t, ns.Options(), opts)
 	}
 }
 
 // NDPRS creates a checker that checks that the packet contains a valid NDP
 // Router Solicitation message (as per the raw wire format).
-func NDPRS() NetworkChecker {
-	return NDP(header.ICMPv6RouterSolicit, header.NDPRSMinimumSize)
+//
+// checkers may assume that a valid ICMPv6 is passed to it containing a valid
+// NDPRS as far as the size of the message is concerned. The values within the
+// message are up to checkers to validate.
+func NDPRS(checkers ...TransportChecker) NetworkChecker {
+	return NDP(header.ICMPv6RouterSolicit, header.NDPRSMinimumSize, checkers...)
+}
+
+// NDPRSOptions creates a checker that checks that the packet contains the
+// provided NDP options within an NDP Router Solicitation message.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid NDPRS message as far as the size is concerned.
+func NDPRSOptions(opts []header.NDPOption) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmp := h.(header.ICMPv6)
+		rs := header.NDPRouterSolicit(icmp.NDPPayload())
+		ndpOptions(t, rs.Options(), opts)
+	}
 }
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index f651871ce..a9f4d5dad 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -1220,9 +1220,15 @@ func (ndp *ndpState) startSolicitingRouters() {
 	}
 
 	ndp.rtrSolicitTimer = time.AfterFunc(delay, func() {
-		// Send an RS message with the unspecified source address.
-		ref := ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, forceSpoofing)
-		r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
+		// As per RFC 4861 section 4.1, the source of the RS is an address assigned
+		// to the sending interface, or the unspecified address if no address is
+		// assigned to the sending interface.
+		ref := ndp.nic.primaryIPv6Endpoint(header.IPv6AllRoutersMulticastAddress)
+		if ref == nil {
+			ref = ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, forceSpoofing)
+		}
+		localAddr := ref.ep.ID().LocalAddress
+		r := makeRoute(header.IPv6ProtocolNumber, localAddr, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 		defer r.Release()
 
 		// Route should resolve immediately since
@@ -1234,10 +1240,25 @@ func (ndp *ndpState) startSolicitingRouters() {
 			log.Fatalf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID())
 		}
 
-		payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize
+		// As per RFC 4861 section 4.1, an NDP RS SHOULD include the source
+		// link-layer address option if the source address of the NDP RS is
+		// specified. This option MUST NOT be included if the source address is
+		// unspecified.
+		//
+		// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
+		// LinkEndpoint.LinkAddress) before reaching this point.
+		var optsSerializer header.NDPOptionsSerializer
+		if localAddr != header.IPv6Any && header.IsValidUnicastEthernetAddress(r.LocalLinkAddress) {
+			optsSerializer = header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(r.LocalLinkAddress),
+			}
+		}
+		payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize + int(optsSerializer.Length())
 		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + payloadSize)
 		pkt := header.ICMPv6(hdr.Prepend(payloadSize))
 		pkt.SetType(header.ICMPv6RouterSolicit)
+		rs := header.NDPRouterSolicit(pkt.NDPPayload())
+		rs.Options().Serialize(optsSerializer)
 		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
 		sent := r.Stats().ICMP.V6PacketsSent
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 6e9306d09..98b1c807c 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -3384,6 +3384,10 @@ func TestRouterSolicitation(t *testing.T) {
 	tests := []struct {
 		name                        string
 		linkHeaderLen               uint16
+		linkAddr                    tcpip.LinkAddress
+		nicAddr                     tcpip.Address
+		expectedSrcAddr             tcpip.Address
+		expectedNDPOpts             []header.NDPOption
 		maxRtrSolicit               uint8
 		rtrSolicitInt               time.Duration
 		effectiveRtrSolicitInt      time.Duration
@@ -3392,6 +3396,7 @@ func TestRouterSolicitation(t *testing.T) {
 	}{
 		{
 			name:                        "Single RS with delay",
+			expectedSrcAddr:             header.IPv6Any,
 			maxRtrSolicit:               1,
 			rtrSolicitInt:               time.Second,
 			effectiveRtrSolicitInt:      time.Second,
@@ -3401,6 +3406,8 @@ func TestRouterSolicitation(t *testing.T) {
 		{
 			name:                        "Two RS with delay",
 			linkHeaderLen:               1,
+			nicAddr:                     llAddr1,
+			expectedSrcAddr:             llAddr1,
 			maxRtrSolicit:               2,
 			rtrSolicitInt:               time.Second,
 			effectiveRtrSolicitInt:      time.Second,
@@ -3408,8 +3415,14 @@ func TestRouterSolicitation(t *testing.T) {
 			effectiveMaxRtrSolicitDelay: 500 * time.Millisecond,
 		},
 		{
-			name:                        "Single RS without delay",
-			linkHeaderLen:               2,
+			name:            "Single RS without delay",
+			linkHeaderLen:   2,
+			linkAddr:        linkAddr1,
+			nicAddr:         llAddr1,
+			expectedSrcAddr: llAddr1,
+			expectedNDPOpts: []header.NDPOption{
+				header.NDPSourceLinkLayerAddressOption(linkAddr1),
+			},
 			maxRtrSolicit:               1,
 			rtrSolicitInt:               time.Second,
 			effectiveRtrSolicitInt:      time.Second,
@@ -3419,6 +3432,8 @@ func TestRouterSolicitation(t *testing.T) {
 		{
 			name:                        "Two RS without delay and invalid zero interval",
 			linkHeaderLen:               3,
+			linkAddr:                    linkAddr1,
+			expectedSrcAddr:             header.IPv6Any,
 			maxRtrSolicit:               2,
 			rtrSolicitInt:               0,
 			effectiveRtrSolicitInt:      4 * time.Second,
@@ -3427,6 +3442,8 @@ func TestRouterSolicitation(t *testing.T) {
 		},
 		{
 			name:                        "Three RS without delay",
+			linkAddr:                    linkAddr1,
+			expectedSrcAddr:             header.IPv6Any,
 			maxRtrSolicit:               3,
 			rtrSolicitInt:               500 * time.Millisecond,
 			effectiveRtrSolicitInt:      500 * time.Millisecond,
@@ -3435,6 +3452,8 @@ func TestRouterSolicitation(t *testing.T) {
 		},
 		{
 			name:                        "Two RS with invalid negative delay",
+			linkAddr:                    linkAddr1,
+			expectedSrcAddr:             header.IPv6Any,
 			maxRtrSolicit:               2,
 			rtrSolicitInt:               time.Second,
 			effectiveRtrSolicitInt:      time.Second,
@@ -3457,7 +3476,7 @@ func TestRouterSolicitation(t *testing.T) {
 			t.Run(test.name, func(t *testing.T) {
 				t.Parallel()
 				e := channelLinkWithHeaderLength{
-					Endpoint:     channel.New(int(test.maxRtrSolicit), 1280, linkAddr1),
+					Endpoint:     channel.New(int(test.maxRtrSolicit), 1280, test.linkAddr),
 					headerLength: test.linkHeaderLen,
 				}
 				e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
@@ -3481,10 +3500,10 @@ func TestRouterSolicitation(t *testing.T) {
 
 					checker.IPv6(t,
 						p.Pkt.Header.View(),
-						checker.SrcAddr(header.IPv6Any),
+						checker.SrcAddr(test.expectedSrcAddr),
 						checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
 						checker.TTL(header.NDPHopLimit),
-						checker.NDPRS(),
+						checker.NDPRS(checker.NDPRSOptions(test.expectedNDPOpts)),
 					)
 
 					if l, want := p.Pkt.Header.AvailableLength(), int(test.linkHeaderLen); l != want {
@@ -3510,13 +3529,19 @@ func TestRouterSolicitation(t *testing.T) {
 					t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 				}
 
-				// Make sure each RS got sent at the right
-				// times.
+				if addr := test.nicAddr; addr != "" {
+					if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr); err != nil {
+						t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr, err)
+					}
+				}
+
+				// Make sure each RS is sent at the right time.
 				remaining := test.maxRtrSolicit
 				if remaining > 0 {
 					waitForPkt(test.effectiveMaxRtrSolicitDelay + defaultAsyncEventTimeout)
 					remaining--
 				}
+
 				for ; remaining > 0; remaining-- {
 					waitForNothing(test.effectiveRtrSolicitInt - defaultTimeout)
 					waitForPkt(defaultAsyncEventTimeout)
-- 
cgit v1.2.3


From d5dbe366bf7c9f5b648b8114a9dc7f45589899b1 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Fri, 6 Mar 2020 11:41:10 -0800
Subject: shutdown(s, SHUT_WR) in TIME-WAIT returns ENOTCONN

From RFC 793 s3.9 p61 Event Processing:

CLOSE Call during TIME-WAIT: return with "error: connection closing"

Fixes #1603

PiperOrigin-RevId: 299401353
---
 pkg/tcpip/transport/tcp/endpoint.go |  5 ++++-
 test/syscalls/linux/tcp_socket.cc   | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 40cc664c0..dc9c18b6f 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2117,10 +2117,13 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 		// Close for write.
 		if (e.shutdownFlags & tcpip.ShutdownWrite) != 0 {
 			e.sndBufMu.Lock()
-
 			if e.sndClosed {
 				// Already closed.
 				e.sndBufMu.Unlock()
+				if e.EndpointState() == StateTimeWait {
+					e.mu.Unlock()
+					return tcpip.ErrNotConnected
+				}
 				break
 			}
 
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 579463384..d9c1ac0e1 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -143,6 +143,20 @@ TEST_P(TcpSocketTest, ConnectOnEstablishedConnection) {
       SyscallFailsWithErrno(EISCONN));
 }
 
+TEST_P(TcpSocketTest, ShutdownWriteInTimeWait) {
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  EXPECT_THAT(shutdown(s_, SHUT_RDWR), SyscallSucceeds());
+  absl::SleepFor(absl::Seconds(1));  // Wait to enter TIME_WAIT.
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(TcpSocketTest, ShutdownWriteInFinWait1) {
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  absl::SleepFor(absl::Seconds(1));  // Wait to enter FIN-WAIT2.
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+}
+
 TEST_P(TcpSocketTest, DataCoalesced) {
   char buf[10];
 
-- 
cgit v1.2.3


From 6fa5cee82c0f515b001dee5f3840e1f875b2f477 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Fri, 6 Mar 2020 12:30:37 -0800
Subject: Prevent memory leaks in ilist

When list elements are removed from a list but not discarded, it becomes
important to invalidate the references they hold to their former
neighbors to prevent memory leaks.

PiperOrigin-RevId: 299412421
---
 pkg/ilist/list.go                                |  8 ++++++--
 pkg/sentry/fs/dirent_cache.go                    |  2 --
 pkg/sentry/fs/inotify.go                         |  5 ++++-
 pkg/sentry/kernel/epoll/epoll_state.go           | 13 ++++++++-----
 pkg/tcpip/network/fragmentation/fragmentation.go |  8 +++++---
 5 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go
index f3a609b57..8f93e4d6d 100644
--- a/pkg/ilist/list.go
+++ b/pkg/ilist/list.go
@@ -169,8 +169,9 @@ func (l *List) InsertBefore(a, e Element) {
 
 // Remove removes e from l.
 func (l *List) Remove(e Element) {
-	prev := ElementMapper{}.linkerFor(e).Prev()
-	next := ElementMapper{}.linkerFor(e).Next()
+	linker := ElementMapper{}.linkerFor(e)
+	prev := linker.Prev()
+	next := linker.Next()
 
 	if prev != nil {
 		ElementMapper{}.linkerFor(prev).SetNext(next)
@@ -183,6 +184,9 @@ func (l *List) Remove(e Element) {
 	} else {
 		l.tail = prev
 	}
+
+	linker.SetNext(nil)
+	linker.SetPrev(nil)
 }
 
 // Entry is a default implementation of Linker. Users can add anonymous fields
diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go
index 25514ace4..33de32c69 100644
--- a/pkg/sentry/fs/dirent_cache.go
+++ b/pkg/sentry/fs/dirent_cache.go
@@ -101,8 +101,6 @@ func (c *DirentCache) remove(d *Dirent) {
 		panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d))
 	}
 	c.list.Remove(d)
-	d.SetPrev(nil)
-	d.SetNext(nil)
 	d.DecRef()
 	c.currentSize--
 	if c.limit != nil {
diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go
index 928c90aa0..e3a715c1f 100644
--- a/pkg/sentry/fs/inotify.go
+++ b/pkg/sentry/fs/inotify.go
@@ -143,7 +143,10 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i
 	}
 
 	var writeLen int64
-	for event := i.events.Front(); event != nil; event = event.Next() {
+	for it := i.events.Front(); it != nil; {
+		event := it
+		it = it.Next()
+
 		// Does the buffer have enough remaining space to hold the event we're
 		// about to write out?
 		if dst.NumBytes() < int64(event.sizeOf()) {
diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go
index a0d35d350..8e9f200d0 100644
--- a/pkg/sentry/kernel/epoll/epoll_state.go
+++ b/pkg/sentry/kernel/epoll/epoll_state.go
@@ -38,11 +38,14 @@ func (e *EventPoll) afterLoad() {
 		}
 	}
 
-	for it := e.waitingList.Front(); it != nil; it = it.Next() {
-		if it.id.File.Readiness(it.mask) != 0 {
-			e.waitingList.Remove(it)
-			e.readyList.PushBack(it)
-			it.curList = &e.readyList
+	for it := e.waitingList.Front(); it != nil; {
+		entry := it
+		it = it.Next()
+
+		if entry.id.File.Readiness(entry.mask) != 0 {
+			e.waitingList.Remove(entry)
+			e.readyList.PushBack(entry)
+			entry.curList = &e.readyList
 			e.Notify(waiter.EventIn)
 		}
 	}
diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go
index 92f2aa13a..f42abc4bb 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation.go
@@ -115,10 +115,12 @@ func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buf
 	// Evict reassemblers if we are consuming more memory than highLimit until
 	// we reach lowLimit.
 	if f.size > f.highLimit {
-		tail := f.rList.Back()
-		for f.size > f.lowLimit && tail != nil {
+		for f.size > f.lowLimit {
+			tail := f.rList.Back()
+			if tail == nil {
+				break
+			}
 			f.release(tail)
-			tail = tail.Prev()
 		}
 	}
 	f.mu.Unlock()
-- 
cgit v1.2.3


From 960f6a975b7e44c0efe8fd38c66b02017c4fe137 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 6 Mar 2020 12:58:45 -0800
Subject: Add plumbing for importing fds in VFS2, along with non-socket,
 non-TTY impl.

In VFS2, imported file descriptors are stored in a kernfs-based filesystem.
Upon calling ImportFD, the host fd can be accessed in two ways:
1. a FileDescription that can be added to the FDTable, and
2. a Dentry in the host.filesystem mount, which we will want to access through
magic symlinks in /proc/[pid]/fd/.

An implementation of the kernfs.Inode interface stores a unique host fd. This
inode can be inserted into file descriptions as well as dentries.

This change also plumbs in three FileDescriptionImpls corresponding to fds for
sockets, TTYs, and other files (only the latter is implemented here).
These implementations will mostly make corresponding syscalls to the host.
Where possible, the logic is ported over from pkg/sentry/fs/host.

Updates #1672

PiperOrigin-RevId: 299417263
---
 pkg/sentry/fs/host/util.go                     |  12 +-
 pkg/sentry/fsimpl/host/BUILD                   |  27 +++
 pkg/sentry/fsimpl/host/default_file.go         | 233 ++++++++++++++++++++
 pkg/sentry/fsimpl/host/host.go                 | 286 +++++++++++++++++++++++++
 pkg/sentry/fsimpl/host/util.go                 |  86 ++++++++
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go |   2 +-
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go       |  18 +-
 pkg/sentry/fsimpl/kernfs/filesystem.go         |   6 +-
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go    |  54 ++---
 pkg/sentry/fsimpl/kernfs/kernfs.go             |   7 +-
 pkg/sentry/fsimpl/proc/subtasks.go             |  13 +-
 pkg/sentry/fsimpl/proc/task.go                 |  23 +-
 pkg/sentry/fsimpl/proc/tasks.go                |  19 +-
 pkg/sentry/vfs/BUILD                           |   4 +
 pkg/sentry/vfs/file_description_impl_util.go   |   4 +-
 15 files changed, 731 insertions(+), 63 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/host/BUILD
 create mode 100644 pkg/sentry/fsimpl/host/default_file.go
 create mode 100644 pkg/sentry/fsimpl/host/host.go
 create mode 100644 pkg/sentry/fsimpl/host/util.go

diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go
index e37e687c6..7c60dc1db 100644
--- a/pkg/sentry/fs/host/util.go
+++ b/pkg/sentry/fs/host/util.go
@@ -24,7 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -152,9 +152,9 @@ func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr {
 		Usage:            s.Blocks * 512,
 		Perms:            fs.FilePermsFromMode(linux.FileMode(s.Mode)),
 		Owner:            owner(mo, s),
-		AccessTime:       ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec),
-		ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
-		StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
+		AccessTime:       time.FromUnix(s.Atim.Sec, s.Atim.Nsec),
+		ModificationTime: time.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
+		StatusChangeTime: time.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
 		Links:            uint64(s.Nlink),
 	}
 }
@@ -165,6 +165,8 @@ type dirInfo struct {
 	bufp int    // location of next record in buf.
 }
 
+// LINT.IfChange
+
 // isBlockError unwraps os errors and checks if they are caused by EAGAIN or
 // EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock.
 func isBlockError(err error) bool {
@@ -177,6 +179,8 @@ func isBlockError(err error) bool {
 	return false
 }
 
+// LINT.ThenChange(../../fsimpl/host/util.go)
+
 func hostEffectiveKIDs() (uint32, []uint32, error) {
 	gids, err := os.Getgroups()
 	if err != nil {
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
new file mode 100644
index 000000000..731f192b3
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -0,0 +1,27 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "host",
+    srcs = [
+        "default_file.go",
+        "host.go",
+        "util.go",
+    ],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/safemem",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go
new file mode 100644
index 000000000..172cdb161
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/default_file.go
@@ -0,0 +1,233 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"math"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// defaultFileFD implements FileDescriptionImpl for non-socket, non-TTY files.
+type defaultFileFD struct {
+	fileDescription
+
+	// canMap specifies whether we allow the file to be memory mapped.
+	canMap bool
+
+	// mu protects the fields below.
+	mu sync.Mutex
+
+	// offset specifies the current file offset.
+	offset int64
+}
+
+// TODO(gvisor.dev/issue/1672): Implement Waitable interface.
+
+// PRead implements FileDescriptionImpl.
+func (f *defaultFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if f.inode.isStream {
+		return 0, syserror.ESPIPE
+	}
+
+	return readFromHostFD(ctx, f.inode.hostFD, dst, offset, int(opts.Flags))
+}
+
+// Read implements FileDescriptionImpl.
+func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if f.inode.isStream {
+		// These files can't be memory mapped, assert this.
+		if f.canMap {
+			panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
+		}
+
+		f.mu.Lock()
+		n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags))
+		f.mu.Unlock()
+		if isBlockError(err) {
+			// If we got any data at all, return it as a "completed" partial read
+			// rather than retrying until complete.
+			if n != 0 {
+				err = nil
+			} else {
+				err = syserror.ErrWouldBlock
+			}
+		}
+		return n, err
+	}
+	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+	f.mu.Lock()
+	n, err := readFromHostFD(ctx, f.inode.hostFD, dst, f.offset, int(opts.Flags))
+	f.offset += n
+	f.mu.Unlock()
+	return n, err
+}
+
+func readFromHostFD(ctx context.Context, fd int, dst usermem.IOSequence, offset int64, flags int) (int64, error) {
+	if flags&^(linux.RWF_VALID) != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	reader := safemem.FromVecReaderFunc{
+		func(srcs [][]byte) (int64, error) {
+			n, err := unix.Preadv2(fd, srcs, offset, flags)
+			return int64(n), err
+		},
+	}
+	n, err := dst.CopyOutFrom(ctx, reader)
+	return int64(n), err
+}
+
+// PWrite implements FileDescriptionImpl.
+func (f *defaultFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if f.inode.isStream {
+		return 0, syserror.ESPIPE
+	}
+
+	return writeToHostFD(ctx, f.inode.hostFD, src, offset, int(opts.Flags))
+}
+
+// Write implements FileDescriptionImpl.
+func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if f.inode.isStream {
+		// These files can't be memory mapped, assert this.
+		if f.canMap {
+			panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
+		}
+
+		f.mu.Lock()
+		n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags))
+		f.mu.Unlock()
+		if isBlockError(err) {
+			err = syserror.ErrWouldBlock
+		}
+		return n, err
+	}
+	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+	// TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file.
+	f.mu.Lock()
+	n, err := writeToHostFD(ctx, f.inode.hostFD, src, f.offset, int(opts.Flags))
+	f.offset += n
+	f.mu.Unlock()
+	return n, err
+}
+
+func writeToHostFD(ctx context.Context, fd int, src usermem.IOSequence, offset int64, flags int) (int64, error) {
+	if flags&^(linux.RWF_VALID) != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	writer := safemem.FromVecWriterFunc{
+		func(srcs [][]byte) (int64, error) {
+			n, err := unix.Pwritev2(fd, srcs, offset, flags)
+			return int64(n), err
+		},
+	}
+	n, err := src.CopyInTo(ctx, writer)
+	return int64(n), err
+}
+
+// Seek implements FileDescriptionImpl.
+//
+// Note that we do not support seeking on directories, since we do not even
+// allow directory fds to be imported at all.
+func (f *defaultFileFD) Seek(_ context.Context, offset int64, whence int32) (int64, error) {
+	// TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null.
+	if f.inode.isStream {
+		return 0, syserror.ESPIPE
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		if offset < 0 {
+			return f.offset, syserror.EINVAL
+		}
+		f.offset = offset
+
+	case linux.SEEK_CUR:
+		// Check for overflow. Note that underflow cannot occur, since f.offset >= 0.
+		if offset > math.MaxInt64-f.offset {
+			return f.offset, syserror.EOVERFLOW
+		}
+		if f.offset+offset < 0 {
+			return f.offset, syserror.EINVAL
+		}
+		f.offset += offset
+
+	case linux.SEEK_END:
+		var s syscall.Stat_t
+		if err := syscall.Fstat(f.inode.hostFD, &s); err != nil {
+			return f.offset, err
+		}
+		size := s.Size
+
+		// Check for overflow. Note that underflow cannot occur, since size >= 0.
+		if offset > math.MaxInt64-size {
+			return f.offset, syserror.EOVERFLOW
+		}
+		if size+offset < 0 {
+			return f.offset, syserror.EINVAL
+		}
+		f.offset = size + offset
+
+	case linux.SEEK_DATA, linux.SEEK_HOLE:
+		// Modifying the offset in the host file table should not matter, since
+		// this is the only place where we use it.
+		//
+		// For reading and writing, we always rely on our internal offset.
+		n, err := unix.Seek(f.inode.hostFD, offset, int(whence))
+		if err != nil {
+			return f.offset, err
+		}
+		f.offset = n
+
+	default:
+		// Invalid whence.
+		return f.offset, syserror.EINVAL
+	}
+
+	return f.offset, nil
+}
+
+// Sync implements FileDescriptionImpl.
+func (f *defaultFileFD) Sync(context.Context) error {
+	// TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything.
+	return unix.Fsync(f.inode.hostFD)
+}
+
+// ConfigureMMap implements FileDescriptionImpl.
+func (f *defaultFileFD) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
+	if !f.canMap {
+		return syserror.ENODEV
+	}
+	// TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface.
+	return syserror.ENODEV
+}
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
new file mode 100644
index 000000000..c205e6a0b
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -0,0 +1,286 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package host provides a filesystem implementation for host files imported as
+// file descriptors.
+package host
+
+import (
+	"errors"
+	"fmt"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	kernfs.Filesystem
+}
+
+// ImportFD sets up and returns a vfs.FileDescription from a donated fd.
+func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID, isTTY bool) (*vfs.FileDescription, error) {
+	// Must be importing to a mount of host.filesystem.
+	fs, ok := mnt.Filesystem().Impl().(*filesystem)
+	if !ok {
+		return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl())
+	}
+
+	// Retrieve metadata.
+	var s syscall.Stat_t
+	if err := syscall.Fstat(hostFD, &s); err != nil {
+		return nil, err
+	}
+
+	fileMode := linux.FileMode(s.Mode)
+	fileType := fileMode.FileType()
+	// Pipes, character devices, and sockets can return EWOULDBLOCK for
+	// operations that would block.
+	isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
+
+	i := &inode{
+		hostFD:   hostFD,
+		isStream: isStream,
+		isTTY:    isTTY,
+		ino:      fs.NextIno(),
+		mode:     fileMode,
+		uid:      ownerUID,
+		gid:      ownerGID,
+	}
+
+	d := &kernfs.Dentry{}
+	d.Init(i)
+	// i.open will take a reference on d.
+	defer d.DecRef()
+
+	return i.open(d.VFSDentry(), mnt)
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	// When the reference count reaches zero, the host fd is closed.
+	refs.AtomicRefCount
+
+	// hostFD contains the host fd that this file was originally created from,
+	// which must be available at time of restore.
+	//
+	// This field is initialized at creation time and is immutable.
+	hostFD int
+
+	// isStream is true if the host fd points to a file representing a stream,
+	// e.g. a socket or a pipe. Such files are not seekable and can return
+	// EWOULDBLOCK for I/O operations.
+	//
+	// This field is initialized at creation time and is immutable.
+	isStream bool
+
+	// isTTY is true if this file represents a TTY.
+	//
+	// This field is initialized at creation time and is immutable.
+	isTTY bool
+
+	// ino is an inode number unique within this filesystem.
+	ino uint64
+
+	// mu protects the inode metadata below.
+	mu sync.Mutex
+
+	// mode is the file mode of this inode. Note that this value may become out
+	// of date if the mode is changed on the host, e.g. with chmod.
+	mode linux.FileMode
+
+	// uid and gid of the file owner. Note that these refer to the owner of the
+	// file created on import, not the fd on the host.
+	uid auth.KUID
+	gid auth.KGID
+}
+
+// Note that these flags may become out of date, since they can be modified
+// on the host, e.g. with fcntl.
+func fileFlagsFromHostFD(fd int) (int, error) {
+	flags, err := unix.FcntlInt(uintptr(fd), syscall.F_GETFL, 0)
+	if err != nil {
+		log.Warningf("Failed to get file flags for donated FD %d: %v", fd, err)
+		return 0, err
+	}
+	// TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags.
+	flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND
+	return flags, nil
+}
+
+// CheckPermissions implements kernfs.Inode.
+func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, atx, false /* isDir */, uint16(i.mode), i.uid, i.gid)
+}
+
+// Mode implements kernfs.Inode.
+func (i *inode) Mode() linux.FileMode {
+	return i.mode
+}
+
+// Stat implements kernfs.Inode.
+func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	var s unix.Statx_t
+	if err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(opts.Mask), &s); err != nil {
+		return linux.Statx{}, err
+	}
+	ls := unixToLinuxStatx(s)
+
+	// Use our own internal inode number and file owner.
+	//
+	// TODO(gvisor.dev/issue/1672): Use a kernfs-specific device number as well.
+	// If we use the device number from the host, it may collide with another
+	// sentry-internal device number. We handle device/inode numbers without
+	// relying on the host to prevent collisions.
+	ls.Ino = i.ino
+	ls.UID = uint32(i.uid)
+	ls.GID = uint32(i.gid)
+
+	// Update file mode from the host.
+	i.mode = linux.FileMode(ls.Mode)
+
+	return ls, nil
+}
+
+// SetStat implements kernfs.Inode.
+func (i *inode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
+	s := opts.Stat
+
+	m := s.Mask
+	if m == 0 {
+		return nil
+	}
+	if m&(linux.STATX_UID|linux.STATX_GID) != 0 {
+		return syserror.EPERM
+	}
+	if m&linux.STATX_MODE != 0 {
+		if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil {
+			return err
+		}
+		i.mode = linux.FileMode(s.Mode)
+	}
+	if m&linux.STATX_SIZE != 0 {
+		if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
+			return err
+		}
+	}
+	if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
+		timestamps := []unix.Timespec{
+			toTimespec(s.Atime, m&linux.STATX_ATIME == 0),
+			toTimespec(s.Mtime, m&linux.STATX_MTIME == 0),
+		}
+		if err := unix.UtimesNanoAt(i.hostFD, "", timestamps, unix.AT_EMPTY_PATH); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// DecRef implements kernfs.Inode.
+func (i *inode) DecRef() {
+	i.AtomicRefCount.DecRefWithDestructor(i.Destroy)
+}
+
+// Destroy implements kernfs.Inode.
+func (i *inode) Destroy() {
+	if err := unix.Close(i.hostFD); err != nil {
+		log.Warningf("failed to close host fd %d: %v", i.hostFD, err)
+	}
+}
+
+// Open implements kernfs.Inode.
+func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	return i.open(vfsd, rp.Mount())
+}
+
+func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) {
+
+	fileType := i.mode.FileType()
+	if fileType == syscall.S_IFSOCK {
+		if i.isTTY {
+			return nil, errors.New("cannot use host socket as TTY")
+		}
+		// TODO(gvisor.dev/issue/1672): support importing sockets.
+		return nil, errors.New("importing host sockets not supported")
+	}
+
+	if i.isTTY {
+		// TODO(gvisor.dev/issue/1672): support importing host fd as TTY.
+		return nil, errors.New("importing host fd as TTY not supported")
+	}
+
+	// For simplicity, set offset to 0. Technically, we should
+	// only set to 0 on files that are not seekable (sockets, pipes, etc.),
+	// and use the offset from the host fd otherwise.
+	fd := &defaultFileFD{
+		fileDescription: fileDescription{
+			inode: i,
+		},
+		canMap: canMap(uint32(fileType)),
+		mu:     sync.Mutex{},
+		offset: 0,
+	}
+
+	vfsfd := &fd.vfsfd
+	flags, err := fileFlagsFromHostFD(i.hostFD)
+	if err != nil {
+		return nil, err
+	}
+
+	if err := vfsfd.Init(fd, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return vfsfd, nil
+}
+
+// fileDescription is embedded by host fd implementations of FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+
+	// inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but
+	// cached to reduce indirections and casting. fileDescription does not hold
+	// a reference on the inode through the inode field (since one is already
+	// held via the Dentry).
+	//
+	// inode is immutable after fileDescription creation.
+	inode *inode
+}
+
+// SetStat implements vfs.FileDescriptionImpl.
+func (f *fileDescription) SetStat(_ context.Context, opts vfs.SetStatOptions) error {
+	return f.inode.SetStat(nil, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.
+func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	return f.inode.Stat(nil, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.
+func (f *fileDescription) Release() {
+	// noop
+}
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
new file mode 100644
index 000000000..e1ccacb4d
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -0,0 +1,86 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec {
+	if omit {
+		return unix.Timespec{
+			Sec:  0,
+			Nsec: unix.UTIME_OMIT,
+		}
+	}
+	return unix.Timespec{
+		Sec:  int64(ts.Sec),
+		Nsec: int64(ts.Nsec),
+	}
+}
+
+func unixToLinuxStatx(s unix.Statx_t) linux.Statx {
+	return linux.Statx{
+		Mask:           s.Mask,
+		Blksize:        s.Blksize,
+		Attributes:     s.Attributes,
+		Nlink:          s.Nlink,
+		UID:            s.Uid,
+		GID:            s.Gid,
+		Mode:           s.Mode,
+		Ino:            s.Ino,
+		Size:           s.Size,
+		Blocks:         s.Blocks,
+		AttributesMask: s.Attributes_mask,
+		Atime:          unixToLinuxStatxTimestamp(s.Atime),
+		Btime:          unixToLinuxStatxTimestamp(s.Btime),
+		Ctime:          unixToLinuxStatxTimestamp(s.Ctime),
+		Mtime:          unixToLinuxStatxTimestamp(s.Mtime),
+		RdevMajor:      s.Rdev_major,
+		RdevMinor:      s.Rdev_minor,
+		DevMajor:       s.Dev_major,
+		DevMinor:       s.Dev_minor,
+	}
+}
+
+func unixToLinuxStatxTimestamp(ts unix.StatxTimestamp) linux.StatxTimestamp {
+	return linux.StatxTimestamp{Sec: ts.Sec, Nsec: ts.Nsec}
+}
+
+// wouldBlock returns true for file types that can return EWOULDBLOCK
+// for blocking operations, e.g. pipes, character devices, and sockets.
+func wouldBlock(fileType uint32) bool {
+	return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
+}
+
+// canMap returns true if a file with fileType is allowed to be memory mapped.
+// This is ported over from VFS1, but it's probably not the best way for us
+// to check if a file can be memory mapped.
+func canMap(fileType uint32) bool {
+	// TODO(gvisor.dev/issue/1672): Also allow "special files" to be mapped (see fs/host:canMap()).
+	//
+	// TODO(b/38213152): Some obscure character devices can be mapped.
+	return fileType == syscall.S_IFREG
+}
+
+// isBlockError checks if an error is EAGAIN or EWOULDBLOCK.
+// If so, they can be transformed into syserror.ErrWouldBlock.
+func isBlockError(err error) bool {
+	return err == syserror.EAGAIN || err == syserror.EWOULDBLOCK
+}
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index d092ccb2a..1c026f4d8 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -122,7 +122,7 @@ func (fd *DynamicBytesFD) Release() {}
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return fd.inode.Stat(fs), nil
+	return fd.inode.Stat(fs, opts)
 }
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 5650512e0..da821d524 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -107,9 +107,13 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
 
+	opts := vfs.StatOptions{Mask: linux.STATX_INO}
 	// Handle ".".
 	if fd.off == 0 {
-		stat := fd.inode().Stat(vfsFS)
+		stat, err := fd.inode().Stat(vfsFS, opts)
+		if err != nil {
+			return err
+		}
 		dirent := vfs.Dirent{
 			Name:    ".",
 			Type:    linux.DT_DIR,
@@ -125,7 +129,10 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	// Handle "..".
 	if fd.off == 1 {
 		parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode
-		stat := parentInode.Stat(vfsFS)
+		stat, err := parentInode.Stat(vfsFS, opts)
+		if err != nil {
+			return err
+		}
 		dirent := vfs.Dirent{
 			Name:    "..",
 			Type:    linux.FileMode(stat.Mode).DirentType(),
@@ -146,7 +153,10 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	childIdx := fd.off - 2
 	for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
 		inode := it.Dentry.Impl().(*Dentry).inode
-		stat := inode.Stat(vfsFS)
+		stat, err := inode.Stat(vfsFS, opts)
+		if err != nil {
+			return err
+		}
 		dirent := vfs.Dirent{
 			Name:    it.Name,
 			Type:    linux.FileMode(stat.Mode).DirentType(),
@@ -190,7 +200,7 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
 func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := fd.filesystem()
 	inode := fd.inode()
-	return inode.Stat(fs), nil
+	return inode.Stat(fs, opts)
 }
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 292f58afd..1d7e04ad4 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// This file implements vfs.FilesystemImpl for kernfs.
-
 package kernfs
 
+// This file implements vfs.FilesystemImpl for kernfs.
+
 import (
 	"fmt"
 
@@ -634,7 +634,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	if err != nil {
 		return linux.Statx{}, err
 	}
-	return inode.Stat(fs.VFSFilesystem()), nil
+	return inode.Stat(fs.VFSFilesystem(), opts)
 }
 
 // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 099d70a16..d50018b18 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -36,20 +36,20 @@ type InodeNoopRefCount struct {
 }
 
 // IncRef implements Inode.IncRef.
-func (n *InodeNoopRefCount) IncRef() {
+func (InodeNoopRefCount) IncRef() {
 }
 
 // DecRef implements Inode.DecRef.
-func (n *InodeNoopRefCount) DecRef() {
+func (InodeNoopRefCount) DecRef() {
 }
 
 // TryIncRef implements Inode.TryIncRef.
-func (n *InodeNoopRefCount) TryIncRef() bool {
+func (InodeNoopRefCount) TryIncRef() bool {
 	return true
 }
 
 // Destroy implements Inode.Destroy.
-func (n *InodeNoopRefCount) Destroy() {
+func (InodeNoopRefCount) Destroy() {
 }
 
 // InodeDirectoryNoNewChildren partially implements the Inode interface.
@@ -58,27 +58,27 @@ func (n *InodeNoopRefCount) Destroy() {
 type InodeDirectoryNoNewChildren struct{}
 
 // NewFile implements Inode.NewFile.
-func (*InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
 // NewDir implements Inode.NewDir.
-func (*InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
 // NewLink implements Inode.NewLink.
-func (*InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
 // NewSymlink implements Inode.NewSymlink.
-func (*InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
 // NewNode implements Inode.NewNode.
-func (*InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
@@ -90,62 +90,62 @@ type InodeNotDirectory struct {
 }
 
 // HasChildren implements Inode.HasChildren.
-func (*InodeNotDirectory) HasChildren() bool {
+func (InodeNotDirectory) HasChildren() bool {
 	return false
 }
 
 // NewFile implements Inode.NewFile.
-func (*InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
 	panic("NewFile called on non-directory inode")
 }
 
 // NewDir implements Inode.NewDir.
-func (*InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
 	panic("NewDir called on non-directory inode")
 }
 
 // NewLink implements Inode.NewLinkink.
-func (*InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
 	panic("NewLink called on non-directory inode")
 }
 
 // NewSymlink implements Inode.NewSymlink.
-func (*InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
 	panic("NewSymlink called on non-directory inode")
 }
 
 // NewNode implements Inode.NewNode.
-func (*InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
 	panic("NewNode called on non-directory inode")
 }
 
 // Unlink implements Inode.Unlink.
-func (*InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
 	panic("Unlink called on non-directory inode")
 }
 
 // RmDir implements Inode.RmDir.
-func (*InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
 	panic("RmDir called on non-directory inode")
 }
 
 // Rename implements Inode.Rename.
-func (*InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
 	panic("Rename called on non-directory inode")
 }
 
 // Lookup implements Inode.Lookup.
-func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
 	panic("Lookup called on non-directory inode")
 }
 
 // IterDirents implements Inode.IterDirents.
-func (*InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
 	panic("IterDirents called on non-directory inode")
 }
 
 // Valid implements Inode.Valid.
-func (*InodeNotDirectory) Valid(context.Context) bool {
+func (InodeNotDirectory) Valid(context.Context) bool {
 	return true
 }
 
@@ -157,17 +157,17 @@ func (*InodeNotDirectory) Valid(context.Context) bool {
 type InodeNoDynamicLookup struct{}
 
 // Lookup implements Inode.Lookup.
-func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
 	return nil, syserror.ENOENT
 }
 
 // IterDirents implements Inode.IterDirents.
-func (*InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
 	return offset, nil
 }
 
 // Valid implements Inode.Valid.
-func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool {
+func (InodeNoDynamicLookup) Valid(ctx context.Context) bool {
 	return true
 }
 
@@ -177,7 +177,7 @@ func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool {
 type InodeNotSymlink struct{}
 
 // Readlink implements Inode.Readlink.
-func (*InodeNotSymlink) Readlink(context.Context) (string, error) {
+func (InodeNotSymlink) Readlink(context.Context) (string, error) {
 	return "", syserror.EINVAL
 }
 
@@ -219,7 +219,7 @@ func (a *InodeAttrs) Mode() linux.FileMode {
 // Stat partially implements Inode.Stat. Note that this function doesn't provide
 // all the stat fields, and the embedder should consider extending the result
 // with filesystem-specific fields.
-func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx {
+func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
 	var stat linux.Statx
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
 	stat.Ino = atomic.LoadUint64(&a.ino)
@@ -230,7 +230,7 @@ func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx {
 
 	// TODO: Implement other stat fields like timestamps.
 
-	return stat
+	return stat, nil
 }
 
 // SetStat implements Inode.SetStat.
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index c74fa999b..a8ab2a2ba 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -176,8 +176,6 @@ type Dentry struct {
 	vfsd  vfs.Dentry
 	inode Inode
 
-	refs uint64
-
 	// flags caches useful information about the dentry from the inode. See the
 	// dflags* consts above. Must be accessed by atomic ops.
 	flags uint32
@@ -302,7 +300,8 @@ type Inode interface {
 	// this inode. The returned file description should hold a reference on the
 	// inode for its lifetime.
 	//
-	// Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry.
+	// Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing
+	// the inode on which Open() is being called.
 	Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
 }
 
@@ -328,7 +327,7 @@ type inodeMetadata interface {
 
 	// Stat returns the metadata for this inode. This corresponds to
 	// vfs.FilesystemImpl.StatAt.
-	Stat(fs *vfs.Filesystem) linux.Statx
+	Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
 
 	// SetStat updates the metadata for this inode. This corresponds to
 	// vfs.FilesystemImpl.SetStatAt.
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index f3f4e49b4..611645f3f 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -121,8 +121,13 @@ func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.O
 }
 
 // Stat implements kernfs.Inode.
-func (i *subtasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx {
-	stat := i.InodeAttrs.Stat(vsfs)
-	stat.Nlink += uint32(i.task.ThreadGroup().Count())
-	return stat
+func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.InodeAttrs.Stat(vsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	if opts.Mask&linux.STATX_NLINK != 0 {
+		stat.Nlink += uint32(i.task.ThreadGroup().Count())
+	}
+	return stat, nil
 }
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 18e5cd6f6..c0d643f51 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -154,12 +154,21 @@ func newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, childre
 }
 
 // Stat implements kernfs.Inode.
-func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx {
-	stat := i.Inode.Stat(fs)
-	uid, gid := i.getOwner(linux.FileMode(stat.Mode))
-	stat.UID = uint32(uid)
-	stat.GID = uint32(gid)
-	return stat
+func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.Inode.Stat(fs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	if opts.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 {
+		uid, gid := i.getOwner(linux.FileMode(stat.Mode))
+		if opts.Mask&linux.STATX_UID != 0 {
+			stat.UID = uint32(uid)
+		}
+		if opts.Mask&linux.STATX_GID != 0 {
+			stat.GID = uint32(gid)
+		}
+	}
+	return stat, nil
 }
 
 // CheckPermissions implements kernfs.Inode.
@@ -236,7 +245,7 @@ func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentr
 // member, there is one entry containing three colon-separated fields:
 //   hierarchy-ID:controller-list:cgroup-path"
 func newCgroupData(controllers map[string]string) dynamicInode {
-	buf := bytes.Buffer{}
+	var buf bytes.Buffer
 
 	// The hierarchy ids must be positive integers (for cgroup v1), but the
 	// exact number does not matter, so long as they are unique. We can
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 10c08fa90..b1e39c82f 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -211,17 +211,22 @@ func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.Open
 	return fd.VFSFileDescription(), nil
 }
 
-func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx {
-	stat := i.InodeAttrs.Stat(vsfs)
+func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.InodeAttrs.Stat(vsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
 
-	// Add dynamic children to link count.
-	for _, tg := range i.pidns.ThreadGroups() {
-		if leader := tg.Leader(); leader != nil {
-			stat.Nlink++
+	if opts.Mask&linux.STATX_NLINK != 0 {
+		// Add dynamic children to link count.
+		for _, tg := range i.pidns.ThreadGroups() {
+			if leader := tg.Leader(); leader != nil {
+				stat.Nlink++
+			}
 		}
 	}
 
-	return stat
+	return stat, nil
 }
 
 func cpuInfoData(k *kernel.Kernel) string {
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 07c8383e6..cb4deb068 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -42,10 +42,13 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/fd",
         "//pkg/fspath",
         "//pkg/gohacks",
         "//pkg/log",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fs",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
@@ -53,6 +56,7 @@ go_library(
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
 
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index c2a52ec1b..45191d1c3 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -33,8 +33,8 @@ import (
 // implementations to adapt:
 //   - Have a local fileDescription struct (containing FileDescription) which
 //     embeds FileDescriptionDefaultImpl and overrides the default methods
-//     which are common to all fd implementations for that for that filesystem
-//     like StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
+//     which are common to all fd implementations for that filesystem like
+//     StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
 //   - This should be embedded in all file description implementations as the
 //     first field by value.
 //   - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl.
-- 
cgit v1.2.3


From 228813fd26aadea012cd8c39e084e4b0fb23d273 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 6 Mar 2020 15:22:30 -0800
Subject: Update comments and debug level for profiling options.

PiperOrigin-RevId: 299448307
---
 pkg/sentry/control/pprof.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go
index 5d1907c0e..663e51989 100644
--- a/pkg/sentry/control/pprof.go
+++ b/pkg/sentry/control/pprof.go
@@ -117,15 +117,15 @@ func (p *Profile) HeapProfile(o *ProfileOpts, _ *struct{}) error {
 	return nil
 }
 
-// GoroutineProfile is an RPC stub which dumps out the stack trace for all running
-// goroutines.
+// GoroutineProfile is an RPC stub which dumps out the stack trace for all
+// running goroutines.
 func (p *Profile) GoroutineProfile(o *ProfileOpts, _ *struct{}) error {
 	if len(o.FilePayload.Files) < 1 {
 		return errNoOutput
 	}
 	output := o.FilePayload.Files[0]
 	defer output.Close()
-	if err := pprof.Lookup("goroutine").WriteTo(output, 0); err != nil {
+	if err := pprof.Lookup("goroutine").WriteTo(output, 2); err != nil {
 		return err
 	}
 	return nil
-- 
cgit v1.2.3


From b23999f3e4e8f5950d650e3951b19b9c0b298cbc Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 6 Mar 2020 17:00:48 -0800
Subject: Fix runsc permissions for "dev.sh --refresh"

PiperOrigin-RevId: 299466906
---
 scripts/dev.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/dev.sh b/scripts/dev.sh
index 6238b4d0b..a9107f33e 100755
--- a/scripts/dev.sh
+++ b/scripts/dev.sh
@@ -66,6 +66,7 @@ if [[ ${REFRESH} -eq 0 ]]; then
 else
   mkdir -p "$(dirname ${RUNSC_BIN})"
   cp -f ${OUTPUT} "${RUNSC_BIN}"
+  chmod a+rx "${RUNSC_BIN}"
 
   echo
   echo "Runtime ${RUNTIME} refreshed."
-- 
cgit v1.2.3


From bc319d29e15ad43086fe15a7c0cc25d358daf339 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sat, 7 Mar 2020 02:31:42 +0000
Subject: Bump puma from 3.12.2 to 3.12.4 in /benchmarks/workloads/ruby

Bumps [puma](https://github.com/puma/puma) from 3.12.2 to 3.12.4.
- [Release notes](https://github.com/puma/puma/releases)
- [Changelog](https://github.com/puma/puma/blob/master/History.md)
- [Commits](https://github.com/puma/puma/compare/v3.12.2...v3.12.4)

Signed-off-by: dependabot[bot] <support@github.com>
---
 benchmarks/workloads/ruby/Gemfile.lock | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/workloads/ruby/Gemfile.lock b/benchmarks/workloads/ruby/Gemfile.lock
index 17ebcbec3..ea9f0ea85 100644
--- a/benchmarks/workloads/ruby/Gemfile.lock
+++ b/benchmarks/workloads/ruby/Gemfile.lock
@@ -29,7 +29,7 @@ GEM
     prawn (2.2.2)
       pdf-core (~> 0.7.0)
       ttfunk (~> 1.5)
-    puma (3.12.2)
+    puma (3.12.4)
     rack (2.2.2)
     rack-protection (2.0.5)
       rack
-- 
cgit v1.2.3


From c04958e2fa456587277baef361868bddc0df9e49 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Mon, 13 Jan 2020 07:44:58 +0000
Subject: Enable thread local storage support on arm64.

Linux use the task.thread.uw.tp_value field to store the
TLS pointer on arm64 platform, and we use a similar way
in gvisor to store it in the arch/State struct.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: Ie76b5c6d109bc27ccfd594008a96753806db7764
---
 pkg/abi/linux/elf.go                              |  3 ++
 pkg/sentry/arch/arch_aarch64.go                   |  5 ++
 pkg/sentry/arch/arch_arm64.go                     | 13 ++---
 pkg/sentry/platform/ptrace/BUILD                  |  1 +
 pkg/sentry/platform/ptrace/ptrace_amd64.go        | 14 +++++
 pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go | 62 +++++++++++++++++++++++
 pkg/sentry/platform/ptrace/subprocess.go          | 12 +++++
 7 files changed, 104 insertions(+), 6 deletions(-)
 create mode 100644 pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go

diff --git a/pkg/abi/linux/elf.go b/pkg/abi/linux/elf.go
index 40f0459a0..7c9a02f20 100644
--- a/pkg/abi/linux/elf.go
+++ b/pkg/abi/linux/elf.go
@@ -102,4 +102,7 @@ const (
 
 	// NT_X86_XSTATE is for x86 extended state using xsave.
 	NT_X86_XSTATE = 0x202
+
+	// NT_ARM_TLS is for ARM TLS register.
+	NT_ARM_TLS = 0x401
 )
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index 5053393c1..b998f84fc 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -95,6 +95,9 @@ type State struct {
 	// Our floating point state.
 	aarch64FPState `state:"wait"`
 
+	// TLS pointer
+	TPValue uint64
+
 	// FeatureSet is a pointer to the currently active feature set.
 	FeatureSet *cpuid.FeatureSet
 }
@@ -145,6 +148,7 @@ func (s *State) Fork() State {
 	return State{
 		Regs:           s.Regs,
 		aarch64FPState: s.aarch64FPState.fork(),
+		TPValue:        s.TPValue,
 		FeatureSet:     s.FeatureSet,
 	}
 }
@@ -255,6 +259,7 @@ func (s *State) PtraceSetFPRegs(src io.Reader) (int, error) {
 const (
 	_NT_PRSTATUS = 1
 	_NT_PRFPREG  = 2
+	_NT_ARM_TLS  = 0x401
 )
 
 // PtraceGetRegSet implements Context.PtraceGetRegSet.
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index 885115ae2..db99c5acb 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -140,16 +140,17 @@ func (c *context64) SetStack(value uintptr) {
 
 // TLS returns the current TLS pointer.
 func (c *context64) TLS() uintptr {
-	// TODO(gvisor.dev/issue/1238): TLS is not supported.
-	// MRS_TPIDR_EL0
-	return 0
+	return uintptr(c.TPValue)
 }
 
 // SetTLS sets the current TLS pointer. Returns false if value is invalid.
 func (c *context64) SetTLS(value uintptr) bool {
-	// TODO(gvisor.dev/issue/1238): TLS is not supported.
-	// MSR_TPIDR_EL0
-	return false
+	if value >= uintptr(maxAddr64) {
+		return false
+	}
+
+	c.TPValue = uint64(value)
+	return true
 }
 
 // SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index 95abd321e..30402c2df 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -9,6 +9,7 @@ go_library(
         "ptrace.go",
         "ptrace_amd64.go",
         "ptrace_arm64.go",
+        "ptrace_arm64_unsafe.go",
         "ptrace_unsafe.go",
         "stub_amd64.s",
         "stub_arm64.s",
diff --git a/pkg/sentry/platform/ptrace/ptrace_amd64.go b/pkg/sentry/platform/ptrace/ptrace_amd64.go
index db0212538..24fc5dc62 100644
--- a/pkg/sentry/platform/ptrace/ptrace_amd64.go
+++ b/pkg/sentry/platform/ptrace/ptrace_amd64.go
@@ -31,3 +31,17 @@ func fpRegSet(useXsave bool) uintptr {
 func stackPointer(r *syscall.PtraceRegs) uintptr {
 	return uintptr(r.Rsp)
 }
+
+// x86 use the fs_base register to store the TLS pointer which can be
+// get/set in "func (t *thread) get/setRegs(regs *syscall.PtraceRegs)".
+// So both of the get/setTLS() operations are noop here.
+
+// getTLS gets the thread local storage register.
+func (t *thread) getTLS(tls *uint64) error {
+	return nil
+}
+
+// setTLS sets the thread local storage register.
+func (t *thread) setTLS(tls *uint64) error {
+	return nil
+}
diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go
new file mode 100644
index 000000000..32b8a6be9
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go
@@ -0,0 +1,62 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package ptrace
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+// getTLS gets the thread local storage register.
+func (t *thread) getTLS(tls *uint64) error {
+	iovec := syscall.Iovec{
+		Base: (*byte)(unsafe.Pointer(tls)),
+		Len:  uint64(unsafe.Sizeof(*tls)),
+	}
+	_, _, errno := syscall.RawSyscall6(
+		syscall.SYS_PTRACE,
+		syscall.PTRACE_GETREGSET,
+		uintptr(t.tid),
+		linux.NT_ARM_TLS,
+		uintptr(unsafe.Pointer(&iovec)),
+		0, 0)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+// setTLS sets the thread local storage register.
+func (t *thread) setTLS(tls *uint64) error {
+	iovec := syscall.Iovec{
+		Base: (*byte)(unsafe.Pointer(tls)),
+		Len:  uint64(unsafe.Sizeof(*tls)),
+	}
+	_, _, errno := syscall.RawSyscall6(
+		syscall.SYS_PTRACE,
+		syscall.PTRACE_SETREGSET,
+		uintptr(t.tid),
+		linux.NT_ARM_TLS,
+		uintptr(unsafe.Pointer(&iovec)),
+		0, 0)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 31b7cec53..a644609ef 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -506,6 +506,9 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 	regs := &ac.StateData().Regs
 	t.resetSysemuRegs(regs)
 
+	// Extract TLS register
+	tls := uint64(ac.TLS())
+
 	// Check for interrupts, and ensure that future interrupts will signal t.
 	if !c.interrupt.Enable(t) {
 		// Pending interrupt; simulate.
@@ -526,6 +529,9 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 	if err := t.setFPRegs(fpState, uint64(fpLen), useXsave); err != nil {
 		panic(fmt.Sprintf("ptrace set fpregs (%+v) failed: %v", fpState, err))
 	}
+	if err := t.setTLS(&tls); err != nil {
+		panic(fmt.Sprintf("ptrace set tls (%+v) failed: %v", tls, err))
+	}
 
 	for {
 		// Start running until the next system call.
@@ -555,6 +561,12 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 		if err := t.getFPRegs(fpState, uint64(fpLen), useXsave); err != nil {
 			panic(fmt.Sprintf("ptrace get fpregs failed: %v", err))
 		}
+		if err := t.getTLS(&tls); err != nil {
+			panic(fmt.Sprintf("ptrace get tls failed: %v", err))
+		}
+		if !ac.SetTLS(uintptr(tls)) {
+			panic(fmt.Sprintf("tls value %v is invalid", tls))
+		}
 
 		// Is it a system call?
 		if sig == (syscallEvent | syscall.SIGTRAP) {
-- 
cgit v1.2.3


From 2446161b3faa352bf28dc83e338f10967f0224c2 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 9 Mar 2020 11:52:32 -0700
Subject: perf/signal: rewrite code in assembly to avoid compiler optimizations

Without this change, the assembly code of this test compiled without
optimizations:

mov    -0x150(%rbp),%rax
movl   $0x77777777,(%rax)
lea    -0x128(%rbp),%rax

with optimizations:

movl   $0x77777777,0x0

This code doesn't work properly, because the test changes rax in the segv
handler.

PiperOrigin-RevId: 299896117
---
 test/perf/linux/signal_benchmark.cc | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/test/perf/linux/signal_benchmark.cc b/test/perf/linux/signal_benchmark.cc
index a6928df58..cec679191 100644
--- a/test/perf/linux/signal_benchmark.cc
+++ b/test/perf/linux/signal_benchmark.cc
@@ -43,11 +43,13 @@ void BM_FaultSignalFixup(benchmark::State& state) {
 
   // Fault, fault, fault.
   for (auto _ : state) {
-    register volatile unsigned int* ptr asm("rax");
-
     // Trigger the segfault.
-    ptr = nullptr;
-    *ptr = 0;
+    asm volatile(
+        "movq $0, %%rax\n"
+        "movq $0x77777777, (%%rax)\n"
+        :
+        :
+        : "rax");
   }
 }
 
-- 
cgit v1.2.3


From b36de6e7be0542b410901d3cbcd1b3c0fc493cf5 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Mon, 9 Mar 2020 19:57:35 -0700
Subject: Move /proc/net to /proc/PID/net, and make /proc/net ->
 /proc/self/net.

Issue #1833

PiperOrigin-RevId: 299998105
---
 pkg/sentry/fs/proc/net.go            |  53 +--
 pkg/sentry/fs/proc/proc.go           |   2 +-
 pkg/sentry/fs/proc/task.go           |   1 +
 pkg/sentry/fsimpl/proc/BUILD         |   2 +-
 pkg/sentry/fsimpl/proc/task.go       |   1 +
 pkg/sentry/fsimpl/proc/task_net.go   | 790 +++++++++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/proc/tasks.go      |   2 +-
 pkg/sentry/fsimpl/proc/tasks_net.go  | 787 ----------------------------------
 pkg/sentry/fsimpl/proc/tasks_test.go |   3 +-
 test/syscalls/linux/proc_net.cc      |  25 ++
 10 files changed, 849 insertions(+), 817 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/proc/task_net.go
 delete mode 100644 pkg/sentry/fsimpl/proc/tasks_net.go

diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 95d5817ff..bd18177d4 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -40,47 +40,48 @@ import (
 
 // LINT.IfChange
 
-// newNet creates a new proc net entry.
-func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode {
+// newNetDir creates a new proc net entry.
+func newNetDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	k := t.Kernel()
+
 	var contents map[string]*fs.Inode
-	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
-	// network namespace of the calling process. We should make this per-process,
-	// a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net.
-	if s := p.k.RootNetworkNamespace().Stack(); s != nil {
+	if s := t.NetworkNamespace().Stack(); s != nil {
+		// TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task
+		// network namespace.
 		contents = map[string]*fs.Inode{
-			"dev":  seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc),
-			"snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc),
+			"dev":  seqfile.NewSeqFileInode(t, &netDev{s: s}, msrc),
+			"snmp": seqfile.NewSeqFileInode(t, &netSnmp{s: s}, msrc),
 
 			// The following files are simple stubs until they are
 			// implemented in netstack, if the file contains a
 			// header the stub is just the header otherwise it is
 			// an empty file.
-			"arp": newStaticProcInode(ctx, msrc, []byte("IP address       HW type     Flags       HW address            Mask     Device\n")),
+			"arp": newStaticProcInode(t, msrc, []byte("IP address       HW type     Flags       HW address            Mask     Device\n")),
 
-			"netlink":   newStaticProcInode(ctx, msrc, []byte("sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n")),
-			"netstat":   newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")),
-			"packet":    newStaticProcInode(ctx, msrc, []byte("sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n")),
-			"protocols": newStaticProcInode(ctx, msrc, []byte("protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")),
+			"netlink":   newStaticProcInode(t, msrc, []byte("sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n")),
+			"netstat":   newStaticProcInode(t, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")),
+			"packet":    newStaticProcInode(t, msrc, []byte("sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n")),
+			"protocols": newStaticProcInode(t, msrc, []byte("protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")),
 			// Linux sets psched values to: nsec per usec, psched
 			// tick in ns, 1000000, high res timer ticks per sec
 			// (ClockGetres returns 1ns resolution).
-			"psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))),
-			"ptype":  newStaticProcInode(ctx, msrc, []byte("Type Device      Function\n")),
-			"route":  seqfile.NewSeqFileInode(ctx, &netRoute{s: s}, msrc),
-			"tcp":    seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc),
-			"udp":    seqfile.NewSeqFileInode(ctx, &netUDP{k: k}, msrc),
-			"unix":   seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc),
+			"psched": newStaticProcInode(t, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))),
+			"ptype":  newStaticProcInode(t, msrc, []byte("Type Device      Function\n")),
+			"route":  seqfile.NewSeqFileInode(t, &netRoute{s: s}, msrc),
+			"tcp":    seqfile.NewSeqFileInode(t, &netTCP{k: k}, msrc),
+			"udp":    seqfile.NewSeqFileInode(t, &netUDP{k: k}, msrc),
+			"unix":   seqfile.NewSeqFileInode(t, &netUnix{k: k}, msrc),
 		}
 
 		if s.SupportsIPv6() {
-			contents["if_inet6"] = seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc)
-			contents["ipv6_route"] = newStaticProcInode(ctx, msrc, []byte(""))
-			contents["tcp6"] = seqfile.NewSeqFileInode(ctx, &netTCP6{k: k}, msrc)
-			contents["udp6"] = newStaticProcInode(ctx, msrc, []byte("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n"))
+			contents["if_inet6"] = seqfile.NewSeqFileInode(t, &ifinet6{s: s}, msrc)
+			contents["ipv6_route"] = newStaticProcInode(t, msrc, []byte(""))
+			contents["tcp6"] = seqfile.NewSeqFileInode(t, &netTCP6{k: k}, msrc)
+			contents["udp6"] = newStaticProcInode(t, msrc, []byte("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n"))
 		}
 	}
-	d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
-	return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
+	d := ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
+	return newProcInode(t, d, msrc, fs.SpecialDirectory, t)
 }
 
 // ifinet6 implements seqfile.SeqSource for /proc/net/if_inet6.
@@ -837,4 +838,4 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
 	return data, 0
 }
 
-// LINT.ThenChange(../../fsimpl/proc/tasks_net.go)
+// LINT.ThenChange(../../fsimpl/proc/task_net.go)
diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go
index c8abb5052..c659224a7 100644
--- a/pkg/sentry/fs/proc/proc.go
+++ b/pkg/sentry/fs/proc/proc.go
@@ -70,6 +70,7 @@ func New(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string
 		"loadavg":     seqfile.NewSeqFileInode(ctx, &loadavgData{}, msrc),
 		"meminfo":     seqfile.NewSeqFileInode(ctx, &meminfoData{k}, msrc),
 		"mounts":      newProcInode(ctx, ramfs.NewSymlink(ctx, fs.RootOwner, "self/mounts"), msrc, fs.Symlink, nil),
+		"net":         newProcInode(ctx, ramfs.NewSymlink(ctx, fs.RootOwner, "self/net"), msrc, fs.Symlink, nil),
 		"self":        newSelf(ctx, pidns, msrc),
 		"stat":        seqfile.NewSeqFileInode(ctx, &statData{k}, msrc),
 		"thread-self": newThreadSelf(ctx, pidns, msrc),
@@ -86,7 +87,6 @@ func New(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string
 	}
 
 	// Add more contents that need proc to be initialized.
-	p.AddChild(ctx, "net", p.newNetDir(ctx, k, msrc))
 	p.AddChild(ctx, "sys", p.newSysDir(ctx, msrc))
 
 	return newProcInode(ctx, p, msrc, fs.SpecialDirectory, nil), nil
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 4e9b0fc00..03cc788c8 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -84,6 +84,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bo
 		"maps":          newMaps(t, msrc),
 		"mountinfo":     seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
 		"mounts":        seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
+		"net":           newNetDir(t, msrc),
 		"ns":            newNamespaceDir(t, msrc),
 		"oom_score":     newOOMScore(t, msrc),
 		"oom_score_adj": newOOMScoreAdj(t, msrc),
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index a83245866..bb609a305 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -9,9 +9,9 @@ go_library(
         "subtasks.go",
         "task.go",
         "task_files.go",
+        "task_net.go",
         "tasks.go",
         "tasks_files.go",
-        "tasks_net.go",
         "tasks_sys.go",
     ],
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index c0d643f51..493acbd1b 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -57,6 +57,7 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames
 		"maps":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}),
 		//"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
 		//"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
+		"net": newTaskNetDir(task, inoGen),
 		"ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{
 			"net":  newNamespaceSymlink(task, inoGen.NextIno(), "net"),
 			"pid":  newNamespaceSymlink(task, inoGen.NextIno(), "pid"),
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
new file mode 100644
index 000000000..373a7b17d
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -0,0 +1,790 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"reflect"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func newTaskNetDir(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry {
+	k := task.Kernel()
+	pidns := task.PIDNamespace()
+	root := auth.NewRootCredentials(pidns.UserNamespace())
+
+	var contents map[string]*kernfs.Dentry
+	if stack := task.NetworkNamespace().Stack(); stack != nil {
+		const (
+			arp       = "IP address       HW type     Flags       HW address            Mask     Device\n"
+			netlink   = "sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n"
+			packet    = "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n"
+			protocols = "protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"
+			ptype     = "Type Device      Function\n"
+			upd6      = "  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n"
+		)
+		psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond))
+
+		// TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task
+		// network namespace.
+		contents = map[string]*kernfs.Dentry{
+			"dev":  newDentry(root, inoGen.NextIno(), 0444, &netDevData{stack: stack}),
+			"snmp": newDentry(root, inoGen.NextIno(), 0444, &netSnmpData{stack: stack}),
+
+			// The following files are simple stubs until they are implemented in
+			// netstack, if the file contains a header the stub is just the header
+			// otherwise it is an empty file.
+			"arp":       newDentry(root, inoGen.NextIno(), 0444, newStaticFile(arp)),
+			"netlink":   newDentry(root, inoGen.NextIno(), 0444, newStaticFile(netlink)),
+			"netstat":   newDentry(root, inoGen.NextIno(), 0444, &netStatData{}),
+			"packet":    newDentry(root, inoGen.NextIno(), 0444, newStaticFile(packet)),
+			"protocols": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(protocols)),
+
+			// Linux sets psched values to: nsec per usec, psched tick in ns, 1000000,
+			// high res timer ticks per sec (ClockGetres returns 1ns resolution).
+			"psched": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(psched)),
+			"ptype":  newDentry(root, inoGen.NextIno(), 0444, newStaticFile(ptype)),
+			"route":  newDentry(root, inoGen.NextIno(), 0444, &netRouteData{stack: stack}),
+			"tcp":    newDentry(root, inoGen.NextIno(), 0444, &netTCPData{kernel: k}),
+			"udp":    newDentry(root, inoGen.NextIno(), 0444, &netUDPData{kernel: k}),
+			"unix":   newDentry(root, inoGen.NextIno(), 0444, &netUnixData{kernel: k}),
+		}
+
+		if stack.SupportsIPv6() {
+			contents["if_inet6"] = newDentry(root, inoGen.NextIno(), 0444, &ifinet6{stack: stack})
+			contents["ipv6_route"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(""))
+			contents["tcp6"] = newDentry(root, inoGen.NextIno(), 0444, &netTCP6Data{kernel: k})
+			contents["udp6"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(upd6))
+		}
+	}
+
+	return newTaskOwnedDir(task, inoGen.NextIno(), 0555, contents)
+}
+
+// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6.
+//
+// +stateify savable
+type ifinet6 struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*ifinet6)(nil)
+
+func (n *ifinet6) contents() []string {
+	var lines []string
+	nics := n.stack.Interfaces()
+	for id, naddrs := range n.stack.InterfaceAddrs() {
+		nic, ok := nics[id]
+		if !ok {
+			// NIC was added after NICNames was called. We'll just ignore it.
+			continue
+		}
+
+		for _, a := range naddrs {
+			// IPv6 only.
+			if a.Family != linux.AF_INET6 {
+				continue
+			}
+
+			// Fields:
+			// IPv6 address displayed in 32 hexadecimal chars without colons
+			// Netlink device number (interface index) in hexadecimal (use nic id)
+			// Prefix length in hexadecimal
+			// Scope value (use 0)
+			// Interface flags
+			// Device name
+			lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name))
+		}
+	}
+	return lines
+}
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	for _, l := range n.contents() {
+		buf.WriteString(l)
+	}
+	return nil
+}
+
+// netDevData implements vfs.DynamicBytesSource for /proc/net/dev.
+//
+// +stateify savable
+type netDevData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netDevData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *netDevData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	interfaces := n.stack.Interfaces()
+	buf.WriteString("Inter-|   Receive                                                |  Transmit\n")
+	buf.WriteString(" face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed\n")
+
+	for _, i := range interfaces {
+		// Implements the same format as
+		// net/core/net-procfs.c:dev_seq_printf_stats.
+		var stats inet.StatDev
+		if err := n.stack.Statistics(&stats, i.Name); err != nil {
+			log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err)
+			continue
+		}
+		fmt.Fprintf(
+			buf,
+			"%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n",
+			i.Name,
+			// Received
+			stats[0], // bytes
+			stats[1], // packets
+			stats[2], // errors
+			stats[3], // dropped
+			stats[4], // fifo
+			stats[5], // frame
+			stats[6], // compressed
+			stats[7], // multicast
+			// Transmitted
+			stats[8],  // bytes
+			stats[9],  // packets
+			stats[10], // errors
+			stats[11], // dropped
+			stats[12], // fifo
+			stats[13], // frame
+			stats[14], // compressed
+			stats[15], // multicast
+		)
+	}
+
+	return nil
+}
+
+// netUnixData implements vfs.DynamicBytesSource for /proc/net/unix.
+//
+// +stateify savable
+type netUnixData struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netUnixData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("Num       RefCount Protocol Flags    Type St Inode Path\n")
+	for _, se := range n.kernel.ListSockets() {
+		s := se.Sock.Get()
+		if s == nil {
+			log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
+			continue
+		}
+		sfile := s.(*fs.File)
+		if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
+			s.DecRef()
+			// Not a unix socket.
+			continue
+		}
+		sops := sfile.FileOperations.(*unix.SocketOperations)
+
+		addr, err := sops.Endpoint().GetLocalAddress()
+		if err != nil {
+			log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err)
+			addr.Addr = "<unknown>"
+		}
+
+		sockFlags := 0
+		if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok {
+			if ce.Listening() {
+				// For unix domain sockets, linux reports a single flag
+				// value if the socket is listening, of __SO_ACCEPTCON.
+				sockFlags = linux.SO_ACCEPTCON
+			}
+		}
+
+		// In the socket entry below, the value for the 'Num' field requires
+		// some consideration. Linux prints the address to the struct
+		// unix_sock representing a socket in the kernel, but may redact the
+		// value for unprivileged users depending on the kptr_restrict
+		// sysctl.
+		//
+		// One use for this field is to allow a privileged user to
+		// introspect into the kernel memory to determine information about
+		// a socket not available through procfs, such as the socket's peer.
+		//
+		// In gvisor, returning a pointer to our internal structures would
+		// be pointless, as it wouldn't match the memory layout for struct
+		// unix_sock, making introspection difficult. We could populate a
+		// struct unix_sock with the appropriate data, but even that
+		// requires consideration for which kernel version to emulate, as
+		// the definition of this struct changes over time.
+		//
+		// For now, we always redact this pointer.
+		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d",
+			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
+			sfile.ReadRefs()-1,            // RefCount, don't count our own ref.
+			0,                             // Protocol, always 0 for UDS.
+			sockFlags,                     // Flags.
+			sops.Endpoint().Type(),        // Type.
+			sops.State(),                  // State.
+			sfile.InodeID(),               // Inode.
+		)
+
+		// Path
+		if len(addr.Addr) != 0 {
+			if addr.Addr[0] == 0 {
+				// Abstract path.
+				fmt.Fprintf(buf, " @%s", string(addr.Addr[1:]))
+			} else {
+				fmt.Fprintf(buf, " %s", string(addr.Addr))
+			}
+		}
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef()
+	}
+	return nil
+}
+
+func networkToHost16(n uint16) uint16 {
+	// n is in network byte order, so is big-endian. The most-significant byte
+	// should be stored in the lower address.
+	//
+	// We manually inline binary.BigEndian.Uint16() because Go does not support
+	// non-primitive consts, so binary.BigEndian is a (mutable) var, so calls to
+	// binary.BigEndian.Uint16() require a read of binary.BigEndian and an
+	// interface method call, defeating inlining.
+	buf := [2]byte{byte(n >> 8 & 0xff), byte(n & 0xff)}
+	return usermem.ByteOrder.Uint16(buf[:])
+}
+
+func writeInetAddr(w io.Writer, family int, i linux.SockAddr) {
+	switch family {
+	case linux.AF_INET:
+		var a linux.SockAddrInet
+		if i != nil {
+			a = *i.(*linux.SockAddrInet)
+		}
+
+		// linux.SockAddrInet.Port is stored in the network byte order and is
+		// printed like a number in host byte order. Note that all numbers in host
+		// byte order are printed with the most-significant byte first when
+		// formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux.
+		port := networkToHost16(a.Port)
+
+		// linux.SockAddrInet.Addr is stored as a byte slice in big-endian order
+		// (i.e. most-significant byte in index 0). Linux represents this as a
+		// __be32 which is a typedef for an unsigned int, and is printed with
+		// %X. This means that for a little-endian machine, Linux prints the
+		// least-significant byte of the address first. To emulate this, we first
+		// invert the byte order for the address using usermem.ByteOrder.Uint32,
+		// which makes it have the equivalent encoding to a __be32 on a little
+		// endian machine. Note that this operation is a no-op on a big endian
+		// machine. Then similar to Linux, we format it with %X, which will print
+		// the most-significant byte of the __be32 address first, which is now
+		// actually the least-significant byte of the original address in
+		// linux.SockAddrInet.Addr on little endian machines, due to the conversion.
+		addr := usermem.ByteOrder.Uint32(a.Addr[:])
+
+		fmt.Fprintf(w, "%08X:%04X ", addr, port)
+	case linux.AF_INET6:
+		var a linux.SockAddrInet6
+		if i != nil {
+			a = *i.(*linux.SockAddrInet6)
+		}
+
+		port := networkToHost16(a.Port)
+		addr0 := usermem.ByteOrder.Uint32(a.Addr[0:4])
+		addr1 := usermem.ByteOrder.Uint32(a.Addr[4:8])
+		addr2 := usermem.ByteOrder.Uint32(a.Addr[8:12])
+		addr3 := usermem.ByteOrder.Uint32(a.Addr[12:16])
+		fmt.Fprintf(w, "%08X%08X%08X%08X:%04X ", addr0, addr1, addr2, addr3, port)
+	}
+}
+
+func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, family int) error {
+	// t may be nil here if our caller is not part of a task goroutine. This can
+	// happen for example if we're here for "sentryctl cat". When t is nil,
+	// degrade gracefully and retrieve what we can.
+	t := kernel.TaskFromContext(ctx)
+
+	for _, se := range k.ListSockets() {
+		s := se.Sock.Get()
+		if s == nil {
+			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
+			continue
+		}
+		sfile := s.(*fs.File)
+		sops, ok := sfile.FileOperations.(socket.Socket)
+		if !ok {
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+		}
+		if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
+			s.DecRef()
+			// Not tcp4 sockets.
+			continue
+		}
+
+		// Linux's documentation for the fields below can be found at
+		// https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt.
+		// For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock().
+		// Note that the header doesn't contain labels for all the fields.
+
+		// Field: sl; entry number.
+		fmt.Fprintf(buf, "%4d: ", se.ID)
+
+		// Field: local_adddress.
+		var localAddr linux.SockAddr
+		if t != nil {
+			if local, _, err := sops.GetSockName(t); err == nil {
+				localAddr = local
+			}
+		}
+		writeInetAddr(buf, family, localAddr)
+
+		// Field: rem_address.
+		var remoteAddr linux.SockAddr
+		if t != nil {
+			if remote, _, err := sops.GetPeerName(t); err == nil {
+				remoteAddr = remote
+			}
+		}
+		writeInetAddr(buf, family, remoteAddr)
+
+		// Field: state; socket state.
+		fmt.Fprintf(buf, "%02X ", sops.State())
+
+		// Field: tx_queue, rx_queue; number of packets in the transmit and
+		// receive queue. Unimplemented.
+		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
+
+		// Field: tr, tm->when; timer active state and number of jiffies
+		// until timer expires. Unimplemented.
+		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
+
+		// Field: retrnsmt; number of unrecovered RTO timeouts.
+		// Unimplemented.
+		fmt.Fprintf(buf, "%08X ", 0)
+
+		// Field: uid.
+		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
+		if err != nil {
+			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+			fmt.Fprintf(buf, "%5d ", 0)
+		} else {
+			creds := auth.CredentialsFromContext(ctx)
+			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
+		}
+
+		// Field: timeout; number of unanswered 0-window probes.
+		// Unimplemented.
+		fmt.Fprintf(buf, "%8d ", 0)
+
+		// Field: inode.
+		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+
+		// Field: refcount. Don't count the ref we obtain while deferencing
+		// the weakref to this socket.
+		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+
+		// Field: Socket struct address. Redacted due to the same reason as
+		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
+		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
+
+		// Field: retransmit timeout. Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: predicted tick of soft clock (delayed ACK control data).
+		// Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: (ack.quick<<1)|ack.pingpong, Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: sending congestion window, Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: Slow start size threshold, -1 if threshold >= 0xFFFF.
+		// Unimplemented, report as large threshold.
+		fmt.Fprintf(buf, "%d", -1)
+
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef()
+	}
+
+	return nil
+}
+
+// netTCPData implements vfs.DynamicBytesSource for /proc/net/tcp.
+//
+// +stateify savable
+type netTCPData struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netTCPData)(nil)
+
+func (d *netTCPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode                                                     \n")
+	return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET)
+}
+
+// netTCP6Data implements vfs.DynamicBytesSource for /proc/net/tcp6.
+//
+// +stateify savable
+type netTCP6Data struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netTCP6Data)(nil)
+
+func (d *netTCP6Data) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n")
+	return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET6)
+}
+
+// netUDPData implements vfs.DynamicBytesSource for /proc/net/udp.
+//
+// +stateify savable
+type netUDPData struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netUDPData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	// t may be nil here if our caller is not part of a task goroutine. This can
+	// happen for example if we're here for "sentryctl cat". When t is nil,
+	// degrade gracefully and retrieve what we can.
+	t := kernel.TaskFromContext(ctx)
+
+	for _, se := range d.kernel.ListSockets() {
+		s := se.Sock.Get()
+		if s == nil {
+			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
+			continue
+		}
+		sfile := s.(*fs.File)
+		sops, ok := sfile.FileOperations.(socket.Socket)
+		if !ok {
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+		}
+		if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
+			s.DecRef()
+			// Not udp4 socket.
+			continue
+		}
+
+		// For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock().
+
+		// Field: sl; entry number.
+		fmt.Fprintf(buf, "%5d: ", se.ID)
+
+		// Field: local_adddress.
+		var localAddr linux.SockAddrInet
+		if t != nil {
+			if local, _, err := sops.GetSockName(t); err == nil {
+				localAddr = *local.(*linux.SockAddrInet)
+			}
+		}
+		writeInetAddr(buf, linux.AF_INET, &localAddr)
+
+		// Field: rem_address.
+		var remoteAddr linux.SockAddrInet
+		if t != nil {
+			if remote, _, err := sops.GetPeerName(t); err == nil {
+				remoteAddr = *remote.(*linux.SockAddrInet)
+			}
+		}
+		writeInetAddr(buf, linux.AF_INET, &remoteAddr)
+
+		// Field: state; socket state.
+		fmt.Fprintf(buf, "%02X ", sops.State())
+
+		// Field: tx_queue, rx_queue; number of packets in the transmit and
+		// receive queue. Unimplemented.
+		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
+
+		// Field: tr, tm->when. Always 0 for UDP.
+		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
+
+		// Field: retrnsmt. Always 0 for UDP.
+		fmt.Fprintf(buf, "%08X ", 0)
+
+		// Field: uid.
+		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
+		if err != nil {
+			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+			fmt.Fprintf(buf, "%5d ", 0)
+		} else {
+			creds := auth.CredentialsFromContext(ctx)
+			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
+		}
+
+		// Field: timeout. Always 0 for UDP.
+		fmt.Fprintf(buf, "%8d ", 0)
+
+		// Field: inode.
+		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+
+		// Field: ref; reference count on the socket inode. Don't count the ref
+		// we obtain while deferencing the weakref to this socket.
+		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+
+		// Field: Socket struct address. Redacted due to the same reason as
+		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
+		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
+
+		// Field: drops; number of dropped packets. Unimplemented.
+		fmt.Fprintf(buf, "%d", 0)
+
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef()
+	}
+	return nil
+}
+
+// netSnmpData implements vfs.DynamicBytesSource for /proc/net/snmp.
+//
+// +stateify savable
+type netSnmpData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netSnmpData)(nil)
+
+type snmpLine struct {
+	prefix string
+	header string
+}
+
+var snmp = []snmpLine{
+	{
+		prefix: "Ip",
+		header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates",
+	},
+	{
+		prefix: "Icmp",
+		header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps",
+	},
+	{
+		prefix: "IcmpMsg",
+	},
+	{
+		prefix: "Tcp",
+		header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors",
+	},
+	{
+		prefix: "Udp",
+		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
+	},
+	{
+		prefix: "UdpLite",
+		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
+	},
+}
+
+func toSlice(a interface{}) []uint64 {
+	v := reflect.Indirect(reflect.ValueOf(a))
+	return v.Slice(0, v.Len()).Interface().([]uint64)
+}
+
+func sprintSlice(s []uint64) string {
+	if len(s) == 0 {
+		return ""
+	}
+	r := fmt.Sprint(s)
+	return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice.
+}
+
+// Generate implements vfs.DynamicBytesSource.
+func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	types := []interface{}{
+		&inet.StatSNMPIP{},
+		&inet.StatSNMPICMP{},
+		nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats.
+		&inet.StatSNMPTCP{},
+		&inet.StatSNMPUDP{},
+		&inet.StatSNMPUDPLite{},
+	}
+	for i, stat := range types {
+		line := snmp[i]
+		if stat == nil {
+			fmt.Fprintf(buf, "%s:\n", line.prefix)
+			fmt.Fprintf(buf, "%s:\n", line.prefix)
+			continue
+		}
+		if err := d.stack.Statistics(stat, line.prefix); err != nil {
+			if err == syserror.EOPNOTSUPP {
+				log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
+			} else {
+				log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
+			}
+		}
+
+		fmt.Fprintf(buf, "%s: %s\n", line.prefix, line.header)
+
+		if line.prefix == "Tcp" {
+			tcp := stat.(*inet.StatSNMPTCP)
+			// "Tcp" needs special processing because MaxConn is signed. RFC 2012.
+			fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:]))
+		} else {
+			fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat)))
+		}
+	}
+	return nil
+}
+
+// netRouteData implements vfs.DynamicBytesSource for /proc/net/route.
+//
+// +stateify savable
+type netRouteData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netRouteData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
+func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT")
+
+	interfaces := d.stack.Interfaces()
+	for _, rt := range d.stack.RouteTable() {
+		// /proc/net/route only includes ipv4 routes.
+		if rt.Family != linux.AF_INET {
+			continue
+		}
+
+		// /proc/net/route does not include broadcast or multicast routes.
+		if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST {
+			continue
+		}
+
+		iface, ok := interfaces[rt.OutputInterface]
+		if !ok || iface.Name == "lo" {
+			continue
+		}
+
+		var (
+			gw     uint32
+			prefix uint32
+			flags  = linux.RTF_UP
+		)
+		if len(rt.GatewayAddr) == header.IPv4AddressSize {
+			flags |= linux.RTF_GATEWAY
+			gw = usermem.ByteOrder.Uint32(rt.GatewayAddr)
+		}
+		if len(rt.DstAddr) == header.IPv4AddressSize {
+			prefix = usermem.ByteOrder.Uint32(rt.DstAddr)
+		}
+		l := fmt.Sprintf(
+			"%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d",
+			iface.Name,
+			prefix,
+			gw,
+			flags,
+			0, // RefCnt.
+			0, // Use.
+			0, // Metric.
+			(uint32(1)<<rt.DstLen)-1,
+			0, // MTU.
+			0, // Window.
+			0, // RTT.
+		)
+		fmt.Fprintf(buf, "%-127s\n", l)
+	}
+	return nil
+}
+
+// netStatData implements vfs.DynamicBytesSource for /proc/net/netstat.
+//
+// +stateify savable
+type netStatData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netStatData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
+func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " +
+		"EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps " +
+		"LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive " +
+		"PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost " +
+		"ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog " +
+		"TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser " +
+		"TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging " +
+		"TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo " +
+		"TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit " +
+		"TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans " +
+		"TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes " +
+		"TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail " +
+		"TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent " +
+		"TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose " +
+		"TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed " +
+		"TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld " +
+		"TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected " +
+		"TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback " +
+		"TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter " +
+		"TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail " +
+		"TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK " +
+		"TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail " +
+		"TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow " +
+		"TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets " +
+		"TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv " +
+		"TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect " +
+		"TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd " +
+		"TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq " +
+		"TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge " +
+		"TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index b1e39c82f..d203cebd4 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -72,7 +72,7 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames
 		"sys":     newSysDir(root, inoGen, k),
 		"meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}),
 		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"),
-		"net":     newNetDir(root, inoGen, k),
+		"net":     kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/net"),
 		"stat":    newDentry(root, inoGen.NextIno(), 0444, &statData{k: k}),
 		"uptime":  newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}),
 		"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{k: k}),
diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go
deleted file mode 100644
index d4e1812d8..000000000
--- a/pkg/sentry/fsimpl/proc/tasks_net.go
+++ /dev/null
@@ -1,787 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-	"io"
-	"reflect"
-	"time"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
-	"gvisor.dev/gvisor/pkg/sentry/inet"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/socket"
-	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
-	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
-	var contents map[string]*kernfs.Dentry
-	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
-	// network namespace of the calling process. We should make this per-process,
-	// a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net.
-	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
-		const (
-			arp       = "IP address       HW type     Flags       HW address            Mask     Device\n"
-			netlink   = "sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n"
-			packet    = "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n"
-			protocols = "protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"
-			ptype     = "Type Device      Function\n"
-			upd6      = "  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n"
-		)
-		psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond))
-
-		contents = map[string]*kernfs.Dentry{
-			"dev":  newDentry(root, inoGen.NextIno(), 0444, &netDevData{stack: stack}),
-			"snmp": newDentry(root, inoGen.NextIno(), 0444, &netSnmpData{stack: stack}),
-
-			// The following files are simple stubs until they are implemented in
-			// netstack, if the file contains a header the stub is just the header
-			// otherwise it is an empty file.
-			"arp":       newDentry(root, inoGen.NextIno(), 0444, newStaticFile(arp)),
-			"netlink":   newDentry(root, inoGen.NextIno(), 0444, newStaticFile(netlink)),
-			"netstat":   newDentry(root, inoGen.NextIno(), 0444, &netStatData{}),
-			"packet":    newDentry(root, inoGen.NextIno(), 0444, newStaticFile(packet)),
-			"protocols": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(protocols)),
-
-			// Linux sets psched values to: nsec per usec, psched tick in ns, 1000000,
-			// high res timer ticks per sec (ClockGetres returns 1ns resolution).
-			"psched": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(psched)),
-			"ptype":  newDentry(root, inoGen.NextIno(), 0444, newStaticFile(ptype)),
-			"route":  newDentry(root, inoGen.NextIno(), 0444, &netRouteData{stack: stack}),
-			"tcp":    newDentry(root, inoGen.NextIno(), 0444, &netTCPData{kernel: k}),
-			"udp":    newDentry(root, inoGen.NextIno(), 0444, &netUDPData{kernel: k}),
-			"unix":   newDentry(root, inoGen.NextIno(), 0444, &netUnixData{kernel: k}),
-		}
-
-		if stack.SupportsIPv6() {
-			contents["if_inet6"] = newDentry(root, inoGen.NextIno(), 0444, &ifinet6{stack: stack})
-			contents["ipv6_route"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(""))
-			contents["tcp6"] = newDentry(root, inoGen.NextIno(), 0444, &netTCP6Data{kernel: k})
-			contents["udp6"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(upd6))
-		}
-	}
-
-	return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents)
-}
-
-// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6.
-//
-// +stateify savable
-type ifinet6 struct {
-	kernfs.DynamicBytesFile
-
-	stack inet.Stack
-}
-
-var _ dynamicInode = (*ifinet6)(nil)
-
-func (n *ifinet6) contents() []string {
-	var lines []string
-	nics := n.stack.Interfaces()
-	for id, naddrs := range n.stack.InterfaceAddrs() {
-		nic, ok := nics[id]
-		if !ok {
-			// NIC was added after NICNames was called. We'll just ignore it.
-			continue
-		}
-
-		for _, a := range naddrs {
-			// IPv6 only.
-			if a.Family != linux.AF_INET6 {
-				continue
-			}
-
-			// Fields:
-			// IPv6 address displayed in 32 hexadecimal chars without colons
-			// Netlink device number (interface index) in hexadecimal (use nic id)
-			// Prefix length in hexadecimal
-			// Scope value (use 0)
-			// Interface flags
-			// Device name
-			lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name))
-		}
-	}
-	return lines
-}
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	for _, l := range n.contents() {
-		buf.WriteString(l)
-	}
-	return nil
-}
-
-// netDevData implements vfs.DynamicBytesSource for /proc/net/dev.
-//
-// +stateify savable
-type netDevData struct {
-	kernfs.DynamicBytesFile
-
-	stack inet.Stack
-}
-
-var _ dynamicInode = (*netDevData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (n *netDevData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	interfaces := n.stack.Interfaces()
-	buf.WriteString("Inter-|   Receive                                                |  Transmit\n")
-	buf.WriteString(" face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed\n")
-
-	for _, i := range interfaces {
-		// Implements the same format as
-		// net/core/net-procfs.c:dev_seq_printf_stats.
-		var stats inet.StatDev
-		if err := n.stack.Statistics(&stats, i.Name); err != nil {
-			log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err)
-			continue
-		}
-		fmt.Fprintf(
-			buf,
-			"%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n",
-			i.Name,
-			// Received
-			stats[0], // bytes
-			stats[1], // packets
-			stats[2], // errors
-			stats[3], // dropped
-			stats[4], // fifo
-			stats[5], // frame
-			stats[6], // compressed
-			stats[7], // multicast
-			// Transmitted
-			stats[8],  // bytes
-			stats[9],  // packets
-			stats[10], // errors
-			stats[11], // dropped
-			stats[12], // fifo
-			stats[13], // frame
-			stats[14], // compressed
-			stats[15], // multicast
-		)
-	}
-
-	return nil
-}
-
-// netUnixData implements vfs.DynamicBytesSource for /proc/net/unix.
-//
-// +stateify savable
-type netUnixData struct {
-	kernfs.DynamicBytesFile
-
-	kernel *kernel.Kernel
-}
-
-var _ dynamicInode = (*netUnixData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	buf.WriteString("Num       RefCount Protocol Flags    Type St Inode Path\n")
-	for _, se := range n.kernel.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
-			continue
-		}
-		sfile := s.(*fs.File)
-		if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
-			s.DecRef()
-			// Not a unix socket.
-			continue
-		}
-		sops := sfile.FileOperations.(*unix.SocketOperations)
-
-		addr, err := sops.Endpoint().GetLocalAddress()
-		if err != nil {
-			log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err)
-			addr.Addr = "<unknown>"
-		}
-
-		sockFlags := 0
-		if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok {
-			if ce.Listening() {
-				// For unix domain sockets, linux reports a single flag
-				// value if the socket is listening, of __SO_ACCEPTCON.
-				sockFlags = linux.SO_ACCEPTCON
-			}
-		}
-
-		// In the socket entry below, the value for the 'Num' field requires
-		// some consideration. Linux prints the address to the struct
-		// unix_sock representing a socket in the kernel, but may redact the
-		// value for unprivileged users depending on the kptr_restrict
-		// sysctl.
-		//
-		// One use for this field is to allow a privileged user to
-		// introspect into the kernel memory to determine information about
-		// a socket not available through procfs, such as the socket's peer.
-		//
-		// In gvisor, returning a pointer to our internal structures would
-		// be pointless, as it wouldn't match the memory layout for struct
-		// unix_sock, making introspection difficult. We could populate a
-		// struct unix_sock with the appropriate data, but even that
-		// requires consideration for which kernel version to emulate, as
-		// the definition of this struct changes over time.
-		//
-		// For now, we always redact this pointer.
-		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d",
-			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
-			sfile.ReadRefs()-1,            // RefCount, don't count our own ref.
-			0,                             // Protocol, always 0 for UDS.
-			sockFlags,                     // Flags.
-			sops.Endpoint().Type(),        // Type.
-			sops.State(),                  // State.
-			sfile.InodeID(),               // Inode.
-		)
-
-		// Path
-		if len(addr.Addr) != 0 {
-			if addr.Addr[0] == 0 {
-				// Abstract path.
-				fmt.Fprintf(buf, " @%s", string(addr.Addr[1:]))
-			} else {
-				fmt.Fprintf(buf, " %s", string(addr.Addr))
-			}
-		}
-		fmt.Fprintf(buf, "\n")
-
-		s.DecRef()
-	}
-	return nil
-}
-
-func networkToHost16(n uint16) uint16 {
-	// n is in network byte order, so is big-endian. The most-significant byte
-	// should be stored in the lower address.
-	//
-	// We manually inline binary.BigEndian.Uint16() because Go does not support
-	// non-primitive consts, so binary.BigEndian is a (mutable) var, so calls to
-	// binary.BigEndian.Uint16() require a read of binary.BigEndian and an
-	// interface method call, defeating inlining.
-	buf := [2]byte{byte(n >> 8 & 0xff), byte(n & 0xff)}
-	return usermem.ByteOrder.Uint16(buf[:])
-}
-
-func writeInetAddr(w io.Writer, family int, i linux.SockAddr) {
-	switch family {
-	case linux.AF_INET:
-		var a linux.SockAddrInet
-		if i != nil {
-			a = *i.(*linux.SockAddrInet)
-		}
-
-		// linux.SockAddrInet.Port is stored in the network byte order and is
-		// printed like a number in host byte order. Note that all numbers in host
-		// byte order are printed with the most-significant byte first when
-		// formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux.
-		port := networkToHost16(a.Port)
-
-		// linux.SockAddrInet.Addr is stored as a byte slice in big-endian order
-		// (i.e. most-significant byte in index 0). Linux represents this as a
-		// __be32 which is a typedef for an unsigned int, and is printed with
-		// %X. This means that for a little-endian machine, Linux prints the
-		// least-significant byte of the address first. To emulate this, we first
-		// invert the byte order for the address using usermem.ByteOrder.Uint32,
-		// which makes it have the equivalent encoding to a __be32 on a little
-		// endian machine. Note that this operation is a no-op on a big endian
-		// machine. Then similar to Linux, we format it with %X, which will print
-		// the most-significant byte of the __be32 address first, which is now
-		// actually the least-significant byte of the original address in
-		// linux.SockAddrInet.Addr on little endian machines, due to the conversion.
-		addr := usermem.ByteOrder.Uint32(a.Addr[:])
-
-		fmt.Fprintf(w, "%08X:%04X ", addr, port)
-	case linux.AF_INET6:
-		var a linux.SockAddrInet6
-		if i != nil {
-			a = *i.(*linux.SockAddrInet6)
-		}
-
-		port := networkToHost16(a.Port)
-		addr0 := usermem.ByteOrder.Uint32(a.Addr[0:4])
-		addr1 := usermem.ByteOrder.Uint32(a.Addr[4:8])
-		addr2 := usermem.ByteOrder.Uint32(a.Addr[8:12])
-		addr3 := usermem.ByteOrder.Uint32(a.Addr[12:16])
-		fmt.Fprintf(w, "%08X%08X%08X%08X:%04X ", addr0, addr1, addr2, addr3, port)
-	}
-}
-
-func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, family int) error {
-	// t may be nil here if our caller is not part of a task goroutine. This can
-	// happen for example if we're here for "sentryctl cat". When t is nil,
-	// degrade gracefully and retrieve what we can.
-	t := kernel.TaskFromContext(ctx)
-
-	for _, se := range k.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
-			continue
-		}
-		sfile := s.(*fs.File)
-		sops, ok := sfile.FileOperations.(socket.Socket)
-		if !ok {
-			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
-		}
-		if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
-			s.DecRef()
-			// Not tcp4 sockets.
-			continue
-		}
-
-		// Linux's documentation for the fields below can be found at
-		// https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt.
-		// For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock().
-		// Note that the header doesn't contain labels for all the fields.
-
-		// Field: sl; entry number.
-		fmt.Fprintf(buf, "%4d: ", se.ID)
-
-		// Field: local_adddress.
-		var localAddr linux.SockAddr
-		if t != nil {
-			if local, _, err := sops.GetSockName(t); err == nil {
-				localAddr = local
-			}
-		}
-		writeInetAddr(buf, family, localAddr)
-
-		// Field: rem_address.
-		var remoteAddr linux.SockAddr
-		if t != nil {
-			if remote, _, err := sops.GetPeerName(t); err == nil {
-				remoteAddr = remote
-			}
-		}
-		writeInetAddr(buf, family, remoteAddr)
-
-		// Field: state; socket state.
-		fmt.Fprintf(buf, "%02X ", sops.State())
-
-		// Field: tx_queue, rx_queue; number of packets in the transmit and
-		// receive queue. Unimplemented.
-		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
-
-		// Field: tr, tm->when; timer active state and number of jiffies
-		// until timer expires. Unimplemented.
-		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
-
-		// Field: retrnsmt; number of unrecovered RTO timeouts.
-		// Unimplemented.
-		fmt.Fprintf(buf, "%08X ", 0)
-
-		// Field: uid.
-		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
-		if err != nil {
-			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
-			fmt.Fprintf(buf, "%5d ", 0)
-		} else {
-			creds := auth.CredentialsFromContext(ctx)
-			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
-		}
-
-		// Field: timeout; number of unanswered 0-window probes.
-		// Unimplemented.
-		fmt.Fprintf(buf, "%8d ", 0)
-
-		// Field: inode.
-		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
-
-		// Field: refcount. Don't count the ref we obtain while deferencing
-		// the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
-
-		// Field: Socket struct address. Redacted due to the same reason as
-		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
-		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
-
-		// Field: retransmit timeout. Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: predicted tick of soft clock (delayed ACK control data).
-		// Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: (ack.quick<<1)|ack.pingpong, Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: sending congestion window, Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: Slow start size threshold, -1 if threshold >= 0xFFFF.
-		// Unimplemented, report as large threshold.
-		fmt.Fprintf(buf, "%d", -1)
-
-		fmt.Fprintf(buf, "\n")
-
-		s.DecRef()
-	}
-
-	return nil
-}
-
-// netTCPData implements vfs.DynamicBytesSource for /proc/net/tcp.
-//
-// +stateify savable
-type netTCPData struct {
-	kernfs.DynamicBytesFile
-
-	kernel *kernel.Kernel
-}
-
-var _ dynamicInode = (*netTCPData)(nil)
-
-func (d *netTCPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	buf.WriteString("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode                                                     \n")
-	return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET)
-}
-
-// netTCP6Data implements vfs.DynamicBytesSource for /proc/net/tcp6.
-//
-// +stateify savable
-type netTCP6Data struct {
-	kernfs.DynamicBytesFile
-
-	kernel *kernel.Kernel
-}
-
-var _ dynamicInode = (*netTCP6Data)(nil)
-
-func (d *netTCP6Data) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	buf.WriteString("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n")
-	return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET6)
-}
-
-// netUDPData implements vfs.DynamicBytesSource for /proc/net/udp.
-//
-// +stateify savable
-type netUDPData struct {
-	kernfs.DynamicBytesFile
-
-	kernel *kernel.Kernel
-}
-
-var _ dynamicInode = (*netUDPData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	// t may be nil here if our caller is not part of a task goroutine. This can
-	// happen for example if we're here for "sentryctl cat". When t is nil,
-	// degrade gracefully and retrieve what we can.
-	t := kernel.TaskFromContext(ctx)
-
-	for _, se := range d.kernel.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
-			continue
-		}
-		sfile := s.(*fs.File)
-		sops, ok := sfile.FileOperations.(socket.Socket)
-		if !ok {
-			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
-		}
-		if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
-			s.DecRef()
-			// Not udp4 socket.
-			continue
-		}
-
-		// For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock().
-
-		// Field: sl; entry number.
-		fmt.Fprintf(buf, "%5d: ", se.ID)
-
-		// Field: local_adddress.
-		var localAddr linux.SockAddrInet
-		if t != nil {
-			if local, _, err := sops.GetSockName(t); err == nil {
-				localAddr = *local.(*linux.SockAddrInet)
-			}
-		}
-		writeInetAddr(buf, linux.AF_INET, &localAddr)
-
-		// Field: rem_address.
-		var remoteAddr linux.SockAddrInet
-		if t != nil {
-			if remote, _, err := sops.GetPeerName(t); err == nil {
-				remoteAddr = *remote.(*linux.SockAddrInet)
-			}
-		}
-		writeInetAddr(buf, linux.AF_INET, &remoteAddr)
-
-		// Field: state; socket state.
-		fmt.Fprintf(buf, "%02X ", sops.State())
-
-		// Field: tx_queue, rx_queue; number of packets in the transmit and
-		// receive queue. Unimplemented.
-		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
-
-		// Field: tr, tm->when. Always 0 for UDP.
-		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
-
-		// Field: retrnsmt. Always 0 for UDP.
-		fmt.Fprintf(buf, "%08X ", 0)
-
-		// Field: uid.
-		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
-		if err != nil {
-			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
-			fmt.Fprintf(buf, "%5d ", 0)
-		} else {
-			creds := auth.CredentialsFromContext(ctx)
-			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
-		}
-
-		// Field: timeout. Always 0 for UDP.
-		fmt.Fprintf(buf, "%8d ", 0)
-
-		// Field: inode.
-		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
-
-		// Field: ref; reference count on the socket inode. Don't count the ref
-		// we obtain while deferencing the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
-
-		// Field: Socket struct address. Redacted due to the same reason as
-		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
-		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
-
-		// Field: drops; number of dropped packets. Unimplemented.
-		fmt.Fprintf(buf, "%d", 0)
-
-		fmt.Fprintf(buf, "\n")
-
-		s.DecRef()
-	}
-	return nil
-}
-
-// netSnmpData implements vfs.DynamicBytesSource for /proc/net/snmp.
-//
-// +stateify savable
-type netSnmpData struct {
-	kernfs.DynamicBytesFile
-
-	stack inet.Stack
-}
-
-var _ dynamicInode = (*netSnmpData)(nil)
-
-type snmpLine struct {
-	prefix string
-	header string
-}
-
-var snmp = []snmpLine{
-	{
-		prefix: "Ip",
-		header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates",
-	},
-	{
-		prefix: "Icmp",
-		header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps",
-	},
-	{
-		prefix: "IcmpMsg",
-	},
-	{
-		prefix: "Tcp",
-		header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors",
-	},
-	{
-		prefix: "Udp",
-		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
-	},
-	{
-		prefix: "UdpLite",
-		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
-	},
-}
-
-func toSlice(a interface{}) []uint64 {
-	v := reflect.Indirect(reflect.ValueOf(a))
-	return v.Slice(0, v.Len()).Interface().([]uint64)
-}
-
-func sprintSlice(s []uint64) string {
-	if len(s) == 0 {
-		return ""
-	}
-	r := fmt.Sprint(s)
-	return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice.
-}
-
-// Generate implements vfs.DynamicBytesSource.
-func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	types := []interface{}{
-		&inet.StatSNMPIP{},
-		&inet.StatSNMPICMP{},
-		nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats.
-		&inet.StatSNMPTCP{},
-		&inet.StatSNMPUDP{},
-		&inet.StatSNMPUDPLite{},
-	}
-	for i, stat := range types {
-		line := snmp[i]
-		if stat == nil {
-			fmt.Fprintf(buf, "%s:\n", line.prefix)
-			fmt.Fprintf(buf, "%s:\n", line.prefix)
-			continue
-		}
-		if err := d.stack.Statistics(stat, line.prefix); err != nil {
-			if err == syserror.EOPNOTSUPP {
-				log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
-			} else {
-				log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
-			}
-		}
-
-		fmt.Fprintf(buf, "%s: %s\n", line.prefix, line.header)
-
-		if line.prefix == "Tcp" {
-			tcp := stat.(*inet.StatSNMPTCP)
-			// "Tcp" needs special processing because MaxConn is signed. RFC 2012.
-			fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:]))
-		} else {
-			fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat)))
-		}
-	}
-	return nil
-}
-
-// netRouteData implements vfs.DynamicBytesSource for /proc/net/route.
-//
-// +stateify savable
-type netRouteData struct {
-	kernfs.DynamicBytesFile
-
-	stack inet.Stack
-}
-
-var _ dynamicInode = (*netRouteData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.
-// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
-func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT")
-
-	interfaces := d.stack.Interfaces()
-	for _, rt := range d.stack.RouteTable() {
-		// /proc/net/route only includes ipv4 routes.
-		if rt.Family != linux.AF_INET {
-			continue
-		}
-
-		// /proc/net/route does not include broadcast or multicast routes.
-		if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST {
-			continue
-		}
-
-		iface, ok := interfaces[rt.OutputInterface]
-		if !ok || iface.Name == "lo" {
-			continue
-		}
-
-		var (
-			gw     uint32
-			prefix uint32
-			flags  = linux.RTF_UP
-		)
-		if len(rt.GatewayAddr) == header.IPv4AddressSize {
-			flags |= linux.RTF_GATEWAY
-			gw = usermem.ByteOrder.Uint32(rt.GatewayAddr)
-		}
-		if len(rt.DstAddr) == header.IPv4AddressSize {
-			prefix = usermem.ByteOrder.Uint32(rt.DstAddr)
-		}
-		l := fmt.Sprintf(
-			"%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d",
-			iface.Name,
-			prefix,
-			gw,
-			flags,
-			0, // RefCnt.
-			0, // Use.
-			0, // Metric.
-			(uint32(1)<<rt.DstLen)-1,
-			0, // MTU.
-			0, // Window.
-			0, // RTT.
-		)
-		fmt.Fprintf(buf, "%-127s\n", l)
-	}
-	return nil
-}
-
-// netStatData implements vfs.DynamicBytesSource for /proc/net/netstat.
-//
-// +stateify savable
-type netStatData struct {
-	kernfs.DynamicBytesFile
-
-	stack inet.Stack
-}
-
-var _ dynamicInode = (*netStatData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.
-// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
-func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " +
-		"EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps " +
-		"LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive " +
-		"PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost " +
-		"ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog " +
-		"TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser " +
-		"TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging " +
-		"TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo " +
-		"TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit " +
-		"TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans " +
-		"TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes " +
-		"TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail " +
-		"TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent " +
-		"TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose " +
-		"TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed " +
-		"TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld " +
-		"TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected " +
-		"TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback " +
-		"TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter " +
-		"TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail " +
-		"TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK " +
-		"TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail " +
-		"TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow " +
-		"TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets " +
-		"TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv " +
-		"TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect " +
-		"TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd " +
-		"TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq " +
-		"TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge " +
-		"TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 0eb401619..1bb9430c0 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -50,7 +50,7 @@ var (
 		"loadavg":     linux.DT_REG,
 		"meminfo":     linux.DT_REG,
 		"mounts":      linux.DT_LNK,
-		"net":         linux.DT_DIR,
+		"net":         linux.DT_LNK,
 		"self":        linux.DT_LNK,
 		"stat":        linux.DT_REG,
 		"sys":         linux.DT_DIR,
@@ -71,6 +71,7 @@ var (
 		"gid_map":       linux.DT_REG,
 		"io":            linux.DT_REG,
 		"maps":          linux.DT_REG,
+		"net":           linux.DT_DIR,
 		"ns":            linux.DT_DIR,
 		"oom_score":     linux.DT_REG,
 		"oom_score_adj": linux.DT_REG,
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index 3a611a86f..05c952b99 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -33,6 +33,31 @@ namespace gvisor {
 namespace testing {
 namespace {
 
+constexpr const char kProcNet[] = "/proc/net";
+
+TEST(ProcNetSymlinkTarget, FileMode) {
+  struct stat s;
+  ASSERT_THAT(stat(kProcNet, &s), SyscallSucceeds());
+  EXPECT_EQ(s.st_mode & S_IFMT, S_IFDIR);
+  EXPECT_EQ(s.st_mode & 0777, 0555);
+}
+
+TEST(ProcNetSymlink, FileMode) {
+  struct stat s;
+  ASSERT_THAT(lstat(kProcNet, &s), SyscallSucceeds());
+  EXPECT_EQ(s.st_mode & S_IFMT, S_IFLNK);
+  EXPECT_EQ(s.st_mode & 0777, 0777);
+}
+
+TEST(ProcNetSymlink, Contents) {
+  char buf[40] = {};
+  int n = readlink(kProcNet, buf, sizeof(buf));
+  ASSERT_THAT(n, SyscallSucceeds());
+
+  buf[n] = 0;
+  EXPECT_STREQ(buf, "self/net");
+}
+
 TEST(ProcNetIfInet6, Format) {
   auto ifinet6 = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/if_inet6"));
   EXPECT_THAT(ifinet6,
-- 
cgit v1.2.3


From 0990ef7517236228f575f222ae639d375badec15 Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Tue, 10 Mar 2020 13:58:27 -0700
Subject: Make checkpoint/restore e2e test less flaky

PiperOrigin-RevId: 300171916
---
 test/e2e/integration_test.go | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go
index 28064e557..cc4fbbaed 100644
--- a/test/e2e/integration_test.go
+++ b/test/e2e/integration_test.go
@@ -175,10 +175,8 @@ func TestCheckpointRestore(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	// TODO(b/143498576): Remove after github.com/moby/moby/issues/38963 is fixed.
-	time.Sleep(1 * time.Second)
-
-	if err := d.Restore("test"); err != nil {
+	// TODO(b/143498576): Remove Poll after github.com/moby/moby/issues/38963 is fixed.
+	if err := testutil.Poll(func() error { return d.Restore("test") }, 15*time.Second); err != nil {
 		t.Fatal("docker restore failed:", err)
 	}
 
-- 
cgit v1.2.3


From d6440ec5a125746b76f189c0a5d5946dde9afc37 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 10 Mar 2020 14:49:16 -0700
Subject: The packet forwarding should resolve the link address if necessary.

Fixes #1510

Test:
- stack_test.TestForwardingWithStaticResolver
- stack_test.TestForwardingWithFakeResolver
- stack_test.TestForwardingWithNoResolver
- stack_test.TestForwardingWithFakeResolverPartialTimeout
- stack_test.TestForwardingWithFakeResolverTwoPackets
- stack_test.TestForwardingWithFakeResolverManyPackets
- stack_test.TestForwardingWithFakeResolverManyResolutions
PiperOrigin-RevId: 300182570
---
 pkg/tcpip/stack/BUILD             |   2 +
 pkg/tcpip/stack/forwarder.go      | 131 ++++++++
 pkg/tcpip/stack/forwarder_test.go | 635 ++++++++++++++++++++++++++++++++++++++
 pkg/tcpip/stack/nic.go            |  51 ++-
 pkg/tcpip/stack/stack.go          |   5 +
 5 files changed, 808 insertions(+), 16 deletions(-)
 create mode 100644 pkg/tcpip/stack/forwarder.go
 create mode 100644 pkg/tcpip/stack/forwarder_test.go

diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 8febd54c8..6c029b2fb 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -19,6 +19,7 @@ go_library(
     name = "stack",
     srcs = [
         "dhcpv6configurationfromndpra_string.go",
+        "forwarder.go",
         "icmp_rate_limit.go",
         "linkaddrcache.go",
         "linkaddrentry_list.go",
@@ -80,6 +81,7 @@ go_test(
     name = "stack_test",
     size = "small",
     srcs = [
+        "forwarder_test.go",
         "linkaddrcache_test.go",
         "nic_test.go",
     ],
diff --git a/pkg/tcpip/stack/forwarder.go b/pkg/tcpip/stack/forwarder.go
new file mode 100644
index 000000000..631953935
--- /dev/null
+++ b/pkg/tcpip/stack/forwarder.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	// maxPendingResolutions is the maximum number of pending link-address
+	// resolutions.
+	maxPendingResolutions          = 64
+	maxPendingPacketsPerResolution = 256
+)
+
+type pendingPacket struct {
+	nic   *NIC
+	route *Route
+	proto tcpip.NetworkProtocolNumber
+	pkt   tcpip.PacketBuffer
+}
+
+type forwardQueue struct {
+	sync.Mutex
+
+	// The packets to send once the resolver completes.
+	packets map[<-chan struct{}][]*pendingPacket
+
+	// FIFO of channels used to cancel the oldest goroutine waiting for
+	// link-address resolution.
+	cancelChans []chan struct{}
+}
+
+func newForwardQueue() *forwardQueue {
+	return &forwardQueue{packets: make(map[<-chan struct{}][]*pendingPacket)}
+}
+
+func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+	shouldWait := false
+
+	f.Lock()
+	packets, ok := f.packets[ch]
+	if !ok {
+		shouldWait = true
+	}
+	for len(packets) == maxPendingPacketsPerResolution {
+		p := packets[0]
+		packets = packets[1:]
+		p.nic.stack.stats.IP.OutgoingPacketErrors.Increment()
+		p.route.Release()
+	}
+	if l := len(packets); l >= maxPendingPacketsPerResolution {
+		panic(fmt.Sprintf("max pending packets for resolution reached; got %d packets, max = %d", l, maxPendingPacketsPerResolution))
+	}
+	f.packets[ch] = append(packets, &pendingPacket{
+		nic:   n,
+		route: r,
+		proto: protocol,
+		pkt:   pkt,
+	})
+	f.Unlock()
+
+	if !shouldWait {
+		return
+	}
+
+	// Wait for the link-address resolution to complete.
+	// Start a goroutine with a forwarding-cancel channel so that we can
+	// limit the maximum number of goroutines running concurrently.
+	cancel := f.newCancelChannel()
+	go func() {
+		cancelled := false
+		select {
+		case <-ch:
+		case <-cancel:
+			cancelled = true
+		}
+
+		f.Lock()
+		packets := f.packets[ch]
+		delete(f.packets, ch)
+		f.Unlock()
+
+		for _, p := range packets {
+			if cancelled {
+				p.nic.stack.stats.IP.OutgoingPacketErrors.Increment()
+			} else if _, err := p.route.Resolve(nil); err != nil {
+				p.nic.stack.stats.IP.OutgoingPacketErrors.Increment()
+			} else {
+				p.nic.forwardPacket(p.route, p.proto, p.pkt)
+			}
+			p.route.Release()
+		}
+	}()
+}
+
+// newCancelChannel creates a channel that can cancel a pending forwarding
+// activity. The oldest channel is closed if the number of open channels would
+// exceed maxPendingResolutions.
+func (f *forwardQueue) newCancelChannel() chan struct{} {
+	f.Lock()
+	defer f.Unlock()
+
+	if len(f.cancelChans) == maxPendingResolutions {
+		ch := f.cancelChans[0]
+		f.cancelChans = f.cancelChans[1:]
+		close(ch)
+	}
+	if l := len(f.cancelChans); l >= maxPendingResolutions {
+		panic(fmt.Sprintf("max pending resolutions reached; got %d active resolutions, max = %d", l, maxPendingResolutions))
+	}
+
+	ch := make(chan struct{})
+	f.cancelChans = append(f.cancelChans, ch)
+	return ch
+}
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
new file mode 100644
index 000000000..321b7524d
--- /dev/null
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -0,0 +1,635 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"encoding/binary"
+	"math"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+const (
+	fwdTestNetNumber           tcpip.NetworkProtocolNumber = math.MaxUint32
+	fwdTestNetHeaderLen                                    = 12
+	fwdTestNetDefaultPrefixLen                             = 8
+
+	// fwdTestNetDefaultMTU is the MTU, in bytes, used throughout the tests,
+	// except where another value is explicitly used. It is chosen to match
+	// the MTU of loopback interfaces on linux systems.
+	fwdTestNetDefaultMTU = 65536
+)
+
+// fwdTestNetworkEndpoint is a network-layer protocol endpoint.
+// Headers of this protocol are fwdTestNetHeaderLen bytes, but we currently only
+// use the first three: destination address, source address, and transport
+// protocol. They're all one byte fields to simplify parsing.
+type fwdTestNetworkEndpoint struct {
+	nicID      tcpip.NICID
+	id         NetworkEndpointID
+	prefixLen  int
+	proto      *fwdTestNetworkProtocol
+	dispatcher TransportDispatcher
+	ep         LinkEndpoint
+}
+
+func (f *fwdTestNetworkEndpoint) MTU() uint32 {
+	return f.ep.MTU() - uint32(f.MaxHeaderLength())
+}
+
+func (f *fwdTestNetworkEndpoint) NICID() tcpip.NICID {
+	return f.nicID
+}
+
+func (f *fwdTestNetworkEndpoint) PrefixLen() int {
+	return f.prefixLen
+}
+
+func (*fwdTestNetworkEndpoint) DefaultTTL() uint8 {
+	return 123
+}
+
+func (f *fwdTestNetworkEndpoint) ID() *NetworkEndpointID {
+	return &f.id
+}
+
+func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt tcpip.PacketBuffer) {
+	// Consume the network header.
+	b := pkt.Data.First()
+	pkt.Data.TrimFront(fwdTestNetHeaderLen)
+
+	// Dispatch the packet to the transport protocol.
+	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(b[2]), pkt)
+}
+
+func (f *fwdTestNetworkEndpoint) MaxHeaderLength() uint16 {
+	return f.ep.MaxHeaderLength() + fwdTestNetHeaderLen
+}
+
+func (f *fwdTestNetworkEndpoint) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, dstAddr tcpip.Address) uint16 {
+	return 0
+}
+
+func (f *fwdTestNetworkEndpoint) Capabilities() LinkEndpointCapabilities {
+	return f.ep.Capabilities()
+}
+
+func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+	// Add the protocol's header to the packet and send it to the link
+	// endpoint.
+	b := pkt.Header.Prepend(fwdTestNetHeaderLen)
+	b[0] = r.RemoteAddress[0]
+	b[1] = f.id.LocalAddress[0]
+	b[2] = byte(params.Protocol)
+
+	return f.ep.WritePacket(r, gso, fwdTestNetNumber, pkt)
+}
+
+// WritePackets implements LinkEndpoint.WritePackets.
+func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) {
+	panic("not implemented")
+}
+
+func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+func (*fwdTestNetworkEndpoint) Close() {}
+
+// fwdTestNetworkProtocol is a network-layer protocol that implements Address
+// resolution.
+type fwdTestNetworkProtocol struct {
+	addrCache              *linkAddrCache
+	addrResolveDelay       time.Duration
+	onLinkAddressResolved  func(cache *linkAddrCache, addr tcpip.Address)
+	onResolveStaticAddress func(tcpip.Address) (tcpip.LinkAddress, bool)
+}
+
+func (f *fwdTestNetworkProtocol) Number() tcpip.NetworkProtocolNumber {
+	return fwdTestNetNumber
+}
+
+func (f *fwdTestNetworkProtocol) MinimumPacketSize() int {
+	return fwdTestNetHeaderLen
+}
+
+func (f *fwdTestNetworkProtocol) DefaultPrefixLen() int {
+	return fwdTestNetDefaultPrefixLen
+}
+
+func (*fwdTestNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
+	return tcpip.Address(v[1:2]), tcpip.Address(v[0:1])
+}
+
+func (f *fwdTestNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, ep LinkEndpoint, _ *Stack) (NetworkEndpoint, *tcpip.Error) {
+	return &fwdTestNetworkEndpoint{
+		nicID:      nicID,
+		id:         NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
+		prefixLen:  addrWithPrefix.PrefixLen,
+		proto:      f,
+		dispatcher: dispatcher,
+		ep:         ep,
+	}, nil
+}
+
+func (f *fwdTestNetworkProtocol) SetOption(option interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+func (f *fwdTestNetworkProtocol) Option(option interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+func (f *fwdTestNetworkProtocol) Close() {}
+
+func (f *fwdTestNetworkProtocol) Wait() {}
+
+func (f *fwdTestNetworkProtocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP LinkEndpoint) *tcpip.Error {
+	if f.addrCache != nil && f.onLinkAddressResolved != nil {
+		time.AfterFunc(f.addrResolveDelay, func() {
+			f.onLinkAddressResolved(f.addrCache, addr)
+		})
+	}
+	return nil
+}
+
+func (f *fwdTestNetworkProtocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+	if f.onResolveStaticAddress != nil {
+		return f.onResolveStaticAddress(addr)
+	}
+	return "", false
+}
+
+func (f *fwdTestNetworkProtocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
+	return fwdTestNetNumber
+}
+
+// fwdTestPacketInfo holds all the information about an outbound packet.
+type fwdTestPacketInfo struct {
+	RemoteLinkAddress tcpip.LinkAddress
+	LocalLinkAddress  tcpip.LinkAddress
+	Pkt               tcpip.PacketBuffer
+}
+
+type fwdTestLinkEndpoint struct {
+	dispatcher NetworkDispatcher
+	mtu        uint32
+	linkAddr   tcpip.LinkAddress
+
+	// C is where outbound packets are queued.
+	C chan fwdTestPacketInfo
+}
+
+// InjectInbound injects an inbound packet.
+func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+	e.InjectLinkAddr(protocol, "", pkt)
+}
+
+// InjectLinkAddr injects an inbound packet with a remote link address.
+func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt tcpip.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt)
+}
+
+// Attach saves the stack network-layer dispatcher for use later when packets
+// are injected.
+func (e *fwdTestLinkEndpoint) Attach(dispatcher NetworkDispatcher) {
+	e.dispatcher = dispatcher
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *fwdTestLinkEndpoint) IsAttached() bool {
+	return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
+// during construction.
+func (e *fwdTestLinkEndpoint) MTU() uint32 {
+	return e.mtu
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities.
+func (e fwdTestLinkEndpoint) Capabilities() LinkEndpointCapabilities {
+	caps := LinkEndpointCapabilities(0)
+	return caps | CapabilityResolutionRequired
+}
+
+// GSOMaxSize returns the maximum GSO packet size.
+func (*fwdTestLinkEndpoint) GSOMaxSize() uint32 {
+	return 1 << 15
+}
+
+// MaxHeaderLength returns the maximum size of the link layer header. Given it
+// doesn't have a header, it just returns 0.
+func (*fwdTestLinkEndpoint) MaxHeaderLength() uint16 {
+	return 0
+}
+
+// LinkAddress returns the link address of this endpoint.
+func (e *fwdTestLinkEndpoint) LinkAddress() tcpip.LinkAddress {
+	return e.linkAddr
+}
+
+func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+	p := fwdTestPacketInfo{
+		RemoteLinkAddress: r.RemoteLinkAddress,
+		LocalLinkAddress:  r.LocalLinkAddress,
+		Pkt:               pkt,
+	}
+
+	select {
+	case e.C <- p:
+	default:
+	}
+
+	return nil
+}
+
+// WritePackets stores outbound packets into the channel.
+func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	n := 0
+	for _, pkt := range pkts {
+		e.WritePacket(r, gso, protocol, pkt)
+		n++
+	}
+
+	return n, nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	p := fwdTestPacketInfo{
+		Pkt: tcpip.PacketBuffer{Data: vv},
+	}
+
+	select {
+	case e.C <- p:
+	default:
+	}
+
+	return nil
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (*fwdTestLinkEndpoint) Wait() {}
+
+func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol) (ep1, ep2 *fwdTestLinkEndpoint) {
+	// Create a stack with the network protocol and two NICs.
+	s := New(Options{
+		NetworkProtocols: []NetworkProtocol{proto},
+	})
+
+	proto.addrCache = s.linkAddrCache
+
+	// Enable forwarding.
+	s.SetForwarding(true)
+
+	// NIC 1 has the link address "a", and added the network address 1.
+	ep1 = &fwdTestLinkEndpoint{
+		C:        make(chan fwdTestPacketInfo, 300),
+		mtu:      fwdTestNetDefaultMTU,
+		linkAddr: "a",
+	}
+	if err := s.CreateNIC(1, ep1); err != nil {
+		t.Fatal("CreateNIC #1 failed:", err)
+	}
+	if err := s.AddAddress(1, fwdTestNetNumber, "\x01"); err != nil {
+		t.Fatal("AddAddress #1 failed:", err)
+	}
+
+	// NIC 2 has the link address "b", and added the network address 2.
+	ep2 = &fwdTestLinkEndpoint{
+		C:        make(chan fwdTestPacketInfo, 300),
+		mtu:      fwdTestNetDefaultMTU,
+		linkAddr: "b",
+	}
+	if err := s.CreateNIC(2, ep2); err != nil {
+		t.Fatal("CreateNIC #2 failed:", err)
+	}
+	if err := s.AddAddress(2, fwdTestNetNumber, "\x02"); err != nil {
+		t.Fatal("AddAddress #2 failed:", err)
+	}
+
+	// Route all packets to NIC 2.
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, NIC: 2}})
+	}
+
+	return ep1, ep2
+}
+
+func TestForwardingWithStaticResolver(t *testing.T) {
+	// Create a network protocol with a static resolver.
+	proto := &fwdTestNetworkProtocol{
+		onResolveStaticAddress:
+		// The network address 3 is resolved to the link address "c".
+		func(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+			if addr == "\x03" {
+				return "c", true
+			}
+			return "", false
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// Inject an inbound packet to address 3 on NIC 1, and see if it is
+	// forwarded to NIC 2.
+	buf := buffer.NewView(30)
+	buf[0] = 3
+	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	var p fwdTestPacketInfo
+
+	select {
+	case p = <-ep2.C:
+	default:
+		t.Fatal("packet not forwarded")
+	}
+
+	// Test that the static address resolution happened correctly.
+	if p.RemoteLinkAddress != "c" {
+		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+	}
+	if p.LocalLinkAddress != "b" {
+		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+	}
+}
+
+func TestForwardingWithFakeResolver(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Any address will be resolved to the link address "c".
+			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// Inject an inbound packet to address 3 on NIC 1, and see if it is
+	// forwarded to NIC 2.
+	buf := buffer.NewView(30)
+	buf[0] = 3
+	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	var p fwdTestPacketInfo
+
+	select {
+	case p = <-ep2.C:
+	case <-time.After(time.Second):
+		t.Fatal("packet not forwarded")
+	}
+
+	// Test that the address resolution happened correctly.
+	if p.RemoteLinkAddress != "c" {
+		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+	}
+	if p.LocalLinkAddress != "b" {
+		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+	}
+}
+
+func TestForwardingWithNoResolver(t *testing.T) {
+	// Create a network protocol without a resolver.
+	proto := &fwdTestNetworkProtocol{}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// inject an inbound packet to address 3 on NIC 1, and see if it is
+	// forwarded to NIC 2.
+	buf := buffer.NewView(30)
+	buf[0] = 3
+	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	select {
+	case <-ep2.C:
+		t.Fatal("Packet should not be forwarded")
+	case <-time.After(time.Second):
+	}
+}
+
+func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Only packets to address 3 will be resolved to the
+			// link address "c".
+			if addr == "\x03" {
+				cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+			}
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// Inject an inbound packet to address 4 on NIC 1. This packet should
+	// not be forwarded.
+	buf := buffer.NewView(30)
+	buf[0] = 4
+	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	// Inject an inbound packet to address 3 on NIC 1, and see if it is
+	// forwarded to NIC 2.
+	buf = buffer.NewView(30)
+	buf[0] = 3
+	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	var p fwdTestPacketInfo
+
+	select {
+	case p = <-ep2.C:
+	case <-time.After(time.Second):
+		t.Fatal("packet not forwarded")
+	}
+
+	b := p.Pkt.Header.View()
+	if b[0] != 3 {
+		t.Fatalf("got b[0] = %d, want = 3", b[0])
+	}
+
+	// Test that the address resolution happened correctly.
+	if p.RemoteLinkAddress != "c" {
+		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+	}
+	if p.LocalLinkAddress != "b" {
+		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+	}
+}
+
+func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Any packets will be resolved to the link address "c".
+			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// Inject two inbound packets to address 3 on NIC 1.
+	for i := 0; i < 2; i++ {
+		buf := buffer.NewView(30)
+		buf[0] = 3
+		ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+			Data: buf.ToVectorisedView(),
+		})
+	}
+
+	for i := 0; i < 2; i++ {
+		var p fwdTestPacketInfo
+
+		select {
+		case p = <-ep2.C:
+		case <-time.After(time.Second):
+			t.Fatal("packet not forwarded")
+		}
+
+		b := p.Pkt.Header.View()
+		if b[0] != 3 {
+			t.Fatalf("got b[0] = %d, want = 3", b[0])
+		}
+
+		// Test that the address resolution happened correctly.
+		if p.RemoteLinkAddress != "c" {
+			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+		}
+		if p.LocalLinkAddress != "b" {
+			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+		}
+	}
+}
+
+func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Any packets will be resolved to the link address "c".
+			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	for i := 0; i < maxPendingPacketsPerResolution+5; i++ {
+		// Inject inbound 'maxPendingPacketsPerResolution + 5' packets on NIC 1.
+		buf := buffer.NewView(30)
+		buf[0] = 3
+		// Set the packet sequence number.
+		binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], uint16(i))
+		ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+			Data: buf.ToVectorisedView(),
+		})
+	}
+
+	for i := 0; i < maxPendingPacketsPerResolution; i++ {
+		var p fwdTestPacketInfo
+
+		select {
+		case p = <-ep2.C:
+		case <-time.After(time.Second):
+			t.Fatal("packet not forwarded")
+		}
+
+		b := p.Pkt.Header.View()
+		if b[0] != 3 {
+			t.Fatalf("got b[0] = %d, want = 3", b[0])
+		}
+		// The first 5 packets should not be forwarded so the the
+		// sequemnce number should start with 5.
+		want := uint16(i + 5)
+		if n := binary.BigEndian.Uint16(b[fwdTestNetHeaderLen:]); n != want {
+			t.Fatalf("got the packet #%d, want = #%d", n, want)
+		}
+
+		// Test that the address resolution happened correctly.
+		if p.RemoteLinkAddress != "c" {
+			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+		}
+		if p.LocalLinkAddress != "b" {
+			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+		}
+	}
+}
+
+func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Any packets will be resolved to the link address "c".
+			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	for i := 0; i < maxPendingResolutions+5; i++ {
+		// Inject inbound 'maxPendingResolutions + 5' packets on NIC 1.
+		// Each packet has a different destination address (3 to
+		// maxPendingResolutions + 7).
+		buf := buffer.NewView(30)
+		buf[0] = byte(3 + i)
+		ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+			Data: buf.ToVectorisedView(),
+		})
+	}
+
+	for i := 0; i < maxPendingResolutions; i++ {
+		var p fwdTestPacketInfo
+
+		select {
+		case p = <-ep2.C:
+		case <-time.After(time.Second):
+			t.Fatal("packet not forwarded")
+		}
+
+		// The first 5 packets (address 3 to 7) should not be forwarded
+		// because their address resolutions are interrupted.
+		b := p.Pkt.Header.View()
+		if b[0] < 8 {
+			t.Fatalf("got b[0] = %d, want b[0] >= 8", b[0])
+		}
+
+		// Test that the address resolution happened correctly.
+		if p.RemoteLinkAddress != "c" {
+			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+		}
+		if p.LocalLinkAddress != "b" {
+			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+		}
+	}
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 3e6196aee..cd9202aed 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1201,10 +1201,6 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 			n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment()
 			return
 		}
-		defer r.Release()
-
-		r.LocalLinkAddress = n.linkEP.LinkAddress()
-		r.RemoteLinkAddress = remote
 
 		// Found a NIC.
 		n := r.ref.nic
@@ -1213,24 +1209,33 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 		ok = ok && ref.isValidForOutgoingRLocked() && ref.tryIncRef()
 		n.mu.RUnlock()
 		if ok {
+			r.LocalLinkAddress = n.linkEP.LinkAddress()
+			r.RemoteLinkAddress = remote
 			r.RemoteAddress = src
 			// TODO(b/123449044): Update the source NIC as well.
 			ref.ep.HandlePacket(&r, pkt)
 			ref.decRef()
-		} else {
-			// n doesn't have a destination endpoint.
-			// Send the packet out of n.
-			pkt.Header = buffer.NewPrependableFromView(pkt.Data.First())
-			pkt.Data.RemoveFirst()
-
-			// TODO(b/128629022): use route.WritePacket.
-			if err := n.linkEP.WritePacket(&r, nil /* gso */, protocol, pkt); err != nil {
-				r.Stats().IP.OutgoingPacketErrors.Increment()
-			} else {
-				n.stats.Tx.Packets.Increment()
-				n.stats.Tx.Bytes.IncrementBy(uint64(pkt.Header.UsedLength() + pkt.Data.Size()))
+			r.Release()
+			return
+		}
+
+		// n doesn't have a destination endpoint.
+		// Send the packet out of n.
+		// TODO(b/128629022): move this logic to route.WritePacket.
+		if ch, err := r.Resolve(nil); err != nil {
+			if err == tcpip.ErrWouldBlock {
+				n.stack.forwarder.enqueue(ch, n, &r, protocol, pkt)
+				// forwarder will release route.
+				return
 			}
+			n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment()
+			r.Release()
+			return
 		}
+
+		// The link-address resolution finished immediately.
+		n.forwardPacket(&r, protocol, pkt)
+		r.Release()
 		return
 	}
 
@@ -1240,6 +1245,20 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	}
 }
 
+func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+	// TODO(b/143425874) Decrease the TTL field in forwarded packets.
+	pkt.Header = buffer.NewPrependableFromView(pkt.Data.First())
+	pkt.Data.RemoveFirst()
+
+	if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, pkt); err != nil {
+		r.Stats().IP.OutgoingPacketErrors.Increment()
+		return
+	}
+
+	n.stats.Tx.Packets.Increment()
+	n.stats.Tx.Bytes.IncrementBy(uint64(pkt.Header.UsedLength() + pkt.Data.Size()))
+}
+
 // DeliverTransportPacket delivers the packets to the appropriate transport
 // protocol endpoint.
 func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) {
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 13354d884..6f423874a 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -462,6 +462,10 @@ type Stack struct {
 	// opaqueIIDOpts hold the options for generating opaque interface identifiers
 	// (IIDs) as outlined by RFC 7217.
 	opaqueIIDOpts OpaqueInterfaceIdentifierOptions
+
+	// forwarder holds the packets that wait for their link-address resolutions
+	// to complete, and forwards them when each resolution is done.
+	forwarder *forwardQueue
 }
 
 // UniqueID is an abstract generator of unique identifiers.
@@ -641,6 +645,7 @@ func New(opts Options) *Stack {
 		uniqueIDGenerator:    opts.UniqueID,
 		ndpDisp:              opts.NDPDisp,
 		opaqueIIDOpts:        opts.OpaqueIIDOpts,
+		forwarder:            newForwardQueue(),
 	}
 
 	// Add specified network protocols.
-- 
cgit v1.2.3


From f56fe66b13b979f2ac96e8fce6fb0a5dec9a32e0 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 10 Mar 2020 17:50:47 -0700
Subject: Honour the link's MaxHeaderLength when forwarding

This change also updates where the IP packet buffer is held in an
outbound tcpip.PacketBuffer from Header to Data. This change removes
unncessary copying of the IP packet buffer when forwarding.

Test: stack_test.TestNICForwarding
PiperOrigin-RevId: 300217972
---
 pkg/tcpip/packet_buffer.go        |   8 ++-
 pkg/tcpip/stack/forwarder_test.go |   8 +--
 pkg/tcpip/stack/nic.go            |   6 +-
 pkg/tcpip/stack/stack_test.go     | 112 ++++++++++++++++++++++++--------------
 pkg/tcpip/stack/transport_test.go |   4 +-
 5 files changed, 85 insertions(+), 53 deletions(-)

diff --git a/pkg/tcpip/packet_buffer.go b/pkg/tcpip/packet_buffer.go
index ab24372e7..04852132c 100644
--- a/pkg/tcpip/packet_buffer.go
+++ b/pkg/tcpip/packet_buffer.go
@@ -39,8 +39,12 @@ type PacketBuffer struct {
 	// payload.
 	DataSize int
 
-	// Header holds the headers of outbound packets. As a packet is passed
-	// down the stack, each layer adds to Header.
+	// Header holds the headers of outbound packets generated by the netstack. As
+	// a packet is passed down the stack, each layer adds to Header.
+	//
+	// Note, if a packet is being forwarded at the IP layer, the headers for the
+	// IP layer and above (transport) will be held in Data as the packet was not
+	// passed down the stack it arrived at before being forwarded.
 	Header buffer.Prependable
 
 	// These fields are used by both inbound and outbound packets. They
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
index 321b7524d..5a04590d5 100644
--- a/pkg/tcpip/stack/forwarder_test.go
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -473,7 +473,7 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
 		t.Fatal("packet not forwarded")
 	}
 
-	b := p.Pkt.Header.View()
+	b := p.Pkt.Data.First()
 	if b[0] != 3 {
 		t.Fatalf("got b[0] = %d, want = 3", b[0])
 	}
@@ -517,7 +517,7 @@ func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
 			t.Fatal("packet not forwarded")
 		}
 
-		b := p.Pkt.Header.View()
+		b := p.Pkt.Data.First()
 		if b[0] != 3 {
 			t.Fatalf("got b[0] = %d, want = 3", b[0])
 		}
@@ -564,7 +564,7 @@ func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
 			t.Fatal("packet not forwarded")
 		}
 
-		b := p.Pkt.Header.View()
+		b := p.Pkt.Data.First()
 		if b[0] != 3 {
 			t.Fatalf("got b[0] = %d, want = 3", b[0])
 		}
@@ -619,7 +619,7 @@ func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
 
 		// The first 5 packets (address 3 to 7) should not be forwarded
 		// because their address resolutions are interrupted.
-		b := p.Pkt.Header.View()
+		b := p.Pkt.Data.First()
 		if b[0] < 8 {
 			t.Fatalf("got b[0] = %d, want b[0] >= 8", b[0])
 		}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index cd9202aed..e46bd86c6 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1246,10 +1246,10 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 }
 
 func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
-	// TODO(b/143425874) Decrease the TTL field in forwarded packets.
-	pkt.Header = buffer.NewPrependableFromView(pkt.Data.First())
-	pkt.Data.RemoveFirst()
+	// TODO(b/143425874): Decrease the TTL field in forwarded packets.
 
+	// pkt.Header should have enough capacity to hold the link's headers.
+	pkt.Header = buffer.NewPrependable(int(n.linkEP.MaxHeaderLength()))
 	if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, pkt); err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 		return
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index e15db40fb..9515426d6 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -2240,56 +2240,84 @@ func TestNICStats(t *testing.T) {
 }
 
 func TestNICForwarding(t *testing.T) {
-	// Create a stack with the fake network protocol, two NICs, each with
-	// an address.
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
-	})
-	s.SetForwarding(true)
+	const nicID1 = 1
+	const nicID2 = 2
+	const dstAddr = tcpip.Address("\x03")
 
-	ep1 := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(1, ep1); err != nil {
-		t.Fatal("CreateNIC #1 failed:", err)
-	}
-	if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil {
-		t.Fatal("AddAddress #1 failed:", err)
+	tests := []struct {
+		name      string
+		headerLen uint16
+	}{
+		{
+			name: "Zero header length",
+		},
+		{
+			name:      "Non-zero header length",
+			headerLen: 16,
+		},
 	}
 
-	ep2 := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(2, ep2); err != nil {
-		t.Fatal("CreateNIC #2 failed:", err)
-	}
-	if err := s.AddAddress(2, fakeNetNumber, "\x02"); err != nil {
-		t.Fatal("AddAddress #2 failed:", err)
-	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+			})
+			s.SetForwarding(true)
 
-	// Route all packets to address 3 to NIC 2.
-	{
-		subnet, err := tcpip.NewSubnet("\x03", "\xff")
-		if err != nil {
-			t.Fatal(err)
-		}
-		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 2}})
-	}
+			ep1 := channel.New(10, defaultMTU, "")
+			if err := s.CreateNIC(nicID1, ep1); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
+			}
+			if err := s.AddAddress(nicID1, fakeNetNumber, "\x01"); err != nil {
+				t.Fatalf("AddAddress(%d, %d, 0x01): %s", nicID1, fakeNetNumber, err)
+			}
 
-	// Send a packet to address 3.
-	buf := buffer.NewView(30)
-	buf[0] = 3
-	ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
-		Data: buf.ToVectorisedView(),
-	})
+			ep2 := channelLinkWithHeaderLength{
+				Endpoint:     channel.New(10, defaultMTU, ""),
+				headerLength: test.headerLen,
+			}
+			if err := s.CreateNIC(nicID2, &ep2); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
+			}
+			if err := s.AddAddress(nicID2, fakeNetNumber, "\x02"); err != nil {
+				t.Fatalf("AddAddress(%d, %d, 0x02): %s", nicID2, fakeNetNumber, err)
+			}
 
-	if _, ok := ep2.Read(); !ok {
-		t.Fatal("Packet not forwarded")
-	}
+			// Route all packets to dstAddr to NIC 2.
+			{
+				subnet, err := tcpip.NewSubnet(dstAddr, "\xff")
+				if err != nil {
+					t.Fatal(err)
+				}
+				s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: nicID2}})
+			}
 
-	// Test that forwarding increments Tx stats correctly.
-	if got, want := s.NICInfo()[2].Stats.Tx.Packets.Value(), uint64(1); got != want {
-		t.Errorf("got Tx.Packets.Value() = %d, want = %d", got, want)
-	}
+			// Send a packet to dstAddr.
+			buf := buffer.NewView(30)
+			buf[0] = dstAddr[0]
+			ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+				Data: buf.ToVectorisedView(),
+			})
 
-	if got, want := s.NICInfo()[2].Stats.Tx.Bytes.Value(), uint64(len(buf)); got != want {
-		t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want)
+			pkt, ok := ep2.Read()
+			if !ok {
+				t.Fatal("packet not forwarded")
+			}
+
+			// Test that the link's MaxHeaderLength is honoured.
+			if capacity, want := pkt.Pkt.Header.AvailableLength(), int(test.headerLen); capacity != want {
+				t.Errorf("got Header.AvailableLength() = %d, want = %d", capacity, want)
+			}
+
+			// Test that forwarding increments Tx stats correctly.
+			if got, want := s.NICInfo()[nicID2].Stats.Tx.Packets.Value(), uint64(1); got != want {
+				t.Errorf("got Tx.Packets.Value() = %d, want = %d", got, want)
+			}
+
+			if got, want := s.NICInfo()[nicID2].Stats.Tx.Bytes.Value(), uint64(len(buf)); got != want {
+				t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want)
+			}
+		})
 	}
 }
 
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 5d1da2f8b..3609a25b6 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -641,10 +641,10 @@ func TestTransportForwarding(t *testing.T) {
 		t.Fatal("Response packet not forwarded")
 	}
 
-	if dst := p.Pkt.Header.View()[0]; dst != 3 {
+	if dst := p.Pkt.Data.First()[0]; dst != 3 {
 		t.Errorf("Response packet has incorrect destination addresss: got = %d, want = 3", dst)
 	}
-	if src := p.Pkt.Header.View()[1]; src != 1 {
+	if src := p.Pkt.Data.First()[1]; src != 1 {
 		t.Errorf("Response packet has incorrect source addresss: got = %d, want = 3", src)
 	}
 }
-- 
cgit v1.2.3


From 7bca09107b4efc0a7f36f932612061f13a146d6f Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 11 Mar 2020 06:07:52 -0700
Subject: Automated rollback of changelist 300217972

PiperOrigin-RevId: 300308974
---
 pkg/tcpip/packet_buffer.go        |   8 +--
 pkg/tcpip/stack/forwarder_test.go |   8 +--
 pkg/tcpip/stack/nic.go            |   6 +-
 pkg/tcpip/stack/stack_test.go     | 112 ++++++++++++++------------------------
 pkg/tcpip/stack/transport_test.go |   4 +-
 5 files changed, 53 insertions(+), 85 deletions(-)

diff --git a/pkg/tcpip/packet_buffer.go b/pkg/tcpip/packet_buffer.go
index 04852132c..ab24372e7 100644
--- a/pkg/tcpip/packet_buffer.go
+++ b/pkg/tcpip/packet_buffer.go
@@ -39,12 +39,8 @@ type PacketBuffer struct {
 	// payload.
 	DataSize int
 
-	// Header holds the headers of outbound packets generated by the netstack. As
-	// a packet is passed down the stack, each layer adds to Header.
-	//
-	// Note, if a packet is being forwarded at the IP layer, the headers for the
-	// IP layer and above (transport) will be held in Data as the packet was not
-	// passed down the stack it arrived at before being forwarded.
+	// Header holds the headers of outbound packets. As a packet is passed
+	// down the stack, each layer adds to Header.
 	Header buffer.Prependable
 
 	// These fields are used by both inbound and outbound packets. They
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
index 5a04590d5..321b7524d 100644
--- a/pkg/tcpip/stack/forwarder_test.go
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -473,7 +473,7 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
 		t.Fatal("packet not forwarded")
 	}
 
-	b := p.Pkt.Data.First()
+	b := p.Pkt.Header.View()
 	if b[0] != 3 {
 		t.Fatalf("got b[0] = %d, want = 3", b[0])
 	}
@@ -517,7 +517,7 @@ func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
 			t.Fatal("packet not forwarded")
 		}
 
-		b := p.Pkt.Data.First()
+		b := p.Pkt.Header.View()
 		if b[0] != 3 {
 			t.Fatalf("got b[0] = %d, want = 3", b[0])
 		}
@@ -564,7 +564,7 @@ func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
 			t.Fatal("packet not forwarded")
 		}
 
-		b := p.Pkt.Data.First()
+		b := p.Pkt.Header.View()
 		if b[0] != 3 {
 			t.Fatalf("got b[0] = %d, want = 3", b[0])
 		}
@@ -619,7 +619,7 @@ func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
 
 		// The first 5 packets (address 3 to 7) should not be forwarded
 		// because their address resolutions are interrupted.
-		b := p.Pkt.Data.First()
+		b := p.Pkt.Header.View()
 		if b[0] < 8 {
 			t.Fatalf("got b[0] = %d, want b[0] >= 8", b[0])
 		}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index e46bd86c6..cd9202aed 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1246,10 +1246,10 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 }
 
 func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
-	// TODO(b/143425874): Decrease the TTL field in forwarded packets.
+	// TODO(b/143425874) Decrease the TTL field in forwarded packets.
+	pkt.Header = buffer.NewPrependableFromView(pkt.Data.First())
+	pkt.Data.RemoveFirst()
 
-	// pkt.Header should have enough capacity to hold the link's headers.
-	pkt.Header = buffer.NewPrependable(int(n.linkEP.MaxHeaderLength()))
 	if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, pkt); err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 		return
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 9515426d6..e15db40fb 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -2240,84 +2240,56 @@ func TestNICStats(t *testing.T) {
 }
 
 func TestNICForwarding(t *testing.T) {
-	const nicID1 = 1
-	const nicID2 = 2
-	const dstAddr = tcpip.Address("\x03")
+	// Create a stack with the fake network protocol, two NICs, each with
+	// an address.
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+	s.SetForwarding(true)
 
-	tests := []struct {
-		name      string
-		headerLen uint16
-	}{
-		{
-			name: "Zero header length",
-		},
-		{
-			name:      "Non-zero header length",
-			headerLen: 16,
-		},
+	ep1 := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(1, ep1); err != nil {
+		t.Fatal("CreateNIC #1 failed:", err)
+	}
+	if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil {
+		t.Fatal("AddAddress #1 failed:", err)
 	}
 
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
-			})
-			s.SetForwarding(true)
-
-			ep1 := channel.New(10, defaultMTU, "")
-			if err := s.CreateNIC(nicID1, ep1); err != nil {
-				t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
-			}
-			if err := s.AddAddress(nicID1, fakeNetNumber, "\x01"); err != nil {
-				t.Fatalf("AddAddress(%d, %d, 0x01): %s", nicID1, fakeNetNumber, err)
-			}
-
-			ep2 := channelLinkWithHeaderLength{
-				Endpoint:     channel.New(10, defaultMTU, ""),
-				headerLength: test.headerLen,
-			}
-			if err := s.CreateNIC(nicID2, &ep2); err != nil {
-				t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
-			}
-			if err := s.AddAddress(nicID2, fakeNetNumber, "\x02"); err != nil {
-				t.Fatalf("AddAddress(%d, %d, 0x02): %s", nicID2, fakeNetNumber, err)
-			}
-
-			// Route all packets to dstAddr to NIC 2.
-			{
-				subnet, err := tcpip.NewSubnet(dstAddr, "\xff")
-				if err != nil {
-					t.Fatal(err)
-				}
-				s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: nicID2}})
-			}
+	ep2 := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(2, ep2); err != nil {
+		t.Fatal("CreateNIC #2 failed:", err)
+	}
+	if err := s.AddAddress(2, fakeNetNumber, "\x02"); err != nil {
+		t.Fatal("AddAddress #2 failed:", err)
+	}
 
-			// Send a packet to dstAddr.
-			buf := buffer.NewView(30)
-			buf[0] = dstAddr[0]
-			ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
-				Data: buf.ToVectorisedView(),
-			})
+	// Route all packets to address 3 to NIC 2.
+	{
+		subnet, err := tcpip.NewSubnet("\x03", "\xff")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 2}})
+	}
 
-			pkt, ok := ep2.Read()
-			if !ok {
-				t.Fatal("packet not forwarded")
-			}
+	// Send a packet to address 3.
+	buf := buffer.NewView(30)
+	buf[0] = 3
+	ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 
-			// Test that the link's MaxHeaderLength is honoured.
-			if capacity, want := pkt.Pkt.Header.AvailableLength(), int(test.headerLen); capacity != want {
-				t.Errorf("got Header.AvailableLength() = %d, want = %d", capacity, want)
-			}
+	if _, ok := ep2.Read(); !ok {
+		t.Fatal("Packet not forwarded")
+	}
 
-			// Test that forwarding increments Tx stats correctly.
-			if got, want := s.NICInfo()[nicID2].Stats.Tx.Packets.Value(), uint64(1); got != want {
-				t.Errorf("got Tx.Packets.Value() = %d, want = %d", got, want)
-			}
+	// Test that forwarding increments Tx stats correctly.
+	if got, want := s.NICInfo()[2].Stats.Tx.Packets.Value(), uint64(1); got != want {
+		t.Errorf("got Tx.Packets.Value() = %d, want = %d", got, want)
+	}
 
-			if got, want := s.NICInfo()[nicID2].Stats.Tx.Bytes.Value(), uint64(len(buf)); got != want {
-				t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want)
-			}
-		})
+	if got, want := s.NICInfo()[2].Stats.Tx.Bytes.Value(), uint64(len(buf)); got != want {
+		t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want)
 	}
 }
 
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 3609a25b6..5d1da2f8b 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -641,10 +641,10 @@ func TestTransportForwarding(t *testing.T) {
 		t.Fatal("Response packet not forwarded")
 	}
 
-	if dst := p.Pkt.Data.First()[0]; dst != 3 {
+	if dst := p.Pkt.Header.View()[0]; dst != 3 {
 		t.Errorf("Response packet has incorrect destination addresss: got = %d, want = 3", dst)
 	}
-	if src := p.Pkt.Data.First()[1]; src != 1 {
+	if src := p.Pkt.Header.View()[1]; src != 1 {
 		t.Errorf("Response packet has incorrect source addresss: got = %d, want = 3", src)
 	}
 }
-- 
cgit v1.2.3


From 2aa9514a06a5e34894e606d508ac2df53b082c74 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 11 Mar 2020 09:49:06 -0700
Subject: runsc: don't redirect SIGURG which is used by Go's runtime scheduler

Go 1.14+ sends SIGURG to Ms to attempt asynchronous preemption of a G. Since it
can't guarantee that a SIGURG is only related to preemption, it continues to
forward them to signal.Notify (see runtime.sighandler).

When runsc is running a container, there are three processes: a parent process
and two children (sandbox and gopher). A parent process sets a signal handler
for all signals and redirect them to the container init process. This logic
should ignore SIGURG signals. We already ignore them in the Sentry, but it will
be better to not notify about them when this is possible.

PiperOrigin-RevId: 300345286
---
 pkg/sentry/sighandling/sighandling.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go
index ba1f9043d..959ef7217 100644
--- a/pkg/sentry/sighandling/sighandling.go
+++ b/pkg/sentry/sighandling/sighandling.go
@@ -83,6 +83,10 @@ func StartSignalForwarding(handler func(linux.Signal)) func() {
 	// for their handling.
 	var sigchans []chan os.Signal
 	for sig := 1; sig <= numSignals+1; sig++ {
+		// SIGURG is used by Go's runtime scheduler.
+		if sig == int(linux.SIGURG) {
+			continue
+		}
 		sigchan := make(chan os.Signal, 1)
 		sigchans = append(sigchans, sigchan)
 		signal.Notify(sigchan, syscall.Signal(sig))
-- 
cgit v1.2.3


From d3fa741fb539d16b271f1bed2bcd93e382b0b2e2 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 11 Mar 2020 11:44:27 -0700
Subject: runsc: Set asyncpreemptoff for the kvm platform

The asynchronous goroutine preemption is a new feature of Go 1.14.

When we switched to go 1.14 (cl/297915917) in the bazel config,
the kokoro syscall-kvm job started permanently failing. Lets
temporary set asyncpreemptoff for the kvm platform to unblock tests.

PiperOrigin-RevId: 300372387
---
 runsc/sandbox/sandbox.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 192bde40c..6177d6aa7 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -444,6 +444,12 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		nextFD++
 	}
 
+	// TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff
+	// isn't set.
+	if conf.Platform == "kvm" {
+		cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1")
+	}
+
 	// The current process' stdio must be passed to the application via the
 	// --stdio-fds flag. The stdio of the sandbox process itself must not
 	// be connected to the same FDs, otherwise we risk leaking sandbox
-- 
cgit v1.2.3


From 22d89ef5cb7f3321bc7147b130fcff9f083a8aab Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 11 Mar 2020 12:00:28 -0700
Subject: Import "unsafe" in bluepill_arm64_unsafe.go

This fixes a compile time error:
pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go:45:35: undefined: unsafe

PiperOrigin-RevId: 300375687
---
 pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
index 195331383..eb5ed574e 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -17,6 +17,8 @@
 package kvm
 
 import (
+	"unsafe"
+
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
-- 
cgit v1.2.3


From 5ee9bbb15d90b5f116d9482e932e0823cffd9f6e Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Wed, 11 Mar 2020 14:38:04 -0700
Subject: Upgrade Kythe

PiperOrigin-RevId: 300409401
---
 kokoro/kythe/generate_xrefs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
index 323b0f77b..2f531aa72 100644
--- a/kokoro/kythe/generate_xrefs.sh
+++ b/kokoro/kythe/generate_xrefs.sh
@@ -23,7 +23,7 @@ bazel version
 
 python3 -V
 
-readonly KYTHE_VERSION='v0.0.41'
+readonly KYTHE_VERSION='v0.0.43'
 readonly WORKDIR="$(mktemp -d)"
 readonly KYTHE_DIR="${WORKDIR}/kythe-${KYTHE_VERSION}"
 if [[ -n "$KOKORO_GIT_COMMIT" ]]; then
-- 
cgit v1.2.3


From fd84cddab0dc175d5050e2ac46f8520f632d1550 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Wed, 11 Mar 2020 14:44:57 -0700
Subject: Internal change

PiperOrigin-RevId: 300410856
---
 kokoro/benchmark_tests.cfg | 6 +++---
 scripts/benchmark.sh       | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/kokoro/benchmark_tests.cfg b/kokoro/benchmark_tests.cfg
index c48518a05..7e2c8acad 100644
--- a/kokoro/benchmark_tests.cfg
+++ b/kokoro/benchmark_tests.cfg
@@ -5,7 +5,7 @@ before_action {
   fetch_keystore {
     keystore_resource {
         keystore_config_id : 73898
-        keyname : 'kokoro-rbe-service-account'
+        keyname : 'gvisor-benchmarks-service-account'
     },
   }
 }
@@ -21,6 +21,6 @@ env_vars {
 }
 
 env_vars {
-  key : 'KOKORO_SERVICE_ACCOUNT'
-  value : '73898_kokoro-rbe-service-account'
+  key : 'GCLOUD_CREDENTIALS'
+  value : '73898_gvisor-benchmarks-service-account'
 }
diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh
index a0317db02..79ff198d5 100644
--- a/scripts/benchmark.sh
+++ b/scripts/benchmark.sh
@@ -17,9 +17,10 @@
 # Run in the root of the repo.
 cd "$(dirname "$0")"
 
-KEY_PATH=${KEY_PATH:-"${KOKORO_KEYSTORE_DIR}/${KOKORO_SERVICE_ACCOUNT}"}
+export GOOGLE_APPLICATION_CREDENTIALS="${KOKORO_KEYSTORE_DIR}/${GCLOUD_CREDENTIALS}"
 
-gcloud auth activate-service-account --key-file "${KEY_PATH}"
+gcloud auth activate-service-account --key-file "${GOOGLE_APPLICATION_CREDENTIALS}"
 
 gcloud compute instances list
 
+bq show gvisor-benchmarks:test.test
-- 
cgit v1.2.3


From 4054b021f05cb0902e9877ba82403978fd8d6405 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 9 Mar 2020 17:40:13 -0700
Subject: iptables: ready tests to be enabled in kokoro

Fixed flakes (tested via --runs_per_test=100) and added skips for
not-yet-implemented features. Once submitted, the iptables tests will be
ready to enable in kokoro.
---
 scripts/iptables_tests.sh      | 11 +++++++----
 test/iptables/filter_input.go  | 13 +++++++------
 test/iptables/filter_output.go | 10 ++++++----
 test/iptables/iptables_test.go |  6 ++++++
 test/iptables/iptables_util.go | 13 +++++--------
 test/iptables/nat.go           |  2 +-
 6 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/scripts/iptables_tests.sh b/scripts/iptables_tests.sh
index 3069d8628..b4a5211a5 100755
--- a/scripts/iptables_tests.sh
+++ b/scripts/iptables_tests.sh
@@ -19,9 +19,12 @@ source $(dirname $0)/common.sh
 install_runsc_for_test iptables
 
 # Build the docker image for the test.
-run //test/iptables/runner-image --norun
+run //test/iptables/runner:runner-image --norun
 
-# TODO(gvisor.dev/issue/170): Also test this on runsc once iptables are better
-# supported
-test //test/iptables:iptables_test "--test_arg=--runtime=runc" \
+test //test/iptables:iptables_test \
+  "--test_arg=--runtime=runc" \
+  "--test_arg=--image=bazel/test/iptables/runner:runner-image"
+
+test //test/iptables:iptables_test \
+  "--test_arg=--runtime=runsc" \
   "--test_arg=--image=bazel/test/iptables/runner:runner-image"
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index b2fb6401a..141d20fbb 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -106,7 +106,7 @@ func (FilterInputDropOnlyUDP) ContainerAction(ip net.IP) error {
 func (FilterInputDropOnlyUDP) LocalAction(ip net.IP) error {
 	// Try to establish a TCP connection with the container, which should
 	// succeed.
-	return connectTCP(ip, acceptPort, dropPort, sendloopDuration)
+	return connectTCP(ip, acceptPort, sendloopDuration)
 }
 
 // FilterInputDropUDPPort tests that we can drop UDP traffic by port.
@@ -192,7 +192,7 @@ func (FilterInputDropTCPDestPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterInputDropTCPDestPort) LocalAction(ip net.IP) error {
-	if err := connectTCP(ip, dropPort, acceptPort, sendloopDuration); err == nil {
+	if err := connectTCP(ip, dropPort, sendloopDuration); err == nil {
 		return fmt.Errorf("connection destined to port %d should not be accepted, but got accepted", dropPort)
 	}
 
@@ -209,13 +209,14 @@ func (FilterInputDropTCPSrcPort) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputDropTCPSrcPort) ContainerAction(ip net.IP) error {
-	if err := filterTable("-A", "INPUT", "-p", "tcp", "-m", "tcp", "--sport", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+	// Drop anything from an ephemeral port.
+	if err := filterTable("-A", "INPUT", "-p", "tcp", "-m", "tcp", "--sport", "1024:65535", "-j", "DROP"); err != nil {
 		return err
 	}
 
 	// Listen for TCP packets on accept port.
 	if err := listenTCP(acceptPort, sendloopDuration); err == nil {
-		return fmt.Errorf("connection destined to port %d should not be accepted, but got accepted", dropPort)
+		return fmt.Errorf("connection destined to port %d should not be accepted, but was", dropPort)
 	}
 
 	return nil
@@ -223,8 +224,8 @@ func (FilterInputDropTCPSrcPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterInputDropTCPSrcPort) LocalAction(ip net.IP) error {
-	if err := connectTCP(ip, acceptPort, dropPort, sendloopDuration); err == nil {
-		return fmt.Errorf("connection on port %d should not be acceptedi, but got accepted", dropPort)
+	if err := connectTCP(ip, acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection should not be accepted, but was")
 	}
 
 	return nil
diff --git a/test/iptables/filter_output.go b/test/iptables/filter_output.go
index ee2c49f9a..1314a5a92 100644
--- a/test/iptables/filter_output.go
+++ b/test/iptables/filter_output.go
@@ -24,7 +24,8 @@ func init() {
 	RegisterTestCase(FilterOutputDropTCPSrcPort{})
 }
 
-// FilterOutputDropTCPDestPort tests that connections are not accepted on specified source ports.
+// FilterOutputDropTCPDestPort tests that connections are not accepted on
+// specified source ports.
 type FilterOutputDropTCPDestPort struct{}
 
 // Name implements TestCase.Name.
@@ -48,14 +49,15 @@ func (FilterOutputDropTCPDestPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterOutputDropTCPDestPort) LocalAction(ip net.IP) error {
-	if err := connectTCP(ip, acceptPort, dropPort, sendloopDuration); err == nil {
+	if err := connectTCP(ip, acceptPort, sendloopDuration); err == nil {
 		return fmt.Errorf("connection on port %d should not be accepted, but got accepted", dropPort)
 	}
 
 	return nil
 }
 
-// FilterOutputDropTCPSrcPort tests that connections are not accepted on specified source ports.
+// FilterOutputDropTCPSrcPort tests that connections are not accepted on
+// specified source ports.
 type FilterOutputDropTCPSrcPort struct{}
 
 // Name implements TestCase.Name.
@@ -79,7 +81,7 @@ func (FilterOutputDropTCPSrcPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterOutputDropTCPSrcPort) LocalAction(ip net.IP) error {
-	if err := connectTCP(ip, dropPort, acceptPort, sendloopDuration); err == nil {
+	if err := connectTCP(ip, dropPort, sendloopDuration); err == nil {
 		return fmt.Errorf("connection destined to port %d should not be accepted, but got accepted", dropPort)
 	}
 
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 29ad5932d..56ba78107 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -191,24 +191,28 @@ func TestFilterInputDropOnlyUDP(t *testing.T) {
 }
 
 func TestNATRedirectUDPPort(t *testing.T) {
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATRedirectUDPPort{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATRedirectTCPPort(t *testing.T) {
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATRedirectTCPPort{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATDropUDP(t *testing.T) {
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATDropUDP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATAcceptAll(t *testing.T) {
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATAcceptAll{}); err != nil {
 		t.Fatal(err)
 	}
@@ -251,12 +255,14 @@ func TestFilterInputReturnUnderflow(t *testing.T) {
 }
 
 func TestFilterOutputDropTCPDestPort(t *testing.T) {
+	t.Skip("filter OUTPUT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(FilterOutputDropTCPDestPort{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestFilterOutputDropTCPSrcPort(t *testing.T) {
+	t.Skip("filter OUTPUT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(FilterOutputDropTCPSrcPort{}); err != nil {
 		t.Fatal(err)
 	}
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index 32cf5a417..1f8dac4f1 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -125,26 +125,23 @@ func listenTCP(port int, timeout time.Duration) error {
 	return nil
 }
 
-// connectTCP connects the TCP server over specified local port, server IP and remote/server port.
-func connectTCP(ip net.IP, remotePort, localPort int, timeout time.Duration) error {
+// connectTCP connects to the given IP and port from an ephemeral local address.
+func connectTCP(ip net.IP, port int, timeout time.Duration) error {
 	contAddr := net.TCPAddr{
 		IP:   ip,
-		Port: remotePort,
+		Port: port,
 	}
 	// The container may not be listening when we first connect, so retry
 	// upon error.
 	callback := func() error {
-		localAddr := net.TCPAddr{
-			Port: localPort,
-		}
-		conn, err := net.DialTCP("tcp4", &localAddr, &contAddr)
+		conn, err := net.DialTCP("tcp4", nil, &contAddr)
 		if conn != nil {
 			conn.Close()
 		}
 		return err
 	}
 	if err := testutil.Poll(callback, timeout); err != nil {
-		return fmt.Errorf("timed out waiting to send IP, most recent error: %v", err)
+		return fmt.Errorf("timed out waiting to connect IP, most recent error: %v", err)
 	}
 
 	return nil
diff --git a/test/iptables/nat.go b/test/iptables/nat.go
index 899d1c9d3..6ca6b46ca 100644
--- a/test/iptables/nat.go
+++ b/test/iptables/nat.go
@@ -76,7 +76,7 @@ func (NATRedirectTCPPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (NATRedirectTCPPort) LocalAction(ip net.IP) error {
-	return connectTCP(ip, dropPort, acceptPort, sendloopDuration)
+	return connectTCP(ip, dropPort, sendloopDuration)
 }
 
 // NATDropUDP tests that packets are not received in ports other than redirect port.
-- 
cgit v1.2.3


From 81675b850e27ea9d6c853a73bd667fc16901a5e8 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 11 Mar 2020 17:02:46 -0700
Subject: Fix memory leak in danglingEndpoints.

Endpoints which were being terminated in an ERROR state or were moved to CLOSED
by the worker goroutine do not run cleanupLocked() as that should already be run
by the worker termination. But when making that change we made the mistake of
not removing the endpoint from the danglingEndpoints which is normally done in
cleanupLocked().

As a result these endpoints are leaked since a reference is held to them in the
danglingEndpoints array forever till Stack is torn down.

PiperOrigin-RevId: 300438426
---
 pkg/tcpip/transport/tcp/connect.go  | 1 +
 pkg/tcpip/transport/tcp/endpoint.go | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index c0f73ef16..be86af502 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -1639,6 +1639,7 @@ func (e *endpoint) doTimeWait() (twReuse func()) {
 	const timeWaitDone = 3
 
 	s := sleep.Sleeper{}
+	defer s.Done()
 	s.AddWaker(&e.newSegmentWaker, newSegment)
 	s.AddWaker(&e.notificationWaker, notification)
 
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index dc9c18b6f..cf73f5382 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -862,7 +862,6 @@ func (e *endpoint) closeNoShutdown() {
 	e.closed = true
 	// Either perform the local cleanup or kick the worker to make sure it
 	// knows it needs to cleanup.
-	tcpip.AddDanglingEndpoint(e)
 	switch e.EndpointState() {
 	// Sockets in StateSynRecv state(passive connections) are closed when
 	// the handshake fails or if the listening socket is closed while
@@ -876,6 +875,9 @@ func (e *endpoint) closeNoShutdown() {
 		// do nothing.
 	default:
 		e.workerCleanup = true
+		tcpip.AddDanglingEndpoint(e)
+		// Worker will remove the dangling endpoint when the endpoint
+		// goroutine terminates.
 		e.notifyProtocolGoroutine(notifyClose)
 	}
 
-- 
cgit v1.2.3


From 61051f226889f51fb97bd44131899a3c502b4c42 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 11 Mar 2020 19:50:59 -0700
Subject: Clean-up buffer implementation.

This also adds substantial test cases.

The Read/Write interfaces are dropped as they are not necessary.

PiperOrigin-RevId: 300461547
---
 pkg/buffer/BUILD           |  10 +-
 pkg/buffer/buffer.go       |  55 +++--
 pkg/buffer/safemem.go      |  30 ++-
 pkg/buffer/safemem_test.go | 170 +++++++++++++++
 pkg/buffer/view.go         | 214 ++++++++++---------
 pkg/buffer/view_test.go    | 510 +++++++++++++++++++++++++++++++++------------
 6 files changed, 715 insertions(+), 274 deletions(-)
 create mode 100644 pkg/buffer/safemem_test.go

diff --git a/pkg/buffer/BUILD b/pkg/buffer/BUILD
index a77a3beea..dcd086298 100644
--- a/pkg/buffer/BUILD
+++ b/pkg/buffer/BUILD
@@ -10,8 +10,8 @@ go_template_instance(
     prefix = "buffer",
     template = "//pkg/ilist:generic_list",
     types = {
-        "Element": "*Buffer",
-        "Linker": "*Buffer",
+        "Element": "*buffer",
+        "Linker": "*buffer",
     },
 )
 
@@ -34,6 +34,10 @@ go_library(
 go_test(
     name = "buffer_test",
     size = "small",
-    srcs = ["view_test.go"],
+    srcs = [
+        "safemem_test.go",
+        "view_test.go",
+    ],
     library = ":buffer",
+    deps = ["//pkg/safemem"],
 )
diff --git a/pkg/buffer/buffer.go b/pkg/buffer/buffer.go
index d5f64609b..c6d089fd9 100644
--- a/pkg/buffer/buffer.go
+++ b/pkg/buffer/buffer.go
@@ -13,6 +13,10 @@
 // limitations under the License.
 
 // Package buffer provides the implementation of a buffer view.
+//
+// A view is an flexible buffer, backed by a pool, supporting the safecopy
+// operations natively as well as the ability to grow via either prepend or
+// append, as well as shrink.
 package buffer
 
 import (
@@ -21,7 +25,7 @@ import (
 
 const bufferSize = 8144 // See below.
 
-// Buffer encapsulates a queueable byte buffer.
+// buffer encapsulates a queueable byte buffer.
 //
 // Note that the total size is slightly less than two pages. This is done
 // intentionally to ensure that the buffer object aligns with runtime
@@ -30,38 +34,61 @@ const bufferSize = 8144 // See below.
 // large enough chunk to limit excessive segmentation.
 //
 // +stateify savable
-type Buffer struct {
+type buffer struct {
 	data  [bufferSize]byte
 	read  int
 	write int
 	bufferEntry
 }
 
-// Reset resets internal data.
+// reset resets internal data.
 //
-// This must be called before use.
-func (b *Buffer) Reset() {
+// This must be called before returning the buffer to the pool.
+func (b *buffer) Reset() {
 	b.read = 0
 	b.write = 0
 }
 
-// Empty indicates the buffer is empty.
-//
-// This indicates there is no data left to read.
-func (b *Buffer) Empty() bool {
-	return b.read == b.write
-}
-
 // Full indicates the buffer is full.
 //
 // This indicates there is no capacity left to write.
-func (b *Buffer) Full() bool {
+func (b *buffer) Full() bool {
 	return b.write == len(b.data)
 }
 
+// ReadSize returns the number of bytes available for reading.
+func (b *buffer) ReadSize() int {
+	return b.write - b.read
+}
+
+// ReadMove advances the read index by the given amount.
+func (b *buffer) ReadMove(n int) {
+	b.read += n
+}
+
+// ReadSlice returns the read slice for this buffer.
+func (b *buffer) ReadSlice() []byte {
+	return b.data[b.read:b.write]
+}
+
+// WriteSize returns the number of bytes available for writing.
+func (b *buffer) WriteSize() int {
+	return len(b.data) - b.write
+}
+
+// WriteMove advances the write index by the given amount.
+func (b *buffer) WriteMove(n int) {
+	b.write += n
+}
+
+// WriteSlice returns the write slice for this buffer.
+func (b *buffer) WriteSlice() []byte {
+	return b.data[b.write:]
+}
+
 // bufferPool is a pool for buffers.
 var bufferPool = sync.Pool{
 	New: func() interface{} {
-		return new(Buffer)
+		return new(buffer)
 	},
 }
diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go
index 071aaa488..0e5b86344 100644
--- a/pkg/buffer/safemem.go
+++ b/pkg/buffer/safemem.go
@@ -15,19 +15,17 @@
 package buffer
 
 import (
-	"io"
-
 	"gvisor.dev/gvisor/pkg/safemem"
 )
 
 // WriteBlock returns this buffer as a write Block.
-func (b *Buffer) WriteBlock() safemem.Block {
-	return safemem.BlockFromSafeSlice(b.data[b.write:])
+func (b *buffer) WriteBlock() safemem.Block {
+	return safemem.BlockFromSafeSlice(b.WriteSlice())
 }
 
 // ReadBlock returns this buffer as a read Block.
-func (b *Buffer) ReadBlock() safemem.Block {
-	return safemem.BlockFromSafeSlice(b.data[b.read:b.write])
+func (b *buffer) ReadBlock() safemem.Block {
+	return safemem.BlockFromSafeSlice(b.ReadSlice())
 }
 
 // WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
@@ -47,21 +45,21 @@ func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
 	// Need at least one buffer.
 	firstBuf := v.data.Back()
 	if firstBuf == nil {
-		firstBuf = bufferPool.Get().(*Buffer)
+		firstBuf = bufferPool.Get().(*buffer)
 		v.data.PushBack(firstBuf)
 	}
 
 	// Does the last block have sufficient capacity alone?
-	if l := len(firstBuf.data) - firstBuf.write; l >= need {
+	if l := firstBuf.WriteSize(); l >= need {
 		dst = safemem.BlockSeqOf(firstBuf.WriteBlock())
 	} else {
 		// Append blocks until sufficient.
 		need -= l
 		blocks = append(blocks, firstBuf.WriteBlock())
 		for need > 0 {
-			emptyBuf := bufferPool.Get().(*Buffer)
+			emptyBuf := bufferPool.Get().(*buffer)
 			v.data.PushBack(emptyBuf)
-			need -= len(emptyBuf.data) // Full block.
+			need -= emptyBuf.WriteSize()
 			blocks = append(blocks, emptyBuf.WriteBlock())
 		}
 		dst = safemem.BlockSeqFromSlice(blocks)
@@ -73,11 +71,11 @@ func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
 
 	// Update all indices.
 	for left := int(n); left > 0; firstBuf = firstBuf.Next() {
-		if l := len(firstBuf.data) - firstBuf.write; left >= l {
-			firstBuf.write += l // Whole block.
+		if l := firstBuf.WriteSize(); left >= l {
+			firstBuf.WriteMove(l) // Whole block.
 			left -= l
 		} else {
-			firstBuf.write += left // Partial block.
+			firstBuf.WriteMove(left) // Partial block.
 			left = 0
 		}
 	}
@@ -103,18 +101,18 @@ func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 
 	firstBuf := v.data.Front()
 	if firstBuf == nil {
-		return 0, io.EOF
+		return 0, nil // No EOF.
 	}
 
 	// Is all the data in a single block?
-	if l := firstBuf.write - firstBuf.read; l >= need {
+	if l := firstBuf.ReadSize(); l >= need {
 		src = safemem.BlockSeqOf(firstBuf.ReadBlock())
 	} else {
 		// Build a list of all the buffers.
 		need -= l
 		blocks = append(blocks, firstBuf.ReadBlock())
 		for buf := firstBuf.Next(); buf != nil && need > 0; buf = buf.Next() {
-			need -= buf.write - buf.read
+			need -= buf.ReadSize()
 			blocks = append(blocks, buf.ReadBlock())
 		}
 		src = safemem.BlockSeqFromSlice(blocks)
diff --git a/pkg/buffer/safemem_test.go b/pkg/buffer/safemem_test.go
new file mode 100644
index 000000000..47f357e0c
--- /dev/null
+++ b/pkg/buffer/safemem_test.go
@@ -0,0 +1,170 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"bytes"
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+func TestSafemem(t *testing.T) {
+	testCases := []struct {
+		name    string
+		input   string
+		output  string
+		readLen int
+		op      func(*View)
+	}{
+		// Basic coverage.
+		{
+			name:   "short",
+			input:  "010",
+			output: "010",
+		},
+		{
+			name:   "long",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: "0" + strings.Repeat("1", bufferSize) + "0",
+		},
+		{
+			name:    "short-read",
+			input:   "0",
+			readLen: 100, // > size.
+			output:  "0",
+		},
+		{
+			name:   "zero-read",
+			input:  "0",
+			output: "",
+		},
+		{
+			name:    "read-empty",
+			input:   "",
+			readLen: 1, // > size.
+			output:  "",
+		},
+
+		// Ensure offsets work.
+		{
+			name:   "offsets-short",
+			input:  "012",
+			output: "2",
+			op: func(v *View) {
+				v.TrimFront(2)
+			},
+		},
+		{
+			name:   "offsets-long0",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: strings.Repeat("1", bufferSize) + "0",
+			op: func(v *View) {
+				v.TrimFront(1)
+			},
+		},
+		{
+			name:   "offsets-long1",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: strings.Repeat("1", bufferSize-1) + "0",
+			op: func(v *View) {
+				v.TrimFront(2)
+			},
+		},
+		{
+			name:   "offsets-long2",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: "10",
+			op: func(v *View) {
+				v.TrimFront(bufferSize)
+			},
+		},
+
+		// Ensure truncation works.
+		{
+			name:   "truncate-short",
+			input:  "012",
+			output: "01",
+			op: func(v *View) {
+				v.Truncate(2)
+			},
+		},
+		{
+			name:   "truncate-long0",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: "0" + strings.Repeat("1", bufferSize),
+			op: func(v *View) {
+				v.Truncate(bufferSize + 1)
+			},
+		},
+		{
+			name:   "truncate-long1",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: "0" + strings.Repeat("1", bufferSize-1),
+			op: func(v *View) {
+				v.Truncate(bufferSize)
+			},
+		},
+		{
+			name:   "truncate-long2",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: "01",
+			op: func(v *View) {
+				v.Truncate(2)
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Construct the new view.
+			var view View
+			bs := safemem.BlockSeqOf(safemem.BlockFromSafeSlice([]byte(tc.input)))
+			n, err := view.WriteFromBlocks(bs)
+			if err != nil {
+				t.Errorf("expected err nil, got %v", err)
+			}
+			if n != uint64(len(tc.input)) {
+				t.Errorf("expected %d bytes, got %d", len(tc.input), n)
+			}
+
+			// Run the operation.
+			if tc.op != nil {
+				tc.op(&view)
+			}
+
+			// Read and validate.
+			readLen := tc.readLen
+			if readLen == 0 {
+				readLen = len(tc.output) // Default.
+			}
+			out := make([]byte, readLen)
+			bs = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(out))
+			n, err = view.ReadToBlocks(bs)
+			if err != nil {
+				t.Errorf("expected nil, got %v", err)
+			}
+			if n != uint64(len(tc.output)) {
+				t.Errorf("expected %d bytes, got %d", len(tc.output), n)
+			}
+
+			// Ensure the contents are correct.
+			if !bytes.Equal(out[:n], []byte(tc.output[:n])) {
+				t.Errorf("contents are wrong: expected %q, got %q", tc.output, string(out))
+			}
+		})
+	}
+}
diff --git a/pkg/buffer/view.go b/pkg/buffer/view.go
index 00fc11e9c..e6901eadb 100644
--- a/pkg/buffer/view.go
+++ b/pkg/buffer/view.go
@@ -38,14 +38,6 @@ func (v *View) TrimFront(count int64) {
 	}
 }
 
-// Read implements io.Reader.Read.
-//
-// Note that reading does not advance the read index. This must be done
-// manually using TrimFront or other methods.
-func (v *View) Read(p []byte) (int, error) {
-	return v.ReadAt(p, 0)
-}
-
 // ReadAt implements io.ReaderAt.ReadAt.
 func (v *View) ReadAt(p []byte, offset int64) (int, error) {
 	var (
@@ -54,54 +46,46 @@ func (v *View) ReadAt(p []byte, offset int64) (int, error) {
 	)
 	for buf := v.data.Front(); buf != nil && done < int64(len(p)); buf = buf.Next() {
 		needToSkip := int(offset - skipped)
-		if l := buf.write - buf.read; l <= needToSkip {
-			skipped += int64(l)
+		if sz := buf.ReadSize(); sz <= needToSkip {
+			skipped += int64(sz)
 			continue
 		}
 
 		// Actually read data.
-		n := copy(p[done:], buf.data[buf.read+needToSkip:buf.write])
+		n := copy(p[done:], buf.ReadSlice()[needToSkip:])
 		skipped += int64(needToSkip)
 		done += int64(n)
 	}
-	if int(done) < len(p) {
+	if int(done) < len(p) || offset+done == v.size {
 		return int(done), io.EOF
 	}
 	return int(done), nil
 }
 
-// Write implements io.Writer.Write.
-func (v *View) Write(p []byte) (int, error) {
-	v.Append(p) // Does not fail.
-	return len(p), nil
-}
-
 // advanceRead advances the view's read index.
 //
 // Precondition: there must be sufficient bytes in the buffer.
 func (v *View) advanceRead(count int64) {
 	for buf := v.data.Front(); buf != nil && count > 0; {
-		l := int64(buf.write - buf.read)
-		if l > count {
+		sz := int64(buf.ReadSize())
+		if sz > count {
 			// There is still data for reading.
-			buf.read += int(count)
+			buf.ReadMove(int(count))
 			v.size -= count
 			count = 0
 			break
 		}
 
-		// Read from this buffer.
-		buf.read += int(l)
-		count -= l
-		v.size -= l
-
-		// When all data has been read from a buffer, we push
-		// it into the empty buffer pool for reuse.
+		// Consume the whole buffer.
 		oldBuf := buf
 		buf = buf.Next() // Iterate.
 		v.data.Remove(oldBuf)
 		oldBuf.Reset()
 		bufferPool.Put(oldBuf)
+
+		// Update counts.
+		count -= sz
+		v.size -= sz
 	}
 	if count > 0 {
 		panic(fmt.Sprintf("advanceRead still has %d bytes remaining", count))
@@ -109,37 +93,39 @@ func (v *View) advanceRead(count int64) {
 }
 
 // Truncate truncates the view to the given bytes.
+//
+// This will not grow the view, only shrink it. If a length is passed that is
+// greater than the current size of the view, then nothing will happen.
+//
+// Precondition: length must be >= 0.
 func (v *View) Truncate(length int64) {
-	if length < 0 || length >= v.size {
+	if length < 0 {
+		panic("negative length provided")
+	}
+	if length >= v.size {
 		return // Nothing to do.
 	}
 	for buf := v.data.Back(); buf != nil && v.size > length; buf = v.data.Back() {
-		l := int64(buf.write - buf.read) // Local bytes.
-		switch {
-		case v.size-l >= length:
-			// Drop the buffer completely; see above.
-			v.data.Remove(buf)
-			v.size -= l
-			buf.Reset()
-			bufferPool.Put(buf)
-
-		case v.size > length && v.size-l < length:
-			// Just truncate the buffer locally.
-			delta := (length - (v.size - l))
-			buf.write = buf.read + int(delta)
+		sz := int64(buf.ReadSize())
+		if after := v.size - sz; after < length {
+			// Truncate the buffer locally.
+			left := (length - after)
+			buf.write = buf.read + int(left)
 			v.size = length
-
-		default:
-			// Should never happen.
-			panic("invalid buffer during truncation")
+			break
 		}
+
+		// Drop the buffer completely; see above.
+		v.data.Remove(buf)
+		buf.Reset()
+		bufferPool.Put(buf)
+		v.size -= sz
 	}
-	v.size = length // Save the new size.
 }
 
-// Grow grows the given view to the number of bytes. If zero
-// is true, all these bytes will be zero. If zero is false,
-// then this is the caller's responsibility.
+// Grow grows the given view to the number of bytes, which will be appended. If
+// zero is true, all these bytes will be zero. If zero is false, then this is
+// the caller's responsibility.
 //
 // Precondition: length must be >= 0.
 func (v *View) Grow(length int64, zero bool) {
@@ -149,29 +135,29 @@ func (v *View) Grow(length int64, zero bool) {
 	for v.size < length {
 		buf := v.data.Back()
 
-		// Is there at least one buffer?
+		// Is there some space in the last buffer?
 		if buf == nil || buf.Full() {
-			buf = bufferPool.Get().(*Buffer)
+			buf = bufferPool.Get().(*buffer)
 			v.data.PushBack(buf)
 		}
 
 		// Write up to length bytes.
-		l := len(buf.data) - buf.write
-		if int64(l) > length-v.size {
-			l = int(length - v.size)
+		sz := buf.WriteSize()
+		if int64(sz) > length-v.size {
+			sz = int(length - v.size)
 		}
 
 		// Zero the written section; note that this pattern is
 		// specifically recognized and optimized by the compiler.
 		if zero {
-			for i := buf.write; i < buf.write+l; i++ {
+			for i := buf.write; i < buf.write+sz; i++ {
 				buf.data[i] = 0
 			}
 		}
 
 		// Advance the index.
-		buf.write += l
-		v.size += int64(l)
+		buf.WriteMove(sz)
+		v.size += int64(sz)
 	}
 }
 
@@ -181,31 +167,40 @@ func (v *View) Prepend(data []byte) {
 	if buf := v.data.Front(); buf != nil && buf.read > 0 {
 		// Fill up before the first write.
 		avail := buf.read
-		copy(buf.data[0:], data[len(data)-avail:])
-		data = data[:len(data)-avail]
-		v.size += int64(avail)
+		bStart := 0
+		dStart := len(data) - avail
+		if avail > len(data) {
+			bStart = avail - len(data)
+			dStart = 0
+		}
+		n := copy(buf.data[bStart:], data[dStart:])
+		data = data[:dStart]
+		v.size += int64(n)
+		buf.read -= n
 	}
 
 	for len(data) > 0 {
 		// Do we need an empty buffer?
-		buf := bufferPool.Get().(*Buffer)
+		buf := bufferPool.Get().(*buffer)
 		v.data.PushFront(buf)
 
 		// The buffer is empty; copy last chunk.
-		start := len(data) - len(buf.data)
-		if start < 0 {
-			start = 0 // Everything.
+		avail := len(buf.data)
+		bStart := 0
+		dStart := len(data) - avail
+		if avail > len(data) {
+			bStart = avail - len(data)
+			dStart = 0
 		}
 
 		// We have to put the data at the end of the current
 		// buffer in order to ensure that the next prepend will
 		// correctly fill up the beginning of this buffer.
-		bStart := len(buf.data) - len(data[start:])
-		n := copy(buf.data[bStart:], data[start:])
-		buf.read = bStart
-		buf.write = len(buf.data)
-		data = data[:start]
+		n := copy(buf.data[bStart:], data[dStart:])
+		data = data[:dStart]
 		v.size += int64(n)
+		buf.read = len(buf.data) - n
+		buf.write = len(buf.data)
 	}
 }
 
@@ -214,16 +209,16 @@ func (v *View) Append(data []byte) {
 	for done := 0; done < len(data); {
 		buf := v.data.Back()
 
-		// Find the first empty buffer.
+		// Ensure there's a buffer with space.
 		if buf == nil || buf.Full() {
-			buf = bufferPool.Get().(*Buffer)
+			buf = bufferPool.Get().(*buffer)
 			v.data.PushBack(buf)
 		}
 
 		// Copy in to the given buffer.
-		n := copy(buf.data[buf.write:], data[done:])
+		n := copy(buf.WriteSlice(), data[done:])
 		done += n
-		buf.write += n
+		buf.WriteMove(n)
 		v.size += int64(n)
 	}
 }
@@ -232,52 +227,52 @@ func (v *View) Append(data []byte) {
 //
 // This method should not be used in any performance-sensitive paths. It may
 // allocate a fresh byte slice sufficiently large to contain all the data in
-// the buffer.
+// the buffer. This is principally for debugging.
 //
 // N.B. Tee data still belongs to this view, as if there is a single buffer
 // present, then it will be returned directly. This should be used for
 // temporary use only, and a reference to the given slice should not be held.
 func (v *View) Flatten() []byte {
-	if buf := v.data.Front(); buf.Next() == nil {
-		return buf.data[buf.read:buf.write] // Only one buffer.
+	if buf := v.data.Front(); buf == nil {
+		return nil // No data at all.
+	} else if buf.Next() == nil {
+		return buf.ReadSlice() // Only one buffer.
 	}
 	data := make([]byte, 0, v.size) // Need to flatten.
 	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
 		// Copy to the allocated slice.
-		data = append(data, buf.data[buf.read:buf.write]...)
+		data = append(data, buf.ReadSlice()...)
 	}
 	return data
 }
 
 // Size indicates the total amount of data available in this view.
-func (v *View) Size() (sz int64) {
-	sz = v.size // Pre-calculated.
-	return sz
+func (v *View) Size() int64 {
+	return v.size
 }
 
 // Copy makes a strict copy of this view.
 func (v *View) Copy() (other View) {
 	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
-		other.Append(buf.data[buf.read:buf.write])
+		other.Append(buf.ReadSlice())
 	}
-	return other
+	return
 }
 
 // Apply applies the given function across all valid data.
 func (v *View) Apply(fn func([]byte)) {
 	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
-		if l := int64(buf.write - buf.read); l > 0 {
-			fn(buf.data[buf.read:buf.write])
-		}
+		fn(buf.ReadSlice())
 	}
 }
 
 // Merge merges the provided View with this one.
 //
-// The other view will be empty after this operation.
+// The other view will be appended to v, and other will be empty after this
+// operation completes.
 func (v *View) Merge(other *View) {
 	// Copy over all buffers.
-	for buf := other.data.Front(); buf != nil && !buf.Empty(); buf = other.data.Front() {
+	for buf := other.data.Front(); buf != nil; buf = other.data.Front() {
 		other.data.Remove(buf)
 		v.data.PushBack(buf)
 	}
@@ -288,6 +283,9 @@ func (v *View) Merge(other *View) {
 }
 
 // WriteFromReader writes to the buffer from an io.Reader.
+//
+// A minimum read size equal to unsafe.Sizeof(unintptr) is enforced,
+// provided that count is greater than or equal to unsafe.Sizeof(uintptr).
 func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) {
 	var (
 		done int64
@@ -297,17 +295,17 @@ func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) {
 	for done < count {
 		buf := v.data.Back()
 
-		// Find the first empty buffer.
+		// Ensure we have an empty buffer.
 		if buf == nil || buf.Full() {
-			buf = bufferPool.Get().(*Buffer)
+			buf = bufferPool.Get().(*buffer)
 			v.data.PushBack(buf)
 		}
 
 		// Is this less than the minimum batch?
-		if len(buf.data[buf.write:]) < minBatch && (count-done) >= int64(minBatch) {
+		if buf.WriteSize() < minBatch && (count-done) >= int64(minBatch) {
 			tmp := make([]byte, minBatch)
 			n, err = r.Read(tmp)
-			v.Write(tmp[:n])
+			v.Append(tmp[:n])
 			done += int64(n)
 			if err != nil {
 				break
@@ -316,14 +314,14 @@ func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) {
 		}
 
 		// Limit the read, if necessary.
-		end := len(buf.data)
-		if int64(end-buf.write) > (count - done) {
-			end = buf.write + int(count-done)
+		sz := buf.WriteSize()
+		if left := count - done; int64(sz) > left {
+			sz = int(left)
 		}
 
 		// Pass the relevant portion of the buffer.
-		n, err = r.Read(buf.data[buf.write:end])
-		buf.write += n
+		n, err = r.Read(buf.WriteSlice()[:sz])
+		buf.WriteMove(n)
 		done += int64(n)
 		v.size += int64(n)
 		if err == io.EOF {
@@ -340,6 +338,9 @@ func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) {
 //
 // N.B. This does not consume the bytes read. TrimFront should
 // be called appropriately after this call in order to do so.
+//
+// A minimum write size equal to unsafe.Sizeof(unintptr) is enforced,
+// provided that count is greater than or equal to unsafe.Sizeof(uintptr).
 func (v *View) ReadToWriter(w io.Writer, count int64) (int64, error) {
 	var (
 		done int64
@@ -348,15 +349,22 @@ func (v *View) ReadToWriter(w io.Writer, count int64) (int64, error) {
 	)
 	offset := 0 // Spill-over for batching.
 	for buf := v.data.Front(); buf != nil && done < count; buf = buf.Next() {
-		l := buf.write - buf.read - offset
+		// Has this been consumed? Skip it.
+		sz := buf.ReadSize()
+		if sz <= offset {
+			offset -= sz
+			continue
+		}
+		sz -= offset
 
 		// Is this less than the minimum batch?
-		if l < minBatch && (count-done) >= int64(minBatch) && (v.size-done) >= int64(minBatch) {
+		left := count - done
+		if sz < minBatch && left >= int64(minBatch) && (v.size-done) >= int64(minBatch) {
 			tmp := make([]byte, minBatch)
 			n, err = v.ReadAt(tmp, done)
 			w.Write(tmp[:n])
 			done += int64(n)
-			offset = n - l // Reset below.
+			offset = n - sz // Reset below.
 			if err != nil {
 				break
 			}
@@ -364,12 +372,12 @@ func (v *View) ReadToWriter(w io.Writer, count int64) (int64, error) {
 		}
 
 		// Limit the write if necessary.
-		if int64(l) >= (count - done) {
-			l = int(count - done)
+		if int64(sz) >= left {
+			sz = int(left)
 		}
 
 		// Perform the actual write.
-		n, err = w.Write(buf.data[buf.read+offset : buf.read+offset+l])
+		n, err = w.Write(buf.ReadSlice()[offset : offset+sz])
 		done += int64(n)
 		if err != nil {
 			break
diff --git a/pkg/buffer/view_test.go b/pkg/buffer/view_test.go
index 37e652f16..3db1bc6ee 100644
--- a/pkg/buffer/view_test.go
+++ b/pkg/buffer/view_test.go
@@ -16,218 +16,452 @@ package buffer
 
 import (
 	"bytes"
+	"io"
 	"strings"
 	"testing"
 )
 
+func fillAppend(v *View, data []byte) {
+	v.Append(data)
+}
+
+func fillAppendEnd(v *View, data []byte) {
+	v.Grow(bufferSize-1, false)
+	v.Append(data)
+	v.TrimFront(bufferSize - 1)
+}
+
+func fillWriteFromReader(v *View, data []byte) {
+	b := bytes.NewBuffer(data)
+	v.WriteFromReader(b, int64(len(data)))
+}
+
+func fillWriteFromReaderEnd(v *View, data []byte) {
+	v.Grow(bufferSize-1, false)
+	b := bytes.NewBuffer(data)
+	v.WriteFromReader(b, int64(len(data)))
+	v.TrimFront(bufferSize - 1)
+}
+
+var fillFuncs = map[string]func(*View, []byte){
+	"append":             fillAppend,
+	"appendEnd":          fillAppendEnd,
+	"writeFromReader":    fillWriteFromReader,
+	"writeFromReaderEnd": fillWriteFromReaderEnd,
+}
+
+func testReadAt(t *testing.T, v *View, offset int64, n int, wantStr string, wantErr error) {
+	t.Helper()
+	d := make([]byte, n)
+	n, err := v.ReadAt(d, offset)
+	if n != len(wantStr) {
+		t.Errorf("got %d, want %d", n, len(wantStr))
+	}
+	if err != wantErr {
+		t.Errorf("got err %v, want %v", err, wantErr)
+	}
+	if !bytes.Equal(d[:n], []byte(wantStr)) {
+		t.Errorf("got %q, want %q", string(d[:n]), wantStr)
+	}
+}
+
 func TestView(t *testing.T) {
 	testCases := []struct {
 		name   string
 		input  string
 		output string
-		ops    []func(*View)
+		op     func(*testing.T, *View)
 	}{
-		// Prepend.
+		// Preconditions.
+		{
+			name:   "truncate-check",
+			input:  "hello",
+			output: "hello", // Not touched.
+			op: func(t *testing.T, v *View) {
+				defer func() {
+					if r := recover(); r == nil {
+						t.Errorf("Truncate(-1) did not panic")
+					}
+				}()
+				v.Truncate(-1)
+			},
+		},
+		{
+			name:   "grow-check",
+			input:  "hello",
+			output: "hello", // Not touched.
+			op: func(t *testing.T, v *View) {
+				defer func() {
+					if r := recover(); r == nil {
+						t.Errorf("Grow(-1) did not panic")
+					}
+				}()
+				v.Grow(-1, false)
+			},
+		},
 		{
-			name:  "prepend",
-			input: "world",
-			ops: []func(*View){
-				func(v *View) {
-					v.Prepend([]byte("hello "))
-				},
+			name:   "advance-check",
+			input:  "hello",
+			output: "", // Consumed.
+			op: func(t *testing.T, v *View) {
+				defer func() {
+					if r := recover(); r == nil {
+						t.Errorf("advanceRead(Size()+1) did not panic")
+					}
+				}()
+				v.advanceRead(v.Size() + 1)
 			},
+		},
+
+		// Prepend.
+		{
+			name:   "prepend",
+			input:  "world",
 			output: "hello world",
+			op: func(t *testing.T, v *View) {
+				v.Prepend([]byte("hello "))
+			},
 		},
 		{
-			name:  "prepend fill",
-			input: strings.Repeat("1", bufferSize-1),
-			ops: []func(*View){
-				func(v *View) {
-					v.Prepend([]byte("0"))
-				},
+			name:   "prepend-backfill-full",
+			input:  "hello world",
+			output: "jello world",
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(1)
+				v.Prepend([]byte("j"))
 			},
-			output: "0" + strings.Repeat("1", bufferSize-1),
 		},
 		{
-			name:  "prepend overflow",
-			input: strings.Repeat("1", bufferSize),
-			ops: []func(*View){
-				func(v *View) {
-					v.Prepend([]byte("0"))
-				},
+			name:   "prepend-backfill-under",
+			input:  "hello world",
+			output: "hola world",
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(5)
+				v.Prepend([]byte("hola"))
 			},
-			output: "0" + strings.Repeat("1", bufferSize),
 		},
 		{
-			name:  "prepend multiple buffers",
-			input: strings.Repeat("1", bufferSize-1),
-			ops: []func(*View){
-				func(v *View) {
-					v.Prepend([]byte(strings.Repeat("0", bufferSize*3)))
-				},
+			name:   "prepend-backfill-over",
+			input:  "hello world",
+			output: "smello world",
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(1)
+				v.Prepend([]byte("sm"))
 			},
+		},
+		{
+			name:   "prepend-fill",
+			input:  strings.Repeat("1", bufferSize-1),
+			output: "0" + strings.Repeat("1", bufferSize-1),
+			op: func(t *testing.T, v *View) {
+				v.Prepend([]byte("0"))
+			},
+		},
+		{
+			name:   "prepend-overflow",
+			input:  strings.Repeat("1", bufferSize),
+			output: "0" + strings.Repeat("1", bufferSize),
+			op: func(t *testing.T, v *View) {
+				v.Prepend([]byte("0"))
+			},
+		},
+		{
+			name:   "prepend-multiple-buffers",
+			input:  strings.Repeat("1", bufferSize-1),
 			output: strings.Repeat("0", bufferSize*3) + strings.Repeat("1", bufferSize-1),
+			op: func(t *testing.T, v *View) {
+				v.Prepend([]byte(strings.Repeat("0", bufferSize*3)))
+			},
 		},
 
-		// Append.
+		// Append and write.
 		{
-			name:  "append",
-			input: "hello",
-			ops: []func(*View){
-				func(v *View) {
-					v.Append([]byte(" world"))
-				},
-			},
+			name:   "append",
+			input:  "hello",
 			output: "hello world",
+			op: func(t *testing.T, v *View) {
+				v.Append([]byte(" world"))
+			},
 		},
 		{
-			name:  "append fill",
-			input: strings.Repeat("1", bufferSize-1),
-			ops: []func(*View){
-				func(v *View) {
-					v.Append([]byte("0"))
-				},
-			},
+			name:   "append-fill",
+			input:  strings.Repeat("1", bufferSize-1),
 			output: strings.Repeat("1", bufferSize-1) + "0",
+			op: func(t *testing.T, v *View) {
+				v.Append([]byte("0"))
+			},
 		},
 		{
-			name:  "append overflow",
-			input: strings.Repeat("1", bufferSize),
-			ops: []func(*View){
-				func(v *View) {
-					v.Append([]byte("0"))
-				},
-			},
+			name:   "append-overflow",
+			input:  strings.Repeat("1", bufferSize),
 			output: strings.Repeat("1", bufferSize) + "0",
+			op: func(t *testing.T, v *View) {
+				v.Append([]byte("0"))
+			},
 		},
 		{
-			name:  "append multiple buffers",
-			input: strings.Repeat("1", bufferSize-1),
-			ops: []func(*View){
-				func(v *View) {
-					v.Append([]byte(strings.Repeat("0", bufferSize*3)))
-				},
-			},
+			name:   "append-multiple-buffers",
+			input:  strings.Repeat("1", bufferSize-1),
 			output: strings.Repeat("1", bufferSize-1) + strings.Repeat("0", bufferSize*3),
+			op: func(t *testing.T, v *View) {
+				v.Append([]byte(strings.Repeat("0", bufferSize*3)))
+			},
 		},
 
 		// Truncate.
 		{
-			name:  "truncate",
-			input: "hello world",
-			ops: []func(*View){
-				func(v *View) {
-					v.Truncate(5)
-				},
-			},
+			name:   "truncate",
+			input:  "hello world",
 			output: "hello",
+			op: func(t *testing.T, v *View) {
+				v.Truncate(5)
+			},
 		},
 		{
-			name:  "truncate multiple buffers",
-			input: strings.Repeat("1", bufferSize*2),
-			ops: []func(*View){
-				func(v *View) {
-					v.Truncate(bufferSize*2 - 1)
-				},
+			name:   "truncate-noop",
+			input:  "hello world",
+			output: "hello world",
+			op: func(t *testing.T, v *View) {
+				v.Truncate(v.Size() + 1)
 			},
-			output: strings.Repeat("1", bufferSize*2-1),
 		},
 		{
-			name:  "truncate multiple buffers to one buffer",
-			input: strings.Repeat("1", bufferSize*2),
-			ops: []func(*View){
-				func(v *View) {
-					v.Truncate(5)
-				},
+			name:   "truncate-multiple-buffers",
+			input:  strings.Repeat("1", bufferSize*2),
+			output: strings.Repeat("1", bufferSize*2-1),
+			op: func(t *testing.T, v *View) {
+				v.Truncate(bufferSize*2 - 1)
 			},
+		},
+		{
+			name:   "truncate-multiple-buffers-to-one",
+			input:  strings.Repeat("1", bufferSize*2),
 			output: "11111",
+			op: func(t *testing.T, v *View) {
+				v.Truncate(5)
+			},
 		},
 
 		// TrimFront.
 		{
-			name:  "trim",
-			input: "hello world",
-			ops: []func(*View){
-				func(v *View) {
-					v.TrimFront(6)
-				},
-			},
+			name:   "trim",
+			input:  "hello world",
 			output: "world",
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(6)
+			},
 		},
 		{
-			name:  "trim multiple buffers",
-			input: strings.Repeat("1", bufferSize*2),
-			ops: []func(*View){
-				func(v *View) {
-					v.TrimFront(1)
-				},
+			name:   "trim-too-large",
+			input:  "hello world",
+			output: "",
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(v.Size() + 1)
 			},
-			output: strings.Repeat("1", bufferSize*2-1),
 		},
 		{
-			name:  "trim multiple buffers to one buffer",
-			input: strings.Repeat("1", bufferSize*2),
-			ops: []func(*View){
-				func(v *View) {
-					v.TrimFront(bufferSize*2 - 1)
-				},
+			name:   "trim-multiple-buffers",
+			input:  strings.Repeat("1", bufferSize*2),
+			output: strings.Repeat("1", bufferSize*2-1),
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(1)
 			},
+		},
+		{
+			name:   "trim-multiple-buffers-to-one-buffer",
+			input:  strings.Repeat("1", bufferSize*2),
 			output: "1",
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(bufferSize*2 - 1)
+			},
 		},
 
 		// Grow.
 		{
-			name:  "grow",
-			input: "hello world",
-			ops: []func(*View){
-				func(v *View) {
-					v.Grow(1, true)
-				},
-			},
+			name:   "grow",
+			input:  "hello world",
 			output: "hello world",
+			op: func(t *testing.T, v *View) {
+				v.Grow(1, true)
+			},
 		},
 		{
-			name: "grow from zero",
-			ops: []func(*View){
-				func(v *View) {
-					v.Grow(1024, true)
-				},
-			},
+			name:   "grow-from-zero",
 			output: strings.Repeat("\x00", 1024),
+			op: func(t *testing.T, v *View) {
+				v.Grow(1024, true)
+			},
 		},
 		{
-			name:  "grow from non-zero",
-			input: strings.Repeat("1", bufferSize),
-			ops: []func(*View){
-				func(v *View) {
-					v.Grow(bufferSize*2, true)
-				},
-			},
+			name:   "grow-from-non-zero",
+			input:  strings.Repeat("1", bufferSize),
 			output: strings.Repeat("1", bufferSize) + strings.Repeat("\x00", bufferSize),
+			op: func(t *testing.T, v *View) {
+				v.Grow(bufferSize*2, true)
+			},
+		},
+
+		// Copy.
+		{
+			name:   "copy",
+			input:  "hello",
+			output: "hello",
+			op: func(t *testing.T, v *View) {
+				other := v.Copy()
+				bs := other.Flatten()
+				want := []byte("hello")
+				if !bytes.Equal(bs, want) {
+					t.Errorf("expected %v, got %v", want, bs)
+				}
+			},
+		},
+		{
+			name:   "copy-large",
+			input:  strings.Repeat("1", bufferSize+1),
+			output: strings.Repeat("1", bufferSize+1),
+			op: func(t *testing.T, v *View) {
+				other := v.Copy()
+				bs := other.Flatten()
+				want := []byte(strings.Repeat("1", bufferSize+1))
+				if !bytes.Equal(bs, want) {
+					t.Errorf("expected %v, got %v", want, bs)
+				}
+			},
+		},
+
+		// Merge.
+		{
+			name:   "merge",
+			input:  "hello",
+			output: "hello world",
+			op: func(t *testing.T, v *View) {
+				var other View
+				other.Append([]byte(" world"))
+				v.Merge(&other)
+				if sz := other.Size(); sz != 0 {
+					t.Errorf("expected 0, got %d", sz)
+				}
+			},
+		},
+		{
+			name:   "merge-large",
+			input:  strings.Repeat("1", bufferSize+1),
+			output: strings.Repeat("1", bufferSize+1) + strings.Repeat("0", bufferSize+1),
+			op: func(t *testing.T, v *View) {
+				var other View
+				other.Append([]byte(strings.Repeat("0", bufferSize+1)))
+				v.Merge(&other)
+				if sz := other.Size(); sz != 0 {
+					t.Errorf("expected 0, got %d", sz)
+				}
+			},
+		},
+
+		// ReadAt.
+		{
+			name:   "readat",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, 0, 6, "hello", io.EOF) },
+		},
+		{
+			name:   "readat-long",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, 0, 8, "hello", io.EOF) },
+		},
+		{
+			name:   "readat-short",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, 0, 3, "hel", nil) },
+		},
+		{
+			name:   "readat-offset",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, 2, 3, "llo", io.EOF) },
+		},
+		{
+			name:   "readat-long-offset",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, 2, 8, "llo", io.EOF) },
+		},
+		{
+			name:   "readat-short-offset",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, 2, 2, "ll", nil) },
+		},
+		{
+			name:   "readat-skip-all",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, bufferSize+1, 1, "", io.EOF) },
+		},
+		{
+			name:   "readat-second-buffer",
+			input:  strings.Repeat("0", bufferSize+1) + "12",
+			output: strings.Repeat("0", bufferSize+1) + "12",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, bufferSize+1, 1, "1", nil) },
+		},
+		{
+			name:   "readat-second-buffer-end",
+			input:  strings.Repeat("0", bufferSize+1) + "12",
+			output: strings.Repeat("0", bufferSize+1) + "12",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, bufferSize+1, 2, "12", io.EOF) },
 		},
 	}
 
 	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			// Construct the new view.
-			var view View
-			view.Append([]byte(tc.input))
-
-			// Run all operations.
-			for _, op := range tc.ops {
-				op(&view)
-			}
-
-			// Flatten and validate.
-			out := view.Flatten()
-			if !bytes.Equal([]byte(tc.output), out) {
-				t.Errorf("expected %q, got %q", tc.output, string(out))
-			}
-
-			// Ensure the size is correct.
-			if len(out) != int(view.Size()) {
-				t.Errorf("size is wrong: expected %d, got %d", len(out), view.Size())
-			}
-		})
+		for fillName, fn := range fillFuncs {
+			t.Run(fillName+"/"+tc.name, func(t *testing.T) {
+				// Construct & fill the view.
+				var view View
+				fn(&view, []byte(tc.input))
+
+				// Run the operation.
+				if tc.op != nil {
+					tc.op(t, &view)
+				}
+
+				// Flatten and validate.
+				out := view.Flatten()
+				if !bytes.Equal([]byte(tc.output), out) {
+					t.Errorf("expected %q, got %q", tc.output, string(out))
+				}
+
+				// Ensure the size is correct.
+				if len(out) != int(view.Size()) {
+					t.Errorf("size is wrong: expected %d, got %d", len(out), view.Size())
+				}
+
+				// Calculate contents via apply.
+				var appliedOut []byte
+				view.Apply(func(b []byte) {
+					appliedOut = append(appliedOut, b...)
+				})
+				if len(appliedOut) != len(out) {
+					t.Errorf("expected %d, got %d", len(out), len(appliedOut))
+				}
+				if !bytes.Equal(appliedOut, out) {
+					t.Errorf("expected %v, got %v", out, appliedOut)
+				}
+
+				// Calculate contents via ReadToWriter.
+				var b bytes.Buffer
+				n, err := view.ReadToWriter(&b, int64(len(out)))
+				if n != int64(len(out)) {
+					t.Errorf("expected %d, got %d", len(out), n)
+				}
+				if err != nil {
+					t.Errorf("expected nil, got %v", err)
+				}
+				if !bytes.Equal(b.Bytes(), out) {
+					t.Errorf("expected %v, got %v", out, b.Bytes())
+				}
+			})
+		}
 	}
 }
-- 
cgit v1.2.3


From 538e35f61bc293bb2adf8b437afdeab4d27b6e53 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Wed, 11 Mar 2020 19:56:13 -0700
Subject: Fix race condition (*tcp.endpoint).Close

Atomically close the endpoint. Before this change, it was possible for
multiple callers to perform duplicate work.

PiperOrigin-RevId: 300462110
---
 pkg/tcpip/transport/tcp/endpoint.go | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index cf73f5382..5187a5e25 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -825,6 +825,7 @@ func (e *endpoint) Abort() {
 func (e *endpoint) Close() {
 	e.mu.Lock()
 	closed := e.closed
+	e.closed = true
 	e.mu.Unlock()
 	if closed {
 		return
@@ -833,13 +834,7 @@ func (e *endpoint) Close() {
 	// Issue a shutdown so that the peer knows we won't send any more data
 	// if we're connected, or stop accepting if we're listening.
 	e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
-	e.closeNoShutdown()
-}
 
-// closeNoShutdown closes the endpoint without doing a full shutdown. This is
-// used when a connection needs to be aborted with a RST and we want to skip
-// a full 4 way TCP shutdown.
-func (e *endpoint) closeNoShutdown() {
 	e.mu.Lock()
 
 	// For listening sockets, we always release ports inline so that they
@@ -858,8 +853,6 @@ func (e *endpoint) closeNoShutdown() {
 		e.boundPortFlags = ports.Flags{}
 	}
 
-	// Mark endpoint as closed.
-	e.closed = true
 	// Either perform the local cleanup or kick the worker to make sure it
 	// knows it needs to cleanup.
 	switch e.EndpointState() {
-- 
cgit v1.2.3


From ac05043525a058e22a5db422e5f8d344df21fa55 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Wed, 11 Mar 2020 20:37:09 -0700
Subject: Implement heap.Interface on pointer receiver

PiperOrigin-RevId: 300467253
---
 pkg/tcpip/transport/tcp/segment_heap.go | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/segment_heap.go b/pkg/tcpip/transport/tcp/segment_heap.go
index e28f213ba..8d3ddce4b 100644
--- a/pkg/tcpip/transport/tcp/segment_heap.go
+++ b/pkg/tcpip/transport/tcp/segment_heap.go
@@ -14,21 +14,25 @@
 
 package tcp
 
+import "container/heap"
+
 type segmentHeap []*segment
 
+var _ heap.Interface = (*segmentHeap)(nil)
+
 // Len returns the length of h.
-func (h segmentHeap) Len() int {
-	return len(h)
+func (h *segmentHeap) Len() int {
+	return len(*h)
 }
 
 // Less determines whether the i-th element of h is less than the j-th element.
-func (h segmentHeap) Less(i, j int) bool {
-	return h[i].sequenceNumber.LessThan(h[j].sequenceNumber)
+func (h *segmentHeap) Less(i, j int) bool {
+	return (*h)[i].sequenceNumber.LessThan((*h)[j].sequenceNumber)
 }
 
 // Swap swaps the i-th and j-th elements of h.
-func (h segmentHeap) Swap(i, j int) {
-	h[i], h[j] = h[j], h[i]
+func (h *segmentHeap) Swap(i, j int) {
+	(*h)[i], (*h)[j] = (*h)[j], (*h)[i]
 }
 
 // Push adds x as the last element of h.
-- 
cgit v1.2.3


From 035f7434e978f3f246ae05e9c748e8ca7d8d7fd1 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Wed, 11 Mar 2020 21:12:41 -0700
Subject: Use a heap in transport demuxer

...instead of sorting at various times. Plug a memory leak by setting
removed elements to nil.

PiperOrigin-RevId: 300471087
---
 pkg/tcpip/stack/transport_demuxer.go      | 155 +++++++++++++++---------------
 pkg/tcpip/stack/transport_demuxer_test.go |  14 ++-
 2 files changed, 89 insertions(+), 80 deletions(-)

diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index 778c0a4d6..ff1845bfb 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -15,9 +15,9 @@
 package stack
 
 import (
+	"container/heap"
 	"fmt"
 	"math/rand"
-	"sort"
 
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -141,16 +141,17 @@ func (epsByNic *endpointsByNic) registerEndpoint(d *transportDemuxer, netProto t
 	epsByNic.mu.Lock()
 	defer epsByNic.mu.Unlock()
 
-	if multiPortEp, ok := epsByNic.endpoints[bindToDevice]; ok {
-		// There was already a bind.
-		return multiPortEp.singleRegisterEndpoint(t, reusePort)
+	multiPortEp, ok := epsByNic.endpoints[bindToDevice]
+	if !ok {
+		multiPortEp = &multiPortEndpoint{
+			demux:      d,
+			netProto:   netProto,
+			transProto: transProto,
+			reuse:      reusePort,
+		}
+		epsByNic.endpoints[bindToDevice] = multiPortEp
 	}
 
-	// This is a new binding.
-	multiPortEp := &multiPortEndpoint{demux: d, netProto: netProto, transProto: transProto}
-	multiPortEp.endpointsMap = make(map[TransportEndpoint]int)
-	multiPortEp.reuse = reusePort
-	epsByNic.endpoints[bindToDevice] = multiPortEp
 	return multiPortEp.singleRegisterEndpoint(t, reusePort)
 }
 
@@ -222,6 +223,35 @@ func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNum
 	return nil
 }
 
+type transportEndpointHeap []TransportEndpoint
+
+var _ heap.Interface = (*transportEndpointHeap)(nil)
+
+func (h *transportEndpointHeap) Len() int {
+	return len(*h)
+}
+
+func (h *transportEndpointHeap) Less(i, j int) bool {
+	return (*h)[i].UniqueID() < (*h)[j].UniqueID()
+}
+
+func (h *transportEndpointHeap) Swap(i, j int) {
+	(*h)[i], (*h)[j] = (*h)[j], (*h)[i]
+}
+
+func (h *transportEndpointHeap) Push(x interface{}) {
+	*h = append(*h, x.(TransportEndpoint))
+}
+
+func (h *transportEndpointHeap) Pop() interface{} {
+	old := *h
+	n := len(old)
+	x := old[n-1]
+	old[n-1] = nil
+	*h = old[:n-1]
+	return x
+}
+
 // multiPortEndpoint is a container for TransportEndpoints which are bound to
 // the same pair of address and port. endpointsArr always has at least one
 // element.
@@ -237,15 +267,14 @@ type multiPortEndpoint struct {
 	netProto   tcpip.NetworkProtocolNumber
 	transProto tcpip.TransportProtocolNumber
 
-	endpointsArr []TransportEndpoint
-	endpointsMap map[TransportEndpoint]int
+	endpoints transportEndpointHeap
 	// reuse indicates if more than one endpoint is allowed.
 	reuse bool
 }
 
 func (ep *multiPortEndpoint) transportEndpoints() []TransportEndpoint {
 	ep.mu.RLock()
-	eps := append([]TransportEndpoint(nil), ep.endpointsArr...)
+	eps := append([]TransportEndpoint(nil), ep.endpoints...)
 	ep.mu.RUnlock()
 	return eps
 }
@@ -262,8 +291,8 @@ func reciprocalScale(val, n uint32) uint32 {
 // ports then uses it to select a socket. In this case, all packets from one
 // address will be sent to same endpoint.
 func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32) TransportEndpoint {
-	if len(mpep.endpointsArr) == 1 {
-		return mpep.endpointsArr[0]
+	if len(mpep.endpoints) == 1 {
+		return mpep.endpoints[0]
 	}
 
 	payload := []byte{
@@ -279,29 +308,26 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32
 	h.Write([]byte(id.RemoteAddress))
 	hash := h.Sum32()
 
-	idx := reciprocalScale(hash, uint32(len(mpep.endpointsArr)))
-	return mpep.endpointsArr[idx]
+	idx := reciprocalScale(hash, uint32(len(mpep.endpoints)))
+	return mpep.endpoints[idx]
 }
 
 func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
 	ep.mu.RLock()
 	queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}]
-	for i, endpoint := range ep.endpointsArr {
-		// HandlePacket takes ownership of pkt, so each endpoint needs
-		// its own copy except for the final one.
-		if i == len(ep.endpointsArr)-1 {
-			if mustQueue {
-				queuedProtocol.QueuePacket(r, endpoint, id, pkt)
-				break
-			}
-			endpoint.HandlePacket(r, id, pkt)
-			break
-		}
+	// HandlePacket takes ownership of pkt, so each endpoint needs
+	// its own copy except for the final one.
+	for _, endpoint := range ep.endpoints[:len(ep.endpoints)-1] {
 		if mustQueue {
 			queuedProtocol.QueuePacket(r, endpoint, id, pkt.Clone())
-			continue
+		} else {
+			endpoint.HandlePacket(r, id, pkt.Clone())
 		}
-		endpoint.HandlePacket(r, id, pkt.Clone())
+	}
+	if endpoint := ep.endpoints[len(ep.endpoints)-1]; mustQueue {
+		queuedProtocol.QueuePacket(r, endpoint, id, pkt)
+	} else {
+		endpoint.HandlePacket(r, id, pkt)
 	}
 	ep.mu.RUnlock() // Don't use defer for performance reasons.
 }
@@ -312,26 +338,15 @@ func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, reusePo
 	ep.mu.Lock()
 	defer ep.mu.Unlock()
 
-	if len(ep.endpointsArr) > 0 {
+	if len(ep.endpoints) != 0 {
 		// If it was previously bound, we need to check if we can bind again.
 		if !ep.reuse || !reusePort {
 			return tcpip.ErrPortInUse
 		}
 	}
 
-	// A new endpoint is added into endpointsArr and its index there is saved in
-	// endpointsMap. This will allow us to remove endpoint from the array fast.
-	ep.endpointsMap[t] = len(ep.endpointsArr)
-	ep.endpointsArr = append(ep.endpointsArr, t)
+	heap.Push(&ep.endpoints, t)
 
-	// ep.endpointsArr is sorted by endpoint unique IDs, so that endpoints
-	// can be restored in the same order.
-	sort.Slice(ep.endpointsArr, func(i, j int) bool {
-		return ep.endpointsArr[i].UniqueID() < ep.endpointsArr[j].UniqueID()
-	})
-	for i, e := range ep.endpointsArr {
-		ep.endpointsMap[e] = i
-	}
 	return nil
 }
 
@@ -340,21 +355,13 @@ func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint) bool {
 	ep.mu.Lock()
 	defer ep.mu.Unlock()
 
-	idx, ok := ep.endpointsMap[t]
-	if !ok {
-		return false
-	}
-	delete(ep.endpointsMap, t)
-	l := len(ep.endpointsArr)
-	if l > 1 {
-		// The last endpoint in endpointsArr is moved instead of the deleted one.
-		lastEp := ep.endpointsArr[l-1]
-		ep.endpointsArr[idx] = lastEp
-		ep.endpointsMap[lastEp] = idx
-		ep.endpointsArr = ep.endpointsArr[0 : l-1]
-		return false
+	for i, endpoint := range ep.endpoints {
+		if endpoint == t {
+			heap.Remove(&ep.endpoints, i)
+			break
+		}
 	}
-	return true
+	return len(ep.endpoints) == 0
 }
 
 func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
@@ -371,17 +378,14 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
 	eps.mu.Lock()
 	defer eps.mu.Unlock()
 
-	if epsByNic, ok := eps.endpoints[id]; ok {
-		// There was already a binding.
-		return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
-	}
-
-	// This is a new binding.
-	epsByNic := &endpointsByNic{
-		endpoints: make(map[tcpip.NICID]*multiPortEndpoint),
-		seed:      rand.Uint32(),
+	epsByNic, ok := eps.endpoints[id]
+	if !ok {
+		epsByNic = &endpointsByNic{
+			endpoints: make(map[tcpip.NICID]*multiPortEndpoint),
+			seed:      rand.Uint32(),
+		}
+		eps.endpoints[id] = epsByNic
 	}
-	eps.endpoints[id] = epsByNic
 
 	return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
 }
@@ -396,14 +400,6 @@ func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolN
 	}
 }
 
-var loopbackSubnet = func() tcpip.Subnet {
-	sn, err := tcpip.NewSubnet("\x7f\x00\x00\x00", "\xff\x00\x00\x00")
-	if err != nil {
-		panic(err)
-	}
-	return sn
-}()
-
 // deliverPacket attempts to find one or more matching transport endpoints, and
 // then, if matches are found, delivers the packet to them. Returns true if
 // the packet no longer needs to be handled.
@@ -601,8 +597,8 @@ func (d *transportDemuxer) registerRawEndpoint(netProto tcpip.NetworkProtocolNum
 	}
 
 	eps.mu.Lock()
-	defer eps.mu.Unlock()
 	eps.rawEndpoints = append(eps.rawEndpoints, ep)
+	eps.mu.Unlock()
 
 	return nil
 }
@@ -616,13 +612,16 @@ func (d *transportDemuxer) unregisterRawEndpoint(netProto tcpip.NetworkProtocolN
 	}
 
 	eps.mu.Lock()
-	defer eps.mu.Unlock()
 	for i, rawEP := range eps.rawEndpoints {
 		if rawEP == ep {
-			eps.rawEndpoints = append(eps.rawEndpoints[:i], eps.rawEndpoints[i+1:]...)
-			return
+			lastIdx := len(eps.rawEndpoints) - 1
+			eps.rawEndpoints[i] = eps.rawEndpoints[lastIdx]
+			eps.rawEndpoints[lastIdx] = nil
+			eps.rawEndpoints = eps.rawEndpoints[:lastIdx]
+			break
 		}
 	}
+	eps.mu.Unlock()
 }
 
 func isMulticastOrBroadcast(addr tcpip.Address) bool {
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 5e9237de9..0e3e239c5 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -167,8 +167,18 @@ func TestTransportDemuxerRegister(t *testing.T) {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
 				NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
-				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}})
-			if got, want := s.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{test.proto}, udp.ProtocolNumber, stack.TransportEndpointID{}, nil, false, 0), test.want; got != want {
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+			var wq waiter.Queue
+			ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
+			if err != nil {
+				t.Fatal(err)
+			}
+			tEP, ok := ep.(stack.TransportEndpoint)
+			if !ok {
+				t.Fatalf("%T does not implement stack.TransportEndpoint", ep)
+			}
+			if got, want := s.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{test.proto}, udp.ProtocolNumber, stack.TransportEndpointID{}, tEP, false, 0), test.want; got != want {
 				t.Fatalf("s.RegisterTransportEndpoint(...) = %v, want %v", got, want)
 			}
 		})
-- 
cgit v1.2.3


From 7df936f359766618470ae31a7cbf1b761bd19b59 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Wed, 4 Mar 2020 04:20:36 -0500
Subject: passed the syscall test case 'alarm' on Arm64 platform

This issue was caused by 'restart_syscall'.
The value of Register R0 should be stored after finishing sysemu.
So that we can restore the value and restart syscall.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/arch/arch_aarch64.go   |  4 ++++
 pkg/sentry/arch/syscalls_arm64.go | 10 +++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index 5053393c1..01940bca4 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -97,6 +97,9 @@ type State struct {
 
 	// FeatureSet is a pointer to the currently active feature set.
 	FeatureSet *cpuid.FeatureSet
+
+	// OrigR0 stores the value of register R0.
+	OrigR0 uint64
 }
 
 // Proto returns a protobuf representation of the system registers in State.
@@ -146,6 +149,7 @@ func (s *State) Fork() State {
 		Regs:           s.Regs,
 		aarch64FPState: s.aarch64FPState.fork(),
 		FeatureSet:     s.FeatureSet,
+		OrigR0:         s.OrigR0,
 	}
 }
 
diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go
index 00d5ef461..dc13b6124 100644
--- a/pkg/sentry/arch/syscalls_arm64.go
+++ b/pkg/sentry/arch/syscalls_arm64.go
@@ -50,13 +50,21 @@ func (c *context64) SyscallArgs() SyscallArguments {
 }
 
 // RestartSyscall implements Context.RestartSyscall.
+// Prepare for system call restart, OrigR0 will be restored to R0.
+// Please see the linux code as reference:
+// arch/arm64/kernel/signal.c:do_signal()
 func (c *context64) RestartSyscall() {
 	c.Regs.Pc -= SyscallWidth
-	c.Regs.Regs[8] = uint64(restartSyscallNr)
+	// R0 will be backed up into OrigR0 when entering doSyscall().
+	// Please see the linux code as reference:
+	// arch/arm64/kernel/syscall.c:el0_svc_common().
+	// Here we restore it back.
+	c.Regs.Regs[0] = uint64(c.OrigR0)
 }
 
 // RestartSyscallWithRestartBlock implements Context.RestartSyscallWithRestartBlock.
 func (c *context64) RestartSyscallWithRestartBlock() {
 	c.Regs.Pc -= SyscallWidth
+	c.Regs.Regs[0] = uint64(c.OrigR0)
 	c.Regs.Regs[8] = uint64(restartSyscallNr)
 }
-- 
cgit v1.2.3


From f2e4b5ab932a3816e4957171b303db645fd04a94 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 12 Mar 2020 12:31:16 -0700
Subject: Kill sandbox process when parent process terminates

When the sandbox runs in attached more, e.g. runsc do, runsc run, the
sandbox lifetime is controlled by the parent process. This wasn't working
in all cases because PR_GET_PDEATHSIG doesn't propagate through execve
when the process changes uid/gid. So it was getting dropped when the
sandbox execve's to change to user nobody.

PiperOrigin-RevId: 300601247
---
 runsc/cmd/boot.go                 |  75 +++++++++++++-------
 runsc/container/container_test.go |  17 -----
 runsc/sandbox/sandbox.go          |  12 ++--
 runsc/specutils/namespace.go      |   3 +
 runsc/testutil/testutil.go        |  23 +++---
 test/root/BUILD                   |   3 +
 test/root/runsc_test.go           | 146 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 221 insertions(+), 58 deletions(-)
 create mode 100644 test/root/runsc_test.go

diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index 0f3da69a0..0938944a6 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -23,6 +23,7 @@ import (
 
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
@@ -82,8 +83,13 @@ type Boot struct {
 	// sandbox (e.g. gofer) and sent through this FD.
 	mountsFD int
 
-	// pidns is set if the sanadbox is in its own pid namespace.
+	// pidns is set if the sandbox is in its own pid namespace.
 	pidns bool
+
+	// attached is set to true to kill the sandbox process when the parent process
+	// terminates. This flag is set when the command execve's itself because
+	// parent death signal doesn't propagate through execve when uid/gid changes.
+	attached bool
 }
 
 // Name implements subcommands.Command.Name.
@@ -118,6 +124,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) {
 	f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.")
 	f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup")
 	f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).")
+	f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates")
 }
 
 // Execute implements subcommands.Command.Execute.  It starts a sandbox in a
@@ -133,29 +140,32 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 
 	conf := args[0].(*boot.Config)
 
+	if b.attached {
+		// Ensure this process is killed after parent process terminates when
+		// attached mode is enabled. In the unfortunate event that the parent
+		// terminates before this point, this process leaks.
+		if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
+			Fatalf("error setting parent death signal: %v", err)
+		}
+	}
+
 	if b.setUpRoot {
 		if err := setUpChroot(b.pidns); err != nil {
 			Fatalf("error setting up chroot: %v", err)
 		}
 
-		if !b.applyCaps {
-			// Remove --setup-root arg to call myself.
-			var args []string
-			for _, arg := range os.Args {
-				if !strings.Contains(arg, "setup-root") {
-					args = append(args, arg)
-				}
-			}
-			if !conf.Rootless {
-				// Note that we've already read the spec from the spec FD, and
-				// we will read it again after the exec call. This works
-				// because the ReadSpecFromFile function seeks to the beginning
-				// of the file before reading.
-				if err := callSelfAsNobody(args); err != nil {
-					Fatalf("%v", err)
-				}
-				panic("callSelfAsNobody must never return success")
+		if !b.applyCaps && !conf.Rootless {
+			// Remove --apply-caps arg to call myself. It has already been done.
+			args := prepareArgs(b.attached, "setup-root")
+
+			// Note that we've already read the spec from the spec FD, and
+			// we will read it again after the exec call. This works
+			// because the ReadSpecFromFile function seeks to the beginning
+			// of the file before reading.
+			if err := callSelfAsNobody(args); err != nil {
+				Fatalf("%v", err)
 			}
+			panic("callSelfAsNobody must never return success")
 		}
 	}
 
@@ -181,13 +191,9 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 			caps.Permitted = append(caps.Permitted, c)
 		}
 
-		// Remove --apply-caps arg to call myself.
-		var args []string
-		for _, arg := range os.Args {
-			if !strings.Contains(arg, "setup-root") && !strings.Contains(arg, "apply-caps") {
-				args = append(args, arg)
-			}
-		}
+		// Remove --apply-caps and --setup-root arg to call myself. Both have
+		// already been done.
+		args := prepareArgs(b.attached, "setup-root", "apply-caps")
 
 		// Note that we've already read the spec from the spec FD, and
 		// we will read it again after the exec call. This works
@@ -258,3 +264,22 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	l.Destroy()
 	return subcommands.ExitSuccess
 }
+
+func prepareArgs(attached bool, exclude ...string) []string {
+	var args []string
+	for _, arg := range os.Args {
+		for _, excl := range exclude {
+			if strings.Contains(arg, excl) {
+				goto skip
+			}
+		}
+		args = append(args, arg)
+		if attached && arg == "boot" {
+			// Strategicaly place "--attached" after the command. This is needed
+			// to ensure the new process is killed when the parent process terminates.
+			args = append(args, "--attached")
+		}
+	skip:
+	}
+	return args
+}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index c7eea85b3..442e80ac0 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -124,23 +124,6 @@ func procListsEqual(got, want []*control.Process) (bool, error) {
 	return true, nil
 }
 
-// getAndCheckProcLists is similar to waitForProcessList, but does not wait and retry the
-// test for equality. This is because we already confirmed that exec occurred.
-func getAndCheckProcLists(cont *Container, want []*control.Process) error {
-	got, err := cont.Processes()
-	if err != nil {
-		return fmt.Errorf("error getting process data from container: %v", err)
-	}
-	equal, err := procListsEqual(got, want)
-	if err != nil {
-		return err
-	}
-	if equal {
-		return nil
-	}
-	return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
-}
-
 func procListToString(pl []*control.Process) string {
 	strs := make([]string, 0, len(pl))
 	for _, p := range pl {
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 6177d6aa7..8de75ae57 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -701,6 +701,13 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		nextFD++
 	}
 
+	if args.Attached {
+		// Kill sandbox if parent process exits in attached mode.
+		cmd.SysProcAttr.Pdeathsig = syscall.SIGKILL
+		// Tells boot that any process it creates must have pdeathsig set.
+		cmd.Args = append(cmd.Args, "--attached")
+	}
+
 	// Add container as the last argument.
 	cmd.Args = append(cmd.Args, s.ID)
 
@@ -709,11 +716,6 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		log.Debugf("Donating FD %d: %q", i+3, f.Name())
 	}
 
-	if args.Attached {
-		// Kill sandbox if parent process exits in attached mode.
-		cmd.SysProcAttr.Pdeathsig = syscall.SIGKILL
-	}
-
 	log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args)
 	log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr)
 	if err := specutils.StartInNS(cmd, nss); err != nil {
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
index c7dd3051c..60bb7b7ee 100644
--- a/runsc/specutils/namespace.go
+++ b/runsc/specutils/namespace.go
@@ -252,6 +252,9 @@ func MaybeRunAsRoot() error {
 		},
 		Credential:                 &syscall.Credential{Uid: 0, Gid: 0},
 		GidMappingsEnableSetgroups: false,
+
+		// Make sure child is killed when the parent terminates.
+		Pdeathsig: syscall.SIGKILL,
 	}
 
 	cmd.Env = os.Environ()
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index 92d677e71..51e487715 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -87,18 +87,19 @@ func TestConfig() *boot.Config {
 		logDir = dir + "/"
 	}
 	return &boot.Config{
-		Debug:           true,
-		DebugLog:        logDir,
-		LogFormat:       "text",
-		DebugLogFormat:  "text",
-		AlsoLogToStderr: true,
-		LogPackets:      true,
-		Network:         boot.NetworkNone,
-		Strace:          true,
-		Platform:        "ptrace",
-		FileAccess:      boot.FileAccessExclusive,
+		Debug:              true,
+		DebugLog:           logDir,
+		LogFormat:          "text",
+		DebugLogFormat:     "text",
+		AlsoLogToStderr:    true,
+		LogPackets:         true,
+		Network:            boot.NetworkNone,
+		Strace:             true,
+		Platform:           "ptrace",
+		FileAccess:         boot.FileAccessExclusive,
+		NumNetworkChannels: 1,
+
 		TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
-		NumNetworkChannels:                         1,
 	}
 }
 
diff --git a/test/root/BUILD b/test/root/BUILD
index 23ce2a70f..ddc9b4955 100644
--- a/test/root/BUILD
+++ b/test/root/BUILD
@@ -16,6 +16,7 @@ go_test(
         "crictl_test.go",
         "main_test.go",
         "oom_score_adj_test.go",
+        "runsc_test.go",
     ],
     data = [
         "//runsc",
@@ -37,7 +38,9 @@ go_test(
         "//runsc/specutils",
         "//runsc/testutil",
         "//test/root/testdata",
+        "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
         "@com_github_syndtr_gocapability//capability:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/test/root/runsc_test.go b/test/root/runsc_test.go
new file mode 100644
index 000000000..28bb60a12
--- /dev/null
+++ b/test/root/runsc_test.go
@@ -0,0 +1,146 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package root
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/cenkalti/backoff"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/runsc/specutils"
+	"gvisor.dev/gvisor/runsc/testutil"
+)
+
+// TestDoKill checks that when "runsc do..." is killed, the sandbox process is
+// also terminated. This ensures that parent death signal is propagate to the
+// sandbox process correctly.
+func TestDoKill(t *testing.T) {
+	// Make the sandbox process be reparented here when it's killed, so we can
+	// wait for it.
+	if err := unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0); err != nil {
+		t.Fatalf("prctl(PR_SET_CHILD_SUBREAPER): %v", err)
+	}
+
+	cmd := exec.Command(specutils.ExePath, "do", "sleep", "10000")
+	buf := &bytes.Buffer{}
+	cmd.Stdout = buf
+	cmd.Stderr = buf
+	cmd.Start()
+
+	var pid int
+	findSandbox := func() error {
+		var err error
+		pid, err = sandboxPid(cmd.Process.Pid)
+		if err != nil {
+			return &backoff.PermanentError{Err: err}
+		}
+		if pid == 0 {
+			return fmt.Errorf("sandbox process not found")
+		}
+		return nil
+	}
+	if err := testutil.Poll(findSandbox, 10*time.Second); err != nil {
+		t.Fatalf("failed to find sandbox: %v", err)
+	}
+	t.Logf("Found sandbox, pid: %d", pid)
+
+	if err := cmd.Process.Kill(); err != nil {
+		t.Fatalf("failed to kill run process: %v", err)
+	}
+	cmd.Wait()
+	t.Logf("Parent process killed (%d). Output: %s", cmd.Process.Pid, buf.String())
+
+	ch := make(chan struct{})
+	go func() {
+		defer func() { ch <- struct{}{} }()
+		t.Logf("Waiting for sandbox process (%d) termination", pid)
+		if _, err := unix.Wait4(pid, nil, 0, nil); err != nil {
+			t.Errorf("error waiting for sandbox process (%d): %v", pid, err)
+		}
+	}()
+	select {
+	case <-ch:
+		// Done
+	case <-time.After(5 * time.Second):
+		t.Fatalf("timeout waiting for sandbox process (%d) to exit", pid)
+	}
+}
+
+// sandboxPid looks for the sandbox process inside the process tree starting
+// from "pid". It returns 0 and no error if no sandbox process is found. It
+// returns error if anything failed.
+func sandboxPid(pid int) (int, error) {
+	cmd := exec.Command("pgrep", "-P", strconv.Itoa(pid))
+	buf := &bytes.Buffer{}
+	cmd.Stdout = buf
+	if err := cmd.Start(); err != nil {
+		return 0, err
+	}
+	ps, err := cmd.Process.Wait()
+	if err != nil {
+		return 0, err
+	}
+	if ps.ExitCode() == 1 {
+		// pgrep returns 1 when no process is found.
+		return 0, nil
+	}
+
+	var children []int
+	for _, line := range strings.Split(buf.String(), "\n") {
+		if len(line) == 0 {
+			continue
+		}
+		child, err := strconv.Atoi(line)
+		if err != nil {
+			return 0, err
+		}
+
+		cmdline, err := ioutil.ReadFile(filepath.Join("/proc", line, "cmdline"))
+		if err != nil {
+			return 0, err
+		}
+		args := strings.SplitN(string(cmdline), "\x00", 2)
+		if len(args) == 0 {
+			return 0, fmt.Errorf("malformed cmdline file: %q", cmdline)
+		}
+		// The sandbox process has the first argument set to "runsc-sandbox".
+		if args[0] == "runsc-sandbox" {
+			return child, nil
+		}
+
+		children = append(children, child)
+	}
+
+	// Sandbox process wasn't found, try another level down.
+	for _, pid := range children {
+		sand, err := sandboxPid(pid)
+		if err != nil {
+			return 0, err
+		}
+		if sand != 0 {
+			return sand, nil
+		}
+		// Not found, continue the search.
+	}
+	return 0, nil
+}
-- 
cgit v1.2.3


From 919664600d8d3e06e90237bd7391171e9d3ce27d Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Thu, 12 Mar 2020 13:10:27 -0700
Subject: Mark gonet_test as flaky.

Mark /pkg/tcpip/adapters/gonet/gonet_test as flaky.

PiperOrigin-RevId: 300609529
---
 pkg/tcpip/adapters/gonet/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD
index a984f1712..e57d45f2a 100644
--- a/pkg/tcpip/adapters/gonet/BUILD
+++ b/pkg/tcpip/adapters/gonet/BUILD
@@ -22,6 +22,7 @@ go_test(
     size = "small",
     srcs = ["gonet_test.go"],
     library = ":gonet",
+    tags = ["flaky"],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/header",
-- 
cgit v1.2.3


From bbf86003bfd2a7547744b89c72e1cd06e9385e66 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Thu, 12 Mar 2020 14:34:16 -0700
Subject: Remove flaky network namespace test that uses clone().

PiperOrigin-RevId: 300626011
---
 test/syscalls/linux/BUILD                |  3 +-
 test/syscalls/linux/network_namespace.cc | 87 ++++----------------------------
 2 files changed, 10 insertions(+), 80 deletions(-)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 43455f1a3..636e5db12 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3707,11 +3707,10 @@ cc_binary(
         ":socket_test_util",
         gtest,
         "//test/util:capability_util",
-        "//test/util:memory_util",
+        "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
     ],
 )
 
diff --git a/test/syscalls/linux/network_namespace.cc b/test/syscalls/linux/network_namespace.cc
index 6ea48c263..133fdecf0 100644
--- a/test/syscalls/linux/network_namespace.cc
+++ b/test/syscalls/linux/network_namespace.cc
@@ -20,102 +20,33 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "absl/synchronization/notification.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/capability_util.h"
-#include "test/util/memory_util.h"
+#include "test/util/posix_error.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
 namespace gvisor {
 namespace testing {
-
 namespace {
 
-using TestFunc = std::function<PosixError()>;
-using RunFunc = std::function<PosixError(TestFunc)>;
-
-struct NamespaceStrategy {
-  RunFunc run;
-
-  static NamespaceStrategy Of(RunFunc run) {
-    NamespaceStrategy s;
-    s.run = run;
-    return s;
-  }
-};
+TEST(NetworkNamespaceTest, LoopbackExists) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
 
-PosixError RunWithUnshare(TestFunc fn) {
-  PosixError err = PosixError(-1, "function did not return a value");
   ScopedThread t([&] {
-    if (unshare(CLONE_NEWNET) != 0) {
-      err = PosixError(errno);
-      return;
-    }
-    err = fn();
-  });
-  t.Join();
-  return err;
-}
+    ASSERT_THAT(unshare(CLONE_NEWNET), SyscallSucceedsWithValue(0));
 
-PosixError RunWithClone(TestFunc fn) {
-  struct Args {
-    absl::Notification n;
-    TestFunc fn;
-    PosixError err;
-  };
-  Args args;
-  args.fn = fn;
-  args.err = PosixError(-1, "function did not return a value");
-
-  ASSIGN_OR_RETURN_ERRNO(
-      Mapping child_stack,
-      MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
-  pid_t child = clone(
-      +[](void *arg) {
-        Args *args = reinterpret_cast<Args *>(arg);
-        args->err = args->fn();
-        args->n.Notify();
-        syscall(SYS_exit, 0);  // Exit manually. No return address on stack.
-        return 0;
-      },
-      reinterpret_cast<void *>(child_stack.addr() + kPageSize),
-      CLONE_NEWNET | CLONE_THREAD | CLONE_SIGHAND | CLONE_VM, &args);
-  if (child < 0) {
-    return PosixError(errno, "clone() failed");
-  }
-  args.n.WaitForNotification();
-  return args.err;
-}
-
-class NetworkNamespaceTest
-    : public ::testing::TestWithParam<NamespaceStrategy> {};
-
-TEST_P(NetworkNamespaceTest, LoopbackExists) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
-
-  EXPECT_NO_ERRNO(GetParam().run([]() {
     // TODO(gvisor.dev/issue/1833): Update this to test that only "lo" exists.
     // Check loopback device exists.
     int sock = socket(AF_INET, SOCK_DGRAM, 0);
-    if (sock < 0) {
-      return PosixError(errno, "socket() failed");
-    }
+    ASSERT_THAT(sock, SyscallSucceeds());
     struct ifreq ifr;
-    snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
-    if (ioctl(sock, SIOCGIFINDEX, &ifr) < 0) {
-      return PosixError(errno, "ioctl() failed, lo cannot be found");
-    }
-    return NoError();
-  }));
+    strncpy(ifr.ifr_name, "lo", IFNAMSIZ);
+    EXPECT_THAT(ioctl(sock, SIOCGIFINDEX, &ifr), SyscallSucceeds())
+        << "lo cannot be found";
+  });
 }
 
-INSTANTIATE_TEST_SUITE_P(
-    AllNetworkNamespaceTest, NetworkNamespaceTest,
-    ::testing::Values(NamespaceStrategy::Of(RunWithUnshare),
-                      NamespaceStrategy::Of(RunWithClone)));
-
 }  // namespace
-
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From f693e1334b6fd0bea26fad770dfec3aa7e03c59a Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Thu, 12 Mar 2020 18:38:41 -0700
Subject: Clarify comments about IHL in ipv4.go.

PiperOrigin-RevId: 300668506
---
 pkg/tcpip/header/ipv4.go | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go
index e5360e7c1..76839eb92 100644
--- a/pkg/tcpip/header/ipv4.go
+++ b/pkg/tcpip/header/ipv4.go
@@ -38,7 +38,8 @@ const (
 // IPv4Fields contains the fields of an IPv4 packet. It is used to describe the
 // fields of a packet that needs to be encoded.
 type IPv4Fields struct {
-	// IHL is the "internet header length" field of an IPv4 packet.
+	// IHL is the "internet header length" field of an IPv4 packet. The value
+	// is in bytes.
 	IHL uint8
 
 	// TOS is the "type of service" field of an IPv4 packet.
@@ -138,7 +139,7 @@ func IPVersion(b []byte) int {
 }
 
 // HeaderLength returns the value of the "header length" field of the ipv4
-// header.
+// header. The length returned is in bytes.
 func (b IPv4) HeaderLength() uint8 {
 	return (b[versIHL] & 0xf) * 4
 }
-- 
cgit v1.2.3


From 333b74dc288357e192dbd86f6d0732be5ea7df64 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 13 Mar 2020 03:02:26 +0000
Subject: Enable syscall seccomp test on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: Ibc926c917d98b31fc92bbf8d82d6818c39b0f93c
---
 test/syscalls/linux/seccomp.cc | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 8e0fc9acc..06cc6a64e 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -72,8 +72,15 @@ void ApplySeccompFilter(uint32_t sysno, uint32_t filtered_result,
   struct sock_filter filter[] = {
       // A = seccomp_data.arch
       BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 4),
+#if defined(__x86_64__)
       // if (A != AUDIT_ARCH_X86_64) goto kill
       BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 4),
+#elif defined(__aarch64__)
+      // if (A != AUDIT_ARCH_AARCH64) goto kill
+      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_AARCH64, 0, 4),
+#else
+#error "Unknown architecture"
+#endif
       // A = seccomp_data.nr
       BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0),
       // if (A != sysno) goto allow
@@ -179,9 +186,12 @@ TEST(SeccompTest, RetTrapCausesSIGSYS) {
           TEST_CHECK(info->si_errno == kTrapValue);
           TEST_CHECK(info->si_call_addr != nullptr);
           TEST_CHECK(info->si_syscall == kFilteredSyscall);
-#ifdef __x86_64__
+#if defined(__x86_64__)
           TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64);
           TEST_CHECK(uc->uc_mcontext.gregs[REG_RAX] == kFilteredSyscall);
+#elif defined(__aarch64__)
+          TEST_CHECK(info->si_arch == AUDIT_ARCH_AARCH64);
+          TEST_CHECK(uc->uc_mcontext.regs[8] == kFilteredSyscall);
 #endif  // defined(__x86_64__)
           _exit(0);
         });
-- 
cgit v1.2.3


From 8f8f16efafd48da3c5e4db329a90bb76620b2324 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 13 Mar 2020 08:56:47 -0700
Subject: Add support for mount flags

Plumbs MS_NOEXEC and MS_RDONLY. Others are TODO.

Updates #1623 #1193

PiperOrigin-RevId: 300764669
---
 pkg/sentry/fsbridge/vfs.go |  2 --
 pkg/sentry/vfs/mount.go    | 42 ++++++++++++++++++++++--------------------
 pkg/sentry/vfs/options.go  | 16 +++++++++++++++-
 pkg/sentry/vfs/vfs.go      |  5 +++++
 4 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go
index 6aa17bfc1..79b808359 100644
--- a/pkg/sentry/fsbridge/vfs.go
+++ b/pkg/sentry/fsbridge/vfs.go
@@ -115,8 +115,6 @@ func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry)
 //
 // remainingTraversals is not configurable in VFS2, all callers are using the
 // default anyways.
-//
-// TODO(gvisor.dev/issue/1623): Check mount has read and exec permission.
 func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
 	vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem()
 	creds := auth.CredentialsFromContext(ctx)
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 31a4e5480..05f6233f9 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -74,6 +74,10 @@ type Mount struct {
 	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
 	umounted bool
 
+	// flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
+	// for MS_RDONLY which is tracked in "writers".
+	flags MountFlags
+
 	// The lower 63 bits of writers is the number of calls to
 	// Mount.CheckBeginWrite() that have not yet been paired with a call to
 	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
@@ -81,6 +85,21 @@ type Mount struct {
 	writers int64
 }
 
+func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
+	mnt := &Mount{
+		vfs:   vfs,
+		fs:    fs,
+		root:  root,
+		flags: opts.Flags,
+		ns:    mntns,
+		refs:  1,
+	}
+	if opts.ReadOnly {
+		mnt.setReadOnlyLocked(true)
+	}
+	return mnt
+}
+
 // A MountNamespace is a collection of Mounts.
 //
 // MountNamespaces are reference-counted. Unless otherwise specified, all
@@ -129,13 +148,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 		refs:        1,
 		mountpoints: make(map[*Dentry]uint32),
 	}
-	mntns.root = &Mount{
-		vfs:  vfs,
-		fs:   fs,
-		root: root,
-		ns:   mntns,
-		refs: 1,
-	}
+	mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{})
 	return mntns, nil
 }
 
@@ -148,12 +161,7 @@ func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry,
 	if root != nil {
 		root.IncRef()
 	}
-	return &Mount{
-		vfs:  vfs,
-		fs:   fs,
-		root: root,
-		refs: 1,
-	}, nil
+	return newMount(vfs, fs, root, nil /* mntns */, opts), nil
 }
 
 // MountAt creates and mounts a Filesystem configured by the given arguments.
@@ -218,13 +226,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
 	// are directories, or neither are, and returns ENOTDIR if this is not the
 	// case.
 	mntns := vd.mount.ns
-	mnt := &Mount{
-		vfs:  vfs,
-		fs:   fs,
-		root: root,
-		ns:   mntns,
-		refs: 1,
-	}
+	mnt := newMount(vfs, fs, root, mntns, opts)
 	vfs.mounts.seq.BeginWrite()
 	vfs.connectLocked(mnt, vd, mntns)
 	vfs.mounts.seq.EndWrite()
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 6af7fdac1..3e90dc4ed 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -46,8 +46,21 @@ type MknodOptions struct {
 	DevMinor uint32
 }
 
+// MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC.
+// MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers.
+type MountFlags struct {
+	// NoExec is equivalent to MS_NOEXEC.
+	NoExec bool
+}
+
 // MountOptions contains options to VirtualFilesystem.MountAt().
 type MountOptions struct {
+	// Flags contains flags as specified for mount(2), e.g. MS_NOEXEC.
+	Flags MountFlags
+
+	// ReadOnly is equivalent to MS_RDONLY.
+	ReadOnly bool
+
 	// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
 	GetFilesystemOptions GetFilesystemOptions
 
@@ -75,7 +88,8 @@ type OpenOptions struct {
 
 	// FileExec is set when the file is being opened to be executed.
 	// VirtualFilesystem.OpenAt() checks that the caller has execute permissions
-	// on the file, and that the file is a regular file.
+	// on the file, that the file is a regular file, and that the mount doesn't
+	// have MS_NOEXEC set.
 	FileExec bool
 }
 
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index bde81e1ef..365e8b30d 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -388,6 +388,11 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 			// TODO(gvisor.dev/issue/1193): Move inside fsimpl to avoid another call
 			// to FileDescription.Stat().
 			if opts.FileExec {
+				if fd.Mount().flags.NoExec {
+					fd.DecRef()
+					return nil, syserror.EACCES
+				}
+
 				// Only a regular file can be executed.
 				stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
 				if err != nil {
-- 
cgit v1.2.3


From 28d26d2c4f231c447a10bcbcfb8223a804c9d8bc Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 13 Mar 2020 10:43:09 -0700
Subject: Honour the link's MaxHeaderLength when forwarding

LinkEndpoints may expect/assume that the a tcpip.PacketBuffer's Header
has enough capacity for its own headers, as per documentation for
LinkEndpoint.MaxHeaderLength.

Test: stack_test.TestNICForwarding
PiperOrigin-RevId: 300784192
---
 pkg/tcpip/stack/nic.go        |  18 ++++++-
 pkg/tcpip/stack/stack_test.go | 112 ++++++++++++++++++++++++++----------------
 2 files changed, 87 insertions(+), 43 deletions(-)

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 3cd5fec71..230ee0697 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -15,6 +15,7 @@
 package stack
 
 import (
+	"fmt"
 	"log"
 	"reflect"
 	"sort"
@@ -1259,9 +1260,24 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 
 func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	// TODO(b/143425874) Decrease the TTL field in forwarded packets.
-	pkt.Header = buffer.NewPrependableFromView(pkt.Data.First())
+
+	firstData := pkt.Data.First()
 	pkt.Data.RemoveFirst()
 
+	if linkHeaderLen := int(n.linkEP.MaxHeaderLength()); linkHeaderLen == 0 {
+		pkt.Header = buffer.NewPrependableFromView(firstData)
+	} else {
+		firstDataLen := len(firstData)
+
+		// pkt.Header should have enough capacity to hold n.linkEP's headers.
+		pkt.Header = buffer.NewPrependable(firstDataLen + linkHeaderLen)
+
+		// TODO(b/151227689): avoid copying the packet when forwarding
+		if n := copy(pkt.Header.Prepend(firstDataLen), firstData); n != firstDataLen {
+			panic(fmt.Sprintf("copied %d bytes, expected %d", n, firstDataLen))
+		}
+	}
+
 	if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, pkt); err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 		return
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index e15db40fb..9515426d6 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -2240,56 +2240,84 @@ func TestNICStats(t *testing.T) {
 }
 
 func TestNICForwarding(t *testing.T) {
-	// Create a stack with the fake network protocol, two NICs, each with
-	// an address.
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
-	})
-	s.SetForwarding(true)
+	const nicID1 = 1
+	const nicID2 = 2
+	const dstAddr = tcpip.Address("\x03")
 
-	ep1 := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(1, ep1); err != nil {
-		t.Fatal("CreateNIC #1 failed:", err)
-	}
-	if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil {
-		t.Fatal("AddAddress #1 failed:", err)
+	tests := []struct {
+		name      string
+		headerLen uint16
+	}{
+		{
+			name: "Zero header length",
+		},
+		{
+			name:      "Non-zero header length",
+			headerLen: 16,
+		},
 	}
 
-	ep2 := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(2, ep2); err != nil {
-		t.Fatal("CreateNIC #2 failed:", err)
-	}
-	if err := s.AddAddress(2, fakeNetNumber, "\x02"); err != nil {
-		t.Fatal("AddAddress #2 failed:", err)
-	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+			})
+			s.SetForwarding(true)
 
-	// Route all packets to address 3 to NIC 2.
-	{
-		subnet, err := tcpip.NewSubnet("\x03", "\xff")
-		if err != nil {
-			t.Fatal(err)
-		}
-		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 2}})
-	}
+			ep1 := channel.New(10, defaultMTU, "")
+			if err := s.CreateNIC(nicID1, ep1); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
+			}
+			if err := s.AddAddress(nicID1, fakeNetNumber, "\x01"); err != nil {
+				t.Fatalf("AddAddress(%d, %d, 0x01): %s", nicID1, fakeNetNumber, err)
+			}
 
-	// Send a packet to address 3.
-	buf := buffer.NewView(30)
-	buf[0] = 3
-	ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
-		Data: buf.ToVectorisedView(),
-	})
+			ep2 := channelLinkWithHeaderLength{
+				Endpoint:     channel.New(10, defaultMTU, ""),
+				headerLength: test.headerLen,
+			}
+			if err := s.CreateNIC(nicID2, &ep2); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
+			}
+			if err := s.AddAddress(nicID2, fakeNetNumber, "\x02"); err != nil {
+				t.Fatalf("AddAddress(%d, %d, 0x02): %s", nicID2, fakeNetNumber, err)
+			}
 
-	if _, ok := ep2.Read(); !ok {
-		t.Fatal("Packet not forwarded")
-	}
+			// Route all packets to dstAddr to NIC 2.
+			{
+				subnet, err := tcpip.NewSubnet(dstAddr, "\xff")
+				if err != nil {
+					t.Fatal(err)
+				}
+				s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: nicID2}})
+			}
 
-	// Test that forwarding increments Tx stats correctly.
-	if got, want := s.NICInfo()[2].Stats.Tx.Packets.Value(), uint64(1); got != want {
-		t.Errorf("got Tx.Packets.Value() = %d, want = %d", got, want)
-	}
+			// Send a packet to dstAddr.
+			buf := buffer.NewView(30)
+			buf[0] = dstAddr[0]
+			ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+				Data: buf.ToVectorisedView(),
+			})
 
-	if got, want := s.NICInfo()[2].Stats.Tx.Bytes.Value(), uint64(len(buf)); got != want {
-		t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want)
+			pkt, ok := ep2.Read()
+			if !ok {
+				t.Fatal("packet not forwarded")
+			}
+
+			// Test that the link's MaxHeaderLength is honoured.
+			if capacity, want := pkt.Pkt.Header.AvailableLength(), int(test.headerLen); capacity != want {
+				t.Errorf("got Header.AvailableLength() = %d, want = %d", capacity, want)
+			}
+
+			// Test that forwarding increments Tx stats correctly.
+			if got, want := s.NICInfo()[nicID2].Stats.Tx.Packets.Value(), uint64(1); got != want {
+				t.Errorf("got Tx.Packets.Value() = %d, want = %d", got, want)
+			}
+
+			if got, want := s.NICInfo()[nicID2].Stats.Tx.Bytes.Value(), uint64(len(buf)); got != want {
+				t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want)
+			}
+		})
 	}
 }
 
-- 
cgit v1.2.3


From f458a325e9b6aecf2ee198de19063505c48a14d7 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 13 Mar 2020 11:25:49 -0700
Subject: Fix "application exiting with {Code:0 Signo:27}" during boot.

2aa9514a06a5e34894e606d508ac2df53b082c74 skips SIGURG, but later code expects
the sigchans array contains consecutive signal numbers.

PiperOrigin-RevId: 300793450
---
 pkg/sentry/sighandling/sighandling.go | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go
index 959ef7217..83195d5a1 100644
--- a/pkg/sentry/sighandling/sighandling.go
+++ b/pkg/sentry/sighandling/sighandling.go
@@ -83,12 +83,13 @@ func StartSignalForwarding(handler func(linux.Signal)) func() {
 	// for their handling.
 	var sigchans []chan os.Signal
 	for sig := 1; sig <= numSignals+1; sig++ {
+		sigchan := make(chan os.Signal, 1)
+		sigchans = append(sigchans, sigchan)
+
 		// SIGURG is used by Go's runtime scheduler.
 		if sig == int(linux.SIGURG) {
 			continue
 		}
-		sigchan := make(chan os.Signal, 1)
-		sigchans = append(sigchans, sigchan)
 		signal.Notify(sigchan, syscall.Signal(sig))
 	}
 	// Start up our listener.
-- 
cgit v1.2.3


From 2e38408f20a084de716962d4631e0fec1fd16c16 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 13 Mar 2020 11:40:13 -0700
Subject: Implement access/faccessat for VFS2.

Note that the raw faccessat system call does not actually take a flags argument;
according to faccessat(2), the glibc wrapper implements the flags by using
fstatat(2). Remove the flag argument that we try to extract from vfs1, which
would just be a garbage value.

Updates #1965
Fixes #2101

PiperOrigin-RevId: 300796067
---
 pkg/sentry/fsimpl/ext/filesystem.go    | 10 ++++++
 pkg/sentry/fsimpl/gofer/filesystem.go  | 13 ++++++++
 pkg/sentry/fsimpl/kernfs/filesystem.go | 14 ++++++++
 pkg/sentry/fsimpl/tmpfs/filesystem.go  | 12 +++++++
 pkg/sentry/syscalls/linux/sys_file.go  | 16 ++++++---
 pkg/sentry/syscalls/linux/vfs2/stat.go | 60 +++++++++++++++++++++++++++++++---
 pkg/sentry/vfs/anonfs.go               | 25 +++++++++++---
 pkg/sentry/vfs/filesystem.go           |  4 +++
 pkg/sentry/vfs/vfs.go                  | 17 ++++++++++
 9 files changed, 157 insertions(+), 14 deletions(-)

diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index e05429d41..8497be615 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -255,6 +256,15 @@ func (fs *filesystem) statTo(stat *linux.Statfs) {
 	// TODO(b/134676337): Set Statfs.Flags and Statfs.FSID.
 }
 
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	_, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+	return inode.checkPermissions(rp.Credentials(), ats)
+}
+
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
 func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
 	vfsd, inode, err := fs.walk(rp, false)
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 5cfb0dc4c..38e4cdbc5 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -499,6 +500,18 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) {
 	putDentrySlice(*ds)
 }
 
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return d.checkPermissions(creds, ats, d.isDir())
+}
+
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
 func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
 	var ds *[]*dentry
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 1d7e04ad4..3288de290 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -229,6 +230,19 @@ func (fs *Filesystem) Sync(ctx context.Context) error {
 	return nil
 }
 
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	defer fs.processDeferredDecRefs()
+
+	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	if err != nil {
+		return err
+	}
+	return inode.CheckPermissions(ctx, creds, ats)
+}
+
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
 func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
 	fs.mu.RLock()
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index e1b551422..02637fca6 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -154,6 +155,17 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
 	return create(parent, name)
 }
 
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
+	if err != nil {
+		return err
+	}
+	return d.inode.checkPermissions(creds, ats, d.inode.isDir())
+}
+
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
 func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
 	fs.mu.RLock()
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index d10a9bed8..35a98212a 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -514,7 +514,7 @@ func (ac accessContext) Value(key interface{}) interface{} {
 	}
 }
 
-func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, resolve bool, mode uint) error {
+func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode uint) error {
 	const rOK = 4
 	const wOK = 2
 	const xOK = 1
@@ -529,7 +529,7 @@ func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, resolve bool, mode
 		return syserror.EINVAL
 	}
 
-	return fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+	return fileOpOn(t, dirFD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
 		// access(2) and faccessat(2) check permissions using real
 		// UID/GID, not effective UID/GID.
 		//
@@ -564,17 +564,23 @@ func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	addr := args[0].Pointer()
 	mode := args[1].ModeT()
 
-	return 0, nil, accessAt(t, linux.AT_FDCWD, addr, true, mode)
+	return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode)
 }
 
 // Faccessat implements linux syscall faccessat(2).
+//
+// Note that the faccessat() system call does not take a flags argument:
+// "The raw faccessat() system call takes only the first three arguments. The
+// AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within
+// the glibc wrapper function for faccessat().  If either of these flags is
+// specified, then the wrapper function employs fstatat(2) to determine access
+// permissions." - faccessat(2)
 func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	dirFD := args[0].Int()
 	addr := args[1].Pointer()
 	mode := args[2].ModeT()
-	flags := args[3].Int()
 
-	return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode)
+	return 0, nil, accessAt(t, dirFD, addr, mode)
 }
 
 // LINT.ThenChange(vfs2/filesystem.go)
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
index 12c532310..a74ea6fd5 100644
--- a/pkg/sentry/syscalls/linux/vfs2/stat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -228,14 +228,64 @@ func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 
 // Access implements Linux syscall access(2).
 func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	// FIXME(jamieliu): actually implement
-	return 0, nil, nil
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+
+	return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode)
 }
 
-// Faccessat implements Linux syscall access(2).
+// Faccessat implements Linux syscall faccessat(2).
+//
+// Note that the faccessat() system call does not take a flags argument:
+// "The raw faccessat() system call takes only the first three arguments. The
+// AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within
+// the glibc wrapper function for faccessat().  If either of these flags is
+// specified, then the wrapper function employs fstatat(2) to determine access
+// permissions." - faccessat(2)
 func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	// FIXME(jamieliu): actually implement
-	return 0, nil, nil
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+
+	return 0, nil, accessAt(t, dirfd, addr, mode)
+}
+
+func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error {
+	const rOK = 4
+	const wOK = 2
+	const xOK = 1
+
+	// Sanity check the mode.
+	if mode&^(rOK|wOK|xOK) != 0 {
+		return syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return err
+	}
+
+	// access(2) and faccessat(2) check permissions using real
+	// UID/GID, not effective UID/GID.
+	//
+	// "access() needs to use the real uid/gid, not the effective
+	// uid/gid. We do this by temporarily clearing all FS-related
+	// capabilities and switching the fsuid/fsgid around to the
+	// real ones." -fs/open.c:faccessat
+	creds := t.Credentials().Fork()
+	creds.EffectiveKUID = creds.RealKUID
+	creds.EffectiveKGID = creds.RealKGID
+	if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID {
+		creds.EffectiveCaps = creds.PermittedCaps
+	} else {
+		creds.EffectiveCaps = 0
+	}
+
+	return t.Kernel().VFS().AccessAt(t, creds, vfs.AccessTypes(mode), &tpop.pop)
 }
 
 // Readlinkat implements Linux syscall mknodat(2).
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index 2db25be49..925996517 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -41,7 +41,14 @@ func (vfs *VirtualFilesystem) NewAnonVirtualDentry(name string) VirtualDentry {
 	}
 }
 
-const anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super()
+const (
+	anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super()
+
+	// Mode, UID, and GID for a generic anonfs file.
+	anonFileMode = 0600 // no type is correct
+	anonFileUID  = auth.RootKUID
+	anonFileGID  = auth.RootKGID
+)
 
 // anonFilesystem is the implementation of FilesystemImpl that backs
 // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
@@ -69,6 +76,16 @@ func (fs *anonFilesystem) Sync(ctx context.Context) error {
 	return nil
 }
 
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+//
+// TODO(gvisor.dev/issue/1965): Implement access permissions.
+func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error {
+	if !rp.Done() {
+		return syserror.ENOTDIR
+	}
+	return GenericCheckPermissions(creds, ats, false /* isDir */, anonFileMode, anonFileUID, anonFileGID)
+}
+
 // GetDentryAt implements FilesystemImpl.GetDentryAt.
 func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
 	if !rp.Done() {
@@ -167,9 +184,9 @@ func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts St
 		Mask:     linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
 		Blksize:  anonfsBlockSize,
 		Nlink:    1,
-		UID:      uint32(auth.RootKUID),
-		GID:      uint32(auth.RootKGID),
-		Mode:     0600, // no type is correct
+		UID:      uint32(anonFileUID),
+		GID:      uint32(anonFileGID),
+		Mode:     anonFileMode,
 		Ino:      1,
 		Size:     0,
 		Blocks:   0,
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 556976d0b..c43dcff3d 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
 // A Filesystem is a tree of nodes represented by Dentries, which forms part of
@@ -144,6 +145,9 @@ type FilesystemImpl interface {
 	// file data to be written to the underlying [filesystem]", as by syncfs(2).
 	Sync(ctx context.Context) error
 
+	// AccessAt checks whether a user with creds can access the file at rp.
+	AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error
+
 	// GetDentryAt returns a Dentry representing the file at rp. A reference is
 	// taken on the returned Dentry.
 	//
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 365e8b30d..2e2880171 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -174,6 +174,23 @@ type PathOperation struct {
 	FollowFinalSymlink bool
 }
 
+// AccessAt checks whether a user with creds has access to the file at
+// the given path.
+func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
 // GetDentryAt returns a VirtualDentry representing the given path, at which a
 // file must exist. A reference is taken on the returned VirtualDentry.
 func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
-- 
cgit v1.2.3


From b8fda7f34f7e9629629bb0cdf7f7e72c7efdace9 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Fri, 13 Mar 2020 11:50:30 -0700
Subject: Run "startup" benchmark in Kokoro benchmark job.

PiperOrigin-RevId: 300798423
---
 scripts/benchmark.sh | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh
index 79ff198d5..032899386 100644
--- a/scripts/benchmark.sh
+++ b/scripts/benchmark.sh
@@ -14,13 +14,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Run in the root of the repo.
-cd "$(dirname "$0")"
+source $(dirname $0)/common.sh
 
+# Exporting for subprocesses as GCP APIs and tools check this environmental
+# variable for authentication.
 export GOOGLE_APPLICATION_CREDENTIALS="${KOKORO_KEYSTORE_DIR}/${GCLOUD_CREDENTIALS}"
 
-gcloud auth activate-service-account --key-file "${GOOGLE_APPLICATION_CREDENTIALS}"
+gcloud auth activate-service-account \
+   --key-file "${GOOGLE_APPLICATION_CREDENTIALS}"
 
-gcloud compute instances list
-
-bq show gvisor-benchmarks:test.test
+bazel run //benchmarks:benchmarks -- \
+  --verbose \
+  run-gcp \
+  startup \
+  --runtime=runc \
+  --runtime=runsc \
+  --installers=head
-- 
cgit v1.2.3


From 722abdd8339f1df515beae0ad5272c8c2b2cfed0 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 13 Mar 2020 12:09:58 -0700
Subject: Skip process if it has exited

PiperOrigin-RevId: 300802159
---
 test/root/runsc_test.go | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test/root/runsc_test.go b/test/root/runsc_test.go
index 28bb60a12..90373e2db 100644
--- a/test/root/runsc_test.go
+++ b/test/root/runsc_test.go
@@ -18,6 +18,7 @@ import (
 	"bytes"
 	"fmt"
 	"io/ioutil"
+	"os"
 	"os/exec"
 	"path/filepath"
 	"strconv"
@@ -117,6 +118,10 @@ func sandboxPid(pid int) (int, error) {
 
 		cmdline, err := ioutil.ReadFile(filepath.Join("/proc", line, "cmdline"))
 		if err != nil {
+			if os.IsNotExist(err) {
+				// Raced with process exit.
+				continue
+			}
 			return 0, err
 		}
 		args := strings.SplitN(string(cmdline), "\x00", 2)
-- 
cgit v1.2.3


From b78cee3bae142eb5c602d51874d0cbad274777e2 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 13 Mar 2020 12:16:59 -0700
Subject: Fix lock recursion in kernel.ProcessGroup.SendSignal().

PiperOrigin-RevId: 300803515
---
 pkg/sentry/kernel/sessions.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 047b5214d..0e19286de 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -246,7 +246,7 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error {
 
 	var lastErr error
 	for tg := range tasks.Root.tgids {
-		if tg.ProcessGroup() == pg {
+		if tg.processGroup == pg {
 			tg.signalHandlers.mu.Lock()
 			infoCopy := *info
 			if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
-- 
cgit v1.2.3


From 86409c91813256f45ebcb08efeac9d7f9e56a804 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 13 Mar 2020 12:20:09 -0700
Subject: Avoid unnecessary work in transportDemuxer.deliverPacket().

- Don't allocate []*endpointsByNic in transportDemuxer.deliverPacket() unless
  actually needed for UDP broadcast/multicast.

- Don't allocate []*endpointsByNic via transportDemuxer.findEndpointLocked()
  => transportDemuxer.findAllEndpointsLocked().

- Skip unnecessary map lookups in transportDemuxer.findEndpointLocked() =>
  transportDemuxer.findAllEndpointsLocked() (now iterEndpointsLocked).

For most deliverable packets other than UDP broadcast/multicast packets, this
saves two slice allocations and three map lookups per packet.

PiperOrigin-RevId: 300804135
---
 pkg/tcpip/stack/transport_demuxer.go | 112 ++++++++++++++++++-----------------
 1 file changed, 59 insertions(+), 53 deletions(-)

diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index ff1845bfb..d4c0359e8 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -409,61 +409,45 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 		return false
 	}
 
-	eps.mu.RLock()
-
-	// Determine which transport endpoint or endpoints to deliver this packet to.
 	// If the packet is a UDP broadcast or multicast, then find all matching
-	// transport endpoints. If the packet is a TCP packet with a non-unicast
-	// source or destination address, then do nothing further and instruct
-	// the caller to do the same.
-	var destEps []*endpointsByNic
-	switch protocol {
-	case header.UDPProtocolNumber:
-		if isMulticastOrBroadcast(id.LocalAddress) {
-			destEps = d.findAllEndpointsLocked(eps, id)
-			break
-		}
-
-		if ep := d.findEndpointLocked(eps, id); ep != nil {
-			destEps = append(destEps, ep)
+	// transport endpoints.
+	if protocol == header.UDPProtocolNumber && isMulticastOrBroadcast(id.LocalAddress) {
+		eps.mu.RLock()
+		destEPs := d.findAllEndpointsLocked(eps, id)
+		eps.mu.RUnlock()
+		// Fail if we didn't find at least one matching transport endpoint.
+		if len(destEPs) == 0 {
+			r.Stats().UDP.UnknownPortErrors.Increment()
+			return false
 		}
-
-	case header.TCPProtocolNumber:
-		if !(isUnicast(r.LocalAddress) && isUnicast(r.RemoteAddress)) {
-			// TCP can only be used to communicate between a single
-			// source and a single destination; the addresses must
-			// be unicast.
-			eps.mu.RUnlock()
-			r.Stats().TCP.InvalidSegmentsReceived.Increment()
-			return true
+		// handlePacket takes ownership of pkt, so each endpoint needs its own
+		// copy except for the final one.
+		for _, ep := range destEPs[:len(destEPs)-1] {
+			ep.handlePacket(r, id, pkt.Clone())
 		}
+		destEPs[len(destEPs)-1].handlePacket(r, id, pkt)
+		return true
+	}
 
-		fallthrough
-
-	default:
-		if ep := d.findEndpointLocked(eps, id); ep != nil {
-			destEps = append(destEps, ep)
-		}
+	// If the packet is a TCP packet with a non-unicast source or destination
+	// address, then do nothing further and instruct the caller to do the same.
+	if protocol == header.TCPProtocolNumber && (!isUnicast(r.LocalAddress) || !isUnicast(r.RemoteAddress)) {
+		// TCP can only be used to communicate between a single source and a
+		// single destination; the addresses must be unicast.
+		r.Stats().TCP.InvalidSegmentsReceived.Increment()
+		return true
 	}
 
+	eps.mu.RLock()
+	ep := d.findEndpointLocked(eps, id)
 	eps.mu.RUnlock()
-
-	// Fail if we didn't find at least one matching transport endpoint.
-	if len(destEps) == 0 {
-		// UDP packet could not be delivered to an unknown destination port.
+	if ep == nil {
 		if protocol == header.UDPProtocolNumber {
 			r.Stats().UDP.UnknownPortErrors.Increment()
 		}
 		return false
 	}
-
-	// HandlePacket takes ownership of pkt, so each endpoint needs its own
-	// copy except for the final one.
-	for _, ep := range destEps[:len(destEps)-1] {
-		ep.handlePacket(r, id, pkt.Clone())
-	}
-	destEps[len(destEps)-1].handlePacket(r, id, pkt)
-
+	ep.handlePacket(r, id, pkt)
 	return true
 }
 
@@ -515,11 +499,17 @@ func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtoco
 	return true
 }
 
-func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, id TransportEndpointID) []*endpointsByNic {
-	var matchedEPs []*endpointsByNic
+// iterEndpointsLocked yields all endpointsByNic in eps that match id, in
+// descending order of match quality. If a call to yield returns false,
+// iterEndpointsLocked stops iteration and returns immediately.
+//
+// Preconditions: eps.mu must be locked.
+func (d *transportDemuxer) iterEndpointsLocked(eps *transportEndpoints, id TransportEndpointID, yield func(*endpointsByNic) bool) {
 	// Try to find a match with the id as provided.
 	if ep, ok := eps.endpoints[id]; ok {
-		matchedEPs = append(matchedEPs, ep)
+		if !yield(ep) {
+			return
+		}
 	}
 
 	// Try to find a match with the id minus the local address.
@@ -527,7 +517,9 @@ func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, id Tr
 
 	nid.LocalAddress = ""
 	if ep, ok := eps.endpoints[nid]; ok {
-		matchedEPs = append(matchedEPs, ep)
+		if !yield(ep) {
+			return
+		}
 	}
 
 	// Try to find a match with the id minus the remote part.
@@ -535,14 +527,26 @@ func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, id Tr
 	nid.RemoteAddress = ""
 	nid.RemotePort = 0
 	if ep, ok := eps.endpoints[nid]; ok {
-		matchedEPs = append(matchedEPs, ep)
+		if !yield(ep) {
+			return
+		}
 	}
 
 	// Try to find a match with only the local port.
 	nid.LocalAddress = ""
 	if ep, ok := eps.endpoints[nid]; ok {
-		matchedEPs = append(matchedEPs, ep)
+		if !yield(ep) {
+			return
+		}
 	}
+}
+
+func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, id TransportEndpointID) []*endpointsByNic {
+	var matchedEPs []*endpointsByNic
+	d.iterEndpointsLocked(eps, id, func(ep *endpointsByNic) bool {
+		matchedEPs = append(matchedEPs, ep)
+		return true
+	})
 	return matchedEPs
 }
 
@@ -580,10 +584,12 @@ func (d *transportDemuxer) findTransportEndpoint(netProto tcpip.NetworkProtocolN
 // findEndpointLocked returns the endpoint that most closely matches the given
 // id.
 func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, id TransportEndpointID) *endpointsByNic {
-	if matchedEPs := d.findAllEndpointsLocked(eps, id); len(matchedEPs) > 0 {
-		return matchedEPs[0]
-	}
-	return nil
+	var matchedEP *endpointsByNic
+	d.iterEndpointsLocked(eps, id, func(ep *endpointsByNic) bool {
+		matchedEP = ep
+		return false
+	})
+	return matchedEP
 }
 
 // registerRawEndpoint registers the given endpoint with the dispatcher such
-- 
cgit v1.2.3


From 530a31f3c08b10fbd2f8135c5b76380cf5e7f4e8 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 13 Mar 2020 12:29:19 -0700
Subject: Disable a NIC before removing it

When a NIC is removed, attempt to disable the NIC first to cleanup
dynamic state and stop ongoing periodic tasks (e.g. IPv6 router
solicitations, DAD) so that a removed NIC does not attempt to send
packets.

Tests:
    - stack_test.TestRemoveUnknownNIC
    - stack_test.TestRemoveNIC
    - stack_test.TestDADStop
    - stack_test.TestCleanupNDPState
    - stack_test.TestRouteWithDownNIC
    - stack_test.TestStopStartSolicitingRouters
PiperOrigin-RevId: 300805857
---
 pkg/tcpip/stack/ndp_test.go     | 154 ++++++++++-----
 pkg/tcpip/stack/nic.go          |  77 +++++---
 pkg/tcpip/stack/registration.go |   3 +
 pkg/tcpip/stack/stack_test.go   | 408 ++++++++++++++++++++++++----------------
 4 files changed, 413 insertions(+), 229 deletions(-)

diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 98b1c807c..4368c236c 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -639,8 +639,9 @@ func TestDADStop(t *testing.T) {
 	const nicID = 1
 
 	tests := []struct {
-		name   string
-		stopFn func(t *testing.T, s *stack.Stack)
+		name               string
+		stopFn             func(t *testing.T, s *stack.Stack)
+		skipFinalAddrCheck bool
 	}{
 		// Tests to make sure that DAD stops when an address is removed.
 		{
@@ -661,6 +662,19 @@ func TestDADStop(t *testing.T) {
 				}
 			},
 		},
+
+		// Tests to make sure that DAD stops when the NIC is removed.
+		{
+			name: "Remove NIC",
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				if err := s.RemoveNIC(nicID); err != nil {
+					t.Fatalf("RemoveNIC(%d): %s", nicID, err)
+				}
+			},
+			// The NIC is removed so we can't check its addresses after calling
+			// stopFn.
+			skipFinalAddrCheck: true,
+		},
 	}
 
 	for _, test := range tests {
@@ -710,12 +724,15 @@ func TestDADStop(t *testing.T) {
 					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
 				}
 			}
-			addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
-			if err != nil {
-				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
-			}
-			if want := (tcpip.AddressWithPrefix{}); addr != want {
-				t.Errorf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+
+			if !test.skipFinalAddrCheck {
+				addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+				if err != nil {
+					t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+				}
+				if want := (tcpip.AddressWithPrefix{}); addr != want {
+					t.Errorf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+				}
 			}
 
 			// Should not have sent more than 1 NS message.
@@ -2983,11 +3000,12 @@ func TestCleanupNDPState(t *testing.T) {
 		cleanupFn            func(t *testing.T, s *stack.Stack)
 		keepAutoGenLinkLocal bool
 		maxAutoGenAddrEvents int
+		skipFinalAddrCheck   bool
 	}{
 		// A NIC should still keep its auto-generated link-local address when
 		// becoming a router.
 		{
-			name: "Forwarding Enable",
+			name: "Enable forwarding",
 			cleanupFn: func(t *testing.T, s *stack.Stack) {
 				t.Helper()
 				s.SetForwarding(true)
@@ -2998,7 +3016,7 @@ func TestCleanupNDPState(t *testing.T) {
 
 		// A NIC should cleanup all NDP state when it is disabled.
 		{
-			name: "NIC Disable",
+			name: "Disable NIC",
 			cleanupFn: func(t *testing.T, s *stack.Stack) {
 				t.Helper()
 
@@ -3012,6 +3030,26 @@ func TestCleanupNDPState(t *testing.T) {
 			keepAutoGenLinkLocal: false,
 			maxAutoGenAddrEvents: 6,
 		},
+
+		// A NIC should cleanup all NDP state when it is removed.
+		{
+			name: "Remove NIC",
+			cleanupFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+
+				if err := s.RemoveNIC(nicID1); err != nil {
+					t.Fatalf("s.RemoveNIC(%d): %s", nicID1, err)
+				}
+				if err := s.RemoveNIC(nicID2); err != nil {
+					t.Fatalf("s.RemoveNIC(%d): %s", nicID2, err)
+				}
+			},
+			keepAutoGenLinkLocal: false,
+			maxAutoGenAddrEvents: 6,
+			// The NICs are removed so we can't check their addresses after calling
+			// stopFn.
+			skipFinalAddrCheck: true,
+		},
 	}
 
 	for _, test := range tests {
@@ -3230,35 +3268,37 @@ func TestCleanupNDPState(t *testing.T) {
 				t.Errorf("auto-generated address events mismatch (-want +got):\n%s", diff)
 			}
 
-			// Make sure the auto-generated addresses got removed.
-			nicinfo = s.NICInfo()
-			nic1Addrs = nicinfo[nicID1].ProtocolAddresses
-			nic2Addrs = nicinfo[nicID2].ProtocolAddresses
-			if containsV6Addr(nic1Addrs, llAddrWithPrefix1) != test.keepAutoGenLinkLocal {
-				if test.keepAutoGenLinkLocal {
-					t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
-				} else {
-					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+			if !test.skipFinalAddrCheck {
+				// Make sure the auto-generated addresses got removed.
+				nicinfo = s.NICInfo()
+				nic1Addrs = nicinfo[nicID1].ProtocolAddresses
+				nic2Addrs = nicinfo[nicID2].ProtocolAddresses
+				if containsV6Addr(nic1Addrs, llAddrWithPrefix1) != test.keepAutoGenLinkLocal {
+					if test.keepAutoGenLinkLocal {
+						t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+					} else {
+						t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+					}
 				}
-			}
-			if containsV6Addr(nic1Addrs, e1Addr1) {
-				t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
-			}
-			if containsV6Addr(nic1Addrs, e1Addr2) {
-				t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
-			}
-			if containsV6Addr(nic2Addrs, llAddrWithPrefix2) != test.keepAutoGenLinkLocal {
-				if test.keepAutoGenLinkLocal {
-					t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
-				} else {
-					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+				if containsV6Addr(nic1Addrs, e1Addr1) {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
+				}
+				if containsV6Addr(nic1Addrs, e1Addr2) {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
+				}
+				if containsV6Addr(nic2Addrs, llAddrWithPrefix2) != test.keepAutoGenLinkLocal {
+					if test.keepAutoGenLinkLocal {
+						t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+					} else {
+						t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+					}
+				}
+				if containsV6Addr(nic2Addrs, e2Addr1) {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
+				}
+				if containsV6Addr(nic2Addrs, e2Addr2) {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
 				}
-			}
-			if containsV6Addr(nic2Addrs, e2Addr1) {
-				t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
-			}
-			if containsV6Addr(nic2Addrs, e2Addr2) {
-				t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
 			}
 
 			// Should not get any more events (invalidation timers should have been
@@ -3575,17 +3615,19 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 	tests := []struct {
 		name    string
 		startFn func(t *testing.T, s *stack.Stack)
-		stopFn  func(t *testing.T, s *stack.Stack)
+		// first is used to tell stopFn that it is being called for the first time
+		// after router solicitations were last enabled.
+		stopFn func(t *testing.T, s *stack.Stack, first bool)
 	}{
 		// Tests that when forwarding is enabled or disabled, router solicitations
 		// are stopped or started, respectively.
 		{
-			name: "Forwarding enabled and disabled",
+			name: "Enable and disable forwarding",
 			startFn: func(t *testing.T, s *stack.Stack) {
 				t.Helper()
 				s.SetForwarding(false)
 			},
-			stopFn: func(t *testing.T, s *stack.Stack) {
+			stopFn: func(t *testing.T, s *stack.Stack, _ bool) {
 				t.Helper()
 				s.SetForwarding(true)
 			},
@@ -3594,7 +3636,7 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 		// Tests that when a NIC is enabled or disabled, router solicitations
 		// are started or stopped, respectively.
 		{
-			name: "NIC disabled and enabled",
+			name: "Enable and disable NIC",
 			startFn: func(t *testing.T, s *stack.Stack) {
 				t.Helper()
 
@@ -3602,7 +3644,7 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 					t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
 				}
 			},
-			stopFn: func(t *testing.T, s *stack.Stack) {
+			stopFn: func(t *testing.T, s *stack.Stack, _ bool) {
 				t.Helper()
 
 				if err := s.DisableNIC(nicID); err != nil {
@@ -3610,6 +3652,25 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 				}
 			},
 		},
+
+		// Tests that when a NIC is removed, router solicitations are stopped. We
+		// cannot start router solications on a removed NIC.
+		{
+			name: "Remove NIC",
+			stopFn: func(t *testing.T, s *stack.Stack, first bool) {
+				t.Helper()
+
+				// Only try to remove the NIC the first time stopFn is called since it's
+				// impossible to remove an already removed NIC.
+				if !first {
+					return
+				}
+
+				if err := s.RemoveNIC(nicID); err != nil {
+					t.Fatalf("s.RemoveNIC(%d): %s", nicID, err)
+				}
+			},
+		},
 	}
 
 	for _, test := range tests {
@@ -3648,7 +3709,7 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 			}
 
 			// Stop soliciting routers.
-			test.stopFn(t, s)
+			test.stopFn(t, s, true /* first */)
 			ctx, cancel := context.WithTimeout(context.Background(), delay+defaultTimeout)
 			defer cancel()
 			if _, ok := e.ReadContext(ctx); ok {
@@ -3662,13 +3723,18 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 
 			// Stopping router solicitations after it has already been stopped should
 			// do nothing.
-			test.stopFn(t, s)
+			test.stopFn(t, s, false /* first */)
 			ctx, cancel = context.WithTimeout(context.Background(), delay+defaultTimeout)
 			defer cancel()
 			if _, ok := e.ReadContext(ctx); ok {
 				t.Fatal("unexpectedly got a packet after router solicitation has been stopepd")
 			}
 
+			// If test.startFn is nil, there is no way to restart router solications.
+			if test.startFn == nil {
+				return
+			}
+
 			// Start soliciting routers.
 			test.startFn(t, s)
 			waitForPkt(delay + defaultAsyncEventTimeout)
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 230ee0697..11eaa6a2c 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -56,7 +56,7 @@ type NIC struct {
 		primary       map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint
 		endpoints     map[NetworkEndpointID]*referencedNetworkEndpoint
 		addressRanges []tcpip.Subnet
-		mcastJoins    map[NetworkEndpointID]int32
+		mcastJoins    map[NetworkEndpointID]uint32
 		// packetEPs is protected by mu, but the contained PacketEndpoint
 		// values are not.
 		packetEPs map[tcpip.NetworkProtocolNumber][]PacketEndpoint
@@ -123,7 +123,7 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 	}
 	nic.mu.primary = make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint)
 	nic.mu.endpoints = make(map[NetworkEndpointID]*referencedNetworkEndpoint)
-	nic.mu.mcastJoins = make(map[NetworkEndpointID]int32)
+	nic.mu.mcastJoins = make(map[NetworkEndpointID]uint32)
 	nic.mu.packetEPs = make(map[tcpip.NetworkProtocolNumber][]PacketEndpoint)
 	nic.mu.ndp = ndpState{
 		nic:              nic,
@@ -167,8 +167,17 @@ func (n *NIC) disable() *tcpip.Error {
 	}
 
 	n.mu.Lock()
-	defer n.mu.Unlock()
+	err := n.disableLocked()
+	n.mu.Unlock()
+	return err
+}
 
+// disableLocked disables n.
+//
+// It undoes the work done by enable.
+//
+// n MUST be locked.
+func (n *NIC) disableLocked() *tcpip.Error {
 	if !n.mu.enabled {
 		return nil
 	}
@@ -191,7 +200,7 @@ func (n *NIC) disable() *tcpip.Error {
 		}
 
 		// The NIC may have already left the multicast group.
-		if err := n.leaveGroupLocked(header.IPv6AllNodesMulticastAddress); err != nil && err != tcpip.ErrBadLocalAddress {
+		if err := n.leaveGroupLocked(header.IPv6AllNodesMulticastAddress, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress {
 			return err
 		}
 	}
@@ -307,24 +316,33 @@ func (n *NIC) remove() *tcpip.Error {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	// Detach from link endpoint, so no packet comes in.
-	n.linkEP.Attach(nil)
+	n.disableLocked()
+
+	// TODO(b/151378115): come up with a better way to pick an error than the
+	// first one.
+	var err *tcpip.Error
+
+	// Forcefully leave multicast groups.
+	for nid := range n.mu.mcastJoins {
+		if tempErr := n.leaveGroupLocked(nid.LocalAddress, true /* force */); tempErr != nil && err == nil {
+			err = tempErr
+		}
+	}
 
 	// Remove permanent and permanentTentative addresses, so no packet goes out.
-	var errs []*tcpip.Error
 	for nid, ref := range n.mu.endpoints {
 		switch ref.getKind() {
 		case permanentTentative, permanent:
-			if err := n.removePermanentAddressLocked(nid.LocalAddress); err != nil {
-				errs = append(errs, err)
+			if tempErr := n.removePermanentAddressLocked(nid.LocalAddress); tempErr != nil && err == nil {
+				err = tempErr
 			}
 		}
 	}
-	if len(errs) > 0 {
-		return errs[0]
-	}
 
-	return nil
+	// Detach from link endpoint, so no packet comes in.
+	n.linkEP.Attach(nil)
+
+	return err
 }
 
 // becomeIPv6Router transitions n into an IPv6 router.
@@ -971,6 +989,7 @@ func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) {
 	for i, ref := range refs {
 		if ref == r {
 			n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+			refs[len(refs)-1] = nil
 			break
 		}
 	}
@@ -1021,9 +1040,12 @@ func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
 
 	// If we are removing an IPv6 unicast address, leave the solicited-node
 	// multicast address.
+	//
+	// We ignore the tcpip.ErrBadLocalAddress error because the solicited-node
+	// multicast group may be left by user action.
 	if isIPv6Unicast {
 		snmc := header.SolicitedNodeAddr(addr)
-		if err := n.leaveGroupLocked(snmc); err != nil {
+		if err := n.leaveGroupLocked(snmc, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress {
 			return err
 		}
 	}
@@ -1083,26 +1105,31 @@ func (n *NIC) leaveGroup(addr tcpip.Address) *tcpip.Error {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	return n.leaveGroupLocked(addr)
+	return n.leaveGroupLocked(addr, false /* force */)
 }
 
 // leaveGroupLocked decrements the count for the given multicast address, and
 // when it reaches zero removes the endpoint for this address. n MUST be locked
 // before leaveGroupLocked is called.
-func (n *NIC) leaveGroupLocked(addr tcpip.Address) *tcpip.Error {
+//
+// If force is true, then the count for the multicast addres is ignored and the
+// endpoint will be removed immediately.
+func (n *NIC) leaveGroupLocked(addr tcpip.Address, force bool) *tcpip.Error {
 	id := NetworkEndpointID{addr}
-	joins := n.mu.mcastJoins[id]
-	switch joins {
-	case 0:
+	joins, ok := n.mu.mcastJoins[id]
+	if !ok {
 		// There are no joins with this address on this NIC.
 		return tcpip.ErrBadLocalAddress
-	case 1:
-		// This is the last one, clean up.
-		if err := n.removePermanentAddressLocked(addr); err != nil {
-			return err
-		}
 	}
-	n.mu.mcastJoins[id] = joins - 1
+
+	joins--
+	if force || joins == 0 {
+		// There are no outstanding joins or we are forced to leave, clean up.
+		delete(n.mu.mcastJoins, id)
+		return n.removePermanentAddressLocked(addr)
+	}
+
+	n.mu.mcastJoins[id] = joins
 	return nil
 }
 
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index f9fd8f18f..fa28b46b1 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -401,6 +401,9 @@ type LinkEndpoint interface {
 
 	// Attach attaches the data link layer endpoint to the network-layer
 	// dispatcher of the stack.
+	//
+	// Attach will be called with a nil dispatcher if the receiver's associated
+	// NIC is being removed.
 	Attach(dispatcher NetworkDispatcher)
 
 	// IsAttached returns whether a NetworkDispatcher is attached to the
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 9515426d6..9836b340f 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -255,7 +255,7 @@ type linkEPWithMockedAttach struct {
 // Attach implements stack.LinkEndpoint.Attach.
 func (l *linkEPWithMockedAttach) Attach(d stack.NetworkDispatcher) {
 	l.LinkEndpoint.Attach(d)
-	l.attached = true
+	l.attached = d != nil
 }
 
 func (l *linkEPWithMockedAttach) isAttached() bool {
@@ -566,7 +566,7 @@ func TestAttachToLinkEndpointImmediately(t *testing.T) {
 				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, test.nicOpts, err)
 			}
 			if !e.isAttached() {
-				t.Fatalf("link endpoint not attached to a network disatcher")
+				t.Fatal("link endpoint not attached to a network dispatcher")
 			}
 		})
 	}
@@ -631,196 +631,240 @@ func TestDisabledNICsNICInfoAndCheckNIC(t *testing.T) {
 	checkNIC(false)
 }
 
-func TestRoutesWithDisabledNIC(t *testing.T) {
-	const unspecifiedNIC = 0
-	const nicID1 = 1
-	const nicID2 = 2
-
+func TestRemoveUnknownNIC(t *testing.T) {
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
 	})
 
-	ep1 := channel.New(0, defaultMTU, "")
-	if err := s.CreateNIC(nicID1, ep1); err != nil {
-		t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
+	if err := s.RemoveNIC(1); err != tcpip.ErrUnknownNICID {
+		t.Fatalf("got s.RemoveNIC(1) = %v, want = %s", err, tcpip.ErrUnknownNICID)
 	}
+}
 
-	addr1 := tcpip.Address("\x01")
-	if err := s.AddAddress(nicID1, fakeNetNumber, addr1); err != nil {
-		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID1, fakeNetNumber, addr1, err)
-	}
+func TestRemoveNIC(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
 
-	ep2 := channel.New(0, defaultMTU, "")
-	if err := s.CreateNIC(nicID2, ep2); err != nil {
-		t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
+	e := linkEPWithMockedAttach{
+		LinkEndpoint: loopback.New(),
+	}
+	if err := s.CreateNIC(nicID, &e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 	}
 
-	addr2 := tcpip.Address("\x02")
-	if err := s.AddAddress(nicID2, fakeNetNumber, addr2); err != nil {
-		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID2, fakeNetNumber, addr2, err)
+	// NIC should be present in NICInfo and attached to a NetworkDispatcher.
+	allNICInfo := s.NICInfo()
+	if _, ok := allNICInfo[nicID]; !ok {
+		t.Errorf("entry for %d missing from allNICInfo = %+v", nicID, allNICInfo)
+	}
+	if !e.isAttached() {
+		t.Fatal("link endpoint not attached to a network dispatcher")
 	}
 
-	// Set a route table that sends all packets with odd destination
-	// addresses through the first NIC, and all even destination address
-	// through the second one.
-	{
-		subnet0, err := tcpip.NewSubnet("\x00", "\x01")
-		if err != nil {
-			t.Fatal(err)
-		}
-		subnet1, err := tcpip.NewSubnet("\x01", "\x01")
-		if err != nil {
-			t.Fatal(err)
-		}
-		s.SetRouteTable([]tcpip.Route{
-			{Destination: subnet1, Gateway: "\x00", NIC: nicID1},
-			{Destination: subnet0, Gateway: "\x00", NIC: nicID2},
-		})
+	// Removing a NIC should remove it from NICInfo and e should be detached from
+	// the NetworkDispatcher.
+	if err := s.RemoveNIC(nicID); err != nil {
+		t.Fatalf("s.RemoveNIC(%d): %s", nicID, err)
 	}
+	if nicInfo, ok := s.NICInfo()[nicID]; ok {
+		t.Errorf("got unexpected NICInfo entry for deleted NIC %d = %+v", nicID, nicInfo)
+	}
+	if e.isAttached() {
+		t.Error("link endpoint for removed NIC still attached to a network dispatcher")
+	}
+}
 
-	// Test routes to odd address.
-	testRoute(t, s, unspecifiedNIC, "", "\x05", addr1)
-	testRoute(t, s, unspecifiedNIC, addr1, "\x05", addr1)
-	testRoute(t, s, nicID1, addr1, "\x05", addr1)
+func TestRouteWithDownNIC(t *testing.T) {
+	tests := []struct {
+		name   string
+		downFn func(s *stack.Stack, nicID tcpip.NICID) *tcpip.Error
+		upFn   func(s *stack.Stack, nicID tcpip.NICID) *tcpip.Error
+	}{
+		{
+			name:   "Disabled NIC",
+			downFn: (*stack.Stack).DisableNIC,
+			upFn:   (*stack.Stack).EnableNIC,
+		},
+
+		// Once a NIC is removed, it cannot be brought up.
+		{
+			name:   "Removed NIC",
+			downFn: (*stack.Stack).RemoveNIC,
+		},
+	}
 
-	// Test routes to even address.
-	testRoute(t, s, unspecifiedNIC, "", "\x06", addr2)
-	testRoute(t, s, unspecifiedNIC, addr2, "\x06", addr2)
-	testRoute(t, s, nicID2, addr2, "\x06", addr2)
-
-	// Disabling NIC1 should result in no routes to odd addresses. Routes to even
-	// addresses should continue to be available as NIC2 is still enabled.
-	if err := s.DisableNIC(nicID1); err != nil {
-		t.Fatalf("s.DisableNIC(%d): %s", nicID1, err)
-	}
-	nic1Dst := tcpip.Address("\x05")
-	testNoRoute(t, s, unspecifiedNIC, "", nic1Dst)
-	testNoRoute(t, s, unspecifiedNIC, addr1, nic1Dst)
-	testNoRoute(t, s, nicID1, addr1, nic1Dst)
-	nic2Dst := tcpip.Address("\x06")
-	testRoute(t, s, unspecifiedNIC, "", nic2Dst, addr2)
-	testRoute(t, s, unspecifiedNIC, addr2, nic2Dst, addr2)
-	testRoute(t, s, nicID2, addr2, nic2Dst, addr2)
-
-	// Disabling NIC2 should result in no routes to even addresses. No route
-	// should be available to any address as routes to odd addresses were made
-	// unavailable by disabling NIC1 above.
-	if err := s.DisableNIC(nicID2); err != nil {
-		t.Fatalf("s.DisableNIC(%d): %s", nicID2, err)
-	}
-	testNoRoute(t, s, unspecifiedNIC, "", nic1Dst)
-	testNoRoute(t, s, unspecifiedNIC, addr1, nic1Dst)
-	testNoRoute(t, s, nicID1, addr1, nic1Dst)
-	testNoRoute(t, s, unspecifiedNIC, "", nic2Dst)
-	testNoRoute(t, s, unspecifiedNIC, addr2, nic2Dst)
-	testNoRoute(t, s, nicID2, addr2, nic2Dst)
-
-	// Enabling NIC1 should make routes to odd addresses available again. Routes
-	// to even addresses should continue to be unavailable as NIC2 is still
-	// disabled.
-	if err := s.EnableNIC(nicID1); err != nil {
-		t.Fatalf("s.EnableNIC(%d): %s", nicID1, err)
-	}
-	testRoute(t, s, unspecifiedNIC, "", nic1Dst, addr1)
-	testRoute(t, s, unspecifiedNIC, addr1, nic1Dst, addr1)
-	testRoute(t, s, nicID1, addr1, nic1Dst, addr1)
-	testNoRoute(t, s, unspecifiedNIC, "", nic2Dst)
-	testNoRoute(t, s, unspecifiedNIC, addr2, nic2Dst)
-	testNoRoute(t, s, nicID2, addr2, nic2Dst)
-}
-
-func TestRouteWritePacketWithDisabledNIC(t *testing.T) {
 	const unspecifiedNIC = 0
 	const nicID1 = 1
 	const nicID2 = 2
+	const addr1 = tcpip.Address("\x01")
+	const addr2 = tcpip.Address("\x02")
+	const nic1Dst = tcpip.Address("\x05")
+	const nic2Dst = tcpip.Address("\x06")
+
+	setup := func(t *testing.T) (*stack.Stack, *channel.Endpoint, *channel.Endpoint) {
+		s := stack.New(stack.Options{
+			NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		})
 
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
-	})
+		ep1 := channel.New(1, defaultMTU, "")
+		if err := s.CreateNIC(nicID1, ep1); err != nil {
+			t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
+		}
 
-	ep1 := channel.New(1, defaultMTU, "")
-	if err := s.CreateNIC(nicID1, ep1); err != nil {
-		t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
-	}
+		if err := s.AddAddress(nicID1, fakeNetNumber, addr1); err != nil {
+			t.Fatalf("AddAddress(%d, %d, %s): %s", nicID1, fakeNetNumber, addr1, err)
+		}
 
-	addr1 := tcpip.Address("\x01")
-	if err := s.AddAddress(nicID1, fakeNetNumber, addr1); err != nil {
-		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID1, fakeNetNumber, addr1, err)
-	}
+		ep2 := channel.New(1, defaultMTU, "")
+		if err := s.CreateNIC(nicID2, ep2); err != nil {
+			t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
+		}
 
-	ep2 := channel.New(1, defaultMTU, "")
-	if err := s.CreateNIC(nicID2, ep2); err != nil {
-		t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
-	}
+		if err := s.AddAddress(nicID2, fakeNetNumber, addr2); err != nil {
+			t.Fatalf("AddAddress(%d, %d, %s): %s", nicID2, fakeNetNumber, addr2, err)
+		}
 
-	addr2 := tcpip.Address("\x02")
-	if err := s.AddAddress(nicID2, fakeNetNumber, addr2); err != nil {
-		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID2, fakeNetNumber, addr2, err)
+		// Set a route table that sends all packets with odd destination
+		// addresses through the first NIC, and all even destination address
+		// through the second one.
+		{
+			subnet0, err := tcpip.NewSubnet("\x00", "\x01")
+			if err != nil {
+				t.Fatal(err)
+			}
+			subnet1, err := tcpip.NewSubnet("\x01", "\x01")
+			if err != nil {
+				t.Fatal(err)
+			}
+			s.SetRouteTable([]tcpip.Route{
+				{Destination: subnet1, Gateway: "\x00", NIC: nicID1},
+				{Destination: subnet0, Gateway: "\x00", NIC: nicID2},
+			})
+		}
+
+		return s, ep1, ep2
 	}
 
-	// Set a route table that sends all packets with odd destination
-	// addresses through the first NIC, and all even destination address
-	// through the second one.
-	{
-		subnet0, err := tcpip.NewSubnet("\x00", "\x01")
-		if err != nil {
-			t.Fatal(err)
-		}
-		subnet1, err := tcpip.NewSubnet("\x01", "\x01")
-		if err != nil {
-			t.Fatal(err)
+	// Tests that routes through a down NIC are not used when looking up a route
+	// for a destination.
+	t.Run("Find", func(t *testing.T) {
+		for _, test := range tests {
+			t.Run(test.name, func(t *testing.T) {
+				s, _, _ := setup(t)
+
+				// Test routes to odd address.
+				testRoute(t, s, unspecifiedNIC, "", "\x05", addr1)
+				testRoute(t, s, unspecifiedNIC, addr1, "\x05", addr1)
+				testRoute(t, s, nicID1, addr1, "\x05", addr1)
+
+				// Test routes to even address.
+				testRoute(t, s, unspecifiedNIC, "", "\x06", addr2)
+				testRoute(t, s, unspecifiedNIC, addr2, "\x06", addr2)
+				testRoute(t, s, nicID2, addr2, "\x06", addr2)
+
+				// Bringing NIC1 down should result in no routes to odd addresses. Routes to
+				// even addresses should continue to be available as NIC2 is still up.
+				if err := test.downFn(s, nicID1); err != nil {
+					t.Fatalf("test.downFn(_, %d): %s", nicID1, err)
+				}
+				testNoRoute(t, s, unspecifiedNIC, "", nic1Dst)
+				testNoRoute(t, s, unspecifiedNIC, addr1, nic1Dst)
+				testNoRoute(t, s, nicID1, addr1, nic1Dst)
+				testRoute(t, s, unspecifiedNIC, "", nic2Dst, addr2)
+				testRoute(t, s, unspecifiedNIC, addr2, nic2Dst, addr2)
+				testRoute(t, s, nicID2, addr2, nic2Dst, addr2)
+
+				// Bringing NIC2 down should result in no routes to even addresses. No
+				// route should be available to any address as routes to odd addresses
+				// were made unavailable by bringing NIC1 down above.
+				if err := test.downFn(s, nicID2); err != nil {
+					t.Fatalf("test.downFn(_, %d): %s", nicID2, err)
+				}
+				testNoRoute(t, s, unspecifiedNIC, "", nic1Dst)
+				testNoRoute(t, s, unspecifiedNIC, addr1, nic1Dst)
+				testNoRoute(t, s, nicID1, addr1, nic1Dst)
+				testNoRoute(t, s, unspecifiedNIC, "", nic2Dst)
+				testNoRoute(t, s, unspecifiedNIC, addr2, nic2Dst)
+				testNoRoute(t, s, nicID2, addr2, nic2Dst)
+
+				if upFn := test.upFn; upFn != nil {
+					// Bringing NIC1 up should make routes to odd addresses available
+					// again. Routes to even addresses should continue to be unavailable
+					// as NIC2 is still down.
+					if err := upFn(s, nicID1); err != nil {
+						t.Fatalf("test.upFn(_, %d): %s", nicID1, err)
+					}
+					testRoute(t, s, unspecifiedNIC, "", nic1Dst, addr1)
+					testRoute(t, s, unspecifiedNIC, addr1, nic1Dst, addr1)
+					testRoute(t, s, nicID1, addr1, nic1Dst, addr1)
+					testNoRoute(t, s, unspecifiedNIC, "", nic2Dst)
+					testNoRoute(t, s, unspecifiedNIC, addr2, nic2Dst)
+					testNoRoute(t, s, nicID2, addr2, nic2Dst)
+				}
+			})
 		}
-		s.SetRouteTable([]tcpip.Route{
-			{Destination: subnet1, Gateway: "\x00", NIC: nicID1},
-			{Destination: subnet0, Gateway: "\x00", NIC: nicID2},
-		})
-	}
+	})
 
-	nic1Dst := tcpip.Address("\x05")
-	r1, err := s.FindRoute(nicID1, addr1, nic1Dst, fakeNetNumber, false /* multicastLoop */)
-	if err != nil {
-		t.Errorf("FindRoute(%d, %s, %s, %d, false): %s", nicID1, addr1, nic1Dst, fakeNetNumber, err)
-	}
-	defer r1.Release()
+	// Tests that writing a packet using a Route through a down NIC fails.
+	t.Run("WritePacket", func(t *testing.T) {
+		for _, test := range tests {
+			t.Run(test.name, func(t *testing.T) {
+				s, ep1, ep2 := setup(t)
 
-	nic2Dst := tcpip.Address("\x06")
-	r2, err := s.FindRoute(nicID2, addr2, nic2Dst, fakeNetNumber, false /* multicastLoop */)
-	if err != nil {
-		t.Errorf("FindRoute(%d, %s, %s, %d, false): %s", nicID2, addr2, nic2Dst, fakeNetNumber, err)
-	}
-	defer r2.Release()
+				r1, err := s.FindRoute(nicID1, addr1, nic1Dst, fakeNetNumber, false /* multicastLoop */)
+				if err != nil {
+					t.Errorf("FindRoute(%d, %s, %s, %d, false): %s", nicID1, addr1, nic1Dst, fakeNetNumber, err)
+				}
+				defer r1.Release()
 
-	// If we failed to get routes r1 or r2, we cannot proceed with the test.
-	if t.Failed() {
-		t.FailNow()
-	}
+				r2, err := s.FindRoute(nicID2, addr2, nic2Dst, fakeNetNumber, false /* multicastLoop */)
+				if err != nil {
+					t.Errorf("FindRoute(%d, %s, %s, %d, false): %s", nicID2, addr2, nic2Dst, fakeNetNumber, err)
+				}
+				defer r2.Release()
 
-	buf := buffer.View([]byte{1})
-	testSend(t, r1, ep1, buf)
-	testSend(t, r2, ep2, buf)
+				// If we failed to get routes r1 or r2, we cannot proceed with the test.
+				if t.Failed() {
+					t.FailNow()
+				}
 
-	// Writes with Routes that use the disabled NIC1 should fail.
-	if err := s.DisableNIC(nicID1); err != nil {
-		t.Fatalf("s.DisableNIC(%d): %s", nicID1, err)
-	}
-	testFailingSend(t, r1, ep1, buf, tcpip.ErrInvalidEndpointState)
-	testSend(t, r2, ep2, buf)
+				buf := buffer.View([]byte{1})
+				testSend(t, r1, ep1, buf)
+				testSend(t, r2, ep2, buf)
 
-	// Writes with Routes that use the disabled NIC2 should fail.
-	if err := s.DisableNIC(nicID2); err != nil {
-		t.Fatalf("s.DisableNIC(%d): %s", nicID2, err)
-	}
-	testFailingSend(t, r1, ep1, buf, tcpip.ErrInvalidEndpointState)
-	testFailingSend(t, r2, ep2, buf, tcpip.ErrInvalidEndpointState)
+				// Writes with Routes that use NIC1 after being brought down should fail.
+				if err := test.downFn(s, nicID1); err != nil {
+					t.Fatalf("test.downFn(_, %d): %s", nicID1, err)
+				}
+				testFailingSend(t, r1, ep1, buf, tcpip.ErrInvalidEndpointState)
+				testSend(t, r2, ep2, buf)
 
-	// Writes with Routes that use the re-enabled NIC1 should succeed.
-	// TODO(b/147015577): Should we instead completely invalidate all Routes that
-	// were bound to a disabled NIC at some point?
-	if err := s.EnableNIC(nicID1); err != nil {
-		t.Fatalf("s.EnableNIC(%d): %s", nicID1, err)
-	}
-	testSend(t, r1, ep1, buf)
-	testFailingSend(t, r2, ep2, buf, tcpip.ErrInvalidEndpointState)
+				// Writes with Routes that use NIC2 after being brought down should fail.
+				if err := test.downFn(s, nicID2); err != nil {
+					t.Fatalf("test.downFn(_, %d): %s", nicID2, err)
+				}
+				testFailingSend(t, r1, ep1, buf, tcpip.ErrInvalidEndpointState)
+				testFailingSend(t, r2, ep2, buf, tcpip.ErrInvalidEndpointState)
+
+				if upFn := test.upFn; upFn != nil {
+					// Writes with Routes that use NIC1 after being brought up should
+					// succeed.
+					//
+					// TODO(b/147015577): Should we instead completely invalidate all
+					// Routes that were bound to a NIC that was brought down at some
+					// point?
+					if err := upFn(s, nicID1); err != nil {
+						t.Fatalf("test.upFn(_, %d): %s", nicID1, err)
+					}
+					testSend(t, r1, ep1, buf)
+					testFailingSend(t, r2, ep2, buf, tcpip.ErrInvalidEndpointState)
+				}
+			})
+		}
+	})
 }
 
 func TestRoutes(t *testing.T) {
@@ -3038,6 +3082,50 @@ func TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable(t *testing.T) {
 	}
 }
 
+// TestLeaveIPv6SolicitedNodeAddrBeforeAddrRemoval tests that removing an IPv6
+// address after leaving its solicited node multicast address does not result in
+// an error.
+func TestLeaveIPv6SolicitedNodeAddrBeforeAddrRemoval(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+	})
+	e := channel.New(10, 1280, linkAddr1)
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+	}
+
+	if err := s.AddAddress(nicID, ipv6.ProtocolNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, addr1, err)
+	}
+
+	// The NIC should have joined addr1's solicited node multicast address.
+	snmc := header.SolicitedNodeAddr(addr1)
+	in, err := s.IsInGroup(nicID, snmc)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, snmc, err)
+	}
+	if !in {
+		t.Fatalf("got IsInGroup(%d, %s) = false, want = true", nicID, snmc)
+	}
+
+	if err := s.LeaveGroup(ipv6.ProtocolNumber, nicID, snmc); err != nil {
+		t.Fatalf("LeaveGroup(%d, %d, %s): %s", ipv6.ProtocolNumber, nicID, snmc, err)
+	}
+	in, err = s.IsInGroup(nicID, snmc)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, snmc, err)
+	}
+	if in {
+		t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, snmc)
+	}
+
+	if err := s.RemoveAddress(nicID, addr1); err != nil {
+		t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr1, err)
+	}
+}
+
 func TestJoinLeaveAllNodesMulticastOnNICEnableDisable(t *testing.T) {
 	const nicID = 1
 
-- 
cgit v1.2.3


From 1c0535297067179a822ba2dd9a6fe13a8be5a666 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 13 Mar 2020 13:17:59 -0700
Subject: Fix oom_score_adj.

- Make oomScoreAdj a ThreadGroup field (Linux: signal_struct::oom_score_adj).

- Avoid deadlock caused by Task.OOMScoreAdj()/SetOOMScoreAdj() locking Task.mu
  and TaskSet.mu in the wrong order (via Task.ExitState()).

PiperOrigin-RevId: 300814698
---
 pkg/sentry/fs/proc/task.go           | 17 ++++++++++-------
 pkg/sentry/fsimpl/proc/task_files.go | 10 ++++++----
 pkg/sentry/kernel/task.go            | 29 ++++++-----------------------
 pkg/sentry/kernel/task_clone.go      |  9 +++------
 pkg/sentry/kernel/task_start.go      |  4 ----
 pkg/sentry/kernel/thread_group.go    |  7 +++++++
 6 files changed, 32 insertions(+), 44 deletions(-)

diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 03cc788c8..d6c5dd2c1 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -853,15 +853,15 @@ func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F
 
 // Read implements fs.FileOperations.Read.
 func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
-	if offset != 0 {
-		return 0, io.EOF
+	if f.t.ExitState() == kernel.TaskExitDead {
+		return 0, syserror.ESRCH
 	}
-	adj, err := f.t.OOMScoreAdj()
-	if err != nil {
-		return 0, err
+	var buf bytes.Buffer
+	fmt.Fprintf(&buf, "%d\n", f.t.OOMScoreAdj())
+	if offset >= int64(buf.Len()) {
+		return 0, io.EOF
 	}
-	adjBytes := []byte(strconv.FormatInt(int64(adj), 10) + "\n")
-	n, err := dst.CopyOut(ctx, adjBytes)
+	n, err := dst.CopyOut(ctx, buf.Bytes()[offset:])
 	return int64(n), err
 }
 
@@ -880,6 +880,9 @@ func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOS
 		return 0, err
 	}
 
+	if f.t.ExitState() == kernel.TaskExitDead {
+		return 0, syserror.ESRCH
+	}
 	if err := f.t.SetOOMScoreAdj(v); err != nil {
 		return 0, err
 	}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 5a231ac86..4d3332771 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -539,11 +539,10 @@ var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	adj, err := o.task.OOMScoreAdj()
-	if err != nil {
-		return err
+	if o.task.ExitState() == kernel.TaskExitDead {
+		return syserror.ESRCH
 	}
-	fmt.Fprintf(buf, "%d\n", adj)
+	fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj())
 	return nil
 }
 
@@ -562,6 +561,9 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset
 		return 0, err
 	}
 
+	if o.task.ExitState() == kernel.TaskExitDead {
+		return 0, syserror.ESRCH
+	}
 	if err := o.task.SetOOMScoreAdj(v); err != nil {
 		return 0, err
 	}
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index c0dbbe890..8452ddf5b 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -555,13 +555,6 @@ type Task struct {
 	//
 	// startTime is protected by mu.
 	startTime ktime.Time
-
-	// oomScoreAdj is the task's OOM score adjustment. This is currently not
-	// used but is maintained for consistency.
-	// TODO(gvisor.dev/issue/1967)
-	//
-	// oomScoreAdj is protected by mu, and is owned by the task goroutine.
-	oomScoreAdj int32
 }
 
 func (t *Task) savePtraceTracer() *Task {
@@ -856,27 +849,17 @@ func (t *Task) ContainerID() string {
 	return t.containerID
 }
 
-// OOMScoreAdj gets the task's OOM score adjustment.
-func (t *Task) OOMScoreAdj() (int32, error) {
-	t.mu.Lock()
-	defer t.mu.Unlock()
-	if t.ExitState() == TaskExitDead {
-		return 0, syserror.ESRCH
-	}
-	return t.oomScoreAdj, nil
+// OOMScoreAdj gets the task's thread group's OOM score adjustment.
+func (t *Task) OOMScoreAdj() int32 {
+	return atomic.LoadInt32(&t.tg.oomScoreAdj)
 }
 
-// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be
-// between -1000 and 1000 inclusive.
+// SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The
+// value should be between -1000 and 1000 inclusive.
 func (t *Task) SetOOMScoreAdj(adj int32) error {
-	t.mu.Lock()
-	defer t.mu.Unlock()
-	if t.ExitState() == TaskExitDead {
-		return syserror.ESRCH
-	}
 	if adj > 1000 || adj < -1000 {
 		return syserror.EINVAL
 	}
-	t.oomScoreAdj = adj
+	atomic.StoreInt32(&t.tg.oomScoreAdj, adj)
 	return nil
 }
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index dda502bb8..e1ecca99e 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -15,6 +15,8 @@
 package kernel
 
 import (
+	"sync/atomic"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bpf"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
@@ -260,15 +262,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 			sh = sh.Fork()
 		}
 		tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy())
+		tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj)
 		rseqAddr = t.rseqAddr
 		rseqSignature = t.rseqSignature
 	}
 
-	adj, err := t.OOMScoreAdj()
-	if err != nil {
-		return 0, nil, err
-	}
-
 	cfg := &TaskConfig{
 		Kernel:                  t.k,
 		ThreadGroup:             tg,
@@ -287,7 +285,6 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		RSeqAddr:                rseqAddr,
 		RSeqSignature:           rseqSignature,
 		ContainerID:             t.ContainerID(),
-		OOMScoreAdj:             adj,
 	}
 	if opts.NewThreadGroup {
 		cfg.Parent = t
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 2bbf48bb8..a5035bb7f 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -93,9 +93,6 @@ type TaskConfig struct {
 
 	// ContainerID is the container the new task belongs to.
 	ContainerID string
-
-	// oomScoreAdj is the task's OOM score adjustment.
-	OOMScoreAdj int32
 }
 
 // NewTask creates a new task defined by cfg.
@@ -146,7 +143,6 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		rseqSignature:      cfg.RSeqSignature,
 		futexWaiter:        futex.NewWaiter(),
 		containerID:        cfg.ContainerID,
-		oomScoreAdj:        cfg.OOMScoreAdj,
 	}
 	t.creds.Store(cfg.Credentials)
 	t.endStopCond.L = &t.tg.signalHandlers.mu
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 268f62e9d..52849f5b3 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -254,6 +254,13 @@ type ThreadGroup struct {
 	//
 	// tty is protected by the signal mutex.
 	tty *TTY
+
+	// oomScoreAdj is the thread group's OOM score adjustment. This is
+	// currently not used but is maintained for consistency.
+	// TODO(gvisor.dev/issue/1967)
+	//
+	// oomScoreAdj is accessed using atomic memory operations.
+	oomScoreAdj int32
 }
 
 // NewThreadGroup returns a new, empty thread group in PID namespace pidns. The
-- 
cgit v1.2.3


From 645b1b2e9cd40084a42d6168de72a915449780b7 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 13 Mar 2020 14:58:16 -0700
Subject: Refactor SLAAC address state into SLAAC prefix state

Previously, SLAAC related state was stored on a per-address basis. This was
sufficient for the simple case of a single SLAAC address per prefix, but
future CLs will introduce temporary addresses which will result in multiple
SLAAC addresses for a prefix. This refactor allows storing multiple addresses
for a prefix in a single SLAAC prefix state.

No behaviour changes - existing tests continue to pass.

PiperOrigin-RevId: 300832812
---
 pkg/tcpip/stack/ndp.go | 296 ++++++++++++++++++++++++++-----------------------
 pkg/tcpip/stack/nic.go |  20 ++--
 2 files changed, 167 insertions(+), 149 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index a9f4d5dad..d689a006d 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -361,16 +361,16 @@ type ndpState struct {
 	// The default routers discovered through Router Advertisements.
 	defaultRouters map[tcpip.Address]defaultRouterState
 
+	// The timer used to send the next router solicitation message.
+	rtrSolicitTimer *time.Timer
+
 	// The on-link prefixes discovered through Router Advertisements' Prefix
 	// Information option.
 	onLinkPrefixes map[tcpip.Subnet]onLinkPrefixState
 
-	// The timer used to send the next router solicitation message.
-	// If routers are being solicited, rtrSolicitTimer MUST NOT be nil.
-	rtrSolicitTimer *time.Timer
-
-	// The addresses generated by SLAAC.
-	autoGenAddresses map[tcpip.Address]autoGenAddressState
+	// The SLAAC prefixes discovered through Router Advertisements' Prefix
+	// Information option.
+	slaacPrefixes map[tcpip.Subnet]slaacPrefixState
 
 	// The last learned DHCPv6 configuration from an NDP RA.
 	dhcpv6Configuration DHCPv6ConfigurationFromNDPRA
@@ -402,18 +402,16 @@ type onLinkPrefixState struct {
 	invalidationTimer tcpip.CancellableTimer
 }
 
-// autoGenAddressState holds data associated with an address generated via
-// SLAAC.
-type autoGenAddressState struct {
-	// A reference to the referencedNetworkEndpoint that this autoGenAddressState
-	// is holding state for.
-	ref *referencedNetworkEndpoint
-
+// slaacPrefixState holds state associated with a SLAAC prefix.
+type slaacPrefixState struct {
 	deprecationTimer  tcpip.CancellableTimer
 	invalidationTimer tcpip.CancellableTimer
 
 	// Nonzero only when the address is not valid forever.
 	validUntil time.Time
+
+	// The prefix's permanent address endpoint.
+	ref *referencedNetworkEndpoint
 }
 
 // startDuplicateAddressDetection performs Duplicate Address Detection.
@@ -899,23 +897,15 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 
 	prefix := pi.Subnet()
 
-	// Check if we already have an auto-generated address for prefix.
-	for addr, addrState := range ndp.autoGenAddresses {
-		refAddrWithPrefix := tcpip.AddressWithPrefix{Address: addr, PrefixLen: addrState.ref.ep.PrefixLen()}
-		if refAddrWithPrefix.Subnet() != prefix {
-			continue
-		}
-
-		// At this point, we know we are refreshing a SLAAC generated IPv6 address
-		// with the prefix prefix. Do the work as outlined by RFC 4862 section
-		// 5.5.3.e.
-		ndp.refreshAutoGenAddressLifetimes(addr, pl, vl)
+	// Check if we already maintain SLAAC state for prefix.
+	if _, ok := ndp.slaacPrefixes[prefix]; ok {
+		// As per RFC 4862 section 5.5.3.e, refresh prefix's SLAAC lifetimes.
+		ndp.refreshSLAACPrefixLifetimes(prefix, pl, vl)
 		return
 	}
 
-	// We do not already have an address with the prefix prefix. Do the
-	// work as outlined by RFC 4862 section 5.5.3.d if n is configured
-	// to auto-generate global addresses by SLAAC.
+	// prefix is a new SLAAC prefix. Do the work as outlined by RFC 4862 section
+	// 5.5.3.d if ndp is configured to auto-generate new addresses via SLAAC.
 	if !ndp.configs.AutoGenGlobalAddresses {
 		return
 	}
@@ -927,6 +917,8 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 // for prefix.
 //
 // pl is the new preferred lifetime. vl is the new valid lifetime.
+//
+// The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 	// If we do not already have an address for this prefix and the valid
 	// lifetime is 0, no need to do anything further, as per RFC 4862
@@ -942,9 +934,59 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 		return
 	}
 
+	// If the preferred lifetime is zero, then the prefix should be considered
+	// deprecated.
+	deprecated := pl == 0
+	ref := ndp.addSLAACAddr(prefix, deprecated)
+	if ref == nil {
+		// We were unable to generate a permanent address for prefix so do nothing
+		// further as there is no reason to maintain state for a SLAAC prefix we
+		// cannot generate a permanent address for.
+		return
+	}
+
+	state := slaacPrefixState{
+		deprecationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+			prefixState, ok := ndp.slaacPrefixes[prefix]
+			if !ok {
+				log.Fatalf("ndp: must have a slaacPrefixes entry for the SLAAC prefix %s", prefix)
+			}
+
+			ndp.deprecateSLAACAddress(prefixState.ref)
+		}),
+		invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+			ndp.invalidateSLAACPrefix(prefix, true)
+		}),
+		ref: ref,
+	}
+
+	// Setup the initial timers to deprecate and invalidate prefix.
+
+	if !deprecated && pl < header.NDPInfiniteLifetime {
+		state.deprecationTimer.Reset(pl)
+	}
+
+	if vl < header.NDPInfiniteLifetime {
+		state.invalidationTimer.Reset(vl)
+		state.validUntil = time.Now().Add(vl)
+	}
+
+	ndp.slaacPrefixes[prefix] = state
+}
+
+// addSLAACAddr adds a SLAAC address for prefix.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) addSLAACAddr(prefix tcpip.Subnet, deprecated bool) *referencedNetworkEndpoint {
 	addrBytes := []byte(prefix.ID())
 	if oIID := ndp.nic.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
-		addrBytes = header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], prefix, oIID.NICNameFromID(ndp.nic.ID(), ndp.nic.name), 0 /* dadCounter */, oIID.SecretKey)
+		addrBytes = header.AppendOpaqueInterfaceIdentifier(
+			addrBytes[:header.IIDOffsetInIPv6Address],
+			prefix,
+			oIID.NICNameFromID(ndp.nic.ID(), ndp.nic.name),
+			0, /* dadCounter */
+			oIID.SecretKey,
+		)
 	} else {
 		// Only attempt to generate an interface-specific IID if we have a valid
 		// link address.
@@ -953,137 +995,103 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 		// LinkEndpoint.LinkAddress) before reaching this point.
 		linkAddr := ndp.nic.linkEP.LinkAddress()
 		if !header.IsValidUnicastEthernetAddress(linkAddr) {
-			return
+			return nil
 		}
 
 		// Generate an address within prefix from the modified EUI-64 of ndp's NIC's
 		// Ethernet MAC address.
 		header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
 	}
-	addr := tcpip.Address(addrBytes)
-	addrWithPrefix := tcpip.AddressWithPrefix{
-		Address:   addr,
-		PrefixLen: validPrefixLenForAutoGen,
+
+	generatedAddr := tcpip.ProtocolAddress{
+		Protocol: header.IPv6ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(addrBytes),
+			PrefixLen: validPrefixLenForAutoGen,
+		},
 	}
 
 	// If the nic already has this address, do nothing further.
-	if ndp.nic.hasPermanentAddrLocked(addr) {
-		return
+	if ndp.nic.hasPermanentAddrLocked(generatedAddr.AddressWithPrefix.Address) {
+		return nil
 	}
 
 	// Inform the integrator that we have a new SLAAC address.
 	ndpDisp := ndp.nic.stack.ndpDisp
 	if ndpDisp == nil {
-		return
+		return nil
 	}
-	if !ndpDisp.OnAutoGenAddress(ndp.nic.ID(), addrWithPrefix) {
+
+	if !ndpDisp.OnAutoGenAddress(ndp.nic.ID(), generatedAddr.AddressWithPrefix) {
 		// Informed by the integrator not to add the address.
-		return
+		return nil
 	}
 
-	protocolAddr := tcpip.ProtocolAddress{
-		Protocol:          header.IPv6ProtocolNumber,
-		AddressWithPrefix: addrWithPrefix,
-	}
-	// If the preferred lifetime is zero, then the address should be considered
-	// deprecated.
-	deprecated := pl == 0
-	ref, err := ndp.nic.addAddressLocked(protocolAddr, FirstPrimaryEndpoint, permanent, slaac, deprecated)
+	ref, err := ndp.nic.addAddressLocked(generatedAddr, FirstPrimaryEndpoint, permanent, slaac, deprecated)
 	if err != nil {
-		log.Fatalf("ndp: error when adding address %s: %s", protocolAddr, err)
-	}
-
-	state := autoGenAddressState{
-		ref: ref,
-		deprecationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
-			addrState, ok := ndp.autoGenAddresses[addr]
-			if !ok {
-				log.Fatalf("ndp: must have an autoGenAddressess entry for the SLAAC generated IPv6 address %s", addr)
-			}
-			addrState.ref.deprecated = true
-			ndp.notifyAutoGenAddressDeprecated(addr)
-		}),
-		invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
-			ndp.invalidateAutoGenAddress(addr)
-		}),
+		log.Fatalf("ndp: error when adding address %+v: %s", generatedAddr, err)
 	}
 
-	// Setup the initial timers to deprecate and invalidate this newly generated
-	// address.
-
-	if !deprecated && pl < header.NDPInfiniteLifetime {
-		state.deprecationTimer.Reset(pl)
-	}
-
-	if vl < header.NDPInfiniteLifetime {
-		state.invalidationTimer.Reset(vl)
-		state.validUntil = time.Now().Add(vl)
-	}
-
-	ndp.autoGenAddresses[addr] = state
+	return ref
 }
 
-// refreshAutoGenAddressLifetimes refreshes the lifetime of a SLAAC generated
-// address addr.
+// refreshSLAACPrefixLifetimes refreshes the lifetimes of a SLAAC prefix.
 //
 // pl is the new preferred lifetime. vl is the new valid lifetime.
-func (ndp *ndpState) refreshAutoGenAddressLifetimes(addr tcpip.Address, pl, vl time.Duration) {
-	addrState, ok := ndp.autoGenAddresses[addr]
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, pl, vl time.Duration) {
+	prefixState, ok := ndp.slaacPrefixes[prefix]
 	if !ok {
-		log.Fatalf("ndp: SLAAC state not found to refresh lifetimes for %s", addr)
+		log.Fatalf("ndp: SLAAC prefix state not found to refresh lifetimes for %s", prefix)
 	}
-	defer func() { ndp.autoGenAddresses[addr] = addrState }()
+	defer func() { ndp.slaacPrefixes[prefix] = prefixState }()
 
-	// If the preferred lifetime is zero, then the address should be considered
-	// deprecated.
+	// If the preferred lifetime is zero, then the prefix should be deprecated.
 	deprecated := pl == 0
-	wasDeprecated := addrState.ref.deprecated
-	addrState.ref.deprecated = deprecated
-
-	// Only send the deprecation event if the deprecated status for addr just
-	// changed from non-deprecated to deprecated.
-	if !wasDeprecated && deprecated {
-		ndp.notifyAutoGenAddressDeprecated(addr)
+	if deprecated {
+		ndp.deprecateSLAACAddress(prefixState.ref)
+	} else {
+		prefixState.ref.deprecated = false
 	}
 
-	// If addr was preferred for some finite lifetime before, stop the deprecation
-	// timer so it can be reset.
-	addrState.deprecationTimer.StopLocked()
+	// If prefix was preferred for some finite lifetime before, stop the
+	// deprecation timer so it can be reset.
+	prefixState.deprecationTimer.StopLocked()
 
-	// Reset the deprecation timer if addr has a finite preferred lifetime.
+	// Reset the deprecation timer if prefix has a finite preferred lifetime.
 	if !deprecated && pl < header.NDPInfiniteLifetime {
-		addrState.deprecationTimer.Reset(pl)
+		prefixState.deprecationTimer.Reset(pl)
 	}
 
-	// As per RFC 4862 section 5.5.3.e, the valid lifetime of the address
-	//
+	// As per RFC 4862 section 5.5.3.e, update the valid lifetime for prefix:
 	//
 	// 1) If the received Valid Lifetime is greater than 2 hours or greater than
-	//    RemainingLifetime, set the valid lifetime of the address to the
+	//    RemainingLifetime, set the valid lifetime of the prefix to the
 	//    advertised Valid Lifetime.
 	//
 	// 2) If RemainingLifetime is less than or equal to 2 hours, ignore the
 	//    advertised Valid Lifetime.
 	//
-	// 3) Otherwise, reset the valid lifetime of the address to 2 hours.
+	// 3) Otherwise, reset the valid lifetime of the prefix to 2 hours.
 
 	// Handle the infinite valid lifetime separately as we do not keep a timer in
 	// this case.
 	if vl >= header.NDPInfiniteLifetime {
-		addrState.invalidationTimer.StopLocked()
-		addrState.validUntil = time.Time{}
+		prefixState.invalidationTimer.StopLocked()
+		prefixState.validUntil = time.Time{}
 		return
 	}
 
 	var effectiveVl time.Duration
 	var rl time.Duration
 
-	// If the address was originally set to be valid forever, assume the remaining
+	// If the prefix was originally set to be valid forever, assume the remaining
 	// time to be the maximum possible value.
-	if addrState.validUntil == (time.Time{}) {
+	if prefixState.validUntil == (time.Time{}) {
 		rl = header.NDPInfiniteLifetime
 	} else {
-		rl = time.Until(addrState.validUntil)
+		rl = time.Until(prefixState.validUntil)
 	}
 
 	if vl > MinPrefixInformationValidLifetimeForUpdate || vl > rl {
@@ -1094,58 +1102,66 @@ func (ndp *ndpState) refreshAutoGenAddressLifetimes(addr tcpip.Address, pl, vl t
 		effectiveVl = MinPrefixInformationValidLifetimeForUpdate
 	}
 
-	addrState.invalidationTimer.StopLocked()
-	addrState.invalidationTimer.Reset(effectiveVl)
-	addrState.validUntil = time.Now().Add(effectiveVl)
-}
-
-// notifyAutoGenAddressDeprecated notifies the stack's NDP dispatcher that addr
-// has been deprecated.
-func (ndp *ndpState) notifyAutoGenAddressDeprecated(addr tcpip.Address) {
-	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
-		ndpDisp.OnAutoGenAddressDeprecated(ndp.nic.ID(), tcpip.AddressWithPrefix{
-			Address:   addr,
-			PrefixLen: validPrefixLenForAutoGen,
-		})
-	}
+	prefixState.invalidationTimer.StopLocked()
+	prefixState.invalidationTimer.Reset(effectiveVl)
+	prefixState.validUntil = time.Now().Add(effectiveVl)
 }
 
-// invalidateAutoGenAddress invalidates an auto-generated address.
+// deprecateSLAACAddress marks ref as deprecated and notifies the stack's NDP
+// dispatcher that ref has been deprecated.
+//
+// deprecateSLAACAddress does nothing if ref is already deprecated.
 //
 // The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) invalidateAutoGenAddress(addr tcpip.Address) {
-	if !ndp.cleanupAutoGenAddrResourcesAndNotify(addr) {
+func (ndp *ndpState) deprecateSLAACAddress(ref *referencedNetworkEndpoint) {
+	if ref.deprecated {
 		return
 	}
 
-	ndp.nic.removePermanentAddressLocked(addr)
+	ref.deprecated = true
+	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnAutoGenAddressDeprecated(ndp.nic.ID(), tcpip.AddressWithPrefix{
+			Address:   ref.ep.ID().LocalAddress,
+			PrefixLen: ref.ep.PrefixLen(),
+		})
+	}
 }
 
-// cleanupAutoGenAddrResourcesAndNotify cleans up an invalidated auto-generated
-// address's resources from ndp. If the stack has an NDP dispatcher, it will
-// be notified that addr has been invalidated.
-//
-// Returns true if ndp had resources for addr to cleanup.
+// invalidateSLAACPrefix invalidates a SLAAC prefix.
 //
 // The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bool {
-	state, ok := ndp.autoGenAddresses[addr]
+func (ndp *ndpState) invalidateSLAACPrefix(prefix tcpip.Subnet, removeAddr bool) {
+	state, ok := ndp.slaacPrefixes[prefix]
 	if !ok {
-		return false
+		return
 	}
 
 	state.deprecationTimer.StopLocked()
 	state.invalidationTimer.StopLocked()
-	delete(ndp.autoGenAddresses, addr)
+	delete(ndp.slaacPrefixes, prefix)
+
+	addr := state.ref.ep.ID().LocalAddress
+
+	if removeAddr {
+		if err := ndp.nic.removePermanentAddressLocked(addr); err != nil {
+			log.Fatalf("ndp: removePermanentAddressLocked(%s): %s", addr, err)
+		}
+	}
 
 	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
 		ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), tcpip.AddressWithPrefix{
 			Address:   addr,
-			PrefixLen: validPrefixLenForAutoGen,
+			PrefixLen: state.ref.ep.PrefixLen(),
 		})
 	}
+}
 
-	return true
+// cleanupSLAACAddrResourcesAndNotify cleans up an invalidated SLAAC
+// address's resources from ndp.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix) {
+	ndp.invalidateSLAACPrefix(addr.Subnet(), false)
 }
 
 // cleanupState cleans up ndp's state.
@@ -1163,21 +1179,21 @@ func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bo
 // The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) cleanupState(hostOnly bool) {
 	linkLocalSubnet := header.IPv6LinkLocalPrefix.Subnet()
-	linkLocalAddrs := 0
-	for addr := range ndp.autoGenAddresses {
+	linkLocalPrefixes := 0
+	for prefix := range ndp.slaacPrefixes {
 		// RFC 4862 section 5 states that routers are also expected to generate a
 		// link-local address so we do not invalidate them if we are cleaning up
 		// host-only state.
-		if hostOnly && linkLocalSubnet.Contains(addr) {
-			linkLocalAddrs++
+		if hostOnly && prefix == linkLocalSubnet {
+			linkLocalPrefixes++
 			continue
 		}
 
-		ndp.invalidateAutoGenAddress(addr)
+		ndp.invalidateSLAACPrefix(prefix, true)
 	}
 
-	if got := len(ndp.autoGenAddresses); got != linkLocalAddrs {
-		log.Fatalf("ndp: still have non-linklocal auto-generated addresses after cleaning up; found = %d prefixes, of which %d are link-local", got, linkLocalAddrs)
+	if got := len(ndp.slaacPrefixes); got != linkLocalPrefixes {
+		log.Fatalf("ndp: still have non-linklocal SLAAC prefixes after cleaning up; found = %d prefixes, of which %d are link-local", got, linkLocalPrefixes)
 	}
 
 	for prefix := range ndp.onLinkPrefixes {
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 11eaa6a2c..9dcb1d52c 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -126,12 +126,12 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 	nic.mu.mcastJoins = make(map[NetworkEndpointID]uint32)
 	nic.mu.packetEPs = make(map[tcpip.NetworkProtocolNumber][]PacketEndpoint)
 	nic.mu.ndp = ndpState{
-		nic:              nic,
-		configs:          stack.ndpConfigs,
-		dad:              make(map[tcpip.Address]dadState),
-		defaultRouters:   make(map[tcpip.Address]defaultRouterState),
-		onLinkPrefixes:   make(map[tcpip.Subnet]onLinkPrefixState),
-		autoGenAddresses: make(map[tcpip.Address]autoGenAddressState),
+		nic:            nic,
+		configs:        stack.ndpConfigs,
+		dad:            make(map[tcpip.Address]dadState),
+		defaultRouters: make(map[tcpip.Address]defaultRouterState),
+		onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState),
+		slaacPrefixes:  make(map[tcpip.Subnet]slaacPrefixState),
 	}
 
 	// Register supported packet endpoint protocols.
@@ -1017,8 +1017,7 @@ func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
 	isIPv6Unicast := r.protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(addr)
 
 	if isIPv6Unicast {
-		// If we are removing a tentative IPv6 unicast address, stop
-		// DAD.
+		// If we are removing a tentative IPv6 unicast address, stop DAD.
 		if kind == permanentTentative {
 			n.mu.ndp.stopDuplicateAddressDetection(addr)
 		}
@@ -1026,7 +1025,10 @@ func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
 		// If we are removing an address generated via SLAAC, cleanup
 		// its SLAAC resources and notify the integrator.
 		if r.configType == slaac {
-			n.mu.ndp.cleanupAutoGenAddrResourcesAndNotify(addr)
+			n.mu.ndp.cleanupSLAACAddrResourcesAndNotify(tcpip.AddressWithPrefix{
+				Address:   addr,
+				PrefixLen: r.ep.PrefixLen(),
+			})
 		}
 	}
 
-- 
cgit v1.2.3


From 6d4497de25e50cd0bd8bd3c4a1813c1e1bbf914a Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Fri, 13 Mar 2020 14:59:22 -0700
Subject: Fix typo

PiperOrigin-RevId: 300832988
---
 pkg/tcpip/link/fdbased/endpoint.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index b7f60178e..a753fb243 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -467,7 +467,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 
 	views := pkts[0].Data.Views()
 	/*
-	 * Each bondary in views can add one more iovec.
+	 * Each boundary in views can add one more iovec.
 	 *
 	 * payload |      |          |         |
 	 *         -----------------------------
-- 
cgit v1.2.3


From d8cf54f20f2dd7348d06796f29a42991086c3f81 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Fri, 13 Mar 2020 15:17:01 -0700
Subject: Internal change

PiperOrigin-RevId: 300836270
---
 kokoro/benchmark_tests.cfg | 2 +-
 scripts/benchmark.sh       | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/kokoro/benchmark_tests.cfg b/kokoro/benchmark_tests.cfg
index 7e2c8acad..f85cc9681 100644
--- a/kokoro/benchmark_tests.cfg
+++ b/kokoro/benchmark_tests.cfg
@@ -12,7 +12,7 @@ before_action {
 
 env_vars {
   key : 'PROJECT'
-  value : 'gvisor-kokoro-testing'
+  value : 'gvisor-benchmarks'
 }
 
 env_vars {
diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh
index 032899386..06d44f914 100644
--- a/scripts/benchmark.sh
+++ b/scripts/benchmark.sh
@@ -23,6 +23,9 @@ export GOOGLE_APPLICATION_CREDENTIALS="${KOKORO_KEYSTORE_DIR}/${GCLOUD_CREDENTIA
 gcloud auth activate-service-account \
    --key-file "${GOOGLE_APPLICATION_CREDENTIALS}"
 
+gcloud config set project ${PROJECT}
+gcloud config set compute/zone ${ZONE}
+
 bazel run //benchmarks:benchmarks -- \
   --verbose \
   run-gcp \
-- 
cgit v1.2.3


From b0f2c3e7646df603156f1b8e8b3382f33353eb04 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 13 Mar 2020 16:08:06 -0700
Subject: Fix infinite loop in semaphore.sem.wakeWaiters().

PiperOrigin-RevId: 300845134
---
 pkg/sentry/kernel/semaphore/semaphore.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index 1000f3287..c00fa1138 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -554,6 +554,7 @@ func (s *sem) wakeWaiters() {
 	for w := s.waiters.Front(); w != nil; {
 		if s.value < w.value {
 			// Still blocked, skip it.
+			w = w.Next()
 			continue
 		}
 		w.ch <- struct{}{}
-- 
cgit v1.2.3


From 829beebf0b67e20e50dd5ec4a5030636e38cc576 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 13 Mar 2020 17:16:59 -0700
Subject: Panic if file in FDTable has been destroyed

This will give more information about the file to
identify where possibly the extra DecRef()
would be.

PiperOrigin-RevId: 300855874
---
 pkg/sentry/kernel/fd_table.go | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 58001d56c..00f914564 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -195,6 +195,8 @@ func (f *FDTable) Size() int {
 //
 // It is the caller's responsibility to acquire an appropriate lock.
 func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
+	// retries tracks the number of failed TryIncRef attempts for the same FD.
+	retries := 0
 	fd := int32(0)
 	for {
 		file, fileVFS2, flags, ok := f.getAll(fd)
@@ -204,17 +206,26 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes
 		switch {
 		case file != nil:
 			if !file.TryIncRef() {
+				retries++
+				if retries > 1000 {
+					panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, FileOps: %+v", fd, file, file.FileOperations))
+				}
 				continue // Race caught.
 			}
 			fn(fd, file, nil, flags)
 			file.DecRef()
 		case fileVFS2 != nil:
 			if !fileVFS2.TryIncRef() {
+				retries++
+				if retries > 1000 {
+					panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, Impl: %+v", fd, fileVFS2, fileVFS2.Impl()))
+				}
 				continue // Race caught.
 			}
 			fn(fd, nil, fileVFS2, flags)
 			fileVFS2.DecRef()
 		}
+		retries = 0
 		fd++
 	}
 }
-- 
cgit v1.2.3


From 45a8ae240dd180f1b8b4c56e77ac67e4cd3af96f Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 13 Mar 2020 18:56:05 -0700
Subject: Add remaining procfs files

Closes #1195

PiperOrigin-RevId: 300867055
---
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go |   8 +
 pkg/sentry/fsimpl/kernfs/kernfs.go          |   2 +-
 pkg/sentry/fsimpl/proc/BUILD                |   4 +
 pkg/sentry/fsimpl/proc/subtasks.go          |   6 +-
 pkg/sentry/fsimpl/proc/task.go              |  26 +--
 pkg/sentry/fsimpl/proc/task_fds.go          | 287 ++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/proc/task_files.go        | 251 +++++++++++++++++++++++-
 pkg/sentry/fsimpl/proc/tasks.go             |  30 ++-
 pkg/sentry/fsimpl/proc/tasks_files.go       |  52 +++--
 pkg/sentry/fsimpl/proc/tasks_test.go        |  87 +++++++--
 pkg/sentry/fsimpl/testutil/BUILD            |   2 +
 pkg/sentry/fsimpl/testutil/kernel.go        |  27 ++-
 pkg/sentry/kernel/fd_table.go               |   7 +-
 13 files changed, 709 insertions(+), 80 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/proc/task_fds.go

diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index d50018b18..94ca3dbdd 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -554,3 +554,11 @@ func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs
 	fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
 	return fd.VFSFileDescription(), nil
 }
+
+// AlwaysValid partially implements kernfs.inodeDynamicLookup.
+type AlwaysValid struct{}
+
+// Valid implements kernfs.inodeDynamicLookup.
+func (*AlwaysValid) Valid(context.Context) bool {
+	return true
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index a8ab2a2ba..18a34a590 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -319,7 +319,7 @@ type inodeMetadata interface {
 	// CheckPermissions checks that creds may access this inode for the
 	// requested access type, per the the rules of
 	// fs/namei.c:generic_permission().
-	CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error
+	CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error
 
 	// Mode returns the (struct stat)::st_mode value for this inode. This is
 	// separated from Stat for performance.
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index bb609a305..8156984eb 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -8,6 +8,7 @@ go_library(
         "filesystem.go",
         "subtasks.go",
         "task.go",
+        "task_fds.go",
         "task_files.go",
         "task_net.go",
         "tasks.go",
@@ -19,8 +20,10 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/log",
+        "//pkg/refs",
         "//pkg/safemem",
         "//pkg/sentry/fs",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
@@ -53,6 +56,7 @@ go_test(
         "//pkg/fspath",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/testutil",
+        "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 611645f3f..a3a7c16a5 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -34,6 +34,7 @@ type subtasksInode struct {
 	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeAttrs
 	kernfs.OrderedChildren
+	kernfs.AlwaysValid
 
 	task              *kernel.Task
 	pidns             *kernel.PIDNamespace
@@ -61,11 +62,6 @@ func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenera
 	return dentry
 }
 
-// Valid implements kernfs.inodeDynamicLookup.
-func (i *subtasksInode) Valid(ctx context.Context) bool {
-	return true
-}
-
 // Lookup implements kernfs.inodeDynamicLookup.
 func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
 	tid, err := strconv.ParseUint(name, 10, 32)
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 493acbd1b..4891caab6 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -45,19 +45,19 @@ var _ kernfs.Inode = (*taskInode)(nil)
 
 func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry {
 	contents := map[string]*kernfs.Dentry{
-		"auxv":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}),
-		"cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
-		"comm":    newComm(task, inoGen.NextIno(), 0444),
-		"environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
-		//"exe":       newExe(t, msrc),
-		//"fd":        newFdDir(t, msrc),
-		//"fdinfo":    newFdInfoDir(t, msrc),
-		"gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}),
-		"io":      newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)),
-		"maps":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}),
-		//"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
-		//"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
-		"net": newTaskNetDir(task, inoGen),
+		"auxv":      newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}),
+		"cmdline":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
+		"comm":      newComm(task, inoGen.NextIno(), 0444),
+		"environ":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
+		"exe":       newExeSymlink(task, inoGen.NextIno()),
+		"fd":        newFDDirInode(task, inoGen),
+		"fdinfo":    newFDInfoDirInode(task, inoGen),
+		"gid_map":   newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}),
+		"io":        newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)),
+		"maps":      newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}),
+		"mountinfo": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountInfoData{task: task}),
+		"mounts":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountsData{task: task}),
+		"net":       newTaskNetDir(task, inoGen),
 		"ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{
 			"net":  newNamespaceSymlink(task, inoGen.NextIno(), "net"),
 			"pid":  newNamespaceSymlink(task, inoGen.NextIno(), "pid"),
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
new file mode 100644
index 000000000..76bfc5307
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -0,0 +1,287 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+	"sort"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+type fdDir struct {
+	inoGen InoGenerator
+	task   *kernel.Task
+
+	// When produceSymlinks is set, dirents produces for the FDs are reported
+	// as symlink. Otherwise, they are reported as regular files.
+	produceSymlink bool
+}
+
+func (i *fdDir) lookup(name string) (*vfs.FileDescription, kernel.FDFlags, error) {
+	fd, err := strconv.ParseUint(name, 10, 64)
+	if err != nil {
+		return nil, kernel.FDFlags{}, syserror.ENOENT
+	}
+
+	var (
+		file  *vfs.FileDescription
+		flags kernel.FDFlags
+	)
+	i.task.WithMuLocked(func(t *kernel.Task) {
+		if fdTable := t.FDTable(); fdTable != nil {
+			file, flags = fdTable.GetVFS2(int32(fd))
+		}
+	})
+	if file == nil {
+		return nil, kernel.FDFlags{}, syserror.ENOENT
+	}
+	return file, flags, nil
+}
+
+// IterDirents implements kernfs.inodeDynamicLookup.
+func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, absOffset, relOffset int64) (int64, error) {
+	var fds []int32
+	i.task.WithMuLocked(func(t *kernel.Task) {
+		if fdTable := t.FDTable(); fdTable != nil {
+			fds = fdTable.GetFDs()
+		}
+	})
+
+	offset := absOffset + relOffset
+	typ := uint8(linux.DT_REG)
+	if i.produceSymlink {
+		typ = linux.DT_LNK
+	}
+
+	// Find the appropriate starting point.
+	idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(relOffset) })
+	if idx >= len(fds) {
+		return offset, nil
+	}
+	for _, fd := range fds[idx:] {
+		dirent := vfs.Dirent{
+			Name:    strconv.FormatUint(uint64(fd), 10),
+			Type:    typ,
+			Ino:     i.inoGen.NextIno(),
+			NextOff: offset + 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
+		}
+		offset++
+	}
+	return offset, nil
+}
+
+// fdDirInode represents the inode for /proc/[pid]/fd directory.
+//
+// +stateify savable
+type fdDirInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
+	kernfs.AlwaysValid
+	fdDir
+}
+
+var _ kernfs.Inode = (*fdDirInode)(nil)
+
+func newFDDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry {
+	inode := &fdDirInode{
+		fdDir: fdDir{
+			inoGen:         inoGen,
+			task:           task,
+			produceSymlink: true,
+		},
+	}
+	inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555)
+
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
+	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+
+	return dentry
+}
+
+// Lookup implements kernfs.inodeDynamicLookup.
+func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	file, _, err := i.lookup(name)
+	if err != nil {
+		return nil, err
+	}
+	taskDentry := newFDSymlink(i.task.Credentials(), file, i.inoGen.NextIno())
+	return taskDentry.VFSDentry(), nil
+}
+
+// Open implements kernfs.Inode.
+func (i *fdDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &kernfs.GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	return fd.VFSFileDescription(), nil
+}
+
+// CheckPermissions implements kernfs.Inode.
+//
+// This is to match Linux, which uses a special permission handler to guarantee
+// that a process can still access /proc/self/fd after it has executed
+// setuid. See fs/proc/fd.c:proc_fd_permission.
+func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	err := i.InodeAttrs.CheckPermissions(ctx, creds, ats)
+	if err == nil {
+		// Access granted, no extra check needed.
+		return nil
+	}
+	if t := kernel.TaskFromContext(ctx); t != nil {
+		// Allow access if the task trying to access it is in the thread group
+		// corresponding to this directory.
+		if i.task.ThreadGroup() == t.ThreadGroup() {
+			// Access granted (overridden).
+			return nil
+		}
+	}
+	return err
+}
+
+// fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file.
+//
+// +stateify savable
+type fdSymlink struct {
+	refs.AtomicRefCount
+	kernfs.InodeAttrs
+	kernfs.InodeSymlink
+
+	file *vfs.FileDescription
+}
+
+var _ kernfs.Inode = (*fdSymlink)(nil)
+
+func newFDSymlink(creds *auth.Credentials, file *vfs.FileDescription, ino uint64) *kernfs.Dentry {
+	file.IncRef()
+	inode := &fdSymlink{file: file}
+	inode.Init(creds, ino, linux.ModeSymlink|0777)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
+	root := vfs.RootFromContext(ctx)
+	defer root.DecRef()
+
+	vfsObj := s.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem()
+	return vfsObj.PathnameWithDeleted(ctx, root, s.file.VirtualDentry())
+}
+
+func (s *fdSymlink) DecRef() {
+	s.AtomicRefCount.DecRefWithDestructor(func() {
+		s.Destroy()
+	})
+}
+
+func (s *fdSymlink) Destroy() {
+	s.file.DecRef()
+}
+
+// fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory.
+//
+// +stateify savable
+type fdInfoDirInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
+	kernfs.AlwaysValid
+	fdDir
+}
+
+var _ kernfs.Inode = (*fdInfoDirInode)(nil)
+
+func newFDInfoDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry {
+	inode := &fdInfoDirInode{
+		fdDir: fdDir{
+			inoGen: inoGen,
+			task:   task,
+		},
+	}
+	inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555)
+
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
+	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+
+	return dentry
+}
+
+// Lookup implements kernfs.inodeDynamicLookup.
+func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	file, flags, err := i.lookup(name)
+	if err != nil {
+		return nil, err
+	}
+
+	data := &fdInfoData{file: file, flags: flags}
+	dentry := newTaskOwnedFile(i.task, i.inoGen.NextIno(), 0444, data)
+	return dentry.VFSDentry(), nil
+}
+
+// Open implements kernfs.Inode.
+func (i *fdInfoDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &kernfs.GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	return fd.VFSFileDescription(), nil
+}
+
+// fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd].
+//
+// +stateify savable
+type fdInfoData struct {
+	kernfs.DynamicBytesFile
+	refs.AtomicRefCount
+
+	file  *vfs.FileDescription
+	flags kernel.FDFlags
+}
+
+var _ dynamicInode = (*fdInfoData)(nil)
+
+func (d *fdInfoData) DecRef() {
+	d.AtomicRefCount.DecRefWithDestructor(d.destroy)
+}
+
+func (d *fdInfoData) destroy() {
+	d.file.DecRef()
+}
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	// TODO(b/121266871): Include pos, locks, and other data. For now we only
+	// have flags.
+	// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt
+	flags := uint(d.file.StatusFlags()) | d.flags.ToLinuxFileFlags()
+	fmt.Fprintf(buf, "flags:\t0%o\n", flags)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 4d3332771..8c743df8d 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -18,10 +18,14 @@ import (
 	"bytes"
 	"fmt"
 	"io"
+	"sort"
+	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -496,7 +500,7 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
-// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider.
+// ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider.
 type ioUsage interface {
 	// IOUsage returns the io usage data.
 	IOUsage() *usage.IO
@@ -570,3 +574,248 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset
 
 	return n, nil
 }
+
+// exeSymlink is an symlink for the /proc/[pid]/exe file.
+//
+// +stateify savable
+type exeSymlink struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	task *kernel.Task
+}
+
+var _ kernfs.Inode = (*exeSymlink)(nil)
+
+func newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry {
+	inode := &exeSymlink{task: task}
+	inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+// Readlink implements kernfs.Inode.
+func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
+	if !kernel.ContextCanTrace(ctx, s.task, false) {
+		return "", syserror.EACCES
+	}
+
+	// Pull out the executable for /proc/[pid]/exe.
+	exec, err := s.executable()
+	if err != nil {
+		return "", err
+	}
+	defer exec.DecRef()
+
+	return exec.PathnameWithDeleted(ctx), nil
+}
+
+func (s *exeSymlink) executable() (file fsbridge.File, err error) {
+	s.task.WithMuLocked(func(t *kernel.Task) {
+		mm := t.MemoryManager()
+		if mm == nil {
+			// TODO(b/34851096): Check shouldn't allow Readlink once the
+			// Task is zombied.
+			err = syserror.EACCES
+			return
+		}
+
+		// The MemoryManager may be destroyed, in which case
+		// MemoryManager.destroy will simply set the executable to nil
+		// (with locks held).
+		file = mm.Executable()
+		if file == nil {
+			err = syserror.ENOENT
+		}
+	})
+	return
+}
+
+// forEachMountSource runs f for the process root mount and each mount that is
+// a descendant of the root.
+func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) {
+	var fsctx *kernel.FSContext
+	t.WithMuLocked(func(t *kernel.Task) {
+		fsctx = t.FSContext()
+	})
+	if fsctx == nil {
+		// The task has been destroyed. Nothing to show here.
+		return
+	}
+
+	// All mount points must be relative to the rootDir, and mounts outside
+	// will be excluded.
+	rootDir := fsctx.RootDirectory()
+	if rootDir == nil {
+		// The task has been destroyed. Nothing to show here.
+		return
+	}
+	defer rootDir.DecRef()
+
+	mnt := t.MountNamespace().FindMount(rootDir)
+	if mnt == nil {
+		// Has it just been unmounted?
+		return
+	}
+	ms := t.MountNamespace().AllMountsUnder(mnt)
+	sort.Slice(ms, func(i, j int) bool {
+		return ms[i].ID < ms[j].ID
+	})
+	for _, m := range ms {
+		mroot := m.Root()
+		if mroot == nil {
+			continue // No longer valid.
+		}
+		mountPath, desc := mroot.FullName(rootDir)
+		mroot.DecRef()
+		if !desc {
+			// MountSources that are not descendants of the chroot jail are ignored.
+			continue
+		}
+		fn(mountPath, m)
+	}
+}
+
+// mountInfoData is used to implement /proc/[pid]/mountinfo.
+//
+// +stateify savable
+type mountInfoData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*mountInfoData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	forEachMount(i.task, func(mountPath string, m *fs.Mount) {
+		mroot := m.Root()
+		if mroot == nil {
+			return // No longer valid.
+		}
+		defer mroot.DecRef()
+
+		// Format:
+		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
+
+		// (1) MountSource ID.
+		fmt.Fprintf(buf, "%d ", m.ID)
+
+		// (2)  Parent ID (or this ID if there is no parent).
+		pID := m.ID
+		if !m.IsRoot() && !m.IsUndo() {
+			pID = m.ParentID
+		}
+		fmt.Fprintf(buf, "%d ", pID)
+
+		// (3) Major:Minor device ID. We don't have a superblock, so we
+		// just use the root inode device number.
+		sa := mroot.Inode.StableAttr
+		fmt.Fprintf(buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor)
+
+		// (4) Root: the pathname of the directory in the filesystem
+		// which forms the root of this mount.
+		//
+		// NOTE(b/78135857): This will always be "/" until we implement
+		// bind mounts.
+		fmt.Fprintf(buf, "/ ")
+
+		// (5) Mount point (relative to process root).
+		fmt.Fprintf(buf, "%s ", mountPath)
+
+		// (6) Mount options.
+		flags := mroot.Inode.MountSource.Flags
+		opts := "rw"
+		if flags.ReadOnly {
+			opts = "ro"
+		}
+		if flags.NoAtime {
+			opts += ",noatime"
+		}
+		if flags.NoExec {
+			opts += ",noexec"
+		}
+		fmt.Fprintf(buf, "%s ", opts)
+
+		// (7) Optional fields: zero or more fields of the form "tag[:value]".
+		// (8) Separator: the end of the optional fields is marked by a single hyphen.
+		fmt.Fprintf(buf, "- ")
+
+		// (9) Filesystem type.
+		fmt.Fprintf(buf, "%s ", mroot.Inode.MountSource.FilesystemType)
+
+		// (10) Mount source: filesystem-specific information or "none".
+		fmt.Fprintf(buf, "none ")
+
+		// (11) Superblock options, and final newline.
+		fmt.Fprintf(buf, "%s\n", superBlockOpts(mountPath, mroot.Inode.MountSource))
+	})
+	return nil
+}
+
+func superBlockOpts(mountPath string, msrc *fs.MountSource) string {
+	// gVisor doesn't (yet) have a concept of super block options, so we
+	// use the ro/rw bit from the mount flag.
+	opts := "rw"
+	if msrc.Flags.ReadOnly {
+		opts = "ro"
+	}
+
+	// NOTE(b/147673608): If the mount is a cgroup, we also need to include
+	// the cgroup name in the options. For now we just read that from the
+	// path.
+	// TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we
+	// should get this value from the cgroup itself, and not rely on the
+	// path.
+	if msrc.FilesystemType == "cgroup" {
+		splitPath := strings.Split(mountPath, "/")
+		cgroupType := splitPath[len(splitPath)-1]
+		opts += "," + cgroupType
+	}
+	return opts
+}
+
+// mountsData is used to implement /proc/[pid]/mounts.
+//
+// +stateify savable
+type mountsData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*mountInfoData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	forEachMount(i.task, func(mountPath string, m *fs.Mount) {
+		// Format:
+		// <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
+		//
+		// We use the filesystem name as the first field, since there
+		// is no real block device we can point to, and we also should
+		// not expose anything about the remote filesystem.
+		//
+		// Only ro/rw option is supported for now.
+		//
+		// The "needs dump"and fsck flags are always 0, which is allowed.
+		root := m.Root()
+		if root == nil {
+			return // No longer valid.
+		}
+		defer root.DecRef()
+
+		flags := root.Inode.MountSource.Flags
+		opts := "rw"
+		if flags.ReadOnly {
+			opts = "ro"
+		}
+		fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", mountPath, root.Inode.MountSource.FilesystemType, opts, 0, 0)
+	})
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index d203cebd4..07115664c 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -46,6 +46,7 @@ type tasksInode struct {
 	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeAttrs
 	kernfs.OrderedChildren
+	kernfs.AlwaysValid
 
 	inoGen InoGenerator
 	pidns  *kernel.PIDNamespace
@@ -66,23 +67,23 @@ var _ kernfs.Inode = (*tasksInode)(nil)
 func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) {
 	root := auth.NewRootCredentials(pidns.UserNamespace())
 	contents := map[string]*kernfs.Dentry{
-		"cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))),
-		//"filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}),
-		"loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}),
-		"sys":     newSysDir(root, inoGen, k),
-		"meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}),
-		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"),
-		"net":     kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/net"),
-		"stat":    newDentry(root, inoGen.NextIno(), 0444, &statData{k: k}),
-		"uptime":  newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}),
-		"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{k: k}),
+		"cpuinfo":     newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))),
+		"filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}),
+		"loadavg":     newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}),
+		"sys":         newSysDir(root, inoGen, k),
+		"meminfo":     newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}),
+		"mounts":      kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"),
+		"net":         kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/net"),
+		"stat":        newDentry(root, inoGen.NextIno(), 0444, &statData{}),
+		"uptime":      newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}),
+		"version":     newDentry(root, inoGen.NextIno(), 0444, &versionData{}),
 	}
 
 	inode := &tasksInode{
 		pidns:             pidns,
 		inoGen:            inoGen,
-		selfSymlink:       newSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(),
-		threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(),
+		selfSymlink:       newSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(),
+		threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(),
 		cgroupControllers: cgroupControllers,
 	}
 	inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555)
@@ -121,11 +122,6 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 	return taskDentry.VFSDentry(), nil
 }
 
-// Valid implements kernfs.inodeDynamicLookup.
-func (i *tasksInode) Valid(ctx context.Context) bool {
-	return true
-}
-
 // IterDirents implements kernfs.inodeDynamicLookup.
 func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
 	// fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 434998910..b99badba8 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -40,9 +40,9 @@ type selfSymlink struct {
 
 var _ kernfs.Inode = (*selfSymlink)(nil)
 
-func newSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry {
+func newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
 	inode := &selfSymlink{pidns: pidns}
-	inode.Init(creds, ino, linux.ModeSymlink|perm)
+	inode.Init(creds, ino, linux.ModeSymlink|0777)
 
 	d := &kernfs.Dentry{}
 	d.Init(inode)
@@ -72,9 +72,9 @@ type threadSelfSymlink struct {
 
 var _ kernfs.Inode = (*threadSelfSymlink)(nil)
 
-func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry {
+func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
 	inode := &threadSelfSymlink{pidns: pidns}
-	inode.Init(creds, ino, linux.ModeSymlink|perm)
+	inode.Init(creds, ino, linux.ModeSymlink|0777)
 
 	d := &kernfs.Dentry{}
 	d.Init(inode)
@@ -138,21 +138,19 @@ func (c cpuStats) String() string {
 // +stateify savable
 type statData struct {
 	kernfs.DynamicBytesFile
-
-	// k is the owning Kernel.
-	k *kernel.Kernel
 }
 
 var _ dynamicInode = (*statData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
-func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+func (*statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	// TODO(b/37226836): We currently export only zero CPU stats. We could
 	// at least provide some aggregate stats.
 	var cpu cpuStats
 	fmt.Fprintf(buf, "cpu  %s\n", cpu)
 
-	for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ {
+	k := kernel.KernelFromContext(ctx)
+	for c, max := uint(0), k.ApplicationCores(); c < max; c++ {
 		fmt.Fprintf(buf, "cpu%d %s\n", c, cpu)
 	}
 
@@ -176,7 +174,7 @@ func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "ctxt 0\n")
 
 	// CLOCK_REALTIME timestamp from boot, in seconds.
-	fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds())
+	fmt.Fprintf(buf, "btime %d\n", k.Timekeeper().BootTime().Seconds())
 
 	// Total number of clones.
 	// TODO(b/37226836): Count this.
@@ -209,7 +207,7 @@ type loadavgData struct {
 var _ dynamicInode = (*loadavgData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
-func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+func (*loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	// TODO(b/62345059): Include real data in fields.
 	// Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods.
 	// Column 4-5: currently running processes and the total number of processes.
@@ -223,16 +221,14 @@ func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 // +stateify savable
 type meminfoData struct {
 	kernfs.DynamicBytesFile
-
-	// k is the owning Kernel.
-	k *kernel.Kernel
 }
 
 var _ dynamicInode = (*meminfoData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
-func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	mf := d.k.MemoryFile()
+func (*meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	k := kernel.KernelFromContext(ctx)
+	mf := k.MemoryFile()
 	mf.UpdateUsage()
 	snapshot, totalUsage := usage.MemoryAccounting.Copy()
 	totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage)
@@ -295,16 +291,14 @@ func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 // +stateify savable
 type versionData struct {
 	kernfs.DynamicBytesFile
-
-	// k is the owning Kernel.
-	k *kernel.Kernel
 }
 
 var _ dynamicInode = (*versionData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
-func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	init := v.k.GlobalInit()
+func (*versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	k := kernel.KernelFromContext(ctx)
+	init := k.GlobalInit()
 	if init == nil {
 		// Attempted to read before the init Task is created. This can
 		// only occur during startup, which should never need to read
@@ -335,3 +329,19 @@ func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version)
 	return nil
 }
+
+// filesystemsData backs /proc/filesystems.
+//
+// +stateify savable
+type filesystemsData struct {
+	kernfs.DynamicBytesFile
+}
+
+var _ dynamicInode = (*filesystemsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *filesystemsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	k := kernel.KernelFromContext(ctx)
+	k.VFS().GenerateProcFilesystems(buf)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 1bb9430c0..d0f97c137 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -47,6 +48,7 @@ var (
 var (
 	tasksStaticFiles = map[string]testutil.DirentType{
 		"cpuinfo":     linux.DT_REG,
+		"filesystems": linux.DT_REG,
 		"loadavg":     linux.DT_REG,
 		"meminfo":     linux.DT_REG,
 		"mounts":      linux.DT_LNK,
@@ -68,9 +70,14 @@ var (
 		"cmdline":       linux.DT_REG,
 		"comm":          linux.DT_REG,
 		"environ":       linux.DT_REG,
+		"exe":           linux.DT_LNK,
+		"fd":            linux.DT_DIR,
+		"fdinfo":        linux.DT_DIR,
 		"gid_map":       linux.DT_REG,
 		"io":            linux.DT_REG,
 		"maps":          linux.DT_REG,
+		"mountinfo":     linux.DT_REG,
+		"mounts":        linux.DT_REG,
 		"net":           linux.DT_DIR,
 		"ns":            linux.DT_DIR,
 		"oom_score":     linux.DT_REG,
@@ -96,17 +103,37 @@ func setup(t *testing.T) *testutil.System {
 	k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	fsOpts := vfs.GetFilesystemOptions{
-		InternalData: &InternalData{
-			Cgroups: map[string]string{
-				"cpuset": "/foo/cpuset",
-				"memory": "/foo/memory",
+
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("NewMountNamespace(): %v", err)
+	}
+	pop := &vfs.PathOperation{
+		Root:  mntns.Root(),
+		Start: mntns.Root(),
+		Path:  fspath.Parse("/proc"),
+	}
+	if err := k.VFS().MkdirAt(ctx, creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil {
+		t.Fatalf("MkDir(/proc): %v", err)
+	}
+
+	pop = &vfs.PathOperation{
+		Root:  mntns.Root(),
+		Start: mntns.Root(),
+		Path:  fspath.Parse("/proc"),
+	}
+	mntOpts := &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: &InternalData{
+				Cgroups: map[string]string{
+					"cpuset": "/foo/cpuset",
+					"memory": "/foo/memory",
+				},
 			},
 		},
 	}
-	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", Name, &fsOpts)
-	if err != nil {
-		t.Fatalf("NewMountNamespace(): %v", err)
+	if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil {
+		t.Fatalf("MountAt(/proc): %v", err)
 	}
 	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
 }
@@ -115,7 +142,7 @@ func TestTasksEmpty(t *testing.T) {
 	s := setup(t)
 	defer s.Destroy()
 
-	collector := s.ListDirents(s.PathOpAtRoot("/"))
+	collector := s.ListDirents(s.PathOpAtRoot("/proc"))
 	s.AssertAllDirentTypes(collector, tasksStaticFiles)
 	s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs)
 }
@@ -141,7 +168,7 @@ func TestTasks(t *testing.T) {
 		expectedDirents[fmt.Sprintf("%d", i+1)] = linux.DT_DIR
 	}
 
-	collector := s.ListDirents(s.PathOpAtRoot("/"))
+	collector := s.ListDirents(s.PathOpAtRoot("/proc"))
 	s.AssertAllDirentTypes(collector, expectedDirents)
 	s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs)
 
@@ -181,7 +208,7 @@ func TestTasks(t *testing.T) {
 	}
 
 	// Test lookup.
-	for _, path := range []string{"/1", "/2"} {
+	for _, path := range []string{"/proc/1", "/proc/2"} {
 		fd, err := s.VFS.OpenAt(
 			s.Ctx,
 			s.Creds,
@@ -191,6 +218,7 @@ func TestTasks(t *testing.T) {
 		if err != nil {
 			t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err)
 		}
+		defer fd.DecRef()
 		buf := make([]byte, 1)
 		bufIOSeq := usermem.BytesIOSequence(buf)
 		if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR {
@@ -201,10 +229,10 @@ func TestTasks(t *testing.T) {
 	if _, err := s.VFS.OpenAt(
 		s.Ctx,
 		s.Creds,
-		s.PathOpAtRoot("/9999"),
+		s.PathOpAtRoot("/proc/9999"),
 		&vfs.OpenOptions{},
 	); err != syserror.ENOENT {
-		t.Fatalf("wrong error from vfsfs.OpenAt(/9999): %v", err)
+		t.Fatalf("wrong error from vfsfs.OpenAt(/proc/9999): %v", err)
 	}
 }
 
@@ -302,12 +330,13 @@ func TestTasksOffset(t *testing.T) {
 			fd, err := s.VFS.OpenAt(
 				s.Ctx,
 				s.Creds,
-				s.PathOpAtRoot("/"),
+				s.PathOpAtRoot("/proc"),
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
 				t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
 			}
+			defer fd.DecRef()
 			if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil {
 				t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err)
 			}
@@ -344,7 +373,7 @@ func TestTask(t *testing.T) {
 		t.Fatalf("CreateTask(): %v", err)
 	}
 
-	collector := s.ListDirents(s.PathOpAtRoot("/1"))
+	collector := s.ListDirents(s.PathOpAtRoot("/proc/1"))
 	s.AssertAllDirentTypes(collector, taskStaticFiles)
 }
 
@@ -362,14 +391,14 @@ func TestProcSelf(t *testing.T) {
 	collector := s.WithTemporaryContext(task).ListDirents(&vfs.PathOperation{
 		Root:               s.Root,
 		Start:              s.Root,
-		Path:               fspath.Parse("/self/"),
+		Path:               fspath.Parse("/proc/self/"),
 		FollowFinalSymlink: true,
 	})
 	s.AssertAllDirentTypes(collector, taskStaticFiles)
 }
 
 func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.FileDescription) {
-	t.Logf("Iterating: /proc%s", fd.MappedName(ctx))
+	t.Logf("Iterating: %s", fd.MappedName(ctx))
 
 	var collector testutil.DirentCollector
 	if err := fd.IterDirents(ctx, &collector); err != nil {
@@ -412,6 +441,7 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F
 			t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err)
 			continue
 		}
+		defer child.DecRef()
 		stat, err := child.Stat(ctx, vfs.StatOptions{})
 		if err != nil {
 			t.Errorf("Stat(%v) failed: %v", childPath, err)
@@ -432,6 +462,22 @@ func TestTree(t *testing.T) {
 	defer s.Destroy()
 
 	k := kernel.KernelFromContext(s.Ctx)
+
+	pop := &vfs.PathOperation{
+		Root:  s.Root,
+		Start: s.Root,
+		Path:  fspath.Parse("test-file"),
+	}
+	opts := &vfs.OpenOptions{
+		Flags: linux.O_RDONLY | linux.O_CREAT,
+		Mode:  0777,
+	}
+	file, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, opts)
+	if err != nil {
+		t.Fatalf("failed to create test file: %v", err)
+	}
+	defer file.DecRef()
+
 	var tasks []*kernel.Task
 	for i := 0; i < 5; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
@@ -439,6 +485,8 @@ func TestTree(t *testing.T) {
 		if err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
+		// Add file to populate /proc/[pid]/fd and fdinfo directories.
+		task.FDTable().NewFDVFS2(task, 0, file, kernel.FDFlags{})
 		tasks = append(tasks, task)
 	}
 
@@ -446,11 +494,12 @@ func TestTree(t *testing.T) {
 	fd, err := s.VFS.OpenAt(
 		ctx,
 		auth.CredentialsFromContext(s.Ctx),
-		&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/")},
+		&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/proc")},
 		&vfs.OpenOptions{},
 	)
 	if err != nil {
-		t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
+		t.Fatalf("vfsfs.OpenAt(/proc) failed: %v", err)
 	}
 	iterateDir(ctx, t, s, fd)
+	fd.DecRef()
 }
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
index e4f36f4ae..0e4053a46 100644
--- a/pkg/sentry/fsimpl/testutil/BUILD
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -16,12 +16,14 @@ go_library(
         "//pkg/cpuid",
         "//pkg/fspath",
         "//pkg/memutil",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/sched",
         "//pkg/sentry/limits",
         "//pkg/sentry/loader",
+        "//pkg/sentry/mm",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/kvm",
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index 488478e29..c16a36cdb 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -23,13 +23,16 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/memutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/loader"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/time"
@@ -123,10 +126,17 @@ func Boot() (*kernel.Kernel, error) {
 // CreateTask creates a new bare bones task for tests.
 func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) {
 	k := kernel.KernelFromContext(ctx)
+	exe, err := newFakeExecutable(ctx, k.VFS(), auth.CredentialsFromContext(ctx), root)
+	if err != nil {
+		return nil, err
+	}
+	m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
+	m.SetExecutable(fsbridge.NewVFSFile(exe))
+
 	config := &kernel.TaskConfig{
 		Kernel:                  k,
 		ThreadGroup:             tc,
-		TaskContext:             &kernel.TaskContext{Name: name},
+		TaskContext:             &kernel.TaskContext{Name: name, MemoryManager: m},
 		Credentials:             auth.CredentialsFromContext(ctx),
 		NetworkNamespace:        k.RootNetworkNamespace(),
 		AllowedCPUMask:          sched.NewFullCPUSet(k.ApplicationCores()),
@@ -135,10 +145,25 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns
 		AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
 		MountNamespaceVFS2:      mntns,
 		FSContext:               kernel.NewFSContextVFS2(root, cwd, 0022),
+		FDTable:                 k.NewFDTable(),
 	}
 	return k.TaskSet().NewTask(config)
 }
 
+func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) {
+	const name = "executable"
+	pop := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(name),
+	}
+	opts := &vfs.OpenOptions{
+		Flags: linux.O_RDONLY | linux.O_CREAT,
+		Mode:  0777,
+	}
+	return vfsObj.OpenAt(ctx, creds, pop, opts)
+}
+
 func createMemoryFile() (*pgalloc.MemoryFile, error) {
 	const memfileName = "test-memory"
 	memfd, err := memutil.CreateMemFD(memfileName, 0)
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 00f914564..7de2e509e 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -191,7 +191,7 @@ func (f *FDTable) Size() int {
 	return int(size)
 }
 
-// forEach iterates over all non-nil files.
+// forEach iterates over all non-nil files in sorted order.
 //
 // It is the caller's responsibility to acquire an appropriate lock.
 func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
@@ -458,7 +458,10 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) {
 	}
 }
 
-// GetFDs returns a list of valid fds.
+// GetFDs returns a sorted list of valid fds.
+//
+// Precondition: The caller must be running on the task goroutine, or Task.mu
+// must be locked.
 func (f *FDTable) GetFDs() []int32 {
 	fds := make([]int32, 0, int(atomic.LoadInt32(&f.used)))
 	f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
-- 
cgit v1.2.3


From 5e413cad10d2358a21dd08216953faee70e62a0b Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Sat, 14 Mar 2020 07:13:15 -0700
Subject: Plumb VFS2 imported fds into virtual filesystem.

- When setting up the virtual filesystem, mount a host.filesystem to contain
  all files that need to be imported.
- Make read/preadv syscalls to the host in cases where preadv2 may not be
  supported yet (likewise for writing).
- Make save/restore functions in kernel/kernel.go return early if vfs2 is
  enabled.

PiperOrigin-RevId: 300922353
---
 pkg/abi/linux/file.go                  |   3 +
 pkg/sentry/fs/host/control.go          |   2 +
 pkg/sentry/fsimpl/host/BUILD           |   2 +
 pkg/sentry/fsimpl/host/default_file.go |  45 +++++++-----
 pkg/sentry/fsimpl/host/host.go         | 124 ++++++++++++++++++++++++++++++---
 pkg/sentry/fsimpl/host/util.go         |  28 ++------
 pkg/sentry/kernel/kernel.go            |  40 +++++++----
 pkg/sentry/syscalls/linux/sys_stat.go  |   5 +-
 pkg/sentry/syscalls/linux/vfs2/stat.go |   6 +-
 runsc/boot/filter/config.go            |   1 +
 test/syscalls/linux/stat.cc            |  60 ++++++++++++++--
 11 files changed, 246 insertions(+), 70 deletions(-)

diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index e229ac21c..dbe58acbe 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -266,6 +266,9 @@ type Statx struct {
 	DevMinor       uint32
 }
 
+// SizeOfStatx is the size of a Statx struct.
+var SizeOfStatx = binary.Size(Statx{})
+
 // FileMode represents a mode_t.
 type FileMode uint16
 
diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go
index 1658979fc..cd84e1337 100644
--- a/pkg/sentry/fs/host/control.go
+++ b/pkg/sentry/fs/host/control.go
@@ -32,6 +32,8 @@ func newSCMRights(fds []int) control.SCMRights {
 }
 
 // Files implements control.SCMRights.Files.
+//
+// TODO(gvisor.dev/issue/2017): Port to VFS2.
 func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFiles, bool) {
 	n := max
 	var trunc bool
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index 731f192b3..5d67f88e3 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -9,9 +9,11 @@ go_library(
         "host.go",
         "util.go",
     ],
+    visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/fd",
         "//pkg/log",
         "//pkg/refs",
         "//pkg/safemem",
diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go
index 172cdb161..98682ba5e 100644
--- a/pkg/sentry/fsimpl/host/default_file.go
+++ b/pkg/sentry/fsimpl/host/default_file.go
@@ -21,6 +21,7 @@ import (
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -64,9 +65,7 @@ func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts v
 			panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
 		}
 
-		f.mu.Lock()
 		n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags))
-		f.mu.Unlock()
 		if isBlockError(err) {
 			// If we got any data at all, return it as a "completed" partial read
 			// rather than retrying until complete.
@@ -86,16 +85,22 @@ func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts v
 	return n, err
 }
 
-func readFromHostFD(ctx context.Context, fd int, dst usermem.IOSequence, offset int64, flags int) (int64, error) {
-	if flags&^(linux.RWF_VALID) != 0 {
+func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags int) (int64, error) {
+	// TODO(gvisor.dev/issue/1672): Support select preadv2 flags.
+	if flags != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
 
-	reader := safemem.FromVecReaderFunc{
-		func(srcs [][]byte) (int64, error) {
-			n, err := unix.Preadv2(fd, srcs, offset, flags)
-			return int64(n), err
-		},
+	var reader safemem.Reader
+	if offset == -1 {
+		reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)}
+	} else {
+		reader = safemem.FromVecReaderFunc{
+			func(srcs [][]byte) (int64, error) {
+				n, err := unix.Preadv(hostFD, srcs, offset)
+				return int64(n), err
+			},
+		}
 	}
 	n, err := dst.CopyOutFrom(ctx, reader)
 	return int64(n), err
@@ -120,9 +125,7 @@ func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts
 			panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
 		}
 
-		f.mu.Lock()
 		n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags))
-		f.mu.Unlock()
 		if isBlockError(err) {
 			err = syserror.ErrWouldBlock
 		}
@@ -137,16 +140,22 @@ func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts
 	return n, err
 }
 
-func writeToHostFD(ctx context.Context, fd int, src usermem.IOSequence, offset int64, flags int) (int64, error) {
-	if flags&^(linux.RWF_VALID) != 0 {
+func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags int) (int64, error) {
+	// TODO(gvisor.dev/issue/1672): Support select pwritev2 flags.
+	if flags != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
 
-	writer := safemem.FromVecWriterFunc{
-		func(srcs [][]byte) (int64, error) {
-			n, err := unix.Pwritev2(fd, srcs, offset, flags)
-			return int64(n), err
-		},
+	var writer safemem.Writer
+	if offset == -1 {
+		writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)}
+	} else {
+		writer = safemem.FromVecWriterFunc{
+			func(srcs [][]byte) (int64, error) {
+				n, err := unix.Pwritev(hostFD, srcs, offset)
+				return int64(n), err
+			},
+		}
 	}
 	n, err := src.CopyInTo(ctx, writer)
 	return int64(n), err
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index c205e6a0b..0be812d13 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -38,10 +38,19 @@ type filesystem struct {
 	kernfs.Filesystem
 }
 
+// NewMount returns a new disconnected mount in vfsObj that may be passed to ImportFD.
+func NewMount(vfsObj *vfs.VirtualFilesystem) (*vfs.Mount, error) {
+	fs := &filesystem{}
+	fs.Init(vfsObj)
+	vfsfs := fs.VFSFilesystem()
+	// NewDisconnectedMount will take an additional reference on vfsfs.
+	defer vfsfs.DecRef()
+	return vfsObj.NewDisconnectedMount(vfsfs, nil, &vfs.MountOptions{})
+}
+
 // ImportFD sets up and returns a vfs.FileDescription from a donated fd.
 func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID, isTTY bool) (*vfs.FileDescription, error) {
-	// Must be importing to a mount of host.filesystem.
-	fs, ok := mnt.Filesystem().Impl().(*filesystem)
+	fs, ok := mnt.Filesystem().Impl().(*kernfs.Filesystem)
 	if !ok {
 		return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl())
 	}
@@ -54,8 +63,7 @@ func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID
 
 	fileMode := linux.FileMode(s.Mode)
 	fileType := fileMode.FileType()
-	// Pipes, character devices, and sockets can return EWOULDBLOCK for
-	// operations that would block.
+	// Pipes, character devices, and sockets.
 	isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
 
 	i := &inode{
@@ -143,11 +151,109 @@ func (i *inode) Mode() linux.FileMode {
 
 // Stat implements kernfs.Inode.
 func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	if opts.Mask&linux.STATX__RESERVED != 0 {
+		return linux.Statx{}, syserror.EINVAL
+	}
+	if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE {
+		return linux.Statx{}, syserror.EINVAL
+	}
+
+	// Limit our host call only to known flags.
+	mask := opts.Mask & linux.STATX_ALL
 	var s unix.Statx_t
-	if err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(opts.Mask), &s); err != nil {
+	err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s)
+	// Fallback to fstat(2), if statx(2) is not supported on the host.
+	//
+	// TODO(b/151263641): Remove fallback.
+	if err == syserror.ENOSYS {
+		return i.fstat(opts)
+	} else if err != nil {
+		return linux.Statx{}, err
+	}
+
+	ls := linux.Statx{Mask: mask}
+	// Unconditionally fill blksize, attributes, and device numbers, as indicated
+	// by /include/uapi/linux/stat.h.
+	//
+	// RdevMajor/RdevMinor are left as zero, so as not to expose host device
+	// numbers.
+	//
+	// TODO(gvisor.dev/issue/1672): Use kernfs-specific, internally defined
+	// device numbers. If we use the device number from the host, it may collide
+	// with another sentry-internal device number. We handle device/inode
+	// numbers without relying on the host to prevent collisions.
+	ls.Blksize = s.Blksize
+	ls.Attributes = s.Attributes
+	ls.AttributesMask = s.Attributes_mask
+
+	if mask|linux.STATX_TYPE != 0 {
+		ls.Mode |= s.Mode & linux.S_IFMT
+	}
+	if mask|linux.STATX_MODE != 0 {
+		ls.Mode |= s.Mode &^ linux.S_IFMT
+	}
+	if mask|linux.STATX_NLINK != 0 {
+		ls.Nlink = s.Nlink
+	}
+	if mask|linux.STATX_ATIME != 0 {
+		ls.Atime = unixToLinuxStatxTimestamp(s.Atime)
+	}
+	if mask|linux.STATX_BTIME != 0 {
+		ls.Btime = unixToLinuxStatxTimestamp(s.Btime)
+	}
+	if mask|linux.STATX_CTIME != 0 {
+		ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime)
+	}
+	if mask|linux.STATX_MTIME != 0 {
+		ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime)
+	}
+	if mask|linux.STATX_SIZE != 0 {
+		ls.Size = s.Size
+	}
+	if mask|linux.STATX_BLOCKS != 0 {
+		ls.Blocks = s.Blocks
+	}
+
+	// Use our own internal inode number and file owner.
+	if mask|linux.STATX_INO != 0 {
+		ls.Ino = i.ino
+	}
+	if mask|linux.STATX_UID != 0 {
+		ls.UID = uint32(i.uid)
+	}
+	if mask|linux.STATX_GID != 0 {
+		ls.GID = uint32(i.gid)
+	}
+
+	return ls, nil
+}
+
+// fstat is a best-effort fallback for inode.Stat() if the host does not
+// support statx(2).
+//
+// We ignore the mask and sync flags in opts and simply supply
+// STATX_BASIC_STATS, as fstat(2) itself does not allow the specification
+// of a mask or sync flags. fstat(2) does not provide any metadata
+// equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so
+// those fields remain empty.
+func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) {
+	var s unix.Stat_t
+	if err := unix.Fstat(i.hostFD, &s); err != nil {
 		return linux.Statx{}, err
 	}
-	ls := unixToLinuxStatx(s)
+
+	// Note that rdev numbers are left as 0; do not expose host device numbers.
+	ls := linux.Statx{
+		Mask:    linux.STATX_BASIC_STATS,
+		Blksize: uint32(s.Blksize),
+		Nlink:   uint32(s.Nlink),
+		Mode:    uint16(s.Mode),
+		Size:    uint64(s.Size),
+		Blocks:  uint64(s.Blocks),
+		Atime:   timespecToStatxTimestamp(s.Atim),
+		Ctime:   timespecToStatxTimestamp(s.Ctim),
+		Mtime:   timespecToStatxTimestamp(s.Mtim),
+	}
 
 	// Use our own internal inode number and file owner.
 	//
@@ -159,9 +265,6 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro
 	ls.UID = uint32(i.uid)
 	ls.GID = uint32(i.gid)
 
-	// Update file mode from the host.
-	i.mode = linux.FileMode(ls.Mode)
-
 	return ls, nil
 }
 
@@ -217,7 +320,6 @@ func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptio
 }
 
 func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) {
-
 	fileType := i.mode.FileType()
 	if fileType == syscall.S_IFSOCK {
 		if i.isTTY {
@@ -227,6 +329,8 @@ func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error
 		return nil, errors.New("importing host sockets not supported")
 	}
 
+	// TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that
+	// we don't allow importing arbitrary file types without proper support.
 	if i.isTTY {
 		// TODO(gvisor.dev/issue/1672): support importing host fd as TTY.
 		return nil, errors.New("importing host fd as TTY not supported")
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
index e1ccacb4d..d519feef5 100644
--- a/pkg/sentry/fsimpl/host/util.go
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -35,34 +35,14 @@ func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec {
 	}
 }
 
-func unixToLinuxStatx(s unix.Statx_t) linux.Statx {
-	return linux.Statx{
-		Mask:           s.Mask,
-		Blksize:        s.Blksize,
-		Attributes:     s.Attributes,
-		Nlink:          s.Nlink,
-		UID:            s.Uid,
-		GID:            s.Gid,
-		Mode:           s.Mode,
-		Ino:            s.Ino,
-		Size:           s.Size,
-		Blocks:         s.Blocks,
-		AttributesMask: s.Attributes_mask,
-		Atime:          unixToLinuxStatxTimestamp(s.Atime),
-		Btime:          unixToLinuxStatxTimestamp(s.Btime),
-		Ctime:          unixToLinuxStatxTimestamp(s.Ctime),
-		Mtime:          unixToLinuxStatxTimestamp(s.Mtime),
-		RdevMajor:      s.Rdev_major,
-		RdevMinor:      s.Rdev_minor,
-		DevMajor:       s.Dev_major,
-		DevMinor:       s.Dev_minor,
-	}
-}
-
 func unixToLinuxStatxTimestamp(ts unix.StatxTimestamp) linux.StatxTimestamp {
 	return linux.StatxTimestamp{Sec: ts.Sec, Nsec: ts.Nsec}
 }
 
+func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp {
+	return linux.StatxTimestamp{Sec: int64(ts.Sec), Nsec: uint32(ts.Nsec)}
+}
+
 // wouldBlock returns true for file types that can return EWOULDBLOCK
 // for blocking operations, e.g. pipes, character devices, and sockets.
 func wouldBlock(fileType uint32) bool {
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 1d627564f..6feda8fa1 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -467,6 +467,11 @@ func (k *Kernel) flushMountSourceRefs() error {
 //
 // Precondition: Must be called with the kernel paused.
 func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) {
+	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+	if VFS2Enabled {
+		return nil
+	}
+
 	ts.mu.RLock()
 	defer ts.mu.RUnlock()
 	for t := range ts.Root.tids {
@@ -484,7 +489,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error)
 }
 
 func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
-	// TODO(gvisor.dev/issues/1663): Add save support for VFS2.
+	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
 	return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
 		if flags := file.Flags(); !flags.Write {
 			return nil
@@ -533,6 +538,11 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
 }
 
 func (ts *TaskSet) unregisterEpollWaiters() {
+	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+	if VFS2Enabled {
+		return
+	}
+
 	ts.mu.RLock()
 	defer ts.mu.RUnlock()
 	for t := range ts.Root.tids {
@@ -1005,11 +1015,14 @@ func (k *Kernel) pauseTimeLocked() {
 		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
 		// but ktime.Timer.Pause is idempotent so this is harmless.
 		if t.fdTable != nil {
-			t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
-				if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
-					tfd.PauseTimer()
-				}
-			})
+			// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+			if !VFS2Enabled {
+				t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+					if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
+						tfd.PauseTimer()
+					}
+				})
+			}
 		}
 	}
 	k.timekeeper.PauseUpdates()
@@ -1034,12 +1047,15 @@ func (k *Kernel) resumeTimeLocked() {
 				it.ResumeTimer()
 			}
 		}
-		if t.fdTable != nil {
-			t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
-				if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
-					tfd.ResumeTimer()
-				}
-			})
+		// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+		if !VFS2Enabled {
+			if t.fdTable != nil {
+				t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+					if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
+						tfd.ResumeTimer()
+					}
+				})
+			}
 		}
 	}
 }
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 9bd2df104..a11a87cd1 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -136,7 +136,10 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	mask := args[3].Uint()
 	statxAddr := args[4].Pointer()
 
-	if mask&linux.STATX__RESERVED > 0 {
+	if mask&linux.STATX__RESERVED != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if flags&^(linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH|linux.AT_STATX_SYNC_TYPE) != 0 {
 		return 0, nil, syserror.EINVAL
 	}
 	if flags&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE {
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
index a74ea6fd5..97eaedd66 100644
--- a/pkg/sentry/syscalls/linux/vfs2/stat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -150,7 +150,11 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	mask := args[3].Uint()
 	statxAddr := args[4].Pointer()
 
-	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if mask&linux.STATX__RESERVED != 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index a4627905e..f459d1973 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -284,6 +284,7 @@ var allowedSyscalls = seccomp.SyscallRules{
 		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
 	},
 	syscall.SYS_SIGALTSTACK:     {},
+	unix.SYS_STATX:              {},
 	syscall.SYS_SYNC_FILE_RANGE: {},
 	syscall.SYS_TGKILL: []seccomp.Rule{
 		{
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index c951ac3b3..513b9cd1c 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -607,7 +607,7 @@ int statx(int dirfd, const char* pathname, int flags, unsigned int mask,
 }
 
 TEST_F(StatTest, StatxAbsPath) {
-  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 &&
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
           errno == ENOSYS);
 
   struct kernel_statx stx;
@@ -617,7 +617,7 @@ TEST_F(StatTest, StatxAbsPath) {
 }
 
 TEST_F(StatTest, StatxRelPathDirFD) {
-  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 &&
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
           errno == ENOSYS);
 
   struct kernel_statx stx;
@@ -631,7 +631,7 @@ TEST_F(StatTest, StatxRelPathDirFD) {
 }
 
 TEST_F(StatTest, StatxRelPathCwd) {
-  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 &&
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
           errno == ENOSYS);
 
   ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds());
@@ -643,7 +643,7 @@ TEST_F(StatTest, StatxRelPathCwd) {
 }
 
 TEST_F(StatTest, StatxEmptyPath) {
-  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 &&
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
           errno == ENOSYS);
 
   const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY));
@@ -653,6 +653,58 @@ TEST_F(StatTest, StatxEmptyPath) {
   EXPECT_TRUE(S_ISREG(stx.stx_mode));
 }
 
+TEST_F(StatTest, StatxDoesNotRejectExtraneousMaskBits) {
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+          errno == ENOSYS);
+
+  struct kernel_statx stx;
+  // Set all mask bits except for STATX__RESERVED.
+  uint mask = 0xffffffff & ~0x80000000;
+  EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, mask, &stx),
+              SyscallSucceeds());
+  EXPECT_TRUE(S_ISREG(stx.stx_mode));
+}
+
+TEST_F(StatTest, StatxRejectsReservedMaskBit) {
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+          errno == ENOSYS);
+
+  struct kernel_statx stx;
+  // Set STATX__RESERVED in the mask.
+  EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, 0x80000000, &stx),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(StatTest, StatxSymlink) {
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+          errno == ENOSYS);
+
+  std::string parent_dir = "/tmp";
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(parent_dir, test_file_name_));
+  std::string p = link.path();
+
+  struct kernel_statx stx;
+  EXPECT_THAT(statx(AT_FDCWD, p.c_str(), AT_SYMLINK_NOFOLLOW, STATX_ALL, &stx),
+              SyscallSucceeds());
+  EXPECT_TRUE(S_ISLNK(stx.stx_mode));
+  EXPECT_THAT(statx(AT_FDCWD, p.c_str(), 0, STATX_ALL, &stx),
+              SyscallSucceeds());
+  EXPECT_TRUE(S_ISREG(stx.stx_mode));
+}
+
+TEST_F(StatTest, StatxInvalidFlags) {
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+          errno == ENOSYS);
+
+  struct kernel_statx stx;
+  EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), 12345, 0, &stx),
+              SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(),
+                    0x6000 /* AT_STATX_SYNC_TYPE */, 0, &stx),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From 97127750289b49dd5e29f8ddb4209137e47fe52d Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Sat, 14 Mar 2020 13:46:55 -0700
Subject: Disallow kernfs.Inode.SetStat for readonly inodes

Updates #1195, #1193

PiperOrigin-RevId: 300950993
---
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go |  7 +++---
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go    |  9 +++++--
 pkg/sentry/fsimpl/kernfs/symlink.go            |  7 ++++++
 pkg/sentry/fsimpl/proc/subtasks.go             |  5 ++++
 pkg/sentry/fsimpl/proc/task.go                 | 10 +++-----
 pkg/sentry/fsimpl/proc/tasks.go                | 16 ++++++++++++-
 pkg/sentry/fsimpl/proc/tasks_files.go          | 33 ++++++++++++++++++++++----
 pkg/sentry/fsimpl/sys/sys.go                   |  8 ++++---
 8 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 1c026f4d8..0d27a8867 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -61,9 +61,10 @@ func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vf
 	return &fd.vfsfd, nil
 }
 
-// SetStat implements Inode.SetStat.
-func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
-	// DynamicBytesFiles are immutable.
+// SetStat implements Inode.SetStat. By default DynamicBytesFile doesn't allow
+// inode attributes to be changed. Override SetStat() making it call
+// f.InodeAttrs to allow it.
+func (*DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 94ca3dbdd..4ed41326d 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -228,7 +228,7 @@ func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error)
 	stat.GID = atomic.LoadUint32(&a.gid)
 	stat.Nlink = atomic.LoadUint32(&a.nlink)
 
-	// TODO: Implement other stat fields like timestamps.
+	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
 
 	return stat, nil
 }
@@ -256,7 +256,7 @@ func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
 	// Note that not all fields are modifiable. For example, the file type and
 	// inode numbers are immutable after node creation.
 
-	// TODO: Implement other stat fields like timestamps.
+	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
 
 	return nil
 }
@@ -555,6 +555,11 @@ func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs
 	return fd.VFSFileDescription(), nil
 }
 
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*StaticDirectory) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
 // AlwaysValid partially implements kernfs.inodeDynamicLookup.
 type AlwaysValid struct{}
 
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 0ee7eb9b7..41c5a3099 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -18,6 +18,8 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // StaticSymlink provides an Inode implementation for symlinks that point to
@@ -52,3 +54,8 @@ func (s *StaticSymlink) Init(creds *auth.Credentials, ino uint64, target string)
 func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
 	return s.target, nil
 }
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*StaticSymlink) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index a3a7c16a5..ea6d60f6e 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -127,3 +127,8 @@ func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.
 	}
 	return stat, nil
 }
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*subtasksInode) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 4891caab6..fae3fc5aa 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -107,13 +107,9 @@ func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenO
 	return fd.VFSFileDescription(), nil
 }
 
-// SetStat implements kernfs.Inode.
-func (i *taskInode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
-	stat := opts.Stat
-	if stat.Mask&linux.STATX_MODE != 0 {
-		return syserror.EPERM
-	}
-	return nil
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*taskInode) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+	return syserror.EPERM
 }
 
 // taskOwnedInode implements kernfs.Inode and overrides inode owner with task
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 07115664c..9f2ef8200 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -67,7 +67,7 @@ var _ kernfs.Inode = (*tasksInode)(nil)
 func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) {
 	root := auth.NewRootCredentials(pidns.UserNamespace())
 	contents := map[string]*kernfs.Dentry{
-		"cpuinfo":     newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))),
+		"cpuinfo":     newDentry(root, inoGen.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))),
 		"filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}),
 		"loadavg":     newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}),
 		"sys":         newSysDir(root, inoGen, k),
@@ -225,6 +225,20 @@ func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Sta
 	return stat, nil
 }
 
+// staticFileSetStat implements a special static file that allows inode
+// attributes to be set. This is to support /proc files that are readonly, but
+// allow attributes to be set.
+type staticFileSetStat struct {
+	dynamicBytesFileSetAttr
+	vfs.StaticData
+}
+
+var _ dynamicInode = (*staticFileSetStat)(nil)
+
+func newStaticFileSetStat(data string) *staticFileSetStat {
+	return &staticFileSetStat{StaticData: vfs.StaticData{Data: data}}
+}
+
 func cpuInfoData(k *kernel.Kernel) string {
 	features := k.FeatureSet()
 	if features == nil {
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index b99badba8..20085bb39 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -62,6 +63,11 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
 	return strconv.FormatUint(uint64(tgid), 10), nil
 }
 
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*selfSymlink) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
 type threadSelfSymlink struct {
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
@@ -95,6 +101,23 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
 	return fmt.Sprintf("%d/task/%d", tgid, tid), nil
 }
 
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*threadSelfSymlink) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+// dynamicBytesFileSetAttr implements a special file that allows inode
+// attributes to be set. This is to support /proc files that are readonly, but
+// allow attributes to be set.
+type dynamicBytesFileSetAttr struct {
+	kernfs.DynamicBytesFile
+}
+
+// SetStat implements Inode.SetStat.
+func (d *dynamicBytesFileSetAttr) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error {
+	return d.DynamicBytesFile.InodeAttrs.SetStat(fs, opts)
+}
+
 // cpuStats contains the breakdown of CPU time for /proc/stat.
 type cpuStats struct {
 	// user is time spent in userspace tasks with non-positive niceness.
@@ -137,7 +160,7 @@ func (c cpuStats) String() string {
 //
 // +stateify savable
 type statData struct {
-	kernfs.DynamicBytesFile
+	dynamicBytesFileSetAttr
 }
 
 var _ dynamicInode = (*statData)(nil)
@@ -201,7 +224,7 @@ func (*statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 //
 // +stateify savable
 type loadavgData struct {
-	kernfs.DynamicBytesFile
+	dynamicBytesFileSetAttr
 }
 
 var _ dynamicInode = (*loadavgData)(nil)
@@ -220,7 +243,7 @@ func (*loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 //
 // +stateify savable
 type meminfoData struct {
-	kernfs.DynamicBytesFile
+	dynamicBytesFileSetAttr
 }
 
 var _ dynamicInode = (*meminfoData)(nil)
@@ -271,7 +294,7 @@ func (*meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 //
 // +stateify savable
 type uptimeData struct {
-	kernfs.DynamicBytesFile
+	dynamicBytesFileSetAttr
 }
 
 var _ dynamicInode = (*uptimeData)(nil)
@@ -290,7 +313,7 @@ func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 //
 // +stateify savable
 type versionData struct {
-	kernfs.DynamicBytesFile
+	dynamicBytesFileSetAttr
 }
 
 var _ dynamicInode = (*versionData)(nil)
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index c36c4fa11..3928ff2c8 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -94,15 +94,17 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 	return &d.dentry
 }
 
-// SetStat implements kernfs.Inode.SetStat.
-func (d *dir) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error {
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*dir) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
 // Open implements kernfs.Inode.Open.
 func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
-- 
cgit v1.2.3


From 52758e16e0d1e67b3cdd56e04abfce663607ac42 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Mon, 16 Mar 2020 08:02:05 -0700
Subject: Prevent vnetHdr from escaping in WritePacket.

PiperOrigin-RevId: 301157950
---
 pkg/tcpip/link/fdbased/endpoint.go        | 2 +-
 pkg/tcpip/link/fdbased/endpoint_unsafe.go | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index a753fb243..3b36b9673 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -407,7 +407,6 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 
 	if e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
 		vnetHdr := virtioNetHdr{}
-		vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr)
 		if gso != nil {
 			vnetHdr.hdrLen = uint16(pkt.Header.UsedLength())
 			if gso.NeedsCsum {
@@ -428,6 +427,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 			}
 		}
 
+		vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr)
 		return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, pkt.Header.View(), pkt.Data.ToView())
 	}
 
diff --git a/pkg/tcpip/link/fdbased/endpoint_unsafe.go b/pkg/tcpip/link/fdbased/endpoint_unsafe.go
index 97a477b61..d81858353 100644
--- a/pkg/tcpip/link/fdbased/endpoint_unsafe.go
+++ b/pkg/tcpip/link/fdbased/endpoint_unsafe.go
@@ -24,9 +24,10 @@ import (
 const virtioNetHdrSize = int(unsafe.Sizeof(virtioNetHdr{}))
 
 func vnetHdrToByteSlice(hdr *virtioNetHdr) (slice []byte) {
-	sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
-	sh.Data = uintptr(unsafe.Pointer(hdr))
-	sh.Len = virtioNetHdrSize
-	sh.Cap = virtioNetHdrSize
+	*(*reflect.SliceHeader)(unsafe.Pointer(&slice)) = reflect.SliceHeader{
+		Data: uintptr((unsafe.Pointer(hdr))),
+		Len:  virtioNetHdrSize,
+		Cap:  virtioNetHdrSize,
+	}
 	return
 }
-- 
cgit v1.2.3


From 69da42885aff9371fd53227583a546df914de02b Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Mon, 16 Mar 2020 12:02:33 -0700
Subject: Enable ARP resolution in TAP devices.

PiperOrigin-RevId: 301208471
---
 pkg/tcpip/link/tun/device.go  |  10 +++-
 test/syscalls/linux/tuntap.cc | 105 +++++++++++++++++++++++++++++++-----------
 2 files changed, 86 insertions(+), 29 deletions(-)

diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
index 6ff47a742..f6e301304 100644
--- a/pkg/tcpip/link/tun/device.go
+++ b/pkg/tcpip/link/tun/device.go
@@ -98,7 +98,12 @@ func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
 		prefix = "tap"
 	}
 
-	endpoint, err := attachOrCreateNIC(s, name, prefix)
+	linkCaps := stack.CapabilityNone
+	if isTap {
+		linkCaps |= stack.CapabilityResolutionRequired
+	}
+
+	endpoint, err := attachOrCreateNIC(s, name, prefix, linkCaps)
 	if err != nil {
 		return syserror.EINVAL
 	}
@@ -109,7 +114,7 @@ func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
 	return nil
 }
 
-func attachOrCreateNIC(s *stack.Stack, name, prefix string) (*tunEndpoint, error) {
+func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkEndpointCapabilities) (*tunEndpoint, error) {
 	for {
 		// 1. Try to attach to an existing NIC.
 		if name != "" {
@@ -135,6 +140,7 @@ func attachOrCreateNIC(s *stack.Stack, name, prefix string) (*tunEndpoint, error
 			nicID:    id,
 			name:     name,
 		}
+		endpoint.Endpoint.LinkEPCapabilities = linkCaps
 		if endpoint.name == "" {
 			endpoint.name = fmt.Sprintf("%s%d", prefix, id)
 		}
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
index f734511d6..53ad2dda3 100644
--- a/test/syscalls/linux/tuntap.cc
+++ b/test/syscalls/linux/tuntap.cc
@@ -256,50 +256,59 @@ TEST_F(TuntapTest, WriteToDownDevice) {
   EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EIO));
 }
 
-// This test sets up a TAP device and pings kernel by sending ICMP echo request.
-//
-// It works as the following:
-// * Open /dev/net/tun, and create kTapName interface.
-// * Use rtnetlink to do initial setup of the interface:
-//   * Assign IP address 10.0.0.1/24 to kernel.
-//   * MAC address: kMacA
-//   * Bring up the interface.
-// * Send an ICMP echo reqest (ping) packet from 10.0.0.2 (kMacB) to kernel.
-// * Loop to receive packets from TAP device/fd:
-//   * If packet is an ICMP echo reply, it stops and passes the test.
-//   * If packet is an ARP request, it responds with canned reply and resends
-//   the
-//     ICMP request packet.
-TEST_F(TuntapTest, PingKernel) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
-
+PosixErrorOr<FileDescriptor> OpenAndAttachTap(
+    const std::string& dev_name, const std::string& dev_ipv4_addr) {
   // Interface creation.
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, Open(kDevNetTun, O_RDWR));
 
   struct ifreq ifr_set = {};
   ifr_set.ifr_flags = IFF_TAP;
-  strncpy(ifr_set.ifr_name, kTapName, IFNAMSIZ);
-  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr_set),
-              SyscallSucceedsWithValue(0));
+  strncpy(ifr_set.ifr_name, dev_name.c_str(), IFNAMSIZ);
+  if (ioctl(fd.get(), TUNSETIFF, &ifr_set) < 0) {
+    return PosixError(errno);
+  }
 
-  absl::optional<Link> link =
-      ASSERT_NO_ERRNO_AND_VALUE(GetLinkByName(kTapName));
-  ASSERT_TRUE(link.has_value());
+  ASSIGN_OR_RETURN_ERRNO(absl::optional<Link> link, GetLinkByName(dev_name));
+  if (!link.has_value()) {
+    return PosixError(ENOENT, "no link");
+  }
 
   // Interface setup.
   struct in_addr addr;
-  inet_pton(AF_INET, "10.0.0.1", &addr);
+  inet_pton(AF_INET, dev_ipv4_addr.c_str(), &addr);
   EXPECT_NO_ERRNO(LinkAddLocalAddr(link->index, AF_INET, /*prefixlen=*/24,
                                    &addr, sizeof(addr)));
 
   if (!IsRunningOnGvisor()) {
     // FIXME: gVisor doesn't support setting MAC address on interfaces yet.
-    EXPECT_NO_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA)));
+    RETURN_IF_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA)));
 
     // FIXME: gVisor always creates enabled/up'd interfaces.
-    EXPECT_NO_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP));
+    RETURN_IF_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP));
   }
 
+  return fd;
+}
+
+// This test sets up a TAP device and pings kernel by sending ICMP echo request.
+//
+// It works as the following:
+// * Open /dev/net/tun, and create kTapName interface.
+// * Use rtnetlink to do initial setup of the interface:
+//   * Assign IP address 10.0.0.1/24 to kernel.
+//   * MAC address: kMacA
+//   * Bring up the interface.
+// * Send an ICMP echo reqest (ping) packet from 10.0.0.2 (kMacB) to kernel.
+// * Loop to receive packets from TAP device/fd:
+//   * If packet is an ICMP echo reply, it stops and passes the test.
+//   * If packet is an ARP request, it responds with canned reply and resends
+//   the
+//     ICMP request packet.
+TEST_F(TuntapTest, PingKernel) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, "10.0.0.1"));
   ping_pkt ping_req = CreatePingPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
   std::string arp_rep = CreateArpPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
 
@@ -349,5 +358,47 @@ TEST_F(TuntapTest, PingKernel) {
   }
 }
 
+TEST_F(TuntapTest, SendUdpTriggersArpResolution) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, "10.0.0.1"));
+
+  // Send a UDP packet to remote.
+  int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
+  ASSERT_THAT(sock, SyscallSucceeds());
+
+  struct sockaddr_in remote = {};
+  remote.sin_family = AF_INET;
+  remote.sin_port = htons(42);
+  inet_pton(AF_INET, "10.0.0.2", &remote.sin_addr);
+  int ret = sendto(sock, "hello", 5, 0, reinterpret_cast<sockaddr*>(&remote),
+                   sizeof(remote));
+  ASSERT_THAT(ret, ::testing::AnyOf(SyscallSucceeds(),
+                                    SyscallFailsWithErrno(EHOSTDOWN)));
+
+  struct inpkt {
+    union {
+      pihdr pi;
+      arp_pkt arp;
+    };
+  };
+  while (1) {
+    inpkt r = {};
+    int n = read(fd.get(), &r, sizeof(r));
+    EXPECT_THAT(n, SyscallSucceeds());
+
+    if (n < sizeof(pihdr)) {
+      std::cerr << "Ignored packet, protocol: " << r.pi.pi_protocol
+                << " len: " << n << std::endl;
+      continue;
+    }
+
+    if (n >= sizeof(arp_pkt) && r.pi.pi_protocol == htons(ETH_P_ARP)) {
+      break;
+    }
+  }
+}
+
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 0f60799a4f8c3db567973574147370fc900df55f Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Mon, 16 Mar 2020 13:28:00 -0700
Subject: Add calls to vfs.CheckSetStat to fsimpls

Only gofer filesystem was calling vfs.CheckSetStat for
vfs.FilesystemImpl.SetStatAt and vfs.FileDescriptionImpl.SetStat.

Updates #1193, #1672, #1197

PiperOrigin-RevId: 301226522
---
 pkg/sentry/fsimpl/host/host.go                 | 16 +++++++++++-----
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go |  2 +-
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go       |  4 +++-
 pkg/sentry/fsimpl/kernfs/filesystem.go         |  2 +-
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go    | 14 ++++++++++++--
 pkg/sentry/fsimpl/kernfs/kernfs.go             |  6 ++++--
 pkg/sentry/fsimpl/kernfs/kernfs_test.go        |  2 +-
 pkg/sentry/fsimpl/kernfs/symlink.go            |  2 +-
 pkg/sentry/fsimpl/proc/subtasks.go             |  3 ++-
 pkg/sentry/fsimpl/proc/task.go                 |  2 +-
 pkg/sentry/fsimpl/proc/tasks_files.go          |  8 ++++----
 pkg/sentry/fsimpl/sys/sys.go                   |  2 +-
 pkg/sentry/fsimpl/tmpfs/filesystem.go          |  2 +-
 pkg/sentry/fsimpl/tmpfs/tmpfs.go               | 11 +++++++++--
 pkg/sentry/vfs/file_description.go             |  3 ++-
 pkg/sentry/vfs/filesystem.go                   |  4 +++-
 16 files changed, 57 insertions(+), 26 deletions(-)

diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 0be812d13..67c050c30 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -114,7 +114,8 @@ type inode struct {
 	ino uint64
 
 	// mu protects the inode metadata below.
-	mu sync.Mutex
+	// TODO(gvisor.dev/issue/1672): actually protect fields below.
+	//mu sync.Mutex
 
 	// mode is the file mode of this inode. Note that this value may become out
 	// of date if the mode is changed on the host, e.g. with chmod.
@@ -269,16 +270,20 @@ func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) {
 }
 
 // SetStat implements kernfs.Inode.
-func (i *inode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
+func (i *inode) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
 	s := opts.Stat
 
 	m := s.Mask
 	if m == 0 {
 		return nil
 	}
-	if m&(linux.STATX_UID|linux.STATX_GID) != 0 {
+	if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
 		return syserror.EPERM
 	}
+	if err := vfs.CheckSetStat(creds, &s, uint16(i.Mode().Permissions()), i.uid, i.gid); err != nil {
+		return err
+	}
+
 	if m&linux.STATX_MODE != 0 {
 		if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil {
 			return err
@@ -375,8 +380,9 @@ type fileDescription struct {
 }
 
 // SetStat implements vfs.FileDescriptionImpl.
-func (f *fileDescription) SetStat(_ context.Context, opts vfs.SetStatOptions) error {
-	return f.inode.SetStat(nil, opts)
+func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	return f.inode.SetStat(nil, creds, opts)
 }
 
 // Stat implements vfs.FileDescriptionImpl.
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 0d27a8867..c788d1d62 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -64,7 +64,7 @@ func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vf
 // SetStat implements Inode.SetStat. By default DynamicBytesFile doesn't allow
 // inode attributes to be changed. Override SetStat() making it call
 // f.InodeAttrs to allow it.
-func (*DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+func (*DynamicBytesFile) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index da821d524..331c82011 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -17,6 +17,7 @@ package kernfs
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -206,6 +207,7 @@ func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (l
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
 	fs := fd.filesystem()
+	creds := auth.CredentialsFromContext(ctx)
 	inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
-	return inode.SetStat(fs, opts)
+	return inode.SetStat(fs, creds, opts)
 }
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 3288de290..37fbe2eea 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -636,7 +636,7 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 	if opts.Stat.Mask == 0 {
 		return nil
 	}
-	return inode.SetStat(fs.VFSFilesystem(), opts)
+	return inode.SetStat(fs.VFSFilesystem(), rp.Credentials(), opts)
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 4ed41326d..851c61b49 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -234,7 +234,17 @@ func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error)
 }
 
 // SetStat implements Inode.SetStat.
-func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
+func (a *InodeAttrs) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
+		return syserror.EPERM
+	}
+	if err := vfs.CheckSetStat(creds, &opts.Stat, uint16(a.Mode().Permissions()), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
+		return err
+	}
+
 	stat := opts.Stat
 	if stat.Mask&linux.STATX_MODE != 0 {
 		for {
@@ -556,7 +566,7 @@ func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*StaticDirectory) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+func (*StaticDirectory) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 18a34a590..b12b216d2 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -330,8 +330,10 @@ type inodeMetadata interface {
 	Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
 
 	// SetStat updates the metadata for this inode. This corresponds to
-	// vfs.FilesystemImpl.SetStatAt.
-	SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error
+	// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking
+	// if the operation can be performed (see vfs.CheckSetStat() for common
+	// checks).
+	SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error
 }
 
 // Precondition: All methods in this interface may only be called on directory
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 0459fb305..2875e6ffa 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -91,7 +91,7 @@ type attrs struct {
 	kernfs.InodeAttrs
 }
 
-func (a *attrs) SetStat(fs *vfs.Filesystem, opt vfs.SetStatOptions) error {
+func (*attrs) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 41c5a3099..92f709d29 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -56,6 +56,6 @@ func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*StaticSymlink) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+func (*StaticSymlink) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index ea6d60f6e..eb191aba4 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -129,6 +130,6 @@ func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*subtasksInode) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+func (*subtasksInode) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index fae3fc5aa..ceb427ffb 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -108,7 +108,7 @@ func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenO
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*taskInode) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+func (*taskInode) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 20085bb39..d3d99393f 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -64,7 +64,7 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*selfSymlink) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+func (*selfSymlink) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
@@ -102,7 +102,7 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*threadSelfSymlink) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+func (*threadSelfSymlink) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
@@ -114,8 +114,8 @@ type dynamicBytesFileSetAttr struct {
 }
 
 // SetStat implements Inode.SetStat.
-func (d *dynamicBytesFileSetAttr) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error {
-	return d.DynamicBytesFile.InodeAttrs.SetStat(fs, opts)
+func (d *dynamicBytesFileSetAttr) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	return d.DynamicBytesFile.InodeAttrs.SetStat(fs, creds, opts)
 }
 
 // cpuStats contains the breakdown of CPU time for /proc/stat.
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 3928ff2c8..9c8e63783 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -95,7 +95,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*dir) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+func (*dir) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 02637fca6..6e8b4cae7 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -575,7 +575,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 	if err != nil {
 		return err
 	}
-	return d.inode.setStat(opts.Stat)
+	return d.inode.setStat(rp.Credentials(), &opts.Stat)
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 521206305..c18f1e46e 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -299,10 +299,16 @@ func (i *inode) statTo(stat *linux.Statx) {
 	}
 }
 
-func (i *inode) setStat(stat linux.Statx) error {
+func (i *inode) setStat(creds *auth.Credentials, stat *linux.Statx) error {
 	if stat.Mask == 0 {
 		return nil
 	}
+	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 {
+		return syserror.EPERM
+	}
+	if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&i.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+		return err
+	}
 	i.mu.Lock()
 	var (
 		needsMtimeBump bool
@@ -457,5 +463,6 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	return fd.inode().setStat(opts.Stat)
+	creds := auth.CredentialsFromContext(ctx)
+	return fd.inode().setStat(creds, &opts.Stat)
 }
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 9a1ad630c..8ee549dc2 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -286,7 +286,8 @@ type FileDescriptionImpl interface {
 	Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)
 
 	// SetStat updates metadata for the file represented by the
-	// FileDescription.
+	// FileDescription. Implementations are responsible for checking if the
+	// operation can be performed (see vfs.CheckSetStat() for common checks).
 	SetStat(ctx context.Context, opts SetStatOptions) error
 
 	// StatFS returns metadata for the filesystem containing the file
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index c43dcff3d..332decce6 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -366,7 +366,9 @@ type FilesystemImpl interface {
 	// ResolvingPath.Resolve*(), then !rp.Done().
 	RmdirAt(ctx context.Context, rp *ResolvingPath) error
 
-	// SetStatAt updates metadata for the file at the given path.
+	// SetStatAt updates metadata for the file at the given path. Implementations
+	// are responsible for checking if the operation can be performed
+	// (see vfs.CheckSetStat() for common checks).
 	//
 	// Errors:
 	//
-- 
cgit v1.2.3


From 2a6c4369be8d0522a1f439aa02bce0eb21d42ea2 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Mon, 16 Mar 2020 15:59:29 -0700
Subject: Enforce file size rlimits in VFS2

Updates #1035

PiperOrigin-RevId: 301255357
---
 pkg/sentry/fsimpl/gofer/gofer.go               |  2 +-
 pkg/sentry/fsimpl/gofer/regular_file.go        |  5 ++++
 pkg/sentry/fsimpl/gofer/special_file.go        |  8 +++++++
 pkg/sentry/fsimpl/host/default_file.go         |  7 +++++-
 pkg/sentry/fsimpl/host/host.go                 |  6 ++---
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go |  2 +-
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go       |  2 +-
 pkg/sentry/fsimpl/kernfs/filesystem.go         |  2 +-
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go    |  6 ++---
 pkg/sentry/fsimpl/kernfs/kernfs.go             |  2 +-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go        |  2 +-
 pkg/sentry/fsimpl/kernfs/symlink.go            |  2 +-
 pkg/sentry/fsimpl/proc/subtasks.go             |  2 +-
 pkg/sentry/fsimpl/proc/task.go                 |  2 +-
 pkg/sentry/fsimpl/proc/tasks_files.go          |  8 +++----
 pkg/sentry/fsimpl/sys/sys.go                   |  2 +-
 pkg/sentry/fsimpl/tmpfs/filesystem.go          |  2 +-
 pkg/sentry/fsimpl/tmpfs/regular_file.go        | 11 +++++++--
 pkg/sentry/fsimpl/tmpfs/tmpfs.go               |  6 ++---
 pkg/sentry/syscalls/linux/vfs2/setstat.go      | 15 ++++++++++--
 pkg/sentry/vfs/BUILD                           |  1 +
 pkg/sentry/vfs/file_description_impl_util.go   |  5 ++++
 pkg/sentry/vfs/permissions.go                  | 33 +++++++++++++++++++++++++-
 23 files changed, 103 insertions(+), 30 deletions(-)

diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index c4a8f0b38..999485492 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -713,7 +713,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
 		return err
 	}
 	if err := mnt.CheckBeginWrite(); err != nil {
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index e95209661..3593eb1d5 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -126,6 +126,11 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 	if opts.Flags != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
+	limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
+	if err != nil {
+		return 0, err
+	}
+	src = src.TakeFirst64(limit)
 
 	d := fd.dentry()
 	d.metadataMu.Lock()
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 08c691c47..274f7346f 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -107,6 +107,14 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 		return 0, syserror.EOPNOTSUPP
 	}
 
+	if fd.dentry().fileType() == linux.S_IFREG {
+		limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
+		if err != nil {
+			return 0, err
+		}
+		src = src.TakeFirst64(limit)
+	}
+
 	// Do a buffered write. See rationale in PRead.
 	if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
 		d.touchCMtime(ctx)
diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go
index 98682ba5e..459238603 100644
--- a/pkg/sentry/fsimpl/host/default_file.go
+++ b/pkg/sentry/fsimpl/host/default_file.go
@@ -112,7 +112,6 @@ func (f *defaultFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offs
 	if f.inode.isStream {
 		return 0, syserror.ESPIPE
 	}
-
 	return writeToHostFD(ctx, f.inode.hostFD, src, offset, int(opts.Flags))
 }
 
@@ -146,6 +145,12 @@ func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offs
 		return 0, syserror.EOPNOTSUPP
 	}
 
+	limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
+	if err != nil {
+		return 0, err
+	}
+	src = src.TakeFirst64(limit)
+
 	var writer safemem.Writer
 	if offset == -1 {
 		writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)}
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 67c050c30..2eebcd60c 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -270,7 +270,7 @@ func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) {
 }
 
 // SetStat implements kernfs.Inode.
-func (i *inode) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
 	s := opts.Stat
 
 	m := s.Mask
@@ -280,7 +280,7 @@ func (i *inode) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.Se
 	if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(creds, &s, uint16(i.Mode().Permissions()), i.uid, i.gid); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, &s, uint16(i.Mode().Permissions()), i.uid, i.gid); err != nil {
 		return err
 	}
 
@@ -382,7 +382,7 @@ type fileDescription struct {
 // SetStat implements vfs.FileDescriptionImpl.
 func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
 	creds := auth.CredentialsFromContext(ctx)
-	return f.inode.SetStat(nil, creds, opts)
+	return f.inode.SetStat(ctx, nil, creds, opts)
 }
 
 // Stat implements vfs.FileDescriptionImpl.
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index c788d1d62..d8bddbafa 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -64,7 +64,7 @@ func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vf
 // SetStat implements Inode.SetStat. By default DynamicBytesFile doesn't allow
 // inode attributes to be changed. Override SetStat() making it call
 // f.InodeAttrs to allow it.
-func (*DynamicBytesFile) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 331c82011..75c4bab1a 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -209,5 +209,5 @@ func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptio
 	fs := fd.filesystem()
 	creds := auth.CredentialsFromContext(ctx)
 	inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
-	return inode.SetStat(fs, creds, opts)
+	return inode.SetStat(ctx, fs, creds, opts)
 }
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 37fbe2eea..31da8b511 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -636,7 +636,7 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 	if opts.Stat.Mask == 0 {
 		return nil
 	}
-	return inode.SetStat(fs.VFSFilesystem(), rp.Credentials(), opts)
+	return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 851c61b49..c612dcf07 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -234,14 +234,14 @@ func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error)
 }
 
 // SetStat implements Inode.SetStat.
-func (a *InodeAttrs) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
 	if opts.Stat.Mask == 0 {
 		return nil
 	}
 	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(creds, &opts.Stat, uint16(a.Mode().Permissions()), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, uint16(a.Mode().Permissions()), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
 		return err
 	}
 
@@ -566,7 +566,7 @@ func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*StaticDirectory) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index b12b216d2..794e38908 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -333,7 +333,7 @@ type inodeMetadata interface {
 	// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking
 	// if the operation can be performed (see vfs.CheckSetStat() for common
 	// checks).
-	SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error
+	SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error
 }
 
 // Precondition: All methods in this interface may only be called on directory
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 2875e6ffa..fb0d25ad7 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -91,7 +91,7 @@ type attrs struct {
 	kernfs.InodeAttrs
 }
 
-func (*attrs) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 92f709d29..5918d3309 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -56,6 +56,6 @@ func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*StaticSymlink) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index eb191aba4..a21313666 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -130,6 +130,6 @@ func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*subtasksInode) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index ceb427ffb..49d6efb0e 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -108,7 +108,7 @@ func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenO
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*taskInode) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index d3d99393f..882c1981e 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -64,7 +64,7 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*selfSymlink) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
@@ -102,7 +102,7 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*threadSelfSymlink) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
@@ -114,8 +114,8 @@ type dynamicBytesFileSetAttr struct {
 }
 
 // SetStat implements Inode.SetStat.
-func (d *dynamicBytesFileSetAttr) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
-	return d.DynamicBytesFile.InodeAttrs.SetStat(fs, creds, opts)
+func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts)
 }
 
 // cpuStats contains the breakdown of CPU time for /proc/stat.
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 9c8e63783..7abfd62f2 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -95,7 +95,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 }
 
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
-func (*dir) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 6e8b4cae7..75d01b853 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -575,7 +575,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 	if err != nil {
 		return err
 	}
-	return d.inode.setStat(rp.Credentials(), &opts.Stat)
+	return d.inode.setStat(ctx, rp.Credentials(), &opts.Stat)
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 711442424..5a2896bf6 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -308,11 +308,18 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 		return 0, nil
 	}
 	f := fd.inode().impl.(*regularFile)
-	end := offset + srclen
-	if end < offset {
+	if end := offset + srclen; end < offset {
 		// Overflow.
 		return 0, syserror.EFBIG
 	}
+
+	var err error
+	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
+	if err != nil {
+		return 0, err
+	}
+	src = src.TakeFirst64(srclen)
+
 	f.inode.mu.Lock()
 	rw := getRegularFileReadWriter(f, offset)
 	n, err := src.CopyInTo(ctx, rw)
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index c18f1e46e..ff69372b3 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -299,14 +299,14 @@ func (i *inode) statTo(stat *linux.Statx) {
 	}
 }
 
-func (i *inode) setStat(creds *auth.Credentials, stat *linux.Statx) error {
+func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error {
 	if stat.Mask == 0 {
 		return nil
 	}
 	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&i.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, stat, uint16(atomic.LoadUint32(&i.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
 		return err
 	}
 	i.mu.Lock()
@@ -464,5 +464,5 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
 	creds := auth.CredentialsFromContext(ctx)
-	return fd.inode().setStat(creds, &opts.Stat)
+	return fd.inode().setStat(ctx, creds, &opts.Stat)
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
index 9250659ff..136453ccc 100644
--- a/pkg/sentry/syscalls/linux/vfs2/setstat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -173,12 +173,13 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		return 0, nil, err
 	}
 
-	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
+	err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
 		Stat: linux.Statx{
 			Mask: linux.STATX_SIZE,
 			Size: uint64(length),
 		},
 	})
+	return 0, nil, handleSetSizeError(t, err)
 }
 
 // Ftruncate implements Linux syscall ftruncate(2).
@@ -196,12 +197,13 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	}
 	defer file.DecRef()
 
-	return 0, nil, file.SetStat(t, vfs.SetStatOptions{
+	err := file.SetStat(t, vfs.SetStatOptions{
 		Stat: linux.Statx{
 			Mask: linux.STATX_SIZE,
 			Size: uint64(length),
 		},
 	})
+	return 0, nil, handleSetSizeError(t, err)
 }
 
 // Utime implements Linux syscall utime(2).
@@ -378,3 +380,12 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa
 		FollowFinalSymlink: bool(shouldFollowFinalSymlink),
 	}, opts)
 }
+
+func handleSetSizeError(t *kernel.Task, err error) error {
+	if err == syserror.ErrExceedsFileSizeLimit {
+		// Convert error to EFBIG and send a SIGXFSZ per setrlimit(2).
+		t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t))
+		return syserror.EFBIG
+	}
+	return err
+}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index cb4deb068..a2a06fc8f 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -51,6 +51,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/limits",
         "//pkg/sentry/memmap",
         "//pkg/sync",
         "//pkg/syserror",
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 45191d1c3..d45e602ce 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -339,6 +339,11 @@ func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src
 	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
+	limit, err := CheckLimit(ctx, offset, src.NumBytes())
+	if err != nil {
+		return 0, err
+	}
+	src = src.TakeFirst64(limit)
 
 	writable, ok := fd.data.(WritableDynamicBytesSource)
 	if !ok {
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index 8e250998a..2c8f23f55 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -15,8 +15,12 @@
 package vfs
 
 import (
+	"math"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -147,7 +151,16 @@ func MayWriteFileWithOpenFlags(flags uint32) bool {
 // CheckSetStat checks that creds has permission to change the metadata of a
 // file with the given permissions, UID, and GID as specified by stat, subject
 // to the rules of Linux's fs/attr.c:setattr_prepare().
-func CheckSetStat(creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+	if stat.Mask&linux.STATX_SIZE != 0 {
+		limit, err := CheckLimit(ctx, 0, int64(stat.Size))
+		if err != nil {
+			return err
+		}
+		if limit < int64(stat.Size) {
+			return syserror.ErrExceedsFileSizeLimit
+		}
+	}
 	if stat.Mask&linux.STATX_MODE != 0 {
 		if !CanActAsOwner(creds, kuid) {
 			return syserror.EPERM
@@ -205,3 +218,21 @@ func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool {
 func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool {
 	return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok()
 }
+
+// CheckLimit enforces file size rlimits. It returns error if the write
+// operation must not proceed. Otherwise it returns the max length allowed to
+// without violating the limit.
+func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
+	fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur
+	if fileSizeLimit > math.MaxInt64 {
+		return size, nil
+	}
+	if offset >= int64(fileSizeLimit) {
+		return 0, syserror.ErrExceedsFileSizeLimit
+	}
+	remaining := int64(fileSizeLimit) - offset
+	if remaining < size {
+		return remaining, nil
+	}
+	return size, nil
+}
-- 
cgit v1.2.3


From b55f0e5d40c17cadf68d6238564d675ed12f8f49 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 16 Mar 2020 18:28:29 -0700
Subject: fdtable: don't try to zap fdtable entry if close is called for
 non-existing fd

FDTable.setAll is used to zap entries, but it grows the table up to
a specified fd.

Reported-by: syzbot+9e281b0750d2d4caa190@syzkaller.appspotmail.com
PiperOrigin-RevId: 301280000
---
 pkg/sentry/kernel/fd_table.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 7de2e509e..dddc28d5a 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -536,7 +536,9 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
 	case orig2 != nil:
 		orig2.IncRef()
 	}
-	f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+	if orig != nil || orig2 != nil {
+		f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+	}
 	return orig, orig2
 }
 
-- 
cgit v1.2.3


From 3192e55ffe04b583ca4261ec0b04a6e566a6038b Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Tue, 17 Mar 2020 08:52:14 -0700
Subject: Packetimpact in Go with c++ stub

PiperOrigin-RevId: 301382690
---
 Dockerfile                                        |   2 +-
 WORKSPACE                                         |  35 ++
 pkg/tcpip/header/tcp.go                           |  42 +-
 test/packetdrill/Dockerfile                       |   8 +-
 test/packetimpact/README.md                       | 531 ++++++++++++++++++++++
 test/packetimpact/dut/BUILD                       |  18 +
 test/packetimpact/dut/posix_server.cc             | 229 ++++++++++
 test/packetimpact/proto/BUILD                     |  12 +
 test/packetimpact/proto/posix_server.proto        | 150 ++++++
 test/packetimpact/testbench/BUILD                 |  31 ++
 test/packetimpact/testbench/connections.go        | 245 ++++++++++
 test/packetimpact/testbench/dut.go                | 363 +++++++++++++++
 test/packetimpact/testbench/dut_client.go         |  28 ++
 test/packetimpact/testbench/layers.go             | 507 +++++++++++++++++++++
 test/packetimpact/testbench/rawsockets.go         | 151 ++++++
 test/packetimpact/tests/BUILD                     |  21 +
 test/packetimpact/tests/Dockerfile                |   5 +
 test/packetimpact/tests/defs.bzl                  | 106 +++++
 test/packetimpact/tests/fin_wait2_timeout_test.go |  68 +++
 test/packetimpact/tests/test_runner.sh            | 246 ++++++++++
 tools/bazeldefs/defs.bzl                          |  81 +++-
 tools/defs.bzl                                    |  58 ++-
 22 files changed, 2898 insertions(+), 39 deletions(-)
 create mode 100644 test/packetimpact/README.md
 create mode 100644 test/packetimpact/dut/BUILD
 create mode 100644 test/packetimpact/dut/posix_server.cc
 create mode 100644 test/packetimpact/proto/BUILD
 create mode 100644 test/packetimpact/proto/posix_server.proto
 create mode 100644 test/packetimpact/testbench/BUILD
 create mode 100644 test/packetimpact/testbench/connections.go
 create mode 100644 test/packetimpact/testbench/dut.go
 create mode 100644 test/packetimpact/testbench/dut_client.go
 create mode 100644 test/packetimpact/testbench/layers.go
 create mode 100644 test/packetimpact/testbench/rawsockets.go
 create mode 100644 test/packetimpact/tests/BUILD
 create mode 100644 test/packetimpact/tests/Dockerfile
 create mode 100644 test/packetimpact/tests/defs.bzl
 create mode 100644 test/packetimpact/tests/fin_wait2_timeout_test.go
 create mode 100755 test/packetimpact/tests/test_runner.sh

diff --git a/Dockerfile b/Dockerfile
index 2bfdfec6c..0fac71710 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@ FROM fedora:31
 
 RUN  dnf install -y dnf-plugins-core && dnf copr enable -y vbatts/bazel
 
-RUN dnf install -y bazel2 git gcc make golang gcc-c++ glibc-devel python3 which python3-pip python3-devel libffi-devel openssl-devel pkg-config glibc-static
+RUN dnf install -y bazel2 git gcc make golang gcc-c++ glibc-devel python3 which python3-pip python3-devel libffi-devel openssl-devel pkg-config glibc-static libstdc++-static patch
 
 RUN pip install pycparser
 
diff --git a/WORKSPACE b/WORKSPACE
index d2bbadc63..62dfb9dc6 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -161,6 +161,20 @@ load(
 
 _go_image_repos()
 
+# Load C++ grpc rules.
+http_archive(
+    name = "com_github_grpc_grpc",
+    sha256 = "2fcb7f1ab160d6fd3aaade64520be3e5446fc4c6fa7ba6581afdc4e26094bd81",
+    strip_prefix = "grpc-1.26.0",
+    urls = [
+        "https://github.com/grpc/grpc/archive/v1.26.0.tar.gz",
+    ],
+)
+load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
+grpc_deps()
+load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
+grpc_extra_deps()
+
 # External repositories, in sorted order.
 go_repository(
     name = "com_github_cenkalti_backoff",
@@ -204,6 +218,13 @@ go_repository(
     version = "v0.0.0-20171129191014-dec09d789f3d",
 )
 
+go_repository(
+    name = "com_github_imdario_mergo",
+    importpath = "github.com/imdario/mergo",
+    version = "v0.3.8",
+    sum = "h1:CGgOkSJeqMRmt0D9XLWExdT4m4F1vd3FV3VPt+0VxkQ=",
+)
+
 go_repository(
     name = "com_github_kr_pretty",
     importpath = "github.com/kr/pretty",
@@ -225,6 +246,12 @@ go_repository(
     version = "v0.1.0",
 )
 
+go_repository(
+    name = "com_github_mohae_deepcopy",
+    importpath = "github.com/mohae/deepcopy",
+    commit = "c48cc78d482608239f6c4c92a4abd87eb8761c90",
+)
+
 go_repository(
     name = "com_github_opencontainers_runtime-spec",
     importpath = "github.com/opencontainers/runtime-spec",
@@ -253,6 +280,14 @@ go_repository(
     version = "v0.0.0-20171111001504-be1fbeda1936",
 )
 
+go_repository(
+    name = "org_golang_google_grpc",
+    build_file_proto_mode = "disable",
+    importpath = "google.golang.org/grpc",
+    sum = "h1:zvIju4sqAGvwKspUQOhwnpcqSbzi7/H6QomNNjTL4sk=",
+    version = "v1.27.1",
+)
+
 go_repository(
     name = "in_gopkg_check_v1",
     importpath = "gopkg.in/check.v1",
diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go
index 82cfe785c..13480687d 100644
--- a/pkg/tcpip/header/tcp.go
+++ b/pkg/tcpip/header/tcp.go
@@ -81,7 +81,8 @@ type TCPFields struct {
 	// AckNum is the "acknowledgement number" field of a TCP packet.
 	AckNum uint32
 
-	// DataOffset is the "data offset" field of a TCP packet.
+	// DataOffset is the "data offset" field of a TCP packet. It is the length of
+	// the TCP header in bytes.
 	DataOffset uint8
 
 	// Flags is the "flags" field of a TCP packet.
@@ -213,7 +214,8 @@ func (b TCP) AckNumber() uint32 {
 	return binary.BigEndian.Uint32(b[TCPAckNumOffset:])
 }
 
-// DataOffset returns the "data offset" field of the tcp header.
+// DataOffset returns the "data offset" field of the tcp header. The return
+// value is the length of the TCP header in bytes.
 func (b TCP) DataOffset() uint8 {
 	return (b[TCPDataOffset] >> 4) * 4
 }
@@ -238,6 +240,11 @@ func (b TCP) Checksum() uint16 {
 	return binary.BigEndian.Uint16(b[TCPChecksumOffset:])
 }
 
+// UrgentPointer returns the "urgent pointer" field of the tcp header.
+func (b TCP) UrgentPointer() uint16 {
+	return binary.BigEndian.Uint16(b[TCPUrgentPtrOffset:])
+}
+
 // SetSourcePort sets the "source port" field of the tcp header.
 func (b TCP) SetSourcePort(port uint16) {
 	binary.BigEndian.PutUint16(b[TCPSrcPortOffset:], port)
@@ -253,6 +260,37 @@ func (b TCP) SetChecksum(checksum uint16) {
 	binary.BigEndian.PutUint16(b[TCPChecksumOffset:], checksum)
 }
 
+// SetDataOffset sets the data offset field of the tcp header. headerLen should
+// be the length of the TCP header in bytes.
+func (b TCP) SetDataOffset(headerLen uint8) {
+	b[TCPDataOffset] = (headerLen / 4) << 4
+}
+
+// SetSequenceNumber sets the sequence number field of the tcp header.
+func (b TCP) SetSequenceNumber(seqNum uint32) {
+	binary.BigEndian.PutUint32(b[TCPSeqNumOffset:], seqNum)
+}
+
+// SetAckNumber sets the ack number field of the tcp header.
+func (b TCP) SetAckNumber(ackNum uint32) {
+	binary.BigEndian.PutUint32(b[TCPAckNumOffset:], ackNum)
+}
+
+// SetFlags sets the flags field of the tcp header.
+func (b TCP) SetFlags(flags uint8) {
+	b[TCPFlagsOffset] = flags
+}
+
+// SetWindowSize sets the window size field of the tcp header.
+func (b TCP) SetWindowSize(rcvwnd uint16) {
+	binary.BigEndian.PutUint16(b[TCPWinSizeOffset:], rcvwnd)
+}
+
+// SetUrgentPoiner sets the window size field of the tcp header.
+func (b TCP) SetUrgentPoiner(urgentPointer uint16) {
+	binary.BigEndian.PutUint16(b[TCPUrgentPtrOffset:], urgentPointer)
+}
+
 // CalculateChecksum calculates the checksum of the tcp segment.
 // partialChecksum is the checksum of the network-layer pseudo-header
 // and the checksum of the segment data.
diff --git a/test/packetdrill/Dockerfile b/test/packetdrill/Dockerfile
index bd4451355..4b75e9527 100644
--- a/test/packetdrill/Dockerfile
+++ b/test/packetdrill/Dockerfile
@@ -1,9 +1,9 @@
 FROM ubuntu:bionic
 
-RUN apt-get update
-RUN apt-get install -y net-tools git iptables iputils-ping netcat tcpdump jq tar
+RUN apt-get update && apt-get install -y net-tools git iptables iputils-ping \
+        netcat tcpdump jq tar bison flex make
 RUN hash -r
 RUN git clone --branch packetdrill-v2.0 \
         https://github.com/google/packetdrill.git
-RUN cd packetdrill/gtests/net/packetdrill && ./configure && \
-        apt-get install -y bison flex make && make
+RUN cd packetdrill/gtests/net/packetdrill && ./configure && make
+CMD /bin/bash
diff --git a/test/packetimpact/README.md b/test/packetimpact/README.md
new file mode 100644
index 000000000..ece4dedc6
--- /dev/null
+++ b/test/packetimpact/README.md
@@ -0,0 +1,531 @@
+# Packetimpact
+
+## What is packetimpact?
+
+Packetimpact is a tool for platform-independent network testing. It is heavily
+inspired by [packetdrill](https://github.com/google/packetdrill). It creates two
+docker containers connected by a network. One is for the test bench, which
+operates the test. The other is for the device-under-test (DUT), which is the
+software being tested. The test bench communicates over the network with the DUT
+to check correctness of the network.
+
+### Goals
+
+Packetimpact aims to provide:
+
+*   A **multi-platform** solution that can test both Linux and gVisor.
+*   **Conciseness** on par with packetdrill scripts.
+*   **Control-flow** like for loops, conditionals, and variables.
+*   **Flexibilty** to specify every byte in a packet or use multiple sockets.
+
+## When to use packetimpact?
+
+There are a few ways to write networking tests for gVisor currently:
+
+*   [Go unit tests](https://github.com/google/gvisor/tree/master/pkg/tcpip)
+*   [syscall tests](https://github.com/google/gvisor/tree/master/test/syscalls/linux)
+*   [packetdrill tests](https://github.com/google/gvisor/tree/master/test/packetdrill)
+*   packetimpact tests
+
+The right choice depends on the needs of the test.
+
+Feature       | Go unit test | syscall test | packetdrill | packetimpact
+------------- | ------------ | ------------ | ----------- | ------------
+Multiplatform | no           | **YES**      | **YES**     | **YES**
+Concise       | no           | somewhat     | somewhat    | **VERY**
+Control-flow  | **YES**      | **YES**      | no          | **YES**
+Flexible      | **VERY**     | no           | somewhat    | **VERY**
+
+### Go unit tests
+
+If the test depends on the internals of gVisor and doesn't need to run on Linux
+or other platforms for comparison purposes, a Go unit test can be appropriate.
+They can observe internals of gVisor networking. The downside is that they are
+**not concise** and **not multiplatform**. If you require insight on gVisor
+internals, this is the right choice.
+
+### Syscall tests
+
+Syscall tests are **multiplatform** but cannot examine the internals of gVisor
+networking. They are **concise**. They can use **control-flow** structures like
+conditionals, for loops, and variables. However, they are limited to only what
+the POSIX interface provides so they are **not flexible**. For example, you
+would have difficulty writing a syscall test that intentionally sends a bad IP
+checksum. Or if you did write that test with raw sockets, it would be very
+**verbose** to write a test that intentionally send wrong checksums, wrong
+protocols, wrong sequence numbers, etc.
+
+### Packetdrill tests
+
+Packetdrill tests are **multiplatform** and can run against both Linux and
+gVisor. They are **concise** and use a special packetdrill scripting language.
+They are **more flexible** than a syscall test in that they can send packets
+that a syscall test would have difficulty sending, like a packet with a
+calcuated ACK number. But they are also somewhat limimted in flexibiilty in that
+they can't do tests with multiple sockets. They have **no control-flow** ability
+like variables or conditionals. For example, it isn't possible to send a packet
+that depends on the window size of a previous packet because the packetdrill
+language can't express that. Nor could you branch based on whether or not the
+other side supports window scaling, for example.
+
+### Packetimpact tests
+
+Packetimpact tests are similar to Packetdrill tests except that they are written
+in Go instead of the packetdrill scripting language. That gives them all the
+**control-flow** abilities of Go (loops, functions, variables, etc). They are
+**multiplatform** in the same way as packetdrill tests but even more
+**flexible** because Go is more expressive than the scripting language of
+packetdrill. However, Go is **not as concise** as the packetdrill language. Many
+design decisions below are made to mitigate that.
+
+## How it works
+
+```
+    +--------------+               +--------------+
+    |              |   TEST NET    |              |
+    |              | <===========> |    Device    |
+    |    Test      |               |    Under     |
+    |    Bench     |               |    Test      |
+    |              | <===========> |    (DUT)     |
+    |              |  CONTROL NET  |              |
+    +--------------+               +--------------+
+```
+
+Two docker containers are created by a script, one for the test bench and the
+other for the device under test (DUT). The script connects the two containers
+with a control network and test network. It also does some other tasks like
+waiting until the DUT is ready before starting the test and disabling Linux
+networking that would interfere with the test bench.
+
+### DUT
+
+The DUT container runs a program called the "posix_server". The posix_server is
+written in c++ for maximum portability. It is compiled on the host. The script
+that starts the containers copies it into the DUT's container and runs it. It's
+job is to receive directions from the test bench on what actions to take. For
+this, the posix_server does three steps in a loop:
+
+1.  Listen for a request from the test bench.
+2.  Execute a command.
+3.  Send the response back to the test bench.
+
+The requests and responses are
+[protobufs](https://developers.google.com/protocol-buffers) and the
+communication is done with [gRPC](https://grpc.io/). The commands run are
+[POSIX socket commands](https://en.wikipedia.org/wiki/Berkeley_sockets#Socket_API_functions),
+with the inputs and outputs converted into protobuf requests and responses. All
+communication is on the control network, so that the test network is unaffected
+by extra packets.
+
+For example, this is the request and response pair to call
+[`socket()`](http://man7.org/linux/man-pages/man2/socket.2.html):
+
+```protocol-buffer
+message SocketRequest {
+  int32 domain = 1;
+  int32 type = 2;
+  int32 protocol = 3;
+}
+
+message SocketResponse {
+  int32 fd = 1;
+  int32 errno_ = 2;
+}
+```
+
+##### Alternatives considered
+
+*   We could have use JSON for communication instead. It would have been a
+    lighter-touch than protobuf but protobuf handles all the data type and has
+    strict typing to prevent a class of errors. The test bench could be written
+    in other languages, too.
+*   Instead of mimicking the POSIX interfaces, arguments could have had a more
+    natural form, like the `bind()` getting a string IP address instead of bytes
+    in a `sockaddr_t`. However, conforming to the existing structures keeps more
+    of the complexity in Go and keeps the posix_server simpler and thus more
+    likely to compile everywhere.
+
+### Test Bench
+
+The test bench does most of the work in a test. It is a Go program that compiles
+on the host and is copied by the script into test bench's container. It is a
+regular [go unit test](https://golang.org/pkg/testing/) that imports the test
+bench framework. The test bench framwork is based on three basic utilities:
+
+*   Commanding the DUT to run POSIX commands and return responses.
+*   Sending raw packets to the DUT on the test network.
+*   Listening for raw packets from the DUT on the test network.
+
+#### DUT commands
+
+To keep the interface to the DUT consistent and easy-to-use, each POSIX command
+supported by the posix_server is wrapped in functions with signatures similar to
+the ones in the [Go unix package](https://godoc.org/golang.org/x/sys/unix). This
+way all the details of endianess and (un)marshalling of go structs such as
+[unix.Timeval](https://godoc.org/golang.org/x/sys/unix#Timeval) is handled in
+one place. This also makes it straight-forward to convert tests that use `unix.`
+or `syscall.` calls to `dut.` calls.
+
+For example, creating a connection to the DUT and commanding it to make a socket
+looks like this:
+
+```go
+dut := testbench.NewDut(t)
+fd, err := dut.SocketWithErrno(unix.AF_INET, unix.SOCK_STREAM, unix.IPPROTO_IP)
+if fd < 0 {
+  t.Fatalf(...)
+}
+```
+
+Because the usual case is to fail the test when the DUT fails to create a
+socket, there is a concise version of each of the `...WithErrno` functions that
+does that:
+
+```go
+dut := testbench.NewDut(t)
+fd := dut.Socket(unix.AF_INET, unix.SOCK_STREAM, unix.IPPROTO_IP)
+```
+
+The DUT and other structs in the code store a `*testing.T` so that they can
+provide versions of functions that call `t.Fatalf(...)`. This helps keep tests
+concise.
+
+##### Alternatives considered
+
+*   Instead of mimicking the `unix.` go interface, we could have invented a more
+    natural one, like using `float64` instead of `Timeval`. However, using the
+    same function signatures that `unix.` has makes it easier to convert code to
+    `dut.`. Also, using an existing interface ensures that we don't invent an
+    interface that isn't extensible. For example, if we invented a function for
+    `bind()` that didn't support IPv6 and later we had to add a second `bind6()`
+    function.
+
+#### Sending/Receiving Raw Packets
+
+The framework wraps POSIX sockets for sending and receiving raw frames. Both
+send and receive are synchronous commands.
+[SO_RCVTIMEO](http://man7.org/linux/man-pages/man7/socket.7.html) is used to set
+a timeout on the receive commands. For ease of use, these are wrapped in an
+`Injector` and a `Sniffer`. They have functions:
+
+```go
+func (s *Sniffer) Recv(timeout time.Duration) []byte {...}
+func (i *Injector) Send(b []byte) {...}
+```
+
+##### Alternatives considered
+
+*   [gopacket](https://github.com/google/gopacket) pcap has raw socket support
+    but requires cgo. cgo is not guaranteed to be portable from the host to the
+    container and in practice, the container doesn't recognize binaries built on
+    the host if they use cgo.
+*   Both gVisor and gopacket have the ability to read and write pcap files
+    without cgo but that is insufficient here.
+*   The sniffer and injector can't share a socket because they need to be bound
+    differently.
+*   Sniffing could have been done asynchronously with channels, obviating the
+    need for `SO_RCVTIMEO`. But that would introduce asynchronous complication.
+    `SO_RCVTIMEO` is well supported on the test bench.
+
+#### `Layer` struct
+
+A large part of packetimpact tests is creating packets to send and comparing
+received packets against expectations. To keep tests concise, it is useful to be
+able to specify just the important parts of packets that need to be set. For
+example, sending a packet with default values except for TCP Flags. And for
+packets received, it's useful to be able to compare just the necessary parts of
+received packets and ignore the rest.
+
+To aid in both of those, Go structs with optional fields are created for each
+encapsulation type, such as IPv4, TCP, and Ethernet. This is inspired by
+[scapy](https://scapy.readthedocs.io/en/latest/). For example, here is the
+struct for Ethernet:
+
+```go
+type Ether struct {
+  LayerBase
+  SrcAddr *tcpip.LinkAddress
+  DstAddr *tcpip.LinkAddress
+  Type    *tcpip.NetworkProtocolNumber
+}
+```
+
+Each struct has the same fields as those in the
+[gVisor headers](https://github.com/google/gvisor/tree/master/pkg/tcpip/header)
+but with a pointer for each field that may be `nil`.
+
+##### Alternatives considered
+
+*   Just use []byte like gVisor headers do. The drawback is that it makes the
+    tests more verbose.
+    *   For example, there would be no way to call `Send(myBytes)` concisely and
+        indicate if the checksum should be calculated automatically versus
+        overridden. The only way would be to add lines to the test to calculate
+        it before each Send, which is wordy. Or make multiple versions of Send:
+        one that checksums IP, one that doesn't, one that checksums TCP, one
+        that does both, etc. That would be many combinations.
+    *   Filtering inputs would become verbose. Either:
+    *   large conditionals that need to be repeated many places:
+        `h[FlagOffset] == SYN && h[LengthOffset:LengthOffset+2] == ...` or
+    *   Many functions, one per field, like: `filterByFlag(myBytes, SYN)`,
+        `filterByLength(myBytes, 20)`, `filterByNextProto(myBytes, 0x8000)`,
+        etc.
+    *   Using pointers allows us to combine `Layer`s with a one-line call to
+        `mergo.Merge(...)`. So the default `Layers` can be overridden by a
+        `Layers` with just the TCP conection's src/dst which can be overridden
+        by one with just a test specific TCP window size. Each override is
+        specified as just one call to `mergo.Merge`.
+    *   It's a proven way to separate the details of a packet from the byte
+        format as shown by scapy's success.
+*   Use packetgo. It's more general than parsing packets with gVisor. However:
+    *   packetgo doesn't have optional fields so many of the above problems
+        still apply.
+    *   It would be yet another dependency.
+    *   It's not as well known to engineers that are already writing gVisor
+        code.
+    *   It might be a good candidate for replacing the parsing of packets into
+        `Layer`s if all that parsing turns out to be more work than parsing by
+        packetgo and converting *that* to `Layer`. packetgo has easier to use
+        getters for the layers. This could be done later in a way that doesn't
+        break tests.
+
+#### `Layer` methods
+
+The `Layer` structs provide a way to partially specify an encapsulation. They
+also need methods for using those partially specified encapsulation, for example
+to marshal them to bytes or compare them. For those, each encapsulation
+implements the `Layer` interface:
+
+```go
+// Layer is the interface that all encapsulations must implement.
+//
+// A Layer is an encapsulation in a packet, such as TCP, IPv4, IPv6, etc. A
+// Layer contains all the fields of the encapsulation. Each field is a pointer
+// and may be nil.
+type Layer interface {
+    // toBytes converts the Layer into bytes. In places where the Layer's field
+    // isn't nil, the value that is pointed to is used. When the field is nil, a
+    // reasonable default for the Layer is used. For example, "64" for IPv4 TTL
+    // and a calculated checksum for TCP or IP. Some layers require information
+    // from the previous or next layers in order to compute a default, such as
+    // TCP's checksum or Ethernet's type, so each Layer has a doubly-linked list
+    // to the layer's neighbors.
+    toBytes() ([]byte, error)
+
+    // match checks if the current Layer matches the provided Layer. If either
+    // Layer has a nil in a given field, that field is considered matching.
+    // Otherwise, the values pointed to by the fields must match.
+    match(Layer) bool
+
+    // length in bytes of the current encapsulation
+    length() int
+
+    // next gets a pointer to the encapsulated Layer.
+    next() Layer
+
+    // prev gets a pointer to the Layer encapsulating this one.
+    prev() Layer
+
+    // setNext sets the pointer to the encapsulated Layer.
+    setNext(Layer)
+
+    // setPrev sets the pointer to the Layer encapsulating this one.
+    setPrev(Layer)
+}
+```
+
+For each `Layer` there is also a parsing function. For example, this one is for
+Ethernet:
+
+```
+func ParseEther(b []byte) (Layers, error)
+```
+
+The parsing function converts bytes received on the wire into a `Layer`
+(actually `Layers`, see below) which has no `nil`s in it. By using
+`match(Layer)` to compare against another `Layer` that *does* have `nil`s in it,
+the received bytes can be partially compared. The `nil`s behave as
+"don't-cares".
+
+##### Alternatives considered
+
+*   Matching against `[]byte` instead of converting to `Layer` first.
+    *   The downside is that it precludes the use of a `cmp.Equal` one-liner to
+        do comparisons.
+    *   It creates confusion in the code to deal with both representations at
+        different times. For example, is the checksum calculated on `[]byte` or
+        `Layer` when sending? What about when checking received packets?
+
+#### `Layers`
+
+```
+type Layers []Layer
+
+func (ls *Layers) match(other Layers) bool {...}
+func (ls *Layers) toBytes() ([]byte, error) {...}
+```
+
+`Layers` is an array of `Layer`. It represents a stack of encapsulations, such
+as `Layers{Ether{},IPv4{},TCP{},Payload{}}`. It also has `toBytes()` and
+`match(Layers)`, like `Layer`. The parse functions above actually return
+`Layers` and not `Layer` because they know about the headers below and
+sequentially call each parser on the remaining, encapsulated bytes.
+
+All this leads to the ability to write concise packet processing. For example:
+
+```go
+etherType := 0x8000
+flags = uint8(header.TCPFlagSyn|header.TCPFlagAck)
+toMatch := Layers{Ether{Type: &etherType}, IPv4{}, TCP{Flags: &flags}}
+for {
+  recvBytes := sniffer.Recv(time.Second)
+  if recvBytes == nil {
+    println("Got no packet for 1 second")
+  }
+  gotPacket, err := ParseEther(recvBytes)
+  if err == nil && toMatch.match(gotPacket) {
+    println("Got a TCP/IPv4/Eth packet with SYNACK")
+  }
+}
+```
+
+##### Alternatives considered
+
+*   Don't use previous and next pointers.
+    *   Each layer may need to be able to interrogate the layers aroung it, like
+        for computing the next protocol number or total length. So *some*
+        mechanism is needed for a `Layer` to see neighboring layers.
+    *   We could pass the entire array `Layers` to the `toBytes()` function.
+        Passing an array to a method that includes in the array the function
+        receiver itself seems wrong.
+
+#### Connections
+
+Using `Layers` above, we can create connection structures to maintain state
+about connections. For example, here is the `TCPIPv4` struct:
+
+```
+type TCPIPv4 struct {
+  outgoing     Layers
+  incoming     Layers
+  localSeqNum  uint32
+  remoteSeqNum uint32
+  sniffer      Sniffer
+  injector     Injector
+  t            *testing.T
+}
+```
+
+`TCPIPv4` contains an `outgoing Layers` which holds the defaults for the
+connection, such as the source and destination MACs, IPs, and ports. When
+`outgoing.toBytes()` is called a valid packet for this TCPIPv4 flow is built.
+
+It also contains `incoming Layers` which holds filter for incoming packets that
+belong to this flow. `incoming.match(Layers)` is used on received bytes to check
+if they are part of the flow.
+
+The `sniffer` and `injector` are for receiving and sending raw packet bytes. The
+`localSeqNum` and `remoteSeqNum` are updated by `Send` and `Recv` so that
+outgoing packets will have, by default, the correct sequence number and ack
+number.
+
+TCPIPv4 provides some functions:
+
+```
+func (conn *TCPIPv4) Send(tcp TCP) {...}
+func (conn *TCPIPv4) Recv(timeout time.Duration) *TCP {...}
+```
+
+`Send(tcp TCP)` uses [mergo](https://github.com/imdario/mergo) to merge the
+provided `TCP` (a `Layer`) into `outgoing`. This way the user can specify
+concisely just which fields of `outgoing` to modify. The packet is sent using
+the `injector`.
+
+`Recv(timeout time.Duration)` reads packets from the sniffer until either the
+timeout has elapsed or a packet that matches `incoming` arrives.
+
+Using those, we can perform a TCP 3-way handshake without too much code:
+
+```go
+func (conn *TCPIPv4) Handshake() {
+  syn := uint8(header.TCPFlagSyn)
+  synack := uint8(header.TCPFlagSyn)
+  ack := uint8(header.TCPFlagAck)
+  conn.Send(TCP{Flags: &syn}) // Send a packet with all defaults but set TCP-SYN.
+
+  // Wait for the SYN-ACK response.
+  for {
+    newTCP := conn.Recv(time.Second)  // This already filters by MAC, IP, and ports.
+    if TCP{Flags: &synack}.match(newTCP) {
+      break // Only if it's a SYN-ACK proceed.
+    }
+  }
+
+  conn.Send(TCP{Flags: &ack}) // Send an ACK. The seq and ack numbers are set correctly.
+}
+```
+
+The handshake code is part of the testbench utilities so tests can share this
+common sequence, making tests even more concise.
+
+##### Alternatives considered
+
+*   Instead of storing `outgoing` and `incoming`, store values.
+    *   There would be many more things to store instead, like `localMac`,
+        `remoteMac`, `localIP`, `remoteIP`, `localPort`, and `remotePort`.
+    *   Construction of a packet would be many lines to copy each of these
+        values into a `[]byte`. And there would be slight variations needed for
+        each encapsulation stack, like TCPIPv6 and ARP.
+    *   Filtering incoming packets would be a long sequence:
+    *   Compare the MACs, then
+    *   Parse the next header, then
+    *   Compare the IPs, then
+    *   Parse the next header, then
+    *   Compare the TCP ports. Instead it's all just one call to
+        `cmp.Equal(...)`, for all sequences.
+    *   A TCPIPv6 connection could share most of the code. Only the type of the
+        IP addresses are different. The types of `outgoing` and `incoming` would
+        be remain `Layers`.
+    *   An ARP connection could share all the Ethernet parts. The IP `Layer`
+        could be factored out of `outgoing`. After that, the IPv4 and IPv6
+        connections could implement one interface and a single TCP struct could
+        have either network protocol through composition.
+
+## Putting it all together
+
+Here's what te start of a packetimpact unit test looks like. This test creates a
+TCP connection with the DUT. There are added comments for explanation in this
+document but a real test might not include them in order to stay even more
+concise.
+
+```go
+func TestMyTcpTest(t *testing.T) {
+  // Prepare a DUT for communication.
+  dut := testbench.NewDUT(t)
+
+  // This does:
+  //   dut.Socket()
+  //   dut.Bind()
+  //   dut.Getsockname() to learn the new port number
+  //   dut.Listen()
+  listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+  defer dut.Close(listenFD) // Tell the DUT to close the socket at the end of the test.
+
+  // Monitor a new TCP connection with sniffer, injector, sequence number tracking,
+  // and reasonable outgoing and incoming packet field default IPs, MACs, and port numbers.
+  conn := testbench.NewTCPIPv4(t, dut, remotePort)
+
+  // Perform a 3-way handshake: send SYN, expect SYNACK, send ACK.
+  conn.Handshake()
+
+  // Tell the DUT to accept the new connection.
+  acceptFD := dut.Accept(acceptFd)
+}
+```
+
+## Other notes
+
+*   The time between receiving a SYN-ACK and replying with an ACK in `Handshake`
+    is about 3ms. This is much slower than the native unix response, which is
+    about 0.3ms. Packetdrill gets closer to 0.3ms. For tests where timing is
+    crucial, packetdrill is faster and more precise.
diff --git a/test/packetimpact/dut/BUILD b/test/packetimpact/dut/BUILD
new file mode 100644
index 000000000..3ce63c2c6
--- /dev/null
+++ b/test/packetimpact/dut/BUILD
@@ -0,0 +1,18 @@
+load("//tools:defs.bzl", "cc_binary", "grpcpp")
+
+package(
+    default_visibility = ["//test/packetimpact:__subpackages__"],
+    licenses = ["notice"],
+)
+
+cc_binary(
+    name = "posix_server",
+    srcs = ["posix_server.cc"],
+    linkstatic = 1,
+    static = True,  # This is needed for running in a docker container.
+    deps = [
+        grpcpp,
+        "//test/packetimpact/proto:posix_server_cc_grpc_proto",
+        "//test/packetimpact/proto:posix_server_cc_proto",
+    ],
+)
diff --git a/test/packetimpact/dut/posix_server.cc b/test/packetimpact/dut/posix_server.cc
new file mode 100644
index 000000000..2f10dda40
--- /dev/null
+++ b/test/packetimpact/dut/posix_server.cc
@@ -0,0 +1,229 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at //
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <getopt.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <unordered_map>
+
+#include "arpa/inet.h"
+#include "include/grpcpp/security/server_credentials.h"
+#include "include/grpcpp/server_builder.h"
+#include "test/packetimpact/proto/posix_server.grpc.pb.h"
+#include "test/packetimpact/proto/posix_server.pb.h"
+
+// Converts a sockaddr_storage to a Sockaddr message.
+::grpc::Status sockaddr_to_proto(const sockaddr_storage &addr,
+                                 socklen_t addrlen,
+                                 posix_server::Sockaddr *sockaddr_proto) {
+  switch (addr.ss_family) {
+    case AF_INET: {
+      auto addr_in = reinterpret_cast<const sockaddr_in *>(&addr);
+      auto response_in = sockaddr_proto->mutable_in();
+      response_in->set_family(addr_in->sin_family);
+      response_in->set_port(ntohs(addr_in->sin_port));
+      response_in->mutable_addr()->assign(
+          reinterpret_cast<const char *>(&addr_in->sin_addr.s_addr), 4);
+      return ::grpc::Status::OK;
+    }
+    case AF_INET6: {
+      auto addr_in6 = reinterpret_cast<const sockaddr_in6 *>(&addr);
+      auto response_in6 = sockaddr_proto->mutable_in6();
+      response_in6->set_family(addr_in6->sin6_family);
+      response_in6->set_port(ntohs(addr_in6->sin6_port));
+      response_in6->set_flowinfo(ntohl(addr_in6->sin6_flowinfo));
+      response_in6->mutable_addr()->assign(
+          reinterpret_cast<const char *>(&addr_in6->sin6_addr.s6_addr), 16);
+      response_in6->set_scope_id(ntohl(addr_in6->sin6_scope_id));
+      return ::grpc::Status::OK;
+    }
+  }
+  return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Unknown Sockaddr");
+}
+
+class PosixImpl final : public posix_server::Posix::Service {
+  ::grpc::Status Socket(grpc_impl::ServerContext *context,
+                        const ::posix_server::SocketRequest *request,
+                        ::posix_server::SocketResponse *response) override {
+    response->set_fd(
+        socket(request->domain(), request->type(), request->protocol()));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Bind(grpc_impl::ServerContext *context,
+                      const ::posix_server::BindRequest *request,
+                      ::posix_server::BindResponse *response) override {
+    if (!request->has_addr()) {
+      return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
+                            "Missing address");
+    }
+    sockaddr_storage addr;
+
+    switch (request->addr().sockaddr_case()) {
+      case posix_server::Sockaddr::SockaddrCase::kIn: {
+        auto request_in = request->addr().in();
+        if (request_in.addr().size() != 4) {
+          return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
+                                "IPv4 address must be 4 bytes");
+        }
+        auto addr_in = reinterpret_cast<sockaddr_in *>(&addr);
+        addr_in->sin_family = request_in.family();
+        addr_in->sin_port = htons(request_in.port());
+        request_in.addr().copy(
+            reinterpret_cast<char *>(&addr_in->sin_addr.s_addr), 4);
+        break;
+      }
+      case posix_server::Sockaddr::SockaddrCase::kIn6: {
+        auto request_in6 = request->addr().in6();
+        if (request_in6.addr().size() != 16) {
+          return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
+                                "IPv6 address must be 16 bytes");
+        }
+        auto addr_in6 = reinterpret_cast<sockaddr_in6 *>(&addr);
+        addr_in6->sin6_family = request_in6.family();
+        addr_in6->sin6_port = htons(request_in6.port());
+        addr_in6->sin6_flowinfo = htonl(request_in6.flowinfo());
+        request_in6.addr().copy(
+            reinterpret_cast<char *>(&addr_in6->sin6_addr.s6_addr), 16);
+        addr_in6->sin6_scope_id = htonl(request_in6.scope_id());
+        break;
+      }
+      case posix_server::Sockaddr::SockaddrCase::SOCKADDR_NOT_SET:
+      default:
+        return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
+                              "Unknown Sockaddr");
+    }
+    response->set_ret(bind(request->sockfd(),
+                           reinterpret_cast<sockaddr *>(&addr), sizeof(addr)));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status GetSockName(
+      grpc_impl::ServerContext *context,
+      const ::posix_server::GetSockNameRequest *request,
+      ::posix_server::GetSockNameResponse *response) override {
+    sockaddr_storage addr;
+    socklen_t addrlen = sizeof(addr);
+    response->set_ret(getsockname(
+        request->sockfd(), reinterpret_cast<sockaddr *>(&addr), &addrlen));
+    response->set_errno_(errno);
+    return sockaddr_to_proto(addr, addrlen, response->mutable_addr());
+  }
+
+  ::grpc::Status Listen(grpc_impl::ServerContext *context,
+                        const ::posix_server::ListenRequest *request,
+                        ::posix_server::ListenResponse *response) override {
+    response->set_ret(listen(request->sockfd(), request->backlog()));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Accept(grpc_impl::ServerContext *context,
+                        const ::posix_server::AcceptRequest *request,
+                        ::posix_server::AcceptResponse *response) override {
+    sockaddr_storage addr;
+    socklen_t addrlen = sizeof(addr);
+    response->set_fd(accept(request->sockfd(),
+                            reinterpret_cast<sockaddr *>(&addr), &addrlen));
+    response->set_errno_(errno);
+    return sockaddr_to_proto(addr, addrlen, response->mutable_addr());
+  }
+
+  ::grpc::Status SetSockOpt(
+      grpc_impl::ServerContext *context,
+      const ::posix_server::SetSockOptRequest *request,
+      ::posix_server::SetSockOptResponse *response) override {
+    response->set_ret(setsockopt(request->sockfd(), request->level(),
+                                 request->optname(), request->optval().c_str(),
+                                 request->optval().size()));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status SetSockOptTimeval(
+      ::grpc::ServerContext *context,
+      const ::posix_server::SetSockOptTimevalRequest *request,
+      ::posix_server::SetSockOptTimevalResponse *response) override {
+    timeval tv = {.tv_sec = static_cast<__time_t>(request->timeval().seconds()),
+                  .tv_usec = static_cast<__suseconds_t>(
+                      request->timeval().microseconds())};
+    response->set_ret(setsockopt(request->sockfd(), request->level(),
+                                 request->optname(), &tv, sizeof(tv)));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Close(grpc_impl::ServerContext *context,
+                       const ::posix_server::CloseRequest *request,
+                       ::posix_server::CloseResponse *response) override {
+    response->set_ret(close(request->fd()));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+};
+
+// Parse command line options. Returns a pointer to the first argument beyond
+// the options.
+void parse_command_line_options(int argc, char *argv[], std::string *ip,
+                                int *port) {
+  static struct option options[] = {{"ip", required_argument, NULL, 1},
+                                    {"port", required_argument, NULL, 2},
+                                    {0, 0, 0, 0}};
+
+  // Parse the arguments.
+  int c;
+  while ((c = getopt_long(argc, argv, "", options, NULL)) > 0) {
+    if (c == 1) {
+      *ip = optarg;
+    } else if (c == 2) {
+      *port = std::stoi(std::string(optarg));
+    }
+  }
+}
+
+void run_server(const std::string &ip, int port) {
+  PosixImpl posix_service;
+  grpc::ServerBuilder builder;
+  std::string server_address = ip + ":" + std::to_string(port);
+  // Set the authentication mechanism.
+  std::shared_ptr<grpc::ServerCredentials> creds =
+      grpc::InsecureServerCredentials();
+  builder.AddListeningPort(server_address, creds);
+  builder.RegisterService(&posix_service);
+
+  std::unique_ptr<grpc::Server> server(builder.BuildAndStart());
+  std::cerr << "Server listening on " << server_address << std::endl;
+  server->Wait();
+  std::cerr << "posix_server is finished." << std::endl;
+}
+
+int main(int argc, char *argv[]) {
+  std::cerr << "posix_server is starting." << std::endl;
+  std::string ip;
+  int port;
+  parse_command_line_options(argc, argv, &ip, &port);
+
+  std::cerr << "Got IP " << ip << " and port " << port << "." << std::endl;
+  run_server(ip, port);
+}
diff --git a/test/packetimpact/proto/BUILD b/test/packetimpact/proto/BUILD
new file mode 100644
index 000000000..4a4370f42
--- /dev/null
+++ b/test/packetimpact/proto/BUILD
@@ -0,0 +1,12 @@
+load("//tools:defs.bzl", "proto_library")
+
+package(
+    default_visibility = ["//test/packetimpact:__subpackages__"],
+    licenses = ["notice"],
+)
+
+proto_library(
+    name = "posix_server",
+    srcs = ["posix_server.proto"],
+    has_services = 1,
+)
diff --git a/test/packetimpact/proto/posix_server.proto b/test/packetimpact/proto/posix_server.proto
new file mode 100644
index 000000000..026876fc2
--- /dev/null
+++ b/test/packetimpact/proto/posix_server.proto
@@ -0,0 +1,150 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package posix_server;
+
+message SocketRequest {
+  int32 domain = 1;
+  int32 type = 2;
+  int32 protocol = 3;
+}
+
+message SocketResponse {
+  int32 fd = 1;
+  int32 errno_ = 2;
+}
+
+message SockaddrIn {
+  int32 family = 1;
+  uint32 port = 2;
+  bytes addr = 3;
+}
+
+message SockaddrIn6 {
+  uint32 family = 1;
+  uint32 port = 2;
+  uint32 flowinfo = 3;
+  bytes addr = 4;
+  uint32 scope_id = 5;
+}
+
+message Sockaddr {
+  oneof sockaddr {
+    SockaddrIn in = 1;
+    SockaddrIn6 in6 = 2;
+  }
+}
+
+message BindRequest {
+  int32 sockfd = 1;
+  Sockaddr addr = 2;
+}
+
+message BindResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;
+}
+
+message GetSockNameRequest {
+  int32 sockfd = 1;
+}
+
+message GetSockNameResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;
+  Sockaddr addr = 3;
+}
+
+message ListenRequest {
+  int32 sockfd = 1;
+  int32 backlog = 2;
+}
+
+message ListenResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;
+}
+
+message AcceptRequest {
+  int32 sockfd = 1;
+}
+
+message AcceptResponse {
+  int32 fd = 1;
+  int32 errno_ = 2;
+  Sockaddr addr = 3;
+}
+
+message SetSockOptRequest {
+  int32 sockfd = 1;
+  int32 level = 2;
+  int32 optname = 3;
+  bytes optval = 4;
+}
+
+message SetSockOptResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;
+}
+
+message Timeval {
+  int64 seconds = 1;
+  int64 microseconds = 2;
+}
+
+message SetSockOptTimevalRequest {
+  int32 sockfd = 1;
+  int32 level = 2;
+  int32 optname = 3;
+  Timeval timeval = 4;
+}
+
+message SetSockOptTimevalResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;
+}
+
+message CloseRequest {
+  int32 fd = 1;
+}
+
+message CloseResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;
+}
+
+service Posix {
+  // Call socket() on the DUT.
+  rpc Socket(SocketRequest) returns (SocketResponse);
+  // Call bind() on the DUT.
+  rpc Bind(BindRequest) returns (BindResponse);
+  // Call getsockname() on the DUT.
+  rpc GetSockName(GetSockNameRequest) returns (GetSockNameResponse);
+  // Call listen() on the DUT.
+  rpc Listen(ListenRequest) returns (ListenResponse);
+  // Call accept() on the DUT.
+  rpc Accept(AcceptRequest) returns (AcceptResponse);
+  // Call setsockopt() on the DUT.  You should prefer one of the other
+  // SetSockOpt* functions with a more structured optval or else you may get the
+  // encoding wrong, such as making a bad assumption about the server's word
+  // sizes or endianness.
+  rpc SetSockOpt(SetSockOptRequest) returns (SetSockOptResponse);
+  // Call setsockopt() on the DUT with a Timeval optval.
+  rpc SetSockOptTimeval(SetSockOptTimevalRequest)
+      returns (SetSockOptTimevalResponse);
+  // Call close() on the DUT.
+  rpc Close(CloseRequest) returns (CloseResponse);
+}
diff --git a/test/packetimpact/testbench/BUILD b/test/packetimpact/testbench/BUILD
new file mode 100644
index 000000000..a34c81fcc
--- /dev/null
+++ b/test/packetimpact/testbench/BUILD
@@ -0,0 +1,31 @@
+load("//tools:defs.bzl", "go_library")
+
+package(
+    default_visibility = ["//test/packetimpact:__subpackages__"],
+    licenses = ["notice"],
+)
+
+go_library(
+    name = "testbench",
+    srcs = [
+        "connections.go",
+        "dut.go",
+        "dut_client.go",
+        "layers.go",
+        "rawsockets.go",
+    ],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/seqnum",
+        "//pkg/usermem",
+        "//test/packetimpact/proto:posix_server_go_proto",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+        "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
+        "@com_github_imdario_mergo//:go_default_library",
+        "@com_github_mohae_deepcopy//:go_default_library",
+        "@org_golang_google_grpc//:go_default_library",
+        "@org_golang_google_grpc//keepalive:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
new file mode 100644
index 000000000..b7aa63934
--- /dev/null
+++ b/test/packetimpact/testbench/connections.go
@@ -0,0 +1,245 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package testbench has utilities to send and receive packets and also command
+// the DUT to run POSIX functions.
+package testbench
+
+import (
+	"flag"
+	"fmt"
+	"math/rand"
+	"net"
+	"testing"
+	"time"
+
+	"github.com/mohae/deepcopy"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+)
+
+var localIPv4 = flag.String("local_ipv4", "", "local IPv4 address for test packets")
+var remoteIPv4 = flag.String("remote_ipv4", "", "remote IPv4 address for test packets")
+var localMAC = flag.String("local_mac", "", "local mac address for test packets")
+var remoteMAC = flag.String("remote_mac", "", "remote mac address for test packets")
+
+// TCPIPv4 maintains state about a TCP/IPv4 connection.
+type TCPIPv4 struct {
+	outgoing     Layers
+	incoming     Layers
+	LocalSeqNum  seqnum.Value
+	RemoteSeqNum seqnum.Value
+	SynAck       *TCP
+	sniffer      Sniffer
+	injector     Injector
+	portPickerFD int
+	t            *testing.T
+}
+
+// pickPort makes a new socket and returns the socket FD and port. The caller
+// must close the FD when done with the port if there is no error.
+func pickPort() (int, uint16, error) {
+	fd, err := unix.Socket(unix.AF_INET, unix.SOCK_STREAM, 0)
+	if err != nil {
+		return -1, 0, err
+	}
+	var sa unix.SockaddrInet4
+	copy(sa.Addr[0:4], net.ParseIP(*localIPv4).To4())
+	if err := unix.Bind(fd, &sa); err != nil {
+		unix.Close(fd)
+		return -1, 0, err
+	}
+	newSockAddr, err := unix.Getsockname(fd)
+	if err != nil {
+		unix.Close(fd)
+		return -1, 0, err
+	}
+	newSockAddrInet4, ok := newSockAddr.(*unix.SockaddrInet4)
+	if !ok {
+		unix.Close(fd)
+		return -1, 0, fmt.Errorf("can't cast Getsockname result to SockaddrInet4")
+	}
+	return fd, uint16(newSockAddrInet4.Port), nil
+}
+
+// tcpLayerIndex is the position of the TCP layer in the TCPIPv4 connection. It
+// is the third, after Ethernet and IPv4.
+const tcpLayerIndex int = 2
+
+// NewTCPIPv4 creates a new TCPIPv4 connection with reasonable defaults.
+func NewTCPIPv4(t *testing.T, dut DUT, outgoingTCP, incomingTCP TCP) TCPIPv4 {
+	lMAC, err := tcpip.ParseMACAddress(*localMAC)
+	if err != nil {
+		t.Fatalf("can't parse localMAC %q: %s", *localMAC, err)
+	}
+
+	rMAC, err := tcpip.ParseMACAddress(*remoteMAC)
+	if err != nil {
+		t.Fatalf("can't parse remoteMAC %q: %s", *remoteMAC, err)
+	}
+
+	portPickerFD, localPort, err := pickPort()
+	if err != nil {
+		t.Fatalf("can't pick a port: %s", err)
+	}
+	lIP := tcpip.Address(net.ParseIP(*localIPv4).To4())
+	rIP := tcpip.Address(net.ParseIP(*remoteIPv4).To4())
+
+	sniffer, err := NewSniffer(t)
+	if err != nil {
+		t.Fatalf("can't make new sniffer: %s", err)
+	}
+
+	injector, err := NewInjector(t)
+	if err != nil {
+		t.Fatalf("can't make new injector: %s", err)
+	}
+
+	newOutgoingTCP := &TCP{
+		DataOffset: Uint8(header.TCPMinimumSize),
+		WindowSize: Uint16(32768),
+		SrcPort:    &localPort,
+	}
+	if err := newOutgoingTCP.merge(outgoingTCP); err != nil {
+		t.Fatalf("can't merge %v into %v: %s", outgoingTCP, newOutgoingTCP, err)
+	}
+	newIncomingTCP := &TCP{
+		DstPort: &localPort,
+	}
+	if err := newIncomingTCP.merge(incomingTCP); err != nil {
+		t.Fatalf("can't merge %v into %v: %s", incomingTCP, newIncomingTCP, err)
+	}
+	return TCPIPv4{
+		outgoing: Layers{
+			&Ether{SrcAddr: &lMAC, DstAddr: &rMAC},
+			&IPv4{SrcAddr: &lIP, DstAddr: &rIP},
+			newOutgoingTCP},
+		incoming: Layers{
+			&Ether{SrcAddr: &rMAC, DstAddr: &lMAC},
+			&IPv4{SrcAddr: &rIP, DstAddr: &lIP},
+			newIncomingTCP},
+		sniffer:      sniffer,
+		injector:     injector,
+		portPickerFD: portPickerFD,
+		t:            t,
+		LocalSeqNum:  seqnum.Value(rand.Uint32()),
+	}
+}
+
+// Close the injector and sniffer associated with this connection.
+func (conn *TCPIPv4) Close() {
+	conn.sniffer.Close()
+	conn.injector.Close()
+	if err := unix.Close(conn.portPickerFD); err != nil {
+		conn.t.Fatalf("can't close portPickerFD: %s", err)
+	}
+	conn.portPickerFD = -1
+}
+
+// Send a packet with reasonable defaults and override some fields by tcp.
+func (conn *TCPIPv4) Send(tcp TCP, additionalLayers ...Layer) {
+	if tcp.SeqNum == nil {
+		tcp.SeqNum = Uint32(uint32(conn.LocalSeqNum))
+	}
+	if tcp.AckNum == nil {
+		tcp.AckNum = Uint32(uint32(conn.RemoteSeqNum))
+	}
+	layersToSend := deepcopy.Copy(conn.outgoing).(Layers)
+	if err := layersToSend[tcpLayerIndex].(*TCP).merge(tcp); err != nil {
+		conn.t.Fatalf("can't merge %v into %v: %s", tcp, layersToSend[tcpLayerIndex], err)
+	}
+	layersToSend = append(layersToSend, additionalLayers...)
+	outBytes, err := layersToSend.toBytes()
+	if err != nil {
+		conn.t.Fatalf("can't build outgoing TCP packet: %s", err)
+	}
+	conn.injector.Send(outBytes)
+
+	// Compute the next TCP sequence number.
+	for i := tcpLayerIndex + 1; i < len(layersToSend); i++ {
+		conn.LocalSeqNum.UpdateForward(seqnum.Size(layersToSend[i].length()))
+	}
+	if tcp.Flags != nil && *tcp.Flags&(header.TCPFlagSyn|header.TCPFlagFin) != 0 {
+		conn.LocalSeqNum.UpdateForward(1)
+	}
+}
+
+// Recv gets a packet from the sniffer within the timeout provided. If no packet
+// arrives before the timeout, it returns nil.
+func (conn *TCPIPv4) Recv(timeout time.Duration) *TCP {
+	deadline := time.Now().Add(timeout)
+	for {
+		timeout = deadline.Sub(time.Now())
+		if timeout <= 0 {
+			break
+		}
+		b := conn.sniffer.Recv(timeout)
+		if b == nil {
+			break
+		}
+		layers, err := ParseEther(b)
+		if err != nil {
+			continue // Ignore packets that can't be parsed.
+		}
+		if !conn.incoming.match(layers) {
+			continue // Ignore packets that don't match the expected incoming.
+		}
+		tcpHeader := (layers[tcpLayerIndex]).(*TCP)
+		conn.RemoteSeqNum = seqnum.Value(*tcpHeader.SeqNum)
+		if *tcpHeader.Flags&(header.TCPFlagSyn|header.TCPFlagFin) != 0 {
+			conn.RemoteSeqNum.UpdateForward(1)
+		}
+		for i := tcpLayerIndex + 1; i < len(layers); i++ {
+			conn.RemoteSeqNum.UpdateForward(seqnum.Size(layers[i].length()))
+		}
+		return tcpHeader
+	}
+	return nil
+}
+
+// Expect a packet that matches the provided tcp within the timeout specified.
+// If it doesn't arrive in time, the test fails.
+func (conn *TCPIPv4) Expect(tcp TCP, timeout time.Duration) *TCP {
+	deadline := time.Now().Add(timeout)
+	for {
+		timeout = deadline.Sub(time.Now())
+		if timeout <= 0 {
+			return nil
+		}
+		gotTCP := conn.Recv(timeout)
+		if gotTCP == nil {
+			return nil
+		}
+		if tcp.match(gotTCP) {
+			return gotTCP
+		}
+	}
+}
+
+// Handshake performs a TCP 3-way handshake.
+func (conn *TCPIPv4) Handshake() {
+	// Send the SYN.
+	conn.Send(TCP{Flags: Uint8(header.TCPFlagSyn)})
+
+	// Wait for the SYN-ACK.
+	conn.SynAck = conn.Expect(TCP{Flags: Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second)
+	if conn.SynAck == nil {
+		conn.t.Fatalf("didn't get synack during handshake")
+	}
+
+	// Send an ACK.
+	conn.Send(TCP{Flags: Uint8(header.TCPFlagAck)})
+}
diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go
new file mode 100644
index 000000000..8ea1706d3
--- /dev/null
+++ b/test/packetimpact/testbench/dut.go
@@ -0,0 +1,363 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testbench
+
+import (
+	"context"
+	"flag"
+	"net"
+	"strconv"
+	"syscall"
+	"testing"
+	"time"
+
+	pb "gvisor.dev/gvisor/test/packetimpact/proto/posix_server_go_proto"
+
+	"golang.org/x/sys/unix"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/keepalive"
+)
+
+var (
+	posixServerIP   = flag.String("posix_server_ip", "", "ip address to listen to for UDP commands")
+	posixServerPort = flag.Int("posix_server_port", 40000, "port to listen to for UDP commands")
+	rpcTimeout      = flag.Duration("rpc_timeout", 100*time.Millisecond, "gRPC timeout")
+	rpcKeepalive    = flag.Duration("rpc_keepalive", 10*time.Second, "gRPC keepalive")
+)
+
+// DUT communicates with the DUT to force it to make POSIX calls.
+type DUT struct {
+	t           *testing.T
+	conn        *grpc.ClientConn
+	posixServer PosixClient
+}
+
+// NewDUT creates a new connection with the DUT over gRPC.
+func NewDUT(t *testing.T) DUT {
+	flag.Parse()
+	posixServerAddress := *posixServerIP + ":" + strconv.Itoa(*posixServerPort)
+	conn, err := grpc.Dial(posixServerAddress, grpc.WithInsecure(), grpc.WithKeepaliveParams(keepalive.ClientParameters{Timeout: *rpcKeepalive}))
+	if err != nil {
+		t.Fatalf("failed to grpc.Dial(%s): %s", posixServerAddress, err)
+	}
+	posixServer := NewPosixClient(conn)
+	return DUT{
+		t:           t,
+		conn:        conn,
+		posixServer: posixServer,
+	}
+}
+
+// TearDown closes the underlying connection.
+func (dut *DUT) TearDown() {
+	dut.conn.Close()
+}
+
+// SocketWithErrno calls socket on the DUT and returns the fd and errno.
+func (dut *DUT) SocketWithErrno(domain, typ, proto int32) (int32, error) {
+	dut.t.Helper()
+	req := pb.SocketRequest{
+		Domain:   domain,
+		Type:     typ,
+		Protocol: proto,
+	}
+	ctx := context.Background()
+	resp, err := dut.posixServer.Socket(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Socket: %s", err)
+	}
+	return resp.GetFd(), syscall.Errno(resp.GetErrno_())
+}
+
+// Socket calls socket on the DUT and returns the file descriptor. If socket
+// fails on the DUT, the test ends.
+func (dut *DUT) Socket(domain, typ, proto int32) int32 {
+	dut.t.Helper()
+	fd, err := dut.SocketWithErrno(domain, typ, proto)
+	if fd < 0 {
+		dut.t.Fatalf("failed to create socket: %s", err)
+	}
+	return fd
+}
+
+func (dut *DUT) sockaddrToProto(sa unix.Sockaddr) *pb.Sockaddr {
+	dut.t.Helper()
+	switch s := sa.(type) {
+	case *unix.SockaddrInet4:
+		return &pb.Sockaddr{
+			Sockaddr: &pb.Sockaddr_In{
+				In: &pb.SockaddrIn{
+					Family: unix.AF_INET,
+					Port:   uint32(s.Port),
+					Addr:   s.Addr[:],
+				},
+			},
+		}
+	case *unix.SockaddrInet6:
+		return &pb.Sockaddr{
+			Sockaddr: &pb.Sockaddr_In6{
+				In6: &pb.SockaddrIn6{
+					Family:   unix.AF_INET6,
+					Port:     uint32(s.Port),
+					Flowinfo: 0,
+					ScopeId:  s.ZoneId,
+					Addr:     s.Addr[:],
+				},
+			},
+		}
+	}
+	dut.t.Fatalf("can't parse Sockaddr: %+v", sa)
+	return nil
+}
+
+func (dut *DUT) protoToSockaddr(sa *pb.Sockaddr) unix.Sockaddr {
+	dut.t.Helper()
+	switch s := sa.Sockaddr.(type) {
+	case *pb.Sockaddr_In:
+		ret := unix.SockaddrInet4{
+			Port: int(s.In.GetPort()),
+		}
+		copy(ret.Addr[:], s.In.GetAddr())
+		return &ret
+	case *pb.Sockaddr_In6:
+		ret := unix.SockaddrInet6{
+			Port:   int(s.In6.GetPort()),
+			ZoneId: s.In6.GetScopeId(),
+		}
+		copy(ret.Addr[:], s.In6.GetAddr())
+	}
+	dut.t.Fatalf("can't parse Sockaddr: %+v", sa)
+	return nil
+}
+
+// BindWithErrno calls bind on the DUT.
+func (dut *DUT) BindWithErrno(fd int32, sa unix.Sockaddr) (int32, error) {
+	dut.t.Helper()
+	req := pb.BindRequest{
+		Sockfd: fd,
+		Addr:   dut.sockaddrToProto(sa),
+	}
+	ctx := context.Background()
+	resp, err := dut.posixServer.Bind(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Bind: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
+// Bind calls bind on the DUT and causes a fatal test failure if it doesn't
+// succeed.
+func (dut *DUT) Bind(fd int32, sa unix.Sockaddr) {
+	dut.t.Helper()
+	ret, err := dut.BindWithErrno(fd, sa)
+	if ret != 0 {
+		dut.t.Fatalf("failed to bind socket: %s", err)
+	}
+}
+
+// GetSockNameWithErrno calls getsockname on the DUT.
+func (dut *DUT) GetSockNameWithErrno(sockfd int32) (int32, unix.Sockaddr, error) {
+	dut.t.Helper()
+	req := pb.GetSockNameRequest{
+		Sockfd: sockfd,
+	}
+	ctx := context.Background()
+	resp, err := dut.posixServer.GetSockName(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Bind: %s", err)
+	}
+	return resp.GetRet(), dut.protoToSockaddr(resp.GetAddr()), syscall.Errno(resp.GetErrno_())
+}
+
+// GetSockName calls getsockname on the DUT and causes a fatal test failure if
+// it doens't succeed.
+func (dut *DUT) GetSockName(sockfd int32) unix.Sockaddr {
+	dut.t.Helper()
+	ret, sa, err := dut.GetSockNameWithErrno(sockfd)
+	if ret != 0 {
+		dut.t.Fatalf("failed to getsockname: %s", err)
+	}
+	return sa
+}
+
+// ListenWithErrno calls listen on the DUT.
+func (dut *DUT) ListenWithErrno(sockfd, backlog int32) (int32, error) {
+	dut.t.Helper()
+	req := pb.ListenRequest{
+		Sockfd:  sockfd,
+		Backlog: backlog,
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	resp, err := dut.posixServer.Listen(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Listen: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
+// Listen calls listen on the DUT and causes a fatal test failure if it doesn't
+// succeed.
+func (dut *DUT) Listen(sockfd, backlog int32) {
+	dut.t.Helper()
+	ret, err := dut.ListenWithErrno(sockfd, backlog)
+	if ret != 0 {
+		dut.t.Fatalf("failed to listen: %s", err)
+	}
+}
+
+// AcceptWithErrno calls accept on the DUT.
+func (dut *DUT) AcceptWithErrno(sockfd int32) (int32, unix.Sockaddr, error) {
+	dut.t.Helper()
+	req := pb.AcceptRequest{
+		Sockfd: sockfd,
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	resp, err := dut.posixServer.Accept(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Accept: %s", err)
+	}
+	return resp.GetFd(), dut.protoToSockaddr(resp.GetAddr()), syscall.Errno(resp.GetErrno_())
+}
+
+// Accept calls accept on the DUT and causes a fatal test failure if it doesn't
+// succeed.
+func (dut *DUT) Accept(sockfd int32) (int32, unix.Sockaddr) {
+	dut.t.Helper()
+	fd, sa, err := dut.AcceptWithErrno(sockfd)
+	if fd < 0 {
+		dut.t.Fatalf("failed to accept: %s", err)
+	}
+	return fd, sa
+}
+
+// SetSockOptWithErrno calls setsockopt on the DUT.
+func (dut *DUT) SetSockOptWithErrno(sockfd, level, optname int32, optval []byte) (int32, error) {
+	dut.t.Helper()
+	req := pb.SetSockOptRequest{
+		Sockfd:  sockfd,
+		Level:   level,
+		Optname: optname,
+		Optval:  optval,
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	resp, err := dut.posixServer.SetSockOpt(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call SetSockOpt: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
+// SetSockOpt calls setsockopt on the DUT and causes a fatal test failure if it
+// doesn't succeed.
+func (dut *DUT) SetSockOpt(sockfd, level, optname int32, optval []byte) {
+	dut.t.Helper()
+	ret, err := dut.SetSockOptWithErrno(sockfd, level, optname, optval)
+	if ret != 0 {
+		dut.t.Fatalf("failed to SetSockOpt: %s", err)
+	}
+}
+
+// SetSockOptTimevalWithErrno calls setsockopt with the timeval converted to
+// bytes.
+func (dut *DUT) SetSockOptTimevalWithErrno(sockfd, level, optname int32, tv *unix.Timeval) (int32, error) {
+	dut.t.Helper()
+	timeval := pb.Timeval{
+		Seconds:      int64(tv.Sec),
+		Microseconds: int64(tv.Usec),
+	}
+	req := pb.SetSockOptTimevalRequest{
+		Sockfd:  sockfd,
+		Level:   level,
+		Optname: optname,
+		Timeval: &timeval,
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	resp, err := dut.posixServer.SetSockOptTimeval(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call SetSockOptTimeval: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
+// SetSockOptTimeval calls setsockopt on the DUT and causes a fatal test failure
+// if it doesn't succeed.
+func (dut *DUT) SetSockOptTimeval(sockfd, level, optname int32, tv *unix.Timeval) {
+	dut.t.Helper()
+	ret, err := dut.SetSockOptTimevalWithErrno(sockfd, level, optname, tv)
+	if ret != 0 {
+		dut.t.Fatalf("failed to SetSockOptTimeval: %s", err)
+	}
+}
+
+// CloseWithErrno calls close on the DUT.
+func (dut *DUT) CloseWithErrno(fd int32) (int32, error) {
+	dut.t.Helper()
+	req := pb.CloseRequest{
+		Fd: fd,
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	resp, err := dut.posixServer.Close(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Close: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
+// Close calls close on the DUT and causes a fatal test failure if it doesn't
+// succeed.
+func (dut *DUT) Close(fd int32) {
+	dut.t.Helper()
+	ret, err := dut.CloseWithErrno(fd)
+	if ret != 0 {
+		dut.t.Fatalf("failed to close: %s", err)
+	}
+}
+
+// CreateListener makes a new TCP connection.  If it fails, the test ends.
+func (dut *DUT) CreateListener(typ, proto, backlog int32) (int32, uint16) {
+	dut.t.Helper()
+	addr := net.ParseIP(*remoteIPv4)
+	var fd int32
+	if addr.To4() != nil {
+		fd = dut.Socket(unix.AF_INET, typ, proto)
+		sa := unix.SockaddrInet4{}
+		copy(sa.Addr[:], addr.To4())
+		dut.Bind(fd, &sa)
+	} else if addr.To16() != nil {
+		fd = dut.Socket(unix.AF_INET6, typ, proto)
+		sa := unix.SockaddrInet6{}
+		copy(sa.Addr[:], addr.To16())
+		dut.Bind(fd, &sa)
+	} else {
+		dut.t.Fatal("unknown ip addr type for remoteIP")
+	}
+	sa := dut.GetSockName(fd)
+	var port int
+	switch s := sa.(type) {
+	case *unix.SockaddrInet4:
+		port = s.Port
+	case *unix.SockaddrInet6:
+		port = s.Port
+	default:
+		dut.t.Fatalf("unknown sockaddr type from getsockname: %t", sa)
+	}
+	dut.Listen(fd, backlog)
+	return fd, uint16(port)
+}
diff --git a/test/packetimpact/testbench/dut_client.go b/test/packetimpact/testbench/dut_client.go
new file mode 100644
index 000000000..b130a33a2
--- /dev/null
+++ b/test/packetimpact/testbench/dut_client.go
@@ -0,0 +1,28 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testbench
+
+import (
+	"google.golang.org/grpc"
+	pb "gvisor.dev/gvisor/test/packetimpact/proto/posix_server_go_proto"
+)
+
+// PosixClient is a gRPC client for the Posix service.
+type PosixClient pb.PosixClient
+
+// NewPosixClient makes a new gRPC client for the Posix service.
+func NewPosixClient(c grpc.ClientConnInterface) PosixClient {
+	return pb.NewPosixClient(c)
+}
diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go
new file mode 100644
index 000000000..35fa4dcb6
--- /dev/null
+++ b/test/packetimpact/testbench/layers.go
@@ -0,0 +1,507 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testbench
+
+import (
+	"fmt"
+	"reflect"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	"github.com/imdario/mergo"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// Layer is the interface that all encapsulations must implement.
+//
+// A Layer is an encapsulation in a packet, such as TCP, IPv4, IPv6, etc. A
+// Layer contains all the fields of the encapsulation. Each field is a pointer
+// and may be nil.
+type Layer interface {
+	// toBytes converts the Layer into bytes. In places where the Layer's field
+	// isn't nil, the value that is pointed to is used. When the field is nil, a
+	// reasonable default for the Layer is used. For example, "64" for IPv4 TTL
+	// and a calculated checksum for TCP or IP. Some layers require information
+	// from the previous or next layers in order to compute a default, such as
+	// TCP's checksum or Ethernet's type, so each Layer has a doubly-linked list
+	// to the layer's neighbors.
+	toBytes() ([]byte, error)
+
+	// match checks if the current Layer matches the provided Layer. If either
+	// Layer has a nil in a given field, that field is considered matching.
+	// Otherwise, the values pointed to by the fields must match.
+	match(Layer) bool
+
+	// length in bytes of the current encapsulation
+	length() int
+
+	// next gets a pointer to the encapsulated Layer.
+	next() Layer
+
+	// prev gets a pointer to the Layer encapsulating this one.
+	prev() Layer
+
+	// setNext sets the pointer to the encapsulated Layer.
+	setNext(Layer)
+
+	// setPrev sets the pointer to the Layer encapsulating this one.
+	setPrev(Layer)
+}
+
+// LayerBase is the common elements of all layers.
+type LayerBase struct {
+	nextLayer Layer
+	prevLayer Layer
+}
+
+func (lb *LayerBase) next() Layer {
+	return lb.nextLayer
+}
+
+func (lb *LayerBase) prev() Layer {
+	return lb.prevLayer
+}
+
+func (lb *LayerBase) setNext(l Layer) {
+	lb.nextLayer = l
+}
+
+func (lb *LayerBase) setPrev(l Layer) {
+	lb.prevLayer = l
+}
+
+func equalLayer(x, y Layer) bool {
+	opt := cmp.FilterValues(func(x, y interface{}) bool {
+		if reflect.ValueOf(x).Kind() == reflect.Ptr && reflect.ValueOf(x).IsNil() {
+			return true
+		}
+		if reflect.ValueOf(y).Kind() == reflect.Ptr && reflect.ValueOf(y).IsNil() {
+			return true
+		}
+		return false
+
+	}, cmp.Ignore())
+	return cmp.Equal(x, y, opt, cmpopts.IgnoreUnexported(LayerBase{}))
+}
+
+// Ether can construct and match the ethernet encapsulation.
+type Ether struct {
+	LayerBase
+	SrcAddr *tcpip.LinkAddress
+	DstAddr *tcpip.LinkAddress
+	Type    *tcpip.NetworkProtocolNumber
+}
+
+func (l *Ether) toBytes() ([]byte, error) {
+	b := make([]byte, header.EthernetMinimumSize)
+	h := header.Ethernet(b)
+	fields := &header.EthernetFields{}
+	if l.SrcAddr != nil {
+		fields.SrcAddr = *l.SrcAddr
+	}
+	if l.DstAddr != nil {
+		fields.DstAddr = *l.DstAddr
+	}
+	if l.Type != nil {
+		fields.Type = *l.Type
+	} else {
+		switch n := l.next().(type) {
+		case *IPv4:
+			fields.Type = header.IPv4ProtocolNumber
+		default:
+			// TODO(b/150301488): Support more protocols, like IPv6.
+			return nil, fmt.Errorf("can't deduce the ethernet header's next protocol: %d", n)
+		}
+	}
+	h.Encode(fields)
+	return h, nil
+}
+
+// LinkAddress is a helper routine that allocates a new tcpip.LinkAddress value
+// to store v and returns a pointer to it.
+func LinkAddress(v tcpip.LinkAddress) *tcpip.LinkAddress {
+	return &v
+}
+
+// NetworkProtocolNumber is a helper routine that allocates a new
+// tcpip.NetworkProtocolNumber value to store v and returns a pointer to it.
+func NetworkProtocolNumber(v tcpip.NetworkProtocolNumber) *tcpip.NetworkProtocolNumber {
+	return &v
+}
+
+// ParseEther parses the bytes assuming that they start with an ethernet header
+// and continues parsing further encapsulations.
+func ParseEther(b []byte) (Layers, error) {
+	h := header.Ethernet(b)
+	ether := Ether{
+		SrcAddr: LinkAddress(h.SourceAddress()),
+		DstAddr: LinkAddress(h.DestinationAddress()),
+		Type:    NetworkProtocolNumber(h.Type()),
+	}
+	layers := Layers{&ether}
+	switch h.Type() {
+	case header.IPv4ProtocolNumber:
+		moreLayers, err := ParseIPv4(b[ether.length():])
+		if err != nil {
+			return nil, err
+		}
+		return append(layers, moreLayers...), nil
+	default:
+		// TODO(b/150301488): Support more protocols, like IPv6.
+		return nil, fmt.Errorf("can't deduce the ethernet header's next protocol: %v", b)
+	}
+}
+
+func (l *Ether) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *Ether) length() int {
+	return header.EthernetMinimumSize
+}
+
+// IPv4 can construct and match the ethernet excapulation.
+type IPv4 struct {
+	LayerBase
+	IHL            *uint8
+	TOS            *uint8
+	TotalLength    *uint16
+	ID             *uint16
+	Flags          *uint8
+	FragmentOffset *uint16
+	TTL            *uint8
+	Protocol       *uint8
+	Checksum       *uint16
+	SrcAddr        *tcpip.Address
+	DstAddr        *tcpip.Address
+}
+
+func (l *IPv4) toBytes() ([]byte, error) {
+	b := make([]byte, header.IPv4MinimumSize)
+	h := header.IPv4(b)
+	fields := &header.IPv4Fields{
+		IHL:            20,
+		TOS:            0,
+		TotalLength:    0,
+		ID:             0,
+		Flags:          0,
+		FragmentOffset: 0,
+		TTL:            64,
+		Protocol:       0,
+		Checksum:       0,
+		SrcAddr:        tcpip.Address(""),
+		DstAddr:        tcpip.Address(""),
+	}
+	if l.TOS != nil {
+		fields.TOS = *l.TOS
+	}
+	if l.TotalLength != nil {
+		fields.TotalLength = *l.TotalLength
+	} else {
+		fields.TotalLength = uint16(l.length())
+		current := l.next()
+		for current != nil {
+			fields.TotalLength += uint16(current.length())
+			current = current.next()
+		}
+	}
+	if l.ID != nil {
+		fields.ID = *l.ID
+	}
+	if l.Flags != nil {
+		fields.Flags = *l.Flags
+	}
+	if l.FragmentOffset != nil {
+		fields.FragmentOffset = *l.FragmentOffset
+	}
+	if l.TTL != nil {
+		fields.TTL = *l.TTL
+	}
+	if l.Protocol != nil {
+		fields.Protocol = *l.Protocol
+	} else {
+		switch n := l.next().(type) {
+		case *TCP:
+			fields.Protocol = uint8(header.TCPProtocolNumber)
+		default:
+			// TODO(b/150301488): Support more protocols, like UDP.
+			return nil, fmt.Errorf("can't deduce the ip header's next protocol: %+v", n)
+		}
+	}
+	if l.SrcAddr != nil {
+		fields.SrcAddr = *l.SrcAddr
+	}
+	if l.DstAddr != nil {
+		fields.DstAddr = *l.DstAddr
+	}
+	if l.Checksum != nil {
+		fields.Checksum = *l.Checksum
+	}
+	h.Encode(fields)
+	if l.Checksum == nil {
+		h.SetChecksum(^h.CalculateChecksum())
+	}
+	return h, nil
+}
+
+// Uint16 is a helper routine that allocates a new
+// uint16 value to store v and returns a pointer to it.
+func Uint16(v uint16) *uint16 {
+	return &v
+}
+
+// Uint8 is a helper routine that allocates a new
+// uint8 value to store v and returns a pointer to it.
+func Uint8(v uint8) *uint8 {
+	return &v
+}
+
+// Address is a helper routine that allocates a new tcpip.Address value to store
+// v and returns a pointer to it.
+func Address(v tcpip.Address) *tcpip.Address {
+	return &v
+}
+
+// ParseIPv4 parses the bytes assuming that they start with an ipv4 header and
+// continues parsing further encapsulations.
+func ParseIPv4(b []byte) (Layers, error) {
+	h := header.IPv4(b)
+	tos, _ := h.TOS()
+	ipv4 := IPv4{
+		IHL:            Uint8(h.HeaderLength()),
+		TOS:            &tos,
+		TotalLength:    Uint16(h.TotalLength()),
+		ID:             Uint16(h.ID()),
+		Flags:          Uint8(h.Flags()),
+		FragmentOffset: Uint16(h.FragmentOffset()),
+		TTL:            Uint8(h.TTL()),
+		Protocol:       Uint8(h.Protocol()),
+		Checksum:       Uint16(h.Checksum()),
+		SrcAddr:        Address(h.SourceAddress()),
+		DstAddr:        Address(h.DestinationAddress()),
+	}
+	layers := Layers{&ipv4}
+	switch h.Protocol() {
+	case uint8(header.TCPProtocolNumber):
+		moreLayers, err := ParseTCP(b[ipv4.length():])
+		if err != nil {
+			return nil, err
+		}
+		return append(layers, moreLayers...), nil
+	}
+	return nil, fmt.Errorf("can't deduce the ethernet header's next protocol: %d", h.Protocol())
+}
+
+func (l *IPv4) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *IPv4) length() int {
+	if l.IHL == nil {
+		return header.IPv4MinimumSize
+	}
+	return int(*l.IHL)
+}
+
+// TCP can construct and match the TCP excapulation.
+type TCP struct {
+	LayerBase
+	SrcPort       *uint16
+	DstPort       *uint16
+	SeqNum        *uint32
+	AckNum        *uint32
+	DataOffset    *uint8
+	Flags         *uint8
+	WindowSize    *uint16
+	Checksum      *uint16
+	UrgentPointer *uint16
+}
+
+func (l *TCP) toBytes() ([]byte, error) {
+	b := make([]byte, header.TCPMinimumSize)
+	h := header.TCP(b)
+	if l.SrcPort != nil {
+		h.SetSourcePort(*l.SrcPort)
+	}
+	if l.DstPort != nil {
+		h.SetDestinationPort(*l.DstPort)
+	}
+	if l.SeqNum != nil {
+		h.SetSequenceNumber(*l.SeqNum)
+	}
+	if l.AckNum != nil {
+		h.SetAckNumber(*l.AckNum)
+	}
+	if l.DataOffset != nil {
+		h.SetDataOffset(*l.DataOffset)
+	}
+	if l.Flags != nil {
+		h.SetFlags(*l.Flags)
+	}
+	if l.WindowSize != nil {
+		h.SetWindowSize(*l.WindowSize)
+	}
+	if l.UrgentPointer != nil {
+		h.SetUrgentPoiner(*l.UrgentPointer)
+	}
+	if l.Checksum != nil {
+		h.SetChecksum(*l.Checksum)
+		return h, nil
+	}
+	if err := setChecksum(&h, l); err != nil {
+		return nil, err
+	}
+	return h, nil
+}
+
+// setChecksum calculates the checksum of the TCP header and sets it in h.
+func setChecksum(h *header.TCP, tcp *TCP) error {
+	h.SetChecksum(0)
+	tcpLength := uint16(tcp.length())
+	current := tcp.next()
+	for current != nil {
+		tcpLength += uint16(current.length())
+		current = current.next()
+	}
+
+	var xsum uint16
+	switch s := tcp.prev().(type) {
+	case *IPv4:
+		xsum = header.PseudoHeaderChecksum(header.TCPProtocolNumber, *s.SrcAddr, *s.DstAddr, tcpLength)
+	default:
+		// TODO(b/150301488): Support more protocols, like IPv6.
+		return fmt.Errorf("can't get src and dst addr from previous layer")
+	}
+	current = tcp.next()
+	for current != nil {
+		payload, err := current.toBytes()
+		if err != nil {
+			return fmt.Errorf("can't get bytes for next header: %s", payload)
+		}
+		xsum = header.Checksum(payload, xsum)
+		current = current.next()
+	}
+	h.SetChecksum(^h.CalculateChecksum(xsum))
+	return nil
+}
+
+// Uint32 is a helper routine that allocates a new
+// uint32 value to store v and returns a pointer to it.
+func Uint32(v uint32) *uint32 {
+	return &v
+}
+
+// ParseTCP parses the bytes assuming that they start with a tcp header and
+// continues parsing further encapsulations.
+func ParseTCP(b []byte) (Layers, error) {
+	h := header.TCP(b)
+	tcp := TCP{
+		SrcPort:       Uint16(h.SourcePort()),
+		DstPort:       Uint16(h.DestinationPort()),
+		SeqNum:        Uint32(h.SequenceNumber()),
+		AckNum:        Uint32(h.AckNumber()),
+		DataOffset:    Uint8(h.DataOffset()),
+		Flags:         Uint8(h.Flags()),
+		WindowSize:    Uint16(h.WindowSize()),
+		Checksum:      Uint16(h.Checksum()),
+		UrgentPointer: Uint16(h.UrgentPointer()),
+	}
+	layers := Layers{&tcp}
+	moreLayers, err := ParsePayload(b[tcp.length():])
+	if err != nil {
+		return nil, err
+	}
+	return append(layers, moreLayers...), nil
+}
+
+func (l *TCP) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *TCP) length() int {
+	if l.DataOffset == nil {
+		return header.TCPMinimumSize
+	}
+	return int(*l.DataOffset)
+}
+
+// merge overrides the values in l with the values from other but only in fields
+// where the value is not nil.
+func (l *TCP) merge(other TCP) error {
+	return mergo.Merge(l, other, mergo.WithOverride)
+}
+
+// Payload has bytes beyond OSI layer 4.
+type Payload struct {
+	LayerBase
+	Bytes []byte
+}
+
+// ParsePayload parses the bytes assuming that they start with a payload and
+// continue to the end. There can be no further encapsulations.
+func ParsePayload(b []byte) (Layers, error) {
+	payload := Payload{
+		Bytes: b,
+	}
+	return Layers{&payload}, nil
+}
+
+func (l *Payload) toBytes() ([]byte, error) {
+	return l.Bytes, nil
+}
+
+func (l *Payload) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *Payload) length() int {
+	return len(l.Bytes)
+}
+
+// Layers is an array of Layer and supports similar functions to Layer.
+type Layers []Layer
+
+func (ls *Layers) toBytes() ([]byte, error) {
+	for i, l := range *ls {
+		if i > 0 {
+			l.setPrev((*ls)[i-1])
+		}
+		if i+1 < len(*ls) {
+			l.setNext((*ls)[i+1])
+		}
+	}
+	outBytes := []byte{}
+	for _, l := range *ls {
+		layerBytes, err := l.toBytes()
+		if err != nil {
+			return nil, err
+		}
+		outBytes = append(outBytes, layerBytes...)
+	}
+	return outBytes, nil
+}
+
+func (ls *Layers) match(other Layers) bool {
+	if len(*ls) > len(other) {
+		return false
+	}
+	for i := 0; i < len(*ls); i++ {
+		if !equalLayer((*ls)[i], other[i]) {
+			return false
+		}
+	}
+	return true
+}
diff --git a/test/packetimpact/testbench/rawsockets.go b/test/packetimpact/testbench/rawsockets.go
new file mode 100644
index 000000000..0c7d0f979
--- /dev/null
+++ b/test/packetimpact/testbench/rawsockets.go
@@ -0,0 +1,151 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testbench
+
+import (
+	"encoding/binary"
+	"flag"
+	"math"
+	"net"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+var device = flag.String("device", "", "local device for test packets")
+
+// Sniffer can sniff raw packets on the wire.
+type Sniffer struct {
+	t  *testing.T
+	fd int
+}
+
+func htons(x uint16) uint16 {
+	buf := [2]byte{}
+	binary.BigEndian.PutUint16(buf[:], x)
+	return usermem.ByteOrder.Uint16(buf[:])
+}
+
+// NewSniffer creates a Sniffer connected to *device.
+func NewSniffer(t *testing.T) (Sniffer, error) {
+	flag.Parse()
+	snifferFd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, int(htons(unix.ETH_P_ALL)))
+	if err != nil {
+		return Sniffer{}, err
+	}
+	return Sniffer{
+		t:  t,
+		fd: snifferFd,
+	}, nil
+}
+
+// maxReadSize should be large enough for the maximum frame size in bytes. If a
+// packet too large for the buffer arrives, the test will get a fatal error.
+const maxReadSize int = 65536
+
+// Recv tries to read one frame until the timeout is up.
+func (s *Sniffer) Recv(timeout time.Duration) []byte {
+	deadline := time.Now().Add(timeout)
+	for {
+		timeout = deadline.Sub(time.Now())
+		if timeout <= 0 {
+			return nil
+		}
+		whole, frac := math.Modf(timeout.Seconds())
+		tv := unix.Timeval{
+			Sec:  int64(whole),
+			Usec: int64(frac * float64(time.Microsecond/time.Second)),
+		}
+
+		if err := unix.SetsockoptTimeval(s.fd, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &tv); err != nil {
+			s.t.Fatalf("can't setsockopt SO_RCVTIMEO: %s", err)
+		}
+
+		buf := make([]byte, maxReadSize)
+		nread, _, err := unix.Recvfrom(s.fd, buf, unix.MSG_TRUNC)
+		if err == unix.EINTR || err == unix.EAGAIN {
+			// There was a timeout.
+			continue
+		}
+		if err != nil {
+			s.t.Fatalf("can't read: %s", err)
+		}
+		if nread > maxReadSize {
+			s.t.Fatalf("received a truncated frame of %d bytes", nread)
+		}
+		return buf[:nread]
+	}
+}
+
+// Close the socket that Sniffer is using.
+func (s *Sniffer) Close() {
+	if err := unix.Close(s.fd); err != nil {
+		s.t.Fatalf("can't close sniffer socket: %s", err)
+	}
+	s.fd = -1
+}
+
+// Injector can inject raw frames.
+type Injector struct {
+	t  *testing.T
+	fd int
+}
+
+// NewInjector creates a new injector on *device.
+func NewInjector(t *testing.T) (Injector, error) {
+	flag.Parse()
+	ifInfo, err := net.InterfaceByName(*device)
+	if err != nil {
+		return Injector{}, err
+	}
+
+	var haddr [8]byte
+	copy(haddr[:], ifInfo.HardwareAddr)
+	sa := unix.SockaddrLinklayer{
+		Protocol: unix.ETH_P_IP,
+		Ifindex:  ifInfo.Index,
+		Halen:    uint8(len(ifInfo.HardwareAddr)),
+		Addr:     haddr,
+	}
+
+	injectFd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, int(htons(unix.ETH_P_ALL)))
+	if err != nil {
+		return Injector{}, err
+	}
+	if err := unix.Bind(injectFd, &sa); err != nil {
+		return Injector{}, err
+	}
+	return Injector{
+		t:  t,
+		fd: injectFd,
+	}, nil
+}
+
+// Send a raw frame.
+func (i *Injector) Send(b []byte) {
+	if _, err := unix.Write(i.fd, b); err != nil {
+		i.t.Fatalf("can't write: %s", err)
+	}
+}
+
+// Close the underlying socket.
+func (i *Injector) Close() {
+	if err := unix.Close(i.fd); err != nil {
+		i.t.Fatalf("can't close sniffer socket: %s", err)
+	}
+	i.fd = -1
+}
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
new file mode 100644
index 000000000..1dff2a4d5
--- /dev/null
+++ b/test/packetimpact/tests/BUILD
@@ -0,0 +1,21 @@
+load("defs.bzl", "packetimpact_go_test")
+
+package(
+    default_visibility = ["//test/packetimpact:__subpackages__"],
+    licenses = ["notice"],
+)
+
+packetimpact_go_test(
+    name = "fin_wait2_timeout",
+    srcs = ["fin_wait2_timeout_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+sh_binary(
+    name = "test_runner",
+    srcs = ["test_runner.sh"],
+)
diff --git a/test/packetimpact/tests/Dockerfile b/test/packetimpact/tests/Dockerfile
new file mode 100644
index 000000000..507030cc7
--- /dev/null
+++ b/test/packetimpact/tests/Dockerfile
@@ -0,0 +1,5 @@
+FROM ubuntu:bionic
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y iptables netcat tcpdump iproute2 tshark
+RUN hash -r
+CMD /bin/bash
diff --git a/test/packetimpact/tests/defs.bzl b/test/packetimpact/tests/defs.bzl
new file mode 100644
index 000000000..3baac567a
--- /dev/null
+++ b/test/packetimpact/tests/defs.bzl
@@ -0,0 +1,106 @@
+"""Defines rules for packetimpact test targets."""
+
+load("//tools:defs.bzl", "go_test")
+
+def _packetimpact_test_impl(ctx):
+    test_runner = ctx.executable._test_runner
+    bench = ctx.actions.declare_file("%s-bench" % ctx.label.name)
+    bench_content = "\n".join([
+        "#!/bin/bash",
+        # This test will run part in a distinct user namespace. This can cause
+        # permission problems, because all runfiles may not be owned by the
+        # current user, and no other users will be mapped in that namespace.
+        # Make sure that everything is readable here.
+        "find . -type f -exec chmod a+rx {} \\;",
+        "find . -type d -exec chmod a+rx {} \\;",
+        "%s %s --posix_server_binary %s --testbench_binary %s $@\n" % (
+            test_runner.short_path,
+            " ".join(ctx.attr.flags),
+            ctx.files._posix_server_binary[0].short_path,
+            ctx.files.testbench_binary[0].short_path,
+        ),
+    ])
+    ctx.actions.write(bench, bench_content, is_executable = True)
+
+    transitive_files = depset()
+    if hasattr(ctx.attr._test_runner, "data_runfiles"):
+        transitive_files = depset(ctx.attr._test_runner.data_runfiles.files)
+    runfiles = ctx.runfiles(
+        files = [test_runner] + ctx.files.testbench_binary + ctx.files._posix_server_binary,
+        transitive_files = transitive_files,
+        collect_default = True,
+        collect_data = True,
+    )
+    return [DefaultInfo(executable = bench, runfiles = runfiles)]
+
+_packetimpact_test = rule(
+    attrs = {
+        "_test_runner": attr.label(
+            executable = True,
+            cfg = "target",
+            default = ":test_runner",
+        ),
+        "_posix_server_binary": attr.label(
+            cfg = "target",
+            default = "//test/packetimpact/dut:posix_server",
+        ),
+        "testbench_binary": attr.label(
+            cfg = "target",
+            mandatory = True,
+        ),
+        "flags": attr.string_list(
+            mandatory = False,
+            default = [],
+        ),
+    },
+    test = True,
+    implementation = _packetimpact_test_impl,
+)
+
+PACKETIMPACT_TAGS = ["local", "manual"]
+
+def packetimpact_linux_test(name, testbench_binary, **kwargs):
+    """Add a packetimpact test on linux.
+
+    Args:
+        name: name of the test
+        testbench_binary: the testbench binary
+        **kwargs: all the other args, forwarded to _packetimpact_test
+    """
+    _packetimpact_test(
+        name = name + "_linux_test",
+        testbench_binary = testbench_binary,
+        flags = ["--dut_platform", "linux"],
+        tags = PACKETIMPACT_TAGS,
+        **kwargs
+    )
+
+def packetimpact_netstack_test(name, testbench_binary, **kwargs):
+    """Add a packetimpact test on netstack.
+
+    Args:
+        name: name of the test
+        testbench_binary: the testbench binary
+        **kwargs: all the other args, forwarded to _packetimpact_test
+    """
+    _packetimpact_test(
+        name = name + "_netstack_test",
+        testbench_binary = testbench_binary,
+        # This is the default runtime unless
+        # "--test_arg=--runtime=OTHER_RUNTIME" is used to override the value.
+        flags = ["--dut_platform", "netstack", "--runtime=runsc-d"],
+        tags = PACKETIMPACT_TAGS,
+        **kwargs
+    )
+
+def packetimpact_go_test(name, size = "small", pure = True, **kwargs):
+    testbench_binary = name + "_test"
+    go_test(
+        name = testbench_binary,
+        size = size,
+        pure = pure,
+        tags = PACKETIMPACT_TAGS,
+        **kwargs
+    )
+    packetimpact_linux_test(name = name, testbench_binary = testbench_binary)
+    packetimpact_netstack_test(name = name, testbench_binary = testbench_binary)
diff --git a/test/packetimpact/tests/fin_wait2_timeout_test.go b/test/packetimpact/tests/fin_wait2_timeout_test.go
new file mode 100644
index 000000000..5f54e67ed
--- /dev/null
+++ b/test/packetimpact/tests/fin_wait2_timeout_test.go
@@ -0,0 +1,68 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fin_wait2_timeout_test
+
+import (
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func TestFinWait2Timeout(t *testing.T) {
+	for _, tt := range []struct {
+		description string
+		linger2     bool
+	}{
+		{"WithLinger2", true},
+		{"WithoutLinger2", false},
+	} {
+		t.Run(tt.description, func(t *testing.T) {
+			dut := tb.NewDUT(t)
+			defer dut.TearDown()
+			listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+			defer dut.Close(listenFd)
+			conn := tb.NewTCPIPv4(t, dut, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+			defer conn.Close()
+			conn.Handshake()
+
+			acceptFd, _ := dut.Accept(listenFd)
+			if tt.linger2 {
+				tv := unix.Timeval{Sec: 1, Usec: 0}
+				dut.SetSockOptTimeval(acceptFd, unix.SOL_TCP, unix.TCP_LINGER2, &tv)
+			}
+			dut.Close(acceptFd)
+
+			if gotOne := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); gotOne == nil {
+				t.Fatal("expected a FIN-ACK within 1 second but got none")
+			}
+			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+
+			time.Sleep(5 * time.Second)
+			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+			if tt.linger2 {
+				if gotOne := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, time.Second); gotOne == nil {
+					t.Fatal("expected a RST packet within a second but got none")
+				}
+			} else {
+				if gotOne := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, 10*time.Second); gotOne != nil {
+					t.Fatal("expected no RST packets within ten seconds but got one")
+				}
+			}
+		})
+	}
+}
diff --git a/test/packetimpact/tests/test_runner.sh b/test/packetimpact/tests/test_runner.sh
new file mode 100755
index 000000000..5281cb53d
--- /dev/null
+++ b/test/packetimpact/tests/test_runner.sh
@@ -0,0 +1,246 @@
+#!/bin/bash
+
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Run a packetimpact test.  Two docker containers are made, one for the
+# Device-Under-Test (DUT) and one for the test bench.  Each is attached with
+# two networks, one for control packets that aid the test and one for test
+# packets which are sent as part of the test and observed for correctness.
+
+set -euxo pipefail
+
+function failure() {
+  local lineno=$1
+  local msg=$2
+  local filename="$0"
+  echo "FAIL: $filename:$lineno: $msg"
+}
+trap 'failure ${LINENO} "$BASH_COMMAND"' ERR
+
+declare -r LONGOPTS="dut_platform:,posix_server_binary:,testbench_binary:,runtime:,tshark"
+
+# Don't use declare below so that the error from getopt will end the script.
+PARSED=$(getopt --options "" --longoptions=$LONGOPTS --name "$0" -- "$@")
+
+eval set -- "$PARSED"
+
+while true; do
+  case "$1" in
+    --dut_platform)
+      # Either "linux" or "netstack".
+      declare -r DUT_PLATFORM="$2"
+      shift 2
+      ;;
+    --posix_server_binary)
+      declare -r POSIX_SERVER_BINARY="$2"
+      shift 2
+      ;;
+    --testbench_binary)
+      declare -r TESTBENCH_BINARY="$2"
+      shift 2
+      ;;
+    --runtime)
+      # Not readonly because there might be multiple --runtime arguments and we
+      # want to use just the last one.  Only used if --dut_platform is
+      # "netstack".
+      declare RUNTIME="$2"
+      shift 2
+      ;;
+    --tshark)
+      declare -r TSHARK="1"
+      shift 1
+      ;;
+    --)
+      shift
+      break
+      ;;
+    *)
+      echo "Programming error"
+      exit 3
+  esac
+done
+
+# All the other arguments are scripts.
+declare -r scripts="$@"
+
+# Check that the required flags are defined in a way that is safe for "set -u".
+if [[ "${DUT_PLATFORM-}" == "netstack" ]]; then
+  if [[ -z "${RUNTIME-}" ]]; then
+    echo "FAIL: Missing --runtime argument: ${RUNTIME-}"
+    exit 2
+  fi
+  declare -r RUNTIME_ARG="--runtime ${RUNTIME}"
+elif [[ "${DUT_PLATFORM-}" == "linux" ]]; then
+  declare -r RUNTIME_ARG=""
+else
+  echo "FAIL: Bad or missing --dut_platform argument: ${DUT_PLATFORM-}"
+  exit 2
+fi
+if [[ ! -f "${POSIX_SERVER_BINARY-}" ]]; then
+  echo "FAIL: Bad or missing --posix_server_binary: ${POSIX_SERVER-}"
+  exit 2
+fi
+if [[ ! -f "${TESTBENCH_BINARY-}" ]]; then
+  echo "FAIL: Bad or missing --testbench_binary: ${TESTBENCH_BINARY-}"
+  exit 2
+fi
+
+# Variables specific to the control network and interface start with CTRL_.
+# Variables specific to the test network and interface start with TEST_.
+# Variables specific to the DUT start with DUT_.
+# Variables specific to the test bench start with TESTBENCH_.
+# Use random numbers so that test networks don't collide.
+declare -r CTRL_NET="ctrl_net-${RANDOM}${RANDOM}"
+declare -r TEST_NET="test_net-${RANDOM}${RANDOM}"
+# On both DUT and test bench, testing packets are on the eth2 interface.
+declare -r TEST_DEVICE="eth2"
+# Number of bits in the *_NET_PREFIX variables.
+declare -r NET_MASK="24"
+function new_net_prefix() {
+  # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
+  echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)"
+}
+# Last bits of the DUT's IP address.
+declare -r DUT_NET_SUFFIX=".10"
+# Control port.
+declare -r CTRL_PORT="40000"
+# Last bits of the test bench's IP address.
+declare -r TESTBENCH_NET_SUFFIX=".20"
+declare -r TIMEOUT="60"
+declare -r IMAGE_TAG="gcr.io/gvisor-presubmit/packetimpact"
+# Make sure that docker is installed.
+docker --version
+
+function finish {
+  local cleanup_success=1
+  for net in "${CTRL_NET}" "${TEST_NET}"; do
+    # Kill all processes attached to ${net}.
+    for docker_command in "kill" "rm"; do
+      (docker network inspect "${net}" \
+        --format '{{range $key, $value := .Containers}}{{$key}} {{end}}' \
+        | xargs -r docker "${docker_command}") || \
+        cleanup_success=0
+    done
+    # Remove the network.
+    docker network rm "${net}" || \
+      cleanup_success=0
+  done
+
+  if ((!$cleanup_success)); then
+    echo "FAIL: Cleanup command failed"
+    exit 4
+  fi
+}
+trap finish EXIT
+
+# Subnet for control packets between test bench and DUT.
+declare CTRL_NET_PREFIX=$(new_net_prefix)
+while ! docker network create \
+  "--subnet=${CTRL_NET_PREFIX}.0/${NET_MASK}" "${CTRL_NET}"; do
+  sleep 0.1
+  declare CTRL_NET_PREFIX=$(new_net_prefix)
+done
+
+# Subnet for the packets that are part of the test.
+declare TEST_NET_PREFIX=$(new_net_prefix)
+while ! docker network create \
+  "--subnet=${TEST_NET_PREFIX}.0/${NET_MASK}" "${TEST_NET}"; do
+  sleep 0.1
+  declare TEST_NET_PREFIX=$(new_net_prefix)
+done
+
+docker pull "${IMAGE_TAG}"
+
+# Create the DUT container and connect to network.
+DUT=$(docker create ${RUNTIME_ARG} --privileged --rm \
+  --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG})
+docker network connect "${CTRL_NET}" \
+  --ip "${CTRL_NET_PREFIX}${DUT_NET_SUFFIX}" "${DUT}" \
+  || (docker kill ${DUT}; docker rm ${DUT}; false)
+docker network connect "${TEST_NET}" \
+  --ip "${TEST_NET_PREFIX}${DUT_NET_SUFFIX}" "${DUT}" \
+  || (docker kill ${DUT}; docker rm ${DUT}; false)
+docker start "${DUT}"
+
+# Create the test bench container and connect to network.
+TESTBENCH=$(docker create --privileged --rm \
+  --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG})
+docker network connect "${CTRL_NET}" \
+  --ip "${CTRL_NET_PREFIX}${TESTBENCH_NET_SUFFIX}" "${TESTBENCH}" \
+  || (docker kill ${TESTBENCH}; docker rm ${TESTBENCH}; false)
+docker network connect "${TEST_NET}" \
+  --ip "${TEST_NET_PREFIX}${TESTBENCH_NET_SUFFIX}" "${TESTBENCH}" \
+  || (docker kill ${TESTBENCH}; docker rm ${TESTBENCH}; false)
+docker start "${TESTBENCH}"
+
+# Start the posix_server in the DUT.
+declare -r DOCKER_POSIX_SERVER_BINARY="/$(basename ${POSIX_SERVER_BINARY})"
+docker cp -L ${POSIX_SERVER_BINARY} "${DUT}:${DOCKER_POSIX_SERVER_BINARY}"
+
+docker exec -t "${DUT}" \
+  /bin/bash -c "${DOCKER_POSIX_SERVER_BINARY} \
+  --ip ${CTRL_NET_PREFIX}${DUT_NET_SUFFIX} \
+  --port ${CTRL_PORT}" &
+
+# Because the Linux kernel receives the SYN-ACK but didn't send the SYN it will
+# issue a RST. To prevent this IPtables can be used to filter those out.
+docker exec "${TESTBENCH}" \
+  iptables -A INPUT -i ${TEST_DEVICE} -j DROP
+
+# Wait for the DUT server to come up.  Attempt to connect to it from the test
+# bench every 100 milliseconds until success.
+while ! docker exec "${TESTBENCH}" \
+  nc -zv "${CTRL_NET_PREFIX}${DUT_NET_SUFFIX}" "${CTRL_PORT}"; do
+  sleep 0.1
+done
+
+declare -r REMOTE_MAC=$(docker exec -t "${DUT}" ip link show \
+  "${TEST_DEVICE}" | tail -1 | cut -d' ' -f6)
+declare -r LOCAL_MAC=$(docker exec -t "${TESTBENCH}" ip link show \
+  "${TEST_DEVICE}" | tail -1 | cut -d' ' -f6)
+
+declare -r DOCKER_TESTBENCH_BINARY="/$(basename ${TESTBENCH_BINARY})"
+docker cp -L "${TESTBENCH_BINARY}" "${TESTBENCH}:${DOCKER_TESTBENCH_BINARY}"
+
+if [[ -z "${TSHARK-}" ]]; then
+  # Run tcpdump in the test bench unbuffered, without dns resolution, just on
+  # the interface with the test packets.
+  docker exec -t "${TESTBENCH}" \
+    tcpdump -S -vvv -U -n -i "${TEST_DEVICE}" net "${TEST_NET_PREFIX}/24" &
+else
+  # Run tshark in the test bench unbuffered, without dns resolution, just on the
+  # interface with the test packets.
+  docker exec -t "${TESTBENCH}" \
+    tshark -V -l -n -i "${TEST_DEVICE}" \
+    host "${TEST_NET_PREFIX}${TESTBENCH_NET_SUFFIX}" &
+fi
+
+# tcpdump and tshark take time to startup
+sleep 3
+
+# Start a packetimpact test on the test bench.  The packetimpact test sends and
+# receives packets and also sends POSIX socket commands to the posix_server to
+# be executed on the DUT.
+docker exec -t "${TESTBENCH}" \
+  /bin/bash -c "${DOCKER_TESTBENCH_BINARY} \
+  --posix_server_ip=${CTRL_NET_PREFIX}${DUT_NET_SUFFIX} \
+  --posix_server_port=${CTRL_PORT} \
+  --remote_ipv4=${TEST_NET_PREFIX}${DUT_NET_SUFFIX} \
+  --local_ipv4=${TEST_NET_PREFIX}${TESTBENCH_NET_SUFFIX} \
+  --remote_mac=${REMOTE_MAC} \
+  --local_mac=${LOCAL_MAC} \
+  --device=${TEST_DEVICE}"
+
+echo PASS: No errors.
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index 905b16d41..64171ad8d 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -2,15 +2,15 @@
 
 load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", _cc_flags_supplier = "cc_flags_supplier")
 load("@io_bazel_rules_go//go:def.bzl", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_test = "go_test", _go_tool_library = "go_tool_library")
-load("@io_bazel_rules_go//proto:def.bzl", _go_proto_library = "go_proto_library")
+load("@io_bazel_rules_go//proto:def.bzl", _go_grpc_library = "go_grpc_library", _go_proto_library = "go_proto_library")
 load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test")
 load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar")
 load("@io_bazel_rules_docker//go:image.bzl", _go_image = "go_image")
 load("@io_bazel_rules_docker//container:container.bzl", _container_image = "container_image")
 load("@pydeps//:requirements.bzl", _py_requirement = "requirement")
+load("@com_github_grpc_grpc//bazel:cc_grpc_library.bzl", _cc_grpc_library = "cc_grpc_library")
 
 container_image = _container_image
-cc_binary = _cc_binary
 cc_library = _cc_library
 cc_flags_supplier = _cc_flags_supplier
 cc_proto_library = _cc_proto_library
@@ -19,16 +19,70 @@ cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
 go_image = _go_image
 go_embed_data = _go_embed_data
 gtest = "@com_google_googletest//:gtest"
+grpcpp = "@com_github_grpc_grpc//:grpc++"
 gbenchmark = "@com_google_benchmark//:benchmark"
 loopback = "//tools/bazeldefs:loopback"
-proto_library = native.proto_library
 pkg_deb = _pkg_deb
 pkg_tar = _pkg_tar
 py_library = native.py_library
 py_binary = native.py_binary
 py_test = native.py_test
 
+def proto_library(name, has_services = None, **kwargs):
+    native.proto_library(
+        name = name,
+        **kwargs
+    )
+
+def cc_grpc_library(name, **kwargs):
+    _cc_grpc_library(name = name, grpc_only = True, **kwargs)
+
+def _go_proto_or_grpc_library(go_library_func, name, **kwargs):
+    deps = [
+        dep.replace("_proto", "_go_proto")
+        for dep in (kwargs.pop("deps", []) or [])
+    ]
+    go_library_func(
+        name = name + "_go_proto",
+        importpath = "gvisor.dev/gvisor/" + native.package_name() + "/" + name + "_go_proto",
+        proto = ":" + name + "_proto",
+        deps = deps,
+        **kwargs
+    )
+
+def go_proto_library(name, **kwargs):
+    _go_proto_or_grpc_library(_go_proto_library, name, **kwargs)
+
+def go_grpc_and_proto_libraries(name, **kwargs):
+    _go_proto_or_grpc_library(_go_grpc_library, name, **kwargs)
+
+def cc_binary(name, static = False, **kwargs):
+    """Run cc_binary.
+
+    Args:
+        name: name of the target.
+        static: make a static binary if True
+        **kwargs: the rest of the args.
+    """
+    if static:
+        if "linkopts" in kwargs:
+            kwargs["linkopts"] += ["-static", "-lstdc++"]
+        else:
+            kwargs["linkopts"] = ["-static", "-lstdc++"]
+    _cc_binary(
+        name = name,
+        **kwargs
+    )
+
 def go_binary(name, static = False, pure = False, **kwargs):
+    """Build a go binary.
+
+    Args:
+        name: name of the target.
+        static: build a static binary.
+        pure: build without cgo.
+        **kwargs: rest of the arguments are passed to _go_binary.
+    """
     if static:
         kwargs["static"] = "on"
     if pure:
@@ -52,18 +106,17 @@ def go_tool_library(name, **kwargs):
         **kwargs
     )
 
-def go_proto_library(name, proto, **kwargs):
-    deps = kwargs.pop("deps", [])
-    _go_proto_library(
-        name = name,
-        importpath = "gvisor.dev/gvisor/" + native.package_name() + "/" + name,
-        proto = proto,
-        deps = [dep.replace("_proto", "_go_proto") for dep in deps],
-        **kwargs
-    )
+def go_test(name, pure = False, library = None, **kwargs):
+    """Build a go test.
 
-def go_test(name, **kwargs):
-    library = kwargs.pop("library", None)
+    Args:
+        name: name of the output binary.
+        pure: should it be built without cgo.
+        library: the library to embed.
+        **kwargs: rest of the arguments to pass to _go_test.
+    """
+    if pure:
+        kwargs["pure"] = "on"
     if library:
         kwargs["embed"] = [library]
     _go_test(
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 15a310403..91d689a82 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,36 +7,39 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/bazeldefs:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/bazeldefs:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _grpcpp = "grpcpp", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
 load("//tools/bazeldefs:platforms.bzl", _default_platform = "default_platform", _platforms = "platforms")
 load("//tools/bazeldefs:tags.bzl", "go_suffixes")
 
 # Delegate directly.
 cc_binary = _cc_binary
+cc_flags_supplier = _cc_flags_supplier
+cc_grpc_library = _cc_grpc_library
 cc_library = _cc_library
 cc_test = _cc_test
 cc_toolchain = _cc_toolchain
-cc_flags_supplier = _cc_flags_supplier
 container_image = _container_image
+default_installer = _default_installer
+default_net_util = _default_net_util
+gbenchmark = _gbenchmark
 go_embed_data = _go_embed_data
 go_image = _go_image
 go_test = _go_test
 go_tool_library = _go_tool_library
 gtest = _gtest
-gbenchmark = _gbenchmark
+grpcpp = _grpcpp
+loopback = _loopback
 pkg_deb = _pkg_deb
 pkg_tar = _pkg_tar
-py_library = _py_library
 py_binary = _py_binary
-py_test = _py_test
+py_library = _py_library
 py_requirement = _py_requirement
+py_test = _py_test
 select_arch = _select_arch
 select_system = _select_system
-loopback = _loopback
-default_installer = _default_installer
-default_net_util = _default_net_util
-platforms = _platforms
+
 default_platform = _default_platform
+platforms = _platforms
 
 def go_binary(name, **kwargs):
     """Wraps the standard go_binary.
@@ -190,33 +193,52 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
                 **kwargs
             )
 
-def proto_library(name, srcs, **kwargs):
+def proto_library(name, srcs, deps = None, has_services = 0, **kwargs):
     """Wraps the standard proto_library.
 
-    Given a proto_library named "foo", this produces three different targets:
+    Given a proto_library named "foo", this produces up to five different
+    targets:
     - foo_proto: proto_library rule.
     - foo_go_proto: go_proto_library rule.
     - foo_cc_proto: cc_proto_library rule.
+    - foo_go_grpc_proto: go_grpc_library rule.
+    - foo_cc_grpc_proto: cc_grpc_library rule.
 
     Args:
+      name: the name to which _proto, _go_proto, etc, will be appended.
       srcs: the proto sources.
+      deps: for the proto library and the go_proto_library.
+      has_services: 1 to build gRPC code, otherwise 0.
       **kwargs: standard proto_library arguments.
     """
-    deps = kwargs.pop("deps", [])
     _proto_library(
         name = name + "_proto",
         srcs = srcs,
         deps = deps,
+        has_services = has_services,
         **kwargs
     )
-    _go_proto_library(
-        name = name + "_go_proto",
-        proto = ":" + name + "_proto",
-        deps = deps,
-        **kwargs
-    )
+    if has_services:
+        _go_grpc_and_proto_libraries(
+            name = name,
+            deps = deps,
+            **kwargs
+        )
+    else:
+        _go_proto_library(
+            name = name,
+            deps = deps,
+            **kwargs
+        )
     _cc_proto_library(
         name = name + "_cc_proto",
         deps = [":" + name + "_proto"],
         **kwargs
     )
+    if has_services:
+        _cc_grpc_library(
+            name = name + "_cc_grpc_proto",
+            srcs = [":" + name + "_proto"],
+            deps = [":" + name + "_cc_proto"],
+            **kwargs
+        )
-- 
cgit v1.2.3


From 42d78ba61bf93f927ee89099abf5ef27b394b0d9 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Tue, 17 Mar 2020 10:29:12 -0700
Subject: Remove HostFS from Sentry.

PiperOrigin-RevId: 301402181
---
 pkg/sentry/fs/host/BUILD               |   4 +-
 pkg/sentry/fs/host/descriptor.go       |  37 +---
 pkg/sentry/fs/host/descriptor_state.go |   2 +-
 pkg/sentry/fs/host/descriptor_test.go  |   4 +-
 pkg/sentry/fs/host/file.go             |   4 +-
 pkg/sentry/fs/host/fs.go               | 339 -----------------------------
 pkg/sentry/fs/host/fs_test.go          | 380 ---------------------------------
 pkg/sentry/fs/host/host.go             |  59 +++++
 pkg/sentry/fs/host/inode.go            | 141 ++----------
 pkg/sentry/fs/host/inode_state.go      |  32 +--
 pkg/sentry/fs/host/inode_test.go       |  66 ------
 pkg/sentry/fs/host/util.go             |  84 +-------
 pkg/sentry/fs/host/util_unsafe.go      |  41 ----
 13 files changed, 94 insertions(+), 1099 deletions(-)
 delete mode 100644 pkg/sentry/fs/host/fs.go
 delete mode 100644 pkg/sentry/fs/host/fs_test.go
 create mode 100644 pkg/sentry/fs/host/host.go

diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index 21003ea45..011625c80 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -10,7 +10,7 @@ go_library(
         "descriptor_state.go",
         "device.go",
         "file.go",
-        "fs.go",
+        "host.go",
         "inode.go",
         "inode_state.go",
         "ioctl_unsafe.go",
@@ -62,14 +62,12 @@ go_test(
     size = "small",
     srcs = [
         "descriptor_test.go",
-        "fs_test.go",
         "inode_test.go",
         "socket_test.go",
         "wait_test.go",
     ],
     library = ":host",
     deps = [
-        "//pkg/context",
         "//pkg/fd",
         "//pkg/fdnotifier",
         "//pkg/sentry/contexttest",
diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go
index 2a4d1b291..cfdce6a74 100644
--- a/pkg/sentry/fs/host/descriptor.go
+++ b/pkg/sentry/fs/host/descriptor.go
@@ -16,7 +16,6 @@ package host
 
 import (
 	"fmt"
-	"path"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/fdnotifier"
@@ -28,12 +27,9 @@ import (
 //
 // +stateify savable
 type descriptor struct {
-	// donated is true if the host fd was donated by another process.
-	donated bool
-
 	// If origFD >= 0, it is the host fd that this file was originally created
 	// from, which must be available at time of restore. The FD can be closed
-	// after descriptor is created. Only set if donated is true.
+	// after descriptor is created.
 	origFD int
 
 	// wouldBlock is true if value (below) points to a file that can
@@ -41,15 +37,13 @@ type descriptor struct {
 	wouldBlock bool
 
 	// value is the wrapped host fd. It is never saved or restored
-	// directly. How it is restored depends on whether it was
-	// donated and the fs.MountSource it was originally
-	// opened/created from.
+	// directly.
 	value int `state:"nosave"`
 }
 
 // newDescriptor returns a wrapped host file descriptor. On success,
 // the descriptor is registered for event notifications with queue.
-func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *waiter.Queue) (*descriptor, error) {
+func newDescriptor(fd int, saveable bool, wouldBlock bool, queue *waiter.Queue) (*descriptor, error) {
 	ownedFD := fd
 	origFD := -1
 	if saveable {
@@ -69,7 +63,6 @@ func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *
 		}
 	}
 	return &descriptor{
-		donated:    donated,
 		origFD:     origFD,
 		wouldBlock: wouldBlock,
 		value:      ownedFD,
@@ -77,25 +70,11 @@ func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *
 }
 
 // initAfterLoad initializes the value of the descriptor after Load.
-func (d *descriptor) initAfterLoad(mo *superOperations, id uint64, queue *waiter.Queue) error {
-	if d.donated {
-		var err error
-		d.value, err = syscall.Dup(d.origFD)
-		if err != nil {
-			return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err)
-		}
-	} else {
-		name, ok := mo.inodeMappings[id]
-		if !ok {
-			return fmt.Errorf("failed to find path for inode number %d", id)
-		}
-		fullpath := path.Join(mo.root, name)
-
-		var err error
-		d.value, err = open(nil, fullpath)
-		if err != nil {
-			return fmt.Errorf("failed to open %q: %v", fullpath, err)
-		}
+func (d *descriptor) initAfterLoad(id uint64, queue *waiter.Queue) error {
+	var err error
+	d.value, err = syscall.Dup(d.origFD)
+	if err != nil {
+		return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err)
 	}
 	if d.wouldBlock {
 		if err := syscall.SetNonblock(d.value, true); err != nil {
diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go
index 8167390a9..e880582ab 100644
--- a/pkg/sentry/fs/host/descriptor_state.go
+++ b/pkg/sentry/fs/host/descriptor_state.go
@@ -16,7 +16,7 @@ package host
 
 // beforeSave is invoked by stateify.
 func (d *descriptor) beforeSave() {
-	if d.donated && d.origFD < 0 {
+	if d.origFD < 0 {
 		panic("donated file descriptor cannot be saved")
 	}
 }
diff --git a/pkg/sentry/fs/host/descriptor_test.go b/pkg/sentry/fs/host/descriptor_test.go
index 4205981f5..d8e4605b6 100644
--- a/pkg/sentry/fs/host/descriptor_test.go
+++ b/pkg/sentry/fs/host/descriptor_test.go
@@ -47,10 +47,10 @@ func TestDescriptorRelease(t *testing.T) {
 
 			// FD ownership is transferred to the descritor.
 			queue := &waiter.Queue{}
-			d, err := newDescriptor(fd, false /* donated*/, tc.saveable, tc.wouldBlock, queue)
+			d, err := newDescriptor(fd, tc.saveable, tc.wouldBlock, queue)
 			if err != nil {
 				syscall.Close(fd)
-				t.Fatalf("newDescriptor(%d, %t, false, %t, queue) failed, err: %v", fd, tc.saveable, tc.wouldBlock, err)
+				t.Fatalf("newDescriptor(%d, %t, %t, queue) failed, err: %v", fd, tc.saveable, tc.wouldBlock, err)
 			}
 			if tc.saveable {
 				if d.origFD < 0 {
diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go
index e08f56d04..034862694 100644
--- a/pkg/sentry/fs/host/file.go
+++ b/pkg/sentry/fs/host/file.go
@@ -101,8 +101,8 @@ func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner
 		})
 		return s, nil
 	default:
-		msrc := newMountSource(ctx, "/", mounter, &Filesystem{}, fs.MountSourceFlags{}, false /* dontTranslateOwnership */)
-		inode, err := newInode(ctx, msrc, donated, saveable, true /* donated */)
+		msrc := fs.NewNonCachingMountSource(ctx, &filesystem{}, fs.MountSourceFlags{})
+		inode, err := newInode(ctx, msrc, donated, saveable)
 		if err != nil {
 			return nil, err
 		}
diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go
deleted file mode 100644
index d3e8e3a36..000000000
--- a/pkg/sentry/fs/host/fs.go
+++ /dev/null
@@ -1,339 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package host implements an fs.Filesystem for files backed by host
-// file descriptors.
-package host
-
-import (
-	"fmt"
-	"path"
-	"path/filepath"
-	"strconv"
-	"strings"
-
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-)
-
-// FilesystemName is the name under which Filesystem is registered.
-const FilesystemName = "whitelistfs"
-
-const (
-	// whitelistKey is the mount option containing a comma-separated list
-	// of host paths to whitelist.
-	whitelistKey = "whitelist"
-
-	// rootPathKey is the mount option containing the root path of the
-	// mount.
-	rootPathKey = "root"
-
-	// dontTranslateOwnershipKey is the key to superOperations.dontTranslateOwnership.
-	dontTranslateOwnershipKey = "dont_translate_ownership"
-)
-
-// maxTraversals determines link traversals in building the whitelist.
-const maxTraversals = 10
-
-// Filesystem is a pseudo file system that is only available during the setup
-// to lock down the configurations. This filesystem should only be mounted at root.
-//
-// Think twice before exposing this to applications.
-//
-// +stateify savable
-type Filesystem struct {
-	// whitelist is a set of host paths to whitelist.
-	paths []string
-}
-
-var _ fs.Filesystem = (*Filesystem)(nil)
-
-// Name is the identifier of this file system.
-func (*Filesystem) Name() string {
-	return FilesystemName
-}
-
-// AllowUserMount prohibits users from using mount(2) with this file system.
-func (*Filesystem) AllowUserMount() bool {
-	return false
-}
-
-// AllowUserList allows this filesystem to be listed in /proc/filesystems.
-func (*Filesystem) AllowUserList() bool {
-	return true
-}
-
-// Flags returns that there is nothing special about this file system.
-func (*Filesystem) Flags() fs.FilesystemFlags {
-	return 0
-}
-
-// Mount returns an fs.Inode exposing the host file system.  It is intended to be locked
-// down in PreExec below.
-func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) {
-	// Parse generic comma-separated key=value options.
-	options := fs.GenericMountSourceOptions(data)
-
-	// Grab the whitelist if one was specified.
-	// TODO(edahlgren/mpratt/hzy): require another option "testonly" in order to allow
-	// no whitelist.
-	if wl, ok := options[whitelistKey]; ok {
-		f.paths = strings.Split(wl, "|")
-		delete(options, whitelistKey)
-	}
-
-	// If the rootPath was set, use it. Othewise default to the root of the
-	// host fs.
-	rootPath := "/"
-	if rp, ok := options[rootPathKey]; ok {
-		rootPath = rp
-		delete(options, rootPathKey)
-
-		// We must relativize the whitelisted paths to the new root.
-		for i, p := range f.paths {
-			rel, err := filepath.Rel(rootPath, p)
-			if err != nil {
-				return nil, fmt.Errorf("whitelist path %q must be a child of root path %q", p, rootPath)
-			}
-			f.paths[i] = path.Join("/", rel)
-		}
-	}
-	fd, err := open(nil, rootPath)
-	if err != nil {
-		return nil, fmt.Errorf("failed to find root: %v", err)
-	}
-
-	var dontTranslateOwnership bool
-	if v, ok := options[dontTranslateOwnershipKey]; ok {
-		b, err := strconv.ParseBool(v)
-		if err != nil {
-			return nil, fmt.Errorf("invalid value for %q: %v", dontTranslateOwnershipKey, err)
-		}
-		dontTranslateOwnership = b
-		delete(options, dontTranslateOwnershipKey)
-	}
-
-	// Fail if the caller passed us more options than we know about.
-	if len(options) > 0 {
-		return nil, fmt.Errorf("unsupported mount options: %v", options)
-	}
-
-	// The mounting EUID/EGID will be cached by this file system. This will
-	// be used to assign ownership to files that we own.
-	owner := fs.FileOwnerFromContext(ctx)
-
-	// Construct the host file system mount and inode.
-	msrc := newMountSource(ctx, rootPath, owner, f, flags, dontTranslateOwnership)
-	return newInode(ctx, msrc, fd, false /* saveable */, false /* donated */)
-}
-
-// InstallWhitelist locks down the MountNamespace to only the currently installed
-// Dirents and the given paths.
-func (f *Filesystem) InstallWhitelist(ctx context.Context, m *fs.MountNamespace) error {
-	return installWhitelist(ctx, m, f.paths)
-}
-
-func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) error {
-	if len(paths) == 0 || (len(paths) == 1 && paths[0] == "") {
-		// Warning will be logged during filter installation if the empty
-		// whitelist matters (allows for host file access).
-		return nil
-	}
-
-	// Done tracks entries already added.
-	done := make(map[string]bool)
-	root := m.Root()
-	defer root.DecRef()
-
-	for i := 0; i < len(paths); i++ {
-		// Make sure the path is absolute. This is a sanity check.
-		if !path.IsAbs(paths[i]) {
-			return fmt.Errorf("path %q is not absolute", paths[i])
-		}
-
-		// We need to add all the intermediate paths, in case one of
-		// them is a symlink that needs to be resolved.
-		for j := 1; j <= len(paths[i]); j++ {
-			if j < len(paths[i]) && paths[i][j] != '/' {
-				continue
-			}
-			current := paths[i][:j]
-
-			// Lookup the given component in the tree.
-			remainingTraversals := uint(maxTraversals)
-			d, err := m.FindLink(ctx, root, nil, current, &remainingTraversals)
-			if err != nil {
-				log.Warningf("populate failed for %q: %v", current, err)
-				continue
-			}
-
-			// It's critical that this DecRef happens after the
-			// freeze below. This ensures that the dentry is in
-			// place to be frozen. Otherwise, we freeze without
-			// these entries.
-			defer d.DecRef()
-
-			// Expand the last component if necessary.
-			if current == paths[i] {
-				// Is it a directory or symlink?
-				sattr := d.Inode.StableAttr
-				if fs.IsDir(sattr) {
-					for name := range childDentAttrs(ctx, d) {
-						paths = append(paths, path.Join(current, name))
-					}
-				}
-				if fs.IsSymlink(sattr) {
-					// Only expand symlinks once. The
-					// folder structure may contain
-					// recursive symlinks and we don't want
-					// to end up infinitely expanding this
-					// symlink. This is safe because this
-					// is the last component. If a later
-					// path wants to symlink something
-					// beneath this symlink that will still
-					// be handled by the FindLink above.
-					if done[current] {
-						continue
-					}
-
-					s, err := d.Inode.Readlink(ctx)
-					if err != nil {
-						log.Warningf("readlink failed for %q: %v", current, err)
-						continue
-					}
-					if path.IsAbs(s) {
-						paths = append(paths, s)
-					} else {
-						target := path.Join(path.Dir(current), s)
-						paths = append(paths, target)
-					}
-				}
-			}
-
-			// Only report this one once even though we may look
-			// it up more than once. If we whitelist /a/b,/a then
-			// /a will be "done" when it is looked up for /a/b,
-			// however we still need to expand all of its contents
-			// when whitelisting /a.
-			if !done[current] {
-				log.Debugf("whitelisted: %s", current)
-			}
-			done[current] = true
-		}
-	}
-
-	// Freeze the mount tree in place. This prevents any new paths from
-	// being opened and any old ones from being removed. If we do provide
-	// tmpfs mounts, we'll want to freeze/thaw those separately.
-	m.Freeze()
-	return nil
-}
-
-func childDentAttrs(ctx context.Context, d *fs.Dirent) map[string]fs.DentAttr {
-	dirname, _ := d.FullName(nil /* root */)
-	dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
-	if err != nil {
-		log.Warningf("failed to open directory %q: %v", dirname, err)
-		return nil
-	}
-	dir.DecRef()
-	var stubSerializer fs.CollectEntriesSerializer
-	if err := dir.Readdir(ctx, &stubSerializer); err != nil {
-		log.Warningf("failed to iterate on host directory %q: %v", dirname, err)
-		return nil
-	}
-	delete(stubSerializer.Entries, ".")
-	delete(stubSerializer.Entries, "..")
-	return stubSerializer.Entries
-}
-
-// newMountSource constructs a new host fs.MountSource
-// relative to a root path. The root should match the mount point.
-func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, filesystem fs.Filesystem, flags fs.MountSourceFlags, dontTranslateOwnership bool) *fs.MountSource {
-	return fs.NewMountSource(ctx, &superOperations{
-		root:                   root,
-		inodeMappings:          make(map[uint64]string),
-		mounter:                mounter,
-		dontTranslateOwnership: dontTranslateOwnership,
-	}, filesystem, flags)
-}
-
-// superOperations implements fs.MountSourceOperations.
-//
-// +stateify savable
-type superOperations struct {
-	fs.SimpleMountSourceOperations
-
-	// root is the path of the mount point. All inode mappings
-	// are relative to this root.
-	root string
-
-	// inodeMappings contains mappings of fs.Inodes associated
-	// with this MountSource to paths under root.
-	inodeMappings map[uint64]string
-
-	// mounter is the cached EUID/EGID that mounted this file system.
-	mounter fs.FileOwner
-
-	// dontTranslateOwnership indicates whether to not translate file
-	// ownership.
-	//
-	// By default, files/directories owned by the sandbox uses UID/GID
-	// of the mounter. For files/directories that are not owned by the
-	// sandbox, file UID/GID is translated to a UID/GID which cannot
-	// be mapped in the sandboxed application's user namespace. The
-	// UID/GID will look like the nobody UID/GID (65534) but is not
-	// strictly owned by the user "nobody".
-	//
-	// If whitelistfs is a lower filesystem in an overlay, set
-	// dont_translate_ownership=true in mount options.
-	dontTranslateOwnership bool
-}
-
-var _ fs.MountSourceOperations = (*superOperations)(nil)
-
-// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings.
-func (m *superOperations) ResetInodeMappings() {
-	m.inodeMappings = make(map[uint64]string)
-}
-
-// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping.
-func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) {
-	// This is very unintuitive. We *CANNOT* trust the inode's StableAttrs,
-	// because overlay copyUp may have changed them out from under us.
-	// So much for "immutable".
-	sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr
-	m.inodeMappings[sattr.InodeID] = path
-}
-
-// Keep implements fs.MountSourceOperations.Keep.
-//
-// TODO(b/72455313,b/77596690): It is possible to change the permissions on a
-// host file while it is in the dirent cache (say from RO to RW), but it is not
-// possible to re-open the file with more relaxed permissions, since the host
-// FD is already open and stored in the inode.
-//
-// Using the dirent LRU cache increases the odds that this bug is encountered.
-// Since host file access is relatively fast anyways, we disable the LRU cache
-// for host fs files.  Once we can properly deal with permissions changes and
-// re-opening host files, we should revisit whether or not to make use of the
-// LRU cache.
-func (*superOperations) Keep(*fs.Dirent) bool {
-	return false
-}
-
-func init() {
-	fs.RegisterFilesystem(&Filesystem{})
-}
diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go
deleted file mode 100644
index 3111d2df9..000000000
--- a/pkg/sentry/fs/host/fs_test.go
+++ /dev/null
@@ -1,380 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package host
-
-import (
-	"fmt"
-	"io/ioutil"
-	"os"
-	"path"
-	"reflect"
-	"sort"
-	"testing"
-
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-)
-
-// newTestMountNamespace creates a MountNamespace with a ramfs root.
-// It returns the host folder created, which should be removed when done.
-func newTestMountNamespace(t *testing.T) (*fs.MountNamespace, string, error) {
-	p, err := ioutil.TempDir("", "root")
-	if err != nil {
-		return nil, "", err
-	}
-
-	fd, err := open(nil, p)
-	if err != nil {
-		os.RemoveAll(p)
-		return nil, "", err
-	}
-	ctx := contexttest.Context(t)
-	root, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false)
-	if err != nil {
-		os.RemoveAll(p)
-		return nil, "", err
-	}
-	mm, err := fs.NewMountNamespace(ctx, root)
-	if err != nil {
-		os.RemoveAll(p)
-		return nil, "", err
-	}
-	return mm, p, nil
-}
-
-// createTestDirs populates the root with some test files and directories.
-// /a/a1.txt
-// /a/a2.txt
-// /b/b1.txt
-// /b/c/c1.txt
-// /symlinks/normal.txt
-// /symlinks/to_normal.txt -> /symlinks/normal.txt
-// /symlinks/recursive -> /symlinks
-func createTestDirs(ctx context.Context, t *testing.T, m *fs.MountNamespace) error {
-	r := m.Root()
-	defer r.DecRef()
-
-	if err := r.CreateDirectory(ctx, r, "a", fs.FilePermsFromMode(0777)); err != nil {
-		return err
-	}
-
-	a, err := r.Walk(ctx, r, "a")
-	if err != nil {
-		return err
-	}
-	defer a.DecRef()
-
-	a1, err := a.Create(ctx, r, "a1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
-	if err != nil {
-		return err
-	}
-	a1.DecRef()
-
-	a2, err := a.Create(ctx, r, "a2.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
-	if err != nil {
-		return err
-	}
-	a2.DecRef()
-
-	if err := r.CreateDirectory(ctx, r, "b", fs.FilePermsFromMode(0777)); err != nil {
-		return err
-	}
-
-	b, err := r.Walk(ctx, r, "b")
-	if err != nil {
-		return err
-	}
-	defer b.DecRef()
-
-	b1, err := b.Create(ctx, r, "b1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
-	if err != nil {
-		return err
-	}
-	b1.DecRef()
-
-	if err := b.CreateDirectory(ctx, r, "c", fs.FilePermsFromMode(0777)); err != nil {
-		return err
-	}
-
-	c, err := b.Walk(ctx, r, "c")
-	if err != nil {
-		return err
-	}
-	defer c.DecRef()
-
-	c1, err := c.Create(ctx, r, "c1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
-	if err != nil {
-		return err
-	}
-	c1.DecRef()
-
-	if err := r.CreateDirectory(ctx, r, "symlinks", fs.FilePermsFromMode(0777)); err != nil {
-		return err
-	}
-
-	symlinks, err := r.Walk(ctx, r, "symlinks")
-	if err != nil {
-		return err
-	}
-	defer symlinks.DecRef()
-
-	normal, err := symlinks.Create(ctx, r, "normal.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
-	if err != nil {
-		return err
-	}
-	normal.DecRef()
-
-	if err := symlinks.CreateLink(ctx, r, "/symlinks/normal.txt", "to_normal.txt"); err != nil {
-		return err
-	}
-
-	return symlinks.CreateLink(ctx, r, "/symlinks", "recursive")
-}
-
-// allPaths returns a slice of all paths of entries visible in the rootfs.
-func allPaths(ctx context.Context, t *testing.T, m *fs.MountNamespace, base string) ([]string, error) {
-	var paths []string
-	root := m.Root()
-	defer root.DecRef()
-
-	maxTraversals := uint(1)
-	d, err := m.FindLink(ctx, root, nil, base, &maxTraversals)
-	if err != nil {
-		t.Logf("FindLink failed for %q", base)
-		return paths, err
-	}
-	defer d.DecRef()
-
-	if fs.IsDir(d.Inode.StableAttr) {
-		dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
-		if err != nil {
-			return nil, fmt.Errorf("failed to open directory %q: %v", base, err)
-		}
-		iter, ok := dir.FileOperations.(fs.DirIterator)
-		if !ok {
-			return nil, fmt.Errorf("cannot directly iterate on host directory %q", base)
-		}
-		dirCtx := &fs.DirCtx{
-			Serializer: noopDentrySerializer{},
-		}
-		if _, err := fs.DirentReaddir(ctx, d, iter, root, dirCtx, 0); err != nil {
-			return nil, err
-		}
-		for name := range dirCtx.DentAttrs() {
-			if name == "." || name == ".." {
-				continue
-			}
-
-			fullName := path.Join(base, name)
-			paths = append(paths, fullName)
-
-			// Recurse.
-			subpaths, err := allPaths(ctx, t, m, fullName)
-			if err != nil {
-				return paths, err
-			}
-			paths = append(paths, subpaths...)
-		}
-	}
-
-	return paths, nil
-}
-
-type noopDentrySerializer struct{}
-
-func (noopDentrySerializer) CopyOut(string, fs.DentAttr) error {
-	return nil
-}
-func (noopDentrySerializer) Written() int {
-	return 4096
-}
-
-// pathsEqual returns true if the two string slices contain the same entries.
-func pathsEqual(got, want []string) bool {
-	sort.Strings(got)
-	sort.Strings(want)
-
-	if len(got) != len(want) {
-		return false
-	}
-
-	for i := range got {
-		if got[i] != want[i] {
-			return false
-		}
-	}
-
-	return true
-}
-
-func TestWhitelist(t *testing.T) {
-	for _, test := range []struct {
-		// description of the test.
-		desc string
-		// paths are the paths to whitelist
-		paths []string
-		// want are all of the directory entries that should be
-		// visible (nothing beyond this set should be visible).
-		want []string
-	}{
-		{
-			desc:  "root",
-			paths: []string{"/"},
-			want:  []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt", "/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt", "/symlinks/recursive"},
-		},
-		{
-			desc:  "top-level directories",
-			paths: []string{"/a", "/b"},
-			want:  []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"},
-		},
-		{
-			desc:  "nested directories (1/2)",
-			paths: []string{"/b", "/b/c"},
-			want:  []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"},
-		},
-		{
-			desc:  "nested directories (2/2)",
-			paths: []string{"/b/c", "/b"},
-			want:  []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"},
-		},
-		{
-			desc:  "single file",
-			paths: []string{"/b/c/c1.txt"},
-			want:  []string{"/b", "/b/c", "/b/c/c1.txt"},
-		},
-		{
-			desc:  "single file and directory",
-			paths: []string{"/a/a1.txt", "/b/c"},
-			want:  []string{"/a", "/a/a1.txt", "/b", "/b/c", "/b/c/c1.txt"},
-		},
-		{
-			desc:  "symlink",
-			paths: []string{"/symlinks/to_normal.txt"},
-			want:  []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt"},
-		},
-		{
-			desc:  "recursive symlink",
-			paths: []string{"/symlinks/recursive/normal.txt"},
-			want:  []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/recursive"},
-		},
-	} {
-		t.Run(test.desc, func(t *testing.T) {
-			m, p, err := newTestMountNamespace(t)
-			if err != nil {
-				t.Errorf("Failed to create MountNamespace: %v", err)
-			}
-			defer os.RemoveAll(p)
-
-			ctx := withRoot(contexttest.RootContext(t), m.Root())
-			if err := createTestDirs(ctx, t, m); err != nil {
-				t.Errorf("Failed to create test dirs: %v", err)
-			}
-
-			if err := installWhitelist(ctx, m, test.paths); err != nil {
-				t.Errorf("installWhitelist(%v) err got %v want nil", test.paths, err)
-			}
-
-			got, err := allPaths(ctx, t, m, "/")
-			if err != nil {
-				t.Fatalf("Failed to lookup paths (whitelisted: %v): %v", test.paths, err)
-			}
-
-			if !pathsEqual(got, test.want) {
-				t.Errorf("For paths %v got %v want %v", test.paths, got, test.want)
-			}
-		})
-	}
-}
-
-func TestRootPath(t *testing.T) {
-	// Create a temp dir, which will be the root of our mounted fs.
-	rootPath, err := ioutil.TempDir(os.TempDir(), "root")
-	if err != nil {
-		t.Fatalf("TempDir failed: %v", err)
-	}
-	defer os.RemoveAll(rootPath)
-
-	// Create two files inside the new root, one which will be whitelisted
-	// and one not.
-	whitelisted, err := ioutil.TempFile(rootPath, "white")
-	if err != nil {
-		t.Fatalf("TempFile failed: %v", err)
-	}
-	if _, err := ioutil.TempFile(rootPath, "black"); err != nil {
-		t.Fatalf("TempFile failed: %v", err)
-	}
-
-	// Create a mount with a root path and single whitelisted file.
-	hostFS := &Filesystem{}
-	ctx := contexttest.Context(t)
-	data := fmt.Sprintf("%s=%s,%s=%s", rootPathKey, rootPath, whitelistKey, whitelisted.Name())
-	inode, err := hostFS.Mount(ctx, "", fs.MountSourceFlags{}, data, nil)
-	if err != nil {
-		t.Fatalf("Mount failed: %v", err)
-	}
-	mm, err := fs.NewMountNamespace(ctx, inode)
-	if err != nil {
-		t.Fatalf("NewMountNamespace failed: %v", err)
-	}
-	if err := hostFS.InstallWhitelist(ctx, mm); err != nil {
-		t.Fatalf("InstallWhitelist failed: %v", err)
-	}
-
-	// Get the contents of the root directory.
-	rootDir := mm.Root()
-	rctx := withRoot(ctx, rootDir)
-	f, err := rootDir.Inode.GetFile(rctx, rootDir, fs.FileFlags{})
-	if err != nil {
-		t.Fatalf("GetFile failed: %v", err)
-	}
-	c := &fs.CollectEntriesSerializer{}
-	if err := f.Readdir(rctx, c); err != nil {
-		t.Fatalf("Readdir failed: %v", err)
-	}
-
-	// We should have only our whitelisted file, plus the dots.
-	want := []string{path.Base(whitelisted.Name()), ".", ".."}
-	got := c.Order
-	sort.Strings(want)
-	sort.Strings(got)
-	if !reflect.DeepEqual(got, want) {
-		t.Errorf("Readdir got %v, wanted %v", got, want)
-	}
-}
-
-type rootContext struct {
-	context.Context
-	root *fs.Dirent
-}
-
-// withRoot returns a copy of ctx with the given root.
-func withRoot(ctx context.Context, root *fs.Dirent) context.Context {
-	return &rootContext{
-		Context: ctx,
-		root:    root,
-	}
-}
-
-// Value implements Context.Value.
-func (rc rootContext) Value(key interface{}) interface{} {
-	switch key {
-	case fs.CtxRoot:
-		rc.root.IncRef()
-		return rc.root
-	default:
-		return rc.Context.Value(key)
-	}
-}
diff --git a/pkg/sentry/fs/host/host.go b/pkg/sentry/fs/host/host.go
new file mode 100644
index 000000000..081ba1dd8
--- /dev/null
+++ b/pkg/sentry/fs/host/host.go
@@ -0,0 +1,59 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package host supports file descriptors imported directly.
+package host
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// filesystem is a host filesystem.
+//
+// +stateify savable
+type filesystem struct{}
+
+func init() {
+	fs.RegisterFilesystem(&filesystem{})
+}
+
+// FilesystemName is the name under which the filesystem is registered.
+const FilesystemName = "host"
+
+// Name is the name of the filesystem.
+func (*filesystem) Name() string {
+	return FilesystemName
+}
+
+// Mount returns an error. Mounting hostfs is not allowed.
+func (*filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, dataObj interface{}) (*fs.Inode, error) {
+	return nil, syserror.EPERM
+}
+
+// AllowUserMount prohibits users from using mount(2) with this file system.
+func (*filesystem) AllowUserMount() bool {
+	return false
+}
+
+// AllowUserList prohibits this filesystem to be listed in /proc/filesystems.
+func (*filesystem) AllowUserList() bool {
+	return false
+}
+
+// Flags returns that there is nothing special about this file system.
+func (*filesystem) Flags() fs.FilesystemFlags {
+	return 0
+}
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index 6fa39caab..1da3c0a17 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -17,12 +17,10 @@ package host
 import (
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/secio"
-	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
@@ -69,9 +67,6 @@ type inodeOperations struct {
 //
 // +stateify savable
 type inodeFileState struct {
-	// Common file system state.
-	mops *superOperations `state:"wait"`
-
 	// descriptor is the backing host FD.
 	descriptor *descriptor `state:"wait"`
 
@@ -160,7 +155,7 @@ func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, err
 	if err := syscall.Fstat(i.FD(), &s); err != nil {
 		return fs.UnstableAttr{}, err
 	}
-	return unstableAttr(i.mops, &s), nil
+	return unstableAttr(&s), nil
 }
 
 // Allocate implements fsutil.CachedFileObject.Allocate.
@@ -172,7 +167,7 @@ func (i *inodeFileState) Allocate(_ context.Context, offset, length int64) error
 var _ fs.InodeOperations = (*inodeOperations)(nil)
 
 // newInode returns a new fs.Inode backed by the host FD.
-func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, donated bool) (*fs.Inode, error) {
+func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool) (*fs.Inode, error) {
 	// Retrieve metadata.
 	var s syscall.Stat_t
 	err := syscall.Fstat(fd, &s)
@@ -181,24 +176,17 @@ func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool,
 	}
 
 	fileState := &inodeFileState{
-		mops:  msrc.MountSourceOperations.(*superOperations),
 		sattr: stableAttr(&s),
 	}
 
 	// Initialize the wrapped host file descriptor.
-	fileState.descriptor, err = newDescriptor(
-		fd,
-		donated,
-		saveable,
-		wouldBlock(&s),
-		&fileState.queue,
-	)
+	fileState.descriptor, err = newDescriptor(fd, saveable, wouldBlock(&s), &fileState.queue)
 	if err != nil {
 		return nil, err
 	}
 
 	// Build the fs.InodeOperations.
-	uattr := unstableAttr(msrc.MountSourceOperations.(*superOperations), &s)
+	uattr := unstableAttr(&s)
 	iops := &inodeOperations{
 		fileState: fileState,
 		cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, fsutil.CachingInodeOperationsOptions{
@@ -232,54 +220,23 @@ func (i *inodeOperations) Release(context.Context) {
 
 // Lookup implements fs.InodeOperations.Lookup.
 func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) {
-	// Get a new FD relative to i at name.
-	fd, err := open(i, name)
-	if err != nil {
-		if err == syserror.ENOENT {
-			return nil, syserror.ENOENT
-		}
-		return nil, err
-	}
-
-	inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */)
-	if err != nil {
-		return nil, err
-	}
-
-	// Return the fs.Dirent.
-	return fs.NewDirent(ctx, inode, name), nil
+	return nil, syserror.ENOENT
 }
 
 // Create implements fs.InodeOperations.Create.
 func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) {
-	// Create a file relative to i at name.
-	//
-	// N.B. We always open this file O_RDWR regardless of flags because a
-	// future GetFile might want more access. Open allows this regardless
-	// of perm.
-	fd, err := openAt(i, name, syscall.O_RDWR|syscall.O_CREAT|syscall.O_EXCL, perm.LinuxMode())
-	if err != nil {
-		return nil, err
-	}
-
-	inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */)
-	if err != nil {
-		return nil, err
-	}
+	return nil, syserror.EPERM
 
-	d := fs.NewDirent(ctx, inode, name)
-	defer d.DecRef()
-	return inode.GetFile(ctx, d, flags)
 }
 
 // CreateDirectory implements fs.InodeOperations.CreateDirectory.
 func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
-	return syscall.Mkdirat(i.fileState.FD(), name, uint32(perm.LinuxMode()))
+	return syserror.EPERM
 }
 
 // CreateLink implements fs.InodeOperations.CreateLink.
 func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error {
-	return createLink(i.fileState.FD(), oldname, newname)
+	return syserror.EPERM
 }
 
 // CreateHardLink implements fs.InodeOperations.CreateHardLink.
@@ -294,25 +251,17 @@ func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePe
 
 // Remove implements fs.InodeOperations.Remove.
 func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error {
-	return unlinkAt(i.fileState.FD(), name, false /* dir */)
+	return syserror.EPERM
 }
 
 // RemoveDirectory implements fs.InodeOperations.RemoveDirectory.
 func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error {
-	return unlinkAt(i.fileState.FD(), name, true /* dir */)
+	return syserror.EPERM
 }
 
 // Rename implements fs.InodeOperations.Rename.
 func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error {
-	op, ok := oldParent.InodeOperations.(*inodeOperations)
-	if !ok {
-		return syscall.EXDEV
-	}
-	np, ok := newParent.InodeOperations.(*inodeOperations)
-	if !ok {
-		return syscall.EXDEV
-	}
-	return syscall.Renameat(op.fileState.FD(), oldName, np.fileState.FD(), newName)
+	return syserror.EPERM
 }
 
 // Bind implements fs.InodeOperations.Bind.
@@ -461,69 +410,7 @@ func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {}
 
 // readdirAll returns all of the directory entries in i.
 func (i *inodeOperations) readdirAll(d *dirInfo) (map[string]fs.DentAttr, error) {
-	i.readdirMu.Lock()
-	defer i.readdirMu.Unlock()
-
-	fd := i.fileState.FD()
-
-	// syscall.ReadDirent will use getdents, which will seek the file past
-	// the last directory entry. To read the directory entries a second
-	// time, we need to seek back to the beginning.
-	if _, err := syscall.Seek(fd, 0, 0); err != nil {
-		if err == syscall.ESPIPE {
-			// All directories should be seekable. If this file
-			// isn't seekable, it is not a directory and we should
-			// return that more sane error.
-			err = syscall.ENOTDIR
-		}
-		return nil, err
-	}
-
-	names := make([]string, 0, 100)
-	for {
-		// Refill the buffer if necessary
-		if d.bufp >= d.nbuf {
-			d.bufp = 0
-			// ReadDirent will just do a sys_getdents64 to the kernel.
-			n, err := syscall.ReadDirent(fd, d.buf)
-			if err != nil {
-				return nil, err
-			}
-			if n == 0 {
-				break // EOF
-			}
-			d.nbuf = n
-		}
-
-		var nb int
-		// Parse the dirent buffer we just get and return the directory names along
-		// with the number of bytes consumed in the buffer.
-		nb, _, names = syscall.ParseDirent(d.buf[d.bufp:d.nbuf], -1, names)
-		d.bufp += nb
-	}
-
-	entries := make(map[string]fs.DentAttr)
-	for _, filename := range names {
-		// Lookup the type and host device and inode.
-		stat, lerr := fstatat(fd, filename, linux.AT_SYMLINK_NOFOLLOW)
-		if lerr == syscall.ENOENT {
-			// File disappeared between readdir and lstat.
-			// Just treat it as if it didn't exist.
-			continue
-		}
-
-		// There was a serious problem, we should probably report it.
-		if lerr != nil {
-			return nil, lerr
-		}
-
-		entries[filename] = fs.DentAttr{
-			Type: nodeType(&stat),
-			InodeID: hostFileDevice.Map(device.MultiDeviceKey{
-				Device: stat.Dev,
-				Inode:  stat.Ino,
-			}),
-		}
-	}
-	return entries, nil
+	// We only support non-directory file descriptors that have been
+	// imported, so just claim that this isn't a directory, even if it is.
+	return nil, syscall.ENOTDIR
 }
diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go
index 299e0e0b0..1adbd4562 100644
--- a/pkg/sentry/fs/host/inode_state.go
+++ b/pkg/sentry/fs/host/inode_state.go
@@ -18,29 +18,14 @@ import (
 	"fmt"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
-// beforeSave is invoked by stateify.
-func (i *inodeFileState) beforeSave() {
-	if !i.queue.IsEmpty() {
-		panic("event queue must be empty")
-	}
-	if !i.descriptor.donated && i.sattr.Type == fs.RegularFile {
-		uattr, err := i.unstableAttr(context.Background())
-		if err != nil {
-			panic(fs.ErrSaveRejection{fmt.Errorf("failed to get unstable atttribute of %s: %v", i.mops.inodeMappings[i.sattr.InodeID], err)})
-		}
-		i.savedUAttr = &uattr
-	}
-}
-
 // afterLoad is invoked by stateify.
 func (i *inodeFileState) afterLoad() {
 	// Initialize the descriptor value.
-	if err := i.descriptor.initAfterLoad(i.mops, i.sattr.InodeID, &i.queue); err != nil {
+	if err := i.descriptor.initAfterLoad(i.sattr.InodeID, &i.queue); err != nil {
 		panic(fmt.Sprintf("failed to load value of descriptor: %v", err))
 	}
 
@@ -61,19 +46,4 @@ func (i *inodeFileState) afterLoad() {
 		// change across save and restore, error out.
 		panic(fs.ErrCorruption{fmt.Errorf("host %s conflict in host device mappings: %s", key, hostFileDevice)})
 	}
-
-	if !i.descriptor.donated && i.sattr.Type == fs.RegularFile {
-		env, ok := fs.CurrentRestoreEnvironment()
-		if !ok {
-			panic("missing restore environment")
-		}
-		uattr := unstableAttr(i.mops, &s)
-		if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size {
-			panic(fs.ErrCorruption{fmt.Errorf("file size has changed for %s: previously %d, now %d", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)})
-		}
-		if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime {
-			panic(fs.ErrCorruption{fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)})
-		}
-		i.savedUAttr = nil
-	}
 }
diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go
index 7221bc825..4c374681c 100644
--- a/pkg/sentry/fs/host/inode_test.go
+++ b/pkg/sentry/fs/host/inode_test.go
@@ -15,9 +15,6 @@
 package host
 
 import (
-	"io/ioutil"
-	"os"
-	"path"
 	"syscall"
 	"testing"
 
@@ -25,69 +22,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
-// TestMultipleReaddir verifies that multiple Readdir calls return the same
-// thing if they use different dir contexts.
-func TestMultipleReaddir(t *testing.T) {
-	p, err := ioutil.TempDir("", "readdir")
-	if err != nil {
-		t.Fatalf("Failed to create test dir: %v", err)
-	}
-	defer os.RemoveAll(p)
-
-	f, err := os.Create(path.Join(p, "a.txt"))
-	if err != nil {
-		t.Fatalf("Failed to create a.txt: %v", err)
-	}
-	f.Close()
-
-	f, err = os.Create(path.Join(p, "b.txt"))
-	if err != nil {
-		t.Fatalf("Failed to create b.txt: %v", err)
-	}
-	f.Close()
-
-	fd, err := open(nil, p)
-	if err != nil {
-		t.Fatalf("Failed to open %q: %v", p, err)
-	}
-	ctx := contexttest.Context(t)
-	n, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false)
-	if err != nil {
-		t.Fatalf("Failed to create inode: %v", err)
-	}
-
-	dirent := fs.NewDirent(ctx, n, "readdir")
-	openFile, err := n.GetFile(ctx, dirent, fs.FileFlags{Read: true})
-	if err != nil {
-		t.Fatalf("Failed to get file: %v", err)
-	}
-	defer openFile.DecRef()
-
-	c1 := &fs.DirCtx{DirCursor: new(string)}
-	if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, dirent, c1, 0); err != nil {
-		t.Fatalf("First Readdir failed: %v", err)
-	}
-
-	c2 := &fs.DirCtx{DirCursor: new(string)}
-	if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, dirent, c2, 0); err != nil {
-		t.Errorf("Second Readdir failed: %v", err)
-	}
-
-	if _, ok := c1.DentAttrs()["a.txt"]; !ok {
-		t.Errorf("want a.txt in first Readdir, got %v", c1.DentAttrs())
-	}
-	if _, ok := c1.DentAttrs()["b.txt"]; !ok {
-		t.Errorf("want b.txt in first Readdir, got %v", c1.DentAttrs())
-	}
-
-	if _, ok := c2.DentAttrs()["a.txt"]; !ok {
-		t.Errorf("want a.txt in second Readdir, got %v", c2.DentAttrs())
-	}
-	if _, ok := c2.DentAttrs()["b.txt"]; !ok {
-		t.Errorf("want b.txt in second Readdir, got %v", c2.DentAttrs())
-	}
-}
-
 // TestCloseFD verifies fds will be closed.
 func TestCloseFD(t *testing.T) {
 	var p [2]int
diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go
index 7c60dc1db..388108fdf 100644
--- a/pkg/sentry/fs/host/util.go
+++ b/pkg/sentry/fs/host/util.go
@@ -16,7 +16,6 @@ package host
 
 import (
 	"os"
-	"path"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -28,45 +27,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-func open(parent *inodeOperations, name string) (int, error) {
-	if parent == nil && !path.IsAbs(name) {
-		return -1, syserror.EINVAL
-	}
-	name = path.Clean(name)
-
-	// Don't follow through symlinks.
-	flags := syscall.O_NOFOLLOW
-
-	if fd, err := openAt(parent, name, flags|syscall.O_RDWR, 0); err == nil {
-		return fd, nil
-	}
-	// Retry as read-only.
-	if fd, err := openAt(parent, name, flags|syscall.O_RDONLY, 0); err == nil {
-		return fd, nil
-	}
-
-	// Retry as write-only.
-	if fd, err := openAt(parent, name, flags|syscall.O_WRONLY, 0); err == nil {
-		return fd, nil
-	}
-
-	// Retry as a symlink, by including O_PATH as an option.
-	fd, err := openAt(parent, name, linux.O_PATH|flags, 0)
-	if err == nil {
-		return fd, nil
-	}
-
-	// Everything failed.
-	return -1, err
-}
-
-func openAt(parent *inodeOperations, name string, flags int, perm linux.FileMode) (int, error) {
-	if parent == nil {
-		return syscall.Open(name, flags, uint32(perm))
-	}
-	return syscall.Openat(parent.fileState.FD(), name, flags, uint32(perm))
-}
-
 func nodeType(s *syscall.Stat_t) fs.InodeType {
 	switch x := (s.Mode & syscall.S_IFMT); x {
 	case syscall.S_IFLNK:
@@ -107,51 +67,19 @@ func stableAttr(s *syscall.Stat_t) fs.StableAttr {
 	}
 }
 
-func owner(mo *superOperations, s *syscall.Stat_t) fs.FileOwner {
-	// User requested no translation, just return actual owner.
-	if mo.dontTranslateOwnership {
-		return fs.FileOwner{auth.KUID(s.Uid), auth.KGID(s.Gid)}
-	}
-
-	// Show only IDs relevant to the sandboxed task. I.e. if we not own the
-	// file, no sandboxed task can own the file. In that case, we
-	// use OverflowID for UID, implying that the IDs are not mapped in the
-	// "root" user namespace.
-	//
-	// E.g.
-	// sandbox's host EUID/EGID is 1/1.
-	// some_dir's host UID/GID is 2/1.
-	// Task that mounted this fs has virtualized EUID/EGID 5/5.
-	//
-	// If you executed `ls -n` in the sandboxed task, it would show:
-	// drwxwrxwrx [...] 65534 5 [...] some_dir
-
-	// Files are owned by OverflowID by default.
-	owner := fs.FileOwner{auth.KUID(auth.OverflowUID), auth.KGID(auth.OverflowGID)}
-
-	// If we own file on host, let mounting task's initial EUID own
-	// the file.
-	if s.Uid == hostUID {
-		owner.UID = mo.mounter.UID
-	}
-
-	// If our group matches file's group, make file's group match
-	// the mounting task's initial EGID.
-	for _, gid := range hostGIDs {
-		if s.Gid == gid {
-			owner.GID = mo.mounter.GID
-			break
-		}
+func owner(s *syscall.Stat_t) fs.FileOwner {
+	return fs.FileOwner{
+		UID: auth.KUID(s.Uid),
+		GID: auth.KGID(s.Gid),
 	}
-	return owner
 }
 
-func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr {
+func unstableAttr(s *syscall.Stat_t) fs.UnstableAttr {
 	return fs.UnstableAttr{
 		Size:             s.Size,
 		Usage:            s.Blocks * 512,
 		Perms:            fs.FilePermsFromMode(linux.FileMode(s.Mode)),
-		Owner:            owner(mo, s),
+		Owner:            owner(s),
 		AccessTime:       time.FromUnix(s.Atim.Sec, s.Atim.Nsec),
 		ModificationTime: time.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
 		StatusChangeTime: time.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go
index 3ab36b088..23bd35d64 100644
--- a/pkg/sentry/fs/host/util_unsafe.go
+++ b/pkg/sentry/fs/host/util_unsafe.go
@@ -26,26 +26,6 @@ import (
 // NulByte is a single NUL byte. It is passed to readlinkat as an empty string.
 var NulByte byte = '\x00'
 
-func createLink(fd int, name string, linkName string) error {
-	namePtr, err := syscall.BytePtrFromString(name)
-	if err != nil {
-		return err
-	}
-	linkNamePtr, err := syscall.BytePtrFromString(linkName)
-	if err != nil {
-		return err
-	}
-	_, _, errno := syscall.Syscall(
-		syscall.SYS_SYMLINKAT,
-		uintptr(unsafe.Pointer(namePtr)),
-		uintptr(fd),
-		uintptr(unsafe.Pointer(linkNamePtr)))
-	if errno != 0 {
-		return errno
-	}
-	return nil
-}
-
 func readLink(fd int) (string, error) {
 	// Buffer sizing copied from os.Readlink.
 	for l := 128; ; l *= 2 {
@@ -66,27 +46,6 @@ func readLink(fd int) (string, error) {
 	}
 }
 
-func unlinkAt(fd int, name string, dir bool) error {
-	namePtr, err := syscall.BytePtrFromString(name)
-	if err != nil {
-		return err
-	}
-	var flags uintptr
-	if dir {
-		flags = linux.AT_REMOVEDIR
-	}
-	_, _, errno := syscall.Syscall(
-		syscall.SYS_UNLINKAT,
-		uintptr(fd),
-		uintptr(unsafe.Pointer(namePtr)),
-		flags,
-	)
-	if errno != 0 {
-		return errno
-	}
-	return nil
-}
-
 func timespecFromTimestamp(t ktime.Time, omit, setSysTime bool) syscall.Timespec {
 	if omit {
 		return syscall.Timespec{0, linux.UTIME_OMIT}
-- 
cgit v1.2.3


From 1cc5a71a0e2100c89c97ffd12a38143907b33630 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 17 Mar 2020 16:21:36 -0700
Subject: iptables: fix type in script name, mark some new tests as skipped

PiperOrigin-RevId: 301476456
---
 kokoro/iptables_tests.cfg      |  2 +-
 test/iptables/iptables_test.go | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/kokoro/iptables_tests.cfg b/kokoro/iptables_tests.cfg
index 7af20629a..a30d82591 100644
--- a/kokoro/iptables_tests.cfg
+++ b/kokoro/iptables_tests.cfg
@@ -1,4 +1,4 @@
-build_file: "repo/scripts/iptables_test.sh"
+build_file: "repo/scripts/iptables_tests.sh"
 
 action {
   define_artifacts {
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 73ba8b447..7f1f70606 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -191,6 +191,7 @@ func TestFilterInputDropOnlyUDP(t *testing.T) {
 }
 
 func TestNATRedirectUDPPort(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATRedirectUDPPort{}); err != nil {
 		t.Fatal(err)
@@ -198,6 +199,7 @@ func TestNATRedirectUDPPort(t *testing.T) {
 }
 
 func TestNATRedirectTCPPort(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATRedirectTCPPort{}); err != nil {
 		t.Fatal(err)
@@ -205,6 +207,7 @@ func TestNATRedirectTCPPort(t *testing.T) {
 }
 
 func TestNATDropUDP(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATDropUDP{}); err != nil {
 		t.Fatal(err)
@@ -212,6 +215,7 @@ func TestNATDropUDP(t *testing.T) {
 }
 
 func TestNATAcceptAll(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATAcceptAll{}); err != nil {
 		t.Fatal(err)
@@ -255,6 +259,7 @@ func TestFilterInputReturnUnderflow(t *testing.T) {
 }
 
 func TestFilterOutputDropTCPDestPort(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("filter OUTPUT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(FilterOutputDropTCPDestPort{}); err != nil {
 		t.Fatal(err)
@@ -262,6 +267,7 @@ func TestFilterOutputDropTCPDestPort(t *testing.T) {
 }
 
 func TestFilterOutputDropTCPSrcPort(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("filter OUTPUT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(FilterOutputDropTCPSrcPort{}); err != nil {
 		t.Fatal(err)
@@ -329,42 +335,56 @@ func TestOutputInvertDestination(t *testing.T) {
 }
 
 func TestNATOutRedirectIP(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATOutRedirectIP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATOutDontRedirectIP(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATOutDontRedirectIP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATOutRedirectInvert(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATOutRedirectInvert{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATPreRedirectIP(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATPreRedirectIP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATPreDontRedirectIP(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATPreDontRedirectIP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATPreRedirectInvert(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATPreRedirectInvert{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATRedirectRequiresProtocol(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATRedirectRequiresProtocol{}); err != nil {
 		t.Fatal(err)
 	}
-- 
cgit v1.2.3


From eddd6ce514e3bbcc08ce9d8435c7dac12715989c Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 17 Mar 2020 19:09:24 -0700
Subject: Wrap rand.Reader in a bufio.Reader.

rand.Read() results in a syscall to the host on every call instead
we can wrap it with a bufio.Reader to buffer and reduce number of syscalls.
This is especially important for TCP where every newly created endpoint
reads random data to initialize the timestamp offsets for the endpoint.

Updates #231

PiperOrigin-RevId: 301501607
---
 pkg/rand/rand_linux.go | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go
index 0bdad5fad..1aec96e2b 100644
--- a/pkg/rand/rand_linux.go
+++ b/pkg/rand/rand_linux.go
@@ -45,12 +45,18 @@ func (r *reader) Read(p []byte) (int, error) {
 	return rand.Read(p)
 }
 
+// mu protects the global Reader below.
+var mu sync.Mutex
+
 // Reader is the default reader.
 var Reader io.Reader = &reader{}
 
 // Read reads from the default reader.
 func Read(b []byte) (int, error) {
-	return io.ReadFull(Reader, b)
+	mu.Lock()
+	n, err := io.ReadFull(Reader, b)
+	mu.Unlock()
+	return n, err
 }
 
 // Init can be called to make sure /dev/urandom is pre-opened on kernels that
-- 
cgit v1.2.3


From 9c35d7eb1f96f12207f78b94722f0e8b778b5af3 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Wed, 11 Mar 2020 09:55:07 +0000
Subject: Enable syscall sysret_test on arm64.

Fixes #2058

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I05750d238a6425d3a47fae15720901f4dd924a32
---
 test/syscalls/linux/BUILD     |  5 +----
 test/syscalls/linux/sysret.cc | 32 +++++++++++++++++++++++++++++---
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 636e5db12..d0c431234 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3336,10 +3336,7 @@ cc_binary(
 cc_binary(
     name = "sysret_test",
     testonly = 1,
-    srcs = select_arch(
-        amd64 = ["sysret.cc"],
-        arm64 = [],
-    ),
+    srcs = ["sysret.cc"],
     linkstatic = 1,
     deps = [
         gtest,
diff --git a/test/syscalls/linux/sysret.cc b/test/syscalls/linux/sysret.cc
index 819fa655a..569190a59 100644
--- a/test/syscalls/linux/sysret.cc
+++ b/test/syscalls/linux/sysret.cc
@@ -14,6 +14,8 @@
 
 // Tests to verify that the behavior of linux and gvisor matches when
 // 'sysret' returns to bad (aka non-canonical) %rip or %rsp.
+
+#include <linux/elf.h>
 #include <sys/ptrace.h>
 #include <sys/user.h>
 
@@ -32,6 +34,7 @@ constexpr uint64_t kNonCanonicalRsp = 0xFFFF000000000000;
 class SysretTest : public ::testing::Test {
  protected:
   struct user_regs_struct regs_;
+  struct iovec iov;
   pid_t child_;
 
   void SetUp() override {
@@ -48,10 +51,14 @@ class SysretTest : public ::testing::Test {
 
     // Parent.
     int status;
+    memset(&iov, 0, sizeof(iov));
     ASSERT_THAT(pid, SyscallSucceeds());  // Might still be < 0.
     ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
     EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
-    ASSERT_THAT(ptrace(PTRACE_GETREGS, pid, 0, &regs_), SyscallSucceeds());
+
+    iov.iov_base = &regs_;
+    iov.iov_len = sizeof(regs_);
+    ASSERT_THAT(ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov), SyscallSucceeds());
 
     child_ = pid;
   }
@@ -61,13 +68,25 @@ class SysretTest : public ::testing::Test {
   }
 
   void SetRip(uint64_t newrip) {
+#if defined(__x86_64__)
     regs_.rip = newrip;
-    ASSERT_THAT(ptrace(PTRACE_SETREGS, child_, 0, &regs_), SyscallSucceeds());
+#elif defined(__aarch64__)
+    regs_.pc = newrip;
+#else
+#error "Unknown architecture"
+#endif
+    ASSERT_THAT(ptrace(PTRACE_SETREGSET, child_, NT_PRSTATUS, &iov), SyscallSucceeds());
   }
 
   void SetRsp(uint64_t newrsp) {
+#if defined(__x86_64__)
     regs_.rsp = newrsp;
-    ASSERT_THAT(ptrace(PTRACE_SETREGS, child_, 0, &regs_), SyscallSucceeds());
+#elif defined(__aarch64__)
+    regs_.sp = newrsp;
+#else
+#error "Unknown architecture"
+#endif
+    ASSERT_THAT(ptrace(PTRACE_SETREGSET, child_, NT_PRSTATUS, &iov), SyscallSucceeds());
   }
 
   // Wait waits for the child pid and returns the exit status.
@@ -104,8 +123,15 @@ TEST_F(SysretTest, BadRsp) {
   SetRsp(kNonCanonicalRsp);
   Detach();
   int status = Wait();
+#if defined(__x86_64__)
   EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGBUS)
       << "status = " << status;
+#elif defined(__aarch64__)
+  EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV)
+      << "status = " << status;
+#else
+#error "Unknown architecture"
+#endif
 }
 }  // namespace
 
-- 
cgit v1.2.3


From c29d4fc59eefefa71526daf030844a4b449c91b9 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 18 Mar 2020 06:35:34 -0700
Subject: Automated rollback of changelist 301501607

PiperOrigin-RevId: 301578043
---
 pkg/rand/rand_linux.go | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go
index 1aec96e2b..0bdad5fad 100644
--- a/pkg/rand/rand_linux.go
+++ b/pkg/rand/rand_linux.go
@@ -45,18 +45,12 @@ func (r *reader) Read(p []byte) (int, error) {
 	return rand.Read(p)
 }
 
-// mu protects the global Reader below.
-var mu sync.Mutex
-
 // Reader is the default reader.
 var Reader io.Reader = &reader{}
 
 // Read reads from the default reader.
 func Read(b []byte) (int, error) {
-	mu.Lock()
-	n, err := io.ReadFull(Reader, b)
-	mu.Unlock()
-	return n, err
+	return io.ReadFull(Reader, b)
 }
 
 // Init can be called to make sure /dev/urandom is pre-opened on kernels that
-- 
cgit v1.2.3


From 707664e0c498921860b209a19974977bb20c5746 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 18 Mar 2020 10:16:37 -0700
Subject: Send the ACK later to stabilize the test.

PiperOrigin-RevId: 301614096
---
 test/packetdrill/fin_wait2_timeout.pkt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/packetdrill/fin_wait2_timeout.pkt b/test/packetdrill/fin_wait2_timeout.pkt
index 613f0bec9..93ab08575 100644
--- a/test/packetdrill/fin_wait2_timeout.pkt
+++ b/test/packetdrill/fin_wait2_timeout.pkt
@@ -19,5 +19,5 @@
 +0 > F. 1:1(0) ack 1 <...>
 +0 < . 1:1(0) ack 2 win 257
 
-+1.1 < . 1:1(0) ack 2 win 257
++2 < . 1:1(0) ack 2 win 257
 +0 > R  2:2(0) win 0
-- 
cgit v1.2.3


From fc16e64396cf534bc4336e6bc0396a2f0f621e70 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 18 Mar 2020 13:06:55 -0700
Subject: Automated rollback of changelist 301476456

PiperOrigin-RevId: 301650898
---
 kokoro/iptables_tests.cfg      |  2 +-
 test/iptables/iptables_test.go | 20 --------------------
 2 files changed, 1 insertion(+), 21 deletions(-)

diff --git a/kokoro/iptables_tests.cfg b/kokoro/iptables_tests.cfg
index a30d82591..7af20629a 100644
--- a/kokoro/iptables_tests.cfg
+++ b/kokoro/iptables_tests.cfg
@@ -1,4 +1,4 @@
-build_file: "repo/scripts/iptables_tests.sh"
+build_file: "repo/scripts/iptables_test.sh"
 
 action {
   define_artifacts {
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 7f1f70606..73ba8b447 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -191,7 +191,6 @@ func TestFilterInputDropOnlyUDP(t *testing.T) {
 }
 
 func TestNATRedirectUDPPort(t *testing.T) {
-	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATRedirectUDPPort{}); err != nil {
 		t.Fatal(err)
@@ -199,7 +198,6 @@ func TestNATRedirectUDPPort(t *testing.T) {
 }
 
 func TestNATRedirectTCPPort(t *testing.T) {
-	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATRedirectTCPPort{}); err != nil {
 		t.Fatal(err)
@@ -207,7 +205,6 @@ func TestNATRedirectTCPPort(t *testing.T) {
 }
 
 func TestNATDropUDP(t *testing.T) {
-	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATDropUDP{}); err != nil {
 		t.Fatal(err)
@@ -215,7 +212,6 @@ func TestNATDropUDP(t *testing.T) {
 }
 
 func TestNATAcceptAll(t *testing.T) {
-	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATAcceptAll{}); err != nil {
 		t.Fatal(err)
@@ -259,7 +255,6 @@ func TestFilterInputReturnUnderflow(t *testing.T) {
 }
 
 func TestFilterOutputDropTCPDestPort(t *testing.T) {
-	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("filter OUTPUT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(FilterOutputDropTCPDestPort{}); err != nil {
 		t.Fatal(err)
@@ -267,7 +262,6 @@ func TestFilterOutputDropTCPDestPort(t *testing.T) {
 }
 
 func TestFilterOutputDropTCPSrcPort(t *testing.T) {
-	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("filter OUTPUT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(FilterOutputDropTCPSrcPort{}); err != nil {
 		t.Fatal(err)
@@ -335,56 +329,42 @@ func TestOutputInvertDestination(t *testing.T) {
 }
 
 func TestNATOutRedirectIP(t *testing.T) {
-	// TODO(gvisor.dev/issue/170): Enable when supported.
-	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATOutRedirectIP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATOutDontRedirectIP(t *testing.T) {
-	// TODO(gvisor.dev/issue/170): Enable when supported.
-	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATOutDontRedirectIP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATOutRedirectInvert(t *testing.T) {
-	// TODO(gvisor.dev/issue/170): Enable when supported.
-	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATOutRedirectInvert{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATPreRedirectIP(t *testing.T) {
-	// TODO(gvisor.dev/issue/170): Enable when supported.
-	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATPreRedirectIP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATPreDontRedirectIP(t *testing.T) {
-	// TODO(gvisor.dev/issue/170): Enable when supported.
-	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATPreDontRedirectIP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATPreRedirectInvert(t *testing.T) {
-	// TODO(gvisor.dev/issue/170): Enable when supported.
-	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATPreRedirectInvert{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATRedirectRequiresProtocol(t *testing.T) {
-	// TODO(gvisor.dev/issue/170): Enable when supported.
-	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATRedirectRequiresProtocol{}); err != nil {
 		t.Fatal(err)
 	}
-- 
cgit v1.2.3


From f1d1af2a4ad35dd20a7c56bd9e842e347b126c31 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 18 Mar 2020 15:12:11 -0700
Subject: Fix FDTable.NewFDVFS2

It was looking at VFS1 table to determine where to
allocate the next FD from.

Updates #1035

PiperOrigin-RevId: 301678858
---
 pkg/sentry/kernel/fd_table.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index dddc28d5a..d09d97825 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -338,7 +338,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
 		fd = f.next
 	}
 	for fd < end {
-		if d, _, _ := f.get(fd); d == nil {
+		if d, _, _ := f.getVFS2(fd); d == nil {
 			f.setVFS2(fd, file, flags)
 			if fd == f.next {
 				// Update next search start position.
-- 
cgit v1.2.3


From b5ea65c07c29cbc894e9f879796eed816696d042 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 18 Mar 2020 15:51:36 -0700
Subject: iptables: skip tests for not-yet-supported features

PiperOrigin-RevId: 301686266
---
 test/iptables/iptables_test.go | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 73ba8b447..7f1f70606 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -191,6 +191,7 @@ func TestFilterInputDropOnlyUDP(t *testing.T) {
 }
 
 func TestNATRedirectUDPPort(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATRedirectUDPPort{}); err != nil {
 		t.Fatal(err)
@@ -198,6 +199,7 @@ func TestNATRedirectUDPPort(t *testing.T) {
 }
 
 func TestNATRedirectTCPPort(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATRedirectTCPPort{}); err != nil {
 		t.Fatal(err)
@@ -205,6 +207,7 @@ func TestNATRedirectTCPPort(t *testing.T) {
 }
 
 func TestNATDropUDP(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATDropUDP{}); err != nil {
 		t.Fatal(err)
@@ -212,6 +215,7 @@ func TestNATDropUDP(t *testing.T) {
 }
 
 func TestNATAcceptAll(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATAcceptAll{}); err != nil {
 		t.Fatal(err)
@@ -255,6 +259,7 @@ func TestFilterInputReturnUnderflow(t *testing.T) {
 }
 
 func TestFilterOutputDropTCPDestPort(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("filter OUTPUT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(FilterOutputDropTCPDestPort{}); err != nil {
 		t.Fatal(err)
@@ -262,6 +267,7 @@ func TestFilterOutputDropTCPDestPort(t *testing.T) {
 }
 
 func TestFilterOutputDropTCPSrcPort(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("filter OUTPUT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(FilterOutputDropTCPSrcPort{}); err != nil {
 		t.Fatal(err)
@@ -329,42 +335,56 @@ func TestOutputInvertDestination(t *testing.T) {
 }
 
 func TestNATOutRedirectIP(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATOutRedirectIP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATOutDontRedirectIP(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATOutDontRedirectIP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATOutRedirectInvert(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATOutRedirectInvert{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATPreRedirectIP(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATPreRedirectIP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATPreDontRedirectIP(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATPreDontRedirectIP{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATPreRedirectInvert(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATPreRedirectInvert{}); err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestNATRedirectRequiresProtocol(t *testing.T) {
+	// TODO(gvisor.dev/issue/170): Enable when supported.
+	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
 	if err := singleTest(NATRedirectRequiresProtocol{}); err != nil {
 		t.Fatal(err)
 	}
-- 
cgit v1.2.3


From 92a00ca91affab8564b8875387758914ddc9785b Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Wed, 18 Mar 2020 16:25:20 -0700
Subject: Store segment transmit count.

This will aid in segment reordering detection.

Updates #691

PiperOrigin-RevId: 301692638
---
 pkg/tcpip/transport/tcp/segment.go | 6 +++---
 pkg/tcpip/transport/tcp/snd.go     | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 1c10da5ca..5d0bc4f72 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -56,9 +56,9 @@ type segment struct {
 	options        []byte `state:".([]byte)"`
 	hasNewSACKInfo bool
 	rcvdTime       time.Time `state:".(unixTime)"`
-	// xmitTime is the last transmit time of this segment. A zero value
-	// indicates that the segment has yet to be transmitted.
-	xmitTime time.Time `state:".(unixTime)"`
+	// xmitTime is the last transmit time of this segment.
+	xmitTime  time.Time `state:".(unixTime)"`
+	xmitCount uint32
 }
 
 func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) *segment {
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index b74b61e7d..657c3146e 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -1229,7 +1229,7 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 
 // sendSegment sends the specified segment.
 func (s *sender) sendSegment(seg *segment) *tcpip.Error {
-	if !seg.xmitTime.IsZero() {
+	if seg.xmitCount > 0 {
 		s.ep.stack.Stats().TCP.Retransmits.Increment()
 		s.ep.stats.SendErrors.Retransmits.Increment()
 		if s.sndCwnd < s.sndSsthresh {
@@ -1237,6 +1237,7 @@ func (s *sender) sendSegment(seg *segment) *tcpip.Error {
 		}
 	}
 	seg.xmitTime = time.Now()
+	seg.xmitCount++
 	return s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber)
 }
 
-- 
cgit v1.2.3


From c3cee7f5a433708a394cee4e89c223f80036f5d9 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 18 Mar 2020 17:41:06 -0700
Subject: Deflake third_party/gvisor/pkg/gate/gate_test

TestConcurrentAll executes 1000 goroutines which never sleep,
so they are not preempted by Go's runtime. In Go 1.14, async preemption
has been added, but the added runtime.Gosched() call will do nothing
wrong in this case too.

PiperOrigin-RevId: 301705712
---
 pkg/gate/gate_test.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkg/gate/gate_test.go b/pkg/gate/gate_test.go
index 850693df8..316015e06 100644
--- a/pkg/gate/gate_test.go
+++ b/pkg/gate/gate_test.go
@@ -15,6 +15,7 @@
 package gate_test
 
 import (
+	"runtime"
 	"testing"
 	"time"
 
@@ -165,6 +166,8 @@ func worker(g *gate.Gate, done *sync.WaitGroup) {
 		if !g.Enter() {
 			break
 		}
+		// Golang before v1.14 doesn't preempt busyloops.
+		runtime.Gosched()
 		g.Leave()
 	}
 	done.Done()
-- 
cgit v1.2.3


From a8f9cc87989979b6d8bc3759e64bdd1b76329b64 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 18 Mar 2020 14:44:56 -0700
Subject: iptables: deflake DropTCP*Port tests

These tests could timeout because net.DialTCP didn't respect the
timeout.
---
 test/iptables/filter_input.go  | 20 ++++++++++++++++----
 test/iptables/iptables_util.go |  2 +-
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 05647de33..4ccd4cce7 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -194,8 +194,14 @@ func (FilterInputDropTCPDestPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterInputDropTCPDestPort) LocalAction(ip net.IP) error {
-	if err := connectTCP(ip, dropPort, sendloopDuration); err == nil {
-		return fmt.Errorf("connection destined to port %d should not be accepted, but got accepted", dropPort)
+	// After the container sets its DROP rule, we shouldn't be able to connect.
+	// However, we may succeed in connecting if this runs before the container
+	// sets the rule. To avoid this race, we retry connecting until
+	// sendloopDuration has elapsed, ignoring whether the connect succeeds. The
+	// test works becuase the container will error if a connection is
+	// established after the rule is set.
+	for start := time.Now(); time.Since(start) < sendloopDuration; {
+		connectTCP(ip, dropPort, sendloopDuration-time.Since(start))
 	}
 
 	return nil
@@ -226,8 +232,14 @@ func (FilterInputDropTCPSrcPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterInputDropTCPSrcPort) LocalAction(ip net.IP) error {
-	if err := connectTCP(ip, acceptPort, sendloopDuration); err == nil {
-		return fmt.Errorf("connection should not be accepted, but was")
+	// After the container sets its DROP rule, we shouldn't be able to connect.
+	// However, we may succeed in connecting if this runs before the container
+	// sets the rule. To avoid this race, we retry connecting until
+	// sendloopDuration has elapsed, ignoring whether the connect succeeds. The
+	// test works becuase the container will error if a connection is
+	// established after the rule is set.
+	for start := time.Now(); time.Since(start) < sendloopDuration; {
+		connectTCP(ip, acceptPort, sendloopDuration-time.Since(start))
 	}
 
 	return nil
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index e8ae65c5a..134391e8d 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -144,7 +144,7 @@ func connectTCP(ip net.IP, port int, timeout time.Duration) error {
 	// The container may not be listening when we first connect, so retry
 	// upon error.
 	callback := func() error {
-		conn, err := net.DialTCP("tcp4", nil, &contAddr)
+		conn, err := net.DialTimeout("tcp", contAddr.String(), timeout)
 		if conn != nil {
 			conn.Close()
 		}
-- 
cgit v1.2.3


From 3a42638a0b32ceede66d8d593609b424bbdba47e Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 18 Mar 2020 19:08:46 -0700
Subject: Port imported TTY fds to vfs2.

Refactor fs/host.TTYFileOperations so that the relevant functionality can be
shared with VFS2 (fsimpl/host.ttyFD).

Incorporate host.defaultFileFD into the default host.fileDescription. This way,
there is no need for a separate default_file.go. As in vfs1, the TTY file
implementation can be built on top of this default and override operations as
necessary (PRead/Read/PWrite/Write, Release, Ioctl).

Note that these changes still need to be plumbed into runsc, which refers to
imported TTYs in control/proc.go:ExecAsync.

Updates #1672.

PiperOrigin-RevId: 301718157
---
 pkg/sentry/fs/host/ioctl_unsafe.go     |   4 +
 pkg/sentry/fs/host/tty.go              |   5 +
 pkg/sentry/fs/host/util.go             |   8 +-
 pkg/sentry/fsimpl/host/BUILD           |   6 +-
 pkg/sentry/fsimpl/host/default_file.go | 247 ---------------------
 pkg/sentry/fsimpl/host/host.go         | 258 ++++++++++++++++++++--
 pkg/sentry/fsimpl/host/ioctl_unsafe.go |  56 +++++
 pkg/sentry/fsimpl/host/tty.go          | 379 +++++++++++++++++++++++++++++++++
 8 files changed, 692 insertions(+), 271 deletions(-)
 delete mode 100644 pkg/sentry/fsimpl/host/default_file.go
 create mode 100644 pkg/sentry/fsimpl/host/ioctl_unsafe.go
 create mode 100644 pkg/sentry/fsimpl/host/tty.go

diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go
index 271582e54..150ac8e19 100644
--- a/pkg/sentry/fs/host/ioctl_unsafe.go
+++ b/pkg/sentry/fs/host/ioctl_unsafe.go
@@ -21,6 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 )
 
+// LINT.IfChange
+
 func ioctlGetTermios(fd int) (*linux.Termios, error) {
 	var t linux.Termios
 	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t)))
@@ -54,3 +56,5 @@ func ioctlSetWinsize(fd int, w *linux.Winsize) error {
 	}
 	return nil
 }
+
+// LINT.ThenChange(../../fsimpl/host/ioctl_unsafe.go)
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index 3f218b4a7..cb91355ab 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -26,6 +26,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // TTYFileOperations implements fs.FileOperations for a host file descriptor
 // that wraps a TTY FD.
 //
@@ -43,6 +45,7 @@ type TTYFileOperations struct {
 	// connected to this TTY.
 	fgProcessGroup *kernel.ProcessGroup
 
+	// termios contains the terminal attributes for this TTY.
 	termios linux.KernelTermios
 }
 
@@ -357,3 +360,5 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e
 	_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
 	return kernel.ERESTARTSYS
 }
+
+// LINT.ThenChange(../../fsimpl/host/tty.go)
diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go
index 388108fdf..1b0356930 100644
--- a/pkg/sentry/fs/host/util.go
+++ b/pkg/sentry/fs/host/util.go
@@ -23,7 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -80,9 +80,9 @@ func unstableAttr(s *syscall.Stat_t) fs.UnstableAttr {
 		Usage:            s.Blocks * 512,
 		Perms:            fs.FilePermsFromMode(linux.FileMode(s.Mode)),
 		Owner:            owner(s),
-		AccessTime:       time.FromUnix(s.Atim.Sec, s.Atim.Nsec),
-		ModificationTime: time.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
-		StatusChangeTime: time.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
+		AccessTime:       ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec),
+		ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
+		StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
 		Links:            uint64(s.Nlink),
 	}
 }
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index 5d67f88e3..0bb4a5c3e 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -5,8 +5,9 @@ licenses(["notice"])
 go_library(
     name = "host",
     srcs = [
-        "default_file.go",
         "host.go",
+        "ioctl_unsafe.go",
+        "tty.go",
         "util.go",
     ],
     visibility = ["//pkg/sentry:internal"],
@@ -17,9 +18,12 @@ go_library(
         "//pkg/log",
         "//pkg/refs",
         "//pkg/safemem",
+        "//pkg/sentry/arch",
         "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
+        "//pkg/sentry/unimpl",
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserror",
diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go
deleted file mode 100644
index 459238603..000000000
--- a/pkg/sentry/fsimpl/host/default_file.go
+++ /dev/null
@@ -1,247 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package host
-
-import (
-	"math"
-	"syscall"
-
-	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/fd"
-	"gvisor.dev/gvisor/pkg/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-// defaultFileFD implements FileDescriptionImpl for non-socket, non-TTY files.
-type defaultFileFD struct {
-	fileDescription
-
-	// canMap specifies whether we allow the file to be memory mapped.
-	canMap bool
-
-	// mu protects the fields below.
-	mu sync.Mutex
-
-	// offset specifies the current file offset.
-	offset int64
-}
-
-// TODO(gvisor.dev/issue/1672): Implement Waitable interface.
-
-// PRead implements FileDescriptionImpl.
-func (f *defaultFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
-	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
-	if f.inode.isStream {
-		return 0, syserror.ESPIPE
-	}
-
-	return readFromHostFD(ctx, f.inode.hostFD, dst, offset, int(opts.Flags))
-}
-
-// Read implements FileDescriptionImpl.
-func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
-	if f.inode.isStream {
-		// These files can't be memory mapped, assert this.
-		if f.canMap {
-			panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
-		}
-
-		n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags))
-		if isBlockError(err) {
-			// If we got any data at all, return it as a "completed" partial read
-			// rather than retrying until complete.
-			if n != 0 {
-				err = nil
-			} else {
-				err = syserror.ErrWouldBlock
-			}
-		}
-		return n, err
-	}
-	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
-	f.mu.Lock()
-	n, err := readFromHostFD(ctx, f.inode.hostFD, dst, f.offset, int(opts.Flags))
-	f.offset += n
-	f.mu.Unlock()
-	return n, err
-}
-
-func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags int) (int64, error) {
-	// TODO(gvisor.dev/issue/1672): Support select preadv2 flags.
-	if flags != 0 {
-		return 0, syserror.EOPNOTSUPP
-	}
-
-	var reader safemem.Reader
-	if offset == -1 {
-		reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)}
-	} else {
-		reader = safemem.FromVecReaderFunc{
-			func(srcs [][]byte) (int64, error) {
-				n, err := unix.Preadv(hostFD, srcs, offset)
-				return int64(n), err
-			},
-		}
-	}
-	n, err := dst.CopyOutFrom(ctx, reader)
-	return int64(n), err
-}
-
-// PWrite implements FileDescriptionImpl.
-func (f *defaultFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
-	if f.inode.isStream {
-		return 0, syserror.ESPIPE
-	}
-	return writeToHostFD(ctx, f.inode.hostFD, src, offset, int(opts.Flags))
-}
-
-// Write implements FileDescriptionImpl.
-func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
-	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
-	if f.inode.isStream {
-		// These files can't be memory mapped, assert this.
-		if f.canMap {
-			panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
-		}
-
-		n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags))
-		if isBlockError(err) {
-			err = syserror.ErrWouldBlock
-		}
-		return n, err
-	}
-	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
-	// TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file.
-	f.mu.Lock()
-	n, err := writeToHostFD(ctx, f.inode.hostFD, src, f.offset, int(opts.Flags))
-	f.offset += n
-	f.mu.Unlock()
-	return n, err
-}
-
-func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags int) (int64, error) {
-	// TODO(gvisor.dev/issue/1672): Support select pwritev2 flags.
-	if flags != 0 {
-		return 0, syserror.EOPNOTSUPP
-	}
-
-	limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
-	if err != nil {
-		return 0, err
-	}
-	src = src.TakeFirst64(limit)
-
-	var writer safemem.Writer
-	if offset == -1 {
-		writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)}
-	} else {
-		writer = safemem.FromVecWriterFunc{
-			func(srcs [][]byte) (int64, error) {
-				n, err := unix.Pwritev(hostFD, srcs, offset)
-				return int64(n), err
-			},
-		}
-	}
-	n, err := src.CopyInTo(ctx, writer)
-	return int64(n), err
-}
-
-// Seek implements FileDescriptionImpl.
-//
-// Note that we do not support seeking on directories, since we do not even
-// allow directory fds to be imported at all.
-func (f *defaultFileFD) Seek(_ context.Context, offset int64, whence int32) (int64, error) {
-	// TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null.
-	if f.inode.isStream {
-		return 0, syserror.ESPIPE
-	}
-
-	f.mu.Lock()
-	defer f.mu.Unlock()
-
-	switch whence {
-	case linux.SEEK_SET:
-		if offset < 0 {
-			return f.offset, syserror.EINVAL
-		}
-		f.offset = offset
-
-	case linux.SEEK_CUR:
-		// Check for overflow. Note that underflow cannot occur, since f.offset >= 0.
-		if offset > math.MaxInt64-f.offset {
-			return f.offset, syserror.EOVERFLOW
-		}
-		if f.offset+offset < 0 {
-			return f.offset, syserror.EINVAL
-		}
-		f.offset += offset
-
-	case linux.SEEK_END:
-		var s syscall.Stat_t
-		if err := syscall.Fstat(f.inode.hostFD, &s); err != nil {
-			return f.offset, err
-		}
-		size := s.Size
-
-		// Check for overflow. Note that underflow cannot occur, since size >= 0.
-		if offset > math.MaxInt64-size {
-			return f.offset, syserror.EOVERFLOW
-		}
-		if size+offset < 0 {
-			return f.offset, syserror.EINVAL
-		}
-		f.offset = size + offset
-
-	case linux.SEEK_DATA, linux.SEEK_HOLE:
-		// Modifying the offset in the host file table should not matter, since
-		// this is the only place where we use it.
-		//
-		// For reading and writing, we always rely on our internal offset.
-		n, err := unix.Seek(f.inode.hostFD, offset, int(whence))
-		if err != nil {
-			return f.offset, err
-		}
-		f.offset = n
-
-	default:
-		// Invalid whence.
-		return f.offset, syserror.EINVAL
-	}
-
-	return f.offset, nil
-}
-
-// Sync implements FileDescriptionImpl.
-func (f *defaultFileFD) Sync(context.Context) error {
-	// TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything.
-	return unix.Fsync(f.inode.hostFD)
-}
-
-// ConfigureMMap implements FileDescriptionImpl.
-func (f *defaultFileFD) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
-	if !f.canMap {
-		return syserror.ENODEV
-	}
-	// TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface.
-	return syserror.ENODEV
-}
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 2eebcd60c..3afb41395 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -19,18 +19,23 @@ package host
 import (
 	"errors"
 	"fmt"
+	"math"
 	"syscall"
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // filesystem implements vfs.FilesystemImpl.
@@ -70,10 +75,20 @@ func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID
 		hostFD:   hostFD,
 		isStream: isStream,
 		isTTY:    isTTY,
+		canMap:   canMap(uint32(fileType)),
 		ino:      fs.NextIno(),
 		mode:     fileMode,
 		uid:      ownerUID,
 		gid:      ownerGID,
+		// For simplicity, set offset to 0. Technically, we should
+		// only set to 0 on files that are not seekable (sockets, pipes, etc.),
+		// and use the offset from the host fd otherwise.
+		offset: 0,
+	}
+
+	// These files can't be memory mapped, assert this.
+	if i.isStream && i.canMap {
+		panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
 	}
 
 	d := &kernfs.Dentry{}
@@ -110,12 +125,17 @@ type inode struct {
 	// This field is initialized at creation time and is immutable.
 	isTTY bool
 
+	// canMap specifies whether we allow the file to be memory mapped.
+	//
+	// This field is initialized at creation time and is immutable.
+	canMap bool
+
 	// ino is an inode number unique within this filesystem.
+	//
+	// This field is initialized at creation time and is immutable.
 	ino uint64
 
-	// mu protects the inode metadata below.
-	// TODO(gvisor.dev/issue/1672): actually protect fields below.
-	//mu sync.Mutex
+	// TODO(gvisor.dev/issue/1672): protect mode, uid, and gid with mutex.
 
 	// mode is the file mode of this inode. Note that this value may become out
 	// of date if the mode is changed on the host, e.g. with chmod.
@@ -125,6 +145,12 @@ type inode struct {
 	// file created on import, not the fd on the host.
 	uid auth.KUID
 	gid auth.KGID
+
+	// offsetMu protects offset.
+	offsetMu sync.Mutex
+
+	// offset specifies the current file offset.
+	offset int64
 }
 
 // Note that these flags may become out of date, since they can be modified
@@ -336,36 +362,40 @@ func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error
 
 	// TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that
 	// we don't allow importing arbitrary file types without proper support.
+	var (
+		vfsfd  *vfs.FileDescription
+		fdImpl vfs.FileDescriptionImpl
+	)
 	if i.isTTY {
-		// TODO(gvisor.dev/issue/1672): support importing host fd as TTY.
-		return nil, errors.New("importing host fd as TTY not supported")
-	}
-
-	// For simplicity, set offset to 0. Technically, we should
-	// only set to 0 on files that are not seekable (sockets, pipes, etc.),
-	// and use the offset from the host fd otherwise.
-	fd := &defaultFileFD{
-		fileDescription: fileDescription{
-			inode: i,
-		},
-		canMap: canMap(uint32(fileType)),
-		mu:     sync.Mutex{},
-		offset: 0,
+		fd := &ttyFD{
+			fileDescription: fileDescription{inode: i},
+			termios:         linux.DefaultSlaveTermios,
+		}
+		vfsfd = &fd.vfsfd
+		fdImpl = fd
+	} else {
+		// For simplicity, set offset to 0. Technically, we should
+		// only set to 0 on files that are not seekable (sockets, pipes, etc.),
+		// and use the offset from the host fd otherwise.
+		fd := &fileDescription{inode: i}
+		vfsfd = &fd.vfsfd
+		fdImpl = fd
 	}
 
-	vfsfd := &fd.vfsfd
 	flags, err := fileFlagsFromHostFD(i.hostFD)
 	if err != nil {
 		return nil, err
 	}
 
-	if err := vfsfd.Init(fd, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := vfsfd.Init(fdImpl, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
 	return vfsfd, nil
 }
 
 // fileDescription is embedded by host fd implementations of FileDescriptionImpl.
+//
+// TODO(gvisor.dev/issue/1672): Implement Waitable interface.
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -394,3 +424,193 @@ func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.S
 func (f *fileDescription) Release() {
 	// noop
 }
+
+// PRead implements FileDescriptionImpl.
+func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	i := f.inode
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if i.isStream {
+		return 0, syserror.ESPIPE
+	}
+
+	return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags)
+}
+
+// Read implements FileDescriptionImpl.
+func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	i := f.inode
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if i.isStream {
+		n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags)
+		if isBlockError(err) {
+			// If we got any data at all, return it as a "completed" partial read
+			// rather than retrying until complete.
+			if n != 0 {
+				err = nil
+			} else {
+				err = syserror.ErrWouldBlock
+			}
+		}
+		return n, err
+	}
+	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+	i.offsetMu.Lock()
+	n, err := readFromHostFD(ctx, i.hostFD, dst, i.offset, opts.Flags)
+	i.offset += n
+	i.offsetMu.Unlock()
+	return n, err
+}
+
+func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
+	// TODO(gvisor.dev/issue/1672): Support select preadv2 flags.
+	if flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	var reader safemem.Reader
+	if offset == -1 {
+		reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)}
+	} else {
+		reader = safemem.FromVecReaderFunc{
+			func(srcs [][]byte) (int64, error) {
+				n, err := unix.Preadv(hostFD, srcs, offset)
+				return int64(n), err
+			},
+		}
+	}
+	n, err := dst.CopyOutFrom(ctx, reader)
+	return int64(n), err
+}
+
+// PWrite implements FileDescriptionImpl.
+func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	i := f.inode
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if i.isStream {
+		return 0, syserror.ESPIPE
+	}
+
+	return writeToHostFD(ctx, i.hostFD, src, offset, opts.Flags)
+}
+
+// Write implements FileDescriptionImpl.
+func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	i := f.inode
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if i.isStream {
+		n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags)
+		if isBlockError(err) {
+			err = syserror.ErrWouldBlock
+		}
+		return n, err
+	}
+	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+	// TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file.
+	i.offsetMu.Lock()
+	n, err := writeToHostFD(ctx, i.hostFD, src, i.offset, opts.Flags)
+	i.offset += n
+	i.offsetMu.Unlock()
+	return n, err
+}
+
+func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
+	// TODO(gvisor.dev/issue/1672): Support select pwritev2 flags.
+	if flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	var writer safemem.Writer
+	if offset == -1 {
+		writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)}
+	} else {
+		writer = safemem.FromVecWriterFunc{
+			func(srcs [][]byte) (int64, error) {
+				n, err := unix.Pwritev(hostFD, srcs, offset)
+				return int64(n), err
+			},
+		}
+	}
+	n, err := src.CopyInTo(ctx, writer)
+	return int64(n), err
+}
+
+// Seek implements FileDescriptionImpl.
+//
+// Note that we do not support seeking on directories, since we do not even
+// allow directory fds to be imported at all.
+func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) {
+	i := f.inode
+	// TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null.
+	if i.isStream {
+		return 0, syserror.ESPIPE
+	}
+
+	i.offsetMu.Lock()
+	defer i.offsetMu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		if offset < 0 {
+			return i.offset, syserror.EINVAL
+		}
+		i.offset = offset
+
+	case linux.SEEK_CUR:
+		// Check for overflow. Note that underflow cannot occur, since i.offset >= 0.
+		if offset > math.MaxInt64-i.offset {
+			return i.offset, syserror.EOVERFLOW
+		}
+		if i.offset+offset < 0 {
+			return i.offset, syserror.EINVAL
+		}
+		i.offset += offset
+
+	case linux.SEEK_END:
+		var s syscall.Stat_t
+		if err := syscall.Fstat(i.hostFD, &s); err != nil {
+			return i.offset, err
+		}
+		size := s.Size
+
+		// Check for overflow. Note that underflow cannot occur, since size >= 0.
+		if offset > math.MaxInt64-size {
+			return i.offset, syserror.EOVERFLOW
+		}
+		if size+offset < 0 {
+			return i.offset, syserror.EINVAL
+		}
+		i.offset = size + offset
+
+	case linux.SEEK_DATA, linux.SEEK_HOLE:
+		// Modifying the offset in the host file table should not matter, since
+		// this is the only place where we use it.
+		//
+		// For reading and writing, we always rely on our internal offset.
+		n, err := unix.Seek(i.hostFD, offset, int(whence))
+		if err != nil {
+			return i.offset, err
+		}
+		i.offset = n
+
+	default:
+		// Invalid whence.
+		return i.offset, syserror.EINVAL
+	}
+
+	return i.offset, nil
+}
+
+// Sync implements FileDescriptionImpl.
+func (f *fileDescription) Sync(context.Context) error {
+	// TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything.
+	return unix.Fsync(f.inode.hostFD)
+}
+
+// ConfigureMMap implements FileDescriptionImpl.
+func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
+	if !f.inode.canMap {
+		return syserror.ENODEV
+	}
+	// TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface.
+	return syserror.ENODEV
+}
diff --git a/pkg/sentry/fsimpl/host/ioctl_unsafe.go b/pkg/sentry/fsimpl/host/ioctl_unsafe.go
new file mode 100644
index 000000000..0983bf7d8
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/ioctl_unsafe.go
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+func ioctlGetTermios(fd int) (*linux.Termios, error) {
+	var t linux.Termios
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t)))
+	if errno != 0 {
+		return nil, errno
+	}
+	return &t, nil
+}
+
+func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error {
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t)))
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+func ioctlGetWinsize(fd int) (*linux.Winsize, error) {
+	var w linux.Winsize
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w)))
+	if errno != 0 {
+		return nil, errno
+	}
+	return &w, nil
+}
+
+func ioctlSetWinsize(fd int, w *linux.Winsize) error {
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w)))
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
new file mode 100644
index 000000000..8936afb06
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -0,0 +1,379 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/unimpl"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// ttyFD implements vfs.FileDescriptionImpl for a host file descriptor
+// that wraps a TTY FD.
+type ttyFD struct {
+	fileDescription
+
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// session is the session attached to this ttyFD.
+	session *kernel.Session
+
+	// fgProcessGroup is the foreground process group that is currently
+	// connected to this TTY.
+	fgProcessGroup *kernel.ProcessGroup
+
+	// termios contains the terminal attributes for this TTY.
+	termios linux.KernelTermios
+}
+
+// InitForegroundProcessGroup sets the foreground process group and session for
+// the TTY. This should only be called once, after the foreground process group
+// has been created, but before it has started running.
+func (t *ttyFD) InitForegroundProcessGroup(pg *kernel.ProcessGroup) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.fgProcessGroup != nil {
+		panic("foreground process group is already set")
+	}
+	t.fgProcessGroup = pg
+	t.session = pg.Session()
+}
+
+// ForegroundProcessGroup returns the foreground process for the TTY.
+func (t *ttyFD) ForegroundProcessGroup() *kernel.ProcessGroup {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.fgProcessGroup
+}
+
+// Release implements fs.FileOperations.Release.
+func (t *ttyFD) Release() {
+	t.mu.Lock()
+	t.fgProcessGroup = nil
+	t.mu.Unlock()
+
+	t.fileDescription.Release()
+}
+
+// PRead implements vfs.FileDescriptionImpl.
+//
+// Reading from a TTY is only allowed for foreground process groups. Background
+// process groups will either get EIO or a SIGTTIN.
+func (t *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Are we allowed to do the read?
+	// drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change().
+	if err := t.checkChange(ctx, linux.SIGTTIN); err != nil {
+		return 0, err
+	}
+
+	// Do the read.
+	return t.fileDescription.PRead(ctx, dst, offset, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.
+//
+// Reading from a TTY is only allowed for foreground process groups. Background
+// process groups will either get EIO or a SIGTTIN.
+func (t *ttyFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Are we allowed to do the read?
+	// drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change().
+	if err := t.checkChange(ctx, linux.SIGTTIN); err != nil {
+		return 0, err
+	}
+
+	// Do the read.
+	return t.fileDescription.Read(ctx, dst, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.
+func (t *ttyFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Check whether TOSTOP is enabled. This corresponds to the check in
+	// drivers/tty/n_tty.c:n_tty_write().
+	if t.termios.LEnabled(linux.TOSTOP) {
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
+	}
+	return t.fileDescription.PWrite(ctx, src, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.
+func (t *ttyFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Check whether TOSTOP is enabled. This corresponds to the check in
+	// drivers/tty/n_tty.c:n_tty_write().
+	if t.termios.LEnabled(linux.TOSTOP) {
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
+	}
+	return t.fileDescription.Write(ctx, src, opts)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.
+func (t *ttyFD) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	// Ignore arg[0]. This is the real FD:
+	fd := t.inode.hostFD
+	ioctl := args[1].Uint64()
+	switch ioctl {
+	case linux.TCGETS:
+		termios, err := ioctlGetTermios(fd)
+		if err != nil {
+			return 0, err
+		}
+		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
+
+		var termios linux.Termios
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		err := ioctlSetTermios(fd, ioctl, &termios)
+		if err == nil {
+			t.termios.FromTermios(termios)
+		}
+		return 0, err
+
+	case linux.TIOCGPGRP:
+		// Args: pid_t *argp
+		// When successful, equivalent to *argp = tcgetpgrp(fd).
+		// Get the process group ID of the foreground process group on this
+		// terminal.
+
+		pidns := kernel.PIDNamespaceFromContext(ctx)
+		if pidns == nil {
+			return 0, syserror.ENOTTY
+		}
+
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		// Map the ProcessGroup into a ProcessGroupID in the task's PID namespace.
+		pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TIOCSPGRP:
+		// Args: const pid_t *argp
+		// Equivalent to tcsetpgrp(fd, *argp).
+		// Set the foreground process group ID of this terminal.
+
+		task := kernel.TaskFromContext(ctx)
+		if task == nil {
+			return 0, syserror.ENOTTY
+		}
+
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		// Check that we are allowed to set the process group.
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			// drivers/tty/tty_io.c:tiocspgrp() converts -EIO from tty_check_change()
+			// to -ENOTTY.
+			if err == syserror.EIO {
+				return 0, syserror.ENOTTY
+			}
+			return 0, err
+		}
+
+		// Check that calling task's process group is in the TTY session.
+		if task.ThreadGroup().Session() != t.session {
+			return 0, syserror.ENOTTY
+		}
+
+		var pgID kernel.ProcessGroupID
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+
+		// pgID must be non-negative.
+		if pgID < 0 {
+			return 0, syserror.EINVAL
+		}
+
+		// Process group with pgID must exist in this PID namespace.
+		pidns := task.PIDNamespace()
+		pg := pidns.ProcessGroupWithID(pgID)
+		if pg == nil {
+			return 0, syserror.ESRCH
+		}
+
+		// Check that new process group is in the TTY session.
+		if pg.Session() != t.session {
+			return 0, syserror.EPERM
+		}
+
+		t.fgProcessGroup = pg
+		return 0, nil
+
+	case linux.TIOCGWINSZ:
+		// Args: struct winsize *argp
+		// Get window size.
+		winsize, err := ioctlGetWinsize(fd)
+		if err != nil {
+			return 0, err
+		}
+		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TIOCSWINSZ:
+		// Args: const struct winsize *argp
+		// Set window size.
+
+		// Unlike setting the termios, any process group (even background ones) can
+		// set the winsize.
+
+		var winsize linux.Winsize
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		err := ioctlSetWinsize(fd, &winsize)
+		return 0, err
+
+	// Unimplemented commands.
+	case linux.TIOCSETD,
+		linux.TIOCSBRK,
+		linux.TIOCCBRK,
+		linux.TCSBRK,
+		linux.TCSBRKP,
+		linux.TIOCSTI,
+		linux.TIOCCONS,
+		linux.FIONBIO,
+		linux.TIOCEXCL,
+		linux.TIOCNXCL,
+		linux.TIOCGEXCL,
+		linux.TIOCNOTTY,
+		linux.TIOCSCTTY,
+		linux.TIOCGSID,
+		linux.TIOCGETD,
+		linux.TIOCVHANGUP,
+		linux.TIOCGDEV,
+		linux.TIOCMGET,
+		linux.TIOCMSET,
+		linux.TIOCMBIC,
+		linux.TIOCMBIS,
+		linux.TIOCGICOUNT,
+		linux.TCFLSH,
+		linux.TIOCSSERIAL,
+		linux.TIOCGPTPEER:
+
+		unimpl.EmitUnimplementedEvent(ctx)
+		fallthrough
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// checkChange checks that the process group is allowed to read, write, or
+// change the state of the TTY.
+//
+// This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic
+// is a bit convoluted, but documented inline.
+//
+// Preconditions: t.mu must be held.
+func (t *ttyFD) checkChange(ctx context.Context, sig linux.Signal) error {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		// No task? Linux does not have an analog for this case, but
+		// tty_check_change is more of a blacklist of cases than a
+		// whitelist, and is surprisingly permissive. Allowing the
+		// change seems most appropriate.
+		return nil
+	}
+
+	tg := task.ThreadGroup()
+	pg := tg.ProcessGroup()
+
+	// If the session for the task is different than the session for the
+	// controlling TTY, then the change is allowed. Seems like a bad idea,
+	// but that's exactly what linux does.
+	if tg.Session() != t.fgProcessGroup.Session() {
+		return nil
+	}
+
+	// If we are the foreground process group, then the change is allowed.
+	if pg == t.fgProcessGroup {
+		return nil
+	}
+
+	// We are not the foreground process group.
+
+	// Is the provided signal blocked or ignored?
+	if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) {
+		// If the signal is SIGTTIN, then we are attempting to read
+		// from the TTY. Don't send the signal and return EIO.
+		if sig == linux.SIGTTIN {
+			return syserror.EIO
+		}
+
+		// Otherwise, we are writing or changing terminal state. This is allowed.
+		return nil
+	}
+
+	// If the process group is an orphan, return EIO.
+	if pg.IsOrphan() {
+		return syserror.EIO
+	}
+
+	// Otherwise, send the signal to the process group and return ERESTARTSYS.
+	//
+	// Note that Linux also unconditionally sets TIF_SIGPENDING on current,
+	// but this isn't necessary in gVisor because the rationale given in
+	// 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't
+	// apply: the sentry will handle -ERESTARTSYS in
+	// kernel.runApp.execute() even if the kernel.Task isn't interrupted.
+	//
+	// Linux ignores the result of kill_pgrp().
+	_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
+	return kernel.ERESTARTSYS
+}
-- 
cgit v1.2.3


From e9e399c25d4fcad2adfe92d73b192b9784774964 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 19 Mar 2020 07:18:47 -0700
Subject: Remove workMu from tcpip.Endpoint.

workMu is removed and e.mu is now a mutex that supports TryLock.  The packet
processing path tries to lock the mutex and if its locked it will just queue the
packet and move on. The endpoint.UnlockUser() will process any backlog of
packets before unlocking the socket.

This simplifies the locking inside tcp endpoints a lot. Further the
endpoint.LockUser() implements spinning as long as the lock is not held by
another syscall goroutine. This ensures low latency as not spinning leads to the
task thread being put to sleep if the lock is held by the packet dispatch
path. This is suboptimal as the lower layer rarely holds the lock for long so
implementing spinning here helps.

If the lock is held by another task goroutine then we just proceed to call
LockUser() and the task could be put to sleep.

The protocol goroutines themselves just call e.mu.Lock() and block if the
lock is currently not available.

Updates #231, #357

PiperOrigin-RevId: 301808349
---
 pkg/sentry/kernel/epoll/epoll.go          |   2 +
 pkg/sentry/socket/netstack/netstack.go    |  24 +-
 pkg/tcpip/transport/tcp/accept.go         |  90 +++---
 pkg/tcpip/transport/tcp/connect.go        |  78 ++---
 pkg/tcpip/transport/tcp/dispatcher.go     |   8 +-
 pkg/tcpip/transport/tcp/endpoint.go       | 495 ++++++++++++++++--------------
 pkg/tcpip/transport/tcp/endpoint_state.go |   5 +-
 pkg/tcpip/transport/tcp/protocol.go       |  38 +--
 pkg/tcpip/transport/tcp/rcv.go            |   6 -
 pkg/tcpip/transport/tcp/segment_queue.go  |   8 +-
 pkg/tcpip/transport/tcp/snd.go            |   3 -
 pkg/tcpip/transport/tcp/tcp_test.go       |  41 ++-
 12 files changed, 424 insertions(+), 374 deletions(-)

diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 8bffb78fc..592650923 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -296,8 +296,10 @@ func (*readyCallback) Callback(w *waiter.Entry) {
 		e.waitingList.Remove(entry)
 		e.readyList.PushBack(entry)
 		entry.curList = &e.readyList
+		e.listsMu.Unlock()
 
 		e.Notify(waiter.EventIn)
+		return
 	}
 
 	e.listsMu.Unlock()
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 13a9a60b4..a2e1da02f 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -29,6 +29,7 @@ import (
 	"io"
 	"math"
 	"reflect"
+	"sync/atomic"
 	"syscall"
 	"time"
 
@@ -264,6 +265,12 @@ type SocketOperations struct {
 	skType   linux.SockType
 	protocol int
 
+	// readViewHasData is 1 iff readView has data to be read, 0 otherwise.
+	// Must be accessed using atomic operations. It must only be written
+	// with readMu held but can be read without holding readMu. The latter
+	// is required to avoid deadlocks in epoll Readiness checks.
+	readViewHasData uint32
+
 	// readMu protects access to the below fields.
 	readMu sync.Mutex `state:"nosave"`
 	// readView contains the remaining payload from the last packet.
@@ -410,21 +417,24 @@ func (s *SocketOperations) isPacketBased() bool {
 
 // fetchReadView updates the readView field of the socket if it's currently
 // empty. It assumes that the socket is locked.
+//
+// Precondition: s.readMu must be held.
 func (s *SocketOperations) fetchReadView() *syserr.Error {
 	if len(s.readView) > 0 {
 		return nil
 	}
-
 	s.readView = nil
 	s.sender = tcpip.FullAddress{}
 
 	v, cms, err := s.Endpoint.Read(&s.sender)
 	if err != nil {
+		atomic.StoreUint32(&s.readViewHasData, 0)
 		return syserr.TranslateNetstackError(err)
 	}
 
 	s.readView = v
 	s.readCM = cms
+	atomic.StoreUint32(&s.readViewHasData, 1)
 
 	return nil
 }
@@ -623,11 +633,9 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
 	// Check our cached value iff the caller asked for readability and the
 	// endpoint itself is currently not readable.
 	if (mask & ^r & waiter.EventIn) != 0 {
-		s.readMu.Lock()
-		if len(s.readView) > 0 {
+		if atomic.LoadUint32(&s.readViewHasData) == 1 {
 			r |= waiter.EventIn
 		}
-		s.readMu.Unlock()
 	}
 
 	return r
@@ -2334,6 +2342,10 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq
 		}
 		copied += n
 		s.readView.TrimFront(n)
+		if len(s.readView) == 0 {
+			atomic.StoreUint32(&s.readViewHasData, 0)
+		}
+
 		dst = dst.DropFirst(n)
 		if e != nil {
 			err = syserr.FromError(e)
@@ -2456,6 +2468,10 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 		s.readView.TrimFront(int(n))
 	}
 
+	if len(s.readView) == 0 {
+		atomic.StoreUint32(&s.readViewHasData, 0)
+	}
+
 	var flags int
 	if msgLen > int(n) {
 		flags |= linux.MSG_TRUNC
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 85049e54e..4d7602d54 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -221,7 +221,8 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu
 }
 
 // createConnectingEndpoint creates a new endpoint in a connecting state, with
-// the connection parameters given by the arguments.
+// the connection parameters given by the arguments. The endpoint is returned
+// with n.mu held.
 func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create a new endpoint.
 	netProto := l.netProto
@@ -243,21 +244,6 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 
 	n.initGSO()
 
-	// Now inherit any socket options that should be inherited from the
-	// listening endpoint.
-	// In case of Forwarder listenEP will be nil and hence this check.
-	if l.listenEP != nil {
-		l.listenEP.propagateInheritableOptions(n)
-	}
-
-	// Register new endpoint so that packets are routed to it.
-	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil {
-		n.Close()
-		return nil, err
-	}
-
-	n.isRegistered = true
-
 	// Create sender and receiver.
 	//
 	// The receiver at least temporarily has a zero receive window scale,
@@ -269,11 +255,27 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 	// window to grow to a really large value.
 	n.rcvAutoParams.prevCopied = n.initialReceiveWindow()
 
+	// Lock the endpoint before registering to ensure that no out of
+	// band changes are possible due to incoming packets etc till
+	// the endpoint is done initializing.
+	n.mu.Lock()
+
+	// Register new endpoint so that packets are routed to it.
+	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil {
+		n.mu.Unlock()
+		n.Close()
+		return nil, err
+	}
+
+	n.isRegistered = true
+
 	return n, nil
 }
 
 // createEndpointAndPerformHandshake creates a new endpoint in connected state
 // and then performs the TCP 3-way handshake.
+//
+// The new endpoint is returned with e.mu held.
 func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create new endpoint.
 	irs := s.sequenceNumber
@@ -289,9 +291,25 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 		l.listenEP.mu.Lock()
 		if l.listenEP.EndpointState() != StateListen {
 			l.listenEP.mu.Unlock()
+			// Ensure we release any registrations done by the newly
+			// created endpoint.
+			ep.mu.Unlock()
+			ep.Close()
+
+			// Wake up any waiters. This is strictly not required normally
+			// as a socket that was never accepted can't really have any
+			// registered waiters except when stack.Wait() is called which
+			// waits for all registered endpoints to stop and expects an
+			// EventHUp.
+			ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 			return nil, tcpip.ErrConnectionAborted
 		}
 		l.addPendingEndpoint(ep)
+
+		// Propagate any inheritable options from the listening endpoint
+		// to the newly created endpoint.
+		l.listenEP.propagateInheritableOptionsLocked(ep)
+
 		deferAccept = l.listenEP.deferAccept
 		l.listenEP.mu.Unlock()
 	}
@@ -299,6 +317,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	// Perform the 3-way handshake.
 	h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept)
 	if err := h.execute(); err != nil {
+		ep.mu.Unlock()
 		ep.Close()
 		// Wake up any waiters. This is strictly not required normally
 		// as a socket that was never accepted can't really have any
@@ -312,9 +331,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 		}
 		return nil, err
 	}
-	ep.mu.Lock()
 	ep.isConnectNotified = true
-	ep.mu.Unlock()
 
 	// Update the receive window scaling. We can't do it before the
 	// handshake because it's possible that the peer doesn't support window
@@ -366,12 +383,12 @@ func (e *endpoint) deliverAccepted(n *endpoint) {
 	}
 }
 
-// propagateInheritableOptions propagates any options set on the listening
+// propagateInheritableOptionsLocked propagates any options set on the listening
 // endpoint to the newly created endpoint.
-func (e *endpoint) propagateInheritableOptions(n *endpoint) {
-	e.mu.Lock()
+//
+// Precondition: e.mu and n.mu must be held.
+func (e *endpoint) propagateInheritableOptionsLocked(n *endpoint) {
 	n.userTimeout = e.userTimeout
-	e.mu.Unlock()
 }
 
 // handleSynSegment is called in its own goroutine once the listening endpoint
@@ -382,7 +399,11 @@ func (e *endpoint) propagateInheritableOptions(n *endpoint) {
 // cookies to accept connections.
 func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header.TCPSynOptions) {
 	defer decSynRcvdCount()
-	defer e.decSynRcvdCount()
+	defer func() {
+		e.mu.Lock()
+		e.decSynRcvdCount()
+		e.mu.Unlock()
+	}()
 	defer s.decRef()
 
 	n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{})
@@ -399,29 +420,21 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 }
 
 func (e *endpoint) incSynRcvdCount() bool {
-	e.mu.Lock()
-	if e.synRcvdCount >= cap(e.acceptedChan) {
-		e.mu.Unlock()
+	if e.synRcvdCount >= (cap(e.acceptedChan)) {
 		return false
 	}
 	e.synRcvdCount++
-	e.mu.Unlock()
 	return true
 }
 
 func (e *endpoint) decSynRcvdCount() {
-	e.mu.Lock()
 	e.synRcvdCount--
-	e.mu.Unlock()
 }
 
 func (e *endpoint) acceptQueueIsFull() bool {
-	e.mu.Lock()
 	if l, c := len(e.acceptedChan)+e.synRcvdCount, cap(e.acceptedChan); l >= c {
-		e.mu.Unlock()
 		return true
 	}
-	e.mu.Unlock()
 	return false
 }
 
@@ -559,6 +572,10 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			return
 		}
 
+		// Propagate any inheritable options from the listening endpoint
+		// to the newly created endpoint.
+		e.propagateInheritableOptionsLocked(n)
+
 		// clear the tsOffset for the newly created
 		// endpoint as the Timestamp was already
 		// randomly offset when the original SYN-ACK was
@@ -593,14 +610,12 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 	e.mu.Lock()
 	v6only := e.v6only
-	e.mu.Unlock()
 	ctx := newListenContext(e.stack, e, rcvWnd, v6only, e.NetProto)
 
 	defer func() {
 		// Mark endpoint as closed. This will prevent goroutines running
 		// handleSynSegment() from attempting to queue new connections
 		// to the endpoint.
-		e.mu.Lock()
 		e.setEndpointState(StateClose)
 
 		// close any endpoints in SYN-RCVD state.
@@ -622,7 +637,10 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 	s.AddWaker(&e.notificationWaker, wakerForNotification)
 	s.AddWaker(&e.newSegmentWaker, wakerForNewSegment)
 	for {
-		switch index, _ := s.Fetch(true); index {
+		e.mu.Unlock()
+		index, _ := s.Fetch(true)
+		e.mu.Lock()
+		switch index {
 		case wakerForNotification:
 			n := e.fetchNotifications()
 			if n&notifyClose != 0 {
@@ -635,7 +653,9 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 					s.decRef()
 				}
 				close(e.drainDone)
+				e.mu.Unlock()
 				<-e.undrain
+				e.mu.Lock()
 			}
 
 		case wakerForNewSegment:
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index be86af502..edb37a549 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -61,6 +61,9 @@ const (
 )
 
 // handshake holds the state used during a TCP 3-way handshake.
+//
+// NOTE: handshake.ep.mu is held during handshake processing. It is released if
+// we are going to block and reacquired when we start processing an event.
 type handshake struct {
 	ep     *endpoint
 	state  handshakeState
@@ -209,9 +212,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
 	h.mss = opts.MSS
 	h.sndWndScale = opts.WS
 	h.deferAccept = deferAccept
-	h.ep.mu.Lock()
 	h.ep.setEndpointState(StateSynRecv)
-	h.ep.mu.Unlock()
 }
 
 // checkAck checks if the ACK number, if present, of a segment received during
@@ -241,9 +242,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 			// RFC 793, page 67, states that "If the RST bit is set [and] If the ACK
 			// was acceptable then signal the user "error: connection reset", drop
 			// the segment, enter CLOSED state, delete TCB, and return."
-			h.ep.mu.Lock()
 			h.ep.workerCleanup = true
-			h.ep.mu.Unlock()
 			// Although the RFC above calls out ECONNRESET, Linux actually returns
 			// ECONNREFUSED here so we do as well.
 			return tcpip.ErrConnectionRefused
@@ -281,9 +280,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	if s.flagIsSet(header.TCPFlagAck) {
 		h.state = handshakeCompleted
 
-		h.ep.mu.Lock()
 		h.ep.transitionToStateEstablishedLocked(h)
-		h.ep.mu.Unlock()
 
 		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
 		return nil
@@ -293,11 +290,9 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	// but resend our own SYN and wait for it to be acknowledged in the
 	// SYN-RCVD state.
 	h.state = handshakeSynRcvd
-	h.ep.mu.Lock()
 	ttl := h.ep.ttl
 	amss := h.ep.amss
 	h.ep.setEndpointState(StateSynRecv)
-	h.ep.mu.Unlock()
 	synOpts := header.TCPSynOptions{
 		WS:    int(h.effectiveRcvWndScale()),
 		TS:    rcvSynOpts.TS,
@@ -357,10 +352,6 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			return tcpip.ErrInvalidEndpointState
 		}
 
-		h.ep.mu.RLock()
-		amss := h.ep.amss
-		h.ep.mu.RUnlock()
-
 		h.resetState()
 		synOpts := header.TCPSynOptions{
 			WS:            h.rcvWndScale,
@@ -368,7 +359,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			TSVal:         h.ep.timestamp(),
 			TSEcr:         h.ep.recentTimestamp(),
 			SACKPermitted: h.ep.sackPermitted,
-			MSS:           amss,
+			MSS:           h.ep.amss,
 		}
 		h.ep.sendSynTCP(&s.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
 		return nil
@@ -399,15 +390,14 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 		}
 		h.state = handshakeCompleted
 
-		h.ep.mu.Lock()
 		h.ep.transitionToStateEstablishedLocked(h)
+
 		// If the segment has data then requeue it for the receiver
 		// to process it again once main loop is started.
 		if s.data.Size() > 0 {
 			s.incRef()
 			h.ep.enqueueSegment(s)
 		}
-		h.ep.mu.Unlock()
 		return nil
 	}
 
@@ -493,7 +483,9 @@ func (h *handshake) resolveRoute() *tcpip.Error {
 			}
 			if n&notifyDrain != 0 {
 				close(h.ep.drainDone)
+				h.ep.mu.Unlock()
 				<-h.ep.undrain
+				h.ep.mu.Lock()
 			}
 		}
 
@@ -535,7 +527,6 @@ func (h *handshake) execute() *tcpip.Error {
 
 	// Send the initial SYN segment and loop until the handshake is
 	// completed.
-	h.ep.mu.Lock()
 	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
 
 	synOpts := header.TCPSynOptions{
@@ -546,7 +537,6 @@ func (h *handshake) execute() *tcpip.Error {
 		SACKPermitted: bool(sackEnabled),
 		MSS:           h.ep.amss,
 	}
-	h.ep.mu.Unlock()
 
 	// Execute is also called in a listen context so we want to make sure we
 	// only send the TS/SACK option when we received the TS/SACK in the
@@ -563,7 +553,11 @@ func (h *handshake) execute() *tcpip.Error {
 	h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
 
 	for h.state != handshakeCompleted {
-		switch index, _ := s.Fetch(true); index {
+		h.ep.mu.Unlock()
+		index, _ := s.Fetch(true)
+		h.ep.mu.Lock()
+		switch index {
+
 		case wakerForResend:
 			timeOut *= 2
 			if timeOut > MaxRTO {
@@ -600,7 +594,9 @@ func (h *handshake) execute() *tcpip.Error {
 					}
 				}
 				close(h.ep.drainDone)
+				h.ep.mu.Unlock()
 				<-h.ep.undrain
+				h.ep.mu.Lock()
 			}
 
 		case wakerForNewSegment:
@@ -1016,7 +1012,6 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 		// except SYN-SENT, all reset (RST) segments are
 		// validated by checking their SEQ-fields." So
 		// we only process it if it's acceptable.
-		e.mu.Lock()
 		switch e.EndpointState() {
 		// In case of a RST in CLOSE-WAIT linux moves
 		// the socket to closed state with an error set
@@ -1040,11 +1035,9 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 		case StateCloseWait:
 			e.transitionToStateCloseLocked()
 			e.HardError = tcpip.ErrAborted
-			e.mu.Unlock()
 			e.notifyProtocolGoroutine(notifyTickleWorker)
 			return false, nil
 		default:
-			e.mu.Unlock()
 			// RFC 793, page 37 states that "in all states
 			// except SYN-SENT, all reset (RST) segments are
 			// validated by checking their SEQ-fields." So
@@ -1157,9 +1150,7 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
 		// Now check if the received segment has caused us to transition
 		// to a CLOSED state, if yes then terminate processing and do
 		// not invoke the sender.
-		e.mu.RLock()
 		state := e.state
-		e.mu.RUnlock()
 		if state == StateClose {
 			// When we get into StateClose while processing from the queue,
 			// return immediately and let the protocolMainloop handle it.
@@ -1182,9 +1173,7 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
 // keepalive packets periodically when the connection is idle. If we don't hear
 // from the other side after a number of tries, we terminate the connection.
 func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
-	e.mu.RLock()
 	userTimeout := e.userTimeout
-	e.mu.RUnlock()
 
 	e.keepalive.Lock()
 	if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() {
@@ -1248,6 +1237,7 @@ func (e *endpoint) disableKeepaliveTimer() {
 // goroutine and is responsible for sending segments and handling received
 // segments.
 func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) *tcpip.Error {
+	e.mu.Lock()
 	var closeTimer *time.Timer
 	var closeWaker sleep.Waker
 
@@ -1269,7 +1259,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 		}
 
 		e.mu.Unlock()
-		e.workMu.Unlock()
 		// When the protocol loop exits we should wake up our waiters.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 	}
@@ -1280,16 +1269,13 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 		// completion.
 		initialRcvWnd := e.initialReceiveWindow()
 		h := newHandshake(e, seqnum.Size(initialRcvWnd))
-		e.mu.Lock()
 		h.ep.setEndpointState(StateSynSent)
-		e.mu.Unlock()
 
 		if err := h.execute(); err != nil {
 			e.lastErrorMu.Lock()
 			e.lastError = err
 			e.lastErrorMu.Unlock()
 
-			e.mu.Lock()
 			e.setEndpointState(StateError)
 			e.HardError = err
 
@@ -1302,9 +1288,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 	e.keepalive.timer.init(&e.keepalive.waker)
 	defer e.keepalive.timer.cleanup()
 
-	e.mu.Lock()
 	drained := e.drainDone != nil
-	e.mu.Unlock()
 	if drained {
 		close(e.drainDone)
 		<-e.undrain
@@ -1330,10 +1314,8 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 				// This means the socket is being closed due
 				// to the TCP-FIN-WAIT2 timeout was hit. Just
 				// mark the socket as closed.
-				e.mu.Lock()
 				e.transitionToStateCloseLocked()
 				e.workerCleanup = true
-				e.mu.Unlock()
 				return nil
 			},
 		},
@@ -1388,7 +1370,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 				}
 
 				if n&notifyClose != 0 && closeTimer == nil {
-					e.mu.Lock()
 					if e.EndpointState() == StateFinWait2 && e.closed {
 						// The socket has been closed and we are in FIN_WAIT2
 						// so start the FIN_WAIT2 timer.
@@ -1397,7 +1378,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 						})
 						e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 					}
-					e.mu.Unlock()
 				}
 
 				if n&notifyKeepaliveChanged != 0 {
@@ -1417,7 +1397,9 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 						// Only block the worker if the endpoint
 						// is not in closed state or error state.
 						close(e.drainDone)
+						e.mu.Unlock()
 						<-e.undrain
+						e.mu.Lock()
 					}
 				}
 
@@ -1460,7 +1442,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 	}
 	e.rcvListMu.Unlock()
 
-	e.mu.Lock()
 	if e.workerCleanup {
 		e.notifyProtocolGoroutine(notifyClose)
 	}
@@ -1468,7 +1449,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 	// Main loop. Handle segments until both send and receive ends of the
 	// connection have completed.
 	cleanupOnError := func(err *tcpip.Error) {
-		e.mu.Lock()
 		e.workerCleanup = true
 		if err != nil {
 			e.resetConnectionLocked(err)
@@ -1480,16 +1460,11 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 loop:
 	for e.EndpointState() != StateTimeWait && e.EndpointState() != StateClose && e.EndpointState() != StateError {
 		e.mu.Unlock()
-		e.workMu.Unlock()
 		v, _ := s.Fetch(true)
-		e.workMu.Lock()
+		e.mu.Lock()
 
-		// We need to double check here because the notification maybe
+		// We need to double check here because the notification may be
 		// stale by the time we got around to processing it.
-		//
-		// NOTE: since we now hold the workMu the processors cannot
-		// change the state of the endpoint so it's safe to proceed
-		// after this check.
 		switch e.EndpointState() {
 		case StateError:
 			// If the endpoint has already transitioned to an ERROR
@@ -1502,21 +1477,17 @@ loop:
 		case StateTimeWait:
 			fallthrough
 		case StateClose:
-			e.mu.Lock()
 			break loop
 		default:
 			if err := funcs[v].f(); err != nil {
 				cleanupOnError(err)
 				return nil
 			}
-			e.mu.Lock()
 		}
 	}
 
-	state := e.EndpointState()
-	e.mu.Unlock()
 	var reuseTW func()
-	if state == StateTimeWait {
+	if e.EndpointState() == StateTimeWait {
 		// Disable close timer as we now entering real TIME_WAIT.
 		if closeTimer != nil {
 			closeTimer.Stop()
@@ -1526,14 +1497,11 @@ loop:
 		s.Done()
 		// Wake up any waiters before we enter TIME_WAIT.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
-		e.mu.Lock()
 		e.workerCleanup = true
-		e.mu.Unlock()
 		reuseTW = e.doTimeWait()
 	}
 
 	// Mark endpoint as closed.
-	e.mu.Lock()
 	if e.EndpointState() != StateError {
 		e.transitionToStateCloseLocked()
 	}
@@ -1649,9 +1617,9 @@ func (e *endpoint) doTimeWait() (twReuse func()) {
 	defer timeWaitTimer.Stop()
 
 	for {
-		e.workMu.Unlock()
+		e.mu.Unlock()
 		v, _ := s.Fetch(true)
-		e.workMu.Lock()
+		e.mu.Lock()
 		switch v {
 		case newSegment:
 			extendTimeWait, reuseTW := e.handleTimeWaitSegments()
@@ -1674,7 +1642,9 @@ func (e *endpoint) doTimeWait() (twReuse func()) {
 					e.handleTimeWaitSegments()
 				}
 				close(e.drainDone)
+				e.mu.Unlock()
 				<-e.undrain
+				e.mu.Lock()
 				return nil
 			}
 		case timeWaitDone:
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
index d792b07d6..90ac956a9 100644
--- a/pkg/tcpip/transport/tcp/dispatcher.go
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -128,7 +128,7 @@ func (p *processor) handleSegments() {
 				continue
 			}
 
-			if !ep.workMu.TryLock() {
+			if !ep.mu.TryLock() {
 				ep.newSegmentWaker.Assert()
 				continue
 			}
@@ -138,12 +138,10 @@ func (p *processor) handleSegments() {
 			if err := ep.handleSegments(true /* fastPath */); err != nil || ep.EndpointState() == StateClose {
 				// Send any active resets if required.
 				if err != nil {
-					ep.mu.Lock()
 					ep.resetConnectionLocked(err)
-					ep.mu.Unlock()
 				}
 				ep.notifyProtocolGoroutine(notifyTickleWorker)
-				ep.workMu.Unlock()
+				ep.mu.Unlock()
 				continue
 			}
 
@@ -151,7 +149,7 @@ func (p *processor) handleSegments() {
 				p.epQ.enqueue(ep)
 			}
 
-			ep.workMu.Unlock()
+			ep.mu.Unlock()
 		}
 	}
 }
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 5187a5e25..eb8a9d73e 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -18,6 +18,7 @@ import (
 	"encoding/binary"
 	"fmt"
 	"math"
+	"runtime"
 	"strings"
 	"sync/atomic"
 	"time"
@@ -33,7 +34,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
-	"gvisor.dev/gvisor/pkg/tmutex"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -283,6 +283,37 @@ func (*EndpointInfo) IsEndpointInfo() {}
 // synchronized. The protocol implementation, however, runs in a single
 // goroutine.
 //
+// Each endpoint has a few mutexes:
+//
+// e.mu -> Primary mutex for an endpoint must be held for all operations except
+// in e.Readiness where acquiring it will result in a deadlock in epoll
+// implementation.
+//
+// The following three mutexes can be acquired independent of e.mu but if
+// acquired with e.mu then e.mu must be acquired first.
+//
+// e.rcvListMu -> Protects the rcvList and associated fields.
+// e.sndBufMu -> Protects the sndQueue and associated fields.
+// e.lastErrorMu -> Protects the lastError field.
+//
+// LOCKING/UNLOCKING of the endpoint.  The locking of an endpoint is different
+// based on the context in which the lock is acquired. In the syscall context
+// e.LockUser/e.UnlockUser should be used and when doing background processing
+// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below
+// in brief.
+//
+// The reason for this locking behaviour is to avoid wakeups to handle packets.
+// In cases where the endpoint is already locked the background processor can
+// queue the packet up and go its merry way and the lock owner will eventually
+// process the backlog when releasing the lock. Similarly when acquiring the
+// lock from say a syscall goroutine we can implement a bit of spinning if we
+// know that the lock is not held by another syscall goroutine. Background
+// processors should never hold the lock for long and we can avoid an expensive
+// sleep/wakeup by spinning for a shortwhile.
+//
+// For more details please see the detailed documentation on
+// e.LockUser/e.UnlockUser methods.
+//
 // +stateify savable
 type endpoint struct {
 	EndpointInfo
@@ -299,12 +330,6 @@ type endpoint struct {
 	// Precondition: epQueue.mu must be held to read/write this field..
 	pendingProcessing bool `state:"nosave"`
 
-	// workMu is used to arbitrate which goroutine may perform protocol
-	// work. Only the main protocol goroutine is expected to call Lock() on
-	// it, but other goroutines (e.g., send) may call TryLock() to eagerly
-	// perform work without having to wait for the main one to wake up.
-	workMu tmutex.Mutex `state:"nosave"`
-
 	// The following fields are initialized at creation time and do not
 	// change throughout the lifetime of the endpoint.
 	stack       *stack.Stack  `state:"manual"`
@@ -330,15 +355,11 @@ type endpoint struct {
 	rcvBufSize    int
 	rcvBufUsed    int
 	rcvAutoParams rcvBufAutoTuneParams
-	// zeroWindow indicates that the window was closed due to receive buffer
-	// space being filled up. This is set by the worker goroutine before
-	// moving a segment to the rcvList. This setting is cleared by the
-	// endpoint when a Read() call reads enough data for the new window to
-	// be non-zero.
-	zeroWindow bool
 
-	// The following fields are protected by the mutex.
-	mu sync.RWMutex `state:"nosave"`
+	// mu protects all endpoint fields unless documented otherwise. mu must
+	// be acquired before interacting with the endpoint fields.
+	mu          sync.Mutex `state:"nosave"`
+	ownedByUser uint32
 
 	// state must be read/set using the EndpointState()/setEndpointState() methods.
 	state EndpointState `state:".(EndpointState)"`
@@ -583,14 +604,93 @@ func calculateAdvertisedMSS(userMSS uint16, r stack.Route) uint16 {
 	return maxMSS
 }
 
+// LockUser tries to lock e.mu and if it fails it will check if the lock is held
+// by another syscall goroutine. If yes, then it will goto sleep waiting for the
+// lock to be released, if not then it will spin till it acquires the lock or
+// another syscall goroutine acquires it in which case it will goto sleep as
+// described above.
+//
+// The assumption behind spinning here being that background packet processing
+// should not be holding the lock for long and spinning reduces latency as we
+// avoid an expensive sleep/wakeup of of the syscall goroutine).
+func (e *endpoint) LockUser() {
+	for {
+		// Try first if the sock is locked then check if it's owned
+		// by another user goroutine if not then we spin, otherwise
+		// we just goto sleep on the Lock() and wait.
+		if !e.mu.TryLock() {
+			// If socket is owned by the user then just goto sleep
+			// as the lock could be held for a reasonably long time.
+			if atomic.LoadUint32(&e.ownedByUser) == 1 {
+				e.mu.Lock()
+				atomic.StoreUint32(&e.ownedByUser, 1)
+				return
+			}
+			// Spin but yield the processor since the lower half
+			// should yield the lock soon.
+			runtime.Gosched()
+			continue
+		}
+		atomic.StoreUint32(&e.ownedByUser, 1)
+		return
+	}
+}
+
+// UnlockUser will check if there are any segments already queued for processing
+// and process any such segments before unlocking e.mu. This is required because
+// we when packets arrive and endpoint lock is already held then such packets
+// are queued up to be processed. If the lock is held by the endpoint goroutine
+// then it will process these packets but if the lock is instead held by the
+// syscall goroutine then we can have the syscall goroutine process the backlog
+// before unlocking.
+//
+// This avoids an unnecessary wakeup of the endpoint protocol goroutine for the
+// endpoint. It's also required eventually when we get rid of the endpoint
+// protocol goroutine altogether.
+//
+// Precondition: e.LockUser() must have been called before calling e.UnlockUser()
+func (e *endpoint) UnlockUser() {
+	// Lock segment queue before checking so that we avoid a race where
+	// segments can be queued between the time we check if queue is empty
+	// and actually unlock the endpoint mutex.
+	for {
+		e.segmentQueue.mu.Lock()
+		if e.segmentQueue.emptyLocked() {
+			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
+				panic("e.UnlockUser() called without calling e.LockUser()")
+			}
+			e.mu.Unlock()
+			e.segmentQueue.mu.Unlock()
+			return
+		}
+		e.segmentQueue.mu.Unlock()
+
+		switch e.EndpointState() {
+		case StateEstablished:
+			if err := e.handleSegments(true /* fastPath */); err != nil {
+				e.notifyProtocolGoroutine(notifyTickleWorker)
+			}
+		default:
+			// Since we are waking the endpoint goroutine here just unlock
+			// and let it process the queued segments.
+			e.newSegmentWaker.Assert()
+			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
+				panic("e.UnlockUser() called without calling e.LockUser()")
+			}
+			e.mu.Unlock()
+			return
+		}
+	}
+}
+
 // StopWork halts packet processing. Only to be used in tests.
 func (e *endpoint) StopWork() {
-	e.workMu.Lock()
+	e.mu.Lock()
 }
 
 // ResumeWork resumes packet processing. Only to be used in tests.
 func (e *endpoint) ResumeWork() {
-	e.workMu.Unlock()
+	e.mu.Unlock()
 }
 
 // setEndpointState updates the state of the endpoint to state atomically. This
@@ -709,8 +809,6 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 	}
 
 	e.segmentQueue.setLimit(MaxUnprocessedSegments)
-	e.workMu.Init()
-	e.workMu.Lock()
 	e.tsOffset = timeStampOffset()
 
 	return e
@@ -721,9 +819,6 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 	result := waiter.EventMask(0)
 
-	e.mu.RLock()
-	defer e.mu.RUnlock()
-
 	switch e.EndpointState() {
 	case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
 		// Ready for nothing.
@@ -823,20 +918,22 @@ func (e *endpoint) Abort() {
 // with it. It must be called only once and with no other concurrent calls to
 // the endpoint.
 func (e *endpoint) Close() {
-	e.mu.Lock()
-	closed := e.closed
-	e.closed = true
-	e.mu.Unlock()
-	if closed {
+	e.LockUser()
+	defer e.UnlockUser()
+	if e.closed {
 		return
 	}
 
 	// Issue a shutdown so that the peer knows we won't send any more data
 	// if we're connected, or stop accepting if we're listening.
-	e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
-
-	e.mu.Lock()
+	e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
+	e.closeNoShutdownLocked()
+}
 
+// closeNoShutdown closes the endpoint without doing a full shutdown. This is
+// used when a connection needs to be aborted with a RST and we want to skip
+// a full 4 way TCP shutdown.
+func (e *endpoint) closeNoShutdownLocked() {
 	// For listening sockets, we always release ports inline so that they
 	// are immediately available for reuse after Close() is called. If also
 	// registered, we unregister as well otherwise the next user would fail
@@ -853,6 +950,8 @@ func (e *endpoint) Close() {
 		e.boundPortFlags = ports.Flags{}
 	}
 
+	// Mark endpoint as closed.
+	e.closed = true
 	// Either perform the local cleanup or kick the worker to make sure it
 	// knows it needs to cleanup.
 	switch e.EndpointState() {
@@ -873,8 +972,6 @@ func (e *endpoint) Close() {
 		// goroutine terminates.
 		e.notifyProtocolGoroutine(notifyClose)
 	}
-
-	e.mu.Unlock()
 }
 
 // closePendingAcceptableConnections closes all connections that have completed
@@ -909,7 +1006,6 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
 // after Close() is called and the worker goroutine (if any) is done with its
 // work.
 func (e *endpoint) cleanupLocked() {
-
 	// Close all endpoints that might have been accepted by TCP but not by
 	// the client.
 	if e.acceptedChan != nil {
@@ -954,18 +1050,18 @@ func (e *endpoint) initialReceiveWindow() int {
 // ModerateRecvBuf adjusts the receive buffer and the advertised window
 // based on the number of bytes copied to user space.
 func (e *endpoint) ModerateRecvBuf(copied int) {
-	e.mu.RLock()
+	e.LockUser()
+	defer e.UnlockUser()
+
 	e.rcvListMu.Lock()
 	if e.rcvAutoParams.disabled {
 		e.rcvListMu.Unlock()
-		e.mu.RUnlock()
 		return
 	}
 	now := time.Now()
 	if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt {
 		e.rcvAutoParams.copied += copied
 		e.rcvListMu.Unlock()
-		e.mu.RUnlock()
 		return
 	}
 	prevRTTCopied := e.rcvAutoParams.copied + copied
@@ -1021,7 +1117,6 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 	e.rcvAutoParams.measureTime = now
 	e.rcvAutoParams.copied = 0
 	e.rcvListMu.Unlock()
-	e.mu.RUnlock()
 }
 
 // IPTables implements tcpip.Endpoint.IPTables.
@@ -1031,7 +1126,7 @@ func (e *endpoint) IPTables() (iptables.IPTables, error) {
 
 // Read reads data from the endpoint.
 func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
-	e.mu.RLock()
+	e.LockUser()
 	// The endpoint can be read if it's connected, or if it's already closed
 	// but has some pending unread data. Also note that a RST being received
 	// would cause the state to become StateError so we should allow the
@@ -1041,7 +1136,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
 		e.rcvListMu.Unlock()
 		he := e.HardError
-		e.mu.RUnlock()
+		e.UnlockUser()
 		if s == StateError {
 			return buffer.View{}, tcpip.ControlMessages{}, he
 		}
@@ -1051,7 +1146,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 
 	v, err := e.readLocked()
 	e.rcvListMu.Unlock()
-	e.mu.RUnlock()
+	e.UnlockUser()
 
 	if err == tcpip.ErrClosedForReceive {
 		e.stats.ReadErrors.ReadClosed.Increment()
@@ -1124,13 +1219,13 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
 	// and opts.EndOfRecord are also ignored.
 
-	e.mu.RLock()
+	e.LockUser()
 	e.sndBufMu.Lock()
 
 	avail, err := e.isEndpointWritableLocked()
 	if err != nil {
 		e.sndBufMu.Unlock()
-		e.mu.RUnlock()
+		e.UnlockUser()
 		e.stats.WriteErrors.WriteClosed.Increment()
 		return 0, nil, err
 	}
@@ -1142,113 +1237,68 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 	// are copying data in.
 	if !opts.Atomic {
 		e.sndBufMu.Unlock()
-		e.mu.RUnlock()
+		e.UnlockUser()
 	}
 
 	// Fetch data.
 	v, perr := p.Payload(avail)
 	if perr != nil || len(v) == 0 {
-		if opts.Atomic { // See above.
+		// Note that perr may be nil if len(v) == 0.
+		if opts.Atomic {
 			e.sndBufMu.Unlock()
-			e.mu.RUnlock()
+			e.UnlockUser()
 		}
-		// Note that perr may be nil if len(v) == 0.
 		return 0, nil, perr
 	}
 
-	if opts.Atomic {
+	queueAndSend := func() (int64, <-chan struct{}, *tcpip.Error) {
 		// Add data to the send queue.
 		s := newSegmentFromView(&e.route, e.ID, v)
 		e.sndBufUsed += len(v)
 		e.sndBufInQueue += seqnum.Size(len(v))
 		e.sndQueue.PushBack(s)
 		e.sndBufMu.Unlock()
-		// Release the endpoint lock to prevent deadlocks due to lock
-		// order inversion when acquiring workMu.
-		e.mu.RUnlock()
-	}
 
-	if e.workMu.TryLock() {
-		// Since we released locks in between it's possible that the
-		// endpoint transitioned to a CLOSED/ERROR states so make
-		// sure endpoint is still writable before trying to write.
-		if !opts.Atomic { // See above.
-			e.mu.RLock()
-			e.sndBufMu.Lock()
-
-			// Because we released the lock before copying, check state again
-			// to make sure the endpoint is still in a valid state for a write.
-			avail, err = e.isEndpointWritableLocked()
-			if err != nil {
-				e.sndBufMu.Unlock()
-				e.mu.RUnlock()
-				e.stats.WriteErrors.WriteClosed.Increment()
-				return 0, nil, err
-			}
-
-			// Discard any excess data copied in due to avail being reduced due
-			// to a simultaneous write call to the socket.
-			if avail < len(v) {
-				v = v[:avail]
-			}
-			// Add data to the send queue.
-			s := newSegmentFromView(&e.route, e.ID, v)
-			e.sndBufUsed += len(v)
-			e.sndBufInQueue += seqnum.Size(len(v))
-			e.sndQueue.PushBack(s)
-			e.sndBufMu.Unlock()
-			// Release the endpoint lock to prevent deadlocks due to lock
-			// order inversion when acquiring workMu.
-			e.mu.RUnlock()
-
-		}
 		// Do the work inline.
 		e.handleWrite()
-		e.workMu.Unlock()
-	} else {
-		if !opts.Atomic { // See above.
-			e.mu.RLock()
-			e.sndBufMu.Lock()
+		e.UnlockUser()
+		return int64(len(v)), nil, nil
+	}
 
-			// Because we released the lock before copying, check state again
-			// to make sure the endpoint is still in a valid state for a write.
-			avail, err = e.isEndpointWritableLocked()
-			if err != nil {
-				e.sndBufMu.Unlock()
-				e.mu.RUnlock()
-				e.stats.WriteErrors.WriteClosed.Increment()
-				return 0, nil, err
-			}
+	if opts.Atomic {
+		// Locks released in queueAndSend()
+		return queueAndSend()
+	}
 
-			// Discard any excess data copied in due to avail being reduced due
-			// to a simultaneous write call to the socket.
-			if avail < len(v) {
-				v = v[:avail]
-			}
-			// Add data to the send queue.
-			s := newSegmentFromView(&e.route, e.ID, v)
-			e.sndBufUsed += len(v)
-			e.sndBufInQueue += seqnum.Size(len(v))
-			e.sndQueue.PushBack(s)
-			e.sndBufMu.Unlock()
-			// Release the endpoint lock to prevent deadlocks due to lock
-			// order inversion when acquiring workMu.
-			e.mu.RUnlock()
+	// Since we released locks in between it's possible that the
+	// endpoint transitioned to a CLOSED/ERROR states so make
+	// sure endpoint is still writable before trying to write.
+	e.LockUser()
+	e.sndBufMu.Lock()
+	avail, err = e.isEndpointWritableLocked()
+	if err != nil {
+		e.sndBufMu.Unlock()
+		e.UnlockUser()
+		e.stats.WriteErrors.WriteClosed.Increment()
+		return 0, nil, err
+	}
 
-		}
-		// Let the protocol goroutine do the work.
-		e.sndWaker.Assert()
+	// Discard any excess data copied in due to avail being reduced due
+	// to a simultaneous write call to the socket.
+	if avail < len(v) {
+		v = v[:avail]
 	}
 
-	return int64(len(v)), nil, nil
+	// Locks released in queueAndSend()
+	return queueAndSend()
 }
 
 // Peek reads data without consuming it from the endpoint.
 //
 // This method does not block if there is no data pending.
 func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	// The endpoint can be read if it's connected, or if it's already closed
 	// but has some pending unread data.
@@ -1339,6 +1389,9 @@ func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed boo
 
 // SetSockOptBool sets a socket option.
 func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	e.LockUser()
+	defer e.UnlockUser()
+
 	switch opt {
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
@@ -1346,9 +1399,6 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 			return tcpip.ErrInvalidEndpointState
 		}
 
-		e.mu.Lock()
-		defer e.mu.Unlock()
-
 		// We only allow this to be set when we're in the initial state.
 		if e.EndpointState() != StateInitial {
 			return tcpip.ErrInvalidEndpointState
@@ -1379,7 +1429,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 
 		mask := uint32(notifyReceiveWindowChanged)
 
-		e.mu.RLock()
+		e.LockUser()
 		e.rcvListMu.Lock()
 
 		// Make sure the receive buffer size allows us to send a
@@ -1409,8 +1459,9 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
 			mask |= notifyNonZeroReceiveWindow
 		}
+
 		e.rcvListMu.Unlock()
-		e.mu.RUnlock()
+		e.UnlockUser()
 		e.notifyProtocolGoroutine(mask)
 		return nil
 
@@ -1466,15 +1517,15 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case tcpip.ReuseAddressOption:
-		e.mu.Lock()
+		e.LockUser()
 		e.reuseAddr = v != 0
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.ReusePortOption:
-		e.mu.Lock()
+		e.LockUser()
 		e.reusePort = v != 0
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.BindToDeviceOption:
@@ -1482,9 +1533,9 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		if id != 0 && !e.stack.HasNIC(id) {
 			return tcpip.ErrUnknownDevice
 		}
-		e.mu.Lock()
+		e.LockUser()
 		e.bindToDevice = id
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.QuickAckOption:
@@ -1500,16 +1551,16 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
 			return tcpip.ErrInvalidOptionValue
 		}
-		e.mu.Lock()
+		e.LockUser()
 		e.userMSS = uint16(userMSS)
-		e.mu.Unlock()
+		e.UnlockUser()
 		e.notifyProtocolGoroutine(notifyMSSChanged)
 		return nil
 
 	case tcpip.TTLOption:
-		e.mu.Lock()
+		e.LockUser()
 		e.ttl = uint8(v)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.KeepaliveEnabledOption:
@@ -1541,15 +1592,15 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case tcpip.TCPUserTimeoutOption:
-		e.mu.Lock()
+		e.LockUser()
 		e.userTimeout = time.Duration(v)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.BroadcastOption:
-		e.mu.Lock()
+		e.LockUser()
 		e.broadcast = v != 0
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.CongestionControlOption:
@@ -1563,22 +1614,16 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		availCC := strings.Split(string(avail), " ")
 		for _, cc := range availCC {
 			if v == tcpip.CongestionControlOption(cc) {
-				// Acquire the work mutex as we may need to
-				// reinitialize the congestion control state.
-				e.mu.Lock()
+				e.LockUser()
 				state := e.EndpointState()
 				e.cc = v
-				e.mu.Unlock()
 				switch state {
 				case StateEstablished:
-					e.workMu.Lock()
-					e.mu.Lock()
 					if e.EndpointState() == state {
 						e.snd.cc = e.snd.initCongestionControl(e.cc)
 					}
-					e.mu.Unlock()
-					e.workMu.Unlock()
 				}
+				e.UnlockUser()
 				return nil
 			}
 		}
@@ -1588,23 +1633,23 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		return tcpip.ErrNoSuchFile
 
 	case tcpip.IPv4TOSOption:
-		e.mu.Lock()
+		e.LockUser()
 		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
 		// ignore the bits for now.
 		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.IPv6TrafficClassOption:
-		e.mu.Lock()
+		e.LockUser()
 		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
 		// ignore the bits for now.
 		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.TCPLingerTimeoutOption:
-		e.mu.Lock()
+		e.LockUser()
 		if v < 0 {
 			// Same as effectively disabling TCPLinger timeout.
 			v = 0
@@ -1622,16 +1667,16 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 			v = stkTCPLingerTimeout
 		}
 		e.tcpLingerTimeout = time.Duration(v)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.TCPDeferAcceptOption:
-		e.mu.Lock()
+		e.LockUser()
 		if time.Duration(v) > MaxRTO {
 			v = tcpip.TCPDeferAcceptOption(MaxRTO)
 		}
 		e.deferAccept = time.Duration(v)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	default:
@@ -1641,8 +1686,8 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 
 // readyReceiveSize returns the number of bytes ready to be received.
 func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	// The endpoint cannot be in listen state.
 	if e.EndpointState() == StateListen {
@@ -1664,9 +1709,9 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 			return false, tcpip.ErrUnknownProtocolOption
 		}
 
-		e.mu.Lock()
+		e.LockUser()
 		v := e.v6only
-		e.mu.Unlock()
+		e.UnlockUser()
 
 		return v, nil
 	}
@@ -1730,9 +1775,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.ReuseAddressOption:
-		e.mu.RLock()
+		e.LockUser()
 		v := e.reuseAddr
-		e.mu.RUnlock()
+		e.UnlockUser()
 
 		*o = 0
 		if v {
@@ -1741,9 +1786,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.ReusePortOption:
-		e.mu.RLock()
+		e.LockUser()
 		v := e.reusePort
-		e.mu.RUnlock()
+		e.UnlockUser()
 
 		*o = 0
 		if v {
@@ -1752,9 +1797,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.BindToDeviceOption:
-		e.mu.RLock()
+		e.LockUser()
 		*o = tcpip.BindToDeviceOption(e.bindToDevice)
-		e.mu.RUnlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.QuickAckOption:
@@ -1765,16 +1810,16 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.TTLOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = tcpip.TTLOption(e.ttl)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.TCPInfoOption:
 		*o = tcpip.TCPInfoOption{}
-		e.mu.RLock()
+		e.LockUser()
 		snd := e.snd
-		e.mu.RUnlock()
+		e.UnlockUser()
 		if snd != nil {
 			snd.rtt.Lock()
 			o.RTT = snd.rtt.srtt
@@ -1813,9 +1858,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.TCPUserTimeoutOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.OutOfBandInlineOption:
@@ -1824,9 +1869,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.BroadcastOption:
-		e.mu.Lock()
+		e.LockUser()
 		v := e.broadcast
-		e.mu.Unlock()
+		e.UnlockUser()
 
 		*o = 0
 		if v {
@@ -1835,33 +1880,33 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.CongestionControlOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = e.cc
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.IPv4TOSOption:
-		e.mu.RLock()
+		e.LockUser()
 		*o = tcpip.IPv4TOSOption(e.sendTOS)
-		e.mu.RUnlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.IPv6TrafficClassOption:
-		e.mu.RLock()
+		e.LockUser()
 		*o = tcpip.IPv6TrafficClassOption(e.sendTOS)
-		e.mu.RUnlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.TCPLingerTimeoutOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.TCPDeferAcceptOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	default:
@@ -1901,8 +1946,8 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 // yet accepted by the app, they are restored without running the main goroutine
 // here.
 func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tcpip.Error {
-	e.mu.Lock()
-	defer e.mu.Unlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	connectingAddr := addr.Addr
 
@@ -2071,9 +2116,13 @@ func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error {
 // Shutdown closes the read and/or write end of the endpoint connection to its
 // peer.
 func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
-	e.mu.Lock()
+	e.LockUser()
+	defer e.UnlockUser()
+	return e.shutdownLocked(flags)
+}
+
+func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
 	e.shutdownFlags |= flags
-	finQueued := false
 	switch {
 	case e.EndpointState().connected():
 		// Close for read.
@@ -2087,24 +2136,9 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 			// If we're fully closed and we have unread data we need to abort
 			// the connection with a RST.
 			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
-				e.mu.Unlock()
-				// Try to send an active reset immediately if the
-				// work mutex is available.
-				if e.workMu.TryLock() {
-					e.mu.Lock()
-					// We need to double check here to make
-					// sure worker has not transitioned the
-					// endpoint out of a connected state
-					// before trying to send a reset.
-					if e.EndpointState().connected() {
-						e.resetConnectionLocked(tcpip.ErrConnectionAborted)
-						e.notifyProtocolGoroutine(notifyTickleWorker)
-					}
-					e.mu.Unlock()
-					e.workMu.Unlock()
-				} else {
-					e.notifyProtocolGoroutine(notifyReset)
-				}
+				e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+				// Wake up worker to terminate loop.
+				e.notifyProtocolGoroutine(notifyTickleWorker)
 				return nil
 			}
 		}
@@ -2116,42 +2150,32 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 				// Already closed.
 				e.sndBufMu.Unlock()
 				if e.EndpointState() == StateTimeWait {
-					e.mu.Unlock()
 					return tcpip.ErrNotConnected
 				}
-				break
+				return nil
 			}
 
 			// Queue fin segment.
 			s := newSegmentFromView(&e.route, e.ID, nil)
 			e.sndQueue.PushBack(s)
 			e.sndBufInQueue++
-			finQueued = true
 			// Mark endpoint as closed.
 			e.sndClosed = true
 			e.sndBufMu.Unlock()
+			e.handleClose()
 		}
 
+		return nil
 	case e.EndpointState() == StateListen:
 		// Tell protocolListenLoop to stop.
 		if flags&tcpip.ShutdownRead != 0 {
 			e.notifyProtocolGoroutine(notifyClose)
 		}
+		return nil
+
 	default:
-		e.mu.Unlock()
 		return tcpip.ErrNotConnected
 	}
-	e.mu.Unlock()
-	if finQueued {
-		if e.workMu.TryLock() {
-			e.handleClose()
-			e.workMu.Unlock()
-		} else {
-			// Tell protocol goroutine to close.
-			e.sndCloseWaker.Assert()
-		}
-	}
-	return nil
 }
 
 // Listen puts the endpoint in "listen" mode, which allows it to accept
@@ -2166,8 +2190,8 @@ func (e *endpoint) Listen(backlog int) *tcpip.Error {
 }
 
 func (e *endpoint) listen(backlog int) *tcpip.Error {
-	e.mu.Lock()
-	defer e.mu.Unlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	// Allow the backlog to be adjusted if the endpoint is not shutting down.
 	// When the endpoint shuts down, it sets workerCleanup to true, and from
@@ -2229,7 +2253,6 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 // startAcceptedLoop sets up required state and starts a goroutine with the
 // main loop for accepted connections.
 func (e *endpoint) startAcceptedLoop() {
-	e.mu.Lock()
 	e.workerRunning = true
 	e.mu.Unlock()
 	wakerInitDone := make(chan struct{})
@@ -2240,8 +2263,8 @@ func (e *endpoint) startAcceptedLoop() {
 // Accept returns a new endpoint if a peer has established a connection
 // to an endpoint previously set to listen mode.
 func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	// Endpoint must be in listen state before it can accept connections.
 	if e.EndpointState() != StateListen {
@@ -2260,8 +2283,8 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 
 // Bind binds the endpoint to a specific local port and optionally address.
 func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
-	e.mu.Lock()
-	defer e.mu.Unlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	return e.bindLocked(addr)
 }
@@ -2339,8 +2362,8 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 
 // GetLocalAddress returns the address to which the endpoint is bound.
 func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	return tcpip.FullAddress{
 		Addr: e.ID.LocalAddress,
@@ -2351,8 +2374,8 @@ func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
 
 // GetRemoteAddress returns the address to which the endpoint is connected.
 func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	if !e.EndpointState().connected() {
 		return tcpip.FullAddress{}, tcpip.ErrNotConnected
@@ -2419,7 +2442,6 @@ func (e *endpoint) updateSndBufferUsage(v int) {
 // to be read, or when the connection is closed for receiving (in which case
 // s will be nil).
 func (e *endpoint) readyToRead(s *segment) {
-	e.mu.RLock()
 	e.rcvListMu.Lock()
 	if s != nil {
 		s.incRef()
@@ -2434,7 +2456,6 @@ func (e *endpoint) readyToRead(s *segment) {
 		e.rcvClosed = true
 	}
 	e.rcvListMu.Unlock()
-	e.mu.RUnlock()
 	e.waiterQueue.Notify(waiter.EventIn)
 }
 
@@ -2578,9 +2599,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
 	s.SegTime = time.Now()
 
 	// Copy EndpointID.
-	e.mu.Lock()
 	s.ID = stack.TCPEndpointID(e.ID)
-	e.mu.Unlock()
 
 	// Copy endpoint rcv state.
 	e.rcvListMu.Lock()
@@ -2710,10 +2729,10 @@ func (e *endpoint) State() uint32 {
 
 // Info returns a copy of the endpoint info.
 func (e *endpoint) Info() tcpip.EndpointInfo {
-	e.mu.RLock()
+	e.LockUser()
 	// Make a copy of the endpoint info.
 	ret := e.EndpointInfo
-	e.mu.RUnlock()
+	e.UnlockUser()
 	return &ret
 }
 
@@ -2728,9 +2747,9 @@ func (e *endpoint) Wait() {
 	e.waiterQueue.EventRegister(&waitEntry, waiter.EventHUp)
 	defer e.waiterQueue.EventUnregister(&waitEntry)
 	for {
-		e.mu.Lock()
+		e.LockUser()
 		running := e.workerRunning
-		e.mu.Unlock()
+		e.UnlockUser()
 		if !running {
 			break
 		}
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 4a46f0ec5..9175de441 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -162,8 +162,8 @@ func (e *endpoint) loadState(state EndpointState) {
 		connectingLoading.Add(1)
 	}
 	// Directly update the state here rather than using e.setEndpointState
-	// as the endpoint is still being loaded and the stack reference to increment
-	// metrics is not yet initialized.
+	// as the endpoint is still being loaded and the stack reference is not
+	// yet initialized.
 	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
 }
 
@@ -180,7 +180,6 @@ func (e *endpoint) afterLoad() {
 func (e *endpoint) Resume(s *stack.Stack) {
 	e.stack = s
 	e.segmentQueue.setLimit(MaxUnprocessedSegments)
-	e.workMu.Init()
 	state := e.origEndpointState
 	switch state {
 	case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 73098d904..b0f918bb4 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -95,7 +95,7 @@ const (
 )
 
 type protocol struct {
-	mu                         sync.Mutex
+	mu                         sync.RWMutex
 	sackEnabled                bool
 	delayEnabled               bool
 	sendBufferSize             SendBufferSizeOption
@@ -273,57 +273,57 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 func (p *protocol) Option(option interface{}) *tcpip.Error {
 	switch v := option.(type) {
 	case *SACKEnabled:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = SACKEnabled(p.sackEnabled)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *DelayEnabled:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = DelayEnabled(p.delayEnabled)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *SendBufferSizeOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = p.sendBufferSize
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *ReceiveBufferSizeOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = p.recvBufferSize
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.CongestionControlOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.CongestionControlOption(p.congestionControl)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.AvailableCongestionControlOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.AvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.ModerateReceiveBufferOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.ModerateReceiveBufferOption(p.moderateReceiveBuffer)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.TCPLingerTimeoutOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.TCPLingerTimeoutOption(p.tcpLingerTimeout)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.TCPTimeWaitTimeoutOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.TCPTimeWaitTimeoutOption(p.tcpTimeWaitTimeout)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	default:
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index d80aff1b6..caf8977b3 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -168,7 +168,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 
 		// We just received a FIN, our next state depends on whether we sent a
 		// FIN already or not.
-		r.ep.mu.Lock()
 		switch r.ep.EndpointState() {
 		case StateEstablished:
 			r.ep.setEndpointState(StateCloseWait)
@@ -183,7 +182,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		case StateFinWait2:
 			r.ep.setEndpointState(StateTimeWait)
 		}
-		r.ep.mu.Unlock()
 
 		// Flush out any pending segments, except the very first one if
 		// it happens to be the one we're handling now because the
@@ -208,7 +206,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 	// Handle ACK (not FIN-ACK, which we handled above) during one of the
 	// shutdown states.
 	if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == r.ep.snd.sndNxt {
-		r.ep.mu.Lock()
 		switch r.ep.EndpointState() {
 		case StateFinWait1:
 			r.ep.setEndpointState(StateFinWait2)
@@ -222,7 +219,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		case StateLastAck:
 			r.ep.transitionToStateCloseLocked()
 		}
-		r.ep.mu.Unlock()
 	}
 
 	return true
@@ -336,10 +332,8 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 // handleRcvdSegment handles TCP segments directed at the connection managed by
 // r as they arrive. It is called by the protocol main loop.
 func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
-	r.ep.mu.RLock()
 	state := r.ep.EndpointState()
 	closed := r.ep.closed
-	r.ep.mu.RUnlock()
 
 	if state != StateEstablished {
 		drop, err := r.handleRcvdSegmentClosing(s, state, closed)
diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go
index bd20a7ee9..48a257137 100644
--- a/pkg/tcpip/transport/tcp/segment_queue.go
+++ b/pkg/tcpip/transport/tcp/segment_queue.go
@@ -28,10 +28,16 @@ type segmentQueue struct {
 	used  int
 }
 
+// emptyLocked determines if the queue is empty.
+// Preconditions: q.mu must be held.
+func (q *segmentQueue) emptyLocked() bool {
+	return q.used == 0
+}
+
 // empty determines if the queue is empty.
 func (q *segmentQueue) empty() bool {
 	q.mu.Lock()
-	r := q.used == 0
+	r := q.emptyLocked()
 	q.mu.Unlock()
 
 	return r
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 657c3146e..17fed4ec5 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -455,9 +455,7 @@ func (s *sender) retransmitTimerExpired() bool {
 	// Give up if we've waited more than a minute since the last resend or
 	// if a user time out is set and we have exceeded the user specified
 	// timeout since the first retransmission.
-	s.ep.mu.RLock()
 	uto := s.ep.userTimeout
-	s.ep.mu.RUnlock()
 
 	if s.firstRetransmittedSegXmitTime.IsZero() {
 		// We store the original xmitTime of the segment that we are
@@ -713,7 +711,6 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 		default:
 			s.ep.setEndpointState(StateFinWait1)
 		}
-
 	} else {
 		// We're sending a non-FIN segment.
 		if seg.flags&header.TCPFlagFin != 0 {
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 5b2b16afa..39d36d2ba 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -2236,9 +2236,17 @@ func TestSegmentMerging(t *testing.T) {
 
 			c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-			// Prevent the endpoint from processing packets.
-			test.stop(c.EP)
+			// Send 10 1 byte segments to fill up InitialWindow but don't
+			// ACK. That should prevent anymore packets from going out.
+			for i := 0; i < 10; i++ {
+				view := buffer.NewViewFromBytes([]byte{0})
+				if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+					t.Fatalf("Write #%d failed: %v", i+1, err)
+				}
+			}
 
+			// Now send the segments that should get merged as the congestion
+			// window is full and we won't be able to send any more packets.
 			var allData []byte
 			for i, data := range [][]byte{{1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} {
 				allData = append(allData, data...)
@@ -2248,8 +2256,29 @@ func TestSegmentMerging(t *testing.T) {
 				}
 			}
 
-			// Let the endpoint process the segments that we just sent.
-			test.resume(c.EP)
+			// Check that we get 10 packets of 1 byte each.
+			for i := 0; i < 10; i++ {
+				b := c.GetPacket()
+				checker.IPv4(t, b,
+					checker.PayloadLen(header.TCPMinimumSize+1),
+					checker.TCP(
+						checker.DstPort(context.TestPort),
+						checker.SeqNum(uint32(c.IRS)+uint32(i)+1),
+						checker.AckNum(790),
+						checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+					),
+				)
+			}
+
+			// Acknowledge the data.
+			c.SendPacket(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: c.Port,
+				Flags:   header.TCPFlagAck,
+				SeqNum:  790,
+				AckNum:  c.IRS.Add(1 + 10), // 10 for the 10 bytes of payload.
+				RcvWnd:  30000,
+			})
 
 			// Check that data is received.
 			b := c.GetPacket()
@@ -2257,7 +2286,7 @@ func TestSegmentMerging(t *testing.T) {
 				checker.PayloadLen(len(allData)+header.TCPMinimumSize),
 				checker.TCP(
 					checker.DstPort(context.TestPort),
-					checker.SeqNum(uint32(c.IRS)+1),
+					checker.SeqNum(uint32(c.IRS)+11),
 					checker.AckNum(790),
 					checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 				),
@@ -2273,7 +2302,7 @@ func TestSegmentMerging(t *testing.T) {
 				DstPort: c.Port,
 				Flags:   header.TCPFlagAck,
 				SeqNum:  790,
-				AckNum:  c.IRS.Add(1 + seqnum.Size(len(allData))),
+				AckNum:  c.IRS.Add(11 + seqnum.Size(len(allData))),
 				RcvWnd:  30000,
 			})
 		})
-- 
cgit v1.2.3


From fd27a917ef068a4e17cddbe3f671b59f52e6e030 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 19 Mar 2020 09:41:56 -0700
Subject: Address comments on workMu removal change.

Updates #231, #357

PiperOrigin-RevId: 301833669
---
 pkg/tcpip/transport/tcp/accept.go   |  2 +-
 pkg/tcpip/transport/tcp/tcp_test.go | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 4d7602d54..3f80995f3 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -420,7 +420,7 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 }
 
 func (e *endpoint) incSynRcvdCount() bool {
-	if e.synRcvdCount >= (cap(e.acceptedChan)) {
+	if e.synRcvdCount >= cap(e.acceptedChan) {
 		return false
 	}
 	e.synRcvdCount++
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 39d36d2ba..ce3df7478 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -2236,12 +2236,13 @@ func TestSegmentMerging(t *testing.T) {
 
 			c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-			// Send 10 1 byte segments to fill up InitialWindow but don't
-			// ACK. That should prevent anymore packets from going out.
-			for i := 0; i < 10; i++ {
+			// Send tcp.InitialCwnd number of segments to fill up
+			// InitialWindow but don't ACK. That should prevent
+			// anymore packets from going out.
+			for i := 0; i < tcp.InitialCwnd; i++ {
 				view := buffer.NewViewFromBytes([]byte{0})
 				if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
-					t.Fatalf("Write #%d failed: %v", i+1, err)
+					t.Fatalf("Write #%d failed: %s", i+1, err)
 				}
 			}
 
@@ -2256,8 +2257,8 @@ func TestSegmentMerging(t *testing.T) {
 				}
 			}
 
-			// Check that we get 10 packets of 1 byte each.
-			for i := 0; i < 10; i++ {
+			// Check that we get tcp.InitialCwnd packets.
+			for i := 0; i < tcp.InitialCwnd; i++ {
 				b := c.GetPacket()
 				checker.IPv4(t, b,
 					checker.PayloadLen(header.TCPMinimumSize+1),
-- 
cgit v1.2.3


From 3a37f6791745a26d38b69fbe9d4090f8fd0c7827 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 19 Mar 2020 09:59:21 -0700
Subject: Change SocketOperations.readMu to an RWMutex.

Also get rid of the readViewHasData as it's not required anymore.

Updates #231, #357

PiperOrigin-RevId: 301837227
---
 pkg/sentry/socket/netstack/netstack.go | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index a2e1da02f..a6ef7a47e 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -29,7 +29,6 @@ import (
 	"io"
 	"math"
 	"reflect"
-	"sync/atomic"
 	"syscall"
 	"time"
 
@@ -265,14 +264,8 @@ type SocketOperations struct {
 	skType   linux.SockType
 	protocol int
 
-	// readViewHasData is 1 iff readView has data to be read, 0 otherwise.
-	// Must be accessed using atomic operations. It must only be written
-	// with readMu held but can be read without holding readMu. The latter
-	// is required to avoid deadlocks in epoll Readiness checks.
-	readViewHasData uint32
-
 	// readMu protects access to the below fields.
-	readMu sync.Mutex `state:"nosave"`
+	readMu sync.RWMutex `state:"nosave"`
 	// readView contains the remaining payload from the last packet.
 	readView buffer.View
 	// readCM holds control message information for the last packet read
@@ -428,13 +421,11 @@ func (s *SocketOperations) fetchReadView() *syserr.Error {
 
 	v, cms, err := s.Endpoint.Read(&s.sender)
 	if err != nil {
-		atomic.StoreUint32(&s.readViewHasData, 0)
 		return syserr.TranslateNetstackError(err)
 	}
 
 	s.readView = v
 	s.readCM = cms
-	atomic.StoreUint32(&s.readViewHasData, 1)
 
 	return nil
 }
@@ -633,9 +624,11 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
 	// Check our cached value iff the caller asked for readability and the
 	// endpoint itself is currently not readable.
 	if (mask & ^r & waiter.EventIn) != 0 {
-		if atomic.LoadUint32(&s.readViewHasData) == 1 {
+		s.readMu.RLock()
+		if len(s.readView) > 0 {
 			r |= waiter.EventIn
 		}
+		s.readMu.RUnlock()
 	}
 
 	return r
@@ -2342,9 +2335,6 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq
 		}
 		copied += n
 		s.readView.TrimFront(n)
-		if len(s.readView) == 0 {
-			atomic.StoreUint32(&s.readViewHasData, 0)
-		}
 
 		dst = dst.DropFirst(n)
 		if e != nil {
@@ -2468,10 +2458,6 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 		s.readView.TrimFront(int(n))
 	}
 
-	if len(s.readView) == 0 {
-		atomic.StoreUint32(&s.readViewHasData, 0)
-	}
-
 	var flags int
 	if msgLen > int(n) {
 		flags |= linux.MSG_TRUNC
-- 
cgit v1.2.3


From 8b461aa36b4ad1d9a05a5917446fe89356dbb70d Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Thu, 19 Mar 2020 11:33:25 -0700
Subject: Remove redundant dep in BUILD

PiperOrigin-RevId: 301859066
---
 pkg/tcpip/transport/tcp/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index a32f9eacf..2fdf6c0a5 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -71,7 +71,6 @@ go_library(
         "//pkg/tcpip/seqnum",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
-        "//pkg/tmutex",
         "//pkg/waiter",
         "@com_github_google_btree//:go_default_library",
     ],
-- 
cgit v1.2.3


From e0fbcdcb7f8a3e18946ccd5d5b98ea3adadee5ba Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Thu, 19 Mar 2020 12:32:15 -0700
Subject: Add packetimpact tests to presubmit and CI testing

PiperOrigin-RevId: 301872161
---
 kokoro/packetimpact_tests.cfg    |  9 +++++++++
 scripts/packetimpact_tests.sh    | 20 ++++++++++++++++++++
 test/packetimpact/tests/defs.bzl |  4 ++--
 3 files changed, 31 insertions(+), 2 deletions(-)
 create mode 100644 kokoro/packetimpact_tests.cfg
 create mode 100755 scripts/packetimpact_tests.sh

diff --git a/kokoro/packetimpact_tests.cfg b/kokoro/packetimpact_tests.cfg
new file mode 100644
index 000000000..db86b52d5
--- /dev/null
+++ b/kokoro/packetimpact_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/packetimpact_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/scripts/packetimpact_tests.sh b/scripts/packetimpact_tests.sh
new file mode 100755
index 000000000..027d11e64
--- /dev/null
+++ b/scripts/packetimpact_tests.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+install_runsc_for_test runsc-d
+test_runsc $(bazel query "attr(tags, packetimpact, tests(//test/packetimpact/...))")
diff --git a/test/packetimpact/tests/defs.bzl b/test/packetimpact/tests/defs.bzl
index 3baac567a..1b4213d9b 100644
--- a/test/packetimpact/tests/defs.bzl
+++ b/test/packetimpact/tests/defs.bzl
@@ -71,7 +71,7 @@ def packetimpact_linux_test(name, testbench_binary, **kwargs):
         name = name + "_linux_test",
         testbench_binary = testbench_binary,
         flags = ["--dut_platform", "linux"],
-        tags = PACKETIMPACT_TAGS,
+        tags = PACKETIMPACT_TAGS + ["packetimpact"],
         **kwargs
     )
 
@@ -89,7 +89,7 @@ def packetimpact_netstack_test(name, testbench_binary, **kwargs):
         # This is the default runtime unless
         # "--test_arg=--runtime=OTHER_RUNTIME" is used to override the value.
         flags = ["--dut_platform", "netstack", "--runtime=runsc-d"],
-        tags = PACKETIMPACT_TAGS,
+        tags = PACKETIMPACT_TAGS + ["packetimpact"],
         **kwargs
     )
 
-- 
cgit v1.2.3


From 238e80fe38670faed5418444ce9fb2eb84e6d5d7 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Thu, 19 Mar 2020 14:09:03 -0700
Subject: Automated rollback of changelist 300409401

PiperOrigin-RevId: 301891849
---
 kokoro/kythe/generate_xrefs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
index 2f531aa72..323b0f77b 100644
--- a/kokoro/kythe/generate_xrefs.sh
+++ b/kokoro/kythe/generate_xrefs.sh
@@ -23,7 +23,7 @@ bazel version
 
 python3 -V
 
-readonly KYTHE_VERSION='v0.0.43'
+readonly KYTHE_VERSION='v0.0.41'
 readonly WORKDIR="$(mktemp -d)"
 readonly KYTHE_DIR="${WORKDIR}/kythe-${KYTHE_VERSION}"
 if [[ -n "$KOKORO_GIT_COMMIT" ]]; then
-- 
cgit v1.2.3


From 57d9bd922b4eff922d1a5185529fe5446249d592 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Thu, 19 Mar 2020 15:28:52 -0700
Subject: Remove the "frozen" bit from dirents.

Frozen was to lock down changes to the host filesystem
for hostFS. Now that hostFS is gone, it can be removed.

PiperOrigin-RevId: 301907923
---
 pkg/sentry/fs/dirent.go            | 133 +------------------------------------
 pkg/sentry/fs/file_overlay_test.go |  84 -----------------------
 pkg/sentry/fs/mounts.go            |  13 ----
 3 files changed, 1 insertion(+), 229 deletions(-)

diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index e0b32e1c1..0266a5287 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -17,7 +17,6 @@ package fs
 import (
 	"fmt"
 	"path"
-	"sort"
 	"sync/atomic"
 	"syscall"
 
@@ -121,9 +120,6 @@ type Dirent struct {
 	// deleted may be set atomically when removed.
 	deleted int32
 
-	// frozen indicates this entry can't walk to unknown nodes.
-	frozen bool
-
 	// mounted is true if Dirent is a mount point, similar to include/linux/dcache.h:DCACHE_MOUNTED.
 	mounted bool
 
@@ -253,8 +249,7 @@ func (d *Dirent) IsNegative() bool {
 	return d.Inode == nil
 }
 
-// hashChild will hash child into the children list of its new parent d, carrying over
-// any "frozen" state from d.
+// hashChild will hash child into the children list of its new parent d.
 //
 // Returns (*WeakRef, true) if hashing child caused a Dirent to be unhashed. The caller must
 // validate the returned unhashed weak reference. Common cases:
@@ -282,9 +277,6 @@ func (d *Dirent) hashChild(child *Dirent) (*refs.WeakRef, bool) {
 		d.IncRef()
 	}
 
-	// Carry over parent's frozen state.
-	child.frozen = d.frozen
-
 	return d.hashChildParentSet(child)
 }
 
@@ -400,38 +392,6 @@ func (d *Dirent) MountRoot() *Dirent {
 	return mountRoot
 }
 
-// Freeze prevents this dirent from walking to more nodes. Freeze is applied
-// recursively to all children.
-//
-// If this particular Dirent represents a Virtual node, then Walks and Creates
-// may proceed as before.
-//
-// Freeze can only be called before the application starts running, otherwise
-// the root it might be out of sync with the application root if modified by
-// sys_chroot.
-func (d *Dirent) Freeze() {
-	d.mu.Lock()
-	defer d.mu.Unlock()
-	if d.frozen {
-		// Already frozen.
-		return
-	}
-	d.frozen = true
-
-	// Take a reference when freezing.
-	for _, w := range d.children {
-		if child := w.Get(); child != nil {
-			// NOTE: We would normally drop the reference here. But
-			// instead we're hanging on to it.
-			ch := child.(*Dirent)
-			ch.Freeze()
-		}
-	}
-
-	// Drop all expired weak references.
-	d.flush()
-}
-
 // descendantOf returns true if the receiver dirent is equal to, or a
 // descendant of, the argument dirent.
 //
@@ -524,11 +484,6 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl
 		w.Drop()
 	}
 
-	// Are we allowed to do the lookup?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return nil, syscall.ENOENT
-	}
-
 	// Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be
 	// expensive, if possible release the lock and re-acquire it.
 	if walkMayUnlock {
@@ -659,11 +614,6 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi
 		return nil, syscall.EEXIST
 	}
 
-	// Are we frozen?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return nil, syscall.ENOENT
-	}
-
 	// Try the create. We need to trust the file system to return EEXIST (or something
 	// that will translate to EEXIST) if name already exists.
 	file, err := d.Inode.Create(ctx, d, name, flags, perms)
@@ -727,11 +677,6 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c
 		return syscall.EEXIST
 	}
 
-	// Are we frozen?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-
 	// Remove any negative Dirent. We've already asserted above with d.exists
 	// that the only thing remaining here can be a negative Dirent.
 	if w, ok := d.children[name]; ok {
@@ -862,49 +807,6 @@ func (d *Dirent) GetDotAttrs(root *Dirent) (DentAttr, DentAttr) {
 	return dot, dot
 }
 
-// readdirFrozen returns readdir results based solely on the frozen children.
-func (d *Dirent) readdirFrozen(root *Dirent, offset int64, dirCtx *DirCtx) (int64, error) {
-	// Collect attrs for "." and  "..".
-	attrs := make(map[string]DentAttr)
-	names := []string{".", ".."}
-	attrs["."], attrs[".."] = d.GetDotAttrs(root)
-
-	// Get info from all children.
-	d.mu.Lock()
-	defer d.mu.Unlock()
-	for name, w := range d.children {
-		if child := w.Get(); child != nil {
-			defer child.DecRef()
-
-			// Skip negative children.
-			if child.(*Dirent).IsNegative() {
-				continue
-			}
-
-			sattr := child.(*Dirent).Inode.StableAttr
-			attrs[name] = DentAttr{
-				Type:    sattr.Type,
-				InodeID: sattr.InodeID,
-			}
-			names = append(names, name)
-		}
-	}
-
-	sort.Strings(names)
-
-	if int(offset) >= len(names) {
-		return offset, nil
-	}
-	names = names[int(offset):]
-	for _, name := range names {
-		if err := dirCtx.DirEmit(name, attrs[name]); err != nil {
-			return offset, err
-		}
-		offset++
-	}
-	return offset, nil
-}
-
 // DirIterator is an open directory containing directory entries that can be read.
 type DirIterator interface {
 	// IterateDir emits directory entries by calling dirCtx.EmitDir, beginning
@@ -964,10 +866,6 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent,
 		return offset, nil
 	}
 
-	if d.frozen {
-		return d.readdirFrozen(root, offset, dirCtx)
-	}
-
 	// Collect attrs for "." and "..".
 	dot, dotdot := d.GetDotAttrs(root)
 
@@ -1068,11 +966,6 @@ func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err
 		return nil, syserror.EINVAL
 	}
 
-	// Are we frozen?
-	if d.parent.frozen && !d.parent.Inode.IsVirtual() {
-		return nil, syserror.ENOENT
-	}
-
 	// Dirent that'll replace d.
 	//
 	// Note that NewDirent returns with one reference taken; the reference
@@ -1101,11 +994,6 @@ func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error {
 		return syserror.ENOENT
 	}
 
-	// Are we frozen?
-	if d.parent.frozen && !d.parent.Inode.IsVirtual() {
-		return syserror.ENOENT
-	}
-
 	// Remount our former child in its place.
 	//
 	// As replacement used to be our child, it must already have the right
@@ -1135,11 +1023,6 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath
 	unlock := d.lockDirectory()
 	defer unlock()
 
-	// Are we frozen?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-
 	// Try to walk to the node.
 	child, err := d.walk(ctx, root, name, false /* may unlock */)
 	if err != nil {
@@ -1201,11 +1084,6 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string)
 	unlock := d.lockDirectory()
 	defer unlock()
 
-	// Are we frozen?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-
 	// Check for dots.
 	if name == "." {
 		// Rejected as the last component by rmdir(2).
@@ -1519,15 +1397,6 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
 		return err
 	}
 
-	// Are we frozen?
-	// TODO(jamieliu): Is this the right errno?
-	if oldParent.frozen && !oldParent.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-	if newParent.frozen && !newParent.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-
 	// Do we have general permission to remove from oldParent and
 	// create/replace in newParent?
 	if err := oldParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go
index a76d87e3a..1971cc680 100644
--- a/pkg/sentry/fs/file_overlay_test.go
+++ b/pkg/sentry/fs/file_overlay_test.go
@@ -175,90 +175,6 @@ func TestReaddirRevalidation(t *testing.T) {
 	}
 }
 
-// TestReaddirOverlayFrozen tests that calling Readdir on an overlay file with
-// a frozen dirent tree does not make Readdir calls to the underlying files.
-// This is a regression test for b/114808269.
-func TestReaddirOverlayFrozen(t *testing.T) {
-	ctx := contexttest.Context(t)
-
-	// Create an overlay with two directories, each with two files.
-	upper := newTestRamfsDir(ctx, []dirContent{{name: "upper-file1"}, {name: "upper-file2"}}, nil)
-	lower := newTestRamfsDir(ctx, []dirContent{{name: "lower-file1"}, {name: "lower-file2"}}, nil)
-	overlayInode := fs.NewTestOverlayDir(ctx, upper, lower, false)
-
-	// Set that overlay as the root.
-	root := fs.NewDirent(ctx, overlayInode, "root")
-	ctx = &rootContext{
-		Context: ctx,
-		root:    root,
-	}
-
-	// Check that calling Readdir on the root now returns all 4 files (2
-	// from each layer in the overlay).
-	rootFile, err := root.Inode.GetFile(ctx, root, fs.FileFlags{Read: true})
-	if err != nil {
-		t.Fatalf("root.Inode.GetFile failed: %v", err)
-	}
-	defer rootFile.DecRef()
-	ser := &fs.CollectEntriesSerializer{}
-	if err := rootFile.Readdir(ctx, ser); err != nil {
-		t.Fatalf("rootFile.Readdir failed: %v", err)
-	}
-	if got, want := ser.Order, []string{".", "..", "lower-file1", "lower-file2", "upper-file1", "upper-file2"}; !reflect.DeepEqual(got, want) {
-		t.Errorf("Readdir got names %v, want %v", got, want)
-	}
-
-	// Readdir should have been called on upper and lower.
-	upperDir := upper.InodeOperations.(*dir)
-	lowerDir := lower.InodeOperations.(*dir)
-	if !upperDir.ReaddirCalled {
-		t.Errorf("upperDir.ReaddirCalled got %v, want true", upperDir.ReaddirCalled)
-	}
-	if !lowerDir.ReaddirCalled {
-		t.Errorf("lowerDir.ReaddirCalled got %v, want true", lowerDir.ReaddirCalled)
-	}
-
-	// Reset.
-	upperDir.ReaddirCalled = false
-	lowerDir.ReaddirCalled = false
-
-	// Take references on "upper-file1" and "lower-file1", pinning them in
-	// the dirent tree.
-	for _, name := range []string{"upper-file1", "lower-file1"} {
-		if _, err := root.Walk(ctx, root, name); err != nil {
-			t.Fatalf("root.Walk(%q) failed: %v", name, err)
-		}
-		// Don't drop a reference on the returned dirent so that it
-		// will stay in the tree.
-	}
-
-	// Freeze the dirent tree.
-	root.Freeze()
-
-	// Seek back to the beginning of the file.
-	if _, err := rootFile.Seek(ctx, fs.SeekSet, 0); err != nil {
-		t.Fatalf("error seeking to beginning of directory: %v", err)
-	}
-
-	// Calling Readdir on the root now will return only the pinned
-	// children.
-	ser = &fs.CollectEntriesSerializer{}
-	if err := rootFile.Readdir(ctx, ser); err != nil {
-		t.Fatalf("rootFile.Readdir failed: %v", err)
-	}
-	if got, want := ser.Order, []string{".", "..", "lower-file1", "upper-file1"}; !reflect.DeepEqual(got, want) {
-		t.Errorf("Readdir got names %v, want %v", got, want)
-	}
-
-	// Readdir should NOT have been called on upper or lower.
-	if upperDir.ReaddirCalled {
-		t.Errorf("upperDir.ReaddirCalled got %v, want false", upperDir.ReaddirCalled)
-	}
-	if lowerDir.ReaddirCalled {
-		t.Errorf("lowerDir.ReaddirCalled got %v, want false", lowerDir.ReaddirCalled)
-	}
-}
-
 type rootContext struct {
 	context.Context
 	root *fs.Dirent
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index c7981f66e..b414ddaee 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -273,19 +273,6 @@ func (mns *MountNamespace) DecRef() {
 	mns.DecRefWithDestructor(mns.destroy)
 }
 
-// Freeze freezes the entire mount tree.
-func (mns *MountNamespace) Freeze() {
-	mns.mu.Lock()
-	defer mns.mu.Unlock()
-
-	// We only want to freeze Dirents with active references, not Dirents referenced
-	// by a mount's MountSource.
-	mns.flushMountSourceRefsLocked()
-
-	// Freeze the entire shebang.
-	mns.root.Freeze()
-}
-
 // withMountLocked prevents further walks to `node`, because `node` is about to
 // be a mount point.
 func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error {
-- 
cgit v1.2.3


From b9210b285566acd72d0820b42c1a330ba56a1ad0 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Thu, 19 Mar 2020 16:05:52 -0700
Subject: Fix bm-tools to run on bazel.

Fixes random stuff that is broken on bazel/kokoro.
- random output coming back as "bytes" object instead of str
- missed syntax error in bazel
- a flag is missing in the version of gcloud on kokoro

PiperOrigin-RevId: 301915289
---
 benchmarks/harness/machine_producers/gcloud_producer.py | 8 +++++---
 tools/images/BUILD                                      | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/benchmarks/harness/machine_producers/gcloud_producer.py b/benchmarks/harness/machine_producers/gcloud_producer.py
index 513d16e4f..1a624df2e 100644
--- a/benchmarks/harness/machine_producers/gcloud_producer.py
+++ b/benchmarks/harness/machine_producers/gcloud_producer.py
@@ -168,7 +168,9 @@ class GCloudProducer(machine_producer.MachineProducer):
     cmd.append("--zone=" + self.zone)
     cmd.append("--machine-type=" + self.machine_type)
     res = self._run_command(cmd)
-    return json.loads(res.stdout)
+    data = res.stdout
+    data = str(data, "utf-8") if isinstance(data, (bytes, bytearray)) else data
+    return json.loads(data)
 
   def _add_ssh_key_to_instances(self, names: List[str]) -> None:
     """Adds ssh key to instances by calling gcloud ssh command.
@@ -186,11 +188,11 @@ class GCloudProducer(machine_producer.MachineProducer):
       TimeoutError: when 3 unsuccessful tries to ssh into the host return 255.
     """
     for name in names:
-      cmd = "gcloud compute ssh {name}".format(name=name).split(" ")
+      cmd = "gcloud compute ssh {user}@{name}".format(
+          user=self.ssh_user, name=name).split(" ")
       cmd.append("--ssh-key-file={key}".format(key=self.ssh_key_file))
       cmd.append("--zone={zone}".format(zone=self.zone))
       cmd.append("--command=uname")
-      cmd.append("--ssh-key-expire-after=60m")
       timeout = datetime.timedelta(seconds=5 * 60)
       start = datetime.datetime.now()
       while datetime.datetime.now() <= timeout + start:
diff --git a/tools/images/BUILD b/tools/images/BUILD
index fe11f08a3..66ffd02aa 100644
--- a/tools/images/BUILD
+++ b/tools/images/BUILD
@@ -9,7 +9,7 @@ package(
 genrule(
     name = "zone",
     outs = ["zone.txt"],
-    cmd = "gcloud config get-value compute/zone > $@",
+    cmd = "gcloud config get-value compute/zone > \"$@\"",
     tags = [
         "local",
         "manual",
-- 
cgit v1.2.3


From 069f1edbe42ebd91800f9b35e8724babc4081613 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 19 Mar 2020 20:17:03 -0700
Subject: Improve error message when pivot_root fails

PiperOrigin-RevId: 301949722
---
 runsc/cmd/chroot.go | 2 +-
 runsc/cmd/gofer.go  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go
index b5a0ce17d..189244765 100644
--- a/runsc/cmd/chroot.go
+++ b/runsc/cmd/chroot.go
@@ -50,7 +50,7 @@ func pivotRoot(root string) error {
 	// new_root, so after umounting the old_root, we will see only
 	// the new_root in "/".
 	if err := syscall.PivotRoot(".", "."); err != nil {
-		return fmt.Errorf("error changing root filesystem: %v", err)
+		return fmt.Errorf("pivot_root failed, make sure that the root mount has a parent: %v", err)
 	}
 
 	if err := syscall.Unmount(".", syscall.MNT_DETACH); err != nil {
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 6e06f3c0f..02e5af3d3 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -335,7 +335,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 
 	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
 		if err := pivotRoot("/proc"); err != nil {
-			Fatalf("faild to change the root file system: %v", err)
+			Fatalf("failed to change the root file system: %v", err)
 		}
 		if err := os.Chdir("/"); err != nil {
 			Fatalf("failed to change working directory")
-- 
cgit v1.2.3


From 248e46f320525704da917e148a8f69d9b74671a0 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 19 Mar 2020 23:29:15 -0700
Subject: Whitelist utimensat(2).

utimensat is used by hostfs for setting timestamps on imported fds. Previously,
this would crash the sandbox since utimensat was not allowed.

Correct the VFS2 version of hostfs to match the call in VFS1.

PiperOrigin-RevId: 301970121
---
 pkg/sentry/fsimpl/host/BUILD          |  1 +
 pkg/sentry/fsimpl/host/host.go        |  4 ++--
 pkg/sentry/fsimpl/host/util.go        |  8 ++++----
 pkg/sentry/fsimpl/host/util_unsafe.go | 34 ++++++++++++++++++++++++++++++++++
 runsc/boot/filter/config.go           |  8 ++++++++
 5 files changed, 49 insertions(+), 6 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/host/util_unsafe.go

diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index 0bb4a5c3e..82e1fb74b 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -9,6 +9,7 @@ go_library(
         "ioctl_unsafe.go",
         "tty.go",
         "util.go",
+        "util_unsafe.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 3afb41395..1f735628f 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -322,11 +322,11 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 		}
 	}
 	if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
-		timestamps := []unix.Timespec{
+		ts := [2]syscall.Timespec{
 			toTimespec(s.Atime, m&linux.STATX_ATIME == 0),
 			toTimespec(s.Mtime, m&linux.STATX_MTIME == 0),
 		}
-		if err := unix.UtimesNanoAt(i.hostFD, "", timestamps, unix.AT_EMPTY_PATH); err != nil {
+		if err := setTimestamps(i.hostFD, &ts); err != nil {
 			return err
 		}
 	}
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
index d519feef5..2bc757b1a 100644
--- a/pkg/sentry/fsimpl/host/util.go
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -22,15 +22,15 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec {
+func toTimespec(ts linux.StatxTimestamp, omit bool) syscall.Timespec {
 	if omit {
-		return unix.Timespec{
+		return syscall.Timespec{
 			Sec:  0,
 			Nsec: unix.UTIME_OMIT,
 		}
 	}
-	return unix.Timespec{
-		Sec:  int64(ts.Sec),
+	return syscall.Timespec{
+		Sec:  ts.Sec,
 		Nsec: int64(ts.Nsec),
 	}
 }
diff --git a/pkg/sentry/fsimpl/host/util_unsafe.go b/pkg/sentry/fsimpl/host/util_unsafe.go
new file mode 100644
index 000000000..5136ac844
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/util_unsafe.go
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+func setTimestamps(fd int, ts *[2]syscall.Timespec) error {
+	_, _, errno := syscall.Syscall6(
+		syscall.SYS_UTIMENSAT,
+		uintptr(fd),
+		0, /* path */
+		uintptr(unsafe.Pointer(ts)),
+		0, /* flags */
+		0, 0)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index f459d1973..06b9f888a 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -291,6 +291,14 @@ var allowedSyscalls = seccomp.SyscallRules{
 			seccomp.AllowValue(uint64(os.Getpid())),
 		},
 	},
+	syscall.SYS_UTIMENSAT: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0), /* null pathname */
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0), /* flags */
+		},
+	},
 	syscall.SYS_WRITE: {},
 	// The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
 	// values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
-- 
cgit v1.2.3


From 49aef9cee70d111f6c3e1a6b04430bbe414a6c1e Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 20 Mar 2020 15:24:00 -0700
Subject: Remove unused variable `sndNxtList`.

PiperOrigin-RevId: 302110328
---
 pkg/tcpip/transport/tcp/connect.go | 1 -
 pkg/tcpip/transport/tcp/snd.go     | 5 -----
 2 files changed, 6 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index edb37a549..53193afc6 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -878,7 +878,6 @@ func (e *endpoint) handleWrite() *tcpip.Error {
 	first := e.sndQueue.Front()
 	if first != nil {
 		e.snd.writeList.PushBackList(&e.sndQueue)
-		e.snd.sndNxtList.UpdateForward(e.sndBufInQueue)
 		e.sndBufInQueue = 0
 	}
 
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 17fed4ec5..6b7bac37d 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -126,10 +126,6 @@ type sender struct {
 	// sndNxt is the sequence number of the next segment to be sent.
 	sndNxt seqnum.Value
 
-	// sndNxtList is the sequence number of the next segment to be added to
-	// the send list.
-	sndNxtList seqnum.Value
-
 	// rttMeasureSeqNum is the sequence number being used for the latest RTT
 	// measurement.
 	rttMeasureSeqNum seqnum.Value
@@ -229,7 +225,6 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
 		sndWnd:           sndWnd,
 		sndUna:           iss + 1,
 		sndNxt:           iss + 1,
-		sndNxtList:       iss + 1,
 		rto:              1 * time.Second,
 		rttMeasureSeqNum: iss + 1,
 		lastSendTime:     time.Now(),
-- 
cgit v1.2.3


From 1bf2e52bdb5f366b397cb887d4cbdb91dd5e3213 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Fri, 20 Mar 2020 17:00:55 -0700
Subject: Actually wrap rand.Reader in bufio.Reader.

Updates #231

PiperOrigin-RevId: 302127697
---
 pkg/rand/rand_linux.go | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go
index 0bdad5fad..fa6a21026 100644
--- a/pkg/rand/rand_linux.go
+++ b/pkg/rand/rand_linux.go
@@ -17,6 +17,7 @@
 package rand
 
 import (
+	"bufio"
 	"crypto/rand"
 	"io"
 
@@ -45,8 +46,22 @@ func (r *reader) Read(p []byte) (int, error) {
 	return rand.Read(p)
 }
 
+// bufferedReader implements a threadsafe buffered io.Reader.
+type bufferedReader struct {
+	mu sync.Mutex
+	r  *bufio.Reader
+}
+
+// Read implements io.Reader.Read.
+func (b *bufferedReader) Read(p []byte) (int, error) {
+	b.mu.Lock()
+	n, err := b.r.Read(p)
+	b.mu.Unlock()
+	return n, err
+}
+
 // Reader is the default reader.
-var Reader io.Reader = &reader{}
+var Reader io.Reader = &bufferedReader{r: bufio.NewReader(&reader{})}
 
 // Read reads from the default reader.
 func Read(b []byte) (int, error) {
-- 
cgit v1.2.3


From d5fe1ce0c1c551c3165632eecc0ea5589c049bd5 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 20 Mar 2020 17:19:06 -0700
Subject: test: Create a separate /tmp mount only for tests with the shared tag

The root mount is not shared by default, but all other mounts are shared.

So if we create the /tmp mount, this means that we run tests on a shared mount
even if tests run without the --shared option.

PiperOrigin-RevId: 302130790
---
 test/runner/runner.go | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/test/runner/runner.go b/test/runner/runner.go
index a78ef38e0..0d3742f71 100644
--- a/test/runner/runner.go
+++ b/test/runner/runner.go
@@ -300,6 +300,7 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 
 	// Test spec comes with pre-defined mounts that we don't want. Reset it.
 	spec.Mounts = nil
+	testTmpDir := "/tmp"
 	if *useTmpfs {
 		// Forces '/tmp' to be mounted as tmpfs, otherwise test that rely on
 		// features only available in gVisor's internal tmpfs may fail.
@@ -325,11 +326,19 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 			t.Fatalf("could not chmod temp dir: %v", err)
 		}
 
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Type:        "bind",
-			Destination: "/tmp",
-			Source:      tmpDir,
-		})
+		// "/tmp" is not replaced with a tmpfs mount inside the sandbox
+		// when it's not empty. This ensures that testTmpDir uses gofer
+		// in exclusive mode.
+		testTmpDir = tmpDir
+		if *fileAccess == "shared" {
+			// All external mounts except the root mount are shared.
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Type:        "bind",
+				Destination: "/tmp",
+				Source:      tmpDir,
+			})
+			testTmpDir = "/tmp"
+		}
 	}
 
 	// Set environment variables that indicate we are
@@ -349,12 +358,8 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 
 	// Set TEST_TMPDIR to /tmp, as some of the syscall tests require it to
 	// be backed by tmpfs.
-	for i, kv := range env {
-		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
-			env[i] = "TEST_TMPDIR=/tmp"
-			break
-		}
-	}
+	env = filterEnv(env, []string{"TEST_TMPDIR"})
+	env = append(env, fmt.Sprintf("TEST_TMPDIR=%s", testTmpDir))
 
 	spec.Process.Env = env
 
-- 
cgit v1.2.3


From fed59953aad40d89730ebfc6f33b17116c42abcf Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Fri, 20 Mar 2020 18:22:49 -0700
Subject: Statically link libpthread for static c++ binaries.

The posix_server works fine when run in locally or in docker but fails in the
kokoro GCP build environment. Linking libpthread statically fixes it.

PiperOrigin-RevId: 302139082
---
 tools/bazeldefs/defs.bzl | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index 64171ad8d..0a74370a6 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -65,10 +65,17 @@ def cc_binary(name, static = False, **kwargs):
         **kwargs: the rest of the args.
     """
     if static:
-        if "linkopts" in kwargs:
-            kwargs["linkopts"] += ["-static", "-lstdc++"]
-        else:
-            kwargs["linkopts"] = ["-static", "-lstdc++"]
+        # How to statically link a c++ program that uses threads, like for gRPC:
+        # https://gcc.gnu.org/legacy-ml/gcc-help/2010-05/msg00029.html
+        if "linkopts" not in kwargs:
+            kwargs["linkopts"] = []
+        kwargs["linkopts"] += [
+            "-static",
+            "-lstdc++",
+            "-Wl,--whole-archive",
+            "-lpthread",
+            "-Wl,--no-whole-archive",
+        ]
     _cc_binary(
         name = name,
         **kwargs
-- 
cgit v1.2.3


From 8c35614760a194d52a706748adb9af3a28fb864b Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 23 Mar 2020 13:35:10 -0700
Subject: iptables: enable iptables tests as nonblocking

PiperOrigin-RevId: 302506064
---
 kokoro/iptables_tests.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kokoro/iptables_tests.cfg b/kokoro/iptables_tests.cfg
index 7af20629a..a30d82591 100644
--- a/kokoro/iptables_tests.cfg
+++ b/kokoro/iptables_tests.cfg
@@ -1,4 +1,4 @@
-build_file: "repo/scripts/iptables_test.sh"
+build_file: "repo/scripts/iptables_tests.sh"
 
 action {
   define_artifacts {
-- 
cgit v1.2.3


From 6eebaea94936ffefd3603ff16ad2225856313fa3 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 23 Mar 2020 14:32:05 -0700
Subject: Correctly release taskPathOperation for accessAt.

PiperOrigin-RevId: 302518924
---
 pkg/sentry/syscalls/linux/vfs2/stat.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
index 97eaedd66..068243132 100644
--- a/pkg/sentry/syscalls/linux/vfs2/stat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -272,6 +272,7 @@ func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) err
 	if err != nil {
 		return err
 	}
+	defer tpop.Release()
 
 	// access(2) and faccessat(2) check permissions using real
 	// UID/GID, not effective UID/GID.
-- 
cgit v1.2.3


From 369cf38bd7186da97e134538cd4839a8a4d1aa2c Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Mon, 23 Mar 2020 16:05:29 -0700
Subject: Fix data race in SetSockOpt.

PiperOrigin-RevId: 302539171
---
 pkg/sentry/socket/netstack/netstack.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index a6ef7a47e..c19f5639b 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -2382,9 +2382,9 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 		// caller-supplied  buffer.
 		s.readMu.Lock()
 		n, err := s.coalescingRead(ctx, dst, trunc)
-		s.readMu.Unlock()
 		cmsg := s.controlMessages()
 		s.fillCmsgInq(&cmsg)
+		s.readMu.Unlock()
 		return n, 0, nil, 0, cmsg, err
 	}
 
-- 
cgit v1.2.3


From a730d74b3230fb32181b9a940c07b61338222874 Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Mon, 23 Mar 2020 16:11:37 -0700
Subject: Support basic /proc/net/dev metrics for netstack

Fixes #506

PiperOrigin-RevId: 302540404
---
 pkg/sentry/socket/netstack/stack.go | 73 ++++++++++++++++++++++++++-----------
 test/syscalls/linux/proc_net.cc     | 53 +++++++++++++++++++++++++++
 2 files changed, 105 insertions(+), 21 deletions(-)

diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index 0692482e9..a8e2e8c24 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -200,36 +200,66 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
 // Statistics implements inet.Stack.Statistics.
 func (s *Stack) Statistics(stat interface{}, arg string) error {
 	switch stats := stat.(type) {
+	case *inet.StatDev:
+		for _, ni := range s.Stack.NICInfo() {
+			if ni.Name != arg {
+				continue
+			}
+			// TODO(gvisor.dev/issue/2103) Support stubbed stats.
+			*stats = inet.StatDev{
+				// Receive section.
+				ni.Stats.Rx.Bytes.Value(),   // bytes.
+				ni.Stats.Rx.Packets.Value(), // packets.
+				0,                           // errs.
+				0,                           // drop.
+				0,                           // fifo.
+				0,                           // frame.
+				0,                           // compressed.
+				0,                           // multicast.
+				// Transmit section.
+				ni.Stats.Tx.Bytes.Value(),   // bytes.
+				ni.Stats.Tx.Packets.Value(), // packets.
+				0,                           // errs.
+				0,                           // drop.
+				0,                           // fifo.
+				0,                           // colls.
+				0,                           // carrier.
+				0,                           // compressed.
+			}
+			break
+		}
 	case *inet.StatSNMPIP:
 		ip := Metrics.IP
+		// TODO(gvisor.dev/issue/969) Support stubbed stats.
 		*stats = inet.StatSNMPIP{
-			0,                          // TODO(gvisor.dev/issue/969): Support Ip/Forwarding.
-			0,                          // TODO(gvisor.dev/issue/969): Support Ip/DefaultTTL.
+			0,                          // Ip/Forwarding.
+			0,                          // Ip/DefaultTTL.
 			ip.PacketsReceived.Value(), // InReceives.
-			0,                          // TODO(gvisor.dev/issue/969): Support Ip/InHdrErrors.
+			0,                          // Ip/InHdrErrors.
 			ip.InvalidDestinationAddressesReceived.Value(), // InAddrErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ForwDatagrams.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/InUnknownProtos.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/InDiscards.
+			0,                               // Ip/ForwDatagrams.
+			0,                               // Ip/InUnknownProtos.
+			0,                               // Ip/InDiscards.
 			ip.PacketsDelivered.Value(),     // InDelivers.
 			ip.PacketsSent.Value(),          // OutRequests.
 			ip.OutgoingPacketErrors.Value(), // OutDiscards.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/OutNoRoutes.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmTimeout.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmReqds.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmOKs.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmFails.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragOKs.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragFails.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragCreates.
+			0,                               // Ip/OutNoRoutes.
+			0,                               // Support Ip/ReasmTimeout.
+			0,                               // Support Ip/ReasmReqds.
+			0,                               // Support Ip/ReasmOKs.
+			0,                               // Support Ip/ReasmFails.
+			0,                               // Support Ip/FragOKs.
+			0,                               // Support Ip/FragFails.
+			0,                               // Support Ip/FragCreates.
 		}
 	case *inet.StatSNMPICMP:
 		in := Metrics.ICMP.V4PacketsReceived.ICMPv4PacketStats
 		out := Metrics.ICMP.V4PacketsSent.ICMPv4PacketStats
+		// TODO(gvisor.dev/issue/969) Support stubbed stats.
 		*stats = inet.StatSNMPICMP{
-			0, // TODO(gvisor.dev/issue/969): Support Icmp/InMsgs.
+			0, // Icmp/InMsgs.
 			Metrics.ICMP.V4PacketsSent.Dropped.Value(), // InErrors.
-			0,                         // TODO(gvisor.dev/issue/969): Support Icmp/InCsumErrors.
+			0,                         // Icmp/InCsumErrors.
 			in.DstUnreachable.Value(), // InDestUnreachs.
 			in.TimeExceeded.Value(),   // InTimeExcds.
 			in.ParamProblem.Value(),   // InParmProbs.
@@ -241,7 +271,7 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 			in.TimestampReply.Value(), // InTimestampReps.
 			in.InfoRequest.Value(),    // InAddrMasks.
 			in.InfoReply.Value(),      // InAddrMaskReps.
-			0,                         // TODO(gvisor.dev/issue/969): Support Icmp/OutMsgs.
+			0,                         // Icmp/OutMsgs.
 			Metrics.ICMP.V4PacketsReceived.Invalid.Value(), // OutErrors.
 			out.DstUnreachable.Value(),                     // OutDestUnreachs.
 			out.TimeExceeded.Value(),                       // OutTimeExcds.
@@ -277,15 +307,16 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 		}
 	case *inet.StatSNMPUDP:
 		udp := Metrics.UDP
+		// TODO(gvisor.dev/issue/969) Support stubbed stats.
 		*stats = inet.StatSNMPUDP{
 			udp.PacketsReceived.Value(),     // InDatagrams.
 			udp.UnknownPortErrors.Value(),   // NoPorts.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/InErrors.
+			0,                               // Udp/InErrors.
 			udp.PacketsSent.Value(),         // OutDatagrams.
 			udp.ReceiveBufferErrors.Value(), // RcvbufErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/SndbufErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/InCsumErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/IgnoredMulti.
+			0,                               // Udp/SndbufErrors.
+			0,                               // Udp/InCsumErrors.
+			0,                               // Udp/IgnoredMulti.
 		}
 	default:
 		return syserr.ErrEndpointOperation.ToError()
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index 05c952b99..4e23d1e78 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -92,6 +92,59 @@ TEST(ProcSysNetIpv4Sack, CanReadAndWrite) {
   EXPECT_EQ(buf, to_write);
 }
 
+// DeviceEntry is an entry in /proc/net/dev
+struct DeviceEntry {
+  std::string name;
+  uint64_t stats[16];
+};
+
+PosixErrorOr<std::vector<DeviceEntry>> GetDeviceMetricsFromProc(
+    const std::string dev) {
+  std::vector<std::string> lines = absl::StrSplit(dev, '\n');
+  std::vector<DeviceEntry> entries;
+
+  // /proc/net/dev prints 2 lines of headers followed by a line of metrics for
+  // each network interface.
+  for (unsigned i = 2; i < lines.size(); i++) {
+    // Ignore empty lines.
+    if (lines[i].empty()) {
+      continue;
+    }
+
+    std::vector<std::string> values =
+        absl::StrSplit(lines[i], ' ', absl::SkipWhitespace());
+
+    // Interface name + 16 values.
+    if (values.size() != 17) {
+      return PosixError(EINVAL, "invalid line: " + lines[i]);
+    }
+
+    DeviceEntry entry;
+    entry.name = values[0];
+    // Skip the interface name and read only the values.
+    for (unsigned j = 1; j < 17; j++) {
+      uint64_t num;
+      if (!absl::SimpleAtoi(values[j], &num)) {
+        return PosixError(EINVAL, "invalid value: " + values[j]);
+      }
+      entry.stats[j - 1] = num;
+    }
+
+    entries.push_back(entry);
+  }
+
+  return entries;
+}
+
+// TEST(ProcNetDev, Format) tests that /proc/net/dev is parsable and
+// contains at least one entry.
+TEST(ProcNetDev, Format) {
+  auto dev = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/dev"));
+  auto entries = ASSERT_NO_ERRNO_AND_VALUE(GetDeviceMetricsFromProc(dev));
+
+  EXPECT_GT(entries.size(), 0);
+}
+
 PosixErrorOr<uint64_t> GetSNMPMetricFromProc(const std::string snmp,
                                              const std::string& type,
                                              const std::string& item) {
-- 
cgit v1.2.3


From 7e4073af12bed2c76bc5757ef3e5fbfba75308a0 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 24 Mar 2020 09:05:06 -0700
Subject: Move tcpip.PacketBuffer and IPTables to stack package.

This is a precursor to be being able to build an intrusive list
of PacketBuffers for use in queuing disciplines being implemented.

Updates #2214

PiperOrigin-RevId: 302677662
---
 pkg/sentry/socket/netfilter/BUILD                  |   1 -
 pkg/sentry/socket/netfilter/extensions.go          |  14 +-
 pkg/sentry/socket/netfilter/netfilter.go           | 121 ++++----
 pkg/sentry/socket/netfilter/targets.go             |  11 +-
 pkg/sentry/socket/netfilter/tcp_matcher.go         |  11 +-
 pkg/sentry/socket/netfilter/udp_matcher.go         |  13 +-
 pkg/sentry/socket/netstack/BUILD                   |   1 -
 pkg/sentry/socket/netstack/stack.go                |   3 +-
 pkg/tcpip/BUILD                                    |   2 -
 pkg/tcpip/iptables/BUILD                           |  18 --
 pkg/tcpip/iptables/iptables.go                     | 314 ---------------------
 pkg/tcpip/iptables/targets.go                      | 144 ----------
 pkg/tcpip/iptables/types.go                        | 180 ------------
 pkg/tcpip/link/channel/channel.go                  |  14 +-
 pkg/tcpip/link/fdbased/endpoint.go                 |   6 +-
 pkg/tcpip/link/fdbased/endpoint_test.go            |  10 +-
 pkg/tcpip/link/fdbased/mmap.go                     |   3 +-
 pkg/tcpip/link/fdbased/packet_dispatchers.go       |   4 +-
 pkg/tcpip/link/loopback/loopback.go                |   8 +-
 pkg/tcpip/link/muxed/injectable.go                 |   6 +-
 pkg/tcpip/link/muxed/injectable_test.go            |   4 +-
 pkg/tcpip/link/sharedmem/sharedmem.go              |   6 +-
 pkg/tcpip/link/sharedmem/sharedmem_test.go         |  26 +-
 pkg/tcpip/link/sniffer/sniffer.go                  |  10 +-
 pkg/tcpip/link/tun/device.go                       |   2 +-
 pkg/tcpip/link/waitable/waitable.go                |   6 +-
 pkg/tcpip/link/waitable/waitable_test.go           |  18 +-
 pkg/tcpip/network/arp/arp.go                       |  12 +-
 pkg/tcpip/network/arp/arp_test.go                  |   2 +-
 pkg/tcpip/network/ip_test.go                       |  24 +-
 pkg/tcpip/network/ipv4/BUILD                       |   1 -
 pkg/tcpip/network/ipv4/icmp.go                     |   9 +-
 pkg/tcpip/network/ipv4/ipv4.go                     |  21 +-
 pkg/tcpip/network/ipv4/ipv4_test.go                |  18 +-
 pkg/tcpip/network/ipv6/icmp.go                     |  10 +-
 pkg/tcpip/network/ipv6/icmp_test.go                |  14 +-
 pkg/tcpip/network/ipv6/ipv6.go                     |  10 +-
 pkg/tcpip/network/ipv6/ipv6_test.go                |   4 +-
 pkg/tcpip/network/ipv6/ndp_test.go                 |   8 +-
 pkg/tcpip/packet_buffer.go                         |  67 -----
 pkg/tcpip/packet_buffer_state.go                   |  27 --
 pkg/tcpip/stack/BUILD                              |   8 +-
 pkg/tcpip/stack/forwarder.go                       |   4 +-
 pkg/tcpip/stack/forwarder_test.go                  |  36 +--
 pkg/tcpip/stack/iptables.go                        | 311 ++++++++++++++++++++
 pkg/tcpip/stack/iptables_targets.go                | 144 ++++++++++
 pkg/tcpip/stack/iptables_types.go                  | 180 ++++++++++++
 pkg/tcpip/stack/ndp.go                             |   4 +-
 pkg/tcpip/stack/ndp_test.go                        |  14 +-
 pkg/tcpip/stack/nic.go                             |  13 +-
 pkg/tcpip/stack/nic_test.go                        |   3 +-
 pkg/tcpip/stack/packet_buffer.go                   |  66 +++++
 pkg/tcpip/stack/packet_buffer_state.go             |  26 ++
 pkg/tcpip/stack/registration.go                    |  32 +--
 pkg/tcpip/stack/route.go                           |   6 +-
 pkg/tcpip/stack/stack.go                           |  11 +-
 pkg/tcpip/stack/stack_test.go                      |  28 +-
 pkg/tcpip/stack/transport_demuxer.go               |  14 +-
 pkg/tcpip/stack/transport_demuxer_test.go          |   2 +-
 pkg/tcpip/stack/transport_test.go                  |  27 +-
 pkg/tcpip/transport/icmp/BUILD                     |   1 -
 pkg/tcpip/transport/icmp/endpoint.go               |  11 +-
 pkg/tcpip/transport/icmp/protocol.go               |   2 +-
 pkg/tcpip/transport/packet/BUILD                   |   1 -
 pkg/tcpip/transport/packet/endpoint.go             |   9 +-
 pkg/tcpip/transport/raw/BUILD                      |   1 -
 pkg/tcpip/transport/raw/endpoint.go                |   9 +-
 pkg/tcpip/transport/tcp/BUILD                      |   1 -
 pkg/tcpip/transport/tcp/connect.go                 |   6 +-
 pkg/tcpip/transport/tcp/dispatcher.go              |   3 +-
 pkg/tcpip/transport/tcp/endpoint.go                |   7 +-
 pkg/tcpip/transport/tcp/forwarder.go               |   2 +-
 pkg/tcpip/transport/tcp/protocol.go                |   4 +-
 pkg/tcpip/transport/tcp/segment.go                 |   3 +-
 pkg/tcpip/transport/tcp/testing/context/context.go |  10 +-
 pkg/tcpip/transport/udp/BUILD                      |   1 -
 pkg/tcpip/transport/udp/endpoint.go                |   9 +-
 pkg/tcpip/transport/udp/forwarder.go               |   4 +-
 pkg/tcpip/transport/udp/protocol.go                |   6 +-
 pkg/tcpip/transport/udp/udp_test.go                |   4 +-
 80 files changed, 1080 insertions(+), 1126 deletions(-)
 delete mode 100644 pkg/tcpip/iptables/BUILD
 delete mode 100644 pkg/tcpip/iptables/iptables.go
 delete mode 100644 pkg/tcpip/iptables/targets.go
 delete mode 100644 pkg/tcpip/iptables/types.go
 delete mode 100644 pkg/tcpip/packet_buffer.go
 delete mode 100644 pkg/tcpip/packet_buffer_state.go
 create mode 100644 pkg/tcpip/stack/iptables.go
 create mode 100644 pkg/tcpip/stack/iptables_targets.go
 create mode 100644 pkg/tcpip/stack/iptables_types.go
 create mode 100644 pkg/tcpip/stack/packet_buffer.go
 create mode 100644 pkg/tcpip/stack/packet_buffer_state.go

diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index 7cd2ce55b..e801abeb8 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -22,7 +22,6 @@ go_library(
         "//pkg/syserr",
         "//pkg/tcpip",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/usermem",
     ],
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index b4b244abf..0336a32d8 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -19,7 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -37,12 +37,12 @@ type matchMaker interface {
 	// name is the matcher name as stored in the xt_entry_match struct.
 	name() string
 
-	// marshal converts from an iptables.Matcher to an ABI struct.
-	marshal(matcher iptables.Matcher) []byte
+	// marshal converts from an stack.Matcher to an ABI struct.
+	marshal(matcher stack.Matcher) []byte
 
 	// unmarshal converts from the ABI matcher struct to an
-	// iptables.Matcher.
-	unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error)
+	// stack.Matcher.
+	unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error)
 }
 
 // matchMakers maps the name of supported matchers to the matchMaker that
@@ -58,7 +58,7 @@ func registerMatchMaker(mm matchMaker) {
 	matchMakers[mm.name()] = mm
 }
 
-func marshalMatcher(matcher iptables.Matcher) []byte {
+func marshalMatcher(matcher stack.Matcher) []byte {
 	matchMaker, ok := matchMakers[matcher.Name()]
 	if !ok {
 		panic(fmt.Sprintf("Unknown matcher of type %T.", matcher))
@@ -86,7 +86,7 @@ func marshalEntryMatch(name string, data []byte) []byte {
 	return append(buf, make([]byte, size-len(buf))...)
 }
 
-func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter, buf []byte) (iptables.Matcher, error) {
+func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf []byte) (stack.Matcher, error) {
 	matchMaker, ok := matchMakers[match.Name.String()]
 	if !ok {
 		return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String())
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index b5b9be46f..55bcc3ace 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -27,7 +27,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -129,19 +128,19 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	return entries, nil
 }
 
-func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, error) {
-	ipt := stack.IPTables()
+func findTable(stk *stack.Stack, tablename linux.TableName) (stack.Table, error) {
+	ipt := stk.IPTables()
 	table, ok := ipt.Tables[tablename.String()]
 	if !ok {
-		return iptables.Table{}, fmt.Errorf("couldn't find table %q", tablename)
+		return stack.Table{}, fmt.Errorf("couldn't find table %q", tablename)
 	}
 	return table, nil
 }
 
 // FillDefaultIPTables sets stack's IPTables to the default tables and
 // populates them with metadata.
-func FillDefaultIPTables(stack *stack.Stack) {
-	ipt := iptables.DefaultTables()
+func FillDefaultIPTables(stk *stack.Stack) {
+	ipt := stack.DefaultTables()
 
 	// In order to fill in the metadata, we have to translate ipt from its
 	// netstack format to Linux's giant-binary-blob format.
@@ -154,14 +153,14 @@ func FillDefaultIPTables(stack *stack.Stack) {
 		ipt.Tables[name] = table
 	}
 
-	stack.SetIPTables(ipt)
+	stk.SetIPTables(ipt)
 }
 
 // convertNetstackToBinary converts the iptables as stored in netstack to the
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
 // jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, error) {
+func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelIPTGetEntries, metadata, error) {
 	// Return values.
 	var entries linux.KernelIPTGetEntries
 	var meta metadata
@@ -234,19 +233,19 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	return entries, meta, nil
 }
 
-func marshalTarget(target iptables.Target) []byte {
+func marshalTarget(target stack.Target) []byte {
 	switch tg := target.(type) {
-	case iptables.AcceptTarget:
-		return marshalStandardTarget(iptables.RuleAccept)
-	case iptables.DropTarget:
-		return marshalStandardTarget(iptables.RuleDrop)
-	case iptables.ErrorTarget:
+	case stack.AcceptTarget:
+		return marshalStandardTarget(stack.RuleAccept)
+	case stack.DropTarget:
+		return marshalStandardTarget(stack.RuleDrop)
+	case stack.ErrorTarget:
 		return marshalErrorTarget(errorTargetName)
-	case iptables.UserChainTarget:
+	case stack.UserChainTarget:
 		return marshalErrorTarget(tg.Name)
-	case iptables.ReturnTarget:
-		return marshalStandardTarget(iptables.RuleReturn)
-	case iptables.RedirectTarget:
+	case stack.ReturnTarget:
+		return marshalStandardTarget(stack.RuleReturn)
+	case stack.RedirectTarget:
 		return marshalRedirectTarget()
 	case JumpTarget:
 		return marshalJumpTarget(tg)
@@ -255,7 +254,7 @@ func marshalTarget(target iptables.Target) []byte {
 	}
 }
 
-func marshalStandardTarget(verdict iptables.RuleVerdict) []byte {
+func marshalStandardTarget(verdict stack.RuleVerdict) []byte {
 	nflog("convert to binary: marshalling standard target")
 
 	// The target's name will be the empty string.
@@ -316,13 +315,13 @@ func marshalJumpTarget(jt JumpTarget) []byte {
 
 // translateFromStandardVerdict translates verdicts the same way as the iptables
 // tool.
-func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 {
+func translateFromStandardVerdict(verdict stack.RuleVerdict) int32 {
 	switch verdict {
-	case iptables.RuleAccept:
+	case stack.RuleAccept:
 		return -linux.NF_ACCEPT - 1
-	case iptables.RuleDrop:
+	case stack.RuleDrop:
 		return -linux.NF_DROP - 1
-	case iptables.RuleReturn:
+	case stack.RuleReturn:
 		return linux.NF_RETURN
 	default:
 		// TODO(gvisor.dev/issue/170): Support Jump.
@@ -331,18 +330,18 @@ func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 {
 }
 
 // translateToStandardTarget translates from the value in a
-// linux.XTStandardTarget to an iptables.Verdict.
-func translateToStandardTarget(val int32) (iptables.Target, error) {
+// linux.XTStandardTarget to an stack.Verdict.
+func translateToStandardTarget(val int32) (stack.Target, error) {
 	// TODO(gvisor.dev/issue/170): Support other verdicts.
 	switch val {
 	case -linux.NF_ACCEPT - 1:
-		return iptables.AcceptTarget{}, nil
+		return stack.AcceptTarget{}, nil
 	case -linux.NF_DROP - 1:
-		return iptables.DropTarget{}, nil
+		return stack.DropTarget{}, nil
 	case -linux.NF_QUEUE - 1:
 		return nil, errors.New("unsupported iptables verdict QUEUE")
 	case linux.NF_RETURN:
-		return iptables.ReturnTarget{}, nil
+		return stack.ReturnTarget{}, nil
 	default:
 		return nil, fmt.Errorf("unknown iptables verdict %d", val)
 	}
@@ -350,7 +349,7 @@ func translateToStandardTarget(val int32) (iptables.Target, error) {
 
 // SetEntries sets iptables rules for a single table. See
 // net/ipv4/netfilter/ip_tables.c:translate_table for reference.
-func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
+func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
 		nflog("optVal has insufficient size for replace %d", len(optVal))
@@ -362,12 +361,12 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace)
 
 	// TODO(gvisor.dev/issue/170): Support other tables.
-	var table iptables.Table
+	var table stack.Table
 	switch replace.Name.String() {
-	case iptables.TablenameFilter:
-		table = iptables.EmptyFilterTable()
-	case iptables.TablenameNat:
-		table = iptables.EmptyNatTable()
+	case stack.TablenameFilter:
+		table = stack.EmptyFilterTable()
+	case stack.TablenameNat:
+		table = stack.EmptyNatTable()
 	default:
 		nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
 		return syserr.ErrInvalidArgument
@@ -434,7 +433,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 		optVal = optVal[targetSize:]
 
-		table.Rules = append(table.Rules, iptables.Rule{
+		table.Rules = append(table.Rules, stack.Rule{
 			Filter:   filter,
 			Target:   target,
 			Matchers: matchers,
@@ -465,11 +464,11 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 					table.Underflows[hk] = ruleIdx
 				}
 			}
-			if ruleIdx := table.BuiltinChains[hk]; ruleIdx == iptables.HookUnset {
+			if ruleIdx := table.BuiltinChains[hk]; ruleIdx == stack.HookUnset {
 				nflog("hook %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
-			if ruleIdx := table.Underflows[hk]; ruleIdx == iptables.HookUnset {
+			if ruleIdx := table.Underflows[hk]; ruleIdx == stack.HookUnset {
 				nflog("underflow %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
@@ -478,7 +477,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 	// Add the user chains.
 	for ruleIdx, rule := range table.Rules {
-		target, ok := rule.Target.(iptables.UserChainTarget)
+		target, ok := rule.Target.(stack.UserChainTarget)
 		if !ok {
 			continue
 		}
@@ -522,8 +521,8 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	// PREROUTING chain right now, make sure all other chains point to
 	// ACCEPT rules.
 	for hook, ruleIdx := range table.BuiltinChains {
-		if hook != iptables.Input && hook != iptables.Prerouting {
-			if _, ok := table.Rules[ruleIdx].Target.(iptables.AcceptTarget); !ok {
+		if hook != stack.Input && hook != stack.Prerouting {
+			if _, ok := table.Rules[ruleIdx].Target.(stack.AcceptTarget); !ok {
 				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
 			}
@@ -535,7 +534,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	// - There are no chains without an unconditional final rule.
 	// - There are no chains without an unconditional underflow rule.
 
-	ipt := stack.IPTables()
+	ipt := stk.IPTables()
 	table.SetMetadata(metadata{
 		HookEntry:  replace.HookEntry,
 		Underflow:  replace.Underflow,
@@ -543,16 +542,16 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		Size:       replace.Size,
 	})
 	ipt.Tables[replace.Name.String()] = table
-	stack.SetIPTables(ipt)
+	stk.SetIPTables(ipt)
 
 	return nil
 }
 
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
 // only the matchers.
-func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, error) {
+func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, error) {
 	nflog("set entries: parsing matchers of size %d", len(optVal))
-	var matchers []iptables.Matcher
+	var matchers []stack.Matcher
 	for len(optVal) > 0 {
 		nflog("set entries: optVal has len %d", len(optVal))
 
@@ -594,7 +593,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
-func parseTarget(filter iptables.IPHeaderFilter, optVal []byte) (iptables.Target, error) {
+func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, error) {
 	nflog("set entries: parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
 		return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal))
@@ -638,11 +637,11 @@ func parseTarget(filter iptables.IPHeaderFilter, optVal []byte) (iptables.Target
 		switch name := errorTarget.Name.String(); name {
 		case errorTargetName:
 			nflog("set entries: error target")
-			return iptables.ErrorTarget{}, nil
+			return stack.ErrorTarget{}, nil
 		default:
 			// User defined chain.
 			nflog("set entries: user-defined target %q", name)
-			return iptables.UserChainTarget{Name: name}, nil
+			return stack.UserChainTarget{Name: name}, nil
 		}
 
 	case redirectTargetName:
@@ -659,8 +658,8 @@ func parseTarget(filter iptables.IPHeaderFilter, optVal []byte) (iptables.Target
 		buf = optVal[:linux.SizeOfXTRedirectTarget]
 		binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
 
-		// Copy linux.XTRedirectTarget to iptables.RedirectTarget.
-		var target iptables.RedirectTarget
+		// Copy linux.XTRedirectTarget to stack.RedirectTarget.
+		var target stack.RedirectTarget
 		nfRange := redirectTarget.NfRange
 
 		// RangeSize should be 1.
@@ -699,14 +698,14 @@ func parseTarget(filter iptables.IPHeaderFilter, optVal []byte) (iptables.Target
 	return nil, fmt.Errorf("unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
 }
 
-func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, error) {
+func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
 	if containsUnsupportedFields(iptip) {
-		return iptables.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
+		return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
 	}
 	if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
-		return iptables.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
 	}
-	return iptables.IPHeaderFilter{
+	return stack.IPHeaderFilter{
 		Protocol:  tcpip.TransportProtocolNumber(iptip.Protocol),
 		Dst:       tcpip.Address(iptip.Dst[:]),
 		DstMask:   tcpip.Address(iptip.DstMask[:]),
@@ -733,30 +732,30 @@ func containsUnsupportedFields(iptip linux.IPTIP) bool {
 		iptip.InverseFlags&^inverseMask != 0
 }
 
-func validUnderflow(rule iptables.Rule) bool {
+func validUnderflow(rule stack.Rule) bool {
 	if len(rule.Matchers) != 0 {
 		return false
 	}
 	switch rule.Target.(type) {
-	case iptables.AcceptTarget, iptables.DropTarget:
+	case stack.AcceptTarget, stack.DropTarget:
 		return true
 	default:
 		return false
 	}
 }
 
-func hookFromLinux(hook int) iptables.Hook {
+func hookFromLinux(hook int) stack.Hook {
 	switch hook {
 	case linux.NF_INET_PRE_ROUTING:
-		return iptables.Prerouting
+		return stack.Prerouting
 	case linux.NF_INET_LOCAL_IN:
-		return iptables.Input
+		return stack.Input
 	case linux.NF_INET_FORWARD:
-		return iptables.Forward
+		return stack.Forward
 	case linux.NF_INET_LOCAL_OUT:
-		return iptables.Output
+		return stack.Output
 	case linux.NF_INET_POST_ROUTING:
-		return iptables.Postrouting
+		return stack.Postrouting
 	}
 	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
 }
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
index c421b87cf..c948de876 100644
--- a/pkg/sentry/socket/netfilter/targets.go
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -15,11 +15,10 @@
 package netfilter
 
 import (
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
-// JumpTarget implements iptables.Target.
+// JumpTarget implements stack.Target.
 type JumpTarget struct {
 	// Offset is the byte offset of the rule to jump to. It is used for
 	// marshaling and unmarshaling.
@@ -29,7 +28,7 @@ type JumpTarget struct {
 	RuleNum int
 }
 
-// Action implements iptables.Target.Action.
-func (jt JumpTarget) Action(tcpip.PacketBuffer) (iptables.RuleVerdict, int) {
-	return iptables.RuleJump, jt.RuleNum
+// Action implements stack.Target.Action.
+func (jt JumpTarget) Action(stack.PacketBuffer) (stack.RuleVerdict, int) {
+	return stack.RuleJump, jt.RuleNum
 }
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index f9945e214..ff1cfd8f6 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -19,9 +19,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -40,7 +39,7 @@ func (tcpMarshaler) name() string {
 }
 
 // marshal implements matchMaker.marshal.
-func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
+func (tcpMarshaler) marshal(mr stack.Matcher) []byte {
 	matcher := mr.(*TCPMatcher)
 	xttcp := linux.XTTCP{
 		SourcePortStart:      matcher.sourcePortStart,
@@ -53,7 +52,7 @@ func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
 }
 
 // unmarshal implements matchMaker.unmarshal.
-func (tcpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
+func (tcpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) {
 	if len(buf) < linux.SizeOfXTTCP {
 		return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf))
 	}
@@ -97,7 +96,7 @@ func (*TCPMatcher) Name() string {
 }
 
 // Match implements Matcher.Match.
-func (tm *TCPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
 	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
@@ -115,7 +114,7 @@ func (tm *TCPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfac
 	// Now we need the transport header. However, this may not have been set
 	// yet.
 	// TODO(gvisor.dev/issue/170): Parsing the transport header should
-	// ultimately be moved into the iptables.Check codepath as matchers are
+	// ultimately be moved into the stack.Check codepath as matchers are
 	// added.
 	var tcpHeader header.TCP
 	if pkt.TransportHeader != nil {
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index 86aa11696..3359418c1 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -19,9 +19,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -40,7 +39,7 @@ func (udpMarshaler) name() string {
 }
 
 // marshal implements matchMaker.marshal.
-func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
+func (udpMarshaler) marshal(mr stack.Matcher) []byte {
 	matcher := mr.(*UDPMatcher)
 	xtudp := linux.XTUDP{
 		SourcePortStart:      matcher.sourcePortStart,
@@ -53,7 +52,7 @@ func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
 }
 
 // unmarshal implements matchMaker.unmarshal.
-func (udpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
+func (udpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) {
 	if len(buf) < linux.SizeOfXTUDP {
 		return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf))
 	}
@@ -94,11 +93,11 @@ func (*UDPMatcher) Name() string {
 }
 
 // Match implements Matcher.Match.
-func (um *UDPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
 	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
-	// into the iptables.Check codepath as matchers are added.
+	// into the stack.Check codepath as matchers are added.
 	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
 		return false, false
 	}
@@ -114,7 +113,7 @@ func (um *UDPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfac
 	// Now we need the transport header. However, this may not have been set
 	// yet.
 	// TODO(gvisor.dev/issue/170): Parsing the transport header should
-	// ultimately be moved into the iptables.Check codepath as matchers are
+	// ultimately be moved into the stack.Check codepath as matchers are
 	// added.
 	var udpHeader header.UDP
 	if pkt.TransportHeader != nil {
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index ab01cb4fa..cbf46b1e9 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -38,7 +38,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
         "//pkg/tcpip/stack",
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index a8e2e8c24..f5fa18136 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -23,7 +23,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -363,7 +362,7 @@ func (s *Stack) RouteTable() []inet.Route {
 }
 
 // IPTables returns the stack's iptables.
-func (s *Stack) IPTables() (iptables.IPTables, error) {
+func (s *Stack) IPTables() (stack.IPTables, error) {
 	return s.Stack.IPTables(), nil
 }
 
diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
index 26f7ba86b..454e07662 100644
--- a/pkg/tcpip/BUILD
+++ b/pkg/tcpip/BUILD
@@ -5,8 +5,6 @@ package(licenses = ["notice"])
 go_library(
     name = "tcpip",
     srcs = [
-        "packet_buffer.go",
-        "packet_buffer_state.go",
         "tcpip.go",
         "time_unsafe.go",
         "timer.go",
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
deleted file mode 100644
index d1b73cfdf..000000000
--- a/pkg/tcpip/iptables/BUILD
+++ /dev/null
@@ -1,18 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "iptables",
-    srcs = [
-        "iptables.go",
-        "targets.go",
-        "types.go",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//pkg/log",
-        "//pkg/tcpip",
-        "//pkg/tcpip/header",
-    ],
-)
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
deleted file mode 100644
index d30571c74..000000000
--- a/pkg/tcpip/iptables/iptables.go
+++ /dev/null
@@ -1,314 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package iptables supports packet filtering and manipulation via the iptables
-// tool.
-package iptables
-
-import (
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-)
-
-// Table names.
-const (
-	TablenameNat    = "nat"
-	TablenameMangle = "mangle"
-	TablenameFilter = "filter"
-)
-
-// Chain names as defined by net/ipv4/netfilter/ip_tables.c.
-const (
-	ChainNamePrerouting  = "PREROUTING"
-	ChainNameInput       = "INPUT"
-	ChainNameForward     = "FORWARD"
-	ChainNameOutput      = "OUTPUT"
-	ChainNamePostrouting = "POSTROUTING"
-)
-
-// HookUnset indicates that there is no hook set for an entrypoint or
-// underflow.
-const HookUnset = -1
-
-// DefaultTables returns a default set of tables. Each chain is set to accept
-// all packets.
-func DefaultTables() IPTables {
-	// TODO(gvisor.dev/issue/170): We may be able to swap out some strings for
-	// iotas.
-	return IPTables{
-		Tables: map[string]Table{
-			TablenameNat: Table{
-				Rules: []Rule{
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: ErrorTarget{}},
-				},
-				BuiltinChains: map[Hook]int{
-					Prerouting:  0,
-					Input:       1,
-					Output:      2,
-					Postrouting: 3,
-				},
-				Underflows: map[Hook]int{
-					Prerouting:  0,
-					Input:       1,
-					Output:      2,
-					Postrouting: 3,
-				},
-				UserChains: map[string]int{},
-			},
-			TablenameMangle: Table{
-				Rules: []Rule{
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: ErrorTarget{}},
-				},
-				BuiltinChains: map[Hook]int{
-					Prerouting: 0,
-					Output:     1,
-				},
-				Underflows: map[Hook]int{
-					Prerouting: 0,
-					Output:     1,
-				},
-				UserChains: map[string]int{},
-			},
-			TablenameFilter: Table{
-				Rules: []Rule{
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: ErrorTarget{}},
-				},
-				BuiltinChains: map[Hook]int{
-					Input:   0,
-					Forward: 1,
-					Output:  2,
-				},
-				Underflows: map[Hook]int{
-					Input:   0,
-					Forward: 1,
-					Output:  2,
-				},
-				UserChains: map[string]int{},
-			},
-		},
-		Priorities: map[Hook][]string{
-			Input:      []string{TablenameNat, TablenameFilter},
-			Prerouting: []string{TablenameMangle, TablenameNat},
-			Output:     []string{TablenameMangle, TablenameNat, TablenameFilter},
-		},
-	}
-}
-
-// EmptyFilterTable returns a Table with no rules and the filter table chains
-// mapped to HookUnset.
-func EmptyFilterTable() Table {
-	return Table{
-		Rules: []Rule{},
-		BuiltinChains: map[Hook]int{
-			Input:   HookUnset,
-			Forward: HookUnset,
-			Output:  HookUnset,
-		},
-		Underflows: map[Hook]int{
-			Input:   HookUnset,
-			Forward: HookUnset,
-			Output:  HookUnset,
-		},
-		UserChains: map[string]int{},
-	}
-}
-
-// EmptyNatTable returns a Table with no rules and the filter table chains
-// mapped to HookUnset.
-func EmptyNatTable() Table {
-	return Table{
-		Rules: []Rule{},
-		BuiltinChains: map[Hook]int{
-			Prerouting:  HookUnset,
-			Input:       HookUnset,
-			Output:      HookUnset,
-			Postrouting: HookUnset,
-		},
-		Underflows: map[Hook]int{
-			Prerouting:  HookUnset,
-			Input:       HookUnset,
-			Output:      HookUnset,
-			Postrouting: HookUnset,
-		},
-		UserChains: map[string]int{},
-	}
-}
-
-// A chainVerdict is what a table decides should be done with a packet.
-type chainVerdict int
-
-const (
-	// chainAccept indicates the packet should continue through netstack.
-	chainAccept chainVerdict = iota
-
-	// chainAccept indicates the packet should be dropped.
-	chainDrop
-
-	// chainReturn indicates the packet should return to the calling chain
-	// or the underflow rule of a builtin chain.
-	chainReturn
-)
-
-// Check runs pkt through the rules for hook. It returns true when the packet
-// should continue traversing the network stack and false when it should be
-// dropped.
-//
-// Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
-	// Go through each table containing the hook.
-	for _, tablename := range it.Priorities[hook] {
-		table := it.Tables[tablename]
-		ruleIdx := table.BuiltinChains[hook]
-		switch verdict := it.checkChain(hook, pkt, table, ruleIdx); verdict {
-		// If the table returns Accept, move on to the next table.
-		case chainAccept:
-			continue
-		// The Drop verdict is final.
-		case chainDrop:
-			return false
-		case chainReturn:
-			// Any Return from a built-in chain means we have to
-			// call the underflow.
-			underflow := table.Rules[table.Underflows[hook]]
-			switch v, _ := underflow.Target.Action(pkt); v {
-			case RuleAccept:
-				continue
-			case RuleDrop:
-				return false
-			case RuleJump, RuleReturn:
-				panic("Underflows should only return RuleAccept or RuleDrop.")
-			default:
-				panic(fmt.Sprintf("Unknown verdict: %d", v))
-			}
-
-		default:
-			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
-		}
-	}
-
-	// Every table returned Accept.
-	return true
-}
-
-// Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) checkChain(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) chainVerdict {
-	// Start from ruleIdx and walk the list of rules until a rule gives us
-	// a verdict.
-	for ruleIdx < len(table.Rules) {
-		switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx); verdict {
-		case RuleAccept:
-			return chainAccept
-
-		case RuleDrop:
-			return chainDrop
-
-		case RuleReturn:
-			return chainReturn
-
-		case RuleJump:
-			// "Jumping" to the next rule just means we're
-			// continuing on down the list.
-			if jumpTo == ruleIdx+1 {
-				ruleIdx++
-				continue
-			}
-			switch verdict := it.checkChain(hook, pkt, table, jumpTo); verdict {
-			case chainAccept:
-				return chainAccept
-			case chainDrop:
-				return chainDrop
-			case chainReturn:
-				ruleIdx++
-				continue
-			default:
-				panic(fmt.Sprintf("Unknown verdict: %d", verdict))
-			}
-
-		default:
-			panic(fmt.Sprintf("Unknown verdict: %d", verdict))
-		}
-
-	}
-
-	// We got through the entire table without a decision. Default to DROP
-	// for safety.
-	return chainDrop
-}
-
-// Precondition: pk.NetworkHeader is set.
-func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) (RuleVerdict, int) {
-	rule := table.Rules[ruleIdx]
-
-	// If pkt.NetworkHeader hasn't been set yet, it will be contained in
-	// pkt.Data.First().
-	if pkt.NetworkHeader == nil {
-		pkt.NetworkHeader = pkt.Data.First()
-	}
-
-	// Check whether the packet matches the IP header filter.
-	if !filterMatch(rule.Filter, header.IPv4(pkt.NetworkHeader)) {
-		// Continue on to the next rule.
-		return RuleJump, ruleIdx + 1
-	}
-
-	// Go through each rule matcher. If they all match, run
-	// the rule target.
-	for _, matcher := range rule.Matchers {
-		matches, hotdrop := matcher.Match(hook, pkt, "")
-		if hotdrop {
-			return RuleDrop, 0
-		}
-		if !matches {
-			// Continue on to the next rule.
-			return RuleJump, ruleIdx + 1
-		}
-	}
-
-	// All the matchers matched, so run the target.
-	return rule.Target.Action(pkt)
-}
-
-func filterMatch(filter IPHeaderFilter, hdr header.IPv4) bool {
-	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
-	// Check the transport protocol.
-	if filter.Protocol != 0 && filter.Protocol != hdr.TransportProtocol() {
-		return false
-	}
-
-	// Check the destination IP.
-	dest := hdr.DestinationAddress()
-	matches := true
-	for i := range filter.Dst {
-		if dest[i]&filter.DstMask[i] != filter.Dst[i] {
-			matches = false
-			break
-		}
-	}
-	if matches == filter.DstInvert {
-		return false
-	}
-
-	return true
-}
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
deleted file mode 100644
index e457f2349..000000000
--- a/pkg/tcpip/iptables/targets.go
+++ /dev/null
@@ -1,144 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package iptables
-
-import (
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-)
-
-// AcceptTarget accepts packets.
-type AcceptTarget struct{}
-
-// Action implements Target.Action.
-func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
-	return RuleAccept, 0
-}
-
-// DropTarget drops packets.
-type DropTarget struct{}
-
-// Action implements Target.Action.
-func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
-	return RuleDrop, 0
-}
-
-// ErrorTarget logs an error and drops the packet. It represents a target that
-// should be unreachable.
-type ErrorTarget struct{}
-
-// Action implements Target.Action.
-func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
-	log.Debugf("ErrorTarget triggered.")
-	return RuleDrop, 0
-}
-
-// UserChainTarget marks a rule as the beginning of a user chain.
-type UserChainTarget struct {
-	Name string
-}
-
-// Action implements Target.Action.
-func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
-	panic("UserChainTarget should never be called.")
-}
-
-// ReturnTarget returns from the current chain. If the chain is a built-in, the
-// hook's underflow should be called.
-type ReturnTarget struct{}
-
-// Action implements Target.Action.
-func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
-	return RuleReturn, 0
-}
-
-// RedirectTarget redirects the packet by modifying the destination port/IP.
-// Min and Max values for IP and Ports in the struct indicate the range of
-// values which can be used to redirect.
-type RedirectTarget struct {
-	// TODO(gvisor.dev/issue/170): Other flags need to be added after
-	// we support them.
-	// RangeProtoSpecified flag indicates single port is specified to
-	// redirect.
-	RangeProtoSpecified bool
-
-	// Min address used to redirect.
-	MinIP tcpip.Address
-
-	// Max address used to redirect.
-	MaxIP tcpip.Address
-
-	// Min port used to redirect.
-	MinPort uint16
-
-	// Max port used to redirect.
-	MaxPort uint16
-}
-
-// Action implements Target.Action.
-// TODO(gvisor.dev/issue/170): Parse headers without copying. The current
-// implementation only works for PREROUTING and calls pkt.Clone(), neither
-// of which should be the case.
-func (rt RedirectTarget) Action(pkt tcpip.PacketBuffer) (RuleVerdict, int) {
-	newPkt := pkt.Clone()
-
-	// Set network header.
-	headerView := newPkt.Data.First()
-	netHeader := header.IPv4(headerView)
-	newPkt.NetworkHeader = headerView[:header.IPv4MinimumSize]
-
-	hlen := int(netHeader.HeaderLength())
-	tlen := int(netHeader.TotalLength())
-	newPkt.Data.TrimFront(hlen)
-	newPkt.Data.CapLength(tlen - hlen)
-
-	// TODO(gvisor.dev/issue/170): Change destination address to
-	// loopback or interface address on which the packet was
-	// received.
-
-	// TODO(gvisor.dev/issue/170): Check Flags in RedirectTarget if
-	// we need to change dest address (for OUTPUT chain) or ports.
-	switch protocol := netHeader.TransportProtocol(); protocol {
-	case header.UDPProtocolNumber:
-		var udpHeader header.UDP
-		if newPkt.TransportHeader != nil {
-			udpHeader = header.UDP(newPkt.TransportHeader)
-		} else {
-			if len(pkt.Data.First()) < header.UDPMinimumSize {
-				return RuleDrop, 0
-			}
-			udpHeader = header.UDP(newPkt.Data.First())
-		}
-		udpHeader.SetDestinationPort(rt.MinPort)
-	case header.TCPProtocolNumber:
-		var tcpHeader header.TCP
-		if newPkt.TransportHeader != nil {
-			tcpHeader = header.TCP(newPkt.TransportHeader)
-		} else {
-			if len(pkt.Data.First()) < header.TCPMinimumSize {
-				return RuleDrop, 0
-			}
-			tcpHeader = header.TCP(newPkt.TransportHeader)
-		}
-		// TODO(gvisor.dev/issue/170): Need to recompute checksum
-		// and implement nat connection tracking to support TCP.
-		tcpHeader.SetDestinationPort(rt.MinPort)
-	default:
-		return RuleDrop, 0
-	}
-
-	return RuleAccept, 0
-}
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
deleted file mode 100644
index e7fcf6bff..000000000
--- a/pkg/tcpip/iptables/types.go
+++ /dev/null
@@ -1,180 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package iptables
-
-import (
-	"gvisor.dev/gvisor/pkg/tcpip"
-)
-
-// A Hook specifies one of the hooks built into the network stack.
-//
-//                      Userspace app          Userspace app
-//                            ^                      |
-//                            |                      v
-//                         [Input]               [Output]
-//                            ^                      |
-//                            |                      v
-//                            |                   routing
-//                            |                      |
-//                            |                      v
-// ----->[Prerouting]----->routing----->[Forward]---------[Postrouting]----->
-type Hook uint
-
-// These values correspond to values in include/uapi/linux/netfilter.h.
-const (
-	// Prerouting happens before a packet is routed to applications or to
-	// be forwarded.
-	Prerouting Hook = iota
-
-	// Input happens before a packet reaches an application.
-	Input
-
-	// Forward happens once it's decided that a packet should be forwarded
-	// to another host.
-	Forward
-
-	// Output happens after a packet is written by an application to be
-	// sent out.
-	Output
-
-	// Postrouting happens just before a packet goes out on the wire.
-	Postrouting
-
-	// The total number of hooks.
-	NumHooks
-)
-
-// A RuleVerdict is what a rule decides should be done with a packet.
-type RuleVerdict int
-
-const (
-	// RuleAccept indicates the packet should continue through netstack.
-	RuleAccept RuleVerdict = iota
-
-	// RuleDrop indicates the packet should be dropped.
-	RuleDrop
-
-	// RuleJump indicates the packet should jump to another chain.
-	RuleJump
-
-	// RuleReturn indicates the packet should return to the previous chain.
-	RuleReturn
-)
-
-// IPTables holds all the tables for a netstack.
-type IPTables struct {
-	// Tables maps table names to tables. User tables have arbitrary names.
-	Tables map[string]Table
-
-	// Priorities maps each hook to a list of table names. The order of the
-	// list is the order in which each table should be visited for that
-	// hook.
-	Priorities map[Hook][]string
-}
-
-// A Table defines a set of chains and hooks into the network stack. It is
-// really just a list of rules with some metadata for entrypoints and such.
-type Table struct {
-	// Rules holds the rules that make up the table.
-	Rules []Rule
-
-	// BuiltinChains maps builtin chains to their entrypoint rule in Rules.
-	BuiltinChains map[Hook]int
-
-	// Underflows maps builtin chains to their underflow rule in Rules
-	// (i.e. the rule to execute if the chain returns without a verdict).
-	Underflows map[Hook]int
-
-	// UserChains holds user-defined chains for the keyed by name. Users
-	// can give their chains arbitrary names.
-	UserChains map[string]int
-
-	// Metadata holds information about the Table that is useful to users
-	// of IPTables, but not to the netstack IPTables code itself.
-	metadata interface{}
-}
-
-// ValidHooks returns a bitmap of the builtin hooks for the given table.
-func (table *Table) ValidHooks() uint32 {
-	hooks := uint32(0)
-	for hook := range table.BuiltinChains {
-		hooks |= 1 << hook
-	}
-	return hooks
-}
-
-// Metadata returns the metadata object stored in table.
-func (table *Table) Metadata() interface{} {
-	return table.metadata
-}
-
-// SetMetadata sets the metadata object stored in table.
-func (table *Table) SetMetadata(metadata interface{}) {
-	table.metadata = metadata
-}
-
-// A Rule is a packet processing rule. It consists of two pieces. First it
-// contains zero or more matchers, each of which is a specification of which
-// packets this rule applies to. If there are no matchers in the rule, it
-// applies to any packet.
-type Rule struct {
-	// Filter holds basic IP filtering fields common to every rule.
-	Filter IPHeaderFilter
-
-	// Matchers is the list of matchers for this rule.
-	Matchers []Matcher
-
-	// Target is the action to invoke if all the matchers match the packet.
-	Target Target
-}
-
-// IPHeaderFilter holds basic IP filtering data common to every rule.
-type IPHeaderFilter struct {
-	// Protocol matches the transport protocol.
-	Protocol tcpip.TransportProtocolNumber
-
-	// Dst matches the destination IP address.
-	Dst tcpip.Address
-
-	// DstMask masks bits of the destination IP address when comparing with
-	// Dst.
-	DstMask tcpip.Address
-
-	// DstInvert inverts the meaning of the destination IP check, i.e. when
-	// true the filter will match packets that fail the destination
-	// comparison.
-	DstInvert bool
-}
-
-// A Matcher is the interface for matching packets.
-type Matcher interface {
-	// Name returns the name of the Matcher.
-	Name() string
-
-	// Match returns whether the packet matches and whether the packet
-	// should be "hotdropped", i.e. dropped immediately. This is usually
-	// used for suspicious packets.
-	//
-	// Precondition: packet.NetworkHeader is set.
-	Match(hook Hook, packet tcpip.PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
-}
-
-// A Target is the interface for taking an action for a packet.
-type Target interface {
-	// Action takes an action on the packet and returns a verdict on how
-	// traversal should (or should not) continue. If the return value is
-	// Jump, it also returns the index of the rule to jump to.
-	Action(packet tcpip.PacketBuffer) (RuleVerdict, int)
-}
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 5944ba190..a8d6653ce 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -28,7 +28,7 @@ import (
 
 // PacketInfo holds all the information about an outbound packet.
 type PacketInfo struct {
-	Pkt   tcpip.PacketBuffer
+	Pkt   stack.PacketBuffer
 	Proto tcpip.NetworkProtocolNumber
 	GSO   *stack.GSO
 	Route stack.Route
@@ -203,12 +203,12 @@ func (e *Endpoint) NumQueued() int {
 }
 
 // InjectInbound injects an inbound packet.
-func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	e.InjectLinkAddr(protocol, "", pkt)
 }
 
 // InjectLinkAddr injects an inbound packet with a remote link address.
-func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt tcpip.PacketBuffer) {
+func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt stack.PacketBuffer) {
 	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt)
 }
 
@@ -251,7 +251,7 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
 }
 
 // WritePacket stores outbound packets into the channel.
-func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	// Clone r then release its resource so we only get the relevant fields from
 	// stack.Route without holding a reference to a NIC's endpoint.
 	route := r.Clone()
@@ -269,7 +269,7 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 }
 
 // WritePackets stores outbound packets into the channel.
-func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	// Clone r then release its resource so we only get the relevant fields from
 	// stack.Route without holding a reference to a NIC's endpoint.
 	route := r.Clone()
@@ -280,7 +280,7 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 		off := pkt.DataOffset
 		size := pkt.DataSize
 		p := PacketInfo{
-			Pkt: tcpip.PacketBuffer{
+			Pkt: stack.PacketBuffer{
 				Header: pkt.Header,
 				Data:   buffer.NewViewFromBytes(payloadView[off : off+size]).ToVectorisedView(),
 			},
@@ -301,7 +301,7 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	p := PacketInfo{
-		Pkt:   tcpip.PacketBuffer{Data: vv},
+		Pkt:   stack.PacketBuffer{Data: vv},
 		Proto: 0,
 		GSO:   nil,
 	}
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index 3b36b9673..235e647ff 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -386,7 +386,7 @@ const (
 
 // WritePacket writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	if e.hdrSize > 0 {
 		// Add ethernet header if needed.
 		eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
@@ -440,7 +440,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 
 // WritePackets writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	var ethHdrBuf []byte
 	// hdr + data
 	iovLen := 2
@@ -610,7 +610,7 @@ func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
 }
 
 // InjectInbound injects an inbound packet.
-func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, pkt)
 }
 
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index 2066987eb..c7dbbbc6b 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -45,7 +45,7 @@ const (
 type packetInfo struct {
 	raddr    tcpip.LinkAddress
 	proto    tcpip.NetworkProtocolNumber
-	contents tcpip.PacketBuffer
+	contents stack.PacketBuffer
 }
 
 type context struct {
@@ -92,7 +92,7 @@ func (c *context) cleanup() {
 	syscall.Close(c.fds[1])
 }
 
-func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	c.ch <- packetInfo{remote, protocol, pkt}
 }
 
@@ -168,7 +168,7 @@ func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32) {
 			L3HdrLen:   header.IPv4MaximumHeaderSize,
 		}
 	}
-	if err := c.ep.WritePacket(r, gso, proto, tcpip.PacketBuffer{
+	if err := c.ep.WritePacket(r, gso, proto, stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	}); err != nil {
@@ -261,7 +261,7 @@ func TestPreserveSrcAddress(t *testing.T) {
 	// WritePacket panics given a prependable with anything less than
 	// the minimum size of the ethernet header.
 	hdr := buffer.NewPrependable(header.EthernetMinimumSize)
-	if err := c.ep.WritePacket(r, nil /* gso */, proto, tcpip.PacketBuffer{
+	if err := c.ep.WritePacket(r, nil /* gso */, proto, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.VectorisedView{},
 	}); err != nil {
@@ -324,7 +324,7 @@ func TestDeliverPacket(t *testing.T) {
 					want := packetInfo{
 						raddr: raddr,
 						proto: proto,
-						contents: tcpip.PacketBuffer{
+						contents: stack.PacketBuffer{
 							Data:       buffer.View(b).ToVectorisedView(),
 							LinkHeader: buffer.View(hdr),
 						},
diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go
index 62ed1e569..fe2bf3b0b 100644
--- a/pkg/tcpip/link/fdbased/mmap.go
+++ b/pkg/tcpip/link/fdbased/mmap.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
 const (
@@ -190,7 +191,7 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	pkt = pkt[d.e.hdrSize:]
-	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, tcpip.PacketBuffer{
+	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, stack.PacketBuffer{
 		Data:       buffer.View(pkt).ToVectorisedView(),
 		LinkHeader: buffer.View(eth),
 	})
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go
index c67d684ce..cb4cbea69 100644
--- a/pkg/tcpip/link/fdbased/packet_dispatchers.go
+++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go
@@ -139,7 +139,7 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	used := d.capViews(n, BufConfig)
-	pkt := tcpip.PacketBuffer{
+	pkt := stack.PacketBuffer{
 		Data:       buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...)),
 		LinkHeader: buffer.View(eth),
 	}
@@ -296,7 +296,7 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
 		}
 
 		used := d.capViews(k, int(n), BufConfig)
-		pkt := tcpip.PacketBuffer{
+		pkt := stack.PacketBuffer{
 			Data:       buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...)),
 			LinkHeader: buffer.View(eth),
 		}
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index 499cc608f..4039753b7 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -76,7 +76,7 @@ func (*endpoint) Wait() {}
 
 // WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound
 // packets to the network-layer dispatcher.
-func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
 	views[0] = pkt.Header.View()
 	views = append(views, pkt.Data.Views()...)
@@ -84,7 +84,7 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 	// Because we're immediately turning around and writing the packet back
 	// to the rx path, we intentionally don't preserve the remote and local
 	// link addresses from the stack.Route we're passed.
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, tcpip.PacketBuffer{
+	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, stack.PacketBuffer{
 		Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 	})
 
@@ -92,7 +92,7 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []stack.PacketBuffer, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
@@ -106,7 +106,7 @@ func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	// There should be an ethernet header at the beginning of vv.
 	linkHeader := header.Ethernet(vv.First()[:header.EthernetMinimumSize])
 	vv.TrimFront(len(linkHeader))
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), tcpip.PacketBuffer{
+	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), stack.PacketBuffer{
 		Data:       vv,
 		LinkHeader: buffer.View(linkHeader),
 	})
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
index 445b22c17..f5973066d 100644
--- a/pkg/tcpip/link/muxed/injectable.go
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -80,14 +80,14 @@ func (m *InjectableEndpoint) IsAttached() bool {
 }
 
 // InjectInbound implements stack.InjectableLinkEndpoint.
-func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, pkt)
 }
 
 // WritePackets writes outbound packets to the appropriate
 // LinkInjectableEndpoint based on the RemoteAddress. HandleLocal only works if
 // r.RemoteAddress has a route registered in this endpoint.
-func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	endpoint, ok := m.routes[r.RemoteAddress]
 	if !ok {
 		return 0, tcpip.ErrNoRoute
@@ -98,7 +98,7 @@ func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts [
 // WritePacket writes outbound packets to the appropriate LinkInjectableEndpoint
 // based on the RemoteAddress. HandleLocal only works if r.RemoteAddress has a
 // route registered in this endpoint.
-func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	if endpoint, ok := m.routes[r.RemoteAddress]; ok {
 		return endpoint.WritePacket(r, gso, protocol, pkt)
 	}
diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go
index 63b249837..87c734c1f 100644
--- a/pkg/tcpip/link/muxed/injectable_test.go
+++ b/pkg/tcpip/link/muxed/injectable_test.go
@@ -50,7 +50,7 @@ func TestInjectableEndpointDispatch(t *testing.T) {
 	hdr.Prepend(1)[0] = 0xFA
 	packetRoute := stack.Route{RemoteAddress: dstIP}
 
-	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.NewViewFromBytes([]byte{0xFB}).ToVectorisedView(),
 	})
@@ -70,7 +70,7 @@ func TestInjectableEndpointDispatchHdrOnly(t *testing.T) {
 	hdr := buffer.NewPrependable(1)
 	hdr.Prepend(1)[0] = 0xFA
 	packetRoute := stack.Route{RemoteAddress: dstIP}
-	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.NewView(0).ToVectorisedView(),
 	})
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 655e537c4..6461d0108 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -185,7 +185,7 @@ func (e *endpoint) LinkAddress() tcpip.LinkAddress {
 
 // WritePacket writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	// Add the ethernet header here.
 	eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
 	pkt.LinkHeader = buffer.View(eth)
@@ -214,7 +214,7 @@ func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
@@ -275,7 +275,7 @@ func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
 
 		// Send packet up the stack.
 		eth := header.Ethernet(b[:header.EthernetMinimumSize])
-		d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), tcpip.PacketBuffer{
+		d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), stack.PacketBuffer{
 			Data:       buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(),
 			LinkHeader: buffer.View(eth),
 		})
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 5c729a439..27ea3f531 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -131,7 +131,7 @@ func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress
 	return c
 }
 
-func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	c.mu.Lock()
 	c.packets = append(c.packets, packetInfo{
 		addr:  remoteLinkAddr,
@@ -273,7 +273,7 @@ func TestSimpleSend(t *testing.T) {
 			randomFill(buf)
 
 			proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
-			if err := c.ep.WritePacket(&r, nil /* gso */, proto, tcpip.PacketBuffer{
+			if err := c.ep.WritePacket(&r, nil /* gso */, proto, stack.PacketBuffer{
 				Header: hdr,
 				Data:   buf.ToVectorisedView(),
 			}); err != nil {
@@ -345,7 +345,7 @@ func TestPreserveSrcAddressInSend(t *testing.T) {
 	hdr := buffer.NewPrependable(header.EthernetMinimumSize)
 
 	proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
-	if err := c.ep.WritePacket(&r, nil /* gso */, proto, tcpip.PacketBuffer{
+	if err := c.ep.WritePacket(&r, nil /* gso */, proto, stack.PacketBuffer{
 		Header: hdr,
 	}); err != nil {
 		t.Fatalf("WritePacket failed: %v", err)
@@ -401,7 +401,7 @@ func TestFillTxQueue(t *testing.T) {
 	for i := queuePipeSize / 40; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
 
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -419,7 +419,7 @@ func TestFillTxQueue(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buf.ToVectorisedView(),
 	}); err != want {
@@ -447,7 +447,7 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 	// Send two packets so that the id slice has at least two slots.
 	for i := 2; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -470,7 +470,7 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 	ids := make(map[uint64]struct{})
 	for i := queuePipeSize / 40; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -488,7 +488,7 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buf.ToVectorisedView(),
 	}); err != want {
@@ -514,7 +514,7 @@ func TestFillTxMemory(t *testing.T) {
 	ids := make(map[uint64]struct{})
 	for i := queueDataSize / bufferSize; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -533,7 +533,7 @@ func TestFillTxMemory(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+	err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buf.ToVectorisedView(),
 	})
@@ -561,7 +561,7 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	// until there is only one buffer left.
 	for i := queueDataSize/bufferSize - 1; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -577,7 +577,7 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	{
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
 		uu := buffer.NewView(bufferSize).ToVectorisedView()
-		if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   uu,
 		}); err != want {
@@ -588,7 +588,7 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	// Attempt to write the one-buffer packet again. It must succeed.
 	{
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 3392b7edd..0a6b8945c 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -123,7 +123,7 @@ func NewWithFile(lower stack.LinkEndpoint, file *os.File, snapLen uint32) (stack
 // DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is
 // called by the link-layer endpoint being wrapped when a packet arrives, and
 // logs the packet before forwarding to the actual dispatcher.
-func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
 		logPacket("recv", protocol, pkt.Data.First(), nil)
 	}
@@ -200,7 +200,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
 	return 0
 }
 
-func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
 		logPacket("send", protocol, pkt.Header.View(), gso)
 	}
@@ -232,7 +232,7 @@ func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumb
 // WritePacket implements the stack.LinkEndpoint interface. It is called by
 // higher-level protocols to write packets; it just logs the packet and
 // forwards the request to the lower endpoint.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	e.dumpPacket(gso, protocol, pkt)
 	return e.lower.WritePacket(r, gso, protocol, pkt)
 }
@@ -240,10 +240,10 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 // WritePackets implements the stack.LinkEndpoint interface. It is called by
 // higher-level protocols to write packets; it just logs the packet and
 // forwards the request to the lower endpoint.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	view := pkts[0].Data.ToView()
 	for _, pkt := range pkts {
-		e.dumpPacket(gso, protocol, tcpip.PacketBuffer{
+		e.dumpPacket(gso, protocol, stack.PacketBuffer{
 			Header: pkt.Header,
 			Data:   view[pkt.DataOffset:][:pkt.DataSize].ToVectorisedView(),
 		})
diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
index f6e301304..617446ea2 100644
--- a/pkg/tcpip/link/tun/device.go
+++ b/pkg/tcpip/link/tun/device.go
@@ -213,7 +213,7 @@ func (d *Device) Write(data []byte) (int64, error) {
 		remote = tcpip.LinkAddress(zeroMAC[:])
 	}
 
-	pkt := tcpip.PacketBuffer{
+	pkt := stack.PacketBuffer{
 		Data: buffer.View(data).ToVectorisedView(),
 	}
 	if ethHdr != nil {
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
index a8de38979..52fe397bf 100644
--- a/pkg/tcpip/link/waitable/waitable.go
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -50,7 +50,7 @@ func New(lower stack.LinkEndpoint) *Endpoint {
 // It is called by the link-layer endpoint being wrapped when a packet arrives,
 // and only forwards to the actual dispatcher if Wait or WaitDispatch haven't
 // been called.
-func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	if !e.dispatchGate.Enter() {
 		return
 	}
@@ -99,7 +99,7 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
 // WritePacket implements stack.LinkEndpoint.WritePacket. It is called by
 // higher-level protocols to write packets. It only forwards packets to the
 // lower endpoint if Wait or WaitWrite haven't been called.
-func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	if !e.writeGate.Enter() {
 		return nil
 	}
@@ -112,7 +112,7 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 // WritePackets implements stack.LinkEndpoint.WritePackets. It is called by
 // higher-level protocols to write packets. It only forwards packets to the
 // lower endpoint if Wait or WaitWrite haven't been called.
-func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	if !e.writeGate.Enter() {
 		return len(pkts), nil
 	}
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
index 31b11a27a..88224e494 100644
--- a/pkg/tcpip/link/waitable/waitable_test.go
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -35,7 +35,7 @@ type countedEndpoint struct {
 	dispatcher stack.NetworkDispatcher
 }
 
-func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	e.dispatchCount++
 }
 
@@ -65,13 +65,13 @@ func (e *countedEndpoint) LinkAddress() tcpip.LinkAddress {
 	return e.linkAddr
 }
 
-func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	e.writeCount++
 	return nil
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	e.writeCount += len(pkts)
 	return len(pkts), nil
 }
@@ -89,21 +89,21 @@ func TestWaitWrite(t *testing.T) {
 	wep := New(ep)
 
 	// Write and check that it goes through.
-	wep.WritePacket(nil, nil /* gso */, 0, tcpip.PacketBuffer{})
+	wep.WritePacket(nil, nil /* gso */, 0, stack.PacketBuffer{})
 	if want := 1; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
 
 	// Wait on dispatches, then try to write. It must go through.
 	wep.WaitDispatch()
-	wep.WritePacket(nil, nil /* gso */, 0, tcpip.PacketBuffer{})
+	wep.WritePacket(nil, nil /* gso */, 0, stack.PacketBuffer{})
 	if want := 2; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
 
 	// Wait on writes, then try to write. It must not go through.
 	wep.WaitWrite()
-	wep.WritePacket(nil, nil /* gso */, 0, tcpip.PacketBuffer{})
+	wep.WritePacket(nil, nil /* gso */, 0, stack.PacketBuffer{})
 	if want := 2; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
@@ -120,21 +120,21 @@ func TestWaitDispatch(t *testing.T) {
 	}
 
 	// Dispatch and check that it goes through.
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{})
 	if want := 1; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on writes, then try to dispatch. It must go through.
 	wep.WaitWrite()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on dispatches, then try to dispatch. It must not go through.
 	wep.WaitDispatch()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index e9fcc89a8..255098372 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -79,20 +79,20 @@ func (e *endpoint) MaxHeaderLength() uint16 {
 
 func (e *endpoint) Close() {}
 
-func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, stack.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []stack.PacketBuffer, stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	return 0, tcpip.ErrNotSupported
 }
 
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
-func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 	v := pkt.Data.First()
 	h := header.ARP(v)
 	if !h.IsValid() {
@@ -113,7 +113,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 		copy(packet.ProtocolAddressSender(), h.ProtocolAddressTarget())
 		copy(packet.HardwareAddressTarget(), h.HardwareAddressSender())
 		copy(packet.ProtocolAddressTarget(), h.ProtocolAddressSender())
-		e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, tcpip.PacketBuffer{
+		e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 		})
 		fallthrough // also fill the cache from requests
@@ -167,7 +167,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 	copy(h.ProtocolAddressSender(), localAddr)
 	copy(h.ProtocolAddressTarget(), addr)
 
-	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, tcpip.PacketBuffer{
+	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 	})
 }
diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
index 03cf03b6d..b3e239ac7 100644
--- a/pkg/tcpip/network/arp/arp_test.go
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -103,7 +103,7 @@ func TestDirectRequest(t *testing.T) {
 
 	inject := func(addr tcpip.Address) {
 		copy(h.ProtocolAddressTarget(), addr)
-		c.linkEP.InjectInbound(arp.ProtocolNumber, tcpip.PacketBuffer{
+		c.linkEP.InjectInbound(arp.ProtocolNumber, stack.PacketBuffer{
 			Data: v.ToVectorisedView(),
 		})
 	}
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index f4d78f8c6..4950d69fc 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -96,7 +96,7 @@ func (t *testObject) checkValues(protocol tcpip.TransportProtocolNumber, vv buff
 // DeliverTransportPacket is called by network endpoints after parsing incoming
 // packets. This is used by the test object to verify that the results of the
 // parsing are expected.
-func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) {
+func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt stack.PacketBuffer) {
 	t.checkValues(protocol, pkt.Data, r.RemoteAddress, r.LocalAddress)
 	t.dataCalls++
 }
@@ -104,7 +104,7 @@ func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.Trans
 // DeliverTransportControlPacket is called by network endpoints after parsing
 // incoming control (ICMP) packets. This is used by the test object to verify
 // that the results of the parsing are expected.
-func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
 	t.checkValues(trans, pkt.Data, remote, local)
 	if typ != t.typ {
 		t.t.Errorf("typ = %v, want %v", typ, t.typ)
@@ -150,7 +150,7 @@ func (*testObject) Wait() {}
 // WritePacket is called by network endpoints after producing a packet and
 // writing it to the link endpoint. This is used by the test object to verify
 // that the produced packet is as expected.
-func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	var prot tcpip.TransportProtocolNumber
 	var srcAddr tcpip.Address
 	var dstAddr tcpip.Address
@@ -172,7 +172,7 @@ func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Ne
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
@@ -246,7 +246,7 @@ func TestIPv4Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	}); err != nil {
@@ -289,7 +289,7 @@ func TestIPv4Receive(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	ep.HandlePacket(&r, tcpip.PacketBuffer{
+	ep.HandlePacket(&r, stack.PacketBuffer{
 		Data: view.ToVectorisedView(),
 	})
 	if o.dataCalls != 1 {
@@ -379,7 +379,7 @@ func TestIPv4ReceiveControl(t *testing.T) {
 			o.extra = c.expectedExtra
 
 			vv := view[:len(view)-c.trunc].ToVectorisedView()
-			ep.HandlePacket(&r, tcpip.PacketBuffer{
+			ep.HandlePacket(&r, stack.PacketBuffer{
 				Data: vv,
 			})
 			if want := c.expectedCount; o.controlCalls != want {
@@ -444,7 +444,7 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 	}
 
 	// Send first segment.
-	ep.HandlePacket(&r, tcpip.PacketBuffer{
+	ep.HandlePacket(&r, stack.PacketBuffer{
 		Data: frag1.ToVectorisedView(),
 	})
 	if o.dataCalls != 0 {
@@ -452,7 +452,7 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 	}
 
 	// Send second segment.
-	ep.HandlePacket(&r, tcpip.PacketBuffer{
+	ep.HandlePacket(&r, stack.PacketBuffer{
 		Data: frag2.ToVectorisedView(),
 	})
 	if o.dataCalls != 1 {
@@ -487,7 +487,7 @@ func TestIPv6Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	}); err != nil {
@@ -530,7 +530,7 @@ func TestIPv6Receive(t *testing.T) {
 		t.Fatalf("could not find route: %v", err)
 	}
 
-	ep.HandlePacket(&r, tcpip.PacketBuffer{
+	ep.HandlePacket(&r, stack.PacketBuffer{
 		Data: view.ToVectorisedView(),
 	})
 	if o.dataCalls != 1 {
@@ -644,7 +644,7 @@ func TestIPv6ReceiveControl(t *testing.T) {
 			// Set ICMPv6 checksum.
 			icmp.SetChecksum(header.ICMPv6Checksum(icmp, outerSrcAddr, localIpv6Addr, buffer.VectorisedView{}))
 
-			ep.HandlePacket(&r, tcpip.PacketBuffer{
+			ep.HandlePacket(&r, stack.PacketBuffer{
 				Data: view[:len(view)-c.trunc].ToVectorisedView(),
 			})
 			if want := c.expectedCount; o.controlCalls != want {
diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD
index 0fef2b1f1..880ea7de2 100644
--- a/pkg/tcpip/network/ipv4/BUILD
+++ b/pkg/tcpip/network/ipv4/BUILD
@@ -13,7 +13,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/network/fragmentation",
         "//pkg/tcpip/network/hash",
         "//pkg/tcpip/stack",
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
index 32bf39e43..c4bf1ba5c 100644
--- a/pkg/tcpip/network/ipv4/icmp.go
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -15,7 +15,6 @@
 package ipv4
 
 import (
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -25,7 +24,7 @@ import (
 // the original packet that caused the ICMP one to be sent. This information is
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
-func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
 	h := header.IPv4(pkt.Data.First())
 
 	// We don't use IsValid() here because ICMP only requires that the IP
@@ -53,7 +52,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.
 	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
-func (e *endpoint) handleICMP(r *stack.Route, pkt tcpip.PacketBuffer) {
+func (e *endpoint) handleICMP(r *stack.Route, pkt stack.PacketBuffer) {
 	stats := r.Stats()
 	received := stats.ICMP.V4PacketsReceived
 	v := pkt.Data.First()
@@ -85,7 +84,7 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt tcpip.PacketBuffer) {
 
 		// It's possible that a raw socket expects to receive this.
 		h.SetChecksum(wantChecksum)
-		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, tcpip.PacketBuffer{
+		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, stack.PacketBuffer{
 			Data:          pkt.Data.Clone(nil),
 			NetworkHeader: append(buffer.View(nil), pkt.NetworkHeader...),
 		})
@@ -99,7 +98,7 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt tcpip.PacketBuffer) {
 		pkt.SetChecksum(0)
 		pkt.SetChecksum(^header.Checksum(pkt, header.ChecksumVV(vv, 0)))
 		sent := stats.ICMP.V4PacketsSent
-		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
 			Header:          hdr,
 			Data:            vv,
 			TransportHeader: buffer.View(pkt),
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 4f1742938..b3ee6000e 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -26,7 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
 	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -125,7 +124,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
 // packet's stated length matches the length of the header+payload. mtu
 // includes the IP header and options. This does not support the DontFragment
 // IP flag.
-func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt stack.PacketBuffer) *tcpip.Error {
 	// This packet is too big, it needs to be fragmented.
 	ip := header.IPv4(pkt.Header.View())
 	flags := ip.Flags()
@@ -165,7 +164,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int,
 		if i > 0 {
 			newPayload := pkt.Data.Clone(nil)
 			newPayload.CapLength(innerMTU)
-			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, tcpip.PacketBuffer{
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, stack.PacketBuffer{
 				Header:        pkt.Header,
 				Data:          newPayload,
 				NetworkHeader: buffer.View(h),
@@ -184,7 +183,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int,
 			newPayload := pkt.Data.Clone(nil)
 			newPayloadLength := outerMTU - pkt.Header.UsedLength()
 			newPayload.CapLength(newPayloadLength)
-			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, tcpip.PacketBuffer{
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, stack.PacketBuffer{
 				Header:        pkt.Header,
 				Data:          newPayload,
 				NetworkHeader: buffer.View(h),
@@ -198,7 +197,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int,
 			startOfHdr := pkt.Header
 			startOfHdr.TrimBack(pkt.Header.UsedLength() - outerMTU)
 			emptyVV := buffer.NewVectorisedView(0, []buffer.View{})
-			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, tcpip.PacketBuffer{
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, stack.PacketBuffer{
 				Header:        startOfHdr,
 				Data:          emptyVV,
 				NetworkHeader: buffer.View(h),
@@ -241,7 +240,7 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt stack.PacketBuffer) *tcpip.Error {
 	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
 	pkt.NetworkHeader = buffer.View(ip)
 
@@ -253,7 +252,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 		views = append(views, pkt.Data.Views()...)
 		loopedR := r.MakeLoopedRoute()
 
-		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
+		e.HandlePacket(&loopedR, stack.PacketBuffer{
 			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 		})
 
@@ -273,7 +272,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 }
 
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	if r.Loop&stack.PacketLoop != 0 {
 		panic("multiple packets in local loop")
 	}
@@ -292,7 +291,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
 	// The packet already has an IP header, but there are a few required
 	// checks.
 	ip := header.IPv4(pkt.Data.First())
@@ -344,7 +343,7 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuf
 
 // HandlePacket is called by the link layer when new ipv4 packets arrive for
 // this endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 	headerView := pkt.Data.First()
 	h := header.IPv4(headerView)
 	if !h.IsValid(pkt.Data.Size()) {
@@ -361,7 +360,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 	// iptables filtering. All packets that reach here are intended for
 	// this machine and will not be forwarded.
 	ipt := e.stack.IPTables()
-	if ok := ipt.Check(iptables.Input, pkt); !ok {
+	if ok := ipt.Check(stack.Input, pkt); !ok {
 		// iptables is telling us to drop the packet.
 		return
 	}
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index e900f1b45..5a864d832 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -113,7 +113,7 @@ func makeHdrAndPayload(hdrLength int, extraLength int, viewSizes []int) (buffer.
 
 // comparePayloads compared the contents of all the packets against the contents
 // of the source packet.
-func compareFragments(t *testing.T, packets []tcpip.PacketBuffer, sourcePacketInfo tcpip.PacketBuffer, mtu uint32) {
+func compareFragments(t *testing.T, packets []stack.PacketBuffer, sourcePacketInfo stack.PacketBuffer, mtu uint32) {
 	t.Helper()
 	// Make a complete array of the sourcePacketInfo packet.
 	source := header.IPv4(packets[0].Header.View()[:header.IPv4MinimumSize])
@@ -173,7 +173,7 @@ func compareFragments(t *testing.T, packets []tcpip.PacketBuffer, sourcePacketIn
 
 type errorChannel struct {
 	*channel.Endpoint
-	Ch                    chan tcpip.PacketBuffer
+	Ch                    chan stack.PacketBuffer
 	packetCollectorErrors []*tcpip.Error
 }
 
@@ -183,7 +183,7 @@ type errorChannel struct {
 func newErrorChannel(size int, mtu uint32, linkAddr tcpip.LinkAddress, packetCollectorErrors []*tcpip.Error) *errorChannel {
 	return &errorChannel{
 		Endpoint:              channel.New(size, mtu, linkAddr),
-		Ch:                    make(chan tcpip.PacketBuffer, size),
+		Ch:                    make(chan stack.PacketBuffer, size),
 		packetCollectorErrors: packetCollectorErrors,
 	}
 }
@@ -202,7 +202,7 @@ func (e *errorChannel) Drain() int {
 }
 
 // WritePacket stores outbound packets into the channel.
-func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	select {
 	case e.Ch <- pkt:
 	default:
@@ -281,13 +281,13 @@ func TestFragmentation(t *testing.T) {
 	for _, ft := range fragTests {
 		t.Run(ft.description, func(t *testing.T) {
 			hdr, payload := makeHdrAndPayload(ft.hdrLength, ft.extraLength, ft.payloadViewsSizes)
-			source := tcpip.PacketBuffer{
+			source := stack.PacketBuffer{
 				Header: hdr,
 				// Save the source payload because WritePacket will modify it.
 				Data: payload.Clone(nil),
 			}
 			c := buildContext(t, nil, ft.mtu)
-			err := c.Route.WritePacket(ft.gso, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+			err := c.Route.WritePacket(ft.gso, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 				Header: hdr,
 				Data:   payload,
 			})
@@ -295,7 +295,7 @@ func TestFragmentation(t *testing.T) {
 				t.Errorf("err got %v, want %v", err, nil)
 			}
 
-			var results []tcpip.PacketBuffer
+			var results []stack.PacketBuffer
 		L:
 			for {
 				select {
@@ -337,7 +337,7 @@ func TestFragmentationErrors(t *testing.T) {
 		t.Run(ft.description, func(t *testing.T) {
 			hdr, payload := makeHdrAndPayload(ft.hdrLength, header.IPv4MinimumSize, ft.payloadViewsSizes)
 			c := buildContext(t, ft.packetCollectorErrors, ft.mtu)
-			err := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+			err := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 				Header: hdr,
 				Data:   payload,
 			})
@@ -459,7 +459,7 @@ func TestInvalidFragments(t *testing.T) {
 			s.CreateNIC(nicID, sniffer.New(ep))
 
 			for _, pkt := range tc.packets {
-				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, tcpip.PacketBuffer{
+				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, stack.PacketBuffer{
 					Data: buffer.NewVectorisedView(len(pkt), []buffer.View{pkt}),
 				})
 			}
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 45dc757c7..8640feffc 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -27,7 +27,7 @@ import (
 // the original packet that caused the ICMP one to be sent. This information is
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
-func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
 	h := header.IPv6(pkt.Data.First())
 
 	// We don't use IsValid() here because ICMP only requires that up to
@@ -62,7 +62,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.
 	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
-func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.PacketBuffer) {
+func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.PacketBuffer) {
 	stats := r.Stats().ICMP
 	sent := stats.V6PacketsSent
 	received := stats.V6PacketsReceived
@@ -243,7 +243,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		//
 		// The IP Hop Limit field has a value of 255, i.e., the packet
 		// could not possibly have been forwarded by a router.
-		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 			Header: hdr,
 		}); err != nil {
 			sent.Dropped.Increment()
@@ -330,7 +330,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		copy(packet, h)
 		packet.SetType(header.ICMPv6EchoReply)
 		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data))
-		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
 			Header: hdr,
 			Data:   pkt.Data,
 		}); err != nil {
@@ -463,7 +463,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 	})
 
 	// TODO(stijlist): count this in ICMP stats.
-	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, tcpip.PacketBuffer{
+	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 	})
 }
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 50c4b6474..bae09ed94 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -56,7 +56,7 @@ func (*stubLinkEndpoint) LinkAddress() tcpip.LinkAddress {
 	return ""
 }
 
-func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, tcpip.PacketBuffer) *tcpip.Error {
+func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, stack.PacketBuffer) *tcpip.Error {
 	return nil
 }
 
@@ -66,7 +66,7 @@ type stubDispatcher struct {
 	stack.TransportDispatcher
 }
 
-func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, tcpip.PacketBuffer) {
+func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, stack.PacketBuffer) {
 }
 
 type stubLinkAddressCache struct {
@@ -187,7 +187,7 @@ func TestICMPCounts(t *testing.T) {
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
 		})
-		ep.HandlePacket(&r, tcpip.PacketBuffer{
+		ep.HandlePacket(&r, stack.PacketBuffer{
 			Data: hdr.View().ToVectorisedView(),
 		})
 	}
@@ -326,7 +326,7 @@ func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.
 		views := []buffer.View{pi.Pkt.Header.View(), pi.Pkt.Data.ToView()}
 		size := pi.Pkt.Header.UsedLength() + pi.Pkt.Data.Size()
 		vv := buffer.NewVectorisedView(size, views)
-		args.dst.InjectLinkAddr(pi.Proto, args.dst.LinkAddress(), tcpip.PacketBuffer{
+		args.dst.InjectLinkAddr(pi.Proto, args.dst.LinkAddress(), stack.PacketBuffer{
 			Data: vv,
 		})
 	}
@@ -561,7 +561,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+				e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 					Data: hdr.View().ToVectorisedView(),
 				})
 			}
@@ -738,7 +738,7 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+				e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 					Data: hdr.View().ToVectorisedView(),
 				})
 			}
@@ -916,7 +916,7 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+				e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 					Data: buffer.NewVectorisedView(header.IPv6MinimumSize+size+payloadSize, []buffer.View{hdr.View(), payload}),
 				})
 			}
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 9aef5234b..29e597002 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -112,7 +112,7 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt stack.PacketBuffer) *tcpip.Error {
 	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
 	pkt.NetworkHeader = buffer.View(ip)
 
@@ -124,7 +124,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 		views = append(views, pkt.Data.Views()...)
 		loopedR := r.MakeLoopedRoute()
 
-		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
+		e.HandlePacket(&loopedR, stack.PacketBuffer{
 			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 		})
 
@@ -139,7 +139,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	if r.Loop&stack.PacketLoop != 0 {
 		panic("not implemented")
 	}
@@ -161,14 +161,14 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 
 // WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
 // supported by IPv6.
-func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
 	// TODO(b/146666412): Support IPv6 header-included packets.
 	return tcpip.ErrNotSupported
 }
 
 // HandlePacket is called by the link layer when new ipv6 packets arrive for
 // this endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 	headerView := pkt.Data.First()
 	h := header.IPv6(headerView)
 	if !h.IsValid(pkt.Data.Size()) {
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index 1cbfa7278..ed98ef22a 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -55,7 +55,7 @@ func testReceiveICMP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 		DstAddr:       dst,
 	})
 
-	e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+	e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 		Data: hdr.View().ToVectorisedView(),
 	})
 
@@ -113,7 +113,7 @@ func testReceiveUDP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 		DstAddr:       dst,
 	})
 
-	e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+	e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 		Data: hdr.View().ToVectorisedView(),
 	})
 
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index c9395de52..f924ed9e1 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -135,7 +135,7 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 
-			e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+			e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -238,7 +238,7 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 
-			e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+			e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -304,7 +304,7 @@ func TestHopLimitValidation(t *testing.T) {
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
 		})
-		ep.HandlePacket(r, tcpip.PacketBuffer{
+		ep.HandlePacket(r, stack.PacketBuffer{
 			Data: hdr.View().ToVectorisedView(),
 		})
 	}
@@ -588,7 +588,7 @@ func TestRouterAdvertValidation(t *testing.T) {
 				t.Fatalf("got rxRA = %d, want = 0", got)
 			}
 
-			e.InjectInbound(header.IPv6ProtocolNumber, tcpip.PacketBuffer{
+			e.InjectInbound(header.IPv6ProtocolNumber, stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
diff --git a/pkg/tcpip/packet_buffer.go b/pkg/tcpip/packet_buffer.go
deleted file mode 100644
index ab24372e7..000000000
--- a/pkg/tcpip/packet_buffer.go
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at //
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package tcpip
-
-import "gvisor.dev/gvisor/pkg/tcpip/buffer"
-
-// A PacketBuffer contains all the data of a network packet.
-//
-// As a PacketBuffer traverses up the stack, it may be necessary to pass it to
-// multiple endpoints. Clone() should be called in such cases so that
-// modifications to the Data field do not affect other copies.
-//
-// +stateify savable
-type PacketBuffer struct {
-	// Data holds the payload of the packet. For inbound packets, it also
-	// holds the headers, which are consumed as the packet moves up the
-	// stack. Headers are guaranteed not to be split across views.
-	//
-	// The bytes backing Data are immutable, but Data itself may be trimmed
-	// or otherwise modified.
-	Data buffer.VectorisedView
-
-	// DataOffset is used for GSO output. It is the offset into the Data
-	// field where the payload of this packet starts.
-	DataOffset int
-
-	// DataSize is used for GSO output. It is the size of this packet's
-	// payload.
-	DataSize int
-
-	// Header holds the headers of outbound packets. As a packet is passed
-	// down the stack, each layer adds to Header.
-	Header buffer.Prependable
-
-	// These fields are used by both inbound and outbound packets. They
-	// typically overlap with the Data and Header fields.
-	//
-	// The bytes backing these views are immutable. Each field may be nil
-	// if either it has not been set yet or no such header exists (e.g.
-	// packets sent via loopback may not have a link header).
-	//
-	// These fields may be Views into other slices (either Data or Header).
-	// SR dosen't support this, so deep copies are necessary in some cases.
-	LinkHeader      buffer.View
-	NetworkHeader   buffer.View
-	TransportHeader buffer.View
-}
-
-// Clone makes a copy of pk. It clones the Data field, which creates a new
-// VectorisedView but does not deep copy the underlying bytes.
-//
-// Clone also does not deep copy any of its other fields.
-func (pk PacketBuffer) Clone() PacketBuffer {
-	pk.Data = pk.Data.Clone(nil)
-	return pk
-}
diff --git a/pkg/tcpip/packet_buffer_state.go b/pkg/tcpip/packet_buffer_state.go
deleted file mode 100644
index ad3cc24fa..000000000
--- a/pkg/tcpip/packet_buffer_state.go
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package tcpip
-
-import "gvisor.dev/gvisor/pkg/tcpip/buffer"
-
-// beforeSave is invoked by stateify.
-func (pk *PacketBuffer) beforeSave() {
-	// Non-Data fields may be slices of the Data field. This causes
-	// problems for SR, so during save we make each header independent.
-	pk.Header = pk.Header.DeepCopy()
-	pk.LinkHeader = append(buffer.View(nil), pk.LinkHeader...)
-	pk.NetworkHeader = append(buffer.View(nil), pk.NetworkHeader...)
-	pk.TransportHeader = append(buffer.View(nil), pk.TransportHeader...)
-}
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 6c029b2fb..7a43a1d4e 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -21,10 +21,15 @@ go_library(
         "dhcpv6configurationfromndpra_string.go",
         "forwarder.go",
         "icmp_rate_limit.go",
+        "iptables.go",
+        "iptables_targets.go",
+        "iptables_types.go",
         "linkaddrcache.go",
         "linkaddrentry_list.go",
         "ndp.go",
         "nic.go",
+        "packet_buffer.go",
+        "packet_buffer_state.go",
         "registration.go",
         "route.go",
         "stack.go",
@@ -34,6 +39,7 @@ go_library(
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/ilist",
+        "//pkg/log",
         "//pkg/rand",
         "//pkg/sleep",
         "//pkg/sync",
@@ -41,7 +47,6 @@ go_library(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/hash/jenkins",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/ports",
         "//pkg/tcpip/seqnum",
         "//pkg/waiter",
@@ -65,7 +70,6 @@ go_test(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/checker",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/loopback",
         "//pkg/tcpip/network/ipv4",
diff --git a/pkg/tcpip/stack/forwarder.go b/pkg/tcpip/stack/forwarder.go
index 631953935..6b64cd37f 100644
--- a/pkg/tcpip/stack/forwarder.go
+++ b/pkg/tcpip/stack/forwarder.go
@@ -32,7 +32,7 @@ type pendingPacket struct {
 	nic   *NIC
 	route *Route
 	proto tcpip.NetworkProtocolNumber
-	pkt   tcpip.PacketBuffer
+	pkt   PacketBuffer
 }
 
 type forwardQueue struct {
@@ -50,7 +50,7 @@ func newForwardQueue() *forwardQueue {
 	return &forwardQueue{packets: make(map[<-chan struct{}][]*pendingPacket)}
 }
 
-func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
 	shouldWait := false
 
 	f.Lock()
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
index 321b7524d..c45c43d21 100644
--- a/pkg/tcpip/stack/forwarder_test.go
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -68,7 +68,7 @@ func (f *fwdTestNetworkEndpoint) ID() *NetworkEndpointID {
 	return &f.id
 }
 
-func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt tcpip.PacketBuffer) {
+func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt PacketBuffer) {
 	// Consume the network header.
 	b := pkt.Data.First()
 	pkt.Data.TrimFront(fwdTestNetHeaderLen)
@@ -89,7 +89,7 @@ func (f *fwdTestNetworkEndpoint) Capabilities() LinkEndpointCapabilities {
 	return f.ep.Capabilities()
 }
 
-func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt PacketBuffer) *tcpip.Error {
 	// Add the protocol's header to the packet and send it to the link
 	// endpoint.
 	b := pkt.Header.Prepend(fwdTestNetHeaderLen)
@@ -101,11 +101,11 @@ func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkH
 }
 
 // WritePackets implements LinkEndpoint.WritePackets.
-func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) {
+func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
-func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
@@ -183,7 +183,7 @@ func (f *fwdTestNetworkProtocol) LinkAddressProtocol() tcpip.NetworkProtocolNumb
 type fwdTestPacketInfo struct {
 	RemoteLinkAddress tcpip.LinkAddress
 	LocalLinkAddress  tcpip.LinkAddress
-	Pkt               tcpip.PacketBuffer
+	Pkt               PacketBuffer
 }
 
 type fwdTestLinkEndpoint struct {
@@ -196,12 +196,12 @@ type fwdTestLinkEndpoint struct {
 }
 
 // InjectInbound injects an inbound packet.
-func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
 	e.InjectLinkAddr(protocol, "", pkt)
 }
 
 // InjectLinkAddr injects an inbound packet with a remote link address.
-func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt tcpip.PacketBuffer) {
+func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt PacketBuffer) {
 	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt)
 }
 
@@ -244,7 +244,7 @@ func (e *fwdTestLinkEndpoint) LinkAddress() tcpip.LinkAddress {
 	return e.linkAddr
 }
 
-func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) *tcpip.Error {
 	p := fwdTestPacketInfo{
 		RemoteLinkAddress: r.RemoteLinkAddress,
 		LocalLinkAddress:  r.LocalLinkAddress,
@@ -260,7 +260,7 @@ func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.Netw
 }
 
 // WritePackets stores outbound packets into the channel.
-func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	n := 0
 	for _, pkt := range pkts {
 		e.WritePacket(r, gso, protocol, pkt)
@@ -273,7 +273,7 @@ func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts []tcpip.Pack
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	p := fwdTestPacketInfo{
-		Pkt: tcpip.PacketBuffer{Data: vv},
+		Pkt: PacketBuffer{Data: vv},
 	}
 
 	select {
@@ -355,7 +355,7 @@ func TestForwardingWithStaticResolver(t *testing.T) {
 	// forwarded to NIC 2.
 	buf := buffer.NewView(30)
 	buf[0] = 3
-	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -392,7 +392,7 @@ func TestForwardingWithFakeResolver(t *testing.T) {
 	// forwarded to NIC 2.
 	buf := buffer.NewView(30)
 	buf[0] = 3
-	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -423,7 +423,7 @@ func TestForwardingWithNoResolver(t *testing.T) {
 	// forwarded to NIC 2.
 	buf := buffer.NewView(30)
 	buf[0] = 3
-	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -453,7 +453,7 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
 	// not be forwarded.
 	buf := buffer.NewView(30)
 	buf[0] = 4
-	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -461,7 +461,7 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
 	// forwarded to NIC 2.
 	buf = buffer.NewView(30)
 	buf[0] = 3
-	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -503,7 +503,7 @@ func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
 	for i := 0; i < 2; i++ {
 		buf := buffer.NewView(30)
 		buf[0] = 3
-		ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+		ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 			Data: buf.ToVectorisedView(),
 		})
 	}
@@ -550,7 +550,7 @@ func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
 		buf[0] = 3
 		// Set the packet sequence number.
 		binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], uint16(i))
-		ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+		ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 			Data: buf.ToVectorisedView(),
 		})
 	}
@@ -603,7 +603,7 @@ func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
 		// maxPendingResolutions + 7).
 		buf := buffer.NewView(30)
 		buf[0] = byte(3 + i)
-		ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+		ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 			Data: buf.ToVectorisedView(),
 		})
 	}
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
new file mode 100644
index 000000000..37907ae24
--- /dev/null
+++ b/pkg/tcpip/stack/iptables.go
@@ -0,0 +1,311 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// Table names.
+const (
+	TablenameNat    = "nat"
+	TablenameMangle = "mangle"
+	TablenameFilter = "filter"
+)
+
+// Chain names as defined by net/ipv4/netfilter/ip_tables.c.
+const (
+	ChainNamePrerouting  = "PREROUTING"
+	ChainNameInput       = "INPUT"
+	ChainNameForward     = "FORWARD"
+	ChainNameOutput      = "OUTPUT"
+	ChainNamePostrouting = "POSTROUTING"
+)
+
+// HookUnset indicates that there is no hook set for an entrypoint or
+// underflow.
+const HookUnset = -1
+
+// DefaultTables returns a default set of tables. Each chain is set to accept
+// all packets.
+func DefaultTables() IPTables {
+	// TODO(gvisor.dev/issue/170): We may be able to swap out some strings for
+	// iotas.
+	return IPTables{
+		Tables: map[string]Table{
+			TablenameNat: Table{
+				Rules: []Rule{
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: ErrorTarget{}},
+				},
+				BuiltinChains: map[Hook]int{
+					Prerouting:  0,
+					Input:       1,
+					Output:      2,
+					Postrouting: 3,
+				},
+				Underflows: map[Hook]int{
+					Prerouting:  0,
+					Input:       1,
+					Output:      2,
+					Postrouting: 3,
+				},
+				UserChains: map[string]int{},
+			},
+			TablenameMangle: Table{
+				Rules: []Rule{
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: ErrorTarget{}},
+				},
+				BuiltinChains: map[Hook]int{
+					Prerouting: 0,
+					Output:     1,
+				},
+				Underflows: map[Hook]int{
+					Prerouting: 0,
+					Output:     1,
+				},
+				UserChains: map[string]int{},
+			},
+			TablenameFilter: Table{
+				Rules: []Rule{
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: ErrorTarget{}},
+				},
+				BuiltinChains: map[Hook]int{
+					Input:   0,
+					Forward: 1,
+					Output:  2,
+				},
+				Underflows: map[Hook]int{
+					Input:   0,
+					Forward: 1,
+					Output:  2,
+				},
+				UserChains: map[string]int{},
+			},
+		},
+		Priorities: map[Hook][]string{
+			Input:      []string{TablenameNat, TablenameFilter},
+			Prerouting: []string{TablenameMangle, TablenameNat},
+			Output:     []string{TablenameMangle, TablenameNat, TablenameFilter},
+		},
+	}
+}
+
+// EmptyFilterTable returns a Table with no rules and the filter table chains
+// mapped to HookUnset.
+func EmptyFilterTable() Table {
+	return Table{
+		Rules: []Rule{},
+		BuiltinChains: map[Hook]int{
+			Input:   HookUnset,
+			Forward: HookUnset,
+			Output:  HookUnset,
+		},
+		Underflows: map[Hook]int{
+			Input:   HookUnset,
+			Forward: HookUnset,
+			Output:  HookUnset,
+		},
+		UserChains: map[string]int{},
+	}
+}
+
+// EmptyNatTable returns a Table with no rules and the filter table chains
+// mapped to HookUnset.
+func EmptyNatTable() Table {
+	return Table{
+		Rules: []Rule{},
+		BuiltinChains: map[Hook]int{
+			Prerouting:  HookUnset,
+			Input:       HookUnset,
+			Output:      HookUnset,
+			Postrouting: HookUnset,
+		},
+		Underflows: map[Hook]int{
+			Prerouting:  HookUnset,
+			Input:       HookUnset,
+			Output:      HookUnset,
+			Postrouting: HookUnset,
+		},
+		UserChains: map[string]int{},
+	}
+}
+
+// A chainVerdict is what a table decides should be done with a packet.
+type chainVerdict int
+
+const (
+	// chainAccept indicates the packet should continue through netstack.
+	chainAccept chainVerdict = iota
+
+	// chainAccept indicates the packet should be dropped.
+	chainDrop
+
+	// chainReturn indicates the packet should return to the calling chain
+	// or the underflow rule of a builtin chain.
+	chainReturn
+)
+
+// Check runs pkt through the rules for hook. It returns true when the packet
+// should continue traversing the network stack and false when it should be
+// dropped.
+//
+// Precondition: pkt.NetworkHeader is set.
+func (it *IPTables) Check(hook Hook, pkt PacketBuffer) bool {
+	// Go through each table containing the hook.
+	for _, tablename := range it.Priorities[hook] {
+		table := it.Tables[tablename]
+		ruleIdx := table.BuiltinChains[hook]
+		switch verdict := it.checkChain(hook, pkt, table, ruleIdx); verdict {
+		// If the table returns Accept, move on to the next table.
+		case chainAccept:
+			continue
+		// The Drop verdict is final.
+		case chainDrop:
+			return false
+		case chainReturn:
+			// Any Return from a built-in chain means we have to
+			// call the underflow.
+			underflow := table.Rules[table.Underflows[hook]]
+			switch v, _ := underflow.Target.Action(pkt); v {
+			case RuleAccept:
+				continue
+			case RuleDrop:
+				return false
+			case RuleJump, RuleReturn:
+				panic("Underflows should only return RuleAccept or RuleDrop.")
+			default:
+				panic(fmt.Sprintf("Unknown verdict: %d", v))
+			}
+
+		default:
+			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
+		}
+	}
+
+	// Every table returned Accept.
+	return true
+}
+
+// Precondition: pkt.NetworkHeader is set.
+func (it *IPTables) checkChain(hook Hook, pkt PacketBuffer, table Table, ruleIdx int) chainVerdict {
+	// Start from ruleIdx and walk the list of rules until a rule gives us
+	// a verdict.
+	for ruleIdx < len(table.Rules) {
+		switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx); verdict {
+		case RuleAccept:
+			return chainAccept
+
+		case RuleDrop:
+			return chainDrop
+
+		case RuleReturn:
+			return chainReturn
+
+		case RuleJump:
+			// "Jumping" to the next rule just means we're
+			// continuing on down the list.
+			if jumpTo == ruleIdx+1 {
+				ruleIdx++
+				continue
+			}
+			switch verdict := it.checkChain(hook, pkt, table, jumpTo); verdict {
+			case chainAccept:
+				return chainAccept
+			case chainDrop:
+				return chainDrop
+			case chainReturn:
+				ruleIdx++
+				continue
+			default:
+				panic(fmt.Sprintf("Unknown verdict: %d", verdict))
+			}
+
+		default:
+			panic(fmt.Sprintf("Unknown verdict: %d", verdict))
+		}
+
+	}
+
+	// We got through the entire table without a decision. Default to DROP
+	// for safety.
+	return chainDrop
+}
+
+// Precondition: pk.NetworkHeader is set.
+func (it *IPTables) checkRule(hook Hook, pkt PacketBuffer, table Table, ruleIdx int) (RuleVerdict, int) {
+	rule := table.Rules[ruleIdx]
+
+	// If pkt.NetworkHeader hasn't been set yet, it will be contained in
+	// pkt.Data.First().
+	if pkt.NetworkHeader == nil {
+		pkt.NetworkHeader = pkt.Data.First()
+	}
+
+	// Check whether the packet matches the IP header filter.
+	if !filterMatch(rule.Filter, header.IPv4(pkt.NetworkHeader)) {
+		// Continue on to the next rule.
+		return RuleJump, ruleIdx + 1
+	}
+
+	// Go through each rule matcher. If they all match, run
+	// the rule target.
+	for _, matcher := range rule.Matchers {
+		matches, hotdrop := matcher.Match(hook, pkt, "")
+		if hotdrop {
+			return RuleDrop, 0
+		}
+		if !matches {
+			// Continue on to the next rule.
+			return RuleJump, ruleIdx + 1
+		}
+	}
+
+	// All the matchers matched, so run the target.
+	return rule.Target.Action(pkt)
+}
+
+func filterMatch(filter IPHeaderFilter, hdr header.IPv4) bool {
+	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
+	// Check the transport protocol.
+	if filter.Protocol != 0 && filter.Protocol != hdr.TransportProtocol() {
+		return false
+	}
+
+	// Check the destination IP.
+	dest := hdr.DestinationAddress()
+	matches := true
+	for i := range filter.Dst {
+		if dest[i]&filter.DstMask[i] != filter.Dst[i] {
+			matches = false
+			break
+		}
+	}
+	if matches == filter.DstInvert {
+		return false
+	}
+
+	return true
+}
diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go
new file mode 100644
index 000000000..7b4543caf
--- /dev/null
+++ b/pkg/tcpip/stack/iptables_targets.go
@@ -0,0 +1,144 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// AcceptTarget accepts packets.
+type AcceptTarget struct{}
+
+// Action implements Target.Action.
+func (AcceptTarget) Action(packet PacketBuffer) (RuleVerdict, int) {
+	return RuleAccept, 0
+}
+
+// DropTarget drops packets.
+type DropTarget struct{}
+
+// Action implements Target.Action.
+func (DropTarget) Action(packet PacketBuffer) (RuleVerdict, int) {
+	return RuleDrop, 0
+}
+
+// ErrorTarget logs an error and drops the packet. It represents a target that
+// should be unreachable.
+type ErrorTarget struct{}
+
+// Action implements Target.Action.
+func (ErrorTarget) Action(packet PacketBuffer) (RuleVerdict, int) {
+	log.Debugf("ErrorTarget triggered.")
+	return RuleDrop, 0
+}
+
+// UserChainTarget marks a rule as the beginning of a user chain.
+type UserChainTarget struct {
+	Name string
+}
+
+// Action implements Target.Action.
+func (UserChainTarget) Action(PacketBuffer) (RuleVerdict, int) {
+	panic("UserChainTarget should never be called.")
+}
+
+// ReturnTarget returns from the current chain. If the chain is a built-in, the
+// hook's underflow should be called.
+type ReturnTarget struct{}
+
+// Action implements Target.Action.
+func (ReturnTarget) Action(PacketBuffer) (RuleVerdict, int) {
+	return RuleReturn, 0
+}
+
+// RedirectTarget redirects the packet by modifying the destination port/IP.
+// Min and Max values for IP and Ports in the struct indicate the range of
+// values which can be used to redirect.
+type RedirectTarget struct {
+	// TODO(gvisor.dev/issue/170): Other flags need to be added after
+	// we support them.
+	// RangeProtoSpecified flag indicates single port is specified to
+	// redirect.
+	RangeProtoSpecified bool
+
+	// Min address used to redirect.
+	MinIP tcpip.Address
+
+	// Max address used to redirect.
+	MaxIP tcpip.Address
+
+	// Min port used to redirect.
+	MinPort uint16
+
+	// Max port used to redirect.
+	MaxPort uint16
+}
+
+// Action implements Target.Action.
+// TODO(gvisor.dev/issue/170): Parse headers without copying. The current
+// implementation only works for PREROUTING and calls pkt.Clone(), neither
+// of which should be the case.
+func (rt RedirectTarget) Action(pkt PacketBuffer) (RuleVerdict, int) {
+	newPkt := pkt.Clone()
+
+	// Set network header.
+	headerView := newPkt.Data.First()
+	netHeader := header.IPv4(headerView)
+	newPkt.NetworkHeader = headerView[:header.IPv4MinimumSize]
+
+	hlen := int(netHeader.HeaderLength())
+	tlen := int(netHeader.TotalLength())
+	newPkt.Data.TrimFront(hlen)
+	newPkt.Data.CapLength(tlen - hlen)
+
+	// TODO(gvisor.dev/issue/170): Change destination address to
+	// loopback or interface address on which the packet was
+	// received.
+
+	// TODO(gvisor.dev/issue/170): Check Flags in RedirectTarget if
+	// we need to change dest address (for OUTPUT chain) or ports.
+	switch protocol := netHeader.TransportProtocol(); protocol {
+	case header.UDPProtocolNumber:
+		var udpHeader header.UDP
+		if newPkt.TransportHeader != nil {
+			udpHeader = header.UDP(newPkt.TransportHeader)
+		} else {
+			if len(pkt.Data.First()) < header.UDPMinimumSize {
+				return RuleDrop, 0
+			}
+			udpHeader = header.UDP(newPkt.Data.First())
+		}
+		udpHeader.SetDestinationPort(rt.MinPort)
+	case header.TCPProtocolNumber:
+		var tcpHeader header.TCP
+		if newPkt.TransportHeader != nil {
+			tcpHeader = header.TCP(newPkt.TransportHeader)
+		} else {
+			if len(pkt.Data.First()) < header.TCPMinimumSize {
+				return RuleDrop, 0
+			}
+			tcpHeader = header.TCP(newPkt.TransportHeader)
+		}
+		// TODO(gvisor.dev/issue/170): Need to recompute checksum
+		// and implement nat connection tracking to support TCP.
+		tcpHeader.SetDestinationPort(rt.MinPort)
+	default:
+		return RuleDrop, 0
+	}
+
+	return RuleAccept, 0
+}
diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go
new file mode 100644
index 000000000..2ffb55f2a
--- /dev/null
+++ b/pkg/tcpip/stack/iptables_types.go
@@ -0,0 +1,180 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+// A Hook specifies one of the hooks built into the network stack.
+//
+//                      Userspace app          Userspace app
+//                            ^                      |
+//                            |                      v
+//                         [Input]               [Output]
+//                            ^                      |
+//                            |                      v
+//                            |                   routing
+//                            |                      |
+//                            |                      v
+// ----->[Prerouting]----->routing----->[Forward]---------[Postrouting]----->
+type Hook uint
+
+// These values correspond to values in include/uapi/linux/netfilter.h.
+const (
+	// Prerouting happens before a packet is routed to applications or to
+	// be forwarded.
+	Prerouting Hook = iota
+
+	// Input happens before a packet reaches an application.
+	Input
+
+	// Forward happens once it's decided that a packet should be forwarded
+	// to another host.
+	Forward
+
+	// Output happens after a packet is written by an application to be
+	// sent out.
+	Output
+
+	// Postrouting happens just before a packet goes out on the wire.
+	Postrouting
+
+	// The total number of hooks.
+	NumHooks
+)
+
+// A RuleVerdict is what a rule decides should be done with a packet.
+type RuleVerdict int
+
+const (
+	// RuleAccept indicates the packet should continue through netstack.
+	RuleAccept RuleVerdict = iota
+
+	// RuleDrop indicates the packet should be dropped.
+	RuleDrop
+
+	// RuleJump indicates the packet should jump to another chain.
+	RuleJump
+
+	// RuleReturn indicates the packet should return to the previous chain.
+	RuleReturn
+)
+
+// IPTables holds all the tables for a netstack.
+type IPTables struct {
+	// Tables maps table names to tables. User tables have arbitrary names.
+	Tables map[string]Table
+
+	// Priorities maps each hook to a list of table names. The order of the
+	// list is the order in which each table should be visited for that
+	// hook.
+	Priorities map[Hook][]string
+}
+
+// A Table defines a set of chains and hooks into the network stack. It is
+// really just a list of rules with some metadata for entrypoints and such.
+type Table struct {
+	// Rules holds the rules that make up the table.
+	Rules []Rule
+
+	// BuiltinChains maps builtin chains to their entrypoint rule in Rules.
+	BuiltinChains map[Hook]int
+
+	// Underflows maps builtin chains to their underflow rule in Rules
+	// (i.e. the rule to execute if the chain returns without a verdict).
+	Underflows map[Hook]int
+
+	// UserChains holds user-defined chains for the keyed by name. Users
+	// can give their chains arbitrary names.
+	UserChains map[string]int
+
+	// Metadata holds information about the Table that is useful to users
+	// of IPTables, but not to the netstack IPTables code itself.
+	metadata interface{}
+}
+
+// ValidHooks returns a bitmap of the builtin hooks for the given table.
+func (table *Table) ValidHooks() uint32 {
+	hooks := uint32(0)
+	for hook := range table.BuiltinChains {
+		hooks |= 1 << hook
+	}
+	return hooks
+}
+
+// Metadata returns the metadata object stored in table.
+func (table *Table) Metadata() interface{} {
+	return table.metadata
+}
+
+// SetMetadata sets the metadata object stored in table.
+func (table *Table) SetMetadata(metadata interface{}) {
+	table.metadata = metadata
+}
+
+// A Rule is a packet processing rule. It consists of two pieces. First it
+// contains zero or more matchers, each of which is a specification of which
+// packets this rule applies to. If there are no matchers in the rule, it
+// applies to any packet.
+type Rule struct {
+	// Filter holds basic IP filtering fields common to every rule.
+	Filter IPHeaderFilter
+
+	// Matchers is the list of matchers for this rule.
+	Matchers []Matcher
+
+	// Target is the action to invoke if all the matchers match the packet.
+	Target Target
+}
+
+// IPHeaderFilter holds basic IP filtering data common to every rule.
+type IPHeaderFilter struct {
+	// Protocol matches the transport protocol.
+	Protocol tcpip.TransportProtocolNumber
+
+	// Dst matches the destination IP address.
+	Dst tcpip.Address
+
+	// DstMask masks bits of the destination IP address when comparing with
+	// Dst.
+	DstMask tcpip.Address
+
+	// DstInvert inverts the meaning of the destination IP check, i.e. when
+	// true the filter will match packets that fail the destination
+	// comparison.
+	DstInvert bool
+}
+
+// A Matcher is the interface for matching packets.
+type Matcher interface {
+	// Name returns the name of the Matcher.
+	Name() string
+
+	// Match returns whether the packet matches and whether the packet
+	// should be "hotdropped", i.e. dropped immediately. This is usually
+	// used for suspicious packets.
+	//
+	// Precondition: packet.NetworkHeader is set.
+	Match(hook Hook, packet PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
+}
+
+// A Target is the interface for taking an action for a packet.
+type Target interface {
+	// Action takes an action on the packet and returns a verdict on how
+	// traversal should (or should not) continue. If the return value is
+	// Jump, it also returns the index of the rule to jump to.
+	Action(packet PacketBuffer) (RuleVerdict, int)
+}
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index d689a006d..630fdefc5 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -564,7 +564,7 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error {
 			Protocol: header.ICMPv6ProtocolNumber,
 			TTL:      header.NDPHopLimit,
 			TOS:      DefaultTOS,
-		}, tcpip.PacketBuffer{Header: hdr},
+		}, PacketBuffer{Header: hdr},
 	); err != nil {
 		sent.Dropped.Increment()
 		return err
@@ -1283,7 +1283,7 @@ func (ndp *ndpState) startSolicitingRouters() {
 				Protocol: header.ICMPv6ProtocolNumber,
 				TTL:      header.NDPHopLimit,
 				TOS:      DefaultTOS,
-			}, tcpip.PacketBuffer{Header: hdr},
+			}, PacketBuffer{Header: hdr},
 		); err != nil {
 			sent.Dropped.Increment()
 			log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.nic.ID(), err)
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 4368c236c..06edd05b6 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -602,7 +602,7 @@ func TestDADFail(t *testing.T) {
 			// Receive a packet to simulate multiple nodes owning or
 			// attempting to own the same address.
 			hdr := test.makeBuf(addr1)
-			e.InjectInbound(header.IPv6ProtocolNumber, tcpip.PacketBuffer{
+			e.InjectInbound(header.IPv6ProtocolNumber, stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -918,7 +918,7 @@ func TestSetNDPConfigurations(t *testing.T) {
 
 // raBufWithOptsAndDHCPv6 returns a valid NDP Router Advertisement with options
 // and DHCPv6 configurations specified.
-func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) tcpip.PacketBuffer {
+func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) stack.PacketBuffer {
 	icmpSize := header.ICMPv6HeaderSize + header.NDPRAMinimumSize + int(optSer.Length())
 	hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
 	pkt := header.ICMPv6(hdr.Prepend(icmpSize))
@@ -953,14 +953,14 @@ func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherCo
 		DstAddr:       header.IPv6AllNodesMulticastAddress,
 	})
 
-	return tcpip.PacketBuffer{Data: hdr.View().ToVectorisedView()}
+	return stack.PacketBuffer{Data: hdr.View().ToVectorisedView()}
 }
 
 // raBufWithOpts returns a valid NDP Router Advertisement with options.
 //
 // Note, raBufWithOpts does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) tcpip.PacketBuffer {
+func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) stack.PacketBuffer {
 	return raBufWithOptsAndDHCPv6(ip, rl, false, false, optSer)
 }
 
@@ -969,7 +969,7 @@ func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializ
 //
 // Note, raBufWithDHCPv6 does not populate any of the RA fields other than the
 // DHCPv6 related ones.
-func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bool) tcpip.PacketBuffer {
+func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bool) stack.PacketBuffer {
 	return raBufWithOptsAndDHCPv6(ip, 0, managedAddresses, otherConfiguratiosns, header.NDPOptionsSerializer{})
 }
 
@@ -977,7 +977,7 @@ func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bo
 //
 // Note, raBuf does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer {
+func raBuf(ip tcpip.Address, rl uint16) stack.PacketBuffer {
 	return raBufWithOpts(ip, rl, header.NDPOptionsSerializer{})
 }
 
@@ -986,7 +986,7 @@ func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer {
 //
 // Note, raBufWithPI does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) tcpip.PacketBuffer {
+func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) stack.PacketBuffer {
 	flags := uint8(0)
 	if onLink {
 		// The OnLink flag is the 7th bit in the flags byte.
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 9dcb1d52c..b6fa647ea 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -26,7 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 )
 
 var ipv4BroadcastAddr = tcpip.ProtocolAddress{
@@ -1144,7 +1143,7 @@ func (n *NIC) isInGroup(addr tcpip.Address) bool {
 	return joins != 0
 }
 
-func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt tcpip.PacketBuffer) {
+func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt PacketBuffer) {
 	r := makeRoute(protocol, dst, src, localLinkAddr, ref, false /* handleLocal */, false /* multicastLoop */)
 	r.RemoteLinkAddress = remotelinkAddr
 
@@ -1158,7 +1157,7 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address,
 // Note that the ownership of the slice backing vv is retained by the caller.
 // This rule applies only to the slice itself, not to the items of the slice;
 // the ownership of the items is not retained by the caller.
-func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
 	n.mu.RLock()
 	enabled := n.mu.enabled
 	// If the NIC is not yet enabled, don't receive any packets.
@@ -1222,7 +1221,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	// TODO(gvisor.dev/issue/170): Not supporting iptables for IPv6 yet.
 	if protocol == header.IPv4ProtocolNumber {
 		ipt := n.stack.IPTables()
-		if ok := ipt.Check(iptables.Prerouting, pkt); !ok {
+		if ok := ipt.Check(Prerouting, pkt); !ok {
 			// iptables is telling us to drop the packet.
 			return
 		}
@@ -1287,7 +1286,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	}
 }
 
-func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
 	// TODO(b/143425874) Decrease the TTL field in forwarded packets.
 
 	firstData := pkt.Data.First()
@@ -1318,7 +1317,7 @@ func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt
 
 // DeliverTransportPacket delivers the packets to the appropriate transport
 // protocol endpoint.
-func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) {
+func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer) {
 	state, ok := n.stack.transportProtocols[protocol]
 	if !ok {
 		n.stack.stats.UnknownProtocolRcvdPackets.Increment()
@@ -1364,7 +1363,7 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 
 // DeliverTransportControlPacket delivers control packets to the appropriate
 // transport protocol endpoint.
-func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt PacketBuffer) {
 	state, ok := n.stack.transportProtocols[trans]
 	if !ok {
 		return
diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go
index edaee3b86..d672fc157 100644
--- a/pkg/tcpip/stack/nic_test.go
+++ b/pkg/tcpip/stack/nic_test.go
@@ -17,7 +17,6 @@ package stack
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 )
 
@@ -45,7 +44,7 @@ func TestDisabledRxStatsWhenNICDisabled(t *testing.T) {
 		t.FailNow()
 	}
 
-	nic.DeliverNetworkPacket(nil, "", "", 0, tcpip.PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()})
+	nic.DeliverNetworkPacket(nil, "", "", 0, PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()})
 
 	if got := nic.stats.DisabledRx.Packets.Value(); got != 1 {
 		t.Errorf("got DisabledRx.Packets = %d, want = 1", got)
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
new file mode 100644
index 000000000..1850fa8c3
--- /dev/null
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at //
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package stack
+
+import "gvisor.dev/gvisor/pkg/tcpip/buffer"
+
+// A PacketBuffer contains all the data of a network packet.
+//
+// As a PacketBuffer traverses up the stack, it may be necessary to pass it to
+// multiple endpoints. Clone() should be called in such cases so that
+// modifications to the Data field do not affect other copies.
+//
+// +stateify savable
+type PacketBuffer struct {
+	// Data holds the payload of the packet. For inbound packets, it also
+	// holds the headers, which are consumed as the packet moves up the
+	// stack. Headers are guaranteed not to be split across views.
+	//
+	// The bytes backing Data are immutable, but Data itself may be trimmed
+	// or otherwise modified.
+	Data buffer.VectorisedView
+
+	// DataOffset is used for GSO output. It is the offset into the Data
+	// field where the payload of this packet starts.
+	DataOffset int
+
+	// DataSize is used for GSO output. It is the size of this packet's
+	// payload.
+	DataSize int
+
+	// Header holds the headers of outbound packets. As a packet is passed
+	// down the stack, each layer adds to Header.
+	Header buffer.Prependable
+
+	// These fields are used by both inbound and outbound packets. They
+	// typically overlap with the Data and Header fields.
+	//
+	// The bytes backing these views are immutable. Each field may be nil
+	// if either it has not been set yet or no such header exists (e.g.
+	// packets sent via loopback may not have a link header).
+	//
+	// These fields may be Views into other slices (either Data or Header).
+	// SR dosen't support this, so deep copies are necessary in some cases.
+	LinkHeader      buffer.View
+	NetworkHeader   buffer.View
+	TransportHeader buffer.View
+}
+
+// Clone makes a copy of pk. It clones the Data field, which creates a new
+// VectorisedView but does not deep copy the underlying bytes.
+//
+// Clone also does not deep copy any of its other fields.
+func (pk PacketBuffer) Clone() PacketBuffer {
+	pk.Data = pk.Data.Clone(nil)
+	return pk
+}
diff --git a/pkg/tcpip/stack/packet_buffer_state.go b/pkg/tcpip/stack/packet_buffer_state.go
new file mode 100644
index 000000000..76602549e
--- /dev/null
+++ b/pkg/tcpip/stack/packet_buffer_state.go
@@ -0,0 +1,26 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package stack
+
+import "gvisor.dev/gvisor/pkg/tcpip/buffer"
+
+// beforeSave is invoked by stateify.
+func (pk *PacketBuffer) beforeSave() {
+	// Non-Data fields may be slices of the Data field. This causes
+	// problems for SR, so during save we make each header independent.
+	pk.Header = pk.Header.DeepCopy()
+	pk.LinkHeader = append(buffer.View(nil), pk.LinkHeader...)
+	pk.NetworkHeader = append(buffer.View(nil), pk.NetworkHeader...)
+	pk.TransportHeader = append(buffer.View(nil), pk.TransportHeader...)
+}
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index fa28b46b1..ac043b722 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -67,12 +67,12 @@ type TransportEndpoint interface {
 	// this transport endpoint. It sets pkt.TransportHeader.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer)
+	HandlePacket(r *Route, id TransportEndpointID, pkt PacketBuffer)
 
 	// HandleControlPacket is called by the stack when new control (e.g.
 	// ICMP) packets arrive to this transport endpoint.
 	// HandleControlPacket takes ownership of pkt.
-	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt tcpip.PacketBuffer)
+	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt PacketBuffer)
 
 	// Abort initiates an expedited endpoint teardown. It puts the endpoint
 	// in a closed state and frees all resources associated with it. This
@@ -100,7 +100,7 @@ type RawTransportEndpoint interface {
 	// layer up.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(r *Route, pkt tcpip.PacketBuffer)
+	HandlePacket(r *Route, pkt PacketBuffer)
 }
 
 // PacketEndpoint is the interface that needs to be implemented by packet
@@ -118,7 +118,7 @@ type PacketEndpoint interface {
 	// should construct its own ethernet header for applications.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
+	HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt PacketBuffer)
 }
 
 // TransportProtocol is the interface that needs to be implemented by transport
@@ -150,7 +150,7 @@ type TransportProtocol interface {
 	// stats purposes only).
 	//
 	// HandleUnknownDestinationPacket takes ownership of pkt.
-	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) bool
+	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt PacketBuffer) bool
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
@@ -180,7 +180,7 @@ type TransportDispatcher interface {
 	// pkt.NetworkHeader must be set before calling DeliverTransportPacket.
 	//
 	// DeliverTransportPacket takes ownership of pkt.
-	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer)
+	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer)
 
 	// DeliverTransportControlPacket delivers control packets to the
 	// appropriate transport protocol endpoint.
@@ -189,7 +189,7 @@ type TransportDispatcher interface {
 	// DeliverTransportControlPacket.
 	//
 	// DeliverTransportControlPacket takes ownership of pkt.
-	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer)
+	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt PacketBuffer)
 }
 
 // PacketLooping specifies where an outbound packet should be sent.
@@ -242,15 +242,15 @@ type NetworkEndpoint interface {
 	// WritePacket writes a packet to the given destination address and
 	// protocol. It sets pkt.NetworkHeader. pkt.TransportHeader must have
 	// already been set.
-	WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error
+	WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt PacketBuffer) *tcpip.Error
 
 	// WritePackets writes packets to the given destination address and
 	// protocol. pkts must not be zero length.
-	WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error)
+	WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error)
 
 	// WriteHeaderIncludedPacket writes a packet that includes a network
 	// header to the given destination address.
-	WriteHeaderIncludedPacket(r *Route, pkt tcpip.PacketBuffer) *tcpip.Error
+	WriteHeaderIncludedPacket(r *Route, pkt PacketBuffer) *tcpip.Error
 
 	// ID returns the network protocol endpoint ID.
 	ID() *NetworkEndpointID
@@ -265,7 +265,7 @@ type NetworkEndpoint interface {
 	// this network endpoint. It sets pkt.NetworkHeader.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(r *Route, pkt tcpip.PacketBuffer)
+	HandlePacket(r *Route, pkt PacketBuffer)
 
 	// Close is called when the endpoint is reomved from a stack.
 	Close()
@@ -322,7 +322,7 @@ type NetworkDispatcher interface {
 	// packets sent via loopback), and won't have the field set.
 	//
 	// DeliverNetworkPacket takes ownership of pkt.
-	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
+	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer)
 }
 
 // LinkEndpointCapabilities is the type associated with the capabilities
@@ -354,7 +354,7 @@ const (
 // LinkEndpoint is the interface implemented by data link layer protocols (e.g.,
 // ethernet, loopback, raw) and used by network layer protocols to send packets
 // out through the implementer's data link endpoint. When a link header exists,
-// it sets each tcpip.PacketBuffer's LinkHeader field before passing it up the
+// it sets each PacketBuffer's LinkHeader field before passing it up the
 // stack.
 type LinkEndpoint interface {
 	// MTU is the maximum transmission unit for this endpoint. This is
@@ -385,7 +385,7 @@ type LinkEndpoint interface {
 	// To participate in transparent bridging, a LinkEndpoint implementation
 	// should call eth.Encode with header.EthernetFields.SrcAddr set to
 	// r.LocalLinkAddress if it is provided.
-	WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error
+	WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) *tcpip.Error
 
 	// WritePackets writes packets with the given protocol through the
 	// given route. pkts must not be zero length.
@@ -393,7 +393,7 @@ type LinkEndpoint interface {
 	// Right now, WritePackets is used only when the software segmentation
 	// offload is enabled. If it will be used for something else, it may
 	// require to change syscall filters.
-	WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
+	WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
 
 	// WriteRawPacket writes a packet directly to the link. The packet
 	// should already have an ethernet header.
@@ -426,7 +426,7 @@ type InjectableLinkEndpoint interface {
 	LinkEndpoint
 
 	// InjectInbound injects an inbound packet.
-	InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
+	InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer)
 
 	// InjectOutbound writes a fully formed outbound packet directly to the
 	// link.
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index f565aafb2..9fbe8a411 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -153,7 +153,7 @@ func (r *Route) IsResolutionRequired() bool {
 }
 
 // WritePacket writes the packet through the given route.
-func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt PacketBuffer) *tcpip.Error {
 	if !r.ref.isValidForOutgoing() {
 		return tcpip.ErrInvalidEndpointState
 	}
@@ -169,7 +169,7 @@ func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt tcpip.Pack
 }
 
 // WritePackets writes the set of packets through the given route.
-func (r *Route) WritePackets(gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) {
+func (r *Route) WritePackets(gso *GSO, pkts []PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) {
 	if !r.ref.isValidForOutgoing() {
 		return 0, tcpip.ErrInvalidEndpointState
 	}
@@ -190,7 +190,7 @@ func (r *Route) WritePackets(gso *GSO, pkts []tcpip.PacketBuffer, params Network
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
-func (r *Route) WriteHeaderIncludedPacket(pkt tcpip.PacketBuffer) *tcpip.Error {
+func (r *Route) WriteHeaderIncludedPacket(pkt PacketBuffer) *tcpip.Error {
 	if !r.ref.isValidForOutgoing() {
 		return tcpip.ErrInvalidEndpointState
 	}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 6f423874a..a9584d636 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -31,7 +31,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -51,7 +50,7 @@ const (
 
 type transportProtocolState struct {
 	proto          TransportProtocol
-	defaultHandler func(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) bool
+	defaultHandler func(r *Route, id TransportEndpointID, pkt PacketBuffer) bool
 }
 
 // TCPProbeFunc is the expected function type for a TCP probe function to be
@@ -428,7 +427,7 @@ type Stack struct {
 
 	// tables are the iptables packet filtering and manipulation rules. The are
 	// protected by tablesMu.`
-	tables iptables.IPTables
+	tables IPTables
 
 	// resumableEndpoints is a list of endpoints that need to be resumed if the
 	// stack is being restored.
@@ -738,7 +737,7 @@ func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber,
 //
 // It must be called only during initialization of the stack. Changing it as the
 // stack is operating is not supported.
-func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, tcpip.PacketBuffer) bool) {
+func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, PacketBuffer) bool) {
 	state := s.transportProtocols[p]
 	if state != nil {
 		state.defaultHandler = h
@@ -1701,7 +1700,7 @@ func (s *Stack) IsInGroup(nicID tcpip.NICID, multicastAddr tcpip.Address) (bool,
 }
 
 // IPTables returns the stack's iptables.
-func (s *Stack) IPTables() iptables.IPTables {
+func (s *Stack) IPTables() IPTables {
 	s.tablesMu.RLock()
 	t := s.tables
 	s.tablesMu.RUnlock()
@@ -1709,7 +1708,7 @@ func (s *Stack) IPTables() iptables.IPTables {
 }
 
 // SetIPTables sets the stack's iptables.
-func (s *Stack) SetIPTables(ipt iptables.IPTables) {
+func (s *Stack) SetIPTables(ipt IPTables) {
 	s.tablesMu.Lock()
 	s.tables = ipt
 	s.tablesMu.Unlock()
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 9836b340f..555fcd92f 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -90,7 +90,7 @@ func (f *fakeNetworkEndpoint) ID() *stack.NetworkEndpointID {
 	return &f.id
 }
 
-func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 	// Increment the received packet count in the protocol descriptor.
 	f.proto.packetCount[int(f.id.LocalAddress[0])%len(f.proto.packetCount)]++
 
@@ -126,7 +126,7 @@ func (f *fakeNetworkEndpoint) Capabilities() stack.LinkEndpointCapabilities {
 	return f.ep.Capabilities()
 }
 
-func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt stack.PacketBuffer) *tcpip.Error {
 	// Increment the sent packet count in the protocol descriptor.
 	f.proto.sendPacketCount[int(r.RemoteAddress[0])%len(f.proto.sendPacketCount)]++
 
@@ -141,7 +141,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
 		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
 		views[0] = pkt.Header.View()
 		views = append(views, pkt.Data.Views()...)
-		f.HandlePacket(r, tcpip.PacketBuffer{
+		f.HandlePacket(r, stack.PacketBuffer{
 			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 		})
 	}
@@ -153,11 +153,11 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
-func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
@@ -287,7 +287,7 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet with wrong address is not delivered.
 	buf[0] = 3
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 0 {
@@ -299,7 +299,7 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet is delivered to first endpoint.
 	buf[0] = 1
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -311,7 +311,7 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet is delivered to second endpoint.
 	buf[0] = 2
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -322,7 +322,7 @@ func TestNetworkReceive(t *testing.T) {
 	}
 
 	// Make sure packet is not delivered if protocol number is wrong.
-	ep.InjectInbound(fakeNetNumber-1, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber-1, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -334,7 +334,7 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet that is too small is dropped.
 	buf.CapLength(2)
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -356,7 +356,7 @@ func sendTo(s *stack.Stack, addr tcpip.Address, payload buffer.View) *tcpip.Erro
 
 func send(r stack.Route, payload buffer.View) *tcpip.Error {
 	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()))
-	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	})
@@ -414,7 +414,7 @@ func testFailingRecv(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte b
 
 func testRecvInternal(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte byte, ep *channel.Endpoint, buf buffer.View, want int) {
 	t.Helper()
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if got := fakeNet.PacketCount(localAddrByte); got != want {
@@ -2257,7 +2257,7 @@ func TestNICStats(t *testing.T) {
 
 	// Send a packet to address 1.
 	buf := buffer.NewView(30)
-	ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep1.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if got, want := s.NICInfo()[1].Stats.Rx.Packets.Value(), uint64(1); got != want {
@@ -2339,7 +2339,7 @@ func TestNICForwarding(t *testing.T) {
 			// Send a packet to dstAddr.
 			buf := buffer.NewView(30)
 			buf[0] = dstAddr[0]
-			ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+			ep1.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 				Data: buf.ToVectorisedView(),
 			})
 
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index d4c0359e8..c55e3e8bc 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -85,7 +85,7 @@ func (epsByNic *endpointsByNic) transportEndpoints() []TransportEndpoint {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, pkt PacketBuffer) {
 	epsByNic.mu.RLock()
 
 	mpep, ok := epsByNic.endpoints[r.ref.nic.ID()]
@@ -116,7 +116,7 @@ func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, p
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt PacketBuffer) {
 	epsByNic.mu.RLock()
 	defer epsByNic.mu.RUnlock()
 
@@ -184,7 +184,7 @@ type transportDemuxer struct {
 // the dispatcher to delivery packets to the QueuePacket method instead of
 // calling HandlePacket directly on the endpoint.
 type queuedTransportProtocol interface {
-	QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt tcpip.PacketBuffer)
+	QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt PacketBuffer)
 }
 
 func newTransportDemuxer(stack *Stack) *transportDemuxer {
@@ -312,7 +312,7 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32
 	return mpep.endpoints[idx]
 }
 
-func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt PacketBuffer) {
 	ep.mu.RLock()
 	queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}]
 	// HandlePacket takes ownership of pkt, so each endpoint needs
@@ -403,7 +403,7 @@ func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolN
 // deliverPacket attempts to find one or more matching transport endpoints, and
 // then, if matches are found, delivers the packet to them. Returns true if
 // the packet no longer needs to be handled.
-func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
 	if !ok {
 		return false
@@ -453,7 +453,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 
 // deliverRawPacket attempts to deliver the given packet and returns whether it
 // was delivered successfully.
-func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) bool {
+func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer) bool {
 	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
 	if !ok {
 		return false
@@ -477,7 +477,7 @@ func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportPr
 
 // deliverControlPacket attempts to deliver the given control packet. Returns
 // true if it found an endpoint, false otherwise.
-func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt PacketBuffer, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{net, trans}]
 	if !ok {
 		return false
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 0e3e239c5..84311bcc8 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -150,7 +150,7 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NI
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 }
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 5d1da2f8b..8ca9ac3cf 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -19,7 +19,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -87,7 +86,7 @@ func (f *fakeTransportEndpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions
 	if err != nil {
 		return 0, nil, err
 	}
-	if err := f.route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	if err := f.route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.View(v).ToVectorisedView(),
 	}); err != nil {
@@ -214,7 +213,7 @@ func (*fakeTransportEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Erro
 	return tcpip.FullAddress{}, nil
 }
 
-func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ tcpip.PacketBuffer) {
+func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ stack.PacketBuffer) {
 	// Increment the number of received packets.
 	f.proto.packetCount++
 	if f.acceptQueue != nil {
@@ -231,7 +230,7 @@ func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportE
 	}
 }
 
-func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, tcpip.PacketBuffer) {
+func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, stack.PacketBuffer) {
 	// Increment the number of received control packets.
 	f.proto.controlCount++
 }
@@ -242,8 +241,8 @@ func (f *fakeTransportEndpoint) State() uint32 {
 
 func (f *fakeTransportEndpoint) ModerateRecvBuf(copied int) {}
 
-func (f *fakeTransportEndpoint) IPTables() (iptables.IPTables, error) {
-	return iptables.IPTables{}, nil
+func (f *fakeTransportEndpoint) IPTables() (stack.IPTables, error) {
+	return stack.IPTables{}, nil
 }
 
 func (f *fakeTransportEndpoint) Resume(*stack.Stack) {}
@@ -288,7 +287,7 @@ func (*fakeTransportProtocol) ParsePorts(buffer.View) (src, dst uint16, err *tcp
 	return 0, 0, nil
 }
 
-func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
+func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, stack.PacketBuffer) bool {
 	return true
 }
 
@@ -368,7 +367,7 @@ func TestTransportReceive(t *testing.T) {
 	// Make sure packet with wrong protocol is not delivered.
 	buf[0] = 1
 	buf[2] = 0
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.packetCount != 0 {
@@ -379,7 +378,7 @@ func TestTransportReceive(t *testing.T) {
 	buf[0] = 1
 	buf[1] = 3
 	buf[2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.packetCount != 0 {
@@ -390,7 +389,7 @@ func TestTransportReceive(t *testing.T) {
 	buf[0] = 1
 	buf[1] = 2
 	buf[2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.packetCount != 1 {
@@ -445,7 +444,7 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 0
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = 0
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.controlCount != 0 {
@@ -456,7 +455,7 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 3
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.controlCount != 0 {
@@ -467,7 +466,7 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 2
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.controlCount != 1 {
@@ -622,7 +621,7 @@ func TestTransportForwarding(t *testing.T) {
 	req[0] = 1
 	req[1] = 3
 	req[2] = byte(fakeTransNumber)
-	ep2.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep2.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: req.ToVectorisedView(),
 	})
 
diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD
index ac18ec5b1..9ce625c17 100644
--- a/pkg/tcpip/transport/icmp/BUILD
+++ b/pkg/tcpip/transport/icmp/BUILD
@@ -31,7 +31,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
         "//pkg/tcpip/transport/tcp",
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 2a396e9bc..613b12ead 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -19,7 +19,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -135,7 +134,7 @@ func (e *endpoint) Close() {
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
 // IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (iptables.IPTables, error) {
+func (e *endpoint) IPTables() (stack.IPTables, error) {
 	return e.stack.IPTables(), nil
 }
 
@@ -441,7 +440,7 @@ func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 		Header:          hdr,
 		Data:            data.ToVectorisedView(),
 		TransportHeader: buffer.View(icmpv4),
@@ -471,7 +470,7 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 		Header:          hdr,
 		Data:            dataVV,
 		TransportHeader: buffer.View(icmpv6),
@@ -733,7 +732,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
 	// Only accept echo replies.
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
@@ -795,7 +794,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
 }
 
 // State implements tcpip.Endpoint.State. The ICMP endpoint currently doesn't
diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go
index 113d92901..3c47692b2 100644
--- a/pkg/tcpip/transport/icmp/protocol.go
+++ b/pkg/tcpip/transport/icmp/protocol.go
@@ -104,7 +104,7 @@ func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error)
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
+func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, stack.PacketBuffer) bool {
 	return true
 }
 
diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD
index d22de6b26..b989b1209 100644
--- a/pkg/tcpip/transport/packet/BUILD
+++ b/pkg/tcpip/transport/packet/BUILD
@@ -31,7 +31,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/waiter",
     ],
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 09a1cd436..df49d0995 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -29,7 +29,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -100,8 +99,8 @@ func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumb
 }
 
 // Abort implements stack.TransportEndpoint.Abort.
-func (e *endpoint) Abort() {
-	e.Close()
+func (ep *endpoint) Abort() {
+	ep.Close()
 }
 
 // Close implements tcpip.Endpoint.Close.
@@ -134,7 +133,7 @@ func (ep *endpoint) Close() {
 func (ep *endpoint) ModerateRecvBuf(copied int) {}
 
 // IPTables implements tcpip.Endpoint.IPTables.
-func (ep *endpoint) IPTables() (iptables.IPTables, error) {
+func (ep *endpoint) IPTables() (stack.IPTables, error) {
 	return ep.stack.IPTables(), nil
 }
 
@@ -299,7 +298,7 @@ func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // HandlePacket implements stack.PacketEndpoint.HandlePacket.
-func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	ep.rcvMu.Lock()
 
 	// Drop the packet if our buffer is currently full.
diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD
index c9baf4600..2eab09088 100644
--- a/pkg/tcpip/transport/raw/BUILD
+++ b/pkg/tcpip/transport/raw/BUILD
@@ -32,7 +32,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/packet",
         "//pkg/waiter",
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 2ef5fac76..536dafd1e 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -30,7 +30,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -161,7 +160,7 @@ func (e *endpoint) Close() {
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
 // IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (iptables.IPTables, error) {
+func (e *endpoint) IPTables() (stack.IPTables, error) {
 	return e.stack.IPTables(), nil
 }
 
@@ -342,7 +341,7 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64,
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
 		if !e.associated {
-			if err := route.WriteHeaderIncludedPacket(tcpip.PacketBuffer{
+			if err := route.WriteHeaderIncludedPacket(stack.PacketBuffer{
 				Data: buffer.View(payloadBytes).ToVectorisedView(),
 			}); err != nil {
 				return 0, nil, err
@@ -350,7 +349,7 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64,
 			break
 		}
 		hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength()))
-		if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buffer.View(payloadBytes).ToVectorisedView(),
 		}); err != nil {
@@ -574,7 +573,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // HandlePacket implements stack.RawTransportEndpoint.HandlePacket.
-func (e *endpoint) HandlePacket(route *stack.Route, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(route *stack.Route, pkt stack.PacketBuffer) {
 	e.rcvMu.Lock()
 
 	// Drop the packet if our buffer is currently full.
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 2fdf6c0a5..7f94f9646 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -66,7 +66,6 @@ go_library(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/hash/jenkins",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/ports",
         "//pkg/tcpip/seqnum",
         "//pkg/tcpip/stack",
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 53193afc6..79552fc61 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -705,7 +705,7 @@ func (e *endpoint) sendTCP(r *stack.Route, id stack.TransportEndpointID, data bu
 	return nil
 }
 
-func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, pkt *tcpip.PacketBuffer, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) {
+func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) {
 	optLen := len(opts)
 	hdr := &pkt.Header
 	packetSize := pkt.DataSize
@@ -752,7 +752,7 @@ func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.Vect
 	// Allocate one big slice for all the headers.
 	hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen
 	buf := make([]byte, n*hdrSize)
-	pkts := make([]tcpip.PacketBuffer, n)
+	pkts := make([]stack.PacketBuffer, n)
 	for i := range pkts {
 		pkts[i].Header = buffer.NewEmptyPrependableFromView(buf[i*hdrSize:][:hdrSize])
 	}
@@ -795,7 +795,7 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise
 		return sendTCPBatch(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso)
 	}
 
-	pkt := tcpip.PacketBuffer{
+	pkt := stack.PacketBuffer{
 		Header:     buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen),
 		DataOffset: 0,
 		DataSize:   data.Size(),
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
index 90ac956a9..6062ca916 100644
--- a/pkg/tcpip/transport/tcp/dispatcher.go
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -18,7 +18,6 @@ import (
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -187,7 +186,7 @@ func (d *dispatcher) wait() {
 	}
 }
 
-func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
 	ep := stackEP.(*endpoint)
 	s := newSegment(r, id, pkt)
 	if !s.parse() {
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index eb8a9d73e..594efaa11 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -30,7 +30,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -1120,7 +1119,7 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 }
 
 // IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (iptables.IPTables, error) {
+func (e *endpoint) IPTables() (stack.IPTables, error) {
 	return e.stack.IPTables(), nil
 }
 
@@ -2388,7 +2387,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	}, nil
 }
 
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
 	// TCP HandlePacket is not required anymore as inbound packets first
 	// land at the Dispatcher which then can either delivery using the
 	// worker go routine or directly do the invoke the tcp processing inline
@@ -2407,7 +2406,7 @@ func (e *endpoint) enqueueSegment(s *segment) bool {
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
 	switch typ {
 	case stack.ControlPacketTooBig:
 		e.sndBufMu.Lock()
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index c9ee5bf06..a094471b8 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -61,7 +61,7 @@ func NewForwarder(s *stack.Stack, rcvWnd, maxInFlight int, handler func(*Forward
 //
 // This function is expected to be passed as an argument to the
 // stack.SetTransportProtocolHandler function.
-func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
 	s := newSegment(r, id, pkt)
 	defer s.decRef()
 
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index b0f918bb4..57985b85d 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -140,7 +140,7 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 // to a specific processing queue. Each queue is serviced by its own processor
 // goroutine which is responsible for dequeuing and doing full TCP dispatch of
 // the packet.
-func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
 	p.dispatcher.queuePacket(r, ep, id, pkt)
 }
 
@@ -151,7 +151,7 @@ func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id st
 // a reset is sent in response to any incoming segment except another reset. In
 // particular, SYNs addressed to a non-existent connection are rejected by this
 // means."
-func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
 	s := newSegment(r, id, pkt)
 	defer s.decRef()
 
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 5d0bc4f72..e6fe7985d 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -18,7 +18,6 @@ import (
 	"sync/atomic"
 	"time"
 
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
@@ -61,7 +60,7 @@ type segment struct {
 	xmitCount uint32
 }
 
-func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) *segment {
+func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) *segment {
 	s := &segment{
 		refCnt: 1,
 		id:     id,
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 8cea20fb5..d4f6bc635 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -307,7 +307,7 @@ func (c *Context) SendICMPPacket(typ header.ICMPv4Type, code uint8, p1, p2 []byt
 	copy(icmp[header.ICMPv4PayloadOffset:], p2)
 
 	// Inject packet.
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 }
@@ -363,7 +363,7 @@ func (c *Context) BuildSegmentWithAddrs(payload []byte, h *Headers, src, dst tcp
 // SendSegment sends a TCP segment that has already been built and written to a
 // buffer.VectorisedView.
 func (c *Context) SendSegment(s buffer.VectorisedView) {
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
 		Data: s,
 	})
 }
@@ -371,7 +371,7 @@ func (c *Context) SendSegment(s buffer.VectorisedView) {
 // SendPacket builds and sends a TCP segment(with the provided payload & TCP
 // headers) in an IPv4 packet via the link layer endpoint.
 func (c *Context) SendPacket(payload []byte, h *Headers) {
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
 		Data: c.BuildSegment(payload, h),
 	})
 }
@@ -380,7 +380,7 @@ func (c *Context) SendPacket(payload []byte, h *Headers) {
 // & TCPheaders) in an IPv4 packet via the link layer endpoint using the
 // provided source and destination IPv4 addresses.
 func (c *Context) SendPacketWithAddrs(payload []byte, h *Headers, src, dst tcpip.Address) {
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
 		Data: c.BuildSegmentWithAddrs(payload, h, src, dst),
 	})
 }
@@ -548,7 +548,7 @@ func (c *Context) SendV6PacketWithAddrs(payload []byte, h *Headers, src, dst tcp
 	t.SetChecksum(^t.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 }
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index adc908e24..b5d2d0ba6 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -32,7 +32,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/ports",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 0af4514e1..a3372ac58 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -19,7 +19,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -234,7 +233,7 @@ func (e *endpoint) Close() {
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
 // IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (iptables.IPTables, error) {
+func (e *endpoint) IPTables() (stack.IPTables, error) {
 	return e.stack.IPTables(), nil
 }
 
@@ -913,7 +912,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 	if useDefaultTTL {
 		ttl = r.DefaultTTL()
 	}
-	if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, tcpip.PacketBuffer{
+	if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, stack.PacketBuffer{
 		Header:          hdr,
 		Data:            data,
 		TransportHeader: buffer.View(udp),
@@ -1260,7 +1259,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
 	// Get the header then trim it from the view.
 	hdr := header.UDP(pkt.Data.First())
 	if int(hdr.Length()) > pkt.Data.Size() {
@@ -1327,7 +1326,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
 }
 
 // State implements tcpip.Endpoint.State.
diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go
index fc706ede2..a674ceb68 100644
--- a/pkg/tcpip/transport/udp/forwarder.go
+++ b/pkg/tcpip/transport/udp/forwarder.go
@@ -43,7 +43,7 @@ func NewForwarder(s *stack.Stack, handler func(*ForwarderRequest)) *Forwarder {
 //
 // This function is expected to be passed as an argument to the
 // stack.SetTransportProtocolHandler function.
-func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
 	f.handler(&ForwarderRequest{
 		stack: f.stack,
 		route: r,
@@ -61,7 +61,7 @@ type ForwarderRequest struct {
 	stack *stack.Stack
 	route *stack.Route
 	id    stack.TransportEndpointID
-	pkt   tcpip.PacketBuffer
+	pkt   stack.PacketBuffer
 }
 
 // ID returns the 4-tuple (src address, src port, dst address, dst port) that
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 8df089d22..6e31a9bac 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -66,7 +66,7 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
 	// Get the header then trim it from the view.
 	hdr := header.UDP(pkt.Data.First())
 	if int(hdr.Length()) > pkt.Data.Size() {
@@ -135,7 +135,7 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		pkt.SetType(header.ICMPv4DstUnreachable)
 		pkt.SetCode(header.ICMPv4PortUnreachable)
 		pkt.SetChecksum(header.ICMPv4Checksum(pkt, payload))
-		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
 			Header: hdr,
 			Data:   payload,
 		})
@@ -172,7 +172,7 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		pkt.SetType(header.ICMPv6DstUnreachable)
 		pkt.SetCode(header.ICMPv6PortUnreachable)
 		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, payload))
-		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
 			Header: hdr,
 			Data:   payload,
 		})
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 34b7c2360..0905726c1 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -439,7 +439,7 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.PacketBuffer{
 		Data:            buf.ToVectorisedView(),
 		NetworkHeader:   buffer.View(ip),
 		TransportHeader: buffer.View(u),
@@ -486,7 +486,7 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
 
 	// Inject packet.
 
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
 		Data:            buf.ToVectorisedView(),
 		NetworkHeader:   buffer.View(ip),
 		TransportHeader: buffer.View(u),
-- 
cgit v1.2.3


From f97858011fa88b539585ca456943922204d92840 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 24 Mar 2020 10:57:24 -0700
Subject: Open a temp directory before changing capabilities and user ID-s

In cl/302130790, we started using a temp directory which is provided by bazel.

By default, a test process has enough permissions to open it, but there is not
any guarantee that it still will be able to do this after changing credentials.

PiperOrigin-RevId: 302702337
---
 test/syscalls/linux/sticky.cc | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/test/syscalls/linux/sticky.cc b/test/syscalls/linux/sticky.cc
index 7e73325bf..92eec0449 100644
--- a/test/syscalls/linux/sticky.cc
+++ b/test/syscalls/linux/sticky.cc
@@ -42,8 +42,9 @@ TEST(StickyTest, StickyBitPermDenied) {
 
   auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds());
-  std::string path = JoinPath(dir.path(), "NewDir");
-  ASSERT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds());
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_DIRECTORY));
+  ASSERT_THAT(mkdirat(dirfd.get(), "NewDir", 0755), SyscallSucceeds());
 
   // Drop privileges and change IDs only in child thread, or else this parent
   // thread won't be able to open some log files after the test ends.
@@ -61,7 +62,8 @@ TEST(StickyTest, StickyBitPermDenied) {
         syscall(SYS_setresuid, -1, absl::GetFlag(FLAGS_scratch_uid), -1),
         SyscallSucceeds());
 
-    EXPECT_THAT(rmdir(path.c_str()), SyscallFailsWithErrno(EPERM));
+    EXPECT_THAT(unlinkat(dirfd.get(), "NewDir", AT_REMOVEDIR),
+                SyscallFailsWithErrno(EPERM));
   });
 }
 
@@ -96,8 +98,9 @@ TEST(StickyTest, StickyBitCapFOWNER) {
 
   auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds());
-  std::string path = JoinPath(dir.path(), "NewDir");
-  ASSERT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds());
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_DIRECTORY));
+  ASSERT_THAT(mkdirat(dirfd.get(), "NewDir", 0755), SyscallSucceeds());
 
   // Drop privileges and change IDs only in child thread, or else this parent
   // thread won't be able to open some log files after the test ends.
@@ -114,7 +117,8 @@ TEST(StickyTest, StickyBitCapFOWNER) {
         SyscallSucceeds());
 
     EXPECT_NO_ERRNO(SetCapability(CAP_FOWNER, true));
-    EXPECT_THAT(rmdir(path.c_str()), SyscallSucceeds());
+    EXPECT_THAT(unlinkat(dirfd.get(), "NewDir", AT_REMOVEDIR),
+                SyscallSucceeds());
   });
 }
 }  // namespace
-- 
cgit v1.2.3


From c8eeedcc1d6b1ee25532ae630a7efd7aa4656bdc Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 24 Mar 2020 15:33:16 -0700
Subject: Add support for setting TCP segment hash.

This allows the link layer endpoints to consistenly hash a TCP
segment to a single underlying queue in case a link layer endpoint
does support multiple underlying queues.

Updates #231

PiperOrigin-RevId: 302760664
---
 pkg/tcpip/link/fdbased/endpoint.go      |  10 ++-
 pkg/tcpip/link/fdbased/endpoint_test.go |  76 +++++++++++++-----
 pkg/tcpip/stack/BUILD                   |   1 +
 pkg/tcpip/stack/packet_buffer.go        |   5 ++
 pkg/tcpip/stack/packet_buffer_state.go  |   1 +
 pkg/tcpip/stack/rand.go                 |  40 +++++++++
 pkg/tcpip/stack/stack.go                |  44 +++++++++-
 pkg/tcpip/transport/tcp/accept.go       |  20 ++++-
 pkg/tcpip/transport/tcp/connect.go      | 138 +++++++++++++++++++++++---------
 pkg/tcpip/transport/tcp/endpoint.go     |   5 ++
 pkg/tcpip/transport/tcp/protocol.go     |  10 ++-
 11 files changed, 280 insertions(+), 70 deletions(-)
 create mode 100644 pkg/tcpip/stack/rand.go

diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index 235e647ff..3b3b6909b 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -405,6 +405,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 		eth.Encode(ethHdr)
 	}
 
+	fd := e.fds[pkt.Hash%uint32(len(e.fds))]
 	if e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
 		vnetHdr := virtioNetHdr{}
 		if gso != nil {
@@ -428,14 +429,14 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 		}
 
 		vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr)
-		return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, pkt.Header.View(), pkt.Data.ToView())
+		return rawfile.NonBlockingWrite3(fd, vnetHdrBuf, pkt.Header.View(), pkt.Data.ToView())
 	}
 
 	if pkt.Data.Size() == 0 {
-		return rawfile.NonBlockingWrite(e.fds[0], pkt.Header.View())
+		return rawfile.NonBlockingWrite(fd, pkt.Header.View())
 	}
 
-	return rawfile.NonBlockingWrite3(e.fds[0], pkt.Header.View(), pkt.Data.ToView(), nil)
+	return rawfile.NonBlockingWrite3(fd, pkt.Header.View(), pkt.Data.ToView(), nil)
 }
 
 // WritePackets writes outbound packets to the file descriptor. If it is not
@@ -551,7 +552,8 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.Pac
 
 	packets := 0
 	for packets < n {
-		sent, err := rawfile.NonBlockingSendMMsg(e.fds[0], mmsgHdrs)
+		fd := e.fds[pkts[packets].Hash%uint32(len(e.fds))]
+		sent, err := rawfile.NonBlockingSendMMsg(fd, mmsgHdrs)
 		if err != nil {
 			return packets, err
 		}
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index c7dbbbc6b..3bfb15a8e 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -49,36 +49,42 @@ type packetInfo struct {
 }
 
 type context struct {
-	t    *testing.T
-	fds  [2]int
-	ep   stack.LinkEndpoint
-	ch   chan packetInfo
-	done chan struct{}
+	t        *testing.T
+	readFDs  []int
+	writeFDs []int
+	ep       stack.LinkEndpoint
+	ch       chan packetInfo
+	done     chan struct{}
 }
 
 func newContext(t *testing.T, opt *Options) *context {
-	fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0)
+	firstFDPair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0)
+	if err != nil {
+		t.Fatalf("Socketpair failed: %v", err)
+	}
+	secondFDPair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0)
 	if err != nil {
 		t.Fatalf("Socketpair failed: %v", err)
 	}
 
-	done := make(chan struct{}, 1)
+	done := make(chan struct{}, 2)
 	opt.ClosedFunc = func(*tcpip.Error) {
 		done <- struct{}{}
 	}
 
-	opt.FDs = []int{fds[1]}
+	opt.FDs = []int{firstFDPair[1], secondFDPair[1]}
 	ep, err := New(opt)
 	if err != nil {
 		t.Fatalf("Failed to create FD endpoint: %v", err)
 	}
 
 	c := &context{
-		t:    t,
-		fds:  fds,
-		ep:   ep,
-		ch:   make(chan packetInfo, 100),
-		done: done,
+		t:        t,
+		readFDs:  []int{firstFDPair[0], secondFDPair[0]},
+		writeFDs: opt.FDs,
+		ep:       ep,
+		ch:       make(chan packetInfo, 100),
+		done:     done,
 	}
 
 	ep.Attach(c)
@@ -87,9 +93,14 @@ func newContext(t *testing.T, opt *Options) *context {
 }
 
 func (c *context) cleanup() {
-	syscall.Close(c.fds[0])
+	for _, fd := range c.readFDs {
+		syscall.Close(fd)
+	}
+	<-c.done
 	<-c.done
-	syscall.Close(c.fds[1])
+	for _, fd := range c.writeFDs {
+		syscall.Close(fd)
+	}
 }
 
 func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
@@ -136,7 +147,7 @@ func TestAddress(t *testing.T) {
 	}
 }
 
-func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32) {
+func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32, hash uint32) {
 	c := newContext(t, &Options{Address: laddr, MTU: mtu, EthernetHeader: eth, GSOMaxSize: gsoMaxSize})
 	defer c.cleanup()
 
@@ -171,13 +182,15 @@ func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32) {
 	if err := c.ep.WritePacket(r, gso, proto, stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
+		Hash:   hash,
 	}); err != nil {
 		t.Fatalf("WritePacket failed: %v", err)
 	}
 
-	// Read from fd, then compare with what we wrote.
+	// Read from the corresponding FD, then compare with what we wrote.
 	b = make([]byte, mtu)
-	n, err := syscall.Read(c.fds[0], b)
+	fd := c.readFDs[hash%uint32(len(c.readFDs))]
+	n, err := syscall.Read(fd, b)
 	if err != nil {
 		t.Fatalf("Read failed: %v", err)
 	}
@@ -238,7 +251,7 @@ func TestWritePacket(t *testing.T) {
 				t.Run(
 					fmt.Sprintf("Eth=%v,PayloadLen=%v,GSOMaxSize=%v", eth, plen, gso),
 					func(t *testing.T) {
-						testWritePacket(t, plen, eth, gso)
+						testWritePacket(t, plen, eth, gso, 0)
 					},
 				)
 			}
@@ -246,6 +259,27 @@ func TestWritePacket(t *testing.T) {
 	}
 }
 
+func TestHashedWritePacket(t *testing.T) {
+	lengths := []int{0, 100, 1000}
+	eths := []bool{true, false}
+	gsos := []uint32{0, 32768}
+	hashes := []uint32{0, 1}
+	for _, eth := range eths {
+		for _, plen := range lengths {
+			for _, gso := range gsos {
+				for _, hash := range hashes {
+					t.Run(
+						fmt.Sprintf("Eth=%v,PayloadLen=%v,GSOMaxSize=%v,Hash=%d", eth, plen, gso, hash),
+						func(t *testing.T) {
+							testWritePacket(t, plen, eth, gso, hash)
+						},
+					)
+				}
+			}
+		}
+	}
+}
+
 func TestPreserveSrcAddress(t *testing.T) {
 	baddr := tcpip.LinkAddress("\xcc\xbb\xaa\x77\x88\x99")
 
@@ -270,7 +304,7 @@ func TestPreserveSrcAddress(t *testing.T) {
 
 	// Read from the FD, then compare with what we wrote.
 	b := make([]byte, mtu)
-	n, err := syscall.Read(c.fds[0], b)
+	n, err := syscall.Read(c.readFDs[0], b)
 	if err != nil {
 		t.Fatalf("Read failed: %v", err)
 	}
@@ -314,7 +348,7 @@ func TestDeliverPacket(t *testing.T) {
 				}
 
 				// Write packet via the file descriptor.
-				if _, err := syscall.Write(c.fds[0], all); err != nil {
+				if _, err := syscall.Write(c.readFDs[0], all); err != nil {
 					t.Fatalf("Write failed: %v", err)
 				}
 
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 7a43a1d4e..8d80e9cee 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -30,6 +30,7 @@ go_library(
         "nic.go",
         "packet_buffer.go",
         "packet_buffer_state.go",
+        "rand.go",
         "registration.go",
         "route.go",
         "stack.go",
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index 1850fa8c3..9505a4e92 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -10,6 +10,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 package stack
 
 import "gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -54,6 +55,10 @@ type PacketBuffer struct {
 	LinkHeader      buffer.View
 	NetworkHeader   buffer.View
 	TransportHeader buffer.View
+
+	// Hash is the transport layer hash of this packet. A value of zero
+	// indicates no valid hash has been set.
+	Hash uint32
 }
 
 // Clone makes a copy of pk. It clones the Data field, which creates a new
diff --git a/pkg/tcpip/stack/packet_buffer_state.go b/pkg/tcpip/stack/packet_buffer_state.go
index 76602549e..0c6b7924c 100644
--- a/pkg/tcpip/stack/packet_buffer_state.go
+++ b/pkg/tcpip/stack/packet_buffer_state.go
@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 package stack
 
 import "gvisor.dev/gvisor/pkg/tcpip/buffer"
diff --git a/pkg/tcpip/stack/rand.go b/pkg/tcpip/stack/rand.go
new file mode 100644
index 000000000..421fb5c15
--- /dev/null
+++ b/pkg/tcpip/stack/rand.go
@@ -0,0 +1,40 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	mathrand "math/rand"
+
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// lockedRandomSource provides a threadsafe rand.Source.
+type lockedRandomSource struct {
+	mu  sync.Mutex
+	src mathrand.Source
+}
+
+func (r *lockedRandomSource) Int63() (n int64) {
+	r.mu.Lock()
+	n = r.src.Int63()
+	r.mu.Unlock()
+	return n
+}
+
+func (r *lockedRandomSource) Seed(seed int64) {
+	r.mu.Lock()
+	r.src.Seed(seed)
+	r.mu.Unlock()
+}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index a9584d636..41398a1b6 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -20,7 +20,9 @@
 package stack
 
 import (
+	"bytes"
 	"encoding/binary"
+	mathrand "math/rand"
 	"sync/atomic"
 	"time"
 
@@ -465,6 +467,10 @@ type Stack struct {
 	// forwarder holds the packets that wait for their link-address resolutions
 	// to complete, and forwards them when each resolution is done.
 	forwarder *forwardQueue
+
+	// randomGenerator is an injectable pseudo random generator that can be
+	// used when a random number is required.
+	randomGenerator *mathrand.Rand
 }
 
 // UniqueID is an abstract generator of unique identifiers.
@@ -525,9 +531,16 @@ type Options struct {
 	// this is non-nil.
 	RawFactory RawFactory
 
-	// OpaqueIIDOpts hold the options for generating opaque interface identifiers
-	// (IIDs) as outlined by RFC 7217.
+	// OpaqueIIDOpts hold the options for generating opaque interface
+	// identifiers (IIDs) as outlined by RFC 7217.
 	OpaqueIIDOpts OpaqueInterfaceIdentifierOptions
+
+	// RandSource is an optional source to use to generate random
+	// numbers. If omitted it defaults to a Source seeded by the data
+	// returned by rand.Read().
+	//
+	// RandSource must be thread-safe.
+	RandSource mathrand.Source
 }
 
 // TransportEndpointInfo holds useful information about a transport endpoint
@@ -623,6 +636,13 @@ func New(opts Options) *Stack {
 		opts.UniqueID = new(uniqueIDGenerator)
 	}
 
+	randSrc := opts.RandSource
+	if randSrc == nil {
+		// Source provided by mathrand.NewSource is not thread-safe so
+		// we wrap it in a simple thread-safe version.
+		randSrc = &lockedRandomSource{src: mathrand.NewSource(generateRandInt64())}
+	}
+
 	// Make sure opts.NDPConfigs contains valid values only.
 	opts.NDPConfigs.validate()
 
@@ -645,6 +665,7 @@ func New(opts Options) *Stack {
 		ndpDisp:              opts.NDPDisp,
 		opaqueIIDOpts:        opts.OpaqueIIDOpts,
 		forwarder:            newForwardQueue(),
+		randomGenerator:      mathrand.New(randSrc),
 	}
 
 	// Add specified network protocols.
@@ -1818,6 +1839,12 @@ func (s *Stack) Seed() uint32 {
 	return s.seed
 }
 
+// Rand returns a reference to a pseudo random generator that can be used
+// to generate random numbers as required.
+func (s *Stack) Rand() *mathrand.Rand {
+	return s.randomGenerator
+}
+
 func generateRandUint32() uint32 {
 	b := make([]byte, 4)
 	if _, err := rand.Read(b); err != nil {
@@ -1825,3 +1852,16 @@ func generateRandUint32() uint32 {
 	}
 	return binary.LittleEndian.Uint32(b)
 }
+
+func generateRandInt64() int64 {
+	b := make([]byte, 8)
+	if _, err := rand.Read(b); err != nil {
+		panic(err)
+	}
+	buf := bytes.NewReader(b)
+	var v int64
+	if err := binary.Read(buf, binary.LittleEndian, &v); err != nil {
+		panic(err)
+	}
+	return v
+}
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 3f80995f3..b4c4c8ab1 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -445,7 +445,15 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// RFC 793 section 3.4 page 35 (figure 12) outlines that a RST
 		// must be sent in response to a SYN-ACK while in the listen
 		// state to prevent completing a handshake from an old SYN.
-		e.sendTCP(&s.route, s.id, buffer.VectorisedView{}, e.ttl, e.sendTOS, header.TCPFlagRst, s.ackNumber, 0, 0, nil, nil)
+		e.sendTCP(&s.route, tcpFields{
+			id:     s.id,
+			ttl:    e.ttl,
+			tos:    e.sendTOS,
+			flags:  header.TCPFlagRst,
+			seq:    s.ackNumber,
+			ack:    0,
+			rcvWnd: 0,
+		}, buffer.VectorisedView{}, nil)
 		return
 	}
 
@@ -493,7 +501,15 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 				TSEcr: opts.TSVal,
 				MSS:   mssForRoute(&s.route),
 			}
-			e.sendSynTCP(&s.route, s.id, e.ttl, e.sendTOS, header.TCPFlagSyn|header.TCPFlagAck, cookie, s.sequenceNumber+1, ctx.rcvWnd, synOpts)
+			e.sendSynTCP(&s.route, tcpFields{
+				id:     s.id,
+				ttl:    e.ttl,
+				tos:    e.sendTOS,
+				flags:  header.TCPFlagSyn | header.TCPFlagAck,
+				seq:    cookie,
+				ack:    s.sequenceNumber + 1,
+				rcvWnd: ctx.rcvWnd,
+			}, synOpts)
 			e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment()
 		}
 
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 79552fc61..1d245c2c6 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -308,7 +308,15 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	if ttl == 0 {
 		ttl = s.route.DefaultTTL()
 	}
-	h.ep.sendSynTCP(&s.route, h.ep.ID, ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+	h.ep.sendSynTCP(&s.route, tcpFields{
+		id:     h.ep.ID,
+		ttl:    ttl,
+		tos:    h.ep.sendTOS,
+		flags:  h.flags,
+		seq:    h.iss,
+		ack:    h.ackNum,
+		rcvWnd: h.rcvWnd,
+	}, synOpts)
 	return nil
 }
 
@@ -361,7 +369,15 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			SACKPermitted: h.ep.sackPermitted,
 			MSS:           h.ep.amss,
 		}
-		h.ep.sendSynTCP(&s.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+		h.ep.sendSynTCP(&s.route, tcpFields{
+			id:     h.ep.ID,
+			ttl:    h.ep.ttl,
+			tos:    h.ep.sendTOS,
+			flags:  h.flags,
+			seq:    h.iss,
+			ack:    h.ackNum,
+			rcvWnd: h.rcvWnd,
+		}, synOpts)
 		return nil
 	}
 
@@ -550,7 +566,16 @@ func (h *handshake) execute() *tcpip.Error {
 			synOpts.WS = -1
 		}
 	}
-	h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+
+	h.ep.sendSynTCP(&h.ep.route, tcpFields{
+		id:     h.ep.ID,
+		ttl:    h.ep.ttl,
+		tos:    h.ep.sendTOS,
+		flags:  h.flags,
+		seq:    h.iss,
+		ack:    h.ackNum,
+		rcvWnd: h.rcvWnd,
+	}, synOpts)
 
 	for h.state != handshakeCompleted {
 		h.ep.mu.Unlock()
@@ -573,7 +598,15 @@ func (h *handshake) execute() *tcpip.Error {
 			// the connection with another ACK or data (as ACKs are never
 			// retransmitted on their own).
 			if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept {
-				h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+				h.ep.sendSynTCP(&h.ep.route, tcpFields{
+					id:     h.ep.ID,
+					ttl:    h.ep.ttl,
+					tos:    h.ep.sendTOS,
+					flags:  h.flags,
+					seq:    h.iss,
+					ack:    h.ackNum,
+					rcvWnd: h.rcvWnd,
+				}, synOpts)
 			}
 
 		case wakerForNotification:
@@ -686,18 +719,33 @@ func makeSynOptions(opts header.TCPSynOptions) []byte {
 	return options[:offset]
 }
 
-func (e *endpoint) sendSynTCP(r *stack.Route, id stack.TransportEndpointID, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts header.TCPSynOptions) *tcpip.Error {
-	options := makeSynOptions(opts)
+// tcpFields is a struct to carry different parameters required by the
+// send*TCP variant functions below.
+type tcpFields struct {
+	id     stack.TransportEndpointID
+	ttl    uint8
+	tos    uint8
+	flags  byte
+	seq    seqnum.Value
+	ack    seqnum.Value
+	rcvWnd seqnum.Size
+	opts   []byte
+	txHash uint32
+}
+
+func (e *endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) *tcpip.Error {
+	tf.opts = makeSynOptions(opts)
 	// We ignore SYN send errors and let the callers re-attempt send.
-	if err := e.sendTCP(r, id, buffer.VectorisedView{}, ttl, tos, flags, seq, ack, rcvWnd, options, nil); err != nil {
+	if err := e.sendTCP(r, tf, buffer.VectorisedView{}, nil); err != nil {
 		e.stats.SendErrors.SynSendToNetworkFailed.Increment()
 	}
-	putOptions(options)
+	putOptions(tf.opts)
 	return nil
 }
 
-func (e *endpoint) sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
-	if err := sendTCP(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso); err != nil {
+func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error {
+	tf.txHash = e.txHash
+	if err := sendTCP(r, tf, data, gso); err != nil {
 		e.stats.SendErrors.SegmentSendToNetworkFailed.Increment()
 		return err
 	}
@@ -705,8 +753,8 @@ func (e *endpoint) sendTCP(r *stack.Route, id stack.TransportEndpointID, data bu
 	return nil
 }
 
-func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) {
-	optLen := len(opts)
+func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *stack.GSO) {
+	optLen := len(tf.opts)
 	hdr := &pkt.Header
 	packetSize := pkt.DataSize
 	off := pkt.DataOffset
@@ -714,15 +762,15 @@ func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, pkt *stack.Packet
 	tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen))
 	pkt.TransportHeader = buffer.View(tcp)
 	tcp.Encode(&header.TCPFields{
-		SrcPort:    id.LocalPort,
-		DstPort:    id.RemotePort,
-		SeqNum:     uint32(seq),
-		AckNum:     uint32(ack),
+		SrcPort:    tf.id.LocalPort,
+		DstPort:    tf.id.RemotePort,
+		SeqNum:     uint32(tf.seq),
+		AckNum:     uint32(tf.ack),
 		DataOffset: uint8(header.TCPMinimumSize + optLen),
-		Flags:      flags,
-		WindowSize: uint16(rcvWnd),
+		Flags:      tf.flags,
+		WindowSize: uint16(tf.rcvWnd),
 	})
-	copy(tcp[header.TCPMinimumSize:], opts)
+	copy(tcp[header.TCPMinimumSize:], tf.opts)
 
 	length := uint16(hdr.UsedLength() + packetSize)
 	xsum := r.PseudoHeaderChecksum(ProtocolNumber, length)
@@ -737,13 +785,12 @@ func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, pkt *stack.Packet
 		xsum = header.ChecksumVVWithOffset(pkt.Data, xsum, off, packetSize)
 		tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
 	}
-
 }
 
-func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
-	optLen := len(opts)
-	if rcvWnd > 0xffff {
-		rcvWnd = 0xffff
+func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error {
+	optLen := len(tf.opts)
+	if tf.rcvWnd > 0xffff {
+		tf.rcvWnd = 0xffff
 	}
 
 	mss := int(gso.MSS)
@@ -768,14 +815,15 @@ func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.Vect
 		pkts[i].DataOffset = off
 		pkts[i].DataSize = packetSize
 		pkts[i].Data = data
-		buildTCPHdr(r, id, &pkts[i], flags, seq, ack, rcvWnd, opts, gso)
+		pkts[i].Hash = tf.txHash
+		buildTCPHdr(r, tf, &pkts[i], gso)
 		off += packetSize
-		seq = seq.Add(seqnum.Size(packetSize))
+		tf.seq = tf.seq.Add(seqnum.Size(packetSize))
 	}
-	if ttl == 0 {
-		ttl = r.DefaultTTL()
+	if tf.ttl == 0 {
+		tf.ttl = r.DefaultTTL()
 	}
-	sent, err := r.WritePackets(gso, pkts, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos})
+	sent, err := r.WritePackets(gso, pkts, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos})
 	if err != nil {
 		r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent))
 	}
@@ -785,14 +833,14 @@ func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.Vect
 
 // sendTCP sends a TCP segment with the provided options via the provided
 // network endpoint and under the provided identity.
-func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
-	optLen := len(opts)
-	if rcvWnd > 0xffff {
-		rcvWnd = 0xffff
+func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error {
+	optLen := len(tf.opts)
+	if tf.rcvWnd > 0xffff {
+		tf.rcvWnd = 0xffff
 	}
 
 	if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() {
-		return sendTCPBatch(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso)
+		return sendTCPBatch(r, tf, data, gso)
 	}
 
 	pkt := stack.PacketBuffer{
@@ -800,18 +848,19 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise
 		DataOffset: 0,
 		DataSize:   data.Size(),
 		Data:       data,
+		Hash:       tf.txHash,
 	}
-	buildTCPHdr(r, id, &pkt, flags, seq, ack, rcvWnd, opts, gso)
+	buildTCPHdr(r, tf, &pkt, gso)
 
-	if ttl == 0 {
-		ttl = r.DefaultTTL()
+	if tf.ttl == 0 {
+		tf.ttl = r.DefaultTTL()
 	}
-	if err := r.WritePacket(gso, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, pkt); err != nil {
+	if err := r.WritePacket(gso, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}, pkt); err != nil {
 		r.Stats().TCP.SegmentSendErrors.Increment()
 		return err
 	}
 	r.Stats().TCP.SegmentsSent.Increment()
-	if (flags & header.TCPFlagRst) != 0 {
+	if (tf.flags & header.TCPFlagRst) != 0 {
 		r.Stats().TCP.ResetsSent.Increment()
 	}
 	return nil
@@ -863,7 +912,16 @@ func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqn
 		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
 	}
 	options := e.makeOptions(sackBlocks)
-	err := e.sendTCP(&e.route, e.ID, data, e.ttl, e.sendTOS, flags, seq, ack, rcvWnd, options, e.gso)
+	err := e.sendTCP(&e.route, tcpFields{
+		id:     e.ID,
+		ttl:    e.ttl,
+		tos:    e.sendTOS,
+		flags:  flags,
+		seq:    seq,
+		ack:    ack,
+		rcvWnd: rcvWnd,
+		opts:   options,
+	}, data, e.gso)
 	putOptions(options)
 	return err
 }
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 594efaa11..b6e571361 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -581,6 +581,10 @@ type endpoint struct {
 	// endpoint and at this point the endpoint is only around
 	// to complete the TCP shutdown.
 	closed bool
+
+	// txHash is the transport layer hash to be set on outbound packets
+	// emitted by this endpoint.
+	txHash uint32
 }
 
 // UniqueID implements stack.TransportEndpoint.UniqueID.
@@ -771,6 +775,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 			count:    9,
 		},
 		uniqueID: s.UniqueID(),
+		txHash:   s.Rand().Uint32(),
 	}
 
 	var ss SendBufferSizeOption
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 57985b85d..1377107ca 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -191,7 +191,15 @@ func replyWithReset(s *segment) {
 		flags |= header.TCPFlagAck
 		ack = s.sequenceNumber.Add(s.logicalLen())
 	}
-	sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, flags, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */)
+	sendTCP(&s.route, tcpFields{
+		id:     s.id,
+		ttl:    s.route.DefaultTTL(),
+		tos:    stack.DefaultTOS,
+		flags:  flags,
+		seq:    seq,
+		ack:    ack,
+		rcvWnd: 0,
+	}, buffer.VectorisedView{}, nil /* gso */)
 }
 
 // SetOption implements stack.TransportProtocol.SetOption.
-- 
cgit v1.2.3


From d8c4eff3f77b2a5dde34389033fe0ce4589b2e82 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 25 Mar 2020 08:10:11 -0700
Subject: Automated rollback of changelist 301837227

PiperOrigin-RevId: 302891559
---
 pkg/sentry/socket/netstack/netstack.go | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index c19f5639b..f14c336b9 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -29,6 +29,7 @@ import (
 	"io"
 	"math"
 	"reflect"
+	"sync/atomic"
 	"syscall"
 	"time"
 
@@ -264,8 +265,14 @@ type SocketOperations struct {
 	skType   linux.SockType
 	protocol int
 
+	// readViewHasData is 1 iff readView has data to be read, 0 otherwise.
+	// Must be accessed using atomic operations. It must only be written
+	// with readMu held but can be read without holding readMu. The latter
+	// is required to avoid deadlocks in epoll Readiness checks.
+	readViewHasData uint32
+
 	// readMu protects access to the below fields.
-	readMu sync.RWMutex `state:"nosave"`
+	readMu sync.Mutex `state:"nosave"`
 	// readView contains the remaining payload from the last packet.
 	readView buffer.View
 	// readCM holds control message information for the last packet read
@@ -421,11 +428,13 @@ func (s *SocketOperations) fetchReadView() *syserr.Error {
 
 	v, cms, err := s.Endpoint.Read(&s.sender)
 	if err != nil {
+		atomic.StoreUint32(&s.readViewHasData, 0)
 		return syserr.TranslateNetstackError(err)
 	}
 
 	s.readView = v
 	s.readCM = cms
+	atomic.StoreUint32(&s.readViewHasData, 1)
 
 	return nil
 }
@@ -624,11 +633,9 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
 	// Check our cached value iff the caller asked for readability and the
 	// endpoint itself is currently not readable.
 	if (mask & ^r & waiter.EventIn) != 0 {
-		s.readMu.RLock()
-		if len(s.readView) > 0 {
+		if atomic.LoadUint32(&s.readViewHasData) == 1 {
 			r |= waiter.EventIn
 		}
-		s.readMu.RUnlock()
 	}
 
 	return r
@@ -2335,6 +2342,9 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq
 		}
 		copied += n
 		s.readView.TrimFront(n)
+		if len(s.readView) == 0 {
+			atomic.StoreUint32(&s.readViewHasData, 0)
+		}
 
 		dst = dst.DropFirst(n)
 		if e != nil {
@@ -2458,6 +2468,10 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 		s.readView.TrimFront(int(n))
 	}
 
+	if len(s.readView) == 0 {
+		atomic.StoreUint32(&s.readViewHasData, 0)
+	}
+
 	var flags int
 	if msgLen > int(n) {
 		flags |= linux.MSG_TRUNC
-- 
cgit v1.2.3


From d04adebaab86ac30aca463b06528fc22430598ac Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 25 Mar 2020 10:54:19 -0700
Subject: Fix data-race in endpoint.Readiness

PiperOrigin-RevId: 302924789
---
 pkg/sync/aliases.go                       |  5 +++
 pkg/tcpip/transport/tcp/accept.go         | 46 ++++++++++++--------
 pkg/tcpip/transport/tcp/endpoint.go       | 70 ++++++++++++++++++++-----------
 pkg/tcpip/transport/tcp/endpoint_state.go |  3 ++
 4 files changed, 82 insertions(+), 42 deletions(-)

diff --git a/pkg/sync/aliases.go b/pkg/sync/aliases.go
index d2d7132fa..0d4316254 100644
--- a/pkg/sync/aliases.go
+++ b/pkg/sync/aliases.go
@@ -29,3 +29,8 @@ type (
 	// Map is an alias of sync.Map.
 	Map = sync.Map
 )
+
+// NewCond is a wrapper around sync.NewCond.
+func NewCond(l Locker) *Cond {
+	return sync.NewCond(l)
+}
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index b4c4c8ab1..375ca21f6 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -365,21 +365,29 @@ func (l *listenContext) closeAllPendingEndpoints() {
 }
 
 // deliverAccepted delivers the newly-accepted endpoint to the listener. If the
-// endpoint has transitioned out of the listen state, the new endpoint is closed
-// instead.
+// endpoint has transitioned out of the listen state (acceptedChan is nil),
+// the new endpoint is closed instead.
 func (e *endpoint) deliverAccepted(n *endpoint) {
 	e.mu.Lock()
-	state := e.EndpointState()
 	e.pendingAccepted.Add(1)
-	defer e.pendingAccepted.Done()
-	acceptedChan := e.acceptedChan
 	e.mu.Unlock()
+	defer e.pendingAccepted.Done()
 
-	if state == StateListen {
-		acceptedChan <- n
-		e.waiterQueue.Notify(waiter.EventIn)
-	} else {
-		n.Close()
+	e.acceptMu.Lock()
+	for {
+		if e.acceptedChan == nil {
+			e.acceptMu.Unlock()
+			n.Close()
+			return
+		}
+		select {
+		case e.acceptedChan <- n:
+			e.acceptMu.Unlock()
+			e.waiterQueue.Notify(waiter.EventIn)
+			return
+		default:
+			e.acceptCond.Wait()
+		}
 	}
 }
 
@@ -420,11 +428,13 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 }
 
 func (e *endpoint) incSynRcvdCount() bool {
-	if e.synRcvdCount >= cap(e.acceptedChan) {
-		return false
+	e.acceptMu.Lock()
+	canInc := e.synRcvdCount < cap(e.acceptedChan)
+	e.acceptMu.Unlock()
+	if canInc {
+		e.synRcvdCount++
 	}
-	e.synRcvdCount++
-	return true
+	return canInc
 }
 
 func (e *endpoint) decSynRcvdCount() {
@@ -432,10 +442,10 @@ func (e *endpoint) decSynRcvdCount() {
 }
 
 func (e *endpoint) acceptQueueIsFull() bool {
-	if l, c := len(e.acceptedChan)+e.synRcvdCount, cap(e.acceptedChan); l >= c {
-		return true
-	}
-	return false
+	e.acceptMu.Lock()
+	full := len(e.acceptedChan)+e.synRcvdCount >= cap(e.acceptedChan)
+	e.acceptMu.Unlock()
+	return full
 }
 
 // handleListenSegment is called when a listening endpoint receives a segment
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index b6e571361..1ebee0cfe 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -291,6 +291,7 @@ func (*EndpointInfo) IsEndpointInfo() {}
 // The following three mutexes can be acquired independent of e.mu but if
 // acquired with e.mu then e.mu must be acquired first.
 //
+// e.acceptMu -> protects acceptedChan.
 // e.rcvListMu -> Protects the rcvList and associated fields.
 // e.sndBufMu -> Protects the sndQueue and associated fields.
 // e.lastErrorMu -> Protects the lastError field.
@@ -533,6 +534,23 @@ type endpoint struct {
 	// to the acceptedChan below terminate before we close acceptedChan.
 	pendingAccepted sync.WaitGroup `state:"nosave"`
 
+	// acceptMu protects acceptedChan.
+	acceptMu sync.Mutex `state:"nosave"`
+
+	// acceptCond is a condition variable that can be used to block on when
+	// acceptedChan is full and an endpoint is ready to be delivered.
+	//
+	// This condition variable is required because just blocking on sending
+	// to acceptedChan does not work in cases where endpoint.Listen is
+	// called twice with different backlog values. In such cases the channel
+	// is closed and a new one created. Any pending goroutines blocking on
+	// the write to the channel will panic.
+	//
+	// We use this condition variable to block/unblock goroutines which
+	// tried to deliver an endpoint but couldn't because accept backlog was
+	// full ( See: endpoint.deliverAccepted ).
+	acceptCond *sync.Cond `state:"nosave"`
+
 	// acceptedChan is used by a listening endpoint protocol goroutine to
 	// send newly accepted connections to the endpoint so that they can be
 	// read by Accept() calls.
@@ -814,6 +832,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 
 	e.segmentQueue.setLimit(MaxUnprocessedSegments)
 	e.tsOffset = timeStampOffset()
+	e.acceptCond = sync.NewCond(&e.acceptMu)
 
 	return e
 }
@@ -834,9 +853,11 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 	case StateListen:
 		// Check if there's anything in the accepted channel.
 		if (mask & waiter.EventIn) != 0 {
+			e.acceptMu.Lock()
 			if len(e.acceptedChan) > 0 {
 				result |= waiter.EventIn
 			}
+			e.acceptMu.Unlock()
 		}
 	}
 	if e.EndpointState().connected() {
@@ -981,29 +1002,19 @@ func (e *endpoint) closeNoShutdownLocked() {
 // closePendingAcceptableConnections closes all connections that have completed
 // handshake but not yet been delivered to the application.
 func (e *endpoint) closePendingAcceptableConnectionsLocked() {
-	done := make(chan struct{})
-	// Spin a goroutine up as ranging on e.acceptedChan will just block when
-	// there are no more connections in the channel. Using a non-blocking
-	// select does not work as it can potentially select the default case
-	// even when there are pending writes but that are not yet written to
-	// the channel.
-	go func() {
-		defer close(done)
-		for n := range e.acceptedChan {
-			n.notifyProtocolGoroutine(notifyReset)
-			// close all connections that have completed but
-			// not accepted by the application.
-			n.Close()
-		}
-	}()
-	// pendingAccepted(see endpoint.deliverAccepted) tracks the number of
-	// endpoints which have completed handshake but are not yet written to
-	// the e.acceptedChan. We wait here till the goroutine above can drain
-	// all such connections from e.acceptedChan.
-	e.pendingAccepted.Wait()
+	e.acceptMu.Lock()
+	if e.acceptedChan == nil {
+		e.acceptMu.Unlock()
+		return
+	}
+
 	close(e.acceptedChan)
-	<-done
 	e.acceptedChan = nil
+	e.acceptCond.Broadcast()
+	e.acceptMu.Unlock()
+
+	// Wait for all pending endpoints to close.
+	e.pendingAccepted.Wait()
 }
 
 // cleanupLocked frees all resources associated with the endpoint. It is called
@@ -1012,9 +1023,7 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
 func (e *endpoint) cleanupLocked() {
 	// Close all endpoints that might have been accepted by TCP but not by
 	// the client.
-	if e.acceptedChan != nil {
-		e.closePendingAcceptableConnectionsLocked()
-	}
+	e.closePendingAcceptableConnectionsLocked()
 
 	e.workerCleanup = false
 
@@ -2204,6 +2213,8 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	if e.EndpointState() == StateListen && !e.workerCleanup {
 		// Adjust the size of the channel iff we can fix existing
 		// pending connections into the new one.
+		e.acceptMu.Lock()
+		defer e.acceptMu.Unlock()
 		if len(e.acceptedChan) > backlog {
 			return tcpip.ErrInvalidEndpointState
 		}
@@ -2216,6 +2227,11 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 		for ep := range origChan {
 			e.acceptedChan <- ep
 		}
+
+		// Notify any blocked goroutines that they can attempt to
+		// deliver endpoints again.
+		e.acceptCond.Broadcast()
+
 		return nil
 	}
 
@@ -2245,9 +2261,12 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	// The channel may be non-nil when we're restoring the endpoint, and it
 	// may be pre-populated with some previously accepted (but not Accepted)
 	// endpoints.
+	e.acceptMu.Lock()
 	if e.acceptedChan == nil {
 		e.acceptedChan = make(chan *endpoint, backlog)
 	}
+	e.acceptMu.Unlock()
+
 	e.workerRunning = true
 	go e.protocolListenLoop( // S/R-SAFE: drained on save.
 		seqnum.Size(e.receiveBufferAvailable()))
@@ -2276,9 +2295,12 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	}
 
 	// Get the new accepted endpoint.
+	e.acceptMu.Lock()
+	defer e.acceptMu.Unlock()
 	var n *endpoint
 	select {
 	case n = <-e.acceptedChan:
+		e.acceptCond.Signal()
 	default:
 		return nil, nil, tcpip.ErrWouldBlock
 	}
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 9175de441..c3c692555 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -173,6 +173,9 @@ func (e *endpoint) afterLoad() {
 	// Restore the endpoint to InitialState as it will be moved to
 	// its origEndpointState during Resume.
 	e.state = StateInitial
+	// Condition variables and mutexs are not S/R'ed so reinitialize
+	// acceptCond with e.acceptMu.
+	e.acceptCond = sync.NewCond(&e.acceptMu)
 	stack.StackFromEnv.RegisterRestoredEndpoint(e)
 }
 
-- 
cgit v1.2.3


From e7fbf6949514f1cf239437dcead4ceed5aac029e Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 25 Mar 2020 10:56:36 -0700
Subject: Fix futex_benchmark.

- Fix definitions of Futex* wrappers.

- Correctly handle glibc syscall() (which returns -1 and sets errno instead of
  returning the raw syscall return value).

- De-parameterize FutexWaitBitset, which was apparently intended to test with
  deadlines of between 0 and 100000 nanoseconds after the Unix epoch, but was
  broken due to the preceding two issues.

- Use wall time to measure the durations of tests that are expected to block
  (and thus stop accumulating CPU time).

- Require 5s for all tests to improve robustness in the presence of sentry GC.

- Remove FutexContend and FutexContendDeadline; it's unclear what these are
  supposed to measure, given that (1) FutexLock is unrealistically inefficient
  and (2) the benchmark rewards slow scheduling (since this reduces
  contention).

PiperOrigin-RevId: 302925246
---
 test/perf/linux/futex_benchmark.cc | 144 ++++++++++++-------------------------
 1 file changed, 47 insertions(+), 97 deletions(-)

diff --git a/test/perf/linux/futex_benchmark.cc b/test/perf/linux/futex_benchmark.cc
index b349d50bf..241f39896 100644
--- a/test/perf/linux/futex_benchmark.cc
+++ b/test/perf/linux/futex_benchmark.cc
@@ -33,24 +33,24 @@ namespace testing {
 namespace {
 
 inline int FutexWait(std::atomic<int32_t>* v, int32_t val) {
-  return syscall(SYS_futex, v, FUTEX_BITSET_MATCH_ANY, nullptr);
+  return syscall(SYS_futex, v, FUTEX_WAIT_PRIVATE, val, nullptr);
 }
 
-inline int FutexWaitRelativeTimeout(std::atomic<int32_t>* v, int32_t val,
-                                    const struct timespec* reltime) {
-  return syscall(SYS_futex, v, FUTEX_WAIT_PRIVATE, reltime);
+inline int FutexWaitMonotonicTimeout(std::atomic<int32_t>* v, int32_t val,
+                                     const struct timespec* timeout) {
+  return syscall(SYS_futex, v, FUTEX_WAIT_PRIVATE, val, timeout);
 }
 
-inline int FutexWaitAbsoluteTimeout(std::atomic<int32_t>* v, int32_t val,
-                                    const struct timespec* abstime) {
-  return syscall(SYS_futex, v, FUTEX_BITSET_MATCH_ANY, abstime);
+inline int FutexWaitMonotonicDeadline(std::atomic<int32_t>* v, int32_t val,
+                                      const struct timespec* deadline) {
+  return syscall(SYS_futex, v, FUTEX_WAIT_BITSET_PRIVATE, val, deadline,
+                 nullptr, FUTEX_BITSET_MATCH_ANY);
 }
 
-inline int FutexWaitBitsetAbsoluteTimeout(std::atomic<int32_t>* v, int32_t val,
-                                          int32_t bits,
-                                          const struct timespec* abstime) {
+inline int FutexWaitRealtimeDeadline(std::atomic<int32_t>* v, int32_t val,
+                                     const struct timespec* deadline) {
   return syscall(SYS_futex, v, FUTEX_WAIT_BITSET_PRIVATE | FUTEX_CLOCK_REALTIME,
-                 val, abstime, nullptr, bits);
+                 val, deadline, nullptr, FUTEX_BITSET_MATCH_ANY);
 }
 
 inline int FutexWake(std::atomic<int32_t>* v, int32_t count) {
@@ -62,11 +62,11 @@ void BM_FutexWakeNop(benchmark::State& state) {
   std::atomic<int32_t> v(0);
 
   for (auto _ : state) {
-    EXPECT_EQ(0, FutexWake(&v, 1));
+    TEST_PCHECK(FutexWake(&v, 1) == 0);
   }
 }
 
-BENCHMARK(BM_FutexWakeNop);
+BENCHMARK(BM_FutexWakeNop)->MinTime(5);
 
 // This just uses FUTEX_WAIT on an address whose value has changed, i.e., the
 // syscall won't wait.
@@ -74,43 +74,63 @@ void BM_FutexWaitNop(benchmark::State& state) {
   std::atomic<int32_t> v(0);
 
   for (auto _ : state) {
-    EXPECT_EQ(-EAGAIN, FutexWait(&v, 1));
+    TEST_PCHECK(FutexWait(&v, 1) == -1 && errno == EAGAIN);
   }
 }
 
-BENCHMARK(BM_FutexWaitNop);
+BENCHMARK(BM_FutexWaitNop)->MinTime(5);
 
 // This uses FUTEX_WAIT with a timeout on an address whose value never
 // changes, such that it always times out. Timeout overhead can be estimated by
 // timer overruns for short timeouts.
-void BM_FutexWaitTimeout(benchmark::State& state) {
+void BM_FutexWaitMonotonicTimeout(benchmark::State& state) {
   const int timeout_ns = state.range(0);
   std::atomic<int32_t> v(0);
   auto ts = absl::ToTimespec(absl::Nanoseconds(timeout_ns));
 
   for (auto _ : state) {
-    EXPECT_EQ(-ETIMEDOUT, FutexWaitRelativeTimeout(&v, 0, &ts));
+    TEST_PCHECK(FutexWaitMonotonicTimeout(&v, 0, &ts) == -1 &&
+                errno == ETIMEDOUT);
   }
 }
 
-BENCHMARK(BM_FutexWaitTimeout)
+BENCHMARK(BM_FutexWaitMonotonicTimeout)
+    ->MinTime(5)
+    ->UseRealTime()
     ->Arg(1)
     ->Arg(10)
     ->Arg(100)
     ->Arg(1000)
     ->Arg(10000);
 
-// This calls FUTEX_WAIT_BITSET with CLOCK_REALTIME.
-void BM_FutexWaitBitset(benchmark::State& state) {
+// This uses FUTEX_WAIT_BITSET with a deadline that is in the past. This allows
+// estimation of the overhead of setting up a timer for a deadline (as opposed
+// to a timeout as specified for FUTEX_WAIT).
+void BM_FutexWaitMonotonicDeadline(benchmark::State& state) {
   std::atomic<int32_t> v(0);
-  int timeout_ns = state.range(0);
-  auto ts = absl::ToTimespec(absl::Nanoseconds(timeout_ns));
+  struct timespec ts = {};
+
   for (auto _ : state) {
-    EXPECT_EQ(-ETIMEDOUT, FutexWaitBitsetAbsoluteTimeout(&v, 0, 1, &ts));
+    TEST_PCHECK(FutexWaitMonotonicDeadline(&v, 0, &ts) == -1 &&
+                errno == ETIMEDOUT);
   }
 }
 
-BENCHMARK(BM_FutexWaitBitset)->Range(0, 100000);
+BENCHMARK(BM_FutexWaitMonotonicDeadline)->MinTime(5);
+
+// This is equivalent to BM_FutexWaitMonotonicDeadline, but uses CLOCK_REALTIME
+// instead of CLOCK_MONOTONIC for the deadline.
+void BM_FutexWaitRealtimeDeadline(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+  struct timespec ts = {};
+
+  for (auto _ : state) {
+    TEST_PCHECK(FutexWaitRealtimeDeadline(&v, 0, &ts) == -1 &&
+                errno == ETIMEDOUT);
+  }
+}
+
+BENCHMARK(BM_FutexWaitRealtimeDeadline)->MinTime(5);
 
 int64_t GetCurrentMonotonicTimeNanos() {
   struct timespec ts;
@@ -130,11 +150,10 @@ void SpinNanos(int64_t delay_ns) {
 
 // Each iteration of FutexRoundtripDelayed involves a thread sending a futex
 // wakeup to another thread, which spins for delay_us and then sends a futex
-// wakeup back. The time per iteration is 2*  (delay_us + kBeforeWakeDelayNs +
+// wakeup back. The time per iteration is 2 * (delay_us + kBeforeWakeDelayNs +
 // futex/scheduling overhead).
 void BM_FutexRoundtripDelayed(benchmark::State& state) {
   const int delay_us = state.range(0);
-
   const int64_t delay_ns = delay_us * 1000;
   // Spin for an extra kBeforeWakeDelayNs before invoking FUTEX_WAKE to reduce
   // the probability that the wakeup comes before the wait, preventing the wait
@@ -165,83 +184,14 @@ void BM_FutexRoundtripDelayed(benchmark::State& state) {
 }
 
 BENCHMARK(BM_FutexRoundtripDelayed)
+    ->MinTime(5)
+    ->UseRealTime()
     ->Arg(0)
     ->Arg(10)
     ->Arg(20)
     ->Arg(50)
     ->Arg(100);
 
-// FutexLock is a simple, dumb futex based lock implementation.
-// It will try to acquire the lock by atomically incrementing the
-// lock word. If it did not increment the lock from 0 to 1, someone
-// else has the lock, so it will FUTEX_WAIT until it is woken in
-// the unlock path.
-class FutexLock {
- public:
-  FutexLock() : lock_word_(0) {}
-
-  void lock(struct timespec* deadline) {
-    int32_t val;
-    while ((val = lock_word_.fetch_add(1, std::memory_order_acquire) + 1) !=
-           1) {
-      // If we didn't get the lock by incrementing from 0 to 1,
-      // do a FUTEX_WAIT with the desired current value set to
-      // val. If val is no longer what the atomic increment returned,
-      // someone might have set it to 0 so we can try to acquire
-      // again.
-      int ret = FutexWaitAbsoluteTimeout(&lock_word_, val, deadline);
-      if (ret == 0 || ret == -EWOULDBLOCK || ret == -EINTR) {
-        continue;
-      } else {
-        FAIL() << "unexpected FUTEX_WAIT return: " << ret;
-      }
-    }
-  }
-
-  void unlock() {
-    // Store 0 into the lock word and wake one waiter. We intentionally
-    // ignore the return value of the FUTEX_WAKE here, since there may be
-    // no waiters to wake anyway.
-    lock_word_.store(0, std::memory_order_release);
-    (void)FutexWake(&lock_word_, 1);
-  }
-
- private:
-  std::atomic<int32_t> lock_word_;
-};
-
-FutexLock* test_lock;  // Used below.
-
-void FutexContend(benchmark::State& state, int thread_index,
-                  struct timespec* deadline) {
-  int counter = 0;
-  if (thread_index == 0) {
-    test_lock = new FutexLock();
-  }
-  for (auto _ : state) {
-    test_lock->lock(deadline);
-    counter++;
-    test_lock->unlock();
-  }
-  if (thread_index == 0) {
-    delete test_lock;
-  }
-  state.SetItemsProcessed(state.iterations());
-}
-
-void BM_FutexContend(benchmark::State& state) {
-  FutexContend(state, state.thread_index, nullptr);
-}
-
-BENCHMARK(BM_FutexContend)->ThreadRange(1, 1024)->UseRealTime();
-
-void BM_FutexDeadlineContend(benchmark::State& state) {
-  auto deadline = absl::ToTimespec(absl::Now() + absl::Minutes(10));
-  FutexContend(state, state.thread_index, &deadline);
-}
-
-BENCHMARK(BM_FutexDeadlineContend)->ThreadRange(1, 1024)->UseRealTime();
-
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From 2e09f2bdce11d5f303333c68af4272abb62b7885 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 25 Mar 2020 11:20:24 -0700
Subject: travis: exclude copybara branches

When copybara migrates changes, it creates a new branch and then creates a
pull-requests which is based on this branch. In this case, travis-ci
triggers build twice for the branch and for the pull-request.

PiperOrigin-RevId: 302930634
---
 .travis.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index a2a260538..acbd3d61b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -17,3 +17,7 @@ matrix:
 script:
    - uname -a
    - make DOCKER_RUN_OPTIONS="" BAZEL_OPTIONS="build runsc:runsc" bazel && $RUNSC_PATH --alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do ls
+branches:
+  except:
+  # Skip copybara branches.
+  - /^test\/cl.*$/
-- 
cgit v1.2.3


From c7f5673529af758c9f7c95523535174c7929dab5 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 25 Mar 2020 14:44:18 -0700
Subject: Set file mode and type to attribute

Makes less error prone to find file type.

Updates #1197

PiperOrigin-RevId: 302974244
---
 pkg/sentry/fsimpl/tmpfs/device_file.go  | 10 ++++++++++
 pkg/sentry/fsimpl/tmpfs/directory.go    |  7 +------
 pkg/sentry/fsimpl/tmpfs/named_pipe.go   |  2 +-
 pkg/sentry/fsimpl/tmpfs/regular_file.go |  2 +-
 pkg/sentry/fsimpl/tmpfs/symlink.go      |  3 ++-
 pkg/sentry/fsimpl/tmpfs/tmpfs.go        | 26 ++++++++++++--------------
 6 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
index 84b181b90..83bf885ee 100644
--- a/pkg/sentry/fsimpl/tmpfs/device_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -15,6 +15,8 @@
 package tmpfs
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -33,6 +35,14 @@ func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode
 		major: major,
 		minor: minor,
 	}
+	switch kind {
+	case vfs.BlockDevice:
+		mode |= linux.S_IFBLK
+	case vfs.CharDevice:
+		mode |= linux.S_IFCHR
+	default:
+		panic(fmt.Sprintf("invalid DeviceKind: %v", kind))
+	}
 	file.inode.init(file, fs, creds, mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index b4380af38..37c75ab64 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -34,16 +34,11 @@ type directory struct {
 
 func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode {
 	dir := &directory{}
-	dir.inode.init(dir, fs, creds, mode)
+	dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode)
 	dir.inode.nlink = 2 // from "." and parent directory or ".." for root
 	return &dir.inode
 }
 
-func (i *inode) isDir() bool {
-	_, ok := i.impl.(*directory)
-	return ok
-}
-
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 0c57fdca3..2c5c739df 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -34,7 +34,7 @@ type namedPipe struct {
 //   * rp.Mount().CheckBeginWrite() has been called successfully.
 func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
 	file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)}
-	file.inode.init(file, fs, creds, mode)
+	file.inode.init(file, fs, creds, linux.S_IFIFO|mode)
 	file.inode.nlink = 1 // Only the parent has a link.
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 5a2896bf6..26cd65605 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -89,7 +89,7 @@ func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMod
 	file := &regularFile{
 		memFile: fs.memFile,
 	}
-	file.inode.init(file, fs, creds, mode)
+	file.inode.init(file, fs, creds, linux.S_IFREG|mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index 5246aca84..47e075ed4 100644
--- a/pkg/sentry/fsimpl/tmpfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -15,6 +15,7 @@
 package tmpfs
 
 import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
@@ -27,7 +28,7 @@ func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode
 	link := &symlink{
 		target: target,
 	}
-	link.inode.init(link, fs, creds, 0777)
+	link.inode.init(link, fs, creds, linux.S_IFLNK|0777)
 	link.inode.nlink = 1 // from parent directory
 	return &link.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index ff69372b3..2d5070a46 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -144,7 +144,7 @@ type inode struct {
 	// Inode metadata. Writing multiple fields atomically requires holding
 	// mu, othewise atomic operations can be used.
 	mu    sync.Mutex
-	mode  uint32 // excluding file type bits, which are based on impl
+	mode  uint32 // file type and mode
 	nlink uint32 // protected by filesystem.mu instead of inode.mu
 	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
 	gid   uint32 // auth.KGID, but ...
@@ -168,6 +168,9 @@ type inode struct {
 const maxLinks = math.MaxUint32
 
 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
+	if mode.FileType() == 0 {
+		panic("file type is required in FileMode")
+	}
 	i.clock = fs.clock
 	i.refs = 1
 	i.mode = uint32(mode)
@@ -269,31 +272,21 @@ func (i *inode) statTo(stat *linux.Statx) {
 	// TODO(gvisor.dev/issues/1197): Device number.
 	switch impl := i.impl.(type) {
 	case *regularFile:
-		stat.Mode |= linux.S_IFREG
 		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
 		stat.Size = uint64(atomic.LoadUint64(&impl.size))
 		// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
 		// a uint64 accessed using atomic memory operations to avoid taking
 		// locks).
 		stat.Blocks = allocatedBlocksForSize(stat.Size)
-	case *directory:
-		stat.Mode |= linux.S_IFDIR
 	case *symlink:
-		stat.Mode |= linux.S_IFLNK
 		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
 		stat.Size = uint64(len(impl.target))
 		stat.Blocks = allocatedBlocksForSize(stat.Size)
-	case *namedPipe:
-		stat.Mode |= linux.S_IFIFO
 	case *deviceFile:
-		switch impl.kind {
-		case vfs.BlockDevice:
-			stat.Mode |= linux.S_IFBLK
-		case vfs.CharDevice:
-			stat.Mode |= linux.S_IFCHR
-		}
 		stat.RdevMajor = impl.major
 		stat.RdevMinor = impl.minor
+	case *directory, *namedPipe:
+		// Nothing to do.
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
 	}
@@ -316,7 +309,8 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 	)
 	mask := stat.Mask
 	if mask&linux.STATX_MODE != 0 {
-		atomic.StoreUint32(&i.mode, uint32(stat.Mode))
+		ft := atomic.LoadUint32(&i.mode) & linux.S_IFMT
+		atomic.StoreUint32(&i.mode, ft|uint32(stat.Mode&^linux.S_IFMT))
 		needsCtimeBump = true
 	}
 	if mask&linux.STATX_UID != 0 {
@@ -439,6 +433,10 @@ func (i *inode) direntType() uint8 {
 	}
 }
 
+func (i *inode) isDir() bool {
+	return linux.FileMode(i.mode).FileType() == linux.S_IFDIR
+}
+
 // fileDescription is embedded by tmpfs implementations of
 // vfs.FileDescriptionImpl.
 type fileDescription struct {
-- 
cgit v1.2.3


From e541ebec2fdb5b29209cb3fc8235b77edcaebb6a Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 25 Mar 2020 14:54:10 -0700
Subject: Misc fixes to make stat_test pass (almost)

The only test failing now requires socket which is not
available in VFS2 yet.

Updates #1198

PiperOrigin-RevId: 302976572
---
 pkg/bits/bits_template.go                    |  8 ++++++
 pkg/bits/uint64_test.go                      | 18 ++++++++++++
 pkg/sentry/fsimpl/gofer/filesystem.go        | 16 +++++++++--
 pkg/sentry/fsimpl/gofer/gofer.go             | 41 ++++++++++++++++++++++++----
 pkg/sentry/syscalls/linux/vfs2/BUILD         |  1 +
 pkg/sentry/syscalls/linux/vfs2/filesystem.go |  2 +-
 pkg/sentry/syscalls/linux/vfs2/getdents.go   |  4 +--
 pkg/sentry/syscalls/linux/vfs2/stat.go       |  7 ++++-
 pkg/sentry/vfs/resolving_path.go             | 16 +++++++++--
 test/syscalls/linux/stat.cc                  | 11 +++++++-
 10 files changed, 109 insertions(+), 15 deletions(-)

diff --git a/pkg/bits/bits_template.go b/pkg/bits/bits_template.go
index 93a435b80..998645388 100644
--- a/pkg/bits/bits_template.go
+++ b/pkg/bits/bits_template.go
@@ -42,3 +42,11 @@ func Mask(is ...int) T {
 func MaskOf(i int) T {
 	return T(1) << T(i)
 }
+
+// IsPowerOfTwo returns true if v is power of 2.
+func IsPowerOfTwo(v T) bool {
+	if v == 0 {
+		return false
+	}
+	return v&(v-1) == 0
+}
diff --git a/pkg/bits/uint64_test.go b/pkg/bits/uint64_test.go
index 1b018d808..193d1ebcd 100644
--- a/pkg/bits/uint64_test.go
+++ b/pkg/bits/uint64_test.go
@@ -114,3 +114,21 @@ func TestIsOn(t *testing.T) {
 		}
 	}
 }
+
+func TestIsPowerOfTwo(t *testing.T) {
+	for _, tc := range []struct {
+		v    uint64
+		want bool
+	}{
+		{v: 0, want: false},
+		{v: 1, want: true},
+		{v: 2, want: true},
+		{v: 3, want: false},
+		{v: 4, want: true},
+		{v: 5, want: false},
+	} {
+		if got := IsPowerOfTwo64(tc.v); got != tc.want {
+			t.Errorf("IsPowerOfTwo(%d) = %t, want: %t", tc.v, got, tc.want)
+		}
+	}
+}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 38e4cdbc5..26b492185 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -454,6 +454,9 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	}
 	if fs.opts.interop != InteropModeShared {
 		parent.touchCMtime(ctx)
+		if dir {
+			parent.decLinks()
+		}
 		parent.cacheNegativeChildLocked(name)
 		parent.dirents = nil
 	}
@@ -569,8 +572,13 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
 	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
 		creds := rp.Credentials()
-		_, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
-		return err
+		if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
+			return err
+		}
+		if fs.opts.interop != InteropModeShared {
+			parent.incLinks()
+		}
+		return nil
 	})
 }
 
@@ -962,6 +970,10 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		oldParent.dirents = nil
 		delete(newParent.negativeChildren, newName)
 		newParent.dirents = nil
+		if renamed.isDir() {
+			oldParent.decLinks()
+			newParent.incLinks()
+		}
 	}
 	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD)
 	return nil
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 999485492..13928ce36 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -485,6 +485,11 @@ type dentry struct {
 	// locked to mutate it).
 	size uint64
 
+	// nlink counts the number of hard links to this dentry. It's updated and
+	// accessed using atomic operations. It's not protected by metadataMu like the
+	// other metadata fields.
+	nlink uint32
+
 	mapsMu sync.Mutex
 
 	// If this dentry represents a regular file, mappings tracks mappings of
@@ -604,6 +609,9 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 	if mask.BTime {
 		d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)
 	}
+	if mask.NLink {
+		d.nlink = uint32(attr.NLink)
+	}
 	d.vfsd.Init(d)
 
 	fs.syncMu.Lock()
@@ -645,6 +653,9 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
 	if mask.BTime {
 		atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds))
 	}
+	if mask.NLink {
+		atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
+	}
 	if mask.Size {
 		d.dataMu.Lock()
 		atomic.StoreUint64(&d.size, attr.Size)
@@ -687,10 +698,7 @@ func (d *dentry) fileType() uint32 {
 func (d *dentry) statTo(stat *linux.Statx) {
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
 	stat.Blksize = atomic.LoadUint32(&d.blockSize)
-	stat.Nlink = 1
-	if d.isDir() {
-		stat.Nlink = 2
-	}
+	stat.Nlink = atomic.LoadUint32(&d.nlink)
 	stat.UID = atomic.LoadUint32(&d.uid)
 	stat.GID = atomic.LoadUint32(&d.gid)
 	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
@@ -703,7 +711,7 @@ func (d *dentry) statTo(stat *linux.Statx) {
 	stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime))
 	stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime))
 	stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime))
-	// TODO(jamieliu): device number
+	// TODO(gvisor.dev/issue/1198): device number
 }
 
 func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error {
@@ -1094,6 +1102,26 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 	return nil
 }
 
+// incLinks increments link count.
+//
+// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32.
+func (d *dentry) incLinks() {
+	v := atomic.AddUint32(&d.nlink, 1)
+	if v < 2 {
+		panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v))
+	}
+}
+
+// decLinks decrements link count.
+//
+// Preconditions: d.nlink > 1.
+func (d *dentry) decLinks() {
+	v := atomic.AddUint32(&d.nlink, ^uint32(0))
+	if v == 0 {
+		panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v))
+	}
+}
+
 // fileDescription is embedded by gofer implementations of
 // vfs.FileDescriptionImpl.
 type fileDescription struct {
@@ -1112,7 +1140,8 @@ func (fd *fileDescription) dentry() *dentry {
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	d := fd.dentry()
-	if d.fs.opts.interop == InteropModeShared && opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
+	const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME)
+	if d.fs.opts.interop == InteropModeShared && opts.Mask&(validMask) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
 		// TODO(jamieliu): Use specialFileFD.handle.file for the getattr if
 		// available?
 		if err := d.updateFromGetattr(ctx); err != nil {
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index e7695e995..2eb210014 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -31,6 +31,7 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/bits",
         "//pkg/fspath",
         "//pkg/gohacks",
         "//pkg/sentry/arch",
diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
index fc5ceea4c..a859095e2 100644
--- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -250,7 +250,7 @@ func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
 	if err != nil {
 		return err
 	}
-	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink)
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
index ddc140b65..a61cc5059 100644
--- a/pkg/sentry/syscalls/linux/vfs2/getdents.go
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -97,7 +97,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
 		//     char           d_name[]; /* Filename (null-terminated) */
 		// };
 		size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
-		if size < cb.remaining {
+		if size > cb.remaining {
 			return syserror.EINVAL
 		}
 		buf = cb.t.CopyScratchBuffer(size)
@@ -125,7 +125,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
 			panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width()))
 		}
 		size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name)
-		if size < cb.remaining {
+		if size > cb.remaining {
 			return syserror.EINVAL
 		}
 		buf = cb.t.CopyScratchBuffer(size)
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
index 068243132..fdfe49243 100644
--- a/pkg/sentry/syscalls/linux/vfs2/stat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -16,6 +16,7 @@ package vfs2
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/gohacks"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -153,7 +154,11 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 {
 		return 0, nil, syserror.EINVAL
 	}
-
+	// Make sure that only one sync type option is set.
+	syncType := uint32(flags & linux.AT_STATX_SYNC_TYPE)
+	if syncType != 0 && !bits.IsPowerOfTwo32(syncType) {
+		return 0, nil, syserror.EINVAL
+	}
 	if mask&linux.STATX__RESERVED != 0 {
 		return 0, nil, syserror.EINVAL
 	}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index eb4ebb511..8f31495da 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -329,10 +329,22 @@ func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) {
 // component in pcs represents a symbolic link, the symbolic link should be
 // followed.
 //
+// If path is terminated with '/', the '/' is considered the last element and
+// any symlink before that is followed:
+//   - For most non-creating walks, the last path component is handled by
+//     fs/namei.c:lookup_last(), which sets LOOKUP_FOLLOW if the first byte
+//     after the path component is non-NULL (which is only possible if it's '/')
+//     and the path component is of type LAST_NORM.
+//
+//   - For open/openat/openat2 without O_CREAT, the last path component is
+//     handled by fs/namei.c:do_last(), which does the same, though without the
+//     LAST_NORM check.
+//
 // Preconditions: !rp.Done().
 func (rp *ResolvingPath) ShouldFollowSymlink() bool {
-	// Non-final symlinks are always followed.
-	return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final()
+	// Non-final symlinks are always followed. Paths terminated with '/' are also
+	// always followed.
+	return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() || rp.MustBeDir()
 }
 
 // HandleSymlink is called when the current path component is a symbolic link
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index 513b9cd1c..2503960f3 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -34,6 +34,13 @@
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 
+#ifndef AT_STATX_FORCE_SYNC
+#define AT_STATX_FORCE_SYNC 0x2000
+#endif
+#ifndef AT_STATX_DONT_SYNC
+#define AT_STATX_DONT_SYNC 0x4000
+#endif
+
 namespace gvisor {
 namespace testing {
 
@@ -700,8 +707,10 @@ TEST_F(StatTest, StatxInvalidFlags) {
   struct kernel_statx stx;
   EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), 12345, 0, &stx),
               SyscallFailsWithErrno(EINVAL));
+
+  // Sync flags are mutually exclusive.
   EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(),
-                    0x6000 /* AT_STATX_SYNC_TYPE */, 0, &stx),
+                    AT_STATX_FORCE_SYNC | AT_STATX_DONT_SYNC, 0, &stx),
               SyscallFailsWithErrno(EINVAL));
 }
 
-- 
cgit v1.2.3


From f2eba940159b4ed55359f99a3422ad899a8647ee Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 25 Mar 2020 15:23:48 -0700
Subject: Remove TODO to push down exec permission check

Pushing it down requires all implementation to check for
exec individualy which is not maintanable. Making it part
of GenericCheckPermissions add extra cost to everyone that
calls it. So it's better to keep is in
VirtualFilesystem.OpenAt.

Updates #1193

PiperOrigin-RevId: 302982993
---
 pkg/sentry/vfs/vfs.go | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 2e2880171..03d1fb943 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -402,8 +402,6 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 		if err == nil {
 			vfs.putResolvingPath(rp)
 
-			// TODO(gvisor.dev/issue/1193): Move inside fsimpl to avoid another call
-			// to FileDescription.Stat().
 			if opts.FileExec {
 				if fd.Mount().flags.NoExec {
 					fd.DecRef()
-- 
cgit v1.2.3


From 5f03dca5227e4f2e7aa472ad40d421d4623c9f72 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 25 Mar 2020 15:38:38 -0700
Subject: Fix race in TestRunEnvHasHome

It's possible to execute the command that checks user's
$HOME dir before the user is created. Move the code that
creates the user inside exec so it can be serialized.

PiperOrigin-RevId: 302986184
---
 test/e2e/exec_test.go | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/test/e2e/exec_test.go b/test/e2e/exec_test.go
index 4074d2285..594c8e752 100644
--- a/test/e2e/exec_test.go
+++ b/test/e2e/exec_test.go
@@ -240,17 +240,7 @@ func TestExecEnvHasHome(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("exec-env-home-test")
 
-	// We will check that HOME is set for root user, and also for a new
-	// non-root user we will create.
-	newUID := 1234
-	newHome := "/foo/bar"
-
-	// Create a new user with a home directory, and then sleep.
-	script := fmt.Sprintf(`
-	mkdir -p -m 777 %s && \
-	adduser foo -D -u %d -h %s && \
-	sleep 1000`, newHome, newUID, newHome)
-	if err := d.Run("alpine", "/bin/sh", "-c", script); err != nil {
+	if err := d.Run("alpine", "sleep", "1000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 	defer d.CleanUp()
@@ -264,7 +254,15 @@ func TestExecEnvHasHome(t *testing.T) {
 		t.Errorf("wanted exec output to contain %q, got %q", want, got)
 	}
 
-	// Execute the same as uid 123 and expect newHome.
+	// Create a new user with a home directory.
+	newUID := 1234
+	newHome := "/foo/bar"
+	cmd := fmt.Sprintf("mkdir -p -m 777 %q && adduser foo -D -u %d -h %q", newHome, newUID, newHome)
+	if _, err := d.Exec("/bin/sh", "-c", cmd); err != nil {
+		t.Fatalf("docker exec failed: %v", err)
+	}
+
+	// Execute the same as the new user and expect newHome.
 	got, err = d.ExecAsUser(strconv.Itoa(newUID), "/bin/sh", "-c", "echo $HOME")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
-- 
cgit v1.2.3


From d440fe0613a3b8fec75d33aff36ebada220106b0 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 28 Feb 2020 18:01:21 -0800
Subject: Fix go_marshal Example name.

There is a canonical naming convention for Examples, which are checked
by analyzers. This must be fixed since adding exceptions for generated
code will be more challenging.
---
 tools/go_marshal/gomarshal/generator.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 729489de5..82983804c 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -413,7 +413,7 @@ func (g *Generator) writeTests(ts []*testGenerator) error {
 	// empty example instead.
 	if len(ts) == 0 {
 		b.reset()
-		b.emit("func ExampleEmptyTestSuite() {\n")
+		b.emit("func Example() {\n")
 		b.inIndent(func() {
 			b.emit("// This example is intentionally empty to ensure this file contains at least\n")
 			b.emit("// one testable entity. go-marshal is forced to emit a test file if a package\n")
-- 
cgit v1.2.3


From 882ed330e6e7761f482f9bfa771cc6693e0f8008 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 4 Oct 2019 19:03:07 -0700
Subject: nogo: enable sanitizers.

This enables all relevant santizers (though most analyzers will not find
much, it will prevent instances from creeping in), and codifies existing
exceptions in tools/nogo.js to be fixed.
---
 BUILD           | 23 +++++++++++++-
 tools/BUILD     |  2 +-
 tools/nogo.js   |  7 -----
 tools/nogo.json | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 118 insertions(+), 9 deletions(-)
 delete mode 100644 tools/nogo.js
 create mode 100644 tools/nogo.json

diff --git a/BUILD b/BUILD
index 5fd929378..a709a9816 100644
--- a/BUILD
+++ b/BUILD
@@ -49,10 +49,31 @@ gazelle(name = "gazelle")
 # live in the tools subdirectory (unless they are standard).
 nogo(
     name = "nogo",
-    config = "//tools:nogo.js",
+    config = "//tools:nogo.json",
     visibility = ["//visibility:public"],
     deps = [
         "//tools/checkunsafe",
+        "@org_golang_x_tools//go/analysis/passes/asmdecl:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/assign:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/atomic:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/atomicalign:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/bools:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/buildtag:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/cgocall:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/copylock:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/deepequalerrors:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/loopclosure:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/lostcancel:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/nilfunc:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/nilness:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/printf:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/shift:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/stdmethods:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/structtag:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/tests:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unmarshal:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unsafeptr:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unusedresult:go_tool_library",
     ],
 )
 
diff --git a/tools/BUILD b/tools/BUILD
index e73a9c885..ba3506c04 100644
--- a/tools/BUILD
+++ b/tools/BUILD
@@ -1,3 +1,3 @@
 package(licenses = ["notice"])
 
-exports_files(["nogo.js"])
+exports_files(["nogo.json"])
diff --git a/tools/nogo.js b/tools/nogo.js
deleted file mode 100644
index fc0a4d1f0..000000000
--- a/tools/nogo.js
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  "checkunsafe": {
-    "exclude_files": {
-      "/external/": "not subject to constraint"
-    }
-  }
-}
diff --git a/tools/nogo.json b/tools/nogo.json
new file mode 100644
index 000000000..ff369be6f
--- /dev/null
+++ b/tools/nogo.json
@@ -0,0 +1,95 @@
+{
+  "assign": {
+    "exclude_files": {
+      "/external/bazel_gazelle/walk/walk.go": "allowed: false positive"
+    }
+  },
+  "checkunsafe": {
+    "exclude_files": {
+      "/external/": "allowed: not subject to unsafe naming rules"
+    }
+  },
+  "copylocks": {
+    "exclude_files": {
+      ".*_state_autogen.go": "fix: m.Failf copies by value",
+      "/pkg/log/json.go": "fix: Emit passes lock by value: gvisor.dev/gvisor/pkg/log.JSONEmitter contains gvisor.dev/gvisor/pkg/log.Writer contains gvisor.dev/gvisor/pkg/sync.Mutex",
+      "/pkg/log/log_test.go": "fix: call of fmt.Printf copies lock value: gvisor.dev/gvisor/pkg/log.Writer contains gvisor.dev/gvisor/pkg/sync.Mutex",
+      "/pkg/sentry/fs/host/socket_test.go": "fix: call of t.Errorf copies lock value: gvisor.dev/gvisor/pkg/sentry/fs/host.ConnectedEndpoint contains gvisor.dev/gvisor/pkg/refs.AtomicRefCount contains gvisor.dev/gvisor/pkg/sync.Mutex",
+      "/pkg/sentry/fs/proc/sys_net.go": "fix: Truncate passes lock by value: gvisor.dev/gvisor/pkg/sentry/fs/proc.tcpMemInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.SimpleFileInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.InodeSimpleAttributes contains gvisor.dev/gvisor/pkg/sync.RWMutex",
+      "/pkg/sentry/fs/proc/sys_net.go": "fix: Truncate passes lock by value: gvisor.dev/gvisor/pkg/sentry/fs/proc.tcpSack contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.SimpleFileInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.InodeSimpleAttributes contains gvisor.dev/gvisor/pkg/sync.RWMutex",
+      "/pkg/sentry/fs/tty/slave.go": "fix: Truncate passes lock by value: gvisor.dev/gvisor/pkg/sentry/fs/tty.slaveInodeOperations contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.SimpleFileInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.InodeSimpleAttributes contains gvisor.dev/gvisor/pkg/sync.RWMutex",
+      "/pkg/sentry/kernel/time/time.go": "fix: Readiness passes lock by value: gvisor.dev/gvisor/pkg/sentry/kernel/time.ClockEventsQueue contains gvisor.dev/gvisor/pkg/waiter.Queue contains gvisor.dev/gvisor/pkg/sync.RWMutex",
+      "/pkg/sentry/kernel/syscalls_state.go": "fix: assignment copies lock value to *s: gvisor.dev/gvisor/pkg/sentry/kernel.SyscallTable contains gvisor.dev/gvisor/pkg/sentry/kernel.SyscallFlagsTable contains gvisor.dev/gvisor/pkg/sync.Mutex"
+    }
+  },
+  "lostcancel": {
+    "exclude_files": {
+      "/pkg/tcpip/network/arp/arp_test.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak",
+      "/pkg/tcpip/stack/ndp_test.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak",
+      "/pkg/tcpip/transport/udp/udp_test.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak",
+      "/pkg/tcpip/transport/tcp/testing/context/context.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak"
+    }
+  },
+  "nilness": {
+    "exclude_files": {
+      "/com_github_vishvananda_netlink/route_linux.go": "allowed: false positive",
+      "/external/bazel_gazelle/cmd/gazelle/.*": "allowed: false positive",
+      "/org_golang_x_tools/go/packages/golist.go": "allowed: runtime internals",
+      "/pkg/sentry/platform/kvm/kvm_test.go": "allowed: intentional"
+    }
+  },
+  "printf": {
+    "exclude_files": {
+      ".*_abi_autogen_test.go": "fix: Sprintf format has insufficient args",
+      "/pkg/segment/test/segment_test.go": "fix: Errorf format %d arg seg.Start is a func value, not called",
+      "/pkg/tcpip/tcpip_test.go": "fix: Error call has possible formatting directive %q",
+      "/pkg/tcpip/header/eth_test.go": "fix: Fatalf format %s reads arg #3, but call has 2 args",
+      "/pkg/tcpip/header/ndp_test.go": "fix: Errorf format %d reads arg #1, but call has 0 args",
+      "/pkg/eventchannel/event_test.go": "fix: Fatal call has possible formatting directive %v",
+      "/pkg/tcpip/stack/ndp.go": "fix: Fatalf format %s has arg protocolAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress",
+      "/pkg/sentry/fs/fdpipe/pipe_test.go": "fix: Errorf format %s has arg flags of wrong type gvisor.dev/gvisor/pkg/sentry/fs.FileFlags",
+      "/pkg/sentry/fs/fdpipe/pipe_test.go": "fix: Errorf format %d arg f.FD is a func value, not called",
+      "/pkg/tcpip/link/fdbased/endpoint.go": "fix: Sprintf format %v with arg p causes recursive String method call",
+      "/pkg/tcpip/transport/udp/udp_test.go": "fix: Fatalf format %s has arg h.srcAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.FullAddress",
+      "/pkg/tcpip/transport/tcp/tcp_test.go": "fix: Fatalf format %s has arg tcpTW of wrong type gvisor.dev/gvisor/pkg/tcpip.TCPTimeWaitTimeoutOption",
+      "/pkg/tcpip/transport/tcp/tcp_test.go": "fix: Errorf call needs 1 arg but has 2 args",
+      "/pkg/tcpip/stack/ndp_test.go": "fix: Errorf format %s reads arg #3, but call has 2 args",
+      "/pkg/tcpip/stack/ndp_test.go": "fix: Fatalf format %s reads arg #5, but call has 4 args",
+      "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf format %s has arg protoAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress",
+      "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf format %s has arg nic1ProtoAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress",
+      "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf format %s has arg nic2ProtoAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress",
+      "/pkg/tcpip/stack/stack_test.go": "fix: Fatal call has possible formatting directive %t",
+      "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf call has arguments but no formatting directives",
+      "/pkg/tcpip/link/fdbased/endpoint.go": "fix: Sprintf format %v with arg p causes recursive String method call",
+      "/pkg/sentry/fsimpl/tmpfs/stat_test.go": "fix: Errorf format %v reads arg #1, but call has 0 args",
+      "/runsc/container/test_app/test_app.go": "fix: Fatal call has possible formatting directive %q",
+      "/test/root/cgroup_test.go": "fix: Errorf format %s has arg gots of wrong type []int",
+      "/test/root/cgroup_test.go": "fix: Fatalf format %v reads arg #3, but call has 2 args",
+      "/test/runtimes/runner.go": "fix: Skip call has possible formatting directive %q",
+      "/test/runtimes/blacklist_test.go": "fix: Errorf format %q has arg blacklistFile of wrong type *string"
+    }
+  },
+  "structtag": {
+    "exclude_files": {
+      "/external/": "allowed: may use arbitrary tags"
+    }
+  },
+  "unsafeptr": {
+    "exclude_files": {
+      ".*_test.go": "allowed: exclude tests",
+      "/pkg/flipcall/flipcall_unsafe.go": "allowed: special case",
+      "/pkg/gohacks/gohacks_unsafe.go": "allowed: special case",
+      "/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go": "allowed: special case",
+      "/pkg/sentry/platform/kvm/(bluepill|machine)_unsafe.go": "allowed: special case",
+      "/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go": "allowed: special case",
+      "/pkg/sentry/platform/safecopy/safecopy_unsafe.go": "allowed: special case",
+      "/pkg/sentry/vfs/mount_unsafe.go": "allowed: special case"
+    }
+  },
+  "unusedresult": {
+    "exclude_files": {
+      "/pkg/sentry/fsimpl/proc/task_net.go": "fix: result of fmt.Sprintf call not used",
+      "/pkg/sentry/fsimpl/proc/tasks_net.go": "fix: result of fmt.Sprintf call not used"
+    }
+  }
+}
-- 
cgit v1.2.3


From 4f374da60cb0c39ed862518e5b7524cb86ec96db Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 25 Mar 2020 18:55:19 -0700
Subject: iptable: fix tests timeouts

Tests were run assuming a runtime of "runsc" was present, and did not
have --net-raw enabled.
---
 scripts/iptables_tests.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/iptables_tests.sh b/scripts/iptables_tests.sh
index b4a5211a5..0f46909ac 100755
--- a/scripts/iptables_tests.sh
+++ b/scripts/iptables_tests.sh
@@ -16,7 +16,7 @@
 
 source $(dirname $0)/common.sh
 
-install_runsc_for_test iptables
+install_runsc_for_test iptables --net-raw
 
 # Build the docker image for the test.
 run //test/iptables/runner:runner-image --norun
@@ -26,5 +26,5 @@ test //test/iptables:iptables_test \
   "--test_arg=--image=bazel/test/iptables/runner:runner-image"
 
 test //test/iptables:iptables_test \
-  "--test_arg=--runtime=runsc" \
+  "--test_arg=--runtime=${RUNTIME}" \
   "--test_arg=--image=bazel/test/iptables/runner:runner-image"
-- 
cgit v1.2.3


From de694e5484502d53166d70b36141e62fcdf07803 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 25 Mar 2020 19:12:25 -0700
Subject: Combine file mode and isDir arguments

Updates #1035

PiperOrigin-RevId: 303021328
---
 pkg/abi/linux/file.go                       |  5 +++++
 pkg/sentry/fsimpl/ext/inode.go              |  2 +-
 pkg/sentry/fsimpl/gofer/filesystem.go       | 22 +++++++++++-----------
 pkg/sentry/fsimpl/gofer/gofer.go            |  7 ++++---
 pkg/sentry/fsimpl/host/host.go              |  6 +++---
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go    |  3 +--
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go |  6 ++----
 pkg/sentry/fsimpl/proc/task.go              |  9 +--------
 pkg/sentry/fsimpl/tmpfs/filesystem.go       | 24 ++++++++++++------------
 pkg/sentry/fsimpl/tmpfs/tmpfs.go            |  8 +++++---
 pkg/sentry/vfs/anonfs.go                    |  2 +-
 pkg/sentry/vfs/permissions.go               | 23 ++++++++++-------------
 12 files changed, 56 insertions(+), 61 deletions(-)

diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index dbe58acbe..055ac1d7c 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -287,6 +287,11 @@ func (m FileMode) ExtraBits() FileMode {
 	return m &^ (PermissionsMask | FileTypeMask)
 }
 
+// IsDir returns true if file type represents a directory.
+func (m FileMode) IsDir() bool {
+	return m.FileType() == S_IFDIR
+}
+
 // String returns a string representation of m.
 func (m FileMode) String() string {
 	var s []string
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 6962083f5..a39a37318 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -186,7 +186,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
 }
 
 func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
-	return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID())
+	return vfs.GenericCheckPermissions(creds, ats, in.diskInode.Mode(), in.diskInode.UID(), in.diskInode.GID())
 }
 
 // statTo writes the statx fields to the output parameter.
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 26b492185..1e43df9ec 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -119,7 +119,7 @@ func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
 	}
-	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 afterSymlink:
@@ -314,7 +314,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if err != nil {
 		return err
 	}
-	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	if parent.isDeleted() {
@@ -378,7 +378,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	if err != nil {
 		return err
 	}
-	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -512,7 +512,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
 	if err != nil {
 		return err
 	}
-	return d.checkPermissions(creds, ats, d.isDir())
+	return d.checkPermissions(creds, ats)
 }
 
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
@@ -528,7 +528,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 		if !d.isDir() {
 			return nil, syserror.ENOTDIR
 		}
-		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 			return nil, err
 		}
 	}
@@ -624,7 +624,7 @@ afterTrailingSymlink:
 		return nil, err
 	}
 	// Check for search permission in the parent directory.
-	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 	// Determine whether or not we need to create a file.
@@ -661,7 +661,7 @@ afterTrailingSymlink:
 // Preconditions: fs.renameMu must be locked.
 func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(opts)
-	if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil {
+	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
 	mnt := rp.Mount()
@@ -722,7 +722,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 
 // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
 func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
-	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 		return nil, err
 	}
 	if d.isDeleted() {
@@ -884,7 +884,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 			return err
 		}
 	}
-	if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+	if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	vfsObj := rp.VirtualFilesystem()
@@ -904,7 +904,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 			return syserror.EINVAL
 		}
 		if oldParent != newParent {
-			if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+			if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 				return err
 			}
 		}
@@ -915,7 +915,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	}
 
 	if oldParent != newParent {
-		if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 			return err
 		}
 		newParent.dirMu.Lock()
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 13928ce36..cf276a417 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -721,7 +721,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(ctx, creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
 		return err
 	}
 	if err := mnt.CheckBeginWrite(); err != nil {
@@ -843,8 +844,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	return nil
 }
 
-func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
-	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&d.mode))&0777, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
 // IncRef implements vfs.DentryImpl.IncRef.
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 1f735628f..a54985ef5 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -167,8 +167,8 @@ func fileFlagsFromHostFD(fd int) (int, error) {
 }
 
 // CheckPermissions implements kernfs.Inode.
-func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error {
-	return vfs.GenericCheckPermissions(creds, atx, false /* isDir */, uint16(i.mode), i.uid, i.gid)
+func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, i.mode, i.uid, i.gid)
 }
 
 // Mode implements kernfs.Inode.
@@ -306,7 +306,7 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 	if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(ctx, creds, &s, uint16(i.Mode().Permissions()), i.uid, i.gid); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, &s, i.Mode(), i.uid, i.gid); err != nil {
 		return err
 	}
 
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 75c4bab1a..bfa786c88 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -206,8 +206,7 @@ func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (l
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	fs := fd.filesystem()
 	creds := auth.CredentialsFromContext(ctx)
 	inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
-	return inode.SetStat(ctx, fs, creds, opts)
+	return inode.SetStat(ctx, fd.filesystem(), creds, opts)
 }
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index c612dcf07..5c84b10c9 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -241,7 +241,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
 	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, uint16(a.Mode().Permissions()), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
 		return err
 	}
 
@@ -273,12 +273,10 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
 
 // CheckPermissions implements Inode.CheckPermissions.
 func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
-	mode := a.Mode()
 	return vfs.GenericCheckPermissions(
 		creds,
 		ats,
-		mode.FileType() == linux.ModeDirectory,
-		uint16(mode),
+		a.Mode(),
 		auth.KUID(atomic.LoadUint32(&a.uid)),
 		auth.KGID(atomic.LoadUint32(&a.gid)),
 	)
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 49d6efb0e..aee2a4392 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -172,14 +172,7 @@ func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.S
 func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
 	mode := i.Mode()
 	uid, gid := i.getOwner(mode)
-	return vfs.GenericCheckPermissions(
-		creds,
-		ats,
-		mode.FileType() == linux.ModeDirectory,
-		uint16(mode),
-		uid,
-		gid,
-	)
+	return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid)
 }
 
 func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) {
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 75d01b853..12cc64385 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -41,7 +41,7 @@ func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
 	if !d.inode.isDir() {
 		return nil, syserror.ENOTDIR
 	}
-	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 afterSymlink:
@@ -125,7 +125,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
 	if err != nil {
 		return err
 	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
@@ -163,7 +163,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
 	if err != nil {
 		return err
 	}
-	return d.inode.checkPermissions(creds, ats, d.inode.isDir())
+	return d.inode.checkPermissions(creds, ats)
 }
 
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
@@ -178,7 +178,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 		if !d.inode.isDir() {
 			return nil, syserror.ENOTDIR
 		}
-		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil {
+		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 			return nil, err
 		}
 	}
@@ -301,7 +301,7 @@ afterTrailingSymlink:
 		return nil, err
 	}
 	// Check for search permission in the parent directory.
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 	// Reject attempts to open directories with O_CREAT.
@@ -316,7 +316,7 @@ afterTrailingSymlink:
 	child, err := stepLocked(rp, parent)
 	if err == syserror.ENOENT {
 		// Already checked for searchability above; now check for writability.
-		if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+		if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 			return nil, err
 		}
 		if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -347,7 +347,7 @@ afterTrailingSymlink:
 func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(opts)
 	if !afterCreate {
-		if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
+		if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
 	}
@@ -428,7 +428,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	defer mnt.EndWrite()
 
 	oldParent := oldParentVD.Dentry().Impl().(*dentry)
-	if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	// Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
@@ -445,7 +445,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		}
 		if oldParent != newParent {
 			// Writability is needed to change renamed's "..".
-			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil {
+			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 				return err
 			}
 		}
@@ -455,7 +455,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		}
 	}
 
-	if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	replacedVFSD := newParent.vfsd.Child(newName)
@@ -528,7 +528,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if err != nil {
 		return err
 	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
@@ -621,7 +621,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if err != nil {
 		return err
 	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 2d5070a46..2f9e6c876 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -245,8 +245,9 @@ func (i *inode) decRef() {
 	}
 }
 
-func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
-	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
+func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+	return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
 }
 
 // Go won't inline this function, and returning linux.Statx (which is quite
@@ -299,7 +300,8 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(ctx, creds, stat, uint16(atomic.LoadUint32(&i.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+	if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
 		return err
 	}
 	i.mu.Lock()
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index 925996517..a62e43589 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -83,7 +83,7 @@ func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds
 	if !rp.Done() {
 		return syserror.ENOTDIR
 	}
-	return GenericCheckPermissions(creds, ats, false /* isDir */, anonFileMode, anonFileUID, anonFileGID)
+	return GenericCheckPermissions(creds, ats, anonFileMode, anonFileUID, anonFileGID)
 }
 
 // GetDentryAt implements FilesystemImpl.GetDentryAt.
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index 2c8f23f55..f9647f90e 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -29,9 +29,9 @@ type AccessTypes uint16
 
 // Bits in AccessTypes.
 const (
+	MayExec  AccessTypes = 1
+	MayWrite AccessTypes = 2
 	MayRead  AccessTypes = 4
-	MayWrite             = 2
-	MayExec              = 1
 )
 
 // OnlyRead returns true if access _only_ allows read.
@@ -56,16 +56,17 @@ func (a AccessTypes) MayExec() bool {
 
 // GenericCheckPermissions checks that creds has the given access rights on a
 // file with the given permissions, UID, and GID, subject to the rules of
-// fs/namei.c:generic_permission(). isDir is true if the file is a directory.
-func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir bool, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+// fs/namei.c:generic_permission().
+func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
 	// Check permission bits.
-	perms := mode
+	perms := uint16(mode.Permissions())
 	if creds.EffectiveKUID == kuid {
 		perms >>= 6
 	} else if creds.InGroup(kgid) {
 		perms >>= 3
 	}
 	if uint16(ats)&perms == uint16(ats) {
+		// All permission bits match, access granted.
 		return nil
 	}
 
@@ -77,7 +78,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
 	}
 	// CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary
 	// directories, and read arbitrary non-directory files.
-	if (isDir && !ats.MayWrite()) || ats.OnlyRead() {
+	if (mode.IsDir() && !ats.MayWrite()) || ats.OnlyRead() {
 		if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) {
 			return nil
 		}
@@ -85,7 +86,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
 	// CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write
 	// access to non-directory files, and execute access to non-directory files
 	// for which at least one execute bit is set.
-	if isDir || !ats.MayExec() || (mode&0111 != 0) {
+	if mode.IsDir() || !ats.MayExec() || (mode.Permissions()&0111 != 0) {
 		if creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
 			return nil
 		}
@@ -151,7 +152,7 @@ func MayWriteFileWithOpenFlags(flags uint32) bool {
 // CheckSetStat checks that creds has permission to change the metadata of a
 // file with the given permissions, UID, and GID as specified by stat, subject
 // to the rules of Linux's fs/attr.c:setattr_prepare().
-func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
 	if stat.Mask&linux.STATX_SIZE != 0 {
 		limit, err := CheckLimit(ctx, 0, int64(stat.Size))
 		if err != nil {
@@ -190,11 +191,7 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat
 				(stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) {
 				return syserror.EPERM
 			}
-			// isDir is irrelevant in the following call to
-			// GenericCheckPermissions since ats == MayWrite means that
-			// CAP_DAC_READ_SEARCH does not apply, and CAP_DAC_OVERRIDE
-			// applies, regardless of isDir.
-			if err := GenericCheckPermissions(creds, MayWrite, false /* isDir */, mode, kuid, kgid); err != nil {
+			if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil {
 				return err
 			}
 		}
-- 
cgit v1.2.3


From c64796748c735af8b304e62d7833648b691d5a72 Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Thu, 26 Mar 2020 08:46:33 -0700
Subject: Clean up transport_demuxer.go and test

- Change receiver of endpoint lookup functions
- Remove unused struct fields and functions in test
- s/%v/%s/ for errors
- Capitalize NIC
  https://github.com/golang/go/wiki/CodeReviewComments#initialisms

PiperOrigin-RevId: 303119580
---
 pkg/tcpip/stack/transport_demuxer.go      | 239 +++++++++++++++---------------
 pkg/tcpip/stack/transport_demuxer_test.go | 115 +++++---------
 2 files changed, 160 insertions(+), 194 deletions(-)

diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index c55e3e8bc..9a33ed375 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -35,7 +35,7 @@ type protocolIDs struct {
 type transportEndpoints struct {
 	// mu protects all fields of the transportEndpoints.
 	mu        sync.RWMutex
-	endpoints map[TransportEndpointID]*endpointsByNic
+	endpoints map[TransportEndpointID]*endpointsByNIC
 	// rawEndpoints contains endpoints for raw sockets, which receive all
 	// traffic of a given protocol regardless of port.
 	rawEndpoints []RawTransportEndpoint
@@ -46,11 +46,11 @@ type transportEndpoints struct {
 func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
 	eps.mu.Lock()
 	defer eps.mu.Unlock()
-	epsByNic, ok := eps.endpoints[id]
+	epsByNIC, ok := eps.endpoints[id]
 	if !ok {
 		return
 	}
-	if !epsByNic.unregisterEndpoint(bindToDevice, ep) {
+	if !epsByNIC.unregisterEndpoint(bindToDevice, ep) {
 		return
 	}
 	delete(eps.endpoints, id)
@@ -66,18 +66,85 @@ func (eps *transportEndpoints) transportEndpoints() []TransportEndpoint {
 	return es
 }
 
-type endpointsByNic struct {
+// iterEndpointsLocked yields all endpointsByNIC in eps that match id, in
+// descending order of match quality. If a call to yield returns false,
+// iterEndpointsLocked stops iteration and returns immediately.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) iterEndpointsLocked(id TransportEndpointID, yield func(*endpointsByNIC) bool) {
+	// Try to find a match with the id as provided.
+	if ep, ok := eps.endpoints[id]; ok {
+		if !yield(ep) {
+			return
+		}
+	}
+
+	// Try to find a match with the id minus the local address.
+	nid := id
+
+	nid.LocalAddress = ""
+	if ep, ok := eps.endpoints[nid]; ok {
+		if !yield(ep) {
+			return
+		}
+	}
+
+	// Try to find a match with the id minus the remote part.
+	nid.LocalAddress = id.LocalAddress
+	nid.RemoteAddress = ""
+	nid.RemotePort = 0
+	if ep, ok := eps.endpoints[nid]; ok {
+		if !yield(ep) {
+			return
+		}
+	}
+
+	// Try to find a match with only the local port.
+	nid.LocalAddress = ""
+	if ep, ok := eps.endpoints[nid]; ok {
+		if !yield(ep) {
+			return
+		}
+	}
+}
+
+// findAllEndpointsLocked returns all endpointsByNIC in eps that match id, in
+// descending order of match quality.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) findAllEndpointsLocked(id TransportEndpointID) []*endpointsByNIC {
+	var matchedEPs []*endpointsByNIC
+	eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool {
+		matchedEPs = append(matchedEPs, ep)
+		return true
+	})
+	return matchedEPs
+}
+
+// findEndpointLocked returns the endpoint that most closely matches the given id.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) findEndpointLocked(id TransportEndpointID) *endpointsByNIC {
+	var matchedEP *endpointsByNIC
+	eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool {
+		matchedEP = ep
+		return false
+	})
+	return matchedEP
+}
+
+type endpointsByNIC struct {
 	mu        sync.RWMutex
 	endpoints map[tcpip.NICID]*multiPortEndpoint
 	// seed is a random secret for a jenkins hash.
 	seed uint32
 }
 
-func (epsByNic *endpointsByNic) transportEndpoints() []TransportEndpoint {
-	epsByNic.mu.RLock()
-	defer epsByNic.mu.RUnlock()
+func (epsByNIC *endpointsByNIC) transportEndpoints() []TransportEndpoint {
+	epsByNIC.mu.RLock()
+	defer epsByNIC.mu.RUnlock()
 	var eps []TransportEndpoint
-	for _, ep := range epsByNic.endpoints {
+	for _, ep := range epsByNIC.endpoints {
 		eps = append(eps, ep.transportEndpoints()...)
 	}
 	return eps
@@ -85,13 +152,13 @@ func (epsByNic *endpointsByNic) transportEndpoints() []TransportEndpoint {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, pkt PacketBuffer) {
-	epsByNic.mu.RLock()
+func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, pkt PacketBuffer) {
+	epsByNIC.mu.RLock()
 
-	mpep, ok := epsByNic.endpoints[r.ref.nic.ID()]
+	mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()]
 	if !ok {
-		if mpep, ok = epsByNic.endpoints[0]; !ok {
-			epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+		if mpep, ok = epsByNIC.endpoints[0]; !ok {
+			epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
 			return
 		}
 	}
@@ -100,29 +167,29 @@ func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, p
 	// endpoints bound to the right device.
 	if isMulticastOrBroadcast(id.LocalAddress) {
 		mpep.handlePacketAll(r, id, pkt)
-		epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+		epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
 		return
 	}
 	// multiPortEndpoints are guaranteed to have at least one element.
-	transEP := selectEndpoint(id, mpep, epsByNic.seed)
+	transEP := selectEndpoint(id, mpep, epsByNIC.seed)
 	if queuedProtocol, mustQueue := mpep.demux.queuedProtocols[protocolIDs{mpep.netProto, mpep.transProto}]; mustQueue {
 		queuedProtocol.QueuePacket(r, transEP, id, pkt)
-		epsByNic.mu.RUnlock()
+		epsByNIC.mu.RUnlock()
 		return
 	}
 
 	transEP.HandlePacket(r, id, pkt)
-	epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+	epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt PacketBuffer) {
-	epsByNic.mu.RLock()
-	defer epsByNic.mu.RUnlock()
+func (epsByNIC *endpointsByNIC) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt PacketBuffer) {
+	epsByNIC.mu.RLock()
+	defer epsByNIC.mu.RUnlock()
 
-	mpep, ok := epsByNic.endpoints[n.ID()]
+	mpep, ok := epsByNIC.endpoints[n.ID()]
 	if !ok {
-		mpep, ok = epsByNic.endpoints[0]
+		mpep, ok = epsByNIC.endpoints[0]
 	}
 	if !ok {
 		return
@@ -132,16 +199,16 @@ func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpoint
 	// broadcast like we are doing with handlePacket above?
 
 	// multiPortEndpoints are guaranteed to have at least one element.
-	selectEndpoint(id, mpep, epsByNic.seed).HandleControlPacket(id, typ, extra, pkt)
+	selectEndpoint(id, mpep, epsByNIC.seed).HandleControlPacket(id, typ, extra, pkt)
 }
 
 // registerEndpoint returns true if it succeeds. It fails and returns
 // false if ep already has an element with the same key.
-func (epsByNic *endpointsByNic) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
-	epsByNic.mu.Lock()
-	defer epsByNic.mu.Unlock()
+func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+	epsByNIC.mu.Lock()
+	defer epsByNIC.mu.Unlock()
 
-	multiPortEp, ok := epsByNic.endpoints[bindToDevice]
+	multiPortEp, ok := epsByNIC.endpoints[bindToDevice]
 	if !ok {
 		multiPortEp = &multiPortEndpoint{
 			demux:      d,
@@ -149,24 +216,24 @@ func (epsByNic *endpointsByNic) registerEndpoint(d *transportDemuxer, netProto t
 			transProto: transProto,
 			reuse:      reusePort,
 		}
-		epsByNic.endpoints[bindToDevice] = multiPortEp
+		epsByNIC.endpoints[bindToDevice] = multiPortEp
 	}
 
 	return multiPortEp.singleRegisterEndpoint(t, reusePort)
 }
 
-// unregisterEndpoint returns true if endpointsByNic has to be unregistered.
-func (epsByNic *endpointsByNic) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint) bool {
-	epsByNic.mu.Lock()
-	defer epsByNic.mu.Unlock()
-	multiPortEp, ok := epsByNic.endpoints[bindToDevice]
+// unregisterEndpoint returns true if endpointsByNIC has to be unregistered.
+func (epsByNIC *endpointsByNIC) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint) bool {
+	epsByNIC.mu.Lock()
+	defer epsByNIC.mu.Unlock()
+	multiPortEp, ok := epsByNIC.endpoints[bindToDevice]
 	if !ok {
 		return false
 	}
 	if multiPortEp.unregisterEndpoint(t) {
-		delete(epsByNic.endpoints, bindToDevice)
+		delete(epsByNIC.endpoints, bindToDevice)
 	}
-	return len(epsByNic.endpoints) == 0
+	return len(epsByNIC.endpoints) == 0
 }
 
 // transportDemuxer demultiplexes packets targeted at a transport endpoint
@@ -198,7 +265,7 @@ func newTransportDemuxer(stack *Stack) *transportDemuxer {
 		for proto := range stack.transportProtocols {
 			protoIDs := protocolIDs{netProto, proto}
 			d.protocol[protoIDs] = &transportEndpoints{
-				endpoints: make(map[TransportEndpointID]*endpointsByNic),
+				endpoints: make(map[TransportEndpointID]*endpointsByNIC),
 			}
 			qTransProto, isQueued := (stack.transportProtocols[proto].proto).(queuedTransportProtocol)
 			if isQueued {
@@ -378,16 +445,16 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
 	eps.mu.Lock()
 	defer eps.mu.Unlock()
 
-	epsByNic, ok := eps.endpoints[id]
+	epsByNIC, ok := eps.endpoints[id]
 	if !ok {
-		epsByNic = &endpointsByNic{
+		epsByNIC = &endpointsByNIC{
 			endpoints: make(map[tcpip.NICID]*multiPortEndpoint),
 			seed:      rand.Uint32(),
 		}
-		eps.endpoints[id] = epsByNic
+		eps.endpoints[id] = epsByNIC
 	}
 
-	return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
+	return epsByNIC.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
 }
 
 // unregisterEndpoint unregisters the endpoint with the given id such that it
@@ -413,7 +480,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 	// transport endpoints.
 	if protocol == header.UDPProtocolNumber && isMulticastOrBroadcast(id.LocalAddress) {
 		eps.mu.RLock()
-		destEPs := d.findAllEndpointsLocked(eps, id)
+		destEPs := eps.findAllEndpointsLocked(id)
 		eps.mu.RUnlock()
 		// Fail if we didn't find at least one matching transport endpoint.
 		if len(destEPs) == 0 {
@@ -439,7 +506,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 	}
 
 	eps.mu.RLock()
-	ep := d.findEndpointLocked(eps, id)
+	ep := eps.findEndpointLocked(id)
 	eps.mu.RUnlock()
 	if ep == nil {
 		if protocol == header.UDPProtocolNumber {
@@ -483,115 +550,47 @@ func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtoco
 		return false
 	}
 
-	// Try to find the endpoint.
 	eps.mu.RLock()
-	ep := d.findEndpointLocked(eps, id)
+	ep := eps.findEndpointLocked(id)
 	eps.mu.RUnlock()
-
-	// Fail if we didn't find one.
 	if ep == nil {
 		return false
 	}
 
-	// Deliver the packet.
 	ep.handleControlPacket(n, id, typ, extra, pkt)
-
 	return true
 }
 
-// iterEndpointsLocked yields all endpointsByNic in eps that match id, in
-// descending order of match quality. If a call to yield returns false,
-// iterEndpointsLocked stops iteration and returns immediately.
-//
-// Preconditions: eps.mu must be locked.
-func (d *transportDemuxer) iterEndpointsLocked(eps *transportEndpoints, id TransportEndpointID, yield func(*endpointsByNic) bool) {
-	// Try to find a match with the id as provided.
-	if ep, ok := eps.endpoints[id]; ok {
-		if !yield(ep) {
-			return
-		}
-	}
-
-	// Try to find a match with the id minus the local address.
-	nid := id
-
-	nid.LocalAddress = ""
-	if ep, ok := eps.endpoints[nid]; ok {
-		if !yield(ep) {
-			return
-		}
-	}
-
-	// Try to find a match with the id minus the remote part.
-	nid.LocalAddress = id.LocalAddress
-	nid.RemoteAddress = ""
-	nid.RemotePort = 0
-	if ep, ok := eps.endpoints[nid]; ok {
-		if !yield(ep) {
-			return
-		}
-	}
-
-	// Try to find a match with only the local port.
-	nid.LocalAddress = ""
-	if ep, ok := eps.endpoints[nid]; ok {
-		if !yield(ep) {
-			return
-		}
-	}
-}
-
-func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, id TransportEndpointID) []*endpointsByNic {
-	var matchedEPs []*endpointsByNic
-	d.iterEndpointsLocked(eps, id, func(ep *endpointsByNic) bool {
-		matchedEPs = append(matchedEPs, ep)
-		return true
-	})
-	return matchedEPs
-}
-
 // findTransportEndpoint find a single endpoint that most closely matches the provided id.
 func (d *transportDemuxer) findTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, r *Route) TransportEndpoint {
 	eps, ok := d.protocol[protocolIDs{netProto, transProto}]
 	if !ok {
 		return nil
 	}
-	// Try to find the endpoint.
+
 	eps.mu.RLock()
-	epsByNic := d.findEndpointLocked(eps, id)
-	// Fail if we didn't find one.
-	if epsByNic == nil {
+	epsByNIC := eps.findEndpointLocked(id)
+	if epsByNIC == nil {
 		eps.mu.RUnlock()
 		return nil
 	}
 
-	epsByNic.mu.RLock()
+	epsByNIC.mu.RLock()
 	eps.mu.RUnlock()
 
-	mpep, ok := epsByNic.endpoints[r.ref.nic.ID()]
+	mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()]
 	if !ok {
-		if mpep, ok = epsByNic.endpoints[0]; !ok {
-			epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+		if mpep, ok = epsByNIC.endpoints[0]; !ok {
+			epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
 			return nil
 		}
 	}
 
-	ep := selectEndpoint(id, mpep, epsByNic.seed)
-	epsByNic.mu.RUnlock()
+	ep := selectEndpoint(id, mpep, epsByNIC.seed)
+	epsByNIC.mu.RUnlock()
 	return ep
 }
 
-// findEndpointLocked returns the endpoint that most closely matches the given
-// id.
-func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, id TransportEndpointID) *endpointsByNic {
-	var matchedEP *endpointsByNic
-	d.iterEndpointsLocked(eps, id, func(ep *endpointsByNic) bool {
-		matchedEP = ep
-		return false
-	})
-	return matchedEP
-}
-
 // registerRawEndpoint registers the given endpoint with the dispatcher such
 // that packets of the appropriate protocol are delivered to it. A single
 // packet can be sent to one or more raw endpoints along with a non-raw
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 84311bcc8..75c119c99 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -40,75 +40,47 @@ const (
 )
 
 type testContext struct {
-	t       *testing.T
 	linkEps map[tcpip.NICID]*channel.Endpoint
 	s       *stack.Stack
-
-	ep tcpip.Endpoint
-	wq waiter.Queue
-}
-
-func (c *testContext) cleanup() {
-	if c.ep != nil {
-		c.ep.Close()
-	}
-}
-
-func (c *testContext) createV6Endpoint(v6only bool) {
-	var err *tcpip.Error
-	c.ep, err = c.s.NewEndpoint(udp.ProtocolNumber, ipv6.ProtocolNumber, &c.wq)
-	if err != nil {
-		c.t.Fatalf("NewEndpoint failed: %v", err)
-	}
-
-	if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, v6only); err != nil {
-		c.t.Fatalf("SetSockOpt failed: %v", err)
-	}
+	wq      waiter.Queue
 }
 
 // newDualTestContextMultiNIC creates the testing context and also linkEpIDs NICs.
 func newDualTestContextMultiNIC(t *testing.T, mtu uint32, linkEpIDs []tcpip.NICID) *testContext {
 	s := stack.New(stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}})
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+	})
 	linkEps := make(map[tcpip.NICID]*channel.Endpoint)
 	for _, linkEpID := range linkEpIDs {
 		channelEp := channel.New(256, mtu, "")
 		if err := s.CreateNIC(linkEpID, channelEp); err != nil {
-			t.Fatalf("CreateNIC failed: %v", err)
+			t.Fatalf("CreateNIC failed: %s", err)
 		}
 		linkEps[linkEpID] = channelEp
 
 		if err := s.AddAddress(linkEpID, ipv4.ProtocolNumber, stackAddr); err != nil {
-			t.Fatalf("AddAddress IPv4 failed: %v", err)
+			t.Fatalf("AddAddress IPv4 failed: %s", err)
 		}
 
 		if err := s.AddAddress(linkEpID, ipv6.ProtocolNumber, stackV6Addr); err != nil {
-			t.Fatalf("AddAddress IPv6 failed: %v", err)
+			t.Fatalf("AddAddress IPv6 failed: %s", err)
 		}
 	}
 
 	s.SetRouteTable([]tcpip.Route{
-		{
-			Destination: header.IPv4EmptySubnet,
-			NIC:         1,
-		},
-		{
-			Destination: header.IPv6EmptySubnet,
-			NIC:         1,
-		},
+		{Destination: header.IPv4EmptySubnet, NIC: 1},
+		{Destination: header.IPv6EmptySubnet, NIC: 1},
 	})
 
 	return &testContext{
-		t:       t,
 		s:       s,
 		linkEps: linkEps,
 	}
 }
 
 type headers struct {
-	srcPort uint16
-	dstPort uint16
+	srcPort, dstPort uint16
 }
 
 func newPayload() []byte {
@@ -179,15 +151,15 @@ func TestTransportDemuxerRegister(t *testing.T) {
 				t.Fatalf("%T does not implement stack.TransportEndpoint", ep)
 			}
 			if got, want := s.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{test.proto}, udp.ProtocolNumber, stack.TransportEndpointID{}, tEP, false, 0), test.want; got != want {
-				t.Fatalf("s.RegisterTransportEndpoint(...) = %v, want %v", got, want)
+				t.Fatalf("s.RegisterTransportEndpoint(...) = %s, want %s", got, want)
 			}
 		})
 	}
 }
 
-// TestReuseBindToDevice injects varied packets on input devices and checks that
+// TestBindToDeviceDistribution injects varied packets on input devices and checks that
 // the distribution of packets received matches expectations.
-func TestDistribution(t *testing.T) {
+func TestBindToDeviceDistribution(t *testing.T) {
 	type endpointSockopts struct {
 		reuse        int
 		bindToDevice tcpip.NICID
@@ -196,19 +168,19 @@ func TestDistribution(t *testing.T) {
 		name string
 		// endpoints will received the inject packets.
 		endpoints []endpointSockopts
-		// wantedDistribution is the wanted ratio of packets received on each
+		// wantDistributions is the want ratio of packets received on each
 		// endpoint for each NIC on which packets are injected.
-		wantedDistributions map[tcpip.NICID][]float64
+		wantDistributions map[tcpip.NICID][]float64
 	}{
 		{
 			"BindPortReuse",
 			// 5 endpoints that all have reuse set.
 			[]endpointSockopts{
-				{1, 0},
-				{1, 0},
-				{1, 0},
-				{1, 0},
-				{1, 0},
+				{reuse: 1, bindToDevice: 0},
+				{reuse: 1, bindToDevice: 0},
+				{reuse: 1, bindToDevice: 0},
+				{reuse: 1, bindToDevice: 0},
+				{reuse: 1, bindToDevice: 0},
 			},
 			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 get distributed evenly.
@@ -219,9 +191,9 @@ func TestDistribution(t *testing.T) {
 			"BindToDevice",
 			// 3 endpoints with various bindings.
 			[]endpointSockopts{
-				{0, 1},
-				{0, 2},
-				{0, 3},
+				{reuse: 0, bindToDevice: 1},
+				{reuse: 0, bindToDevice: 2},
+				{reuse: 0, bindToDevice: 3},
 			},
 			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 go only to the endpoint bound to dev0.
@@ -236,12 +208,12 @@ func TestDistribution(t *testing.T) {
 			"ReuseAndBindToDevice",
 			// 6 endpoints with various bindings.
 			[]endpointSockopts{
-				{1, 1},
-				{1, 1},
-				{1, 2},
-				{1, 2},
-				{1, 2},
-				{1, 0},
+				{reuse: 1, bindToDevice: 1},
+				{reuse: 1, bindToDevice: 1},
+				{reuse: 1, bindToDevice: 2},
+				{reuse: 1, bindToDevice: 2},
+				{reuse: 1, bindToDevice: 2},
+				{reuse: 1, bindToDevice: 0},
 			},
 			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 get distributed among endpoints bound to
@@ -256,16 +228,13 @@ func TestDistribution(t *testing.T) {
 		},
 	} {
 		t.Run(test.name, func(t *testing.T) {
-			for device, wantedDistribution := range test.wantedDistributions {
+			for device, wantDistribution := range test.wantDistributions {
 				t.Run(string(device), func(t *testing.T) {
 					var devices []tcpip.NICID
-					for d := range test.wantedDistributions {
+					for d := range test.wantDistributions {
 						devices = append(devices, d)
 					}
 					c := newDualTestContextMultiNIC(t, defaultMTU, devices)
-					defer c.cleanup()
-
-					c.createV6Endpoint(false)
 
 					eps := make(map[tcpip.Endpoint]int)
 
@@ -281,7 +250,7 @@ func TestDistribution(t *testing.T) {
 						var err *tcpip.Error
 						ep, err := c.s.NewEndpoint(udp.ProtocolNumber, ipv6.ProtocolNumber, &wq)
 						if err != nil {
-							c.t.Fatalf("NewEndpoint failed: %v", err)
+							t.Fatalf("NewEndpoint failed: %s", err)
 						}
 						eps[ep] = i
 
@@ -294,20 +263,20 @@ func TestDistribution(t *testing.T) {
 						defer ep.Close()
 						reusePortOption := tcpip.ReusePortOption(endpoint.reuse)
 						if err := ep.SetSockOpt(reusePortOption); err != nil {
-							c.t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %v", reusePortOption, i, err)
+							t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %s", reusePortOption, i, err)
 						}
 						bindToDeviceOption := tcpip.BindToDeviceOption(endpoint.bindToDevice)
 						if err := ep.SetSockOpt(bindToDeviceOption); err != nil {
-							c.t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %v", bindToDeviceOption, i, err)
+							t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %s", bindToDeviceOption, i, err)
 						}
 						if err := ep.Bind(tcpip.FullAddress{Addr: stackV6Addr, Port: stackPort}); err != nil {
-							t.Fatalf("ep.Bind(...) on endpoint %d failed: %v", i, err)
+							t.Fatalf("ep.Bind(...) on endpoint %d failed: %s", i, err)
 						}
 					}
 
 					npackets := 100000
 					nports := 10000
-					if got, want := len(test.endpoints), len(wantedDistribution); got != want {
+					if got, want := len(test.endpoints), len(wantDistribution); got != want {
 						t.Fatalf("got len(test.endpoints) = %d, want %d", got, want)
 					}
 					ports := make(map[uint16]tcpip.Endpoint)
@@ -322,11 +291,9 @@ func TestDistribution(t *testing.T) {
 								dstPort: stackPort},
 							device)
 
-						var addr tcpip.FullAddress
 						ep := <-pollChannel
-						_, _, err := ep.Read(&addr)
-						if err != nil {
-							c.t.Fatalf("Read on endpoint %d failed: %v", eps[ep], err)
+						if _, _, err := ep.Read(nil); err != nil {
+							t.Fatalf("Read on endpoint %d failed: %s", eps[ep], err)
 						}
 						stats[ep]++
 						if i < nports {
@@ -342,13 +309,13 @@ func TestDistribution(t *testing.T) {
 
 					// Check that a packet distribution is as expected.
 					for ep, i := range eps {
-						wantedRatio := wantedDistribution[i]
-						wantedRecv := wantedRatio * float64(npackets)
+						wantRatio := wantDistribution[i]
+						wantRecv := wantRatio * float64(npackets)
 						actualRecv := stats[ep]
 						actualRatio := float64(stats[ep]) / float64(npackets)
 						// The deviation is less than 10%.
-						if math.Abs(actualRatio-wantedRatio) > 0.05 {
-							t.Errorf("wanted about %.0f%% (%.0f of %d) packets to arrive on endpoint %d, got %.0f%% (%d of %d)", wantedRatio*100, wantedRecv, npackets, i, actualRatio*100, actualRecv, npackets)
+						if math.Abs(actualRatio-wantRatio) > 0.05 {
+							t.Errorf("want about %.0f%% (%.0f of %d) packets to arrive on endpoint %d, got %.0f%% (%d of %d)", wantRatio*100, wantRecv, npackets, i, actualRatio*100, actualRecv, npackets)
 						}
 					}
 				})
-- 
cgit v1.2.3


From bc3def43c3c30ccde6577a0af213d13e4fd17e1e Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 26 Mar 2020 10:46:47 -0700
Subject: Check error in DropTCP*Port tests and fix comment.

PiperOrigin-RevId: 303147253
---
 test/iptables/filter_input.go | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 4ccd4cce7..41e0cfa8d 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -194,14 +194,11 @@ func (FilterInputDropTCPDestPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterInputDropTCPDestPort) LocalAction(ip net.IP) error {
-	// After the container sets its DROP rule, we shouldn't be able to connect.
-	// However, we may succeed in connecting if this runs before the container
-	// sets the rule. To avoid this race, we retry connecting until
-	// sendloopDuration has elapsed, ignoring whether the connect succeeds. The
-	// test works becuase the container will error if a connection is
-	// established after the rule is set.
+	// Ensure we cannot connect to the container.
 	for start := time.Now(); time.Since(start) < sendloopDuration; {
-		connectTCP(ip, dropPort, sendloopDuration-time.Since(start))
+		if err := connectTCP(ip, dropPort, sendloopDuration-time.Since(start)); err == nil {
+			return fmt.Errorf("expected not to connect, but was able to connect on port %d", dropPort)
+		}
 	}
 
 	return nil
@@ -232,14 +229,11 @@ func (FilterInputDropTCPSrcPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterInputDropTCPSrcPort) LocalAction(ip net.IP) error {
-	// After the container sets its DROP rule, we shouldn't be able to connect.
-	// However, we may succeed in connecting if this runs before the container
-	// sets the rule. To avoid this race, we retry connecting until
-	// sendloopDuration has elapsed, ignoring whether the connect succeeds. The
-	// test works becuase the container will error if a connection is
-	// established after the rule is set.
+	// Ensure we cannot connect to the container.
 	for start := time.Now(); time.Since(start) < sendloopDuration; {
-		connectTCP(ip, acceptPort, sendloopDuration-time.Since(start))
+		if err := connectTCP(ip, acceptPort, sendloopDuration-time.Since(start)); err == nil {
+			return fmt.Errorf("expected not to connect, but was able to connect on port %d", acceptPort)
+		}
 	}
 
 	return nil
-- 
cgit v1.2.3


From d5ef8091b4aab16639116e64469db16fc36386c7 Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Thu, 26 Mar 2020 11:28:05 -0700
Subject: Add IPv4 to bind_to_device distribution test

PiperOrigin-RevId: 303156734
---
 pkg/tcpip/stack/transport_demuxer_test.go | 107 ++++++++++++++++++++++++------
 1 file changed, 86 insertions(+), 21 deletions(-)

diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 75c119c99..c65b0c632 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -31,12 +31,14 @@ import (
 )
 
 const (
-	stackV6Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
-	testV6Addr  = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+	testSrcAddrV6 = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+	testDstAddrV6 = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
 
-	stackAddr = "\x0a\x00\x00\x01"
-	stackPort = 1234
-	testPort  = 4096
+	testSrcAddrV4 = "\x0a\x00\x00\x01"
+	testDstAddrV4 = "\x0a\x00\x00\x02"
+
+	testDstPort = 1234
+	testSrcPort = 4096
 )
 
 type testContext struct {
@@ -59,11 +61,11 @@ func newDualTestContextMultiNIC(t *testing.T, mtu uint32, linkEpIDs []tcpip.NICI
 		}
 		linkEps[linkEpID] = channelEp
 
-		if err := s.AddAddress(linkEpID, ipv4.ProtocolNumber, stackAddr); err != nil {
+		if err := s.AddAddress(linkEpID, ipv4.ProtocolNumber, testDstAddrV4); err != nil {
 			t.Fatalf("AddAddress IPv4 failed: %s", err)
 		}
 
-		if err := s.AddAddress(linkEpID, ipv6.ProtocolNumber, stackV6Addr); err != nil {
+		if err := s.AddAddress(linkEpID, ipv6.ProtocolNumber, testDstAddrV6); err != nil {
 			t.Fatalf("AddAddress IPv6 failed: %s", err)
 		}
 	}
@@ -91,6 +93,47 @@ func newPayload() []byte {
 	return b
 }
 
+func (c *testContext) sendV4Packet(payload []byte, h *headers, linkEpID tcpip.NICID) {
+	buf := buffer.NewView(header.UDPMinimumSize + header.IPv4MinimumSize + len(payload))
+	payloadStart := len(buf) - len(payload)
+	copy(buf[payloadStart:], payload)
+
+	// Initialize the IP header.
+	ip := header.IPv4(buf)
+	ip.Encode(&header.IPv4Fields{
+		IHL:         header.IPv4MinimumSize,
+		TOS:         0x80,
+		TotalLength: uint16(len(buf)),
+		TTL:         65,
+		Protocol:    uint8(udp.ProtocolNumber),
+		SrcAddr:     testSrcAddrV4,
+		DstAddr:     testDstAddrV4,
+	})
+	ip.SetChecksum(^ip.CalculateChecksum())
+
+	// Initialize the UDP header.
+	u := header.UDP(buf[header.IPv4MinimumSize:])
+	u.Encode(&header.UDPFields{
+		SrcPort: h.srcPort,
+		DstPort: h.dstPort,
+		Length:  uint16(header.UDPMinimumSize + len(payload)),
+	})
+
+	// Calculate the UDP pseudo-header checksum.
+	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, testSrcAddrV4, testDstAddrV4, uint16(len(u)))
+
+	// Calculate the UDP checksum and set it.
+	xsum = header.Checksum(payload, xsum)
+	u.SetChecksum(^u.CalculateChecksum(xsum))
+
+	// Inject packet.
+	c.linkEps[linkEpID].InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
+		Data:            buf.ToVectorisedView(),
+		NetworkHeader:   buffer.View(ip),
+		TransportHeader: buffer.View(u),
+	})
+}
+
 func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NICID) {
 	// Allocate a buffer for data and headers.
 	buf := buffer.NewView(header.UDPMinimumSize + header.IPv6MinimumSize + len(payload))
@@ -102,8 +145,8 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NI
 		PayloadLength: uint16(header.UDPMinimumSize + len(payload)),
 		NextHeader:    uint8(udp.ProtocolNumber),
 		HopLimit:      65,
-		SrcAddr:       testV6Addr,
-		DstAddr:       stackV6Addr,
+		SrcAddr:       testSrcAddrV6,
+		DstAddr:       testDstAddrV6,
 	})
 
 	// Initialize the UDP header.
@@ -115,7 +158,7 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NI
 	})
 
 	// Calculate the UDP pseudo-header checksum.
-	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, testV6Addr, stackV6Addr, uint16(len(u)))
+	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, testSrcAddrV6, testDstAddrV6, uint16(len(u)))
 
 	// Calculate the UDP checksum and set it.
 	xsum = header.Checksum(payload, xsum)
@@ -123,7 +166,9 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NI
 
 	// Inject packet.
 	c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, stack.PacketBuffer{
-		Data: buf.ToVectorisedView(),
+		Data:            buf.ToVectorisedView(),
+		NetworkHeader:   buffer.View(ip),
+		TransportHeader: buffer.View(u),
 	})
 }
 
@@ -227,9 +272,12 @@ func TestBindToDeviceDistribution(t *testing.T) {
 			},
 		},
 	} {
-		t.Run(test.name, func(t *testing.T) {
+		for protoName, netProtoNum := range map[string]tcpip.NetworkProtocolNumber{
+			"IPv4": ipv4.ProtocolNumber,
+			"IPv6": ipv6.ProtocolNumber,
+		} {
 			for device, wantDistribution := range test.wantDistributions {
-				t.Run(string(device), func(t *testing.T) {
+				t.Run(test.name+protoName+string(device), func(t *testing.T) {
 					var devices []tcpip.NICID
 					for d := range test.wantDistributions {
 						devices = append(devices, d)
@@ -248,7 +296,7 @@ func TestBindToDeviceDistribution(t *testing.T) {
 						defer close(ch)
 
 						var err *tcpip.Error
-						ep, err := c.s.NewEndpoint(udp.ProtocolNumber, ipv6.ProtocolNumber, &wq)
+						ep, err := c.s.NewEndpoint(udp.ProtocolNumber, netProtoNum, &wq)
 						if err != nil {
 							t.Fatalf("NewEndpoint failed: %s", err)
 						}
@@ -269,7 +317,17 @@ func TestBindToDeviceDistribution(t *testing.T) {
 						if err := ep.SetSockOpt(bindToDeviceOption); err != nil {
 							t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %s", bindToDeviceOption, i, err)
 						}
-						if err := ep.Bind(tcpip.FullAddress{Addr: stackV6Addr, Port: stackPort}); err != nil {
+
+						var dstAddr tcpip.Address
+						switch netProtoNum {
+						case ipv4.ProtocolNumber:
+							dstAddr = testDstAddrV4
+						case ipv6.ProtocolNumber:
+							dstAddr = testDstAddrV6
+						default:
+							t.Fatalf("unexpected protocol number: %d", netProtoNum)
+						}
+						if err := ep.Bind(tcpip.FullAddress{Addr: dstAddr, Port: testDstPort}); err != nil {
 							t.Fatalf("ep.Bind(...) on endpoint %d failed: %s", i, err)
 						}
 					}
@@ -285,11 +343,18 @@ func TestBindToDeviceDistribution(t *testing.T) {
 						// Send a packet.
 						port := uint16(i % nports)
 						payload := newPayload()
-						c.sendV6Packet(payload,
-							&headers{
-								srcPort: testPort + port,
-								dstPort: stackPort},
-							device)
+						hdrs := &headers{
+							srcPort: testSrcPort + port,
+							dstPort: testDstPort,
+						}
+						switch netProtoNum {
+						case ipv4.ProtocolNumber:
+							c.sendV4Packet(payload, hdrs, device)
+						case ipv6.ProtocolNumber:
+							c.sendV6Packet(payload, hdrs, device)
+						default:
+							t.Fatalf("unexpected protocol number: %d", netProtoNum)
+						}
 
 						ep := <-pollChannel
 						if _, _, err := ep.Read(nil); err != nil {
@@ -320,6 +385,6 @@ func TestBindToDeviceDistribution(t *testing.T) {
 					}
 				})
 			}
-		})
+		}
 	}
 }
-- 
cgit v1.2.3


From 92b9069b67b927cef25a1490ebd142ad6d65690d Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Fri, 20 Mar 2020 12:00:21 -0700
Subject: Support owner matching for iptables.

This feature will match UID and GID of the packet creator, for locally
generated packets. This match is only valid in the OUTPUT and POSTROUTING
chains. Forwarded packets do not have any socket associated with them.
Packets from kernel threads do have a socket, but usually no owner.
---
 pkg/abi/linux/netfilter.go                   |  41 ++++++++
 pkg/abi/linux/netfilter_test.go              |   1 +
 pkg/sentry/kernel/task.go                    |  12 +++
 pkg/sentry/socket/netfilter/BUILD            |   1 +
 pkg/sentry/socket/netfilter/netfilter.go     |   7 +-
 pkg/sentry/socket/netfilter/owner_matcher.go | 128 ++++++++++++++++++++++++
 pkg/sentry/socket/netstack/provider.go       |   6 ++
 pkg/tcpip/network/ipv4/ipv4.go               |  15 +++
 pkg/tcpip/stack/packet_buffer.go             |   9 +-
 pkg/tcpip/stack/transport_test.go            |   2 +
 pkg/tcpip/tcpip.go                           |  12 +++
 pkg/tcpip/transport/icmp/endpoint.go         |  12 ++-
 pkg/tcpip/transport/packet/endpoint.go       |   2 +
 pkg/tcpip/transport/raw/endpoint.go          |   9 ++
 pkg/tcpip/transport/tcp/accept.go            |   5 +-
 pkg/tcpip/transport/tcp/connect.go           |  10 +-
 pkg/tcpip/transport/tcp/endpoint.go          |   7 ++
 pkg/tcpip/transport/tcp/forwarder.go         |   2 +-
 pkg/tcpip/transport/tcp/protocol.go          |   2 +-
 pkg/tcpip/transport/udp/endpoint.go          |  12 ++-
 test/iptables/filter_output.go               | 143 +++++++++++++++++++++++++++
 test/iptables/iptables_test.go               |  30 ++++++
 22 files changed, 451 insertions(+), 17 deletions(-)
 create mode 100644 pkg/sentry/socket/netfilter/owner_matcher.go

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index 80dc09aa9..a8d4f9d69 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -509,3 +509,44 @@ const (
 	// Enable all flags.
 	XT_UDP_INV_MASK = 0x03
 )
+
+// IPTOwnerInfo holds data for matching packets with owner. It corresponds
+// to struct ipt_owner_info in libxt_owner.c of iptables binary.
+type IPTOwnerInfo struct {
+	// UID is user id which created the packet.
+	UID uint32
+
+	// GID is group id which created the packet.
+	GID uint32
+
+	// PID is process id of the process which created the packet.
+	PID uint32
+
+	// SID is session id which created the packet.
+	SID uint32
+
+	// Comm is the command name which created the packet.
+	Comm [16]byte
+
+	// Match is used to match UID/GID of the socket. See the
+	// XT_OWNER_* flags below.
+	Match uint8
+
+	// Invert flips the meaning of Match field.
+	Invert uint8
+}
+
+// SizeOfIPTOwnerInfo is the size of an XTOwnerMatchInfo.
+const SizeOfIPTOwnerInfo = 34
+
+// Flags in IPTOwnerInfo.Match. Corresponding constants are in
+// include/uapi/linux/netfilter/xt_owner.h.
+const (
+	// Match the UID of the packet.
+	XT_OWNER_UID = 1 << 0
+	// Match the GID of the packet.
+	XT_OWNER_GID = 1 << 1
+	// Match if the socket exists for the packet. Forwarded
+	// packets do not have an associated socket.
+	XT_OWNER_SOCKET = 1 << 2
+)
diff --git a/pkg/abi/linux/netfilter_test.go b/pkg/abi/linux/netfilter_test.go
index 21e237f92..565dd550e 100644
--- a/pkg/abi/linux/netfilter_test.go
+++ b/pkg/abi/linux/netfilter_test.go
@@ -29,6 +29,7 @@ func TestSizes(t *testing.T) {
 		{IPTGetEntries{}, SizeOfIPTGetEntries},
 		{IPTGetinfo{}, SizeOfIPTGetinfo},
 		{IPTIP{}, SizeOfIPTIP},
+		{IPTOwnerInfo{}, SizeOfIPTOwnerInfo},
 		{IPTReplace{}, SizeOfIPTReplace},
 		{XTCounters{}, SizeOfXTCounters},
 		{XTEntryMatch{}, SizeOfXTEntryMatch},
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 8452ddf5b..d6546735e 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -863,3 +863,15 @@ func (t *Task) SetOOMScoreAdj(adj int32) error {
 	atomic.StoreInt32(&t.tg.oomScoreAdj, adj)
 	return nil
 }
+
+// UID returns t's uid.
+// TODO(gvisor.dev/issue/170): This method is not namespaced yet.
+func (t *Task) UID() uint32 {
+	return uint32(t.Credentials().EffectiveKUID)
+}
+
+// GID returns t's gid.
+// TODO(gvisor.dev/issue/170): This method is not namespaced yet.
+func (t *Task) GID() uint32 {
+	return uint32(t.Credentials().EffectiveKGID)
+}
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index e801abeb8..721094bbf 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "extensions.go",
         "netfilter.go",
+        "owner_matcher.go",
         "targets.go",
         "tcp_matcher.go",
         "udp_matcher.go",
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 55bcc3ace..878f81fd5 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -517,11 +517,10 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 	}
 
 	// TODO(gvisor.dev/issue/170): Support other chains.
-	// Since we only support modifying the INPUT chain and redirect for
-	// PREROUTING chain right now, make sure all other chains point to
-	// ACCEPT rules.
+	// Since we only support modifying the INPUT, PREROUTING and OUTPUT chain right now,
+	// make sure all other chains point to ACCEPT rules.
 	for hook, ruleIdx := range table.BuiltinChains {
-		if hook != stack.Input && hook != stack.Prerouting {
+		if hook == stack.Forward || hook == stack.Postrouting {
 			if _, ok := table.Rules[ruleIdx].Target.(stack.AcceptTarget); !ok {
 				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go
new file mode 100644
index 000000000..5949a7c29
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/owner_matcher.go
@@ -0,0 +1,128 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const matcherNameOwner = "owner"
+
+func init() {
+	registerMatchMaker(ownerMarshaler{})
+}
+
+// ownerMarshaler implements matchMaker for owner matching.
+type ownerMarshaler struct{}
+
+// name implements matchMaker.name.
+func (ownerMarshaler) name() string {
+	return matcherNameOwner
+}
+
+// marshal implements matchMaker.marshal.
+func (ownerMarshaler) marshal(mr stack.Matcher) []byte {
+	matcher := mr.(*OwnerMatcher)
+	iptOwnerInfo := linux.IPTOwnerInfo{
+		UID: matcher.uid,
+		GID: matcher.gid,
+	}
+
+	// Support for UID match.
+	// TODO(gvisor.dev/issue/170): Need to support gid match.
+	if matcher.matchUID {
+		iptOwnerInfo.Match = linux.XT_OWNER_UID
+	} else if matcher.matchGID {
+		panic("GID match is not supported.")
+	} else {
+		panic("UID match is not set.")
+	}
+
+	buf := make([]byte, 0, linux.SizeOfIPTOwnerInfo)
+	return marshalEntryMatch(matcherNameOwner, binary.Marshal(buf, usermem.ByteOrder, iptOwnerInfo))
+}
+
+// unmarshal implements matchMaker.unmarshal.
+func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) {
+	if len(buf) < linux.SizeOfIPTOwnerInfo {
+		return nil, fmt.Errorf("buf has insufficient size for owner match: %d", len(buf))
+	}
+
+	// For alignment reasons, the match's total size may
+	// exceed what's strictly necessary to hold matchData.
+	var matchData linux.IPTOwnerInfo
+	binary.Unmarshal(buf[:linux.SizeOfIPTOwnerInfo], usermem.ByteOrder, &matchData)
+	nflog("parseMatchers: parsed IPTOwnerInfo: %+v", matchData)
+
+	if matchData.Invert != 0 {
+		return nil, fmt.Errorf("invert flag is not supported for owner match")
+	}
+
+	// Support for UID match.
+	// TODO(gvisor.dev/issue/170): Need to support gid match.
+	if matchData.Match&linux.XT_OWNER_UID != linux.XT_OWNER_UID {
+		return nil, fmt.Errorf("owner match is only supported for uid")
+	}
+
+	// Check Flags.
+	var owner OwnerMatcher
+	owner.uid = matchData.UID
+	owner.gid = matchData.GID
+	owner.matchUID = true
+
+	return &owner, nil
+}
+
+type OwnerMatcher struct {
+	uid      uint32
+	gid      uint32
+	matchUID bool
+	matchGID bool
+	invert   uint8
+}
+
+// Name implements Matcher.Name.
+func (*OwnerMatcher) Name() string {
+	return matcherNameOwner
+}
+
+// Match implements Matcher.Match.
+func (om *OwnerMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
+	// Support only for OUTPUT chain.
+	// TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also.
+	if hook != stack.Output {
+		return false, true
+	}
+
+	// If the packet owner is not set, drop the packet.
+	// Support for uid match.
+	// TODO(gvisor.dev/issue/170): Need to support gid match.
+	if pkt.Owner == nil || !om.matchUID {
+		return false, true
+	}
+
+	// TODO(gvisor.dev/issue/170): Need to add tests to verify
+	// drop rule when packet UID does not match owner matcher UID.
+	if pkt.Owner.UID() != om.uid {
+		return false, false
+	}
+
+	return true, false
+}
diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go
index 5f181f017..eb090e79b 100644
--- a/pkg/sentry/socket/netstack/provider.go
+++ b/pkg/sentry/socket/netstack/provider.go
@@ -126,6 +126,12 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*
 		ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated)
 	} else {
 		ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq)
+
+		// Assign task to PacketOwner interface to get the UID and GID for
+		// iptables owner matching.
+		if e == nil {
+			ep.SetOwner(t)
+		}
 	}
 	if e != nil {
 		return nil, syserr.TranslateNetstackError(e)
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index b3ee6000e..a7d9a8b25 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -244,6 +244,14 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
 	pkt.NetworkHeader = buffer.View(ip)
 
+	// iptables filtering. All packets that reach here are locally
+	// generated.
+	ipt := e.stack.IPTables()
+	if ok := ipt.Check(stack.Output, pkt); !ok {
+		// iptables is telling us to drop the packet.
+		return nil
+	}
+
 	if r.Loop&stack.PacketLoop != 0 {
 		// The inbound path expects the network header to still be in
 		// the PacketBuffer's Data field.
@@ -280,7 +288,14 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.Pac
 		return len(pkts), nil
 	}
 
+	// iptables filtering. All packets that reach here are locally
+	// generated.
+	ipt := e.stack.IPTables()
 	for i := range pkts {
+		if ok := ipt.Check(stack.Output, pkts[i]); !ok {
+			// iptables is telling us to drop the packet.
+			continue
+		}
 		ip := e.addIPHeader(r, &pkts[i].Header, pkts[i].DataSize, params)
 		pkts[i].NetworkHeader = buffer.View(ip)
 	}
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index 9505a4e92..9367de180 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -13,7 +13,10 @@
 
 package stack
 
-import "gvisor.dev/gvisor/pkg/tcpip/buffer"
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
 
 // A PacketBuffer contains all the data of a network packet.
 //
@@ -59,6 +62,10 @@ type PacketBuffer struct {
 	// Hash is the transport layer hash of this packet. A value of zero
 	// indicates no valid hash has been set.
 	Hash uint32
+
+	// Owner is implemented by task to get the uid and gid.
+	// Only set for locally generated packets.
+	Owner tcpip.PacketOwner
 }
 
 // Clone makes a copy of pk. It clones the Data field, which creates a new
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 8ca9ac3cf..3084e6593 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -56,6 +56,8 @@ func (f *fakeTransportEndpoint) Stats() tcpip.EndpointStats {
 	return nil
 }
 
+func (f *fakeTransportEndpoint) SetOwner(owner tcpip.PacketOwner) {}
+
 func newFakeTransportEndpoint(s *stack.Stack, proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint {
 	return &fakeTransportEndpoint{stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID}
 }
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 3dc5d87d6..2ef3271f1 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -336,6 +336,15 @@ type ControlMessages struct {
 	PacketInfo IPPacketInfo
 }
 
+// PacketOwner is used to get UID and GID of the packet.
+type PacketOwner interface {
+	// UID returns UID of the packet.
+	UID() uint32
+
+	// GID returns GID of the packet.
+	GID() uint32
+}
+
 // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
 // that exposes functionality like read, write, connect, etc. to users of the
 // networking stack.
@@ -470,6 +479,9 @@ type Endpoint interface {
 
 	// Stats returns a reference to the endpoint stats.
 	Stats() EndpointStats
+
+	// SetOwner sets the task owner to the endpoint owner.
+	SetOwner(owner PacketOwner)
 }
 
 // EndpointInfo is the interface implemented by each endpoint info struct.
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 613b12ead..b007302fb 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -73,6 +73,9 @@ type endpoint struct {
 	route         stack.Route `state:"manual"`
 	ttl           uint8
 	stats         tcpip.TransportEndpointStats `state:"nosave"`
+
+	// owner is used to get uid and gid of the packet.
+	owner tcpip.PacketOwner
 }
 
 func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
@@ -133,6 +136,10 @@ func (e *endpoint) Close() {
 // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
+func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
+	e.owner = owner
+}
+
 // IPTables implements tcpip.Endpoint.IPTables.
 func (e *endpoint) IPTables() (stack.IPTables, error) {
 	return e.stack.IPTables(), nil
@@ -321,7 +328,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
-		err = send4(route, e.ID.LocalPort, v, e.ttl)
+		err = send4(route, e.ID.LocalPort, v, e.ttl, e.owner)
 
 	case header.IPv6ProtocolNumber:
 		err = send6(route, e.ID.LocalPort, v, e.ttl)
@@ -415,7 +422,7 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	}
 }
 
-func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Error {
+func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpip.PacketOwner) *tcpip.Error {
 	if len(data) < header.ICMPv4MinimumSize {
 		return tcpip.ErrInvalidEndpointState
 	}
@@ -444,6 +451,7 @@ func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 		Header:          hdr,
 		Data:            data.ToVectorisedView(),
 		TransportHeader: buffer.View(icmpv4),
+		Owner:           owner,
 	})
 }
 
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index df49d0995..23158173d 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -392,3 +392,5 @@ func (ep *endpoint) Info() tcpip.EndpointInfo {
 func (ep *endpoint) Stats() tcpip.EndpointStats {
 	return &ep.stats
 }
+
+func (ep *endpoint) SetOwner(owner tcpip.PacketOwner) {}
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 536dafd1e..337bc1c71 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -80,6 +80,9 @@ type endpoint struct {
 	// Connect(), and is valid only when conneted is true.
 	route stack.Route                  `state:"manual"`
 	stats tcpip.TransportEndpointStats `state:"nosave"`
+
+	// owner is used to get uid and gid of the packet.
+	owner tcpip.PacketOwner
 }
 
 // NewEndpoint returns a raw  endpoint for the given protocols.
@@ -159,6 +162,10 @@ func (e *endpoint) Close() {
 // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
+func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
+	e.owner = owner
+}
+
 // IPTables implements tcpip.Endpoint.IPTables.
 func (e *endpoint) IPTables() (stack.IPTables, error) {
 	return e.stack.IPTables(), nil
@@ -348,10 +355,12 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64,
 			}
 			break
 		}
+
 		hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength()))
 		if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buffer.View(payloadBytes).ToVectorisedView(),
+			Owner:  e.owner,
 		}); err != nil {
 			return 0, nil, err
 		}
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 375ca21f6..7a9dea4ac 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -276,7 +276,7 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 // and then performs the TCP 3-way handshake.
 //
 // The new endpoint is returned with e.mu held.
-func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
+func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*endpoint, *tcpip.Error) {
 	// Create new endpoint.
 	irs := s.sequenceNumber
 	isn := generateSecureISN(s.id, l.stack.Seed())
@@ -284,6 +284,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	if err != nil {
 		return nil, err
 	}
+	ep.owner = owner
 
 	// listenEP is nil when listenContext is used by tcp.Forwarder.
 	deferAccept := time.Duration(0)
@@ -414,7 +415,7 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 	}()
 	defer s.decRef()
 
-	n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{})
+	n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{}, e.owner)
 	if err != nil {
 		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 		e.stats.FailedConnectionAttempts.Increment()
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 1d245c2c6..3239a5911 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -745,7 +745,7 @@ func (e *endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOp
 
 func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error {
 	tf.txHash = e.txHash
-	if err := sendTCP(r, tf, data, gso); err != nil {
+	if err := sendTCP(r, tf, data, gso, e.owner); err != nil {
 		e.stats.SendErrors.SegmentSendToNetworkFailed.Increment()
 		return err
 	}
@@ -787,7 +787,7 @@ func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *sta
 	}
 }
 
-func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error {
+func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error {
 	optLen := len(tf.opts)
 	if tf.rcvWnd > 0xffff {
 		tf.rcvWnd = 0xffff
@@ -816,6 +816,7 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso
 		pkts[i].DataSize = packetSize
 		pkts[i].Data = data
 		pkts[i].Hash = tf.txHash
+		pkts[i].Owner = owner
 		buildTCPHdr(r, tf, &pkts[i], gso)
 		off += packetSize
 		tf.seq = tf.seq.Add(seqnum.Size(packetSize))
@@ -833,14 +834,14 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso
 
 // sendTCP sends a TCP segment with the provided options via the provided
 // network endpoint and under the provided identity.
-func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error {
+func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error {
 	optLen := len(tf.opts)
 	if tf.rcvWnd > 0xffff {
 		tf.rcvWnd = 0xffff
 	}
 
 	if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() {
-		return sendTCPBatch(r, tf, data, gso)
+		return sendTCPBatch(r, tf, data, gso, owner)
 	}
 
 	pkt := stack.PacketBuffer{
@@ -849,6 +850,7 @@ func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stac
 		DataSize:   data.Size(),
 		Data:       data,
 		Hash:       tf.txHash,
+		Owner:      owner,
 	}
 	buildTCPHdr(r, tf, &pkt, gso)
 
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 1ebee0cfe..9b123e968 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -603,6 +603,9 @@ type endpoint struct {
 	// txHash is the transport layer hash to be set on outbound packets
 	// emitted by this endpoint.
 	txHash uint32
+
+	// owner is used to get uid and gid of the packet.
+	owner tcpip.PacketOwner
 }
 
 // UniqueID implements stack.TransportEndpoint.UniqueID.
@@ -1132,6 +1135,10 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 	e.rcvListMu.Unlock()
 }
 
+func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
+	e.owner = owner
+}
+
 // IPTables implements tcpip.Endpoint.IPTables.
 func (e *endpoint) IPTables() (stack.IPTables, error) {
 	return e.stack.IPTables(), nil
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index a094471b8..808410c92 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -157,7 +157,7 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint,
 		TSVal:         r.synOptions.TSVal,
 		TSEcr:         r.synOptions.TSEcr,
 		SACKPermitted: r.synOptions.SACKPermitted,
-	}, queue)
+	}, queue, nil)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 1377107ca..dce9a1652 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -199,7 +199,7 @@ func replyWithReset(s *segment) {
 		seq:    seq,
 		ack:    ack,
 		rcvWnd: 0,
-	}, buffer.VectorisedView{}, nil /* gso */)
+	}, buffer.VectorisedView{}, nil /* gso */, nil /* PacketOwner */)
 }
 
 // SetOption implements stack.TransportProtocol.SetOption.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index a3372ac58..120d3baa3 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -143,6 +143,9 @@ type endpoint struct {
 
 	// TODO(b/142022063): Add ability to save and restore per endpoint stats.
 	stats tcpip.TransportEndpointStats `state:"nosave"`
+
+	// owner is used to get uid and gid of the packet.
+	owner tcpip.PacketOwner
 }
 
 // +stateify savable
@@ -484,7 +487,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		useDefaultTTL = false
 	}
 
-	if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS); err != nil {
+	if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS, e.owner); err != nil {
 		return 0, nil, err
 	}
 	return int64(len(v)), nil, nil
@@ -886,7 +889,7 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 
 // sendUDP sends a UDP segment via the provided network endpoint and under the
 // provided identity.
-func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort uint16, ttl uint8, useDefaultTTL bool, tos uint8) *tcpip.Error {
+func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort uint16, ttl uint8, useDefaultTTL bool, tos uint8, owner tcpip.PacketOwner) *tcpip.Error {
 	// Allocate a buffer for the UDP header.
 	hdr := buffer.NewPrependable(header.UDPMinimumSize + int(r.MaxHeaderLength()))
 
@@ -916,6 +919,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 		Header:          hdr,
 		Data:            data,
 		TransportHeader: buffer.View(udp),
+		Owner:           owner,
 	}); err != nil {
 		r.Stats().UDP.PacketSendErrors.Increment()
 		return err
@@ -1356,3 +1360,7 @@ func (*endpoint) Wait() {}
 func isBroadcastOrMulticast(a tcpip.Address) bool {
 	return a == header.IPv4Broadcast || header.IsV4MulticastAddress(a) || header.IsV6MulticastAddress(a)
 }
+
+func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
+	e.owner = owner
+}
diff --git a/test/iptables/filter_output.go b/test/iptables/filter_output.go
index 4582d514c..f6d974b85 100644
--- a/test/iptables/filter_output.go
+++ b/test/iptables/filter_output.go
@@ -24,6 +24,11 @@ func init() {
 	RegisterTestCase(FilterOutputDropTCPSrcPort{})
 	RegisterTestCase(FilterOutputDestination{})
 	RegisterTestCase(FilterOutputInvertDestination{})
+	RegisterTestCase(FilterOutputAcceptTCPOwner{})
+	RegisterTestCase(FilterOutputDropTCPOwner{})
+	RegisterTestCase(FilterOutputAcceptUDPOwner{})
+	RegisterTestCase(FilterOutputDropUDPOwner{})
+	RegisterTestCase(FilterOutputOwnerFail{})
 }
 
 // FilterOutputDropTCPDestPort tests that connections are not accepted on
@@ -90,6 +95,144 @@ func (FilterOutputDropTCPSrcPort) LocalAction(ip net.IP) error {
 	return nil
 }
 
+// FilterOutputAcceptTCPOwner tests that TCP connections from uid owner are accepted.
+type FilterOutputAcceptTCPOwner struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputAcceptTCPOwner) Name() string {
+	return "FilterOutputAcceptTCPOwner"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputAcceptTCPOwner) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "tcp", "-m", "owner", "--uid-owner", "root", "-j", "ACCEPT"); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on accept port.
+	if err := listenTCP(acceptPort, sendloopDuration); err != nil {
+		return fmt.Errorf("connection on port %d should be accepted, but got dropped", acceptPort)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputAcceptTCPOwner) LocalAction(ip net.IP) error {
+	if err := connectTCP(ip, acceptPort, sendloopDuration); err != nil {
+		return fmt.Errorf("connection destined to port %d should be accepted, but got dropped", acceptPort)
+	}
+
+	return nil
+}
+
+// FilterOutputDropTCPOwner tests that TCP connections from uid owner are dropped.
+type FilterOutputDropTCPOwner struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputDropTCPOwner) Name() string {
+	return "FilterOutputDropTCPOwner"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputDropTCPOwner) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "tcp", "-m", "owner", "--uid-owner", "root", "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on accept port.
+	if err := listenTCP(acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection on port %d should be dropped, but got accepted", acceptPort)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputDropTCPOwner) LocalAction(ip net.IP) error {
+	if err := connectTCP(ip, acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection destined to port %d should be dropped, but got accepted", acceptPort)
+	}
+
+	return nil
+}
+
+// FilterOutputAcceptUDPOwner tests that UDP packets from uid owner are accepted.
+type FilterOutputAcceptUDPOwner struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputAcceptUDPOwner) Name() string {
+	return "FilterOutputAcceptUDPOwner"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputAcceptUDPOwner) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "udp", "-m", "owner", "--uid-owner", "root", "-j", "ACCEPT"); err != nil {
+		return err
+	}
+
+	// Send UDP packets on acceptPort.
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputAcceptUDPOwner) LocalAction(ip net.IP) error {
+	// Listen for UDP packets on acceptPort.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// FilterOutputDropUDPOwner tests that UDP packets from uid owner are dropped.
+type FilterOutputDropUDPOwner struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputDropUDPOwner) Name() string {
+	return "FilterOutputDropUDPOwner"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputDropUDPOwner) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "udp", "-m", "owner", "--uid-owner", "root", "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Send UDP packets on dropPort.
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputDropUDPOwner) LocalAction(ip net.IP) error {
+	// Listen for UDP packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets should not be received")
+	}
+
+	return nil
+}
+
+// FilterOutputOwnerFail tests that without uid/gid option, owner rule
+// will fail.
+type FilterOutputOwnerFail struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputOwnerFail) Name() string {
+	return "FilterOutputOwnerFail"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputOwnerFail) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "udp", "-m", "owner", "-j", "ACCEPT"); err == nil {
+		return fmt.Errorf("Invalid argument")
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputOwnerFail) LocalAction(ip net.IP) error {
+	// no-op.
+	return nil
+}
+
 // FilterOutputDestination tests that we can selectively allow packets to
 // certain destinations.
 type FilterOutputDestination struct{}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 7f1f70606..493d69052 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -274,6 +274,36 @@ func TestFilterOutputDropTCPSrcPort(t *testing.T) {
 	}
 }
 
+func TestFilterOutputAcceptTCPOwner(t *testing.T) {
+	if err := singleTest(FilterOutputAcceptTCPOwner{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterOutputDropTCPOwner(t *testing.T) {
+	if err := singleTest(FilterOutputDropTCPOwner{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterOutputAcceptUDPOwner(t *testing.T) {
+	if err := singleTest(FilterOutputAcceptUDPOwner{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterOutputDropUDPOwner(t *testing.T) {
+	if err := singleTest(FilterOutputDropUDPOwner{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterOutputOwnerFail(t *testing.T) {
+	if err := singleTest(FilterOutputOwnerFail{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
 func TestJumpSerialize(t *testing.T) {
 	if err := singleTest(FilterInputSerializeJump{}); err != nil {
 		t.Fatal(err)
-- 
cgit v1.2.3


From a5742f177af1758d9e7b65bfbf11af297960817b Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 26 Mar 2020 12:29:14 -0700
Subject: Add nogo exemption for machine_arm64_unsafe.go

---
 tools/nogo.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/nogo.json b/tools/nogo.json
index ff369be6f..2b4c6d3b6 100644
--- a/tools/nogo.json
+++ b/tools/nogo.json
@@ -81,6 +81,7 @@
       "/pkg/gohacks/gohacks_unsafe.go": "allowed: special case",
       "/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go": "allowed: special case",
       "/pkg/sentry/platform/kvm/(bluepill|machine)_unsafe.go": "allowed: special case",
+      "/pkg/sentry/platform/kvm/machine_arm64_unsafe.go": "fix: gvisor.dev/issue/22464",
       "/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go": "allowed: special case",
       "/pkg/sentry/platform/safecopy/safecopy_unsafe.go": "allowed: special case",
       "/pkg/sentry/vfs/mount_unsafe.go": "allowed: special case"
-- 
cgit v1.2.3


From e466ab04a20731ebeb8a9725d808def975d4c88d Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Thu, 26 Mar 2020 13:45:02 -0700
Subject: Add unique ID to Mount type.

Analagous to Linux's mount.mnt_id. This ID is displayed in
/proc/[pid]/mountinfo.

PiperOrigin-RevId: 303185564
---
 pkg/sentry/vfs/mount.go | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 05f6233f9..4b68cabda 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -24,6 +24,9 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// lastMountID is used to allocate mount ids. Must be accessed atomically.
+var lastMountID uint64
+
 // A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
 // (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
 // (Mount.fs), which applies to path resolution in the context of a particular
@@ -48,6 +51,9 @@ type Mount struct {
 	fs   *Filesystem
 	root *Dentry
 
+	// ID is the immutable mount ID.
+	ID uint64
+
 	// key is protected by VirtualFilesystem.mountMu and
 	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
 	// key.parent and key.point if they are not nil.
@@ -87,6 +93,7 @@ type Mount struct {
 
 func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
 	mnt := &Mount{
+		ID:    atomic.AddUint64(&lastMountID, 1),
 		vfs:   vfs,
 		fs:    fs,
 		root:  root,
-- 
cgit v1.2.3


From fbe80460a7eb34147b928fa1023b28a3c094c070 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 26 Mar 2020 14:04:28 -0700
Subject: Handle IPv6 Fragment & Routing extension headers

Enables the reassembly of fragmented IPv6 packets and handling of the
Routing extension header with a Segments Left value of 0. Atomic
fragments are handled as described in RFC 6946 to not interfere with
"normal" fragment traffic. No specific routing header type is supported.

Note, the stack does not yet support sending ICMPv6 error messages in
response to IPv6 packets that cannot be handled/parsed. That will come
in a later change (Issue #2211).

Test:
- header_test.TestIPv6RoutingExtHdr
- header_test.TestIPv6FragmentExtHdr
- header_test.TestIPv6ExtHdrIterErr
- header_test.TestIPv6ExtHdrIter
- ipv6_test.TestReceiveIPv6ExtHdrs
- ipv6_test.TestReceiveIPv6Fragments

RELNOTES: n/a
PiperOrigin-RevId: 303189584
---
 pkg/tcpip/buffer/view.go                        |  20 +
 pkg/tcpip/header/BUILD                          |   3 +
 pkg/tcpip/header/ipv6_extension_headers.go      | 344 +++++++++++
 pkg/tcpip/header/ipv6_extension_headers_test.go | 515 ++++++++++++++++
 pkg/tcpip/network/hash/hash.go                  |   4 +-
 pkg/tcpip/network/ipv6/BUILD                    |   3 +
 pkg/tcpip/network/ipv6/ipv6.go                  | 122 +++-
 pkg/tcpip/network/ipv6/ipv6_test.go             | 768 ++++++++++++++++++++++++
 8 files changed, 1772 insertions(+), 7 deletions(-)
 create mode 100644 pkg/tcpip/header/ipv6_extension_headers.go
 create mode 100644 pkg/tcpip/header/ipv6_extension_headers_test.go

diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go
index 17e94c562..8d42cd066 100644
--- a/pkg/tcpip/buffer/view.go
+++ b/pkg/tcpip/buffer/view.go
@@ -15,6 +15,10 @@
 // Package buffer provides the implementation of a buffer view.
 package buffer
 
+import (
+	"bytes"
+)
+
 // View is a slice of a buffer, with convenience methods.
 type View []byte
 
@@ -45,6 +49,13 @@ func (v *View) CapLength(length int) {
 	*v = (*v)[:length:length]
 }
 
+// Reader returns a bytes.Reader for v.
+func (v *View) Reader() bytes.Reader {
+	var r bytes.Reader
+	r.Reset(*v)
+	return r
+}
+
 // ToVectorisedView returns a VectorisedView containing the receiver.
 func (v View) ToVectorisedView() VectorisedView {
 	return NewVectorisedView(len(v), []View{v})
@@ -162,3 +173,12 @@ func (vv *VectorisedView) AppendView(v View) {
 	vv.views = append(vv.views, v)
 	vv.size += len(v)
 }
+
+// Readers returns a bytes.Reader for each of vv's views.
+func (vv *VectorisedView) Readers() []bytes.Reader {
+	readers := make([]bytes.Reader, 0, len(vv.views))
+	for _, v := range vv.views {
+		readers = append(readers, v.Reader())
+	}
+	return readers
+}
diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index 9da0d71f8..7094f3f0b 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -14,6 +14,7 @@ go_library(
         "interfaces.go",
         "ipv4.go",
         "ipv6.go",
+        "ipv6_extension_headers.go",
         "ipv6_fragment.go",
         "ndp_neighbor_advert.go",
         "ndp_neighbor_solicit.go",
@@ -55,11 +56,13 @@ go_test(
     size = "small",
     srcs = [
         "eth_test.go",
+        "ipv6_extension_headers_test.go",
         "ndp_test.go",
     ],
     library = ":header",
     deps = [
         "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
         "@com_github_google_go-cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/tcpip/header/ipv6_extension_headers.go b/pkg/tcpip/header/ipv6_extension_headers.go
new file mode 100644
index 000000000..b8866d4d2
--- /dev/null
+++ b/pkg/tcpip/header/ipv6_extension_headers.go
@@ -0,0 +1,344 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"bufio"
+	"encoding/binary"
+	"fmt"
+	"io"
+
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+// IPv6ExtensionHeaderIdentifier is an IPv6 extension header identifier.
+type IPv6ExtensionHeaderIdentifier uint8
+
+const (
+	// IPv6RoutingExtHdrIdentifier is the header identifier of a Routing extension
+	// header, as per RFC 8200 section 4.4.
+	IPv6RoutingExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 43
+
+	// IPv6FragmentExtHdrIdentifier is the header identifier of a Fragment
+	// extension header, as per RFC 8200 section 4.5.
+	IPv6FragmentExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 44
+
+	// IPv6NoNextHeaderIdentifier is the header identifier used to signify the end
+	// of an IPv6 payload, as per RFC 8200 section 4.7.
+	IPv6NoNextHeaderIdentifier IPv6ExtensionHeaderIdentifier = 59
+)
+
+const (
+	// ipv6RoutingExtHdrSegmentsLeftIdx is the index to the Segments Left field
+	// within an IPv6RoutingExtHdr.
+	ipv6RoutingExtHdrSegmentsLeftIdx = 1
+
+	// ipv6FragmentExtHdrFragmentOffsetOffset is the offset to the start of the
+	// Fragment Offset field within an IPv6FragmentExtHdr.
+	ipv6FragmentExtHdrFragmentOffsetOffset = 0
+
+	// ipv6FragmentExtHdrFragmentOffsetShift is the least significant bits to
+	// discard from the Fragment Offset.
+	ipv6FragmentExtHdrFragmentOffsetShift = 3
+
+	// ipv6FragmentExtHdrFlagsIdx is the index to the flags field within an
+	// IPv6FragmentExtHdr.
+	ipv6FragmentExtHdrFlagsIdx = 1
+
+	// ipv6FragmentExtHdrMFlagMask is the mask of the More (M) flag within the
+	// flags field of an IPv6FragmentExtHdr.
+	ipv6FragmentExtHdrMFlagMask = 1
+
+	// ipv6FragmentExtHdrIdentificationOffset is the offset to the Identification
+	// field within an IPv6FragmentExtHdr.
+	ipv6FragmentExtHdrIdentificationOffset = 2
+
+	// ipv6ExtHdrLenBytesPerUnit is the unit size of an extension header's length
+	// field. That is, given a Length field of 2, the extension header expects
+	// 16 bytes following the first 8 bytes (see ipv6ExtHdrLenBytesExcluded for
+	// details about the first 8 bytes' exclusion from the Length field).
+	ipv6ExtHdrLenBytesPerUnit = 8
+
+	// ipv6ExtHdrLenBytesExcluded is the number of bytes excluded from an
+	// extension header's Length field following the Length field.
+	//
+	// The Length field excludes the first 8 bytes, but the Next Header and Length
+	// field take up the first 2 of the 8 bytes so we expect (at minimum) 6 bytes
+	// after the Length field.
+	//
+	// This ensures that every extension header is at least 8 bytes.
+	ipv6ExtHdrLenBytesExcluded = 6
+
+	// IPv6FragmentExtHdrFragmentOffsetBytesPerUnit is the unit size of a Fragment
+	// extension header's Fragment Offset field. That is, given a Fragment Offset
+	// of 2, the extension header is indiciating that the fragment's payload
+	// starts at the 16th byte in the reassembled packet.
+	IPv6FragmentExtHdrFragmentOffsetBytesPerUnit = 8
+)
+
+// IPv6PayloadHeader is implemented by the various headers that can be found
+// in an IPv6 payload.
+//
+// These headers include IPv6 extension headers or upper layer data.
+type IPv6PayloadHeader interface {
+	isIPv6PayloadHeader()
+}
+
+// IPv6RawPayloadHeader the remainder of an IPv6 payload after an iterator
+// encounters a Next Header field it does not recognize as an IPv6 extension
+// header.
+type IPv6RawPayloadHeader struct {
+	Identifier IPv6ExtensionHeaderIdentifier
+	Buf        buffer.VectorisedView
+}
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6RawPayloadHeader) isIPv6PayloadHeader() {}
+
+// IPv6RoutingExtHdr is a buffer holding the Routing extension header specific
+// data as outlined in RFC 8200 section 4.4.
+type IPv6RoutingExtHdr []byte
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6RoutingExtHdr) isIPv6PayloadHeader() {}
+
+// SegmentsLeft returns the Segments Left field.
+func (b IPv6RoutingExtHdr) SegmentsLeft() uint8 {
+	return b[ipv6RoutingExtHdrSegmentsLeftIdx]
+}
+
+// IPv6FragmentExtHdr is a buffer holding the Fragment extension header specific
+// data as outlined in RFC 8200 section 4.5.
+//
+// Note, the buffer does not include the Next Header and Reserved fields.
+type IPv6FragmentExtHdr [6]byte
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6FragmentExtHdr) isIPv6PayloadHeader() {}
+
+// FragmentOffset returns the Fragment Offset field.
+//
+// This value indicates where the buffer following the Fragment extension header
+// starts in the target (reassembled) packet.
+func (b IPv6FragmentExtHdr) FragmentOffset() uint16 {
+	return binary.BigEndian.Uint16(b[ipv6FragmentExtHdrFragmentOffsetOffset:]) >> ipv6FragmentExtHdrFragmentOffsetShift
+}
+
+// More returns the More (M) flag.
+//
+// This indicates whether any fragments are expected to succeed b.
+func (b IPv6FragmentExtHdr) More() bool {
+	return b[ipv6FragmentExtHdrFlagsIdx]&ipv6FragmentExtHdrMFlagMask != 0
+}
+
+// ID returns the Identification field.
+//
+// This value is used to uniquely identify the packet, between a
+// souce and destination.
+func (b IPv6FragmentExtHdr) ID() uint32 {
+	return binary.BigEndian.Uint32(b[ipv6FragmentExtHdrIdentificationOffset:])
+}
+
+// IPv6PayloadIterator is an iterator over the contents of an IPv6 payload.
+//
+// The IPv6 payload may contain IPv6 extension headers before any upper layer
+// data.
+//
+// Note, between when an IPv6PayloadIterator is obtained and last used, no
+// changes to the payload may happen. Doing so may cause undefined and
+// unexpected behaviour. It is fine to obtain an IPv6PayloadIterator, iterate
+// over the first few headers then modify the backing payload so long as the
+// IPv6PayloadIterator obtained before modification is no longer used.
+type IPv6PayloadIterator struct {
+	// The identifier of the next header to parse.
+	nextHdrIdentifier IPv6ExtensionHeaderIdentifier
+
+	// reader is an io.Reader over payload.
+	reader  bufio.Reader
+	payload buffer.VectorisedView
+
+	// Indicates to the iterator that it should return the remaining payload as a
+	// raw payload on the next call to Next.
+	forceRaw bool
+}
+
+// MakeIPv6PayloadIterator returns an iterator over the IPv6 payload containing
+// extension headers, or a raw payload if the payload cannot be parsed.
+func MakeIPv6PayloadIterator(nextHdrIdentifier IPv6ExtensionHeaderIdentifier, payload buffer.VectorisedView, check bool) (IPv6PayloadIterator, error) {
+	readers := payload.Readers()
+	readerPs := make([]io.Reader, 0, len(readers))
+	for i := range readers {
+		readerPs = append(readerPs, &readers[i])
+	}
+
+	// We need a buffer of size 1 for calls to bufio.Reader.ReadByte.
+	reader := *bufio.NewReaderSize(io.MultiReader(readerPs...), 1)
+
+	it := IPv6PayloadIterator{
+		nextHdrIdentifier: nextHdrIdentifier,
+		payload:           payload.Clone(nil),
+		reader:            reader,
+	}
+
+	var err error
+
+	if check {
+		for {
+			var done bool
+			if _, done, err = it.Next(); err != nil || done {
+				break
+			}
+		}
+
+		// Reset it (and its underlying readers) before returning it.
+		for i := range readers {
+			readers[i].Seek(0, io.SeekStart)
+		}
+		reader.Reset(io.MultiReader(readerPs...))
+		it = IPv6PayloadIterator{
+			nextHdrIdentifier: nextHdrIdentifier,
+			payload:           payload.Clone(nil),
+			reader:            reader,
+		}
+	}
+
+	return it, err
+}
+
+// AsRawHeader returns the remaining payload of i as a raw header and
+// completes the iterator.
+//
+// Calls to Next after calling AsRawHeader on i will indicate that the
+// iterator is done.
+func (i *IPv6PayloadIterator) AsRawHeader() IPv6RawPayloadHeader {
+	buf := i.payload
+	identifier := i.nextHdrIdentifier
+
+	// Mark i as done.
+	*i = IPv6PayloadIterator{
+		nextHdrIdentifier: IPv6NoNextHeaderIdentifier,
+	}
+
+	return IPv6RawPayloadHeader{Identifier: identifier, Buf: buf}
+}
+
+// Next returns the next item in the payload.
+//
+// If the next item is not a known IPv6 extension header, IPv6RawPayloadHeader
+// will be returned with the remaining bytes and next header identifier.
+//
+// The return is of the format (header, done, error). done will be true when
+// Next is unable to return anything because the iterator has reached the end of
+// the payload, or an error occured.
+func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) {
+	// We could be forced to return i as a raw header when the previous header was
+	// a fragment extension header as the data following the fragment extension
+	// header may not be complete.
+	if i.forceRaw {
+		return i.AsRawHeader(), false, nil
+	}
+
+	// Is the header we are parsing a known extension header?
+	switch i.nextHdrIdentifier {
+	case IPv6RoutingExtHdrIdentifier:
+		nextHdrIdentifier, bytes, err := i.nextHeaderData(false /* fragmentHdr */, nil)
+		if err != nil {
+			return nil, true, err
+		}
+
+		i.nextHdrIdentifier = nextHdrIdentifier
+		return IPv6RoutingExtHdr(bytes), false, nil
+	case IPv6FragmentExtHdrIdentifier:
+		var data [6]byte
+		// We ignore the returned bytes becauase we know the fragment extension
+		// header specific data will fit in data.
+		nextHdrIdentifier, _, err := i.nextHeaderData(true /* fragmentHdr */, data[:])
+		if err != nil {
+			return nil, true, err
+		}
+
+		fragmentExtHdr := IPv6FragmentExtHdr(data)
+
+		// If the packet is a fragmented packet, do not attempt to parse
+		// anything after the fragment extension header as the data following
+		// the extension header may not be complete.
+		if fragmentExtHdr.More() || fragmentExtHdr.FragmentOffset() != 0 {
+			i.forceRaw = true
+		}
+
+		i.nextHdrIdentifier = nextHdrIdentifier
+		return fragmentExtHdr, false, nil
+	case IPv6NoNextHeaderIdentifier:
+		// This indicates the end of the IPv6 payload.
+		return nil, true, nil
+
+	default:
+		// The header we are parsing is not a known extension header. Return the
+		// raw payload.
+		return i.AsRawHeader(), false, nil
+	}
+}
+
+// nextHeaderData returns the extension header's Next Header field and raw data.
+//
+// fragmentHdr indicates that the extension header being parsed is the Fragment
+// extension header so the Length field should be ignored as it is Reserved
+// for the Fragment extension header.
+//
+// If bytes is not nil, extension header specific data will be read into bytes
+// if it has enough capacity. If bytes is provided but does not have enough
+// capacity for the data, nextHeaderData will panic.
+func (i *IPv6PayloadIterator) nextHeaderData(fragmentHdr bool, bytes []byte) (IPv6ExtensionHeaderIdentifier, []byte, error) {
+	// We ignore the number of bytes read because we know we will only ever read
+	// at max 1 bytes since rune has a length of 1. If we read 0 bytes, the Read
+	// would return io.EOF to indicate that io.Reader has reached the end of the
+	// payload.
+	nextHdrIdentifier, err := i.reader.ReadByte()
+	i.payload.TrimFront(1)
+	if err != nil {
+		return 0, nil, fmt.Errorf("error when reading the Next Header field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
+	}
+
+	var length uint8
+	length, err = i.reader.ReadByte()
+	i.payload.TrimFront(1)
+	if err != nil {
+		var ret error
+		if fragmentHdr {
+			ret = fmt.Errorf("error when reading the Length field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
+		} else {
+			ret = fmt.Errorf("error when reading the Reserved field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
+		}
+		return 0, nil, ret
+	}
+	if fragmentHdr {
+		length = 0
+	}
+
+	bytesLen := int(length)*ipv6ExtHdrLenBytesPerUnit + ipv6ExtHdrLenBytesExcluded
+	if bytes == nil {
+		bytes = make([]byte, bytesLen)
+	} else if n := len(bytes); n < bytesLen {
+		panic(fmt.Sprintf("bytes only has space for %d bytes but need space for %d bytes (length = %d) for extension header with id = %d", n, bytesLen, length, i.nextHdrIdentifier))
+	}
+
+	n, err := io.ReadFull(&i.reader, bytes)
+	i.payload.TrimFront(n)
+	if err != nil {
+		return 0, nil, fmt.Errorf("read %d out of %d extension header data bytes (length = %d) for header with id = %d: %w", n, bytesLen, length, i.nextHdrIdentifier, err)
+	}
+
+	return IPv6ExtensionHeaderIdentifier(nextHdrIdentifier), bytes, nil
+}
diff --git a/pkg/tcpip/header/ipv6_extension_headers_test.go b/pkg/tcpip/header/ipv6_extension_headers_test.go
new file mode 100644
index 000000000..4bfdc77c4
--- /dev/null
+++ b/pkg/tcpip/header/ipv6_extension_headers_test.go
@@ -0,0 +1,515 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"bytes"
+	"errors"
+	"io"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+// Equal returns true of a and b are equivalent.
+//
+// Note, Equal will return true if a and b hold the same Identifier value and
+// contain the same bytes in Buf, even if the bytes are split across views
+// differently.
+//
+// Needed to use cmp.Equal on IPv6RawPayloadHeader as it contains unexported
+// fields.
+func (a IPv6RawPayloadHeader) Equal(b IPv6RawPayloadHeader) bool {
+	return a.Identifier == b.Identifier && bytes.Equal(a.Buf.ToView(), b.Buf.ToView())
+}
+
+func TestIPv6RoutingExtHdr(t *testing.T) {
+	tests := []struct {
+		name         string
+		bytes        []byte
+		segmentsLeft uint8
+	}{
+		{
+			name:         "Zeroes",
+			bytes:        []byte{0, 0, 0, 0, 0, 0},
+			segmentsLeft: 0,
+		},
+		{
+			name:         "Ones",
+			bytes:        []byte{1, 1, 1, 1, 1, 1},
+			segmentsLeft: 1,
+		},
+		{
+			name:         "Mixed",
+			bytes:        []byte{1, 2, 3, 4, 5, 6},
+			segmentsLeft: 2,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			extHdr := IPv6RoutingExtHdr(test.bytes)
+			if got := extHdr.SegmentsLeft(); got != test.segmentsLeft {
+				t.Errorf("got SegmentsLeft() = %d, want = %d", got, test.segmentsLeft)
+			}
+		})
+	}
+}
+
+func TestIPv6FragmentExtHdr(t *testing.T) {
+	tests := []struct {
+		name           string
+		bytes          [6]byte
+		fragmentOffset uint16
+		more           bool
+		id             uint32
+	}{
+		{
+			name:           "Zeroes",
+			bytes:          [6]byte{0, 0, 0, 0, 0, 0},
+			fragmentOffset: 0,
+			more:           false,
+			id:             0,
+		},
+		{
+			name:           "Ones",
+			bytes:          [6]byte{0, 9, 0, 0, 0, 1},
+			fragmentOffset: 1,
+			more:           true,
+			id:             1,
+		},
+		{
+			name:           "Mixed",
+			bytes:          [6]byte{68, 9, 128, 4, 2, 1},
+			fragmentOffset: 2177,
+			more:           true,
+			id:             2147746305,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			extHdr := IPv6FragmentExtHdr(test.bytes)
+			if got := extHdr.FragmentOffset(); got != test.fragmentOffset {
+				t.Errorf("got FragmentOffset() = %d, want = %d", got, test.fragmentOffset)
+			}
+			if got := extHdr.More(); got != test.more {
+				t.Errorf("got More() = %t, want = %t", got, test.more)
+			}
+			if got := extHdr.ID(); got != test.id {
+				t.Errorf("got ID() = %d, want = %d", got, test.id)
+			}
+		})
+	}
+}
+
+func makeVectorisedViewFromByteBuffers(bs ...[]byte) buffer.VectorisedView {
+	size := 0
+	var vs []buffer.View
+
+	for _, b := range bs {
+		vs = append(vs, buffer.View(b))
+		size += len(b)
+	}
+
+	return buffer.NewVectorisedView(size, vs)
+}
+
+func TestIPv6ExtHdrIterErr(t *testing.T) {
+	tests := []struct {
+		name         string
+		firstNextHdr IPv6ExtensionHeaderIdentifier
+		payload      buffer.VectorisedView
+		err          error
+	}{
+		{
+			name:         "Upper layer only without data",
+			firstNextHdr: 255,
+		},
+		{
+			name:         "Upper layer only with data",
+			firstNextHdr: 255,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{1, 2, 3, 4}),
+		},
+
+		{
+			name:         "No next header",
+			firstNextHdr: IPv6NoNextHeaderIdentifier,
+		},
+		{
+			name:         "No next header with data",
+			firstNextHdr: IPv6NoNextHeaderIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{1, 2, 3, 4}),
+		},
+
+		{
+			name:         "Valid single fragment",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 68, 9, 128, 4, 2, 1}),
+		},
+		{
+			name:         "Fragment too small",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 68, 9, 128, 4, 2}),
+			err:          io.ErrUnexpectedEOF,
+		},
+
+		{
+			name:         "Valid single routing",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 2, 3, 4, 5, 6}),
+		},
+		{
+			name:         "Valid single routing across views",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 2}, []byte{3, 4, 5, 6}),
+		},
+		{
+			name:         "Routing too small with zero length field",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 2, 3, 4, 5}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Valid routing with non-zero length field",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 1, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 8}),
+		},
+		{
+			name:         "Valid routing with non-zero length field across views",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 1, 1, 2, 3, 4, 5, 6}, []byte{1, 2, 3, 4, 5, 6, 7, 8}),
+		},
+		{
+			name:         "Routing too small with non-zero length field",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 1, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Routing too small with non-zero length field across views",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 1, 1, 2, 3, 4, 5, 6}, []byte{1, 2, 3, 4, 5, 6, 7}),
+			err:          io.ErrUnexpectedEOF,
+		},
+
+		{
+			name:         "Mixed",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				uint8(IPv6RoutingExtHdrIdentifier), 0, 68, 9, 128, 4, 2, 1,
+
+				// Routing extension header.
+				255, 0, 1, 2, 3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if _, err := MakeIPv6PayloadIterator(test.firstNextHdr, test.payload, false); err != nil {
+				t.Errorf("got MakeIPv6PayloadIterator(%d, _, false) = %s, want = nil", test.firstNextHdr, err)
+			}
+
+			if _, err := MakeIPv6PayloadIterator(test.firstNextHdr, test.payload, true); !errors.Is(err, test.err) {
+				t.Errorf("got MakeIPv6PayloadIterator(%d, _, true) = %v, want = %v", test.firstNextHdr, err, test.err)
+			}
+		})
+	}
+}
+
+func TestIPv6ExtHdrIter(t *testing.T) {
+	routingExtHdrWithUpperLayerData := buffer.View([]byte{255, 0, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4})
+	upperLayerData := buffer.View([]byte{1, 2, 3, 4})
+	tests := []struct {
+		name         string
+		firstNextHdr IPv6ExtensionHeaderIdentifier
+		payload      buffer.VectorisedView
+		expected     []IPv6PayloadHeader
+	}{
+		// With a non-atomic fragment, the payload after the fragment will not be
+		// parsed because the payload may not be complete.
+		{
+			name:         "fragment - routing - upper",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				uint8(IPv6RoutingExtHdrIdentifier), 0, 68, 9, 128, 4, 2, 1,
+
+				// Routing extension header.
+				255, 0, 1, 2, 3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6FragmentExtHdr([6]byte{68, 9, 128, 4, 2, 1}),
+				IPv6RawPayloadHeader{
+					Identifier: IPv6RoutingExtHdrIdentifier,
+					Buf:        routingExtHdrWithUpperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+		{
+			name:         "fragment - routing - upper (across views)",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				uint8(IPv6RoutingExtHdrIdentifier), 0, 68, 9, 128, 4, 2, 1,
+
+				// Routing extension header.
+				255, 0, 1, 2}, []byte{3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6FragmentExtHdr([6]byte{68, 9, 128, 4, 2, 1}),
+				IPv6RawPayloadHeader{
+					Identifier: IPv6RoutingExtHdrIdentifier,
+					Buf:        routingExtHdrWithUpperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+
+		// If we have an atomic fragment, the payload following the fragment
+		// extension header should be parsed normally.
+		{
+			name:         "atomic fragment - routing - upper",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6, 128, 4, 2, 1,
+
+				// Routing extension header.
+				255, 0, 1, 2, 3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6RawPayloadHeader{
+					Identifier: 255,
+					Buf:        upperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+		{
+			name:         "atomic fragment - routing - upper (across views)",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6}, []byte{128, 4, 2, 1,
+
+				// Routing extension header.
+				255, 0, 1, 2}, []byte{3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2}, []byte{3, 4}),
+			expected: []IPv6PayloadHeader{
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6RawPayloadHeader{
+					Identifier: 255,
+					Buf:        makeVectorisedViewFromByteBuffers(upperLayerData[:2], upperLayerData[2:]),
+				},
+			},
+		},
+		{
+			name:         "atomic fragment - no next header",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				//
+				// Res (Reserved) bits are 1 which should not affect anything.
+				uint8(IPv6NoNextHeaderIdentifier), 0, 0, 6, 128, 4, 2, 1,
+
+				// Random data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+			},
+		},
+		{
+			name:         "routing - atomic fragment - no next header",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Routing extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6NoNextHeaderIdentifier), 0, 0, 6, 128, 4, 2, 1,
+
+				// Random data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+			},
+		},
+		{
+			name:         "routing - atomic fragment - no next header (across views)",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Routing extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6NoNextHeaderIdentifier), 255, 0, 6}, []byte{128, 4, 2, 1,
+
+				// Random data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+			},
+		},
+		{
+			name:         "routing - fragment - no next header",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Routing extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Fragment extension header.
+				//
+				// Fragment Offset = 32; Res = 6.
+				uint8(IPv6NoNextHeaderIdentifier), 0, 1, 6, 128, 4, 2, 1,
+
+				// Random data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6FragmentExtHdr([6]byte{1, 6, 128, 4, 2, 1}),
+				IPv6RawPayloadHeader{
+					Identifier: IPv6NoNextHeaderIdentifier,
+					Buf:        upperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+
+		// Test the raw payload for common transport layer protocol numbers.
+		{
+			name:         "TCP raw payload",
+			firstNextHdr: IPv6ExtensionHeaderIdentifier(TCPProtocolNumber),
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: IPv6ExtensionHeaderIdentifier(TCPProtocolNumber),
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "UDP raw payload",
+			firstNextHdr: IPv6ExtensionHeaderIdentifier(UDPProtocolNumber),
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: IPv6ExtensionHeaderIdentifier(UDPProtocolNumber),
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "ICMPv4 raw payload",
+			firstNextHdr: IPv6ExtensionHeaderIdentifier(ICMPv4ProtocolNumber),
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: IPv6ExtensionHeaderIdentifier(ICMPv4ProtocolNumber),
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "ICMPv6 raw payload",
+			firstNextHdr: IPv6ExtensionHeaderIdentifier(ICMPv6ProtocolNumber),
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: IPv6ExtensionHeaderIdentifier(ICMPv6ProtocolNumber),
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "Unknwon next header raw payload",
+			firstNextHdr: 255,
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: 255,
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "Unknwon next header raw payload (across views)",
+			firstNextHdr: 255,
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData[:2], upperLayerData[2:]),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: 255,
+				Buf:        makeVectorisedViewFromByteBuffers(upperLayerData[:2], upperLayerData[2:]),
+			}},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			it, err := MakeIPv6PayloadIterator(test.firstNextHdr, test.payload, true)
+			if err != nil {
+				t.Fatalf("MakeIPv6PayloadIterator(%d, _ true): %s", test.firstNextHdr, err)
+			}
+
+			for i, e := range test.expected {
+				extHdr, done, err := it.Next()
+				if err != nil {
+					t.Errorf("(i=%d) Next(): %s", i, err)
+				}
+				if done {
+					t.Errorf("(i=%d) unexpectedly done iterating", i)
+				}
+				if diff := cmp.Diff(e, extHdr); diff != "" {
+					t.Errorf("(i=%d) got ext hdr mismatch (-want +got):\n%s", i, diff)
+				}
+
+				if t.Failed() {
+					t.FailNow()
+				}
+			}
+
+			extHdr, done, err := it.Next()
+			if err != nil {
+				t.Errorf("(last) Next(): %s", err)
+			}
+			if !done {
+				t.Errorf("(last) iterator unexpectedly not done")
+			}
+			if extHdr != nil {
+				t.Errorf("(last) got Next() = %T, want = nil", extHdr)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/network/hash/hash.go b/pkg/tcpip/network/hash/hash.go
index 6a215938b..8f65713c5 100644
--- a/pkg/tcpip/network/hash/hash.go
+++ b/pkg/tcpip/network/hash/hash.go
@@ -80,12 +80,12 @@ func IPv4FragmentHash(h header.IPv4) uint32 {
 // RFC 2640 (sec 4.5) is not very sharp on this aspect.
 // As a reference, also Linux ignores the protocol to compute
 // the hash (inet6_hash_frag).
-func IPv6FragmentHash(h header.IPv6, f header.IPv6Fragment) uint32 {
+func IPv6FragmentHash(h header.IPv6, id uint32) uint32 {
 	t := h.SourceAddress()
 	y := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24
 	t = h.DestinationAddress()
 	z := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24
-	return Hash3Words(f.ID(), y, z, hashIV)
+	return Hash3Words(id, y, z, hashIV)
 }
 
 func rol32(v, shift uint32) uint32 {
diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD
index fb11874c6..a93a7621a 100644
--- a/pkg/tcpip/network/ipv6/BUILD
+++ b/pkg/tcpip/network/ipv6/BUILD
@@ -13,6 +13,8 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/network/fragmentation",
+        "//pkg/tcpip/network/hash",
         "//pkg/tcpip/stack",
     ],
 )
@@ -36,5 +38,6 @@ go_test(
         "//pkg/tcpip/transport/icmp",
         "//pkg/tcpip/transport/udp",
         "//pkg/waiter",
+        "@com_github_google_go-cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 29e597002..a703a768c 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -21,11 +21,14 @@
 package ipv6
 
 import (
+	"fmt"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
+	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
@@ -49,6 +52,7 @@ type endpoint struct {
 	linkEP        stack.LinkEndpoint
 	linkAddrCache stack.LinkAddressCache
 	dispatcher    stack.TransportDispatcher
+	fragmentation *fragmentation.Fragmentation
 	protocol      *protocol
 }
 
@@ -172,6 +176,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 	headerView := pkt.Data.First()
 	h := header.IPv6(headerView)
 	if !h.IsValid(pkt.Data.Size()) {
+		r.Stats().IP.MalformedPacketsReceived.Increment()
 		return
 	}
 
@@ -179,14 +184,120 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 	pkt.Data.TrimFront(header.IPv6MinimumSize)
 	pkt.Data.CapLength(int(h.PayloadLength()))
 
-	p := h.TransportProtocol()
-	if p == header.ICMPv6ProtocolNumber {
-		e.handleICMP(r, headerView, pkt)
+	it, err := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), pkt.Data, true)
+	if err != nil {
+		r.Stats().IP.MalformedPacketsReceived.Increment()
 		return
 	}
 
-	r.Stats().IP.PacketsDelivered.Increment()
-	e.dispatcher.DeliverTransportPacket(r, p, pkt)
+	for {
+		extHdr, done, err := it.Next()
+		if err != nil {
+			// This should never happen as MakeIPv6PayloadIterator above did not
+			// return an error.
+			panic(fmt.Sprintf("unexpected error when iterating over IPv6 payload: %s", err))
+		}
+		if done {
+			break
+		}
+
+		switch extHdr := extHdr.(type) {
+		case header.IPv6RoutingExtHdr:
+			// As per RFC 8200 section 4.4, if a node encounters a routing header with
+			// an unrecognized routing type value, with a non-zero Segments Left
+			// value, the node must discard the packet and send an ICMP Parameter
+			// Problem, Code 0. If the Segments Left is 0, the node must ignore the
+			// Routing extension header and process the next header in the packet.
+			//
+			// Note, the stack does not yet handle any type of routing extension
+			// header, so we just make sure Segments Left is zero before processing
+			// the next extension header.
+			//
+			// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 0 for
+			// unrecognized routing types with a non-zero Segments Left value.
+			if extHdr.SegmentsLeft() != 0 {
+				return
+			}
+
+		case header.IPv6FragmentExtHdr:
+			fragmentOffset := extHdr.FragmentOffset()
+			more := extHdr.More()
+			if !more && fragmentOffset == 0 {
+				// This fragment extension header indicates that this packet is an
+				// atomic fragment. An atomic fragment is a fragment that contains
+				// all the data required to reassemble a full packet. As per RFC 6946,
+				// atomic fragments must not interfere with "normal" fragmented traffic
+				// so we skip processing the fragment instead of feeding it through the
+				// reassembly process below.
+				continue
+			}
+
+			rawPayload := it.AsRawHeader()
+			fragmentPayloadLen := rawPayload.Buf.Size()
+			if fragmentPayloadLen == 0 {
+				// Drop the packet as it's marked as a fragment but has no payload.
+				r.Stats().IP.MalformedPacketsReceived.Increment()
+				r.Stats().IP.MalformedFragmentsReceived.Increment()
+				return
+			}
+
+			// The packet is a fragment, let's try to reassemble it.
+			start := fragmentOffset * header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit
+			last := start + uint16(fragmentPayloadLen) - 1
+
+			// Drop the packet if the fragmentOffset is incorrect. i.e the
+			// combination of fragmentOffset and pkt.Data.size() causes a
+			// wrap around resulting in last being less than the offset.
+			if last < start {
+				r.Stats().IP.MalformedPacketsReceived.Increment()
+				r.Stats().IP.MalformedFragmentsReceived.Increment()
+				return
+			}
+
+			var ready bool
+			pkt.Data, ready, err = e.fragmentation.Process(hash.IPv6FragmentHash(h, extHdr.ID()), start, last, more, rawPayload.Buf)
+			if err != nil {
+				r.Stats().IP.MalformedPacketsReceived.Increment()
+				r.Stats().IP.MalformedFragmentsReceived.Increment()
+				return
+			}
+
+			if ready {
+				// We create a new iterator with the reassembled packet because we could
+				// have more extension headers in the reassembled payload, as per RFC
+				// 8200 section 4.5.
+				it, err = header.MakeIPv6PayloadIterator(rawPayload.Identifier, pkt.Data, true)
+				if err != nil {
+					r.Stats().IP.MalformedPacketsReceived.Increment()
+					r.Stats().IP.MalformedFragmentsReceived.Increment()
+					return
+				}
+			}
+
+		case header.IPv6RawPayloadHeader:
+			// If the last header in the payload isn't a known IPv6 extension header,
+			// handle it as if it is transport layer data.
+			pkt.Data = extHdr.Buf
+
+			if p := tcpip.TransportProtocolNumber(extHdr.Identifier); p == header.ICMPv6ProtocolNumber {
+				e.handleICMP(r, headerView, pkt)
+			} else {
+				r.Stats().IP.PacketsDelivered.Increment()
+				// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 error
+				// in response to unrecognized next header values.
+				e.dispatcher.DeliverTransportPacket(r, p, pkt)
+			}
+
+		default:
+			// If we receive a packet for an extension header we do not yet handle,
+			// drop the packet for now.
+			//
+			// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 error
+			// in response to unrecognized next header values.
+			r.Stats().UnknownProtocolRcvdPackets.Increment()
+			return
+		}
+	}
 }
 
 // Close cleans up resources associated with the endpoint.
@@ -229,6 +340,7 @@ func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWi
 		linkEP:        linkEP,
 		linkAddrCache: linkAddrCache,
 		dispatcher:    dispatcher,
+		fragmentation: fragmentation.NewFragmentation(fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout),
 		protocol:      p,
 	}, nil
 }
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index ed98ef22a..86bfda85e 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -17,6 +17,7 @@ package ipv6
 import (
 	"testing"
 
+	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -33,6 +34,12 @@ const (
 	// The least significant 3 bytes are the same as addr2 so both addr2 and
 	// addr3 will have the same solicited-node address.
 	addr3 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x02"
+
+	// Tests use the extension header identifier values as uint8 instead of
+	// header.IPv6ExtensionHeaderIdentifier.
+	routingExtHdrID  = uint8(header.IPv6RoutingExtHdrIdentifier)
+	fragmentExtHdrID = uint8(header.IPv6FragmentExtHdrIdentifier)
+	noNextHdrID      = uint8(header.IPv6NoNextHeaderIdentifier)
 )
 
 // testReceiveICMP tests receiving an ICMP packet from src to dst. want is the
@@ -268,3 +275,764 @@ func TestAddIpv6Address(t *testing.T) {
 		})
 	}
 }
+
+func TestReceiveIPv6ExtHdrs(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name         string
+		extHdr       func(nextHdr uint8) ([]byte, uint8)
+		shouldAccept bool
+	}{
+		{
+			name:         "None",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{}, nextHdr },
+			shouldAccept: true,
+		},
+		{
+			name:         "routing with zero segments left",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 0, 2, 3, 4, 5}, routingExtHdrID },
+			shouldAccept: true,
+		},
+		{
+			name:         "routing with non-zero segments left",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 1, 2, 3, 4, 5}, routingExtHdrID },
+			shouldAccept: false,
+		},
+		{
+			name:         "atomic fragment with zero ID",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 0, 0, 0, 0, 0, 0}, fragmentExtHdrID },
+			shouldAccept: true,
+		},
+		{
+			name:         "atomic fragment with non-zero ID",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 0, 0, 1, 2, 3, 4}, fragmentExtHdrID },
+			shouldAccept: true,
+		},
+		{
+			name:         "fragment",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 0, 1, 2, 3, 4}, fragmentExtHdrID },
+			shouldAccept: false,
+		},
+		{
+			name: "routing - atomic fragment",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Routing extension header.
+					fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Fragment extension header.
+					nextHdr, 0, 0, 0, 1, 2, 3, 4,
+				}, routingExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "atomic fragment - routing",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Fragment extension header.
+					routingExtHdrID, 0, 0, 0, 1, 2, 3, 4,
+
+					// Routing extension header.
+					nextHdr, 0, 1, 0, 2, 3, 4, 5,
+				}, fragmentExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name:         "No next header",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{}, noNextHdrID },
+			shouldAccept: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+			e := channel.New(0, 1280, linkAddr1)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err)
+			}
+
+			wq := waiter.Queue{}
+			we, ch := waiter.NewChannelEntry(nil)
+			wq.EventRegister(&we, waiter.EventIn)
+			defer wq.EventUnregister(&we)
+			defer close(ch)
+			ep, err := s.NewEndpoint(udp.ProtocolNumber, ProtocolNumber, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, ProtocolNumber, err)
+			}
+			defer ep.Close()
+
+			bindAddr := tcpip.FullAddress{Addr: addr2, Port: 80}
+			if err := ep.Bind(bindAddr); err != nil {
+				t.Fatalf("Bind(%+v): %s", bindAddr, err)
+			}
+
+			udpPayload := []byte{1, 2, 3, 4, 5, 6, 7, 8}
+			udpLength := header.UDPMinimumSize + len(udpPayload)
+			extHdrBytes, ipv6NextHdr := test.extHdr(uint8(header.UDPProtocolNumber))
+			extHdrLen := len(extHdrBytes)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + extHdrLen + udpLength)
+
+			// Serialize UDP message.
+			u := header.UDP(hdr.Prepend(udpLength))
+			u.Encode(&header.UDPFields{
+				SrcPort: 5555,
+				DstPort: 80,
+				Length:  uint16(udpLength),
+			})
+			copy(u.Payload(), udpPayload)
+			sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength))
+			sum = header.Checksum(udpPayload, sum)
+			u.SetChecksum(^u.CalculateChecksum(sum))
+
+			// Copy extension header bytes between the UDP message and the IPv6
+			// fixed header.
+			copy(hdr.Prepend(extHdrLen), extHdrBytes)
+
+			// Serialize IPv6 fixed header.
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    ipv6NextHdr,
+				HopLimit:      255,
+				SrcAddr:       addr1,
+				DstAddr:       addr2,
+			})
+
+			e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			stats := s.Stats().UDP.PacketsReceived
+
+			if !test.shouldAccept {
+				if got := stats.Value(); got != 0 {
+					t.Errorf("got UDP Rx Packets = %d, want = 0", got)
+				}
+
+				return
+			}
+
+			// Expect a UDP packet.
+			if got := stats.Value(); got != 1 {
+				t.Errorf("got UDP Rx Packets = %d, want = 1", got)
+			}
+			gotPayload, _, err := ep.Read(nil)
+			if err != nil {
+				t.Fatalf("Read(nil): %s", err)
+			}
+			if diff := cmp.Diff(buffer.View(udpPayload), gotPayload); diff != "" {
+				t.Errorf("got UDP payload mismatch (-want +got):\n%s", diff)
+			}
+
+			// Should not have any more UDP packets.
+			if gotPayload, _, err := ep.Read(nil); err != tcpip.ErrWouldBlock {
+				t.Fatalf("got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+			}
+		})
+	}
+}
+
+// fragmentData holds the IPv6 payload for a fragmented IPv6 packet.
+type fragmentData struct {
+	nextHdr uint8
+	data    buffer.VectorisedView
+}
+
+func TestReceiveIPv6Fragments(t *testing.T) {
+	const nicID = 1
+	const udpPayload1Length = 256
+	const udpPayload2Length = 128
+	const fragmentExtHdrLen = 8
+	// Note, not all routing extension headers will be 8 bytes but this test
+	// uses 8 byte routing extension headers for most sub tests.
+	const routingExtHdrLen = 8
+
+	udpGen := func(payload []byte, multiplier uint8) buffer.View {
+		payloadLen := len(payload)
+		for i := 0; i < payloadLen; i++ {
+			payload[i] = uint8(i) * multiplier
+		}
+
+		udpLength := header.UDPMinimumSize + payloadLen
+
+		hdr := buffer.NewPrependable(udpLength)
+		u := header.UDP(hdr.Prepend(udpLength))
+		u.Encode(&header.UDPFields{
+			SrcPort: 5555,
+			DstPort: 80,
+			Length:  uint16(udpLength),
+		})
+		copy(u.Payload(), payload)
+		sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength))
+		sum = header.Checksum(payload, sum)
+		u.SetChecksum(^u.CalculateChecksum(sum))
+		return hdr.View()
+	}
+
+	var udpPayload1Buf [udpPayload1Length]byte
+	udpPayload1 := udpPayload1Buf[:]
+	ipv6Payload1 := udpGen(udpPayload1, 1)
+
+	var udpPayload2Buf [udpPayload2Length]byte
+	udpPayload2 := udpPayload2Buf[:]
+	ipv6Payload2 := udpGen(udpPayload2, 2)
+
+	tests := []struct {
+		name             string
+		expectedPayload  []byte
+		fragments        []fragmentData
+		expectedPayloads [][]byte
+	}{
+		{
+			name: "No fragmentation",
+			fragments: []fragmentData{
+				{
+					nextHdr: uint8(header.UDPProtocolNumber),
+					data:    ipv6Payload1.ToVectorisedView(),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Atomic fragment",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1),
+						[]buffer.View{
+							// Fragment extension header.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 0, 0, 0, 0, 0}),
+
+							ipv6Payload1,
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Two fragments",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Two fragments with different IDs",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 2
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 2}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments with per-fragment routing header with zero segments left",
+			fragments: []fragmentData{
+				{
+					nextHdr: routingExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Routing extension header.
+							//
+							// Segments left = 0.
+							buffer.View([]byte{fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5}),
+
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: routingExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Routing extension header.
+							//
+							// Segments left = 0.
+							buffer.View([]byte{fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5}),
+
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Two fragments with per-fragment routing header with non-zero segments left",
+			fragments: []fragmentData{
+				{
+					nextHdr: routingExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Routing extension header.
+							//
+							// Segments left = 1.
+							buffer.View([]byte{fragmentExtHdrID, 0, 1, 1, 2, 3, 4, 5}),
+
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: routingExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Routing extension header.
+							//
+							// Segments left = 1.
+							buffer.View([]byte{fragmentExtHdrID, 0, 1, 1, 2, 3, 4, 5}),
+
+							// Fragment extension header.
+							//
+							// Fragment offset = 9, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 72, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments with routing header with zero segments left",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 1, 0, 0, 0, 1}),
+
+							// Routing extension header.
+							//
+							// Segments left = 0.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 1, 0, 2, 3, 4, 5}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 9, More = false, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 72, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Two fragments with routing header with non-zero segments left",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 1, 0, 0, 0, 1}),
+
+							// Routing extension header.
+							//
+							// Segments left = 1.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 1, 1, 2, 3, 4, 5}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 9, More = false, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 72, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments with routing header with zero segments left across fragments",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						// The length of this payload is fragmentExtHdrLen+8 because the
+						// first 8 bytes of the 16 byte routing extension header is in
+						// this fragment.
+						fragmentExtHdrLen+8,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 1, 0, 0, 0, 1}),
+
+							// Routing extension header (part 1)
+							//
+							// Segments left = 0.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 1, 1, 0, 2, 3, 4, 5}),
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						// The length of this payload is
+						// fragmentExtHdrLen+8+len(ipv6Payload1) because the last 8 bytes of
+						// the 16 byte routing extension header is in this fagment.
+						fragmentExtHdrLen+8+len(ipv6Payload1),
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 1, More = false, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 8, 0, 0, 0, 1}),
+
+							// Routing extension header (part 2)
+							buffer.View([]byte{6, 7, 8, 9, 10, 11, 12, 13}),
+
+							ipv6Payload1,
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Two fragments with routing header with non-zero segments left across fragments",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						// The length of this payload is fragmentExtHdrLen+8 because the
+						// first 8 bytes of the 16 byte routing extension header is in
+						// this fragment.
+						fragmentExtHdrLen+8,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 1, 0, 0, 0, 1}),
+
+							// Routing extension header (part 1)
+							//
+							// Segments left = 1.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 1, 1, 1, 2, 3, 4, 5}),
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						// The length of this payload is
+						// fragmentExtHdrLen+8+len(ipv6Payload1) because the last 8 bytes of
+						// the 16 byte routing extension header is in this fagment.
+						fragmentExtHdrLen+8+len(ipv6Payload1),
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 1, More = false, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 8, 0, 0, 0, 1}),
+
+							// Routing extension header (part 2)
+							buffer.View([]byte{6, 7, 8, 9, 10, 11, 12, 13}),
+
+							ipv6Payload1,
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		// As per RFC 6946, IPv6 atomic fragments MUST NOT interfere with "normal"
+		// fragmented traffic.
+		{
+			name: "Two fragments with atomic",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				// This fragment has the same ID as the other fragments but is an atomic
+				// fragment. It should not interfere with the other fragments.
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload2),
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 0, 0, 0, 0, 1}),
+
+							ipv6Payload2,
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload2, udpPayload1},
+		},
+		{
+			name: "Two interleaved fragmented packets",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+32,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 2
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 2}),
+
+							ipv6Payload2[:32],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload2)-32,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 4, More = false, ID = 2
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 32, 0, 0, 0, 2}),
+
+							ipv6Payload2[32:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1, udpPayload2},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+			e := channel.New(0, 1280, linkAddr1)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err)
+			}
+
+			wq := waiter.Queue{}
+			we, ch := waiter.NewChannelEntry(nil)
+			wq.EventRegister(&we, waiter.EventIn)
+			defer wq.EventUnregister(&we)
+			defer close(ch)
+			ep, err := s.NewEndpoint(udp.ProtocolNumber, ProtocolNumber, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, ProtocolNumber, err)
+			}
+			defer ep.Close()
+
+			bindAddr := tcpip.FullAddress{Addr: addr2, Port: 80}
+			if err := ep.Bind(bindAddr); err != nil {
+				t.Fatalf("Bind(%+v): %s", bindAddr, err)
+			}
+
+			for _, f := range test.fragments {
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize)
+
+				// Serialize IPv6 fixed header.
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(f.data.Size()),
+					NextHeader:    f.nextHdr,
+					HopLimit:      255,
+					SrcAddr:       addr1,
+					DstAddr:       addr2,
+				})
+
+				vv := hdr.View().ToVectorisedView()
+				vv.Append(f.data)
+
+				e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
+					Data: vv,
+				})
+			}
+
+			if got, want := s.Stats().UDP.PacketsReceived.Value(), uint64(len(test.expectedPayloads)); got != want {
+				t.Errorf("got UDP Rx Packets = %d, want = %d", got, want)
+			}
+
+			for i, p := range test.expectedPayloads {
+				gotPayload, _, err := ep.Read(nil)
+				if err != nil {
+					t.Fatalf("(i=%d) Read(nil): %s", i, err)
+				}
+				if diff := cmp.Diff(buffer.View(p), gotPayload); diff != "" {
+					t.Errorf("(i=%d) got UDP payload mismatch (-want +got):\n%s", i, diff)
+				}
+			}
+
+			if gotPayload, _, err := ep.Read(nil); err != tcpip.ErrWouldBlock {
+				t.Fatalf("(last) got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+			}
+		})
+	}
+}
-- 
cgit v1.2.3


From edc3c049eb553fcbf32f4a6b515141a26c5609d4 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 26 Mar 2020 15:59:41 -0700
Subject: Use panic instead of log.Fatalf

PiperOrigin-RevId: 303212189
---
 pkg/tcpip/network/ipv6/icmp.go |  6 +++---
 pkg/tcpip/stack/ndp.go         | 29 +++++++++++++++--------------
 pkg/tcpip/stack/nic.go         |  5 ++---
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 8640feffc..e0dd5afd3 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -15,7 +15,7 @@
 package ipv6
 
 import (
-	"log"
+	"fmt"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -199,7 +199,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 			opt, done, err := it.Next()
 			if err != nil {
 				// This should never happen as Iter(true) above did not return an error.
-				log.Fatalf("unexpected error when iterating over NDP options: %s", err)
+				panic(fmt.Sprintf("unexpected error when iterating over NDP options: %s", err))
 			}
 			if done {
 				break
@@ -306,7 +306,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 			opt, done, err := it.Next()
 			if err != nil {
 				// This should never happen as Iter(true) above did not return an error.
-				log.Fatalf("unexpected error when iterating over NDP options: %s", err)
+				panic(fmt.Sprintf("unexpected error when iterating over NDP options: %s", err))
 			}
 			if done {
 				break
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 630fdefc5..7c9fc48d1 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -15,6 +15,7 @@
 package stack
 
 import (
+	"fmt"
 	"log"
 	"math/rand"
 	"time"
@@ -428,7 +429,7 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 
 	if ref.getKind() != permanentTentative {
 		// The endpoint should be marked as tentative since we are starting DAD.
-		log.Fatalf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.nic.ID())
+		panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.nic.ID()))
 	}
 
 	// Should not attempt to perform DAD on an address that is currently in the
@@ -440,7 +441,7 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		// address, or its reference count would have been increased without doing
 		// the work that would have been done for an address that was brand new.
 		// See NIC.addAddressLocked.
-		log.Fatalf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID())
+		panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID()))
 	}
 
 	remaining := ndp.configs.DupAddrDetectTransmits
@@ -476,7 +477,7 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		if ref.getKind() != permanentTentative {
 			// The endpoint should still be marked as tentative since we are still
 			// performing DAD on it.
-			log.Fatalf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.nic.ID())
+			panic(fmt.Sprintf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.nic.ID()))
 		}
 
 		dadDone := remaining == 0
@@ -546,9 +547,9 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error {
 	// Route should resolve immediately since snmc is a multicast address so a
 	// remote link address can be calculated without a resolution process.
 	if c, err := r.Resolve(nil); err != nil {
-		log.Fatalf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.nic.ID(), err)
+		panic(fmt.Sprintf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.nic.ID(), err))
 	} else if c != nil {
-		log.Fatalf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.nic.ID())
+		panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.nic.ID()))
 	}
 
 	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
@@ -949,7 +950,7 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 		deprecationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
 			prefixState, ok := ndp.slaacPrefixes[prefix]
 			if !ok {
-				log.Fatalf("ndp: must have a slaacPrefixes entry for the SLAAC prefix %s", prefix)
+				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the SLAAC prefix %s", prefix))
 			}
 
 			ndp.deprecateSLAACAddress(prefixState.ref)
@@ -1029,7 +1030,7 @@ func (ndp *ndpState) addSLAACAddr(prefix tcpip.Subnet, deprecated bool) *referen
 
 	ref, err := ndp.nic.addAddressLocked(generatedAddr, FirstPrimaryEndpoint, permanent, slaac, deprecated)
 	if err != nil {
-		log.Fatalf("ndp: error when adding address %+v: %s", generatedAddr, err)
+		panic(fmt.Sprintf("ndp: error when adding address %+v: %s", generatedAddr, err))
 	}
 
 	return ref
@@ -1043,7 +1044,7 @@ func (ndp *ndpState) addSLAACAddr(prefix tcpip.Subnet, deprecated bool) *referen
 func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, pl, vl time.Duration) {
 	prefixState, ok := ndp.slaacPrefixes[prefix]
 	if !ok {
-		log.Fatalf("ndp: SLAAC prefix state not found to refresh lifetimes for %s", prefix)
+		panic(fmt.Sprintf("ndp: SLAAC prefix state not found to refresh lifetimes for %s", prefix))
 	}
 	defer func() { ndp.slaacPrefixes[prefix] = prefixState }()
 
@@ -1144,7 +1145,7 @@ func (ndp *ndpState) invalidateSLAACPrefix(prefix tcpip.Subnet, removeAddr bool)
 
 	if removeAddr {
 		if err := ndp.nic.removePermanentAddressLocked(addr); err != nil {
-			log.Fatalf("ndp: removePermanentAddressLocked(%s): %s", addr, err)
+			panic(fmt.Sprintf("ndp: removePermanentAddressLocked(%s): %s", addr, err))
 		}
 	}
 
@@ -1193,7 +1194,7 @@ func (ndp *ndpState) cleanupState(hostOnly bool) {
 	}
 
 	if got := len(ndp.slaacPrefixes); got != linkLocalPrefixes {
-		log.Fatalf("ndp: still have non-linklocal SLAAC prefixes after cleaning up; found = %d prefixes, of which %d are link-local", got, linkLocalPrefixes)
+		panic(fmt.Sprintf("ndp: still have non-linklocal SLAAC prefixes after cleaning up; found = %d prefixes, of which %d are link-local", got, linkLocalPrefixes))
 	}
 
 	for prefix := range ndp.onLinkPrefixes {
@@ -1201,7 +1202,7 @@ func (ndp *ndpState) cleanupState(hostOnly bool) {
 	}
 
 	if got := len(ndp.onLinkPrefixes); got != 0 {
-		log.Fatalf("ndp: still have discovered on-link prefixes after cleaning up; found = %d", got)
+		panic(fmt.Sprintf("ndp: still have discovered on-link prefixes after cleaning up; found = %d", got))
 	}
 
 	for router := range ndp.defaultRouters {
@@ -1209,7 +1210,7 @@ func (ndp *ndpState) cleanupState(hostOnly bool) {
 	}
 
 	if got := len(ndp.defaultRouters); got != 0 {
-		log.Fatalf("ndp: still have discovered default routers after cleaning up; found = %d", got)
+		panic(fmt.Sprintf("ndp: still have discovered default routers after cleaning up; found = %d", got))
 	}
 }
 
@@ -1251,9 +1252,9 @@ func (ndp *ndpState) startSolicitingRouters() {
 		// header.IPv6AllRoutersMulticastAddress is a multicast address so a
 		// remote link address can be calculated without a resolution process.
 		if c, err := r.Resolve(nil); err != nil {
-			log.Fatalf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID(), err)
+			panic(fmt.Sprintf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID(), err))
 		} else if c != nil {
-			log.Fatalf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID())
+			panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID()))
 		}
 
 		// As per RFC 4861 section 4.1, an NDP RS SHOULD include the source
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index b6fa647ea..4835251bc 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -16,7 +16,6 @@ package stack
 
 import (
 	"fmt"
-	"log"
 	"reflect"
 	"sort"
 	"strings"
@@ -480,7 +479,7 @@ func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEn
 			// Should never happen as we got r from the primary IPv6 endpoint list and
 			// ScopeForIPv6Address only returns an error if addr is not an IPv6
 			// address.
-			log.Fatalf("header.ScopeForIPv6Address(%s): %s", addr, err)
+			panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", addr, err))
 		}
 
 		cs = append(cs, ipv6AddrCandidate{
@@ -492,7 +491,7 @@ func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEn
 	remoteScope, err := header.ScopeForIPv6Address(remoteAddr)
 	if err != nil {
 		// primaryIPv6Endpoint should never be called with an invalid IPv6 address.
-		log.Fatalf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err)
+		panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err))
 	}
 
 	// Sort the addresses as per RFC 6724 section 5 rules 1-3.
-- 
cgit v1.2.3


From 137f3614009b0ef931c1d00a083b4ae8e6a39bc9 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 26 Mar 2020 16:46:15 -0700
Subject: Use host-defined file owner and mode, when possible, for imported
 fds.

Using the host-defined file owner matches VFS1. It is more correct to use the
host-defined mode, since the cached value may become out of date. However,
kernfs.Inode.Mode() does not return an error--other filesystems on kernfs are
in-memory so retrieving mode should not fail. Therefore, if the host syscall
fails, we rely on a cached value instead.

Updates #1672.

PiperOrigin-RevId: 303220864
---
 pkg/sentry/control/proc.go       |   6 +--
 pkg/sentry/fs/host/BUILD         |   1 -
 pkg/sentry/fs/host/control.go    |   2 +-
 pkg/sentry/fs/host/file.go       |  10 ++--
 pkg/sentry/fs/host/inode_test.go |   3 +-
 pkg/sentry/fs/host/wait_test.go  |   3 +-
 pkg/sentry/fsimpl/host/host.go   | 110 ++++++++++++++++++++++++++-------------
 runsc/boot/fds.go                |   5 +-
 8 files changed, 87 insertions(+), 53 deletions(-)

diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 5457ba5e7..b51fb3959 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -224,8 +224,6 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		}
 	}
 
-	mounter := fs.FileOwnerFromContext(ctx)
-
 	// TODO(gvisor.dev/issue/1623): Use host FD when supported in VFS2.
 	var ttyFile *fs.File
 	for appFD, hostFile := range args.FilePayload.Files {
@@ -235,7 +233,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 			// Import the file as a host TTY file.
 			if ttyFile == nil {
 				var err error
-				appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), mounter, true /* isTTY */)
+				appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), true /* isTTY */)
 				if err != nil {
 					return nil, 0, nil, err
 				}
@@ -254,7 +252,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		} else {
 			// Import the file as a regular host file.
 			var err error
-			appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), mounter, false /* isTTY */)
+			appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), false /* isTTY */)
 			if err != nil {
 				return nil, 0, nil, err
 			}
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index 011625c80..aabce6cc9 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -71,7 +71,6 @@ go_test(
         "//pkg/fd",
         "//pkg/fdnotifier",
         "//pkg/sentry/contexttest",
-        "//pkg/sentry/fs",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go
index cd84e1337..52c0504b6 100644
--- a/pkg/sentry/fs/host/control.go
+++ b/pkg/sentry/fs/host/control.go
@@ -78,7 +78,7 @@ func fdsToFiles(ctx context.Context, fds []int) []*fs.File {
 		}
 
 		// Create the file backed by hostFD.
-		file, err := NewFile(ctx, fd, fs.FileOwnerFromContext(ctx))
+		file, err := NewFile(ctx, fd)
 		if err != nil {
 			ctx.Warningf("Error creating file from host FD: %v", err)
 			break
diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go
index 034862694..3e48b8b2c 100644
--- a/pkg/sentry/fs/host/file.go
+++ b/pkg/sentry/fs/host/file.go
@@ -60,8 +60,8 @@ var _ fs.FileOperations = (*fileOperations)(nil)
 // The returned File cannot be saved, since there is no guarantee that the same
 // FD will exist or represent the same file at time of restore. If such a
 // guarantee does exist, use ImportFile instead.
-func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) {
-	return newFileFromDonatedFD(ctx, fd, mounter, false, false)
+func NewFile(ctx context.Context, fd int) (*fs.File, error) {
+	return newFileFromDonatedFD(ctx, fd, false, false)
 }
 
 // ImportFile creates a new File backed by the provided host file descriptor.
@@ -71,13 +71,13 @@ func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error
 // If the returned file is saved, it will be restored by re-importing the FD
 // originally passed to ImportFile. It is the restorer's responsibility to
 // ensure that the FD represents the same file.
-func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, isTTY bool) (*fs.File, error) {
-	return newFileFromDonatedFD(ctx, fd, mounter, true, isTTY)
+func ImportFile(ctx context.Context, fd int, isTTY bool) (*fs.File, error) {
+	return newFileFromDonatedFD(ctx, fd, true, isTTY)
 }
 
 // newFileFromDonatedFD returns an fs.File from a donated FD. If the FD is
 // saveable, then saveable is true.
-func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, isTTY bool) (*fs.File, error) {
+func newFileFromDonatedFD(ctx context.Context, donated int, saveable, isTTY bool) (*fs.File, error) {
 	var s syscall.Stat_t
 	if err := syscall.Fstat(donated, &s); err != nil {
 		return nil, err
diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go
index 4c374681c..c507f57eb 100644
--- a/pkg/sentry/fs/host/inode_test.go
+++ b/pkg/sentry/fs/host/inode_test.go
@@ -19,7 +19,6 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
 // TestCloseFD verifies fds will be closed.
@@ -33,7 +32,7 @@ func TestCloseFD(t *testing.T) {
 
 	// Use the write-end because we will detect if it's closed on the read end.
 	ctx := contexttest.Context(t)
-	file, err := NewFile(ctx, p[1], fs.RootOwner)
+	file, err := NewFile(ctx, p[1])
 	if err != nil {
 		t.Fatalf("Failed to create File: %v", err)
 	}
diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go
index d49c3a635..ce397a5e3 100644
--- a/pkg/sentry/fs/host/wait_test.go
+++ b/pkg/sentry/fs/host/wait_test.go
@@ -20,7 +20,6 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -34,7 +33,7 @@ func TestWait(t *testing.T) {
 	defer syscall.Close(fds[1])
 
 	ctx := contexttest.Context(t)
-	file, err := NewFile(ctx, fds[0], fs.RootOwner)
+	file, err := NewFile(ctx, fds[0])
 	if err != nil {
 		syscall.Close(fds[0])
 		t.Fatalf("NewFile failed: %v", err)
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index a54985ef5..17e3d6e9d 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -54,7 +54,7 @@ func NewMount(vfsObj *vfs.VirtualFilesystem) (*vfs.Mount, error) {
 }
 
 // ImportFD sets up and returns a vfs.FileDescription from a donated fd.
-func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID, isTTY bool) (*vfs.FileDescription, error) {
+func ImportFD(mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) {
 	fs, ok := mnt.Filesystem().Impl().(*kernfs.Filesystem)
 	if !ok {
 		return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl())
@@ -78,8 +78,6 @@ func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID
 		canMap:   canMap(uint32(fileType)),
 		ino:      fs.NextIno(),
 		mode:     fileMode,
-		uid:      ownerUID,
-		gid:      ownerGID,
 		// For simplicity, set offset to 0. Technically, we should
 		// only set to 0 on files that are not seekable (sockets, pipes, etc.),
 		// and use the offset from the host fd otherwise.
@@ -135,17 +133,20 @@ type inode struct {
 	// This field is initialized at creation time and is immutable.
 	ino uint64
 
-	// TODO(gvisor.dev/issue/1672): protect mode, uid, and gid with mutex.
+	// modeMu protects mode.
+	modeMu sync.Mutex
 
-	// mode is the file mode of this inode. Note that this value may become out
-	// of date if the mode is changed on the host, e.g. with chmod.
+	// mode is a cached version of the file mode on the host. Note that it may
+	// become out of date if the mode is changed on the host, e.g. with chmod.
+	//
+	// Generally, it is better to retrieve the mode from the host through an
+	// fstat syscall. We only use this value in inode.Mode(), which cannot
+	// return an error, if the syscall to host fails.
+	//
+	// FIXME(b/152294168): Plumb error into Inode.Mode() return value so we
+	// can get rid of this.
 	mode linux.FileMode
 
-	// uid and gid of the file owner. Note that these refer to the owner of the
-	// file created on import, not the fd on the host.
-	uid auth.KUID
-	gid auth.KGID
-
 	// offsetMu protects offset.
 	offsetMu sync.Mutex
 
@@ -168,12 +169,35 @@ func fileFlagsFromHostFD(fd int) (int, error) {
 
 // CheckPermissions implements kernfs.Inode.
 func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
-	return vfs.GenericCheckPermissions(creds, ats, i.mode, i.uid, i.gid)
+	mode, uid, gid, err := i.getPermissions()
+	if err != nil {
+		return err
+	}
+	return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid)
 }
 
 // Mode implements kernfs.Inode.
 func (i *inode) Mode() linux.FileMode {
-	return i.mode
+	mode, _, _, err := i.getPermissions()
+	if err != nil {
+		return i.mode
+	}
+
+	return linux.FileMode(mode)
+}
+
+func (i *inode) getPermissions() (linux.FileMode, auth.KUID, auth.KGID, error) {
+	// Retrieve metadata.
+	var s syscall.Stat_t
+	if err := syscall.Fstat(i.hostFD, &s); err != nil {
+		return 0, 0, 0, err
+	}
+
+	// Update cached mode.
+	i.modeMu.Lock()
+	i.mode = linux.FileMode(s.Mode)
+	i.modeMu.Unlock()
+	return linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid), nil
 }
 
 // Stat implements kernfs.Inode.
@@ -213,45 +237,51 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro
 	ls.Attributes = s.Attributes
 	ls.AttributesMask = s.Attributes_mask
 
-	if mask|linux.STATX_TYPE != 0 {
+	if mask&linux.STATX_TYPE != 0 {
 		ls.Mode |= s.Mode & linux.S_IFMT
 	}
-	if mask|linux.STATX_MODE != 0 {
+	if mask&linux.STATX_MODE != 0 {
 		ls.Mode |= s.Mode &^ linux.S_IFMT
 	}
-	if mask|linux.STATX_NLINK != 0 {
+	if mask&linux.STATX_NLINK != 0 {
 		ls.Nlink = s.Nlink
 	}
-	if mask|linux.STATX_ATIME != 0 {
+	if mask&linux.STATX_UID != 0 {
+		ls.UID = s.Uid
+	}
+	if mask&linux.STATX_GID != 0 {
+		ls.GID = s.Gid
+	}
+	if mask&linux.STATX_ATIME != 0 {
 		ls.Atime = unixToLinuxStatxTimestamp(s.Atime)
 	}
-	if mask|linux.STATX_BTIME != 0 {
+	if mask&linux.STATX_BTIME != 0 {
 		ls.Btime = unixToLinuxStatxTimestamp(s.Btime)
 	}
-	if mask|linux.STATX_CTIME != 0 {
+	if mask&linux.STATX_CTIME != 0 {
 		ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime)
 	}
-	if mask|linux.STATX_MTIME != 0 {
+	if mask&linux.STATX_MTIME != 0 {
 		ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime)
 	}
-	if mask|linux.STATX_SIZE != 0 {
+	if mask&linux.STATX_SIZE != 0 {
 		ls.Size = s.Size
 	}
-	if mask|linux.STATX_BLOCKS != 0 {
+	if mask&linux.STATX_BLOCKS != 0 {
 		ls.Blocks = s.Blocks
 	}
 
-	// Use our own internal inode number and file owner.
-	if mask|linux.STATX_INO != 0 {
+	// Use our own internal inode number.
+	if mask&linux.STATX_INO != 0 {
 		ls.Ino = i.ino
 	}
-	if mask|linux.STATX_UID != 0 {
-		ls.UID = uint32(i.uid)
-	}
-	if mask|linux.STATX_GID != 0 {
-		ls.GID = uint32(i.gid)
-	}
 
+	// Update cached mode.
+	if (mask&linux.STATX_TYPE != 0) && (mask&linux.STATX_MODE != 0) {
+		i.modeMu.Lock()
+		i.mode = linux.FileMode(s.Mode)
+		i.modeMu.Unlock()
+	}
 	return ls, nil
 }
 
@@ -274,6 +304,8 @@ func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) {
 		Mask:    linux.STATX_BASIC_STATS,
 		Blksize: uint32(s.Blksize),
 		Nlink:   uint32(s.Nlink),
+		UID:     s.Uid,
+		GID:     s.Gid,
 		Mode:    uint16(s.Mode),
 		Size:    uint64(s.Size),
 		Blocks:  uint64(s.Blocks),
@@ -282,15 +314,13 @@ func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) {
 		Mtime:   timespecToStatxTimestamp(s.Mtim),
 	}
 
-	// Use our own internal inode number and file owner.
+	// Use our own internal inode number.
 	//
 	// TODO(gvisor.dev/issue/1672): Use a kernfs-specific device number as well.
 	// If we use the device number from the host, it may collide with another
 	// sentry-internal device number. We handle device/inode numbers without
 	// relying on the host to prevent collisions.
 	ls.Ino = i.ino
-	ls.UID = uint32(i.uid)
-	ls.GID = uint32(i.gid)
 
 	return ls, nil
 }
@@ -306,7 +336,11 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 	if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(ctx, creds, &s, i.Mode(), i.uid, i.gid); err != nil {
+	mode, uid, gid, err := i.getPermissions()
+	if err != nil {
+		return err
+	}
+	if err := vfs.CheckSetStat(ctx, creds, &s, mode.Permissions(), uid, gid); err != nil {
 		return err
 	}
 
@@ -314,7 +348,9 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 		if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil {
 			return err
 		}
+		i.modeMu.Lock()
 		i.mode = linux.FileMode(s.Mode)
+		i.modeMu.Unlock()
 	}
 	if m&linux.STATX_SIZE != 0 {
 		if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
@@ -351,7 +387,11 @@ func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptio
 }
 
 func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) {
-	fileType := i.mode.FileType()
+	mode, _, _, err := i.getPermissions()
+	if err != nil {
+		return nil, err
+	}
+	fileType := mode.FileType()
 	if fileType == syscall.S_IFSOCK {
 		if i.isTTY {
 			return nil, errors.New("cannot use host socket as TTY")
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
index 417d2d5fb..5314b0f2a 100644
--- a/runsc/boot/fds.go
+++ b/runsc/boot/fds.go
@@ -34,7 +34,6 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
 	k := kernel.KernelFromContext(ctx)
 	fdTable := k.NewFDTable()
 	defer fdTable.DecRef()
-	mounter := fs.FileOwnerFromContext(ctx)
 
 	var ttyFile *fs.File
 	for appFD, hostFD := range stdioFDs {
@@ -44,7 +43,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
 			// Import the file as a host TTY file.
 			if ttyFile == nil {
 				var err error
-				appFile, err = host.ImportFile(ctx, hostFD, mounter, true /* isTTY */)
+				appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */)
 				if err != nil {
 					return nil, err
 				}
@@ -63,7 +62,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
 		} else {
 			// Import the file as a regular host file.
 			var err error
-			appFile, err = host.ImportFile(ctx, hostFD, mounter, false /* isTTY */)
+			appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */)
 			if err != nil {
 				return nil, err
 			}
-- 
cgit v1.2.3


From 76a7ace751bfd4b16411edbc0a2b06d0308b8832 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 26 Mar 2020 21:47:46 -0700
Subject: Add BoundEndpointAt filesystem operation.

BoundEndpointAt() is needed to support Unix sockets bound at a
file path, corresponding to BoundEndpoint() in VFS1.

Updates #1476.

PiperOrigin-RevId: 303258251
---
 pkg/sentry/fsimpl/ext/BUILD            |  1 +
 pkg/sentry/fsimpl/ext/filesystem.go    | 12 ++++++++++
 pkg/sentry/fsimpl/gofer/BUILD          |  1 +
 pkg/sentry/fsimpl/gofer/filesystem.go  |  8 +++++++
 pkg/sentry/fsimpl/kernfs/BUILD         |  1 +
 pkg/sentry/fsimpl/kernfs/filesystem.go | 13 ++++++++++
 pkg/sentry/fsimpl/tmpfs/BUILD          |  1 +
 pkg/sentry/fsimpl/tmpfs/filesystem.go  |  8 +++++++
 pkg/sentry/vfs/BUILD                   |  1 +
 pkg/sentry/vfs/anonfs.go               |  9 +++++++
 pkg/sentry/vfs/filesystem.go           |  8 ++++++-
 pkg/sentry/vfs/vfs.go                  | 44 +++++++++++++++++++++++++++-------
 pkg/syserror/syserror.go               |  1 +
 13 files changed, 99 insertions(+), 9 deletions(-)

diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 6f78f478f..d83d75b3d 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -45,6 +45,7 @@ go_library(
         "//pkg/sentry/fsimpl/ext/disklayout",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/syscalls/linux",
         "//pkg/sentry/vfs",
         "//pkg/sync",
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 8497be615..48eaccdbc 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -463,6 +464,17 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return syserror.EROFS
 }
 
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return nil, err
+	}
+
+	// TODO(b/134676337): Support sockets.
+	return nil, syserror.ECONNREFUSED
+}
+
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
 func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
 	_, _, err := fs.walk(rp, false)
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 4ba76a1e8..d15a36709 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -46,6 +46,7 @@ go_library(
         "//pkg/sentry/memmap",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
+        "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 1e43df9ec..269624362 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -1059,6 +1060,13 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return fs.unlinkAt(ctx, rp, false /* dir */)
 }
 
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+//
+// TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) {
+	return nil, syserror.ECONNREFUSED
+}
+
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
 func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
 	var ds *[]*dentry
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index e73f1f857..b3d6299d0 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -35,6 +35,7 @@ go_library(
         "//pkg/refs",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserror",
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 31da8b511..a429fa23d 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -728,6 +729,18 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return nil
 }
 
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return nil, err
+	}
+	return nil, syserror.ECONNREFUSED
+}
+
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
 func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
 	fs.mu.RLock()
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 57abd5583..6ea35affb 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -46,6 +46,7 @@ go_library(
         "//pkg/sentry/memmap",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
+        "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
         "//pkg/sentry/vfs/lock",
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 12cc64385..e678ecc37 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -656,6 +657,13 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return nil
 }
 
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+//
+// TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) {
+	return nil, syserror.ECONNREFUSED
+}
+
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
 func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
 	fs.mu.RLock()
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index a2a06fc8f..bf4d27c7d 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -53,6 +53,7 @@ go_library(
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/limits",
         "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/unix/transport",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index a62e43589..f58867066 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -222,6 +223,14 @@ func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 	return syserror.EPERM
 }
 
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath) (transport.BoundEndpoint, error) {
+	if !rp.Final() {
+		return nil, syserror.ENOTDIR
+	}
+	return nil, syserror.ECONNREFUSED
+}
+
 // ListxattrAt implements FilesystemImpl.ListxattrAt.
 func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
 	if !rp.Done() {
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 332decce6..7b7d233f9 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 )
 
 // A Filesystem is a tree of nodes represented by Dentries, which forms part of
@@ -460,6 +461,11 @@ type FilesystemImpl interface {
 	// RemovexattrAt returns ENOTSUP.
 	RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
 
+	// BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
+	//
+	// - If a non-socket file exists at rp, then BoundEndpointAt returns ECONNREFUSED.
+	BoundEndpointAt(ctx context.Context, rp *ResolvingPath) (transport.BoundEndpoint, error)
+
 	// PrependPath prepends a path from vd to vd.Mount().Root() to b.
 	//
 	// If vfsroot.Ok(), it is the contextual VFS root; if it is encountered
@@ -482,7 +488,7 @@ type FilesystemImpl interface {
 	// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
 	PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
 
-	// TODO: inotify_add_watch(); bind()
+	// TODO: inotify_add_watch()
 }
 
 // PrependPathAtVFSRootError is returned by implementations of
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 03d1fb943..1708c1a53 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -38,6 +38,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -230,7 +231,7 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -271,7 +272,7 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -307,7 +308,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -340,7 +341,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -350,6 +351,33 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
 	}
 }
 
+// BoundEndpointAt gets the bound endpoint at the given path, if one exists.
+func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (transport.BoundEndpoint, error) {
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return nil, syserror.ECONNREFUSED
+		}
+		return nil, syserror.ENOENT
+	}
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return bep, nil
+		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+			}
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return nil, err
+		}
+	}
+}
+
 // OpenAt returns a FileDescription providing access to the file at the given
 // path. A reference is taken on the returned FileDescription.
 func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
@@ -494,7 +522,7 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -527,7 +555,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -608,7 +636,7 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -640,7 +668,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go
index 4b5a0fca6..f86db0999 100644
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@@ -27,6 +27,7 @@ import (
 var (
 	E2BIG        = error(syscall.E2BIG)
 	EACCES       = error(syscall.EACCES)
+	EADDRINUSE   = error(syscall.EADDRINUSE)
 	EAGAIN       = error(syscall.EAGAIN)
 	EBADF        = error(syscall.EBADF)
 	EBADFD       = error(syscall.EBADFD)
-- 
cgit v1.2.3


From be415eedcb3d5f61ed9f6a90bba8df2ecd54d8c8 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Wed, 26 Feb 2020 04:17:54 -0500
Subject: add arch-specific feature into mm

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/mm/lifecycle.go |  1 +
 pkg/sentry/mm/metadata.go  | 14 ++++++++++++++
 pkg/sentry/mm/mm.go        |  3 +++
 3 files changed, 18 insertions(+)

diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index d8a5b9d29..e6269107c 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -84,6 +84,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
 		dumpability:        mm.dumpability,
 		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
 		sleepForActivation: mm.sleepForActivation,
+		vdsoSigReturnAddr : mm.vdsoSigReturnAddr,
 	}
 
 	// Copy vmas.
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go
index 6a49334f4..28e5057f7 100644
--- a/pkg/sentry/mm/metadata.go
+++ b/pkg/sentry/mm/metadata.go
@@ -167,3 +167,17 @@ func (mm *MemoryManager) SetExecutable(file fsbridge.File) {
 		orig.DecRef()
 	}
 }
+
+// VDSOSigReturn returns the address of vdso_sigreturn.
+func (mm *MemoryManager) VDSOSigReturn() uint64 {
+	mm.metadataMu.Lock()
+	defer mm.metadataMu.Unlock()
+	return mm.vdsoSigReturnAddr
+}
+
+// SetVDSOSigReturn sets the address of vdso_sigreturn.
+func (mm *MemoryManager) SetVDSOSigReturn(addr uint64) {
+	mm.metadataMu.Lock()
+	defer mm.metadataMu.Unlock()
+	mm.vdsoSigReturnAddr = addr
+}
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index c2195ae11..34d3bde7a 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -231,6 +231,9 @@ type MemoryManager struct {
 	// before trying to activate the address space. When set to true, delays in
 	// activation are not reported as stuck tasks by the watchdog.
 	sleepForActivation bool
+
+	// vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
+	vdsoSigReturnAddr uint64
 }
 
 // vma represents a virtual memory area.
-- 
cgit v1.2.3


From 2a4aff7f7ea62e4aae1b175262b68a8212826176 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 27 Mar 2020 16:47:12 -0700
Subject: Support Hop By Hop and Destination Options ext hdr

Enables handling the Hop by Hop and Destination Options extension
headers, but options are not yet supported. All options will be
treated as unknown and their respective action will be followed.

Note, the stack does not yet support sending ICMPv6 error messages in
response to options that cannot be handled/parsed. That will come
in a later change (Issue #2211).

Tests:
- header_test.TestIPv6UnknownExtHdrOption
- header_test.TestIPv6OptionsExtHdrIterErr
- header_test.TestIPv6OptionsExtHdrIter
- ipv6_test.TestReceiveIPv6ExtHdrs
PiperOrigin-RevId: 303433085
---
 pkg/tcpip/header/ipv6_extension_headers.go      | 257 +++++++++++--
 pkg/tcpip/header/ipv6_extension_headers_test.go | 492 ++++++++++++++++++++++--
 pkg/tcpip/network/ipv6/ipv6.go                  |  86 ++++-
 pkg/tcpip/network/ipv6/ipv6_test.go             | 219 ++++++++++-
 4 files changed, 980 insertions(+), 74 deletions(-)

diff --git a/pkg/tcpip/header/ipv6_extension_headers.go b/pkg/tcpip/header/ipv6_extension_headers.go
index b8866d4d2..1b6c3f328 100644
--- a/pkg/tcpip/header/ipv6_extension_headers.go
+++ b/pkg/tcpip/header/ipv6_extension_headers.go
@@ -16,6 +16,7 @@ package header
 
 import (
 	"bufio"
+	"bytes"
 	"encoding/binary"
 	"fmt"
 	"io"
@@ -27,6 +28,10 @@ import (
 type IPv6ExtensionHeaderIdentifier uint8
 
 const (
+	// IPv6HopByHopOptionsExtHdrIdentifier is the header identifier of a Hop by
+	// Hop Options extension header, as per RFC 8200 section 4.3.
+	IPv6HopByHopOptionsExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 0
+
 	// IPv6RoutingExtHdrIdentifier is the header identifier of a Routing extension
 	// header, as per RFC 8200 section 4.4.
 	IPv6RoutingExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 43
@@ -35,12 +40,24 @@ const (
 	// extension header, as per RFC 8200 section 4.5.
 	IPv6FragmentExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 44
 
+	// IPv6DestinationOptionsExtHdrIdentifier is the header identifier of a
+	// Destination Options extension header, as per RFC 8200 section 4.6.
+	IPv6DestinationOptionsExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 60
+
 	// IPv6NoNextHeaderIdentifier is the header identifier used to signify the end
 	// of an IPv6 payload, as per RFC 8200 section 4.7.
 	IPv6NoNextHeaderIdentifier IPv6ExtensionHeaderIdentifier = 59
 )
 
 const (
+	// ipv6UnknownExtHdrOptionActionMask is the mask of the action to take when
+	// a node encounters an unrecognized option.
+	ipv6UnknownExtHdrOptionActionMask = 192
+
+	// ipv6UnknownExtHdrOptionActionShift is the least significant bits to discard
+	// from the action value for an unrecognized option identifier.
+	ipv6UnknownExtHdrOptionActionShift = 6
+
 	// ipv6RoutingExtHdrSegmentsLeftIdx is the index to the Segments Left field
 	// within an IPv6RoutingExtHdr.
 	ipv6RoutingExtHdrSegmentsLeftIdx = 1
@@ -107,6 +124,188 @@ type IPv6RawPayloadHeader struct {
 // isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
 func (IPv6RawPayloadHeader) isIPv6PayloadHeader() {}
 
+// ipv6OptionsExtHdr is an IPv6 extension header that holds options.
+type ipv6OptionsExtHdr []byte
+
+// Iter returns an iterator over the IPv6 extension header options held in b.
+func (b ipv6OptionsExtHdr) Iter() IPv6OptionsExtHdrOptionsIterator {
+	it := IPv6OptionsExtHdrOptionsIterator{}
+	it.reader.Reset(b)
+	return it
+}
+
+// IPv6OptionsExtHdrOptionsIterator is an iterator over IPv6 extension header
+// options.
+//
+// Note, between when an IPv6OptionsExtHdrOptionsIterator is obtained and last
+// used, no changes to the underlying buffer may happen. Doing so may cause
+// undefined and unexpected behaviour. It is fine to obtain an
+// IPv6OptionsExtHdrOptionsIterator, iterate over the first few options then
+// modify the backing payload so long as the IPv6OptionsExtHdrOptionsIterator
+// obtained before modification is no longer used.
+type IPv6OptionsExtHdrOptionsIterator struct {
+	reader bytes.Reader
+}
+
+// IPv6OptionUnknownAction is the action that must be taken if the processing
+// IPv6 node does not recognize the option, as outlined in RFC 8200 section 4.2.
+type IPv6OptionUnknownAction int
+
+const (
+	// IPv6OptionUnknownActionSkip indicates that the unrecognized option must
+	// be skipped and the node should continue processing the header.
+	IPv6OptionUnknownActionSkip IPv6OptionUnknownAction = 0
+
+	// IPv6OptionUnknownActionDiscard indicates that the packet must be silently
+	// discarded.
+	IPv6OptionUnknownActionDiscard IPv6OptionUnknownAction = 1
+
+	// IPv6OptionUnknownActionDiscardSendICMP indicates that the packet must be
+	// discarded and the node must send an ICMP Parameter Problem, Code 2, message
+	// to the packet's source, regardless of whether or not the packet's
+	// Destination was a multicast address.
+	IPv6OptionUnknownActionDiscardSendICMP IPv6OptionUnknownAction = 2
+
+	// IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest indicates that the
+	// packet must be discarded and the node must send an ICMP Parameter Problem,
+	// Code 2, message to the packet's source only if the packet's Destination was
+	// not a multicast address.
+	IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest IPv6OptionUnknownAction = 3
+)
+
+// IPv6ExtHdrOption is implemented by the various IPv6 extension header options.
+type IPv6ExtHdrOption interface {
+	// UnknownAction returns the action to take in response to an unrecognized
+	// option.
+	UnknownAction() IPv6OptionUnknownAction
+
+	// isIPv6ExtHdrOption is used to "lock" this interface so it is not
+	// implemented by other packages.
+	isIPv6ExtHdrOption()
+}
+
+// IPv6ExtHdrOptionIndentifier is an IPv6 extension header option identifier.
+type IPv6ExtHdrOptionIndentifier uint8
+
+const (
+	// ipv6Pad1ExtHdrOptionIdentifier is the identifier for a padding option that
+	// provides 1 byte padding, as outlined in RFC 8200 section 4.2.
+	ipv6Pad1ExtHdrOptionIdentifier IPv6ExtHdrOptionIndentifier = 0
+
+	// ipv6PadBExtHdrOptionIdentifier is the identifier for a padding option that
+	// provides variable length byte padding, as outlined in RFC 8200 section 4.2.
+	ipv6PadNExtHdrOptionIdentifier IPv6ExtHdrOptionIndentifier = 1
+)
+
+// IPv6UnknownExtHdrOption holds the identifier and data for an IPv6 extension
+// header option that is unknown by the parsing utilities.
+type IPv6UnknownExtHdrOption struct {
+	Identifier IPv6ExtHdrOptionIndentifier
+	Data       []byte
+}
+
+// UnknownAction implements IPv6OptionUnknownAction.UnknownAction.
+func (o *IPv6UnknownExtHdrOption) UnknownAction() IPv6OptionUnknownAction {
+	return IPv6OptionUnknownAction((o.Identifier & ipv6UnknownExtHdrOptionActionMask) >> ipv6UnknownExtHdrOptionActionShift)
+}
+
+// isIPv6ExtHdrOption implements IPv6ExtHdrOption.isIPv6ExtHdrOption.
+func (*IPv6UnknownExtHdrOption) isIPv6ExtHdrOption() {}
+
+// Next returns the next option in the options data.
+//
+// If the next item is not a known extension header option,
+// IPv6UnknownExtHdrOption will be returned with the option identifier and data.
+//
+// The return is of the format (option, done, error). done will be true when
+// Next is unable to return anything because the iterator has reached the end of
+// the options data, or an error occured.
+func (i *IPv6OptionsExtHdrOptionsIterator) Next() (IPv6ExtHdrOption, bool, error) {
+	for {
+		temp, err := i.reader.ReadByte()
+		if err != nil {
+			// If we can't read the first byte of a new option, then we know the
+			// options buffer has been exhausted and we are done iterating.
+			return nil, true, nil
+		}
+		id := IPv6ExtHdrOptionIndentifier(temp)
+
+		// If the option identifier indicates the option is a Pad1 option, then we
+		// know the option does not have Length and Data fields. End processing of
+		// the Pad1 option and continue processing the buffer as a new option.
+		if id == ipv6Pad1ExtHdrOptionIdentifier {
+			continue
+		}
+
+		length, err := i.reader.ReadByte()
+		if err != nil {
+			if err != io.EOF {
+				// ReadByte should only ever return nil or io.EOF.
+				panic(fmt.Sprintf("unexpected error when reading the option's Length field for option with id = %d: %s", id, err))
+			}
+
+			// We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected once
+			// we start parsing an option; we expect the reader to contain enough
+			// bytes for the whole option.
+			return nil, true, fmt.Errorf("error when reading the option's Length field for option with id = %d: %w", id, io.ErrUnexpectedEOF)
+		}
+
+		// Special-case the variable length padding option to avoid a copy.
+		if id == ipv6PadNExtHdrOptionIdentifier {
+			// Do we have enough bytes in the reader for the PadN option?
+			if n := i.reader.Len(); n < int(length) {
+				// Reset the reader to effectively consume the remaining buffer.
+				i.reader.Reset(nil)
+
+				// We return the same error as if we failed to read a non-padding option
+				// so consumers of this iterator don't need to differentiate between
+				// padding and non-padding options.
+				return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, io.ErrUnexpectedEOF)
+			}
+
+			if _, err := i.reader.Seek(int64(length), io.SeekCurrent); err != nil {
+				panic(fmt.Sprintf("error when skipping PadN (N = %d) option's data bytes: %s", length, err))
+			}
+
+			// End processing of the PadN option and continue processing the buffer as
+			// a new option.
+			continue
+		}
+
+		bytes := make([]byte, length)
+		if n, err := io.ReadFull(&i.reader, bytes); err != nil {
+			// io.ReadFull may return io.EOF if i.reader has been exhausted. We use
+			// io.ErrUnexpectedEOF instead as the io.EOF is unexpected given the
+			// Length field found in the option.
+			if err == io.EOF {
+				err = io.ErrUnexpectedEOF
+			}
+
+			return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, err)
+		}
+
+		return &IPv6UnknownExtHdrOption{Identifier: id, Data: bytes}, false, nil
+	}
+}
+
+// IPv6HopByHopOptionsExtHdr is a buffer holding the Hop By Hop Options
+// extension header.
+type IPv6HopByHopOptionsExtHdr struct {
+	ipv6OptionsExtHdr
+}
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6HopByHopOptionsExtHdr) isIPv6PayloadHeader() {}
+
+// IPv6DestinationOptionsExtHdr is a buffer holding the Destination Options
+// extension header.
+type IPv6DestinationOptionsExtHdr struct {
+	ipv6OptionsExtHdr
+}
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6DestinationOptionsExtHdr) isIPv6PayloadHeader() {}
+
 // IPv6RoutingExtHdr is a buffer holding the Routing extension header specific
 // data as outlined in RFC 8200 section 4.4.
 type IPv6RoutingExtHdr []byte
@@ -176,45 +375,19 @@ type IPv6PayloadIterator struct {
 
 // MakeIPv6PayloadIterator returns an iterator over the IPv6 payload containing
 // extension headers, or a raw payload if the payload cannot be parsed.
-func MakeIPv6PayloadIterator(nextHdrIdentifier IPv6ExtensionHeaderIdentifier, payload buffer.VectorisedView, check bool) (IPv6PayloadIterator, error) {
+func MakeIPv6PayloadIterator(nextHdrIdentifier IPv6ExtensionHeaderIdentifier, payload buffer.VectorisedView) IPv6PayloadIterator {
 	readers := payload.Readers()
 	readerPs := make([]io.Reader, 0, len(readers))
 	for i := range readers {
 		readerPs = append(readerPs, &readers[i])
 	}
 
-	// We need a buffer of size 1 for calls to bufio.Reader.ReadByte.
-	reader := *bufio.NewReaderSize(io.MultiReader(readerPs...), 1)
-
-	it := IPv6PayloadIterator{
+	return IPv6PayloadIterator{
 		nextHdrIdentifier: nextHdrIdentifier,
 		payload:           payload.Clone(nil),
-		reader:            reader,
-	}
-
-	var err error
-
-	if check {
-		for {
-			var done bool
-			if _, done, err = it.Next(); err != nil || done {
-				break
-			}
-		}
-
-		// Reset it (and its underlying readers) before returning it.
-		for i := range readers {
-			readers[i].Seek(0, io.SeekStart)
-		}
-		reader.Reset(io.MultiReader(readerPs...))
-		it = IPv6PayloadIterator{
-			nextHdrIdentifier: nextHdrIdentifier,
-			payload:           payload.Clone(nil),
-			reader:            reader,
-		}
+		// We need a buffer of size 1 for calls to bufio.Reader.ReadByte.
+		reader: *bufio.NewReaderSize(io.MultiReader(readerPs...), 1),
 	}
-
-	return it, err
 }
 
 // AsRawHeader returns the remaining payload of i as a raw header and
@@ -252,6 +425,14 @@ func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) {
 
 	// Is the header we are parsing a known extension header?
 	switch i.nextHdrIdentifier {
+	case IPv6HopByHopOptionsExtHdrIdentifier:
+		nextHdrIdentifier, bytes, err := i.nextHeaderData(false /* fragmentHdr */, nil)
+		if err != nil {
+			return nil, true, err
+		}
+
+		i.nextHdrIdentifier = nextHdrIdentifier
+		return IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: bytes}, false, nil
 	case IPv6RoutingExtHdrIdentifier:
 		nextHdrIdentifier, bytes, err := i.nextHeaderData(false /* fragmentHdr */, nil)
 		if err != nil {
@@ -280,6 +461,14 @@ func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) {
 
 		i.nextHdrIdentifier = nextHdrIdentifier
 		return fragmentExtHdr, false, nil
+	case IPv6DestinationOptionsExtHdrIdentifier:
+		nextHdrIdentifier, bytes, err := i.nextHeaderData(false /* fragmentHdr */, nil)
+		if err != nil {
+			return nil, true, err
+		}
+
+		i.nextHdrIdentifier = nextHdrIdentifier
+		return IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: bytes}, false, nil
 	case IPv6NoNextHeaderIdentifier:
 		// This indicates the end of the IPv6 payload.
 		return nil, true, nil
@@ -315,13 +504,11 @@ func (i *IPv6PayloadIterator) nextHeaderData(fragmentHdr bool, bytes []byte) (IP
 	length, err = i.reader.ReadByte()
 	i.payload.TrimFront(1)
 	if err != nil {
-		var ret error
 		if fragmentHdr {
-			ret = fmt.Errorf("error when reading the Length field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
-		} else {
-			ret = fmt.Errorf("error when reading the Reserved field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
+			return 0, nil, fmt.Errorf("error when reading the Length field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
 		}
-		return 0, nil, ret
+
+		return 0, nil, fmt.Errorf("error when reading the Reserved field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
 	}
 	if fragmentHdr {
 		length = 0
diff --git a/pkg/tcpip/header/ipv6_extension_headers_test.go b/pkg/tcpip/header/ipv6_extension_headers_test.go
index 4bfdc77c4..133ccc8b6 100644
--- a/pkg/tcpip/header/ipv6_extension_headers_test.go
+++ b/pkg/tcpip/header/ipv6_extension_headers_test.go
@@ -36,6 +36,354 @@ func (a IPv6RawPayloadHeader) Equal(b IPv6RawPayloadHeader) bool {
 	return a.Identifier == b.Identifier && bytes.Equal(a.Buf.ToView(), b.Buf.ToView())
 }
 
+// Equal returns true of a and b are equivalent.
+//
+// Note, Equal will return true if a and b hold equivalent ipv6OptionsExtHdrs.
+//
+// Needed to use cmp.Equal on IPv6RawPayloadHeader as it contains unexported
+// fields.
+func (a IPv6HopByHopOptionsExtHdr) Equal(b IPv6HopByHopOptionsExtHdr) bool {
+	return bytes.Equal(a.ipv6OptionsExtHdr, b.ipv6OptionsExtHdr)
+}
+
+// Equal returns true of a and b are equivalent.
+//
+// Note, Equal will return true if a and b hold equivalent ipv6OptionsExtHdrs.
+//
+// Needed to use cmp.Equal on IPv6RawPayloadHeader as it contains unexported
+// fields.
+func (a IPv6DestinationOptionsExtHdr) Equal(b IPv6DestinationOptionsExtHdr) bool {
+	return bytes.Equal(a.ipv6OptionsExtHdr, b.ipv6OptionsExtHdr)
+}
+
+func TestIPv6UnknownExtHdrOption(t *testing.T) {
+	tests := []struct {
+		name                  string
+		identifier            IPv6ExtHdrOptionIndentifier
+		expectedUnknownAction IPv6OptionUnknownAction
+	}{
+		{
+			name:                  "Skip with zero LSBs",
+			identifier:            0,
+			expectedUnknownAction: IPv6OptionUnknownActionSkip,
+		},
+		{
+			name:                  "Discard with zero LSBs",
+			identifier:            64,
+			expectedUnknownAction: IPv6OptionUnknownActionDiscard,
+		},
+		{
+			name:                  "Discard and ICMP with zero LSBs",
+			identifier:            128,
+			expectedUnknownAction: IPv6OptionUnknownActionDiscardSendICMP,
+		},
+		{
+			name:                  "Discard and ICMP for non multicast destination with zero LSBs",
+			identifier:            192,
+			expectedUnknownAction: IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest,
+		},
+		{
+			name:                  "Skip with non-zero LSBs",
+			identifier:            63,
+			expectedUnknownAction: IPv6OptionUnknownActionSkip,
+		},
+		{
+			name:                  "Discard with non-zero LSBs",
+			identifier:            127,
+			expectedUnknownAction: IPv6OptionUnknownActionDiscard,
+		},
+		{
+			name:                  "Discard and ICMP with non-zero LSBs",
+			identifier:            191,
+			expectedUnknownAction: IPv6OptionUnknownActionDiscardSendICMP,
+		},
+		{
+			name:                  "Discard and ICMP for non multicast destination with non-zero LSBs",
+			identifier:            255,
+			expectedUnknownAction: IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opt := &IPv6UnknownExtHdrOption{Identifier: test.identifier, Data: []byte{1, 2, 3, 4}}
+			if a := opt.UnknownAction(); a != test.expectedUnknownAction {
+				t.Fatalf("got UnknownAction() = %d, want = %d", a, test.expectedUnknownAction)
+			}
+		})
+	}
+
+}
+
+func TestIPv6OptionsExtHdrIterErr(t *testing.T) {
+	tests := []struct {
+		name  string
+		bytes []byte
+		err   error
+	}{
+		{
+			name:  "Single unknown with zero length",
+			bytes: []byte{255, 0},
+		},
+		{
+			name:  "Single unknown with non-zero length",
+			bytes: []byte{255, 3, 1, 2, 3},
+		},
+		{
+			name: "Two options",
+			bytes: []byte{
+				255, 0,
+				254, 1, 1,
+			},
+		},
+		{
+			name: "Three options",
+			bytes: []byte{
+				255, 0,
+				254, 1, 1,
+				253, 4, 2, 3, 4, 5,
+			},
+		},
+		{
+			name:  "Single unknown only identifier",
+			bytes: []byte{255},
+			err:   io.ErrUnexpectedEOF,
+		},
+		{
+			name:  "Single unknown too small with length = 1",
+			bytes: []byte{255, 1},
+			err:   io.ErrUnexpectedEOF,
+		},
+		{
+			name:  "Single unknown too small with length = 2",
+			bytes: []byte{255, 2, 1},
+			err:   io.ErrUnexpectedEOF,
+		},
+		{
+			name: "Valid first with second unknown only identifier",
+			bytes: []byte{
+				255, 0,
+				254,
+			},
+			err: io.ErrUnexpectedEOF,
+		},
+		{
+			name: "Valid first with second unknown missing data",
+			bytes: []byte{
+				255, 0,
+				254, 1,
+			},
+			err: io.ErrUnexpectedEOF,
+		},
+		{
+			name: "Valid first with second unknown too small",
+			bytes: []byte{
+				255, 0,
+				254, 2, 1,
+			},
+			err: io.ErrUnexpectedEOF,
+		},
+		{
+			name:  "One Pad1",
+			bytes: []byte{0},
+		},
+		{
+			name:  "Multiple Pad1",
+			bytes: []byte{0, 0, 0},
+		},
+		{
+			name: "Multiple PadN",
+			bytes: []byte{
+				// Pad3
+				1, 1, 1,
+
+				// Pad5
+				1, 3, 1, 2, 3,
+			},
+		},
+		{
+			name:  "Pad5 too small middle of data buffer",
+			bytes: []byte{1, 3, 1, 2},
+			err:   io.ErrUnexpectedEOF,
+		},
+		{
+			name:  "Pad5 no data",
+			bytes: []byte{1, 3},
+			err:   io.ErrUnexpectedEOF,
+		},
+	}
+
+	check := func(t *testing.T, it IPv6OptionsExtHdrOptionsIterator, expectedErr error) {
+		for i := 0; ; i++ {
+			_, done, err := it.Next()
+			if err != nil {
+				// If we encountered a non-nil error while iterating, make sure it is
+				// is the same error as expectedErr.
+				if !errors.Is(err, expectedErr) {
+					t.Fatalf("got %d-th Next() = %v, want = %v", i, err, expectedErr)
+				}
+
+				return
+			}
+			if done {
+				// If we are done (without an error), make sure that we did not expect
+				// an error.
+				if expectedErr != nil {
+					t.Fatalf("expected error when iterating; want = %s", expectedErr)
+				}
+
+				return
+			}
+		}
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			t.Run("Hop By Hop", func(t *testing.T) {
+				extHdr := IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: test.bytes}
+				check(t, extHdr.Iter(), test.err)
+			})
+
+			t.Run("Destination", func(t *testing.T) {
+				extHdr := IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: test.bytes}
+				check(t, extHdr.Iter(), test.err)
+			})
+		})
+	}
+}
+
+func TestIPv6OptionsExtHdrIter(t *testing.T) {
+	tests := []struct {
+		name     string
+		bytes    []byte
+		expected []IPv6ExtHdrOption
+	}{
+		{
+			name:  "Single unknown with zero length",
+			bytes: []byte{255, 0},
+			expected: []IPv6ExtHdrOption{
+				&IPv6UnknownExtHdrOption{Identifier: 255, Data: []byte{}},
+			},
+		},
+		{
+			name:  "Single unknown with non-zero length",
+			bytes: []byte{255, 3, 1, 2, 3},
+			expected: []IPv6ExtHdrOption{
+				&IPv6UnknownExtHdrOption{Identifier: 255, Data: []byte{1, 2, 3}},
+			},
+		},
+		{
+			name:  "Single Pad1",
+			bytes: []byte{0},
+		},
+		{
+			name:  "Two Pad1",
+			bytes: []byte{0, 0},
+		},
+		{
+			name:  "Single Pad3",
+			bytes: []byte{1, 1, 1},
+		},
+		{
+			name:  "Single Pad5",
+			bytes: []byte{1, 3, 1, 2, 3},
+		},
+		{
+			name: "Multiple Pad",
+			bytes: []byte{
+				// Pad1
+				0,
+
+				// Pad2
+				1, 0,
+
+				// Pad3
+				1, 1, 1,
+
+				// Pad4
+				1, 2, 1, 2,
+
+				// Pad5
+				1, 3, 1, 2, 3,
+			},
+		},
+		{
+			name: "Multiple options",
+			bytes: []byte{
+				// Pad1
+				0,
+
+				// Unknown
+				255, 0,
+
+				// Pad2
+				1, 0,
+
+				// Unknown
+				254, 1, 1,
+
+				// Pad3
+				1, 1, 1,
+
+				// Unknown
+				253, 4, 2, 3, 4, 5,
+
+				// Pad4
+				1, 2, 1, 2,
+			},
+			expected: []IPv6ExtHdrOption{
+				&IPv6UnknownExtHdrOption{Identifier: 255, Data: []byte{}},
+				&IPv6UnknownExtHdrOption{Identifier: 254, Data: []byte{1}},
+				&IPv6UnknownExtHdrOption{Identifier: 253, Data: []byte{2, 3, 4, 5}},
+			},
+		},
+	}
+
+	checkIter := func(t *testing.T, it IPv6OptionsExtHdrOptionsIterator, expected []IPv6ExtHdrOption) {
+		for i, e := range expected {
+			opt, done, err := it.Next()
+			if err != nil {
+				t.Errorf("(i=%d) Next(): %s", i, err)
+			}
+			if done {
+				t.Errorf("(i=%d) unexpectedly done iterating", i)
+			}
+			if diff := cmp.Diff(e, opt); diff != "" {
+				t.Errorf("(i=%d) got option mismatch (-want +got):\n%s", i, diff)
+			}
+
+			if t.Failed() {
+				t.FailNow()
+			}
+		}
+
+		opt, done, err := it.Next()
+		if err != nil {
+			t.Errorf("(last) Next(): %s", err)
+		}
+		if !done {
+			t.Errorf("(last) iterator unexpectedly not done")
+		}
+		if opt != nil {
+			t.Errorf("(last) got Next() = %T, want = nil", opt)
+		}
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			t.Run("Hop By Hop", func(t *testing.T) {
+				extHdr := IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: test.bytes}
+				checkIter(t, extHdr.Iter(), test.expected)
+			})
+
+			t.Run("Destination", func(t *testing.T) {
+				extHdr := IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: test.bytes}
+				checkIter(t, extHdr.Iter(), test.expected)
+			})
+		})
+	}
+}
+
 func TestIPv6RoutingExtHdr(t *testing.T) {
 	tests := []struct {
 		name         string
@@ -144,7 +492,6 @@ func TestIPv6ExtHdrIterErr(t *testing.T) {
 			firstNextHdr: 255,
 			payload:      makeVectorisedViewFromByteBuffers([]byte{1, 2, 3, 4}),
 		},
-
 		{
 			name:         "No next header",
 			firstNextHdr: IPv6NoNextHeaderIdentifier,
@@ -154,7 +501,17 @@ func TestIPv6ExtHdrIterErr(t *testing.T) {
 			firstNextHdr: IPv6NoNextHeaderIdentifier,
 			payload:      makeVectorisedViewFromByteBuffers([]byte{1, 2, 3, 4}),
 		},
-
+		{
+			name:         "Valid single hop by hop",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 4, 1, 2, 3, 4}),
+		},
+		{
+			name:         "Hop by hop too small",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 4, 1, 2, 3}),
+			err:          io.ErrUnexpectedEOF,
+		},
 		{
 			name:         "Valid single fragment",
 			firstNextHdr: IPv6FragmentExtHdrIdentifier,
@@ -166,7 +523,17 @@ func TestIPv6ExtHdrIterErr(t *testing.T) {
 			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 68, 9, 128, 4, 2}),
 			err:          io.ErrUnexpectedEOF,
 		},
-
+		{
+			name:         "Valid single destination",
+			firstNextHdr: IPv6DestinationOptionsExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 4, 1, 2, 3, 4}),
+		},
+		{
+			name:         "Destination too small",
+			firstNextHdr: IPv6DestinationOptionsExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 4, 1, 2, 3}),
+			err:          io.ErrUnexpectedEOF,
+		},
 		{
 			name:         "Valid single routing",
 			firstNextHdr: IPv6RoutingExtHdrIdentifier,
@@ -205,31 +572,93 @@ func TestIPv6ExtHdrIterErr(t *testing.T) {
 			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 1, 1, 2, 3, 4, 5, 6}, []byte{1, 2, 3, 4, 5, 6, 7}),
 			err:          io.ErrUnexpectedEOF,
 		},
-
 		{
 			name:         "Mixed",
-			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
 			payload: makeVectorisedViewFromByteBuffers([]byte{
-				// Fragment extension header.
-				uint8(IPv6RoutingExtHdrIdentifier), 0, 68, 9, 128, 4, 2, 1,
+				// Hop By Hop Options extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// (Atomic) Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6, 128, 4, 2, 1,
 
 				// Routing extension header.
-				255, 0, 1, 2, 3, 4, 5, 6,
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Destination Options extension header.
+				255, 0, 255, 4, 1, 2, 3, 4,
 
 				// Upper layer data.
 				1, 2, 3, 4,
 			}),
 		},
+		{
+			name:         "Mixed without upper layer data",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop Options extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// (Atomic) Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6, 128, 4, 2, 1,
+
+				// Routing extension header.
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Destination Options extension header.
+				255, 0, 255, 4, 1, 2, 3, 4,
+			}),
+		},
+		{
+			name:         "Mixed without upper layer data but last ext hdr too small",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop Options extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// (Atomic) Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6, 128, 4, 2, 1,
+
+				// Routing extension header.
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Destination Options extension header.
+				255, 0, 255, 4, 1, 2, 3,
+			}),
+			err: io.ErrUnexpectedEOF,
+		},
 	}
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			if _, err := MakeIPv6PayloadIterator(test.firstNextHdr, test.payload, false); err != nil {
-				t.Errorf("got MakeIPv6PayloadIterator(%d, _, false) = %s, want = nil", test.firstNextHdr, err)
-			}
+			it := MakeIPv6PayloadIterator(test.firstNextHdr, test.payload)
+
+			for i := 0; ; i++ {
+				_, done, err := it.Next()
+				if err != nil {
+					// If we encountered a non-nil error while iterating, make sure it is
+					// is the same error as test.err.
+					if !errors.Is(err, test.err) {
+						t.Fatalf("got %d-th Next() = %v, want = %v", i, err, test.err)
+					}
+
+					return
+				}
+				if done {
+					// If we are done (without an error), make sure that we did not expect
+					// an error.
+					if test.err != nil {
+						t.Fatalf("expected error when iterating; want = %s", test.err)
+					}
 
-			if _, err := MakeIPv6PayloadIterator(test.firstNextHdr, test.payload, true); !errors.Is(err, test.err) {
-				t.Errorf("got MakeIPv6PayloadIterator(%d, _, true) = %v, want = %v", test.firstNextHdr, err, test.err)
+					return
+				}
 			}
 		})
 	}
@@ -247,9 +676,12 @@ func TestIPv6ExtHdrIter(t *testing.T) {
 		// With a non-atomic fragment, the payload after the fragment will not be
 		// parsed because the payload may not be complete.
 		{
-			name:         "fragment - routing - upper",
-			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			name:         "hopbyhop - fragment - routing - upper",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
 			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
 				// Fragment extension header.
 				uint8(IPv6RoutingExtHdrIdentifier), 0, 68, 9, 128, 4, 2, 1,
 
@@ -260,6 +692,7 @@ func TestIPv6ExtHdrIter(t *testing.T) {
 				1, 2, 3, 4,
 			}),
 			expected: []IPv6PayloadHeader{
+				IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
 				IPv6FragmentExtHdr([6]byte{68, 9, 128, 4, 2, 1}),
 				IPv6RawPayloadHeader{
 					Identifier: IPv6RoutingExtHdrIdentifier,
@@ -292,7 +725,7 @@ func TestIPv6ExtHdrIter(t *testing.T) {
 		// If we have an atomic fragment, the payload following the fragment
 		// extension header should be parsed normally.
 		{
-			name:         "atomic fragment - routing - upper",
+			name:         "atomic fragment - routing - destination - upper",
 			firstNextHdr: IPv6FragmentExtHdrIdentifier,
 			payload: makeVectorisedViewFromByteBuffers([]byte{
 				// Fragment extension header.
@@ -301,7 +734,10 @@ func TestIPv6ExtHdrIter(t *testing.T) {
 				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6, 128, 4, 2, 1,
 
 				// Routing extension header.
-				255, 0, 1, 2, 3, 4, 5, 6,
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Destination Options extension header.
+				255, 0, 1, 4, 1, 2, 3, 4,
 
 				// Upper layer data.
 				1, 2, 3, 4,
@@ -309,6 +745,7 @@ func TestIPv6ExtHdrIter(t *testing.T) {
 			expected: []IPv6PayloadHeader{
 				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
 				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
 				IPv6RawPayloadHeader{
 					Identifier: 255,
 					Buf:        upperLayerData.ToVectorisedView(),
@@ -339,19 +776,23 @@ func TestIPv6ExtHdrIter(t *testing.T) {
 			},
 		},
 		{
-			name:         "atomic fragment - no next header",
+			name:         "atomic fragment - destination - no next header",
 			firstNextHdr: IPv6FragmentExtHdrIdentifier,
 			payload: makeVectorisedViewFromByteBuffers([]byte{
 				// Fragment extension header.
 				//
 				// Res (Reserved) bits are 1 which should not affect anything.
-				uint8(IPv6NoNextHeaderIdentifier), 0, 0, 6, 128, 4, 2, 1,
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 0, 6, 128, 4, 2, 1,
+
+				// Destination Options extension header.
+				uint8(IPv6NoNextHeaderIdentifier), 0, 1, 4, 1, 2, 3, 4,
 
 				// Random data.
 				1, 2, 3, 4,
 			}),
 			expected: []IPv6PayloadHeader{
 				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+				IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
 			},
 		},
 		{
@@ -395,9 +836,12 @@ func TestIPv6ExtHdrIter(t *testing.T) {
 			},
 		},
 		{
-			name:         "routing - fragment - no next header",
-			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			name:         "hopbyhop - routing - fragment - no next header",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
 			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop Options extension header.
+				uint8(IPv6RoutingExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
 				// Routing extension header.
 				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
 
@@ -410,6 +854,7 @@ func TestIPv6ExtHdrIter(t *testing.T) {
 				1, 2, 3, 4,
 			}),
 			expected: []IPv6PayloadHeader{
+				IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
 				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
 				IPv6FragmentExtHdr([6]byte{1, 6, 128, 4, 2, 1}),
 				IPv6RawPayloadHeader{
@@ -478,10 +923,7 @@ func TestIPv6ExtHdrIter(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			it, err := MakeIPv6PayloadIterator(test.firstNextHdr, test.payload, true)
-			if err != nil {
-				t.Fatalf("MakeIPv6PayloadIterator(%d, _ true): %s", test.firstNextHdr, err)
-			}
+			it := MakeIPv6PayloadIterator(test.firstNextHdr, test.payload)
 
 			for i, e := range test.expected {
 				extHdr, done, err := it.Next()
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index a703a768c..685239017 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -184,24 +184,61 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 	pkt.Data.TrimFront(header.IPv6MinimumSize)
 	pkt.Data.CapLength(int(h.PayloadLength()))
 
-	it, err := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), pkt.Data, true)
-	if err != nil {
-		r.Stats().IP.MalformedPacketsReceived.Increment()
-		return
-	}
+	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), pkt.Data)
 
-	for {
+	for firstHeader := true; ; firstHeader = false {
 		extHdr, done, err := it.Next()
 		if err != nil {
-			// This should never happen as MakeIPv6PayloadIterator above did not
-			// return an error.
-			panic(fmt.Sprintf("unexpected error when iterating over IPv6 payload: %s", err))
+			r.Stats().IP.MalformedPacketsReceived.Increment()
+			return
 		}
 		if done {
 			break
 		}
 
 		switch extHdr := extHdr.(type) {
+		case header.IPv6HopByHopOptionsExtHdr:
+			// As per RFC 8200 section 4.1, the Hop By Hop extension header is
+			// restricted to appear immediately after an IPv6 fixed header.
+			//
+			// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1
+			// (unrecognized next header) error in response to an extension header's
+			// Next Header field with the Hop By Hop extension header identifier.
+			if !firstHeader {
+				return
+			}
+
+			optsIt := extHdr.Iter()
+
+			for {
+				opt, done, err := optsIt.Next()
+				if err != nil {
+					r.Stats().IP.MalformedPacketsReceived.Increment()
+					return
+				}
+				if done {
+					break
+				}
+
+				// We currently do not support any IPv6 Hop By Hop extension header
+				// options.
+				switch opt.UnknownAction() {
+				case header.IPv6OptionUnknownActionSkip:
+				case header.IPv6OptionUnknownActionDiscard:
+					return
+				case header.IPv6OptionUnknownActionDiscardSendICMP:
+					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
+					// unrecognized IPv6 extension header options.
+					return
+				case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest:
+					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
+					// unrecognized IPv6 extension header options.
+					return
+				default:
+					panic(fmt.Sprintf("unrecognized action for an unrecognized Hop By Hop extension header option = %d", opt))
+				}
+			}
+
 		case header.IPv6RoutingExtHdr:
 			// As per RFC 8200 section 4.4, if a node encounters a routing header with
 			// an unrecognized routing type value, with a non-zero Segments Left
@@ -266,12 +303,39 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 				// We create a new iterator with the reassembled packet because we could
 				// have more extension headers in the reassembled payload, as per RFC
 				// 8200 section 4.5.
-				it, err = header.MakeIPv6PayloadIterator(rawPayload.Identifier, pkt.Data, true)
+				it = header.MakeIPv6PayloadIterator(rawPayload.Identifier, pkt.Data)
+			}
+
+		case header.IPv6DestinationOptionsExtHdr:
+			optsIt := extHdr.Iter()
+
+			for {
+				opt, done, err := optsIt.Next()
 				if err != nil {
 					r.Stats().IP.MalformedPacketsReceived.Increment()
-					r.Stats().IP.MalformedFragmentsReceived.Increment()
 					return
 				}
+				if done {
+					break
+				}
+
+				// We currently do not support any IPv6 Destination extension header
+				// options.
+				switch opt.UnknownAction() {
+				case header.IPv6OptionUnknownActionSkip:
+				case header.IPv6OptionUnknownActionDiscard:
+					return
+				case header.IPv6OptionUnknownActionDiscardSendICMP:
+					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
+					// unrecognized IPv6 extension header options.
+					return
+				case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest:
+					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
+					// unrecognized IPv6 extension header options.
+					return
+				default:
+					panic(fmt.Sprintf("unrecognized action for an unrecognized Destination extension header option = %d", opt))
+				}
 			}
 
 		case header.IPv6RawPayloadHeader:
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index 86bfda85e..37f7e53ce 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -37,9 +37,11 @@ const (
 
 	// Tests use the extension header identifier values as uint8 instead of
 	// header.IPv6ExtensionHeaderIdentifier.
-	routingExtHdrID  = uint8(header.IPv6RoutingExtHdrIdentifier)
-	fragmentExtHdrID = uint8(header.IPv6FragmentExtHdrIdentifier)
-	noNextHdrID      = uint8(header.IPv6NoNextHeaderIdentifier)
+	hopByHopExtHdrID    = uint8(header.IPv6HopByHopOptionsExtHdrIdentifier)
+	routingExtHdrID     = uint8(header.IPv6RoutingExtHdrIdentifier)
+	fragmentExtHdrID    = uint8(header.IPv6FragmentExtHdrIdentifier)
+	destinationExtHdrID = uint8(header.IPv6DestinationOptionsExtHdrIdentifier)
+	noNextHdrID         = uint8(header.IPv6NoNextHeaderIdentifier)
 )
 
 // testReceiveICMP tests receiving an ICMP packet from src to dst. want is the
@@ -289,6 +291,67 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{}, nextHdr },
 			shouldAccept: true,
 		},
+		{
+			name: "hopbyhop with unknown option skippable action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Skippable unknown.
+					62, 6, 1, 2, 3, 4, 5, 6,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "hopbyhop with unknown option discard action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard unknown.
+					127, 6, 1, 2, 3, 4, 5, 6,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "hopbyhop with unknown option discard and send icmp action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP if option is unknown.
+					191, 6, 1, 2, 3, 4, 5, 6,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "hopbyhop with unknown option discard and send icmp action unless multicast dest",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP unless packet is for multicast destination if
+					// option is unknown.
+					255, 6, 1, 2, 3, 4, 5, 6,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
 		{
 			name:         "routing with zero segments left",
 			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 0, 2, 3, 4, 5}, routingExtHdrID },
@@ -314,6 +377,72 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 0, 1, 2, 3, 4}, fragmentExtHdrID },
 			shouldAccept: false,
 		},
+		{
+			name:         "No next header",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{}, noNextHdrID },
+			shouldAccept: false,
+		},
+		{
+			name: "destination with unknown option skippable action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Skippable unknown.
+					62, 6, 1, 2, 3, 4, 5, 6,
+				}, destinationExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "destination with unknown option discard action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard unknown.
+					127, 6, 1, 2, 3, 4, 5, 6,
+				}, destinationExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "destination with unknown option discard and send icmp action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP if option is unknown.
+					191, 6, 1, 2, 3, 4, 5, 6,
+				}, destinationExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "destination with unknown option discard and send icmp action unless multicast dest",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP unless packet is for multicast destination if
+					// option is unknown.
+					255, 6, 1, 2, 3, 4, 5, 6,
+				}, destinationExtHdrID
+			},
+			shouldAccept: false,
+		},
 		{
 			name: "routing - atomic fragment",
 			extHdr: func(nextHdr uint8) ([]byte, uint8) {
@@ -340,11 +469,95 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 			},
 			shouldAccept: true,
 		},
+		{
+			name: "hop by hop (with skippable unknown) - routing",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Hop By Hop extension header with skippable unknown option.
+					routingExtHdrID, 0, 62, 4, 1, 2, 3, 4,
+
+					// Routing extension header.
+					nextHdr, 0, 1, 0, 2, 3, 4, 5,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "routing - hop by hop (with skippable unknown)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Routing extension header.
+					hopByHopExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Hop By Hop extension header with skippable unknown option.
+					nextHdr, 0, 62, 4, 1, 2, 3, 4,
+				}, routingExtHdrID
+			},
+			shouldAccept: false,
+		},
 		{
 			name:         "No next header",
 			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{}, noNextHdrID },
 			shouldAccept: false,
 		},
+		{
+			name: "hopbyhop (with skippable unknown) - routing - atomic fragment - destination (with skippable unknown)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Hop By Hop extension header with skippable unknown option.
+					routingExtHdrID, 0, 62, 4, 1, 2, 3, 4,
+
+					// Routing extension header.
+					fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Fragment extension header.
+					destinationExtHdrID, 0, 0, 0, 1, 2, 3, 4,
+
+					// Destination extension header with skippable unknown option.
+					nextHdr, 0, 63, 4, 1, 2, 3, 4,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "hopbyhop (with discard unknown) - routing - atomic fragment - destination (with skippable unknown)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Hop By Hop extension header with discard action for unknown option.
+					routingExtHdrID, 0, 65, 4, 1, 2, 3, 4,
+
+					// Routing extension header.
+					fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Fragment extension header.
+					destinationExtHdrID, 0, 0, 0, 1, 2, 3, 4,
+
+					// Destination extension header with skippable unknown option.
+					nextHdr, 0, 63, 4, 1, 2, 3, 4,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "hopbyhop (with skippable unknown) - routing - atomic fragment - destination (with discard unknown)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Hop By Hop extension header with skippable unknown option.
+					routingExtHdrID, 0, 62, 4, 1, 2, 3, 4,
+
+					// Routing extension header.
+					fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Fragment extension header.
+					destinationExtHdrID, 0, 0, 0, 1, 2, 3, 4,
+
+					// Destination extension header with discard action for unknown
+					// option.
+					nextHdr, 0, 65, 4, 1, 2, 3, 4,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
 	}
 
 	for _, test := range tests {
-- 
cgit v1.2.3


From 10f2c8db915df14102e3f4d9efcfce372c90707a Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 27 Mar 2020 16:53:28 -0700
Subject: Add FilesystemType.Name method, and FilesystemType field to
 Filesystem struct.

Both have analogues in Linux:
* struct file_system_type has a char *name field.
* struct super_block keeps a pointer to the file_system_type.

These fields are necessary to support the `filesystem type` field in
/proc/[pid]/mountinfo.

PiperOrigin-RevId: 303434063
---
 pkg/sentry/fsimpl/devtmpfs/devtmpfs.go  |  5 +++++
 pkg/sentry/fsimpl/ext/ext.go            | 12 ++++++++++--
 pkg/sentry/fsimpl/gofer/gofer.go        |  7 ++++++-
 pkg/sentry/fsimpl/host/host.go          | 15 ++++++++++++++-
 pkg/sentry/fsimpl/kernfs/kernfs.go      |  7 ++-----
 pkg/sentry/fsimpl/kernfs/kernfs_test.go |  8 ++++++--
 pkg/sentry/fsimpl/proc/filesystem.go    | 11 ++++++++---
 pkg/sentry/fsimpl/sys/sys.go            |  9 +++++++--
 pkg/sentry/fsimpl/tmpfs/tmpfs.go        |  7 ++++++-
 pkg/sentry/vfs/anonfs.go                | 13 +++++++++++++
 pkg/sentry/vfs/filesystem.go            | 11 ++++++++++-
 pkg/sentry/vfs/filesystem_type.go       |  3 +++
 pkg/sentry/vfs/vfs.go                   |  2 +-
 13 files changed, 91 insertions(+), 19 deletions(-)

diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index abd4f24e7..64f1b142c 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -42,6 +42,11 @@ type FilesystemType struct {
 	root *vfs.Dentry
 }
 
+// Name implements vfs.FilesystemType.Name.
+func (*FilesystemType) Name() string {
+	return Name
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	fst.initOnce.Do(func() {
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
index 373d23b74..7176af6d1 100644
--- a/pkg/sentry/fsimpl/ext/ext.go
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -30,6 +30,9 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// Name is the name of this filesystem.
+const Name = "ext"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
@@ -91,8 +94,13 @@ func isCompatible(sb disklayout.SuperBlock) bool {
 	return true
 }
 
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
-func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	// TODO(b/134676337): Ensure that the user is mounting readonly. If not,
 	// EACCESS should be returned according to mount(2). Filesystem independent
 	// flags (like readonly) are currently not available in pkg/sentry/vfs.
@@ -103,7 +111,7 @@ func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile
 	}
 
 	fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)}
-	fs.vfsfs.Init(vfsObj, &fs)
+	fs.vfsfs.Init(vfsObj, &fsType, &fs)
 	fs.sb, err = readSuperBlock(dev)
 	if err != nil {
 		return nil, nil, err
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index cf276a417..8e41b6b1c 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -199,6 +199,11 @@ const (
 	InteropModeShared
 )
 
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
@@ -374,7 +379,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		dentries:       make(map[*dentry]struct{}),
 		specialFileFDs: make(map[*specialFileFD]struct{}),
 	}
-	fs.vfsfs.Init(vfsObj, fs)
+	fs.vfsfs.Init(vfsObj, &fstype, fs)
 
 	// Construct the root dentry.
 	root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr)
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 17e3d6e9d..7d9dcd4c9 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -38,6 +38,19 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// filesystemType implements vfs.FilesystemType.
+type filesystemType struct{}
+
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	panic("cannot instaniate a host filesystem")
+}
+
+// Name implements FilesystemType.Name.
+func (filesystemType) Name() string {
+	return "none"
+}
+
 // filesystem implements vfs.FilesystemImpl.
 type filesystem struct {
 	kernfs.Filesystem
@@ -46,7 +59,7 @@ type filesystem struct {
 // NewMount returns a new disconnected mount in vfsObj that may be passed to ImportFD.
 func NewMount(vfsObj *vfs.VirtualFilesystem) (*vfs.Mount, error) {
 	fs := &filesystem{}
-	fs.Init(vfsObj)
+	fs.Init(vfsObj, &filesystemType{})
 	vfsfs := fs.VFSFilesystem()
 	// NewDisconnectedMount will take an additional reference on vfsfs.
 	defer vfsfs.DecRef()
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 794e38908..2cefef020 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -63,9 +63,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
-// FilesystemType implements vfs.FilesystemType.
-type FilesystemType struct{}
-
 // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
 // filesystem. Concrete implementations are expected to embed this in their own
 // Filesystem type.
@@ -138,8 +135,8 @@ func (fs *Filesystem) processDeferredDecRefsLocked() {
 // Init initializes a kernfs filesystem. This should be called from during
 // vfs.FilesystemType.NewFilesystem for the concrete filesystem embedding
 // kernfs.
-func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem) {
-	fs.vfsfs.Init(vfsObj, fs)
+func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem, fsType vfs.FilesystemType) {
+	fs.vfsfs.Init(vfsObj, fsType, fs)
 }
 
 // VFSFilesystem returns the generic vfs filesystem object.
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index fb0d25ad7..465451f35 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -187,9 +187,13 @@ func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, err
 	return nil, syserror.EPERM
 }
 
-func (fst *fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (fsType) Name() string {
+	return "kernfs"
+}
+
+func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	fs := &filesystem{}
-	fs.Init(vfsObj)
+	fs.Init(vfsObj, &fst)
 	root := fst.rootFn(creds, fs)
 	return fs.VFSFilesystem(), root.VFSDentry(), nil
 }
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 5c19d5522..104fc9030 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -36,8 +36,13 @@ type FilesystemType struct{}
 
 var _ vfs.FilesystemType = (*FilesystemType)(nil)
 
-// GetFilesystem implements vfs.FilesystemType.
-func (ft *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	k := kernel.KernelFromContext(ctx)
 	if k == nil {
 		return nil, nil, fmt.Errorf("procfs requires a kernel")
@@ -48,7 +53,7 @@ func (ft *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virtual
 	}
 
 	procfs := &kernfs.Filesystem{}
-	procfs.VFSFilesystem().Init(vfsObj, procfs)
+	procfs.VFSFilesystem().Init(vfsObj, &ft, procfs)
 
 	var cgroups map[string]string
 	if opts.InternalData != nil {
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 7abfd62f2..5c617270e 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -39,10 +39,15 @@ type filesystem struct {
 	kernfs.Filesystem
 }
 
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
-func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	fs := &filesystem{}
-	fs.Filesystem.Init(vfsObj)
+	fs.Filesystem.Init(vfsObj, &fsType)
 	k := kernel.KernelFromContext(ctx)
 	maxCPUCores := k.ApplicationCores()
 	defaultSysDirMode := linux.FileMode(0755)
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 2f9e6c876..b07b0dbae 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -63,6 +63,11 @@ type filesystem struct {
 	nextInoMinusOne uint64 // accessed using atomic memory operations
 }
 
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx)
@@ -74,7 +79,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		memFile: memFileProvider.MemoryFile(),
 		clock:   clock,
 	}
-	fs.vfsfs.Init(vfsObj, &fs)
+	fs.vfsfs.Init(vfsObj, &fstype, &fs)
 	root := fs.newDentry(fs.newDirectory(creds, 01777))
 	return &fs.vfsfs, &root.vfsd, nil
 }
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index f58867066..d1f6dfb45 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -51,6 +51,19 @@ const (
 	anonFileGID  = auth.RootKGID
 )
 
+// anonFilesystemType implements FilesystemType.
+type anonFilesystemType struct{}
+
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *auth.Credentials, string, GetFilesystemOptions) (*Filesystem, *Dentry, error) {
+	panic("cannot instaniate an anon filesystem")
+}
+
+// Name implemenents FilesystemType.Name.
+func (anonFilesystemType) Name() string {
+	return "none"
+}
+
 // anonFilesystem is the implementation of FilesystemImpl that backs
 // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
 //
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 7b7d233f9..cd34782ff 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -42,21 +42,30 @@ type Filesystem struct {
 	// immutable.
 	vfs *VirtualFilesystem
 
+	// fsType is the FilesystemType of this Filesystem.
+	fsType FilesystemType
+
 	// impl is the FilesystemImpl associated with this Filesystem. impl is
 	// immutable. This should be the last field in Dentry.
 	impl FilesystemImpl
 }
 
 // Init must be called before first use of fs.
-func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) {
+func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) {
 	fs.refs = 1
 	fs.vfs = vfsObj
+	fs.fsType = fsType
 	fs.impl = impl
 	vfsObj.filesystemsMu.Lock()
 	vfsObj.filesystems[fs] = struct{}{}
 	vfsObj.filesystemsMu.Unlock()
 }
 
+// FilesystemType returns the FilesystemType for this Filesystem.
+func (fs *Filesystem) FilesystemType() FilesystemType {
+	return fs.fsType
+}
+
 // VirtualFilesystem returns the containing VirtualFilesystem.
 func (fs *Filesystem) VirtualFilesystem() *VirtualFilesystem {
 	return fs.vfs
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index bb9cada81..f2298f7f6 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -30,6 +30,9 @@ type FilesystemType interface {
 	// along with its mount root. A reference is taken on the returned
 	// Filesystem and Dentry.
 	GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error)
+
+	// Name returns the name of this FilesystemType.
+	Name() string
 }
 
 // GetFilesystemOptions contains options to FilesystemType.GetFilesystem.
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 1708c1a53..720b90d8f 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -134,7 +134,7 @@ func (vfs *VirtualFilesystem) Init() error {
 	anonfs := anonFilesystem{
 		devMinor: anonfsDevMinor,
 	}
-	anonfs.vfsfs.Init(vfs, &anonfs)
+	anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs)
 	defer anonfs.vfsfs.DecRef()
 	anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
 	if err != nil {
-- 
cgit v1.2.3


From f6e4daa67ad5f07ac1bcff33476b4d13f49a69bc Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 27 Mar 2020 16:54:45 -0700
Subject: Add vfs.PathnameReachable().

/proc/[pid]/mount* omit mounts whose mount point is outside the chroot, which
is checked (indirectly) via __d_path().

PiperOrigin-RevId: 303434226
---
 pkg/sentry/vfs/pathname.go | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
index b318c681a..f21a88034 100644
--- a/pkg/sentry/vfs/pathname.go
+++ b/pkg/sentry/vfs/pathname.go
@@ -90,6 +90,49 @@ loop:
 	return b.String(), nil
 }
 
+// PathnameReachable returns an absolute pathname to vd, consistent with
+// Linux's __d_path() (as used by seq_path_root()). If vfsroot.Ok() and vd is
+// not reachable from vfsroot, such that seq_path_root() would return SEQ_SKIP
+// (causing the entire containing entry to be skipped), PathnameReachable
+// returns ("", nil).
+func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+	b := getFSPathBuilder()
+	defer putFSPathBuilder(b)
+	haveRef := false
+	defer func() {
+		if haveRef {
+			vd.DecRef()
+		}
+	}()
+loop:
+	for {
+		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+		switch err.(type) {
+		case nil:
+			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+				break loop
+			}
+			nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+			if !nextVD.Ok() {
+				return "", nil
+			}
+			if haveRef {
+				vd.DecRef()
+			}
+			vd = nextVD
+			haveRef = true
+		case PrependPathAtVFSRootError:
+			break loop
+		case PrependPathAtNonMountRootError, PrependPathSyntheticError:
+			return "", nil
+		default:
+			return "", err
+		}
+	}
+	b.PrependByte('/')
+	return b.String(), nil
+}
+
 // PathnameForGetcwd returns an absolute pathname to vd, consistent with
 // Linux's sys_getcwd().
 func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
-- 
cgit v1.2.3


From 4aee3706406d6b102540ad5bea272b7c893da827 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Mon, 30 Mar 2020 10:43:31 -0700
Subject: Internal change.

PiperOrigin-RevId: 303773475
---
 .../harness/machine_producers/gcloud_producer.py     | 20 ++++++++++++--------
 benchmarks/runner/__init__.py                        |  7 ++++---
 benchmarks/runner/commands.py                        |  7 +++++++
 scripts/benchmark.sh                                 |  3 ++-
 tools/images/build.sh                                |  9 +++++++--
 5 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/benchmarks/harness/machine_producers/gcloud_producer.py b/benchmarks/harness/machine_producers/gcloud_producer.py
index 1a624df2e..44d72f575 100644
--- a/benchmarks/harness/machine_producers/gcloud_producer.py
+++ b/benchmarks/harness/machine_producers/gcloud_producer.py
@@ -53,6 +53,8 @@ class GCloudProducer(machine_producer.MachineProducer):
     ssh_key_file: path to a valid ssh private key. See README on vaild ssh keys.
     ssh_user: string of user name for ssh_key
     ssh_password: string of password for ssh key
+    internal: if true, use internal IPs of instances. Used if bm-tools is
+    running on a GCP vm when a firewall is set for external IPs.
     mock: a mock printer which will print mock data if required. Mock data is
       recorded output from subprocess calls (returncode, stdout, args).
     condition: mutex for this class around machine creation and deleteion.
@@ -66,6 +68,7 @@ class GCloudProducer(machine_producer.MachineProducer):
                ssh_key_file: str,
                ssh_user: str,
                ssh_password: str,
+               internal: bool,
                mock: gcloud_mock_recorder.MockPrinter = None):
     self.image = image
     self.zone = zone
@@ -74,6 +77,7 @@ class GCloudProducer(machine_producer.MachineProducer):
     self.ssh_key_file = ssh_key_file
     self.ssh_user = ssh_user
     self.ssh_password = ssh_password
+    self.internal = internal
     self.mock = mock
     self.condition = threading.Condition()
 
@@ -129,15 +133,13 @@ class GCloudProducer(machine_producer.MachineProducer):
     machines = []
     for instance in instances:
       name = instance["name"]
+      external = instance["networkInterfaces"][0]["accessConfigs"][0]["natIP"]
+      internal = instance["networkInterfaces"][0]["networkIP"]
       kwargs = {
-          "hostname":
-              instance["networkInterfaces"][0]["accessConfigs"][0]["natIP"],
-          "key_path":
-              self.ssh_key_file,
-          "username":
-              self.ssh_user,
-          "key_password":
-              self.ssh_password
+          "hostname": internal if self.internal else external,
+          "key_path": self.ssh_key_file,
+          "username": self.ssh_user,
+          "key_password": self.ssh_password
       }
       machines.append(machine.RemoteMachine(name=name, **kwargs))
     return machines
@@ -190,6 +192,8 @@ class GCloudProducer(machine_producer.MachineProducer):
     for name in names:
       cmd = "gcloud compute ssh {user}@{name}".format(
           user=self.ssh_user, name=name).split(" ")
+      if self.internal:
+        cmd.append("--internal-ip")
       cmd.append("--ssh-key-file={key}".format(key=self.ssh_key_file))
       cmd.append("--zone={zone}".format(zone=self.zone))
       cmd.append("--command=uname")
diff --git a/benchmarks/runner/__init__.py b/benchmarks/runner/__init__.py
index ba27dc69f..ca785a148 100644
--- a/benchmarks/runner/__init__.py
+++ b/benchmarks/runner/__init__.py
@@ -120,8 +120,8 @@ def run_mock(ctx, **kwargs):
 
 @runner.command("run-gcp", commands.GCPCommand)
 @click.pass_context
-def run_gcp(ctx, image_file: str, zone_file: str, machine_type: str,
-            installers: List[str], **kwargs):
+def run_gcp(ctx, image_file: str, zone_file: str, internal: bool,
+            machine_type: str, installers: List[str], **kwargs):
   """Runs all benchmarks on GCP instances."""
 
   # Resolve all files.
@@ -137,7 +137,8 @@ def run_gcp(ctx, image_file: str, zone_file: str, machine_type: str,
       installers,
       ssh_key_file=key_file,
       ssh_user=harness.DEFAULT_USER,
-      ssh_password="")
+      ssh_password="",
+      internal=internal)
 
   try:
     run(ctx, producer, **kwargs)
diff --git a/benchmarks/runner/commands.py b/benchmarks/runner/commands.py
index 0fccb2fad..194804527 100644
--- a/benchmarks/runner/commands.py
+++ b/benchmarks/runner/commands.py
@@ -111,6 +111,12 @@ class GCPCommand(RunCommand):
         default=os.path.join(
             os.path.dirname(__file__), "../../tools/images/zone.txt"),
     )
+    internal = click.core.Option(
+        ("--internal/--no-internal",),
+        help="""Use instance internal IPs. Used if bm-tools runner is running on
+        GCP instance with firewall rules blocking external IPs.""",
+        default=False,
+    )
     installers = click.core.Option(
         ("--installers",),
         help="The set of installers to use.",
@@ -124,6 +130,7 @@ class GCPCommand(RunCommand):
     self.params.extend([
         image_file,
         zone_file,
+        internal,
         machine_type,
         installers,
     ])
diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh
index 06d44f914..334684675 100644
--- a/scripts/benchmark.sh
+++ b/scripts/benchmark.sh
@@ -29,7 +29,8 @@ gcloud config set compute/zone ${ZONE}
 bazel run //benchmarks:benchmarks -- \
   --verbose \
   run-gcp \
-  startup \
+  "(startup|absl)" \
+  --internal \
   --runtime=runc \
   --runtime=runsc \
   --installers=head
diff --git a/tools/images/build.sh b/tools/images/build.sh
index be462d556..f89f39cbd 100755
--- a/tools/images/build.sh
+++ b/tools/images/build.sh
@@ -63,13 +63,18 @@ trap cleanup EXIT
 # Wait for the instance to become available (up to 5 minutes).
 declare timeout=300
 declare success=0
+declare internal=""
 declare -r start=$(date +%s)
 declare -r end=$((${start}+${timeout}))
 while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do
-  if gcloud compute ssh --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- env - true 2>/dev/null; then
+  if gcloud compute ssh --zone "${internal}" "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- env - true 2>/dev/null; then
     success=$((${success}+1))
+  elif gcloud compute ssh --zone --internal-ip "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- env - true 2>/dev/null; then
+    success=$((${success}+1))
+    internal="--internal-ip"
   fi
 done
+
 if [[ "${success}" -eq "0" ]]; then
   echo "connect timed out after ${timeout} seconds."
   exit 1
@@ -77,7 +82,7 @@ fi
 
 # Run the install scripts provided.
 for arg; do
-  gcloud compute ssh --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- sudo bash - <"${arg}" >/dev/null
+  gcloud compute ssh --zone "${internal}" "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- sudo bash - <"${arg}" >/dev/null
 done
 
 # Stop the instance; required before creating an image.
-- 
cgit v1.2.3


From 3fac85da951f9f56d0232718ea7584250cf11f31 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 30 Mar 2020 12:36:30 -0700
Subject: kvm: handle exit reasons even under EINTR.

In the case of other signals (preemption), inject a normal bounce and
defer the signal until the vCPU has been returned from guest mode.

PiperOrigin-RevId: 303799678
---
 pkg/atomicbitops/atomicbitops_amd64.s           | 16 ++--
 pkg/atomicbitops/atomicbitops_arm64.s           | 16 ++--
 pkg/atomicbitops/atomicbitops_noasm.go          |  8 ++
 pkg/safecopy/safecopy.go                        |  4 +-
 pkg/safecopy/safecopy_unsafe.go                 |  6 +-
 pkg/sentry/platform/kvm/BUILD                   |  1 +
 pkg/sentry/platform/kvm/bluepill.go             | 12 ++-
 pkg/sentry/platform/kvm/bluepill_unsafe.go      | 97 ++++++++++++++++---------
 pkg/sentry/platform/kvm/kvm_const.go            |  1 +
 pkg/sentry/platform/kvm/kvm_test.go             | 64 ++++++++++++++--
 pkg/sentry/platform/kvm/machine.go              |  9 +--
 pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 25 -------
 pkg/sentry/platform/kvm/machine_arm64_unsafe.go | 26 -------
 pkg/sentry/platform/kvm/machine_unsafe.go       | 41 +++++++++++
 runsc/sandbox/sandbox.go                        |  6 --
 15 files changed, 207 insertions(+), 125 deletions(-)

diff --git a/pkg/atomicbitops/atomicbitops_amd64.s b/pkg/atomicbitops/atomicbitops_amd64.s
index 54c887ee5..f0edd4de7 100644
--- a/pkg/atomicbitops/atomicbitops_amd64.s
+++ b/pkg/atomicbitops/atomicbitops_amd64.s
@@ -16,28 +16,28 @@
 
 #include "textflag.h"
 
-TEXT ·AndUint32(SB),$0-12
+TEXT ·AndUint32(SB),NOSPLIT,$0-12
   MOVQ  addr+0(FP), BP
   MOVL  val+8(FP), AX
   LOCK
   ANDL   AX, 0(BP)
   RET
 
-TEXT ·OrUint32(SB),$0-12
+TEXT ·OrUint32(SB),NOSPLIT,$0-12
   MOVQ  addr+0(FP), BP
   MOVL  val+8(FP), AX
   LOCK
   ORL   AX, 0(BP)
   RET
 
-TEXT ·XorUint32(SB),$0-12
+TEXT ·XorUint32(SB),NOSPLIT,$0-12
   MOVQ  addr+0(FP), BP
   MOVL  val+8(FP), AX
   LOCK
   XORL   AX, 0(BP)
   RET
 
-TEXT ·CompareAndSwapUint32(SB),$0-20
+TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-20
   MOVQ  addr+0(FP), DI
   MOVL  old+8(FP), AX
   MOVL  new+12(FP), DX
@@ -46,28 +46,28 @@ TEXT ·CompareAndSwapUint32(SB),$0-20
   MOVL  AX, ret+16(FP)
   RET
 
-TEXT ·AndUint64(SB),$0-16
+TEXT ·AndUint64(SB),NOSPLIT,$0-16
   MOVQ  addr+0(FP), BP
   MOVQ  val+8(FP), AX
   LOCK
   ANDQ   AX, 0(BP)
   RET
 
-TEXT ·OrUint64(SB),$0-16
+TEXT ·OrUint64(SB),NOSPLIT,$0-16
   MOVQ  addr+0(FP), BP
   MOVQ  val+8(FP), AX
   LOCK
   ORQ   AX, 0(BP)
   RET
 
-TEXT ·XorUint64(SB),$0-16
+TEXT ·XorUint64(SB),NOSPLIT,$0-16
   MOVQ  addr+0(FP), BP
   MOVQ  val+8(FP), AX
   LOCK
   XORQ   AX, 0(BP)
   RET
 
-TEXT ·CompareAndSwapUint64(SB),$0-32
+TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-32
   MOVQ  addr+0(FP), DI
   MOVQ  old+8(FP), AX
   MOVQ  new+16(FP), DX
diff --git a/pkg/atomicbitops/atomicbitops_arm64.s b/pkg/atomicbitops/atomicbitops_arm64.s
index 5c780851b..644a6bca5 100644
--- a/pkg/atomicbitops/atomicbitops_arm64.s
+++ b/pkg/atomicbitops/atomicbitops_arm64.s
@@ -16,7 +16,7 @@
 
 #include "textflag.h"
 
-TEXT ·AndUint32(SB),$0-12
+TEXT ·AndUint32(SB),NOSPLIT,$0-12
   MOVD    ptr+0(FP), R0
   MOVW    val+8(FP), R1
 again:
@@ -26,7 +26,7 @@ again:
   CBNZ    R3, again
   RET
 
-TEXT ·OrUint32(SB),$0-12
+TEXT ·OrUint32(SB),NOSPLIT,$0-12
   MOVD    ptr+0(FP), R0
   MOVW    val+8(FP), R1
 again:
@@ -36,7 +36,7 @@ again:
   CBNZ    R3, again
   RET
 
-TEXT ·XorUint32(SB),$0-12
+TEXT ·XorUint32(SB),NOSPLIT,$0-12
   MOVD    ptr+0(FP), R0
   MOVW    val+8(FP), R1
 again:
@@ -46,7 +46,7 @@ again:
   CBNZ    R3, again
   RET
 
-TEXT ·CompareAndSwapUint32(SB),$0-20
+TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-20
   MOVD addr+0(FP), R0
   MOVW old+8(FP), R1
   MOVW new+12(FP), R2
@@ -60,7 +60,7 @@ done:
   MOVW R3, prev+16(FP)
   RET
 
-TEXT ·AndUint64(SB),$0-16
+TEXT ·AndUint64(SB),NOSPLIT,$0-16
   MOVD    ptr+0(FP), R0
   MOVD    val+8(FP), R1
 again:
@@ -70,7 +70,7 @@ again:
   CBNZ    R3, again
   RET
 
-TEXT ·OrUint64(SB),$0-16
+TEXT ·OrUint64(SB),NOSPLIT,$0-16
   MOVD    ptr+0(FP), R0
   MOVD    val+8(FP), R1
 again:
@@ -80,7 +80,7 @@ again:
   CBNZ    R3, again
   RET
 
-TEXT ·XorUint64(SB),$0-16
+TEXT ·XorUint64(SB),NOSPLIT,$0-16
   MOVD    ptr+0(FP), R0
   MOVD    val+8(FP), R1
 again:
@@ -90,7 +90,7 @@ again:
   CBNZ    R3, again
   RET
 
-TEXT ·CompareAndSwapUint64(SB),$0-32
+TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-32
   MOVD addr+0(FP), R0
   MOVD old+8(FP), R1
   MOVD new+16(FP), R2
diff --git a/pkg/atomicbitops/atomicbitops_noasm.go b/pkg/atomicbitops/atomicbitops_noasm.go
index 3b2898256..4e9c27b98 100644
--- a/pkg/atomicbitops/atomicbitops_noasm.go
+++ b/pkg/atomicbitops/atomicbitops_noasm.go
@@ -20,6 +20,7 @@ import (
 	"sync/atomic"
 )
 
+//go:nosplit
 func AndUint32(addr *uint32, val uint32) {
 	for {
 		o := atomic.LoadUint32(addr)
@@ -30,6 +31,7 @@ func AndUint32(addr *uint32, val uint32) {
 	}
 }
 
+//go:nosplit
 func OrUint32(addr *uint32, val uint32) {
 	for {
 		o := atomic.LoadUint32(addr)
@@ -40,6 +42,7 @@ func OrUint32(addr *uint32, val uint32) {
 	}
 }
 
+//go:nosplit
 func XorUint32(addr *uint32, val uint32) {
 	for {
 		o := atomic.LoadUint32(addr)
@@ -50,6 +53,7 @@ func XorUint32(addr *uint32, val uint32) {
 	}
 }
 
+//go:nosplit
 func CompareAndSwapUint32(addr *uint32, old, new uint32) (prev uint32) {
 	for {
 		prev = atomic.LoadUint32(addr)
@@ -62,6 +66,7 @@ func CompareAndSwapUint32(addr *uint32, old, new uint32) (prev uint32) {
 	}
 }
 
+//go:nosplit
 func AndUint64(addr *uint64, val uint64) {
 	for {
 		o := atomic.LoadUint64(addr)
@@ -72,6 +77,7 @@ func AndUint64(addr *uint64, val uint64) {
 	}
 }
 
+//go:nosplit
 func OrUint64(addr *uint64, val uint64) {
 	for {
 		o := atomic.LoadUint64(addr)
@@ -82,6 +88,7 @@ func OrUint64(addr *uint64, val uint64) {
 	}
 }
 
+//go:nosplit
 func XorUint64(addr *uint64, val uint64) {
 	for {
 		o := atomic.LoadUint64(addr)
@@ -92,6 +99,7 @@ func XorUint64(addr *uint64, val uint64) {
 	}
 }
 
+//go:nosplit
 func CompareAndSwapUint64(addr *uint64, old, new uint64) (prev uint64) {
 	for {
 		prev = atomic.LoadUint64(addr)
diff --git a/pkg/safecopy/safecopy.go b/pkg/safecopy/safecopy.go
index 2fb7e5809..521f1a82d 100644
--- a/pkg/safecopy/safecopy.go
+++ b/pkg/safecopy/safecopy.go
@@ -127,10 +127,10 @@ func initializeAddresses() {
 
 func init() {
 	initializeAddresses()
-	if err := ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(signalHandler).Pointer(), &savedSigSegVHandler); err != nil {
+	if err := ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(signalHandler).Pointer(), &savedSigSegVHandler, 0); err != nil {
 		panic(fmt.Sprintf("Unable to set handler for SIGSEGV: %v", err))
 	}
-	if err := ReplaceSignalHandler(syscall.SIGBUS, reflect.ValueOf(signalHandler).Pointer(), &savedSigBusHandler); err != nil {
+	if err := ReplaceSignalHandler(syscall.SIGBUS, reflect.ValueOf(signalHandler).Pointer(), &savedSigBusHandler, 0); err != nil {
 		panic(fmt.Sprintf("Unable to set handler for SIGBUS: %v", err))
 	}
 	syserror.AddErrorUnwrapper(func(e error) (syscall.Errno, bool) {
diff --git a/pkg/safecopy/safecopy_unsafe.go b/pkg/safecopy/safecopy_unsafe.go
index 41dd567f3..b15b920fe 100644
--- a/pkg/safecopy/safecopy_unsafe.go
+++ b/pkg/safecopy/safecopy_unsafe.go
@@ -324,11 +324,13 @@ func errorFromFaultSignal(addr uintptr, sig int32) error {
 //
 // It stores the value of the previously set handler in previous.
 //
+// The extraMask parameter is OR'ed into the existing signal handler mask.
+//
 // This function will be called on initialization in order to install safecopy
 // handlers for appropriate signals. These handlers will call the previous
 // handler however, and if this is function is being used externally then the
 // same courtesy is expected.
-func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr) error {
+func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr, extraMask uint64) error {
 	var sa struct {
 		handler  uintptr
 		flags    uint64
@@ -348,10 +350,10 @@ func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr
 	if sa.handler == 0 {
 		return fmt.Errorf("previous handler for signal %x isn't set", sig)
 	}
-
 	*previous = sa.handler
 
 	// Install our own handler.
+	sa.mask |= extraMask
 	sa.handler = handler
 	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 {
 		return e
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 159f7eafd..e27f57536 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -70,6 +70,7 @@ go_test(
         "requires-kvm",
     ],
     deps = [
+        "//pkg/procid",
         "//pkg/sentry/arch",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/kvm/testutil",
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
index 4b23f7803..555b5fa96 100644
--- a/pkg/sentry/platform/kvm/bluepill.go
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -46,6 +46,14 @@ var (
 	// bounceSignalMask has only bounceSignal set.
 	bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1))
 
+	// otherSignalsMask includes all other signals that will be cause the
+	// vCPU to exit during execution.
+	//
+	// Currently, this includes the preemption signal and the profiling
+	// signal. In general, these should be signals whose delivery actually
+	// influences the way the program executes as the switch can be costly.
+	otherSignalsMask = uint64(1<<(uint64(syscall.SIGURG)-1)) | uint64(1<<(uint64(syscall.SIGPROF)-1))
+
 	// bounce is the interrupt vector used to return to the kernel.
 	bounce = uint32(ring0.VirtualizationException)
 
@@ -86,8 +94,8 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) {
 }
 
 func init() {
-	// Install the handler.
-	if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil {
+	// Install the handler, masking all signals.
+	if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler, ^uint64(0)); err != nil {
 		panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err))
 	}
 
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 9add7c944..4e9d80765 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -24,6 +24,7 @@ import (
 	"syscall"
 	"unsafe"
 
+	"gvisor.dev/gvisor/pkg/atomicbitops"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
@@ -58,6 +59,19 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
 	return &((*arch.UContext64)(context).MContext)
 }
 
+// injectInterrupt is a helper to inject an interrupt.
+//
+//go:nosplit
+func injectInterrupt(c *vCPU) {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_INTERRUPT,
+		uintptr(unsafe.Pointer(&bounce))); errno != 0 {
+		throw("interrupt injection failed")
+	}
+}
+
 // bluepillHandler is called from the signal stub.
 //
 // The world may be stopped while this is executing, and it executes on the
@@ -69,6 +83,9 @@ func bluepillHandler(context unsafe.Pointer) {
 	// Sanitize the registers; interrupts must always be disabled.
 	c := bluepillArchEnter(bluepillArchContext(context))
 
+	// Enable preemption.
+	c.setSignalMask(true)
+
 	// Increment the number of switches.
 	atomic.AddUint32(&c.switches, 1)
 
@@ -89,6 +106,9 @@ func bluepillHandler(context unsafe.Pointer) {
 			// interrupted KVM. Since we're in a signal handler
 			// currently, all signals are masked and the signal
 			// must have been delivered directly to this thread.
+			//
+			// We will not be able to actually do subsequent
+			// KVM_RUNs until this signal is processed.
 			timeout := syscall.Timespec{}
 			sig, _, errno := syscall.RawSyscall6(
 				syscall.SYS_RT_SIGTIMEDWAIT,
@@ -98,12 +118,24 @@ func bluepillHandler(context unsafe.Pointer) {
 				8,                                 // sigset size.
 				0, 0)
 			if errno == syscall.EAGAIN {
-				continue
-			}
-			if errno != 0 {
+				// If weren't able to process this signal, then
+				// it must not have been in the bounceMask. By
+				// elimination, it must have been the
+				// preemption signal. We can't process this
+				// signal right now, so we need to disable
+				// preemption until the interrupt is actually
+				// handled.
+				c.setSignalMask(false)
+				// Note that there is a waiter for this vCPU.
+				// This will cause the vCPU to exit at some
+				// point in the future (releasing the user lock
+				// and guest mode).
+				atomicbitops.OrUint32(&c.state, vCPUWaiter)
+			} else if errno != 0 {
+				// We only expect success or a timeout.
 				throw("error waiting for pending signal")
-			}
-			if sig != uintptr(bounceSignal) {
+			} else if sig != uintptr(bounceSignal) {
+				// Only the bounce should be processed.
 				throw("unexpected signal")
 			}
 
@@ -114,11 +146,10 @@ func bluepillHandler(context unsafe.Pointer) {
 			// ready.
 			if c.runData.readyForInterruptInjection == 0 {
 				c.runData.requestInterruptWindow = 1
-				continue // Rerun vCPU.
 			} else {
-				// Force injection below; the vCPU is ready.
-				c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
+				injectInterrupt(c)
 			}
+			continue // Rerun vCPU.
 		case syscall.EFAULT:
 			// If a fault is not serviceable due to the host
 			// backing pages having page permissions, instead of an
@@ -137,6 +168,30 @@ func bluepillHandler(context unsafe.Pointer) {
 		}
 
 		switch c.runData.exitReason {
+		case _KVM_EXIT_HLT:
+			// Copy out registers.
+			bluepillArchExit(c, bluepillArchContext(context))
+
+			// Return to the vCPUReady state; notify any waiters.
+			user := atomic.LoadUint32(&c.state) & vCPUUser
+			switch atomic.SwapUint32(&c.state, user) {
+			case user | vCPUGuest: // Expected case.
+			case user | vCPUGuest | vCPUWaiter:
+				c.notify()
+			default:
+				throw("invalid state")
+			}
+			return
+		case _KVM_EXIT_IRQ_WINDOW_OPEN:
+			// Inject an interrupt now.
+			injectInterrupt(c)
+			// Clear previous injection request.
+			c.runData.requestInterruptWindow = 0
+		case _KVM_EXIT_INTR:
+			// This is fine, it is the normal exit reason during
+			// signal delivery. However, we still need to handle
+			// other potential exit reasons *combined* with EINTR,
+			// so this switch must be hit even after the above.
 		case _KVM_EXIT_EXCEPTION:
 			c.die(bluepillArchContext(context), "exception")
 			return
@@ -155,20 +210,6 @@ func bluepillHandler(context unsafe.Pointer) {
 		case _KVM_EXIT_DEBUG:
 			c.die(bluepillArchContext(context), "debug")
 			return
-		case _KVM_EXIT_HLT:
-			// Copy out registers.
-			bluepillArchExit(c, bluepillArchContext(context))
-
-			// Return to the vCPUReady state; notify any waiters.
-			user := atomic.LoadUint32(&c.state) & vCPUUser
-			switch atomic.SwapUint32(&c.state, user) {
-			case user | vCPUGuest: // Expected case.
-			case user | vCPUGuest | vCPUWaiter:
-				c.notify()
-			default:
-				throw("invalid state")
-			}
-			return
 		case _KVM_EXIT_MMIO:
 			// Increment the fault count.
 			atomic.AddUint32(&c.faults, 1)
@@ -200,18 +241,6 @@ func bluepillHandler(context unsafe.Pointer) {
 					data[i] = *b
 				}
 			}
-		case _KVM_EXIT_IRQ_WINDOW_OPEN:
-			// Interrupt: we must have requested an interrupt
-			// window; set the interrupt line.
-			if _, _, errno := syscall.RawSyscall(
-				syscall.SYS_IOCTL,
-				uintptr(c.fd),
-				_KVM_INTERRUPT,
-				uintptr(unsafe.Pointer(&bounce))); errno != 0 {
-				throw("interrupt injection failed")
-			}
-			// Clear previous injection request.
-			c.runData.requestInterruptWindow = 0
 		case _KVM_EXIT_SHUTDOWN:
 			c.die(bluepillArchContext(context), "shutdown")
 			return
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 1d5c77ff4..07d9c9a98 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -48,6 +48,7 @@ const (
 	_KVM_EXIT_IRQ_WINDOW_OPEN = 0x7
 	_KVM_EXIT_SHUTDOWN        = 0x8
 	_KVM_EXIT_FAIL_ENTRY      = 0x9
+	_KVM_EXIT_INTR            = 0xa
 	_KVM_EXIT_INTERNAL_ERROR  = 0x11
 	_KVM_EXIT_SYSTEM_EVENT    = 0x18
 )
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index c42752d50..d42ba3f24 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -16,12 +16,15 @@ package kvm
 
 import (
 	"math/rand"
+	"os"
 	"reflect"
+	"runtime"
 	"sync/atomic"
 	"syscall"
 	"testing"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/procid"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
@@ -320,15 +323,18 @@ func TestBounce(t *testing.T) {
 	})
 }
 
+// randomSleep is used by some race tests below.
+//
+// O(hundreds of microseconds) is appropriate to ensure different overlaps and
+// different schedules.
+func randomSleep() {
+	if n := rand.Intn(1000); n > 100 {
+		time.Sleep(time.Duration(n) * time.Microsecond)
+	}
+}
+
 func TestBounceStress(t *testing.T) {
 	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
-		randomSleep := func() {
-			// O(hundreds of microseconds) is appropriate to ensure
-			// different overlaps and different schedules.
-			if n := rand.Intn(1000); n > 100 {
-				time.Sleep(time.Duration(n) * time.Microsecond)
-			}
-		}
 		for i := 0; i < 1000; i++ {
 			// Start an asynchronously executing goroutine that
 			// calls Bounce at pseudo-random point in time.
@@ -355,6 +361,50 @@ func TestBounceStress(t *testing.T) {
 	})
 }
 
+func TestPreemption(t *testing.T) {
+	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+		// Lock the main vCPU thread.
+		runtime.LockOSThread()
+		pid := os.Getpid()
+		tid := procid.Current()
+		running := uint32(1)
+		defer atomic.StoreUint32(&running, 0)
+
+		// Start generating "preemptions".
+		go func() {
+			for atomic.LoadUint32(&running) != 0 {
+				// Kick via a preemption: best effort.
+				syscall.Tgkill(pid, int(tid), syscall.SIGURG)
+				randomSleep()
+			}
+		}()
+
+		for i := 0; i < 1000; i++ {
+			randomSleep()
+			var si arch.SignalInfo
+			if _, err := c.SwitchToUser(ring0.SwitchOpts{
+				Registers:          regs,
+				FloatingPointState: dummyFPState,
+				PageTables:         pt,
+			}, &si); err != platform.ErrContextInterrupt {
+				t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt)
+			}
+			// Was this caused by a preemption signal?
+			if got := atomic.LoadUint32(&c.state); got&vCPUGuest != 0 && got&vCPUWaiter == 0 {
+				continue
+			}
+			c.unlock()
+			// Should have dropped from guest mode, processed preemption.
+			if got := atomic.LoadUint32(&c.state); got != vCPUReady {
+				t.Errorf("vCPU not in ready state: got %v", got)
+			}
+			randomSleep()
+			c.lock()
+		}
+		return false
+	})
+}
+
 func TestInvalidate(t *testing.T) {
 	var data uintptr // Used below.
 	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f1afc74dc..345b71e8f 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -108,6 +108,9 @@ type vCPU struct {
 	// This is a bitmask of the three fields (vCPU*) described above.
 	state uint32
 
+	// signalMask is the vCPU signal mask.
+	signalMask uint64
+
 	// runData for this vCPU.
 	runData *runData
 
@@ -121,6 +124,7 @@ type vCPU struct {
 	// vCPUArchState is the architecture-specific state.
 	vCPUArchState
 
+	// dieState is the temporary state associated with throwing exceptions.
 	dieState dieState
 }
 
@@ -153,11 +157,6 @@ func (m *machine) newVCPU() *vCPU {
 	c.CPU.Init(&m.kernel, c)
 	m.vCPUsByID[c.id] = c
 
-	// Ensure the signal mask is correct.
-	if err := c.setSignalMask(); err != nil {
-		panic(fmt.Sprintf("error setting signal mask: %v", err))
-	}
-
 	// Map the run data.
 	runData, err := mapRunData(int(fd))
 	if err != nil {
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 7156c245f..52286e56d 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -111,31 +111,6 @@ func (c *vCPU) setSystemTime() error {
 	return nil
 }
 
-// setSignalMask sets the vCPU signal mask.
-//
-// This must be called prior to running the vCPU.
-func (c *vCPU) setSignalMask() error {
-	// The layout of this structure implies that it will not necessarily be
-	// the same layout chosen by the Go compiler. It gets fudged here.
-	var data struct {
-		length uint32
-		mask1  uint32
-		mask2  uint32
-		_      uint32
-	}
-	data.length = 8 // Fixed sigset size.
-	data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
-	data.mask2 = ^uint32(bounceSignalMask >> 32)
-	if _, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(c.fd),
-		_KVM_SET_SIGNAL_MASK,
-		uintptr(unsafe.Pointer(&data))); errno != 0 {
-		return fmt.Errorf("error setting signal mask: %v", errno)
-	}
-	return nil
-}
-
 // setUserRegisters sets user registers in the vCPU.
 func (c *vCPU) setUserRegisters(uregs *userRegs) error {
 	if _, _, errno := syscall.RawSyscall(
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index b531f2f85..185eeb4f0 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -268,32 +268,6 @@ func (c *vCPU) setSystemTime() error {
 	return nil
 }
 
-// setSignalMask sets the vCPU signal mask.
-//
-// This must be called prior to running the vCPU.
-func (c *vCPU) setSignalMask() error {
-	// The layout of this structure implies that it will not necessarily be
-	// the same layout chosen by the Go compiler. It gets fudged here.
-	var data struct {
-		length uint32
-		mask1  uint32
-		mask2  uint32
-		_      uint32
-	}
-	data.length = 8 // Fixed sigset size.
-	data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
-	data.mask2 = ^uint32(bounceSignalMask >> 32)
-	if _, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(c.fd),
-		_KVM_SET_SIGNAL_MASK,
-		uintptr(unsafe.Pointer(&data))); errno != 0 {
-		return fmt.Errorf("error setting signal mask: %v", errno)
-	}
-
-	return nil
-}
-
 // SwitchToUser unpacks architectural-details.
 func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) {
 	// Check for canonical addresses.
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index f04be2ab5..e4de0a889 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -87,6 +87,47 @@ func unmapRunData(r *runData) error {
 	return nil
 }
 
+// setSignalMask sets the vCPU signal mask.
+//
+// This will be called from the bluepill handler, and therefore must not
+// perform any allocation.
+//
+//go:nosplit
+func (c *vCPU) setSignalMask(enableOthers bool) {
+	// The signal mask is either:
+	// *) Only the bounce signal, which we need to use to execute the
+	//    machine state up until the bounce interrupt can be processed.
+	//    or
+	// *) All signals, which is the default state unless we need to
+	//    continue execution to exit guest mode (the case above).
+	mask := bounceSignalMask
+	if enableOthers {
+		mask |= otherSignalsMask
+	}
+	if c.signalMask == mask {
+		return // Already set.
+	}
+
+	// The layout of this structure implies that it will not necessarily be
+	// the same layout chosen by the Go compiler. It gets fudged here.
+	var data struct {
+		length uint32
+		mask1  uint32
+		mask2  uint32
+		_      uint32
+	}
+	data.length = 8 // Fixed sigset size.
+	data.mask1 = ^uint32(mask & 0xffffffff)
+	data.mask2 = ^uint32(mask >> 32)
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_SIGNAL_MASK,
+		uintptr(unsafe.Pointer(&data))); errno != 0 {
+		throw("setSignal mask failed")
+	}
+}
+
 // atomicAddressSpace is an atomic address space pointer.
 type atomicAddressSpace struct {
 	pointer unsafe.Pointer
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 8de75ae57..6c15727fa 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -444,12 +444,6 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		nextFD++
 	}
 
-	// TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff
-	// isn't set.
-	if conf.Platform == "kvm" {
-		cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1")
-	}
-
 	// The current process' stdio must be passed to the application via the
 	// --stdio-fds flag. The stdio of the sandbox process itself must not
 	// be connected to the same FDs, otherwise we risk leaking sandbox
-- 
cgit v1.2.3


From e36eccc4b18676e2cb441380d0e4e46f038f638e Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Mon, 30 Mar 2020 13:04:44 -0700
Subject: BigQuery schema for benchmark-tools dashboard.

PiperOrigin-RevId: 303805784
---
 WORKSPACE                  |  43 ++++++++++++++++
 tools/bigquery/BUILD       |  10 ++++
 tools/bigquery/bigquery.go | 121 +++++++++++++++++++++++++++++++++++++++++++++
 tools/nogo.json            |   4 +-
 4 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100644 tools/bigquery/BUILD
 create mode 100644 tools/bigquery/bigquery.go

diff --git a/WORKSPACE b/WORKSPACE
index 62dfb9dc6..4d2b4a72f 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -386,6 +386,49 @@ go_repository(
     version = "v1.0.0",
 )
 
+go_repository(
+    name = "com_google_cloud_go_bigquery",
+    importpath = "cloud.google.com/go/bigquery",
+    sum = "h1:K2NyuHRuv15ku6eUpe0DQk5ZykPMnSOnvuVf6IHcjaE=",
+    version = "v1.5.0",
+)
+
+go_repository(
+    name = "org_golang_google_api",
+    importpath = "google.golang.org/api",
+    sum = "h1:jz2KixHX7EcCPiQrySzPdnYT7DbINAypCqKZ1Z7GM40=",
+    version = "v0.20.0",
+)
+
+# BigQuery Dependencies for Benchmarks
+go_repository(
+    name = "com_google_cloud_go",
+    importpath = "cloud.google.com/go",
+    sum = "h1:eoz/lYxKSL4CNAiaUJ0ZfD1J3bfMYbU5B3rwM1C1EIU=",
+    version = "v0.55.0",
+)
+
+go_repository(
+    name = "com_github_googleapis_gax_go_v2",
+    importpath = "github.com/googleapis/gax-go/v2",
+    sum = "h1:sjZBwGj9Jlw33ImPtvFviGYvseOtDM7hkSKB7+Tv3SM=",
+    version = "v2.0.5",
+)
+
+go_repository(
+    name = "io_opencensus_go",
+    importpath = "go.opencensus.io",
+    sum = "h1:8sGtKOrtQqkN1bp2AtX+misvLIlOmsEsNd+9NIcPEm8=",
+    version = "v0.22.3",
+)
+
+go_repository(
+    name = "com_github_golang_groupcache",
+    importpath = "github.com/golang/groupcache",
+    sum = "h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=",
+    version = "v0.0.0-20200121045136-8c9f03a8e57e",
+)
+
 # System Call test dependencies.
 http_archive(
     name = "com_google_absl",
diff --git a/tools/bigquery/BUILD b/tools/bigquery/BUILD
new file mode 100644
index 000000000..5748fb390
--- /dev/null
+++ b/tools/bigquery/BUILD
@@ -0,0 +1,10 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "bigquery",
+    testonly = 1,
+    srcs = ["bigquery.go"],
+    deps = ["@com_google_cloud_go_bigquery//:go_default_library"],
+)
diff --git a/tools/bigquery/bigquery.go b/tools/bigquery/bigquery.go
new file mode 100644
index 000000000..56f0dc5c9
--- /dev/null
+++ b/tools/bigquery/bigquery.go
@@ -0,0 +1,121 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package bigquery defines a BigQuery schema for benchmarks.
+//
+// This package contains a schema for BigQuery and methods for publishing
+// benchmark data into tables.
+package bigquery
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	bq "cloud.google.com/go/bigquery"
+)
+
+// Benchmark is the top level structure of recorded benchmark data. BigQuery
+// will infer the schema from this.
+type Benchmark struct {
+	Name      string    `bq:"name"`
+	Timestamp time.Time `bq:"timestamp"`
+	Official  bool      `bq:"official"`
+	Metric    []*Metric `bq:"metric"`
+	Metadata  *Metadata `bq:"metadata"`
+}
+
+// Metric holds the actual metric data and unit information for this benchmark.
+type Metric struct {
+	Name   string  `bq:"name"`
+	Unit   string  `bq:"unit"`
+	Sample float64 `bq:"sample"`
+}
+
+// Metadata about this benchmark.
+type Metadata struct {
+	CL          string `bq:"changelist"`
+	IterationID string `bq:"iteration_id"`
+	PendingCL   string `bq:"pending_cl"`
+	Workflow    string `bq:"workflow"`
+	Platform    string `bq:"platform"`
+	Gofer       string `bq:"gofer"`
+}
+
+// InitBigQuery initializes a BigQuery dataset/table in the project. If the dataset/table already exists, it is not duplicated.
+func InitBigQuery(ctx context.Context, projectID, datasetID, tableID string) error {
+	client, err := bq.NewClient(ctx, projectID)
+	if err != nil {
+		return fmt.Errorf("failed to initialize client on project %s: %v", projectID, err)
+	}
+	defer client.Close()
+
+	dataset := client.Dataset(datasetID)
+	if err := dataset.Create(ctx, nil); err != nil && !checkDuplicateError(err) {
+		return fmt.Errorf("failed to create dataset: %s: %v", datasetID, err)
+	}
+
+	table := dataset.Table(tableID)
+	schema, err := bq.InferSchema(Benchmark{})
+	if err != nil {
+		return fmt.Errorf("failed to infer schema: %v", err)
+	}
+
+	if err := table.Create(ctx, &bq.TableMetadata{Schema: schema}); err != nil && !checkDuplicateError(err) {
+		return fmt.Errorf("failed to create table: %s: %v", tableID, err)
+	}
+	return nil
+}
+
+// AddMetric adds a metric to an existing Benchmark.
+func (bm *Benchmark) AddMetric(metricName, unit string, sample float64) {
+	m := &Metric{
+		Name:   metricName,
+		Unit:   unit,
+		Sample: sample,
+	}
+	bm.Metric = append(bm.Metric, m)
+}
+
+// NewBenchmark initializes a new benchmark.
+func NewBenchmark(name string, official bool) *Benchmark {
+	return &Benchmark{
+		Name:      name,
+		Timestamp: time.Now().UTC(),
+		Official:  official,
+		Metric:    make([]*Metric, 0),
+	}
+}
+
+// SendBenchmarks sends the slice of benchmarks to the BigQuery dataset/table.
+func SendBenchmarks(ctx context.Context, benchmarks []*Benchmark, projectID, datasetID, tableID string) error {
+	client, err := bq.NewClient(ctx, projectID)
+	if err != nil {
+		return fmt.Errorf("Failed to initialize client on project: %s: %v", projectID, err)
+	}
+	defer client.Close()
+
+	uploader := client.Dataset(datasetID).Table(tableID).Uploader()
+	if err = uploader.Put(ctx, benchmarks); err != nil {
+		return fmt.Errorf("failed to upload benchmarks to proejct %s, table %s.%s: %v", projectID, datasetID, tableID, err)
+	}
+
+	return nil
+}
+
+// BigQuery will error "409" for duplicate tables and datasets.
+func checkDuplicateError(err error) bool {
+	return strings.Contains(err.Error(), "googleapi: Error 409: Already Exists")
+}
diff --git a/tools/nogo.json b/tools/nogo.json
index 2b4c6d3b6..83cb76b93 100644
--- a/tools/nogo.json
+++ b/tools/nogo.json
@@ -35,7 +35,9 @@
       "/com_github_vishvananda_netlink/route_linux.go": "allowed: false positive",
       "/external/bazel_gazelle/cmd/gazelle/.*": "allowed: false positive",
       "/org_golang_x_tools/go/packages/golist.go": "allowed: runtime internals",
-      "/pkg/sentry/platform/kvm/kvm_test.go": "allowed: intentional"
+      "/pkg/sentry/platform/kvm/kvm_test.go": "allowed: intentional",
+      "/tools/bigquery/bigquery.go": "allowed: false positive",
+      "/external/io_opencensus_go/tag/map_codec.go": "allowed: false positive"
     }
   },
   "printf": {
-- 
cgit v1.2.3


From 32a133537e61bbceb6a0a16c95815495d8f17a35 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 30 Mar 2020 14:37:17 -0700
Subject: Add AMD Rome CPUID flag.

This flag is set on Rome CPUs, but it is not documented.

PiperOrigin-RevId: 303825532
---
 pkg/cpuid/cpuid_x86.go | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pkg/cpuid/cpuid_x86.go b/pkg/cpuid/cpuid_x86.go
index a0bc55ea1..9abf6914d 100644
--- a/pkg/cpuid/cpuid_x86.go
+++ b/pkg/cpuid/cpuid_x86.go
@@ -235,7 +235,9 @@ const (
 	X86FeaturePERFCTR_TSC
 	X86FeaturePERFCTR_LLC
 	X86FeatureMWAITX
-	// ECX[31:30] are reserved.
+	// TODO(b/152776797): Some CPUs set this but it is not documented anywhere.
+	X86FeatureBlock5Bit30
+	_ // ecx bit 31 is reserved.
 )
 
 // Block 6 constants are the extended feature bits in
@@ -438,6 +440,9 @@ var x86FeatureParseOnlyStrings = map[Feature]string{
 
 	// Block 3.
 	X86FeaturePREFETCHWT1: "prefetchwt1",
+
+	// Block 5.
+	X86FeatureBlock5Bit30: "block5_bit30",
 }
 
 // intelCacheDescriptors describe the caches and TLBs on the system. They are
-- 
cgit v1.2.3


From 0cfdd47391d30dfe8214e2d11bdad9b27419ad26 Mon Sep 17 00:00:00 2001
From: Aaron Lu <ziqian.lzq@antfin.com>
Date: Mon, 16 Mar 2020 15:12:56 +0800
Subject: checkpoint/restore: make sure the donated stdioFDs have the same
 value

Suppose I start a runsc container using kvm platform like this:
$ sudo runsc --debug=true --debug-log=1.txt --platform=kvm run rootbash
The donating FD and the corresponding cmdline for runsc-sandbox is:

D0313 17:50:12.608203   44389 x:0] Donating FD 3: "1.txt"
D0313 17:50:12.608214   44389 x:0] Donating FD 4: "control_server_socket"
D0313 17:50:12.608224   44389 x:0] Donating FD 5: "|0"
D0313 17:50:12.608229   44389 x:0] Donating FD 6: "/home/ziqian.lzq/bundle/bash/runsc/config.json"
D0313 17:50:12.608234   44389 x:0] Donating FD 7: "|1"
D0313 17:50:12.608238   44389 x:0] Donating FD 8: "sandbox IO FD"
D0313 17:50:12.608242   44389 x:0] Donating FD 9: "/dev/kvm"
D0313 17:50:12.608246   44389 x:0] Donating FD 10: "/dev/stdin"
D0313 17:50:12.608249   44389 x:0] Donating FD 11: "/dev/stdout"
D0313 17:50:12.608253   44389 x:0] Donating FD 12: "/dev/stderr"
D0313 17:50:12.608257   44389 x:0] Starting sandbox: /proc/self/exe
[runsc-sandbox --root=/run/containerd/runsc/default --debug=true --log=
--max-threads=256 --reclaim-period=5 --log-format=text --debug-log=1.txt
--debug-log-format=text --file-access=exclusive --overlay=false
--fsgofer-host-uds=false --network=sandbox --log-packets=false
--platform=kvm --strace=false --strace-syscalls=--strace-log-size=1024
--watchdog-action=Panic --panic-signal=-1 --profile=false --net-raw=true
--num-network-channels=1 --rootless=false --alsologtostderr=false
--ref-leak-mode=disabled --gso=true --software-gso=true
--overlayfs-stale-read=false --shared-volume= --debug-log-fd=3
--panic-signal=15 boot --bundle=/home/ziqian.lzq/bundle/bash/runsc
--controller-fd=4 --mounts-fd=5 --spec-fd=6 --start-sync-fd=7 --io-fds=8
--device-fd=9 --stdio-fds=10 --stdio-fds=11 --stdio-fds=12 --pidns=true
--setup-root --cpu-num 32 --total-memory 4294967296 rootbash]

Note stdioFDs starts from 10 with kvm platform and stderr's FD is 12.

If I restore a container from the checkpoint image which is derived
by checkpointing the above rootbash container, but either omit the
platform switch or specify to use ptrace platform explicitely:
$ sudo runsc --debug=true --debug-log=1.txt restore --image-path=some_path restored_rootbash

the donating FD and corresponding cmdline for runsc-sandbox is:

D0313 17:50:15.258632   44452 x:0] Donating FD 3: "1.txt"
D0313 17:50:15.258640   44452 x:0] Donating FD 4: "control_server_socket"
D0313 17:50:15.258645   44452 x:0] Donating FD 5: "|0"
D0313 17:50:15.258648   44452 x:0] Donating FD 6: "/home/ziqian.lzq/bundle/bash/runsc/config.json"
D0313 17:50:15.258653   44452 x:0] Donating FD 7: "|1"
D0313 17:50:15.258657   44452 x:0] Donating FD 8: "sandbox IO FD"
D0313 17:50:15.258661   44452 x:0] Donating FD 9: "/dev/stdin"
D0313 17:50:15.258675   44452 x:0] Donating FD 10: "/dev/stdout"
D0313 17:50:15.258680   44452 x:0] Donating FD 11: "/dev/stderr"
D0313 17:50:15.258684   44452 x:0] Starting sandbox: /proc/self/exe
[runsc-sandbox --root=/run/containerd/runsc/default --debug=true --log=
--max-threads=256 --reclaim-period=5 --log-format=text --debug-log=1.txt
--debug-log-format=text --file-access=exclusive --overlay=false
--fsgofer-host-uds=false --network=sandbox --log-packets=false
--platform=ptrace --strace=false --strace-syscalls=
--strace-log-size=1024 --watchdog-action=Panic --panic-signal=-1
--profile=false --net-raw=true --num-network-channels=1 --rootless=false
--alsologtostderr=false --ref-leak-mode=disabled --gso=true
--software-gso=true --overlayfs-stale-read=false --shared-volume=
--debug-log-fd=3 --panic-signal=15 boot
--bundle=/home/ziqian.lzq/bundle/bash/runsc --controller-fd=4
--mounts-fd=5 --spec-fd=6 --start-sync-fd=7 --io-fds=8 --stdio-fds=9
--stdio-fds=10 --stdio-fds=11 --setup-root --cpu-num 32 --total-memory
4294967296 restored_rootbash]

Note this time, stdioFDs starts from 9 and stderr's FD is 11(so the
saved host.descritor.origFD which is 12 for stderr is no longer valid).

For the three host FD based files, The s.Dev and s.Ino derived from
fstat(fd) shall all be the same and since the two fields are used
as device.MultiDeviceKey, the host.inodeFileState.sattr.InodeId which is
the value of MultiDevice.Map(MultiDeviceKey), shall also all be the same.
Note that for MultiDevice m, m.cache records the mapping of key to value
and m.rcache records the mapping of value to key. If same value doesn't
map to the same key, it will panic on restore.

Now that stderr's origFD 12 is no longer valid(it happens to be
/memfd:runsc-memory in my test on restore), the s.Dev and s.Ino derived
from fstat(fd=12) in host.inodeFileState.afterLoad() will neither be
correct. But its InodeID is still the same as saved, MultiDevice.Load()
will complain about the same value(InodeID) being mapped to different
keys (different from stdin and stdout's) and panic with: "MultiDevice's
caches are inconsistent".

Solve this problem by making sure stdioFDs for root container's init
task are always the same on initial start and on restore time, no matter
what cmdline user has used: debug log specified or not, platform changed
or not etc. shall not affect the ability to restore.

Fixes #1844.
---
 runsc/boot/loader.go | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index e7ca98134..1ed46bdb9 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -175,6 +175,9 @@ type Args struct {
 	UserLogFD int
 }
 
+// make sure stdioFDs are always the same on initial start and on restore
+const startingStdioFD = 64
+
 // New initializes a new kernel loader configured by spec.
 // New also handles setting up a kernel for restoring a container.
 func New(args Args) (*Loader, error) {
@@ -319,6 +322,21 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("creating pod mount hints: %v", err)
 	}
 
+	var stdioFDs []int
+	newfd := startingStdioFD
+	for _, fd := range args.StdioFDs {
+		err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC)
+		if err != nil {
+			return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+		}
+		stdioFDs = append(stdioFDs, newfd)
+		err = syscall.Close(fd)
+		if err != nil {
+			return nil, fmt.Errorf("close original stdioFDs failed: %v", err)
+		}
+		newfd++
+	}
+
 	eid := execID{cid: args.ID}
 	l := &Loader{
 		k:            k,
@@ -327,7 +345,7 @@ func New(args Args) (*Loader, error) {
 		watchdog:     dog,
 		spec:         args.Spec,
 		goferFDs:     args.GoferFDs,
-		stdioFDs:     args.StdioFDs,
+		stdioFDs:     stdioFDs,
 		rootProcArgs: procArgs,
 		sandboxID:    args.ID,
 		processes:    map[execID]*execProcess{eid: {}},
@@ -569,6 +587,16 @@ func (l *Loader) run() error {
 		}
 	})
 
+	// l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again
+	// either in createFDTable() during initial start or in descriptor.initAfterLoad()
+	// during restore, we can release l.stdioFDs now.
+	for _, fd := range l.stdioFDs {
+		err := syscall.Close(fd)
+		if err != nil {
+			return fmt.Errorf("close dup()ed stdioFDs: %v", err)
+		}
+	}
+
 	log.Infof("Process should have started...")
 	l.watchdog.Start()
 	return l.k.Start()
-- 
cgit v1.2.3


From 8ce5b569714351f9f2f7fc48b0ff0bebbdb018ee Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 20 Mar 2020 08:45:07 +0000
Subject: Cleanup for syscall tests on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I8008c0375fc7e23225a21026f359e78e691729e5
---
 test/syscalls/linux/getrandom.cc |  2 ++
 test/syscalls/linux/lseek.cc     |  2 +-
 test/syscalls/linux/mlock.cc     |  4 +++-
 test/syscalls/linux/mmap.cc      | 10 ++++++++--
 4 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/test/syscalls/linux/getrandom.cc b/test/syscalls/linux/getrandom.cc
index f97f60029..f87cdd7a1 100644
--- a/test/syscalls/linux/getrandom.cc
+++ b/test/syscalls/linux/getrandom.cc
@@ -29,6 +29,8 @@ namespace {
 #define SYS_getrandom 318
 #elif defined(__i386__)
 #define SYS_getrandom 355
+#elif defined(__aarch64__)
+#define SYS_getrandom 278
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/syscalls/linux/lseek.cc b/test/syscalls/linux/lseek.cc
index a8af8e545..6ce1e6cc3 100644
--- a/test/syscalls/linux/lseek.cc
+++ b/test/syscalls/linux/lseek.cc
@@ -53,7 +53,7 @@ TEST(LseekTest, NegativeOffset) {
 // A 32-bit off_t is not large enough to represent an offset larger than
 // maximum file size on standard file systems, so it isn't possible to cause
 // overflow.
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__aarch64__)
 TEST(LseekTest, Overflow) {
   // HA! Classic Linux. We really should have an EOVERFLOW
   // here, since we're seeking to something that cannot be
diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc
index 367a90fe1..78ac96bed 100644
--- a/test/syscalls/linux/mlock.cc
+++ b/test/syscalls/linux/mlock.cc
@@ -199,8 +199,10 @@ TEST(MunlockallTest, Basic) {
 }
 
 #ifndef SYS_mlock2
-#ifdef __x86_64__
+#if defined(__x86_64__)
 #define SYS_mlock2 325
+#elif defined(__aarch64__)
+#define SYS_mlock2 284
 #endif
 #endif
 
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index 11fb1b457..6d3227ab6 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -361,7 +361,7 @@ TEST_F(MMapTest, MapFixed) {
 }
 
 // 64-bit addresses work too
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__aarch64__)
 TEST_F(MMapTest, MapFixed64) {
   EXPECT_THAT(Map(0x300000000000, kPageSize, PROT_NONE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0),
@@ -571,6 +571,12 @@ const uint8_t machine_code[] = {
     0xb8, 0x2a, 0x00, 0x00, 0x00,  // movl $42, %eax
     0xc3,                          // retq
 };
+#elif defined(__aarch64__)
+const uint8_t machine_code[] = {
+    0x40, 0x05, 0x80, 0x52,  // mov w0, #42
+    0xc0, 0x03, 0x5f, 0xd6,  // ret
+};
+#endif
 
 // PROT_EXEC allows code execution
 TEST_F(MMapTest, ProtExec) {
@@ -605,7 +611,6 @@ TEST_F(MMapTest, NoProtExecDeath) {
 
   EXPECT_EXIT(func(), ::testing::KilledBySignal(SIGSEGV), "");
 }
-#endif
 
 TEST_F(MMapTest, NoExceedLimitData) {
   void* prevbrk;
@@ -1644,6 +1649,7 @@ TEST(MMapNoFixtureTest, MapReadOnlyAfterCreateWriteOnly) {
 }
 
 // Conditional on MAP_32BIT.
+// This flag is supported only on x86-64, for 64-bit programs.
 #ifdef __x86_64__
 
 TEST(MMapNoFixtureTest, Map32Bit) {
-- 
cgit v1.2.3


From 9de982ea790ffe56eca07b6535e9420b669b7c0c Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Tue, 31 Mar 2020 15:00:25 -0700
Subject: Allow passing root file type to tmpfs.

PiperOrigin-RevId: 304053357
---
 pkg/sentry/fsimpl/testutil/testutil.go |  3 +++
 pkg/sentry/fsimpl/tmpfs/tmpfs.go       | 32 ++++++++++++++++++++++++++++++--
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index e16808c63..0556af877 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -162,6 +162,9 @@ func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector {
 // exactly the specified set of expected entries. AssertAllDirentTypes respects
 // collector.skipDots, and implicitly checks for "." and ".." accordingly.
 func (s *System) AssertAllDirentTypes(collector *DirentCollector, expected map[string]DirentType) {
+	if expected == nil {
+		expected = make(map[string]DirentType)
+	}
 	// Also implicitly check for "." and "..", if enabled.
 	if !collector.skipDots {
 		expected["."] = linux.DT_DIR
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index b07b0dbae..afd9f8533 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -68,6 +68,17 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// FilesystemOpts is used to pass configuration data to tmpfs.
+type FilesystemOpts struct {
+	// RootFileType is the FileType of the filesystem root. Valid values
+	// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
+	RootFileType uint16
+
+	// RootSymlinkTarget is the target of the root symlink. Only valid if
+	// RootFileType == S_IFLNK.
+	RootSymlinkTarget string
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx)
@@ -79,9 +90,26 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		memFile: memFileProvider.MemoryFile(),
 		clock:   clock,
 	}
+
 	fs.vfsfs.Init(vfsObj, &fstype, &fs)
-	root := fs.newDentry(fs.newDirectory(creds, 01777))
-	return &fs.vfsfs, &root.vfsd, nil
+
+	typ := uint16(linux.S_IFDIR)
+	tmpfsOpts, ok := opts.InternalData.(FilesystemOpts)
+	if ok && tmpfsOpts.RootFileType != 0 {
+		typ = tmpfsOpts.RootFileType
+	}
+	var root *inode
+	switch typ {
+	case linux.S_IFREG:
+		root = fs.newRegularFile(creds, 0777)
+	case linux.S_IFLNK:
+		root = fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget)
+	case linux.S_IFDIR:
+		root = fs.newDirectory(creds, 01777)
+	default:
+		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", typ)
+	}
+	return &fs.vfsfs, &fs.newDentry(root).vfsd, nil
 }
 
 // Release implements vfs.FilesystemImpl.Release.
-- 
cgit v1.2.3


From e1c8eaca8f8413b17dab8f01b2e123e9d4b9ddbc Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Tue, 31 Mar 2020 15:00:26 -0700
Subject: Fix /proc/self/mounts and /proc/self/mountinfo in VFS2.

Some extra fields were added to the Mount type to expose necessary data to the
proc filesystem.

PiperOrigin-RevId: 304053361
---
 pkg/sentry/fsimpl/proc/task_files.go | 183 +++++----------------------------
 pkg/sentry/vfs/mount.go              | 192 ++++++++++++++++++++++++++++++++++-
 2 files changed, 219 insertions(+), 156 deletions(-)

diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 8c743df8d..df0d1bcc5 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -18,13 +18,10 @@ import (
 	"bytes"
 	"fmt"
 	"io"
-	"sort"
-	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -634,51 +631,6 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) {
 	return
 }
 
-// forEachMountSource runs f for the process root mount and each mount that is
-// a descendant of the root.
-func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) {
-	var fsctx *kernel.FSContext
-	t.WithMuLocked(func(t *kernel.Task) {
-		fsctx = t.FSContext()
-	})
-	if fsctx == nil {
-		// The task has been destroyed. Nothing to show here.
-		return
-	}
-
-	// All mount points must be relative to the rootDir, and mounts outside
-	// will be excluded.
-	rootDir := fsctx.RootDirectory()
-	if rootDir == nil {
-		// The task has been destroyed. Nothing to show here.
-		return
-	}
-	defer rootDir.DecRef()
-
-	mnt := t.MountNamespace().FindMount(rootDir)
-	if mnt == nil {
-		// Has it just been unmounted?
-		return
-	}
-	ms := t.MountNamespace().AllMountsUnder(mnt)
-	sort.Slice(ms, func(i, j int) bool {
-		return ms[i].ID < ms[j].ID
-	})
-	for _, m := range ms {
-		mroot := m.Root()
-		if mroot == nil {
-			continue // No longer valid.
-		}
-		mountPath, desc := mroot.FullName(rootDir)
-		mroot.DecRef()
-		if !desc {
-			// MountSources that are not descendants of the chroot jail are ignored.
-			continue
-		}
-		fn(mountPath, m)
-	}
-}
-
 // mountInfoData is used to implement /proc/[pid]/mountinfo.
 //
 // +stateify savable
@@ -692,92 +644,22 @@ var _ dynamicInode = (*mountInfoData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	forEachMount(i.task, func(mountPath string, m *fs.Mount) {
-		mroot := m.Root()
-		if mroot == nil {
-			return // No longer valid.
-		}
-		defer mroot.DecRef()
-
-		// Format:
-		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
-		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
-
-		// (1) MountSource ID.
-		fmt.Fprintf(buf, "%d ", m.ID)
-
-		// (2)  Parent ID (or this ID if there is no parent).
-		pID := m.ID
-		if !m.IsRoot() && !m.IsUndo() {
-			pID = m.ParentID
-		}
-		fmt.Fprintf(buf, "%d ", pID)
-
-		// (3) Major:Minor device ID. We don't have a superblock, so we
-		// just use the root inode device number.
-		sa := mroot.Inode.StableAttr
-		fmt.Fprintf(buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor)
-
-		// (4) Root: the pathname of the directory in the filesystem
-		// which forms the root of this mount.
-		//
-		// NOTE(b/78135857): This will always be "/" until we implement
-		// bind mounts.
-		fmt.Fprintf(buf, "/ ")
-
-		// (5) Mount point (relative to process root).
-		fmt.Fprintf(buf, "%s ", mountPath)
-
-		// (6) Mount options.
-		flags := mroot.Inode.MountSource.Flags
-		opts := "rw"
-		if flags.ReadOnly {
-			opts = "ro"
-		}
-		if flags.NoAtime {
-			opts += ",noatime"
-		}
-		if flags.NoExec {
-			opts += ",noexec"
-		}
-		fmt.Fprintf(buf, "%s ", opts)
-
-		// (7) Optional fields: zero or more fields of the form "tag[:value]".
-		// (8) Separator: the end of the optional fields is marked by a single hyphen.
-		fmt.Fprintf(buf, "- ")
-
-		// (9) Filesystem type.
-		fmt.Fprintf(buf, "%s ", mroot.Inode.MountSource.FilesystemType)
-
-		// (10) Mount source: filesystem-specific information or "none".
-		fmt.Fprintf(buf, "none ")
-
-		// (11) Superblock options, and final newline.
-		fmt.Fprintf(buf, "%s\n", superBlockOpts(mountPath, mroot.Inode.MountSource))
+	var fsctx *kernel.FSContext
+	i.task.WithMuLocked(func(t *kernel.Task) {
+		fsctx = t.FSContext()
 	})
-	return nil
-}
-
-func superBlockOpts(mountPath string, msrc *fs.MountSource) string {
-	// gVisor doesn't (yet) have a concept of super block options, so we
-	// use the ro/rw bit from the mount flag.
-	opts := "rw"
-	if msrc.Flags.ReadOnly {
-		opts = "ro"
+	if fsctx == nil {
+		// The task has been destroyed. Nothing to show here.
+		return nil
 	}
-
-	// NOTE(b/147673608): If the mount is a cgroup, we also need to include
-	// the cgroup name in the options. For now we just read that from the
-	// path.
-	// TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we
-	// should get this value from the cgroup itself, and not rely on the
-	// path.
-	if msrc.FilesystemType == "cgroup" {
-		splitPath := strings.Split(mountPath, "/")
-		cgroupType := splitPath[len(splitPath)-1]
-		opts += "," + cgroupType
+	rootDir := fsctx.RootDirectoryVFS2()
+	if !rootDir.Ok() {
+		// Root has been destroyed. Don't try to read mounts.
+		return nil
 	}
-	return opts
+	defer rootDir.DecRef()
+	i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf)
+	return nil
 }
 
 // mountsData is used to implement /proc/[pid]/mounts.
@@ -789,33 +671,24 @@ type mountsData struct {
 	task *kernel.Task
 }
 
-var _ dynamicInode = (*mountInfoData)(nil)
+var _ dynamicInode = (*mountsData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	forEachMount(i.task, func(mountPath string, m *fs.Mount) {
-		// Format:
-		// <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
-		//
-		// We use the filesystem name as the first field, since there
-		// is no real block device we can point to, and we also should
-		// not expose anything about the remote filesystem.
-		//
-		// Only ro/rw option is supported for now.
-		//
-		// The "needs dump"and fsck flags are always 0, which is allowed.
-		root := m.Root()
-		if root == nil {
-			return // No longer valid.
-		}
-		defer root.DecRef()
-
-		flags := root.Inode.MountSource.Flags
-		opts := "rw"
-		if flags.ReadOnly {
-			opts = "ro"
-		}
-		fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", mountPath, root.Inode.MountSource.FilesystemType, opts, 0, 0)
+	var fsctx *kernel.FSContext
+	i.task.WithMuLocked(func(t *kernel.Task) {
+		fsctx = t.FSContext()
 	})
+	if fsctx == nil {
+		// The task has been destroyed. Nothing to show here.
+		return nil
+	}
+	rootDir := fsctx.RootDirectoryVFS2()
+	if !rootDir.Ok() {
+		// Root has been destroyed. Don't try to read mounts.
+		return nil
+	}
+	defer rootDir.DecRef()
+	i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf)
 	return nil
 }
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 4b68cabda..7792eb1a0 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -15,7 +15,11 @@
 package vfs
 
 import (
+	"bytes"
+	"fmt"
 	"math"
+	"sort"
+	"strings"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -44,7 +48,7 @@ var lastMountID uint64
 //
 // +stateify savable
 type Mount struct {
-	// vfs, fs, and root are immutable. References are held on fs and root.
+	// vfs, fs, root are immutable. References are held on fs and root.
 	//
 	// Invariant: root belongs to fs.
 	vfs  *VirtualFilesystem
@@ -639,12 +643,28 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error {
 	return nil
 }
 
+func (mnt *Mount) readOnly() bool {
+	return atomic.LoadInt64(&mnt.writers) < 0
+}
+
 // Filesystem returns the mounted Filesystem. It does not take a reference on
 // the returned Filesystem.
 func (mnt *Mount) Filesystem() *Filesystem {
 	return mnt.fs
 }
 
+// submountsLocked returns this Mount and all Mounts that are descendents of
+// it.
+//
+// Precondition: mnt.vfs.mountMu must be held.
+func (mnt *Mount) submountsLocked() []*Mount {
+	mounts := []*Mount{mnt}
+	for m := range mnt.children {
+		mounts = append(mounts, m.submountsLocked()...)
+	}
+	return mounts
+}
+
 // Root returns mntns' root. A reference is taken on the returned
 // VirtualDentry.
 func (mntns *MountNamespace) Root() VirtualDentry {
@@ -655,3 +675,173 @@ func (mntns *MountNamespace) Root() VirtualDentry {
 	vd.IncRef()
 	return vd
 }
+
+// GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+	vfs.mountMu.Lock()
+	defer vfs.mountMu.Unlock()
+	rootMnt := taskRootDir.mount
+	mounts := rootMnt.submountsLocked()
+	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+	for _, mnt := range mounts {
+		// Get the path to this mount relative to task root.
+		mntRootVD := VirtualDentry{
+			mount:  mnt,
+			dentry: mnt.root,
+		}
+		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+		if err != nil {
+			// For some reason we didn't get a path. Log a warning
+			// and run with empty path.
+			ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+			path = ""
+		}
+		if path == "" {
+			// Either an error occurred, or path is not reachable
+			// from root.
+			break
+		}
+
+		opts := "rw"
+		if mnt.readOnly() {
+			opts = "ro"
+		}
+		if mnt.flags.NoExec {
+			opts += ",noexec"
+		}
+
+		// Format:
+		// <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
+		//
+		// The "needs dump" and "fsck order" flags are always 0, which
+		// is allowed.
+		fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0)
+	}
+}
+
+// GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to
+// buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+	vfs.mountMu.Lock()
+	defer vfs.mountMu.Unlock()
+	rootMnt := taskRootDir.mount
+	mounts := rootMnt.submountsLocked()
+	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+	for _, mnt := range mounts {
+		// Get the path to this mount relative to task root.
+		mntRootVD := VirtualDentry{
+			mount:  mnt,
+			dentry: mnt.root,
+		}
+		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+		if err != nil {
+			// For some reason we didn't get a path. Log a warning
+			// and run with empty path.
+			ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+			path = ""
+		}
+		if path == "" {
+			// Either an error occurred, or path is not reachable
+			// from root.
+			break
+		}
+		// Stat the mount root to get the major/minor device numbers.
+		pop := &PathOperation{
+			Root:  mntRootVD,
+			Start: mntRootVD,
+		}
+		statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{})
+		if err != nil {
+			// Well that's not good. Ignore this mount.
+			break
+		}
+
+		// Format:
+		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
+
+		// (1) Mount ID.
+		fmt.Fprintf(buf, "%d ", mnt.ID)
+
+		// (2)  Parent ID (or this ID if there is no parent).
+		pID := mnt.ID
+		if p := mnt.parent(); p != nil {
+			pID = p.ID
+		}
+		fmt.Fprintf(buf, "%d ", pID)
+
+		// (3) Major:Minor device ID. We don't have a superblock, so we
+		// just use the root inode device number.
+		fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor)
+
+		// (4) Root: the pathname of the directory in the filesystem
+		// which forms the root of this mount.
+		//
+		// NOTE(b/78135857): This will always be "/" until we implement
+		// bind mounts.
+		fmt.Fprintf(buf, "/ ")
+
+		// (5) Mount point (relative to process root).
+		fmt.Fprintf(buf, "%s ", manglePath(path))
+
+		// (6) Mount options.
+		opts := "rw"
+		if mnt.readOnly() {
+			opts = "ro"
+		}
+		if mnt.flags.NoExec {
+			opts += ",noexec"
+		}
+		// TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is
+		// set.
+		fmt.Fprintf(buf, "%s ", opts)
+
+		// (7) Optional fields: zero or more fields of the form "tag[:value]".
+		// (8) Separator: the end of the optional fields is marked by a single hyphen.
+		fmt.Fprintf(buf, "- ")
+
+		// (9) Filesystem type.
+		fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name())
+
+		// (10) Mount source: filesystem-specific information or "none".
+		fmt.Fprintf(buf, "none ")
+
+		// (11) Superblock options, and final newline.
+		fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt))
+	}
+}
+
+// manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents.
+// See Linux fs/seq_file.c:mangle_path.
+func manglePath(p string) string {
+	r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
+	return r.Replace(p)
+}
+
+// superBlockOpts returns the super block options string for the the mount at
+// the given path.
+func superBlockOpts(mountPath string, mnt *Mount) string {
+	// gVisor doesn't (yet) have a concept of super block options, so we
+	// use the ro/rw bit from the mount flag.
+	opts := "rw"
+	if mnt.readOnly() {
+		opts = "ro"
+	}
+
+	// NOTE(b/147673608): If the mount is a cgroup, we also need to include
+	// the cgroup name in the options. For now we just read that from the
+	// path.
+	// TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we
+	// should get this value from the cgroup itself, and not rely on the
+	// path.
+	if mnt.fs.FilesystemType().Name() == "cgroup" {
+		splitPath := strings.Split(mountPath, "/")
+		cgroupType := splitPath[len(splitPath)-1]
+		opts += "," + cgroupType
+	}
+	return opts
+}
-- 
cgit v1.2.3


From b6639f77e59d885cb092c15a7a0c5a988e149b40 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Tue, 31 Mar 2020 15:00:30 -0700
Subject: Include original copyUp error in panic if cleanupUpper fails.

When copyUp fails, we attempt to clean up the upper filesystem by removing any
files that have already been copied-up. If the cleanup fails, we panic because
the "overlay filesystem is in an inconsistent state".

This CL adds the original copy-up error to the panic information, to hopefully
make it easier to track down how the overlay filesystem got into the
inconsistent state.

PiperOrigin-RevId: 304053370
---
 pkg/sentry/fs/copy_up.go       | 28 +++++++++++++++-------------
 pkg/sentry/fs/inode_overlay.go |  3 ++-
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index b060a12ff..ab1424c95 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -222,8 +222,8 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
 		}
 		childUpper, err := parentUpper.Lookup(ctx, next.name)
 		if err != nil {
-			log.Warningf("copy up failed to lookup directory: %v", err)
-			cleanupUpper(ctx, parentUpper, next.name)
+			werr := fmt.Errorf("copy up failed to lookup directory: %v", err)
+			cleanupUpper(ctx, parentUpper, next.name, werr)
 			return syserror.EIO
 		}
 		defer childUpper.DecRef()
@@ -242,8 +242,8 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
 		}
 		childUpper, err := parentUpper.Lookup(ctx, next.name)
 		if err != nil {
-			log.Warningf("copy up failed to lookup symlink: %v", err)
-			cleanupUpper(ctx, parentUpper, next.name)
+			werr := fmt.Errorf("copy up failed to lookup symlink: %v", err)
+			cleanupUpper(ctx, parentUpper, next.name, werr)
 			return syserror.EIO
 		}
 		defer childUpper.DecRef()
@@ -256,23 +256,23 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
 	// Bring file attributes up to date. This does not include size, which will be
 	// brought up to date with copyContentsLocked.
 	if err := copyAttributesLocked(ctx, childUpperInode, next.Inode.overlay.lower); err != nil {
-		log.Warningf("copy up failed to copy up attributes: %v", err)
-		cleanupUpper(ctx, parentUpper, next.name)
+		werr := fmt.Errorf("copy up failed to copy up attributes: %v", err)
+		cleanupUpper(ctx, parentUpper, next.name, werr)
 		return syserror.EIO
 	}
 
 	// Copy the entire file.
 	if err := copyContentsLocked(ctx, childUpperInode, next.Inode.overlay.lower, attrs.Size); err != nil {
-		log.Warningf("copy up failed to copy up contents: %v", err)
-		cleanupUpper(ctx, parentUpper, next.name)
+		werr := fmt.Errorf("copy up failed to copy up contents: %v", err)
+		cleanupUpper(ctx, parentUpper, next.name, werr)
 		return syserror.EIO
 	}
 
 	lowerMappable := next.Inode.overlay.lower.Mappable()
 	upperMappable := childUpperInode.Mappable()
 	if lowerMappable != nil && upperMappable == nil {
-		log.Warningf("copy up failed: cannot ensure memory mapping coherence")
-		cleanupUpper(ctx, parentUpper, next.name)
+		werr := fmt.Errorf("copy up failed: cannot ensure memory mapping coherence")
+		cleanupUpper(ctx, parentUpper, next.name, werr)
 		return syserror.EIO
 	}
 
@@ -324,12 +324,14 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
 	return nil
 }
 
-// cleanupUpper removes name from parent, and panics if it is unsuccessful.
-func cleanupUpper(ctx context.Context, parent *Inode, name string) {
+// cleanupUpper is called when copy-up fails. It logs the copy-up error and
+// attempts to remove name from parent. If that fails, then it panics.
+func cleanupUpper(ctx context.Context, parent *Inode, name string, copyUpErr error) {
+	log.Warningf(copyUpErr.Error())
 	if err := parent.InodeOperations.Remove(ctx, parent, name); err != nil {
 		// Unfortunately we don't have much choice. We shouldn't
 		// willingly give the caller access to a nonsense filesystem.
-		panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: failed to remove %q from upper filesystem: %v", name, err))
+		panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: copyUp got error: %v; then cleanup failed to remove %q from upper filesystem: %v.", copyUpErr, name, err))
 	}
 }
 
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index 5ada33a32..537c8d257 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -231,7 +231,8 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st
 	upperFile.Dirent.Inode.IncRef()
 	entry, err := newOverlayEntry(ctx, upperFile.Dirent.Inode, nil, false)
 	if err != nil {
-		cleanupUpper(ctx, o.upper, name)
+		werr := fmt.Errorf("newOverlayEntry failed: %v", err)
+		cleanupUpper(ctx, o.upper, name, werr)
 		return nil, err
 	}
 
-- 
cgit v1.2.3


From 57e67e32b59a30365a79f6dceb3e0cb772407029 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Tue, 31 Mar 2020 16:16:45 -0700
Subject: Debug script issues on Kokoro.

PiperOrigin-RevId: 304068950
---
 scripts/benchmark.sh | 2 ++
 1 file changed, 2 insertions(+)
 mode change 100644 => 100755 scripts/benchmark.sh

diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh
old mode 100644
new mode 100755
index 334684675..3fd80fc2e
--- a/scripts/benchmark.sh
+++ b/scripts/benchmark.sh
@@ -20,6 +20,8 @@ source $(dirname $0)/common.sh
 # variable for authentication.
 export GOOGLE_APPLICATION_CREDENTIALS="${KOKORO_KEYSTORE_DIR}/${GCLOUD_CREDENTIALS}"
 
+which gcloud
+
 gcloud auth activate-service-account \
    --key-file "${GOOGLE_APPLICATION_CREDENTIALS}"
 
-- 
cgit v1.2.3


From c71e97784cfc57a0664a07cb798aca3d39d6bb11 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 28 Feb 2020 09:14:57 +0000
Subject: Enable rseq syscall test on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: If30154a2d73e98f211cfe589853b232019b9e130
---
 test/syscalls/linux/rseq/BUILD            | 41 ++++++++++---------
 test/syscalls/linux/rseq/critical.S       | 66 -------------------------------
 test/syscalls/linux/rseq/critical_amd64.S | 66 +++++++++++++++++++++++++++++++
 test/syscalls/linux/rseq/critical_arm64.S | 66 +++++++++++++++++++++++++++++++
 test/syscalls/linux/rseq/start.S          | 45 ---------------------
 test/syscalls/linux/rseq/start_amd64.S    | 45 +++++++++++++++++++++
 test/syscalls/linux/rseq/start_arm64.S    | 45 +++++++++++++++++++++
 test/syscalls/linux/rseq/syscalls.h       |  5 ++-
 test/syscalls/linux/rseq/uapi.h           |  4 +-
 9 files changed, 251 insertions(+), 132 deletions(-)
 delete mode 100644 test/syscalls/linux/rseq/critical.S
 create mode 100644 test/syscalls/linux/rseq/critical_amd64.S
 create mode 100644 test/syscalls/linux/rseq/critical_arm64.S
 delete mode 100644 test/syscalls/linux/rseq/start.S
 create mode 100644 test/syscalls/linux/rseq/start_amd64.S
 create mode 100644 test/syscalls/linux/rseq/start_arm64.S

diff --git a/test/syscalls/linux/rseq/BUILD b/test/syscalls/linux/rseq/BUILD
index ed488dbc2..ee5b0a11b 100644
--- a/test/syscalls/linux/rseq/BUILD
+++ b/test/syscalls/linux/rseq/BUILD
@@ -1,7 +1,7 @@
 # This package contains a standalone rseq test binary. This binary must not
 # depend on libc, which might use rseq itself.
 
-load("//tools:defs.bzl", "cc_flags_supplier", "cc_library", "cc_toolchain")
+load("//tools:defs.bzl", "cc_flags_supplier", "cc_library", "cc_toolchain", "select_arch")
 
 package(licenses = ["notice"])
 
@@ -9,32 +9,35 @@ genrule(
     name = "rseq_binary",
     srcs = [
         "critical.h",
-        "critical.S",
+        "critical_amd64.S",
+        "critical_arm64.S",
         "rseq.cc",
         "syscalls.h",
-        "start.S",
+        "start_amd64.S",
+        "start_arm64.S",
         "test.h",
         "types.h",
         "uapi.h",
     ],
     outs = ["rseq"],
-    cmd = " ".join([
-        "$(CC)",
-        "$(CC_FLAGS) ",
-        "-I.",
-        "-Wall",
-        "-Werror",
-        "-O2",
-        "-std=c++17",
-        "-static",
-        "-nostdlib",
-        "-ffreestanding",
-        "-o",
-        "$(location rseq)",
-        "$(location critical.S)",
+    cmd = "$(CC) " +
+        "$(CC_FLAGS) " +
+        "-I. " +
+        "-Wall " +
+        "-Werror " +
+        "-O2 " +
+        "-std=c++17 " +
+        "-static " +
+        "-nostdlib " +
+        "-ffreestanding " +
+        "-o " +
+        "$(location rseq) " +
+        select_arch(
+            amd64 = "$(location critical_amd64.S) $(location start_amd64.S) ",
+            arm64 = "$(location critical_arm64.S) $(location start_arm64.S) ",
+	    no_match_error = "unsupported architecture",
+        ) +
         "$(location rseq.cc)",
-        "$(location start.S)",
-    ]),
     toolchains = [
         cc_toolchain,
         ":no_pie_cc_flags",
diff --git a/test/syscalls/linux/rseq/critical.S b/test/syscalls/linux/rseq/critical.S
deleted file mode 100644
index 8c0687e6d..000000000
--- a/test/syscalls/linux/rseq/critical.S
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Restartable sequences critical sections.
-
-// Loops continuously until aborted.
-//
-// void rseq_loop(struct rseq* r, struct rseq_cs* cs)
-
-  .text
-  .globl  rseq_loop
-  .type   rseq_loop, @function
-
-rseq_loop:
-  jmp begin
-
-  // Abort block before the critical section.
-  // Abort signature is 4 nops for simplicity.
-  .byte 0x90, 0x90, 0x90, 0x90
-  .globl  rseq_loop_early_abort
-rseq_loop_early_abort:
-  ret
-
-begin:
-  // r->rseq_cs = cs
-  movq %rsi, 8(%rdi)
-
-  // N.B. rseq_cs will be cleared by any preempt, even outside the critical
-  // section. Thus it must be set in or immediately before the critical section
-  // to ensure it is not cleared before the section begins.
-  .globl  rseq_loop_start
-rseq_loop_start:
-  jmp rseq_loop_start
-
-  // "Pre-commit": extra instructions inside the critical section.  These are
-  // used as the abort point in TestAbortPreCommit, which is not valid.
-  .globl  rseq_loop_pre_commit
-rseq_loop_pre_commit:
-  // Extra abort signature + nop for TestAbortPostCommit.
-  .byte 0x90, 0x90, 0x90, 0x90
-  nop
-
-  // "Post-commit": never reached in this case.
-  .globl  rseq_loop_post_commit
-rseq_loop_post_commit:
-
-  // Abort signature is 4 nops for simplicity.
-  .byte 0x90, 0x90, 0x90, 0x90
-
-  .globl  rseq_loop_abort
-rseq_loop_abort:
-  ret
-
-  .size  rseq_loop,.-rseq_loop
-  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/critical_amd64.S b/test/syscalls/linux/rseq/critical_amd64.S
new file mode 100644
index 000000000..8c0687e6d
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical_amd64.S
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Restartable sequences critical sections.
+
+// Loops continuously until aborted.
+//
+// void rseq_loop(struct rseq* r, struct rseq_cs* cs)
+
+  .text
+  .globl  rseq_loop
+  .type   rseq_loop, @function
+
+rseq_loop:
+  jmp begin
+
+  // Abort block before the critical section.
+  // Abort signature is 4 nops for simplicity.
+  .byte 0x90, 0x90, 0x90, 0x90
+  .globl  rseq_loop_early_abort
+rseq_loop_early_abort:
+  ret
+
+begin:
+  // r->rseq_cs = cs
+  movq %rsi, 8(%rdi)
+
+  // N.B. rseq_cs will be cleared by any preempt, even outside the critical
+  // section. Thus it must be set in or immediately before the critical section
+  // to ensure it is not cleared before the section begins.
+  .globl  rseq_loop_start
+rseq_loop_start:
+  jmp rseq_loop_start
+
+  // "Pre-commit": extra instructions inside the critical section.  These are
+  // used as the abort point in TestAbortPreCommit, which is not valid.
+  .globl  rseq_loop_pre_commit
+rseq_loop_pre_commit:
+  // Extra abort signature + nop for TestAbortPostCommit.
+  .byte 0x90, 0x90, 0x90, 0x90
+  nop
+
+  // "Post-commit": never reached in this case.
+  .globl  rseq_loop_post_commit
+rseq_loop_post_commit:
+
+  // Abort signature is 4 nops for simplicity.
+  .byte 0x90, 0x90, 0x90, 0x90
+
+  .globl  rseq_loop_abort
+rseq_loop_abort:
+  ret
+
+  .size  rseq_loop,.-rseq_loop
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/critical_arm64.S b/test/syscalls/linux/rseq/critical_arm64.S
new file mode 100644
index 000000000..bfe7e8307
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical_arm64.S
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Restartable sequences critical sections.
+
+// Loops continuously until aborted.
+//
+// void rseq_loop(struct rseq* r, struct rseq_cs* cs)
+
+  .text
+  .globl  rseq_loop
+  .type   rseq_loop, @function
+
+rseq_loop:
+  b begin
+
+  // Abort block before the critical section.
+  // Abort signature.
+  .byte 0x90, 0x90, 0x90, 0x90
+  .globl  rseq_loop_early_abort
+rseq_loop_early_abort:
+  ret
+
+begin:
+  // r->rseq_cs = cs
+  str x1, [x0, #8]
+
+  // N.B. rseq_cs will be cleared by any preempt, even outside the critical
+  // section. Thus it must be set in or immediately before the critical section
+  // to ensure it is not cleared before the section begins.
+  .globl  rseq_loop_start
+rseq_loop_start:
+  b rseq_loop_start
+
+  // "Pre-commit": extra instructions inside the critical section.  These are
+  // used as the abort point in TestAbortPreCommit, which is not valid.
+  .globl  rseq_loop_pre_commit
+rseq_loop_pre_commit:
+  // Extra abort signature + nop for TestAbortPostCommit.
+  .byte 0x90, 0x90, 0x90, 0x90
+  nop
+
+  // "Post-commit": never reached in this case.
+  .globl  rseq_loop_post_commit
+rseq_loop_post_commit:
+
+  // Abort signature.
+  .byte 0x90, 0x90, 0x90, 0x90
+
+  .globl  rseq_loop_abort
+rseq_loop_abort:
+  ret
+
+  .size  rseq_loop,.-rseq_loop
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/start.S b/test/syscalls/linux/rseq/start.S
deleted file mode 100644
index b9611b276..000000000
--- a/test/syscalls/linux/rseq/start.S
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-  .text
-  .align 4
-  .type  _start,@function
-  .globl  _start
-
-_start:
-  movq  %rsp,%rdi
-  call  __init
-  hlt
-
-  .size  _start,.-_start
-  .section  .note.GNU-stack,"",@progbits
-
-  .text
-  .globl  raw_syscall
-  .type   raw_syscall, @function
-
-raw_syscall:
-  mov  %rdi,%rax      // syscall #
-  mov  %rsi,%rdi      // arg0
-  mov  %rdx,%rsi      // arg1
-  mov  %rcx,%rdx      // arg2
-  mov  %r8,%r10       // arg3 (goes in r10 instead of rcx for system calls)
-  mov  %r9,%r8        // arg4
-  mov  0x8(%rsp),%r9  // arg5
-  syscall
-  ret
-
-  .size  raw_syscall,.-raw_syscall
-  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/start_amd64.S b/test/syscalls/linux/rseq/start_amd64.S
new file mode 100644
index 000000000..b9611b276
--- /dev/null
+++ b/test/syscalls/linux/rseq/start_amd64.S
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+  .text
+  .align 4
+  .type  _start,@function
+  .globl  _start
+
+_start:
+  movq  %rsp,%rdi
+  call  __init
+  hlt
+
+  .size  _start,.-_start
+  .section  .note.GNU-stack,"",@progbits
+
+  .text
+  .globl  raw_syscall
+  .type   raw_syscall, @function
+
+raw_syscall:
+  mov  %rdi,%rax      // syscall #
+  mov  %rsi,%rdi      // arg0
+  mov  %rdx,%rsi      // arg1
+  mov  %rcx,%rdx      // arg2
+  mov  %r8,%r10       // arg3 (goes in r10 instead of rcx for system calls)
+  mov  %r9,%r8        // arg4
+  mov  0x8(%rsp),%r9  // arg5
+  syscall
+  ret
+
+  .size  raw_syscall,.-raw_syscall
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/start_arm64.S b/test/syscalls/linux/rseq/start_arm64.S
new file mode 100644
index 000000000..693c1c6eb
--- /dev/null
+++ b/test/syscalls/linux/rseq/start_arm64.S
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+  .text
+  .align 4
+  .type  _start,@function
+  .globl  _start
+
+_start:
+  mov  x29, sp
+  bl   __init
+  wfi
+
+  .size  _start,.-_start
+  .section  .note.GNU-stack,"",@progbits
+
+  .text
+  .globl  raw_syscall
+  .type   raw_syscall, @function
+
+raw_syscall:
+  mov  x8,x0   // syscall #
+  mov  x0,x1   // arg0
+  mov  x1,x2   // arg1
+  mov  x2,x3   // arg2
+  mov  x3,x4   // arg3
+  mov  x4,x5   // arg4
+  mov  x5,x6   // arg5
+  svc  #0
+  ret
+
+  .size  raw_syscall,.-raw_syscall
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/syscalls.h b/test/syscalls/linux/rseq/syscalls.h
index e5299c188..c4118e6c5 100644
--- a/test/syscalls/linux/rseq/syscalls.h
+++ b/test/syscalls/linux/rseq/syscalls.h
@@ -17,10 +17,13 @@
 
 #include "test/syscalls/linux/rseq/types.h"
 
-#ifdef __x86_64__
 // Syscall numbers.
+#if defined(__x86_64__)
 constexpr int kGetpid = 39;
 constexpr int kExitGroup = 231;
+#elif defined(__aarch64__)
+constexpr int kGetpid = 172;
+constexpr int kExitGroup = 94;
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/syscalls/linux/rseq/uapi.h b/test/syscalls/linux/rseq/uapi.h
index ca1d67691..d3e60d0a4 100644
--- a/test/syscalls/linux/rseq/uapi.h
+++ b/test/syscalls/linux/rseq/uapi.h
@@ -19,9 +19,11 @@
 
 // User-kernel ABI for restartable sequences.
 
-#ifdef __x86_64__
 // Syscall numbers.
+#if defined(__x86_64__)
 constexpr int kRseqSyscall = 334;
+#elif defined(__aarch64__)
+constexpr int kRseqSyscall = 293;
 #else
 #error "Unknown architecture"
 #endif  // __x86_64__
-- 
cgit v1.2.3


From 639d94f9f71b43e86320a6e9157c932f5d7936a7 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 31 Mar 2020 19:15:55 -0700
Subject: Add socket filesystem and global disconnected socket mount for VFS2.

A socket mount where anonymous sockets will reside is added to the
VirtualFilesystem. Socketfs is built on top of kernfs.

Updates #1476, #1478, #1484, #1485.

PiperOrigin-RevId: 304095251
---
 pkg/sentry/fsimpl/sockfs/BUILD     | 16 +++++++++
 pkg/sentry/fsimpl/sockfs/sockfs.go | 73 ++++++++++++++++++++++++++++++++++++++
 pkg/sentry/kernel/BUILD            |  1 +
 pkg/sentry/kernel/kernel.go        | 24 +++++++++++++
 test/syscalls/linux/socket_unix.cc |  2 ++
 5 files changed, 116 insertions(+)
 create mode 100644 pkg/sentry/fsimpl/sockfs/BUILD
 create mode 100644 pkg/sentry/fsimpl/sockfs/sockfs.go

diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD
new file mode 100644
index 000000000..790d50e65
--- /dev/null
+++ b/pkg/sentry/fsimpl/sockfs/BUILD
@@ -0,0 +1,16 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "sockfs",
+    srcs = ["sockfs.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/context",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
new file mode 100644
index 000000000..c13511de2
--- /dev/null
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -0,0 +1,73 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sockfs provides a filesystem implementation for anonymous sockets.
+package sockfs
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// NewFilesystem creates a new sockfs filesystem.
+//
+// Note that there should only ever be one instance of sockfs.Filesystem,
+// backing a global socket mount.
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem {
+	fs, _, err := filesystemType{}.GetFilesystem(nil, vfsObj, nil, "", vfs.GetFilesystemOptions{})
+	if err != nil {
+		panic("failed to create sockfs filesystem")
+	}
+	return fs
+}
+
+// filesystemType implements vfs.FilesystemType.
+type filesystemType struct{}
+
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	fs := &filesystem{}
+	fs.Init(vfsObj, fsType)
+	return fs.VFSFilesystem(), nil, nil
+}
+
+// Name implements FilesystemType.Name.
+//
+// Note that registering sockfs is unnecessary, except for the fact that it
+// will not show up under /proc/filesystems as a result. This is a very minor
+// discrepancy from Linux.
+func (filesystemType) Name() string {
+	return "sockfs"
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	kernfs.Filesystem
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	return nil, syserror.ENXIO
+}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index beba29a09..bb7e3cbc3 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -169,6 +169,7 @@ go_library(
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fs/timerfd",
         "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fsimpl/sockfs",
         "//pkg/sentry/hostcpu",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 6feda8fa1..0a448b57c 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -50,6 +50,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
 	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -225,6 +226,11 @@ type Kernel struct {
 	// by extMu.
 	nextSocketEntry uint64
 
+	// socketMount is a disconnected vfs.Mount, not included in k.vfs,
+	// representing a sockfs.filesystem. socketMount is used to back
+	// VirtualDentries representing anonymous sockets.
+	socketMount *vfs.Mount
+
 	// deviceRegistry is used to save/restore device.SimpleDevices.
 	deviceRegistry struct{} `state:".(*device.Registry)"`
 
@@ -348,6 +354,19 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
 	k.futexes = futex.NewManager()
 	k.netlinkPorts = port.New()
+	if VFS2Enabled {
+		if err := k.vfs.Init(); err != nil {
+			return fmt.Errorf("failed to initialize VFS: %v", err)
+		}
+		fs := sockfs.NewFilesystem(&k.vfs)
+		// NewDisconnectedMount will take an additional reference on fs.
+		defer fs.DecRef()
+		sm, err := k.vfs.NewDisconnectedMount(fs, nil, &vfs.MountOptions{})
+		if err != nil {
+			return fmt.Errorf("failed to initialize socket mount: %v", err)
+		}
+		k.socketMount = sm
+	}
 	return nil
 }
 
@@ -1452,6 +1471,11 @@ func (k *Kernel) ListSockets() []*SocketEntry {
 	return socks
 }
 
+// SocketMount returns the global socket mount.
+func (k *Kernel) SocketMount() *vfs.Mount {
+	return k.socketMount
+}
+
 // supervisorContext is a privileged context.
 type supervisorContext struct {
 	context.NoopSleeper
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc
index 4cf1f76f1..8bf663e8b 100644
--- a/test/syscalls/linux/socket_unix.cc
+++ b/test/syscalls/linux/socket_unix.cc
@@ -257,6 +257,8 @@ TEST_P(UnixSocketPairTest, ShutdownWrite) {
 
 TEST_P(UnixSocketPairTest, SocketReopenFromProcfs) {
   // TODO(b/122310852): We should be returning ENXIO and NOT EIO.
+  // TODO(github.dev/issue/1624): This should be resolved in VFS2. Verify
+  // that this is the case and delete the SKIP_IF once we delete VFS1.
   SKIP_IF(IsRunningOnGvisor());
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
-- 
cgit v1.2.3


From d25036ad17a3ada7fa6ce9900f20e246e07acd2f Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Tue, 31 Mar 2020 19:51:52 -0700
Subject: Test receiving multicast packets over UDP

PiperOrigin-RevId: 304098611
---
 pkg/tcpip/header/udp.go                            |   5 +
 test/packetimpact/dut/posix_server.cc              |  11 ++
 test/packetimpact/proto/posix_server.proto         |  30 ++-
 test/packetimpact/testbench/BUILD                  |   1 +
 test/packetimpact/testbench/connections.go         | 216 ++++++++++++++++++---
 test/packetimpact/testbench/dut.go                 |  44 ++++-
 test/packetimpact/testbench/layers.go              | 156 ++++++++++++---
 test/packetimpact/tests/BUILD                      |  13 ++
 test/packetimpact/tests/Dockerfile                 |  14 +-
 test/packetimpact/tests/defs.bzl                   |  18 +-
 test/packetimpact/tests/fin_wait2_timeout_test.go  |   2 +-
 test/packetimpact/tests/test_runner.sh             |  24 ++-
 test/packetimpact/tests/udp_recv_multicast_test.go |  37 ++++
 13 files changed, 501 insertions(+), 70 deletions(-)
 create mode 100644 test/packetimpact/tests/udp_recv_multicast_test.go

diff --git a/pkg/tcpip/header/udp.go b/pkg/tcpip/header/udp.go
index 74412c894..9339d637f 100644
--- a/pkg/tcpip/header/udp.go
+++ b/pkg/tcpip/header/udp.go
@@ -99,6 +99,11 @@ func (b UDP) SetChecksum(checksum uint16) {
 	binary.BigEndian.PutUint16(b[udpChecksum:], checksum)
 }
 
+// SetLength sets the "length" field of the udp header.
+func (b UDP) SetLength(length uint16) {
+	binary.BigEndian.PutUint16(b[udpLength:], length)
+}
+
 // CalculateChecksum calculates the checksum of the udp packet, given the
 // checksum of the network-layer pseudo-header and the checksum of the payload.
 func (b UDP) CalculateChecksum(partialChecksum uint16) uint16 {
diff --git a/test/packetimpact/dut/posix_server.cc b/test/packetimpact/dut/posix_server.cc
index 2f10dda40..4a71c54c6 100644
--- a/test/packetimpact/dut/posix_server.cc
+++ b/test/packetimpact/dut/posix_server.cc
@@ -181,6 +181,17 @@ class PosixImpl final : public posix_server::Posix::Service {
     response->set_errno_(errno);
     return ::grpc::Status::OK;
   }
+
+  ::grpc::Status Recv(::grpc::ServerContext *context,
+                      const ::posix_server::RecvRequest *request,
+                      ::posix_server::RecvResponse *response) override {
+    std::vector<char> buf(request->len());
+    response->set_ret(
+        recv(request->sockfd(), buf.data(), buf.size(), request->flags()));
+    response->set_errno_(errno);
+    response->set_buf(buf.data(), response->ret());
+    return ::grpc::Status::OK;
+  }
 };
 
 // Parse command line options. Returns a pointer to the first argument beyond
diff --git a/test/packetimpact/proto/posix_server.proto b/test/packetimpact/proto/posix_server.proto
index 026876fc2..53ec49410 100644
--- a/test/packetimpact/proto/posix_server.proto
+++ b/test/packetimpact/proto/posix_server.proto
@@ -24,7 +24,7 @@ message SocketRequest {
 
 message SocketResponse {
   int32 fd = 1;
-  int32 errno_ = 2;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
 message SockaddrIn {
@@ -55,7 +55,7 @@ message BindRequest {
 
 message BindResponse {
   int32 ret = 1;
-  int32 errno_ = 2;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
 message GetSockNameRequest {
@@ -64,7 +64,7 @@ message GetSockNameRequest {
 
 message GetSockNameResponse {
   int32 ret = 1;
-  int32 errno_ = 2;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
   Sockaddr addr = 3;
 }
 
@@ -75,7 +75,7 @@ message ListenRequest {
 
 message ListenResponse {
   int32 ret = 1;
-  int32 errno_ = 2;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
 message AcceptRequest {
@@ -84,7 +84,7 @@ message AcceptRequest {
 
 message AcceptResponse {
   int32 fd = 1;
-  int32 errno_ = 2;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
   Sockaddr addr = 3;
 }
 
@@ -97,7 +97,7 @@ message SetSockOptRequest {
 
 message SetSockOptResponse {
   int32 ret = 1;
-  int32 errno_ = 2;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
 message Timeval {
@@ -114,7 +114,7 @@ message SetSockOptTimevalRequest {
 
 message SetSockOptTimevalResponse {
   int32 ret = 1;
-  int32 errno_ = 2;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
 message CloseRequest {
@@ -123,7 +123,19 @@ message CloseRequest {
 
 message CloseResponse {
   int32 ret = 1;
-  int32 errno_ = 2;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+}
+
+message RecvRequest {
+  int32 sockfd = 1;
+  int32 len = 2;
+  int32 flags = 3;
+}
+
+message RecvResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+  bytes buf = 3;
 }
 
 service Posix {
@@ -147,4 +159,6 @@ service Posix {
       returns (SetSockOptTimevalResponse);
   // Call close() on the DUT.
   rpc Close(CloseRequest) returns (CloseResponse);
+  // Call recv() on the DUT.
+  rpc Recv(RecvRequest) returns (RecvResponse);
 }
diff --git a/test/packetimpact/testbench/BUILD b/test/packetimpact/testbench/BUILD
index a34c81fcc..4a9d8efa6 100644
--- a/test/packetimpact/testbench/BUILD
+++ b/test/packetimpact/testbench/BUILD
@@ -16,6 +16,7 @@ go_library(
     ],
     deps = [
         "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
         "//pkg/tcpip/seqnum",
         "//pkg/usermem",
diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index b7aa63934..8d1f562ee 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -36,19 +36,6 @@ var remoteIPv4 = flag.String("remote_ipv4", "", "remote IPv4 address for test pa
 var localMAC = flag.String("local_mac", "", "local mac address for test packets")
 var remoteMAC = flag.String("remote_mac", "", "remote mac address for test packets")
 
-// TCPIPv4 maintains state about a TCP/IPv4 connection.
-type TCPIPv4 struct {
-	outgoing     Layers
-	incoming     Layers
-	LocalSeqNum  seqnum.Value
-	RemoteSeqNum seqnum.Value
-	SynAck       *TCP
-	sniffer      Sniffer
-	injector     Injector
-	portPickerFD int
-	t            *testing.T
-}
-
 // pickPort makes a new socket and returns the socket FD and port. The caller
 // must close the FD when done with the port if there is no error.
 func pickPort() (int, uint16, error) {
@@ -75,12 +62,25 @@ func pickPort() (int, uint16, error) {
 	return fd, uint16(newSockAddrInet4.Port), nil
 }
 
+// TCPIPv4 maintains state about a TCP/IPv4 connection.
+type TCPIPv4 struct {
+	outgoing     Layers
+	incoming     Layers
+	LocalSeqNum  seqnum.Value
+	RemoteSeqNum seqnum.Value
+	SynAck       *TCP
+	sniffer      Sniffer
+	injector     Injector
+	portPickerFD int
+	t            *testing.T
+}
+
 // tcpLayerIndex is the position of the TCP layer in the TCPIPv4 connection. It
 // is the third, after Ethernet and IPv4.
 const tcpLayerIndex int = 2
 
 // NewTCPIPv4 creates a new TCPIPv4 connection with reasonable defaults.
-func NewTCPIPv4(t *testing.T, dut DUT, outgoingTCP, incomingTCP TCP) TCPIPv4 {
+func NewTCPIPv4(t *testing.T, outgoingTCP, incomingTCP TCP) TCPIPv4 {
 	lMAC, err := tcpip.ParseMACAddress(*localMAC)
 	if err != nil {
 		t.Fatalf("can't parse localMAC %q: %s", *localMAC, err)
@@ -109,18 +109,16 @@ func NewTCPIPv4(t *testing.T, dut DUT, outgoingTCP, incomingTCP TCP) TCPIPv4 {
 	}
 
 	newOutgoingTCP := &TCP{
-		DataOffset: Uint8(header.TCPMinimumSize),
-		WindowSize: Uint16(32768),
-		SrcPort:    &localPort,
+		SrcPort: &localPort,
 	}
 	if err := newOutgoingTCP.merge(outgoingTCP); err != nil {
-		t.Fatalf("can't merge %v into %v: %s", outgoingTCP, newOutgoingTCP, err)
+		t.Fatalf("can't merge %+v into %+v: %s", outgoingTCP, newOutgoingTCP, err)
 	}
 	newIncomingTCP := &TCP{
 		DstPort: &localPort,
 	}
 	if err := newIncomingTCP.merge(incomingTCP); err != nil {
-		t.Fatalf("can't merge %v into %v: %s", incomingTCP, newIncomingTCP, err)
+		t.Fatalf("can't merge %+v into %+v: %s", incomingTCP, newIncomingTCP, err)
 	}
 	return TCPIPv4{
 		outgoing: Layers{
@@ -149,8 +147,9 @@ func (conn *TCPIPv4) Close() {
 	conn.portPickerFD = -1
 }
 
-// Send a packet with reasonable defaults and override some fields by tcp.
-func (conn *TCPIPv4) Send(tcp TCP, additionalLayers ...Layer) {
+// CreateFrame builds a frame for the connection with tcp overriding defaults
+// and additionalLayers added after the TCP header.
+func (conn *TCPIPv4) CreateFrame(tcp TCP, additionalLayers ...Layer) Layers {
 	if tcp.SeqNum == nil {
 		tcp.SeqNum = Uint32(uint32(conn.LocalSeqNum))
 	}
@@ -159,30 +158,41 @@ func (conn *TCPIPv4) Send(tcp TCP, additionalLayers ...Layer) {
 	}
 	layersToSend := deepcopy.Copy(conn.outgoing).(Layers)
 	if err := layersToSend[tcpLayerIndex].(*TCP).merge(tcp); err != nil {
-		conn.t.Fatalf("can't merge %v into %v: %s", tcp, layersToSend[tcpLayerIndex], err)
+		conn.t.Fatalf("can't merge %+v into %+v: %s", tcp, layersToSend[tcpLayerIndex], err)
 	}
 	layersToSend = append(layersToSend, additionalLayers...)
-	outBytes, err := layersToSend.toBytes()
+	return layersToSend
+}
+
+// SendFrame sends a frame with reasonable defaults.
+func (conn *TCPIPv4) SendFrame(frame Layers) {
+	outBytes, err := frame.toBytes()
 	if err != nil {
 		conn.t.Fatalf("can't build outgoing TCP packet: %s", err)
 	}
 	conn.injector.Send(outBytes)
 
 	// Compute the next TCP sequence number.
-	for i := tcpLayerIndex + 1; i < len(layersToSend); i++ {
-		conn.LocalSeqNum.UpdateForward(seqnum.Size(layersToSend[i].length()))
+	for i := tcpLayerIndex + 1; i < len(frame); i++ {
+		conn.LocalSeqNum.UpdateForward(seqnum.Size(frame[i].length()))
 	}
+	tcp := frame[tcpLayerIndex].(*TCP)
 	if tcp.Flags != nil && *tcp.Flags&(header.TCPFlagSyn|header.TCPFlagFin) != 0 {
 		conn.LocalSeqNum.UpdateForward(1)
 	}
 }
 
+// Send a packet with reasonable defaults and override some fields by tcp.
+func (conn *TCPIPv4) Send(tcp TCP, additionalLayers ...Layer) {
+	conn.SendFrame(conn.CreateFrame(tcp, additionalLayers...))
+}
+
 // Recv gets a packet from the sniffer within the timeout provided. If no packet
 // arrives before the timeout, it returns nil.
 func (conn *TCPIPv4) Recv(timeout time.Duration) *TCP {
 	deadline := time.Now().Add(timeout)
 	for {
-		timeout = deadline.Sub(time.Now())
+		timeout = time.Until(deadline)
 		if timeout <= 0 {
 			break
 		}
@@ -192,6 +202,7 @@ func (conn *TCPIPv4) Recv(timeout time.Duration) *TCP {
 		}
 		layers, err := ParseEther(b)
 		if err != nil {
+			conn.t.Logf("can't parse frame: %s", err)
 			continue // Ignore packets that can't be parsed.
 		}
 		if !conn.incoming.match(layers) {
@@ -215,7 +226,7 @@ func (conn *TCPIPv4) Recv(timeout time.Duration) *TCP {
 func (conn *TCPIPv4) Expect(tcp TCP, timeout time.Duration) *TCP {
 	deadline := time.Now().Add(timeout)
 	for {
-		timeout = deadline.Sub(time.Now())
+		timeout = time.Until(deadline)
 		if timeout <= 0 {
 			return nil
 		}
@@ -243,3 +254,154 @@ func (conn *TCPIPv4) Handshake() {
 	// Send an ACK.
 	conn.Send(TCP{Flags: Uint8(header.TCPFlagAck)})
 }
+
+// UDPIPv4 maintains state about a UDP/IPv4 connection.
+type UDPIPv4 struct {
+	outgoing     Layers
+	incoming     Layers
+	sniffer      Sniffer
+	injector     Injector
+	portPickerFD int
+	t            *testing.T
+}
+
+// udpLayerIndex is the position of the UDP layer in the UDPIPv4 connection. It
+// is the third, after Ethernet and IPv4.
+const udpLayerIndex int = 2
+
+// NewUDPIPv4 creates a new UDPIPv4 connection with reasonable defaults.
+func NewUDPIPv4(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv4 {
+	lMAC, err := tcpip.ParseMACAddress(*localMAC)
+	if err != nil {
+		t.Fatalf("can't parse localMAC %q: %s", *localMAC, err)
+	}
+
+	rMAC, err := tcpip.ParseMACAddress(*remoteMAC)
+	if err != nil {
+		t.Fatalf("can't parse remoteMAC %q: %s", *remoteMAC, err)
+	}
+
+	portPickerFD, localPort, err := pickPort()
+	if err != nil {
+		t.Fatalf("can't pick a port: %s", err)
+	}
+	lIP := tcpip.Address(net.ParseIP(*localIPv4).To4())
+	rIP := tcpip.Address(net.ParseIP(*remoteIPv4).To4())
+
+	sniffer, err := NewSniffer(t)
+	if err != nil {
+		t.Fatalf("can't make new sniffer: %s", err)
+	}
+
+	injector, err := NewInjector(t)
+	if err != nil {
+		t.Fatalf("can't make new injector: %s", err)
+	}
+
+	newOutgoingUDP := &UDP{
+		SrcPort: &localPort,
+	}
+	if err := newOutgoingUDP.merge(outgoingUDP); err != nil {
+		t.Fatalf("can't merge %+v into %+v: %s", outgoingUDP, newOutgoingUDP, err)
+	}
+	newIncomingUDP := &UDP{
+		DstPort: &localPort,
+	}
+	if err := newIncomingUDP.merge(incomingUDP); err != nil {
+		t.Fatalf("can't merge %+v into %+v: %s", incomingUDP, newIncomingUDP, err)
+	}
+	return UDPIPv4{
+		outgoing: Layers{
+			&Ether{SrcAddr: &lMAC, DstAddr: &rMAC},
+			&IPv4{SrcAddr: &lIP, DstAddr: &rIP},
+			newOutgoingUDP},
+		incoming: Layers{
+			&Ether{SrcAddr: &rMAC, DstAddr: &lMAC},
+			&IPv4{SrcAddr: &rIP, DstAddr: &lIP},
+			newIncomingUDP},
+		sniffer:      sniffer,
+		injector:     injector,
+		portPickerFD: portPickerFD,
+		t:            t,
+	}
+}
+
+// Close the injector and sniffer associated with this connection.
+func (conn *UDPIPv4) Close() {
+	conn.sniffer.Close()
+	conn.injector.Close()
+	if err := unix.Close(conn.portPickerFD); err != nil {
+		conn.t.Fatalf("can't close portPickerFD: %s", err)
+	}
+	conn.portPickerFD = -1
+}
+
+// CreateFrame builds a frame for the connection with the provided udp
+// overriding defaults and the additionalLayers added after the UDP header.
+func (conn *UDPIPv4) CreateFrame(udp UDP, additionalLayers ...Layer) Layers {
+	layersToSend := deepcopy.Copy(conn.outgoing).(Layers)
+	if err := layersToSend[udpLayerIndex].(*UDP).merge(udp); err != nil {
+		conn.t.Fatalf("can't merge %+v into %+v: %s", udp, layersToSend[udpLayerIndex], err)
+	}
+	layersToSend = append(layersToSend, additionalLayers...)
+	return layersToSend
+}
+
+// SendFrame sends a frame with reasonable defaults.
+func (conn *UDPIPv4) SendFrame(frame Layers) {
+	outBytes, err := frame.toBytes()
+	if err != nil {
+		conn.t.Fatalf("can't build outgoing UDP packet: %s", err)
+	}
+	conn.injector.Send(outBytes)
+}
+
+// Send a packet with reasonable defaults and override some fields by udp.
+func (conn *UDPIPv4) Send(udp UDP, additionalLayers ...Layer) {
+	conn.SendFrame(conn.CreateFrame(udp, additionalLayers...))
+}
+
+// Recv gets a packet from the sniffer within the timeout provided. If no packet
+// arrives before the timeout, it returns nil.
+func (conn *UDPIPv4) Recv(timeout time.Duration) *UDP {
+	deadline := time.Now().Add(timeout)
+	for {
+		timeout = time.Until(deadline)
+		if timeout <= 0 {
+			break
+		}
+		b := conn.sniffer.Recv(timeout)
+		if b == nil {
+			break
+		}
+		layers, err := ParseEther(b)
+		if err != nil {
+			conn.t.Logf("can't parse frame: %s", err)
+			continue // Ignore packets that can't be parsed.
+		}
+		if !conn.incoming.match(layers) {
+			continue // Ignore packets that don't match the expected incoming.
+		}
+		return (layers[udpLayerIndex]).(*UDP)
+	}
+	return nil
+}
+
+// Expect a packet that matches the provided udp within the timeout specified.
+// If it doesn't arrive in time, the test fails.
+func (conn *UDPIPv4) Expect(udp UDP, timeout time.Duration) *UDP {
+	deadline := time.Now().Add(timeout)
+	for {
+		timeout = time.Until(deadline)
+		if timeout <= 0 {
+			return nil
+		}
+		gotUDP := conn.Recv(timeout)
+		if gotUDP == nil {
+			return nil
+		}
+		if udp.match(gotUDP) {
+			return gotUDP
+		}
+	}
+}
diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go
index 8ea1706d3..f80dbb35f 100644
--- a/test/packetimpact/testbench/dut.go
+++ b/test/packetimpact/testbench/dut.go
@@ -305,6 +305,35 @@ func (dut *DUT) SetSockOptTimeval(sockfd, level, optname int32, tv *unix.Timeval
 	}
 }
 
+// RecvWithErrno calls recv on the DUT.
+func (dut *DUT) RecvWithErrno(ctx context.Context, sockfd, len, flags int32) (int32, []byte, error) {
+	dut.t.Helper()
+	req := pb.RecvRequest{
+		Sockfd: sockfd,
+		Len:    len,
+		Flags:  flags,
+	}
+	resp, err := dut.posixServer.Recv(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Recv: %s", err)
+	}
+	return resp.GetRet(), resp.GetBuf(), syscall.Errno(resp.GetErrno_())
+}
+
+// Recv calls recv on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is needed, use
+// RecvWithErrno.
+func (dut *DUT) Recv(sockfd, len, flags int32) []byte {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	ret, buf, err := dut.RecvWithErrno(ctx, sockfd, len, flags)
+	if ret == -1 {
+		dut.t.Fatalf("failed to recv: %s", err)
+	}
+	return buf
+}
+
 // CloseWithErrno calls close on the DUT.
 func (dut *DUT) CloseWithErrno(fd int32) (int32, error) {
 	dut.t.Helper()
@@ -330,10 +359,11 @@ func (dut *DUT) Close(fd int32) {
 	}
 }
 
-// CreateListener makes a new TCP connection.  If it fails, the test ends.
-func (dut *DUT) CreateListener(typ, proto, backlog int32) (int32, uint16) {
+// CreateBoundSocket makes a new socket on the DUT, with type typ and protocol
+// proto, and bound to the IP address addr. Returns the new file descriptor and
+// the port that was selected on the DUT.
+func (dut *DUT) CreateBoundSocket(typ, proto int32, addr net.IP) (int32, uint16) {
 	dut.t.Helper()
-	addr := net.ParseIP(*remoteIPv4)
 	var fd int32
 	if addr.To4() != nil {
 		fd = dut.Socket(unix.AF_INET, typ, proto)
@@ -358,6 +388,12 @@ func (dut *DUT) CreateListener(typ, proto, backlog int32) (int32, uint16) {
 	default:
 		dut.t.Fatalf("unknown sockaddr type from getsockname: %t", sa)
 	}
-	dut.Listen(fd, backlog)
 	return fd, uint16(port)
 }
+
+// CreateListener makes a new TCP connection. If it fails, the test ends.
+func (dut *DUT) CreateListener(typ, proto, backlog int32) (int32, uint16) {
+	fd, remotePort := dut.CreateBoundSocket(typ, proto, net.ParseIP(*remoteIPv4))
+	dut.Listen(fd, backlog)
+	return fd, remotePort
+}
diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go
index 35fa4dcb6..d7434c3d2 100644
--- a/test/packetimpact/testbench/layers.go
+++ b/test/packetimpact/testbench/layers.go
@@ -22,6 +22,7 @@ import (
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"github.com/imdario/mergo"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
@@ -97,7 +98,7 @@ func equalLayer(x, y Layer) bool {
 	return cmp.Equal(x, y, opt, cmpopts.IgnoreUnexported(LayerBase{}))
 }
 
-// Ether can construct and match the ethernet encapsulation.
+// Ether can construct and match an ethernet encapsulation.
 type Ether struct {
 	LayerBase
 	SrcAddr *tcpip.LinkAddress
@@ -161,7 +162,7 @@ func ParseEther(b []byte) (Layers, error) {
 		return append(layers, moreLayers...), nil
 	default:
 		// TODO(b/150301488): Support more protocols, like IPv6.
-		return nil, fmt.Errorf("can't deduce the ethernet header's next protocol: %v", b)
+		return nil, fmt.Errorf("can't deduce the ethernet header's next protocol: %#v", b)
 	}
 }
 
@@ -173,7 +174,7 @@ func (l *Ether) length() int {
 	return header.EthernetMinimumSize
 }
 
-// IPv4 can construct and match the ethernet excapulation.
+// IPv4 can construct and match an IPv4 encapsulation.
 type IPv4 struct {
 	LayerBase
 	IHL            *uint8
@@ -236,9 +237,11 @@ func (l *IPv4) toBytes() ([]byte, error) {
 		switch n := l.next().(type) {
 		case *TCP:
 			fields.Protocol = uint8(header.TCPProtocolNumber)
+		case *UDP:
+			fields.Protocol = uint8(header.UDPProtocolNumber)
 		default:
-			// TODO(b/150301488): Support more protocols, like UDP.
-			return nil, fmt.Errorf("can't deduce the ip header's next protocol: %+v", n)
+			// TODO(b/150301488): Support more protocols as needed.
+			return nil, fmt.Errorf("can't deduce the ip header's next protocol: %#v", n)
 		}
 	}
 	if l.SrcAddr != nil {
@@ -294,13 +297,19 @@ func ParseIPv4(b []byte) (Layers, error) {
 		DstAddr:        Address(h.DestinationAddress()),
 	}
 	layers := Layers{&ipv4}
-	switch h.Protocol() {
-	case uint8(header.TCPProtocolNumber):
+	switch h.TransportProtocol() {
+	case header.TCPProtocolNumber:
 		moreLayers, err := ParseTCP(b[ipv4.length():])
 		if err != nil {
 			return nil, err
 		}
 		return append(layers, moreLayers...), nil
+	case header.UDPProtocolNumber:
+		moreLayers, err := ParseUDP(b[ipv4.length():])
+		if err != nil {
+			return nil, err
+		}
+		return append(layers, moreLayers...), nil
 	}
 	return nil, fmt.Errorf("can't deduce the ethernet header's next protocol: %d", h.Protocol())
 }
@@ -316,7 +325,7 @@ func (l *IPv4) length() int {
 	return int(*l.IHL)
 }
 
-// TCP can construct and match the TCP excapulation.
+// TCP can construct and match a TCP encapsulation.
 type TCP struct {
 	LayerBase
 	SrcPort       *uint16
@@ -347,12 +356,16 @@ func (l *TCP) toBytes() ([]byte, error) {
 	}
 	if l.DataOffset != nil {
 		h.SetDataOffset(*l.DataOffset)
+	} else {
+		h.SetDataOffset(uint8(l.length()))
 	}
 	if l.Flags != nil {
 		h.SetFlags(*l.Flags)
 	}
 	if l.WindowSize != nil {
 		h.SetWindowSize(*l.WindowSize)
+	} else {
+		h.SetWindowSize(32768)
 	}
 	if l.UrgentPointer != nil {
 		h.SetUrgentPoiner(*l.UrgentPointer)
@@ -361,38 +374,52 @@ func (l *TCP) toBytes() ([]byte, error) {
 		h.SetChecksum(*l.Checksum)
 		return h, nil
 	}
-	if err := setChecksum(&h, l); err != nil {
+	if err := setTCPChecksum(&h, l); err != nil {
 		return nil, err
 	}
 	return h, nil
 }
 
-// setChecksum calculates the checksum of the TCP header and sets it in h.
-func setChecksum(h *header.TCP, tcp *TCP) error {
-	h.SetChecksum(0)
-	tcpLength := uint16(tcp.length())
-	current := tcp.next()
-	for current != nil {
-		tcpLength += uint16(current.length())
-		current = current.next()
+// totalLength returns the length of the provided layer and all following
+// layers.
+func totalLength(l Layer) int {
+	var totalLength int
+	for ; l != nil; l = l.next() {
+		totalLength += l.length()
 	}
+	return totalLength
+}
 
+// layerChecksum calculates the checksum of the Layer header, including the
+// peusdeochecksum of the layer before it and all the bytes after it..
+func layerChecksum(l Layer, protoNumber tcpip.TransportProtocolNumber) (uint16, error) {
+	totalLength := uint16(totalLength(l))
 	var xsum uint16
-	switch s := tcp.prev().(type) {
+	switch s := l.prev().(type) {
 	case *IPv4:
-		xsum = header.PseudoHeaderChecksum(header.TCPProtocolNumber, *s.SrcAddr, *s.DstAddr, tcpLength)
+		xsum = header.PseudoHeaderChecksum(protoNumber, *s.SrcAddr, *s.DstAddr, totalLength)
 	default:
 		// TODO(b/150301488): Support more protocols, like IPv6.
-		return fmt.Errorf("can't get src and dst addr from previous layer")
+		return 0, fmt.Errorf("can't get src and dst addr from previous layer: %#v", s)
 	}
-	current = tcp.next()
-	for current != nil {
+	var payloadBytes buffer.VectorisedView
+	for current := l.next(); current != nil; current = current.next() {
 		payload, err := current.toBytes()
 		if err != nil {
-			return fmt.Errorf("can't get bytes for next header: %s", payload)
+			return 0, fmt.Errorf("can't get bytes for next header: %s", payload)
 		}
-		xsum = header.Checksum(payload, xsum)
-		current = current.next()
+		payloadBytes.AppendView(payload)
+	}
+	xsum = header.ChecksumVV(payloadBytes, xsum)
+	return xsum, nil
+}
+
+// setTCPChecksum calculates the checksum of the TCP header and sets it in h.
+func setTCPChecksum(h *header.TCP, tcp *TCP) error {
+	h.SetChecksum(0)
+	xsum, err := layerChecksum(tcp, header.TCPProtocolNumber)
+	if err != nil {
+		return err
 	}
 	h.SetChecksum(^h.CalculateChecksum(xsum))
 	return nil
@@ -444,6 +471,85 @@ func (l *TCP) merge(other TCP) error {
 	return mergo.Merge(l, other, mergo.WithOverride)
 }
 
+// UDP can construct and match a UDP encapsulation.
+type UDP struct {
+	LayerBase
+	SrcPort  *uint16
+	DstPort  *uint16
+	Length   *uint16
+	Checksum *uint16
+}
+
+func (l *UDP) toBytes() ([]byte, error) {
+	b := make([]byte, header.UDPMinimumSize)
+	h := header.UDP(b)
+	if l.SrcPort != nil {
+		h.SetSourcePort(*l.SrcPort)
+	}
+	if l.DstPort != nil {
+		h.SetDestinationPort(*l.DstPort)
+	}
+	if l.Length != nil {
+		h.SetLength(*l.Length)
+	} else {
+		h.SetLength(uint16(totalLength(l)))
+	}
+	if l.Checksum != nil {
+		h.SetChecksum(*l.Checksum)
+		return h, nil
+	}
+	if err := setUDPChecksum(&h, l); err != nil {
+		return nil, err
+	}
+	return h, nil
+}
+
+// setUDPChecksum calculates the checksum of the UDP header and sets it in h.
+func setUDPChecksum(h *header.UDP, udp *UDP) error {
+	h.SetChecksum(0)
+	xsum, err := layerChecksum(udp, header.UDPProtocolNumber)
+	if err != nil {
+		return err
+	}
+	h.SetChecksum(^h.CalculateChecksum(xsum))
+	return nil
+}
+
+// ParseUDP parses the bytes assuming that they start with a udp header and
+// continues parsing further encapsulations.
+func ParseUDP(b []byte) (Layers, error) {
+	h := header.UDP(b)
+	udp := UDP{
+		SrcPort:  Uint16(h.SourcePort()),
+		DstPort:  Uint16(h.DestinationPort()),
+		Length:   Uint16(h.Length()),
+		Checksum: Uint16(h.Checksum()),
+	}
+	layers := Layers{&udp}
+	moreLayers, err := ParsePayload(b[udp.length():])
+	if err != nil {
+		return nil, err
+	}
+	return append(layers, moreLayers...), nil
+}
+
+func (l *UDP) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *UDP) length() int {
+	if l.Length == nil {
+		return header.UDPMinimumSize
+	}
+	return int(*l.Length)
+}
+
+// merge overrides the values in l with the values from other but only in fields
+// where the value is not nil.
+func (l *UDP) merge(other UDP) error {
+	return mergo.Merge(l, other, mergo.WithOverride)
+}
+
 // Payload has bytes beyond OSI layer 4.
 type Payload struct {
 	LayerBase
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 1dff2a4d5..9a4d66ea9 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -15,6 +15,19 @@ packetimpact_go_test(
     ],
 )
 
+packetimpact_go_test(
+    name = "udp_recv_multicast",
+    srcs = ["udp_recv_multicast_test.go"],
+    # TODO(b/152813495): Fix netstack then remove the line below.
+    netstack = False,
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
 sh_binary(
     name = "test_runner",
     srcs = ["test_runner.sh"],
diff --git a/test/packetimpact/tests/Dockerfile b/test/packetimpact/tests/Dockerfile
index 507030cc7..9075bc555 100644
--- a/test/packetimpact/tests/Dockerfile
+++ b/test/packetimpact/tests/Dockerfile
@@ -1,5 +1,17 @@
 FROM ubuntu:bionic
 
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y iptables netcat tcpdump iproute2 tshark
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        # iptables to disable OS native packet processing.
+        iptables \
+        # nc to check that the posix_server is running.
+        netcat \
+        # tcpdump to log brief packet sniffing.
+        tcpdump \
+        # ip link show to display MAC addresses.
+        iproute2 \
+        # tshark to log verbose packet sniffing.
+        tshark \
+        # killall for cleanup.
+        psmisc
 RUN hash -r
 CMD /bin/bash
diff --git a/test/packetimpact/tests/defs.bzl b/test/packetimpact/tests/defs.bzl
index 1b4213d9b..8c0d058b2 100644
--- a/test/packetimpact/tests/defs.bzl
+++ b/test/packetimpact/tests/defs.bzl
@@ -93,7 +93,17 @@ def packetimpact_netstack_test(name, testbench_binary, **kwargs):
         **kwargs
     )
 
-def packetimpact_go_test(name, size = "small", pure = True, **kwargs):
+def packetimpact_go_test(name, size = "small", pure = True, linux = True, netstack = True, **kwargs):
+    """Add packetimpact tests written in go.
+
+    Args:
+        name: name of the test
+        size: size of the test
+        pure: make a static go binary
+        linux: generate a linux test
+        netstack: generate a netstack test
+        **kwargs: all the other args, forwarded to go_test
+    """
     testbench_binary = name + "_test"
     go_test(
         name = testbench_binary,
@@ -102,5 +112,7 @@ def packetimpact_go_test(name, size = "small", pure = True, **kwargs):
         tags = PACKETIMPACT_TAGS,
         **kwargs
     )
-    packetimpact_linux_test(name = name, testbench_binary = testbench_binary)
-    packetimpact_netstack_test(name = name, testbench_binary = testbench_binary)
+    if linux:
+        packetimpact_linux_test(name = name, testbench_binary = testbench_binary)
+    if netstack:
+        packetimpact_netstack_test(name = name, testbench_binary = testbench_binary)
diff --git a/test/packetimpact/tests/fin_wait2_timeout_test.go b/test/packetimpact/tests/fin_wait2_timeout_test.go
index 5f54e67ed..2b3f39045 100644
--- a/test/packetimpact/tests/fin_wait2_timeout_test.go
+++ b/test/packetimpact/tests/fin_wait2_timeout_test.go
@@ -36,7 +36,7 @@ func TestFinWait2Timeout(t *testing.T) {
 			defer dut.TearDown()
 			listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
 			defer dut.Close(listenFd)
-			conn := tb.NewTCPIPv4(t, dut, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+			conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
 			defer conn.Close()
 			conn.Handshake()
 
diff --git a/test/packetimpact/tests/test_runner.sh b/test/packetimpact/tests/test_runner.sh
index 5281cb53d..e99fc7d09 100755
--- a/test/packetimpact/tests/test_runner.sh
+++ b/test/packetimpact/tests/test_runner.sh
@@ -29,13 +29,15 @@ function failure() {
 }
 trap 'failure ${LINENO} "$BASH_COMMAND"' ERR
 
-declare -r LONGOPTS="dut_platform:,posix_server_binary:,testbench_binary:,runtime:,tshark"
+declare -r LONGOPTS="dut_platform:,posix_server_binary:,testbench_binary:,runtime:,tshark,extra_test_arg:"
 
 # Don't use declare below so that the error from getopt will end the script.
 PARSED=$(getopt --options "" --longoptions=$LONGOPTS --name "$0" -- "$@")
 
 eval set -- "$PARSED"
 
+declare -a EXTRA_TEST_ARGS
+
 while true; do
   case "$1" in
     --dut_platform)
@@ -62,6 +64,10 @@ while true; do
       declare -r TSHARK="1"
       shift 1
       ;;
+    --extra_test_arg)
+      EXTRA_TEST_ARGS+="$2"
+      shift 2
+      ;;
     --)
       shift
       break
@@ -125,6 +131,19 @@ docker --version
 
 function finish {
   local cleanup_success=1
+
+  if [[ -z "${TSHARK-}" ]]; then
+    # Kill tcpdump so that it will flush output.
+    docker exec -t "${TESTBENCH}" \
+      killall tcpdump || \
+      cleanup_success=0
+  else
+    # Kill tshark so that it will flush output.
+    docker exec -t "${TESTBENCH}" \
+      killall tshark || \
+      cleanup_success=0
+  fi
+
   for net in "${CTRL_NET}" "${TEST_NET}"; do
     # Kill all processes attached to ${net}.
     for docker_command in "kill" "rm"; do
@@ -224,6 +243,8 @@ else
   # interface with the test packets.
   docker exec -t "${TESTBENCH}" \
     tshark -V -l -n -i "${TEST_DEVICE}" \
+    -o tcp.check_checksum:TRUE \
+    -o udp.check_checksum:TRUE \
     host "${TEST_NET_PREFIX}${TESTBENCH_NET_SUFFIX}" &
 fi
 
@@ -235,6 +256,7 @@ sleep 3
 # be executed on the DUT.
 docker exec -t "${TESTBENCH}" \
   /bin/bash -c "${DOCKER_TESTBENCH_BINARY} \
+  ${EXTRA_TEST_ARGS[@]-} \
   --posix_server_ip=${CTRL_NET_PREFIX}${DUT_NET_SUFFIX} \
   --posix_server_port=${CTRL_PORT} \
   --remote_ipv4=${TEST_NET_PREFIX}${DUT_NET_SUFFIX} \
diff --git a/test/packetimpact/tests/udp_recv_multicast_test.go b/test/packetimpact/tests/udp_recv_multicast_test.go
new file mode 100644
index 000000000..bc1b0be49
--- /dev/null
+++ b/test/packetimpact/tests/udp_recv_multicast_test.go
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package udp_recv_multicast_test
+
+import (
+	"net"
+	"testing"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func TestUDPRecvMulticast(t *testing.T) {
+	dut := tb.NewDUT(t)
+	defer dut.TearDown()
+	boundFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
+	defer dut.Close(boundFD)
+	conn := tb.NewUDPIPv4(t, tb.UDP{DstPort: &remotePort}, tb.UDP{SrcPort: &remotePort})
+	defer conn.Close()
+	frame := conn.CreateFrame(tb.UDP{}, &tb.Payload{Bytes: []byte("hello world")})
+	frame[1].(*tb.IPv4).DstAddr = tb.Address(tcpip.Address(net.ParseIP("224.0.0.1").To4()))
+	conn.SendFrame(frame)
+	dut.Recv(boundFD, 100, 0)
+}
-- 
cgit v1.2.3


From 5eb41c8fbabac090251fbfb43bd9c814124aa575 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Tue, 11 Feb 2020 02:55:51 -0500
Subject: Arm64 signal#2: signal support in arch module

SA_RESTORER is always used on Intel platform.
But this flag is optional on other platforms.

The vdso is enabled, so we can use the sigreturn trampolines
the vdso provides instead on Arm platform.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/arch/signal_arm64.go   | 30 ++++++++++++++++++++++++------
 pkg/sentry/kernel/task_signals.go | 13 +++++++++++++
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go
index 4f4cc46a8..b57d6a17d 100644
--- a/pkg/sentry/arch/signal_arm64.go
+++ b/pkg/sentry/arch/signal_arm64.go
@@ -83,9 +83,12 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
 	if ucSize < 0 {
 		panic("can't get size of UContext64")
 	}
-	// st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128.
-	frameSize := int(st.Arch.Width()) + ucSize + 128
-	frameBottom := (sp-usermem.Addr(frameSize)) & ^usermem.Addr(15) - 8
+
+	// frameSize = ucSize + sizeof(siginfo).
+	// sizeof(siginfo) == 128.
+	// R30 stores the restorer address.
+	frameSize := ucSize + 128
+	frameBottom := (sp - usermem.Addr(frameSize)) & ^usermem.Addr(15)
 	sp = frameBottom + usermem.Addr(frameSize)
 	st.Bottom = sp
 
@@ -115,12 +118,27 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
 	c.Regs.Regs[0] = uint64(info.Signo)
 	c.Regs.Regs[1] = uint64(infoAddr)
 	c.Regs.Regs[2] = uint64(ucAddr)
-
+	c.Regs.Regs[30] = uint64(act.Restorer)
 	return nil
 }
 
 // SignalRestore implements Context.SignalRestore.
-// Only used on intel.
 func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) {
-	return 0, SignalStack{}, nil
+	// Copy out the stack frame.
+	var uc UContext64
+	if _, err := st.Pop(&uc); err != nil {
+		return 0, SignalStack{}, err
+	}
+	var info SignalInfo
+	if _, err := st.Pop(&info); err != nil {
+		return 0, SignalStack{}, err
+	}
+
+	// Restore registers.
+	c.Regs.Regs = uc.MContext.Regs
+	c.Regs.Pc = uc.MContext.Pc
+	c.Regs.Sp = uc.MContext.Sp
+	c.Regs.Pstate = uc.MContext.Pstate
+
+	return uc.Sigset, uc.Stack, nil
 }
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 8802db142..0e74236c9 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -263,6 +263,19 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
 	if t.haveSavedSignalMask {
 		mask = t.savedSignalMask
 	}
+
+	// Set up the restorer.
+	// x86-64 should always uses SA_RESTORER, but this flag is optional on other platforms.
+	// Please see the linux code as reference:
+	// linux/arch/x86/kernel/signal.c:__setup_rt_frame()
+	// If SA_RESTORER is not configured, we can use the sigreturn trampolines
+	// the vdso provides instead.
+	// Please see the linux code as reference:
+	// linux/arch/arm64/kernel/signal.c:setup_return()
+	if act.Flags&linux.SA_RESTORER == 0 {
+		act.Restorer = t.MemoryManager().VDSOSigReturn()
+	}
+
 	if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil {
 		return err
 	}
-- 
cgit v1.2.3


From 840980aeba0b5224b13bcaadf5785ac5305a5230 Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Tue, 31 Mar 2020 22:54:50 -0700
Subject: Implement automated marshalling for slices of Marshallable types.

PiperOrigin-RevId: 304119255
---
 pkg/sentry/kernel/rseq.go                          |   2 +-
 pkg/sentry/syscalls/linux/sys_stat.go              |   6 +-
 pkg/sentry/syscalls/linux/vfs2/epoll.go            |   4 +-
 pkg/sentry/syscalls/linux/vfs2/poll.go             |  14 +-
 pkg/sentry/syscalls/linux/vfs2/setstat.go          |   2 +-
 pkg/sentry/syscalls/linux/vfs2/stat.go             |  23 +-
 tools/go_marshal/analysis/analysis_unsafe.go       |   4 +
 tools/go_marshal/defs.bzl                          |   3 +-
 tools/go_marshal/gomarshal/generator.go            | 130 ++++--
 tools/go_marshal/gomarshal/generator_interfaces.go |  62 +++
 .../generator_interfaces_array_newtype.go          |  84 +---
 .../generator_interfaces_primitive_newtype.go      | 173 ++++---
 .../gomarshal/generator_interfaces_struct.go       | 308 +++++++++---
 tools/go_marshal/gomarshal/generator_tests.go      |  52 ++-
 tools/go_marshal/gomarshal/util.go                 |  21 +-
 tools/go_marshal/marshal/marshal.go                | 103 ++++-
 tools/go_marshal/primitive/BUILD                   |  18 +
 tools/go_marshal/primitive/primitive.go            | 175 +++++++
 tools/go_marshal/test/BUILD                        |  14 +
 tools/go_marshal/test/benchmark_test.go            |  42 ++
 tools/go_marshal/test/external/external.go         |   8 +
 tools/go_marshal/test/marshal_test.go              | 515 +++++++++++++++++++++
 tools/go_marshal/test/test.go                      |  36 +-
 23 files changed, 1525 insertions(+), 274 deletions(-)
 create mode 100644 tools/go_marshal/primitive/BUILD
 create mode 100644 tools/go_marshal/primitive/primitive.go
 create mode 100644 tools/go_marshal/test/marshal_test.go

diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index ded95f532..18416643b 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -304,7 +304,7 @@ func (t *Task) rseqAddrInterrupt() {
 	}
 
 	var cs linux.RSeqCriticalSection
-	if err := cs.CopyIn(t, critAddr); err != nil {
+	if _, err := cs.CopyIn(t, critAddr); err != nil {
 		t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
 		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
 		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index a11a87cd1..46ebf27a2 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -115,7 +115,8 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err
 		return err
 	}
 	s := statFromAttrs(t, d.Inode.StableAttr, uattr)
-	return s.CopyOut(t, statAddr)
+	_, err = s.CopyOut(t, statAddr)
+	return err
 }
 
 // fstat implements fstat for the given *fs.File.
@@ -125,7 +126,8 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error {
 		return err
 	}
 	s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr)
-	return s.CopyOut(t, statAddr)
+	_, err = s.CopyOut(t, statAddr)
+	return err
 }
 
 // Statx implements linux syscall statx(2).
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
index d6cb0e79a..5a938cee2 100644
--- a/pkg/sentry/syscalls/linux/vfs2/epoll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -101,14 +101,14 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	var event linux.EpollEvent
 	switch op {
 	case linux.EPOLL_CTL_ADD:
-		if err := event.CopyIn(t, eventAddr); err != nil {
+		if _, err := event.CopyIn(t, eventAddr); err != nil {
 			return 0, nil, err
 		}
 		return 0, nil, ep.AddInterest(file, fd, event)
 	case linux.EPOLL_CTL_DEL:
 		return 0, nil, ep.DeleteInterest(file, fd)
 	case linux.EPOLL_CTL_MOD:
-		if err := event.CopyIn(t, eventAddr); err != nil {
+		if _, err := event.CopyIn(t, eventAddr); err != nil {
 			return 0, nil, err
 		}
 		return 0, nil, ep.ModifyInterest(file, fd, event)
diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go
index dbf4882da..ff1b25d7b 100644
--- a/pkg/sentry/syscalls/linux/vfs2/poll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/poll.go
@@ -374,7 +374,8 @@ func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.D
 	}
 	remaining := timeoutRemaining(t, startNs, timeout)
 	tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds())
-	return tsRemaining.CopyOut(t, timespecAddr)
+	_, err := tsRemaining.CopyOut(t, timespecAddr)
+	return err
 }
 
 // copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr.
@@ -386,7 +387,8 @@ func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Du
 	}
 	remaining := timeoutRemaining(t, startNs, timeout)
 	tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds())
-	return tvRemaining.CopyOut(t, timevalAddr)
+	_, err := tvRemaining.CopyOut(t, timevalAddr)
+	return err
 }
 
 // pollRestartBlock encapsulates the state required to restart poll(2) via
@@ -477,7 +479,7 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	timeout := time.Duration(-1)
 	if timevalAddr != 0 {
 		var timeval linux.Timeval
-		if err := timeval.CopyIn(t, timevalAddr); err != nil {
+		if _, err := timeval.CopyIn(t, timevalAddr); err != nil {
 			return 0, nil, err
 		}
 		if timeval.Sec < 0 || timeval.Usec < 0 {
@@ -519,7 +521,7 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 			panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width()))
 		}
 		var maskStruct sigSetWithSize
-		if err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil {
+		if _, err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil {
 			return 0, nil, err
 		}
 		if err := setTempSignalSet(t, usermem.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil {
@@ -554,7 +556,7 @@ func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.D
 	timeout := time.Duration(-1)
 	if timespecAddr != 0 {
 		var timespec linux.Timespec
-		if err := timespec.CopyIn(t, timespecAddr); err != nil {
+		if _, err := timespec.CopyIn(t, timespecAddr); err != nil {
 			return 0, err
 		}
 		if !timespec.Valid() {
@@ -573,7 +575,7 @@ func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) erro
 		return syserror.EINVAL
 	}
 	var mask linux.SignalSet
-	if err := mask.CopyIn(t, maskAddr); err != nil {
+	if _, err := mask.CopyIn(t, maskAddr); err != nil {
 		return err
 	}
 	mask &^= kernel.UnblockableSignals
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
index 136453ccc..4e61f1452 100644
--- a/pkg/sentry/syscalls/linux/vfs2/setstat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -226,7 +226,7 @@ func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
 	} else {
 		var times linux.Utime
-		if err := times.CopyIn(t, timesAddr); err != nil {
+		if _, err := times.CopyIn(t, timesAddr); err != nil {
 			return 0, nil, err
 		}
 		opts.Stat.Atime.Sec = times.Actime
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
index fdfe49243..bb1d5cac4 100644
--- a/pkg/sentry/syscalls/linux/vfs2/stat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -91,7 +91,8 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
 				}
 				var stat linux.Stat
 				convertStatxToUserStat(t, &statx, &stat)
-				return stat.CopyOut(t, statAddr)
+				_, err = stat.CopyOut(t, statAddr)
+				return err
 			}
 			start = dirfile.VirtualDentry()
 			start.IncRef()
@@ -111,7 +112,8 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
 	}
 	var stat linux.Stat
 	convertStatxToUserStat(t, &statx, &stat)
-	return stat.CopyOut(t, statAddr)
+	_, err = stat.CopyOut(t, statAddr)
+	return err
 }
 
 func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec {
@@ -140,7 +142,8 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 	var stat linux.Stat
 	convertStatxToUserStat(t, &statx, &stat)
-	return 0, nil, stat.CopyOut(t, statAddr)
+	_, err = stat.CopyOut(t, statAddr)
+	return 0, nil, err
 }
 
 // Statx implements Linux syscall statx(2).
@@ -199,7 +202,8 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 					return 0, nil, err
 				}
 				userifyStatx(t, &statx)
-				return 0, nil, statx.CopyOut(t, statxAddr)
+				_, err = statx.CopyOut(t, statxAddr)
+				return 0, nil, err
 			}
 			start = dirfile.VirtualDentry()
 			start.IncRef()
@@ -218,7 +222,8 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return 0, nil, err
 	}
 	userifyStatx(t, &statx)
-	return 0, nil, statx.CopyOut(t, statxAddr)
+	_, err = statx.CopyOut(t, statxAddr)
+	return 0, nil, err
 }
 
 func userifyStatx(t *kernel.Task, statx *linux.Statx) {
@@ -359,8 +364,8 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	if err != nil {
 		return 0, nil, err
 	}
-
-	return 0, nil, statfs.CopyOut(t, bufAddr)
+	_, err = statfs.CopyOut(t, bufAddr)
+	return 0, nil, err
 }
 
 // Fstatfs implements Linux syscall fstatfs(2).
@@ -378,6 +383,6 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	if err != nil {
 		return 0, nil, err
 	}
-
-	return 0, nil, statfs.CopyOut(t, bufAddr)
+	_, err = statfs.CopyOut(t, bufAddr)
+	return 0, nil, err
 }
diff --git a/tools/go_marshal/analysis/analysis_unsafe.go b/tools/go_marshal/analysis/analysis_unsafe.go
index 9a9a4f298..cd55cf5cb 100644
--- a/tools/go_marshal/analysis/analysis_unsafe.go
+++ b/tools/go_marshal/analysis/analysis_unsafe.go
@@ -161,6 +161,10 @@ func AlignmentCheck(t *testing.T, typ reflect.Type) (ok bool, delta uint64) {
 		if typ.NumField() > 0 && nextXOff != int(typ.Size()) {
 			implicitPad := int(typ.Size()) - nextXOff
 			f := typ.Field(typ.NumField() - 1) // Final field
+			if tag, ok := f.Tag.Lookup("marshal"); ok && tag == "unaligned" {
+				// Final field explicitly marked unaligned.
+				break
+			}
 			t.Fatalf("Suspect offset for field %s.%s at the end of %s, detected an implicit %d byte padding from offset %d to %d at the end of the struct; either add %d bytes of explict padding at end of the struct or tag the final field %s as `marshal:\"unaligned\"`.",
 				typ.Name(), f.Name, typ.Name(), implicitPad, nextXOff, typ.Size(), implicitPad, f.Name)
 		}
diff --git a/tools/go_marshal/defs.bzl b/tools/go_marshal/defs.bzl
index d79786a68..323e33882 100644
--- a/tools/go_marshal/defs.bzl
+++ b/tools/go_marshal/defs.bzl
@@ -53,9 +53,10 @@ go_marshal = rule(
 
 # marshal_deps are the dependencies requied by generated code.
 marshal_deps = [
-    "//tools/go_marshal/marshal",
+    "//pkg/gohacks",
     "//pkg/safecopy",
     "//pkg/usermem",
+    "//tools/go_marshal/marshal",
 ]
 
 # marshal_test_deps are required by test targets.
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 82983804c..935a36b25 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -28,12 +28,6 @@ import (
 	"gvisor.dev/gvisor/tools/tags"
 )
 
-const (
-	marshalImport  = "gvisor.dev/gvisor/tools/go_marshal/marshal"
-	safecopyImport = "gvisor.dev/gvisor/pkg/safecopy"
-	usermemImport  = "gvisor.dev/gvisor/pkg/usermem"
-)
-
 // List of identifiers we use in generated code that may conflict with a
 // similarly-named source identifier. Abort gracefully when we see these to
 // avoid potentially confusing compilation failures in generated code.
@@ -44,8 +38,8 @@ const (
 // All recievers are single letters, so we don't allow import aliases to be a
 // single letter.
 var badIdents = []string{
-	"addr", "blk", "buf", "dst", "dsts", "err", "hdr", "idx", "inner", "len",
-	"ptr", "src", "srcs", "task", "val",
+	"addr", "blk", "buf", "dst", "dsts", "count", "err", "hdr", "idx", "inner",
+	"length", "limit", "ptr", "size", "src", "srcs", "task", "val",
 	// All single-letter identifiers.
 }
 
@@ -110,9 +104,10 @@ func NewGenerator(srcs []string, out, outTest, pkg string, imports []string) (*G
 	g.imports.add("reflect")
 	g.imports.add("runtime")
 	g.imports.add("unsafe")
-	g.imports.add(marshalImport)
-	g.imports.add(safecopyImport)
-	g.imports.add(usermemImport)
+	g.imports.add("gvisor.dev/gvisor/pkg/gohacks")
+	g.imports.add("gvisor.dev/gvisor/pkg/safecopy")
+	g.imports.add("gvisor.dev/gvisor/pkg/usermem")
+	g.imports.add("gvisor.dev/gvisor/tools/go_marshal/marshal")
 
 	return &g, nil
 }
@@ -194,10 +189,73 @@ func (g *Generator) parse() ([]*ast.File, []*token.FileSet, error) {
 	return files, fsets, nil
 }
 
+// sliceAPI carries information about the '+marshal slice' directive.
+type sliceAPI struct {
+	// Comment node in the AST containing the +marshal tag.
+	comment *ast.Comment
+	// Identifier fragment to use when naming generated functions for the slice
+	// API.
+	ident string
+	// Whether the generated functions should reference the newtype name, or the
+	// inner type name. Only meaningful on newtype declarations on primitives.
+	inner bool
+}
+
+// marshallableType carries information about a type marked with the '+marshal'
+// directive.
+type marshallableType struct {
+	spec  *ast.TypeSpec
+	slice *sliceAPI
+}
+
+func newMarshallableType(fset *token.FileSet, tagLine *ast.Comment, spec *ast.TypeSpec) marshallableType {
+	mt := marshallableType{
+		spec:  spec,
+		slice: nil,
+	}
+
+	var unhandledTags []string
+
+	for _, tag := range strings.Fields(strings.TrimPrefix(tagLine.Text, "// +marshal")) {
+		if strings.HasPrefix(tag, "slice:") {
+			tokens := strings.Split(tag, ":")
+			if len(tokens) < 2 || len(tokens) > 3 {
+				abortAt(fset.Position(tagLine.Slash), fmt.Sprintf("+marshal directive has invalid 'slice' clause. Expecting format 'slice:<IDENTIFIER>[:inner]', got '%v'", tag))
+			}
+			if len(tokens[1]) == 0 {
+				abortAt(fset.Position(tagLine.Slash), "+marshal slice directive has empty identifier argument. Expecting '+marshal slice:identifier'")
+			}
+
+			sa := &sliceAPI{
+				comment: tagLine,
+				ident:   tokens[1],
+			}
+			mt.slice = sa
+
+			if len(tokens) == 3 {
+				if tokens[2] != "inner" {
+					abortAt(fset.Position(tagLine.Slash), "+marshal slice directive has an invalid argument. Expecting '+marshal slice:<IDENTIFIER>[:inner]'")
+				}
+				sa.inner = true
+			}
+
+			continue
+		}
+
+		unhandledTags = append(unhandledTags, tag)
+	}
+
+	if len(unhandledTags) > 0 {
+		abortAt(fset.Position(tagLine.Slash), fmt.Sprintf("+marshal directive contained the following unknown clauses: %v", strings.Join(unhandledTags, " ")))
+	}
+
+	return mt
+}
+
 // collectMarshallableTypes walks the parsed AST and collects a list of type
 // declarations for which we need to generate the Marshallable interface.
-func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) []*ast.TypeSpec {
-	var types []*ast.TypeSpec
+func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) []marshallableType {
+	var types []marshallableType
 	for _, decl := range a.Decls {
 		gdecl, ok := decl.(*ast.GenDecl)
 		// Type declaration?
@@ -212,9 +270,11 @@ func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) []*a
 		}
 		// Does the comment contain a "+marshal" line?
 		marked := false
+		var tagLine *ast.Comment
 		for _, c := range gdecl.Doc.List {
-			if c.Text == "// +marshal" {
+			if strings.HasPrefix(c.Text, "// +marshal") {
 				marked = true
+				tagLine = c
 				break
 			}
 		}
@@ -229,20 +289,17 @@ func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) []*a
 			switch t.Type.(type) {
 			case *ast.StructType:
 				debugfAt(f.Position(t.Pos()), "Collected marshallable struct %s.\n", t.Name.Name)
-				types = append(types, t)
-				continue
 			case *ast.Ident: // Newtype on primitive.
 				debugfAt(f.Position(t.Pos()), "Collected marshallable newtype on primitive %s.\n", t.Name.Name)
-				types = append(types, t)
-				continue
 			case *ast.ArrayType: // Newtype on array.
 				debugfAt(f.Position(t.Pos()), "Collected marshallable newtype on array %s.\n", t.Name.Name)
-				types = append(types, t)
-				continue
+			default:
+				// A user specifically requested marshalling on this type, but we
+				// don't support it.
+				abortAt(f.Position(t.Pos()), fmt.Sprintf("Marshalling codegen was requested on type '%s', but go-marshal doesn't support this kind of declaration.\n", t.Name))
 			}
-			// A user specifically requested marshalling on this type, but we
-			// don't support it.
-			abortAt(f.Position(t.Pos()), fmt.Sprintf("Marshalling codegen was requested on type '%s', but go-marshal doesn't support this kind of declaration.\n", t.Name))
+			types = append(types, newMarshallableType(f, tagLine, t))
+
 		}
 	}
 	return types
@@ -281,19 +338,28 @@ func (g *Generator) collectImports(a *ast.File, f *token.FileSet) map[string]imp
 
 }
 
-func (g *Generator) generateOne(t *ast.TypeSpec, fset *token.FileSet) *interfaceGenerator {
-	i := newInterfaceGenerator(t, fset)
-	switch ty := t.Type.(type) {
+func (g *Generator) generateOne(t marshallableType, fset *token.FileSet) *interfaceGenerator {
+	i := newInterfaceGenerator(t.spec, fset)
+	switch ty := t.spec.Type.(type) {
 	case *ast.StructType:
-		i.validateStruct(t, ty)
+		i.validateStruct(t.spec, ty)
 		i.emitMarshallableForStruct(ty)
+		if t.slice != nil {
+			i.emitMarshallableSliceForStruct(ty, t.slice)
+		}
 	case *ast.Ident:
 		i.validatePrimitiveNewtype(ty)
 		i.emitMarshallableForPrimitiveNewtype(ty)
+		if t.slice != nil {
+			i.emitMarshallableSliceForPrimitiveNewtype(ty, t.slice)
+		}
 	case *ast.ArrayType:
-		i.validateArrayNewtype(t.Name, ty)
+		i.validateArrayNewtype(t.spec.Name, ty)
 		// After validate, we can safely call arrayLen.
-		i.emitMarshallableForArrayNewtype(t.Name, ty.Elt.(*ast.Ident), arrayLen(ty))
+		i.emitMarshallableForArrayNewtype(t.spec.Name, ty.Elt.(*ast.Ident), arrayLen(ty))
+		if t.slice != nil {
+			abortAt(fset.Position(t.slice.comment.Slash), fmt.Sprintf("Array type marked as '+marshal slice:...', but this is not supported. Perhaps fold one of the dimensions?"))
+		}
 	default:
 		// This should've been filtered out by collectMarshallabeTypes.
 		panic(fmt.Sprintf("Unexpected type %+v", ty))
@@ -303,9 +369,9 @@ func (g *Generator) generateOne(t *ast.TypeSpec, fset *token.FileSet) *interface
 
 // generateOneTestSuite generates a test suite for the automatically generated
 // implementations type t.
-func (g *Generator) generateOneTestSuite(t *ast.TypeSpec) *testGenerator {
-	i := newTestGenerator(t)
-	i.emitTests()
+func (g *Generator) generateOneTestSuite(t marshallableType) *testGenerator {
+	i := newTestGenerator(t.spec)
+	i.emitTests(t.slice)
 	return i
 }
 
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
index 8babf61d2..8812c6878 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -163,3 +163,65 @@ func (g *interfaceGenerator) unmarshalScalar(accessor, typ, bufVar string) {
 		g.recordPotentiallyNonPackedField(accessor)
 	}
 }
+
+// emitCastToByteSlice unsafely casts an arbitrary type's underlying memory to a
+// byte slice, bypassing escape analysis. The caller is responsible for ensuring
+// srcPtr lives until they're done with dstVar, the runtime does not consider
+// dstVar dependent on srcPtr due to the escape analysis bypass.
+//
+// srcPtr must be a pointer.
+//
+// This function uses internally uses the identifier "hdr", and cannot be used
+// in a context where it is already bound.
+func (g *interfaceGenerator) emitCastToByteSlice(srcPtr, dstVar, lenExpr string) {
+	g.recordUsedImport("gohacks")
+	g.emit("// Construct a slice backed by dst's underlying memory.\n")
+	g.emit("var %s []byte\n", dstVar)
+	g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&%s))\n", dstVar)
+	g.emit("hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(%s)))\n", srcPtr)
+	g.emit("hdr.Len = %s\n", lenExpr)
+	g.emit("hdr.Cap = %s\n\n", lenExpr)
+}
+
+// emitCastToByteSlice unsafely casts a slice with elements of an abitrary type
+// to a byte slice. As part of the cast, the byte slice is made to look
+// independent of the src slice by bypassing escape analysis. This means the
+// byte slice can be used without causing the source to escape. The caller is
+// responsible for ensuring srcPtr lives until they're done with dstVar, as the
+// runtime no longer considers dstVar dependent on srcPtr and is free to GC it.
+//
+// srcPtr must be a pointer.
+//
+// This function uses internally uses the identifiers "ptr", "val" and "hdr",
+// and cannot be used in a context where these identifiers are already bound.
+func (g *interfaceGenerator) emitCastSliceToByteSlice(srcPtr, dstVar, lenExpr string) {
+	g.emitNoEscapeSliceDataPointer(srcPtr, "val")
+
+	g.emit("// Construct a slice backed by dst's underlying memory.\n")
+	g.emit("var %s []byte\n", dstVar)
+	g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&%s))\n", dstVar)
+	g.emit("hdr.Data = uintptr(val)\n")
+	g.emit("hdr.Len = %s\n", lenExpr)
+	g.emit("hdr.Cap = %s\n\n", lenExpr)
+}
+
+// emitNoEscapeSliceDataPointer unsafely casts a slice's data pointer to an
+// unsafe.Pointer, bypassing escape analysis. The caller is responsible for
+// ensuring srcPtr lives until they're done with dstVar, as the runtime no
+// longer considers dstVar dependent on srcPtr and is free to GC it.
+//
+// srcPtr must be a pointer.
+//
+// This function uses internally uses the identifier "ptr" cannot be used in a
+// context where this identifier is already bound.
+func (g *interfaceGenerator) emitNoEscapeSliceDataPointer(srcPtr, dstVar string) {
+	g.recordUsedImport("gohacks")
+	g.emit("ptr := unsafe.Pointer(%s)\n", srcPtr)
+	g.emit("%s := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data))\n\n", dstVar)
+}
+
+func (g *interfaceGenerator) emitKeepAlive(ptrVar string) {
+	g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", ptrVar)
+	g.emit("// must live until the use above.\n")
+	g.emit("runtime.KeepAlive(%s)\n", ptrVar)
+}
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
index da36d9305..5ba74a606 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
@@ -104,79 +104,43 @@ func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n, elt *ast.Ident,
 	})
 	g.emit("}\n\n")
 
+	g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n")
+	g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+		g.emit("length, err := task.CopyOutBytes(addr, buf[:limit])\n")
+		g.emitKeepAlive(g.r)
+		g.emit("return length, err\n")
+	})
+	g.emit("}\n\n")
+
 	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
-	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
-		// Fast serialization.
-		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-		g.emit("val := uintptr(ptr)\n")
-		g.emit("val = val^0\n\n")
-
-		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-		g.emit("var buf []byte\n")
-		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-		g.emit("hdr.Data = val\n")
-		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-		g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
-		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-		g.emit("// must live until after the CopyOutBytes.\n")
-		g.emit("runtime.KeepAlive(%s)\n", g.r)
-		g.emit("return err\n")
+		g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r)
 	})
 	g.emit("}\n\n")
 
 	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
-	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
-		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-		g.emit("val := uintptr(ptr)\n")
-		g.emit("val = val^0\n\n")
-
-		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-		g.emit("var buf []byte\n")
-		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-		g.emit("hdr.Data = val\n")
-		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-		g.emit("_, err := task.CopyInBytes(addr, buf)\n")
-		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-		g.emit("// must live until after the CopyInBytes.\n")
-		g.emit("runtime.KeepAlive(%s)\n", g.r)
-		g.emit("return err\n")
+		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+		g.emit("length, err := task.CopyInBytes(addr, buf)\n")
+		g.emitKeepAlive(g.r)
+		g.emit("return length, err\n")
 	})
 	g.emit("}\n\n")
 
 	g.emit("// WriteTo implements io.WriterTo.WriteTo.\n")
 	g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
-		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-		g.emit("val := uintptr(ptr)\n")
-		g.emit("val = val^0\n\n")
-
-		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-		g.emit("var buf []byte\n")
-		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-		g.emit("hdr.Data = val\n")
-		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-		g.emit("len, err := w.Write(buf)\n")
-		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-		g.emit("// must live until after the Write.\n")
-		g.emit("runtime.KeepAlive(%s)\n", g.r)
-		g.emit("return int64(len), err\n")
+		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+		g.emit("length, err := w.Write(buf)\n")
+		g.emitKeepAlive(g.r)
+		g.emit("return int64(length), err\n")
 
 	})
 	g.emit("}\n\n")
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go b/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
index 159397825..ef9bb903d 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
@@ -150,80 +150,133 @@ func (g *interfaceGenerator) emitMarshallableForPrimitiveNewtype(nt *ast.Ident)
 	})
 	g.emit("}\n\n")
 
+	g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n")
+	g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+		g.emit("length, err := task.CopyOutBytes(addr, buf[:limit])\n")
+		g.emitKeepAlive(g.r)
+		g.emit("return length, err\n")
+	})
+	g.emit("}\n\n")
+
 	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
-	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
-		// Fast serialization.
-		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-		g.emit("val := uintptr(ptr)\n")
-		g.emit("val = val^0\n\n")
-
-		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-		g.emit("var buf []byte\n")
-		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-		g.emit("hdr.Data = val\n")
-		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-		g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
-		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-		g.emit("// must live until after the CopyOutBytes.\n")
-		g.emit("runtime.KeepAlive(%s)\n", g.r)
-		g.emit("return err\n")
+		g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r)
 	})
 	g.emit("}\n\n")
 
 	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
-	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
-		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-		g.emit("val := uintptr(ptr)\n")
-		g.emit("val = val^0\n\n")
-
-		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-		g.emit("var buf []byte\n")
-		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-		g.emit("hdr.Data = val\n")
-		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-		g.emit("_, err := task.CopyInBytes(addr, buf)\n")
-		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-		g.emit("// must live until after the CopyInBytes.\n")
-		g.emit("runtime.KeepAlive(%s)\n", g.r)
-		g.emit("return err\n")
+		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+		g.emit("length, err := task.CopyInBytes(addr, buf)\n")
+		g.emitKeepAlive(g.r)
+		g.emit("return length, err\n")
 	})
 	g.emit("}\n\n")
 
 	g.emit("// WriteTo implements io.WriterTo.WriteTo.\n")
 	g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
-		g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-		g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-		g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-		g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-		g.emit("val := uintptr(ptr)\n")
-		g.emit("val = val^0\n\n")
-
-		g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-		g.emit("var buf []byte\n")
-		g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-		g.emit("hdr.Data = val\n")
-		g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-		g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-		g.emit("len, err := w.Write(buf)\n")
-		g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-		g.emit("// must live until after the Write.\n")
-		g.emit("runtime.KeepAlive(%s)\n", g.r)
-		g.emit("return int64(len), err\n")
+		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+		g.emit("length, err := w.Write(buf)\n")
+		g.emitKeepAlive(g.r)
+		g.emit("return int64(length), err\n")
+
+	})
+	g.emit("}\n\n")
+}
+
+func (g *interfaceGenerator) emitMarshallableSliceForPrimitiveNewtype(nt *ast.Ident, slice *sliceAPI) {
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("usermem")
+	g.recordUsedImport("reflect")
+	g.recordUsedImport("runtime")
+	g.recordUsedImport("unsafe")
+
+	eltType := g.typeName()
+	if slice.inner {
+		eltType = nt.Name
+	}
+
+	g.emit("// Copy%sIn copies in a slice of %s objects from the task's memory.\n", slice.ident, eltType)
+	g.emit("func Copy%sIn(task marshal.Task, addr usermem.Addr, dst []%s) (int, error) {\n", slice.ident, eltType)
+	g.inIndent(func() {
+		g.emit("count := len(dst)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		g.emitCastSliceToByteSlice("&dst", "buf", "size * count")
+
+		g.emit("length, err := task.CopyInBytes(addr, buf)\n")
+		g.emitKeepAlive("dst")
+		g.emit("return length, err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// Copy%sOut copies a slice of %s objects to the task's memory.\n", slice.ident, eltType)
+	g.emit("func Copy%sOut(task marshal.Task, addr usermem.Addr, src []%s) (int, error) {\n", slice.ident, eltType)
+	g.inIndent(func() {
+		g.emit("count := len(src)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		g.emitCastSliceToByteSlice("&src", "buf", "size * count")
+
+		g.emit("length, err := task.CopyOutBytes(addr, buf)\n")
+		g.emitKeepAlive("src")
+		g.emit("return length, err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalUnsafe%s is like %s.MarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName())
+	g.emit("func MarshalUnsafe%s(src []%s, dst []byte) (int, error) {\n", slice.ident, g.typeName())
+	g.inIndent(func() {
+		g.emit("count := len(src)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		g.emitNoEscapeSliceDataPointer("&src", "val")
+
+		g.emit("length, err := safecopy.CopyIn(dst[:(size*count)], val)\n")
+		g.emitKeepAlive("src")
+		g.emit("return length, err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalUnsafe%s is like %s.UnmarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName())
+	g.emit("func UnmarshalUnsafe%s(dst []%s, src []byte) (int, error) {\n", slice.ident, g.typeName())
+	g.inIndent(func() {
+		g.emit("count := len(dst)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		g.emitNoEscapeSliceDataPointer("&dst", "val")
 
+		g.emit("length, err := safecopy.CopyOut(val, src[:(size*count)])\n")
+		g.emitKeepAlive("dst")
+		g.emit("return length, err\n")
 	})
 	g.emit("}\n\n")
 }
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_struct.go b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
index e66a38b2e..bd57eae0e 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces_struct.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
@@ -72,20 +72,24 @@ func (g *interfaceGenerator) validateStruct(ts *ast.TypeSpec, st *ast.StructType
 	})
 }
 
-func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
-	// Is g.t a packed struct without consideing field types?
-	thisPacked := true
+func (g *interfaceGenerator) isStructPacked(st *ast.StructType) bool {
+	packed := true
 	forEachStructField(st, func(f *ast.Field) {
 		if f.Tag != nil {
 			if f.Tag.Value == "`marshal:\"unaligned\"`" {
-				if thisPacked {
+				if packed {
 					debugfAt(g.f.Position(g.t.Pos()),
 						fmt.Sprintf("Marking type '%s' as not packed due to tag `marshal:\"unaligned\"`.\n", g.t.Name))
-					thisPacked = false
+					packed = false
 				}
 			}
 		}
 	})
+	return packed
+}
+
+func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
+	thisPacked := g.isStructPacked(st)
 
 	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
 	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
@@ -302,17 +306,16 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 	})
 	g.emit("}\n\n")
 
-	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
+	g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n")
 	g.recordUsedImport("marshal")
 	g.recordUsedImport("usermem")
-	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		fallback := func() {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
 			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
 			g.emit("%s.MarshalBytes(buf)\n", g.r)
-			g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
-			g.emit("return err\n")
+			g.emit("return task.CopyOutBytes(addr, buf[:limit])\n")
 		}
 		if thisPacked {
 			g.recordUsedImport("reflect")
@@ -324,48 +327,39 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 				g.emit("}\n\n")
 			}
 			// Fast serialization.
-			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-			g.emit("val := uintptr(ptr)\n")
-			g.emit("val = val^0\n\n")
-
-			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-			g.emit("var buf []byte\n")
-			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-			g.emit("hdr.Data = val\n")
-			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-			g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
-			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-			g.emit("// must live until after the CopyOutBytes.\n")
-			g.emit("runtime.KeepAlive(%s)\n", g.r)
-			g.emit("return err\n")
+			g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+			g.emit("length, err := task.CopyOutBytes(addr, buf[:limit])\n")
+			g.emitKeepAlive(g.r)
+			g.emit("return length, err\n")
 		} else {
 			fallback()
 		}
 	})
 	g.emit("}\n\n")
 
+	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("usermem")
+	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r)
+	})
+	g.emit("}\n\n")
+
 	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
 	g.recordUsedImport("marshal")
 	g.recordUsedImport("usermem")
-	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		fallback := func() {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
 			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
-			g.emit("_, err := task.CopyInBytes(addr, buf)\n")
-			g.emit("if err != nil {\n")
-			g.inIndent(func() {
-				g.emit("return err\n")
-			})
-			g.emit("}\n")
-
+			g.emit("length, err := task.CopyInBytes(addr, buf)\n")
+			g.emit("// Unmarshal unconditionally. If we had a short copy-in, this results in a\n")
+			g.emit("// partially unmarshalled struct.\n")
 			g.emit("%s.UnmarshalBytes(buf)\n", g.r)
-			g.emit("return nil\n")
+			g.emit("return length, err\n")
 		}
 		if thisPacked {
 			g.recordUsedImport("reflect")
@@ -377,25 +371,11 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 				g.emit("}\n\n")
 			}
 			// Fast deserialization.
-			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-			g.emit("val := uintptr(ptr)\n")
-			g.emit("val = val^0\n\n")
-
-			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-			g.emit("var buf []byte\n")
-			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-			g.emit("hdr.Data = val\n")
-			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-			g.emit("_, err := task.CopyInBytes(addr, buf)\n")
-			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-			g.emit("// must live until after the CopyInBytes.\n")
-			g.emit("runtime.KeepAlive(%s)\n", g.r)
-			g.emit("return err\n")
+			g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+			g.emit("length, err := task.CopyInBytes(addr, buf)\n")
+			g.emitKeepAlive(g.r)
+			g.emit("return length, err\n")
 		} else {
 			fallback()
 		}
@@ -410,8 +390,8 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
 			g.emit("buf := make([]byte, %s.SizeBytes())\n", g.r)
 			g.emit("%s.MarshalBytes(buf)\n", g.r)
-			g.emit("n, err := w.Write(buf)\n")
-			g.emit("return int64(n), err\n")
+			g.emit("length, err := w.Write(buf)\n")
+			g.emit("return int64(length), err\n")
 		}
 		if thisPacked {
 			g.recordUsedImport("reflect")
@@ -423,25 +403,199 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 				g.emit("}\n\n")
 			}
 			// Fast serialization.
-			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-			g.emit("val := uintptr(ptr)\n")
-			g.emit("val = val^0\n\n")
-
-			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-			g.emit("var buf []byte\n")
-			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-			g.emit("hdr.Data = val\n")
-			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
-
-			g.emit("len, err := w.Write(buf)\n")
-			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-			g.emit("// must live until after the Write.\n")
-			g.emit("runtime.KeepAlive(%s)\n", g.r)
-			g.emit("return int64(len), err\n")
+			g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+			g.emit("length, err := w.Write(buf)\n")
+			g.emitKeepAlive(g.r)
+			g.emit("return int64(length), err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+}
+
+func (g *interfaceGenerator) emitMarshallableSliceForStruct(st *ast.StructType, slice *sliceAPI) {
+	thisPacked := g.isStructPacked(st)
+
+	if slice.inner {
+		abortAt(g.f.Position(slice.comment.Slash), fmt.Sprintf("The ':inner' argument to '+marshal slice:%s:inner' is only applicable to newtypes on primitives. Remove it from this struct declaration.", slice.ident))
+	}
+
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("usermem")
+
+	g.emit("// Copy%sIn copies in a slice of %s objects from the task's memory.\n", slice.ident, g.typeName())
+	g.emit("func Copy%sIn(task marshal.Task, addr usermem.Addr, dst []%s) (int, error) {\n", slice.ident, g.typeName())
+	g.inIndent(func() {
+		g.emit("count := len(dst)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
+			g.emit("buf := task.CopyScratchBuffer(size * count)\n")
+			g.emit("length, err := task.CopyInBytes(addr, buf)\n\n")
+
+			g.emit("// Unmarshal as much as possible, even on error. First handle full objects.\n")
+			g.emit("limit := length/size\n")
+			g.emit("for idx := 0; idx < limit; idx++ {\n")
+			g.inIndent(func() {
+				g.emit("dst[idx].UnmarshalBytes(buf[size*idx:size*(idx+1)])\n")
+			})
+			g.emit("}\n\n")
+
+			g.emit("// Handle any final partial object.\n")
+			g.emit("if length < size*count && length%size != 0 {\n")
+			g.inIndent(func() {
+				g.emit("idx := limit\n")
+				g.emit("dst[idx].UnmarshalBytes(buf[size*idx:size*(idx+1)])\n")
+			})
+			g.emit("}\n\n")
+
+			g.emit("return length, err\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if _, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !dst[0].Packed() {\n")
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			// Fast deserialization.
+			g.emitCastSliceToByteSlice("&dst", "buf", "size * count")
+
+			g.emit("length, err := task.CopyInBytes(addr, buf)\n")
+			g.emitKeepAlive("dst")
+			g.emit("return length, err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// Copy%sOut copies a slice of %s objects to the task's memory.\n", slice.ident, g.typeName())
+	g.emit("func Copy%sOut(task marshal.Task, addr usermem.Addr, src []%s) (int, error) {\n", slice.ident, g.typeName())
+	g.inIndent(func() {
+		g.emit("count := len(src)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
+			g.emit("buf := task.CopyScratchBuffer(size * count)\n")
+			g.emit("for idx := 0; idx < count; idx++ {\n")
+			g.inIndent(func() {
+				g.emit("src[idx].MarshalBytes(buf[size*idx:size*(idx+1)])\n")
+			})
+			g.emit("}\n")
+			g.emit("return task.CopyOutBytes(addr, buf)\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if _, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !src[0].Packed() {\n")
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			// Fast serialization.
+			g.emitCastSliceToByteSlice("&src", "buf", "size * count")
+
+			g.emit("length, err := task.CopyOutBytes(addr, buf)\n")
+			g.emitKeepAlive("src")
+			g.emit("return length, err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalUnsafe%s is like %s.MarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName())
+	g.emit("func MarshalUnsafe%s(src []%s, dst []byte) (int, error) {\n", slice.ident, g.typeName())
+	g.inIndent(func() {
+		g.emit("count := len(src)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
+			g.emit("for idx := 0; idx < count; idx++ {\n")
+			g.inIndent(func() {
+				g.emit("src[idx].MarshalBytes(dst[size*idx:(size)*(idx+1)])\n")
+			})
+			g.emit("}\n")
+			g.emit("return size * count, nil\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if _, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !src[0].Packed() {\n")
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			g.emitNoEscapeSliceDataPointer("&src", "val")
+
+			g.emit("length, err := safecopy.CopyIn(dst[:(size*count)], val)\n")
+			g.emitKeepAlive("src")
+			g.emit("return length, err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalUnsafe%s is like %s.UnmarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName())
+	g.emit("func UnmarshalUnsafe%s(dst []%s, src []byte) (int, error) {\n", slice.ident, g.typeName())
+	g.inIndent(func() {
+		g.emit("count := len(dst)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
+			g.emit("for idx := 0; idx < count; idx++ {\n")
+			g.inIndent(func() {
+				g.emit("dst[idx].UnmarshalBytes(src[size*idx:size*(idx+1)])\n")
+			})
+			g.emit("}\n")
+			g.emit("return size * count, nil\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if _, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !dst[0].Packed() {\n")
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			g.emitNoEscapeSliceDataPointer("&dst", "val")
+
+			g.emit("length, err := safecopy.CopyOut(val, src[:(size*count)])\n")
+			g.emitKeepAlive("dst")
+			g.emit("return length, err\n")
 		} else {
 			fallback()
 		}
diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go
index fd992e44a..631295373 100644
--- a/tools/go_marshal/gomarshal/generator_tests.go
+++ b/tools/go_marshal/gomarshal/generator_tests.go
@@ -30,6 +30,11 @@ var standardImports = []string{
 	"gvisor.dev/gvisor/tools/go_marshal/analysis",
 }
 
+var sliceAPIImports = []string{
+	"encoding/binary",
+	"gvisor.dev/gvisor/pkg/usermem",
+}
+
 type testGenerator struct {
 	sourceBuffer
 
@@ -58,6 +63,11 @@ func newTestGenerator(t *ast.TypeSpec) *testGenerator {
 	for _, i := range standardImports {
 		g.imports.add(i).markUsed()
 	}
+	// These imports are used if a type requests the slice API. Don't
+	// mark them as used by default.
+	for _, i := range sliceAPIImports {
+		g.imports.add(i)
+	}
 
 	return g
 }
@@ -132,6 +142,42 @@ func (g *testGenerator) emitTestMarshalUnmarshalPreservesData() {
 	})
 }
 
+func (g *testGenerator) emitTestMarshalUnmarshalSlicePreservesData(slice *sliceAPI) {
+	for _, name := range []string{"binary", "usermem"} {
+		if !g.imports.markUsed(name) {
+			panic(fmt.Sprintf("Generated test for '%s' referenced a non-existent import with local name '%s'", g.typeName(), name))
+		}
+	}
+
+	g.inTestFunction("TestSafeMarshalUnmarshalSlicePreservesData", func() {
+		g.emit("var x, y, yUnsafe [8]%s\n", g.typeName())
+		g.emit("analysis.RandomizeValue(&x)\n\n")
+		g.emit("size := (*%s)(nil).SizeBytes() * len(x)\n", g.typeName())
+		g.emit("buf := bytes.NewBuffer(make([]byte, size))\n")
+		g.emit("buf.Reset()\n")
+		g.emit("if err := binary.Write(buf, usermem.ByteOrder, x[:]); err != nil {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"binary.Write failed: %v\", err))\n")
+		})
+		g.emit("}\n")
+		g.emit("bufUnsafe := make([]byte, size)\n")
+		g.emit("MarshalUnsafe%s(x[:], bufUnsafe)\n\n", slice.ident)
+
+		g.emit("UnmarshalUnsafe%s(y[:], buf.Bytes())\n", slice.ident)
+		g.emit("if !reflect.DeepEqual(x, y) {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across binary.Write/UnmarshalUnsafeSlice cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, y))\n")
+		})
+		g.emit("}\n")
+		g.emit("UnmarshalUnsafe%s(yUnsafe[:], bufUnsafe)\n", slice.ident)
+		g.emit("if !reflect.DeepEqual(x, yUnsafe) {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafeSlice/UnmarshalUnsafeSlice cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, yUnsafe))\n")
+		})
+		g.emit("}\n\n")
+	})
+}
+
 func (g *testGenerator) emitTestWriteToUnmarshalPreservesData() {
 	g.inTestFunction("TestWriteToUnmarshalPreservesData", func() {
 		g.emit("var x, y, yUnsafe %s\n", g.typeName())
@@ -170,12 +216,16 @@ func (g *testGenerator) emitTestSizeBytesOnTypedNilPtr() {
 	})
 }
 
-func (g *testGenerator) emitTests() {
+func (g *testGenerator) emitTests(slice *sliceAPI) {
 	g.emitTestNonZeroSize()
 	g.emitTestSuspectAlignment()
 	g.emitTestMarshalUnmarshalPreservesData()
 	g.emitTestWriteToUnmarshalPreservesData()
 	g.emitTestSizeBytesOnTypedNilPtr()
+
+	if slice != nil {
+		g.emitTestMarshalUnmarshalSlicePreservesData(slice)
+	}
 }
 
 func (g *testGenerator) write(out io.Writer) error {
diff --git a/tools/go_marshal/gomarshal/util.go b/tools/go_marshal/gomarshal/util.go
index a0936e013..4cb22dd2d 100644
--- a/tools/go_marshal/gomarshal/util.go
+++ b/tools/go_marshal/gomarshal/util.go
@@ -344,22 +344,25 @@ func newImportTable() *importTable {
 // result in a panic.
 func (i *importTable) merge(other *importTable) {
 	for name, im := range other.is {
-		if dup, ok := i.is[name]; ok && !dup.equivalent(im) {
-			panic(fmt.Sprintf("Found colliding import statements: ours: %+v, other's: %+v", dup, im))
+		dup, ok := i.is[name]
+		if ok {
+			// When merging two imports, if either are marked used, the merged entry
+			// should also be marked used.
+			im.used = im.used || dup.used
+
+			if !dup.equivalent(im) {
+				panic(fmt.Sprintf("Found colliding import statements: ours: %+v, other's: %+v", dup, im))
+			}
 		}
-
 		i.is[name] = im
 	}
 }
 
 func (i *importTable) addStmt(s *importStmt) *importStmt {
 	if old, ok := i.is[s.name]; ok && !old.equivalent(s) {
-		// A collision should always be between an import inserted by the
-		// go-marshal tool and an import from the original source file (assuming
-		// the original source file was valid). We could theoretically handle
-		// the collision by assigning a local name to our import. However, this
-		// would need to be plumbed throughout the generator. Given that
-		// collisions should be rare, simply panic on collision.
+		// We could theoretically handle the collision by assigning a local name
+		// to one of the imports. However, this is a non-trivial transformation.
+		// Given that collisions should be rare, simply panic on collision.
 		panic(fmt.Sprintf("Import collision: old: %s as %v; new: %v as %v", old.path, old.name, s.path, s.name))
 	}
 	i.is[s.name] = s
diff --git a/tools/go_marshal/marshal/marshal.go b/tools/go_marshal/marshal/marshal.go
index f129788e0..cb2166252 100644
--- a/tools/go_marshal/marshal/marshal.go
+++ b/tools/go_marshal/marshal/marshal.go
@@ -42,7 +42,11 @@ type Task interface {
 	CopyInBytes(addr usermem.Addr, b []byte) (int, error)
 }
 
-// Marshallable represents a type that can be marshalled to and from memory.
+// Marshallable represents operations on a type that can be marshalled to and
+// from memory.
+//
+// go-marshal automatically generates implementations for this interface for
+// types marked as '+marshal'.
 type Marshallable interface {
 	io.WriterTo
 
@@ -54,12 +58,18 @@ type Marshallable interface {
 	// likely make use of the type of these fields).
 	SizeBytes() int
 
-	// MarshalBytes serializes a copy of a type to dst. dst must be at least
-	// SizeBytes() long.
+	// MarshalBytes serializes a copy of a type to dst. dst may be smaller than
+	// SizeBytes(), which results in a part of the struct being marshalled. Note
+	// that this may have unexpected results for non-packed types, as implicit
+	// padding needs to be taken into account when reasoning about how much of
+	// the type is serialized.
 	MarshalBytes(dst []byte)
 
-	// UnmarshalBytes deserializes a type from src. src must be at least
-	// SizeBytes() long.
+	// UnmarshalBytes deserializes a type from src. src may be smaller than
+	// SizeBytes(), which results in a partially deserialized struct. Note that
+	// this may have unexpected results for non-packed types, as implicit
+	// padding needs to be taken into account when reasoning about how much of
+	// the type is deserialized.
 	UnmarshalBytes(src []byte)
 
 	// Packed returns true if the marshalled size of the type is the same as the
@@ -67,13 +77,20 @@ type Marshallable interface {
 	// starting at unaligned addresses (should always be true by default for ABI
 	// structs, verified by automatically generated tests when using
 	// go_marshal), and has no fields marked `marshal:"unaligned"`.
+	//
+	// Packed must return the same result for all possible values of the type
+	// implementing it. Violating this constraint implies the type doesn't have
+	// a static memory layout, and will lead to memory corruption.
+	// Go-marshal-generated code reuses the result of Packed for multiple values
+	// of the same type.
 	Packed() bool
 
 	// MarshalUnsafe serializes a type by bulk copying its in-memory
 	// representation to the dst buffer. This is only safe to do when the type
 	// has no implicit padding, see Marshallable.Packed. When Packed would
 	// return false, MarshalUnsafe should fall back to the safer but slower
-	// MarshalBytes.
+	// MarshalBytes. dst may be smaller than SizeBytes(), see comment for
+	// MarshalBytes for implications.
 	MarshalUnsafe(dst []byte)
 
 	// UnmarshalUnsafe deserializes a type by directly copying to the underlying
@@ -82,7 +99,8 @@ type Marshallable interface {
 	// This allows much faster unmarshalling of types which have no implicit
 	// padding, see Marshallable.Packed. When Packed would return false,
 	// UnmarshalUnsafe should fall back to the safer but slower unmarshal
-	// mechanism implemented in UnmarshalBytes.
+	// mechanism implemented in UnmarshalBytes. src may be smaller than
+	// SizeBytes(), see comment for UnmarshalBytes for implications.
 	UnmarshalUnsafe(src []byte)
 
 	// CopyIn deserializes a Marshallable type from a task's memory. This may
@@ -91,12 +109,79 @@ type Marshallable interface {
 	// marshalled does not escape. The implementation should avoid creating
 	// extra copies in memory by directly deserializing to the object's
 	// underlying memory.
-	CopyIn(task Task, addr usermem.Addr) error
+	//
+	// If the copy-in from the task memory is only partially successful, CopyIn
+	// should still attempt to deserialize as much data as possible. See comment
+	// for UnmarshalBytes.
+	CopyIn(task Task, addr usermem.Addr) (int, error)
 
 	// CopyOut serializes a Marshallable type to a task's memory. This may only
 	// be called from a task goroutine. This is more efficient than calling
 	// MarshalUnsafe on Marshallable.Packed types, as the type being serialized
 	// does not escape. The implementation should avoid creating extra copies in
 	// memory by directly serializing from the object's underlying memory.
-	CopyOut(task Task, addr usermem.Addr) error
+	//
+	// The copy-out to the task memory may be partially successful, in which
+	// case CopyOut returns how much data was serialized. See comment for
+	// MarshalBytes for implications.
+	CopyOut(task Task, addr usermem.Addr) (int, error)
+
+	// CopyOutN is like CopyOut, but explicitly requests a partial
+	// copy-out. Note that this may yield unexpected results for non-packed
+	// types and the caller may only want to allow this for packed types. See
+	// comment on MarshalBytes.
+	//
+	// The limit must be less than or equal to SizeBytes().
+	CopyOutN(task Task, addr usermem.Addr, limit int) (int, error)
 }
+
+// go-marshal generates additional functions for a type based on additional
+// clauses to the +marshal directive. They are documented below.
+//
+// Slice API
+// =========
+//
+// Adding a "slice" clause to the +marshal directive for structs or newtypes on
+// primitives like this:
+//
+// // +marshal slice:FooSlice
+// type Foo struct { ... }
+//
+// Generates four additional functions for marshalling slices of Foos like this:
+//
+// // MarshalUnsafeFooSlice is like Foo.MarshalUnsafe, buf for a []Foo. It's
+// // more efficient that repeatedly calling calling Foo.MarshalUnsafe over a
+// // []Foo in a loop.
+// func MarshalUnsafeFooSlice(src []Foo, dst []byte) (int, error) { ... }
+//
+// // UnmarshalUnsafeFooSlice is like Foo.UnmarshalUnsafe, buf for a []Foo. It's
+// // more efficient that repeatedly calling calling Foo.UnmarshalUnsafe over a
+// // []Foo in a loop.
+// func UnmarshalUnsafeFooSlice(dst []Foo, src []byte) (int, error) { ... }
+//
+// // CopyFooSliceIn copies in a slice of Foo objects from the task's memory.
+// func CopyFooSliceIn(task marshal.Task, addr usermem.Addr, dst []Foo) (int, error) { ... }
+//
+// // CopyFooSliceIn copies out a slice of Foo objects to the task's memory.
+// func CopyFooSliceOut(task marshal.Task, addr usermem.Addr, src []Foo) (int, error) { ... }
+//
+// The name of the functions are of the format "Copy%sIn" and "Copy%sOut", where
+// %s is the first argument to the slice clause. This directive is not supported
+// for newtypes on arrays.
+//
+// The slice clause also takes an optional second argument, which must be the
+// value "inner":
+//
+// // +marshal slice:Int32Slice:inner
+// type Int32 int32
+//
+// This is only valid on newtypes on primitives, and causes the generated
+// functions to accept slices of the inner type instead:
+//
+// func CopyInt32SliceIn(task marshal.Task, addr usermem.Addr, dst []int32) (int, error) { ... }
+//
+// Without "inner", they would instead be:
+//
+// func CopyInt32SliceIn(task marshal.Task, addr usermem.Addr, dst []Int32) (int, error) { ... }
+//
+// This may help avoid a cast depending on how the generated functions are used.
diff --git a/tools/go_marshal/primitive/BUILD b/tools/go_marshal/primitive/BUILD
new file mode 100644
index 000000000..cc08ba63a
--- /dev/null
+++ b/tools/go_marshal/primitive/BUILD
@@ -0,0 +1,18 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "primitive",
+    srcs = [
+        "primitive.go",
+    ],
+    marshal = True,
+    visibility = [
+        "//:sandbox",
+    ],
+    deps = [
+        "//pkg/usermem",
+        "//tools/go_marshal/marshal",
+    ],
+)
diff --git a/tools/go_marshal/primitive/primitive.go b/tools/go_marshal/primitive/primitive.go
new file mode 100644
index 000000000..ebcf130ae
--- /dev/null
+++ b/tools/go_marshal/primitive/primitive.go
@@ -0,0 +1,175 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package primitive defines marshal.Marshallable implementations for primitive
+// types.
+package primitive
+
+import (
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+)
+
+// Int16 is a marshal.Marshallable implementation for int16.
+//
+// +marshal slice:Int16Slice:inner
+type Int16 int16
+
+// Uint16 is a marshal.Marshallable implementation for uint16.
+//
+// +marshal slice:Uint16Slice:inner
+type Uint16 uint16
+
+// Int32 is a marshal.Marshallable implementation for int32.
+//
+// +marshal slice:Int32Slice:inner
+type Int32 int32
+
+// Uint32 is a marshal.Marshallable implementation for uint32.
+//
+// +marshal slice:Uint32Slice:inner
+type Uint32 uint32
+
+// Int64 is a marshal.Marshallable implementation for int64.
+//
+// +marshal slice:Int64Slice:inner
+type Int64 int64
+
+// Uint64 is a marshal.Marshallable implementation for uint64.
+//
+// +marshal slice:Uint64Slice:inner
+type Uint64 uint64
+
+// Below, we define some convenience functions for marshalling primitive types
+// using the newtypes above, without requiring superfluous casts.
+
+// 16-bit integers
+
+// CopyInt16In is a convenient wrapper for copying in an int16 from the task's
+// memory.
+func CopyInt16In(task marshal.Task, addr usermem.Addr, dst *int16) (int, error) {
+	var buf Int16
+	n, err := buf.CopyIn(task, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = int16(buf)
+	return n, nil
+}
+
+// CopyInt16Out is a convenient wrapper for copying out an int16 to the task's
+// memory.
+func CopyInt16Out(task marshal.Task, addr usermem.Addr, src int16) (int, error) {
+	srcP := Int16(src)
+	return srcP.CopyOut(task, addr)
+}
+
+// CopyUint16In is a convenient wrapper for copying in a uint16 from the task's
+// memory.
+func CopyUint16In(task marshal.Task, addr usermem.Addr, dst *uint16) (int, error) {
+	var buf Uint16
+	n, err := buf.CopyIn(task, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = uint16(buf)
+	return n, nil
+}
+
+// CopyUint16Out is a convenient wrapper for copying out a uint16 to the task's
+// memory.
+func CopyUint16Out(task marshal.Task, addr usermem.Addr, src uint16) (int, error) {
+	srcP := Uint16(src)
+	return srcP.CopyOut(task, addr)
+}
+
+// 32-bit integers
+
+// CopyInt32In is a convenient wrapper for copying in an int32 from the task's
+// memory.
+func CopyInt32In(task marshal.Task, addr usermem.Addr, dst *int32) (int, error) {
+	var buf Int32
+	n, err := buf.CopyIn(task, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = int32(buf)
+	return n, nil
+}
+
+// CopyInt32Out is a convenient wrapper for copying out an int32 to the task's
+// memory.
+func CopyInt32Out(task marshal.Task, addr usermem.Addr, src int32) (int, error) {
+	srcP := Int32(src)
+	return srcP.CopyOut(task, addr)
+}
+
+// CopyUint32In is a convenient wrapper for copying in a uint32 from the task's
+// memory.
+func CopyUint32In(task marshal.Task, addr usermem.Addr, dst *uint32) (int, error) {
+	var buf Uint32
+	n, err := buf.CopyIn(task, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = uint32(buf)
+	return n, nil
+}
+
+// CopyUint32Out is a convenient wrapper for copying out a uint32 to the task's
+// memory.
+func CopyUint32Out(task marshal.Task, addr usermem.Addr, src uint32) (int, error) {
+	srcP := Uint32(src)
+	return srcP.CopyOut(task, addr)
+}
+
+// 64-bit integers
+
+// CopyInt64In is a convenient wrapper for copying in an int64 from the task's
+// memory.
+func CopyInt64In(task marshal.Task, addr usermem.Addr, dst *int64) (int, error) {
+	var buf Int64
+	n, err := buf.CopyIn(task, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = int64(buf)
+	return n, nil
+}
+
+// CopyInt64Out is a convenient wrapper for copying out an int64 to the task's
+// memory.
+func CopyInt64Out(task marshal.Task, addr usermem.Addr, src int64) (int, error) {
+	srcP := Int64(src)
+	return srcP.CopyOut(task, addr)
+}
+
+// CopyUint64In is a convenient wrapper for copying in a uint64 from the task's
+// memory.
+func CopyUint64In(task marshal.Task, addr usermem.Addr, dst *uint64) (int, error) {
+	var buf Uint64
+	n, err := buf.CopyIn(task, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = uint64(buf)
+	return n, nil
+}
+
+// CopyUint64Out is a convenient wrapper for copying out a uint64 to the task's
+// memory.
+func CopyUint64Out(task marshal.Task, addr usermem.Addr, src uint64) (int, error) {
+	srcP := Uint64(src)
+	return srcP.CopyOut(task, addr)
+}
diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD
index f27c5ce52..3b839799d 100644
--- a/tools/go_marshal/test/BUILD
+++ b/tools/go_marshal/test/BUILD
@@ -39,3 +39,17 @@ go_binary(
         "//tools/go_marshal/marshal",
     ],
 )
+
+go_test(
+    name = "marshal_test",
+    size = "small",
+    srcs = ["marshal_test.go"],
+    deps = [
+        ":test",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//tools/go_marshal/analysis",
+        "//tools/go_marshal/marshal",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
+)
diff --git a/tools/go_marshal/test/benchmark_test.go b/tools/go_marshal/test/benchmark_test.go
index c79defe9e..224d308c7 100644
--- a/tools/go_marshal/test/benchmark_test.go
+++ b/tools/go_marshal/test/benchmark_test.go
@@ -176,3 +176,45 @@ func BenchmarkGoMarshalUnsafe(b *testing.B) {
 		panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2))
 	}
 }
+
+func BenchmarkBinarySlice(b *testing.B) {
+	var s1, s2 [64]test.Stat
+	analysis.RandomizeValue(&s1)
+
+	size := binary.Size(s1)
+
+	b.ResetTimer()
+
+	for n := 0; n < b.N; n++ {
+		buf := make([]byte, 0, size)
+		buf = binary.Marshal(buf, usermem.ByteOrder, &s1)
+		binary.Unmarshal(buf, usermem.ByteOrder, &s2)
+	}
+
+	b.StopTimer()
+
+	// Sanity check, make sure the values were preserved.
+	if !reflect.DeepEqual(s1, s2) {
+		panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2))
+	}
+}
+
+func BenchmarkGoMarshalUnsafeSlice(b *testing.B) {
+	var s1, s2 [64]test.Stat
+	analysis.RandomizeValue(&s1)
+
+	b.ResetTimer()
+
+	for n := 0; n < b.N; n++ {
+		buf := make([]byte, (*test.Stat)(nil).SizeBytes()*len(s1))
+		test.MarshalUnsafeStatSlice(s1[:], buf)
+		test.UnmarshalUnsafeStatSlice(s2[:], buf)
+	}
+
+	b.StopTimer()
+
+	// Sanity check, make sure the values were preserved.
+	if !reflect.DeepEqual(s1, s2) {
+		panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2))
+	}
+}
diff --git a/tools/go_marshal/test/external/external.go b/tools/go_marshal/test/external/external.go
index 4be3722f3..26fe8e0c8 100644
--- a/tools/go_marshal/test/external/external.go
+++ b/tools/go_marshal/test/external/external.go
@@ -21,3 +21,11 @@ package external
 type External struct {
 	j int64
 }
+
+// NotPacked is an unaligned Marshallable type for use in testing.
+//
+// +marshal
+type NotPacked struct {
+	a int32
+	b byte `marshal:"unaligned"`
+}
diff --git a/tools/go_marshal/test/marshal_test.go b/tools/go_marshal/test/marshal_test.go
new file mode 100644
index 000000000..16829ee45
--- /dev/null
+++ b/tools/go_marshal/test/marshal_test.go
@@ -0,0 +1,515 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package marshal_test contains manual tests for the marshal interface. These
+// are intended to test behaviour not covered by the automatically generated
+// tests.
+package marshal_test
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"reflect"
+	"runtime"
+	"testing"
+	"unsafe"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/analysis"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+	"gvisor.dev/gvisor/tools/go_marshal/test"
+)
+
+var simulatedErr error = syserror.EFAULT
+
+// mockTask implements marshal.Task.
+type mockTask struct {
+	taskMem usermem.BytesIO
+}
+
+// populate fills the task memory with the contents of val.
+func (t *mockTask) populate(val interface{}) {
+	var buf bytes.Buffer
+	// Use binary.Write so we aren't testing go-marshal against its own
+	// potentially buggy implementation.
+	if err := binary.Write(&buf, usermem.ByteOrder, val); err != nil {
+		panic(err)
+	}
+	t.taskMem.Bytes = buf.Bytes()
+}
+
+func (t *mockTask) setLimit(n int) {
+	if len(t.taskMem.Bytes) < n {
+		grown := make([]byte, n)
+		copy(grown, t.taskMem.Bytes)
+		t.taskMem.Bytes = grown
+		return
+	}
+	t.taskMem.Bytes = t.taskMem.Bytes[:n]
+}
+
+// CopyScratchBuffer implements marshal.Task.CopyScratchBuffer.
+func (t *mockTask) CopyScratchBuffer(size int) []byte {
+	return make([]byte, size)
+}
+
+// CopyOutBytes implements marshal.Task.CopyOutBytes. The implementation
+// completely ignores the target address and stores a copy of b in its
+// internally buffer, overriding any previous contents.
+func (t *mockTask) CopyOutBytes(_ usermem.Addr, b []byte) (int, error) {
+	return t.taskMem.CopyOut(nil, 0, b, usermem.IOOpts{})
+}
+
+// CopyInBytes implements marshal.Task.CopyInBytes. The implementation
+// completely ignores the source address and always fills b from the begining of
+// its internal buffer.
+func (t *mockTask) CopyInBytes(_ usermem.Addr, b []byte) (int, error) {
+	return t.taskMem.CopyIn(nil, 0, b, usermem.IOOpts{})
+}
+
+// unsafeMemory returns the underlying memory for m. The returned slice is only
+// valid for the lifetime for m. The garbage collector isn't aware that the
+// returned slice is related to m, the caller must ensure m lives long enough.
+func unsafeMemory(m marshal.Marshallable) []byte {
+	if !m.Packed() {
+		// We can't return a slice pointing to the underlying memory
+		// since the layout isn't packed. Allocate a temporary buffer
+		// and marshal instead.
+		var buf bytes.Buffer
+		if err := binary.Write(&buf, usermem.ByteOrder, m); err != nil {
+			panic(err)
+		}
+		return buf.Bytes()
+	}
+
+	// reflect.ValueOf(m)
+	//   .Elem() // Unwrap interface to inner concrete object
+	//   .Addr() // Pointer value to object
+	//   .Pointer() // Actual address from the pointer value
+	ptr := reflect.ValueOf(m).Elem().Addr().Pointer()
+
+	size := m.SizeBytes()
+
+	var mem []byte
+	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&mem))
+	hdr.Data = ptr
+	hdr.Len = size
+	hdr.Cap = size
+
+	return mem
+}
+
+// unsafeMemorySlice returns the underlying memory for m. The returned slice is
+// only valid for the lifetime for m. The garbage collector isn't aware that the
+// returned slice is related to m, the caller must ensure m lives long enough.
+//
+// Precondition: m must be a slice.
+func unsafeMemorySlice(m interface{}, elt marshal.Marshallable) []byte {
+	kind := reflect.TypeOf(m).Kind()
+	if kind != reflect.Slice {
+		panic("unsafeMemorySlice called on non-slice")
+	}
+
+	if !elt.Packed() {
+		// We can't return a slice pointing to the underlying memory
+		// since the layout isn't packed. Allocate a temporary buffer
+		// and marshal instead.
+		var buf bytes.Buffer
+		if err := binary.Write(&buf, usermem.ByteOrder, m); err != nil {
+			panic(err)
+		}
+		return buf.Bytes()
+	}
+
+	v := reflect.ValueOf(m)
+	length := v.Len() * elt.SizeBytes()
+
+	var mem []byte
+	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&mem))
+	hdr.Data = v.Pointer() // This is a pointer to the first elem for slices.
+	hdr.Len = length
+	hdr.Cap = length
+
+	return mem
+}
+
+func isZeroes(buf []byte) bool {
+	for _, b := range buf {
+		if b != 0 {
+			return false
+		}
+	}
+	return true
+}
+
+// compareMemory compares the first n bytes of two chuncks of memory represented
+// by expected and actual.
+func compareMemory(t *testing.T, expected, actual []byte, n int) {
+	t.Logf("Expected (%d): %v (%d) + (%d) %v\n", len(expected), expected[:n], n, len(expected)-n, expected[n:])
+	t.Logf("Actual   (%d): %v (%d) + (%d) %v\n", len(actual), actual[:n], n, len(actual)-n, actual[n:])
+
+	if diff := cmp.Diff(expected[:n], actual[:n]); diff != "" {
+		t.Errorf("Memory buffers don't match:\n--- expected only\n+++ actual only\n%v", diff)
+	}
+}
+
+// limitedCopyIn populates task memory with src, then unmarshals task memory to
+// dst. The task signals an error at limit bytes during copy-in, which should
+// result in a truncated unmarshalling.
+func limitedCopyIn(t *testing.T, src, dst marshal.Marshallable, limit int) {
+	var task mockTask
+	task.populate(src)
+	task.setLimit(limit)
+
+	n, err := dst.CopyIn(&task, usermem.Addr(0))
+	if n != limit {
+		t.Errorf("CopyIn copied unexpected number of bytes, expected %d, got %d", limit, n)
+	}
+	if err != simulatedErr {
+		t.Errorf("CopyIn returned unexpected error, expected %v, got %v", simulatedErr, err)
+	}
+
+	expectedMem := unsafeMemory(src)
+	defer runtime.KeepAlive(src)
+	actualMem := unsafeMemory(dst)
+	defer runtime.KeepAlive(dst)
+
+	compareMemory(t, expectedMem, actualMem, n)
+
+	// The last n bytes should be zero for actual, since actual was
+	// zero-initialized, and CopyIn shouldn't have touched those bytes. However
+	// we can only guarantee we didn't touch anything in the last n bytes if the
+	// layout is packed.
+	if dst.Packed() && !isZeroes(actualMem[n:]) {
+		t.Errorf("Expected the last %d bytes of copied in object to be zeroes, got %v\n", dst.SizeBytes()-n, actualMem)
+	}
+}
+
+// limitedCopyOut marshals src to task memory. The task signals an error at
+// limit bytes during copy-out, which should result in a truncated marshalling.
+func limitedCopyOut(t *testing.T, src marshal.Marshallable, limit int) {
+	var task mockTask
+	task.setLimit(limit)
+
+	n, err := src.CopyOut(&task, usermem.Addr(0))
+	if n != limit {
+		t.Errorf("CopyOut copied unexpected number of bytes, expected %d, got %d", limit, n)
+	}
+	if err != simulatedErr {
+		t.Errorf("CopyOut returned unexpected error, expected %v, got %v", simulatedErr, err)
+	}
+
+	expectedMem := unsafeMemory(src)
+	defer runtime.KeepAlive(src)
+	actualMem := task.taskMem.Bytes
+
+	compareMemory(t, expectedMem, actualMem, n)
+}
+
+// copyOutN marshals src to task memory, requesting the marshalling to be
+// limited to limit bytes.
+func copyOutN(t *testing.T, src marshal.Marshallable, limit int) {
+	var task mockTask
+	task.setLimit(limit)
+
+	n, err := src.CopyOutN(&task, usermem.Addr(0), limit)
+	if err != nil {
+		t.Errorf("CopyOut returned unexpected error: %v", err)
+	}
+	if n != limit {
+		t.Errorf("CopyOut copied unexpected number of bytes, expected %d, got %d", limit, n)
+	}
+
+	expectedMem := unsafeMemory(src)
+	defer runtime.KeepAlive(src)
+	actualMem := task.taskMem.Bytes
+
+	t.Logf("Expected: %v + %v\n", expectedMem[:n], expectedMem[n:])
+	t.Logf("Actual  : %v + %v\n", actualMem[:n], actualMem[n:])
+
+	compareMemory(t, expectedMem, actualMem, n)
+}
+
+// TestLimitedMarshalling verifies marshalling/unmarshalling succeeds when the
+// underyling copy in/out operations partially succeed.
+func TestLimitedMarshalling(t *testing.T) {
+	types := []reflect.Type{
+		// Packed types.
+		reflect.TypeOf((*test.Type2)(nil)),
+		reflect.TypeOf((*test.Type3)(nil)),
+		reflect.TypeOf((*test.Timespec)(nil)),
+		reflect.TypeOf((*test.Stat)(nil)),
+		reflect.TypeOf((*test.InetAddr)(nil)),
+		reflect.TypeOf((*test.SignalSet)(nil)),
+		reflect.TypeOf((*test.SignalSetAlias)(nil)),
+		// Non-packed types.
+		reflect.TypeOf((*test.Type1)(nil)),
+		reflect.TypeOf((*test.Type4)(nil)),
+		reflect.TypeOf((*test.Type5)(nil)),
+		reflect.TypeOf((*test.Type6)(nil)),
+		reflect.TypeOf((*test.Type7)(nil)),
+		reflect.TypeOf((*test.Type8)(nil)),
+	}
+
+	for _, tyPtr := range types {
+		// Remove one level of pointer-indirection from the type. We get this
+		// back when we pass the type to reflect.New.
+		ty := tyPtr.Elem()
+
+		// Partial copy-in.
+		t.Run(fmt.Sprintf("PartialCopyIn_%v", ty), func(t *testing.T) {
+			expected := reflect.New(ty).Interface().(marshal.Marshallable)
+			actual := reflect.New(ty).Interface().(marshal.Marshallable)
+			analysis.RandomizeValue(expected)
+
+			limitedCopyIn(t, expected, actual, expected.SizeBytes()/2)
+		})
+
+		// Partial copy-out.
+		t.Run(fmt.Sprintf("PartialCopyOut_%v", ty), func(t *testing.T) {
+			expected := reflect.New(ty).Interface().(marshal.Marshallable)
+			analysis.RandomizeValue(expected)
+
+			limitedCopyOut(t, expected, expected.SizeBytes()/2)
+		})
+
+		// Explicitly request partial copy-out.
+		t.Run(fmt.Sprintf("PartialCopyOutN_%v", ty), func(t *testing.T) {
+			expected := reflect.New(ty).Interface().(marshal.Marshallable)
+			analysis.RandomizeValue(expected)
+
+			copyOutN(t, expected, expected.SizeBytes()/2)
+		})
+	}
+}
+
+// TestLimitedMarshalling verifies marshalling/unmarshalling of slices of
+// marshallable types succeed when the underyling copy in/out operations
+// partially succeed.
+func TestLimitedSliceMarshalling(t *testing.T) {
+	types := []struct {
+		arrayPtrType reflect.Type
+		copySliceIn  func(task marshal.Task, addr usermem.Addr, dstSlice interface{}) (int, error)
+		copySliceOut func(task marshal.Task, addr usermem.Addr, srcSlice interface{}) (int, error)
+		unsafeMemory func(arrPtr interface{}) []byte
+	}{
+		// Packed types.
+		{
+			reflect.TypeOf((*[20]test.Stat)(nil)),
+			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+				slice := dst.(*[20]test.Stat)[:]
+				return test.CopyStatSliceIn(task, addr, slice)
+			},
+			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+				slice := src.(*[20]test.Stat)[:]
+				return test.CopyStatSliceOut(task, addr, slice)
+			},
+			func(a interface{}) []byte {
+				slice := a.(*[20]test.Stat)[:]
+				return unsafeMemorySlice(slice, &slice[0])
+			},
+		},
+		{
+			reflect.TypeOf((*[1]test.Stat)(nil)),
+			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+				slice := dst.(*[1]test.Stat)[:]
+				return test.CopyStatSliceIn(task, addr, slice)
+			},
+			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+				slice := src.(*[1]test.Stat)[:]
+				return test.CopyStatSliceOut(task, addr, slice)
+			},
+			func(a interface{}) []byte {
+				slice := a.(*[1]test.Stat)[:]
+				return unsafeMemorySlice(slice, &slice[0])
+			},
+		},
+		{
+			reflect.TypeOf((*[5]test.SignalSetAlias)(nil)),
+			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+				slice := dst.(*[5]test.SignalSetAlias)[:]
+				return test.CopySignalSetAliasSliceIn(task, addr, slice)
+			},
+			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+				slice := src.(*[5]test.SignalSetAlias)[:]
+				return test.CopySignalSetAliasSliceOut(task, addr, slice)
+			},
+			func(a interface{}) []byte {
+				slice := a.(*[5]test.SignalSetAlias)[:]
+				return unsafeMemorySlice(slice, &slice[0])
+			},
+		},
+		// Non-packed types.
+		{
+			reflect.TypeOf((*[20]test.Type1)(nil)),
+			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+				slice := dst.(*[20]test.Type1)[:]
+				return test.CopyType1SliceIn(task, addr, slice)
+			},
+			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+				slice := src.(*[20]test.Type1)[:]
+				return test.CopyType1SliceOut(task, addr, slice)
+			},
+			func(a interface{}) []byte {
+				slice := a.(*[20]test.Type1)[:]
+				return unsafeMemorySlice(slice, &slice[0])
+			},
+		},
+		{
+			reflect.TypeOf((*[1]test.Type1)(nil)),
+			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+				slice := dst.(*[1]test.Type1)[:]
+				return test.CopyType1SliceIn(task, addr, slice)
+			},
+			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+				slice := src.(*[1]test.Type1)[:]
+				return test.CopyType1SliceOut(task, addr, slice)
+			},
+			func(a interface{}) []byte {
+				slice := a.(*[1]test.Type1)[:]
+				return unsafeMemorySlice(slice, &slice[0])
+			},
+		},
+		{
+			reflect.TypeOf((*[7]test.Type8)(nil)),
+			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+				slice := dst.(*[7]test.Type8)[:]
+				return test.CopyType8SliceIn(task, addr, slice)
+			},
+			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+				slice := src.(*[7]test.Type8)[:]
+				return test.CopyType8SliceOut(task, addr, slice)
+			},
+			func(a interface{}) []byte {
+				slice := a.(*[7]test.Type8)[:]
+				return unsafeMemorySlice(slice, &slice[0])
+			},
+		},
+	}
+
+	for _, tt := range types {
+		// The body of this loop is generic over the type tt.arrayPtrType, with
+		// the help of reflection. To aid in readability, comments below show
+		// the equivalent go code assuming
+		// tt.arrayPtrType = typeof(*[20]test.Stat).
+
+		// Equivalent:
+		// var x *[20]test.Stat
+		// arrayTy := reflect.TypeOf(*x)
+		arrayTy := tt.arrayPtrType.Elem()
+
+		// Partial copy-in of slices.
+		t.Run(fmt.Sprintf("PartialCopySliceIn_%v", arrayTy), func(t *testing.T) {
+			// Equivalent:
+			// var x [20]test.Stat
+			// length := len(x)
+			length := arrayTy.Len()
+			if length < 1 {
+				panic("Test type can't be zero-length array")
+			}
+			// Equivalent:
+			// elem := new(test.Stat).(marshal.Marshallable)
+			elem := reflect.New(arrayTy.Elem()).Interface().(marshal.Marshallable)
+
+			// Equivalent:
+			// var expected, actual interface{}
+			// expected = new([20]test.Stat)
+			// actual = new([20]test.Stat)
+			expected := reflect.New(arrayTy).Interface()
+			actual := reflect.New(arrayTy).Interface()
+
+			analysis.RandomizeValue(expected)
+
+			limit := (length * elem.SizeBytes()) / 2
+			// Also make sure the limit is partially inside one of the elements.
+			limit += elem.SizeBytes() / 2
+			analysis.RandomizeValue(expected)
+
+			var task mockTask
+			task.populate(expected)
+			task.setLimit(limit)
+
+			n, err := tt.copySliceIn(&task, usermem.Addr(0), actual)
+			if n != limit {
+				t.Errorf("CopyIn copied unexpected number of bytes, expected %d, got %d", limit, n)
+			}
+			if n < length*elem.SizeBytes() && err != simulatedErr {
+				t.Errorf("CopyIn returned unexpected error, expected %v, got %v", simulatedErr, err)
+			}
+
+			expectedMem := tt.unsafeMemory(expected)
+			defer runtime.KeepAlive(expected)
+			actualMem := tt.unsafeMemory(actual)
+			defer runtime.KeepAlive(actual)
+
+			compareMemory(t, expectedMem, actualMem, n)
+
+			// The last n bytes should be zero for actual, since actual was
+			// zero-initialized, and CopyIn shouldn't have touched those bytes. However
+			// we can only guarantee we didn't touch anything in the last n bytes if the
+			// layout is packed.
+			if elem.Packed() && !isZeroes(actualMem[n:]) {
+				t.Errorf("Expected the last %d bytes of copied in object to be zeroes, got %v\n", (elem.SizeBytes()*length)-n, actualMem)
+			}
+		})
+
+		// Partial copy-out of slices.
+		t.Run(fmt.Sprintf("PartialCopySliceOut_%v", arrayTy), func(t *testing.T) {
+			// Equivalent:
+			// var x [20]test.Stat
+			// length := len(x)
+			length := arrayTy.Len()
+			if length < 1 {
+				panic("Test type can't be zero-length array")
+			}
+			// Equivalent:
+			// elem := new(test.Stat).(marshal.Marshallable)
+			elem := reflect.New(arrayTy.Elem()).Interface().(marshal.Marshallable)
+
+			// Equivalent:
+			// var expected, actual interface{}
+			// expected = new([20]test.Stat)
+			// actual = new([20]test.Stat)
+			expected := reflect.New(arrayTy).Interface()
+
+			analysis.RandomizeValue(expected)
+
+			limit := (length * elem.SizeBytes()) / 2
+			// Also make sure the limit is partially inside one of the elements.
+			limit += elem.SizeBytes() / 2
+			analysis.RandomizeValue(expected)
+
+			var task mockTask
+			task.populate(expected)
+			task.setLimit(limit)
+
+			n, err := tt.copySliceOut(&task, usermem.Addr(0), expected)
+			if n != limit {
+				t.Errorf("CopyIn copied unexpected number of bytes, expected %d, got %d", limit, n)
+			}
+			if n < length*elem.SizeBytes() && err != simulatedErr {
+				t.Errorf("CopyIn returned unexpected error, expected %v, got %v", simulatedErr, err)
+			}
+
+			expectedMem := tt.unsafeMemory(expected)
+			defer runtime.KeepAlive(expected)
+			actualMem := task.taskMem.Bytes
+
+			compareMemory(t, expectedMem, actualMem, n)
+		})
+	}
+}
diff --git a/tools/go_marshal/test/test.go b/tools/go_marshal/test/test.go
index c829db6da..43df73545 100644
--- a/tools/go_marshal/test/test.go
+++ b/tools/go_marshal/test/test.go
@@ -23,7 +23,7 @@ import (
 
 // Type1 is a test data type.
 //
-// +marshal
+// +marshal slice:Type1Slice
 type Type1 struct {
 	a    Type2
 	x, y int64 // Multiple field names.
@@ -75,6 +75,34 @@ type Type5 struct {
 	m int64
 }
 
+// Type6 is a test data type ends mid-word.
+//
+// +marshal
+type Type6 struct {
+	a int64
+	b int64
+	// If c isn't marked unaligned, analysis fails (as it should, since
+	// the unsafe API corrupts Type7).
+	c byte `marshal:"unaligned"`
+}
+
+// Type7 is a test data type that contains a child struct that ends
+// mid-word.
+// +marshal
+type Type7 struct {
+	x Type6
+	y int64
+}
+
+// Type8 is a test data type which contains an external non-packed field.
+//
+// +marshal slice:Type8Slice
+type Type8 struct {
+	a  int64
+	np ex.NotPacked
+	b  int64
+}
+
 // Timespec represents struct timespec in <time.h>.
 //
 // +marshal
@@ -85,7 +113,7 @@ type Timespec struct {
 
 // Stat represents struct stat.
 //
-// +marshal
+// +marshal slice:StatSlice
 type Stat struct {
 	Dev     uint64
 	Ino     uint64
@@ -111,10 +139,10 @@ type InetAddr [4]byte
 
 // SignalSet is an example marshallable newtype on a primitive.
 //
-// +marshal
+// +marshal slice:SignalSetSlice:inner
 type SignalSet uint64
 
 // SignalSetAlias is an example newtype on another marshallable type.
 //
-// +marshal
+// +marshal slice:SignalSetAliasSlice
 type SignalSetAlias SignalSet
-- 
cgit v1.2.3


From 507f997213d4b6778c5da982dd447044b769e7b9 Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Wed, 1 Apr 2020 00:42:34 -0700
Subject: go-marshal: Improve collision detection of import statments.

Previously, the import statement collision detection mechanism aborted
go-marshal whenever it detected two imports in any package that has
the same local name. Consider this trivial package, defined by the the
following two source files:

file1.go:

package example
import (
        path/a/to/foo
)
...

file2.go:

package example
import (
       another/package/with/final/component/foo
)
...

Go-marshal previously couldn't handle generating code for the the
above package, even if none of the types marked for marshalling used
either of the imported foo packages. This turns out to be too
restrictive as we run into this a lot in practice. Examples include
"encoding/binary" vs "gvisor/pkg/binary/binary", and "sync" vs
"gvisor/pkg/sync/sync".

This change allows go-marshal to proceed with marshalling, and only
abort if the code generated by go-marshal references any such
ambiguous import names.

PiperOrigin-RevId: 304131190
---
 tools/go_marshal/gomarshal/generator.go            |   4 +-
 tools/go_marshal/gomarshal/generator_interfaces.go |   1 -
 .../gomarshal/generator_interfaces_struct.go       |  23 ++-
 tools/go_marshal/gomarshal/util.go                 | 154 ++++++++++++++++-----
 4 files changed, 141 insertions(+), 41 deletions(-)

diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 935a36b25..43e668b63 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -326,7 +326,7 @@ func (g *Generator) collectImports(a *ast.File, f *token.FileSet) map[string]imp
 
 			// Make sure we have an import that doesn't use any local names that
 			// would conflict with identifiers in the generated code.
-			if len(i.name) == 1 {
+			if len(i.name) == 1 && i.name != "_" {
 				abortAt(f.Position(spec.Pos()), fmt.Sprintf("Import has a single character local name '%s'; this may conflict with code generated by go_marshal, use a multi-character import alias", i.name))
 			}
 			if _, ok := badIdentsMap[i.name]; ok {
@@ -421,7 +421,7 @@ func (g *Generator) Run() error {
 			// the list of imports we need to copy to the generated code.
 			for name, _ := range impl.is {
 				if !g.imports.markUsed(name) {
-					panic(fmt.Sprintf("Generated code for '%s' referenced a non-existent import with local name '%s'", impl.typeName(), name))
+					panic(fmt.Sprintf("Generated code for '%s' referenced a non-existent import with local name '%s'. Either go-marshal needs to add an import to the generated file, or a package in an input source file has a package name differ from the final component of its path, which go-marshal doesn't know how to detect; use an import alias to work around this limitation.", impl.typeName(), name))
 				}
 			}
 			ts = append(ts, g.generateOneTestSuite(t))
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
index 8812c6878..8f1c27145 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -72,7 +72,6 @@ func (g *interfaceGenerator) recordUsedMarshallable(m string) {
 
 func (g *interfaceGenerator) recordUsedImport(i string) {
 	g.is[i] = struct{}{}
-
 }
 
 func (g *interfaceGenerator) recordPotentiallyNonPackedField(fieldName string) {
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_struct.go b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
index bd57eae0e..e837f58db 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces_struct.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
@@ -152,7 +152,7 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 						g.shift("dst", len)
 					} else {
 						// We can't use shiftDynamic here because we don't have
-						// an instance of the dynamic type we can referece here
+						// an instance of the dynamic type we can reference here
 						// (since the version in this struct is anonymous). Use
 						// a typed nil pointer to call SizeBytes() instead.
 						g.emit("dst = dst[(*%s)(nil).SizeBytes():]\n", t.Name)
@@ -162,6 +162,11 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 				g.marshalScalar(g.fieldAccessor(n), t.Name, "dst")
 			},
 			selector: func(n, tX, tSel *ast.Ident) {
+				if n.Name == "_" {
+					g.emit("// Padding: dst[:sizeof(%s)] ~= %s(0)\n", tX.Name, tSel.Name)
+					g.emit("dst = dst[(*%s.%s)(nil).SizeBytes():]\n", tX.Name, tSel.Name)
+					return
+				}
 				g.marshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "dst")
 			},
 			array: func(n, t *ast.Ident, size int) {
@@ -199,11 +204,11 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 					if len, dynamic := g.scalarSize(t); !dynamic {
 						g.shift("src", len)
 					} else {
-						// We can't use shiftDynamic here because we don't have
-						// an instance of the dynamic type we can reference here
-						// (since the version in this struct is anonymous). Use
-						// a typed nil pointer to call SizeBytes() instead.
-						g.emit("src = src[(*%s)(nil).SizeBytes():]\n", t.Name)
+						// We don't have an instance of the dynamic type we can
+						// reference here (since the version in this struct is
+						// anonymous). Use a typed nil pointer to call
+						// SizeBytes() instead.
+						g.shiftDynamic("src", fmt.Sprintf("(*%s)(nil)", t.Name))
 						g.recordPotentiallyNonPackedField(fmt.Sprintf("(*%s)(nil)", t.Name))
 					}
 					return
@@ -211,6 +216,12 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 				g.unmarshalScalar(g.fieldAccessor(n), t.Name, "src")
 			},
 			selector: func(n, tX, tSel *ast.Ident) {
+				if n.Name == "_" {
+					g.emit("// Padding: %s ~= src[:sizeof(%s.%s)]\n", g.fieldAccessor(n), tX.Name, tSel.Name)
+					g.emit("src = src[(*%s.%s)(nil).SizeBytes():]\n", tX.Name, tSel.Name)
+					g.recordPotentiallyNonPackedField(fmt.Sprintf("(*%s.%s)(nil)", tX.Name, tSel.Name))
+					return
+				}
 				g.unmarshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "src")
 			},
 			array: func(n, t *ast.Ident, size int) {
diff --git a/tools/go_marshal/gomarshal/util.go b/tools/go_marshal/gomarshal/util.go
index 4cb22dd2d..96025ff39 100644
--- a/tools/go_marshal/gomarshal/util.go
+++ b/tools/go_marshal/gomarshal/util.go
@@ -285,6 +285,11 @@ type importStmt struct {
 	aliased bool
 	// Indicates whether this import was referenced by generated code.
 	used bool
+	// AST node and file set representing the import statement, if any. These
+	// are only non-nil if the import statement originates from an input source
+	// file.
+	spec *ast.ImportSpec
+	fset *token.FileSet
 }
 
 func newImport(p string) *importStmt {
@@ -310,14 +315,27 @@ func newImportFromSpec(spec *ast.ImportSpec, f *token.FileSet) *importStmt {
 		name:    name,
 		path:    p,
 		aliased: spec.Name != nil,
+		spec:    spec,
+		fset:    f,
 	}
 }
 
+// String implements fmt.Stringer.String. This generates a string for the import
+// statement appropriate for writing directly to generated code.
 func (i *importStmt) String() string {
 	if i.aliased {
-		return fmt.Sprintf("%s \"%s\"", i.name, i.path)
+		return fmt.Sprintf("%s %q", i.name, i.path)
 	}
-	return fmt.Sprintf("\"%s\"", i.path)
+	return fmt.Sprintf("%q", i.path)
+}
+
+// debugString returns a debug string representing an import statement. This
+// representation is not valid golang code and is used for debugging output.
+func (i *importStmt) debugString() string {
+	if i.spec != nil && i.fset != nil {
+		return fmt.Sprintf("%s: %s", i.fset.Position(i.spec.Path.Pos()), i)
+	}
+	return fmt.Sprintf("(go-marshal import): %s", i)
 }
 
 func (i *importStmt) markUsed() {
@@ -329,43 +347,78 @@ func (i *importStmt) equivalent(other *importStmt) bool {
 }
 
 // importTable represents a collection of importStmts.
+//
+// An importTable may contain multiple import statements referencing the same
+// local name. All import statements aliasing to the same local name are
+// technically ambiguous, as if such an import name is used in the generated
+// code, it's not clear which import statement it refers to. We ignore any
+// potential collisions until actually writing the import table to the generated
+// source file. See importTable.write.
+//
+// Given the following import statements across all the files comprising a
+// package marshalled:
+//
+// "sync"
+// "pkg/sync"
+// "pkg/sentry/kernel"
+// ktime "pkg/sentry/kernel/time"
+//
+// An importTable representing them would look like this:
+//
+// importTable {
+//     is: map[string][]*importStmt {
+//         "sync": []*importStmt{
+//             importStmt{name:"sync", path:"sync", aliased:false}
+//             importStmt{name:"sync", path:"pkg/sync", aliased:false}
+//         },
+//         "kernel": []*importStmt{importStmt{
+//            name: "kernel",
+//            path: "pkg/sentry/kernel",
+//            aliased: false
+//         }},
+//         "ktime": []*importStmt{importStmt{
+//             name: "ktime",
+//             path: "pkg/sentry/kernel/time",
+//             aliased: true,
+//         }},
+//     }
+// }
+//
+// Note that the local name "sync" is assigned to two different import
+// statements. This is possible if the import statements are from different
+// source files in the same package.
+//
+// Since go-marshal generates a single output file per package regardless of the
+// number of input files, if "sync" is referenced by any generated code, it's
+// unclear which import statement "sync" refers to. While it's theoretically
+// possible to resolve this by assigning a unique local alias to each instance
+// of the sync package, go-marshal currently aborts when it encounters such an
+// ambiguity.
+//
+// TODO(b/151478251): importTable considers the final component of an import
+// path to be the package name, but this is only a convention. The actual
+// package name is determined by the package statement in the source files for
+// the package.
 type importTable struct {
 	// Map of imports and whether they should be copied to the output.
-	is map[string]*importStmt
+	is map[string][]*importStmt
 }
 
 func newImportTable() *importTable {
 	return &importTable{
-		is: make(map[string]*importStmt),
+		is: make(map[string][]*importStmt),
 	}
 }
 
-// Merges import statements from other into i. Collisions in import statements
-// result in a panic.
+// Merges import statements from other into i.
 func (i *importTable) merge(other *importTable) {
-	for name, im := range other.is {
-		dup, ok := i.is[name]
-		if ok {
-			// When merging two imports, if either are marked used, the merged entry
-			// should also be marked used.
-			im.used = im.used || dup.used
-
-			if !dup.equivalent(im) {
-				panic(fmt.Sprintf("Found colliding import statements: ours: %+v, other's: %+v", dup, im))
-			}
-		}
-		i.is[name] = im
+	for name, ims := range other.is {
+		i.is[name] = append(i.is[name], ims...)
 	}
 }
 
 func (i *importTable) addStmt(s *importStmt) *importStmt {
-	if old, ok := i.is[s.name]; ok && !old.equivalent(s) {
-		// We could theoretically handle the collision by assigning a local name
-		// to one of the imports. However, this is a non-trivial transformation.
-		// Given that collisions should be rare, simply panic on collision.
-		panic(fmt.Sprintf("Import collision: old: %s as %v; new: %v as %v", old.path, old.name, s.path, s.name))
-	}
-	i.is[s.name] = s
+	i.is[s.name] = append(i.is[s.name], s)
 	return s
 }
 
@@ -381,16 +434,20 @@ func (i *importTable) addFromSpec(spec *ast.ImportSpec, f *token.FileSet) *impor
 // Marks the import named n as used. If no such import is in the table, returns
 // false.
 func (i *importTable) markUsed(n string) bool {
-	if n, ok := i.is[n]; ok {
-		n.markUsed()
+	if ns, ok := i.is[n]; ok {
+		for _, n := range ns {
+			n.markUsed()
+		}
 		return true
 	}
 	return false
 }
 
 func (i *importTable) clear() {
-	for _, i := range i.is {
-		i.used = false
+	for _, is := range i.is {
+		for _, i := range is {
+			i.used = false
+		}
 	}
 }
 
@@ -401,9 +458,42 @@ func (i *importTable) write(out io.Writer) error {
 	}
 
 	imports := make([]string, 0, len(i.is))
-	for _, i := range i.is {
-		if i.used {
-			imports = append(imports, i.String())
+	for name, is := range i.is {
+		var lastUsed *importStmt
+		var ambiguous bool
+
+		for _, i := range is {
+			if i.used {
+				if lastUsed != nil {
+					if !i.equivalent(lastUsed) {
+						ambiguous = true
+					}
+				}
+				lastUsed = i
+			}
+		}
+
+		if ambiguous {
+			// We have two or more import statements across the different source
+			// files that share a local name, and at least one of these imports
+			// are used by the generated code. This ambiguity can't be resolved
+			// by go-marshal and requires the user intervention. Dump a list of
+			// the colliding import statements and let the user modify the input
+			// files as appropriate.
+			var b strings.Builder
+			fmt.Fprintf(&b, "The imported name %q is used by one of the types marked for marshalling, and which import statement the code refers to is ambiguous. Perhaps give the imports unique local names?\n\n", name)
+			fmt.Fprintf(&b, "The following %d import statements are ambiguous for the local name %q:\n", len(is), name)
+			// Note: len(is) is guaranteed to be 1 or greater or ambiguous can't
+			// be true. Therefore the slicing below is safe.
+			for _, i := range is[:len(is)-1] {
+				fmt.Fprintf(&b, "  %v\n", i.debugString())
+			}
+			fmt.Fprintf(&b, "  %v", is[len(is)-1].debugString())
+			panic(b.String())
+		}
+
+		if lastUsed != nil {
+			imports = append(imports, lastUsed.String())
 		}
 	}
 	sort.Strings(imports)
-- 
cgit v1.2.3


From d01a8ca3473bfbc7a5eb8da5ea662925e5f2673d Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 1 Apr 2020 07:56:05 -0700
Subject: Add FileDescription interface for socket files.

Refactor the existing socket interface to share methods between VFS1 and VFS2.
The method signatures do not contain anything filesystem-related, so they don't
need to be re-defined for VFS2.

Updates #1476, #1478, #1484, #1485.

PiperOrigin-RevId: 304184545
---
 pkg/sentry/socket/BUILD     |  1 +
 pkg/sentry/socket/socket.go | 87 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index 611fa22c3..c40c6d673 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -16,6 +16,7 @@ go_library(
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
         "//pkg/syserr",
         "//pkg/tcpip",
         "//pkg/usermem",
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 50d9744e6..b5ba4a56b 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -31,6 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -48,11 +49,25 @@ func (c *ControlMessages) Release() {
 	c.Unix.Release()
 }
 
-// Socket is the interface containing socket syscalls used by the syscall layer
-// to redirect them to the appropriate implementation.
+// Socket is an interface combining fs.FileOperations and SocketOps,
+// representing a VFS1 socket file.
 type Socket interface {
 	fs.FileOperations
+	SocketOps
+}
+
+// SocketVFS2 is an interface combining vfs.FileDescription and SocketOps,
+// representing a VFS2 socket file.
+type SocketVFS2 interface {
+	vfs.FileDescriptionImpl
+	SocketOps
+}
 
+// SocketOps is the interface containing socket syscalls used by the syscall
+// layer to redirect them to the appropriate implementation.
+//
+// It is implemented by both Socket and SocketVFS2.
+type SocketOps interface {
 	// Connect implements the connect(2) linux syscall.
 	Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error
 
@@ -153,6 +168,8 @@ var families = make(map[int][]Provider)
 // RegisterProvider registers the provider of a given address family so that
 // sockets of that type can be created via socket() and/or socketpair()
 // syscalls.
+//
+// This should only be called during the initialization of the address family.
 func RegisterProvider(family int, provider Provider) {
 	families[family] = append(families[family], provider)
 }
@@ -216,6 +233,72 @@ func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent {
 	return fs.NewDirent(ctx, inode, fmt.Sprintf("socket:[%d]", ino))
 }
 
+// ProviderVFS2 is the vfs2 interface implemented by providers of sockets for
+// specific address families (e.g., AF_INET).
+type ProviderVFS2 interface {
+	// Socket creates a new socket.
+	//
+	// If a nil Socket _and_ a nil error is returned, it means that the
+	// protocol is not supported. A non-nil error should only be returned
+	// if the protocol is supported, but an error occurs during creation.
+	Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error)
+
+	// Pair creates a pair of connected sockets.
+	//
+	// See Socket for error information.
+	Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error)
+}
+
+// familiesVFS2 holds a map of all known address families and their providers.
+var familiesVFS2 = make(map[int][]ProviderVFS2)
+
+// RegisterProviderVFS2 registers the provider of a given address family so that
+// sockets of that type can be created via socket() and/or socketpair()
+// syscalls.
+//
+// This should only be called during the initialization of the address family.
+func RegisterProviderVFS2(family int, provider ProviderVFS2) {
+	familiesVFS2[family] = append(familiesVFS2[family], provider)
+}
+
+// NewVFS2 creates a new socket with the given family, type and protocol.
+func NewVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
+	for _, p := range familiesVFS2[family] {
+		s, err := p.Socket(t, stype, protocol)
+		if err != nil {
+			return nil, err
+		}
+		if s != nil {
+			// TODO: Add vfs2 sockets to global socket table.
+			return s, nil
+		}
+	}
+
+	return nil, syserr.ErrAddressFamilyNotSupported
+}
+
+// PairVFS2 creates a new connected socket pair with the given family, type and
+// protocol.
+func PairVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) {
+	providers, ok := familiesVFS2[family]
+	if !ok {
+		return nil, nil, syserr.ErrAddressFamilyNotSupported
+	}
+
+	for _, p := range providers {
+		s1, s2, err := p.Pair(t, stype, protocol)
+		if err != nil {
+			return nil, nil, err
+		}
+		if s1 != nil && s2 != nil {
+			// TODO: Add vfs2 sockets to global socket table.
+			return s1, s2, nil
+		}
+	}
+
+	return nil, nil, syserr.ErrSocketNotSupported
+}
+
 // SendReceiveTimeout stores timeouts for send and receive calls.
 //
 // It is meant to be embedded into Socket implementations to help satisfy the
-- 
cgit v1.2.3


From db7917556a7e4bd2cd6d183c68f04a4787dec493 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Wed, 1 Apr 2020 09:58:51 -0700
Subject: Fix 386 build tags

The build tag for 32-bit x86 is 386, not i386.

Updates #2298

PiperOrigin-RevId: 304206373
---
 pkg/cpuid/cpuid_parse_x86_test.go                      | 2 +-
 pkg/cpuid/cpuid_x86.go                                 | 2 +-
 pkg/cpuid/cpuid_x86_test.go                            | 2 +-
 pkg/sentry/arch/arch_state_x86.go                      | 2 +-
 pkg/sentry/arch/arch_x86.go                            | 2 +-
 pkg/sentry/arch/arch_x86_impl.go                       | 2 +-
 pkg/sentry/arch/signal_stack.go                        | 2 +-
 pkg/sentry/platform/ring0/pagetables/pagetables_x86.go | 2 +-
 pkg/sentry/platform/ring0/x86.go                       | 2 +-
 pkg/usermem/usermem_x86.go                             | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/pkg/cpuid/cpuid_parse_x86_test.go b/pkg/cpuid/cpuid_parse_x86_test.go
index d48418e69..c9bd40e1b 100644
--- a/pkg/cpuid/cpuid_parse_x86_test.go
+++ b/pkg/cpuid/cpuid_parse_x86_test.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
+// +build 386 amd64
 
 package cpuid
 
diff --git a/pkg/cpuid/cpuid_x86.go b/pkg/cpuid/cpuid_x86.go
index 9abf6914d..562f8f405 100644
--- a/pkg/cpuid/cpuid_x86.go
+++ b/pkg/cpuid/cpuid_x86.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
+// +build 386 amd64
 
 package cpuid
 
diff --git a/pkg/cpuid/cpuid_x86_test.go b/pkg/cpuid/cpuid_x86_test.go
index 0fe20c213..bacf345c8 100644
--- a/pkg/cpuid/cpuid_x86_test.go
+++ b/pkg/cpuid/cpuid_x86_test.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
+// +build 386 amd64
 
 package cpuid
 
diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go
index e35c9214a..aa31169e0 100644
--- a/pkg/sentry/arch/arch_state_x86.go
+++ b/pkg/sentry/arch/arch_state_x86.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build amd64 i386
+// +build amd64 386
 
 package arch
 
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index 88b40a9d1..7fc4c0473 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build amd64 i386
+// +build amd64 386
 
 package arch
 
diff --git a/pkg/sentry/arch/arch_x86_impl.go b/pkg/sentry/arch/arch_x86_impl.go
index 04ac283c6..3edf40764 100644
--- a/pkg/sentry/arch/arch_x86_impl.go
+++ b/pkg/sentry/arch/arch_x86_impl.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build amd64 i386
+// +build amd64 386
 
 package arch
 
diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go
index 1a6056171..e58f055c7 100644
--- a/pkg/sentry/arch/signal_stack.go
+++ b/pkg/sentry/arch/signal_stack.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64 arm64
+// +build 386 amd64 arm64
 
 package arch
 
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
index dcf061df9..157438d9b 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
+// +build 386 amd64
 
 package pagetables
 
diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go
index 5f80d64e8..9da0ea685 100644
--- a/pkg/sentry/platform/ring0/x86.go
+++ b/pkg/sentry/platform/ring0/x86.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
+// +build 386 amd64
 
 package ring0
 
diff --git a/pkg/usermem/usermem_x86.go b/pkg/usermem/usermem_x86.go
index 8059b72d2..d96f829fb 100644
--- a/pkg/usermem/usermem_x86.go
+++ b/pkg/usermem/usermem_x86.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build amd64 i386
+// +build amd64 386
 
 package usermem
 
-- 
cgit v1.2.3


From 4e6a1a5adb5607423c180089d8b464ef7dfdd1ae Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 1 Apr 2020 11:05:05 -0700
Subject: Automated rollback of changelist 303799678

PiperOrigin-RevId: 304221302
---
 pkg/atomicbitops/atomicbitops_amd64.s           | 16 ++--
 pkg/atomicbitops/atomicbitops_arm64.s           | 16 ++--
 pkg/atomicbitops/atomicbitops_noasm.go          |  8 --
 pkg/safecopy/safecopy.go                        |  4 +-
 pkg/safecopy/safecopy_unsafe.go                 |  6 +-
 pkg/sentry/platform/kvm/BUILD                   |  1 -
 pkg/sentry/platform/kvm/bluepill.go             | 12 +--
 pkg/sentry/platform/kvm/bluepill_unsafe.go      | 97 +++++++++----------------
 pkg/sentry/platform/kvm/kvm_const.go            |  1 -
 pkg/sentry/platform/kvm/kvm_test.go             | 64 ++--------------
 pkg/sentry/platform/kvm/machine.go              |  9 ++-
 pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 25 +++++++
 pkg/sentry/platform/kvm/machine_arm64_unsafe.go | 26 +++++++
 pkg/sentry/platform/kvm/machine_unsafe.go       | 41 -----------
 runsc/sandbox/sandbox.go                        |  6 ++
 15 files changed, 125 insertions(+), 207 deletions(-)

diff --git a/pkg/atomicbitops/atomicbitops_amd64.s b/pkg/atomicbitops/atomicbitops_amd64.s
index f0edd4de7..54c887ee5 100644
--- a/pkg/atomicbitops/atomicbitops_amd64.s
+++ b/pkg/atomicbitops/atomicbitops_amd64.s
@@ -16,28 +16,28 @@
 
 #include "textflag.h"
 
-TEXT ·AndUint32(SB),NOSPLIT,$0-12
+TEXT ·AndUint32(SB),$0-12
   MOVQ  addr+0(FP), BP
   MOVL  val+8(FP), AX
   LOCK
   ANDL   AX, 0(BP)
   RET
 
-TEXT ·OrUint32(SB),NOSPLIT,$0-12
+TEXT ·OrUint32(SB),$0-12
   MOVQ  addr+0(FP), BP
   MOVL  val+8(FP), AX
   LOCK
   ORL   AX, 0(BP)
   RET
 
-TEXT ·XorUint32(SB),NOSPLIT,$0-12
+TEXT ·XorUint32(SB),$0-12
   MOVQ  addr+0(FP), BP
   MOVL  val+8(FP), AX
   LOCK
   XORL   AX, 0(BP)
   RET
 
-TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-20
+TEXT ·CompareAndSwapUint32(SB),$0-20
   MOVQ  addr+0(FP), DI
   MOVL  old+8(FP), AX
   MOVL  new+12(FP), DX
@@ -46,28 +46,28 @@ TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-20
   MOVL  AX, ret+16(FP)
   RET
 
-TEXT ·AndUint64(SB),NOSPLIT,$0-16
+TEXT ·AndUint64(SB),$0-16
   MOVQ  addr+0(FP), BP
   MOVQ  val+8(FP), AX
   LOCK
   ANDQ   AX, 0(BP)
   RET
 
-TEXT ·OrUint64(SB),NOSPLIT,$0-16
+TEXT ·OrUint64(SB),$0-16
   MOVQ  addr+0(FP), BP
   MOVQ  val+8(FP), AX
   LOCK
   ORQ   AX, 0(BP)
   RET
 
-TEXT ·XorUint64(SB),NOSPLIT,$0-16
+TEXT ·XorUint64(SB),$0-16
   MOVQ  addr+0(FP), BP
   MOVQ  val+8(FP), AX
   LOCK
   XORQ   AX, 0(BP)
   RET
 
-TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-32
+TEXT ·CompareAndSwapUint64(SB),$0-32
   MOVQ  addr+0(FP), DI
   MOVQ  old+8(FP), AX
   MOVQ  new+16(FP), DX
diff --git a/pkg/atomicbitops/atomicbitops_arm64.s b/pkg/atomicbitops/atomicbitops_arm64.s
index 644a6bca5..5c780851b 100644
--- a/pkg/atomicbitops/atomicbitops_arm64.s
+++ b/pkg/atomicbitops/atomicbitops_arm64.s
@@ -16,7 +16,7 @@
 
 #include "textflag.h"
 
-TEXT ·AndUint32(SB),NOSPLIT,$0-12
+TEXT ·AndUint32(SB),$0-12
   MOVD    ptr+0(FP), R0
   MOVW    val+8(FP), R1
 again:
@@ -26,7 +26,7 @@ again:
   CBNZ    R3, again
   RET
 
-TEXT ·OrUint32(SB),NOSPLIT,$0-12
+TEXT ·OrUint32(SB),$0-12
   MOVD    ptr+0(FP), R0
   MOVW    val+8(FP), R1
 again:
@@ -36,7 +36,7 @@ again:
   CBNZ    R3, again
   RET
 
-TEXT ·XorUint32(SB),NOSPLIT,$0-12
+TEXT ·XorUint32(SB),$0-12
   MOVD    ptr+0(FP), R0
   MOVW    val+8(FP), R1
 again:
@@ -46,7 +46,7 @@ again:
   CBNZ    R3, again
   RET
 
-TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-20
+TEXT ·CompareAndSwapUint32(SB),$0-20
   MOVD addr+0(FP), R0
   MOVW old+8(FP), R1
   MOVW new+12(FP), R2
@@ -60,7 +60,7 @@ done:
   MOVW R3, prev+16(FP)
   RET
 
-TEXT ·AndUint64(SB),NOSPLIT,$0-16
+TEXT ·AndUint64(SB),$0-16
   MOVD    ptr+0(FP), R0
   MOVD    val+8(FP), R1
 again:
@@ -70,7 +70,7 @@ again:
   CBNZ    R3, again
   RET
 
-TEXT ·OrUint64(SB),NOSPLIT,$0-16
+TEXT ·OrUint64(SB),$0-16
   MOVD    ptr+0(FP), R0
   MOVD    val+8(FP), R1
 again:
@@ -80,7 +80,7 @@ again:
   CBNZ    R3, again
   RET
 
-TEXT ·XorUint64(SB),NOSPLIT,$0-16
+TEXT ·XorUint64(SB),$0-16
   MOVD    ptr+0(FP), R0
   MOVD    val+8(FP), R1
 again:
@@ -90,7 +90,7 @@ again:
   CBNZ    R3, again
   RET
 
-TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-32
+TEXT ·CompareAndSwapUint64(SB),$0-32
   MOVD addr+0(FP), R0
   MOVD old+8(FP), R1
   MOVD new+16(FP), R2
diff --git a/pkg/atomicbitops/atomicbitops_noasm.go b/pkg/atomicbitops/atomicbitops_noasm.go
index 4e9c27b98..3b2898256 100644
--- a/pkg/atomicbitops/atomicbitops_noasm.go
+++ b/pkg/atomicbitops/atomicbitops_noasm.go
@@ -20,7 +20,6 @@ import (
 	"sync/atomic"
 )
 
-//go:nosplit
 func AndUint32(addr *uint32, val uint32) {
 	for {
 		o := atomic.LoadUint32(addr)
@@ -31,7 +30,6 @@ func AndUint32(addr *uint32, val uint32) {
 	}
 }
 
-//go:nosplit
 func OrUint32(addr *uint32, val uint32) {
 	for {
 		o := atomic.LoadUint32(addr)
@@ -42,7 +40,6 @@ func OrUint32(addr *uint32, val uint32) {
 	}
 }
 
-//go:nosplit
 func XorUint32(addr *uint32, val uint32) {
 	for {
 		o := atomic.LoadUint32(addr)
@@ -53,7 +50,6 @@ func XorUint32(addr *uint32, val uint32) {
 	}
 }
 
-//go:nosplit
 func CompareAndSwapUint32(addr *uint32, old, new uint32) (prev uint32) {
 	for {
 		prev = atomic.LoadUint32(addr)
@@ -66,7 +62,6 @@ func CompareAndSwapUint32(addr *uint32, old, new uint32) (prev uint32) {
 	}
 }
 
-//go:nosplit
 func AndUint64(addr *uint64, val uint64) {
 	for {
 		o := atomic.LoadUint64(addr)
@@ -77,7 +72,6 @@ func AndUint64(addr *uint64, val uint64) {
 	}
 }
 
-//go:nosplit
 func OrUint64(addr *uint64, val uint64) {
 	for {
 		o := atomic.LoadUint64(addr)
@@ -88,7 +82,6 @@ func OrUint64(addr *uint64, val uint64) {
 	}
 }
 
-//go:nosplit
 func XorUint64(addr *uint64, val uint64) {
 	for {
 		o := atomic.LoadUint64(addr)
@@ -99,7 +92,6 @@ func XorUint64(addr *uint64, val uint64) {
 	}
 }
 
-//go:nosplit
 func CompareAndSwapUint64(addr *uint64, old, new uint64) (prev uint64) {
 	for {
 		prev = atomic.LoadUint64(addr)
diff --git a/pkg/safecopy/safecopy.go b/pkg/safecopy/safecopy.go
index 521f1a82d..2fb7e5809 100644
--- a/pkg/safecopy/safecopy.go
+++ b/pkg/safecopy/safecopy.go
@@ -127,10 +127,10 @@ func initializeAddresses() {
 
 func init() {
 	initializeAddresses()
-	if err := ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(signalHandler).Pointer(), &savedSigSegVHandler, 0); err != nil {
+	if err := ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(signalHandler).Pointer(), &savedSigSegVHandler); err != nil {
 		panic(fmt.Sprintf("Unable to set handler for SIGSEGV: %v", err))
 	}
-	if err := ReplaceSignalHandler(syscall.SIGBUS, reflect.ValueOf(signalHandler).Pointer(), &savedSigBusHandler, 0); err != nil {
+	if err := ReplaceSignalHandler(syscall.SIGBUS, reflect.ValueOf(signalHandler).Pointer(), &savedSigBusHandler); err != nil {
 		panic(fmt.Sprintf("Unable to set handler for SIGBUS: %v", err))
 	}
 	syserror.AddErrorUnwrapper(func(e error) (syscall.Errno, bool) {
diff --git a/pkg/safecopy/safecopy_unsafe.go b/pkg/safecopy/safecopy_unsafe.go
index b15b920fe..41dd567f3 100644
--- a/pkg/safecopy/safecopy_unsafe.go
+++ b/pkg/safecopy/safecopy_unsafe.go
@@ -324,13 +324,11 @@ func errorFromFaultSignal(addr uintptr, sig int32) error {
 //
 // It stores the value of the previously set handler in previous.
 //
-// The extraMask parameter is OR'ed into the existing signal handler mask.
-//
 // This function will be called on initialization in order to install safecopy
 // handlers for appropriate signals. These handlers will call the previous
 // handler however, and if this is function is being used externally then the
 // same courtesy is expected.
-func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr, extraMask uint64) error {
+func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr) error {
 	var sa struct {
 		handler  uintptr
 		flags    uint64
@@ -350,10 +348,10 @@ func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr
 	if sa.handler == 0 {
 		return fmt.Errorf("previous handler for signal %x isn't set", sig)
 	}
+
 	*previous = sa.handler
 
 	// Install our own handler.
-	sa.mask |= extraMask
 	sa.handler = handler
 	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 {
 		return e
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index e27f57536..159f7eafd 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -70,7 +70,6 @@ go_test(
         "requires-kvm",
     ],
     deps = [
-        "//pkg/procid",
         "//pkg/sentry/arch",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/kvm/testutil",
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
index 555b5fa96..4b23f7803 100644
--- a/pkg/sentry/platform/kvm/bluepill.go
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -46,14 +46,6 @@ var (
 	// bounceSignalMask has only bounceSignal set.
 	bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1))
 
-	// otherSignalsMask includes all other signals that will be cause the
-	// vCPU to exit during execution.
-	//
-	// Currently, this includes the preemption signal and the profiling
-	// signal. In general, these should be signals whose delivery actually
-	// influences the way the program executes as the switch can be costly.
-	otherSignalsMask = uint64(1<<(uint64(syscall.SIGURG)-1)) | uint64(1<<(uint64(syscall.SIGPROF)-1))
-
 	// bounce is the interrupt vector used to return to the kernel.
 	bounce = uint32(ring0.VirtualizationException)
 
@@ -94,8 +86,8 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) {
 }
 
 func init() {
-	// Install the handler, masking all signals.
-	if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler, ^uint64(0)); err != nil {
+	// Install the handler.
+	if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil {
 		panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err))
 	}
 
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 4e9d80765..9add7c944 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -24,7 +24,6 @@ import (
 	"syscall"
 	"unsafe"
 
-	"gvisor.dev/gvisor/pkg/atomicbitops"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
@@ -59,19 +58,6 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
 	return &((*arch.UContext64)(context).MContext)
 }
 
-// injectInterrupt is a helper to inject an interrupt.
-//
-//go:nosplit
-func injectInterrupt(c *vCPU) {
-	if _, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(c.fd),
-		_KVM_INTERRUPT,
-		uintptr(unsafe.Pointer(&bounce))); errno != 0 {
-		throw("interrupt injection failed")
-	}
-}
-
 // bluepillHandler is called from the signal stub.
 //
 // The world may be stopped while this is executing, and it executes on the
@@ -83,9 +69,6 @@ func bluepillHandler(context unsafe.Pointer) {
 	// Sanitize the registers; interrupts must always be disabled.
 	c := bluepillArchEnter(bluepillArchContext(context))
 
-	// Enable preemption.
-	c.setSignalMask(true)
-
 	// Increment the number of switches.
 	atomic.AddUint32(&c.switches, 1)
 
@@ -106,9 +89,6 @@ func bluepillHandler(context unsafe.Pointer) {
 			// interrupted KVM. Since we're in a signal handler
 			// currently, all signals are masked and the signal
 			// must have been delivered directly to this thread.
-			//
-			// We will not be able to actually do subsequent
-			// KVM_RUNs until this signal is processed.
 			timeout := syscall.Timespec{}
 			sig, _, errno := syscall.RawSyscall6(
 				syscall.SYS_RT_SIGTIMEDWAIT,
@@ -118,24 +98,12 @@ func bluepillHandler(context unsafe.Pointer) {
 				8,                                 // sigset size.
 				0, 0)
 			if errno == syscall.EAGAIN {
-				// If weren't able to process this signal, then
-				// it must not have been in the bounceMask. By
-				// elimination, it must have been the
-				// preemption signal. We can't process this
-				// signal right now, so we need to disable
-				// preemption until the interrupt is actually
-				// handled.
-				c.setSignalMask(false)
-				// Note that there is a waiter for this vCPU.
-				// This will cause the vCPU to exit at some
-				// point in the future (releasing the user lock
-				// and guest mode).
-				atomicbitops.OrUint32(&c.state, vCPUWaiter)
-			} else if errno != 0 {
-				// We only expect success or a timeout.
+				continue
+			}
+			if errno != 0 {
 				throw("error waiting for pending signal")
-			} else if sig != uintptr(bounceSignal) {
-				// Only the bounce should be processed.
+			}
+			if sig != uintptr(bounceSignal) {
 				throw("unexpected signal")
 			}
 
@@ -146,10 +114,11 @@ func bluepillHandler(context unsafe.Pointer) {
 			// ready.
 			if c.runData.readyForInterruptInjection == 0 {
 				c.runData.requestInterruptWindow = 1
+				continue // Rerun vCPU.
 			} else {
-				injectInterrupt(c)
+				// Force injection below; the vCPU is ready.
+				c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
 			}
-			continue // Rerun vCPU.
 		case syscall.EFAULT:
 			// If a fault is not serviceable due to the host
 			// backing pages having page permissions, instead of an
@@ -168,30 +137,6 @@ func bluepillHandler(context unsafe.Pointer) {
 		}
 
 		switch c.runData.exitReason {
-		case _KVM_EXIT_HLT:
-			// Copy out registers.
-			bluepillArchExit(c, bluepillArchContext(context))
-
-			// Return to the vCPUReady state; notify any waiters.
-			user := atomic.LoadUint32(&c.state) & vCPUUser
-			switch atomic.SwapUint32(&c.state, user) {
-			case user | vCPUGuest: // Expected case.
-			case user | vCPUGuest | vCPUWaiter:
-				c.notify()
-			default:
-				throw("invalid state")
-			}
-			return
-		case _KVM_EXIT_IRQ_WINDOW_OPEN:
-			// Inject an interrupt now.
-			injectInterrupt(c)
-			// Clear previous injection request.
-			c.runData.requestInterruptWindow = 0
-		case _KVM_EXIT_INTR:
-			// This is fine, it is the normal exit reason during
-			// signal delivery. However, we still need to handle
-			// other potential exit reasons *combined* with EINTR,
-			// so this switch must be hit even after the above.
 		case _KVM_EXIT_EXCEPTION:
 			c.die(bluepillArchContext(context), "exception")
 			return
@@ -210,6 +155,20 @@ func bluepillHandler(context unsafe.Pointer) {
 		case _KVM_EXIT_DEBUG:
 			c.die(bluepillArchContext(context), "debug")
 			return
+		case _KVM_EXIT_HLT:
+			// Copy out registers.
+			bluepillArchExit(c, bluepillArchContext(context))
+
+			// Return to the vCPUReady state; notify any waiters.
+			user := atomic.LoadUint32(&c.state) & vCPUUser
+			switch atomic.SwapUint32(&c.state, user) {
+			case user | vCPUGuest: // Expected case.
+			case user | vCPUGuest | vCPUWaiter:
+				c.notify()
+			default:
+				throw("invalid state")
+			}
+			return
 		case _KVM_EXIT_MMIO:
 			// Increment the fault count.
 			atomic.AddUint32(&c.faults, 1)
@@ -241,6 +200,18 @@ func bluepillHandler(context unsafe.Pointer) {
 					data[i] = *b
 				}
 			}
+		case _KVM_EXIT_IRQ_WINDOW_OPEN:
+			// Interrupt: we must have requested an interrupt
+			// window; set the interrupt line.
+			if _, _, errno := syscall.RawSyscall(
+				syscall.SYS_IOCTL,
+				uintptr(c.fd),
+				_KVM_INTERRUPT,
+				uintptr(unsafe.Pointer(&bounce))); errno != 0 {
+				throw("interrupt injection failed")
+			}
+			// Clear previous injection request.
+			c.runData.requestInterruptWindow = 0
 		case _KVM_EXIT_SHUTDOWN:
 			c.die(bluepillArchContext(context), "shutdown")
 			return
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 07d9c9a98..1d5c77ff4 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -48,7 +48,6 @@ const (
 	_KVM_EXIT_IRQ_WINDOW_OPEN = 0x7
 	_KVM_EXIT_SHUTDOWN        = 0x8
 	_KVM_EXIT_FAIL_ENTRY      = 0x9
-	_KVM_EXIT_INTR            = 0xa
 	_KVM_EXIT_INTERNAL_ERROR  = 0x11
 	_KVM_EXIT_SYSTEM_EVENT    = 0x18
 )
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index d42ba3f24..c42752d50 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -16,15 +16,12 @@ package kvm
 
 import (
 	"math/rand"
-	"os"
 	"reflect"
-	"runtime"
 	"sync/atomic"
 	"syscall"
 	"testing"
 	"time"
 
-	"gvisor.dev/gvisor/pkg/procid"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
@@ -323,18 +320,15 @@ func TestBounce(t *testing.T) {
 	})
 }
 
-// randomSleep is used by some race tests below.
-//
-// O(hundreds of microseconds) is appropriate to ensure different overlaps and
-// different schedules.
-func randomSleep() {
-	if n := rand.Intn(1000); n > 100 {
-		time.Sleep(time.Duration(n) * time.Microsecond)
-	}
-}
-
 func TestBounceStress(t *testing.T) {
 	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+		randomSleep := func() {
+			// O(hundreds of microseconds) is appropriate to ensure
+			// different overlaps and different schedules.
+			if n := rand.Intn(1000); n > 100 {
+				time.Sleep(time.Duration(n) * time.Microsecond)
+			}
+		}
 		for i := 0; i < 1000; i++ {
 			// Start an asynchronously executing goroutine that
 			// calls Bounce at pseudo-random point in time.
@@ -361,50 +355,6 @@ func TestBounceStress(t *testing.T) {
 	})
 }
 
-func TestPreemption(t *testing.T) {
-	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
-		// Lock the main vCPU thread.
-		runtime.LockOSThread()
-		pid := os.Getpid()
-		tid := procid.Current()
-		running := uint32(1)
-		defer atomic.StoreUint32(&running, 0)
-
-		// Start generating "preemptions".
-		go func() {
-			for atomic.LoadUint32(&running) != 0 {
-				// Kick via a preemption: best effort.
-				syscall.Tgkill(pid, int(tid), syscall.SIGURG)
-				randomSleep()
-			}
-		}()
-
-		for i := 0; i < 1000; i++ {
-			randomSleep()
-			var si arch.SignalInfo
-			if _, err := c.SwitchToUser(ring0.SwitchOpts{
-				Registers:          regs,
-				FloatingPointState: dummyFPState,
-				PageTables:         pt,
-			}, &si); err != platform.ErrContextInterrupt {
-				t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt)
-			}
-			// Was this caused by a preemption signal?
-			if got := atomic.LoadUint32(&c.state); got&vCPUGuest != 0 && got&vCPUWaiter == 0 {
-				continue
-			}
-			c.unlock()
-			// Should have dropped from guest mode, processed preemption.
-			if got := atomic.LoadUint32(&c.state); got != vCPUReady {
-				t.Errorf("vCPU not in ready state: got %v", got)
-			}
-			randomSleep()
-			c.lock()
-		}
-		return false
-	})
-}
-
 func TestInvalidate(t *testing.T) {
 	var data uintptr // Used below.
 	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 345b71e8f..f1afc74dc 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -108,9 +108,6 @@ type vCPU struct {
 	// This is a bitmask of the three fields (vCPU*) described above.
 	state uint32
 
-	// signalMask is the vCPU signal mask.
-	signalMask uint64
-
 	// runData for this vCPU.
 	runData *runData
 
@@ -124,7 +121,6 @@ type vCPU struct {
 	// vCPUArchState is the architecture-specific state.
 	vCPUArchState
 
-	// dieState is the temporary state associated with throwing exceptions.
 	dieState dieState
 }
 
@@ -157,6 +153,11 @@ func (m *machine) newVCPU() *vCPU {
 	c.CPU.Init(&m.kernel, c)
 	m.vCPUsByID[c.id] = c
 
+	// Ensure the signal mask is correct.
+	if err := c.setSignalMask(); err != nil {
+		panic(fmt.Sprintf("error setting signal mask: %v", err))
+	}
+
 	// Map the run data.
 	runData, err := mapRunData(int(fd))
 	if err != nil {
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 52286e56d..7156c245f 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -111,6 +111,31 @@ func (c *vCPU) setSystemTime() error {
 	return nil
 }
 
+// setSignalMask sets the vCPU signal mask.
+//
+// This must be called prior to running the vCPU.
+func (c *vCPU) setSignalMask() error {
+	// The layout of this structure implies that it will not necessarily be
+	// the same layout chosen by the Go compiler. It gets fudged here.
+	var data struct {
+		length uint32
+		mask1  uint32
+		mask2  uint32
+		_      uint32
+	}
+	data.length = 8 // Fixed sigset size.
+	data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
+	data.mask2 = ^uint32(bounceSignalMask >> 32)
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_SIGNAL_MASK,
+		uintptr(unsafe.Pointer(&data))); errno != 0 {
+		return fmt.Errorf("error setting signal mask: %v", errno)
+	}
+	return nil
+}
+
 // setUserRegisters sets user registers in the vCPU.
 func (c *vCPU) setUserRegisters(uregs *userRegs) error {
 	if _, _, errno := syscall.RawSyscall(
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 185eeb4f0..b531f2f85 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -268,6 +268,32 @@ func (c *vCPU) setSystemTime() error {
 	return nil
 }
 
+// setSignalMask sets the vCPU signal mask.
+//
+// This must be called prior to running the vCPU.
+func (c *vCPU) setSignalMask() error {
+	// The layout of this structure implies that it will not necessarily be
+	// the same layout chosen by the Go compiler. It gets fudged here.
+	var data struct {
+		length uint32
+		mask1  uint32
+		mask2  uint32
+		_      uint32
+	}
+	data.length = 8 // Fixed sigset size.
+	data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
+	data.mask2 = ^uint32(bounceSignalMask >> 32)
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_SIGNAL_MASK,
+		uintptr(unsafe.Pointer(&data))); errno != 0 {
+		return fmt.Errorf("error setting signal mask: %v", errno)
+	}
+
+	return nil
+}
+
 // SwitchToUser unpacks architectural-details.
 func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) {
 	// Check for canonical addresses.
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index e4de0a889..f04be2ab5 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -87,47 +87,6 @@ func unmapRunData(r *runData) error {
 	return nil
 }
 
-// setSignalMask sets the vCPU signal mask.
-//
-// This will be called from the bluepill handler, and therefore must not
-// perform any allocation.
-//
-//go:nosplit
-func (c *vCPU) setSignalMask(enableOthers bool) {
-	// The signal mask is either:
-	// *) Only the bounce signal, which we need to use to execute the
-	//    machine state up until the bounce interrupt can be processed.
-	//    or
-	// *) All signals, which is the default state unless we need to
-	//    continue execution to exit guest mode (the case above).
-	mask := bounceSignalMask
-	if enableOthers {
-		mask |= otherSignalsMask
-	}
-	if c.signalMask == mask {
-		return // Already set.
-	}
-
-	// The layout of this structure implies that it will not necessarily be
-	// the same layout chosen by the Go compiler. It gets fudged here.
-	var data struct {
-		length uint32
-		mask1  uint32
-		mask2  uint32
-		_      uint32
-	}
-	data.length = 8 // Fixed sigset size.
-	data.mask1 = ^uint32(mask & 0xffffffff)
-	data.mask2 = ^uint32(mask >> 32)
-	if _, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(c.fd),
-		_KVM_SET_SIGNAL_MASK,
-		uintptr(unsafe.Pointer(&data))); errno != 0 {
-		throw("setSignal mask failed")
-	}
-}
-
 // atomicAddressSpace is an atomic address space pointer.
 type atomicAddressSpace struct {
 	pointer unsafe.Pointer
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 6c15727fa..8de75ae57 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -444,6 +444,12 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		nextFD++
 	}
 
+	// TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff
+	// isn't set.
+	if conf.Platform == "kvm" {
+		cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1")
+	}
+
 	// The current process' stdio must be passed to the application via the
 	// --stdio-fds flag. The stdio of the sandbox process itself must not
 	// be connected to the same FDs, otherwise we risk leaking sandbox
-- 
cgit v1.2.3


From 38f4501c995d7d915bcd168d58655e67e2b34566 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 1 Apr 2020 11:25:43 -0700
Subject: Add context.Context argument to XxxWithErrno functions

This allows control over the gRPC timeouts as needed.

PiperOrigin-RevId: 304225713
---
 test/packetimpact/testbench/dut.go | 81 ++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 34 deletions(-)

diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go
index f80dbb35f..d102dc7bb 100644
--- a/test/packetimpact/testbench/dut.go
+++ b/test/packetimpact/testbench/dut.go
@@ -143,13 +143,12 @@ func (dut *DUT) protoToSockaddr(sa *pb.Sockaddr) unix.Sockaddr {
 }
 
 // BindWithErrno calls bind on the DUT.
-func (dut *DUT) BindWithErrno(fd int32, sa unix.Sockaddr) (int32, error) {
+func (dut *DUT) BindWithErrno(ctx context.Context, fd int32, sa unix.Sockaddr) (int32, error) {
 	dut.t.Helper()
 	req := pb.BindRequest{
 		Sockfd: fd,
 		Addr:   dut.sockaddrToProto(sa),
 	}
-	ctx := context.Background()
 	resp, err := dut.posixServer.Bind(ctx, &req)
 	if err != nil {
 		dut.t.Fatalf("failed to call Bind: %s", err)
@@ -158,22 +157,24 @@ func (dut *DUT) BindWithErrno(fd int32, sa unix.Sockaddr) (int32, error) {
 }
 
 // Bind calls bind on the DUT and causes a fatal test failure if it doesn't
-// succeed.
+// succeed. If more control over the timeout or error handling is
+// needed, use BindWithErrno.
 func (dut *DUT) Bind(fd int32, sa unix.Sockaddr) {
 	dut.t.Helper()
-	ret, err := dut.BindWithErrno(fd, sa)
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	ret, err := dut.BindWithErrno(ctx, fd, sa)
 	if ret != 0 {
 		dut.t.Fatalf("failed to bind socket: %s", err)
 	}
 }
 
 // GetSockNameWithErrno calls getsockname on the DUT.
-func (dut *DUT) GetSockNameWithErrno(sockfd int32) (int32, unix.Sockaddr, error) {
+func (dut *DUT) GetSockNameWithErrno(ctx context.Context, sockfd int32) (int32, unix.Sockaddr, error) {
 	dut.t.Helper()
 	req := pb.GetSockNameRequest{
 		Sockfd: sockfd,
 	}
-	ctx := context.Background()
 	resp, err := dut.posixServer.GetSockName(ctx, &req)
 	if err != nil {
 		dut.t.Fatalf("failed to call Bind: %s", err)
@@ -182,10 +183,13 @@ func (dut *DUT) GetSockNameWithErrno(sockfd int32) (int32, unix.Sockaddr, error)
 }
 
 // GetSockName calls getsockname on the DUT and causes a fatal test failure if
-// it doens't succeed.
+// it doesn't succeed. If more control over the timeout or error handling is
+// needed, use GetSockNameWithErrno.
 func (dut *DUT) GetSockName(sockfd int32) unix.Sockaddr {
 	dut.t.Helper()
-	ret, sa, err := dut.GetSockNameWithErrno(sockfd)
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	ret, sa, err := dut.GetSockNameWithErrno(ctx, sockfd)
 	if ret != 0 {
 		dut.t.Fatalf("failed to getsockname: %s", err)
 	}
@@ -193,14 +197,12 @@ func (dut *DUT) GetSockName(sockfd int32) unix.Sockaddr {
 }
 
 // ListenWithErrno calls listen on the DUT.
-func (dut *DUT) ListenWithErrno(sockfd, backlog int32) (int32, error) {
+func (dut *DUT) ListenWithErrno(ctx context.Context, sockfd, backlog int32) (int32, error) {
 	dut.t.Helper()
 	req := pb.ListenRequest{
 		Sockfd:  sockfd,
 		Backlog: backlog,
 	}
-	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
-	defer cancel()
 	resp, err := dut.posixServer.Listen(ctx, &req)
 	if err != nil {
 		dut.t.Fatalf("failed to call Listen: %s", err)
@@ -209,23 +211,24 @@ func (dut *DUT) ListenWithErrno(sockfd, backlog int32) (int32, error) {
 }
 
 // Listen calls listen on the DUT and causes a fatal test failure if it doesn't
-// succeed.
+// succeed. If more control over the timeout or error handling is needed, use
+// ListenWithErrno.
 func (dut *DUT) Listen(sockfd, backlog int32) {
 	dut.t.Helper()
-	ret, err := dut.ListenWithErrno(sockfd, backlog)
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	ret, err := dut.ListenWithErrno(ctx, sockfd, backlog)
 	if ret != 0 {
 		dut.t.Fatalf("failed to listen: %s", err)
 	}
 }
 
 // AcceptWithErrno calls accept on the DUT.
-func (dut *DUT) AcceptWithErrno(sockfd int32) (int32, unix.Sockaddr, error) {
+func (dut *DUT) AcceptWithErrno(ctx context.Context, sockfd int32) (int32, unix.Sockaddr, error) {
 	dut.t.Helper()
 	req := pb.AcceptRequest{
 		Sockfd: sockfd,
 	}
-	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
-	defer cancel()
 	resp, err := dut.posixServer.Accept(ctx, &req)
 	if err != nil {
 		dut.t.Fatalf("failed to call Accept: %s", err)
@@ -234,18 +237,23 @@ func (dut *DUT) AcceptWithErrno(sockfd int32) (int32, unix.Sockaddr, error) {
 }
 
 // Accept calls accept on the DUT and causes a fatal test failure if it doesn't
-// succeed.
+// succeed. If more control over the timeout or error handling is needed, use
+// AcceptWithErrno.
 func (dut *DUT) Accept(sockfd int32) (int32, unix.Sockaddr) {
 	dut.t.Helper()
-	fd, sa, err := dut.AcceptWithErrno(sockfd)
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	fd, sa, err := dut.AcceptWithErrno(ctx, sockfd)
 	if fd < 0 {
 		dut.t.Fatalf("failed to accept: %s", err)
 	}
 	return fd, sa
 }
 
-// SetSockOptWithErrno calls setsockopt on the DUT.
-func (dut *DUT) SetSockOptWithErrno(sockfd, level, optname int32, optval []byte) (int32, error) {
+// SetSockOptWithErrno calls setsockopt on the DUT. Because endianess and the
+// width of values might differ between the testbench and DUT architectures,
+// prefer to use a more specific SetSockOptXxxWithErrno function.
+func (dut *DUT) SetSockOptWithErrno(ctx context.Context, sockfd, level, optname int32, optval []byte) (int32, error) {
 	dut.t.Helper()
 	req := pb.SetSockOptRequest{
 		Sockfd:  sockfd,
@@ -253,8 +261,6 @@ func (dut *DUT) SetSockOptWithErrno(sockfd, level, optname int32, optval []byte)
 		Optname: optname,
 		Optval:  optval,
 	}
-	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
-	defer cancel()
 	resp, err := dut.posixServer.SetSockOpt(ctx, &req)
 	if err != nil {
 		dut.t.Fatalf("failed to call SetSockOpt: %s", err)
@@ -263,10 +269,15 @@ func (dut *DUT) SetSockOptWithErrno(sockfd, level, optname int32, optval []byte)
 }
 
 // SetSockOpt calls setsockopt on the DUT and causes a fatal test failure if it
-// doesn't succeed.
+// doesn't succeed. If more control over the timeout or error handling is
+// needed, use SetSockOptWithErrno. Because endianess and the width of values
+// might differ between the testbench and DUT architectures, prefer to use a
+// more specific SetSockOptXxx function.
 func (dut *DUT) SetSockOpt(sockfd, level, optname int32, optval []byte) {
 	dut.t.Helper()
-	ret, err := dut.SetSockOptWithErrno(sockfd, level, optname, optval)
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	ret, err := dut.SetSockOptWithErrno(ctx, sockfd, level, optname, optval)
 	if ret != 0 {
 		dut.t.Fatalf("failed to SetSockOpt: %s", err)
 	}
@@ -274,7 +285,7 @@ func (dut *DUT) SetSockOpt(sockfd, level, optname int32, optval []byte) {
 
 // SetSockOptTimevalWithErrno calls setsockopt with the timeval converted to
 // bytes.
-func (dut *DUT) SetSockOptTimevalWithErrno(sockfd, level, optname int32, tv *unix.Timeval) (int32, error) {
+func (dut *DUT) SetSockOptTimevalWithErrno(ctx context.Context, sockfd, level, optname int32, tv *unix.Timeval) (int32, error) {
 	dut.t.Helper()
 	timeval := pb.Timeval{
 		Seconds:      int64(tv.Sec),
@@ -286,8 +297,6 @@ func (dut *DUT) SetSockOptTimevalWithErrno(sockfd, level, optname int32, tv *uni
 		Optname: optname,
 		Timeval: &timeval,
 	}
-	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
-	defer cancel()
 	resp, err := dut.posixServer.SetSockOptTimeval(ctx, &req)
 	if err != nil {
 		dut.t.Fatalf("failed to call SetSockOptTimeval: %s", err)
@@ -296,10 +305,13 @@ func (dut *DUT) SetSockOptTimevalWithErrno(sockfd, level, optname int32, tv *uni
 }
 
 // SetSockOptTimeval calls setsockopt on the DUT and causes a fatal test failure
-// if it doesn't succeed.
+// if it doesn't succeed. If more control over the timeout or error handling is
+// needed, use SetSockOptTimevalWithErrno.
 func (dut *DUT) SetSockOptTimeval(sockfd, level, optname int32, tv *unix.Timeval) {
 	dut.t.Helper()
-	ret, err := dut.SetSockOptTimevalWithErrno(sockfd, level, optname, tv)
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	ret, err := dut.SetSockOptTimevalWithErrno(ctx, sockfd, level, optname, tv)
 	if ret != 0 {
 		dut.t.Fatalf("failed to SetSockOptTimeval: %s", err)
 	}
@@ -335,13 +347,11 @@ func (dut *DUT) Recv(sockfd, len, flags int32) []byte {
 }
 
 // CloseWithErrno calls close on the DUT.
-func (dut *DUT) CloseWithErrno(fd int32) (int32, error) {
+func (dut *DUT) CloseWithErrno(ctx context.Context, fd int32) (int32, error) {
 	dut.t.Helper()
 	req := pb.CloseRequest{
 		Fd: fd,
 	}
-	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
-	defer cancel()
 	resp, err := dut.posixServer.Close(ctx, &req)
 	if err != nil {
 		dut.t.Fatalf("failed to call Close: %s", err)
@@ -350,10 +360,13 @@ func (dut *DUT) CloseWithErrno(fd int32) (int32, error) {
 }
 
 // Close calls close on the DUT and causes a fatal test failure if it doesn't
-// succeed.
+// succeed. If more control over the timeout or error handling is needed, use
+// CloseWithErrno.
 func (dut *DUT) Close(fd int32) {
 	dut.t.Helper()
-	ret, err := dut.CloseWithErrno(fd)
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	ret, err := dut.CloseWithErrno(ctx, fd)
 	if ret != 0 {
 		dut.t.Fatalf("failed to close: %s", err)
 	}
-- 
cgit v1.2.3


From 0d1e299079392043fae24c2be524a0eefe7d8085 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Wed, 1 Apr 2020 12:05:17 -0700
Subject: Pass configurable FilesystemType to tmpfs.

PiperOrigin-RevId: 304234086
---
 pkg/sentry/fsimpl/tmpfs/tmpfs.go | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index afd9f8533..8bc8818c0 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -77,6 +77,11 @@ type FilesystemOpts struct {
 	// RootSymlinkTarget is the target of the root symlink. Only valid if
 	// RootFileType == S_IFLNK.
 	RootSymlinkTarget string
+
+	// FilesystemType allows setting a different FilesystemType for this
+	// tmpfs filesystem. This allows tmpfs to "impersonate" other
+	// filesystems, like ramdiskfs and cgroupfs.
+	FilesystemType vfs.FilesystemType
 }
 
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
@@ -91,15 +96,22 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		clock:   clock,
 	}
 
-	fs.vfsfs.Init(vfsObj, &fstype, &fs)
-
-	typ := uint16(linux.S_IFDIR)
+	rootFileType := uint16(linux.S_IFDIR)
+	newFSType := vfs.FilesystemType(&fstype)
 	tmpfsOpts, ok := opts.InternalData.(FilesystemOpts)
-	if ok && tmpfsOpts.RootFileType != 0 {
-		typ = tmpfsOpts.RootFileType
+	if ok {
+		if tmpfsOpts.RootFileType != 0 {
+			rootFileType = tmpfsOpts.RootFileType
+		}
+		if tmpfsOpts.FilesystemType != nil {
+			newFSType = tmpfsOpts.FilesystemType
+		}
 	}
+
+	fs.vfsfs.Init(vfsObj, newFSType, &fs)
+
 	var root *inode
-	switch typ {
+	switch rootFileType {
 	case linux.S_IFREG:
 		root = fs.newRegularFile(creds, 0777)
 	case linux.S_IFLNK:
@@ -107,7 +119,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	case linux.S_IFDIR:
 		root = fs.newDirectory(creds, 01777)
 	default:
-		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", typ)
+		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
 	}
 	return &fs.vfsfs, &fs.newDentry(root).vfsd, nil
 }
-- 
cgit v1.2.3


From aecd3a25a99bc04fe2c032ea3422f10b2dba3256 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 1 Apr 2020 16:40:08 -0700
Subject: Deflake tcpip/stack:stack_x_test

Timeouts were increased to deflake pkg/tcpip/stack:stack_x_test tests
that depend on timers. Some timeouts used previously were intended for
tests that do not depend on timers, so this change updates those
timeouts to give more time for a timer-based event to occur. This
change also de-parallelizes non-subtests to reduce the number of active
timers.

Test: bazel test //pkg/tcpip/stack:stack_x_test --runs_per_test=500
PiperOrigin-RevId: 304287622
---
 pkg/tcpip/stack/ndp_test.go   | 126 +++++++++++++++---------------------------
 pkg/tcpip/stack/stack_test.go |   2 -
 2 files changed, 44 insertions(+), 84 deletions(-)

diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 06edd05b6..598468bdd 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -406,8 +406,7 @@ func TestDADResolve(t *testing.T) {
 				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
 			}
 
-			// Address should not be considered bound to the NIC yet
-			// (DAD ongoing).
+			// Address should not be considered bound to the NIC yet (DAD ongoing).
 			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 			if err != nil {
 				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
@@ -416,10 +415,9 @@ func TestDADResolve(t *testing.T) {
 				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 			}
 
-			// Wait for the remaining time - some delta (500ms), to
-			// make sure the address is still not resolved.
-			const delta = 500 * time.Millisecond
-			time.Sleep(test.expectedRetransmitTimer*time.Duration(test.dupAddrDetectTransmits) - delta)
+			// Make sure the address does not resolve before the resolution time has
+			// passed.
+			time.Sleep(test.expectedRetransmitTimer*time.Duration(test.dupAddrDetectTransmits) - defaultAsyncEventTimeout)
 			addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 			if err != nil {
 				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
@@ -430,13 +428,7 @@ func TestDADResolve(t *testing.T) {
 
 			// Wait for DAD to resolve.
 			select {
-			case <-time.After(2 * delta):
-				// We should get a resolution event after 500ms
-				// (delta) since we wait for 500ms less than the
-				// expected resolution time above to make sure
-				// that the address did not yet resolve. Waiting
-				// for 1s (2x delta) without a resolution event
-				// means something is wrong.
+			case <-time.After(2 * defaultAsyncEventTimeout):
 				t.Fatal("timed out waiting for DAD resolution")
 			case e := <-ndpDisp.dadC:
 				if diff := checkDADEvent(e, nicID, addr1, true, nil); diff != "" {
@@ -1034,8 +1026,6 @@ func TestNoRouterDiscovery(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverDefaultRouters(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
-			t.Parallel()
-
 			ndpDisp := ndpDispatcher{
 				routerC: make(chan ndpRouterEvent, 1),
 			}
@@ -1074,8 +1064,6 @@ func checkRouterEvent(e ndpRouterEvent, addr tcpip.Address, discovered bool) str
 // TestRouterDiscoveryDispatcherNoRemember tests that the stack does not
 // remember a discovered router when the dispatcher asks it not to.
 func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
-	t.Parallel()
-
 	ndpDisp := ndpDispatcher{
 		routerC: make(chan ndpRouterEvent, 1),
 	}
@@ -1116,8 +1104,6 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
 }
 
 func TestRouterDiscovery(t *testing.T) {
-	t.Parallel()
-
 	ndpDisp := ndpDispatcher{
 		routerC:        make(chan ndpRouterEvent, 1),
 		rememberRouter: true,
@@ -1219,8 +1205,6 @@ func TestRouterDiscovery(t *testing.T) {
 // TestRouterDiscoveryMaxRouters tests that only
 // stack.MaxDiscoveredDefaultRouters discovered routers are remembered.
 func TestRouterDiscoveryMaxRouters(t *testing.T) {
-	t.Parallel()
-
 	ndpDisp := ndpDispatcher{
 		routerC:        make(chan ndpRouterEvent, 1),
 		rememberRouter: true,
@@ -1287,8 +1271,6 @@ func TestNoPrefixDiscovery(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverOnLinkPrefixes(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
-			t.Parallel()
-
 			ndpDisp := ndpDispatcher{
 				prefixC: make(chan ndpPrefixEvent, 1),
 			}
@@ -1328,8 +1310,6 @@ func checkPrefixEvent(e ndpPrefixEvent, prefix tcpip.Subnet, discovered bool) st
 // TestPrefixDiscoveryDispatcherNoRemember tests that the stack does not
 // remember a discovered on-link prefix when the dispatcher asks it not to.
 func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
-	t.Parallel()
-
 	prefix, subnet, _ := prefixSubnetAddr(0, "")
 
 	ndpDisp := ndpDispatcher{
@@ -1373,8 +1353,6 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
 }
 
 func TestPrefixDiscovery(t *testing.T) {
-	t.Parallel()
-
 	prefix1, subnet1, _ := prefixSubnetAddr(0, "")
 	prefix2, subnet2, _ := prefixSubnetAddr(1, "")
 	prefix3, subnet3, _ := prefixSubnetAddr(2, "")
@@ -1563,8 +1541,6 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 // TestPrefixDiscoveryMaxRouters tests that only
 // stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are remembered.
 func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
-	t.Parallel()
-
 	ndpDisp := ndpDispatcher{
 		prefixC:        make(chan ndpPrefixEvent, stack.MaxDiscoveredOnLinkPrefixes+3),
 		rememberPrefix: true,
@@ -1659,8 +1635,6 @@ func TestNoAutoGenAddr(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), AutoGenAddr(%t), Forwarding(%t)", handle, autogen, forwarding), func(t *testing.T) {
-			t.Parallel()
-
 			ndpDisp := ndpDispatcher{
 				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
 			}
@@ -2410,8 +2384,6 @@ func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
 		},
 	}
 
-	const delta = 500 * time.Millisecond
-
 	// This Run will not return until the parallel tests finish.
 	//
 	// We need this because we need to do some teardown work after the
@@ -2464,24 +2436,21 @@ func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
 				// to test.evl.
 				//
 
-				// Make sure we do not get any invalidation
-				// events until atleast 500ms (delta) before
-				// test.evl.
+				// The address should not be invalidated until the effective valid
+				// lifetime has passed.
 				select {
 				case <-ndpDisp.autoGenAddrC:
 					t.Fatal("unexpectedly received an auto gen addr event")
-				case <-time.After(time.Duration(test.evl)*time.Second - delta):
+				case <-time.After(time.Duration(test.evl)*time.Second - defaultAsyncEventTimeout):
 				}
 
-				// Wait for another second (2x delta), but now
-				// we expect the invalidation event.
+				// Wait for the invalidation event.
 				select {
 				case e := <-ndpDisp.autoGenAddrC:
 					if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" {
 						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
 					}
-
-				case <-time.After(2 * delta):
+				case <-time.After(2 * defaultAsyncEventTimeout):
 					t.Fatal("timeout waiting for addr auto gen event")
 				}
 			})
@@ -2493,8 +2462,6 @@ func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
 // by the user, its resources will be cleaned up and an invalidation event will
 // be sent to the integrator.
 func TestAutoGenAddrRemoval(t *testing.T) {
-	t.Parallel()
-
 	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
 
 	ndpDisp := ndpDispatcher{
@@ -2551,8 +2518,6 @@ func TestAutoGenAddrRemoval(t *testing.T) {
 // TestAutoGenAddrAfterRemoval tests adding a SLAAC address that was previously
 // assigned to the NIC but is in the permanentExpired state.
 func TestAutoGenAddrAfterRemoval(t *testing.T) {
-	t.Parallel()
-
 	const nicID = 1
 
 	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
@@ -2664,8 +2629,6 @@ func TestAutoGenAddrAfterRemoval(t *testing.T) {
 // TestAutoGenAddrStaticConflict tests that if SLAAC generates an address that
 // is already assigned to the NIC, the static address remains.
 func TestAutoGenAddrStaticConflict(t *testing.T) {
-	t.Parallel()
-
 	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
 
 	ndpDisp := ndpDispatcher{
@@ -2721,8 +2684,6 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 // TestAutoGenAddrWithOpaqueIID tests that SLAAC generated addresses will use
 // opaque interface identifiers when configured to do so.
 func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
-	t.Parallel()
-
 	const nicID = 1
 	const nicName = "nic1"
 	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes]byte
@@ -2826,8 +2787,6 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
 // to the integrator when an RA is received with the NDP Recursive DNS Server
 // option with at least one valid address.
 func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
-	t.Parallel()
-
 	tests := []struct {
 		name     string
 		opt      header.NDPRecursiveDNSServer
@@ -2919,11 +2878,7 @@ func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
 	}
 
 	for _, test := range tests {
-		test := test
-
 		t.Run(test.name, func(t *testing.T) {
-			t.Parallel()
-
 			ndpDisp := ndpDispatcher{
 				// We do not expect more than a single RDNSS
 				// event at any time for this test.
@@ -2973,8 +2928,6 @@ func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
 // TestCleanupNDPState tests that all discovered routers and prefixes, and
 // auto-generated addresses are invalidated when a NIC becomes a router.
 func TestCleanupNDPState(t *testing.T) {
-	t.Parallel()
-
 	const (
 		lifetimeSeconds          = 5
 		maxRouterAndPrefixEvents = 4
@@ -3417,8 +3370,6 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
 // TestRouterSolicitation tests the initial Router Solicitations that are sent
 // when a NIC newly becomes enabled.
 func TestRouterSolicitation(t *testing.T) {
-	t.Parallel()
-
 	const nicID = 1
 
 	tests := []struct {
@@ -3435,13 +3386,22 @@ func TestRouterSolicitation(t *testing.T) {
 		effectiveMaxRtrSolicitDelay time.Duration
 	}{
 		{
-			name:                        "Single RS with delay",
+			name:                        "Single RS with 2s delay and interval",
 			expectedSrcAddr:             header.IPv6Any,
 			maxRtrSolicit:               1,
-			rtrSolicitInt:               time.Second,
-			effectiveRtrSolicitInt:      time.Second,
-			maxRtrSolicitDelay:          time.Second,
-			effectiveMaxRtrSolicitDelay: time.Second,
+			rtrSolicitInt:               2 * time.Second,
+			effectiveRtrSolicitInt:      2 * time.Second,
+			maxRtrSolicitDelay:          2 * time.Second,
+			effectiveMaxRtrSolicitDelay: 2 * time.Second,
+		},
+		{
+			name:                        "Single RS with 4s delay and interval",
+			expectedSrcAddr:             header.IPv6Any,
+			maxRtrSolicit:               1,
+			rtrSolicitInt:               4 * time.Second,
+			effectiveRtrSolicitInt:      4 * time.Second,
+			maxRtrSolicitDelay:          4 * time.Second,
+			effectiveMaxRtrSolicitDelay: 4 * time.Second,
 		},
 		{
 			name:                        "Two RS with delay",
@@ -3449,8 +3409,8 @@ func TestRouterSolicitation(t *testing.T) {
 			nicAddr:                     llAddr1,
 			expectedSrcAddr:             llAddr1,
 			maxRtrSolicit:               2,
-			rtrSolicitInt:               time.Second,
-			effectiveRtrSolicitInt:      time.Second,
+			rtrSolicitInt:               2 * time.Second,
+			effectiveRtrSolicitInt:      2 * time.Second,
 			maxRtrSolicitDelay:          500 * time.Millisecond,
 			effectiveMaxRtrSolicitDelay: 500 * time.Millisecond,
 		},
@@ -3464,8 +3424,8 @@ func TestRouterSolicitation(t *testing.T) {
 				header.NDPSourceLinkLayerAddressOption(linkAddr1),
 			},
 			maxRtrSolicit:               1,
-			rtrSolicitInt:               time.Second,
-			effectiveRtrSolicitInt:      time.Second,
+			rtrSolicitInt:               2 * time.Second,
+			effectiveRtrSolicitInt:      2 * time.Second,
 			maxRtrSolicitDelay:          0,
 			effectiveMaxRtrSolicitDelay: 0,
 		},
@@ -3515,6 +3475,7 @@ func TestRouterSolicitation(t *testing.T) {
 
 			t.Run(test.name, func(t *testing.T) {
 				t.Parallel()
+
 				e := channelLinkWithHeaderLength{
 					Endpoint:     channel.New(int(test.maxRtrSolicit), 1280, test.linkAddr),
 					headerLength: test.linkHeaderLen,
@@ -3583,15 +3544,19 @@ func TestRouterSolicitation(t *testing.T) {
 				}
 
 				for ; remaining > 0; remaining-- {
-					waitForNothing(test.effectiveRtrSolicitInt - defaultTimeout)
-					waitForPkt(defaultAsyncEventTimeout)
+					if test.effectiveRtrSolicitInt > defaultAsyncEventTimeout {
+						waitForNothing(test.effectiveRtrSolicitInt - defaultAsyncEventTimeout)
+						waitForPkt(2 * defaultAsyncEventTimeout)
+					} else {
+						waitForPkt(test.effectiveRtrSolicitInt * defaultAsyncEventTimeout)
+					}
 				}
 
 				// Make sure no more RS.
 				if test.effectiveRtrSolicitInt > test.effectiveMaxRtrSolicitDelay {
-					waitForNothing(test.effectiveRtrSolicitInt + defaultTimeout)
+					waitForNothing(test.effectiveRtrSolicitInt + defaultAsyncEventTimeout)
 				} else {
-					waitForNothing(test.effectiveMaxRtrSolicitDelay + defaultTimeout)
+					waitForNothing(test.effectiveMaxRtrSolicitDelay + defaultAsyncEventTimeout)
 				}
 
 				// Make sure the counter got properly
@@ -3605,11 +3570,9 @@ func TestRouterSolicitation(t *testing.T) {
 }
 
 func TestStopStartSolicitingRouters(t *testing.T) {
-	t.Parallel()
-
 	const nicID = 1
+	const delay = 0
 	const interval = 500 * time.Millisecond
-	const delay = time.Second
 	const maxRtrSolicitations = 3
 
 	tests := []struct {
@@ -3684,7 +3647,6 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 				p, ok := e.ReadContext(ctx)
 				if !ok {
 					t.Fatal("timed out waiting for packet")
-					return
 				}
 
 				if p.Proto != header.IPv6ProtocolNumber {
@@ -3710,11 +3672,11 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 
 			// Stop soliciting routers.
 			test.stopFn(t, s, true /* first */)
-			ctx, cancel := context.WithTimeout(context.Background(), delay+defaultTimeout)
+			ctx, cancel := context.WithTimeout(context.Background(), delay+defaultAsyncEventTimeout)
 			defer cancel()
 			if _, ok := e.ReadContext(ctx); ok {
-				// A single RS may have been sent before forwarding was enabled.
-				ctx, cancel := context.WithTimeout(context.Background(), interval+defaultTimeout)
+				// A single RS may have been sent before solicitations were stopped.
+				ctx, cancel := context.WithTimeout(context.Background(), interval+defaultAsyncEventTimeout)
 				defer cancel()
 				if _, ok = e.ReadContext(ctx); ok {
 					t.Fatal("should not have sent more than one RS message")
@@ -3724,7 +3686,7 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 			// Stopping router solicitations after it has already been stopped should
 			// do nothing.
 			test.stopFn(t, s, false /* first */)
-			ctx, cancel = context.WithTimeout(context.Background(), delay+defaultTimeout)
+			ctx, cancel = context.WithTimeout(context.Background(), delay+defaultAsyncEventTimeout)
 			defer cancel()
 			if _, ok := e.ReadContext(ctx); ok {
 				t.Fatal("unexpectedly got a packet after router solicitation has been stopepd")
@@ -3740,7 +3702,7 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 			waitForPkt(delay + defaultAsyncEventTimeout)
 			waitForPkt(interval + defaultAsyncEventTimeout)
 			waitForPkt(interval + defaultAsyncEventTimeout)
-			ctx, cancel = context.WithTimeout(context.Background(), interval+defaultTimeout)
+			ctx, cancel = context.WithTimeout(context.Background(), interval+defaultAsyncEventTimeout)
 			defer cancel()
 			if _, ok := e.ReadContext(ctx); ok {
 				t.Fatal("unexpectedly got an extra packet after sending out the expected RSs")
@@ -3749,7 +3711,7 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 			// Starting router solicitations after it has already completed should do
 			// nothing.
 			test.startFn(t, s)
-			ctx, cancel = context.WithTimeout(context.Background(), delay+defaultTimeout)
+			ctx, cancel = context.WithTimeout(context.Background(), delay+defaultAsyncEventTimeout)
 			defer cancel()
 			if _, ok := e.ReadContext(ctx); ok {
 				t.Fatal("unexpectedly got a packet after finishing router solicitations")
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 555fcd92f..b8543b71e 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -3176,8 +3176,6 @@ func TestJoinLeaveAllNodesMulticastOnNICEnableDisable(t *testing.T) {
 // TestDoDADWhenNICEnabled tests that IPv6 endpoints that were added while a NIC
 // was disabled have DAD performed on them when the NIC is enabled.
 func TestDoDADWhenNICEnabled(t *testing.T) {
-	t.Parallel()
-
 	const dadTransmits = 1
 	const retransmitTimer = time.Second
 	const nicID = 1
-- 
cgit v1.2.3


From 1561ae3037e5a3efdd26320f229fc4c602258dba Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Wed, 1 Apr 2020 16:50:16 -0700
Subject: go-marshal: Allow array lens to be consts and simple expressions.

Previously, go-marshal only allowed literals for array
lengths. However, it's very common for ABI structs to have a fix-sized
array whose length is defined by a constant; for example PATH_MAX.
Having to convert all such arrays to have literal lengths is too
awkward.

PiperOrigin-RevId: 304289345
---
 tools/go_marshal/gomarshal/generator.go            |  2 +-
 tools/go_marshal/gomarshal/generator_interfaces.go | 50 ++++++++++++++++++++++
 .../generator_interfaces_array_newtype.go          | 20 +++------
 .../gomarshal/generator_interfaces_struct.go       | 41 +++++++++---------
 tools/go_marshal/gomarshal/util.go                 | 24 +----------
 tools/go_marshal/test/test.go                      | 28 ++++++++++++
 6 files changed, 108 insertions(+), 57 deletions(-)

diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 43e668b63..177013dbb 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -356,7 +356,7 @@ func (g *Generator) generateOne(t marshallableType, fset *token.FileSet) *interf
 	case *ast.ArrayType:
 		i.validateArrayNewtype(t.spec.Name, ty)
 		// After validate, we can safely call arrayLen.
-		i.emitMarshallableForArrayNewtype(t.spec.Name, ty.Elt.(*ast.Ident), arrayLen(ty))
+		i.emitMarshallableForArrayNewtype(t.spec.Name, ty, ty.Elt.(*ast.Ident))
 		if t.slice != nil {
 			abortAt(fset.Position(t.slice.comment.Slash), fmt.Sprintf("Array type marked as '+marshal slice:...', but this is not supported. Perhaps fold one of the dimensions?"))
 		}
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
index 8f1c27145..e3c3dac63 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -15,8 +15,10 @@
 package gomarshal
 
 import (
+	"fmt"
 	"go/ast"
 	"go/token"
+	"strings"
 )
 
 // interfaceGenerator generates marshalling interfaces for a single type.
@@ -224,3 +226,51 @@ func (g *interfaceGenerator) emitKeepAlive(ptrVar string) {
 	g.emit("// must live until the use above.\n")
 	g.emit("runtime.KeepAlive(%s)\n", ptrVar)
 }
+
+func (g *interfaceGenerator) expandBinaryExpr(b *strings.Builder, e *ast.BinaryExpr) {
+	switch x := e.X.(type) {
+	case *ast.BinaryExpr:
+		// Recursively expand sub-expression.
+		g.expandBinaryExpr(b, x)
+	case *ast.Ident:
+		fmt.Fprintf(b, "%s", x.Name)
+	case *ast.BasicLit:
+		fmt.Fprintf(b, "%s", x.Value)
+	default:
+		g.abortAt(e.Pos(), "Cannot convert binary expression to output code. Go-marshal currently only handles simple expressions of literals, constants and basic identifiers")
+	}
+
+	fmt.Fprintf(b, "%s", e.Op)
+
+	switch y := e.Y.(type) {
+	case *ast.BinaryExpr:
+		// Recursively expand sub-expression.
+		g.expandBinaryExpr(b, y)
+	case *ast.Ident:
+		fmt.Fprintf(b, "%s", y.Name)
+	case *ast.BasicLit:
+		fmt.Fprintf(b, "%s", y.Value)
+	default:
+		g.abortAt(e.Pos(), "Cannot convert binary expression to output code. Go-marshal currently only handles simple expressions of literals, constants and basic identifiers")
+	}
+}
+
+// arrayLenExpr returns a string containing a valid golang expression
+// representing the length of array a. The returned expression should be treated
+// as a single value, and will be already parenthesized as required.
+func (g *interfaceGenerator) arrayLenExpr(a *ast.ArrayType) string {
+	var b strings.Builder
+
+	switch l := a.Len.(type) {
+	case *ast.Ident:
+		fmt.Fprintf(&b, "%s", l.Name)
+	case *ast.BasicLit:
+		fmt.Fprintf(&b, "%s", l.Value)
+	case *ast.BinaryExpr:
+		g.expandBinaryExpr(&b, l)
+		return fmt.Sprintf("(%s)", b.String())
+	default:
+		g.abortAt(l.Pos(), "Cannot convert this array len expression to output code. Go-marshal currently only handles simple expressions of literals, constants and basic identifiers")
+	}
+	return b.String()
+}
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
index 5ba74a606..8d6f102d5 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
@@ -27,20 +27,12 @@ func (g *interfaceGenerator) validateArrayNewtype(n *ast.Ident, a *ast.ArrayType
 		g.abortAt(a.Pos(), fmt.Sprintf("Dynamically sized slice '%s' cannot be marshalled, arrays must be statically sized", n.Name))
 	}
 
-	if _, ok := a.Len.(*ast.BasicLit); !ok {
-		g.abortAt(a.Len.Pos(), fmt.Sprintf("Array size must be a literal, don't use consts or expressions"))
-	}
-
 	if _, ok := a.Elt.(*ast.Ident); !ok {
 		g.abortAt(a.Elt.Pos(), fmt.Sprintf("Marshalling not supported for arrays with %s elements, array elements must be primitive types", kindString(a.Elt)))
 	}
-
-	if arrayLen(a) <= 0 {
-		g.abortAt(a.Len.Pos(), fmt.Sprintf("Marshalling not supported for zero length arrays, why does an ABI struct have one?"))
-	}
 }
 
-func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n, elt *ast.Ident, len int) {
+func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n *ast.Ident, a *ast.ArrayType, elt *ast.Ident) {
 	g.recordUsedImport("io")
 	g.recordUsedImport("marshal")
 	g.recordUsedImport("reflect")
@@ -49,13 +41,15 @@ func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n, elt *ast.Ident,
 	g.recordUsedImport("unsafe")
 	g.recordUsedImport("usermem")
 
+	lenExpr := g.arrayLenExpr(a)
+
 	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
 	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		if size, dynamic := g.scalarSize(elt); !dynamic {
-			g.emit("return %d\n", size*len)
+			g.emit("return %d * %s\n", size, lenExpr)
 		} else {
-			g.emit("return (*%s)(nil).SizeBytes() * %d\n", n.Name, len)
+			g.emit("return (*%s)(nil).SizeBytes() * %s\n", n.Name, lenExpr)
 		}
 	})
 	g.emit("}\n\n")
@@ -63,7 +57,7 @@ func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n, elt *ast.Ident,
 	g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n")
 	g.emit("func (%s *%s) MarshalBytes(dst []byte) {\n", g.r, g.typeName())
 	g.inIndent(func() {
-		g.emit("for idx := 0; idx < %d; idx++ {\n", len)
+		g.emit("for idx := 0; idx < %s; idx++ {\n", lenExpr)
 		g.inIndent(func() {
 			g.marshalScalar(fmt.Sprintf("%s[idx]", g.r), elt.Name, "dst")
 		})
@@ -74,7 +68,7 @@ func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n, elt *ast.Ident,
 	g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n")
 	g.emit("func (%s *%s) UnmarshalBytes(src []byte) {\n", g.r, g.typeName())
 	g.inIndent(func() {
-		g.emit("for idx := 0; idx < %d; idx++ {\n", len)
+		g.emit("for idx := 0; idx < %s; idx++ {\n", lenExpr)
 		g.inIndent(func() {
 			g.unmarshalScalar(fmt.Sprintf("%s[idx]", g.r), elt.Name, "src")
 		})
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_struct.go b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
index e837f58db..4236e978e 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces_struct.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
@@ -62,8 +62,8 @@ func (g *interfaceGenerator) validateStruct(ts *ast.TypeSpec, st *ast.StructType
 				// No validation to perform on selector fields. However this
 				// callback must still be provided.
 			},
-			array: func(n, _ *ast.Ident, len int) {
-				g.validateArrayNewtype(n, f.Type.(*ast.ArrayType))
+			array: func(n *ast.Ident, a *ast.ArrayType, _ *ast.Ident) {
+				g.validateArrayNewtype(n, a)
 			},
 			unhandled: func(_ *ast.Ident) {
 				g.abortAt(f.Pos(), fmt.Sprintf("Marshalling not supported for %s fields", kindString(f.Type)))
@@ -112,16 +112,13 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 				g.recordUsedMarshallable(tName)
 				dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", tName))
 			},
-			array: func(n, t *ast.Ident, len int) {
-				if len < 1 {
-					// Zero-length arrays should've been rejected by validate().
-					panic("unreachable")
-				}
+			array: func(n *ast.Ident, a *ast.ArrayType, t *ast.Ident) {
+				lenExpr := g.arrayLenExpr(a)
 				if size, dynamic := g.scalarSize(t); !dynamic {
-					primitiveSize += size * len
+					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("%d*%s", size, lenExpr))
 				} else {
 					g.recordUsedMarshallable(t.Name)
-					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()*%d", t.Name, len))
+					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()*%s", t.Name, lenExpr))
 				}
 			},
 		}.dispatch)
@@ -169,22 +166,23 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 				}
 				g.marshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "dst")
 			},
-			array: func(n, t *ast.Ident, size int) {
+			array: func(n *ast.Ident, a *ast.ArrayType, t *ast.Ident) {
+				lenExpr := g.arrayLenExpr(a)
 				if n.Name == "_" {
-					g.emit("// Padding: dst[:sizeof(%s)*%d] ~= [%d]%s{0}\n", t.Name, size, size, t.Name)
-					if len, dynamic := g.scalarSize(t); !dynamic {
-						g.shift("dst", len*size)
+					g.emit("// Padding: dst[:sizeof(%s)*%s] ~= [%s]%s{0}\n", t.Name, lenExpr, lenExpr, t.Name)
+					if size, dynamic := g.scalarSize(t); !dynamic {
+						g.emit("dst = dst[%d*(%s):]\n", size, lenExpr)
 					} else {
 						// We can't use shiftDynamic here because we don't have
 						// an instance of the dynamic type we can reference here
 						// (since the version in this struct is anonymous). Use
 						// a typed nil pointer to call SizeBytes() instead.
-						g.emit("dst = dst[(*%s)(nil).SizeBytes()*%d:]\n", t.Name, size)
+						g.emit("dst = dst[(*%s)(nil).SizeBytes()*(%s):]\n", t.Name, lenExpr)
 					}
 					return
 				}
 
-				g.emit("for idx := 0; idx < %d; idx++ {\n", size)
+				g.emit("for idx := 0; idx < %s; idx++ {\n", lenExpr)
 				g.inIndent(func() {
 					g.marshalScalar(fmt.Sprintf("%s[idx]", g.fieldAccessor(n)), t.Name, "dst")
 				})
@@ -224,22 +222,23 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 				}
 				g.unmarshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "src")
 			},
-			array: func(n, t *ast.Ident, size int) {
+			array: func(n *ast.Ident, a *ast.ArrayType, t *ast.Ident) {
+				lenExpr := g.arrayLenExpr(a)
 				if n.Name == "_" {
-					g.emit("// Padding: ~ copy([%d]%s(%s), src[:sizeof(%s)*%d])\n", size, t.Name, g.fieldAccessor(n), t.Name, size)
-					if len, dynamic := g.scalarSize(t); !dynamic {
-						g.shift("src", len*size)
+					g.emit("// Padding: ~ copy([%s]%s(%s), src[:sizeof(%s)*%s])\n", lenExpr, t.Name, g.fieldAccessor(n), t.Name, lenExpr)
+					if size, dynamic := g.scalarSize(t); !dynamic {
+						g.emit("src = src[%d*(%s):]\n", size, lenExpr)
 					} else {
 						// We can't use shiftDynamic here because we don't have
 						// an instance of the dynamic type we can referece here
 						// (since the version in this struct is anonymous). Use
 						// a typed nil pointer to call SizeBytes() instead.
-						g.emit("src = src[(*%s)(nil).SizeBytes()*%d:]\n", t.Name, size)
+						g.emit("src = src[(*%s)(nil).SizeBytes()*(%s):]\n", t.Name, lenExpr)
 					}
 					return
 				}
 
-				g.emit("for idx := 0; idx < %d; idx++ {\n", size)
+				g.emit("for idx := 0; idx < %s; idx++ {\n", lenExpr)
 				g.inIndent(func() {
 					g.unmarshalScalar(fmt.Sprintf("%s[idx]", g.fieldAccessor(n)), t.Name, "src")
 				})
diff --git a/tools/go_marshal/gomarshal/util.go b/tools/go_marshal/gomarshal/util.go
index 96025ff39..d94314302 100644
--- a/tools/go_marshal/gomarshal/util.go
+++ b/tools/go_marshal/gomarshal/util.go
@@ -25,7 +25,6 @@ import (
 	"path"
 	"reflect"
 	"sort"
-	"strconv"
 	"strings"
 )
 
@@ -75,29 +74,10 @@ func forEachStructField(st *ast.StructType, fn func(f *ast.Field)) {
 type fieldDispatcher struct {
 	primitive func(n, t *ast.Ident)
 	selector  func(n, tX, tSel *ast.Ident)
-	array     func(n, t *ast.Ident, size int)
+	array     func(n *ast.Ident, a *ast.ArrayType, t *ast.Ident)
 	unhandled func(n *ast.Ident)
 }
 
-// Precondition: a must have a literal for the array length. Consts and
-// expressions are not allowed as array lengths, and should be rejected by the
-// caller.
-func arrayLen(a *ast.ArrayType) int {
-	if a.Len == nil {
-		// Probably a slice? Must be handled by caller.
-		panic("Nil array length in array type")
-	}
-	lenLit, ok := a.Len.(*ast.BasicLit)
-	if !ok {
-		panic("Array has non-literal for length")
-	}
-	len, err := strconv.Atoi(lenLit.Value)
-	if err != nil {
-		panic(fmt.Sprintf("Failed to parse array length '%s' as number: %v", lenLit.Value, err))
-	}
-	return len
-}
-
 // Precondition: All dispatch callbacks that will be invoked must be
 // provided. Embedded fields are not allowed, len(f.Names) >= 1.
 func (fd fieldDispatcher) dispatch(f *ast.Field) {
@@ -123,7 +103,7 @@ func (fd fieldDispatcher) dispatch(f *ast.Field) {
 		case *ast.ArrayType:
 			switch t := v.Elt.(type) {
 			case *ast.Ident:
-				fd.array(name, t, arrayLen(v))
+				fd.array(name, v, t)
 			default:
 				// Should be handled with a better error message during validate.
 				panic(fmt.Sprintf("Array element type is of unsupported kind. Expected *ast.Ident, got %v", t))
diff --git a/tools/go_marshal/test/test.go b/tools/go_marshal/test/test.go
index 43df73545..f75ca1b7f 100644
--- a/tools/go_marshal/test/test.go
+++ b/tools/go_marshal/test/test.go
@@ -146,3 +146,31 @@ type SignalSet uint64
 //
 // +marshal slice:SignalSetAliasSlice
 type SignalSetAlias SignalSet
+
+const sizeA = 64
+const sizeB = 8
+
+// TestArray is a test data structure on an array with a constant length.
+//
+// +marshal
+type TestArray [sizeA]int32
+
+// TestArray2 is a newtype on an array with a simple arithmetic expression of
+// constants for the array length.
+//
+// +marshal
+type TestArray2 [sizeA * sizeB]int32
+
+// TestArray2 is a newtype on an array with a simple arithmetic expression of
+// mixed constants and literals for the array length.
+//
+// +marshal
+type TestArray3 [sizeA*sizeB + 12]int32
+
+// Type9 is a test data type containing an array with a non-literal length.
+//
+// +marshal
+type Type9 struct {
+	x int64
+	y [sizeA]int32
+}
-- 
cgit v1.2.3


From 37025990d6ed1f1160937c640855070d9a559cb0 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Wed, 1 Apr 2020 17:39:12 -0700
Subject: Add "/snap/bin" to PATH.

"gcloud" may be installed as a snap, under "/snap/bin". Make
sure this is in our PATH so that we can use gcloud.

PiperOrigin-RevId: 304297180
---
 scripts/benchmark.sh | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh
index 3fd80fc2e..e2d688710 100755
--- a/scripts/benchmark.sh
+++ b/scripts/benchmark.sh
@@ -16,12 +16,19 @@
 
 source $(dirname $0)/common.sh
 
+# gcloud may be installed as a "snap". If it is, include it in PATH.
+declare -r snap="/snap/bin"
+if [[ -d "-d ${snap}" ]]; then
+  export PATH="${PATH}:${snap}"
+fi
+
+# Make sure we can call gcloud and exit if not.
+which gcloud
+
 # Exporting for subprocesses as GCP APIs and tools check this environmental
 # variable for authentication.
 export GOOGLE_APPLICATION_CREDENTIALS="${KOKORO_KEYSTORE_DIR}/${GCLOUD_CREDENTIALS}"
 
-which gcloud
-
 gcloud auth activate-service-account \
    --key-file "${GOOGLE_APPLICATION_CREDENTIALS}"
 
-- 
cgit v1.2.3


From c6d5742c21c19f9cf8b964b49b8df935c1303417 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 2 Apr 2020 10:39:56 -0700
Subject: Fix flaky TCPLinger2TimeoutAfterClose test.

The test is flaky in cooperative S/R mode because TCP timers are not restored
across a S/R. This can cause the TCPLinger2 timer to not fire. This change
disables S/R before setting the TCP_LINGER2 timeout.

PiperOrigin-RevId: 304430536
---
 test/syscalls/linux/socket_inet_loopback.cc | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index b24618a88..16888de2a 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -605,15 +605,23 @@ TEST_P(SocketInetLoopbackTest, TCPLinger2TimeoutAfterClose_NoRandomSave) {
                   &conn_addrlen),
       SyscallSucceeds());
 
-  constexpr int kTCPLingerTimeout = 5;
-  EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2,
-                         &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
-              SyscallSucceedsWithValue(0));
-
-  // close the connecting FD to trigger FIN_WAIT2  on the connected fd.
-  conn_fd.reset();
-
-  absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+  // Disable cooperative saves after this point as TCP timers are not restored
+  // across a S/R.
+  {
+    DisableSave ds;
+    constexpr int kTCPLingerTimeout = 5;
+    EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2,
+                           &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
+                SyscallSucceedsWithValue(0));
+
+    // close the connecting FD to trigger FIN_WAIT2  on the connected fd.
+    conn_fd.reset();
+
+    absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+
+    // ds going out of scope will Re-enable S/R's since at this point the timer
+    // must have fired and cleaned up the endpoint.
+  }
 
   // Now bind and connect a new socket and verify that we can immediately
   // rebind the address bound by the conn_fd as it never entered TIME_WAIT.
-- 
cgit v1.2.3


From 035836193e6d9e1fc9cce6a0161cb3907fbc2ef5 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Thu, 2 Apr 2020 11:24:04 -0700
Subject: Fix typo in benchmarks.sh

PiperOrigin-RevId: 304440599
---
 scripts/benchmark.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh
index e2d688710..e0f6df438 100755
--- a/scripts/benchmark.sh
+++ b/scripts/benchmark.sh
@@ -18,11 +18,11 @@ source $(dirname $0)/common.sh
 
 # gcloud may be installed as a "snap". If it is, include it in PATH.
 declare -r snap="/snap/bin"
-if [[ -d "-d ${snap}" ]]; then
+if [[ -d "${snap}" ]]; then
   export PATH="${PATH}:${snap}"
 fi
 
-# Make sure we can call gcloud and exit if not.
+# Make sure we can find gcloud and exit if not.
 which gcloud
 
 # Exporting for subprocesses as GCP APIs and tools check this environmental
-- 
cgit v1.2.3


From 30388ff5919df33e7184719dfc6c0d9cb110b2e2 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 2 Apr 2020 11:55:55 -0700
Subject: Rename files in //pkg/sync to better reflect what they contain.

PiperOrigin-RevId: 304447031
---
 pkg/sync/BUILD                          |  10 +-
 pkg/sync/downgradable_rwmutex_test.go   | 205 --------------------------------
 pkg/sync/downgradable_rwmutex_unsafe.go | 198 ------------------------------
 pkg/sync/mutex_test.go                  |  71 +++++++++++
 pkg/sync/mutex_unsafe.go                |  49 ++++++++
 pkg/sync/rwmutex_test.go                | 205 ++++++++++++++++++++++++++++++++
 pkg/sync/rwmutex_unsafe.go              | 198 ++++++++++++++++++++++++++++++
 pkg/sync/sync.go                        |   7 ++
 pkg/sync/syncutil.go                    |   7 --
 pkg/sync/tmutex_test.go                 |  71 -----------
 pkg/sync/tmutex_unsafe.go               |  49 --------
 11 files changed, 535 insertions(+), 535 deletions(-)
 delete mode 100644 pkg/sync/downgradable_rwmutex_test.go
 delete mode 100644 pkg/sync/downgradable_rwmutex_unsafe.go
 create mode 100644 pkg/sync/mutex_test.go
 create mode 100644 pkg/sync/mutex_unsafe.go
 create mode 100644 pkg/sync/rwmutex_test.go
 create mode 100644 pkg/sync/rwmutex_unsafe.go
 create mode 100644 pkg/sync/sync.go
 delete mode 100644 pkg/sync/syncutil.go
 delete mode 100644 pkg/sync/tmutex_test.go
 delete mode 100644 pkg/sync/tmutex_unsafe.go

diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD
index 5340cf0d6..0e35d7d17 100644
--- a/pkg/sync/BUILD
+++ b/pkg/sync/BUILD
@@ -31,13 +31,13 @@ go_library(
     name = "sync",
     srcs = [
         "aliases.go",
-        "downgradable_rwmutex_unsafe.go",
         "memmove_unsafe.go",
+        "mutex_unsafe.go",
         "norace_unsafe.go",
         "race_unsafe.go",
+        "rwmutex_unsafe.go",
         "seqcount.go",
-        "syncutil.go",
-        "tmutex_unsafe.go",
+        "sync.go",
     ],
 )
 
@@ -45,9 +45,9 @@ go_test(
     name = "sync_test",
     size = "small",
     srcs = [
-        "downgradable_rwmutex_test.go",
+        "mutex_test.go",
+        "rwmutex_test.go",
         "seqcount_test.go",
-        "tmutex_test.go",
     ],
     library = ":sync",
 )
diff --git a/pkg/sync/downgradable_rwmutex_test.go b/pkg/sync/downgradable_rwmutex_test.go
deleted file mode 100644
index ce667e825..000000000
--- a/pkg/sync/downgradable_rwmutex_test.go
+++ /dev/null
@@ -1,205 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// GOMAXPROCS=10 go test
-
-// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the
-// addition of downgradingWriter and the renaming of num_iterations to
-// numIterations to shut up Golint.
-
-package sync
-
-import (
-	"fmt"
-	"runtime"
-	"sync/atomic"
-	"testing"
-)
-
-func parallelReader(m *RWMutex, clocked, cunlock, cdone chan bool) {
-	m.RLock()
-	clocked <- true
-	<-cunlock
-	m.RUnlock()
-	cdone <- true
-}
-
-func doTestParallelReaders(numReaders, gomaxprocs int) {
-	runtime.GOMAXPROCS(gomaxprocs)
-	var m RWMutex
-	clocked := make(chan bool)
-	cunlock := make(chan bool)
-	cdone := make(chan bool)
-	for i := 0; i < numReaders; i++ {
-		go parallelReader(&m, clocked, cunlock, cdone)
-	}
-	// Wait for all parallel RLock()s to succeed.
-	for i := 0; i < numReaders; i++ {
-		<-clocked
-	}
-	for i := 0; i < numReaders; i++ {
-		cunlock <- true
-	}
-	// Wait for the goroutines to finish.
-	for i := 0; i < numReaders; i++ {
-		<-cdone
-	}
-}
-
-func TestParallelReaders(t *testing.T) {
-	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
-	doTestParallelReaders(1, 4)
-	doTestParallelReaders(3, 4)
-	doTestParallelReaders(4, 2)
-}
-
-func reader(rwm *RWMutex, numIterations int, activity *int32, cdone chan bool) {
-	for i := 0; i < numIterations; i++ {
-		rwm.RLock()
-		n := atomic.AddInt32(activity, 1)
-		if n < 1 || n >= 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		atomic.AddInt32(activity, -1)
-		rwm.RUnlock()
-	}
-	cdone <- true
-}
-
-func writer(rwm *RWMutex, numIterations int, activity *int32, cdone chan bool) {
-	for i := 0; i < numIterations; i++ {
-		rwm.Lock()
-		n := atomic.AddInt32(activity, 10000)
-		if n != 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		atomic.AddInt32(activity, -10000)
-		rwm.Unlock()
-	}
-	cdone <- true
-}
-
-func downgradingWriter(rwm *RWMutex, numIterations int, activity *int32, cdone chan bool) {
-	for i := 0; i < numIterations; i++ {
-		rwm.Lock()
-		n := atomic.AddInt32(activity, 10000)
-		if n != 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		atomic.AddInt32(activity, -10000)
-		rwm.DowngradeLock()
-		n = atomic.AddInt32(activity, 1)
-		if n < 1 || n >= 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		n = atomic.AddInt32(activity, -1)
-		rwm.RUnlock()
-	}
-	cdone <- true
-}
-
-func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) {
-	runtime.GOMAXPROCS(gomaxprocs)
-	// Number of active readers + 10000 * number of active writers.
-	var activity int32
-	var rwm RWMutex
-	cdone := make(chan bool)
-	go writer(&rwm, numIterations, &activity, cdone)
-	go downgradingWriter(&rwm, numIterations, &activity, cdone)
-	var i int
-	for i = 0; i < numReaders/2; i++ {
-		go reader(&rwm, numIterations, &activity, cdone)
-	}
-	go writer(&rwm, numIterations, &activity, cdone)
-	go downgradingWriter(&rwm, numIterations, &activity, cdone)
-	for ; i < numReaders; i++ {
-		go reader(&rwm, numIterations, &activity, cdone)
-	}
-	// Wait for the 4 writers and all readers to finish.
-	for i := 0; i < 4+numReaders; i++ {
-		<-cdone
-	}
-}
-
-func TestDowngradableRWMutex(t *testing.T) {
-	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
-	n := 1000
-	if testing.Short() {
-		n = 5
-	}
-	HammerDowngradableRWMutex(1, 1, n)
-	HammerDowngradableRWMutex(1, 3, n)
-	HammerDowngradableRWMutex(1, 10, n)
-	HammerDowngradableRWMutex(4, 1, n)
-	HammerDowngradableRWMutex(4, 3, n)
-	HammerDowngradableRWMutex(4, 10, n)
-	HammerDowngradableRWMutex(10, 1, n)
-	HammerDowngradableRWMutex(10, 3, n)
-	HammerDowngradableRWMutex(10, 10, n)
-	HammerDowngradableRWMutex(10, 5, n)
-}
-
-func TestRWDoubleTryLock(t *testing.T) {
-	var rwm RWMutex
-	if !rwm.TryLock() {
-		t.Fatal("failed to aquire lock")
-	}
-	if rwm.TryLock() {
-		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
-	}
-}
-
-func TestRWTryLockAfterLock(t *testing.T) {
-	var rwm RWMutex
-	rwm.Lock()
-	if rwm.TryLock() {
-		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
-	}
-}
-
-func TestRWTryLockUnlock(t *testing.T) {
-	var rwm RWMutex
-	if !rwm.TryLock() {
-		t.Fatal("failed to aquire lock")
-	}
-	rwm.Unlock()
-	if !rwm.TryLock() {
-		t.Fatal("failed to aquire lock after unlock")
-	}
-}
-
-func TestTryRLockAfterLock(t *testing.T) {
-	var rwm RWMutex
-	rwm.Lock()
-	if rwm.TryRLock() {
-		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
-	}
-}
-
-func TestTryLockAfterRLock(t *testing.T) {
-	var rwm RWMutex
-	rwm.RLock()
-	if rwm.TryLock() {
-		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
-	}
-}
-
-func TestDoubleTryRLock(t *testing.T) {
-	var rwm RWMutex
-	if !rwm.TryRLock() {
-		t.Fatal("failed to aquire lock")
-	}
-	if !rwm.TryRLock() {
-		t.Fatal("failed to read aquire read locked lock")
-	}
-}
diff --git a/pkg/sync/downgradable_rwmutex_unsafe.go b/pkg/sync/downgradable_rwmutex_unsafe.go
deleted file mode 100644
index ea6cdc447..000000000
--- a/pkg/sync/downgradable_rwmutex_unsafe.go
+++ /dev/null
@@ -1,198 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.13
-// +build !go1.15
-
-// Check go:linkname function signatures when updating Go version.
-
-// This is mostly copied from the standard library's sync/rwmutex.go.
-//
-// Happens-before relationships indicated to the race detector:
-// - Unlock -> Lock (via writerSem)
-// - Unlock -> RLock (via readerSem)
-// - RUnlock -> Lock (via writerSem)
-// - DowngradeLock -> RLock (via readerSem)
-
-package sync
-
-import (
-	"sync/atomic"
-	"unsafe"
-)
-
-//go:linkname runtimeSemacquire sync.runtime_Semacquire
-func runtimeSemacquire(s *uint32)
-
-//go:linkname runtimeSemrelease sync.runtime_Semrelease
-func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
-
-// RWMutex is identical to sync.RWMutex, but adds the DowngradeLock,
-// TryLock and TryRLock methods.
-type RWMutex struct {
-	w           Mutex  // held if there are pending writers
-	writerSem   uint32 // semaphore for writers to wait for completing readers
-	readerSem   uint32 // semaphore for readers to wait for completing writers
-	readerCount int32  // number of pending readers
-	readerWait  int32  // number of departing readers
-}
-
-const rwmutexMaxReaders = 1 << 30
-
-// TryRLock locks rw for reading. It returns true if it succeeds and false
-// otherwise. It does not block.
-func (rw *RWMutex) TryRLock() bool {
-	if RaceEnabled {
-		RaceDisable()
-	}
-	for {
-		rc := atomic.LoadInt32(&rw.readerCount)
-		if rc < 0 {
-			if RaceEnabled {
-				RaceEnable()
-			}
-			return false
-		}
-		if !atomic.CompareAndSwapInt32(&rw.readerCount, rc, rc+1) {
-			continue
-		}
-		if RaceEnabled {
-			RaceEnable()
-			RaceAcquire(unsafe.Pointer(&rw.readerSem))
-		}
-		return true
-	}
-}
-
-// RLock locks rw for reading.
-func (rw *RWMutex) RLock() {
-	if RaceEnabled {
-		RaceDisable()
-	}
-	if atomic.AddInt32(&rw.readerCount, 1) < 0 {
-		// A writer is pending, wait for it.
-		runtimeSemacquire(&rw.readerSem)
-	}
-	if RaceEnabled {
-		RaceEnable()
-		RaceAcquire(unsafe.Pointer(&rw.readerSem))
-	}
-}
-
-// RUnlock undoes a single RLock call.
-func (rw *RWMutex) RUnlock() {
-	if RaceEnabled {
-		RaceReleaseMerge(unsafe.Pointer(&rw.writerSem))
-		RaceDisable()
-	}
-	if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 {
-		if r+1 == 0 || r+1 == -rwmutexMaxReaders {
-			panic("RUnlock of unlocked RWMutex")
-		}
-		// A writer is pending.
-		if atomic.AddInt32(&rw.readerWait, -1) == 0 {
-			// The last reader unblocks the writer.
-			runtimeSemrelease(&rw.writerSem, false, 0)
-		}
-	}
-	if RaceEnabled {
-		RaceEnable()
-	}
-}
-
-// TryLock locks rw for writing. It returns true if it succeeds and false
-// otherwise. It does not block.
-func (rw *RWMutex) TryLock() bool {
-	if RaceEnabled {
-		RaceDisable()
-	}
-	// First, resolve competition with other writers.
-	if !rw.w.TryLock() {
-		if RaceEnabled {
-			RaceEnable()
-		}
-		return false
-	}
-	// Only proceed if there are no readers.
-	if !atomic.CompareAndSwapInt32(&rw.readerCount, 0, -rwmutexMaxReaders) {
-		rw.w.Unlock()
-		if RaceEnabled {
-			RaceEnable()
-		}
-		return false
-	}
-	if RaceEnabled {
-		RaceEnable()
-		RaceAcquire(unsafe.Pointer(&rw.writerSem))
-	}
-	return true
-}
-
-// Lock locks rw for writing.
-func (rw *RWMutex) Lock() {
-	if RaceEnabled {
-		RaceDisable()
-	}
-	// First, resolve competition with other writers.
-	rw.w.Lock()
-	// Announce to readers there is a pending writer.
-	r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders
-	// Wait for active readers.
-	if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 {
-		runtimeSemacquire(&rw.writerSem)
-	}
-	if RaceEnabled {
-		RaceEnable()
-		RaceAcquire(unsafe.Pointer(&rw.writerSem))
-	}
-}
-
-// Unlock unlocks rw for writing.
-func (rw *RWMutex) Unlock() {
-	if RaceEnabled {
-		RaceRelease(unsafe.Pointer(&rw.writerSem))
-		RaceRelease(unsafe.Pointer(&rw.readerSem))
-		RaceDisable()
-	}
-	// Announce to readers there is no active writer.
-	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders)
-	if r >= rwmutexMaxReaders {
-		panic("Unlock of unlocked RWMutex")
-	}
-	// Unblock blocked readers, if any.
-	for i := 0; i < int(r); i++ {
-		runtimeSemrelease(&rw.readerSem, false, 0)
-	}
-	// Allow other writers to proceed.
-	rw.w.Unlock()
-	if RaceEnabled {
-		RaceEnable()
-	}
-}
-
-// DowngradeLock atomically unlocks rw for writing and locks it for reading.
-func (rw *RWMutex) DowngradeLock() {
-	if RaceEnabled {
-		RaceRelease(unsafe.Pointer(&rw.readerSem))
-		RaceDisable()
-	}
-	// Announce to readers there is no active writer and one additional reader.
-	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1)
-	if r >= rwmutexMaxReaders+1 {
-		panic("DowngradeLock of unlocked RWMutex")
-	}
-	// Unblock blocked readers, if any. Note that this loop starts as 1 since r
-	// includes this goroutine.
-	for i := 1; i < int(r); i++ {
-		runtimeSemrelease(&rw.readerSem, false, 0)
-	}
-	// Allow other writers to proceed to rw.w.Lock(). Note that they will still
-	// block on rw.writerSem since at least this reader exists, such that
-	// DowngradeLock() is atomic with the previous write lock.
-	rw.w.Unlock()
-	if RaceEnabled {
-		RaceEnable()
-	}
-}
diff --git a/pkg/sync/mutex_test.go b/pkg/sync/mutex_test.go
new file mode 100644
index 000000000..0838248b4
--- /dev/null
+++ b/pkg/sync/mutex_test.go
@@ -0,0 +1,71 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync
+
+import (
+	"sync"
+	"testing"
+	"unsafe"
+)
+
+// TestStructSize verifies that syncMutex's size hasn't drifted from the
+// standard library's version.
+//
+// The correctness of this package relies on these remaining in sync.
+func TestStructSize(t *testing.T) {
+	const (
+		got  = unsafe.Sizeof(syncMutex{})
+		want = unsafe.Sizeof(sync.Mutex{})
+	)
+	if got != want {
+		t.Errorf("got sizeof(syncMutex) = %d, want = sizeof(sync.Mutex) = %d", got, want)
+	}
+}
+
+// TestFieldValues verifies that the semantics of syncMutex.state from the
+// standard library's implementation.
+//
+// The correctness of this package relies on these remaining in sync.
+func TestFieldValues(t *testing.T) {
+	var m Mutex
+	m.Lock()
+	if got := *m.state(); got != mutexLocked {
+		t.Errorf("got locked sync.Mutex.state = %d, want = %d", got, mutexLocked)
+	}
+	m.Unlock()
+	if got := *m.state(); got != mutexUnlocked {
+		t.Errorf("got unlocked sync.Mutex.state = %d, want = %d", got, mutexUnlocked)
+	}
+}
+
+func TestDoubleTryLock(t *testing.T) {
+	var m Mutex
+	if !m.TryLock() {
+		t.Fatal("failed to aquire lock")
+	}
+	if m.TryLock() {
+		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
+	}
+}
+
+func TestTryLockAfterLock(t *testing.T) {
+	var m Mutex
+	m.Lock()
+	if m.TryLock() {
+		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
+	}
+}
+
+func TestTryLockUnlock(t *testing.T) {
+	var m Mutex
+	if !m.TryLock() {
+		t.Fatal("failed to aquire lock")
+	}
+	m.Unlock()
+	if !m.TryLock() {
+		t.Fatal("failed to aquire lock after unlock")
+	}
+}
diff --git a/pkg/sync/mutex_unsafe.go b/pkg/sync/mutex_unsafe.go
new file mode 100644
index 000000000..3dd15578b
--- /dev/null
+++ b/pkg/sync/mutex_unsafe.go
@@ -0,0 +1,49 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.13
+// +build !go1.15
+
+// When updating the build constraint (above), check that syncMutex matches the
+// standard library sync.Mutex definition.
+
+package sync
+
+import (
+	"sync"
+	"sync/atomic"
+	"unsafe"
+)
+
+// Mutex is a try lock.
+type Mutex struct {
+	sync.Mutex
+}
+
+type syncMutex struct {
+	state int32
+	sema  uint32
+}
+
+func (m *Mutex) state() *int32 {
+	return &(*syncMutex)(unsafe.Pointer(&m.Mutex)).state
+}
+
+const (
+	mutexUnlocked = 0
+	mutexLocked   = 1
+)
+
+// TryLock tries to aquire the mutex. It returns true if it succeeds and false
+// otherwise. TryLock does not block.
+func (m *Mutex) TryLock() bool {
+	if atomic.CompareAndSwapInt32(m.state(), mutexUnlocked, mutexLocked) {
+		if RaceEnabled {
+			RaceAcquire(unsafe.Pointer(&m.Mutex))
+		}
+		return true
+	}
+	return false
+}
diff --git a/pkg/sync/rwmutex_test.go b/pkg/sync/rwmutex_test.go
new file mode 100644
index 000000000..ce667e825
--- /dev/null
+++ b/pkg/sync/rwmutex_test.go
@@ -0,0 +1,205 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright 2019 The gVisor Authors.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GOMAXPROCS=10 go test
+
+// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the
+// addition of downgradingWriter and the renaming of num_iterations to
+// numIterations to shut up Golint.
+
+package sync
+
+import (
+	"fmt"
+	"runtime"
+	"sync/atomic"
+	"testing"
+)
+
+func parallelReader(m *RWMutex, clocked, cunlock, cdone chan bool) {
+	m.RLock()
+	clocked <- true
+	<-cunlock
+	m.RUnlock()
+	cdone <- true
+}
+
+func doTestParallelReaders(numReaders, gomaxprocs int) {
+	runtime.GOMAXPROCS(gomaxprocs)
+	var m RWMutex
+	clocked := make(chan bool)
+	cunlock := make(chan bool)
+	cdone := make(chan bool)
+	for i := 0; i < numReaders; i++ {
+		go parallelReader(&m, clocked, cunlock, cdone)
+	}
+	// Wait for all parallel RLock()s to succeed.
+	for i := 0; i < numReaders; i++ {
+		<-clocked
+	}
+	for i := 0; i < numReaders; i++ {
+		cunlock <- true
+	}
+	// Wait for the goroutines to finish.
+	for i := 0; i < numReaders; i++ {
+		<-cdone
+	}
+}
+
+func TestParallelReaders(t *testing.T) {
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
+	doTestParallelReaders(1, 4)
+	doTestParallelReaders(3, 4)
+	doTestParallelReaders(4, 2)
+}
+
+func reader(rwm *RWMutex, numIterations int, activity *int32, cdone chan bool) {
+	for i := 0; i < numIterations; i++ {
+		rwm.RLock()
+		n := atomic.AddInt32(activity, 1)
+		if n < 1 || n >= 10000 {
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ {
+		}
+		atomic.AddInt32(activity, -1)
+		rwm.RUnlock()
+	}
+	cdone <- true
+}
+
+func writer(rwm *RWMutex, numIterations int, activity *int32, cdone chan bool) {
+	for i := 0; i < numIterations; i++ {
+		rwm.Lock()
+		n := atomic.AddInt32(activity, 10000)
+		if n != 10000 {
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ {
+		}
+		atomic.AddInt32(activity, -10000)
+		rwm.Unlock()
+	}
+	cdone <- true
+}
+
+func downgradingWriter(rwm *RWMutex, numIterations int, activity *int32, cdone chan bool) {
+	for i := 0; i < numIterations; i++ {
+		rwm.Lock()
+		n := atomic.AddInt32(activity, 10000)
+		if n != 10000 {
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ {
+		}
+		atomic.AddInt32(activity, -10000)
+		rwm.DowngradeLock()
+		n = atomic.AddInt32(activity, 1)
+		if n < 1 || n >= 10000 {
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ {
+		}
+		n = atomic.AddInt32(activity, -1)
+		rwm.RUnlock()
+	}
+	cdone <- true
+}
+
+func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) {
+	runtime.GOMAXPROCS(gomaxprocs)
+	// Number of active readers + 10000 * number of active writers.
+	var activity int32
+	var rwm RWMutex
+	cdone := make(chan bool)
+	go writer(&rwm, numIterations, &activity, cdone)
+	go downgradingWriter(&rwm, numIterations, &activity, cdone)
+	var i int
+	for i = 0; i < numReaders/2; i++ {
+		go reader(&rwm, numIterations, &activity, cdone)
+	}
+	go writer(&rwm, numIterations, &activity, cdone)
+	go downgradingWriter(&rwm, numIterations, &activity, cdone)
+	for ; i < numReaders; i++ {
+		go reader(&rwm, numIterations, &activity, cdone)
+	}
+	// Wait for the 4 writers and all readers to finish.
+	for i := 0; i < 4+numReaders; i++ {
+		<-cdone
+	}
+}
+
+func TestDowngradableRWMutex(t *testing.T) {
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
+	n := 1000
+	if testing.Short() {
+		n = 5
+	}
+	HammerDowngradableRWMutex(1, 1, n)
+	HammerDowngradableRWMutex(1, 3, n)
+	HammerDowngradableRWMutex(1, 10, n)
+	HammerDowngradableRWMutex(4, 1, n)
+	HammerDowngradableRWMutex(4, 3, n)
+	HammerDowngradableRWMutex(4, 10, n)
+	HammerDowngradableRWMutex(10, 1, n)
+	HammerDowngradableRWMutex(10, 3, n)
+	HammerDowngradableRWMutex(10, 10, n)
+	HammerDowngradableRWMutex(10, 5, n)
+}
+
+func TestRWDoubleTryLock(t *testing.T) {
+	var rwm RWMutex
+	if !rwm.TryLock() {
+		t.Fatal("failed to aquire lock")
+	}
+	if rwm.TryLock() {
+		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
+	}
+}
+
+func TestRWTryLockAfterLock(t *testing.T) {
+	var rwm RWMutex
+	rwm.Lock()
+	if rwm.TryLock() {
+		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
+	}
+}
+
+func TestRWTryLockUnlock(t *testing.T) {
+	var rwm RWMutex
+	if !rwm.TryLock() {
+		t.Fatal("failed to aquire lock")
+	}
+	rwm.Unlock()
+	if !rwm.TryLock() {
+		t.Fatal("failed to aquire lock after unlock")
+	}
+}
+
+func TestTryRLockAfterLock(t *testing.T) {
+	var rwm RWMutex
+	rwm.Lock()
+	if rwm.TryRLock() {
+		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
+	}
+}
+
+func TestTryLockAfterRLock(t *testing.T) {
+	var rwm RWMutex
+	rwm.RLock()
+	if rwm.TryLock() {
+		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
+	}
+}
+
+func TestDoubleTryRLock(t *testing.T) {
+	var rwm RWMutex
+	if !rwm.TryRLock() {
+		t.Fatal("failed to aquire lock")
+	}
+	if !rwm.TryRLock() {
+		t.Fatal("failed to read aquire read locked lock")
+	}
+}
diff --git a/pkg/sync/rwmutex_unsafe.go b/pkg/sync/rwmutex_unsafe.go
new file mode 100644
index 000000000..ea6cdc447
--- /dev/null
+++ b/pkg/sync/rwmutex_unsafe.go
@@ -0,0 +1,198 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright 2019 The gVisor Authors.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.13
+// +build !go1.15
+
+// Check go:linkname function signatures when updating Go version.
+
+// This is mostly copied from the standard library's sync/rwmutex.go.
+//
+// Happens-before relationships indicated to the race detector:
+// - Unlock -> Lock (via writerSem)
+// - Unlock -> RLock (via readerSem)
+// - RUnlock -> Lock (via writerSem)
+// - DowngradeLock -> RLock (via readerSem)
+
+package sync
+
+import (
+	"sync/atomic"
+	"unsafe"
+)
+
+//go:linkname runtimeSemacquire sync.runtime_Semacquire
+func runtimeSemacquire(s *uint32)
+
+//go:linkname runtimeSemrelease sync.runtime_Semrelease
+func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
+
+// RWMutex is identical to sync.RWMutex, but adds the DowngradeLock,
+// TryLock and TryRLock methods.
+type RWMutex struct {
+	w           Mutex  // held if there are pending writers
+	writerSem   uint32 // semaphore for writers to wait for completing readers
+	readerSem   uint32 // semaphore for readers to wait for completing writers
+	readerCount int32  // number of pending readers
+	readerWait  int32  // number of departing readers
+}
+
+const rwmutexMaxReaders = 1 << 30
+
+// TryRLock locks rw for reading. It returns true if it succeeds and false
+// otherwise. It does not block.
+func (rw *RWMutex) TryRLock() bool {
+	if RaceEnabled {
+		RaceDisable()
+	}
+	for {
+		rc := atomic.LoadInt32(&rw.readerCount)
+		if rc < 0 {
+			if RaceEnabled {
+				RaceEnable()
+			}
+			return false
+		}
+		if !atomic.CompareAndSwapInt32(&rw.readerCount, rc, rc+1) {
+			continue
+		}
+		if RaceEnabled {
+			RaceEnable()
+			RaceAcquire(unsafe.Pointer(&rw.readerSem))
+		}
+		return true
+	}
+}
+
+// RLock locks rw for reading.
+func (rw *RWMutex) RLock() {
+	if RaceEnabled {
+		RaceDisable()
+	}
+	if atomic.AddInt32(&rw.readerCount, 1) < 0 {
+		// A writer is pending, wait for it.
+		runtimeSemacquire(&rw.readerSem)
+	}
+	if RaceEnabled {
+		RaceEnable()
+		RaceAcquire(unsafe.Pointer(&rw.readerSem))
+	}
+}
+
+// RUnlock undoes a single RLock call.
+func (rw *RWMutex) RUnlock() {
+	if RaceEnabled {
+		RaceReleaseMerge(unsafe.Pointer(&rw.writerSem))
+		RaceDisable()
+	}
+	if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 {
+		if r+1 == 0 || r+1 == -rwmutexMaxReaders {
+			panic("RUnlock of unlocked RWMutex")
+		}
+		// A writer is pending.
+		if atomic.AddInt32(&rw.readerWait, -1) == 0 {
+			// The last reader unblocks the writer.
+			runtimeSemrelease(&rw.writerSem, false, 0)
+		}
+	}
+	if RaceEnabled {
+		RaceEnable()
+	}
+}
+
+// TryLock locks rw for writing. It returns true if it succeeds and false
+// otherwise. It does not block.
+func (rw *RWMutex) TryLock() bool {
+	if RaceEnabled {
+		RaceDisable()
+	}
+	// First, resolve competition with other writers.
+	if !rw.w.TryLock() {
+		if RaceEnabled {
+			RaceEnable()
+		}
+		return false
+	}
+	// Only proceed if there are no readers.
+	if !atomic.CompareAndSwapInt32(&rw.readerCount, 0, -rwmutexMaxReaders) {
+		rw.w.Unlock()
+		if RaceEnabled {
+			RaceEnable()
+		}
+		return false
+	}
+	if RaceEnabled {
+		RaceEnable()
+		RaceAcquire(unsafe.Pointer(&rw.writerSem))
+	}
+	return true
+}
+
+// Lock locks rw for writing.
+func (rw *RWMutex) Lock() {
+	if RaceEnabled {
+		RaceDisable()
+	}
+	// First, resolve competition with other writers.
+	rw.w.Lock()
+	// Announce to readers there is a pending writer.
+	r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders
+	// Wait for active readers.
+	if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 {
+		runtimeSemacquire(&rw.writerSem)
+	}
+	if RaceEnabled {
+		RaceEnable()
+		RaceAcquire(unsafe.Pointer(&rw.writerSem))
+	}
+}
+
+// Unlock unlocks rw for writing.
+func (rw *RWMutex) Unlock() {
+	if RaceEnabled {
+		RaceRelease(unsafe.Pointer(&rw.writerSem))
+		RaceRelease(unsafe.Pointer(&rw.readerSem))
+		RaceDisable()
+	}
+	// Announce to readers there is no active writer.
+	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders)
+	if r >= rwmutexMaxReaders {
+		panic("Unlock of unlocked RWMutex")
+	}
+	// Unblock blocked readers, if any.
+	for i := 0; i < int(r); i++ {
+		runtimeSemrelease(&rw.readerSem, false, 0)
+	}
+	// Allow other writers to proceed.
+	rw.w.Unlock()
+	if RaceEnabled {
+		RaceEnable()
+	}
+}
+
+// DowngradeLock atomically unlocks rw for writing and locks it for reading.
+func (rw *RWMutex) DowngradeLock() {
+	if RaceEnabled {
+		RaceRelease(unsafe.Pointer(&rw.readerSem))
+		RaceDisable()
+	}
+	// Announce to readers there is no active writer and one additional reader.
+	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1)
+	if r >= rwmutexMaxReaders+1 {
+		panic("DowngradeLock of unlocked RWMutex")
+	}
+	// Unblock blocked readers, if any. Note that this loop starts as 1 since r
+	// includes this goroutine.
+	for i := 1; i < int(r); i++ {
+		runtimeSemrelease(&rw.readerSem, false, 0)
+	}
+	// Allow other writers to proceed to rw.w.Lock(). Note that they will still
+	// block on rw.writerSem since at least this reader exists, such that
+	// DowngradeLock() is atomic with the previous write lock.
+	rw.w.Unlock()
+	if RaceEnabled {
+		RaceEnable()
+	}
+}
diff --git a/pkg/sync/sync.go b/pkg/sync/sync.go
new file mode 100644
index 000000000..b16cf5333
--- /dev/null
+++ b/pkg/sync/sync.go
@@ -0,0 +1,7 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package sync provides synchronization primitives.
+package sync
diff --git a/pkg/sync/syncutil.go b/pkg/sync/syncutil.go
deleted file mode 100644
index b16cf5333..000000000
--- a/pkg/sync/syncutil.go
+++ /dev/null
@@ -1,7 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package sync provides synchronization primitives.
-package sync
diff --git a/pkg/sync/tmutex_test.go b/pkg/sync/tmutex_test.go
deleted file mode 100644
index 0838248b4..000000000
--- a/pkg/sync/tmutex_test.go
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package sync
-
-import (
-	"sync"
-	"testing"
-	"unsafe"
-)
-
-// TestStructSize verifies that syncMutex's size hasn't drifted from the
-// standard library's version.
-//
-// The correctness of this package relies on these remaining in sync.
-func TestStructSize(t *testing.T) {
-	const (
-		got  = unsafe.Sizeof(syncMutex{})
-		want = unsafe.Sizeof(sync.Mutex{})
-	)
-	if got != want {
-		t.Errorf("got sizeof(syncMutex) = %d, want = sizeof(sync.Mutex) = %d", got, want)
-	}
-}
-
-// TestFieldValues verifies that the semantics of syncMutex.state from the
-// standard library's implementation.
-//
-// The correctness of this package relies on these remaining in sync.
-func TestFieldValues(t *testing.T) {
-	var m Mutex
-	m.Lock()
-	if got := *m.state(); got != mutexLocked {
-		t.Errorf("got locked sync.Mutex.state = %d, want = %d", got, mutexLocked)
-	}
-	m.Unlock()
-	if got := *m.state(); got != mutexUnlocked {
-		t.Errorf("got unlocked sync.Mutex.state = %d, want = %d", got, mutexUnlocked)
-	}
-}
-
-func TestDoubleTryLock(t *testing.T) {
-	var m Mutex
-	if !m.TryLock() {
-		t.Fatal("failed to aquire lock")
-	}
-	if m.TryLock() {
-		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
-	}
-}
-
-func TestTryLockAfterLock(t *testing.T) {
-	var m Mutex
-	m.Lock()
-	if m.TryLock() {
-		t.Fatal("unexpectedly succeeded in aquiring locked mutex")
-	}
-}
-
-func TestTryLockUnlock(t *testing.T) {
-	var m Mutex
-	if !m.TryLock() {
-		t.Fatal("failed to aquire lock")
-	}
-	m.Unlock()
-	if !m.TryLock() {
-		t.Fatal("failed to aquire lock after unlock")
-	}
-}
diff --git a/pkg/sync/tmutex_unsafe.go b/pkg/sync/tmutex_unsafe.go
deleted file mode 100644
index 3dd15578b..000000000
--- a/pkg/sync/tmutex_unsafe.go
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.13
-// +build !go1.15
-
-// When updating the build constraint (above), check that syncMutex matches the
-// standard library sync.Mutex definition.
-
-package sync
-
-import (
-	"sync"
-	"sync/atomic"
-	"unsafe"
-)
-
-// Mutex is a try lock.
-type Mutex struct {
-	sync.Mutex
-}
-
-type syncMutex struct {
-	state int32
-	sema  uint32
-}
-
-func (m *Mutex) state() *int32 {
-	return &(*syncMutex)(unsafe.Pointer(&m.Mutex)).state
-}
-
-const (
-	mutexUnlocked = 0
-	mutexLocked   = 1
-)
-
-// TryLock tries to aquire the mutex. It returns true if it succeeds and false
-// otherwise. TryLock does not block.
-func (m *Mutex) TryLock() bool {
-	if atomic.CompareAndSwapInt32(m.state(), mutexUnlocked, mutexLocked) {
-		if RaceEnabled {
-			RaceAcquire(unsafe.Pointer(&m.Mutex))
-		}
-		return true
-	}
-	return false
-}
-- 
cgit v1.2.3


From ecc3d01d181a6ae6d3cc72531542d9ea5fe3e376 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 2 Apr 2020 15:58:38 -0700
Subject: Increment NDP message RX stats before validation

Tests:
- ipv6_test.TestHopLimitValidation
- ipv6_test.TestRouterAdvertValidation
PiperOrigin-RevId: 304495723
---
 pkg/tcpip/network/ipv6/icmp.go     | 57 ++++++++++++++++----------------------
 pkg/tcpip/network/ipv6/ndp_test.go | 49 ++++++++++++++++----------------
 2 files changed, 49 insertions(+), 57 deletions(-)

diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index e0dd5afd3..81e6f4d67 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -86,25 +86,12 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 		return
 	}
 
-	// As per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1, 7.1.2 and
-	// 8.1, nodes MUST silently drop NDP packets where the Hop Limit field
-	// in the IPv6 header is not set to 255, or the ICMPv6 Code field is not
-	// set to 0.
-	switch h.Type() {
-	case header.ICMPv6NeighborSolicit,
-		header.ICMPv6NeighborAdvert,
-		header.ICMPv6RouterSolicit,
-		header.ICMPv6RouterAdvert,
-		header.ICMPv6RedirectMsg:
-		if iph.HopLimit() != header.NDPHopLimit {
-			received.Invalid.Increment()
-			return
-		}
-
-		if h.Code() != 0 {
-			received.Invalid.Increment()
-			return
-		}
+	isNDPValid := func() bool {
+		// As per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1, 7.1.2 and
+		// 8.1, nodes MUST silently drop NDP packets where the Hop Limit field
+		// in the IPv6 header is not set to 255, or the ICMPv6 Code field is not
+		// set to 0.
+		return iph.HopLimit() == header.NDPHopLimit && h.Code() == 0
 	}
 
 	// TODO(b/112892170): Meaningfully handle all ICMP types.
@@ -133,7 +120,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	case header.ICMPv6NeighborSolicit:
 		received.NeighborSolicit.Increment()
-		if len(v) < header.ICMPv6NeighborSolicitMinimumSize {
+		if len(v) < header.ICMPv6NeighborSolicitMinimumSize || !isNDPValid() {
 			received.Invalid.Increment()
 			return
 		}
@@ -253,7 +240,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	case header.ICMPv6NeighborAdvert:
 		received.NeighborAdvert.Increment()
-		if len(v) < header.ICMPv6NeighborAdvertSize {
+		if len(v) < header.ICMPv6NeighborAdvertSize || !isNDPValid() {
 			received.Invalid.Increment()
 			return
 		}
@@ -355,8 +342,20 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	case header.ICMPv6RouterSolicit:
 		received.RouterSolicit.Increment()
+		if !isNDPValid() {
+			received.Invalid.Increment()
+			return
+		}
 
 	case header.ICMPv6RouterAdvert:
+		received.RouterAdvert.Increment()
+
+		p := h.NDPPayload()
+		if len(p) < header.NDPRAMinimumSize || !isNDPValid() {
+			received.Invalid.Increment()
+			return
+		}
+
 		routerAddr := iph.SourceAddress()
 
 		//
@@ -370,16 +369,6 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 			return
 		}
 
-		p := h.NDPPayload()
-
-		// Is the NDP payload of sufficient size to hold a Router
-		// Advertisement?
-		if len(p) < header.NDPRAMinimumSize {
-			// ...No, silently drop the packet.
-			received.Invalid.Increment()
-			return
-		}
-
 		ra := header.NDPRouterAdvert(p)
 		opts := ra.Options()
 
@@ -395,8 +384,6 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 		// as RFC 4861 section 6.1.2 is concerned.
 		//
 
-		received.RouterAdvert.Increment()
-
 		// Tell the NIC to handle the RA.
 		stack := r.Stack()
 		rxNICID := r.NICID()
@@ -404,6 +391,10 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	case header.ICMPv6RedirectMsg:
 		received.RedirectMsg.Increment()
+		if !isNDPValid() {
+			received.Invalid.Increment()
+			return
+		}
 
 	default:
 		received.Invalid.Increment()
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index f924ed9e1..3b05e8062 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -381,44 +381,48 @@ func TestHopLimitValidation(t *testing.T) {
 			pkt.SetType(typ.typ)
 			pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView()))
 
+			// Rx count of the NDP message should initially be 0.
+			if got := typStat.Value(); got != 0 {
+				t.Errorf("got %s = %d, want = 0", typ.name, got)
+			}
+
 			// Invalid count should initially be 0.
 			if got := invalid.Value(); got != 0 {
-				t.Fatalf("got invalid = %d, want = 0", got)
+				t.Errorf("got invalid = %d, want = 0", got)
 			}
 
-			// Should not have received any ICMPv6 packets with
-			// type = typ.typ.
-			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			if t.Failed() {
+				t.FailNow()
 			}
 
-			// Receive the NDP packet with an invalid hop limit
-			// value.
+			// Receive the NDP packet with an invalid hop limit.
 			handleIPv6Payload(hdr, header.NDPHopLimit-1, ep, &r)
 
+			// Rx count of the NDP packet should have increased.
+			if got := typStat.Value(); got != 1 {
+				t.Errorf("got %s = %d, want = 1", typ.name, got)
+			}
+
 			// Invalid count should have increased.
 			if got := invalid.Value(); got != 1 {
-				t.Fatalf("got invalid = %d, want = 1", got)
+				t.Errorf("got invalid = %d, want = 1", got)
 			}
 
-			// Rx count of NDP packet of type typ.typ should not
-			// have increased.
-			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			if t.Failed() {
+				t.FailNow()
 			}
 
 			// Receive the NDP packet with a valid hop limit value.
 			handleIPv6Payload(hdr, header.NDPHopLimit, ep, &r)
 
-			// Rx count of NDP packet of type typ.typ should have
-			// increased.
-			if got := typStat.Value(); got != 1 {
-				t.Fatalf("got %s = %d, want = 1", typ.name, got)
+			// Rx count of the NDP packet should have increased.
+			if got := typStat.Value(); got != 2 {
+				t.Errorf("got %s = %d, want = 2", typ.name, got)
 			}
 
 			// Invalid count should not have increased again.
 			if got := invalid.Value(); got != 1 {
-				t.Fatalf("got invalid = %d, want = 1", got)
+				t.Errorf("got invalid = %d, want = 1", got)
 			}
 		})
 	}
@@ -592,21 +596,18 @@ func TestRouterAdvertValidation(t *testing.T) {
 				Data: hdr.View().ToVectorisedView(),
 			})
 
+			if got := rxRA.Value(); got != 1 {
+				t.Fatalf("got rxRA = %d, want = 1", got)
+			}
+
 			if test.expectedSuccess {
 				if got := invalid.Value(); got != 0 {
 					t.Fatalf("got invalid = %d, want = 0", got)
 				}
-				if got := rxRA.Value(); got != 1 {
-					t.Fatalf("got rxRA = %d, want = 1", got)
-				}
-
 			} else {
 				if got := invalid.Value(); got != 1 {
 					t.Fatalf("got invalid = %d, want = 1", got)
 				}
-				if got := rxRA.Value(); got != 0 {
-					t.Fatalf("got rxRA = %d, want = 0", got)
-				}
 			}
 		})
 	}
-- 
cgit v1.2.3


From dbc507dc5cfde2f69a94a58fcb2744ef0899ce7e Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 2 Apr 2020 16:57:08 -0700
Subject: Add equivalents to FMODE_PREAD/PWRITE to VFS2.

This is mostly required for PipeTest_OffsetCalls.

The options are DenyPRead/PWrite rather than AllowPRead/PWrite since, in Linux
terms, fs/open.c:do_dentry_open sets FMODE_PREAD|FMODE_PWRITE unconditionally
(although it allows filesystem implementations of open to unset these flags),
so they're set for most FDs; it's usually FDs created outside of open(2) that
don't get them, e.g.:

- Syscall-created pipes (fs/pipe.c:create_pipe_files =>
  fs/file_table.c:alloc_file_pseudo)

- Epoll instances (fs/eventpoll.c:do_epoll_create =>
  fs/anon_inodes.c:anon_inode_getfile => alloc_file_pseudo)

- Sockets (net/socket.c:sock_alloc_file => alloc_file_pseudo)

This CL adds the flags to epoll instances; a subsequent CL reworks the VFS2
implementation of pipe FDs to be filesystem-independent and adds the flags
there, and sockets aren't implemented yet.

Updates #1035

PiperOrigin-RevId: 304506434
---
 pkg/sentry/vfs/epoll.go            |  2 ++
 pkg/sentry/vfs/file_description.go | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 3da45d744..8e0b40841 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -99,6 +99,8 @@ func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
 		interest: make(map[epollInterestKey]*epollInterest),
 	}
 	if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+		DenyPRead:         true,
+		DenyPWrite:        true,
 		UseDentryMetadata: true,
 	}); err != nil {
 		return nil, err
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 8ee549dc2..5df4bbf45 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -84,6 +84,13 @@ type FileDescriptionOptions struct {
 	// usually only the case if O_DIRECT would actually have an effect.
 	AllowDirectIO bool
 
+	// If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
+	DenyPRead bool
+
+	// If DenyPWrite is true, calls to FileDescription.PWrite() return
+	// ESPIPE.
+	DenyPWrite bool
+
 	// If UseDentryMetadata is true, calls to FileDescription methods that
 	// interact with file and filesystem metadata (Stat, SetStat, StatFS,
 	// Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
@@ -306,6 +313,7 @@ type FileDescriptionImpl interface {
 	// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
 	//
 	// Preconditions: The FileDescription was opened for reading.
+	// FileDescriptionOptions.DenyPRead == false.
 	PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
 
 	// Read is similar to PRead, but does not specify an offset.
@@ -337,6 +345,7 @@ type FileDescriptionImpl interface {
 	// EOPNOTSUPP.
 	//
 	// Preconditions: The FileDescription was opened for writing.
+	// FileDescriptionOptions.DenyPWrite == false.
 	PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
 
 	// Write is similar to PWrite, but does not specify an offset, which is
@@ -515,6 +524,9 @@ func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
 // offset, and returns the number of bytes read. PRead is permitted to return
 // partial reads with a nil error.
 func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	if fd.opts.DenyPRead {
+		return 0, syserror.ESPIPE
+	}
 	if !fd.readable {
 		return 0, syserror.EBADF
 	}
@@ -533,6 +545,9 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt
 // offset, and returns the number of bytes written. PWrite is permitted to
 // return partial writes with a nil error.
 func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	if fd.opts.DenyPWrite {
+		return 0, syserror.ESPIPE
+	}
 	if !fd.writable {
 		return 0, syserror.EBADF
 	}
-- 
cgit v1.2.3


From 5b2396d244ed6283d928a72bdd4cc58d78ef3175 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 2 Apr 2020 17:06:19 -0700
Subject: Fix typo in TODO comments.

PiperOrigin-RevId: 304508083
---
 pkg/sentry/fs/proc/mounts.go                | 3 ++-
 pkg/sentry/fsimpl/tmpfs/filesystem.go       | 6 +++---
 pkg/sentry/fsimpl/tmpfs/stat_test.go        | 4 ++--
 pkg/sentry/fsimpl/tmpfs/tmpfs.go            | 2 +-
 pkg/sentry/socket/netstack/netstack.go      | 2 +-
 pkg/sentry/vfs/mount.go                     | 3 ++-
 test/syscalls/linux/socket_inet_loopback.cc | 2 +-
 7 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go
index 94deb553b..1fc9c703c 100644
--- a/pkg/sentry/fs/proc/mounts.go
+++ b/pkg/sentry/fs/proc/mounts.go
@@ -170,7 +170,8 @@ func superBlockOpts(mountPath string, msrc *fs.MountSource) string {
 	// NOTE(b/147673608): If the mount is a cgroup, we also need to include
 	// the cgroup name in the options. For now we just read that from the
 	// path.
-	// TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we
+	//
+	// TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we
 	// should get this value from the cgroup itself, and not rely on the
 	// path.
 	if msrc.FilesystemType == "cgroup" {
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index e678ecc37..4cf27bf13 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -57,7 +57,7 @@ afterSymlink:
 	}
 	next := nextVFSD.Impl().(*dentry)
 	if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
-		// TODO(gvisor.dev/issues/1197): Symlink traversals updates
+		// TODO(gvisor.dev/issue/1197): Symlink traversals updates
 		// access time.
 		if err := rp.HandleSymlink(symlink.target); err != nil {
 			return nil, err
@@ -515,7 +515,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		oldParent.inode.decLinksLocked()
 		newParent.inode.incLinksLocked()
 	}
-	// TODO(gvisor.dev/issues/1197): Update timestamps and parent directory
+	// TODO(gvisor.dev/issue/1197): Update timestamps and parent directory
 	// sizes.
 	vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
 	return nil
@@ -600,7 +600,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 	if err != nil {
 		return linux.Statfs{}, err
 	}
-	// TODO(gvisor.dev/issues/1197): Actually implement statfs.
+	// TODO(gvisor.dev/issue/1197): Actually implement statfs.
 	return linux.Statfs{}, syserror.ENOSYS
 }
 
diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go
index ebe035dee..3e02e7190 100644
--- a/pkg/sentry/fsimpl/tmpfs/stat_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go
@@ -29,7 +29,7 @@ func TestStatAfterCreate(t *testing.T) {
 	mode := linux.FileMode(0644)
 
 	// Run with different file types.
-	// TODO(gvisor.dev/issues/1197): Also test symlinks and sockets.
+	// TODO(gvisor.dev/issue/1197): Also test symlinks and sockets.
 	for _, typ := range []string{"file", "dir", "pipe"} {
 		t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
 			var (
@@ -169,7 +169,7 @@ func TestSetStat(t *testing.T) {
 	mode := linux.FileMode(0644)
 
 	// Run with different file types.
-	// TODO(gvisor.dev/issues/1197): Also test symlinks and sockets.
+	// TODO(gvisor.dev/issue/1197): Also test symlinks and sockets.
 	for _, typ := range []string{"file", "dir", "pipe"} {
 		t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
 			var (
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 8bc8818c0..54da15849 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -315,7 +315,7 @@ func (i *inode) statTo(stat *linux.Statx) {
 	stat.Atime = linux.NsecToStatxTimestamp(i.atime)
 	stat.Ctime = linux.NsecToStatxTimestamp(i.ctime)
 	stat.Mtime = linux.NsecToStatxTimestamp(i.mtime)
-	// TODO(gvisor.dev/issues/1197): Device number.
+	// TODO(gvisor.dev/issue/1197): Device number.
 	switch impl := i.impl.(type) {
 	case *regularFile:
 		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index f14c336b9..06a5b53bc 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -663,7 +663,7 @@ func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error
 // This is a hack to work around the fact that both IPv4 and IPv6 ANY are
 // represented by the empty string.
 //
-// TODO(gvisor.dev/issues/1556): remove this function.
+// TODO(gvisor.dev/issue/1556): remove this function.
 func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
 	if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
 		addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00"
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 7792eb1a0..1b8ecc415 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -835,7 +835,8 @@ func superBlockOpts(mountPath string, mnt *Mount) string {
 	// NOTE(b/147673608): If the mount is a cgroup, we also need to include
 	// the cgroup name in the options. For now we just read that from the
 	// path.
-	// TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we
+	//
+	// TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we
 	// should get this value from the cgroup itself, and not rely on the
 	// path.
 	if mnt.fs.FilesystemType().Name() == "cgroup" {
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 16888de2a..2ffc86382 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -234,7 +234,7 @@ TEST_P(DualStackSocketTest, AddressOperations) {
   }
 }
 
-// TODO(gvisor.dev/issues/1556): uncomment V4MappedAny.
+// TODO(gvisor.dev/issue/1556): uncomment V4MappedAny.
 INSTANTIATE_TEST_SUITE_P(
     All, DualStackSocketTest,
     ::testing::Combine(
-- 
cgit v1.2.3


From 4582a2f188953d34591aef1a479d19d9be8f640f Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 2 Apr 2020 18:29:09 -0700
Subject: Drop NDP messages with fragment extension header

As per RFC 6980 section 5, nodes MUST silently ignore NDP messages if
the packet carrying them include an IPv6 Fragmentation Header.

Test: ipv6_test.TestNDPValidation
PiperOrigin-RevId: 304519379
---
 pkg/tcpip/header/ipv6_extension_headers.go |   4 +
 pkg/tcpip/network/ipv6/icmp.go             |   7 +-
 pkg/tcpip/network/ipv6/ipv6.go             |   5 +-
 pkg/tcpip/network/ipv6/ndp_test.go         | 157 +++++++++++++++++------------
 4 files changed, 108 insertions(+), 65 deletions(-)

diff --git a/pkg/tcpip/header/ipv6_extension_headers.go b/pkg/tcpip/header/ipv6_extension_headers.go
index 1b6c3f328..82485ed6a 100644
--- a/pkg/tcpip/header/ipv6_extension_headers.go
+++ b/pkg/tcpip/header/ipv6_extension_headers.go
@@ -62,6 +62,10 @@ const (
 	// within an IPv6RoutingExtHdr.
 	ipv6RoutingExtHdrSegmentsLeftIdx = 1
 
+	// IPv6FragmentExtHdrLength is the length of an IPv6 extension header, in
+	// bytes.
+	IPv6FragmentExtHdrLength = 8
+
 	// ipv6FragmentExtHdrFragmentOffsetOffset is the offset to the start of the
 	// Fragment Offset field within an IPv6FragmentExtHdr.
 	ipv6FragmentExtHdrFragmentOffsetOffset = 0
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 81e6f4d67..6d2d2c034 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -62,7 +62,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
-func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.PacketBuffer) {
+func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.PacketBuffer, hasFragmentHeader bool) {
 	stats := r.Stats().ICMP
 	sent := stats.V6PacketsSent
 	received := stats.V6PacketsReceived
@@ -91,7 +91,10 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 		// 8.1, nodes MUST silently drop NDP packets where the Hop Limit field
 		// in the IPv6 header is not set to 255, or the ICMPv6 Code field is not
 		// set to 0.
-		return iph.HopLimit() == header.NDPHopLimit && h.Code() == 0
+		//
+		// As per RFC 6980 section 5, nodes MUST silently drop NDP messages if the
+		// packet includes a fragmentation header.
+		return !hasFragmentHeader && iph.HopLimit() == header.NDPHopLimit && h.Code() == 0
 	}
 
 	// TODO(b/112892170): Meaningfully handle all ICMP types.
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 685239017..b462b8604 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -185,6 +185,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 	pkt.Data.CapLength(int(h.PayloadLength()))
 
 	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), pkt.Data)
+	hasFragmentHeader := false
 
 	for firstHeader := true; ; firstHeader = false {
 		extHdr, done, err := it.Next()
@@ -257,6 +258,8 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 			}
 
 		case header.IPv6FragmentExtHdr:
+			hasFragmentHeader = true
+
 			fragmentOffset := extHdr.FragmentOffset()
 			more := extHdr.More()
 			if !more && fragmentOffset == 0 {
@@ -344,7 +347,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 			pkt.Data = extHdr.Buf
 
 			if p := tcpip.TransportProtocolNumber(extHdr.Identifier); p == header.ICMPv6ProtocolNumber {
-				e.handleICMP(r, headerView, pkt)
+				e.handleICMP(r, headerView, pkt, hasFragmentHeader)
 			} else {
 				r.Stats().IP.PacketsDelivered.Increment()
 				// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 error
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index 3b05e8062..b113aaacc 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -276,9 +276,7 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
 	}
 }
 
-// TestHopLimitValidation is a test that makes sure that NDP packets are only
-// received if their IP header's hop limit is set to 255.
-func TestHopLimitValidation(t *testing.T) {
+func TestNDPValidation(t *testing.T) {
 	setup := func(t *testing.T) (*stack.Stack, stack.NetworkEndpoint, stack.Route) {
 		t.Helper()
 
@@ -294,12 +292,19 @@ func TestHopLimitValidation(t *testing.T) {
 		return s, ep, r
 	}
 
-	handleIPv6Payload := func(hdr buffer.Prependable, hopLimit uint8, ep stack.NetworkEndpoint, r *stack.Route) {
+	handleIPv6Payload := func(hdr buffer.Prependable, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint, r *stack.Route) {
+		nextHdr := uint8(header.ICMPv6ProtocolNumber)
+		if atomicFragment {
+			bytes := hdr.Prepend(header.IPv6FragmentExtHdrLength)
+			bytes[0] = nextHdr
+			nextHdr = uint8(header.IPv6FragmentExtHdrIdentifier)
+		}
+
 		payloadLength := hdr.UsedLength()
 		ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
 		ip.Encode(&header.IPv6Fields{
 			PayloadLength: uint16(payloadLength),
-			NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+			NextHeader:    nextHdr,
 			HopLimit:      hopLimit,
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
@@ -364,65 +369,93 @@ func TestHopLimitValidation(t *testing.T) {
 		},
 	}
 
+	subTests := []struct {
+		name           string
+		atomicFragment bool
+		hopLimit       uint8
+		code           uint8
+		valid          bool
+	}{
+		{
+			name:           "Valid",
+			atomicFragment: false,
+			hopLimit:       header.NDPHopLimit,
+			code:           0,
+			valid:          true,
+		},
+		{
+			name:           "Fragmented",
+			atomicFragment: true,
+			hopLimit:       header.NDPHopLimit,
+			code:           0,
+			valid:          false,
+		},
+		{
+			name:           "Invalid hop limit",
+			atomicFragment: false,
+			hopLimit:       header.NDPHopLimit - 1,
+			code:           0,
+			valid:          false,
+		},
+		{
+			name:           "Invalid ICMPv6 code",
+			atomicFragment: false,
+			hopLimit:       header.NDPHopLimit,
+			code:           1,
+			valid:          false,
+		},
+	}
+
 	for _, typ := range types {
 		t.Run(typ.name, func(t *testing.T) {
-			s, ep, r := setup(t)
-			defer r.Release()
-
-			stats := s.Stats().ICMP.V6PacketsReceived
-			invalid := stats.Invalid
-			typStat := typ.statCounter(stats)
-
-			extraDataLen := len(typ.extraData)
-			hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen)
-			extraData := buffer.View(hdr.Prepend(extraDataLen))
-			copy(extraData, typ.extraData)
-			pkt := header.ICMPv6(hdr.Prepend(typ.size))
-			pkt.SetType(typ.typ)
-			pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView()))
-
-			// Rx count of the NDP message should initially be 0.
-			if got := typStat.Value(); got != 0 {
-				t.Errorf("got %s = %d, want = 0", typ.name, got)
-			}
-
-			// Invalid count should initially be 0.
-			if got := invalid.Value(); got != 0 {
-				t.Errorf("got invalid = %d, want = 0", got)
-			}
-
-			if t.Failed() {
-				t.FailNow()
-			}
-
-			// Receive the NDP packet with an invalid hop limit.
-			handleIPv6Payload(hdr, header.NDPHopLimit-1, ep, &r)
-
-			// Rx count of the NDP packet should have increased.
-			if got := typStat.Value(); got != 1 {
-				t.Errorf("got %s = %d, want = 1", typ.name, got)
-			}
-
-			// Invalid count should have increased.
-			if got := invalid.Value(); got != 1 {
-				t.Errorf("got invalid = %d, want = 1", got)
-			}
-
-			if t.Failed() {
-				t.FailNow()
-			}
-
-			// Receive the NDP packet with a valid hop limit value.
-			handleIPv6Payload(hdr, header.NDPHopLimit, ep, &r)
-
-			// Rx count of the NDP packet should have increased.
-			if got := typStat.Value(); got != 2 {
-				t.Errorf("got %s = %d, want = 2", typ.name, got)
-			}
-
-			// Invalid count should not have increased again.
-			if got := invalid.Value(); got != 1 {
-				t.Errorf("got invalid = %d, want = 1", got)
+			for _, test := range subTests {
+				t.Run(test.name, func(t *testing.T) {
+					s, ep, r := setup(t)
+					defer r.Release()
+
+					stats := s.Stats().ICMP.V6PacketsReceived
+					invalid := stats.Invalid
+					typStat := typ.statCounter(stats)
+
+					extraDataLen := len(typ.extraData)
+					hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen + header.IPv6FragmentExtHdrLength)
+					extraData := buffer.View(hdr.Prepend(extraDataLen))
+					copy(extraData, typ.extraData)
+					pkt := header.ICMPv6(hdr.Prepend(typ.size))
+					pkt.SetType(typ.typ)
+					pkt.SetCode(test.code)
+					pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView()))
+
+					// Rx count of the NDP message should initially be 0.
+					if got := typStat.Value(); got != 0 {
+						t.Errorf("got %s = %d, want = 0", typ.name, got)
+					}
+
+					// Invalid count should initially be 0.
+					if got := invalid.Value(); got != 0 {
+						t.Errorf("got invalid = %d, want = 0", got)
+					}
+
+					if t.Failed() {
+						t.FailNow()
+					}
+
+					handleIPv6Payload(hdr, test.hopLimit, test.atomicFragment, ep, &r)
+
+					// Rx count of the NDP packet should have increased.
+					if got := typStat.Value(); got != 1 {
+						t.Errorf("got %s = %d, want = 1", typ.name, got)
+					}
+
+					want := uint64(0)
+					if !test.valid {
+						// Invalid count should have increased.
+						want = 1
+					}
+					if got := invalid.Value(); got != want {
+						t.Errorf("got invalid = %d, want = %d", got, want)
+					}
+				})
 			}
 		})
 	}
-- 
cgit v1.2.3


From dd3bc499970c22ebbd270030b4564e6b8e4e929e Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 2 Apr 2020 19:37:41 -0700
Subject: Add NAME_MAX checks and update file times

NAME_MAX should be enforced per filesystem implementation
because other file systems may not have the same restriction.

Gofer filesystem now keeps a reference to the kernel clock to
avoid lookup in the Context on file access to update atime.

Update access, modification, and status change times in tmpfs.

Updates #1197, #1198.

PiperOrigin-RevId: 304527148
---
 pkg/sentry/fsimpl/gofer/directory.go    |  7 +++-
 pkg/sentry/fsimpl/gofer/filesystem.go   | 13 ++++++--
 pkg/sentry/fsimpl/gofer/gofer.go        | 22 ++++++-------
 pkg/sentry/fsimpl/gofer/regular_file.go |  7 ++--
 pkg/sentry/fsimpl/gofer/special_file.go |  4 +--
 pkg/sentry/fsimpl/gofer/symlink.go      |  2 +-
 pkg/sentry/fsimpl/gofer/time.go         | 39 +++++++++++-----------
 pkg/sentry/fsimpl/kernfs/filesystem.go  |  9 ++++++
 pkg/sentry/fsimpl/tmpfs/directory.go    |  2 ++
 pkg/sentry/fsimpl/tmpfs/filesystem.go   | 25 +++++++++++++--
 pkg/sentry/fsimpl/tmpfs/regular_file.go |  4 ++-
 pkg/sentry/fsimpl/tmpfs/tmpfs.go        | 57 ++++++++++++++++++++++++++++++---
 12 files changed, 139 insertions(+), 52 deletions(-)

diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 5dbfc6250..49d9f859b 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -56,14 +56,19 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	fd.mu.Lock()
 	defer fd.mu.Unlock()
 
+	d := fd.dentry()
 	if fd.dirents == nil {
-		ds, err := fd.dentry().getDirents(ctx)
+		ds, err := d.getDirents(ctx)
 		if err != nil {
 			return err
 		}
 		fd.dirents = ds
 	}
 
+	if d.fs.opts.interop != InteropModeShared {
+		d.touchAtime(fd.vfsfd.Mount())
+	}
+
 	for fd.off < int64(len(fd.dirents)) {
 		if err := cb.Handle(fd.dirents[fd.off]); err != nil {
 			return err
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 269624362..305228bda 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -356,7 +356,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if err := create(parent, name); err != nil {
 		return err
 	}
-	parent.touchCMtime(ctx)
+	if fs.opts.interop != InteropModeShared {
+		parent.touchCMtime()
+	}
 	delete(parent.negativeChildren, name)
 	parent.dirents = nil
 	return nil
@@ -454,7 +456,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 		return err
 	}
 	if fs.opts.interop != InteropModeShared {
-		parent.touchCMtime(ctx)
+		parent.touchCMtime()
 		if dir {
 			parent.decLinks()
 		}
@@ -802,7 +804,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 	d.IncRef() // reference held by child on its parent d
 	d.vfsd.InsertChild(&child.vfsd, name)
 	if d.fs.opts.interop != InteropModeShared {
-		d.touchCMtime(ctx)
 		delete(d.negativeChildren, name)
 		d.dirents = nil
 	}
@@ -834,6 +835,9 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		}
 		childVFSFD = &fd.vfsfd
 	}
+	if d.fs.opts.interop != InteropModeShared {
+		d.touchCMtime()
+	}
 	return childVFSFD, nil
 }
 
@@ -975,6 +979,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 			oldParent.decLinks()
 			newParent.incLinks()
 		}
+		oldParent.touchCMtime()
+		newParent.touchCMtime()
+		renamed.touchCtime()
 	}
 	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD)
 	return nil
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 8e41b6b1c..adee8bb60 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -44,6 +44,7 @@ import (
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -72,6 +73,9 @@ type filesystem struct {
 	// client is the client used by this filesystem. client is immutable.
 	client *p9.Client
 
+	// clock is a realtime clock used to set timestamps in file operations.
+	clock ktime.Clock
+
 	// uid and gid are the effective KUID and KGID of the filesystem's creator,
 	// and are used as the owner and group for files that don't specify one.
 	// uid and gid are immutable.
@@ -376,6 +380,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		uid:            creds.EffectiveKUID,
 		gid:            creds.EffectiveKGID,
 		client:         client,
+		clock:          ktime.RealtimeClockFromContext(ctx),
 		dentries:       make(map[*dentry]struct{}),
 		specialFileFDs: make(map[*specialFileFD]struct{}),
 	}
@@ -779,10 +784,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 		// data, so there's no cache to truncate either.)
 		return nil
 	}
-	now, haveNow := nowFromContext(ctx)
-	if !haveNow {
-		ctx.Warningf("gofer.dentry.setStat: current time not available")
-	}
+	now := d.fs.clock.Now().Nanoseconds()
 	if stat.Mask&linux.STATX_MODE != 0 {
 		atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
 	}
@@ -794,25 +796,19 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	}
 	if setLocalAtime {
 		if stat.Atime.Nsec == linux.UTIME_NOW {
-			if haveNow {
-				atomic.StoreInt64(&d.atime, now)
-			}
+			atomic.StoreInt64(&d.atime, now)
 		} else {
 			atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
 		}
 	}
 	if setLocalMtime {
 		if stat.Mtime.Nsec == linux.UTIME_NOW {
-			if haveNow {
-				atomic.StoreInt64(&d.mtime, now)
-			}
+			atomic.StoreInt64(&d.mtime, now)
 		} else {
 			atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
 		}
 	}
-	if haveNow {
-		atomic.StoreInt64(&d.ctime, now)
-	}
+	atomic.StoreInt64(&d.ctime, now)
 	if stat.Mask&linux.STATX_SIZE != 0 {
 		d.dataMu.Lock()
 		oldSize := d.size
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 3593eb1d5..857f7c74e 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -104,7 +104,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 	putDentryReadWriter(rw)
 	if d.fs.opts.interop != InteropModeShared {
 		// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
-		d.touchAtime(ctx, fd.vfsfd.Mount())
+		d.touchAtime(fd.vfsfd.Mount())
 	}
 	return n, err
 }
@@ -139,10 +139,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 		// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
 		// file_update_time(). This is d.touchCMtime(), but without locking
 		// d.metadataMu (recursively).
-		if now, ok := nowFromContext(ctx); ok {
-			atomic.StoreInt64(&d.mtime, now)
-			atomic.StoreInt64(&d.ctime, now)
-		}
+		d.touchCMtimeLocked()
 	}
 	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
 		// Write dirty cached pages that will be touched by the write back to
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 274f7346f..507e0e276 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -76,7 +76,7 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 	// hold here since specialFileFD doesn't client-cache data. Just buffer the
 	// read instead.
 	if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
-		d.touchAtime(ctx, fd.vfsfd.Mount())
+		d.touchAtime(fd.vfsfd.Mount())
 	}
 	buf := make([]byte, dst.NumBytes())
 	n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
@@ -117,7 +117,7 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 
 	// Do a buffered write. See rationale in PRead.
 	if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
-		d.touchCMtime(ctx)
+		d.touchCMtime()
 	}
 	buf := make([]byte, src.NumBytes())
 	// Don't do partial writes if we get a partial read from src.
diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go
index adf43be60..2ec819f86 100644
--- a/pkg/sentry/fsimpl/gofer/symlink.go
+++ b/pkg/sentry/fsimpl/gofer/symlink.go
@@ -27,7 +27,7 @@ func (d *dentry) isSymlink() bool {
 // Precondition: d.isSymlink().
 func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
 	if d.fs.opts.interop != InteropModeShared {
-		d.touchAtime(ctx, mnt)
+		d.touchAtime(mnt)
 		d.dataMu.Lock()
 		if d.haveTarget {
 			target := d.target
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index 7598ec6a8..2608e7e1d 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -18,8 +18,6 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
@@ -38,23 +36,12 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
 	}
 }
 
-func nowFromContext(ctx context.Context) (int64, bool) {
-	if clock := ktime.RealtimeClockFromContext(ctx); clock != nil {
-		return clock.Now().Nanoseconds(), true
-	}
-	return 0, false
-}
-
 // Preconditions: fs.interop != InteropModeShared.
-func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) {
+func (d *dentry) touchAtime(mnt *vfs.Mount) {
 	if err := mnt.CheckBeginWrite(); err != nil {
 		return
 	}
-	now, ok := nowFromContext(ctx)
-	if !ok {
-		mnt.EndWrite()
-		return
-	}
+	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
 	atomic.StoreInt64(&d.atime, now)
 	d.metadataMu.Unlock()
@@ -63,13 +50,25 @@ func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) {
 
 // Preconditions: fs.interop != InteropModeShared. The caller has successfully
 // called vfs.Mount.CheckBeginWrite().
-func (d *dentry) touchCMtime(ctx context.Context) {
-	now, ok := nowFromContext(ctx)
-	if !ok {
-		return
-	}
+func (d *dentry) touchCtime() {
+	now := d.fs.clock.Now().Nanoseconds()
+	d.metadataMu.Lock()
+	atomic.StoreInt64(&d.ctime, now)
+	d.metadataMu.Unlock()
+}
+
+// Preconditions: fs.interop != InteropModeShared. The caller has successfully
+// called vfs.Mount.CheckBeginWrite().
+func (d *dentry) touchCMtime() {
+	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
 	atomic.StoreInt64(&d.mtime, now)
 	atomic.StoreInt64(&d.ctime, now)
 	d.metadataMu.Unlock()
 }
+
+func (d *dentry) touchCMtimeLocked() {
+	now := d.fs.clock.Now().Nanoseconds()
+	atomic.StoreInt64(&d.mtime, now)
+	atomic.StoreInt64(&d.ctime, now)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index a429fa23d..89f5da3d4 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -63,6 +63,9 @@ afterSymlink:
 		rp.Advance()
 		return nextVFSD, nil
 	}
+	if len(name) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
+	}
 	d.dirMu.Lock()
 	nextVFSD, err := rp.ResolveChild(vfsd, name)
 	if err != nil {
@@ -191,6 +194,9 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
 	if pc == "." || pc == ".." {
 		return "", syserror.EEXIST
 	}
+	if len(pc) > linux.NAME_MAX {
+		return "", syserror.ENAMETOOLONG
+	}
 	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
 	if err != nil {
 		return "", err
@@ -433,6 +439,9 @@ afterTrailingSymlink:
 	if pc == "." || pc == ".." {
 		return nil, syserror.EISDIR
 	}
+	if len(pc) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
+	}
 	// Determine whether or not we need to create a file.
 	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
 	if err != nil {
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 37c75ab64..45712c9b9 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -68,6 +68,8 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
 
+	fd.inode().touchAtime(fd.vfsfd.Mount())
+
 	if fd.off == 0 {
 		if err := cb.Handle(vfs.Dirent{
 			Name:    ".",
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 4cf27bf13..1978af69c 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -46,6 +46,9 @@ func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
 		return nil, err
 	}
 afterSymlink:
+	if len(rp.Component()) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
+	}
 	nextVFSD, err := rp.ResolveComponent(&d.vfsd)
 	if err != nil {
 		return nil, err
@@ -133,6 +136,9 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
 	if name == "." || name == ".." {
 		return syserror.EEXIST
 	}
+	if len(name) > linux.NAME_MAX {
+		return syserror.ENAMETOOLONG
+	}
 	// Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
 	// because if the child exists we want to return EEXIST immediately instead
 	// of attempting symlink/mount traversal.
@@ -153,7 +159,11 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
 		return err
 	}
 	defer mnt.EndWrite()
-	return create(parent, name)
+	if err := create(parent, name); err != nil {
+		return err
+	}
+	parent.inode.touchCMtime()
+	return nil
 }
 
 // AccessAt implements vfs.Filesystem.Impl.AccessAt.
@@ -328,7 +338,12 @@ afterTrailingSymlink:
 		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
 		parent.vfsd.InsertChild(&child.vfsd, name)
 		parent.inode.impl.(*directory).childList.PushBack(child)
-		return child.open(ctx, rp, &opts, true)
+		fd, err := child.open(ctx, rp, &opts, true)
+		if err != nil {
+			return nil, err
+		}
+		parent.inode.touchCMtime()
+		return fd, nil
 	}
 	if err != nil {
 		return nil, err
@@ -398,6 +413,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
 	if !ok {
 		return "", syserror.EINVAL
 	}
+	symlink.inode.touchAtime(rp.Mount())
 	return symlink.target, nil
 }
 
@@ -515,6 +531,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		oldParent.inode.decLinksLocked()
 		newParent.inode.incLinksLocked()
 	}
+	oldParent.inode.touchCMtime()
+	newParent.inode.touchCMtime()
+	renamed.inode.touchCtime()
 	// TODO(gvisor.dev/issue/1197): Update timestamps and parent directory
 	// sizes.
 	vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
@@ -565,6 +584,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	parent.inode.decLinksLocked() // from child's ".."
 	child.inode.decLinksLocked()
 	vfsObj.CommitDeleteDentry(childVFSD)
+	parent.inode.touchCMtime()
 	return nil
 }
 
@@ -654,6 +674,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	parent.inode.impl.(*directory).childList.Remove(child)
 	child.inode.decLinksLocked()
 	vfsObj.CommitDeleteDentry(childVFSD)
+	parent.inode.touchCMtime()
 	return nil
 }
 
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 26cd65605..57e5e28ec 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -286,7 +286,8 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 	rw := getRegularFileReadWriter(f, offset)
 	n, err := dst.CopyOutFrom(ctx, rw)
 	putRegularFileReadWriter(rw)
-	return int64(n), err
+	fd.inode().touchAtime(fd.vfsfd.Mount())
+	return n, err
 }
 
 // Read implements vfs.FileDescriptionImpl.Read.
@@ -323,6 +324,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 	f.inode.mu.Lock()
 	rw := getRegularFileReadWriter(f, offset)
 	n, err := src.CopyInTo(ctx, rw)
+	fd.inode().touchCMtimeLocked()
 	f.inode.mu.Unlock()
 	putRegularFileReadWriter(rw)
 	return n, err
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 54da15849..ad47288f8 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -385,28 +385,41 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 			return syserror.EINVAL
 		}
 	}
+	now := i.clock.Now().Nanoseconds()
 	if mask&linux.STATX_ATIME != 0 {
-		atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped())
+		if stat.Atime.Nsec == linux.UTIME_NOW {
+			atomic.StoreInt64(&i.atime, now)
+		} else {
+			atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped())
+		}
 		needsCtimeBump = true
 	}
 	if mask&linux.STATX_MTIME != 0 {
-		atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped())
+		if stat.Mtime.Nsec == linux.UTIME_NOW {
+			atomic.StoreInt64(&i.mtime, now)
+		} else {
+			atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped())
+		}
 		needsCtimeBump = true
 		// Ignore the mtime bump, since we just set it ourselves.
 		needsMtimeBump = false
 	}
 	if mask&linux.STATX_CTIME != 0 {
-		atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped())
+		if stat.Ctime.Nsec == linux.UTIME_NOW {
+			atomic.StoreInt64(&i.ctime, now)
+		} else {
+			atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped())
+		}
 		// Ignore the ctime bump, since we just set it ourselves.
 		needsCtimeBump = false
 	}
-	now := i.clock.Now().Nanoseconds()
 	if needsMtimeBump {
 		atomic.StoreInt64(&i.mtime, now)
 	}
 	if needsCtimeBump {
 		atomic.StoreInt64(&i.ctime, now)
 	}
+
 	i.mu.Unlock()
 	return nil
 }
@@ -484,6 +497,42 @@ func (i *inode) isDir() bool {
 	return linux.FileMode(i.mode).FileType() == linux.S_IFDIR
 }
 
+func (i *inode) touchAtime(mnt *vfs.Mount) {
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return
+	}
+	now := i.clock.Now().Nanoseconds()
+	i.mu.Lock()
+	atomic.StoreInt64(&i.atime, now)
+	i.mu.Unlock()
+	mnt.EndWrite()
+}
+
+// Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
+func (i *inode) touchCtime() {
+	now := i.clock.Now().Nanoseconds()
+	i.mu.Lock()
+	atomic.StoreInt64(&i.ctime, now)
+	i.mu.Unlock()
+}
+
+// Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
+func (i *inode) touchCMtime() {
+	now := i.clock.Now().Nanoseconds()
+	i.mu.Lock()
+	atomic.StoreInt64(&i.mtime, now)
+	atomic.StoreInt64(&i.ctime, now)
+	i.mu.Unlock()
+}
+
+// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds
+// inode.mu.
+func (i *inode) touchCMtimeLocked() {
+	now := i.clock.Now().Nanoseconds()
+	atomic.StoreInt64(&i.mtime, now)
+	atomic.StoreInt64(&i.ctime, now)
+}
+
 // fileDescription is embedded by tmpfs implementations of
 // vfs.FileDescriptionImpl.
 type fileDescription struct {
-- 
cgit v1.2.3


From d151693530db68db43188ce0fbc9f81aa5f27e2e Mon Sep 17 00:00:00 2001
From: Uros Prestor <urosp@google.com>
Date: Thu, 2 Apr 2020 22:01:57 -0700
Subject: Avoid sending a partial dirent when the Rreaddir response exceeds
 message limit.

PiperOrigin-RevId: 304542967
---
 pkg/p9/messages.go      | 14 ++++++--------
 pkg/p9/messages_test.go |  2 +-
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go
index 3863ad1f5..57b89ad7d 100644
--- a/pkg/p9/messages.go
+++ b/pkg/p9/messages.go
@@ -1926,19 +1926,17 @@ func (r *Rreaddir) decode(b *buffer) {
 // encode implements encoder.encode.
 func (r *Rreaddir) encode(b *buffer) {
 	entriesBuf := buffer{}
+	payloadSize := 0
 	for _, d := range r.Entries {
 		d.encode(&entriesBuf)
-		if len(entriesBuf.data) >= int(r.Count) {
+		if len(entriesBuf.data) > int(r.Count) {
 			break
 		}
+		payloadSize = len(entriesBuf.data)
 	}
-	if len(entriesBuf.data) < int(r.Count) {
-		r.Count = uint32(len(entriesBuf.data))
-		r.payload = entriesBuf.data
-	} else {
-		r.payload = entriesBuf.data[:r.Count]
-	}
-	b.Write32(uint32(r.Count))
+	r.Count = uint32(payloadSize)
+	r.payload = entriesBuf.data[:payloadSize]
+	b.Write32(r.Count)
 }
 
 // Type implements message.Type.
diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go
index c20324404..7facc9f5e 100644
--- a/pkg/p9/messages_test.go
+++ b/pkg/p9/messages_test.go
@@ -216,7 +216,7 @@ func TestEncodeDecode(t *testing.T) {
 		},
 		&Rreaddir{
 			// Count must be sufficient to encode a dirent.
-			Count:   0x18,
+			Count:   0x1a,
 			Entries: []Dirent{{QID: QID{Type: 2}}},
 		},
 		&Tfsync{
-- 
cgit v1.2.3


From 1921c246a9907cd1623af4aabde086af9cf172d8 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 3 Apr 2020 10:19:42 -0700
Subject: Internal change.

PiperOrigin-RevId: 304641990
---
 test/syscalls/linux/proc_net_unix.cc | 6 +++---
 test/syscalls/linux/pty.cc           | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc
index 66db0acaa..a63067586 100644
--- a/test/syscalls/linux/proc_net_unix.cc
+++ b/test/syscalls/linux/proc_net_unix.cc
@@ -106,7 +106,7 @@ PosixErrorOr<std::vector<UnixEntry>> ProcNetUnixEntries() {
   std::vector<UnixEntry> entries;
   std::vector<std::string> lines = absl::StrSplit(content, '\n');
   std::cerr << "<contents of /proc/net/unix>" << std::endl;
-  for (std::string line : lines) {
+  for (const std::string& line : lines) {
     // Emit the proc entry to the test output to provide context for the test
     // results.
     std::cerr << line << std::endl;
@@ -374,7 +374,7 @@ TEST(ProcNetUnix, DgramSocketStateDisconnectingOnBind) {
   // corresponding entries, as they don't have an address yet.
   if (IsRunningOnGvisor()) {
     ASSERT_EQ(entries.size(), 2);
-    for (auto e : entries) {
+    for (const auto& e : entries) {
       ASSERT_EQ(e.state, SS_DISCONNECTING);
     }
   }
@@ -403,7 +403,7 @@ TEST(ProcNetUnix, DgramSocketStateConnectingOnConnect) {
   // corresponding entries, as they don't have an address yet.
   if (IsRunningOnGvisor()) {
     ASSERT_EQ(entries.size(), 2);
-    for (auto e : entries) {
+    for (const auto& e : entries) {
       ASSERT_EQ(e.state, SS_DISCONNECTING);
     }
   }
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
index dafe64d20..b8a0159ba 100644
--- a/test/syscalls/linux/pty.cc
+++ b/test/syscalls/linux/pty.cc
@@ -1126,7 +1126,7 @@ TEST_F(PtyTest, SwitchTwiceMultiline) {
   std::string kExpected = "GO\nBLUE\n!";
 
   // Write each line.
-  for (std::string input : kInputs) {
+  for (const std::string& input : kInputs) {
     ASSERT_THAT(WriteFd(master_.get(), input.c_str(), input.size()),
                 SyscallSucceedsWithValue(input.size()));
   }
-- 
cgit v1.2.3


From ea98693d915ebb55bb6b93797bc58d7675ffbe9d Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Fri, 3 Apr 2020 11:37:16 -0700
Subject: Add missing newline

PiperOrigin-RevId: 304659346
---
 test/syscalls/linux/exec.cc                 | 10 ++++++----
 test/syscalls/linux/poll.cc                 |  2 +-
 test/syscalls/linux/proc_pid_smaps.cc       |  4 ++--
 test/syscalls/linux/ptrace.cc               |  2 +-
 test/syscalls/linux/sendfile_socket.cc      |  2 +-
 test/syscalls/linux/socket_inet_loopback.cc |  2 +-
 test/syscalls/linux/socket_netlink_route.cc |  2 +-
 test/util/capability_util.cc                |  4 ++--
 8 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 07bd527e6..12c9b05ca 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -812,26 +812,28 @@ void ExecFromThread() {
 bool ValidateProcCmdlineVsArgv(const int argc, const char* const* argv) {
   auto contents_or = GetContents("/proc/self/cmdline");
   if (!contents_or.ok()) {
-    std::cerr << "Unable to get /proc/self/cmdline: " << contents_or.error();
+    std::cerr << "Unable to get /proc/self/cmdline: " << contents_or.error()
+              << std::endl;
     return false;
   }
   auto contents = contents_or.ValueOrDie();
   if (contents.back() != '\0') {
-    std::cerr << "Non-null terminated /proc/self/cmdline!";
+    std::cerr << "Non-null terminated /proc/self/cmdline!" << std::endl;
     return false;
   }
   contents.pop_back();
   std::vector<std::string> procfs_cmdline = absl::StrSplit(contents, '\0');
 
   if (static_cast<int>(procfs_cmdline.size()) != argc) {
-    std::cerr << "argc = " << argc << " != " << procfs_cmdline.size();
+    std::cerr << "argc = " << argc << " != " << procfs_cmdline.size()
+              << std::endl;
     return false;
   }
 
   for (int i = 0; i < argc; ++i) {
     if (procfs_cmdline[i] != argv[i]) {
       std::cerr << "Procfs command line argument " << i << " mismatch "
-                << procfs_cmdline[i] << " != " << argv[i];
+                << procfs_cmdline[i] << " != " << argv[i] << std::endl;
       return false;
     }
   }
diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc
index c42472474..1e35a4a8b 100644
--- a/test/syscalls/linux/poll.cc
+++ b/test/syscalls/linux/poll.cc
@@ -266,7 +266,7 @@ TEST_F(PollTest, Nfds) {
   }
 
   rlim_t max_fds = rlim.rlim_cur;
-  std::cout << "Using limit: " << max_fds;
+  std::cout << "Using limit: " << max_fds << std::endl;
 
   // Create an eventfd. Since its value is initially zero, it is writable.
   FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD());
diff --git a/test/syscalls/linux/proc_pid_smaps.cc b/test/syscalls/linux/proc_pid_smaps.cc
index 7f2e8f203..9fb1b3a2c 100644
--- a/test/syscalls/linux/proc_pid_smaps.cc
+++ b/test/syscalls/linux/proc_pid_smaps.cc
@@ -173,7 +173,7 @@ PosixErrorOr<std::vector<ProcPidSmapsEntry>> ParseProcPidSmaps(
       return;
     }
     unknown_fields.insert(std::string(key));
-    std::cerr << "skipping unknown smaps field " << key;
+    std::cerr << "skipping unknown smaps field " << key << std::endl;
   };
 
   auto lines = absl::StrSplit(contents, '\n', absl::SkipEmpty());
@@ -191,7 +191,7 @@ PosixErrorOr<std::vector<ProcPidSmapsEntry>> ParseProcPidSmaps(
     // amount of whitespace).
     if (!entry) {
       std::cerr << "smaps line not considered a maps line: "
-                << maybe_maps_entry.error_message();
+                << maybe_maps_entry.error_message() << std::endl;
       return PosixError(
           EINVAL,
           absl::StrCat("smaps field line without preceding maps line: ", l));
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index bfe3e2603..cb828ff88 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -1188,7 +1188,7 @@ TEST(PtraceTest, SeizeSetOptions) {
     // gVisor is not susceptible to this race because
     // kernel.Task.waitCollectTraceeStopLocked() checks specifically for an
     // active ptraceStop, which is not initiated if SIGKILL is pending.
-    std::cout << "Observed syscall-exit after SIGKILL";
+    std::cout << "Observed syscall-exit after SIGKILL" << std::endl;
     ASSERT_THAT(waitpid(child_pid, &status, 0),
                 SyscallSucceedsWithValue(child_pid));
   }
diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc
index 8f7ee4163..e94672679 100644
--- a/test/syscalls/linux/sendfile_socket.cc
+++ b/test/syscalls/linux/sendfile_socket.cc
@@ -149,7 +149,7 @@ TEST_P(SendFileTest, SendMultiple) {
   for (size_t sent = 0; sent < data.size(); cnt++) {
     const size_t remain = data.size() - sent;
     std::cout << "sendfile, size=" << data.size() << ", sent=" << sent
-              << ", remain=" << remain;
+              << ", remain=" << remain << std::endl;
 
     // Send data and verify that sendfile returns the correct value.
     int res = sendfile(client.get(), inf.get(), nullptr, remain);
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 2ffc86382..1b34e4ef7 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -2212,7 +2212,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, PortReuseTwoSockets) {
           setsockopt(fd2, SOL_SOCKET, SO_REUSEPORT, &portreuse2, sizeof(int)),
           SyscallSucceeds());
 
-      std::cout << portreuse1 << " " << portreuse2;
+      std::cout << portreuse1 << " " << portreuse2 << std::endl;
       int ret = bind(fd2, reinterpret_cast<sockaddr*>(&addr), addrlen);
 
       // Verify that two sockets can be bound to the same port only if
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index e5aed1eec..2efb96bc3 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -152,7 +152,7 @@ TEST(NetlinkRouteTest, GetLinkDump) {
     const struct ifinfomsg* msg =
         reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
     std::cout << "Found interface idx=" << msg->ifi_index
-              << ", type=" << std::hex << msg->ifi_type;
+              << ", type=" << std::hex << msg->ifi_type << std::endl;
     if (msg->ifi_type == ARPHRD_LOOPBACK) {
       loopbackFound = true;
       EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
diff --git a/test/util/capability_util.cc b/test/util/capability_util.cc
index 9fee52fbb..a1b994c45 100644
--- a/test/util/capability_util.cc
+++ b/test/util/capability_util.cc
@@ -63,13 +63,13 @@ PosixErrorOr<bool> CanCreateUserNamespace() {
     // is in a chroot environment (i.e., the caller's root directory does
     // not match the root directory of the mount namespace in which it
     // resides)."
-    std::cerr << "clone(CLONE_NEWUSER) failed with EPERM";
+    std::cerr << "clone(CLONE_NEWUSER) failed with EPERM" << std::endl;
     return false;
   } else if (errno == EUSERS) {
     // "(since Linux 3.11) CLONE_NEWUSER was specified in flags, and the call
     // would cause the limit on the number of nested user namespaces to be
     // exceeded. See user_namespaces(7)."
-    std::cerr << "clone(CLONE_NEWUSER) failed with EUSERS";
+    std::cerr << "clone(CLONE_NEWUSER) failed with EUSERS" << std::endl;
     return false;
   } else {
     // Unexpected error code; indicate an actual error.
-- 
cgit v1.2.3


From 4032cf06e4fe5f952f145427dec34629939df980 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 3 Apr 2020 12:28:00 -0700
Subject: Deflake
 //third_party/gvisor/test/perf:getdents_benchmark_runsc_ptrace

* Increase a buffer size for getdents64
* Increase a number of shards

PiperOrigin-RevId: 304670004
---
 test/perf/BUILD                       | 1 +
 test/perf/linux/getdents_benchmark.cc | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/perf/BUILD b/test/perf/BUILD
index 0a0def6a3..471d8c2ab 100644
--- a/test/perf/BUILD
+++ b/test/perf/BUILD
@@ -30,6 +30,7 @@ syscall_test(
 
 syscall_test(
     size = "enormous",
+    shard_count = 10,
     tags = ["nogotsan"],
     test = "//test/perf/linux:getdents_benchmark",
 )
diff --git a/test/perf/linux/getdents_benchmark.cc b/test/perf/linux/getdents_benchmark.cc
index afc599ad2..d8e81fa8c 100644
--- a/test/perf/linux/getdents_benchmark.cc
+++ b/test/perf/linux/getdents_benchmark.cc
@@ -38,7 +38,7 @@ namespace testing {
 
 namespace {
 
-constexpr int kBufferSize = 16384;
+constexpr int kBufferSize = 65536;
 
 PosixErrorOr<TempPath> CreateDirectory(int count,
                                        std::vector<std::string>* files) {
-- 
cgit v1.2.3


From a94309628ebbc2e6c4997890f1b966fa7a16be20 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 3 Apr 2020 13:39:45 -0700
Subject: Ensure EOF is handled propertly during splice.

PiperOrigin-RevId: 304684417
---
 pkg/sentry/kernel/pipe/pipe.go  | 13 ++++++++++---
 test/syscalls/linux/sendfile.cc | 28 ++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 725e9db7d..62c8691f1 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -255,7 +255,8 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
 	// POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
 	// atomic, but requires no atomicity for writes larger than this.
 	wanted := ops.left()
-	if avail := p.max - p.view.Size(); wanted > avail {
+	avail := p.max - p.view.Size()
+	if wanted > avail {
 		if wanted <= p.atomicIOBytes {
 			return 0, syserror.ErrWouldBlock
 		}
@@ -268,8 +269,14 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
 		return done, err
 	}
 
-	if wanted > done {
-		// Partial write due to full pipe.
+	if done < avail {
+		// Non-failure, but short write.
+		return done, nil
+	}
+	if done < wanted {
+		// Partial write due to full pipe. Note that this could also be
+		// the short write case above, we would expect a second call
+		// and the write to return zero bytes in this case.
 		return done, syserror.ErrWouldBlock
 	}
 
diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc
index 580ab5193..ebaafe47e 100644
--- a/test/syscalls/linux/sendfile.cc
+++ b/test/syscalls/linux/sendfile.cc
@@ -530,6 +530,34 @@ TEST(SendFileTest, SendToSpecialFile) {
               SyscallSucceedsWithValue(kSize & (~7)));
 }
 
+TEST(SendFileTest, SendFileToPipe) {
+  // Create temp file.
+  constexpr char kData[] = "<insert-quote-here>";
+  constexpr int kDataSize = sizeof(kData) - 1;
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+  const FileDescriptor inf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Create a pipe for sending to a pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Expect to read up to the given size.
+  std::vector<char> buf(kDataSize);
+  ScopedThread t([&]() {
+    absl::SleepFor(absl::Milliseconds(100));
+    ASSERT_THAT(read(rfd.get(), buf.data(), buf.size()),
+                SyscallSucceedsWithValue(kDataSize));
+  });
+
+  // Send with twice the size of the file, which should hit EOF.
+  EXPECT_THAT(sendfile(wfd.get(), inf.get(), nullptr, kDataSize * 2),
+              SyscallSucceedsWithValue(kDataSize));
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From 5818663ebe26857845685702d99db41c7aa2cf3d Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 3 Apr 2020 14:07:42 -0700
Subject: Add FileDescriptionImpl for Unix sockets.

This change involves several steps:
- Refactor the VFS1 unix socket implementation to share methods between VFS1
  and VFS2 where possible. Re-implement the rest.
- Override the default PRead, Read, PWrite, Write, Ioctl, Release methods in
  FileDescriptionDefaultImpl.
- Add functions to create and initialize a new Dentry/Inode and FileDescription
  for a Unix socket file.

Updates #1476

PiperOrigin-RevId: 304689796
---
 pkg/sentry/fsimpl/sockfs/BUILD         |   1 +
 pkg/sentry/fsimpl/sockfs/sockfs.go     |  29 +++
 pkg/sentry/kernel/BUILD                |   1 +
 pkg/sentry/socket/netstack/netstack.go |   8 +-
 pkg/sentry/socket/unix/BUILD           |   4 +
 pkg/sentry/socket/unix/unix.go         |  89 ++++++---
 pkg/sentry/socket/unix/unix_vfs2.go    | 348 +++++++++++++++++++++++++++++++++
 pkg/sentry/vfs/options.go              |   5 +
 8 files changed, 456 insertions(+), 29 deletions(-)
 create mode 100644 pkg/sentry/socket/unix/unix_vfs2.go

diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD
index 790d50e65..52084ddb5 100644
--- a/pkg/sentry/fsimpl/sockfs/BUILD
+++ b/pkg/sentry/fsimpl/sockfs/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = ["sockfs.go"],
     visibility = ["//pkg/sentry:internal"],
     deps = [
+        "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
index c13511de2..3f7ad1d65 100644
--- a/pkg/sentry/fsimpl/sockfs/sockfs.go
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -16,6 +16,7 @@
 package sockfs
 
 import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -60,6 +61,10 @@ type filesystem struct {
 }
 
 // inode implements kernfs.Inode.
+//
+// TODO(gvisor.dev/issue/1476): Add device numbers to this inode (which are
+// not included in InodeAttrs) to store the numbers of the appropriate
+// socket device. Override InodeAttrs.Stat() accordingly.
 type inode struct {
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
@@ -71,3 +76,27 @@ type inode struct {
 func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	return nil, syserror.ENXIO
 }
+
+// InitSocket initializes a socket FileDescription, with a corresponding
+// Dentry in mnt.
+//
+// fd should be the FileDescription associated with socketImpl, i.e. its first
+// field. mnt should be the global socket mount, Kernel.socketMount.
+func InitSocket(socketImpl vfs.FileDescriptionImpl, fd *vfs.FileDescription, mnt *vfs.Mount, creds *auth.Credentials) error {
+	fsimpl := mnt.Filesystem().Impl()
+	fs := fsimpl.(*kernfs.Filesystem)
+
+	// File mode matches net/socket.c:sock_alloc.
+	filemode := linux.FileMode(linux.S_IFSOCK | 0600)
+	i := &inode{}
+	i.Init(creds, fs.NextIno(), filemode)
+
+	d := &kernfs.Dentry{}
+	d.Init(i)
+
+	opts := &vfs.FileDescriptionOptions{UseDentryMetadata: true}
+	if err := fd.Init(socketImpl, linux.O_RDWR, mnt, d.VFSDentry(), opts); err != nil {
+		return err
+	}
+	return nil
+}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index bb7e3cbc3..e0ff58d8c 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -169,6 +169,7 @@ go_library(
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fs/timerfd",
         "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/fsimpl/sockfs",
         "//pkg/sentry/hostcpu",
         "//pkg/sentry/inet",
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 06a5b53bc..5d0085462 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -940,7 +940,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 
 // GetSockOpt can be used to implement the linux syscall getsockopt(2) for
 // sockets backed by a commonEndpoint.
-func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
+func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
 	switch level {
 	case linux.SOL_SOCKET:
 		return getSockOptSocket(t, s, ep, family, skType, name, outLen)
@@ -966,7 +966,7 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int,
 }
 
 // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
-func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) {
 	// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
 	switch name {
 	case linux.SO_ERROR:
@@ -1541,7 +1541,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
 
 // SetSockOpt can be used to implement the linux syscall setsockopt(2) for
 // sockets backed by a commonEndpoint.
-func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error {
+func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error {
 	switch level {
 	case linux.SOL_SOCKET:
 		return setSockOptSocket(t, s, ep, name, optVal)
@@ -1568,7 +1568,7 @@ func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, n
 }
 
 // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
-func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
+func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
 	switch name {
 	case linux.SO_SNDBUF:
 		if len(optVal) < sizeOfInt32 {
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index 08743deba..de2cc4bdf 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -8,23 +8,27 @@ go_library(
         "device.go",
         "io.go",
         "unix.go",
+        "unix_vfs2.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/fspath",
         "//pkg/refs",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fsimpl/sockfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/control",
         "//pkg/sentry/socket/netstack",
         "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip",
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 4d30aa714..7c64f30fa 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -23,6 +23,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -33,6 +34,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/control"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -52,11 +54,8 @@ type SocketOperations struct {
 	fsutil.FileNoSplice             `state:"nosave"`
 	fsutil.FileNoopFlush            `state:"nosave"`
 	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
-	refs.AtomicRefCount
-	socket.SendReceiveTimeout
 
-	ep    transport.Endpoint
-	stype linux.SockType
+	socketOpsCommon
 }
 
 // New creates a new unix socket.
@@ -75,16 +74,29 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty
 	}
 
 	s := SocketOperations{
-		ep:    ep,
-		stype: stype,
+		socketOpsCommon: socketOpsCommon{
+			ep:    ep,
+			stype: stype,
+		},
 	}
 	s.EnableLeakCheck("unix.SocketOperations")
 
 	return fs.NewFile(ctx, d, flags, &s)
 }
 
+// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
+//
+// +stateify savable
+type socketOpsCommon struct {
+	refs.AtomicRefCount
+	socket.SendReceiveTimeout
+
+	ep    transport.Endpoint
+	stype linux.SockType
+}
+
 // DecRef implements RefCounter.DecRef.
-func (s *SocketOperations) DecRef() {
+func (s *socketOpsCommon) DecRef() {
 	s.DecRefWithDestructor(func() {
 		s.ep.Close()
 	})
@@ -97,7 +109,7 @@ func (s *SocketOperations) Release() {
 	s.DecRef()
 }
 
-func (s *SocketOperations) isPacket() bool {
+func (s *socketOpsCommon) isPacket() bool {
 	switch s.stype {
 	case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
 		return true
@@ -110,7 +122,7 @@ func (s *SocketOperations) isPacket() bool {
 }
 
 // Endpoint extracts the transport.Endpoint.
-func (s *SocketOperations) Endpoint() transport.Endpoint {
+func (s *socketOpsCommon) Endpoint() transport.Endpoint {
 	return s.ep
 }
 
@@ -143,7 +155,7 @@ func extractPath(sockaddr []byte) (string, *syserr.Error) {
 
 // GetPeerName implements the linux syscall getpeername(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr, err := s.ep.GetRemoteAddress()
 	if err != nil {
 		return nil, 0, syserr.TranslateNetstackError(err)
@@ -155,7 +167,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32,
 
 // GetSockName implements the linux syscall getsockname(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr, err := s.ep.GetLocalAddress()
 	if err != nil {
 		return nil, 0, syserr.TranslateNetstackError(err)
@@ -178,7 +190,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 
 // Listen implements the linux syscall listen(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error {
+func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
 	return s.ep.Listen(backlog)
 }
 
@@ -310,6 +322,8 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
 			}
 
 			// Create the socket.
+			//
+			// TODO(gvisor.dev/issue/2324): Correctly set file permissions.
 			childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}})
 			if err != nil {
 				return syserr.ErrPortInUse
@@ -345,6 +359,31 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint,
 		return ep, nil
 	}
 
+	if kernel.VFS2Enabled {
+		p := fspath.Parse(path)
+		root := t.FSContext().RootDirectoryVFS2()
+		start := root
+		relPath := !p.Absolute
+		if relPath {
+			start = t.FSContext().WorkingDirectoryVFS2()
+		}
+		pop := vfs.PathOperation{
+			Root:               root,
+			Start:              start,
+			Path:               p,
+			FollowFinalSymlink: true,
+		}
+		ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop)
+		root.DecRef()
+		if relPath {
+			start.DecRef()
+		}
+		if e != nil {
+			return nil, syserr.FromError(e)
+		}
+		return ep, nil
+	}
+
 	// Find the node in the filesystem.
 	root := t.FSContext().RootDirectory()
 	cwd := t.FSContext().WorkingDirectory()
@@ -363,12 +402,11 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint,
 		// No socket!
 		return nil, syserr.ErrConnectionRefused
 	}
-
 	return ep, nil
 }
 
 // Connect implements the linux syscall connect(2) for unix sockets.
-func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
+func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
 	ep, err := extractEndpoint(t, sockaddr)
 	if err != nil {
 		return err
@@ -379,7 +417,7 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 	return s.ep.Connect(t, ep)
 }
 
-// Writev implements fs.FileOperations.Write.
+// Write implements fs.FileOperations.Write.
 func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
 	t := kernel.TaskFromContext(ctx)
 	ctrl := control.New(t, s.ep, nil)
@@ -399,7 +437,7 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO
 
 // SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
+func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
 	w := EndpointWriter{
 		Ctx:      t,
 		Endpoint: s.ep,
@@ -453,27 +491,27 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 }
 
 // Passcred implements transport.Credentialer.Passcred.
-func (s *SocketOperations) Passcred() bool {
+func (s *socketOpsCommon) Passcred() bool {
 	return s.ep.Passcred()
 }
 
 // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred.
-func (s *SocketOperations) ConnectedPasscred() bool {
+func (s *socketOpsCommon) ConnectedPasscred() bool {
 	return s.ep.ConnectedPasscred()
 }
 
 // Readiness implements waiter.Waitable.Readiness.
-func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return s.ep.Readiness(mask)
 }
 
 // EventRegister implements waiter.Waitable.EventRegister.
-func (s *SocketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
 	s.ep.EventRegister(e, mask)
 }
 
 // EventUnregister implements waiter.Waitable.EventUnregister.
-func (s *SocketOperations) EventUnregister(e *waiter.Entry) {
+func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) {
 	s.ep.EventUnregister(e)
 }
 
@@ -485,7 +523,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
 
 // Shutdown implements the linux syscall shutdown(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
+func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
 	f, err := netstack.ConvertShutdown(how)
 	if err != nil {
 		return err
@@ -511,7 +549,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
 
 // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
+func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
 	trunc := flags&linux.MSG_TRUNC != 0
 	peek := flags&linux.MSG_PEEK != 0
 	dontWait := flags&linux.MSG_DONTWAIT != 0
@@ -648,12 +686,12 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 }
 
 // State implements socket.Socket.State.
-func (s *SocketOperations) State() uint32 {
+func (s *socketOpsCommon) State() uint32 {
 	return s.ep.State()
 }
 
 // Type implements socket.Socket.Type.
-func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) {
+func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) {
 	// Unix domain sockets always have a protocol of 0.
 	return linux.AF_UNIX, s.stype, 0
 }
@@ -706,4 +744,5 @@ func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.F
 
 func init() {
 	socket.RegisterProvider(linux.AF_UNIX, &provider{})
+	socket.RegisterProviderVFS2(linux.AF_UNIX, &providerVFS2{})
 }
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
new file mode 100644
index 000000000..ca1388e2c
--- /dev/null
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -0,0 +1,348 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package unix
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket/control"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// SocketVFS2 implements socket.SocketVFS2 (and by extension,
+// vfs.FileDescriptionImpl) for Unix sockets.
+type SocketVFS2 struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	socketOpsCommon
+}
+
+// NewVFS2File creates and returns a new vfs.FileDescription for a unix socket.
+func NewVFS2File(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) {
+	sock := NewFDImpl(ep, stype)
+	vfsfd := &sock.vfsfd
+	if err := sockfs.InitSocket(sock, vfsfd, t.Kernel().SocketMount(), t.Credentials()); err != nil {
+		return nil, syserr.FromError(err)
+	}
+	return vfsfd, nil
+}
+
+// NewFDImpl creates and returns a new SocketVFS2.
+func NewFDImpl(ep transport.Endpoint, stype linux.SockType) *SocketVFS2 {
+	// You can create AF_UNIX, SOCK_RAW sockets. They're the same as
+	// SOCK_DGRAM and don't require CAP_NET_RAW.
+	if stype == linux.SOCK_RAW {
+		stype = linux.SOCK_DGRAM
+	}
+
+	return &SocketVFS2{
+		socketOpsCommon: socketOpsCommon{
+			ep:    ep,
+			stype: stype,
+		},
+	}
+}
+
+// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
+// a transport.Endpoint.
+func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
+}
+
+// blockingAccept implements a blocking version of accept(2), that is, if no
+// connections are ready to be accept, it will block until one becomes ready.
+func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) {
+	// Register for notifications.
+	e, ch := waiter.NewChannelEntry(nil)
+	s.socketOpsCommon.EventRegister(&e, waiter.EventIn)
+	defer s.socketOpsCommon.EventUnregister(&e)
+
+	// Try to accept the connection; if it fails, then wait until we get a
+	// notification.
+	for {
+		if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock {
+			return ep, err
+		}
+
+		if err := t.Block(ch); err != nil {
+			return nil, syserr.FromError(err)
+		}
+	}
+}
+
+// Accept implements the linux syscall accept(2) for sockets backed by
+// a transport.Endpoint.
+func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
+	// Issue the accept request to get the new endpoint.
+	ep, err := s.ep.Accept()
+	if err != nil {
+		if err != syserr.ErrWouldBlock || !blocking {
+			return 0, nil, 0, err
+		}
+
+		var err *syserr.Error
+		ep, err = s.blockingAccept(t)
+		if err != nil {
+			return 0, nil, 0, err
+		}
+	}
+
+	// We expect this to be a FileDescription here.
+	ns, err := NewVFS2File(t, ep, s.stype)
+	if err != nil {
+		return 0, nil, 0, err
+	}
+	defer ns.DecRef()
+
+	if flags&linux.SOCK_NONBLOCK != 0 {
+		ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK)
+	}
+
+	var addr linux.SockAddr
+	var addrLen uint32
+	if peerRequested {
+		// Get address of the peer.
+		var err *syserr.Error
+		addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t)
+		if err != nil {
+			return 0, nil, 0, err
+		}
+	}
+
+	fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
+		CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
+	})
+	if e != nil {
+		return 0, nil, 0, syserr.FromError(e)
+	}
+
+	// TODO: add vfs2 sockets to global table.
+	return fd, addr, addrLen, nil
+}
+
+// Bind implements the linux syscall bind(2) for unix sockets.
+func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
+	p, e := extractPath(sockaddr)
+	if e != nil {
+		return e
+	}
+
+	bep, ok := s.ep.(transport.BoundEndpoint)
+	if !ok {
+		// This socket can't be bound.
+		return syserr.ErrInvalidArgument
+	}
+
+	return s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *syserr.Error {
+		// Is it abstract?
+		if p[0] == 0 {
+			if t.IsNetworkNamespaced() {
+				return syserr.ErrInvalidEndpointState
+			}
+			if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil {
+				// syserr.ErrPortInUse corresponds to EADDRINUSE.
+				return syserr.ErrPortInUse
+			}
+		} else {
+			path := fspath.Parse(p)
+			root := t.FSContext().RootDirectoryVFS2()
+			defer root.DecRef()
+			start := root
+			relPath := !path.Absolute
+			if relPath {
+				start = t.FSContext().WorkingDirectoryVFS2()
+				defer start.DecRef()
+			}
+			pop := vfs.PathOperation{
+				Root:  root,
+				Start: start,
+				Path:  path,
+			}
+			err := t.Kernel().VFS().MknodAt(t, t.Credentials(), &pop, &vfs.MknodOptions{
+				// TODO(gvisor.dev/issue/2324): The file permissions should be taken
+				// from s and t.FSContext().Umask() (see net/unix/af_unix.c:unix_bind),
+				// but VFS1 just always uses 0400. Resolve this inconsistency.
+				Mode:     linux.S_IFSOCK | 0400,
+				Endpoint: bep,
+			})
+			if err == syserror.EEXIST {
+				return syserr.ErrAddressInUse
+			}
+			return syserr.FromError(err)
+		}
+
+		return nil
+	})
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	return netstack.Ioctl(ctx, s.ep, uio, args)
+}
+
+// PRead implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Read implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// All flags other than RWF_NOWAIT should be ignored.
+	// TODO(gvisor.dev/issue/1476): Support RWF_NOWAIT.
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	if dst.NumBytes() == 0 {
+		return 0, nil
+	}
+	return dst.CopyOutFrom(ctx, &EndpointReader{
+		Ctx:       ctx,
+		Endpoint:  s.ep,
+		NumRights: 0,
+		Peek:      false,
+		From:      nil,
+	})
+}
+
+// PWrite implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Write implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// All flags other than RWF_NOWAIT should be ignored.
+	// TODO(gvisor.dev/issue/1476): Support RWF_NOWAIT.
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	t := kernel.TaskFromContext(ctx)
+	ctrl := control.New(t, s.ep, nil)
+
+	if src.NumBytes() == 0 {
+		nInt, err := s.ep.SendMsg(ctx, [][]byte{}, ctrl, nil)
+		return int64(nInt), err.ToError()
+	}
+
+	return src.CopyInTo(ctx, &EndpointWriter{
+		Ctx:      ctx,
+		Endpoint: s.ep,
+		Control:  ctrl,
+		To:       nil,
+	})
+}
+
+// Release implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Release() {
+	// Release only decrements a reference on s because s may be referenced in
+	// the abstract socket namespace.
+	s.DecRef()
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return s.socketOpsCommon.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	s.socketOpsCommon.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (s *SocketVFS2) EventUnregister(e *waiter.Entry) {
+	s.socketOpsCommon.EventUnregister(e)
+}
+
+// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
+// a transport.Endpoint.
+func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error {
+	return netstack.SetSockOpt(t, s, s.ep, level, name, optVal)
+}
+
+// providerVFS2 is a unix domain socket provider for VFS2.
+type providerVFS2 struct{}
+
+func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
+	// Check arguments.
+	if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ {
+		return nil, syserr.ErrProtocolNotSupported
+	}
+
+	// Create the endpoint and socket.
+	var ep transport.Endpoint
+	switch stype {
+	case linux.SOCK_DGRAM, linux.SOCK_RAW:
+		ep = transport.NewConnectionless(t)
+	case linux.SOCK_SEQPACKET, linux.SOCK_STREAM:
+		ep = transport.NewConnectioned(t, stype, t.Kernel())
+	default:
+		return nil, syserr.ErrInvalidArgument
+	}
+
+	f, err := NewVFS2File(t, ep, stype)
+	if err != nil {
+		ep.Close()
+		return nil, err
+	}
+	return f, nil
+}
+
+// Pair creates a new pair of AF_UNIX connected sockets.
+func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) {
+	// Check arguments.
+	if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ {
+		return nil, nil, syserr.ErrProtocolNotSupported
+	}
+
+	switch stype {
+	case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW:
+		// Ok
+	default:
+		return nil, nil, syserr.ErrInvalidArgument
+	}
+
+	// Create the endpoints and sockets.
+	ep1, ep2 := transport.NewPair(t, stype, t.Kernel())
+	s1, err := NewVFS2File(t, ep1, stype)
+	if err != nil {
+		ep1.Close()
+		ep2.Close()
+		return nil, nil, err
+	}
+	s2, err := NewVFS2File(t, ep2, stype)
+	if err != nil {
+		s1.DecRef()
+		ep2.Close()
+		return nil, nil, err
+	}
+
+	return s1, s2, nil
+}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 3e90dc4ed..2f04bf882 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -16,6 +16,7 @@ package vfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 )
 
 // GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
@@ -44,6 +45,10 @@ type MknodOptions struct {
 	// DevMinor are the major and minor device numbers for the created device.
 	DevMajor uint32
 	DevMinor uint32
+
+	// Endpoint is the endpoint to bind to the created file, if a socket file is
+	// being created for bind(2) on a Unix domain socket.
+	Endpoint transport.BoundEndpoint
 }
 
 // MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC.
-- 
cgit v1.2.3


From fc99a7ebf0c24b6f7b3cfd6351436373ed54548b Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Fri, 3 Apr 2020 18:34:48 -0700
Subject: Refactor software GSO code.

Software GSO implementation currently has a complicated code path with
implicit assumptions that all packets to WritePackets carry same Data
and it does this to avoid allocations on the path etc. But this makes it
hard to reuse the WritePackets API.

This change breaks all such assumptions by introducing a new Vectorised
View API ReadToVV which can be used to cleanly split a VV into multiple
independent VVs. Further this change also makes packet buffers linkable
to form an intrusive list. This allows us to get rid of the array of
packet buffers that are passed in the WritePackets API call and replace
it with a list of packet buffers.

While this code does introduce some more allocations in the benchmarks
it doesn't cause any degradation.

Updates #231

PiperOrigin-RevId: 304731742
---
 pkg/ilist/list.go                        |  13 ++-
 pkg/sentry/kernel/kernel.go              |  22 +++--
 pkg/tcpip/buffer/view.go                 |  53 +++++++++-
 pkg/tcpip/buffer/view_test.go            | 137 ++++++++++++++++++++++++++
 pkg/tcpip/link/channel/channel.go        |  18 ++--
 pkg/tcpip/link/fdbased/endpoint.go       | 162 ++++++++++++++-----------------
 pkg/tcpip/link/loopback/loopback.go      |   2 +-
 pkg/tcpip/link/muxed/injectable.go       |   2 +-
 pkg/tcpip/link/sharedmem/sharedmem.go    |   2 +-
 pkg/tcpip/link/sniffer/sniffer.go        |  14 +--
 pkg/tcpip/link/waitable/waitable.go      |   4 +-
 pkg/tcpip/link/waitable/waitable_test.go |   6 +-
 pkg/tcpip/network/arp/arp.go             |   2 +-
 pkg/tcpip/network/ip_test.go             |   2 +-
 pkg/tcpip/network/ipv4/ipv4.go           |  37 +++++--
 pkg/tcpip/network/ipv6/icmp.go           |   2 +-
 pkg/tcpip/network/ipv6/ipv6.go           |  12 +--
 pkg/tcpip/stack/BUILD                    |  14 ++-
 pkg/tcpip/stack/forwarder_test.go        |   8 +-
 pkg/tcpip/stack/iptables.go              |  17 ++++
 pkg/tcpip/stack/ndp_test.go              |   2 +-
 pkg/tcpip/stack/packet_buffer.go         |  14 +--
 pkg/tcpip/stack/packet_buffer_state.go   |  27 ------
 pkg/tcpip/stack/registration.go          |   4 +-
 pkg/tcpip/stack/route.go                 |  19 ++--
 pkg/tcpip/stack/stack_test.go            |   2 +-
 pkg/tcpip/transport/tcp/connect.go       |  47 +++++----
 pkg/tcpip/transport/tcp/segment.go       |   6 +-
 28 files changed, 420 insertions(+), 230 deletions(-)
 delete mode 100644 pkg/tcpip/stack/packet_buffer_state.go

diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go
index 8f93e4d6d..0d07da3b1 100644
--- a/pkg/ilist/list.go
+++ b/pkg/ilist/list.go
@@ -86,12 +86,21 @@ func (l *List) Back() Element {
 	return l.tail
 }
 
+// Len returns the number of elements in the list.
+//
+// NOTE: This is an O(n) operation.
+func (l *List) Len() (count int) {
+	for e := l.Front(); e != nil; e = e.Next() {
+		count++
+	}
+	return count
+}
+
 // PushFront inserts the element e at the front of list l.
 func (l *List) PushFront(e Element) {
 	linker := ElementMapper{}.linkerFor(e)
 	linker.SetNext(l.head)
 	linker.SetPrev(nil)
-
 	if l.head != nil {
 		ElementMapper{}.linkerFor(l.head).SetPrev(e)
 	} else {
@@ -106,7 +115,6 @@ func (l *List) PushBack(e Element) {
 	linker := ElementMapper{}.linkerFor(e)
 	linker.SetNext(nil)
 	linker.SetPrev(l.tail)
-
 	if l.tail != nil {
 		ElementMapper{}.linkerFor(l.tail).SetNext(e)
 	} else {
@@ -127,7 +135,6 @@ func (l *List) PushBackList(m *List) {
 
 		l.tail = m.tail
 	}
-
 	m.head = nil
 	m.tail = nil
 }
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 0a448b57c..2e6f42b92 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -564,15 +564,25 @@ func (ts *TaskSet) unregisterEpollWaiters() {
 
 	ts.mu.RLock()
 	defer ts.mu.RUnlock()
+
+	// Tasks that belong to the same process could potentially point to the
+	// same FDTable. So we retain a map of processed ones to avoid
+	// processing the same FDTable multiple times.
+	processed := make(map[*FDTable]struct{})
 	for t := range ts.Root.tids {
 		// We can skip locking Task.mu here since the kernel is paused.
-		if t.fdTable != nil {
-			t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
-				if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
-					e.UnregisterEpollWaiters()
-				}
-			})
+		if t.fdTable == nil {
+			continue
+		}
+		if _, ok := processed[t.fdTable]; ok {
+			continue
 		}
+		t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+			if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
+				e.UnregisterEpollWaiters()
+			}
+		})
+		processed[t.fdTable] = struct{}{}
 	}
 }
 
diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go
index 8d42cd066..8ec5d5d5c 100644
--- a/pkg/tcpip/buffer/view.go
+++ b/pkg/tcpip/buffer/view.go
@@ -17,6 +17,7 @@ package buffer
 
 import (
 	"bytes"
+	"io"
 )
 
 // View is a slice of a buffer, with convenience methods.
@@ -89,6 +90,47 @@ func (vv *VectorisedView) TrimFront(count int) {
 	}
 }
 
+// Read implements io.Reader.
+func (vv *VectorisedView) Read(v View) (copied int, err error) {
+	count := len(v)
+	for count > 0 && len(vv.views) > 0 {
+		if count < len(vv.views[0]) {
+			vv.size -= count
+			copy(v[copied:], vv.views[0][:count])
+			vv.views[0].TrimFront(count)
+			copied += count
+			return copied, nil
+		}
+		count -= len(vv.views[0])
+		copy(v[copied:], vv.views[0])
+		copied += len(vv.views[0])
+		vv.RemoveFirst()
+	}
+	if copied == 0 {
+		return 0, io.EOF
+	}
+	return copied, nil
+}
+
+// ReadToVV reads up to n bytes from vv to dstVV and removes them from vv. It
+// returns the number of bytes copied.
+func (vv *VectorisedView) ReadToVV(dstVV *VectorisedView, count int) (copied int) {
+	for count > 0 && len(vv.views) > 0 {
+		if count < len(vv.views[0]) {
+			vv.size -= count
+			dstVV.AppendView(vv.views[0][:count])
+			vv.views[0].TrimFront(count)
+			copied += count
+			return
+		}
+		count -= len(vv.views[0])
+		dstVV.AppendView(vv.views[0])
+		copied += len(vv.views[0])
+		vv.RemoveFirst()
+	}
+	return copied
+}
+
 // CapLength irreversibly reduces the length of the vectorised view.
 func (vv *VectorisedView) CapLength(length int) {
 	if length < 0 {
@@ -116,12 +158,12 @@ func (vv *VectorisedView) CapLength(length int) {
 // Clone returns a clone of this VectorisedView.
 // If the buffer argument is large enough to contain all the Views of this VectorisedView,
 // the method will avoid allocations and use the buffer to store the Views of the clone.
-func (vv VectorisedView) Clone(buffer []View) VectorisedView {
+func (vv *VectorisedView) Clone(buffer []View) VectorisedView {
 	return VectorisedView{views: append(buffer[:0], vv.views...), size: vv.size}
 }
 
 // First returns the first view of the vectorised view.
-func (vv VectorisedView) First() View {
+func (vv *VectorisedView) First() View {
 	if len(vv.views) == 0 {
 		return nil
 	}
@@ -134,11 +176,12 @@ func (vv *VectorisedView) RemoveFirst() {
 		return
 	}
 	vv.size -= len(vv.views[0])
+	vv.views[0] = nil
 	vv.views = vv.views[1:]
 }
 
 // Size returns the size in bytes of the entire content stored in the vectorised view.
-func (vv VectorisedView) Size() int {
+func (vv *VectorisedView) Size() int {
 	return vv.size
 }
 
@@ -146,7 +189,7 @@ func (vv VectorisedView) Size() int {
 //
 // If the vectorised view contains a single view, that view will be returned
 // directly.
-func (vv VectorisedView) ToView() View {
+func (vv *VectorisedView) ToView() View {
 	if len(vv.views) == 1 {
 		return vv.views[0]
 	}
@@ -158,7 +201,7 @@ func (vv VectorisedView) ToView() View {
 }
 
 // Views returns the slice containing the all views.
-func (vv VectorisedView) Views() []View {
+func (vv *VectorisedView) Views() []View {
 	return vv.views
 }
 
diff --git a/pkg/tcpip/buffer/view_test.go b/pkg/tcpip/buffer/view_test.go
index ebc3a17b7..106e1994c 100644
--- a/pkg/tcpip/buffer/view_test.go
+++ b/pkg/tcpip/buffer/view_test.go
@@ -233,3 +233,140 @@ func TestToClone(t *testing.T) {
 		})
 	}
 }
+
+func TestVVReadToVV(t *testing.T) {
+	testCases := []struct {
+		comment     string
+		vv          VectorisedView
+		bytesToRead int
+		wantBytes   string
+		leftVV      VectorisedView
+	}{
+		{
+			comment:     "large VV, short read",
+			vv:          vv(30, "012345678901234567890123456789"),
+			bytesToRead: 10,
+			wantBytes:   "0123456789",
+			leftVV:      vv(20, "01234567890123456789"),
+		},
+		{
+			comment:     "largeVV, multiple views, short read",
+			vv:          vv(13, "123", "345", "567", "8910"),
+			bytesToRead: 6,
+			wantBytes:   "123345",
+			leftVV:      vv(7, "567", "8910"),
+		},
+		{
+			comment:     "smallVV (multiple views), large read",
+			vv:          vv(3, "1", "2", "3"),
+			bytesToRead: 10,
+			wantBytes:   "123",
+			leftVV:      vv(0, ""),
+		},
+		{
+			comment:     "smallVV (single view), large read",
+			vv:          vv(1, "1"),
+			bytesToRead: 10,
+			wantBytes:   "1",
+			leftVV:      vv(0, ""),
+		},
+		{
+			comment:     "emptyVV, large read",
+			vv:          vv(0, ""),
+			bytesToRead: 10,
+			wantBytes:   "",
+			leftVV:      vv(0, ""),
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.comment, func(t *testing.T) {
+			var readTo VectorisedView
+			inSize := tc.vv.Size()
+			copied := tc.vv.ReadToVV(&readTo, tc.bytesToRead)
+			if got, want := copied, len(tc.wantBytes); got != want {
+				t.Errorf("incorrect number of bytes copied returned in ReadToVV got: %d, want: %d, tc: %+v", got, want, tc)
+			}
+			if got, want := string(readTo.ToView()), tc.wantBytes; got != want {
+				t.Errorf("unexpected content in readTo got: %s, want: %s", got, want)
+			}
+			if got, want := tc.vv.Size(), inSize-copied; got != want {
+				t.Errorf("test VV has incorrect size after reading got: %d, want: %d, tc.vv: %+v", got, want, tc.vv)
+			}
+			if got, want := string(tc.vv.ToView()), string(tc.leftVV.ToView()); got != want {
+				t.Errorf("unexpected data left in vv after read got: %+v, want: %+v", got, want)
+			}
+		})
+	}
+}
+
+func TestVVRead(t *testing.T) {
+	testCases := []struct {
+		comment     string
+		vv          VectorisedView
+		bytesToRead int
+		readBytes   string
+		leftBytes   string
+		wantError   bool
+	}{
+		{
+			comment:     "large VV, short read",
+			vv:          vv(30, "012345678901234567890123456789"),
+			bytesToRead: 10,
+			readBytes:   "0123456789",
+			leftBytes:   "01234567890123456789",
+		},
+		{
+			comment:     "largeVV, multiple buffers, short read",
+			vv:          vv(13, "123", "345", "567", "8910"),
+			bytesToRead: 6,
+			readBytes:   "123345",
+			leftBytes:   "5678910",
+		},
+		{
+			comment:     "smallVV, large read",
+			vv:          vv(3, "1", "2", "3"),
+			bytesToRead: 10,
+			readBytes:   "123",
+			leftBytes:   "",
+		},
+		{
+			comment:     "smallVV, large read",
+			vv:          vv(1, "1"),
+			bytesToRead: 10,
+			readBytes:   "1",
+			leftBytes:   "",
+		},
+		{
+			comment:     "emptyVV, large read",
+			vv:          vv(0, ""),
+			bytesToRead: 10,
+			readBytes:   "",
+			wantError:   true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.comment, func(t *testing.T) {
+			readTo := NewView(tc.bytesToRead)
+			inSize := tc.vv.Size()
+			copied, err := tc.vv.Read(readTo)
+			if !tc.wantError && err != nil {
+				t.Fatalf("unexpected error in tc.vv.Read(..) = %s", err)
+			}
+			readTo = readTo[:copied]
+			if got, want := copied, len(tc.readBytes); got != want {
+				t.Errorf("incorrect number of bytes copied returned in ReadToVV got: %d, want: %d, tc.vv: %+v", got, want, tc.vv)
+			}
+			if got, want := string(readTo), tc.readBytes; got != want {
+				t.Errorf("unexpected data in readTo got: %s, want: %s", got, want)
+			}
+			if got, want := tc.vv.Size(), inSize-copied; got != want {
+				t.Errorf("test VV has incorrect size after reading got: %d, want: %d, tc.vv: %+v", got, want, tc.vv)
+			}
+			if got, want := string(tc.vv.ToView()), tc.leftBytes; got != want {
+				t.Errorf("vv has incorrect data after Read got: %s, want: %s", got, want)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index a8d6653ce..b4a0ae53d 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -28,7 +28,7 @@ import (
 
 // PacketInfo holds all the information about an outbound packet.
 type PacketInfo struct {
-	Pkt   stack.PacketBuffer
+	Pkt   *stack.PacketBuffer
 	Proto tcpip.NetworkProtocolNumber
 	GSO   *stack.GSO
 	Route stack.Route
@@ -257,7 +257,7 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 	route := r.Clone()
 	route.Release()
 	p := PacketInfo{
-		Pkt:   pkt,
+		Pkt:   &pkt,
 		Proto: protocol,
 		GSO:   gso,
 		Route: route,
@@ -269,21 +269,15 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 }
 
 // WritePackets stores outbound packets into the channel.
-func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	// Clone r then release its resource so we only get the relevant fields from
 	// stack.Route without holding a reference to a NIC's endpoint.
 	route := r.Clone()
 	route.Release()
-	payloadView := pkts[0].Data.ToView()
 	n := 0
-	for _, pkt := range pkts {
-		off := pkt.DataOffset
-		size := pkt.DataSize
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
 		p := PacketInfo{
-			Pkt: stack.PacketBuffer{
-				Header: pkt.Header,
-				Data:   buffer.NewViewFromBytes(payloadView[off : off+size]).ToVectorisedView(),
-			},
+			Pkt:   pkt,
 			Proto: protocol,
 			GSO:   gso,
 			Route: route,
@@ -301,7 +295,7 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.Pac
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	p := PacketInfo{
-		Pkt:   stack.PacketBuffer{Data: vv},
+		Pkt:   &stack.PacketBuffer{Data: vv},
 		Proto: 0,
 		GSO:   nil,
 	}
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index 3b3b6909b..7198742b7 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -441,118 +441,106 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 
 // WritePackets writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
-	var ethHdrBuf []byte
-	// hdr + data
-	iovLen := 2
-	if e.hdrSize > 0 {
-		// Add ethernet header if needed.
-		ethHdrBuf = make([]byte, header.EthernetMinimumSize)
-		eth := header.Ethernet(ethHdrBuf)
-		ethHdr := &header.EthernetFields{
-			DstAddr: r.RemoteLinkAddress,
-			Type:    protocol,
-		}
-
-		// Preserve the src address if it's set in the route.
-		if r.LocalLinkAddress != "" {
-			ethHdr.SrcAddr = r.LocalLinkAddress
-		} else {
-			ethHdr.SrcAddr = e.addr
-		}
-		eth.Encode(ethHdr)
-		iovLen++
-	}
+//
+// NOTE: This API uses sendmmsg to batch packets. As a result the underlying FD
+// picked to write the packet out has to be the same for all packets in the
+// list. In other words all packets in the batch should belong to the same
+// flow.
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	n := pkts.Len()
 
-	n := len(pkts)
-
-	views := pkts[0].Data.Views()
-	/*
-	 * Each boundary in views can add one more iovec.
-	 *
-	 * payload |      |          |         |
-	 *         -----------------------------
-	 * packets |    |    |    |    |    |  |
-	 *         -----------------------------
-	 * iovecs  |    | |  |    |  | |    |  |
-	 */
-	iovec := make([]syscall.Iovec, n*iovLen+len(views)-1)
 	mmsgHdrs := make([]rawfile.MMsgHdr, n)
+	i := 0
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		var ethHdrBuf []byte
+		iovLen := 0
+		if e.hdrSize > 0 {
+			// Add ethernet header if needed.
+			ethHdrBuf = make([]byte, header.EthernetMinimumSize)
+			eth := header.Ethernet(ethHdrBuf)
+			ethHdr := &header.EthernetFields{
+				DstAddr: r.RemoteLinkAddress,
+				Type:    protocol,
+			}
 
-	iovecIdx := 0
-	viewIdx := 0
-	viewOff := 0
-	off := 0
-	nextOff := 0
-	for i := range pkts {
-		// TODO(b/134618279): Different packets may have different data
-		// in the future. We should handle this.
-		if !viewsEqual(pkts[i].Data.Views(), views) {
-			panic("All packets in pkts should have the same Data.")
+			// Preserve the src address if it's set in the route.
+			if r.LocalLinkAddress != "" {
+				ethHdr.SrcAddr = r.LocalLinkAddress
+			} else {
+				ethHdr.SrcAddr = e.addr
+			}
+			eth.Encode(ethHdr)
+			iovLen++
 		}
 
-		prevIovecIdx := iovecIdx
-		mmsgHdr := &mmsgHdrs[i]
-		mmsgHdr.Msg.Iov = &iovec[iovecIdx]
-		packetSize := pkts[i].DataSize
-		hdr := &pkts[i].Header
-
-		off = pkts[i].DataOffset
-		if off != nextOff {
-			// We stop in a different point last time.
-			size := packetSize
-			viewIdx = 0
-			viewOff = 0
-			for size > 0 {
-				if size >= len(views[viewIdx]) {
-					viewIdx++
-					viewOff = 0
-					size -= len(views[viewIdx])
-				} else {
-					viewOff = size
-					size = 0
+		var vnetHdrBuf []byte
+		vnetHdr := virtioNetHdr{}
+		if e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+			if gso != nil {
+				vnetHdr.hdrLen = uint16(pkt.Header.UsedLength())
+				if gso.NeedsCsum {
+					vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
+					vnetHdr.csumStart = header.EthernetMinimumSize + gso.L3HdrLen
+					vnetHdr.csumOffset = gso.CsumOffset
+				}
+				if gso.Type != stack.GSONone && uint16(pkt.Data.Size()) > gso.MSS {
+					switch gso.Type {
+					case stack.GSOTCPv4:
+						vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
+					case stack.GSOTCPv6:
+						vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
+					default:
+						panic(fmt.Sprintf("Unknown gso type: %v", gso.Type))
+					}
+					vnetHdr.gsoSize = gso.MSS
 				}
 			}
+			vnetHdrBuf = vnetHdrToByteSlice(&vnetHdr)
+			iovLen++
 		}
-		nextOff = off + packetSize
 
+		iovecs := make([]syscall.Iovec, iovLen+1+len(pkt.Data.Views()))
+		mmsgHdr := &mmsgHdrs[i]
+		mmsgHdr.Msg.Iov = &iovecs[0]
+		iovecIdx := 0
+		if vnetHdrBuf != nil {
+			v := &iovecs[iovecIdx]
+			v.Base = &vnetHdrBuf[0]
+			v.Len = uint64(len(vnetHdrBuf))
+			iovecIdx++
+		}
 		if ethHdrBuf != nil {
-			v := &iovec[iovecIdx]
+			v := &iovecs[iovecIdx]
 			v.Base = &ethHdrBuf[0]
 			v.Len = uint64(len(ethHdrBuf))
 			iovecIdx++
 		}
-
-		v := &iovec[iovecIdx]
+		pktSize := uint64(0)
+		// Encode L3 Header
+		v := &iovecs[iovecIdx]
+		hdr := &pkt.Header
 		hdrView := hdr.View()
 		v.Base = &hdrView[0]
 		v.Len = uint64(len(hdrView))
+		pktSize += v.Len
 		iovecIdx++
 
-		for packetSize > 0 {
-			vec := &iovec[iovecIdx]
+		// Now encode the Transport Payload.
+		pktViews := pkt.Data.Views()
+		for i := range pktViews {
+			vec := &iovecs[iovecIdx]
 			iovecIdx++
-
-			v := views[viewIdx]
-			vec.Base = &v[viewOff]
-			s := len(v) - viewOff
-			if s <= packetSize {
-				viewIdx++
-				viewOff = 0
-			} else {
-				s = packetSize
-				viewOff += s
-			}
-			vec.Len = uint64(s)
-			packetSize -= s
+			vec.Base = &pktViews[i][0]
+			vec.Len = uint64(len(pktViews[i]))
+			pktSize += vec.Len
 		}
-
-		mmsgHdr.Msg.Iovlen = uint64(iovecIdx - prevIovecIdx)
+		mmsgHdr.Msg.Iovlen = uint64(iovecIdx)
+		i++
 	}
 
 	packets := 0
 	for packets < n {
-		fd := e.fds[pkts[packets].Hash%uint32(len(e.fds))]
+		fd := e.fds[pkts.Front().Hash%uint32(len(e.fds))]
 		sent, err := rawfile.NonBlockingSendMMsg(fd, mmsgHdrs)
 		if err != nil {
 			return packets, err
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index 4039753b7..1e2255bfa 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -92,7 +92,7 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []stack.PacketBuffer, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
index f5973066d..a5478ce17 100644
--- a/pkg/tcpip/link/muxed/injectable.go
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -87,7 +87,7 @@ func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber,
 // WritePackets writes outbound packets to the appropriate
 // LinkInjectableEndpoint based on the RemoteAddress. HandleLocal only works if
 // r.RemoteAddress has a route registered in this endpoint.
-func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	endpoint, ok := m.routes[r.RemoteAddress]
 	if !ok {
 		return 0, tcpip.ErrNoRoute
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 6461d0108..0796d717e 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -214,7 +214,7 @@ func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 0a6b8945c..062388f4d 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -200,7 +200,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
 	return 0
 }
 
-func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
+func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
 		logPacket("send", protocol, pkt.Header.View(), gso)
 	}
@@ -233,20 +233,16 @@ func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumb
 // higher-level protocols to write packets; it just logs the packet and
 // forwards the request to the lower endpoint.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
-	e.dumpPacket(gso, protocol, pkt)
+	e.dumpPacket(gso, protocol, &pkt)
 	return e.lower.WritePacket(r, gso, protocol, pkt)
 }
 
 // WritePackets implements the stack.LinkEndpoint interface. It is called by
 // higher-level protocols to write packets; it just logs the packet and
 // forwards the request to the lower endpoint.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
-	view := pkts[0].Data.ToView()
-	for _, pkt := range pkts {
-		e.dumpPacket(gso, protocol, stack.PacketBuffer{
-			Header: pkt.Header,
-			Data:   view[pkt.DataOffset:][:pkt.DataSize].ToVectorisedView(),
-		})
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		e.dumpPacket(gso, protocol, pkt)
 	}
 	return e.lower.WritePackets(r, gso, pkts, protocol)
 }
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
index 52fe397bf..2b3741276 100644
--- a/pkg/tcpip/link/waitable/waitable.go
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -112,9 +112,9 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 // WritePackets implements stack.LinkEndpoint.WritePackets. It is called by
 // higher-level protocols to write packets. It only forwards packets to the
 // lower endpoint if Wait or WaitWrite haven't been called.
-func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	if !e.writeGate.Enter() {
-		return len(pkts), nil
+		return pkts.Len(), nil
 	}
 
 	n, err := e.lower.WritePackets(r, gso, pkts, protocol)
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
index 88224e494..54eb5322b 100644
--- a/pkg/tcpip/link/waitable/waitable_test.go
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -71,9 +71,9 @@ func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcp
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
-	e.writeCount += len(pkts)
-	return len(pkts), nil
+func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	e.writeCount += pkts.Len()
+	return pkts.Len(), nil
 }
 
 func (e *countedEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 255098372..7acbfa0a8 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -84,7 +84,7 @@ func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderPara
 }
 
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []stack.PacketBuffer, stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList, stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	return 0, tcpip.ErrNotSupported
 }
 
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index 4950d69fc..4c20301c6 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -172,7 +172,7 @@ func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Ne
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index a7d9a8b25..104aafbed 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -280,28 +280,47 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 }
 
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	if r.Loop&stack.PacketLoop != 0 {
 		panic("multiple packets in local loop")
 	}
 	if r.Loop&stack.PacketOut == 0 {
-		return len(pkts), nil
+		return pkts.Len(), nil
+	}
+
+	for pkt := pkts.Front(); pkt != nil; {
+		ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
+		pkt.NetworkHeader = buffer.View(ip)
+		pkt = pkt.Next()
 	}
 
 	// iptables filtering. All packets that reach here are locally
 	// generated.
 	ipt := e.stack.IPTables()
-	for i := range pkts {
-		if ok := ipt.Check(stack.Output, pkts[i]); !ok {
-			// iptables is telling us to drop the packet.
+	dropped := ipt.CheckPackets(stack.Output, pkts)
+	if len(dropped) == 0 {
+		// Fast path: If no packets are to be dropped then we can just invoke the
+		// faster WritePackets API directly.
+		n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
+		r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+		return n, err
+	}
+
+	// Slow Path as we are dropping some packets in the batch degrade to
+	// emitting one packet at a time.
+	n := 0
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		if _, ok := dropped[pkt]; ok {
 			continue
 		}
-		ip := e.addIPHeader(r, &pkts[i].Header, pkts[i].DataSize, params)
-		pkts[i].NetworkHeader = buffer.View(ip)
+		if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, *pkt); err != nil {
+			r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+			return n, err
+		}
+		n++
 	}
-	n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
 	r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
-	return n, err
+	return n, nil
 }
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 6d2d2c034..f91180aa3 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -79,7 +79,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 	// Only the first view in vv is accounted for by h. To account for the
 	// rest of vv, a shallow copy is made and the first view is removed.
 	// This copy is used as extra payload during the checksum calculation.
-	payload := pkt.Data
+	payload := pkt.Data.Clone(nil)
 	payload.RemoveFirst()
 	if got, want := h.Checksum(), header.ICMPv6Checksum(h, iph.SourceAddress(), iph.DestinationAddress(), payload); got != want {
 		received.Invalid.Increment()
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index b462b8604..a815b4d9b 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -143,19 +143,17 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	if r.Loop&stack.PacketLoop != 0 {
 		panic("not implemented")
 	}
 	if r.Loop&stack.PacketOut == 0 {
-		return len(pkts), nil
+		return pkts.Len(), nil
 	}
 
-	for i := range pkts {
-		hdr := &pkts[i].Header
-		size := pkts[i].DataSize
-		ip := e.addIPHeader(r, hdr, size, params)
-		pkts[i].NetworkHeader = buffer.View(ip)
+	for pb := pkts.Front(); pb != nil; pb = pb.Next() {
+		ip := e.addIPHeader(r, &pb.Header, pb.Data.Size(), params)
+		pb.NetworkHeader = buffer.View(ip)
 	}
 
 	n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 8d80e9cee..5e963a4af 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -15,6 +15,18 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "packet_buffer_list",
+    out = "packet_buffer_list.go",
+    package = "stack",
+    prefix = "PacketBuffer",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*PacketBuffer",
+        "Linker": "*PacketBuffer",
+    },
+)
+
 go_library(
     name = "stack",
     srcs = [
@@ -29,7 +41,7 @@ go_library(
         "ndp.go",
         "nic.go",
         "packet_buffer.go",
-        "packet_buffer_state.go",
+        "packet_buffer_list.go",
         "rand.go",
         "registration.go",
         "route.go",
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
index c45c43d21..e9c652042 100644
--- a/pkg/tcpip/stack/forwarder_test.go
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -101,7 +101,7 @@ func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkH
 }
 
 // WritePackets implements LinkEndpoint.WritePackets.
-func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) {
+func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
@@ -260,10 +260,10 @@ func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.Netw
 }
 
 // WritePackets stores outbound packets into the channel.
-func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	n := 0
-	for _, pkt := range pkts {
-		e.WritePacket(r, gso, protocol, pkt)
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		e.WritePacket(r, gso, protocol, *pkt)
 		n++
 	}
 
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
index 37907ae24..6c0a4b24d 100644
--- a/pkg/tcpip/stack/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -209,6 +209,23 @@ func (it *IPTables) Check(hook Hook, pkt PacketBuffer) bool {
 	return true
 }
 
+// CheckPackets runs pkts through the rules for hook and returns a map of packets that
+// should not go forward.
+//
+// NOTE: unlike the Check API the returned map contains packets that should be
+// dropped.
+func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList) (drop map[*PacketBuffer]struct{}) {
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		if ok := it.Check(hook, *pkt); !ok {
+			if drop == nil {
+				drop = make(map[*PacketBuffer]struct{})
+			}
+			drop[pkt] = struct{}{}
+		}
+	}
+	return drop
+}
+
 // Precondition: pkt.NetworkHeader is set.
 func (it *IPTables) checkChain(hook Hook, pkt PacketBuffer, table Table, ruleIdx int) chainVerdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 598468bdd..27dc8baf9 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -468,7 +468,7 @@ func TestDADResolve(t *testing.T) {
 				// As per RFC 4861 section 4.3, a possible option is the Source Link
 				// Layer option, but this option MUST NOT be included when the source
 				// address of the packet is the unspecified address.
-				checker.IPv6(t, p.Pkt.Header.View().ToVectorisedView().First(),
+				checker.IPv6(t, p.Pkt.Header.View(),
 					checker.SrcAddr(header.IPv6Any),
 					checker.DstAddr(snmc),
 					checker.TTL(header.NDPHopLimit),
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index 9367de180..dc125f25e 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -23,9 +23,11 @@ import (
 // As a PacketBuffer traverses up the stack, it may be necessary to pass it to
 // multiple endpoints. Clone() should be called in such cases so that
 // modifications to the Data field do not affect other copies.
-//
-// +stateify savable
 type PacketBuffer struct {
+	// PacketBufferEntry is used to build an intrusive list of
+	// PacketBuffers.
+	PacketBufferEntry
+
 	// Data holds the payload of the packet. For inbound packets, it also
 	// holds the headers, which are consumed as the packet moves up the
 	// stack. Headers are guaranteed not to be split across views.
@@ -34,14 +36,6 @@ type PacketBuffer struct {
 	// or otherwise modified.
 	Data buffer.VectorisedView
 
-	// DataOffset is used for GSO output. It is the offset into the Data
-	// field where the payload of this packet starts.
-	DataOffset int
-
-	// DataSize is used for GSO output. It is the size of this packet's
-	// payload.
-	DataSize int
-
 	// Header holds the headers of outbound packets. As a packet is passed
 	// down the stack, each layer adds to Header.
 	Header buffer.Prependable
diff --git a/pkg/tcpip/stack/packet_buffer_state.go b/pkg/tcpip/stack/packet_buffer_state.go
deleted file mode 100644
index 0c6b7924c..000000000
--- a/pkg/tcpip/stack/packet_buffer_state.go
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package stack
-
-import "gvisor.dev/gvisor/pkg/tcpip/buffer"
-
-// beforeSave is invoked by stateify.
-func (pk *PacketBuffer) beforeSave() {
-	// Non-Data fields may be slices of the Data field. This causes
-	// problems for SR, so during save we make each header independent.
-	pk.Header = pk.Header.DeepCopy()
-	pk.LinkHeader = append(buffer.View(nil), pk.LinkHeader...)
-	pk.NetworkHeader = append(buffer.View(nil), pk.NetworkHeader...)
-	pk.TransportHeader = append(buffer.View(nil), pk.TransportHeader...)
-}
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index ac043b722..23ca9ee03 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -246,7 +246,7 @@ type NetworkEndpoint interface {
 
 	// WritePackets writes packets to the given destination address and
 	// protocol. pkts must not be zero length.
-	WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error)
+	WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error)
 
 	// WriteHeaderIncludedPacket writes a packet that includes a network
 	// header to the given destination address.
@@ -393,7 +393,7 @@ type LinkEndpoint interface {
 	// Right now, WritePackets is used only when the software segmentation
 	// offload is enabled. If it will be used for something else, it may
 	// require to change syscall filters.
-	WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
+	WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
 
 	// WriteRawPacket writes a packet directly to the link. The packet
 	// should already have an ethernet header.
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 9fbe8a411..a0e5e0300 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -168,23 +168,26 @@ func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt PacketBuff
 	return err
 }
 
-// WritePackets writes the set of packets through the given route.
-func (r *Route) WritePackets(gso *GSO, pkts []PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) {
+// WritePackets writes a list of n packets through the given route and returns
+// the number of packets written.
+func (r *Route) WritePackets(gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) {
 	if !r.ref.isValidForOutgoing() {
 		return 0, tcpip.ErrInvalidEndpointState
 	}
 
 	n, err := r.ref.ep.WritePackets(r, gso, pkts, params)
 	if err != nil {
-		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(len(pkts) - n))
+		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len() - n))
 	}
 	r.ref.nic.stats.Tx.Packets.IncrementBy(uint64(n))
-	payloadSize := 0
-	for i := 0; i < n; i++ {
-		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(pkts[i].Header.UsedLength()))
-		payloadSize += pkts[i].DataSize
+
+	writtenBytes := 0
+	for i, pb := 0, pkts.Front(); i < n && pb != nil; i, pb = i+1, pb.Next() {
+		writtenBytes += pb.Header.UsedLength()
+		writtenBytes += pb.Data.Size()
 	}
-	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(payloadSize))
+
+	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(writtenBytes))
 	return n, err
 }
 
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index b8543b71e..3f8a2a095 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -153,7 +153,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 3239a5911..2ca3fb809 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -756,8 +756,7 @@ func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedV
 func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *stack.GSO) {
 	optLen := len(tf.opts)
 	hdr := &pkt.Header
-	packetSize := pkt.DataSize
-	off := pkt.DataOffset
+	packetSize := pkt.Data.Size()
 	// Initialize the header.
 	tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen))
 	pkt.TransportHeader = buffer.View(tcp)
@@ -782,12 +781,18 @@ func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *sta
 		// header and data and get the right sum of the TCP packet.
 		tcp.SetChecksum(xsum)
 	} else if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 {
-		xsum = header.ChecksumVVWithOffset(pkt.Data, xsum, off, packetSize)
+		xsum = header.ChecksumVV(pkt.Data, xsum)
 		tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
 	}
 }
 
 func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error {
+	// We need to shallow clone the VectorisedView here as ReadToView will
+	// split the VectorisedView and Trim underlying views as it splits. Not
+	// doing the clone here will cause the underlying views of data itself
+	// to be altered.
+	data = data.Clone(nil)
+
 	optLen := len(tf.opts)
 	if tf.rcvWnd > 0xffff {
 		tf.rcvWnd = 0xffff
@@ -796,31 +801,25 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso
 	mss := int(gso.MSS)
 	n := (data.Size() + mss - 1) / mss
 
-	// Allocate one big slice for all the headers.
-	hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen
-	buf := make([]byte, n*hdrSize)
-	pkts := make([]stack.PacketBuffer, n)
-	for i := range pkts {
-		pkts[i].Header = buffer.NewEmptyPrependableFromView(buf[i*hdrSize:][:hdrSize])
-	}
-
 	size := data.Size()
-	off := 0
+	hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen
+	var pkts stack.PacketBufferList
 	for i := 0; i < n; i++ {
 		packetSize := mss
 		if packetSize > size {
 			packetSize = size
 		}
 		size -= packetSize
-		pkts[i].DataOffset = off
-		pkts[i].DataSize = packetSize
-		pkts[i].Data = data
-		pkts[i].Hash = tf.txHash
-		pkts[i].Owner = owner
-		buildTCPHdr(r, tf, &pkts[i], gso)
-		off += packetSize
+		var pkt stack.PacketBuffer
+		pkt.Header = buffer.NewPrependable(hdrSize)
+		pkt.Hash = tf.txHash
+		pkt.Owner = owner
+		data.ReadToVV(&pkt.Data, packetSize)
+		buildTCPHdr(r, tf, &pkt, gso)
 		tf.seq = tf.seq.Add(seqnum.Size(packetSize))
+		pkts.PushBack(&pkt)
 	}
+
 	if tf.ttl == 0 {
 		tf.ttl = r.DefaultTTL()
 	}
@@ -845,12 +844,10 @@ func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stac
 	}
 
 	pkt := stack.PacketBuffer{
-		Header:     buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen),
-		DataOffset: 0,
-		DataSize:   data.Size(),
-		Data:       data,
-		Hash:       tf.txHash,
-		Owner:      owner,
+		Header: buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen),
+		Data:   data,
+		Hash:   tf.txHash,
+		Owner:  owner,
 	}
 	buildTCPHdr(r, tf, &pkt, gso)
 
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index e6fe7985d..40461fd31 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -77,9 +77,11 @@ func newSegmentFromView(r *stack.Route, id stack.TransportEndpointID, v buffer.V
 		id:     id,
 		route:  r.Clone(),
 	}
-	s.views[0] = v
-	s.data = buffer.NewVectorisedView(len(v), s.views[:1])
 	s.rcvdTime = time.Now()
+	if len(v) != 0 {
+		s.views[0] = v
+		s.data = buffer.NewVectorisedView(len(v), s.views[:1])
+	}
 	return s
 }
 
-- 
cgit v1.2.3


From 24bee1c1813a691072cff5bad7a528690a99eb5e Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Sat, 4 Apr 2020 21:01:42 -0700
Subject: Record VFS2 sockets in global socket map.

Updates #1476, #1478, #1484, #1485.

PiperOrigin-RevId: 304845354
---
 pkg/sentry/fsimpl/proc/BUILD        |  1 -
 pkg/sentry/fsimpl/proc/task_net.go  | 88 ++++++++++++++++++++++---------------
 pkg/sentry/kernel/kernel.go         | 30 +++++++++++--
 pkg/sentry/socket/socket.go         |  6 ++-
 pkg/sentry/socket/unix/unix_vfs2.go |  2 +-
 pkg/sentry/vfs/file_description.go  |  6 +++
 6 files changed, 91 insertions(+), 42 deletions(-)

diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 8156984eb..17c1342b5 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -22,7 +22,6 @@ go_library(
         "//pkg/log",
         "//pkg/refs",
         "//pkg/safemem",
-        "//pkg/sentry/fs",
         "//pkg/sentry/fsbridge",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/inet",
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index 373a7b17d..6b2a77328 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -24,7 +24,6 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -32,6 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -206,22 +206,21 @@ var _ dynamicInode = (*netUnixData)(nil)
 func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	buf.WriteString("Num       RefCount Protocol Flags    Type St Inode Path\n")
 	for _, se := range n.kernel.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
+		s := se.SockVFS2
+		if !s.TryIncRef() {
+			log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
 			continue
 		}
-		sfile := s.(*fs.File)
-		if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
+		if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX {
 			s.DecRef()
 			// Not a unix socket.
 			continue
 		}
-		sops := sfile.FileOperations.(*unix.SocketOperations)
+		sops := s.Impl().(*unix.SocketVFS2)
 
 		addr, err := sops.Endpoint().GetLocalAddress()
 		if err != nil {
-			log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err)
+			log.Warningf("Failed to retrieve socket name from %+v: %v", s, err)
 			addr.Addr = "<unknown>"
 		}
 
@@ -234,6 +233,15 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 			}
 		}
 
+		// Get inode number.
+		var ino uint64
+		stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_INO})
+		if statErr != nil || stat.Mask&linux.STATX_INO == 0 {
+			log.Warningf("Failed to retrieve ino for socket file: %v", statErr)
+		} else {
+			ino = stat.Ino
+		}
+
 		// In the socket entry below, the value for the 'Num' field requires
 		// some consideration. Linux prints the address to the struct
 		// unix_sock representing a socket in the kernel, but may redact the
@@ -252,14 +260,14 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		// the definition of this struct changes over time.
 		//
 		// For now, we always redact this pointer.
-		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d",
+		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d",
 			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
-			sfile.ReadRefs()-1,            // RefCount, don't count our own ref.
+			s.Refs()-1,                    // RefCount, don't count our own ref.
 			0,                             // Protocol, always 0 for UDS.
 			sockFlags,                     // Flags.
 			sops.Endpoint().Type(),        // Type.
 			sops.State(),                  // State.
-			sfile.InodeID(),               // Inode.
+			ino,                           // Inode.
 		)
 
 		// Path
@@ -341,15 +349,14 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
 	t := kernel.TaskFromContext(ctx)
 
 	for _, se := range k.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
+		s := se.SockVFS2
+		if !s.TryIncRef() {
+			log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
 			continue
 		}
-		sfile := s.(*fs.File)
-		sops, ok := sfile.FileOperations.(socket.Socket)
+		sops, ok := s.Impl().(socket.SocketVFS2)
 		if !ok {
-			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s))
 		}
 		if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
 			s.DecRef()
@@ -398,14 +405,15 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
 		// Unimplemented.
 		fmt.Fprintf(buf, "%08X ", 0)
 
+		stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO})
+
 		// Field: uid.
-		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
-		if err != nil {
-			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+		if statErr != nil || stat.Mask&linux.STATX_UID == 0 {
+			log.Warningf("Failed to retrieve uid for socket file: %v", statErr)
 			fmt.Fprintf(buf, "%5d ", 0)
 		} else {
 			creds := auth.CredentialsFromContext(ctx)
-			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
+			fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow()))
 		}
 
 		// Field: timeout; number of unanswered 0-window probes.
@@ -413,11 +421,16 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
 		fmt.Fprintf(buf, "%8d ", 0)
 
 		// Field: inode.
-		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+		if statErr != nil || stat.Mask&linux.STATX_INO == 0 {
+			log.Warningf("Failed to retrieve inode for socket file: %v", statErr)
+			fmt.Fprintf(buf, "%8d ", 0)
+		} else {
+			fmt.Fprintf(buf, "%8d ", stat.Ino)
+		}
 
 		// Field: refcount. Don't count the ref we obtain while deferencing
 		// the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+		fmt.Fprintf(buf, "%d ", s.Refs()-1)
 
 		// Field: Socket struct address. Redacted due to the same reason as
 		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -499,15 +512,14 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	t := kernel.TaskFromContext(ctx)
 
 	for _, se := range d.kernel.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
+		s := se.SockVFS2
+		if !s.TryIncRef() {
+			log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
 			continue
 		}
-		sfile := s.(*fs.File)
-		sops, ok := sfile.FileOperations.(socket.Socket)
+		sops, ok := s.Impl().(socket.SocketVFS2)
 		if !ok {
-			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s))
 		}
 		if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
 			s.DecRef()
@@ -551,25 +563,31 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		// Field: retrnsmt. Always 0 for UDP.
 		fmt.Fprintf(buf, "%08X ", 0)
 
+		stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO})
+
 		// Field: uid.
-		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
-		if err != nil {
-			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+		if statErr != nil || stat.Mask&linux.STATX_UID == 0 {
+			log.Warningf("Failed to retrieve uid for socket file: %v", statErr)
 			fmt.Fprintf(buf, "%5d ", 0)
 		} else {
 			creds := auth.CredentialsFromContext(ctx)
-			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
+			fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow()))
 		}
 
 		// Field: timeout. Always 0 for UDP.
 		fmt.Fprintf(buf, "%8d ", 0)
 
 		// Field: inode.
-		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+		if statErr != nil || stat.Mask&linux.STATX_INO == 0 {
+			log.Warningf("Failed to retrieve inode for socket file: %v", statErr)
+			fmt.Fprintf(buf, "%8d ", 0)
+		} else {
+			fmt.Fprintf(buf, "%8d ", stat.Ino)
+		}
 
 		// Field: ref; reference count on the socket inode. Don't count the ref
 		// we obtain while deferencing the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+		fmt.Fprintf(buf, "%d ", s.Refs()-1)
 
 		// Field: Socket struct address. Redacted due to the same reason as
 		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 2e6f42b92..ba8935a82 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -1445,9 +1445,10 @@ func (k *Kernel) SupervisorContext() context.Context {
 // +stateify savable
 type SocketEntry struct {
 	socketEntry
-	k    *Kernel
-	Sock *refs.WeakRef
-	ID   uint64 // Socket table entry number.
+	k        *Kernel
+	Sock     *refs.WeakRef
+	SockVFS2 *vfs.FileDescription
+	ID       uint64 // Socket table entry number.
 }
 
 // WeakRefGone implements refs.WeakRefUser.WeakRefGone.
@@ -1470,7 +1471,30 @@ func (k *Kernel) RecordSocket(sock *fs.File) {
 	k.extMu.Unlock()
 }
 
+// RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for
+// tracking.
+//
+// Precondition: Caller must hold a reference to sock.
+//
+// Note that the socket table will not hold a reference on the
+// vfs.FileDescription, because we do not support weak refs on VFS2 files.
+func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) {
+	k.extMu.Lock()
+	id := k.nextSocketEntry
+	k.nextSocketEntry++
+	s := &SocketEntry{
+		k:        k,
+		ID:       id,
+		SockVFS2: sock,
+	}
+	k.sockets.PushBack(s)
+	k.extMu.Unlock()
+}
+
 // ListSockets returns a snapshot of all sockets.
+//
+// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef()
+// to get a reference on a socket in the table.
 func (k *Kernel) ListSockets() []*SocketEntry {
 	k.extMu.Lock()
 	var socks []*SocketEntry
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index b5ba4a56b..6580bd6e9 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -269,7 +269,7 @@ func NewVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*v
 			return nil, err
 		}
 		if s != nil {
-			// TODO: Add vfs2 sockets to global socket table.
+			t.Kernel().RecordSocketVFS2(s)
 			return s, nil
 		}
 	}
@@ -291,7 +291,9 @@ func PairVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*
 			return nil, nil, err
 		}
 		if s1 != nil && s2 != nil {
-			// TODO: Add vfs2 sockets to global socket table.
+			k := t.Kernel()
+			k.RecordSocketVFS2(s1)
+			k.RecordSocketVFS2(s2)
 			return s1, s2, nil
 		}
 	}
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index ca1388e2c..3e54d49c4 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -141,7 +141,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
 		return 0, nil, 0, syserr.FromError(e)
 	}
 
-	// TODO: add vfs2 sockets to global table.
+	t.Kernel().RecordSocketVFS2(ns)
 	return fd, addr, addrLen, nil
 }
 
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 5df4bbf45..28e93a441 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -182,6 +182,12 @@ func (fd *FileDescription) DecRef() {
 	}
 }
 
+// Refs returns the current number of references. The returned count
+// is inherently racy and is unsafe to use without external synchronization.
+func (fd *FileDescription) Refs() int64 {
+	return atomic.LoadInt64(&fd.refs)
+}
+
 // Mount returns the mount on which fd was opened. It does not take a reference
 // on the returned Mount.
 func (fd *FileDescription) Mount() *Mount {
-- 
cgit v1.2.3


From 748290236408f2c3e33b5f208352f8fd2ecbfa1e Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Mon, 6 Apr 2020 06:24:20 -0700
Subject: Implement Stringer for Layer

Tested:
  Sample output for printing 3 different Layer structs:

  &testbench.Ether{SrcAddr:02:42:c4:77:5d:14 DstAddr:02:42:c4:77:5d:0a}
  &testbench.IPv4{SrcAddr:196.119.93.20 DstAddr:224.0.0.1}
  &testbench.UDP{SrcPort:0xc00033b260 DstPort:0xc00033b280}

  Sample output for printing a Layers struct (word-wrapped):
  [&testbench.Ether{SrcAddr:02:42:c4:77:5d:14 DstAddr:02:42:c4:77:5d:0a}
   &testbench.IPv4{SrcAddr:196.119.93.20 DstAddr:224.0.0.1}
   &testbench.UDP{SrcPort:0xc00033b260 DstPort:0xc00033b280}
   &testbench.Payload{Bytes:[104 101 108 108 111 32 119 111 114 108 100]}]

PiperOrigin-RevId: 305014376
---
 test/packetimpact/testbench/BUILD          |  9 ++++-
 test/packetimpact/testbench/layers.go      | 61 ++++++++++++++++++++++++++----
 test/packetimpact/testbench/layers_test.go | 49 ++++++++++++++++++++++++
 3 files changed, 110 insertions(+), 9 deletions(-)
 create mode 100644 test/packetimpact/testbench/layers_test.go

diff --git a/test/packetimpact/testbench/BUILD b/test/packetimpact/testbench/BUILD
index 4a9d8efa6..199823419 100644
--- a/test/packetimpact/testbench/BUILD
+++ b/test/packetimpact/testbench/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(
     default_visibility = ["//test/packetimpact:__subpackages__"],
@@ -30,3 +30,10 @@ go_library(
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
+
+go_test(
+    name = "testbench_test",
+    size = "small",
+    srcs = ["layers_test.go"],
+    library = ":testbench",
+)
diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go
index d7434c3d2..4d6625941 100644
--- a/test/packetimpact/testbench/layers.go
+++ b/test/packetimpact/testbench/layers.go
@@ -17,6 +17,7 @@ package testbench
 import (
 	"fmt"
 	"reflect"
+	"strings"
 
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
@@ -32,6 +33,8 @@ import (
 // Layer contains all the fields of the encapsulation. Each field is a pointer
 // and may be nil.
 type Layer interface {
+	fmt.Stringer
+
 	// toBytes converts the Layer into bytes. In places where the Layer's field
 	// isn't nil, the value that is pointed to is used. When the field is nil, a
 	// reasonable default for the Layer is used. For example, "64" for IPv4 TTL
@@ -43,7 +46,8 @@ type Layer interface {
 
 	// match checks if the current Layer matches the provided Layer. If either
 	// Layer has a nil in a given field, that field is considered matching.
-	// Otherwise, the values pointed to by the fields must match.
+	// Otherwise, the values pointed to by the fields must match. The LayerBase is
+	// ignored.
 	match(Layer) bool
 
 	// length in bytes of the current encapsulation
@@ -84,18 +88,39 @@ func (lb *LayerBase) setPrev(l Layer) {
 	lb.prevLayer = l
 }
 
+// equalLayer compares that two Layer structs match while ignoring field in
+// which either input has a nil and also ignoring the LayerBase of the inputs.
 func equalLayer(x, y Layer) bool {
+	// opt ignores comparison pairs where either of the inputs is a nil.
 	opt := cmp.FilterValues(func(x, y interface{}) bool {
-		if reflect.ValueOf(x).Kind() == reflect.Ptr && reflect.ValueOf(x).IsNil() {
-			return true
-		}
-		if reflect.ValueOf(y).Kind() == reflect.Ptr && reflect.ValueOf(y).IsNil() {
-			return true
+		for _, l := range []interface{}{x, y} {
+			v := reflect.ValueOf(l)
+			if (v.Kind() == reflect.Ptr || v.Kind() == reflect.Slice) && v.IsNil() {
+				return true
+			}
 		}
 		return false
-
 	}, cmp.Ignore())
-	return cmp.Equal(x, y, opt, cmpopts.IgnoreUnexported(LayerBase{}))
+	return cmp.Equal(x, y, opt, cmpopts.IgnoreTypes(LayerBase{}))
+}
+
+func stringLayer(l Layer) string {
+	v := reflect.ValueOf(l).Elem()
+	t := v.Type()
+	var ret []string
+	for i := 0; i < v.NumField(); i++ {
+		t := t.Field(i)
+		if t.Anonymous {
+			// Ignore the LayerBase in the Layer struct.
+			continue
+		}
+		v := v.Field(i)
+		if v.IsNil() {
+			continue
+		}
+		ret = append(ret, fmt.Sprintf("%s:%v", t.Name, v))
+	}
+	return fmt.Sprintf("&%s{%s}", t, strings.Join(ret, " "))
 }
 
 // Ether can construct and match an ethernet encapsulation.
@@ -106,6 +131,10 @@ type Ether struct {
 	Type    *tcpip.NetworkProtocolNumber
 }
 
+func (l *Ether) String() string {
+	return stringLayer(l)
+}
+
 func (l *Ether) toBytes() ([]byte, error) {
 	b := make([]byte, header.EthernetMinimumSize)
 	h := header.Ethernet(b)
@@ -190,6 +219,10 @@ type IPv4 struct {
 	DstAddr        *tcpip.Address
 }
 
+func (l *IPv4) String() string {
+	return stringLayer(l)
+}
+
 func (l *IPv4) toBytes() ([]byte, error) {
 	b := make([]byte, header.IPv4MinimumSize)
 	h := header.IPv4(b)
@@ -339,6 +372,10 @@ type TCP struct {
 	UrgentPointer *uint16
 }
 
+func (l *TCP) String() string {
+	return stringLayer(l)
+}
+
 func (l *TCP) toBytes() ([]byte, error) {
 	b := make([]byte, header.TCPMinimumSize)
 	h := header.TCP(b)
@@ -480,6 +517,10 @@ type UDP struct {
 	Checksum *uint16
 }
 
+func (l *UDP) String() string {
+	return stringLayer(l)
+}
+
 func (l *UDP) toBytes() ([]byte, error) {
 	b := make([]byte, header.UDPMinimumSize)
 	h := header.UDP(b)
@@ -556,6 +597,10 @@ type Payload struct {
 	Bytes []byte
 }
 
+func (l *Payload) String() string {
+	return stringLayer(l)
+}
+
 // ParsePayload parses the bytes assuming that they start with a payload and
 // continue to the end. There can be no further encapsulations.
 func ParsePayload(b []byte) (Layers, error) {
diff --git a/test/packetimpact/testbench/layers_test.go b/test/packetimpact/testbench/layers_test.go
new file mode 100644
index 000000000..b39839625
--- /dev/null
+++ b/test/packetimpact/testbench/layers_test.go
@@ -0,0 +1,49 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testbench
+
+import "testing"
+
+func TestLayerMatch(t *testing.T) {
+	var nilPayload *Payload
+	noPayload := &Payload{}
+	emptyPayload := &Payload{Bytes: []byte{}}
+	fullPayload := &Payload{Bytes: []byte{1, 2, 3}}
+	emptyTCP := &TCP{SrcPort: Uint16(1234), LayerBase: LayerBase{nextLayer: emptyPayload}}
+	fullTCP := &TCP{SrcPort: Uint16(1234), LayerBase: LayerBase{nextLayer: fullPayload}}
+	for _, tt := range []struct {
+		a, b Layer
+		want bool
+	}{
+		{nilPayload, nilPayload, true},
+		{nilPayload, noPayload, true},
+		{nilPayload, emptyPayload, true},
+		{nilPayload, fullPayload, true},
+		{noPayload, noPayload, true},
+		{noPayload, emptyPayload, true},
+		{noPayload, fullPayload, true},
+		{emptyPayload, emptyPayload, true},
+		{emptyPayload, fullPayload, false},
+		{fullPayload, fullPayload, true},
+		{emptyTCP, fullTCP, true},
+	} {
+		if got := tt.a.match(tt.b); got != tt.want {
+			t.Errorf("%s.match(%s) = %t, want %t", tt.a, tt.b, got, tt.want)
+		}
+		if got := tt.b.match(tt.a); got != tt.want {
+			t.Errorf("%s.match(%s) = %t, want %t", tt.b, tt.a, got, tt.want)
+		}
+	}
+}
-- 
cgit v1.2.3


From 00d9776a4bb1cc1d7125af7d3e54a939a4f3847a Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 6 Apr 2020 07:30:20 -0700
Subject: Add socket files to tmpfs VFS2.

Updates #1476.

PiperOrigin-RevId: 305024274
---
 pkg/sentry/fsimpl/tmpfs/BUILD          |  1 +
 pkg/sentry/fsimpl/tmpfs/filesystem.go  | 20 +++++++++++++++-----
 pkg/sentry/fsimpl/tmpfs/socket_file.go | 34 ++++++++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/tmpfs/tmpfs.go       |  4 +++-
 4 files changed, 53 insertions(+), 6 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/tmpfs/socket_file.go

diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 6ea35affb..f2ac23c88 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -24,6 +24,7 @@ go_library(
         "filesystem.go",
         "named_pipe.go",
         "regular_file.go",
+        "socket_file.go",
         "symlink.go",
         "tmpfs.go",
     ],
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 1978af69c..5339d7072 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -261,8 +261,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		case linux.S_IFCHR:
 			childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
 		case linux.S_IFSOCK:
-			// Not yet supported.
-			return syserror.EPERM
+			childInode = fs.newSocketFile(rp.Credentials(), opts.Mode, opts.Endpoint)
 		default:
 			return syserror.EINVAL
 		}
@@ -396,6 +395,8 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 		return newNamedPipeFD(ctx, impl, rp, &d.vfsd, opts.Flags)
 	case *deviceFile:
 		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
+	case *socketFile:
+		return nil, syserror.ENXIO
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
 	}
@@ -679,10 +680,19 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 }
 
 // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
-//
-// TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt.
 func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) {
-	return nil, syserror.ECONNREFUSED
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
+	if err != nil {
+		return nil, err
+	}
+	switch impl := d.inode.impl.(type) {
+	case *socketFile:
+		return impl.ep, nil
+	default:
+		return nil, syserror.ECONNREFUSED
+	}
 }
 
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go
new file mode 100644
index 000000000..25c2321af
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go
@@ -0,0 +1,34 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+)
+
+// socketFile is a socket (=S_IFSOCK) tmpfs file.
+type socketFile struct {
+	inode inode
+	ep    transport.BoundEndpoint
+}
+
+func (fs *filesystem) newSocketFile(creds *auth.Credentials, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
+	file := &socketFile{ep: ep}
+	file.inode.init(file, fs, creds, mode)
+	file.inode.nlink = 1 // from parent directory
+	return &file.inode
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index ad47288f8..654e788e3 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -331,7 +331,7 @@ func (i *inode) statTo(stat *linux.Statx) {
 	case *deviceFile:
 		stat.RdevMajor = impl.major
 		stat.RdevMinor = impl.minor
-	case *directory, *namedPipe:
+	case *socketFile, *directory, *namedPipe:
 		// Nothing to do.
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
@@ -479,6 +479,8 @@ func (i *inode) direntType() uint8 {
 		return linux.DT_DIR
 	case *symlink:
 		return linux.DT_LNK
+	case *socketFile:
+		return linux.DT_SOCK
 	case *deviceFile:
 		switch impl.kind {
 		case vfs.BlockDevice:
-- 
cgit v1.2.3


From 4baa7e70795edbb350d55a9365807341515d3af4 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 6 Apr 2020 09:50:13 -0700
Subject: Bump up acceptable sample count for flaky itimer test.

Running the test 1000x almost always produces 1+ test failures where
the sample count is slightly more than 60.

PiperOrigin-RevId: 305051754
---
 test/syscalls/linux/itimer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc
index 8b48f0804..dd981a278 100644
--- a/test/syscalls/linux/itimer.cc
+++ b/test/syscalls/linux/itimer.cc
@@ -246,7 +246,7 @@ int TestSIGPROFFairness(absl::Duration sleep) {
 
   // The number of samples on the main thread should be very low as it did
   // nothing.
-  TEST_CHECK(result.main_thread_samples < 60);
+  TEST_CHECK(result.main_thread_samples < 80);
 
   // Both workers should get roughly equal number of samples.
   TEST_CHECK(result.worker_samples.size() == 2);
-- 
cgit v1.2.3


From f332a864e8cc7799332838deffab37244ff8ffc7 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Mon, 6 Apr 2020 10:51:54 -0700
Subject: Port timerfd to VFS2.

PiperOrigin-RevId: 305067208
---
 pkg/sentry/kernel/kernel.go                        |  28 ++--
 pkg/sentry/syscalls/linux/vfs2/BUILD               |   1 +
 .../syscalls/linux/vfs2/linux64_override_amd64.go  |   6 +-
 pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go      | 123 ++++++++++++++++++
 pkg/sentry/vfs/BUILD                               |   2 +
 pkg/sentry/vfs/file_description.go                 |   7 +
 pkg/sentry/vfs/timerfd.go                          | 142 +++++++++++++++++++++
 7 files changed, 295 insertions(+), 14 deletions(-)
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go
 create mode 100644 pkg/sentry/vfs/timerfd.go

diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index ba8935a82..de8a95854 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -1044,14 +1044,17 @@ func (k *Kernel) pauseTimeLocked() {
 		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
 		// but ktime.Timer.Pause is idempotent so this is harmless.
 		if t.fdTable != nil {
-			// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
-			if !VFS2Enabled {
-				t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+			t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+				if VFS2Enabled {
+					if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok {
+						tfd.PauseTimer()
+					}
+				} else {
 					if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
 						tfd.PauseTimer()
 					}
-				})
-			}
+				}
+			})
 		}
 	}
 	k.timekeeper.PauseUpdates()
@@ -1076,15 +1079,18 @@ func (k *Kernel) resumeTimeLocked() {
 				it.ResumeTimer()
 			}
 		}
-		// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
-		if !VFS2Enabled {
-			if t.fdTable != nil {
-				t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+		if t.fdTable != nil {
+			t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+				if VFS2Enabled {
+					if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok {
+						tfd.ResumeTimer()
+					}
+				} else {
 					if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
 						tfd.ResumeTimer()
 					}
-				})
-			}
+				}
+			})
 		}
 	}
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index 2eb210014..0004e60d9 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -25,6 +25,7 @@ go_library(
         "stat_amd64.go",
         "stat_arm64.go",
         "sync.go",
+        "sys_timerfd.go",
         "xattr.go",
     ],
     marshal = True,
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
index 7d220bc20..63febc2f7 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -139,11 +139,11 @@ func Override(table map[uintptr]kernel.Syscall) {
 	table[280] = syscalls.Supported("utimensat", Utimensat)
 	table[281] = syscalls.Supported("epoll_pwait", EpollPwait)
 	delete(table, 282) // signalfd
-	delete(table, 283) // timerfd_create
+	table[283] = syscalls.Supported("timerfd_create", TimerfdCreate)
 	delete(table, 284) // eventfd
 	delete(table, 285) // fallocate
-	delete(table, 286) // timerfd_settime
-	delete(table, 287) // timerfd_gettime
+	table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime)
+	table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime)
 	delete(table, 288) // accept4
 	delete(table, 289) // signalfd4
 	delete(table, 290) // eventfd2
diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go b/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go
new file mode 100644
index 000000000..7938a5249
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go
@@ -0,0 +1,123 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TimerfdCreate implements Linux syscall timerfd_create(2).
+func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	clockID := args[0].Int()
+	flags := args[1].Int()
+
+	if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var fileFlags uint32
+	if flags&linux.TFD_NONBLOCK != 0 {
+		fileFlags = linux.O_NONBLOCK
+	}
+
+	var clock ktime.Clock
+	switch clockID {
+	case linux.CLOCK_REALTIME:
+		clock = t.Kernel().RealtimeClock()
+	case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME:
+		clock = t.Kernel().MonotonicClock()
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+	file, err := t.Kernel().VFS().NewTimerFD(clock, fileFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.TFD_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// TimerfdSettime implements Linux syscall timerfd_settime(2).
+func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	flags := args[1].Int()
+	newValAddr := args[2].Pointer()
+	oldValAddr := args[3].Pointer()
+
+	if flags&^(linux.TFD_TIMER_ABSTIME) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	tfd, ok := file.Impl().(*vfs.TimerFileDescription)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var newVal linux.Itimerspec
+	if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+		return 0, nil, err
+	}
+	newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock())
+	if err != nil {
+		return 0, nil, err
+	}
+	tm, oldS := tfd.SetTime(newS)
+	if oldValAddr != 0 {
+		oldVal := ktime.ItimerspecFromSetting(tm, oldS)
+		if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
+			return 0, nil, err
+		}
+	}
+	return 0, nil, nil
+}
+
+// TimerfdGettime implements Linux syscall timerfd_gettime(2).
+func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	curValAddr := args[1].Pointer()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	tfd, ok := file.Impl().(*vfs.TimerFileDescription)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+
+	tm, s := tfd.GetTime()
+	curVal := ktime.ItimerspecFromSetting(tm, s)
+	_, err := t.CopyOut(curValAddr, &curVal)
+	return 0, nil, err
+}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index bf4d27c7d..9aeb83fb0 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -36,6 +36,7 @@ go_library(
         "pathname.go",
         "permissions.go",
         "resolving_path.go",
+        "timerfd.go",
         "vfs.go",
     ],
     visibility = ["//pkg/sentry:internal"],
@@ -51,6 +52,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
         "//pkg/sentry/limits",
         "//pkg/sentry/memmap",
         "//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 28e93a441..20c545fca 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -91,6 +91,10 @@ type FileDescriptionOptions struct {
 	// ESPIPE.
 	DenyPWrite bool
 
+	// if InvalidWrite is true, calls to FileDescription.Write() return
+	// EINVAL.
+	InvalidWrite bool
+
 	// If UseDentryMetadata is true, calls to FileDescription methods that
 	// interact with file and filesystem metadata (Stat, SetStat, StatFS,
 	// Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
@@ -562,6 +566,9 @@ func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, o
 
 // Write is similar to PWrite, but does not specify an offset.
 func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	if fd.opts.InvalidWrite {
+		return 0, syserror.EINVAL
+	}
 	if !fd.writable {
 		return 0, syserror.EBADF
 	}
diff --git a/pkg/sentry/vfs/timerfd.go b/pkg/sentry/vfs/timerfd.go
new file mode 100644
index 000000000..42b880656
--- /dev/null
+++ b/pkg/sentry/vfs/timerfd.go
@@ -0,0 +1,142 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/context"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// TimerFileDescription implements FileDescriptionImpl for timer fds. It also
+// implements ktime.TimerListener.
+type TimerFileDescription struct {
+	vfsfd FileDescription
+	FileDescriptionDefaultImpl
+	DentryMetadataFileDescriptionImpl
+
+	events waiter.Queue
+	timer  *ktime.Timer
+
+	// val is the number of timer expirations since the last successful
+	// call to PRead, or SetTime. val must be accessed using atomic memory
+	// operations.
+	val uint64
+}
+
+var _ FileDescriptionImpl = (*TimerFileDescription)(nil)
+var _ ktime.TimerListener = (*TimerFileDescription)(nil)
+
+// NewTimerFD returns a new timer fd.
+func (vfs *VirtualFilesystem) NewTimerFD(clock ktime.Clock, flags uint32) (*FileDescription, error) {
+	vd := vfs.NewAnonVirtualDentry("[timerfd]")
+	defer vd.DecRef()
+	tfd := &TimerFileDescription{}
+	tfd.timer = ktime.NewTimer(clock, tfd)
+	if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+		UseDentryMetadata: true,
+		DenyPRead:         true,
+		DenyPWrite:        true,
+		InvalidWrite:      true,
+	}); err != nil {
+		return nil, err
+	}
+	return &tfd.vfsfd, nil
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	const sizeofUint64 = 8
+	if dst.NumBytes() < sizeofUint64 {
+		return 0, syserror.EINVAL
+	}
+	if val := atomic.SwapUint64(&tfd.val, 0); val != 0 {
+		var buf [sizeofUint64]byte
+		usermem.ByteOrder.PutUint64(buf[:], val)
+		if _, err := dst.CopyOut(ctx, buf[:]); err != nil {
+			// Linux does not undo consuming the number of
+			// expirations even if writing to userspace fails.
+			return 0, err
+		}
+		return sizeofUint64, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+// Clock returns the timer fd's Clock.
+func (tfd *TimerFileDescription) Clock() ktime.Clock {
+	return tfd.timer.Clock()
+}
+
+// GetTime returns the associated Timer's setting and the time at which it was
+// observed.
+func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) {
+	return tfd.timer.Get()
+}
+
+// SetTime atomically changes the associated Timer's setting, resets the number
+// of expirations to 0, and returns the previous setting and the time at which
+// it was observed.
+func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) {
+	return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) })
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	var ready waiter.EventMask
+	if atomic.LoadUint64(&tfd.val) != 0 {
+		ready |= waiter.EventIn
+	}
+	return ready
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	tfd.events.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) {
+	tfd.events.EventUnregister(e)
+}
+
+// PauseTimer pauses the associated Timer.
+func (tfd *TimerFileDescription) PauseTimer() {
+	tfd.timer.Pause()
+}
+
+// ResumeTimer resumes the associated Timer.
+func (tfd *TimerFileDescription) ResumeTimer() {
+	tfd.timer.Resume()
+}
+
+// Release implements FileDescriptionImpl.Release()
+func (tfd *TimerFileDescription) Release() {
+	tfd.timer.Destroy()
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
+	atomic.AddUint64(&tfd.val, exp)
+	tfd.events.Notify(waiter.EventIn)
+	return ktime.Setting{}, false
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (tfd *TimerFileDescription) Destroy() {}
-- 
cgit v1.2.3


From dd98fdd5beb7f02e7c7b3aeb4f07f5d00ffc41e7 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 6 Apr 2020 16:31:27 -0700
Subject: Correctly implement magic symlinks in VFS2 procfs.

Updates #1195

PiperOrigin-RevId: 305143567
---
 pkg/sentry/fsbridge/vfs.go                  | 28 +++++++++++++---------
 pkg/sentry/fsimpl/kernfs/filesystem.go      | 36 +++++++++++++++++++----------
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go |  5 ++++
 pkg/sentry/fsimpl/kernfs/kernfs.go          | 16 ++++++++++++-
 pkg/sentry/fsimpl/kernfs/symlink.go         |  5 ++++
 pkg/sentry/fsimpl/proc/task_fds.go          |  6 +++++
 pkg/sentry/fsimpl/proc/task_files.go        | 17 ++++++++++++++
 pkg/sentry/fsimpl/proc/tasks_files.go       | 10 ++++++++
 8 files changed, 99 insertions(+), 24 deletions(-)

diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go
index 79b808359..89168220a 100644
--- a/pkg/sentry/fsbridge/vfs.go
+++ b/pkg/sentry/fsbridge/vfs.go
@@ -26,22 +26,22 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// fsFile implements File interface over vfs.FileDescription.
+// VFSFile implements File interface over vfs.FileDescription.
 //
 // +stateify savable
-type vfsFile struct {
+type VFSFile struct {
 	file *vfs.FileDescription
 }
 
-var _ File = (*vfsFile)(nil)
+var _ File = (*VFSFile)(nil)
 
 // NewVFSFile creates a new File over fs.File.
 func NewVFSFile(file *vfs.FileDescription) File {
-	return &vfsFile{file: file}
+	return &VFSFile{file: file}
 }
 
 // PathnameWithDeleted implements File.
-func (f *vfsFile) PathnameWithDeleted(ctx context.Context) string {
+func (f *VFSFile) PathnameWithDeleted(ctx context.Context) string {
 	root := vfs.RootFromContext(ctx)
 	defer root.DecRef()
 
@@ -51,7 +51,7 @@ func (f *vfsFile) PathnameWithDeleted(ctx context.Context) string {
 }
 
 // ReadFull implements File.
-func (f *vfsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
+func (f *VFSFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
 	var total int64
 	for dst.NumBytes() > 0 {
 		n, err := f.file.PRead(ctx, dst, offset+total, vfs.ReadOptions{})
@@ -67,12 +67,12 @@ func (f *vfsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset i
 }
 
 // ConfigureMMap implements File.
-func (f *vfsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+func (f *VFSFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
 	return f.file.ConfigureMMap(ctx, opts)
 }
 
 // Type implements File.
-func (f *vfsFile) Type(ctx context.Context) (linux.FileMode, error) {
+func (f *VFSFile) Type(ctx context.Context) (linux.FileMode, error) {
 	stat, err := f.file.Stat(ctx, vfs.StatOptions{})
 	if err != nil {
 		return 0, err
@@ -81,15 +81,21 @@ func (f *vfsFile) Type(ctx context.Context) (linux.FileMode, error) {
 }
 
 // IncRef implements File.
-func (f *vfsFile) IncRef() {
+func (f *VFSFile) IncRef() {
 	f.file.IncRef()
 }
 
 // DecRef implements File.
-func (f *vfsFile) DecRef() {
+func (f *VFSFile) DecRef() {
 	f.file.DecRef()
 }
 
+// FileDescription returns the FileDescription represented by f. It does not
+// take an additional reference on the returned FileDescription.
+func (f *VFSFile) FileDescription() *vfs.FileDescription {
+	return f.file
+}
+
 // fsLookup implements Lookup interface using fs.File.
 //
 // +stateify savable
@@ -132,5 +138,5 @@ func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.Open
 	if err != nil {
 		return nil, err
 	}
-	return &vfsFile{file: fd}, nil
+	return &VFSFile{file: fd}, nil
 }
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 89f5da3d4..16a3c18ae 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -79,16 +79,22 @@ afterSymlink:
 	}
 	// Resolve any symlink at current path component.
 	if rp.ShouldFollowSymlink() && next.isSymlink() {
-		// TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks".
-		target, err := next.inode.Readlink(ctx)
+		targetVD, targetPathname, err := next.inode.Getlink(ctx)
 		if err != nil {
 			return nil, err
 		}
-		if err := rp.HandleSymlink(target); err != nil {
-			return nil, err
+		if targetVD.Ok() {
+			err := rp.HandleJump(targetVD)
+			targetVD.DecRef()
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			if err := rp.HandleSymlink(targetPathname); err != nil {
+				return nil, err
+			}
 		}
 		goto afterSymlink
-
 	}
 	rp.Advance()
 	return &next.vfsd, nil
@@ -470,19 +476,25 @@ afterTrailingSymlink:
 	}
 	childDentry := childVFSD.Impl().(*Dentry)
 	childInode := childDentry.inode
-	if rp.ShouldFollowSymlink() {
-		if childDentry.isSymlink() {
-			target, err := childInode.Readlink(ctx)
+	if rp.ShouldFollowSymlink() && childDentry.isSymlink() {
+		targetVD, targetPathname, err := childInode.Getlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if targetVD.Ok() {
+			err := rp.HandleJump(targetVD)
+			targetVD.DecRef()
 			if err != nil {
 				return nil, err
 			}
-			if err := rp.HandleSymlink(target); err != nil {
+		} else {
+			if err := rp.HandleSymlink(targetPathname); err != nil {
 				return nil, err
 			}
-			// rp.Final() may no longer be true since we now need to resolve the
-			// symlink target.
-			goto afterTrailingSymlink
 		}
+		// rp.Final() may no longer be true since we now need to resolve the
+		// symlink target.
+		goto afterTrailingSymlink
 	}
 	if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 5c84b10c9..65f09af5d 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -181,6 +181,11 @@ func (InodeNotSymlink) Readlink(context.Context) (string, error) {
 	return "", syserror.EINVAL
 }
 
+// Getlink implements Inode.Getlink.
+func (InodeNotSymlink) Getlink(context.Context) (vfs.VirtualDentry, string, error) {
+	return vfs.VirtualDentry{}, "", syserror.EINVAL
+}
+
 // InodeAttrs partially implements the Inode interface, specifically the
 // inodeMetadata sub interface. InodeAttrs provides functionality related to
 // inode attributes.
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 2cefef020..ad76b9f64 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -414,7 +414,21 @@ type inodeDynamicLookup interface {
 }
 
 type inodeSymlink interface {
-	// Readlink resolves the target of a symbolic link. If an inode is not a
+	// Readlink returns the target of a symbolic link. If an inode is not a
 	// symlink, the implementation should return EINVAL.
 	Readlink(ctx context.Context) (string, error)
+
+	// Getlink returns the target of a symbolic link, as used by path
+	// resolution:
+	//
+	// - If the inode is a "magic link" (a link whose target is most accurately
+	// represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "",
+	// nil). A reference is taken on the returned VirtualDentry.
+	//
+	// - If the inode is an ordinary symlink, Getlink returns (zero-value
+	// VirtualDentry, symlink target, nil).
+	//
+	// - If the inode is not a symlink, Getlink returns (zero-value
+	// VirtualDentry, "", EINVAL).
+	Getlink(ctx context.Context) (vfs.VirtualDentry, string, error)
 }
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 5918d3309..018aa503c 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -55,6 +55,11 @@ func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
 	return s.target, nil
 }
 
+// Getlink implements Inode.Getlink.
+func (s *StaticSymlink) Getlink(_ context.Context) (vfs.VirtualDentry, string, error) {
+	return vfs.VirtualDentry{}, s.target, nil
+}
+
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
 func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 76bfc5307..9c8656b28 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -196,6 +196,12 @@ func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
 	return vfsObj.PathnameWithDeleted(ctx, root, s.file.VirtualDentry())
 }
 
+func (s *fdSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+	vd := s.file.VirtualDentry()
+	vd.IncRef()
+	return vd, "", nil
+}
+
 func (s *fdSymlink) DecRef() {
 	s.AtomicRefCount.DecRefWithDestructor(func() {
 		s.Destroy()
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index df0d1bcc5..88ea6a6d8 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -610,6 +610,23 @@ func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
 	return exec.PathnameWithDeleted(ctx), nil
 }
 
+// Getlink implements kernfs.Inode.Getlink.
+func (s *exeSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+	if !kernel.ContextCanTrace(ctx, s.task, false) {
+		return vfs.VirtualDentry{}, "", syserror.EACCES
+	}
+
+	exec, err := s.executable()
+	if err != nil {
+		return vfs.VirtualDentry{}, "", err
+	}
+	defer exec.DecRef()
+
+	vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
+	vd.IncRef()
+	return vd, "", nil
+}
+
 func (s *exeSymlink) executable() (file fsbridge.File, err error) {
 	s.task.WithMuLocked(func(t *kernel.Task) {
 		mm := t.MemoryManager()
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 882c1981e..4621e2de0 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -63,6 +63,11 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
 	return strconv.FormatUint(uint64(tgid), 10), nil
 }
 
+func (s *selfSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+	target, err := s.Readlink(ctx)
+	return vfs.VirtualDentry{}, target, err
+}
+
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
 func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
@@ -101,6 +106,11 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
 	return fmt.Sprintf("%d/task/%d", tgid, tid), nil
 }
 
+func (s *threadSelfSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+	target, err := s.Readlink(ctx)
+	return vfs.VirtualDentry{}, target, err
+}
+
 // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
 func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
-- 
cgit v1.2.3


From 32fc11ee3e39b7ef1152825090112f4b239887c4 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Mon, 6 Apr 2020 17:52:25 -0700
Subject: Sort posix service functions

PiperOrigin-RevId: 305157179
---
 test/packetimpact/dut/posix_server.cc      |  42 ++--
 test/packetimpact/proto/posix_server.proto |  74 +++----
 test/packetimpact/testbench/dut.go         | 299 +++++++++++++++--------------
 3 files changed, 210 insertions(+), 205 deletions(-)

diff --git a/test/packetimpact/dut/posix_server.cc b/test/packetimpact/dut/posix_server.cc
index 4a71c54c6..b8177f5b1 100644
--- a/test/packetimpact/dut/posix_server.cc
+++ b/test/packetimpact/dut/posix_server.cc
@@ -61,13 +61,15 @@
 }
 
 class PosixImpl final : public posix_server::Posix::Service {
-  ::grpc::Status Socket(grpc_impl::ServerContext *context,
-                        const ::posix_server::SocketRequest *request,
-                        ::posix_server::SocketResponse *response) override {
-    response->set_fd(
-        socket(request->domain(), request->type(), request->protocol()));
+  ::grpc::Status Accept(grpc_impl::ServerContext *context,
+                        const ::posix_server::AcceptRequest *request,
+                        ::posix_server::AcceptResponse *response) override {
+    sockaddr_storage addr;
+    socklen_t addrlen = sizeof(addr);
+    response->set_fd(accept(request->sockfd(),
+                            reinterpret_cast<sockaddr *>(&addr), &addrlen));
     response->set_errno_(errno);
-    return ::grpc::Status::OK;
+    return sockaddr_to_proto(addr, addrlen, response->mutable_addr());
   }
 
   ::grpc::Status Bind(grpc_impl::ServerContext *context,
@@ -119,6 +121,14 @@ class PosixImpl final : public posix_server::Posix::Service {
     return ::grpc::Status::OK;
   }
 
+  ::grpc::Status Close(grpc_impl::ServerContext *context,
+                       const ::posix_server::CloseRequest *request,
+                       ::posix_server::CloseResponse *response) override {
+    response->set_ret(close(request->fd()));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
   ::grpc::Status GetSockName(
       grpc_impl::ServerContext *context,
       const ::posix_server::GetSockNameRequest *request,
@@ -139,17 +149,6 @@ class PosixImpl final : public posix_server::Posix::Service {
     return ::grpc::Status::OK;
   }
 
-  ::grpc::Status Accept(grpc_impl::ServerContext *context,
-                        const ::posix_server::AcceptRequest *request,
-                        ::posix_server::AcceptResponse *response) override {
-    sockaddr_storage addr;
-    socklen_t addrlen = sizeof(addr);
-    response->set_fd(accept(request->sockfd(),
-                            reinterpret_cast<sockaddr *>(&addr), &addrlen));
-    response->set_errno_(errno);
-    return sockaddr_to_proto(addr, addrlen, response->mutable_addr());
-  }
-
   ::grpc::Status SetSockOpt(
       grpc_impl::ServerContext *context,
       const ::posix_server::SetSockOptRequest *request,
@@ -174,10 +173,11 @@ class PosixImpl final : public posix_server::Posix::Service {
     return ::grpc::Status::OK;
   }
 
-  ::grpc::Status Close(grpc_impl::ServerContext *context,
-                       const ::posix_server::CloseRequest *request,
-                       ::posix_server::CloseResponse *response) override {
-    response->set_ret(close(request->fd()));
+  ::grpc::Status Socket(grpc_impl::ServerContext *context,
+                        const ::posix_server::SocketRequest *request,
+                        ::posix_server::SocketResponse *response) override {
+    response->set_fd(
+        socket(request->domain(), request->type(), request->protocol()));
     response->set_errno_(errno);
     return ::grpc::Status::OK;
   }
diff --git a/test/packetimpact/proto/posix_server.proto b/test/packetimpact/proto/posix_server.proto
index 53ec49410..1565f31fa 100644
--- a/test/packetimpact/proto/posix_server.proto
+++ b/test/packetimpact/proto/posix_server.proto
@@ -16,17 +16,6 @@ syntax = "proto3";
 
 package posix_server;
 
-message SocketRequest {
-  int32 domain = 1;
-  int32 type = 2;
-  int32 protocol = 3;
-}
-
-message SocketResponse {
-  int32 fd = 1;
-  int32 errno_ = 2;  // "errno" may fail to compile in c++.
-}
-
 message SockaddrIn {
   int32 family = 1;
   uint32 port = 2;
@@ -48,6 +37,23 @@ message Sockaddr {
   }
 }
 
+message Timeval {
+  int64 seconds = 1;
+  int64 microseconds = 2;
+}
+
+// Request and Response pairs for each Posix service RPC call, sorted.
+
+message AcceptRequest {
+  int32 sockfd = 1;
+}
+
+message AcceptResponse {
+  int32 fd = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+  Sockaddr addr = 3;
+}
+
 message BindRequest {
   int32 sockfd = 1;
   Sockaddr addr = 2;
@@ -58,6 +64,15 @@ message BindResponse {
   int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
+message CloseRequest {
+  int32 fd = 1;
+}
+
+message CloseResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+}
+
 message GetSockNameRequest {
   int32 sockfd = 1;
 }
@@ -78,16 +93,6 @@ message ListenResponse {
   int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
-message AcceptRequest {
-  int32 sockfd = 1;
-}
-
-message AcceptResponse {
-  int32 fd = 1;
-  int32 errno_ = 2;  // "errno" may fail to compile in c++.
-  Sockaddr addr = 3;
-}
-
 message SetSockOptRequest {
   int32 sockfd = 1;
   int32 level = 2;
@@ -100,11 +105,6 @@ message SetSockOptResponse {
   int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
-message Timeval {
-  int64 seconds = 1;
-  int64 microseconds = 2;
-}
-
 message SetSockOptTimevalRequest {
   int32 sockfd = 1;
   int32 level = 2;
@@ -117,12 +117,14 @@ message SetSockOptTimevalResponse {
   int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
-message CloseRequest {
-  int32 fd = 1;
+message SocketRequest {
+  int32 domain = 1;
+  int32 type = 2;
+  int32 protocol = 3;
 }
 
-message CloseResponse {
-  int32 ret = 1;
+message SocketResponse {
+  int32 fd = 1;
   int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
@@ -139,16 +141,16 @@ message RecvResponse {
 }
 
 service Posix {
-  // Call socket() on the DUT.
-  rpc Socket(SocketRequest) returns (SocketResponse);
+  // Call accept() on the DUT.
+  rpc Accept(AcceptRequest) returns (AcceptResponse);
   // Call bind() on the DUT.
   rpc Bind(BindRequest) returns (BindResponse);
+  // Call close() on the DUT.
+  rpc Close(CloseRequest) returns (CloseResponse);
   // Call getsockname() on the DUT.
   rpc GetSockName(GetSockNameRequest) returns (GetSockNameResponse);
   // Call listen() on the DUT.
   rpc Listen(ListenRequest) returns (ListenResponse);
-  // Call accept() on the DUT.
-  rpc Accept(AcceptRequest) returns (AcceptResponse);
   // Call setsockopt() on the DUT.  You should prefer one of the other
   // SetSockOpt* functions with a more structured optval or else you may get the
   // encoding wrong, such as making a bad assumption about the server's word
@@ -157,8 +159,8 @@ service Posix {
   // Call setsockopt() on the DUT with a Timeval optval.
   rpc SetSockOptTimeval(SetSockOptTimevalRequest)
       returns (SetSockOptTimevalResponse);
-  // Call close() on the DUT.
-  rpc Close(CloseRequest) returns (CloseResponse);
+  // Call socket() on the DUT.
+  rpc Socket(SocketRequest) returns (SocketResponse);
   // Call recv() on the DUT.
   rpc Recv(RecvRequest) returns (RecvResponse);
 }
diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go
index d102dc7bb..f342aee01 100644
--- a/test/packetimpact/testbench/dut.go
+++ b/test/packetimpact/testbench/dut.go
@@ -65,33 +65,6 @@ func (dut *DUT) TearDown() {
 	dut.conn.Close()
 }
 
-// SocketWithErrno calls socket on the DUT and returns the fd and errno.
-func (dut *DUT) SocketWithErrno(domain, typ, proto int32) (int32, error) {
-	dut.t.Helper()
-	req := pb.SocketRequest{
-		Domain:   domain,
-		Type:     typ,
-		Protocol: proto,
-	}
-	ctx := context.Background()
-	resp, err := dut.posixServer.Socket(ctx, &req)
-	if err != nil {
-		dut.t.Fatalf("failed to call Socket: %s", err)
-	}
-	return resp.GetFd(), syscall.Errno(resp.GetErrno_())
-}
-
-// Socket calls socket on the DUT and returns the file descriptor. If socket
-// fails on the DUT, the test ends.
-func (dut *DUT) Socket(domain, typ, proto int32) int32 {
-	dut.t.Helper()
-	fd, err := dut.SocketWithErrno(domain, typ, proto)
-	if fd < 0 {
-		dut.t.Fatalf("failed to create socket: %s", err)
-	}
-	return fd
-}
-
 func (dut *DUT) sockaddrToProto(sa unix.Sockaddr) *pb.Sockaddr {
 	dut.t.Helper()
 	switch s := sa.(type) {
@@ -142,6 +115,88 @@ func (dut *DUT) protoToSockaddr(sa *pb.Sockaddr) unix.Sockaddr {
 	return nil
 }
 
+// CreateBoundSocket makes a new socket on the DUT, with type typ and protocol
+// proto, and bound to the IP address addr. Returns the new file descriptor and
+// the port that was selected on the DUT.
+func (dut *DUT) CreateBoundSocket(typ, proto int32, addr net.IP) (int32, uint16) {
+	dut.t.Helper()
+	var fd int32
+	if addr.To4() != nil {
+		fd = dut.Socket(unix.AF_INET, typ, proto)
+		sa := unix.SockaddrInet4{}
+		copy(sa.Addr[:], addr.To4())
+		dut.Bind(fd, &sa)
+	} else if addr.To16() != nil {
+		fd = dut.Socket(unix.AF_INET6, typ, proto)
+		sa := unix.SockaddrInet6{}
+		copy(sa.Addr[:], addr.To16())
+		dut.Bind(fd, &sa)
+	} else {
+		dut.t.Fatal("unknown ip addr type for remoteIP")
+	}
+	sa := dut.GetSockName(fd)
+	var port int
+	switch s := sa.(type) {
+	case *unix.SockaddrInet4:
+		port = s.Port
+	case *unix.SockaddrInet6:
+		port = s.Port
+	default:
+		dut.t.Fatalf("unknown sockaddr type from getsockname: %t", sa)
+	}
+	return fd, uint16(port)
+}
+
+// CreateListener makes a new TCP connection. If it fails, the test ends.
+func (dut *DUT) CreateListener(typ, proto, backlog int32) (int32, uint16) {
+	fd, remotePort := dut.CreateBoundSocket(typ, proto, net.ParseIP(*remoteIPv4))
+	dut.Listen(fd, backlog)
+	return fd, remotePort
+}
+
+// All the functions that make gRPC calls to the Posix service are below, sorted
+// alphabetically.
+
+// Accept calls accept on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is needed, use
+// AcceptWithErrno.
+func (dut *DUT) Accept(sockfd int32) (int32, unix.Sockaddr) {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	fd, sa, err := dut.AcceptWithErrno(ctx, sockfd)
+	if fd < 0 {
+		dut.t.Fatalf("failed to accept: %s", err)
+	}
+	return fd, sa
+}
+
+// AcceptWithErrno calls accept on the DUT.
+func (dut *DUT) AcceptWithErrno(ctx context.Context, sockfd int32) (int32, unix.Sockaddr, error) {
+	dut.t.Helper()
+	req := pb.AcceptRequest{
+		Sockfd: sockfd,
+	}
+	resp, err := dut.posixServer.Accept(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Accept: %s", err)
+	}
+	return resp.GetFd(), dut.protoToSockaddr(resp.GetAddr()), syscall.Errno(resp.GetErrno_())
+}
+
+// Bind calls bind on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is
+// needed, use BindWithErrno.
+func (dut *DUT) Bind(fd int32, sa unix.Sockaddr) {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	ret, err := dut.BindWithErrno(ctx, fd, sa)
+	if ret != 0 {
+		dut.t.Fatalf("failed to bind socket: %s", err)
+	}
+}
+
 // BindWithErrno calls bind on the DUT.
 func (dut *DUT) BindWithErrno(ctx context.Context, fd int32, sa unix.Sockaddr) (int32, error) {
 	dut.t.Helper()
@@ -156,30 +211,30 @@ func (dut *DUT) BindWithErrno(ctx context.Context, fd int32, sa unix.Sockaddr) (
 	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
 }
 
-// Bind calls bind on the DUT and causes a fatal test failure if it doesn't
-// succeed. If more control over the timeout or error handling is
-// needed, use BindWithErrno.
-func (dut *DUT) Bind(fd int32, sa unix.Sockaddr) {
+// Close calls close on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is needed, use
+// CloseWithErrno.
+func (dut *DUT) Close(fd int32) {
 	dut.t.Helper()
 	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
 	defer cancel()
-	ret, err := dut.BindWithErrno(ctx, fd, sa)
+	ret, err := dut.CloseWithErrno(ctx, fd)
 	if ret != 0 {
-		dut.t.Fatalf("failed to bind socket: %s", err)
+		dut.t.Fatalf("failed to close: %s", err)
 	}
 }
 
-// GetSockNameWithErrno calls getsockname on the DUT.
-func (dut *DUT) GetSockNameWithErrno(ctx context.Context, sockfd int32) (int32, unix.Sockaddr, error) {
+// CloseWithErrno calls close on the DUT.
+func (dut *DUT) CloseWithErrno(ctx context.Context, fd int32) (int32, error) {
 	dut.t.Helper()
-	req := pb.GetSockNameRequest{
-		Sockfd: sockfd,
+	req := pb.CloseRequest{
+		Fd: fd,
 	}
-	resp, err := dut.posixServer.GetSockName(ctx, &req)
+	resp, err := dut.posixServer.Close(ctx, &req)
 	if err != nil {
-		dut.t.Fatalf("failed to call Bind: %s", err)
+		dut.t.Fatalf("failed to call Close: %s", err)
 	}
-	return resp.GetRet(), dut.protoToSockaddr(resp.GetAddr()), syscall.Errno(resp.GetErrno_())
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
 }
 
 // GetSockName calls getsockname on the DUT and causes a fatal test failure if
@@ -196,18 +251,17 @@ func (dut *DUT) GetSockName(sockfd int32) unix.Sockaddr {
 	return sa
 }
 
-// ListenWithErrno calls listen on the DUT.
-func (dut *DUT) ListenWithErrno(ctx context.Context, sockfd, backlog int32) (int32, error) {
+// GetSockNameWithErrno calls getsockname on the DUT.
+func (dut *DUT) GetSockNameWithErrno(ctx context.Context, sockfd int32) (int32, unix.Sockaddr, error) {
 	dut.t.Helper()
-	req := pb.ListenRequest{
-		Sockfd:  sockfd,
-		Backlog: backlog,
+	req := pb.GetSockNameRequest{
+		Sockfd: sockfd,
 	}
-	resp, err := dut.posixServer.Listen(ctx, &req)
+	resp, err := dut.posixServer.GetSockName(ctx, &req)
 	if err != nil {
-		dut.t.Fatalf("failed to call Listen: %s", err)
+		dut.t.Fatalf("failed to call Bind: %s", err)
 	}
-	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+	return resp.GetRet(), dut.protoToSockaddr(resp.GetAddr()), syscall.Errno(resp.GetErrno_())
 }
 
 // Listen calls listen on the DUT and causes a fatal test failure if it doesn't
@@ -223,31 +277,33 @@ func (dut *DUT) Listen(sockfd, backlog int32) {
 	}
 }
 
-// AcceptWithErrno calls accept on the DUT.
-func (dut *DUT) AcceptWithErrno(ctx context.Context, sockfd int32) (int32, unix.Sockaddr, error) {
+// ListenWithErrno calls listen on the DUT.
+func (dut *DUT) ListenWithErrno(ctx context.Context, sockfd, backlog int32) (int32, error) {
 	dut.t.Helper()
-	req := pb.AcceptRequest{
-		Sockfd: sockfd,
+	req := pb.ListenRequest{
+		Sockfd:  sockfd,
+		Backlog: backlog,
 	}
-	resp, err := dut.posixServer.Accept(ctx, &req)
+	resp, err := dut.posixServer.Listen(ctx, &req)
 	if err != nil {
-		dut.t.Fatalf("failed to call Accept: %s", err)
+		dut.t.Fatalf("failed to call Listen: %s", err)
 	}
-	return resp.GetFd(), dut.protoToSockaddr(resp.GetAddr()), syscall.Errno(resp.GetErrno_())
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
 }
 
-// Accept calls accept on the DUT and causes a fatal test failure if it doesn't
-// succeed. If more control over the timeout or error handling is needed, use
-// AcceptWithErrno.
-func (dut *DUT) Accept(sockfd int32) (int32, unix.Sockaddr) {
+// SetSockOpt calls setsockopt on the DUT and causes a fatal test failure if it
+// doesn't succeed. If more control over the timeout or error handling is
+// needed, use SetSockOptWithErrno. Because endianess and the width of values
+// might differ between the testbench and DUT architectures, prefer to use a
+// more specific SetSockOptXxx function.
+func (dut *DUT) SetSockOpt(sockfd, level, optname int32, optval []byte) {
 	dut.t.Helper()
 	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
 	defer cancel()
-	fd, sa, err := dut.AcceptWithErrno(ctx, sockfd)
-	if fd < 0 {
-		dut.t.Fatalf("failed to accept: %s", err)
+	ret, err := dut.SetSockOptWithErrno(ctx, sockfd, level, optname, optval)
+	if ret != 0 {
+		dut.t.Fatalf("failed to SetSockOpt: %s", err)
 	}
-	return fd, sa
 }
 
 // SetSockOptWithErrno calls setsockopt on the DUT. Because endianess and the
@@ -268,18 +324,16 @@ func (dut *DUT) SetSockOptWithErrno(ctx context.Context, sockfd, level, optname
 	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
 }
 
-// SetSockOpt calls setsockopt on the DUT and causes a fatal test failure if it
-// doesn't succeed. If more control over the timeout or error handling is
-// needed, use SetSockOptWithErrno. Because endianess and the width of values
-// might differ between the testbench and DUT architectures, prefer to use a
-// more specific SetSockOptXxx function.
-func (dut *DUT) SetSockOpt(sockfd, level, optname int32, optval []byte) {
+// SetSockOptTimeval calls setsockopt on the DUT and causes a fatal test failure
+// if it doesn't succeed. If more control over the timeout or error handling is
+// needed, use SetSockOptTimevalWithErrno.
+func (dut *DUT) SetSockOptTimeval(sockfd, level, optname int32, tv *unix.Timeval) {
 	dut.t.Helper()
 	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
 	defer cancel()
-	ret, err := dut.SetSockOptWithErrno(ctx, sockfd, level, optname, optval)
+	ret, err := dut.SetSockOptTimevalWithErrno(ctx, sockfd, level, optname, tv)
 	if ret != 0 {
-		dut.t.Fatalf("failed to SetSockOpt: %s", err)
+		dut.t.Fatalf("failed to SetSockOptTimeval: %s", err)
 	}
 }
 
@@ -304,32 +358,31 @@ func (dut *DUT) SetSockOptTimevalWithErrno(ctx context.Context, sockfd, level, o
 	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
 }
 
-// SetSockOptTimeval calls setsockopt on the DUT and causes a fatal test failure
-// if it doesn't succeed. If more control over the timeout or error handling is
-// needed, use SetSockOptTimevalWithErrno.
-func (dut *DUT) SetSockOptTimeval(sockfd, level, optname int32, tv *unix.Timeval) {
+// Socket calls socket on the DUT and returns the file descriptor. If socket
+// fails on the DUT, the test ends.
+func (dut *DUT) Socket(domain, typ, proto int32) int32 {
 	dut.t.Helper()
-	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
-	defer cancel()
-	ret, err := dut.SetSockOptTimevalWithErrno(ctx, sockfd, level, optname, tv)
-	if ret != 0 {
-		dut.t.Fatalf("failed to SetSockOptTimeval: %s", err)
+	fd, err := dut.SocketWithErrno(domain, typ, proto)
+	if fd < 0 {
+		dut.t.Fatalf("failed to create socket: %s", err)
 	}
+	return fd
 }
 
-// RecvWithErrno calls recv on the DUT.
-func (dut *DUT) RecvWithErrno(ctx context.Context, sockfd, len, flags int32) (int32, []byte, error) {
+// SocketWithErrno calls socket on the DUT and returns the fd and errno.
+func (dut *DUT) SocketWithErrno(domain, typ, proto int32) (int32, error) {
 	dut.t.Helper()
-	req := pb.RecvRequest{
-		Sockfd: sockfd,
-		Len:    len,
-		Flags:  flags,
+	req := pb.SocketRequest{
+		Domain:   domain,
+		Type:     typ,
+		Protocol: proto,
 	}
-	resp, err := dut.posixServer.Recv(ctx, &req)
+	ctx := context.Background()
+	resp, err := dut.posixServer.Socket(ctx, &req)
 	if err != nil {
-		dut.t.Fatalf("failed to call Recv: %s", err)
+		dut.t.Fatalf("failed to call Socket: %s", err)
 	}
-	return resp.GetRet(), resp.GetBuf(), syscall.Errno(resp.GetErrno_())
+	return resp.GetFd(), syscall.Errno(resp.GetErrno_())
 }
 
 // Recv calls recv on the DUT and causes a fatal test failure if it doesn't
@@ -346,67 +399,17 @@ func (dut *DUT) Recv(sockfd, len, flags int32) []byte {
 	return buf
 }
 
-// CloseWithErrno calls close on the DUT.
-func (dut *DUT) CloseWithErrno(ctx context.Context, fd int32) (int32, error) {
+// RecvWithErrno calls recv on the DUT.
+func (dut *DUT) RecvWithErrno(ctx context.Context, sockfd, len, flags int32) (int32, []byte, error) {
 	dut.t.Helper()
-	req := pb.CloseRequest{
-		Fd: fd,
+	req := pb.RecvRequest{
+		Sockfd: sockfd,
+		Len:    len,
+		Flags:  flags,
 	}
-	resp, err := dut.posixServer.Close(ctx, &req)
+	resp, err := dut.posixServer.Recv(ctx, &req)
 	if err != nil {
-		dut.t.Fatalf("failed to call Close: %s", err)
-	}
-	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
-}
-
-// Close calls close on the DUT and causes a fatal test failure if it doesn't
-// succeed. If more control over the timeout or error handling is needed, use
-// CloseWithErrno.
-func (dut *DUT) Close(fd int32) {
-	dut.t.Helper()
-	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
-	defer cancel()
-	ret, err := dut.CloseWithErrno(ctx, fd)
-	if ret != 0 {
-		dut.t.Fatalf("failed to close: %s", err)
-	}
-}
-
-// CreateBoundSocket makes a new socket on the DUT, with type typ and protocol
-// proto, and bound to the IP address addr. Returns the new file descriptor and
-// the port that was selected on the DUT.
-func (dut *DUT) CreateBoundSocket(typ, proto int32, addr net.IP) (int32, uint16) {
-	dut.t.Helper()
-	var fd int32
-	if addr.To4() != nil {
-		fd = dut.Socket(unix.AF_INET, typ, proto)
-		sa := unix.SockaddrInet4{}
-		copy(sa.Addr[:], addr.To4())
-		dut.Bind(fd, &sa)
-	} else if addr.To16() != nil {
-		fd = dut.Socket(unix.AF_INET6, typ, proto)
-		sa := unix.SockaddrInet6{}
-		copy(sa.Addr[:], addr.To16())
-		dut.Bind(fd, &sa)
-	} else {
-		dut.t.Fatal("unknown ip addr type for remoteIP")
-	}
-	sa := dut.GetSockName(fd)
-	var port int
-	switch s := sa.(type) {
-	case *unix.SockaddrInet4:
-		port = s.Port
-	case *unix.SockaddrInet6:
-		port = s.Port
-	default:
-		dut.t.Fatalf("unknown sockaddr type from getsockname: %t", sa)
+		dut.t.Fatalf("failed to call Recv: %s", err)
 	}
-	return fd, uint16(port)
-}
-
-// CreateListener makes a new TCP connection. If it fails, the test ends.
-func (dut *DUT) CreateListener(typ, proto, backlog int32) (int32, uint16) {
-	fd, remotePort := dut.CreateBoundSocket(typ, proto, net.ParseIP(*remoteIPv4))
-	dut.Listen(fd, backlog)
-	return fd, remotePort
+	return resp.GetRet(), resp.GetBuf(), syscall.Errno(resp.GetErrno_())
 }
-- 
cgit v1.2.3


From 51e461cf9c49f6ad5a9a68d93c5928647aae11d8 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 6 Apr 2020 20:07:32 -0700
Subject: Add concurrency guarantees to p9 extended attribute methods.

PiperOrigin-RevId: 305171772
---
 pkg/p9/file.go     |  8 ++++----
 pkg/p9/handlers.go | 39 +++++++++++++++++++++++++++++++++------
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/pkg/p9/file.go b/pkg/p9/file.go
index d4ffbc8e3..cab35896f 100644
--- a/pkg/p9/file.go
+++ b/pkg/p9/file.go
@@ -97,12 +97,12 @@ type File interface {
 	// free to ignore the hint entirely (i.e. the value returned may be larger
 	// than size). All size checking is done independently at the syscall layer.
 	//
-	// TODO(b/127675828): Determine concurrency guarantees once implemented.
+	// On the server, GetXattr has a read concurrency guarantee.
 	GetXattr(name string, size uint64) (string, error)
 
 	// SetXattr sets extended attributes on this node.
 	//
-	// TODO(b/127675828): Determine concurrency guarantees once implemented.
+	// On the server, SetXattr has a write concurrency guarantee.
 	SetXattr(name, value string, flags uint32) error
 
 	// ListXattr lists the names of the extended attributes on this node.
@@ -113,12 +113,12 @@ type File interface {
 	// free to ignore the hint entirely (i.e. the value returned may be larger
 	// than size). All size checking is done independently at the syscall layer.
 	//
-	// TODO(b/148303075): Determine concurrency guarantees once implemented.
+	// On the server, ListXattr has a read concurrency guarantee.
 	ListXattr(size uint64) (map[string]struct{}, error)
 
 	// RemoveXattr removes extended attributes on this node.
 	//
-	// TODO(b/148303075): Determine concurrency guarantees once implemented.
+	// On the server, RemoveXattr has a write concurrency guarantee.
 	RemoveXattr(name string) error
 
 	// Allocate allows the caller to directly manipulate the allocated disk space
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index 2ac45eb80..a8b714cf5 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -920,8 +920,15 @@ func (t *Tgetxattr) handle(cs *connState) message {
 	}
 	defer ref.DecRef()
 
-	val, err := ref.file.GetXattr(t.Name, t.Size)
-	if err != nil {
+	var val string
+	if err := ref.safelyRead(func() (err error) {
+		// Don't allow getxattr on files that have been deleted.
+		if ref.isDeleted() {
+			return syscall.EINVAL
+		}
+		val, err = ref.file.GetXattr(t.Name, t.Size)
+		return err
+	}); err != nil {
 		return newErr(err)
 	}
 	return &Rgetxattr{Value: val}
@@ -935,7 +942,13 @@ func (t *Tsetxattr) handle(cs *connState) message {
 	}
 	defer ref.DecRef()
 
-	if err := ref.file.SetXattr(t.Name, t.Value, t.Flags); err != nil {
+	if err := ref.safelyWrite(func() error {
+		// Don't allow setxattr on files that have been deleted.
+		if ref.isDeleted() {
+			return syscall.EINVAL
+		}
+		return ref.file.SetXattr(t.Name, t.Value, t.Flags)
+	}); err != nil {
 		return newErr(err)
 	}
 	return &Rsetxattr{}
@@ -949,10 +962,18 @@ func (t *Tlistxattr) handle(cs *connState) message {
 	}
 	defer ref.DecRef()
 
-	xattrs, err := ref.file.ListXattr(t.Size)
-	if err != nil {
+	var xattrs map[string]struct{}
+	if err := ref.safelyRead(func() (err error) {
+		// Don't allow listxattr on files that have been deleted.
+		if ref.isDeleted() {
+			return syscall.EINVAL
+		}
+		xattrs, err = ref.file.ListXattr(t.Size)
+		return err
+	}); err != nil {
 		return newErr(err)
 	}
+
 	xattrList := make([]string, 0, len(xattrs))
 	for x := range xattrs {
 		xattrList = append(xattrList, x)
@@ -968,7 +989,13 @@ func (t *Tremovexattr) handle(cs *connState) message {
 	}
 	defer ref.DecRef()
 
-	if err := ref.file.RemoveXattr(t.Name); err != nil {
+	if err := ref.safelyWrite(func() error {
+		// Don't allow removexattr on files that have been deleted.
+		if ref.isDeleted() {
+			return syscall.EINVAL
+		}
+		return ref.file.RemoveXattr(t.Name)
+	}); err != nil {
 		return newErr(err)
 	}
 	return &Rremovexattr{}
-- 
cgit v1.2.3


From 94319a8241cb299edc812024d6132b7a3819a4dc Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 7 Apr 2020 09:40:38 -0700
Subject: Make gofer.dentry.destroyLocked idempotent

gofer operations accumulate dentries touched in a slice to call
checkCachingLocked on them when the operation is over. In case
the same dentry is touched multiple times during the operation,
checkCachingLocked, and consequently destroyLocked, may be called
more than once for the same dentry.

Updates #1198

PiperOrigin-RevId: 305276819
---
 pkg/sentry/fsimpl/gofer/BUILD         | 12 ++++++-
 pkg/sentry/fsimpl/gofer/gofer.go      | 36 +++++++++++++++++---
 pkg/sentry/fsimpl/gofer/gofer_test.go | 64 +++++++++++++++++++++++++++++++++++
 test/syscalls/linux/open.cc           | 22 ++++++++++++
 4 files changed, 129 insertions(+), 5 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/gofer/gofer_test.go

diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index d15a36709..99d1e3f8f 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
@@ -54,3 +54,13 @@ go_library(
         "//pkg/usermem",
     ],
 )
+
+go_test(
+    name = "gofer_test",
+    srcs = ["gofer_test.go"],
+    library = ":gofer",
+    deps = [
+        "//pkg/p9",
+        "//pkg/sentry/contexttest",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index adee8bb60..20edaf643 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -444,7 +444,8 @@ type dentry struct {
 
 	// refs is the reference count. Each dentry holds a reference on its
 	// parent, even if disowned. refs is accessed using atomic memory
-	// operations.
+	// operations. When refs reaches 0, the dentry may be added to the cache or
+	// destroyed. If refs==-1 the dentry has already been destroyed.
 	refs int64
 
 	// fs is the owning filesystem. fs is immutable.
@@ -860,7 +861,7 @@ func (d *dentry) IncRef() {
 func (d *dentry) TryIncRef() bool {
 	for {
 		refs := atomic.LoadInt64(&d.refs)
-		if refs == 0 {
+		if refs <= 0 {
 			return false
 		}
 		if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
@@ -883,13 +884,20 @@ func (d *dentry) DecRef() {
 // checkCachingLocked should be called after d's reference count becomes 0 or it
 // becomes disowned.
 //
+// It may be called on a destroyed dentry. For example,
+// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times
+// for the same dentry when the dentry is visited more than once in the same
+// operation. One of the calls may destroy the dentry, so subsequent calls will
+// do nothing.
+//
 // Preconditions: d.fs.renameMu must be locked for writing.
 func (d *dentry) checkCachingLocked() {
 	// Dentries with a non-zero reference count must be retained. (The only way
 	// to obtain a reference on a dentry with zero references is via path
 	// resolution, which requires renameMu, so if d.refs is zero then it will
 	// remain zero while we hold renameMu for writing.)
-	if atomic.LoadInt64(&d.refs) != 0 {
+	refs := atomic.LoadInt64(&d.refs)
+	if refs > 0 {
 		if d.cached {
 			d.fs.cachedDentries.Remove(d)
 			d.fs.cachedDentriesLen--
@@ -897,6 +905,10 @@ func (d *dentry) checkCachingLocked() {
 		}
 		return
 	}
+	if refs == -1 {
+		// Dentry has already been destroyed.
+		return
+	}
 	// Non-child dentries with zero references are no longer reachable by path
 	// resolution and should be dropped immediately.
 	if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() {
@@ -949,9 +961,22 @@ func (d *dentry) checkCachingLocked() {
 	}
 }
 
+// destroyLocked destroys the dentry. It may flushes dirty pages from cache,
+// close p9 file and remove reference on parent dentry.
+//
 // Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is
 // not a child dentry.
 func (d *dentry) destroyLocked() {
+	switch atomic.LoadInt64(&d.refs) {
+	case 0:
+		// Mark the dentry destroyed.
+		atomic.StoreInt64(&d.refs, -1)
+	case -1:
+		panic("dentry.destroyLocked() called on already destroyed dentry")
+	default:
+		panic("dentry.destroyLocked() called with references on the dentry")
+	}
+
 	ctx := context.Background()
 	d.handleMu.Lock()
 	if !d.handle.file.isNil() {
@@ -971,7 +996,10 @@ func (d *dentry) destroyLocked() {
 		d.handle.close(ctx)
 	}
 	d.handleMu.Unlock()
-	d.file.close(ctx)
+	if !d.file.isNil() {
+		d.file.close(ctx)
+		d.file = p9file{}
+	}
 	// Remove d from the set of all dentries.
 	d.fs.syncMu.Lock()
 	delete(d.fs.dentries, d)
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
new file mode 100644
index 000000000..82bc239db
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -0,0 +1,64 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync/atomic"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+)
+
+func TestDestroyIdempotent(t *testing.T) {
+	fs := filesystem{
+		dentries: make(map[*dentry]struct{}),
+		opts: filesystemOptions{
+			// Test relies on no dentry being held in the cache.
+			maxCachedDentries: 0,
+		},
+	}
+
+	ctx := contexttest.Context(t)
+	attr := &p9.Attr{
+		Mode: p9.ModeRegular,
+	}
+	mask := p9.AttrMask{
+		Mode: true,
+		Size: true,
+	}
+	parent, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr)
+	if err != nil {
+		t.Fatalf("fs.newDentry(): %v", err)
+	}
+
+	child, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr)
+	if err != nil {
+		t.Fatalf("fs.newDentry(): %v", err)
+	}
+	parent.IncRef() // reference held by child on its parent.
+	parent.vfsd.InsertChild(&child.vfsd, "child")
+
+	child.checkCachingLocked()
+	if got := atomic.LoadInt64(&child.refs); got != -1 {
+		t.Fatalf("child.refs=%d, want: -1", got)
+	}
+	// Parent will also be destroyed when child reference is removed.
+	if got := atomic.LoadInt64(&parent.refs); got != -1 {
+		t.Fatalf("parent.refs=%d, want: -1", got)
+	}
+	child.checkCachingLocked()
+	child.checkCachingLocked()
+}
diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc
index 267ae19f6..640fe6bfc 100644
--- a/test/syscalls/linux/open.cc
+++ b/test/syscalls/linux/open.cc
@@ -186,6 +186,28 @@ TEST_F(OpenTest, OpenNoFollowStillFollowsLinksInPath) {
       ASSERT_NO_ERRNO_AND_VALUE(Open(path_via_symlink, O_RDONLY | O_NOFOLLOW));
 }
 
+// Test that open(2) can follow symlinks that point back to the same tree.
+// Test sets up files as follows:
+//   root/child/symlink => redirects to ../..
+//   root/child/target => regular file
+//
+// open("root/child/symlink/root/child/file")
+TEST_F(OpenTest, SymlinkRecurse) {
+  auto root =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(GetAbsoluteTestTmpdir()));
+  auto child = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path()));
+  auto symlink = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(child.path(), "../.."));
+  auto target = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateFileWith(child.path(), "abc", 0644));
+  auto path_via_symlink =
+      JoinPath(symlink.path(), Basename(root.path()), Basename(child.path()),
+               Basename(target.path()));
+  const auto contents =
+      ASSERT_NO_ERRNO_AND_VALUE(GetContents(path_via_symlink));
+  ASSERT_EQ(contents, "abc");
+}
+
 TEST_F(OpenTest, Fault) {
   char* totally_not_null = nullptr;
   ASSERT_THAT(open(totally_not_null, O_RDONLY), SyscallFailsWithErrno(EFAULT));
-- 
cgit v1.2.3


From 71770e56629339c9853466e994b78b172bc668a9 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Tue, 7 Apr 2020 13:27:26 -0700
Subject: mkdir test: Address TODOs and re-enable a test.

PiperOrigin-RevId: 305328184
---
 test/syscalls/linux/mkdir.cc | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc
index def4c50a4..4036a9275 100644
--- a/test/syscalls/linux/mkdir.cc
+++ b/test/syscalls/linux/mkdir.cc
@@ -36,21 +36,12 @@ class MkdirTest : public ::testing::Test {
 
   // TearDown unlinks created files.
   void TearDown() override {
-    // FIXME(edahlgren): We don't currently implement rmdir.
-    // We do this unconditionally because there's no harm in trying.
-    rmdir(dirname_.c_str());
+    EXPECT_THAT(rmdir(dirname_.c_str()), SyscallSucceeds());
   }
 
   std::string dirname_;
 };
 
-TEST_F(MkdirTest, DISABLED_CanCreateReadbleDir) {
-  ASSERT_THAT(mkdir(dirname_.c_str(), 0444), SyscallSucceeds());
-  ASSERT_THAT(
-      open(JoinPath(dirname_, "anything").c_str(), O_RDWR | O_CREAT, 0666),
-      SyscallFailsWithErrno(EACCES));
-}
-
 TEST_F(MkdirTest, CanCreateWritableDir) {
   ASSERT_THAT(mkdir(dirname_.c_str(), 0777), SyscallSucceeds());
   std::string filename = JoinPath(dirname_, "anything");
@@ -84,10 +75,11 @@ TEST_F(MkdirTest, FailsOnDirWithoutWritePerms) {
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
 
-  auto parent = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0555));
-  auto dir = JoinPath(parent.path(), "foo");
-  ASSERT_THAT(mkdir(dir.c_str(), 0777), SyscallFailsWithErrno(EACCES));
+  ASSERT_THAT(mkdir(dirname_.c_str(), 0555), SyscallSucceeds());
+  auto dir = JoinPath(dirname_.c_str(), "foo");
+  EXPECT_THAT(mkdir(dir.c_str(), 0777), SyscallFailsWithErrno(EACCES));
+  EXPECT_THAT(open(JoinPath(dirname_, "file").c_str(), O_RDWR | O_CREAT, 0666),
+              SyscallFailsWithErrno(EACCES));
 }
 
 }  // namespace
-- 
cgit v1.2.3


From 6db55a5bd8933b217d285018ed2187812ebae6ef Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 7 Apr 2020 13:35:58 -0700
Subject: Require that IPv6 headers be in the first fragment

Test:
- header_test.TestIPv6ExtHdrIter
- ipv6_test.TestReceiveIPv6Fragments

Updates #2197, #2333

PiperOrigin-RevId: 305330178
---
 pkg/tcpip/header/ipv6_extension_headers.go      | 37 +++++++++++-------
 pkg/tcpip/header/ipv6_extension_headers_test.go | 41 ++++++++++++++++++--
 pkg/tcpip/network/ipv6/ipv6.go                  | 50 ++++++++++++++++++++++++-
 pkg/tcpip/network/ipv6/ipv6_test.go             |  2 +-
 4 files changed, 111 insertions(+), 19 deletions(-)

diff --git a/pkg/tcpip/header/ipv6_extension_headers.go b/pkg/tcpip/header/ipv6_extension_headers.go
index 82485ed6a..2c4591409 100644
--- a/pkg/tcpip/header/ipv6_extension_headers.go
+++ b/pkg/tcpip/header/ipv6_extension_headers.go
@@ -395,17 +395,24 @@ func MakeIPv6PayloadIterator(nextHdrIdentifier IPv6ExtensionHeaderIdentifier, pa
 }
 
 // AsRawHeader returns the remaining payload of i as a raw header and
-// completes the iterator.
+// optionally consumes the iterator.
 //
-// Calls to Next after calling AsRawHeader on i will indicate that the
-// iterator is done.
-func (i *IPv6PayloadIterator) AsRawHeader() IPv6RawPayloadHeader {
-	buf := i.payload
+// If consume is true, calls to Next after calling AsRawHeader on i will
+// indicate that the iterator is done.
+func (i *IPv6PayloadIterator) AsRawHeader(consume bool) IPv6RawPayloadHeader {
 	identifier := i.nextHdrIdentifier
 
-	// Mark i as done.
-	*i = IPv6PayloadIterator{
-		nextHdrIdentifier: IPv6NoNextHeaderIdentifier,
+	var buf buffer.VectorisedView
+	if consume {
+		// Since we consume the iterator, we return the payload as is.
+		buf = i.payload
+
+		// Mark i as done.
+		*i = IPv6PayloadIterator{
+			nextHdrIdentifier: IPv6NoNextHeaderIdentifier,
+		}
+	} else {
+		buf = i.payload.Clone(nil)
 	}
 
 	return IPv6RawPayloadHeader{Identifier: identifier, Buf: buf}
@@ -424,7 +431,7 @@ func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) {
 	// a fragment extension header as the data following the fragment extension
 	// header may not be complete.
 	if i.forceRaw {
-		return i.AsRawHeader(), false, nil
+		return i.AsRawHeader(true /* consume */), false, nil
 	}
 
 	// Is the header we are parsing a known extension header?
@@ -456,10 +463,12 @@ func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) {
 
 		fragmentExtHdr := IPv6FragmentExtHdr(data)
 
-		// If the packet is a fragmented packet, do not attempt to parse
-		// anything after the fragment extension header as the data following
-		// the extension header may not be complete.
-		if fragmentExtHdr.More() || fragmentExtHdr.FragmentOffset() != 0 {
+		// If the packet is not the first fragment, do not attempt to parse anything
+		// after the fragment extension header as the payload following the fragment
+		// extension header should not contain any headers; the first fragment must
+		// hold all the headers up to and including any upper layer headers, as per
+		// RFC 8200 section 4.5.
+		if fragmentExtHdr.FragmentOffset() != 0 {
 			i.forceRaw = true
 		}
 
@@ -480,7 +489,7 @@ func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) {
 	default:
 		// The header we are parsing is not a known extension header. Return the
 		// raw payload.
-		return i.AsRawHeader(), false, nil
+		return i.AsRawHeader(true /* consume */), false, nil
 	}
 }
 
diff --git a/pkg/tcpip/header/ipv6_extension_headers_test.go b/pkg/tcpip/header/ipv6_extension_headers_test.go
index 133ccc8b6..ab20c5f37 100644
--- a/pkg/tcpip/header/ipv6_extension_headers_test.go
+++ b/pkg/tcpip/header/ipv6_extension_headers_test.go
@@ -673,19 +673,26 @@ func TestIPv6ExtHdrIter(t *testing.T) {
 		payload      buffer.VectorisedView
 		expected     []IPv6PayloadHeader
 	}{
-		// With a non-atomic fragment, the payload after the fragment will not be
-		// parsed because the payload may not be complete.
+		// With a non-atomic fragment that is not the first fragment, the payload
+		// after the fragment will not be parsed because the payload is expected to
+		// only hold upper layer data.
 		{
-			name:         "hopbyhop - fragment - routing - upper",
+			name:         "hopbyhop - fragment (not first) - routing - upper",
 			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
 			payload: makeVectorisedViewFromByteBuffers([]byte{
 				// Hop By Hop extension header.
 				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
 
 				// Fragment extension header.
+				//
+				// More = 1, Fragment Offset = 2117, ID = 2147746305
 				uint8(IPv6RoutingExtHdrIdentifier), 0, 68, 9, 128, 4, 2, 1,
 
 				// Routing extension header.
+				//
+				// Even though we have a routing ext header here, it should be
+				// be interpretted as raw bytes as only the first fragment is expected
+				// to hold headers.
 				255, 0, 1, 2, 3, 4, 5, 6,
 
 				// Upper layer data.
@@ -700,6 +707,34 @@ func TestIPv6ExtHdrIter(t *testing.T) {
 				},
 			},
 		},
+		{
+			name:         "hopbyhop - fragment (first) - routing - upper",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// Fragment extension header.
+				//
+				// More = 1, Fragment Offset = 0, ID = 2147746305
+				uint8(IPv6RoutingExtHdrIdentifier), 0, 0, 1, 128, 4, 2, 1,
+
+				// Routing extension header.
+				255, 0, 1, 2, 3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
+				IPv6FragmentExtHdr([6]byte{0, 1, 128, 4, 2, 1}),
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6RawPayloadHeader{
+					Identifier: 255,
+					Buf:        upperLayerData.ToVectorisedView(),
+				},
+			},
+		},
 		{
 			name:         "fragment - routing - upper (across views)",
 			firstNextHdr: IPv6FragmentExtHdrIdentifier,
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index a815b4d9b..331b0817b 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -270,7 +270,55 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 				continue
 			}
 
-			rawPayload := it.AsRawHeader()
+			// Don't consume the iterator if we have the first fragment because we
+			// will use it to validate that the first fragment holds the upper layer
+			// header.
+			rawPayload := it.AsRawHeader(fragmentOffset != 0 /* consume */)
+
+			if fragmentOffset == 0 {
+				// Check that the iterator ends with a raw payload as the first fragment
+				// should include all headers up to and including any upper layer
+				// headers, as per RFC 8200 section 4.5; only upper layer data
+				// (non-headers) should follow the fragment extension header.
+				var lastHdr header.IPv6PayloadHeader
+
+				for {
+					it, done, err := it.Next()
+					if err != nil {
+						r.Stats().IP.MalformedPacketsReceived.Increment()
+						r.Stats().IP.MalformedPacketsReceived.Increment()
+						return
+					}
+					if done {
+						break
+					}
+
+					lastHdr = it
+				}
+
+				// If the last header is a raw header, then the last portion of the IPv6
+				// payload is not a known IPv6 extension header. Note, this does not
+				// mean that the last portion is an upper layer header or not an
+				// extension header because:
+				//  1) we do not yet support all extension headers
+				//  2) we do not validate the upper layer header before reassembling.
+				//
+				// This check makes sure that a known IPv6 extension header is not
+				// present after the Fragment extension header in a non-initial
+				// fragment.
+				//
+				// TODO(#2196): Support IPv6 Authentication and Encapsulated
+				// Security Payload extension headers.
+				// TODO(#2333): Validate that the upper layer header is valid.
+				switch lastHdr.(type) {
+				case header.IPv6RawPayloadHeader:
+				default:
+					r.Stats().IP.MalformedPacketsReceived.Increment()
+					r.Stats().IP.MalformedFragmentsReceived.Increment()
+					return
+				}
+			}
+
 			fragmentPayloadLen := rawPayload.Buf.Size()
 			if fragmentPayloadLen == 0 {
 				// Drop the packet as it's marked as a fragment but has no payload.
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index 37f7e53ce..95e5dbf8e 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -1014,7 +1014,7 @@ func TestReceiveIPv6Fragments(t *testing.T) {
 					),
 				},
 			},
-			expectedPayloads: [][]byte{udpPayload1},
+			expectedPayloads: nil,
 		},
 		{
 			name: "Two fragments with routing header with non-zero segments left across fragments",
-- 
cgit v1.2.3


From 47db097773f9c0badb9f7b5866e697a7e7d0da13 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 7 Apr 2020 14:28:23 -0700
Subject: Internal change.

PiperOrigin-RevId: 305341059
---
 test/packetimpact/testbench/rawsockets.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/packetimpact/testbench/rawsockets.go b/test/packetimpact/testbench/rawsockets.go
index 0c7d0f979..0074484f7 100644
--- a/test/packetimpact/testbench/rawsockets.go
+++ b/test/packetimpact/testbench/rawsockets.go
@@ -47,6 +47,12 @@ func NewSniffer(t *testing.T) (Sniffer, error) {
 	if err != nil {
 		return Sniffer{}, err
 	}
+	if err := unix.SetsockoptInt(snifferFd, unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, 1); err != nil {
+		t.Fatalf("can't set sockopt SO_RCVBUFFORCE to 1: %s", err)
+	}
+	if err := unix.SetsockoptInt(snifferFd, unix.SOL_SOCKET, unix.SO_RCVBUF, 1e7); err != nil {
+		t.Fatalf("can't setsockopt SO_RCVBUF to 10M: %s", err)
+	}
 	return Sniffer{
 		t:  t,
 		fd: snifferFd,
-- 
cgit v1.2.3


From d5ddb5365086b13c0688c40fc74fa4cc4c5528db Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Tue, 7 Apr 2020 14:32:24 -0700
Subject: Remove out-of-date TODOs.

We already have network namespace for netstack.

PiperOrigin-RevId: 305341954
---
 pkg/sentry/socket/netstack/provider.go | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go
index eb090e79b..c3f04b613 100644
--- a/pkg/sentry/socket/netstack/provider.go
+++ b/pkg/sentry/socket/netstack/provider.go
@@ -62,10 +62,6 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in
 		}
 
 	case linux.SOCK_RAW:
-		// TODO(b/142504697): "In order to create a raw socket, a
-		// process must have the CAP_NET_RAW capability in the user
-		// namespace that governs its network namespace." - raw(7)
-
 		// Raw sockets require CAP_NET_RAW.
 		creds := auth.CredentialsFromContext(ctx)
 		if !creds.HasCapability(linux.CAP_NET_RAW) {
@@ -141,10 +137,6 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*
 }
 
 func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
-	// TODO(b/142504697): "In order to create a packet socket, a process
-	// must have the CAP_NET_RAW capability in the user namespace that
-	// governs its network namespace." - packet(7)
-
 	// Packet sockets require CAP_NET_RAW.
 	creds := auth.CredentialsFromContext(t)
 	if !creds.HasCapability(linux.CAP_NET_RAW) {
-- 
cgit v1.2.3


From fc72eb3595a7c4e2fa83caa39a9bb4171875c208 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 7 Apr 2020 14:47:16 -0700
Subject: Remove TODOs for local gofer extended attributes.

PiperOrigin-RevId: 305344989
---
 runsc/fsgofer/fsgofer.go | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index cadd83273..1942f50d7 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -767,22 +767,18 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 	return err
 }
 
-// TODO(b/127675828): support getxattr.
 func (*localFile) GetXattr(string, uint64) (string, error) {
 	return "", syscall.EOPNOTSUPP
 }
 
-// TODO(b/127675828): support setxattr.
 func (*localFile) SetXattr(string, string, uint32) error {
 	return syscall.EOPNOTSUPP
 }
 
-// TODO(b/148303075): support listxattr.
 func (*localFile) ListXattr(uint64) (map[string]struct{}, error) {
 	return nil, syscall.EOPNOTSUPP
 }
 
-// TODO(b/148303075): support removexattr.
 func (*localFile) RemoveXattr(string) error {
 	return syscall.EOPNOTSUPP
 }
-- 
cgit v1.2.3


From 693b6bdda9206a5910c552a3997b2df5480d6947 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 7 Apr 2020 16:16:31 -0700
Subject: Correctly distinguish between seekable and non-seekable host fds.

Check whether an fd is seekable by calling the seek syscall and
examining the return value, instead of checking the file type,
which is inaccurate.

PiperOrigin-RevId: 305361593
---
 pkg/sentry/fsimpl/host/host.go | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 7d9dcd4c9..97fa7f7ab 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -74,31 +74,34 @@ func ImportFD(mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, err
 	}
 
 	// Retrieve metadata.
-	var s syscall.Stat_t
-	if err := syscall.Fstat(hostFD, &s); err != nil {
+	var s unix.Stat_t
+	if err := unix.Fstat(hostFD, &s); err != nil {
 		return nil, err
 	}
 
 	fileMode := linux.FileMode(s.Mode)
 	fileType := fileMode.FileType()
-	// Pipes, character devices, and sockets.
-	isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
+
+	// Determine if hostFD is seekable. If not, this syscall will return ESPIPE
+	// (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character
+	// devices.
+	_, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
+	seekable := err != syserror.ESPIPE
 
 	i := &inode{
 		hostFD:   hostFD,
-		isStream: isStream,
+		seekable: seekable,
 		isTTY:    isTTY,
 		canMap:   canMap(uint32(fileType)),
 		ino:      fs.NextIno(),
 		mode:     fileMode,
-		// For simplicity, set offset to 0. Technically, we should
-		// only set to 0 on files that are not seekable (sockets, pipes, etc.),
-		// and use the offset from the host fd otherwise.
+		// For simplicity, set offset to 0. Technically, we should use the existing
+		// offset on the host if the file is seekable.
 		offset: 0,
 	}
 
-	// These files can't be memory mapped, assert this.
-	if i.isStream && i.canMap {
+	// Non-seekable files can't be memory mapped, assert this.
+	if !i.seekable && i.canMap {
 		panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
 	}
 
@@ -124,12 +127,12 @@ type inode struct {
 	// This field is initialized at creation time and is immutable.
 	hostFD int
 
-	// isStream is true if the host fd points to a file representing a stream,
+	// seekable is false if the host fd points to a file representing a stream,
 	// e.g. a socket or a pipe. Such files are not seekable and can return
 	// EWOULDBLOCK for I/O operations.
 	//
 	// This field is initialized at creation time and is immutable.
-	isStream bool
+	seekable bool
 
 	// isTTY is true if this file represents a TTY.
 	//
@@ -481,8 +484,7 @@ func (f *fileDescription) Release() {
 // PRead implements FileDescriptionImpl.
 func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	i := f.inode
-	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
-	if i.isStream {
+	if !i.seekable {
 		return 0, syserror.ESPIPE
 	}
 
@@ -492,8 +494,7 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off
 // Read implements FileDescriptionImpl.
 func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
 	i := f.inode
-	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
-	if i.isStream {
+	if !i.seekable {
 		n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags)
 		if isBlockError(err) {
 			// If we got any data at all, return it as a "completed" partial read
@@ -538,8 +539,7 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off
 // PWrite implements FileDescriptionImpl.
 func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
 	i := f.inode
-	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
-	if i.isStream {
+	if !i.seekable {
 		return 0, syserror.ESPIPE
 	}
 
@@ -549,8 +549,7 @@ func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, of
 // Write implements FileDescriptionImpl.
 func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	i := f.inode
-	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
-	if i.isStream {
+	if !i.seekable {
 		n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags)
 		if isBlockError(err) {
 			err = syserror.ErrWouldBlock
@@ -593,8 +592,7 @@ func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offs
 // allow directory fds to be imported at all.
 func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) {
 	i := f.inode
-	// TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null.
-	if i.isStream {
+	if !i.seekable {
 		return 0, syserror.ESPIPE
 	}
 
-- 
cgit v1.2.3


From acf0259255bae190759e39fbff3bac6c94122734 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 7 Apr 2020 16:44:43 -0700
Subject: Don't map the 0 uid into a sandbox user namespace

Starting with go1.13, we can specify ambient capabilities when we execute a new
process with os/exe.Cmd.

PiperOrigin-RevId: 305366706
---
 runsc/sandbox/sandbox.go | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 8de75ae57..3b06da98b 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -588,44 +588,31 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 			nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
 			cmd.Args = append(cmd.Args, "--setup-root")
 
+			const nobody = 65534
 			if conf.Rootless {
-				log.Infof("Rootless mode: sandbox will run as root inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
+				log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
 				cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
 					{
-						ContainerID: 0,
+						ContainerID: nobody,
 						HostID:      os.Getuid(),
 						Size:        1,
 					},
 				}
 				cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
 					{
-						ContainerID: 0,
+						ContainerID: nobody,
 						HostID:      os.Getgid(),
 						Size:        1,
 					},
 				}
-				cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
 
 			} else {
 				// Map nobody in the new namespace to nobody in the parent namespace.
 				//
 				// A sandbox process will construct an empty
-				// root for itself, so it has to have the CAP_SYS_ADMIN
-				// capability.
-				//
-				// FIXME(b/122554829): The current implementations of
-				// os/exec doesn't allow to set ambient capabilities if
-				// a process is started in a new user namespace. As a
-				// workaround, we start the sandbox process with the 0
-				// UID and then it constructs a chroot and sets UID to
-				// nobody.  https://github.com/golang/go/issues/2315
-				const nobody = 65534
+				// root for itself, so it has to have
+				// CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities.
 				cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
-					{
-						ContainerID: 0,
-						HostID:      nobody - 1,
-						Size:        1,
-					},
 					{
 						ContainerID: nobody,
 						HostID:      nobody,
@@ -639,11 +626,11 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 						Size:        1,
 					},
 				}
-
-				// Set credentials to run as user and group nobody.
-				cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: nobody}
 			}
 
+			// Set credentials to run as user and group nobody.
+			cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody}
+			cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps, uintptr(capability.CAP_SYS_ADMIN), uintptr(capability.CAP_SYS_CHROOT))
 		} else {
 			return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
 		}
-- 
cgit v1.2.3


From dbcc59af0b834b6295589a594fe4cc1c360e66f7 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 7 Apr 2020 17:48:06 -0700
Subject: Test TCP sender behavior against window shrinking

RFC 1122 Section 3.7: A sending TCP MUST be robust against window shrinking,
which may cause the "useable window" to become negative.

PiperOrigin-RevId: 305377072
---
 test/packetimpact/dut/posix_server.cc             | 20 ++++++++
 test/packetimpact/proto/posix_server.proto        | 27 +++++++++++
 test/packetimpact/testbench/connections.go        | 52 +++++++++++++++++---
 test/packetimpact/testbench/dut.go                | 58 +++++++++++++++++++++++
 test/packetimpact/tests/BUILD                     | 12 +++++
 test/packetimpact/tests/tcp_window_shrink_test.go | 58 +++++++++++++++++++++++
 6 files changed, 220 insertions(+), 7 deletions(-)
 create mode 100644 test/packetimpact/tests/tcp_window_shrink_test.go

diff --git a/test/packetimpact/dut/posix_server.cc b/test/packetimpact/dut/posix_server.cc
index b8177f5b1..86e580c6f 100644
--- a/test/packetimpact/dut/posix_server.cc
+++ b/test/packetimpact/dut/posix_server.cc
@@ -149,6 +149,15 @@ class PosixImpl final : public posix_server::Posix::Service {
     return ::grpc::Status::OK;
   }
 
+  ::grpc::Status Send(::grpc::ServerContext *context,
+                      const ::posix_server::SendRequest *request,
+                      ::posix_server::SendResponse *response) override {
+    response->set_ret(::send(request->sockfd(), request->buf().data(),
+                             request->buf().size(), request->flags()));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
   ::grpc::Status SetSockOpt(
       grpc_impl::ServerContext *context,
       const ::posix_server::SetSockOptRequest *request,
@@ -160,6 +169,17 @@ class PosixImpl final : public posix_server::Posix::Service {
     return ::grpc::Status::OK;
   }
 
+  ::grpc::Status SetSockOptInt(
+      ::grpc::ServerContext *context,
+      const ::posix_server::SetSockOptIntRequest *request,
+      ::posix_server::SetSockOptIntResponse *response) override {
+    int opt = request->intval();
+    response->set_ret(::setsockopt(request->sockfd(), request->level(),
+                                   request->optname(), &opt, sizeof(opt)));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
   ::grpc::Status SetSockOptTimeval(
       ::grpc::ServerContext *context,
       const ::posix_server::SetSockOptTimevalRequest *request,
diff --git a/test/packetimpact/proto/posix_server.proto b/test/packetimpact/proto/posix_server.proto
index 1565f31fa..4035e1ee6 100644
--- a/test/packetimpact/proto/posix_server.proto
+++ b/test/packetimpact/proto/posix_server.proto
@@ -93,6 +93,17 @@ message ListenResponse {
   int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
+message SendRequest {
+  int32 sockfd = 1;
+  bytes buf = 2;
+  int32 flags = 3;
+}
+
+message SendResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;
+}
+
 message SetSockOptRequest {
   int32 sockfd = 1;
   int32 level = 2;
@@ -105,6 +116,18 @@ message SetSockOptResponse {
   int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
+message SetSockOptIntRequest {
+  int32 sockfd = 1;
+  int32 level = 2;
+  int32 optname = 3;
+  int32 intval = 4;
+}
+
+message SetSockOptIntResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;
+}
+
 message SetSockOptTimevalRequest {
   int32 sockfd = 1;
   int32 level = 2;
@@ -151,11 +174,15 @@ service Posix {
   rpc GetSockName(GetSockNameRequest) returns (GetSockNameResponse);
   // Call listen() on the DUT.
   rpc Listen(ListenRequest) returns (ListenResponse);
+  // Call send() on the DUT.
+  rpc Send(SendRequest) returns (SendResponse);
   // Call setsockopt() on the DUT.  You should prefer one of the other
   // SetSockOpt* functions with a more structured optval or else you may get the
   // encoding wrong, such as making a bad assumption about the server's word
   // sizes or endianness.
   rpc SetSockOpt(SetSockOptRequest) returns (SetSockOptResponse);
+  // Call setsockopt() on the DUT with an int optval.
+  rpc SetSockOptInt(SetSockOptIntRequest) returns (SetSockOptIntResponse);
   // Call setsockopt() on the DUT with a Timeval optval.
   rpc SetSockOptTimeval(SetSockOptTimevalRequest)
       returns (SetSockOptTimevalResponse);
diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index 8d1f562ee..579da59c3 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -187,9 +187,19 @@ func (conn *TCPIPv4) Send(tcp TCP, additionalLayers ...Layer) {
 	conn.SendFrame(conn.CreateFrame(tcp, additionalLayers...))
 }
 
-// Recv gets a packet from the sniffer within the timeout provided. If no packet
-// arrives before the timeout, it returns nil.
+// Recv gets a packet from the sniffer within the timeout provided.
+// If no packet arrives before the timeout, it returns nil.
 func (conn *TCPIPv4) Recv(timeout time.Duration) *TCP {
+	layers := conn.RecvFrame(timeout)
+	if tcpLayerIndex < len(layers) {
+		return layers[tcpLayerIndex].(*TCP)
+	}
+	return nil
+}
+
+// RecvFrame gets a frame (of type Layers) within the timeout provided.
+// If no frame arrives before the timeout, it returns nil.
+func (conn *TCPIPv4) RecvFrame(timeout time.Duration) Layers {
 	deadline := time.Now().Add(timeout)
 	for {
 		timeout = time.Until(deadline)
@@ -216,14 +226,16 @@ func (conn *TCPIPv4) Recv(timeout time.Duration) *TCP {
 		for i := tcpLayerIndex + 1; i < len(layers); i++ {
 			conn.RemoteSeqNum.UpdateForward(seqnum.Size(layers[i].length()))
 		}
-		return tcpHeader
+		return layers
 	}
 	return nil
 }
 
 // Expect a packet that matches the provided tcp within the timeout specified.
-// If it doesn't arrive in time, the test fails.
+// If it doesn't arrive in time, it returns nil.
 func (conn *TCPIPv4) Expect(tcp TCP, timeout time.Duration) *TCP {
+	// We cannot implement this directly using ExpectFrame as we cannot specify
+	// the Payload part.
 	deadline := time.Now().Add(timeout)
 	for {
 		timeout = time.Until(deadline)
@@ -231,15 +243,41 @@ func (conn *TCPIPv4) Expect(tcp TCP, timeout time.Duration) *TCP {
 			return nil
 		}
 		gotTCP := conn.Recv(timeout)
-		if gotTCP == nil {
-			return nil
-		}
 		if tcp.match(gotTCP) {
 			return gotTCP
 		}
 	}
 }
 
+// ExpectFrame expects a frame that matches the specified layers within the
+// timeout specified. If it doesn't arrive in time, it returns nil.
+func (conn *TCPIPv4) ExpectFrame(layers Layers, timeout time.Duration) Layers {
+	deadline := time.Now().Add(timeout)
+	for {
+		timeout = time.Until(deadline)
+		if timeout <= 0 {
+			return nil
+		}
+		gotLayers := conn.RecvFrame(timeout)
+		if layers.match(gotLayers) {
+			return gotLayers
+		}
+	}
+}
+
+// ExpectData is a convenient method that expects a TCP packet along with
+// the payload to arrive within the timeout specified. If it doesn't arrive
+// in time, it causes a fatal test failure.
+func (conn *TCPIPv4) ExpectData(tcp TCP, data []byte, timeout time.Duration) {
+	expected := []Layer{&Ether{}, &IPv4{}, &tcp}
+	if len(data) > 0 {
+		expected = append(expected, &Payload{Bytes: data})
+	}
+	if conn.ExpectFrame(expected, timeout) == nil {
+		conn.t.Fatalf("expected to get a TCP frame %s with payload %x", &tcp, data)
+	}
+}
+
 // Handshake performs a TCP 3-way handshake.
 func (conn *TCPIPv4) Handshake() {
 	// Send the SYN.
diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go
index f342aee01..9335909c0 100644
--- a/test/packetimpact/testbench/dut.go
+++ b/test/packetimpact/testbench/dut.go
@@ -291,6 +291,35 @@ func (dut *DUT) ListenWithErrno(ctx context.Context, sockfd, backlog int32) (int
 	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
 }
 
+// Send calls send on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is needed, use
+// SendWithErrno.
+func (dut *DUT) Send(sockfd int32, buf []byte, flags int32) int32 {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	ret, err := dut.SendWithErrno(ctx, sockfd, buf, flags)
+	if ret == -1 {
+		dut.t.Fatalf("failed to send: %s", err)
+	}
+	return ret
+}
+
+// SendWithErrno calls send on the DUT.
+func (dut *DUT) SendWithErrno(ctx context.Context, sockfd int32, buf []byte, flags int32) (int32, error) {
+	dut.t.Helper()
+	req := pb.SendRequest{
+		Sockfd: sockfd,
+		Buf:    buf,
+		Flags:  flags,
+	}
+	resp, err := dut.posixServer.Send(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Send: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
 // SetSockOpt calls setsockopt on the DUT and causes a fatal test failure if it
 // doesn't succeed. If more control over the timeout or error handling is
 // needed, use SetSockOptWithErrno. Because endianess and the width of values
@@ -324,6 +353,35 @@ func (dut *DUT) SetSockOptWithErrno(ctx context.Context, sockfd, level, optname
 	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
 }
 
+// SetSockOptInt calls setsockopt on the DUT and causes a fatal test failure
+// if it doesn't succeed. If more control over the int optval or error handling
+// is needed, use SetSockOptIntWithErrno.
+func (dut *DUT) SetSockOptInt(sockfd, level, optname, optval int32) {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), *rpcTimeout)
+	defer cancel()
+	ret, err := dut.SetSockOptIntWithErrno(ctx, sockfd, level, optname, optval)
+	if ret != 0 {
+		dut.t.Fatalf("failed to SetSockOptInt: %s", err)
+	}
+}
+
+// SetSockOptIntWithErrno calls setsockopt with an integer optval.
+func (dut *DUT) SetSockOptIntWithErrno(ctx context.Context, sockfd, level, optname, optval int32) (int32, error) {
+	dut.t.Helper()
+	req := pb.SetSockOptIntRequest{
+		Sockfd:  sockfd,
+		Level:   level,
+		Optname: optname,
+		Intval:  optval,
+	}
+	resp, err := dut.posixServer.SetSockOptInt(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call SetSockOptInt: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
 // SetSockOptTimeval calls setsockopt on the DUT and causes a fatal test failure
 // if it doesn't succeed. If more control over the timeout or error handling is
 // needed, use SetSockOptTimevalWithErrno.
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 9a4d66ea9..a9b2de9b9 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -28,6 +28,18 @@ packetimpact_go_test(
     ],
 )
 
+packetimpact_go_test(
+    name = "tcp_window_shrink",
+    srcs = ["tcp_window_shrink_test.go"],
+    # TODO(b/153202472): Fix netstack then remove the line below.
+    netstack = False,
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
 sh_binary(
     name = "test_runner",
     srcs = ["test_runner.sh"],
diff --git a/test/packetimpact/tests/tcp_window_shrink_test.go b/test/packetimpact/tests/tcp_window_shrink_test.go
new file mode 100644
index 000000000..b48cc6491
--- /dev/null
+++ b/test/packetimpact/tests/tcp_window_shrink_test.go
@@ -0,0 +1,58 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_window_shrink_test
+
+import (
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func TestWindowShrink(t *testing.T) {
+	dut := tb.NewDUT(t)
+	defer dut.TearDown()
+	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFd)
+	conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	conn.Handshake()
+	acceptFd, _ := dut.Accept(listenFd)
+	defer dut.Close(acceptFd)
+
+	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+	sampleData := []byte("Sample Data")
+
+	dut.Send(acceptFd, sampleData, 0)
+	conn.ExpectData(tb.TCP{}, sampleData, time.Second)
+	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+
+	dut.Send(acceptFd, sampleData, 0)
+	dut.Send(acceptFd, sampleData, 0)
+	conn.ExpectData(tb.TCP{}, sampleData, time.Second)
+	conn.ExpectData(tb.TCP{}, sampleData, time.Second)
+	// We close our receiving window here
+	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), WindowSize: tb.Uint16(0)})
+
+	dut.Send(acceptFd, []byte("Sample Data"), 0)
+	// Note: There is another kind of zero-window probing which Windows uses (by sending one
+	// new byte at `RemoteSeqNum`), if netstack wants to go that way, we may want to change
+	// the following lines.
+	conn.ExpectData(tb.TCP{SeqNum: tb.Uint32(uint32(conn.RemoteSeqNum - 1))}, nil, time.Second)
+}
-- 
cgit v1.2.3


From 5a1324625f1d0d9ea2d4874f9d6d1008ec12f45e Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 7 Apr 2020 18:26:34 -0700
Subject: Make unlink tests pass with goferfs

Required directory checks were being skipped when there was
no child cached. Now the code always loads the child file
before unlinking it.

Updates #1198

PiperOrigin-RevId: 305382323
---
 pkg/sentry/fsimpl/gofer/filesystem.go | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 305228bda..137260898 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -437,14 +437,19 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	flags := uint32(0)
 	if dir {
 		if child != nil && !child.isDir() {
+			vfsObj.AbortDeleteDentry(childVFSD)
 			return syserror.ENOTDIR
 		}
 		flags = linux.AT_REMOVEDIR
 	} else {
 		if child != nil && child.isDir() {
+			vfsObj.AbortDeleteDentry(childVFSD)
 			return syserror.EISDIR
 		}
 		if rp.MustBeDir() {
+			if childVFSD != nil {
+				vfsObj.AbortDeleteDentry(childVFSD)
+			}
 			return syserror.ENOTDIR
 		}
 	}
-- 
cgit v1.2.3


From 5802051b3d60a802713fabbd805614f22c9291ea Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Tue, 7 Apr 2020 18:38:13 -0700
Subject: Update TODO to #238

Move TODO to #238 so that proper synchronization of operations is handled
when we create the urpc client.

Issue #238
Fixes #512

PiperOrigin-RevId: 305383924
---
 runsc/container/container.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runsc/container/container.go b/runsc/container/container.go
index c9839044c..7233659b1 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -1077,9 +1077,9 @@ func (c *Container) adjustGoferOOMScoreAdj() error {
 // oom_score_adj is set to the lowest oom_score_adj among the containers
 // running in the sandbox.
 //
-// TODO(gvisor.dev/issue/512): This call could race with other containers being
+// TODO(gvisor.dev/issue/238): This call could race with other containers being
 // created at the same time and end up setting the wrong oom_score_adj to the
-// sandbox.
+// sandbox. Use rpc client to synchronize.
 func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) error {
 	containers, err := loadSandbox(rootDir, s.ID)
 	if err != nil {
-- 
cgit v1.2.3


From b574c715a799e476ac788e5f5b2c68f1a00b3538 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Wed, 4 Mar 2020 05:44:46 +0000
Subject: Move pagetables.limitPCID to arch-specific file.

X86 provide 12 bits for PCID while arm64 support
8/16 bits ASID.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I0bd9236e44e6b6c4c88eb6e9adc5ac27b918bf6c
---
 pkg/sentry/platform/ring0/pagetables/BUILD         |  3 ++
 pkg/sentry/platform/ring0/pagetables/pcids.go      |  5 +--
 .../platform/ring0/pagetables/pcids_aarch64.go     | 32 +++++++++++++++
 .../platform/ring0/pagetables/pcids_aarch64.s      | 45 ++++++++++++++++++++++
 pkg/sentry/platform/ring0/pagetables/pcids_x86.go  | 20 ++++++++++
 5 files changed, 101 insertions(+), 4 deletions(-)
 create mode 100644 pkg/sentry/platform/ring0/pagetables/pcids_aarch64.go
 create mode 100644 pkg/sentry/platform/ring0/pagetables/pcids_aarch64.s
 create mode 100644 pkg/sentry/platform/ring0/pagetables/pcids_x86.go

diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 581841555..16d5f478b 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -81,6 +81,9 @@ go_library(
         "pagetables_arm64.go",
         "pagetables_x86.go",
         "pcids.go",
+        "pcids_aarch64.go",
+        "pcids_aarch64.s",
+        "pcids_x86.go",
         "walker_amd64.go",
         "walker_arm64.go",
         "walker_empty.go",
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids.go b/pkg/sentry/platform/ring0/pagetables/pcids.go
index 9206030bf..964496aac 100644
--- a/pkg/sentry/platform/ring0/pagetables/pcids.go
+++ b/pkg/sentry/platform/ring0/pagetables/pcids.go
@@ -18,9 +18,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
-// limitPCID is the number of valid PCIDs.
-const limitPCID = 4096
-
 // PCIDs is a simple PCID database.
 //
 // This is not protected by locks and is thus suitable for use only with a
@@ -44,7 +41,7 @@ type PCIDs struct {
 //
 // Nil is returned iff the start and size are out of range.
 func NewPCIDs(start, size uint16) *PCIDs {
-	if start+uint16(size) >= limitPCID {
+	if start+uint16(size) > limitPCID {
 		return nil // See comment.
 	}
 	p := &PCIDs{
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.go
new file mode 100644
index 000000000..fbfd41d83
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.go
@@ -0,0 +1,32 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package pagetables
+
+// limitPCID is the maximum value of PCIDs.
+//
+// In VMSAv8-64, the PCID(ASID) size is an IMPLEMENTATION DEFINED choice
+// of 8 bits or 16 bits, and ID_AA64MMFR0_EL1.ASIDBits identifies the
+// supported size. When an implementation supports a 16-bit ASID, TCR_ELx.AS
+// selects whether the top 8 bits of the ASID are used.
+var limitPCID uint16
+
+// GetASIDBits return the system ASID bits, 8 or 16 bits.
+func GetASIDBits() uint8
+
+func init() {
+	limitPCID = uint16(1)<<GetASIDBits() - 1
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.s b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.s
new file mode 100644
index 000000000..e9d62d768
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.s
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+#include "funcdata.h"
+#include "textflag.h"
+
+#define ID_AA64MMFR0_ASIDBITS_SHIFT 4
+#define ID_AA64MMFR0_ASIDBITS_16 2
+#define TCR_EL1_AS_BIT 36
+
+// GetASIDBits return the system ASID bits, 8 or 16 bits.
+//
+// func GetASIDBits() uint8
+TEXT ·GetASIDBits(SB),NOSPLIT,$0-1
+	// First, check whether 16bits ASID is supported.
+	// ID_AA64MMFR0_EL1.ASIDBITS[7:4] == 0010.
+	WORD $0xd5380700    // MRS ID_AA64MMFR0_EL1, R0
+	UBFX $ID_AA64MMFR0_ASIDBITS_SHIFT, R0, $4, R0
+	CMPW $ID_AA64MMFR0_ASIDBITS_16, R0
+	BNE bits_8
+
+	// Second, check whether 16bits ASID is enabled.
+	// TCR_EL1.AS[36] == 1.
+	WORD $0xd5382040    // MRS TCR_EL1, R0
+	TBZ  $TCR_EL1_AS_BIT, R0, bits_8
+	MOVD $16, R0
+	B done
+bits_8:
+	MOVD $8, R0
+done:
+	MOVB R0, ret+0(FP)
+	RET
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
new file mode 100644
index 000000000..91fc5e8dd
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
@@ -0,0 +1,20 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+// limitPCID is the maximum value of valid PCIDs.
+const limitPCID = 4095
-- 
cgit v1.2.3


From 56054fc1fb0b92cb985f96467f9059e202d8095c Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Tue, 7 Apr 2020 18:49:52 -0700
Subject: Add friendlier messages for frequently encountered errors.

Issue #2270
Issue #1765

PiperOrigin-RevId: 305385436
---
 runsc/boot/fs.go             | 15 +++++++++++-
 runsc/sandbox/sandbox.go     | 58 ++++++++++++++++++++++++++++++++++++++++++--
 runsc/specutils/specutils.go |  5 ++++
 3 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 0f62842ea..82cc612d2 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -824,7 +824,20 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 
 	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
 	if err != nil {
-		return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+		err := fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+		// Check to see if this is a common error due to a Linux bug.
+		// This error is generated here in order to cause it to be
+		// printed to the user using Docker via 'runsc create' etc. rather
+		// than simply printed to the logs for the 'runsc boot' command.
+		//
+		// We check the error message string rather than type because the
+		// actual error types (syscall.EIO, syscall.EPIPE) are lost by file system
+		// implementation (e.g. p9).
+		// TODO(gvisor.dev/issue/1765): Remove message when bug is resolved.
+		if strings.Contains(err.Error(), syscall.EIO.Error()) || strings.Contains(err.Error(), syscall.EPIPE.Error()) {
+			return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug"))
+		}
+		return err
 	}
 
 	// If there are submounts, we need to overlay the mount on top of a ramfs
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 3b06da98b..2d464b1bf 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -18,10 +18,12 @@ package sandbox
 import (
 	"context"
 	"fmt"
+	"io"
 	"math"
 	"os"
 	"os/exec"
 	"strconv"
+	"strings"
 	"syscall"
 	"time"
 
@@ -142,7 +144,19 @@ func New(conf *boot.Config, args *Args) (*Sandbox, error) {
 	// Wait until the sandbox has booted.
 	b := make([]byte, 1)
 	if l, err := clientSyncFile.Read(b); err != nil || l != 1 {
-		return nil, fmt.Errorf("waiting for sandbox to start: %v", err)
+		err := fmt.Errorf("waiting for sandbox to start: %v", err)
+		// If the sandbox failed to start, it may be because the binary
+		// permissions were incorrect. Check the bits and return a more helpful
+		// error message.
+		//
+		// NOTE: The error message is checked because error types are lost over
+		// rpc calls.
+		if strings.Contains(err.Error(), io.EOF.Error()) {
+			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
+				return nil, fmt.Errorf("%v: %v", err, permsErr)
+			}
+		}
+		return nil, err
 	}
 
 	c.Release()
@@ -706,7 +720,19 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 	log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args)
 	log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr)
 	if err := specutils.StartInNS(cmd, nss); err != nil {
-		return fmt.Errorf("Sandbox: %v", err)
+		err := fmt.Errorf("starting sandbox: %v", err)
+		// If the sandbox failed to start, it may be because the binary
+		// permissions were incorrect. Check the bits and return a more helpful
+		// error message.
+		//
+		// NOTE: The error message is checked because error types are lost over
+		// rpc calls.
+		if strings.Contains(err.Error(), syscall.EACCES.Error()) {
+			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
+				return fmt.Errorf("%v: %v", err, permsErr)
+			}
+		}
+		return err
 	}
 	s.child = true
 	s.Pid = cmd.Process.Pid
@@ -1169,3 +1195,31 @@ func deviceFileForPlatform(name string) (*os.File, error) {
 	}
 	return f, nil
 }
+
+// checkBinaryPermissions verifies that the required binary bits are set on
+// the runsc executable.
+func checkBinaryPermissions(conf *boot.Config) error {
+	// All platforms need the other exe bit
+	neededBits := os.FileMode(0001)
+	if conf.Platform == platforms.Ptrace {
+		// Ptrace needs the other read bit
+		neededBits |= os.FileMode(0004)
+	}
+
+	exePath, err := os.Executable()
+	if err != nil {
+		return fmt.Errorf("getting exe path: %v", err)
+	}
+
+	// Check the permissions of the runsc binary and print an error if it
+	// doesn't match expectations.
+	info, err := os.Stat(exePath)
+	if err != nil {
+		return fmt.Errorf("stat file: %v", err)
+	}
+
+	if info.Mode().Perm()&neededBits != neededBits {
+		return fmt.Errorf(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath)))
+	}
+	return nil
+}
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index d3c2e4e78..0f4a9cf6d 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -528,3 +528,8 @@ func EnvVar(env []string, name string) (string, bool) {
 	}
 	return "", false
 }
+
+// FaqErrorMsg returns an error message pointing to the FAQ.
+func FaqErrorMsg(anchor, msg string) string {
+	return fmt.Sprintf("%s; see https://gvisor.dev/faq#%s for more details", msg, anchor)
+}
-- 
cgit v1.2.3


From c7d841ac6e0be2aaacd6a3a81786508be797f667 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 8 Apr 2020 00:25:16 -0700
Subject: tests: Specify NoRandomSave for PortReuse tests

SO_REUSEPORT is not properly restored:
https://github.com/google/gvisor/issues/873

PiperOrigin-RevId: 305422775
---
 test/syscalls/linux/socket_inet_loopback.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 1b34e4ef7..030c3b835 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -1157,7 +1157,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
                 EquivalentWithin((kConnectAttempts / kThreadCount), 0.10));
 }
 
-TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) {
+TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread_NoRandomSave) {
   auto const& param = GetParam();
 
   TestAddress const& listener = param.listener;
@@ -1270,7 +1270,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) {
                 EquivalentWithin((kConnectAttempts / kThreadCount), 0.10));
 }
 
-TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
+TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort_NoRandomSave) {
   auto const& param = GetParam();
 
   TestAddress const& listener = param.listener;
@@ -2146,8 +2146,9 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReservedReuseAddr) {
                          &kSockOptOn, sizeof(kSockOptOn)),
               SyscallSucceeds());
 
-  ASSERT_THAT(connect(connected_fd.get(),
-                      reinterpret_cast<sockaddr*>(&bound_addr), bound_addr_len),
+  ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&bound_addr),
+                                  bound_addr_len),
               SyscallSucceeds());
 
   // Get the ephemeral port.
-- 
cgit v1.2.3


From a86ffefd3f52dede3ffd6ae3c20d67734ecc2616 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Wed, 8 Apr 2020 04:06:14 -0400
Subject: Enable exec_binary syscall test on Arm64

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 test/syscalls/linux/BUILD          |   5 +-
 test/syscalls/linux/exec_binary.cc | 164 ++++++++++++++++++++++++++++++++-----
 2 files changed, 143 insertions(+), 26 deletions(-)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index d0c431234..9447b06a8 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -663,10 +663,7 @@ cc_binary(
 cc_binary(
     name = "exec_binary_test",
     testonly = 1,
-    srcs = select_arch(
-        amd64 = ["exec_binary.cc"],
-        arm64 = [],
-    ),
+    srcs = ["exec_binary.cc"],
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc
index 736452b0c..ae2683256 100644
--- a/test/syscalls/linux/exec_binary.cc
+++ b/test/syscalls/linux/exec_binary.cc
@@ -48,10 +48,17 @@ namespace {
 using ::testing::AnyOf;
 using ::testing::Eq;
 
-#ifndef __x86_64__
+#if !defined(__x86_64__) && !defined(__aarch64__)
 // The assembly stub and ELF internal details must be ported to other arches.
-#error "Test only supported on x86-64"
-#endif  // __x86_64__
+#error "Test only supported on x86-64/arm64"
+#endif  // __x86_64__ || __aarch64__
+
+#if defined(__x86_64__)
+#define EM_TYPE EM_X86_64
+#define IP_REG(p) ((p).rip)
+#define RAX_REG(p) ((p).rax)
+#define RDI_REG(p) ((p).rdi)
+#define RETURN_REG(p) ((p).rax)
 
 // amd64 stub that calls PTRACE_TRACEME and sends itself SIGSTOP.
 const char kPtraceCode[] = {
@@ -139,6 +146,76 @@ const char kPtraceCode[] = {
 // Size of a syscall instruction.
 constexpr int kSyscallSize = 2;
 
+#elif defined(__aarch64__)
+#define EM_TYPE EM_AARCH64
+#define IP_REG(p) ((p).pc)
+#define RAX_REG(p) ((p).regs[8])
+#define RDI_REG(p) ((p).regs[0])
+#define RETURN_REG(p) ((p).regs[0])
+
+const char kPtraceCode[] = {
+    // MOVD $117, R8 /* ptrace */
+    '\xa8',
+    '\x0e',
+    '\x80',
+    '\xd2',
+    // MOVD $0, R0 /* PTRACE_TRACEME */
+    '\x00',
+    '\x00',
+    '\x80',
+    '\xd2',
+    // MOVD $0, R1 /* pid */
+    '\x01',
+    '\x00',
+    '\x80',
+    '\xd2',
+    // MOVD $0, R2 /* addr */
+    '\x02',
+    '\x00',
+    '\x80',
+    '\xd2',
+    // MOVD $0, R3 /* data */
+    '\x03',
+    '\x00',
+    '\x80',
+    '\xd2',
+    // SVC
+    '\x01',
+    '\x00',
+    '\x00',
+    '\xd4',
+    // MOVD $172, R8 /* getpid */
+    '\x88',
+    '\x15',
+    '\x80',
+    '\xd2',
+    // SVC
+    '\x01',
+    '\x00',
+    '\x00',
+    '\xd4',
+    // MOVD $129, R8 /* kill, R0=pid */
+    '\x28',
+    '\x10',
+    '\x80',
+    '\xd2',
+    // MOVD $19, R1  /* SIGSTOP */
+    '\x61',
+    '\x02',
+    '\x80',
+    '\xd2',
+    // SVC
+    '\x01',
+    '\x00',
+    '\x00',
+    '\xd4',
+};
+// Size of a syscall instruction.
+constexpr int kSyscallSize = 4;
+#else
+#error "Unknown architecture"
+#endif
+
 // This test suite tests executable loading in the kernel (ELF and interpreter
 // scripts).
 
@@ -281,7 +358,7 @@ ElfBinary<64> StandardElf() {
   elf.header.e_ident[EI_DATA] = ELFDATA2LSB;
   elf.header.e_ident[EI_VERSION] = EV_CURRENT;
   elf.header.e_type = ET_EXEC;
-  elf.header.e_machine = EM_X86_64;
+  elf.header.e_machine = EM_TYPE;
   elf.header.e_version = EV_CURRENT;
   elf.header.e_phoff = sizeof(elf.header);
   elf.header.e_phentsize = sizeof(decltype(elf)::ElfPhdr);
@@ -327,9 +404,15 @@ TEST(ElfTest, Execute) {
   ASSERT_NO_ERRNO(WaitStopped(child));
 
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
-  // RIP is just beyond the final syscall instruction.
-  EXPECT_EQ(regs.rip, elf.header.e_entry + sizeof(kPtraceCode));
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+ // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
+  // RIP/PC is just beyond the final syscall instruction.
+  EXPECT_EQ(IP_REG(regs), elf.header.e_entry + sizeof(kPtraceCode));
 
   EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
                          {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0,
@@ -718,9 +801,16 @@ TEST(ElfTest, PIE) {
 
   // RIP tells us which page the first segment was loaded into.
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
 
-  const uint64_t load_addr = regs.rip & ~(kPageSize - 1);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
+
+  const uint64_t load_addr = IP_REG(regs) & ~(kPageSize - 1);
 
   EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
                          // text page.
@@ -787,9 +877,15 @@ TEST(ElfTest, PIENonZeroStart) {
 
   // RIP tells us which page the first segment was loaded into.
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
-  const uint64_t load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t load_addr = IP_REG(regs) & ~(kPageSize - 1);
 
   // The ELF is loaded at an arbitrary address, not the first PT_LOAD vaddr.
   //
@@ -910,9 +1006,15 @@ TEST(ElfTest, ELFInterpreter) {
   // RIP tells us which page the first segment of the interpreter was loaded
   // into.
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
-  const uint64_t interp_load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t interp_load_addr = IP_REG(regs) & ~(kPageSize - 1);
 
   EXPECT_THAT(
       child, ContainsMappings(std::vector<ProcMapsEntry>({
@@ -1084,9 +1186,15 @@ TEST(ElfTest, ELFInterpreterRelative) {
   // RIP tells us which page the first segment of the interpreter was loaded
   // into.
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
-  const uint64_t interp_load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t interp_load_addr = IP_REG(regs) & ~(kPageSize - 1);
 
   EXPECT_THAT(
       child, ContainsMappings(std::vector<ProcMapsEntry>({
@@ -1480,14 +1588,21 @@ TEST(ExecveTest, BrkAfterBinary) {
   ASSERT_NO_ERRNO(WaitStopped(child));
 
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+    SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
   // RIP is just beyond the final syscall instruction. Rewind to execute a brk
   // syscall.
-  regs.rip -= kSyscallSize;
-  regs.rax = __NR_brk;
-  regs.rdi = 0;
-  ASSERT_THAT(ptrace(PTRACE_SETREGS, child, 0, &regs), SyscallSucceeds());
+  IP_REG(regs) -= kSyscallSize;
+  RAX_REG(regs) = __NR_brk;
+  RDI_REG(regs) = 0;
+  ASSERT_THAT(ptrace(PTRACE_SETREGSET, child, NT_PRSTATUS, &iov),
+    SyscallSucceeds());
 
   // Resume the child, waiting for syscall entry.
   ASSERT_THAT(ptrace(PTRACE_SYSCALL, child, 0, 0), SyscallSucceeds());
@@ -1504,7 +1619,12 @@ TEST(ExecveTest, BrkAfterBinary) {
   ASSERT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
       << "status = " << status;
 
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+    SyscallSucceeds());
+  //Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
   // brk is after the text page.
   //
@@ -1512,7 +1632,7 @@ TEST(ExecveTest, BrkAfterBinary) {
   // address will be, but it is always beyond the final page in the binary.
   // i.e., it does not start immediately after memsz in the middle of a page.
   // Userspace may expect to use that space.
-  EXPECT_GE(regs.rax, 0x41000);
+  EXPECT_GE(RETURN_REG(regs), 0x41000);
 }
 
 }  // namespace
-- 
cgit v1.2.3


From 71c7e24e5cb8641f4cb98b5fc848ae2033b29eac Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 8 Apr 2020 06:41:52 -0700
Subject: Return all packets when Expect fails.

PiperOrigin-RevId: 305466309
---
 test/packetimpact/testbench/connections.go        | 39 +++++++++++++----------
 test/packetimpact/tests/fin_wait2_timeout_test.go | 12 +++----
 2 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index 579da59c3..ed8689fd3 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -21,6 +21,7 @@ import (
 	"fmt"
 	"math/rand"
 	"net"
+	"strings"
 	"testing"
 	"time"
 
@@ -233,19 +234,23 @@ func (conn *TCPIPv4) RecvFrame(timeout time.Duration) Layers {
 
 // Expect a packet that matches the provided tcp within the timeout specified.
 // If it doesn't arrive in time, it returns nil.
-func (conn *TCPIPv4) Expect(tcp TCP, timeout time.Duration) *TCP {
+func (conn *TCPIPv4) Expect(tcp TCP, timeout time.Duration) (*TCP, error) {
 	// We cannot implement this directly using ExpectFrame as we cannot specify
 	// the Payload part.
 	deadline := time.Now().Add(timeout)
+	var allTCP []string
 	for {
-		timeout = time.Until(deadline)
-		if timeout <= 0 {
-			return nil
+		var gotTCP *TCP
+		if timeout = time.Until(deadline); timeout > 0 {
+			gotTCP = conn.Recv(timeout)
+		}
+		if gotTCP == nil {
+			return nil, fmt.Errorf("got %d packets:\n%s", len(allTCP), strings.Join(allTCP, "\n"))
 		}
-		gotTCP := conn.Recv(timeout)
 		if tcp.match(gotTCP) {
-			return gotTCP
+			return gotTCP, nil
 		}
+		allTCP = append(allTCP, gotTCP.String())
 	}
 }
 
@@ -284,10 +289,11 @@ func (conn *TCPIPv4) Handshake() {
 	conn.Send(TCP{Flags: Uint8(header.TCPFlagSyn)})
 
 	// Wait for the SYN-ACK.
-	conn.SynAck = conn.Expect(TCP{Flags: Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second)
-	if conn.SynAck == nil {
-		conn.t.Fatalf("didn't get synack during handshake")
+	synAck, err := conn.Expect(TCP{Flags: Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second)
+	if synAck == nil {
+		conn.t.Fatalf("didn't get synack during handshake: %s", err)
 	}
+	conn.SynAck = synAck
 
 	// Send an ACK.
 	conn.Send(TCP{Flags: Uint8(header.TCPFlagAck)})
@@ -427,19 +433,20 @@ func (conn *UDPIPv4) Recv(timeout time.Duration) *UDP {
 
 // Expect a packet that matches the provided udp within the timeout specified.
 // If it doesn't arrive in time, the test fails.
-func (conn *UDPIPv4) Expect(udp UDP, timeout time.Duration) *UDP {
+func (conn *UDPIPv4) Expect(udp UDP, timeout time.Duration) (*UDP, error) {
 	deadline := time.Now().Add(timeout)
+	var allUDP []string
 	for {
-		timeout = time.Until(deadline)
-		if timeout <= 0 {
-			return nil
+		var gotUDP *UDP
+		if timeout = time.Until(deadline); timeout > 0 {
+			gotUDP = conn.Recv(timeout)
 		}
-		gotUDP := conn.Recv(timeout)
 		if gotUDP == nil {
-			return nil
+			return nil, fmt.Errorf("got %d packets:\n%s", len(allUDP), strings.Join(allUDP, "\n"))
 		}
 		if udp.match(gotUDP) {
-			return gotUDP
+			return gotUDP, nil
 		}
+		allUDP = append(allUDP, gotUDP.String())
 	}
 }
diff --git a/test/packetimpact/tests/fin_wait2_timeout_test.go b/test/packetimpact/tests/fin_wait2_timeout_test.go
index 2b3f39045..90e16ef65 100644
--- a/test/packetimpact/tests/fin_wait2_timeout_test.go
+++ b/test/packetimpact/tests/fin_wait2_timeout_test.go
@@ -47,20 +47,20 @@ func TestFinWait2Timeout(t *testing.T) {
 			}
 			dut.Close(acceptFd)
 
-			if gotOne := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); gotOne == nil {
-				t.Fatal("expected a FIN-ACK within 1 second but got none")
+			if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
+				t.Fatalf("expected a FIN-ACK within 1 second but got none: %s", err)
 			}
 			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
 
 			time.Sleep(5 * time.Second)
 			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
 			if tt.linger2 {
-				if gotOne := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, time.Second); gotOne == nil {
-					t.Fatal("expected a RST packet within a second but got none")
+				if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
+					t.Fatalf("expected a RST packet within a second but got none: %s", err)
 				}
 			} else {
-				if gotOne := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, 10*time.Second); gotOne != nil {
-					t.Fatal("expected no RST packets within ten seconds but got one")
+				if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, 10*time.Second); err == nil {
+					t.Fatalf("expected no RST packets within ten seconds but got one: %s", err)
 				}
 			}
 		})
-- 
cgit v1.2.3


From 94b793262d3c54b4c32fed83d2bd121069680d15 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 25 Mar 2020 16:55:02 -0700
Subject: Fix all copy locks violations.

This required minor restructuring of how system call tables were saved
and restored, but it makes way more sense this way.

Updates #2243
---
 pkg/log/glog.go                       |  6 +++---
 pkg/log/json.go                       |  2 +-
 pkg/log/json_k8s.go                   |  4 ++--
 pkg/log/log.go                        |  2 +-
 pkg/log/log_test.go                   |  6 +++---
 pkg/sentry/contexttest/contexttest.go |  4 ++--
 pkg/sentry/fs/host/socket_test.go     |  6 +++---
 pkg/sentry/fs/proc/sys_net.go         |  4 ++--
 pkg/sentry/kernel/syscalls.go         | 33 ++++++++++++++++----------------
 pkg/sentry/kernel/syscalls_state.go   | 36 ++++++++++++++++++++++++++---------
 pkg/sentry/kernel/task_context.go     |  2 +-
 pkg/sentry/kernel/time/time.go        | 10 +++++-----
 pkg/state/state.go                    |  5 +----
 runsc/boot/compat.go                  |  2 +-
 runsc/main.go                         |  6 +++---
 tools/go_stateify/main.go             |  2 +-
 tools/nogo.json                       | 13 -------------
 17 files changed, 72 insertions(+), 71 deletions(-)

diff --git a/pkg/log/glog.go b/pkg/log/glog.go
index b4f7bb5a4..f57c4427b 100644
--- a/pkg/log/glog.go
+++ b/pkg/log/glog.go
@@ -25,7 +25,7 @@ import (
 // GoogleEmitter is a wrapper that emits logs in a format compatible with
 // package github.com/golang/glog.
 type GoogleEmitter struct {
-	Writer
+	*Writer
 }
 
 // pid is used for the threadid component of the header.
@@ -46,7 +46,7 @@ var pid = os.Getpid()
 //   line             The line number
 //   msg              The user-supplied message
 //
-func (g *GoogleEmitter) Emit(depth int, level Level, timestamp time.Time, format string, args ...interface{}) {
+func (g GoogleEmitter) Emit(depth int, level Level, timestamp time.Time, format string, args ...interface{}) {
 	// Log level.
 	prefix := byte('?')
 	switch level {
@@ -81,5 +81,5 @@ func (g *GoogleEmitter) Emit(depth int, level Level, timestamp time.Time, format
 	message := fmt.Sprintf(format, args...)
 
 	// Emit the formatted result.
-	fmt.Fprintf(&g.Writer, "%c%02d%02d %02d:%02d:%02d.%06d % 7d %s:%d] %s\n", prefix, int(month), day, hour, minute, second, microsecond, pid, file, line, message)
+	fmt.Fprintf(g.Writer, "%c%02d%02d %02d:%02d:%02d.%06d % 7d %s:%d] %s\n", prefix, int(month), day, hour, minute, second, microsecond, pid, file, line, message)
 }
diff --git a/pkg/log/json.go b/pkg/log/json.go
index 0943db1cc..bdf9d691e 100644
--- a/pkg/log/json.go
+++ b/pkg/log/json.go
@@ -58,7 +58,7 @@ func (lv *Level) UnmarshalJSON(b []byte) error {
 
 // JSONEmitter logs messages in json format.
 type JSONEmitter struct {
-	Writer
+	*Writer
 }
 
 // Emit implements Emitter.Emit.
diff --git a/pkg/log/json_k8s.go b/pkg/log/json_k8s.go
index 6c6fc8b6f..5883e95e1 100644
--- a/pkg/log/json_k8s.go
+++ b/pkg/log/json_k8s.go
@@ -29,11 +29,11 @@ type k8sJSONLog struct {
 // K8sJSONEmitter logs messages in json format that is compatible with
 // Kubernetes fluent configuration.
 type K8sJSONEmitter struct {
-	Writer
+	*Writer
 }
 
 // Emit implements Emitter.Emit.
-func (e *K8sJSONEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) {
+func (e K8sJSONEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) {
 	j := k8sJSONLog{
 		Log:   fmt.Sprintf(format, v...),
 		Level: level,
diff --git a/pkg/log/log.go b/pkg/log/log.go
index a794da1aa..37e0605ad 100644
--- a/pkg/log/log.go
+++ b/pkg/log/log.go
@@ -374,5 +374,5 @@ func CopyStandardLogTo(l Level) error {
 
 func init() {
 	// Store the initial value for the log.
-	log.Store(&BasicLogger{Level: Info, Emitter: &GoogleEmitter{Writer{Next: os.Stderr}}})
+	log.Store(&BasicLogger{Level: Info, Emitter: GoogleEmitter{&Writer{Next: os.Stderr}}})
 }
diff --git a/pkg/log/log_test.go b/pkg/log/log_test.go
index 402cc29ae..9ff18559b 100644
--- a/pkg/log/log_test.go
+++ b/pkg/log/log_test.go
@@ -52,7 +52,7 @@ func TestDropMessages(t *testing.T) {
 		t.Fatalf("Write should have failed")
 	}
 
-	fmt.Printf("writer: %+v\n", w)
+	fmt.Printf("writer: %#v\n", &w)
 
 	tw.fail = false
 	if _, err := w.Write([]byte("line 2\n")); err != nil {
@@ -76,7 +76,7 @@ func TestDropMessages(t *testing.T) {
 
 func TestCaller(t *testing.T) {
 	tw := &testWriter{}
-	e := &GoogleEmitter{Writer: Writer{Next: tw}}
+	e := GoogleEmitter{Writer: &Writer{Next: tw}}
 	bl := &BasicLogger{
 		Emitter: e,
 		Level:   Debug,
@@ -94,7 +94,7 @@ func BenchmarkGoogleLogging(b *testing.B) {
 	tw := &testWriter{
 		limit: 1, // Only record one message.
 	}
-	e := &GoogleEmitter{Writer: Writer{Next: tw}}
+	e := GoogleEmitter{Writer: &Writer{Next: tw}}
 	bl := &BasicLogger{
 		Emitter: e,
 		Level:   Debug,
diff --git a/pkg/sentry/contexttest/contexttest.go b/pkg/sentry/contexttest/contexttest.go
index 031fc64ec..8e5658c7a 100644
--- a/pkg/sentry/contexttest/contexttest.go
+++ b/pkg/sentry/contexttest/contexttest.go
@@ -97,7 +97,7 @@ type hostClock struct {
 }
 
 // Now implements ktime.Clock.Now.
-func (hostClock) Now() ktime.Time {
+func (*hostClock) Now() ktime.Time {
 	return ktime.FromNanoseconds(time.Now().UnixNano())
 }
 
@@ -127,7 +127,7 @@ func (t *TestContext) Value(key interface{}) interface{} {
 	case uniqueid.CtxInotifyCookie:
 		return atomic.AddUint32(&lastInotifyCookie, 1)
 	case ktime.CtxRealtimeClock:
-		return hostClock{}
+		return &hostClock{}
 	default:
 		if val, ok := t.otherValues[key]; ok {
 			return val
diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go
index eb4afe520..affdbcacb 100644
--- a/pkg/sentry/fs/host/socket_test.go
+++ b/pkg/sentry/fs/host/socket_test.go
@@ -199,14 +199,14 @@ func TestListen(t *testing.T) {
 }
 
 func TestPasscred(t *testing.T) {
-	e := ConnectedEndpoint{}
+	e := &ConnectedEndpoint{}
 	if got, want := e.Passcred(), false; got != want {
 		t.Errorf("Got %#v.Passcred() = %t, want = %t", e, got, want)
 	}
 }
 
 func TestGetLocalAddress(t *testing.T) {
-	e := ConnectedEndpoint{path: "foo"}
+	e := &ConnectedEndpoint{path: "foo"}
 	want := tcpip.FullAddress{Addr: tcpip.Address("foo")}
 	if got, err := e.GetLocalAddress(); err != nil || got != want {
 		t.Errorf("Got %#v.GetLocalAddress() = %#v, %v, want = %#v, %v", e, got, err, want, nil)
@@ -214,7 +214,7 @@ func TestGetLocalAddress(t *testing.T) {
 }
 
 func TestQueuedSize(t *testing.T) {
-	e := ConnectedEndpoint{}
+	e := &ConnectedEndpoint{}
 	tests := []struct {
 		name string
 		f    func() int64
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index d4c4b533d..702fdd392 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -80,7 +80,7 @@ func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir
 }
 
 // Truncate implements fs.InodeOperations.Truncate.
-func (tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error {
+func (*tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error {
 	return nil
 }
 
@@ -196,7 +196,7 @@ func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *f
 }
 
 // Truncate implements fs.InodeOperations.Truncate.
-func (tcpSack) Truncate(context.Context, *fs.Inode, int64) error {
+func (*tcpSack) Truncate(context.Context, *fs.Inode, int64) error {
 	return nil
 }
 
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 93c4fe969..c9a2321b8 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -218,56 +218,55 @@ type Stracer interface {
 	SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error)
 }
 
-// SyscallTable is a lookup table of system calls. Critically, a SyscallTable
-// is *immutable*. In order to make supporting suspend and resume sane, they
-// must be uniquely registered and may not change during operation.
+// SyscallTable is a lookup table of system calls.
 //
-// +stateify savable
+// Note that a SyscallTable is not savable directly. Instead, they are saved as
+// an OS/Arch pair and lookup happens again on restore.
 type SyscallTable struct {
 	// OS is the operating system that this syscall table implements.
-	OS abi.OS `state:"wait"`
+	OS abi.OS
 
 	// Arch is the architecture that this syscall table targets.
-	Arch arch.Arch `state:"wait"`
+	Arch arch.Arch
 
 	// The OS version that this syscall table implements.
-	Version Version `state:"manual"`
+	Version Version
 
 	// AuditNumber is a numeric constant that represents the syscall table. If
 	// non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by
 	// linux/audit.h.
-	AuditNumber uint32 `state:"manual"`
+	AuditNumber uint32
 
 	// Table is the collection of functions.
-	Table map[uintptr]Syscall `state:"manual"`
+	Table map[uintptr]Syscall
 
 	// lookup is a fixed-size array that holds the syscalls (indexed by
 	// their numbers). It is used for fast look ups.
-	lookup []SyscallFn `state:"manual"`
+	lookup []SyscallFn
 
 	// Emulate is a collection of instruction addresses to emulate. The
 	// keys are addresses, and the values are system call numbers.
-	Emulate map[usermem.Addr]uintptr `state:"manual"`
+	Emulate map[usermem.Addr]uintptr
 
 	// The function to call in case of a missing system call.
-	Missing MissingFn `state:"manual"`
+	Missing MissingFn
 
 	// Stracer traces this syscall table.
-	Stracer Stracer `state:"manual"`
+	Stracer Stracer
 
 	// External is used to handle an external callback.
-	External func(*Kernel) `state:"manual"`
+	External func(*Kernel)
 
 	// ExternalFilterBefore is called before External is called before the syscall is executed.
 	// External is not called if it returns false.
-	ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+	ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool
 
 	// ExternalFilterAfter is called before External is called after the syscall is executed.
 	// External is not called if it returns false.
-	ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+	ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool
 
 	// FeatureEnable stores the strace and one-shot enable bits.
-	FeatureEnable SyscallFlagsTable `state:"manual"`
+	FeatureEnable SyscallFlagsTable
 }
 
 // allSyscallTables contains all known tables.
diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go
index 00358326b..90f890495 100644
--- a/pkg/sentry/kernel/syscalls_state.go
+++ b/pkg/sentry/kernel/syscalls_state.go
@@ -14,16 +14,34 @@
 
 package kernel
 
-import "fmt"
+import (
+	"fmt"
 
-// afterLoad is invoked by stateify.
-func (s *SyscallTable) afterLoad() {
-	otherTable, ok := LookupSyscallTable(s.OS, s.Arch)
-	if !ok {
-		// Couldn't find a reference?
-		panic(fmt.Sprintf("syscall table not found for OS %v Arch %v", s.OS, s.Arch))
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+// syscallTableInfo is used to reload the SyscallTable.
+//
+// +stateify savable
+type syscallTableInfo struct {
+	OS   abi.OS
+	Arch arch.Arch
+}
+
+// saveSt saves the SyscallTable.
+func (tc *TaskContext) saveSt() syscallTableInfo {
+	return syscallTableInfo{
+		OS:   tc.st.OS,
+		Arch: tc.st.Arch,
 	}
+}
 
-	// Copy the table.
-	*s = *otherTable
+// loadSt loads the SyscallTable.
+func (tc *TaskContext) loadSt(sti syscallTableInfo) {
+	st, ok := LookupSyscallTable(sti.OS, sti.Arch)
+	if !ok {
+		panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch))
+	}
+	tc.st = st // Save the table reference.
 }
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 0158b1788..c115e8d1f 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -49,7 +49,7 @@ type TaskContext struct {
 	fu *futex.Manager
 
 	// st is the task's syscall table.
-	st *SyscallTable
+	st *SyscallTable `state:".(syscallTableInfo)"`
 }
 
 // release releases all resources held by the TaskContext. release is called by
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index 706de83ef..e959700f2 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -245,7 +245,7 @@ type Clock interface {
 type WallRateClock struct{}
 
 // WallTimeUntil implements Clock.WallTimeUntil.
-func (WallRateClock) WallTimeUntil(t, now Time) time.Duration {
+func (*WallRateClock) WallTimeUntil(t, now Time) time.Duration {
 	return t.Sub(now)
 }
 
@@ -254,16 +254,16 @@ func (WallRateClock) WallTimeUntil(t, now Time) time.Duration {
 type NoClockEvents struct{}
 
 // Readiness implements waiter.Waitable.Readiness.
-func (NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (*NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return 0
 }
 
 // EventRegister implements waiter.Waitable.EventRegister.
-func (NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+func (*NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
 }
 
 // EventUnregister implements waiter.Waitable.EventUnregister.
-func (NoClockEvents) EventUnregister(e *waiter.Entry) {
+func (*NoClockEvents) EventUnregister(e *waiter.Entry) {
 }
 
 // ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and
@@ -273,7 +273,7 @@ type ClockEventsQueue struct {
 }
 
 // Readiness implements waiter.Waitable.Readiness.
-func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (*ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return 0
 }
 
diff --git a/pkg/state/state.go b/pkg/state/state.go
index dbe507ab4..03ae2dbb0 100644
--- a/pkg/state/state.go
+++ b/pkg/state/state.go
@@ -241,10 +241,7 @@ func Register(name string, instance interface{}, fns Fns) {
 //
 // This function is used by the stateify tool.
 func IsZeroValue(val interface{}) bool {
-	if val == nil {
-		return true
-	}
-	return reflect.DeepEqual(val, reflect.Zero(reflect.TypeOf(val)).Interface())
+	return val == nil || reflect.ValueOf(val).Elem().IsZero()
 }
 
 // step captures one encoding / decoding step. On each step, there is up to one
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 8995d678e..b7cfb35bf 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -65,7 +65,7 @@ func newCompatEmitter(logFD int) (*compatEmitter, error) {
 
 	if logFD > 0 {
 		f := os.NewFile(uintptr(logFD), "user log file")
-		target := &log.MultiEmitter{c.sink, &log.K8sJSONEmitter{log.Writer{Next: f}}}
+		target := &log.MultiEmitter{c.sink, log.K8sJSONEmitter{&log.Writer{Next: f}}}
 		c.sink = &log.BasicLogger{Level: log.Info, Emitter: target}
 	}
 	return c, nil
diff --git a/runsc/main.go b/runsc/main.go
index 62e184ec9..c1c78529c 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -342,11 +342,11 @@ func main() {
 func newEmitter(format string, logFile io.Writer) log.Emitter {
 	switch format {
 	case "text":
-		return &log.GoogleEmitter{log.Writer{Next: logFile}}
+		return log.GoogleEmitter{&log.Writer{Next: logFile}}
 	case "json":
-		return &log.JSONEmitter{log.Writer{Next: logFile}}
+		return log.JSONEmitter{&log.Writer{Next: logFile}}
 	case "json-k8s":
-		return &log.K8sJSONEmitter{log.Writer{Next: logFile}}
+		return log.K8sJSONEmitter{&log.Writer{Next: logFile}}
 	}
 	cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format)
 	panic("unreachable")
diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go
index 3437aa476..309ee9c21 100644
--- a/tools/go_stateify/main.go
+++ b/tools/go_stateify/main.go
@@ -206,7 +206,7 @@ func main() {
 		initCalls = append(initCalls, fmt.Sprintf("%sRegister(\"%s.%s\", (*%s)(nil), state.Fns{Save: (*%s).save, Load: (*%s).load})", statePrefix, *fullPkg, name, name, name, name))
 	}
 	emitZeroCheck := func(name string) {
-		fmt.Fprintf(outputFile, "	if !%sIsZeroValue(x.%s) { m.Failf(\"%s is %%v, expected zero\", x.%s) }\n", statePrefix, name, name, name)
+		fmt.Fprintf(outputFile, "	if !%sIsZeroValue(&x.%s) { m.Failf(\"%s is %%#v, expected zero\", &x.%s) }\n", statePrefix, name, name, name)
 	}
 	emitLoadValue := func(name, typName string) {
 		fmt.Fprintf(outputFile, "	m.LoadValue(\"%s\", new(%s), func(y interface{}) { x.load%s(y.(%s)) })\n", name, typName, camelCased(name), typName)
diff --git a/tools/nogo.json b/tools/nogo.json
index 83cb76b93..cc05ba027 100644
--- a/tools/nogo.json
+++ b/tools/nogo.json
@@ -9,19 +9,6 @@
       "/external/": "allowed: not subject to unsafe naming rules"
     }
   },
-  "copylocks": {
-    "exclude_files": {
-      ".*_state_autogen.go": "fix: m.Failf copies by value",
-      "/pkg/log/json.go": "fix: Emit passes lock by value: gvisor.dev/gvisor/pkg/log.JSONEmitter contains gvisor.dev/gvisor/pkg/log.Writer contains gvisor.dev/gvisor/pkg/sync.Mutex",
-      "/pkg/log/log_test.go": "fix: call of fmt.Printf copies lock value: gvisor.dev/gvisor/pkg/log.Writer contains gvisor.dev/gvisor/pkg/sync.Mutex",
-      "/pkg/sentry/fs/host/socket_test.go": "fix: call of t.Errorf copies lock value: gvisor.dev/gvisor/pkg/sentry/fs/host.ConnectedEndpoint contains gvisor.dev/gvisor/pkg/refs.AtomicRefCount contains gvisor.dev/gvisor/pkg/sync.Mutex",
-      "/pkg/sentry/fs/proc/sys_net.go": "fix: Truncate passes lock by value: gvisor.dev/gvisor/pkg/sentry/fs/proc.tcpMemInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.SimpleFileInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.InodeSimpleAttributes contains gvisor.dev/gvisor/pkg/sync.RWMutex",
-      "/pkg/sentry/fs/proc/sys_net.go": "fix: Truncate passes lock by value: gvisor.dev/gvisor/pkg/sentry/fs/proc.tcpSack contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.SimpleFileInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.InodeSimpleAttributes contains gvisor.dev/gvisor/pkg/sync.RWMutex",
-      "/pkg/sentry/fs/tty/slave.go": "fix: Truncate passes lock by value: gvisor.dev/gvisor/pkg/sentry/fs/tty.slaveInodeOperations contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.SimpleFileInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.InodeSimpleAttributes contains gvisor.dev/gvisor/pkg/sync.RWMutex",
-      "/pkg/sentry/kernel/time/time.go": "fix: Readiness passes lock by value: gvisor.dev/gvisor/pkg/sentry/kernel/time.ClockEventsQueue contains gvisor.dev/gvisor/pkg/waiter.Queue contains gvisor.dev/gvisor/pkg/sync.RWMutex",
-      "/pkg/sentry/kernel/syscalls_state.go": "fix: assignment copies lock value to *s: gvisor.dev/gvisor/pkg/sentry/kernel.SyscallTable contains gvisor.dev/gvisor/pkg/sentry/kernel.SyscallFlagsTable contains gvisor.dev/gvisor/pkg/sync.Mutex"
-    }
-  },
   "lostcancel": {
     "exclude_files": {
       "/pkg/tcpip/network/arp/arp_test.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak",
-- 
cgit v1.2.3


From 928a7c60b8f02811e9c0fcbed0077efd55471cc4 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 25 Mar 2020 17:15:05 -0700
Subject: Fix all printf formatting errors.

Updates #2243
---
 pkg/eventchannel/event_test.go       |  4 ++--
 pkg/segment/test/segment_test.go     |  2 +-
 pkg/sentry/fs/fdpipe/pipe_test.go    |  4 ++--
 pkg/sentry/fsimpl/tmpfs/stat_test.go |  8 ++++----
 pkg/tcpip/header/eth_test.go         |  2 +-
 pkg/tcpip/header/ndp_test.go         |  2 +-
 pkg/tcpip/link/fdbased/endpoint.go   |  2 +-
 pkg/tcpip/stack/ndp_test.go          | 12 ++++++------
 pkg/tcpip/stack/stack_test.go        | 32 ++++++++++++++++----------------
 pkg/tcpip/tcpip_test.go              |  2 +-
 pkg/tcpip/transport/tcp/tcp_test.go  |  4 ++--
 pkg/tcpip/transport/udp/udp_test.go  |  2 +-
 runsc/container/test_app/test_app.go |  2 +-
 test/root/cgroup_test.go             |  4 ++--
 test/runtimes/blacklist_test.go      |  2 +-
 test/runtimes/runner.go              |  2 +-
 tools/nogo.json                      | 31 -------------------------------
 17 files changed, 43 insertions(+), 74 deletions(-)

diff --git a/pkg/eventchannel/event_test.go b/pkg/eventchannel/event_test.go
index 7f41b4a27..43750360b 100644
--- a/pkg/eventchannel/event_test.go
+++ b/pkg/eventchannel/event_test.go
@@ -78,7 +78,7 @@ func TestMultiEmitter(t *testing.T) {
 	for _, name := range names {
 		m := testMessage{name: name}
 		if _, err := me.Emit(m); err != nil {
-			t.Fatal("me.Emit(%v) failed: %v", m, err)
+			t.Fatalf("me.Emit(%v) failed: %v", m, err)
 		}
 	}
 
@@ -96,7 +96,7 @@ func TestMultiEmitter(t *testing.T) {
 
 	// Close multiEmitter.
 	if err := me.Close(); err != nil {
-		t.Fatal("me.Close() failed: %v", err)
+		t.Fatalf("me.Close() failed: %v", err)
 	}
 
 	// All testEmitters should be closed.
diff --git a/pkg/segment/test/segment_test.go b/pkg/segment/test/segment_test.go
index f19a005f3..97b16c158 100644
--- a/pkg/segment/test/segment_test.go
+++ b/pkg/segment/test/segment_test.go
@@ -63,7 +63,7 @@ func checkSet(s *Set, expectedSegments int) error {
 			return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments)
 		}
 		if got, want := seg.Value(), seg.Start()+valueOffset; got != want {
-			return fmt.Errorf("segment %d has key %d, value %d (expected %d)", nrSegments, seg.Start, got, want)
+			return fmt.Errorf("segment %d has key %d, value %d (expected %d)", nrSegments, seg.Start(), got, want)
 		}
 		prev = next
 		havePrev = true
diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go
index 5aff0cc95..a0082ecca 100644
--- a/pkg/sentry/fs/fdpipe/pipe_test.go
+++ b/pkg/sentry/fs/fdpipe/pipe_test.go
@@ -119,7 +119,7 @@ func TestNewPipe(t *testing.T) {
 				continue
 			}
 			if flags := p.flags; test.flags != flags {
-				t.Errorf("%s: got file flags %s, want %s", test.desc, flags, test.flags)
+				t.Errorf("%s: got file flags %v, want %v", test.desc, flags, test.flags)
 				continue
 			}
 			if len(test.readAheadBuffer) != len(p.readAheadBuffer) {
@@ -136,7 +136,7 @@ func TestNewPipe(t *testing.T) {
 				continue
 			}
 			if !fdnotifier.HasFD(int32(f.FD())) {
-				t.Errorf("%s: pipe fd %d is not registered for events", test.desc, f.FD)
+				t.Errorf("%s: pipe fd %d is not registered for events", test.desc, f.FD())
 			}
 		}
 	}
diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go
index 3e02e7190..d4f59ee5b 100644
--- a/pkg/sentry/fsimpl/tmpfs/stat_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go
@@ -140,7 +140,7 @@ func TestSetStatAtime(t *testing.T) {
 		Mask:  0,
 		Atime: linux.NsecToStatxTimestamp(100),
 	}}); err != nil {
-		t.Errorf("SetStat atime without mask failed: %v")
+		t.Errorf("SetStat atime without mask failed: %v", err)
 	}
 	// Atime should be unchanged.
 	if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
@@ -155,7 +155,7 @@ func TestSetStatAtime(t *testing.T) {
 		Atime: linux.NsecToStatxTimestamp(100),
 	}
 	if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil {
-		t.Errorf("SetStat atime with mask failed: %v")
+		t.Errorf("SetStat atime with mask failed: %v", err)
 	}
 	if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
 		t.Errorf("Stat got error: %v", err)
@@ -205,7 +205,7 @@ func TestSetStat(t *testing.T) {
 				Mask:  0,
 				Atime: linux.NsecToStatxTimestamp(100),
 			}}); err != nil {
-				t.Errorf("SetStat atime without mask failed: %v")
+				t.Errorf("SetStat atime without mask failed: %v", err)
 			}
 			// Atime should be unchanged.
 			if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
@@ -220,7 +220,7 @@ func TestSetStat(t *testing.T) {
 				Atime: linux.NsecToStatxTimestamp(100),
 			}
 			if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil {
-				t.Errorf("SetStat atime with mask failed: %v")
+				t.Errorf("SetStat atime with mask failed: %v", err)
 			}
 			if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
 				t.Errorf("Stat got error: %v", err)
diff --git a/pkg/tcpip/header/eth_test.go b/pkg/tcpip/header/eth_test.go
index 7a0014ad9..14413f2ce 100644
--- a/pkg/tcpip/header/eth_test.go
+++ b/pkg/tcpip/header/eth_test.go
@@ -88,7 +88,7 @@ func TestEthernetAddressFromMulticastIPv4Address(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			if got := EthernetAddressFromMulticastIPv4Address(test.addr); got != test.expectedLinkAddr {
-				t.Fatalf("got EthernetAddressFromMulticastIPv4Address(%s) = %s, want = %s", got, test.expectedLinkAddr)
+				t.Fatalf("got EthernetAddressFromMulticastIPv4Address(%s) = %s, want = %s", test.addr, got, test.expectedLinkAddr)
 			}
 		})
 	}
diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go
index 1cb9f5dc8..969f8f1e8 100644
--- a/pkg/tcpip/header/ndp_test.go
+++ b/pkg/tcpip/header/ndp_test.go
@@ -115,7 +115,7 @@ func TestNDPNeighborAdvert(t *testing.T) {
 
 	// Make sure flags got updated in the backing buffer.
 	if got := b[ndpNAFlagsOffset]; got != 64 {
-		t.Errorf("got flags byte = %d, want = 64")
+		t.Errorf("got flags byte = %d, want = 64", got)
 	}
 }
 
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index 7198742b7..b857ce9d0 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -91,7 +91,7 @@ func (p PacketDispatchMode) String() string {
 	case PacketMMap:
 		return "PacketMMap"
 	default:
-		return fmt.Sprintf("unknown packet dispatch mode %v", p)
+		return fmt.Sprintf("unknown packet dispatch mode '%d'", p)
 	}
 }
 
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 27dc8baf9..bc822e3b1 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -1959,7 +1959,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
 	// addr2 is deprecated but if explicitly requested, it should be used.
 	fullAddr2 := tcpip.FullAddress{Addr: addr2.Address, NIC: nicID}
 	if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr2.Address)
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
 	}
 
 	// Another PI w/ 0 preferred lifetime should not result in a deprecation
@@ -1972,7 +1972,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
 	}
 	expectPrimaryAddr(addr1)
 	if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr2.Address)
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
 	}
 
 	// Refresh lifetimes of addr generated from prefix2.
@@ -2084,7 +2084,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	// addr1 is deprecated but if explicitly requested, it should be used.
 	fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID}
 	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address)
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
 	}
 
 	// Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make
@@ -2097,7 +2097,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	}
 	expectPrimaryAddr(addr2)
 	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address)
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
 	}
 
 	// Refresh lifetimes for addr of prefix1.
@@ -2121,7 +2121,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	// addr2 should be the primary endpoint now since it is not deprecated.
 	expectPrimaryAddr(addr2)
 	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address)
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
 	}
 
 	// Wait for addr of prefix1 to be invalidated.
@@ -2564,7 +2564,7 @@ func TestAutoGenAddrAfterRemoval(t *testing.T) {
 		AddressWithPrefix: addr2,
 	}
 	if err := s.AddProtocolAddressWithOptions(nicID, protoAddr2, stack.FirstPrimaryEndpoint); err != nil {
-		t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d, %s) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err)
+		t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err)
 	}
 	// addr2 should be more preferred now since it is at the front of the primary
 	// list.
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 3f8a2a095..c7634ceb1 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -1445,19 +1445,19 @@ func TestOutgoingBroadcastWithEmptyRouteTable(t *testing.T) {
 
 	protoAddr := tcpip.ProtocolAddress{Protocol: fakeNetNumber, AddressWithPrefix: tcpip.AddressWithPrefix{header.IPv4Any, 0}}
 	if err := s.AddProtocolAddress(1, protoAddr); err != nil {
-		t.Fatalf("AddProtocolAddress(1, %s) failed: %s", protoAddr, err)
+		t.Fatalf("AddProtocolAddress(1, %v) failed: %v", protoAddr, err)
 	}
 	r, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */)
 	if err != nil {
-		t.Fatalf("FindRoute(1, %s, %s, %d) failed: %s", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err)
+		t.Fatalf("FindRoute(1, %v, %v, %d) failed: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err)
 	}
 	if err := verifyRoute(r, stack.Route{LocalAddress: header.IPv4Any, RemoteAddress: header.IPv4Broadcast}); err != nil {
-		t.Errorf("FindRoute(1, %s, %s, %d) returned unexpected Route: %s)", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err)
+		t.Errorf("FindRoute(1, %v, %v, %d) returned unexpected Route: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err)
 	}
 
 	// If the NIC doesn't exist, it won't work.
 	if _, err := s.FindRoute(2, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */); err != tcpip.ErrNetworkUnreachable {
-		t.Fatalf("got FindRoute(2, %s, %s, %d) = %s want = %s", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err, tcpip.ErrNetworkUnreachable)
+		t.Fatalf("got FindRoute(2, %v, %v, %d) = %v want = %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err, tcpip.ErrNetworkUnreachable)
 	}
 }
 
@@ -1483,12 +1483,12 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) {
 	}
 	nic1ProtoAddr := tcpip.ProtocolAddress{fakeNetNumber, nic1Addr}
 	if err := s.AddProtocolAddress(1, nic1ProtoAddr); err != nil {
-		t.Fatalf("AddProtocolAddress(1, %s) failed: %s", nic1ProtoAddr, err)
+		t.Fatalf("AddProtocolAddress(1, %v) failed: %v", nic1ProtoAddr, err)
 	}
 
 	nic2ProtoAddr := tcpip.ProtocolAddress{fakeNetNumber, nic2Addr}
 	if err := s.AddProtocolAddress(2, nic2ProtoAddr); err != nil {
-		t.Fatalf("AddAddress(2, %s) failed: %s", nic2ProtoAddr, err)
+		t.Fatalf("AddAddress(2, %v) failed: %v", nic2ProtoAddr, err)
 	}
 
 	// Set the initial route table.
@@ -1503,10 +1503,10 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) {
 	// When an interface is given, the route for a broadcast goes through it.
 	r, err := s.FindRoute(1, nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */)
 	if err != nil {
-		t.Fatalf("FindRoute(1, %s, %s, %d) failed: %s", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err)
+		t.Fatalf("FindRoute(1, %v, %v, %d) failed: %v", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err)
 	}
 	if err := verifyRoute(r, stack.Route{LocalAddress: nic1Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil {
-		t.Errorf("FindRoute(1, %s, %s, %d) returned unexpected Route: %s)", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err)
+		t.Errorf("FindRoute(1, %v, %v, %d) returned unexpected Route: %v", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err)
 	}
 
 	// When an interface is not given, it consults the route table.
@@ -2399,7 +2399,7 @@ func TestNICContextPreservation(t *testing.T) {
 				t.Fatalf("got nicinfos[%d] = _, %t, want _, true; nicinfos = %+v", id, ok, nicinfos)
 			}
 			if got, want := nicinfo.Context == test.want, true; got != want {
-				t.Fatal("got nicinfo.Context == ctx = %t, want %t; nicinfo.Context = %p, ctx = %p", got, want, nicinfo.Context, test.want)
+				t.Fatalf("got nicinfo.Context == ctx = %t, want %t; nicinfo.Context = %p, ctx = %p", got, want, nicinfo.Context, test.want)
 			}
 		})
 	}
@@ -2768,7 +2768,7 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 				{
 					subnet, err := tcpip.NewSubnet("\x00", "\x00")
 					if err != nil {
-						t.Fatalf("NewSubnet failed:", err)
+						t.Fatalf("NewSubnet failed: %v", err)
 					}
 					s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
 				}
@@ -2782,11 +2782,11 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 				// permanentExpired kind.
 				r, err := s.FindRoute(1, "\x01", "\x02", fakeNetNumber, false)
 				if err != nil {
-					t.Fatal("FindRoute failed:", err)
+					t.Fatalf("FindRoute failed: %v", err)
 				}
 				defer r.Release()
 				if err := s.RemoveAddress(1, "\x01"); err != nil {
-					t.Fatalf("RemoveAddress failed:", err)
+					t.Fatalf("RemoveAddress failed: %v", err)
 				}
 
 				//
@@ -2798,7 +2798,7 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 				// Add some other address with peb set to
 				// FirstPrimaryEndpoint.
 				if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x03", stack.FirstPrimaryEndpoint); err != nil {
-					t.Fatal("AddAddressWithOptions failed:", err)
+					t.Fatalf("AddAddressWithOptions failed: %v", err)
 
 				}
 
@@ -2806,7 +2806,7 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 				// make sure the new peb was respected.
 				// (The address should just be promoted now).
 				if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x01", ps); err != nil {
-					t.Fatal("AddAddressWithOptions failed:", err)
+					t.Fatalf("AddAddressWithOptions failed: %v", err)
 				}
 				var primaryAddrs []tcpip.Address
 				for _, pa := range s.NICInfo()[1].ProtocolAddresses {
@@ -2839,11 +2839,11 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 				// GetMainNICAddress; else, our original address
 				// should be returned.
 				if err := s.RemoveAddress(1, "\x03"); err != nil {
-					t.Fatalf("RemoveAddress failed:", err)
+					t.Fatalf("RemoveAddress failed: %v", err)
 				}
 				addr, err = s.GetMainNICAddress(1, fakeNetNumber)
 				if err != nil {
-					t.Fatal("s.GetMainNICAddress failed:", err)
+					t.Fatalf("s.GetMainNICAddress failed: %v", err)
 				}
 				if ps == stack.NeverPrimaryEndpoint {
 					if want := (tcpip.AddressWithPrefix{}); addr != want {
diff --git a/pkg/tcpip/tcpip_test.go b/pkg/tcpip/tcpip_test.go
index 8c0aacffa..1c8e2bc34 100644
--- a/pkg/tcpip/tcpip_test.go
+++ b/pkg/tcpip/tcpip_test.go
@@ -218,7 +218,7 @@ func TestAddressWithPrefixSubnet(t *testing.T) {
 		gotSubnet := ap.Subnet()
 		wantSubnet, err := NewSubnet(tt.subnetAddr, tt.subnetMask)
 		if err != nil {
-			t.Error("NewSubnet(%q, %q) failed: %s", tt.subnetAddr, tt.subnetMask, err)
+			t.Errorf("NewSubnet(%q, %q) failed: %s", tt.subnetAddr, tt.subnetMask, err)
 			continue
 		}
 		if gotSubnet != wantSubnet {
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index ce3df7478..8a87fa0a8 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -284,7 +284,7 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 	// are released instantly on Close.
 	tcpTW := tcpip.TCPTimeWaitTimeoutOption(1 * time.Millisecond)
 	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpTW); err != nil {
-		t.Fatalf("e.stack.SetTransportProtocolOption(%d, %s) = %s", tcp.ProtocolNumber, tcpTW, err)
+		t.Fatalf("e.stack.SetTransportProtocolOption(%d, %v) = %v", tcp.ProtocolNumber, tcpTW, err)
 	}
 
 	c.EP.Close()
@@ -5609,7 +5609,7 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 				return
 			}
 			if w := tcp.WindowSize(); w == 0 || w > uint16(wantRcvWnd) {
-				t.Errorf("expected a non-zero window: got %d, want <= wantRcvWnd", w, wantRcvWnd)
+				t.Errorf("expected a non-zero window: got %d, want <= wantRcvWnd", w)
 			}
 		},
 	))
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 0905726c1..b3fe557ca 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -607,7 +607,7 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	// Check the peer address.
 	h := flow.header4Tuple(incoming)
 	if addr.Addr != h.srcAddr.Addr {
-		c.t.Fatalf("unexpected remote address: got %s, want %s", addr.Addr, h.srcAddr)
+		c.t.Fatalf("unexpected remote address: got %s, want %v", addr.Addr, h.srcAddr)
 	}
 
 	// Check the payload.
diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go
index 01c47c79f..5f1c4b7d6 100644
--- a/runsc/container/test_app/test_app.go
+++ b/runsc/container/test_app/test_app.go
@@ -96,7 +96,7 @@ func (c *uds) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{})
 
 	listener, err := net.Listen("unix", c.socketPath)
 	if err != nil {
-		log.Fatal("error listening on socket %q:", c.socketPath, err)
+		log.Fatalf("error listening on socket %q: %v", c.socketPath, err)
 	}
 
 	go server(listener, outputFile)
diff --git a/test/root/cgroup_test.go b/test/root/cgroup_test.go
index 4038661cb..679342def 100644
--- a/test/root/cgroup_test.go
+++ b/test/root/cgroup_test.go
@@ -53,7 +53,7 @@ func verifyPid(pid int, path string) error {
 	if scanner.Err() != nil {
 		return scanner.Err()
 	}
-	return fmt.Errorf("got: %s, want: %d", gots, pid)
+	return fmt.Errorf("got: %v, want: %d", gots, pid)
 }
 
 // TestCgroup sets cgroup options and checks that cgroup was properly configured.
@@ -106,7 +106,7 @@ func TestMemCGroup(t *testing.T) {
 		time.Sleep(100 * time.Millisecond)
 	}
 
-	t.Fatalf("%vMB is less than %vMB: %v", memUsage>>20, allocMemSize>>20)
+	t.Fatalf("%vMB is less than %vMB", memUsage>>20, allocMemSize>>20)
 }
 
 // TestCgroup sets cgroup options and checks that cgroup was properly configured.
diff --git a/test/runtimes/blacklist_test.go b/test/runtimes/blacklist_test.go
index 52f49b984..0ff69ab18 100644
--- a/test/runtimes/blacklist_test.go
+++ b/test/runtimes/blacklist_test.go
@@ -32,6 +32,6 @@ func TestBlacklists(t *testing.T) {
 		t.Fatalf("error parsing blacklist: %v", err)
 	}
 	if *blacklistFile != "" && len(bl) == 0 {
-		t.Errorf("got empty blacklist for file %q", blacklistFile)
+		t.Errorf("got empty blacklist for file %q", *blacklistFile)
 	}
 }
diff --git a/test/runtimes/runner.go b/test/runtimes/runner.go
index ddb890dbc..3c98f4570 100644
--- a/test/runtimes/runner.go
+++ b/test/runtimes/runner.go
@@ -114,7 +114,7 @@ func getTests(d dockerutil.Docker, blacklist map[string]struct{}) ([]testing.Int
 			F: func(t *testing.T) {
 				// Is the test blacklisted?
 				if _, ok := blacklist[tc]; ok {
-					t.Skip("SKIP: blacklisted test %q", tc)
+					t.Skipf("SKIP: blacklisted test %q", tc)
 				}
 
 				var (
diff --git a/tools/nogo.json b/tools/nogo.json
index cc05ba027..f69999e50 100644
--- a/tools/nogo.json
+++ b/tools/nogo.json
@@ -27,37 +27,6 @@
       "/external/io_opencensus_go/tag/map_codec.go": "allowed: false positive"
     }
   },
-  "printf": {
-    "exclude_files": {
-      ".*_abi_autogen_test.go": "fix: Sprintf format has insufficient args",
-      "/pkg/segment/test/segment_test.go": "fix: Errorf format %d arg seg.Start is a func value, not called",
-      "/pkg/tcpip/tcpip_test.go": "fix: Error call has possible formatting directive %q",
-      "/pkg/tcpip/header/eth_test.go": "fix: Fatalf format %s reads arg #3, but call has 2 args",
-      "/pkg/tcpip/header/ndp_test.go": "fix: Errorf format %d reads arg #1, but call has 0 args",
-      "/pkg/eventchannel/event_test.go": "fix: Fatal call has possible formatting directive %v",
-      "/pkg/tcpip/stack/ndp.go": "fix: Fatalf format %s has arg protocolAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress",
-      "/pkg/sentry/fs/fdpipe/pipe_test.go": "fix: Errorf format %s has arg flags of wrong type gvisor.dev/gvisor/pkg/sentry/fs.FileFlags",
-      "/pkg/sentry/fs/fdpipe/pipe_test.go": "fix: Errorf format %d arg f.FD is a func value, not called",
-      "/pkg/tcpip/link/fdbased/endpoint.go": "fix: Sprintf format %v with arg p causes recursive String method call",
-      "/pkg/tcpip/transport/udp/udp_test.go": "fix: Fatalf format %s has arg h.srcAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.FullAddress",
-      "/pkg/tcpip/transport/tcp/tcp_test.go": "fix: Fatalf format %s has arg tcpTW of wrong type gvisor.dev/gvisor/pkg/tcpip.TCPTimeWaitTimeoutOption",
-      "/pkg/tcpip/transport/tcp/tcp_test.go": "fix: Errorf call needs 1 arg but has 2 args",
-      "/pkg/tcpip/stack/ndp_test.go": "fix: Errorf format %s reads arg #3, but call has 2 args",
-      "/pkg/tcpip/stack/ndp_test.go": "fix: Fatalf format %s reads arg #5, but call has 4 args",
-      "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf format %s has arg protoAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress",
-      "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf format %s has arg nic1ProtoAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress",
-      "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf format %s has arg nic2ProtoAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress",
-      "/pkg/tcpip/stack/stack_test.go": "fix: Fatal call has possible formatting directive %t",
-      "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf call has arguments but no formatting directives",
-      "/pkg/tcpip/link/fdbased/endpoint.go": "fix: Sprintf format %v with arg p causes recursive String method call",
-      "/pkg/sentry/fsimpl/tmpfs/stat_test.go": "fix: Errorf format %v reads arg #1, but call has 0 args",
-      "/runsc/container/test_app/test_app.go": "fix: Fatal call has possible formatting directive %q",
-      "/test/root/cgroup_test.go": "fix: Errorf format %s has arg gots of wrong type []int",
-      "/test/root/cgroup_test.go": "fix: Fatalf format %v reads arg #3, but call has 2 args",
-      "/test/runtimes/runner.go": "fix: Skip call has possible formatting directive %q",
-      "/test/runtimes/blacklist_test.go": "fix: Errorf format %q has arg blacklistFile of wrong type *string"
-    }
-  },
   "structtag": {
     "exclude_files": {
       "/external/": "allowed: may use arbitrary tags"
-- 
cgit v1.2.3


From f888b9ce83c202bb77c1e29c3ee60cc485906536 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 25 Mar 2020 16:57:37 -0700
Subject: Fix unused result errors.

This fixes a bug in the proc net directory.

Updates #2243
---
 pkg/sentry/fsimpl/proc/task_net.go | 4 ++--
 tools/nogo.json                    | 6 ------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index 6b2a77328..6595fcee6 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -688,9 +688,9 @@ func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		if line.prefix == "Tcp" {
 			tcp := stat.(*inet.StatSNMPTCP)
 			// "Tcp" needs special processing because MaxConn is signed. RFC 2012.
-			fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:]))
+			fmt.Fprintf(buf, "%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:]))
 		} else {
-			fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat)))
+			fmt.Fprintf(buf, "%s: %s\n", line.prefix, sprintSlice(toSlice(stat)))
 		}
 	}
 	return nil
diff --git a/tools/nogo.json b/tools/nogo.json
index f69999e50..09bda9212 100644
--- a/tools/nogo.json
+++ b/tools/nogo.json
@@ -44,11 +44,5 @@
       "/pkg/sentry/platform/safecopy/safecopy_unsafe.go": "allowed: special case",
       "/pkg/sentry/vfs/mount_unsafe.go": "allowed: special case"
     }
-  },
-  "unusedresult": {
-    "exclude_files": {
-      "/pkg/sentry/fsimpl/proc/task_net.go": "fix: result of fmt.Sprintf call not used",
-      "/pkg/sentry/fsimpl/proc/tasks_net.go": "fix: result of fmt.Sprintf call not used"
-    }
   }
 }
-- 
cgit v1.2.3


From 867eeb18d8c65019fb755194d5bdf768837f07a8 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 25 Mar 2020 17:20:16 -0700
Subject: Remove lostcancel warnings.

Updates #2243
---
 pkg/tcpip/network/arp/arp_test.go                  |  3 ++-
 pkg/tcpip/stack/ndp_test.go                        |  6 ++++--
 pkg/tcpip/transport/tcp/testing/context/context.go |  9 ++++++---
 pkg/tcpip/transport/udp/udp_test.go                | 15 ++++++++++-----
 tools/nogo.json                                    |  8 --------
 5 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
index b3e239ac7..1646d9cde 100644
--- a/pkg/tcpip/network/arp/arp_test.go
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -138,7 +138,8 @@ func TestDirectRequest(t *testing.T) {
 	// Sleep tests are gross, but this will only potentially flake
 	// if there's a bug. If there is no bug this will reliably
 	// succeed.
-	ctx, _ := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer cancel()
 	if pkt, ok := c.linkEP.ReadContext(ctx); ok {
 		t.Errorf("stackAddrBad: unexpected packet sent, Proto=%v", pkt.Proto)
 	}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index bc822e3b1..acb2d4731 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -3483,7 +3483,8 @@ func TestRouterSolicitation(t *testing.T) {
 				e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 				waitForPkt := func(timeout time.Duration) {
 					t.Helper()
-					ctx, _ := context.WithTimeout(context.Background(), timeout)
+					ctx, cancel := context.WithTimeout(context.Background(), timeout)
+					defer cancel()
 					p, ok := e.ReadContext(ctx)
 					if !ok {
 						t.Fatal("timed out waiting for packet")
@@ -3513,7 +3514,8 @@ func TestRouterSolicitation(t *testing.T) {
 				}
 				waitForNothing := func(timeout time.Duration) {
 					t.Helper()
-					ctx, _ := context.WithTimeout(context.Background(), timeout)
+					ctx, cancel := context.WithTimeout(context.Background(), timeout)
+					defer cancel()
 					if _, ok := e.ReadContext(ctx); ok {
 						t.Fatal("unexpectedly got a packet")
 					}
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index d4f6bc635..431ab4e6b 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -217,7 +217,8 @@ func (c *Context) Stack() *stack.Stack {
 func (c *Context) CheckNoPacketTimeout(errMsg string, wait time.Duration) {
 	c.t.Helper()
 
-	ctx, _ := context.WithTimeout(context.Background(), wait)
+	ctx, cancel := context.WithTimeout(context.Background(), wait)
+	defer cancel()
 	if _, ok := c.linkEP.ReadContext(ctx); ok {
 		c.t.Fatal(errMsg)
 	}
@@ -235,7 +236,8 @@ func (c *Context) CheckNoPacket(errMsg string) {
 func (c *Context) GetPacket() []byte {
 	c.t.Helper()
 
-	ctx, _ := context.WithTimeout(context.Background(), 2*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
 	p, ok := c.linkEP.ReadContext(ctx)
 	if !ok {
 		c.t.Fatalf("Packet wasn't written out")
@@ -486,7 +488,8 @@ func (c *Context) CreateV6Endpoint(v6only bool) {
 func (c *Context) GetV6Packet() []byte {
 	c.t.Helper()
 
-	ctx, _ := context.WithTimeout(context.Background(), 2*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
 	p, ok := c.linkEP.ReadContext(ctx)
 	if !ok {
 		c.t.Fatalf("Packet wasn't written out")
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index b3fe557ca..fd818243f 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -358,7 +358,8 @@ func (c *testContext) createEndpointForFlow(flow testFlow) {
 func (c *testContext) getPacketAndVerify(flow testFlow, checkers ...checker.NetworkChecker) []byte {
 	c.t.Helper()
 
-	ctx, _ := context.WithTimeout(context.Background(), 2*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
 	p, ok := c.linkEP.ReadContext(ctx)
 	if !ok {
 		c.t.Fatalf("Packet wasn't written out")
@@ -1563,7 +1564,8 @@ func TestV4UnknownDestination(t *testing.T) {
 			}
 			c.injectPacket(tc.flow, payload)
 			if !tc.icmpRequired {
-				ctx, _ := context.WithTimeout(context.Background(), time.Second)
+				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+				defer cancel()
 				if p, ok := c.linkEP.ReadContext(ctx); ok {
 					t.Fatalf("unexpected packet received: %+v", p)
 				}
@@ -1571,7 +1573,8 @@ func TestV4UnknownDestination(t *testing.T) {
 			}
 
 			// ICMP required.
-			ctx, _ := context.WithTimeout(context.Background(), time.Second)
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			defer cancel()
 			p, ok := c.linkEP.ReadContext(ctx)
 			if !ok {
 				t.Fatalf("packet wasn't written out")
@@ -1639,7 +1642,8 @@ func TestV6UnknownDestination(t *testing.T) {
 			}
 			c.injectPacket(tc.flow, payload)
 			if !tc.icmpRequired {
-				ctx, _ := context.WithTimeout(context.Background(), time.Second)
+				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+				defer cancel()
 				if p, ok := c.linkEP.ReadContext(ctx); ok {
 					t.Fatalf("unexpected packet received: %+v", p)
 				}
@@ -1647,7 +1651,8 @@ func TestV6UnknownDestination(t *testing.T) {
 			}
 
 			// ICMP required.
-			ctx, _ := context.WithTimeout(context.Background(), time.Second)
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			defer cancel()
 			p, ok := c.linkEP.ReadContext(ctx)
 			if !ok {
 				t.Fatalf("packet wasn't written out")
diff --git a/tools/nogo.json b/tools/nogo.json
index 09bda9212..85fac8604 100644
--- a/tools/nogo.json
+++ b/tools/nogo.json
@@ -9,14 +9,6 @@
       "/external/": "allowed: not subject to unsafe naming rules"
     }
   },
-  "lostcancel": {
-    "exclude_files": {
-      "/pkg/tcpip/network/arp/arp_test.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak",
-      "/pkg/tcpip/stack/ndp_test.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak",
-      "/pkg/tcpip/transport/udp/udp_test.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak",
-      "/pkg/tcpip/transport/tcp/testing/context/context.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak"
-    }
-  },
   "nilness": {
     "exclude_files": {
       "/com_github_vishvananda_netlink/route_linux.go": "allowed: false positive",
-- 
cgit v1.2.3


From b30130567d81157e39b692e0116f86015f0bcc71 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 8 Apr 2020 13:33:44 -0700
Subject: Enable SubprocessExited and SubprocessZombie for gVisor

Updates #164

PiperOrigin-RevId: 305544029
---
 pkg/sentry/fs/proc/task.go           | 28 ++++++++++++++++--
 pkg/sentry/fsimpl/proc/task.go       | 16 -----------
 pkg/sentry/fsimpl/proc/task_files.go | 56 ++++++++++++++++++++++++++++++++++--
 test/syscalls/linux/proc.cc          | 31 ++++++++------------
 4 files changed, 90 insertions(+), 41 deletions(-)

diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index d6c5dd2c1..4d42eac83 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -57,6 +57,16 @@ func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) {
 	return m, nil
 }
 
+func checkTaskState(t *kernel.Task) error {
+	switch t.ExitState() {
+	case kernel.TaskExitZombie:
+		return syserror.EACCES
+	case kernel.TaskExitDead:
+		return syserror.ESRCH
+	}
+	return nil
+}
+
 // taskDir represents a task-level directory.
 //
 // +stateify savable
@@ -254,11 +264,12 @@ func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
 }
 
 func (e *exe) executable() (file fsbridge.File, err error) {
+	if err := checkTaskState(e.t); err != nil {
+		return nil, err
+	}
 	e.t.WithMuLocked(func(t *kernel.Task) {
 		mm := t.MemoryManager()
 		if mm == nil {
-			// TODO(b/34851096): Check shouldn't allow Readlink once the
-			// Task is zombied.
 			err = syserror.EACCES
 			return
 		}
@@ -268,7 +279,7 @@ func (e *exe) executable() (file fsbridge.File, err error) {
 		// (with locks held).
 		file = mm.Executable()
 		if file == nil {
-			err = syserror.ENOENT
+			err = syserror.ESRCH
 		}
 	})
 	return
@@ -313,11 +324,22 @@ func newNamespaceSymlink(t *kernel.Task, msrc *fs.MountSource, name string) *fs.
 	return newProcInode(t, n, msrc, fs.Symlink, t)
 }
 
+// Readlink reads the symlink value.
+func (n *namespaceSymlink) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
+	if err := checkTaskState(n.t); err != nil {
+		return "", err
+	}
+	return n.Symlink.Readlink(ctx, inode)
+}
+
 // Getlink implements fs.InodeOperations.Getlink.
 func (n *namespaceSymlink) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) {
 	if !kernel.ContextCanTrace(ctx, n.t, false) {
 		return nil, syserror.EACCES
 	}
+	if err := checkTaskState(n.t); err != nil {
+		return nil, err
+	}
 
 	// Create a new regular file to fake the namespace file.
 	iops := fsutil.NewNoReadWriteFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0777), linux.PROC_SUPER_MAGIC)
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index aee2a4392..888afc0fd 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -214,22 +214,6 @@ func newIO(t *kernel.Task, isThreadGroup bool) *ioData {
 	return &ioData{ioUsage: t}
 }
 
-func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
-	// Namespace symlinks should contain the namespace name and the inode number
-	// for the namespace instance, so for example user:[123456]. We currently fake
-	// the inode number by sticking the symlink inode in its place.
-	target := fmt.Sprintf("%s:[%d]", ns, ino)
-
-	inode := &kernfs.StaticSymlink{}
-	// Note: credentials are overridden by taskOwnedInode.
-	inode.Init(task.Credentials(), ino, target)
-
-	taskInode := &taskOwnedInode{Inode: inode, owner: task}
-	d := &kernfs.Dentry{}
-	d.Init(taskInode)
-	return d
-}
-
 // newCgroupData creates inode that shows cgroup information.
 // From man 7 cgroups: "For each cgroup hierarchy of which the process is a
 // member, there is one entry containing three colon-separated fields:
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 88ea6a6d8..2c6f8bdfc 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -64,6 +64,16 @@ func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) {
 	return m, nil
 }
 
+func checkTaskState(t *kernel.Task) error {
+	switch t.ExitState() {
+	case kernel.TaskExitZombie:
+		return syserror.EACCES
+	case kernel.TaskExitDead:
+		return syserror.ESRCH
+	}
+	return nil
+}
+
 type bufferWriter struct {
 	buf *bytes.Buffer
 }
@@ -628,11 +638,13 @@ func (s *exeSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, er
 }
 
 func (s *exeSymlink) executable() (file fsbridge.File, err error) {
+	if err := checkTaskState(s.task); err != nil {
+		return nil, err
+	}
+
 	s.task.WithMuLocked(func(t *kernel.Task) {
 		mm := t.MemoryManager()
 		if mm == nil {
-			// TODO(b/34851096): Check shouldn't allow Readlink once the
-			// Task is zombied.
 			err = syserror.EACCES
 			return
 		}
@@ -642,7 +654,7 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) {
 		// (with locks held).
 		file = mm.Executable()
 		if file == nil {
-			err = syserror.ENOENT
+			err = syserror.ESRCH
 		}
 	})
 	return
@@ -709,3 +721,41 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf)
 	return nil
 }
+
+type namespaceSymlink struct {
+	kernfs.StaticSymlink
+
+	task *kernel.Task
+}
+
+func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
+	// Namespace symlinks should contain the namespace name and the inode number
+	// for the namespace instance, so for example user:[123456]. We currently fake
+	// the inode number by sticking the symlink inode in its place.
+	target := fmt.Sprintf("%s:[%d]", ns, ino)
+
+	inode := &namespaceSymlink{task: task}
+	// Note: credentials are overridden by taskOwnedInode.
+	inode.Init(task.Credentials(), ino, target)
+
+	taskInode := &taskOwnedInode{Inode: inode, owner: task}
+	d := &kernfs.Dentry{}
+	d.Init(taskInode)
+	return d
+}
+
+// Readlink implements Inode.
+func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) {
+	if err := checkTaskState(s.task); err != nil {
+		return "", err
+	}
+	return s.StaticSymlink.Readlink(ctx)
+}
+
+// Getlink implements Inode.Getlink.
+func (s *namespaceSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+	if err := checkTaskState(s.task); err != nil {
+		return vfs.VirtualDentry{}, "", err
+	}
+	return s.StaticSymlink.Getlink(ctx)
+}
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 5a70f6c3b..da98e1f66 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1326,8 +1326,6 @@ TEST(ProcPidSymlink, SubprocessRunning) {
               SyscallSucceedsWithValue(sizeof(buf)));
 }
 
-// FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
-// on proc files.
 TEST(ProcPidSymlink, SubprocessZombied) {
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -1337,7 +1335,7 @@ TEST(ProcPidSymlink, SubprocessZombied) {
   int want = EACCES;
   if (!IsRunningOnGvisor()) {
     auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion());
-    if (version.major == 4 && version.minor > 3) {
+    if (version.major > 4 || (version.major == 4 && version.minor > 3)) {
       want = ENOENT;
     }
   }
@@ -1350,30 +1348,25 @@ TEST(ProcPidSymlink, SubprocessZombied) {
                 SyscallFailsWithErrno(want));
   }
 
-  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
-  // on proc files.
+  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between linux on proc
+  // files.
   //
   // ~4.3: Syscall fails with EACCES.
-  // 4.17 & gVisor: Syscall succeeds and returns 1.
+  // 4.17: Syscall succeeds and returns 1.
   //
-  // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)),
-  //            SyscallFailsWithErrno(EACCES));
+  if (!IsRunningOnGvisor()) {
+    return;
+  }
 
-  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
-  // on proc files.
-  //
-  // ~4.3: Syscall fails with EACCES.
-  // 4.17 & gVisor: Syscall succeeds and returns 1.
-  //
-  // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)),
-  //            SyscallFailsWithErrno(EACCES));
+  EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)),
+              SyscallFailsWithErrno(want));
+
+  EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)),
+              SyscallFailsWithErrno(want));
 }
 
 // Test whether /proc/PID/ symlinks can be read for an exited process.
 TEST(ProcPidSymlink, SubprocessExited) {
-  // FIXME(gvisor.dev/issue/164): These all succeed on gVisor.
-  SKIP_IF(IsRunningOnGvisor());
-
   char buf[1];
 
   EXPECT_THAT(ReadlinkWhileExited("exe", buf, sizeof(buf)),
-- 
cgit v1.2.3


From d8c0c1d9d5f30f4cfe8e0adedd264b10aff793f7 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 8 Apr 2020 13:39:02 -0700
Subject: Do not hold FileDescription references in VFS2 procfs inodes.

FileDescription references are side-effectual; for example, holding a reference
on the write end of a pipe prevents reads from the read end from returning EOF.

This change is consistent with Linux, but not VFS1; while VFS1 also has this
bug, it's less visible there since VFS1 procfs disables caching.

Updates #1195

PiperOrigin-RevId: 305545099
---
 pkg/sentry/fsimpl/proc/task_fds.go | 125 ++++++++++++++++++++-----------------
 1 file changed, 67 insertions(+), 58 deletions(-)

diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 9c8656b28..046265eca 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -30,34 +30,35 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-type fdDir struct {
-	inoGen InoGenerator
-	task   *kernel.Task
-
-	// When produceSymlinks is set, dirents produces for the FDs are reported
-	// as symlink. Otherwise, they are reported as regular files.
-	produceSymlink bool
-}
-
-func (i *fdDir) lookup(name string) (*vfs.FileDescription, kernel.FDFlags, error) {
-	fd, err := strconv.ParseUint(name, 10, 64)
-	if err != nil {
-		return nil, kernel.FDFlags{}, syserror.ENOENT
-	}
-
+func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) {
 	var (
 		file  *vfs.FileDescription
 		flags kernel.FDFlags
 	)
-	i.task.WithMuLocked(func(t *kernel.Task) {
-		if fdTable := t.FDTable(); fdTable != nil {
-			file, flags = fdTable.GetVFS2(int32(fd))
+	t.WithMuLocked(func(t *kernel.Task) {
+		if fdt := t.FDTable(); fdt != nil {
+			file, flags = fdt.GetVFS2(fd)
 		}
 	})
+	return file, flags
+}
+
+func taskFDExists(t *kernel.Task, fd int32) bool {
+	file, _ := getTaskFD(t, fd)
 	if file == nil {
-		return nil, kernel.FDFlags{}, syserror.ENOENT
+		return false
 	}
-	return file, flags, nil
+	file.DecRef()
+	return true
+}
+
+type fdDir struct {
+	inoGen InoGenerator
+	task   *kernel.Task
+
+	// When produceSymlinks is set, dirents produces for the FDs are reported
+	// as symlink. Otherwise, they are reported as regular files.
+	produceSymlink bool
 }
 
 // IterDirents implements kernfs.inodeDynamicLookup.
@@ -128,11 +129,15 @@ func newFDDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry {
 
 // Lookup implements kernfs.inodeDynamicLookup.
 func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
-	file, _, err := i.lookup(name)
+	fdInt, err := strconv.ParseInt(name, 10, 32)
 	if err != nil {
-		return nil, err
+		return nil, syserror.ENOENT
+	}
+	fd := int32(fdInt)
+	if !taskFDExists(i.task, fd) {
+		return nil, syserror.ENOENT
 	}
-	taskDentry := newFDSymlink(i.task.Credentials(), file, i.inoGen.NextIno())
+	taskDentry := newFDSymlink(i.task, fd, i.inoGen.NextIno())
 	return taskDentry.VFSDentry(), nil
 }
 
@@ -169,19 +174,22 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia
 //
 // +stateify savable
 type fdSymlink struct {
-	refs.AtomicRefCount
 	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
 	kernfs.InodeSymlink
 
-	file *vfs.FileDescription
+	task *kernel.Task
+	fd   int32
 }
 
 var _ kernfs.Inode = (*fdSymlink)(nil)
 
-func newFDSymlink(creds *auth.Credentials, file *vfs.FileDescription, ino uint64) *kernfs.Dentry {
-	file.IncRef()
-	inode := &fdSymlink{file: file}
-	inode.Init(creds, ino, linux.ModeSymlink|0777)
+func newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry {
+	inode := &fdSymlink{
+		task: task,
+		fd:   fd,
+	}
+	inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777)
 
 	d := &kernfs.Dentry{}
 	d.Init(inode)
@@ -189,29 +197,27 @@ func newFDSymlink(creds *auth.Credentials, file *vfs.FileDescription, ino uint64
 }
 
 func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
+	file, _ := getTaskFD(s.task, s.fd)
+	if file == nil {
+		return "", syserror.ENOENT
+	}
+	defer file.DecRef()
 	root := vfs.RootFromContext(ctx)
 	defer root.DecRef()
-
-	vfsObj := s.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem()
-	return vfsObj.PathnameWithDeleted(ctx, root, s.file.VirtualDentry())
+	return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry())
 }
 
 func (s *fdSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
-	vd := s.file.VirtualDentry()
+	file, _ := getTaskFD(s.task, s.fd)
+	if file == nil {
+		return vfs.VirtualDentry{}, "", syserror.ENOENT
+	}
+	defer file.DecRef()
+	vd := file.VirtualDentry()
 	vd.IncRef()
 	return vd, "", nil
 }
 
-func (s *fdSymlink) DecRef() {
-	s.AtomicRefCount.DecRefWithDestructor(func() {
-		s.Destroy()
-	})
-}
-
-func (s *fdSymlink) Destroy() {
-	s.file.DecRef()
-}
-
 // fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory.
 //
 // +stateify savable
@@ -244,12 +250,18 @@ func newFDInfoDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry {
 
 // Lookup implements kernfs.inodeDynamicLookup.
 func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
-	file, flags, err := i.lookup(name)
+	fdInt, err := strconv.ParseInt(name, 10, 32)
 	if err != nil {
-		return nil, err
+		return nil, syserror.ENOENT
+	}
+	fd := int32(fdInt)
+	if !taskFDExists(i.task, fd) {
+		return nil, syserror.ENOENT
+	}
+	data := &fdInfoData{
+		task: i.task,
+		fd:   fd,
 	}
-
-	data := &fdInfoData{file: file, flags: flags}
 	dentry := newTaskOwnedFile(i.task, i.inoGen.NextIno(), 0444, data)
 	return dentry.VFSDentry(), nil
 }
@@ -268,26 +280,23 @@ type fdInfoData struct {
 	kernfs.DynamicBytesFile
 	refs.AtomicRefCount
 
-	file  *vfs.FileDescription
-	flags kernel.FDFlags
+	task *kernel.Task
+	fd   int32
 }
 
 var _ dynamicInode = (*fdInfoData)(nil)
 
-func (d *fdInfoData) DecRef() {
-	d.AtomicRefCount.DecRefWithDestructor(d.destroy)
-}
-
-func (d *fdInfoData) destroy() {
-	d.file.DecRef()
-}
-
 // Generate implements vfs.DynamicBytesSource.Generate.
 func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	file, descriptorFlags := getTaskFD(d.task, d.fd)
+	if file == nil {
+		return syserror.ENOENT
+	}
+	defer file.DecRef()
 	// TODO(b/121266871): Include pos, locks, and other data. For now we only
 	// have flags.
 	// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt
-	flags := uint(d.file.StatusFlags()) | d.flags.ToLinuxFileFlags()
+	flags := uint(file.StatusFlags()) | descriptorFlags.ToLinuxFileFlags()
 	fmt.Fprintf(buf, "flags:\t0%o\n", flags)
 	return nil
 }
-- 
cgit v1.2.3


From 2907e6da5e9fc7eeda51644db7bec4d15691b384 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Wed, 8 Apr 2020 13:46:51 -0700
Subject: file test: Remove FIXME about FIFO. It is already tested in mknod
 test.

PiperOrigin-RevId: 305546584
---
 test/syscalls/linux/file_base.h | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h
index 6f80bc97c..25fdd7106 100644
--- a/test/syscalls/linux/file_base.h
+++ b/test/syscalls/linux/file_base.h
@@ -52,17 +52,6 @@ class FileTest : public ::testing::Test {
     test_file_fd_ = ASSERT_NO_ERRNO_AND_VALUE(
         Open(test_file_name_, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR));
 
-    // FIXME(edahlgren): enable when mknod syscall is supported.
-    // test_fifo_name_ = NewTempAbsPath();
-    // ASSERT_THAT(mknod(test_fifo_name_.c_str()), S_IFIFO|0644, 0,
-    //             SyscallSucceeds());
-    // ASSERT_THAT(test_fifo_[1] = open(test_fifo_name_.c_str(),
-    //                                             O_WRONLY),
-    //             SyscallSucceeds());
-    // ASSERT_THAT(test_fifo_[0] = open(test_fifo_name_.c_str(),
-    //                                             O_RDONLY),
-    //             SyscallSucceeds());
-
     ASSERT_THAT(pipe(test_pipe_), SyscallSucceeds());
     ASSERT_THAT(fcntl(test_pipe_[0], F_SETFL, O_NONBLOCK), SyscallSucceeds());
   }
@@ -96,18 +85,11 @@ class FileTest : public ::testing::Test {
     CloseFile();
     UnlinkFile();
     ClosePipes();
-
-    // FIXME(edahlgren): enable when mknod syscall is supported.
-    // close(test_fifo_[0]);
-    // close(test_fifo_[1]);
-    // unlink(test_fifo_name_.c_str());
   }
 
   std::string test_file_name_;
-  std::string test_fifo_name_;
   FileDescriptor test_file_fd_;
 
-  int test_fifo_[2];
   int test_pipe_[2];
 };
 
-- 
cgit v1.2.3


From 357f136e42de81b033b65b7f39a4a555275a17e3 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 8 Apr 2020 14:38:09 -0700
Subject: Handle utimes correctly for shared gofer filesystems.

Determine system time from within the sentry rather than relying on the remote
filesystem to prevent inconsistencies.
Resolve related TODOs; the time discrepancies in question don't exist anymore.

PiperOrigin-RevId: 305557099
---
 pkg/sentry/fs/gofer/util.go   | 16 ++++++++++++++--
 test/syscalls/linux/utimes.cc | 18 +-----------------
 2 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go
index 2d8d3a2ea..47a6c69bf 100644
--- a/pkg/sentry/fs/gofer/util.go
+++ b/pkg/sentry/fs/gofer/util.go
@@ -20,17 +20,29 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 )
 
 func utimes(ctx context.Context, file contextFile, ts fs.TimeSpec) error {
 	if ts.ATimeOmit && ts.MTimeOmit {
 		return nil
 	}
+
+	// Replace requests to use the "system time" with the current time to
+	// ensure that timestamps remain consistent with the remote
+	// filesystem.
+	now := ktime.NowFromContext(ctx)
+	if ts.ATimeSetSystemTime {
+		ts.ATime = now
+	}
+	if ts.MTimeSetSystemTime {
+		ts.MTime = now
+	}
 	mask := p9.SetAttrMask{
 		ATime:              !ts.ATimeOmit,
-		ATimeNotSystemTime: !ts.ATimeSetSystemTime,
+		ATimeNotSystemTime: true,
 		MTime:              !ts.MTimeOmit,
-		MTimeNotSystemTime: !ts.MTimeSetSystemTime,
+		MTimeNotSystemTime: true,
 	}
 	as, ans := ts.ATime.Unix()
 	ms, mns := ts.MTime.Unix()
diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc
index 3a927a430..22e6d1a85 100644
--- a/test/syscalls/linux/utimes.cc
+++ b/test/syscalls/linux/utimes.cc
@@ -34,17 +34,10 @@ namespace testing {
 
 namespace {
 
-// TODO(b/36516566): utimes(nullptr) does not pick the "now" time in the
-// application's time domain, so when asserting that times are within a window,
-// we expand the window to allow for differences between the time domains.
-constexpr absl::Duration kClockSlack = absl::Milliseconds(100);
-
 // TimeBoxed runs fn, setting before and after to (coarse realtime) times
 // guaranteed* to come before and after fn started and completed, respectively.
 //
 // fn may be called more than once if the clock is adjusted.
-//
-// * See the comment on kClockSlack. gVisor breaks this guarantee.
 void TimeBoxed(absl::Time* before, absl::Time* after,
                std::function<void()> const& fn) {
   do {
@@ -69,12 +62,6 @@ void TimeBoxed(absl::Time* before, absl::Time* after,
       // which could lead to test failures, but that is very unlikely to happen.
       continue;
     }
-
-    if (IsRunningOnGvisor()) {
-      // See comment on kClockSlack.
-      *before -= kClockSlack;
-      *after += kClockSlack;
-    }
   } while (*after < *before);
 }
 
@@ -235,10 +222,7 @@ void TestUtimensat(int dirFd, std::string const& path) {
   EXPECT_GE(mtime3, before);
   EXPECT_LE(mtime3, after);
 
-  if (!IsRunningOnGvisor()) {
-    // FIXME(b/36516566): Gofers set atime and mtime to different "now" times.
-    EXPECT_EQ(atime3, mtime3);
-  }
+  EXPECT_EQ(atime3, mtime3);
 }
 
 TEST(UtimensatTest, OnAbsPath) {
-- 
cgit v1.2.3


From 981a587476e11e49cf49edb31705d8727b0db556 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 8 Apr 2020 17:32:57 -0700
Subject: Remove InodeOperations FIXMEs that will be obsoleted by VFS2.

PiperOrigin-RevId: 305588941
---
 pkg/sentry/fs/gofer/inode.go | 3 ---
 pkg/sentry/fs/host/inode.go  | 3 ---
 pkg/sentry/fs/inode.go       | 2 --
 3 files changed, 8 deletions(-)

diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index 1c934981b..811e8ea30 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -710,13 +710,10 @@ func init() {
 }
 
 // AddLink implements InodeOperations.AddLink, but is currently a noop.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
 func (*inodeOperations) AddLink() {}
 
 // DropLink implements InodeOperations.DropLink, but is currently a noop.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
 func (*inodeOperations) DropLink() {}
 
 // NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
 func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {}
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index 1da3c0a17..62f1246aa 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -397,15 +397,12 @@ func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) {
 }
 
 // AddLink implements fs.InodeOperations.AddLink.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
 func (i *inodeOperations) AddLink() {}
 
 // DropLink implements fs.InodeOperations.DropLink.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
 func (i *inodeOperations) DropLink() {}
 
 // NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
 func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {}
 
 // readdirAll returns all of the directory entries in i.
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index 55fb71c16..73f89abcc 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -397,8 +397,6 @@ func (i *Inode) Getlink(ctx context.Context) (*Dirent, error) {
 // AddLink calls i.InodeOperations.AddLink.
 func (i *Inode) AddLink() {
 	if i.overlay != nil {
-		// FIXME(b/63117438): Remove this from InodeOperations altogether.
-		//
 		// This interface is only used by ramfs to update metadata of
 		// children. These filesystems should _never_ have overlay
 		// Inodes cached as children. So explicitly disallow this
-- 
cgit v1.2.3


From 6dd5a1f3fe55daa8510b1ee5e3a59219aad92af6 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 8 Apr 2020 17:56:55 -0700
Subject: Clean up TODOs

PiperOrigin-RevId: 305592245
---
 pkg/sentry/fs/tmpfs/fs.go              |  3 ---
 pkg/sentry/fsimpl/kernfs/filesystem.go |  2 +-
 pkg/sentry/kernel/ptrace.go            |  1 -
 pkg/sentry/vfs/filesystem.go           |  2 +-
 pkg/sentry/vfs/mount.go                | 12 ++++++------
 pkg/sentry/vfs/mount_test.go           |  2 +-
 runsc/cmd/gofer.go                     |  5 ++---
 test/syscalls/linux/epoll.cc           |  4 ----
 test/syscalls/linux/file_base.h        |  1 +
 test/syscalls/linux/pwrite64.cc        |  9 +--------
 test/syscalls/linux/tuntap.cc          |  7 ++++---
 test/syscalls/linux/write.cc           | 10 ++--------
 tools/go_generics/defs.bzl             |  1 -
 13 files changed, 19 insertions(+), 40 deletions(-)

diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go
index d5be56c3f..bc117ca6a 100644
--- a/pkg/sentry/fs/tmpfs/fs.go
+++ b/pkg/sentry/fs/tmpfs/fs.go
@@ -44,9 +44,6 @@ const (
 	// lookup.
 	cacheRevalidate = "revalidate"
 
-	// TODO(edahlgren/mpratt): support a tmpfs size limit.
-	// size = "size"
-
 	// Permissions that exceed modeMask will be rejected.
 	modeMask = 01777
 
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 16a3c18ae..4433071aa 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -682,7 +682,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 	if err != nil {
 		return linux.Statfs{}, err
 	}
-	// TODO: actually implement statfs
+	// TODO(gvisor.dev/issue/1193): actually implement statfs.
 	return linux.Statfs{}, syserror.ENOSYS
 }
 
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 35ad97d5d..e23e796ef 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -184,7 +184,6 @@ func (t *Task) CanTrace(target *Task, attach bool) bool {
 	if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 {
 		return false
 	}
-	// TODO: Yama LSM
 	return true
 }
 
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index cd34782ff..bef1bd312 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -497,7 +497,7 @@ type FilesystemImpl interface {
 	// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
 	PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
 
-	// TODO: inotify_add_watch()
+	// TODO(gvisor.dev/issue/1479): inotify_add_watch()
 }
 
 // PrependPathAtVFSRootError is returned by implementations of
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 1b8ecc415..f06946103 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -233,9 +233,9 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
 		}
 		vd.dentry.mu.Lock()
 	}
-	// TODO: Linux requires that either both the mount point and the mount root
-	// are directories, or neither are, and returns ENOTDIR if this is not the
-	// case.
+	// TODO(gvisor.dev/issue/1035): Linux requires that either both the mount
+	// point and the mount root are directories, or neither are, and returns
+	// ENOTDIR if this is not the case.
 	mntns := vd.mount.ns
 	mnt := newMount(vfs, fs, root, mntns, opts)
 	vfs.mounts.seq.BeginWrite()
@@ -274,9 +274,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
 		}
 	}
 
-	// TODO(jamieliu): Linux special-cases umount of the caller's root, which
-	// we don't implement yet (we'll just fail it since the caller holds a
-	// reference on it).
+	// TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's
+	// root, which we don't implement yet (we'll just fail it since the caller
+	// holds a reference on it).
 
 	vfs.mounts.seq.BeginWrite()
 	if opts.Flags&linux.MNT_DETACH == 0 {
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index 3b933468d..3335e4057 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -55,7 +55,7 @@ func TestMountTableInsertLookup(t *testing.T) {
 	}
 }
 
-// TODO: concurrent lookup/insertion/removal
+// TODO(gvisor.dev/issue/1035): concurrent lookup/insertion/removal.
 
 // must be powers of 2
 var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8}
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 02e5af3d3..28f0d54b9 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -272,9 +272,8 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 
 	root := spec.Root.Path
 	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
-		// FIXME: runsc can't be re-executed without
-		// /proc, so we create a tmpfs mount, mount ./proc and ./root
-		// there, then move this mount to the root and after
+		// runsc can't be re-executed without /proc, so we create a tmpfs mount,
+		// mount ./proc and ./root there, then move this mount to the root and after
 		// setCapsAndCallSelf, runsc will chroot into /root.
 		//
 		// We need a directory to construct a new root and we know that
diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc
index a4f8f3cec..f57d38dc7 100644
--- a/test/syscalls/linux/epoll.cc
+++ b/test/syscalls/linux/epoll.cc
@@ -56,10 +56,6 @@ TEST(EpollTest, AllWritable) {
   struct epoll_event result[kFDsPerEpoll];
   ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1),
               SyscallSucceedsWithValue(kFDsPerEpoll));
-  // TODO(edahlgren): Why do some tests check epoll_event::data, and others
-  // don't? Does Linux actually guarantee that, in any of these test cases,
-  // epoll_wait will necessarily write out the epoll_events in the order that
-  // they were registered?
   for (int i = 0; i < kFDsPerEpoll; i++) {
     ASSERT_EQ(result[i].events, EPOLLOUT);
   }
diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h
index 25fdd7106..fb418e052 100644
--- a/test/syscalls/linux/file_base.h
+++ b/test/syscalls/linux/file_base.h
@@ -87,6 +87,7 @@ class FileTest : public ::testing::Test {
     ClosePipes();
   }
 
+ protected:
   std::string test_file_name_;
   FileDescriptor test_file_fd_;
 
diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc
index b48fe540d..c2f72e010 100644
--- a/test/syscalls/linux/pwrite64.cc
+++ b/test/syscalls/linux/pwrite64.cc
@@ -27,14 +27,7 @@ namespace testing {
 
 namespace {
 
-// This test is currently very rudimentary.
-//
-// TODO(edahlgren):
-// * bad buffer states (EFAULT).
-// * bad fds (wrong permission, wrong type of file, EBADF).
-// * check offset is not incremented.
-// * check for EOF.
-// * writing to pipes, symlinks, special files.
+// TODO(gvisor.dev/issue/2370): This test is currently very rudimentary.
 class Pwrite64 : public ::testing::Test {
   void SetUp() override {
     name_ = NewTempAbsPath();
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
index 53ad2dda3..3a8ba37eb 100644
--- a/test/syscalls/linux/tuntap.cc
+++ b/test/syscalls/linux/tuntap.cc
@@ -242,7 +242,7 @@ TEST_F(TuntapTest, InvalidReadWrite) {
 TEST_F(TuntapTest, WriteToDownDevice) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
 
-  // FIXME: gVisor always creates enabled/up'd interfaces.
+  // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces.
   SKIP_IF(IsRunningOnGvisor());
 
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
@@ -280,10 +280,11 @@ PosixErrorOr<FileDescriptor> OpenAndAttachTap(
                                    &addr, sizeof(addr)));
 
   if (!IsRunningOnGvisor()) {
-    // FIXME: gVisor doesn't support setting MAC address on interfaces yet.
+    // FIXME(b/110961832): gVisor doesn't support setting MAC address on
+    // interfaces yet.
     RETURN_IF_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA)));
 
-    // FIXME: gVisor always creates enabled/up'd interfaces.
+    // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces.
     RETURN_IF_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP));
   }
 
diff --git a/test/syscalls/linux/write.cc b/test/syscalls/linux/write.cc
index 9b219cfd6..39b5b2f56 100644
--- a/test/syscalls/linux/write.cc
+++ b/test/syscalls/linux/write.cc
@@ -31,14 +31,8 @@ namespace gvisor {
 namespace testing {
 
 namespace {
-// This test is currently very rudimentary.
-//
-// TODO(edahlgren):
-// * bad buffer states (EFAULT).
-// * bad fds (wrong permission, wrong type of file, EBADF).
-// * check offset is incremented.
-// * check for EOF.
-// * writing to pipes, symlinks, special files.
+
+// TODO(gvisor.dev/issue/2370): This test is currently very rudimentary.
 class WriteTest : public ::testing::Test {
  public:
   ssize_t WriteBytes(int fd, int bytes) {
diff --git a/tools/go_generics/defs.bzl b/tools/go_generics/defs.bzl
index c5be52ecd..8c9995fd4 100644
--- a/tools/go_generics/defs.bzl
+++ b/tools/go_generics/defs.bzl
@@ -105,7 +105,6 @@ def _go_template_instance_impl(ctx):
         executable = ctx.executable._tool,
     )
 
-    # TODO: How can we get the dependencies out?
     return struct(
         files = depset([output]),
     )
-- 
cgit v1.2.3


From 0f75f7273d8c4ace73d93b6b00f81d53a5cf76ea Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 8 Apr 2020 18:40:46 -0700
Subject: Don't call platform.AddressSpace.MapFile with no permissions.

PiperOrigin-RevId: 305598136
---
 pkg/sentry/mm/address_space.go | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index 0332fc71c..5c667117c 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -201,8 +201,10 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre
 		if pma.needCOW {
 			perms.Write = false
 		}
-		if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil {
-			return err
+		if perms.Any() { // MapFile precondition
+			if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil {
+				return err
+			}
 		}
 		pseg = pseg.NextSegment()
 	}
-- 
cgit v1.2.3


From 7297fd7238e17803e073fb5a5ef85edf992bdf6b Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 8 Apr 2020 19:40:15 -0700
Subject: Bump proc_test's kRSSTolerance to 10MB.

PiperOrigin-RevId: 305604557
---
 test/syscalls/linux/proc.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index da98e1f66..79a625ebc 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -994,7 +994,7 @@ constexpr uint64_t kMappingSize = 100 << 20;
 
 // Tolerance on RSS comparisons to account for background thread mappings,
 // reclaimed pages, newly faulted pages, etc.
-constexpr uint64_t kRSSTolerance = 5 << 20;
+constexpr uint64_t kRSSTolerance = 10 << 20;
 
 // Capture RSS before and after an anonymous mapping with passed prot.
 void MapPopulateRSS(int prot, uint64_t* before, uint64_t* after) {
-- 
cgit v1.2.3


From a10389e783aab5f530641394ef44c8a1dede9372 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 8 Apr 2020 23:02:09 -0700
Subject: splice: cap splice calls to MAX_RW_COUNT

The Linux does the same.

Reported-by: syzbot+e81716e8956e92e9d56b@syzkaller.appspotmail.com
PiperOrigin-RevId: 305625439
---
 pkg/sentry/syscalls/linux/sys_splice.go |   4 ++
 test/syscalls/linux/BUILD               |   2 +
 test/syscalls/linux/sendfile_socket.cc  | 105 ++++++++++++++------------------
 3 files changed, 53 insertions(+), 58 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index fd642834b..fbc6cf15f 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -29,6 +29,10 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
 		return 0, syserror.EINVAL
 	}
 
+	if opts.Length > int64(kernel.MAX_RW_COUNT) {
+		opts.Length = int64(kernel.MAX_RW_COUNT)
+	}
+
 	var (
 		total int64
 		n     int64
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index d0c431234..ae3017608 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2026,6 +2026,8 @@ cc_binary(
         "//test/util:file_descriptor",
         "@com_google_absl//absl/strings",
         gtest,
+        ":ip_socket_test_util",
+        ":unix_domain_socket_test_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc
index e94672679..c101fe9d2 100644
--- a/test/syscalls/linux/sendfile_socket.cc
+++ b/test/syscalls/linux/sendfile_socket.cc
@@ -23,6 +23,7 @@
 
 #include "gtest/gtest.h"
 #include "absl/strings/string_view.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/temp_path.h"
@@ -35,61 +36,39 @@ namespace {
 
 class SendFileTest : public ::testing::TestWithParam<int> {
  protected:
-  PosixErrorOr<std::tuple<int, int>> Sockets() {
+  PosixErrorOr<std::unique_ptr<SocketPair>> Sockets(int type) {
     // Bind a server socket.
     int family = GetParam();
-    struct sockaddr server_addr = {};
     switch (family) {
       case AF_INET: {
-        struct sockaddr_in* server_addr_in =
-            reinterpret_cast<struct sockaddr_in*>(&server_addr);
-        server_addr_in->sin_family = family;
-        server_addr_in->sin_addr.s_addr = INADDR_ANY;
-        break;
+        if (type == SOCK_STREAM) {
+          return SocketPairKind{
+              "TCP", AF_INET, type, 0,
+              TCPAcceptBindSocketPairCreator(AF_INET, type, 0, false)}
+              .Create();
+        } else {
+          return SocketPairKind{
+              "UDP", AF_INET, type, 0,
+              UDPBidirectionalBindSocketPairCreator(AF_INET, type, 0, false)}
+              .Create();
+        }
       }
       case AF_UNIX: {
-        struct sockaddr_un* server_addr_un =
-            reinterpret_cast<struct sockaddr_un*>(&server_addr);
-        server_addr_un->sun_family = family;
-        server_addr_un->sun_path[0] = '\0';
-        break;
+        if (type == SOCK_STREAM) {
+          return SocketPairKind{
+              "UNIX", AF_UNIX, type, 0,
+              FilesystemAcceptBindSocketPairCreator(AF_UNIX, type, 0)}
+              .Create();
+        } else {
+          return SocketPairKind{
+              "UNIX", AF_UNIX, type, 0,
+              FilesystemBidirectionalBindSocketPairCreator(AF_UNIX, type, 0)}
+              .Create();
+        }
       }
       default:
         return PosixError(EINVAL);
     }
-    int server = socket(family, SOCK_STREAM, 0);
-    if (bind(server, &server_addr, sizeof(server_addr)) < 0) {
-      return PosixError(errno);
-    }
-    if (listen(server, 1) < 0) {
-      close(server);
-      return PosixError(errno);
-    }
-
-    // Fetch the address; both are anonymous.
-    socklen_t length = sizeof(server_addr);
-    if (getsockname(server, &server_addr, &length) < 0) {
-      close(server);
-      return PosixError(errno);
-    }
-
-    // Connect the client.
-    int client = socket(family, SOCK_STREAM, 0);
-    if (connect(client, &server_addr, length) < 0) {
-      close(server);
-      close(client);
-      return PosixError(errno);
-    }
-
-    // Accept on the server.
-    int server_client = accept(server, nullptr, 0);
-    if (server_client < 0) {
-      close(server);
-      close(client);
-      return PosixError(errno);
-    }
-    close(server);
-    return std::make_tuple(client, server_client);
   }
 };
 
@@ -106,9 +85,7 @@ TEST_P(SendFileTest, SendMultiple) {
   const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
 
   // Create sockets.
-  std::tuple<int, int> fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets());
-  const FileDescriptor server(std::get<0>(fds));
-  FileDescriptor client(std::get<1>(fds));  // non-const, reset is used.
+  auto socks = ASSERT_NO_ERRNO_AND_VALUE(Sockets(SOCK_STREAM));
 
   // Thread that reads data from socket and dumps to a file.
   ScopedThread th([&] {
@@ -118,7 +95,7 @@ TEST_P(SendFileTest, SendMultiple) {
     // Read until socket is closed.
     char buf[10240];
     for (int cnt = 0;; cnt++) {
-      int r = RetryEINTR(read)(server.get(), buf, sizeof(buf));
+      int r = RetryEINTR(read)(socks->first_fd(), buf, sizeof(buf));
       // We cannot afford to save on every read() call.
       if (cnt % 1000 == 0) {
         ASSERT_THAT(r, SyscallSucceeds());
@@ -152,7 +129,7 @@ TEST_P(SendFileTest, SendMultiple) {
               << ", remain=" << remain << std::endl;
 
     // Send data and verify that sendfile returns the correct value.
-    int res = sendfile(client.get(), inf.get(), nullptr, remain);
+    int res = sendfile(socks->second_fd(), inf.get(), nullptr, remain);
     // We cannot afford to save on every sendfile() call.
     if (cnt % 120 == 0) {
       MaybeSave();
@@ -169,7 +146,7 @@ TEST_P(SendFileTest, SendMultiple) {
   }
 
   // Close socket to stop thread.
-  client.reset();
+  close(socks->release_second_fd());
   th.Join();
 
   // Verify that the output file has the correct data.
@@ -183,9 +160,7 @@ TEST_P(SendFileTest, SendMultiple) {
 
 TEST_P(SendFileTest, Shutdown) {
   // Create a socket.
-  std::tuple<int, int> fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets());
-  const FileDescriptor client(std::get<0>(fds));
-  FileDescriptor server(std::get<1>(fds));  // non-const, reset below.
+  auto socks = ASSERT_NO_ERRNO_AND_VALUE(Sockets(SOCK_STREAM));
 
   // If this is a TCP socket, then turn off linger.
   if (GetParam() == AF_INET) {
@@ -193,7 +168,7 @@ TEST_P(SendFileTest, Shutdown) {
     sl.l_onoff = 1;
     sl.l_linger = 0;
     ASSERT_THAT(
-        setsockopt(server.get(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+        setsockopt(socks->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
         SyscallSucceeds());
   }
 
@@ -212,12 +187,12 @@ TEST_P(SendFileTest, Shutdown) {
   ScopedThread t([&]() {
     size_t done = 0;
     while (done < data.size()) {
-      int n = RetryEINTR(read)(server.get(), data.data(), data.size());
+      int n = RetryEINTR(read)(socks->first_fd(), data.data(), data.size());
       ASSERT_THAT(n, SyscallSucceeds());
       done += n;
     }
     // Close the server side socket.
-    server.reset();
+    close(socks->release_first_fd());
   });
 
   // Continuously stream from the file to the socket. Note we do not assert
@@ -225,7 +200,7 @@ TEST_P(SendFileTest, Shutdown) {
   // data is written. Eventually, we should get a connection reset error.
   while (1) {
     off_t offset = 0;  // Always read from the start.
-    int n = sendfile(client.get(), inf.get(), &offset, data.size());
+    int n = sendfile(socks->second_fd(), inf.get(), &offset, data.size());
     EXPECT_THAT(n, AnyOf(SyscallFailsWithErrno(ECONNRESET),
                          SyscallFailsWithErrno(EPIPE), SyscallSucceeds()));
     if (n <= 0) {
@@ -234,6 +209,20 @@ TEST_P(SendFileTest, Shutdown) {
   }
 }
 
+TEST_P(SendFileTest, SendpageFromEmptyFileToUDP) {
+  auto socks = ASSERT_NO_ERRNO_AND_VALUE(Sockets(SOCK_DGRAM));
+
+  TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+  // The value to the count argument has to be so that it is impossible to
+  // allocate a buffer of this size. In Linux, sendfile transfer at most
+  // 0x7ffff000 (MAX_RW_COUNT) bytes.
+  EXPECT_THAT(sendfile(socks->first_fd(), fd.get(), 0x0, 0x8000000000004),
+              SyscallSucceedsWithValue(0));
+}
+
 INSTANTIATE_TEST_SUITE_P(AddressFamily, SendFileTest,
                          ::testing::Values(AF_UNIX, AF_INET));
 
-- 
cgit v1.2.3


From 21e438d257861eadc1dafcee914e4a51cffd3852 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 8 Apr 2020 23:28:52 -0700
Subject: Dereference pointers in Layer's Stringer impl

Dereference any fields which are pointers before string formatting so that the
value pointed to ends up in the string representation.

Tested:
  Added TestLayerStringFormat to
  //third_party/gvisor/test/packetimpact/testbench:testbench_test
PiperOrigin-RevId: 305627821
---
 test/packetimpact/testbench/BUILD          |  1 +
 test/packetimpact/testbench/layers.go      |  2 +-
 test/packetimpact/testbench/layers_test.go | 95 ++++++++++++++++++++++++++++++
 3 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/test/packetimpact/testbench/BUILD b/test/packetimpact/testbench/BUILD
index 199823419..838a10ffe 100644
--- a/test/packetimpact/testbench/BUILD
+++ b/test/packetimpact/testbench/BUILD
@@ -36,4 +36,5 @@ go_test(
     size = "small",
     srcs = ["layers_test.go"],
     library = ":testbench",
+    deps = ["//pkg/tcpip"],
 )
diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go
index 4d6625941..093a46e23 100644
--- a/test/packetimpact/testbench/layers.go
+++ b/test/packetimpact/testbench/layers.go
@@ -118,7 +118,7 @@ func stringLayer(l Layer) string {
 		if v.IsNil() {
 			continue
 		}
-		ret = append(ret, fmt.Sprintf("%s:%v", t.Name, v))
+		ret = append(ret, fmt.Sprintf("%s:%v", t.Name, reflect.Indirect(v)))
 	}
 	return fmt.Sprintf("&%s{%s}", t, strings.Join(ret, " "))
 }
diff --git a/test/packetimpact/testbench/layers_test.go b/test/packetimpact/testbench/layers_test.go
index b39839625..8ffc26bf9 100644
--- a/test/packetimpact/testbench/layers_test.go
+++ b/test/packetimpact/testbench/layers_test.go
@@ -16,6 +16,8 @@ package testbench
 
 import "testing"
 
+import "gvisor.dev/gvisor/pkg/tcpip"
+
 func TestLayerMatch(t *testing.T) {
 	var nilPayload *Payload
 	noPayload := &Payload{}
@@ -47,3 +49,96 @@ func TestLayerMatch(t *testing.T) {
 		}
 	}
 }
+
+func TestLayerStringFormat(t *testing.T) {
+	for _, tt := range []struct {
+		name string
+		l    Layer
+		want string
+	}{
+		{
+			name: "TCP",
+			l: &TCP{
+				SrcPort:    Uint16(34785),
+				DstPort:    Uint16(47767),
+				SeqNum:     Uint32(3452155723),
+				AckNum:     Uint32(2596996163),
+				DataOffset: Uint8(5),
+				Flags:      Uint8(20),
+				WindowSize: Uint16(64240),
+				Checksum:   Uint16(0x2e2b),
+			},
+			want: "&testbench.TCP{" +
+				"SrcPort:34785 " +
+				"DstPort:47767 " +
+				"SeqNum:3452155723 " +
+				"AckNum:2596996163 " +
+				"DataOffset:5 " +
+				"Flags:20 " +
+				"WindowSize:64240 " +
+				"Checksum:11819" +
+				"}",
+		},
+		{
+			name: "UDP",
+			l: &UDP{
+				SrcPort: Uint16(34785),
+				DstPort: Uint16(47767),
+				Length:  Uint16(12),
+			},
+			want: "&testbench.UDP{" +
+				"SrcPort:34785 " +
+				"DstPort:47767 " +
+				"Length:12" +
+				"}",
+		},
+		{
+			name: "IPv4",
+			l: &IPv4{
+				IHL:            Uint8(5),
+				TOS:            Uint8(0),
+				TotalLength:    Uint16(44),
+				ID:             Uint16(0),
+				Flags:          Uint8(2),
+				FragmentOffset: Uint16(0),
+				TTL:            Uint8(64),
+				Protocol:       Uint8(6),
+				Checksum:       Uint16(0x2e2b),
+				SrcAddr:        Address(tcpip.Address([]byte{197, 34, 63, 10})),
+				DstAddr:        Address(tcpip.Address([]byte{197, 34, 63, 20})),
+			},
+			want: "&testbench.IPv4{" +
+				"IHL:5 " +
+				"TOS:0 " +
+				"TotalLength:44 " +
+				"ID:0 " +
+				"Flags:2 " +
+				"FragmentOffset:0 " +
+				"TTL:64 " +
+				"Protocol:6 " +
+				"Checksum:11819 " +
+				"SrcAddr:197.34.63.10 " +
+				"DstAddr:197.34.63.20" +
+				"}",
+		},
+		{
+			name: "Ether",
+			l: &Ether{
+				SrcAddr: LinkAddress(tcpip.LinkAddress([]byte{0x02, 0x42, 0xc5, 0x22, 0x3f, 0x0a})),
+				DstAddr: LinkAddress(tcpip.LinkAddress([]byte{0x02, 0x42, 0xc5, 0x22, 0x3f, 0x14})),
+				Type:    NetworkProtocolNumber(4),
+			},
+			want: "&testbench.Ether{" +
+				"SrcAddr:02:42:c5:22:3f:0a " +
+				"DstAddr:02:42:c5:22:3f:14 " +
+				"Type:4" +
+				"}",
+		},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := tt.l.String(); got != tt.want {
+				t.Errorf("%s.String() = %s, want: %s", tt.name, got, tt.want)
+			}
+		})
+	}
+}
-- 
cgit v1.2.3


From ab54d4f496dc59721066e295e162b3be903795d5 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Thu, 2 Apr 2020 01:04:58 -0400
Subject: remove nogo exemption for machine_arm64_unsafe.go

Minimize the use of unsafe.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/kvm/kvm_arm64.go            |   4 +-
 pkg/sentry/platform/kvm/machine_arm64_unsafe.go |  63 --------------
 pkg/sentry/platform/ring0/BUILD                 |   2 +
 pkg/sentry/platform/ring0/lib_arm64.go          |   7 ++
 pkg/sentry/platform/ring0/lib_arm64_unsafe.go   | 108 ++++++++++++++++++++++++
 tools/nogo.json                                 |   1 -
 6 files changed, 120 insertions(+), 65 deletions(-)
 create mode 100644 pkg/sentry/platform/ring0/lib_arm64_unsafe.go

diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go
index 79045651e..716198712 100644
--- a/pkg/sentry/platform/kvm/kvm_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_arm64.go
@@ -18,6 +18,8 @@ package kvm
 
 import (
 	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
 
 type kvmOneReg struct {
@@ -46,6 +48,6 @@ type userRegs struct {
 func updateGlobalOnce(fd int) error {
 	physicalInit()
 	err := updateSystemValues(int(fd))
-	updateVectorTable()
+	ring0.Init()
 	return err
 }
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index b531f2f85..3b35858ae 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -48,69 +48,6 @@ func (m *machine) initArchState() error {
 	return nil
 }
 
-func getPageWithReflect(p uintptr) []byte {
-	return (*(*[0xFFFFFF]byte)(unsafe.Pointer(p & ^uintptr(syscall.Getpagesize()-1))))[:syscall.Getpagesize()]
-}
-
-// Work around: move ring0.Vectors() into a specific address with 11-bits alignment.
-//
-// According to the design documentation of Arm64,
-// the start address of exception vector table should be 11-bits aligned.
-// Please see the code in linux kernel as reference: arch/arm64/kernel/entry.S
-// But, we can't align a function's start address to a specific address by using golang.
-// We have raised this question in golang community:
-// https://groups.google.com/forum/m/#!topic/golang-dev/RPj90l5x86I
-// This function will be removed when golang supports this feature.
-//
-// There are 2 jobs were implemented in this function:
-// 1, move the start address of exception vector table into the specific address.
-// 2, modify the offset of each instruction.
-func updateVectorTable() {
-	fromLocation := reflect.ValueOf(ring0.Vectors).Pointer()
-	offset := fromLocation & (1<<11 - 1)
-	if offset != 0 {
-		offset = 1<<11 - offset
-	}
-
-	toLocation := fromLocation + offset
-	page := getPageWithReflect(toLocation)
-	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil {
-		panic(err)
-	}
-
-	page = getPageWithReflect(toLocation + 4096)
-	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil {
-		panic(err)
-	}
-
-	// Move exception-vector-table into the specific address.
-	var entry *uint32
-	var entryFrom *uint32
-	for i := 1; i <= 0x800; i++ {
-		entry = (*uint32)(unsafe.Pointer(toLocation + 0x800 - uintptr(i)))
-		entryFrom = (*uint32)(unsafe.Pointer(fromLocation + 0x800 - uintptr(i)))
-		*entry = *entryFrom
-	}
-
-	// The offset from the address of each unconditionally branch is changed.
-	// We should modify the offset of each instruction.
-	nums := []uint32{0x0, 0x80, 0x100, 0x180, 0x200, 0x280, 0x300, 0x380, 0x400, 0x480, 0x500, 0x580, 0x600, 0x680, 0x700, 0x780}
-	for _, num := range nums {
-		entry = (*uint32)(unsafe.Pointer(toLocation + uintptr(num)))
-		*entry = *entry - (uint32)(offset/4)
-	}
-
-	page = getPageWithReflect(toLocation)
-	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil {
-		panic(err)
-	}
-
-	page = getPageWithReflect(toLocation + 4096)
-	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil {
-		panic(err)
-	}
-}
-
 // initArchState initializes architecture-specific state.
 func (c *vCPU) initArchState() error {
 	var (
diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD
index 934b6fbcd..cdcad0fdc 100644
--- a/pkg/sentry/platform/ring0/BUILD
+++ b/pkg/sentry/platform/ring0/BUILD
@@ -71,12 +71,14 @@ go_library(
         "lib_amd64.go",
         "lib_amd64.s",
         "lib_arm64.go",
+        "lib_arm64_unsafe.go",
         "lib_arm64.s",
         "ring0.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/cpuid",
+        "//pkg/safecopy",
         "//pkg/sentry/platform/ring0/pagetables",
         "//pkg/usermem",
     ],
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index af075aae4..242b9305c 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -37,3 +37,10 @@ func SaveVRegs(*byte)
 
 // LoadVRegs loads V0-V31 registers.
 func LoadVRegs(*byte)
+
+// Init sets function pointers based on architectural features.
+//
+// This must be called prior to using ring0.
+func Init() {
+	rewriteVectors()
+}
diff --git a/pkg/sentry/platform/ring0/lib_arm64_unsafe.go b/pkg/sentry/platform/ring0/lib_arm64_unsafe.go
new file mode 100644
index 000000000..c05166fea
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_arm64_unsafe.go
@@ -0,0 +1,108 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package ring0
+
+import (
+	"reflect"
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/safecopy"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+	nopInstruction = 0xd503201f
+	instSize       = unsafe.Sizeof(uint32(0))
+	vectorsRawLen  = 0x800
+)
+
+func unsafeSlice(addr uintptr, length int) (slice []uint32) {
+	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
+	hdr.Data = addr
+	hdr.Len = length / int(instSize)
+	hdr.Cap = length / int(instSize)
+	return slice
+}
+
+// Work around: move ring0.Vectors() into a specific address with 11-bits alignment.
+//
+// According to the design documentation of Arm64,
+// the start address of exception vector table should be 11-bits aligned.
+// Please see the code in linux kernel as reference: arch/arm64/kernel/entry.S
+// But, we can't align a function's start address to a specific address by using golang.
+// We have raised this question in golang community:
+// https://groups.google.com/forum/m/#!topic/golang-dev/RPj90l5x86I
+// This function will be removed when golang supports this feature.
+//
+// There are 2 jobs were implemented in this function:
+// 1, move the start address of exception vector table into the specific address.
+// 2, modify the offset of each instruction.
+func rewriteVectors() {
+	vectorsBegin := reflect.ValueOf(Vectors).Pointer()
+
+	// The exception-vector-table is required to be 11-bits aligned.
+	// And the size is 0x800.
+	// Please see the documentation as reference:
+	// https://developer.arm.com/docs/100933/0100/aarch64-exception-vector-table
+	//
+	// But, golang does not allow to set a function's address to a specific value.
+	// So, for gvisor, I defined the size of exception-vector-table as 4K,
+	// filled the 2nd 2K part with NOP-s.
+	// So that, I can safely move the 1st 2K part into the address with 11-bits alignment.
+	//
+	// So, the prerequisite for this function to work correctly is:
+	// vectorsSafeLen >= 0x1000
+	// vectorsRawLen  = 0x800
+	vectorsSafeLen := int(safecopy.FindEndAddress(vectorsBegin) - vectorsBegin)
+	if vectorsSafeLen < 2*vectorsRawLen {
+		panic("Can't update vectors")
+	}
+
+	vectorsSafeTable := unsafeSlice(vectorsBegin, vectorsSafeLen) // Now a []uint32
+	vectorsRawLen32 := vectorsRawLen / int(instSize)
+
+	offset := vectorsBegin & (1<<11 - 1)
+	if offset != 0 {
+		offset = 1<<11 - offset
+	}
+
+	pageBegin := (vectorsBegin + offset) & ^uintptr(usermem.PageSize-1)
+
+	_, _, errno := syscall.Syscall(syscall.SYS_MPROTECT, uintptr(pageBegin), uintptr(usermem.PageSize), uintptr(syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC))
+	if errno != 0 {
+		panic(errno.Error())
+	}
+
+	offset = offset / instSize // By index, not bytes.
+	// Move exception-vector-table into the specific address, should uses memmove here.
+	for i := 1; i <= vectorsRawLen32; i++ {
+		vectorsSafeTable[int(offset)+vectorsRawLen32-i] = vectorsSafeTable[vectorsRawLen32-i]
+	}
+
+	// Adjust branch since instruction was moved forward.
+	for i := 0; i < vectorsRawLen32; i++ {
+		if vectorsSafeTable[int(offset)+i] != nopInstruction {
+			vectorsSafeTable[int(offset)+i] -= uint32(offset)
+		}
+	}
+
+	_, _, errno = syscall.Syscall(syscall.SYS_MPROTECT, uintptr(pageBegin), uintptr(usermem.PageSize), uintptr(syscall.PROT_READ|syscall.PROT_EXEC))
+	if errno != 0 {
+		panic(errno.Error())
+	}
+}
diff --git a/tools/nogo.json b/tools/nogo.json
index 83cb76b93..2e354e160 100644
--- a/tools/nogo.json
+++ b/tools/nogo.json
@@ -83,7 +83,6 @@
       "/pkg/gohacks/gohacks_unsafe.go": "allowed: special case",
       "/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go": "allowed: special case",
       "/pkg/sentry/platform/kvm/(bluepill|machine)_unsafe.go": "allowed: special case",
-      "/pkg/sentry/platform/kvm/machine_arm64_unsafe.go": "fix: gvisor.dev/issue/22464",
       "/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go": "allowed: special case",
       "/pkg/sentry/platform/safecopy/safecopy_unsafe.go": "allowed: special case",
       "/pkg/sentry/vfs/mount_unsafe.go": "allowed: special case"
-- 
cgit v1.2.3


From 1ebfdcc86c1b066a044a64e1f34b679f327a1f36 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Thu, 9 Apr 2020 01:11:20 -0700
Subject: kokoro: fix handling of apt-get errors

When a command is called as if expression, its error
code can be get only in this if block.

For example, the next script prints 0:

if ( false ); then
  true
fi
echo $?

PiperOrigin-RevId: 305638629
---
 scripts/common.sh | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/scripts/common.sh b/scripts/common.sh
index 735a383de..bc6ba71e8 100755
--- a/scripts/common.sh
+++ b/scripts/common.sh
@@ -89,12 +89,20 @@ function install_runsc() {
 # be correct, otherwise this may result in a loop that spins until time out.
 function apt_install() {
   while true; do
-    if (sudo apt-get update && sudo apt-get install -y "$@"); then
-      break
-    fi
-    result=$?
-    if [[ $result -ne 100 ]]; then
-      return $result
-    fi
+    sudo apt-get update &&
+      sudo apt-get install -y "$@" &&
+      true
+    result="${?}"
+    case $result in
+      0)
+        break
+        ;;
+      100)
+        # 100 is the error code that apt-get returns.
+        ;;
+      *)
+        exit $result
+        ;;
+    esac
   done
 }
-- 
cgit v1.2.3


From 7928aa345e334f2c68f8f03b71d8cabe79e8db7e Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Thu, 9 Apr 2020 09:30:39 -0700
Subject: Convert int and bool socket options to use GetSockOptInt and
 GetSockOptBool

PiperOrigin-RevId: 305699233
---
 pkg/sentry/socket/netstack/netstack.go    | 155 +++++-------
 pkg/sentry/socket/unix/transport/BUILD    |   1 +
 pkg/sentry/socket/unix/transport/unix.go  |  50 ++--
 pkg/tcpip/stack/transport_demuxer_test.go |  35 ++-
 pkg/tcpip/tcpip.go                        | 145 ++++++-----
 pkg/tcpip/transport/icmp/endpoint.go      |  47 ++--
 pkg/tcpip/transport/raw/endpoint.go       |  18 +-
 pkg/tcpip/transport/tcp/endpoint.go       | 387 +++++++++++++-----------------
 pkg/tcpip/transport/tcp/tcp_test.go       | 110 +++++----
 pkg/tcpip/transport/udp/endpoint.go       | 231 +++++++++---------
 pkg/tcpip/transport/udp/udp_test.go       |  60 ++---
 11 files changed, 583 insertions(+), 656 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 5d0085462..20e3fa0d2 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -300,7 +300,7 @@ type SocketOperations struct {
 // New creates a new endpoint socket.
 func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) {
 	if skType == linux.SOCK_STREAM {
-		if err := endpoint.SetSockOptInt(tcpip.DelayOption, 1); err != nil {
+		if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 	}
@@ -965,6 +965,13 @@ func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family in
 	return nil, syserr.ErrProtocolNotAvailable
 }
 
+func boolToInt32(v bool) int32 {
+	if v {
+		return 1
+	}
+	return 0
+}
+
 // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
 func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) {
 	// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
@@ -998,12 +1005,11 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.PasscredOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.PasscredOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.SO_SNDBUF:
 		if outLen < sizeOfInt32 {
@@ -1042,24 +1048,22 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.ReuseAddressOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.ReuseAddressOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.SO_REUSEPORT:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.ReusePortOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.ReusePortOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.SO_BINDTODEVICE:
 		var v tcpip.BindToDeviceOption
@@ -1089,24 +1093,22 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.BroadcastOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.BroadcastOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.SO_KEEPALIVE:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.KeepaliveEnabledOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.KeepaliveEnabledOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.SO_LINGER:
 		if outLen < linux.SizeOfLinger {
@@ -1156,47 +1158,41 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptInt(tcpip.DelayOption)
+		v, err := ep.GetSockOptBool(tcpip.DelayOption)
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		if v == 0 {
-			return int32(1), nil
-		}
-		return int32(0), nil
+		return boolToInt32(!v), nil
 
 	case linux.TCP_CORK:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.CorkOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.CorkOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.TCP_QUICKACK:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.QuickAckOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.QuickAckOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.TCP_MAXSEG:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.MaxSegOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptInt(tcpip.MaxSegOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
@@ -1328,11 +1324,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		var o int32
-		if v {
-			o = 1
-		}
-		return o, nil
+		return boolToInt32(v), nil
 
 	case linux.IPV6_PATHMTU:
 		t.Kernel().EmitUnimplementedEvent(t)
@@ -1342,8 +1334,8 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		if outLen == 0 {
 			return make([]byte, 0), nil
 		}
-		var v tcpip.IPv6TrafficClassOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
@@ -1365,12 +1357,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		var o int32
-		if v {
-			o = 1
-		}
-		return o, nil
+		return boolToInt32(v), nil
 
 	default:
 		emitUnimplementedEventIPv6(t, name)
@@ -1386,8 +1373,8 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.TTLOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptInt(tcpip.TTLOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
@@ -1403,8 +1390,8 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.MulticastTTLOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
@@ -1429,23 +1416,19 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.MulticastLoopOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.MulticastLoopOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		if v {
-			return int32(1), nil
-		}
-		return int32(0), nil
+		return boolToInt32(v), nil
 
 	case linux.IP_TOS:
 		// Length handling for parity with Linux.
 		if outLen == 0 {
 			return []byte(nil), nil
 		}
-		var v tcpip.IPv4TOSOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 		if outLen < sizeOfInt32 {
@@ -1462,11 +1445,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		var o int32
-		if v {
-			o = 1
-		}
-		return o, nil
+		return boolToInt32(v), nil
 
 	case linux.IP_PKTINFO:
 		if outLen < sizeOfInt32 {
@@ -1477,11 +1456,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		var o int32
-		if v {
-			o = 1
-		}
-		return o, nil
+		return boolToInt32(v), nil
 
 	default:
 		emitUnimplementedEventIP(t, name)
@@ -1592,7 +1567,7 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReuseAddressOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReuseAddressOption, v != 0))
 
 	case linux.SO_REUSEPORT:
 		if len(optVal) < sizeOfInt32 {
@@ -1600,7 +1575,7 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReusePortOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReusePortOption, v != 0))
 
 	case linux.SO_BINDTODEVICE:
 		n := bytes.IndexByte(optVal, 0)
@@ -1628,7 +1603,7 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BroadcastOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.BroadcastOption, v != 0))
 
 	case linux.SO_PASSCRED:
 		if len(optVal) < sizeOfInt32 {
@@ -1636,7 +1611,7 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.PasscredOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.PasscredOption, v != 0))
 
 	case linux.SO_KEEPALIVE:
 		if len(optVal) < sizeOfInt32 {
@@ -1644,7 +1619,7 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveEnabledOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.KeepaliveEnabledOption, v != 0))
 
 	case linux.SO_SNDTIMEO:
 		if len(optVal) < linux.SizeOfTimeval {
@@ -1716,11 +1691,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		var o int
-		if v == 0 {
-			o = 1
-		}
-		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.DelayOption, o))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.DelayOption, v == 0))
 
 	case linux.TCP_CORK:
 		if len(optVal) < sizeOfInt32 {
@@ -1728,7 +1699,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.CorkOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.CorkOption, v != 0))
 
 	case linux.TCP_QUICKACK:
 		if len(optVal) < sizeOfInt32 {
@@ -1736,7 +1707,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.QuickAckOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.QuickAckOption, v != 0))
 
 	case linux.TCP_MAXSEG:
 		if len(optVal) < sizeOfInt32 {
@@ -1744,7 +1715,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MaxSegOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v)))
 
 	case linux.TCP_KEEPIDLE:
 		if len(optVal) < sizeOfInt32 {
@@ -1855,7 +1826,7 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		if v == -1 {
 			v = 0
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv6TrafficClassOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v)))
 
 	case linux.IPV6_RECVTCLASS:
 		v, err := parseIntOrChar(optVal)
@@ -1940,7 +1911,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		if v < 0 || v > 255 {
 			return syserr.ErrInvalidArgument
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastTTLOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v)))
 
 	case linux.IP_ADD_MEMBERSHIP:
 		req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
@@ -1987,9 +1958,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 			return err
 		}
 
-		return syserr.TranslateNetstackError(ep.SetSockOpt(
-			tcpip.MulticastLoopOption(v != 0),
-		))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.MulticastLoopOption, v != 0))
 
 	case linux.MCAST_JOIN_GROUP:
 		// FIXME(b/124219304): Implement MCAST_JOIN_GROUP.
@@ -2008,7 +1977,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		} else if v < 1 || v > 255 {
 			return syserr.ErrInvalidArgument
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TTLOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TTLOption, int(v)))
 
 	case linux.IP_TOS:
 		if len(optVal) == 0 {
@@ -2018,7 +1987,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		if err != nil {
 			return err
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v)))
 
 	case linux.IP_RECVTOS:
 		v, err := parseIntOrChar(optVal)
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index 74bcd6300..c708b6030 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -30,6 +30,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/ilist",
+        "//pkg/log",
         "//pkg/refs",
         "//pkg/sync",
         "//pkg/syserr",
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 2ef654235..1f3880cc5 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -20,6 +20,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -838,24 +839,45 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess
 
 // SetSockOpt sets a socket option. Currently not supported.
 func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	switch v := opt.(type) {
-	case tcpip.PasscredOption:
-		e.setPasscred(v != 0)
-		return nil
-	}
 	return nil
 }
 
 func (e *baseEndpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	switch opt {
+	case tcpip.BroadcastOption:
+	case tcpip.PasscredOption:
+		e.setPasscred(v)
+	case tcpip.ReuseAddressOption:
+	default:
+		log.Warningf("Unsupported socket option: %d", opt)
+		return tcpip.ErrUnknownProtocolOption
+	}
 	return nil
 }
 
 func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	switch opt {
+	case tcpip.SendBufferSizeOption:
+	case tcpip.ReceiveBufferSizeOption:
+	default:
+		log.Warningf("Unsupported socket option: %d", opt)
+		return tcpip.ErrUnknownProtocolOption
+	}
 	return nil
 }
 
 func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	return false, tcpip.ErrUnknownProtocolOption
+	switch opt {
+	case tcpip.KeepaliveEnabledOption:
+		return false, nil
+
+	case tcpip.PasscredOption:
+		return e.Passcred(), nil
+
+	default:
+		log.Warningf("Unsupported socket option: %d", opt)
+		return false, tcpip.ErrUnknownProtocolOption
+	}
 }
 
 func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
@@ -914,29 +936,19 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 		return int(v), nil
 
 	default:
+		log.Warningf("Unsupported socket option: %d", opt)
 		return -1, tcpip.ErrUnknownProtocolOption
 	}
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch o := opt.(type) {
+	switch opt.(type) {
 	case tcpip.ErrorOption:
 		return nil
 
-	case *tcpip.PasscredOption:
-		if e.Passcred() {
-			*o = tcpip.PasscredOption(1)
-		} else {
-			*o = tcpip.PasscredOption(0)
-		}
-		return nil
-
-	case *tcpip.KeepaliveEnabledOption:
-		*o = 0
-		return nil
-
 	default:
+		log.Warningf("Unsupported socket option: %T", opt)
 		return tcpip.ErrUnknownProtocolOption
 	}
 }
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index c65b0c632..2474a7db3 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -206,7 +206,7 @@ func TestTransportDemuxerRegister(t *testing.T) {
 // the distribution of packets received matches expectations.
 func TestBindToDeviceDistribution(t *testing.T) {
 	type endpointSockopts struct {
-		reuse        int
+		reuse        bool
 		bindToDevice tcpip.NICID
 	}
 	for _, test := range []struct {
@@ -221,11 +221,11 @@ func TestBindToDeviceDistribution(t *testing.T) {
 			"BindPortReuse",
 			// 5 endpoints that all have reuse set.
 			[]endpointSockopts{
-				{reuse: 1, bindToDevice: 0},
-				{reuse: 1, bindToDevice: 0},
-				{reuse: 1, bindToDevice: 0},
-				{reuse: 1, bindToDevice: 0},
-				{reuse: 1, bindToDevice: 0},
+				{reuse: true, bindToDevice: 0},
+				{reuse: true, bindToDevice: 0},
+				{reuse: true, bindToDevice: 0},
+				{reuse: true, bindToDevice: 0},
+				{reuse: true, bindToDevice: 0},
 			},
 			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 get distributed evenly.
@@ -236,9 +236,9 @@ func TestBindToDeviceDistribution(t *testing.T) {
 			"BindToDevice",
 			// 3 endpoints with various bindings.
 			[]endpointSockopts{
-				{reuse: 0, bindToDevice: 1},
-				{reuse: 0, bindToDevice: 2},
-				{reuse: 0, bindToDevice: 3},
+				{reuse: false, bindToDevice: 1},
+				{reuse: false, bindToDevice: 2},
+				{reuse: false, bindToDevice: 3},
 			},
 			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 go only to the endpoint bound to dev0.
@@ -253,12 +253,12 @@ func TestBindToDeviceDistribution(t *testing.T) {
 			"ReuseAndBindToDevice",
 			// 6 endpoints with various bindings.
 			[]endpointSockopts{
-				{reuse: 1, bindToDevice: 1},
-				{reuse: 1, bindToDevice: 1},
-				{reuse: 1, bindToDevice: 2},
-				{reuse: 1, bindToDevice: 2},
-				{reuse: 1, bindToDevice: 2},
-				{reuse: 1, bindToDevice: 0},
+				{reuse: true, bindToDevice: 1},
+				{reuse: true, bindToDevice: 1},
+				{reuse: true, bindToDevice: 2},
+				{reuse: true, bindToDevice: 2},
+				{reuse: true, bindToDevice: 2},
+				{reuse: true, bindToDevice: 0},
 			},
 			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 get distributed among endpoints bound to
@@ -309,9 +309,8 @@ func TestBindToDeviceDistribution(t *testing.T) {
 						}(ep)
 
 						defer ep.Close()
-						reusePortOption := tcpip.ReusePortOption(endpoint.reuse)
-						if err := ep.SetSockOpt(reusePortOption); err != nil {
-							t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %s", reusePortOption, i, err)
+						if err := ep.SetSockOptBool(tcpip.ReusePortOption, endpoint.reuse); err != nil {
+							t.Fatalf("SetSockOptBool(ReusePortOption, %t) on endpoint %d failed: %s", endpoint.reuse, i, err)
 						}
 						bindToDeviceOption := tcpip.BindToDeviceOption(endpoint.bindToDevice)
 						if err := ep.SetSockOpt(bindToDeviceOption); err != nil {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 2ef3271f1..aec7126ff 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -520,34 +520,90 @@ type WriteOptions struct {
 type SockOptBool int
 
 const (
+	// BroadcastOption is used by SetSockOpt/GetSockOpt to specify whether
+	// datagram sockets are allowed to send packets to a broadcast address.
+	BroadcastOption SockOptBool = iota
+
+	// CorkOption is used by SetSockOpt/GetSockOpt to specify if data should be
+	// held until segments are full by the TCP transport protocol.
+	CorkOption
+
+	// DelayOption is used by SetSockOpt/GetSockOpt to specify if data
+	// should be sent out immediately by the transport protocol. For TCP,
+	// it determines if the Nagle algorithm is on or off.
+	DelayOption
+
+	// KeepaliveEnabledOption is used by SetSockOpt/GetSockOpt to specify whether
+	// TCP keepalive is enabled for this socket.
+	KeepaliveEnabledOption
+
+	// MulticastLoopOption is used by SetSockOpt/GetSockOpt to specify whether
+	// multicast packets sent over a non-loopback interface will be looped back.
+	MulticastLoopOption
+
+	// PasscredOption is used by SetSockOpt/GetSockOpt to specify whether
+	// SCM_CREDENTIALS socket control messages are enabled.
+	//
+	// Only supported on Unix sockets.
+	PasscredOption
+
+	// QuickAckOption is stubbed out in SetSockOpt/GetSockOpt.
+	QuickAckOption
+
 	// ReceiveTClassOption is used by SetSockOpt/GetSockOpt to specify if the
 	// IPV6_TCLASS ancillary message is passed with incoming packets.
-	ReceiveTClassOption SockOptBool = iota
+	ReceiveTClassOption
 
 	// ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
 	// ancillary message is passed with incoming packets.
 	ReceiveTOSOption
 
-	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
-	// socket is to be restricted to sending and receiving IPv6 packets only.
-	V6OnlyOption
-
 	// ReceiveIPPacketInfoOption is used by {G,S}etSockOptBool to specify
 	// if more inforamtion is provided with incoming packets such
 	// as interface index and address.
 	ReceiveIPPacketInfoOption
 
-	// TODO(b/146901447): convert existing bool socket options to be handled via
-	// Get/SetSockOptBool
+	// ReuseAddressOption is used by SetSockOpt/GetSockOpt to specify whether Bind()
+	// should allow reuse of local address.
+	ReuseAddressOption
+
+	// ReusePortOption is used by SetSockOpt/GetSockOpt to permit multiple sockets
+	// to be bound to an identical socket address.
+	ReusePortOption
+
+	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
+	// socket is to be restricted to sending and receiving IPv6 packets only.
+	V6OnlyOption
 )
 
 // SockOptInt represents socket options which values have the int type.
 type SockOptInt int
 
 const (
+	// KeepaliveCountOption is used by SetSockOpt/GetSockOpt to specify the number
+	// of un-ACKed TCP keepalives that will be sent before the connection is
+	// closed.
+	KeepaliveCountOption SockOptInt = iota
+
+	// IPv4TOSOption is used by SetSockOpt/GetSockOpt to specify TOS
+	// for all subsequent outgoing IPv4 packets from the endpoint.
+	IPv4TOSOption
+
+	// IPv6TrafficClassOption is used by SetSockOpt/GetSockOpt to specify TOS
+	// for all subsequent outgoing IPv6 packets from the endpoint.
+	IPv6TrafficClassOption
+
+	// MaxSegOption is used by SetSockOpt/GetSockOpt to set/get the current
+	// Maximum Segment Size(MSS) value as specified using the TCP_MAXSEG option.
+	MaxSegOption
+
+	// MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
+	// TTL value for multicast messages. The default is 1.
+	MulticastTTLOption
+
 	// ReceiveQueueSizeOption is used in GetSockOptInt to specify that the
 	// number of unread bytes in the input buffer should be returned.
-	ReceiveQueueSizeOption SockOptInt = iota
+	ReceiveQueueSizeOption
 
 	// SendBufferSizeOption is used by SetSockOptInt/GetSockOptInt to
 	// specify the send buffer size option.
@@ -561,44 +617,21 @@ const (
 	// number of unread bytes in the output buffer should be returned.
 	SendQueueSizeOption
 
-	// DelayOption is used by SetSockOpt/GetSockOpt to specify if data
-	// should be sent out immediately by the transport protocol. For TCP,
-	// it determines if the Nagle algorithm is on or off.
-	DelayOption
-
-	// TODO(b/137664753): convert all int socket options to be handled via
-	// GetSockOptInt.
+	// TTLOption is used by SetSockOpt/GetSockOpt to control the default TTL/hop
+	// limit value for unicast messages. The default is protocol specific.
+	//
+	// A zero value indicates the default.
+	TTLOption
 )
 
 // ErrorOption is used in GetSockOpt to specify that the last error reported by
 // the endpoint should be cleared and returned.
 type ErrorOption struct{}
 
-// CorkOption is used by SetSockOpt/GetSockOpt to specify if data should be
-// held until segments are full by the TCP transport protocol.
-type CorkOption int
-
-// ReuseAddressOption is used by SetSockOpt/GetSockOpt to specify whether Bind()
-// should allow reuse of local address.
-type ReuseAddressOption int
-
-// ReusePortOption is used by SetSockOpt/GetSockOpt to permit multiple sockets
-// to be bound to an identical socket address.
-type ReusePortOption int
-
 // BindToDeviceOption is used by SetSockOpt/GetSockOpt to specify that sockets
 // should bind only on a specific NIC.
 type BindToDeviceOption NICID
 
-// QuickAckOption is stubbed out in SetSockOpt/GetSockOpt.
-type QuickAckOption int
-
-// PasscredOption is used by SetSockOpt/GetSockOpt to specify whether
-// SCM_CREDENTIALS socket control messages are enabled.
-//
-// Only supported on Unix sockets.
-type PasscredOption int
-
 // TCPInfoOption is used by GetSockOpt to expose TCP statistics.
 //
 // TODO(b/64800844): Add and populate stat fields.
@@ -607,10 +640,6 @@ type TCPInfoOption struct {
 	RTTVar time.Duration
 }
 
-// KeepaliveEnabledOption is used by SetSockOpt/GetSockOpt to specify whether
-// TCP keepalive is enabled for this socket.
-type KeepaliveEnabledOption int
-
 // KeepaliveIdleOption is used by SetSockOpt/GetSockOpt to specify the time a
 // connection must remain idle before the first TCP keepalive packet is sent.
 // Once this time is reached, KeepaliveIntervalOption is used instead.
@@ -620,11 +649,6 @@ type KeepaliveIdleOption time.Duration
 // interval between sending TCP keepalive packets.
 type KeepaliveIntervalOption time.Duration
 
-// KeepaliveCountOption is used by SetSockOpt/GetSockOpt to specify the number
-// of un-ACKed TCP keepalives that will be sent before the connection is
-// closed.
-type KeepaliveCountOption int
-
 // TCPUserTimeoutOption is used by SetSockOpt/GetSockOpt to specify a user
 // specified timeout for a given TCP connection.
 // See: RFC5482 for details.
@@ -638,20 +662,9 @@ type CongestionControlOption string
 // control algorithms.
 type AvailableCongestionControlOption string
 
-// ModerateReceiveBufferOption allows the caller to enable/disable TCP receive
 // buffer moderation.
 type ModerateReceiveBufferOption bool
 
-// MaxSegOption is used by SetSockOpt/GetSockOpt to set/get the current
-// Maximum Segment Size(MSS) value as specified using the TCP_MAXSEG option.
-type MaxSegOption int
-
-// TTLOption is used by SetSockOpt/GetSockOpt to control the default TTL/hop
-// limit value for unicast messages. The default is protocol specific.
-//
-// A zero value indicates the default.
-type TTLOption uint8
-
 // TCPLingerTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
 // maximum duration for which a socket lingers in the TCP_FIN_WAIT_2 state
 // before being marked closed.
@@ -668,10 +681,6 @@ type TCPTimeWaitTimeoutOption time.Duration
 // for a handshake till the specified timeout until a segment with data arrives.
 type TCPDeferAcceptOption time.Duration
 
-// MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
-// TTL value for multicast messages. The default is 1.
-type MulticastTTLOption uint8
-
 // MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a
 // default interface for multicast.
 type MulticastInterfaceOption struct {
@@ -679,10 +688,6 @@ type MulticastInterfaceOption struct {
 	InterfaceAddr Address
 }
 
-// MulticastLoopOption is used by SetSockOpt/GetSockOpt to specify whether
-// multicast packets sent over a non-loopback interface will be looped back.
-type MulticastLoopOption bool
-
 // MembershipOption is used by SetSockOpt/GetSockOpt as an argument to
 // AddMembershipOption and RemoveMembershipOption.
 type MembershipOption struct {
@@ -705,22 +710,10 @@ type RemoveMembershipOption MembershipOption
 // TCP out-of-band data is delivered along with the normal in-band data.
 type OutOfBandInlineOption int
 
-// BroadcastOption is used by SetSockOpt/GetSockOpt to specify whether
-// datagram sockets are allowed to send packets to a broadcast address.
-type BroadcastOption int
-
 // DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
 // a default TTL.
 type DefaultTTLOption uint8
 
-// IPv4TOSOption is used by SetSockOpt/GetSockOpt to specify TOS
-// for all subsequent outgoing IPv4 packets from the endpoint.
-type IPv4TOSOption uint8
-
-// IPv6TrafficClassOption is used by SetSockOpt/GetSockOpt to specify TOS
-// for all subsequent outgoing IPv6 packets from the endpoint.
-type IPv6TrafficClassOption uint8
-
 // IPPacketInfo is the message struture for IP_PKTINFO.
 //
 // +stateify savable
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index b007302fb..3a133eef9 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -348,29 +348,37 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 
 // SetSockOpt sets a socket option.
 func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	switch o := opt.(type) {
-	case tcpip.TTLOption:
-		e.mu.Lock()
-		e.ttl = uint8(o)
-		e.mu.Unlock()
-	}
-
 	return nil
 }
 
 // SetSockOptBool sets a socket option. Currently not supported.
 func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
-	return nil
+	return tcpip.ErrUnknownProtocolOption
 }
 
 // SetSockOptInt sets a socket option. Currently not supported.
 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	switch opt {
+	case tcpip.TTLOption:
+		e.mu.Lock()
+		e.ttl = uint8(v)
+		e.mu.Unlock()
+
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
 	return nil
 }
 
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
 func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	return false, tcpip.ErrUnknownProtocolOption
+	switch opt {
+	case tcpip.KeepaliveEnabledOption:
+		return false, nil
+
+	default:
+		return false, tcpip.ErrUnknownProtocolOption
+	}
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
@@ -397,26 +405,23 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 		e.rcvMu.Unlock()
 		return v, nil
 
+	case tcpip.TTLOption:
+		e.rcvMu.Lock()
+		v := int(e.ttl)
+		e.rcvMu.Unlock()
+		return v, nil
+
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
 	}
-	return -1, tcpip.ErrUnknownProtocolOption
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch o := opt.(type) {
+	switch opt.(type) {
 	case tcpip.ErrorOption:
 		return nil
 
-	case *tcpip.KeepaliveEnabledOption:
-		*o = 0
-		return nil
-
-	case *tcpip.TTLOption:
-		e.rcvMu.Lock()
-		*o = tcpip.TTLOption(e.ttl)
-		e.rcvMu.Unlock()
-		return nil
-
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 337bc1c71..eee754a5a 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -533,14 +533,10 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch o := opt.(type) {
+	switch opt.(type) {
 	case tcpip.ErrorOption:
 		return nil
 
-	case *tcpip.KeepaliveEnabledOption:
-		*o = 0
-		return nil
-
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -548,7 +544,13 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
 func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	return false, tcpip.ErrUnknownProtocolOption
+	switch opt {
+	case tcpip.KeepaliveEnabledOption:
+		return false, nil
+
+	default:
+		return false, tcpip.ErrUnknownProtocolOption
+	}
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
@@ -576,9 +578,9 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 		e.rcvMu.Unlock()
 		return v, nil
 
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
 	}
-
-	return -1, tcpip.ErrUnknownProtocolOption
 }
 
 // HandlePacket implements stack.RawTransportEndpoint.HandlePacket.
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 9b123e968..a8d443f73 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -821,7 +821,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 
 	var de DelayEnabled
 	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
-		e.SetSockOptInt(tcpip.DelayOption, 1)
+		e.SetSockOptBool(tcpip.DelayOption, true)
 	}
 
 	var tcpLT tcpip.TCPLingerTimeoutOption
@@ -1409,10 +1409,60 @@ func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed boo
 
 // SetSockOptBool sets a socket option.
 func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
-	e.LockUser()
-	defer e.UnlockUser()
-
 	switch opt {
+
+	case tcpip.BroadcastOption:
+		e.LockUser()
+		e.broadcast = v
+		e.UnlockUser()
+
+	case tcpip.CorkOption:
+		e.LockUser()
+		if !v {
+			atomic.StoreUint32(&e.cork, 0)
+
+			// Handle the corked data.
+			e.sndWaker.Assert()
+		} else {
+			atomic.StoreUint32(&e.cork, 1)
+		}
+		e.UnlockUser()
+
+	case tcpip.DelayOption:
+		if v {
+			atomic.StoreUint32(&e.delay, 1)
+		} else {
+			atomic.StoreUint32(&e.delay, 0)
+
+			// Handle delayed data.
+			e.sndWaker.Assert()
+		}
+
+	case tcpip.KeepaliveEnabledOption:
+		e.keepalive.Lock()
+		e.keepalive.enabled = v
+		e.keepalive.Unlock()
+		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
+
+	case tcpip.QuickAckOption:
+		o := uint32(1)
+		if v {
+			o = 0
+		}
+		atomic.StoreUint32(&e.slowAck, o)
+
+	case tcpip.ReuseAddressOption:
+		e.LockUser()
+		e.reuseAddr = v
+		e.UnlockUser()
+		return nil
+
+	case tcpip.ReusePortOption:
+		e.LockUser()
+		e.reusePort = v
+		e.UnlockUser()
+		return nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -1424,7 +1474,11 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 			return tcpip.ErrInvalidEndpointState
 		}
 
+		e.LockUser()
 		e.v6only = v
+		e.UnlockUser()
+	default:
+		return tcpip.ErrUnknownProtocolOption
 	}
 
 	return nil
@@ -1432,7 +1486,40 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 
 // SetSockOptInt sets a socket option.
 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
+	const inetECNMask = 3
+
 	switch opt {
+	case tcpip.KeepaliveCountOption:
+		e.keepalive.Lock()
+		e.keepalive.count = int(v)
+		e.keepalive.Unlock()
+		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
+
+	case tcpip.IPv4TOSOption:
+		e.LockUser()
+		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
+		// ignore the bits for now.
+		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
+		e.UnlockUser()
+
+	case tcpip.IPv6TrafficClassOption:
+		e.LockUser()
+		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
+		// ignore the bits for now.
+		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
+		e.UnlockUser()
+
+	case tcpip.MaxSegOption:
+		userMSS := v
+		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
+			return tcpip.ErrInvalidOptionValue
+		}
+		e.LockUser()
+		e.userMSS = uint16(userMSS)
+		e.UnlockUser()
+		e.notifyProtocolGoroutine(notifyMSSChanged)
+
 	case tcpip.ReceiveBufferSizeOption:
 		// Make sure the receive buffer size is within the min and max
 		// allowed.
@@ -1483,7 +1570,6 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		e.rcvListMu.Unlock()
 		e.UnlockUser()
 		e.notifyProtocolGoroutine(mask)
-		return nil
 
 	case tcpip.SendBufferSizeOption:
 		// Make sure the send buffer size is within the min and max
@@ -1502,52 +1588,21 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		e.sndBufMu.Lock()
 		e.sndBufSize = size
 		e.sndBufMu.Unlock()
-		return nil
 
-	case tcpip.DelayOption:
-		if v == 0 {
-			atomic.StoreUint32(&e.delay, 0)
-
-			// Handle delayed data.
-			e.sndWaker.Assert()
-		} else {
-			atomic.StoreUint32(&e.delay, 1)
-		}
-		return nil
+	case tcpip.TTLOption:
+		e.LockUser()
+		e.ttl = uint8(v)
+		e.UnlockUser()
 
 	default:
-		return nil
+		return tcpip.ErrUnknownProtocolOption
 	}
+	return nil
 }
 
 // SetSockOpt sets a socket option.
 func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
-	const inetECNMask = 3
 	switch v := opt.(type) {
-	case tcpip.CorkOption:
-		if v == 0 {
-			atomic.StoreUint32(&e.cork, 0)
-
-			// Handle the corked data.
-			e.sndWaker.Assert()
-		} else {
-			atomic.StoreUint32(&e.cork, 1)
-		}
-		return nil
-
-	case tcpip.ReuseAddressOption:
-		e.LockUser()
-		e.reuseAddr = v != 0
-		e.UnlockUser()
-		return nil
-
-	case tcpip.ReusePortOption:
-		e.LockUser()
-		e.reusePort = v != 0
-		e.UnlockUser()
-		return nil
-
 	case tcpip.BindToDeviceOption:
 		id := tcpip.NICID(v)
 		if id != 0 && !e.stack.HasNIC(id) {
@@ -1556,72 +1611,26 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.LockUser()
 		e.bindToDevice = id
 		e.UnlockUser()
-		return nil
-
-	case tcpip.QuickAckOption:
-		if v == 0 {
-			atomic.StoreUint32(&e.slowAck, 1)
-		} else {
-			atomic.StoreUint32(&e.slowAck, 0)
-		}
-		return nil
-
-	case tcpip.MaxSegOption:
-		userMSS := v
-		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
-			return tcpip.ErrInvalidOptionValue
-		}
-		e.LockUser()
-		e.userMSS = uint16(userMSS)
-		e.UnlockUser()
-		e.notifyProtocolGoroutine(notifyMSSChanged)
-		return nil
-
-	case tcpip.TTLOption:
-		e.LockUser()
-		e.ttl = uint8(v)
-		e.UnlockUser()
-		return nil
-
-	case tcpip.KeepaliveEnabledOption:
-		e.keepalive.Lock()
-		e.keepalive.enabled = v != 0
-		e.keepalive.Unlock()
-		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
-		return nil
 
 	case tcpip.KeepaliveIdleOption:
 		e.keepalive.Lock()
 		e.keepalive.idle = time.Duration(v)
 		e.keepalive.Unlock()
 		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
-		return nil
 
 	case tcpip.KeepaliveIntervalOption:
 		e.keepalive.Lock()
 		e.keepalive.interval = time.Duration(v)
 		e.keepalive.Unlock()
 		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
-		return nil
 
-	case tcpip.KeepaliveCountOption:
-		e.keepalive.Lock()
-		e.keepalive.count = int(v)
-		e.keepalive.Unlock()
-		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
-		return nil
+	case tcpip.OutOfBandInlineOption:
+		// We don't currently support disabling this option.
 
 	case tcpip.TCPUserTimeoutOption:
 		e.LockUser()
 		e.userTimeout = time.Duration(v)
 		e.UnlockUser()
-		return nil
-
-	case tcpip.BroadcastOption:
-		e.LockUser()
-		e.broadcast = v != 0
-		e.UnlockUser()
-		return nil
 
 	case tcpip.CongestionControlOption:
 		// Query the available cc algorithms in the stack and
@@ -1652,22 +1661,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		// control algorithm is specified.
 		return tcpip.ErrNoSuchFile
 
-	case tcpip.IPv4TOSOption:
-		e.LockUser()
-		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
-		// ignore the bits for now.
-		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
-		e.UnlockUser()
-		return nil
-
-	case tcpip.IPv6TrafficClassOption:
-		e.LockUser()
-		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
-		// ignore the bits for now.
-		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
-		e.UnlockUser()
-		return nil
-
 	case tcpip.TCPLingerTimeoutOption:
 		e.LockUser()
 		if v < 0 {
@@ -1688,7 +1681,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		}
 		e.tcpLingerTimeout = time.Duration(v)
 		e.UnlockUser()
-		return nil
 
 	case tcpip.TCPDeferAcceptOption:
 		e.LockUser()
@@ -1697,11 +1689,11 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		}
 		e.deferAccept = time.Duration(v)
 		e.UnlockUser()
-		return nil
 
 	default:
 		return nil
 	}
+	return nil
 }
 
 // readyReceiveSize returns the number of bytes ready to be received.
@@ -1723,6 +1715,43 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
 func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	switch opt {
+	case tcpip.BroadcastOption:
+		e.LockUser()
+		v := e.broadcast
+		e.UnlockUser()
+		return v, nil
+
+	case tcpip.CorkOption:
+		return atomic.LoadUint32(&e.cork) != 0, nil
+
+	case tcpip.DelayOption:
+		return atomic.LoadUint32(&e.delay) != 0, nil
+
+	case tcpip.KeepaliveEnabledOption:
+		e.keepalive.Lock()
+		v := e.keepalive.enabled
+		e.keepalive.Unlock()
+
+		return v, nil
+
+	case tcpip.QuickAckOption:
+		v := atomic.LoadUint32(&e.slowAck) == 0
+		return v, nil
+
+	case tcpip.ReuseAddressOption:
+		e.LockUser()
+		v := e.reuseAddr
+		e.UnlockUser()
+
+		return v, nil
+
+	case tcpip.ReusePortOption:
+		e.LockUser()
+		v := e.reusePort
+		e.UnlockUser()
+
+		return v, nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -1734,14 +1763,41 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		e.UnlockUser()
 
 		return v, nil
-	}
 
-	return false, tcpip.ErrUnknownProtocolOption
+	default:
+		return false, tcpip.ErrUnknownProtocolOption
+	}
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
+	case tcpip.KeepaliveCountOption:
+		e.keepalive.Lock()
+		v := e.keepalive.count
+		e.keepalive.Unlock()
+		return v, nil
+
+	case tcpip.IPv4TOSOption:
+		e.LockUser()
+		v := int(e.sendTOS)
+		e.UnlockUser()
+		return v, nil
+
+	case tcpip.IPv6TrafficClassOption:
+		e.LockUser()
+		v := int(e.sendTOS)
+		e.UnlockUser()
+		return v, nil
+
+	case tcpip.MaxSegOption:
+		// This is just stubbed out. Linux never returns the user_mss
+		// value as it either returns the defaultMSS or returns the
+		// actual current MSS. Netstack just returns the defaultMSS
+		// always for now.
+		v := header.TCPDefaultMSS
+		return v, nil
+
 	case tcpip.ReceiveQueueSizeOption:
 		return e.readyReceiveSize()
 
@@ -1757,12 +1813,11 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 		e.rcvListMu.Unlock()
 		return v, nil
 
-	case tcpip.DelayOption:
-		var o int
-		if v := atomic.LoadUint32(&e.delay); v != 0 {
-			o = 1
-		}
-		return o, nil
+	case tcpip.TTLOption:
+		e.LockUser()
+		v := int(e.ttl)
+		e.UnlockUser()
+		return v, nil
 
 	default:
 		return -1, tcpip.ErrUnknownProtocolOption
@@ -1779,61 +1834,10 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.lastErrorMu.Unlock()
 		return err
 
-	case *tcpip.MaxSegOption:
-		// This is just stubbed out. Linux never returns the user_mss
-		// value as it either returns the defaultMSS or returns the
-		// actual current MSS. Netstack just returns the defaultMSS
-		// always for now.
-		*o = header.TCPDefaultMSS
-		return nil
-
-	case *tcpip.CorkOption:
-		*o = 0
-		if v := atomic.LoadUint32(&e.cork); v != 0 {
-			*o = 1
-		}
-		return nil
-
-	case *tcpip.ReuseAddressOption:
-		e.LockUser()
-		v := e.reuseAddr
-		e.UnlockUser()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
-
-	case *tcpip.ReusePortOption:
-		e.LockUser()
-		v := e.reusePort
-		e.UnlockUser()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
-
 	case *tcpip.BindToDeviceOption:
 		e.LockUser()
 		*o = tcpip.BindToDeviceOption(e.bindToDevice)
 		e.UnlockUser()
-		return nil
-
-	case *tcpip.QuickAckOption:
-		*o = 1
-		if v := atomic.LoadUint32(&e.slowAck); v != 0 {
-			*o = 0
-		}
-		return nil
-
-	case *tcpip.TTLOption:
-		e.LockUser()
-		*o = tcpip.TTLOption(e.ttl)
-		e.UnlockUser()
-		return nil
 
 	case *tcpip.TCPInfoOption:
 		*o = tcpip.TCPInfoOption{}
@@ -1846,92 +1850,45 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 			o.RTTVar = snd.rtt.rttvar
 			snd.rtt.Unlock()
 		}
-		return nil
-
-	case *tcpip.KeepaliveEnabledOption:
-		e.keepalive.Lock()
-		v := e.keepalive.enabled
-		e.keepalive.Unlock()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
 
 	case *tcpip.KeepaliveIdleOption:
 		e.keepalive.Lock()
 		*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
 		e.keepalive.Unlock()
-		return nil
 
 	case *tcpip.KeepaliveIntervalOption:
 		e.keepalive.Lock()
 		*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
 		e.keepalive.Unlock()
-		return nil
-
-	case *tcpip.KeepaliveCountOption:
-		e.keepalive.Lock()
-		*o = tcpip.KeepaliveCountOption(e.keepalive.count)
-		e.keepalive.Unlock()
-		return nil
 
 	case *tcpip.TCPUserTimeoutOption:
 		e.LockUser()
 		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
 		e.UnlockUser()
-		return nil
 
 	case *tcpip.OutOfBandInlineOption:
 		// We don't currently support disabling this option.
 		*o = 1
-		return nil
-
-	case *tcpip.BroadcastOption:
-		e.LockUser()
-		v := e.broadcast
-		e.UnlockUser()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
 
 	case *tcpip.CongestionControlOption:
 		e.LockUser()
 		*o = e.cc
 		e.UnlockUser()
-		return nil
-
-	case *tcpip.IPv4TOSOption:
-		e.LockUser()
-		*o = tcpip.IPv4TOSOption(e.sendTOS)
-		e.UnlockUser()
-		return nil
-
-	case *tcpip.IPv6TrafficClassOption:
-		e.LockUser()
-		*o = tcpip.IPv6TrafficClassOption(e.sendTOS)
-		e.UnlockUser()
-		return nil
 
 	case *tcpip.TCPLingerTimeoutOption:
 		e.LockUser()
 		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
 		e.UnlockUser()
-		return nil
 
 	case *tcpip.TCPDeferAcceptOption:
 		e.LockUser()
 		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
 		e.UnlockUser()
-		return nil
 
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
+	return nil
 }
 
 // checkV4MappedLocked determines the effective network protocol and converts
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index ce3df7478..32d0af6c4 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -728,7 +728,7 @@ func TestUserSuppliedMSSOnConnectV4(t *testing.T) {
 	const maxMSS = mtu - header.IPv4MinimumSize - header.TCPMinimumSize
 	tests := []struct {
 		name   string
-		setMSS uint16
+		setMSS int
 		expMSS uint16
 	}{
 		{
@@ -756,15 +756,14 @@ func TestUserSuppliedMSSOnConnectV4(t *testing.T) {
 			c.Create(-1)
 
 			// Set the MSS socket option.
-			opt := tcpip.MaxSegOption(test.setMSS)
-			if err := c.EP.SetSockOpt(opt); err != nil {
-				t.Fatalf("SetSockOpt(%#v) failed: %s", opt, err)
+			if err := c.EP.SetSockOptInt(tcpip.MaxSegOption, test.setMSS); err != nil {
+				t.Fatalf("SetSockOptInt(MaxSegOption, %d) failed: %s", test.setMSS, err)
 			}
 
 			// Get expected window size.
 			rcvBufSize, err := c.EP.GetSockOptInt(tcpip.ReceiveBufferSizeOption)
 			if err != nil {
-				t.Fatalf("GetSockOpt(%v) failed: %s", tcpip.ReceiveBufferSizeOption, err)
+				t.Fatalf("GetSockOptInt(ReceiveBufferSizeOption) failed: %s", err)
 			}
 			ws := tcp.FindWndScale(seqnum.Size(rcvBufSize))
 
@@ -818,15 +817,14 @@ func TestUserSuppliedMSSOnConnectV6(t *testing.T) {
 			c.CreateV6Endpoint(true)
 
 			// Set the MSS socket option.
-			opt := tcpip.MaxSegOption(test.setMSS)
-			if err := c.EP.SetSockOpt(opt); err != nil {
-				t.Fatalf("SetSockOpt(%#v) failed: %s", opt, err)
+			if err := c.EP.SetSockOptInt(tcpip.MaxSegOption, int(test.setMSS)); err != nil {
+				t.Fatalf("SetSockOptInt(MaxSegOption, %d) failed: %s", test.setMSS, err)
 			}
 
 			// Get expected window size.
 			rcvBufSize, err := c.EP.GetSockOptInt(tcpip.ReceiveBufferSizeOption)
 			if err != nil {
-				t.Fatalf("GetSockOpt(%v) failed: %s", tcpip.ReceiveBufferSizeOption, err)
+				t.Fatalf("GetSockOptInt(ReceiveBufferSizeOption) failed: %s", err)
 			}
 			ws := tcp.FindWndScale(seqnum.Size(rcvBufSize))
 
@@ -1077,17 +1075,17 @@ func TestTOSV4(t *testing.T) {
 	c.EP = ep
 
 	const tos = 0xC0
-	if err := c.EP.SetSockOpt(tcpip.IPv4TOSOption(tos)); err != nil {
-		t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.IPv4TOSOption(tos), err)
+	if err := c.EP.SetSockOptInt(tcpip.IPv4TOSOption, tos); err != nil {
+		t.Errorf("SetSockOptInt(IPv4TOSOption, %d) failed: %s", tos, err)
 	}
 
-	var v tcpip.IPv4TOSOption
-	if err := c.EP.GetSockOpt(&v); err != nil {
-		t.Errorf("GetSockopt failed: %s", err)
+	v, err := c.EP.GetSockOptInt(tcpip.IPv4TOSOption)
+	if err != nil {
+		t.Errorf("GetSockoptInt(IPv4TOSOption) failed: %s", err)
 	}
 
-	if want := tcpip.IPv4TOSOption(tos); v != want {
-		t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+	if v != tos {
+		t.Errorf("got GetSockOptInt(IPv4TOSOption) = %d, want = %d", v, tos)
 	}
 
 	testV4Connect(t, c, checker.TOS(tos, 0))
@@ -1125,17 +1123,17 @@ func TestTrafficClassV6(t *testing.T) {
 	c.CreateV6Endpoint(false)
 
 	const tos = 0xC0
-	if err := c.EP.SetSockOpt(tcpip.IPv6TrafficClassOption(tos)); err != nil {
-		t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.IPv6TrafficClassOption(tos), err)
+	if err := c.EP.SetSockOptInt(tcpip.IPv6TrafficClassOption, tos); err != nil {
+		t.Errorf("SetSockOpInt(IPv6TrafficClassOption, %d) failed: %s", tos, err)
 	}
 
-	var v tcpip.IPv6TrafficClassOption
-	if err := c.EP.GetSockOpt(&v); err != nil {
-		t.Fatalf("GetSockopt failed: %s", err)
+	v, err := c.EP.GetSockOptInt(tcpip.IPv6TrafficClassOption)
+	if err != nil {
+		t.Fatalf("GetSockoptInt(IPv6TrafficClassOption) failed: %s", err)
 	}
 
-	if want := tcpip.IPv6TrafficClassOption(tos); v != want {
-		t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+	if v != tos {
+		t.Errorf("got GetSockOptInt(IPv6TrafficClassOption) = %d, want = %d", v, tos)
 	}
 
 	// Test the connection request.
@@ -1711,7 +1709,7 @@ func TestNoWindowShrinking(t *testing.T) {
 	c.CreateConnected(789, 30000, 10)
 
 	if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 5); err != nil {
-		t.Fatalf("SetSockOpt failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 5) failed: %v", err)
 	}
 
 	we, ch := waiter.NewChannelEntry(nil)
@@ -1984,7 +1982,7 @@ func TestScaledWindowAccept(t *testing.T) {
 
 	// Set the window size greater than the maximum non-scaled window.
 	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 65535*3); err != nil {
-		t.Fatalf("SetSockOpt failed failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 65535*3) failed failed: %v", err)
 	}
 
 	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
@@ -2057,7 +2055,7 @@ func TestNonScaledWindowAccept(t *testing.T) {
 
 	// Set the window size greater than the maximum non-scaled window.
 	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 65535*3); err != nil {
-		t.Fatalf("SetSockOpt failed failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 65535*3) failed failed: %v", err)
 	}
 
 	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
@@ -2221,10 +2219,10 @@ func TestSegmentMerging(t *testing.T) {
 		{
 			"cork",
 			func(ep tcpip.Endpoint) {
-				ep.SetSockOpt(tcpip.CorkOption(1))
+				ep.SetSockOptBool(tcpip.CorkOption, true)
 			},
 			func(ep tcpip.Endpoint) {
-				ep.SetSockOpt(tcpip.CorkOption(0))
+				ep.SetSockOptBool(tcpip.CorkOption, false)
 			},
 		},
 	}
@@ -2316,7 +2314,7 @@ func TestDelay(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	c.EP.SetSockOptInt(tcpip.DelayOption, 1)
+	c.EP.SetSockOptBool(tcpip.DelayOption, true)
 
 	var allData []byte
 	for i, data := range [][]byte{{0}, {1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} {
@@ -2364,7 +2362,7 @@ func TestUndelay(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	c.EP.SetSockOptInt(tcpip.DelayOption, 1)
+	c.EP.SetSockOptBool(tcpip.DelayOption, true)
 
 	allData := [][]byte{{0}, {1, 2, 3}}
 	for i, data := range allData {
@@ -2397,7 +2395,7 @@ func TestUndelay(t *testing.T) {
 	// Check that we don't get the second packet yet.
 	c.CheckNoPacketTimeout("delayed second packet transmitted", 100*time.Millisecond)
 
-	c.EP.SetSockOptInt(tcpip.DelayOption, 0)
+	c.EP.SetSockOptBool(tcpip.DelayOption, false)
 
 	// Check that data is received.
 	second := c.GetPacket()
@@ -2434,8 +2432,8 @@ func TestMSSNotDelayed(t *testing.T) {
 		fn   func(tcpip.Endpoint)
 	}{
 		{"no-op", func(tcpip.Endpoint) {}},
-		{"delay", func(ep tcpip.Endpoint) { ep.SetSockOptInt(tcpip.DelayOption, 1) }},
-		{"cork", func(ep tcpip.Endpoint) { ep.SetSockOpt(tcpip.CorkOption(1)) }},
+		{"delay", func(ep tcpip.Endpoint) { ep.SetSockOptBool(tcpip.DelayOption, true) }},
+		{"cork", func(ep tcpip.Endpoint) { ep.SetSockOptBool(tcpip.CorkOption, true) }},
 	}
 
 	for _, test := range tests {
@@ -2576,12 +2574,12 @@ func TestSetTTL(t *testing.T) {
 				t.Fatalf("NewEndpoint failed: %v", err)
 			}
 
-			if err := c.EP.SetSockOpt(tcpip.TTLOption(wantTTL)); err != nil {
-				t.Fatalf("SetSockOpt failed: %v", err)
+			if err := c.EP.SetSockOptInt(tcpip.TTLOption, int(wantTTL)); err != nil {
+				t.Fatalf("SetSockOptInt(TTLOption, %d) failed: %s", wantTTL, err)
 			}
 
 			if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
-				t.Fatalf("Unexpected return value from Connect: %v", err)
+				t.Fatalf("Unexpected return value from Connect: %s", err)
 			}
 
 			// Receive SYN packet.
@@ -2621,7 +2619,7 @@ func TestPassiveSendMSSLessThanMTU(t *testing.T) {
 	// window scaling option.
 	const rcvBufferSize = 0x20000
 	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufferSize); err != nil {
-		t.Fatalf("SetSockOpt failed failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, %d) failed failed: %s", rcvBufferSize, err)
 	}
 
 	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
@@ -2765,7 +2763,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
 	const rcvBufferSize = 0x20000
 	const wndScale = 2
 	if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufferSize); err != nil {
-		t.Fatalf("SetSockOpt failed failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, %d) failed failed: %s", rcvBufferSize, err)
 	}
 
 	// Start connection attempt.
@@ -3882,26 +3880,26 @@ func TestMinMaxBufferSizes(t *testing.T) {
 
 	// Set values below the min.
 	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 199); err != nil {
-		t.Fatalf("GetSockOpt failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 199) failed: %s", err)
 	}
 
 	checkRecvBufferSize(t, ep, 200)
 
 	if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 299); err != nil {
-		t.Fatalf("GetSockOpt failed: %v", err)
+		t.Fatalf("SetSockOptInt(SendBufferSizeOption, 299) failed: %s", err)
 	}
 
 	checkSendBufferSize(t, ep, 300)
 
 	// Set values above the max.
 	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 1+tcp.DefaultReceiveBufferSize*20); err != nil {
-		t.Fatalf("GetSockOpt failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption) failed: %s", err)
 	}
 
 	checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*20)
 
 	if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 1+tcp.DefaultSendBufferSize*30); err != nil {
-		t.Fatalf("GetSockOpt failed: %v", err)
+		t.Fatalf("SetSockOptInt(SendBufferSizeOption) failed: %s", err)
 	}
 
 	checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*30)
@@ -4147,11 +4145,11 @@ func TestConnectAvoidsBoundPorts(t *testing.T) {
 												case "ipv4":
 												case "ipv6":
 													if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
-														t.Fatalf("SetSockOpt(V6OnlyOption(true)) failed: %v", err)
+														t.Fatalf("SetSockOptBool(V6OnlyOption(true)) failed: %s", err)
 													}
 												case "dual":
 													if err := ep.SetSockOptBool(tcpip.V6OnlyOption, false); err != nil {
-														t.Fatalf("SetSockOpt(V6OnlyOption(false)) failed: %v", err)
+														t.Fatalf("SetSockOptBool(V6OnlyOption(false)) failed: %s", err)
 													}
 												default:
 													t.Fatalf("unknown network: '%s'", network)
@@ -4477,8 +4475,8 @@ func TestKeepalive(t *testing.T) {
 	const keepAliveInterval = 10 * time.Millisecond
 	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
 	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
-	c.EP.SetSockOpt(tcpip.KeepaliveCountOption(5))
-	c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1))
+	c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5)
+	c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true)
 
 	// 5 unacked keepalives are sent. ACK each one, and check that the
 	// connection stays alive after 5.
@@ -5770,14 +5768,14 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 func TestDelayEnabled(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
-	checkDelayOption(t, c, false, 0) // Delay is disabled by default.
+	checkDelayOption(t, c, false, false) // Delay is disabled by default.
 
 	for _, v := range []struct {
 		delayEnabled    tcp.DelayEnabled
-		wantDelayOption int
+		wantDelayOption bool
 	}{
-		{delayEnabled: false, wantDelayOption: 0},
-		{delayEnabled: true, wantDelayOption: 1},
+		{delayEnabled: false, wantDelayOption: false},
+		{delayEnabled: true, wantDelayOption: true},
 	} {
 		c := context.New(t, defaultMTU)
 		defer c.Cleanup()
@@ -5788,7 +5786,7 @@ func TestDelayEnabled(t *testing.T) {
 	}
 }
 
-func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.DelayEnabled, wantDelayOption int) {
+func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.DelayEnabled, wantDelayOption bool) {
 	t.Helper()
 
 	var gotDelayEnabled tcp.DelayEnabled
@@ -5803,12 +5801,12 @@ func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.Del
 	if err != nil {
 		t.Fatalf("NewEndPoint(tcp, ipv4, new(waiter.Queue)) failed: %v", err)
 	}
-	gotDelayOption, err := ep.GetSockOptInt(tcpip.DelayOption)
+	gotDelayOption, err := ep.GetSockOptBool(tcpip.DelayOption)
 	if err != nil {
-		t.Fatalf("ep.GetSockOptInt(tcpip.DelayOption) failed: %v", err)
+		t.Fatalf("ep.GetSockOptBool(tcpip.DelayOption) failed: %s", err)
 	}
 	if gotDelayOption != wantDelayOption {
-		t.Errorf("ep.GetSockOptInt(tcpip.DelayOption) got: %d, want: %d", gotDelayOption, wantDelayOption)
+		t.Errorf("ep.GetSockOptBool(tcpip.DelayOption) got: %t, want: %t", gotDelayOption, wantDelayOption)
 	}
 }
 
@@ -6620,8 +6618,8 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 	const keepAliveInterval = 10 * time.Millisecond
 	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
 	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
-	c.EP.SetSockOpt(tcpip.KeepaliveCountOption(10))
-	c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1))
+	c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10)
+	c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true)
 
 	// Set userTimeout to be the duration for 3 keepalive probes.
 	userTimeout := 30 * time.Millisecond
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 120d3baa3..492cc1fcb 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -501,11 +501,20 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
 func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 	switch opt {
+	case tcpip.BroadcastOption:
+		e.mu.Lock()
+		e.broadcast = v
+		e.mu.Unlock()
+
+	case tcpip.MulticastLoopOption:
+		e.mu.Lock()
+		e.multicastLoop = v
+		e.mu.Unlock()
+
 	case tcpip.ReceiveTOSOption:
 		e.mu.Lock()
 		e.receiveTOS = v
 		e.mu.Unlock()
-		return nil
 
 	case tcpip.ReceiveTClassOption:
 		// We only support this option on v6 endpoints.
@@ -516,7 +525,18 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		e.mu.Lock()
 		e.receiveTClass = v
 		e.mu.Unlock()
-		return nil
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.Lock()
+		e.receiveIPPacketInfo = v
+		e.mu.Unlock()
+
+	case tcpip.ReuseAddressOption:
+
+	case tcpip.ReusePortOption:
+		e.mu.Lock()
+		e.reusePort = v
+		e.mu.Unlock()
 
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
@@ -533,13 +553,8 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		}
 
 		e.v6only = v
-		return nil
-
-	case tcpip.ReceiveIPPacketInfoOption:
-		e.mu.Lock()
-		e.receiveIPPacketInfo = v
-		e.mu.Unlock()
-		return nil
+	default:
+		return tcpip.ErrUnknownProtocolOption
 	}
 
 	return nil
@@ -547,22 +562,40 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
-	return nil
-}
+	switch opt {
+	case tcpip.MulticastTTLOption:
+		e.mu.Lock()
+		e.multicastTTL = uint8(v)
+		e.mu.Unlock()
 
-// SetSockOpt implements tcpip.Endpoint.SetSockOpt.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	switch v := opt.(type) {
 	case tcpip.TTLOption:
 		e.mu.Lock()
 		e.ttl = uint8(v)
 		e.mu.Unlock()
 
-	case tcpip.MulticastTTLOption:
+	case tcpip.IPv4TOSOption:
 		e.mu.Lock()
-		e.multicastTTL = uint8(v)
+		e.sendTOS = uint8(v)
+		e.mu.Unlock()
+
+	case tcpip.IPv6TrafficClassOption:
+		e.mu.Lock()
+		e.sendTOS = uint8(v)
 		e.mu.Unlock()
 
+	case tcpip.ReceiveBufferSizeOption:
+	case tcpip.SendBufferSizeOption:
+
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+
+	return nil
+}
+
+// SetSockOpt implements tcpip.Endpoint.SetSockOpt.
+func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+	switch v := opt.(type) {
 	case tcpip.MulticastInterfaceOption:
 		e.mu.Lock()
 		defer e.mu.Unlock()
@@ -686,16 +719,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.multicastMemberships[memToRemoveIndex] = e.multicastMemberships[len(e.multicastMemberships)-1]
 		e.multicastMemberships = e.multicastMemberships[:len(e.multicastMemberships)-1]
 
-	case tcpip.MulticastLoopOption:
-		e.mu.Lock()
-		e.multicastLoop = bool(v)
-		e.mu.Unlock()
-
-	case tcpip.ReusePortOption:
-		e.mu.Lock()
-		e.reusePort = v != 0
-		e.mu.Unlock()
-
 	case tcpip.BindToDeviceOption:
 		id := tcpip.NICID(v)
 		if id != 0 && !e.stack.HasNIC(id) {
@@ -704,26 +727,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Lock()
 		e.bindToDevice = id
 		e.mu.Unlock()
-		return nil
-
-	case tcpip.BroadcastOption:
-		e.mu.Lock()
-		e.broadcast = v != 0
-		e.mu.Unlock()
-
-		return nil
-
-	case tcpip.IPv4TOSOption:
-		e.mu.Lock()
-		e.sendTOS = uint8(v)
-		e.mu.Unlock()
-		return nil
-
-	case tcpip.IPv6TrafficClassOption:
-		e.mu.Lock()
-		e.sendTOS = uint8(v)
-		e.mu.Unlock()
-		return nil
 	}
 	return nil
 }
@@ -731,6 +734,21 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
 func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	switch opt {
+	case tcpip.BroadcastOption:
+		e.mu.RLock()
+		v := e.broadcast
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.KeepaliveEnabledOption:
+		return false, nil
+
+	case tcpip.MulticastLoopOption:
+		e.mu.RLock()
+		v := e.multicastLoop
+		e.mu.RUnlock()
+		return v, nil
+
 	case tcpip.ReceiveTOSOption:
 		e.mu.RLock()
 		v := e.receiveTOS
@@ -748,6 +766,22 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		e.mu.RUnlock()
 		return v, nil
 
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.RLock()
+		v := e.receiveIPPacketInfo
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.ReuseAddressOption:
+		return false, nil
+
+	case tcpip.ReusePortOption:
+		e.mu.RLock()
+		v := e.reusePort
+		e.mu.RUnlock()
+
+		return v, nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -760,19 +794,32 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 
 		return v, nil
 
-	case tcpip.ReceiveIPPacketInfoOption:
-		e.mu.RLock()
-		v := e.receiveIPPacketInfo
-		e.mu.RUnlock()
-		return v, nil
+	default:
+		return false, tcpip.ErrUnknownProtocolOption
 	}
-
-	return false, tcpip.ErrUnknownProtocolOption
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
+	case tcpip.IPv4TOSOption:
+		e.mu.RLock()
+		v := int(e.sendTOS)
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.IPv6TrafficClassOption:
+		e.mu.RLock()
+		v := int(e.sendTOS)
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.MulticastTTLOption:
+		e.mu.Lock()
+		v := int(e.multicastTTL)
+		e.mu.Unlock()
+		return v, nil
+
 	case tcpip.ReceiveQueueSizeOption:
 		v := 0
 		e.rcvMu.Lock()
@@ -794,29 +841,22 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 		v := e.rcvBufSizeMax
 		e.rcvMu.Unlock()
 		return v, nil
-	}
 
-	return -1, tcpip.ErrUnknownProtocolOption
+	case tcpip.TTLOption:
+		e.mu.Lock()
+		v := int(e.ttl)
+		e.mu.Unlock()
+		return v, nil
+
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
+	}
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	switch o := opt.(type) {
 	case tcpip.ErrorOption:
-		return nil
-
-	case *tcpip.TTLOption:
-		e.mu.Lock()
-		*o = tcpip.TTLOption(e.ttl)
-		e.mu.Unlock()
-		return nil
-
-	case *tcpip.MulticastTTLOption:
-		e.mu.Lock()
-		*o = tcpip.MulticastTTLOption(e.multicastTTL)
-		e.mu.Unlock()
-		return nil
-
 	case *tcpip.MulticastInterfaceOption:
 		e.mu.Lock()
 		*o = tcpip.MulticastInterfaceOption{
@@ -824,67 +864,16 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 			e.multicastAddr,
 		}
 		e.mu.Unlock()
-		return nil
-
-	case *tcpip.MulticastLoopOption:
-		e.mu.RLock()
-		v := e.multicastLoop
-		e.mu.RUnlock()
-
-		*o = tcpip.MulticastLoopOption(v)
-		return nil
-
-	case *tcpip.ReuseAddressOption:
-		*o = 0
-		return nil
-
-	case *tcpip.ReusePortOption:
-		e.mu.RLock()
-		v := e.reusePort
-		e.mu.RUnlock()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
 
 	case *tcpip.BindToDeviceOption:
 		e.mu.RLock()
 		*o = tcpip.BindToDeviceOption(e.bindToDevice)
 		e.mu.RUnlock()
-		return nil
-
-	case *tcpip.KeepaliveEnabledOption:
-		*o = 0
-		return nil
-
-	case *tcpip.BroadcastOption:
-		e.mu.RLock()
-		v := e.broadcast
-		e.mu.RUnlock()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
-
-	case *tcpip.IPv4TOSOption:
-		e.mu.RLock()
-		*o = tcpip.IPv4TOSOption(e.sendTOS)
-		e.mu.RUnlock()
-		return nil
-
-	case *tcpip.IPv6TrafficClassOption:
-		e.mu.RLock()
-		*o = tcpip.IPv6TrafficClassOption(e.sendTOS)
-		e.mu.RUnlock()
-		return nil
 
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
+	return nil
 }
 
 // sendUDP sends a UDP segment via the provided network endpoint and under the
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 0905726c1..b3ee688b7 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -343,11 +343,11 @@ func (c *testContext) createEndpointForFlow(flow testFlow) {
 	c.createEndpoint(flow.sockProto())
 	if flow.isV6Only() {
 		if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
-			c.t.Fatalf("SetSockOpt failed: %v", err)
+			c.t.Fatalf("SetSockOptBool failed: %s", err)
 		}
 	} else if flow.isBroadcast() {
-		if err := c.ep.SetSockOpt(tcpip.BroadcastOption(1)); err != nil {
-			c.t.Fatal("SetSockOpt failed:", err)
+		if err := c.ep.SetSockOptBool(tcpip.BroadcastOption, true); err != nil {
+			c.t.Fatalf("SetSockOptBool failed: %s", err)
 		}
 	}
 }
@@ -1271,8 +1271,8 @@ func TestTTL(t *testing.T) {
 			c.createEndpointForFlow(flow)
 
 			const multicastTTL = 42
-			if err := c.ep.SetSockOpt(tcpip.MulticastTTLOption(multicastTTL)); err != nil {
-				c.t.Fatalf("SetSockOpt failed: %v", err)
+			if err := c.ep.SetSockOptInt(tcpip.MulticastTTLOption, multicastTTL); err != nil {
+				c.t.Fatalf("SetSockOptInt failed: %s", err)
 			}
 
 			var wantTTL uint8
@@ -1311,8 +1311,8 @@ func TestSetTTL(t *testing.T) {
 
 					c.createEndpointForFlow(flow)
 
-					if err := c.ep.SetSockOpt(tcpip.TTLOption(wantTTL)); err != nil {
-						c.t.Fatalf("SetSockOpt failed: %v", err)
+					if err := c.ep.SetSockOptInt(tcpip.TTLOption, int(wantTTL)); err != nil {
+						c.t.Fatalf("SetSockOptInt(TTLOption, %d) failed: %s", wantTTL, err)
 					}
 
 					var p stack.NetworkProtocol
@@ -1346,25 +1346,26 @@ func TestSetTOS(t *testing.T) {
 			c.createEndpointForFlow(flow)
 
 			const tos = testTOS
-			var v tcpip.IPv4TOSOption
-			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
+			v, err := c.ep.GetSockOptInt(tcpip.IPv4TOSOption)
+			if err != nil {
+				c.t.Errorf("GetSockOptInt(IPv4TOSOption) failed: %s", err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, 0)
+				c.t.Errorf("got GetSockOpt(IPv4TOSOption) = 0x%x, want = 0x%x", v, 0)
 			}
 
-			if err := c.ep.SetSockOpt(tcpip.IPv4TOSOption(tos)); err != nil {
-				c.t.Errorf("SetSockOpt(%T, 0x%x) failed: %s", v, tcpip.IPv4TOSOption(tos), err)
+			if err := c.ep.SetSockOptInt(tcpip.IPv4TOSOption, tos); err != nil {
+				c.t.Errorf("SetSockOptInt(IPv4TOSOption, 0x%x) failed: %s", tos, err)
 			}
 
-			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
+			v, err = c.ep.GetSockOptInt(tcpip.IPv4TOSOption)
+			if err != nil {
+				c.t.Errorf("GetSockOptInt(IPv4TOSOption) failed: %s", err)
 			}
 
-			if want := tcpip.IPv4TOSOption(tos); v != want {
-				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, want)
+			if v != tos {
+				c.t.Errorf("got GetSockOptInt(IPv4TOSOption) = 0x%x, want = 0x%x", v, tos)
 			}
 
 			testWrite(c, flow, checker.TOS(tos, 0))
@@ -1381,25 +1382,26 @@ func TestSetTClass(t *testing.T) {
 			c.createEndpointForFlow(flow)
 
 			const tClass = testTOS
-			var v tcpip.IPv6TrafficClassOption
-			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
+			v, err := c.ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
+			if err != nil {
+				c.t.Errorf("GetSockOptInt(IPv6TrafficClassOption) failed: %s", err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, 0)
+				c.t.Errorf("got GetSockOptInt(IPv6TrafficClassOption) = 0x%x, want = 0x%x", v, 0)
 			}
 
-			if err := c.ep.SetSockOpt(tcpip.IPv6TrafficClassOption(tClass)); err != nil {
-				c.t.Errorf("SetSockOpt(%T, 0x%x) failed: %s", v, tcpip.IPv6TrafficClassOption(tClass), err)
+			if err := c.ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, tClass); err != nil {
+				c.t.Errorf("SetSockOptInt(IPv6TrafficClassOption, 0x%x) failed: %s", tClass, err)
 			}
 
-			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
+			v, err = c.ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
+			if err != nil {
+				c.t.Errorf("GetSockOptInt(IPv6TrafficClassOption) failed: %s", err)
 			}
 
-			if want := tcpip.IPv6TrafficClassOption(tClass); v != want {
-				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, want)
+			if v != tClass {
+				c.t.Errorf("got GetSockOptInt(IPv6TrafficClassOption) = 0x%x, want = 0x%x", v, tClass)
 			}
 
 			// The header getter for TClass is called TOS, so use that checker.
@@ -1430,7 +1432,7 @@ func TestReceiveTosTClass(t *testing.T) {
 				// Verify that setting and reading the option works.
 				v, err := c.ep.GetSockOptBool(option)
 				if err != nil {
-					c.t.Errorf("GetSockoptBool(%s) failed: %s", name, err)
+					c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err)
 				}
 				// Test for expected default value.
 				if v != false {
@@ -1444,7 +1446,7 @@ func TestReceiveTosTClass(t *testing.T) {
 
 				got, err := c.ep.GetSockOptBool(option)
 				if err != nil {
-					c.t.Errorf("GetSockoptBool(%s) failed: %s", name, err)
+					c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err)
 				}
 
 				if got != want {
-- 
cgit v1.2.3


From 8f68be74919751775cfb3d8162a51351822a6f5f Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 9 Apr 2020 11:02:05 -0700
Subject: Don't use REP string instructions in safecopy.memcpy.

PiperOrigin-RevId: 305718392
---
 pkg/safecopy/memcpy_amd64.s | 111 ++++++++++++++++----------------------------
 1 file changed, 40 insertions(+), 71 deletions(-)

diff --git a/pkg/safecopy/memcpy_amd64.s b/pkg/safecopy/memcpy_amd64.s
index 129691d68..00b46c18f 100644
--- a/pkg/safecopy/memcpy_amd64.s
+++ b/pkg/safecopy/memcpy_amd64.s
@@ -55,15 +55,9 @@ TEXT ·memcpy(SB), NOSPLIT, $0-36
 	MOVQ	from+8(FP), SI
 	MOVQ	n+16(FP), BX
 
-	// REP instructions have a high startup cost, so we handle small sizes
-	// with some straightline code. The REP MOVSQ instruction is really fast
-	// for large sizes. The cutover is approximately 2K.
 tail:
-	// move_129through256 or smaller work whether or not the source and the
-	// destination memory regions overlap because they load all data into
-	// registers before writing it back.  move_256through2048 on the other
-	// hand can be used only when the memory regions don't overlap or the copy
-	// direction is forward.
+	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not
+	// worth doing.
 	TESTQ	BX, BX
 	JEQ	move_0
 	CMPQ	BX, $2
@@ -83,31 +77,45 @@ tail:
 	JBE	move_65through128
 	CMPQ	BX, $256
 	JBE	move_129through256
-	// TODO: use branch table and BSR to make this just a single dispatch
 
-/*
- * forward copy loop
- */
-	CMPQ	BX, $2048
-	JLS	move_256through2048
-
-	// Check alignment
-	MOVL	SI, AX
-	ORL	DI, AX
-	TESTL	$7, AX
-	JEQ	fwdBy8
-
-	// Do 1 byte at a time
-	MOVQ	BX, CX
-	REP;	MOVSB
-	RET
-
-fwdBy8:
-	// Do 8 bytes at a time
-	MOVQ	BX, CX
-	SHRQ	$3, CX
-	ANDQ	$7, BX
-	REP;	MOVSQ
+move_257plus:
+	SUBQ	$256, BX
+	MOVOU	(SI), X0
+	MOVOU	X0, (DI)
+	MOVOU	16(SI), X1
+	MOVOU	X1, 16(DI)
+	MOVOU	32(SI), X2
+	MOVOU	X2, 32(DI)
+	MOVOU	48(SI), X3
+	MOVOU	X3, 48(DI)
+	MOVOU	64(SI), X4
+	MOVOU	X4, 64(DI)
+	MOVOU	80(SI), X5
+	MOVOU	X5, 80(DI)
+	MOVOU	96(SI), X6
+	MOVOU	X6, 96(DI)
+	MOVOU	112(SI), X7
+	MOVOU	X7, 112(DI)
+	MOVOU	128(SI), X8
+	MOVOU	X8, 128(DI)
+	MOVOU	144(SI), X9
+	MOVOU	X9, 144(DI)
+	MOVOU	160(SI), X10
+	MOVOU	X10, 160(DI)
+	MOVOU	176(SI), X11
+	MOVOU	X11, 176(DI)
+	MOVOU	192(SI), X12
+	MOVOU	X12, 192(DI)
+	MOVOU	208(SI), X13
+	MOVOU	X13, 208(DI)
+	MOVOU	224(SI), X14
+	MOVOU	X14, 224(DI)
+	MOVOU	240(SI), X15
+	MOVOU	X15, 240(DI)
+	CMPQ	BX, $256
+	LEAQ	256(SI), SI
+	LEAQ	256(DI), DI
+	JGE	move_257plus
 	JMP	tail
 
 move_1or2:
@@ -209,42 +217,3 @@ move_129through256:
 	MOVOU	-16(SI)(BX*1), X15
 	MOVOU	X15, -16(DI)(BX*1)
 	RET
-move_256through2048:
-	SUBQ	$256, BX
-	MOVOU	(SI), X0
-	MOVOU	X0, (DI)
-	MOVOU	16(SI), X1
-	MOVOU	X1, 16(DI)
-	MOVOU	32(SI), X2
-	MOVOU	X2, 32(DI)
-	MOVOU	48(SI), X3
-	MOVOU	X3, 48(DI)
-	MOVOU	64(SI), X4
-	MOVOU	X4, 64(DI)
-	MOVOU	80(SI), X5
-	MOVOU	X5, 80(DI)
-	MOVOU	96(SI), X6
-	MOVOU	X6, 96(DI)
-	MOVOU	112(SI), X7
-	MOVOU	X7, 112(DI)
-	MOVOU	128(SI), X8
-	MOVOU	X8, 128(DI)
-	MOVOU	144(SI), X9
-	MOVOU	X9, 144(DI)
-	MOVOU	160(SI), X10
-	MOVOU	X10, 160(DI)
-	MOVOU	176(SI), X11
-	MOVOU	X11, 176(DI)
-	MOVOU	192(SI), X12
-	MOVOU	X12, 192(DI)
-	MOVOU	208(SI), X13
-	MOVOU	X13, 208(DI)
-	MOVOU	224(SI), X14
-	MOVOU	X14, 224(DI)
-	MOVOU	240(SI), X15
-	MOVOU	X15, 240(DI)
-	CMPQ	BX, $256
-	LEAQ	256(SI), SI
-	LEAQ	256(DI), DI
-	JGE	move_256through2048
-	JMP	tail
-- 
cgit v1.2.3


From 2b4687a46bffc0999f1d5730397c13daba6ae4a9 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 9 Apr 2020 11:15:42 -0700
Subject: Handle os.LinkError in p9/handlers.go.

PiperOrigin-RevId: 305721329
---
 pkg/p9/handlers.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index a8b714cf5..1db5797dd 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -48,6 +48,8 @@ func ExtractErrno(err error) syscall.Errno {
 		return ExtractErrno(e.Err)
 	case *os.SyscallError:
 		return ExtractErrno(e.Err)
+	case *os.LinkError:
+		return ExtractErrno(e.Err)
 	}
 
 	// Default case.
-- 
cgit v1.2.3


From 64c2b490671852aaa024a4bb4757eef309fadf18 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Thu, 9 Apr 2020 13:33:18 -0700
Subject: Dedup netlink utility functions in tests.

PiperOrigin-RevId: 305749697
---
 test/syscalls/linux/BUILD                        |  3 +-
 test/syscalls/linux/socket_netlink_route.cc      | 75 ++++--------------------
 test/syscalls/linux/socket_netlink_route_util.cc |  7 +--
 test/syscalls/linux/socket_netlink_route_util.h  |  4 +-
 test/syscalls/linux/tuntap.cc                    | 19 +++---
 5 files changed, 24 insertions(+), 84 deletions(-)

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index ae3017608..96ca39583 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -138,7 +138,6 @@ cc_library(
     hdrs = ["socket_netlink_route_util.h"],
     deps = [
         ":socket_netlink_util",
-        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -2804,13 +2803,13 @@ cc_binary(
     srcs = ["socket_netlink_route.cc"],
     linkstatic = 1,
     deps = [
+        ":socket_netlink_route_util",
         ":socket_netlink_util",
         ":socket_test_util",
         "//test/util:capability_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/types:optional",
         gtest,
         "//test/util:test_main",
         "//test/util:test_util",
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index 2efb96bc3..fbe61c5a0 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -26,7 +26,7 @@
 
 #include "gtest/gtest.h"
 #include "absl/strings/str_format.h"
-#include "absl/types/optional.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
 #include "test/syscalls/linux/socket_netlink_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/capability_util.h"
@@ -118,24 +118,6 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
   // TODO(mpratt): Check ifinfomsg contents and following attrs.
 }
 
-PosixError DumpLinks(
-    const FileDescriptor& fd, uint32_t seq,
-    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
-  struct request {
-    struct nlmsghdr hdr;
-    struct ifinfomsg ifm;
-  };
-
-  struct request req = {};
-  req.hdr.nlmsg_len = sizeof(req);
-  req.hdr.nlmsg_type = RTM_GETLINK;
-  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
-  req.hdr.nlmsg_seq = seq;
-  req.ifm.ifi_family = AF_UNSPEC;
-
-  return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false);
-}
-
 TEST(NetlinkRouteTest, GetLinkDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
@@ -161,37 +143,6 @@ TEST(NetlinkRouteTest, GetLinkDump) {
   EXPECT_TRUE(loopbackFound);
 }
 
-struct Link {
-  int index;
-  std::string name;
-};
-
-PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
-  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
-
-  absl::optional<Link> link;
-  RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
-    if (hdr->nlmsg_type != RTM_NEWLINK ||
-        hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
-      return;
-    }
-    const struct ifinfomsg* msg =
-        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
-    if (msg->ifi_type == ARPHRD_LOOPBACK) {
-      const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
-      if (rta == nullptr) {
-        // Ignore links that do not have a name.
-        return;
-      }
-
-      link = Link();
-      link->index = msg->ifi_index;
-      link->name = std::string(reinterpret_cast<const char*>(RTA_DATA(rta)));
-    }
-  }));
-  return link;
-}
-
 // CheckLinkMsg checks a netlink message against an expected link.
 void CheckLinkMsg(const struct nlmsghdr* hdr, const Link& link) {
   ASSERT_THAT(hdr->nlmsg_type, Eq(RTM_NEWLINK));
@@ -209,9 +160,7 @@ void CheckLinkMsg(const struct nlmsghdr* hdr, const Link& link) {
 }
 
 TEST(NetlinkRouteTest, GetLinkByIndex) {
-  absl::optional<Link> loopback_link =
-      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
-  ASSERT_TRUE(loopback_link.has_value());
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
 
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
@@ -227,13 +176,13 @@ TEST(NetlinkRouteTest, GetLinkByIndex) {
   req.hdr.nlmsg_flags = NLM_F_REQUEST;
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
-  req.ifm.ifi_index = loopback_link->index;
+  req.ifm.ifi_index = loopback_link.index;
 
   bool found = false;
   ASSERT_NO_ERRNO(NetlinkRequestResponse(
       fd, &req, sizeof(req),
       [&](const struct nlmsghdr* hdr) {
-        CheckLinkMsg(hdr, *loopback_link);
+        CheckLinkMsg(hdr, loopback_link);
         found = true;
       },
       false));
@@ -241,9 +190,7 @@ TEST(NetlinkRouteTest, GetLinkByIndex) {
 }
 
 TEST(NetlinkRouteTest, GetLinkByName) {
-  absl::optional<Link> loopback_link =
-      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
-  ASSERT_TRUE(loopback_link.has_value());
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
 
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
@@ -262,8 +209,8 @@ TEST(NetlinkRouteTest, GetLinkByName) {
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
   req.rtattr.rta_type = IFLA_IFNAME;
-  req.rtattr.rta_len = RTA_LENGTH(loopback_link->name.size() + 1);
-  strncpy(req.ifname, loopback_link->name.c_str(), sizeof(req.ifname));
+  req.rtattr.rta_len = RTA_LENGTH(loopback_link.name.size() + 1);
+  strncpy(req.ifname, loopback_link.name.c_str(), sizeof(req.ifname));
   req.hdr.nlmsg_len =
       NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
 
@@ -271,7 +218,7 @@ TEST(NetlinkRouteTest, GetLinkByName) {
   ASSERT_NO_ERRNO(NetlinkRequestResponse(
       fd, &req, sizeof(req),
       [&](const struct nlmsghdr* hdr) {
-        CheckLinkMsg(hdr, *loopback_link);
+        CheckLinkMsg(hdr, loopback_link);
         found = true;
       },
       false));
@@ -523,9 +470,7 @@ TEST(NetlinkRouteTest, LookupAll) {
 TEST(NetlinkRouteTest, AddAddr) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
 
-  absl::optional<Link> loopback_link =
-      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
-  ASSERT_TRUE(loopback_link.has_value());
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
 
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
@@ -545,7 +490,7 @@ TEST(NetlinkRouteTest, AddAddr) {
   req.ifa.ifa_prefixlen = 24;
   req.ifa.ifa_flags = 0;
   req.ifa.ifa_scope = 0;
-  req.ifa.ifa_index = loopback_link->index;
+  req.ifa.ifa_index = loopback_link.index;
   req.rtattr.rta_type = IFA_LOCAL;
   req.rtattr.rta_len = RTA_LENGTH(sizeof(req.addr));
   inet_pton(AF_INET, "10.0.0.1", &req.addr);
diff --git a/test/syscalls/linux/socket_netlink_route_util.cc b/test/syscalls/linux/socket_netlink_route_util.cc
index 53eb3b6b2..bde1dbb4d 100644
--- a/test/syscalls/linux/socket_netlink_route_util.cc
+++ b/test/syscalls/linux/socket_netlink_route_util.cc
@@ -18,7 +18,6 @@
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 
-#include "absl/types/optional.h"
 #include "test/syscalls/linux/socket_netlink_util.h"
 
 namespace gvisor {
@@ -73,14 +72,14 @@ PosixErrorOr<std::vector<Link>> DumpLinks() {
   return links;
 }
 
-PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
+PosixErrorOr<Link> LoopbackLink() {
   ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
   for (const auto& link : links) {
     if (link.type == ARPHRD_LOOPBACK) {
-      return absl::optional<Link>(link);
+      return link;
     }
   }
-  return absl::optional<Link>();
+  return PosixError(ENOENT, "loopback link not found");
 }
 
 PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
diff --git a/test/syscalls/linux/socket_netlink_route_util.h b/test/syscalls/linux/socket_netlink_route_util.h
index 2c018e487..149c4a7f6 100644
--- a/test/syscalls/linux/socket_netlink_route_util.h
+++ b/test/syscalls/linux/socket_netlink_route_util.h
@@ -20,7 +20,6 @@
 
 #include <vector>
 
-#include "absl/types/optional.h"
 #include "test/syscalls/linux/socket_netlink_util.h"
 
 namespace gvisor {
@@ -37,7 +36,8 @@ PosixError DumpLinks(const FileDescriptor& fd, uint32_t seq,
 
 PosixErrorOr<std::vector<Link>> DumpLinks();
 
-PosixErrorOr<absl::optional<Link>> FindLoopbackLink();
+// Returns the loopback link on the system. ENOENT if not found.
+PosixErrorOr<Link> LoopbackLink();
 
 // LinkAddLocalAddr sets IFA_LOCAL attribute on the interface.
 PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
index 3a8ba37eb..6195b11e1 100644
--- a/test/syscalls/linux/tuntap.cc
+++ b/test/syscalls/linux/tuntap.cc
@@ -56,14 +56,14 @@ PosixErrorOr<std::set<std::string>> DumpLinkNames() {
   return names;
 }
 
-PosixErrorOr<absl::optional<Link>> GetLinkByName(const std::string& name) {
+PosixErrorOr<Link> GetLinkByName(const std::string& name) {
   ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
   for (const auto& link : links) {
     if (link.name == name) {
-      return absl::optional<Link>(link);
+      return link;
     }
   }
-  return absl::optional<Link>();
+  return PosixError(ENOENT, "interface not found");
 }
 
 struct pihdr {
@@ -268,24 +268,21 @@ PosixErrorOr<FileDescriptor> OpenAndAttachTap(
     return PosixError(errno);
   }
 
-  ASSIGN_OR_RETURN_ERRNO(absl::optional<Link> link, GetLinkByName(dev_name));
-  if (!link.has_value()) {
-    return PosixError(ENOENT, "no link");
-  }
+  ASSIGN_OR_RETURN_ERRNO(auto link, GetLinkByName(dev_name));
 
   // Interface setup.
   struct in_addr addr;
   inet_pton(AF_INET, dev_ipv4_addr.c_str(), &addr);
-  EXPECT_NO_ERRNO(LinkAddLocalAddr(link->index, AF_INET, /*prefixlen=*/24,
-                                   &addr, sizeof(addr)));
+  EXPECT_NO_ERRNO(LinkAddLocalAddr(link.index, AF_INET, /*prefixlen=*/24, &addr,
+                                   sizeof(addr)));
 
   if (!IsRunningOnGvisor()) {
     // FIXME(b/110961832): gVisor doesn't support setting MAC address on
     // interfaces yet.
-    RETURN_IF_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA)));
+    RETURN_IF_ERRNO(LinkSetMacAddr(link.index, kMacA, sizeof(kMacA)));
 
     // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces.
-    RETURN_IF_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP));
+    RETURN_IF_ERRNO(LinkChangeFlags(link.index, IFF_UP, IFF_UP));
   }
 
   return fd;
-- 
cgit v1.2.3


From 9a5e5ab2fa2623fa5c1acf5faf5527e7eba20c07 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Thu, 9 Apr 2020 13:41:03 -0700
Subject: Bump rule_go, bazel toolchain, and go toolchain versions.

PiperOrigin-RevId: 305751225
---
 WORKSPACE | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index 4d2b4a72f..bca63c0d9 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -4,10 +4,10 @@ load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
 # Load go bazel rules and gazelle.
 http_archive(
     name = "io_bazel_rules_go",
-    sha256 = "94f90feaa65c9cdc840cd21f67d967870b5943d684966a47569da8073e42063d",
+    sha256 = "db2b2d35293f405430f553bc7a865a8749a8ef60c30287e90d2b278c32771afe",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.22.0/rules_go-v0.22.0.tar.gz",
-        "https://github.com/bazelbuild/rules_go/releases/download/v0.22.0/rules_go-v0.22.0.tar.gz",
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.22.3/rules_go-v0.22.3.tar.gz",
+        "https://github.com/bazelbuild/rules_go/releases/download/v0.22.3/rules_go-v0.22.3.tar.gz",
     ],
 )
 
@@ -25,7 +25,7 @@ load("@io_bazel_rules_go//go:deps.bzl", "go_register_toolchains", "go_rules_depe
 go_rules_dependencies()
 
 go_register_toolchains(
-    go_version = "1.14",
+    go_version = "1.14.2",
     nogo = "@//:nogo",
 )
 
@@ -99,11 +99,11 @@ pip_install()
 # See releases at https://releases.bazel.build/bazel-toolchains.html
 http_archive(
     name = "bazel_toolchains",
-    sha256 = "b5a8039df7119d618402472f3adff8a1bd0ae9d5e253f53fcc4c47122e91a3d2",
-    strip_prefix = "bazel-toolchains-2.1.1",
+    sha256 = "239a1a673861eabf988e9804f45da3b94da28d1aff05c373b013193c315d9d9e",
+    strip_prefix = "bazel-toolchains-3.0.1",
     urls = [
-        "https://github.com/bazelbuild/bazel-toolchains/releases/download/2.1.1/bazel-toolchains-2.1.1.tar.gz",
-        "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/2.1.1.tar.gz",
+        "https://github.com/bazelbuild/bazel-toolchains/releases/download/3.0.1/bazel-toolchains-3.0.1.tar.gz",
+        "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/releases/download/3.0.1/bazel-toolchains-3.0.1.tar.gz",
     ],
 )
 
-- 
cgit v1.2.3


From 5b41f3364230f71e2d0dde0d8608810abe65f99f Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Thu, 9 Apr 2020 14:17:42 -0700
Subject: Remove "no-sandbox" tag.

It seems no longer necessary.

PiperOrigin-RevId: 305758572
---
 tools/bazeldefs/platforms.bzl | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/tools/bazeldefs/platforms.bzl b/tools/bazeldefs/platforms.bzl
index 92b0b5fc0..132040c20 100644
--- a/tools/bazeldefs/platforms.bzl
+++ b/tools/bazeldefs/platforms.bzl
@@ -2,15 +2,10 @@
 
 # Platform to associated tags.
 platforms = {
-    "ptrace": [
-        # TODO(b/120560048): Make the tests run without this tag.
-        "no-sandbox",
-    ],
+    "ptrace": [],
     "kvm": [
         "manual",
         "local",
-        # TODO(b/120560048): Make the tests run without this tag.
-        "no-sandbox",
     ],
 }
 
-- 
cgit v1.2.3


From ace90f823cf33d1c1180dcd0d2061c702270a0d6 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Thu, 9 Apr 2020 16:21:02 -0700
Subject: Make some functions in IfAddrHelper const.

PiperOrigin-RevId: 305782490
---
 test/syscalls/linux/ip_socket_test_util.cc         | 10 ++---
 test/syscalls/linux/ip_socket_test_util.h          |  6 +--
 .../socket_ipv4_udp_unbound_external_networking.cc | 49 ++++++++++------------
 .../socket_ipv4_udp_unbound_external_networking.h  |  6 +--
 4 files changed, 32 insertions(+), 39 deletions(-)

diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index bba022a41..d28dc0db6 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -177,17 +177,17 @@ SocketKind IPv6TCPUnboundSocket(int type) {
 PosixError IfAddrHelper::Load() {
   Release();
   RETURN_ERROR_IF_SYSCALL_FAIL(getifaddrs(&ifaddr_));
-  return PosixError(0);
+  return NoError();
 }
 
 void IfAddrHelper::Release() {
   if (ifaddr_) {
     freeifaddrs(ifaddr_);
+    ifaddr_ = nullptr;
   }
-  ifaddr_ = nullptr;
 }
 
-std::vector<std::string> IfAddrHelper::InterfaceList(int family) {
+std::vector<std::string> IfAddrHelper::InterfaceList(int family) const {
   std::vector<std::string> names;
   for (auto ifa = ifaddr_; ifa != NULL; ifa = ifa->ifa_next) {
     if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != family) {
@@ -198,7 +198,7 @@ std::vector<std::string> IfAddrHelper::InterfaceList(int family) {
   return names;
 }
 
-sockaddr* IfAddrHelper::GetAddr(int family, std::string name) {
+const sockaddr* IfAddrHelper::GetAddr(int family, std::string name) const {
   for (auto ifa = ifaddr_; ifa != NULL; ifa = ifa->ifa_next) {
     if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != family) {
       continue;
@@ -210,7 +210,7 @@ sockaddr* IfAddrHelper::GetAddr(int family, std::string name) {
   return nullptr;
 }
 
-PosixErrorOr<int> IfAddrHelper::GetIndex(std::string name) {
+PosixErrorOr<int> IfAddrHelper::GetIndex(std::string name) const {
   return InterfaceIndex(name);
 }
 
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 39fd6709d..9c3859fcd 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -110,10 +110,10 @@ class IfAddrHelper {
   PosixError Load();
   void Release();
 
-  std::vector<std::string> InterfaceList(int family);
+  std::vector<std::string> InterfaceList(int family) const;
 
-  struct sockaddr* GetAddr(int family, std::string name);
-  PosixErrorOr<int> GetIndex(std::string name);
+  const sockaddr* GetAddr(int family, std::string name) const;
+  PosixErrorOr<int> GetIndex(std::string name) const;
 
  private:
   struct ifaddrs* ifaddr_;
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
index 40e673625..d690d9564 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
@@ -45,37 +45,31 @@ void IPv4UDPUnboundExternalNetworkingSocketTest::SetUp() {
   got_if_infos_ = false;
 
   // Get interface list.
-  std::vector<std::string> if_names;
   ASSERT_NO_ERRNO(if_helper_.Load());
-  if_names = if_helper_.InterfaceList(AF_INET);
+  std::vector<std::string> if_names = if_helper_.InterfaceList(AF_INET);
   if (if_names.size() != 2) {
     return;
   }
 
   // Figure out which interface is where.
-  int lo = 0, eth = 1;
-  if (if_names[lo] != "lo") {
-    lo = 1;
-    eth = 0;
-  }
-
-  if (if_names[lo] != "lo") {
-    return;
-  }
-
-  lo_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(if_names[lo]));
-  lo_if_addr_ = if_helper_.GetAddr(AF_INET, if_names[lo]);
-  if (lo_if_addr_ == nullptr) {
+  std::string lo = if_names[0];
+  std::string eth = if_names[1];
+  if (lo != "lo") std::swap(lo, eth);
+  if (lo != "lo") return;
+
+  lo_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(lo));
+  auto lo_if_addr = if_helper_.GetAddr(AF_INET, lo);
+  if (lo_if_addr == nullptr) {
     return;
   }
-  lo_if_sin_addr_ = reinterpret_cast<sockaddr_in*>(lo_if_addr_)->sin_addr;
+  lo_if_addr_ = *reinterpret_cast<const sockaddr_in*>(lo_if_addr);
 
-  eth_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(if_names[eth]));
-  eth_if_addr_ = if_helper_.GetAddr(AF_INET, if_names[eth]);
-  if (eth_if_addr_ == nullptr) {
+  eth_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(eth));
+  auto eth_if_addr = if_helper_.GetAddr(AF_INET, eth);
+  if (eth_if_addr == nullptr) {
     return;
   }
-  eth_if_sin_addr_ = reinterpret_cast<sockaddr_in*>(eth_if_addr_)->sin_addr;
+  eth_if_addr_ = *reinterpret_cast<const sockaddr_in*>(eth_if_addr);
 
   got_if_infos_ = true;
 }
@@ -242,7 +236,7 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
   // Bind the non-receiving socket to the unicast ethernet address.
   auto norecv_addr = rcv1_addr;
   reinterpret_cast<sockaddr_in*>(&norecv_addr.addr)->sin_addr =
-      eth_if_sin_addr_;
+      eth_if_addr_.sin_addr;
   ASSERT_THAT(bind(norcv->get(), reinterpret_cast<sockaddr*>(&norecv_addr.addr),
                    norecv_addr.addr_len),
               SyscallSucceedsWithValue(0));
@@ -1028,7 +1022,7 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
   auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   ip_mreqn iface = {};
   iface.imr_ifindex = lo_if_idx_;
-  iface.imr_address = eth_if_sin_addr_;
+  iface.imr_address = eth_if_addr_.sin_addr;
   ASSERT_THAT(setsockopt(sender->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
                          sizeof(iface)),
               SyscallSucceeds());
@@ -1058,7 +1052,7 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
   SKIP_IF(IsRunningOnGvisor());
 
   // Verify the received source address.
-  EXPECT_EQ(eth_if_sin_addr_.s_addr, src_addr_in->sin_addr.s_addr);
+  EXPECT_EQ(eth_if_addr_.sin_addr.s_addr, src_addr_in->sin_addr.s_addr);
 }
 
 // Check that when we are bound to one interface we can set IP_MULTICAST_IF to
@@ -1075,7 +1069,8 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
 
   // Create sender and bind to eth interface.
   auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
-  ASSERT_THAT(bind(sender->get(), eth_if_addr_, sizeof(sockaddr_in)),
+  ASSERT_THAT(bind(sender->get(), reinterpret_cast<sockaddr*>(&eth_if_addr_),
+                   sizeof(eth_if_addr_)),
               SyscallSucceeds());
 
   // Run through all possible combinations of index and address for
@@ -1085,9 +1080,9 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
     struct in_addr imr_address;
   } test_data[] = {
       {lo_if_idx_, {}},
-      {0, lo_if_sin_addr_},
-      {lo_if_idx_, lo_if_sin_addr_},
-      {lo_if_idx_, eth_if_sin_addr_},
+      {0, lo_if_addr_.sin_addr},
+      {lo_if_idx_, lo_if_addr_.sin_addr},
+      {lo_if_idx_, eth_if_addr_.sin_addr},
   };
   for (auto t : test_data) {
     ip_mreqn iface = {};
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h
index bec2e96ee..10b90b1e0 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h
@@ -36,10 +36,8 @@ class IPv4UDPUnboundExternalNetworkingSocketTest : public SimpleSocketTest {
   // Interface infos.
   int lo_if_idx_;
   int eth_if_idx_;
-  sockaddr* lo_if_addr_;
-  sockaddr* eth_if_addr_;
-  in_addr lo_if_sin_addr_;
-  in_addr eth_if_sin_addr_;
+  sockaddr_in lo_if_addr_;
+  sockaddr_in eth_if_addr_;
 };
 
 }  // namespace testing
-- 
cgit v1.2.3


From 9f87502b4619b60779ce19c41ea0e6bd6582e8e4 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 9 Apr 2020 16:40:12 -0700
Subject: Remove TODOs from Async IO

Block and drain requests in io_destroy(2).
Note the reason to create read-only mapping.

PiperOrigin-RevId: 305786312
---
 pkg/sentry/mm/aio_context.go         | 101 ++++++++++++++++++++++++-----------
 pkg/sentry/mm/aio_context_state.go   |   2 +-
 pkg/sentry/syscalls/linux/sys_aio.go |  34 +++++++++---
 test/syscalls/linux/aio.cc           |  12 +++--
 4 files changed, 107 insertions(+), 42 deletions(-)

diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index cb29d94b0..379148903 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -59,25 +59,27 @@ func (a *aioManager) newAIOContext(events uint32, id uint64) bool {
 	}
 
 	a.contexts[id] = &AIOContext{
-		done:           make(chan struct{}, 1),
+		requestReady:   make(chan struct{}, 1),
 		maxOutstanding: events,
 	}
 	return true
 }
 
-// destroyAIOContext destroys an asynchronous I/O context.
+// destroyAIOContext destroys an asynchronous I/O context. It doesn't wait for
+// for pending requests to complete. Returns the destroyed AIOContext so it can
+// be drained.
 //
-// False is returned if the context does not exist.
-func (a *aioManager) destroyAIOContext(id uint64) bool {
+// Nil is returned if the context does not exist.
+func (a *aioManager) destroyAIOContext(id uint64) *AIOContext {
 	a.mu.Lock()
 	defer a.mu.Unlock()
 	ctx, ok := a.contexts[id]
 	if !ok {
-		return false
+		return nil
 	}
 	delete(a.contexts, id)
 	ctx.destroy()
-	return true
+	return ctx
 }
 
 // lookupAIOContext looks up the given context.
@@ -102,8 +104,8 @@ type ioResult struct {
 //
 // +stateify savable
 type AIOContext struct {
-	// done is the notification channel used for all requests.
-	done chan struct{} `state:"nosave"`
+	// requestReady is the notification channel used for all requests.
+	requestReady chan struct{} `state:"nosave"`
 
 	// mu protects below.
 	mu sync.Mutex `state:"nosave"`
@@ -129,8 +131,14 @@ func (ctx *AIOContext) destroy() {
 	ctx.mu.Lock()
 	defer ctx.mu.Unlock()
 	ctx.dead = true
-	if ctx.outstanding == 0 {
-		close(ctx.done)
+	ctx.checkForDone()
+}
+
+// Preconditions: ctx.mu must be held by caller.
+func (ctx *AIOContext) checkForDone() {
+	if ctx.dead && ctx.outstanding == 0 {
+		close(ctx.requestReady)
+		ctx.requestReady = nil
 	}
 }
 
@@ -154,11 +162,12 @@ func (ctx *AIOContext) PopRequest() (interface{}, bool) {
 
 	// Is there anything ready?
 	if e := ctx.results.Front(); e != nil {
-		ctx.results.Remove(e)
-		ctx.outstanding--
-		if ctx.outstanding == 0 && ctx.dead {
-			close(ctx.done)
+		if ctx.outstanding == 0 {
+			panic("AIOContext outstanding is going negative")
 		}
+		ctx.outstanding--
+		ctx.results.Remove(e)
+		ctx.checkForDone()
 		return e.data, true
 	}
 	return nil, false
@@ -172,26 +181,58 @@ func (ctx *AIOContext) FinishRequest(data interface{}) {
 
 	// Push to the list and notify opportunistically. The channel notify
 	// here is guaranteed to be safe because outstanding must be non-zero.
-	// The done channel is only closed when outstanding reaches zero.
+	// The requestReady channel is only closed when outstanding reaches zero.
 	ctx.results.PushBack(&ioResult{data: data})
 
 	select {
-	case ctx.done <- struct{}{}:
+	case ctx.requestReady <- struct{}{}:
 	default:
 	}
 }
 
 // WaitChannel returns a channel that is notified when an AIO request is
-// completed.
-//
-// The boolean return value indicates whether or not the context is active.
-func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) {
+// completed. Returns nil if the context is destroyed and there are no more
+// outstanding requests.
+func (ctx *AIOContext) WaitChannel() chan struct{} {
 	ctx.mu.Lock()
 	defer ctx.mu.Unlock()
-	if ctx.outstanding == 0 && ctx.dead {
-		return nil, false
+	return ctx.requestReady
+}
+
+// Dead returns true if the context has been destroyed.
+func (ctx *AIOContext) Dead() bool {
+	ctx.mu.Lock()
+	defer ctx.mu.Unlock()
+	return ctx.dead
+}
+
+// CancelPendingRequest forgets about a request that hasn't yet completed.
+func (ctx *AIOContext) CancelPendingRequest() {
+	ctx.mu.Lock()
+	defer ctx.mu.Unlock()
+
+	if ctx.outstanding == 0 {
+		panic("AIOContext outstanding is going negative")
 	}
-	return ctx.done, true
+	ctx.outstanding--
+	ctx.checkForDone()
+}
+
+// Drain drops all completed requests. Pending requests remain untouched.
+func (ctx *AIOContext) Drain() {
+	ctx.mu.Lock()
+	defer ctx.mu.Unlock()
+
+	if ctx.outstanding == 0 {
+		return
+	}
+	size := uint32(ctx.results.Len())
+	if ctx.outstanding < size {
+		panic("AIOContext outstanding is going negative")
+	}
+	ctx.outstanding -= size
+	ctx.results.Reset()
+	ctx.checkForDone()
 }
 
 // aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO
@@ -332,9 +373,9 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint
 		Length:          aioRingBufferSize,
 		MappingIdentity: m,
 		Mappable:        m,
-		// TODO(fvoznika): Linux does "do_mmap_pgoff(..., PROT_READ |
-		// PROT_WRITE, ...)" in fs/aio.c:aio_setup_ring(); why do we make this
-		// mapping read-only?
+		// Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in
+		// fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC,
+		// user mode should not write to this page.
 		Perms:    usermem.Read,
 		MaxPerms: usermem.Read,
 	})
@@ -349,11 +390,11 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint
 	return id, nil
 }
 
-// DestroyAIOContext destroys an asynchronous I/O context. It returns false if
-// the context does not exist.
-func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) bool {
+// DestroyAIOContext destroys an asynchronous I/O context. It returns the
+// destroyed context. nil if the context does not exist.
+func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext {
 	if _, ok := mm.LookupAIOContext(ctx, id); !ok {
-		return false
+		return nil
 	}
 
 	// Only unmaps after it assured that the address is a valid aio context to
diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go
index c37fc9f7b..3dabac1af 100644
--- a/pkg/sentry/mm/aio_context_state.go
+++ b/pkg/sentry/mm/aio_context_state.go
@@ -16,5 +16,5 @@ package mm
 
 // afterLoad is invoked by stateify.
 func (a *AIOContext) afterLoad() {
-	a.done = make(chan struct{}, 1)
+	a.requestReady = make(chan struct{}, 1)
 }
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index b401978db..38cbeba5a 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -114,14 +114,28 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	id := args[0].Uint64()
 
-	// Destroy the given context.
-	if !t.MemoryManager().DestroyAIOContext(t, id) {
+	ctx := t.MemoryManager().DestroyAIOContext(t, id)
+	if ctx == nil {
 		// Does not exist.
 		return 0, nil, syserror.EINVAL
 	}
-	// FIXME(fvoznika): Linux blocks until all AIO to the destroyed context is
-	// done.
-	return 0, nil, nil
+
+	// Drain completed requests amd wait for pending requests until there are no
+	// more.
+	for {
+		ctx.Drain()
+
+		ch := ctx.WaitChannel()
+		if ch == nil {
+			// No more requests, we're done.
+			return 0, nil, nil
+		}
+		// The task cannot be interrupted during the wait. Equivalent to
+		// TASK_UNINTERRUPTIBLE in Linux.
+		t.UninterruptibleSleepStart(true /* deactivate */)
+		<-ch
+		t.UninterruptibleSleepFinish(true /* activate */)
+	}
 }
 
 // IoGetevents implements linux syscall io_getevents(2).
@@ -200,13 +214,13 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (interface{}, error) {
 	for {
 		if v, ok := ctx.PopRequest(); ok {
-			// Request was readly available. Just return it.
+			// Request was readily available. Just return it.
 			return v, nil
 		}
 
 		// Need to wait for request completion.
-		done, active := ctx.WaitChannel()
-		if !active {
+		done := ctx.WaitChannel()
+		if done == nil {
 			// Context has been destroyed.
 			return nil, syserror.EINVAL
 		}
@@ -248,6 +262,10 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
 }
 
 func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) {
+	if ctx.Dead() {
+		ctx.CancelPendingRequest()
+		return
+	}
 	ev := &ioEvent{
 		Data: cb.Data,
 		Obj:  uint64(cbAddr),
diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc
index a33daff17..806d5729e 100644
--- a/test/syscalls/linux/aio.cc
+++ b/test/syscalls/linux/aio.cc
@@ -89,6 +89,7 @@ class AIOTest : public FileTest {
     FileTest::TearDown();
     if (ctx_ != 0) {
       ASSERT_THAT(DestroyContext(), SyscallSucceeds());
+      ctx_ = 0;
     }
   }
 
@@ -188,14 +189,19 @@ TEST_F(AIOTest, BadWrite) {
 }
 
 TEST_F(AIOTest, ExitWithPendingIo) {
-  // Setup a context that is 5 entries deep.
-  ASSERT_THAT(SetupContext(5), SyscallSucceeds());
+  // Setup a context that is 100 entries deep.
+  ASSERT_THAT(SetupContext(100), SyscallSucceeds());
 
   struct iocb cb = CreateCallback();
   struct iocb* cbs[] = {&cb};
 
   // Submit a request but don't complete it to make it pending.
-  EXPECT_THAT(Submit(1, cbs), SyscallSucceeds());
+  for (int i = 0; i < 100; ++i) {
+    EXPECT_THAT(Submit(1, cbs), SyscallSucceeds());
+  }
+
+  ASSERT_THAT(DestroyContext(), SyscallSucceeds());
+  ctx_ = 0;
 }
 
 int Submitter(void* arg) {
-- 
cgit v1.2.3


From 2a28e3e9c3463cf68cfa639425cfdcc298ad357a Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 9 Apr 2020 17:19:08 -0700
Subject: Don't unconditionally set --panic-signal

Closes #2393

PiperOrigin-RevId: 305793027
---
 runsc/sandbox/sandbox.go | 2 --
 1 file changed, 2 deletions(-)

diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 2d464b1bf..e82bcef6f 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -402,8 +402,6 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		nextFD++
 	}
 
-	cmd.Args = append(cmd.Args, "--panic-signal="+strconv.Itoa(int(syscall.SIGTERM)))
-
 	// Add the "boot" command to the args.
 	//
 	// All flags after this must be for the boot command
-- 
cgit v1.2.3


From 257225c34b81ff0d0b5ce8ae333f5905f9e86cce Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 9 Apr 2020 17:29:43 -0700
Subject: Downgrade VFS1-specific FIXME to a NOTE.

PiperOrigin-RevId: 305794509
---
 pkg/sentry/fs/dirent.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index 0266a5287..65be12175 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -312,9 +312,9 @@ func (d *Dirent) SyncAll(ctx context.Context) {
 
 	// There is nothing to sync for a read-only filesystem.
 	if !d.Inode.MountSource.Flags.ReadOnly {
-		// FIXME(b/34856369): This should be a mount traversal, not a
-		// Dirent traversal, because some Inodes that need to be synced
-		// may no longer be reachable by name (after sys_unlink).
+		// NOTE(b/34856369): This should be a mount traversal, not a Dirent
+		// traversal, because some Inodes that need to be synced may no longer
+		// be reachable by name (after sys_unlink).
 		//
 		// Write out metadata, dirty page cached pages, and sync disk/remote
 		// caches.
-- 
cgit v1.2.3


From c9195349c9ac24ccb538e92b308225dfa4184c42 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Thu, 9 Apr 2020 17:59:30 -0700
Subject: Replace type assertion with TaskFromContext.

This should fix panic at aio callback.

PiperOrigin-RevId: 305798549
---
 pkg/sentry/socket/netstack/netstack.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 20e3fa0d2..7ac38764d 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -535,7 +535,7 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO
 	}
 
 	if resCh != nil {
-		t := ctx.(*kernel.Task)
+		t := kernel.TaskFromContext(ctx)
 		if err := t.Block(resCh); err != nil {
 			return 0, syserr.FromError(err).ToError()
 		}
@@ -608,7 +608,7 @@ func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader
 	}
 
 	if resCh != nil {
-		t := ctx.(*kernel.Task)
+		t := kernel.TaskFromContext(ctx)
 		if err := t.Block(resCh); err != nil {
 			return 0, syserr.FromError(err).ToError()
 		}
-- 
cgit v1.2.3


From c560bfd1a8cd61c869e180c6cc7bb917fc29e92e Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 9 Apr 2020 18:02:36 -0700
Subject: Drop invalid NDP NS messages

Better validate NDP NS messages and their options before doing work in
response to them. Also make sure that NA messages sent in response to
an NS use the correct IPv6 and link-layer addresses so they are
routed properly and received by the right node.

Test: stack_test.TestNeighorSolicitationResponse
PiperOrigin-RevId: 305799054
---
 pkg/tcpip/checker/checker.go        |  81 +++++++++++-
 pkg/tcpip/network/ipv6/BUILD        |   1 +
 pkg/tcpip/network/ipv6/icmp.go      | 125 ++++++++++++------
 pkg/tcpip/network/ipv6/icmp_test.go |   3 +-
 pkg/tcpip/network/ipv6/ndp_test.go  | 252 ++++++++++++++++++++++++++++++++++++
 5 files changed, 414 insertions(+), 48 deletions(-)

diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 8dc0f7c0e..307f1b666 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -728,7 +728,7 @@ func ICMPv6Code(want byte) TransportChecker {
 // message for type of ty, with potentially additional checks specified by
 // checkers.
 //
-// checkers may assume that a valid ICMPv6 is passed to it containing a valid
+// Checkers may assume that a valid ICMPv6 is passed to it containing a valid
 // NDP message as far as the size of the message (minSize) is concerned. The
 // values within the message are up to checkers to validate.
 func NDP(msgType header.ICMPv6Type, minSize int, checkers ...TransportChecker) NetworkChecker {
@@ -760,9 +760,9 @@ func NDP(msgType header.ICMPv6Type, minSize int, checkers ...TransportChecker) N
 // Neighbor Solicitation message (as per the raw wire format), with potentially
 // additional checks specified by checkers.
 //
-// checkers may assume that a valid ICMPv6 is passed to it containing a valid
-// NDPNS message as far as the size of the messages concerned. The values within
-// the message are up to checkers to validate.
+// Checkers may assume that a valid ICMPv6 is passed to it containing a valid
+// NDPNS message as far as the size of the message is concerned. The values
+// within the message are up to checkers to validate.
 func NDPNS(checkers ...TransportChecker) NetworkChecker {
 	return NDP(header.ICMPv6NeighborSolicit, header.NDPNSMinimumSize, checkers...)
 }
@@ -780,7 +780,54 @@ func NDPNSTargetAddress(want tcpip.Address) TransportChecker {
 		ns := header.NDPNeighborSolicit(icmp.NDPPayload())
 
 		if got := ns.TargetAddress(); got != want {
-			t.Fatalf("got %T.TargetAddress = %s, want = %s", ns, got, want)
+			t.Errorf("got %T.TargetAddress() = %s, want = %s", ns, got, want)
+		}
+	}
+}
+
+// NDPNA creates a checker that checks that the packet contains a valid NDP
+// Neighbor Advertisement message (as per the raw wire format), with potentially
+// additional checks specified by checkers.
+//
+// Checkers may assume that a valid ICMPv6 is passed to it containing a valid
+// NDPNA message as far as the size of the message is concerned. The values
+// within the message are up to checkers to validate.
+func NDPNA(checkers ...TransportChecker) NetworkChecker {
+	return NDP(header.ICMPv6NeighborAdvert, header.NDPNAMinimumSize, checkers...)
+}
+
+// NDPNATargetAddress creates a checker that checks the Target Address field of
+// a header.NDPNeighborAdvert.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid NDPNA message as far as the size is concerned.
+func NDPNATargetAddress(want tcpip.Address) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmp := h.(header.ICMPv6)
+		na := header.NDPNeighborAdvert(icmp.NDPPayload())
+
+		if got := na.TargetAddress(); got != want {
+			t.Errorf("got %T.TargetAddress() = %s, want = %s", na, got, want)
+		}
+	}
+}
+
+// NDPNASolicitedFlag creates a checker that checks the Solicited field of
+// a header.NDPNeighborAdvert.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid NDPNA message as far as the size is concerned.
+func NDPNASolicitedFlag(want bool) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmp := h.(header.ICMPv6)
+		na := header.NDPNeighborAdvert(icmp.NDPPayload())
+
+		if got := na.SolicitedFlag(); got != want {
+			t.Errorf("got %T.SolicitedFlag = %t, want = %t", na, got, want)
 		}
 	}
 }
@@ -819,6 +866,13 @@ func ndpOptions(t *testing.T, optsBuf header.NDPOptions, opts []header.NDPOption
 			} else if got, want := gotOpt.EthernetAddress(), wantOpt.EthernetAddress(); got != want {
 				t.Errorf("got EthernetAddress() = %s at index %d, want = %s", got, i, want)
 			}
+		case header.NDPTargetLinkLayerAddressOption:
+			gotOpt, ok := opt.(header.NDPTargetLinkLayerAddressOption)
+			if !ok {
+				t.Errorf("got type = %T at index = %d; want = %T", opt, i, wantOpt)
+			} else if got, want := gotOpt.EthernetAddress(), wantOpt.EthernetAddress(); got != want {
+				t.Errorf("got EthernetAddress() = %s at index %d, want = %s", got, i, want)
+			}
 		default:
 			t.Fatalf("checker not implemented for expected NDP option: %T", wantOpt)
 		}
@@ -831,6 +885,21 @@ func ndpOptions(t *testing.T, optsBuf header.NDPOptions, opts []header.NDPOption
 	}
 }
 
+// NDPNAOptions creates a checker that checks that the packet contains the
+// provided NDP options within an NDP Neighbor Solicitation message.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid NDPNA message as far as the size is concerned.
+func NDPNAOptions(opts []header.NDPOption) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmp := h.(header.ICMPv6)
+		na := header.NDPNeighborAdvert(icmp.NDPPayload())
+		ndpOptions(t, na.Options(), opts)
+	}
+}
+
 // NDPNSOptions creates a checker that checks that the packet contains the
 // provided NDP options within an NDP Neighbor Solicitation message.
 //
@@ -849,7 +918,7 @@ func NDPNSOptions(opts []header.NDPOption) TransportChecker {
 // NDPRS creates a checker that checks that the packet contains a valid NDP
 // Router Solicitation message (as per the raw wire format).
 //
-// checkers may assume that a valid ICMPv6 is passed to it containing a valid
+// Checkers may assume that a valid ICMPv6 is passed to it containing a valid
 // NDPRS as far as the size of the message is concerned. The values within the
 // message are up to checkers to validate.
 func NDPRS(checkers ...TransportChecker) NetworkChecker {
diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD
index a93a7621a..3f71fc520 100644
--- a/pkg/tcpip/network/ipv6/BUILD
+++ b/pkg/tcpip/network/ipv6/BUILD
@@ -31,6 +31,7 @@ go_test(
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
+        "//pkg/tcpip/checker",
         "//pkg/tcpip/header",
         "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/sniffer",
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index f91180aa3..dc0369156 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -138,53 +138,48 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 		targetAddr := ns.TargetAddress()
 		s := r.Stack()
-		rxNICID := r.NICID()
-		if isTentative, err := s.IsAddrTentative(rxNICID, targetAddr); err != nil {
-			// We will only get an error if rxNICID is unrecognized,
-			// which should not happen. For now short-circuit this
-			// packet.
+		if isTentative, err := s.IsAddrTentative(e.nicID, targetAddr); err != nil {
+			// We will only get an error if the NIC is unrecognized, which should not
+			// happen. For now, drop this packet.
 			//
 			// TODO(b/141002840): Handle this better?
 			return
 		} else if isTentative {
-			// If the target address is tentative and the source
-			// of the packet is a unicast (specified) address, then
-			// the source of the packet is attempting to perform
-			// address resolution on the target. In this case, the
-			// solicitation is silently ignored, as per RFC 4862
-			// section 5.4.3.
+			// If the target address is tentative and the source of the packet is a
+			// unicast (specified) address, then the source of the packet is
+			// attempting to perform address resolution on the target. In this case,
+			// the solicitation is silently ignored, as per RFC 4862 section 5.4.3.
 			//
-			// If the target address is tentative and the source of
-			// the packet is the unspecified address (::), then we
-			// know another node is also performing DAD for the
-			// same address (since targetAddr is tentative for us,
-			// we know we are also performing DAD on it). In this
-			// case we let the stack know so it can handle such a
-			// scenario and do nothing further with the NDP NS.
-			if iph.SourceAddress() == header.IPv6Any {
-				s.DupTentativeAddrDetected(rxNICID, targetAddr)
+			// If the target address is tentative and the source of the packet is the
+			// unspecified address (::), then we know another node is also performing
+			// DAD for the same address (since the target address is tentative for us,
+			// we know we are also performing DAD on it). In this case we let the
+			// stack know so it can handle such a scenario and do nothing further with
+			// the NS.
+			if r.RemoteAddress == header.IPv6Any {
+				s.DupTentativeAddrDetected(e.nicID, targetAddr)
 			}
 
-			// Do not handle neighbor solicitations targeted
-			// to an address that is tentative on the received
-			// NIC any further.
+			// Do not handle neighbor solicitations targeted to an address that is
+			// tentative on the NIC any further.
 			return
 		}
 
-		// At this point we know that targetAddr is not tentative on
-		// rxNICID so the packet is processed as defined in RFC 4861,
-		// as per RFC 4862 section 5.4.3.
+		// At this point we know that the target address is not tentative on the NIC
+		// so the packet is processed as defined in RFC 4861, as per RFC 4862
+		// section 5.4.3.
 
+		// Is the NS targetting us?
 		if e.linkAddrCache.CheckLocalAddress(e.nicID, ProtocolNumber, targetAddr) == 0 {
-			// We don't have a useful answer; the best we can do is ignore the request.
 			return
 		}
 
-		// If the NS message has the source link layer option, update the link
-		// address cache with the link address for the sender of the message.
+		// If the NS message contains the Source Link-Layer Address option, update
+		// the link address cache with the value of the option.
 		//
 		// TODO(b/148429853): Properly process the NS message and do Neighbor
 		// Unreachability Detection.
+		var sourceLinkAddr tcpip.LinkAddress
 		for {
 			opt, done, err := it.Next()
 			if err != nil {
@@ -197,22 +192,36 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 			switch opt := opt.(type) {
 			case header.NDPSourceLinkLayerAddressOption:
-				e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, opt.EthernetAddress())
+				// No RFCs define what to do when an NS message has multiple Source
+				// Link-Layer Address options. Since no interface can have multiple
+				// link-layer addresses, we consider such messages invalid.
+				if len(sourceLinkAddr) != 0 {
+					received.Invalid.Increment()
+					return
+				}
+
+				sourceLinkAddr = opt.EthernetAddress()
 			}
 		}
 
-		optsSerializer := header.NDPOptionsSerializer{
-			header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress[:]),
+		unspecifiedSource := r.RemoteAddress == header.IPv6Any
+
+		// As per RFC 4861 section 4.3, the Source Link-Layer Address Option MUST
+		// NOT be included when the source IP address is the unspecified address.
+		// Otherwise, on link layers that have addresses this option MUST be
+		// included in multicast solicitations and SHOULD be included in unicast
+		// solicitations.
+		if len(sourceLinkAddr) == 0 {
+			if header.IsV6MulticastAddress(r.LocalAddress) && !unspecifiedSource {
+				received.Invalid.Increment()
+				return
+			}
+		} else if unspecifiedSource {
+			received.Invalid.Increment()
+			return
+		} else {
+			e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, sourceLinkAddr)
 		}
-		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length()))
-		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
-		packet.SetType(header.ICMPv6NeighborAdvert)
-		na := header.NDPNeighborAdvert(packet.NDPPayload())
-		na.SetSolicitedFlag(true)
-		na.SetOverrideFlag(true)
-		na.SetTargetAddress(targetAddr)
-		opts := na.Options()
-		opts.Serialize(optsSerializer)
 
 		// ICMPv6 Neighbor Solicit messages are always sent to
 		// specially crafted IPv6 multicast addresses. As a result, the
@@ -225,6 +234,40 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 		r := r.Clone()
 		defer r.Release()
 		r.LocalAddress = targetAddr
+
+		// As per RFC 4861 section 7.2.4, if the the source of the solicitation is
+		// the unspecified address, the node MUST set the Solicited flag to zero and
+		// multicast the advertisement to the all-nodes address.
+		solicited := true
+		if unspecifiedSource {
+			solicited = false
+			r.RemoteAddress = header.IPv6AllNodesMulticastAddress
+		}
+
+		// If the NS has a source link-layer option, use the link address it
+		// specifies as the remote link address for the response instead of the
+		// source link address of the packet.
+		//
+		// TODO(#2401): As per RFC 4861 section 7.2.4 we should consult our link
+		// address cache for the right destination link address instead of manually
+		// patching the route with the remote link address if one is specified in a
+		// Source Link-Layer Address option.
+		if len(sourceLinkAddr) != 0 {
+			r.RemoteLinkAddress = sourceLinkAddr
+		}
+
+		optsSerializer := header.NDPOptionsSerializer{
+			header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress),
+		}
+		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length()))
+		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
+		packet.SetType(header.ICMPv6NeighborAdvert)
+		na := header.NDPNeighborAdvert(packet.NDPPayload())
+		na.SetSolicitedFlag(solicited)
+		na.SetOverrideFlag(true)
+		na.SetTargetAddress(targetAddr)
+		opts := na.Options()
+		opts.Serialize(optsSerializer)
 		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
 		// RFC 4861 Neighbor Discovery for IP version 6 (IPv6)
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index bae09ed94..bd099a7f8 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -32,7 +32,8 @@ import (
 
 const (
 	linkAddr0 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
-	linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0f")
+	linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e")
+	linkAddr2 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0f")
 )
 
 var (
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index b113aaacc..8db51da96 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -20,6 +20,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -173,6 +174,257 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
 	}
 }
 
+func TestNeighorSolicitationResponse(t *testing.T) {
+	const nicID = 1
+	nicAddr := lladdr0
+	remoteAddr := lladdr1
+	nicAddrSNMC := header.SolicitedNodeAddr(nicAddr)
+	nicLinkAddr := linkAddr0
+	remoteLinkAddr0 := linkAddr1
+	remoteLinkAddr1 := linkAddr2
+
+	tests := []struct {
+		name          string
+		nsOpts        header.NDPOptionsSerializer
+		nsSrcLinkAddr tcpip.LinkAddress
+		nsSrc         tcpip.Address
+		nsDst         tcpip.Address
+		nsInvalid     bool
+		naDstLinkAddr tcpip.LinkAddress
+		naSolicited   bool
+		naSrc         tcpip.Address
+		naDst         tcpip.Address
+	}{
+		{
+			name:          "Unspecified source to multicast destination",
+			nsOpts:        nil,
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         header.IPv6Any,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   false,
+			naSrc:         nicAddr,
+			naDst:         header.IPv6AllNodesMulticastAddress,
+		},
+		{
+			name: "Unspecified source with source ll option to multicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         header.IPv6Any,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     true,
+		},
+		{
+			name:          "Unspecified source to unicast destination",
+			nsOpts:        nil,
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         header.IPv6Any,
+			nsDst:         nicAddr,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   false,
+			naSrc:         nicAddr,
+			naDst:         header.IPv6AllNodesMulticastAddress,
+		},
+		{
+			name: "Unspecified source with source ll option to unicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         header.IPv6Any,
+			nsDst:         nicAddr,
+			nsInvalid:     true,
+		},
+
+		{
+			name: "Specified source with 1 source ll to multicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name: "Specified source with 1 source ll different from route to multicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr1[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr1,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name:          "Specified source to multicast destination",
+			nsOpts:        nil,
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     true,
+		},
+		{
+			name: "Specified source with 2 source ll to multicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr1[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     true,
+		},
+
+		{
+			name:          "Specified source to unicast destination",
+			nsOpts:        nil,
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddr,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name: "Specified source with 1 source ll to unicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddr,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name: "Specified source with 1 source ll different from route to unicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr1[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddr,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr1,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name: "Specified source with 2 source ll to unicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr1[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddr,
+			nsInvalid:     true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			e := channel.New(1, 1280, nicLinkAddr)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, nicAddr); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, nicAddr, err)
+			}
+
+			ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + test.nsOpts.Length()
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
+			pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
+			pkt.SetType(header.ICMPv6NeighborSolicit)
+			ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+			ns.SetTargetAddress(nicAddr)
+			opts := ns.Options()
+			opts.Serialize(test.nsOpts)
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.nsSrc, test.nsDst, buffer.VectorisedView{}))
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      255,
+				SrcAddr:       test.nsSrc,
+				DstAddr:       test.nsDst,
+			})
+
+			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+
+			// Invalid count should initially be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+
+			e.InjectLinkAddr(ProtocolNumber, test.nsSrcLinkAddr, stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			if test.nsInvalid {
+				if got := invalid.Value(); got != 1 {
+					t.Fatalf("got invalid = %d, want = 1", got)
+				}
+
+				if p, got := e.Read(); got {
+					t.Fatalf("unexpected response to an invalid NS = %+v", p.Pkt)
+				}
+
+				// If we expected the NS to be invalid, we have nothing else to check.
+				return
+			}
+
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+
+			p, got := e.Read()
+			if !got {
+				t.Fatal("expected an NDP NA response")
+			}
+
+			if p.Route.RemoteLinkAddress != test.naDstLinkAddr {
+				t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr)
+			}
+
+			checker.IPv6(t, p.Pkt.Header.View(),
+				checker.SrcAddr(test.naSrc),
+				checker.DstAddr(test.naDst),
+				checker.TTL(header.NDPHopLimit),
+				checker.NDPNA(
+					checker.NDPNASolicitedFlag(test.naSolicited),
+					checker.NDPNATargetAddress(nicAddr),
+					checker.NDPNAOptions([]header.NDPOption{
+						header.NDPTargetLinkLayerAddressOption(nicLinkAddr[:]),
+					}),
+				))
+		})
+	}
+}
+
 // TestNeighorAdvertisementWithTargetLinkLayerOption tests that receiving a
 // valid NDP NA message with the Target Link Layer Address option results in a
 // new entry in the link address cache for the target of the message.
-- 
cgit v1.2.3


From 35e6b6bf1aeb909a12fb80cc99d5695408a9eaa5 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Tue, 17 Mar 2020 06:59:54 +0000
Subject: Enable syscall fork_test on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I033692bcf4f8139df29e369a12b150d10fccbe32
---
 test/syscalls/linux/fork.cc | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index ff8bdfeb0..853f6231a 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -431,7 +431,6 @@ TEST(CloneTest, NewUserNamespacePermitsAllOtherNamespaces) {
       << "status = " << status;
 }
 
-#ifdef __x86_64__
 // Clone with CLONE_SETTLS and a non-canonical TLS address is rejected.
 TEST(CloneTest, NonCanonicalTLS) {
   constexpr uintptr_t kNonCanonical = 1ull << 48;
@@ -440,11 +439,25 @@ TEST(CloneTest, NonCanonicalTLS) {
   // on this.
   char stack;
 
+  // The raw system call interface on x86-64 is:
+  // long clone(unsigned long flags, void *stack,
+  //            int *parent_tid, int *child_tid,
+  //            unsigned long tls);
+  //
+  // While on arm64, the order of the last two arguments is reversed:
+  // long clone(unsigned long flags, void *stack,
+  //            int *parent_tid, unsigned long tls,
+  //            int *child_tid);
+#if defined(__x86_64__)
   EXPECT_THAT(syscall(__NR_clone, SIGCHLD | CLONE_SETTLS, &stack, nullptr,
                       nullptr, kNonCanonical),
               SyscallFailsWithErrno(EPERM));
-}
+#elif defined(__aarch64__)
+  EXPECT_THAT(syscall(__NR_clone, SIGCHLD | CLONE_SETTLS, &stack, nullptr,
+                      kNonCanonical, nullptr),
+              SyscallFailsWithErrno(EPERM));
 #endif
+}
 
 }  // namespace
 }  // namespace testing
-- 
cgit v1.2.3


From 7aa5caae71c29b0be9047a7c156a9daaa435ebb8 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Wed, 11 Mar 2020 03:21:34 +0000
Subject: Enable syscall ptrace test on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I5bb8fa7d580d173b1438d6465e1adb442216c8fa
---
 pkg/sentry/arch/arch.go           |  3 +++
 pkg/sentry/arch/syscalls_amd64.go |  7 +++++++
 pkg/sentry/arch/syscalls_arm64.go | 13 ++++++++++++-
 pkg/sentry/kernel/task_syscall.go | 14 ++++++++++++++
 test/syscalls/linux/ptrace.cc     | 31 ++++++++++++++++++++++++-------
 5 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go
index 1d11cc472..a903d031c 100644
--- a/pkg/sentry/arch/arch.go
+++ b/pkg/sentry/arch/arch.go
@@ -88,6 +88,9 @@ type Context interface {
 	// SyscallNo returns the syscall number.
 	SyscallNo() uintptr
 
+	// SyscallSaveOrig save orignal register value.
+	SyscallSaveOrig()
+
 	// SyscallArgs returns the syscall arguments in an array.
 	SyscallArgs() SyscallArguments
 
diff --git a/pkg/sentry/arch/syscalls_amd64.go b/pkg/sentry/arch/syscalls_amd64.go
index 8b4f23007..3859f41ee 100644
--- a/pkg/sentry/arch/syscalls_amd64.go
+++ b/pkg/sentry/arch/syscalls_amd64.go
@@ -18,6 +18,13 @@ package arch
 
 const restartSyscallNr = uintptr(219)
 
+// SyscallSaveOrig save the value of the register which is clobbered in
+// syscall handler(doSyscall()).
+//
+// Noop on x86.
+func (c *context64) SyscallSaveOrig() {
+}
+
 // SyscallNo returns the syscall number according to the 64-bit convention.
 func (c *context64) SyscallNo() uintptr {
 	return uintptr(c.Regs.Orig_rax)
diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go
index dc13b6124..92d062513 100644
--- a/pkg/sentry/arch/syscalls_arm64.go
+++ b/pkg/sentry/arch/syscalls_arm64.go
@@ -18,6 +18,17 @@ package arch
 
 const restartSyscallNr = uintptr(128)
 
+// SyscallSaveOrig save the value of the register R0 which is clobbered in
+// syscall handler(doSyscall()).
+//
+// In linux, at the entry of the syscall handler(el0_svc_common()), value of R0
+// is saved to the pt_regs.orig_x0 in kernel code. But currently, the orig_x0
+// was not accessible to the user space application, so we have to do the same
+// operation in the sentry code to save the R0 value into the App context.
+func (c *context64) SyscallSaveOrig() {
+	c.OrigR0 = c.Regs.Regs[0]
+}
+
 // SyscallNo returns the syscall number according to the 64-bit convention.
 func (c *context64) SyscallNo() uintptr {
 	return uintptr(c.Regs.Regs[8])
@@ -40,7 +51,7 @@ func (c *context64) SyscallNo() uintptr {
 // R30: the link register.
 func (c *context64) SyscallArgs() SyscallArguments {
 	return SyscallArguments{
-		SyscallArgument{Value: uintptr(c.Regs.Regs[0])},
+		SyscallArgument{Value: uintptr(c.OrigR0)},
 		SyscallArgument{Value: uintptr(c.Regs.Regs[1])},
 		SyscallArgument{Value: uintptr(c.Regs.Regs[2])},
 		SyscallArgument{Value: uintptr(c.Regs.Regs[3])},
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index d555d69a8..3d7a734ef 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -194,6 +194,19 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u
 //
 // The syscall path is very hot; avoid defer.
 func (t *Task) doSyscall() taskRunState {
+	// Save value of the register which is clobbered in the following
+	// t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64.
+	//
+	// On x86, register rax was shared by syscall number and return
+	// value, and at the entry of the syscall handler, the rax was
+	// saved to regs.orig_rax which was exposed to user space.
+	// But on arm64, syscall number was passed through X8, and the X0
+	// was shared by the first syscall argument and return value. The
+	// X0 was saved to regs.orig_x0 which was not exposed to user space.
+	// So we have to do the same operation here to save the X0 value
+	// into the task context.
+	t.Arch().SyscallSaveOrig()
+
 	sysno := t.Arch().SyscallNo()
 	args := t.Arch().SyscallArgs()
 
@@ -269,6 +282,7 @@ func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
 		return (*runSyscallExit)(nil)
 	}
 	args := t.Arch().SyscallArgs()
+
 	return t.doSyscallInvoke(sysno, args)
 }
 
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index cb828ff88..926690eb8 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -400,9 +400,11 @@ TEST(PtraceTest, GetRegSet) {
   // Read exactly the full register set.
   EXPECT_EQ(iov.iov_len, sizeof(regs));
 
-#ifdef __x86_64__
+#if defined(__x86_64__)
   // Child called kill(2), with SIGSTOP as arg 2.
   EXPECT_EQ(regs.rsi, SIGSTOP);
+#elif defined(__aarch64__)
+  EXPECT_EQ(regs.regs[1], SIGSTOP);
 #endif
 
   // Suppress SIGSTOP and resume the child.
@@ -752,15 +754,23 @@ TEST(PtraceTest,
               SyscallSucceeds());
   EXPECT_TRUE(siginfo.si_code == SIGTRAP || siginfo.si_code == (SIGTRAP | 0x80))
       << "si_code = " << siginfo.si_code;
-#ifdef __x86_64__
+
   {
     struct user_regs_struct regs = {};
-    ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, &regs), SyscallSucceeds());
+    struct iovec iov;
+    iov.iov_base = &regs;
+    iov.iov_len = sizeof(regs);
+    EXPECT_THAT(ptrace(PTRACE_GETREGSET, child_pid, NT_PRSTATUS, &iov),
+                SyscallSucceeds());
+#if defined(__x86_64__)
     EXPECT_TRUE(regs.orig_rax == SYS_vfork || regs.orig_rax == SYS_clone)
         << "orig_rax = " << regs.orig_rax;
     EXPECT_EQ(grandchild_pid, regs.rax);
-  }
+#elif defined(__aarch64__)
+    EXPECT_TRUE(regs.regs[8] == SYS_clone) << "regs[8] = " << regs.regs[8];
+    EXPECT_EQ(grandchild_pid, regs.regs[0]);
 #endif  // defined(__x86_64__)
+  }
 
   // After this point, the child will be making wait4 syscalls that will be
   // interrupted by saving, so saving is not permitted. Note that this is
@@ -805,14 +815,21 @@ TEST(PtraceTest,
               SyscallSucceedsWithValue(child_pid));
   EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80))
       << " status " << status;
-#ifdef __x86_64__
   {
     struct user_regs_struct regs = {};
-    ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, &regs), SyscallSucceeds());
+    struct iovec iov;
+    iov.iov_base = &regs;
+    iov.iov_len = sizeof(regs);
+    EXPECT_THAT(ptrace(PTRACE_GETREGSET, child_pid, NT_PRSTATUS, &iov),
+                SyscallSucceeds());
+#if defined(__x86_64__)
     EXPECT_EQ(SYS_wait4, regs.orig_rax);
     EXPECT_EQ(grandchild_pid, regs.rax);
-  }
+#elif defined(__aarch64__)
+    EXPECT_EQ(SYS_wait4, regs.regs[8]);
+    EXPECT_EQ(grandchild_pid, regs.regs[0]);
 #endif  // defined(__x86_64__)
+  }
 
   // Detach from the child and wait for it to exit.
   ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds());
-- 
cgit v1.2.3


From 935007937cee1e2867cc4fc5c00b7f370864e241 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 10 Apr 2020 07:13:16 -0700
Subject: test: remove 1s delay after non-blocking socket pair accept

It was added in cl/201419897 to deflake
socket_ip_tcp_loopback_non_blocking_test_gvisor.
It seems we don't need this hack, because the origin issue isn't
reproducible without this hack.

PiperOrigin-RevId: 305871748
---
 test/syscalls/linux/socket_test_util.cc | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
index 5d3a39868..53b678e94 100644
--- a/test/syscalls/linux/socket_test_util.cc
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -364,11 +364,6 @@ CreateTCPConnectAcceptSocketPair(int bound, int connected, int type,
   }
   MaybeSave();  // Successful accept.
 
-  // FIXME(b/110484944)
-  if (connect_result == -1) {
-    absl::SleepFor(absl::Seconds(1));
-  }
-
   T extra_addr = {};
   LocalhostAddr(&extra_addr, dual_stack);
   return absl::make_unique<AddrFDSocketPair>(connected, accepted, bind_addr,
-- 
cgit v1.2.3


From 12b00c815638a28943c23d3be6ef09b955c6149e Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Fri, 10 Apr 2020 08:20:56 -0700
Subject: Test that RST is sent after ABORT in ESTABLISHED TCP state.

PiperOrigin-RevId: 305879441
---
 test/packetimpact/tests/BUILD                      | 12 +++++++
 .../tests/tcp_noaccept_close_rst_test.go           | 37 ++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 test/packetimpact/tests/tcp_noaccept_close_rst_test.go

diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index a9b2de9b9..956b1addf 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -40,6 +40,18 @@ packetimpact_go_test(
     ],
 )
 
+packetimpact_go_test(
+    name = "tcp_noaccept_close_rst",
+    srcs = ["tcp_noaccept_close_rst_test.go"],
+    # TODO(b/153380909): Fix netstack then remove the line below.
+    netstack = False,
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
 sh_binary(
     name = "test_runner",
     srcs = ["test_runner.sh"],
diff --git a/test/packetimpact/tests/tcp_noaccept_close_rst_test.go b/test/packetimpact/tests/tcp_noaccept_close_rst_test.go
new file mode 100644
index 000000000..7ebdd1950
--- /dev/null
+++ b/test/packetimpact/tests/tcp_noaccept_close_rst_test.go
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_noaccept_close_rst_test
+
+import (
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func TestTcpNoAcceptCloseReset(t *testing.T) {
+	dut := tb.NewDUT(t)
+	defer dut.TearDown()
+	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+	conn.Handshake()
+	defer conn.Close()
+	dut.Close(listenFd)
+	if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst | header.TCPFlagAck)}, 1*time.Second); err != nil {
+		t.Fatalf("expected a RST-ACK packet but got none: %s", err)
+	}
+}
-- 
cgit v1.2.3


From 1798d6cbee3360b09d3736069e15fd746e863bd2 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 10 Apr 2020 11:17:59 -0700
Subject: Remove TODO from kernel.Stracer

The dependency strace=>kernel grew over time. strace also depends on
task's FD table and FSContext. It could be fixed with some interfaces
the other way, but then we're trading an interface for another, and
kernel.Stracer is likely cleaner.

Closes #155

PiperOrigin-RevId: 305909678
---
 pkg/sentry/kernel/syscalls.go | 3 ---
 pkg/sentry/strace/strace.go   | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index c9a2321b8..2e3565747 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -209,9 +209,6 @@ type Stracer interface {
 	// SyscallEnter is called on syscall entry.
 	//
 	// The returned private data is passed to SyscallExit.
-	//
-	// TODO(gvisor.dev/issue/155): remove kernel imports from the strace
-	// package so that the type can be used directly.
 	SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{}
 
 	// SyscallExit is called on syscall exit.
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 77655558e..b94c4fbf5 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -778,9 +778,6 @@ func (s SyscallMap) Name(sysno uintptr) string {
 //
 // N.B. This is not in an init function because we can't be sure all syscall
 // tables are registered with the kernel when init runs.
-//
-// TODO(gvisor.dev/issue/155): remove kernel package dependencies from this
-// package and have the kernel package self-initialize all syscall tables.
 func Initialize() {
 	for _, table := range kernel.SyscallTables() {
 		// Is this known?
-- 
cgit v1.2.3


From 8bb8027d55d59c9c08e4b7896cf688c0225a1244 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 10 Apr 2020 11:35:38 -0700
Subject: Return EIO from p9 if sending/receiving fails.

Continues the modifications in cl/272963663. This prevents non-syscall errors
from being propogated to kernel/task_syscall.go:ExtractErrno(), which causes a
sentry panic.

PiperOrigin-RevId: 305913127
---
 pkg/p9/client.go             | 45 +++++++++++++++++++++++++++++++-------------
 pkg/p9/client_test.go        |  7 ++++++-
 pkg/p9/transport_flipcall.go |  2 +-
 3 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/pkg/p9/client.go b/pkg/p9/client.go
index a6f493b82..71e944c30 100644
--- a/pkg/p9/client.go
+++ b/pkg/p9/client.go
@@ -174,7 +174,7 @@ func NewClient(socket *unet.Socket, messageSize uint32, version string) (*Client
 		// our sendRecv function to use that functionality.  Otherwise,
 		// we stick to sendRecvLegacy.
 		rversion := Rversion{}
-		err := c.sendRecvLegacy(&Tversion{
+		_, err := c.sendRecvLegacy(&Tversion{
 			Version: versionString(requested),
 			MSize:   messageSize,
 		}, &rversion)
@@ -219,11 +219,11 @@ func NewClient(socket *unet.Socket, messageSize uint32, version string) (*Client
 			c.sendRecv = c.sendRecvChannel
 		} else {
 			// Channel setup failed; fallback.
-			c.sendRecv = c.sendRecvLegacy
+			c.sendRecv = c.sendRecvLegacySyscallErr
 		}
 	} else {
 		// No channels available: use the legacy mechanism.
-		c.sendRecv = c.sendRecvLegacy
+		c.sendRecv = c.sendRecvLegacySyscallErr
 	}
 
 	// Ensure that the socket and channels are closed when the socket is shut
@@ -305,7 +305,7 @@ func (c *Client) openChannel(id int) error {
 	)
 
 	// Open the data channel.
-	if err := c.sendRecvLegacy(&Tchannel{
+	if _, err := c.sendRecvLegacy(&Tchannel{
 		ID:      uint32(id),
 		Control: 0,
 	}, &rchannel0); err != nil {
@@ -319,7 +319,7 @@ func (c *Client) openChannel(id int) error {
 	defer rchannel0.FilePayload().Close()
 
 	// Open the channel for file descriptors.
-	if err := c.sendRecvLegacy(&Tchannel{
+	if _, err := c.sendRecvLegacy(&Tchannel{
 		ID:      uint32(id),
 		Control: 1,
 	}, &rchannel1); err != nil {
@@ -431,13 +431,28 @@ func (c *Client) waitAndRecv(done chan error) error {
 	}
 }
 
+// sendRecvLegacySyscallErr is a wrapper for sendRecvLegacy that converts all
+// non-syscall errors to EIO.
+func (c *Client) sendRecvLegacySyscallErr(t message, r message) error {
+	received, err := c.sendRecvLegacy(t, r)
+	if !received {
+		log.Warningf("p9.Client.sendRecvChannel: %v", err)
+		return syscall.EIO
+	}
+	return err
+}
+
 // sendRecvLegacy performs a roundtrip message exchange.
 //
+// sendRecvLegacy returns true if a message was received. This allows us to
+// differentiate between failed receives and successful receives where the
+// response was an error message.
+//
 // This is called by internal functions.
-func (c *Client) sendRecvLegacy(t message, r message) error {
+func (c *Client) sendRecvLegacy(t message, r message) (bool, error) {
 	tag, ok := c.tagPool.Get()
 	if !ok {
-		return ErrOutOfTags
+		return false, ErrOutOfTags
 	}
 	defer c.tagPool.Put(tag)
 
@@ -457,12 +472,12 @@ func (c *Client) sendRecvLegacy(t message, r message) error {
 	err := send(c.socket, Tag(tag), t)
 	c.sendMu.Unlock()
 	if err != nil {
-		return err
+		return false, err
 	}
 
 	// Co-ordinate with other receivers.
 	if err := c.waitAndRecv(resp.done); err != nil {
-		return err
+		return false, err
 	}
 
 	// Is it an error message?
@@ -470,14 +485,14 @@ func (c *Client) sendRecvLegacy(t message, r message) error {
 	// For convenience, we transform these directly
 	// into errors. Handlers need not handle this case.
 	if rlerr, ok := resp.r.(*Rlerror); ok {
-		return syscall.Errno(rlerr.Error)
+		return true, syscall.Errno(rlerr.Error)
 	}
 
 	// At this point, we know it matches.
 	//
 	// Per recv call above, we will only allow a type
 	// match (and give our r) or an instance of Rlerror.
-	return nil
+	return true, nil
 }
 
 // sendRecvChannel uses channels to send a message.
@@ -486,7 +501,7 @@ func (c *Client) sendRecvChannel(t message, r message) error {
 	c.channelsMu.Lock()
 	if len(c.availableChannels) == 0 {
 		c.channelsMu.Unlock()
-		return c.sendRecvLegacy(t, r)
+		return c.sendRecvLegacySyscallErr(t, r)
 	}
 	idx := len(c.availableChannels) - 1
 	ch := c.availableChannels[idx]
@@ -526,7 +541,11 @@ func (c *Client) sendRecvChannel(t message, r message) error {
 	}
 
 	// Parse the server's response.
-	_, retErr := ch.recv(r, rsz)
+	resp, retErr := ch.recv(r, rsz)
+	if resp == nil {
+		log.Warningf("p9.Client.sendRecvChannel: p9.channel.recv: %v", retErr)
+		retErr = syscall.EIO
+	}
 
 	// Release the channel.
 	c.channelsMu.Lock()
diff --git a/pkg/p9/client_test.go b/pkg/p9/client_test.go
index 29a0afadf..c757583e0 100644
--- a/pkg/p9/client_test.go
+++ b/pkg/p9/client_test.go
@@ -96,7 +96,12 @@ func benchmarkSendRecv(b *testing.B, fn func(c *Client) func(message, message) e
 }
 
 func BenchmarkSendRecvLegacy(b *testing.B) {
-	benchmarkSendRecv(b, func(c *Client) func(message, message) error { return c.sendRecvLegacy })
+	benchmarkSendRecv(b, func(c *Client) func(message, message) error {
+		return func(t message, r message) error {
+			_, err := c.sendRecvLegacy(t, r)
+			return err
+		}
+	})
 }
 
 func BenchmarkSendRecvChannel(b *testing.B) {
diff --git a/pkg/p9/transport_flipcall.go b/pkg/p9/transport_flipcall.go
index a0d274f3b..38038abdf 100644
--- a/pkg/p9/transport_flipcall.go
+++ b/pkg/p9/transport_flipcall.go
@@ -236,7 +236,7 @@ func (ch *channel) recv(r message, rsz uint32) (message, error) {
 
 	// Convert errors appropriately; see above.
 	if rlerr, ok := r.(*Rlerror); ok {
-		return nil, syscall.Errno(rlerr.Error)
+		return r, syscall.Errno(rlerr.Error)
 	}
 
 	return r, nil
-- 
cgit v1.2.3


From 76c4314c4fe002691d7b527fd11cb3c91116bf45 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 10 Apr 2020 12:21:50 -0700
Subject: Install Bazel 3.0.0 on Kokoro image.

PiperOrigin-RevId: 305922105
---
 tools/images/ubuntu1604/20_bazel.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/images/ubuntu1604/20_bazel.sh b/tools/images/ubuntu1604/20_bazel.sh
index bb7afa676..317bfbffb 100755
--- a/tools/images/ubuntu1604/20_bazel.sh
+++ b/tools/images/ubuntu1604/20_bazel.sh
@@ -16,7 +16,7 @@
 
 set -xeo pipefail
 
-declare -r BAZEL_VERSION=2.0.0
+declare -r BAZEL_VERSION=3.0.0
 
 # Install bazel dependencies.
 while true; do
-- 
cgit v1.2.3


From 82dfc406e21a75945da1a12f40eb876f519e04aa Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 10 Apr 2020 12:51:56 -0700
Subject: Automated rollback of changelist 305922105

PiperOrigin-RevId: 305927989
---
 tools/images/ubuntu1604/20_bazel.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/images/ubuntu1604/20_bazel.sh b/tools/images/ubuntu1604/20_bazel.sh
index 317bfbffb..bb7afa676 100755
--- a/tools/images/ubuntu1604/20_bazel.sh
+++ b/tools/images/ubuntu1604/20_bazel.sh
@@ -16,7 +16,7 @@
 
 set -xeo pipefail
 
-declare -r BAZEL_VERSION=3.0.0
+declare -r BAZEL_VERSION=2.0.0
 
 # Install bazel dependencies.
 while true; do
-- 
cgit v1.2.3


From 99056b6bd6bf7227f38c07a6f0214359d212bfc2 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 10 Apr 2020 14:02:48 -0700
Subject: Upgrade Kokoro and RBE to bazel 3.0.0

PiperOrigin-RevId: 305940483
---
 BUILD                               | 7 +++++--
 tools/images/ubuntu1604/20_bazel.sh | 5 ++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/BUILD b/BUILD
index a709a9816..42ec54d37 100644
--- a/BUILD
+++ b/BUILD
@@ -97,7 +97,7 @@ platform(
     remote_execution_properties = """
         properties: {
           name: "container-image"
-          value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50"
+          value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:5464e3e83dc656fc6e4eae6a01f5c2645f1f7e95854b3802b85e86484132d90e"
         }
         properties: {
           name: "dockerAddCapabilities"
@@ -110,12 +110,15 @@ platform(
     """,
 )
 
+# Bazel version on RBE must by in sync with version on Kokoro.
+# LINT.IfChange
 toolchain(
     name = "cc-toolchain-clang-x86_64-default",
     exec_compatible_with = [
     ],
     target_compatible_with = [
     ],
-    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8",
+    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/11.0.0/bazel_3.0.0/cc:cc-compiler-k8",
     toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
 )
+# LINT.ThenChange(tools/images/ubuntu1604/20_bazel.sh)
diff --git a/tools/images/ubuntu1604/20_bazel.sh b/tools/images/ubuntu1604/20_bazel.sh
index bb7afa676..5f668884d 100755
--- a/tools/images/ubuntu1604/20_bazel.sh
+++ b/tools/images/ubuntu1604/20_bazel.sh
@@ -16,7 +16,10 @@
 
 set -xeo pipefail
 
-declare -r BAZEL_VERSION=2.0.0
+# Bazel version on Kokoro must be kept in sync with RBE.
+# LINT.IfChange
+declare -r BAZEL_VERSION=3.0.0
+# LINT.ThenChange(../../../BUILD.opensource)
 
 # Install bazel dependencies.
 while true; do
-- 
cgit v1.2.3


From ca868e3e384fb1dabec4fd27ff3627f61a3bd54d Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 10 Apr 2020 14:27:48 -0700
Subject: Automated rollback of changelist 305940483

PiperOrigin-RevId: 305944892
---
 BUILD                               | 7 ++-----
 tools/images/ubuntu1604/20_bazel.sh | 5 +----
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/BUILD b/BUILD
index 42ec54d37..a709a9816 100644
--- a/BUILD
+++ b/BUILD
@@ -97,7 +97,7 @@ platform(
     remote_execution_properties = """
         properties: {
           name: "container-image"
-          value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:5464e3e83dc656fc6e4eae6a01f5c2645f1f7e95854b3802b85e86484132d90e"
+          value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50"
         }
         properties: {
           name: "dockerAddCapabilities"
@@ -110,15 +110,12 @@ platform(
     """,
 )
 
-# Bazel version on RBE must by in sync with version on Kokoro.
-# LINT.IfChange
 toolchain(
     name = "cc-toolchain-clang-x86_64-default",
     exec_compatible_with = [
     ],
     target_compatible_with = [
     ],
-    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/11.0.0/bazel_3.0.0/cc:cc-compiler-k8",
+    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8",
     toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
 )
-# LINT.ThenChange(tools/images/ubuntu1604/20_bazel.sh)
diff --git a/tools/images/ubuntu1604/20_bazel.sh b/tools/images/ubuntu1604/20_bazel.sh
index 5f668884d..bb7afa676 100755
--- a/tools/images/ubuntu1604/20_bazel.sh
+++ b/tools/images/ubuntu1604/20_bazel.sh
@@ -16,10 +16,7 @@
 
 set -xeo pipefail
 
-# Bazel version on Kokoro must be kept in sync with RBE.
-# LINT.IfChange
-declare -r BAZEL_VERSION=3.0.0
-# LINT.ThenChange(../../../BUILD.opensource)
+declare -r BAZEL_VERSION=2.0.0
 
 # Install bazel dependencies.
 while true; do
-- 
cgit v1.2.3


From 96f914295920404e7c5c97553771e09b31f6900a Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 10 Apr 2020 15:46:16 -0700
Subject: Use O_CLOEXEC when dup'ing FDs

The sentry doesn't allow execve, but it's a good defense
in-depth measure.

PiperOrigin-RevId: 305958737
---
 pkg/sentry/fs/gofer/inode.go     | 2 +-
 pkg/sentry/fsimpl/gofer/gofer.go | 2 +-
 runsc/boot/filter/config.go      | 2 +-
 runsc/main.go                    | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index 811e8ea30..a016c896e 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -273,7 +273,7 @@ func (i *inodeFileState) recreateReadHandles(ctx context.Context, writer *handle
 	// operations on the old will see the new data. Then, make the new handle take
 	// ownereship of the old FD and mark the old readHandle to not close the FD
 	// when done.
-	if err := syscall.Dup3(h.Host.FD(), i.readHandles.Host.FD(), 0); err != nil {
+	if err := syscall.Dup3(h.Host.FD(), i.readHandles.Host.FD(), syscall.O_CLOEXEC); err != nil {
 		return err
 	}
 
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 20edaf643..bdf11fa65 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -1089,7 +1089,7 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 				// description, but this doesn't matter since they refer to the
 				// same file (unless d.fs.opts.overlayfsStaleRead is true,
 				// which we handle separately).
-				if err := syscall.Dup3(int(h.fd), int(d.handle.fd), 0); err != nil {
+				if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil {
 					d.handleMu.Unlock()
 					ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err)
 					h.close(ctx)
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 06b9f888a..1828d116a 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -44,7 +44,7 @@ var allowedSyscalls = seccomp.SyscallRules{
 		{
 			seccomp.AllowAny{},
 			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.AllowValue(syscall.O_CLOEXEC),
 		},
 	},
 	syscall.SYS_EPOLL_CREATE1: {},
diff --git a/runsc/main.go b/runsc/main.go
index c1c78529c..59f624842 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -291,7 +291,7 @@ func main() {
 		// want with them. Since Docker and Containerd both eat boot's stderr, we
 		// dup our stderr to the provided log FD so that panics will appear in the
 		// logs, rather than just disappear.
-		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil {
+		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), syscall.O_CLOEXEC); err != nil {
 			cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err)
 		}
 	}
-- 
cgit v1.2.3


From 09ddb5a4262c39744643b612109dd12dcce176a8 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 10 Apr 2020 19:01:39 -0700
Subject: Port extended attributes to VFS2.

As in VFS1, we only support the user.* namespace. Plumbing is added to tmpfs
and goferfs.
Note that because of the slightly different order of checks between VFS2 and
Linux, one of the xattr tests needs to be relaxed slightly.

Fixes #2363.

PiperOrigin-RevId: 305985121
---
 pkg/sentry/fsimpl/ext/filesystem.go          |   4 +-
 pkg/sentry/fsimpl/gofer/filesystem.go        |  12 ++--
 pkg/sentry/fsimpl/gofer/gofer.go             |  58 +++++++++++----
 pkg/sentry/fsimpl/gofer/p9file.go            |  14 ++++
 pkg/sentry/fsimpl/kernfs/filesystem.go       |   4 +-
 pkg/sentry/fsimpl/tmpfs/BUILD                |   1 +
 pkg/sentry/fsimpl/tmpfs/filesystem.go        |  24 +++----
 pkg/sentry/fsimpl/tmpfs/tmpfs.go             |  77 ++++++++++++++++++++
 pkg/sentry/syscalls/linux/vfs2/xattr.go      |  13 ++--
 pkg/sentry/vfs/anonfs.go                     |   4 +-
 pkg/sentry/vfs/file_description.go           |  32 ++++++---
 pkg/sentry/vfs/file_description_impl_util.go |   4 +-
 pkg/sentry/vfs/filesystem.go                 |  24 ++++++-
 pkg/sentry/vfs/memxattr/BUILD                |  15 ++++
 pkg/sentry/vfs/memxattr/xattr.go             | 102 +++++++++++++++++++++++++++
 pkg/sentry/vfs/options.go                    |  14 ++++
 pkg/sentry/vfs/vfs.go                        |   8 +--
 test/syscalls/linux/xattr.cc                 |   8 +--
 18 files changed, 350 insertions(+), 68 deletions(-)
 create mode 100644 pkg/sentry/vfs/memxattr/BUILD
 create mode 100644 pkg/sentry/vfs/memxattr/xattr.go

diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 48eaccdbc..afea58f65 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -476,7 +476,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 }
 
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	_, _, err := fs.walk(rp, false)
 	if err != nil {
 		return nil, err
@@ -485,7 +485,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
 }
 
 // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
 	_, _, err := fs.walk(rp, false)
 	if err != nil {
 		return "", err
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 137260898..cd744bf5e 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -1080,7 +1080,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 }
 
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
@@ -1088,11 +1088,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
 	if err != nil {
 		return nil, err
 	}
-	return d.listxattr(ctx)
+	return d.listxattr(ctx, rp.Credentials(), size)
 }
 
 // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
@@ -1100,7 +1100,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, nam
 	if err != nil {
 		return "", err
 	}
-	return d.getxattr(ctx, name)
+	return d.getxattr(ctx, rp.Credentials(), &opts)
 }
 
 // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
@@ -1112,7 +1112,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	if err != nil {
 		return err
 	}
-	return d.setxattr(ctx, &opts)
+	return d.setxattr(ctx, rp.Credentials(), &opts)
 }
 
 // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
@@ -1124,7 +1124,7 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 	if err != nil {
 		return err
 	}
-	return d.removexattr(ctx, name)
+	return d.removexattr(ctx, rp.Credentials(), name)
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index bdf11fa65..2485cdb53 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -34,6 +34,7 @@ package gofer
 import (
 	"fmt"
 	"strconv"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"syscall"
@@ -1024,21 +1025,50 @@ func (d *dentry) setDeleted() {
 	atomic.StoreUint32(&d.deleted, 1)
 }
 
-func (d *dentry) listxattr(ctx context.Context) ([]string, error) {
-	return nil, syserror.ENOTSUP
+// We only support xattrs prefixed with "user." (see b/148380782). Currently,
+// there is no need to expose any other xattrs through a gofer.
+func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+	xattrMap, err := d.file.listXattr(ctx, size)
+	if err != nil {
+		return nil, err
+	}
+	xattrs := make([]string, 0, len(xattrMap))
+	for x := range xattrMap {
+		if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
+			xattrs = append(xattrs, x)
+		}
+	}
+	return xattrs, nil
 }
 
-func (d *dentry) getxattr(ctx context.Context, name string) (string, error) {
-	// TODO(jamieliu): add vfs.GetxattrOptions.Size
-	return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX)
+func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+	if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
+		return "", err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return "", syserror.EOPNOTSUPP
+	}
+	return d.file.getXattr(ctx, opts.Name, opts.Size)
 }
 
-func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error {
+func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
 	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
 }
 
-func (d *dentry) removexattr(ctx context.Context, name string) error {
-	return syserror.ENOTSUP
+func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	return d.file.removeXattr(ctx, name)
 }
 
 // Preconditions: d.isRegularFile() || d.isDirectory().
@@ -1189,21 +1219,21 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 }
 
 // Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) {
-	return fd.dentry().listxattr(ctx)
+func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size)
 }
 
 // Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) {
-	return fd.dentry().getxattr(ctx, name)
+func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
+	return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
 }
 
 // Setxattr implements vfs.FileDescriptionImpl.Setxattr.
 func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
-	return fd.dentry().setxattr(ctx, &opts)
+	return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
 }
 
 // Removexattr implements vfs.FileDescriptionImpl.Removexattr.
 func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
-	return fd.dentry().removexattr(ctx, name)
+	return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name)
 }
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
index 755ac2985..87f0b877f 100644
--- a/pkg/sentry/fsimpl/gofer/p9file.go
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -85,6 +85,13 @@ func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAt
 	return err
 }
 
+func (f p9file) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
+	ctx.UninterruptibleSleepStart(false)
+	xattrs, err := f.file.ListXattr(size)
+	ctx.UninterruptibleSleepFinish(false)
+	return xattrs, err
+}
+
 func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) {
 	ctx.UninterruptibleSleepStart(false)
 	val, err := f.file.GetXattr(name, size)
@@ -99,6 +106,13 @@ func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32)
 	return err
 }
 
+func (f p9file) removeXattr(ctx context.Context, name string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.RemoveXattr(name)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
 func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
 	ctx.UninterruptibleSleepStart(false)
 	err := f.file.Allocate(mode, offset, length)
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 4433071aa..baf81b4db 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -763,7 +763,7 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 }
 
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	fs.mu.RLock()
 	_, _, err := fs.walkExistingLocked(ctx, rp)
 	fs.mu.RUnlock()
@@ -776,7 +776,7 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
 }
 
 // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
 	fs.mu.RLock()
 	_, _, err := fs.walkExistingLocked(ctx, rp)
 	fs.mu.RUnlock()
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index f2ac23c88..4e6cd3491 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -51,6 +51,7 @@ go_library(
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
         "//pkg/sentry/vfs/lock",
+        "//pkg/sentry/vfs/memxattr",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 5339d7072..f4d50d64f 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -696,51 +696,47 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 }
 
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return nil, err
 	}
-	// TODO(b/127675828): support extended attributes
-	return nil, syserror.ENOTSUP
+	return d.inode.listxattr(size)
 }
 
 // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return "", err
 	}
-	// TODO(b/127675828): support extended attributes
-	return "", syserror.ENOTSUP
+	return d.inode.getxattr(rp.Credentials(), &opts)
 }
 
 // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
 func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return err
 	}
-	// TODO(b/127675828): support extended attributes
-	return syserror.ENOTSUP
+	return d.inode.setxattr(rp.Credentials(), &opts)
 }
 
 // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
 func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return err
 	}
-	// TODO(b/127675828): support extended attributes
-	return syserror.ENOTSUP
+	return d.inode.removexattr(rp.Credentials(), name)
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 654e788e3..9fa8637d5 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -27,6 +27,7 @@ package tmpfs
 import (
 	"fmt"
 	"math"
+	"strings"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -37,6 +38,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -186,6 +188,11 @@ type inode struct {
 	// filesystem.RmdirAt() drops the reference.
 	refs int64
 
+	// xattrs implements extended attributes.
+	//
+	// TODO(b/148380782): Support xattrs other than user.*
+	xattrs memxattr.SimpleExtendedAttributes
+
 	// Inode metadata. Writing multiple fields atomically requires holding
 	// mu, othewise atomic operations can be used.
 	mu    sync.Mutex
@@ -535,6 +542,56 @@ func (i *inode) touchCMtimeLocked() {
 	atomic.StoreInt64(&i.ctime, now)
 }
 
+func (i *inode) listxattr(size uint64) ([]string, error) {
+	return i.xattrs.Listxattr(size)
+}
+
+func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+	if err := i.checkPermissions(creds, vfs.MayRead); err != nil {
+		return "", err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return "", syserror.EOPNOTSUPP
+	}
+	if !i.userXattrSupported() {
+		return "", syserror.ENODATA
+	}
+	return i.xattrs.Getxattr(opts)
+}
+
+func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	if !i.userXattrSupported() {
+		return syserror.EPERM
+	}
+	return i.xattrs.Setxattr(opts)
+}
+
+func (i *inode) removexattr(creds *auth.Credentials, name string) error {
+	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	if !i.userXattrSupported() {
+		return syserror.EPERM
+	}
+	return i.xattrs.Removexattr(name)
+}
+
+// Extended attributes in the user.* namespace are only supported for regular
+// files and directories.
+func (i *inode) userXattrSupported() bool {
+	filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode)
+	return filetype == linux.S_IFREG || filetype == linux.S_IFDIR
+}
+
 // fileDescription is embedded by tmpfs implementations of
 // vfs.FileDescriptionImpl.
 type fileDescription struct {
@@ -562,3 +619,23 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 	creds := auth.CredentialsFromContext(ctx)
 	return fd.inode().setStat(ctx, creds, &opts.Stat)
 }
+
+// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
+func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.inode().listxattr(size)
+}
+
+// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
+func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
+	return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts)
+}
+
+// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
+func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+	return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts)
+}
+
+// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
+func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+	return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
index 89e9ff4d7..af455d5c1 100644
--- a/pkg/sentry/syscalls/linux/vfs2/xattr.go
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -51,7 +51,7 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml
 	}
 	defer tpop.Release()
 
-	names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop)
+	names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
 	if err != nil {
 		return 0, nil, err
 	}
@@ -74,7 +74,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	}
 	defer file.DecRef()
 
-	names, err := file.Listxattr(t)
+	names, err := file.Listxattr(t, uint64(size))
 	if err != nil {
 		return 0, nil, err
 	}
@@ -116,7 +116,10 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
 		return 0, nil, err
 	}
 
-	value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, name)
+	value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetxattrOptions{
+		Name: name,
+		Size: uint64(size),
+	})
 	if err != nil {
 		return 0, nil, err
 	}
@@ -145,7 +148,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, err
 	}
 
-	value, err := file.Getxattr(t, name)
+	value, err := file.Getxattr(t, &vfs.GetxattrOptions{Name: name, Size: uint64(size)})
 	if err != nil {
 		return 0, nil, err
 	}
@@ -230,7 +233,7 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, err
 	}
 
-	return 0, nil, file.Setxattr(t, vfs.SetxattrOptions{
+	return 0, nil, file.Setxattr(t, &vfs.SetxattrOptions{
 		Name:  name,
 		Value: value,
 		Flags: uint32(flags),
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index d1f6dfb45..a64d86122 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -245,7 +245,7 @@ func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath
 }
 
 // ListxattrAt implements FilesystemImpl.ListxattrAt.
-func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
+func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
 	if !rp.Done() {
 		return nil, syserror.ENOTDIR
 	}
@@ -253,7 +253,7 @@ func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([
 }
 
 // GetxattrAt implements FilesystemImpl.GetxattrAt.
-func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
+func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) {
 	if !rp.Done() {
 		return "", syserror.ENOTDIR
 	}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 20c545fca..4fb9aea87 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -401,11 +401,11 @@ type FileDescriptionImpl interface {
 	Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
 
 	// Listxattr returns all extended attribute names for the file.
-	Listxattr(ctx context.Context) ([]string, error)
+	Listxattr(ctx context.Context, size uint64) ([]string, error)
 
 	// Getxattr returns the value associated with the given extended attribute
 	// for the file.
-	Getxattr(ctx context.Context, name string) (string, error)
+	Getxattr(ctx context.Context, opts GetxattrOptions) (string, error)
 
 	// Setxattr changes the value associated with the given extended attribute
 	// for the file.
@@ -605,18 +605,23 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
 
 // Listxattr returns all extended attribute names for the file represented by
 // fd.
-func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
+//
+// If the size of the list (including a NUL terminating byte after every entry)
+// would exceed size, ERANGE may be returned. Note that implementations
+// are free to ignore size entirely and return without error). In all cases,
+// if size is 0, the list should be returned without error, regardless of size.
+func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp)
+		names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
 		vfsObj.putResolvingPath(rp)
 		return names, err
 	}
-	names, err := fd.impl.Listxattr(ctx)
+	names, err := fd.impl.Listxattr(ctx, size)
 	if err == syserror.ENOTSUP {
 		// Linux doesn't actually return ENOTSUP in this case; instead,
 		// fs/xattr.c:vfs_listxattr() falls back to allowing the security
@@ -629,34 +634,39 @@ func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
 
 // Getxattr returns the value associated with the given extended attribute for
 // the file represented by fd.
-func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+//
+// If the size of the return value exceeds opts.Size, ERANGE may be returned
+// (note that implementations are free to ignore opts.Size entirely and return
+// without error). In all cases, if opts.Size is 0, the value should be
+// returned without error, regardless of size.
+func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name)
+		val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
 		vfsObj.putResolvingPath(rp)
 		return val, err
 	}
-	return fd.impl.Getxattr(ctx, name)
+	return fd.impl.Getxattr(ctx, *opts)
 }
 
 // Setxattr changes the value associated with the given extended attribute for
 // the file represented by fd.
-func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts)
+		err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
 		vfsObj.putResolvingPath(rp)
 		return err
 	}
-	return fd.impl.Setxattr(ctx, opts)
+	return fd.impl.Setxattr(ctx, *opts)
 }
 
 // Removexattr removes the given extended attribute from the file represented
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index d45e602ce..f4c111926 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -130,14 +130,14 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
 
 // Listxattr implements FileDescriptionImpl.Listxattr analogously to
 // inode_operations::listxattr == NULL in Linux.
-func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) {
+func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) {
 	// This isn't exactly accurate; see FileDescription.Listxattr.
 	return nil, syserror.ENOTSUP
 }
 
 // Getxattr implements FileDescriptionImpl.Getxattr analogously to
 // inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) {
+func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) {
 	return "", syserror.ENOTSUP
 }
 
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index bef1bd312..a537a29d1 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -442,7 +442,13 @@ type FilesystemImpl interface {
 	// - If extended attributes are not supported by the filesystem,
 	// ListxattrAt returns nil. (See FileDescription.Listxattr for an
 	// explanation.)
-	ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error)
+	//
+	// - If the size of the list (including a NUL terminating byte after every
+	// entry) would exceed size, ERANGE may be returned. Note that
+	// implementations are free to ignore size entirely and return without
+	// error). In all cases, if size is 0, the list should be returned without
+	// error, regardless of size.
+	ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
 
 	// GetxattrAt returns the value associated with the given extended
 	// attribute for the file at rp.
@@ -451,7 +457,15 @@ type FilesystemImpl interface {
 	//
 	// - If extended attributes are not supported by the filesystem, GetxattrAt
 	// returns ENOTSUP.
-	GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error)
+	//
+	// - If an extended attribute named opts.Name does not exist, ENODATA is
+	// returned.
+	//
+	// - If the size of the return value exceeds opts.Size, ERANGE may be
+	// returned (note that implementations are free to ignore opts.Size entirely
+	// and return without error). In all cases, if opts.Size is 0, the value
+	// should be returned without error, regardless of size.
+	GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error)
 
 	// SetxattrAt changes the value associated with the given extended
 	// attribute for the file at rp.
@@ -460,6 +474,10 @@ type FilesystemImpl interface {
 	//
 	// - If extended attributes are not supported by the filesystem, SetxattrAt
 	// returns ENOTSUP.
+	//
+	// - If XATTR_CREATE is set in opts.Flag and opts.Name already exists,
+	// EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist,
+	// ENODATA is returned.
 	SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
 
 	// RemovexattrAt removes the given extended attribute from the file at rp.
@@ -468,6 +486,8 @@ type FilesystemImpl interface {
 	//
 	// - If extended attributes are not supported by the filesystem,
 	// RemovexattrAt returns ENOTSUP.
+	//
+	// - If name does not exist, ENODATA is returned.
 	RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
 
 	// BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
diff --git a/pkg/sentry/vfs/memxattr/BUILD b/pkg/sentry/vfs/memxattr/BUILD
new file mode 100644
index 000000000..d8c4d27b9
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "memxattr",
+    srcs = ["xattr.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go
new file mode 100644
index 000000000..cc1e7d764
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/xattr.go
@@ -0,0 +1,102 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memxattr provides a default, in-memory extended attribute
+// implementation.
+package memxattr
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// SimpleExtendedAttributes implements extended attributes using a map of
+// names to values.
+//
+// +stateify savable
+type SimpleExtendedAttributes struct {
+	// mu protects the below fields.
+	mu     sync.RWMutex `state:"nosave"`
+	xattrs map[string]string
+}
+
+// Getxattr returns the value at 'name'.
+func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) {
+	x.mu.RLock()
+	value, ok := x.xattrs[opts.Name]
+	x.mu.RUnlock()
+	if !ok {
+		return "", syserror.ENODATA
+	}
+	// Check that the size of the buffer provided in getxattr(2) is large enough
+	// to contain the value.
+	if opts.Size != 0 && uint64(len(value)) > opts.Size {
+		return "", syserror.ERANGE
+	}
+	return value, nil
+}
+
+// Setxattr sets 'value' at 'name'.
+func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
+	x.mu.Lock()
+	defer x.mu.Unlock()
+	if x.xattrs == nil {
+		if opts.Flags&linux.XATTR_REPLACE != 0 {
+			return syserror.ENODATA
+		}
+		x.xattrs = make(map[string]string)
+	}
+
+	_, ok := x.xattrs[opts.Name]
+	if ok && opts.Flags&linux.XATTR_CREATE != 0 {
+		return syserror.EEXIST
+	}
+	if !ok && opts.Flags&linux.XATTR_REPLACE != 0 {
+		return syserror.ENODATA
+	}
+
+	x.xattrs[opts.Name] = opts.Value
+	return nil
+}
+
+// Listxattr returns all names in xattrs.
+func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
+	// Keep track of the size of the buffer needed in listxattr(2) for the list.
+	listSize := 0
+	x.mu.RLock()
+	names := make([]string, 0, len(x.xattrs))
+	for n := range x.xattrs {
+		names = append(names, n)
+		// Add one byte per null terminator.
+		listSize += len(n) + 1
+	}
+	x.mu.RUnlock()
+	if size != 0 && uint64(listSize) > size {
+		return nil, syserror.ERANGE
+	}
+	return names, nil
+}
+
+// Removexattr removes the xattr at 'name'.
+func (x *SimpleExtendedAttributes) Removexattr(name string) error {
+	x.mu.Lock()
+	defer x.mu.Unlock()
+	if _, ok := x.xattrs[name]; !ok {
+		return syserror.ENODATA
+	}
+	delete(x.xattrs, name)
+	return nil
+}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 2f04bf882..534528ce6 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -132,6 +132,20 @@ type SetStatOptions struct {
 	Stat linux.Statx
 }
 
+// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(),
+// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and
+// FileDescriptionImpl.Getxattr().
+type GetxattrOptions struct {
+	// Name is the name of the extended attribute to retrieve.
+	Name string
+
+	// Size is the maximum value size that the caller will tolerate. If the value
+	// is larger than size, getxattr methods may return ERANGE, but they are also
+	// free to ignore the hint entirely (i.e. the value returned may be larger
+	// than size). All size checking is done independently at the syscall layer.
+	Size uint64
+}
+
 // SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
 // FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
 // FileDescriptionImpl.Setxattr().
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 720b90d8f..f592913d5 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -680,10 +680,10 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
 
 // ListxattrAt returns all extended attribute names for the file at the given
 // path.
-func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) {
+func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp)
+		names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return names, nil
@@ -705,10 +705,10 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
 
 // GetxattrAt returns the value associated with the given extended attribute
 // for the file at the given path.
-func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) {
+func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name)
+		val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return val, nil
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index 8b00ef44c..3231732ec 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -41,12 +41,12 @@ class XattrTest : public FileTest {};
 
 TEST_F(XattrTest, XattrNonexistentFile) {
   const char* path = "/does/not/exist";
-  EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
-              SyscallFailsWithErrno(ENOENT));
-  EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
+  const char* name = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENOENT));
   EXPECT_THAT(listxattr(path, nullptr, 0), SyscallFailsWithErrno(ENOENT));
-  EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(ENOENT));
 }
 
 TEST_F(XattrTest, XattrNullName) {
-- 
cgit v1.2.3


From daf3322498b698518a3c8545ad05f790deb3848c Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Fri, 10 Apr 2020 20:31:07 -0700
Subject: Add logging message for noNewPrivileges OCI option.

noNewPrivileges is ignored if set to false since gVisor assumes that
PR_SET_NO_NEW_PRIVS is always enabled.

PiperOrigin-RevId: 305991947
---
 pkg/sentry/kernel/task_identity.go     | 2 +-
 pkg/sentry/syscalls/linux/sys_prctl.go | 4 ++--
 runsc/specutils/specutils.go           | 6 ++++++
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
index ce3e6ef28..0325967e4 100644
--- a/pkg/sentry/kernel/task_identity.go
+++ b/pkg/sentry/kernel/task_identity.go
@@ -455,7 +455,7 @@ func (t *Task) SetKeepCaps(k bool) {
 	t.creds.Store(creds)
 }
 
-// updateCredsForExec updates t.creds to reflect an execve().
+// updateCredsForExecLocked updates t.creds to reflect an execve().
 //
 // NOTE(b/30815691): We currently do not implement privileged executables
 // (set-user/group-ID bits and file capabilities). This allows us to make a lot
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index 9c6728530..f92bf8096 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -161,8 +161,8 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 {
 			return 0, nil, syserror.EINVAL
 		}
-		// no_new_privs is assumed to always be set. See
-		// kernel.Task.updateCredsForExec.
+		// PR_SET_NO_NEW_PRIVS is assumed to always be set.
+		// See kernel.Task.updateCredsForExecLocked.
 		return 0, nil, nil
 
 	case linux.PR_GET_NO_NEW_PRIVS:
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 0f4a9cf6d..837d5e238 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -92,6 +92,12 @@ func ValidateSpec(spec *specs.Spec) error {
 		log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile)
 	}
 
+	// PR_SET_NO_NEW_PRIVS is assumed to always be set.
+	// See kernel.Task.updateCredsForExecLocked.
+	if !spec.Process.NoNewPrivileges {
+		log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.")
+	}
+
 	// TODO(gvisor.dev/issue/510): Apply seccomp to application inside sandbox.
 	if spec.Linux != nil && spec.Linux.Seccomp != nil {
 		log.Warningf("Seccomp spec is being ignored")
-- 
cgit v1.2.3


From 20203494680f869669ab5318b36e9470ad4b3e7b Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Sat, 11 Apr 2020 06:45:15 -0700
Subject: Improve error messages when parsing headers.

Tested:
  Looked at output of failing tests.
PiperOrigin-RevId: 306031407
---
 test/packetimpact/testbench/connections.go | 2 +-
 test/packetimpact/testbench/layers.go      | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index ed8689fd3..b11a534ac 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -213,7 +213,7 @@ func (conn *TCPIPv4) RecvFrame(timeout time.Duration) Layers {
 		}
 		layers, err := ParseEther(b)
 		if err != nil {
-			conn.t.Logf("can't parse frame: %s", err)
+			conn.t.Logf("debug: can't parse frame, ignoring: %s", err)
 			continue // Ignore packets that can't be parsed.
 		}
 		if !conn.incoming.match(layers) {
diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go
index 093a46e23..ff800377e 100644
--- a/test/packetimpact/testbench/layers.go
+++ b/test/packetimpact/testbench/layers.go
@@ -153,7 +153,7 @@ func (l *Ether) toBytes() ([]byte, error) {
 			fields.Type = header.IPv4ProtocolNumber
 		default:
 			// TODO(b/150301488): Support more protocols, like IPv6.
-			return nil, fmt.Errorf("can't deduce the ethernet header's next protocol: %d", n)
+			return nil, fmt.Errorf("ethernet header's next layer is unrecognized: %#v", n)
 		}
 	}
 	h.Encode(fields)
@@ -191,7 +191,7 @@ func ParseEther(b []byte) (Layers, error) {
 		return append(layers, moreLayers...), nil
 	default:
 		// TODO(b/150301488): Support more protocols, like IPv6.
-		return nil, fmt.Errorf("can't deduce the ethernet header's next protocol: %#v", b)
+		return nil, fmt.Errorf("ethernet header's type field is unrecognized: %#04x", h.Type())
 	}
 }
 
@@ -274,7 +274,7 @@ func (l *IPv4) toBytes() ([]byte, error) {
 			fields.Protocol = uint8(header.UDPProtocolNumber)
 		default:
 			// TODO(b/150301488): Support more protocols as needed.
-			return nil, fmt.Errorf("can't deduce the ip header's next protocol: %#v", n)
+			return nil, fmt.Errorf("ipv4 header's next layer is unrecognized: %#v", n)
 		}
 	}
 	if l.SrcAddr != nil {
@@ -344,7 +344,7 @@ func ParseIPv4(b []byte) (Layers, error) {
 		}
 		return append(layers, moreLayers...), nil
 	}
-	return nil, fmt.Errorf("can't deduce the ethernet header's next protocol: %d", h.Protocol())
+	return nil, fmt.Errorf("ipv4 header's protocol field is unrecognized: %#02x", h.Protocol())
 }
 
 func (l *IPv4) match(other Layer) bool {
-- 
cgit v1.2.3


From ef0b5584e5389cc392e03d20976a15974f277251 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Sun, 12 Apr 2020 18:32:18 -0700
Subject: Refactor parser to use a for loop instead of recursion.

This makes the code shorter and less repetitive.

TESTED:
  All unit tests still pass.
PiperOrigin-RevId: 306161475
---
 test/packetimpact/testbench/connections.go | 12 +---
 test/packetimpact/testbench/layers.go      | 93 +++++++++++++++++-------------
 2 files changed, 55 insertions(+), 50 deletions(-)

diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index b11a534ac..79c0ccf5c 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -211,11 +211,7 @@ func (conn *TCPIPv4) RecvFrame(timeout time.Duration) Layers {
 		if b == nil {
 			break
 		}
-		layers, err := ParseEther(b)
-		if err != nil {
-			conn.t.Logf("debug: can't parse frame, ignoring: %s", err)
-			continue // Ignore packets that can't be parsed.
-		}
+		layers := Parse(ParseEther, b)
 		if !conn.incoming.match(layers) {
 			continue // Ignore packets that don't match the expected incoming.
 		}
@@ -418,11 +414,7 @@ func (conn *UDPIPv4) Recv(timeout time.Duration) *UDP {
 		if b == nil {
 			break
 		}
-		layers, err := ParseEther(b)
-		if err != nil {
-			conn.t.Logf("can't parse frame: %s", err)
-			continue // Ignore packets that can't be parsed.
-		}
+		layers := Parse(ParseEther, b)
 		if !conn.incoming.match(layers) {
 			continue // Ignore packets that don't match the expected incoming.
 		}
diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go
index ff800377e..b467c15cc 100644
--- a/test/packetimpact/testbench/layers.go
+++ b/test/packetimpact/testbench/layers.go
@@ -172,27 +172,46 @@ func NetworkProtocolNumber(v tcpip.NetworkProtocolNumber) *tcpip.NetworkProtocol
 	return &v
 }
 
+// LayerParser parses the input bytes and returns a Layer along with the next
+// LayerParser to run. If there is no more parsing to do, the returned
+// LayerParser is nil.
+type LayerParser func([]byte) (Layer, LayerParser)
+
+// Parse parses bytes starting with the first LayerParser and using successive
+// LayerParsers until all the bytes are parsed.
+func Parse(parser LayerParser, b []byte) Layers {
+	var layers Layers
+	for {
+		var layer Layer
+		layer, parser = parser(b)
+		layers = append(layers, layer)
+		if parser == nil {
+			break
+		}
+		b = b[layer.length():]
+	}
+	layers.linkLayers()
+	return layers
+}
+
 // ParseEther parses the bytes assuming that they start with an ethernet header
 // and continues parsing further encapsulations.
-func ParseEther(b []byte) (Layers, error) {
+func ParseEther(b []byte) (Layer, LayerParser) {
 	h := header.Ethernet(b)
 	ether := Ether{
 		SrcAddr: LinkAddress(h.SourceAddress()),
 		DstAddr: LinkAddress(h.DestinationAddress()),
 		Type:    NetworkProtocolNumber(h.Type()),
 	}
-	layers := Layers{&ether}
+	var nextParser LayerParser
 	switch h.Type() {
 	case header.IPv4ProtocolNumber:
-		moreLayers, err := ParseIPv4(b[ether.length():])
-		if err != nil {
-			return nil, err
-		}
-		return append(layers, moreLayers...), nil
+		nextParser = ParseIPv4
 	default:
-		// TODO(b/150301488): Support more protocols, like IPv6.
-		return nil, fmt.Errorf("ethernet header's type field is unrecognized: %#04x", h.Type())
+		// Assume that the rest is a payload.
+		nextParser = ParsePayload
 	}
+	return &ether, nextParser
 }
 
 func (l *Ether) match(other Layer) bool {
@@ -313,7 +332,7 @@ func Address(v tcpip.Address) *tcpip.Address {
 
 // ParseIPv4 parses the bytes assuming that they start with an ipv4 header and
 // continues parsing further encapsulations.
-func ParseIPv4(b []byte) (Layers, error) {
+func ParseIPv4(b []byte) (Layer, LayerParser) {
 	h := header.IPv4(b)
 	tos, _ := h.TOS()
 	ipv4 := IPv4{
@@ -329,22 +348,17 @@ func ParseIPv4(b []byte) (Layers, error) {
 		SrcAddr:        Address(h.SourceAddress()),
 		DstAddr:        Address(h.DestinationAddress()),
 	}
-	layers := Layers{&ipv4}
+	var nextParser LayerParser
 	switch h.TransportProtocol() {
 	case header.TCPProtocolNumber:
-		moreLayers, err := ParseTCP(b[ipv4.length():])
-		if err != nil {
-			return nil, err
-		}
-		return append(layers, moreLayers...), nil
+		nextParser = ParseTCP
 	case header.UDPProtocolNumber:
-		moreLayers, err := ParseUDP(b[ipv4.length():])
-		if err != nil {
-			return nil, err
-		}
-		return append(layers, moreLayers...), nil
+		nextParser = ParseUDP
+	default:
+		// Assume that the rest is a payload.
+		nextParser = ParsePayload
 	}
-	return nil, fmt.Errorf("ipv4 header's protocol field is unrecognized: %#02x", h.Protocol())
+	return &ipv4, nextParser
 }
 
 func (l *IPv4) match(other Layer) bool {
@@ -470,7 +484,7 @@ func Uint32(v uint32) *uint32 {
 
 // ParseTCP parses the bytes assuming that they start with a tcp header and
 // continues parsing further encapsulations.
-func ParseTCP(b []byte) (Layers, error) {
+func ParseTCP(b []byte) (Layer, LayerParser) {
 	h := header.TCP(b)
 	tcp := TCP{
 		SrcPort:       Uint16(h.SourcePort()),
@@ -483,12 +497,7 @@ func ParseTCP(b []byte) (Layers, error) {
 		Checksum:      Uint16(h.Checksum()),
 		UrgentPointer: Uint16(h.UrgentPointer()),
 	}
-	layers := Layers{&tcp}
-	moreLayers, err := ParsePayload(b[tcp.length():])
-	if err != nil {
-		return nil, err
-	}
-	return append(layers, moreLayers...), nil
+	return &tcp, ParsePayload
 }
 
 func (l *TCP) match(other Layer) bool {
@@ -557,8 +566,8 @@ func setUDPChecksum(h *header.UDP, udp *UDP) error {
 }
 
 // ParseUDP parses the bytes assuming that they start with a udp header and
-// continues parsing further encapsulations.
-func ParseUDP(b []byte) (Layers, error) {
+// returns the parsed layer and the next parser to use.
+func ParseUDP(b []byte) (Layer, LayerParser) {
 	h := header.UDP(b)
 	udp := UDP{
 		SrcPort:  Uint16(h.SourcePort()),
@@ -566,12 +575,7 @@ func ParseUDP(b []byte) (Layers, error) {
 		Length:   Uint16(h.Length()),
 		Checksum: Uint16(h.Checksum()),
 	}
-	layers := Layers{&udp}
-	moreLayers, err := ParsePayload(b[udp.length():])
-	if err != nil {
-		return nil, err
-	}
-	return append(layers, moreLayers...), nil
+	return &udp, ParsePayload
 }
 
 func (l *UDP) match(other Layer) bool {
@@ -603,11 +607,11 @@ func (l *Payload) String() string {
 
 // ParsePayload parses the bytes assuming that they start with a payload and
 // continue to the end. There can be no further encapsulations.
-func ParsePayload(b []byte) (Layers, error) {
+func ParsePayload(b []byte) (Layer, LayerParser) {
 	payload := Payload{
 		Bytes: b,
 	}
-	return Layers{&payload}, nil
+	return &payload, nil
 }
 
 func (l *Payload) toBytes() ([]byte, error) {
@@ -625,15 +629,24 @@ func (l *Payload) length() int {
 // Layers is an array of Layer and supports similar functions to Layer.
 type Layers []Layer
 
-func (ls *Layers) toBytes() ([]byte, error) {
+// linkLayers sets the linked-list ponters in ls.
+func (ls *Layers) linkLayers() {
 	for i, l := range *ls {
 		if i > 0 {
 			l.setPrev((*ls)[i-1])
+		} else {
+			l.setPrev(nil)
 		}
 		if i+1 < len(*ls) {
 			l.setNext((*ls)[i+1])
+		} else {
+			l.setNext(nil)
 		}
 	}
+}
+
+func (ls *Layers) toBytes() ([]byte, error) {
+	ls.linkLayers()
 	outBytes := []byte{}
 	for _, l := range *ls {
 		layerBytes, err := l.toBytes()
-- 
cgit v1.2.3


From 445c366581637b64336a18d69519faee5a444f5d Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 13 Apr 2020 10:51:08 -0700
Subject: Fix VFS2 getdents()/getdents64() alignment.

PiperOrigin-RevId: 306263615
---
 pkg/sentry/syscalls/linux/vfs2/getdents.go | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
index a61cc5059..62e98817d 100644
--- a/pkg/sentry/syscalls/linux/vfs2/getdents.go
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -97,6 +97,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
 		//     char           d_name[]; /* Filename (null-terminated) */
 		// };
 		size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
+		size = (size + 7) &^ 7 // round up to multiple of 8
 		if size > cb.remaining {
 			return syserror.EINVAL
 		}
@@ -106,7 +107,12 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
 		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
 		buf[18] = dirent.Type
 		copy(buf[19:], dirent.Name)
-		buf[size-1] = 0 // NUL terminator
+		// Zero out all remaining bytes in buf, including the NUL terminator
+		// after dirent.Name.
+		bufTail := buf[19+len(dirent.Name):]
+		for i := range bufTail {
+			bufTail[i] = 0
+		}
 	} else {
 		// struct linux_dirent {
 		//     unsigned long  d_ino;     /* Inode number */
@@ -125,6 +131,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
 			panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width()))
 		}
 		size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name)
+		size = (size + 7) &^ 7 // round up to multiple of sizeof(long)
 		if size > cb.remaining {
 			return syserror.EINVAL
 		}
@@ -133,9 +140,14 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
 		usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
 		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
 		copy(buf[18:], dirent.Name)
-		buf[size-3] = 0 // NUL terminator
-		buf[size-2] = 0 // zero padding byte
-		buf[size-1] = dirent.Type
+		// Zero out all remaining bytes in buf, including the NUL terminator
+		// after dirent.Name and the zero padding byte between the name and
+		// dirent type.
+		bufTail := buf[18+len(dirent.Name):]
+		for i := range bufTail {
+			bufTail[i] = 0
+		}
+		bufTail[2] = dirent.Type
 	}
 	n, err := cb.t.CopyOutBytes(cb.addr, buf)
 	if err != nil {
-- 
cgit v1.2.3


From 6a4d17a31dc209afbbca66e871a7c6dc299c167b Mon Sep 17 00:00:00 2001
From: Jon Budd <jonbudd@google.com>
Date: Mon, 13 Apr 2020 11:01:02 -0700
Subject: Remove obsolete TODOs for b/38173783

The comments in the ticket indicate that this behavior
is fine and that the ticket should be closed, so we shouldn't
need pointers to the ticket.

PiperOrigin-RevId: 306266071
---
 pkg/context/context.go               | 4 ----
 pkg/sentry/arch/stack.go             | 3 ---
 pkg/sentry/fs/gofer/file_state.go    | 1 -
 pkg/sentry/fs/gofer/handles.go       | 1 -
 pkg/sentry/fs/gofer/inode_state.go   | 1 -
 pkg/sentry/fs/gofer/session_state.go | 1 -
 pkg/sentry/fs/inode.go               | 1 -
 pkg/sentry/kernel/shm/shm.go         | 2 +-
 pkg/sentry/kernel/task_context.go    | 1 -
 pkg/sentry/kernel/task_signals.go    | 2 --
 pkg/usermem/usermem.go               | 3 ---
 11 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/pkg/context/context.go b/pkg/context/context.go
index 23e009ef3..5319b6d8d 100644
--- a/pkg/context/context.go
+++ b/pkg/context/context.go
@@ -127,10 +127,6 @@ func (logContext) Value(key interface{}) interface{} {
 var bgContext = &logContext{Logger: log.Log()}
 
 // Background returns an empty context using the default logger.
-//
-// Users should be wary of using a Background context. Please tag any use with
-// FIXME(b/38173783) and a note to remove this use.
-//
 // Generally, one should use the Task as their context when available, or avoid
 // having to use a context in places where a Task is unavailable.
 //
diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go
index 09bceabc9..1108fa0bd 100644
--- a/pkg/sentry/arch/stack.go
+++ b/pkg/sentry/arch/stack.go
@@ -97,7 +97,6 @@ func (s *Stack) Push(vals ...interface{}) (usermem.Addr, error) {
 		if c < 0 {
 			return 0, fmt.Errorf("bad binary.Size for %T", v)
 		}
-		// TODO(b/38173783): Use a real context.Context.
 		n, err := usermem.CopyObjectOut(context.Background(), s.IO, s.Bottom-usermem.Addr(c), norm, usermem.IOOpts{})
 		if err != nil || c != n {
 			return 0, err
@@ -121,11 +120,9 @@ func (s *Stack) Pop(vals ...interface{}) (usermem.Addr, error) {
 		var err error
 		if isVaddr {
 			value := s.Arch.Native(uintptr(0))
-			// TODO(b/38173783): Use a real context.Context.
 			n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, value, usermem.IOOpts{})
 			*vaddr = usermem.Addr(s.Arch.Value(value))
 		} else {
-			// TODO(b/38173783): Use a real context.Context.
 			n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, v, usermem.IOOpts{})
 		}
 		if err != nil {
diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go
index ff96b28ba..edd6576aa 100644
--- a/pkg/sentry/fs/gofer/file_state.go
+++ b/pkg/sentry/fs/gofer/file_state.go
@@ -34,7 +34,6 @@ func (f *fileOperations) afterLoad() {
 		flags := f.flags
 		flags.Truncate = false
 
-		// TODO(b/38173783): Context is not plumbed to save/restore.
 		f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), flags, f.inodeOperations.cachingInodeOps)
 		if err != nil {
 			return fmt.Errorf("failed to re-open handle: %v", err)
diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go
index 9f7c3e89f..fc14249be 100644
--- a/pkg/sentry/fs/gofer/handles.go
+++ b/pkg/sentry/fs/gofer/handles.go
@@ -57,7 +57,6 @@ func (h *handles) DecRef() {
 				}
 			}
 		}
-		// FIXME(b/38173783): Context is not plumbed here.
 		if err := h.File.close(context.Background()); err != nil {
 			log.Warningf("error closing p9 file: %v", err)
 		}
diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go
index 238f7804c..a3402e343 100644
--- a/pkg/sentry/fs/gofer/inode_state.go
+++ b/pkg/sentry/fs/gofer/inode_state.go
@@ -123,7 +123,6 @@ func (i *inodeFileState) afterLoad() {
 			// beforeSave.
 			return fmt.Errorf("failed to find path for inode number %d. Device %s contains %s", i.sattr.InodeID, i.s.connID, fs.InodeMappings(i.s.inodeMappings))
 		}
-		// TODO(b/38173783): Context is not plumbed to save/restore.
 		ctx := &dummyClockContext{context.Background()}
 
 		_, i.file, err = i.s.attach.walk(ctx, splitAbsolutePath(name))
diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go
index 111da59f9..2d398b753 100644
--- a/pkg/sentry/fs/gofer/session_state.go
+++ b/pkg/sentry/fs/gofer/session_state.go
@@ -104,7 +104,6 @@ func (s *session) afterLoad() {
 	// If private unix sockets are enabled, create and fill the session's endpoint
 	// maps.
 	if opts.privateunixsocket {
-		// TODO(b/38173783): Context is not plumbed to save/restore.
 		ctx := &dummyClockContext{context.Background()}
 
 		if err = s.restoreEndpointMaps(ctx); err != nil {
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index 73f89abcc..a34fbc946 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -102,7 +102,6 @@ func (i *Inode) DecRef() {
 
 // destroy releases the Inode and releases the msrc reference taken.
 func (i *Inode) destroy() {
-	// FIXME(b/38173783): Context is not plumbed here.
 	ctx := context.Background()
 	if err := i.WriteOut(ctx); err != nil {
 		// FIXME(b/65209558): Mark as warning again once noatime is
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 208569057..f66cfcc7f 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -461,7 +461,7 @@ func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.A
 func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
-	// TODO(b/38173783): RemoveMapping may be called during task exit, when ctx
+	// RemoveMapping may be called during task exit, when ctx
 	// is context.Background. Gracefully handle missing clocks. Failing to
 	// update the detach time in these cases is ok, since no one can observe the
 	// omission.
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index c115e8d1f..9fa528384 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -58,7 +58,6 @@ func (tc *TaskContext) release() {
 	// Nil out pointers so that if the task is saved after release, it doesn't
 	// follow the pointers to possibly now-invalid objects.
 	if tc.MemoryManager != nil {
-		// TODO(b/38173783)
 		tc.MemoryManager.DecUsers(context.Background())
 		tc.MemoryManager = nil
 	}
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 8802db142..6aa798346 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -513,8 +513,6 @@ func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool {
 	if t.stop != nil {
 		return false
 	}
-	// - TODO(b/38173783): No special case for when t is also the sending task,
-	// because the identity of the sender is unknown.
 	// - Do not choose tasks that have already been interrupted, as they may be
 	// busy handling another signal.
 	if len(t.interruptChan) != 0 {
diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go
index d2f4403b0..cd6a0ea6b 100644
--- a/pkg/usermem/usermem.go
+++ b/pkg/usermem/usermem.go
@@ -29,9 +29,6 @@ import (
 )
 
 // IO provides access to the contents of a virtual memory space.
-//
-// FIXME(b/38173783): Implementations of IO cannot expect ctx to contain any
-// meaningful data.
 type IO interface {
 	// CopyOut copies len(src) bytes from src to the memory mapped at addr. It
 	// returns the number of bytes copied. If the number of bytes copied is <
-- 
cgit v1.2.3


From aa75a3da5188d8f62d00fc6590708ca4886526b4 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 13 Apr 2020 12:47:32 -0700
Subject: Fix build.sh and VM targets.

PiperOrigin-RevId: 306289643
---
 benchmarks/runner/__init__.py |   6 +-
 benchmarks/runner/commands.py |   8 +--
 tools/image_build.sh          |  98 ------------------------------
 tools/images/BUILD            |   9 +--
 tools/images/README.md        |  42 +++++++++++++
 tools/images/build.sh         |  35 ++++++-----
 tools/images/defs.bzl         | 136 ++++++++++++++++++++++--------------------
 tools/images/zone.sh          |  17 ++++++
 8 files changed, 161 insertions(+), 190 deletions(-)
 delete mode 100755 tools/image_build.sh
 create mode 100644 tools/images/README.md
 create mode 100755 tools/images/zone.sh

diff --git a/benchmarks/runner/__init__.py b/benchmarks/runner/__init__.py
index ca785a148..fc59cf505 100644
--- a/benchmarks/runner/__init__.py
+++ b/benchmarks/runner/__init__.py
@@ -19,6 +19,7 @@ import logging
 import pkgutil
 import pydoc
 import re
+import subprocess
 import sys
 import types
 from typing import List
@@ -125,9 +126,8 @@ def run_gcp(ctx, image_file: str, zone_file: str, internal: bool,
   """Runs all benchmarks on GCP instances."""
 
   # Resolve all files.
-  image = open(image_file).read().rstrip()
-  zone = open(zone_file).read().rstrip()
-
+  image = subprocess.check_output([image_file]).rstrip()
+  zone = subprocess.check_output([zone_file]).rstrip()
   key_file = harness.make_key()
 
   producer = gcloud_producer.GCloudProducer(
diff --git a/benchmarks/runner/commands.py b/benchmarks/runner/commands.py
index 194804527..e8289f6c5 100644
--- a/benchmarks/runner/commands.py
+++ b/benchmarks/runner/commands.py
@@ -101,15 +101,15 @@ class GCPCommand(RunCommand):
 
     image_file = click.core.Option(
         ("--image_file",),
-        help="The file containing the image for VMs.",
+        help="The binary that emits the GCP image.",
         default=os.path.join(
-            os.path.dirname(__file__), "../../tools/images/ubuntu1604.txt"),
+            os.path.dirname(__file__), "../../tools/images/ubuntu1604"),
     )
     zone_file = click.core.Option(
         ("--zone_file",),
-        help="The file containing the GCP zone.",
+        help="The binary that emits the GCP zone.",
         default=os.path.join(
-            os.path.dirname(__file__), "../../tools/images/zone.txt"),
+            os.path.dirname(__file__), "../../tools/images/zone"),
     )
     internal = click.core.Option(
         ("--internal/--no-internal",),
diff --git a/tools/image_build.sh b/tools/image_build.sh
deleted file mode 100755
index 9b20a740d..000000000
--- a/tools/image_build.sh
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This script is responsible for building a new GCP image that: 1) has nested
-# virtualization enabled, and 2) has been completely set up with the
-# image_setup.sh script. This script should be idempotent, as we memoize the
-# setup script with a hash and check for that name.
-#
-# The GCP project name should be defined via a gcloud config.
-
-set -xeo pipefail
-
-# Parameters.
-declare -r ZONE=${ZONE:-us-central1-f}
-declare -r USERNAME=${USERNAME:-test}
-declare -r IMAGE_PROJECT=${IMAGE_PROJECT:-ubuntu-os-cloud}
-declare -r IMAGE_FAMILY=${IMAGE_FAMILY:-ubuntu-1604-lts}
-
-# Random names.
-declare -r DISK_NAME=$(mktemp -u disk-XXXXXX | tr A-Z a-z)
-declare -r SNAPSHOT_NAME=$(mktemp -u snapshot-XXXXXX | tr A-Z a-z)
-declare -r INSTANCE_NAME=$(mktemp -u build-XXXXXX | tr A-Z a-z)
-
-# Hashes inputs.
-declare -r SETUP_BLOB=$(echo ${ZONE} ${USERNAME} ${IMAGE_PROJECT} ${IMAGE_FAMILY} && sha256sum "$@")
-declare -r SETUP_HASH=$(echo ${SETUP_BLOB} | sha256sum - | cut -d' ' -f1 | cut -c 1-16)
-declare -r IMAGE_NAME=${IMAGE_NAME:-image-}${SETUP_HASH}
-
-# Does the image already exist? Skip the build.
-declare -r existing=$(gcloud compute images list --filter="name=(${IMAGE_NAME})" --format="value(name)")
-if ! [[ -z "${existing}" ]]; then
-  echo "${existing}"
-  exit 0
-fi
-
-# Set the zone for all actions.
-gcloud config set compute/zone "${ZONE}"
-
-# Start a unique instance. Note that this instance will have a unique persistent
-# disk as it's boot disk with the same name as the instance.
-gcloud compute instances create \
-    --quiet \
-    --image-project "${IMAGE_PROJECT}" \
-    --image-family "${IMAGE_FAMILY}" \
-    --boot-disk-size "200GB" \
-    "${INSTANCE_NAME}"
-function cleanup {
-    gcloud compute instances delete --quiet "${INSTANCE_NAME}"
-}
-trap cleanup EXIT
-
-# Wait for the instance to become available.
-declare attempts=0
-while [[ "${attempts}" -lt 30 ]]; do
-  attempts=$((${attempts}+1))
-  if gcloud compute ssh "${USERNAME}"@"${INSTANCE_NAME}" -- true; then
-    break
-  fi
-done
-if [[ "${attempts}" -ge 30 ]]; then
-  echo "too many attempts: failed"
-  exit 1
-fi
-
-# Run the install scripts provided.
-for arg; do
-  gcloud compute ssh "${USERNAME}"@"${INSTANCE_NAME}" -- sudo bash - <"${arg}"
-done
-
-# Stop the instance; required before creating an image.
-gcloud compute instances stop --quiet "${INSTANCE_NAME}"
-
-# Create a snapshot of the instance disk.
-gcloud compute disks snapshot \
-    --quiet \
-    --zone="${ZONE}" \
-    --snapshot-names="${SNAPSHOT_NAME}" \
-    "${INSTANCE_NAME}"
-
-# Create the disk image.
-gcloud compute images create \
-    --quiet \
-    --source-snapshot="${SNAPSHOT_NAME}" \
-    --licenses="https://www.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx" \
-    "${IMAGE_NAME}"
diff --git a/tools/images/BUILD b/tools/images/BUILD
index 66ffd02aa..8d319e3e4 100644
--- a/tools/images/BUILD
+++ b/tools/images/BUILD
@@ -6,14 +6,9 @@ package(
     licenses = ["notice"],
 )
 
-genrule(
+sh_binary(
     name = "zone",
-    outs = ["zone.txt"],
-    cmd = "gcloud config get-value compute/zone > \"$@\"",
-    tags = [
-        "local",
-        "manual",
-    ],
+    srcs = ["zone.sh"],
 )
 
 sh_binary(
diff --git a/tools/images/README.md b/tools/images/README.md
new file mode 100644
index 000000000..26c0f84f2
--- /dev/null
+++ b/tools/images/README.md
@@ -0,0 +1,42 @@
+# Images
+
+All commands in this directory require the `gcloud` project to be set.
+
+For example: `gcloud config set project gvisor-kokoro-testing`.
+
+Images can be generated by using the `vm_image` rule. This rule will generate a
+binary target that builds an image in an idempotent way, and can be referenced
+from other rules.
+
+For example:
+
+```
+vm_image(
+    name = "ubuntu",
+    project = "ubuntu-1604-lts",
+    family = "ubuntu-os-cloud",
+    scripts = [
+        "script.sh",
+        "other.sh",
+    ],
+)
+```
+
+These images can be built manually by executing the target. The output on
+`stdout` will be the image id (in the current project).
+
+Images are always named per the hash of all the hermetic input scripts. This
+allows images to be memoized quickly and easily.
+
+The `vm_test` rule can be used to execute a command remotely. This is still
+under development however, and will likely change over time.
+
+For example:
+
+```
+vm_test(
+    name = "mycommand",
+    image = ":ubuntu",
+    targets = [":test"],
+)
+```
diff --git a/tools/images/build.sh b/tools/images/build.sh
index f89f39cbd..f39f723b8 100755
--- a/tools/images/build.sh
+++ b/tools/images/build.sh
@@ -19,7 +19,7 @@
 # image_setup.sh script. This script should be idempotent, as we memoize the
 # setup script with a hash and check for that name.
 
-set -xeou pipefail
+set -eou pipefail
 
 # Parameters.
 declare -r USERNAME=${USERNAME:-test}
@@ -34,10 +34,10 @@ declare -r INSTANCE_NAME=$(mktemp -u build-XXXXXX | tr A-Z a-z)
 
 # Hash inputs in order to memoize the produced image.
 declare -r SETUP_HASH=$( (echo ${USERNAME} ${IMAGE_PROJECT} ${IMAGE_FAMILY} && cat "$@") | sha256sum - | cut -d' ' -f1 | cut -c 1-16)
-declare -r IMAGE_NAME=${IMAGE_FAMILY:-image-}${SETUP_HASH}
+declare -r IMAGE_NAME=${IMAGE_FAMILY:-image}-${SETUP_HASH}
 
 # Does the image already exist? Skip the build.
-declare -r existing=$(gcloud compute images list --filter="name=(${IMAGE_NAME})" --format="value(name)")
+declare -r existing=$(set -x; gcloud compute images list --filter="name=(${IMAGE_NAME})" --format="value(name)")
 if ! [[ -z "${existing}" ]]; then
   echo "${existing}"
   exit 0
@@ -48,28 +48,30 @@ export PATH=${PATH:-/bin:/usr/bin:/usr/local/bin}
 
 # Start a unique instance. Note that this instance will have a unique persistent
 # disk as it's boot disk with the same name as the instance.
-gcloud compute instances create \
+(set -x; gcloud compute instances create \
     --quiet \
     --image-project "${IMAGE_PROJECT}" \
     --image-family "${IMAGE_FAMILY}" \
     --boot-disk-size "200GB" \
     --zone "${ZONE}" \
-    "${INSTANCE_NAME}" >/dev/null
+    "${INSTANCE_NAME}" >/dev/null)
 function cleanup {
-    gcloud compute instances delete --quiet --zone "${ZONE}" "${INSTANCE_NAME}"
+  (set -x; gcloud compute instances delete --quiet --zone "${ZONE}" "${INSTANCE_NAME}")
 }
 trap cleanup EXIT
 
 # Wait for the instance to become available (up to 5 minutes).
+echo -n "Waiting for ${INSTANCE_NAME}"
 declare timeout=300
 declare success=0
 declare internal=""
 declare -r start=$(date +%s)
 declare -r end=$((${start}+${timeout}))
 while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do
-  if gcloud compute ssh --zone "${internal}" "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- env - true 2>/dev/null; then
+  echo -n "."
+  if gcloud compute ssh --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- env - true 2>/dev/null; then
     success=$((${success}+1))
-  elif gcloud compute ssh --zone --internal-ip "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- env - true 2>/dev/null; then
+  elif gcloud compute ssh --internal-ip --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- env - true 2>/dev/null; then
     success=$((${success}+1))
     internal="--internal-ip"
   fi
@@ -78,29 +80,34 @@ done
 if [[ "${success}" -eq "0" ]]; then
   echo "connect timed out after ${timeout} seconds."
   exit 1
+else
+  echo "done."
 fi
 
 # Run the install scripts provided.
 for arg; do
-  gcloud compute ssh --zone "${internal}" "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- sudo bash - <"${arg}" >/dev/null
+  (set -x; gcloud compute ssh ${internal} \
+      --zone "${ZONE}" \
+      "${USERNAME}"@"${INSTANCE_NAME}" -- \
+      sudo bash - <"${arg}" >/dev/null)
 done
 
 # Stop the instance; required before creating an image.
-gcloud compute instances stop --quiet --zone "${ZONE}" "${INSTANCE_NAME}" >/dev/null
+(set -x; gcloud compute instances stop --quiet --zone "${ZONE}" "${INSTANCE_NAME}" >/dev/null)
 
 # Create a snapshot of the instance disk.
-gcloud compute disks snapshot \
+(set -x; gcloud compute disks snapshot \
     --quiet \
     --zone "${ZONE}" \
     --snapshot-names="${SNAPSHOT_NAME}" \
-    "${INSTANCE_NAME}" >/dev/null
+    "${INSTANCE_NAME}" >/dev/null)
 
 # Create the disk image.
-gcloud compute images create \
+(set -x; gcloud compute images create \
     --quiet \
     --source-snapshot="${SNAPSHOT_NAME}" \
     --licenses="https://www.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx" \
-    "${IMAGE_NAME}" >/dev/null
+    "${IMAGE_NAME}" >/dev/null)
 
 # Finish up.
 echo "${IMAGE_NAME}"
diff --git a/tools/images/defs.bzl b/tools/images/defs.bzl
index de365d153..2847e1847 100644
--- a/tools/images/defs.bzl
+++ b/tools/images/defs.bzl
@@ -1,76 +1,49 @@
-"""Image configuration.
-
-Images can be generated by using the vm_image rule. For example,
-
-  vm_image(
-      name = "ubuntu",
-      project = "...",
-      family = "...",
-      scripts = [
-          "script.sh",
-          "other.sh",
-      ],
-  )
-
-This will always create an vm_image in the current default gcloud project. The
-rule has a text file as its output containing the image name. This will enforce
-serialization for all dependent rules.
-
-Images are always named per the hash of all the hermetic input scripts. This
-allows images to be memoized quickly and easily.
-
-The vm_test rule can be used to execute a command remotely. For example,
-
-  vm_test(
-      name = "mycommand",
-      image = ":myimage",
-      targets = [":test"],
-  )
-"""
+"""Image configuration. See README.md."""
 
 load("//tools:defs.bzl", "default_installer")
 
-def _vm_image_impl(ctx):
+# vm_image_builder is a rule that will construct a shell script that actually
+# generates a given VM image. Note that this does not _run_ the shell script
+# (although it can be run manually). It will be run manually during generation
+# of the vm_image target itself. This level of indirection is used so that the
+# build system itself only runs the builder once when multiple targets depend
+# on it, avoiding a set of races and conflicts.
+def _vm_image_builder_impl(ctx):
+    # Generate a binary that actually builds the image.
+    builder = ctx.actions.declare_file(ctx.label.name)
     script_paths = []
     for script in ctx.files.scripts:
         script_paths.append(script.short_path)
+    builder_content = "\n".join([
+        "#!/bin/bash",
+        "export ZONE=$(%s)" % ctx.files.zone[0].short_path,
+        "export USERNAME=%s" % ctx.attr.username,
+        "export IMAGE_PROJECT=%s" % ctx.attr.project,
+        "export IMAGE_FAMILY=%s" % ctx.attr.family,
+        "%s %s" % (ctx.files._builder[0].short_path, " ".join(script_paths)),
+        "",
+    ])
+    ctx.actions.write(builder, builder_content, is_executable = True)
 
-    resolved_inputs, argv, runfiles_manifests = ctx.resolve_command(
-        command = "USERNAME=%s ZONE=$(cat %s) IMAGE_PROJECT=%s IMAGE_FAMILY=%s %s %s > %s" %
-                  (
-                      ctx.attr.username,
-                      ctx.files.zone[0].path,
-                      ctx.attr.project,
-                      ctx.attr.family,
-                      ctx.executable.builder.path,
-                      " ".join(script_paths),
-                      ctx.outputs.out.path,
-                  ),
-        tools = [ctx.attr.builder] + ctx.attr.scripts,
-    )
-
-    ctx.actions.run_shell(
-        tools = resolved_inputs,
-        outputs = [ctx.outputs.out],
-        progress_message = "Building image...",
-        execution_requirements = {"local": "true"},
-        command = argv,
-        input_manifests = runfiles_manifests,
-    )
+    # Note that the scripts should only be files, and should not include any
+    # indirect transitive dependencies. The build script wouldn't work.
     return [DefaultInfo(
-        files = depset([ctx.outputs.out]),
-        runfiles = ctx.runfiles(files = [ctx.outputs.out]),
+        executable = builder,
+        runfiles = ctx.runfiles(
+            files = ctx.files.scripts + ctx.files._builder + ctx.files.zone,
+        ),
     )]
 
-_vm_image = rule(
+vm_image_builder = rule(
     attrs = {
-        "builder": attr.label(
+        "_builder": attr.label(
             executable = True,
             default = "//tools/images:builder",
             cfg = "host",
         ),
         "username": attr.string(default = "$(whoami)"),
         "zone": attr.label(
+            executable = True,
             default = "//tools/images:zone",
             cfg = "host",
         ),
@@ -78,20 +51,55 @@ _vm_image = rule(
         "project": attr.string(mandatory = True),
         "scripts": attr.label_list(allow_files = True),
     },
-    outputs = {
-        "out": "%{name}.txt",
+    executable = True,
+    implementation = _vm_image_builder_impl,
+)
+
+# See vm_image_builder above.
+def _vm_image_impl(ctx):
+    # Run the builder to generate our output.
+    echo = ctx.actions.declare_file(ctx.label.name)
+    resolved_inputs, argv, runfiles_manifests = ctx.resolve_command(
+        command = "echo -ne \"#!/bin/bash\\necho $(%s)\\n\" > %s && chmod 0755 %s" % (
+            ctx.files.builder[0].path,
+            echo.path,
+            echo.path,
+        ),
+        tools = [ctx.attr.builder],
+    )
+    ctx.actions.run_shell(
+        tools = resolved_inputs,
+        outputs = [echo],
+        progress_message = "Building image...",
+        execution_requirements = {"local": "true"},
+        command = argv,
+        input_manifests = runfiles_manifests,
+    )
+
+    # Return just the echo command. All of the builder runfiles have been
+    # resolved and consumed in the generation of the trivial echo script.
+    return [DefaultInfo(executable = echo)]
+
+_vm_image = rule(
+    attrs = {
+        "builder": attr.label(
+            executable = True,
+            cfg = "host",
+        ),
     },
+    executable = True,
     implementation = _vm_image_impl,
 )
 
-def vm_image(**kwargs):
-    _vm_image(
-        tags = [
-            "local",
-            "manual",
-        ],
+def vm_image(name, **kwargs):
+    vm_image_builder(
+        name = name + "_builder",
         **kwargs
     )
+    _vm_image(
+        name = name,
+        builder = ":" + name + "_builder",
+    )
 
 def _vm_test_impl(ctx):
     runner = ctx.actions.declare_file("%s-executer" % ctx.label.name)
diff --git a/tools/images/zone.sh b/tools/images/zone.sh
new file mode 100755
index 000000000..79569fb19
--- /dev/null
+++ b/tools/images/zone.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+exec gcloud config get-value compute/zone
-- 
cgit v1.2.3


From 5d885d7fb21414d903d57ffe2b95bcc62c098d6a Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 13 Apr 2020 13:01:28 -0700
Subject: Port socket-related syscalls to VFS2.

Note that most kinds of sockets are not yet supported in VFS2
(only Unix sockets are partially supported at the moment), so
these syscalls will still generally fail. Enabling them allows
us to begin running socket tests for VFS2 as more features are
ported over.

Updates #1476, #1478, #1484, #1485.

PiperOrigin-RevId: 306292294
---
 pkg/sentry/kernel/fd_table.go                      |   55 +
 pkg/sentry/kernel/task.go                          |    9 +
 pkg/sentry/syscalls/linux/sys_socket.go            |    9 +-
 pkg/sentry/syscalls/linux/vfs2/BUILD               |    6 +
 .../syscalls/linux/vfs2/linux64_override_amd64.go  |   40 +-
 pkg/sentry/syscalls/linux/vfs2/socket.go           | 1138 ++++++++++++++++++++
 6 files changed, 1238 insertions(+), 19 deletions(-)
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/socket.go

diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index d09d97825..ed40b5303 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -307,6 +307,61 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 	return fds, nil
 }
 
+// NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available
+// greater than or equal to the fd parameter. All files will share the set
+// flags. Success is guaranteed to be all or none.
+func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) {
+	if fd < 0 {
+		// Don't accept negative FDs.
+		return nil, syscall.EINVAL
+	}
+
+	// Default limit.
+	end := int32(math.MaxInt32)
+
+	// Ensure we don't get past the provided limit.
+	if limitSet := limits.FromContext(ctx); limitSet != nil {
+		lim := limitSet.Get(limits.NumberOfFiles)
+		if lim.Cur != limits.Infinity {
+			end = int32(lim.Cur)
+		}
+		if fd >= end {
+			return nil, syscall.EMFILE
+		}
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	// From f.next to find available fd.
+	if fd < f.next {
+		fd = f.next
+	}
+
+	// Install all entries.
+	for i := fd; i < end && len(fds) < len(files); i++ {
+		if d, _, _ := f.getVFS2(i); d == nil {
+			f.setVFS2(i, files[len(fds)], flags) // Set the descriptor.
+			fds = append(fds, i)                 // Record the file descriptor.
+		}
+	}
+
+	// Failure? Unwind existing FDs.
+	if len(fds) < len(files) {
+		for _, i := range fds {
+			f.setVFS2(i, nil, FDFlags{}) // Zap entry.
+		}
+		return nil, syscall.EMFILE
+	}
+
+	if fd == f.next {
+		// Update next search start position.
+		f.next = fds[len(fds)-1] + 1
+	}
+
+	return fds, nil
+}
+
 // NewFDVFS2 allocates a file descriptor greater than or equal to minfd for
 // the given file description. If it succeeds, it takes a reference on file.
 func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index d6546735e..e5d133d6c 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -777,6 +777,15 @@ func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error
 	return t.fdTable.NewFDs(t, fd, files, flags)
 }
 
+// NewFDsVFS2 is a convenience wrapper for t.FDTable().NewFDsVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDsVFS2(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) {
+	return t.fdTable.NewFDsVFS2(t, fd, files, flags)
+}
+
 // NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file.
 //
 // This automatically passes the task as the context.
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 2919228d0..61b2576ac 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -31,6 +31,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // minListenBacklog is the minimum reasonable backlog for listening sockets.
 const minListenBacklog = 8
 
@@ -244,7 +246,10 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 
 	// Copy the file descriptors out.
 	if _, err := t.CopyOut(socks, fds); err != nil {
-		// Note that we don't close files here; see pipe(2) also.
+		for _, fd := range fds {
+			_, file := t.FDTable().Remove(fd)
+			file.DecRef()
+		}
 		return 0, nil, err
 	}
 
@@ -1128,3 +1133,5 @@ func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen)
 	return n, nil, err
 }
+
+// LINT.ThenChange(./vfs2/socket.go)
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index 0004e60d9..b32abfe59 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -21,6 +21,7 @@ go_library(
         "poll.go",
         "read_write.go",
         "setstat.go",
+        "socket.go",
         "stat.go",
         "stat_amd64.go",
         "stat_arm64.go",
@@ -32,6 +33,7 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/binary",
         "//pkg/bits",
         "//pkg/fspath",
         "//pkg/gohacks",
@@ -43,10 +45,14 @@ go_library(
         "//pkg/sentry/limits",
         "//pkg/sentry/loader",
         "//pkg/sentry/memmap",
+        "//pkg/sentry/socket",
+        "//pkg/sentry/socket/control",
+        "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/syscalls",
         "//pkg/sentry/syscalls/linux",
         "//pkg/sentry/vfs",
         "//pkg/sync",
+        "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
index 63febc2f7..645e0bcb8 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -44,21 +44,22 @@ func Override(table map[uintptr]kernel.Syscall) {
 	table[32] = syscalls.Supported("dup", Dup)
 	table[33] = syscalls.Supported("dup2", Dup2)
 	delete(table, 40) // sendfile
-	delete(table, 41) // socket
-	delete(table, 42) // connect
-	delete(table, 43) // accept
-	delete(table, 44) // sendto
-	delete(table, 45) // recvfrom
-	delete(table, 46) // sendmsg
-	delete(table, 47) // recvmsg
-	delete(table, 48) // shutdown
-	delete(table, 49) // bind
-	delete(table, 50) // listen
-	delete(table, 51) // getsockname
-	delete(table, 52) // getpeername
-	delete(table, 53) // socketpair
-	delete(table, 54) // setsockopt
-	delete(table, 55) // getsockopt
+	// TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2.
+	table[41] = syscalls.PartiallySupported("socket", Socket, "In process of porting socket syscalls to VFS2.", nil)
+	table[42] = syscalls.PartiallySupported("connect", Connect, "In process of porting socket syscalls to VFS2.", nil)
+	table[43] = syscalls.PartiallySupported("accept", Accept, "In process of porting socket syscalls to VFS2.", nil)
+	table[44] = syscalls.PartiallySupported("sendto", SendTo, "In process of porting socket syscalls to VFS2.", nil)
+	table[45] = syscalls.PartiallySupported("recvfrom", RecvFrom, "In process of porting socket syscalls to VFS2.", nil)
+	table[46] = syscalls.PartiallySupported("sendmsg", SendMsg, "In process of porting socket syscalls to VFS2.", nil)
+	table[47] = syscalls.PartiallySupported("recvmsg", RecvMsg, "In process of porting socket syscalls to VFS2.", nil)
+	table[48] = syscalls.PartiallySupported("shutdown", Shutdown, "In process of porting socket syscalls to VFS2.", nil)
+	table[49] = syscalls.PartiallySupported("bind", Bind, "In process of porting socket syscalls to VFS2.", nil)
+	table[50] = syscalls.PartiallySupported("listen", Listen, "In process of porting socket syscalls to VFS2.", nil)
+	table[51] = syscalls.PartiallySupported("getsockname", GetSockName, "In process of porting socket syscalls to VFS2.", nil)
+	table[52] = syscalls.PartiallySupported("getpeername", GetPeerName, "In process of porting socket syscalls to VFS2.", nil)
+	table[53] = syscalls.PartiallySupported("socketpair", SocketPair, "In process of porting socket syscalls to VFS2.", nil)
+	table[54] = syscalls.PartiallySupported("getsockopt", GetSockOpt, "In process of porting socket syscalls to VFS2.", nil)
+	table[55] = syscalls.PartiallySupported("setsockopt", SetSockOpt, "In process of porting socket syscalls to VFS2.", nil)
 	table[59] = syscalls.Supported("execve", Execve)
 	table[72] = syscalls.Supported("fcntl", Fcntl)
 	delete(table, 73) // flock
@@ -144,7 +145,8 @@ func Override(table map[uintptr]kernel.Syscall) {
 	delete(table, 285) // fallocate
 	table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime)
 	table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime)
-	delete(table, 288) // accept4
+	// TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2.
+	table[288] = syscalls.PartiallySupported("accept4", Accept4, "In process of porting socket syscalls to VFS2.", nil)
 	delete(table, 289) // signalfd4
 	delete(table, 290) // eventfd2
 	table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
@@ -153,9 +155,11 @@ func Override(table map[uintptr]kernel.Syscall) {
 	delete(table, 294) // inotify_init1
 	table[295] = syscalls.Supported("preadv", Preadv)
 	table[296] = syscalls.Supported("pwritev", Pwritev)
-	delete(table, 299) // recvmmsg
+	// TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2.
+	table[299] = syscalls.PartiallySupported("recvmmsg", RecvMMsg, "In process of porting socket syscalls to VFS2.", nil)
 	table[306] = syscalls.Supported("syncfs", Syncfs)
-	delete(table, 307) // sendmmsg
+	// TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2.
+	table[307] = syscalls.PartiallySupported("sendmmsg", SendMMsg, "In process of porting socket syscalls to VFS2.", nil)
 	table[316] = syscalls.Supported("renameat2", Renameat2)
 	delete(table, 319) // memfd_create
 	table[322] = syscalls.Supported("execveat", Execveat)
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
new file mode 100644
index 000000000..79a4a7ada
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -0,0 +1,1138 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/control"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// minListenBacklog is the minimum reasonable backlog for listening sockets.
+const minListenBacklog = 8
+
+// maxListenBacklog is the maximum allowed backlog for listening sockets.
+const maxListenBacklog = 1024
+
+// maxAddrLen is the maximum socket address length we're willing to accept.
+const maxAddrLen = 200
+
+// maxOptLen is the maximum sockopt parameter length we're willing to accept.
+const maxOptLen = 1024 * 8
+
+// maxControlLen is the maximum length of the msghdr.msg_control buffer we're
+// willing to accept. Note that this limit is smaller than Linux, which allows
+// buffers upto INT_MAX.
+const maxControlLen = 10 * 1024 * 1024
+
+// nameLenOffset is the offset from the start of the MessageHeader64 struct to
+// the NameLen field.
+const nameLenOffset = 8
+
+// controlLenOffset is the offset form the start of the MessageHeader64 struct
+// to the ControlLen field.
+const controlLenOffset = 40
+
+// flagsOffset is the offset form the start of the MessageHeader64 struct
+// to the Flags field.
+const flagsOffset = 48
+
+const sizeOfInt32 = 4
+
+// messageHeader64Len is the length of a MessageHeader64 struct.
+var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+
+// multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct.
+var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+
+// baseRecvFlags are the flags that are accepted across recvmsg(2),
+// recvmmsg(2), and recvfrom(2).
+const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC
+
+// MessageHeader64 is the 64-bit representation of the msghdr struct used in
+// the recvmsg and sendmsg syscalls.
+type MessageHeader64 struct {
+	// Name is the optional pointer to a network address buffer.
+	Name uint64
+
+	// NameLen is the length of the buffer pointed to by Name.
+	NameLen uint32
+	_       uint32
+
+	// Iov is a pointer to an array of io vectors that describe the memory
+	// locations involved in the io operation.
+	Iov uint64
+
+	// IovLen is the length of the array pointed to by Iov.
+	IovLen uint64
+
+	// Control is the optional pointer to ancillary control data.
+	Control uint64
+
+	// ControlLen is the length of the data pointed to by Control.
+	ControlLen uint64
+
+	// Flags on the sent/received message.
+	Flags int32
+	_     int32
+}
+
+// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
+// the recvmmsg and sendmmsg syscalls.
+type multipleMessageHeader64 struct {
+	msgHdr MessageHeader64
+	msgLen uint32
+	_      int32
+}
+
+// CopyInMessageHeader64 copies a message header from user to kernel memory.
+func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
+	b := t.CopyScratchBuffer(52)
+	if _, err := t.CopyInBytes(addr, b); err != nil {
+		return err
+	}
+
+	msg.Name = usermem.ByteOrder.Uint64(b[0:])
+	msg.NameLen = usermem.ByteOrder.Uint32(b[8:])
+	msg.Iov = usermem.ByteOrder.Uint64(b[16:])
+	msg.IovLen = usermem.ByteOrder.Uint64(b[24:])
+	msg.Control = usermem.ByteOrder.Uint64(b[32:])
+	msg.ControlLen = usermem.ByteOrder.Uint64(b[40:])
+	msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:]))
+
+	return nil
+}
+
+// CaptureAddress allocates memory for and copies a socket address structure
+// from the untrusted address space range.
+func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
+	if addrlen > maxAddrLen {
+		return nil, syserror.EINVAL
+	}
+
+	addrBuf := make([]byte, addrlen)
+	if _, err := t.CopyInBytes(addr, addrBuf); err != nil {
+		return nil, err
+	}
+
+	return addrBuf, nil
+}
+
+// writeAddress writes a sockaddr structure and its length to an output buffer
+// in the unstrusted address space range. If the address is bigger than the
+// buffer, it is truncated.
+func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+	// Get the buffer length.
+	var bufLen uint32
+	if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil {
+		return err
+	}
+
+	if int32(bufLen) < 0 {
+		return syserror.EINVAL
+	}
+
+	// Write the length unconditionally.
+	if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+		return err
+	}
+
+	if addr == nil {
+		return nil
+	}
+
+	if bufLen > addrLen {
+		bufLen = addrLen
+	}
+
+	// Copy as much of the address as will fit in the buffer.
+	encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr)
+	if bufLen > uint32(len(encodedAddr)) {
+		bufLen = uint32(len(encodedAddr))
+	}
+	_, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)])
+	return err
+}
+
+// Socket implements the linux syscall socket(2).
+func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	domain := int(args[0].Int())
+	stype := args[1].Int()
+	protocol := int(args[2].Int())
+
+	// Check and initialize the flags.
+	if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Create the new socket.
+	s, e := socket.NewVFS2(t, domain, linux.SockType(stype&0xf), protocol)
+	if e != nil {
+		return 0, nil, e.ToError()
+	}
+	defer s.DecRef()
+
+	if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil {
+		return 0, nil, err
+	}
+
+	fd, err := t.NewFDFromVFS2(0, s, kernel.FDFlags{
+		CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// SocketPair implements the linux syscall socketpair(2).
+func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	domain := int(args[0].Int())
+	stype := args[1].Int()
+	protocol := int(args[2].Int())
+	addr := args[3].Pointer()
+
+	// Check and initialize the flags.
+	if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Create the socket pair.
+	s1, s2, e := socket.PairVFS2(t, domain, linux.SockType(stype&0xf), protocol)
+	if e != nil {
+		return 0, nil, e.ToError()
+	}
+	// Adding to the FD table will cause an extra reference to be acquired.
+	defer s1.DecRef()
+	defer s2.DecRef()
+
+	nonblocking := uint32(stype & linux.SOCK_NONBLOCK)
+	if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil {
+		return 0, nil, err
+	}
+	if err := s2.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil {
+		return 0, nil, err
+	}
+
+	// Create the FDs for the sockets.
+	flags := kernel.FDFlags{
+		CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+	}
+	fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{s1, s2}, flags)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if _, err := t.CopyOut(addr, fds); err != nil {
+		for _, fd := range fds {
+			_, file := t.FDTable().Remove(fd)
+			file.DecRef()
+		}
+		return 0, nil, err
+	}
+
+	return 0, nil, nil
+}
+
+// Connect implements the linux syscall connect(2).
+func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Uint()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Capture address and call syscall implementation.
+	a, err := CaptureAddress(t, addr, addrlen)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0
+	return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS)
+}
+
+// accept is the implementation of the accept syscall. It is called by accept
+// and accept4 syscall handlers.
+func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) {
+	// Check that no unsupported flags are passed in.
+	if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, syserror.ENOTSOCK
+	}
+
+	// Call the syscall implementation for this socket, then copy the
+	// output address if one is specified.
+	blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0
+
+	peerRequested := addrLen != 0
+	nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking)
+	if e != nil {
+		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+	}
+	if peerRequested {
+		// NOTE(magi): Linux does not give you an error if it can't
+		// write the data back out so neither do we.
+		if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL {
+			return 0, err
+		}
+	}
+	return uintptr(nfd), nil
+}
+
+// Accept4 implements the linux syscall accept4(2).
+func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+	flags := int(args[3].Int())
+
+	n, err := accept(t, fd, addr, addrlen, flags)
+	return n, nil, err
+}
+
+// Accept implements the linux syscall accept(2).
+func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	n, err := accept(t, fd, addr, addrlen, 0)
+	return n, nil, err
+}
+
+// Bind implements the linux syscall bind(2).
+func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Uint()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Capture address and call syscall implementation.
+	a, err := CaptureAddress(t, addr, addrlen)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, s.Bind(t, a).ToError()
+}
+
+// Listen implements the linux syscall listen(2).
+func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	backlog := args[1].Int()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Per Linux, the backlog is silently capped to reasonable values.
+	if backlog <= 0 {
+		backlog = minListenBacklog
+	}
+	if backlog > maxListenBacklog {
+		backlog = maxListenBacklog
+	}
+
+	return 0, nil, s.Listen(t, int(backlog)).ToError()
+}
+
+// Shutdown implements the linux syscall shutdown(2).
+func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	how := args[1].Int()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Validate how, then call syscall implementation.
+	switch how {
+	case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR:
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, s.Shutdown(t, int(how)).ToError()
+}
+
+// GetSockOpt implements the linux syscall getsockopt(2).
+func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	level := args[1].Int()
+	name := args[2].Int()
+	optValAddr := args[3].Pointer()
+	optLenAddr := args[4].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Read the length. Reject negative values.
+	optLen := int32(0)
+	if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+		return 0, nil, err
+	}
+	if optLen < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Call syscall implementation then copy both value and value len out.
+	v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen))
+	if e != nil {
+		return 0, nil, e.ToError()
+	}
+
+	vLen := int32(binary.Size(v))
+	if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+		return 0, nil, err
+	}
+
+	if v != nil {
+		if _, err := t.CopyOut(optValAddr, v); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	return 0, nil, nil
+}
+
+// getSockOpt tries to handle common socket options, or dispatches to a specific
+// socket implementation.
+func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
+	if level == linux.SOL_SOCKET {
+		switch name {
+		case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
+			if len < sizeOfInt32 {
+				return nil, syserr.ErrInvalidArgument
+			}
+		}
+
+		switch name {
+		case linux.SO_TYPE:
+			_, skType, _ := s.Type()
+			return int32(skType), nil
+		case linux.SO_DOMAIN:
+			family, _, _ := s.Type()
+			return int32(family), nil
+		case linux.SO_PROTOCOL:
+			_, _, protocol := s.Type()
+			return int32(protocol), nil
+		}
+	}
+
+	return s.GetSockOpt(t, level, name, optValAddr, len)
+}
+
+// SetSockOpt implements the linux syscall setsockopt(2).
+//
+// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket.
+func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	level := args[1].Int()
+	name := args[2].Int()
+	optValAddr := args[3].Pointer()
+	optLen := args[4].Int()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	if optLen < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if optLen > maxOptLen {
+		return 0, nil, syserror.EINVAL
+	}
+	buf := t.CopyScratchBuffer(int(optLen))
+	if _, err := t.CopyIn(optValAddr, &buf); err != nil {
+		return 0, nil, err
+	}
+
+	// Call syscall implementation.
+	if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, nil
+}
+
+// GetSockName implements the linux syscall getsockname(2).
+func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Get the socket name and copy it to the caller.
+	v, vl, err := s.GetSockName(t)
+	if err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// GetPeerName implements the linux syscall getpeername(2).
+func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Get the socket peer name and copy it to the caller.
+	v, vl, err := s.GetPeerName(t)
+	if err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// RecvMsg implements the linux syscall recvmsg(2).
+func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	if dl := s.RecvTimeout(); dl > 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	} else if dl < 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline)
+	return n, nil, err
+}
+
+// RecvMMsg implements the linux syscall recvmmsg(2).
+func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	vlen := args[2].Uint()
+	flags := args[3].Int()
+	toPtr := args[4].Pointer()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	if toPtr != 0 {
+		var ts linux.Timespec
+		if _, err := ts.CopyIn(t, toPtr); err != nil {
+			return 0, nil, err
+		}
+		if !ts.Valid() {
+			return 0, nil, syserror.EINVAL
+		}
+		deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration())
+		haveDeadline = true
+	}
+
+	if !haveDeadline {
+		if dl := s.RecvTimeout(); dl > 0 {
+			deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+			haveDeadline = true
+		} else if dl < 0 {
+			flags |= linux.MSG_DONTWAIT
+		}
+	}
+
+	var count uint32
+	var err error
+	for i := uint64(0); i < uint64(vlen); i++ {
+		mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		var n uintptr
+		if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil {
+			break
+		}
+
+		// Copy the received length to the caller.
+		lp, ok := mp.AddLength(messageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+			break
+		}
+		count++
+	}
+
+	if count == 0 {
+		return 0, nil, err
+	}
+	return uintptr(count), nil, nil
+}
+
+func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
+	// Capture the message header and io vectors.
+	var msg MessageHeader64
+	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+		return 0, err
+	}
+
+	if msg.IovLen > linux.UIO_MAXIOV {
+		return 0, syserror.EMSGSIZE
+	}
+	dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	// FIXME(b/63594852): Pretend we have an empty error queue.
+	if flags&linux.MSG_ERRQUEUE != 0 {
+		return 0, syserror.EAGAIN
+	}
+
+	// Fast path when no control message nor name buffers are provided.
+	if msg.ControlLen == 0 && msg.NameLen == 0 {
+		n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
+		if err != nil {
+			return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS)
+		}
+		if !cms.Unix.Empty() {
+			mflags |= linux.MSG_CTRUNC
+			cms.Release()
+		}
+
+		if int(msg.Flags) != mflags {
+			// Copy out the flags to the caller.
+			if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+				return 0, err
+			}
+		}
+
+		return uintptr(n), nil
+	}
+
+	if msg.ControlLen > maxControlLen {
+		return 0, syserror.ENOBUFS
+	}
+	n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
+	if e != nil {
+		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+	}
+	defer cms.Release()
+
+	controlData := make([]byte, 0, msg.ControlLen)
+	controlData = control.PackControlMessages(t, cms, controlData)
+
+	if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() {
+		creds, _ := cms.Unix.Credentials.(control.SCMCredentials)
+		controlData, mflags = control.PackCredentials(t, creds, controlData, mflags)
+	}
+
+	if cms.Unix.Rights != nil {
+		controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
+	}
+
+	// Copy the address to the caller.
+	if msg.NameLen != 0 {
+		if err := writeAddress(t, sender, senderLen, usermem.Addr(msg.Name), usermem.Addr(msgPtr+nameLenOffset)); err != nil {
+			return 0, err
+		}
+	}
+
+	// Copy the control data to the caller.
+	if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+		return 0, err
+	}
+	if len(controlData) > 0 {
+		if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+			return 0, err
+		}
+	}
+
+	// Copy out the flags to the caller.
+	if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+		return 0, err
+	}
+
+	return uintptr(n), nil
+}
+
+// recvFrom is the implementation of the recvfrom syscall. It is called by
+// recvfrom and recv syscall handlers.
+func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) {
+	if int(bufLen) < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, syserror.ENOTSOCK
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	if dl := s.RecvTimeout(); dl > 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	} else if dl < 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
+	cm.Release()
+	if e != nil {
+		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+	}
+
+	// Copy the address to the caller.
+	if nameLenPtr != 0 {
+		if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil {
+			return 0, err
+		}
+	}
+
+	return uintptr(n), nil
+}
+
+// RecvFrom implements the linux syscall recvfrom(2).
+func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufPtr := args[1].Pointer()
+	bufLen := args[2].Uint64()
+	flags := args[3].Int()
+	namePtr := args[4].Pointer()
+	nameLenPtr := args[5].Pointer()
+
+	n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr)
+	return n, nil, err
+}
+
+// SendMsg implements the linux syscall sendmsg(2).
+func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	n, err := sendSingleMsg(t, s, file, msgPtr, flags)
+	return n, nil, err
+}
+
+// SendMMsg implements the linux syscall sendmmsg(2).
+func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	vlen := args[2].Uint()
+	flags := args[3].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	var count uint32
+	var err error
+	for i := uint64(0); i < uint64(vlen); i++ {
+		mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		var n uintptr
+		if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil {
+			break
+		}
+
+		// Copy the received length to the caller.
+		lp, ok := mp.AddLength(messageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+			break
+		}
+		count++
+	}
+
+	if count == 0 {
+		return 0, nil, err
+	}
+	return uintptr(count), nil, nil
+}
+
+func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr usermem.Addr, flags int32) (uintptr, error) {
+	// Capture the message header.
+	var msg MessageHeader64
+	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+		return 0, err
+	}
+
+	var controlData []byte
+	if msg.ControlLen > 0 {
+		// Put an upper bound to prevent large allocations.
+		if msg.ControlLen > maxControlLen {
+			return 0, syserror.ENOBUFS
+		}
+		controlData = make([]byte, msg.ControlLen)
+		if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+			return 0, err
+		}
+	}
+
+	// Read the destination address if one is specified.
+	var to []byte
+	if msg.NameLen != 0 {
+		var err error
+		to, err = CaptureAddress(t, usermem.Addr(msg.Name), msg.NameLen)
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	// Read data then call the sendmsg implementation.
+	if msg.IovLen > linux.UIO_MAXIOV {
+		return 0, syserror.EMSGSIZE
+	}
+	src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	controlMessages, err := control.Parse(t, s, controlData)
+	if err != nil {
+		return 0, err
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	if dl := s.SendTimeout(); dl > 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	} else if dl < 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Call the syscall implementation.
+	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
+	err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
+	if err != nil {
+		controlMessages.Release()
+	}
+	return uintptr(n), err
+}
+
+// sendTo is the implementation of the sendto syscall. It is called by sendto
+// and send syscall handlers.
+func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) {
+	bl := int(bufLen)
+	if bl < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, syserror.ENOTSOCK
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Read the destination address if one is specified.
+	var to []byte
+	var err error
+	if namePtr != 0 {
+		to, err = CaptureAddress(t, namePtr, nameLen)
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	if dl := s.SendTimeout(); dl > 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	} else if dl < 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Call the syscall implementation.
+	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)})
+	return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file)
+}
+
+// SendTo implements the linux syscall sendto(2).
+func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufPtr := args[1].Pointer()
+	bufLen := args[2].Uint64()
+	flags := args[3].Int()
+	namePtr := args[4].Pointer()
+	nameLen := args[5].Uint()
+
+	n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen)
+	return n, nil, err
+}
-- 
cgit v1.2.3


From d303684d7ab9b8a3961398fcf12560956ee9e2e3 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Mon, 13 Apr 2020 16:59:45 -0700
Subject: Remove unnecessary threads

The work being done in these threads is not asynchronous with respect to
the test; that is, it is equivalent to issue non-blocking `connect`
calls serially, since the work is done asynchronously with respect to
the caller. Futhermore, this test was added to test closing a listener
with completed but not delivered connections, which never required
threading in the first place.

PiperOrigin-RevId: 306339486
---
 test/syscalls/linux/socket_inet_loopback.cc | 40 ++++++++---------------------
 1 file changed, 11 insertions(+), 29 deletions(-)

diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 030c3b835..71bd7c14d 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -325,11 +325,9 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   TestAddress const& listener = param.listener;
   TestAddress const& connector = param.connector;
 
-  constexpr int kAcceptCount = 32;
-  constexpr int kBacklog = kAcceptCount * 2;
-  constexpr int kFDs = 128;
-  constexpr int kThreadCount = 4;
-  constexpr int kFDsPerThread = kFDs / kThreadCount;
+  constexpr int kAcceptCount = 2;
+  constexpr int kBacklog = kAcceptCount + 2;
+  constexpr int kFDs = kBacklog * 3;
 
   // Create the listening socket.
   FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
@@ -348,39 +346,23 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
-  DisableSave ds;  // Too many system calls.
   sockaddr_storage conn_addr = connector.addr;
   ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
-  FileDescriptor clients[kFDs];
-  std::unique_ptr<ScopedThread> threads[kThreadCount];
+  std::vector<FileDescriptor> clients;
   for (int i = 0; i < kFDs; i++) {
-    clients[i] = ASSERT_NO_ERRNO_AND_VALUE(
+    auto client = ASSERT_NO_ERRNO_AND_VALUE(
         Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
-  }
-  for (int i = 0; i < kThreadCount; i++) {
-    threads[i] = absl::make_unique<ScopedThread>([&connector, &conn_addr,
-                                                  &clients, i]() {
-      for (int j = 0; j < kFDsPerThread; j++) {
-        int k = i * kFDsPerThread + j;
-        int ret =
-            connect(clients[k].get(), reinterpret_cast<sockaddr*>(&conn_addr),
-                    connector.addr_len);
-        if (ret != 0) {
-          EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
-        }
-      }
-    });
-  }
-  for (int i = 0; i < kThreadCount; i++) {
-    threads[i]->Join();
+    int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                      connector.addr_len);
+    if (ret != 0) {
+      EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
+    }
+    clients.push_back(std::move(client));
   }
   for (int i = 0; i < kAcceptCount; i++) {
     auto accepted =
         ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
   }
-  // TODO(b/138400178): Fix cooperative S/R failure when ds.reset() is invoked
-  // before function end.
-  // ds.reset();
 }
 
 TEST_P(SocketInetLoopbackTest, TCPbacklog) {
-- 
cgit v1.2.3


From 71e6ac3e1f551cf52166bf501de114f06502b994 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Mon, 13 Apr 2020 17:58:52 -0700
Subject: Don't allow read/write when offset+size overflows.

PiperOrigin-RevId: 306348346
---
 pkg/sentry/syscalls/linux/sys_read.go        |  8 ++++----
 pkg/sentry/syscalls/linux/sys_splice.go      |  4 +++-
 pkg/sentry/syscalls/linux/sys_write.go       |  4 ++--
 pkg/sentry/syscalls/linux/vfs2/read_write.go |  8 ++++----
 test/syscalls/linux/memfd.cc                 |  1 +
 test/syscalls/linux/pread64.cc               | 16 ++++++++++++++++
 test/syscalls/linux/pwrite64.cc              | 12 ++++++++++++
 test/syscalls/linux/sendfile.cc              | 23 +++++++++++++++++++++++
 test/syscalls/linux/splice.cc                |  1 +
 9 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index 78a2cb750..071b4bacc 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -96,8 +96,8 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, syserror.EINVAL
 	}
 
-	// Check that the offset is legitimate.
-	if offset < 0 {
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
@@ -120,8 +120,8 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	}
 	defer file.DecRef()
 
-	// Check that the offset is legitimate.
-	if offset < 0 {
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index fbc6cf15f..df0d0f461 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -16,6 +16,7 @@ package linux
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -25,7 +26,8 @@ import (
 
 // doSplice implements a blocking splice operation.
 func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) {
-	if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 {
+	log.Infof("NLAC: doSplice opts: %+v", opts)
+	if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 || (opts.SrcStart+opts.Length < 0) {
 		return 0, syserror.EINVAL
 	}
 
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index 506ee54ce..6ec0de96e 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -87,8 +87,8 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 	defer file.DecRef()
 
-	// Check that the offset is legitimate.
-	if offset < 0 {
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
index 35f6308d6..898b190fd 100644
--- a/pkg/sentry/syscalls/linux/vfs2/read_write.go
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -130,8 +130,8 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	}
 	defer file.DecRef()
 
-	// Check that the offset is legitimate.
-	if offset < 0 {
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
@@ -362,8 +362,8 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 	defer file.DecRef()
 
-	// Check that the offset is legitimate.
-	if offset < 0 {
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc
index e57b49a4a..f8b7f7938 100644
--- a/test/syscalls/linux/memfd.cc
+++ b/test/syscalls/linux/memfd.cc
@@ -16,6 +16,7 @@
 #include <fcntl.h>
 #include <linux/magic.h>
 #include <linux/memfd.h>
+#include <linux/unistd.h>
 #include <string.h>
 #include <sys/mman.h>
 #include <sys/statfs.h>
diff --git a/test/syscalls/linux/pread64.cc b/test/syscalls/linux/pread64.cc
index 2cecf2e5f..bcdbbb044 100644
--- a/test/syscalls/linux/pread64.cc
+++ b/test/syscalls/linux/pread64.cc
@@ -14,6 +14,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <linux/unistd.h>
 #include <sys/mman.h>
 #include <sys/socket.h>
 #include <sys/types.h>
@@ -118,6 +119,21 @@ TEST_F(Pread64Test, EndOfFile) {
   EXPECT_THAT(pread64(fd.get(), buf, 1024, 0), SyscallSucceedsWithValue(0));
 }
 
+int memfd_create(const std::string& name, unsigned int flags) {
+  return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+TEST_F(Pread64Test, Overflow) {
+  int f = memfd_create("negative", 0);
+  const FileDescriptor fd(f);
+
+  EXPECT_THAT(ftruncate(fd.get(), 0x7fffffffffffffffull), SyscallSucceeds());
+
+  char buf[10];
+  EXPECT_THAT(pread64(fd.get(), buf, sizeof(buf), 0x7fffffffffffffffull),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(Pread64TestNoTempFile, CantReadSocketPair_NoRandomSave) {
   int sock_fds[2];
   EXPECT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds), SyscallSucceeds());
diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc
index c2f72e010..e69794910 100644
--- a/test/syscalls/linux/pwrite64.cc
+++ b/test/syscalls/linux/pwrite64.cc
@@ -14,6 +14,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <linux/unistd.h>
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <unistd.h>
@@ -65,6 +66,17 @@ TEST_F(Pwrite64, InvalidArgs) {
   EXPECT_THAT(close(fd), SyscallSucceeds());
 }
 
+TEST_F(Pwrite64, Overflow) {
+  int fd;
+  ASSERT_THAT(fd = open(name_.c_str(), O_APPEND | O_RDWR), SyscallSucceeds());
+  constexpr int64_t kBufSize = 1024;
+  std::vector<char> buf(kBufSize);
+  std::fill(buf.begin(), buf.end(), 'a');
+  EXPECT_THAT(PwriteFd(fd, buf.data(), buf.size(), 0x7fffffffffffffffull),
+              SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc
index ebaafe47e..64123e904 100644
--- a/test/syscalls/linux/sendfile.cc
+++ b/test/syscalls/linux/sendfile.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <fcntl.h>
+#include <linux/unistd.h>
 #include <sys/eventfd.h>
 #include <sys/sendfile.h>
 #include <unistd.h>
@@ -70,6 +71,28 @@ TEST(SendFileTest, InvalidOffset) {
               SyscallFailsWithErrno(EINVAL));
 }
 
+int memfd_create(const std::string& name, unsigned int flags) {
+  return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+TEST(SendFileTest, Overflow) {
+  // Create input file.
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor inf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Open the output file.
+  int fd;
+  EXPECT_THAT(fd = memfd_create("overflow", 0), SyscallSucceeds());
+  const FileDescriptor outf(fd);
+
+  // out_offset + kSize overflows INT64_MAX.
+  loff_t out_offset = 0x7ffffffffffffffeull;
+  constexpr int kSize = 3;
+  EXPECT_THAT(sendfile(outf.get(), inf.get(), &out_offset, kSize),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(SendFileTest, SendTrivially) {
   // Create temp files.
   constexpr char kData[] = "To be, or not to be, that is the question:";
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index faa1247f6..f103e2e56 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <fcntl.h>
+#include <linux/unistd.h>
 #include <sys/eventfd.h>
 #include <sys/resource.h>
 #include <sys/sendfile.h>
-- 
cgit v1.2.3


From c230d12b5ce540239df06e517f3b1b72722dcc14 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Mon, 13 Apr 2020 23:04:01 -0700
Subject: Add Sniffer.Drain() draining socket receive buffer

Add Sniffer.Drain() which drains the socket's receive buffer by temporarily
setting the socket to non-blocking, and receiving in a loop until EINTR,
EWOULDBLOCK or EAGAIN. This method should be used when long periods of time
elapses without receiving on the socket, because uninteresting packets may have
piled up in the receive buffer, filling it up and causing packets critical to
test operation to be dropped.

PiperOrigin-RevId: 306380480
---
 test/packetimpact/testbench/connections.go        | 12 ++++++++++++
 test/packetimpact/testbench/rawsockets.go         | 23 +++++++++++++++++++++++
 test/packetimpact/tests/fin_wait2_timeout_test.go |  2 ++
 3 files changed, 37 insertions(+)

diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index 79c0ccf5c..2b8e2f005 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -228,6 +228,12 @@ func (conn *TCPIPv4) RecvFrame(timeout time.Duration) Layers {
 	return nil
 }
 
+// Drain drains the sniffer's receive buffer by receiving packets until there's
+// nothing else to receive.
+func (conn *TCPIPv4) Drain() {
+	conn.sniffer.Drain()
+}
+
 // Expect a packet that matches the provided tcp within the timeout specified.
 // If it doesn't arrive in time, it returns nil.
 func (conn *TCPIPv4) Expect(tcp TCP, timeout time.Duration) (*TCP, error) {
@@ -423,6 +429,12 @@ func (conn *UDPIPv4) Recv(timeout time.Duration) *UDP {
 	return nil
 }
 
+// Drain drains the sniffer's receive buffer by receiving packets until there's
+// nothing else to receive.
+func (conn *UDPIPv4) Drain() {
+	conn.sniffer.Drain()
+}
+
 // Expect a packet that matches the provided udp within the timeout specified.
 // If it doesn't arrive in time, the test fails.
 func (conn *UDPIPv4) Expect(udp UDP, timeout time.Duration) (*UDP, error) {
diff --git a/test/packetimpact/testbench/rawsockets.go b/test/packetimpact/testbench/rawsockets.go
index 0074484f7..09bfa43c5 100644
--- a/test/packetimpact/testbench/rawsockets.go
+++ b/test/packetimpact/testbench/rawsockets.go
@@ -97,6 +97,29 @@ func (s *Sniffer) Recv(timeout time.Duration) []byte {
 	}
 }
 
+// Drain drains the Sniffer's socket receive buffer by receiving until there's
+// nothing else to receive.
+func (s *Sniffer) Drain() {
+	s.t.Helper()
+	flags, err := unix.FcntlInt(uintptr(s.fd), unix.F_GETFL, 0)
+	if err != nil {
+		s.t.Fatalf("failed to get sniffer socket fd flags: %s", err)
+	}
+	if _, err := unix.FcntlInt(uintptr(s.fd), unix.F_SETFL, flags|unix.O_NONBLOCK); err != nil {
+		s.t.Fatalf("failed to make sniffer socket non-blocking: %s", err)
+	}
+	for {
+		buf := make([]byte, maxReadSize)
+		_, _, err := unix.Recvfrom(s.fd, buf, unix.MSG_TRUNC)
+		if err == unix.EINTR || err == unix.EAGAIN || err == unix.EWOULDBLOCK {
+			break
+		}
+	}
+	if _, err := unix.FcntlInt(uintptr(s.fd), unix.F_SETFL, flags); err != nil {
+		s.t.Fatalf("failed to restore sniffer socket fd flags: %s", err)
+	}
+}
+
 // Close the socket that Sniffer is using.
 func (s *Sniffer) Close() {
 	if err := unix.Close(s.fd); err != nil {
diff --git a/test/packetimpact/tests/fin_wait2_timeout_test.go b/test/packetimpact/tests/fin_wait2_timeout_test.go
index 90e16ef65..b98594f94 100644
--- a/test/packetimpact/tests/fin_wait2_timeout_test.go
+++ b/test/packetimpact/tests/fin_wait2_timeout_test.go
@@ -53,6 +53,8 @@ func TestFinWait2Timeout(t *testing.T) {
 			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
 
 			time.Sleep(5 * time.Second)
+			conn.Drain()
+
 			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
 			if tt.linger2 {
 				if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
-- 
cgit v1.2.3


From 81c44c4cd7cfa121d9ef028db18b3ee550845811 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 14 Apr 2020 11:04:30 -0700
Subject: Test TCP should piggyback ACK in ESTAB state

TCP, in ESTABLISHED state, SHOULD piggyback acknowledgement with a segment being
transmitted (whenever possible) without incurring undue delay

PiperOrigin-RevId: 306474550
---
 test/packetimpact/tests/BUILD                      | 12 +++++
 .../tests/tcp_should_piggyback_test.go             | 53 ++++++++++++++++++++++
 2 files changed, 65 insertions(+)
 create mode 100644 test/packetimpact/tests/tcp_should_piggyback_test.go

diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 956b1addf..308590162 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -52,6 +52,18 @@ packetimpact_go_test(
     ],
 )
 
+packetimpact_go_test(
+    name = "tcp_should_piggyback",
+    srcs = ["tcp_should_piggyback_test.go"],
+    # TODO(b/153680566): Fix netstack then remove the line below.
+    netstack = False,
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
 sh_binary(
     name = "test_runner",
     srcs = ["test_runner.sh"],
diff --git a/test/packetimpact/tests/tcp_should_piggyback_test.go b/test/packetimpact/tests/tcp_should_piggyback_test.go
new file mode 100644
index 000000000..f2ab49e51
--- /dev/null
+++ b/test/packetimpact/tests/tcp_should_piggyback_test.go
@@ -0,0 +1,53 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_should_piggyback_test
+
+import (
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func TestPiggyback(t *testing.T) {
+	dut := tb.NewDUT(t)
+	defer dut.TearDown()
+	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFd)
+	conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort, WindowSize: tb.Uint16(12)}, tb.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	conn.Handshake()
+	acceptFd, _ := dut.Accept(listenFd)
+	defer dut.Close(acceptFd)
+
+	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+	sampleData := []byte("Sample Data")
+
+	dut.Send(acceptFd, sampleData, 0)
+	conn.ExpectData(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, sampleData, time.Second)
+
+	// Cause DUT to send us more data as soon as we ACK their first data segment because we have
+	// a small window.
+	dut.Send(acceptFd, sampleData, 0)
+
+	// DUT should ACK our segment by piggybacking ACK to their outstanding data segment instead of
+	// sending a separate ACK packet.
+	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &tb.Payload{Bytes: sampleData})
+	conn.ExpectData(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, sampleData, time.Second)
+}
-- 
cgit v1.2.3


From 52b4b19249adfeba65fe6f0ef27111f2ed887888 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 14 Apr 2020 13:36:36 -0700
Subject: Pass O_LARGEFILE in syscalls/linux/vfs2.openat.

Needed for PipeTest_Flags: files opened by open() and openat() get O_LARGEFILE
(on architectures with 64-bit off_t), but not FDs created by other syscalls
such as pipe().

Updates #1035

PiperOrigin-RevId: 306504788
---
 pkg/sentry/syscalls/linux/vfs2/filesystem.go | 2 +-
 pkg/sentry/vfs/file_description.go           | 2 +-
 pkg/sentry/vfs/vfs.go                        | 5 +----
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
index a859095e2..46d3e189c 100644
--- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -172,7 +172,7 @@ func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mo
 	defer tpop.Release()
 
 	file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{
-		Flags: flags,
+		Flags: flags | linux.O_LARGEFILE,
 		Mode:  linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()),
 	})
 	if err != nil {
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 4fb9aea87..5976b5ccd 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -122,7 +122,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn
 	}
 
 	fd.refs = 1
-	fd.statusFlags = statusFlags | linux.O_LARGEFILE
+	fd.statusFlags = statusFlags
 	fd.vd = VirtualDentry{
 		mount:  mnt,
 		dentry: d,
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index f592913d5..053c6e1d1 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -383,14 +383,11 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C
 func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
 	// Remove:
 	//
-	// - O_LARGEFILE, which we always report in FileDescription status flags
-	// since only 64-bit architectures are supported at this time.
-	//
 	// - O_CLOEXEC, which affects file descriptors and therefore must be
 	// handled outside of VFS.
 	//
 	// - Unknown flags.
-	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
 	// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
 	if opts.Flags&linux.O_SYNC != 0 {
 		opts.Flags |= linux.O_DSYNC
-- 
cgit v1.2.3


From 2dd6384de89a866bddb9184b8d7ab85b5b8f7100 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 14 Apr 2020 14:40:08 -0700
Subject: Fix cleanup around socketpair() failure to copy out FDs.

- Use the fs.File, rather than the vfs.FileDescription, in the VFS1 version.

- Check for a nil fs.File/vfs.FileDescription before calling DecRef, which is
  possible if a racing dup2() or dup3() replaces the file descriptor between
  when it is installed and when it is returned. (This is not possible in Linux
  because Linux separates allocation of a file descriptor from binding an
  allocated file descriptor to a struct file, and dup2/dup3 return EBUSY if
  asked to replace an allocated but unbound file descriptor.)

PiperOrigin-RevId: 306517101
---
 pkg/sentry/syscalls/linux/sys_socket.go  | 5 +++--
 pkg/sentry/syscalls/linux/vfs2/socket.go | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 61b2576ac..0760af77b 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -247,8 +247,9 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	// Copy the file descriptors out.
 	if _, err := t.CopyOut(socks, fds); err != nil {
 		for _, fd := range fds {
-			_, file := t.FDTable().Remove(fd)
-			file.DecRef()
+			if file, _ := t.FDTable().Remove(fd); file != nil {
+				file.DecRef()
+			}
 		}
 		return 0, nil, err
 	}
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index 79a4a7ada..b1ede32f0 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -250,8 +250,9 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 
 	if _, err := t.CopyOut(addr, fds); err != nil {
 		for _, fd := range fds {
-			_, file := t.FDTable().Remove(fd)
-			file.DecRef()
+			if _, file := t.FDTable().Remove(fd); file != nil {
+				file.DecRef()
+			}
 		}
 		return 0, nil, err
 	}
-- 
cgit v1.2.3


From 36fbaac5201365ffec4c323956f8465492c8a32c Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 14 Apr 2020 18:31:20 -0700
Subject: Attempt SLAAC address regeneration on DAD conflicts

As per RFC 7217 section 6, attempt to regenerate IPv6 SLAAC address in response
to a DAD conflict if the address was generated with an opaque IID as outlined in
RFC 7217 section 5.

Test:
- stack_test.TestAutoGenAddrWithOpaqueIIDDADRetries
- stack_test.TestAutoGenAddrWithEUI64IIDNoDADRetries
- stack_test.TestAutoGenAddrContinuesLifetimesAfterRetry
PiperOrigin-RevId: 306555645
---
 pkg/tcpip/network/ipv6/ipv6_test.go |  66 ++++--
 pkg/tcpip/stack/ndp.go              | 210 +++++++++++-----
 pkg/tcpip/stack/ndp_test.go         | 461 ++++++++++++++++++++++++++++++++++++
 pkg/tcpip/stack/nic.go              |  73 ++++--
 4 files changed, 703 insertions(+), 107 deletions(-)

diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index 95e5dbf8e..841a0cb7a 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -34,6 +34,7 @@ const (
 	// The least significant 3 bytes are the same as addr2 so both addr2 and
 	// addr3 will have the same solicited-node address.
 	addr3 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x02"
+	addr4 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x03"
 
 	// Tests use the extension header identifier values as uint8 instead of
 	// header.IPv6ExtensionHeaderIdentifier.
@@ -167,6 +168,8 @@ func TestReceiveOnAllNodesMulticastAddr(t *testing.T) {
 // packets destined to the IPv6 solicited-node address of an assigned IPv6
 // address.
 func TestReceiveOnSolicitedNodeAddr(t *testing.T) {
+	const nicID = 1
+
 	tests := []struct {
 		name            string
 		protocolFactory stack.TransportProtocol
@@ -184,50 +187,61 @@ func TestReceiveOnSolicitedNodeAddr(t *testing.T) {
 				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
 				TransportProtocols: []stack.TransportProtocol{test.protocolFactory},
 			})
-			e := channel.New(10, 1280, linkAddr1)
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
+			e := channel.New(1, 1280, linkAddr1)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
 
-			// Should not receive a packet destined to the solicited
-			// node address of addr2/addr3 yet as we haven't added
-			// those addresses.
+			s.SetRouteTable([]tcpip.Route{
+				tcpip.Route{
+					Destination: header.IPv6EmptySubnet,
+					NIC:         nicID,
+				},
+			})
+
+			// Should not receive a packet destined to the solicited node address of
+			// addr2/addr3 yet as we haven't added those addresses.
 			test.rxf(t, s, e, addr1, snmc, 0)
 
-			if err := s.AddAddress(1, ProtocolNumber, addr2); err != nil {
-				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, addr2, err)
+			if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err)
 			}
 
-			// Should receive a packet destined to the solicited
-			// node address of addr2/addr3 now that we have added
-			// added addr2.
+			// Should receive a packet destined to the solicited node address of
+			// addr2/addr3 now that we have added added addr2.
 			test.rxf(t, s, e, addr1, snmc, 1)
 
-			if err := s.AddAddress(1, ProtocolNumber, addr3); err != nil {
-				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, addr3, err)
+			if err := s.AddAddress(nicID, ProtocolNumber, addr3); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr3, err)
 			}
 
-			// Should still receive a packet destined to the
-			// solicited node address of addr2/addr3 now that we
-			// have added addr3.
+			// Should still receive a packet destined to the solicited node address of
+			// addr2/addr3 now that we have added addr3.
 			test.rxf(t, s, e, addr1, snmc, 2)
 
-			if err := s.RemoveAddress(1, addr2); err != nil {
-				t.Fatalf("RemoveAddress(_, %s) = %s", addr2, err)
+			if err := s.RemoveAddress(nicID, addr2); err != nil {
+				t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr2, err)
 			}
 
-			// Should still receive a packet destined to the
-			// solicited node address of addr2/addr3 now that we
-			// have removed addr2.
+			// Should still receive a packet destined to the solicited node address of
+			// addr2/addr3 now that we have removed addr2.
 			test.rxf(t, s, e, addr1, snmc, 3)
 
-			if err := s.RemoveAddress(1, addr3); err != nil {
-				t.Fatalf("RemoveAddress(_, %s) = %s", addr3, err)
+			// Make sure addr3's endpoint does not get removed from the NIC by
+			// incrementing its reference count with a route.
+			r, err := s.FindRoute(nicID, addr3, addr4, ProtocolNumber, false)
+			if err != nil {
+				t.Fatalf("FindRoute(%d, %s, %s, %d, false): %s", nicID, addr3, addr4, ProtocolNumber, err)
+			}
+			defer r.Release()
+
+			if err := s.RemoveAddress(nicID, addr3); err != nil {
+				t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr3, err)
 			}
 
-			// Should not receive a packet destined to the solicited
-			// node address of addr2/addr3 yet as both of them got
-			// removed.
+			// Should not receive a packet destined to the solicited node address of
+			// addr2/addr3 yet as both of them got removed, even though a route using
+			// addr3 exists.
 			test.rxf(t, s, e, addr1, snmc, 3)
 		})
 	}
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 7c9fc48d1..7f66c6c09 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -305,6 +305,15 @@ type NDPConfigurations struct {
 	// lifetime(s) of the generated address changes; this option only
 	// affects the generation of new addresses as part of SLAAC.
 	AutoGenGlobalAddresses bool
+
+	// AutoGenAddressConflictRetries determines how many times to attempt to retry
+	// generation of a permanent auto-generated address in response to DAD
+	// conflicts.
+	//
+	// If the method used to generate the address does not support creating
+	// alternative addresses (e.g. IIDs based on the modified EUI64 of a NIC's
+	// MAC address), then no attempt will be made to resolve the conflict.
+	AutoGenAddressConflictRetries uint8
 }
 
 // DefaultNDPConfigurations returns an NDPConfigurations populated with
@@ -411,8 +420,23 @@ type slaacPrefixState struct {
 	// Nonzero only when the address is not valid forever.
 	validUntil time.Time
 
+	// Nonzero only when the address is not preferred forever.
+	preferredUntil time.Time
+
 	// The prefix's permanent address endpoint.
+	//
+	// May only be nil when a SLAAC address is being (re-)generated. Otherwise,
+	// must not be nil as all SLAAC prefixes must have a SLAAC address.
 	ref *referencedNetworkEndpoint
+
+	// The number of times a permanent address has been generated for the prefix.
+	//
+	// Addresses may be regenerated in reseponse to a DAD conflicts.
+	generationAttempts uint8
+
+	// The maximum number of times to attempt regeneration of a permanent SLAAC
+	// address in response to DAD conflicts.
+	maxGenerationAttempts uint8
 }
 
 // startDuplicateAddressDetection performs Duplicate Address Detection.
@@ -935,60 +959,83 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 		return
 	}
 
-	// If the preferred lifetime is zero, then the prefix should be considered
-	// deprecated.
-	deprecated := pl == 0
-	ref := ndp.addSLAACAddr(prefix, deprecated)
-	if ref == nil {
-		// We were unable to generate a permanent address for prefix so do nothing
-		// further as there is no reason to maintain state for a SLAAC prefix we
-		// cannot generate a permanent address for.
-		return
-	}
-
 	state := slaacPrefixState{
 		deprecationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
-			prefixState, ok := ndp.slaacPrefixes[prefix]
+			state, ok := ndp.slaacPrefixes[prefix]
 			if !ok {
-				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the SLAAC prefix %s", prefix))
+				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the deprecated SLAAC prefix %s", prefix))
 			}
 
-			ndp.deprecateSLAACAddress(prefixState.ref)
+			ndp.deprecateSLAACAddress(state.ref)
 		}),
 		invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
-			ndp.invalidateSLAACPrefix(prefix, true)
+			state, ok := ndp.slaacPrefixes[prefix]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the invalidated SLAAC prefix %s", prefix))
+			}
+
+			ndp.invalidateSLAACPrefix(prefix, state)
 		}),
-		ref: ref,
+		maxGenerationAttempts: ndp.configs.AutoGenAddressConflictRetries + 1,
+	}
+
+	now := time.Now()
+
+	// The time an address is preferred until is needed to properly generate the
+	// address.
+	if pl < header.NDPInfiniteLifetime {
+		state.preferredUntil = now.Add(pl)
+	}
+
+	if !ndp.generateSLAACAddr(prefix, &state) {
+		// We were unable to generate an address for the prefix, we do not nothing
+		// further as there is no reason to maintain state or timers for a prefix we
+		// do not have an address for.
+		return
 	}
 
 	// Setup the initial timers to deprecate and invalidate prefix.
 
-	if !deprecated && pl < header.NDPInfiniteLifetime {
+	if pl < header.NDPInfiniteLifetime && pl != 0 {
 		state.deprecationTimer.Reset(pl)
 	}
 
 	if vl < header.NDPInfiniteLifetime {
 		state.invalidationTimer.Reset(vl)
-		state.validUntil = time.Now().Add(vl)
+		state.validUntil = now.Add(vl)
 	}
 
 	ndp.slaacPrefixes[prefix] = state
 }
 
-// addSLAACAddr adds a SLAAC address for prefix.
+// generateSLAACAddr generates a SLAAC address for prefix.
+//
+// Returns true if an address was successfully generated.
+//
+// Panics if the prefix is not a SLAAC prefix or it already has an address.
 //
 // The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) addSLAACAddr(prefix tcpip.Subnet, deprecated bool) *referencedNetworkEndpoint {
+func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixState) bool {
+	if r := state.ref; r != nil {
+		panic(fmt.Sprintf("ndp: SLAAC prefix %s already has a permenant address %s", prefix, r.addrWithPrefix()))
+	}
+
+	// If we have already reached the maximum address generation attempts for the
+	// prefix, do not generate another address.
+	if state.generationAttempts == state.maxGenerationAttempts {
+		return false
+	}
+
 	addrBytes := []byte(prefix.ID())
 	if oIID := ndp.nic.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
 		addrBytes = header.AppendOpaqueInterfaceIdentifier(
 			addrBytes[:header.IIDOffsetInIPv6Address],
 			prefix,
 			oIID.NICNameFromID(ndp.nic.ID(), ndp.nic.name),
-			0, /* dadCounter */
+			state.generationAttempts,
 			oIID.SecretKey,
 		)
-	} else {
+	} else if state.generationAttempts == 0 {
 		// Only attempt to generate an interface-specific IID if we have a valid
 		// link address.
 		//
@@ -996,12 +1043,16 @@ func (ndp *ndpState) addSLAACAddr(prefix tcpip.Subnet, deprecated bool) *referen
 		// LinkEndpoint.LinkAddress) before reaching this point.
 		linkAddr := ndp.nic.linkEP.LinkAddress()
 		if !header.IsValidUnicastEthernetAddress(linkAddr) {
-			return nil
+			return false
 		}
 
 		// Generate an address within prefix from the modified EUI-64 of ndp's NIC's
 		// Ethernet MAC address.
 		header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+	} else {
+		// We have no way to regenerate an address when addresses are not generated
+		// with opaque IIDs.
+		return false
 	}
 
 	generatedAddr := tcpip.ProtocolAddress{
@@ -1014,26 +1065,52 @@ func (ndp *ndpState) addSLAACAddr(prefix tcpip.Subnet, deprecated bool) *referen
 
 	// If the nic already has this address, do nothing further.
 	if ndp.nic.hasPermanentAddrLocked(generatedAddr.AddressWithPrefix.Address) {
-		return nil
+		return false
 	}
 
 	// Inform the integrator that we have a new SLAAC address.
 	ndpDisp := ndp.nic.stack.ndpDisp
 	if ndpDisp == nil {
-		return nil
+		return false
 	}
 
 	if !ndpDisp.OnAutoGenAddress(ndp.nic.ID(), generatedAddr.AddressWithPrefix) {
 		// Informed by the integrator not to add the address.
-		return nil
+		return false
 	}
 
+	deprecated := time.Since(state.preferredUntil) >= 0
 	ref, err := ndp.nic.addAddressLocked(generatedAddr, FirstPrimaryEndpoint, permanent, slaac, deprecated)
 	if err != nil {
 		panic(fmt.Sprintf("ndp: error when adding address %+v: %s", generatedAddr, err))
 	}
 
-	return ref
+	state.generationAttempts++
+	state.ref = ref
+	return true
+}
+
+// regenerateSLAACAddr regenerates an address for a SLAAC prefix.
+//
+// If generating a new address for the prefix fails, the prefix will be
+// invalidated.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) regenerateSLAACAddr(prefix tcpip.Subnet) {
+	state, ok := ndp.slaacPrefixes[prefix]
+	if !ok {
+		panic(fmt.Sprintf("ndp: SLAAC prefix state not found to regenerate address for %s", prefix))
+	}
+
+	if ndp.generateSLAACAddr(prefix, &state) {
+		ndp.slaacPrefixes[prefix] = state
+		return
+	}
+
+	// We were unable to generate a permanent address for the SLAAC prefix so
+	// invalidate the prefix as there is no reason to maintain state for a
+	// SLAAC prefix we do not have an address for.
+	ndp.invalidateSLAACPrefix(prefix, state)
 }
 
 // refreshSLAACPrefixLifetimes refreshes the lifetimes of a SLAAC prefix.
@@ -1060,9 +1137,16 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, pl, vl tim
 	// deprecation timer so it can be reset.
 	prefixState.deprecationTimer.StopLocked()
 
+	now := time.Now()
+
 	// Reset the deprecation timer if prefix has a finite preferred lifetime.
-	if !deprecated && pl < header.NDPInfiniteLifetime {
-		prefixState.deprecationTimer.Reset(pl)
+	if pl < header.NDPInfiniteLifetime {
+		if !deprecated {
+			prefixState.deprecationTimer.Reset(pl)
+		}
+		prefixState.preferredUntil = now.Add(pl)
+	} else {
+		prefixState.preferredUntil = time.Time{}
 	}
 
 	// As per RFC 4862 section 5.5.3.e, update the valid lifetime for prefix:
@@ -1105,7 +1189,7 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, pl, vl tim
 
 	prefixState.invalidationTimer.StopLocked()
 	prefixState.invalidationTimer.Reset(effectiveVl)
-	prefixState.validUntil = time.Now().Add(effectiveVl)
+	prefixState.validUntil = now.Add(effectiveVl)
 }
 
 // deprecateSLAACAddress marks ref as deprecated and notifies the stack's NDP
@@ -1121,48 +1205,60 @@ func (ndp *ndpState) deprecateSLAACAddress(ref *referencedNetworkEndpoint) {
 
 	ref.deprecated = true
 	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
-		ndpDisp.OnAutoGenAddressDeprecated(ndp.nic.ID(), tcpip.AddressWithPrefix{
-			Address:   ref.ep.ID().LocalAddress,
-			PrefixLen: ref.ep.PrefixLen(),
-		})
+		ndpDisp.OnAutoGenAddressDeprecated(ndp.nic.ID(), ref.addrWithPrefix())
 	}
 }
 
 // invalidateSLAACPrefix invalidates a SLAAC prefix.
 //
 // The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) invalidateSLAACPrefix(prefix tcpip.Subnet, removeAddr bool) {
-	state, ok := ndp.slaacPrefixes[prefix]
-	if !ok {
-		return
+func (ndp *ndpState) invalidateSLAACPrefix(prefix tcpip.Subnet, state slaacPrefixState) {
+	if r := state.ref; r != nil {
+		// Since we are already invalidating the prefix, do not invalidate the
+		// prefix when removing the address.
+		if err := ndp.nic.removePermanentIPv6EndpointLocked(r, false /* allowSLAACPrefixInvalidation */); err != nil {
+			panic(fmt.Sprintf("ndp: removePermanentIPv6EndpointLocked(%s, false): %s", r.addrWithPrefix(), err))
+		}
 	}
 
-	state.deprecationTimer.StopLocked()
-	state.invalidationTimer.StopLocked()
-	delete(ndp.slaacPrefixes, prefix)
+	ndp.cleanupSLAACPrefixResources(prefix, state)
+}
 
-	addr := state.ref.ep.ID().LocalAddress
+// cleanupSLAACAddrResourcesAndNotify cleans up an invalidated SLAAC address's
+// resources.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidatePrefix bool) {
+	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), addr)
+	}
 
-	if removeAddr {
-		if err := ndp.nic.removePermanentAddressLocked(addr); err != nil {
-			panic(fmt.Sprintf("ndp: removePermanentAddressLocked(%s): %s", addr, err))
-		}
+	prefix := addr.Subnet()
+	state, ok := ndp.slaacPrefixes[prefix]
+	if !ok || state.ref == nil || addr.Address != state.ref.ep.ID().LocalAddress {
+		return
 	}
 
-	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
-		ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), tcpip.AddressWithPrefix{
-			Address:   addr,
-			PrefixLen: state.ref.ep.PrefixLen(),
-		})
+	if !invalidatePrefix {
+		// If the prefix is not being invalidated, disassociate the address from the
+		// prefix and do nothing further.
+		state.ref = nil
+		ndp.slaacPrefixes[prefix] = state
+		return
 	}
+
+	ndp.cleanupSLAACPrefixResources(prefix, state)
 }
 
-// cleanupSLAACAddrResourcesAndNotify cleans up an invalidated SLAAC
-// address's resources from ndp.
+// cleanupSLAACPrefixResources cleansup a SLAAC prefix's timers and entry.
+//
+// Panics if the SLAAC prefix is not known.
 //
 // The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix) {
-	ndp.invalidateSLAACPrefix(addr.Subnet(), false)
+func (ndp *ndpState) cleanupSLAACPrefixResources(prefix tcpip.Subnet, state slaacPrefixState) {
+	state.deprecationTimer.StopLocked()
+	state.invalidationTimer.StopLocked()
+	delete(ndp.slaacPrefixes, prefix)
 }
 
 // cleanupState cleans up ndp's state.
@@ -1181,7 +1277,7 @@ func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPr
 func (ndp *ndpState) cleanupState(hostOnly bool) {
 	linkLocalSubnet := header.IPv6LinkLocalPrefix.Subnet()
 	linkLocalPrefixes := 0
-	for prefix := range ndp.slaacPrefixes {
+	for prefix, state := range ndp.slaacPrefixes {
 		// RFC 4862 section 5 states that routers are also expected to generate a
 		// link-local address so we do not invalidate them if we are cleaning up
 		// host-only state.
@@ -1190,7 +1286,7 @@ func (ndp *ndpState) cleanupState(hostOnly bool) {
 			continue
 		}
 
-		ndp.invalidateSLAACPrefix(prefix, true)
+		ndp.invalidateSLAACPrefix(prefix, state)
 	}
 
 	if got := len(ndp.slaacPrefixes); got != linkLocalPrefixes {
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index acb2d4731..6562a2d22 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -623,6 +623,12 @@ func TestDADFail(t *testing.T) {
 			if want := (tcpip.AddressWithPrefix{}); addr != want {
 				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 			}
+
+			// Attempting to add the address again should not fail if the address's
+			// state was cleaned up when DAD failed.
+			if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
+			}
 		})
 	}
 }
@@ -2783,6 +2789,461 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
 	}
 }
 
+// TestAutoGenAddrWithOpaqueIIDDADRetries tests the regeneration of an
+// auto-generated IPv6 address in response to a DAD conflict.
+func TestAutoGenAddrWithOpaqueIIDDADRetries(t *testing.T) {
+	const nicID = 1
+	const nicName = "nic"
+	const dadTransmits = 1
+	const retransmitTimer = time.Second
+	const maxMaxRetries = 3
+	const lifetimeSeconds = 10
+
+	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes]byte
+	secretKey := secretKeyBuf[:]
+	n, err := rand.Read(secretKey)
+	if err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	}
+	if n != header.OpaqueIIDSecretKeyMinBytes {
+		t.Fatalf("got rand.Read(_) = (%d, _), want = (%d, _)", n, header.OpaqueIIDSecretKeyMinBytes)
+	}
+
+	prefix, subnet, _ := prefixSubnetAddr(0, linkAddr1)
+
+	for maxRetries := uint8(0); maxRetries <= maxMaxRetries; maxRetries++ {
+		for numFailures := uint8(0); numFailures <= maxRetries+1; numFailures++ {
+			addrTypes := []struct {
+				name             string
+				ndpConfigs       stack.NDPConfigurations
+				autoGenLinkLocal bool
+				subnet           tcpip.Subnet
+				triggerSLAACFn   func(e *channel.Endpoint)
+			}{
+				{
+					name: "Global address",
+					ndpConfigs: stack.NDPConfigurations{
+						DupAddrDetectTransmits:        dadTransmits,
+						RetransmitTimer:               retransmitTimer,
+						HandleRAs:                     true,
+						AutoGenGlobalAddresses:        true,
+						AutoGenAddressConflictRetries: maxRetries,
+					},
+					subnet: subnet,
+					triggerSLAACFn: func(e *channel.Endpoint) {
+						// Receive an RA with prefix1 in a PI.
+						e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, lifetimeSeconds))
+
+					},
+				},
+				{
+					name: "LinkLocal address",
+					ndpConfigs: stack.NDPConfigurations{
+						DupAddrDetectTransmits:        dadTransmits,
+						RetransmitTimer:               retransmitTimer,
+						AutoGenAddressConflictRetries: maxRetries,
+					},
+					autoGenLinkLocal: true,
+					subnet:           header.IPv6LinkLocalPrefix.Subnet(),
+					triggerSLAACFn:   func(e *channel.Endpoint) {},
+				},
+			}
+
+			for _, addrType := range addrTypes {
+				maxRetries := maxRetries
+				numFailures := numFailures
+				addrType := addrType
+
+				t.Run(fmt.Sprintf("%s with %d max retries and %d failures", addrType.name, maxRetries, numFailures), func(t *testing.T) {
+					t.Parallel()
+
+					ndpDisp := ndpDispatcher{
+						dadC:         make(chan ndpDADEvent, 1),
+						autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+					}
+					e := channel.New(0, 1280, linkAddr1)
+					s := stack.New(stack.Options{
+						NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+						AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal,
+						NDPConfigs:           addrType.ndpConfigs,
+						NDPDisp:              &ndpDisp,
+						OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+							NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+								return nicName
+							},
+							SecretKey: secretKey,
+						},
+					})
+					opts := stack.NICOptions{Name: nicName}
+					if err := s.CreateNICWithOptions(nicID, e, opts); err != nil {
+						t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err)
+					}
+
+					expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+						t.Helper()
+
+						select {
+						case e := <-ndpDisp.autoGenAddrC:
+							if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+								t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+							}
+						default:
+							t.Fatal("expected addr auto gen event")
+						}
+					}
+
+					addrType.triggerSLAACFn(e)
+
+					// Simulate DAD conflicts so the address is regenerated.
+					for i := uint8(0); i < numFailures; i++ {
+						addrBytes := []byte(addrType.subnet.ID())
+						addr := tcpip.AddressWithPrefix{
+							Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], addrType.subnet, nicName, i, secretKey)),
+							PrefixLen: 64,
+						}
+						expectAutoGenAddrEvent(addr, newAddr)
+
+						// Should not have any addresses assigned to the NIC.
+						mainAddr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+						if err != nil {
+							t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err)
+						}
+						if want := (tcpip.AddressWithPrefix{}); mainAddr != want {
+							t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", mainAddr, want)
+						}
+
+						// Simulate a DAD conflict.
+						if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
+							t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
+						}
+						expectAutoGenAddrEvent(addr, invalidatedAddr)
+						select {
+						case e := <-ndpDisp.dadC:
+							if diff := checkDADEvent(e, nicID, addr.Address, false, nil); diff != "" {
+								t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+							}
+						default:
+							t.Fatal("expected DAD event")
+						}
+
+						// Attempting to add the address manually should not fail if the
+						// address's state was cleaned up when DAD failed.
+						if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr.Address); err != nil {
+							t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr.Address, err)
+						}
+						if err := s.RemoveAddress(nicID, addr.Address); err != nil {
+							t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr.Address, err)
+						}
+						select {
+						case e := <-ndpDisp.dadC:
+							if diff := checkDADEvent(e, nicID, addr.Address, false, nil); diff != "" {
+								t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+							}
+						default:
+							t.Fatal("expected DAD event")
+						}
+					}
+
+					// Should not have any addresses assigned to the NIC.
+					mainAddr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+					if err != nil {
+						t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err)
+					}
+					if want := (tcpip.AddressWithPrefix{}); mainAddr != want {
+						t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", mainAddr, want)
+					}
+
+					// If we had less failures than generation attempts, we should have an
+					// address after DAD resolves.
+					if maxRetries+1 > numFailures {
+						addrBytes := []byte(addrType.subnet.ID())
+						addr := tcpip.AddressWithPrefix{
+							Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], addrType.subnet, nicName, numFailures, secretKey)),
+							PrefixLen: 64,
+						}
+						expectAutoGenAddrEvent(addr, newAddr)
+
+						select {
+						case e := <-ndpDisp.dadC:
+							if diff := checkDADEvent(e, nicID, addr.Address, true, nil); diff != "" {
+								t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+							}
+						case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout):
+							t.Fatal("timed out waiting for DAD event")
+						}
+
+						mainAddr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+						if err != nil {
+							t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err)
+						}
+						if mainAddr != addr {
+							t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", mainAddr, addr)
+						}
+					}
+
+					// Should not attempt address regeneration again.
+					select {
+					case e := <-ndpDisp.autoGenAddrC:
+						t.Fatalf("unexpectedly got an auto-generated address event = %+v", e)
+					case <-time.After(defaultAsyncEventTimeout):
+					}
+				})
+			}
+		}
+	}
+}
+
+// TestAutoGenAddrWithEUI64IIDNoDADRetries tests that a regeneration attempt is
+// not made for SLAAC addresses generated with an IID based on the NIC's link
+// address.
+func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) {
+	const nicID = 1
+	const dadTransmits = 1
+	const retransmitTimer = time.Second
+	const maxRetries = 3
+	const lifetimeSeconds = 10
+
+	prefix, subnet, _ := prefixSubnetAddr(0, linkAddr1)
+
+	addrTypes := []struct {
+		name             string
+		ndpConfigs       stack.NDPConfigurations
+		autoGenLinkLocal bool
+		subnet           tcpip.Subnet
+		triggerSLAACFn   func(e *channel.Endpoint)
+	}{
+		{
+			name: "Global address",
+			ndpConfigs: stack.NDPConfigurations{
+				DupAddrDetectTransmits:        dadTransmits,
+				RetransmitTimer:               retransmitTimer,
+				HandleRAs:                     true,
+				AutoGenGlobalAddresses:        true,
+				AutoGenAddressConflictRetries: maxRetries,
+			},
+			subnet: subnet,
+			triggerSLAACFn: func(e *channel.Endpoint) {
+				// Receive an RA with prefix1 in a PI.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, lifetimeSeconds))
+
+			},
+		},
+		{
+			name: "LinkLocal address",
+			ndpConfigs: stack.NDPConfigurations{
+				DupAddrDetectTransmits:        dadTransmits,
+				RetransmitTimer:               retransmitTimer,
+				AutoGenAddressConflictRetries: maxRetries,
+			},
+			autoGenLinkLocal: true,
+			subnet:           header.IPv6LinkLocalPrefix.Subnet(),
+			triggerSLAACFn:   func(e *channel.Endpoint) {},
+		},
+	}
+
+	for _, addrType := range addrTypes {
+		addrType := addrType
+
+		t.Run(addrType.name, func(t *testing.T) {
+			t.Parallel()
+
+			ndpDisp := ndpDispatcher{
+				dadC:         make(chan ndpDADEvent, 1),
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+			}
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+				AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal,
+				NDPConfigs:           addrType.ndpConfigs,
+				NDPDisp:              &ndpDisp,
+			})
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+
+			expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+				t.Helper()
+
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
+			}
+
+			addrType.triggerSLAACFn(e)
+
+			addrBytes := []byte(addrType.subnet.ID())
+			header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr1, addrBytes[header.IIDOffsetInIPv6Address:])
+			addr := tcpip.AddressWithPrefix{
+				Address:   tcpip.Address(addrBytes),
+				PrefixLen: 64,
+			}
+			expectAutoGenAddrEvent(addr, newAddr)
+
+			// Simulate a DAD conflict.
+			if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
+				t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
+			}
+			expectAutoGenAddrEvent(addr, invalidatedAddr)
+			select {
+			case e := <-ndpDisp.dadC:
+				if diff := checkDADEvent(e, nicID, addr.Address, false, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+				}
+			default:
+				t.Fatal("expected DAD event")
+			}
+
+			// Should not attempt address regeneration.
+			select {
+			case e := <-ndpDisp.autoGenAddrC:
+				t.Fatalf("unexpectedly got an auto-generated address event = %+v", e)
+			case <-time.After(defaultAsyncEventTimeout):
+			}
+		})
+	}
+}
+
+// TestAutoGenAddrContinuesLifetimesAfterRetry tests that retrying address
+// generation in response to DAD conflicts does not refresh the lifetimes.
+func TestAutoGenAddrContinuesLifetimesAfterRetry(t *testing.T) {
+	const nicID = 1
+	const nicName = "nic"
+	const dadTransmits = 1
+	const retransmitTimer = 2 * time.Second
+	const failureTimer = time.Second
+	const maxRetries = 1
+	const lifetimeSeconds = 5
+
+	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes]byte
+	secretKey := secretKeyBuf[:]
+	n, err := rand.Read(secretKey)
+	if err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	}
+	if n != header.OpaqueIIDSecretKeyMinBytes {
+		t.Fatalf("got rand.Read(_) = (%d, _), want = (%d, _)", n, header.OpaqueIIDSecretKeyMinBytes)
+	}
+
+	prefix, subnet, _ := prefixSubnetAddr(0, linkAddr1)
+
+	ndpDisp := ndpDispatcher{
+		dadC:         make(chan ndpDADEvent, 1),
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			DupAddrDetectTransmits:        dadTransmits,
+			RetransmitTimer:               retransmitTimer,
+			HandleRAs:                     true,
+			AutoGenGlobalAddresses:        true,
+			AutoGenAddressConflictRetries: maxRetries,
+		},
+		NDPDisp: &ndpDisp,
+		OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+			NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+				return nicName
+			},
+			SecretKey: secretKey,
+		},
+	})
+	opts := stack.NICOptions{Name: nicName}
+	if err := s.CreateNICWithOptions(nicID, e, opts); err != nil {
+		t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err)
+	}
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	// Receive an RA with prefix in a PI.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, lifetimeSeconds))
+
+	addrBytes := []byte(subnet.ID())
+	addr := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet, nicName, 0, secretKey)),
+		PrefixLen: 64,
+	}
+	expectAutoGenAddrEvent(addr, newAddr)
+
+	// Simulate a DAD conflict after some time has passed.
+	time.Sleep(failureTimer)
+	if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
+		t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
+	}
+	expectAutoGenAddrEvent(addr, invalidatedAddr)
+	select {
+	case e := <-ndpDisp.dadC:
+		if diff := checkDADEvent(e, nicID, addr.Address, false, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+		}
+	default:
+		t.Fatal("expected DAD event")
+	}
+
+	// Let the next address resolve.
+	addr.Address = tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet, nicName, 1, secretKey))
+	expectAutoGenAddrEvent(addr, newAddr)
+	select {
+	case e := <-ndpDisp.dadC:
+		if diff := checkDADEvent(e, nicID, addr.Address, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout):
+		t.Fatal("timed out waiting for DAD event")
+	}
+
+	// Address should be deprecated/invalidated after the lifetime expires.
+	//
+	// Note, the remaining lifetime is calculated from when the PI was first
+	// processed. Since we wait for some time before simulating a DAD conflict
+	// and more time for the new address to resolve, the new address is only
+	// expected to be valid for the remaining time. The DAD conflict should
+	// not have reset the lifetimes.
+	//
+	// We expect either just the invalidation event or the deprecation event
+	// followed by the invalidation event.
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		if e.eventType == deprecatedAddr {
+			if diff := checkAutoGenAddrEvent(e, addr, deprecatedAddr); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+
+			select {
+			case e := <-ndpDisp.autoGenAddrC:
+				if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" {
+					t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+				}
+			case <-time.After(defaultAsyncEventTimeout):
+				t.Fatal("timed out waiting for invalidated auto gen addr event after deprecation")
+			}
+		} else {
+			if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		}
+	case <-time.After(lifetimeSeconds*time.Second - failureTimer - dadTransmits*retransmitTimer + defaultAsyncEventTimeout):
+		t.Fatal("timed out waiting for auto gen addr event")
+	}
+}
+
 // TestNDPRecursiveDNSServerDispatch tests that we properly dispatch an event
 // to the integrator when an RA is received with the NDP Recursive DNS Server
 // option with at least one valid address.
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 4835251bc..016dbe15e 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1012,29 +1012,31 @@ func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
 		return tcpip.ErrBadLocalAddress
 	}
 
-	isIPv6Unicast := r.protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(addr)
+	switch r.protocol {
+	case header.IPv6ProtocolNumber:
+		return n.removePermanentIPv6EndpointLocked(r, true /* allowSLAAPrefixInvalidation */)
+	default:
+		r.expireLocked()
+		return nil
+	}
+}
+
+func (n *NIC) removePermanentIPv6EndpointLocked(r *referencedNetworkEndpoint, allowSLAACPrefixInvalidation bool) *tcpip.Error {
+	addr := r.addrWithPrefix()
+
+	isIPv6Unicast := header.IsV6UnicastAddress(addr.Address)
 
 	if isIPv6Unicast {
-		// If we are removing a tentative IPv6 unicast address, stop DAD.
-		if kind == permanentTentative {
-			n.mu.ndp.stopDuplicateAddressDetection(addr)
-		}
+		n.mu.ndp.stopDuplicateAddressDetection(addr.Address)
 
 		// If we are removing an address generated via SLAAC, cleanup
 		// its SLAAC resources and notify the integrator.
 		if r.configType == slaac {
-			n.mu.ndp.cleanupSLAACAddrResourcesAndNotify(tcpip.AddressWithPrefix{
-				Address:   addr,
-				PrefixLen: r.ep.PrefixLen(),
-			})
+			n.mu.ndp.cleanupSLAACAddrResourcesAndNotify(addr, allowSLAACPrefixInvalidation)
 		}
 	}
 
-	r.setKind(permanentExpired)
-	if !r.decRefLocked() {
-		// The endpoint still has references to it.
-		return nil
-	}
+	r.expireLocked()
 
 	// At this point the endpoint is deleted.
 
@@ -1044,7 +1046,7 @@ func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
 	// We ignore the tcpip.ErrBadLocalAddress error because the solicited-node
 	// multicast group may be left by user action.
 	if isIPv6Unicast {
-		snmc := header.SolicitedNodeAddr(addr)
+		snmc := header.SolicitedNodeAddr(addr.Address)
 		if err := n.leaveGroupLocked(snmc, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress {
 			return err
 		}
@@ -1425,10 +1427,12 @@ func (n *NIC) isAddrTentative(addr tcpip.Address) bool {
 	return ref.getKind() == permanentTentative
 }
 
-// dupTentativeAddrDetected attempts to inform n that a tentative addr
-// is a duplicate on a link.
+// dupTentativeAddrDetected attempts to inform n that a tentative addr is a
+// duplicate on a link.
 //
-// dupTentativeAddrDetected will delete the tentative address if it exists.
+// dupTentativeAddrDetected will remove the tentative address if it exists. If
+// the address was generated via SLAAC, an attempt will be made to generate a
+// new address.
 func (n *NIC) dupTentativeAddrDetected(addr tcpip.Address) *tcpip.Error {
 	n.mu.Lock()
 	defer n.mu.Unlock()
@@ -1442,7 +1446,17 @@ func (n *NIC) dupTentativeAddrDetected(addr tcpip.Address) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	return n.removePermanentAddressLocked(addr)
+	// If the address is a SLAAC address, do not invalidate its SLAAC prefix as a
+	// new address will be generated for it.
+	if err := n.removePermanentIPv6EndpointLocked(ref, false /* allowSLAACPrefixInvalidation */); err != nil {
+		return err
+	}
+
+	if ref.configType == slaac {
+		n.mu.ndp.regenerateSLAACAddr(ref.addrWithPrefix().Subnet())
+	}
+
+	return nil
 }
 
 // setNDPConfigs sets the NDP configurations for n.
@@ -1570,6 +1584,13 @@ type referencedNetworkEndpoint struct {
 	deprecated bool
 }
 
+func (r *referencedNetworkEndpoint) addrWithPrefix() tcpip.AddressWithPrefix {
+	return tcpip.AddressWithPrefix{
+		Address:   r.ep.ID().LocalAddress,
+		PrefixLen: r.ep.PrefixLen(),
+	}
+}
+
 func (r *referencedNetworkEndpoint) getKind() networkEndpointKind {
 	return networkEndpointKind(atomic.LoadInt32((*int32)(&r.kind)))
 }
@@ -1597,6 +1618,13 @@ func (r *referencedNetworkEndpoint) isValidForOutgoingRLocked() bool {
 	return r.nic.mu.enabled && (r.getKind() != permanentExpired || r.nic.mu.spoofing)
 }
 
+// expireLocked decrements the reference count and marks the permanent endpoint
+// as expired.
+func (r *referencedNetworkEndpoint) expireLocked() {
+	r.setKind(permanentExpired)
+	r.decRefLocked()
+}
+
 // decRef decrements the ref count and cleans up the endpoint once it reaches
 // zero.
 func (r *referencedNetworkEndpoint) decRef() {
@@ -1606,14 +1634,11 @@ func (r *referencedNetworkEndpoint) decRef() {
 }
 
 // decRefLocked is the same as decRef but assumes that the NIC.mu mutex is
-// locked. Returns true if the endpoint was removed.
-func (r *referencedNetworkEndpoint) decRefLocked() bool {
+// locked.
+func (r *referencedNetworkEndpoint) decRefLocked() {
 	if atomic.AddInt32(&r.refs, -1) == 0 {
 		r.nic.removeEndpointLocked(r)
-		return true
 	}
-
-	return false
 }
 
 // incRef increments the ref count. It must only be called when the caller is
-- 
cgit v1.2.3


From 28212b3f179dc23bb966f72b11f635017cdf8664 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 14 Apr 2020 19:32:32 -0700
Subject: Reduce flakiness in tcp_test.

Tests now use a MinRTO of 3s instead of default 200ms. This reduced flakiness in
a lot of the congestion control/recovery tests which were flaky due to
retransmit timer firing too early in case the test executors were overloaded.

This change also bumps some of the timeouts in tests which were too sensitive to
timer variations and reduces the number of slow start iterations which can
make the tests run for too long and also trigger retansmit timeouts etc if
the executor is overloaded.

PiperOrigin-RevId: 306562645
---
 pkg/tcpip/checker/checker.go                       | 19 +++++
 pkg/tcpip/link/channel/channel.go                  | 34 ++-------
 pkg/tcpip/tcpip.go                                 |  4 ++
 pkg/tcpip/transport/tcp/BUILD                      |  5 +-
 pkg/tcpip/transport/tcp/protocol.go                | 17 +++++
 pkg/tcpip/transport/tcp/snd.go                     | 15 +++-
 pkg/tcpip/transport/tcp/tcp_noracedetector_test.go | 83 ++++++++++++++--------
 pkg/tcpip/transport/tcp/tcp_sack_test.go           |  2 +-
 pkg/tcpip/transport/tcp/tcp_test.go                | 50 ++++++-------
 pkg/tcpip/transport/tcp/testing/context/context.go | 17 ++++-
 10 files changed, 159 insertions(+), 87 deletions(-)

diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 307f1b666..c1745ba6a 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -107,6 +107,8 @@ func DstAddr(addr tcpip.Address) NetworkChecker {
 // TTL creates a checker that checks the TTL (ipv4) or HopLimit (ipv6).
 func TTL(ttl uint8) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
 		var v uint8
 		switch ip := h[0].(type) {
 		case header.IPv4:
@@ -310,6 +312,8 @@ func SrcPort(port uint16) TransportChecker {
 // DstPort creates a checker that checks the destination port.
 func DstPort(port uint16) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
 		if p := h.DestinationPort(); p != port {
 			t.Errorf("Bad destination port, got %v, want %v", p, port)
 		}
@@ -336,6 +340,7 @@ func SeqNum(seq uint32) TransportChecker {
 func AckNum(seq uint32) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
+
 		tcp, ok := h.(header.TCP)
 		if !ok {
 			return
@@ -350,6 +355,8 @@ func AckNum(seq uint32) TransportChecker {
 // Window creates a checker that checks the tcp window.
 func Window(window uint16) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
 		tcp, ok := h.(header.TCP)
 		if !ok {
 			return
@@ -381,6 +388,8 @@ func TCPFlags(flags uint8) TransportChecker {
 // given mask, match the supplied flags.
 func TCPFlagsMatch(flags, mask uint8) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
 		tcp, ok := h.(header.TCP)
 		if !ok {
 			return
@@ -398,6 +407,8 @@ func TCPFlagsMatch(flags, mask uint8) TransportChecker {
 // If wndscale is negative, the window scale option must not be present.
 func TCPSynOptions(wantOpts header.TCPSynOptions) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
 		tcp, ok := h.(header.TCP)
 		if !ok {
 			return
@@ -494,6 +505,8 @@ func TCPSynOptions(wantOpts header.TCPSynOptions) TransportChecker {
 // skipped.
 func TCPTimestampChecker(wantTS bool, wantTSVal uint32, wantTSEcr uint32) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
 		tcp, ok := h.(header.TCP)
 		if !ok {
 			return
@@ -612,6 +625,8 @@ func TCPSACKBlockChecker(sackBlocks []header.SACKBlock) TransportChecker {
 // Payload creates a checker that checks the payload.
 func Payload(want []byte) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
 		if got := h.Payload(); !reflect.DeepEqual(got, want) {
 			t.Errorf("Wrong payload, got %v, want %v", got, want)
 		}
@@ -644,6 +659,7 @@ func ICMPv4(checkers ...TransportChecker) NetworkChecker {
 func ICMPv4Type(want header.ICMPv4Type) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
+
 		icmpv4, ok := h.(header.ICMPv4)
 		if !ok {
 			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv4", h)
@@ -658,6 +674,7 @@ func ICMPv4Type(want header.ICMPv4Type) TransportChecker {
 func ICMPv4Code(want byte) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
+
 		icmpv4, ok := h.(header.ICMPv4)
 		if !ok {
 			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv4", h)
@@ -700,6 +717,7 @@ func ICMPv6(checkers ...TransportChecker) NetworkChecker {
 func ICMPv6Type(want header.ICMPv6Type) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
+
 		icmpv6, ok := h.(header.ICMPv6)
 		if !ok {
 			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv6", h)
@@ -714,6 +732,7 @@ func ICMPv6Type(want header.ICMPv6Type) TransportChecker {
 func ICMPv6Code(want byte) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
+
 		icmpv6, ok := h.(header.ICMPv6)
 		if !ok {
 			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv6", h)
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index b4a0ae53d..9bf67686d 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -50,13 +50,11 @@ type NotificationHandle struct {
 }
 
 type queue struct {
+	// c is the outbound packet channel.
+	c chan PacketInfo
 	// mu protects fields below.
-	mu sync.RWMutex
-	// c is the outbound packet channel. Sending to c should hold mu.
-	c        chan PacketInfo
-	numWrite int
-	numRead  int
-	notify   []*NotificationHandle
+	mu     sync.RWMutex
+	notify []*NotificationHandle
 }
 
 func (q *queue) Close() {
@@ -64,11 +62,8 @@ func (q *queue) Close() {
 }
 
 func (q *queue) Read() (PacketInfo, bool) {
-	q.mu.Lock()
-	defer q.mu.Unlock()
 	select {
 	case p := <-q.c:
-		q.numRead++
 		return p, true
 	default:
 		return PacketInfo{}, false
@@ -76,15 +71,8 @@ func (q *queue) Read() (PacketInfo, bool) {
 }
 
 func (q *queue) ReadContext(ctx context.Context) (PacketInfo, bool) {
-	// We have to receive from channel without holding the lock, since it can
-	// block indefinitely. This will cause a window that numWrite - numRead
-	// produces a larger number, but won't go to negative. numWrite >= numRead
-	// still holds.
 	select {
 	case pkt := <-q.c:
-		q.mu.Lock()
-		defer q.mu.Unlock()
-		q.numRead++
 		return pkt, true
 	case <-ctx.Done():
 		return PacketInfo{}, false
@@ -93,16 +81,12 @@ func (q *queue) ReadContext(ctx context.Context) (PacketInfo, bool) {
 
 func (q *queue) Write(p PacketInfo) bool {
 	wrote := false
-
-	// It's important to make sure nobody can see numWrite until we increment it,
-	// so numWrite >= numRead holds.
-	q.mu.Lock()
 	select {
 	case q.c <- p:
 		wrote = true
-		q.numWrite++
 	default:
 	}
+	q.mu.Lock()
 	notify := q.notify
 	q.mu.Unlock()
 
@@ -116,13 +100,7 @@ func (q *queue) Write(p PacketInfo) bool {
 }
 
 func (q *queue) Num() int {
-	q.mu.RLock()
-	defer q.mu.RUnlock()
-	n := q.numWrite - q.numRead
-	if n < 0 {
-		panic("numWrite < numRead")
-	}
-	return n
+	return len(q.c)
 }
 
 func (q *queue) AddNotify(notify Notification) *NotificationHandle {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index aec7126ff..109121dbc 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -681,6 +681,10 @@ type TCPTimeWaitTimeoutOption time.Duration
 // for a handshake till the specified timeout until a segment with data arrives.
 type TCPDeferAcceptOption time.Duration
 
+// TCPMinRTOOption is use by SetSockOpt/GetSockOpt to allow overriding
+// default MinRTO used by the Stack.
+type TCPMinRTOOption time.Duration
+
 // MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a
 // default interface for multicast.
 type MulticastInterfaceOption struct {
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 7f94f9646..edb7718a6 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -87,7 +87,9 @@ go_test(
         "tcp_timestamp_test.go",
     ],
     # FIXME(b/68809571)
-    tags = ["flaky"],
+    tags = [
+        "flaky",
+    ],
     deps = [
         ":tcp",
         "//pkg/sync",
@@ -104,5 +106,6 @@ go_test(
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/tcp/testing/context",
         "//pkg/waiter",
+        "//runsc/testutil",
     ],
 )
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index dce9a1652..91f25c132 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -105,6 +105,7 @@ type protocol struct {
 	moderateReceiveBuffer      bool
 	tcpLingerTimeout           time.Duration
 	tcpTimeWaitTimeout         time.Duration
+	minRTO                     time.Duration
 	dispatcher                 *dispatcher
 }
 
@@ -272,6 +273,15 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 		p.mu.Unlock()
 		return nil
 
+	case tcpip.TCPMinRTOOption:
+		if v < 0 {
+			v = tcpip.TCPMinRTOOption(MinRTO)
+		}
+		p.mu.Lock()
+		p.minRTO = time.Duration(v)
+		p.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -334,6 +344,12 @@ func (p *protocol) Option(option interface{}) *tcpip.Error {
 		p.mu.RUnlock()
 		return nil
 
+	case *tcpip.TCPMinRTOOption:
+		p.mu.RLock()
+		*v = tcpip.TCPMinRTOOption(p.minRTO)
+		p.mu.RUnlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -359,5 +375,6 @@ func NewProtocol() stack.TransportProtocol {
 		tcpLingerTimeout:           DefaultTCPLingerTimeout,
 		tcpTimeWaitTimeout:         DefaultTCPTimeWaitTimeout,
 		dispatcher:                 newDispatcher(runtime.GOMAXPROCS(0)),
+		minRTO:                     MinRTO,
 	}
 }
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 6b7bac37d..d8cfe3115 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -15,6 +15,7 @@
 package tcp
 
 import (
+	"fmt"
 	"math"
 	"sync/atomic"
 	"time"
@@ -149,6 +150,9 @@ type sender struct {
 	rtt rtt
 	rto time.Duration
 
+	// minRTO is the minimum permitted value for sender.rto.
+	minRTO time.Duration
+
 	// maxPayloadSize is the maximum size of the payload of a given segment.
 	// It is initialized on demand.
 	maxPayloadSize int
@@ -260,6 +264,13 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
 	// etc.
 	s.ep.scoreboard = NewSACKScoreboard(uint16(s.maxPayloadSize), iss)
 
+	// Get Stack wide minRTO.
+	var v tcpip.TCPMinRTOOption
+	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
+		panic(fmt.Sprintf("unable to get minRTO from stack: %s", err))
+	}
+	s.minRTO = time.Duration(v)
+
 	return s
 }
 
@@ -394,8 +405,8 @@ func (s *sender) updateRTO(rtt time.Duration) {
 
 	s.rto = s.rtt.srtt + 4*s.rtt.rttvar
 	s.rtt.Unlock()
-	if s.rto < MinRTO {
-		s.rto = MinRTO
+	if s.rto < s.minRTO {
+		s.rto = s.minRTO
 	}
 }
 
diff --git a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
index 782d7b42c..359a75e73 100644
--- a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
@@ -31,6 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context"
+	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 func TestFastRecovery(t *testing.T) {
@@ -40,7 +41,7 @@ func TestFastRecovery(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	const iterations = 7
+	const iterations = 3
 	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
 	for i := range data {
 		data[i] = byte(i)
@@ -86,16 +87,23 @@ func TestFastRecovery(t *testing.T) {
 	// Receive the retransmitted packet.
 	c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
 
-	if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want {
-		t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
-	}
+	// Wait before checking metrics.
+	metricPollFn := func() error {
+		if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
+		}
+		if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
+		}
 
-	if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
-		t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
+		if got, want := c.Stack().Stats().TCP.FastRecovery.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.FastRecovery.Value = %v, want = %v", got, want)
+		}
+		return nil
 	}
 
-	if got, want := c.Stack().Stats().TCP.FastRecovery.Value(), uint64(1); got != want {
-		t.Errorf("got stats.TCP.FastRecovery.Value = %v, want = %v", got, want)
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
 	}
 
 	// Now send 7 mode duplicate acks. Each of these should cause a window
@@ -117,12 +125,18 @@ func TestFastRecovery(t *testing.T) {
 	// Receive the retransmit due to partial ack.
 	c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
 
-	if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(2); got != want {
-		t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
+	// Wait before checking metrics.
+	metricPollFn = func() error {
+		if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(2); got != want {
+			return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
+		}
+		if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(2); got != want {
+			return fmt.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
+		}
+		return nil
 	}
-
-	if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(2); got != want {
-		t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
 	}
 
 	// Receive the 10 extra packets that should have been released due to
@@ -192,7 +206,7 @@ func TestExponentialIncreaseDuringSlowStart(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	const iterations = 7
+	const iterations = 3
 	data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
 	for i := range data {
 		data[i] = byte(i)
@@ -234,7 +248,7 @@ func TestCongestionAvoidance(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	const iterations = 7
+	const iterations = 3
 	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
 	for i := range data {
 		data[i] = byte(i)
@@ -338,7 +352,7 @@ func TestCubicCongestionAvoidance(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	const iterations = 7
+	const iterations = 3
 	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
 
 	for i := range data {
@@ -447,7 +461,7 @@ func TestRetransmit(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	const iterations = 7
+	const iterations = 3
 	data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
 	for i := range data {
 		data[i] = byte(i)
@@ -492,24 +506,33 @@ func TestRetransmit(t *testing.T) {
 	rtxOffset := bytesRead - maxPayload*expected
 	c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
 
-	if got, want := c.Stack().Stats().TCP.Timeouts.Value(), uint64(1); got != want {
-		t.Errorf("got stats.TCP.Timeouts.Value = %v, want = %v", got, want)
-	}
+	metricPollFn := func() error {
+		if got, want := c.Stack().Stats().TCP.Timeouts.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.Timeouts.Value = %v, want = %v", got, want)
+		}
 
-	if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
-		t.Errorf("got stats.TCP.Retransmits.Value = %v, want = %v", got, want)
-	}
+		if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.Retransmits.Value = %v, want = %v", got, want)
+		}
 
-	if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Timeouts.Value(), uint64(1); got != want {
-		t.Errorf("got EP SendErrors.Timeouts.Value = %v, want = %v", got, want)
-	}
+		if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Timeouts.Value(), uint64(1); got != want {
+			return fmt.Errorf("got EP SendErrors.Timeouts.Value = %v, want = %v", got, want)
+		}
+
+		if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Retransmits.Value(), uint64(1); got != want {
+			return fmt.Errorf("got EP stats SendErrors.Retransmits.Value = %v, want = %v", got, want)
+		}
+
+		if got, want := c.Stack().Stats().TCP.SlowStartRetransmits.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.SlowStartRetransmits.Value = %v, want = %v", got, want)
+		}
 
-	if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Retransmits.Value(), uint64(1); got != want {
-		t.Errorf("got EP stats SendErrors.Retransmits.Value = %v, want = %v", got, want)
+		return nil
 	}
 
-	if got, want := c.Stack().Stats().TCP.SlowStartRetransmits.Value(), uint64(1); got != want {
-		t.Errorf("got stats.TCP.SlowStartRetransmits.Value = %v, want = %v", got, want)
+	// Poll when checking metrics.
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
 	}
 
 	// Acknowledge half of the pending data.
diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go
index afea124ec..c439d5281 100644
--- a/pkg/tcpip/transport/tcp/tcp_sack_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go
@@ -387,7 +387,7 @@ func TestSACKRecovery(t *testing.T) {
 	setStackSACKPermitted(t, c, true)
 	createConnectedWithSACKAndTS(c)
 
-	const iterations = 7
+	const iterations = 3
 	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
 	for i := range data {
 		data[i] = byte(i)
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 29301a45c..41caa9ed4 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -590,6 +590,10 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
 		),
 	)
 
+	// Give the stack a few ms to transition the endpoint out of ESTABLISHED
+	// state.
+	time.Sleep(10 * time.Millisecond)
+
 	if got, want := tcp.EndpointState(ep.State()), tcp.StateCloseWait; got != want {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
 	}
@@ -4472,8 +4476,8 @@ func TestKeepalive(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	const keepAliveInterval = 10 * time.Millisecond
-	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
+	const keepAliveInterval = 3 * time.Second
+	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(100 * time.Millisecond))
 	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
 	c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5)
 	c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true)
@@ -4567,7 +4571,7 @@ func TestKeepalive(t *testing.T) {
 	// Sleep for a litte over the KeepAlive interval to make sure
 	// the timer has time to fire after the last ACK and close the
 	// close the socket.
-	time.Sleep(keepAliveInterval + 5*time.Millisecond)
+	time.Sleep(keepAliveInterval + keepAliveInterval/2)
 
 	// The connection should be terminated after 5 unacked keepalives.
 	// Send an ACK to trigger a RST from the stack as the endpoint should
@@ -6615,14 +6619,17 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 
 	origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
 
-	const keepAliveInterval = 10 * time.Millisecond
-	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
+	const keepAliveInterval = 3 * time.Second
+	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(100 * time.Millisecond))
 	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
 	c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10)
 	c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true)
 
-	// Set userTimeout to be the duration for 3 keepalive probes.
-	userTimeout := 30 * time.Millisecond
+	// Set userTimeout to be the duration to be 1 keepalive
+	// probes. Which means that after the first probe is sent
+	// the second one should cause the connection to be
+	// closed due to userTimeout being hit.
+	userTimeout := 1 * keepAliveInterval
 	c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
 
 	// Check that the connection is still alive.
@@ -6630,28 +6637,23 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock)
 	}
 
-	// Now receive 2 keepalives, but don't ACK them. The connection should
-	// be reset when the 3rd one should be sent due to userTimeout being
-	// 30ms and each keepalive probe should be sent 10ms apart as set above after
-	// the connection has been idle for 10ms.
-	for i := 0; i < 2; i++ {
-		b := c.GetPacket()
-		checker.IPv4(t, b,
-			checker.TCP(
-				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(c.IRS)),
-				checker.AckNum(uint32(790)),
-				checker.TCPFlags(header.TCPFlagAck),
-			),
-		)
-	}
+	// Now receive 1 keepalives, but don't ACK it.
+	b := c.GetPacket()
+	checker.IPv4(t, b,
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)),
+			checker.AckNum(uint32(790)),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
 
 	// Sleep for a litte over the KeepAlive interval to make sure
 	// the timer has time to fire after the last ACK and close the
 	// close the socket.
-	time.Sleep(keepAliveInterval + 5*time.Millisecond)
+	time.Sleep(keepAliveInterval + keepAliveInterval/2)
 
-	// The connection should be terminated after 30ms.
+	// The connection should be closed with a timeout.
 	// Send an ACK to trigger a RST from the stack as the endpoint should
 	// be dead.
 	c.SendPacket(nil, &context.Headers{
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 431ab4e6b..7b1d72cf4 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -152,6 +152,13 @@ func New(t *testing.T, mtu uint32) *Context {
 		t.Fatalf("SetTransportProtocolOption failed: %v", err)
 	}
 
+	// Increase minimum RTO in tests to avoid test flakes due to early
+	// retransmit in case the test executors are overloaded and cause timers
+	// to fire earlier than expected.
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMinRTOOption(3*time.Second)); err != nil {
+		t.Fatalf("failed to set stack-wide minRTO: %s", err)
+	}
+
 	// Some of the congestion control tests send up to 640 packets, we so
 	// set the channel size to 1000.
 	ep := channel.New(1000, mtu, "")
@@ -236,7 +243,7 @@ func (c *Context) CheckNoPacket(errMsg string) {
 func (c *Context) GetPacket() []byte {
 	c.t.Helper()
 
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 	defer cancel()
 	p, ok := c.linkEP.ReadContext(ctx)
 	if !ok {
@@ -417,6 +424,8 @@ func (c *Context) SendAckWithSACK(seq seqnum.Value, bytesReceived int, sackBlock
 // verifies that the packet packet payload of packet matches the slice
 // of data indicated by offset & size.
 func (c *Context) ReceiveAndCheckPacket(data []byte, offset, size int) {
+	c.t.Helper()
+
 	c.ReceiveAndCheckPacketWithOptions(data, offset, size, 0)
 }
 
@@ -425,6 +434,8 @@ func (c *Context) ReceiveAndCheckPacket(data []byte, offset, size int) {
 // data indicated by offset & size and skips optlen bytes in addition to the IP
 // TCP headers when comparing the data.
 func (c *Context) ReceiveAndCheckPacketWithOptions(data []byte, offset, size, optlen int) {
+	c.t.Helper()
+
 	b := c.GetPacket()
 	checker.IPv4(c.t, b,
 		checker.PayloadLen(size+header.TCPMinimumSize+optlen),
@@ -447,6 +458,8 @@ func (c *Context) ReceiveAndCheckPacketWithOptions(data []byte, offset, size, op
 // data indicated by offset & size. It returns true if a packet was received and
 // processed.
 func (c *Context) ReceiveNonBlockingAndCheckPacket(data []byte, offset, size int) bool {
+	c.t.Helper()
+
 	b := c.GetPacketNonBlocking()
 	if b == nil {
 		return false
@@ -570,6 +583,8 @@ func (c *Context) CreateConnected(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf
 //
 // PreCondition: c.EP must already be created.
 func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte) {
+	c.t.Helper()
+
 	// Start connection attempt.
 	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&waitEntry, waiter.EventOut)
-- 
cgit v1.2.3


From 9c918340e4e6126cca1dfedbf28fec8c8f836e1a Mon Sep 17 00:00:00 2001
From: Mithun Iyer <iyerm@google.com>
Date: Wed, 15 Apr 2020 01:10:38 -0700
Subject: Reset pending connections on listener close

Attempt to redeliver TCP segments that are enqueued into a closing
TCP endpoint. This was being done for Established endpoints but not
for those that are listening or performing connection handshake.

Fixes #2417

PiperOrigin-RevId: 306598155
---
 pkg/tcpip/transport/tcp/accept.go           |  7 +++-
 pkg/tcpip/transport/tcp/connect.go          | 30 ++++++++------
 pkg/tcpip/transport/tcp/endpoint.go         | 30 +++++++-------
 pkg/tcpip/transport/tcp/tcp_test.go         | 37 +++++++++++++++++
 test/packetimpact/tests/BUILD               |  2 -
 test/syscalls/linux/socket_inet_loopback.cc | 62 +++++++++++++++++++++++++++++
 6 files changed, 138 insertions(+), 30 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 7a9dea4ac..e07b436c4 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -330,6 +330,9 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 		if l.listenEP != nil {
 			l.removePendingEndpoint(ep)
 		}
+
+		ep.drainClosingSegmentQueue()
+
 		return nil, err
 	}
 	ep.isConnectNotified = true
@@ -378,7 +381,7 @@ func (e *endpoint) deliverAccepted(n *endpoint) {
 	for {
 		if e.acceptedChan == nil {
 			e.acceptMu.Unlock()
-			n.Close()
+			n.notifyProtocolGoroutine(notifyReset)
 			return
 		}
 		select {
@@ -656,6 +659,8 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 		}
 		e.mu.Unlock()
 
+		e.drainClosingSegmentQueue()
+
 		// Notify waiters that the endpoint is shutdown.
 		e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut | waiter.EventHUp | waiter.EventErr)
 	}()
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 2ca3fb809..994ac52a3 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -1062,6 +1062,20 @@ func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 	}
 }
 
+// Drain segment queue from the endpoint and try to re-match the segment to a
+// different endpoint. This is used when the current endpoint is transitioned to
+// StateClose and has been unregistered from the transport demuxer.
+func (e *endpoint) drainClosingSegmentQueue() {
+	for {
+		s := e.segmentQueue.dequeue()
+		if s == nil {
+			break
+		}
+
+		e.tryDeliverSegmentFromClosedEndpoint(s)
+	}
+}
+
 func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 	if e.rcv.acceptable(s.sequenceNumber, 0) {
 		// RFC 793, page 37 states that "in all states
@@ -1315,6 +1329,9 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 		}
 
 		e.mu.Unlock()
+
+		e.drainClosingSegmentQueue()
+
 		// When the protocol loop exits we should wake up our waiters.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 	}
@@ -1565,19 +1582,6 @@ loop:
 	// Lock released below.
 	epilogue()
 
-	// epilogue removes the endpoint from the transport-demuxer and
-	// unlocks e.mu. Now that no new segments can get enqueued to this
-	// endpoint, try to re-match the segment to a different endpoint
-	// as the current endpoint is closed.
-	for {
-		s := e.segmentQueue.dequeue()
-		if s == nil {
-			break
-		}
-
-		e.tryDeliverSegmentFromClosedEndpoint(s)
-	}
-
 	// A new SYN was received during TIME_WAIT and we need to abort
 	// the timewait and redirect the segment to the listener queue
 	if reuseTW != nil {
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index a8d443f73..7ed78d57f 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -980,25 +980,22 @@ func (e *endpoint) closeNoShutdownLocked() {
 
 	// Mark endpoint as closed.
 	e.closed = true
+
+	switch e.EndpointState() {
+	case StateClose, StateError:
+		return
+	}
+
 	// Either perform the local cleanup or kick the worker to make sure it
 	// knows it needs to cleanup.
-	switch e.EndpointState() {
-	// Sockets in StateSynRecv state(passive connections) are closed when
-	// the handshake fails or if the listening socket is closed while
-	// handshake was in progress. In such cases the handshake goroutine
-	// is already gone by the time Close is called and we need to cleanup
-	// here.
-	case StateInitial, StateBound, StateSynRecv:
-		e.cleanupLocked()
-		e.setEndpointState(StateClose)
-	case StateError, StateClose:
-		// do nothing.
-	default:
+	if e.workerRunning {
 		e.workerCleanup = true
 		tcpip.AddDanglingEndpoint(e)
 		// Worker will remove the dangling endpoint when the endpoint
 		// goroutine terminates.
 		e.notifyProtocolGoroutine(notifyClose)
+	} else {
+		e.transitionToStateCloseLocked()
 	}
 }
 
@@ -1010,13 +1007,18 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
 		e.acceptMu.Unlock()
 		return
 	}
-
 	close(e.acceptedChan)
+	ch := e.acceptedChan
 	e.acceptedChan = nil
 	e.acceptCond.Broadcast()
 	e.acceptMu.Unlock()
 
-	// Wait for all pending endpoints to close.
+	// Reset all connections that are waiting to be accepted.
+	for n := range ch {
+		n.notifyProtocolGoroutine(notifyReset)
+	}
+	// Wait for reset of all endpoints that are still waiting to be delivered to
+	// the now closed acceptedChan.
 	e.pendingAccepted.Wait()
 }
 
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 41caa9ed4..a9f121c17 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -1068,6 +1068,43 @@ func TestListenShutdown(t *testing.T) {
 	c.CheckNoPacket("Packet received when listening socket was shutdown")
 }
 
+// TestListenCloseWhileConnect tests for the listening endpoint to
+// drain the accept-queue when closed. This should reset all of the
+// pending connections that are waiting to be accepted.
+func TestListenCloseWhileConnect(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1 /* epRcvBuf */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(1 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventIn)
+	defer c.WQ.EventUnregister(&waitEntry)
+
+	executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+	// Wait for the new endpoint created because of handshake to be delivered
+	// to the listening endpoint's accept queue.
+	<-notifyCh
+
+	// Close the listening endpoint.
+	c.EP.Close()
+
+	// Expect the listening endpoint to reset the connection.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+		))
+}
+
 func TestTOSV4(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 308590162..1274d9f60 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -43,8 +43,6 @@ packetimpact_go_test(
 packetimpact_go_test(
     name = "tcp_noaccept_close_rst",
     srcs = ["tcp_noaccept_close_rst_test.go"],
-    # TODO(b/153380909): Fix netstack then remove the line below.
-    netstack = False,
     deps = [
         "//pkg/tcpip/header",
         "//test/packetimpact/testbench",
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 71bd7c14d..cd84e633a 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -365,6 +365,68 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   }
 }
 
+TEST_P(SocketInetLoopbackTest, TCPListenCloseWhileConnect) {
+  auto const& param = GetParam();
+
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  constexpr int kBacklog = 2;
+  constexpr int kClients = kBacklog + 1;
+
+  // Create the listening socket.
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  std::vector<FileDescriptor> clients;
+  for (int i = 0; i < kClients; i++) {
+    FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+    int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                      connector.addr_len);
+    if (ret != 0) {
+      EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
+      clients.push_back(std::move(client));
+    }
+  }
+  // Close the listening socket.
+  listen_fd.reset();
+
+  for (auto& client : clients) {
+    const int kTimeout = 10000;
+    struct pollfd pfd = {
+        .fd = client.get(),
+        .events = POLLIN,
+    };
+    // When the listening socket is closed, then we expect the remote to reset
+    // the connection.
+    ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+    ASSERT_EQ(pfd.revents, POLLIN | POLLHUP | POLLERR);
+    char c;
+    // Subsequent read can fail with:
+    // ECONNRESET: If the client connection was established and was reset by the
+    // remote. ECONNREFUSED: If the client connection failed to be established.
+    ASSERT_THAT(read(client.get(), &c, sizeof(c)),
+                AnyOf(SyscallFailsWithErrno(ECONNRESET),
+                      SyscallFailsWithErrno(ECONNREFUSED)));
+  }
+}
+
 TEST_P(SocketInetLoopbackTest, TCPbacklog) {
   auto const& param = GetParam();
 
-- 
cgit v1.2.3


From 0348edc3cb070d1167b45bd13b1d202ddf3e7b0a Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Wed, 15 Apr 2020 06:04:30 -0700
Subject: Remove unnecessary code

Remove useless casts and duplicate return statements.

PiperOrigin-RevId: 306627916
---
 pkg/tcpip/transport/tcp/endpoint.go | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 7ed78d57f..bffc59e9f 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1457,13 +1457,11 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		e.LockUser()
 		e.reuseAddr = v
 		e.UnlockUser()
-		return nil
 
 	case tcpip.ReusePortOption:
 		e.LockUser()
 		e.reusePort = v
 		e.UnlockUser()
-		return nil
 
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
@@ -1494,7 +1492,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	switch opt {
 	case tcpip.KeepaliveCountOption:
 		e.keepalive.Lock()
-		e.keepalive.count = int(v)
+		e.keepalive.count = v
 		e.keepalive.Unlock()
 		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
 
@@ -1526,13 +1524,12 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		// Make sure the receive buffer size is within the min and max
 		// allowed.
 		var rs ReceiveBufferSizeOption
-		size := int(v)
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
-			if size < rs.Min {
-				size = rs.Min
+			if v < rs.Min {
+				v = rs.Min
 			}
-			if size > rs.Max {
-				size = rs.Max
+			if v > rs.Max {
+				v = rs.Max
 			}
 		}
 
@@ -1547,17 +1544,17 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		if e.rcv != nil {
 			scale = e.rcv.rcvWndScale
 		}
-		if size>>scale == 0 {
-			size = 1 << scale
+		if v>>scale == 0 {
+			v = 1 << scale
 		}
 
 		// Make sure 2*size doesn't overflow.
-		if size > math.MaxInt32/2 {
-			size = math.MaxInt32 / 2
+		if v > math.MaxInt32/2 {
+			v = math.MaxInt32 / 2
 		}
 
 		availBefore := e.receiveBufferAvailableLocked()
-		e.rcvBufSize = size
+		e.rcvBufSize = v
 		availAfter := e.receiveBufferAvailableLocked()
 
 		e.rcvAutoParams.disabled = true
@@ -1576,19 +1573,18 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	case tcpip.SendBufferSizeOption:
 		// Make sure the send buffer size is within the min and max
 		// allowed.
-		size := int(v)
 		var ss SendBufferSizeOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
-			if size < ss.Min {
-				size = ss.Min
+			if v < ss.Min {
+				v = ss.Min
 			}
-			if size > ss.Max {
-				size = ss.Max
+			if v > ss.Max {
+				v = ss.Max
 			}
 		}
 
 		e.sndBufMu.Lock()
-		e.sndBufSize = size
+		e.sndBufSize = v
 		e.sndBufMu.Unlock()
 
 	case tcpip.TTLOption:
-- 
cgit v1.2.3


From 7c13546d3b6fff7ce507fae6d0435e0082dc20ba Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Wed, 15 Apr 2020 11:01:29 -0700
Subject: Deduplicate packet logging

PiperOrigin-RevId: 306677789
---
 pkg/tcpip/link/sniffer/sniffer.go | 128 +++++++++++---------------------------
 1 file changed, 38 insertions(+), 90 deletions(-)

diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 062388f4d..03def4013 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -21,7 +21,6 @@
 package sniffer
 
 import (
-	"bytes"
 	"encoding/binary"
 	"fmt"
 	"io"
@@ -124,36 +123,7 @@ func NewWithFile(lower stack.LinkEndpoint, file *os.File, snapLen uint32) (stack
 // called by the link-layer endpoint being wrapped when a packet arrives, and
 // logs the packet before forwarding to the actual dispatcher.
 func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
-	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("recv", protocol, pkt.Data.First(), nil)
-	}
-	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
-		vs := pkt.Data.Views()
-		length := pkt.Data.Size()
-		if length > int(e.maxPCAPLen) {
-			length = int(e.maxPCAPLen)
-		}
-
-		buf := bytes.NewBuffer(make([]byte, 0, pcapPacketHeaderLen+length))
-		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(pkt.Data.Size()))); err != nil {
-			panic(err)
-		}
-		for _, v := range vs {
-			if length == 0 {
-				break
-			}
-			if len(v) > length {
-				v = v[:length]
-			}
-			if _, err := buf.Write([]byte(v)); err != nil {
-				panic(err)
-			}
-			length -= len(v)
-		}
-		if _, err := e.file.Write(buf.Bytes()); err != nil {
-			panic(err)
-		}
-	}
+	e.dumpPacket("recv", nil, protocol, &pkt)
 	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt)
 }
 
@@ -200,31 +170,43 @@ func (e *endpoint) GSOMaxSize() uint32 {
 	return 0
 }
 
-func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
-	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("send", protocol, pkt.Header.View(), gso)
+func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	file := e.file
+	if file == nil && atomic.LoadUint32(&LogPackets) == 1 {
+		first := pkt.Header.View()
+		if len(first) == 0 {
+			first = pkt.Data.First()
+		}
+		logPacket(prefix, protocol, first, gso)
 	}
-	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
-		hdrBuf := pkt.Header.View()
-		length := len(hdrBuf) + pkt.Data.Size()
-		if length > int(e.maxPCAPLen) {
-			length = int(e.maxPCAPLen)
+	if file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
+		totalLength := pkt.Header.UsedLength() + pkt.Data.Size()
+		length := totalLength
+		if max := int(e.maxPCAPLen); length > max {
+			length = max
 		}
-
-		buf := bytes.NewBuffer(make([]byte, 0, pcapPacketHeaderLen+length))
-		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(len(hdrBuf)+pkt.Data.Size()))); err != nil {
+		if err := binary.Write(file, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(totalLength))); err != nil {
 			panic(err)
 		}
-		if len(hdrBuf) > length {
-			hdrBuf = hdrBuf[:length]
-		}
-		if _, err := buf.Write(hdrBuf); err != nil {
-			panic(err)
+		write := func(b []byte) {
+			if len(b) > length {
+				b = b[:length]
+			}
+			for len(b) != 0 {
+				n, err := file.Write(b)
+				if err != nil {
+					panic(err)
+				}
+				b = b[n:]
+				length -= n
+			}
 		}
-		length -= len(hdrBuf)
-		logVectorisedView(pkt.Data, length, buf)
-		if _, err := e.file.Write(buf.Bytes()); err != nil {
-			panic(err)
+		write(pkt.Header.View())
+		for _, view := range pkt.Data.Views() {
+			if length == 0 {
+				break
+			}
+			write(view)
 		}
 	}
 }
@@ -233,7 +215,7 @@ func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumb
 // higher-level protocols to write packets; it just logs the packet and
 // forwards the request to the lower endpoint.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
-	e.dumpPacket(gso, protocol, &pkt)
+	e.dumpPacket("send", gso, protocol, &pkt)
 	return e.lower.WritePacket(r, gso, protocol, pkt)
 }
 
@@ -242,53 +224,19 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 // forwards the request to the lower endpoint.
 func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
-		e.dumpPacket(gso, protocol, pkt)
+		e.dumpPacket("send", gso, protocol, pkt)
 	}
 	return e.lower.WritePackets(r, gso, pkts, protocol)
 }
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("send", 0, buffer.View("[raw packet, no header available]"), nil /* gso */)
-	}
-	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
-		length := vv.Size()
-		if length > int(e.maxPCAPLen) {
-			length = int(e.maxPCAPLen)
-		}
-
-		buf := bytes.NewBuffer(make([]byte, 0, pcapPacketHeaderLen+length))
-		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(vv.Size()))); err != nil {
-			panic(err)
-		}
-		logVectorisedView(vv, length, buf)
-		if _, err := e.file.Write(buf.Bytes()); err != nil {
-			panic(err)
-		}
-	}
+	e.dumpPacket("send", nil, 0, &stack.PacketBuffer{
+		Data: vv,
+	})
 	return e.lower.WriteRawPacket(vv)
 }
 
-func logVectorisedView(vv buffer.VectorisedView, length int, buf *bytes.Buffer) {
-	if length <= 0 {
-		return
-	}
-	for _, v := range vv.Views() {
-		if len(v) > length {
-			v = v[:length]
-		}
-		n, err := buf.Write(v)
-		if err != nil {
-			panic(err)
-		}
-		length -= n
-		if length == 0 {
-			return
-		}
-	}
-}
-
 // Wait implements stack.LinkEndpoint.Wait.
 func (*endpoint) Wait() {}
 
-- 
cgit v1.2.3


From 1bcc2bf17f0e2ccf8e98e934cb9f9ce66711d27a Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 15 Apr 2020 12:59:58 -0700
Subject: Refactor connections.go to make it easier to add new connection
 types.

Rather than have a struct for the state of each type of connection, such as
TCP/IPv4, UDP/IPv4, TCP/IPv6, etc, have a state for each layer, such as UDP,
TCP, IPv4, IPv6.  Those states can be composed into connections.

Tested:
  Existing unit tests still pass/fail as expected.
PiperOrigin-RevId: 306703180
---
 test/packetimpact/testbench/connections.go         | 743 +++++++++++++--------
 test/packetimpact/testbench/layers.go              |  97 ++-
 test/packetimpact/testbench/rawsockets.go          |  15 +-
 .../tests/tcp_should_piggyback_test.go             |  12 +-
 test/packetimpact/tests/tcp_window_shrink_test.go  |  18 +-
 test/packetimpact/tests/udp_recv_multicast_test.go |   2 +-
 6 files changed, 566 insertions(+), 321 deletions(-)

diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index 2b8e2f005..169db01b0 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -63,229 +63,469 @@ func pickPort() (int, uint16, error) {
 	return fd, uint16(newSockAddrInet4.Port), nil
 }
 
-// TCPIPv4 maintains state about a TCP/IPv4 connection.
-type TCPIPv4 struct {
-	outgoing     Layers
-	incoming     Layers
-	LocalSeqNum  seqnum.Value
-	RemoteSeqNum seqnum.Value
-	SynAck       *TCP
-	sniffer      Sniffer
-	injector     Injector
-	portPickerFD int
-	t            *testing.T
+// layerState stores the state of a layer of a connection.
+type layerState interface {
+	// outgoing returns an outgoing layer to be sent in a frame.
+	outgoing() Layer
+
+	// incoming creates an expected Layer for comparing against a received Layer.
+	// Because the expectation can depend on values in the received Layer, it is
+	// an input to incoming. For example, the ACK number needs to be checked in a
+	// TCP packet but only if the ACK flag is set in the received packet.
+	incoming(received Layer) Layer
+
+	// sent updates the layerState based on the Layer that was sent. The input is
+	// a Layer with all prev and next pointers populated so that the entire frame
+	// as it was sent is available.
+	sent(sent Layer) error
+
+	// received updates the layerState based on a Layer that is receieved. The
+	// input is a Layer with all prev and next pointers populated so that the
+	// entire frame as it was receieved is available.
+	received(received Layer) error
+
+	// close frees associated resources held by the LayerState.
+	close() error
 }
 
-// tcpLayerIndex is the position of the TCP layer in the TCPIPv4 connection. It
-// is the third, after Ethernet and IPv4.
-const tcpLayerIndex int = 2
+// etherState maintains state about an Ethernet connection.
+type etherState struct {
+	out, in Ether
+}
 
-// NewTCPIPv4 creates a new TCPIPv4 connection with reasonable defaults.
-func NewTCPIPv4(t *testing.T, outgoingTCP, incomingTCP TCP) TCPIPv4 {
+var _ layerState = (*etherState)(nil)
+
+// newEtherState creates a new etherState.
+func newEtherState(out, in Ether) (*etherState, error) {
 	lMAC, err := tcpip.ParseMACAddress(*localMAC)
 	if err != nil {
-		t.Fatalf("can't parse localMAC %q: %s", *localMAC, err)
+		return nil, err
 	}
 
 	rMAC, err := tcpip.ParseMACAddress(*remoteMAC)
 	if err != nil {
-		t.Fatalf("can't parse remoteMAC %q: %s", *remoteMAC, err)
+		return nil, err
 	}
-
-	portPickerFD, localPort, err := pickPort()
-	if err != nil {
-		t.Fatalf("can't pick a port: %s", err)
+	s := etherState{
+		out: Ether{SrcAddr: &lMAC, DstAddr: &rMAC},
+		in:  Ether{SrcAddr: &rMAC, DstAddr: &lMAC},
 	}
+	if err := s.out.merge(&out); err != nil {
+		return nil, err
+	}
+	if err := s.in.merge(&in); err != nil {
+		return nil, err
+	}
+	return &s, nil
+}
+
+func (s *etherState) outgoing() Layer {
+	return &s.out
+}
+
+func (s *etherState) incoming(Layer) Layer {
+	return deepcopy.Copy(&s.in).(Layer)
+}
+
+func (*etherState) sent(Layer) error {
+	return nil
+}
+
+func (*etherState) received(Layer) error {
+	return nil
+}
+
+func (*etherState) close() error {
+	return nil
+}
+
+// ipv4State maintains state about an IPv4 connection.
+type ipv4State struct {
+	out, in IPv4
+}
+
+var _ layerState = (*ipv4State)(nil)
+
+// newIPv4State creates a new ipv4State.
+func newIPv4State(out, in IPv4) (*ipv4State, error) {
 	lIP := tcpip.Address(net.ParseIP(*localIPv4).To4())
 	rIP := tcpip.Address(net.ParseIP(*remoteIPv4).To4())
-
-	sniffer, err := NewSniffer(t)
-	if err != nil {
-		t.Fatalf("can't make new sniffer: %s", err)
+	s := ipv4State{
+		out: IPv4{SrcAddr: &lIP, DstAddr: &rIP},
+		in:  IPv4{SrcAddr: &rIP, DstAddr: &lIP},
+	}
+	if err := s.out.merge(&out); err != nil {
+		return nil, err
 	}
+	if err := s.in.merge(&in); err != nil {
+		return nil, err
+	}
+	return &s, nil
+}
 
-	injector, err := NewInjector(t)
+func (s *ipv4State) outgoing() Layer {
+	return &s.out
+}
+
+func (s *ipv4State) incoming(Layer) Layer {
+	return deepcopy.Copy(&s.in).(Layer)
+}
+
+func (*ipv4State) sent(Layer) error {
+	return nil
+}
+
+func (*ipv4State) received(Layer) error {
+	return nil
+}
+
+func (*ipv4State) close() error {
+	return nil
+}
+
+// tcpState maintains state about a TCP connection.
+type tcpState struct {
+	out, in                   TCP
+	localSeqNum, remoteSeqNum *seqnum.Value
+	synAck                    *TCP
+	portPickerFD              int
+}
+
+var _ layerState = (*tcpState)(nil)
+
+// SeqNumValue is a helper routine that allocates a new seqnum.Value value to
+// store v and returns a pointer to it.
+func SeqNumValue(v seqnum.Value) *seqnum.Value {
+	return &v
+}
+
+// newTCPState creates a new TCPState.
+func newTCPState(out, in TCP) (*tcpState, error) {
+	portPickerFD, localPort, err := pickPort()
 	if err != nil {
-		t.Fatalf("can't make new injector: %s", err)
+		return nil, err
+	}
+	s := tcpState{
+		out:          TCP{SrcPort: &localPort},
+		in:           TCP{DstPort: &localPort},
+		localSeqNum:  SeqNumValue(seqnum.Value(rand.Uint32())),
+		portPickerFD: portPickerFD,
+	}
+	if err := s.out.merge(&out); err != nil {
+		return nil, err
 	}
+	if err := s.in.merge(&in); err != nil {
+		return nil, err
+	}
+	return &s, nil
+}
 
-	newOutgoingTCP := &TCP{
-		SrcPort: &localPort,
+func (s *tcpState) outgoing() Layer {
+	newOutgoing := deepcopy.Copy(s.out).(TCP)
+	if s.localSeqNum != nil {
+		newOutgoing.SeqNum = Uint32(uint32(*s.localSeqNum))
 	}
-	if err := newOutgoingTCP.merge(outgoingTCP); err != nil {
-		t.Fatalf("can't merge %+v into %+v: %s", outgoingTCP, newOutgoingTCP, err)
+	if s.remoteSeqNum != nil {
+		newOutgoing.AckNum = Uint32(uint32(*s.remoteSeqNum))
 	}
-	newIncomingTCP := &TCP{
-		DstPort: &localPort,
+	return &newOutgoing
+}
+
+func (s *tcpState) incoming(received Layer) Layer {
+	tcpReceived, ok := received.(*TCP)
+	if !ok {
+		return nil
 	}
-	if err := newIncomingTCP.merge(incomingTCP); err != nil {
-		t.Fatalf("can't merge %+v into %+v: %s", incomingTCP, newIncomingTCP, err)
+	newIn := deepcopy.Copy(s.in).(TCP)
+	if s.remoteSeqNum != nil {
+		newIn.SeqNum = Uint32(uint32(*s.remoteSeqNum))
 	}
-	return TCPIPv4{
-		outgoing: Layers{
-			&Ether{SrcAddr: &lMAC, DstAddr: &rMAC},
-			&IPv4{SrcAddr: &lIP, DstAddr: &rIP},
-			newOutgoingTCP},
-		incoming: Layers{
-			&Ether{SrcAddr: &rMAC, DstAddr: &lMAC},
-			&IPv4{SrcAddr: &rIP, DstAddr: &lIP},
-			newIncomingTCP},
-		sniffer:      sniffer,
-		injector:     injector,
-		portPickerFD: portPickerFD,
-		t:            t,
-		LocalSeqNum:  seqnum.Value(rand.Uint32()),
+	if s.localSeqNum != nil && (*tcpReceived.Flags&header.TCPFlagAck) != 0 {
+		// The caller didn't specify an AckNum so we'll expect the calculated one,
+		// but only if the ACK flag is set because the AckNum is not valid in a
+		// header if ACK is not set.
+		newIn.AckNum = Uint32(uint32(*s.localSeqNum))
 	}
+	return &newIn
 }
 
-// Close the injector and sniffer associated with this connection.
-func (conn *TCPIPv4) Close() {
-	conn.sniffer.Close()
-	conn.injector.Close()
-	if err := unix.Close(conn.portPickerFD); err != nil {
-		conn.t.Fatalf("can't close portPickerFD: %s", err)
+func (s *tcpState) sent(sent Layer) error {
+	tcp, ok := sent.(*TCP)
+	if !ok {
+		return fmt.Errorf("can't update tcpState with %T Layer", sent)
 	}
-	conn.portPickerFD = -1
+	for current := tcp.next(); current != nil; current = current.next() {
+		s.localSeqNum.UpdateForward(seqnum.Size(current.length()))
+	}
+	if tcp.Flags != nil && *tcp.Flags&(header.TCPFlagSyn|header.TCPFlagFin) != 0 {
+		s.localSeqNum.UpdateForward(1)
+	}
+	return nil
 }
 
-// CreateFrame builds a frame for the connection with tcp overriding defaults
-// and additionalLayers added after the TCP header.
-func (conn *TCPIPv4) CreateFrame(tcp TCP, additionalLayers ...Layer) Layers {
-	if tcp.SeqNum == nil {
-		tcp.SeqNum = Uint32(uint32(conn.LocalSeqNum))
+func (s *tcpState) received(l Layer) error {
+	tcp, ok := l.(*TCP)
+	if !ok {
+		return fmt.Errorf("can't update tcpState with %T Layer", l)
 	}
-	if tcp.AckNum == nil {
-		tcp.AckNum = Uint32(uint32(conn.RemoteSeqNum))
+	s.remoteSeqNum = SeqNumValue(seqnum.Value(*tcp.SeqNum))
+	if *tcp.Flags&(header.TCPFlagSyn|header.TCPFlagFin) != 0 {
+		s.remoteSeqNum.UpdateForward(1)
 	}
-	layersToSend := deepcopy.Copy(conn.outgoing).(Layers)
-	if err := layersToSend[tcpLayerIndex].(*TCP).merge(tcp); err != nil {
-		conn.t.Fatalf("can't merge %+v into %+v: %s", tcp, layersToSend[tcpLayerIndex], err)
+	for current := tcp.next(); current != nil; current = current.next() {
+		s.remoteSeqNum.UpdateForward(seqnum.Size(current.length()))
 	}
-	layersToSend = append(layersToSend, additionalLayers...)
-	return layersToSend
+	return nil
 }
 
-// SendFrame sends a frame with reasonable defaults.
-func (conn *TCPIPv4) SendFrame(frame Layers) {
-	outBytes, err := frame.toBytes()
-	if err != nil {
-		conn.t.Fatalf("can't build outgoing TCP packet: %s", err)
+// close frees the port associated with this connection.
+func (s *tcpState) close() error {
+	if err := unix.Close(s.portPickerFD); err != nil {
+		return err
 	}
-	conn.injector.Send(outBytes)
+	s.portPickerFD = -1
+	return nil
+}
+
+// udpState maintains state about a UDP connection.
+type udpState struct {
+	out, in      UDP
+	portPickerFD int
+}
 
-	// Compute the next TCP sequence number.
-	for i := tcpLayerIndex + 1; i < len(frame); i++ {
-		conn.LocalSeqNum.UpdateForward(seqnum.Size(frame[i].length()))
+var _ layerState = (*udpState)(nil)
+
+// newUDPState creates a new udpState.
+func newUDPState(out, in UDP) (*udpState, error) {
+	portPickerFD, localPort, err := pickPort()
+	if err != nil {
+		return nil, err
 	}
-	tcp := frame[tcpLayerIndex].(*TCP)
-	if tcp.Flags != nil && *tcp.Flags&(header.TCPFlagSyn|header.TCPFlagFin) != 0 {
-		conn.LocalSeqNum.UpdateForward(1)
+	s := udpState{
+		out:          UDP{SrcPort: &localPort},
+		in:           UDP{DstPort: &localPort},
+		portPickerFD: portPickerFD,
+	}
+	if err := s.out.merge(&out); err != nil {
+		return nil, err
+	}
+	if err := s.in.merge(&in); err != nil {
+		return nil, err
 	}
+	return &s, nil
 }
 
-// Send a packet with reasonable defaults and override some fields by tcp.
-func (conn *TCPIPv4) Send(tcp TCP, additionalLayers ...Layer) {
-	conn.SendFrame(conn.CreateFrame(tcp, additionalLayers...))
+func (s *udpState) outgoing() Layer {
+	return &s.out
+}
+
+func (s *udpState) incoming(Layer) Layer {
+	return deepcopy.Copy(&s.in).(Layer)
+}
+
+func (*udpState) sent(l Layer) error {
+	return nil
 }
 
-// Recv gets a packet from the sniffer within the timeout provided.
-// If no packet arrives before the timeout, it returns nil.
-func (conn *TCPIPv4) Recv(timeout time.Duration) *TCP {
-	layers := conn.RecvFrame(timeout)
-	if tcpLayerIndex < len(layers) {
-		return layers[tcpLayerIndex].(*TCP)
+func (*udpState) received(l Layer) error {
+	return nil
+}
+
+// close frees the port associated with this connection.
+func (s *udpState) close() error {
+	if err := unix.Close(s.portPickerFD); err != nil {
+		return err
 	}
+	s.portPickerFD = -1
 	return nil
 }
 
-// RecvFrame gets a frame (of type Layers) within the timeout provided.
-// If no frame arrives before the timeout, it returns nil.
-func (conn *TCPIPv4) RecvFrame(timeout time.Duration) Layers {
-	deadline := time.Now().Add(timeout)
-	for {
-		timeout = time.Until(deadline)
-		if timeout <= 0 {
-			break
-		}
-		b := conn.sniffer.Recv(timeout)
-		if b == nil {
-			break
+// Connection holds a collection of layer states for maintaining a connection
+// along with sockets for sniffer and injecting packets.
+type Connection struct {
+	layerStates []layerState
+	injector    Injector
+	sniffer     Sniffer
+	t           *testing.T
+}
+
+// match tries to match each Layer in received against the incoming filter. If
+// received is longer than layerStates then that may still count as a match. The
+// reverse is never a match. override overrides the default matchers for each
+// Layer.
+func (conn *Connection) match(override, received Layers) bool {
+	if len(received) < len(conn.layerStates) {
+		return false
+	}
+	for i, s := range conn.layerStates {
+		toMatch := s.incoming(received[i])
+		if toMatch == nil {
+			return false
 		}
-		layers := Parse(ParseEther, b)
-		if !conn.incoming.match(layers) {
-			continue // Ignore packets that don't match the expected incoming.
+		if i < len(override) {
+			toMatch.merge(override[i])
 		}
-		tcpHeader := (layers[tcpLayerIndex]).(*TCP)
-		conn.RemoteSeqNum = seqnum.Value(*tcpHeader.SeqNum)
-		if *tcpHeader.Flags&(header.TCPFlagSyn|header.TCPFlagFin) != 0 {
-			conn.RemoteSeqNum.UpdateForward(1)
+		if !toMatch.match(received[i]) {
+			return false
 		}
-		for i := tcpLayerIndex + 1; i < len(layers); i++ {
-			conn.RemoteSeqNum.UpdateForward(seqnum.Size(layers[i].length()))
+	}
+	return true
+}
+
+// Close frees associated resources held by the Connection.
+func (conn *Connection) Close() {
+	if err := conn.sniffer.close(); err != nil {
+		conn.t.Fatal(err)
+	}
+	if err := conn.injector.close(); err != nil {
+		conn.t.Fatal(err)
+	}
+	for _, s := range conn.layerStates {
+		if err := s.close(); err != nil {
+			conn.t.Fatalf("unable to close %+v: %s", s, err)
 		}
-		return layers
 	}
-	return nil
 }
 
-// Drain drains the sniffer's receive buffer by receiving packets until there's
-// nothing else to receive.
-func (conn *TCPIPv4) Drain() {
-	conn.sniffer.Drain()
+// CreateFrame builds a frame for the connection with layer overriding defaults
+// of the innermost layer and additionalLayers added after it.
+func (conn *Connection) CreateFrame(layer Layer, additionalLayers ...Layer) Layers {
+	var layersToSend Layers
+	for _, s := range conn.layerStates {
+		layersToSend = append(layersToSend, s.outgoing())
+	}
+	if err := layersToSend[len(layersToSend)-1].merge(layer); err != nil {
+		conn.t.Fatalf("can't merge %+v into %+v: %s", layer, layersToSend[len(layersToSend)-1], err)
+	}
+	layersToSend = append(layersToSend, additionalLayers...)
+	return layersToSend
 }
 
-// Expect a packet that matches the provided tcp within the timeout specified.
-// If it doesn't arrive in time, it returns nil.
-func (conn *TCPIPv4) Expect(tcp TCP, timeout time.Duration) (*TCP, error) {
-	// We cannot implement this directly using ExpectFrame as we cannot specify
-	// the Payload part.
-	deadline := time.Now().Add(timeout)
-	var allTCP []string
-	for {
-		var gotTCP *TCP
-		if timeout = time.Until(deadline); timeout > 0 {
-			gotTCP = conn.Recv(timeout)
-		}
-		if gotTCP == nil {
-			return nil, fmt.Errorf("got %d packets:\n%s", len(allTCP), strings.Join(allTCP, "\n"))
-		}
-		if tcp.match(gotTCP) {
-			return gotTCP, nil
+// SendFrame sends a frame on the wire and updates the state of all layers.
+func (conn *Connection) SendFrame(frame Layers) {
+	outBytes, err := frame.toBytes()
+	if err != nil {
+		conn.t.Fatalf("can't build outgoing TCP packet: %s", err)
+	}
+	conn.injector.Send(outBytes)
+
+	// frame might have nil values where the caller wanted to use default values.
+	// sentFrame will have no nil values in it because it comes from parsing the
+	// bytes that were actually sent.
+	sentFrame := parse(parseEther, outBytes)
+	// Update the state of each layer based on what was sent.
+	for i, s := range conn.layerStates {
+		if err := s.sent(sentFrame[i]); err != nil {
+			conn.t.Fatalf("Unable to update the state of %+v with %s: %s", s, sentFrame[i], err)
 		}
-		allTCP = append(allTCP, gotTCP.String())
 	}
 }
 
-// ExpectFrame expects a frame that matches the specified layers within the
+// Send a packet with reasonable defaults. Potentially override the final layer
+// in the connection with the provided layer and add additionLayers.
+func (conn *Connection) Send(layer Layer, additionalLayers ...Layer) {
+	conn.SendFrame(conn.CreateFrame(layer, additionalLayers...))
+}
+
+// recvFrame gets the next successfully parsed frame (of type Layers) within the
+// timeout provided. If no parsable frame arrives before the timeout, it returns
+// nil.
+func (conn *Connection) recvFrame(timeout time.Duration) Layers {
+	if timeout <= 0 {
+		return nil
+	}
+	b := conn.sniffer.Recv(timeout)
+	if b == nil {
+		return nil
+	}
+	return parse(parseEther, b)
+}
+
+// Expect a frame with the final layerStates layer matching the provided Layer
+// within the timeout specified. If it doesn't arrive in time, it returns nil.
+func (conn *Connection) Expect(layer Layer, timeout time.Duration) (Layer, error) {
+	// Make a frame that will ignore all but the final layer.
+	layers := make([]Layer, len(conn.layerStates))
+	layers[len(layers)-1] = layer
+
+	gotFrame, err := conn.ExpectFrame(layers, timeout)
+	if err != nil {
+		return nil, err
+	}
+	if len(conn.layerStates)-1 < len(gotFrame) {
+		return gotFrame[len(conn.layerStates)-1], nil
+	}
+	conn.t.Fatal("the received frame should be at least as long as the expected layers")
+	return nil, fmt.Errorf("the received frame should be at least as long as the expected layers")
+}
+
+// ExpectFrame expects a frame that matches the provided Layers within the
 // timeout specified. If it doesn't arrive in time, it returns nil.
-func (conn *TCPIPv4) ExpectFrame(layers Layers, timeout time.Duration) Layers {
+func (conn *Connection) ExpectFrame(layers Layers, timeout time.Duration) (Layers, error) {
 	deadline := time.Now().Add(timeout)
+	var allLayers []string
 	for {
-		timeout = time.Until(deadline)
-		if timeout <= 0 {
-			return nil
+		var gotLayers Layers
+		if timeout = time.Until(deadline); timeout > 0 {
+			gotLayers = conn.recvFrame(timeout)
+		}
+		if gotLayers == nil {
+			return nil, fmt.Errorf("got %d packets:\n%s", len(allLayers), strings.Join(allLayers, "\n"))
 		}
-		gotLayers := conn.RecvFrame(timeout)
-		if layers.match(gotLayers) {
-			return gotLayers
+		if conn.match(layers, gotLayers) {
+			for i, s := range conn.layerStates {
+				if err := s.received(gotLayers[i]); err != nil {
+					conn.t.Fatal(err)
+				}
+			}
+			return gotLayers, nil
 		}
+		allLayers = append(allLayers, fmt.Sprintf("%s", gotLayers))
 	}
 }
 
-// ExpectData is a convenient method that expects a TCP packet along with
-// the payload to arrive within the timeout specified. If it doesn't arrive
-// in time, it causes a fatal test failure.
-func (conn *TCPIPv4) ExpectData(tcp TCP, data []byte, timeout time.Duration) {
-	expected := []Layer{&Ether{}, &IPv4{}, &tcp}
-	if len(data) > 0 {
-		expected = append(expected, &Payload{Bytes: data})
+// Drain drains the sniffer's receive buffer by receiving packets until there's
+// nothing else to receive.
+func (conn *Connection) Drain() {
+	conn.sniffer.Drain()
+}
+
+// TCPIPv4 maintains the state for all the layers in a TCP/IPv4 connection.
+type TCPIPv4 Connection
+
+// NewTCPIPv4 creates a new TCPIPv4 connection with reasonable defaults.
+func NewTCPIPv4(t *testing.T, outgoingTCP, incomingTCP TCP) TCPIPv4 {
+	etherState, err := newEtherState(Ether{}, Ether{})
+	if err != nil {
+		t.Fatalf("can't make etherState: %s", err)
+	}
+	ipv4State, err := newIPv4State(IPv4{}, IPv4{})
+	if err != nil {
+		t.Fatalf("can't make ipv4State: %s", err)
+	}
+	tcpState, err := newTCPState(outgoingTCP, incomingTCP)
+	if err != nil {
+		t.Fatalf("can't make tcpState: %s", err)
 	}
-	if conn.ExpectFrame(expected, timeout) == nil {
-		conn.t.Fatalf("expected to get a TCP frame %s with payload %x", &tcp, data)
+	injector, err := NewInjector(t)
+	if err != nil {
+		t.Fatalf("can't make injector: %s", err)
+	}
+	sniffer, err := NewSniffer(t)
+	if err != nil {
+		t.Fatalf("can't make sniffer: %s", err)
+	}
+
+	return TCPIPv4{
+		layerStates: []layerState{etherState, ipv4State, tcpState},
+		injector:    injector,
+		sniffer:     sniffer,
+		t:           t,
 	}
 }
 
-// Handshake performs a TCP 3-way handshake.
+// Handshake performs a TCP 3-way handshake. The input Connection should have a
+// final TCP Layer.
 func (conn *TCPIPv4) Handshake() {
 	// Send the SYN.
 	conn.Send(TCP{Flags: Uint8(header.TCPFlagSyn)})
@@ -295,138 +535,111 @@ func (conn *TCPIPv4) Handshake() {
 	if synAck == nil {
 		conn.t.Fatalf("didn't get synack during handshake: %s", err)
 	}
-	conn.SynAck = synAck
+	conn.layerStates[len(conn.layerStates)-1].(*tcpState).synAck = synAck
 
 	// Send an ACK.
 	conn.Send(TCP{Flags: Uint8(header.TCPFlagAck)})
 }
 
-// UDPIPv4 maintains state about a UDP/IPv4 connection.
-type UDPIPv4 struct {
-	outgoing     Layers
-	incoming     Layers
-	sniffer      Sniffer
-	injector     Injector
-	portPickerFD int
-	t            *testing.T
+// ExpectData is a convenient method that expects a Layer and the Layer after
+// it. If it doens't arrive in time, it returns nil.
+func (conn *TCPIPv4) ExpectData(tcp *TCP, payload *Payload, timeout time.Duration) (Layers, error) {
+	expected := make([]Layer, len(conn.layerStates))
+	expected[len(expected)-1] = tcp
+	if payload != nil {
+		expected = append(expected, payload)
+	}
+	return (*Connection)(conn).ExpectFrame(expected, timeout)
+}
+
+// Send a packet with reasonable defaults. Potentially override the TCP layer in
+// the connection with the provided layer and add additionLayers.
+func (conn *TCPIPv4) Send(tcp TCP, additionalLayers ...Layer) {
+	(*Connection)(conn).Send(&tcp, additionalLayers...)
 }
 
-// udpLayerIndex is the position of the UDP layer in the UDPIPv4 connection. It
-// is the third, after Ethernet and IPv4.
-const udpLayerIndex int = 2
+// Close frees associated resources held by the TCPIPv4 connection.
+func (conn *TCPIPv4) Close() {
+	(*Connection)(conn).Close()
+}
+
+// Expect a frame with the TCP layer matching the provided TCP within the
+// timeout specified. If it doesn't arrive in time, it returns nil.
+func (conn *TCPIPv4) Expect(tcp TCP, timeout time.Duration) (*TCP, error) {
+	layer, err := (*Connection)(conn).Expect(&tcp, timeout)
+	if layer == nil {
+		return nil, err
+	}
+	gotTCP, ok := layer.(*TCP)
+	if !ok {
+		conn.t.Fatalf("expected %s to be TCP", layer)
+	}
+	return gotTCP, err
+}
+
+// RemoteSeqNum returns the next expected sequence number from the DUT.
+func (conn *TCPIPv4) RemoteSeqNum() *seqnum.Value {
+	state, ok := conn.layerStates[len(conn.layerStates)-1].(*tcpState)
+	if !ok {
+		conn.t.Fatalf("expected final state of %v to be tcpState", conn.layerStates)
+	}
+	return state.remoteSeqNum
+}
+
+// Drain drains the sniffer's receive buffer by receiving packets until there's
+// nothing else to receive.
+func (conn *TCPIPv4) Drain() {
+	conn.sniffer.Drain()
+}
+
+// UDPIPv4 maintains the state for all the layers in a UDP/IPv4 connection.
+type UDPIPv4 Connection
 
 // NewUDPIPv4 creates a new UDPIPv4 connection with reasonable defaults.
 func NewUDPIPv4(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv4 {
-	lMAC, err := tcpip.ParseMACAddress(*localMAC)
+	etherState, err := newEtherState(Ether{}, Ether{})
 	if err != nil {
-		t.Fatalf("can't parse localMAC %q: %s", *localMAC, err)
+		t.Fatalf("can't make etherState: %s", err)
 	}
-
-	rMAC, err := tcpip.ParseMACAddress(*remoteMAC)
+	ipv4State, err := newIPv4State(IPv4{}, IPv4{})
 	if err != nil {
-		t.Fatalf("can't parse remoteMAC %q: %s", *remoteMAC, err)
+		t.Fatalf("can't make ipv4State: %s", err)
 	}
-
-	portPickerFD, localPort, err := pickPort()
+	tcpState, err := newUDPState(outgoingUDP, incomingUDP)
 	if err != nil {
-		t.Fatalf("can't pick a port: %s", err)
+		t.Fatalf("can't make udpState: %s", err)
 	}
-	lIP := tcpip.Address(net.ParseIP(*localIPv4).To4())
-	rIP := tcpip.Address(net.ParseIP(*remoteIPv4).To4())
-
-	sniffer, err := NewSniffer(t)
+	injector, err := NewInjector(t)
 	if err != nil {
-		t.Fatalf("can't make new sniffer: %s", err)
+		t.Fatalf("can't make injector: %s", err)
 	}
-
-	injector, err := NewInjector(t)
+	sniffer, err := NewSniffer(t)
 	if err != nil {
-		t.Fatalf("can't make new injector: %s", err)
+		t.Fatalf("can't make sniffer: %s", err)
 	}
 
-	newOutgoingUDP := &UDP{
-		SrcPort: &localPort,
-	}
-	if err := newOutgoingUDP.merge(outgoingUDP); err != nil {
-		t.Fatalf("can't merge %+v into %+v: %s", outgoingUDP, newOutgoingUDP, err)
-	}
-	newIncomingUDP := &UDP{
-		DstPort: &localPort,
-	}
-	if err := newIncomingUDP.merge(incomingUDP); err != nil {
-		t.Fatalf("can't merge %+v into %+v: %s", incomingUDP, newIncomingUDP, err)
-	}
 	return UDPIPv4{
-		outgoing: Layers{
-			&Ether{SrcAddr: &lMAC, DstAddr: &rMAC},
-			&IPv4{SrcAddr: &lIP, DstAddr: &rIP},
-			newOutgoingUDP},
-		incoming: Layers{
-			&Ether{SrcAddr: &rMAC, DstAddr: &lMAC},
-			&IPv4{SrcAddr: &rIP, DstAddr: &lIP},
-			newIncomingUDP},
-		sniffer:      sniffer,
-		injector:     injector,
-		portPickerFD: portPickerFD,
-		t:            t,
+		layerStates: []layerState{etherState, ipv4State, tcpState},
+		injector:    injector,
+		sniffer:     sniffer,
+		t:           t,
 	}
 }
 
-// Close the injector and sniffer associated with this connection.
-func (conn *UDPIPv4) Close() {
-	conn.sniffer.Close()
-	conn.injector.Close()
-	if err := unix.Close(conn.portPickerFD); err != nil {
-		conn.t.Fatalf("can't close portPickerFD: %s", err)
-	}
-	conn.portPickerFD = -1
-}
-
-// CreateFrame builds a frame for the connection with the provided udp
-// overriding defaults and the additionalLayers added after the UDP header.
-func (conn *UDPIPv4) CreateFrame(udp UDP, additionalLayers ...Layer) Layers {
-	layersToSend := deepcopy.Copy(conn.outgoing).(Layers)
-	if err := layersToSend[udpLayerIndex].(*UDP).merge(udp); err != nil {
-		conn.t.Fatalf("can't merge %+v into %+v: %s", udp, layersToSend[udpLayerIndex], err)
-	}
-	layersToSend = append(layersToSend, additionalLayers...)
-	return layersToSend
+// CreateFrame builds a frame for the connection with layer overriding defaults
+// of the innermost layer and additionalLayers added after it.
+func (conn *UDPIPv4) CreateFrame(layer Layer, additionalLayers ...Layer) Layers {
+	return (*Connection)(conn).CreateFrame(layer, additionalLayers...)
 }
 
-// SendFrame sends a frame with reasonable defaults.
+// SendFrame sends a frame on the wire and updates the state of all layers.
 func (conn *UDPIPv4) SendFrame(frame Layers) {
-	outBytes, err := frame.toBytes()
-	if err != nil {
-		conn.t.Fatalf("can't build outgoing UDP packet: %s", err)
-	}
-	conn.injector.Send(outBytes)
-}
-
-// Send a packet with reasonable defaults and override some fields by udp.
-func (conn *UDPIPv4) Send(udp UDP, additionalLayers ...Layer) {
-	conn.SendFrame(conn.CreateFrame(udp, additionalLayers...))
+	(*Connection)(conn).SendFrame(frame)
 }
 
-// Recv gets a packet from the sniffer within the timeout provided. If no packet
-// arrives before the timeout, it returns nil.
-func (conn *UDPIPv4) Recv(timeout time.Duration) *UDP {
-	deadline := time.Now().Add(timeout)
-	for {
-		timeout = time.Until(deadline)
-		if timeout <= 0 {
-			break
-		}
-		b := conn.sniffer.Recv(timeout)
-		if b == nil {
-			break
-		}
-		layers := Parse(ParseEther, b)
-		if !conn.incoming.match(layers) {
-			continue // Ignore packets that don't match the expected incoming.
-		}
-		return (layers[udpLayerIndex]).(*UDP)
-	}
-	return nil
+// Close frees associated resources held by the UDPIPv4 connection.
+func (conn *UDPIPv4) Close() {
+	(*Connection)(conn).Close()
 }
 
 // Drain drains the sniffer's receive buffer by receiving packets until there's
@@ -434,23 +647,3 @@ func (conn *UDPIPv4) Recv(timeout time.Duration) *UDP {
 func (conn *UDPIPv4) Drain() {
 	conn.sniffer.Drain()
 }
-
-// Expect a packet that matches the provided udp within the timeout specified.
-// If it doesn't arrive in time, the test fails.
-func (conn *UDPIPv4) Expect(udp UDP, timeout time.Duration) (*UDP, error) {
-	deadline := time.Now().Add(timeout)
-	var allUDP []string
-	for {
-		var gotUDP *UDP
-		if timeout = time.Until(deadline); timeout > 0 {
-			gotUDP = conn.Recv(timeout)
-		}
-		if gotUDP == nil {
-			return nil, fmt.Errorf("got %d packets:\n%s", len(allUDP), strings.Join(allUDP, "\n"))
-		}
-		if udp.match(gotUDP) {
-			return gotUDP, nil
-		}
-		allUDP = append(allUDP, gotUDP.String())
-	}
-}
diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go
index b467c15cc..1ec94ce17 100644
--- a/test/packetimpact/testbench/layers.go
+++ b/test/packetimpact/testbench/layers.go
@@ -64,6 +64,9 @@ type Layer interface {
 
 	// setPrev sets the pointer to the Layer encapsulating this one.
 	setPrev(Layer)
+
+	// merge overrides the values in the interface with the provided values.
+	merge(Layer) error
 }
 
 // LayerBase is the common elements of all layers.
@@ -91,6 +94,9 @@ func (lb *LayerBase) setPrev(l Layer) {
 // equalLayer compares that two Layer structs match while ignoring field in
 // which either input has a nil and also ignoring the LayerBase of the inputs.
 func equalLayer(x, y Layer) bool {
+	if x == nil || y == nil {
+		return true
+	}
 	// opt ignores comparison pairs where either of the inputs is a nil.
 	opt := cmp.FilterValues(func(x, y interface{}) bool {
 		for _, l := range []interface{}{x, y} {
@@ -104,6 +110,15 @@ func equalLayer(x, y Layer) bool {
 	return cmp.Equal(x, y, opt, cmpopts.IgnoreTypes(LayerBase{}))
 }
 
+// mergeLayer merges other in layer. Any non-nil value in other overrides the
+// corresponding value in layer. If other is nil, no action is performed.
+func mergeLayer(layer, other Layer) error {
+	if other == nil {
+		return nil
+	}
+	return mergo.Merge(layer, other, mergo.WithOverride)
+}
+
 func stringLayer(l Layer) string {
 	v := reflect.ValueOf(l).Elem()
 	t := v.Type()
@@ -172,14 +187,14 @@ func NetworkProtocolNumber(v tcpip.NetworkProtocolNumber) *tcpip.NetworkProtocol
 	return &v
 }
 
-// LayerParser parses the input bytes and returns a Layer along with the next
-// LayerParser to run. If there is no more parsing to do, the returned
-// LayerParser is nil.
-type LayerParser func([]byte) (Layer, LayerParser)
+// layerParser parses the input bytes and returns a Layer along with the next
+// layerParser to run. If there is no more parsing to do, the returned
+// layerParser is nil.
+type layerParser func([]byte) (Layer, layerParser)
 
-// Parse parses bytes starting with the first LayerParser and using successive
-// LayerParsers until all the bytes are parsed.
-func Parse(parser LayerParser, b []byte) Layers {
+// parse parses bytes starting with the first layerParser and using successive
+// layerParsers until all the bytes are parsed.
+func parse(parser layerParser, b []byte) Layers {
 	var layers Layers
 	for {
 		var layer Layer
@@ -194,22 +209,22 @@ func Parse(parser LayerParser, b []byte) Layers {
 	return layers
 }
 
-// ParseEther parses the bytes assuming that they start with an ethernet header
+// parseEther parses the bytes assuming that they start with an ethernet header
 // and continues parsing further encapsulations.
-func ParseEther(b []byte) (Layer, LayerParser) {
+func parseEther(b []byte) (Layer, layerParser) {
 	h := header.Ethernet(b)
 	ether := Ether{
 		SrcAddr: LinkAddress(h.SourceAddress()),
 		DstAddr: LinkAddress(h.DestinationAddress()),
 		Type:    NetworkProtocolNumber(h.Type()),
 	}
-	var nextParser LayerParser
+	var nextParser layerParser
 	switch h.Type() {
 	case header.IPv4ProtocolNumber:
-		nextParser = ParseIPv4
+		nextParser = parseIPv4
 	default:
 		// Assume that the rest is a payload.
-		nextParser = ParsePayload
+		nextParser = parsePayload
 	}
 	return &ether, nextParser
 }
@@ -222,6 +237,12 @@ func (l *Ether) length() int {
 	return header.EthernetMinimumSize
 }
 
+// merge overrides the values in l with the values from other but only in fields
+// where the value is not nil.
+func (l *Ether) merge(other Layer) error {
+	return mergeLayer(l, other)
+}
+
 // IPv4 can construct and match an IPv4 encapsulation.
 type IPv4 struct {
 	LayerBase
@@ -330,9 +351,9 @@ func Address(v tcpip.Address) *tcpip.Address {
 	return &v
 }
 
-// ParseIPv4 parses the bytes assuming that they start with an ipv4 header and
+// parseIPv4 parses the bytes assuming that they start with an ipv4 header and
 // continues parsing further encapsulations.
-func ParseIPv4(b []byte) (Layer, LayerParser) {
+func parseIPv4(b []byte) (Layer, layerParser) {
 	h := header.IPv4(b)
 	tos, _ := h.TOS()
 	ipv4 := IPv4{
@@ -348,15 +369,15 @@ func ParseIPv4(b []byte) (Layer, LayerParser) {
 		SrcAddr:        Address(h.SourceAddress()),
 		DstAddr:        Address(h.DestinationAddress()),
 	}
-	var nextParser LayerParser
+	var nextParser layerParser
 	switch h.TransportProtocol() {
 	case header.TCPProtocolNumber:
-		nextParser = ParseTCP
+		nextParser = parseTCP
 	case header.UDPProtocolNumber:
-		nextParser = ParseUDP
+		nextParser = parseUDP
 	default:
 		// Assume that the rest is a payload.
-		nextParser = ParsePayload
+		nextParser = parsePayload
 	}
 	return &ipv4, nextParser
 }
@@ -372,6 +393,12 @@ func (l *IPv4) length() int {
 	return int(*l.IHL)
 }
 
+// merge overrides the values in l with the values from other but only in fields
+// where the value is not nil.
+func (l *IPv4) merge(other Layer) error {
+	return mergeLayer(l, other)
+}
+
 // TCP can construct and match a TCP encapsulation.
 type TCP struct {
 	LayerBase
@@ -482,9 +509,9 @@ func Uint32(v uint32) *uint32 {
 	return &v
 }
 
-// ParseTCP parses the bytes assuming that they start with a tcp header and
+// parseTCP parses the bytes assuming that they start with a tcp header and
 // continues parsing further encapsulations.
-func ParseTCP(b []byte) (Layer, LayerParser) {
+func parseTCP(b []byte) (Layer, layerParser) {
 	h := header.TCP(b)
 	tcp := TCP{
 		SrcPort:       Uint16(h.SourcePort()),
@@ -497,7 +524,7 @@ func ParseTCP(b []byte) (Layer, LayerParser) {
 		Checksum:      Uint16(h.Checksum()),
 		UrgentPointer: Uint16(h.UrgentPointer()),
 	}
-	return &tcp, ParsePayload
+	return &tcp, parsePayload
 }
 
 func (l *TCP) match(other Layer) bool {
@@ -513,8 +540,8 @@ func (l *TCP) length() int {
 
 // merge overrides the values in l with the values from other but only in fields
 // where the value is not nil.
-func (l *TCP) merge(other TCP) error {
-	return mergo.Merge(l, other, mergo.WithOverride)
+func (l *TCP) merge(other Layer) error {
+	return mergeLayer(l, other)
 }
 
 // UDP can construct and match a UDP encapsulation.
@@ -565,9 +592,9 @@ func setUDPChecksum(h *header.UDP, udp *UDP) error {
 	return nil
 }
 
-// ParseUDP parses the bytes assuming that they start with a udp header and
+// parseUDP parses the bytes assuming that they start with a udp header and
 // returns the parsed layer and the next parser to use.
-func ParseUDP(b []byte) (Layer, LayerParser) {
+func parseUDP(b []byte) (Layer, layerParser) {
 	h := header.UDP(b)
 	udp := UDP{
 		SrcPort:  Uint16(h.SourcePort()),
@@ -575,7 +602,7 @@ func ParseUDP(b []byte) (Layer, LayerParser) {
 		Length:   Uint16(h.Length()),
 		Checksum: Uint16(h.Checksum()),
 	}
-	return &udp, ParsePayload
+	return &udp, parsePayload
 }
 
 func (l *UDP) match(other Layer) bool {
@@ -591,8 +618,8 @@ func (l *UDP) length() int {
 
 // merge overrides the values in l with the values from other but only in fields
 // where the value is not nil.
-func (l *UDP) merge(other UDP) error {
-	return mergo.Merge(l, other, mergo.WithOverride)
+func (l *UDP) merge(other Layer) error {
+	return mergeLayer(l, other)
 }
 
 // Payload has bytes beyond OSI layer 4.
@@ -605,9 +632,9 @@ func (l *Payload) String() string {
 	return stringLayer(l)
 }
 
-// ParsePayload parses the bytes assuming that they start with a payload and
+// parsePayload parses the bytes assuming that they start with a payload and
 // continue to the end. There can be no further encapsulations.
-func ParsePayload(b []byte) (Layer, LayerParser) {
+func parsePayload(b []byte) (Layer, layerParser) {
 	payload := Payload{
 		Bytes: b,
 	}
@@ -626,6 +653,12 @@ func (l *Payload) length() int {
 	return len(l.Bytes)
 }
 
+// merge overrides the values in l with the values from other but only in fields
+// where the value is not nil.
+func (l *Payload) merge(other Layer) error {
+	return mergeLayer(l, other)
+}
+
 // Layers is an array of Layer and supports similar functions to Layer.
 type Layers []Layer
 
@@ -662,8 +695,8 @@ func (ls *Layers) match(other Layers) bool {
 	if len(*ls) > len(other) {
 		return false
 	}
-	for i := 0; i < len(*ls); i++ {
-		if !equalLayer((*ls)[i], other[i]) {
+	for i, l := range *ls {
+		if !equalLayer(l, other[i]) {
 			return false
 		}
 	}
diff --git a/test/packetimpact/testbench/rawsockets.go b/test/packetimpact/testbench/rawsockets.go
index 09bfa43c5..ff722d4a6 100644
--- a/test/packetimpact/testbench/rawsockets.go
+++ b/test/packetimpact/testbench/rawsockets.go
@@ -17,6 +17,7 @@ package testbench
 import (
 	"encoding/binary"
 	"flag"
+	"fmt"
 	"math"
 	"net"
 	"testing"
@@ -120,12 +121,13 @@ func (s *Sniffer) Drain() {
 	}
 }
 
-// Close the socket that Sniffer is using.
-func (s *Sniffer) Close() {
+// close the socket that Sniffer is using.
+func (s *Sniffer) close() error {
 	if err := unix.Close(s.fd); err != nil {
-		s.t.Fatalf("can't close sniffer socket: %s", err)
+		return fmt.Errorf("can't close sniffer socket: %w", err)
 	}
 	s.fd = -1
+	return nil
 }
 
 // Injector can inject raw frames.
@@ -171,10 +173,11 @@ func (i *Injector) Send(b []byte) {
 	}
 }
 
-// Close the underlying socket.
-func (i *Injector) Close() {
+// close the underlying socket.
+func (i *Injector) close() error {
 	if err := unix.Close(i.fd); err != nil {
-		i.t.Fatalf("can't close sniffer socket: %s", err)
+		return fmt.Errorf("can't close sniffer socket: %w", err)
 	}
 	i.fd = -1
+	return nil
 }
diff --git a/test/packetimpact/tests/tcp_should_piggyback_test.go b/test/packetimpact/tests/tcp_should_piggyback_test.go
index f2ab49e51..b0be6ba23 100644
--- a/test/packetimpact/tests/tcp_should_piggyback_test.go
+++ b/test/packetimpact/tests/tcp_should_piggyback_test.go
@@ -40,7 +40,11 @@ func TestPiggyback(t *testing.T) {
 	sampleData := []byte("Sample Data")
 
 	dut.Send(acceptFd, sampleData, 0)
-	conn.ExpectData(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, sampleData, time.Second)
+	expectedTCP := tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}
+	expectedPayload := tb.Payload{Bytes: sampleData}
+	if _, err := conn.ExpectData(&expectedTCP, &expectedPayload, time.Second); err != nil {
+		t.Fatalf("Expected %v but didn't get one: %s", tb.Layers{&expectedTCP, &expectedPayload}, err)
+	}
 
 	// Cause DUT to send us more data as soon as we ACK their first data segment because we have
 	// a small window.
@@ -48,6 +52,8 @@ func TestPiggyback(t *testing.T) {
 
 	// DUT should ACK our segment by piggybacking ACK to their outstanding data segment instead of
 	// sending a separate ACK packet.
-	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &tb.Payload{Bytes: sampleData})
-	conn.ExpectData(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, sampleData, time.Second)
+	conn.Send(expectedTCP, &expectedPayload)
+	if _, err := conn.ExpectData(&expectedTCP, &expectedPayload, time.Second); err != nil {
+		t.Fatalf("Expected %v but didn't get one: %s", tb.Layers{&expectedTCP, &expectedPayload}, err)
+	}
 }
diff --git a/test/packetimpact/tests/tcp_window_shrink_test.go b/test/packetimpact/tests/tcp_window_shrink_test.go
index b48cc6491..c9354074e 100644
--- a/test/packetimpact/tests/tcp_window_shrink_test.go
+++ b/test/packetimpact/tests/tcp_window_shrink_test.go
@@ -38,15 +38,22 @@ func TestWindowShrink(t *testing.T) {
 	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
 
 	sampleData := []byte("Sample Data")
+	samplePayload := &tb.Payload{Bytes: sampleData}
 
 	dut.Send(acceptFd, sampleData, 0)
-	conn.ExpectData(tb.TCP{}, sampleData, time.Second)
+	if _, err := conn.ExpectData(&tb.TCP{}, samplePayload, time.Second); err != nil {
+		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+	}
 	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
 
 	dut.Send(acceptFd, sampleData, 0)
 	dut.Send(acceptFd, sampleData, 0)
-	conn.ExpectData(tb.TCP{}, sampleData, time.Second)
-	conn.ExpectData(tb.TCP{}, sampleData, time.Second)
+	if _, err := conn.ExpectData(&tb.TCP{}, samplePayload, time.Second); err != nil {
+		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+	}
+	if _, err := conn.ExpectData(&tb.TCP{}, samplePayload, time.Second); err != nil {
+		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+	}
 	// We close our receiving window here
 	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), WindowSize: tb.Uint16(0)})
 
@@ -54,5 +61,8 @@ func TestWindowShrink(t *testing.T) {
 	// Note: There is another kind of zero-window probing which Windows uses (by sending one
 	// new byte at `RemoteSeqNum`), if netstack wants to go that way, we may want to change
 	// the following lines.
-	conn.ExpectData(tb.TCP{SeqNum: tb.Uint32(uint32(conn.RemoteSeqNum - 1))}, nil, time.Second)
+	expectedRemoteSeqNum := *conn.RemoteSeqNum() - 1
+	if _, err := conn.ExpectData(&tb.TCP{SeqNum: tb.Uint32(uint32(expectedRemoteSeqNum))}, nil, time.Second); err != nil {
+		t.Fatalf("expected a packet with sequence number %v: %s", expectedRemoteSeqNum, err)
+	}
 }
diff --git a/test/packetimpact/tests/udp_recv_multicast_test.go b/test/packetimpact/tests/udp_recv_multicast_test.go
index bc1b0be49..61fd17050 100644
--- a/test/packetimpact/tests/udp_recv_multicast_test.go
+++ b/test/packetimpact/tests/udp_recv_multicast_test.go
@@ -30,7 +30,7 @@ func TestUDPRecvMulticast(t *testing.T) {
 	defer dut.Close(boundFD)
 	conn := tb.NewUDPIPv4(t, tb.UDP{DstPort: &remotePort}, tb.UDP{SrcPort: &remotePort})
 	defer conn.Close()
-	frame := conn.CreateFrame(tb.UDP{}, &tb.Payload{Bytes: []byte("hello world")})
+	frame := conn.CreateFrame(&tb.UDP{}, &tb.Payload{Bytes: []byte("hello world")})
 	frame[1].(*tb.IPv4).DstAddr = tb.Address(tcpip.Address(net.ParseIP("224.0.0.1").To4()))
 	conn.SendFrame(frame)
 	dut.Recv(boundFD, 100, 0)
-- 
cgit v1.2.3


From ea5b8e9633cd2731bb5656dea523beaf3d643472 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Wed, 15 Apr 2020 14:30:20 -0700
Subject: Use if_nametoindex to get interface index.

Removed the TODO to use netlink.

PiperOrigin-RevId: 306721468
---
 test/syscalls/linux/ip_socket_test_util.cc | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index d28dc0db6..98d07ae85 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -16,7 +16,6 @@
 
 #include <net/if.h>
 #include <netinet/in.h>
-#include <sys/ioctl.h>
 #include <sys/socket.h>
 
 #include <cstring>
@@ -35,12 +34,11 @@ uint16_t PortFromInetSockaddr(const struct sockaddr* addr) {
 }
 
 PosixErrorOr<int> InterfaceIndex(std::string name) {
-  // TODO(igudger): Consider using netlink.
-  ifreq req = {};
-  memcpy(req.ifr_name, name.c_str(), name.size());
-  ASSIGN_OR_RETURN_ERRNO(auto sock, Socket(AF_INET, SOCK_DGRAM, 0));
-  RETURN_ERROR_IF_SYSCALL_FAIL(ioctl(sock.get(), SIOCGIFINDEX, &req));
-  return req.ifr_ifindex;
+  int index = if_nametoindex(name.c_str());
+  if (index) {
+    return index;
+  }
+  return PosixError(errno);
 }
 
 namespace {
-- 
cgit v1.2.3


From 3d3bf9603d9a933b4bf19c38190c583894b75d66 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 15 Apr 2020 14:57:56 -0700
Subject: Use hex.Dump for Layer.String() of byte slices.

PiperOrigin-RevId: 306726587
---
 test/packetimpact/testbench/layers.go      |  8 +++++++-
 test/packetimpact/testbench/layers_test.go | 16 ++++++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go
index 1ec94ce17..5ce324f0d 100644
--- a/test/packetimpact/testbench/layers.go
+++ b/test/packetimpact/testbench/layers.go
@@ -15,6 +15,7 @@
 package testbench
 
 import (
+	"encoding/hex"
 	"fmt"
 	"reflect"
 	"strings"
@@ -133,7 +134,12 @@ func stringLayer(l Layer) string {
 		if v.IsNil() {
 			continue
 		}
-		ret = append(ret, fmt.Sprintf("%s:%v", t.Name, reflect.Indirect(v)))
+		v = reflect.Indirect(v)
+		if v.Kind() == reflect.Slice && v.Type().Elem().Kind() == reflect.Uint8 {
+			ret = append(ret, fmt.Sprintf("%s:\n%v", t.Name, hex.Dump(v.Bytes())))
+		} else {
+			ret = append(ret, fmt.Sprintf("%s:%v", t.Name, v))
+		}
 	}
 	return fmt.Sprintf("&%s{%s}", t, strings.Join(ret, " "))
 }
diff --git a/test/packetimpact/testbench/layers_test.go b/test/packetimpact/testbench/layers_test.go
index 8ffc26bf9..b32efda93 100644
--- a/test/packetimpact/testbench/layers_test.go
+++ b/test/packetimpact/testbench/layers_test.go
@@ -14,9 +14,11 @@
 
 package testbench
 
-import "testing"
+import (
+	"testing"
 
-import "gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
 
 func TestLayerMatch(t *testing.T) {
 	var nilPayload *Payload
@@ -134,6 +136,16 @@ func TestLayerStringFormat(t *testing.T) {
 				"Type:4" +
 				"}",
 		},
+		{
+			name: "Payload",
+			l: &Payload{
+				Bytes: []byte("Hooray for packetimpact."),
+			},
+			want: "&testbench.Payload{Bytes:\n" +
+				"00000000  48 6f 6f 72 61 79 20 66  6f 72 20 70 61 63 6b 65  |Hooray for packe|\n" +
+				"00000010  74 69 6d 70 61 63 74 2e                           |timpact.|\n" +
+				"}",
+		},
 	} {
 		t.Run(tt.name, func(t *testing.T) {
 			if got := tt.l.String(); got != tt.want {
-- 
cgit v1.2.3


From 09c7e3f6e497f4ae267772e7357763ac7c18659f Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 15 Apr 2020 19:36:03 -0700
Subject: Add tests for segments outside the receive window.

The tests are based on RFC 793 page 69.

Updates #1607

PiperOrigin-RevId: 306768847
---
 test/packetimpact/testbench/connections.go         | 20 ++++-
 test/packetimpact/tests/BUILD                      | 13 ++++
 .../tests/tcp_outside_the_window_test.go           | 88 ++++++++++++++++++++++
 3 files changed, 118 insertions(+), 3 deletions(-)
 create mode 100644 test/packetimpact/tests/tcp_outside_the_window_test.go

diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index 169db01b0..be62d051d 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -577,13 +577,27 @@ func (conn *TCPIPv4) Expect(tcp TCP, timeout time.Duration) (*TCP, error) {
 	return gotTCP, err
 }
 
-// RemoteSeqNum returns the next expected sequence number from the DUT.
-func (conn *TCPIPv4) RemoteSeqNum() *seqnum.Value {
+func (conn *TCPIPv4) state() *tcpState {
 	state, ok := conn.layerStates[len(conn.layerStates)-1].(*tcpState)
 	if !ok {
 		conn.t.Fatalf("expected final state of %v to be tcpState", conn.layerStates)
 	}
-	return state.remoteSeqNum
+	return state
+}
+
+// RemoteSeqNum returns the next expected sequence number from the DUT.
+func (conn *TCPIPv4) RemoteSeqNum() *seqnum.Value {
+	return conn.state().remoteSeqNum
+}
+
+// LocalSeqNum returns the next expected sequence number from the DUT.
+func (conn *TCPIPv4) LocalSeqNum() *seqnum.Value {
+	return conn.state().localSeqNum
+}
+
+// SynAck returns the SynAck that was part of the handshake.
+func (conn *TCPIPv4) SynAck() *TCP {
+	return conn.state().synAck
 }
 
 // Drain drains the sniffer's receive buffer by receiving packets until there's
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 1274d9f60..4f8c8bdc0 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -40,6 +40,19 @@ packetimpact_go_test(
     ],
 )
 
+packetimpact_go_test(
+    name = "tcp_outside_the_window",
+    srcs = ["tcp_outside_the_window_test.go"],
+    # TODO(eyalsoha): Fix #1607 then remove the line below.
+    netstack = False,
+    deps = [
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/seqnum",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
 packetimpact_go_test(
     name = "tcp_noaccept_close_rst",
     srcs = ["tcp_noaccept_close_rst_test.go"],
diff --git a/test/packetimpact/tests/tcp_outside_the_window_test.go b/test/packetimpact/tests/tcp_outside_the_window_test.go
new file mode 100644
index 000000000..db3d3273b
--- /dev/null
+++ b/test/packetimpact/tests/tcp_outside_the_window_test.go
@@ -0,0 +1,88 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_outside_the_window_test
+
+import (
+	"fmt"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+// TestTCPOutsideTheWindows tests the behavior of the DUT when packets arrive
+// that are inside or outside the TCP window. Packets that are outside the
+// window should force an extra ACK, as described in RFC793 page 69:
+// https://tools.ietf.org/html/rfc793#page-69
+func TestTCPOutsideTheWindow(t *testing.T) {
+	for _, tt := range []struct {
+		description  string
+		tcpFlags     uint8
+		payload      []tb.Layer
+		seqNumOffset seqnum.Size
+		expectACK    bool
+	}{
+		{"SYN", header.TCPFlagSyn, nil, 0, true},
+		{"SYNACK", header.TCPFlagSyn | header.TCPFlagAck, nil, 0, true},
+		{"ACK", header.TCPFlagAck, nil, 0, false},
+		{"FIN", header.TCPFlagFin, nil, 0, false},
+		{"Data", header.TCPFlagAck, []tb.Layer{&tb.Payload{Bytes: []byte("abc123")}}, 0, true},
+
+		{"SYN", header.TCPFlagSyn, nil, 1, true},
+		{"SYNACK", header.TCPFlagSyn | header.TCPFlagAck, nil, 1, true},
+		{"ACK", header.TCPFlagAck, nil, 1, true},
+		{"FIN", header.TCPFlagFin, nil, 1, false},
+		{"Data", header.TCPFlagAck, []tb.Layer{&tb.Payload{Bytes: []byte("abc123")}}, 1, true},
+
+		{"SYN", header.TCPFlagSyn, nil, 2, true},
+		{"SYNACK", header.TCPFlagSyn | header.TCPFlagAck, nil, 2, true},
+		{"ACK", header.TCPFlagAck, nil, 2, true},
+		{"FIN", header.TCPFlagFin, nil, 2, false},
+		{"Data", header.TCPFlagAck, []tb.Layer{&tb.Payload{Bytes: []byte("abc123")}}, 2, true},
+	} {
+		t.Run(fmt.Sprintf("%s%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
+			dut := tb.NewDUT(t)
+			defer dut.TearDown()
+			listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+			defer dut.Close(listenFD)
+			conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+			defer conn.Close()
+			conn.Handshake()
+			acceptFD, _ := dut.Accept(listenFD)
+			defer dut.Close(acceptFD)
+
+			windowSize := seqnum.Size(*conn.SynAck().WindowSize) + tt.seqNumOffset
+			conn.Drain()
+			// Ignore whatever incrementing that this out-of-order packet might cause
+			// to the AckNum.
+			localSeqNum := tb.Uint32(uint32(*conn.LocalSeqNum()))
+			conn.Send(tb.TCP{
+				Flags:  tb.Uint8(tt.tcpFlags),
+				SeqNum: tb.Uint32(uint32(conn.LocalSeqNum().Add(windowSize))),
+			}, tt.payload...)
+			timeout := 3 * time.Second
+			gotACK, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), AckNum: localSeqNum}, timeout)
+			if tt.expectACK && err != nil {
+				t.Fatalf("expected an ACK packet within %s but got none: %s", timeout, err)
+			}
+			if !tt.expectACK && gotACK != nil {
+				t.Fatalf("expected no ACK packet within %s but got one: %s", timeout, gotACK)
+			}
+		})
+	}
+}
-- 
cgit v1.2.3


From 28399818fc1e5d294cc93ddd4a1ac7e31c375fbf Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 16 Apr 2020 11:48:14 -0700
Subject: Make ExtractErrno a function

PiperOrigin-RevId: 306891171
---
 pkg/sentry/kernel/task_run.go        |  2 +-
 pkg/sentry/kernel/task_signals.go    |  2 +-
 pkg/sentry/kernel/task_syscall.go    | 12 ++++++------
 pkg/sentry/strace/strace.go          |  2 +-
 pkg/sentry/syscalls/linux/sys_aio.go |  2 +-
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 799cbcd93..2ba8d7e63 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -353,7 +353,7 @@ func (app *runApp) execute(t *Task) taskRunState {
 	default:
 		// What happened? Can't continue.
 		t.Warningf("Unexpected SwitchToApp error: %v", err)
-		t.PrepareExit(ExitStatus{Code: t.ExtractErrno(err, -1)})
+		t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)})
 		return (*runExit)(nil)
 	}
 }
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 6aa798346..f07de2089 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -174,7 +174,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS
 					fallthrough
 				case (sre == ERESTARTSYS && !act.IsRestart()):
 					t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
-					t.Arch().SetReturn(uintptr(-t.ExtractErrno(syserror.EINTR, -1)))
+					t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1)))
 				default:
 					t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
 					t.Arch().RestartSyscall()
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index 3d7a734ef..c9db78e06 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -312,7 +312,7 @@ func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRu
 			return ctrl.next
 		}
 	} else if err != nil {
-		t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
+		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
 		t.haveSyscallReturn = true
 	} else {
 		t.Arch().SetReturn(rval)
@@ -431,7 +431,7 @@ func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, calle
 			// A return is not emulated in this case.
 			return (*runApp)(nil)
 		}
-		t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
+		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
 	}
 	t.Arch().SetIP(t.Arch().Value(caller))
 	t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))
@@ -441,7 +441,7 @@ func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, calle
 // ExtractErrno extracts an integer error number from the error.
 // The syscall number is purely for context in the error case. Use -1 if
 // syscall number is unknown.
-func (t *Task) ExtractErrno(err error, sysno int) int {
+func ExtractErrno(err error, sysno int) int {
 	switch err := err.(type) {
 	case nil:
 		return 0
@@ -455,11 +455,11 @@ func (t *Task) ExtractErrno(err error, sysno int) int {
 		// handled (and the SIGBUS is delivered).
 		return int(syscall.EFAULT)
 	case *os.PathError:
-		return t.ExtractErrno(err.Err, sysno)
+		return ExtractErrno(err.Err, sysno)
 	case *os.LinkError:
-		return t.ExtractErrno(err.Err, sysno)
+		return ExtractErrno(err.Err, sysno)
 	case *os.SyscallError:
-		return t.ExtractErrno(err.Err, sysno)
+		return ExtractErrno(err.Err, sysno)
 	default:
 		if errno, ok := syserror.TranslateError(err); ok {
 			return int(errno)
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index b94c4fbf5..68ca537c8 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -719,7 +719,7 @@ func (s SyscallMap) SyscallEnter(t *kernel.Task, sysno uintptr, args arch.Syscal
 // SyscallExit implements kernel.Stracer.SyscallExit. It logs the syscall
 // exit trace.
 func (s SyscallMap) SyscallExit(context interface{}, t *kernel.Task, sysno, rval uintptr, err error) {
-	errno := t.ExtractErrno(err, int(sysno))
+	errno := kernel.ExtractErrno(err, int(sysno))
 	c := context.(*syscallContext)
 
 	elapsed := time.Since(c.start)
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index 38cbeba5a..d781d6a04 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -290,7 +290,7 @@ func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioC
 	// Update the result.
 	if err != nil {
 		err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file)
-		ev.Result = -int64(t.ExtractErrno(err, 0))
+		ev.Result = -int64(kernel.ExtractErrno(err, 0))
 	}
 
 	file.DecRef()
-- 
cgit v1.2.3


From eb7b1903e00eda9248da59991d80594590c9aab6 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 16 Apr 2020 12:21:06 -0700
Subject: Test TCP behavior when receiving unacceptable segment in CLOSE_WAIT

TCP, in CLOSE-WAIT state, MUST return ACK with proper SEQ and ACK numbers after
recv a seg with OTW SEQ or unacc ACK number, and remain in same state. If the
connection is in a synchronized state, any unacceptable segment (out of window
sequence number or unacceptable acknowledgment number) must elicit only an empty
acknowledgment segment containing the current send-sequence number and an
acknowledgment indicating the next sequence number expected to be received, and
the connection remains in the same state.

PiperOrigin-RevId: 306897984
---
 test/packetimpact/testbench/connections.go         |  14 ++-
 test/packetimpact/tests/BUILD                      |  13 +++
 test/packetimpact/tests/tcp_close_wait_ack_test.go | 102 +++++++++++++++++++++
 3 files changed, 126 insertions(+), 3 deletions(-)
 create mode 100644 test/packetimpact/tests/tcp_close_wait_ack_test.go

diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index be62d051d..c1b3c4380 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -189,6 +189,7 @@ type tcpState struct {
 	localSeqNum, remoteSeqNum *seqnum.Value
 	synAck                    *TCP
 	portPickerFD              int
+	finSent                   bool
 }
 
 var _ layerState = (*tcpState)(nil)
@@ -210,6 +211,7 @@ func newTCPState(out, in TCP) (*tcpState, error) {
 		in:           TCP{DstPort: &localPort},
 		localSeqNum:  SeqNumValue(seqnum.Value(rand.Uint32())),
 		portPickerFD: portPickerFD,
+		finSent:      false,
 	}
 	if err := s.out.merge(&out); err != nil {
 		return nil, err
@@ -254,12 +256,18 @@ func (s *tcpState) sent(sent Layer) error {
 	if !ok {
 		return fmt.Errorf("can't update tcpState with %T Layer", sent)
 	}
-	for current := tcp.next(); current != nil; current = current.next() {
-		s.localSeqNum.UpdateForward(seqnum.Size(current.length()))
+	if !s.finSent {
+		// update localSeqNum by the payload only when FIN is not yet sent by us
+		for current := tcp.next(); current != nil; current = current.next() {
+			s.localSeqNum.UpdateForward(seqnum.Size(current.length()))
+		}
 	}
 	if tcp.Flags != nil && *tcp.Flags&(header.TCPFlagSyn|header.TCPFlagFin) != 0 {
 		s.localSeqNum.UpdateForward(1)
 	}
+	if *tcp.Flags&(header.TCPFlagFin) != 0 {
+		s.finSent = true
+	}
 	return nil
 }
 
@@ -590,7 +598,7 @@ func (conn *TCPIPv4) RemoteSeqNum() *seqnum.Value {
 	return conn.state().remoteSeqNum
 }
 
-// LocalSeqNum returns the next expected sequence number from the DUT.
+// LocalSeqNum returns the next sequence number to send from the testbench.
 func (conn *TCPIPv4) LocalSeqNum() *seqnum.Value {
 	return conn.state().localSeqNum
 }
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 4f8c8bdc0..690cee140 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -75,6 +75,19 @@ packetimpact_go_test(
     ],
 )
 
+packetimpact_go_test(
+    name = "tcp_close_wait_ack",
+    srcs = ["tcp_close_wait_ack_test.go"],
+    # TODO(b/153574037): Fix netstack then remove the line below.
+    netstack = False,
+    deps = [
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/seqnum",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
 sh_binary(
     name = "test_runner",
     srcs = ["test_runner.sh"],
diff --git a/test/packetimpact/tests/tcp_close_wait_ack_test.go b/test/packetimpact/tests/tcp_close_wait_ack_test.go
new file mode 100644
index 000000000..eb4cc7a65
--- /dev/null
+++ b/test/packetimpact/tests/tcp_close_wait_ack_test.go
@@ -0,0 +1,102 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_close_wait_ack_test
+
+import (
+	"fmt"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func TestCloseWaitAck(t *testing.T) {
+	for _, tt := range []struct {
+		description    string
+		makeTestingTCP func(conn *tb.TCPIPv4, seqNumOffset seqnum.Size) tb.TCP
+		seqNumOffset   seqnum.Size
+		expectAck      bool
+	}{
+		{"OTW", GenerateOTWSeqSegment, 0, false},
+		{"OTW", GenerateOTWSeqSegment, 1, true},
+		{"OTW", GenerateOTWSeqSegment, 2, true},
+		{"ACK", GenerateUnaccACKSegment, 0, false},
+		{"ACK", GenerateUnaccACKSegment, 1, true},
+		{"ACK", GenerateUnaccACKSegment, 2, true},
+	} {
+		t.Run(fmt.Sprintf("%s%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
+			dut := tb.NewDUT(t)
+			defer dut.TearDown()
+			listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+			defer dut.Close(listenFd)
+			conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+			defer conn.Close()
+
+			conn.Handshake()
+			acceptFd, _ := dut.Accept(listenFd)
+
+			// Send a FIN to DUT to intiate the active close
+			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagFin)})
+			if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}, time.Second); err != nil {
+				t.Fatalf("expected an ACK for our fin and DUT should enter CLOSE_WAIT: %s", err)
+			}
+
+			// Send a segment with OTW Seq / unacc ACK and expect an ACK back
+			conn.Send(tt.makeTestingTCP(&conn, tt.seqNumOffset), &tb.Payload{Bytes: []byte("Sample Data")})
+			gotAck, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}, time.Second)
+			if tt.expectAck && err != nil {
+				t.Fatalf("expected an ack but got none: %s", err)
+			}
+			if !tt.expectAck && gotAck != nil {
+				t.Fatalf("expected no ack but got one: %s", gotAck)
+			}
+
+			// Now let's verify DUT is indeed in CLOSE_WAIT
+			dut.Close(acceptFd)
+			if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagFin)}, time.Second); err != nil {
+				t.Fatalf("expected DUT to send a FIN: %s", err)
+			}
+			// Ack the FIN from DUT
+			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+			// Send some extra data to DUT
+			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}, &tb.Payload{Bytes: []byte("Sample Data")})
+			if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
+				t.Fatalf("expected DUT to send an RST: %s", err)
+			}
+		})
+	}
+}
+
+// This generates an segment with seqnum = RCV.NXT + RCV.WND + seqNumOffset, the
+// generated segment is only acceptable when seqNumOffset is 0, otherwise an ACK
+// is expected from the receiver.
+func GenerateOTWSeqSegment(conn *tb.TCPIPv4, seqNumOffset seqnum.Size) tb.TCP {
+	windowSize := seqnum.Size(*conn.SynAck().WindowSize)
+	lastAcceptable := conn.LocalSeqNum().Add(windowSize - 1)
+	otwSeq := uint32(lastAcceptable.Add(seqNumOffset))
+	return tb.TCP{SeqNum: tb.Uint32(otwSeq), Flags: tb.Uint8(header.TCPFlagAck)}
+}
+
+// This generates an segment with acknum = SND.NXT + seqNumOffset, the generated
+// segment is only acceptable when seqNumOffset is 0, otherwise an ACK is
+// expected from the receiver.
+func GenerateUnaccACKSegment(conn *tb.TCPIPv4, seqNumOffset seqnum.Size) tb.TCP {
+	lastAcceptable := conn.RemoteSeqNum()
+	unaccAck := uint32(lastAcceptable.Add(seqNumOffset))
+	return tb.TCP{AckNum: tb.Uint32(unaccAck), Flags: tb.Uint8(header.TCPFlagAck)}
+}
-- 
cgit v1.2.3


From 5a8ee1beee364559bac37376949de1ea01d00ae2 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 16 Apr 2020 13:15:47 -0700
Subject: Preserve log FD after execve

PiperOrigin-RevId: 306908296
---
 runsc/main.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runsc/main.go b/runsc/main.go
index 59f624842..c1c78529c 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -291,7 +291,7 @@ func main() {
 		// want with them. Since Docker and Containerd both eat boot's stderr, we
 		// dup our stderr to the provided log FD so that panics will appear in the
 		// logs, rather than just disappear.
-		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), syscall.O_CLOEXEC); err != nil {
+		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil {
 			cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err)
 		}
 	}
-- 
cgit v1.2.3


From 75e864fc7529bf71484ecabbb2ce2264e96399cf Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Thu, 16 Apr 2020 15:14:44 -0700
Subject: Use multierr in packetimpact Connection.Close()

PiperOrigin-RevId: 306930652
---
 WORKSPACE                                  | 14 ++++++++++++++
 test/packetimpact/testbench/BUILD          |  1 +
 test/packetimpact/testbench/connections.go | 13 ++++++-------
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index bca63c0d9..c40e03ad2 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -400,6 +400,20 @@ go_repository(
     version = "v0.20.0",
 )
 
+go_repository(
+    name = "org_uber_go_atomic",
+    importpath = "go.uber.org/atomic",
+    version = "v1.6.0",
+    sum = "h1:Ezj3JGmsOnG1MoRWQkPBsKLe9DwWD9QeXzTRzzldNVk=",
+)
+
+go_repository(
+    name = "org_uber_go_multierr",
+    importpath = "go.uber.org/multierr",
+    version = "v1.5.0",
+    sum = "h1:KCa4XfM8CWFCpxXRGok+Q0SS/0XBhMDbHHGABQLvD2A=",
+)
+
 # BigQuery Dependencies for Benchmarks
 go_repository(
     name = "com_google_cloud_go",
diff --git a/test/packetimpact/testbench/BUILD b/test/packetimpact/testbench/BUILD
index 838a10ffe..b6a254882 100644
--- a/test/packetimpact/testbench/BUILD
+++ b/test/packetimpact/testbench/BUILD
@@ -28,6 +28,7 @@ go_library(
         "@org_golang_google_grpc//:go_default_library",
         "@org_golang_google_grpc//keepalive:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
+        "@org_uber_go_multierr//:go_default_library",
     ],
 )
 
diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index c1b3c4380..f84fd8ba7 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -26,6 +26,7 @@ import (
 	"time"
 
 	"github.com/mohae/deepcopy"
+	"go.uber.org/multierr"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -382,17 +383,15 @@ func (conn *Connection) match(override, received Layers) bool {
 
 // Close frees associated resources held by the Connection.
 func (conn *Connection) Close() {
-	if err := conn.sniffer.close(); err != nil {
-		conn.t.Fatal(err)
-	}
-	if err := conn.injector.close(); err != nil {
-		conn.t.Fatal(err)
-	}
+	errs := multierr.Combine(conn.sniffer.close(), conn.injector.close())
 	for _, s := range conn.layerStates {
 		if err := s.close(); err != nil {
-			conn.t.Fatalf("unable to close %+v: %s", s, err)
+			errs = multierr.Append(errs, fmt.Errorf("unable to close %+v: %s", s, err))
 		}
 	}
+	if errs != nil {
+		conn.t.Fatalf("unable to close %+v: %s", conn, errs)
+	}
 }
 
 // CreateFrame builds a frame for the connection with layer overriding defaults
-- 
cgit v1.2.3


From 0eda0104a5a7c95a36dd288199ec1e90be9d8be9 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 16 Apr 2020 16:48:14 -0700
Subject: Fix data race in tcp_test.

This change makes SynRcvdCountThreshold and the global synRcvdCount into a stack
configurable value. This is required because in cases like mod_proxy which
create multiple Stack instances the count will be a global value that impacts
all Stack instances.

Further the tests relied on modifying the global threshold to simulate tests
where we want to verify SYN cookie based behaviour. This lead to data races due
to the global being modified/read without locks or atomics.

PiperOrigin-RevId: 306947723
---
 pkg/tcpip/tcpip.go                            |   5 ++
 pkg/tcpip/transport/tcp/accept.go             | 106 ++++++++++----------------
 pkg/tcpip/transport/tcp/dual_stack_test.go    |   9 +--
 pkg/tcpip/transport/tcp/protocol.go           |  77 +++++++++++++++++++
 pkg/tcpip/transport/tcp/tcp_sack_test.go      |  39 +++++-----
 pkg/tcpip/transport/tcp/tcp_test.go           |  32 ++++----
 pkg/tcpip/transport/tcp/tcp_timestamp_test.go |  30 ++++----
 7 files changed, 176 insertions(+), 122 deletions(-)

diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 109121dbc..1ca4088c9 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -685,6 +685,11 @@ type TCPDeferAcceptOption time.Duration
 // default MinRTO used by the Stack.
 type TCPMinRTOOption time.Duration
 
+// TCPSynRcvdCountThresholdOption is used by SetSockOpt/GetSockOpt to specify
+// the number of endpoints that can be in SYN-RCVD state before the stack
+// switches to using SYN cookies.
+type TCPSynRcvdCountThresholdOption uint64
+
 // MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a
 // default interface for multicast.
 type MulticastInterfaceOption struct {
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index e07b436c4..b61c2a8c3 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -17,6 +17,7 @@ package tcp
 import (
 	"crypto/sha1"
 	"encoding/binary"
+	"fmt"
 	"hash"
 	"io"
 	"time"
@@ -49,17 +50,14 @@ const (
 	// timestamp and the current timestamp. If the difference is greater
 	// than maxTSDiff, the cookie is expired.
 	maxTSDiff = 2
-)
 
-var (
-	// SynRcvdCountThreshold is the global maximum number of connections
-	// that are allowed to be in SYN-RCVD state before TCP starts using SYN
-	// cookies to accept connections.
-	//
-	// It is an exported variable only for testing, and should not otherwise
-	// be used by importers of this package.
+	// SynRcvdCountThreshold is the default global maximum number of
+	// connections that are allowed to be in SYN-RCVD state before TCP
+	// starts using SYN cookies to accept connections.
 	SynRcvdCountThreshold uint64 = 1000
+)
 
+var (
 	// mssTable is a slice containing the possible MSS values that we
 	// encode in the SYN cookie with two bits.
 	mssTable = []uint16{536, 1300, 1440, 1460}
@@ -74,29 +72,42 @@ func encodeMSS(mss uint16) uint32 {
 	return 0
 }
 
-// syncRcvdCount is the number of endpoints in the SYN-RCVD state. The value is
-// protected by a mutex so that we can increment only when it's guaranteed not
-// to go above a threshold.
-var synRcvdCount struct {
-	sync.Mutex
-	value   uint64
-	pending sync.WaitGroup
-}
-
 // listenContext is used by a listening endpoint to store state used while
 // listening for connections. This struct is allocated by the listen goroutine
 // and must not be accessed or have its methods called concurrently as they
 // may mutate the stored objects.
 type listenContext struct {
-	stack    *stack.Stack
-	rcvWnd   seqnum.Size
-	nonce    [2][sha1.BlockSize]byte
+	stack *stack.Stack
+
+	// synRcvdCount is a reference to the stack level synRcvdCount.
+	synRcvdCount *synRcvdCounter
+
+	// rcvWnd is the receive window that is sent by this listening context
+	// in the initial SYN-ACK.
+	rcvWnd seqnum.Size
+
+	// nonce are random bytes that are initialized once when the context
+	// is created and used to seed the hash function when generating
+	// the SYN cookie.
+	nonce [2][sha1.BlockSize]byte
+
+	// listenEP is a reference to the listening endpoint associated with
+	// this context. Can be nil if the context is created by the forwarder.
 	listenEP *endpoint
 
+	// hasherMu protects hasher.
 	hasherMu sync.Mutex
-	hasher   hash.Hash
-	v6only   bool
+	// hasher is the hash function used to generate a SYN cookie.
+	hasher hash.Hash
+
+	// v6Only is true if listenEP is a dual stack socket and has the
+	// IPV6_V6ONLY option set.
+	v6only bool
+
+	// netProto indicates the network protocol(IPv4/v6) for the listening
+	// endpoint.
 	netProto tcpip.NetworkProtocolNumber
+
 	// pendingMu protects pendingEndpoints. This should only be accessed
 	// by the listening endpoint's worker goroutine.
 	//
@@ -115,44 +126,6 @@ func timeStamp() uint32 {
 	return uint32(time.Now().Unix()>>6) & tsMask
 }
 
-// incSynRcvdCount tries to increment the global number of endpoints in SYN-RCVD
-// state. It succeeds if the increment doesn't make the count go beyond the
-// threshold, and fails otherwise.
-func incSynRcvdCount() bool {
-	synRcvdCount.Lock()
-
-	if synRcvdCount.value >= SynRcvdCountThreshold {
-		synRcvdCount.Unlock()
-		return false
-	}
-
-	synRcvdCount.pending.Add(1)
-	synRcvdCount.value++
-
-	synRcvdCount.Unlock()
-	return true
-}
-
-// decSynRcvdCount atomically decrements the global number of endpoints in
-// SYN-RCVD state. It must only be called if a previous call to incSynRcvdCount
-// succeeded.
-func decSynRcvdCount() {
-	synRcvdCount.Lock()
-
-	synRcvdCount.value--
-	synRcvdCount.pending.Done()
-	synRcvdCount.Unlock()
-}
-
-// synCookiesInUse() returns true if the synRcvdCount is greater than
-// SynRcvdCountThreshold.
-func synCookiesInUse() bool {
-	synRcvdCount.Lock()
-	v := synRcvdCount.value
-	synRcvdCount.Unlock()
-	return v >= SynRcvdCountThreshold
-}
-
 // newListenContext creates a new listen context.
 func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6only bool, netProto tcpip.NetworkProtocolNumber) *listenContext {
 	l := &listenContext{
@@ -164,6 +137,11 @@ func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size,
 		listenEP:         listenEP,
 		pendingEndpoints: make(map[stack.TransportEndpointID]*endpoint),
 	}
+	p, ok := stk.TransportProtocolInstance(ProtocolNumber).(*protocol)
+	if !ok {
+		panic(fmt.Sprintf("unable to get TCP protocol instance from stack: %+v", stk))
+	}
+	l.synRcvdCount = p.SynRcvdCounter()
 
 	rand.Read(l.nonce[0][:])
 	rand.Read(l.nonce[1][:])
@@ -410,7 +388,7 @@ func (e *endpoint) propagateInheritableOptionsLocked(n *endpoint) {
 // A limited number of these goroutines are allowed before TCP starts using SYN
 // cookies to accept connections.
 func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header.TCPSynOptions) {
-	defer decSynRcvdCount()
+	defer ctx.synRcvdCount.dec()
 	defer func() {
 		e.mu.Lock()
 		e.decSynRcvdCount()
@@ -477,7 +455,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 	switch {
 	case s.flags == header.TCPFlagSyn:
 		opts := parseSynSegmentOptions(s)
-		if incSynRcvdCount() {
+		if ctx.synRcvdCount.inc() {
 			// Only handle the syn if the following conditions hold
 			//   - accept queue is not full.
 			//   - number of connections in synRcvd state is less than the
@@ -487,7 +465,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 				go e.handleSynSegment(ctx, s, &opts) // S/R-SAFE: synRcvdCount is the barrier.
 				return
 			}
-			decSynRcvdCount()
+			ctx.synRcvdCount.dec()
 			e.stack.Stats().TCP.ListenOverflowSynDrop.Increment()
 			e.stats.ReceiveErrors.ListenOverflowSynDrop.Increment()
 			e.stack.Stats().DroppedPackets.Increment()
@@ -540,7 +518,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			return
 		}
 
-		if !synCookiesInUse() {
+		if !ctx.synRcvdCount.synCookiesInUse() {
 			// When not using SYN cookies, as per RFC 793, section 3.9, page 64:
 			// Any acknowledgment is bad if it arrives on a connection still in
 			// the LISTEN state.  An acceptable reset segment should be formed
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index 4f361b226..804e95aea 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -568,11 +568,10 @@ func TestV4AcceptOnV4(t *testing.T) {
 func testV4ListenClose(t *testing.T, c *context.Context) {
 	// Set the SynRcvd threshold to zero to force a syn cookie based accept
 	// to happen.
-	saved := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = saved
-	}()
-	tcp.SynRcvdCountThreshold = 0
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+		t.Fatalf("setting TCPSynRcvdCountThresholdOption failed: %s", err)
+	}
+
 	const n = uint16(32)
 
 	// Start listening.
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 91f25c132..effbf203f 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -94,6 +94,63 @@ const (
 	ccCubic = "cubic"
 )
 
+// syncRcvdCounter tracks the number of endpoints in the SYN-RCVD state. The
+// value is protected by a mutex so that we can increment only when it's
+// guaranteed not to go above a threshold.
+type synRcvdCounter struct {
+	sync.Mutex
+	value     uint64
+	pending   sync.WaitGroup
+	threshold uint64
+}
+
+// inc tries to increment the global number of endpoints in SYN-RCVD state. It
+// succeeds if the increment doesn't make the count go beyond the threshold, and
+// fails otherwise.
+func (s *synRcvdCounter) inc() bool {
+	s.Lock()
+	defer s.Unlock()
+	if s.value >= s.threshold {
+		return false
+	}
+
+	s.pending.Add(1)
+	s.value++
+
+	return true
+}
+
+// dec atomically decrements the global number of endpoints in SYN-RCVD
+// state. It must only be called if a previous call to inc succeeded.
+func (s *synRcvdCounter) dec() {
+	s.Lock()
+	defer s.Unlock()
+	s.value--
+	s.pending.Done()
+}
+
+// synCookiesInUse returns true if the synRcvdCount is greater than
+// SynRcvdCountThreshold.
+func (s *synRcvdCounter) synCookiesInUse() bool {
+	s.Lock()
+	defer s.Unlock()
+	return s.value >= s.threshold
+}
+
+// SetThreshold sets synRcvdCounter.Threshold to ths new threshold.
+func (s *synRcvdCounter) SetThreshold(threshold uint64) {
+	s.Lock()
+	defer s.Unlock()
+	s.threshold = threshold
+}
+
+// Threshold returns the current value of synRcvdCounter.Threhsold.
+func (s *synRcvdCounter) Threshold() uint64 {
+	s.Lock()
+	defer s.Unlock()
+	return s.threshold
+}
+
 type protocol struct {
 	mu                         sync.RWMutex
 	sackEnabled                bool
@@ -106,6 +163,7 @@ type protocol struct {
 	tcpLingerTimeout           time.Duration
 	tcpTimeWaitTimeout         time.Duration
 	minRTO                     time.Duration
+	synRcvdCount               synRcvdCounter
 	dispatcher                 *dispatcher
 }
 
@@ -282,6 +340,12 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 		p.mu.Unlock()
 		return nil
 
+	case tcpip.TCPSynRcvdCountThresholdOption:
+		p.mu.Lock()
+		p.synRcvdCount.SetThreshold(uint64(v))
+		p.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -350,6 +414,12 @@ func (p *protocol) Option(option interface{}) *tcpip.Error {
 		p.mu.RUnlock()
 		return nil
 
+	case *tcpip.TCPSynRcvdCountThresholdOption:
+		p.mu.RLock()
+		*v = tcpip.TCPSynRcvdCountThresholdOption(p.synRcvdCount.Threshold())
+		p.mu.RUnlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -365,6 +435,12 @@ func (p *protocol) Wait() {
 	p.dispatcher.wait()
 }
 
+// SynRcvdCounter returns a reference to the synRcvdCount for this protocol
+// instance.
+func (p *protocol) SynRcvdCounter() *synRcvdCounter {
+	return &p.synRcvdCount
+}
+
 // NewProtocol returns a TCP transport protocol.
 func NewProtocol() stack.TransportProtocol {
 	return &protocol{
@@ -374,6 +450,7 @@ func NewProtocol() stack.TransportProtocol {
 		availableCongestionControl: []string{ccReno, ccCubic},
 		tcpLingerTimeout:           DefaultTCPLingerTimeout,
 		tcpTimeWaitTimeout:         DefaultTCPTimeWaitTimeout,
+		synRcvdCount:               synRcvdCounter{threshold: SynRcvdCountThreshold},
 		dispatcher:                 newDispatcher(runtime.GOMAXPROCS(0)),
 		minRTO:                     MinRTO,
 	}
diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go
index c439d5281..1dd63dd61 100644
--- a/pkg/tcpip/transport/tcp/tcp_sack_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go
@@ -149,21 +149,22 @@ func TestSackPermittedAccept(t *testing.T) {
 		{true, false, -1, 0xffff}, // When cookie is used window scaling is disabled.
 		{false, true, 5, 0x8000},  // 0x8000 * 2^5 = 1<<20 = 1MB window (the default).
 	}
-	savedSynCountThreshold := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = savedSynCountThreshold
-	}()
+
 	for _, tc := range testCases {
 		t.Run(fmt.Sprintf("test: %#v", tc), func(t *testing.T) {
-			if tc.cookieEnabled {
-				tcp.SynRcvdCountThreshold = 0
-			} else {
-				tcp.SynRcvdCountThreshold = savedSynCountThreshold
-			}
 			for _, sackEnabled := range []bool{false, true} {
 				t.Run(fmt.Sprintf("test stack.sackEnabled: %v", sackEnabled), func(t *testing.T) {
 					c := context.New(t, defaultMTU)
 					defer c.Cleanup()
+
+					if tc.cookieEnabled {
+						// Set the SynRcvd threshold to
+						// zero to force a syn cookie
+						// based accept to happen.
+						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+							t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+						}
+					}
 					setStackSACKPermitted(t, c, sackEnabled)
 
 					rep := c.AcceptWithOptions(tc.wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS, SACKPermitted: tc.sackPermitted})
@@ -222,21 +223,23 @@ func TestSackDisabledAccept(t *testing.T) {
 		{true, -1, 0xffff}, // When cookie is used window scaling is disabled.
 		{false, 5, 0x8000}, // 0x8000 * 2^5 = 1<<20 = 1MB window (the default).
 	}
-	savedSynCountThreshold := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = savedSynCountThreshold
-	}()
+
 	for _, tc := range testCases {
 		t.Run(fmt.Sprintf("test: %#v", tc), func(t *testing.T) {
-			if tc.cookieEnabled {
-				tcp.SynRcvdCountThreshold = 0
-			} else {
-				tcp.SynRcvdCountThreshold = savedSynCountThreshold
-			}
 			for _, sackEnabled := range []bool{false, true} {
 				t.Run(fmt.Sprintf("test: sackEnabled: %v", sackEnabled), func(t *testing.T) {
 					c := context.New(t, defaultMTU)
 					defer c.Cleanup()
+
+					if tc.cookieEnabled {
+						// Set the SynRcvd threshold to
+						// zero to force a syn cookie
+						// based accept to happen.
+						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+							t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+						}
+					}
+
 					setStackSACKPermitted(t, c, sackEnabled)
 
 					rep := c.AcceptWithOptions(tc.wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS})
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index a9f121c17..74fb6e064 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -2706,26 +2706,24 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) {
 
 	// Set the SynRcvd threshold to zero to force a syn cookie based accept
 	// to happen.
-	saved := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = saved
-	}()
-	tcp.SynRcvdCountThreshold = 0
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+		t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+	}
 
 	// Create EP and start listening.
 	wq := &waiter.Queue{}
 	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
 	if err != nil {
-		t.Fatalf("NewEndpoint failed: %v", err)
+		t.Fatalf("NewEndpoint failed: %s", err)
 	}
 	defer ep.Close()
 
 	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
-		t.Fatalf("Bind failed: %v", err)
+		t.Fatalf("Bind failed: %s", err)
 	}
 
 	if err := ep.Listen(10); err != nil {
-		t.Fatalf("Listen failed: %v", err)
+		t.Fatalf("Listen failed: %s", err)
 	}
 
 	// Do 3-way handshake.
@@ -2743,7 +2741,7 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) {
 		case <-ch:
 			c.EP, _, err = ep.Accept()
 			if err != nil {
-				t.Fatalf("Accept failed: %v", err)
+				t.Fatalf("Accept failed: %s", err)
 			}
 
 		case <-time.After(1 * time.Second):
@@ -5143,25 +5141,23 @@ func TestListenSynRcvdQueueFull(t *testing.T) {
 }
 
 func TestListenBacklogFullSynCookieInUse(t *testing.T) {
-	saved := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = saved
-	}()
-	tcp.SynRcvdCountThreshold = 1
-
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(1)); err != nil {
+		t.Fatalf("setting TCPSynRcvdCountThresholdOption to 1 failed: %s", err)
+	}
+
 	// Create TCP endpoint.
 	var err *tcpip.Error
 	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
 	if err != nil {
-		t.Fatalf("NewEndpoint failed: %v", err)
+		t.Fatalf("NewEndpoint failed: %s", err)
 	}
 
 	// Bind to wildcard.
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
-		t.Fatalf("Bind failed: %v", err)
+		t.Fatalf("Bind failed: %s", err)
 	}
 
 	// Test acceptance.
@@ -5169,7 +5165,7 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
 	listenBacklog := 1
 	portOffset := uint16(0)
 	if err := c.EP.Listen(listenBacklog); err != nil {
-		t.Fatalf("Listen failed: %v", err)
+		t.Fatalf("Listen failed: %s", err)
 	}
 
 	executeHandshake(t, c, context.TestPort+portOffset, false)
diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
index a641e953d..8edbff964 100644
--- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
@@ -127,16 +127,14 @@ func TestTimeStampDisabledConnect(t *testing.T) {
 }
 
 func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndSize uint16) {
-	savedSynCountThreshold := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = savedSynCountThreshold
-	}()
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
 
 	if cookieEnabled {
-		tcp.SynRcvdCountThreshold = 0
+		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+			t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+		}
 	}
-	c := context.New(t, defaultMTU)
-	defer c.Cleanup()
 
 	t.Logf("Test w/ CookieEnabled = %v", cookieEnabled)
 	tsVal := rand.Uint32()
@@ -148,7 +146,7 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS
 	copy(view, data)
 
 	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
-		t.Fatalf("Unexpected error from Write: %v", err)
+		t.Fatalf("Unexpected error from Write: %s", err)
 	}
 
 	// Check that data is received and that the timestamp option TSEcr field
@@ -190,17 +188,15 @@ func TestTimeStampEnabledAccept(t *testing.T) {
 }
 
 func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndSize uint16) {
-	savedSynCountThreshold := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = savedSynCountThreshold
-	}()
-	if cookieEnabled {
-		tcp.SynRcvdCountThreshold = 0
-	}
-
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
+	if cookieEnabled {
+		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+			t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+		}
+	}
+
 	t.Logf("Test w/ CookieEnabled = %v", cookieEnabled)
 	c.AcceptWithOptions(wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS})
 
@@ -211,7 +207,7 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd
 	copy(view, data)
 
 	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
-		t.Fatalf("Unexpected error from Write: %v", err)
+		t.Fatalf("Unexpected error from Write: %s", err)
 	}
 
 	// Check that data is received and that the timestamp option is disabled
-- 
cgit v1.2.3


From b33c3bb4a73974bbae4274da5100a3cd3f5deef8 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 16 Apr 2020 17:26:12 -0700
Subject: Return detailed errors when iterating NDP options

Test: header_test.TestNDPOptionsIterCheck
PiperOrigin-RevId: 306953867
---
 pkg/tcpip/header/BUILD                         |   1 +
 pkg/tcpip/header/ndp_options.go                | 196 ++++++++++++++-----------
 pkg/tcpip/header/ndp_test.go                   | 118 ++++++++-------
 pkg/tcpip/header/ndpoptionidentifier_string.go |  50 +++++++
 pkg/tcpip/stack/ndp.go                         |   3 +-
 5 files changed, 228 insertions(+), 140 deletions(-)
 create mode 100644 pkg/tcpip/header/ndpoptionidentifier_string.go

diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index 7094f3f0b..0cde694dc 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -21,6 +21,7 @@ go_library(
         "ndp_options.go",
         "ndp_router_advert.go",
         "ndp_router_solicit.go",
+        "ndpoptionidentifier_string.go",
         "tcp.go",
         "udp.go",
     ],
diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
index e6a6ad39b..444e90820 100644
--- a/pkg/tcpip/header/ndp_options.go
+++ b/pkg/tcpip/header/ndp_options.go
@@ -15,32 +15,43 @@
 package header
 
 import (
+	"bytes"
 	"encoding/binary"
 	"errors"
 	"fmt"
+	"io"
 	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
+// NDPOptionIdentifier is an NDP option type identifier.
+type NDPOptionIdentifier uint8
+
 const (
 	// NDPSourceLinkLayerAddressOptionType is the type of the Source Link Layer
 	// Address option, as per RFC 4861 section 4.6.1.
-	NDPSourceLinkLayerAddressOptionType = 1
+	NDPSourceLinkLayerAddressOptionType NDPOptionIdentifier = 1
 
 	// NDPTargetLinkLayerAddressOptionType is the type of the Target Link Layer
 	// Address option, as per RFC 4861 section 4.6.1.
-	NDPTargetLinkLayerAddressOptionType = 2
+	NDPTargetLinkLayerAddressOptionType NDPOptionIdentifier = 2
+
+	// NDPPrefixInformationType is the type of the Prefix Information
+	// option, as per RFC 4861 section 4.6.2.
+	NDPPrefixInformationType NDPOptionIdentifier = 3
+
+	// NDPRecursiveDNSServerOptionType is the type of the Recursive DNS
+	// Server option, as per RFC 8106 section 5.1.
+	NDPRecursiveDNSServerOptionType NDPOptionIdentifier = 25
+)
 
+const (
 	// NDPLinkLayerAddressSize is the size of a Source or Target Link Layer
 	// Address option for an Ethernet address.
 	NDPLinkLayerAddressSize = 8
 
-	// NDPPrefixInformationType is the type of the Prefix Information
-	// option, as per RFC 4861 section 4.6.2.
-	NDPPrefixInformationType = 3
-
 	// ndpPrefixInformationLength is the expected length, in bytes, of the
 	// body of an NDP Prefix Information option, as per RFC 4861 section
 	// 4.6.2 which specifies that the Length field is 4. Given this, the
@@ -91,10 +102,6 @@ const (
 	// within an NDPPrefixInformation.
 	ndpPrefixInformationPrefixOffset = 14
 
-	// NDPRecursiveDNSServerOptionType is the type of the Recursive DNS
-	// Server option, as per RFC 8106 section 5.1.
-	NDPRecursiveDNSServerOptionType = 25
-
 	// ndpRecursiveDNSServerLifetimeOffset is the start of the 4-byte
 	// Lifetime field within an NDPRecursiveDNSServer.
 	ndpRecursiveDNSServerLifetimeOffset = 2
@@ -103,10 +110,10 @@ const (
 	// for IPv6 Recursive DNS Servers within an NDPRecursiveDNSServer.
 	ndpRecursiveDNSServerAddressesOffset = 6
 
-	// minNDPRecursiveDNSServerLength is the minimum NDP Recursive DNS
-	// Server option's length field value when it contains at least one
-	// IPv6 address.
-	minNDPRecursiveDNSServerLength = 3
+	// minNDPRecursiveDNSServerLength is the minimum NDP Recursive DNS Server
+	// option's body size when it contains at least one IPv6 address, as per
+	// RFC 8106 section 5.3.1.
+	minNDPRecursiveDNSServerBodySize = 22
 
 	// lengthByteUnits is the multiplier factor for the Length field of an
 	// NDP option. That is, the length field for NDP options is in units of
@@ -132,16 +139,13 @@ var (
 // few NDPOption then modify the backing NDPOptions so long as the
 // NDPOptionIterator obtained before modification is no longer used.
 type NDPOptionIterator struct {
-	// The NDPOptions this NDPOptionIterator is iterating over.
-	opts NDPOptions
+	opts *bytes.Buffer
 }
 
 // Potential errors when iterating over an NDPOptions.
 var (
-	ErrNDPOptBufExhausted  = errors.New("Buffer unexpectedly exhausted")
-	ErrNDPOptZeroLength    = errors.New("NDP option has zero-valued Length field")
-	ErrNDPOptMalformedBody = errors.New("NDP option has a malformed body")
-	ErrNDPInvalidLength    = errors.New("NDP option's Length value is invalid as per relevant RFC")
+	ErrNDPOptMalformedBody   = errors.New("NDP option has a malformed body")
+	ErrNDPOptMalformedHeader = errors.New("NDP option has a malformed header")
 )
 
 // Next returns the next element in the backing NDPOptions, or true if we are
@@ -152,48 +156,50 @@ var (
 func (i *NDPOptionIterator) Next() (NDPOption, bool, error) {
 	for {
 		// Do we still have elements to look at?
-		if len(i.opts) == 0 {
+		if i.opts.Len() == 0 {
 			return nil, true, nil
 		}
 
-		// Do we have enough bytes for an NDP option that has a Length
-		// field of at least 1? Note, 0 in the Length field is invalid.
-		if len(i.opts) < lengthByteUnits {
-			return nil, true, ErrNDPOptBufExhausted
-		}
-
 		// Get the Type field.
-		t := i.opts[0]
-
-		// Get the Length field.
-		l := i.opts[1]
+		temp, err := i.opts.ReadByte()
+		if err != nil {
+			if err != io.EOF {
+				// ReadByte should only ever return nil or io.EOF.
+				panic(fmt.Sprintf("unexpected error when reading the option's Type field: %s", err))
+			}
 
-		// This would indicate an erroneous NDP option as the Length
-		// field should never be 0.
-		if l == 0 {
-			return nil, true, ErrNDPOptZeroLength
+			// We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected once
+			// we start parsing an option; we expect the buffer to contain enough
+			// bytes for the whole option.
+			return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Type field: %w", io.ErrUnexpectedEOF)
 		}
+		kind := NDPOptionIdentifier(temp)
 
-		// How many bytes are in the option body?
-		numBytes := int(l) * lengthByteUnits
-		numBodyBytes := numBytes - 2
-
-		potentialBody := i.opts[2:]
+		// Get the Length field.
+		length, err := i.opts.ReadByte()
+		if err != nil {
+			if err != io.EOF {
+				panic(fmt.Sprintf("unexpected error when reading the option's Length field for %s: %s", kind, err))
+			}
 
-		// This would indicate an erroenous NDPOptions buffer as we ran
-		// out of the buffer in the middle of an NDP option.
-		if left := len(potentialBody); left < numBodyBytes {
-			return nil, true, ErrNDPOptBufExhausted
+			return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Length field for %s: %w", kind, io.ErrUnexpectedEOF)
 		}
 
-		// Get only the options body, leaving the rest of the options
-		// buffer alone.
-		body := potentialBody[:numBodyBytes]
+		// This would indicate an erroneous NDP option as the Length field should
+		// never be 0.
+		if length == 0 {
+			return nil, true, fmt.Errorf("zero valued Length field for %s: %w", kind, ErrNDPOptMalformedHeader)
+		}
 
-		// Update opts with the remaining options body.
-		i.opts = i.opts[numBytes:]
+		// Get the body.
+		numBytes := int(length) * lengthByteUnits
+		numBodyBytes := numBytes - 2
+		body := i.opts.Next(numBodyBytes)
+		if len(body) < numBodyBytes {
+			return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Body for %s: %w", kind, io.ErrUnexpectedEOF)
+		}
 
-		switch t {
+		switch kind {
 		case NDPSourceLinkLayerAddressOptionType:
 			return NDPSourceLinkLayerAddressOption(body), false, nil
 
@@ -205,22 +211,15 @@ func (i *NDPOptionIterator) Next() (NDPOption, bool, error) {
 			// body is ndpPrefixInformationLength, as per RFC 4861
 			// section 4.6.2.
 			if numBodyBytes != ndpPrefixInformationLength {
-				return nil, true, ErrNDPOptMalformedBody
+				return nil, true, fmt.Errorf("got %d bytes for NDP Prefix Information option's body, expected %d bytes: %w", numBodyBytes, ndpPrefixInformationLength, ErrNDPOptMalformedBody)
 			}
 
 			return NDPPrefixInformation(body), false, nil
 
 		case NDPRecursiveDNSServerOptionType:
-			// RFC 8106 section 5.3.1 outlines that the RDNSS option
-			// must have a minimum length of 3 so it contains at
-			// least one IPv6 address.
-			if l < minNDPRecursiveDNSServerLength {
-				return nil, true, ErrNDPInvalidLength
-			}
-
 			opt := NDPRecursiveDNSServer(body)
-			if len(opt.Addresses()) == 0 {
-				return nil, true, ErrNDPOptMalformedBody
+			if err := opt.checkAddresses(); err != nil {
+				return nil, true, err
 			}
 
 			return opt, false, nil
@@ -247,10 +246,16 @@ type NDPOptions []byte
 //
 // See NDPOptionIterator for more information.
 func (b NDPOptions) Iter(check bool) (NDPOptionIterator, error) {
-	it := NDPOptionIterator{opts: b}
+	it := NDPOptionIterator{
+		opts: bytes.NewBuffer(b),
+	}
 
 	if check {
-		for it2 := it; true; {
+		it2 := NDPOptionIterator{
+			opts: bytes.NewBuffer(b),
+		}
+
+		for {
 			if _, done, err := it2.Next(); err != nil || done {
 				return it, err
 			}
@@ -278,7 +283,7 @@ func (b NDPOptions) Serialize(s NDPOptionsSerializer) int {
 			continue
 		}
 
-		b[0] = o.Type()
+		b[0] = byte(o.Type())
 
 		// We know this safe because paddedLength would have returned
 		// 0 if o had an invalid length (> 255 * lengthByteUnits).
@@ -304,7 +309,7 @@ type NDPOption interface {
 	fmt.Stringer
 
 	// Type returns the type of the receiver.
-	Type() uint8
+	Type() NDPOptionIdentifier
 
 	// Length returns the length of the body of the receiver, in bytes.
 	Length() int
@@ -386,7 +391,7 @@ func (b NDPOptionsSerializer) Length() int {
 type NDPSourceLinkLayerAddressOption tcpip.LinkAddress
 
 // Type implements NDPOption.Type.
-func (o NDPSourceLinkLayerAddressOption) Type() uint8 {
+func (o NDPSourceLinkLayerAddressOption) Type() NDPOptionIdentifier {
 	return NDPSourceLinkLayerAddressOptionType
 }
 
@@ -426,7 +431,7 @@ func (o NDPSourceLinkLayerAddressOption) EthernetAddress() tcpip.LinkAddress {
 type NDPTargetLinkLayerAddressOption tcpip.LinkAddress
 
 // Type implements NDPOption.Type.
-func (o NDPTargetLinkLayerAddressOption) Type() uint8 {
+func (o NDPTargetLinkLayerAddressOption) Type() NDPOptionIdentifier {
 	return NDPTargetLinkLayerAddressOptionType
 }
 
@@ -466,7 +471,7 @@ func (o NDPTargetLinkLayerAddressOption) EthernetAddress() tcpip.LinkAddress {
 type NDPPrefixInformation []byte
 
 // Type implements NDPOption.Type.
-func (o NDPPrefixInformation) Type() uint8 {
+func (o NDPPrefixInformation) Type() NDPOptionIdentifier {
 	return NDPPrefixInformationType
 }
 
@@ -590,7 +595,7 @@ type NDPRecursiveDNSServer []byte
 // Type returns the type of an NDP Recursive DNS Server option.
 //
 // Type implements NDPOption.Type.
-func (NDPRecursiveDNSServer) Type() uint8 {
+func (NDPRecursiveDNSServer) Type() NDPOptionIdentifier {
 	return NDPRecursiveDNSServerOptionType
 }
 
@@ -613,7 +618,12 @@ func (o NDPRecursiveDNSServer) serializeInto(b []byte) int {
 
 // String implements fmt.Stringer.String.
 func (o NDPRecursiveDNSServer) String() string {
-	return fmt.Sprintf("%T(%s valid for %s)", o, o.Addresses(), o.Lifetime())
+	lt := o.Lifetime()
+	addrs, err := o.Addresses()
+	if err != nil {
+		return fmt.Sprintf("%T([] valid for %s; err = %s)", o, lt, err)
+	}
+	return fmt.Sprintf("%T(%s valid for %s)", o, addrs, lt)
 }
 
 // Lifetime returns the length of time that the DNS server addresses
@@ -632,29 +642,45 @@ func (o NDPRecursiveDNSServer) Lifetime() time.Duration {
 // Addresses returns the recursive DNS server IPv6 addresses that may be
 // used for name resolution.
 //
-// Note, some of the addresses returned MAY be link-local addresses.
+// Note, the addresses MAY be link-local addresses.
+func (o NDPRecursiveDNSServer) Addresses() ([]tcpip.Address, error) {
+	var addrs []tcpip.Address
+	return addrs, o.iterAddresses(func(addr tcpip.Address) { addrs = append(addrs, addr) })
+}
+
+// checkAddresses iterates over the addresses in an NDP Recursive DNS Server
+// option and returns any error it encounters.
+func (o NDPRecursiveDNSServer) checkAddresses() error {
+	return o.iterAddresses(nil)
+}
+
+// iterAddresses iterates over the addresses in an NDP Recursive DNS Server
+// option and calls a function with each valid unicast IPv6 address.
 //
-// Addresses may panic if o does not hold valid IPv6 addresses.
-func (o NDPRecursiveDNSServer) Addresses() []tcpip.Address {
-	l := len(o)
-	if l < ndpRecursiveDNSServerAddressesOffset {
-		return nil
+// Note, the addresses MAY be link-local addresses.
+func (o NDPRecursiveDNSServer) iterAddresses(fn func(tcpip.Address)) error {
+	if l := len(o); l < minNDPRecursiveDNSServerBodySize {
+		return fmt.Errorf("got %d bytes for NDP Recursive DNS Server option's body, expected at least %d bytes: %w", l, minNDPRecursiveDNSServerBodySize, io.ErrUnexpectedEOF)
 	}
 
-	l -= ndpRecursiveDNSServerAddressesOffset
+	o = o[ndpRecursiveDNSServerAddressesOffset:]
+	l := len(o)
 	if l%IPv6AddressSize != 0 {
-		return nil
+		return fmt.Errorf("NDP Recursive DNS Server option's body ends in the middle of an IPv6 address (addresses body size = %d bytes): %w", l, ErrNDPOptMalformedBody)
 	}
 
-	buf := o[ndpRecursiveDNSServerAddressesOffset:]
-	var addrs []tcpip.Address
-	for len(buf) > 0 {
-		addr := tcpip.Address(buf[:IPv6AddressSize])
+	for i := 0; len(o) != 0; i++ {
+		addr := tcpip.Address(o[:IPv6AddressSize])
 		if !IsV6UnicastAddress(addr) {
-			return nil
+			return fmt.Errorf("%d-th address (%s) in NDP Recursive DNS Server option is not a valid unicast IPv6 address: %w", i, addr, ErrNDPOptMalformedBody)
+		}
+
+		if fn != nil {
+			fn(addr)
 		}
-		addrs = append(addrs, addr)
-		buf = buf[IPv6AddressSize:]
+
+		o = o[IPv6AddressSize:]
 	}
-	return addrs
+
+	return nil
 }
diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go
index 969f8f1e8..2341329c4 100644
--- a/pkg/tcpip/header/ndp_test.go
+++ b/pkg/tcpip/header/ndp_test.go
@@ -16,6 +16,8 @@ package header
 
 import (
 	"bytes"
+	"errors"
+	"io"
 	"testing"
 	"time"
 
@@ -543,8 +545,12 @@ func TestNDPRecursiveDNSServerOptionSerialize(t *testing.T) {
 	want := []tcpip.Address{
 		"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
 	}
-	if got := opt.Addresses(); !cmp.Equal(got, want) {
-		t.Errorf("got Addresses = %v, want = %v", got, want)
+	addrs, err := opt.Addresses()
+	if err != nil {
+		t.Errorf("opt.Addresses() = %s", err)
+	}
+	if diff := cmp.Diff(addrs, want); diff != "" {
+		t.Errorf("mismatched addresses (-want +got):\n%s", diff)
 	}
 
 	// Iterator should not return anything else.
@@ -638,8 +644,12 @@ func TestNDPRecursiveDNSServerOption(t *testing.T) {
 			if got := opt.Lifetime(); got != test.lifetime {
 				t.Errorf("got Lifetime = %d, want = %d", got, test.lifetime)
 			}
-			if got := opt.Addresses(); !cmp.Equal(got, test.addrs) {
-				t.Errorf("got Addresses = %v, want = %v", got, test.addrs)
+			addrs, err := opt.Addresses()
+			if err != nil {
+				t.Errorf("opt.Addresses() = %s", err)
+			}
+			if diff := cmp.Diff(addrs, test.addrs); diff != "" {
+				t.Errorf("mismatched addresses (-want +got):\n%s", diff)
 			}
 
 			// Iterator should not return anything else.
@@ -661,38 +671,38 @@ func TestNDPRecursiveDNSServerOption(t *testing.T) {
 // the iterator was returned for is malformed.
 func TestNDPOptionsIterCheck(t *testing.T) {
 	tests := []struct {
-		name     string
-		buf      []byte
-		expected error
+		name        string
+		buf         []byte
+		expectedErr error
 	}{
 		{
-			"ZeroLengthField",
-			[]byte{0, 0, 0, 0, 0, 0, 0, 0},
-			ErrNDPOptZeroLength,
+			name:        "ZeroLengthField",
+			buf:         []byte{0, 0, 0, 0, 0, 0, 0, 0},
+			expectedErr: ErrNDPOptMalformedHeader,
 		},
 		{
-			"ValidSourceLinkLayerAddressOption",
-			[]byte{1, 1, 1, 2, 3, 4, 5, 6},
-			nil,
+			name:        "ValidSourceLinkLayerAddressOption",
+			buf:         []byte{1, 1, 1, 2, 3, 4, 5, 6},
+			expectedErr: nil,
 		},
 		{
-			"TooSmallSourceLinkLayerAddressOption",
-			[]byte{1, 1, 1, 2, 3, 4, 5},
-			ErrNDPOptBufExhausted,
+			name:        "TooSmallSourceLinkLayerAddressOption",
+			buf:         []byte{1, 1, 1, 2, 3, 4, 5},
+			expectedErr: io.ErrUnexpectedEOF,
 		},
 		{
-			"ValidTargetLinkLayerAddressOption",
-			[]byte{2, 1, 1, 2, 3, 4, 5, 6},
-			nil,
+			name:        "ValidTargetLinkLayerAddressOption",
+			buf:         []byte{2, 1, 1, 2, 3, 4, 5, 6},
+			expectedErr: nil,
 		},
 		{
-			"TooSmallTargetLinkLayerAddressOption",
-			[]byte{2, 1, 1, 2, 3, 4, 5},
-			ErrNDPOptBufExhausted,
+			name:        "TooSmallTargetLinkLayerAddressOption",
+			buf:         []byte{2, 1, 1, 2, 3, 4, 5},
+			expectedErr: io.ErrUnexpectedEOF,
 		},
 		{
-			"ValidPrefixInformation",
-			[]byte{
+			name: "ValidPrefixInformation",
+			buf: []byte{
 				3, 4, 43, 64,
 				1, 2, 3, 4,
 				5, 6, 7, 8,
@@ -702,11 +712,11 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 				17, 18, 19, 20,
 				21, 22, 23, 24,
 			},
-			nil,
+			expectedErr: nil,
 		},
 		{
-			"TooSmallPrefixInformation",
-			[]byte{
+			name: "TooSmallPrefixInformation",
+			buf: []byte{
 				3, 4, 43, 64,
 				1, 2, 3, 4,
 				5, 6, 7, 8,
@@ -716,11 +726,11 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 				17, 18, 19, 20,
 				21, 22, 23,
 			},
-			ErrNDPOptBufExhausted,
+			expectedErr: io.ErrUnexpectedEOF,
 		},
 		{
-			"InvalidPrefixInformationLength",
-			[]byte{
+			name: "InvalidPrefixInformationLength",
+			buf: []byte{
 				3, 3, 43, 64,
 				1, 2, 3, 4,
 				5, 6, 7, 8,
@@ -728,11 +738,11 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 				9, 10, 11, 12,
 				13, 14, 15, 16,
 			},
-			ErrNDPOptMalformedBody,
+			expectedErr: ErrNDPOptMalformedBody,
 		},
 		{
-			"ValidSourceAndTargetLinkLayerAddressWithPrefixInformation",
-			[]byte{
+			name: "ValidSourceAndTargetLinkLayerAddressWithPrefixInformation",
+			buf: []byte{
 				// Source Link-Layer Address.
 				1, 1, 1, 2, 3, 4, 5, 6,
 
@@ -749,11 +759,11 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 				17, 18, 19, 20,
 				21, 22, 23, 24,
 			},
-			nil,
+			expectedErr: nil,
 		},
 		{
-			"ValidSourceAndTargetLinkLayerAddressWithPrefixInformationWithUnrecognized",
-			[]byte{
+			name: "ValidSourceAndTargetLinkLayerAddressWithPrefixInformationWithUnrecognized",
+			buf: []byte{
 				// Source Link-Layer Address.
 				1, 1, 1, 2, 3, 4, 5, 6,
 
@@ -775,52 +785,52 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 				17, 18, 19, 20,
 				21, 22, 23, 24,
 			},
-			nil,
+			expectedErr: nil,
 		},
 		{
-			"InvalidRecursiveDNSServerCutsOffAddress",
-			[]byte{
+			name: "InvalidRecursiveDNSServerCutsOffAddress",
+			buf: []byte{
 				25, 4, 0, 0,
 				0, 0, 0, 0,
 				0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
 				0, 1, 2, 3, 4, 5, 6, 7,
 			},
-			ErrNDPOptMalformedBody,
+			expectedErr: ErrNDPOptMalformedBody,
 		},
 		{
-			"InvalidRecursiveDNSServerInvalidLengthField",
-			[]byte{
+			name: "InvalidRecursiveDNSServerInvalidLengthField",
+			buf: []byte{
 				25, 2, 0, 0,
 				0, 0, 0, 0,
 				0, 1, 2, 3, 4, 5, 6, 7, 8,
 			},
-			ErrNDPInvalidLength,
+			expectedErr: io.ErrUnexpectedEOF,
 		},
 		{
-			"RecursiveDNSServerTooSmall",
-			[]byte{
+			name: "RecursiveDNSServerTooSmall",
+			buf: []byte{
 				25, 1, 0, 0,
 				0, 0, 0,
 			},
-			ErrNDPOptBufExhausted,
+			expectedErr: io.ErrUnexpectedEOF,
 		},
 		{
-			"RecursiveDNSServerMulticast",
-			[]byte{
+			name: "RecursiveDNSServerMulticast",
+			buf: []byte{
 				25, 3, 0, 0,
 				0, 0, 0, 0,
 				255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
 			},
-			ErrNDPOptMalformedBody,
+			expectedErr: ErrNDPOptMalformedBody,
 		},
 		{
-			"RecursiveDNSServerUnspecified",
-			[]byte{
+			name: "RecursiveDNSServerUnspecified",
+			buf: []byte{
 				25, 3, 0, 0,
 				0, 0, 0, 0,
 				0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 			},
-			ErrNDPOptMalformedBody,
+			expectedErr: ErrNDPOptMalformedBody,
 		},
 	}
 
@@ -828,8 +838,8 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 		t.Run(test.name, func(t *testing.T) {
 			opts := NDPOptions(test.buf)
 
-			if _, err := opts.Iter(true); err != test.expected {
-				t.Fatalf("got Iter(true) = (_, %v), want = (_, %v)", err, test.expected)
+			if _, err := opts.Iter(true); !errors.Is(err, test.expectedErr) {
+				t.Fatalf("got Iter(true) = (_, %v), want = (_, %v)", err, test.expectedErr)
 			}
 
 			// test.buf may be malformed but we chose not to check
diff --git a/pkg/tcpip/header/ndpoptionidentifier_string.go b/pkg/tcpip/header/ndpoptionidentifier_string.go
new file mode 100644
index 000000000..6fe9a336b
--- /dev/null
+++ b/pkg/tcpip/header/ndpoptionidentifier_string.go
@@ -0,0 +1,50 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by "stringer -type NDPOptionIdentifier ."; DO NOT EDIT.
+
+package header
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[NDPSourceLinkLayerAddressOptionType-1]
+	_ = x[NDPTargetLinkLayerAddressOptionType-2]
+	_ = x[NDPPrefixInformationType-3]
+	_ = x[NDPRecursiveDNSServerOptionType-25]
+}
+
+const (
+	_NDPOptionIdentifier_name_0 = "NDPSourceLinkLayerAddressOptionTypeNDPTargetLinkLayerAddressOptionTypeNDPPrefixInformationType"
+	_NDPOptionIdentifier_name_1 = "NDPRecursiveDNSServerOptionType"
+)
+
+var (
+	_NDPOptionIdentifier_index_0 = [...]uint8{0, 35, 70, 94}
+)
+
+func (i NDPOptionIdentifier) String() string {
+	switch {
+	case 1 <= i && i <= 3:
+		i -= 1
+		return _NDPOptionIdentifier_name_0[_NDPOptionIdentifier_index_0[i]:_NDPOptionIdentifier_index_0[i+1]]
+	case i == 25:
+		return _NDPOptionIdentifier_name_1
+	default:
+		return "NDPOptionIdentifier(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+}
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 7f66c6c09..8140c6dd4 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -711,7 +711,8 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 				continue
 			}
 
-			ndp.nic.stack.ndpDisp.OnRecursiveDNSServerOption(ndp.nic.ID(), opt.Addresses(), opt.Lifetime())
+			addrs, _ := opt.Addresses()
+			ndp.nic.stack.ndpDisp.OnRecursiveDNSServerOption(ndp.nic.ID(), addrs, opt.Lifetime())
 
 		case header.NDPPrefixInformation:
 			prefix := opt.Subnet()
-- 
cgit v1.2.3


From 3b05f576d73be644daa17203d9ed64481c45b4a8 Mon Sep 17 00:00:00 2001
From: Mithun Iyer <iyerm@google.com>
Date: Thu, 16 Apr 2020 17:57:06 -0700
Subject: Reset pending connections on listener shutdown.

When the listening socket is read shutdown, we need to reset all pending
and incoming connections. Ensure that the endpoint is not cleaned up
from the demuxer and subsequent bind to same port does not go through.

PiperOrigin-RevId: 306958038
---
 pkg/tcpip/transport/tcp/accept.go           | 20 +++---
 pkg/tcpip/transport/tcp/connect.go          |  7 ++-
 pkg/tcpip/transport/tcp/endpoint.go         | 30 ++++++---
 pkg/tcpip/transport/tcp/forwarder.go        |  2 +-
 pkg/tcpip/transport/tcp/protocol.go         |  8 +--
 pkg/tcpip/transport/tcp/tcp_test.go         | 16 ++---
 test/syscalls/linux/socket_inet_loopback.cc | 94 +++++++++++++++++++++++++++--
 7 files changed, 138 insertions(+), 39 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index b61c2a8c3..5bb243e3b 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -26,7 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -433,19 +432,16 @@ func (e *endpoint) acceptQueueIsFull() bool {
 // handleListenSegment is called when a listening endpoint receives a segment
 // and needs to handle it.
 func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
-	if s.flagsAreSet(header.TCPFlagSyn | header.TCPFlagAck) {
+	e.rcvListMu.Lock()
+	rcvClosed := e.rcvClosed
+	e.rcvListMu.Unlock()
+	if rcvClosed || s.flagsAreSet(header.TCPFlagSyn|header.TCPFlagAck) {
+		// If the endpoint is shutdown, reply with reset.
+		//
 		// RFC 793 section 3.4 page 35 (figure 12) outlines that a RST
 		// must be sent in response to a SYN-ACK while in the listen
 		// state to prevent completing a handshake from an old SYN.
-		e.sendTCP(&s.route, tcpFields{
-			id:     s.id,
-			ttl:    e.ttl,
-			tos:    e.sendTOS,
-			flags:  header.TCPFlagRst,
-			seq:    s.ackNumber,
-			ack:    0,
-			rcvWnd: 0,
-		}, buffer.VectorisedView{}, nil)
+		replyWithReset(s, e.sendTOS, e.ttl)
 		return
 	}
 
@@ -534,7 +530,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			// The only time we should reach here when a connection
 			// was opened and closed really quickly and a delayed
 			// ACK was received from the sender.
-			replyWithReset(s)
+			replyWithReset(s, e.sendTOS, e.ttl)
 			return
 		}
 
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 994ac52a3..368865911 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -1053,10 +1053,15 @@ func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 		ep = e.stack.FindTransportEndpoint(header.IPv4ProtocolNumber, e.TransProto, e.ID, &s.route)
 	}
 	if ep == nil {
-		replyWithReset(s)
+		replyWithReset(s, stack.DefaultTOS, s.route.DefaultTTL())
 		s.decRef()
 		return
 	}
+
+	if e == ep {
+		panic("current endpoint not removed from demuxer, enqueing segments to itself")
+	}
+
 	if ep.(*endpoint).enqueueSegment(s) {
 		ep.(*endpoint).newSegmentWaker.Assert()
 	}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index bffc59e9f..5d0ea9e93 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2101,7 +2101,7 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
 	switch {
 	case e.EndpointState().connected():
 		// Close for read.
-		if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
+		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
 			// Mark read side as closed.
 			e.rcvListMu.Lock()
 			e.rcvClosed = true
@@ -2110,7 +2110,7 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
 
 			// If we're fully closed and we have unread data we need to abort
 			// the connection with a RST.
-			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
+			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
 				e.resetConnectionLocked(tcpip.ErrConnectionAborted)
 				// Wake up worker to terminate loop.
 				e.notifyProtocolGoroutine(notifyTickleWorker)
@@ -2119,7 +2119,7 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
 		}
 
 		// Close for write.
-		if (e.shutdownFlags & tcpip.ShutdownWrite) != 0 {
+		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
 			e.sndBufMu.Lock()
 			if e.sndClosed {
 				// Already closed.
@@ -2142,12 +2142,23 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
 
 		return nil
 	case e.EndpointState() == StateListen:
-		// Tell protocolListenLoop to stop.
-		if flags&tcpip.ShutdownRead != 0 {
-			e.notifyProtocolGoroutine(notifyClose)
+		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
+			// Reset all connections from the accept queue and keep the
+			// worker running so that it can continue handling incoming
+			// segments by replying with RST.
+			//
+			// By not removing this endpoint from the demuxer mapping, we
+			// ensure that any other bind to the same port fails, as on Linux.
+			// TODO(gvisor.dev/issue/2468): We need to enable applications to
+			// start listening on this endpoint again similar to Linux.
+			e.rcvListMu.Lock()
+			e.rcvClosed = true
+			e.rcvListMu.Unlock()
+			e.closePendingAcceptableConnectionsLocked()
+			// Notify waiters that the endpoint is shutdown.
+			e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut | waiter.EventHUp | waiter.EventErr)
 		}
 		return nil
-
 	default:
 		return tcpip.ErrNotConnected
 	}
@@ -2251,8 +2262,11 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	e.LockUser()
 	defer e.UnlockUser()
 
+	e.rcvListMu.Lock()
+	rcvClosed := e.rcvClosed
+	e.rcvListMu.Unlock()
 	// Endpoint must be in listen state before it can accept connections.
-	if e.EndpointState() != StateListen {
+	if rcvClosed || e.EndpointState() != StateListen {
 		return nil, nil, tcpip.ErrInvalidEndpointState
 	}
 
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index 808410c92..704d01c64 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -130,7 +130,7 @@ func (r *ForwarderRequest) Complete(sendReset bool) {
 
 	// If the caller requested, send a reset.
 	if sendReset {
-		replyWithReset(r.segment)
+		replyWithReset(r.segment, stack.DefaultTOS, r.segment.route.DefaultTTL())
 	}
 
 	// Release all resources.
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index effbf203f..cfd9a4e8e 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -223,12 +223,12 @@ func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Transpo
 		return true
 	}
 
-	replyWithReset(s)
+	replyWithReset(s, stack.DefaultTOS, s.route.DefaultTTL())
 	return true
 }
 
 // replyWithReset replies to the given segment with a reset segment.
-func replyWithReset(s *segment) {
+func replyWithReset(s *segment, tos, ttl uint8) {
 	// Get the seqnum from the packet if the ack flag is set.
 	seq := seqnum.Value(0)
 	ack := seqnum.Value(0)
@@ -252,8 +252,8 @@ func replyWithReset(s *segment) {
 	}
 	sendTCP(&s.route, tcpFields{
 		id:     s.id,
-		ttl:    s.route.DefaultTTL(),
-		tos:    stack.DefaultTOS,
+		ttl:    ttl,
+		tos:    tos,
 		flags:  flags,
 		seq:    seq,
 		ack:    ack,
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 74fb6e064..ab1014c7f 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -1034,8 +1034,8 @@ func TestSendRstOnListenerRxAckV6(t *testing.T) {
 		checker.SeqNum(200)))
 }
 
-// TestListenShutdown tests for the listening endpoint not processing
-// any receive when it is on read shutdown.
+// TestListenShutdown tests for the listening endpoint replying with RST
+// on read shutdown.
 func TestListenShutdown(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
@@ -1046,7 +1046,7 @@ func TestListenShutdown(t *testing.T) {
 		t.Fatal("Bind failed:", err)
 	}
 
-	if err := c.EP.Listen(10 /* backlog */); err != nil {
+	if err := c.EP.Listen(1 /* backlog */); err != nil {
 		t.Fatal("Listen failed:", err)
 	}
 
@@ -1054,9 +1054,6 @@ func TestListenShutdown(t *testing.T) {
 		t.Fatal("Shutdown failed:", err)
 	}
 
-	// Wait for the endpoint state to be propagated.
-	time.Sleep(10 * time.Millisecond)
-
 	c.SendPacket(nil, &context.Headers{
 		SrcPort: context.TestPort,
 		DstPort: context.StackPort,
@@ -1065,7 +1062,12 @@ func TestListenShutdown(t *testing.T) {
 		AckNum:  200,
 	})
 
-	c.CheckNoPacket("Packet received when listening socket was shutdown")
+	// Expect the listening endpoint to reset the connection.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+		))
 }
 
 // TestListenCloseWhileConnect tests for the listening endpoint to
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index cd84e633a..d3000dbc6 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -319,6 +319,75 @@ TEST_P(SocketInetLoopbackTest, TCPListenUnbound) {
   tcpSimpleConnectTest(listener, connector, false);
 }
 
+TEST_P(SocketInetLoopbackTest, TCPListenShutdown) {
+  auto const& param = GetParam();
+
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  constexpr int kBacklog = 2;
+  constexpr int kFDs = kBacklog + 1;
+
+  // Create the listening socket.
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+
+  // Shutdown the write of the listener, expect to not have any effect.
+  ASSERT_THAT(shutdown(listen_fd.get(), SHUT_WR), SyscallSucceeds());
+
+  for (int i = 0; i < kFDs; i++) {
+    auto client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    ASSERT_THAT(connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                        connector.addr_len),
+                SyscallSucceeds());
+    ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr), SyscallSucceeds());
+  }
+
+  // Shutdown the read of the listener, expect to fail subsequent
+  // server accepts, binds and client connects.
+  ASSERT_THAT(shutdown(listen_fd.get(), SHUT_RD), SyscallSucceeds());
+
+  ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EINVAL));
+
+  // Check that shutdown did not release the port.
+  FileDescriptor new_listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  ASSERT_THAT(
+      bind(new_listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+           listener.addr_len),
+      SyscallFailsWithErrno(EADDRINUSE));
+
+  // Check that subsequent connection attempts receive a RST.
+  auto client = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  for (int i = 0; i < kFDs; i++) {
+    auto client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    ASSERT_THAT(connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                        connector.addr_len),
+                SyscallFailsWithErrno(ECONNREFUSED));
+  }
+}
+
 TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   auto const& param = GetParam();
 
@@ -365,9 +434,8 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   }
 }
 
-TEST_P(SocketInetLoopbackTest, TCPListenCloseWhileConnect) {
-  auto const& param = GetParam();
-
+void TestListenWhileConnect(const TestParam& param,
+                            void (*stopListen)(FileDescriptor&)) {
   TestAddress const& listener = param.listener;
   TestAddress const& connector = param.connector;
 
@@ -404,8 +472,8 @@ TEST_P(SocketInetLoopbackTest, TCPListenCloseWhileConnect) {
       clients.push_back(std::move(client));
     }
   }
-  // Close the listening socket.
-  listen_fd.reset();
+
+  stopListen(listen_fd);
 
   for (auto& client : clients) {
     const int kTimeout = 10000;
@@ -420,13 +488,26 @@ TEST_P(SocketInetLoopbackTest, TCPListenCloseWhileConnect) {
     char c;
     // Subsequent read can fail with:
     // ECONNRESET: If the client connection was established and was reset by the
-    // remote. ECONNREFUSED: If the client connection failed to be established.
+    // remote.
+    // ECONNREFUSED: If the client connection failed to be established.
     ASSERT_THAT(read(client.get(), &c, sizeof(c)),
                 AnyOf(SyscallFailsWithErrno(ECONNRESET),
                       SyscallFailsWithErrno(ECONNREFUSED)));
   }
 }
 
+TEST_P(SocketInetLoopbackTest, TCPListenCloseWhileConnect) {
+  TestListenWhileConnect(GetParam(), [](FileDescriptor& f) {
+    ASSERT_THAT(close(f.release()), SyscallSucceeds());
+  });
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenShutdownWhileConnect) {
+  TestListenWhileConnect(GetParam(), [](FileDescriptor& f) {
+    ASSERT_THAT(shutdown(f.get(), SHUT_RD), SyscallSucceeds());
+  });
+}
+
 TEST_P(SocketInetLoopbackTest, TCPbacklog) {
   auto const& param = GetParam();
 
@@ -1134,6 +1215,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
               if (connects_received >= kConnectAttempts) {
                 // Another thread have shutdown our read side causing the
                 // accept to fail.
+                ASSERT_EQ(errno, EINVAL);
                 break;
               }
               ASSERT_NO_ERRNO(fd);
-- 
cgit v1.2.3


From e7dcd942acacc1f326817f5148bc41455ef1cd1d Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Thu, 16 Apr 2020 18:05:51 -0700
Subject: Properly delegate Wait

PiperOrigin-RevId: 306959393
---
 pkg/tcpip/link/sniffer/sniffer.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 03def4013..938540c14 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -238,7 +238,7 @@ func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 }
 
 // Wait implements stack.LinkEndpoint.Wait.
-func (*endpoint) Wait() {}
+func (e *endpoint) Wait() { e.lower.Wait() }
 
 func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.View, gso *stack.GSO) {
 	// Figure out the network layer info.
-- 
cgit v1.2.3


From f367cf8e67818b0ca3be6fb15b8be481635c2575 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 16 Apr 2020 18:32:15 -0700
Subject: Drop invalid NDP NA messages

Better validate NDP NAs options before updating the link address cache.

Test: stack_test.TestNeighorAdvertisementWithTargetLinkLayerOption
PiperOrigin-RevId: 306962924
---
 pkg/tcpip/network/ipv6/icmp.go     | 50 +++++++++++++++++++++++---------------
 pkg/tcpip/network/ipv6/ndp_test.go |  7 ++++++
 2 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index dc0369156..b68983d10 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -301,40 +301,38 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 		targetAddr := na.TargetAddress()
 		stack := r.Stack()
-		rxNICID := r.NICID()
 
-		if isTentative, err := stack.IsAddrTentative(rxNICID, targetAddr); err != nil {
-			// We will only get an error if rxNICID is unrecognized,
-			// which should not happen. For now short-circuit this
-			// packet.
+		if isTentative, err := stack.IsAddrTentative(e.nicID, targetAddr); err != nil {
+			// We will only get an error if the NIC is unrecognized, which should not
+			// happen. For now short-circuit this packet.
 			//
 			// TODO(b/141002840): Handle this better?
 			return
 		} else if isTentative {
-			// We just got an NA from a node that owns an address we
-			// are performing DAD on, implying the address is not
-			// unique. In this case we let the stack know so it can
-			// handle such a scenario and do nothing furthur with
+			// We just got an NA from a node that owns an address we are performing
+			// DAD on, implying the address is not unique. In this case we let the
+			// stack know so it can handle such a scenario and do nothing furthur with
 			// the NDP NA.
-			stack.DupTentativeAddrDetected(rxNICID, targetAddr)
+			stack.DupTentativeAddrDetected(e.nicID, targetAddr)
 			return
 		}
 
-		// At this point we know that the targetAddress is not tentative
-		// on rxNICID. However, targetAddr may still be assigned to
-		// rxNICID but not tentative (it could be permanent). Such a
-		// scenario is beyond the scope of RFC 4862. As such, we simply
-		// ignore such a scenario for now and proceed as normal.
+		// At this point we know that the target address is not tentative on the
+		// NIC. However, the target address may still be assigned to the NIC but not
+		// tentative (it could be permanent). Such a scenario is beyond the scope of
+		// RFC 4862. As such, we simply ignore such a scenario for now and proceed
+		// as normal.
 		//
+		// TODO(b/143147598): Handle the scenario described above. Also inform the
+		// netstack integration that a duplicate address was detected outside of
+		// DAD.
+
 		// If the NA message has the target link layer option, update the link
 		// address cache with the link address for the target of the message.
 		//
-		// TODO(b/143147598): Handle the scenario described above. Also
-		// inform the netstack integration that a duplicate address was
-		// detected outside of DAD.
-		//
 		// TODO(b/148429853): Properly process the NA message and do Neighbor
 		// Unreachability Detection.
+		var targetLinkAddr tcpip.LinkAddress
 		for {
 			opt, done, err := it.Next()
 			if err != nil {
@@ -347,10 +345,22 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 			switch opt := opt.(type) {
 			case header.NDPTargetLinkLayerAddressOption:
-				e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, opt.EthernetAddress())
+				// No RFCs define what to do when an NA message has multiple Target
+				// Link-Layer Address options. Since no interface can have multiple
+				// link-layer addresses, we consider such messages invalid.
+				if len(targetLinkAddr) != 0 {
+					received.Invalid.Increment()
+					return
+				}
+
+				targetLinkAddr = opt.EthernetAddress()
 			}
 		}
 
+		if len(targetLinkAddr) != 0 {
+			e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, targetLinkAddr)
+		}
+
 	case header.ICMPv6EchoRequest:
 		received.EchoRequest.Increment()
 		if len(v) < header.ICMPv6EchoMinimumSize {
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index 8db51da96..12b70f7e9 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -449,6 +449,13 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
 			name:    "Invalid Length",
 			optsBuf: []byte{2, 2, 2, 3, 4, 5, 6, 7},
 		},
+		{
+			name: "Multiple",
+			optsBuf: []byte{
+				2, 1, 2, 3, 4, 5, 6, 7,
+				2, 1, 2, 3, 4, 5, 6, 8,
+			},
+		},
 	}
 
 	for _, test := range tests {
-- 
cgit v1.2.3


From f03996c5e9803934226e4b3a10827501cb936ab9 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 16 Apr 2020 19:26:02 -0700
Subject: Implement pipe(2) and pipe2(2) for VFS2.

Updates #1035

PiperOrigin-RevId: 306968644
---
 pkg/sentry/fsimpl/pipefs/BUILD                     |  20 +++
 pkg/sentry/fsimpl/pipefs/pipefs.go                 | 148 +++++++++++++++++++
 pkg/sentry/fsimpl/tmpfs/filesystem.go              |   2 +-
 pkg/sentry/fsimpl/tmpfs/named_pipe.go              |  23 +--
 pkg/sentry/fsimpl/tmpfs/tmpfs.go                   |   2 +-
 pkg/sentry/kernel/BUILD                            |   1 +
 pkg/sentry/kernel/kernel.go                        |  30 +++-
 pkg/sentry/kernel/pipe/vfs.go                      | 162 ++++++++++++---------
 pkg/sentry/syscalls/linux/sys_pipe.go              |  14 +-
 pkg/sentry/syscalls/linux/vfs2/BUILD               |   3 +
 pkg/sentry/syscalls/linux/vfs2/fd.go               |  17 +++
 .../syscalls/linux/vfs2/linux64_override_amd64.go  |   4 +-
 pkg/sentry/syscalls/linux/vfs2/pipe.go             |  63 ++++++++
 pkg/sentry/syscalls/linux/vfs2/read_write.go       |   8 +-
 pkg/sentry/vfs/vfs.go                              |   2 +-
 test/syscalls/linux/pipe.cc                        |   2 +
 16 files changed, 389 insertions(+), 112 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/pipefs/BUILD
 create mode 100644 pkg/sentry/fsimpl/pipefs/pipefs.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/pipe.go

diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD
new file mode 100644
index 000000000..0d411606f
--- /dev/null
+++ b/pkg/sentry/fsimpl/pipefs/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "pipefs",
+    srcs = ["pipefs.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
new file mode 100644
index 000000000..faf3179bc
--- /dev/null
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -0,0 +1,148 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pipefs provides the filesystem implementation backing
+// Kernel.PipeMount.
+package pipefs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+type filesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name.
+func (filesystemType) Name() string {
+	return "pipefs"
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	panic("pipefs.filesystemType.GetFilesystem should never be called")
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	kernfs.Filesystem
+
+	// TODO(gvisor.dev/issue/1193):
+	//
+	// - kernfs does not provide a way to implement statfs, from which we
+	// should indicate PIPEFS_MAGIC.
+	//
+	// - kernfs does not provide a way to override names for
+	// vfs.FilesystemImpl.PrependPath(); pipefs inodes should use synthetic
+	// name fmt.Sprintf("pipe:[%d]", inode.ino).
+}
+
+// NewFilesystem sets up and returns a new vfs.Filesystem implemented by
+// pipefs.
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem {
+	fs := &filesystem{}
+	fs.Init(vfsObj, filesystemType{})
+	return fs.VFSFilesystem()
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+	kernfs.InodeNoopRefCount
+
+	pipe *pipe.VFSPipe
+
+	ino uint64
+	uid auth.KUID
+	gid auth.KGID
+	// We use the creation timestamp for all of atime, mtime, and ctime.
+	ctime ktime.Time
+}
+
+func newInode(ctx context.Context, fs *kernfs.Filesystem) *inode {
+	creds := auth.CredentialsFromContext(ctx)
+	return &inode{
+		pipe:  pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+		ino:   fs.NextIno(),
+		uid:   creds.EffectiveKUID,
+		gid:   creds.EffectiveKGID,
+		ctime: ktime.NowFromContext(ctx),
+	}
+}
+
+const pipeMode = 0600 | linux.S_IFIFO
+
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
+func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, pipeMode, i.uid, i.gid)
+}
+
+// Mode implements kernfs.Inode.Mode.
+func (i *inode) Mode() linux.FileMode {
+	return pipeMode
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds())
+	return linux.Statx{
+		Mask:    linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
+		Blksize: usermem.PageSize,
+		Nlink:   1,
+		UID:     uint32(i.uid),
+		GID:     uint32(i.gid),
+		Mode:    pipeMode,
+		Ino:     i.ino,
+		Size:    0,
+		Blocks:  0,
+		Atime:   ts,
+		Ctime:   ts,
+		Mtime:   ts,
+		// TODO(gvisor.dev/issue/1197): Device number.
+	}, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat.
+func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	return syserror.EPERM
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// FIXME(b/38173783): kernfs does not plumb Context here.
+	return i.pipe.Open(context.Background(), rp.Mount(), vfsd, opts.Flags)
+}
+
+// NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
+// and write ends of a newly-created pipe, as for pipe(2) and pipe2(2).
+//
+// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem().
+func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription) {
+	fs := mnt.Filesystem().Impl().(*kernfs.Filesystem)
+	inode := newInode(ctx, fs)
+	var d kernfs.Dentry
+	d.Init(inode)
+	defer d.DecRef()
+	return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags)
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index f4d50d64f..660f5a29b 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -392,7 +392,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 		// Can't open symlinks without O_PATH (which is unimplemented).
 		return nil, syserror.ELOOP
 	case *namedPipe:
-		return newNamedPipeFD(ctx, impl, rp, &d.vfsd, opts.Flags)
+		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags)
 	case *deviceFile:
 		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
 	case *socketFile:
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 2c5c739df..8d77b3fa8 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -16,10 +16,8 @@ package tmpfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -33,27 +31,8 @@ type namedPipe struct {
 //   * fs.mu must be locked.
 //   * rp.Mount().CheckBeginWrite() has been called successfully.
 func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
-	file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)}
+	file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
 	file.inode.init(file, fs, creds, linux.S_IFIFO|mode)
 	file.inode.nlink = 1 // Only the parent has a link.
 	return &file.inode
 }
-
-// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented
-// entirely via struct embedding.
-type namedPipeFD struct {
-	fileDescription
-
-	*pipe.VFSPipeFD
-}
-
-func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
-	var err error
-	var fd namedPipeFD
-	fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, vfsd, &fd.vfsfd, flags)
-	if err != nil {
-		return nil, err
-	}
-	fd.vfsfd.Init(&fd, flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{})
-	return &fd.vfsfd, nil
-}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 9fa8637d5..a59b24d45 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -357,6 +357,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 		return err
 	}
 	i.mu.Lock()
+	defer i.mu.Unlock()
 	var (
 		needsMtimeBump bool
 		needsCtimeBump bool
@@ -427,7 +428,6 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 		atomic.StoreInt64(&i.ctime, now)
 	}
 
-	i.mu.Unlock()
 	return nil
 }
 
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index e0ff58d8c..e47af66d6 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -170,6 +170,7 @@ go_library(
         "//pkg/sentry/fs/timerfd",
         "//pkg/sentry/fsbridge",
         "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/fsimpl/pipefs",
         "//pkg/sentry/fsimpl/sockfs",
         "//pkg/sentry/hostcpu",
         "//pkg/sentry/inet",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index de8a95854..fef60e636 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -50,6 +50,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
 	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
@@ -254,6 +255,10 @@ type Kernel struct {
 	// VFS keeps the filesystem state used across the kernel.
 	vfs vfs.VirtualFilesystem
 
+	// pipeMount is the Mount used for pipes created by the pipe() and pipe2()
+	// syscalls (as opposed to named pipes created by mknod()).
+	pipeMount *vfs.Mount
+
 	// If set to true, report address space activation waits as if the task is in
 	// external wait so that the watchdog doesn't report the task stuck.
 	SleepForAddressSpaceActivation bool
@@ -354,19 +359,29 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
 	k.futexes = futex.NewManager()
 	k.netlinkPorts = port.New()
+
 	if VFS2Enabled {
 		if err := k.vfs.Init(); err != nil {
 			return fmt.Errorf("failed to initialize VFS: %v", err)
 		}
-		fs := sockfs.NewFilesystem(&k.vfs)
-		// NewDisconnectedMount will take an additional reference on fs.
-		defer fs.DecRef()
-		sm, err := k.vfs.NewDisconnectedMount(fs, nil, &vfs.MountOptions{})
+
+		pipeFilesystem := pipefs.NewFilesystem(&k.vfs)
+		defer pipeFilesystem.DecRef()
+		pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
+		if err != nil {
+			return fmt.Errorf("failed to create pipefs mount: %v", err)
+		}
+		k.pipeMount = pipeMount
+
+		socketFilesystem := sockfs.NewFilesystem(&k.vfs)
+		defer socketFilesystem.DecRef()
+		socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})
 		if err != nil {
 			return fmt.Errorf("failed to initialize socket mount: %v", err)
 		}
-		k.socketMount = sm
+		k.socketMount = socketMount
 	}
+
 	return nil
 }
 
@@ -1613,3 +1628,8 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
 func (k *Kernel) VFS() *vfs.VirtualFilesystem {
 	return &k.vfs
 }
+
+// PipeMount returns the pipefs mount.
+func (k *Kernel) PipeMount() *vfs.Mount {
+	return k.pipeMount
+}
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index a5675bd70..b54f08a30 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -49,38 +49,42 @@ type VFSPipe struct {
 }
 
 // NewVFSPipe returns an initialized VFSPipe.
-func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe {
+func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe {
 	var vp VFSPipe
-	initPipe(&vp.pipe, true /* isNamed */, sizeBytes, atomicIOBytes)
+	initPipe(&vp.pipe, isNamed, sizeBytes, atomicIOBytes)
 	return &vp
 }
 
-// NewVFSPipeFD opens a named pipe. Named pipes have special blocking semantics
-// during open:
+// ReaderWriterPair returns read-only and write-only FDs for vp.
 //
-// "Normally, opening the FIFO blocks until the other end is opened also. A
-// process can open a FIFO in nonblocking mode. In this case, opening for
-// read-only will succeed even if no-one has opened on the write side yet,
-// opening for write-only will fail with ENXIO (no such device or address)
-// unless the other end has already been opened. Under Linux, opening a FIFO
-// for read and write will succeed both in blocking and nonblocking mode. POSIX
-// leaves this behavior undefined. This can be used to open a FIFO for writing
-// while there are no readers available." - fifo(7)
-func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
+// Preconditions: statusFlags should not contain an open access mode.
+func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) {
+	return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags)
+}
+
+// Open opens the pipe represented by vp.
+func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) {
 	vp.mu.Lock()
 	defer vp.mu.Unlock()
 
-	readable := vfs.MayReadFileWithOpenFlags(flags)
-	writable := vfs.MayWriteFileWithOpenFlags(flags)
+	readable := vfs.MayReadFileWithOpenFlags(statusFlags)
+	writable := vfs.MayWriteFileWithOpenFlags(statusFlags)
 	if !readable && !writable {
 		return nil, syserror.EINVAL
 	}
 
-	vfd, err := vp.open(vfsd, vfsfd, flags)
-	if err != nil {
-		return nil, err
-	}
+	fd := vp.newFD(mnt, vfsd, statusFlags)
 
+	// Named pipes have special blocking semantics during open:
+	//
+	// "Normally, opening the FIFO blocks until the other end is opened also. A
+	// process can open a FIFO in nonblocking mode. In this case, opening for
+	// read-only will succeed even if no-one has opened on the write side yet,
+	// opening for write-only will fail with ENXIO (no such device or address)
+	// unless the other end has already been opened. Under Linux, opening a
+	// FIFO for read and write will succeed both in blocking and nonblocking
+	// mode. POSIX leaves this behavior undefined. This can be used to open a
+	// FIFO for writing while there are no readers available." - fifo(7)
 	switch {
 	case readable && writable:
 		// Pipes opened for read-write always succeed without blocking.
@@ -89,23 +93,26 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf
 
 	case readable:
 		newHandleLocked(&vp.rWakeup)
-		// If this pipe is being opened as nonblocking and there's no
+		// If this pipe is being opened as blocking and there's no
 		// writer, we have to wait for a writer to open the other end.
-		if flags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
+		if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
+			fd.DecRef()
 			return nil, syserror.EINTR
 		}
 
 	case writable:
 		newHandleLocked(&vp.wWakeup)
 
-		if !vp.pipe.HasReaders() {
-			// Nonblocking, write-only opens fail with ENXIO when
-			// the read side isn't open yet.
-			if flags&linux.O_NONBLOCK != 0 {
+		if vp.pipe.isNamed && !vp.pipe.HasReaders() {
+			// Non-blocking, write-only opens fail with ENXIO when the read
+			// side isn't open yet.
+			if statusFlags&linux.O_NONBLOCK != 0 {
+				fd.DecRef()
 				return nil, syserror.ENXIO
 			}
 			// Wait for a reader to open the other end.
 			if !waitFor(&vp.mu, &vp.rWakeup, ctx) {
+				fd.DecRef()
 				return nil, syserror.EINTR
 			}
 		}
@@ -114,96 +121,93 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf
 		panic("invalid pipe flags: must be readable, writable, or both")
 	}
 
-	return vfd, nil
+	return fd, nil
 }
 
 // Preconditions: vp.mu must be held.
-func (vp *VFSPipe) open(vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
-	var fd VFSPipeFD
-	fd.flags = flags
-	fd.readable = vfs.MayReadFileWithOpenFlags(flags)
-	fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
-	fd.vfsfd = vfsfd
-	fd.pipe = &vp.pipe
+func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription {
+	fd := &VFSPipeFD{
+		pipe: &vp.pipe,
+	}
+	fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		DenyPRead:         true,
+		DenyPWrite:        true,
+		UseDentryMetadata: true,
+	})
 
 	switch {
-	case fd.readable && fd.writable:
+	case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable():
 		vp.pipe.rOpen()
 		vp.pipe.wOpen()
-	case fd.readable:
+	case fd.vfsfd.IsReadable():
 		vp.pipe.rOpen()
-	case fd.writable:
+	case fd.vfsfd.IsWritable():
 		vp.pipe.wOpen()
 	default:
 		panic("invalid pipe flags: must be readable, writable, or both")
 	}
 
-	return &fd, nil
+	return &fd.vfsfd
 }
 
-// VFSPipeFD implements a subset of vfs.FileDescriptionImpl for pipes. It is
-// expected that filesystesm will use this in a struct implementing
-// vfs.FileDescriptionImpl.
+// VFSPipeFD implements vfs.FileDescriptionImpl for pipes.
 type VFSPipeFD struct {
-	pipe     *Pipe
-	flags    uint32
-	readable bool
-	writable bool
-	vfsfd    *vfs.FileDescription
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	pipe *Pipe
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
 func (fd *VFSPipeFD) Release() {
 	var event waiter.EventMask
-	if fd.readable {
+	if fd.vfsfd.IsReadable() {
 		fd.pipe.rClose()
-		event |= waiter.EventIn
+		event |= waiter.EventOut
 	}
-	if fd.writable {
+	if fd.vfsfd.IsWritable() {
 		fd.pipe.wClose()
-		event |= waiter.EventOut
+		event |= waiter.EventIn | waiter.EventHUp
 	}
 	if event == 0 {
 		panic("invalid pipe flags: must be readable, writable, or both")
 	}
 
-	if fd.writable {
-		fd.vfsfd.VirtualDentry().Mount().EndWrite()
-	}
-
 	fd.pipe.Notify(event)
 }
 
-// OnClose implements vfs.FileDescriptionImpl.OnClose.
-func (fd *VFSPipeFD) OnClose(_ context.Context) error {
-	return nil
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	switch {
+	case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable():
+		return fd.pipe.rwReadiness()
+	case fd.vfsfd.IsReadable():
+		return fd.pipe.rReadiness()
+	case fd.vfsfd.IsWritable():
+		return fd.pipe.wReadiness()
+	default:
+		panic("pipe FD is neither readable nor writable")
+	}
 }
 
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *VFSPipeFD) PRead(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.ReadOptions) (int64, error) {
-	return 0, syserror.ESPIPE
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fd.pipe.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *VFSPipeFD) EventUnregister(e *waiter.Entry) {
+	fd.pipe.EventUnregister(e)
 }
 
 // Read implements vfs.FileDescriptionImpl.Read.
 func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
-	if !fd.readable {
-		return 0, syserror.EINVAL
-	}
-
 	return fd.pipe.Read(ctx, dst)
 }
 
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *VFSPipeFD) PWrite(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.WriteOptions) (int64, error) {
-	return 0, syserror.ESPIPE
-}
-
 // Write implements vfs.FileDescriptionImpl.Write.
 func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
-	if !fd.writable {
-		return 0, syserror.EINVAL
-	}
-
 	return fd.pipe.Write(ctx, src)
 }
 
@@ -211,3 +215,17 @@ func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.Wr
 func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
 	return fd.pipe.Ioctl(ctx, uio, args)
 }
+
+// PipeSize implements fcntl(F_GETPIPE_SZ).
+func (fd *VFSPipeFD) PipeSize() int64 {
+	// Inline Pipe.FifoSize() rather than calling it with nil Context and
+	// fs.File and ignoring the returned error (which is always nil).
+	fd.pipe.mu.Lock()
+	defer fd.pipe.mu.Unlock()
+	return fd.pipe.max
+}
+
+// SetPipeSize implements fcntl(F_SETPIPE_SZ).
+func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
+	return fd.pipe.SetFifoSize(size)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
index 798344042..43c510930 100644
--- a/pkg/sentry/syscalls/linux/sys_pipe.go
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -24,6 +24,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // pipe2 implements the actual system call with flags.
 func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
 	if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
@@ -45,10 +47,12 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
 	}
 
 	if _, err := t.CopyOut(addr, fds); err != nil {
-		// The files are not closed in this case, the exact semantics
-		// of this error case are not well defined, but they could have
-		// already been observed by user space.
-		return 0, syserror.EFAULT
+		for _, fd := range fds {
+			if file, _ := t.FDTable().Remove(fd); file != nil {
+				file.DecRef()
+			}
+		}
+		return 0, err
 	}
 	return 0, nil
 }
@@ -69,3 +73,5 @@ func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	n, err := pipe2(t, addr, flags)
 	return n, nil, err
 }
+
+// LINT.ThenChange(vfs2/pipe.go)
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index b32abfe59..6ff2d84d2 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -18,6 +18,7 @@ go_library(
         "linux64_override_arm64.go",
         "mmap.go",
         "path.go",
+        "pipe.go",
         "poll.go",
         "read_write.go",
         "setstat.go",
@@ -39,8 +40,10 @@ go_library(
         "//pkg/gohacks",
         "//pkg/sentry/arch",
         "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fsimpl/pipefs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/limits",
         "//pkg/sentry/loader",
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
index 3afcea665..8181d80f4 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -18,6 +18,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
 	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -140,6 +141,22 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return uintptr(file.StatusFlags()), nil, nil
 	case linux.F_SETFL:
 		return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())
+	case linux.F_SETPIPE_SZ:
+		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
+		if !ok {
+			return 0, nil, syserror.EBADF
+		}
+		n, err := pipefile.SetPipeSize(int64(args[2].Int()))
+		if err != nil {
+			return 0, nil, err
+		}
+		return uintptr(n), nil, nil
+	case linux.F_GETPIPE_SZ:
+		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
+		if !ok {
+			return 0, nil, syserror.EBADF
+		}
+		return uintptr(pipefile.PipeSize()), nil, nil
 	default:
 		// TODO(gvisor.dev/issue/1623): Everything else is not yet supported.
 		return 0, nil, syserror.EINVAL
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
index 645e0bcb8..21eb98444 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -39,7 +39,7 @@ func Override(table map[uintptr]kernel.Syscall) {
 	table[19] = syscalls.Supported("readv", Readv)
 	table[20] = syscalls.Supported("writev", Writev)
 	table[21] = syscalls.Supported("access", Access)
-	delete(table, 22) // pipe
+	table[22] = syscalls.Supported("pipe", Pipe)
 	table[23] = syscalls.Supported("select", Select)
 	table[32] = syscalls.Supported("dup", Dup)
 	table[33] = syscalls.Supported("dup2", Dup2)
@@ -151,7 +151,7 @@ func Override(table map[uintptr]kernel.Syscall) {
 	delete(table, 290) // eventfd2
 	table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
 	table[292] = syscalls.Supported("dup3", Dup3)
-	delete(table, 293) // pipe2
+	table[293] = syscalls.Supported("pipe2", Pipe2)
 	delete(table, 294) // inotify_init1
 	table[295] = syscalls.Supported("preadv", Preadv)
 	table[296] = syscalls.Supported("pwritev", Pwritev)
diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go
new file mode 100644
index 000000000..4a01e4209
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go
@@ -0,0 +1,63 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Pipe implements Linux syscall pipe(2).
+func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	return 0, nil, pipe2(t, addr, 0)
+}
+
+// Pipe2 implements Linux syscall pipe2(2).
+func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Int()
+	return 0, nil, pipe2(t, addr, flags)
+}
+
+func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error {
+	if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
+		return syserror.EINVAL
+	}
+	r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK))
+	defer r.DecRef()
+	defer w.DecRef()
+
+	fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{
+		CloseOnExec: flags&linux.O_CLOEXEC != 0,
+	})
+	if err != nil {
+		return err
+	}
+	if _, err := t.CopyOut(addr, fds); err != nil {
+		for _, fd := range fds {
+			if _, file := t.FDTable().Remove(fd); file != nil {
+				file.DecRef()
+			}
+		}
+		return err
+	}
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
index 898b190fd..6c6998f45 100644
--- a/pkg/sentry/syscalls/linux/vfs2/read_write.go
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -103,7 +103,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt
 
 		// Issue the request and break out if it completes with anything other than
 		// "would block".
-		n, err := file.Read(t, dst, opts)
+		n, err = file.Read(t, dst, opts)
 		total += n
 		if err != syserror.ErrWouldBlock {
 			break
@@ -248,7 +248,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of
 
 		// Issue the request and break out if it completes with anything other than
 		// "would block".
-		n, err := file.PRead(t, dst, offset+total, opts)
+		n, err = file.PRead(t, dst, offset+total, opts)
 		total += n
 		if err != syserror.ErrWouldBlock {
 			break
@@ -335,7 +335,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op
 
 		// Issue the request and break out if it completes with anything other than
 		// "would block".
-		n, err := file.Write(t, src, opts)
+		n, err = file.Write(t, src, opts)
 		total += n
 		if err != syserror.ErrWouldBlock {
 			break
@@ -480,7 +480,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o
 
 		// Issue the request and break out if it completes with anything other than
 		// "would block".
-		n, err := file.PWrite(t, src, offset+total, opts)
+		n, err = file.PWrite(t, src, offset+total, opts)
 		total += n
 		if err != syserror.ErrWouldBlock {
 			break
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 053c6e1d1..cb5bbd781 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -335,7 +335,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
-		if err != nil {
+		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index d8e19e910..67228b66b 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -265,6 +265,8 @@ TEST_P(PipeTest, OffsetCalls) {
               SyscallFailsWithErrno(ESPIPE));
 
   struct iovec iov;
+  iov.iov_base = &buf;
+  iov.iov_len = sizeof(buf);
   EXPECT_THAT(preadv(wfd_.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE));
   EXPECT_THAT(pwritev(rfd_.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE));
 }
-- 
cgit v1.2.3


From fe001edb14e6e879ab4ebca0d2ac71d770ac8cce Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Tue, 11 Feb 2020 02:35:39 -0500
Subject: Arm64: VDSO support for signal

The vdso is enabled, so we can use the sigreturn trampolines
the vdso provides in arch module.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/loader/loader.go |  9 +++++++++
 pkg/sentry/loader/vdso.go   | 25 +++++++++++++++++++++++++
 vdso/syscalls.h             | 25 +++++++++++++------------
 vdso/vdso.cc                | 12 ++++++------
 vdso/vdso_amd64.lds         |  1 +
 5 files changed, 54 insertions(+), 18 deletions(-)

diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 9a613d6b7..57422706d 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -392,6 +392,15 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V
 	m.SetAuxv(auxv)
 	m.SetExecutable(d)
 
+	symbolValue, err := getSymbolValueFromVDSO("rt_sigreturn")
+	if err != nil {
+		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to find rt_sigreturn in vdso: %v", err), syserr.FromError(err).ToLinux())
+	}
+
+	// Found rt_sigretrun.
+	addr := uint64(vdsoAddr) + symbolValue - vdsoPrelink
+	m.SetVDSOSigReturn(addr)
+
 	ac.SetIP(uintptr(loaded.entry))
 	ac.SetStack(uintptr(stack.Bottom))
 
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index 52f446ed7..01eaefd26 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -15,9 +15,11 @@
 package loader
 
 import (
+	"bytes"
 	"debug/elf"
 	"fmt"
 	"io"
+	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/context"
@@ -37,6 +39,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+const vdsoPrelink = 0xffffffffff700000
+
 type fileContext struct {
 	context.Context
 }
@@ -218,6 +222,27 @@ type VDSO struct {
 	phdrs []elf.ProgHeader `state:".([]elfProgHeader)"`
 }
 
+// getSymbolValueFromVDSO returns the specific symbol value in vdso.so.
+func getSymbolValueFromVDSO(symbol string) (uint64, error) {
+	f, err := elf.NewFile(bytes.NewReader(vdsoBin))
+	if err != nil {
+		return 0, err
+	}
+	syms, err := f.Symbols()
+	if err != nil {
+		return 0, err
+	}
+
+	for _, sym := range syms {
+		if elf.ST_BIND(sym.Info) != elf.STB_LOCAL && sym.Section != elf.SHN_UNDEF {
+			if strings.Contains(sym.Name, symbol) {
+				return sym.Value, nil
+			}
+		}
+	}
+	return 0, fmt.Errorf("no %v in vdso.so", symbol)
+}
+
 // PrepareVDSO validates the system VDSO and returns a VDSO, containing the
 // param page for updating by the kernel.
 func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
diff --git a/vdso/syscalls.h b/vdso/syscalls.h
index b6d15a7d3..f630ae563 100644
--- a/vdso/syscalls.h
+++ b/vdso/syscalls.h
@@ -26,6 +26,9 @@
 #include <stddef.h>
 #include <sys/types.h>
 
+#define __stringify_1(x...)     #x
+#define __stringify(x...)       __stringify_1(x)
+
 namespace vdso {
 
 #if __x86_64__
@@ -51,20 +54,13 @@ static inline int sys_getcpu(unsigned* cpu, unsigned* node,
   return num;
 }
 
-#elif __aarch64__
-
-static inline int sys_rt_sigreturn(void) {
-  int num = __NR_rt_sigreturn;
-
-  asm volatile(
-      "mov x8, %0\n"
-      "svc #0    \n"
-      : "+r"(num)
-      :
-      :);
-  return num;
+static inline void sys_rt_sigreturn(void) {
+  asm volatile("movl $" __stringify(__NR_rt_sigreturn)", %eax \n"
+               "syscall \n");
 }
 
+#elif __aarch64__
+
 static inline int sys_clock_gettime(clockid_t _clkid, struct timespec* _ts) {
   register struct timespec* ts asm("x1") = _ts;
   register clockid_t clkid asm("x0") = _clkid;
@@ -91,6 +87,11 @@ static inline int sys_clock_getres(clockid_t _clkid, struct timespec* _ts) {
   return ret;
 }
 
+static inline void sys_rt_sigreturn(void) {
+  asm volatile("mov x8, #" __stringify(__NR_rt_sigreturn)" \n"
+               "svc #0 \n");
+}
+
 #else
 #error "unsupported architecture"
 #endif
diff --git a/vdso/vdso.cc b/vdso/vdso.cc
index 8bb80a7a4..62f59766d 100644
--- a/vdso/vdso.cc
+++ b/vdso/vdso.cc
@@ -69,6 +69,12 @@ int __common_gettimeofday(struct timeval* tv, struct timezone* tz) {
 }
 }  // namespace
 
+// __kernel_rt_sigreturn() implements rt_sigreturn()
+extern "C" void __kernel_rt_sigreturn(unsigned long unused) {
+  // No optimizations yet, just make the real system call.
+  sys_rt_sigreturn();
+}
+
 #if __x86_64__
 
 // __vdso_clock_gettime() implements clock_gettime()
@@ -139,12 +145,6 @@ extern "C" int __kernel_clock_getres(clockid_t clock, struct timespec* res) {
   return ret;
 }
 
-// __kernel_rt_sigreturn() implements gettimeofday()
-extern "C" int __kernel_rt_sigreturn(unsigned long unused) {
-  // No optimizations yet, just make the real system call.
-  return sys_rt_sigreturn();
-}
-
 #else
 #error "unsupported architecture"
 #endif
diff --git a/vdso/vdso_amd64.lds b/vdso/vdso_amd64.lds
index e2615ae9e..d114290da 100644
--- a/vdso/vdso_amd64.lds
+++ b/vdso/vdso_amd64.lds
@@ -95,6 +95,7 @@ VERSION {
     __vdso_getcpu;
     time;
     __vdso_time;
+    __kernel_rt_sigreturn;
 
   local: *;
   };
-- 
cgit v1.2.3


From b4de018a67f5b5cb5ffc782c915107e1402ed833 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Fri, 17 Apr 2020 06:40:23 -0700
Subject: Permit setting unknown options

This previously changed in 305699233, but this behaviour turned out to
be load bearing.

PiperOrigin-RevId: 307033802
---
 pkg/sentry/socket/unix/transport/unix.go | 2 --
 pkg/tcpip/transport/icmp/endpoint.go     | 4 +---
 pkg/tcpip/transport/tcp/endpoint.go      | 4 ----
 pkg/tcpip/transport/udp/endpoint.go      | 4 ----
 4 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 1f3880cc5..2f1b127df 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -850,7 +850,6 @@ func (e *baseEndpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Erro
 	case tcpip.ReuseAddressOption:
 	default:
 		log.Warningf("Unsupported socket option: %d", opt)
-		return tcpip.ErrUnknownProtocolOption
 	}
 	return nil
 }
@@ -861,7 +860,6 @@ func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	case tcpip.ReceiveBufferSizeOption:
 	default:
 		log.Warningf("Unsupported socket option: %d", opt)
-		return tcpip.ErrUnknownProtocolOption
 	}
 	return nil
 }
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 3a133eef9..feef8dca0 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -353,7 +353,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 
 // SetSockOptBool sets a socket option. Currently not supported.
 func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
-	return tcpip.ErrUnknownProtocolOption
+	return nil
 }
 
 // SetSockOptInt sets a socket option. Currently not supported.
@@ -364,8 +364,6 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		e.ttl = uint8(v)
 		e.mu.Unlock()
 
-	default:
-		return tcpip.ErrUnknownProtocolOption
 	}
 	return nil
 }
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 5d0ea9e93..7e8def82d 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1477,8 +1477,6 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		e.LockUser()
 		e.v6only = v
 		e.UnlockUser()
-	default:
-		return tcpip.ErrUnknownProtocolOption
 	}
 
 	return nil
@@ -1592,8 +1590,6 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		e.ttl = uint8(v)
 		e.UnlockUser()
 
-	default:
-		return tcpip.ErrUnknownProtocolOption
 	}
 	return nil
 }
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 492cc1fcb..edb54f0be 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -553,8 +553,6 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		}
 
 		e.v6only = v
-	default:
-		return tcpip.ErrUnknownProtocolOption
 	}
 
 	return nil
@@ -586,8 +584,6 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	case tcpip.ReceiveBufferSizeOption:
 	case tcpip.SendBufferSizeOption:
 
-	default:
-		return tcpip.ErrUnknownProtocolOption
 	}
 
 	return nil
-- 
cgit v1.2.3


From 6c225ea2d59d3287484fe3eeddffc1d877a5972a Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Fri, 17 Apr 2020 09:04:02 -0700
Subject: Allow caller-defined sinks for packet sniffing.

PiperOrigin-RevId: 307053624
---
 pkg/tcpip/link/sniffer/sniffer.go | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 938540c14..be2537a82 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -24,7 +24,6 @@ import (
 	"encoding/binary"
 	"fmt"
 	"io"
-	"os"
 	"sync/atomic"
 	"time"
 
@@ -41,12 +40,12 @@ import (
 // LogPackets must be accessed atomically.
 var LogPackets uint32 = 1
 
-// LogPacketsToFile is a flag used to enable or disable logging packets to a
-// pcap file. Valid values are 0 or 1. A file must have been specified when the
+// LogPacketsToPCAP is a flag used to enable or disable logging packets to a
+// pcap writer. Valid values are 0 or 1. A writer must have been specified when the
 // sniffer was created for this flag to have effect.
 //
-// LogPacketsToFile must be accessed atomically.
-var LogPacketsToFile uint32 = 1
+// LogPacketsToPCAP must be accessed atomically.
+var LogPacketsToPCAP uint32 = 1
 
 var transportProtocolMinSizes map[tcpip.TransportProtocolNumber]int = map[tcpip.TransportProtocolNumber]int{
 	header.ICMPv4ProtocolNumber: header.IPv4MinimumSize,
@@ -58,7 +57,7 @@ var transportProtocolMinSizes map[tcpip.TransportProtocolNumber]int = map[tcpip.
 type endpoint struct {
 	dispatcher stack.NetworkDispatcher
 	lower      stack.LinkEndpoint
-	file       *os.File
+	writer     io.Writer
 	maxPCAPLen uint32
 }
 
@@ -98,23 +97,22 @@ func writePCAPHeader(w io.Writer, maxLen uint32) error {
 	})
 }
 
-// NewWithFile creates a new sniffer link-layer endpoint. It wraps around
-// another endpoint and logs packets and they traverse the endpoint.
+// NewWithWriter creates a new sniffer link-layer endpoint. It wraps around
+// another endpoint and logs packets as they traverse the endpoint.
 //
-// Packets can be logged to file in the pcap format. A sniffer created
-// with this function will not emit packets using the standard log
-// package.
+// Packets are logged to writer in the pcap format. A sniffer created with this
+// function will not emit packets using the standard log package.
 //
 // snapLen is the maximum amount of a packet to be saved. Packets with a length
-// less than or equal too snapLen will be saved in their entirety. Longer
+// less than or equal to snapLen will be saved in their entirety. Longer
 // packets will be truncated to snapLen.
-func NewWithFile(lower stack.LinkEndpoint, file *os.File, snapLen uint32) (stack.LinkEndpoint, error) {
-	if err := writePCAPHeader(file, snapLen); err != nil {
+func NewWithWriter(lower stack.LinkEndpoint, writer io.Writer, snapLen uint32) (stack.LinkEndpoint, error) {
+	if err := writePCAPHeader(writer, snapLen); err != nil {
 		return nil, err
 	}
 	return &endpoint{
 		lower:      lower,
-		file:       file,
+		writer:     writer,
 		maxPCAPLen: snapLen,
 	}, nil
 }
@@ -171,21 +169,21 @@ func (e *endpoint) GSOMaxSize() uint32 {
 }
 
 func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
-	file := e.file
-	if file == nil && atomic.LoadUint32(&LogPackets) == 1 {
+	writer := e.writer
+	if writer == nil && atomic.LoadUint32(&LogPackets) == 1 {
 		first := pkt.Header.View()
 		if len(first) == 0 {
 			first = pkt.Data.First()
 		}
 		logPacket(prefix, protocol, first, gso)
 	}
-	if file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
+	if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 {
 		totalLength := pkt.Header.UsedLength() + pkt.Data.Size()
 		length := totalLength
 		if max := int(e.maxPCAPLen); length > max {
 			length = max
 		}
-		if err := binary.Write(file, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(totalLength))); err != nil {
+		if err := binary.Write(writer, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(totalLength))); err != nil {
 			panic(err)
 		}
 		write := func(b []byte) {
@@ -193,7 +191,7 @@ func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.Netw
 				b = b[:length]
 			}
 			for len(b) != 0 {
-				n, err := file.Write(b)
+				n, err := writer.Write(b)
 				if err != nil {
 					panic(err)
 				}
-- 
cgit v1.2.3


From 4a818d64378f16f3738ba51c7804cff90f753b1d Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 17 Apr 2020 10:33:54 -0700
Subject: proc net test: Annotate disable-save test with NoRandomSave.

PiperOrigin-RevId: 307069884
---
 test/syscalls/linux/proc_net.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index 4e23d1e78..cac394910 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -353,7 +353,7 @@ TEST(ProcNetSnmp, UdpNoPorts_NoRandomSave) {
   EXPECT_EQ(oldNoPorts, newNoPorts - 1);
 }
 
-TEST(ProcNetSnmp, UdpIn) {
+TEST(ProcNetSnmp, UdpIn_NoRandomSave) {
   // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
   const DisableSave ds;
 
-- 
cgit v1.2.3


From 12bde95635ac266aab8087b4705372bb177638f3 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Fri, 17 Apr 2020 10:38:04 -0700
Subject: Get /bin/true to run on VFS2

Included:
- loader_test.go RunTest and TestStartSignal VFS2
- container_test.go TestAppExitStatus on VFS2
- experimental flag added to runsc to turn on VFS2

Note: shared mounts are not yet supported.
PiperOrigin-RevId: 307070753
---
 pkg/sentry/kernel/syscalls.go     |   7 +
 runsc/boot/BUILD                  |  11 ++
 runsc/boot/config.go              |   5 +
 runsc/boot/fds.go                 |  33 ++++
 runsc/boot/fs.go                  |   9 +-
 runsc/boot/loader.go              |  31 +++-
 runsc/boot/loader_amd64.go        |   5 +-
 runsc/boot/loader_arm64.go        |   5 +-
 runsc/boot/loader_test.go         |  37 ++++-
 runsc/boot/user.go                |  64 ++++++++
 runsc/boot/vfs.go                 | 310 ++++++++++++++++++++++++++++++++++++++
 runsc/container/container_test.go |  14 +-
 runsc/main.go                     |   3 +
 13 files changed, 513 insertions(+), 21 deletions(-)
 create mode 100644 runsc/boot/vfs.go

diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 2e3565747..84156d5a1 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -326,6 +326,13 @@ func RegisterSyscallTable(s *SyscallTable) {
 	allSyscallTables = append(allSyscallTables, s)
 }
 
+// FlushSyscallTablesTestOnly flushes the syscall tables for tests. Used for
+// parameterized VFSv2 tests.
+// TODO(gvisor.dv/issue/1624): Remove when VFS1 is no longer supported.
+func FlushSyscallTablesTestOnly() {
+	allSyscallTables = nil
+}
+
 // Lookup returns the syscall implementation, if one exists.
 func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
 	if sysno < uintptr(len(s.lookup)) {
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 23f42382f..5451f1eba 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -21,6 +21,7 @@ go_library(
         "network.go",
         "strace.go",
         "user.go",
+        "vfs.go",
     ],
     visibility = [
         "//runsc:__subpackages__",
@@ -33,6 +34,7 @@ go_library(
         "//pkg/control/server",
         "//pkg/cpuid",
         "//pkg/eventchannel",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/memutil",
         "//pkg/rand",
@@ -40,6 +42,7 @@ go_library(
         "//pkg/sentry/arch",
         "//pkg/sentry/arch:registers_go_proto",
         "//pkg/sentry/control",
+        "//pkg/sentry/devices/memdev",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/dev",
         "//pkg/sentry/fs/gofer",
@@ -49,6 +52,12 @@ go_library(
         "//pkg/sentry/fs/sys",
         "//pkg/sentry/fs/tmpfs",
         "//pkg/sentry/fs/tty",
+        "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/fsimpl/gofer",
+        "//pkg/sentry/fsimpl/host",
+        "//pkg/sentry/fsimpl/proc",
+        "//pkg/sentry/fsimpl/sys",
+        "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel:uncaught_signal_go_proto",
@@ -71,6 +80,7 @@ go_library(
         "//pkg/sentry/time",
         "//pkg/sentry/unimpl:unimplemented_syscall_go_proto",
         "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
         "//pkg/sentry/watchdog",
         "//pkg/sync",
         "//pkg/syserror",
@@ -114,6 +124,7 @@ go_test(
         "//pkg/p9",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sync",
         "//pkg/unet",
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 7ea5bfade..715a19112 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -305,5 +305,10 @@ func (c *Config) ToFlags() []string {
 	if len(c.TestOnlyTestNameEnv) != 0 {
 		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
 	}
+
+	if c.VFS2 {
+		f = append(f, "--vfs2=true")
+	}
+
 	return f
 }
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
index 5314b0f2a..7e49f6f9f 100644
--- a/runsc/boot/fds.go
+++ b/runsc/boot/fds.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
+	vfshost "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 )
 
@@ -31,6 +32,10 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
 		return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
 	}
 
+	if kernel.VFS2Enabled {
+		return createFDTableVFS2(ctx, console, stdioFDs)
+	}
+
 	k := kernel.KernelFromContext(ctx)
 	fdTable := k.NewFDTable()
 	defer fdTable.DecRef()
@@ -78,3 +83,31 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
 	fdTable.IncRef()
 	return fdTable, nil
 }
+
+func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) {
+	k := kernel.KernelFromContext(ctx)
+	fdTable := k.NewFDTable()
+	defer fdTable.DecRef()
+
+	hostMount, err := vfshost.NewMount(k.VFS())
+	if err != nil {
+		return nil, fmt.Errorf("creating host mount: %w", err)
+	}
+
+	for appFD, hostFD := range stdioFDs {
+		// TODO(gvisor.dev/issue/1482): Add TTY support.
+		appFile, err := vfshost.ImportFD(hostMount, hostFD, false)
+		if err != nil {
+			return nil, err
+		}
+
+		if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
+			appFile.DecRef()
+			return nil, err
+		}
+		appFile.DecRef()
+	}
+
+	fdTable.IncRef()
+	return fdTable, nil
+}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 82cc612d2..98cce60af 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -278,6 +278,9 @@ func subtargets(root string, mnts []specs.Mount) []string {
 }
 
 func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	if conf.VFS2 {
+		return setupContainerVFS2(ctx, conf, mntr, procArgs)
+	}
 	mns, err := mntr.setupFS(conf, procArgs)
 	if err != nil {
 		return err
@@ -573,6 +576,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 // should be mounted (e.g. a volume shared between containers). It must be
 // called for the root container only.
 func (c *containerMounter) processHints(conf *Config) error {
+	if conf.VFS2 {
+		return nil
+	}
 	ctx := c.k.SupervisorContext()
 	for _, hint := range c.hints.mounts {
 		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
@@ -781,9 +787,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
 
 	default:
-		// TODO(nlacasse): Support all the mount types and make this a fatal error.
-		// Most applications will "just work" without them, so this is a warning
-		// for now.
 		log.Warningf("ignoring unknown filesystem type %q", m.Type)
 	}
 	return fsName, opts, useOverlay, nil
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 654441f65..cf1f47bc7 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -26,7 +26,6 @@ import (
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
@@ -73,6 +72,8 @@ import (
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
 )
 
+var syscallTable *kernel.SyscallTable
+
 // Loader keeps state needed to start the kernel and run the container..
 type Loader struct {
 	// k is the kernel.
@@ -195,13 +196,14 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("setting up memory usage: %v", err)
 	}
 
-	if args.Conf.VFS2 {
-		st, ok := kernel.LookupSyscallTable(abi.Linux, arch.Host)
-		if ok {
-			vfs2.Override(st.Table)
-		}
+	// Patch the syscall table.
+	kernel.VFS2Enabled = args.Conf.VFS2
+	if kernel.VFS2Enabled {
+		vfs2.Override(syscallTable.Table)
 	}
 
+	kernel.RegisterSyscallTable(syscallTable)
+
 	// Create kernel and platform.
 	p, err := createPlatform(args.Conf, args.Device)
 	if err != nil {
@@ -392,11 +394,16 @@ func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.
 		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err)
 	}
 
+	wd := spec.Process.Cwd
+	if wd == "" {
+		wd = "/"
+	}
+
 	// Create the process arguments.
 	procArgs := kernel.CreateProcessArgs{
 		Argv:                    spec.Process.Args,
 		Envv:                    spec.Process.Env,
-		WorkingDirectory:        spec.Process.Cwd, // Defaults to '/' if empty.
+		WorkingDirectory:        wd,
 		Credentials:             creds,
 		Umask:                   0022,
 		Limits:                  ls,
@@ -541,7 +548,15 @@ func (l *Loader) run() error {
 		}
 
 		// Add the HOME enviroment variable if it is not already set.
-		envv, err := maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+		var envv []string
+		if kernel.VFS2Enabled {
+			envv, err = maybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
+				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+
+		} else {
+			envv, err = maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
+				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+		}
 		if err != nil {
 			return err
 		}
diff --git a/runsc/boot/loader_amd64.go b/runsc/boot/loader_amd64.go
index b9669f2ac..78df86611 100644
--- a/runsc/boot/loader_amd64.go
+++ b/runsc/boot/loader_amd64.go
@@ -17,11 +17,10 @@
 package boot
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
 )
 
 func init() {
-	// Register the global syscall table.
-	kernel.RegisterSyscallTable(linux.AMD64)
+	// Set the global syscall table.
+	syscallTable = linux.AMD64
 }
diff --git a/runsc/boot/loader_arm64.go b/runsc/boot/loader_arm64.go
index cf64d28c8..250785010 100644
--- a/runsc/boot/loader_arm64.go
+++ b/runsc/boot/loader_arm64.go
@@ -17,11 +17,10 @@
 package boot
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
 )
 
 func init() {
-	// Register the global syscall table.
-	kernel.RegisterSyscallTable(linux.ARM64)
+	// Set the global syscall table.
+	syscallTable = linux.ARM64
 }
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index c9a75b76d..e7c71734f 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -30,6 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/runsc/fsgofer"
@@ -66,6 +67,11 @@ func testSpec() *specs.Spec {
 	}
 }
 
+func resetSyscallTable() {
+	kernel.VFS2Enabled = false
+	kernel.FlushSyscallTablesTestOnly()
+}
+
 // startGofer starts a new gofer routine serving 'root' path. It returns the
 // sandbox side of the connection, and a function that when called will stop the
 // gofer.
@@ -101,7 +107,7 @@ func startGofer(root string) (int, func(), error) {
 	return sandboxEnd, cleanup, nil
 }
 
-func createLoader() (*Loader, func(), error) {
+func createLoader(vfsEnabled bool) (*Loader, func(), error) {
 	fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10]))
 	if err != nil {
 		return nil, nil, err
@@ -109,6 +115,8 @@ func createLoader() (*Loader, func(), error) {
 	conf := testConfig()
 	spec := testSpec()
 
+	conf.VFS2 = vfsEnabled
+
 	sandEnd, cleanup, err := startGofer(spec.Root.Path)
 	if err != nil {
 		return nil, nil, err
@@ -142,10 +150,22 @@ func createLoader() (*Loader, func(), error) {
 
 // TestRun runs a simple application in a sandbox and checks that it succeeds.
 func TestRun(t *testing.T) {
-	l, cleanup, err := createLoader()
+	defer resetSyscallTable()
+	doRun(t, false)
+}
+
+// TestRunVFS2 runs TestRun in VFSv2.
+func TestRunVFS2(t *testing.T) {
+	defer resetSyscallTable()
+	doRun(t, true)
+}
+
+func doRun(t *testing.T, vfsEnabled bool) {
+	l, cleanup, err := createLoader(vfsEnabled)
 	if err != nil {
 		t.Fatalf("error creating loader: %v", err)
 	}
+
 	defer l.Destroy()
 	defer cleanup()
 
@@ -179,7 +199,18 @@ func TestRun(t *testing.T) {
 // TestStartSignal tests that the controller Start message will cause
 // WaitForStartSignal to return.
 func TestStartSignal(t *testing.T) {
-	l, cleanup, err := createLoader()
+	defer resetSyscallTable()
+	doStartSignal(t, false)
+}
+
+// TestStartSignalVFS2 does TestStartSignal with VFS2.
+func TestStartSignalVFS2(t *testing.T) {
+	defer resetSyscallTable()
+	doStartSignal(t, true)
+}
+
+func doStartSignal(t *testing.T, vfsEnabled bool) {
+	l, cleanup, err := createLoader(vfsEnabled)
 	if err != nil {
 		t.Fatalf("error creating loader: %v", err)
 	}
diff --git a/runsc/boot/user.go b/runsc/boot/user.go
index f0aa52135..332e4fce5 100644
--- a/runsc/boot/user.go
+++ b/runsc/boot/user.go
@@ -23,8 +23,10 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -84,6 +86,48 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K
 		File: f,
 	}
 
+	return findHomeInPasswd(uint32(uid), r, defaultHome)
+}
+
+type fileReaderVFS2 struct {
+	ctx context.Context
+	fd  *vfs.FileDescription
+}
+
+func (r *fileReaderVFS2) Read(buf []byte) (int, error) {
+	n, err := r.fd.Read(r.ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+	return int(n), err
+}
+
+func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.KUID) (string, error) {
+	const defaultHome = "/"
+
+	root := mns.Root()
+	defer root.DecRef()
+
+	creds := auth.CredentialsFromContext(ctx)
+
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse("/etc/passwd"),
+	}
+
+	opts := &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	}
+
+	fd, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, target, opts)
+	if err != nil {
+		return defaultHome, nil
+	}
+	defer fd.DecRef()
+
+	r := &fileReaderVFS2{
+		ctx: ctx,
+		fd:  fd,
+	}
+
 	homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome)
 	if err != nil {
 		return "", err
@@ -111,6 +155,26 @@ func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.
 	if err != nil {
 		return nil, fmt.Errorf("error reading exec user: %v", err)
 	}
+
+	return append(envv, "HOME="+homeDir), nil
+}
+
+func maybeAddExecUserHomeVFS2(ctx context.Context, vmns *vfs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
+	// Check if the envv already contains HOME.
+	for _, env := range envv {
+		if strings.HasPrefix(env, "HOME=") {
+			// We have it. Return the original slice unmodified.
+			return envv, nil
+		}
+	}
+
+	// Read /etc/passwd for the user's HOME directory and set the HOME
+	// environment variable as required by POSIX if it is not overridden by
+	// the user.
+	homeDir, err := getExecUserHomeVFS2(ctx, vmns, uid)
+	if err != nil {
+		return nil, fmt.Errorf("error reading exec user: %v", err)
+	}
 	return append(envv, "HOME="+homeDir), nil
 }
 
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
new file mode 100644
index 000000000..82083c57d
--- /dev/null
+++ b/runsc/boot/vfs.go
@@ -0,0 +1,310 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"path"
+	"strconv"
+	"strings"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	devtmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	goferimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
+	procimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
+	sysimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
+	tmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
+
+	vfsObj.MustRegisterFilesystemType(rootFsName, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList: true,
+	})
+
+	vfsObj.MustRegisterFilesystemType(bind, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList: true,
+	})
+
+	vfsObj.MustRegisterFilesystemType(devpts, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+
+	vfsObj.MustRegisterFilesystemType(devtmpfs, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(proc, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(sysfs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(tmpfs, &tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(nonefs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+
+	// Setup files in devtmpfs.
+	if err := memdev.Register(vfsObj); err != nil {
+		return fmt.Errorf("registering memdev: %w", err)
+	}
+	a, err := devtmpfsimpl.NewAccessor(ctx, vfsObj, creds, devtmpfsimpl.Name)
+	if err != nil {
+		return fmt.Errorf("creating devtmpfs accessor: %w", err)
+	}
+	defer a.Release()
+
+	if err := a.UserspaceInit(ctx); err != nil {
+		return fmt.Errorf("initializing userspace: %w", err)
+	}
+	if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
+		return fmt.Errorf("creating devtmpfs files: %w", err)
+	}
+	return nil
+}
+
+func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	if err := mntr.k.VFS().Init(); err != nil {
+		return fmt.Errorf("failed to initialize VFS: %w", err)
+	}
+	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+	if err != nil {
+		return fmt.Errorf("failed to setupFS: %w", err)
+	}
+	procArgs.MountNamespaceVFS2 = mns
+	return setExecutablePathVFS2(ctx, procArgs)
+}
+
+func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
+
+	exe := procArgs.Argv[0]
+
+	// Absolute paths can be used directly.
+	if path.IsAbs(exe) {
+		procArgs.Filename = exe
+		return nil
+	}
+
+	// Paths with '/' in them should be joined to the working directory, or
+	// to the root if working directory is not set.
+	if strings.IndexByte(exe, '/') > 0 {
+
+		if !path.IsAbs(procArgs.WorkingDirectory) {
+			return fmt.Errorf("working directory %q must be absolute", procArgs.WorkingDirectory)
+		}
+
+		procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe)
+		return nil
+	}
+
+	// Paths with a '/' are relative to the CWD.
+	if strings.IndexByte(exe, '/') > 0 {
+		procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe)
+		return nil
+	}
+
+	// Otherwise, We must lookup the name in the paths, starting from the
+	// root directory.
+	root := procArgs.MountNamespaceVFS2.Root()
+	defer root.DecRef()
+
+	paths := fs.GetPath(procArgs.Envv)
+	creds := procArgs.Credentials
+
+	for _, p := range paths {
+
+		binPath := path.Join(p, exe)
+
+		pop := &vfs.PathOperation{
+			Root:               root,
+			Start:              root,
+			Path:               fspath.Parse(binPath),
+			FollowFinalSymlink: true,
+		}
+
+		opts := &vfs.OpenOptions{
+			FileExec: true,
+			Flags:    linux.O_RDONLY,
+		}
+
+		dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts)
+		if err == syserror.ENOENT || err == syserror.EACCES {
+			// Didn't find it here.
+			continue
+		}
+		if err != nil {
+			return err
+		}
+		dentry.DecRef()
+
+		procArgs.Filename = binPath
+		return nil
+	}
+
+	return fmt.Errorf("executable %q not found in $PATH=%q", exe, strings.Join(paths, ":"))
+}
+
+func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+	log.Infof("Configuring container's file system with VFS2")
+
+	// Create context with root credentials to mount the filesystem (the current
+	// user may not be privileged enough).
+	rootProcArgs := *procArgs
+	rootProcArgs.WorkingDirectory = "/"
+	rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+	rootProcArgs.Umask = 0022
+	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
+	rootCtx := procArgs.NewContext(c.k)
+
+	creds := procArgs.Credentials
+	if err := registerFilesystems(rootCtx, c.k.VFS(), creds); err != nil {
+		return nil, fmt.Errorf("register filesystems: %w", err)
+	}
+
+	fd := c.fds.remove()
+
+	opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",")
+
+	log.Infof("Mounting root over 9P, ioFD: %d", fd)
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts})
+	if err != nil {
+		return nil, fmt.Errorf("setting up mountnamespace: %w", err)
+	}
+
+	rootProcArgs.MountNamespaceVFS2 = mns
+
+	// Mount submounts.
+	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, creds); err != nil {
+		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
+	}
+
+	return mns, nil
+}
+
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+
+	for _, submount := range c.mounts {
+		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
+		if err := c.mountSubmountVFS2(ctx, conf, mns, creds, &submount); err != nil {
+			return err
+		}
+	}
+
+	// TODO(gvisor.dev/issue/1487): implement mountTmp from fs.go.
+
+	return c.checkDispenser()
+}
+
+// TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1 version.
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *specs.Mount) error {
+	root := mns.Root()
+	defer root.DecRef()
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(submount.Destination),
+	}
+
+	_, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount)
+	if err != nil {
+		return fmt.Errorf("mountOptions failed: %w", err)
+	}
+
+	opts := &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			Data: strings.Join(options, ","),
+		},
+		InternalMount: true,
+	}
+
+	// All writes go to upper, be paranoid and make lower readonly.
+	opts.ReadOnly = useOverlay
+
+	if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil {
+		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+	}
+	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts)
+	return nil
+}
+
+// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m specs.Mount) (string, []string, bool, error) {
+	var (
+		fsName     string
+		opts       []string
+		useOverlay bool
+	)
+
+	switch m.Type {
+	case devpts, devtmpfs, proc, sysfs:
+		fsName = m.Type
+	case nonefs:
+		fsName = sysfs
+	case tmpfs:
+		fsName = m.Type
+
+		var err error
+		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+		if err != nil {
+			return "", nil, false, err
+		}
+
+	case bind:
+		fd := c.fds.remove()
+		fsName = "9p"
+		opts = p9MountOptionsVFS2(fd, c.getMountAccessType(m))
+		// If configured, add overlay to all writable mounts.
+		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
+	default:
+		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+	}
+	return fsName, opts, useOverlay, nil
+}
+
+// p9MountOptions creates a slice of options for a p9 mount.
+// TODO(gvisor.dev/issue/1200): Remove this version in favor of the one in
+// fs.go when privateunixsocket lands.
+func p9MountOptionsVFS2(fd int, fa FileAccessType) []string {
+	opts := []string{
+		"trans=fd",
+		"rfdno=" + strconv.Itoa(fd),
+		"wfdno=" + strconv.Itoa(fd),
+	}
+	if fa == FileAccessShared {
+		opts = append(opts, "cache=remote_revalidating")
+	}
+	return opts
+}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 442e80ac0..24f9ecc35 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -521,9 +521,21 @@ func TestExePath(t *testing.T) {
 
 // Test the we can retrieve the application exit status from the container.
 func TestAppExitStatus(t *testing.T) {
+	conf := testutil.TestConfig()
+	conf.VFS2 = false
+	doAppExitStatus(t, conf)
+}
+
+// This is TestAppExitStatus for VFSv2.
+func TestAppExitStatusVFS2(t *testing.T) {
+	conf := testutil.TestConfig()
+	conf.VFS2 = true
+	doAppExitStatus(t, conf)
+}
+
+func doAppExitStatus(t *testing.T, conf *boot.Config) {
 	// First container will succeed.
 	succSpec := testutil.NewSpecWithArgs("true")
-	conf := testutil.TestConfig()
 	rootDir, bundleDir, err := testutil.SetupContainer(succSpec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
diff --git a/runsc/main.go b/runsc/main.go
index c1c78529c..9d52f3006 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -84,6 +84,7 @@ var (
 	rootless           = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
 	referenceLeakMode  = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
 	cpuNumFromQuota    = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
+	vfs2Enabled        = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
 
 	// Test flags, not to be used outside tests, ever.
 	testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
@@ -230,6 +231,7 @@ func main() {
 		ReferenceLeakMode:  refsLeakMode,
 		OverlayfsStaleRead: *overlayfsStaleRead,
 		CPUNumFromQuota:    *cpuNumFromQuota,
+		VFS2:               *vfs2Enabled,
 
 		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
 		TestOnlyTestNameEnv:                        *testOnlyTestNameEnv,
@@ -313,6 +315,7 @@ func main() {
 	log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay)
 	log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets)
 	log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
+	log.Infof("\t\tVFS2 enabled: %v", conf.VFS2)
 	log.Infof("***************************")
 
 	if *testOnlyAllowRunAsCurrentUserWithoutChroot {
-- 
cgit v1.2.3


From a551add5d8a5bf631cd9859c761e579fdb33ec82 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 13 Apr 2020 17:37:21 -0700
Subject: Remove View.First() and View.RemoveFirst()

These methods let users eaily break the VectorisedView abstraction, and
allowed netstack to slip into pseudo-enforcement of the "all headers are
in the first View" invariant. Removing them and replacing with PullUp(n)
breaks this reliance and will make it easier to add iptables support and
rework network buffer management.

The new View.PullUp(n) method is low cost in the common case, when when
all the headers fit in the first View.
---
 pkg/sentry/socket/netfilter/tcp_matcher.go |   5 +-
 pkg/sentry/socket/netfilter/udp_matcher.go |   5 +-
 pkg/tcpip/buffer/view.go                   |  55 ++++++++++----
 pkg/tcpip/buffer/view_test.go              | 113 +++++++++++++++++++++++++++++
 pkg/tcpip/link/loopback/loopback.go        |  10 +--
 pkg/tcpip/link/sharedmem/sharedmem_test.go |   2 +-
 pkg/tcpip/link/sniffer/sniffer.go          |  65 +++++++++++++----
 pkg/tcpip/network/arp/arp.go               |   5 +-
 pkg/tcpip/network/ipv4/icmp.go             |  20 +++--
 pkg/tcpip/network/ipv4/ipv4.go             |  12 ++-
 pkg/tcpip/network/ipv6/icmp.go             |  74 ++++++++++++-------
 pkg/tcpip/network/ipv6/icmp_test.go        |   3 +-
 pkg/tcpip/network/ipv6/ipv6.go             |   6 +-
 pkg/tcpip/stack/forwarder_test.go          |  13 ++--
 pkg/tcpip/stack/iptables.go                |  22 +++++-
 pkg/tcpip/stack/iptables_targets.go        |  23 ++++--
 pkg/tcpip/stack/nic.go                     |  34 +++------
 pkg/tcpip/stack/packet_buffer.go           |   4 +-
 pkg/tcpip/stack/stack_test.go              |  10 ++-
 pkg/tcpip/stack/transport_test.go          |   5 +-
 pkg/tcpip/transport/icmp/endpoint.go       |   8 +-
 pkg/tcpip/transport/tcp/segment.go         |  29 +++++---
 pkg/tcpip/transport/tcp/tcp_test.go        |   4 +-
 pkg/tcpip/transport/udp/endpoint.go        |   6 +-
 pkg/tcpip/transport/udp/protocol.go        |   9 ++-
 25 files changed, 395 insertions(+), 147 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index ff1cfd8f6..55c0f04f3 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -121,12 +121,13 @@ func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa
 		tcpHeader = header.TCP(pkt.TransportHeader)
 	} else {
 		// The TCP header hasn't been parsed yet. We have to do it here.
-		if len(pkt.Data.First()) < header.TCPMinimumSize {
+		hdr, ok := pkt.Data.PullUp(header.TCPMinimumSize)
+		if !ok {
 			// There's no valid TCP header here, so we hotdrop the
 			// packet.
 			return false, true
 		}
-		tcpHeader = header.TCP(pkt.Data.First())
+		tcpHeader = header.TCP(hdr)
 	}
 
 	// Check whether the source and destination ports are within the
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index 3359418c1..04d03d494 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -120,12 +120,13 @@ func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa
 		udpHeader = header.UDP(pkt.TransportHeader)
 	} else {
 		// The UDP header hasn't been parsed yet. We have to do it here.
-		if len(pkt.Data.First()) < header.UDPMinimumSize {
+		hdr, ok := pkt.Data.PullUp(header.UDPMinimumSize)
+		if !ok {
 			// There's no valid UDP header here, so we hotdrop the
 			// packet.
 			return false, true
 		}
-		udpHeader = header.UDP(pkt.Data.First())
+		udpHeader = header.UDP(hdr)
 	}
 
 	// Check whether the source and destination ports are within the
diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go
index 8ec5d5d5c..f01217c91 100644
--- a/pkg/tcpip/buffer/view.go
+++ b/pkg/tcpip/buffer/view.go
@@ -77,7 +77,8 @@ func NewVectorisedView(size int, views []View) VectorisedView {
 	return VectorisedView{views: views, size: size}
 }
 
-// TrimFront removes the first "count" bytes of the vectorised view.
+// TrimFront removes the first "count" bytes of the vectorised view. It panics
+// if count > vv.Size().
 func (vv *VectorisedView) TrimFront(count int) {
 	for count > 0 && len(vv.views) > 0 {
 		if count < len(vv.views[0]) {
@@ -86,7 +87,7 @@ func (vv *VectorisedView) TrimFront(count int) {
 			return
 		}
 		count -= len(vv.views[0])
-		vv.RemoveFirst()
+		vv.removeFirst()
 	}
 }
 
@@ -104,7 +105,7 @@ func (vv *VectorisedView) Read(v View) (copied int, err error) {
 		count -= len(vv.views[0])
 		copy(v[copied:], vv.views[0])
 		copied += len(vv.views[0])
-		vv.RemoveFirst()
+		vv.removeFirst()
 	}
 	if copied == 0 {
 		return 0, io.EOF
@@ -126,7 +127,7 @@ func (vv *VectorisedView) ReadToVV(dstVV *VectorisedView, count int) (copied int
 		count -= len(vv.views[0])
 		dstVV.AppendView(vv.views[0])
 		copied += len(vv.views[0])
-		vv.RemoveFirst()
+		vv.removeFirst()
 	}
 	return copied
 }
@@ -162,22 +163,37 @@ func (vv *VectorisedView) Clone(buffer []View) VectorisedView {
 	return VectorisedView{views: append(buffer[:0], vv.views...), size: vv.size}
 }
 
-// First returns the first view of the vectorised view.
-func (vv *VectorisedView) First() View {
+// PullUp returns the first "count" bytes of the vectorised view. If those
+// bytes aren't already contiguous inside the vectorised view, PullUp will
+// reallocate as needed to make them contiguous. PullUp fails and returns false
+// when count > vv.Size().
+func (vv *VectorisedView) PullUp(count int) (View, bool) {
 	if len(vv.views) == 0 {
-		return nil
+		return nil, count == 0
+	}
+	if count <= len(vv.views[0]) {
+		return vv.views[0][:count], true
+	}
+	if count > vv.size {
+		return nil, false
 	}
-	return vv.views[0]
-}
 
-// RemoveFirst removes the first view of the vectorised view.
-func (vv *VectorisedView) RemoveFirst() {
-	if len(vv.views) == 0 {
-		return
+	newFirst := NewView(count)
+	i := 0
+	for offset := 0; offset < count; i++ {
+		copy(newFirst[offset:], vv.views[i])
+		if count-offset < len(vv.views[i]) {
+			vv.views[i].TrimFront(count - offset)
+			break
+		}
+		offset += len(vv.views[i])
+		vv.views[i] = nil
 	}
-	vv.size -= len(vv.views[0])
-	vv.views[0] = nil
-	vv.views = vv.views[1:]
+	// We're guaranteed that i > 0, since count is too large for the first
+	// view.
+	vv.views[i-1] = newFirst
+	vv.views = vv.views[i-1:]
+	return newFirst, true
 }
 
 // Size returns the size in bytes of the entire content stored in the vectorised view.
@@ -225,3 +241,10 @@ func (vv *VectorisedView) Readers() []bytes.Reader {
 	}
 	return readers
 }
+
+// removeFirst panics when len(vv.views) < 1.
+func (vv *VectorisedView) removeFirst() {
+	vv.size -= len(vv.views[0])
+	vv.views[0] = nil
+	vv.views = vv.views[1:]
+}
diff --git a/pkg/tcpip/buffer/view_test.go b/pkg/tcpip/buffer/view_test.go
index 106e1994c..c56795c7b 100644
--- a/pkg/tcpip/buffer/view_test.go
+++ b/pkg/tcpip/buffer/view_test.go
@@ -16,6 +16,7 @@
 package buffer
 
 import (
+	"bytes"
 	"reflect"
 	"testing"
 )
@@ -370,3 +371,115 @@ func TestVVRead(t *testing.T) {
 		})
 	}
 }
+
+var pullUpTestCases = []struct {
+	comment string
+	in      VectorisedView
+	count   int
+	want    []byte
+	result  VectorisedView
+	ok      bool
+}{
+	{
+		comment: "simple case",
+		in:      vv(2, "12"),
+		count:   1,
+		want:    []byte("1"),
+		result:  vv(2, "12"),
+		ok:      true,
+	},
+	{
+		comment: "entire View",
+		in:      vv(2, "1", "2"),
+		count:   1,
+		want:    []byte("1"),
+		result:  vv(2, "1", "2"),
+		ok:      true,
+	},
+	{
+		comment: "spanning across two Views",
+		in:      vv(3, "1", "23"),
+		count:   2,
+		want:    []byte("12"),
+		result:  vv(3, "12", "3"),
+		ok:      true,
+	},
+	{
+		comment: "spanning across all Views",
+		in:      vv(5, "1", "23", "45"),
+		count:   5,
+		want:    []byte("12345"),
+		result:  vv(5, "12345"),
+		ok:      true,
+	},
+	{
+		comment: "count = 0",
+		in:      vv(1, "1"),
+		count:   0,
+		want:    []byte{},
+		result:  vv(1, "1"),
+		ok:      true,
+	},
+	{
+		comment: "count = size",
+		in:      vv(1, "1"),
+		count:   1,
+		want:    []byte("1"),
+		result:  vv(1, "1"),
+		ok:      true,
+	},
+	{
+		comment: "count too large",
+		in:      vv(3, "1", "23"),
+		count:   4,
+		want:    nil,
+		result:  vv(3, "1", "23"),
+		ok:      false,
+	},
+	{
+		comment: "empty vv",
+		in:      vv(0, ""),
+		count:   1,
+		want:    nil,
+		result:  vv(0, ""),
+		ok:      false,
+	},
+	{
+		comment: "empty vv, count = 0",
+		in:      vv(0, ""),
+		count:   0,
+		want:    nil,
+		result:  vv(0, ""),
+		ok:      true,
+	},
+	{
+		comment: "empty views",
+		in:      vv(3, "", "1", "", "23"),
+		count:   2,
+		want:    []byte("12"),
+		result:  vv(3, "12", "3"),
+		ok:      true,
+	},
+}
+
+func TestPullUp(t *testing.T) {
+	for _, c := range pullUpTestCases {
+		got, ok := c.in.PullUp(c.count)
+
+		// Is the return value right?
+		if ok != c.ok {
+			t.Errorf("Test %q failed when calling PullUp(%d) on %v. Got an ok of %t. Want %t",
+				c.comment, c.count, c.in, ok, c.ok)
+		}
+		if bytes.Compare(got, View(c.want)) != 0 {
+			t.Errorf("Test %q failed when calling PullUp(%d) on %v. Got %v. Want %v",
+				c.comment, c.count, c.in, got, c.want)
+		}
+
+		// Is the underlying structure right?
+		if !reflect.DeepEqual(c.in, c.result) {
+			t.Errorf("Test %q failed when calling PullUp(%d). Got vv with structure %v. Wanted %v",
+				c.comment, c.count, c.in, c.result)
+		}
+	}
+}
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index 1e2255bfa..073c84ef9 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -98,13 +98,13 @@ func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	// Reject the packet if it's shorter than an ethernet header.
-	if vv.Size() < header.EthernetMinimumSize {
+	// There should be an ethernet header at the beginning of vv.
+	hdr, ok := vv.PullUp(header.EthernetMinimumSize)
+	if !ok {
+		// Reject the packet if it's shorter than an ethernet header.
 		return tcpip.ErrBadAddress
 	}
-
-	// There should be an ethernet header at the beginning of vv.
-	linkHeader := header.Ethernet(vv.First()[:header.EthernetMinimumSize])
+	linkHeader := header.Ethernet(hdr)
 	vv.TrimFront(len(linkHeader))
 	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), stack.PacketBuffer{
 		Data:       vv,
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 27ea3f531..33f640b85 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -674,7 +674,7 @@ func TestSimpleReceive(t *testing.T) {
 		// Wait for packet to be received, then check it.
 		c.waitForPackets(1, time.After(5*time.Second), "Timeout waiting for packet")
 		c.mu.Lock()
-		rcvd := []byte(c.packets[0].vv.First())
+		rcvd := []byte(c.packets[0].vv.ToView())
 		c.packets = c.packets[:0]
 		c.mu.Unlock()
 
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index be2537a82..0799c8f4d 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -171,11 +171,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
 func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	writer := e.writer
 	if writer == nil && atomic.LoadUint32(&LogPackets) == 1 {
-		first := pkt.Header.View()
-		if len(first) == 0 {
-			first = pkt.Data.First()
-		}
-		logPacket(prefix, protocol, first, gso)
+		logPacket(prefix, protocol, pkt, gso)
 	}
 	if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 {
 		totalLength := pkt.Header.UsedLength() + pkt.Data.Size()
@@ -238,7 +234,7 @@ func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 // Wait implements stack.LinkEndpoint.Wait.
 func (e *endpoint) Wait() { e.lower.Wait() }
 
-func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.View, gso *stack.GSO) {
+func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) {
 	// Figure out the network layer info.
 	var transProto uint8
 	src := tcpip.Address("unknown")
@@ -247,28 +243,49 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 	size := uint16(0)
 	var fragmentOffset uint16
 	var moreFragments bool
+
+	// Create a clone of pkt, including any headers if present. Avoid allocating
+	// backing memory for the clone.
+	views := [8]buffer.View{}
+	vv := buffer.NewVectorisedView(0, views[:0])
+	vv.AppendView(pkt.Header.View())
+	vv.Append(pkt.Data)
+
 	switch protocol {
 	case header.IPv4ProtocolNumber:
-		ipv4 := header.IPv4(b)
+		hdr, ok := vv.PullUp(header.IPv4MinimumSize)
+		if !ok {
+			return
+		}
+		ipv4 := header.IPv4(hdr)
 		fragmentOffset = ipv4.FragmentOffset()
 		moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments
 		src = ipv4.SourceAddress()
 		dst = ipv4.DestinationAddress()
 		transProto = ipv4.Protocol()
 		size = ipv4.TotalLength() - uint16(ipv4.HeaderLength())
-		b = b[ipv4.HeaderLength():]
+		vv.TrimFront(int(ipv4.HeaderLength()))
 		id = int(ipv4.ID())
 
 	case header.IPv6ProtocolNumber:
-		ipv6 := header.IPv6(b)
+		hdr, ok := vv.PullUp(header.IPv6MinimumSize)
+		if !ok {
+			return
+		}
+		ipv6 := header.IPv6(hdr)
 		src = ipv6.SourceAddress()
 		dst = ipv6.DestinationAddress()
 		transProto = ipv6.NextHeader()
 		size = ipv6.PayloadLength()
-		b = b[header.IPv6MinimumSize:]
+		vv.TrimFront(header.IPv6MinimumSize)
 
 	case header.ARPProtocolNumber:
-		arp := header.ARP(b)
+		hdr, ok := vv.PullUp(header.ARPSize)
+		if !ok {
+			return
+		}
+		vv.TrimFront(header.ARPSize)
+		arp := header.ARP(hdr)
 		log.Infof(
 			"%s arp %v (%v) -> %v (%v) valid:%v",
 			prefix,
@@ -284,7 +301,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 
 	// We aren't guaranteed to have a transport header - it's possible for
 	// writes via raw endpoints to contain only network headers.
-	if minSize, ok := transportProtocolMinSizes[tcpip.TransportProtocolNumber(transProto)]; ok && len(b) < minSize {
+	if minSize, ok := transportProtocolMinSizes[tcpip.TransportProtocolNumber(transProto)]; ok && vv.Size() < minSize {
 		log.Infof("%s %v -> %v transport protocol: %d, but no transport header found (possible raw packet)", prefix, src, dst, transProto)
 		return
 	}
@@ -297,7 +314,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 	switch tcpip.TransportProtocolNumber(transProto) {
 	case header.ICMPv4ProtocolNumber:
 		transName = "icmp"
-		icmp := header.ICMPv4(b)
+		hdr, ok := vv.PullUp(header.ICMPv4MinimumSize)
+		if !ok {
+			break
+		}
+		icmp := header.ICMPv4(hdr)
 		icmpType := "unknown"
 		if fragmentOffset == 0 {
 			switch icmp.Type() {
@@ -330,7 +351,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 
 	case header.ICMPv6ProtocolNumber:
 		transName = "icmp"
-		icmp := header.ICMPv6(b)
+		hdr, ok := vv.PullUp(header.ICMPv6MinimumSize)
+		if !ok {
+			break
+		}
+		icmp := header.ICMPv6(hdr)
 		icmpType := "unknown"
 		switch icmp.Type() {
 		case header.ICMPv6DstUnreachable:
@@ -361,7 +386,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 
 	case header.UDPProtocolNumber:
 		transName = "udp"
-		udp := header.UDP(b)
+		hdr, ok := vv.PullUp(header.UDPMinimumSize)
+		if !ok {
+			break
+		}
+		udp := header.UDP(hdr)
 		if fragmentOffset == 0 && len(udp) >= header.UDPMinimumSize {
 			srcPort = udp.SourcePort()
 			dstPort = udp.DestinationPort()
@@ -371,7 +400,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 
 	case header.TCPProtocolNumber:
 		transName = "tcp"
-		tcp := header.TCP(b)
+		hdr, ok := vv.PullUp(header.TCPMinimumSize)
+		if !ok {
+			break
+		}
+		tcp := header.TCP(hdr)
 		if fragmentOffset == 0 && len(tcp) >= header.TCPMinimumSize {
 			offset := int(tcp.DataOffset())
 			if offset < header.TCPMinimumSize {
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 7acbfa0a8..cf73a939e 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -93,7 +93,10 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuf
 }
 
 func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
-	v := pkt.Data.First()
+	v, ok := pkt.Data.PullUp(header.ARPSize)
+	if !ok {
+		return
+	}
 	h := header.ARP(v)
 	if !h.IsValid() {
 		return
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
index c4bf1ba5c..4cbefe5ab 100644
--- a/pkg/tcpip/network/ipv4/icmp.go
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -25,7 +25,11 @@ import (
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
 func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
-	h := header.IPv4(pkt.Data.First())
+	h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return
+	}
+	hdr := header.IPv4(h)
 
 	// We don't use IsValid() here because ICMP only requires that the IP
 	// header plus 8 bytes of the transport header be included. So it's
@@ -34,12 +38,12 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 	//
 	// Drop packet if it doesn't have the basic IPv4 header or if the
 	// original source address doesn't match the endpoint's address.
-	if len(h) < header.IPv4MinimumSize || h.SourceAddress() != e.id.LocalAddress {
+	if hdr.SourceAddress() != e.id.LocalAddress {
 		return
 	}
 
-	hlen := int(h.HeaderLength())
-	if pkt.Data.Size() < hlen || h.FragmentOffset() != 0 {
+	hlen := int(hdr.HeaderLength())
+	if pkt.Data.Size() < hlen || hdr.FragmentOffset() != 0 {
 		// We won't be able to handle this if it doesn't contain the
 		// full IPv4 header, or if it's a fragment not at offset 0
 		// (because it won't have the transport header).
@@ -48,15 +52,15 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 
 	// Skip the ip header, then deliver control message.
 	pkt.Data.TrimFront(hlen)
-	p := h.TransportProtocol()
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
+	p := hdr.TransportProtocol()
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
 func (e *endpoint) handleICMP(r *stack.Route, pkt stack.PacketBuffer) {
 	stats := r.Stats()
 	received := stats.ICMP.V4PacketsReceived
-	v := pkt.Data.First()
-	if len(v) < header.ICMPv4MinimumSize {
+	v, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize)
+	if !ok {
 		received.Invalid.Increment()
 		return
 	}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 104aafbed..17202cc7a 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -328,7 +328,11 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
 	// The packet already has an IP header, but there are a few required
 	// checks.
-	ip := header.IPv4(pkt.Data.First())
+	h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return tcpip.ErrInvalidOptionValue
+	}
+	ip := header.IPv4(h)
 	if !ip.IsValid(pkt.Data.Size()) {
 		return tcpip.ErrInvalidOptionValue
 	}
@@ -378,7 +382,11 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuf
 // HandlePacket is called by the link layer when new ipv4 packets arrive for
 // this endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
-	headerView := pkt.Data.First()
+	headerView, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		r.Stats().IP.MalformedPacketsReceived.Increment()
+		return
+	}
 	h := header.IPv4(headerView)
 	if !h.IsValid(pkt.Data.Size()) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index b68983d10..bdf3a0d25 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -28,7 +28,11 @@ import (
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
 func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
-	h := header.IPv6(pkt.Data.First())
+	h, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	if !ok {
+		return
+	}
+	hdr := header.IPv6(h)
 
 	// We don't use IsValid() here because ICMP only requires that up to
 	// 1280 bytes of the original packet be included. So it's likely that it
@@ -36,17 +40,21 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 	//
 	// Drop packet if it doesn't have the basic IPv6 header or if the
 	// original source address doesn't match the endpoint's address.
-	if len(h) < header.IPv6MinimumSize || h.SourceAddress() != e.id.LocalAddress {
+	if hdr.SourceAddress() != e.id.LocalAddress {
 		return
 	}
 
 	// Skip the IP header, then handle the fragmentation header if there
 	// is one.
 	pkt.Data.TrimFront(header.IPv6MinimumSize)
-	p := h.TransportProtocol()
+	p := hdr.TransportProtocol()
 	if p == header.IPv6FragmentHeader {
-		f := header.IPv6Fragment(pkt.Data.First())
-		if !f.IsValid() || f.FragmentOffset() != 0 {
+		f, ok := pkt.Data.PullUp(header.IPv6FragmentHeaderSize)
+		if !ok {
+			return
+		}
+		fragHdr := header.IPv6Fragment(f)
+		if !fragHdr.IsValid() || fragHdr.FragmentOffset() != 0 {
 			// We can't handle fragments that aren't at offset 0
 			// because they don't have the transport headers.
 			return
@@ -55,19 +63,19 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 		// Skip fragmentation header and find out the actual protocol
 		// number.
 		pkt.Data.TrimFront(header.IPv6FragmentHeaderSize)
-		p = f.TransportProtocol()
+		p = fragHdr.TransportProtocol()
 	}
 
 	// Deliver the control packet to the transport endpoint.
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
 func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.PacketBuffer, hasFragmentHeader bool) {
 	stats := r.Stats().ICMP
 	sent := stats.V6PacketsSent
 	received := stats.V6PacketsReceived
-	v := pkt.Data.First()
-	if len(v) < header.ICMPv6MinimumSize {
+	v, ok := pkt.Data.PullUp(header.ICMPv6HeaderSize)
+	if !ok {
 		received.Invalid.Increment()
 		return
 	}
@@ -76,11 +84,9 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	// Validate ICMPv6 checksum before processing the packet.
 	//
-	// Only the first view in vv is accounted for by h. To account for the
-	// rest of vv, a shallow copy is made and the first view is removed.
 	// This copy is used as extra payload during the checksum calculation.
 	payload := pkt.Data.Clone(nil)
-	payload.RemoveFirst()
+	payload.TrimFront(len(h))
 	if got, want := h.Checksum(), header.ICMPv6Checksum(h, iph.SourceAddress(), iph.DestinationAddress(), payload); got != want {
 		received.Invalid.Increment()
 		return
@@ -101,34 +107,40 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 	switch h.Type() {
 	case header.ICMPv6PacketTooBig:
 		received.PacketTooBig.Increment()
-		if len(v) < header.ICMPv6PacketTooBigMinimumSize {
+		hdr, ok := pkt.Data.PullUp(header.ICMPv6PacketTooBigMinimumSize)
+		if !ok {
 			received.Invalid.Increment()
 			return
 		}
 		pkt.Data.TrimFront(header.ICMPv6PacketTooBigMinimumSize)
-		mtu := h.MTU()
+		mtu := header.ICMPv6(hdr).MTU()
 		e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), pkt)
 
 	case header.ICMPv6DstUnreachable:
 		received.DstUnreachable.Increment()
-		if len(v) < header.ICMPv6DstUnreachableMinimumSize {
+		hdr, ok := pkt.Data.PullUp(header.ICMPv6DstUnreachableMinimumSize)
+		if !ok {
 			received.Invalid.Increment()
 			return
 		}
 		pkt.Data.TrimFront(header.ICMPv6DstUnreachableMinimumSize)
-		switch h.Code() {
+		switch header.ICMPv6(hdr).Code() {
 		case header.ICMPv6PortUnreachable:
 			e.handleControl(stack.ControlPortUnreachable, 0, pkt)
 		}
 
 	case header.ICMPv6NeighborSolicit:
 		received.NeighborSolicit.Increment()
-		if len(v) < header.ICMPv6NeighborSolicitMinimumSize || !isNDPValid() {
+		if pkt.Data.Size() < header.ICMPv6NeighborSolicitMinimumSize || !isNDPValid() {
 			received.Invalid.Increment()
 			return
 		}
 
-		ns := header.NDPNeighborSolicit(h.NDPPayload())
+		// The remainder of payload must be only the neighbor solicitation, so
+		// payload.ToView() always returns the solicitation. Per RFC 6980 section 5,
+		// NDP messages cannot be fragmented. Also note that in the common case NDP
+		// datagrams are very small and ToView() will not incur allocations.
+		ns := header.NDPNeighborSolicit(payload.ToView())
 		it, err := ns.Options().Iter(true)
 		if err != nil {
 			// If we have a malformed NDP NS option, drop the packet.
@@ -286,12 +298,16 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	case header.ICMPv6NeighborAdvert:
 		received.NeighborAdvert.Increment()
-		if len(v) < header.ICMPv6NeighborAdvertSize || !isNDPValid() {
+		if pkt.Data.Size() < header.ICMPv6NeighborAdvertSize || !isNDPValid() {
 			received.Invalid.Increment()
 			return
 		}
 
-		na := header.NDPNeighborAdvert(h.NDPPayload())
+		// The remainder of payload must be only the neighbor advertisement, so
+		// payload.ToView() always returns the advertisement. Per RFC 6980 section
+		// 5, NDP messages cannot be fragmented. Also note that in the common case
+		// NDP datagrams are very small and ToView() will not incur allocations.
+		na := header.NDPNeighborAdvert(payload.ToView())
 		it, err := na.Options().Iter(true)
 		if err != nil {
 			// If we have a malformed NDP NA option, drop the packet.
@@ -363,14 +379,15 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	case header.ICMPv6EchoRequest:
 		received.EchoRequest.Increment()
-		if len(v) < header.ICMPv6EchoMinimumSize {
+		icmpHdr, ok := pkt.Data.PullUp(header.ICMPv6EchoMinimumSize)
+		if !ok {
 			received.Invalid.Increment()
 			return
 		}
 		pkt.Data.TrimFront(header.ICMPv6EchoMinimumSize)
 		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize)
 		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize))
-		copy(packet, h)
+		copy(packet, icmpHdr)
 		packet.SetType(header.ICMPv6EchoReply)
 		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data))
 		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
@@ -384,7 +401,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	case header.ICMPv6EchoReply:
 		received.EchoReply.Increment()
-		if len(v) < header.ICMPv6EchoMinimumSize {
+		if pkt.Data.Size() < header.ICMPv6EchoMinimumSize {
 			received.Invalid.Increment()
 			return
 		}
@@ -406,8 +423,9 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 	case header.ICMPv6RouterAdvert:
 		received.RouterAdvert.Increment()
 
-		p := h.NDPPayload()
-		if len(p) < header.NDPRAMinimumSize || !isNDPValid() {
+		// Is the NDP payload of sufficient size to hold a Router
+		// Advertisement?
+		if pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRAMinimumSize || !isNDPValid() {
 			received.Invalid.Increment()
 			return
 		}
@@ -425,7 +443,11 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 			return
 		}
 
-		ra := header.NDPRouterAdvert(p)
+		// The remainder of payload must be only the router advertisement, so
+		// payload.ToView() always returns the advertisement. Per RFC 6980 section
+		// 5, NDP messages cannot be fragmented. Also note that in the common case
+		// NDP datagrams are very small and ToView() will not incur allocations.
+		ra := header.NDPRouterAdvert(payload.ToView())
 		opts := ra.Options()
 
 		// Are options valid as per the wire format?
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index bd099a7f8..d412ff688 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -166,7 +166,8 @@ func TestICMPCounts(t *testing.T) {
 		},
 		{
 			typ:  header.ICMPv6NeighborSolicit,
-			size: header.ICMPv6NeighborSolicitMinimumSize},
+			size: header.ICMPv6NeighborSolicitMinimumSize,
+		},
 		{
 			typ:       header.ICMPv6NeighborAdvert,
 			size:      header.ICMPv6NeighborAdvertMinimumSize,
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 331b0817b..486725131 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -171,7 +171,11 @@ func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffe
 // HandlePacket is called by the link layer when new ipv6 packets arrive for
 // this endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
-	headerView := pkt.Data.First()
+	headerView, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	if !ok {
+		r.Stats().IP.MalformedPacketsReceived.Increment()
+		return
+	}
 	h := header.IPv6(headerView)
 	if !h.IsValid(pkt.Data.Size()) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
index e9c652042..c7c663498 100644
--- a/pkg/tcpip/stack/forwarder_test.go
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -70,7 +70,10 @@ func (f *fwdTestNetworkEndpoint) ID() *NetworkEndpointID {
 
 func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt PacketBuffer) {
 	// Consume the network header.
-	b := pkt.Data.First()
+	b, ok := pkt.Data.PullUp(fwdTestNetHeaderLen)
+	if !ok {
+		return
+	}
 	pkt.Data.TrimFront(fwdTestNetHeaderLen)
 
 	// Dispatch the packet to the transport protocol.
@@ -473,7 +476,7 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
 		t.Fatal("packet not forwarded")
 	}
 
-	b := p.Pkt.Header.View()
+	b := p.Pkt.Data.ToView()
 	if b[0] != 3 {
 		t.Fatalf("got b[0] = %d, want = 3", b[0])
 	}
@@ -517,7 +520,7 @@ func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
 			t.Fatal("packet not forwarded")
 		}
 
-		b := p.Pkt.Header.View()
+		b := p.Pkt.Data.ToView()
 		if b[0] != 3 {
 			t.Fatalf("got b[0] = %d, want = 3", b[0])
 		}
@@ -564,7 +567,7 @@ func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
 			t.Fatal("packet not forwarded")
 		}
 
-		b := p.Pkt.Header.View()
+		b := p.Pkt.Data.ToView()
 		if b[0] != 3 {
 			t.Fatalf("got b[0] = %d, want = 3", b[0])
 		}
@@ -619,7 +622,7 @@ func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
 
 		// The first 5 packets (address 3 to 7) should not be forwarded
 		// because their address resolutions are interrupted.
-		b := p.Pkt.Header.View()
+		b := p.Pkt.Data.ToView()
 		if b[0] < 8 {
 			t.Fatalf("got b[0] = %d, want b[0] >= 8", b[0])
 		}
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
index 6c0a4b24d..6b91159d4 100644
--- a/pkg/tcpip/stack/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -212,6 +212,11 @@ func (it *IPTables) Check(hook Hook, pkt PacketBuffer) bool {
 // CheckPackets runs pkts through the rules for hook and returns a map of packets that
 // should not go forward.
 //
+// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+//
+// TODO(gvisor.dev/issue/170): pk.NetworkHeader will always be set as a
+// precondition.
+//
 // NOTE: unlike the Check API the returned map contains packets that should be
 // dropped.
 func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList) (drop map[*PacketBuffer]struct{}) {
@@ -226,7 +231,9 @@ func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList) (drop map[*Pa
 	return drop
 }
 
-// Precondition: pkt.NetworkHeader is set.
+// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// TODO(gvisor.dev/issue/170): pk.NetworkHeader will always be set as a
+// precondition.
 func (it *IPTables) checkChain(hook Hook, pkt PacketBuffer, table Table, ruleIdx int) chainVerdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
@@ -271,14 +278,21 @@ func (it *IPTables) checkChain(hook Hook, pkt PacketBuffer, table Table, ruleIdx
 	return chainDrop
 }
 
-// Precondition: pk.NetworkHeader is set.
+// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// TODO(gvisor.dev/issue/170): pk.NetworkHeader will always be set as a
+// precondition.
 func (it *IPTables) checkRule(hook Hook, pkt PacketBuffer, table Table, ruleIdx int) (RuleVerdict, int) {
 	rule := table.Rules[ruleIdx]
 
 	// If pkt.NetworkHeader hasn't been set yet, it will be contained in
-	// pkt.Data.First().
+	// pkt.Data.
 	if pkt.NetworkHeader == nil {
-		pkt.NetworkHeader = pkt.Data.First()
+		var ok bool
+		pkt.NetworkHeader, ok = pkt.Data.PullUp(header.IPv4MinimumSize)
+		if !ok {
+			// Precondition has been violated.
+			panic(fmt.Sprintf("iptables checks require IPv4 headers of at least %d bytes", header.IPv4MinimumSize))
+		}
 	}
 
 	// Check whether the packet matches the IP header filter.
diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go
index 7b4543caf..8be61f4b1 100644
--- a/pkg/tcpip/stack/iptables_targets.go
+++ b/pkg/tcpip/stack/iptables_targets.go
@@ -96,9 +96,12 @@ func (rt RedirectTarget) Action(pkt PacketBuffer) (RuleVerdict, int) {
 	newPkt := pkt.Clone()
 
 	// Set network header.
-	headerView := newPkt.Data.First()
+	headerView, ok := newPkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return RuleDrop, 0
+	}
 	netHeader := header.IPv4(headerView)
-	newPkt.NetworkHeader = headerView[:header.IPv4MinimumSize]
+	newPkt.NetworkHeader = headerView
 
 	hlen := int(netHeader.HeaderLength())
 	tlen := int(netHeader.TotalLength())
@@ -117,10 +120,14 @@ func (rt RedirectTarget) Action(pkt PacketBuffer) (RuleVerdict, int) {
 		if newPkt.TransportHeader != nil {
 			udpHeader = header.UDP(newPkt.TransportHeader)
 		} else {
-			if len(pkt.Data.First()) < header.UDPMinimumSize {
+			if pkt.Data.Size() < header.UDPMinimumSize {
+				return RuleDrop, 0
+			}
+			hdr, ok := newPkt.Data.PullUp(header.UDPMinimumSize)
+			if !ok {
 				return RuleDrop, 0
 			}
-			udpHeader = header.UDP(newPkt.Data.First())
+			udpHeader = header.UDP(hdr)
 		}
 		udpHeader.SetDestinationPort(rt.MinPort)
 	case header.TCPProtocolNumber:
@@ -128,10 +135,14 @@ func (rt RedirectTarget) Action(pkt PacketBuffer) (RuleVerdict, int) {
 		if newPkt.TransportHeader != nil {
 			tcpHeader = header.TCP(newPkt.TransportHeader)
 		} else {
-			if len(pkt.Data.First()) < header.TCPMinimumSize {
+			if pkt.Data.Size() < header.TCPMinimumSize {
 				return RuleDrop, 0
 			}
-			tcpHeader = header.TCP(newPkt.TransportHeader)
+			hdr, ok := newPkt.Data.PullUp(header.TCPMinimumSize)
+			if !ok {
+				return RuleDrop, 0
+			}
+			tcpHeader = header.TCP(hdr)
 		}
 		// TODO(gvisor.dev/issue/170): Need to recompute checksum
 		// and implement nat connection tracking to support TCP.
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 016dbe15e..0c2b1f36a 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1203,12 +1203,12 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 		n.stack.stats.IP.PacketsReceived.Increment()
 	}
 
-	if len(pkt.Data.First()) < netProto.MinimumPacketSize() {
+	netHeader, ok := pkt.Data.PullUp(netProto.MinimumPacketSize())
+	if !ok {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
-
-	src, dst := netProto.ParseAddresses(pkt.Data.First())
+	src, dst := netProto.ParseAddresses(netHeader)
 
 	if n.stack.handleLocal && !n.isLoopback() && n.getRef(protocol, src) != nil {
 		// The source address is one of our own, so we never should have gotten a
@@ -1289,22 +1289,8 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 
 func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
 	// TODO(b/143425874) Decrease the TTL field in forwarded packets.
-
-	firstData := pkt.Data.First()
-	pkt.Data.RemoveFirst()
-
-	if linkHeaderLen := int(n.linkEP.MaxHeaderLength()); linkHeaderLen == 0 {
-		pkt.Header = buffer.NewPrependableFromView(firstData)
-	} else {
-		firstDataLen := len(firstData)
-
-		// pkt.Header should have enough capacity to hold n.linkEP's headers.
-		pkt.Header = buffer.NewPrependable(firstDataLen + linkHeaderLen)
-
-		// TODO(b/151227689): avoid copying the packet when forwarding
-		if n := copy(pkt.Header.Prepend(firstDataLen), firstData); n != firstDataLen {
-			panic(fmt.Sprintf("copied %d bytes, expected %d", n, firstDataLen))
-		}
+	if linkHeaderLen := int(n.linkEP.MaxHeaderLength()); linkHeaderLen != 0 {
+		pkt.Header = buffer.NewPrependable(linkHeaderLen)
 	}
 
 	if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, pkt); err != nil {
@@ -1332,12 +1318,13 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 	// validly formed.
 	n.stack.demux.deliverRawPacket(r, protocol, pkt)
 
-	if len(pkt.Data.First()) < transProto.MinimumPacketSize() {
+	transHeader, ok := pkt.Data.PullUp(transProto.MinimumPacketSize())
+	if !ok {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
 
-	srcPort, dstPort, err := transProto.ParsePorts(pkt.Data.First())
+	srcPort, dstPort, err := transProto.ParsePorts(transHeader)
 	if err != nil {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
@@ -1375,11 +1362,12 @@ func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcp
 	// ICMPv4 only guarantees that 8 bytes of the transport protocol will
 	// be present in the payload. We know that the ports are within the
 	// first 8 bytes for all known transport protocols.
-	if len(pkt.Data.First()) < 8 {
+	transHeader, ok := pkt.Data.PullUp(8)
+	if !ok {
 		return
 	}
 
-	srcPort, dstPort, err := transProto.ParsePorts(pkt.Data.First())
+	srcPort, dstPort, err := transProto.ParsePorts(transHeader)
 	if err != nil {
 		return
 	}
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index dc125f25e..e954a8b7e 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -37,7 +37,9 @@ type PacketBuffer struct {
 	Data buffer.VectorisedView
 
 	// Header holds the headers of outbound packets. As a packet is passed
-	// down the stack, each layer adds to Header.
+	// down the stack, each layer adds to Header. Note that forwarded
+	// packets don't populate Headers on their way out -- their headers and
+	// payload are never parsed out and remain in Data.
 	Header buffer.Prependable
 
 	// These fields are used by both inbound and outbound packets. They
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index c7634ceb1..d45d2cc1f 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -95,16 +95,18 @@ func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffe
 	f.proto.packetCount[int(f.id.LocalAddress[0])%len(f.proto.packetCount)]++
 
 	// Consume the network header.
-	b := pkt.Data.First()
+	b, ok := pkt.Data.PullUp(fakeNetHeaderLen)
+	if !ok {
+		return
+	}
 	pkt.Data.TrimFront(fakeNetHeaderLen)
 
 	// Handle control packets.
 	if b[2] == uint8(fakeControlProtocol) {
-		nb := pkt.Data.First()
-		if len(nb) < fakeNetHeaderLen {
+		nb, ok := pkt.Data.PullUp(fakeNetHeaderLen)
+		if !ok {
 			return
 		}
-
 		pkt.Data.TrimFront(fakeNetHeaderLen)
 		f.dispatcher.DeliverTransportControlPacket(tcpip.Address(nb[1:2]), tcpip.Address(nb[0:1]), fakeNetNumber, tcpip.TransportProtocolNumber(nb[2]), stack.ControlPortUnreachable, 0, pkt)
 		return
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 3084e6593..a611e44ab 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -642,10 +642,11 @@ func TestTransportForwarding(t *testing.T) {
 		t.Fatal("Response packet not forwarded")
 	}
 
-	if dst := p.Pkt.Header.View()[0]; dst != 3 {
+	hdrs := p.Pkt.Data.ToView()
+	if dst := hdrs[0]; dst != 3 {
 		t.Errorf("Response packet has incorrect destination addresss: got = %d, want = 3", dst)
 	}
-	if src := p.Pkt.Header.View()[1]; src != 1 {
+	if src := hdrs[1]; src != 1 {
 		t.Errorf("Response packet has incorrect source addresss: got = %d, want = 3", src)
 	}
 }
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index feef8dca0..b1d820372 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -747,15 +747,15 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	// Only accept echo replies.
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
-		h := header.ICMPv4(pkt.Data.First())
-		if h.Type() != header.ICMPv4EchoReply {
+		h, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize)
+		if !ok || header.ICMPv4(h).Type() != header.ICMPv4EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 			return
 		}
 	case header.IPv6ProtocolNumber:
-		h := header.ICMPv6(pkt.Data.First())
-		if h.Type() != header.ICMPv6EchoReply {
+		h, ok := pkt.Data.PullUp(header.ICMPv6MinimumSize)
+		if !ok || header.ICMPv6(h).Type() != header.ICMPv6EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 			return
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 40461fd31..7712ce652 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -144,7 +144,11 @@ func (s *segment) logicalLen() seqnum.Size {
 // TCP checksum and stores the checksum and result of checksum verification in
 // the csum and csumValid fields of the segment.
 func (s *segment) parse() bool {
-	h := header.TCP(s.data.First())
+	h, ok := s.data.PullUp(header.TCPMinimumSize)
+	if !ok {
+		return false
+	}
+	hdr := header.TCP(h)
 
 	// h is the header followed by the payload. We check that the offset to
 	// the data respects the following constraints:
@@ -156,12 +160,16 @@ func (s *segment) parse() bool {
 	// N.B. The segment has already been validated as having at least the
 	//      minimum TCP size before reaching here, so it's safe to read the
 	//      fields.
-	offset := int(h.DataOffset())
-	if offset < header.TCPMinimumSize || offset > len(h) {
+	offset := int(hdr.DataOffset())
+	if offset < header.TCPMinimumSize {
+		return false
+	}
+	hdrWithOpts, ok := s.data.PullUp(offset)
+	if !ok {
 		return false
 	}
 
-	s.options = []byte(h[header.TCPMinimumSize:offset])
+	s.options = []byte(hdrWithOpts[header.TCPMinimumSize:])
 	s.parsedOptions = header.ParseTCPOptions(s.options)
 
 	// Query the link capabilities to decide if checksum validation is
@@ -173,18 +181,19 @@ func (s *segment) parse() bool {
 		s.data.TrimFront(offset)
 	}
 	if verifyChecksum {
-		s.csum = h.Checksum()
+		hdr = header.TCP(hdrWithOpts)
+		s.csum = hdr.Checksum()
 		xsum := s.route.PseudoHeaderChecksum(ProtocolNumber, uint16(s.data.Size()))
-		xsum = h.CalculateChecksum(xsum)
+		xsum = hdr.CalculateChecksum(xsum)
 		s.data.TrimFront(offset)
 		xsum = header.ChecksumVV(s.data, xsum)
 		s.csumValid = xsum == 0xffff
 	}
 
-	s.sequenceNumber = seqnum.Value(h.SequenceNumber())
-	s.ackNumber = seqnum.Value(h.AckNumber())
-	s.flags = h.Flags()
-	s.window = seqnum.Size(h.WindowSize())
+	s.sequenceNumber = seqnum.Value(hdr.SequenceNumber())
+	s.ackNumber = seqnum.Value(hdr.AckNumber())
+	s.flags = hdr.Flags()
+	s.window = seqnum.Size(hdr.WindowSize())
 	return true
 }
 
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index ab1014c7f..286c66cf5 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -3548,7 +3548,7 @@ func TestReceivedInvalidSegmentCountIncrement(t *testing.T) {
 		AckNum:  c.IRS.Add(1),
 		RcvWnd:  30000,
 	})
-	tcpbuf := vv.First()[header.IPv4MinimumSize:]
+	tcpbuf := vv.ToView()[header.IPv4MinimumSize:]
 	tcpbuf[header.TCPDataOffset] = ((header.TCPMinimumSize - 1) / 4) << 4
 
 	c.SendSegment(vv)
@@ -3575,7 +3575,7 @@ func TestReceivedIncorrectChecksumIncrement(t *testing.T) {
 		AckNum:  c.IRS.Add(1),
 		RcvWnd:  30000,
 	})
-	tcpbuf := vv.First()[header.IPv4MinimumSize:]
+	tcpbuf := vv.ToView()[header.IPv4MinimumSize:]
 	// Overwrite a byte in the payload which should cause checksum
 	// verification to fail.
 	tcpbuf[(tcpbuf[header.TCPDataOffset]>>4)*4] = 0x4
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index edb54f0be..756ab913a 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -1250,8 +1250,8 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 // endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
 	// Get the header then trim it from the view.
-	hdr := header.UDP(pkt.Data.First())
-	if int(hdr.Length()) > pkt.Data.Size() {
+	hdr, ok := pkt.Data.PullUp(header.UDPMinimumSize)
+	if !ok || int(header.UDP(hdr).Length()) > pkt.Data.Size() {
 		// Malformed packet.
 		e.stack.Stats().UDP.MalformedPacketsReceived.Increment()
 		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
@@ -1286,7 +1286,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		senderAddress: tcpip.FullAddress{
 			NIC:  r.NICID(),
 			Addr: id.RemoteAddress,
-			Port: hdr.SourcePort(),
+			Port: header.UDP(hdr).SourcePort(),
 		},
 	}
 	packet.data = pkt.Data
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 6e31a9bac..52af6de22 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -68,8 +68,13 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 // that don't match any existing endpoint.
 func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
 	// Get the header then trim it from the view.
-	hdr := header.UDP(pkt.Data.First())
-	if int(hdr.Length()) > pkt.Data.Size() {
+	h, ok := pkt.Data.PullUp(header.UDPMinimumSize)
+	if !ok {
+		// Malformed packet.
+		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
+		return true
+	}
+	if int(header.UDP(h).Length()) > pkt.Data.Size() {
 		// Malformed packet.
 		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
 		return true
-- 
cgit v1.2.3


From a80cd4302337f1c3a807e127f5a6edc2f014f431 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 17 Apr 2020 13:27:35 -0700
Subject: Add test name to boot and gofer log files

This is to make easier to find corresponding logs in
case test fails.

PiperOrigin-RevId: 307104283
---
 runsc/cmd/capability_test.go            |  2 +-
 runsc/container/console_test.go         |  6 +--
 runsc/container/container_test.go       | 72 ++++++++++++++++-----------------
 runsc/container/multi_container_test.go | 44 ++++++++++----------
 runsc/container/shared_volume_test.go   |  4 +-
 runsc/main.go                           |  4 +-
 runsc/testutil/testutil.go              |  7 ++--
 test/root/oom_score_adj_test.go         |  4 +-
 8 files changed, 70 insertions(+), 73 deletions(-)

diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go
index 0c27f7313..9360d7442 100644
--- a/runsc/cmd/capability_test.go
+++ b/runsc/cmd/capability_test.go
@@ -85,7 +85,7 @@ func TestCapabilities(t *testing.T) {
 		Inheritable: caps,
 	}
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 
 	// Use --network=host to make sandbox use spec's capabilities.
 	conf.Network = boot.NetworkHost
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 651615d4c..af245b6d8 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -118,7 +118,7 @@ func receiveConsolePTY(srv *unet.ServerSocket) (*os.File, error) {
 
 // Test that an pty FD is sent over the console socket if one is provided.
 func TestConsoleSocket(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 		spec := testutil.NewSpecWithArgs("true")
 		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
@@ -163,7 +163,7 @@ func TestConsoleSocket(t *testing.T) {
 // Test that job control signals work on a console created with "exec -ti".
 func TestJobControlSignalExec(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 
 	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
@@ -286,7 +286,7 @@ func TestJobControlSignalExec(t *testing.T) {
 
 // Test that job control signals work on a console created with "run -ti".
 func TestJobControlSignalRootContainer(t *testing.T) {
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	// Don't let bash execute from profile or rc files, otherwise our PID
 	// counts get messed up.
 	spec := testutil.NewSpecWithArgs("/bin/bash", "--noprofile", "--norc")
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 24f9ecc35..5db6d64aa 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -251,12 +251,12 @@ var noOverlay = []configOption{kvm, nonExclusiveFS}
 var all = append(noOverlay, overlay)
 
 // configs generates different configurations to run tests.
-func configs(opts ...configOption) []*boot.Config {
+func configs(t *testing.T, opts ...configOption) []*boot.Config {
 	// Always load the default config.
-	cs := []*boot.Config{testutil.TestConfig()}
+	cs := []*boot.Config{testutil.TestConfig(t)}
 
 	for _, o := range opts {
-		c := testutil.TestConfig()
+		c := testutil.TestConfig(t)
 		switch o {
 		case overlay:
 			c.Overlay = true
@@ -285,7 +285,7 @@ func TestLifecycle(t *testing.T) {
 	childReaper.Start()
 	defer childReaper.Stop()
 
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 		// The container will just sleep for a long time.  We will kill it before
 		// it finishes sleeping.
@@ -457,7 +457,7 @@ func TestExePath(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 		for _, test := range []struct {
 			path    string
@@ -521,21 +521,19 @@ func TestExePath(t *testing.T) {
 
 // Test the we can retrieve the application exit status from the container.
 func TestAppExitStatus(t *testing.T) {
-	conf := testutil.TestConfig()
-	conf.VFS2 = false
-	doAppExitStatus(t, conf)
+	doAppExitStatus(t, false)
 }
 
 // This is TestAppExitStatus for VFSv2.
 func TestAppExitStatusVFS2(t *testing.T) {
-	conf := testutil.TestConfig()
-	conf.VFS2 = true
-	doAppExitStatus(t, conf)
+	doAppExitStatus(t, true)
 }
 
-func doAppExitStatus(t *testing.T, conf *boot.Config) {
+func doAppExitStatus(t *testing.T, vfs2 bool) {
 	// First container will succeed.
 	succSpec := testutil.NewSpecWithArgs("true")
+	conf := testutil.TestConfig(t)
+	conf.VFS2 = vfs2
 	rootDir, bundleDir, err := testutil.SetupContainer(succSpec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -585,7 +583,7 @@ func doAppExitStatus(t *testing.T, conf *boot.Config) {
 
 // TestExec verifies that a container can exec a new program.
 func TestExec(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		const uid = 343
@@ -679,7 +677,7 @@ func TestExec(t *testing.T) {
 
 // TestKillPid verifies that we can signal individual exec'd processes.
 func TestKillPid(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		app, err := testutil.FindFile("runsc/container/test_app/test_app")
@@ -755,7 +753,7 @@ func TestKillPid(t *testing.T) {
 // be the next consecutive number after the last number from the checkpointed container.
 func TestCheckpointRestore(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
-	for _, conf := range configs(noOverlay...) {
+	for _, conf := range configs(t, noOverlay...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test")
@@ -916,7 +914,7 @@ func TestCheckpointRestore(t *testing.T) {
 // with filesystem Unix Domain Socket use.
 func TestUnixDomainSockets(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
-	for _, conf := range configs(noOverlay...) {
+	for _, conf := range configs(t, noOverlay...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		// UDS path is limited to 108 chars for compatibility with older systems.
@@ -1054,7 +1052,7 @@ func TestUnixDomainSockets(t *testing.T) {
 // recreated. Then it resumes the container, verify that the file gets created
 // again.
 func TestPauseResume(t *testing.T) {
-	for _, conf := range configs(noOverlay...) {
+	for _, conf := range configs(t, noOverlay...) {
 		t.Run(fmt.Sprintf("conf: %+v", conf), func(t *testing.T) {
 			t.Logf("Running test with conf: %+v", conf)
 
@@ -1135,7 +1133,7 @@ func TestPauseResume(t *testing.T) {
 // occurs given the correct state.
 func TestPauseResumeStatus(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("sleep", "20")
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -1201,7 +1199,7 @@ func TestCapabilities(t *testing.T) {
 	uid := auth.KUID(os.Getuid() + 1)
 	gid := auth.KGID(os.Getgid() + 1)
 
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		spec := testutil.NewSpecWithArgs("sleep", "100")
@@ -1290,7 +1288,7 @@ func TestCapabilities(t *testing.T) {
 // TestRunNonRoot checks that sandbox can be configured when running as
 // non-privileged user.
 func TestRunNonRoot(t *testing.T) {
-	for _, conf := range configs(noOverlay...) {
+	for _, conf := range configs(t, noOverlay...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		spec := testutil.NewSpecWithArgs("/bin/true")
@@ -1334,7 +1332,7 @@ func TestRunNonRoot(t *testing.T) {
 // TestMountNewDir checks that runsc will create destination directory if it
 // doesn't exit.
 func TestMountNewDir(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		root, err := ioutil.TempDir(testutil.TmpDir(), "root")
@@ -1363,7 +1361,7 @@ func TestMountNewDir(t *testing.T) {
 }
 
 func TestReadonlyRoot(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		spec := testutil.NewSpecWithArgs("/bin/touch", "/foo")
@@ -1401,7 +1399,7 @@ func TestReadonlyRoot(t *testing.T) {
 }
 
 func TestUIDMap(t *testing.T) {
-	for _, conf := range configs(noOverlay...) {
+	for _, conf := range configs(t, noOverlay...) {
 		t.Logf("Running test with conf: %+v", conf)
 		testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount")
 		if err != nil {
@@ -1482,7 +1480,7 @@ func TestUIDMap(t *testing.T) {
 }
 
 func TestReadonlyMount(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
@@ -1539,7 +1537,7 @@ func TestAbbreviatedIDs(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	cids := []string{
@@ -1597,7 +1595,7 @@ func TestAbbreviatedIDs(t *testing.T) {
 
 func TestGoferExits(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -1666,7 +1664,7 @@ func TestRootNotMount(t *testing.T) {
 	spec.Root.Readonly = true
 	spec.Mounts = nil
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	if err := run(spec, conf); err != nil {
 		t.Fatalf("error running sandbox: %v", err)
 	}
@@ -1680,7 +1678,7 @@ func TestUserLog(t *testing.T) {
 
 	// sched_rr_get_interval = 148 - not implemented in gvisor.
 	spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall=148")
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -1720,7 +1718,7 @@ func TestUserLog(t *testing.T) {
 }
 
 func TestWaitOnExitedSandbox(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		// Run a shell that sleeps for 1 second and then exits with a
@@ -1775,7 +1773,7 @@ func TestWaitOnExitedSandbox(t *testing.T) {
 
 func TestDestroyNotStarted(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "100")
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -1802,7 +1800,7 @@ func TestDestroyNotStarted(t *testing.T) {
 func TestDestroyStarting(t *testing.T) {
 	for i := 0; i < 10; i++ {
 		spec := testutil.NewSpecWithArgs("/bin/sleep", "100")
-		conf := testutil.TestConfig()
+		conf := testutil.TestConfig(t)
 		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 		if err != nil {
 			t.Fatalf("error setting up container: %v", err)
@@ -1847,7 +1845,7 @@ func TestDestroyStarting(t *testing.T) {
 }
 
 func TestCreateWorkingDir(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create")
@@ -1920,7 +1918,7 @@ func TestMountPropagation(t *testing.T) {
 		},
 	}
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -1971,7 +1969,7 @@ func TestMountPropagation(t *testing.T) {
 }
 
 func TestMountSymlink(t *testing.T) {
-	for _, conf := range configs(overlay) {
+	for _, conf := range configs(t, overlay) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink")
@@ -2051,7 +2049,7 @@ func TestNetRaw(t *testing.T) {
 	}
 
 	for _, enableRaw := range []bool{true, false} {
-		conf := testutil.TestConfig()
+		conf := testutil.TestConfig(t)
 		conf.EnableRaw = enableRaw
 
 		test := "--enabled"
@@ -2068,7 +2066,7 @@ func TestNetRaw(t *testing.T) {
 
 // TestOverlayfsStaleRead most basic test that '--overlayfs-stale-read' works.
 func TestOverlayfsStaleRead(t *testing.T) {
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.OverlayfsStaleRead = true
 
 	in, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.in")
@@ -2132,7 +2130,7 @@ func TestTTYField(t *testing.T) {
 
 	for _, test := range testCases {
 		t.Run(test.name, func(t *testing.T) {
-			conf := testutil.TestConfig()
+			conf := testutil.TestConfig(t)
 
 			// We will run /bin/sleep, possibly with an open TTY.
 			cmd := []string{"/bin/sleep", "10000"}
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 2da93ec5b..dc2fb42ce 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -135,7 +135,7 @@ func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
 // TestMultiContainerSanity checks that it is possible to run 2 dead-simple
 // containers in the same sandbox.
 func TestMultiContainerSanity(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -173,7 +173,7 @@ func TestMultiContainerSanity(t *testing.T) {
 // TestMultiPIDNS checks that it is possible to run 2 dead-simple
 // containers in the same sandbox with different pidns.
 func TestMultiPIDNS(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -218,7 +218,7 @@ func TestMultiPIDNS(t *testing.T) {
 
 // TestMultiPIDNSPath checks the pidns path.
 func TestMultiPIDNSPath(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -289,7 +289,7 @@ func TestMultiContainerWait(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// The first container should run the entire duration of the test.
@@ -367,7 +367,7 @@ func TestExecWait(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// The first container should run the entire duration of the test.
@@ -463,7 +463,7 @@ func TestMultiContainerMount(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	containers, cleanup, err := startContainers(conf, sps, ids)
@@ -484,7 +484,7 @@ func TestMultiContainerMount(t *testing.T) {
 // TestMultiContainerSignal checks that it is possible to signal individual
 // containers without killing the entire sandbox.
 func TestMultiContainerSignal(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -585,7 +585,7 @@ func TestMultiContainerDestroy(t *testing.T) {
 		t.Fatal("error finding test_app:", err)
 	}
 
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -653,7 +653,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Note: use curly braces to keep 'sh' process around. Otherwise, shell
@@ -712,7 +712,7 @@ func TestMultiContainerKillAll(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	for _, tc := range []struct {
@@ -804,7 +804,7 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) {
 		[]string{"/bin/sleep", "100"},
 		[]string{"/bin/sleep", "100"})
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -858,7 +858,7 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
 	}
 	specs, ids := createSpecs(cmds...)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -943,7 +943,7 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Make sure overlay is enabled, and none of the root filesystems are
@@ -1006,7 +1006,7 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 	childrenSpecs := allSpecs[1:]
 	childrenIDs := allIDs[1:]
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	rootDir, bundleDir, err := testutil.SetupContainer(rootSpec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -1080,7 +1080,7 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 // Test that pod shared mounts are properly mounted in 2 containers and that
 // changes from one container is reflected in the other.
 func TestMultiContainerSharedMount(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -1195,7 +1195,7 @@ func TestMultiContainerSharedMount(t *testing.T) {
 
 // Test that pod mounts are mounted as readonly when requested.
 func TestMultiContainerSharedMountReadonly(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -1262,7 +1262,7 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) {
 
 // Test that shared pod mounts continue to work after container is restarted.
 func TestMultiContainerSharedMountRestart(t *testing.T) {
-	for _, conf := range configs(all...) {
+	for _, conf := range configs(t, all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
 		rootDir, err := testutil.SetupRootDir()
@@ -1381,7 +1381,7 @@ func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Setup the containers.
@@ -1463,7 +1463,7 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Create the specs.
@@ -1500,7 +1500,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	sleep := []string{"sleep", "100"}
@@ -1587,7 +1587,7 @@ func TestMultiContainerLoadSandbox(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Create containers for the sandbox.
@@ -1687,7 +1687,7 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	pod, cleanup, err := startContainers(conf, podSpecs, ids)
diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go
index dc4194134..f80852414 100644
--- a/runsc/container/shared_volume_test.go
+++ b/runsc/container/shared_volume_test.go
@@ -31,7 +31,7 @@ import (
 // TestSharedVolume checks that modifications to a volume mount are propagated
 // into and out of the sandbox.
 func TestSharedVolume(t *testing.T) {
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.FileAccess = boot.FileAccessShared
 	t.Logf("Running test with conf: %+v", conf)
 
@@ -190,7 +190,7 @@ func checkFile(c *Container, filename string, want []byte) error {
 // TestSharedVolumeFile tests that changes to file content outside the sandbox
 // is reflected inside.
 func TestSharedVolumeFile(t *testing.T) {
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.FileAccess = boot.FileAccessShared
 	t.Logf("Running test with conf: %+v", conf)
 
diff --git a/runsc/main.go b/runsc/main.go
index 9d52f3006..2baba90f8 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -296,9 +296,7 @@ func main() {
 		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil {
 			cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err)
 		}
-	}
-
-	if *alsoLogToStderr {
+	} else if *alsoLogToStderr {
 		e = &log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
 	}
 
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index 51e487715..5e09f8f16 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -31,11 +31,13 @@ import (
 	"os"
 	"os/exec"
 	"os/signal"
+	"path"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"sync/atomic"
 	"syscall"
+	"testing"
 	"time"
 
 	"github.com/cenkalti/backoff"
@@ -81,17 +83,16 @@ func ConfigureExePath() error {
 
 // TestConfig returns the default configuration to use in tests. Note that
 // 'RootDir' must be set by caller if required.
-func TestConfig() *boot.Config {
+func TestConfig(t *testing.T) *boot.Config {
 	logDir := ""
 	if dir, ok := os.LookupEnv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
 		logDir = dir + "/"
 	}
 	return &boot.Config{
 		Debug:              true,
-		DebugLog:           logDir,
+		DebugLog:           path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%"),
 		LogFormat:          "text",
 		DebugLogFormat:     "text",
-		AlsoLogToStderr:    true,
 		LogPackets:         true,
 		Network:            boot.NetworkNone,
 		Strace:             true,
diff --git a/test/root/oom_score_adj_test.go b/test/root/oom_score_adj_test.go
index 126f0975a..22488b05d 100644
--- a/test/root/oom_score_adj_test.go
+++ b/test/root/oom_score_adj_test.go
@@ -46,7 +46,7 @@ func TestOOMScoreAdjSingle(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	ppid, err := specutils.GetParentPid(os.Getpid())
@@ -137,7 +137,7 @@ func TestOOMScoreAdjMulti(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	ppid, err := specutils.GetParentPid(os.Getpid())
-- 
cgit v1.2.3


From 486759a37d61b770019a81af00e0e733c655b3a8 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 17 Apr 2020 13:54:47 -0700
Subject: Support NDP DNS Search List option

Inform the netstack integrator when the netstack receives an NDP Router
Advertisement message with the NDP DNS Search List option with at least
one domain name. The stack will not maintain any state related to the
search list - the integrator is expected to maintain any required state
and invalidate domain names after their lifetime expires, or refresh the
lifetime when a new one is received for a known domain name.

Test:
- header_test.TestNDPDNSSearchListOption
- header_test.TestNDPDNSSearchListOptionSerialize
- header_test.TestNDPSearchListOptionDomainNameLabelInvalidSymbols
- header_test.TestNDPOptionsIterCheck
- stack_test.TestNDPDNSSearchListDispatch
PiperOrigin-RevId: 307109375
---
 pkg/tcpip/header/ndp_options.go | 213 +++++++++++++++
 pkg/tcpip/header/ndp_test.go    | 574 ++++++++++++++++++++++++++++++++++++++++
 pkg/tcpip/stack/ndp.go          |  18 ++
 pkg/tcpip/stack/ndp_test.go     | 125 +++++++++
 4 files changed, 930 insertions(+)

diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
index 444e90820..5d3975c56 100644
--- a/pkg/tcpip/header/ndp_options.go
+++ b/pkg/tcpip/header/ndp_options.go
@@ -45,6 +45,10 @@ const (
 	// NDPRecursiveDNSServerOptionType is the type of the Recursive DNS
 	// Server option, as per RFC 8106 section 5.1.
 	NDPRecursiveDNSServerOptionType NDPOptionIdentifier = 25
+
+	// NDPDNSSearchListOptionType is the type of the DNS Search List option,
+	// as per RFC 8106 section 5.2.
+	NDPDNSSearchListOptionType = 31
 )
 
 const (
@@ -115,6 +119,27 @@ const (
 	// RFC 8106 section 5.3.1.
 	minNDPRecursiveDNSServerBodySize = 22
 
+	// ndpDNSSearchListLifetimeOffset is the start of the 4-byte
+	// Lifetime field within an NDPDNSSearchList.
+	ndpDNSSearchListLifetimeOffset = 2
+
+	// ndpDNSSearchListDomainNamesOffset is the start of the DNS search list
+	// domain names within an NDPDNSSearchList.
+	ndpDNSSearchListDomainNamesOffset = 6
+
+	// minNDPDNSSearchListBodySize is the minimum NDP DNS Search List option's
+	// body size when it contains at least one domain name, as per RFC 8106
+	// section 5.3.1.
+	minNDPDNSSearchListBodySize = 14
+
+	// maxDomainNameLabelLength is the maximum length of a domain name
+	// label, as per RFC 1035 section 3.1.
+	maxDomainNameLabelLength = 63
+
+	// maxDomainNameLength is the maximum length of a domain name, including
+	// label AND label length octet, as per RFC 1035 section 3.1.
+	maxDomainNameLength = 255
+
 	// lengthByteUnits is the multiplier factor for the Length field of an
 	// NDP option. That is, the length field for NDP options is in units of
 	// 8 octets, as per RFC 4861 section 4.6.
@@ -224,6 +249,14 @@ func (i *NDPOptionIterator) Next() (NDPOption, bool, error) {
 
 			return opt, false, nil
 
+		case NDPDNSSearchListOptionType:
+			opt := NDPDNSSearchList(body)
+			if err := opt.checkDomainNames(); err != nil {
+				return nil, true, err
+			}
+
+			return opt, false, nil
+
 		default:
 			// We do not yet recognize the option, just skip for
 			// now. This is okay because RFC 4861 allows us to
@@ -684,3 +717,183 @@ func (o NDPRecursiveDNSServer) iterAddresses(fn func(tcpip.Address)) error {
 
 	return nil
 }
+
+// NDPDNSSearchList is the NDP DNS Search List option, as defined by
+// RFC 8106 section 5.2.
+type NDPDNSSearchList []byte
+
+// Type implements NDPOption.Type.
+func (o NDPDNSSearchList) Type() NDPOptionIdentifier {
+	return NDPDNSSearchListOptionType
+}
+
+// Length implements NDPOption.Length.
+func (o NDPDNSSearchList) Length() int {
+	return len(o)
+}
+
+// serializeInto implements NDPOption.serializeInto.
+func (o NDPDNSSearchList) serializeInto(b []byte) int {
+	used := copy(b, o)
+
+	// Zero out the reserved bytes that are before the Lifetime field.
+	for i := 0; i < ndpDNSSearchListLifetimeOffset; i++ {
+		b[i] = 0
+	}
+
+	return used
+}
+
+// String implements fmt.Stringer.String.
+func (o NDPDNSSearchList) String() string {
+	lt := o.Lifetime()
+	domainNames, err := o.DomainNames()
+	if err != nil {
+		return fmt.Sprintf("%T([] valid for %s; err = %s)", o, lt, err)
+	}
+	return fmt.Sprintf("%T(%s valid for %s)", o, domainNames, lt)
+}
+
+// Lifetime returns the length of time that the DNS search list of domain names
+// in this option may be used for name resolution.
+//
+// Note, a value of 0 implies the domain names should no longer be used,
+// and a value of infinity/forever is represented by NDPInfiniteLifetime.
+func (o NDPDNSSearchList) Lifetime() time.Duration {
+	// The field is the time in seconds, as per RFC 8106 section 5.1.
+	return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpDNSSearchListLifetimeOffset:]))
+}
+
+// DomainNames returns a DNS search list of domain names.
+//
+// DomainNames will parse the backing buffer as outlined by RFC 1035 section
+// 3.1 and return a list of strings, with all domain names in lower case.
+func (o NDPDNSSearchList) DomainNames() ([]string, error) {
+	var domainNames []string
+	return domainNames, o.iterDomainNames(func(domainName string) { domainNames = append(domainNames, domainName) })
+}
+
+// checkDomainNames iterates over the domain names in an NDP DNS Search List
+// option and returns any error it encounters.
+func (o NDPDNSSearchList) checkDomainNames() error {
+	return o.iterDomainNames(nil)
+}
+
+// iterDomainNames iterates over the domain names in an NDP DNS Search List
+// option and calls a function with each valid domain name.
+func (o NDPDNSSearchList) iterDomainNames(fn func(string)) error {
+	if l := len(o); l < minNDPDNSSearchListBodySize {
+		return fmt.Errorf("got %d bytes for NDP DNS Search List  option's body, expected at least %d bytes: %w", l, minNDPDNSSearchListBodySize, io.ErrUnexpectedEOF)
+	}
+
+	var searchList bytes.Reader
+	searchList.Reset(o[ndpDNSSearchListDomainNamesOffset:])
+
+	var scratch [maxDomainNameLength]byte
+	domainName := bytes.NewBuffer(scratch[:])
+
+	// Parse the domain names, as per RFC 1035 section 3.1.
+	for searchList.Len() != 0 {
+		domainName.Reset()
+
+		// Parse a label within a domain name, as per RFC 1035 section 3.1.
+		for {
+			// The first byte is the label length.
+			labelLenByte, err := searchList.ReadByte()
+			if err != nil {
+				if err != io.EOF {
+					// ReadByte should only ever return nil or io.EOF.
+					panic(fmt.Sprintf("unexpected error when reading a label's length: %s", err))
+				}
+
+				// We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected
+				// once we start parsing a domain name; we expect the buffer to contain
+				// enough bytes for the whole domain name.
+				return fmt.Errorf("unexpected exhausted buffer while parsing a new label for a domain from NDP Search List option: %w", io.ErrUnexpectedEOF)
+			}
+			labelLen := int(labelLenByte)
+
+			// A zero-length label implies the end of a domain name.
+			if labelLen == 0 {
+				// If the domain name is empty or we have no callback function, do
+				// nothing further with the current domain name.
+				if domainName.Len() == 0 || fn == nil {
+					break
+				}
+
+				// Ignore the trailing period in the parsed domain name.
+				domainName.Truncate(domainName.Len() - 1)
+				fn(domainName.String())
+				break
+			}
+
+			// The label's length must not exceed the maximum length for a label.
+			if labelLen > maxDomainNameLabelLength {
+				return fmt.Errorf("label length of %d bytes is greater than the max label length of %d bytes for an NDP Search List option: %w", labelLen, maxDomainNameLabelLength, ErrNDPOptMalformedBody)
+			}
+
+			// The label (and trailing period) must not make the domain name too long.
+			if labelLen+1 > domainName.Cap()-domainName.Len() {
+				return fmt.Errorf("label would make an NDP Search List option's domain name longer than the max domain name length of %d bytes: %w", maxDomainNameLength, ErrNDPOptMalformedBody)
+			}
+
+			// Copy the label and add a trailing period.
+			for i := 0; i < labelLen; i++ {
+				b, err := searchList.ReadByte()
+				if err != nil {
+					if err != io.EOF {
+						panic(fmt.Sprintf("unexpected error when reading domain name's label: %s", err))
+					}
+
+					return fmt.Errorf("read %d out of %d bytes for a domain name's label from NDP Search List option: %w", i, labelLen, io.ErrUnexpectedEOF)
+				}
+
+				// As per RFC 1035 section 2.3.1:
+				//  1) the label must only contain ASCII include letters, digits and
+				//     hyphens
+				//  2) the first character in a label must be a letter
+				//  3) the last letter in a label must be a letter or digit
+
+				if !isLetter(b) {
+					if i == 0 {
+						return fmt.Errorf("first character of a domain name's label in an NDP Search List option must be a letter, got character code = %d: %w", b, ErrNDPOptMalformedBody)
+					}
+
+					if b == '-' {
+						if i == labelLen-1 {
+							return fmt.Errorf("last character of a domain name's label in an NDP Search List option must not be a hyphen (-): %w", ErrNDPOptMalformedBody)
+						}
+					} else if !isDigit(b) {
+						return fmt.Errorf("domain name's label in an NDP Search List option may only contain letters, digits and hyphens, got character code = %d: %w", b, ErrNDPOptMalformedBody)
+					}
+				}
+
+				// If b is an upper case character, make it lower case.
+				if isUpperLetter(b) {
+					b = b - 'A' + 'a'
+				}
+
+				if err := domainName.WriteByte(b); err != nil {
+					panic(fmt.Sprintf("unexpected error writing label to domain name buffer: %s", err))
+				}
+			}
+			if err := domainName.WriteByte('.'); err != nil {
+				panic(fmt.Sprintf("unexpected error writing trailing period to domain name buffer: %s", err))
+			}
+		}
+	}
+
+	return nil
+}
+
+func isLetter(b byte) bool {
+	return b >= 'a' && b <= 'z' || isUpperLetter(b)
+}
+
+func isUpperLetter(b byte) bool {
+	return b >= 'A' && b <= 'Z'
+}
+
+func isDigit(b byte) bool {
+	return b >= '0' && b <= '9'
+}
diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go
index 2341329c4..dc4591253 100644
--- a/pkg/tcpip/header/ndp_test.go
+++ b/pkg/tcpip/header/ndp_test.go
@@ -17,7 +17,9 @@ package header
 import (
 	"bytes"
 	"errors"
+	"fmt"
 	"io"
+	"regexp"
 	"testing"
 	"time"
 
@@ -667,6 +669,477 @@ func TestNDPRecursiveDNSServerOption(t *testing.T) {
 	}
 }
 
+// TestNDPDNSSearchListOption tests the getters of NDPDNSSearchList.
+func TestNDPDNSSearchListOption(t *testing.T) {
+	tests := []struct {
+		name        string
+		buf         []byte
+		lifetime    time.Duration
+		domainNames []string
+		err         error
+	}{
+		{
+			name: "Valid1Label",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, 'a', 'b', 'c',
+				0,
+				0, 0, 0,
+			},
+			lifetime: time.Second,
+			domainNames: []string{
+				"abc",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid2Label",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 5,
+				3, 'a', 'b', 'c',
+				4, 'a', 'b', 'c', 'd',
+				0,
+				0, 0, 0, 0, 0, 0,
+			},
+			lifetime: 5 * time.Second,
+			domainNames: []string{
+				"abc.abcd",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid3Label",
+			buf: []byte{
+				0, 0,
+				1, 0, 0, 0,
+				3, 'a', 'b', 'c',
+				4, 'a', 'b', 'c', 'd',
+				1, 'e',
+				0,
+				0, 0, 0, 0,
+			},
+			lifetime: 16777216 * time.Second,
+			domainNames: []string{
+				"abc.abcd.e",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid2Domains",
+			buf: []byte{
+				0, 0,
+				1, 2, 3, 4,
+				3, 'a', 'b', 'c',
+				0,
+				2, 'd', 'e',
+				3, 'x', 'y', 'z',
+				0,
+				0, 0, 0,
+			},
+			lifetime: 16909060 * time.Second,
+			domainNames: []string{
+				"abc",
+				"de.xyz",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid3DomainsMixedCase",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				3, 'a', 'B', 'c',
+				0,
+				2, 'd', 'E',
+				3, 'X', 'y', 'z',
+				0,
+				1, 'J',
+				0,
+			},
+			lifetime: 0,
+			domainNames: []string{
+				"abc",
+				"de.xyz",
+				"j",
+			},
+			err: nil,
+		},
+		{
+			name: "ValidDomainAfterNULL",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				3, 'a', 'B', 'c',
+				0, 0, 0, 0,
+				2, 'd', 'E',
+				3, 'X', 'y', 'z',
+				0,
+			},
+			lifetime: 0,
+			domainNames: []string{
+				"abc",
+				"de.xyz",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid0Domains",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				0,
+				0, 0, 0, 0, 0, 0, 0,
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         nil,
+		},
+		{
+			name: "NoTrailingNull",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				7, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         io.ErrUnexpectedEOF,
+		},
+		{
+			name: "IncorrectLength",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				8, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         io.ErrUnexpectedEOF,
+		},
+		{
+			name: "IncorrectLengthWithNULL",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				7, 'a', 'b', 'c', 'd', 'e', 'f',
+				0,
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "LabelOfLength63",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				0,
+			},
+			lifetime: 0,
+			domainNames: []string{
+				"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk",
+			},
+			err: nil,
+		},
+		{
+			name: "LabelOfLength64",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				64, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l',
+				0,
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "DomainNameOfLength255",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				62, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j',
+				0,
+			},
+			lifetime: 0,
+			domainNames: []string{
+				"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghij",
+			},
+			err: nil,
+		},
+		{
+			name: "DomainNameOfLength256",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				0,
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "StartingDigitForLabel",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, '9', 'b', 'c',
+				0,
+				0, 0, 0,
+			},
+			lifetime:    time.Second,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "StartingHyphenForLabel",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, '-', 'b', 'c',
+				0,
+				0, 0, 0,
+			},
+			lifetime:    time.Second,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "EndingHyphenForLabel",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, 'a', 'b', '-',
+				0,
+				0, 0, 0,
+			},
+			lifetime:    time.Second,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "EndingDigitForLabel",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, 'a', 'b', '9',
+				0,
+				0, 0, 0,
+			},
+			lifetime: time.Second,
+			domainNames: []string{
+				"ab9",
+			},
+			err: nil,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opt := NDPDNSSearchList(test.buf)
+
+			if got := opt.Lifetime(); got != test.lifetime {
+				t.Errorf("got Lifetime = %d, want = %d", got, test.lifetime)
+			}
+			domainNames, err := opt.DomainNames()
+			if !errors.Is(err, test.err) {
+				t.Errorf("opt.DomainNames() = %s", err)
+			}
+			if diff := cmp.Diff(domainNames, test.domainNames); diff != "" {
+				t.Errorf("mismatched domain names (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestNDPSearchListOptionDomainNameLabelInvalidSymbols(t *testing.T) {
+	for r := rune(0); r <= 255; r++ {
+		t.Run(fmt.Sprintf("RuneVal=%d", r), func(t *testing.T) {
+			buf := []byte{
+				0, 0,
+				0, 0, 0, 0,
+				3, 'a', 0 /* will be replaced */, 'c',
+				0,
+				0, 0, 0,
+			}
+			buf[8] = uint8(r)
+			opt := NDPDNSSearchList(buf)
+
+			// As per RFC 1035 section 2.3.1, the label must only include ASCII
+			// letters, digits and hyphens (a-z, A-Z, 0-9, -).
+			var expectedErr error
+			re := regexp.MustCompile(`[a-zA-Z0-9-]`)
+			if !re.Match([]byte{byte(r)}) {
+				expectedErr = ErrNDPOptMalformedBody
+			}
+
+			if domainNames, err := opt.DomainNames(); !errors.Is(err, expectedErr) {
+				t.Errorf("got opt.DomainNames() = (%s, %v), want = (_, %v)", domainNames, err, ErrNDPOptMalformedBody)
+			}
+		})
+	}
+}
+
+func TestNDPDNSSearchListOptionSerialize(t *testing.T) {
+	b := []byte{
+		9, 8,
+		1, 0, 0, 0,
+		3, 'a', 'b', 'c',
+		4, 'a', 'b', 'c', 'd',
+		1, 'e',
+		0,
+	}
+	targetBuf := []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
+	expected := []byte{
+		31, 3, 0, 0,
+		1, 0, 0, 0,
+		3, 'a', 'b', 'c',
+		4, 'a', 'b', 'c', 'd',
+		1, 'e',
+		0,
+		0, 0, 0, 0,
+	}
+	opts := NDPOptions(targetBuf)
+	serializer := NDPOptionsSerializer{
+		NDPDNSSearchList(b),
+	}
+	if got, want := opts.Serialize(serializer), len(expected); got != want {
+		t.Errorf("got Serialize = %d, want = %d", got, want)
+	}
+	if !bytes.Equal(targetBuf, expected) {
+		t.Fatalf("got targetBuf = %x, want = %x", targetBuf, expected)
+	}
+
+	it, err := opts.Iter(true)
+	if err != nil {
+		t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+	}
+
+	next, done, err := it.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got := next.Type(); got != NDPDNSSearchListOptionType {
+		t.Errorf("got Type = %d, want = %d", got, NDPDNSSearchListOptionType)
+	}
+
+	opt, ok := next.(NDPDNSSearchList)
+	if !ok {
+		t.Fatalf("next (type = %T) cannot be casted to an NDPDNSSearchList", next)
+	}
+	if got := opt.Type(); got != 31 {
+		t.Errorf("got Type = %d, want = 31", got)
+	}
+	if got := opt.Length(); got != 22 {
+		t.Errorf("got Length = %d, want = 22", got)
+	}
+	if got, want := opt.Lifetime(), 16777216*time.Second; got != want {
+		t.Errorf("got Lifetime = %s, want = %s", got, want)
+	}
+	domainNames, err := opt.DomainNames()
+	if err != nil {
+		t.Errorf("opt.DomainNames() = %s", err)
+	}
+	if diff := cmp.Diff(domainNames, []string{"abc.abcd.e"}); diff != "" {
+		t.Errorf("domain names mismatch (-want +got):\n%s", diff)
+	}
+
+	// Iterator should not return anything else.
+	next, done, err = it.Next()
+	if err != nil {
+		t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if !done {
+		t.Error("got Next = (_, false, _), want = (_, true, _)")
+	}
+	if next != nil {
+		t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+	}
+}
+
 // TestNDPOptionsIterCheck tests that Iter will return false if the NDPOptions
 // the iterator was returned for is malformed.
 func TestNDPOptionsIterCheck(t *testing.T) {
@@ -832,6 +1305,107 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 			},
 			expectedErr: ErrNDPOptMalformedBody,
 		},
+		{
+			name: "DNSSearchListLargeCompliantRFC1035",
+			buf: []byte{
+				31, 33, 0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				62, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j',
+				0,
+			},
+			expectedErr: nil,
+		},
+		{
+			name: "DNSSearchListNonCompliantRFC1035",
+			buf: []byte{
+				31, 33, 0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+			},
+			expectedErr: ErrNDPOptMalformedBody,
+		},
+		{
+			name: "DNSSearchListValidSmall",
+			buf: []byte{
+				31, 2, 0, 0,
+				0, 0, 0, 0,
+				6, 'a', 'b', 'c', 'd', 'e', 'f',
+				0,
+			},
+			expectedErr: nil,
+		},
+		{
+			name: "DNSSearchListTooSmall",
+			buf: []byte{
+				31, 1, 0, 0,
+				0, 0, 0,
+			},
+			expectedErr: io.ErrUnexpectedEOF,
+		},
 	}
 
 	for _, test := range tests {
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 8140c6dd4..193a9dfde 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -241,6 +241,16 @@ type NDPDispatcher interface {
 	// call functions on the stack itself.
 	OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration)
 
+	// OnDNSSearchListOption will be called when an NDP option with a DNS
+	// search list has been received.
+	//
+	// It is up to the caller to use the domain names in the search list
+	// for only their valid lifetime. OnDNSSearchListOption may be called
+	// with new or already known domain names. If called with known domain
+	// names, their valid lifetimes must be refreshed to lifetime (it may
+	// be increased, decreased or completely invalidated when lifetime = 0.
+	OnDNSSearchListOption(nicID tcpip.NICID, domainNames []string, lifetime time.Duration)
+
 	// OnDHCPv6Configuration will be called with an updated configuration that is
 	// available via DHCPv6 for a specified NIC.
 	//
@@ -714,6 +724,14 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 			addrs, _ := opt.Addresses()
 			ndp.nic.stack.ndpDisp.OnRecursiveDNSServerOption(ndp.nic.ID(), addrs, opt.Lifetime())
 
+		case header.NDPDNSSearchList:
+			if ndp.nic.stack.ndpDisp == nil {
+				continue
+			}
+
+			domainNames, _ := opt.DomainNames()
+			ndp.nic.stack.ndpDisp.OnDNSSearchListOption(ndp.nic.ID(), domainNames, opt.Lifetime())
+
 		case header.NDPPrefixInformation:
 			prefix := opt.Subnet()
 
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 6562a2d22..6dd460984 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -133,6 +133,12 @@ type ndpRDNSSEvent struct {
 	rdnss ndpRDNSS
 }
 
+type ndpDNSSLEvent struct {
+	nicID       tcpip.NICID
+	domainNames []string
+	lifetime    time.Duration
+}
+
 type ndpDHCPv6Event struct {
 	nicID         tcpip.NICID
 	configuration stack.DHCPv6ConfigurationFromNDPRA
@@ -150,6 +156,8 @@ type ndpDispatcher struct {
 	rememberPrefix       bool
 	autoGenAddrC         chan ndpAutoGenAddrEvent
 	rdnssC               chan ndpRDNSSEvent
+	dnsslC               chan ndpDNSSLEvent
+	routeTable           []tcpip.Route
 	dhcpv6ConfigurationC chan ndpDHCPv6Event
 }
 
@@ -257,6 +265,17 @@ func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tc
 	}
 }
 
+// Implements stack.NDPDispatcher.OnDNSSearchListOption.
+func (n *ndpDispatcher) OnDNSSearchListOption(nicID tcpip.NICID, domainNames []string, lifetime time.Duration) {
+	if n.dnsslC != nil {
+		n.dnsslC <- ndpDNSSLEvent{
+			nicID,
+			domainNames,
+			lifetime,
+		}
+	}
+}
+
 // Implements stack.NDPDispatcher.OnDHCPv6Configuration.
 func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration stack.DHCPv6ConfigurationFromNDPRA) {
 	if c := n.dhcpv6ConfigurationC; c != nil {
@@ -3386,6 +3405,112 @@ func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
 	}
 }
 
+// TestNDPDNSSearchListDispatch tests that the integrator is informed when an
+// NDP DNS Search List option is received with at least one domain name in the
+// search list.
+func TestNDPDNSSearchListDispatch(t *testing.T) {
+	const nicID = 1
+
+	ndpDisp := ndpDispatcher{
+		dnsslC: make(chan ndpDNSSLEvent, 3),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	optSer := header.NDPOptionsSerializer{
+		header.NDPDNSSearchList([]byte{
+			0, 0,
+			0, 0, 0, 0,
+			2, 'h', 'i',
+			0,
+		}),
+		header.NDPDNSSearchList([]byte{
+			0, 0,
+			0, 0, 0, 1,
+			1, 'i',
+			0,
+			2, 'a', 'm',
+			2, 'm', 'e',
+			0,
+		}),
+		header.NDPDNSSearchList([]byte{
+			0, 0,
+			0, 0, 1, 0,
+			3, 'x', 'y', 'z',
+			0,
+			5, 'h', 'e', 'l', 'l', 'o',
+			5, 'w', 'o', 'r', 'l', 'd',
+			0,
+			4, 't', 'h', 'i', 's',
+			2, 'i', 's',
+			1, 'a',
+			4, 't', 'e', 's', 't',
+			0,
+		}),
+	}
+	expected := []struct {
+		domainNames []string
+		lifetime    time.Duration
+	}{
+		{
+			domainNames: []string{
+				"hi",
+			},
+			lifetime: 0,
+		},
+		{
+			domainNames: []string{
+				"i",
+				"am.me",
+			},
+			lifetime: time.Second,
+		},
+		{
+			domainNames: []string{
+				"xyz",
+				"hello.world",
+				"this.is.a.test",
+			},
+			lifetime: 256 * time.Second,
+		},
+	}
+
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, optSer))
+
+	for i, expected := range expected {
+		select {
+		case dnssl := <-ndpDisp.dnsslC:
+			if dnssl.nicID != nicID {
+				t.Errorf("got %d-th dnssl nicID = %d, want = %d", i, dnssl.nicID, nicID)
+			}
+			if diff := cmp.Diff(dnssl.domainNames, expected.domainNames); diff != "" {
+				t.Errorf("%d-th dnssl domain names mismatch (-want +got):\n%s", i, diff)
+			}
+			if dnssl.lifetime != expected.lifetime {
+				t.Errorf("got %d-th dnssl lifetime = %s, want = %s", i, dnssl.lifetime, expected.lifetime)
+			}
+		default:
+			t.Fatal("expected a DNSSL event")
+		}
+	}
+
+	// Should have no more DNSSL options.
+	select {
+	case <-ndpDisp.dnsslC:
+		t.Fatal("unexpectedly got a DNSSL event")
+	default:
+	}
+}
+
 // TestCleanupNDPState tests that all discovered routers and prefixes, and
 // auto-generated addresses are invalidated when a NIC becomes a router.
 func TestCleanupNDPState(t *testing.T) {
-- 
cgit v1.2.3


From e838290e671c9d72dbaa3aba13bf0c35f1147de4 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 17 Apr 2020 15:31:51 -0700
Subject: prlimit: don't check credentials on self

prlimit was erroneously comparing UIDs and GIDs when getting/setting a process'
own limits. From the manpage:

To set or get the resources of a process other than itself, the caller must have
the CAP_SYS_RESOURCE capability, or the real, effective, and saved set user IDs
of the target process must match the real user ID of the caller and the real,
effective, and saved set group IDs of the target process must match the real
group ID of the caller.

PiperOrigin-RevId: 307127266
---
 pkg/sentry/syscalls/linux/sys_rlimit.go |  2 +-
 test/syscalls/linux/uidgid.cc           | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go
index e08c333d6..d5d5b6959 100644
--- a/pkg/sentry/syscalls/linux/sys_rlimit.go
+++ b/pkg/sentry/syscalls/linux/sys_rlimit.go
@@ -197,7 +197,7 @@ func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	// saved set user IDs of the target process must match the real user ID of
 	// the caller and the real, effective, and saved set group IDs of the
 	// target process must match the real group ID of the caller."
-	if !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.PIDNamespace().UserNamespace()) {
+	if ot != t && !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.PIDNamespace().UserNamespace()) {
 		cred, tcred := t.Credentials(), ot.Credentials()
 		if cred.RealKUID != tcred.RealKUID ||
 			cred.RealKUID != tcred.EffectiveKUID ||
diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc
index 6218fbce1..ff66a79f4 100644
--- a/test/syscalls/linux/uidgid.cc
+++ b/test/syscalls/linux/uidgid.cc
@@ -14,6 +14,7 @@
 
 #include <errno.h>
 #include <grp.h>
+#include <sys/resource.h>
 #include <sys/types.h>
 #include <unistd.h>
 
@@ -249,6 +250,17 @@ TEST(UidGidRootTest, Setgroups) {
               SyscallFailsWithErrno(EFAULT));
 }
 
+TEST(UidGidRootTest, Setuid_prlimit) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+  // Change our UID.
+  EXPECT_THAT(seteuid(65534), SyscallSucceeds());
+
+  // Despite the UID change, we should be able to get our own limits.
+  struct rlimit rl = {};
+  ASSERT_THAT(prlimit(0, RLIMIT_NOFILE, NULL, &rl), SyscallSucceeds());
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From 9a233c94f1bd54c7c3c11f7166a09e2eacd179c5 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 17 Apr 2020 22:18:36 -0700
Subject: Fix watchdog skipStack: the meaning was reversed.

PiperOrigin-RevId: 307166317
---
 pkg/sentry/watchdog/watchdog.go | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index f7d6009a0..fcc46420f 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -319,8 +319,8 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo
 
 	// Dump stack only if a new task is detected or if it sometime has
 	// passed since the last time a stack dump was generated.
-	skipStack := newTaskFound || time.Since(w.lastStackDump) >= stackDumpSameTaskPeriod
-	w.doAction(w.TaskTimeoutAction, skipStack, &buf)
+	showStack := newTaskFound || time.Since(w.lastStackDump) >= stackDumpSameTaskPeriod
+	w.doAction(w.TaskTimeoutAction, showStack, &buf)
 }
 
 func (w *Watchdog) reportStuckWatchdog() {
@@ -329,16 +329,15 @@ func (w *Watchdog) reportStuckWatchdog() {
 	w.doAction(w.TaskTimeoutAction, false, &buf)
 }
 
-// doAction will take the given action. If the action is LogWarnind and
-// skipStack is true, then the stack printing will be skipped.
-func (w *Watchdog) doAction(action Action, skipStack bool, msg *bytes.Buffer) {
+// doAction will take the given action. If the action is LogWarning and
+// showStack is false, then the stack printing will be skipped.
+func (w *Watchdog) doAction(action Action, showStack bool, msg *bytes.Buffer) {
 	switch action {
 	case LogWarning:
-		if skipStack {
+		if !showStack {
 			msg.WriteString("\n...[stack dump skipped]...")
 			log.Warningf(msg.String())
 			return
-
 		}
 		log.TracebackAll(msg.String())
 		w.lastStackDump = time.Now()
-- 
cgit v1.2.3


From 08b2fd9bc2a963ea15821b782cf6d80c15dbdf42 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Sun, 19 Apr 2020 20:47:55 -0700
Subject: Convert tcp_user_timeout test from packetdrill to packetimpact.

PiperOrigin-RevId: 307328289
---
 test/packetdrill/BUILD                           |  12 +--
 test/packetdrill/linux/tcp_user_timeout.pkt      |  39 ---------
 test/packetdrill/netstack/tcp_user_timeout.pkt   |  38 ---------
 test/packetimpact/tests/BUILD                    |  10 +++
 test/packetimpact/tests/tcp_user_timeout_test.go | 100 +++++++++++++++++++++++
 5 files changed, 111 insertions(+), 88 deletions(-)
 delete mode 100644 test/packetdrill/linux/tcp_user_timeout.pkt
 delete mode 100644 test/packetdrill/netstack/tcp_user_timeout.pkt
 create mode 100644 test/packetimpact/tests/tcp_user_timeout_test.go

diff --git a/test/packetdrill/BUILD b/test/packetdrill/BUILD
index fb0b2db41..dfcd55f60 100644
--- a/test/packetdrill/BUILD
+++ b/test/packetdrill/BUILD
@@ -1,4 +1,4 @@
-load("defs.bzl", "packetdrill_linux_test", "packetdrill_netstack_test", "packetdrill_test")
+load("defs.bzl", "packetdrill_test")
 
 package(licenses = ["notice"])
 
@@ -17,16 +17,6 @@ packetdrill_test(
     scripts = ["fin_wait2_timeout.pkt"],
 )
 
-packetdrill_linux_test(
-    name = "tcp_user_timeout_test_linux_test",
-    scripts = ["linux/tcp_user_timeout.pkt"],
-)
-
-packetdrill_netstack_test(
-    name = "tcp_user_timeout_test_netstack_test",
-    scripts = ["netstack/tcp_user_timeout.pkt"],
-)
-
 packetdrill_test(
     name = "listen_close_before_handshake_complete_test",
     scripts = ["listen_close_before_handshake_complete.pkt"],
diff --git a/test/packetdrill/linux/tcp_user_timeout.pkt b/test/packetdrill/linux/tcp_user_timeout.pkt
deleted file mode 100644
index 38018cb42..000000000
--- a/test/packetdrill/linux/tcp_user_timeout.pkt
+++ /dev/null
@@ -1,39 +0,0 @@
-// Test that a socket w/ TCP_USER_TIMEOUT set aborts the connection
-// if there is pending unacked data after the user specified timeout.
-
-0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
-+0 bind(3, ..., ...) = 0
-
-+0 listen(3, 1) = 0
-
-// Establish a connection without timestamps.
-+0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
-+0 > S. 0:0(0) ack 1 <...>
-+0.1 < . 1:1(0) ack 1 win 32792
-
-+0.100 accept(3, ..., ...) = 4
-
-// Okay, we received nothing, and decide to close this idle socket.
-// We set TCP_USER_TIMEOUT to 3 seconds because really it is not worth
-// trying hard to cleanly close this flow, at the price of keeping
-// a TCP structure in kernel for about 1 minute!
-+2 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0
-
-// The write/ack is required mainly for netstack as netstack does
-// not update its RTO during the handshake.
-+0 write(4, ..., 100) = 100
-+0 > P. 1:101(100) ack 1 <...>
-+0 < . 1:1(0) ack 101 win 32792
-
-+0 close(4) = 0
-
-+0 > F. 101:101(0) ack 1 <...>
-+.3~+.400 > F. 101:101(0) ack 1 <...>
-+.3~+.400 > F. 101:101(0) ack 1 <...>
-+.6~+.800 > F. 101:101(0) ack 1 <...>
-+1.2~+1.300 > F. 101:101(0) ack 1 <...>
-
-// We finally receive something from the peer, but it is way too late
-// Our socket vanished because TCP_USER_TIMEOUT was really small.
-+.1 < . 1:2(1) ack 102 win 32792
-+0 > R 102:102(0) win 0
diff --git a/test/packetdrill/netstack/tcp_user_timeout.pkt b/test/packetdrill/netstack/tcp_user_timeout.pkt
deleted file mode 100644
index 60103adba..000000000
--- a/test/packetdrill/netstack/tcp_user_timeout.pkt
+++ /dev/null
@@ -1,38 +0,0 @@
-// Test that a socket w/ TCP_USER_TIMEOUT set aborts the connection
-// if there is pending unacked data after the user specified timeout.
-
-0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
-+0 bind(3, ..., ...) = 0
-
-+0 listen(3, 1) = 0
-
-// Establish a connection without timestamps.
-+0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
-+0 > S. 0:0(0) ack 1 <...>
-+0.1 < . 1:1(0) ack 1 win 32792
-
-+0.100 accept(3, ..., ...) = 4
-
-// Okay, we received nothing, and decide to close this idle socket.
-// We set TCP_USER_TIMEOUT to 3 seconds because really it is not worth
-// trying hard to cleanly close this flow, at the price of keeping
-// a TCP structure in kernel for about 1 minute!
-+2 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0
-
-// The write/ack is required mainly for netstack as netstack does
-// not update its RTO during the handshake.
-+0 write(4, ..., 100) = 100
-+0 > P. 1:101(100) ack 1 <...>
-+0 < . 1:1(0) ack 101 win 32792
-
-+0 close(4) = 0
-
-+0 > F. 101:101(0) ack 1 <...>
-+.2~+.300 > F. 101:101(0) ack 1 <...>
-+.4~+.500 > F. 101:101(0) ack 1 <...>
-+.8~+.900 > F. 101:101(0) ack 1 <...>
-
-// We finally receive something from the peer, but it is way too late
-// Our socket vanished because TCP_USER_TIMEOUT was really small.
-+1.61 < . 1:2(1) ack 102 win 32792
-+0 > R 102:102(0) win 0
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 690cee140..1410512e6 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -88,6 +88,16 @@ packetimpact_go_test(
     ],
 )
 
+packetimpact_go_test(
+    name = "tcp_user_timeout",
+    srcs = ["tcp_user_timeout_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
 sh_binary(
     name = "test_runner",
     srcs = ["test_runner.sh"],
diff --git a/test/packetimpact/tests/tcp_user_timeout_test.go b/test/packetimpact/tests/tcp_user_timeout_test.go
new file mode 100644
index 000000000..3cf82badb
--- /dev/null
+++ b/test/packetimpact/tests/tcp_user_timeout_test.go
@@ -0,0 +1,100 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_user_timeout_test
+
+import (
+	"fmt"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func sendPayload(conn *tb.TCPIPv4, dut *tb.DUT, fd int32) error {
+	sampleData := make([]byte, 100)
+	for i := range sampleData {
+		sampleData[i] = uint8(i)
+	}
+	conn.Drain()
+	dut.Send(fd, sampleData, 0)
+	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &tb.Payload{Bytes: sampleData}, time.Second); err != nil {
+		return fmt.Errorf("expected data but got none: %w", err)
+	}
+	return nil
+}
+
+func sendFIN(conn *tb.TCPIPv4, dut *tb.DUT, fd int32) error {
+	dut.Close(fd)
+	return nil
+}
+
+func TestTCPUserTimeout(t *testing.T) {
+	for _, tt := range []struct {
+		description string
+		userTimeout time.Duration
+		sendDelay   time.Duration
+	}{
+		{"NoUserTimeout", 0, 3 * time.Second},
+		{"ACKBeforeUserTimeout", 5 * time.Second, 4 * time.Second},
+		{"ACKAfterUserTimeout", 5 * time.Second, 7 * time.Second},
+	} {
+		for _, ttf := range []struct {
+			description string
+			f           func(conn *tb.TCPIPv4, dut *tb.DUT, fd int32) error
+		}{
+			{"AfterPayload", sendPayload},
+			{"AfterFIN", sendFIN},
+		} {
+			t.Run(tt.description+ttf.description, func(t *testing.T) {
+				// Create a socket, listen, TCP handshake, and accept.
+				dut := tb.NewDUT(t)
+				defer dut.TearDown()
+				listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+				defer dut.Close(listenFD)
+				conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+				defer conn.Close()
+				conn.Handshake()
+				acceptFD, _ := dut.Accept(listenFD)
+
+				if tt.userTimeout != 0 {
+					dut.SetSockOptInt(acceptFD, unix.SOL_TCP, unix.TCP_USER_TIMEOUT, int32(tt.userTimeout.Milliseconds()))
+				}
+
+				if err := ttf.f(&conn, &dut, acceptFD); err != nil {
+					t.Fatal(err)
+				}
+
+				time.Sleep(tt.sendDelay)
+				conn.Drain()
+				conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+
+				// If TCP_USER_TIMEOUT was set and the above delay was longer than the
+				// TCP_USER_TIMEOUT then the DUT should send a RST in response to the
+				// testbench's packet.
+				expectRST := tt.userTimeout != 0 && tt.sendDelay > tt.userTimeout
+				expectTimeout := 5 * time.Second
+				got, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, expectTimeout)
+				if expectRST && err != nil {
+					t.Errorf("expected RST packet within %s but got none: %s", expectTimeout, err)
+				}
+				if !expectRST && got != nil {
+					t.Errorf("expected no RST packet within %s but got one: %s", expectTimeout, got)
+				}
+			})
+		}
+	}
+}
-- 
cgit v1.2.3


From db2a60be67f0e869a58eb12d253a0d7fe13ebfa3 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Sun, 19 Apr 2020 22:14:53 -0700
Subject: Don't accept segments outside the receive window

Fixed to match RFC 793 page 69.

Fixes #1607

PiperOrigin-RevId: 307334892
---
 pkg/tcpip/seqnum/seqnum.go                        |  5 --
 pkg/tcpip/transport/tcp/BUILD                     | 10 +++
 pkg/tcpip/transport/tcp/accept.go                 | 14 ++---
 pkg/tcpip/transport/tcp/connect.go                | 15 +----
 pkg/tcpip/transport/tcp/endpoint.go               | 13 ++++
 pkg/tcpip/transport/tcp/rcv.go                    | 23 +++++--
 pkg/tcpip/transport/tcp/rcv_test.go               | 74 +++++++++++++++++++++++
 pkg/tcpip/transport/tcpconntrack/BUILD            |  1 +
 pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go | 13 +---
 test/packetimpact/tests/BUILD                     |  2 -
 10 files changed, 125 insertions(+), 45 deletions(-)
 create mode 100644 pkg/tcpip/transport/tcp/rcv_test.go

diff --git a/pkg/tcpip/seqnum/seqnum.go b/pkg/tcpip/seqnum/seqnum.go
index b40a3c212..d3bea7de4 100644
--- a/pkg/tcpip/seqnum/seqnum.go
+++ b/pkg/tcpip/seqnum/seqnum.go
@@ -46,11 +46,6 @@ func (v Value) InWindow(first Value, size Size) bool {
 	return v.InRange(first, first.Add(size))
 }
 
-// Overlap checks if the window [a,a+b) overlaps with the window [x, x+y).
-func Overlap(a Value, b Size, x Value, y Size) bool {
-	return a.LessThan(x.Add(y)) && x.LessThan(a.Add(b))
-}
-
 // Add calculates the sequence number following the [v, v+s) window.
 func (v Value) Add(s Size) Value {
 	return v + Value(s)
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index edb7718a6..61426623c 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -109,3 +109,13 @@ go_test(
         "//runsc/testutil",
     ],
 )
+
+go_test(
+    name = "rcv_test",
+    size = "small",
+    srcs = ["rcv_test.go"],
+    deps = [
+        ":tcp",
+        "//pkg/tcpip/seqnum",
+    ],
+)
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 5bb243e3b..e6a23c978 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -101,7 +101,7 @@ type listenContext struct {
 
 	// v6Only is true if listenEP is a dual stack socket and has the
 	// IPV6_V6ONLY option set.
-	v6only bool
+	v6Only bool
 
 	// netProto indicates the network protocol(IPv4/v6) for the listening
 	// endpoint.
@@ -126,12 +126,12 @@ func timeStamp() uint32 {
 }
 
 // newListenContext creates a new listen context.
-func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6only bool, netProto tcpip.NetworkProtocolNumber) *listenContext {
+func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6Only bool, netProto tcpip.NetworkProtocolNumber) *listenContext {
 	l := &listenContext{
 		stack:            stk,
 		rcvWnd:           rcvWnd,
 		hasher:           sha1.New(),
-		v6only:           v6only,
+		v6Only:           v6Only,
 		netProto:         netProto,
 		listenEP:         listenEP,
 		pendingEndpoints: make(map[stack.TransportEndpointID]*endpoint),
@@ -207,7 +207,7 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 		netProto = s.route.NetProto
 	}
 	n := newEndpoint(l.stack, netProto, queue)
-	n.v6only = l.v6only
+	n.v6only = l.v6Only
 	n.ID = s.id
 	n.boundNICID = s.route.NICID()
 	n.route = s.route.Clone()
@@ -293,7 +293,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	}
 
 	// Perform the 3-way handshake.
-	h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept)
+	h := newPassiveHandshake(ep, ep.rcv.rcvWnd, isn, irs, opts, deferAccept)
 	if err := h.execute(); err != nil {
 		ep.mu.Unlock()
 		ep.Close()
@@ -613,8 +613,8 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 // its own goroutine and is responsible for handling connection requests.
 func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 	e.mu.Lock()
-	v6only := e.v6only
-	ctx := newListenContext(e.stack, e, rcvWnd, v6only, e.NetProto)
+	v6Only := e.v6only
+	ctx := newListenContext(e.stack, e, rcvWnd, v6Only, e.NetProto)
 
 	defer func() {
 		// Mark endpoint as closed. This will prevent goroutines running
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 368865911..76e27bf26 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -105,24 +105,11 @@ type handshake struct {
 }
 
 func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
-	rcvWndScale := ep.rcvWndScaleForHandshake()
-
-	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
-	// window offered in SYN won't be reduced due to the loss of precision if
-	// window scaling is enabled after the handshake.
-	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)
-
-	// Ensure we can always accept at least 1 byte if the scale specified
-	// was too high for the provided rcvWnd.
-	if rcvWnd == 0 {
-		rcvWnd = 1
-	}
-
 	h := handshake{
 		ep:          ep,
 		active:      true,
 		rcvWnd:      rcvWnd,
-		rcvWndScale: int(rcvWndScale),
+		rcvWndScale: ep.rcvWndScaleForHandshake(),
 	}
 	h.resetState()
 	return h
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 7e8def82d..45f2aa78b 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1062,6 +1062,19 @@ func (e *endpoint) initialReceiveWindow() int {
 	if rcvWnd > routeWnd {
 		rcvWnd = routeWnd
 	}
+	rcvWndScale := e.rcvWndScaleForHandshake()
+
+	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
+	// window offered in SYN won't be reduced due to the loss of precision if
+	// window scaling is enabled after the handshake.
+	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)
+
+	// Ensure we can always accept at least 1 byte if the scale specified
+	// was too high for the provided rcvWnd.
+	if rcvWnd == 0 {
+		rcvWnd = 1
+	}
+
 	return rcvWnd
 }
 
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index caf8977b3..a4b73b588 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -70,13 +70,24 @@ func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale
 // acceptable checks if the segment sequence number range is acceptable
 // according to the table on page 26 of RFC 793.
 func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
-	rcvWnd := r.rcvNxt.Size(r.rcvAcc)
-	if rcvWnd == 0 {
-		return segLen == 0 && segSeq == r.rcvNxt
-	}
+	return Acceptable(segSeq, segLen, r.rcvNxt, r.rcvAcc)
+}
 
-	return segSeq.InWindow(r.rcvNxt, rcvWnd) ||
-		seqnum.Overlap(r.rcvNxt, rcvWnd, segSeq, segLen)
+// Acceptable checks if a segment that starts at segSeq and has length segLen is
+// "acceptable" for arriving in a receive window that starts at rcvNxt and ends
+// before rcvAcc, according to the table on page 26 and 69 of RFC 793.
+func Acceptable(segSeq seqnum.Value, segLen seqnum.Size, rcvNxt, rcvAcc seqnum.Value) bool {
+	if rcvNxt == rcvAcc {
+		return segLen == 0 && segSeq == rcvNxt
+	}
+	if segLen == 0 {
+		// rcvWnd is incremented by 1 because that is Linux's behavior despite the
+		// RFC.
+		return segSeq.InRange(rcvNxt, rcvAcc.Add(1))
+	}
+	// Page 70 of RFC 793 allows packets that can be made "acceptable" by trimming
+	// the payload, so we'll accept any payload that overlaps the receieve window.
+	return rcvNxt.LessThan(segSeq.Add(segLen)) && segSeq.LessThan(rcvAcc)
 }
 
 // getSendParams returns the parameters needed by the sender when building
diff --git a/pkg/tcpip/transport/tcp/rcv_test.go b/pkg/tcpip/transport/tcp/rcv_test.go
new file mode 100644
index 000000000..dc02729ce
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/rcv_test.go
@@ -0,0 +1,74 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package rcv_test
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+)
+
+func TestAcceptable(t *testing.T) {
+	for _, tt := range []struct {
+		segSeq         seqnum.Value
+		segLen         seqnum.Size
+		rcvNxt, rcvAcc seqnum.Value
+		want           bool
+	}{
+		// The segment is smaller than the window.
+		{105, 2, 100, 104, false},
+		{105, 2, 101, 105, false},
+		{105, 2, 102, 106, true},
+		{105, 2, 103, 107, true},
+		{105, 2, 104, 108, true},
+		{105, 2, 105, 109, true},
+		{105, 2, 106, 110, true},
+		{105, 2, 107, 111, false},
+
+		// The segment is larger than the window.
+		{105, 4, 103, 105, false},
+		{105, 4, 104, 106, true},
+		{105, 4, 105, 107, true},
+		{105, 4, 106, 108, true},
+		{105, 4, 107, 109, true},
+		{105, 4, 108, 110, true},
+		{105, 4, 109, 111, false},
+		{105, 4, 110, 112, false},
+
+		// The segment has no width.
+		{105, 0, 100, 102, false},
+		{105, 0, 101, 103, false},
+		{105, 0, 102, 104, false},
+		{105, 0, 103, 105, true},
+		{105, 0, 104, 106, true},
+		{105, 0, 105, 107, true},
+		{105, 0, 106, 108, false},
+		{105, 0, 107, 109, false},
+
+		// The receive window has no width.
+		{105, 2, 103, 103, false},
+		{105, 2, 104, 104, false},
+		{105, 2, 105, 105, false},
+		{105, 2, 106, 106, false},
+		{105, 2, 107, 107, false},
+		{105, 2, 108, 108, false},
+		{105, 2, 109, 109, false},
+	} {
+		if got := tcp.Acceptable(tt.segSeq, tt.segLen, tt.rcvNxt, tt.rcvAcc); got != tt.want {
+			t.Errorf("tcp.Acceptable(%d, %d, %d, %d) = %t, want %t", tt.segSeq, tt.segLen, tt.rcvNxt, tt.rcvAcc, got, tt.want)
+		}
+	}
+}
diff --git a/pkg/tcpip/transport/tcpconntrack/BUILD b/pkg/tcpip/transport/tcpconntrack/BUILD
index 3ad6994a7..2025ff757 100644
--- a/pkg/tcpip/transport/tcpconntrack/BUILD
+++ b/pkg/tcpip/transport/tcpconntrack/BUILD
@@ -9,6 +9,7 @@ go_library(
     deps = [
         "//pkg/tcpip/header",
         "//pkg/tcpip/seqnum",
+        "//pkg/tcpip/transport/tcp",
     ],
 )
 
diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
index 93712cd45..30d05200f 100644
--- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
+++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
@@ -20,6 +20,7 @@ package tcpconntrack
 import (
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 )
 
 // Result is returned when the state of a TCB is updated in response to an
@@ -311,17 +312,7 @@ type stream struct {
 // the window is zero, if it's a packet with no payload and sequence number
 // equal to una.
 func (s *stream) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
-	wnd := s.una.Size(s.end)
-	if wnd == 0 {
-		return segLen == 0 && segSeq == s.una
-	}
-
-	// Make sure [segSeq, seqSeq+segLen) is non-empty.
-	if segLen == 0 {
-		segLen = 1
-	}
-
-	return seqnum.Overlap(s.una, wnd, segSeq, segLen)
+	return tcp.Acceptable(segSeq, segLen, s.una, s.end)
 }
 
 // closed determines if the stream has already been closed. This happens when
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 1410512e6..47c722ccd 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -43,8 +43,6 @@ packetimpact_go_test(
 packetimpact_go_test(
     name = "tcp_outside_the_window",
     srcs = ["tcp_outside_the_window_test.go"],
-    # TODO(eyalsoha): Fix #1607 then remove the line below.
-    netstack = False,
     deps = [
         "//pkg/tcpip/header",
         "//pkg/tcpip/seqnum",
-- 
cgit v1.2.3


From 1a940f2b6c834693d9f85e3b648a3be3d986129d Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 20 Apr 2020 08:50:27 -0700
Subject: Resolve issue with file mode for host fds.

Instead of plumbing error through kernfs.Inode.Mode, panic if err != nil.
The errors that can result from an fstat syscall all indicate that something
is fundamentally wrong, and panicking should be acceptable.

PiperOrigin-RevId: 307406847
---
 pkg/sentry/fsimpl/host/host.go | 34 +++-------------------------------
 1 file changed, 3 insertions(+), 31 deletions(-)

diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 97fa7f7ab..fe14476f1 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -94,7 +94,6 @@ func ImportFD(mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, err
 		isTTY:    isTTY,
 		canMap:   canMap(uint32(fileType)),
 		ino:      fs.NextIno(),
-		mode:     fileMode,
 		// For simplicity, set offset to 0. Technically, we should use the existing
 		// offset on the host if the file is seekable.
 		offset: 0,
@@ -149,20 +148,6 @@ type inode struct {
 	// This field is initialized at creation time and is immutable.
 	ino uint64
 
-	// modeMu protects mode.
-	modeMu sync.Mutex
-
-	// mode is a cached version of the file mode on the host. Note that it may
-	// become out of date if the mode is changed on the host, e.g. with chmod.
-	//
-	// Generally, it is better to retrieve the mode from the host through an
-	// fstat syscall. We only use this value in inode.Mode(), which cannot
-	// return an error, if the syscall to host fails.
-	//
-	// FIXME(b/152294168): Plumb error into Inode.Mode() return value so we
-	// can get rid of this.
-	mode linux.FileMode
-
 	// offsetMu protects offset.
 	offsetMu sync.Mutex
 
@@ -195,10 +180,11 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a
 // Mode implements kernfs.Inode.
 func (i *inode) Mode() linux.FileMode {
 	mode, _, _, err := i.getPermissions()
+	// Retrieving the mode from the host fd using fstat(2) should not fail.
+	// If the syscall does not succeed, something is fundamentally wrong.
 	if err != nil {
-		return i.mode
+		panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err))
 	}
-
 	return linux.FileMode(mode)
 }
 
@@ -208,11 +194,6 @@ func (i *inode) getPermissions() (linux.FileMode, auth.KUID, auth.KGID, error) {
 	if err := syscall.Fstat(i.hostFD, &s); err != nil {
 		return 0, 0, 0, err
 	}
-
-	// Update cached mode.
-	i.modeMu.Lock()
-	i.mode = linux.FileMode(s.Mode)
-	i.modeMu.Unlock()
 	return linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid), nil
 }
 
@@ -292,12 +273,6 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro
 		ls.Ino = i.ino
 	}
 
-	// Update cached mode.
-	if (mask&linux.STATX_TYPE != 0) && (mask&linux.STATX_MODE != 0) {
-		i.modeMu.Lock()
-		i.mode = linux.FileMode(s.Mode)
-		i.modeMu.Unlock()
-	}
 	return ls, nil
 }
 
@@ -364,9 +339,6 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 		if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil {
 			return err
 		}
-		i.modeMu.Lock()
-		i.mode = linux.FileMode(s.Mode)
-		i.modeMu.Unlock()
 	}
 	if m&linux.STATX_SIZE != 0 {
 		if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
-- 
cgit v1.2.3


From e72ce8cce46ed48af306df50d252853c8790924d Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 20 Apr 2020 10:09:23 -0700
Subject: Change lingering uses of "memfs" in fsimpl/tmpfs to "tmpfs".

PiperOrigin-RevId: 307422746
---
 pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 4 ++--
 pkg/sentry/fsimpl/tmpfs/filesystem.go     | 2 +-
 pkg/sentry/fsimpl/tmpfs/tmpfs.go          | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index 383133e44..651912169 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -168,7 +168,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
 	}
 }
 
-func BenchmarkVFS2MemfsStat(b *testing.B) {
+func BenchmarkVFS2TmpfsStat(b *testing.B) {
 	for _, depth := range depths {
 		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
 			ctx := contexttest.Context(b)
@@ -362,7 +362,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
 	}
 }
 
-func BenchmarkVFS2MemfsMountStat(b *testing.B) {
+func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
 	for _, depth := range depths {
 		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
 			ctx := contexttest.Context(b)
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 660f5a29b..452c4e2e0 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -148,7 +148,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
 	if !dir && rp.MustBeDir() {
 		return syserror.ENOENT
 	}
-	// In memfs, the only way to cause a dentry to be disowned is by removing
+	// In tmpfs, the only way to cause a dentry to be disowned is by removing
 	// it from the filesystem, so this check is equivalent to checking if
 	// parent has been removed.
 	if parent.vfsd.IsDisowned() {
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index a59b24d45..82c709b43 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -247,7 +247,7 @@ func (i *inode) incLinksLocked() {
 		panic("tmpfs.inode.incLinksLocked() called with no existing links")
 	}
 	if i.nlink == maxLinks {
-		panic("memfs.inode.incLinksLocked() called with maximum link count")
+		panic("tmpfs.inode.incLinksLocked() called with maximum link count")
 	}
 	atomic.AddUint32(&i.nlink, 1)
 }
-- 
cgit v1.2.3


From 9ba3086d9dfc7e9a4a957446f443786b179cd84e Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 20 Apr 2020 11:08:01 -0700
Subject: Move runtime_tests.sh to align with other scripts.

PiperOrigin-RevId: 307435879
---
 kokoro/runtime_tests/go1.12.cfg       |  2 +-
 kokoro/runtime_tests/java11.cfg       |  2 +-
 kokoro/runtime_tests/nodejs12.4.0.cfg |  2 +-
 kokoro/runtime_tests/php7.3.6.cfg     |  2 +-
 kokoro/runtime_tests/python3.7.3.cfg  |  2 +-
 scripts/runtime_tests.sh              | 26 ++++++++++++++++++++++++++
 6 files changed, 31 insertions(+), 5 deletions(-)
 create mode 100755 scripts/runtime_tests.sh

diff --git a/kokoro/runtime_tests/go1.12.cfg b/kokoro/runtime_tests/go1.12.cfg
index fd4911e88..04bfe2868 100644
--- a/kokoro/runtime_tests/go1.12.cfg
+++ b/kokoro/runtime_tests/go1.12.cfg
@@ -1,4 +1,4 @@
-build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
+build_file: "github/github/scripts/runtime_tests.sh"
 
 env_vars {
   key: "RUNTIME_TEST_NAME"
diff --git a/kokoro/runtime_tests/java11.cfg b/kokoro/runtime_tests/java11.cfg
index 7f8611a08..c82855cd2 100644
--- a/kokoro/runtime_tests/java11.cfg
+++ b/kokoro/runtime_tests/java11.cfg
@@ -1,4 +1,4 @@
-build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
+build_file: "github/github/scripts/runtime_tests.sh"
 
 env_vars {
   key: "RUNTIME_TEST_NAME"
diff --git a/kokoro/runtime_tests/nodejs12.4.0.cfg b/kokoro/runtime_tests/nodejs12.4.0.cfg
index c67ad5567..5512db5df 100644
--- a/kokoro/runtime_tests/nodejs12.4.0.cfg
+++ b/kokoro/runtime_tests/nodejs12.4.0.cfg
@@ -1,4 +1,4 @@
-build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
+build_file: "github/github/scripts/runtime_tests.sh"
 
 env_vars {
   key: "RUNTIME_TEST_NAME"
diff --git a/kokoro/runtime_tests/php7.3.6.cfg b/kokoro/runtime_tests/php7.3.6.cfg
index f266c5e26..bc9ac92aa 100644
--- a/kokoro/runtime_tests/php7.3.6.cfg
+++ b/kokoro/runtime_tests/php7.3.6.cfg
@@ -1,4 +1,4 @@
-build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
+build_file: "github/github/scripts/runtime_tests.sh"
 
 env_vars {
   key: "RUNTIME_TEST_NAME"
diff --git a/kokoro/runtime_tests/python3.7.3.cfg b/kokoro/runtime_tests/python3.7.3.cfg
index 574add152..12eb13860 100644
--- a/kokoro/runtime_tests/python3.7.3.cfg
+++ b/kokoro/runtime_tests/python3.7.3.cfg
@@ -1,4 +1,4 @@
-build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
+build_file: "github/github/scripts/runtime_tests.sh"
 
 env_vars {
   key: "RUNTIME_TEST_NAME"
diff --git a/scripts/runtime_tests.sh b/scripts/runtime_tests.sh
new file mode 100755
index 000000000..350a59f7c
--- /dev/null
+++ b/scripts/runtime_tests.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# Check that a runtime is provided.
+if [ ! -v RUNTIME_TEST_NAME ]; then
+  echo "Must set $RUNTIME_TEST_NAME" >&2
+  exit 1
+fi
+
+install_runsc_for_test runtimes
+test_runsc "//test/runtimes:${RUNTIME_TEST_NAME}_test"
-- 
cgit v1.2.3


From 470633d7e916e7956f4ebd75559f92cf12067cbf Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Mon, 20 Apr 2020 12:57:18 -0700
Subject: Fix release.sh. git commands need to be run in git repo.

PiperOrigin-RevId: 307458938
---
 scripts/release.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/release.sh b/scripts/release.sh
index e14ba04a7..ac7eff3ef 100755
--- a/scripts/release.sh
+++ b/scripts/release.sh
@@ -14,7 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-source $(dirname $0)/common.sh
+cd $(dirname $0)/..
+source scripts/common.sh
 
 # Tag a release only if provided.
 if ! [[ -v KOKORO_RELEASE_COMMIT ]]; then
-- 
cgit v1.2.3


From 1a597e01bed5d5fb30b3d444e0a23669c5587235 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 20 Apr 2020 15:47:16 -0700
Subject: Add a functional vm_test for root_test.

This change renames the tools/images directory to tools/vm for clarity, and
adds a functional vm_test. Sharding is also added to the same test, and some
documentation added around key flags & variables to describe how they work.

Subsequent changes will add vm_tests for other cases, such as the runtime tests.

PiperOrigin-RevId: 307492245
---
 benchmarks/BUILD                         |   4 +-
 benchmarks/runner/commands.py            |   5 +-
 runsc/dockerutil/dockerutil.go           |  12 +-
 test/root/BUILD                          |  14 ++-
 tools/images/BUILD                       |  63 ----------
 tools/images/README.md                   |  42 -------
 tools/images/build.sh                    | 113 ------------------
 tools/images/defs.bzl                    | 191 -----------------------------
 tools/images/execute.sh                  | 152 ------------------------
 tools/images/test.cc                     |  23 ----
 tools/images/ubuntu1604/10_core.sh       |  43 -------
 tools/images/ubuntu1604/20_bazel.sh      |  38 ------
 tools/images/ubuntu1604/25_docker.sh     |  54 ---------
 tools/images/ubuntu1604/30_containerd.sh |  86 --------------
 tools/images/ubuntu1604/40_kokoro.sh     |  72 -----------
 tools/images/ubuntu1604/BUILD            |   7 --
 tools/images/ubuntu1804/BUILD            |   7 --
 tools/images/zone.sh                     |  17 ---
 tools/installers/head.sh                 |   2 +-
 tools/vm/BUILD                           |  57 +++++++++
 tools/vm/README.md                       |  42 +++++++
 tools/vm/build.sh                        | 117 ++++++++++++++++++
 tools/vm/defs.bzl                        | 198 +++++++++++++++++++++++++++++++
 tools/vm/execute.sh                      | 160 +++++++++++++++++++++++++
 tools/vm/test.cc                         |  27 +++++
 tools/vm/ubuntu1604/10_core.sh           |  43 +++++++
 tools/vm/ubuntu1604/20_bazel.sh          |  38 ++++++
 tools/vm/ubuntu1604/25_docker.sh         |  54 +++++++++
 tools/vm/ubuntu1604/30_containerd.sh     |  86 ++++++++++++++
 tools/vm/ubuntu1604/40_kokoro.sh         |  72 +++++++++++
 tools/vm/ubuntu1604/BUILD                |   7 ++
 tools/vm/ubuntu1804/BUILD                |   7 ++
 tools/vm/zone.sh                         |  17 +++
 33 files changed, 954 insertions(+), 916 deletions(-)
 delete mode 100644 tools/images/BUILD
 delete mode 100644 tools/images/README.md
 delete mode 100755 tools/images/build.sh
 delete mode 100644 tools/images/defs.bzl
 delete mode 100755 tools/images/execute.sh
 delete mode 100644 tools/images/test.cc
 delete mode 100755 tools/images/ubuntu1604/10_core.sh
 delete mode 100755 tools/images/ubuntu1604/20_bazel.sh
 delete mode 100755 tools/images/ubuntu1604/25_docker.sh
 delete mode 100755 tools/images/ubuntu1604/30_containerd.sh
 delete mode 100755 tools/images/ubuntu1604/40_kokoro.sh
 delete mode 100644 tools/images/ubuntu1604/BUILD
 delete mode 100644 tools/images/ubuntu1804/BUILD
 delete mode 100755 tools/images/zone.sh
 create mode 100644 tools/vm/BUILD
 create mode 100644 tools/vm/README.md
 create mode 100755 tools/vm/build.sh
 create mode 100644 tools/vm/defs.bzl
 create mode 100755 tools/vm/execute.sh
 create mode 100644 tools/vm/test.cc
 create mode 100755 tools/vm/ubuntu1604/10_core.sh
 create mode 100755 tools/vm/ubuntu1604/20_bazel.sh
 create mode 100755 tools/vm/ubuntu1604/25_docker.sh
 create mode 100755 tools/vm/ubuntu1604/30_containerd.sh
 create mode 100755 tools/vm/ubuntu1604/40_kokoro.sh
 create mode 100644 tools/vm/ubuntu1604/BUILD
 create mode 100644 tools/vm/ubuntu1804/BUILD
 create mode 100755 tools/vm/zone.sh

diff --git a/benchmarks/BUILD b/benchmarks/BUILD
index 2a2d15d7e..ac44f479d 100644
--- a/benchmarks/BUILD
+++ b/benchmarks/BUILD
@@ -13,8 +13,8 @@ py_binary(
     data = select({
         ":gcloud_rule": [],
         "//conditions:default": [
-            "//tools/images:ubuntu1604",
-            "//tools/images:zone",
+            "//tools/vm:ubuntu1604",
+            "//tools/vm:zone",
         ],
     }),
     main = "run.py",
diff --git a/benchmarks/runner/commands.py b/benchmarks/runner/commands.py
index e8289f6c5..9a391eb01 100644
--- a/benchmarks/runner/commands.py
+++ b/benchmarks/runner/commands.py
@@ -103,13 +103,12 @@ class GCPCommand(RunCommand):
         ("--image_file",),
         help="The binary that emits the GCP image.",
         default=os.path.join(
-            os.path.dirname(__file__), "../../tools/images/ubuntu1604"),
+            os.path.dirname(__file__), "../../tools/vm/ubuntu1604"),
     )
     zone_file = click.core.Option(
         ("--zone_file",),
         help="The binary that emits the GCP zone.",
-        default=os.path.join(
-            os.path.dirname(__file__), "../../tools/images/zone"),
+        default=os.path.join(os.path.dirname(__file__), "../../tools/vm/zone"),
     )
     internal = click.core.Option(
         ("--internal/--no-internal",),
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
index 1ff5e8cc3..f009486bc 100644
--- a/runsc/dockerutil/dockerutil.go
+++ b/runsc/dockerutil/dockerutil.go
@@ -36,8 +36,18 @@ import (
 )
 
 var (
+	// runtime is the runtime to use for tests. This will be applied to all
+	// containers. Note that the default here ("runsc") corresponds to the
+	// default used by the installations. This is important, because the
+	// default installer for vm_tests (in tools/installers:head, invoked
+	// via tools/vm:defs.bzl) will install with this name. So without
+	// changing anything, tests should have a runsc runtime available to
+	// them. Otherwise installers should update the existing runtime
+	// instead of installing a new one.
 	runtime = flag.String("runtime", "runsc", "specify which runtime to use")
-	config  = flag.String("config_path", "/etc/docker/daemon.json", "configuration file for reading paths")
+
+	// config is the default Docker daemon configuration path.
+	config = flag.String("config_path", "/etc/docker/daemon.json", "configuration file for reading paths")
 )
 
 // EnsureSupportedDockerVersion checks if correct docker is installed.
diff --git a/test/root/BUILD b/test/root/BUILD
index ddc9b4955..05166673a 100644
--- a/test/root/BUILD
+++ b/test/root/BUILD
@@ -1,4 +1,5 @@
 load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/vm:defs.bzl", "vm_test")
 
 package(licenses = ["notice"])
 
@@ -24,7 +25,9 @@ go_test(
     library = ":root",
     tags = [
         # Requires docker and runsc to be configured before the test runs.
-        # Also test only runs as root.
+        # Also, the test needs to be run as root. Note that below, the
+        # root_vm_test relies on the default runtime 'runsc' being installed by
+        # the default installer.
         "manual",
         "local",
     ],
@@ -44,3 +47,12 @@ go_test(
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
+
+vm_test(
+    name = "root_vm_test",
+    shard_count = 1,
+    targets = [
+        "//tools/installers:shim",
+        ":root_test",
+    ],
+)
diff --git a/tools/images/BUILD b/tools/images/BUILD
deleted file mode 100644
index 8d319e3e4..000000000
--- a/tools/images/BUILD
+++ /dev/null
@@ -1,63 +0,0 @@
-load("//tools:defs.bzl", "cc_binary", "gtest")
-load("//tools/images:defs.bzl", "vm_image", "vm_test")
-
-package(
-    default_visibility = ["//:sandbox"],
-    licenses = ["notice"],
-)
-
-sh_binary(
-    name = "zone",
-    srcs = ["zone.sh"],
-)
-
-sh_binary(
-    name = "builder",
-    srcs = ["build.sh"],
-)
-
-sh_binary(
-    name = "executer",
-    srcs = ["execute.sh"],
-)
-
-cc_binary(
-    name = "test",
-    testonly = 1,
-    srcs = ["test.cc"],
-    linkstatic = 1,
-    deps = [
-        gtest,
-        "//test/util:test_main",
-    ],
-)
-
-vm_image(
-    name = "ubuntu1604",
-    family = "ubuntu-1604-lts",
-    project = "ubuntu-os-cloud",
-    scripts = [
-        "//tools/images/ubuntu1604",
-    ],
-)
-
-vm_test(
-    name = "ubuntu1604_test",
-    image = ":ubuntu1604",
-    targets = [":test"],
-)
-
-vm_image(
-    name = "ubuntu1804",
-    family = "ubuntu-1804-lts",
-    project = "ubuntu-os-cloud",
-    scripts = [
-        "//tools/images/ubuntu1804",
-    ],
-)
-
-vm_test(
-    name = "ubuntu1804_test",
-    image = ":ubuntu1804",
-    targets = [":test"],
-)
diff --git a/tools/images/README.md b/tools/images/README.md
deleted file mode 100644
index 26c0f84f2..000000000
--- a/tools/images/README.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# Images
-
-All commands in this directory require the `gcloud` project to be set.
-
-For example: `gcloud config set project gvisor-kokoro-testing`.
-
-Images can be generated by using the `vm_image` rule. This rule will generate a
-binary target that builds an image in an idempotent way, and can be referenced
-from other rules.
-
-For example:
-
-```
-vm_image(
-    name = "ubuntu",
-    project = "ubuntu-1604-lts",
-    family = "ubuntu-os-cloud",
-    scripts = [
-        "script.sh",
-        "other.sh",
-    ],
-)
-```
-
-These images can be built manually by executing the target. The output on
-`stdout` will be the image id (in the current project).
-
-Images are always named per the hash of all the hermetic input scripts. This
-allows images to be memoized quickly and easily.
-
-The `vm_test` rule can be used to execute a command remotely. This is still
-under development however, and will likely change over time.
-
-For example:
-
-```
-vm_test(
-    name = "mycommand",
-    image = ":ubuntu",
-    targets = [":test"],
-)
-```
diff --git a/tools/images/build.sh b/tools/images/build.sh
deleted file mode 100755
index f39f723b8..000000000
--- a/tools/images/build.sh
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This script is responsible for building a new GCP image that: 1) has nested
-# virtualization enabled, and 2) has been completely set up with the
-# image_setup.sh script. This script should be idempotent, as we memoize the
-# setup script with a hash and check for that name.
-
-set -eou pipefail
-
-# Parameters.
-declare -r USERNAME=${USERNAME:-test}
-declare -r IMAGE_PROJECT=${IMAGE_PROJECT:-ubuntu-os-cloud}
-declare -r IMAGE_FAMILY=${IMAGE_FAMILY:-ubuntu-1604-lts}
-declare -r ZONE=${ZONE:-us-central1-f}
-
-# Random names.
-declare -r DISK_NAME=$(mktemp -u disk-XXXXXX | tr A-Z a-z)
-declare -r SNAPSHOT_NAME=$(mktemp -u snapshot-XXXXXX | tr A-Z a-z)
-declare -r INSTANCE_NAME=$(mktemp -u build-XXXXXX | tr A-Z a-z)
-
-# Hash inputs in order to memoize the produced image.
-declare -r SETUP_HASH=$( (echo ${USERNAME} ${IMAGE_PROJECT} ${IMAGE_FAMILY} && cat "$@") | sha256sum - | cut -d' ' -f1 | cut -c 1-16)
-declare -r IMAGE_NAME=${IMAGE_FAMILY:-image}-${SETUP_HASH}
-
-# Does the image already exist? Skip the build.
-declare -r existing=$(set -x; gcloud compute images list --filter="name=(${IMAGE_NAME})" --format="value(name)")
-if ! [[ -z "${existing}" ]]; then
-  echo "${existing}"
-  exit 0
-fi
-
-# gcloud has path errors; is this a result of being a genrule?
-export PATH=${PATH:-/bin:/usr/bin:/usr/local/bin}
-
-# Start a unique instance. Note that this instance will have a unique persistent
-# disk as it's boot disk with the same name as the instance.
-(set -x; gcloud compute instances create \
-    --quiet \
-    --image-project "${IMAGE_PROJECT}" \
-    --image-family "${IMAGE_FAMILY}" \
-    --boot-disk-size "200GB" \
-    --zone "${ZONE}" \
-    "${INSTANCE_NAME}" >/dev/null)
-function cleanup {
-  (set -x; gcloud compute instances delete --quiet --zone "${ZONE}" "${INSTANCE_NAME}")
-}
-trap cleanup EXIT
-
-# Wait for the instance to become available (up to 5 minutes).
-echo -n "Waiting for ${INSTANCE_NAME}"
-declare timeout=300
-declare success=0
-declare internal=""
-declare -r start=$(date +%s)
-declare -r end=$((${start}+${timeout}))
-while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do
-  echo -n "."
-  if gcloud compute ssh --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- env - true 2>/dev/null; then
-    success=$((${success}+1))
-  elif gcloud compute ssh --internal-ip --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- env - true 2>/dev/null; then
-    success=$((${success}+1))
-    internal="--internal-ip"
-  fi
-done
-
-if [[ "${success}" -eq "0" ]]; then
-  echo "connect timed out after ${timeout} seconds."
-  exit 1
-else
-  echo "done."
-fi
-
-# Run the install scripts provided.
-for arg; do
-  (set -x; gcloud compute ssh ${internal} \
-      --zone "${ZONE}" \
-      "${USERNAME}"@"${INSTANCE_NAME}" -- \
-      sudo bash - <"${arg}" >/dev/null)
-done
-
-# Stop the instance; required before creating an image.
-(set -x; gcloud compute instances stop --quiet --zone "${ZONE}" "${INSTANCE_NAME}" >/dev/null)
-
-# Create a snapshot of the instance disk.
-(set -x; gcloud compute disks snapshot \
-    --quiet \
-    --zone "${ZONE}" \
-    --snapshot-names="${SNAPSHOT_NAME}" \
-    "${INSTANCE_NAME}" >/dev/null)
-
-# Create the disk image.
-(set -x; gcloud compute images create \
-    --quiet \
-    --source-snapshot="${SNAPSHOT_NAME}" \
-    --licenses="https://www.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx" \
-    "${IMAGE_NAME}" >/dev/null)
-
-# Finish up.
-echo "${IMAGE_NAME}"
diff --git a/tools/images/defs.bzl b/tools/images/defs.bzl
deleted file mode 100644
index 2847e1847..000000000
--- a/tools/images/defs.bzl
+++ /dev/null
@@ -1,191 +0,0 @@
-"""Image configuration. See README.md."""
-
-load("//tools:defs.bzl", "default_installer")
-
-# vm_image_builder is a rule that will construct a shell script that actually
-# generates a given VM image. Note that this does not _run_ the shell script
-# (although it can be run manually). It will be run manually during generation
-# of the vm_image target itself. This level of indirection is used so that the
-# build system itself only runs the builder once when multiple targets depend
-# on it, avoiding a set of races and conflicts.
-def _vm_image_builder_impl(ctx):
-    # Generate a binary that actually builds the image.
-    builder = ctx.actions.declare_file(ctx.label.name)
-    script_paths = []
-    for script in ctx.files.scripts:
-        script_paths.append(script.short_path)
-    builder_content = "\n".join([
-        "#!/bin/bash",
-        "export ZONE=$(%s)" % ctx.files.zone[0].short_path,
-        "export USERNAME=%s" % ctx.attr.username,
-        "export IMAGE_PROJECT=%s" % ctx.attr.project,
-        "export IMAGE_FAMILY=%s" % ctx.attr.family,
-        "%s %s" % (ctx.files._builder[0].short_path, " ".join(script_paths)),
-        "",
-    ])
-    ctx.actions.write(builder, builder_content, is_executable = True)
-
-    # Note that the scripts should only be files, and should not include any
-    # indirect transitive dependencies. The build script wouldn't work.
-    return [DefaultInfo(
-        executable = builder,
-        runfiles = ctx.runfiles(
-            files = ctx.files.scripts + ctx.files._builder + ctx.files.zone,
-        ),
-    )]
-
-vm_image_builder = rule(
-    attrs = {
-        "_builder": attr.label(
-            executable = True,
-            default = "//tools/images:builder",
-            cfg = "host",
-        ),
-        "username": attr.string(default = "$(whoami)"),
-        "zone": attr.label(
-            executable = True,
-            default = "//tools/images:zone",
-            cfg = "host",
-        ),
-        "family": attr.string(mandatory = True),
-        "project": attr.string(mandatory = True),
-        "scripts": attr.label_list(allow_files = True),
-    },
-    executable = True,
-    implementation = _vm_image_builder_impl,
-)
-
-# See vm_image_builder above.
-def _vm_image_impl(ctx):
-    # Run the builder to generate our output.
-    echo = ctx.actions.declare_file(ctx.label.name)
-    resolved_inputs, argv, runfiles_manifests = ctx.resolve_command(
-        command = "echo -ne \"#!/bin/bash\\necho $(%s)\\n\" > %s && chmod 0755 %s" % (
-            ctx.files.builder[0].path,
-            echo.path,
-            echo.path,
-        ),
-        tools = [ctx.attr.builder],
-    )
-    ctx.actions.run_shell(
-        tools = resolved_inputs,
-        outputs = [echo],
-        progress_message = "Building image...",
-        execution_requirements = {"local": "true"},
-        command = argv,
-        input_manifests = runfiles_manifests,
-    )
-
-    # Return just the echo command. All of the builder runfiles have been
-    # resolved and consumed in the generation of the trivial echo script.
-    return [DefaultInfo(executable = echo)]
-
-_vm_image = rule(
-    attrs = {
-        "builder": attr.label(
-            executable = True,
-            cfg = "host",
-        ),
-    },
-    executable = True,
-    implementation = _vm_image_impl,
-)
-
-def vm_image(name, **kwargs):
-    vm_image_builder(
-        name = name + "_builder",
-        **kwargs
-    )
-    _vm_image(
-        name = name,
-        builder = ":" + name + "_builder",
-    )
-
-def _vm_test_impl(ctx):
-    runner = ctx.actions.declare_file("%s-executer" % ctx.label.name)
-
-    # Note that the remote execution case must actually generate an
-    # intermediate target in order to collect all the relevant runfiles so that
-    # they can be copied over for remote execution.
-    runner_content = "\n".join([
-        "#!/bin/bash",
-        "export ZONE=$(cat %s)" % ctx.files.zone[0].short_path,
-        "export USERNAME=%s" % ctx.attr.username,
-        "export IMAGE=$(cat %s)" % ctx.files.image[0].short_path,
-        "export SUDO=%s" % "true" if ctx.attr.sudo else "false",
-        "%s %s" % (
-            ctx.executable.executer.short_path,
-            " ".join([
-                target.files_to_run.executable.short_path
-                for target in ctx.attr.targets
-            ]),
-        ),
-        "",
-    ])
-    ctx.actions.write(runner, runner_content, is_executable = True)
-
-    # Return with all transitive files.
-    runfiles = ctx.runfiles(
-        transitive_files = depset(transitive = [
-            depset(target.data_runfiles.files)
-            for target in ctx.attr.targets
-            if hasattr(target, "data_runfiles")
-        ]),
-        files = ctx.files.executer + ctx.files.zone + ctx.files.image +
-                ctx.files.targets,
-        collect_default = True,
-        collect_data = True,
-    )
-    return [DefaultInfo(executable = runner, runfiles = runfiles)]
-
-_vm_test = rule(
-    attrs = {
-        "image": attr.label(
-            mandatory = True,
-            cfg = "host",
-        ),
-        "executer": attr.label(
-            executable = True,
-            default = "//tools/images:executer",
-            cfg = "host",
-        ),
-        "username": attr.string(default = "$(whoami)"),
-        "zone": attr.label(
-            default = "//tools/images:zone",
-            cfg = "host",
-        ),
-        "sudo": attr.bool(default = True),
-        "machine": attr.string(default = "n1-standard-1"),
-        "targets": attr.label_list(
-            mandatory = True,
-            allow_empty = False,
-            cfg = "target",
-        ),
-    },
-    test = True,
-    implementation = _vm_test_impl,
-)
-
-def vm_test(
-        installer = "//tools/installers:head",
-        **kwargs):
-    """Runs the given targets as a remote test.
-
-    Args:
-      installer: Script to run before all targets.
-      **kwargs: All test arguments. Should include targets and image.
-    """
-    targets = kwargs.pop("targets", [])
-    if installer:
-        targets = [installer] + targets
-    if default_installer():
-        targets = [default_installer()] + targets
-    _vm_test(
-        tags = [
-            "local",
-            "manual",
-        ],
-        targets = targets,
-        local = 1,
-        **kwargs
-    )
diff --git a/tools/images/execute.sh b/tools/images/execute.sh
deleted file mode 100755
index ba4b1ac0e..000000000
--- a/tools/images/execute.sh
+++ /dev/null
@@ -1,152 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -xeo pipefail
-
-# Required input.
-if ! [[ -v IMAGE ]]; then
-  echo "no image provided: set IMAGE."
-  exit 1
-fi
-
-# Parameters.
-declare -r USERNAME=${USERNAME:-test}
-declare -r KEYNAME=$(mktemp --tmpdir -u key-XXXXXX)
-declare -r SSHKEYS=$(mktemp --tmpdir -u sshkeys-XXXXXX)
-declare -r INSTANCE_NAME=$(mktemp -u test-XXXXXX | tr A-Z a-z)
-declare -r MACHINE=${MACHINE:-n1-standard-1}
-declare -r ZONE=${ZONE:-us-central1-f}
-declare -r SUDO=${SUDO:-false}
-
-# This script is executed as a test rule, which will reset the value of HOME.
-# Unfortunately, it is needed to load the gconfig credentials. We will reset
-# HOME when we actually execute in the remote environment, defined below.
-export HOME=$(eval echo ~$(whoami))
-
-# Generate unique keys for this test.
-[[ -f "${KEYNAME}" ]] || ssh-keygen -t rsa -N "" -f "${KEYNAME}" -C "${USERNAME}"
-cat > "${SSHKEYS}" <<EOF
-${USERNAME}:$(cat ${KEYNAME}.pub)
-EOF
-
-# Start a unique instance. This means that we first generate a unique set of ssh
-# keys to ensure that only we have access to this instance. Note that we must
-# constrain ourselves to Haswell or greater in order to have nested
-# virtualization available.
-gcloud compute instances create \
-    --min-cpu-platform "Intel Haswell" \
-    --preemptible \
-    --no-scopes \
-    --metadata block-project-ssh-keys=TRUE \
-    --metadata-from-file ssh-keys="${SSHKEYS}" \
-    --machine-type "${MACHINE}" \
-    --image "${IMAGE}" \
-    --zone "${ZONE}" \
-    "${INSTANCE_NAME}"
-function cleanup {
-    gcloud compute instances delete --quiet --zone "${ZONE}" "${INSTANCE_NAME}"
-}
-trap cleanup EXIT
-
-# Wait for the instance to become available (up to 5 minutes).
-declare timeout=300
-declare success=0
-declare -r start=$(date +%s)
-declare -r end=$((${start}+${timeout}))
-while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do
-  if gcloud compute ssh --ssh-key-file="${KEYNAME}" --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then
-    success=$((${success}+1))
-  fi
-done
-if [[ "${success}" -eq "0" ]]; then
-  echo "connect timed out after ${timeout} seconds."
-  exit 1
-fi
-
-# Copy the local directory over.
-tar czf - --dereference --exclude=.git . |
-    gcloud compute ssh \
-        --ssh-key-file="${KEYNAME}" \
-        --zone "${ZONE}" \
-        "${USERNAME}"@"${INSTANCE_NAME}" -- tar xzf -
-
-# Execute the command remotely.
-for cmd; do
-  # Setup relevant environment.
-  #
-  # N.B. This is not a complete test environment, but is complete enough to
-  # provide rudimentary sharding and test output support.
-  declare -a PREFIX=( "env" )
-  if [[ -v TEST_SHARD_INDEX ]]; then
-    PREFIX+=( "TEST_SHARD_INDEX=${TEST_SHARD_INDEX}" )
-  fi
-  if [[ -v TEST_SHARD_STATUS_FILE ]]; then
-    SHARD_STATUS_FILE=$(mktemp -u test-shard-status-XXXXXX)
-    PREFIX+=( "TEST_SHARD_STATUS_FILE=/tmp/${SHARD_STATUS_FILE}" )
-  fi
-  if [[ -v TEST_TOTAL_SHARDS ]]; then
-    PREFIX+=( "TEST_TOTAL_SHARDS=${TEST_TOTAL_SHARDS}" )
-  fi
-  if [[ -v TEST_TMPDIR ]]; then
-    REMOTE_TMPDIR=$(mktemp -u test-XXXXXX)
-    PREFIX+=( "TEST_TMPDIR=/tmp/${REMOTE_TMPDIR}" )
-    # Create remotely.
-    gcloud compute ssh \
-      --ssh-key-file="${KEYNAME}" \
-      --zone "${ZONE}" \
-      "${USERNAME}"@"${INSTANCE_NAME}" -- \
-      mkdir -p "/tmp/${REMOTE_TMPDIR}"
-  fi
-  if [[ -v XML_OUTPUT_FILE ]]; then
-    TEST_XML_OUTPUT=$(mktemp -u xml-output-XXXXXX)
-    PREFIX+=( "XML_OUTPUT_FILE=/tmp/${TEST_XML_OUTPUT}" )
-  fi
-  if [[ "${SUDO}" == "true" ]]; then
-    PREFIX+=( "sudo" "-E" )
-  fi
-
-  # Execute the command.
-  gcloud compute ssh \
-    --ssh-key-file="${KEYNAME}" \
-    --zone "${ZONE}" \
-    "${USERNAME}"@"${INSTANCE_NAME}" -- \
-    "${PREFIX[@]}" "${cmd}"
-
-  # Collect relevant results.
-  if [[ -v TEST_SHARD_STATUS_FILE ]]; then
-    gcloud compute scp \
-        --ssh-key-file="${KEYNAME}" \
-        --zone "${ZONE}" \
-        "${USERNAME}"@"${INSTANCE_NAME}":/tmp/"${SHARD_STATUS_FILE}" \
-        "${TEST_SHARD_STATUS_FILE}" 2>/dev/null || true # Allowed to fail.
-  fi
-  if [[ -v XML_OUTPUT_FILE ]]; then
-    gcloud compute scp \
-        --ssh-key-file="${KEYNAME}" \
-        --zone "${ZONE}" \
-        "${USERNAME}"@"${INSTANCE_NAME}":/tmp/"${TEST_XML_OUTPUT}" \
-        "${XML_OUTPUT_FILE}" 2>/dev/null || true # Allowed to fail.
-  fi
-
-  # Clean up the temporary directory.
-  if [[ -v TEST_TMPDIR ]]; then
-    gcloud compute ssh \
-      --ssh-key-file="${KEYNAME}" \
-      --zone "${ZONE}" \
-      "${USERNAME}"@"${INSTANCE_NAME}" -- \
-      rm -rf "/tmp/${REMOTE_TMPDIR}"
-  fi
-done
diff --git a/tools/images/test.cc b/tools/images/test.cc
deleted file mode 100644
index 4f31d93c5..000000000
--- a/tools/images/test.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gtest/gtest.h"
-
-namespace {
-
-TEST(Image, Sanity) {
-  // Do nothing.
-}
-
-}  // namespace
diff --git a/tools/images/ubuntu1604/10_core.sh b/tools/images/ubuntu1604/10_core.sh
deleted file mode 100755
index cd518d6ac..000000000
--- a/tools/images/ubuntu1604/10_core.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -xeo pipefail
-
-# Install all essential build tools.
-while true; do
-  if (apt-get update && apt-get install -y \
-      make \
-      git-core \
-      build-essential \
-      linux-headers-$(uname -r) \
-      pkg-config); then
-    break
-  fi
-  result=$?
-  if [[ $result -ne 100 ]]; then
-    exit $result
-  fi
-done
-
-# Install a recent go toolchain.
-if ! [[ -d /usr/local/go ]]; then
-    wget https://dl.google.com/go/go1.13.5.linux-amd64.tar.gz
-    tar -xvf go1.13.5.linux-amd64.tar.gz
-    mv go /usr/local
-fi
-
-# Link the Go binary from /usr/bin; replacing anything there.
-(cd /usr/bin && rm -f go && sudo ln -fs /usr/local/go/bin/go go)
diff --git a/tools/images/ubuntu1604/20_bazel.sh b/tools/images/ubuntu1604/20_bazel.sh
deleted file mode 100755
index bb7afa676..000000000
--- a/tools/images/ubuntu1604/20_bazel.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -xeo pipefail
-
-declare -r BAZEL_VERSION=2.0.0
-
-# Install bazel dependencies.
-while true; do
-  if (apt-get update && apt-get install -y \
-      openjdk-8-jdk-headless \
-      unzip); then
-    break
-  fi
-  result=$?
-  if [[ $result -ne 100 ]]; then
-    exit $result
-  fi
-done
-
-# Use the release installer.
-curl -L -o bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
-chmod a+x bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
-./bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
-rm -f bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
diff --git a/tools/images/ubuntu1604/25_docker.sh b/tools/images/ubuntu1604/25_docker.sh
deleted file mode 100755
index 11eea2d72..000000000
--- a/tools/images/ubuntu1604/25_docker.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Add dependencies.
-while true; do
-  if (apt-get update && apt-get install -y \
-      apt-transport-https \
-      ca-certificates \
-      curl \
-      gnupg-agent \
-      software-properties-common); then
-    break
-  fi
-  result=$?
-  if [[ $result -ne 100 ]]; then
-    exit $result
-  fi
-done
-
-# Install the key.
-curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
-
-# Add the repository.
-add-apt-repository \
-   "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
-   $(lsb_release -cs) \
-   stable"
-
-# Install docker.
-while true; do
-  if (apt-get update && apt-get install -y \
-      docker-ce \
-      docker-ce-cli \
-      containerd.io); then
-    break
-  fi
-  result=$?
-  if [[ $result -ne 100 ]]; then
-    exit $result
-  fi
-done
diff --git a/tools/images/ubuntu1604/30_containerd.sh b/tools/images/ubuntu1604/30_containerd.sh
deleted file mode 100755
index fb3699c12..000000000
--- a/tools/images/ubuntu1604/30_containerd.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -xeo pipefail
-
-# Helper for Go packages below.
-install_helper() {
-  PACKAGE="${1}"
-  TAG="${2}"
-  GOPATH="${3}"
-
-  # Clone the repository.
-  mkdir -p "${GOPATH}"/src/$(dirname "${PACKAGE}") && \
-     git clone https://"${PACKAGE}" "${GOPATH}"/src/"${PACKAGE}"
-
-  # Checkout and build the repository.
-  (cd "${GOPATH}"/src/"${PACKAGE}" && \
-      git checkout "${TAG}" && \
-      GOPATH="${GOPATH}" make && \
-      GOPATH="${GOPATH}" make install)
-}
-
-# Install dependencies for the crictl tests.
-while true; do
-  if (apt-get update && apt-get install -y \
-      btrfs-tools \
-      libseccomp-dev); then
-    break
-  fi
-  result=$?
-  if [[ $result -ne 100 ]]; then
-    exit $result
-  fi
-done
-
-# Install containerd & cri-tools.
-GOPATH=$(mktemp -d --tmpdir gopathXXXXX)
-install_helper github.com/containerd/containerd v1.2.2 "${GOPATH}"
-install_helper github.com/kubernetes-sigs/cri-tools v1.11.0 "${GOPATH}"
-
-# Install gvisor-containerd-shim.
-declare -r base="https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim"
-declare -r latest=$(mktemp --tmpdir gvisor-containerd-shim-latest.XXXXXX)
-declare -r shim_path=$(mktemp --tmpdir gvisor-containerd-shim.XXXXXX)
-wget --no-verbose "${base}"/latest -O ${latest}
-wget --no-verbose "${base}"/gvisor-containerd-shim-$(cat ${latest}) -O ${shim_path}
-chmod +x ${shim_path}
-mv ${shim_path} /usr/local/bin
-
-# Configure containerd-shim.
-declare -r shim_config_path=/etc/containerd
-declare -r shim_config_tmp_path=$(mktemp --tmpdir gvisor-containerd-shim.XXXXXX.toml)
-mkdir -p ${shim_config_path}
-cat > ${shim_config_tmp_path} <<-EOF
-    runc_shim = "/usr/local/bin/containerd-shim"
-
-[runsc_config]
-    debug = "true"
-    debug-log = "/tmp/runsc-logs/"
-    strace = "true"
-    file-access = "shared"
-EOF
-mv ${shim_config_tmp_path} ${shim_config_path}
-
-# Configure CNI.
-(cd "${GOPATH}" && GOPATH="${GOPATH}" \
-    src/github.com/containerd/containerd/script/setup/install-cni)
-
-# Cleanup the above.
-rm -rf "${GOPATH}"
-rm -rf "${latest}"
-rm -rf "${shim_path}"
-rm -rf "${shim_config_tmp_path}"
diff --git a/tools/images/ubuntu1604/40_kokoro.sh b/tools/images/ubuntu1604/40_kokoro.sh
deleted file mode 100755
index 06a1e6c48..000000000
--- a/tools/images/ubuntu1604/40_kokoro.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -xeo pipefail
-
-# Declare kokoro's required public keys.
-declare -r ssh_public_keys=(
-    "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDg7L/ZaEauETWrPklUTky3kvxqQfe2Ax/2CsSqhNIGNMnK/8d79CHlmY9+dE1FFQ/RzKNCaltgy7XcN/fCYiCZr5jm2ZtnLuGNOTzupMNhaYiPL419qmL+5rZXt4/dWTrsHbFRACxT8j51PcRMO5wgbL0Bg2XXimbx8kDFaurL2gqduQYqlu4lxWCaJqOL71WogcimeL63Nq/yeH5PJPWpqE4P9VUQSwAzBWFK/hLeds/AiP3MgVS65qHBnhq0JsHy8JQsqjZbG7Iidt/Ll0+gqzEbi62gDIcczG4KC0iOVzDDP/1BxDtt1lKeA23ll769Fcm3rJyoBMYxjvdw1TDx sabujp@trigger.mtv.corp.google.com"
-    "ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBNgGK/hCdjmulHfRE3hp4rZs38NCR8yAh0eDsztxqGcuXnuSnL7jOlRrbcQpremJ84omD4eKrIpwJUs+YokMdv4= sabujp@trigger.svl.corp.google.com"
-)
-
-# Install dependencies.
-while true; do
-  if (apt-get update && apt-get install -y \
-      rsync \
-      coreutils \
-      python-psutil \
-      qemu-kvm \
-      python-pip \
-      python3-pip \
-      zip); then
-    break
-  fi
-  result=$?
-  if [[ $result -ne 100 ]]; then
-    exit $result
-  fi
-done
-
-# junitparser is used to merge junit xml files.
-pip install junitparser
-
-# We need a kbuilder user.
-if useradd -c "kbuilder user" -m -s /bin/bash kbuilder; then
-    # User was added successfully; we add the relevant SSH keys here.
-    mkdir -p ~kbuilder/.ssh
-    (IFS=$'\n'; echo "${ssh_public_keys[*]}") > ~kbuilder/.ssh/authorized_keys
-    chmod 0600 ~kbuilder/.ssh/authorized_keys
-    chown -R kbuilder ~kbuilder/.ssh
-fi
-
-# Give passwordless sudo access.
-cat > /etc/sudoers.d/kokoro <<EOF
-kbuilder ALL=(ALL) NOPASSWD:ALL
-EOF
-
-# Ensure we can run Docker without sudo.
-usermod -aG docker kbuilder
-
-# Ensure that we can access kvm.
-usermod -aG kvm kbuilder
-
-# Ensure that /tmpfs exists and is writable by kokoro.
-#
-# Note that kokoro will typically attach a second disk (sdb) to the instance
-# that is used for the /tmpfs volume. In the future we could setup an init
-# script that formats and mounts this here; however, we don't expect our build
-# artifacts to be that large.
-mkdir -p /tmpfs && chmod 0777 /tmpfs && touch /tmpfs/READY
diff --git a/tools/images/ubuntu1604/BUILD b/tools/images/ubuntu1604/BUILD
deleted file mode 100644
index ab1df0c4c..000000000
--- a/tools/images/ubuntu1604/BUILD
+++ /dev/null
@@ -1,7 +0,0 @@
-package(licenses = ["notice"])
-
-filegroup(
-    name = "ubuntu1604",
-    srcs = glob(["*.sh"]),
-    visibility = ["//:sandbox"],
-)
diff --git a/tools/images/ubuntu1804/BUILD b/tools/images/ubuntu1804/BUILD
deleted file mode 100644
index 7aa1ecdf7..000000000
--- a/tools/images/ubuntu1804/BUILD
+++ /dev/null
@@ -1,7 +0,0 @@
-package(licenses = ["notice"])
-
-alias(
-    name = "ubuntu1804",
-    actual = "//tools/images/ubuntu1604",
-    visibility = ["//:sandbox"],
-)
diff --git a/tools/images/zone.sh b/tools/images/zone.sh
deleted file mode 100755
index 79569fb19..000000000
--- a/tools/images/zone.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-# Copyright 2020 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-exec gcloud config get-value compute/zone
diff --git a/tools/installers/head.sh b/tools/installers/head.sh
index 9de8f138c..7fc566ebd 100755
--- a/tools/installers/head.sh
+++ b/tools/installers/head.sh
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 # Install our runtime.
-$(dirname $0)/runsc install
+$(find . -executable -type f -name runsc) install
 
 # Restart docker.
 service docker restart || true
diff --git a/tools/vm/BUILD b/tools/vm/BUILD
new file mode 100644
index 000000000..f7160c627
--- /dev/null
+++ b/tools/vm/BUILD
@@ -0,0 +1,57 @@
+load("//tools:defs.bzl", "cc_binary", "gtest")
+load("//tools/vm:defs.bzl", "vm_image", "vm_test")
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+sh_binary(
+    name = "zone",
+    srcs = ["zone.sh"],
+)
+
+sh_binary(
+    name = "builder",
+    srcs = ["build.sh"],
+)
+
+sh_binary(
+    name = "executer",
+    srcs = ["execute.sh"],
+)
+
+cc_binary(
+    name = "test",
+    testonly = 1,
+    srcs = ["test.cc"],
+    linkstatic = 1,
+    deps = [
+        gtest,
+        "//test/util:test_main",
+    ],
+)
+
+vm_image(
+    name = "ubuntu1604",
+    family = "ubuntu-1604-lts",
+    project = "ubuntu-os-cloud",
+    scripts = [
+        "//tools/vm/ubuntu1604",
+    ],
+)
+
+vm_image(
+    name = "ubuntu1804",
+    family = "ubuntu-1804-lts",
+    project = "ubuntu-os-cloud",
+    scripts = [
+        "//tools/vm/ubuntu1804",
+    ],
+)
+
+vm_test(
+    name = "vm_test",
+    shard_count = 2,
+    targets = [":test"],
+)
diff --git a/tools/vm/README.md b/tools/vm/README.md
new file mode 100644
index 000000000..898c95fca
--- /dev/null
+++ b/tools/vm/README.md
@@ -0,0 +1,42 @@
+# VM Images & Tests
+
+All commands in this directory require the `gcloud` project to be set.
+
+For example: `gcloud config set project gvisor-kokoro-testing`.
+
+Images can be generated by using the `vm_image` rule. This rule will generate a
+binary target that builds an image in an idempotent way, and can be referenced
+from other rules.
+
+For example:
+
+```
+vm_image(
+    name = "ubuntu",
+    project = "ubuntu-1604-lts",
+    family = "ubuntu-os-cloud",
+    scripts = [
+        "script.sh",
+        "other.sh",
+    ],
+)
+```
+
+These images can be built manually by executing the target. The output on
+`stdout` will be the image id (in the current project).
+
+Images are always named per the hash of all the hermetic input scripts. This
+allows images to be memoized quickly and easily.
+
+The `vm_test` rule can be used to execute a command remotely. This is still
+under development however, and will likely change over time.
+
+For example:
+
+```
+vm_test(
+    name = "mycommand",
+    image = ":ubuntu",
+    targets = [":test"],
+)
+```
diff --git a/tools/vm/build.sh b/tools/vm/build.sh
new file mode 100755
index 000000000..5d3dc0bbf
--- /dev/null
+++ b/tools/vm/build.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script is responsible for building a new GCP image that: 1) has nested
+# virtualization enabled, and 2) has been completely set up with the
+# image_setup.sh script. This script should be idempotent, as we memoize the
+# setup script with a hash and check for that name.
+
+set -eou pipefail
+
+# Parameters.
+declare -r USERNAME=${USERNAME:-test}
+declare -r IMAGE_PROJECT=${IMAGE_PROJECT:-ubuntu-os-cloud}
+declare -r IMAGE_FAMILY=${IMAGE_FAMILY:-ubuntu-1604-lts}
+declare -r ZONE=${ZONE:-us-central1-f}
+
+# Random names.
+declare -r DISK_NAME=$(mktemp -u disk-XXXXXX | tr A-Z a-z)
+declare -r SNAPSHOT_NAME=$(mktemp -u snapshot-XXXXXX | tr A-Z a-z)
+declare -r INSTANCE_NAME=$(mktemp -u build-XXXXXX | tr A-Z a-z)
+
+# Hash inputs in order to memoize the produced image.
+declare -r SETUP_HASH=$( (echo ${USERNAME} ${IMAGE_PROJECT} ${IMAGE_FAMILY} && cat "$@") | sha256sum - | cut -d' ' -f1 | cut -c 1-16)
+declare -r IMAGE_NAME=${IMAGE_FAMILY:-image}-${SETUP_HASH}
+
+# Does the image already exist? Skip the build.
+declare -r existing=$(set -x; gcloud compute images list --filter="name=(${IMAGE_NAME})" --format="value(name)")
+if ! [[ -z "${existing}" ]]; then
+  echo "${existing}"
+  exit 0
+fi
+
+# Standard arguments (applies only on script execution).
+declare -ar SSH_ARGS=("-o" "ConnectTimeout=60" "--")
+
+# gcloud has path errors; is this a result of being a genrule?
+export PATH=${PATH:-/bin:/usr/bin:/usr/local/bin}
+
+# Start a unique instance. Note that this instance will have a unique persistent
+# disk as it's boot disk with the same name as the instance.
+(set -x; gcloud compute instances create \
+    --quiet \
+    --image-project "${IMAGE_PROJECT}" \
+    --image-family "${IMAGE_FAMILY}" \
+    --boot-disk-size "200GB" \
+    --zone "${ZONE}" \
+    "${INSTANCE_NAME}" >/dev/null)
+function cleanup {
+  (set -x; gcloud compute instances delete --quiet --zone "${ZONE}" "${INSTANCE_NAME}")
+}
+trap cleanup EXIT
+
+# Wait for the instance to become available (up to 5 minutes).
+echo -n "Waiting for ${INSTANCE_NAME}"
+declare timeout=300
+declare success=0
+declare internal=""
+declare -r start=$(date +%s)
+declare -r end=$((${start}+${timeout}))
+while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do
+  echo -n "."
+  if gcloud compute ssh --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then
+    success=$((${success}+1))
+  elif gcloud compute ssh --internal-ip --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then
+    success=$((${success}+1))
+    internal="--internal-ip"
+  fi
+done
+
+if [[ "${success}" -eq "0" ]]; then
+  echo "connect timed out after ${timeout} seconds."
+  exit 1
+else
+  echo "done."
+fi
+
+# Run the install scripts provided.
+for arg; do
+  (set -x; gcloud compute ssh ${internal} \
+      --zone "${ZONE}" \
+      "${USERNAME}"@"${INSTANCE_NAME}" -- \
+      "${SSH_ARGS[@]}" \
+      sudo bash - <"${arg}" >/dev/null)
+done
+
+# Stop the instance; required before creating an image.
+(set -x; gcloud compute instances stop --quiet --zone "${ZONE}" "${INSTANCE_NAME}" >/dev/null)
+
+# Create a snapshot of the instance disk.
+(set -x; gcloud compute disks snapshot \
+    --quiet \
+    --zone "${ZONE}" \
+    --snapshot-names="${SNAPSHOT_NAME}" \
+    "${INSTANCE_NAME}" >/dev/null)
+
+# Create the disk image.
+(set -x; gcloud compute images create \
+    --quiet \
+    --source-snapshot="${SNAPSHOT_NAME}" \
+    --licenses="https://www.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx" \
+    "${IMAGE_NAME}" >/dev/null)
+
+# Finish up.
+echo "${IMAGE_NAME}"
diff --git a/tools/vm/defs.bzl b/tools/vm/defs.bzl
new file mode 100644
index 000000000..24bf0aabc
--- /dev/null
+++ b/tools/vm/defs.bzl
@@ -0,0 +1,198 @@
+"""Image configuration. See README.md."""
+
+load("//tools:defs.bzl", "default_installer")
+
+# vm_image_builder is a rule that will construct a shell script that actually
+# generates a given VM image. Note that this does not _run_ the shell script
+# (although it can be run manually). It will be run manually during generation
+# of the vm_image target itself. This level of indirection is used so that the
+# build system itself only runs the builder once when multiple targets depend
+# on it, avoiding a set of races and conflicts.
+def _vm_image_builder_impl(ctx):
+    # Generate a binary that actually builds the image.
+    builder = ctx.actions.declare_file(ctx.label.name)
+    script_paths = []
+    for script in ctx.files.scripts:
+        script_paths.append(script.short_path)
+    builder_content = "\n".join([
+        "#!/bin/bash",
+        "export ZONE=$(%s)" % ctx.files.zone[0].short_path,
+        "export USERNAME=%s" % ctx.attr.username,
+        "export IMAGE_PROJECT=%s" % ctx.attr.project,
+        "export IMAGE_FAMILY=%s" % ctx.attr.family,
+        "%s %s" % (ctx.files._builder[0].short_path, " ".join(script_paths)),
+        "",
+    ])
+    ctx.actions.write(builder, builder_content, is_executable = True)
+
+    # Note that the scripts should only be files, and should not include any
+    # indirect transitive dependencies. The build script wouldn't work.
+    return [DefaultInfo(
+        executable = builder,
+        runfiles = ctx.runfiles(
+            files = ctx.files.scripts + ctx.files._builder + ctx.files.zone,
+        ),
+    )]
+
+vm_image_builder = rule(
+    attrs = {
+        "_builder": attr.label(
+            executable = True,
+            default = "//tools/vm:builder",
+            cfg = "host",
+        ),
+        "username": attr.string(default = "$(whoami)"),
+        "zone": attr.label(
+            executable = True,
+            default = "//tools/vm:zone",
+            cfg = "host",
+        ),
+        "family": attr.string(mandatory = True),
+        "project": attr.string(mandatory = True),
+        "scripts": attr.label_list(allow_files = True),
+    },
+    executable = True,
+    implementation = _vm_image_builder_impl,
+)
+
+# See vm_image_builder above.
+def _vm_image_impl(ctx):
+    # Run the builder to generate our output.
+    echo = ctx.actions.declare_file(ctx.label.name)
+    resolved_inputs, argv, runfiles_manifests = ctx.resolve_command(
+        command = "echo -ne \"#!/bin/bash\\necho $(%s)\\n\" > %s && chmod 0755 %s" % (
+            ctx.files.builder[0].path,
+            echo.path,
+            echo.path,
+        ),
+        tools = [ctx.attr.builder],
+    )
+    ctx.actions.run_shell(
+        tools = resolved_inputs,
+        outputs = [echo],
+        progress_message = "Building image...",
+        execution_requirements = {"local": "true"},
+        command = argv,
+        input_manifests = runfiles_manifests,
+    )
+
+    # Return just the echo command. All of the builder runfiles have been
+    # resolved and consumed in the generation of the trivial echo script.
+    return [DefaultInfo(executable = echo)]
+
+_vm_image_test = rule(
+    attrs = {
+        "builder": attr.label(
+            executable = True,
+            cfg = "host",
+        ),
+    },
+    test = True,
+    implementation = _vm_image_impl,
+)
+
+def vm_image(name, **kwargs):
+    vm_image_builder(
+        name = name + "_builder",
+        **kwargs
+    )
+    _vm_image_test(
+        name = name,
+        builder = ":" + name + "_builder",
+        tags = [
+            "local",
+            "manual",
+        ],
+    )
+
+def _vm_test_impl(ctx):
+    runner = ctx.actions.declare_file("%s-executer" % ctx.label.name)
+
+    # Note that the remote execution case must actually generate an
+    # intermediate target in order to collect all the relevant runfiles so that
+    # they can be copied over for remote execution.
+    runner_content = "\n".join([
+        "#!/bin/bash",
+        "export ZONE=$(%s)" % ctx.files.zone[0].short_path,
+        "export USERNAME=%s" % ctx.attr.username,
+        "export IMAGE=$(%s)" % ctx.files.image[0].short_path,
+        "export SUDO=%s" % "true" if ctx.attr.sudo else "false",
+        "%s %s" % (
+            ctx.executable.executer.short_path,
+            " ".join([
+                target.files_to_run.executable.short_path
+                for target in ctx.attr.targets
+            ]),
+        ),
+        "",
+    ])
+    ctx.actions.write(runner, runner_content, is_executable = True)
+
+    # Return with all transitive files.
+    runfiles = ctx.runfiles(
+        transitive_files = depset(transitive = [
+            depset(target.data_runfiles.files)
+            for target in ctx.attr.targets
+            if hasattr(target, "data_runfiles")
+        ]),
+        files = ctx.files.executer + ctx.files.zone + ctx.files.image +
+                ctx.files.targets,
+        collect_default = True,
+        collect_data = True,
+    )
+    return [DefaultInfo(executable = runner, runfiles = runfiles)]
+
+_vm_test = rule(
+    attrs = {
+        "image": attr.label(
+            executable = True,
+            default = "//tools/vm:ubuntu1804",
+            cfg = "host",
+        ),
+        "executer": attr.label(
+            executable = True,
+            default = "//tools/vm:executer",
+            cfg = "host",
+        ),
+        "username": attr.string(default = "$(whoami)"),
+        "zone": attr.label(
+            executable = True,
+            default = "//tools/vm:zone",
+            cfg = "host",
+        ),
+        "sudo": attr.bool(default = True),
+        "machine": attr.string(default = "n1-standard-1"),
+        "targets": attr.label_list(
+            mandatory = True,
+            allow_empty = False,
+            cfg = "target",
+        ),
+    },
+    test = True,
+    implementation = _vm_test_impl,
+)
+
+def vm_test(
+        installers = None,
+        **kwargs):
+    """Runs the given targets as a remote test.
+
+    Args:
+      installer: Script to run before all targets.
+      **kwargs: All test arguments. Should include targets and image.
+    """
+    targets = kwargs.pop("targets", [])
+    if installers == None:
+        installers = ["//tools/installers:head"]
+    targets = installers + targets
+    if default_installer():
+        targets = [default_installer()] + targets
+    _vm_test(
+        tags = [
+            "local",
+            "manual",
+        ],
+        targets = targets,
+        local = 1,
+        **kwargs
+    )
diff --git a/tools/vm/execute.sh b/tools/vm/execute.sh
new file mode 100755
index 000000000..1f1f3ce01
--- /dev/null
+++ b/tools/vm/execute.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeo pipefail
+
+# Required input.
+if ! [[ -v IMAGE ]]; then
+  echo "no image provided: set IMAGE."
+  exit 1
+fi
+
+# Parameters.
+declare -r USERNAME=${USERNAME:-test}
+declare -r KEYNAME=$(mktemp --tmpdir -u key-XXXXXX)
+declare -r SSHKEYS=$(mktemp --tmpdir -u sshkeys-XXXXXX)
+declare -r INSTANCE_NAME=$(mktemp -u test-XXXXXX | tr A-Z a-z)
+declare -r MACHINE=${MACHINE:-n1-standard-1}
+declare -r ZONE=${ZONE:-us-central1-f}
+declare -r SUDO=${SUDO:-false}
+
+# Standard arguments (applies only on script execution).
+declare -ar SSH_ARGS=("-o" "ConnectTimeout=60" "--")
+
+# This script is executed as a test rule, which will reset the value of HOME.
+# Unfortunately, it is needed to load the gconfig credentials. We will reset
+# HOME when we actually execute in the remote environment, defined below.
+export HOME=$(eval echo ~$(whoami))
+
+# Generate unique keys for this test.
+[[ -f "${KEYNAME}" ]] || ssh-keygen -t rsa -N "" -f "${KEYNAME}" -C "${USERNAME}"
+cat > "${SSHKEYS}" <<EOF
+${USERNAME}:$(cat ${KEYNAME}.pub)
+EOF
+
+# Start a unique instance. This means that we first generate a unique set of ssh
+# keys to ensure that only we have access to this instance. Note that we must
+# constrain ourselves to Haswell or greater in order to have nested
+# virtualization available.
+gcloud compute instances create \
+    --min-cpu-platform "Intel Haswell" \
+    --preemptible \
+    --no-scopes \
+    --metadata block-project-ssh-keys=TRUE \
+    --metadata-from-file ssh-keys="${SSHKEYS}" \
+    --machine-type "${MACHINE}" \
+    --image "${IMAGE}" \
+    --zone "${ZONE}" \
+    "${INSTANCE_NAME}"
+function cleanup {
+    gcloud compute instances delete --quiet --zone "${ZONE}" "${INSTANCE_NAME}"
+}
+trap cleanup EXIT
+
+# Wait for the instance to become available (up to 5 minutes).
+declare timeout=300
+declare success=0
+declare -r start=$(date +%s)
+declare -r end=$((${start}+${timeout}))
+while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do
+  if gcloud compute ssh --ssh-key-file="${KEYNAME}" --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then
+    success=$((${success}+1))
+  fi
+done
+if [[ "${success}" -eq "0" ]]; then
+  echo "connect timed out after ${timeout} seconds."
+  exit 1
+fi
+
+# Copy the local directory over.
+tar czf - --dereference --exclude=.git . |
+    gcloud compute ssh \
+        --ssh-key-file="${KEYNAME}" \
+        --zone "${ZONE}" \
+        "${USERNAME}"@"${INSTANCE_NAME}" -- \
+        "${SSH_ARGS[@]}" \
+        tar xzf -
+
+# Execute the command remotely.
+for cmd; do
+  # Setup relevant environment.
+  #
+  # N.B. This is not a complete test environment, but is complete enough to
+  # provide rudimentary sharding and test output support.
+  declare -a PREFIX=( "env" )
+  if [[ -v TEST_SHARD_INDEX ]]; then
+    PREFIX+=( "TEST_SHARD_INDEX=${TEST_SHARD_INDEX}" )
+  fi
+  if [[ -v TEST_SHARD_STATUS_FILE ]]; then
+    SHARD_STATUS_FILE=$(mktemp -u test-shard-status-XXXXXX)
+    PREFIX+=( "TEST_SHARD_STATUS_FILE=/tmp/${SHARD_STATUS_FILE}" )
+  fi
+  if [[ -v TEST_TOTAL_SHARDS ]]; then
+    PREFIX+=( "TEST_TOTAL_SHARDS=${TEST_TOTAL_SHARDS}" )
+  fi
+  if [[ -v TEST_TMPDIR ]]; then
+    REMOTE_TMPDIR=$(mktemp -u test-XXXXXX)
+    PREFIX+=( "TEST_TMPDIR=/tmp/${REMOTE_TMPDIR}" )
+    # Create remotely.
+    gcloud compute ssh \
+      --ssh-key-file="${KEYNAME}" \
+      --zone "${ZONE}" \
+      "${USERNAME}"@"${INSTANCE_NAME}" -- \
+      "${SSH_ARGS[@]}" \
+      mkdir -p "/tmp/${REMOTE_TMPDIR}"
+  fi
+  if [[ -v XML_OUTPUT_FILE ]]; then
+    TEST_XML_OUTPUT=$(mktemp -u xml-output-XXXXXX)
+    PREFIX+=( "XML_OUTPUT_FILE=/tmp/${TEST_XML_OUTPUT}" )
+  fi
+  if [[ "${SUDO}" == "true" ]]; then
+    PREFIX+=( "sudo" "-E" )
+  fi
+
+  # Execute the command.
+  gcloud compute ssh \
+    --ssh-key-file="${KEYNAME}" \
+    --zone "${ZONE}" \
+    "${USERNAME}"@"${INSTANCE_NAME}" -- \
+    "${SSH_ARGS[@]}" \
+    "${PREFIX[@]}" "${cmd}"
+
+  # Collect relevant results.
+  if [[ -v TEST_SHARD_STATUS_FILE ]]; then
+    gcloud compute scp \
+        --ssh-key-file="${KEYNAME}" \
+        --zone "${ZONE}" \
+        "${USERNAME}"@"${INSTANCE_NAME}":/tmp/"${SHARD_STATUS_FILE}" \
+        "${TEST_SHARD_STATUS_FILE}" 2>/dev/null || true # Allowed to fail.
+  fi
+  if [[ -v XML_OUTPUT_FILE ]]; then
+    gcloud compute scp \
+        --ssh-key-file="${KEYNAME}" \
+        --zone "${ZONE}" \
+        "${USERNAME}"@"${INSTANCE_NAME}":/tmp/"${TEST_XML_OUTPUT}" \
+        "${XML_OUTPUT_FILE}" 2>/dev/null || true # Allowed to fail.
+  fi
+
+  # Clean up the temporary directory.
+  if [[ -v TEST_TMPDIR ]]; then
+    gcloud compute ssh \
+      --ssh-key-file="${KEYNAME}" \
+      --zone "${ZONE}" \
+      "${USERNAME}"@"${INSTANCE_NAME}" -- \
+      "${SSH_ARGS[@]}" \
+      rm -rf "/tmp/${REMOTE_TMPDIR}"
+  fi
+done
diff --git a/tools/vm/test.cc b/tools/vm/test.cc
new file mode 100644
index 000000000..c0ceacda1
--- /dev/null
+++ b/tools/vm/test.cc
@@ -0,0 +1,27 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+
+namespace {
+
+TEST(Image, Sanity0) {
+  // Do nothing (in shard 0).
+}
+
+TEST(Image, Sanity1) {
+  // Do nothing (in shard 1).
+}
+
+}  // namespace
diff --git a/tools/vm/ubuntu1604/10_core.sh b/tools/vm/ubuntu1604/10_core.sh
new file mode 100755
index 000000000..cd518d6ac
--- /dev/null
+++ b/tools/vm/ubuntu1604/10_core.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeo pipefail
+
+# Install all essential build tools.
+while true; do
+  if (apt-get update && apt-get install -y \
+      make \
+      git-core \
+      build-essential \
+      linux-headers-$(uname -r) \
+      pkg-config); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
+
+# Install a recent go toolchain.
+if ! [[ -d /usr/local/go ]]; then
+    wget https://dl.google.com/go/go1.13.5.linux-amd64.tar.gz
+    tar -xvf go1.13.5.linux-amd64.tar.gz
+    mv go /usr/local
+fi
+
+# Link the Go binary from /usr/bin; replacing anything there.
+(cd /usr/bin && rm -f go && sudo ln -fs /usr/local/go/bin/go go)
diff --git a/tools/vm/ubuntu1604/20_bazel.sh b/tools/vm/ubuntu1604/20_bazel.sh
new file mode 100755
index 000000000..bb7afa676
--- /dev/null
+++ b/tools/vm/ubuntu1604/20_bazel.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeo pipefail
+
+declare -r BAZEL_VERSION=2.0.0
+
+# Install bazel dependencies.
+while true; do
+  if (apt-get update && apt-get install -y \
+      openjdk-8-jdk-headless \
+      unzip); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
+
+# Use the release installer.
+curl -L -o bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
+chmod a+x bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
+./bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
+rm -f bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
diff --git a/tools/vm/ubuntu1604/25_docker.sh b/tools/vm/ubuntu1604/25_docker.sh
new file mode 100755
index 000000000..11eea2d72
--- /dev/null
+++ b/tools/vm/ubuntu1604/25_docker.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Add dependencies.
+while true; do
+  if (apt-get update && apt-get install -y \
+      apt-transport-https \
+      ca-certificates \
+      curl \
+      gnupg-agent \
+      software-properties-common); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
+
+# Install the key.
+curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
+
+# Add the repository.
+add-apt-repository \
+   "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
+   $(lsb_release -cs) \
+   stable"
+
+# Install docker.
+while true; do
+  if (apt-get update && apt-get install -y \
+      docker-ce \
+      docker-ce-cli \
+      containerd.io); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
diff --git a/tools/vm/ubuntu1604/30_containerd.sh b/tools/vm/ubuntu1604/30_containerd.sh
new file mode 100755
index 000000000..fb3699c12
--- /dev/null
+++ b/tools/vm/ubuntu1604/30_containerd.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeo pipefail
+
+# Helper for Go packages below.
+install_helper() {
+  PACKAGE="${1}"
+  TAG="${2}"
+  GOPATH="${3}"
+
+  # Clone the repository.
+  mkdir -p "${GOPATH}"/src/$(dirname "${PACKAGE}") && \
+     git clone https://"${PACKAGE}" "${GOPATH}"/src/"${PACKAGE}"
+
+  # Checkout and build the repository.
+  (cd "${GOPATH}"/src/"${PACKAGE}" && \
+      git checkout "${TAG}" && \
+      GOPATH="${GOPATH}" make && \
+      GOPATH="${GOPATH}" make install)
+}
+
+# Install dependencies for the crictl tests.
+while true; do
+  if (apt-get update && apt-get install -y \
+      btrfs-tools \
+      libseccomp-dev); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
+
+# Install containerd & cri-tools.
+GOPATH=$(mktemp -d --tmpdir gopathXXXXX)
+install_helper github.com/containerd/containerd v1.2.2 "${GOPATH}"
+install_helper github.com/kubernetes-sigs/cri-tools v1.11.0 "${GOPATH}"
+
+# Install gvisor-containerd-shim.
+declare -r base="https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim"
+declare -r latest=$(mktemp --tmpdir gvisor-containerd-shim-latest.XXXXXX)
+declare -r shim_path=$(mktemp --tmpdir gvisor-containerd-shim.XXXXXX)
+wget --no-verbose "${base}"/latest -O ${latest}
+wget --no-verbose "${base}"/gvisor-containerd-shim-$(cat ${latest}) -O ${shim_path}
+chmod +x ${shim_path}
+mv ${shim_path} /usr/local/bin
+
+# Configure containerd-shim.
+declare -r shim_config_path=/etc/containerd
+declare -r shim_config_tmp_path=$(mktemp --tmpdir gvisor-containerd-shim.XXXXXX.toml)
+mkdir -p ${shim_config_path}
+cat > ${shim_config_tmp_path} <<-EOF
+    runc_shim = "/usr/local/bin/containerd-shim"
+
+[runsc_config]
+    debug = "true"
+    debug-log = "/tmp/runsc-logs/"
+    strace = "true"
+    file-access = "shared"
+EOF
+mv ${shim_config_tmp_path} ${shim_config_path}
+
+# Configure CNI.
+(cd "${GOPATH}" && GOPATH="${GOPATH}" \
+    src/github.com/containerd/containerd/script/setup/install-cni)
+
+# Cleanup the above.
+rm -rf "${GOPATH}"
+rm -rf "${latest}"
+rm -rf "${shim_path}"
+rm -rf "${shim_config_tmp_path}"
diff --git a/tools/vm/ubuntu1604/40_kokoro.sh b/tools/vm/ubuntu1604/40_kokoro.sh
new file mode 100755
index 000000000..06a1e6c48
--- /dev/null
+++ b/tools/vm/ubuntu1604/40_kokoro.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeo pipefail
+
+# Declare kokoro's required public keys.
+declare -r ssh_public_keys=(
+    "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDg7L/ZaEauETWrPklUTky3kvxqQfe2Ax/2CsSqhNIGNMnK/8d79CHlmY9+dE1FFQ/RzKNCaltgy7XcN/fCYiCZr5jm2ZtnLuGNOTzupMNhaYiPL419qmL+5rZXt4/dWTrsHbFRACxT8j51PcRMO5wgbL0Bg2XXimbx8kDFaurL2gqduQYqlu4lxWCaJqOL71WogcimeL63Nq/yeH5PJPWpqE4P9VUQSwAzBWFK/hLeds/AiP3MgVS65qHBnhq0JsHy8JQsqjZbG7Iidt/Ll0+gqzEbi62gDIcczG4KC0iOVzDDP/1BxDtt1lKeA23ll769Fcm3rJyoBMYxjvdw1TDx sabujp@trigger.mtv.corp.google.com"
+    "ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBNgGK/hCdjmulHfRE3hp4rZs38NCR8yAh0eDsztxqGcuXnuSnL7jOlRrbcQpremJ84omD4eKrIpwJUs+YokMdv4= sabujp@trigger.svl.corp.google.com"
+)
+
+# Install dependencies.
+while true; do
+  if (apt-get update && apt-get install -y \
+      rsync \
+      coreutils \
+      python-psutil \
+      qemu-kvm \
+      python-pip \
+      python3-pip \
+      zip); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
+
+# junitparser is used to merge junit xml files.
+pip install junitparser
+
+# We need a kbuilder user.
+if useradd -c "kbuilder user" -m -s /bin/bash kbuilder; then
+    # User was added successfully; we add the relevant SSH keys here.
+    mkdir -p ~kbuilder/.ssh
+    (IFS=$'\n'; echo "${ssh_public_keys[*]}") > ~kbuilder/.ssh/authorized_keys
+    chmod 0600 ~kbuilder/.ssh/authorized_keys
+    chown -R kbuilder ~kbuilder/.ssh
+fi
+
+# Give passwordless sudo access.
+cat > /etc/sudoers.d/kokoro <<EOF
+kbuilder ALL=(ALL) NOPASSWD:ALL
+EOF
+
+# Ensure we can run Docker without sudo.
+usermod -aG docker kbuilder
+
+# Ensure that we can access kvm.
+usermod -aG kvm kbuilder
+
+# Ensure that /tmpfs exists and is writable by kokoro.
+#
+# Note that kokoro will typically attach a second disk (sdb) to the instance
+# that is used for the /tmpfs volume. In the future we could setup an init
+# script that formats and mounts this here; however, we don't expect our build
+# artifacts to be that large.
+mkdir -p /tmpfs && chmod 0777 /tmpfs && touch /tmpfs/READY
diff --git a/tools/vm/ubuntu1604/BUILD b/tools/vm/ubuntu1604/BUILD
new file mode 100644
index 000000000..ab1df0c4c
--- /dev/null
+++ b/tools/vm/ubuntu1604/BUILD
@@ -0,0 +1,7 @@
+package(licenses = ["notice"])
+
+filegroup(
+    name = "ubuntu1604",
+    srcs = glob(["*.sh"]),
+    visibility = ["//:sandbox"],
+)
diff --git a/tools/vm/ubuntu1804/BUILD b/tools/vm/ubuntu1804/BUILD
new file mode 100644
index 000000000..0c8856dde
--- /dev/null
+++ b/tools/vm/ubuntu1804/BUILD
@@ -0,0 +1,7 @@
+package(licenses = ["notice"])
+
+alias(
+    name = "ubuntu1804",
+    actual = "//tools/vm/ubuntu1604",
+    visibility = ["//:sandbox"],
+)
diff --git a/tools/vm/zone.sh b/tools/vm/zone.sh
new file mode 100755
index 000000000..79569fb19
--- /dev/null
+++ b/tools/vm/zone.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+exec gcloud config get-value compute/zone
-- 
cgit v1.2.3


From 782041509f4130e8e795b22379368239d5091c8f Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Mon, 20 Apr 2020 16:31:39 -0700
Subject: Prevent race when reassigning CancellableTimer

Capture a timer's locker for each instance of a CancellableTimer so that
reassigning a tcpip.CancellableTimer does not cause a data race.

Reassigning a tcpip.CancellableTimer updates its underlying locker. When
a timer fires, it does a read of the timer's locker variable to lock it.
This read of the locker was not synchronized so a race existed where one
goroutine may reassign the timer (updating the locker) and another
handles the timer firing (attempts to lock the timer's locker).

Test: tcpip_test.TestCancellableTimerReassignment
PiperOrigin-RevId: 307499822
---
 pkg/tcpip/timer.go      |  8 ++++++--
 pkg/tcpip/timer_test.go | 25 +++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/pkg/tcpip/timer.go b/pkg/tcpip/timer.go
index f5f01f32f..67f66fc72 100644
--- a/pkg/tcpip/timer.go
+++ b/pkg/tcpip/timer.go
@@ -131,10 +131,14 @@ func (t *CancellableTimer) StopLocked() {
 func (t *CancellableTimer) Reset(d time.Duration) {
 	// Create a new instance.
 	earlyReturn := false
+
+	// Capture the locker so that updating the timer does not cause a data race
+	// when a timer fires and tries to obtain the lock (read the timer's locker).
+	locker := t.locker
 	t.instance = cancellableTimerInstance{
 		timer: time.AfterFunc(d, func() {
-			t.locker.Lock()
-			defer t.locker.Unlock()
+			locker.Lock()
+			defer locker.Unlock()
 
 			if earlyReturn {
 				// If we reach this point, it means that the timer fired while another
diff --git a/pkg/tcpip/timer_test.go b/pkg/tcpip/timer_test.go
index 2d20f7ef3..730134906 100644
--- a/pkg/tcpip/timer_test.go
+++ b/pkg/tcpip/timer_test.go
@@ -28,6 +28,31 @@ const (
 	longDuration   = 1 * time.Second
 )
 
+func TestCancellableTimerReassignment(t *testing.T) {
+	var timer tcpip.CancellableTimer
+	var wg sync.WaitGroup
+	var lock sync.Mutex
+
+	for i := 0; i < 2; i++ {
+		wg.Add(1)
+
+		go func() {
+			lock.Lock()
+			// Assigning a new timer value updates the timer's locker and function.
+			// This test makes sure there is no data race when reassigning a timer
+			// that has an active timer (even if it has been stopped as a stopped
+			// timer may be blocked on a lock before it can check if it has been
+			// stopped while another goroutine holds the same lock).
+			timer = tcpip.MakeCancellableTimer(&lock, func() {
+				wg.Done()
+			})
+			timer.Reset(shortDuration)
+			lock.Unlock()
+		}()
+	}
+	wg.Wait()
+}
+
 func TestCancellableTimerFire(t *testing.T) {
 	t.Parallel()
 
-- 
cgit v1.2.3


From c615aafa219e8d9783b9c9a25252e4973de57d4a Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 20 Apr 2020 20:57:02 -0700
Subject: Add internal nogo analysis & checkescape tool.

See tools/nogo/README.md.

The checkescape tool is able to perform recursive escape analysis, using the
actual generated binary to confirm the results produced by the compiler itself.

As an initial use case, this replaces the manual escape analysis tests used for
go_marshal, and validates that the CopyIn and CopyOut paths will not require
any allocation or stack splits.

Updates #2243

PiperOrigin-RevId: 307532986
---
 BUILD                                              |  33 -
 WORKSPACE                                          |  33 +-
 pkg/metric/metric.go                               |   2 +-
 tools/BUILD                                        |   2 -
 tools/bazeldefs/defs.bzl                           |  41 +-
 tools/checkescape/BUILD                            |  16 +
 tools/checkescape/checkescape.go                   | 726 +++++++++++++++++++++
 tools/checkescape/test1/BUILD                      |   9 +
 tools/checkescape/test1/test1.go                   | 195 ++++++
 tools/checkescape/test2/BUILD                      |   9 +
 tools/checkescape/test2/test2.go                   |  94 +++
 tools/checkunsafe/BUILD                            |   7 +-
 tools/defs.bzl                                     |  12 +-
 .../generator_interfaces_array_newtype.go          |   9 +-
 .../generator_interfaces_primitive_newtype.go      |  15 +-
 .../gomarshal/generator_interfaces_struct.go       |  20 +-
 tools/go_marshal/test/BUILD                        |  15 +-
 tools/go_marshal/test/escape.go                    | 114 ----
 tools/go_marshal/test/escape/BUILD                 |  14 +
 tools/go_marshal/test/escape/escape.go             |  95 +++
 tools/nogo.json                                    |  39 --
 tools/nogo/BUILD                                   |  49 ++
 tools/nogo/README.md                               |  31 +
 tools/nogo/build.go                                |  36 +
 tools/nogo/check/BUILD                             |  12 +
 tools/nogo/check/main.go                           |  24 +
 tools/nogo/config.go                               | 113 ++++
 tools/nogo/data/BUILD                              |  10 +
 tools/nogo/data/data.go                            |  21 +
 tools/nogo/defs.bzl                                | 172 +++++
 tools/nogo/io_bazel_rules_go-visibility.patch      |  25 +
 tools/nogo/matchers.go                             | 138 ++++
 tools/nogo/nogo.go                                 | 316 +++++++++
 tools/nogo/register.go                             |  64 ++
 34 files changed, 2269 insertions(+), 242 deletions(-)
 create mode 100644 tools/checkescape/BUILD
 create mode 100644 tools/checkescape/checkescape.go
 create mode 100644 tools/checkescape/test1/BUILD
 create mode 100644 tools/checkescape/test1/test1.go
 create mode 100644 tools/checkescape/test2/BUILD
 create mode 100644 tools/checkescape/test2/test2.go
 delete mode 100644 tools/go_marshal/test/escape.go
 create mode 100644 tools/go_marshal/test/escape/BUILD
 create mode 100644 tools/go_marshal/test/escape/escape.go
 delete mode 100644 tools/nogo.json
 create mode 100644 tools/nogo/BUILD
 create mode 100644 tools/nogo/README.md
 create mode 100644 tools/nogo/build.go
 create mode 100644 tools/nogo/check/BUILD
 create mode 100644 tools/nogo/check/main.go
 create mode 100644 tools/nogo/config.go
 create mode 100644 tools/nogo/data/BUILD
 create mode 100644 tools/nogo/data/data.go
 create mode 100644 tools/nogo/defs.bzl
 create mode 100644 tools/nogo/io_bazel_rules_go-visibility.patch
 create mode 100644 tools/nogo/matchers.go
 create mode 100644 tools/nogo/nogo.go
 create mode 100644 tools/nogo/register.go

diff --git a/BUILD b/BUILD
index a709a9816..c010e2131 100644
--- a/BUILD
+++ b/BUILD
@@ -44,39 +44,6 @@ go_path(
 #   bazel run //:gazelle -- update-repos -from_file=go.mod
 gazelle(name = "gazelle")
 
-# nogo applies checks to all Go source in this repository, enforcing code
-# guidelines and restrictions. Note that the tool libraries themselves should
-# live in the tools subdirectory (unless they are standard).
-nogo(
-    name = "nogo",
-    config = "//tools:nogo.json",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tools/checkunsafe",
-        "@org_golang_x_tools//go/analysis/passes/asmdecl:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/assign:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/atomic:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/atomicalign:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/bools:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/buildtag:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/cgocall:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/copylock:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/deepequalerrors:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/loopclosure:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/lostcancel:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/nilfunc:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/nilness:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/printf:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/shift:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/stdmethods:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/structtag:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/tests:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/unmarshal:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/unsafeptr:go_tool_library",
-        "@org_golang_x_tools//go/analysis/passes/unusedresult:go_tool_library",
-    ],
-)
-
 # We need to define a bazel platform and toolchain to specify dockerPrivileged
 # and dockerRunAsRoot options, they are required to run tests on the RBE
 # cluster in Kokoro.
diff --git a/WORKSPACE b/WORKSPACE
index c40e03ad2..b895647fb 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -2,8 +2,16 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
 
 # Load go bazel rules and gazelle.
+#
+# Note that this repository actually patches some other Go repositories as it
+# loads it, in order to limit visibility. We hack this process by patching the
+# patch used by the Go rules, turning the trick against itself.
 http_archive(
     name = "io_bazel_rules_go",
+    patch_args = ["-p1"],
+    patches = [
+        "//tools/nogo:io_bazel_rules_go-visibility.patch",
+    ],
     sha256 = "db2b2d35293f405430f553bc7a865a8749a8ef60c30287e90d2b278c32771afe",
     urls = [
         "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.22.3/rules_go-v0.22.3.tar.gz",
@@ -24,10 +32,7 @@ load("@io_bazel_rules_go//go:deps.bzl", "go_register_toolchains", "go_rules_depe
 
 go_rules_dependencies()
 
-go_register_toolchains(
-    go_version = "1.14.2",
-    nogo = "@//:nogo",
-)
+go_register_toolchains(go_version = "1.14.2")
 
 load("@bazel_gazelle//:deps.bzl", "gazelle_dependencies", "go_repository")
 
@@ -170,9 +175,13 @@ http_archive(
         "https://github.com/grpc/grpc/archive/v1.26.0.tar.gz",
     ],
 )
+
 load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
+
 grpc_deps()
+
 load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
+
 grpc_extra_deps()
 
 # External repositories, in sorted order.
@@ -221,8 +230,8 @@ go_repository(
 go_repository(
     name = "com_github_imdario_mergo",
     importpath = "github.com/imdario/mergo",
-    version = "v0.3.8",
     sum = "h1:CGgOkSJeqMRmt0D9XLWExdT4m4F1vd3FV3VPt+0VxkQ=",
+    version = "v0.3.8",
 )
 
 go_repository(
@@ -248,8 +257,8 @@ go_repository(
 
 go_repository(
     name = "com_github_mohae_deepcopy",
-    importpath = "github.com/mohae/deepcopy",
     commit = "c48cc78d482608239f6c4c92a4abd87eb8761c90",
+    importpath = "github.com/mohae/deepcopy",
 )
 
 go_repository(
@@ -298,8 +307,8 @@ go_repository(
 go_repository(
     name = "org_golang_x_crypto",
     importpath = "golang.org/x/crypto",
-    sum = "h1:ObdrDkeb4kJdCP557AjRjq69pTHfNouLtWZG7j9rPN8=",
-    version = "v0.0.0-20191011191535-87dc89f01550",
+    sum = "h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M=",
+    version = "v0.0.0-20190308221718-c2843e01d9a2",
 )
 
 go_repository(
@@ -340,15 +349,15 @@ go_repository(
 go_repository(
     name = "org_golang_x_tools",
     importpath = "golang.org/x/tools",
-    sum = "h1:aZzprAO9/8oim3qStq3wc1Xuxx4QmAGriC4VU4ojemQ=",
-    version = "v0.0.0-20191119224855-298f0cb1881e",
+    sum = "h1:Uglradbb4KfUWaYasZhlsDsGRwHHvRsHoNAEONef0W8=",
+    version = "v0.0.0-20200131233409-575de47986ce",
 )
 
 go_repository(
     name = "org_golang_x_xerrors",
     importpath = "golang.org/x/xerrors",
-    sum = "h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=",
-    version = "v0.0.0-20191204190536-9bdfabe68543",
+    sum = "h1:9zdDQZ7Thm29KFXgAX/+yaf3eVbP7djjWp/dXAppNCc=",
+    version = "v0.0.0-20190717185122-a985d3407aa7",
 )
 
 go_repository(
diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go
index 006fcd9ab..895253625 100644
--- a/pkg/metric/metric.go
+++ b/pkg/metric/metric.go
@@ -244,6 +244,6 @@ func EmitMetricUpdate() {
 		return
 	}
 
-	log.Debugf("Emitting metrics: %v", m)
+	log.Debugf("Emitting metrics: %v", &m)
 	eventchannel.Emit(&m)
 }
diff --git a/tools/BUILD b/tools/BUILD
index ba3506c04..34b950644 100644
--- a/tools/BUILD
+++ b/tools/BUILD
@@ -1,3 +1 @@
 package(licenses = ["notice"])
-
-exports_files(["nogo.json"])
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index 0a74370a6..2207b9b34 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -1,7 +1,7 @@
 """Bazel implementations of standard rules."""
 
 load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", _cc_flags_supplier = "cc_flags_supplier")
-load("@io_bazel_rules_go//go:def.bzl", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_test = "go_test", _go_tool_library = "go_tool_library")
+load("@io_bazel_rules_go//go:def.bzl", "GoLibrary", _go_binary = "go_binary", _go_context = "go_context", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_test = "go_test")
 load("@io_bazel_rules_go//proto:def.bzl", _go_grpc_library = "go_grpc_library", _go_proto_library = "go_proto_library")
 load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test")
 load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar")
@@ -99,6 +99,10 @@ def go_binary(name, static = False, pure = False, **kwargs):
         **kwargs
     )
 
+def go_importpath(target):
+    """Returns the importpath for the target."""
+    return target[GoLibrary].importpath
+
 def go_library(name, **kwargs):
     _go_library(
         name = name,
@@ -106,13 +110,6 @@ def go_library(name, **kwargs):
         **kwargs
     )
 
-def go_tool_library(name, **kwargs):
-    _go_tool_library(
-        name = name,
-        importpath = "gvisor.dev/gvisor/" + native.package_name(),
-        **kwargs
-    )
-
 def go_test(name, pure = False, library = None, **kwargs):
     """Build a go test.
 
@@ -131,6 +128,34 @@ def go_test(name, pure = False, library = None, **kwargs):
         **kwargs
     )
 
+def go_rule(rule, implementation, **kwargs):
+    """Wraps a rule definition with Go attributes.
+
+    Args:
+      rule: rule function (typically rule or aspect).
+      implementation: implementation function.
+      **kwargs: other arguments to pass to rule.
+
+    Returns:
+        The result of invoking the rule.
+    """
+    attrs = kwargs.pop("attrs", [])
+    attrs["_go_context_data"] = attr.label(default = "@io_bazel_rules_go//:go_context_data")
+    attrs["_stdlib"] = attr.label(default = "@io_bazel_rules_go//:stdlib")
+    toolchains = kwargs.get("toolchains", []) + ["@io_bazel_rules_go//go:toolchain"]
+    return rule(implementation, attrs = attrs, toolchains = toolchains, **kwargs)
+
+def go_context(ctx):
+    go_ctx = _go_context(ctx)
+    return struct(
+        go = go_ctx.go,
+        env = go_ctx.env,
+        runfiles = depset([go_ctx.go] + go_ctx.sdk.tools + go_ctx.stdlib.libs),
+        goos = go_ctx.sdk.goos,
+        goarch = go_ctx.sdk.goarch,
+        tags = go_ctx.tags,
+    )
+
 def py_requirement(name, direct = True):
     return _py_requirement(name)
 
diff --git a/tools/checkescape/BUILD b/tools/checkescape/BUILD
new file mode 100644
index 000000000..b8c3ddf44
--- /dev/null
+++ b/tools/checkescape/BUILD
@@ -0,0 +1,16 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "checkescape",
+    srcs = ["checkescape.go"],
+    nogo = False,
+    visibility = ["//tools/nogo:__subpackages__"],
+    deps = [
+        "//tools/nogo/data",
+        "@org_golang_x_tools//go/analysis:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/buildssa:go_tool_library",
+        "@org_golang_x_tools//go/ssa:go_tool_library",
+    ],
+)
diff --git a/tools/checkescape/checkescape.go b/tools/checkescape/checkescape.go
new file mode 100644
index 000000000..571e9a6e6
--- /dev/null
+++ b/tools/checkescape/checkescape.go
@@ -0,0 +1,726 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package checkescape allows recursive escape analysis for hot paths.
+//
+// The analysis tracks multiple types of escapes, in two categories. First,
+// 'hard' escapes are explicit allocations. Second, 'soft' escapes are
+// interface dispatches or dynamic function dispatches; these don't necessarily
+// escape but they *may* escape. The analysis is capable of making assertions
+// recursively: soft escapes cannot be analyzed in this way, and therefore
+// count as escapes for recursive purposes.
+//
+// The different types of escapes are as follows, with the category in
+// parentheses:
+//
+// 	heap:      A direct allocation is made on the heap (hard).
+// 	builtin:   A call is made to a built-in allocation function (hard).
+// 	stack:     A stack split as part of a function preamble (soft).
+// 	interface: A call is made via an interface whicy *may* escape (soft).
+// 	dynamic:   A dynamic function is dispatched which *may* escape (soft).
+//
+// To the use the package, annotate a function-level comment with either the
+// line "// +checkescape" or "// +checkescape:OPTION[,OPTION]". In the second
+// case, the OPTION field is either a type above, or one of:
+//
+//	local: Escape analysis is limited to local hard escapes only.
+//	all: All the escapes are included.
+//	hard: All hard escapes are included.
+//
+// If the "// +checkescape" annotation is provided, this is equivalent to
+// provided the local and hard options.
+//
+// Some examples of this syntax are:
+//
+// +checkescape:all               - Analyzes for all escapes in this function and all calls.
+// +checkescape:local             - Analyzes only for default local hard escapes.
+// +checkescape:heap              - Only analyzes for heap escapes.
+// +checkescape:interface,dynamic - Only checks for dynamic calls and interface calls.
+// +checkescape                   - Does the same as +checkescape:local,hard.
+//
+// Note that all of the above can be inverted by using +mustescape. The
+// +checkescape keyword will ensure failure if the class of escape occurs,
+// whereas +mustescape will fail if the given class of escape does not occur.
+//
+// Local exemptions can be made by a comment of the form "// escapes: reason."
+// This must appear on the line of the escape and will also apply to callers of
+// the function as well (for non-local escape analysis).
+package checkescape
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"go/ast"
+	"go/token"
+	"go/types"
+	"io"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"golang.org/x/tools/go/analysis"
+	"golang.org/x/tools/go/analysis/passes/buildssa"
+	"golang.org/x/tools/go/ssa"
+	"gvisor.dev/gvisor/tools/nogo/data"
+)
+
+const (
+	// magic is the magic annotation.
+	magic = "// +checkescape"
+
+	// magicParams is the magic annotation with specific parameters.
+	magicParams = magic + ":"
+
+	// testMagic is the test magic annotation (parameters required).
+	testMagic = "// +mustescape:"
+
+	// exempt is the exemption annotation.
+	exempt = "// escapes:"
+)
+
+// escapingBuiltins are builtins known to escape.
+//
+// These are lowered at an earlier stage of compilation to explicit function
+// calls, but are not available for recursive analysis.
+var escapingBuiltins = []string{
+	"append",
+	"makemap",
+	"newobject",
+	"mallocgc",
+}
+
+// Analyzer defines the entrypoint.
+var Analyzer = &analysis.Analyzer{
+	Name:      "checkescape",
+	Doc:       "surfaces recursive escape analysis results",
+	Run:       run,
+	Requires:  []*analysis.Analyzer{buildssa.Analyzer},
+	FactTypes: []analysis.Fact{(*packageEscapeFacts)(nil)},
+}
+
+// packageEscapeFacts is the set of all functions in a package, and whether or
+// not they recursively pass escape analysis.
+//
+// All the type names for receivers are encoded in the full key. The key
+// represents the fully qualified package and type name used at link time.
+type packageEscapeFacts struct {
+	Funcs map[string][]Escape
+}
+
+// AFact implements analysis.Fact.AFact.
+func (*packageEscapeFacts) AFact() {}
+
+// CallSite is a single call site.
+//
+// These can be chained.
+type CallSite struct {
+	LocalPos token.Pos
+	Resolved LinePosition
+}
+
+// Escape is a single escape instance.
+type Escape struct {
+	Reason EscapeReason
+	Detail string
+	Chain  []CallSite
+}
+
+// LinePosition is a low-resolution token.Position.
+//
+// This is used to match against possible exemptions placed in the source.
+type LinePosition struct {
+	Filename string
+	Line     int
+}
+
+// String implements fmt.Stringer.String.
+func (e *LinePosition) String() string {
+	return fmt.Sprintf("%s:%d", e.Filename, e.Line)
+}
+
+// String implements fmt.Stringer.String.
+//
+// Note that this string will contain new lines.
+func (e *Escape) String() string {
+	var b bytes.Buffer
+	fmt.Fprintf(&b, "%s", e.Reason.String())
+	for i, cs := range e.Chain {
+		if i == len(e.Chain)-1 {
+			fmt.Fprintf(&b, "\n @ %s → %s", cs.Resolved.String(), e.Detail)
+		} else {
+			fmt.Fprintf(&b, "\n + %s", cs.Resolved.String())
+		}
+	}
+	return b.String()
+}
+
+// EscapeReason is an escape reason.
+//
+// This is a simple enum.
+type EscapeReason int
+
+const (
+	interfaceInvoke EscapeReason = iota
+	unknownPackage
+	allocation
+	builtin
+	dynamicCall
+	stackSplit
+	reasonCount // Count for below.
+)
+
+// String returns the string for the EscapeReason.
+//
+// Note that this also implicitly defines the reverse string -> EscapeReason
+// mapping, which is the word before the colon (computed below).
+func (e EscapeReason) String() string {
+	switch e {
+	case interfaceInvoke:
+		return "interface: function invocation via interface"
+	case unknownPackage:
+		return "unknown: no package information available"
+	case allocation:
+		return "heap: call to runtime heap allocation"
+	case builtin:
+		return "builtin: call to runtime builtin"
+	case dynamicCall:
+		return "dynamic: call via dynamic function"
+	case stackSplit:
+		return "stack: stack split on function entry"
+	default:
+		panic(fmt.Sprintf("unknown reason: %d", e))
+	}
+}
+
+var hardReasons = []EscapeReason{
+	allocation,
+	builtin,
+}
+
+var softReasons = []EscapeReason{
+	interfaceInvoke,
+	unknownPackage,
+	dynamicCall,
+	stackSplit,
+}
+
+var allReasons = append(hardReasons, softReasons...)
+
+var escapeTypes = func() map[string]EscapeReason {
+	result := make(map[string]EscapeReason)
+	for _, r := range allReasons {
+		parts := strings.Split(r.String(), ":")
+		result[parts[0]] = r // Key before ':'.
+	}
+	return result
+}()
+
+// EscapeCount counts escapes.
+//
+// It is used to avoid accumulating too many escapes for the same reason, for
+// the same function. We limit each class to 3 instances (arbitrarily).
+type EscapeCount struct {
+	byReason [reasonCount]uint32
+}
+
+// maxRecordsPerReason is the number of explicit records.
+//
+// See EscapeCount (and usage), and Record implementation.
+const maxRecordsPerReason = 5
+
+// Record records the reason or returns false if it should not be added.
+func (ec *EscapeCount) Record(reason EscapeReason) bool {
+	ec.byReason[reason]++
+	if ec.byReason[reason] > maxRecordsPerReason {
+		return false
+	}
+	return true
+}
+
+// loadObjdump reads the objdump output.
+//
+// This records if there is a call any function for every source line. It is
+// used only to remove false positives for escape analysis. The call will be
+// elided if escape analysis is able to put the object on the heap exclusively.
+func loadObjdump() (map[LinePosition]string, error) {
+	f, err := os.Open(data.Objdump)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	// Build the map.
+	m := make(map[LinePosition]string)
+	r := bufio.NewReader(f)
+	var (
+		lastField string
+		lastPos   LinePosition
+	)
+	for {
+		line, err := r.ReadString('\n')
+		if err != nil && err != io.EOF {
+			return nil, err
+		}
+
+		// We recognize lines corresponding to actual code (not the
+		// symbol name or other metadata) and annotate them if they
+		// correspond to an explicit CALL instruction. We assume that
+		// the lack of a CALL for a given line is evidence that escape
+		// analysis has eliminated an allocation.
+		//
+		// Lines look like this (including the first space):
+		//  gohacks_unsafe.go:33  0xa39                   488b442408              MOVQ 0x8(SP), AX
+		if len(line) > 0 && line[0] == ' ' {
+			fields := strings.Fields(line)
+			if !strings.Contains(fields[3], "CALL") {
+				continue
+			}
+
+			// Ignore strings containing duffzero, which is just
+			// used by stack allocations for types that are large
+			// enough to warrant Duff's device.
+			if strings.Contains(line, "runtime.duffzero") {
+				continue
+			}
+
+			// Ignore the racefuncenter call, which is used for
+			// race builds. This does not escape.
+			if strings.Contains(line, "runtime.racefuncenter") {
+				continue
+			}
+
+			// Calculate the filename and line. Note that per the
+			// example above, the filename is not a fully qualified
+			// base, just the basename (what we require).
+			if fields[0] != lastField {
+				parts := strings.SplitN(fields[0], ":", 2)
+				lineNum, err := strconv.ParseInt(parts[1], 10, 64)
+				if err != nil {
+					return nil, err
+				}
+				lastPos = LinePosition{
+					Filename: parts[0],
+					Line:     int(lineNum),
+				}
+				lastField = fields[0]
+			}
+			if _, ok := m[lastPos]; ok {
+				continue // Already marked.
+			}
+
+			// Save the actual call for the detail.
+			m[lastPos] = strings.Join(fields[3:], " ")
+		}
+		if err == io.EOF {
+			break
+		}
+	}
+
+	return m, nil
+}
+
+// poser is a type that implements Pos.
+type poser interface {
+	Pos() token.Pos
+}
+
+// run performs the analysis.
+func run(pass *analysis.Pass) (interface{}, error) {
+	calls, err := loadObjdump()
+	if err != nil {
+		return nil, err
+	}
+	pef := packageEscapeFacts{
+		Funcs: make(map[string][]Escape),
+	}
+	linePosition := func(inst, parent poser) LinePosition {
+		p := pass.Fset.Position(inst.Pos())
+		if (p.Filename == "" || p.Line == 0) && parent != nil {
+			p = pass.Fset.Position(parent.Pos())
+		}
+		return LinePosition{
+			Filename: filepath.Base(p.Filename),
+			Line:     p.Line,
+		}
+	}
+	hasCall := func(inst poser) (string, bool) {
+		p := linePosition(inst, nil)
+		s, ok := calls[p]
+		return s, ok
+	}
+	callSite := func(inst ssa.Instruction) CallSite {
+		return CallSite{
+			LocalPos: inst.Pos(),
+			Resolved: linePosition(inst, inst.Parent()),
+		}
+	}
+	escapes := func(reason EscapeReason, detail string, inst ssa.Instruction, ec *EscapeCount) []Escape {
+		if !ec.Record(reason) {
+			return nil // Skip.
+		}
+		es := Escape{
+			Reason: reason,
+			Detail: detail,
+			Chain:  []CallSite{callSite(inst)},
+		}
+		return []Escape{es}
+	}
+	resolve := func(sub []Escape, inst ssa.Instruction, ec *EscapeCount) (es []Escape) {
+		for _, e := range sub {
+			if !ec.Record(e.Reason) {
+				continue // Skip.
+			}
+			es = append(es, Escape{
+				Reason: e.Reason,
+				Detail: e.Detail,
+				Chain:  append([]CallSite{callSite(inst)}, e.Chain...),
+			})
+		}
+		return es
+	}
+	state := pass.ResultOf[buildssa.Analyzer].(*buildssa.SSA)
+
+	var loadFunc func(*ssa.Function) []Escape // Used below.
+
+	analyzeInstruction := func(inst ssa.Instruction, ec *EscapeCount) []Escape {
+		switch x := inst.(type) {
+		case *ssa.Call:
+			if x.Call.IsInvoke() {
+				// This is an interface dispatch. There is no
+				// way to know if this is actually escaping or
+				// not, since we don't know the underlying
+				// type.
+				call, _ := hasCall(inst)
+				return escapes(interfaceInvoke, call, inst, ec)
+			}
+			switch x := x.Call.Value.(type) {
+			case *ssa.Function:
+				if x.Pkg == nil {
+					// Can't resolve the package.
+					return escapes(unknownPackage, "no package", inst, ec)
+				}
+
+				// Atomic functions are instrinics. We can
+				// assume that they don't escape.
+				if x.Pkg.Pkg.Name() == "atomic" {
+					return nil
+				}
+
+				// Is this a local function? If yes, call the
+				// function to load the local function. The
+				// local escapes are the escapes found in the
+				// local function.
+				if x.Pkg.Pkg == pass.Pkg {
+					return resolve(loadFunc(x), inst, ec)
+				}
+
+				// Recursively collect information from
+				// the other analyzers.
+				var imp packageEscapeFacts
+				if !pass.ImportPackageFact(x.Pkg.Pkg, &imp) {
+					// Unable to import the dependency; we must
+					// declare these as escaping.
+					return escapes(unknownPackage, "no analysis", inst, ec)
+				}
+
+				// The escapes of this instruction are the
+				// escapes of the called function directly.
+				return resolve(imp.Funcs[x.RelString(x.Pkg.Pkg)], inst, ec)
+			case *ssa.Builtin:
+				// Ignore elided escapes.
+				if _, has := hasCall(inst); !has {
+					return nil
+				}
+
+				// Check if the builtin is escaping.
+				for _, name := range escapingBuiltins {
+					if x.Name() == name {
+						return escapes(builtin, name, inst, ec)
+					}
+				}
+			default:
+				// All dynamic calls are counted as soft
+				// escapes. They are similar to interface
+				// dispatches. We cannot actually look up what
+				// this refers to using static analysis alone.
+				call, _ := hasCall(inst)
+				return escapes(dynamicCall, call, inst, ec)
+			}
+		case *ssa.Alloc:
+			// Ignore non-heap allocations.
+			if !x.Heap {
+				return nil
+			}
+
+			// Ignore elided escapes.
+			call, has := hasCall(inst)
+			if !has {
+				return nil
+			}
+
+			// This is a real heap allocation.
+			return escapes(allocation, call, inst, ec)
+		case *ssa.MakeMap:
+			return escapes(builtin, "makemap", inst, ec)
+		case *ssa.MakeSlice:
+			return escapes(builtin, "makeslice", inst, ec)
+		case *ssa.MakeClosure:
+			return escapes(builtin, "makeclosure", inst, ec)
+		case *ssa.MakeChan:
+			return escapes(builtin, "makechan", inst, ec)
+		}
+		return nil // No escapes.
+	}
+
+	var analyzeBasicBlock func(*ssa.BasicBlock, *EscapeCount) []Escape // Recursive.
+	analyzeBasicBlock = func(block *ssa.BasicBlock, ec *EscapeCount) (rval []Escape) {
+		for _, inst := range block.Instrs {
+			rval = append(rval, analyzeInstruction(inst, ec)...)
+		}
+		return rval // N.B. may be empty.
+	}
+
+	loadFunc = func(fn *ssa.Function) []Escape {
+		// Is this already available?
+		name := fn.RelString(pass.Pkg)
+		if es, ok := pef.Funcs[name]; ok {
+			return es
+		}
+
+		// In the case of a true cycle, we assume that the current
+		// function itself has no escapes until the rest of the
+		// analysis is complete. This will trip the above in the case
+		// of a cycle of any kind.
+		pef.Funcs[name] = nil
+
+		// Perform the basic analysis.
+		var (
+			es []Escape
+			ec EscapeCount
+		)
+		if fn.Recover != nil {
+			es = append(es, analyzeBasicBlock(fn.Recover, &ec)...)
+		}
+		for _, block := range fn.Blocks {
+			es = append(es, analyzeBasicBlock(block, &ec)...)
+		}
+
+		// Check for a stack split.
+		if call, has := hasCall(fn); has {
+			es = append(es, Escape{
+				Reason: stackSplit,
+				Detail: call,
+				Chain: []CallSite{CallSite{
+					LocalPos: fn.Pos(),
+					Resolved: linePosition(fn, fn.Parent()),
+				}},
+			})
+		}
+
+		// Save the result and return.
+		pef.Funcs[name] = es
+		return es
+	}
+
+	// Complete all local functions.
+	for _, fn := range state.SrcFuncs {
+		loadFunc(fn)
+	}
+
+	// Build the exception list.
+	exemptions := make(map[LinePosition]string)
+	for _, f := range pass.Files {
+		for _, cg := range f.Comments {
+			for _, c := range cg.List {
+				p := pass.Fset.Position(c.Slash)
+				if strings.HasPrefix(c.Text, exempt) {
+					exemptions[LinePosition{
+						Filename: filepath.Base(p.Filename),
+						Line:     p.Line,
+					}] = c.Text[len(exempt):]
+				}
+			}
+		}
+	}
+
+	// Delete everything matching the excemtions.
+	//
+	// This has the implication that exceptions are applied recursively,
+	// since this now modified set is what will be saved.
+	for name, escapes := range pef.Funcs {
+		var newEscapes []Escape
+		for _, escape := range escapes {
+			isExempt := false
+			for line, _ := range exemptions {
+				// Note that an exemption applies if it is
+				// marked as an exemption anywhere in the call
+				// chain. It need not be marked as escapes in
+				// the function itself, nor in the top-level
+				// caller.
+				for _, callSite := range escape.Chain {
+					if callSite.Resolved == line {
+						isExempt = true
+						break
+					}
+				}
+				if isExempt {
+					break
+				}
+			}
+			if !isExempt {
+				// Record this escape; not an exception.
+				newEscapes = append(newEscapes, escape)
+			}
+		}
+		pef.Funcs[name] = newEscapes // Update.
+	}
+
+	// Export all findings for future packages.
+	pass.ExportPackageFact(&pef)
+
+	// Scan all functions for violations.
+	for _, f := range pass.Files {
+		// Scan all declarations.
+		for _, decl := range f.Decls {
+			fdecl, ok := decl.(*ast.FuncDecl)
+			// Function declaration?
+			if !ok {
+				continue
+			}
+			// Is there a comment?
+			if fdecl.Doc == nil {
+				continue
+			}
+			var (
+				reasons     []EscapeReason
+				found       bool
+				local       bool
+				testReasons = make(map[EscapeReason]bool) // reason -> local?
+			)
+			// Does the comment contain a +checkescape line?
+			for _, c := range fdecl.Doc.List {
+				if !strings.HasPrefix(c.Text, magic) && !strings.HasPrefix(c.Text, testMagic) {
+					continue
+				}
+				if c.Text == magic {
+					// Default: hard reasons, local only.
+					reasons = hardReasons
+					local = true
+				} else if strings.HasPrefix(c.Text, magicParams) {
+					// Extract specific reasons.
+					types := strings.Split(c.Text[len(magicParams):], ",")
+					found = true // For below.
+					for i := 0; i < len(types); i++ {
+						if types[i] == "local" {
+							// Limit search to local escapes.
+							local = true
+						} else if types[i] == "all" {
+							// Append all reasons.
+							reasons = append(reasons, allReasons...)
+						} else if types[i] == "hard" {
+							// Append all hard reasons.
+							reasons = append(reasons, hardReasons...)
+						} else {
+							r, ok := escapeTypes[types[i]]
+							if !ok {
+								// This is not a valid escape reason.
+								pass.Reportf(fdecl.Pos(), "unknown reason: %v", types[i])
+								continue
+							}
+							reasons = append(reasons, r)
+						}
+					}
+				} else if strings.HasPrefix(c.Text, testMagic) {
+					types := strings.Split(c.Text[len(testMagic):], ",")
+					local := false
+					for i := 0; i < len(types); i++ {
+						if types[i] == "local" {
+							local = true
+						} else {
+							r, ok := escapeTypes[types[i]]
+							if !ok {
+								// This is not a valid escape reason.
+								pass.Reportf(fdecl.Pos(), "unknown reason: %v", types[i])
+								continue
+							}
+							if v, ok := testReasons[r]; ok && v {
+								// Already registered as local.
+								continue
+							}
+							testReasons[r] = local
+						}
+					}
+				}
+			}
+			if len(reasons) == 0 && found {
+				// A magic annotation was provided, but no reasons.
+				pass.Reportf(fdecl.Pos(), "no reasons provided")
+				continue
+			}
+
+			// Scan for matches.
+			fn := pass.TypesInfo.Defs[fdecl.Name].(*types.Func)
+			name := state.Pkg.Prog.FuncValue(fn).RelString(pass.Pkg)
+			es, ok := pef.Funcs[name]
+			if !ok {
+				pass.Reportf(fdecl.Pos(), "internal error: function %s not found.", name)
+				continue
+			}
+			for _, e := range es {
+				for _, r := range reasons {
+					// Is does meet our local requirement?
+					if local && len(e.Chain) > 1 {
+						continue
+					}
+					// Does this match the reason? Emit
+					// with a full stack trace that
+					// explains why this violates our
+					// constraints.
+					if e.Reason == r {
+						pass.Reportf(e.Chain[0].LocalPos, "%s", e.String())
+					}
+				}
+			}
+
+			// Scan for test (required) matches.
+			testReasonsFound := make(map[EscapeReason]bool)
+			for _, e := range es {
+				// Is this local?
+				local, ok := testReasons[e.Reason]
+				wantLocal := len(e.Chain) == 1
+				testReasonsFound[e.Reason] = wantLocal
+				if !ok {
+					continue
+				}
+				if local == wantLocal {
+					delete(testReasons, e.Reason)
+				}
+			}
+			for reason, local := range testReasons {
+				// We didn't find the escapes we wanted.
+				pass.Reportf(fdecl.Pos(), fmt.Sprintf("testescapes not found: reason=%s, local=%t", reason, local))
+			}
+			if len(testReasons) > 0 {
+				// Dump all reasons found to help in debugging.
+				for _, e := range es {
+					pass.Reportf(e.Chain[0].LocalPos, "escape found: %s", e.String())
+				}
+			}
+		}
+	}
+
+	return nil, nil
+}
diff --git a/tools/checkescape/test1/BUILD b/tools/checkescape/test1/BUILD
new file mode 100644
index 000000000..783403247
--- /dev/null
+++ b/tools/checkescape/test1/BUILD
@@ -0,0 +1,9 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "test1",
+    srcs = ["test1.go"],
+    visibility = ["//tools/checkescape/test2:__pkg__"],
+)
diff --git a/tools/checkescape/test1/test1.go b/tools/checkescape/test1/test1.go
new file mode 100644
index 000000000..68d3f72cc
--- /dev/null
+++ b/tools/checkescape/test1/test1.go
@@ -0,0 +1,195 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package test1 is a test package.
+package test1
+
+import (
+	"fmt"
+	"reflect"
+)
+
+// Interface is a generic interface.
+type Interface interface {
+	Foo()
+}
+
+// Type is a concrete implementation of Interface.
+type Type struct {
+	A uint64
+	B uint64
+}
+
+// Foo implements Interface.Foo.
+//go:nosplit
+func (t Type) Foo() {
+	fmt.Printf("%v", t) // Never executed.
+}
+
+// +checkescape:all,hard
+//go:nosplit
+func InterfaceFunction(i Interface) {
+	// Do nothing; exported for tests.
+}
+
+// +checkesacape:all,hard
+//go:nosplit
+func TypeFunction(t *Type) {
+}
+
+// +mustescape:local,builtin
+//go:noinline
+//go:nosplit
+func BuiltinMap(x int) map[string]bool {
+	return make(map[string]bool)
+}
+
+// +mustescape:builtin
+//go:noinline
+//go:nosplit
+func builtinMapRec(x int) map[string]bool {
+	return BuiltinMap(x)
+}
+
+// +temustescapestescape:local,builtin
+//go:noinline
+//go:nosplit
+func BuiltinClosure(x int) func() {
+	return func() {
+		fmt.Printf("%v", x)
+	}
+}
+
+// +mustescape:builtin
+//go:noinline
+//go:nosplit
+func builtinClosureRec(x int) func() {
+	return BuiltinClosure(x)
+}
+
+// +mustescape:local,builtin
+//go:noinline
+//go:nosplit
+func BuiltinMakeSlice(x int) []byte {
+	return make([]byte, x)
+}
+
+// +mustescape:builtin
+//go:noinline
+//go:nosplit
+func builtinMakeSliceRec(x int) []byte {
+	return BuiltinMakeSlice(x)
+}
+
+// +mustescape:local,builtin
+//go:noinline
+//go:nosplit
+func BuiltinAppend(x []byte) []byte {
+	return append(x, 0)
+}
+
+// +mustescape:builtin
+//go:noinline
+//go:nosplit
+func builtinAppendRec() []byte {
+	return BuiltinAppend(nil)
+}
+
+// +mustescape:local,builtin
+//go:noinline
+//go:nosplit
+func BuiltinChan() chan int {
+	return make(chan int)
+}
+
+// +mustescape:builtin
+//go:noinline
+//go:nosplit
+func builtinChanRec() chan int {
+	return BuiltinChan()
+}
+
+// +mustescape:local,heap
+//go:noinline
+//go:nosplit
+func Heap() *Type {
+	var t Type
+	return &t
+}
+
+// +mustescape:heap
+//go:noinline
+//go:nosplit
+func heapRec() *Type {
+	return Heap()
+}
+
+// +mustescape:local,interface
+//go:noinline
+//go:nosplit
+func Dispatch(i Interface) {
+	i.Foo()
+}
+
+// +mustescape:interface
+//go:noinline
+//go:nosplit
+func dispatchRec(i Interface) {
+	Dispatch(i)
+}
+
+// +mustescape:local,dynamic
+//go:noinline
+//go:nosplit
+func Dynamic(f func()) {
+	f()
+}
+
+// +mustescape:dynamic
+//go:noinline
+//go:nosplit
+func dynamicRec(f func()) {
+	Dynamic(f)
+}
+
+// +mustescape:local,unknown
+//go:noinline
+//go:nosplit
+func Unknown() {
+	_ = reflect.TypeOf((*Type)(nil)) // Does not actually escape.
+}
+
+// +mustescape:unknown
+//go:noinline
+//go:nosplit
+func unknownRec() {
+	Unknown()
+}
+
+//go:noinline
+//go:nosplit
+func internalFunc() {
+}
+
+// +mustescape:local,stack
+//go:noinline
+func Split() {
+	internalFunc()
+}
+
+// +mustescape:stack
+//go:noinline
+func splitRec() {
+	Split()
+}
diff --git a/tools/checkescape/test2/BUILD b/tools/checkescape/test2/BUILD
new file mode 100644
index 000000000..5a11e4b43
--- /dev/null
+++ b/tools/checkescape/test2/BUILD
@@ -0,0 +1,9 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "test2",
+    srcs = ["test2.go"],
+    deps = ["//tools/checkescape/test1"],
+)
diff --git a/tools/checkescape/test2/test2.go b/tools/checkescape/test2/test2.go
new file mode 100644
index 000000000..7fce3e3be
--- /dev/null
+++ b/tools/checkescape/test2/test2.go
@@ -0,0 +1,94 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package test2 is a test package that imports test1.
+package test2
+
+import (
+	"gvisor.dev/gvisor/tools/checkescape/test1"
+)
+
+// +checkescape:all
+//go:nosplit
+func interfaceFunctionCrossPkg() {
+	var i test1.Interface
+	test1.InterfaceFunction(i)
+}
+
+// +checkesacape:all
+//go:nosplit
+func typeFunctionCrossPkg() {
+	var t test1.Type
+	test1.TypeFunction(&t)
+}
+
+// +mustescape:builtin
+//go:noinline
+func builtinMapCrossPkg(x int) map[string]bool {
+	return test1.BuiltinMap(x)
+}
+
+// +mustescape:builtin
+//go:noinline
+func builtinClosureCrossPkg(x int) func() {
+	return test1.BuiltinClosure(x)
+}
+
+// +mustescape:builtin
+//go:noinline
+func builtinMakeSliceCrossPkg(x int) []byte {
+	return test1.BuiltinMakeSlice(x)
+}
+
+// +mustescape:builtin
+//go:noinline
+func builtinAppendCrossPkg() []byte {
+	return test1.BuiltinAppend(nil)
+}
+
+// +mustescape:builtin
+//go:noinline
+func builtinChanCrossPkg() chan int {
+	return test1.BuiltinChan()
+}
+
+// +mustescape:heap
+//go:noinline
+func heapCrossPkg() *test1.Type {
+	return test1.Heap()
+}
+
+// +mustescape:interface
+//go:noinline
+func dispatchCrossPkg(i test1.Interface) {
+	test1.Dispatch(i)
+}
+
+// +mustescape:dynamic
+//go:noinline
+func dynamicCrossPkg(f func()) {
+	test1.Dynamic(f)
+}
+
+// +mustescape:unknown
+//go:noinline
+func unknownCrossPkg() {
+	test1.Unknown()
+}
+
+// +mustescape:stack
+//go:noinline
+func splitCrosssPkt() {
+	test1.Split()
+}
diff --git a/tools/checkunsafe/BUILD b/tools/checkunsafe/BUILD
index 4f1a31a6d..0c264151b 100644
--- a/tools/checkunsafe/BUILD
+++ b/tools/checkunsafe/BUILD
@@ -1,11 +1,12 @@
-load("//tools:defs.bzl", "go_tool_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-go_tool_library(
+go_library(
     name = "checkunsafe",
     srcs = ["check_unsafe.go"],
-    visibility = ["//:sandbox"],
+    nogo = False,
+    visibility = ["//tools/nogo:__subpackages__"],
     deps = [
         "@org_golang_x_tools//go/analysis:go_tool_library",
     ],
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 91d689a82..6a224d7d5 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,9 +7,10 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/bazeldefs:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _grpcpp = "grpcpp", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/bazeldefs:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _grpcpp = "grpcpp", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
 load("//tools/bazeldefs:platforms.bzl", _default_platform = "default_platform", _platforms = "platforms")
 load("//tools/bazeldefs:tags.bzl", "go_suffixes")
+load("//tools/nogo:defs.bzl", "nogo_test")
 
 # Delegate directly.
 cc_binary = _cc_binary
@@ -25,7 +26,6 @@ gbenchmark = _gbenchmark
 go_embed_data = _go_embed_data
 go_image = _go_image
 go_test = _go_test
-go_tool_library = _go_tool_library
 gtest = _gtest
 grpcpp = _grpcpp
 loopback = _loopback
@@ -38,6 +38,7 @@ py_test = _py_test
 select_arch = _select_arch
 select_system = _select_system
 
+# Platform options.
 default_platform = _default_platform
 platforms = _platforms
 
@@ -91,7 +92,7 @@ def go_imports(name, src, out):
         cmd = ("$(location @org_golang_x_tools//cmd/goimports:goimports) $(SRCS) > $@"),
     )
 
-def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, marshal_debug = False, **kwargs):
+def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, marshal_debug = False, nogo = True, **kwargs):
     """Wraps the standard go_library and does stateification and marshalling.
 
     The recommended way is to use this rule with mostly identical configuration as the native
@@ -177,6 +178,11 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
         deps = all_deps,
         **kwargs
     )
+    if nogo:
+        nogo_test(
+            name = name + "_nogo",
+            deps = [":" + name],
+        )
 
     if marshal:
         # Ignore importpath for go_test.
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
index 8d6f102d5..72ef03a22 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
@@ -44,6 +44,7 @@ func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n *ast.Ident, a *as
 	lenExpr := g.arrayLenExpr(a)
 
 	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
+	g.emit("//go:nosplit\n")
 	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		if size, dynamic := g.scalarSize(elt); !dynamic {
@@ -77,6 +78,7 @@ func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n *ast.Ident, a *as
 	g.emit("}\n\n")
 
 	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
+	g.emit("//go:nosplit\n")
 	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		g.emit("// Array newtypes are always packed.\n")
@@ -99,17 +101,19 @@ func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n *ast.Ident, a *as
 	g.emit("}\n\n")
 
 	g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n")
+	g.emit("//go:nosplit\n")
 	g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
 
-		g.emit("length, err := task.CopyOutBytes(addr, buf[:limit])\n")
+		g.emit("length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
 		g.emitKeepAlive(g.r)
 		g.emit("return length, err\n")
 	})
 	g.emit("}\n\n")
 
 	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
+	g.emit("//go:nosplit\n")
 	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r)
@@ -117,11 +121,12 @@ func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n *ast.Ident, a *as
 	g.emit("}\n\n")
 
 	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
+	g.emit("//go:nosplit\n")
 	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
 
-		g.emit("length, err := task.CopyInBytes(addr, buf)\n")
+		g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
 		g.emitKeepAlive(g.r)
 		g.emit("return length, err\n")
 	})
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go b/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
index ef9bb903d..39f654ea8 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
@@ -104,6 +104,7 @@ func (g *interfaceGenerator) emitMarshallableForPrimitiveNewtype(nt *ast.Ident)
 	g.recordUsedImport("usermem")
 
 	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
+	g.emit("//go:nosplit\n")
 	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		if size, dynamic := g.scalarSize(nt); !dynamic {
@@ -129,6 +130,7 @@ func (g *interfaceGenerator) emitMarshallableForPrimitiveNewtype(nt *ast.Ident)
 	g.emit("}\n\n")
 
 	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
+	g.emit("//go:nosplit\n")
 	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		g.emit("// Scalar newtypes are always packed.\n")
@@ -151,17 +153,19 @@ func (g *interfaceGenerator) emitMarshallableForPrimitiveNewtype(nt *ast.Ident)
 	g.emit("}\n\n")
 
 	g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n")
+	g.emit("//go:nosplit\n")
 	g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
 
-		g.emit("length, err := task.CopyOutBytes(addr, buf[:limit])\n")
+		g.emit("length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
 		g.emitKeepAlive(g.r)
 		g.emit("return length, err\n")
 	})
 	g.emit("}\n\n")
 
 	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
+	g.emit("//go:nosplit\n")
 	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r)
@@ -169,11 +173,12 @@ func (g *interfaceGenerator) emitMarshallableForPrimitiveNewtype(nt *ast.Ident)
 	g.emit("}\n\n")
 
 	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
+	g.emit("//go:nosplit\n")
 	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
 
-		g.emit("length, err := task.CopyInBytes(addr, buf)\n")
+		g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
 		g.emitKeepAlive(g.r)
 		g.emit("return length, err\n")
 	})
@@ -205,6 +210,7 @@ func (g *interfaceGenerator) emitMarshallableSliceForPrimitiveNewtype(nt *ast.Id
 	}
 
 	g.emit("// Copy%sIn copies in a slice of %s objects from the task's memory.\n", slice.ident, eltType)
+	g.emit("//go:nosplit\n")
 	g.emit("func Copy%sIn(task marshal.Task, addr usermem.Addr, dst []%s) (int, error) {\n", slice.ident, eltType)
 	g.inIndent(func() {
 		g.emit("count := len(dst)\n")
@@ -217,13 +223,14 @@ func (g *interfaceGenerator) emitMarshallableSliceForPrimitiveNewtype(nt *ast.Id
 
 		g.emitCastSliceToByteSlice("&dst", "buf", "size * count")
 
-		g.emit("length, err := task.CopyInBytes(addr, buf)\n")
+		g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
 		g.emitKeepAlive("dst")
 		g.emit("return length, err\n")
 	})
 	g.emit("}\n\n")
 
 	g.emit("// Copy%sOut copies a slice of %s objects to the task's memory.\n", slice.ident, eltType)
+	g.emit("//go:nosplit\n")
 	g.emit("func Copy%sOut(task marshal.Task, addr usermem.Addr, src []%s) (int, error) {\n", slice.ident, eltType)
 	g.inIndent(func() {
 		g.emit("count := len(src)\n")
@@ -236,7 +243,7 @@ func (g *interfaceGenerator) emitMarshallableSliceForPrimitiveNewtype(nt *ast.Id
 
 		g.emitCastSliceToByteSlice("&src", "buf", "size * count")
 
-		g.emit("length, err := task.CopyOutBytes(addr, buf)\n")
+		g.emit("length, err := task.CopyOutBytes(addr, buf) // escapes: okay.\n")
 		g.emitKeepAlive("src")
 		g.emit("return length, err\n")
 	})
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_struct.go b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
index 4236e978e..9cd3c9579 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces_struct.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
@@ -249,6 +249,7 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 	g.emit("}\n\n")
 
 	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
+	g.emit("//go:nosplit\n")
 	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		expr, fieldsMaybePacked := g.areFieldsPackedExpression()
@@ -317,15 +318,16 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 	g.emit("}\n\n")
 
 	g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n")
+	g.emit("//go:nosplit\n")
 	g.recordUsedImport("marshal")
 	g.recordUsedImport("usermem")
 	g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		fallback := func() {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
-			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
-			g.emit("%s.MarshalBytes(buf)\n", g.r)
-			g.emit("return task.CopyOutBytes(addr, buf[:limit])\n")
+			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes()) // escapes: okay.\n", g.r)
+			g.emit("%s.MarshalBytes(buf) // escapes: fallback.\n", g.r)
+			g.emit("return task.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
 		}
 		if thisPacked {
 			g.recordUsedImport("reflect")
@@ -339,7 +341,7 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 			// Fast serialization.
 			g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
 
-			g.emit("length, err := task.CopyOutBytes(addr, buf[:limit])\n")
+			g.emit("length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
 			g.emitKeepAlive(g.r)
 			g.emit("return length, err\n")
 		} else {
@@ -349,6 +351,7 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 	g.emit("}\n\n")
 
 	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
+	g.emit("//go:nosplit\n")
 	g.recordUsedImport("marshal")
 	g.recordUsedImport("usermem")
 	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
@@ -358,17 +361,18 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 	g.emit("}\n\n")
 
 	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
+	g.emit("//go:nosplit\n")
 	g.recordUsedImport("marshal")
 	g.recordUsedImport("usermem")
 	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		fallback := func() {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
-			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
-			g.emit("length, err := task.CopyInBytes(addr, buf)\n")
+			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes()) // escapes: okay.\n", g.r)
+			g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
 			g.emit("// Unmarshal unconditionally. If we had a short copy-in, this results in a\n")
 			g.emit("// partially unmarshalled struct.\n")
-			g.emit("%s.UnmarshalBytes(buf)\n", g.r)
+			g.emit("%s.UnmarshalBytes(buf) // escapes: fallback.\n", g.r)
 			g.emit("return length, err\n")
 		}
 		if thisPacked {
@@ -383,7 +387,7 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 			// Fast deserialization.
 			g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
 
-			g.emit("length, err := task.CopyInBytes(addr, buf)\n")
+			g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
 			g.emitKeepAlive(g.r)
 			g.emit("return length, err\n")
 		} else {
diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD
index 3b839799d..2fbcc8a03 100644
--- a/tools/go_marshal/test/BUILD
+++ b/tools/go_marshal/test/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_binary", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 licenses(["notice"])
 
@@ -25,21 +25,10 @@ go_library(
     testonly = 1,
     srcs = ["test.go"],
     marshal = True,
+    visibility = ["//tools/go_marshal/test:__subpackages__"],
     deps = ["//tools/go_marshal/test/external"],
 )
 
-go_binary(
-    name = "escape",
-    testonly = 1,
-    srcs = ["escape.go"],
-    gc_goopts = ["-m"],
-    deps = [
-        ":test",
-        "//pkg/usermem",
-        "//tools/go_marshal/marshal",
-    ],
-)
-
 go_test(
     name = "marshal_test",
     size = "small",
diff --git a/tools/go_marshal/test/escape.go b/tools/go_marshal/test/escape.go
deleted file mode 100644
index 184f05ea3..000000000
--- a/tools/go_marshal/test/escape.go
+++ /dev/null
@@ -1,114 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This binary provides a convienient target for analyzing how the go-marshal
-// API causes its various arguments to escape to the heap. To use, build and
-// observe the output from the go compiler's escape analysis:
-//
-// $ bazel build :escape
-// ...
-// escape.go:67:2: moved to heap: task
-// escape.go:77:31: make([]byte, size) escapes to heap
-// escape.go:87:31: make([]byte, size) escapes to heap
-// escape.go:96:6: moved to heap: stat
-// ...
-//
-// This is not an automated test, but simply a minimal binary for easy analysis.
-package main
-
-import (
-	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
-	"gvisor.dev/gvisor/tools/go_marshal/test"
-)
-
-// dummyTask implements marshal.Task.
-type dummyTask struct {
-}
-
-func (*dummyTask) CopyScratchBuffer(size int) []byte {
-	return make([]byte, size)
-}
-
-func (*dummyTask) CopyOutBytes(addr usermem.Addr, b []byte) (int, error) {
-	return len(b), nil
-}
-
-func (*dummyTask) CopyInBytes(addr usermem.Addr, b []byte) (int, error) {
-	return len(b), nil
-}
-
-func (task *dummyTask) MarshalBytes(addr usermem.Addr, marshallable marshal.Marshallable) {
-	buf := task.CopyScratchBuffer(marshallable.SizeBytes())
-	marshallable.MarshalBytes(buf)
-	task.CopyOutBytes(addr, buf)
-}
-
-func (task *dummyTask) MarshalUnsafe(addr usermem.Addr, marshallable marshal.Marshallable) {
-	buf := task.CopyScratchBuffer(marshallable.SizeBytes())
-	marshallable.MarshalUnsafe(buf)
-	task.CopyOutBytes(addr, buf)
-}
-
-// Expected escapes:
-// - task: passed to marshal.Marshallable.CopyOut as the marshal.Task interface.
-func doCopyOut() {
-	task := dummyTask{}
-	var stat test.Stat
-	stat.CopyOut(&task, usermem.Addr(0xf000ba12))
-}
-
-// Expected escapes:
-// - buf: make allocates on the heap.
-func doMarshalBytesDirect() {
-	task := dummyTask{}
-	var stat test.Stat
-	buf := task.CopyScratchBuffer(stat.SizeBytes())
-	stat.MarshalBytes(buf)
-	task.CopyOutBytes(usermem.Addr(0xf000ba12), buf)
-}
-
-// Expected escapes:
-// - buf: make allocates on the heap.
-func doMarshalUnsafeDirect() {
-	task := dummyTask{}
-	var stat test.Stat
-	buf := task.CopyScratchBuffer(stat.SizeBytes())
-	stat.MarshalUnsafe(buf)
-	task.CopyOutBytes(usermem.Addr(0xf000ba12), buf)
-}
-
-// Expected escapes:
-// - stat: passed to dummyTask.MarshalBytes as the marshal.Marshallable interface.
-func doMarshalBytesViaMarshallable() {
-	task := dummyTask{}
-	var stat test.Stat
-	task.MarshalBytes(usermem.Addr(0xf000ba12), &stat)
-}
-
-// Expected escapes:
-// - stat: passed to dummyTask.MarshalUnsafe as the marshal.Marshallable interface.
-func doMarshalUnsafeViaMarshallable() {
-	task := dummyTask{}
-	var stat test.Stat
-	task.MarshalUnsafe(usermem.Addr(0xf000ba12), &stat)
-}
-
-func main() {
-	doCopyOut()
-	doMarshalBytesDirect()
-	doMarshalUnsafeDirect()
-	doMarshalBytesViaMarshallable()
-	doMarshalUnsafeViaMarshallable()
-}
diff --git a/tools/go_marshal/test/escape/BUILD b/tools/go_marshal/test/escape/BUILD
new file mode 100644
index 000000000..f74e6ffae
--- /dev/null
+++ b/tools/go_marshal/test/escape/BUILD
@@ -0,0 +1,14 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "escape",
+    testonly = 1,
+    srcs = ["escape.go"],
+    deps = [
+        "//pkg/usermem",
+        "//tools/go_marshal/marshal",
+        "//tools/go_marshal/test",
+    ],
+)
diff --git a/tools/go_marshal/test/escape/escape.go b/tools/go_marshal/test/escape/escape.go
new file mode 100644
index 000000000..6a46ddbf8
--- /dev/null
+++ b/tools/go_marshal/test/escape/escape.go
@@ -0,0 +1,95 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package escape
+
+import (
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+	"gvisor.dev/gvisor/tools/go_marshal/test"
+)
+
+// dummyTask implements marshal.Task.
+type dummyTask struct {
+}
+
+func (*dummyTask) CopyScratchBuffer(size int) []byte {
+	return make([]byte, size)
+}
+
+func (*dummyTask) CopyOutBytes(addr usermem.Addr, b []byte) (int, error) {
+	return len(b), nil
+}
+
+func (*dummyTask) CopyInBytes(addr usermem.Addr, b []byte) (int, error) {
+	return len(b), nil
+}
+
+func (t *dummyTask) MarshalBytes(addr usermem.Addr, marshallable marshal.Marshallable) {
+	buf := t.CopyScratchBuffer(marshallable.SizeBytes())
+	marshallable.MarshalBytes(buf)
+	t.CopyOutBytes(addr, buf)
+}
+
+func (t *dummyTask) MarshalUnsafe(addr usermem.Addr, marshallable marshal.Marshallable) {
+	buf := t.CopyScratchBuffer(marshallable.SizeBytes())
+	marshallable.MarshalUnsafe(buf)
+	t.CopyOutBytes(addr, buf)
+}
+
+// +checkescape:all
+//go:nosplit
+func doCopyIn(t *dummyTask) {
+	var stat test.Stat
+	stat.CopyIn(t, usermem.Addr(0xf000ba12))
+}
+
+// +checkescape:all
+//go:nosplit
+func doCopyOut(t *dummyTask) {
+	var stat test.Stat
+	stat.CopyOut(t, usermem.Addr(0xf000ba12))
+}
+
+// +mustescape:builtin
+// +mustescape:stack
+func doMarshalBytesDirect(t *dummyTask) {
+	var stat test.Stat
+	buf := t.CopyScratchBuffer(stat.SizeBytes())
+	stat.MarshalBytes(buf)
+	t.CopyOutBytes(usermem.Addr(0xf000ba12), buf)
+}
+
+// +mustescape:builtin
+// +mustescape:stack
+func doMarshalUnsafeDirect(t *dummyTask) {
+	var stat test.Stat
+	buf := t.CopyScratchBuffer(stat.SizeBytes())
+	stat.MarshalUnsafe(buf)
+	t.CopyOutBytes(usermem.Addr(0xf000ba12), buf)
+}
+
+// +mustescape:local,heap
+// +mustescape:stack
+func doMarshalBytesViaMarshallable(t *dummyTask) {
+	var stat test.Stat
+	t.MarshalBytes(usermem.Addr(0xf000ba12), &stat)
+}
+
+// +mustescape:local,heap
+// +mustescape:stack
+func doMarshalUnsafeViaMarshallable(t *dummyTask) {
+	var stat test.Stat
+	t.MarshalUnsafe(usermem.Addr(0xf000ba12), &stat)
+}
diff --git a/tools/nogo.json b/tools/nogo.json
deleted file mode 100644
index ae969409e..000000000
--- a/tools/nogo.json
+++ /dev/null
@@ -1,39 +0,0 @@
-{
-  "assign": {
-    "exclude_files": {
-      "/external/bazel_gazelle/walk/walk.go": "allowed: false positive"
-    }
-  },
-  "checkunsafe": {
-    "exclude_files": {
-      "/external/": "allowed: not subject to unsafe naming rules"
-    }
-  },
-  "nilness": {
-    "exclude_files": {
-      "/com_github_vishvananda_netlink/route_linux.go": "allowed: false positive",
-      "/external/bazel_gazelle/cmd/gazelle/.*": "allowed: false positive",
-      "/org_golang_x_tools/go/packages/golist.go": "allowed: runtime internals",
-      "/pkg/sentry/platform/kvm/kvm_test.go": "allowed: intentional",
-      "/tools/bigquery/bigquery.go": "allowed: false positive",
-      "/external/io_opencensus_go/tag/map_codec.go": "allowed: false positive"
-    }
-  },
-  "structtag": {
-    "exclude_files": {
-      "/external/": "allowed: may use arbitrary tags"
-    }
-  },
-  "unsafeptr": {
-    "exclude_files": {
-      ".*_test.go": "allowed: exclude tests",
-      "/pkg/flipcall/flipcall_unsafe.go": "allowed: special case",
-      "/pkg/gohacks/gohacks_unsafe.go": "allowed: special case",
-      "/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go": "allowed: special case",
-      "/pkg/sentry/platform/kvm/(bluepill|machine)_unsafe.go": "allowed: special case",
-      "/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go": "allowed: special case",
-      "/pkg/sentry/platform/safecopy/safecopy_unsafe.go": "allowed: special case",
-      "/pkg/sentry/vfs/mount_unsafe.go": "allowed: special case"
-    }
-  }
-}
diff --git a/tools/nogo/BUILD b/tools/nogo/BUILD
new file mode 100644
index 000000000..c21b09511
--- /dev/null
+++ b/tools/nogo/BUILD
@@ -0,0 +1,49 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "nogo",
+    srcs = [
+        "build.go",
+        "config.go",
+        "matchers.go",
+        "nogo.go",
+        "register.go",
+    ],
+    nogo = False,
+    visibility = ["//:sandbox"],
+    deps = [
+        "//tools/checkescape",
+        "//tools/checkunsafe",
+        "//tools/nogo/data",
+        "@org_golang_x_tools//go/analysis:go_tool_library",
+        "@org_golang_x_tools//go/analysis/internal/facts:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/asmdecl:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/assign:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/atomic:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/bools:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/buildtag:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/cgocall:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/composite:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/copylock:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/errorsas:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/httpresponse:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/loopclosure:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/lostcancel:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/nilfunc:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/nilness:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/printf:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/shadow:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/shift:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/stdmethods:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/stringintconv:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/structtag:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/tests:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unmarshal:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unreachable:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unsafeptr:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unusedresult:go_tool_library",
+        "@org_golang_x_tools//go/gcexportdata:go_tool_library",
+    ],
+)
diff --git a/tools/nogo/README.md b/tools/nogo/README.md
new file mode 100644
index 000000000..6e4db18de
--- /dev/null
+++ b/tools/nogo/README.md
@@ -0,0 +1,31 @@
+# Extended "nogo" analysis
+
+This package provides a build aspect that perform nogo analysis. This will be
+automatically injected to all relevant libraries when using the default
+`go_binary` and `go_library` rules.
+
+It exists for several reasons.
+
+*   The default `nogo` provided by bazel is insufficient with respect to the
+    possibility of binary analysis. This package allows us to analyze the
+    generated binary in addition to using the standard analyzers.
+
+*   The configuration provided in this package is much richer than the standard
+    `nogo` JSON blob. Specifically, it allows us to exclude specific structures
+    from the composite rules (such as the Ranges that are common with the set
+    types).
+
+*   The bazel version of `nogo` is run directly against the `go_library` and
+    `go_binary` targets, meaning that any change to the configuration requires a
+    rebuild from scratch (for some reason included all C++ source files in the
+    process). Using an aspect is more efficient in this regard.
+
+*   The checks supported by this package are exported as tests, which makes it
+    easier to reason about and plumb into the build system.
+
+*   For uninteresting reasons, it is impossible to integrate the default `nogo`
+    analyzer provided by bazel with internal Google tooling. To provide a
+    consistent experience, this package allows those systems to be unified.
+
+To use this package, import `nogo_test` from `defs.bzl` and add a single
+dependency which is a `go_binary` or `go_library` rule.
diff --git a/tools/nogo/build.go b/tools/nogo/build.go
new file mode 100644
index 000000000..1c0d08661
--- /dev/null
+++ b/tools/nogo/build.go
@@ -0,0 +1,36 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nogo
+
+import (
+	"fmt"
+	"io"
+	"os"
+)
+
+var (
+	// internalPrefix is the internal path prefix. Note that this is not
+	// special, as paths should be passed relative to the repository root
+	// and should not have any special prefix applied.
+	internalPrefix = fmt.Sprintf("^")
+
+	// externalPrefix is external workspace packages.
+	externalPrefix = "^external/"
+)
+
+// findStdPkg needs to find the bundled standard library packages.
+func findStdPkg(path, GOOS, GOARCH string) (io.ReadCloser, error) {
+	return os.Open(fmt.Sprintf("external/go_sdk/pkg/%s_%s/%s.a", GOOS, GOARCH, path))
+}
diff --git a/tools/nogo/check/BUILD b/tools/nogo/check/BUILD
new file mode 100644
index 000000000..e2d76cd5c
--- /dev/null
+++ b/tools/nogo/check/BUILD
@@ -0,0 +1,12 @@
+load("//tools:defs.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+# Note that the check binary must be public, since an aspect may be applied
+# across lots of different rules in different repositories.
+go_binary(
+    name = "check",
+    srcs = ["main.go"],
+    visibility = ["//visibility:public"],
+    deps = ["//tools/nogo"],
+)
diff --git a/tools/nogo/check/main.go b/tools/nogo/check/main.go
new file mode 100644
index 000000000..3828edf3a
--- /dev/null
+++ b/tools/nogo/check/main.go
@@ -0,0 +1,24 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary check is the nogo entrypoint.
+package main
+
+import (
+	"gvisor.dev/gvisor/tools/nogo"
+)
+
+func main() {
+	nogo.Main()
+}
diff --git a/tools/nogo/config.go b/tools/nogo/config.go
new file mode 100644
index 000000000..0c4b7dd40
--- /dev/null
+++ b/tools/nogo/config.go
@@ -0,0 +1,113 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nogo
+
+import (
+	"golang.org/x/tools/go/analysis"
+	"golang.org/x/tools/go/analysis/passes/asmdecl"
+	"golang.org/x/tools/go/analysis/passes/assign"
+	"golang.org/x/tools/go/analysis/passes/atomic"
+	"golang.org/x/tools/go/analysis/passes/bools"
+	"golang.org/x/tools/go/analysis/passes/buildtag"
+	"golang.org/x/tools/go/analysis/passes/cgocall"
+	"golang.org/x/tools/go/analysis/passes/composite"
+	"golang.org/x/tools/go/analysis/passes/copylock"
+	"golang.org/x/tools/go/analysis/passes/errorsas"
+	"golang.org/x/tools/go/analysis/passes/httpresponse"
+	"golang.org/x/tools/go/analysis/passes/loopclosure"
+	"golang.org/x/tools/go/analysis/passes/lostcancel"
+	"golang.org/x/tools/go/analysis/passes/nilfunc"
+	"golang.org/x/tools/go/analysis/passes/nilness"
+	"golang.org/x/tools/go/analysis/passes/printf"
+	"golang.org/x/tools/go/analysis/passes/shadow"
+	"golang.org/x/tools/go/analysis/passes/shift"
+	"golang.org/x/tools/go/analysis/passes/stdmethods"
+	"golang.org/x/tools/go/analysis/passes/stringintconv"
+	"golang.org/x/tools/go/analysis/passes/structtag"
+	"golang.org/x/tools/go/analysis/passes/tests"
+	"golang.org/x/tools/go/analysis/passes/unmarshal"
+	"golang.org/x/tools/go/analysis/passes/unreachable"
+	"golang.org/x/tools/go/analysis/passes/unsafeptr"
+	"golang.org/x/tools/go/analysis/passes/unusedresult"
+
+	"gvisor.dev/gvisor/tools/checkescape"
+	"gvisor.dev/gvisor/tools/checkunsafe"
+)
+
+var analyzerConfig = map[*analysis.Analyzer]matcher{
+	// Standard analyzers.
+	asmdecl.Analyzer: alwaysMatches(),
+	assign.Analyzer: externalExcluded(
+		".*gazelle/walk/walk.go", // False positive.
+	),
+	atomic.Analyzer:   alwaysMatches(),
+	bools.Analyzer:    alwaysMatches(),
+	buildtag.Analyzer: alwaysMatches(),
+	cgocall.Analyzer:  alwaysMatches(),
+	composite.Analyzer: and(
+		disableMatches(), // Disabled for now.
+		resultExcluded{
+			"Object_",
+			"Range{",
+		},
+	),
+	copylock.Analyzer:     internalMatches(), // Common external issues (e.g. protos).
+	errorsas.Analyzer:     alwaysMatches(),
+	httpresponse.Analyzer: alwaysMatches(),
+	loopclosure.Analyzer:  alwaysMatches(),
+	lostcancel.Analyzer:   internalMatches(), // Common external issues.
+	nilfunc.Analyzer:      alwaysMatches(),
+	nilness.Analyzer: and(
+		internalMatches(), // Common "tautological checks".
+		internalExcluded(
+			"pkg/sentry/platform/kvm/kvm_test.go", // Intentional.
+			"tools/bigquery/bigquery.go",          // False positive.
+		),
+	),
+	printf.Analyzer:     alwaysMatches(),
+	shift.Analyzer:      alwaysMatches(),
+	stdmethods.Analyzer: internalMatches(), // Common external issues (e.g. methods named "Write").
+	stringintconv.Analyzer: and(
+		internalExcluded(),
+		externalExcluded(
+			".*protobuf/.*.go",              // Bad conversions.
+			".*flate/huffman_bit_writer.go", // Bad conversion.
+		),
+	),
+	shadow.Analyzer:      disableMatches(),  // Disabled for now.
+	structtag.Analyzer:   internalMatches(), // External not subject to rules.
+	tests.Analyzer:       alwaysMatches(),
+	unmarshal.Analyzer:   alwaysMatches(),
+	unreachable.Analyzer: internalMatches(),
+	unsafeptr.Analyzer: and(
+		internalMatches(),
+		internalExcluded(
+			".*_test.go",                                               // Exclude tests.
+			"pkg/flipcall/.*_unsafe.go",                                // Special case.
+			"pkg/gohacks/gohacks_unsafe.go",                            // Special case.
+			"pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go",          // Special case.
+			"pkg/sentry/platform/kvm/bluepill_unsafe.go",               // Special case.
+			"pkg/sentry/platform/kvm/machine_unsafe.go",                // Special case.
+			"pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go", // Special case.
+			"pkg/sentry/platform/safecopy/safecopy_unsafe.go",          // Special case.
+			"pkg/sentry/vfs/mount_unsafe.go",                           // Special case.
+		),
+	),
+	unusedresult.Analyzer: alwaysMatches(),
+
+	// Internal analyzers: external packages not subject.
+	checkescape.Analyzer: internalMatches(),
+	checkunsafe.Analyzer: internalMatches(),
+}
diff --git a/tools/nogo/data/BUILD b/tools/nogo/data/BUILD
new file mode 100644
index 000000000..b7564cc44
--- /dev/null
+++ b/tools/nogo/data/BUILD
@@ -0,0 +1,10 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "data",
+    srcs = ["data.go"],
+    nogo = False,
+    visibility = ["//tools:__subpackages__"],
+)
diff --git a/tools/nogo/data/data.go b/tools/nogo/data/data.go
new file mode 100644
index 000000000..eb84d0d27
--- /dev/null
+++ b/tools/nogo/data/data.go
@@ -0,0 +1,21 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package data contains shared data for nogo analysis.
+//
+// This is used to break a dependency cycle.
+package data
+
+// Objdump is the dumped binary under analysis.
+var Objdump string
diff --git a/tools/nogo/defs.bzl b/tools/nogo/defs.bzl
new file mode 100644
index 000000000..6560b57c8
--- /dev/null
+++ b/tools/nogo/defs.bzl
@@ -0,0 +1,172 @@
+"""Nogo rules."""
+
+load("//tools/bazeldefs:defs.bzl", "go_context", "go_importpath", "go_rule")
+
+# NogoInfo is the serialized set of package facts for a nogo analysis.
+#
+# Each go_library rule will generate a corresponding nogo rule, which will run
+# with the source files as input. Note however, that the individual nogo rules
+# are simply stubs that enter into the shadow dependency tree (the "aspect").
+NogoInfo = provider(
+    fields = {
+        "facts": "serialized package facts",
+        "importpath": "package import path",
+        "binaries": "package binary files",
+    },
+)
+
+def _nogo_aspect_impl(target, ctx):
+    # If this is a nogo rule itself (and not the shadow of a go_library or
+    # go_binary rule created by such a rule), then we simply return nothing.
+    # All work is done in the shadow properties for go rules. For a proto
+    # library, we simply skip the analysis portion but still need to return a
+    # valid NogoInfo to reference the generated binary.
+    if ctx.rule.kind == "go_library":
+        srcs = ctx.rule.files.srcs
+    elif ctx.rule.kind == "go_proto_library" or ctx.rule.kind == "go_wrap_cc":
+        srcs = []
+    else:
+        return [NogoInfo()]
+
+    # Construct the Go environment from the go_context.env dictionary.
+    env_prefix = " ".join(["%s=%s" % (key, value) for (key, value) in go_context(ctx).env.items()])
+
+    # Start with all target files and srcs as input.
+    inputs = target.files.to_list() + srcs
+
+    # Generate a shell script that dumps the binary. Annoyingly, this seems
+    # necessary as the context in which a run_shell command runs does not seem
+    # to cleanly allow us redirect stdout to the actual output file. Perhaps
+    # I'm missing something here, but the intermediate script does work.
+    binaries = target.files.to_list()
+    disasm_file = ctx.actions.declare_file(target.label.name + ".out")
+    dumper = ctx.actions.declare_file("%s-dumper" % ctx.label.name)
+    ctx.actions.write(dumper, "\n".join([
+        "#!/bin/bash",
+        "%s %s tool objdump %s > %s\n" % (
+            env_prefix,
+            go_context(ctx).go.path,
+            [f.path for f in binaries if f.path.endswith(".a")][0],
+            disasm_file.path,
+        ),
+    ]), is_executable = True)
+    ctx.actions.run(
+        inputs = binaries,
+        outputs = [disasm_file],
+        tools = go_context(ctx).runfiles,
+        mnemonic = "GoObjdump",
+        progress_message = "Objdump %s" % target.label,
+        executable = dumper,
+    )
+    inputs.append(disasm_file)
+
+    # Extract the importpath for this package.
+    importpath = go_importpath(target)
+
+    # The nogo tool requires a configfile serialized in JSON format to do its
+    # work. This must line up with the nogo.Config fields.
+    facts = ctx.actions.declare_file(target.label.name + ".facts")
+    config = struct(
+        ImportPath = importpath,
+        GoFiles = [src.path for src in srcs if src.path.endswith(".go")],
+        NonGoFiles = [src.path for src in srcs if not src.path.endswith(".go")],
+        GOOS = go_context(ctx).goos,
+        GOARCH = go_context(ctx).goarch,
+        Tags = go_context(ctx).tags,
+        FactMap = {},  # Constructed below.
+        ImportMap = {},  # Constructed below.
+        FactOutput = facts.path,
+        Objdump = disasm_file.path,
+    )
+
+    # Collect all info from shadow dependencies.
+    for dep in ctx.rule.attr.deps:
+        # There will be no file attribute set for all transitive dependencies
+        # that are not go_library or go_binary rules, such as a proto rules.
+        # This is handled by the ctx.rule.kind check above.
+        info = dep[NogoInfo]
+        if not hasattr(info, "facts"):
+            continue
+
+        # Configure where to find the binary & fact files. Note that this will
+        # use .x and .a regardless of whether this is a go_binary rule, since
+        # these dependencies must be go_library rules.
+        x_files = [f.path for f in info.binaries if f.path.endswith(".x")]
+        if not len(x_files):
+            x_files = [f.path for f in info.binaries if f.path.endswith(".a")]
+        config.ImportMap[info.importpath] = x_files[0]
+        config.FactMap[info.importpath] = info.facts.path
+
+        # Ensure the above are available as inputs.
+        inputs.append(info.facts)
+        inputs += info.binaries
+
+    # Write the configuration and run the tool.
+    config_file = ctx.actions.declare_file(target.label.name + ".cfg")
+    ctx.actions.write(config_file, config.to_json())
+    inputs.append(config_file)
+
+    # Run the nogo tool itself.
+    ctx.actions.run(
+        inputs = inputs,
+        outputs = [facts],
+        tools = go_context(ctx).runfiles,
+        executable = ctx.files._nogo[0],
+        mnemonic = "GoStaticAnalysis",
+        progress_message = "Analyzing %s" % target.label,
+        arguments = ["-config=%s" % config_file.path],
+    )
+
+    # Return the package facts as output.
+    return [NogoInfo(
+        facts = facts,
+        importpath = importpath,
+        binaries = binaries,
+    )]
+
+nogo_aspect = go_rule(
+    aspect,
+    implementation = _nogo_aspect_impl,
+    attr_aspects = ["deps"],
+    attrs = {
+        "_nogo": attr.label(
+            default = "//tools/nogo/check:check",
+            allow_single_file = True,
+        ),
+    },
+)
+
+def _nogo_test_impl(ctx):
+    """Check nogo findings."""
+
+    # Build a runner that checks for the existence of the facts file. Note that
+    # the actual build will fail in the case of a broken analysis. We things
+    # this way so that any test applied is effectively pushed down to all
+    # upstream dependencies through the aspect.
+    inputs = []
+    runner = ctx.actions.declare_file("%s-executer" % ctx.label.name)
+    runner_content = ["#!/bin/bash"]
+    for dep in ctx.attr.deps:
+        info = dep[NogoInfo]
+        inputs.append(info.facts)
+
+        # Draw a sweet unicode checkmark with the package name (in green).
+        runner_content.append("echo -e \"\\033[0;32m\\xE2\\x9C\\x94\\033[0;31m\\033[0m %s\"" % info.importpath)
+    runner_content.append("exit 0\n")
+    ctx.actions.write(runner, "\n".join(runner_content), is_executable = True)
+    return [DefaultInfo(
+        runfiles = ctx.runfiles(files = inputs),
+        executable = runner,
+    )]
+
+_nogo_test = rule(
+    implementation = _nogo_test_impl,
+    attrs = {
+        "deps": attr.label_list(aspects = [nogo_aspect]),
+    },
+    test = True,
+)
+
+def nogo_test(**kwargs):
+    tags = kwargs.pop("tags", []) + ["nogo"]
+    _nogo_test(tags = tags, **kwargs)
diff --git a/tools/nogo/io_bazel_rules_go-visibility.patch b/tools/nogo/io_bazel_rules_go-visibility.patch
new file mode 100644
index 000000000..6b64b2e85
--- /dev/null
+++ b/tools/nogo/io_bazel_rules_go-visibility.patch
@@ -0,0 +1,25 @@
+diff --git a/third_party/org_golang_x_tools-extras.patch b/third_party/org_golang_x_tools-extras.patch
+index 133fbccc..5f0d9a47 100644
+--- a/third_party/org_golang_x_tools-extras.patch
++++ b/third_party/org_golang_x_tools-extras.patch
+@@ -32,7 +32,7 @@ diff -urN c/go/analysis/internal/facts/BUILD.bazel d/go/analysis/internal/facts/
+  
+  go_library(
+      name = "go_default_library",
+-@@ -14,6 +14,23 @@
++@@ -14,6 +14,20 @@
+      ],
+  )
+  
+@@ -43,10 +43,7 @@ diff -urN c/go/analysis/internal/facts/BUILD.bazel d/go/analysis/internal/facts/
+ +        "imports.go",
+ +    ],
+ +    importpath = "golang.org/x/tools/go/analysis/internal/facts",
+-+    visibility = [
+-+        "//go/analysis:__subpackages__",
+-+        "@io_bazel_rules_go//go/tools/builders:__pkg__",
+-+    ],
+++    visibility = ["//visibility:public"],
+ +    deps = [
+ +        "//go/analysis:go_tool_library",
+ +        "//go/types/objectpath:go_tool_library",
diff --git a/tools/nogo/matchers.go b/tools/nogo/matchers.go
new file mode 100644
index 000000000..bc5772303
--- /dev/null
+++ b/tools/nogo/matchers.go
@@ -0,0 +1,138 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nogo
+
+import (
+	"go/token"
+	"path/filepath"
+	"regexp"
+	"strings"
+
+	"golang.org/x/tools/go/analysis"
+)
+
+type matcher interface {
+	ShouldReport(d analysis.Diagnostic, fs *token.FileSet) bool
+}
+
+// pathRegexps excludes explicit paths.
+type pathRegexps struct {
+	expr      []*regexp.Regexp
+	whitelist bool
+}
+
+// buildRegexps builds a list of regular expressions.
+//
+// This will panic on error.
+func buildRegexps(prefix string, args ...string) []*regexp.Regexp {
+	result := make([]*regexp.Regexp, 0, len(args))
+	for _, arg := range args {
+		result = append(result, regexp.MustCompile(filepath.Join(prefix, arg)))
+	}
+	return result
+}
+
+// ShouldReport implements matcher.ShouldReport.
+func (p *pathRegexps) ShouldReport(d analysis.Diagnostic, fs *token.FileSet) bool {
+	fullPos := fs.Position(d.Pos).String()
+	for _, path := range p.expr {
+		if path.MatchString(fullPos) {
+			return p.whitelist
+		}
+	}
+	return !p.whitelist
+}
+
+// internalExcluded excludes specific internal paths.
+func internalExcluded(paths ...string) *pathRegexps {
+	return &pathRegexps{
+		expr:      buildRegexps(internalPrefix, paths...),
+		whitelist: false,
+	}
+}
+
+// excludedExcluded excludes specific external paths.
+func externalExcluded(paths ...string) *pathRegexps {
+	return &pathRegexps{
+		expr:      buildRegexps(externalPrefix, paths...),
+		whitelist: false,
+	}
+}
+
+// internalMatches returns a path matcher for internal packages.
+func internalMatches() *pathRegexps {
+	return &pathRegexps{
+		expr:      buildRegexps(internalPrefix, ".*"),
+		whitelist: true,
+	}
+}
+
+// resultExcluded excludes explicit message contents.
+type resultExcluded []string
+
+// ShouldReport implements matcher.ShouldReport.
+func (r resultExcluded) ShouldReport(d analysis.Diagnostic, _ *token.FileSet) bool {
+	for _, str := range r {
+		if strings.Contains(d.Message, str) {
+			return false
+		}
+	}
+	return true // Not blacklisted.
+}
+
+// andMatcher is a composite matcher.
+type andMatcher struct {
+	first  matcher
+	second matcher
+}
+
+// ShouldReport implements matcher.ShouldReport.
+func (a *andMatcher) ShouldReport(d analysis.Diagnostic, fs *token.FileSet) bool {
+	return a.first.ShouldReport(d, fs) && a.second.ShouldReport(d, fs)
+}
+
+// and is a syntactic convension for andMatcher.
+func and(first matcher, second matcher) *andMatcher {
+	return &andMatcher{
+		first:  first,
+		second: second,
+	}
+}
+
+// anyMatcher matches everything.
+type anyMatcher struct{}
+
+// ShouldReport implements matcher.ShouldReport.
+func (anyMatcher) ShouldReport(analysis.Diagnostic, *token.FileSet) bool {
+	return true
+}
+
+// alwaysMatches returns an anyMatcher instance.
+func alwaysMatches() anyMatcher {
+	return anyMatcher{}
+}
+
+// neverMatcher will never match.
+type neverMatcher struct{}
+
+// ShouldReport implements matcher.ShouldReport.
+func (neverMatcher) ShouldReport(analysis.Diagnostic, *token.FileSet) bool {
+	return false
+}
+
+// disableMatches returns a neverMatcher instance.
+func disableMatches() neverMatcher {
+	return neverMatcher{}
+}
diff --git a/tools/nogo/nogo.go b/tools/nogo/nogo.go
new file mode 100644
index 000000000..203cdf688
--- /dev/null
+++ b/tools/nogo/nogo.go
@@ -0,0 +1,316 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package nogo implements binary analysis similar to bazel's nogo,
+// or the unitchecker package. It exists in order to provide additional
+// facilities for analysis, namely plumbing through the output from
+// dumping the generated binary (to analyze actual produced code).
+package nogo
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"go/ast"
+	"go/build"
+	"go/parser"
+	"go/token"
+	"go/types"
+	"io"
+	"io/ioutil"
+	"log"
+	"os"
+	"path/filepath"
+	"reflect"
+
+	"golang.org/x/tools/go/analysis"
+	"golang.org/x/tools/go/analysis/internal/facts"
+	"golang.org/x/tools/go/gcexportdata"
+	"gvisor.dev/gvisor/tools/nogo/data"
+)
+
+// pkgConfig is serialized as the configuration.
+//
+// This contains everything required for the analysis.
+type pkgConfig struct {
+	ImportPath string
+	GoFiles    []string
+	NonGoFiles []string
+	Tags       []string
+	GOOS       string
+	GOARCH     string
+	ImportMap  map[string]string
+	FactMap    map[string]string
+	FactOutput string
+	Objdump    string
+}
+
+// loadFacts finds and loads facts per FactMap.
+func (c *pkgConfig) loadFacts(path string) ([]byte, error) {
+	realPath, ok := c.FactMap[path]
+	if !ok {
+		return nil, nil // No facts available.
+	}
+
+	// Read the files file.
+	data, err := ioutil.ReadFile(realPath)
+	if err != nil {
+		return nil, err
+	}
+	return data, nil
+}
+
+// shouldInclude indicates whether the file should be included.
+//
+// NOTE: This does only basic parsing of tags.
+func (c *pkgConfig) shouldInclude(path string) (bool, error) {
+	ctx := build.Default
+	ctx.GOOS = c.GOOS
+	ctx.GOARCH = c.GOARCH
+	ctx.BuildTags = c.Tags
+	return ctx.MatchFile(filepath.Dir(path), filepath.Base(path))
+}
+
+// importer is an implementation of go/types.Importer.
+//
+// This wraps a configuration, which provides the map of package names to
+// files, and the facts. Note that this importer implementation will always
+// pass when a given package is not available.
+type importer struct {
+	pkgConfig
+	fset  *token.FileSet
+	cache map[string]*types.Package
+}
+
+// Import implements types.Importer.Import.
+func (i *importer) Import(path string) (*types.Package, error) {
+	if path == "unsafe" {
+		// Special case: go/types has pre-defined type information for
+		// unsafe. We ensure that this package is correct, in case any
+		// analyzers are specifically looking for this.
+		return types.Unsafe, nil
+	}
+	realPath, ok := i.ImportMap[path]
+	var (
+		rc  io.ReadCloser
+		err error
+	)
+	if !ok {
+		// Not found in the import path. Attempt to find the package
+		// via the standard library.
+		rc, err = findStdPkg(path, i.GOOS, i.GOARCH)
+	} else {
+		// Open the file.
+		rc, err = os.Open(realPath)
+	}
+	if err != nil {
+		return nil, err
+	}
+	defer rc.Close()
+
+	// Load all exported data.
+	r, err := gcexportdata.NewReader(rc)
+	if err != nil {
+		return nil, err
+	}
+
+	return gcexportdata.Read(r, i.fset, i.cache, path)
+}
+
+// checkPackage runs all analyzers.
+//
+// The implementation was adapted from [1], which was in turn adpated from [2].
+// This returns a list of matching analysis issues, or an error if the analysis
+// could not be completed.
+//
+// [1] bazelbuid/rules_go/tools/builders/nogo_main.go
+// [2] golang.org/x/tools/go/checker/internal/checker
+func checkPackage(config pkgConfig) ([]string, error) {
+	imp := &importer{
+		pkgConfig: config,
+		fset:      token.NewFileSet(),
+		cache:     make(map[string]*types.Package),
+	}
+
+	// Load all source files.
+	var syntax []*ast.File
+	for _, file := range config.GoFiles {
+		include, err := config.shouldInclude(file)
+		if err != nil {
+			return nil, fmt.Errorf("error evaluating file %q: %v", file, err)
+		}
+		if !include {
+			continue
+		}
+		s, err := parser.ParseFile(imp.fset, file, nil, parser.ParseComments)
+		if err != nil {
+			return nil, fmt.Errorf("error parsing file %q: %v", file, err)
+		}
+		syntax = append(syntax, s)
+	}
+
+	// Check type information.
+	typesSizes := types.SizesFor("gc", config.GOARCH)
+	typeConfig := types.Config{Importer: imp}
+	typesInfo := &types.Info{
+		Types:      make(map[ast.Expr]types.TypeAndValue),
+		Uses:       make(map[*ast.Ident]types.Object),
+		Defs:       make(map[*ast.Ident]types.Object),
+		Implicits:  make(map[ast.Node]types.Object),
+		Scopes:     make(map[ast.Node]*types.Scope),
+		Selections: make(map[*ast.SelectorExpr]*types.Selection),
+	}
+	types, err := typeConfig.Check(config.ImportPath, imp.fset, syntax, typesInfo)
+	if err != nil {
+		return nil, fmt.Errorf("error checking types: %v", err)
+	}
+
+	// Load all package facts.
+	facts, err := facts.Decode(types, config.loadFacts)
+	if err != nil {
+		return nil, fmt.Errorf("error decoding facts: %v", err)
+	}
+
+	// Set the binary global for use.
+	data.Objdump = config.Objdump
+
+	// Register fact types and establish dependencies between analyzers.
+	// The visit closure will execute recursively, and populate results
+	// will all required analysis results.
+	diagnostics := make(map[*analysis.Analyzer][]analysis.Diagnostic)
+	results := make(map[*analysis.Analyzer]interface{})
+	var visit func(*analysis.Analyzer) error // For recursion.
+	visit = func(a *analysis.Analyzer) error {
+		if _, ok := results[a]; ok {
+			return nil
+		}
+
+		// Run recursively for all dependencies.
+		for _, req := range a.Requires {
+			if err := visit(req); err != nil {
+				return err
+			}
+		}
+
+		// Prepare the matcher.
+		m := analyzerConfig[a]
+		report := func(d analysis.Diagnostic) {
+			if m.ShouldReport(d, imp.fset) {
+				diagnostics[a] = append(diagnostics[a], d)
+			}
+		}
+
+		// Run the analysis.
+		factFilter := make(map[reflect.Type]bool)
+		for _, f := range a.FactTypes {
+			factFilter[reflect.TypeOf(f)] = true
+		}
+		p := &analysis.Pass{
+			Analyzer:          a,
+			Fset:              imp.fset,
+			Files:             syntax,
+			Pkg:               types,
+			TypesInfo:         typesInfo,
+			ResultOf:          results, // All results.
+			Report:            report,
+			ImportPackageFact: facts.ImportPackageFact,
+			ExportPackageFact: facts.ExportPackageFact,
+			ImportObjectFact:  facts.ImportObjectFact,
+			ExportObjectFact:  facts.ExportObjectFact,
+			AllPackageFacts:   func() []analysis.PackageFact { return facts.AllPackageFacts(factFilter) },
+			AllObjectFacts:    func() []analysis.ObjectFact { return facts.AllObjectFacts(factFilter) },
+			TypesSizes:        typesSizes,
+		}
+		result, err := a.Run(p)
+		if err != nil {
+			return fmt.Errorf("error running analysis %s: %v", a, err)
+		}
+
+		// Sanity check & save the result.
+		if got, want := reflect.TypeOf(result), a.ResultType; got != want {
+			return fmt.Errorf("error: analyzer %s returned a result of type %v, but declared ResultType %v", a, got, want)
+		}
+		results[a] = result
+		return nil // Success.
+	}
+
+	// Visit all analysis recursively.
+	for a, _ := range analyzerConfig {
+		if err := visit(a); err != nil {
+			return nil, err // Already has context.
+		}
+	}
+
+	// Write the output file.
+	if config.FactOutput != "" {
+		factData := facts.Encode()
+		if err := ioutil.WriteFile(config.FactOutput, factData, 0644); err != nil {
+			return nil, fmt.Errorf("error: unable to open facts output %q: %v", config.FactOutput, err)
+		}
+	}
+
+	// Convert all diagnostics to strings.
+	findings := make([]string, 0, len(diagnostics))
+	for a, ds := range diagnostics {
+		for _, d := range ds {
+			// Include the anlyzer name for debugability and configuration.
+			findings = append(findings, fmt.Sprintf("%s: %s: %s", a.Name, imp.fset.Position(d.Pos), d.Message))
+		}
+	}
+
+	// Return all findings.
+	return findings, nil
+}
+
+var (
+	configFile = flag.String("config", "", "configuration file (in JSON format)")
+)
+
+// Main is the entrypoint; it should be called directly from main.
+//
+// N.B. This package registers it's own flags.
+func Main() {
+	// Parse all flags.
+	flag.Parse()
+
+	// Load the configuration.
+	f, err := os.Open(*configFile)
+	if err != nil {
+		log.Fatalf("unable to open configuration %q: %v", *configFile, err)
+	}
+	defer f.Close()
+	config := new(pkgConfig)
+	dec := json.NewDecoder(f)
+	dec.DisallowUnknownFields()
+	if err := dec.Decode(config); err != nil {
+		log.Fatalf("unable to decode configuration: %v", err)
+	}
+
+	// Process the package.
+	findings, err := checkPackage(*config)
+	if err != nil {
+		log.Fatalf("error checking package: %v", err)
+	}
+
+	// No findings?
+	if len(findings) == 0 {
+		os.Exit(0)
+	}
+
+	// Print findings and exit with non-zero code.
+	for _, finding := range findings {
+		fmt.Fprintf(os.Stdout, "%s\n", finding)
+	}
+	os.Exit(1)
+}
diff --git a/tools/nogo/register.go b/tools/nogo/register.go
new file mode 100644
index 000000000..62b499661
--- /dev/null
+++ b/tools/nogo/register.go
@@ -0,0 +1,64 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nogo
+
+import (
+	"encoding/gob"
+	"log"
+
+	"golang.org/x/tools/go/analysis"
+)
+
+// analyzers returns all configured analyzers.
+func analyzers() (all []*analysis.Analyzer) {
+	for a, _ := range analyzerConfig {
+		all = append(all, a)
+	}
+	return all
+}
+
+func init() {
+	// Validate basic configuration.
+	if err := analysis.Validate(analyzers()); err != nil {
+		log.Fatalf("unable to validate analyzer: %v", err)
+	}
+
+	// Register all fact types.
+	//
+	// N.B. This needs to be done recursively, because there may be
+	// analyzers in the Requires list that do not appear explicitly above.
+	registered := make(map[*analysis.Analyzer]struct{})
+	var register func(*analysis.Analyzer)
+	register = func(a *analysis.Analyzer) {
+		if _, ok := registered[a]; ok {
+			return
+		}
+
+		// Regsiter dependencies.
+		for _, da := range a.Requires {
+			register(da)
+		}
+
+		// Register local facts.
+		for _, f := range a.FactTypes {
+			gob.Register(f)
+		}
+
+		registered[a] = struct{}{} // Done.
+	}
+	for _, a := range analyzers() {
+		register(a)
+	}
+}
-- 
cgit v1.2.3


From 120d3b50f4875824ec69f0cc39a09ac84fced35c Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 21 Apr 2020 07:15:25 -0700
Subject: Automated rollback of changelist 307477185

PiperOrigin-RevId: 307598974
---
 pkg/sentry/socket/netfilter/tcp_matcher.go |   5 +-
 pkg/sentry/socket/netfilter/udp_matcher.go |   5 +-
 pkg/tcpip/buffer/view.go                   |  55 ++++----------
 pkg/tcpip/buffer/view_test.go              | 113 -----------------------------
 pkg/tcpip/link/loopback/loopback.go        |  10 +--
 pkg/tcpip/link/sharedmem/sharedmem_test.go |   2 +-
 pkg/tcpip/link/sniffer/sniffer.go          |  65 ++++-------------
 pkg/tcpip/network/arp/arp.go               |   5 +-
 pkg/tcpip/network/ipv4/icmp.go             |  20 ++---
 pkg/tcpip/network/ipv4/ipv4.go             |  12 +--
 pkg/tcpip/network/ipv6/icmp.go             |  74 +++++++------------
 pkg/tcpip/network/ipv6/icmp_test.go        |   3 +-
 pkg/tcpip/network/ipv6/ipv6.go             |   6 +-
 pkg/tcpip/stack/forwarder_test.go          |  13 ++--
 pkg/tcpip/stack/iptables.go                |  22 +-----
 pkg/tcpip/stack/iptables_targets.go        |  23 ++----
 pkg/tcpip/stack/nic.go                     |  34 ++++++---
 pkg/tcpip/stack/packet_buffer.go           |   4 +-
 pkg/tcpip/stack/stack_test.go              |  10 +--
 pkg/tcpip/stack/transport_test.go          |   5 +-
 pkg/tcpip/transport/icmp/endpoint.go       |   8 +-
 pkg/tcpip/transport/tcp/segment.go         |  29 +++-----
 pkg/tcpip/transport/tcp/tcp_test.go        |   4 +-
 pkg/tcpip/transport/udp/endpoint.go        |   6 +-
 pkg/tcpip/transport/udp/protocol.go        |   9 +--
 25 files changed, 147 insertions(+), 395 deletions(-)

diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index 55c0f04f3..ff1cfd8f6 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -121,13 +121,12 @@ func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa
 		tcpHeader = header.TCP(pkt.TransportHeader)
 	} else {
 		// The TCP header hasn't been parsed yet. We have to do it here.
-		hdr, ok := pkt.Data.PullUp(header.TCPMinimumSize)
-		if !ok {
+		if len(pkt.Data.First()) < header.TCPMinimumSize {
 			// There's no valid TCP header here, so we hotdrop the
 			// packet.
 			return false, true
 		}
-		tcpHeader = header.TCP(hdr)
+		tcpHeader = header.TCP(pkt.Data.First())
 	}
 
 	// Check whether the source and destination ports are within the
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index 04d03d494..3359418c1 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -120,13 +120,12 @@ func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa
 		udpHeader = header.UDP(pkt.TransportHeader)
 	} else {
 		// The UDP header hasn't been parsed yet. We have to do it here.
-		hdr, ok := pkt.Data.PullUp(header.UDPMinimumSize)
-		if !ok {
+		if len(pkt.Data.First()) < header.UDPMinimumSize {
 			// There's no valid UDP header here, so we hotdrop the
 			// packet.
 			return false, true
 		}
-		udpHeader = header.UDP(hdr)
+		udpHeader = header.UDP(pkt.Data.First())
 	}
 
 	// Check whether the source and destination ports are within the
diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go
index f01217c91..8ec5d5d5c 100644
--- a/pkg/tcpip/buffer/view.go
+++ b/pkg/tcpip/buffer/view.go
@@ -77,8 +77,7 @@ func NewVectorisedView(size int, views []View) VectorisedView {
 	return VectorisedView{views: views, size: size}
 }
 
-// TrimFront removes the first "count" bytes of the vectorised view. It panics
-// if count > vv.Size().
+// TrimFront removes the first "count" bytes of the vectorised view.
 func (vv *VectorisedView) TrimFront(count int) {
 	for count > 0 && len(vv.views) > 0 {
 		if count < len(vv.views[0]) {
@@ -87,7 +86,7 @@ func (vv *VectorisedView) TrimFront(count int) {
 			return
 		}
 		count -= len(vv.views[0])
-		vv.removeFirst()
+		vv.RemoveFirst()
 	}
 }
 
@@ -105,7 +104,7 @@ func (vv *VectorisedView) Read(v View) (copied int, err error) {
 		count -= len(vv.views[0])
 		copy(v[copied:], vv.views[0])
 		copied += len(vv.views[0])
-		vv.removeFirst()
+		vv.RemoveFirst()
 	}
 	if copied == 0 {
 		return 0, io.EOF
@@ -127,7 +126,7 @@ func (vv *VectorisedView) ReadToVV(dstVV *VectorisedView, count int) (copied int
 		count -= len(vv.views[0])
 		dstVV.AppendView(vv.views[0])
 		copied += len(vv.views[0])
-		vv.removeFirst()
+		vv.RemoveFirst()
 	}
 	return copied
 }
@@ -163,37 +162,22 @@ func (vv *VectorisedView) Clone(buffer []View) VectorisedView {
 	return VectorisedView{views: append(buffer[:0], vv.views...), size: vv.size}
 }
 
-// PullUp returns the first "count" bytes of the vectorised view. If those
-// bytes aren't already contiguous inside the vectorised view, PullUp will
-// reallocate as needed to make them contiguous. PullUp fails and returns false
-// when count > vv.Size().
-func (vv *VectorisedView) PullUp(count int) (View, bool) {
+// First returns the first view of the vectorised view.
+func (vv *VectorisedView) First() View {
 	if len(vv.views) == 0 {
-		return nil, count == 0
-	}
-	if count <= len(vv.views[0]) {
-		return vv.views[0][:count], true
-	}
-	if count > vv.size {
-		return nil, false
+		return nil
 	}
+	return vv.views[0]
+}
 
-	newFirst := NewView(count)
-	i := 0
-	for offset := 0; offset < count; i++ {
-		copy(newFirst[offset:], vv.views[i])
-		if count-offset < len(vv.views[i]) {
-			vv.views[i].TrimFront(count - offset)
-			break
-		}
-		offset += len(vv.views[i])
-		vv.views[i] = nil
+// RemoveFirst removes the first view of the vectorised view.
+func (vv *VectorisedView) RemoveFirst() {
+	if len(vv.views) == 0 {
+		return
 	}
-	// We're guaranteed that i > 0, since count is too large for the first
-	// view.
-	vv.views[i-1] = newFirst
-	vv.views = vv.views[i-1:]
-	return newFirst, true
+	vv.size -= len(vv.views[0])
+	vv.views[0] = nil
+	vv.views = vv.views[1:]
 }
 
 // Size returns the size in bytes of the entire content stored in the vectorised view.
@@ -241,10 +225,3 @@ func (vv *VectorisedView) Readers() []bytes.Reader {
 	}
 	return readers
 }
-
-// removeFirst panics when len(vv.views) < 1.
-func (vv *VectorisedView) removeFirst() {
-	vv.size -= len(vv.views[0])
-	vv.views[0] = nil
-	vv.views = vv.views[1:]
-}
diff --git a/pkg/tcpip/buffer/view_test.go b/pkg/tcpip/buffer/view_test.go
index c56795c7b..106e1994c 100644
--- a/pkg/tcpip/buffer/view_test.go
+++ b/pkg/tcpip/buffer/view_test.go
@@ -16,7 +16,6 @@
 package buffer
 
 import (
-	"bytes"
 	"reflect"
 	"testing"
 )
@@ -371,115 +370,3 @@ func TestVVRead(t *testing.T) {
 		})
 	}
 }
-
-var pullUpTestCases = []struct {
-	comment string
-	in      VectorisedView
-	count   int
-	want    []byte
-	result  VectorisedView
-	ok      bool
-}{
-	{
-		comment: "simple case",
-		in:      vv(2, "12"),
-		count:   1,
-		want:    []byte("1"),
-		result:  vv(2, "12"),
-		ok:      true,
-	},
-	{
-		comment: "entire View",
-		in:      vv(2, "1", "2"),
-		count:   1,
-		want:    []byte("1"),
-		result:  vv(2, "1", "2"),
-		ok:      true,
-	},
-	{
-		comment: "spanning across two Views",
-		in:      vv(3, "1", "23"),
-		count:   2,
-		want:    []byte("12"),
-		result:  vv(3, "12", "3"),
-		ok:      true,
-	},
-	{
-		comment: "spanning across all Views",
-		in:      vv(5, "1", "23", "45"),
-		count:   5,
-		want:    []byte("12345"),
-		result:  vv(5, "12345"),
-		ok:      true,
-	},
-	{
-		comment: "count = 0",
-		in:      vv(1, "1"),
-		count:   0,
-		want:    []byte{},
-		result:  vv(1, "1"),
-		ok:      true,
-	},
-	{
-		comment: "count = size",
-		in:      vv(1, "1"),
-		count:   1,
-		want:    []byte("1"),
-		result:  vv(1, "1"),
-		ok:      true,
-	},
-	{
-		comment: "count too large",
-		in:      vv(3, "1", "23"),
-		count:   4,
-		want:    nil,
-		result:  vv(3, "1", "23"),
-		ok:      false,
-	},
-	{
-		comment: "empty vv",
-		in:      vv(0, ""),
-		count:   1,
-		want:    nil,
-		result:  vv(0, ""),
-		ok:      false,
-	},
-	{
-		comment: "empty vv, count = 0",
-		in:      vv(0, ""),
-		count:   0,
-		want:    nil,
-		result:  vv(0, ""),
-		ok:      true,
-	},
-	{
-		comment: "empty views",
-		in:      vv(3, "", "1", "", "23"),
-		count:   2,
-		want:    []byte("12"),
-		result:  vv(3, "12", "3"),
-		ok:      true,
-	},
-}
-
-func TestPullUp(t *testing.T) {
-	for _, c := range pullUpTestCases {
-		got, ok := c.in.PullUp(c.count)
-
-		// Is the return value right?
-		if ok != c.ok {
-			t.Errorf("Test %q failed when calling PullUp(%d) on %v. Got an ok of %t. Want %t",
-				c.comment, c.count, c.in, ok, c.ok)
-		}
-		if bytes.Compare(got, View(c.want)) != 0 {
-			t.Errorf("Test %q failed when calling PullUp(%d) on %v. Got %v. Want %v",
-				c.comment, c.count, c.in, got, c.want)
-		}
-
-		// Is the underlying structure right?
-		if !reflect.DeepEqual(c.in, c.result) {
-			t.Errorf("Test %q failed when calling PullUp(%d). Got vv with structure %v. Wanted %v",
-				c.comment, c.count, c.in, c.result)
-		}
-	}
-}
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index 073c84ef9..1e2255bfa 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -98,13 +98,13 @@ func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	// There should be an ethernet header at the beginning of vv.
-	hdr, ok := vv.PullUp(header.EthernetMinimumSize)
-	if !ok {
-		// Reject the packet if it's shorter than an ethernet header.
+	// Reject the packet if it's shorter than an ethernet header.
+	if vv.Size() < header.EthernetMinimumSize {
 		return tcpip.ErrBadAddress
 	}
-	linkHeader := header.Ethernet(hdr)
+
+	// There should be an ethernet header at the beginning of vv.
+	linkHeader := header.Ethernet(vv.First()[:header.EthernetMinimumSize])
 	vv.TrimFront(len(linkHeader))
 	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), stack.PacketBuffer{
 		Data:       vv,
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 33f640b85..27ea3f531 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -674,7 +674,7 @@ func TestSimpleReceive(t *testing.T) {
 		// Wait for packet to be received, then check it.
 		c.waitForPackets(1, time.After(5*time.Second), "Timeout waiting for packet")
 		c.mu.Lock()
-		rcvd := []byte(c.packets[0].vv.ToView())
+		rcvd := []byte(c.packets[0].vv.First())
 		c.packets = c.packets[:0]
 		c.mu.Unlock()
 
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 0799c8f4d..be2537a82 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -171,7 +171,11 @@ func (e *endpoint) GSOMaxSize() uint32 {
 func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	writer := e.writer
 	if writer == nil && atomic.LoadUint32(&LogPackets) == 1 {
-		logPacket(prefix, protocol, pkt, gso)
+		first := pkt.Header.View()
+		if len(first) == 0 {
+			first = pkt.Data.First()
+		}
+		logPacket(prefix, protocol, first, gso)
 	}
 	if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 {
 		totalLength := pkt.Header.UsedLength() + pkt.Data.Size()
@@ -234,7 +238,7 @@ func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 // Wait implements stack.LinkEndpoint.Wait.
 func (e *endpoint) Wait() { e.lower.Wait() }
 
-func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) {
+func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.View, gso *stack.GSO) {
 	// Figure out the network layer info.
 	var transProto uint8
 	src := tcpip.Address("unknown")
@@ -243,49 +247,28 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 	size := uint16(0)
 	var fragmentOffset uint16
 	var moreFragments bool
-
-	// Create a clone of pkt, including any headers if present. Avoid allocating
-	// backing memory for the clone.
-	views := [8]buffer.View{}
-	vv := buffer.NewVectorisedView(0, views[:0])
-	vv.AppendView(pkt.Header.View())
-	vv.Append(pkt.Data)
-
 	switch protocol {
 	case header.IPv4ProtocolNumber:
-		hdr, ok := vv.PullUp(header.IPv4MinimumSize)
-		if !ok {
-			return
-		}
-		ipv4 := header.IPv4(hdr)
+		ipv4 := header.IPv4(b)
 		fragmentOffset = ipv4.FragmentOffset()
 		moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments
 		src = ipv4.SourceAddress()
 		dst = ipv4.DestinationAddress()
 		transProto = ipv4.Protocol()
 		size = ipv4.TotalLength() - uint16(ipv4.HeaderLength())
-		vv.TrimFront(int(ipv4.HeaderLength()))
+		b = b[ipv4.HeaderLength():]
 		id = int(ipv4.ID())
 
 	case header.IPv6ProtocolNumber:
-		hdr, ok := vv.PullUp(header.IPv6MinimumSize)
-		if !ok {
-			return
-		}
-		ipv6 := header.IPv6(hdr)
+		ipv6 := header.IPv6(b)
 		src = ipv6.SourceAddress()
 		dst = ipv6.DestinationAddress()
 		transProto = ipv6.NextHeader()
 		size = ipv6.PayloadLength()
-		vv.TrimFront(header.IPv6MinimumSize)
+		b = b[header.IPv6MinimumSize:]
 
 	case header.ARPProtocolNumber:
-		hdr, ok := vv.PullUp(header.ARPSize)
-		if !ok {
-			return
-		}
-		vv.TrimFront(header.ARPSize)
-		arp := header.ARP(hdr)
+		arp := header.ARP(b)
 		log.Infof(
 			"%s arp %v (%v) -> %v (%v) valid:%v",
 			prefix,
@@ -301,7 +284,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 
 	// We aren't guaranteed to have a transport header - it's possible for
 	// writes via raw endpoints to contain only network headers.
-	if minSize, ok := transportProtocolMinSizes[tcpip.TransportProtocolNumber(transProto)]; ok && vv.Size() < minSize {
+	if minSize, ok := transportProtocolMinSizes[tcpip.TransportProtocolNumber(transProto)]; ok && len(b) < minSize {
 		log.Infof("%s %v -> %v transport protocol: %d, but no transport header found (possible raw packet)", prefix, src, dst, transProto)
 		return
 	}
@@ -314,11 +297,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 	switch tcpip.TransportProtocolNumber(transProto) {
 	case header.ICMPv4ProtocolNumber:
 		transName = "icmp"
-		hdr, ok := vv.PullUp(header.ICMPv4MinimumSize)
-		if !ok {
-			break
-		}
-		icmp := header.ICMPv4(hdr)
+		icmp := header.ICMPv4(b)
 		icmpType := "unknown"
 		if fragmentOffset == 0 {
 			switch icmp.Type() {
@@ -351,11 +330,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 
 	case header.ICMPv6ProtocolNumber:
 		transName = "icmp"
-		hdr, ok := vv.PullUp(header.ICMPv6MinimumSize)
-		if !ok {
-			break
-		}
-		icmp := header.ICMPv6(hdr)
+		icmp := header.ICMPv6(b)
 		icmpType := "unknown"
 		switch icmp.Type() {
 		case header.ICMPv6DstUnreachable:
@@ -386,11 +361,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 
 	case header.UDPProtocolNumber:
 		transName = "udp"
-		hdr, ok := vv.PullUp(header.UDPMinimumSize)
-		if !ok {
-			break
-		}
-		udp := header.UDP(hdr)
+		udp := header.UDP(b)
 		if fragmentOffset == 0 && len(udp) >= header.UDPMinimumSize {
 			srcPort = udp.SourcePort()
 			dstPort = udp.DestinationPort()
@@ -400,11 +371,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 
 	case header.TCPProtocolNumber:
 		transName = "tcp"
-		hdr, ok := vv.PullUp(header.TCPMinimumSize)
-		if !ok {
-			break
-		}
-		tcp := header.TCP(hdr)
+		tcp := header.TCP(b)
 		if fragmentOffset == 0 && len(tcp) >= header.TCPMinimumSize {
 			offset := int(tcp.DataOffset())
 			if offset < header.TCPMinimumSize {
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index cf73a939e..7acbfa0a8 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -93,10 +93,7 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuf
 }
 
 func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
-	v, ok := pkt.Data.PullUp(header.ARPSize)
-	if !ok {
-		return
-	}
+	v := pkt.Data.First()
 	h := header.ARP(v)
 	if !h.IsValid() {
 		return
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
index 4cbefe5ab..c4bf1ba5c 100644
--- a/pkg/tcpip/network/ipv4/icmp.go
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -25,11 +25,7 @@ import (
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
 func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
-	h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
-	if !ok {
-		return
-	}
-	hdr := header.IPv4(h)
+	h := header.IPv4(pkt.Data.First())
 
 	// We don't use IsValid() here because ICMP only requires that the IP
 	// header plus 8 bytes of the transport header be included. So it's
@@ -38,12 +34,12 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 	//
 	// Drop packet if it doesn't have the basic IPv4 header or if the
 	// original source address doesn't match the endpoint's address.
-	if hdr.SourceAddress() != e.id.LocalAddress {
+	if len(h) < header.IPv4MinimumSize || h.SourceAddress() != e.id.LocalAddress {
 		return
 	}
 
-	hlen := int(hdr.HeaderLength())
-	if pkt.Data.Size() < hlen || hdr.FragmentOffset() != 0 {
+	hlen := int(h.HeaderLength())
+	if pkt.Data.Size() < hlen || h.FragmentOffset() != 0 {
 		// We won't be able to handle this if it doesn't contain the
 		// full IPv4 header, or if it's a fragment not at offset 0
 		// (because it won't have the transport header).
@@ -52,15 +48,15 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 
 	// Skip the ip header, then deliver control message.
 	pkt.Data.TrimFront(hlen)
-	p := hdr.TransportProtocol()
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
+	p := h.TransportProtocol()
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
 func (e *endpoint) handleICMP(r *stack.Route, pkt stack.PacketBuffer) {
 	stats := r.Stats()
 	received := stats.ICMP.V4PacketsReceived
-	v, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize)
-	if !ok {
+	v := pkt.Data.First()
+	if len(v) < header.ICMPv4MinimumSize {
 		received.Invalid.Increment()
 		return
 	}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 17202cc7a..104aafbed 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -328,11 +328,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
 	// The packet already has an IP header, but there are a few required
 	// checks.
-	h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
-	if !ok {
-		return tcpip.ErrInvalidOptionValue
-	}
-	ip := header.IPv4(h)
+	ip := header.IPv4(pkt.Data.First())
 	if !ip.IsValid(pkt.Data.Size()) {
 		return tcpip.ErrInvalidOptionValue
 	}
@@ -382,11 +378,7 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuf
 // HandlePacket is called by the link layer when new ipv4 packets arrive for
 // this endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
-	headerView, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
-	if !ok {
-		r.Stats().IP.MalformedPacketsReceived.Increment()
-		return
-	}
+	headerView := pkt.Data.First()
 	h := header.IPv4(headerView)
 	if !h.IsValid(pkt.Data.Size()) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index bdf3a0d25..b68983d10 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -28,11 +28,7 @@ import (
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
 func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
-	h, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
-	if !ok {
-		return
-	}
-	hdr := header.IPv6(h)
+	h := header.IPv6(pkt.Data.First())
 
 	// We don't use IsValid() here because ICMP only requires that up to
 	// 1280 bytes of the original packet be included. So it's likely that it
@@ -40,21 +36,17 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 	//
 	// Drop packet if it doesn't have the basic IPv6 header or if the
 	// original source address doesn't match the endpoint's address.
-	if hdr.SourceAddress() != e.id.LocalAddress {
+	if len(h) < header.IPv6MinimumSize || h.SourceAddress() != e.id.LocalAddress {
 		return
 	}
 
 	// Skip the IP header, then handle the fragmentation header if there
 	// is one.
 	pkt.Data.TrimFront(header.IPv6MinimumSize)
-	p := hdr.TransportProtocol()
+	p := h.TransportProtocol()
 	if p == header.IPv6FragmentHeader {
-		f, ok := pkt.Data.PullUp(header.IPv6FragmentHeaderSize)
-		if !ok {
-			return
-		}
-		fragHdr := header.IPv6Fragment(f)
-		if !fragHdr.IsValid() || fragHdr.FragmentOffset() != 0 {
+		f := header.IPv6Fragment(pkt.Data.First())
+		if !f.IsValid() || f.FragmentOffset() != 0 {
 			// We can't handle fragments that aren't at offset 0
 			// because they don't have the transport headers.
 			return
@@ -63,19 +55,19 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 		// Skip fragmentation header and find out the actual protocol
 		// number.
 		pkt.Data.TrimFront(header.IPv6FragmentHeaderSize)
-		p = fragHdr.TransportProtocol()
+		p = f.TransportProtocol()
 	}
 
 	// Deliver the control packet to the transport endpoint.
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
 func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.PacketBuffer, hasFragmentHeader bool) {
 	stats := r.Stats().ICMP
 	sent := stats.V6PacketsSent
 	received := stats.V6PacketsReceived
-	v, ok := pkt.Data.PullUp(header.ICMPv6HeaderSize)
-	if !ok {
+	v := pkt.Data.First()
+	if len(v) < header.ICMPv6MinimumSize {
 		received.Invalid.Increment()
 		return
 	}
@@ -84,9 +76,11 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	// Validate ICMPv6 checksum before processing the packet.
 	//
+	// Only the first view in vv is accounted for by h. To account for the
+	// rest of vv, a shallow copy is made and the first view is removed.
 	// This copy is used as extra payload during the checksum calculation.
 	payload := pkt.Data.Clone(nil)
-	payload.TrimFront(len(h))
+	payload.RemoveFirst()
 	if got, want := h.Checksum(), header.ICMPv6Checksum(h, iph.SourceAddress(), iph.DestinationAddress(), payload); got != want {
 		received.Invalid.Increment()
 		return
@@ -107,40 +101,34 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 	switch h.Type() {
 	case header.ICMPv6PacketTooBig:
 		received.PacketTooBig.Increment()
-		hdr, ok := pkt.Data.PullUp(header.ICMPv6PacketTooBigMinimumSize)
-		if !ok {
+		if len(v) < header.ICMPv6PacketTooBigMinimumSize {
 			received.Invalid.Increment()
 			return
 		}
 		pkt.Data.TrimFront(header.ICMPv6PacketTooBigMinimumSize)
-		mtu := header.ICMPv6(hdr).MTU()
+		mtu := h.MTU()
 		e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), pkt)
 
 	case header.ICMPv6DstUnreachable:
 		received.DstUnreachable.Increment()
-		hdr, ok := pkt.Data.PullUp(header.ICMPv6DstUnreachableMinimumSize)
-		if !ok {
+		if len(v) < header.ICMPv6DstUnreachableMinimumSize {
 			received.Invalid.Increment()
 			return
 		}
 		pkt.Data.TrimFront(header.ICMPv6DstUnreachableMinimumSize)
-		switch header.ICMPv6(hdr).Code() {
+		switch h.Code() {
 		case header.ICMPv6PortUnreachable:
 			e.handleControl(stack.ControlPortUnreachable, 0, pkt)
 		}
 
 	case header.ICMPv6NeighborSolicit:
 		received.NeighborSolicit.Increment()
-		if pkt.Data.Size() < header.ICMPv6NeighborSolicitMinimumSize || !isNDPValid() {
+		if len(v) < header.ICMPv6NeighborSolicitMinimumSize || !isNDPValid() {
 			received.Invalid.Increment()
 			return
 		}
 
-		// The remainder of payload must be only the neighbor solicitation, so
-		// payload.ToView() always returns the solicitation. Per RFC 6980 section 5,
-		// NDP messages cannot be fragmented. Also note that in the common case NDP
-		// datagrams are very small and ToView() will not incur allocations.
-		ns := header.NDPNeighborSolicit(payload.ToView())
+		ns := header.NDPNeighborSolicit(h.NDPPayload())
 		it, err := ns.Options().Iter(true)
 		if err != nil {
 			// If we have a malformed NDP NS option, drop the packet.
@@ -298,16 +286,12 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	case header.ICMPv6NeighborAdvert:
 		received.NeighborAdvert.Increment()
-		if pkt.Data.Size() < header.ICMPv6NeighborAdvertSize || !isNDPValid() {
+		if len(v) < header.ICMPv6NeighborAdvertSize || !isNDPValid() {
 			received.Invalid.Increment()
 			return
 		}
 
-		// The remainder of payload must be only the neighbor advertisement, so
-		// payload.ToView() always returns the advertisement. Per RFC 6980 section
-		// 5, NDP messages cannot be fragmented. Also note that in the common case
-		// NDP datagrams are very small and ToView() will not incur allocations.
-		na := header.NDPNeighborAdvert(payload.ToView())
+		na := header.NDPNeighborAdvert(h.NDPPayload())
 		it, err := na.Options().Iter(true)
 		if err != nil {
 			// If we have a malformed NDP NA option, drop the packet.
@@ -379,15 +363,14 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	case header.ICMPv6EchoRequest:
 		received.EchoRequest.Increment()
-		icmpHdr, ok := pkt.Data.PullUp(header.ICMPv6EchoMinimumSize)
-		if !ok {
+		if len(v) < header.ICMPv6EchoMinimumSize {
 			received.Invalid.Increment()
 			return
 		}
 		pkt.Data.TrimFront(header.ICMPv6EchoMinimumSize)
 		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize)
 		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize))
-		copy(packet, icmpHdr)
+		copy(packet, h)
 		packet.SetType(header.ICMPv6EchoReply)
 		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data))
 		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
@@ -401,7 +384,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	case header.ICMPv6EchoReply:
 		received.EchoReply.Increment()
-		if pkt.Data.Size() < header.ICMPv6EchoMinimumSize {
+		if len(v) < header.ICMPv6EchoMinimumSize {
 			received.Invalid.Increment()
 			return
 		}
@@ -423,9 +406,8 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 	case header.ICMPv6RouterAdvert:
 		received.RouterAdvert.Increment()
 
-		// Is the NDP payload of sufficient size to hold a Router
-		// Advertisement?
-		if pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRAMinimumSize || !isNDPValid() {
+		p := h.NDPPayload()
+		if len(p) < header.NDPRAMinimumSize || !isNDPValid() {
 			received.Invalid.Increment()
 			return
 		}
@@ -443,11 +425,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 			return
 		}
 
-		// The remainder of payload must be only the router advertisement, so
-		// payload.ToView() always returns the advertisement. Per RFC 6980 section
-		// 5, NDP messages cannot be fragmented. Also note that in the common case
-		// NDP datagrams are very small and ToView() will not incur allocations.
-		ra := header.NDPRouterAdvert(payload.ToView())
+		ra := header.NDPRouterAdvert(p)
 		opts := ra.Options()
 
 		// Are options valid as per the wire format?
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index d412ff688..bd099a7f8 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -166,8 +166,7 @@ func TestICMPCounts(t *testing.T) {
 		},
 		{
 			typ:  header.ICMPv6NeighborSolicit,
-			size: header.ICMPv6NeighborSolicitMinimumSize,
-		},
+			size: header.ICMPv6NeighborSolicitMinimumSize},
 		{
 			typ:       header.ICMPv6NeighborAdvert,
 			size:      header.ICMPv6NeighborAdvertMinimumSize,
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 486725131..331b0817b 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -171,11 +171,7 @@ func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffe
 // HandlePacket is called by the link layer when new ipv6 packets arrive for
 // this endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
-	headerView, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
-	if !ok {
-		r.Stats().IP.MalformedPacketsReceived.Increment()
-		return
-	}
+	headerView := pkt.Data.First()
 	h := header.IPv6(headerView)
 	if !h.IsValid(pkt.Data.Size()) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
index c7c663498..e9c652042 100644
--- a/pkg/tcpip/stack/forwarder_test.go
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -70,10 +70,7 @@ func (f *fwdTestNetworkEndpoint) ID() *NetworkEndpointID {
 
 func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt PacketBuffer) {
 	// Consume the network header.
-	b, ok := pkt.Data.PullUp(fwdTestNetHeaderLen)
-	if !ok {
-		return
-	}
+	b := pkt.Data.First()
 	pkt.Data.TrimFront(fwdTestNetHeaderLen)
 
 	// Dispatch the packet to the transport protocol.
@@ -476,7 +473,7 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
 		t.Fatal("packet not forwarded")
 	}
 
-	b := p.Pkt.Data.ToView()
+	b := p.Pkt.Header.View()
 	if b[0] != 3 {
 		t.Fatalf("got b[0] = %d, want = 3", b[0])
 	}
@@ -520,7 +517,7 @@ func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
 			t.Fatal("packet not forwarded")
 		}
 
-		b := p.Pkt.Data.ToView()
+		b := p.Pkt.Header.View()
 		if b[0] != 3 {
 			t.Fatalf("got b[0] = %d, want = 3", b[0])
 		}
@@ -567,7 +564,7 @@ func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
 			t.Fatal("packet not forwarded")
 		}
 
-		b := p.Pkt.Data.ToView()
+		b := p.Pkt.Header.View()
 		if b[0] != 3 {
 			t.Fatalf("got b[0] = %d, want = 3", b[0])
 		}
@@ -622,7 +619,7 @@ func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
 
 		// The first 5 packets (address 3 to 7) should not be forwarded
 		// because their address resolutions are interrupted.
-		b := p.Pkt.Data.ToView()
+		b := p.Pkt.Header.View()
 		if b[0] < 8 {
 			t.Fatalf("got b[0] = %d, want b[0] >= 8", b[0])
 		}
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
index 6b91159d4..6c0a4b24d 100644
--- a/pkg/tcpip/stack/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -212,11 +212,6 @@ func (it *IPTables) Check(hook Hook, pkt PacketBuffer) bool {
 // CheckPackets runs pkts through the rules for hook and returns a map of packets that
 // should not go forward.
 //
-// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
-//
-// TODO(gvisor.dev/issue/170): pk.NetworkHeader will always be set as a
-// precondition.
-//
 // NOTE: unlike the Check API the returned map contains packets that should be
 // dropped.
 func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList) (drop map[*PacketBuffer]struct{}) {
@@ -231,9 +226,7 @@ func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList) (drop map[*Pa
 	return drop
 }
 
-// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
-// TODO(gvisor.dev/issue/170): pk.NetworkHeader will always be set as a
-// precondition.
+// Precondition: pkt.NetworkHeader is set.
 func (it *IPTables) checkChain(hook Hook, pkt PacketBuffer, table Table, ruleIdx int) chainVerdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
@@ -278,21 +271,14 @@ func (it *IPTables) checkChain(hook Hook, pkt PacketBuffer, table Table, ruleIdx
 	return chainDrop
 }
 
-// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
-// TODO(gvisor.dev/issue/170): pk.NetworkHeader will always be set as a
-// precondition.
+// Precondition: pk.NetworkHeader is set.
 func (it *IPTables) checkRule(hook Hook, pkt PacketBuffer, table Table, ruleIdx int) (RuleVerdict, int) {
 	rule := table.Rules[ruleIdx]
 
 	// If pkt.NetworkHeader hasn't been set yet, it will be contained in
-	// pkt.Data.
+	// pkt.Data.First().
 	if pkt.NetworkHeader == nil {
-		var ok bool
-		pkt.NetworkHeader, ok = pkt.Data.PullUp(header.IPv4MinimumSize)
-		if !ok {
-			// Precondition has been violated.
-			panic(fmt.Sprintf("iptables checks require IPv4 headers of at least %d bytes", header.IPv4MinimumSize))
-		}
+		pkt.NetworkHeader = pkt.Data.First()
 	}
 
 	// Check whether the packet matches the IP header filter.
diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go
index 8be61f4b1..7b4543caf 100644
--- a/pkg/tcpip/stack/iptables_targets.go
+++ b/pkg/tcpip/stack/iptables_targets.go
@@ -96,12 +96,9 @@ func (rt RedirectTarget) Action(pkt PacketBuffer) (RuleVerdict, int) {
 	newPkt := pkt.Clone()
 
 	// Set network header.
-	headerView, ok := newPkt.Data.PullUp(header.IPv4MinimumSize)
-	if !ok {
-		return RuleDrop, 0
-	}
+	headerView := newPkt.Data.First()
 	netHeader := header.IPv4(headerView)
-	newPkt.NetworkHeader = headerView
+	newPkt.NetworkHeader = headerView[:header.IPv4MinimumSize]
 
 	hlen := int(netHeader.HeaderLength())
 	tlen := int(netHeader.TotalLength())
@@ -120,14 +117,10 @@ func (rt RedirectTarget) Action(pkt PacketBuffer) (RuleVerdict, int) {
 		if newPkt.TransportHeader != nil {
 			udpHeader = header.UDP(newPkt.TransportHeader)
 		} else {
-			if pkt.Data.Size() < header.UDPMinimumSize {
-				return RuleDrop, 0
-			}
-			hdr, ok := newPkt.Data.PullUp(header.UDPMinimumSize)
-			if !ok {
+			if len(pkt.Data.First()) < header.UDPMinimumSize {
 				return RuleDrop, 0
 			}
-			udpHeader = header.UDP(hdr)
+			udpHeader = header.UDP(newPkt.Data.First())
 		}
 		udpHeader.SetDestinationPort(rt.MinPort)
 	case header.TCPProtocolNumber:
@@ -135,14 +128,10 @@ func (rt RedirectTarget) Action(pkt PacketBuffer) (RuleVerdict, int) {
 		if newPkt.TransportHeader != nil {
 			tcpHeader = header.TCP(newPkt.TransportHeader)
 		} else {
-			if pkt.Data.Size() < header.TCPMinimumSize {
+			if len(pkt.Data.First()) < header.TCPMinimumSize {
 				return RuleDrop, 0
 			}
-			hdr, ok := newPkt.Data.PullUp(header.TCPMinimumSize)
-			if !ok {
-				return RuleDrop, 0
-			}
-			tcpHeader = header.TCP(hdr)
+			tcpHeader = header.TCP(newPkt.TransportHeader)
 		}
 		// TODO(gvisor.dev/issue/170): Need to recompute checksum
 		// and implement nat connection tracking to support TCP.
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 0c2b1f36a..016dbe15e 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1203,12 +1203,12 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 		n.stack.stats.IP.PacketsReceived.Increment()
 	}
 
-	netHeader, ok := pkt.Data.PullUp(netProto.MinimumPacketSize())
-	if !ok {
+	if len(pkt.Data.First()) < netProto.MinimumPacketSize() {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
-	src, dst := netProto.ParseAddresses(netHeader)
+
+	src, dst := netProto.ParseAddresses(pkt.Data.First())
 
 	if n.stack.handleLocal && !n.isLoopback() && n.getRef(protocol, src) != nil {
 		// The source address is one of our own, so we never should have gotten a
@@ -1289,8 +1289,22 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 
 func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
 	// TODO(b/143425874) Decrease the TTL field in forwarded packets.
-	if linkHeaderLen := int(n.linkEP.MaxHeaderLength()); linkHeaderLen != 0 {
-		pkt.Header = buffer.NewPrependable(linkHeaderLen)
+
+	firstData := pkt.Data.First()
+	pkt.Data.RemoveFirst()
+
+	if linkHeaderLen := int(n.linkEP.MaxHeaderLength()); linkHeaderLen == 0 {
+		pkt.Header = buffer.NewPrependableFromView(firstData)
+	} else {
+		firstDataLen := len(firstData)
+
+		// pkt.Header should have enough capacity to hold n.linkEP's headers.
+		pkt.Header = buffer.NewPrependable(firstDataLen + linkHeaderLen)
+
+		// TODO(b/151227689): avoid copying the packet when forwarding
+		if n := copy(pkt.Header.Prepend(firstDataLen), firstData); n != firstDataLen {
+			panic(fmt.Sprintf("copied %d bytes, expected %d", n, firstDataLen))
+		}
 	}
 
 	if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, pkt); err != nil {
@@ -1318,13 +1332,12 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 	// validly formed.
 	n.stack.demux.deliverRawPacket(r, protocol, pkt)
 
-	transHeader, ok := pkt.Data.PullUp(transProto.MinimumPacketSize())
-	if !ok {
+	if len(pkt.Data.First()) < transProto.MinimumPacketSize() {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
 
-	srcPort, dstPort, err := transProto.ParsePorts(transHeader)
+	srcPort, dstPort, err := transProto.ParsePorts(pkt.Data.First())
 	if err != nil {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
@@ -1362,12 +1375,11 @@ func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcp
 	// ICMPv4 only guarantees that 8 bytes of the transport protocol will
 	// be present in the payload. We know that the ports are within the
 	// first 8 bytes for all known transport protocols.
-	transHeader, ok := pkt.Data.PullUp(8)
-	if !ok {
+	if len(pkt.Data.First()) < 8 {
 		return
 	}
 
-	srcPort, dstPort, err := transProto.ParsePorts(transHeader)
+	srcPort, dstPort, err := transProto.ParsePorts(pkt.Data.First())
 	if err != nil {
 		return
 	}
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index e954a8b7e..dc125f25e 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -37,9 +37,7 @@ type PacketBuffer struct {
 	Data buffer.VectorisedView
 
 	// Header holds the headers of outbound packets. As a packet is passed
-	// down the stack, each layer adds to Header. Note that forwarded
-	// packets don't populate Headers on their way out -- their headers and
-	// payload are never parsed out and remain in Data.
+	// down the stack, each layer adds to Header.
 	Header buffer.Prependable
 
 	// These fields are used by both inbound and outbound packets. They
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index d45d2cc1f..c7634ceb1 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -95,18 +95,16 @@ func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffe
 	f.proto.packetCount[int(f.id.LocalAddress[0])%len(f.proto.packetCount)]++
 
 	// Consume the network header.
-	b, ok := pkt.Data.PullUp(fakeNetHeaderLen)
-	if !ok {
-		return
-	}
+	b := pkt.Data.First()
 	pkt.Data.TrimFront(fakeNetHeaderLen)
 
 	// Handle control packets.
 	if b[2] == uint8(fakeControlProtocol) {
-		nb, ok := pkt.Data.PullUp(fakeNetHeaderLen)
-		if !ok {
+		nb := pkt.Data.First()
+		if len(nb) < fakeNetHeaderLen {
 			return
 		}
+
 		pkt.Data.TrimFront(fakeNetHeaderLen)
 		f.dispatcher.DeliverTransportControlPacket(tcpip.Address(nb[1:2]), tcpip.Address(nb[0:1]), fakeNetNumber, tcpip.TransportProtocolNumber(nb[2]), stack.ControlPortUnreachable, 0, pkt)
 		return
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index a611e44ab..3084e6593 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -642,11 +642,10 @@ func TestTransportForwarding(t *testing.T) {
 		t.Fatal("Response packet not forwarded")
 	}
 
-	hdrs := p.Pkt.Data.ToView()
-	if dst := hdrs[0]; dst != 3 {
+	if dst := p.Pkt.Header.View()[0]; dst != 3 {
 		t.Errorf("Response packet has incorrect destination addresss: got = %d, want = 3", dst)
 	}
-	if src := hdrs[1]; src != 1 {
+	if src := p.Pkt.Header.View()[1]; src != 1 {
 		t.Errorf("Response packet has incorrect source addresss: got = %d, want = 3", src)
 	}
 }
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index b1d820372..feef8dca0 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -747,15 +747,15 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	// Only accept echo replies.
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
-		h, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize)
-		if !ok || header.ICMPv4(h).Type() != header.ICMPv4EchoReply {
+		h := header.ICMPv4(pkt.Data.First())
+		if h.Type() != header.ICMPv4EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 			return
 		}
 	case header.IPv6ProtocolNumber:
-		h, ok := pkt.Data.PullUp(header.ICMPv6MinimumSize)
-		if !ok || header.ICMPv6(h).Type() != header.ICMPv6EchoReply {
+		h := header.ICMPv6(pkt.Data.First())
+		if h.Type() != header.ICMPv6EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 			return
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 7712ce652..40461fd31 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -144,11 +144,7 @@ func (s *segment) logicalLen() seqnum.Size {
 // TCP checksum and stores the checksum and result of checksum verification in
 // the csum and csumValid fields of the segment.
 func (s *segment) parse() bool {
-	h, ok := s.data.PullUp(header.TCPMinimumSize)
-	if !ok {
-		return false
-	}
-	hdr := header.TCP(h)
+	h := header.TCP(s.data.First())
 
 	// h is the header followed by the payload. We check that the offset to
 	// the data respects the following constraints:
@@ -160,16 +156,12 @@ func (s *segment) parse() bool {
 	// N.B. The segment has already been validated as having at least the
 	//      minimum TCP size before reaching here, so it's safe to read the
 	//      fields.
-	offset := int(hdr.DataOffset())
-	if offset < header.TCPMinimumSize {
-		return false
-	}
-	hdrWithOpts, ok := s.data.PullUp(offset)
-	if !ok {
+	offset := int(h.DataOffset())
+	if offset < header.TCPMinimumSize || offset > len(h) {
 		return false
 	}
 
-	s.options = []byte(hdrWithOpts[header.TCPMinimumSize:])
+	s.options = []byte(h[header.TCPMinimumSize:offset])
 	s.parsedOptions = header.ParseTCPOptions(s.options)
 
 	// Query the link capabilities to decide if checksum validation is
@@ -181,19 +173,18 @@ func (s *segment) parse() bool {
 		s.data.TrimFront(offset)
 	}
 	if verifyChecksum {
-		hdr = header.TCP(hdrWithOpts)
-		s.csum = hdr.Checksum()
+		s.csum = h.Checksum()
 		xsum := s.route.PseudoHeaderChecksum(ProtocolNumber, uint16(s.data.Size()))
-		xsum = hdr.CalculateChecksum(xsum)
+		xsum = h.CalculateChecksum(xsum)
 		s.data.TrimFront(offset)
 		xsum = header.ChecksumVV(s.data, xsum)
 		s.csumValid = xsum == 0xffff
 	}
 
-	s.sequenceNumber = seqnum.Value(hdr.SequenceNumber())
-	s.ackNumber = seqnum.Value(hdr.AckNumber())
-	s.flags = hdr.Flags()
-	s.window = seqnum.Size(hdr.WindowSize())
+	s.sequenceNumber = seqnum.Value(h.SequenceNumber())
+	s.ackNumber = seqnum.Value(h.AckNumber())
+	s.flags = h.Flags()
+	s.window = seqnum.Size(h.WindowSize())
 	return true
 }
 
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 286c66cf5..ab1014c7f 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -3548,7 +3548,7 @@ func TestReceivedInvalidSegmentCountIncrement(t *testing.T) {
 		AckNum:  c.IRS.Add(1),
 		RcvWnd:  30000,
 	})
-	tcpbuf := vv.ToView()[header.IPv4MinimumSize:]
+	tcpbuf := vv.First()[header.IPv4MinimumSize:]
 	tcpbuf[header.TCPDataOffset] = ((header.TCPMinimumSize - 1) / 4) << 4
 
 	c.SendSegment(vv)
@@ -3575,7 +3575,7 @@ func TestReceivedIncorrectChecksumIncrement(t *testing.T) {
 		AckNum:  c.IRS.Add(1),
 		RcvWnd:  30000,
 	})
-	tcpbuf := vv.ToView()[header.IPv4MinimumSize:]
+	tcpbuf := vv.First()[header.IPv4MinimumSize:]
 	// Overwrite a byte in the payload which should cause checksum
 	// verification to fail.
 	tcpbuf[(tcpbuf[header.TCPDataOffset]>>4)*4] = 0x4
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 756ab913a..edb54f0be 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -1250,8 +1250,8 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 // endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
 	// Get the header then trim it from the view.
-	hdr, ok := pkt.Data.PullUp(header.UDPMinimumSize)
-	if !ok || int(header.UDP(hdr).Length()) > pkt.Data.Size() {
+	hdr := header.UDP(pkt.Data.First())
+	if int(hdr.Length()) > pkt.Data.Size() {
 		// Malformed packet.
 		e.stack.Stats().UDP.MalformedPacketsReceived.Increment()
 		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
@@ -1286,7 +1286,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		senderAddress: tcpip.FullAddress{
 			NIC:  r.NICID(),
 			Addr: id.RemoteAddress,
-			Port: header.UDP(hdr).SourcePort(),
+			Port: hdr.SourcePort(),
 		},
 	}
 	packet.data = pkt.Data
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 52af6de22..6e31a9bac 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -68,13 +68,8 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 // that don't match any existing endpoint.
 func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
 	// Get the header then trim it from the view.
-	h, ok := pkt.Data.PullUp(header.UDPMinimumSize)
-	if !ok {
-		// Malformed packet.
-		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
-		return true
-	}
-	if int(header.UDP(h).Length()) > pkt.Data.Size() {
+	hdr := header.UDP(pkt.Data.First())
+	if int(hdr.Length()) > pkt.Data.Size() {
 		// Malformed packet.
 		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
 		return true
-- 
cgit v1.2.3


From 7c0f3bc8576addbec001095d754a756691d26df3 Mon Sep 17 00:00:00 2001
From: Dave Bailey <davebailey@google.com>
Date: Tue, 21 Apr 2020 09:34:42 -0700
Subject: Sentry metrics updates.

Sentry metrics with nanoseconds units are labeled as such, and non-cumulative
sentry metrics are supported.

PiperOrigin-RevId: 307621080
---
 pkg/metric/metric.go                   | 41 +++++++++++++++++-----------------
 pkg/metric/metric.proto                | 10 ++++++++-
 pkg/metric/metric_test.go              | 22 +++++++++++-------
 pkg/sentry/fs/file.go                  |  2 +-
 pkg/sentry/fs/gofer/file.go            |  4 ++--
 pkg/sentry/fs/tmpfs/inode_file.go      |  2 +-
 pkg/sentry/socket/netstack/netstack.go | 14 ++++++++----
 7 files changed, 58 insertions(+), 37 deletions(-)

diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go
index 895253625..64aa365ce 100644
--- a/pkg/metric/metric.go
+++ b/pkg/metric/metric.go
@@ -39,16 +39,11 @@ var (
 // Uint64Metric encapsulates a uint64 that represents some kind of metric to be
 // monitored.
 //
-// All metrics must be cumulative, meaning that their values will only increase
-// over time.
-//
 // Metrics are not saved across save/restore and thus reset to zero on restore.
 //
-// TODO(b/67298402): Support non-cumulative metrics.
 // TODO(b/67298427): Support metric fields.
 type Uint64Metric struct {
-	// value is the actual value of the metric. It must be accessed
-	// atomically.
+	// value is the actual value of the metric. It must be accessed atomically.
 	value uint64
 }
 
@@ -110,13 +105,10 @@ type customUint64Metric struct {
 // Register must only be called at init and will return and error if called
 // after Initialized.
 //
-// All metrics must be cumulative, meaning that the return values of value must
-// only increase over time.
-//
 // Preconditions:
 //  * name must be globally unique.
 //  * Initialize/Disable have not been called.
-func RegisterCustomUint64Metric(name string, sync bool, description string, value func() uint64) error {
+func RegisterCustomUint64Metric(name string, cumulative, sync bool, units pb.MetricMetadata_Units, description string, value func() uint64) error {
 	if initialized {
 		return ErrInitializationDone
 	}
@@ -129,9 +121,10 @@ func RegisterCustomUint64Metric(name string, sync bool, description string, valu
 		metadata: &pb.MetricMetadata{
 			Name:        name,
 			Description: description,
-			Cumulative:  true,
+			Cumulative:  cumulative,
 			Sync:        sync,
-			Type:        pb.MetricMetadata_UINT64,
+			Type:        pb.MetricMetadata_TYPE_UINT64,
+			Units:       units,
 		},
 		value: value,
 	}
@@ -140,24 +133,32 @@ func RegisterCustomUint64Metric(name string, sync bool, description string, valu
 
 // MustRegisterCustomUint64Metric calls RegisterCustomUint64Metric and panics
 // if it returns an error.
-func MustRegisterCustomUint64Metric(name string, sync bool, description string, value func() uint64) {
-	if err := RegisterCustomUint64Metric(name, sync, description, value); err != nil {
+func MustRegisterCustomUint64Metric(name string, cumulative, sync bool, description string, value func() uint64) {
+	if err := RegisterCustomUint64Metric(name, cumulative, sync, pb.MetricMetadata_UNITS_NONE, description, value); err != nil {
 		panic(fmt.Sprintf("Unable to register metric %q: %v", name, err))
 	}
 }
 
-// NewUint64Metric creates and registers a new metric with the given name.
+// NewUint64Metric creates and registers a new cumulative metric with the given name.
 //
 // Metrics must be statically defined (i.e., at init).
-func NewUint64Metric(name string, sync bool, description string) (*Uint64Metric, error) {
+func NewUint64Metric(name string, sync bool, units pb.MetricMetadata_Units, description string) (*Uint64Metric, error) {
 	var m Uint64Metric
-	return &m, RegisterCustomUint64Metric(name, sync, description, m.Value)
+	return &m, RegisterCustomUint64Metric(name, true /* cumulative */, sync, units, description, m.Value)
 }
 
-// MustCreateNewUint64Metric calls NewUint64Metric and panics if it returns an
-// error.
+// MustCreateNewUint64Metric calls NewUint64Metric and panics if it returns an error.
 func MustCreateNewUint64Metric(name string, sync bool, description string) *Uint64Metric {
-	m, err := NewUint64Metric(name, sync, description)
+	m, err := NewUint64Metric(name, sync, pb.MetricMetadata_UNITS_NONE, description)
+	if err != nil {
+		panic(fmt.Sprintf("Unable to create metric %q: %v", name, err))
+	}
+	return m
+}
+
+// MustCreateNewUint64NanosecondsMetric calls NewUint64Metric and panics if it returns an error.
+func MustCreateNewUint64NanosecondsMetric(name string, sync bool, description string) *Uint64Metric {
+	m, err := NewUint64Metric(name, sync, pb.MetricMetadata_UNITS_NANOSECONDS, description)
 	if err != nil {
 		panic(fmt.Sprintf("Unable to create metric %q: %v", name, err))
 	}
diff --git a/pkg/metric/metric.proto b/pkg/metric/metric.proto
index a2c2bd1ba..3cc89047d 100644
--- a/pkg/metric/metric.proto
+++ b/pkg/metric/metric.proto
@@ -36,10 +36,18 @@ message MetricMetadata {
   // the monitoring system.
   bool sync = 4;
 
-  enum Type { UINT64 = 0; }
+  enum Type { TYPE_UINT64 = 0; }
 
   // type is the type of the metric value.
   Type type = 5;
+
+  enum Units {
+    UNITS_NONE = 0;
+    UNITS_NANOSECONDS = 1;
+  }
+
+  // units is the units of the metric value.
+  Units units = 6;
 }
 
 // MetricRegistration contains the metadata for all metrics that will be in
diff --git a/pkg/metric/metric_test.go b/pkg/metric/metric_test.go
index 34969385a..c425ea532 100644
--- a/pkg/metric/metric_test.go
+++ b/pkg/metric/metric_test.go
@@ -66,12 +66,12 @@ const (
 func TestInitialize(t *testing.T) {
 	defer reset()
 
-	_, err := NewUint64Metric("/foo", false, fooDescription)
+	_, err := NewUint64Metric("/foo", false, pb.MetricMetadata_UNITS_NONE, fooDescription)
 	if err != nil {
 		t.Fatalf("NewUint64Metric got err %v want nil", err)
 	}
 
-	_, err = NewUint64Metric("/bar", true, barDescription)
+	_, err = NewUint64Metric("/bar", true, pb.MetricMetadata_UNITS_NANOSECONDS, barDescription)
 	if err != nil {
 		t.Fatalf("NewUint64Metric got err %v want nil", err)
 	}
@@ -94,8 +94,8 @@ func TestInitialize(t *testing.T) {
 	foundFoo := false
 	foundBar := false
 	for _, m := range mr.Metrics {
-		if m.Type != pb.MetricMetadata_UINT64 {
-			t.Errorf("Metadata %+v Type got %v want %v", m, m.Type, pb.MetricMetadata_UINT64)
+		if m.Type != pb.MetricMetadata_TYPE_UINT64 {
+			t.Errorf("Metadata %+v Type got %v want %v", m, m.Type, pb.MetricMetadata_TYPE_UINT64)
 		}
 		if !m.Cumulative {
 			t.Errorf("Metadata %+v Cumulative got false want true", m)
@@ -110,6 +110,9 @@ func TestInitialize(t *testing.T) {
 			if m.Sync {
 				t.Errorf("/foo %+v Sync got true want false", m)
 			}
+			if m.Units != pb.MetricMetadata_UNITS_NONE {
+				t.Errorf("/foo %+v Units got %v want %v", m, m.Units, pb.MetricMetadata_UNITS_NONE)
+			}
 		case "/bar":
 			foundBar = true
 			if m.Description != barDescription {
@@ -118,6 +121,9 @@ func TestInitialize(t *testing.T) {
 			if !m.Sync {
 				t.Errorf("/bar %+v Sync got true want false", m)
 			}
+			if m.Units != pb.MetricMetadata_UNITS_NANOSECONDS {
+				t.Errorf("/bar %+v Units got %v want %v", m, m.Units, pb.MetricMetadata_UNITS_NANOSECONDS)
+			}
 		}
 	}
 
@@ -132,12 +138,12 @@ func TestInitialize(t *testing.T) {
 func TestDisable(t *testing.T) {
 	defer reset()
 
-	_, err := NewUint64Metric("/foo", false, fooDescription)
+	_, err := NewUint64Metric("/foo", false, pb.MetricMetadata_UNITS_NONE, fooDescription)
 	if err != nil {
 		t.Fatalf("NewUint64Metric got err %v want nil", err)
 	}
 
-	_, err = NewUint64Metric("/bar", true, barDescription)
+	_, err = NewUint64Metric("/bar", true, pb.MetricMetadata_UNITS_NONE, barDescription)
 	if err != nil {
 		t.Fatalf("NewUint64Metric got err %v want nil", err)
 	}
@@ -161,12 +167,12 @@ func TestDisable(t *testing.T) {
 func TestEmitMetricUpdate(t *testing.T) {
 	defer reset()
 
-	foo, err := NewUint64Metric("/foo", false, fooDescription)
+	foo, err := NewUint64Metric("/foo", false, pb.MetricMetadata_UNITS_NONE, fooDescription)
 	if err != nil {
 		t.Fatalf("NewUint64Metric got err %v want nil", err)
 	}
 
-	_, err = NewUint64Metric("/bar", true, barDescription)
+	_, err = NewUint64Metric("/bar", true, pb.MetricMetadata_UNITS_NONE, barDescription)
 	if err != nil {
 		t.Fatalf("NewUint64Metric got err %v want nil", err)
 	}
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index 78100e448..846252c89 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -44,7 +44,7 @@ var (
 	RecordWaitTime = false
 
 	reads    = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.")
-	readWait = metric.MustCreateNewUint64Metric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.")
+	readWait = metric.MustCreateNewUint64NanosecondsMetric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.")
 )
 
 // IncrementWait increments the given wait time metric, if enabled.
diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go
index 23296f246..b2fcab127 100644
--- a/pkg/sentry/fs/gofer/file.go
+++ b/pkg/sentry/fs/gofer/file.go
@@ -37,9 +37,9 @@ var (
 	opens9P      = metric.MustCreateNewUint64Metric("/gofer/opens_9p", false /* sync */, "Number of times a 9P file was opened from a gofer.")
 	opensHost    = metric.MustCreateNewUint64Metric("/gofer/opens_host", false /* sync */, "Number of times a host file was opened from a gofer.")
 	reads9P      = metric.MustCreateNewUint64Metric("/gofer/reads_9p", false /* sync */, "Number of 9P file reads from a gofer.")
-	readWait9P   = metric.MustCreateNewUint64Metric("/gofer/read_wait_9p", false /* sync */, "Time waiting on 9P file reads from a gofer, in nanoseconds.")
+	readWait9P   = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_9p", false /* sync */, "Time waiting on 9P file reads from a gofer, in nanoseconds.")
 	readsHost    = metric.MustCreateNewUint64Metric("/gofer/reads_host", false /* sync */, "Number of host file reads from a gofer.")
-	readWaitHost = metric.MustCreateNewUint64Metric("/gofer/read_wait_host", false /* sync */, "Time waiting on host file reads from a gofer, in nanoseconds.")
+	readWaitHost = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_host", false /* sync */, "Time waiting on host file reads from a gofer, in nanoseconds.")
 )
 
 // fileOperations implements fs.FileOperations for a remote file system.
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index 25abbc151..1dc75291d 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -39,7 +39,7 @@ var (
 	opensRO  = metric.MustCreateNewUint64Metric("/in_memory_file/opens_ro", false /* sync */, "Number of times an in-memory file was opened in read-only mode.")
 	opensW   = metric.MustCreateNewUint64Metric("/in_memory_file/opens_w", false /* sync */, "Number of times an in-memory file was opened in write mode.")
 	reads    = metric.MustCreateNewUint64Metric("/in_memory_file/reads", false /* sync */, "Number of in-memory file reads.")
-	readWait = metric.MustCreateNewUint64Metric("/in_memory_file/read_wait", false /* sync */, "Time waiting on in-memory file reads, in nanoseconds.")
+	readWait = metric.MustCreateNewUint64NanosecondsMetric("/in_memory_file/read_wait", false /* sync */, "Time waiting on in-memory file reads, in nanoseconds.")
 )
 
 // fileInodeOperations implements fs.InodeOperations for a regular tmpfs file.
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 7ac38764d..d5879c10f 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -63,7 +63,13 @@ import (
 
 func mustCreateMetric(name, description string) *tcpip.StatCounter {
 	var cm tcpip.StatCounter
-	metric.MustRegisterCustomUint64Metric(name, false /* sync */, description, cm.Value)
+	metric.MustRegisterCustomUint64Metric(name, true /* cumulative */, false /* sync */, description, cm.Value)
+	return &cm
+}
+
+func mustCreateGauge(name, description string) *tcpip.StatCounter {
+	var cm tcpip.StatCounter
+	metric.MustRegisterCustomUint64Metric(name, false /* cumulative */, false /* sync */, description, cm.Value)
 	return &cm
 }
 
@@ -151,10 +157,10 @@ var Metrics = tcpip.Stats{
 	TCP: tcpip.TCPStats{
 		ActiveConnectionOpenings:           mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
 		PassiveConnectionOpenings:          mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
-		CurrentEstablished:                 mustCreateMetric("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."),
-		CurrentConnected:                   mustCreateMetric("/netstack/tcp/current_open", "Number of connections that are in connected state."),
+		CurrentEstablished:                 mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."),
+		CurrentConnected:                   mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."),
 		EstablishedResets:                  mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"),
-		EstablishedClosed:                  mustCreateMetric("/netstack/tcp/established_closed", "number of times established TCP connections made a transition to CLOSED state."),
+		EstablishedClosed:                  mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."),
 		EstablishedTimedout:                mustCreateMetric("/netstack/tcp/established_timedout", "Number of times  an established connection was reset because of keep-alive time out."),
 		ListenOverflowSynDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."),
 		ListenOverflowAckDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."),
-- 
cgit v1.2.3


From 8b72623e6ababc5448de0cb347476eaf4a611e2c Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 21 Apr 2020 09:41:40 -0700
Subject: Internal change.

PiperOrigin-RevId: 307622320
---
 test/runner/defs.bzl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl
index 56743a526..0a75b158f 100644
--- a/test/runner/defs.bzl
+++ b/test/runner/defs.bzl
@@ -23,7 +23,7 @@ def _runner_test_impl(ctx):
     # Return with all transitive files.
     runfiles = ctx.runfiles(
         transitive_files = depset(transitive = [
-            depset(target.data_runfiles.files)
+            target.data_runfiles.files
             for target in (ctx.attr.runner, ctx.attr.test)
             if hasattr(target, "data_runfiles")
         ]),
-- 
cgit v1.2.3


From 639c8dd80870133f61465588e717b725417a0c41 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Apr 2020 10:56:04 -0700
Subject: Restore euid upon test finish

PiperOrigin-RevId: 307638329
---
 test/syscalls/linux/uidgid.cc | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc
index ff66a79f4..64d6d0b8f 100644
--- a/test/syscalls/linux/uidgid.cc
+++ b/test/syscalls/linux/uidgid.cc
@@ -253,12 +253,21 @@ TEST(UidGidRootTest, Setgroups) {
 TEST(UidGidRootTest, Setuid_prlimit) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
 
-  // Change our UID.
-  EXPECT_THAT(seteuid(65534), SyscallSucceeds());
+  // Do seteuid in a separate thread so that after finishing this test, the
+  // process can still open files the test harness created before starting this
+  // test. Otherwise, the files are created by root (UID before the test), but
+  // cannot be opened by the `uid` set below after the test.
+  ScopedThread([&] {
+    // Use syscall instead of glibc setuid wrapper because we want this seteuid
+    // call to only apply to this task. POSIX threads, however, require that all
+    // threads have the same UIDs, so using the seteuid wrapper sets all
+    // threads' UID.
+    EXPECT_THAT(syscall(SYS_setreuid, -1, 65534), SyscallSucceeds());
 
-  // Despite the UID change, we should be able to get our own limits.
-  struct rlimit rl = {};
-  ASSERT_THAT(prlimit(0, RLIMIT_NOFILE, NULL, &rl), SyscallSucceeds());
+    // Despite the UID change, we should be able to get our own limits.
+    struct rlimit rl = {};
+    EXPECT_THAT(prlimit(0, RLIMIT_NOFILE, NULL, &rl), SyscallSucceeds());
+  });
 }
 
 }  // namespace
-- 
cgit v1.2.3


From 9b5e305e05ef3ad51778981062d6152cea1cd4fb Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 21 Apr 2020 12:16:42 -0700
Subject: Remove filesystem structure from vfs.Dentry.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change:

- Drastically simplifies the synchronization model: filesystem structure is
  both implementation-defined and implementation-synchronized.

- Allows implementations of vfs.DentryImpl to use implementation-specific
  dentry types, reducing casts during path traversal.

- Doesn't require dentries representing non-directory files to waste space on a
  map of children.

- Allows dentry revalidation and mount lookup to be correctly ordered (fixed
  FIXME in fsimpl/gofer/filesystem.go).

- Removes the need to have two separate maps in gofer.dentry
  (dentry.vfsd.children and dentry.negativeChildren) for positive and negative
  lookups respectively.

//pkg/sentry/fsimpl/tmpfs/benchmark_test.go:
name                        old time/op  new time/op  delta
VFS2TmpfsStat/1-112          172ns ± 4%   165ns ± 3%   -4.08%  (p=0.002 n=9+9)
VFS2TmpfsStat/2-112          199ns ± 3%   195ns ±10%     ~     (p=0.132 n=8+9)
VFS2TmpfsStat/3-112          230ns ± 2%   216ns ± 2%   -6.15%  (p=0.000 n=8+8)
VFS2TmpfsStat/8-112          390ns ± 2%   358ns ± 4%   -8.33%  (p=0.000 n=9+8)
VFS2TmpfsStat/64-112        2.20µs ± 3%  2.01µs ± 3%   -8.48%  (p=0.000 n=10+8)
VFS2TmpfsStat/100-112       3.42µs ± 9%  3.08µs ± 2%   -9.82%  (p=0.000 n=9+8)
VFS2TmpfsMountStat/1-112     278ns ± 1%   286ns ±15%     ~     (p=0.712 n=8+10)
VFS2TmpfsMountStat/2-112     311ns ± 4%   298ns ± 2%   -4.27%  (p=0.000 n=9+8)
VFS2TmpfsMountStat/3-112     339ns ± 3%   330ns ± 9%     ~     (p=0.070 n=8+9)
VFS2TmpfsMountStat/8-112     503ns ± 3%   466ns ± 3%   -7.38%  (p=0.000 n=8+8)
VFS2TmpfsMountStat/64-112   2.53µs ±16%  2.17µs ± 7%  -14.19%  (p=0.000 n=10+9)
VFS2TmpfsMountStat/100-112  3.60µs ± 4%  3.30µs ± 8%   -8.33%  (p=0.001 n=8+9)

Updates #1035

PiperOrigin-RevId: 307655892
---
 pkg/sentry/fsimpl/ext/BUILD                   |  12 ++
 pkg/sentry/fsimpl/ext/dentry.go               |   4 +
 pkg/sentry/fsimpl/ext/directory.go            |  21 ++-
 pkg/sentry/fsimpl/ext/filesystem.go           |  54 ++++--
 pkg/sentry/fsimpl/ext/inode.go                |   2 +-
 pkg/sentry/fsimpl/gofer/BUILD                 |  12 ++
 pkg/sentry/fsimpl/gofer/directory.go          |  55 +++---
 pkg/sentry/fsimpl/gofer/filesystem.go         | 202 +++++++++++---------
 pkg/sentry/fsimpl/gofer/gofer.go              |  66 ++++---
 pkg/sentry/fsimpl/gofer/gofer_test.go         |   3 +-
 pkg/sentry/fsimpl/kernfs/BUILD                |  12 ++
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go      |   2 +-
 pkg/sentry/fsimpl/kernfs/filesystem.go        | 159 +++++++++-------
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go   |   2 +-
 pkg/sentry/fsimpl/kernfs/kernfs.go            |  33 ++--
 pkg/sentry/fsimpl/proc/tasks_test.go          |  16 +-
 pkg/sentry/fsimpl/tmpfs/BUILD                 |  12 ++
 pkg/sentry/fsimpl/tmpfs/benchmark_test.go     |   7 -
 pkg/sentry/fsimpl/tmpfs/directory.go          |  84 ++++++---
 pkg/sentry/fsimpl/tmpfs/filesystem.go         | 249 +++++++++++++------------
 pkg/sentry/fsimpl/tmpfs/stat_test.go          |  12 +-
 pkg/sentry/fsimpl/tmpfs/tmpfs.go              |  82 ++++----
 pkg/sentry/vfs/dentry.go                      | 259 +++++---------------------
 pkg/sentry/vfs/file_description.go            |   3 +-
 pkg/sentry/vfs/filesystem.go                  |   5 +-
 pkg/sentry/vfs/filesystem_impl_util.go        |  26 ---
 pkg/sentry/vfs/genericfstree/BUILD            |  16 ++
 pkg/sentry/vfs/genericfstree/genericfstree.go |  80 ++++++++
 pkg/sentry/vfs/mount.go                       |   9 +-
 pkg/sentry/vfs/pathname.go                    |   6 +-
 pkg/sentry/vfs/resolving_path.go              |  85 +++------
 31 files changed, 836 insertions(+), 754 deletions(-)
 create mode 100644 pkg/sentry/vfs/genericfstree/BUILD
 create mode 100644 pkg/sentry/vfs/genericfstree/genericfstree.go

diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index d83d75b3d..a4947c480 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -15,6 +15,17 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "ext",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "dentry",
+    },
+)
+
 go_library(
     name = "ext",
     srcs = [
@@ -26,6 +37,7 @@ go_library(
         "extent_file.go",
         "file_description.go",
         "filesystem.go",
+        "fstree.go",
         "inode.go",
         "regular_file.go",
         "symlink.go",
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index a080cb189..bfbd7c3d4 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -22,6 +22,10 @@ import (
 type dentry struct {
 	vfsd vfs.Dentry
 
+	// Protected by filesystem.mu.
+	parent *dentry
+	name   string
+
 	// inode is the inode represented by this dentry. Multiple Dentries may
 	// share a single non-directory Inode (with hard links). inode is
 	// immutable.
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index bd6ede995..12b875c8f 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -21,7 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
-	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -31,6 +30,10 @@ import (
 type directory struct {
 	inode inode
 
+	// childCache maps filenames to dentries for children for which dentries
+	// have been instantiated. childCache is protected by filesystem.mu.
+	childCache map[string]*dentry
+
 	// mu serializes the changes to childList.
 	// Lock Order (outermost locks must be taken first):
 	//   directory.mu
@@ -50,9 +53,13 @@ type directory struct {
 	childMap map[string]*dirent
 }
 
-// newDirectroy is the directory constructor.
-func newDirectroy(inode inode, newDirent bool) (*directory, error) {
-	file := &directory{inode: inode, childMap: make(map[string]*dirent)}
+// newDirectory is the directory constructor.
+func newDirectory(inode inode, newDirent bool) (*directory, error) {
+	file := &directory{
+		inode:      inode,
+		childCache: make(map[string]*dentry),
+		childMap:   make(map[string]*dirent),
+	}
 	file.inode.impl = file
 
 	// Initialize childList by reading dirents from the underlying file.
@@ -299,9 +306,3 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
 	fd.off = offset
 	return offset, nil
 }
-
-// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
-func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
-	// mmap(2) specifies that EACCESS should be returned for non-regular file fds.
-	return syserror.EACCES
-}
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index afea58f65..2c22a04af 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -89,14 +89,33 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
 	}
 
 	for {
-		nextVFSD, err := rp.ResolveComponent(vfsd)
-		if err != nil {
-			return nil, nil, err
+		name := rp.Component()
+		if name == "." {
+			rp.Advance()
+			return vfsd, inode, nil
 		}
-		if nextVFSD == nil {
-			// Since the Dentry tree is not the sole source of truth for extfs, if it's
-			// not in the Dentry tree, it might need to be pulled from disk.
-			childDirent, ok := inode.impl.(*directory).childMap[rp.Component()]
+		d := vfsd.Impl().(*dentry)
+		if name == ".." {
+			isRoot, err := rp.CheckRoot(vfsd)
+			if err != nil {
+				return nil, nil, err
+			}
+			if isRoot || d.parent == nil {
+				rp.Advance()
+				return vfsd, inode, nil
+			}
+			if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+				return nil, nil, err
+			}
+			rp.Advance()
+			return &d.parent.vfsd, d.parent.inode, nil
+		}
+
+		dir := inode.impl.(*directory)
+		child, ok := dir.childCache[name]
+		if !ok {
+			// We may need to instantiate a new dentry for this child.
+			childDirent, ok := dir.childMap[name]
 			if !ok {
 				// The underlying inode does not exist on disk.
 				return nil, nil, syserror.ENOENT
@@ -115,21 +134,22 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
 			}
 			// incRef because this is being added to the dentry tree.
 			childInode.incRef()
-			child := newDentry(childInode)
-			vfsd.InsertChild(&child.vfsd, rp.Component())
-
-			// Continue as usual now that nextVFSD is not nil.
-			nextVFSD = &child.vfsd
+			child = newDentry(childInode)
+			child.parent = d
+			child.name = name
+			dir.childCache[name] = child
+		}
+		if err := rp.CheckMount(&child.vfsd); err != nil {
+			return nil, nil, err
 		}
-		nextInode := nextVFSD.Impl().(*dentry).inode
-		if nextInode.isSymlink() && rp.ShouldFollowSymlink() {
-			if err := rp.HandleSymlink(inode.impl.(*symlink).target); err != nil {
+		if child.inode.isSymlink() && rp.ShouldFollowSymlink() {
+			if err := rp.HandleSymlink(child.inode.impl.(*symlink).target); err != nil {
 				return nil, nil, err
 			}
 			continue
 		}
 		rp.Advance()
-		return nextVFSD, nextInode, nil
+		return &child.vfsd, child.inode, nil
 	}
 }
 
@@ -515,5 +535,5 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	return vfs.GenericPrependPath(vfsroot, vd, b)
+	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
 }
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index a39a37318..a98512350 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -136,7 +136,7 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 		}
 		return &f.inode, nil
 	case linux.ModeDirectory:
-		f, err := newDirectroy(inode, fs.sb.IncompatibleFeatures().DirentFileType)
+		f, err := newDirectory(inode, fs.sb.IncompatibleFeatures().DirentFileType)
 		if err != nil {
 			return nil, err
 		}
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 99d1e3f8f..acd061905 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -15,12 +15,24 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "gofer",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "dentry",
+    },
+)
+
 go_library(
     name = "gofer",
     srcs = [
         "dentry_list.go",
         "directory.go",
         "filesystem.go",
+        "fstree.go",
         "gofer.go",
         "handle.go",
         "handle_unsafe.go",
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 49d9f859b..d02691232 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -29,13 +29,25 @@ func (d *dentry) isDir() bool {
 	return d.fileType() == linux.S_IFDIR
 }
 
+// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked.
+// d.isDir(). child must be a newly-created dentry that has never had a parent.
+func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
+	d.IncRef() // reference held by child on its parent
+	child.parent = d
+	child.name = name
+	if d.children == nil {
+		d.children = make(map[string]*dentry)
+	}
+	d.children[name] = child
+}
+
 // Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop !=
 // InteropModeShared.
 func (d *dentry) cacheNegativeChildLocked(name string) {
-	if d.negativeChildren == nil {
-		d.negativeChildren = make(map[string]struct{})
+	if d.children == nil {
+		d.children = make(map[string]*dentry)
 	}
-	d.negativeChildren[name] = struct{}{}
+	d.children[name] = nil
 }
 
 type directoryFD struct {
@@ -80,34 +92,32 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 
 // Preconditions: d.isDir(). There exists at least one directoryFD representing d.
 func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
-	// 9P2000.L's readdir does not specify behavior in the presence of
-	// concurrent mutation of an iterated directory, so implementations may
-	// duplicate or omit entries in this case, which violates POSIX semantics.
-	// Thus we read all directory entries while holding d.dirMu to exclude
-	// directory mutations. (Note that it is impossible for the client to
-	// exclude concurrent mutation from other remote filesystem users. Since
-	// there is no way to detect if the server has incorrectly omitted
-	// directory entries, we simply assume that the server is well-behaved
-	// under InteropModeShared.) This is inconsistent with Linux (which appears
-	// to assume that directory fids have the correct semantics, and translates
-	// struct file_operations::readdir calls directly to readdir RPCs), but is
-	// consistent with VFS1.
-	//
-	// NOTE(b/135560623): In particular, some gofer implementations may not
-	// retain state between calls to Readdir, so may not provide a coherent
-	// directory stream across in the presence of mutation.
-
+	// NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the
+	// presence of concurrent mutation of an iterated directory, so
+	// implementations may duplicate or omit entries in this case, which
+	// violates POSIX semantics. Thus we read all directory entries while
+	// holding d.dirMu to exclude directory mutations. (Note that it is
+	// impossible for the client to exclude concurrent mutation from other
+	// remote filesystem users. Since there is no way to detect if the server
+	// has incorrectly omitted directory entries, we simply assume that the
+	// server is well-behaved under InteropModeShared.) This is inconsistent
+	// with Linux (which appears to assume that directory fids have the correct
+	// semantics, and translates struct file_operations::readdir calls directly
+	// to readdir RPCs), but is consistent with VFS1.
+
+	// filesystem.renameMu is needed for d.parent, and must be locked before
+	// dentry.dirMu.
 	d.fs.renameMu.RLock()
-	defer d.fs.renameMu.RUnlock()
 	d.dirMu.Lock()
 	defer d.dirMu.Unlock()
 	if d.dirents != nil {
+		d.fs.renameMu.RUnlock()
 		return d.dirents, nil
 	}
 
 	// It's not clear if 9P2000.L's readdir is expected to return "." and "..",
 	// so we generate them here.
-	parent := d.vfsd.ParentOrSelf().Impl().(*dentry)
+	parent := genericParentOrSelf(d)
 	dirents := []vfs.Dirent{
 		{
 			Name:    ".",
@@ -122,6 +132,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 			NextOff: 2,
 		},
 	}
+	d.fs.renameMu.RUnlock()
 	off := uint64(0)
 	const count = 64 * 1024 // for consistency with the vfs1 client
 	d.handleMu.RLock()
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index cd744bf5e..43e863c61 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -116,6 +116,8 @@ func putDentrySlice(ds *[]*dentry) {
 // Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
 // !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached
 // metadata must be up to date.
+//
+// Postconditions: The returned dentry's cached metadata is up to date.
 func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
@@ -130,39 +132,42 @@ afterSymlink:
 		return d, nil
 	}
 	if name == ".." {
-		parentVFSD, err := rp.ResolveParent(&d.vfsd)
-		if err != nil {
+		if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return d, nil
+		}
+		// We must assume that d.parent is correct, because if d has been moved
+		// elsewhere in the remote filesystem so that its parent has changed,
+		// we have no way of determining its new parent's location in the
+		// filesystem.
+		//
+		// Call rp.CheckMount() before updating d.parent's metadata, since if
+		// we traverse to another mount then d.parent's metadata is irrelevant.
+		if err := rp.CheckMount(&d.parent.vfsd); err != nil {
 			return nil, err
 		}
-		parent := parentVFSD.Impl().(*dentry)
-		if fs.opts.interop == InteropModeShared {
-			// We must assume that parentVFSD is correct, because if d has been
-			// moved elsewhere in the remote filesystem so that its parent has
-			// changed, we have no way of determining its new parent's location
-			// in the filesystem. Get updated metadata for parentVFSD.
-			_, attrMask, attr, err := parent.file.getAttr(ctx, dentryAttrMask())
+		if fs.opts.interop == InteropModeShared && d != d.parent {
+			_, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask())
 			if err != nil {
 				return nil, err
 			}
-			parent.updateFromP9Attrs(attrMask, &attr)
+			d.parent.updateFromP9Attrs(attrMask, &attr)
 		}
 		rp.Advance()
-		return parent, nil
+		return d.parent, nil
 	}
-	childVFSD, err := rp.ResolveChild(&d.vfsd, name)
-	if err != nil {
-		return nil, err
-	}
-	// FIXME(jamieliu): Linux performs revalidation before mount lookup
-	// (fs/namei.c:lookup_fast() => __d_lookup_rcu(), d_revalidate(),
-	// __follow_mount_rcu()).
-	child, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, childVFSD, ds)
+	child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), d, name, ds)
 	if err != nil {
 		return nil, err
 	}
 	if child == nil {
 		return nil, syserror.ENOENT
 	}
+	if err := rp.CheckMount(&child.vfsd); err != nil {
+		return nil, err
+	}
 	if child.isSymlink() && rp.ShouldFollowSymlink() {
 		target, err := child.readlink(ctx, rp.Mount())
 		if err != nil {
@@ -177,38 +182,37 @@ afterSymlink:
 	return child, nil
 }
 
-// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
-// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
-// nil) to verify that the returned child (or lack thereof) is correct. If no file
-// exists at name, revalidateChildLocked returns (nil, nil).
+// getChildLocked returns a dentry representing the child of parent with the
+// given name. If no such child exists, getChildLocked returns (nil, nil).
 //
 // Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
 // parent.isDir(). name is not "." or "..".
 //
-// Postconditions: If revalidateChildLocked returns a non-nil dentry, its
-// cached metadata is up to date.
-func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, childVFSD *vfs.Dentry, ds **[]*dentry) (*dentry, error) {
-	if childVFSD != nil && fs.opts.interop != InteropModeShared {
-		// We have a cached dentry that is assumed to be correct.
-		return childVFSD.Impl().(*dentry), nil
-	}
-	// We either don't have a cached dentry or need to verify that it's still
-	// correct, either of which requires a remote lookup. Check if this name is
-	// valid before performing the lookup.
+// Postconditions: If getChildLocked returns a non-nil dentry, its cached
+// metadata is up to date.
+func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
 	if len(name) > maxFilenameLen {
 		return nil, syserror.ENAMETOOLONG
 	}
-	// Check if we've already cached this lookup with a negative result.
-	if _, ok := parent.negativeChildren[name]; ok {
-		return nil, nil
+	child, ok := parent.children[name]
+	if ok && fs.opts.interop != InteropModeShared {
+		// Whether child is nil or not, it is cached information that is
+		// assumed to be correct.
+		return child, nil
 	}
-	// Perform the remote lookup.
+	// We either don't have cached information or need to verify that it's
+	// still correct, either of which requires a remote lookup. Check if this
+	// name is valid before performing the lookup.
+	return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds)
+}
+
+// Preconditions: As for getChildLocked.
+func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
 	qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
 	if err != nil && err != syserror.ENOENT {
 		return nil, err
 	}
-	if childVFSD != nil {
-		child := childVFSD.Impl().(*dentry)
+	if child != nil {
 		if !file.isNil() && qid.Path == child.ino {
 			// The file at this path hasn't changed. Just update cached
 			// metadata.
@@ -219,9 +223,8 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 		// The file at this path has changed or no longer exists. Remove
 		// the stale dentry from the tree, and re-evaluate its caching
 		// status (i.e. if it has 0 references, drop it).
-		vfsObj.ForceDeleteDentry(childVFSD)
+		vfsObj.InvalidateDentry(&child.vfsd)
 		*ds = appendDentry(*ds, child)
-		childVFSD = nil
 	}
 	if file.isNil() {
 		// No file exists at this path now. Cache the negative lookup if
@@ -232,13 +235,12 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 		return nil, nil
 	}
 	// Create a new dentry representing the file.
-	child, err := fs.newDentry(ctx, file, qid, attrMask, &attr)
+	child, err = fs.newDentry(ctx, file, qid, attrMask, &attr)
 	if err != nil {
 		file.close(ctx)
 		return nil, err
 	}
-	parent.IncRef() // reference held by child on its parent
-	parent.vfsd.InsertChild(&child.vfsd, name)
+	parent.cacheNewChildLocked(child, name)
 	// For now, child has 0 references, so our caller should call
 	// child.checkCachingLocked().
 	*ds = appendDentry(*ds, child)
@@ -318,9 +320,6 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
-	if parent.isDeleted() {
-		return syserror.ENOENT
-	}
 	name := rp.Component()
 	if name == "." || name == ".." {
 		return syserror.EEXIST
@@ -331,6 +330,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if !dir && rp.MustBeDir() {
 		return syserror.ENOENT
 	}
+	if parent.isDeleted() {
+		return syserror.ENOENT
+	}
 	mnt := rp.Mount()
 	if err := mnt.CheckBeginWrite(); err != nil {
 		return err
@@ -348,7 +350,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 		// it's used.
 		return create(parent, name)
 	}
-	if parent.vfsd.Child(name) != nil {
+	if child := parent.children[name]; child != nil {
 		return syserror.EEXIST
 	}
 	// No cached dentry exists; however, there might still be an existing file
@@ -356,10 +358,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if err := create(parent, name); err != nil {
 		return err
 	}
-	if fs.opts.interop != InteropModeShared {
-		parent.touchCMtime()
-	}
-	delete(parent.negativeChildren, name)
+	parent.touchCMtime()
+	// Either parent.children[name] doesn't exist (in which case this is a
+	// no-op) or is nil (in which case this erases the now-stale information
+	// that the file doesn't exist).
+	delete(parent.children, name)
 	parent.dirents = nil
 	return nil
 }
@@ -407,56 +410,55 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	defer mntns.DecRef()
 	parent.dirMu.Lock()
 	defer parent.dirMu.Unlock()
-	childVFSD := parent.vfsd.Child(name)
-	var child *dentry
+	child, ok := parent.children[name]
+	if ok && child == nil {
+		return syserror.ENOENT
+	}
 	// We only need a dentry representing the file at name if it can be a mount
-	// point. If childVFSD is nil, then it can't be a mount point. If childVFSD
-	// is non-nil but stale, the actual file can't be a mount point either; we
+	// point. If child is nil, then it can't be a mount point. If child is
+	// non-nil but stale, the actual file can't be a mount point either; we
 	// detect this case by just speculatively calling PrepareDeleteDentry and
 	// only revalidating the dentry if that fails (indicating that the existing
 	// dentry is a mount point).
-	if childVFSD != nil {
-		child = childVFSD.Impl().(*dentry)
-		if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
-			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, childVFSD, &ds)
+	if child != nil {
+		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+			if fs.opts.interop != InteropModeShared {
+				return err
+			}
+			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
 			if err != nil {
 				return err
 			}
 			if child != nil {
-				childVFSD = &child.vfsd
-				if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
+				if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
 					return err
 				}
-			} else {
-				childVFSD = nil
 			}
 		}
-	} else if _, ok := parent.negativeChildren[name]; ok {
-		return syserror.ENOENT
 	}
 	flags := uint32(0)
 	if dir {
 		if child != nil && !child.isDir() {
-			vfsObj.AbortDeleteDentry(childVFSD)
+			vfsObj.AbortDeleteDentry(&child.vfsd)
 			return syserror.ENOTDIR
 		}
 		flags = linux.AT_REMOVEDIR
 	} else {
 		if child != nil && child.isDir() {
-			vfsObj.AbortDeleteDentry(childVFSD)
+			vfsObj.AbortDeleteDentry(&child.vfsd)
 			return syserror.EISDIR
 		}
 		if rp.MustBeDir() {
-			if childVFSD != nil {
-				vfsObj.AbortDeleteDentry(childVFSD)
+			if child != nil {
+				vfsObj.AbortDeleteDentry(&child.vfsd)
 			}
 			return syserror.ENOTDIR
 		}
 	}
 	err = parent.file.unlinkAt(ctx, name, flags)
 	if err != nil {
-		if childVFSD != nil {
-			vfsObj.AbortDeleteDentry(childVFSD)
+		if child != nil {
+			vfsObj.AbortDeleteDentry(&child.vfsd)
 		}
 		return err
 	}
@@ -467,10 +469,12 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 		}
 		parent.cacheNegativeChildLocked(name)
 		parent.dirents = nil
+	} else {
+		delete(parent.children, name)
 	}
 	if child != nil {
 		child.setDeleted()
-		vfsObj.CommitDeleteDentry(childVFSD)
+		vfsObj.CommitDeleteDentry(&child.vfsd)
 		ds = appendDentry(ds, child)
 	}
 	return nil
@@ -806,16 +810,14 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 	// eligible for caching yet, so we don't need to append to a dentry slice.)
 	child.refs = 1
 	// Insert the dentry into the tree.
-	d.IncRef() // reference held by child on its parent d
-	d.vfsd.InsertChild(&child.vfsd, name)
+	d.cacheNewChildLocked(child, name)
 	if d.fs.opts.interop != InteropModeShared {
-		delete(d.negativeChildren, name)
+		d.touchCMtime()
 		d.dirents = nil
 	}
 
 	// Finally, construct a file description representing the created file.
 	var childVFSFD *vfs.FileDescription
-	mnt.IncRef()
 	if useRegularFileFD {
 		fd := &regularFileFD{}
 		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
@@ -840,9 +842,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		}
 		childVFSFD = &fd.vfsfd
 	}
-	if d.fs.opts.interop != InteropModeShared {
-		d.touchCMtime()
-	}
 	return childVFSFD, nil
 }
 
@@ -902,7 +901,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	// directory, we need to check for write permission on it.
 	oldParent.dirMu.Lock()
 	defer oldParent.dirMu.Unlock()
-	renamed, err := fs.revalidateChildLocked(ctx, vfsObj, oldParent, oldName, oldParent.vfsd.Child(oldName), &ds)
+	renamed, err := fs.getChildLocked(ctx, vfsObj, oldParent, oldName, &ds)
 	if err != nil {
 		return err
 	}
@@ -910,7 +909,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		return syserror.ENOENT
 	}
 	if renamed.isDir() {
-		if renamed == newParent || renamed.vfsd.IsAncestorOf(&newParent.vfsd) {
+		if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
 			return syserror.EINVAL
 		}
 		if oldParent != newParent {
@@ -934,16 +933,17 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if newParent.isDeleted() {
 		return syserror.ENOENT
 	}
-	replacedVFSD := newParent.vfsd.Child(newName)
-	var replaced *dentry
+	replaced := newParent.children[newName]
 	// This is similar to unlinkAt, except:
 	//
-	// - We revalidate the replaced dentry unconditionally for simplicity.
+	// - If a dentry exists for the file to be replaced, we revalidate it
+	// unconditionally (instead of only if PrepareRenameDentry fails) for
+	// simplicity.
 	//
 	// - If rp.MustBeDir(), then we need a dentry representing the replaced
 	// file regardless to confirm that it's a directory.
-	if replacedVFSD != nil || rp.MustBeDir() {
-		replaced, err = fs.revalidateChildLocked(ctx, vfsObj, newParent, newName, replacedVFSD, &ds)
+	if replaced != nil || rp.MustBeDir() {
+		replaced, err = fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds)
 		if err != nil {
 			return err
 		}
@@ -957,11 +957,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 					return syserror.ENOTDIR
 				}
 			}
-			replacedVFSD = &replaced.vfsd
-		} else {
-			replacedVFSD = nil
 		}
 	}
+	var replacedVFSD *vfs.Dentry
+	if replaced != nil {
+		replacedVFSD = &replaced.vfsd
+	}
 
 	if oldParent == newParent && oldName == newName {
 		return nil
@@ -978,7 +979,6 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if fs.opts.interop != InteropModeShared {
 		oldParent.cacheNegativeChildLocked(oldName)
 		oldParent.dirents = nil
-		delete(newParent.negativeChildren, newName)
 		newParent.dirents = nil
 		if renamed.isDir() {
 			oldParent.decLinks()
@@ -987,8 +987,24 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		oldParent.touchCMtime()
 		newParent.touchCMtime()
 		renamed.touchCtime()
+	} else {
+		delete(oldParent.children, oldName)
+	}
+	if oldParent != newParent {
+		appendDentry(ds, oldParent)
+		newParent.IncRef()
+	}
+	renamed.parent = newParent
+	renamed.name = newName
+	if newParent.children == nil {
+		newParent.children = make(map[string]*dentry)
+	}
+	newParent.children[newName] = renamed
+	if replaced != nil {
+		replaced.setDeleted()
+		appendDentry(ds, replaced)
 	}
-	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD)
+	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
 	return nil
 }
 
@@ -1131,5 +1147,5 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.renameMu.RLock()
 	defer fs.renameMu.RUnlock()
-	return vfs.GenericPrependPath(vfsroot, vd, b)
+	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
 }
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 2485cdb53..293df2545 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -452,6 +452,16 @@ type dentry struct {
 	// fs is the owning filesystem. fs is immutable.
 	fs *filesystem
 
+	// parent is this dentry's parent directory. Each dentry holds a reference
+	// on its parent. If this dentry is a filesystem root, parent is nil.
+	// parent is protected by filesystem.renameMu.
+	parent *dentry
+
+	// name is the name of this dentry in its parent. If this dentry is a
+	// filesystem root, name is the empty string. name is protected by
+	// filesystem.renameMu.
+	name string
+
 	// We don't support hard links, so each dentry maps 1:1 to an inode.
 
 	// file is the unopened p9.File that backs this dentry. file is immutable.
@@ -469,10 +479,15 @@ type dentry struct {
 
 	dirMu sync.Mutex
 
-	// If this dentry represents a directory, and InteropModeShared is not in
-	// effect, negativeChildren is a set of child names in this directory that
-	// are known not to exist. negativeChildren is protected by dirMu.
-	negativeChildren map[string]struct{}
+	// If this dentry represents a directory, children contains:
+	//
+	// - Mappings of child filenames to dentries representing those children.
+	//
+	// - Mappings of child filenames that are known not to exist to nil
+	// dentries (only if InteropModeShared is not in effect).
+	//
+	// children is protected by dirMu.
+	children map[string]*dentry
 
 	// If this dentry represents a directory, InteropModeShared is not in
 	// effect, and dirents is not nil, it is a cache of all entries in the
@@ -910,9 +925,9 @@ func (d *dentry) checkCachingLocked() {
 		// Dentry has already been destroyed.
 		return
 	}
-	// Non-child dentries with zero references are no longer reachable by path
-	// resolution and should be dropped immediately.
-	if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() {
+	// Deleted and invalidated dentries with zero references are no longer
+	// reachable by path resolution and should be dropped immediately.
+	if d.vfsd.IsDead() {
 		if d.cached {
 			d.fs.cachedDentries.Remove(d)
 			d.fs.cachedDentriesLen--
@@ -937,28 +952,26 @@ func (d *dentry) checkCachingLocked() {
 		d.fs.cachedDentries.Remove(victim)
 		d.fs.cachedDentriesLen--
 		victim.cached = false
-		// victim.refs may have become non-zero from an earlier path
-		// resolution since it was inserted into fs.cachedDentries; see
-		// dentry.incRefLocked(). Either way, we brought
-		// fs.cachedDentriesLen back down to fs.opts.maxCachedDentries, so
-		// we don't loop.
+		// victim.refs may have become non-zero from an earlier path resolution
+		// since it was inserted into fs.cachedDentries.
 		if atomic.LoadInt64(&victim.refs) == 0 {
-			if victimParentVFSD := victim.vfsd.Parent(); victimParentVFSD != nil {
-				victimParent := victimParentVFSD.Impl().(*dentry)
-				victimParent.dirMu.Lock()
-				if !victim.vfsd.IsDisowned() {
-					// victim can't be a mount point (in any mount
-					// namespace), since VFS holds references on mount
-					// points.
-					d.fs.vfsfs.VirtualFilesystem().ForceDeleteDentry(&victim.vfsd)
+			if victim.parent != nil {
+				victim.parent.dirMu.Lock()
+				if !victim.vfsd.IsDead() {
+					// Note that victim can't be a mount point (in any mount
+					// namespace), since VFS holds references on mount points.
+					d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(&victim.vfsd)
+					delete(victim.parent.children, victim.name)
 					// We're only deleting the dentry, not the file it
 					// represents, so we don't need to update
 					// victimParent.dirents etc.
 				}
-				victimParent.dirMu.Unlock()
+				victim.parent.dirMu.Unlock()
 			}
 			victim.destroyLocked()
 		}
+		// Whether or not victim was destroyed, we brought fs.cachedDentriesLen
+		// back down to fs.opts.maxCachedDentries, so we don't loop.
 	}
 }
 
@@ -1005,12 +1018,11 @@ func (d *dentry) destroyLocked() {
 	d.fs.syncMu.Lock()
 	delete(d.fs.dentries, d)
 	d.fs.syncMu.Unlock()
-	// Drop the reference held by d on its parent.
-	if parentVFSD := d.vfsd.Parent(); parentVFSD != nil {
-		parent := parentVFSD.Impl().(*dentry)
-		// This is parent.DecRef() without recursive locking of d.fs.renameMu.
-		if refs := atomic.AddInt64(&parent.refs, -1); refs == 0 {
-			parent.checkCachingLocked()
+	// Drop the reference held by d on its parent without recursively locking
+	// d.fs.renameMu.
+	if d.parent != nil {
+		if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+			d.parent.checkCachingLocked()
 		} else if refs < 0 {
 			panic("gofer.dentry.DecRef() called without holding a reference")
 		}
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
index 82bc239db..4041fb252 100644
--- a/pkg/sentry/fsimpl/gofer/gofer_test.go
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -48,8 +48,7 @@ func TestDestroyIdempotent(t *testing.T) {
 	if err != nil {
 		t.Fatalf("fs.newDentry(): %v", err)
 	}
-	parent.IncRef() // reference held by child on its parent.
-	parent.vfsd.InsertChild(&child.vfsd, "child")
+	parent.cacheNewChildLocked(child, "child")
 
 	child.checkCachingLocked()
 	if got := atomic.LoadInt64(&child.refs); got != -1 {
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index b3d6299d0..ef34cb28a 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -3,6 +3,17 @@ load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "kernfs",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "Dentry",
+    },
+)
+
 go_template_instance(
     name = "slot_list",
     out = "slot_list.go",
@@ -21,6 +32,7 @@ go_library(
         "dynamic_bytes_file.go",
         "fd_impl_util.go",
         "filesystem.go",
+        "fstree.go",
         "inode_impl_util.go",
         "kernfs.go",
         "slot_list.go",
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index bfa786c88..e8a4670b8 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -129,7 +129,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 
 	// Handle "..".
 	if fd.off == 1 {
-		parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode
+		parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode
 		stat, err := parentInode.Stat(vfsFS, opts)
 		if err != nil {
 			return err
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index baf81b4db..01c23d192 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -56,25 +56,28 @@ afterSymlink:
 		return vfsd, nil
 	}
 	if name == ".." {
-		nextVFSD, err := rp.ResolveParent(vfsd)
-		if err != nil {
+		if isRoot, err := rp.CheckRoot(vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return vfsd, nil
+		}
+		if err := rp.CheckMount(&d.parent.vfsd); err != nil {
 			return nil, err
 		}
 		rp.Advance()
-		return nextVFSD, nil
+		return &d.parent.vfsd, nil
 	}
 	if len(name) > linux.NAME_MAX {
 		return nil, syserror.ENAMETOOLONG
 	}
 	d.dirMu.Lock()
-	nextVFSD, err := rp.ResolveChild(vfsd, name)
+	next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, d.children[name])
+	d.dirMu.Unlock()
 	if err != nil {
-		d.dirMu.Unlock()
 		return nil, err
 	}
-	next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, nextVFSD)
-	d.dirMu.Unlock()
-	if err != nil {
+	if err := rp.CheckMount(&next.vfsd); err != nil {
 		return nil, err
 	}
 	// Resolve any symlink at current path component.
@@ -108,17 +111,17 @@ afterSymlink:
 // parent.dirMu must be locked. parent.isDir(). name is not "." or "..".
 //
 // Postconditions: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, childVFSD *vfs.Dentry) (*Dentry, error) {
-	if childVFSD != nil {
+func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) {
+	if child != nil {
 		// Cached dentry exists, revalidate.
-		child := childVFSD.Impl().(*Dentry)
 		if !child.inode.Valid(ctx) {
-			vfsObj.ForceDeleteDentry(childVFSD)
-			fs.deferDecRef(childVFSD) // Reference from Lookup.
-			childVFSD = nil
+			delete(parent.children, name)
+			vfsObj.InvalidateDentry(&child.vfsd)
+			fs.deferDecRef(&child.vfsd) // Reference from Lookup.
+			child = nil
 		}
 	}
-	if childVFSD == nil {
+	if child == nil {
 		// Dentry isn't cached; it either doesn't exist or failed
 		// revalidation. Attempt to resolve it via Lookup.
 		//
@@ -126,15 +129,15 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 		// *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes
 		// that all dentries in the filesystem are (kernfs.)Dentry and performs
 		// vfs.DentryImpl casts accordingly.
-		var err error
-		childVFSD, err = parent.inode.Lookup(ctx, name)
+		childVFSD, err := parent.inode.Lookup(ctx, name)
 		if err != nil {
 			return nil, err
 		}
 		// Reference on childVFSD dropped by a corresponding Valid.
-		parent.insertChildLocked(name, childVFSD)
+		child = childVFSD.Impl().(*Dentry)
+		parent.insertChildLocked(name, child)
 	}
-	return childVFSD.Impl().(*Dentry), nil
+	return child, nil
 }
 
 // walkExistingLocked resolves rp to an existing file.
@@ -203,14 +206,11 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
 	if len(pc) > linux.NAME_MAX {
 		return "", syserror.ENAMETOOLONG
 	}
-	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
-	if err != nil {
-		return "", err
-	}
-	if childVFSD != nil {
+	// FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu.
+	if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok {
 		return "", syserror.EEXIST
 	}
-	if parentVFSD.IsDisowned() {
+	if parentVFSD.IsDead() {
 		return "", syserror.ENOENT
 	}
 	return pc, nil
@@ -220,14 +220,14 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
 //
 // Preconditions: Filesystem.mu must be locked for at least reading.
 func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
-	parentVFSD := vfsd.Parent()
-	if parentVFSD == nil {
+	parent := vfsd.Impl().(*Dentry).parent
+	if parent == nil {
 		return syserror.EBUSY
 	}
-	if parentVFSD.IsDisowned() {
+	if parent.vfsd.IsDead() {
 		return syserror.ENOENT
 	}
-	if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	return nil
@@ -321,11 +321,11 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 		return syserror.EPERM
 	}
 
-	child, err := parentInode.NewLink(ctx, pc, d.inode)
+	childVFSD, err := parentInode.NewLink(ctx, pc, d.inode)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
 	return nil
 }
 
@@ -349,11 +349,11 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	child, err := parentInode.NewDir(ctx, pc, opts)
+	childVFSD, err := parentInode.NewDir(ctx, pc, opts)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
 	return nil
 }
 
@@ -377,11 +377,11 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	new, err := parentInode.NewNode(ctx, pc, opts)
+	newVFSD, err := parentInode.NewNode(ctx, pc, opts)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, new)
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry))
 	return nil
 }
 
@@ -449,11 +449,8 @@ afterTrailingSymlink:
 		return nil, syserror.ENAMETOOLONG
 	}
 	// Determine whether or not we need to create a file.
-	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
-	if err != nil {
-		return nil, err
-	}
-	if childVFSD == nil {
+	childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD)
+	if err == syserror.ENOENT {
 		// Already checked for searchability above; now check for writability.
 		if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
 			return nil, err
@@ -463,21 +460,24 @@ afterTrailingSymlink:
 		}
 		defer rp.Mount().EndWrite()
 		// Create and open the child.
-		child, err := parentInode.NewFile(ctx, pc, opts)
+		childVFSD, err = parentInode.NewFile(ctx, pc, opts)
 		if err != nil {
 			return nil, err
 		}
+		child := childVFSD.Impl().(*Dentry)
 		parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
-		return child.Impl().(*Dentry).inode.Open(rp, child, opts)
+		return child.inode.Open(rp, childVFSD, opts)
+	}
+	if err != nil {
+		return nil, err
 	}
 	// Open existing file or follow symlink.
 	if mustCreate {
 		return nil, syserror.EEXIST
 	}
-	childDentry := childVFSD.Impl().(*Dentry)
-	childInode := childDentry.inode
-	if rp.ShouldFollowSymlink() && childDentry.isSymlink() {
-		targetVD, targetPathname, err := childInode.Getlink(ctx)
+	child := childVFSD.Impl().(*Dentry)
+	if rp.ShouldFollowSymlink() && child.isSymlink() {
+		targetVD, targetPathname, err := child.inode.Getlink(ctx)
 		if err != nil {
 			return nil, err
 		}
@@ -496,10 +496,10 @@ afterTrailingSymlink:
 		// symlink target.
 		goto afterTrailingSymlink
 	}
-	if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+	if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
-	return childInode.Open(rp, childVFSD, opts)
+	return child.inode.Open(rp, &child.vfsd, opts)
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
@@ -526,15 +526,16 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
 
 	fs.mu.Lock()
-	defer fs.mu.Lock()
+	defer fs.processDeferredDecRefsLocked()
+	defer fs.mu.Unlock()
 
 	// Resolve the destination directory first to verify that it's on this
 	// Mount.
 	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked()
 	if err != nil {
 		return err
 	}
+	dstDir := dstDirVFSD.Impl().(*Dentry)
 	mnt := rp.Mount()
 	if mnt != oldParentVD.Mount() {
 		return syserror.EXDEV
@@ -547,9 +548,8 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	srcDirVFSD := oldParentVD.Dentry()
 	srcDir := srcDirVFSD.Impl().(*Dentry)
 	srcDir.dirMu.Lock()
-	src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDirVFSD.Child(oldName))
+	src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDir.children[oldName])
 	srcDir.dirMu.Unlock()
-	fs.processDeferredDecRefsLocked()
 	if err != nil {
 		return err
 	}
@@ -561,7 +561,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	}
 
 	// Can we create the dst dentry?
-	var dstVFSD *vfs.Dentry
+	var dst *Dentry
 	pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode)
 	switch err {
 	case nil:
@@ -571,38 +571,51 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 			// Won't overwrite existing node since RENAME_NOREPLACE was requested.
 			return syserror.EEXIST
 		}
-		dstVFSD, err = rp.ResolveChild(dstDirVFSD, pc)
-		if err != nil {
+		dst = dstDir.children[pc]
+		if dst == nil {
 			panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD))
 		}
 	default:
 		return err
 	}
+	var dstVFSD *vfs.Dentry
+	if dst != nil {
+		dstVFSD = &dst.vfsd
+	}
 
 	mntns := vfs.MountNamespaceFromContext(ctx)
 	defer mntns.DecRef()
 	virtfs := rp.VirtualFilesystem()
 
-	srcDirDentry := srcDirVFSD.Impl().(*Dentry)
-	dstDirDentry := dstDirVFSD.Impl().(*Dentry)
-
 	// We can't deadlock here due to lock ordering because we're protected from
 	// concurrent renames by fs.mu held for writing.
-	srcDirDentry.dirMu.Lock()
-	defer srcDirDentry.dirMu.Unlock()
-	dstDirDentry.dirMu.Lock()
-	defer dstDirDentry.dirMu.Unlock()
+	srcDir.dirMu.Lock()
+	defer srcDir.dirMu.Unlock()
+	if srcDir != dstDir {
+		dstDir.dirMu.Lock()
+		defer dstDir.dirMu.Unlock()
+	}
 
 	if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
 		return err
 	}
-	srcDirInode := srcDirDentry.inode
-	replaced, err := srcDirInode.Rename(ctx, srcVFSD.Name(), pc, srcVFSD, dstDirVFSD)
+	replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD)
 	if err != nil {
 		virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
 		return err
 	}
-	virtfs.CommitRenameReplaceDentry(srcVFSD, dstDirVFSD, pc, replaced)
+	delete(srcDir.children, src.name)
+	if srcDir != dstDir {
+		fs.deferDecRef(srcDirVFSD)
+		dstDir.IncRef()
+	}
+	src.parent = dstDir
+	src.name = pc
+	if dstDir.children == nil {
+		dstDir.children = make(map[string]*Dentry)
+	}
+	dstDir.children[pc] = src
+	virtfs.CommitRenameReplaceDentry(srcVFSD, replaced)
 	return nil
 }
 
@@ -622,14 +635,15 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
 		return err
 	}
-	if !vfsd.Impl().(*Dentry).isDir() {
+	d := vfsd.Impl().(*Dentry)
+	if !d.isDir() {
 		return syserror.ENOTDIR
 	}
 	if inode.HasChildren() {
 		return syserror.ENOTEMPTY
 	}
 	virtfs := rp.VirtualFilesystem()
-	parentDentry := vfsd.Parent().Impl().(*Dentry)
+	parentDentry := d.parent
 	parentDentry.dirMu.Lock()
 	defer parentDentry.dirMu.Unlock()
 
@@ -706,11 +720,11 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	child, err := parentInode.NewSymlink(ctx, pc, target)
+	childVFSD, err := parentInode.NewSymlink(ctx, pc, target)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
 	return nil
 }
 
@@ -730,11 +744,12 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
 		return err
 	}
-	if vfsd.Impl().(*Dentry).isDir() {
+	d := vfsd.Impl().(*Dentry)
+	if d.isDir() {
 		return syserror.EISDIR
 	}
 	virtfs := rp.VirtualFilesystem()
-	parentDentry := vfsd.Parent().Impl().(*Dentry)
+	parentDentry := d.parent
 	parentDentry.dirMu.Lock()
 	defer parentDentry.dirMu.Unlock()
 	mntns := vfs.MountNamespaceFromContext(ctx)
@@ -818,5 +833,5 @@ func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	return vfs.GenericPrependPath(vfsroot, vd, b)
+	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b)
 }
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 65f09af5d..9f526359e 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -370,7 +370,7 @@ func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint3
 		if err := o.Insert(name, child.VFSDentry()); err != nil {
 			panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d))
 		}
-		d.InsertChild(name, child.VFSDentry())
+		d.InsertChild(name, child)
 	}
 	return links
 }
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index ad76b9f64..f5041824f 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -168,17 +168,22 @@ const (
 //
 // Must be initialized by Init prior to first use.
 type Dentry struct {
-	refs.AtomicRefCount
+	vfsd vfs.Dentry
 
-	vfsd  vfs.Dentry
-	inode Inode
+	refs.AtomicRefCount
 
 	// flags caches useful information about the dentry from the inode. See the
 	// dflags* consts above. Must be accessed by atomic ops.
 	flags uint32
 
-	// dirMu protects vfsd.children for directory dentries.
-	dirMu sync.Mutex
+	parent *Dentry
+	name   string
+
+	// dirMu protects children and the names of child Dentries.
+	dirMu    sync.Mutex
+	children map[string]*Dentry
+
+	inode Inode
 }
 
 // Init initializes this dentry.
@@ -222,8 +227,8 @@ func (d *Dentry) DecRef() {
 func (d *Dentry) destroy() {
 	d.inode.DecRef() // IncRef from Init.
 	d.inode = nil
-	if parent := d.vfsd.Parent(); parent != nil {
-		parent.DecRef() // IncRef from Dentry.InsertChild.
+	if d.parent != nil {
+		d.parent.DecRef() // IncRef from Dentry.InsertChild.
 	}
 }
 
@@ -233,7 +238,7 @@ func (d *Dentry) destroy() {
 // updates the link count on d if required.
 //
 // Precondition: d must represent a directory inode.
-func (d *Dentry) InsertChild(name string, child *vfs.Dentry) {
+func (d *Dentry) InsertChild(name string, child *Dentry) {
 	d.dirMu.Lock()
 	d.insertChildLocked(name, child)
 	d.dirMu.Unlock()
@@ -243,13 +248,17 @@ func (d *Dentry) InsertChild(name string, child *vfs.Dentry) {
 // preconditions.
 //
 // Precondition: d.dirMu must be locked.
-func (d *Dentry) insertChildLocked(name string, child *vfs.Dentry) {
+func (d *Dentry) insertChildLocked(name string, child *Dentry) {
 	if !d.isDir() {
 		panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
 	}
-	vfsDentry := d.VFSDentry()
-	vfsDentry.IncRef() // DecRef in child's Dentry.destroy.
-	vfsDentry.InsertChild(child, name)
+	d.IncRef() // DecRef in child's Dentry.destroy.
+	child.parent = d
+	child.name = name
+	if d.children == nil {
+		d.children = make(map[string]*Dentry)
+	}
+	d.children[name] = child
 }
 
 // The Inode interface maps filesystem-level operations that operate on paths to
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index d0f97c137..19abb5034 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -415,36 +415,36 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F
 		if d.Name == "." || d.Name == ".." {
 			continue
 		}
-		childPath := path.Join(fd.MappedName(ctx), d.Name)
+		absPath := path.Join(fd.MappedName(ctx), d.Name)
 		if d.Type == linux.DT_LNK {
 			link, err := s.VFS.ReadlinkAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)},
+				&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)},
 			)
 			if err != nil {
-				t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", childPath, err)
+				t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", absPath, err)
 			} else {
-				t.Logf("Skipping symlink: /proc%s => %s", childPath, link)
+				t.Logf("Skipping symlink: %s => %s", absPath, link)
 			}
 			continue
 		}
 
-		t.Logf("Opening: /proc%s", childPath)
+		t.Logf("Opening: %s", absPath)
 		child, err := s.VFS.OpenAt(
 			ctx,
 			auth.CredentialsFromContext(ctx),
-			&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)},
+			&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)},
 			&vfs.OpenOptions{},
 		)
 		if err != nil {
-			t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err)
+			t.Errorf("vfsfs.OpenAt(%v) failed: %v", absPath, err)
 			continue
 		}
 		defer child.DecRef()
 		stat, err := child.Stat(ctx, vfs.StatOptions{})
 		if err != nil {
-			t.Errorf("Stat(%v) failed: %v", childPath, err)
+			t.Errorf("Stat(%v) failed: %v", absPath, err)
 		}
 		if got := linux.FileMode(stat.Mode).DirentType(); got != d.Type {
 			t.Errorf("wrong file mode, stat: %v, dirent: %v", got, d.Type)
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 4e6cd3491..a2d9649e7 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -15,6 +15,17 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "tmpfs",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "dentry",
+    },
+)
+
 go_library(
     name = "tmpfs",
     srcs = [
@@ -22,6 +33,7 @@ go_library(
         "device_file.go",
         "directory.go",
         "filesystem.go",
+        "fstree.go",
         "named_pipe.go",
         "regular_file.go",
         "socket_file.go",
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index 651912169..2fb5c4d84 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -438,13 +438,6 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
 				filePathBuilder.WriteByte('/')
 			}
 
-			// Verify that we didn't create any directories under the mount
-			// point (i.e. they were all created on the submount).
-			firstDirName := fmt.Sprintf("%d", depth)
-			if child := mountPoint.Dentry().Child(firstDirName); child != nil {
-				b.Fatalf("created directory %q under root mount, not submount", firstDirName)
-			}
-
 			// Create the file that will be stat'd.
 			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
 				Root:               root,
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 45712c9b9..f2399981b 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -15,35 +15,77 @@
 package tmpfs
 
 import (
+	"sync/atomic"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 type directory struct {
-	inode inode
+	// Since directories can't be hard-linked, each directory can only be
+	// associated with a single dentry, which we can store in the directory
+	// struct.
+	dentry dentry
+	inode  inode
+
+	// childMap maps the names of the directory's children to their dentries.
+	// childMap is protected by filesystem.mu.
+	childMap map[string]*dentry
 
-	// childList is a list containing (1) child Dentries and (2) fake Dentries
+	// numChildren is len(childMap), but accessed using atomic memory
+	// operations to avoid locking in inode.statTo().
+	numChildren int64
+
+	// childList is a list containing (1) child dentries and (2) fake dentries
 	// (with inode == nil) that represent the iteration position of
 	// directoryFDs. childList is used to support directoryFD.IterDirents()
-	// efficiently. childList is protected by filesystem.mu.
+	// efficiently. childList is protected by iterMu.
+	iterMu    sync.Mutex
 	childList dentryList
 }
 
-func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *directory {
 	dir := &directory{}
 	dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode)
 	dir.inode.nlink = 2 // from "." and parent directory or ".." for root
-	return &dir.inode
+	dir.dentry.inode = &dir.inode
+	dir.dentry.vfsd.Init(&dir.dentry)
+	return dir
+}
+
+// Preconditions: filesystem.mu must be locked for writing. dir must not
+// already contain a child with the given name.
+func (dir *directory) insertChildLocked(child *dentry, name string) {
+	child.parent = &dir.dentry
+	child.name = name
+	if dir.childMap == nil {
+		dir.childMap = make(map[string]*dentry)
+	}
+	dir.childMap[name] = child
+	atomic.AddInt64(&dir.numChildren, 1)
+	dir.iterMu.Lock()
+	dir.childList.PushBack(child)
+	dir.iterMu.Unlock()
+}
+
+// Preconditions: filesystem.mu must be locked for writing.
+func (dir *directory) removeChildLocked(child *dentry) {
+	delete(dir.childMap, child.name)
+	atomic.AddInt64(&dir.numChildren, -1)
+	dir.iterMu.Lock()
+	dir.childList.Remove(child)
+	dir.iterMu.Unlock()
 }
 
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
 
-	// Protected by filesystem.mu.
+	// Protected by directory.iterMu.
 	iter *dentry
 	off  int64
 }
@@ -51,11 +93,10 @@ type directoryFD struct {
 // Release implements vfs.FileDescriptionImpl.Release.
 func (fd *directoryFD) Release() {
 	if fd.iter != nil {
-		fs := fd.filesystem()
 		dir := fd.inode().impl.(*directory)
-		fs.mu.Lock()
+		dir.iterMu.Lock()
 		dir.childList.Remove(fd.iter)
-		fs.mu.Unlock()
+		dir.iterMu.Unlock()
 		fd.iter = nil
 	}
 }
@@ -63,10 +104,13 @@ func (fd *directoryFD) Release() {
 // IterDirents implements vfs.FileDescriptionImpl.IterDirents.
 func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
 	fs := fd.filesystem()
-	vfsd := fd.vfsfd.VirtualDentry().Dentry()
+	dir := fd.inode().impl.(*directory)
 
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
+	// fs.mu is required to read d.parent and dentry.name.
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	dir.iterMu.Lock()
+	defer dir.iterMu.Unlock()
 
 	fd.inode().touchAtime(fd.vfsfd.Mount())
 
@@ -74,15 +118,16 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 		if err := cb.Handle(vfs.Dirent{
 			Name:    ".",
 			Type:    linux.DT_DIR,
-			Ino:     vfsd.Impl().(*dentry).inode.ino,
+			Ino:     dir.inode.ino,
 			NextOff: 1,
 		}); err != nil {
 			return err
 		}
 		fd.off++
 	}
+
 	if fd.off == 1 {
-		parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
+		parentInode := genericParentOrSelf(&dir.dentry).inode
 		if err := cb.Handle(vfs.Dirent{
 			Name:    "..",
 			Type:    parentInode.direntType(),
@@ -94,7 +139,6 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 		fd.off++
 	}
 
-	dir := vfsd.Impl().(*dentry).inode.impl.(*directory)
 	var child *dentry
 	if fd.iter == nil {
 		// Start iteration at the beginning of dir.
@@ -109,7 +153,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 		// Skip other directoryFD iterators.
 		if child.inode != nil {
 			if err := cb.Handle(vfs.Dirent{
-				Name:    child.vfsd.Name(),
+				Name:    child.name,
 				Type:    child.inode.direntType(),
 				Ino:     child.inode.ino,
 				NextOff: fd.off + 1,
@@ -127,9 +171,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 
 // Seek implements vfs.FileDescriptionImpl.Seek.
 func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
-	fs := fd.filesystem()
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
+	dir := fd.inode().impl.(*directory)
+	dir.iterMu.Lock()
+	defer dir.iterMu.Unlock()
 
 	switch whence {
 	case linux.SEEK_SET:
@@ -157,8 +201,6 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
 		remChildren = offset - 2
 	}
 
-	dir := fd.inode().impl.(*directory)
-
 	// Ensure that fd.iter exists and is not linked into dir.childList.
 	if fd.iter == nil {
 		fd.iter = &dentry{}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 452c4e2e0..5b62f9ebb 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -39,27 +39,43 @@ func (fs *filesystem) Sync(ctx context.Context) error {
 //
 // Preconditions: filesystem.mu must be locked. !rp.Done().
 func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
-	if !d.inode.isDir() {
+	dir, ok := d.inode.impl.(*directory)
+	if !ok {
 		return nil, syserror.ENOTDIR
 	}
 	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 afterSymlink:
-	if len(rp.Component()) > linux.NAME_MAX {
-		return nil, syserror.ENAMETOOLONG
+	name := rp.Component()
+	if name == "." {
+		rp.Advance()
+		return d, nil
 	}
-	nextVFSD, err := rp.ResolveComponent(&d.vfsd)
-	if err != nil {
-		return nil, err
+	if name == ".." {
+		if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return d, nil
+		}
+		if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return d.parent, nil
 	}
-	if nextVFSD == nil {
-		// Since the Dentry tree is the sole source of truth for tmpfs, if it's
-		// not in the Dentry tree, it doesn't exist.
+	if len(name) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
+	}
+	child, ok := dir.childMap[name]
+	if !ok {
 		return nil, syserror.ENOENT
 	}
-	next := nextVFSD.Impl().(*dentry)
-	if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+	if err := rp.CheckMount(&child.vfsd); err != nil {
+		return nil, err
+	}
+	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
 		// TODO(gvisor.dev/issue/1197): Symlink traversals updates
 		// access time.
 		if err := rp.HandleSymlink(symlink.target); err != nil {
@@ -68,7 +84,7 @@ afterSymlink:
 		goto afterSymlink // don't check the current directory again
 	}
 	rp.Advance()
-	return next, nil
+	return child, nil
 }
 
 // walkParentDirLocked resolves all but the last path component of rp to an
@@ -80,7 +96,7 @@ afterSymlink:
 // fs/namei.c:path_parentat().
 //
 // Preconditions: filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
 	for !rp.Final() {
 		next, err := stepLocked(rp, d)
 		if err != nil {
@@ -88,10 +104,11 @@ func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
 		}
 		d = next
 	}
-	if !d.inode.isDir() {
+	dir, ok := d.inode.impl.(*directory)
+	if !ok {
 		return nil, syserror.ENOTDIR
 	}
-	return d, nil
+	return dir, nil
 }
 
 // resolveLocked resolves rp to an existing file.
@@ -122,14 +139,14 @@ func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
 //
 // Preconditions: !rp.Done(). For the final path component in rp,
 // !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
@@ -139,19 +156,15 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
 	if len(name) > linux.NAME_MAX {
 		return syserror.ENAMETOOLONG
 	}
-	// Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
-	// because if the child exists we want to return EEXIST immediately instead
-	// of attempting symlink/mount traversal.
-	if parent.vfsd.Child(name) != nil {
+	if _, ok := parentDir.childMap[name]; ok {
 		return syserror.EEXIST
 	}
 	if !dir && rp.MustBeDir() {
 		return syserror.ENOENT
 	}
-	// In tmpfs, the only way to cause a dentry to be disowned is by removing
-	// it from the filesystem, so this check is equivalent to checking if
-	// parent has been removed.
-	if parent.vfsd.IsDisowned() {
+	// tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only
+	// be dead if it was deleted.
+	if parentDir.dentry.vfsd.IsDead() {
 		return syserror.ENOENT
 	}
 	mnt := rp.Mount()
@@ -159,10 +172,10 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
 		return err
 	}
 	defer mnt.EndWrite()
-	if err := create(parent, name); err != nil {
+	if err := create(parentDir, name); err != nil {
 		return err
 	}
-	parent.inode.touchCMtime()
+	parentDir.inode.touchCMtime()
 	return nil
 }
 
@@ -201,17 +214,17 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	dir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return nil, err
 	}
-	d.IncRef()
-	return &d.vfsd, nil
+	dir.dentry.IncRef()
+	return &dir.dentry.vfsd, nil
 }
 
 // LinkAt implements vfs.FilesystemImpl.LinkAt.
 func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
-	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
 		if rp.Mount() != vd.Mount() {
 			return syserror.EXDEV
 		}
@@ -226,30 +239,27 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 			return syserror.EMLINK
 		}
 		d.inode.incLinksLocked()
-		child := fs.newDentry(d.inode)
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
+		parentDir.insertChildLocked(fs.newDentry(d.inode), name)
 		return nil
 	})
 }
 
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
-	return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error {
-		if parent.inode.nlink == maxLinks {
+	return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error {
+		if parentDir.inode.nlink == maxLinks {
 			return syserror.EMLINK
 		}
-		parent.inode.incLinksLocked() // from child's ".."
-		child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
+		parentDir.inode.incLinksLocked() // from child's ".."
+		childDir := fs.newDirectory(rp.Credentials(), opts.Mode)
+		parentDir.insertChildLocked(&childDir.dentry, name)
 		return nil
 	})
 }
 
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
-	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
 		var childInode *inode
 		switch opts.Mode.FileType() {
 		case 0, linux.S_IFREG:
@@ -266,8 +276,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 			return syserror.EINVAL
 		}
 		child := fs.newDentry(childInode)
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
+		parentDir.insertChildLocked(child, name)
 		return nil
 	})
 }
@@ -306,12 +315,12 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		return start.open(ctx, rp, &opts, false /* afterCreate */)
 	}
 afterTrailingSymlink:
-	parent, err := walkParentDirLocked(rp, start)
+	parentDir, err := walkParentDirLocked(rp, start)
 	if err != nil {
 		return nil, err
 	}
 	// Check for search permission in the parent directory.
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 	// Reject attempts to open directories with O_CREAT.
@@ -322,11 +331,14 @@ afterTrailingSymlink:
 	if name == "." || name == ".." {
 		return nil, syserror.EISDIR
 	}
+	if len(name) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
+	}
 	// Determine whether or not we need to create a file.
-	child, err := stepLocked(rp, parent)
-	if err == syserror.ENOENT {
+	child, ok := parentDir.childMap[name]
+	if !ok {
 		// Already checked for searchability above; now check for writability.
-		if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 			return nil, err
 		}
 		if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -335,21 +347,26 @@ afterTrailingSymlink:
 		defer rp.Mount().EndWrite()
 		// Create and open the child.
 		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
+		parentDir.insertChildLocked(child, name)
 		fd, err := child.open(ctx, rp, &opts, true)
 		if err != nil {
 			return nil, err
 		}
-		parent.inode.touchCMtime()
+		parentDir.inode.touchCMtime()
 		return fd, nil
 	}
-	if err != nil {
+	// Is the file mounted over?
+	if err := rp.CheckMount(&child.vfsd); err != nil {
 		return nil, err
 	}
 	// Do we need to resolve a trailing symlink?
-	if !rp.Done() {
-		start = parent
+	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+		// TODO(gvisor.dev/issue/1197): Symlink traversals updates
+		// access time.
+		if err := rp.HandleSymlink(symlink.target); err != nil {
+			return nil, err
+		}
+		start = &parentDir.dentry
 		goto afterTrailingSymlink
 	}
 	// Open existing file.
@@ -428,7 +445,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	// Resolve newParent first to verify that it's on this Mount.
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	newParentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
@@ -445,23 +462,22 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	}
 	defer mnt.EndWrite()
 
-	oldParent := oldParentVD.Dentry().Impl().(*dentry)
-	if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+	oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory)
+	if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
-	// Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
-	// because if the existing child is a symlink or mount point then we want
-	// to rename over it rather than follow it.
-	renamedVFSD := oldParent.vfsd.Child(oldName)
-	if renamedVFSD == nil {
+	renamed, ok := oldParentDir.childMap[oldName]
+	if !ok {
 		return syserror.ENOENT
 	}
-	renamed := renamedVFSD.Impl().(*dentry)
+	// Note that we don't need to call rp.CheckMount(), since if renamed is a
+	// mount point then we want to rename the mount point, not anything in the
+	// mounted filesystem.
 	if renamed.inode.isDir() {
-		if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) {
+		if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) {
 			return syserror.EINVAL
 		}
-		if oldParent != newParent {
+		if oldParentDir != newParentDir {
 			// Writability is needed to change renamed's "..".
 			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 				return err
@@ -473,18 +489,17 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		}
 	}
 
-	if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+	if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
-	replacedVFSD := newParent.vfsd.Child(newName)
-	var replaced *dentry
-	if replacedVFSD != nil {
-		replaced = replacedVFSD.Impl().(*dentry)
-		if replaced.inode.isDir() {
+	replaced, ok := newParentDir.childMap[newName]
+	if ok {
+		replacedDir, ok := replaced.inode.impl.(*directory)
+		if ok {
 			if !renamed.inode.isDir() {
 				return syserror.EISDIR
 			}
-			if replaced.vfsd.HasChildren() {
+			if len(replacedDir.childMap) != 0 {
 				return syserror.ENOTEMPTY
 			}
 		} else {
@@ -496,11 +511,13 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 			}
 		}
 	} else {
-		if renamed.inode.isDir() && newParent.inode.nlink == maxLinks {
+		if renamed.inode.isDir() && newParentDir.inode.nlink == maxLinks {
 			return syserror.EMLINK
 		}
 	}
-	if newParent.vfsd.IsDisowned() {
+	// tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can
+	// only be dead if it was deleted.
+	if newParentDir.dentry.vfsd.IsDead() {
 		return syserror.ENOENT
 	}
 
@@ -508,36 +525,38 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	// simplicity, under the assumption that applications are not intentionally
 	// doing noop renames expecting them to succeed where non-noop renames
 	// would fail.
-	if renamedVFSD == replacedVFSD {
+	if renamed == replaced {
 		return nil
 	}
 	vfsObj := rp.VirtualFilesystem()
-	oldParentDir := oldParent.inode.impl.(*directory)
-	newParentDir := newParent.inode.impl.(*directory)
 	mntns := vfs.MountNamespaceFromContext(ctx)
 	defer mntns.DecRef()
-	if err := vfsObj.PrepareRenameDentry(mntns, renamedVFSD, replacedVFSD); err != nil {
+	var replacedVFSD *vfs.Dentry
+	if replaced != nil {
+		replacedVFSD = &replaced.vfsd
+	}
+	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
 		return err
 	}
 	if replaced != nil {
-		newParentDir.childList.Remove(replaced)
+		newParentDir.removeChildLocked(replaced)
 		if replaced.inode.isDir() {
-			newParent.inode.decLinksLocked() // from replaced's ".."
+			newParentDir.inode.decLinksLocked() // from replaced's ".."
 		}
 		replaced.inode.decLinksLocked()
 	}
-	oldParentDir.childList.Remove(renamed)
-	newParentDir.childList.PushBack(renamed)
-	if renamed.inode.isDir() {
-		oldParent.inode.decLinksLocked()
-		newParent.inode.incLinksLocked()
+	oldParentDir.removeChildLocked(renamed)
+	newParentDir.insertChildLocked(renamed, newName)
+	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
+	oldParentDir.inode.touchCMtime()
+	if oldParentDir != newParentDir {
+		if renamed.inode.isDir() {
+			oldParentDir.inode.decLinksLocked()
+			newParentDir.inode.incLinksLocked()
+		}
+		newParentDir.inode.touchCMtime()
 	}
-	oldParent.inode.touchCMtime()
-	newParent.inode.touchCMtime()
 	renamed.inode.touchCtime()
-	// TODO(gvisor.dev/issue/1197): Update timestamps and parent directory
-	// sizes.
-	vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
 	return nil
 }
 
@@ -545,11 +564,11 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
@@ -559,15 +578,15 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if name == ".." {
 		return syserror.ENOTEMPTY
 	}
-	childVFSD := parent.vfsd.Child(name)
-	if childVFSD == nil {
+	child, ok := parentDir.childMap[name]
+	if !ok {
 		return syserror.ENOENT
 	}
-	child := childVFSD.Impl().(*dentry)
-	if !child.inode.isDir() {
+	childDir, ok := child.inode.impl.(*directory)
+	if !ok {
 		return syserror.ENOTDIR
 	}
-	if childVFSD.HasChildren() {
+	if len(childDir.childMap) != 0 {
 		return syserror.ENOTEMPTY
 	}
 	mnt := rp.Mount()
@@ -578,14 +597,14 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	vfsObj := rp.VirtualFilesystem()
 	mntns := vfs.MountNamespaceFromContext(ctx)
 	defer mntns.DecRef()
-	if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
+	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
 		return err
 	}
-	parent.inode.impl.(*directory).childList.Remove(child)
-	parent.inode.decLinksLocked() // from child's ".."
+	parentDir.removeChildLocked(child)
+	parentDir.inode.decLinksLocked() // from child's ".."
 	child.inode.decLinksLocked()
-	vfsObj.CommitDeleteDentry(childVFSD)
-	parent.inode.touchCMtime()
+	vfsObj.CommitDeleteDentry(&child.vfsd)
+	parentDir.inode.touchCMtime()
 	return nil
 }
 
@@ -627,10 +646,9 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
-	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
 		child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
+		parentDir.insertChildLocked(child, name)
 		return nil
 	})
 }
@@ -639,22 +657,21 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
 	if name == "." || name == ".." {
 		return syserror.EISDIR
 	}
-	childVFSD := parent.vfsd.Child(name)
-	if childVFSD == nil {
+	child, ok := parentDir.childMap[name]
+	if !ok {
 		return syserror.ENOENT
 	}
-	child := childVFSD.Impl().(*dentry)
 	if child.inode.isDir() {
 		return syserror.EISDIR
 	}
@@ -669,13 +686,13 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	vfsObj := rp.VirtualFilesystem()
 	mntns := vfs.MountNamespaceFromContext(ctx)
 	defer mntns.DecRef()
-	if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
+	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
 		return err
 	}
-	parent.inode.impl.(*directory).childList.Remove(child)
+	parentDir.removeChildLocked(child)
 	child.inode.decLinksLocked()
-	vfsObj.CommitDeleteDentry(childVFSD)
-	parent.inode.touchCMtime()
+	vfsObj.CommitDeleteDentry(&child.vfsd)
+	parentDir.inode.touchCMtime()
 	return nil
 }
 
@@ -743,5 +760,5 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	return vfs.GenericPrependPath(vfsroot, vd, b)
+	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go
index d4f59ee5b..60c2c980e 100644
--- a/pkg/sentry/fsimpl/tmpfs/stat_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go
@@ -71,9 +71,15 @@ func TestStatAfterCreate(t *testing.T) {
 				t.Errorf("got btime %d, want 0", got.Btime.ToNsec())
 			}
 
-			// Size should be 0.
-			if got.Size != 0 {
-				t.Errorf("got size %d, want 0", got.Size)
+			// Size should be 0 (except for directories, which make up a size
+			// of 20 per entry, including the "." and ".." entries present in
+			// otherwise-empty directories).
+			wantSize := uint64(0)
+			if typ == "dir" {
+				wantSize = 40
+			}
+			if got.Size != wantSize {
+				t.Errorf("got size %d, want %d", got.Size, wantSize)
 			}
 
 			// Nlink should be 1 for files, 2 for dirs.
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 82c709b43..efc931468 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -12,16 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package tmpfs provides a filesystem implementation that behaves like tmpfs:
-// the Dentry tree is the sole source of truth for the state of the filesystem.
+// Package tmpfs provides an in-memory filesystem whose contents are
+// application-mutable, consistent with Linux's tmpfs.
 //
 // Lock order:
 //
 // filesystem.mu
 //   inode.mu
 //     regularFileFD.offMu
+//       *** "memmap.Mappable locks" below this point
 //       regularFile.mapsMu
+//         *** "memmap.Mappable locks taken by Translate" below this point
 //         regularFile.dataMu
+//     directory.iterMu
 package tmpfs
 
 import (
@@ -41,6 +44,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Name is the default filesystem name.
@@ -112,18 +116,18 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 
 	fs.vfsfs.Init(vfsObj, newFSType, &fs)
 
-	var root *inode
+	var root *dentry
 	switch rootFileType {
 	case linux.S_IFREG:
-		root = fs.newRegularFile(creds, 0777)
+		root = fs.newDentry(fs.newRegularFile(creds, 0777))
 	case linux.S_IFLNK:
-		root = fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget)
+		root = fs.newDentry(fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget))
 	case linux.S_IFDIR:
-		root = fs.newDirectory(creds, 01777)
+		root = &fs.newDirectory(creds, 01777).dentry
 	default:
 		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
 	}
-	return &fs.vfsfs, &fs.newDentry(root).vfsd, nil
+	return &fs.vfsfs, &root.vfsd, nil
 }
 
 // Release implements vfs.FilesystemImpl.Release.
@@ -134,20 +138,29 @@ func (fs *filesystem) Release() {
 type dentry struct {
 	vfsd vfs.Dentry
 
+	// parent is this dentry's parent directory. Each referenced dentry holds a
+	// reference on parent.dentry. If this dentry is a filesystem root, parent
+	// is nil. parent is protected by filesystem.mu.
+	parent *dentry
+
+	// name is the name of this dentry in its parent. If this dentry is a
+	// filesystem root, name is the empty string. name is protected by
+	// filesystem.mu.
+	name string
+
+	// dentryEntry (ugh) links dentries into their parent directory.childList.
+	dentryEntry
+
 	// inode is the inode represented by this dentry. Multiple Dentries may
 	// share a single non-directory inode (with hard links). inode is
 	// immutable.
-	inode *inode
-
+	//
 	// tmpfs doesn't count references on dentries; because the dentry tree is
 	// the sole source of truth, it is by definition always consistent with the
 	// state of the filesystem. However, it does count references on inodes,
 	// because inode resources are released when all references are dropped.
-	// (tmpfs doesn't really have resources to release, but we implement
-	// reference counting because tmpfs regular files will.)
-
-	// dentryEntry (ugh) links dentries into their parent directory.childList.
-	dentryEntry
+	// dentry therefore forwards reference counting directly to inode.
+	inode *inode
 }
 
 func (fs *filesystem) newDentry(inode *inode) *dentry {
@@ -207,10 +220,6 @@ type inode struct {
 	ctime int64 // nanoseconds
 	mtime int64 // nanoseconds
 
-	// Only meaningful for device special files.
-	rdevMajor uint32
-	rdevMinor uint32
-
 	// Advisory file locks, which lock at the inode level.
 	locks lock.FileLocks
 
@@ -230,7 +239,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials,
 	i.gid = uint32(creds.EffectiveKGID)
 	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
 	// Tmpfs creation sets atime, ctime, and mtime to current time.
-	now := i.clock.Now().Nanoseconds()
+	now := fs.clock.Now().Nanoseconds()
 	i.atime = now
 	i.ctime = now
 	i.mtime = now
@@ -283,14 +292,10 @@ func (i *inode) tryIncRef() bool {
 func (i *inode) decRef() {
 	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
 		if regFile, ok := i.impl.(*regularFile); ok {
-			// Hold inode.mu and regFile.dataMu while mutating
-			// size.
-			i.mu.Lock()
-			regFile.dataMu.Lock()
+			// Release memory used by regFile to store data. Since regFile is
+			// no longer usable, we don't need to grab any locks or update any
+			// metadata.
 			regFile.data.DropAll(regFile.memFile)
-			atomic.StoreUint64(&regFile.size, 0)
-			regFile.dataMu.Unlock()
-			i.mu.Unlock()
 		}
 	} else if refs < 0 {
 		panic("tmpfs.inode.decRef() called without holding a reference")
@@ -310,15 +315,15 @@ func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) e
 // a concurrent modification), so we do not require holding inode.mu.
 func (i *inode) statTo(stat *linux.Statx) {
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
-		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_ATIME |
-		linux.STATX_BTIME | linux.STATX_CTIME | linux.STATX_MTIME
-	stat.Blksize = 1 // usermem.PageSize in tmpfs
+		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
+		linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME |
+		linux.STATX_MTIME
+	stat.Blksize = usermem.PageSize
 	stat.Nlink = atomic.LoadUint32(&i.nlink)
 	stat.UID = atomic.LoadUint32(&i.uid)
 	stat.GID = atomic.LoadUint32(&i.gid)
 	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
 	stat.Ino = i.ino
-	// Linux's tmpfs has no concept of btime, so zero-value is returned.
 	stat.Atime = linux.NsecToStatxTimestamp(i.atime)
 	stat.Ctime = linux.NsecToStatxTimestamp(i.ctime)
 	stat.Mtime = linux.NsecToStatxTimestamp(i.mtime)
@@ -327,19 +332,22 @@ func (i *inode) statTo(stat *linux.Statx) {
 	case *regularFile:
 		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
 		stat.Size = uint64(atomic.LoadUint64(&impl.size))
-		// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
-		// a uint64 accessed using atomic memory operations to avoid taking
-		// locks).
+		// TODO(jamieliu): This should be impl.data.Span() / 512, but this is
+		// too expensive to compute here. Cache it in regularFile.
 		stat.Blocks = allocatedBlocksForSize(stat.Size)
+	case *directory:
+		// "20" is mm/shmem.c:BOGO_DIRENT_SIZE.
+		stat.Size = 20 * (2 + uint64(atomic.LoadInt64(&impl.numChildren)))
+		// stat.Blocks is 0.
 	case *symlink:
-		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
 		stat.Size = uint64(len(impl.target))
-		stat.Blocks = allocatedBlocksForSize(stat.Size)
+		// stat.Blocks is 0.
+	case *namedPipe, *socketFile:
+		// stat.Size and stat.Blocks are 0.
 	case *deviceFile:
+		// stat.Size and stat.Blocks are 0.
 		stat.RdevMajor = impl.major
 		stat.RdevMinor = impl.minor
-	case *socketFile, *directory, *namedPipe:
-		// Nothing to do.
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
 	}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 35b208721..8624dbd5d 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -15,34 +15,17 @@
 package vfs
 
 import (
-	"fmt"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// Dentry represents a node in a Filesystem tree which may represent a file.
+// Dentry represents a node in a Filesystem tree at which a file exists.
 //
 // Dentries are reference-counted. Unless otherwise specified, all Dentry
 // methods require that a reference is held.
 //
-// A Dentry transitions through up to 3 different states through its lifetime:
-//
-// - Dentries are initially "independent". Independent Dentries have no parent,
-// and consequently no name.
-//
-// - Dentry.InsertChild() causes an independent Dentry to become a "child" of
-// another Dentry. A child node has a parent node, and a name in that parent,
-// both of which are mutable by DentryMoveChild(). Each child Dentry's name is
-// unique within its parent.
-//
-// - Dentry.RemoveChild() causes a child Dentry to become "disowned". A
-// disowned Dentry can still refer to its former parent and its former name in
-// said parent, but the disowned Dentry is no longer reachable from its parent,
-// and a new Dentry with the same name may become a child of the parent. (This
-// is analogous to a struct dentry being "unhashed" in Linux.)
-//
 // Dentry is loosely analogous to Linux's struct dentry, but:
 //
 // - VFS does not associate Dentries with inodes. gVisor interacts primarily
@@ -57,9 +40,6 @@ import (
 // and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do
 // support inodes may store appropriate state in implementations of DentryImpl.
 //
-// - VFS does not provide synchronization for mutable Dentry fields, other than
-// mount-related ones.
-//
 // - VFS does not require that Dentries are instantiated for all paths accessed
 // through VFS, only those that are tracked beyond the scope of a single
 // Filesystem operation. This includes file descriptions, mount points, mount
@@ -67,6 +47,10 @@ import (
 // of Dentries for operations on mutable remote filesystems that can't actually
 // cache any state in the Dentry.
 //
+// - VFS does not track filesystem structure (i.e. relationships between
+// Dentries), since both the relevant state and synchronization are
+// filesystem-specific.
+//
 // - For the reasons above, VFS is not directly responsible for managing Dentry
 // lifetime. Dentry reference counts only indicate the extent to which VFS
 // requires Dentries to exist; Filesystems may elect to cache or discard
@@ -74,36 +58,23 @@ import (
 //
 // +stateify savable
 type Dentry struct {
-	// parent is this Dentry's parent in this Filesystem. If this Dentry is
-	// independent, parent is nil.
-	parent *Dentry
-
-	// name is this Dentry's name in parent.
-	name string
+	// mu synchronizes deletion/invalidation and mounting over this Dentry.
+	mu sync.Mutex `state:"nosave"`
 
-	flags uint32
+	// dead is true if the file represented by this Dentry has been deleted (by
+	// CommitDeleteDentry or CommitRenameReplaceDentry) or invalidated (by
+	// InvalidateDentry). dead is protected by mu.
+	dead bool
 
 	// mounts is the number of Mounts for which this Dentry is Mount.point.
 	// mounts is accessed using atomic memory operations.
 	mounts uint32
 
-	// children are child Dentries.
-	children map[string]*Dentry
-
-	// mu synchronizes disowning and mounting over this Dentry.
-	mu sync.Mutex `state:"nosave"`
-
 	// impl is the DentryImpl associated with this Dentry. impl is immutable.
 	// This should be the last field in Dentry.
 	impl DentryImpl
 }
 
-const (
-	// dflagsDisownedMask is set in Dentry.flags if the Dentry has been
-	// disowned.
-	dflagsDisownedMask = 1 << iota
-)
-
 // Init must be called before first use of d.
 func (d *Dentry) Init(impl DentryImpl) {
 	d.impl = impl
@@ -134,20 +105,6 @@ type DentryImpl interface {
 	DecRef()
 }
 
-// IsDisowned returns true if d is disowned.
-func (d *Dentry) IsDisowned() bool {
-	return atomic.LoadUint32(&d.flags)&dflagsDisownedMask != 0
-}
-
-// Preconditions: !d.IsDisowned().
-func (d *Dentry) setDisowned() {
-	atomic.AddUint32(&d.flags, dflagsDisownedMask)
-}
-
-func (d *Dentry) isMounted() bool {
-	return atomic.LoadUint32(&d.mounts) != 0
-}
-
 // IncRef increments d's reference count.
 func (d *Dentry) IncRef() {
 	d.impl.IncRef()
@@ -164,104 +121,26 @@ func (d *Dentry) DecRef() {
 	d.impl.DecRef()
 }
 
-// These functions are exported so that filesystem implementations can use
-// them. The vfs package, and users of VFS, should not call these functions.
-// Unless otherwise specified, these methods require that there are no
-// concurrent mutators of d.
-
-// Name returns d's name in its parent in its owning Filesystem. If d is
-// independent, Name returns an empty string.
-func (d *Dentry) Name() string {
-	return d.name
-}
-
-// Parent returns d's parent in its owning Filesystem. It does not take a
-// reference on the returned Dentry. If d is independent, Parent returns nil.
-func (d *Dentry) Parent() *Dentry {
-	return d.parent
-}
-
-// ParentOrSelf is equivalent to Parent, but returns d if d is independent.
-func (d *Dentry) ParentOrSelf() *Dentry {
-	if d.parent == nil {
-		return d
-	}
-	return d.parent
-}
-
-// Child returns d's child with the given name in its owning Filesystem. It
-// does not take a reference on the returned Dentry. If no such child exists,
-// Child returns nil.
-func (d *Dentry) Child(name string) *Dentry {
-	return d.children[name]
-}
-
-// HasChildren returns true if d has any children.
-func (d *Dentry) HasChildren() bool {
-	return len(d.children) != 0
-}
-
-// Children returns a map containing all of d's children.
-func (d *Dentry) Children() map[string]*Dentry {
-	if !d.HasChildren() {
-		return nil
-	}
-	m := make(map[string]*Dentry)
-	for name, child := range d.children {
-		m[name] = child
-	}
-	return m
+// IsDead returns true if d has been deleted or invalidated by its owning
+// filesystem.
+func (d *Dentry) IsDead() bool {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	return d.dead
 }
 
-// InsertChild makes child a child of d with the given name.
-//
-// InsertChild is a mutator of d and child.
-//
-// Preconditions: child must be an independent Dentry. d and child must be from
-// the same Filesystem. d must not already have a child with the given name.
-func (d *Dentry) InsertChild(child *Dentry, name string) {
-	if checkInvariants {
-		if _, ok := d.children[name]; ok {
-			panic(fmt.Sprintf("parent already contains a child named %q", name))
-		}
-		if child.parent != nil || child.name != "" {
-			panic(fmt.Sprintf("child is not independent: parent = %v, name = %q", child.parent, child.name))
-		}
-	}
-	if d.children == nil {
-		d.children = make(map[string]*Dentry)
-	}
-	d.children[name] = child
-	child.parent = d
-	child.name = name
+func (d *Dentry) isMounted() bool {
+	return atomic.LoadUint32(&d.mounts) != 0
 }
 
-// IsAncestorOf returns true if d is an ancestor of d2; that is, d is either
-// d2's parent or an ancestor of d2's parent.
-func (d *Dentry) IsAncestorOf(d2 *Dentry) bool {
-	for d2.parent != nil {
-		if d2.parent == d {
-			return true
-		}
-		d2 = d2.parent
-	}
-	return false
-}
+// The following functions are exported so that filesystem implementations can
+// use them. The vfs package, and users of VFS, should not call these
+// functions.
 
 // PrepareDeleteDentry must be called before attempting to delete the file
 // represented by d. If PrepareDeleteDentry succeeds, the caller must call
 // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome.
-//
-// Preconditions: d is a child Dentry.
 func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error {
-	if checkInvariants {
-		if d.parent == nil {
-			panic("d is independent")
-		}
-		if d.IsDisowned() {
-			panic("d is already disowned")
-		}
-	}
 	vfs.mountMu.Lock()
 	if mntns.mountpoints[d] != 0 {
 		vfs.mountMu.Unlock()
@@ -280,42 +159,27 @@ func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) {
 	d.mu.Unlock()
 }
 
-// CommitDeleteDentry must be called after the file represented by d is
-// deleted, and causes d to become disowned.
-//
-// CommitDeleteDentry is a mutator of d and d.Parent().
-//
-// Preconditions: PrepareDeleteDentry was previously called on d.
+// CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion
+// succeeds.
 func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
-	if d.parent != nil {
-		delete(d.parent.children, d.name)
-	}
-	d.setDisowned()
+	d.dead = true
 	d.mu.Unlock()
 	if d.isMounted() {
-		vfs.forgetDisownedMountpoint(d)
+		vfs.forgetDeadMountpoint(d)
 	}
 }
 
-// ForceDeleteDentry causes d to become disowned. It should only be used in
-// cases where VFS has no ability to stop the deletion (e.g. d represents the
-// local state of a file on a remote filesystem on which the file has already
-// been deleted).
-//
-// ForceDeleteDentry is a mutator of d and d.Parent().
-//
-// Preconditions: d is a child Dentry.
-func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) {
-	if checkInvariants {
-		if d.parent == nil {
-			panic("d is independent")
-		}
-		if d.IsDisowned() {
-			panic("d is already disowned")
-		}
-	}
+// InvalidateDentry is called when d ceases to represent the file it formerly
+// did for reasons outside of VFS' control (e.g. d represents the local state
+// of a file on a remote filesystem on which the file has already been
+// deleted).
+func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) {
 	d.mu.Lock()
-	vfs.CommitDeleteDentry(d)
+	d.dead = true
+	d.mu.Unlock()
+	if d.isMounted() {
+		vfs.forgetDeadMountpoint(d)
+	}
 }
 
 // PrepareRenameDentry must be called before attempting to rename the file
@@ -324,25 +188,9 @@ func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) {
 // caller must call AbortRenameDentry, CommitRenameReplaceDentry, or
 // CommitRenameExchangeDentry depending on the rename's outcome.
 //
-// Preconditions: from is a child Dentry. If to is not nil, it must be a child
-// Dentry from the same Filesystem. from != to.
+// Preconditions: If to is not nil, it must be a child Dentry from the same
+// Filesystem. from != to.
 func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
-	if checkInvariants {
-		if from.parent == nil {
-			panic("from is independent")
-		}
-		if from.IsDisowned() {
-			panic("from is already disowned")
-		}
-		if to != nil {
-			if to.parent == nil {
-				panic("to is independent")
-			}
-			if to.IsDisowned() {
-				panic("to is already disowned")
-			}
-		}
-	}
 	vfs.mountMu.Lock()
 	if mntns.mountpoints[from] != 0 {
 		vfs.mountMu.Unlock()
@@ -376,24 +224,14 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) {
 // is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file
 // that was replaced by from.
 //
-// CommitRenameReplaceDentry is a mutator of from, to, from.Parent(), and
-// to.Parent().
-//
 // Preconditions: PrepareRenameDentry was previously called on from and to.
-// newParent.Child(newName) == to.
-func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) {
-	if newParent.children == nil {
-		newParent.children = make(map[string]*Dentry)
-	}
-	newParent.children[newName] = from
-	from.parent = newParent
-	from.name = newName
+func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) {
 	from.mu.Unlock()
 	if to != nil {
-		to.setDisowned()
+		to.dead = true
 		to.mu.Unlock()
 		if to.isMounted() {
-			vfs.forgetDisownedMountpoint(to)
+			vfs.forgetDeadMountpoint(to)
 		}
 	}
 }
@@ -401,25 +239,18 @@ func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry,
 // CommitRenameExchangeDentry must be called after the files represented by
 // from and to are exchanged by rename(RENAME_EXCHANGE).
 //
-// CommitRenameExchangeDentry is a mutator of from, to, from.Parent(), and
-// to.Parent().
-//
 // Preconditions: PrepareRenameDentry was previously called on from and to.
 func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) {
-	from.parent, to.parent = to.parent, from.parent
-	from.name, to.name = to.name, from.name
-	from.parent.children[from.name] = from
-	to.parent.children[to.name] = to
 	from.mu.Unlock()
 	to.mu.Unlock()
 }
 
-// forgetDisownedMountpoint is called when a mount point is deleted to umount
-// all mounts using it in all other mount namespaces.
+// forgetDeadMountpoint is called when a mount point is deleted or invalidated
+// to umount all mounts using it in all other mount namespaces.
 //
-// forgetDisownedMountpoint is analogous to Linux's
+// forgetDeadMountpoint is analogous to Linux's
 // fs/namespace.c:__detach_mounts().
-func (vfs *VirtualFilesystem) forgetDisownedMountpoint(d *Dentry) {
+func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) {
 	var (
 		vdsToDecRef    []VirtualDentry
 		mountsToDecRef []*Mount
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 5976b5ccd..15cc091e2 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -127,7 +127,8 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn
 		mount:  mnt,
 		dentry: d,
 	}
-	fd.vd.IncRef()
+	mnt.IncRef()
+	d.IncRef()
 	fd.opts = *opts
 	fd.readable = MayReadFileWithOpenFlags(statusFlags)
 	fd.writable = writable
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index a537a29d1..74577bc2f 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -346,7 +346,10 @@ type FilesystemImpl interface {
 	// ENOTEMPTY.
 	//
 	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink(). oldName is not "." or "..".
+	// !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a
+	// previous call to
+	// oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is
+	// not "." or "..".
 	//
 	// Postconditions: If RenameAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go
index 7315a588e..465e610e0 100644
--- a/pkg/sentry/vfs/filesystem_impl_util.go
+++ b/pkg/sentry/vfs/filesystem_impl_util.go
@@ -16,8 +16,6 @@ package vfs
 
 import (
 	"strings"
-
-	"gvisor.dev/gvisor/pkg/fspath"
 )
 
 // GenericParseMountOptions parses a comma-separated list of options of the
@@ -43,27 +41,3 @@ func GenericParseMountOptions(str string) map[string]string {
 	}
 	return m
 }
-
-// GenericPrependPath may be used by implementations of
-// FilesystemImpl.PrependPath() for which a single statically-determined lock
-// or set of locks is sufficient to ensure its preconditions (as opposed to
-// e.g. per-Dentry locks).
-//
-// Preconditions: Dentry.Name() and Dentry.Parent() must be held constant for
-// vd.Dentry() and all of its ancestors.
-func GenericPrependPath(vfsroot, vd VirtualDentry, b *fspath.Builder) error {
-	mnt, d := vd.mount, vd.dentry
-	for {
-		if mnt == vfsroot.mount && d == vfsroot.dentry {
-			return PrependPathAtVFSRootError{}
-		}
-		if d == mnt.root {
-			return nil
-		}
-		if d.parent == nil {
-			return PrependPathAtNonMountRootError{}
-		}
-		b.PrependComponent(d.name)
-		d = d.parent
-	}
-}
diff --git a/pkg/sentry/vfs/genericfstree/BUILD b/pkg/sentry/vfs/genericfstree/BUILD
new file mode 100644
index 000000000..d8fd92677
--- /dev/null
+++ b/pkg/sentry/vfs/genericfstree/BUILD
@@ -0,0 +1,16 @@
+load("//tools/go_generics:defs.bzl", "go_template")
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+go_template(
+    name = "generic_fstree",
+    srcs = [
+        "genericfstree.go",
+    ],
+    types = [
+        "Dentry",
+    ],
+)
diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go
new file mode 100644
index 000000000..286510195
--- /dev/null
+++ b/pkg/sentry/vfs/genericfstree/genericfstree.go
@@ -0,0 +1,80 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package genericfstree provides tools for implementing vfs.FilesystemImpls
+// where a single statically-determined lock or set of locks is sufficient to
+// ensure that a Dentry's name and parent are contextually immutable.
+//
+// Clients using this package must use the go_template_instance rule in
+// tools/go_generics/defs.bzl to create an instantiation of this template
+// package, providing types to use in place of Dentry.
+package genericfstree
+
+import (
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Dentry is a required type parameter that is a struct with the given fields.
+type Dentry struct {
+	// vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl.
+	vfsd vfs.Dentry
+
+	// parent is the parent of this Dentry in the filesystem's tree. If this
+	// Dentry is a filesystem root, parent is nil.
+	parent *Dentry
+
+	// name is the name of this Dentry in its parent. If this Dentry is a
+	// filesystem root, name is unspecified.
+	name string
+}
+
+// IsAncestorDentry returns true if d is an ancestor of d2; that is, d is
+// either d2's parent or an ancestor of d2's parent.
+func IsAncestorDentry(d, d2 *Dentry) bool {
+	for {
+		if d2.parent == d {
+			return true
+		}
+		if d2.parent == d2 {
+			return false
+		}
+		d2 = d2.parent
+	}
+}
+
+// ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d.
+func ParentOrSelf(d *Dentry) *Dentry {
+	if d.parent != nil {
+		return d.parent
+	}
+	return d
+}
+
+// PrependPath is a generic implementation of FilesystemImpl.PrependPath().
+func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath.Builder) error {
+	for {
+		if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
+			return vfs.PrependPathAtVFSRootError{}
+		}
+		if &d.vfsd == mnt.Root() {
+			return nil
+		}
+		if d.parent == nil {
+			return vfs.PrependPathAtNonMountRootError{}
+		}
+		b.PrependComponent(d.name)
+		d = d.parent
+	}
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index f06946103..02850b65c 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -188,6 +188,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
 	if err != nil {
 		return err
 	}
+
 	// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
 	// lock ordering.
 	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
@@ -199,7 +200,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
 	vfs.mountMu.Lock()
 	vd.dentry.mu.Lock()
 	for {
-		if vd.dentry.IsDisowned() {
+		if vd.dentry.dead {
 			vd.dentry.mu.Unlock()
 			vfs.mountMu.Unlock()
 			vd.DecRef()
@@ -665,6 +666,12 @@ func (mnt *Mount) submountsLocked() []*Mount {
 	return mounts
 }
 
+// Root returns the mount's root. It does not take a reference on the returned
+// Dentry.
+func (mnt *Mount) Root() *Dentry {
+	return mnt.root
+}
+
 // Root returns mntns' root. A reference is taken on the returned
 // VirtualDentry.
 func (mntns *MountNamespace) Root() VirtualDentry {
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
index f21a88034..cd78d66bc 100644
--- a/pkg/sentry/vfs/pathname.go
+++ b/pkg/sentry/vfs/pathname.go
@@ -58,7 +58,7 @@ loop:
 		switch err.(type) {
 		case nil:
 			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
-				// GenericPrependPath() will have returned
+				// genericfstree.PrependPath() will have returned
 				// PrependPathAtVFSRootError in this case since it checks
 				// against vfsroot before mnt.root, but other implementations
 				// of FilesystemImpl.PrependPath() may return nil instead.
@@ -84,7 +84,7 @@ loop:
 		}
 	}
 	b.PrependByte('/')
-	if origD.IsDisowned() {
+	if origD.IsDead() {
 		b.AppendString(" (deleted)")
 	}
 	return b.String(), nil
@@ -136,7 +136,7 @@ loop:
 // PathnameForGetcwd returns an absolute pathname to vd, consistent with
 // Linux's sys_getcwd().
 func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
-	if vd.dentry.IsDisowned() {
+	if vd.dentry.IsDead() {
 		return "", syserror.ENOENT
 	}
 
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 8f31495da..9d047ff88 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -29,7 +29,9 @@ import (
 //
 // From the perspective of FilesystemImpl methods, a ResolvingPath represents a
 // starting Dentry on the associated Filesystem (on which a reference is
-// already held) and a stream of path components relative to that Dentry.
+// already held), a stream of path components relative to that Dentry, and
+// elements of the invoking Context that are commonly required by
+// FilesystemImpl methods.
 //
 // ResolvingPath is loosely analogous to Linux's struct nameidata.
 type ResolvingPath struct {
@@ -251,18 +253,17 @@ func (rp *ResolvingPath) relpathCommit() {
 	rp.origParts[rp.curPart] = rp.pit
 }
 
-// ResolveParent returns the VFS parent of d. It does not take a reference on
-// the returned Dentry.
-//
-// Preconditions: There are no concurrent mutators of d.
-//
-// Postconditions: If the returned error is nil, then the returned Dentry is
-// not nil.
-func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
-	var parent *Dentry
+// CheckRoot is called before resolving the parent of the Dentry d. If the
+// Dentry is contextually a VFS root, such that path resolution should treat
+// d's parent as itself, CheckRoot returns (true, nil). If the Dentry is the
+// root of a non-root mount, such that path resolution should switch to another
+// Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path
+// resolution should resolve d's parent normally, and CheckRoot returns (false,
+// nil).
+func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) {
 	if d == rp.root.dentry && rp.mount == rp.root.mount {
-		// At contextual VFS root.
-		parent = d
+		// At contextual VFS root (due to e.g. chroot(2)).
+		return true, nil
 	} else if d == rp.mount.root {
 		// At mount root ...
 		vd := rp.vfs.getMountpointAt(rp.mount, rp.root)
@@ -270,59 +271,27 @@ func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
 			// ... of non-root mount.
 			rp.nextMount = vd.mount
 			rp.nextStart = vd.dentry
-			return nil, resolveMountRootOrJumpError{}
+			return false, resolveMountRootOrJumpError{}
 		}
 		// ... of root mount.
-		parent = d
-	} else if d.parent == nil {
-		// At filesystem root.
-		parent = d
-	} else {
-		parent = d.parent
+		return true, nil
 	}
-	if parent.isMounted() {
-		if mnt := rp.vfs.getMountAt(rp.mount, parent); mnt != nil {
-			rp.nextMount = mnt
-			return nil, resolveMountPointError{}
-		}
-	}
-	return parent, nil
+	return false, nil
 }
 
-// ResolveChild returns the VFS child of d with the given name. It does not
-// take a reference on the returned Dentry. If no such child exists,
-// ResolveChild returns (nil, nil).
-//
-// Preconditions: There are no concurrent mutators of d.
-func (rp *ResolvingPath) ResolveChild(d *Dentry, name string) (*Dentry, error) {
-	child := d.children[name]
-	if child == nil {
-		return nil, nil
+// CheckMount is called after resolving the parent or child of another Dentry
+// to d. If d is a mount point, such that path resolution should switch to
+// another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount
+// returns nil.
+func (rp *ResolvingPath) CheckMount(d *Dentry) error {
+	if !d.isMounted() {
+		return nil
 	}
-	if child.isMounted() {
-		if mnt := rp.vfs.getMountAt(rp.mount, child); mnt != nil {
-			rp.nextMount = mnt
-			return nil, resolveMountPointError{}
-		}
-	}
-	return child, nil
-}
-
-// ResolveComponent returns the Dentry reached by starting at d and resolving
-// the current path component in the stream represented by rp. It does not
-// advance the stream. It does not take a reference on the returned Dentry. If
-// no such Dentry exists, ResolveComponent returns (nil, nil).
-//
-// Preconditions: !rp.Done(). There are no concurrent mutators of d.
-func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) {
-	switch pc := rp.Component(); pc {
-	case ".":
-		return d, nil
-	case "..":
-		return rp.ResolveParent(d)
-	default:
-		return rp.ResolveChild(d, pc)
+	if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil {
+		rp.nextMount = mnt
+		return resolveMountPointError{}
 	}
+	return nil
 }
 
 // ShouldFollowSymlink returns true if, supposing that the current path
-- 
cgit v1.2.3


From a4711053672ae6732583f4eae54596783ec47547 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 21 Apr 2020 15:52:42 -0400
Subject: benchmarks: use absolute bazel target

bazel run :benchmarks only works from the benchmarks directory.
bazel run //benchmarks works from anywhere in the workspace.

Also fix help commands, which should be a multiline code section.
---
 benchmarks/README.md | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 6d1ea3ae2..814bcb220 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -10,7 +10,7 @@ The scripts assume the following:
     (controller) and one or more machines on which docker containers will be run
     (environment).
 *   The controller machine must have bazel installed along with this source
-    code. You should be able to run a command like `bazel run :benchmarks --
+    code. You should be able to run a command like `bazel run //benchmarks --
     --list`
 *   Environment machines must have docker and the required runtimes installed.
     More specifically, you should be able to run a command like: `docker run
@@ -33,7 +33,7 @@ but it does support GCP workflows. To run locally, run the following from the
 benchmarks directory:
 
 ```bash
-bazel run --define gcloud=off :benchmarks -- run-local startup
+bazel run --define gcloud=off //benchmarks -- run-local startup
 
 ...
 method,metric,result
@@ -48,16 +48,20 @@ runtime, runc. Running on another installed runtime, like say runsc, is as
 simple as:
 
 ```bash
-bazel run  --define gcloud=off :benchmarks -- run-local startup --runtime=runsc
+bazel run  --define gcloud=off //benchmarks -- run-local startup --runtime=runsc
 ```
 
-There is help: `bash bazel run --define gcloud=off :benchmarks -- --help bazel
-run --define gcloud=off :benchmarks -- run-local --help`
+There is help:
+
+```bash
+bazel run --define gcloud=off //benchmarks -- --help
+bazel run --define gcloud=off //benchmarks -- run-local --help
+```
 
 To list available benchmarks, use the `list` commmand:
 
 ```bash
-bazel --define gcloud=off  run :benchmarks -- list
+bazel --define gcloud=off  run //benchmarks -- list
 
 ...
 Benchmark: sysbench.cpu
@@ -70,7 +74,7 @@ Metrics: events_per_second
 You can choose benchmarks by name or regex like:
 
 ```bash
-bazel run --define gcloud=off :benchmarks -- run-local startup.node
+bazel run --define gcloud=off //benchmarks -- run-local startup.node
 ...
 metric,result
 startup_time_ms,1671.7178000000001
@@ -80,7 +84,7 @@ startup_time_ms,1671.7178000000001
 or
 
 ```bash
-bazel run --define gcloud=off :benchmarks -- run-local s
+bazel run --define gcloud=off //benchmarks -- run-local s
 ...
 method,metric,result
 startup.empty,startup_time_ms,1792.8292
@@ -98,13 +102,13 @@ You can run parameterized benchmarks, for example to run with different
 runtimes:
 
 ```bash
-bazel run --define gcloud=off :benchmarks -- run-local --runtime=runc --runtime=runsc sysbench.cpu
+bazel run --define gcloud=off //benchmarks -- run-local --runtime=runc --runtime=runsc sysbench.cpu
 ```
 
 Or with different parameters:
 
 ```bash
-bazel run --define gcloud=off :benchmarks -- run-local --max_prime=10 --max_prime=100 sysbench.cpu
+bazel run --define gcloud=off //benchmarks -- run-local --max_prime=10 --max_prime=100 sysbench.cpu
 ```
 
 ### On Google Compute Engine (GCE)
@@ -117,7 +121,7 @@ runtime is installed from the workspace. See the files in `tools/installers` for
 supported install targets.
 
 ```bash
-bazel run :benchmarks -- run-gcp --installers=head --runtime=runsc sysbench.cpu
+bazel run //benchmarks -- run-gcp --installers=head --runtime=runsc sysbench.cpu
 ```
 
 When running on GCE, the scripts generate a per run SSH key, which is added to
-- 
cgit v1.2.3


From 89822a446161f1ccb3b84d53f8528bc8b0a28053 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 21 Apr 2020 13:12:38 -0700
Subject: Move to GitHub's new issue templates.

This allows us to specify a richer configuration for the issue template, that
effectively moves a lot of the "metadata" from the template itself to the main
issue page.

PiperOrigin-RevId: 307666509
---
 .github/ISSUE_TEMPLATE/bug_report.md      | 31 +++++++++++++++++++++++++++++++
 .github/ISSUE_TEMPLATE/config.yml         | 14 ++++++++++++++
 .github/ISSUE_TEMPLATE/feature_request.md | 21 +++++++++++++++++++++
 .github/issue_template.md                 | 20 --------------------
 4 files changed, 66 insertions(+), 20 deletions(-)
 create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md
 create mode 100644 .github/ISSUE_TEMPLATE/config.yml
 create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md
 delete mode 100644 .github/issue_template.md

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 000000000..49a1ba697
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,31 @@
+---
+name: Bug report
+about: Create a bug report to help us improve
+title:
+labels:
+  - 'type: bug'
+assignees: ''
+---
+
+**Description**
+
+A clear description of what the bug is. If possible, explicitly indicate the
+expected behavior vs. the observed behavior.
+
+**Steps to reproduce**
+
+If available, please include detailed reproduction steps.
+
+If the bug requires software that is not publicly available, see if it can be
+reproduced with software that is publicly available.
+
+**Environment**
+
+Please include the following details of your environment:
+
+*   `runsc -v`
+*   `docker version` or `docker info` (if available)
+*   `kubectl version` and `kubectl get nodes` (if using Kubernetes)
+*   `uname -a`
+*   `git describe` (if built from source)
+*   `runsc` debug logs (if available)
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 000000000..f42510b1f
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,14 @@
+blank_issues_enabled: false
+contact_links:
+  - name: gVisor Documentation (FAQ)
+    url: https://gvisor.dev/docs/user_guide/faq/
+    about: Please see our documentation for common questions and answers.
+  - name: gVisor Documentation (Debugging)
+    url: https://gvisor.dev/docs/user_guide/debugging/
+    about: Please see our documentation for debugging tips.
+  - name: gVisor User Forum
+    url: https://groups.google.com/g/gvisor-users
+    about: Please ask and answer questions here.
+  - name: gVisor Security List
+    url: https://github.com/google/gvisor/blob/master/SECURITY.md
+    about: Please report security vulnerabilities using the process described here.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 000000000..65f60f385
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,21 @@
+---
+name: Feature request
+about: Suggest an idea or improvement
+title: ''
+labels:
+  - 'type: enhancement'
+assignees: ''
+---
+
+**Description**
+
+A clear description of the feature or enhancement.
+
+**Is this feature related to a specific bug?**
+
+Please include a bug references if yes.
+
+**Do you have a specific solution in mind?**
+
+Please include any details about a solution that you have in mind, including any
+alternatives considered.
diff --git a/.github/issue_template.md b/.github/issue_template.md
deleted file mode 100644
index 77c401d22..000000000
--- a/.github/issue_template.md
+++ /dev/null
@@ -1,20 +0,0 @@
-Before filling an issue, please consult our FAQ:
-https://gvisor.dev/docs/user_guide/faq/
-
-Also check that the issue hasn't been reported before.
-
-If you have a question, please email gvisor-users@googlegroups.com rather than filing a bug.
-
-If you believe you've found a security issue, please email gvisor-security@googlegroups.com rather than filing a bug.
-
-If this is your first time compiling or running gVisor, please make sure that your system meets the minimum requirements: https://github.com/google/gvisor#requirements
-
-For all other issues, please attach debug logs. To get debug logs, follow the
-instructions here: https://gvisor.dev/docs/user_guide/debugging/
-
-Other useful information to include is:
-
-*   `runsc -v`
-*   `docker version` or `docker info` if more relevant
-*   `uname -a` - `git describe`
-*   Detailed reproduction steps
-- 
cgit v1.2.3


From 37e01fd2ea6a0e67637975863317be9aae1b02f0 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 21 Apr 2020 16:30:26 -0700
Subject: Misc VFS2 fixes

- Fix defer operation ordering in kernfs.Filesystem.AccessAt()
- Add AT_NULL entry in proc/pid/auvx
- Fix line padding in /proc/pid/maps
- Fix linux_dirent serialization for getdents(2)
- Remove file creation flags from vfs.FileDescription.statusFlags()

Updates #1193, #1035

PiperOrigin-RevId: 307704159
---
 pkg/sentry/fs/proc/task.go                 |  3 +--
 pkg/sentry/fsimpl/kernfs/filesystem.go     |  2 +-
 pkg/sentry/fsimpl/proc/task_files.go       | 13 +++++++------
 pkg/sentry/fsimpl/proc/tasks_sys.go        |  2 +-
 pkg/sentry/mm/procfs.go                    |  4 ++--
 pkg/sentry/syscalls/linux/vfs2/getdents.go |  6 +++---
 pkg/sentry/vfs/file_description.go         | 15 +++++++++------
 7 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 4d42eac83..4bbe90198 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -73,8 +73,7 @@ func checkTaskState(t *kernel.Task) error {
 type taskDir struct {
 	ramfs.Dir
 
-	t     *kernel.Task
-	pidns *kernel.PIDNamespace
+	t *kernel.Task
 }
 
 var _ fs.InodeOperations = (*taskDir)(nil)
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 01c23d192..3164d022c 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -246,8 +246,8 @@ func (fs *Filesystem) Sync(ctx context.Context) error {
 // AccessAt implements vfs.Filesystem.Impl.AccessAt.
 func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
 	fs.mu.RLock()
-	defer fs.mu.RUnlock()
 	defer fs.processDeferredDecRefs()
+	defer fs.mu.RUnlock()
 
 	_, inode, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 2c6f8bdfc..f3173e197 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -111,17 +111,18 @@ func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	}
 	defer m.DecUsers(ctx)
 
-	// Space for buffer with AT_NULL (0) terminator at the end.
 	auxv := m.Auxv()
+	// Space for buffer with AT_NULL (0) terminator at the end.
 	buf.Grow((len(auxv) + 1) * 16)
 	for _, e := range auxv {
-		var tmp [8]byte
-		usermem.ByteOrder.PutUint64(tmp[:], e.Key)
-		buf.Write(tmp[:])
-
-		usermem.ByteOrder.PutUint64(tmp[:], uint64(e.Value))
+		var tmp [16]byte
+		usermem.ByteOrder.PutUint64(tmp[:8], e.Key)
+		usermem.ByteOrder.PutUint64(tmp[8:], uint64(e.Value))
 		buf.Write(tmp[:])
 	}
+	var atNull [16]byte
+	buf.Write(atNull[:])
+
 	return nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 3d5dc463c..f08668ca2 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -39,7 +39,7 @@ func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k
 			"shmmni":   newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMNI)),
 		}),
 		"vm": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
-			"mmap_min_addr":     newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{}),
+			"mmap_min_addr":     newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{k: k}),
 			"overcommit_memory": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0\n")),
 		}),
 		"net": newSysNetDir(root, inoGen, k),
diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go
index 1ab92f046..6efe5102b 100644
--- a/pkg/sentry/mm/procfs.go
+++ b/pkg/sentry/mm/procfs.go
@@ -148,7 +148,7 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI
 
 	// Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() =>
 	// stack_guard_page_start().
-	fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ",
+	lineLen, _ := fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ",
 		vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino)
 
 	// Figure out our filename or hint.
@@ -165,7 +165,7 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI
 	}
 	if s != "" {
 		// Per linux, we pad until the 74th character.
-		if pad := 73 - b.Len(); pad > 0 {
+		if pad := 73 - lineLen; pad > 0 {
 			b.WriteString(strings.Repeat(" ", pad))
 		}
 		b.WriteString(s)
diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
index 62e98817d..c7c7bf7ce 100644
--- a/pkg/sentry/syscalls/linux/vfs2/getdents.go
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -130,7 +130,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
 		if cb.t.Arch().Width() != 8 {
 			panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width()))
 		}
-		size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name)
+		size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
 		size = (size + 7) &^ 7 // round up to multiple of sizeof(long)
 		if size > cb.remaining {
 			return syserror.EINVAL
@@ -143,11 +143,11 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
 		// Zero out all remaining bytes in buf, including the NUL terminator
 		// after dirent.Name and the zero padding byte between the name and
 		// dirent type.
-		bufTail := buf[18+len(dirent.Name):]
+		bufTail := buf[18+len(dirent.Name) : size-1]
 		for i := range bufTail {
 			bufTail[i] = 0
 		}
-		bufTail[2] = dirent.Type
+		buf[size-1] = dirent.Type
 	}
 	n, err := cb.t.CopyOutBytes(cb.addr, buf)
 	if err != nil {
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 15cc091e2..418d69b96 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -111,10 +111,10 @@ type FileDescriptionOptions struct {
 }
 
 // Init must be called before first use of fd. If it succeeds, it takes
-// references on mnt and d. statusFlags is the initial file description status
-// flags, which is usually the full set of flags passed to open(2).
-func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
-	writable := MayWriteFileWithOpenFlags(statusFlags)
+// references on mnt and d. flags is the initial file description flags, which
+// is usually the full set of flags passed to open(2).
+func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
+	writable := MayWriteFileWithOpenFlags(flags)
 	if writable {
 		if err := mnt.CheckBeginWrite(); err != nil {
 			return err
@@ -122,7 +122,10 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn
 	}
 
 	fd.refs = 1
-	fd.statusFlags = statusFlags
+
+	// Remove "file creation flags" to mirror the behavior from file.f_flags in
+	// fs/open.c:do_dentry_open
+	fd.statusFlags = flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC)
 	fd.vd = VirtualDentry{
 		mount:  mnt,
 		dentry: d,
@@ -130,7 +133,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn
 	mnt.IncRef()
 	d.IncRef()
 	fd.opts = *opts
-	fd.readable = MayReadFileWithOpenFlags(statusFlags)
+	fd.readable = MayReadFileWithOpenFlags(flags)
 	fd.writable = writable
 	fd.impl = impl
 	return nil
-- 
cgit v1.2.3


From 0e013d8b00dbc3ad96e98bc0405ec2e21887308e Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 21 Apr 2020 16:54:08 -0700
Subject: Don't ignore override if it is longer than layerStates

PiperOrigin-RevId: 307708653
---
 test/packetimpact/testbench/connections.go | 33 +++++++++++++++-----
 test/packetimpact/testbench/layers_test.go | 50 ++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 8 deletions(-)

diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index f84fd8ba7..00a366894 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -363,16 +363,33 @@ type Connection struct {
 // reverse is never a match. override overrides the default matchers for each
 // Layer.
 func (conn *Connection) match(override, received Layers) bool {
-	if len(received) < len(conn.layerStates) {
+	var layersToMatch int
+	if len(override) < len(conn.layerStates) {
+		layersToMatch = len(conn.layerStates)
+	} else {
+		layersToMatch = len(override)
+	}
+	if len(received) < layersToMatch {
 		return false
 	}
-	for i, s := range conn.layerStates {
-		toMatch := s.incoming(received[i])
-		if toMatch == nil {
-			return false
-		}
-		if i < len(override) {
-			toMatch.merge(override[i])
+	for i := 0; i < layersToMatch; i++ {
+		var toMatch Layer
+		if i < len(conn.layerStates) {
+			s := conn.layerStates[i]
+			toMatch = s.incoming(received[i])
+			if toMatch == nil {
+				return false
+			}
+			if i < len(override) {
+				if err := toMatch.merge(override[i]); err != nil {
+					conn.t.Fatalf("failed to merge: %s", err)
+				}
+			}
+		} else {
+			toMatch = override[i]
+			if toMatch == nil {
+				conn.t.Fatalf("expect the overriding layers to be non-nil")
+			}
 		}
 		if !toMatch.match(received[i]) {
 			return false
diff --git a/test/packetimpact/testbench/layers_test.go b/test/packetimpact/testbench/layers_test.go
index b32efda93..c99cf6312 100644
--- a/test/packetimpact/testbench/layers_test.go
+++ b/test/packetimpact/testbench/layers_test.go
@@ -154,3 +154,53 @@ func TestLayerStringFormat(t *testing.T) {
 		})
 	}
 }
+
+func TestConnectionMatch(t *testing.T) {
+	conn := Connection{
+		layerStates: []layerState{&etherState{}},
+	}
+	protoNum0 := tcpip.NetworkProtocolNumber(0)
+	protoNum1 := tcpip.NetworkProtocolNumber(1)
+	for _, tt := range []struct {
+		description        string
+		override, received Layers
+		wantMatch          bool
+	}{
+		{
+			description: "shorter override",
+			override:    []Layer{&Ether{}},
+			received:    []Layer{&Ether{}, &Payload{Bytes: []byte("hello")}},
+			wantMatch:   true,
+		},
+		{
+			description: "longer override",
+			override:    []Layer{&Ether{}, &Payload{Bytes: []byte("hello")}},
+			received:    []Layer{&Ether{}},
+			wantMatch:   false,
+		},
+		{
+			description: "ether layer mismatch",
+			override:    []Layer{&Ether{Type: &protoNum0}},
+			received:    []Layer{&Ether{Type: &protoNum1}},
+			wantMatch:   false,
+		},
+		{
+			description: "both nil",
+			override:    nil,
+			received:    nil,
+			wantMatch:   false,
+		},
+		{
+			description: "nil override",
+			override:    nil,
+			received:    []Layer{&Ether{}},
+			wantMatch:   true,
+		},
+	} {
+		t.Run(tt.description, func(t *testing.T) {
+			if gotMatch := conn.match(tt.override, tt.received); gotMatch != tt.wantMatch {
+				t.Fatalf("conn.match(%s, %s) = %t, want %t", tt.override, tt.received, gotMatch, tt.wantMatch)
+			}
+		})
+	}
+}
-- 
cgit v1.2.3


From 80d0a958199cc6095e2d580e403d50ac1c3b5206 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 21 Apr 2020 17:58:43 -0700
Subject: Update gofer.filesystem.BoundEndpointAt() to allow path resolution.

Even though BoundEndpointAt is not yet implemented for gofer fs, allow path
resolution errors to be returned so that we can jump to tmpfs, where it is
implemented.

Updates #1476.

PiperOrigin-RevId: 307718335
---
 pkg/sentry/fsimpl/gofer/filesystem.go | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 43e863c61..eba4aabe8 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -1089,9 +1089,15 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 }
 
 // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
-//
-// TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt.
 func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	_, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	// TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt.
 	return nil, syserror.ECONNREFUSED
 }
 
-- 
cgit v1.2.3


From 5e3596a6b8abb4c7ee8253be447b86a7b0fad7ad Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 21 Apr 2020 19:01:51 -0700
Subject: Fix set/getsockopt in vfs2 override.

Updates #1476.

PiperOrigin-RevId: 307726055
---
 pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
index 21eb98444..74920f785 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -58,8 +58,8 @@ func Override(table map[uintptr]kernel.Syscall) {
 	table[51] = syscalls.PartiallySupported("getsockname", GetSockName, "In process of porting socket syscalls to VFS2.", nil)
 	table[52] = syscalls.PartiallySupported("getpeername", GetPeerName, "In process of porting socket syscalls to VFS2.", nil)
 	table[53] = syscalls.PartiallySupported("socketpair", SocketPair, "In process of porting socket syscalls to VFS2.", nil)
-	table[54] = syscalls.PartiallySupported("getsockopt", GetSockOpt, "In process of porting socket syscalls to VFS2.", nil)
-	table[55] = syscalls.PartiallySupported("setsockopt", SetSockOpt, "In process of porting socket syscalls to VFS2.", nil)
+	table[54] = syscalls.PartiallySupported("setsockopt", SetSockOpt, "In process of porting socket syscalls to VFS2.", nil)
+	table[55] = syscalls.PartiallySupported("getsockopt", GetSockOpt, "In process of porting socket syscalls to VFS2.", nil)
 	table[59] = syscalls.Supported("execve", Execve)
 	table[72] = syscalls.Supported("fcntl", Fcntl)
 	delete(table, 73) // flock
-- 
cgit v1.2.3


From 6d23673e10bca2fb573809ff78506fc0566817dd Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 22 Apr 2020 07:27:34 -0700
Subject: Add comments about deepcopy in Layer.incoming()

PiperOrigin-RevId: 307812340
---
 test/packetimpact/testbench/connections.go | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index 00a366894..952a717e0 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -72,7 +72,8 @@ type layerState interface {
 	// incoming creates an expected Layer for comparing against a received Layer.
 	// Because the expectation can depend on values in the received Layer, it is
 	// an input to incoming. For example, the ACK number needs to be checked in a
-	// TCP packet but only if the ACK flag is set in the received packet.
+	// TCP packet but only if the ACK flag is set in the received packet. The
+	// calles takes ownership of the returned Layer.
 	incoming(received Layer) Layer
 
 	// sent updates the layerState based on the Layer that was sent. The input is
@@ -124,6 +125,7 @@ func (s *etherState) outgoing() Layer {
 	return &s.out
 }
 
+// incoming implements layerState.incoming.
 func (s *etherState) incoming(Layer) Layer {
 	return deepcopy.Copy(&s.in).(Layer)
 }
@@ -168,6 +170,7 @@ func (s *ipv4State) outgoing() Layer {
 	return &s.out
 }
 
+// incoming implements layerState.incoming.
 func (s *ipv4State) incoming(Layer) Layer {
 	return deepcopy.Copy(&s.in).(Layer)
 }
@@ -234,6 +237,7 @@ func (s *tcpState) outgoing() Layer {
 	return &newOutgoing
 }
 
+// incoming implements layerState.incoming.
 func (s *tcpState) incoming(received Layer) Layer {
 	tcpReceived, ok := received.(*TCP)
 	if !ok {
@@ -328,6 +332,7 @@ func (s *udpState) outgoing() Layer {
 	return &s.out
 }
 
+// incoming implements layerState.incoming.
 func (s *udpState) incoming(Layer) Layer {
 	return deepcopy.Copy(&s.in).(Layer)
 }
-- 
cgit v1.2.3


From a27d6329df96d50b5b5cd4b550ca024cc3f0b16c Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 22 Apr 2020 10:13:40 -0700
Subject: Remove unnecessary kokoro configurations.

PiperOrigin-RevId: 307841689
---
 kokoro/benchmark_tests.cfg            | 26 -----------------
 kokoro/build.cfg                      | 24 ----------------
 kokoro/build_tests.cfg                |  1 -
 kokoro/common.cfg                     | 29 -------------------
 kokoro/do_tests.cfg                   |  9 ------
 kokoro/docker_tests.cfg               | 10 -------
 kokoro/go.cfg                         | 20 -------------
 kokoro/go_tests.cfg                   |  1 -
 kokoro/hostnet_tests.cfg              | 10 -------
 kokoro/iptables_tests.cfg             | 10 -------
 kokoro/issue_reviver.cfg              | 15 ----------
 kokoro/kvm_tests.cfg                  | 10 -------
 kokoro/kythe/generate_xrefs.cfg       | 29 -------------------
 kokoro/kythe/generate_xrefs.sh        | 54 -----------------------------------
 kokoro/make_tests.cfg                 |  9 ------
 kokoro/overlay_tests.cfg              | 10 -------
 kokoro/packetdrill_tests.cfg          |  9 ------
 kokoro/packetimpact_tests.cfg         |  9 ------
 kokoro/release.cfg                    | 15 ----------
 kokoro/root_tests.cfg                 | 10 -------
 kokoro/runtime_tests.cfg              |  1 -
 kokoro/runtime_tests/go1.12.cfg       | 16 -----------
 kokoro/runtime_tests/java11.cfg       | 16 -----------
 kokoro/runtime_tests/nodejs12.4.0.cfg | 16 -----------
 kokoro/runtime_tests/php7.3.6.cfg     | 16 -----------
 kokoro/runtime_tests/python3.7.3.cfg  | 16 -----------
 kokoro/runtime_tests/runtime_tests.sh | 29 -------------------
 kokoro/simple_tests.cfg               |  9 ------
 kokoro/swgso_tests.cfg                |  9 ------
 kokoro/syscall_kvm_tests.cfg          |  9 ------
 kokoro/syscall_tests.cfg              |  9 ------
 scripts/benchmarks.sh                 | 53 ----------------------------------
 32 files changed, 509 deletions(-)
 delete mode 100644 kokoro/benchmark_tests.cfg
 delete mode 100644 kokoro/build.cfg
 delete mode 100644 kokoro/build_tests.cfg
 delete mode 100644 kokoro/common.cfg
 delete mode 100644 kokoro/do_tests.cfg
 delete mode 100644 kokoro/docker_tests.cfg
 delete mode 100644 kokoro/go.cfg
 delete mode 100644 kokoro/go_tests.cfg
 delete mode 100644 kokoro/hostnet_tests.cfg
 delete mode 100644 kokoro/iptables_tests.cfg
 delete mode 100644 kokoro/issue_reviver.cfg
 delete mode 100644 kokoro/kvm_tests.cfg
 delete mode 100644 kokoro/kythe/generate_xrefs.cfg
 delete mode 100644 kokoro/kythe/generate_xrefs.sh
 delete mode 100644 kokoro/make_tests.cfg
 delete mode 100644 kokoro/overlay_tests.cfg
 delete mode 100644 kokoro/packetdrill_tests.cfg
 delete mode 100644 kokoro/packetimpact_tests.cfg
 delete mode 100644 kokoro/release.cfg
 delete mode 100644 kokoro/root_tests.cfg
 delete mode 100644 kokoro/runtime_tests.cfg
 delete mode 100644 kokoro/runtime_tests/go1.12.cfg
 delete mode 100644 kokoro/runtime_tests/java11.cfg
 delete mode 100644 kokoro/runtime_tests/nodejs12.4.0.cfg
 delete mode 100644 kokoro/runtime_tests/php7.3.6.cfg
 delete mode 100644 kokoro/runtime_tests/python3.7.3.cfg
 delete mode 100755 kokoro/runtime_tests/runtime_tests.sh
 delete mode 100644 kokoro/simple_tests.cfg
 delete mode 100644 kokoro/swgso_tests.cfg
 delete mode 100644 kokoro/syscall_kvm_tests.cfg
 delete mode 100644 kokoro/syscall_tests.cfg
 delete mode 100755 scripts/benchmarks.sh

diff --git a/kokoro/benchmark_tests.cfg b/kokoro/benchmark_tests.cfg
deleted file mode 100644
index f85cc9681..000000000
--- a/kokoro/benchmark_tests.cfg
+++ /dev/null
@@ -1,26 +0,0 @@
-build_file : 'repo/scripts/benchmark.sh'
-
-
-before_action {
-  fetch_keystore {
-    keystore_resource {
-        keystore_config_id : 73898
-        keyname : 'gvisor-benchmarks-service-account'
-    },
-  }
-}
-
-env_vars {
-  key : 'PROJECT'
-  value : 'gvisor-benchmarks'
-}
-
-env_vars {
-  key : 'ZONE'
-  value : 'us-central1-b'
-}
-
-env_vars {
-  key : 'GCLOUD_CREDENTIALS'
-  value : '73898_gvisor-benchmarks-service-account'
-}
diff --git a/kokoro/build.cfg b/kokoro/build.cfg
deleted file mode 100644
index c9ceda947..000000000
--- a/kokoro/build.cfg
+++ /dev/null
@@ -1,24 +0,0 @@
-build_file: "repo/scripts/build.sh"
-
-before_action {
-  fetch_keystore {
-    keystore_resource {
-      keystore_config_id: 73898
-      keyname: "kokoro-repo-key"
-    }
-  }
-}
-
-env_vars {
-  key: "KOKORO_REPO_KEY"
-  value: "73898_kokoro-repo-key"
-}
-
-action {
-  define_artifacts {
-    regex: "**/runsc"
-    regex: "**/runsc.*"
-    regex: "**/dists/**"
-    regex: "**/pool/**"
-  }
-}
diff --git a/kokoro/build_tests.cfg b/kokoro/build_tests.cfg
deleted file mode 100644
index c64b7e679..000000000
--- a/kokoro/build_tests.cfg
+++ /dev/null
@@ -1 +0,0 @@
-build_file: "repo/scripts/build.sh"
diff --git a/kokoro/common.cfg b/kokoro/common.cfg
deleted file mode 100644
index 669a2e458..000000000
--- a/kokoro/common.cfg
+++ /dev/null
@@ -1,29 +0,0 @@
-# Give Kokoro access to Remote Build Executor (RBE) service account key.
-before_action {
-  fetch_keystore {
-    keystore_resource {
-      keystore_config_id: 73898
-      keyname: "kokoro-rbe-service-account"
-    }
-  }
-}
-
-# Configure bazel to access RBE.
-bazel_setting {
-  # Our GCP project name.
-  project_id: "gvisor-rbe"
-
-  # Use RBE for execution as well as caching.
-  local_execution: false
-
-  # This must match the values in the job config.
-  auth_credential: {
-    keystore_config_id: 73898
-    keyname: "kokoro-rbe-service-account"
-  }
-
-  # Do not change unless you know what you are doing.
-  bes_backend_address: "buildeventservice.googleapis.com"
-  foundry_backend_address: "remotebuildexecution.googleapis.com"
-  upsalite_frontend_address: "https://source.cloud.google.com"
-}
diff --git a/kokoro/do_tests.cfg b/kokoro/do_tests.cfg
deleted file mode 100644
index b45ec0b42..000000000
--- a/kokoro/do_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/do_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/kokoro/docker_tests.cfg b/kokoro/docker_tests.cfg
deleted file mode 100644
index 0a0ef87ed..000000000
--- a/kokoro/docker_tests.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-build_file: "repo/scripts/docker_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc_logs_*.tar.gz"
-  }
-}
diff --git a/kokoro/go.cfg b/kokoro/go.cfg
deleted file mode 100644
index b9c1fcb12..000000000
--- a/kokoro/go.cfg
+++ /dev/null
@@ -1,20 +0,0 @@
-build_file: "repo/scripts/go.sh"
-
-before_action {
-  fetch_keystore {
-    keystore_resource {
-      keystore_config_id: 73898
-      keyname: "kokoro-github-access-token"
-    }
-  }
-}
-
-env_vars {
-  key: "KOKORO_GITHUB_ACCESS_TOKEN"
-  value: "73898_kokoro-github-access-token"
-}
-
-env_vars {
-  key: "KOKORO_GO_PUSH"
-  value: "true"
-}
diff --git a/kokoro/go_tests.cfg b/kokoro/go_tests.cfg
deleted file mode 100644
index 5eb51041a..000000000
--- a/kokoro/go_tests.cfg
+++ /dev/null
@@ -1 +0,0 @@
-build_file: "repo/scripts/go.sh"
diff --git a/kokoro/hostnet_tests.cfg b/kokoro/hostnet_tests.cfg
deleted file mode 100644
index 520dc55a3..000000000
--- a/kokoro/hostnet_tests.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-build_file: "repo/scripts/hostnet_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc_logs_*.tar.gz"
-  }
-}
diff --git a/kokoro/iptables_tests.cfg b/kokoro/iptables_tests.cfg
deleted file mode 100644
index a30d82591..000000000
--- a/kokoro/iptables_tests.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-build_file: "repo/scripts/iptables_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc_logs_*.tar.gz"
-  }
-}
diff --git a/kokoro/issue_reviver.cfg b/kokoro/issue_reviver.cfg
deleted file mode 100644
index 2370d9250..000000000
--- a/kokoro/issue_reviver.cfg
+++ /dev/null
@@ -1,15 +0,0 @@
-build_file: "repo/scripts/issue_reviver.sh"
-
-before_action {
-  fetch_keystore {
-    keystore_resource {
-      keystore_config_id: 73898
-      keyname: "kokoro-github-access-token"
-    }
-  }
-}
-
-env_vars {
-  key: "KOKORO_GITHUB_ACCESS_TOKEN"
-  value: "73898_kokoro-github-access-token"
-}
diff --git a/kokoro/kvm_tests.cfg b/kokoro/kvm_tests.cfg
deleted file mode 100644
index 1feb60c8a..000000000
--- a/kokoro/kvm_tests.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-build_file: "repo/scripts/kvm_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc_logs_*.tar.gz"
-  }
-}
diff --git a/kokoro/kythe/generate_xrefs.cfg b/kokoro/kythe/generate_xrefs.cfg
deleted file mode 100644
index ccf657983..000000000
--- a/kokoro/kythe/generate_xrefs.cfg
+++ /dev/null
@@ -1,29 +0,0 @@
-build_file: "gvisor/kokoro/kythe/generate_xrefs.sh"
-
-before_action {
-  fetch_keystore {
-    keystore_resource {
-      keystore_config_id: 73898
-      keyname: "kokoro-rbe-service-account"
-    }
-  }
-}
-
-bazel_setting {
-  project_id: "gvisor-rbe"
-  local_execution: false
-  auth_credential: {
-    keystore_config_id: 73898
-    keyname: "kokoro-rbe-service-account"
-  }
-  bes_backend_address: "buildeventservice.googleapis.com"
-  foundry_backend_address: "remotebuildexecution.googleapis.com"
-  upsalite_frontend_address: "https://source.cloud.google.com"
-}
-
-action {
-  define_artifacts {
-    regex: "**/*.kzip"
-    fail_if_no_artifacts: true
-  }
-}
diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
deleted file mode 100644
index 323b0f77b..000000000
--- a/kokoro/kythe/generate_xrefs.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -ex
-
-if command -v use_bazel.sh >/dev/null; then
-  use_bazel.sh latest
-fi
-bazel version
-
-python3 -V
-
-readonly KYTHE_VERSION='v0.0.41'
-readonly WORKDIR="$(mktemp -d)"
-readonly KYTHE_DIR="${WORKDIR}/kythe-${KYTHE_VERSION}"
-if [[ -n "$KOKORO_GIT_COMMIT" ]]; then
-  readonly KZIP_FILENAME="${KOKORO_ARTIFACTS_DIR}/${KOKORO_GIT_COMMIT}.kzip"
-else
-  readonly KZIP_FILENAME="$(git rev-parse HEAD).kzip"
-fi
-
-wget -q -O "${WORKDIR}/kythe.tar.gz" \
-  "https://github.com/kythe/kythe/releases/download/${KYTHE_VERSION}/kythe-${KYTHE_VERSION}.tar.gz"
-tar --no-same-owner -xzf "${WORKDIR}/kythe.tar.gz" --directory "$WORKDIR"
-
-if [[ -n "$KOKORO_ARTIFACTS_DIR" ]]; then
-  cd "${KOKORO_ARTIFACTS_DIR}/github/gvisor"
-fi
-bazel \
-  --bazelrc="${KYTHE_DIR}/extractors.bazelrc" \
-  build \
-  --override_repository kythe_release="${KYTHE_DIR}" \
-  --define=kythe_corpus=github.com/google/gvisor \
-  --cxxopt=-std=c++17 \
-  --config=remote \
-  --auth_credentials="${KOKORO_BAZEL_AUTH_CREDENTIAL}" \
-  //...
-
-"${KYTHE_DIR}/tools/kzip" merge \
-  --output "$KZIP_FILENAME" \
-  $(find -L bazel-out/*/extra_actions/ -name '*.kzip')
diff --git a/kokoro/make_tests.cfg b/kokoro/make_tests.cfg
deleted file mode 100644
index d973130ff..000000000
--- a/kokoro/make_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/make_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/kokoro/overlay_tests.cfg b/kokoro/overlay_tests.cfg
deleted file mode 100644
index 6a2ddbd03..000000000
--- a/kokoro/overlay_tests.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-build_file: "repo/scripts/overlay_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc_logs_*.tar.gz"
-  }
-}
diff --git a/kokoro/packetdrill_tests.cfg b/kokoro/packetdrill_tests.cfg
deleted file mode 100644
index 258d7deb4..000000000
--- a/kokoro/packetdrill_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/packetdrill_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/kokoro/packetimpact_tests.cfg b/kokoro/packetimpact_tests.cfg
deleted file mode 100644
index db86b52d5..000000000
--- a/kokoro/packetimpact_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/packetimpact_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/kokoro/release.cfg b/kokoro/release.cfg
deleted file mode 100644
index 5cec1790a..000000000
--- a/kokoro/release.cfg
+++ /dev/null
@@ -1,15 +0,0 @@
-build_file: "repo/scripts/release.sh"
-
-before_action {
-  fetch_keystore {
-    keystore_resource {
-      keystore_config_id: 73898
-      keyname: "kokoro-github-access-token"
-    }
-  }
-}
-
-env_vars {
-  key: "KOKORO_GITHUB_ACCESS_TOKEN"
-  value: "73898_kokoro-github-access-token"
-}
diff --git a/kokoro/root_tests.cfg b/kokoro/root_tests.cfg
deleted file mode 100644
index 28351695c..000000000
--- a/kokoro/root_tests.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-build_file: "repo/scripts/root_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc_logs_*.tar.gz"
-  }
-}
diff --git a/kokoro/runtime_tests.cfg b/kokoro/runtime_tests.cfg
deleted file mode 100644
index 7d56d5aca..000000000
--- a/kokoro/runtime_tests.cfg
+++ /dev/null
@@ -1 +0,0 @@
-build_file: "repo/scripts/runtime_tests.sh"
diff --git a/kokoro/runtime_tests/go1.12.cfg b/kokoro/runtime_tests/go1.12.cfg
deleted file mode 100644
index 04bfe2868..000000000
--- a/kokoro/runtime_tests/go1.12.cfg
+++ /dev/null
@@ -1,16 +0,0 @@
-build_file: "github/github/scripts/runtime_tests.sh"
-
-env_vars {
-  key: "RUNTIME_TEST_NAME"
-  value: "go1.12"
-}
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc"
-    regex: "**/runsc.*"
-  }
-}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/java11.cfg b/kokoro/runtime_tests/java11.cfg
deleted file mode 100644
index c82855cd2..000000000
--- a/kokoro/runtime_tests/java11.cfg
+++ /dev/null
@@ -1,16 +0,0 @@
-build_file: "github/github/scripts/runtime_tests.sh"
-
-env_vars {
-  key: "RUNTIME_TEST_NAME"
-  value: "java11"
-}
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc"
-    regex: "**/runsc.*"
-  }
-}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/nodejs12.4.0.cfg b/kokoro/runtime_tests/nodejs12.4.0.cfg
deleted file mode 100644
index 5512db5df..000000000
--- a/kokoro/runtime_tests/nodejs12.4.0.cfg
+++ /dev/null
@@ -1,16 +0,0 @@
-build_file: "github/github/scripts/runtime_tests.sh"
-
-env_vars {
-  key: "RUNTIME_TEST_NAME"
-  value: "nodejs12.4.0"
-}
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc"
-    regex: "**/runsc.*"
-  }
-}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/php7.3.6.cfg b/kokoro/runtime_tests/php7.3.6.cfg
deleted file mode 100644
index bc9ac92aa..000000000
--- a/kokoro/runtime_tests/php7.3.6.cfg
+++ /dev/null
@@ -1,16 +0,0 @@
-build_file: "github/github/scripts/runtime_tests.sh"
-
-env_vars {
-  key: "RUNTIME_TEST_NAME"
-  value: "php7.3.6"
-}
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc"
-    regex: "**/runsc.*"
-  }
-}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/python3.7.3.cfg b/kokoro/runtime_tests/python3.7.3.cfg
deleted file mode 100644
index 12eb13860..000000000
--- a/kokoro/runtime_tests/python3.7.3.cfg
+++ /dev/null
@@ -1,16 +0,0 @@
-build_file: "github/github/scripts/runtime_tests.sh"
-
-env_vars {
-  key: "RUNTIME_TEST_NAME"
-  value: "python3.7.3"
-}
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc"
-    regex: "**/runsc.*"
-  }
-}
\ No newline at end of file
diff --git a/kokoro/runtime_tests/runtime_tests.sh b/kokoro/runtime_tests/runtime_tests.sh
deleted file mode 100755
index 73a58f806..000000000
--- a/kokoro/runtime_tests/runtime_tests.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Run in the root of the repo.
-cd "$(dirname "$0")"
-cd "$(git rev-parse --show-toplevel)"
-
-source scripts/common.sh
-
-if [ ! -v RUNTIME_TEST_NAME ]; then
-  echo 'Must set $RUNTIME_TEST_NAME' >&2
-  exit 1
-fi
-
-install_runsc_for_test runtimes
-test_runsc "//test/runtimes:${RUNTIME_TEST_NAME}_test"
diff --git a/kokoro/simple_tests.cfg b/kokoro/simple_tests.cfg
deleted file mode 100644
index 32e0a9431..000000000
--- a/kokoro/simple_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/simple_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/kokoro/swgso_tests.cfg b/kokoro/swgso_tests.cfg
deleted file mode 100644
index 101a9c607..000000000
--- a/kokoro/swgso_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/swgso_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/kokoro/syscall_kvm_tests.cfg b/kokoro/syscall_kvm_tests.cfg
deleted file mode 100644
index 3b99e9c13..000000000
--- a/kokoro/syscall_kvm_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/syscall_kvm_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/kokoro/syscall_tests.cfg b/kokoro/syscall_tests.cfg
deleted file mode 100644
index ee6e4a3a4..000000000
--- a/kokoro/syscall_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/syscall_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/scripts/benchmarks.sh b/scripts/benchmarks.sh
deleted file mode 100755
index 6b9065b07..000000000
--- a/scripts/benchmarks.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#!/usr/bin/env bash
-
-if [ "$#" -lt "1" ]; then
-  echo "usage: $0 <--mock |--env=<filename>> ..."
-  echo "example: $0 --mock --runs=8"
-  exit 1
-fi
-
-source $(dirname $0)/common.sh
-
-readonly TIMESTAMP=`date "+%Y%m%d-%H%M%S"`
-readonly OUTDIR="$(mktemp --tmpdir -d run-${TIMESTAMP}-XXX)"
-readonly DEFAULT_RUNTIMES="--runtime=runc --runtime=runsc --runtime=runsc-kvm"
-readonly ALL_RUNTIMES="--runtime=runc --runtime=runsc --runtime=runsc-kvm"
-
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'fio.(read|write)' --metric=bandwidth --size=5g --ioengine=sync --blocksize=1m > "${OUTDIR}/fio.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} fio.rand --metric=bandwidth --size=5g --ioengine=sync --blocksize=4k --time=30 > "${OUTDIR}/tmp_fio.csv"
-cat "${OUTDIR}/tmp_fio.csv" | grep "\(runc\|runsc\)" >> "${OUTDIR}/fio.csv" && rm "${OUTDIR}/tmp_fio.csv"
-
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'fio.(read|write)' --metric=bandwidth --tmpfs=True --size=5g --ioengine=sync --blocksize=1m > "${OUTDIR}/fio-tmpfs.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} fio.rand --metric=bandwidth --tmpfs=True --size=5g --ioengine=sync --blocksize=4k --time=30 > "${OUTDIR}/tmp_fio.csv"
-cat "${OUTDIR}/tmp_fio.csv" | grep "\(runc\|runsc\)" >> "${OUTDIR}/fio-tmpfs.csv" && rm "${OUTDIR}/tmp_fio.csv"
-
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} startup --count=50  >  "${OUTDIR}/startup.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} density > "${OUTDIR}/density.csv"
-
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} sysbench.cpu --threads=1 --max_prime=50000 --options='--max-time=5' > "${OUTDIR}/sysbench-cpu.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} sysbench.memory --threads=1 --options='--memory-block-size=1M --memory-total-size=500G'  > "${OUTDIR}/sysbench-memory.csv"
-run //benchmarks:perf -- run "$@" ${ALL_RUNTIMES} syscall > "${OUTDIR}/syscall.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'network.(upload|download)' --runs=20 > "${OUTDIR}/iperf.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} ml.tensorflow > "${OUTDIR}/tensorflow.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} media.ffmpeg > "${OUTDIR}/ffmpeg.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} http.httpd --path=latin100k.txt --connections=1 --connections=5 --connections=10 --connections=25 > "${OUTDIR}/httpd100k.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} http.httpd --path=latin10240k.txt --connections=1 --connections=5 --connections=10 --connections=25 > "${OUTDIR}/httpd10240k.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} redis > "${OUTDIR}/redis.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'http.(ruby|node)' > "${OUTDIR}/applications.csv"
-
-echo "${OUTPUT}" && exit 0
-- 
cgit v1.2.3


From c31641150d9ee0e4b9c7cf1210c4e89a030a6bd7 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 22 Apr 2020 12:11:38 -0700
Subject: Add GitHub pull request template.

This just provides some sane reminders and ticks a box on the GitHub UI. This
change also cleans up the issue template, as there is already an automatic
link to the repository's security disclosure policy.

PiperOrigin-RevId: 307868833
---
 .github/ISSUE_TEMPLATE/config.yml | 5 +----
 .github/pull_request_template.md  | 5 +++++
 2 files changed, 6 insertions(+), 4 deletions(-)
 create mode 100644 .github/pull_request_template.md

diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
index f42510b1f..772c9a0ac 100644
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -8,7 +8,4 @@ contact_links:
     about: Please see our documentation for debugging tips.
   - name: gVisor User Forum
     url: https://groups.google.com/g/gvisor-users
-    about: Please ask and answer questions here.
-  - name: gVisor Security List
-    url: https://github.com/google/gvisor/blob/master/SECURITY.md
-    about: Please report security vulnerabilities using the process described here.
+    about: Ask and answer general questions here.
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 000000000..264b4e9fa
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,5 @@
+* [ ] Have you followed the guidelines in [CONTRIBUTING.md](../blob/master/CONTRIBUTING.md)?
+* [ ] Have you formatted and linted your code?
+* [ ] Have you added relevant tests?
+* [ ] Have you added appropriate Fixes & Updates references?
+* [ ] If yes, please erase all these lines!
-- 
cgit v1.2.3


From 37f863f62813f76b05979494c1bc2fe102629321 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 22 Apr 2020 14:15:33 -0700
Subject: tcp: handle listen after shutdown properly

Right now, sentry panics in this case:
panic: close of nil channel

goroutine 67 [running]:
pkg/tcpip/transport/tcp/tcp.(*endpoint).listen(0xc0000ce000, 0x9, 0x0)
        pkg/tcpip/transport/tcp/endpoint.go:2208 +0x170
pkg/tcpip/transport/tcp/tcp.(*endpoint).Listen(0xc0000ce000, 0x9, 0xc0003a1ad0)
        pkg/tcpip/transport/tcp/endpoint.go:2179 +0x50

Fixes #2468

PiperOrigin-RevId: 307896725
---
 pkg/tcpip/transport/tcp/endpoint.go         | 43 +++++++++++++++--------------
 pkg/tcpip/transport/tcp/endpoint_state.go   |  5 ++++
 test/syscalls/linux/socket_inet_loopback.cc | 43 +++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+), 20 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 45f2aa78b..07d3e64c8 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2158,8 +2158,6 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
 			//
 			// By not removing this endpoint from the demuxer mapping, we
 			// ensure that any other bind to the same port fails, as on Linux.
-			// TODO(gvisor.dev/issue/2468): We need to enable applications to
-			// start listening on this endpoint again similar to Linux.
 			e.rcvListMu.Lock()
 			e.rcvClosed = true
 			e.rcvListMu.Unlock()
@@ -2188,26 +2186,31 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	e.LockUser()
 	defer e.UnlockUser()
 
-	// Allow the backlog to be adjusted if the endpoint is not shutting down.
-	// When the endpoint shuts down, it sets workerCleanup to true, and from
-	// that point onward, acceptedChan is the responsibility of the cleanup()
-	// method (and should not be touched anywhere else, including here).
-	if e.EndpointState() == StateListen && !e.workerCleanup {
-		// Adjust the size of the channel iff we can fix existing
-		// pending connections into the new one.
+	if e.EndpointState() == StateListen && !e.closed {
 		e.acceptMu.Lock()
 		defer e.acceptMu.Unlock()
-		if len(e.acceptedChan) > backlog {
-			return tcpip.ErrInvalidEndpointState
-		}
-		if cap(e.acceptedChan) == backlog {
-			return nil
-		}
-		origChan := e.acceptedChan
-		e.acceptedChan = make(chan *endpoint, backlog)
-		close(origChan)
-		for ep := range origChan {
-			e.acceptedChan <- ep
+		if e.acceptedChan == nil {
+			// listen is called after shutdown.
+			e.acceptedChan = make(chan *endpoint, backlog)
+			e.shutdownFlags = 0
+			e.rcvListMu.Lock()
+			e.rcvClosed = false
+			e.rcvListMu.Unlock()
+		} else {
+			// Adjust the size of the channel iff we can fix
+			// existing pending connections into the new one.
+			if len(e.acceptedChan) > backlog {
+				return tcpip.ErrInvalidEndpointState
+			}
+			if cap(e.acceptedChan) == backlog {
+				return nil
+			}
+			origChan := e.acceptedChan
+			e.acceptedChan = make(chan *endpoint, backlog)
+			close(origChan)
+			for ep := range origChan {
+				e.acceptedChan <- ep
+			}
 		}
 
 		// Notify any blocked goroutines that they can attempt to
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index c3c692555..8b7562396 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -247,6 +247,11 @@ func (e *endpoint) Resume(s *stack.Stack) {
 			if err := e.Listen(backlog); err != nil {
 				panic("endpoint listening failed: " + err.String())
 			}
+			e.LockUser()
+			if e.shutdownFlags != 0 {
+				e.shutdownLocked(e.shutdownFlags)
+			}
+			e.UnlockUser()
 			listenLoading.Done()
 			tcpip.AsyncLoading.Done()
 		}()
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index d3000dbc6..9400ffaeb 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -319,6 +319,49 @@ TEST_P(SocketInetLoopbackTest, TCPListenUnbound) {
   tcpSimpleConnectTest(listener, connector, false);
 }
 
+TEST_P(SocketInetLoopbackTest, TCPListenShutdownListen) {
+  const auto& param = GetParam();
+
+  const TestAddress& listener = param.listener;
+  const TestAddress& connector = param.connector;
+
+  constexpr int kBacklog = 5;
+
+  // Create the listening socket.
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+  ASSERT_THAT(shutdown(listen_fd.get(), SHUT_RD), SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+
+  for (int i = 0; i < kBacklog; i++) {
+    auto client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    ASSERT_THAT(connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                        connector.addr_len),
+                SyscallSucceeds());
+  }
+  for (int i = 0; i < kBacklog; i++) {
+    ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr), SyscallSucceeds());
+  }
+}
+
 TEST_P(SocketInetLoopbackTest, TCPListenShutdown) {
   auto const& param = GetParam();
 
-- 
cgit v1.2.3


From 0c586946ea26610b87c4ff7bda783a5a9ca11ec0 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 22 Apr 2020 17:48:59 -0700
Subject: Specify a memory file in platform.New().

PiperOrigin-RevId: 307941984
---
 pkg/abi/linux/BUILD                      |  1 +
 pkg/abi/linux/arch_amd64.go              | 23 +++++++++++++++++++++++
 pkg/abi/linux/seccomp.go                 |  7 +++++++
 pkg/flipcall/packet_window_allocator.go  |  4 ++--
 pkg/seccomp/seccomp_unsafe.go            |  9 +--------
 pkg/sentry/kernel/task_run.go            |  1 +
 pkg/sentry/platform/kvm/context.go       |  3 +++
 pkg/sentry/platform/kvm/kvm.go           |  5 +++++
 pkg/sentry/platform/platform.go          | 21 +++++++++++++++++++++
 pkg/sentry/platform/ptrace/ptrace.go     | 13 +++++++++++++
 pkg/sentry/platform/ptrace/subprocess.go |  2 +-
 runsc/cmd/BUILD                          |  2 +-
 runsc/cmd/boot.go                        |  9 +++++++--
 runsc/sandbox/sandbox.go                 | 10 +++++++---
 tools/nogo/config.go                     |  3 +++
 15 files changed, 96 insertions(+), 17 deletions(-)
 create mode 100644 pkg/abi/linux/arch_amd64.go

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 322d1ccc4..59b0e138a 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -10,6 +10,7 @@ go_library(
     name = "linux",
     srcs = [
         "aio.go",
+        "arch_amd64.go",
         "audit.go",
         "bpf.go",
         "capability.go",
diff --git a/pkg/abi/linux/arch_amd64.go b/pkg/abi/linux/arch_amd64.go
new file mode 100644
index 000000000..0be31e755
--- /dev/null
+++ b/pkg/abi/linux/arch_amd64.go
@@ -0,0 +1,23 @@
+// Copyright 2020 The gVisor Authors.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+// Start and end addresses of the vsyscall page.
+const (
+	VSyscallStartAddr uint64 = 0xffffffffff600000
+	VSyscallEndAddr   uint64 = 0xffffffffff601000
+)
diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go
index 4eeb5cd7a..d0607e256 100644
--- a/pkg/abi/linux/seccomp.go
+++ b/pkg/abi/linux/seccomp.go
@@ -63,3 +63,10 @@ func (a BPFAction) String() string {
 func (a BPFAction) Data() uint16 {
 	return uint16(a & SECCOMP_RET_DATA)
 }
+
+// SockFprog is sock_fprog taken from <linux/filter.h>.
+type SockFprog struct {
+	Len    uint16
+	pad    [6]byte
+	Filter *BPFInstruction
+}
diff --git a/pkg/flipcall/packet_window_allocator.go b/pkg/flipcall/packet_window_allocator.go
index ccb918fab..af9cc3d21 100644
--- a/pkg/flipcall/packet_window_allocator.go
+++ b/pkg/flipcall/packet_window_allocator.go
@@ -134,7 +134,7 @@ func (pwa *PacketWindowAllocator) Allocate(size int) (PacketWindowDescriptor, er
 	start := pwa.nextAlloc
 	pwa.nextAlloc = end
 	return PacketWindowDescriptor{
-		FD:     pwa.fd,
+		FD:     pwa.FD(),
 		Offset: start,
 		Length: size,
 	}, nil
@@ -158,7 +158,7 @@ func (pwa *PacketWindowAllocator) ensureFileSize(min int64) error {
 		}
 		newSize = newNewSize
 	}
-	if err := syscall.Ftruncate(pwa.fd, newSize); err != nil {
+	if err := syscall.Ftruncate(pwa.FD(), newSize); err != nil {
 		return fmt.Errorf("ftruncate failed: %v", err)
 	}
 	pwa.fileSize = newSize
diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go
index be328db12..f7e986589 100644
--- a/pkg/seccomp/seccomp_unsafe.go
+++ b/pkg/seccomp/seccomp_unsafe.go
@@ -21,13 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 )
 
-// sockFprog is sock_fprog taken from <linux/filter.h>.
-type sockFprog struct {
-	Len    uint16
-	pad    [6]byte
-	Filter *linux.BPFInstruction
-}
-
 // SetFilter installs the given BPF program.
 //
 // This is safe to call from an afterFork context.
@@ -39,7 +32,7 @@ func SetFilter(instrs []linux.BPFInstruction) syscall.Errno {
 		return errno
 	}
 
-	sockProg := sockFprog{
+	sockProg := linux.SockFprog{
 		Len:    uint16(len(instrs)),
 		Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])),
 	}
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 2ba8d7e63..d654dd997 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -96,6 +96,7 @@ func (t *Task) run(threadID uintptr) {
 			t.tg.liveGoroutines.Done()
 			t.tg.pidns.owner.liveGoroutines.Done()
 			t.tg.pidns.owner.runningGoroutines.Done()
+			t.p.Release()
 
 			// Keep argument alive because stack trace for dead variables may not be correct.
 			runtime.KeepAlive(threadID)
diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go
index c769ac7b4..6507121ea 100644
--- a/pkg/sentry/platform/kvm/context.go
+++ b/pkg/sentry/platform/kvm/context.go
@@ -85,3 +85,6 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*a
 func (c *context) Interrupt() {
 	c.interrupt.NotifyInterrupt()
 }
+
+// Release implements platform.Context.Release().
+func (c *context) Release() {}
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index a9b4af43e..ae813e24e 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -191,6 +191,11 @@ func (*constructor) OpenDevice() (*os.File, error) {
 	return OpenDevice()
 }
 
+// Flags implements platform.Constructor.Flags().
+func (*constructor) Requirements() platform.Requirements {
+	return platform.Requirements{}
+}
+
 func init() {
 	platform.Register("kvm", &constructor{})
 }
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go
index 2ca696382..171513f3f 100644
--- a/pkg/sentry/platform/platform.go
+++ b/pkg/sentry/platform/platform.go
@@ -148,6 +148,9 @@ type Context interface {
 	// Interrupt interrupts a concurrent call to Switch(), causing it to return
 	// ErrContextInterrupt.
 	Interrupt()
+
+	// Release() releases any resources associated with this context.
+	Release()
 }
 
 var (
@@ -353,10 +356,28 @@ func (fr FileRange) String() string {
 	return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End)
 }
 
+// Requirements is used to specify platform specific requirements.
+type Requirements struct {
+	// RequiresCurrentPIDNS indicates that the sandbox has to be started in the
+	// current pid namespace.
+	RequiresCurrentPIDNS bool
+	// RequiresCapSysPtrace indicates that the sandbox has to be started with
+	// the CAP_SYS_PTRACE capability.
+	RequiresCapSysPtrace bool
+}
+
 // Constructor represents a platform type.
 type Constructor interface {
+	// New returns a new platform instance.
+	//
+	// Arguments:
+	//
+	// * deviceFile - the device file (e.g. /dev/kvm for the KVM platform).
 	New(deviceFile *os.File) (Platform, error)
 	OpenDevice() (*os.File, error)
+
+	// Requirements returns platform specific requirements.
+	Requirements() Requirements
 }
 
 // platforms contains all available platform types.
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
index 03adb624b..08d055e05 100644
--- a/pkg/sentry/platform/ptrace/ptrace.go
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -177,6 +177,9 @@ func (c *context) Interrupt() {
 	c.interrupt.NotifyInterrupt()
 }
 
+// Release implements platform.Context.Release().
+func (c *context) Release() {}
+
 // PTrace represents a collection of ptrace subprocesses.
 type PTrace struct {
 	platform.MMapMinAddr
@@ -248,6 +251,16 @@ func (*constructor) OpenDevice() (*os.File, error) {
 	return nil, nil
 }
 
+// Flags implements platform.Constructor.Flags().
+func (*constructor) Requirements() platform.Requirements {
+	// TODO(b/75837838): Also set a new PID namespace so that we limit
+	// access to other host processes.
+	return platform.Requirements{
+		RequiresCapSysPtrace: true,
+		RequiresCurrentPIDNS: true,
+	}
+}
+
 func init() {
 	platform.Register("ptrace", &constructor{})
 }
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index a644609ef..773ddb1ed 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -332,7 +332,7 @@ func (t *thread) unexpectedStubExit() {
 	msg, err := t.getEventMessage()
 	status := syscall.WaitStatus(msg)
 	if status.Signaled() && status.Signal() == syscall.SIGKILL {
-		// SIGKILL can be only sent by an user or OOM-killer. In both
+		// SIGKILL can be only sent by a user or OOM-killer. In both
 		// these cases, we don't need to panic. There is no reasons to
 		// think that something wrong in gVisor.
 		log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid)
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index d0bb4613a..4900fbe16 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -44,13 +44,13 @@ go_library(
         "//pkg/sentry/control",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/platform",
         "//pkg/state",
         "//pkg/state/statefile",
         "//pkg/sync",
         "//pkg/unet",
         "//pkg/urpc",
         "//runsc/boot",
-        "//runsc/boot/platforms",
         "//runsc/console",
         "//runsc/container",
         "//runsc/flag",
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index 0938944a6..4c2ac6ff0 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -25,8 +25,8 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/runsc/boot"
-	"gvisor.dev/gvisor/runsc/boot/platforms"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -183,7 +183,12 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		if caps == nil {
 			caps = &specs.LinuxCapabilities{}
 		}
-		if conf.Platform == platforms.Ptrace {
+
+		gPlatform, err := platform.Lookup(conf.Platform)
+		if err != nil {
+			Fatalf("loading platform: %v", err)
+		}
+		if gPlatform.Requirements().RequiresCapSysPtrace {
 			// Ptrace platform requires extra capabilities.
 			const c = "CAP_SYS_PTRACE"
 			caps.Bounding = append(caps.Bounding, c)
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index e82bcef6f..e4ec16e2f 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -446,9 +446,13 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		nextFD++
 	}
 
-	// If the platform needs a device FD we must pass it in.
-	if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil {
+	gPlatform, err := platform.Lookup(conf.Platform)
+	if err != nil {
 		return err
+	}
+
+	if deviceFile, err := gPlatform.OpenDevice(); err != nil {
+		return fmt.Errorf("opening device file for platform %q: %v", gPlatform, err)
 	} else if deviceFile != nil {
 		defer deviceFile.Close()
 		cmd.ExtraFiles = append(cmd.ExtraFiles, deviceFile)
@@ -539,7 +543,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		{Type: specs.UTSNamespace},
 	}
 
-	if conf.Platform == platforms.Ptrace {
+	if gPlatform.Requirements().RequiresCurrentPIDNS {
 		// TODO(b/75837838): Also set a new PID namespace so that we limit
 		// access to other host processes.
 		log.Infof("Sandbox will be started in the current PID namespace")
diff --git a/tools/nogo/config.go b/tools/nogo/config.go
index 0c4b7dd40..6958fca69 100644
--- a/tools/nogo/config.go
+++ b/tools/nogo/config.go
@@ -103,6 +103,9 @@ var analyzerConfig = map[*analysis.Analyzer]matcher{
 			"pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go", // Special case.
 			"pkg/sentry/platform/safecopy/safecopy_unsafe.go",          // Special case.
 			"pkg/sentry/vfs/mount_unsafe.go",                           // Special case.
+			"pkg/sentry/platform/systrap/stub_unsafe.go",               // Special case.
+			"pkg/sentry/platform/systrap/switchto_google_unsafe.go",    // Special case.
+			"pkg/sentry/platform/systrap/sysmsg_thread_unsafe.go",      // Special case.
 		),
 	),
 	unusedresult.Analyzer: alwaysMatches(),
-- 
cgit v1.2.3


From e69a871c7bd4e4859b0acd8b875171f3ebbaec29 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Wed, 22 Apr 2020 22:17:01 -0700
Subject: Move user home detection to its own library.

PiperOrigin-RevId: 307977689
---
 pkg/sentry/fs/user/BUILD        |  34 ++++++
 pkg/sentry/fs/user/user.go      | 237 +++++++++++++++++++++++++++++++++++++
 pkg/sentry/fs/user/user_test.go | 198 +++++++++++++++++++++++++++++++
 runsc/boot/BUILD                |   5 +-
 runsc/boot/loader.go            |   7 +-
 runsc/boot/user.go              | 234 ------------------------------------
 runsc/boot/user_test.go         | 254 ----------------------------------------
 7 files changed, 474 insertions(+), 495 deletions(-)
 create mode 100644 pkg/sentry/fs/user/BUILD
 create mode 100644 pkg/sentry/fs/user/user.go
 create mode 100644 pkg/sentry/fs/user/user_test.go
 delete mode 100644 runsc/boot/user.go
 delete mode 100644 runsc/boot/user_test.go

diff --git a/pkg/sentry/fs/user/BUILD b/pkg/sentry/fs/user/BUILD
new file mode 100644
index 000000000..f37f979f1
--- /dev/null
+++ b/pkg/sentry/fs/user/BUILD
@@ -0,0 +1,34 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "user",
+    srcs = ["user.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/usermem",
+    ],
+)
+
+go_test(
+    name = "user_test",
+    size = "small",
+    srcs = ["user_test.go"],
+    library = ":user",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/contexttest",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go
new file mode 100644
index 000000000..fe7f67c00
--- /dev/null
+++ b/pkg/sentry/fs/user/user.go
@@ -0,0 +1,237 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package user
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"strconv"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+type fileReader struct {
+	// Ctx is the context for the file reader.
+	Ctx context.Context
+
+	// File is the file to read from.
+	File *fs.File
+}
+
+// Read implements io.Reader.Read.
+func (r *fileReader) Read(buf []byte) (int, error) {
+	n, err := r.File.Readv(r.Ctx, usermem.BytesIOSequence(buf))
+	return int(n), err
+}
+
+// getExecUserHome returns the home directory of the executing user read from
+// /etc/passwd as read from the container filesystem.
+func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.KUID) (string, error) {
+	// The default user home directory to return if no user matching the user
+	// if found in the /etc/passwd found in the image.
+	const defaultHome = "/"
+
+	// Open the /etc/passwd file from the dirent via the root mount namespace.
+	mnsRoot := rootMns.Root()
+	maxTraversals := uint(linux.MaxSymlinkTraversals)
+	dirent, err := rootMns.FindInode(ctx, mnsRoot, nil, "/etc/passwd", &maxTraversals)
+	if err != nil {
+		// NOTE: Ignore errors opening the passwd file. If the passwd file
+		// doesn't exist we will return the default home directory.
+		return defaultHome, nil
+	}
+	defer dirent.DecRef()
+
+	// Check read permissions on the file.
+	if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil {
+		// NOTE: Ignore permissions errors here and return default root dir.
+		return defaultHome, nil
+	}
+
+	// Only open regular files. We don't open other files like named pipes as
+	// they may block and might present some attack surface to the container.
+	// Note that runc does not seem to do this kind of checking.
+	if !fs.IsRegular(dirent.Inode.StableAttr) {
+		return defaultHome, nil
+	}
+
+	f, err := dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Directory: false})
+	if err != nil {
+		return "", err
+	}
+	defer f.DecRef()
+
+	r := &fileReader{
+		Ctx:  ctx,
+		File: f,
+	}
+
+	return findHomeInPasswd(uint32(uid), r, defaultHome)
+}
+
+type fileReaderVFS2 struct {
+	ctx context.Context
+	fd  *vfs.FileDescription
+}
+
+func (r *fileReaderVFS2) Read(buf []byte) (int, error) {
+	n, err := r.fd.Read(r.ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+	return int(n), err
+}
+
+func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.KUID) (string, error) {
+	const defaultHome = "/"
+
+	root := mns.Root()
+	defer root.DecRef()
+
+	creds := auth.CredentialsFromContext(ctx)
+
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse("/etc/passwd"),
+	}
+
+	opts := &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	}
+
+	fd, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, target, opts)
+	if err != nil {
+		return defaultHome, nil
+	}
+	defer fd.DecRef()
+
+	r := &fileReaderVFS2{
+		ctx: ctx,
+		fd:  fd,
+	}
+
+	homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome)
+	if err != nil {
+		return "", err
+	}
+
+	return homeDir, nil
+}
+
+// MaybeAddExecUserHome returns a new slice with the HOME enviroment variable
+// set if the slice does not already contain it, otherwise it returns the
+// original slice unmodified.
+func MaybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
+	// Check if the envv already contains HOME.
+	for _, env := range envv {
+		if strings.HasPrefix(env, "HOME=") {
+			// We have it. Return the original slice unmodified.
+			return envv, nil
+		}
+	}
+
+	// Read /etc/passwd for the user's HOME directory and set the HOME
+	// environment variable as required by POSIX if it is not overridden by
+	// the user.
+	homeDir, err := getExecUserHome(ctx, mns, uid)
+	if err != nil {
+		return nil, fmt.Errorf("error reading exec user: %v", err)
+	}
+
+	return append(envv, "HOME="+homeDir), nil
+}
+
+// MaybeAddExecUserHomeVFS2 returns a new slice with the HOME enviroment
+// variable set if the slice does not already contain it, otherwise it returns
+// the original slice unmodified.
+func MaybeAddExecUserHomeVFS2(ctx context.Context, vmns *vfs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
+	// Check if the envv already contains HOME.
+	for _, env := range envv {
+		if strings.HasPrefix(env, "HOME=") {
+			// We have it. Return the original slice unmodified.
+			return envv, nil
+		}
+	}
+
+	// Read /etc/passwd for the user's HOME directory and set the HOME
+	// environment variable as required by POSIX if it is not overridden by
+	// the user.
+	homeDir, err := getExecUserHomeVFS2(ctx, vmns, uid)
+	if err != nil {
+		return nil, fmt.Errorf("error reading exec user: %v", err)
+	}
+	return append(envv, "HOME="+homeDir), nil
+}
+
+// findHomeInPasswd parses a passwd file and returns the given user's home
+// directory. This function does it's best to replicate the runc's behavior.
+func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) {
+	s := bufio.NewScanner(passwd)
+
+	for s.Scan() {
+		if err := s.Err(); err != nil {
+			return "", err
+		}
+
+		line := strings.TrimSpace(s.Text())
+		if line == "" {
+			continue
+		}
+
+		// Pull out part of passwd entry. Loosely parse the passwd entry as some
+		// passwd files could be poorly written and for compatibility with runc.
+		//
+		// Per 'man 5 passwd'
+		// /etc/passwd contains one line for each user account, with seven
+		// fields delimited by colons (“:”). These fields are:
+		//
+		// - login name
+		// - optional encrypted password
+		// - numerical user ID
+		// - numerical group ID
+		// - user name or comment field
+		// - user home directory
+		// - optional user command interpreter
+		parts := strings.Split(line, ":")
+
+		found := false
+		homeDir := ""
+		for i, p := range parts {
+			switch i {
+			case 2:
+				parsedUID, err := strconv.ParseUint(p, 10, 32)
+				if err == nil && parsedUID == uint64(uid) {
+					found = true
+				}
+			case 5:
+				homeDir = p
+			}
+		}
+		if found {
+			// NOTE: If the uid is present but the home directory is not
+			// present in the /etc/passwd entry we return an empty string. This
+			// is, for better or worse, what runc does.
+			return homeDir, nil
+		}
+	}
+
+	return defaultHome, nil
+}
diff --git a/pkg/sentry/fs/user/user_test.go b/pkg/sentry/fs/user/user_test.go
new file mode 100644
index 000000000..7d8e9ac7c
--- /dev/null
+++ b/pkg/sentry/fs/user/user_test.go
@@ -0,0 +1,198 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package user
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// createEtcPasswd creates /etc/passwd with the given contents and mode. If
+// mode is empty, then no file will be created. If mode is not a regular file
+// mode, then contents is ignored.
+func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode linux.FileMode) error {
+	if err := root.CreateDirectory(ctx, root, "etc", fs.FilePermsFromMode(0755)); err != nil {
+		return err
+	}
+	etc, err := root.Walk(ctx, root, "etc")
+	if err != nil {
+		return err
+	}
+	defer etc.DecRef()
+	switch mode.FileType() {
+	case 0:
+		// Don't create anything.
+		return nil
+	case linux.S_IFREG:
+		passwd, err := etc.Create(ctx, root, "passwd", fs.FileFlags{Write: true}, fs.FilePermsFromMode(mode))
+		if err != nil {
+			return err
+		}
+		defer passwd.DecRef()
+		if _, err := passwd.Writev(ctx, usermem.BytesIOSequence([]byte(contents))); err != nil {
+			return err
+		}
+		return nil
+	case linux.S_IFDIR:
+		return etc.CreateDirectory(ctx, root, "passwd", fs.FilePermsFromMode(mode))
+	case linux.S_IFIFO:
+		return etc.CreateFifo(ctx, root, "passwd", fs.FilePermsFromMode(mode))
+	default:
+		return fmt.Errorf("unknown file type %x", mode.FileType())
+	}
+}
+
+// TestGetExecUserHome tests the getExecUserHome function.
+func TestGetExecUserHome(t *testing.T) {
+	tests := map[string]struct {
+		uid            auth.KUID
+		passwdContents string
+		passwdMode     linux.FileMode
+		expected       string
+	}{
+		"success": {
+			uid:            1000,
+			passwdContents: "adin::1000:1111::/home/adin:/bin/sh",
+			passwdMode:     linux.S_IFREG | 0666,
+			expected:       "/home/adin",
+		},
+		"no_perms": {
+			uid:            1000,
+			passwdContents: "adin::1000:1111::/home/adin:/bin/sh",
+			passwdMode:     linux.S_IFREG,
+			expected:       "/",
+		},
+		"no_passwd": {
+			uid:      1000,
+			expected: "/",
+		},
+		"directory": {
+			uid:        1000,
+			passwdMode: linux.S_IFDIR | 0666,
+			expected:   "/",
+		},
+		// Currently we don't allow named pipes.
+		"named_pipe": {
+			uid:        1000,
+			passwdMode: linux.S_IFIFO | 0666,
+			expected:   "/",
+		},
+	}
+
+	for name, tc := range tests {
+		t.Run(name, func(t *testing.T) {
+			ctx := contexttest.Context(t)
+			msrc := fs.NewPseudoMountSource(ctx)
+			rootInode := tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc)
+
+			mns, err := fs.NewMountNamespace(ctx, rootInode)
+			if err != nil {
+				t.Fatalf("NewMountNamespace failed: %v", err)
+			}
+			defer mns.DecRef()
+			root := mns.Root()
+			defer root.DecRef()
+			ctx = fs.WithRoot(ctx, root)
+
+			if err := createEtcPasswd(ctx, root, tc.passwdContents, tc.passwdMode); err != nil {
+				t.Fatalf("createEtcPasswd failed: %v", err)
+			}
+
+			got, err := getExecUserHome(ctx, mns, tc.uid)
+			if err != nil {
+				t.Fatalf("failed to get user home: %v", err)
+			}
+
+			if got != tc.expected {
+				t.Fatalf("expected %v, got: %v", tc.expected, got)
+			}
+		})
+	}
+}
+
+// TestFindHomeInPasswd tests the findHomeInPasswd function's passwd file parsing.
+func TestFindHomeInPasswd(t *testing.T) {
+	tests := map[string]struct {
+		uid      uint32
+		passwd   string
+		expected string
+		def      string
+	}{
+		"empty": {
+			uid:      1000,
+			passwd:   "",
+			expected: "/",
+			def:      "/",
+		},
+		"whitespace": {
+			uid:      1000,
+			passwd:   "       ",
+			expected: "/",
+			def:      "/",
+		},
+		"full": {
+			uid:      1000,
+			passwd:   "adin::1000:1111::/home/adin:/bin/sh",
+			expected: "/home/adin",
+			def:      "/",
+		},
+		// For better or worse, this is how runc works.
+		"partial": {
+			uid:      1000,
+			passwd:   "adin::1000:1111:",
+			expected: "",
+			def:      "/",
+		},
+		"multiple": {
+			uid:      1001,
+			passwd:   "adin::1000:1111::/home/adin:/bin/sh\nian::1001:1111::/home/ian:/bin/sh",
+			expected: "/home/ian",
+			def:      "/",
+		},
+		"duplicate": {
+			uid:      1000,
+			passwd:   "adin::1000:1111::/home/adin:/bin/sh\nian::1000:1111::/home/ian:/bin/sh",
+			expected: "/home/adin",
+			def:      "/",
+		},
+		"empty_lines": {
+			uid:      1001,
+			passwd:   "adin::1000:1111::/home/adin:/bin/sh\n\n\nian::1001:1111::/home/ian:/bin/sh",
+			expected: "/home/ian",
+			def:      "/",
+		},
+	}
+
+	for name, tc := range tests {
+		t.Run(name, func(t *testing.T) {
+			got, err := findHomeInPasswd(tc.uid, strings.NewReader(tc.passwd), tc.def)
+			if err != nil {
+				t.Fatalf("error parsing passwd: %v", err)
+			}
+			if tc.expected != got {
+				t.Fatalf("expected %v, got: %v", tc.expected, got)
+			}
+		})
+	}
+}
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 5451f1eba..72c2fe381 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -20,7 +20,6 @@ go_library(
         "loader_arm64.go",
         "network.go",
         "strace.go",
-        "user.go",
         "vfs.go",
     ],
     visibility = [
@@ -52,6 +51,7 @@ go_library(
         "//pkg/sentry/fs/sys",
         "//pkg/sentry/fs/tmpfs",
         "//pkg/sentry/fs/tty",
+        "//pkg/sentry/fs/user",
         "//pkg/sentry/fsimpl/devtmpfs",
         "//pkg/sentry/fsimpl/gofer",
         "//pkg/sentry/fsimpl/host",
@@ -97,7 +97,6 @@ go_library(
         "//pkg/tcpip/transport/tcp",
         "//pkg/tcpip/transport/udp",
         "//pkg/urpc",
-        "//pkg/usermem",
         "//runsc/boot/filter",
         "//runsc/boot/platforms",
         "//runsc/boot/pprof",
@@ -115,7 +114,6 @@ go_test(
         "compat_test.go",
         "fs_test.go",
         "loader_test.go",
-        "user_test.go",
     ],
     library = ":boot",
     deps = [
@@ -125,7 +123,6 @@ go_test(
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel",
-        "//pkg/sentry/kernel/auth",
         "//pkg/sync",
         "//pkg/unet",
         "//runsc/fsgofer",
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index cf1f47bc7..096b0e9f0 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -35,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -550,11 +551,11 @@ func (l *Loader) run() error {
 		// Add the HOME enviroment variable if it is not already set.
 		var envv []string
 		if kernel.VFS2Enabled {
-			envv, err = maybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
+			envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
 				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
 
 		} else {
-			envv, err = maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
+			envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
 				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
 		}
 		if err != nil {
@@ -860,7 +861,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 	root := args.MountNamespace.Root()
 	defer root.DecRef()
 	ctx := fs.WithRoot(l.k.SupervisorContext(), root)
-	envv, err := maybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
+	envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
 	if err != nil {
 		return 0, err
 	}
diff --git a/runsc/boot/user.go b/runsc/boot/user.go
deleted file mode 100644
index 332e4fce5..000000000
--- a/runsc/boot/user.go
+++ /dev/null
@@ -1,234 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-import (
-	"bufio"
-	"fmt"
-	"io"
-	"strconv"
-	"strings"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-type fileReader struct {
-	// Ctx is the context for the file reader.
-	Ctx context.Context
-
-	// File is the file to read from.
-	File *fs.File
-}
-
-// Read implements io.Reader.Read.
-func (r *fileReader) Read(buf []byte) (int, error) {
-	n, err := r.File.Readv(r.Ctx, usermem.BytesIOSequence(buf))
-	return int(n), err
-}
-
-// getExecUserHome returns the home directory of the executing user read from
-// /etc/passwd as read from the container filesystem.
-func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.KUID) (string, error) {
-	// The default user home directory to return if no user matching the user
-	// if found in the /etc/passwd found in the image.
-	const defaultHome = "/"
-
-	// Open the /etc/passwd file from the dirent via the root mount namespace.
-	mnsRoot := rootMns.Root()
-	maxTraversals := uint(linux.MaxSymlinkTraversals)
-	dirent, err := rootMns.FindInode(ctx, mnsRoot, nil, "/etc/passwd", &maxTraversals)
-	if err != nil {
-		// NOTE: Ignore errors opening the passwd file. If the passwd file
-		// doesn't exist we will return the default home directory.
-		return defaultHome, nil
-	}
-	defer dirent.DecRef()
-
-	// Check read permissions on the file.
-	if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil {
-		// NOTE: Ignore permissions errors here and return default root dir.
-		return defaultHome, nil
-	}
-
-	// Only open regular files. We don't open other files like named pipes as
-	// they may block and might present some attack surface to the container.
-	// Note that runc does not seem to do this kind of checking.
-	if !fs.IsRegular(dirent.Inode.StableAttr) {
-		return defaultHome, nil
-	}
-
-	f, err := dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Directory: false})
-	if err != nil {
-		return "", err
-	}
-	defer f.DecRef()
-
-	r := &fileReader{
-		Ctx:  ctx,
-		File: f,
-	}
-
-	return findHomeInPasswd(uint32(uid), r, defaultHome)
-}
-
-type fileReaderVFS2 struct {
-	ctx context.Context
-	fd  *vfs.FileDescription
-}
-
-func (r *fileReaderVFS2) Read(buf []byte) (int, error) {
-	n, err := r.fd.Read(r.ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
-	return int(n), err
-}
-
-func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.KUID) (string, error) {
-	const defaultHome = "/"
-
-	root := mns.Root()
-	defer root.DecRef()
-
-	creds := auth.CredentialsFromContext(ctx)
-
-	target := &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse("/etc/passwd"),
-	}
-
-	opts := &vfs.OpenOptions{
-		Flags: linux.O_RDONLY,
-	}
-
-	fd, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, target, opts)
-	if err != nil {
-		return defaultHome, nil
-	}
-	defer fd.DecRef()
-
-	r := &fileReaderVFS2{
-		ctx: ctx,
-		fd:  fd,
-	}
-
-	homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome)
-	if err != nil {
-		return "", err
-	}
-
-	return homeDir, nil
-}
-
-// maybeAddExecUserHome returns a new slice with the HOME enviroment variable
-// set if the slice does not already contain it, otherwise it returns the
-// original slice unmodified.
-func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
-	// Check if the envv already contains HOME.
-	for _, env := range envv {
-		if strings.HasPrefix(env, "HOME=") {
-			// We have it. Return the original slice unmodified.
-			return envv, nil
-		}
-	}
-
-	// Read /etc/passwd for the user's HOME directory and set the HOME
-	// environment variable as required by POSIX if it is not overridden by
-	// the user.
-	homeDir, err := getExecUserHome(ctx, mns, uid)
-	if err != nil {
-		return nil, fmt.Errorf("error reading exec user: %v", err)
-	}
-
-	return append(envv, "HOME="+homeDir), nil
-}
-
-func maybeAddExecUserHomeVFS2(ctx context.Context, vmns *vfs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
-	// Check if the envv already contains HOME.
-	for _, env := range envv {
-		if strings.HasPrefix(env, "HOME=") {
-			// We have it. Return the original slice unmodified.
-			return envv, nil
-		}
-	}
-
-	// Read /etc/passwd for the user's HOME directory and set the HOME
-	// environment variable as required by POSIX if it is not overridden by
-	// the user.
-	homeDir, err := getExecUserHomeVFS2(ctx, vmns, uid)
-	if err != nil {
-		return nil, fmt.Errorf("error reading exec user: %v", err)
-	}
-	return append(envv, "HOME="+homeDir), nil
-}
-
-// findHomeInPasswd parses a passwd file and returns the given user's home
-// directory. This function does it's best to replicate the runc's behavior.
-func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) {
-	s := bufio.NewScanner(passwd)
-
-	for s.Scan() {
-		if err := s.Err(); err != nil {
-			return "", err
-		}
-
-		line := strings.TrimSpace(s.Text())
-		if line == "" {
-			continue
-		}
-
-		// Pull out part of passwd entry. Loosely parse the passwd entry as some
-		// passwd files could be poorly written and for compatibility with runc.
-		//
-		// Per 'man 5 passwd'
-		// /etc/passwd contains one line for each user account, with seven
-		// fields delimited by colons (“:”). These fields are:
-		//
-		// - login name
-		// - optional encrypted password
-		// - numerical user ID
-		// - numerical group ID
-		// - user name or comment field
-		// - user home directory
-		// - optional user command interpreter
-		parts := strings.Split(line, ":")
-
-		found := false
-		homeDir := ""
-		for i, p := range parts {
-			switch i {
-			case 2:
-				parsedUID, err := strconv.ParseUint(p, 10, 32)
-				if err == nil && parsedUID == uint64(uid) {
-					found = true
-				}
-			case 5:
-				homeDir = p
-			}
-		}
-		if found {
-			// NOTE: If the uid is present but the home directory is not
-			// present in the /etc/passwd entry we return an empty string. This
-			// is, for better or worse, what runc does.
-			return homeDir, nil
-		}
-	}
-
-	return defaultHome, nil
-}
diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go
deleted file mode 100644
index fb4e13dfb..000000000
--- a/runsc/boot/user_test.go
+++ /dev/null
@@ -1,254 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-import (
-	"io/ioutil"
-	"os"
-	"path/filepath"
-	"strings"
-	"syscall"
-	"testing"
-
-	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"gvisor.dev/gvisor/pkg/sentry/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-)
-
-func setupTempDir() (string, error) {
-	tmpDir, err := ioutil.TempDir(os.TempDir(), "exec-user-test")
-	if err != nil {
-		return "", err
-	}
-	return tmpDir, nil
-}
-
-func setupPasswd(contents string, perms os.FileMode) func() (string, error) {
-	return func() (string, error) {
-		tmpDir, err := setupTempDir()
-		if err != nil {
-			return "", err
-		}
-
-		if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil {
-			return "", err
-		}
-
-		f, err := os.Create(filepath.Join(tmpDir, "etc", "passwd"))
-		if err != nil {
-			return "", err
-		}
-		defer f.Close()
-
-		_, err = f.WriteString(contents)
-		if err != nil {
-			return "", err
-		}
-
-		err = f.Chmod(perms)
-		if err != nil {
-			return "", err
-		}
-		return tmpDir, nil
-	}
-}
-
-// TestGetExecUserHome tests the getExecUserHome function.
-func TestGetExecUserHome(t *testing.T) {
-	tests := map[string]struct {
-		uid        auth.KUID
-		createRoot func() (string, error)
-		expected   string
-	}{
-		"success": {
-			uid:        1000,
-			createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0666),
-			expected:   "/home/adin",
-		},
-		"no_passwd": {
-			uid:        1000,
-			createRoot: setupTempDir,
-			expected:   "/",
-		},
-		"no_perms": {
-			uid:        1000,
-			createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0000),
-			expected:   "/",
-		},
-		"directory": {
-			uid: 1000,
-			createRoot: func() (string, error) {
-				tmpDir, err := setupTempDir()
-				if err != nil {
-					return "", err
-				}
-
-				if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil {
-					return "", err
-				}
-
-				if err := syscall.Mkdir(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil {
-					return "", err
-				}
-
-				return tmpDir, nil
-			},
-			expected: "/",
-		},
-		// Currently we don't allow named pipes.
-		"named_pipe": {
-			uid: 1000,
-			createRoot: func() (string, error) {
-				tmpDir, err := setupTempDir()
-				if err != nil {
-					return "", err
-				}
-
-				if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil {
-					return "", err
-				}
-
-				if err := syscall.Mkfifo(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil {
-					return "", err
-				}
-
-				return tmpDir, nil
-			},
-			expected: "/",
-		},
-	}
-
-	for name, tc := range tests {
-		t.Run(name, func(t *testing.T) {
-			tmpDir, err := tc.createRoot()
-			if err != nil {
-				t.Fatalf("failed to create root dir: %v", err)
-			}
-
-			sandEnd, cleanup, err := startGofer(tmpDir)
-			if err != nil {
-				t.Fatalf("failed to create gofer: %v", err)
-			}
-			defer cleanup()
-
-			ctx := contexttest.Context(t)
-			conf := &Config{
-				RootDir:        "unused_root_dir",
-				Network:        NetworkNone,
-				DisableSeccomp: true,
-			}
-
-			spec := &specs.Spec{
-				Root: &specs.Root{
-					Path:     tmpDir,
-					Readonly: true,
-				},
-				// Add /proc mount as tmpfs to avoid needing a kernel.
-				Mounts: []specs.Mount{
-					{
-						Destination: "/proc",
-						Type:        "tmpfs",
-					},
-				},
-			}
-
-			mntr := newContainerMounter(spec, []int{sandEnd}, nil, &podMountHints{})
-			mns, err := mntr.createMountNamespace(ctx, conf)
-			if err != nil {
-				t.Fatalf("failed to create mount namespace: %v", err)
-			}
-			ctx = fs.WithRoot(ctx, mns.Root())
-			if err := mntr.mountSubmounts(ctx, conf, mns); err != nil {
-				t.Fatalf("failed to create mount namespace: %v", err)
-			}
-
-			got, err := getExecUserHome(ctx, mns, tc.uid)
-			if err != nil {
-				t.Fatalf("failed to get user home: %v", err)
-			}
-
-			if got != tc.expected {
-				t.Fatalf("expected %v, got: %v", tc.expected, got)
-			}
-		})
-	}
-}
-
-// TestFindHomeInPasswd tests the findHomeInPasswd function's passwd file parsing.
-func TestFindHomeInPasswd(t *testing.T) {
-	tests := map[string]struct {
-		uid      uint32
-		passwd   string
-		expected string
-		def      string
-	}{
-		"empty": {
-			uid:      1000,
-			passwd:   "",
-			expected: "/",
-			def:      "/",
-		},
-		"whitespace": {
-			uid:      1000,
-			passwd:   "       ",
-			expected: "/",
-			def:      "/",
-		},
-		"full": {
-			uid:      1000,
-			passwd:   "adin::1000:1111::/home/adin:/bin/sh",
-			expected: "/home/adin",
-			def:      "/",
-		},
-		// For better or worse, this is how runc works.
-		"partial": {
-			uid:      1000,
-			passwd:   "adin::1000:1111:",
-			expected: "",
-			def:      "/",
-		},
-		"multiple": {
-			uid:      1001,
-			passwd:   "adin::1000:1111::/home/adin:/bin/sh\nian::1001:1111::/home/ian:/bin/sh",
-			expected: "/home/ian",
-			def:      "/",
-		},
-		"duplicate": {
-			uid:      1000,
-			passwd:   "adin::1000:1111::/home/adin:/bin/sh\nian::1000:1111::/home/ian:/bin/sh",
-			expected: "/home/adin",
-			def:      "/",
-		},
-		"empty_lines": {
-			uid:      1001,
-			passwd:   "adin::1000:1111::/home/adin:/bin/sh\n\n\nian::1001:1111::/home/ian:/bin/sh",
-			expected: "/home/ian",
-			def:      "/",
-		},
-	}
-
-	for name, tc := range tests {
-		t.Run(name, func(t *testing.T) {
-			got, err := findHomeInPasswd(tc.uid, strings.NewReader(tc.passwd), tc.def)
-			if err != nil {
-				t.Fatalf("error parsing passwd: %v", err)
-			}
-			if tc.expected != got {
-				t.Fatalf("expected %v, got: %v", tc.expected, got)
-			}
-		})
-	}
-}
-- 
cgit v1.2.3


From a2925a079fa04ff4c891016a0eea1818bdb2cf4b Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Thu, 23 Apr 2020 08:34:42 -0700
Subject: Run failing packetimpact test and expect failure.

This will make it easier to notice if a code change causes an existing test to
pass.

PiperOrigin-RevId: 308057978
---
 test/packetimpact/tests/defs.bzl       | 35 ++++++++++++++++++++++++++--------
 test/packetimpact/tests/test_runner.sh | 19 +++++++++++++++---
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/test/packetimpact/tests/defs.bzl b/test/packetimpact/tests/defs.bzl
index 8c0d058b2..27c5de375 100644
--- a/test/packetimpact/tests/defs.bzl
+++ b/test/packetimpact/tests/defs.bzl
@@ -59,7 +59,11 @@ _packetimpact_test = rule(
 
 PACKETIMPACT_TAGS = ["local", "manual"]
 
-def packetimpact_linux_test(name, testbench_binary, **kwargs):
+def packetimpact_linux_test(
+        name,
+        testbench_binary,
+        expect_failure = False,
+        **kwargs):
     """Add a packetimpact test on linux.
 
     Args:
@@ -67,28 +71,37 @@ def packetimpact_linux_test(name, testbench_binary, **kwargs):
         testbench_binary: the testbench binary
         **kwargs: all the other args, forwarded to _packetimpact_test
     """
+    expect_failure_flag = ["--expect_failure"] if expect_failure else []
     _packetimpact_test(
         name = name + "_linux_test",
         testbench_binary = testbench_binary,
-        flags = ["--dut_platform", "linux"],
+        flags = ["--dut_platform", "linux"] + expect_failure_flag,
         tags = PACKETIMPACT_TAGS + ["packetimpact"],
         **kwargs
     )
 
-def packetimpact_netstack_test(name, testbench_binary, **kwargs):
+def packetimpact_netstack_test(
+        name,
+        testbench_binary,
+        expect_failure = False,
+        **kwargs):
     """Add a packetimpact test on netstack.
 
     Args:
         name: name of the test
         testbench_binary: the testbench binary
+        expect_failure: the test must fail
         **kwargs: all the other args, forwarded to _packetimpact_test
     """
+    expect_failure_flag = []
+    if expect_failure:
+        expect_failure_flag = ["--expect_failure"]
     _packetimpact_test(
         name = name + "_netstack_test",
         testbench_binary = testbench_binary,
         # This is the default runtime unless
         # "--test_arg=--runtime=OTHER_RUNTIME" is used to override the value.
-        flags = ["--dut_platform", "netstack", "--runtime=runsc-d"],
+        flags = ["--dut_platform", "netstack", "--runtime=runsc-d"] + expect_failure_flag,
         tags = PACKETIMPACT_TAGS + ["packetimpact"],
         **kwargs
     )
@@ -112,7 +125,13 @@ def packetimpact_go_test(name, size = "small", pure = True, linux = True, netsta
         tags = PACKETIMPACT_TAGS,
         **kwargs
     )
-    if linux:
-        packetimpact_linux_test(name = name, testbench_binary = testbench_binary)
-    if netstack:
-        packetimpact_netstack_test(name = name, testbench_binary = testbench_binary)
+    packetimpact_linux_test(
+        name = name,
+        expect_failure = not linux,
+        testbench_binary = testbench_binary,
+    )
+    packetimpact_netstack_test(
+        name = name,
+        expect_failure = not netstack,
+        testbench_binary = testbench_binary,
+    )
diff --git a/test/packetimpact/tests/test_runner.sh b/test/packetimpact/tests/test_runner.sh
index e99fc7d09..2be3c17c3 100755
--- a/test/packetimpact/tests/test_runner.sh
+++ b/test/packetimpact/tests/test_runner.sh
@@ -29,7 +29,7 @@ function failure() {
 }
 trap 'failure ${LINENO} "$BASH_COMMAND"' ERR
 
-declare -r LONGOPTS="dut_platform:,posix_server_binary:,testbench_binary:,runtime:,tshark,extra_test_arg:"
+declare -r LONGOPTS="dut_platform:,posix_server_binary:,testbench_binary:,runtime:,tshark,extra_test_arg:,expect_failure"
 
 # Don't use declare below so that the error from getopt will end the script.
 PARSED=$(getopt --options "" --longoptions=$LONGOPTS --name "$0" -- "$@")
@@ -68,6 +68,10 @@ while true; do
       EXTRA_TEST_ARGS+="$2"
       shift 2
       ;;
+    --expect_failure)
+      declare -r EXPECT_FAILURE="1"
+      shift 1
+      ;;
     --)
       shift
       break
@@ -263,6 +267,15 @@ docker exec -t "${TESTBENCH}" \
   --local_ipv4=${TEST_NET_PREFIX}${TESTBENCH_NET_SUFFIX} \
   --remote_mac=${REMOTE_MAC} \
   --local_mac=${LOCAL_MAC} \
-  --device=${TEST_DEVICE}"
-
+  --device=${TEST_DEVICE}" && true
+declare -r TEST_RESULT="${?}"
+if [[ -z "${EXPECT_FAILURE-}" && "${TEST_RESULT}" != 0 ]]; then
+  echo 'FAIL: This test was expected to pass.'
+  exit ${TEST_RESULT}
+fi
+if [[ ! -z "${EXPECT_FAILURE-}" && "${TEST_RESULT}" == 0 ]]; then
+  echo 'FAIL: This test was expected to fail but passed.  Enable the test and' \
+    'mark the corresponding bug as fixed.'
+  exit 1
+fi
 echo PASS: No errors.
-- 
cgit v1.2.3


From 7d1b7daf7e89c99899fc46187bcb1f3a3bcab7fb Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 23 Apr 2020 10:19:23 -0700
Subject: Disable nogo because it breaks Go 1.13

Even though the default build option is to use 1.14, we want to be
want to keep the ability to target different Go versions for testing
and in case the new release has bugs.

PiperOrigin-RevId: 308078876
---
 tools/defs.bzl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/defs.bzl b/tools/defs.bzl
index 6a224d7d5..33240e7f4 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -92,7 +92,7 @@ def go_imports(name, src, out):
         cmd = ("$(location @org_golang_x_tools//cmd/goimports:goimports) $(SRCS) > $@"),
     )
 
-def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, marshal_debug = False, nogo = True, **kwargs):
+def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, marshal_debug = False, nogo = False, **kwargs):
     """Wraps the standard go_library and does stateification and marshalling.
 
     The recommended way is to use this rule with mostly identical configuration as the native
-- 
cgit v1.2.3


From e0c67014cb2200ad58cd28b12fddb3f55652a21b Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 23 Apr 2020 11:06:59 -0700
Subject: Factor fsimpl/gofer.host{Preadv,Pwritev} out of fsimpl/gofer.

Also fix returning EOF when 0 bytes are read.

PiperOrigin-RevId: 308089875
---
 pkg/sentry/fsimpl/gofer/BUILD            |   2 +-
 pkg/sentry/fsimpl/gofer/handle.go        |   5 +-
 pkg/sentry/fsimpl/gofer/handle_unsafe.go |  66 -------------------
 pkg/sentry/fsimpl/host/BUILD             |   3 +-
 pkg/sentry/fsimpl/host/host.go           |  31 ++-------
 pkg/sentry/hostfd/BUILD                  |  17 +++++
 pkg/sentry/hostfd/hostfd.go              |  84 ++++++++++++++++++++++++
 pkg/sentry/hostfd/hostfd_unsafe.go       | 107 +++++++++++++++++++++++++++++++
 8 files changed, 218 insertions(+), 97 deletions(-)
 delete mode 100644 pkg/sentry/fsimpl/gofer/handle_unsafe.go
 create mode 100644 pkg/sentry/hostfd/BUILD
 create mode 100644 pkg/sentry/hostfd/hostfd.go
 create mode 100644 pkg/sentry/hostfd/hostfd_unsafe.go

diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index acd061905..b9c4beee4 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -35,7 +35,6 @@ go_library(
         "fstree.go",
         "gofer.go",
         "handle.go",
-        "handle_unsafe.go",
         "p9file.go",
         "pagemath.go",
         "regular_file.go",
@@ -53,6 +52,7 @@ go_library(
         "//pkg/p9",
         "//pkg/safemem",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/hostfd",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index cfe66f797..724a3f1f7 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/hostfd"
 )
 
 // handle represents a remote "open file descriptor", consisting of an opened
@@ -77,7 +78,7 @@ func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offs
 	}
 	if h.fd >= 0 {
 		ctx.UninterruptibleSleepStart(false)
-		n, err := hostPreadv(h.fd, dsts, int64(offset))
+		n, err := hostfd.Preadv2(h.fd, dsts, int64(offset), 0 /* flags */)
 		ctx.UninterruptibleSleepFinish(false)
 		return n, err
 	}
@@ -103,7 +104,7 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o
 	}
 	if h.fd >= 0 {
 		ctx.UninterruptibleSleepStart(false)
-		n, err := hostPwritev(h.fd, srcs, int64(offset))
+		n, err := hostfd.Pwritev2(h.fd, srcs, int64(offset), 0 /* flags */)
 		ctx.UninterruptibleSleepFinish(false)
 		return n, err
 	}
diff --git a/pkg/sentry/fsimpl/gofer/handle_unsafe.go b/pkg/sentry/fsimpl/gofer/handle_unsafe.go
deleted file mode 100644
index 19560ab26..000000000
--- a/pkg/sentry/fsimpl/gofer/handle_unsafe.go
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package gofer
-
-import (
-	"syscall"
-	"unsafe"
-
-	"gvisor.dev/gvisor/pkg/safemem"
-)
-
-// Preconditions: !dsts.IsEmpty().
-func hostPreadv(fd int32, dsts safemem.BlockSeq, off int64) (uint64, error) {
-	// No buffering is necessary regardless of safecopy; host syscalls will
-	// return EFAULT if appropriate, instead of raising SIGBUS.
-	if dsts.NumBlocks() == 1 {
-		// Use pread() instead of preadv() to avoid iovec allocation and
-		// copying.
-		dst := dsts.Head()
-		n, _, e := syscall.Syscall6(syscall.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(off), 0, 0)
-		if e != 0 {
-			return 0, e
-		}
-		return uint64(n), nil
-	}
-	iovs := safemem.IovecsFromBlockSeq(dsts)
-	n, _, e := syscall.Syscall6(syscall.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0)
-	if e != 0 {
-		return 0, e
-	}
-	return uint64(n), nil
-}
-
-// Preconditions: !srcs.IsEmpty().
-func hostPwritev(fd int32, srcs safemem.BlockSeq, off int64) (uint64, error) {
-	// No buffering is necessary regardless of safecopy; host syscalls will
-	// return EFAULT if appropriate, instead of raising SIGBUS.
-	if srcs.NumBlocks() == 1 {
-		// Use pwrite() instead of pwritev() to avoid iovec allocation and
-		// copying.
-		src := srcs.Head()
-		n, _, e := syscall.Syscall6(syscall.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(off), 0, 0)
-		if e != 0 {
-			return 0, e
-		}
-		return uint64(n), nil
-	}
-	iovs := safemem.IovecsFromBlockSeq(srcs)
-	n, _, e := syscall.Syscall6(syscall.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0)
-	if e != 0 {
-		return 0, e
-	}
-	return uint64(n), nil
-}
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index 82e1fb74b..44dd9f672 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -15,12 +15,11 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
-        "//pkg/fd",
         "//pkg/log",
         "//pkg/refs",
-        "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/hostfd",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index fe14476f1..ae94cfa6e 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -25,11 +25,10 @@ import (
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/hostfd"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -492,19 +491,9 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off
 	if flags != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
-
-	var reader safemem.Reader
-	if offset == -1 {
-		reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)}
-	} else {
-		reader = safemem.FromVecReaderFunc{
-			func(srcs [][]byte) (int64, error) {
-				n, err := unix.Preadv(hostFD, srcs, offset)
-				return int64(n), err
-			},
-		}
-	}
+	reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
 	n, err := dst.CopyOutFrom(ctx, reader)
+	hostfd.PutReadWriterAt(reader)
 	return int64(n), err
 }
 
@@ -542,19 +531,9 @@ func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offs
 	if flags != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
-
-	var writer safemem.Writer
-	if offset == -1 {
-		writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)}
-	} else {
-		writer = safemem.FromVecWriterFunc{
-			func(srcs [][]byte) (int64, error) {
-				n, err := unix.Pwritev(hostFD, srcs, offset)
-				return int64(n), err
-			},
-		}
-	}
+	writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
 	n, err := src.CopyInTo(ctx, writer)
+	hostfd.PutReadWriterAt(writer)
 	return int64(n), err
 }
 
diff --git a/pkg/sentry/hostfd/BUILD b/pkg/sentry/hostfd/BUILD
new file mode 100644
index 000000000..364a78306
--- /dev/null
+++ b/pkg/sentry/hostfd/BUILD
@@ -0,0 +1,17 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "hostfd",
+    srcs = [
+        "hostfd.go",
+        "hostfd_unsafe.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/safemem",
+        "//pkg/sync",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/hostfd/hostfd.go b/pkg/sentry/hostfd/hostfd.go
new file mode 100644
index 000000000..70dd9cafb
--- /dev/null
+++ b/pkg/sentry/hostfd/hostfd.go
@@ -0,0 +1,84 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package hostfd provides efficient I/O with host file descriptors.
+package hostfd
+
+import (
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// ReadWriterAt implements safemem.Reader and safemem.Writer by reading from
+// and writing to a host file descriptor respectively. ReadWriterAts should be
+// obtained by calling GetReadWriterAt.
+//
+// Clients should usually prefer to use Preadv2 and Pwritev2 directly.
+type ReadWriterAt struct {
+	fd     int32
+	offset int64
+	flags  uint32
+}
+
+var rwpool = sync.Pool{
+	New: func() interface{} {
+		return &ReadWriterAt{}
+	},
+}
+
+// GetReadWriterAt returns a ReadWriterAt that reads from / writes to the given
+// host file descriptor, starting at the given offset and using the given
+// preadv2(2)/pwritev2(2) flags. If offset is -1, the host file descriptor's
+// offset is used instead. Users are responsible for ensuring that fd remains
+// valid for the lifetime of the returned ReadWriterAt, and must call
+// PutReadWriterAt when it is no longer needed.
+func GetReadWriterAt(fd int32, offset int64, flags uint32) *ReadWriterAt {
+	rw := rwpool.Get().(*ReadWriterAt)
+	*rw = ReadWriterAt{
+		fd:     fd,
+		offset: offset,
+		flags:  flags,
+	}
+	return rw
+}
+
+// PutReadWriterAt releases a ReadWriterAt returned by a previous call to
+// GetReadWriterAt that is no longer in use.
+func PutReadWriterAt(rw *ReadWriterAt) {
+	rwpool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *ReadWriterAt) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+	n, err := Preadv2(rw.fd, dsts, rw.offset, rw.flags)
+	if rw.offset >= 0 {
+		rw.offset += int64(n)
+	}
+	return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *ReadWriterAt) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+	n, err := Pwritev2(rw.fd, srcs, rw.offset, rw.flags)
+	if rw.offset >= 0 {
+		rw.offset += int64(n)
+	}
+	return n, err
+}
diff --git a/pkg/sentry/hostfd/hostfd_unsafe.go b/pkg/sentry/hostfd/hostfd_unsafe.go
new file mode 100644
index 000000000..5e9e60fc4
--- /dev/null
+++ b/pkg/sentry/hostfd/hostfd_unsafe.go
@@ -0,0 +1,107 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostfd
+
+import (
+	"io"
+	"syscall"
+	"unsafe"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// Preadv2 reads up to dsts.NumBytes() bytes from host file descriptor fd into
+// dsts. offset and flags are interpreted as for preadv2(2).
+//
+// Preconditions: !dsts.IsEmpty().
+func Preadv2(fd int32, dsts safemem.BlockSeq, offset int64, flags uint32) (uint64, error) {
+	// No buffering is necessary regardless of safecopy; host syscalls will
+	// return EFAULT if appropriate, instead of raising SIGBUS.
+	var (
+		n uintptr
+		e syscall.Errno
+	)
+	// Avoid preadv2(2) if possible, since it's relatively new and thus least
+	// likely to be supported by the host kernel.
+	if flags == 0 {
+		if dsts.NumBlocks() == 1 {
+			// Use read() or pread() to avoid iovec allocation and copying.
+			dst := dsts.Head()
+			if offset == -1 {
+				n, _, e = syscall.Syscall(unix.SYS_READ, uintptr(fd), dst.Addr(), uintptr(dst.Len()))
+			} else {
+				n, _, e = syscall.Syscall6(unix.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */)
+			}
+		} else {
+			iovs := safemem.IovecsFromBlockSeq(dsts)
+			if offset == -1 {
+				n, _, e = syscall.Syscall(unix.SYS_READV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)))
+			} else {
+				n, _, e = syscall.Syscall6(unix.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, 0 /* unused */)
+			}
+		}
+	} else {
+		iovs := safemem.IovecsFromBlockSeq(dsts)
+		n, _, e = syscall.Syscall6(unix.SYS_PREADV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags))
+	}
+	if e != 0 {
+		return 0, e
+	}
+	if n == 0 {
+		return 0, io.EOF
+	}
+	return uint64(n), nil
+}
+
+// Pwritev2 writes up to srcs.NumBytes() from srcs into host file descriptor
+// fd. offset and flags are interpreted as for pwritev2(2).
+//
+// Preconditions: !srcs.IsEmpty().
+func Pwritev2(fd int32, srcs safemem.BlockSeq, offset int64, flags uint32) (uint64, error) {
+	// No buffering is necessary regardless of safecopy; host syscalls will
+	// return EFAULT if appropriate, instead of raising SIGBUS.
+	var (
+		n uintptr
+		e syscall.Errno
+	)
+	// Avoid pwritev2(2) if possible, since it's relatively new and thus least
+	// likely to be supported by the host kernel.
+	if flags == 0 {
+		if srcs.NumBlocks() == 1 {
+			// Use write() or pwrite() to avoid iovec allocation and copying.
+			src := srcs.Head()
+			if offset == -1 {
+				n, _, e = syscall.Syscall(unix.SYS_WRITE, uintptr(fd), src.Addr(), uintptr(src.Len()))
+			} else {
+				n, _, e = syscall.Syscall6(unix.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */)
+			}
+		} else {
+			iovs := safemem.IovecsFromBlockSeq(srcs)
+			if offset == -1 {
+				n, _, e = syscall.Syscall(unix.SYS_WRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)))
+			} else {
+				n, _, e = syscall.Syscall6(unix.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, 0 /* unused */)
+			}
+		}
+	} else {
+		iovs := safemem.IovecsFromBlockSeq(srcs)
+		n, _, e = syscall.Syscall6(unix.SYS_PWRITEV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags))
+	}
+	if e != 0 {
+		return 0, e
+	}
+	return uint64(n), nil
+}
-- 
cgit v1.2.3


From 1481499fe27157ad2716c00682f6ad819115a6c7 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 23 Apr 2020 11:32:08 -0700
Subject: Simplify Docker test infrastructure.

This change adds a layer of abstraction around the internal Docker APIs,
and eliminates all direct dependencies on Dockerfiles in the infrastructure.

A subsequent change will automated the generation of local images (with
efficient caching). Note that this change drops the use of bazel container
rules, as that experiment does not seem to be viable.

PiperOrigin-RevId: 308095430
---
 WORKSPACE                                          |   39 -
 pkg/sentry/fsimpl/ext/BUILD                        |    2 +-
 pkg/sentry/fsimpl/ext/ext_test.go                  |    3 +-
 pkg/tcpip/transport/tcp/BUILD                      |    2 +-
 pkg/tcpip/transport/tcp/tcp_noracedetector_test.go |    2 +-
 pkg/test/criutil/BUILD                             |   14 +
 pkg/test/criutil/criutil.go                        |  306 +++
 pkg/test/dockerutil/BUILD                          |   14 +
 pkg/test/dockerutil/dockerutil.go                  |  581 ++++++
 pkg/test/testutil/BUILD                            |   20 +
 pkg/test/testutil/testutil.go                      |  550 ++++++
 pkg/test/testutil/testutil_runfiles.go             |   75 +
 runsc/boot/BUILD                                   |    1 +
 runsc/cmd/BUILD                                    |    2 +-
 runsc/cmd/capability_test.go                       |    9 +-
 runsc/container/BUILD                              |    6 +-
 runsc/container/console_test.go                    |  115 +-
 runsc/container/container.go                       |    2 +-
 runsc/container/container_norace_test.go           |   20 +
 runsc/container/container_race_test.go             |   20 +
 runsc/container/container_test.go                  | 2046 ++++++++++----------
 runsc/container/multi_container_test.go            | 1161 +++++------
 runsc/container/shared_volume_test.go              |   18 +-
 runsc/container/test_app/BUILD                     |   21 -
 runsc/container/test_app/fds.go                    |  185 --
 runsc/container/test_app/test_app.go               |  394 ----
 runsc/criutil/BUILD                                |   11 -
 runsc/criutil/criutil.go                           |  277 ---
 runsc/dockerutil/BUILD                             |   14 -
 runsc/dockerutil/dockerutil.go                     |  486 -----
 runsc/testutil/BUILD                               |   21 -
 runsc/testutil/testutil.go                         |  433 -----
 runsc/testutil/testutil_runfiles.go                |   75 -
 scripts/iptables_tests.sh                          |   13 +-
 test/cmd/test_app/BUILD                            |   21 +
 test/cmd/test_app/fds.go                           |  185 ++
 test/cmd/test_app/test_app.go                      |  394 ++++
 test/e2e/BUILD                                     |    4 +-
 test/e2e/exec_test.go                              |  193 +-
 test/e2e/integration_test.go                       |  233 ++-
 test/e2e/regression_test.go                        |   18 +-
 test/image/BUILD                                   |    4 +-
 test/image/image_test.go                           |  195 +-
 test/image/ruby.sh                                 |    0
 test/iptables/BUILD                                |    8 +-
 test/iptables/README.md                            |    2 +-
 test/iptables/iptables.go                          |    7 +
 test/iptables/iptables_test.go                     |  271 +--
 test/iptables/iptables_util.go                     |    2 +-
 test/iptables/runner/BUILD                         |   17 +-
 test/iptables/runner/main.go                       |    3 +
 test/packetdrill/packetdrill_test.sh               |   25 +-
 test/packetimpact/testbench/dut.go                 |    2 +-
 test/packetimpact/tests/test_runner.sh             |   24 +-
 test/root/BUILD                                    |    8 +-
 test/root/cgroup_test.go                           |  114 +-
 test/root/chroot_test.go                           |   20 +-
 test/root/crictl_test.go                           |  192 +-
 test/root/main_test.go                             |    2 +-
 test/root/oom_score_adj_test.go                    |   78 +-
 test/root/runsc_test.go                            |    2 +-
 test/root/testdata/BUILD                           |   18 -
 test/root/testdata/busybox.go                      |   32 -
 test/root/testdata/containerd_config.go            |   39 -
 test/root/testdata/httpd.go                        |   32 -
 test/root/testdata/httpd_mount_paths.go            |   53 -
 test/root/testdata/sandbox.go                      |   30 -
 test/root/testdata/simple.go                       |   41 -
 test/runner/BUILD                                  |    2 +-
 test/runner/runner.go                              |   12 +-
 test/runtimes/BUILD                                |   22 +-
 test/runtimes/README.md                            |   56 -
 test/runtimes/blacklist_test.go                    |   37 -
 test/runtimes/build_defs.bzl                       |   75 -
 test/runtimes/defs.bzl                             |   79 +
 test/runtimes/images/proctor/BUILD                 |   26 -
 test/runtimes/images/proctor/go.go                 |   90 -
 test/runtimes/images/proctor/java.go               |   71 -
 test/runtimes/images/proctor/nodejs.go             |   46 -
 test/runtimes/images/proctor/php.go                |   42 -
 test/runtimes/images/proctor/proctor.go            |  163 --
 test/runtimes/images/proctor/proctor_test.go       |  127 --
 test/runtimes/images/proctor/python.go             |   49 -
 test/runtimes/proctor/BUILD                        |   27 +
 test/runtimes/proctor/go.go                        |   90 +
 test/runtimes/proctor/java.go                      |   71 +
 test/runtimes/proctor/nodejs.go                    |   46 +
 test/runtimes/proctor/php.go                       |   42 +
 test/runtimes/proctor/proctor.go                   |  163 ++
 test/runtimes/proctor/proctor_test.go              |  127 ++
 test/runtimes/proctor/python.go                    |   49 +
 test/runtimes/runner.go                            |  196 --
 test/runtimes/runner.sh                            |   35 -
 test/runtimes/runner/BUILD                         |   21 +
 test/runtimes/runner/blacklist_test.go             |   37 +
 test/runtimes/runner/main.go                       |  189 ++
 tools/bazeldefs/defs.bzl                           |    4 -
 tools/defs.bzl                                     |    4 +-
 98 files changed, 5512 insertions(+), 5693 deletions(-)
 create mode 100644 pkg/test/criutil/BUILD
 create mode 100644 pkg/test/criutil/criutil.go
 create mode 100644 pkg/test/dockerutil/BUILD
 create mode 100644 pkg/test/dockerutil/dockerutil.go
 create mode 100644 pkg/test/testutil/BUILD
 create mode 100644 pkg/test/testutil/testutil.go
 create mode 100644 pkg/test/testutil/testutil_runfiles.go
 create mode 100644 runsc/container/container_norace_test.go
 create mode 100644 runsc/container/container_race_test.go
 delete mode 100644 runsc/container/test_app/BUILD
 delete mode 100644 runsc/container/test_app/fds.go
 delete mode 100644 runsc/container/test_app/test_app.go
 delete mode 100644 runsc/criutil/BUILD
 delete mode 100644 runsc/criutil/criutil.go
 delete mode 100644 runsc/dockerutil/BUILD
 delete mode 100644 runsc/dockerutil/dockerutil.go
 delete mode 100644 runsc/testutil/BUILD
 delete mode 100644 runsc/testutil/testutil.go
 delete mode 100644 runsc/testutil/testutil_runfiles.go
 create mode 100644 test/cmd/test_app/BUILD
 create mode 100644 test/cmd/test_app/fds.go
 create mode 100644 test/cmd/test_app/test_app.go
 mode change 100644 => 100755 test/image/ruby.sh
 delete mode 100644 test/root/testdata/BUILD
 delete mode 100644 test/root/testdata/busybox.go
 delete mode 100644 test/root/testdata/containerd_config.go
 delete mode 100644 test/root/testdata/httpd.go
 delete mode 100644 test/root/testdata/httpd_mount_paths.go
 delete mode 100644 test/root/testdata/sandbox.go
 delete mode 100644 test/root/testdata/simple.go
 delete mode 100644 test/runtimes/README.md
 delete mode 100644 test/runtimes/blacklist_test.go
 delete mode 100644 test/runtimes/build_defs.bzl
 create mode 100644 test/runtimes/defs.bzl
 delete mode 100644 test/runtimes/images/proctor/BUILD
 delete mode 100644 test/runtimes/images/proctor/go.go
 delete mode 100644 test/runtimes/images/proctor/java.go
 delete mode 100644 test/runtimes/images/proctor/nodejs.go
 delete mode 100644 test/runtimes/images/proctor/php.go
 delete mode 100644 test/runtimes/images/proctor/proctor.go
 delete mode 100644 test/runtimes/images/proctor/proctor_test.go
 delete mode 100644 test/runtimes/images/proctor/python.go
 create mode 100644 test/runtimes/proctor/BUILD
 create mode 100644 test/runtimes/proctor/go.go
 create mode 100644 test/runtimes/proctor/java.go
 create mode 100644 test/runtimes/proctor/nodejs.go
 create mode 100644 test/runtimes/proctor/php.go
 create mode 100644 test/runtimes/proctor/proctor.go
 create mode 100644 test/runtimes/proctor/proctor_test.go
 create mode 100644 test/runtimes/proctor/python.go
 delete mode 100644 test/runtimes/runner.go
 delete mode 100755 test/runtimes/runner.sh
 create mode 100644 test/runtimes/runner/BUILD
 create mode 100644 test/runtimes/runner/blacklist_test.go
 create mode 100644 test/runtimes/runner/main.go

diff --git a/WORKSPACE b/WORKSPACE
index b895647fb..3bf5cc9c1 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -127,45 +127,6 @@ load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
 
 rules_pkg_dependencies()
 
-# Container rules.
-http_archive(
-    name = "io_bazel_rules_docker",
-    sha256 = "14ac30773fdb393ddec90e158c9ec7ebb3f8a4fd533ec2abbfd8789ad81a284b",
-    strip_prefix = "rules_docker-0.12.1",
-    urls = ["https://github.com/bazelbuild/rules_docker/releases/download/v0.12.1/rules_docker-v0.12.1.tar.gz"],
-)
-
-load(
-    "@io_bazel_rules_docker//repositories:repositories.bzl",
-    container_repositories = "repositories",
-)
-
-container_repositories()
-
-load("@io_bazel_rules_docker//repositories:deps.bzl", container_deps = "deps")
-
-container_deps()
-
-load(
-    "@io_bazel_rules_docker//container:container.bzl",
-    "container_pull",
-)
-
-# This container is built from the Dockerfile in test/iptables/runner.
-container_pull(
-    name = "iptables-test",
-    digest = "sha256:a137d692a2eb9fc7bf95c5f4a568da090e2c31098e93634421ed88f3a3f1db65",
-    registry = "gcr.io",
-    repository = "gvisor-presubmit/iptables-test",
-)
-
-load(
-    "@io_bazel_rules_docker//go:image.bzl",
-    _go_image_repos = "repositories",
-)
-
-_go_image_repos()
-
 # Load C++ grpc rules.
 http_archive(
     name = "com_github_grpc_grpc",
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index a4947c480..ff861d0fe 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -93,8 +93,8 @@ go_test(
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
+        "//pkg/test/testutil",
         "//pkg/usermem",
-        "//runsc/testutil",
         "@com_github_google_go-cmp//cmp:go_default_library",
         "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
     ],
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 29bb73765..64e9a579f 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -32,9 +32,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/pkg/usermem"
-
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 const (
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 61426623c..f2aa69069 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -105,8 +105,8 @@ go_test(
         "//pkg/tcpip/seqnum",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/tcp/testing/context",
+        "//pkg/test/testutil",
         "//pkg/waiter",
-        "//runsc/testutil",
     ],
 )
 
diff --git a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
index 359a75e73..5fe23113b 100644
--- a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
@@ -31,7 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context"
-	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 func TestFastRecovery(t *testing.T) {
diff --git a/pkg/test/criutil/BUILD b/pkg/test/criutil/BUILD
new file mode 100644
index 000000000..a7b082cee
--- /dev/null
+++ b/pkg/test/criutil/BUILD
@@ -0,0 +1,14 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "criutil",
+    testonly = 1,
+    srcs = ["criutil.go"],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/test/dockerutil",
+        "//pkg/test/testutil",
+    ],
+)
diff --git a/pkg/test/criutil/criutil.go b/pkg/test/criutil/criutil.go
new file mode 100644
index 000000000..bebebb48e
--- /dev/null
+++ b/pkg/test/criutil/criutil.go
@@ -0,0 +1,306 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package criutil contains utility functions for interacting with the
+// Container Runtime Interface (CRI), principally via the crictl command line
+// tool. This requires critools to be installed on the local system.
+package criutil
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"strings"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+)
+
+// Crictl contains information required to run the crictl utility.
+type Crictl struct {
+	logger   testutil.Logger
+	endpoint string
+	cleanup  []func()
+}
+
+// resolvePath attempts to find binary paths. It may set the path to invalid,
+// which will cause the execution to fail with a sensible error.
+func resolvePath(executable string) string {
+	guess, err := exec.LookPath(executable)
+	if err != nil {
+		guess = fmt.Sprintf("/usr/local/bin/%s", executable)
+	}
+	return guess
+}
+
+// NewCrictl returns a Crictl configured with a timeout and an endpoint over
+// which it will talk to containerd.
+func NewCrictl(logger testutil.Logger, endpoint string) *Crictl {
+	// Attempt to find the executable, but don't bother propagating the
+	// error at this point. The first command executed will return with a
+	// binary not found error.
+	return &Crictl{
+		logger:   logger,
+		endpoint: endpoint,
+	}
+}
+
+// CleanUp executes cleanup functions.
+func (cc *Crictl) CleanUp() {
+	for _, c := range cc.cleanup {
+		c()
+	}
+	cc.cleanup = nil
+}
+
+// RunPod creates a sandbox. It corresponds to `crictl runp`.
+func (cc *Crictl) RunPod(sbSpecFile string) (string, error) {
+	podID, err := cc.run("runp", sbSpecFile)
+	if err != nil {
+		return "", fmt.Errorf("runp failed: %v", err)
+	}
+	// Strip the trailing newline from crictl output.
+	return strings.TrimSpace(podID), nil
+}
+
+// Create creates a container within a sandbox. It corresponds to `crictl
+// create`.
+func (cc *Crictl) Create(podID, contSpecFile, sbSpecFile string) (string, error) {
+	podID, err := cc.run("create", podID, contSpecFile, sbSpecFile)
+	if err != nil {
+		return "", fmt.Errorf("create failed: %v", err)
+	}
+	// Strip the trailing newline from crictl output.
+	return strings.TrimSpace(podID), nil
+}
+
+// Start starts a container. It corresponds to `crictl start`.
+func (cc *Crictl) Start(contID string) (string, error) {
+	output, err := cc.run("start", contID)
+	if err != nil {
+		return "", fmt.Errorf("start failed: %v", err)
+	}
+	return output, nil
+}
+
+// Stop stops a container. It corresponds to `crictl stop`.
+func (cc *Crictl) Stop(contID string) error {
+	_, err := cc.run("stop", contID)
+	return err
+}
+
+// Exec execs a program inside a container. It corresponds to `crictl exec`.
+func (cc *Crictl) Exec(contID string, args ...string) (string, error) {
+	a := []string{"exec", contID}
+	a = append(a, args...)
+	output, err := cc.run(a...)
+	if err != nil {
+		return "", fmt.Errorf("exec failed: %v", err)
+	}
+	return output, nil
+}
+
+// Rm removes a container. It corresponds to `crictl rm`.
+func (cc *Crictl) Rm(contID string) error {
+	_, err := cc.run("rm", contID)
+	return err
+}
+
+// StopPod stops a pod. It corresponds to `crictl stopp`.
+func (cc *Crictl) StopPod(podID string) error {
+	_, err := cc.run("stopp", podID)
+	return err
+}
+
+// containsConfig is a minimal copy of
+// https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/apis/cri/runtime/v1alpha2/api.proto
+// It only contains fields needed for testing.
+type containerConfig struct {
+	Status containerStatus
+}
+
+type containerStatus struct {
+	Network containerNetwork
+}
+
+type containerNetwork struct {
+	IP string
+}
+
+// PodIP returns a pod's IP address.
+func (cc *Crictl) PodIP(podID string) (string, error) {
+	output, err := cc.run("inspectp", podID)
+	if err != nil {
+		return "", err
+	}
+	conf := &containerConfig{}
+	if err := json.Unmarshal([]byte(output), conf); err != nil {
+		return "", fmt.Errorf("failed to unmarshal JSON: %v, %s", err, output)
+	}
+	if conf.Status.Network.IP == "" {
+		return "", fmt.Errorf("no IP found in config: %s", output)
+	}
+	return conf.Status.Network.IP, nil
+}
+
+// RmPod removes a container. It corresponds to `crictl rmp`.
+func (cc *Crictl) RmPod(podID string) error {
+	_, err := cc.run("rmp", podID)
+	return err
+}
+
+// Import imports the given container from the local Docker instance.
+func (cc *Crictl) Import(image string) error {
+	// Note that we provide a 10 minute timeout after connect because we may
+	// be pushing a lot of bytes in order to import the image. The connect
+	// timeout stays the same and is inherited from the Crictl instance.
+	cmd := testutil.Command(cc.logger,
+		resolvePath("ctr"),
+		fmt.Sprintf("--connect-timeout=%s", 30*time.Second),
+		fmt.Sprintf("--address=%s", cc.endpoint),
+		"-n", "k8s.io", "images", "import", "-")
+	cmd.Stderr = os.Stderr // Pass through errors.
+
+	// Create a pipe and start the program.
+	w, err := cmd.StdinPipe()
+	if err != nil {
+		return err
+	}
+	if err := cmd.Start(); err != nil {
+		return err
+	}
+
+	// Save the image on the other end.
+	if err := dockerutil.Save(cc.logger, image, w); err != nil {
+		cmd.Wait()
+		return err
+	}
+
+	// Close our pipe reference & see if it was loaded.
+	if err := w.Close(); err != nil {
+		return w.Close()
+	}
+
+	return cmd.Wait()
+}
+
+// StartContainer pulls the given image ands starts the container in the
+// sandbox with the given podID.
+//
+// Note that the image will always be imported from the local docker daemon.
+func (cc *Crictl) StartContainer(podID, image, sbSpec, contSpec string) (string, error) {
+	if err := cc.Import(image); err != nil {
+		return "", err
+	}
+
+	// Write the specs to files that can be read by crictl.
+	sbSpecFile, cleanup, err := testutil.WriteTmpFile("sbSpec", sbSpec)
+	if err != nil {
+		return "", fmt.Errorf("failed to write sandbox spec: %v", err)
+	}
+	cc.cleanup = append(cc.cleanup, cleanup)
+	contSpecFile, cleanup, err := testutil.WriteTmpFile("contSpec", contSpec)
+	if err != nil {
+		return "", fmt.Errorf("failed to write container spec: %v", err)
+	}
+	cc.cleanup = append(cc.cleanup, cleanup)
+
+	return cc.startContainer(podID, image, sbSpecFile, contSpecFile)
+}
+
+func (cc *Crictl) startContainer(podID, image, sbSpecFile, contSpecFile string) (string, error) {
+	contID, err := cc.Create(podID, contSpecFile, sbSpecFile)
+	if err != nil {
+		return "", fmt.Errorf("failed to create container in pod %q: %v", podID, err)
+	}
+
+	if _, err := cc.Start(contID); err != nil {
+		return "", fmt.Errorf("failed to start container %q in pod %q: %v", contID, podID, err)
+	}
+
+	return contID, nil
+}
+
+// StopContainer stops and deletes the container with the given container ID.
+func (cc *Crictl) StopContainer(contID string) error {
+	if err := cc.Stop(contID); err != nil {
+		return fmt.Errorf("failed to stop container %q: %v", contID, err)
+	}
+
+	if err := cc.Rm(contID); err != nil {
+		return fmt.Errorf("failed to remove container %q: %v", contID, err)
+	}
+
+	return nil
+}
+
+// StartPodAndContainer starts a sandbox and container in that sandbox. It
+// returns the pod ID and container ID.
+func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string, string, error) {
+	if err := cc.Import(image); err != nil {
+		return "", "", err
+	}
+
+	// Write the specs to files that can be read by crictl.
+	sbSpecFile, cleanup, err := testutil.WriteTmpFile("sbSpec", sbSpec)
+	if err != nil {
+		return "", "", fmt.Errorf("failed to write sandbox spec: %v", err)
+	}
+	cc.cleanup = append(cc.cleanup, cleanup)
+	contSpecFile, cleanup, err := testutil.WriteTmpFile("contSpec", contSpec)
+	if err != nil {
+		return "", "", fmt.Errorf("failed to write container spec: %v", err)
+	}
+	cc.cleanup = append(cc.cleanup, cleanup)
+
+	podID, err := cc.RunPod(sbSpecFile)
+	if err != nil {
+		return "", "", err
+	}
+
+	contID, err := cc.startContainer(podID, image, sbSpecFile, contSpecFile)
+
+	return podID, contID, err
+}
+
+// StopPodAndContainer stops a container and pod.
+func (cc *Crictl) StopPodAndContainer(podID, contID string) error {
+	if err := cc.StopContainer(contID); err != nil {
+		return fmt.Errorf("failed to stop container %q in pod %q: %v", contID, podID, err)
+	}
+
+	if err := cc.StopPod(podID); err != nil {
+		return fmt.Errorf("failed to stop pod %q: %v", podID, err)
+	}
+
+	if err := cc.RmPod(podID); err != nil {
+		return fmt.Errorf("failed to remove pod %q: %v", podID, err)
+	}
+
+	return nil
+}
+
+// run runs crictl with the given args.
+func (cc *Crictl) run(args ...string) (string, error) {
+	defaultArgs := []string{
+		resolvePath("crictl"),
+		"--image-endpoint", fmt.Sprintf("unix://%s", cc.endpoint),
+		"--runtime-endpoint", fmt.Sprintf("unix://%s", cc.endpoint),
+	}
+	fullArgs := append(defaultArgs, args...)
+	out, err := testutil.Command(cc.logger, fullArgs...).CombinedOutput()
+	return string(out), err
+}
diff --git a/pkg/test/dockerutil/BUILD b/pkg/test/dockerutil/BUILD
new file mode 100644
index 000000000..7c8758e35
--- /dev/null
+++ b/pkg/test/dockerutil/BUILD
@@ -0,0 +1,14 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "dockerutil",
+    testonly = 1,
+    srcs = ["dockerutil.go"],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/test/testutil",
+        "@com_github_kr_pty//:go_default_library",
+    ],
+)
diff --git a/pkg/test/dockerutil/dockerutil.go b/pkg/test/dockerutil/dockerutil.go
new file mode 100644
index 000000000..baa8fc2f2
--- /dev/null
+++ b/pkg/test/dockerutil/dockerutil.go
@@ -0,0 +1,581 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package dockerutil is a collection of utility functions.
+package dockerutil
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net"
+	"os"
+	"os/exec"
+	"path"
+	"regexp"
+	"strconv"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/kr/pty"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+)
+
+var (
+	// runtime is the runtime to use for tests. This will be applied to all
+	// containers. Note that the default here ("runsc") corresponds to the
+	// default used by the installations. This is important, because the
+	// default installer for vm_tests (in tools/installers:head, invoked
+	// via tools/vm:defs.bzl) will install with this name. So without
+	// changing anything, tests should have a runsc runtime available to
+	// them. Otherwise installers should update the existing runtime
+	// instead of installing a new one.
+	runtime = flag.String("runtime", "runsc", "specify which runtime to use")
+
+	// config is the default Docker daemon configuration path.
+	config = flag.String("config_path", "/etc/docker/daemon.json", "configuration file for reading paths")
+)
+
+// EnsureSupportedDockerVersion checks if correct docker is installed.
+//
+// This logs directly to stderr, as it is typically called from a Main wrapper.
+func EnsureSupportedDockerVersion() {
+	cmd := exec.Command("docker", "version")
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		log.Fatalf("error running %q: %v", "docker version", err)
+	}
+	re := regexp.MustCompile(`Version:\s+(\d+)\.(\d+)\.\d.*`)
+	matches := re.FindStringSubmatch(string(out))
+	if len(matches) != 3 {
+		log.Fatalf("Invalid docker output: %s", out)
+	}
+	major, _ := strconv.Atoi(matches[1])
+	minor, _ := strconv.Atoi(matches[2])
+	if major < 17 || (major == 17 && minor < 9) {
+		log.Fatalf("Docker version 17.09.0 or greater is required, found: %02d.%02d", major, minor)
+	}
+}
+
+// RuntimePath returns the binary path for the current runtime.
+func RuntimePath() (string, error) {
+	// Read the configuration data; the file must exist.
+	configBytes, err := ioutil.ReadFile(*config)
+	if err != nil {
+		return "", err
+	}
+
+	// Unmarshal the configuration.
+	c := make(map[string]interface{})
+	if err := json.Unmarshal(configBytes, &c); err != nil {
+		return "", err
+	}
+
+	// Decode the expected configuration.
+	r, ok := c["runtimes"]
+	if !ok {
+		return "", fmt.Errorf("no runtimes declared: %v", c)
+	}
+	rs, ok := r.(map[string]interface{})
+	if !ok {
+		// The runtimes are not a map.
+		return "", fmt.Errorf("unexpected format: %v", c)
+	}
+	r, ok = rs[*runtime]
+	if !ok {
+		// The expected runtime is not declared.
+		return "", fmt.Errorf("runtime %q not found: %v", *runtime, c)
+	}
+	rs, ok = r.(map[string]interface{})
+	if !ok {
+		// The runtime is not a map.
+		return "", fmt.Errorf("unexpected format: %v", c)
+	}
+	p, ok := rs["path"].(string)
+	if !ok {
+		// The runtime does not declare a path.
+		return "", fmt.Errorf("unexpected format: %v", c)
+	}
+	return p, nil
+}
+
+// Save exports a container image to the given Writer.
+//
+// Note that the writer should be actively consuming the output, otherwise it
+// is not guaranteed that the Save will make any progress and the call may
+// stall indefinitely.
+//
+// This is called by criutil in order to import imports.
+func Save(logger testutil.Logger, image string, w io.Writer) error {
+	cmd := testutil.Command(logger, "docker", "save", testutil.ImageByName(image))
+	cmd.Stdout = w // Send directly to the writer.
+	return cmd.Run()
+}
+
+// MountMode describes if the mount should be ro or rw.
+type MountMode int
+
+const (
+	// ReadOnly is what the name says.
+	ReadOnly MountMode = iota
+	// ReadWrite is what the name says.
+	ReadWrite
+)
+
+// String returns the mount mode argument for this MountMode.
+func (m MountMode) String() string {
+	switch m {
+	case ReadOnly:
+		return "ro"
+	case ReadWrite:
+		return "rw"
+	}
+	panic(fmt.Sprintf("invalid mode: %d", m))
+}
+
+// Docker contains the name and the runtime of a docker container.
+type Docker struct {
+	logger   testutil.Logger
+	Runtime  string
+	Name     string
+	copyErr  error
+	mounts   []string
+	cleanups []func()
+}
+
+// MakeDocker sets up the struct for a Docker container.
+//
+// Names of containers will be unique.
+func MakeDocker(logger testutil.Logger) *Docker {
+	return &Docker{
+		logger:  logger,
+		Name:    testutil.RandomID(logger.Name()),
+		Runtime: *runtime,
+	}
+}
+
+// Mount mounts the given source and makes it available in the container.
+func (d *Docker) Mount(target, source string, mode MountMode) {
+	d.mounts = append(d.mounts, fmt.Sprintf("-v=%s:%s:%v", source, target, mode))
+}
+
+// CopyFiles copies in and mounts the given files. They are always ReadOnly.
+func (d *Docker) CopyFiles(target string, sources ...string) {
+	dir, err := ioutil.TempDir("", d.Name)
+	if err != nil {
+		d.copyErr = fmt.Errorf("ioutil.TempDir failed: %v", err)
+		return
+	}
+	d.cleanups = append(d.cleanups, func() { os.RemoveAll(dir) })
+	if err := os.Chmod(dir, 0755); err != nil {
+		d.copyErr = fmt.Errorf("os.Chmod(%q, 0755) failed: %v", dir, err)
+		return
+	}
+	for _, name := range sources {
+		src, err := testutil.FindFile(name)
+		if err != nil {
+			d.copyErr = fmt.Errorf("testutil.FindFile(%q) failed: %v", name, err)
+			return
+		}
+		dst := path.Join(dir, path.Base(name))
+		if err := testutil.Copy(src, dst); err != nil {
+			d.copyErr = fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err)
+			return
+		}
+		d.logger.Logf("copy: %s -> %s", src, dst)
+	}
+	d.Mount(target, dir, ReadOnly)
+}
+
+// Link links the given target.
+func (d *Docker) Link(target string, source *Docker) {
+	d.mounts = append(d.mounts, fmt.Sprintf("--link=%s:%s", source.Name, target))
+}
+
+// RunOpts are options for running a container.
+type RunOpts struct {
+	// Image is the image relative to images/. This will be mangled
+	// appropriately, to ensure that only first-party images are used.
+	Image string
+
+	// Memory is the memory limit in kB.
+	Memory int
+
+	// Ports are the ports to be allocated.
+	Ports []int
+
+	// WorkDir sets the working directory.
+	WorkDir string
+
+	// ReadOnly sets the read-only flag.
+	ReadOnly bool
+
+	// Env are additional environment variables.
+	Env []string
+
+	// User is the user to use.
+	User string
+
+	// Privileged enables privileged mode.
+	Privileged bool
+
+	// CapAdd are the extra set of capabilities to add.
+	CapAdd []string
+
+	// CapDrop are the extra set of capabilities to drop.
+	CapDrop []string
+
+	// Pty indicates that a pty will be allocated. If this is non-nil, then
+	// this will run after start-up with the *exec.Command and Pty file
+	// passed in to the function.
+	Pty func(*exec.Cmd, *os.File)
+
+	// Foreground indicates that the container should be run in the
+	// foreground. If this is true, then the output will be available as a
+	// return value from the Run function.
+	Foreground bool
+
+	// Extra are extra arguments that may be passed.
+	Extra []string
+}
+
+// args returns common arguments.
+//
+// Note that this does not define the complete behavior.
+func (d *Docker) argsFor(r *RunOpts, command string, p []string) (rv []string) {
+	isExec := command == "exec"
+	isRun := command == "run"
+
+	if isRun || isExec {
+		rv = append(rv, "-i")
+	}
+	if r.Pty != nil {
+		rv = append(rv, "-t")
+	}
+	if r.User != "" {
+		rv = append(rv, fmt.Sprintf("--user=%s", r.User))
+	}
+	if r.Privileged {
+		rv = append(rv, "--privileged")
+	}
+	for _, c := range r.CapAdd {
+		rv = append(rv, fmt.Sprintf("--cap-add=%s", c))
+	}
+	for _, c := range r.CapDrop {
+		rv = append(rv, fmt.Sprintf("--cap-drop=%s", c))
+	}
+	for _, e := range r.Env {
+		rv = append(rv, fmt.Sprintf("--env=%s", e))
+	}
+	if r.WorkDir != "" {
+		rv = append(rv, fmt.Sprintf("--workdir=%s", r.WorkDir))
+	}
+	if !isExec {
+		if r.Memory != 0 {
+			rv = append(rv, fmt.Sprintf("--memory=%dk", r.Memory))
+		}
+		for _, p := range r.Ports {
+			rv = append(rv, fmt.Sprintf("--publish=%d", p))
+		}
+		if r.ReadOnly {
+			rv = append(rv, fmt.Sprintf("--read-only"))
+		}
+		if len(p) > 0 {
+			rv = append(rv, "--entrypoint=")
+		}
+	}
+
+	// Always attach the test environment & Extra.
+	rv = append(rv, fmt.Sprintf("--env=RUNSC_TEST_NAME=%s", d.Name))
+	rv = append(rv, r.Extra...)
+
+	// Attach necessary bits.
+	if isExec {
+		rv = append(rv, d.Name)
+	} else {
+		rv = append(rv, d.mounts...)
+		rv = append(rv, fmt.Sprintf("--runtime=%s", d.Runtime))
+		rv = append(rv, fmt.Sprintf("--name=%s", d.Name))
+		rv = append(rv, testutil.ImageByName(r.Image))
+	}
+
+	// Attach other arguments.
+	rv = append(rv, p...)
+	return rv
+}
+
+// run runs a complete command.
+func (d *Docker) run(r RunOpts, command string, p ...string) (string, error) {
+	if d.copyErr != nil {
+		return "", d.copyErr
+	}
+	basicArgs := []string{"docker"}
+	if command == "spawn" {
+		command = "run"
+		basicArgs = append(basicArgs, command)
+		basicArgs = append(basicArgs, "-d")
+	} else {
+		basicArgs = append(basicArgs, command)
+	}
+	customArgs := d.argsFor(&r, command, p)
+	cmd := testutil.Command(d.logger, append(basicArgs, customArgs...)...)
+	if r.Pty != nil {
+		// If allocating a terminal, then we just ignore the output
+		// from the command.
+		ptmx, err := pty.Start(cmd.Cmd)
+		if err != nil {
+			return "", err
+		}
+		defer cmd.Wait() // Best effort.
+		r.Pty(cmd.Cmd, ptmx)
+	} else {
+		// Can't support PTY or streaming.
+		out, err := cmd.CombinedOutput()
+		return string(out), err
+	}
+	return "", nil
+}
+
+// Create calls 'docker create' with the arguments provided.
+func (d *Docker) Create(r RunOpts, args ...string) error {
+	_, err := d.run(r, "create", args...)
+	return err
+}
+
+// Start calls 'docker start'.
+func (d *Docker) Start() error {
+	return testutil.Command(d.logger, "docker", "start", d.Name).Run()
+}
+
+// Stop calls 'docker stop'.
+func (d *Docker) Stop() error {
+	return testutil.Command(d.logger, "docker", "stop", d.Name).Run()
+}
+
+// Run calls 'docker run' with the arguments provided.
+func (d *Docker) Run(r RunOpts, args ...string) (string, error) {
+	return d.run(r, "run", args...)
+}
+
+// Spawn starts the container and detaches.
+func (d *Docker) Spawn(r RunOpts, args ...string) error {
+	_, err := d.run(r, "spawn", args...)
+	return err
+}
+
+// Logs calls 'docker logs'.
+func (d *Docker) Logs() (string, error) {
+	// Don't capture the output; since it will swamp the logs.
+	out, err := exec.Command("docker", "logs", d.Name).CombinedOutput()
+	return string(out), err
+}
+
+// Exec calls 'docker exec' with the arguments provided.
+func (d *Docker) Exec(r RunOpts, args ...string) (string, error) {
+	return d.run(r, "exec", args...)
+}
+
+// Pause calls 'docker pause'.
+func (d *Docker) Pause() error {
+	return testutil.Command(d.logger, "docker", "pause", d.Name).Run()
+}
+
+// Unpause calls 'docker pause'.
+func (d *Docker) Unpause() error {
+	return testutil.Command(d.logger, "docker", "unpause", d.Name).Run()
+}
+
+// Checkpoint calls 'docker checkpoint'.
+func (d *Docker) Checkpoint(name string) error {
+	return testutil.Command(d.logger, "docker", "checkpoint", "create", d.Name, name).Run()
+}
+
+// Restore calls 'docker start --checkname [name]'.
+func (d *Docker) Restore(name string) error {
+	return testutil.Command(d.logger, "docker", "start", fmt.Sprintf("--checkpoint=%s", name), d.Name).Run()
+}
+
+// Kill calls 'docker kill'.
+func (d *Docker) Kill() error {
+	// Skip logging this command, it will likely be an error.
+	out, err := exec.Command("docker", "kill", d.Name).CombinedOutput()
+	if err != nil && !strings.Contains(string(out), "is not running") {
+		return err
+	}
+	return nil
+}
+
+// Remove calls 'docker rm'.
+func (d *Docker) Remove() error {
+	return testutil.Command(d.logger, "docker", "rm", d.Name).Run()
+}
+
+// CleanUp kills and deletes the container (best effort).
+func (d *Docker) CleanUp() {
+	// Kill the container.
+	if err := d.Kill(); err != nil {
+		// Just log; can't do anything here.
+		d.logger.Logf("error killing container %q: %v", d.Name, err)
+	}
+	// Remove the image.
+	if err := d.Remove(); err != nil {
+		d.logger.Logf("error removing container %q: %v", d.Name, err)
+	}
+	// Forget all mounts.
+	d.mounts = nil
+	// Execute all cleanups.
+	for _, c := range d.cleanups {
+		c()
+	}
+	d.cleanups = nil
+}
+
+// FindPort returns the host port that is mapped to 'sandboxPort'. This calls
+// docker to allocate a free port in the host and prevent conflicts.
+func (d *Docker) FindPort(sandboxPort int) (int, error) {
+	format := fmt.Sprintf(`{{ (index (index .NetworkSettings.Ports "%d/tcp") 0).HostPort }}`, sandboxPort)
+	out, err := testutil.Command(d.logger, "docker", "inspect", "-f", format, d.Name).CombinedOutput()
+	if err != nil {
+		return -1, fmt.Errorf("error retrieving port: %v", err)
+	}
+	port, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
+	if err != nil {
+		return -1, fmt.Errorf("error parsing port %q: %v", out, err)
+	}
+	return port, nil
+}
+
+// FindIP returns the IP address of the container.
+func (d *Docker) FindIP() (net.IP, error) {
+	const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}`
+	out, err := testutil.Command(d.logger, "docker", "inspect", "-f", format, d.Name).CombinedOutput()
+	if err != nil {
+		return net.IP{}, fmt.Errorf("error retrieving IP: %v", err)
+	}
+	ip := net.ParseIP(strings.TrimSpace(string(out)))
+	if ip == nil {
+		return net.IP{}, fmt.Errorf("invalid IP: %q", string(out))
+	}
+	return ip, nil
+}
+
+// SandboxPid returns the PID to the sandbox process.
+func (d *Docker) SandboxPid() (int, error) {
+	out, err := testutil.Command(d.logger, "docker", "inspect", "-f={{.State.Pid}}", d.Name).CombinedOutput()
+	if err != nil {
+		return -1, fmt.Errorf("error retrieving pid: %v", err)
+	}
+	pid, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
+	if err != nil {
+		return -1, fmt.Errorf("error parsing pid %q: %v", out, err)
+	}
+	return pid, nil
+}
+
+// ID returns the container ID.
+func (d *Docker) ID() (string, error) {
+	out, err := testutil.Command(d.logger, "docker", "inspect", "-f={{.Id}}", d.Name).CombinedOutput()
+	if err != nil {
+		return "", fmt.Errorf("error retrieving ID: %v", err)
+	}
+	return strings.TrimSpace(string(out)), nil
+}
+
+// Wait waits for container to exit, up to the given timeout. Returns error if
+// wait fails or timeout is hit. Returns the application return code otherwise.
+// Note that the application may have failed even if err == nil, always check
+// the exit code.
+func (d *Docker) Wait(timeout time.Duration) (syscall.WaitStatus, error) {
+	timeoutChan := time.After(timeout)
+	waitChan := make(chan (syscall.WaitStatus))
+	errChan := make(chan (error))
+
+	go func() {
+		out, err := testutil.Command(d.logger, "docker", "wait", d.Name).CombinedOutput()
+		if err != nil {
+			errChan <- fmt.Errorf("error waiting for container %q: %v", d.Name, err)
+		}
+		exit, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
+		if err != nil {
+			errChan <- fmt.Errorf("error parsing exit code %q: %v", out, err)
+		}
+		waitChan <- syscall.WaitStatus(uint32(exit))
+	}()
+
+	select {
+	case ws := <-waitChan:
+		return ws, nil
+	case err := <-errChan:
+		return syscall.WaitStatus(1), err
+	case <-timeoutChan:
+		return syscall.WaitStatus(1), fmt.Errorf("timeout waiting for container %q", d.Name)
+	}
+}
+
+// WaitForOutput calls 'docker logs' to retrieve containers output and searches
+// for the given pattern.
+func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) {
+	matches, err := d.WaitForOutputSubmatch(pattern, timeout)
+	if err != nil {
+		return "", err
+	}
+	if len(matches) == 0 {
+		return "", nil
+	}
+	return matches[0], nil
+}
+
+// WaitForOutputSubmatch calls 'docker logs' to retrieve containers output and
+// searches for the given pattern. It returns any regexp submatches as well.
+func (d *Docker) WaitForOutputSubmatch(pattern string, timeout time.Duration) ([]string, error) {
+	re := regexp.MustCompile(pattern)
+	var (
+		lastOut string
+		stopped bool
+	)
+	for exp := time.Now().Add(timeout); time.Now().Before(exp); {
+		out, err := d.Logs()
+		if err != nil {
+			return nil, err
+		}
+		if out != lastOut {
+			if lastOut == "" {
+				d.logger.Logf("output (start): %s", out)
+			} else if strings.HasPrefix(out, lastOut) {
+				d.logger.Logf("output (contn): %s", out[len(lastOut):])
+			} else {
+				d.logger.Logf("output (trunc): %s", out)
+			}
+			lastOut = out // Save for future.
+			if matches := re.FindStringSubmatch(lastOut); matches != nil {
+				return matches, nil // Success!
+			}
+		} else if stopped {
+			// The sandbox stopped and we looked at the
+			// logs at least once since determining that.
+			return nil, fmt.Errorf("no longer running: %v", err)
+		} else if pid, err := d.SandboxPid(); pid == 0 || err != nil {
+			// The sandbox may have stopped, but it's
+			// possible that it has emitted the terminal
+			// line between the last call to Logs and here.
+			stopped = true
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+	return nil, fmt.Errorf("timeout waiting for output %q: %s", re.String(), lastOut)
+}
diff --git a/pkg/test/testutil/BUILD b/pkg/test/testutil/BUILD
new file mode 100644
index 000000000..03b1b4677
--- /dev/null
+++ b/pkg/test/testutil/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "testutil",
+    testonly = 1,
+    srcs = [
+        "testutil.go",
+        "testutil_runfiles.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/sync",
+        "//runsc/boot",
+        "//runsc/specutils",
+        "@com_github_cenkalti_backoff//:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+    ],
+)
diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go
new file mode 100644
index 000000000..d75ceca3d
--- /dev/null
+++ b/pkg/test/testutil/testutil.go
@@ -0,0 +1,550 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package testutil contains utility functions for runsc tests.
+package testutil
+
+import (
+	"bufio"
+	"context"
+	"debug/elf"
+	"encoding/base32"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"math"
+	"math/rand"
+	"net/http"
+	"os"
+	"os/exec"
+	"os/signal"
+	"path"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync/atomic"
+	"syscall"
+	"testing"
+	"time"
+
+	"github.com/cenkalti/backoff"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+var (
+	checkpoint = flag.Bool("checkpoint", true, "control checkpoint/restore support")
+)
+
+// IsCheckpointSupported returns the relevant command line flag.
+func IsCheckpointSupported() bool {
+	return *checkpoint
+}
+
+// nameToActual is used by ImageByName (for now).
+var nameToActual = map[string]string{
+	"basic/alpine":          "alpine",
+	"basic/busybox":         "busybox:1.31.1",
+	"basic/httpd":           "httpd",
+	"basic/mysql":           "mysql",
+	"basic/nginx":           "nginx",
+	"basic/python":          "gcr.io/gvisor-presubmit/python-hello",
+	"basic/resolv":          "k8s.gcr.io/busybox",
+	"basic/ruby":            "ruby",
+	"basic/tomcat":          "tomcat:8.0",
+	"basic/ubuntu":          "ubuntu:trusty",
+	"iptables":              "gcr.io/gvisor-presubmit/iptables-test",
+	"packetdrill":           "gcr.io/gvisor-presubmit/packetdrill",
+	"packetimpact":          "gcr.io/gvisor-presubmit/packetimpact",
+	"runtimes/go1.12":       "gcr.io/gvisor-presubmit/go1.12",
+	"runtimes/java11":       "gcr.io/gvisor-presubmit/java11",
+	"runtimes/nodejs12.4.0": "gcr.io/gvisor-presubmit/nodejs12.4.0",
+	"runtimes/php7.3.6":     "gcr.io/gvisor-presubmit/php7.3.6",
+	"runtimes/python3.7.3":  "gcr.io/gvisor-presubmit/python3.7.3",
+}
+
+// ImageByName mangles the image name used locally.
+//
+// For now, this is implemented as a static lookup table. In a subsequent
+// change, this will be used to reference a locally-generated image.
+func ImageByName(name string) string {
+	actual, ok := nameToActual[name]
+	if !ok {
+		panic(fmt.Sprintf("unknown image: %v", name))
+	}
+	// A terrible hack, for now execute a manual pull.
+	if out, err := exec.Command("docker", "pull", actual).CombinedOutput(); err != nil {
+		panic(fmt.Sprintf("error pulling image %q -> %q: %v, out: %s", name, actual, err, string(out)))
+	}
+	return actual
+}
+
+// ConfigureExePath configures the executable for runsc in the test environment.
+func ConfigureExePath() error {
+	path, err := FindFile("runsc/runsc")
+	if err != nil {
+		return err
+	}
+	specutils.ExePath = path
+	return nil
+}
+
+// TmpDir returns the absolute path to a writable directory that can be used as
+// scratch by the test.
+func TmpDir() string {
+	dir := os.Getenv("TEST_TMPDIR")
+	if dir == "" {
+		dir = "/tmp"
+	}
+	return dir
+}
+
+// Logger is a simple logging wrapper.
+//
+// This is designed to be implemented by *testing.T.
+type Logger interface {
+	Name() string
+	Logf(fmt string, args ...interface{})
+}
+
+// DefaultLogger logs using the log package.
+type DefaultLogger string
+
+// Name implements Logger.Name.
+func (d DefaultLogger) Name() string {
+	return string(d)
+}
+
+// Logf implements Logger.Logf.
+func (d DefaultLogger) Logf(fmt string, args ...interface{}) {
+	log.Printf(fmt, args...)
+}
+
+// Cmd is a simple wrapper.
+type Cmd struct {
+	logger Logger
+	*exec.Cmd
+}
+
+// CombinedOutput returns the output and logs.
+func (c *Cmd) CombinedOutput() ([]byte, error) {
+	out, err := c.Cmd.CombinedOutput()
+	if len(out) > 0 {
+		c.logger.Logf("output: %s", string(out))
+	}
+	if err != nil {
+		c.logger.Logf("error: %v", err)
+	}
+	return out, err
+}
+
+// Command is a simple wrapper around exec.Command, that logs.
+func Command(logger Logger, args ...string) *Cmd {
+	logger.Logf("command: %s", strings.Join(args, " "))
+	return &Cmd{
+		logger: logger,
+		Cmd:    exec.Command(args[0], args[1:]...),
+	}
+}
+
+// TestConfig returns the default configuration to use in tests. Note that
+// 'RootDir' must be set by caller if required.
+func TestConfig(t *testing.T) *boot.Config {
+	logDir := os.TempDir()
+	if dir, ok := os.LookupEnv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
+		logDir = dir + "/"
+	}
+	return &boot.Config{
+		Debug:              true,
+		DebugLog:           path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%"),
+		LogFormat:          "text",
+		DebugLogFormat:     "text",
+		LogPackets:         true,
+		Network:            boot.NetworkNone,
+		Strace:             true,
+		Platform:           "ptrace",
+		FileAccess:         boot.FileAccessExclusive,
+		NumNetworkChannels: 1,
+
+		TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
+	}
+}
+
+// NewSpecWithArgs creates a simple spec with the given args suitable for use
+// in tests.
+func NewSpecWithArgs(args ...string) *specs.Spec {
+	return &specs.Spec{
+		// The host filesystem root is the container root.
+		Root: &specs.Root{
+			Path:     "/",
+			Readonly: true,
+		},
+		Process: &specs.Process{
+			Args: args,
+			Env: []string{
+				"PATH=" + os.Getenv("PATH"),
+			},
+			Capabilities: specutils.AllCapabilities(),
+		},
+		Mounts: []specs.Mount{
+			// Hide the host /etc to avoid any side-effects.
+			// For example, bash reads /etc/passwd and if it is
+			// very big, tests can fail by timeout.
+			{
+				Type:        "tmpfs",
+				Destination: "/etc",
+			},
+			// Root is readonly, but many tests want to write to tmpdir.
+			// This creates a writable mount inside the root. Also, when tmpdir points
+			// to "/tmp", it makes the the actual /tmp to be mounted and not a tmpfs
+			// inside the sentry.
+			{
+				Type:        "bind",
+				Destination: TmpDir(),
+				Source:      TmpDir(),
+			},
+		},
+		Hostname: "runsc-test-hostname",
+	}
+}
+
+// SetupRootDir creates a root directory for containers.
+func SetupRootDir() (string, func(), error) {
+	rootDir, err := ioutil.TempDir(TmpDir(), "containers")
+	if err != nil {
+		return "", nil, fmt.Errorf("error creating root dir: %v", err)
+	}
+	return rootDir, func() { os.RemoveAll(rootDir) }, nil
+}
+
+// SetupContainer creates a bundle and root dir for the container, generates a
+// test config, and writes the spec to config.json in the bundle dir.
+func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, cleanup func(), err error) {
+	rootDir, rootCleanup, err := SetupRootDir()
+	if err != nil {
+		return "", "", nil, err
+	}
+	conf.RootDir = rootDir
+	bundleDir, bundleCleanup, err := SetupBundleDir(spec)
+	if err != nil {
+		rootCleanup()
+		return "", "", nil, err
+	}
+	return rootDir, bundleDir, func() {
+		bundleCleanup()
+		rootCleanup()
+	}, err
+}
+
+// SetupBundleDir creates a bundle dir and writes the spec to config.json.
+func SetupBundleDir(spec *specs.Spec) (string, func(), error) {
+	bundleDir, err := ioutil.TempDir(TmpDir(), "bundle")
+	if err != nil {
+		return "", nil, fmt.Errorf("error creating bundle dir: %v", err)
+	}
+	cleanup := func() { os.RemoveAll(bundleDir) }
+	if err := writeSpec(bundleDir, spec); err != nil {
+		cleanup()
+		return "", nil, fmt.Errorf("error writing spec: %v", err)
+	}
+	return bundleDir, cleanup, nil
+}
+
+// writeSpec writes the spec to disk in the given directory.
+func writeSpec(dir string, spec *specs.Spec) error {
+	b, err := json.Marshal(spec)
+	if err != nil {
+		return err
+	}
+	return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755)
+}
+
+// RandomID returns 20 random bytes following the given prefix.
+func RandomID(prefix string) string {
+	// Read 20 random bytes.
+	b := make([]byte, 20)
+	// "[Read] always returns len(p) and a nil error." --godoc
+	if _, err := rand.Read(b); err != nil {
+		panic("rand.Read failed: " + err.Error())
+	}
+	return fmt.Sprintf("%s-%s", prefix, base32.StdEncoding.EncodeToString(b))
+}
+
+// RandomContainerID generates a random container id for each test.
+//
+// The container id is used to create an abstract unix domain socket, which
+// must be unique. While the container forbids creating two containers with the
+// same name, sometimes between test runs the socket does not get cleaned up
+// quickly enough, causing container creation to fail.
+func RandomContainerID() string {
+	return RandomID("test-container-")
+}
+
+// Copy copies file from src to dst.
+func Copy(src, dst string) error {
+	in, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer in.Close()
+
+	st, err := in.Stat()
+	if err != nil {
+		return err
+	}
+
+	out, err := os.OpenFile(dst, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, st.Mode().Perm())
+	if err != nil {
+		return err
+	}
+	defer out.Close()
+
+	// Mirror the local user's permissions across all users. This is
+	// because as we inject things into the container, the UID/GID will
+	// change. Also, the build system may generate artifacts with different
+	// modes. At the top-level (volume mapping) we have a big read-only
+	// knob that can be applied to prevent modifications.
+	//
+	// Note that this must be done via a separate Chmod call, otherwise the
+	// current process's umask will get in the way.
+	var mode os.FileMode
+	if st.Mode()&0100 != 0 {
+		mode |= 0111
+	}
+	if st.Mode()&0200 != 0 {
+		mode |= 0222
+	}
+	if st.Mode()&0400 != 0 {
+		mode |= 0444
+	}
+	if err := os.Chmod(dst, mode); err != nil {
+		return err
+	}
+
+	_, err = io.Copy(out, in)
+	return err
+}
+
+// Poll is a shorthand function to poll for something with given timeout.
+func Poll(cb func() error, timeout time.Duration) error {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
+	return backoff.Retry(cb, b)
+}
+
+// WaitForHTTP tries GET requests on a port until the call succeeds or timeout.
+func WaitForHTTP(port int, timeout time.Duration) error {
+	cb := func() error {
+		c := &http.Client{
+			// Calculate timeout to be able to do minimum 5 attempts.
+			Timeout: timeout / 5,
+		}
+		url := fmt.Sprintf("http://localhost:%d/", port)
+		resp, err := c.Get(url)
+		if err != nil {
+			log.Printf("Waiting %s: %v", url, err)
+			return err
+		}
+		resp.Body.Close()
+		return nil
+	}
+	return Poll(cb, timeout)
+}
+
+// Reaper reaps child processes.
+type Reaper struct {
+	// mu protects ch, which will be nil if the reaper is not running.
+	mu sync.Mutex
+	ch chan os.Signal
+}
+
+// Start starts reaping child processes.
+func (r *Reaper) Start() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.ch != nil {
+		panic("reaper.Start called on a running reaper")
+	}
+
+	r.ch = make(chan os.Signal, 1)
+	signal.Notify(r.ch, syscall.SIGCHLD)
+
+	go func() {
+		for {
+			r.mu.Lock()
+			ch := r.ch
+			r.mu.Unlock()
+			if ch == nil {
+				return
+			}
+
+			_, ok := <-ch
+			if !ok {
+				// Channel closed.
+				return
+			}
+			for {
+				cpid, _ := syscall.Wait4(-1, nil, syscall.WNOHANG, nil)
+				if cpid < 1 {
+					break
+				}
+			}
+		}
+	}()
+}
+
+// Stop stops reaping child processes.
+func (r *Reaper) Stop() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.ch == nil {
+		panic("reaper.Stop called on a stopped reaper")
+	}
+
+	signal.Stop(r.ch)
+	close(r.ch)
+	r.ch = nil
+}
+
+// StartReaper is a helper that starts a new Reaper and returns a function to
+// stop it.
+func StartReaper() func() {
+	r := &Reaper{}
+	r.Start()
+	return r.Stop
+}
+
+// WaitUntilRead reads from the given reader until the wanted string is found
+// or until timeout.
+func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error {
+	sc := bufio.NewScanner(r)
+	if split != nil {
+		sc.Split(split)
+	}
+	// done must be accessed atomically. A value greater than 0 indicates
+	// that the read loop can exit.
+	var done uint32
+	doneCh := make(chan struct{})
+	go func() {
+		for sc.Scan() {
+			t := sc.Text()
+			if strings.Contains(t, want) {
+				atomic.StoreUint32(&done, 1)
+				close(doneCh)
+				break
+			}
+			if atomic.LoadUint32(&done) > 0 {
+				break
+			}
+		}
+	}()
+	select {
+	case <-time.After(timeout):
+		atomic.StoreUint32(&done, 1)
+		return fmt.Errorf("timeout waiting to read %q", want)
+	case <-doneCh:
+		return nil
+	}
+}
+
+// KillCommand kills the process running cmd unless it hasn't been started. It
+// returns an error if it cannot kill the process unless the reason is that the
+// process has already exited.
+//
+// KillCommand will also reap the process.
+func KillCommand(cmd *exec.Cmd) error {
+	if cmd.Process == nil {
+		return nil
+	}
+	if err := cmd.Process.Kill(); err != nil {
+		if !strings.Contains(err.Error(), "process already finished") {
+			return fmt.Errorf("failed to kill process %v: %v", cmd, err)
+		}
+	}
+	return cmd.Wait()
+}
+
+// WriteTmpFile writes text to a temporary file, closes the file, and returns
+// the name of the file. A cleanup function is also returned.
+func WriteTmpFile(pattern, text string) (string, func(), error) {
+	file, err := ioutil.TempFile(TmpDir(), pattern)
+	if err != nil {
+		return "", nil, err
+	}
+	defer file.Close()
+	if _, err := file.Write([]byte(text)); err != nil {
+		return "", nil, err
+	}
+	return file.Name(), func() { os.RemoveAll(file.Name()) }, nil
+}
+
+// IsStatic returns true iff the given file is a static binary.
+func IsStatic(filename string) (bool, error) {
+	f, err := elf.Open(filename)
+	if err != nil {
+		return false, err
+	}
+	for _, prog := range f.Progs {
+		if prog.Type == elf.PT_INTERP {
+			return false, nil // Has interpreter.
+		}
+	}
+	return true, nil
+}
+
+// TestIndicesForShard returns indices for this test shard based on the
+// TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars.
+//
+// If either of the env vars are not present, then the function will return all
+// tests. If there are more shards than there are tests, then the returned list
+// may be empty.
+func TestIndicesForShard(numTests int) ([]int, error) {
+	var (
+		shardIndex = 0
+		shardTotal = 1
+	)
+
+	indexStr, totalStr := os.Getenv("TEST_SHARD_INDEX"), os.Getenv("TEST_TOTAL_SHARDS")
+	if indexStr != "" && totalStr != "" {
+		// Parse index and total to ints.
+		var err error
+		shardIndex, err = strconv.Atoi(indexStr)
+		if err != nil {
+			return nil, fmt.Errorf("invalid TEST_SHARD_INDEX %q: %v", indexStr, err)
+		}
+		shardTotal, err = strconv.Atoi(totalStr)
+		if err != nil {
+			return nil, fmt.Errorf("invalid TEST_TOTAL_SHARDS %q: %v", totalStr, err)
+		}
+	}
+
+	// Calculate!
+	var indices []int
+	numBlocks := int(math.Ceil(float64(numTests) / float64(shardTotal)))
+	for i := 0; i < numBlocks; i++ {
+		pick := i*shardTotal + shardIndex
+		if pick < numTests {
+			indices = append(indices, pick)
+		}
+	}
+	return indices, nil
+}
diff --git a/pkg/test/testutil/testutil_runfiles.go b/pkg/test/testutil/testutil_runfiles.go
new file mode 100644
index 000000000..ece9ea9a1
--- /dev/null
+++ b/pkg/test/testutil/testutil_runfiles.go
@@ -0,0 +1,75 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testutil
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// FindFile searchs for a file inside the test run environment. It returns the
+// full path to the file. It fails if none or more than one file is found.
+func FindFile(path string) (string, error) {
+	wd, err := os.Getwd()
+	if err != nil {
+		return "", err
+	}
+
+	// The test root is demarcated by a path element called "__main__". Search for
+	// it backwards from the working directory.
+	root := wd
+	for {
+		dir, name := filepath.Split(root)
+		if name == "__main__" {
+			break
+		}
+		if len(dir) == 0 {
+			return "", fmt.Errorf("directory __main__ not found in %q", wd)
+		}
+		// Remove ending slash to loop around.
+		root = dir[:len(dir)-1]
+	}
+
+	// Annoyingly, bazel adds the build type to the directory path for go
+	// binaries, but not for c++ binaries. We use two different patterns to
+	// to find our file.
+	patterns := []string{
+		// Try the obvious path first.
+		filepath.Join(root, path),
+		// If it was a go binary, use a wildcard to match the build
+		// type. The pattern is: /test-path/__main__/directories/*/file.
+		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
+	}
+
+	for _, p := range patterns {
+		matches, err := filepath.Glob(p)
+		if err != nil {
+			// "The only possible returned error is ErrBadPattern,
+			// when pattern is malformed." -godoc
+			return "", fmt.Errorf("error globbing %q: %v", p, err)
+		}
+		switch len(matches) {
+		case 0:
+			// Try the next pattern.
+		case 1:
+			// We found it.
+			return matches[0], nil
+		default:
+			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
+		}
+	}
+	return "", fmt.Errorf("file %q not found", path)
+}
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 72c2fe381..69dcc74f2 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -23,6 +23,7 @@ go_library(
         "vfs.go",
     ],
     visibility = [
+        "//pkg/test:__subpackages__",
         "//runsc:__subpackages__",
         "//test:__subpackages__",
     ],
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index 4900fbe16..af3538ef0 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -82,11 +82,11 @@ go_test(
         "//pkg/log",
         "//pkg/sentry/control",
         "//pkg/sentry/kernel/auth",
+        "//pkg/test/testutil",
         "//pkg/urpc",
         "//runsc/boot",
         "//runsc/container",
         "//runsc/specutils",
-        "//runsc/testutil",
         "@com_github_google_go-cmp//cmp:go_default_library",
         "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go
index 9360d7442..a84067112 100644
--- a/runsc/cmd/capability_test.go
+++ b/runsc/cmd/capability_test.go
@@ -23,10 +23,10 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/syndtr/gocapability/capability"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 func init() {
@@ -90,16 +90,15 @@ func TestCapabilities(t *testing.T) {
 	// Use --network=host to make sandbox use spec's capabilities.
 	conf.Network = boot.NetworkHost
 
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create and start the container.
 	args := container.Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 0aaeea3a8..331b8e866 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -33,13 +33,15 @@ go_test(
     size = "large",
     srcs = [
         "console_test.go",
+        "container_norace_test.go",
+        "container_race_test.go",
         "container_test.go",
         "multi_container_test.go",
         "shared_volume_test.go",
     ],
     data = [
         "//runsc",
-        "//runsc/container/test_app",
+        "//test/cmd/test_app",
     ],
     library = ":container",
     shard_count = 5,
@@ -54,12 +56,12 @@ go_test(
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sync",
+        "//pkg/test/testutil",
         "//pkg/unet",
         "//pkg/urpc",
         "//runsc/boot",
         "//runsc/boot/platforms",
         "//runsc/specutils",
-        "//runsc/testutil",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_kr_pty//:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index af245b6d8..294dca5e7 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -29,9 +29,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/pkg/urpc"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 // socketPath creates a path inside bundleDir and ensures that the returned
@@ -58,25 +58,26 @@ func socketPath(bundleDir string) (string, error) {
 }
 
 // createConsoleSocket creates a socket at the given path that will receive a
-// console fd from the sandbox. If no error occurs, it returns the server
-// socket and a cleanup function.
-func createConsoleSocket(path string) (*unet.ServerSocket, func() error, error) {
+// console fd from the sandbox. If an error occurs, t.Fatalf will be called.
+// The function returning should be deferred as cleanup.
+func createConsoleSocket(t *testing.T, path string) (*unet.ServerSocket, func()) {
+	t.Helper()
 	srv, err := unet.BindAndListen(path, false)
 	if err != nil {
-		return nil, nil, fmt.Errorf("error binding and listening to socket %q: %v", path, err)
+		t.Fatalf("error binding and listening to socket %q: %v", path, err)
 	}
 
-	cleanup := func() error {
+	cleanup := func() {
+		// Log errors; nothing can be done.
 		if err := srv.Close(); err != nil {
-			return fmt.Errorf("error closing socket %q: %v", path, err)
+			t.Logf("error closing socket %q: %v", path, err)
 		}
 		if err := os.Remove(path); err != nil {
-			return fmt.Errorf("error removing socket %q: %v", path, err)
+			t.Logf("error removing socket %q: %v", path, err)
 		}
-		return nil
 	}
 
-	return srv, cleanup, nil
+	return srv, cleanup
 }
 
 // receiveConsolePTY accepts a connection on the server socket and reads fds.
@@ -118,45 +119,42 @@ func receiveConsolePTY(srv *unet.ServerSocket) (*os.File, error) {
 
 // Test that an pty FD is sent over the console socket if one is provided.
 func TestConsoleSocket(t *testing.T) {
-	for _, conf := range configs(t, all...) {
-		t.Logf("Running test with conf: %+v", conf)
-		spec := testutil.NewSpecWithArgs("true")
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			spec := testutil.NewSpecWithArgs("true")
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		sock, err := socketPath(bundleDir)
-		if err != nil {
-			t.Fatalf("error getting socket path: %v", err)
-		}
-		srv, cleanup, err := createConsoleSocket(sock)
-		if err != nil {
-			t.Fatalf("error creating socket at %q: %v", sock, err)
-		}
-		defer cleanup()
-
-		// Create the container and pass the socket name.
-		args := Args{
-			ID:            testutil.UniqueContainerID(),
-			Spec:          spec,
-			BundleDir:     bundleDir,
-			ConsoleSocket: sock,
-		}
-		c, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer c.Destroy()
+			sock, err := socketPath(bundleDir)
+			if err != nil {
+				t.Fatalf("error getting socket path: %v", err)
+			}
+			srv, cleanup := createConsoleSocket(t, sock)
+			defer cleanup()
+
+			// Create the container and pass the socket name.
+			args := Args{
+				ID:            testutil.RandomContainerID(),
+				Spec:          spec,
+				BundleDir:     bundleDir,
+				ConsoleSocket: sock,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
 
-		// Make sure we get a console PTY.
-		ptyMaster, err := receiveConsolePTY(srv)
-		if err != nil {
-			t.Fatalf("error receiving console FD: %v", err)
-		}
-		ptyMaster.Close()
+			// Make sure we get a console PTY.
+			ptyMaster, err := receiveConsolePTY(srv)
+			if err != nil {
+				t.Fatalf("error receiving console FD: %v", err)
+			}
+			ptyMaster.Close()
+		})
 	}
 }
 
@@ -165,16 +163,15 @@ func TestJobControlSignalExec(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
 	conf := testutil.TestConfig(t)
 
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create and start the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
@@ -292,26 +289,22 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("/bin/bash", "--noprofile", "--norc")
 	spec.Process.Terminal = true
 
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	sock, err := socketPath(bundleDir)
 	if err != nil {
 		t.Fatalf("error getting socket path: %v", err)
 	}
-	srv, cleanup, err := createConsoleSocket(sock)
-	if err != nil {
-		t.Fatalf("error creating socket at %q: %v", sock, err)
-	}
+	srv, cleanup := createConsoleSocket(t, sock)
 	defer cleanup()
 
 	// Create the container and pass the socket name.
 	args := Args{
-		ID:            testutil.UniqueContainerID(),
+		ID:            testutil.RandomContainerID(),
 		Spec:          spec,
 		BundleDir:     bundleDir,
 		ConsoleSocket: sock,
@@ -368,7 +361,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 		{PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
-		t.Fatal(err)
+		t.Fatalf("error waiting for processes: %v", err)
 	}
 
 	// Execute sleep via the terminal.
@@ -377,7 +370,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	// Wait for sleep to start.
 	expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}})
 	if err := waitForProcessList(c, expectedPL); err != nil {
-		t.Fatal(err)
+		t.Fatalf("error waiting for processes: %v", err)
 	}
 
 	// Reset the pty buffer, so there is less output for us to scan later.
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 7233659b1..117ea7d7b 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -274,7 +274,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 	}
 
 	if err := os.MkdirAll(conf.RootDir, 0711); err != nil {
-		return nil, fmt.Errorf("creating container root directory: %v", err)
+		return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err)
 	}
 
 	c := &Container{
diff --git a/runsc/container/container_norace_test.go b/runsc/container/container_norace_test.go
new file mode 100644
index 000000000..838c1e20a
--- /dev/null
+++ b/runsc/container/container_norace_test.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !race
+
+package container
+
+// Allow both kvm and ptrace for non-race builds.
+var platformOptions = []configOption{ptrace, kvm}
diff --git a/runsc/container/container_race_test.go b/runsc/container/container_race_test.go
new file mode 100644
index 000000000..9fb4c4fc0
--- /dev/null
+++ b/runsc/container/container_race_test.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package container
+
+// Only enabled ptrace with race builds.
+var platformOptions = []configOption{ptrace}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 5db6d64aa..3ff89f38c 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -39,10 +39,10 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 // waitForProcessList waits for the given process list to show up in the container.
@@ -215,16 +215,15 @@ func readOutputNum(file string, position int) (int, error) {
 // run starts the sandbox and waits for it to exit, checking that the
 // application succeeded.
 func run(spec *specs.Spec, conf *boot.Config) error {
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		return fmt.Errorf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create, start and wait for the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 		Attached:  true,
@@ -243,35 +242,41 @@ type configOption int
 
 const (
 	overlay configOption = iota
+	ptrace
 	kvm
 	nonExclusiveFS
 )
 
-var noOverlay = []configOption{kvm, nonExclusiveFS}
-var all = append(noOverlay, overlay)
+var (
+	noOverlay = append(platformOptions, nonExclusiveFS)
+	all       = append(noOverlay, overlay)
+)
 
 // configs generates different configurations to run tests.
-func configs(t *testing.T, opts ...configOption) []*boot.Config {
+func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
 	// Always load the default config.
-	cs := []*boot.Config{testutil.TestConfig(t)}
-
+	cs := make(map[string]*boot.Config)
 	for _, o := range opts {
-		c := testutil.TestConfig(t)
 		switch o {
 		case overlay:
+			c := testutil.TestConfig(t)
 			c.Overlay = true
+			cs["overlay"] = c
+		case ptrace:
+			c := testutil.TestConfig(t)
+			c.Platform = platforms.Ptrace
+			cs["ptrace"] = c
 		case kvm:
-			// TODO(b/112165693): KVM tests are flaky. Disable until fixed.
-			continue
-
+			c := testutil.TestConfig(t)
 			c.Platform = platforms.KVM
+			cs["kvm"] = c
 		case nonExclusiveFS:
+			c := testutil.TestConfig(t)
 			c.FileAccess = boot.FileAccessShared
+			cs["non-exclusive"] = c
 		default:
 			panic(fmt.Sprintf("unknown config option %v", o))
-
 		}
-		cs = append(cs, c)
 	}
 	return cs
 }
@@ -285,133 +290,133 @@ func TestLifecycle(t *testing.T) {
 	childReaper.Start()
 	defer childReaper.Stop()
 
-	for _, conf := range configs(t, all...) {
-		t.Logf("Running test with conf: %+v", conf)
-		// The container will just sleep for a long time.  We will kill it before
-		// it finishes sleeping.
-		spec := testutil.NewSpecWithArgs("sleep", "100")
-
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
-
-		// expectedPL lists the expected process state of the container.
-		expectedPL := []*control.Process{
-			{
-				UID:     0,
-				PID:     1,
-				PPID:    0,
-				C:       0,
-				Cmd:     "sleep",
-				Threads: []kernel.ThreadID{1},
-			},
-		}
-		// Create the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		c, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer c.Destroy()
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			// The container will just sleep for a long time.  We will kill it before
+			// it finishes sleeping.
+			spec := testutil.NewSpecWithArgs("sleep", "100")
 
-		// Load the container from disk and check the status.
-		c, err = Load(rootDir, args.ID)
-		if err != nil {
-			t.Fatalf("error loading container: %v", err)
-		}
-		if got, want := c.Status, Created; got != want {
-			t.Errorf("container status got %v, want %v", got, want)
-		}
+			rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// List should return the container id.
-		ids, err := List(rootDir)
-		if err != nil {
-			t.Fatalf("error listing containers: %v", err)
-		}
-		if got, want := ids, []string{args.ID}; !reflect.DeepEqual(got, want) {
-			t.Errorf("container list got %v, want %v", got, want)
-		}
+			// expectedPL lists the expected process state of the container.
+			expectedPL := []*control.Process{
+				{
+					UID:     0,
+					PID:     1,
+					PPID:    0,
+					C:       0,
+					Cmd:     "sleep",
+					Threads: []kernel.ThreadID{1},
+				},
+			}
+			// Create the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
 
-		// Start the container.
-		if err := c.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Load the container from disk and check the status.
+			c, err = Load(rootDir, args.ID)
+			if err != nil {
+				t.Fatalf("error loading container: %v", err)
+			}
+			if got, want := c.Status, Created; got != want {
+				t.Errorf("container status got %v, want %v", got, want)
+			}
 
-		// Load the container from disk and check the status.
-		c, err = Load(rootDir, args.ID)
-		if err != nil {
-			t.Fatalf("error loading container: %v", err)
-		}
-		if got, want := c.Status, Running; got != want {
-			t.Errorf("container status got %v, want %v", got, want)
-		}
+			// List should return the container id.
+			ids, err := List(rootDir)
+			if err != nil {
+				t.Fatalf("error listing containers: %v", err)
+			}
+			if got, want := ids, []string{args.ID}; !reflect.DeepEqual(got, want) {
+				t.Errorf("container list got %v, want %v", got, want)
+			}
 
-		// Verify that "sleep 100" is running.
-		if err := waitForProcessList(c, expectedPL); err != nil {
-			t.Error(err)
-		}
+			// Start the container.
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// Wait on the container.
-		var wg sync.WaitGroup
-		wg.Add(1)
-		ch := make(chan struct{})
-		go func() {
-			ch <- struct{}{}
-			ws, err := c.Wait()
+			// Load the container from disk and check the status.
+			c, err = Load(rootDir, args.ID)
 			if err != nil {
-				t.Fatalf("error waiting on container: %v", err)
+				t.Fatalf("error loading container: %v", err)
 			}
-			if got, want := ws.Signal(), syscall.SIGTERM; got != want {
-				t.Fatalf("got signal %v, want %v", got, want)
+			if got, want := c.Status, Running; got != want {
+				t.Errorf("container status got %v, want %v", got, want)
 			}
-			wg.Done()
-		}()
 
-		// Wait a bit to ensure that we've started waiting on the
-		// container before we signal.
-		<-ch
-		time.Sleep(100 * time.Millisecond)
-		// Send the container a SIGTERM which will cause it to stop.
-		if err := c.SignalContainer(syscall.SIGTERM, false); err != nil {
-			t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err)
-		}
-		// Wait for it to die.
-		wg.Wait()
+			// Verify that "sleep 100" is running.
+			if err := waitForProcessList(c, expectedPL); err != nil {
+				t.Error(err)
+			}
 
-		// Load the container from disk and check the status.
-		c, err = Load(rootDir, args.ID)
-		if err != nil {
-			t.Fatalf("error loading container: %v", err)
-		}
-		if got, want := c.Status, Stopped; got != want {
-			t.Errorf("container status got %v, want %v", got, want)
-		}
+			// Wait on the container.
+			ch := make(chan error)
+			go func() {
+				ws, err := c.Wait()
+				if err != nil {
+					ch <- err
+				}
+				if got, want := ws.Signal(), syscall.SIGTERM; got != want {
+					ch <- fmt.Errorf("got signal %v, want %v", got, want)
+				}
+				ch <- nil
+			}()
 
-		// Destroy the container.
-		if err := c.Destroy(); err != nil {
-			t.Fatalf("error destroying container: %v", err)
-		}
+			// Wait a bit to ensure that we've started waiting on
+			// the container before we signal.
+			time.Sleep(time.Second)
 
-		// List should not return the container id.
-		ids, err = List(rootDir)
-		if err != nil {
-			t.Fatalf("error listing containers: %v", err)
-		}
-		if len(ids) != 0 {
-			t.Errorf("expected container list to be empty, but got %v", ids)
-		}
+			// Send the container a SIGTERM which will cause it to stop.
+			if err := c.SignalContainer(syscall.SIGTERM, false); err != nil {
+				t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err)
+			}
 
-		// Loading the container by id should fail.
-		if _, err = Load(rootDir, args.ID); err == nil {
-			t.Errorf("expected loading destroyed container to fail, but it did not")
-		}
+			// Wait for it to die.
+			if err := <-ch; err != nil {
+				t.Fatalf("error waiting for container: %v", err)
+			}
+
+			// Load the container from disk and check the status.
+			c, err = Load(rootDir, args.ID)
+			if err != nil {
+				t.Fatalf("error loading container: %v", err)
+			}
+			if got, want := c.Status, Stopped; got != want {
+				t.Errorf("container status got %v, want %v", got, want)
+			}
+
+			// Destroy the container.
+			if err := c.Destroy(); err != nil {
+				t.Fatalf("error destroying container: %v", err)
+			}
+
+			// List should not return the container id.
+			ids, err = List(rootDir)
+			if err != nil {
+				t.Fatalf("error listing containers: %v", err)
+			}
+			if len(ids) != 0 {
+				t.Errorf("expected container list to be empty, but got %v", ids)
+			}
+
+			// Loading the container by id should fail.
+			if _, err = Load(rootDir, args.ID); err == nil {
+				t.Errorf("expected loading destroyed container to fail, but it did not")
+			}
+		})
 	}
 }
 
@@ -420,12 +425,14 @@ func TestExePath(t *testing.T) {
 	// Create two directories that will be prepended to PATH.
 	firstPath, err := ioutil.TempDir(testutil.TmpDir(), "first")
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("error creating temporary directory: %v", err)
 	}
+	defer os.RemoveAll(firstPath)
 	secondPath, err := ioutil.TempDir(testutil.TmpDir(), "second")
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("error creating temporary directory: %v", err)
 	}
+	defer os.RemoveAll(secondPath)
 
 	// Create two minimal executables in the second path, two of which
 	// will be masked by files in first path.
@@ -433,11 +440,11 @@ func TestExePath(t *testing.T) {
 		path := filepath.Join(secondPath, p)
 		f, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0777)
 		if err != nil {
-			t.Fatal(err)
+			t.Fatalf("error opening path: %v", err)
 		}
 		defer f.Close()
 		if _, err := io.WriteString(f, "#!/bin/true\n"); err != nil {
-			t.Fatal(err)
+			t.Fatalf("error writing contents: %v", err)
 		}
 	}
 
@@ -446,7 +453,7 @@ func TestExePath(t *testing.T) {
 	nonExecutable := filepath.Join(firstPath, "masked1")
 	f2, err := os.OpenFile(nonExecutable, os.O_CREATE|os.O_EXCL, 0666)
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("error opening file: %v", err)
 	}
 	f2.Close()
 
@@ -454,68 +461,69 @@ func TestExePath(t *testing.T) {
 	// executable in the second.
 	nonRegular := filepath.Join(firstPath, "masked2")
 	if err := os.Mkdir(nonRegular, 0777); err != nil {
-		t.Fatal(err)
-	}
-
-	for _, conf := range configs(t, overlay) {
-		t.Logf("Running test with conf: %+v", conf)
-		for _, test := range []struct {
-			path    string
-			success bool
-		}{
-			{path: "true", success: true},
-			{path: "bin/true", success: true},
-			{path: "/bin/true", success: true},
-			{path: "thisfiledoesntexit", success: false},
-			{path: "bin/thisfiledoesntexit", success: false},
-			{path: "/bin/thisfiledoesntexit", success: false},
-
-			{path: "unmasked", success: true},
-			{path: filepath.Join(firstPath, "unmasked"), success: false},
-			{path: filepath.Join(secondPath, "unmasked"), success: true},
-
-			{path: "masked1", success: true},
-			{path: filepath.Join(firstPath, "masked1"), success: false},
-			{path: filepath.Join(secondPath, "masked1"), success: true},
-
-			{path: "masked2", success: true},
-			{path: filepath.Join(firstPath, "masked2"), success: false},
-			{path: filepath.Join(secondPath, "masked2"), success: true},
-		} {
-			spec := testutil.NewSpecWithArgs(test.path)
-			spec.Process.Env = []string{
-				fmt.Sprintf("PATH=%s:%s:%s", firstPath, secondPath, os.Getenv("PATH")),
-			}
-
-			rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-			if err != nil {
-				t.Fatalf("exec: %s, error setting up container: %v", test.path, err)
-			}
-
-			args := Args{
-				ID:        testutil.UniqueContainerID(),
-				Spec:      spec,
-				BundleDir: bundleDir,
-				Attached:  true,
-			}
-			ws, err := Run(conf, args)
+		t.Fatalf("error making directory: %v", err)
+	}
+
+	for name, conf := range configs(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			for _, test := range []struct {
+				path    string
+				success bool
+			}{
+				{path: "true", success: true},
+				{path: "bin/true", success: true},
+				{path: "/bin/true", success: true},
+				{path: "thisfiledoesntexit", success: false},
+				{path: "bin/thisfiledoesntexit", success: false},
+				{path: "/bin/thisfiledoesntexit", success: false},
+
+				{path: "unmasked", success: true},
+				{path: filepath.Join(firstPath, "unmasked"), success: false},
+				{path: filepath.Join(secondPath, "unmasked"), success: true},
+
+				{path: "masked1", success: true},
+				{path: filepath.Join(firstPath, "masked1"), success: false},
+				{path: filepath.Join(secondPath, "masked1"), success: true},
+
+				{path: "masked2", success: true},
+				{path: filepath.Join(firstPath, "masked2"), success: false},
+				{path: filepath.Join(secondPath, "masked2"), success: true},
+			} {
+				t.Run(fmt.Sprintf("path=%s,success=%t", test.path, test.success), func(t *testing.T) {
+					spec := testutil.NewSpecWithArgs(test.path)
+					spec.Process.Env = []string{
+						fmt.Sprintf("PATH=%s:%s:%s", firstPath, secondPath, os.Getenv("PATH")),
+					}
 
-			os.RemoveAll(rootDir)
-			os.RemoveAll(bundleDir)
+					_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+					if err != nil {
+						t.Fatalf("exec: error setting up container: %v", err)
+					}
+					defer cleanup()
 
-			if test.success {
-				if err != nil {
-					t.Errorf("exec: %s, error running container: %v", test.path, err)
-				}
-				if ws.ExitStatus() != 0 {
-					t.Errorf("exec: %s, got exit status %v want %v", test.path, ws.ExitStatus(), 0)
-				}
-			} else {
-				if err == nil {
-					t.Errorf("exec: %s, got: no error, want: error", test.path)
-				}
+					args := Args{
+						ID:        testutil.RandomContainerID(),
+						Spec:      spec,
+						BundleDir: bundleDir,
+						Attached:  true,
+					}
+					ws, err := Run(conf, args)
+
+					if test.success {
+						if err != nil {
+							t.Errorf("exec: error running container: %v", err)
+						}
+						if ws.ExitStatus() != 0 {
+							t.Errorf("exec: got exit status %v want %v", ws.ExitStatus(), 0)
+						}
+					} else {
+						if err == nil {
+							t.Errorf("exec: got: no error, want: error")
+						}
+					}
+				})
 			}
-		}
+		})
 	}
 }
 
@@ -534,15 +542,14 @@ func doAppExitStatus(t *testing.T, vfs2 bool) {
 	succSpec := testutil.NewSpecWithArgs("true")
 	conf := testutil.TestConfig(t)
 	conf.VFS2 = vfs2
-	rootDir, bundleDir, err := testutil.SetupContainer(succSpec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(succSpec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      succSpec,
 		BundleDir: bundleDir,
 		Attached:  true,
@@ -559,15 +566,14 @@ func doAppExitStatus(t *testing.T, vfs2 bool) {
 	wantStatus := 123
 	errSpec := testutil.NewSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus))
 
-	rootDir2, bundleDir2, err := testutil.SetupContainer(errSpec, conf)
+	_, bundleDir2, cleanup2, err := testutil.SetupContainer(errSpec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir2)
-	defer os.RemoveAll(bundleDir2)
+	defer cleanup2()
 
 	args2 := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      errSpec,
 		BundleDir: bundleDir2,
 		Attached:  true,
@@ -583,166 +589,163 @@ func doAppExitStatus(t *testing.T, vfs2 bool) {
 
 // TestExec verifies that a container can exec a new program.
 func TestExec(t *testing.T) {
-	for _, conf := range configs(t, overlay) {
-		t.Logf("Running test with conf: %+v", conf)
+	for name, conf := range configs(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			const uid = 343
+			spec := testutil.NewSpecWithArgs("sleep", "100")
 
-		const uid = 343
-		spec := testutil.NewSpecWithArgs("sleep", "100")
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// Create and start the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont.Destroy()
-		if err := cont.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// expectedPL lists the expected process state of the container.
+			expectedPL := []*control.Process{
+				{
+					UID:     0,
+					PID:     1,
+					PPID:    0,
+					C:       0,
+					Cmd:     "sleep",
+					Threads: []kernel.ThreadID{1},
+				},
+				{
+					UID:     uid,
+					PID:     2,
+					PPID:    0,
+					C:       0,
+					Cmd:     "sleep",
+					Threads: []kernel.ThreadID{2},
+				},
+			}
 
-		// expectedPL lists the expected process state of the container.
-		expectedPL := []*control.Process{
-			{
-				UID:     0,
-				PID:     1,
-				PPID:    0,
-				C:       0,
-				Cmd:     "sleep",
-				Threads: []kernel.ThreadID{1},
-			},
-			{
-				UID:     uid,
-				PID:     2,
-				PPID:    0,
-				C:       0,
-				Cmd:     "sleep",
-				Threads: []kernel.ThreadID{2},
-			},
-		}
+			// Verify that "sleep 100" is running.
+			if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
+				t.Error(err)
+			}
 
-		// Verify that "sleep 100" is running.
-		if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
-			t.Error(err)
-		}
+			execArgs := &control.ExecArgs{
+				Filename:         "/bin/sleep",
+				Argv:             []string{"/bin/sleep", "5"},
+				WorkingDirectory: "/",
+				KUID:             uid,
+			}
 
-		execArgs := &control.ExecArgs{
-			Filename:         "/bin/sleep",
-			Argv:             []string{"/bin/sleep", "5"},
-			WorkingDirectory: "/",
-			KUID:             uid,
-		}
+			// Verify that "sleep 100" and "sleep 5" are running
+			// after exec.  First, start running exec (whick
+			// blocks).
+			ch := make(chan error)
+			go func() {
+				exitStatus, err := cont.executeSync(execArgs)
+				if err != nil {
+					ch <- err
+				} else if exitStatus != 0 {
+					ch <- fmt.Errorf("failed with exit status: %v", exitStatus)
+				} else {
+					ch <- nil
+				}
+			}()
 
-		// Verify that "sleep 100" and "sleep 5" are running after exec.
-		// First, start running exec (whick blocks).
-		status := make(chan error, 1)
-		go func() {
-			exitStatus, err := cont.executeSync(execArgs)
-			if err != nil {
-				log.Debugf("error executing: %v", err)
-				status <- err
-			} else if exitStatus != 0 {
-				log.Debugf("bad status: %d", exitStatus)
-				status <- fmt.Errorf("failed with exit status: %v", exitStatus)
-			} else {
-				status <- nil
+			if err := waitForProcessList(cont, expectedPL); err != nil {
+				t.Fatalf("error waiting for processes: %v", err)
 			}
-		}()
-
-		if err := waitForProcessList(cont, expectedPL); err != nil {
-			t.Fatal(err)
-		}
 
-		// Ensure that exec finished without error.
-		select {
-		case <-time.After(10 * time.Second):
-			t.Fatalf("container timed out waiting for exec to finish.")
-		case st := <-status:
-			if st != nil {
-				t.Errorf("container failed to exec %v: %v", args, err)
+			// Ensure that exec finished without error.
+			select {
+			case <-time.After(10 * time.Second):
+				t.Fatalf("container timed out waiting for exec to finish.")
+			case err := <-ch:
+				if err != nil {
+					t.Errorf("container failed to exec %v: %v", args, err)
+				}
 			}
-		}
+		})
 	}
 }
 
 // TestKillPid verifies that we can signal individual exec'd processes.
 func TestKillPid(t *testing.T) {
-	for _, conf := range configs(t, overlay) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		app, err := testutil.FindFile("runsc/container/test_app/test_app")
-		if err != nil {
-			t.Fatal("error finding test_app:", err)
-		}
+	for name, conf := range configs(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			app, err := testutil.FindFile("test/cmd/test_app/test_app")
+			if err != nil {
+				t.Fatal("error finding test_app:", err)
+			}
 
-		const nProcs = 4
-		spec := testutil.NewSpecWithArgs(app, "task-tree", "--depth", strconv.Itoa(nProcs-1), "--width=1", "--pause=true")
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+			const nProcs = 4
+			spec := testutil.NewSpecWithArgs(app, "task-tree", "--depth", strconv.Itoa(nProcs-1), "--width=1", "--pause=true")
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create and start the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont.Destroy()
-		if err := cont.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// Verify that all processes are running.
-		if err := waitForProcessCount(cont, nProcs); err != nil {
-			t.Fatalf("timed out waiting for processes to start: %v", err)
-		}
+			// Verify that all processes are running.
+			if err := waitForProcessCount(cont, nProcs); err != nil {
+				t.Fatalf("timed out waiting for processes to start: %v", err)
+			}
 
-		// Kill the child process with the largest PID.
-		procs, err := cont.Processes()
-		if err != nil {
-			t.Fatalf("failed to get process list: %v", err)
-		}
-		var pid int32
-		for _, p := range procs {
-			if pid < int32(p.PID) {
-				pid = int32(p.PID)
+			// Kill the child process with the largest PID.
+			procs, err := cont.Processes()
+			if err != nil {
+				t.Fatalf("failed to get process list: %v", err)
+			}
+			var pid int32
+			for _, p := range procs {
+				if pid < int32(p.PID) {
+					pid = int32(p.PID)
+				}
+			}
+			if err := cont.SignalProcess(syscall.SIGKILL, pid); err != nil {
+				t.Fatalf("failed to signal process %d: %v", pid, err)
 			}
-		}
-		if err := cont.SignalProcess(syscall.SIGKILL, pid); err != nil {
-			t.Fatalf("failed to signal process %d: %v", pid, err)
-		}
 
-		// Verify that one process is gone.
-		if err := waitForProcessCount(cont, nProcs-1); err != nil {
-			t.Fatal(err)
-		}
+			// Verify that one process is gone.
+			if err := waitForProcessCount(cont, nProcs-1); err != nil {
+				t.Fatalf("error waiting for processes: %v", err)
+			}
 
-		procs, err = cont.Processes()
-		if err != nil {
-			t.Fatalf("failed to get process list: %v", err)
-		}
-		for _, p := range procs {
-			if pid == int32(p.PID) {
-				t.Fatalf("pid %d is still alive, which should be killed", pid)
+			procs, err = cont.Processes()
+			if err != nil {
+				t.Fatalf("failed to get process list: %v", err)
 			}
-		}
+			for _, p := range procs {
+				if pid == int32(p.PID) {
+					t.Fatalf("pid %d is still alive, which should be killed", pid)
+				}
+			}
+		})
 	}
 }
 
@@ -753,160 +756,160 @@ func TestKillPid(t *testing.T) {
 // be the next consecutive number after the last number from the checkpointed container.
 func TestCheckpointRestore(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
-	for _, conf := range configs(t, noOverlay...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test")
-		if err != nil {
-			t.Fatalf("ioutil.TempDir failed: %v", err)
-		}
-		if err := os.Chmod(dir, 0777); err != nil {
-			t.Fatalf("error chmoding file: %q, %v", dir, err)
-		}
+	for name, conf := range configs(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir failed: %v", err)
+			}
+			defer os.RemoveAll(dir)
+			if err := os.Chmod(dir, 0777); err != nil {
+				t.Fatalf("error chmoding file: %q, %v", dir, err)
+			}
 
-		outputPath := filepath.Join(dir, "output")
-		outputFile, err := createWriteableOutputFile(outputPath)
-		if err != nil {
-			t.Fatalf("error creating output file: %v", err)
-		}
-		defer outputFile.Close()
+			outputPath := filepath.Join(dir, "output")
+			outputFile, err := createWriteableOutputFile(outputPath)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile.Close()
 
-		script := fmt.Sprintf("for ((i=0; ;i++)); do echo $i >> %q; sleep 1; done", outputPath)
-		spec := testutil.NewSpecWithArgs("bash", "-c", script)
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+			script := fmt.Sprintf("for ((i=0; ;i++)); do echo $i >> %q; sleep 1; done", outputPath)
+			spec := testutil.NewSpecWithArgs("bash", "-c", script)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create and start the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont.Destroy()
-		if err := cont.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// Set the image path, which is where the checkpoint image will be saved.
-		imagePath := filepath.Join(dir, "test-image-file")
+			// Set the image path, which is where the checkpoint image will be saved.
+			imagePath := filepath.Join(dir, "test-image-file")
 
-		// Create the image file and open for writing.
-		file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
-		if err != nil {
-			t.Fatalf("error opening new file at imagePath: %v", err)
-		}
-		defer file.Close()
+			// Create the image file and open for writing.
+			file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
+			if err != nil {
+				t.Fatalf("error opening new file at imagePath: %v", err)
+			}
+			defer file.Close()
 
-		// Wait until application has ran.
-		if err := waitForFileNotEmpty(outputFile); err != nil {
-			t.Fatalf("Failed to wait for output file: %v", err)
-		}
+			// Wait until application has ran.
+			if err := waitForFileNotEmpty(outputFile); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
 
-		// Checkpoint running container; save state into new file.
-		if err := cont.Checkpoint(file); err != nil {
-			t.Fatalf("error checkpointing container to empty file: %v", err)
-		}
-		defer os.RemoveAll(imagePath)
+			// Checkpoint running container; save state into new file.
+			if err := cont.Checkpoint(file); err != nil {
+				t.Fatalf("error checkpointing container to empty file: %v", err)
+			}
+			defer os.RemoveAll(imagePath)
 
-		lastNum, err := readOutputNum(outputPath, -1)
-		if err != nil {
-			t.Fatalf("error with outputFile: %v", err)
-		}
+			lastNum, err := readOutputNum(outputPath, -1)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
 
-		// Delete and recreate file before restoring.
-		if err := os.Remove(outputPath); err != nil {
-			t.Fatalf("error removing file")
-		}
-		outputFile2, err := createWriteableOutputFile(outputPath)
-		if err != nil {
-			t.Fatalf("error creating output file: %v", err)
-		}
-		defer outputFile2.Close()
+			// Delete and recreate file before restoring.
+			if err := os.Remove(outputPath); err != nil {
+				t.Fatalf("error removing file")
+			}
+			outputFile2, err := createWriteableOutputFile(outputPath)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile2.Close()
 
-		// Restore into a new container.
-		args2 := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont2, err := New(conf, args2)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont2.Destroy()
+			// Restore into a new container.
+			args2 := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont2, err := New(conf, args2)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont2.Destroy()
 
-		if err := cont2.Restore(spec, conf, imagePath); err != nil {
-			t.Fatalf("error restoring container: %v", err)
-		}
+			if err := cont2.Restore(spec, conf, imagePath); err != nil {
+				t.Fatalf("error restoring container: %v", err)
+			}
 
-		// Wait until application has ran.
-		if err := waitForFileNotEmpty(outputFile2); err != nil {
-			t.Fatalf("Failed to wait for output file: %v", err)
-		}
+			// Wait until application has ran.
+			if err := waitForFileNotEmpty(outputFile2); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
 
-		firstNum, err := readOutputNum(outputPath, 0)
-		if err != nil {
-			t.Fatalf("error with outputFile: %v", err)
-		}
+			firstNum, err := readOutputNum(outputPath, 0)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
 
-		// Check that lastNum is one less than firstNum and that the container picks
-		// up from where it left off.
-		if lastNum+1 != firstNum {
-			t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum)
-		}
-		cont2.Destroy()
+			// Check that lastNum is one less than firstNum and that the container picks
+			// up from where it left off.
+			if lastNum+1 != firstNum {
+				t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum)
+			}
+			cont2.Destroy()
 
-		// Restore into another container!
-		// Delete and recreate file before restoring.
-		if err := os.Remove(outputPath); err != nil {
-			t.Fatalf("error removing file")
-		}
-		outputFile3, err := createWriteableOutputFile(outputPath)
-		if err != nil {
-			t.Fatalf("error creating output file: %v", err)
-		}
-		defer outputFile3.Close()
+			// Restore into another container!
+			// Delete and recreate file before restoring.
+			if err := os.Remove(outputPath); err != nil {
+				t.Fatalf("error removing file")
+			}
+			outputFile3, err := createWriteableOutputFile(outputPath)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile3.Close()
 
-		// Restore into a new container.
-		args3 := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont3, err := New(conf, args3)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont3.Destroy()
+			// Restore into a new container.
+			args3 := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont3, err := New(conf, args3)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont3.Destroy()
 
-		if err := cont3.Restore(spec, conf, imagePath); err != nil {
-			t.Fatalf("error restoring container: %v", err)
-		}
+			if err := cont3.Restore(spec, conf, imagePath); err != nil {
+				t.Fatalf("error restoring container: %v", err)
+			}
 
-		// Wait until application has ran.
-		if err := waitForFileNotEmpty(outputFile3); err != nil {
-			t.Fatalf("Failed to wait for output file: %v", err)
-		}
+			// Wait until application has ran.
+			if err := waitForFileNotEmpty(outputFile3); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
 
-		firstNum2, err := readOutputNum(outputPath, 0)
-		if err != nil {
-			t.Fatalf("error with outputFile: %v", err)
-		}
+			firstNum2, err := readOutputNum(outputPath, 0)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
 
-		// Check that lastNum is one less than firstNum and that the container picks
-		// up from where it left off.
-		if lastNum+1 != firstNum2 {
-			t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2)
-		}
-		cont3.Destroy()
+			// Check that lastNum is one less than firstNum and that the container picks
+			// up from where it left off.
+			if lastNum+1 != firstNum2 {
+				t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2)
+			}
+			cont3.Destroy()
+		})
 	}
 }
 
@@ -914,135 +917,134 @@ func TestCheckpointRestore(t *testing.T) {
 // with filesystem Unix Domain Socket use.
 func TestUnixDomainSockets(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
-	for _, conf := range configs(t, noOverlay...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		// UDS path is limited to 108 chars for compatibility with older systems.
-		// Use '/tmp' (instead of testutil.TmpDir) to ensure the size limit is
-		// not exceeded. Assumes '/tmp' exists in the system.
-		dir, err := ioutil.TempDir("/tmp", "uds-test")
-		if err != nil {
-			t.Fatalf("ioutil.TempDir failed: %v", err)
-		}
-		defer os.RemoveAll(dir)
+	for name, conf := range configs(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
+			// UDS path is limited to 108 chars for compatibility with older systems.
+			// Use '/tmp' (instead of testutil.TmpDir) to ensure the size limit is
+			// not exceeded. Assumes '/tmp' exists in the system.
+			dir, err := ioutil.TempDir("/tmp", "uds-test")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir failed: %v", err)
+			}
+			defer os.RemoveAll(dir)
 
-		outputPath := filepath.Join(dir, "uds_output")
-		outputFile, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666)
-		if err != nil {
-			t.Fatalf("error creating output file: %v", err)
-		}
-		defer outputFile.Close()
+			outputPath := filepath.Join(dir, "uds_output")
+			outputFile, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile.Close()
 
-		app, err := testutil.FindFile("runsc/container/test_app/test_app")
-		if err != nil {
-			t.Fatal("error finding test_app:", err)
-		}
+			app, err := testutil.FindFile("test/cmd/test_app/test_app")
+			if err != nil {
+				t.Fatal("error finding test_app:", err)
+			}
 
-		socketPath := filepath.Join(dir, "uds_socket")
-		defer os.Remove(socketPath)
+			socketPath := filepath.Join(dir, "uds_socket")
+			defer os.Remove(socketPath)
 
-		spec := testutil.NewSpecWithArgs(app, "uds", "--file", outputPath, "--socket", socketPath)
-		spec.Process.User = specs.User{
-			UID: uint32(os.Getuid()),
-			GID: uint32(os.Getgid()),
-		}
-		spec.Mounts = []specs.Mount{{
-			Type:        "bind",
-			Destination: dir,
-			Source:      dir,
-		}}
+			spec := testutil.NewSpecWithArgs(app, "uds", "--file", outputPath, "--socket", socketPath)
+			spec.Process.User = specs.User{
+				UID: uint32(os.Getuid()),
+				GID: uint32(os.Getgid()),
+			}
+			spec.Mounts = []specs.Mount{{
+				Type:        "bind",
+				Destination: dir,
+				Source:      dir,
+			}}
 
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create and start the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont.Destroy()
-		if err := cont.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// Set the image path, the location where the checkpoint image will be saved.
-		imagePath := filepath.Join(dir, "test-image-file")
+			// Set the image path, the location where the checkpoint image will be saved.
+			imagePath := filepath.Join(dir, "test-image-file")
 
-		// Create the image file and open for writing.
-		file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
-		if err != nil {
-			t.Fatalf("error opening new file at imagePath: %v", err)
-		}
-		defer file.Close()
-		defer os.RemoveAll(imagePath)
+			// Create the image file and open for writing.
+			file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
+			if err != nil {
+				t.Fatalf("error opening new file at imagePath: %v", err)
+			}
+			defer file.Close()
+			defer os.RemoveAll(imagePath)
 
-		// Wait until application has ran.
-		if err := waitForFileNotEmpty(outputFile); err != nil {
-			t.Fatalf("Failed to wait for output file: %v", err)
-		}
+			// Wait until application has ran.
+			if err := waitForFileNotEmpty(outputFile); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
 
-		// Checkpoint running container; save state into new file.
-		if err := cont.Checkpoint(file); err != nil {
-			t.Fatalf("error checkpointing container to empty file: %v", err)
-		}
+			// Checkpoint running container; save state into new file.
+			if err := cont.Checkpoint(file); err != nil {
+				t.Fatalf("error checkpointing container to empty file: %v", err)
+			}
 
-		// Read last number outputted before checkpoint.
-		lastNum, err := readOutputNum(outputPath, -1)
-		if err != nil {
-			t.Fatalf("error with outputFile: %v", err)
-		}
+			// Read last number outputted before checkpoint.
+			lastNum, err := readOutputNum(outputPath, -1)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
 
-		// Delete and recreate file before restoring.
-		if err := os.Remove(outputPath); err != nil {
-			t.Fatalf("error removing file")
-		}
-		outputFile2, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666)
-		if err != nil {
-			t.Fatalf("error creating output file: %v", err)
-		}
-		defer outputFile2.Close()
+			// Delete and recreate file before restoring.
+			if err := os.Remove(outputPath); err != nil {
+				t.Fatalf("error removing file")
+			}
+			outputFile2, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile2.Close()
 
-		// Restore into a new container.
-		argsRestore := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		contRestore, err := New(conf, argsRestore)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer contRestore.Destroy()
+			// Restore into a new container.
+			argsRestore := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			contRestore, err := New(conf, argsRestore)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer contRestore.Destroy()
 
-		if err := contRestore.Restore(spec, conf, imagePath); err != nil {
-			t.Fatalf("error restoring container: %v", err)
-		}
+			if err := contRestore.Restore(spec, conf, imagePath); err != nil {
+				t.Fatalf("error restoring container: %v", err)
+			}
 
-		// Wait until application has ran.
-		if err := waitForFileNotEmpty(outputFile2); err != nil {
-			t.Fatalf("Failed to wait for output file: %v", err)
-		}
+			// Wait until application has ran.
+			if err := waitForFileNotEmpty(outputFile2); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
 
-		// Read first number outputted after restore.
-		firstNum, err := readOutputNum(outputPath, 0)
-		if err != nil {
-			t.Fatalf("error with outputFile: %v", err)
-		}
+			// Read first number outputted after restore.
+			firstNum, err := readOutputNum(outputPath, 0)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
 
-		// Check that lastNum is one less than firstNum.
-		if lastNum+1 != firstNum {
-			t.Errorf("error numbers not consecutive, previous: %d, next: %d", lastNum, firstNum)
-		}
-		contRestore.Destroy()
+			// Check that lastNum is one less than firstNum.
+			if lastNum+1 != firstNum {
+				t.Errorf("error numbers not consecutive, previous: %d, next: %d", lastNum, firstNum)
+			}
+			contRestore.Destroy()
+		})
 	}
 }
 
@@ -1052,10 +1054,8 @@ func TestUnixDomainSockets(t *testing.T) {
 // recreated. Then it resumes the container, verify that the file gets created
 // again.
 func TestPauseResume(t *testing.T) {
-	for _, conf := range configs(t, noOverlay...) {
-		t.Run(fmt.Sprintf("conf: %+v", conf), func(t *testing.T) {
-			t.Logf("Running test with conf: %+v", conf)
-
+	for name, conf := range configs(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
 			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "lock")
 			if err != nil {
 				t.Fatalf("error creating temp dir: %v", err)
@@ -1066,16 +1066,15 @@ func TestPauseResume(t *testing.T) {
 			script := fmt.Sprintf("while [[ true ]]; do touch %q; sleep 0.1; done", running)
 			spec := testutil.NewSpecWithArgs("/bin/bash", "-c", script)
 
-			rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 			if err != nil {
 				t.Fatalf("error setting up container: %v", err)
 			}
-			defer os.RemoveAll(rootDir)
-			defer os.RemoveAll(bundleDir)
+			defer cleanup()
 
 			// Create and start the container.
 			args := Args{
-				ID:        testutil.UniqueContainerID(),
+				ID:        testutil.RandomContainerID(),
 				Spec:      spec,
 				BundleDir: bundleDir,
 			}
@@ -1134,16 +1133,15 @@ func TestPauseResume(t *testing.T) {
 func TestPauseResumeStatus(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("sleep", "20")
 	conf := testutil.TestConfig(t)
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create and start the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
@@ -1199,359 +1197,356 @@ func TestCapabilities(t *testing.T) {
 	uid := auth.KUID(os.Getuid() + 1)
 	gid := auth.KGID(os.Getgid() + 1)
 
-	for _, conf := range configs(t, all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		spec := testutil.NewSpecWithArgs("sleep", "100")
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			spec := testutil.NewSpecWithArgs("sleep", "100")
+			rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create and start the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont.Destroy()
-		if err := cont.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// expectedPL lists the expected process state of the container.
-		expectedPL := []*control.Process{
-			{
-				UID:     0,
-				PID:     1,
-				PPID:    0,
-				C:       0,
-				Cmd:     "sleep",
-				Threads: []kernel.ThreadID{1},
-			},
-			{
-				UID:     uid,
-				PID:     2,
-				PPID:    0,
-				C:       0,
-				Cmd:     "exe",
-				Threads: []kernel.ThreadID{2},
-			},
-		}
-		if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
-			t.Fatalf("Failed to wait for sleep to start, err: %v", err)
-		}
+			// expectedPL lists the expected process state of the container.
+			expectedPL := []*control.Process{
+				{
+					UID:     0,
+					PID:     1,
+					PPID:    0,
+					C:       0,
+					Cmd:     "sleep",
+					Threads: []kernel.ThreadID{1},
+				},
+				{
+					UID:     uid,
+					PID:     2,
+					PPID:    0,
+					C:       0,
+					Cmd:     "exe",
+					Threads: []kernel.ThreadID{2},
+				},
+			}
+			if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
+				t.Fatalf("Failed to wait for sleep to start, err: %v", err)
+			}
 
-		// Create an executable that can't be run with the specified UID:GID.
-		// This shouldn't be callable within the container until we add the
-		// CAP_DAC_OVERRIDE capability to skip the access check.
-		exePath := filepath.Join(rootDir, "exe")
-		if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil {
-			t.Fatalf("couldn't create executable: %v", err)
-		}
-		defer os.Remove(exePath)
-
-		// Need to traverse the intermediate directory.
-		os.Chmod(rootDir, 0755)
-
-		execArgs := &control.ExecArgs{
-			Filename:         exePath,
-			Argv:             []string{exePath},
-			WorkingDirectory: "/",
-			KUID:             uid,
-			KGID:             gid,
-			Capabilities:     &auth.TaskCapabilities{},
-		}
+			// Create an executable that can't be run with the specified UID:GID.
+			// This shouldn't be callable within the container until we add the
+			// CAP_DAC_OVERRIDE capability to skip the access check.
+			exePath := filepath.Join(rootDir, "exe")
+			if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil {
+				t.Fatalf("couldn't create executable: %v", err)
+			}
+			defer os.Remove(exePath)
+
+			// Need to traverse the intermediate directory.
+			os.Chmod(rootDir, 0755)
+
+			execArgs := &control.ExecArgs{
+				Filename:         exePath,
+				Argv:             []string{exePath},
+				WorkingDirectory: "/",
+				KUID:             uid,
+				KGID:             gid,
+				Capabilities:     &auth.TaskCapabilities{},
+			}
 
-		// "exe" should fail because we don't have the necessary permissions.
-		if _, err := cont.executeSync(execArgs); err == nil {
-			t.Fatalf("container executed without error, but an error was expected")
-		}
+			// "exe" should fail because we don't have the necessary permissions.
+			if _, err := cont.executeSync(execArgs); err == nil {
+				t.Fatalf("container executed without error, but an error was expected")
+			}
 
-		// Now we run with the capability enabled and should succeed.
-		execArgs.Capabilities = &auth.TaskCapabilities{
-			EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
-		}
-		// "exe" should not fail this time.
-		if _, err := cont.executeSync(execArgs); err != nil {
-			t.Fatalf("container failed to exec %v: %v", args, err)
-		}
+			// Now we run with the capability enabled and should succeed.
+			execArgs.Capabilities = &auth.TaskCapabilities{
+				EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+			}
+			// "exe" should not fail this time.
+			if _, err := cont.executeSync(execArgs); err != nil {
+				t.Fatalf("container failed to exec %v: %v", args, err)
+			}
+		})
 	}
 }
 
 // TestRunNonRoot checks that sandbox can be configured when running as
 // non-privileged user.
 func TestRunNonRoot(t *testing.T) {
-	for _, conf := range configs(t, noOverlay...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		spec := testutil.NewSpecWithArgs("/bin/true")
-
-		// Set a random user/group with no access to "blocked" dir.
-		spec.Process.User.UID = 343
-		spec.Process.User.GID = 2401
-		spec.Process.Capabilities = nil
+	for name, conf := range configs(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
+			spec := testutil.NewSpecWithArgs("/bin/true")
+
+			// Set a random user/group with no access to "blocked" dir.
+			spec.Process.User.UID = 343
+			spec.Process.User.GID = 2401
+			spec.Process.Capabilities = nil
+
+			// User running inside container can't list '$TMP/blocked' and would fail to
+			// mount it.
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "blocked")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			if err := os.Chmod(dir, 0700); err != nil {
+				t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
+			}
+			dir = path.Join(dir, "test")
+			if err := os.Mkdir(dir, 0755); err != nil {
+				t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
+			}
 
-		// User running inside container can't list '$TMP/blocked' and would fail to
-		// mount it.
-		dir, err := ioutil.TempDir(testutil.TmpDir(), "blocked")
-		if err != nil {
-			t.Fatalf("ioutil.TempDir() failed: %v", err)
-		}
-		if err := os.Chmod(dir, 0700); err != nil {
-			t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
-		}
-		dir = path.Join(dir, "test")
-		if err := os.Mkdir(dir, 0755); err != nil {
-			t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
-		}
+			src, err := ioutil.TempDir(testutil.TmpDir(), "src")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
 
-		src, err := ioutil.TempDir(testutil.TmpDir(), "src")
-		if err != nil {
-			t.Fatalf("ioutil.TempDir() failed: %v", err)
-		}
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: dir,
+				Source:      src,
+				Type:        "bind",
+			})
 
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Destination: dir,
-			Source:      src,
-			Type:        "bind",
+			if err := run(spec, conf); err != nil {
+				t.Fatalf("error running sandbox: %v", err)
+			}
 		})
-
-		if err := run(spec, conf); err != nil {
-			t.Fatalf("error running sandbox: %v", err)
-		}
 	}
 }
 
 // TestMountNewDir checks that runsc will create destination directory if it
 // doesn't exit.
 func TestMountNewDir(t *testing.T) {
-	for _, conf := range configs(t, overlay) {
-		t.Logf("Running test with conf: %+v", conf)
+	for name, conf := range configs(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			root, err := ioutil.TempDir(testutil.TmpDir(), "root")
+			if err != nil {
+				t.Fatal("ioutil.TempDir() failed:", err)
+			}
 
-		root, err := ioutil.TempDir(testutil.TmpDir(), "root")
-		if err != nil {
-			t.Fatal("ioutil.TempDir() failed:", err)
-		}
+			srcDir := path.Join(root, "src", "dir", "anotherdir")
+			if err := os.MkdirAll(srcDir, 0755); err != nil {
+				t.Fatalf("os.MkDir(%q) failed: %v", srcDir, err)
+			}
 
-		srcDir := path.Join(root, "src", "dir", "anotherdir")
-		if err := os.MkdirAll(srcDir, 0755); err != nil {
-			t.Fatalf("os.MkDir(%q) failed: %v", srcDir, err)
-		}
+			mountDir := path.Join(root, "dir", "anotherdir")
 
-		mountDir := path.Join(root, "dir", "anotherdir")
+			spec := testutil.NewSpecWithArgs("/bin/ls", mountDir)
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: mountDir,
+				Source:      srcDir,
+				Type:        "bind",
+			})
 
-		spec := testutil.NewSpecWithArgs("/bin/ls", mountDir)
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Destination: mountDir,
-			Source:      srcDir,
-			Type:        "bind",
+			if err := run(spec, conf); err != nil {
+				t.Fatalf("error running sandbox: %v", err)
+			}
 		})
-
-		if err := run(spec, conf); err != nil {
-			t.Fatalf("error running sandbox: %v", err)
-		}
 	}
 }
 
 func TestReadonlyRoot(t *testing.T) {
-	for _, conf := range configs(t, overlay) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		spec := testutil.NewSpecWithArgs("/bin/touch", "/foo")
-		spec.Root.Readonly = true
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+	for name, conf := range configs(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			spec := testutil.NewSpecWithArgs("/bin/touch", "/foo")
+			spec.Root.Readonly = true
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create, start and wait for the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		c, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer c.Destroy()
-		if err := c.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create, start and wait for the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		ws, err := c.Wait()
-		if err != nil {
-			t.Fatalf("error waiting on container: %v", err)
-		}
-		if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
-			t.Fatalf("container failed, waitStatus: %v", ws)
-		}
+			ws, err := c.Wait()
+			if err != nil {
+				t.Fatalf("error waiting on container: %v", err)
+			}
+			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
+				t.Fatalf("container failed, waitStatus: %v", ws)
+			}
+		})
 	}
 }
 
 func TestUIDMap(t *testing.T) {
-	for _, conf := range configs(t, noOverlay...) {
-		t.Logf("Running test with conf: %+v", conf)
-		testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount")
-		if err != nil {
-			t.Fatal(err)
-		}
-		defer os.RemoveAll(testDir)
-		testFile := path.Join(testDir, "testfile")
-
-		spec := testutil.NewSpecWithArgs("touch", "/tmp/testfile")
-		uid := os.Getuid()
-		gid := os.Getgid()
-		spec.Linux = &specs.Linux{
-			Namespaces: []specs.LinuxNamespace{
-				{Type: specs.UserNamespace},
-				{Type: specs.PIDNamespace},
-				{Type: specs.MountNamespace},
-			},
-			UIDMappings: []specs.LinuxIDMapping{
-				{
-					ContainerID: 0,
-					HostID:      uint32(uid),
-					Size:        1,
+	for name, conf := range configs(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
+			testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			defer os.RemoveAll(testDir)
+			testFile := path.Join(testDir, "testfile")
+
+			spec := testutil.NewSpecWithArgs("touch", "/tmp/testfile")
+			uid := os.Getuid()
+			gid := os.Getgid()
+			spec.Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{
+					{Type: specs.UserNamespace},
+					{Type: specs.PIDNamespace},
+					{Type: specs.MountNamespace},
 				},
-			},
-			GIDMappings: []specs.LinuxIDMapping{
-				{
-					ContainerID: 0,
-					HostID:      uint32(gid),
-					Size:        1,
+				UIDMappings: []specs.LinuxIDMapping{
+					{
+						ContainerID: 0,
+						HostID:      uint32(uid),
+						Size:        1,
+					},
 				},
-			},
-		}
+				GIDMappings: []specs.LinuxIDMapping{
+					{
+						ContainerID: 0,
+						HostID:      uint32(gid),
+						Size:        1,
+					},
+				},
+			}
 
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Destination: "/tmp",
-			Source:      testDir,
-			Type:        "bind",
-		})
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: "/tmp",
+				Source:      testDir,
+				Type:        "bind",
+			})
 
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create, start and wait for the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		c, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer c.Destroy()
-		if err := c.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create, start and wait for the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		ws, err := c.Wait()
-		if err != nil {
-			t.Fatalf("error waiting on container: %v", err)
-		}
-		if !ws.Exited() || ws.ExitStatus() != 0 {
-			t.Fatalf("container failed, waitStatus: %v", ws)
-		}
-		st := syscall.Stat_t{}
-		if err := syscall.Stat(testFile, &st); err != nil {
-			t.Fatalf("error stat /testfile: %v", err)
-		}
+			ws, err := c.Wait()
+			if err != nil {
+				t.Fatalf("error waiting on container: %v", err)
+			}
+			if !ws.Exited() || ws.ExitStatus() != 0 {
+				t.Fatalf("container failed, waitStatus: %v", ws)
+			}
+			st := syscall.Stat_t{}
+			if err := syscall.Stat(testFile, &st); err != nil {
+				t.Fatalf("error stat /testfile: %v", err)
+			}
 
-		if st.Uid != uint32(uid) || st.Gid != uint32(gid) {
-			t.Fatalf("UID: %d (%d) GID: %d (%d)", st.Uid, uid, st.Gid, gid)
-		}
+			if st.Uid != uint32(uid) || st.Gid != uint32(gid) {
+				t.Fatalf("UID: %d (%d) GID: %d (%d)", st.Uid, uid, st.Gid, gid)
+			}
+		})
 	}
 }
 
 func TestReadonlyMount(t *testing.T) {
-	for _, conf := range configs(t, overlay) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
-		spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
-		if err != nil {
-			t.Fatalf("ioutil.TempDir() failed: %v", err)
-		}
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Destination: dir,
-			Source:      dir,
-			Type:        "bind",
-			Options:     []string{"ro"},
-		})
-		spec.Root.Readonly = false
-
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+	for name, conf := range configs(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
+			spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: dir,
+				Source:      dir,
+				Type:        "bind",
+				Options:     []string{"ro"},
+			})
+			spec.Root.Readonly = false
+
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create, start and wait for the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		c, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer c.Destroy()
-		if err := c.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create, start and wait for the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		ws, err := c.Wait()
-		if err != nil {
-			t.Fatalf("error waiting on container: %v", err)
-		}
-		if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
-			t.Fatalf("container failed, waitStatus: %v", ws)
-		}
+			ws, err := c.Wait()
+			if err != nil {
+				t.Fatalf("error waiting on container: %v", err)
+			}
+			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
+				t.Fatalf("container failed, waitStatus: %v", ws)
+			}
+		})
 	}
 }
 
 // TestAbbreviatedIDs checks that runsc supports using abbreviated container
 // IDs in place of full IDs.
 func TestAbbreviatedIDs(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	cids := []string{
-		"foo-" + testutil.UniqueContainerID(),
-		"bar-" + testutil.UniqueContainerID(),
-		"baz-" + testutil.UniqueContainerID(),
+		"foo-" + testutil.RandomContainerID(),
+		"bar-" + testutil.RandomContainerID(),
+		"baz-" + testutil.RandomContainerID(),
 	}
 	for _, cid := range cids {
 		spec := testutil.NewSpecWithArgs("sleep", "100")
-		bundleDir, err := testutil.SetupBundleDir(spec)
+		bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
 		if err != nil {
 			t.Fatalf("error setting up container: %v", err)
 		}
-		defer os.RemoveAll(bundleDir)
+		defer cleanup()
 
 		// Create and start the container.
 		args := Args{
@@ -1596,16 +1591,15 @@ func TestAbbreviatedIDs(t *testing.T) {
 func TestGoferExits(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
 	conf := testutil.TestConfig(t)
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create and start the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
@@ -1634,7 +1628,7 @@ func TestGoferExits(t *testing.T) {
 }
 
 func TestRootNotMount(t *testing.T) {
-	appSym, err := testutil.FindFile("runsc/container/test_app/test_app")
+	appSym, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
@@ -1671,7 +1665,7 @@ func TestRootNotMount(t *testing.T) {
 }
 
 func TestUserLog(t *testing.T) {
-	app, err := testutil.FindFile("runsc/container/test_app/test_app")
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
@@ -1679,12 +1673,11 @@ func TestUserLog(t *testing.T) {
 	// sched_rr_get_interval = 148 - not implemented in gvisor.
 	spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall=148")
 	conf := testutil.TestConfig(t)
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	dir, err := ioutil.TempDir(testutil.TmpDir(), "user_log_test")
 	if err != nil {
@@ -1694,7 +1687,7 @@ func TestUserLog(t *testing.T) {
 
 	// Create, start and wait for the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 		UserLog:   userLog,
@@ -1718,72 +1711,70 @@ func TestUserLog(t *testing.T) {
 }
 
 func TestWaitOnExitedSandbox(t *testing.T) {
-	for _, conf := range configs(t, all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		// Run a shell that sleeps for 1 second and then exits with a
-		// non-zero code.
-		const wantExit = 17
-		cmd := fmt.Sprintf("sleep 1; exit %d", wantExit)
-		spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd)
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			// Run a shell that sleeps for 1 second and then exits with a
+			// non-zero code.
+			const wantExit = 17
+			cmd := fmt.Sprintf("sleep 1; exit %d", wantExit)
+			spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create and Start the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		c, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer c.Destroy()
-		if err := c.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create and Start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// Wait on the sandbox. This will make an RPC to the sandbox
-		// and get the actual exit status of the application.
-		ws, err := c.Wait()
-		if err != nil {
-			t.Fatalf("error waiting on container: %v", err)
-		}
-		if got := ws.ExitStatus(); got != wantExit {
-			t.Errorf("got exit status %d, want %d", got, wantExit)
-		}
+			// Wait on the sandbox. This will make an RPC to the sandbox
+			// and get the actual exit status of the application.
+			ws, err := c.Wait()
+			if err != nil {
+				t.Fatalf("error waiting on container: %v", err)
+			}
+			if got := ws.ExitStatus(); got != wantExit {
+				t.Errorf("got exit status %d, want %d", got, wantExit)
+			}
 
-		// Now the sandbox has exited, but the zombie sandbox process
-		// still exists. Calling Wait() now will return the sandbox
-		// exit status.
-		ws, err = c.Wait()
-		if err != nil {
-			t.Fatalf("error waiting on container: %v", err)
-		}
-		if got := ws.ExitStatus(); got != wantExit {
-			t.Errorf("got exit status %d, want %d", got, wantExit)
-		}
+			// Now the sandbox has exited, but the zombie sandbox process
+			// still exists. Calling Wait() now will return the sandbox
+			// exit status.
+			ws, err = c.Wait()
+			if err != nil {
+				t.Fatalf("error waiting on container: %v", err)
+			}
+			if got := ws.ExitStatus(); got != wantExit {
+				t.Errorf("got exit status %d, want %d", got, wantExit)
+			}
+		})
 	}
 }
 
 func TestDestroyNotStarted(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "100")
 	conf := testutil.TestConfig(t)
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create the container and check that it can be destroyed.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
@@ -1801,16 +1792,15 @@ func TestDestroyStarting(t *testing.T) {
 	for i := 0; i < 10; i++ {
 		spec := testutil.NewSpecWithArgs("/bin/sleep", "100")
 		conf := testutil.TestConfig(t)
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+		rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 		if err != nil {
 			t.Fatalf("error setting up container: %v", err)
 		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+		defer cleanup()
 
 		// Create the container and check that it can be destroyed.
 		args := Args{
-			ID:        testutil.UniqueContainerID(),
+			ID:        testutil.RandomContainerID(),
 			Spec:      spec,
 			BundleDir: bundleDir,
 		}
@@ -1845,23 +1835,23 @@ func TestDestroyStarting(t *testing.T) {
 }
 
 func TestCreateWorkingDir(t *testing.T) {
-	for _, conf := range configs(t, overlay) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create")
-		if err != nil {
-			t.Fatalf("ioutil.TempDir() failed: %v", err)
-		}
-		dir := path.Join(tmpDir, "new/working/dir")
+	for name, conf := range configs(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			dir := path.Join(tmpDir, "new/working/dir")
 
-		// touch will fail if the directory doesn't exist.
-		spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
-		spec.Process.Cwd = dir
-		spec.Root.Readonly = true
+			// touch will fail if the directory doesn't exist.
+			spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
+			spec.Process.Cwd = dir
+			spec.Root.Readonly = true
 
-		if err := run(spec, conf); err != nil {
-			t.Fatalf("Error running container: %v", err)
-		}
+			if err := run(spec, conf); err != nil {
+				t.Fatalf("Error running container: %v", err)
+			}
+		})
 	}
 }
 
@@ -1919,15 +1909,14 @@ func TestMountPropagation(t *testing.T) {
 	}
 
 	conf := testutil.TestConfig(t)
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
@@ -1969,81 +1958,81 @@ func TestMountPropagation(t *testing.T) {
 }
 
 func TestMountSymlink(t *testing.T) {
-	for _, conf := range configs(t, overlay) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink")
-		if err != nil {
-			t.Fatalf("ioutil.TempDir() failed: %v", err)
-		}
+	for name, conf := range configs(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			defer os.RemoveAll(dir)
 
-		source := path.Join(dir, "source")
-		target := path.Join(dir, "target")
-		for _, path := range []string{source, target} {
-			if err := os.MkdirAll(path, 0777); err != nil {
-				t.Fatalf("os.MkdirAll(): %v", err)
+			source := path.Join(dir, "source")
+			target := path.Join(dir, "target")
+			for _, path := range []string{source, target} {
+				if err := os.MkdirAll(path, 0777); err != nil {
+					t.Fatalf("os.MkdirAll(): %v", err)
+				}
 			}
-		}
-		f, err := os.Create(path.Join(source, "file"))
-		if err != nil {
-			t.Fatalf("os.Create(): %v", err)
-		}
-		f.Close()
+			f, err := os.Create(path.Join(source, "file"))
+			if err != nil {
+				t.Fatalf("os.Create(): %v", err)
+			}
+			f.Close()
 
-		link := path.Join(dir, "link")
-		if err := os.Symlink(target, link); err != nil {
-			t.Fatalf("os.Symlink(%q, %q): %v", target, link, err)
-		}
+			link := path.Join(dir, "link")
+			if err := os.Symlink(target, link); err != nil {
+				t.Fatalf("os.Symlink(%q, %q): %v", target, link, err)
+			}
 
-		spec := testutil.NewSpecWithArgs("/bin/sleep", "1000")
+			spec := testutil.NewSpecWithArgs("/bin/sleep", "1000")
 
-		// Mount to a symlink to ensure the mount code will follow it and mount
-		// at the symlink target.
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Type:        "bind",
-			Destination: link,
-			Source:      source,
-		})
+			// Mount to a symlink to ensure the mount code will follow it and mount
+			// at the symlink target.
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Type:        "bind",
+				Destination: link,
+				Source:      source,
+			})
 
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("creating container: %v", err)
-		}
-		defer cont.Destroy()
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("creating container: %v", err)
+			}
+			defer cont.Destroy()
 
-		if err := cont.Start(conf); err != nil {
-			t.Fatalf("starting container: %v", err)
-		}
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("starting container: %v", err)
+			}
 
-		// Check that symlink was resolved and mount was created where the symlink
-		// is pointing to.
-		file := path.Join(target, "file")
-		execArgs := &control.ExecArgs{
-			Filename: "/usr/bin/test",
-			Argv:     []string{"test", "-f", file},
-		}
-		if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
-			t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err)
-		}
+			// Check that symlink was resolved and mount was created where the symlink
+			// is pointing to.
+			file := path.Join(target, "file")
+			execArgs := &control.ExecArgs{
+				Filename: "/usr/bin/test",
+				Argv:     []string{"test", "-f", file},
+			}
+			if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+				t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err)
+			}
+		})
 	}
 }
 
 // Check that --net-raw disables the CAP_NET_RAW capability.
 func TestNetRaw(t *testing.T) {
 	capNetRaw := strconv.FormatUint(bits.MaskOf64(int(linux.CAP_NET_RAW)), 10)
-	app, err := testutil.FindFile("runsc/container/test_app/test_app")
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
@@ -2106,7 +2095,7 @@ func TestTTYField(t *testing.T) {
 	stop := testutil.StartReaper()
 	defer stop()
 
-	testApp, err := testutil.FindFile("runsc/container/test_app/test_app")
+	testApp, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
@@ -2140,16 +2129,15 @@ func TestTTYField(t *testing.T) {
 			}
 
 			spec := testutil.NewSpecWithArgs(cmd...)
-			rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 			if err != nil {
 				t.Fatalf("error setting up container: %v", err)
 			}
-			defer os.RemoveAll(rootDir)
-			defer os.RemoveAll(bundleDir)
+			defer cleanup()
 
 			// Create and start the container.
 			args := Args{
-				ID:        testutil.UniqueContainerID(),
+				ID:        testutil.RandomContainerID(),
 				Spec:      spec,
 				BundleDir: bundleDir,
 			}
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index dc2fb42ce..e3704b453 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -30,15 +30,15 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 	var specs []*specs.Spec
 	var ids []string
-	rootID := testutil.UniqueContainerID()
+	rootID := testutil.RandomContainerID()
 
 	for i, cmd := range cmds {
 		spec := testutil.NewSpecWithArgs(cmd...)
@@ -52,7 +52,7 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 				specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer,
 				specutils.ContainerdSandboxIDAnnotation:     rootID,
 			}
-			ids = append(ids, testutil.UniqueContainerID())
+			ids = append(ids, testutil.RandomContainerID())
 		}
 		specs = append(specs, spec)
 	}
@@ -64,23 +64,29 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
 		panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
 	}
 
-	var containers []*Container
-	var bundles []string
-	cleanup := func() {
+	var (
+		containers []*Container
+		cleanups   []func()
+	)
+	cleanups = append(cleanups, func() {
 		for _, c := range containers {
 			c.Destroy()
 		}
-		for _, b := range bundles {
-			os.RemoveAll(b)
+	})
+	cleanupAll := func() {
+		for _, c := range cleanups {
+			c()
 		}
 	}
+	localClean := specutils.MakeCleanup(cleanupAll)
+	defer localClean.Clean()
+
 	for i, spec := range specs {
-		bundleDir, err := testutil.SetupBundleDir(spec)
+		bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
 		if err != nil {
-			cleanup()
 			return nil, nil, fmt.Errorf("error setting up container: %v", err)
 		}
-		bundles = append(bundles, bundleDir)
+		cleanups = append(cleanups, cleanup)
 
 		args := Args{
 			ID:        ids[i],
@@ -89,17 +95,17 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
 		}
 		cont, err := New(conf, args)
 		if err != nil {
-			cleanup()
 			return nil, nil, fmt.Errorf("error creating container: %v", err)
 		}
 		containers = append(containers, cont)
 
 		if err := cont.Start(conf); err != nil {
-			cleanup()
 			return nil, nil, fmt.Errorf("error starting container: %v", err)
 		}
 	}
-	return containers, cleanup, nil
+
+	localClean.Release()
+	return containers, cleanupAll, nil
 }
 
 type execDesc struct {
@@ -135,159 +141,159 @@ func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
 // TestMultiContainerSanity checks that it is possible to run 2 dead-simple
 // containers in the same sandbox.
 func TestMultiContainerSanity(t *testing.T) {
-	for _, conf := range configs(t, all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
 
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		specs, ids := createSpecs(sleep, sleep)
-		containers, cleanup, err := startContainers(conf, specs, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			specs, ids := createSpecs(sleep, sleep)
+			containers, cleanup, err := startContainers(conf, specs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		// Check via ps that multiple processes are running.
-		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
-		}
-		if err := waitForProcessList(containers[0], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
-		expectedPL = []*control.Process{
-			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
-		}
-		if err := waitForProcessList(containers[1], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			// Check via ps that multiple processes are running.
+			expectedPL := []*control.Process{
+				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+			}
+			if err := waitForProcessList(containers[0], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+			expectedPL = []*control.Process{
+				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+			}
+			if err := waitForProcessList(containers[1], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+		})
 	}
 }
 
 // TestMultiPIDNS checks that it is possible to run 2 dead-simple
 // containers in the same sandbox with different pidns.
 func TestMultiPIDNS(t *testing.T) {
-	for _, conf := range configs(t, all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
-
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		testSpecs, ids := createSpecs(sleep, sleep)
-		testSpecs[1].Linux = &specs.Linux{
-			Namespaces: []specs.LinuxNamespace{
-				{
-					Type: "pid",
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			testSpecs, ids := createSpecs(sleep, sleep)
+			testSpecs[1].Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{
+					{
+						Type: "pid",
+					},
 				},
-			},
-		}
+			}
 
-		containers, cleanup, err := startContainers(conf, testSpecs, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			containers, cleanup, err := startContainers(conf, testSpecs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		// Check via ps that multiple processes are running.
-		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
-		}
-		if err := waitForProcessList(containers[0], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
-		expectedPL = []*control.Process{
-			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
-		}
-		if err := waitForProcessList(containers[1], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			// Check via ps that multiple processes are running.
+			expectedPL := []*control.Process{
+				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+			}
+			if err := waitForProcessList(containers[0], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+			expectedPL = []*control.Process{
+				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+			}
+			if err := waitForProcessList(containers[1], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+		})
 	}
 }
 
 // TestMultiPIDNSPath checks the pidns path.
 func TestMultiPIDNSPath(t *testing.T) {
-	for _, conf := range configs(t, all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
-
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		testSpecs, ids := createSpecs(sleep, sleep, sleep)
-		testSpecs[0].Linux = &specs.Linux{
-			Namespaces: []specs.LinuxNamespace{
-				{
-					Type: "pid",
-					Path: "/proc/1/ns/pid",
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			testSpecs, ids := createSpecs(sleep, sleep, sleep)
+			testSpecs[0].Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{
+					{
+						Type: "pid",
+						Path: "/proc/1/ns/pid",
+					},
 				},
-			},
-		}
-		testSpecs[1].Linux = &specs.Linux{
-			Namespaces: []specs.LinuxNamespace{
-				{
-					Type: "pid",
-					Path: "/proc/1/ns/pid",
+			}
+			testSpecs[1].Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{
+					{
+						Type: "pid",
+						Path: "/proc/1/ns/pid",
+					},
 				},
-			},
-		}
-		testSpecs[2].Linux = &specs.Linux{
-			Namespaces: []specs.LinuxNamespace{
-				{
-					Type: "pid",
-					Path: "/proc/2/ns/pid",
+			}
+			testSpecs[2].Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{
+					{
+						Type: "pid",
+						Path: "/proc/2/ns/pid",
+					},
 				},
-			},
-		}
+			}
 
-		containers, cleanup, err := startContainers(conf, testSpecs, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			containers, cleanup, err := startContainers(conf, testSpecs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		// Check via ps that multiple processes are running.
-		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
-		}
-		if err := waitForProcessList(containers[0], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
-		if err := waitForProcessList(containers[2], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			// Check via ps that multiple processes are running.
+			expectedPL := []*control.Process{
+				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+			}
+			if err := waitForProcessList(containers[0], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+			if err := waitForProcessList(containers[2], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
 
-		expectedPL = []*control.Process{
-			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
-		}
-		if err := waitForProcessList(containers[1], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			expectedPL = []*control.Process{
+				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+			}
+			if err := waitForProcessList(containers[1], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+		})
 	}
 }
 
 func TestMultiContainerWait(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
@@ -361,11 +367,11 @@ func TestMultiContainerWait(t *testing.T) {
 // TestExecWait ensures what we can wait containers and individual processes in the
 // sandbox that have already exited.
 func TestExecWait(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
@@ -457,11 +463,11 @@ func TestMultiContainerMount(t *testing.T) {
 	})
 
 	// Setup the containers.
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
@@ -484,174 +490,174 @@ func TestMultiContainerMount(t *testing.T) {
 // TestMultiContainerSignal checks that it is possible to signal individual
 // containers without killing the entire sandbox.
 func TestMultiContainerSignal(t *testing.T) {
-	for _, conf := range configs(t, all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
 
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		specs, ids := createSpecs(sleep, sleep)
-		containers, cleanup, err := startContainers(conf, specs, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			specs, ids := createSpecs(sleep, sleep)
+			containers, cleanup, err := startContainers(conf, specs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		// Check via ps that container 1 process is running.
-		expectedPL := []*control.Process{
-			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
-		}
+			// Check via ps that container 1 process is running.
+			expectedPL := []*control.Process{
+				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+			}
 
-		if err := waitForProcessList(containers[1], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			if err := waitForProcessList(containers[1], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
 
-		// Kill process 2.
-		if err := containers[1].SignalContainer(syscall.SIGKILL, false); err != nil {
-			t.Errorf("failed to kill process 2: %v", err)
-		}
+			// Kill process 2.
+			if err := containers[1].SignalContainer(syscall.SIGKILL, false); err != nil {
+				t.Errorf("failed to kill process 2: %v", err)
+			}
 
-		// Make sure process 1 is still running.
-		expectedPL = []*control.Process{
-			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
-		}
-		if err := waitForProcessList(containers[0], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			// Make sure process 1 is still running.
+			expectedPL = []*control.Process{
+				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+			}
+			if err := waitForProcessList(containers[0], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
 
-		// goferPid is reset when container is destroyed.
-		goferPid := containers[1].GoferPid
+			// goferPid is reset when container is destroyed.
+			goferPid := containers[1].GoferPid
 
-		// Destroy container and ensure container's gofer process has exited.
-		if err := containers[1].Destroy(); err != nil {
-			t.Errorf("failed to destroy container: %v", err)
-		}
-		_, _, err = specutils.RetryEintr(func() (uintptr, uintptr, error) {
-			cpid, err := syscall.Wait4(goferPid, nil, 0, nil)
-			return uintptr(cpid), 0, err
-		})
-		if err != syscall.ECHILD {
-			t.Errorf("error waiting for gofer to exit: %v", err)
-		}
-		// Make sure process 1 is still running.
-		if err := waitForProcessList(containers[0], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			// Destroy container and ensure container's gofer process has exited.
+			if err := containers[1].Destroy(); err != nil {
+				t.Errorf("failed to destroy container: %v", err)
+			}
+			_, _, err = specutils.RetryEintr(func() (uintptr, uintptr, error) {
+				cpid, err := syscall.Wait4(goferPid, nil, 0, nil)
+				return uintptr(cpid), 0, err
+			})
+			if err != syscall.ECHILD {
+				t.Errorf("error waiting for gofer to exit: %v", err)
+			}
+			// Make sure process 1 is still running.
+			if err := waitForProcessList(containers[0], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
 
-		// Now that process 2 is gone, ensure we get an error trying to
-		// signal it again.
-		if err := containers[1].SignalContainer(syscall.SIGKILL, false); err == nil {
-			t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID)
-		}
+			// Now that process 2 is gone, ensure we get an error trying to
+			// signal it again.
+			if err := containers[1].SignalContainer(syscall.SIGKILL, false); err == nil {
+				t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID)
+			}
 
-		// Kill process 1.
-		if err := containers[0].SignalContainer(syscall.SIGKILL, false); err != nil {
-			t.Errorf("failed to kill process 1: %v", err)
-		}
+			// Kill process 1.
+			if err := containers[0].SignalContainer(syscall.SIGKILL, false); err != nil {
+				t.Errorf("failed to kill process 1: %v", err)
+			}
 
-		// Ensure that container's gofer and sandbox process are no more.
-		err = blockUntilWaitable(containers[0].GoferPid)
-		if err != nil && err != syscall.ECHILD {
-			t.Errorf("error waiting for gofer to exit: %v", err)
-		}
+			// Ensure that container's gofer and sandbox process are no more.
+			err = blockUntilWaitable(containers[0].GoferPid)
+			if err != nil && err != syscall.ECHILD {
+				t.Errorf("error waiting for gofer to exit: %v", err)
+			}
 
-		err = blockUntilWaitable(containers[0].Sandbox.Pid)
-		if err != nil && err != syscall.ECHILD {
-			t.Errorf("error waiting for sandbox to exit: %v", err)
-		}
+			err = blockUntilWaitable(containers[0].Sandbox.Pid)
+			if err != nil && err != syscall.ECHILD {
+				t.Errorf("error waiting for sandbox to exit: %v", err)
+			}
 
-		// The sentry should be gone, so signaling should yield an error.
-		if err := containers[0].SignalContainer(syscall.SIGKILL, false); err == nil {
-			t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID)
-		}
+			// The sentry should be gone, so signaling should yield an error.
+			if err := containers[0].SignalContainer(syscall.SIGKILL, false); err == nil {
+				t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID)
+			}
 
-		if err := containers[0].Destroy(); err != nil {
-			t.Errorf("failed to destroy container: %v", err)
-		}
+			if err := containers[0].Destroy(); err != nil {
+				t.Errorf("failed to destroy container: %v", err)
+			}
+		})
 	}
 }
 
 // TestMultiContainerDestroy checks that container are properly cleaned-up when
 // they are destroyed.
 func TestMultiContainerDestroy(t *testing.T) {
-	app, err := testutil.FindFile("runsc/container/test_app/test_app")
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
 
-	for _, conf := range configs(t, all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
-
-		// First container will remain intact while the second container is killed.
-		podSpecs, ids := createSpecs(
-			[]string{"sleep", "100"},
-			[]string{app, "fork-bomb"})
-
-		// Run the fork bomb in a PID namespace to prevent processes to be
-		// re-parented to PID=1 in the root container.
-		podSpecs[1].Linux = &specs.Linux{
-			Namespaces: []specs.LinuxNamespace{{Type: "pid"}},
-		}
-		containers, cleanup, err := startContainers(conf, podSpecs, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// First container will remain intact while the second container is killed.
+			podSpecs, ids := createSpecs(
+				[]string{"sleep", "100"},
+				[]string{app, "fork-bomb"})
+
+			// Run the fork bomb in a PID namespace to prevent processes to be
+			// re-parented to PID=1 in the root container.
+			podSpecs[1].Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{{Type: "pid"}},
+			}
+			containers, cleanup, err := startContainers(conf, podSpecs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		// Exec more processes to ensure signal all works for exec'd processes too.
-		args := &control.ExecArgs{
-			Filename: app,
-			Argv:     []string{app, "fork-bomb"},
-		}
-		if _, err := containers[1].Execute(args); err != nil {
-			t.Fatalf("error exec'ing: %v", err)
-		}
+			// Exec more processes to ensure signal all works for exec'd processes too.
+			args := &control.ExecArgs{
+				Filename: app,
+				Argv:     []string{app, "fork-bomb"},
+			}
+			if _, err := containers[1].Execute(args); err != nil {
+				t.Fatalf("error exec'ing: %v", err)
+			}
 
-		// Let it brew...
-		time.Sleep(500 * time.Millisecond)
+			// Let it brew...
+			time.Sleep(500 * time.Millisecond)
 
-		if err := containers[1].Destroy(); err != nil {
-			t.Fatalf("error destroying container: %v", err)
-		}
+			if err := containers[1].Destroy(); err != nil {
+				t.Fatalf("error destroying container: %v", err)
+			}
 
-		// Check that destroy killed all processes belonging to the container and
-		// waited for them to exit before returning.
-		pss, err := containers[0].Sandbox.Processes("")
-		if err != nil {
-			t.Fatalf("error getting process data from sandbox: %v", err)
-		}
-		expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
-		if r, err := procListsEqual(pss, expectedPL); !r {
-			t.Errorf("container got process list: %s, want: %s: error: %v",
-				procListToString(pss), procListToString(expectedPL), err)
-		}
+			// Check that destroy killed all processes belonging to the container and
+			// waited for them to exit before returning.
+			pss, err := containers[0].Sandbox.Processes("")
+			if err != nil {
+				t.Fatalf("error getting process data from sandbox: %v", err)
+			}
+			expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
+			if r, err := procListsEqual(pss, expectedPL); !r {
+				t.Errorf("container got process list: %s, want: %s: error: %v",
+					procListToString(pss), procListToString(expectedPL), err)
+			}
 
-		// Check that cont.Destroy is safe to call multiple times.
-		if err := containers[1].Destroy(); err != nil {
-			t.Errorf("error destroying container: %v", err)
-		}
+			// Check that cont.Destroy is safe to call multiple times.
+			if err := containers[1].Destroy(); err != nil {
+				t.Errorf("error destroying container: %v", err)
+			}
+		})
 	}
 }
 
 func TestMultiContainerProcesses(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
@@ -706,11 +712,11 @@ func TestMultiContainerProcesses(t *testing.T) {
 // TestMultiContainerKillAll checks that all process that belong to a container
 // are killed when SIGKILL is sent to *all* processes in that container.
 func TestMultiContainerKillAll(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
@@ -721,7 +727,7 @@ func TestMultiContainerKillAll(t *testing.T) {
 		{killContainer: true},
 		{killContainer: false},
 	} {
-		app, err := testutil.FindFile("runsc/container/test_app/test_app")
+		app, err := testutil.FindFile("test/cmd/test_app/test_app")
 		if err != nil {
 			t.Fatal("error finding test_app:", err)
 		}
@@ -739,11 +745,11 @@ func TestMultiContainerKillAll(t *testing.T) {
 		// Wait until all processes are created.
 		rootProcCount := int(math.Pow(2, 3) - 1)
 		if err := waitForProcessCount(containers[0], rootProcCount); err != nil {
-			t.Fatal(err)
+			t.Fatalf("error waitting for processes: %v", err)
 		}
 		procCount := int(math.Pow(2, 5) - 1)
 		if err := waitForProcessCount(containers[1], procCount); err != nil {
-			t.Fatal(err)
+			t.Fatalf("error waiting for processes: %v", err)
 		}
 
 		// Exec more processes to ensure signal works for exec'd processes too.
@@ -757,7 +763,7 @@ func TestMultiContainerKillAll(t *testing.T) {
 		// Wait for these new processes to start.
 		procCount += int(math.Pow(2, 3) - 1)
 		if err := waitForProcessCount(containers[1], procCount); err != nil {
-			t.Fatal(err)
+			t.Fatalf("error waiting for processes: %v", err)
 		}
 
 		if tc.killContainer {
@@ -790,11 +796,11 @@ func TestMultiContainerKillAll(t *testing.T) {
 
 		// Check that all processes are gone.
 		if err := waitForProcessCount(containers[1], 0); err != nil {
-			t.Fatal(err)
+			t.Fatalf("error waiting for processes: %v", err)
 		}
 		// Check that root container was not affected.
 		if err := waitForProcessCount(containers[0], rootProcCount); err != nil {
-			t.Fatal(err)
+			t.Fatalf("error waiting for processes: %v", err)
 		}
 	}
 }
@@ -805,17 +811,16 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) {
 		[]string{"/bin/sleep", "100"})
 
 	conf := testutil.TestConfig(t)
-	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(rootBundleDir)
+	defer cleanup()
 
 	rootArgs := Args{
 		ID:        ids[0],
 		Spec:      specs[0],
-		BundleDir: rootBundleDir,
+		BundleDir: bundleDir,
 	}
 	root, err := New(conf, rootArgs)
 	if err != nil {
@@ -827,11 +832,11 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) {
 	}
 
 	// Create and destroy sub-container.
-	bundleDir, err := testutil.SetupBundleDir(specs[1])
+	bundleDir, cleanupSub, err := testutil.SetupBundleDir(specs[1])
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(bundleDir)
+	defer cleanupSub()
 
 	args := Args{
 		ID:        ids[1],
@@ -859,17 +864,16 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
 	specs, ids := createSpecs(cmds...)
 
 	conf := testutil.TestConfig(t)
-	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
+	rootDir, bundleDir, cleanup, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(rootBundleDir)
+	defer cleanup()
 
 	rootArgs := Args{
 		ID:        ids[0],
 		Spec:      specs[0],
-		BundleDir: rootBundleDir,
+		BundleDir: bundleDir,
 	}
 	root, err := New(conf, rootArgs)
 	if err != nil {
@@ -886,16 +890,16 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
 			continue // skip root container
 		}
 
-		bundleDir, err := testutil.SetupBundleDir(specs[i])
+		bundleDir, cleanup, err := testutil.SetupBundleDir(specs[i])
 		if err != nil {
 			t.Fatalf("error setting up container: %v", err)
 		}
-		defer os.RemoveAll(bundleDir)
+		defer cleanup()
 
 		rootArgs := Args{
 			ID:        ids[i],
 			Spec:      specs[i],
-			BundleDir: rootBundleDir,
+			BundleDir: bundleDir,
 		}
 		cont, err := New(conf, rootArgs)
 		if err != nil {
@@ -937,11 +941,11 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) {
 	script := fmt.Sprintf("if [ -f %q ]; then exit 1; else touch %q; fi", filename, filename)
 	cmd := []string{"sh", "-c", script}
 
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
@@ -977,7 +981,7 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) {
 // TestMultiContainerContainerDestroyStress tests that IO operations continue
 // to work after containers have been stopped and gofers killed.
 func TestMultiContainerContainerDestroyStress(t *testing.T) {
-	app, err := testutil.FindFile("runsc/container/test_app/test_app")
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
@@ -1007,12 +1011,11 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 	childrenIDs := allIDs[1:]
 
 	conf := testutil.TestConfig(t)
-	rootDir, bundleDir, err := testutil.SetupContainer(rootSpec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(rootSpec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Start root container.
 	rootArgs := Args{
@@ -1038,11 +1041,11 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 
 		var children []*Container
 		for j, spec := range specs {
-			bundleDir, err := testutil.SetupBundleDir(spec)
+			bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
 			if err != nil {
 				t.Fatalf("error setting up container: %v", err)
 			}
-			defer os.RemoveAll(bundleDir)
+			defer cleanup()
 
 			args := Args{
 				ID:        ids[j],
@@ -1080,306 +1083,306 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 // Test that pod shared mounts are properly mounted in 2 containers and that
 // changes from one container is reflected in the other.
 func TestMultiContainerSharedMount(t *testing.T) {
-	for _, conf := range configs(t, all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
-
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		podSpec, ids := createSpecs(sleep, sleep)
-		mnt0 := specs.Mount{
-			Destination: "/mydir/test",
-			Source:      "/some/dir",
-			Type:        "tmpfs",
-			Options:     nil,
-		}
-		podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			podSpec, ids := createSpecs(sleep, sleep)
+			mnt0 := specs.Mount{
+				Destination: "/mydir/test",
+				Source:      "/some/dir",
+				Type:        "tmpfs",
+				Options:     nil,
+			}
+			podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
 
-		mnt1 := mnt0
-		mnt1.Destination = "/mydir2/test2"
-		podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+			mnt1 := mnt0
+			mnt1.Destination = "/mydir2/test2"
+			podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
 
-		createSharedMount(mnt0, "test-mount", podSpec...)
+			createSharedMount(mnt0, "test-mount", podSpec...)
 
-		containers, cleanup, err := startContainers(conf, podSpec, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			containers, cleanup, err := startContainers(conf, podSpec, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		file0 := path.Join(mnt0.Destination, "abc")
-		file1 := path.Join(mnt1.Destination, "abc")
-		execs := []execDesc{
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
-				desc: "directory is mounted in container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
-				desc: "directory is mounted in container1",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/touch", file0},
-				desc: "create file in container0",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "-f", file0},
-				desc: "file appears in container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "-f", file1},
-				desc: "file appears in container1",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/bin/rm", file1},
-				desc: "file removed from container1",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "!", "-f", file0},
-				desc: "file removed from container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "!", "-f", file1},
-				desc: "file removed from container1",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/bin/mkdir", file1},
-				desc: "create directory in container1",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "-d", file0},
-				desc: "dir appears in container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "-d", file1},
-				desc: "dir appears in container1",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/bin/rmdir", file0},
-				desc: "create directory in container0",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "!", "-d", file0},
-				desc: "dir removed from container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "!", "-d", file1},
-				desc: "dir removed from container1",
-			},
-		}
-		if err := execMany(execs); err != nil {
-			t.Fatal(err.Error())
-		}
+			file0 := path.Join(mnt0.Destination, "abc")
+			file1 := path.Join(mnt1.Destination, "abc")
+			execs := []execDesc{
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
+					desc: "directory is mounted in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
+					desc: "directory is mounted in container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/touch", file0},
+					desc: "create file in container0",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-f", file0},
+					desc: "file appears in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-f", file1},
+					desc: "file appears in container1",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/bin/rm", file1},
+					desc: "file removed from container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "!", "-f", file0},
+					desc: "file removed from container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "!", "-f", file1},
+					desc: "file removed from container1",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/bin/mkdir", file1},
+					desc: "create directory in container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-d", file0},
+					desc: "dir appears in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-d", file1},
+					desc: "dir appears in container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/bin/rmdir", file0},
+					desc: "create directory in container0",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "!", "-d", file0},
+					desc: "dir removed from container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "!", "-d", file1},
+					desc: "dir removed from container1",
+				},
+			}
+			if err := execMany(execs); err != nil {
+				t.Fatal(err.Error())
+			}
+		})
 	}
 }
 
 // Test that pod mounts are mounted as readonly when requested.
 func TestMultiContainerSharedMountReadonly(t *testing.T) {
-	for _, conf := range configs(t, all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
-
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		podSpec, ids := createSpecs(sleep, sleep)
-		mnt0 := specs.Mount{
-			Destination: "/mydir/test",
-			Source:      "/some/dir",
-			Type:        "tmpfs",
-			Options:     []string{"ro"},
-		}
-		podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			podSpec, ids := createSpecs(sleep, sleep)
+			mnt0 := specs.Mount{
+				Destination: "/mydir/test",
+				Source:      "/some/dir",
+				Type:        "tmpfs",
+				Options:     []string{"ro"},
+			}
+			podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
 
-		mnt1 := mnt0
-		mnt1.Destination = "/mydir2/test2"
-		podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+			mnt1 := mnt0
+			mnt1.Destination = "/mydir2/test2"
+			podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
 
-		createSharedMount(mnt0, "test-mount", podSpec...)
+			createSharedMount(mnt0, "test-mount", podSpec...)
 
-		containers, cleanup, err := startContainers(conf, podSpec, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			containers, cleanup, err := startContainers(conf, podSpec, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		file0 := path.Join(mnt0.Destination, "abc")
-		file1 := path.Join(mnt1.Destination, "abc")
-		execs := []execDesc{
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
-				desc: "directory is mounted in container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
-				desc: "directory is mounted in container1",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/touch", file0},
-				want: 1,
-				desc: "fails to write to container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/touch", file1},
-				want: 1,
-				desc: "fails to write to container1",
-			},
-		}
-		if err := execMany(execs); err != nil {
-			t.Fatal(err.Error())
-		}
+			file0 := path.Join(mnt0.Destination, "abc")
+			file1 := path.Join(mnt1.Destination, "abc")
+			execs := []execDesc{
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
+					desc: "directory is mounted in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
+					desc: "directory is mounted in container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/touch", file0},
+					want: 1,
+					desc: "fails to write to container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/touch", file1},
+					want: 1,
+					desc: "fails to write to container1",
+				},
+			}
+			if err := execMany(execs); err != nil {
+				t.Fatal(err.Error())
+			}
+		})
 	}
 }
 
 // Test that shared pod mounts continue to work after container is restarted.
 func TestMultiContainerSharedMountRestart(t *testing.T) {
-	for _, conf := range configs(t, all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
-
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		podSpec, ids := createSpecs(sleep, sleep)
-		mnt0 := specs.Mount{
-			Destination: "/mydir/test",
-			Source:      "/some/dir",
-			Type:        "tmpfs",
-			Options:     nil,
-		}
-		podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			podSpec, ids := createSpecs(sleep, sleep)
+			mnt0 := specs.Mount{
+				Destination: "/mydir/test",
+				Source:      "/some/dir",
+				Type:        "tmpfs",
+				Options:     nil,
+			}
+			podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
 
-		mnt1 := mnt0
-		mnt1.Destination = "/mydir2/test2"
-		podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+			mnt1 := mnt0
+			mnt1.Destination = "/mydir2/test2"
+			podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
 
-		createSharedMount(mnt0, "test-mount", podSpec...)
+			createSharedMount(mnt0, "test-mount", podSpec...)
 
-		containers, cleanup, err := startContainers(conf, podSpec, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			containers, cleanup, err := startContainers(conf, podSpec, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		file0 := path.Join(mnt0.Destination, "abc")
-		file1 := path.Join(mnt1.Destination, "abc")
-		execs := []execDesc{
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/touch", file0},
-				desc: "create file in container0",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "-f", file0},
-				desc: "file appears in container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "-f", file1},
-				desc: "file appears in container1",
-			},
-		}
-		if err := execMany(execs); err != nil {
-			t.Fatal(err.Error())
-		}
+			file0 := path.Join(mnt0.Destination, "abc")
+			file1 := path.Join(mnt1.Destination, "abc")
+			execs := []execDesc{
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/touch", file0},
+					desc: "create file in container0",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-f", file0},
+					desc: "file appears in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-f", file1},
+					desc: "file appears in container1",
+				},
+			}
+			if err := execMany(execs); err != nil {
+				t.Fatal(err.Error())
+			}
 
-		containers[1].Destroy()
+			containers[1].Destroy()
 
-		bundleDir, err := testutil.SetupBundleDir(podSpec[1])
-		if err != nil {
-			t.Fatalf("error restarting container: %v", err)
-		}
-		defer os.RemoveAll(bundleDir)
+			bundleDir, cleanup, err := testutil.SetupBundleDir(podSpec[1])
+			if err != nil {
+				t.Fatalf("error restarting container: %v", err)
+			}
+			defer cleanup()
 
-		args := Args{
-			ID:        ids[1],
-			Spec:      podSpec[1],
-			BundleDir: bundleDir,
-		}
-		containers[1], err = New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		if err := containers[1].Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			args := Args{
+				ID:        ids[1],
+				Spec:      podSpec[1],
+				BundleDir: bundleDir,
+			}
+			containers[1], err = New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			if err := containers[1].Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		execs = []execDesc{
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "-f", file0},
-				desc: "file is still in container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "-f", file1},
-				desc: "file is still in container1",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/bin/rm", file1},
-				desc: "file removed from container1",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "!", "-f", file0},
-				desc: "file removed from container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "!", "-f", file1},
-				desc: "file removed from container1",
-			},
-		}
-		if err := execMany(execs); err != nil {
-			t.Fatal(err.Error())
-		}
+			execs = []execDesc{
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-f", file0},
+					desc: "file is still in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-f", file1},
+					desc: "file is still in container1",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/bin/rm", file1},
+					desc: "file removed from container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "!", "-f", file0},
+					desc: "file removed from container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "!", "-f", file1},
+					desc: "file removed from container1",
+				},
+			}
+			if err := execMany(execs); err != nil {
+				t.Fatal(err.Error())
+			}
+		})
 	}
 }
 
 // Test that unsupported pod mounts options are ignored when matching master and
 // slave mounts.
 func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
@@ -1428,7 +1431,7 @@ func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
 // Test that one container can send an FD to another container, even though
 // they have distinct MountNamespaces.
 func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
-	app, err := testutil.FindFile("runsc/container/test_app/test_app")
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
@@ -1457,11 +1460,11 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 		Type:        "tmpfs",
 	}
 
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
@@ -1494,11 +1497,11 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 
 // Test that container is destroyed when Gofer is killed.
 func TestMultiContainerGoferKilled(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
@@ -1581,11 +1584,11 @@ func TestMultiContainerLoadSandbox(t *testing.T) {
 	sleep := []string{"sleep", "100"}
 	specs, ids := createSpecs(sleep, sleep, sleep)
 
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
@@ -1614,7 +1617,7 @@ func TestMultiContainerLoadSandbox(t *testing.T) {
 	}
 
 	// Create a valid but empty container directory.
-	randomCID := testutil.UniqueContainerID()
+	randomCID := testutil.RandomContainerID()
 	dir = filepath.Join(conf.RootDir, randomCID)
 	if err := os.MkdirAll(dir, 0755); err != nil {
 		t.Fatalf("os.MkdirAll(%q)=%v", dir, err)
@@ -1681,11 +1684,11 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
 		Type:        "bind",
 	})
 
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go
index f80852414..bac177a88 100644
--- a/runsc/container/shared_volume_test.go
+++ b/runsc/container/shared_volume_test.go
@@ -24,8 +24,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/boot"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 // TestSharedVolume checks that modifications to a volume mount are propagated
@@ -33,7 +33,6 @@ import (
 func TestSharedVolume(t *testing.T) {
 	conf := testutil.TestConfig(t)
 	conf.FileAccess = boot.FileAccessShared
-	t.Logf("Running test with conf: %+v", conf)
 
 	// Main process just sleeps. We will use "exec" to probe the state of
 	// the filesystem.
@@ -44,16 +43,15 @@ func TestSharedVolume(t *testing.T) {
 		t.Fatalf("TempDir failed: %v", err)
 	}
 
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create and start the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
@@ -192,7 +190,6 @@ func checkFile(c *Container, filename string, want []byte) error {
 func TestSharedVolumeFile(t *testing.T) {
 	conf := testutil.TestConfig(t)
 	conf.FileAccess = boot.FileAccessShared
-	t.Logf("Running test with conf: %+v", conf)
 
 	// Main process just sleeps. We will use "exec" to probe the state of
 	// the filesystem.
@@ -203,16 +200,15 @@ func TestSharedVolumeFile(t *testing.T) {
 		t.Fatalf("TempDir failed: %v", err)
 	}
 
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create and start the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD
deleted file mode 100644
index 0defbd9fc..000000000
--- a/runsc/container/test_app/BUILD
+++ /dev/null
@@ -1,21 +0,0 @@
-load("//tools:defs.bzl", "go_binary")
-
-package(licenses = ["notice"])
-
-go_binary(
-    name = "test_app",
-    testonly = 1,
-    srcs = [
-        "fds.go",
-        "test_app.go",
-    ],
-    pure = True,
-    visibility = ["//runsc/container:__pkg__"],
-    deps = [
-        "//pkg/unet",
-        "//runsc/flag",
-        "//runsc/testutil",
-        "@com_github_google_subcommands//:go_default_library",
-        "@com_github_kr_pty//:go_default_library",
-    ],
-)
diff --git a/runsc/container/test_app/fds.go b/runsc/container/test_app/fds.go
deleted file mode 100644
index 2a146a2c3..000000000
--- a/runsc/container/test_app/fds.go
+++ /dev/null
@@ -1,185 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
-	"context"
-	"io/ioutil"
-	"log"
-	"os"
-	"time"
-
-	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/pkg/unet"
-	"gvisor.dev/gvisor/runsc/flag"
-	"gvisor.dev/gvisor/runsc/testutil"
-)
-
-const fileContents = "foobarbaz"
-
-// fdSender will open a file and send the FD over a unix domain socket.
-type fdSender struct {
-	socketPath string
-}
-
-// Name implements subcommands.Command.Name.
-func (*fdSender) Name() string {
-	return "fd_sender"
-}
-
-// Synopsis implements subcommands.Command.Synopsys.
-func (*fdSender) Synopsis() string {
-	return "creates a file and sends the FD over the socket"
-}
-
-// Usage implements subcommands.Command.Usage.
-func (*fdSender) Usage() string {
-	return "fd_sender <flags>"
-}
-
-// SetFlags implements subcommands.Command.SetFlags.
-func (fds *fdSender) SetFlags(f *flag.FlagSet) {
-	f.StringVar(&fds.socketPath, "socket", "", "path to socket")
-}
-
-// Execute implements subcommands.Command.Execute.
-func (fds *fdSender) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	if fds.socketPath == "" {
-		log.Fatalf("socket flag must be set")
-	}
-
-	dir, err := ioutil.TempDir("", "")
-	if err != nil {
-		log.Fatalf("TempDir failed: %v", err)
-	}
-
-	fileToSend, err := ioutil.TempFile(dir, "")
-	if err != nil {
-		log.Fatalf("TempFile failed: %v", err)
-	}
-	defer fileToSend.Close()
-
-	if _, err := fileToSend.WriteString(fileContents); err != nil {
-		log.Fatalf("Write(%q) failed: %v", fileContents, err)
-	}
-
-	// Receiver may not be started yet, so try connecting in a poll loop.
-	var s *unet.Socket
-	if err := testutil.Poll(func() error {
-		var err error
-		s, err = unet.Connect(fds.socketPath, true /* SEQPACKET, so we can send empty message with FD */)
-		return err
-	}, 10*time.Second); err != nil {
-		log.Fatalf("Error connecting to socket %q: %v", fds.socketPath, err)
-	}
-	defer s.Close()
-
-	w := s.Writer(true)
-	w.ControlMessage.PackFDs(int(fileToSend.Fd()))
-	if _, err := w.WriteVec([][]byte{[]byte{'a'}}); err != nil {
-		log.Fatalf("Error sending FD %q over socket %q: %v", fileToSend.Fd(), fds.socketPath, err)
-	}
-
-	log.Print("FD SENDER exiting successfully")
-	return subcommands.ExitSuccess
-}
-
-// fdReceiver receives an FD from a unix domain socket and does things to it.
-type fdReceiver struct {
-	socketPath string
-}
-
-// Name implements subcommands.Command.Name.
-func (*fdReceiver) Name() string {
-	return "fd_receiver"
-}
-
-// Synopsis implements subcommands.Command.Synopsys.
-func (*fdReceiver) Synopsis() string {
-	return "reads an FD from a unix socket, and then does things to it"
-}
-
-// Usage implements subcommands.Command.Usage.
-func (*fdReceiver) Usage() string {
-	return "fd_receiver <flags>"
-}
-
-// SetFlags implements subcommands.Command.SetFlags.
-func (fdr *fdReceiver) SetFlags(f *flag.FlagSet) {
-	f.StringVar(&fdr.socketPath, "socket", "", "path to socket")
-}
-
-// Execute implements subcommands.Command.Execute.
-func (fdr *fdReceiver) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	if fdr.socketPath == "" {
-		log.Fatalf("Flags cannot be empty, given: socket: %q", fdr.socketPath)
-	}
-
-	ss, err := unet.BindAndListen(fdr.socketPath, true /* packet */)
-	if err != nil {
-		log.Fatalf("BindAndListen(%q) failed: %v", fdr.socketPath, err)
-	}
-	defer ss.Close()
-
-	var s *unet.Socket
-	c := make(chan error, 1)
-	go func() {
-		var err error
-		s, err = ss.Accept()
-		c <- err
-	}()
-
-	select {
-	case err := <-c:
-		if err != nil {
-			log.Fatalf("Accept() failed: %v", err)
-		}
-	case <-time.After(10 * time.Second):
-		log.Fatalf("Timeout waiting for accept")
-	}
-
-	r := s.Reader(true)
-	r.EnableFDs(1)
-	b := [][]byte{{'a'}}
-	if n, err := r.ReadVec(b); n != 1 || err != nil {
-		log.Fatalf("ReadVec got n=%d err %v (wanted 0, nil)", n, err)
-	}
-
-	fds, err := r.ExtractFDs()
-	if err != nil {
-		log.Fatalf("ExtractFD() got err %v", err)
-	}
-	if len(fds) != 1 {
-		log.Fatalf("ExtractFD() got %d FDs, wanted 1", len(fds))
-	}
-	fd := fds[0]
-
-	file := os.NewFile(uintptr(fd), "received file")
-	defer file.Close()
-	if _, err := file.Seek(0, os.SEEK_SET); err != nil {
-		log.Fatalf("Seek(0, 0) failed: %v", err)
-	}
-
-	got, err := ioutil.ReadAll(file)
-	if err != nil {
-		log.Fatalf("ReadAll failed: %v", err)
-	}
-	if string(got) != fileContents {
-		log.Fatalf("ReadAll got %q want %q", string(got), fileContents)
-	}
-
-	log.Print("FD RECEIVER exiting successfully")
-	return subcommands.ExitSuccess
-}
diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go
deleted file mode 100644
index 5f1c4b7d6..000000000
--- a/runsc/container/test_app/test_app.go
+++ /dev/null
@@ -1,394 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Binary test_app is like a swiss knife for tests that need to run anything
-// inside the sandbox. New functionality can be added with new commands.
-package main
-
-import (
-	"context"
-	"fmt"
-	"io"
-	"io/ioutil"
-	"log"
-	"net"
-	"os"
-	"os/exec"
-	"regexp"
-	"strconv"
-	sys "syscall"
-	"time"
-
-	"github.com/google/subcommands"
-	"github.com/kr/pty"
-	"gvisor.dev/gvisor/runsc/flag"
-	"gvisor.dev/gvisor/runsc/testutil"
-)
-
-func main() {
-	subcommands.Register(subcommands.HelpCommand(), "")
-	subcommands.Register(subcommands.FlagsCommand(), "")
-	subcommands.Register(new(capability), "")
-	subcommands.Register(new(fdReceiver), "")
-	subcommands.Register(new(fdSender), "")
-	subcommands.Register(new(forkBomb), "")
-	subcommands.Register(new(ptyRunner), "")
-	subcommands.Register(new(reaper), "")
-	subcommands.Register(new(syscall), "")
-	subcommands.Register(new(taskTree), "")
-	subcommands.Register(new(uds), "")
-
-	flag.Parse()
-
-	exitCode := subcommands.Execute(context.Background())
-	os.Exit(int(exitCode))
-}
-
-type uds struct {
-	fileName   string
-	socketPath string
-}
-
-// Name implements subcommands.Command.Name.
-func (*uds) Name() string {
-	return "uds"
-}
-
-// Synopsis implements subcommands.Command.Synopsys.
-func (*uds) Synopsis() string {
-	return "creates unix domain socket client and server. Client sends a contant flow of sequential numbers. Server prints them to --file"
-}
-
-// Usage implements subcommands.Command.Usage.
-func (*uds) Usage() string {
-	return "uds <flags>"
-}
-
-// SetFlags implements subcommands.Command.SetFlags.
-func (c *uds) SetFlags(f *flag.FlagSet) {
-	f.StringVar(&c.fileName, "file", "", "name of output file")
-	f.StringVar(&c.socketPath, "socket", "", "path to socket")
-}
-
-// Execute implements subcommands.Command.Execute.
-func (c *uds) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	if c.fileName == "" || c.socketPath == "" {
-		log.Fatalf("Flags cannot be empty, given: fileName: %q, socketPath: %q", c.fileName, c.socketPath)
-		return subcommands.ExitFailure
-	}
-	outputFile, err := os.OpenFile(c.fileName, os.O_WRONLY|os.O_CREATE, 0666)
-	if err != nil {
-		log.Fatal("error opening output file:", err)
-	}
-
-	defer os.Remove(c.socketPath)
-
-	listener, err := net.Listen("unix", c.socketPath)
-	if err != nil {
-		log.Fatalf("error listening on socket %q: %v", c.socketPath, err)
-	}
-
-	go server(listener, outputFile)
-	for i := 0; ; i++ {
-		conn, err := net.Dial("unix", c.socketPath)
-		if err != nil {
-			log.Fatal("error dialing:", err)
-		}
-		if _, err := conn.Write([]byte(strconv.Itoa(i))); err != nil {
-			log.Fatal("error writing:", err)
-		}
-		conn.Close()
-		time.Sleep(100 * time.Millisecond)
-	}
-}
-
-func server(listener net.Listener, out *os.File) {
-	buf := make([]byte, 16)
-
-	for {
-		c, err := listener.Accept()
-		if err != nil {
-			log.Fatal("error accepting connection:", err)
-		}
-		nr, err := c.Read(buf)
-		if err != nil {
-			log.Fatal("error reading from buf:", err)
-		}
-		data := buf[0:nr]
-		fmt.Fprint(out, string(data)+"\n")
-	}
-}
-
-type taskTree struct {
-	depth int
-	width int
-	pause bool
-}
-
-// Name implements subcommands.Command.
-func (*taskTree) Name() string {
-	return "task-tree"
-}
-
-// Synopsis implements subcommands.Command.
-func (*taskTree) Synopsis() string {
-	return "creates a tree of tasks"
-}
-
-// Usage implements subcommands.Command.
-func (*taskTree) Usage() string {
-	return "task-tree <flags>"
-}
-
-// SetFlags implements subcommands.Command.
-func (c *taskTree) SetFlags(f *flag.FlagSet) {
-	f.IntVar(&c.depth, "depth", 1, "number of levels to create")
-	f.IntVar(&c.width, "width", 1, "number of tasks at each level")
-	f.BoolVar(&c.pause, "pause", false, "whether the tasks should pause perpetually")
-}
-
-// Execute implements subcommands.Command.
-func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	stop := testutil.StartReaper()
-	defer stop()
-
-	if c.depth == 0 {
-		log.Printf("Child sleeping, PID: %d\n", os.Getpid())
-		select {}
-	}
-	log.Printf("Parent %d sleeping, PID: %d\n", c.depth, os.Getpid())
-
-	var cmds []*exec.Cmd
-	for i := 0; i < c.width; i++ {
-		cmd := exec.Command(
-			"/proc/self/exe", c.Name(),
-			"--depth", strconv.Itoa(c.depth-1),
-			"--width", strconv.Itoa(c.width),
-			"--pause", strconv.FormatBool(c.pause))
-		cmd.Stdout = os.Stdout
-		cmd.Stderr = os.Stderr
-
-		if err := cmd.Start(); err != nil {
-			log.Fatal("failed to call self:", err)
-		}
-		cmds = append(cmds, cmd)
-	}
-
-	for _, c := range cmds {
-		c.Wait()
-	}
-
-	if c.pause {
-		select {}
-	}
-
-	return subcommands.ExitSuccess
-}
-
-type forkBomb struct {
-	delay time.Duration
-}
-
-// Name implements subcommands.Command.
-func (*forkBomb) Name() string {
-	return "fork-bomb"
-}
-
-// Synopsis implements subcommands.Command.
-func (*forkBomb) Synopsis() string {
-	return "creates child process until the end of times"
-}
-
-// Usage implements subcommands.Command.
-func (*forkBomb) Usage() string {
-	return "fork-bomb <flags>"
-}
-
-// SetFlags implements subcommands.Command.
-func (c *forkBomb) SetFlags(f *flag.FlagSet) {
-	f.DurationVar(&c.delay, "delay", 100*time.Millisecond, "amount of time to delay creation of child")
-}
-
-// Execute implements subcommands.Command.
-func (c *forkBomb) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	time.Sleep(c.delay)
-
-	cmd := exec.Command("/proc/self/exe", c.Name())
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-	if err := cmd.Run(); err != nil {
-		log.Fatal("failed to call self:", err)
-	}
-	return subcommands.ExitSuccess
-}
-
-type reaper struct{}
-
-// Name implements subcommands.Command.
-func (*reaper) Name() string {
-	return "reaper"
-}
-
-// Synopsis implements subcommands.Command.
-func (*reaper) Synopsis() string {
-	return "reaps all children in a loop"
-}
-
-// Usage implements subcommands.Command.
-func (*reaper) Usage() string {
-	return "reaper <flags>"
-}
-
-// SetFlags implements subcommands.Command.
-func (*reaper) SetFlags(*flag.FlagSet) {}
-
-// Execute implements subcommands.Command.
-func (c *reaper) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	stop := testutil.StartReaper()
-	defer stop()
-	select {}
-}
-
-type syscall struct {
-	sysno uint64
-}
-
-// Name implements subcommands.Command.
-func (*syscall) Name() string {
-	return "syscall"
-}
-
-// Synopsis implements subcommands.Command.
-func (*syscall) Synopsis() string {
-	return "syscall makes a syscall"
-}
-
-// Usage implements subcommands.Command.
-func (*syscall) Usage() string {
-	return "syscall <flags>"
-}
-
-// SetFlags implements subcommands.Command.
-func (s *syscall) SetFlags(f *flag.FlagSet) {
-	f.Uint64Var(&s.sysno, "syscall", 0, "syscall to call")
-}
-
-// Execute implements subcommands.Command.
-func (s *syscall) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	if _, _, errno := sys.Syscall(uintptr(s.sysno), 0, 0, 0); errno != 0 {
-		fmt.Printf("syscall(%d, 0, 0...) failed: %v\n", s.sysno, errno)
-	} else {
-		fmt.Printf("syscall(%d, 0, 0...) success\n", s.sysno)
-	}
-	return subcommands.ExitSuccess
-}
-
-type capability struct {
-	enabled  uint64
-	disabled uint64
-}
-
-// Name implements subcommands.Command.
-func (*capability) Name() string {
-	return "capability"
-}
-
-// Synopsis implements subcommands.Command.
-func (*capability) Synopsis() string {
-	return "checks if effective capabilities are set/unset"
-}
-
-// Usage implements subcommands.Command.
-func (*capability) Usage() string {
-	return "capability [--enabled=number] [--disabled=number]"
-}
-
-// SetFlags implements subcommands.Command.
-func (c *capability) SetFlags(f *flag.FlagSet) {
-	f.Uint64Var(&c.enabled, "enabled", 0, "")
-	f.Uint64Var(&c.disabled, "disabled", 0, "")
-}
-
-// Execute implements subcommands.Command.
-func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	if c.enabled == 0 && c.disabled == 0 {
-		fmt.Println("One of the flags must be set")
-		return subcommands.ExitUsageError
-	}
-
-	status, err := ioutil.ReadFile("/proc/self/status")
-	if err != nil {
-		fmt.Printf("Error reading %q: %v\n", "proc/self/status", err)
-		return subcommands.ExitFailure
-	}
-	re := regexp.MustCompile("CapEff:\t([0-9a-f]+)\n")
-	matches := re.FindStringSubmatch(string(status))
-	if matches == nil || len(matches) != 2 {
-		fmt.Printf("Effective capabilities not found in\n%s\n", status)
-		return subcommands.ExitFailure
-	}
-	caps, err := strconv.ParseUint(matches[1], 16, 64)
-	if err != nil {
-		fmt.Printf("failed to convert capabilities %q: %v\n", matches[1], err)
-		return subcommands.ExitFailure
-	}
-
-	if c.enabled != 0 && (caps&c.enabled) != c.enabled {
-		fmt.Printf("Missing capabilities, want: %#x: got: %#x\n", c.enabled, caps)
-		return subcommands.ExitFailure
-	}
-	if c.disabled != 0 && (caps&c.disabled) != 0 {
-		fmt.Printf("Extra capabilities found, dont_want: %#x: got: %#x\n", c.disabled, caps)
-		return subcommands.ExitFailure
-	}
-
-	return subcommands.ExitSuccess
-}
-
-type ptyRunner struct{}
-
-// Name implements subcommands.Command.
-func (*ptyRunner) Name() string {
-	return "pty-runner"
-}
-
-// Synopsis implements subcommands.Command.
-func (*ptyRunner) Synopsis() string {
-	return "runs the given command with an open pty terminal"
-}
-
-// Usage implements subcommands.Command.
-func (*ptyRunner) Usage() string {
-	return "pty-runner [command]"
-}
-
-// SetFlags implements subcommands.Command.SetFlags.
-func (*ptyRunner) SetFlags(f *flag.FlagSet) {}
-
-// Execute implements subcommands.Command.
-func (*ptyRunner) Execute(_ context.Context, fs *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus {
-	c := exec.Command(fs.Args()[0], fs.Args()[1:]...)
-	f, err := pty.Start(c)
-	if err != nil {
-		fmt.Printf("pty.Start failed: %v", err)
-		return subcommands.ExitFailure
-	}
-	defer f.Close()
-
-	// Copy stdout from the command to keep this process alive until the
-	// subprocess exits.
-	io.Copy(os.Stdout, f)
-
-	return subcommands.ExitSuccess
-}
diff --git a/runsc/criutil/BUILD b/runsc/criutil/BUILD
deleted file mode 100644
index 8a571a000..000000000
--- a/runsc/criutil/BUILD
+++ /dev/null
@@ -1,11 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "criutil",
-    testonly = 1,
-    srcs = ["criutil.go"],
-    visibility = ["//:sandbox"],
-    deps = ["//runsc/testutil"],
-)
diff --git a/runsc/criutil/criutil.go b/runsc/criutil/criutil.go
deleted file mode 100644
index 773f5a1c4..000000000
--- a/runsc/criutil/criutil.go
+++ /dev/null
@@ -1,277 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package criutil contains utility functions for interacting with the
-// Container Runtime Interface (CRI), principally via the crictl command line
-// tool. This requires critools to be installed on the local system.
-package criutil
-
-import (
-	"encoding/json"
-	"fmt"
-	"os"
-	"os/exec"
-	"strings"
-	"time"
-
-	"gvisor.dev/gvisor/runsc/testutil"
-)
-
-const endpointPrefix = "unix://"
-
-// Crictl contains information required to run the crictl utility.
-type Crictl struct {
-	executable      string
-	timeout         time.Duration
-	imageEndpoint   string
-	runtimeEndpoint string
-}
-
-// NewCrictl returns a Crictl configured with a timeout and an endpoint over
-// which it will talk to containerd.
-func NewCrictl(timeout time.Duration, endpoint string) *Crictl {
-	// Bazel doesn't pass PATH through, assume the location of crictl
-	// unless specified by environment variable.
-	executable := os.Getenv("CRICTL_PATH")
-	if executable == "" {
-		executable = "/usr/local/bin/crictl"
-	}
-	return &Crictl{
-		executable:      executable,
-		timeout:         timeout,
-		imageEndpoint:   endpointPrefix + endpoint,
-		runtimeEndpoint: endpointPrefix + endpoint,
-	}
-}
-
-// Pull pulls an container image. It corresponds to `crictl pull`.
-func (cc *Crictl) Pull(imageName string) error {
-	_, err := cc.run("pull", imageName)
-	return err
-}
-
-// RunPod creates a sandbox. It corresponds to `crictl runp`.
-func (cc *Crictl) RunPod(sbSpecFile string) (string, error) {
-	podID, err := cc.run("runp", sbSpecFile)
-	if err != nil {
-		return "", fmt.Errorf("runp failed: %v", err)
-	}
-	// Strip the trailing newline from crictl output.
-	return strings.TrimSpace(podID), nil
-}
-
-// Create creates a container within a sandbox. It corresponds to `crictl
-// create`.
-func (cc *Crictl) Create(podID, contSpecFile, sbSpecFile string) (string, error) {
-	podID, err := cc.run("create", podID, contSpecFile, sbSpecFile)
-	if err != nil {
-		return "", fmt.Errorf("create failed: %v", err)
-	}
-	// Strip the trailing newline from crictl output.
-	return strings.TrimSpace(podID), nil
-}
-
-// Start starts a container. It corresponds to `crictl start`.
-func (cc *Crictl) Start(contID string) (string, error) {
-	output, err := cc.run("start", contID)
-	if err != nil {
-		return "", fmt.Errorf("start failed: %v", err)
-	}
-	return output, nil
-}
-
-// Stop stops a container. It corresponds to `crictl stop`.
-func (cc *Crictl) Stop(contID string) error {
-	_, err := cc.run("stop", contID)
-	return err
-}
-
-// Exec execs a program inside a container. It corresponds to `crictl exec`.
-func (cc *Crictl) Exec(contID string, args ...string) (string, error) {
-	a := []string{"exec", contID}
-	a = append(a, args...)
-	output, err := cc.run(a...)
-	if err != nil {
-		return "", fmt.Errorf("exec failed: %v", err)
-	}
-	return output, nil
-}
-
-// Rm removes a container. It corresponds to `crictl rm`.
-func (cc *Crictl) Rm(contID string) error {
-	_, err := cc.run("rm", contID)
-	return err
-}
-
-// StopPod stops a pod. It corresponds to `crictl stopp`.
-func (cc *Crictl) StopPod(podID string) error {
-	_, err := cc.run("stopp", podID)
-	return err
-}
-
-// containsConfig is a minimal copy of
-// https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/apis/cri/runtime/v1alpha2/api.proto
-// It only contains fields needed for testing.
-type containerConfig struct {
-	Status containerStatus
-}
-
-type containerStatus struct {
-	Network containerNetwork
-}
-
-type containerNetwork struct {
-	IP string
-}
-
-// PodIP returns a pod's IP address.
-func (cc *Crictl) PodIP(podID string) (string, error) {
-	output, err := cc.run("inspectp", podID)
-	if err != nil {
-		return "", err
-	}
-	conf := &containerConfig{}
-	if err := json.Unmarshal([]byte(output), conf); err != nil {
-		return "", fmt.Errorf("failed to unmarshal JSON: %v, %s", err, output)
-	}
-	if conf.Status.Network.IP == "" {
-		return "", fmt.Errorf("no IP found in config: %s", output)
-	}
-	return conf.Status.Network.IP, nil
-}
-
-// RmPod removes a container. It corresponds to `crictl rmp`.
-func (cc *Crictl) RmPod(podID string) error {
-	_, err := cc.run("rmp", podID)
-	return err
-}
-
-// StartContainer pulls the given image ands starts the container in the
-// sandbox with the given podID.
-func (cc *Crictl) StartContainer(podID, image, sbSpec, contSpec string) (string, error) {
-	// Write the specs to files that can be read by crictl.
-	sbSpecFile, err := testutil.WriteTmpFile("sbSpec", sbSpec)
-	if err != nil {
-		return "", fmt.Errorf("failed to write sandbox spec: %v", err)
-	}
-	contSpecFile, err := testutil.WriteTmpFile("contSpec", contSpec)
-	if err != nil {
-		return "", fmt.Errorf("failed to write container spec: %v", err)
-	}
-
-	return cc.startContainer(podID, image, sbSpecFile, contSpecFile)
-}
-
-func (cc *Crictl) startContainer(podID, image, sbSpecFile, contSpecFile string) (string, error) {
-	if err := cc.Pull(image); err != nil {
-		return "", fmt.Errorf("failed to pull %s: %v", image, err)
-	}
-
-	contID, err := cc.Create(podID, contSpecFile, sbSpecFile)
-	if err != nil {
-		return "", fmt.Errorf("failed to create container in pod %q: %v", podID, err)
-	}
-
-	if _, err := cc.Start(contID); err != nil {
-		return "", fmt.Errorf("failed to start container %q in pod %q: %v", contID, podID, err)
-	}
-
-	return contID, nil
-}
-
-// StopContainer stops and deletes the container with the given container ID.
-func (cc *Crictl) StopContainer(contID string) error {
-	if err := cc.Stop(contID); err != nil {
-		return fmt.Errorf("failed to stop container %q: %v", contID, err)
-	}
-
-	if err := cc.Rm(contID); err != nil {
-		return fmt.Errorf("failed to remove container %q: %v", contID, err)
-	}
-
-	return nil
-}
-
-// StartPodAndContainer pulls an image, then starts a sandbox and container in
-// that sandbox. It returns the pod ID and container ID.
-func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string, string, error) {
-	// Write the specs to files that can be read by crictl.
-	sbSpecFile, err := testutil.WriteTmpFile("sbSpec", sbSpec)
-	if err != nil {
-		return "", "", fmt.Errorf("failed to write sandbox spec: %v", err)
-	}
-	contSpecFile, err := testutil.WriteTmpFile("contSpec", contSpec)
-	if err != nil {
-		return "", "", fmt.Errorf("failed to write container spec: %v", err)
-	}
-
-	podID, err := cc.RunPod(sbSpecFile)
-	if err != nil {
-		return "", "", err
-	}
-
-	contID, err := cc.startContainer(podID, image, sbSpecFile, contSpecFile)
-
-	return podID, contID, err
-}
-
-// StopPodAndContainer stops a container and pod.
-func (cc *Crictl) StopPodAndContainer(podID, contID string) error {
-	if err := cc.StopContainer(contID); err != nil {
-		return fmt.Errorf("failed to stop container %q in pod %q: %v", contID, podID, err)
-	}
-
-	if err := cc.StopPod(podID); err != nil {
-		return fmt.Errorf("failed to stop pod %q: %v", podID, err)
-	}
-
-	if err := cc.RmPod(podID); err != nil {
-		return fmt.Errorf("failed to remove pod %q: %v", podID, err)
-	}
-
-	return nil
-}
-
-// run runs crictl with the given args and returns an error if it takes longer
-// than cc.Timeout to run.
-func (cc *Crictl) run(args ...string) (string, error) {
-	defaultArgs := []string{
-		"--image-endpoint", cc.imageEndpoint,
-		"--runtime-endpoint", cc.runtimeEndpoint,
-	}
-	cmd := exec.Command(cc.executable, append(defaultArgs, args...)...)
-
-	// Run the command with a timeout.
-	done := make(chan string)
-	errCh := make(chan error)
-	go func() {
-		output, err := cmd.CombinedOutput()
-		if err != nil {
-			errCh <- fmt.Errorf("error: \"%v\", output: %s", err, string(output))
-			return
-		}
-		done <- string(output)
-	}()
-	select {
-	case output := <-done:
-		return output, nil
-	case err := <-errCh:
-		return "", err
-	case <-time.After(cc.timeout):
-		if err := testutil.KillCommand(cmd); err != nil {
-			return "", fmt.Errorf("timed out, then couldn't kill process %+v: %v", cmd, err)
-		}
-		return "", fmt.Errorf("timed out: %+v", cmd)
-	}
-}
diff --git a/runsc/dockerutil/BUILD b/runsc/dockerutil/BUILD
deleted file mode 100644
index 8621af901..000000000
--- a/runsc/dockerutil/BUILD
+++ /dev/null
@@ -1,14 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "dockerutil",
-    testonly = 1,
-    srcs = ["dockerutil.go"],
-    visibility = ["//:sandbox"],
-    deps = [
-        "//runsc/testutil",
-        "@com_github_kr_pty//:go_default_library",
-    ],
-)
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
deleted file mode 100644
index f009486bc..000000000
--- a/runsc/dockerutil/dockerutil.go
+++ /dev/null
@@ -1,486 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package dockerutil is a collection of utility functions, primarily for
-// testing.
-package dockerutil
-
-import (
-	"encoding/json"
-	"flag"
-	"fmt"
-	"io/ioutil"
-	"log"
-	"os"
-	"os/exec"
-	"path"
-	"regexp"
-	"strconv"
-	"strings"
-	"syscall"
-	"time"
-
-	"github.com/kr/pty"
-	"gvisor.dev/gvisor/runsc/testutil"
-)
-
-var (
-	// runtime is the runtime to use for tests. This will be applied to all
-	// containers. Note that the default here ("runsc") corresponds to the
-	// default used by the installations. This is important, because the
-	// default installer for vm_tests (in tools/installers:head, invoked
-	// via tools/vm:defs.bzl) will install with this name. So without
-	// changing anything, tests should have a runsc runtime available to
-	// them. Otherwise installers should update the existing runtime
-	// instead of installing a new one.
-	runtime = flag.String("runtime", "runsc", "specify which runtime to use")
-
-	// config is the default Docker daemon configuration path.
-	config = flag.String("config_path", "/etc/docker/daemon.json", "configuration file for reading paths")
-)
-
-// EnsureSupportedDockerVersion checks if correct docker is installed.
-func EnsureSupportedDockerVersion() {
-	cmd := exec.Command("docker", "version")
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		log.Fatalf("Error running %q: %v", "docker version", err)
-	}
-	re := regexp.MustCompile(`Version:\s+(\d+)\.(\d+)\.\d.*`)
-	matches := re.FindStringSubmatch(string(out))
-	if len(matches) != 3 {
-		log.Fatalf("Invalid docker output: %s", out)
-	}
-	major, _ := strconv.Atoi(matches[1])
-	minor, _ := strconv.Atoi(matches[2])
-	if major < 17 || (major == 17 && minor < 9) {
-		log.Fatalf("Docker version 17.09.0 or greater is required, found: %02d.%02d", major, minor)
-	}
-}
-
-// RuntimePath returns the binary path for the current runtime.
-func RuntimePath() (string, error) {
-	// Read the configuration data; the file must exist.
-	configBytes, err := ioutil.ReadFile(*config)
-	if err != nil {
-		return "", err
-	}
-
-	// Unmarshal the configuration.
-	c := make(map[string]interface{})
-	if err := json.Unmarshal(configBytes, &c); err != nil {
-		return "", err
-	}
-
-	// Decode the expected configuration.
-	r, ok := c["runtimes"]
-	if !ok {
-		return "", fmt.Errorf("no runtimes declared: %v", c)
-	}
-	rs, ok := r.(map[string]interface{})
-	if !ok {
-		// The runtimes are not a map.
-		return "", fmt.Errorf("unexpected format: %v", c)
-	}
-	r, ok = rs[*runtime]
-	if !ok {
-		// The expected runtime is not declared.
-		return "", fmt.Errorf("runtime %q not found: %v", *runtime, c)
-	}
-	rs, ok = r.(map[string]interface{})
-	if !ok {
-		// The runtime is not a map.
-		return "", fmt.Errorf("unexpected format: %v", c)
-	}
-	p, ok := rs["path"].(string)
-	if !ok {
-		// The runtime does not declare a path.
-		return "", fmt.Errorf("unexpected format: %v", c)
-	}
-	return p, nil
-}
-
-// MountMode describes if the mount should be ro or rw.
-type MountMode int
-
-const (
-	// ReadOnly is what the name says.
-	ReadOnly MountMode = iota
-	// ReadWrite is what the name says.
-	ReadWrite
-)
-
-// String returns the mount mode argument for this MountMode.
-func (m MountMode) String() string {
-	switch m {
-	case ReadOnly:
-		return "ro"
-	case ReadWrite:
-		return "rw"
-	}
-	panic(fmt.Sprintf("invalid mode: %d", m))
-}
-
-// MountArg formats the volume argument to mount in the container.
-func MountArg(source, target string, mode MountMode) string {
-	return fmt.Sprintf("-v=%s:%s:%v", source, target, mode)
-}
-
-// LinkArg formats the link argument.
-func LinkArg(source *Docker, target string) string {
-	return fmt.Sprintf("--link=%s:%s", source.Name, target)
-}
-
-// PrepareFiles creates temp directory to copy files there. The sandbox doesn't
-// have access to files in the test dir.
-func PrepareFiles(names ...string) (string, error) {
-	dir, err := ioutil.TempDir("", "image-test")
-	if err != nil {
-		return "", fmt.Errorf("ioutil.TempDir failed: %v", err)
-	}
-	if err := os.Chmod(dir, 0777); err != nil {
-		return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err)
-	}
-	for _, name := range names {
-		src, err := testutil.FindFile(name)
-		if err != nil {
-			return "", fmt.Errorf("testutil.Preparefiles(%q) failed: %v", name, err)
-		}
-		dst := path.Join(dir, path.Base(name))
-		if err := testutil.Copy(src, dst); err != nil {
-			return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err)
-		}
-	}
-	return dir, nil
-}
-
-// do executes docker command.
-func do(args ...string) (string, error) {
-	log.Printf("Running: docker %s\n", args)
-	cmd := exec.Command("docker", args...)
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		return "", fmt.Errorf("error executing docker %s: %v\nout: %s", args, err, out)
-	}
-	return string(out), nil
-}
-
-// doWithPty executes docker command with stdio attached to a pty.
-func doWithPty(args ...string) (*exec.Cmd, *os.File, error) {
-	log.Printf("Running with pty: docker %s\n", args)
-	cmd := exec.Command("docker", args...)
-	ptmx, err := pty.Start(cmd)
-	if err != nil {
-		return nil, nil, fmt.Errorf("error executing docker %s with a pty: %v", args, err)
-	}
-	return cmd, ptmx, nil
-}
-
-// Pull pulls a docker image. This is used in tests to isolate the
-// time to pull the image off the network from the time to actually
-// start the container, to avoid timeouts over slow networks.
-func Pull(image string) error {
-	_, err := do("pull", image)
-	return err
-}
-
-// Docker contains the name and the runtime of a docker container.
-type Docker struct {
-	Runtime string
-	Name    string
-}
-
-// MakeDocker sets up the struct for a Docker container.
-// Names of containers will be unique.
-func MakeDocker(namePrefix string) Docker {
-	return Docker{
-		Name:    testutil.RandomName(namePrefix),
-		Runtime: *runtime,
-	}
-}
-
-// logDockerID logs a container id, which is needed to find container runsc logs.
-func (d *Docker) logDockerID() {
-	id, err := d.ID()
-	if err != nil {
-		log.Printf("%v\n", err)
-	}
-	log.Printf("Name: %s ID: %v\n", d.Name, id)
-}
-
-// Create calls 'docker create' with the arguments provided.
-func (d *Docker) Create(args ...string) error {
-	a := []string{"create", "--runtime", d.Runtime, "--name", d.Name}
-	a = append(a, args...)
-	_, err := do(a...)
-	if err == nil {
-		d.logDockerID()
-	}
-	return err
-}
-
-// Start calls 'docker start'.
-func (d *Docker) Start() error {
-	if _, err := do("start", d.Name); err != nil {
-		return fmt.Errorf("error starting container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Stop calls 'docker stop'.
-func (d *Docker) Stop() error {
-	if _, err := do("stop", d.Name); err != nil {
-		return fmt.Errorf("error stopping container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Run calls 'docker run' with the arguments provided. The container starts
-// running in the background and the call returns immediately.
-func (d *Docker) Run(args ...string) error {
-	a := d.runArgs("-d")
-	a = append(a, args...)
-	_, err := do(a...)
-	if err == nil {
-		d.logDockerID()
-	}
-	return err
-}
-
-// RunWithPty is like Run but with an attached pty.
-func (d *Docker) RunWithPty(args ...string) (*exec.Cmd, *os.File, error) {
-	a := d.runArgs("-it")
-	a = append(a, args...)
-	return doWithPty(a...)
-}
-
-// RunFg calls 'docker run' with the arguments provided in the foreground. It
-// blocks until the container exits and returns the output.
-func (d *Docker) RunFg(args ...string) (string, error) {
-	a := d.runArgs(args...)
-	out, err := do(a...)
-	if err == nil {
-		d.logDockerID()
-	}
-	return string(out), err
-}
-
-func (d *Docker) runArgs(args ...string) []string {
-	// Environment variable RUNSC_TEST_NAME is picked up by the runtime and added
-	// to the log name, so one can easily identify the corresponding logs for
-	// this test.
-	rv := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-e", "RUNSC_TEST_NAME=" + d.Name}
-	return append(rv, args...)
-}
-
-// Logs calls 'docker logs'.
-func (d *Docker) Logs() (string, error) {
-	return do("logs", d.Name)
-}
-
-// Exec calls 'docker exec' with the arguments provided.
-func (d *Docker) Exec(args ...string) (string, error) {
-	return d.ExecWithFlags(nil, args...)
-}
-
-// ExecWithFlags calls 'docker exec <flags> name <args>'.
-func (d *Docker) ExecWithFlags(flags []string, args ...string) (string, error) {
-	a := []string{"exec"}
-	a = append(a, flags...)
-	a = append(a, d.Name)
-	a = append(a, args...)
-	return do(a...)
-}
-
-// ExecAsUser calls 'docker exec' as the given user with the arguments
-// provided.
-func (d *Docker) ExecAsUser(user string, args ...string) (string, error) {
-	a := []string{"exec", "--user", user, d.Name}
-	a = append(a, args...)
-	return do(a...)
-}
-
-// ExecWithTerminal calls 'docker exec -it' with the arguments provided and
-// attaches a pty to stdio.
-func (d *Docker) ExecWithTerminal(args ...string) (*exec.Cmd, *os.File, error) {
-	a := []string{"exec", "-it", d.Name}
-	a = append(a, args...)
-	return doWithPty(a...)
-}
-
-// Pause calls 'docker pause'.
-func (d *Docker) Pause() error {
-	if _, err := do("pause", d.Name); err != nil {
-		return fmt.Errorf("error pausing container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Unpause calls 'docker pause'.
-func (d *Docker) Unpause() error {
-	if _, err := do("unpause", d.Name); err != nil {
-		return fmt.Errorf("error unpausing container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Checkpoint calls 'docker checkpoint'.
-func (d *Docker) Checkpoint(name string) error {
-	if _, err := do("checkpoint", "create", d.Name, name); err != nil {
-		return fmt.Errorf("error pausing container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Restore calls 'docker start --checkname [name]'.
-func (d *Docker) Restore(name string) error {
-	if _, err := do("start", "--checkpoint", name, d.Name); err != nil {
-		return fmt.Errorf("error starting container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Remove calls 'docker rm'.
-func (d *Docker) Remove() error {
-	if _, err := do("rm", d.Name); err != nil {
-		return fmt.Errorf("error deleting container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// CleanUp kills and deletes the container (best effort).
-func (d *Docker) CleanUp() {
-	d.logDockerID()
-	if _, err := do("kill", d.Name); err != nil {
-		if strings.Contains(err.Error(), "is not running") {
-			// Nothing to kill. Don't log the error in this case.
-		} else {
-			log.Printf("error killing container %q: %v", d.Name, err)
-		}
-	}
-	if err := d.Remove(); err != nil {
-		log.Print(err)
-	}
-}
-
-// FindPort returns the host port that is mapped to 'sandboxPort'. This calls
-// docker to allocate a free port in the host and prevent conflicts.
-func (d *Docker) FindPort(sandboxPort int) (int, error) {
-	format := fmt.Sprintf(`{{ (index (index .NetworkSettings.Ports "%d/tcp") 0).HostPort }}`, sandboxPort)
-	out, err := do("inspect", "-f", format, d.Name)
-	if err != nil {
-		return -1, fmt.Errorf("error retrieving port: %v", err)
-	}
-	port, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
-	if err != nil {
-		return -1, fmt.Errorf("error parsing port %q: %v", out, err)
-	}
-	return port, nil
-}
-
-// FindIP returns the IP address of the container as a string.
-func (d *Docker) FindIP() (string, error) {
-	const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}`
-	out, err := do("inspect", "-f", format, d.Name)
-	if err != nil {
-		return "", fmt.Errorf("error retrieving IP: %v", err)
-	}
-	return strings.TrimSpace(out), nil
-}
-
-// SandboxPid returns the PID to the sandbox process.
-func (d *Docker) SandboxPid() (int, error) {
-	out, err := do("inspect", "-f={{.State.Pid}}", d.Name)
-	if err != nil {
-		return -1, fmt.Errorf("error retrieving pid: %v", err)
-	}
-	pid, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
-	if err != nil {
-		return -1, fmt.Errorf("error parsing pid %q: %v", out, err)
-	}
-	return pid, nil
-}
-
-// ID returns the container ID.
-func (d *Docker) ID() (string, error) {
-	out, err := do("inspect", "-f={{.Id}}", d.Name)
-	if err != nil {
-		return "", fmt.Errorf("error retrieving ID: %v", err)
-	}
-	return strings.TrimSpace(string(out)), nil
-}
-
-// Wait waits for container to exit, up to the given timeout. Returns error if
-// wait fails or timeout is hit. Returns the application return code otherwise.
-// Note that the application may have failed even if err == nil, always check
-// the exit code.
-func (d *Docker) Wait(timeout time.Duration) (syscall.WaitStatus, error) {
-	timeoutChan := time.After(timeout)
-	waitChan := make(chan (syscall.WaitStatus))
-	errChan := make(chan (error))
-
-	go func() {
-		out, err := do("wait", d.Name)
-		if err != nil {
-			errChan <- fmt.Errorf("error waiting for container %q: %v", d.Name, err)
-		}
-		exit, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
-		if err != nil {
-			errChan <- fmt.Errorf("error parsing exit code %q: %v", out, err)
-		}
-		waitChan <- syscall.WaitStatus(uint32(exit))
-	}()
-
-	select {
-	case ws := <-waitChan:
-		return ws, nil
-	case err := <-errChan:
-		return syscall.WaitStatus(1), err
-	case <-timeoutChan:
-		return syscall.WaitStatus(1), fmt.Errorf("timeout waiting for container %q", d.Name)
-	}
-}
-
-// WaitForOutput calls 'docker logs' to retrieve containers output and searches
-// for the given pattern.
-func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) {
-	matches, err := d.WaitForOutputSubmatch(pattern, timeout)
-	if err != nil {
-		return "", err
-	}
-	if len(matches) == 0 {
-		return "", nil
-	}
-	return matches[0], nil
-}
-
-// WaitForOutputSubmatch calls 'docker logs' to retrieve containers output and
-// searches for the given pattern. It returns any regexp submatches as well.
-func (d *Docker) WaitForOutputSubmatch(pattern string, timeout time.Duration) ([]string, error) {
-	re := regexp.MustCompile(pattern)
-	var out string
-	for exp := time.Now().Add(timeout); time.Now().Before(exp); {
-		var err error
-		out, err = d.Logs()
-		if err != nil {
-			return nil, err
-		}
-		if matches := re.FindStringSubmatch(out); matches != nil {
-			// Success!
-			return matches, nil
-		}
-		time.Sleep(100 * time.Millisecond)
-	}
-	return nil, fmt.Errorf("timeout waiting for output %q: %s", re.String(), out)
-}
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
deleted file mode 100644
index 945405303..000000000
--- a/runsc/testutil/BUILD
+++ /dev/null
@@ -1,21 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "testutil",
-    testonly = 1,
-    srcs = [
-        "testutil.go",
-        "testutil_runfiles.go",
-    ],
-    visibility = ["//:sandbox"],
-    deps = [
-        "//pkg/log",
-        "//pkg/sync",
-        "//runsc/boot",
-        "//runsc/specutils",
-        "@com_github_cenkalti_backoff//:go_default_library",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
-    ],
-)
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
deleted file mode 100644
index 5e09f8f16..000000000
--- a/runsc/testutil/testutil.go
+++ /dev/null
@@ -1,433 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package testutil contains utility functions for runsc tests.
-package testutil
-
-import (
-	"bufio"
-	"context"
-	"debug/elf"
-	"encoding/base32"
-	"encoding/json"
-	"flag"
-	"fmt"
-	"io"
-	"io/ioutil"
-	"math"
-	"math/rand"
-	"net/http"
-	"os"
-	"os/exec"
-	"os/signal"
-	"path"
-	"path/filepath"
-	"strconv"
-	"strings"
-	"sync/atomic"
-	"syscall"
-	"testing"
-	"time"
-
-	"github.com/cenkalti/backoff"
-	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/runsc/boot"
-	"gvisor.dev/gvisor/runsc/specutils"
-)
-
-var (
-	checkpoint = flag.Bool("checkpoint", true, "control checkpoint/restore support")
-)
-
-func init() {
-	rand.Seed(time.Now().UnixNano())
-}
-
-// IsCheckpointSupported returns the relevant command line flag.
-func IsCheckpointSupported() bool {
-	return *checkpoint
-}
-
-// TmpDir returns the absolute path to a writable directory that can be used as
-// scratch by the test.
-func TmpDir() string {
-	dir := os.Getenv("TEST_TMPDIR")
-	if dir == "" {
-		dir = "/tmp"
-	}
-	return dir
-}
-
-// ConfigureExePath configures the executable for runsc in the test environment.
-func ConfigureExePath() error {
-	path, err := FindFile("runsc/runsc")
-	if err != nil {
-		return err
-	}
-	specutils.ExePath = path
-	return nil
-}
-
-// TestConfig returns the default configuration to use in tests. Note that
-// 'RootDir' must be set by caller if required.
-func TestConfig(t *testing.T) *boot.Config {
-	logDir := ""
-	if dir, ok := os.LookupEnv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
-		logDir = dir + "/"
-	}
-	return &boot.Config{
-		Debug:              true,
-		DebugLog:           path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%"),
-		LogFormat:          "text",
-		DebugLogFormat:     "text",
-		LogPackets:         true,
-		Network:            boot.NetworkNone,
-		Strace:             true,
-		Platform:           "ptrace",
-		FileAccess:         boot.FileAccessExclusive,
-		NumNetworkChannels: 1,
-
-		TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
-	}
-}
-
-// NewSpecWithArgs creates a simple spec with the given args suitable for use
-// in tests.
-func NewSpecWithArgs(args ...string) *specs.Spec {
-	return &specs.Spec{
-		// The host filesystem root is the container root.
-		Root: &specs.Root{
-			Path:     "/",
-			Readonly: true,
-		},
-		Process: &specs.Process{
-			Args: args,
-			Env: []string{
-				"PATH=" + os.Getenv("PATH"),
-			},
-			Capabilities: specutils.AllCapabilities(),
-		},
-		Mounts: []specs.Mount{
-			// Hide the host /etc to avoid any side-effects.
-			// For example, bash reads /etc/passwd and if it is
-			// very big, tests can fail by timeout.
-			{
-				Type:        "tmpfs",
-				Destination: "/etc",
-			},
-			// Root is readonly, but many tests want to write to tmpdir.
-			// This creates a writable mount inside the root. Also, when tmpdir points
-			// to "/tmp", it makes the the actual /tmp to be mounted and not a tmpfs
-			// inside the sentry.
-			{
-				Type:        "bind",
-				Destination: TmpDir(),
-				Source:      TmpDir(),
-			},
-		},
-		Hostname: "runsc-test-hostname",
-	}
-}
-
-// SetupRootDir creates a root directory for containers.
-func SetupRootDir() (string, error) {
-	rootDir, err := ioutil.TempDir(TmpDir(), "containers")
-	if err != nil {
-		return "", fmt.Errorf("error creating root dir: %v", err)
-	}
-	return rootDir, nil
-}
-
-// SetupContainer creates a bundle and root dir for the container, generates a
-// test config, and writes the spec to config.json in the bundle dir.
-func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, err error) {
-	rootDir, err = SetupRootDir()
-	if err != nil {
-		return "", "", err
-	}
-	conf.RootDir = rootDir
-	bundleDir, err = SetupBundleDir(spec)
-	return rootDir, bundleDir, err
-}
-
-// SetupBundleDir creates a bundle dir and writes the spec to config.json.
-func SetupBundleDir(spec *specs.Spec) (bundleDir string, err error) {
-	bundleDir, err = ioutil.TempDir(TmpDir(), "bundle")
-	if err != nil {
-		return "", fmt.Errorf("error creating bundle dir: %v", err)
-	}
-
-	if err = writeSpec(bundleDir, spec); err != nil {
-		return "", fmt.Errorf("error writing spec: %v", err)
-	}
-	return bundleDir, nil
-}
-
-// writeSpec writes the spec to disk in the given directory.
-func writeSpec(dir string, spec *specs.Spec) error {
-	b, err := json.Marshal(spec)
-	if err != nil {
-		return err
-	}
-	return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755)
-}
-
-// UniqueContainerID generates a unique container id for each test.
-//
-// The container id is used to create an abstract unix domain socket, which must
-// be unique.  While the container forbids creating two containers with the same
-// name, sometimes between test runs the socket does not get cleaned up quickly
-// enough, causing container creation to fail.
-func UniqueContainerID() string {
-	// Read 20 random bytes.
-	b := make([]byte, 20)
-	// "[Read] always returns len(p) and a nil error." --godoc
-	if _, err := rand.Read(b); err != nil {
-		panic("rand.Read failed: " + err.Error())
-	}
-	// base32 encode the random bytes, so that the name is a valid
-	// container id and can be used as a socket name in the filesystem.
-	return fmt.Sprintf("test-container-%s", base32.StdEncoding.EncodeToString(b))
-}
-
-// Copy copies file from src to dst.
-func Copy(src, dst string) error {
-	in, err := os.Open(src)
-	if err != nil {
-		return err
-	}
-	defer in.Close()
-
-	out, err := os.Create(dst)
-	if err != nil {
-		return err
-	}
-	defer out.Close()
-
-	_, err = io.Copy(out, in)
-	return err
-}
-
-// Poll is a shorthand function to poll for something with given timeout.
-func Poll(cb func() error, timeout time.Duration) error {
-	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	defer cancel()
-	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
-	return backoff.Retry(cb, b)
-}
-
-// WaitForHTTP tries GET requests on a port until the call succeeds or timeout.
-func WaitForHTTP(port int, timeout time.Duration) error {
-	cb := func() error {
-		c := &http.Client{
-			// Calculate timeout to be able to do minimum 5 attempts.
-			Timeout: timeout / 5,
-		}
-		url := fmt.Sprintf("http://localhost:%d/", port)
-		resp, err := c.Get(url)
-		if err != nil {
-			log.Infof("Waiting %s: %v", url, err)
-			return err
-		}
-		resp.Body.Close()
-		return nil
-	}
-	return Poll(cb, timeout)
-}
-
-// Reaper reaps child processes.
-type Reaper struct {
-	// mu protects ch, which will be nil if the reaper is not running.
-	mu sync.Mutex
-	ch chan os.Signal
-}
-
-// Start starts reaping child processes.
-func (r *Reaper) Start() {
-	r.mu.Lock()
-	defer r.mu.Unlock()
-
-	if r.ch != nil {
-		panic("reaper.Start called on a running reaper")
-	}
-
-	r.ch = make(chan os.Signal, 1)
-	signal.Notify(r.ch, syscall.SIGCHLD)
-
-	go func() {
-		for {
-			r.mu.Lock()
-			ch := r.ch
-			r.mu.Unlock()
-			if ch == nil {
-				return
-			}
-
-			_, ok := <-ch
-			if !ok {
-				// Channel closed.
-				return
-			}
-			for {
-				cpid, _ := syscall.Wait4(-1, nil, syscall.WNOHANG, nil)
-				if cpid < 1 {
-					break
-				}
-			}
-		}
-	}()
-}
-
-// Stop stops reaping child processes.
-func (r *Reaper) Stop() {
-	r.mu.Lock()
-	defer r.mu.Unlock()
-
-	if r.ch == nil {
-		panic("reaper.Stop called on a stopped reaper")
-	}
-
-	signal.Stop(r.ch)
-	close(r.ch)
-	r.ch = nil
-}
-
-// StartReaper is a helper that starts a new Reaper and returns a function to
-// stop it.
-func StartReaper() func() {
-	r := &Reaper{}
-	r.Start()
-	return r.Stop
-}
-
-// WaitUntilRead reads from the given reader until the wanted string is found
-// or until timeout.
-func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error {
-	sc := bufio.NewScanner(r)
-	if split != nil {
-		sc.Split(split)
-	}
-	// done must be accessed atomically. A value greater than 0 indicates
-	// that the read loop can exit.
-	var done uint32
-	doneCh := make(chan struct{})
-	go func() {
-		for sc.Scan() {
-			t := sc.Text()
-			if strings.Contains(t, want) {
-				atomic.StoreUint32(&done, 1)
-				close(doneCh)
-				break
-			}
-			if atomic.LoadUint32(&done) > 0 {
-				break
-			}
-		}
-	}()
-	select {
-	case <-time.After(timeout):
-		atomic.StoreUint32(&done, 1)
-		return fmt.Errorf("timeout waiting to read %q", want)
-	case <-doneCh:
-		return nil
-	}
-}
-
-// KillCommand kills the process running cmd unless it hasn't been started. It
-// returns an error if it cannot kill the process unless the reason is that the
-// process has already exited.
-func KillCommand(cmd *exec.Cmd) error {
-	if cmd.Process == nil {
-		return nil
-	}
-	if err := cmd.Process.Kill(); err != nil {
-		if !strings.Contains(err.Error(), "process already finished") {
-			return fmt.Errorf("failed to kill process %v: %v", cmd, err)
-		}
-	}
-	return nil
-}
-
-// WriteTmpFile writes text to a temporary file, closes the file, and returns
-// the name of the file.
-func WriteTmpFile(pattern, text string) (string, error) {
-	file, err := ioutil.TempFile(TmpDir(), pattern)
-	if err != nil {
-		return "", err
-	}
-	defer file.Close()
-	if _, err := file.Write([]byte(text)); err != nil {
-		return "", err
-	}
-	return file.Name(), nil
-}
-
-// RandomName create a name with a 6 digit random number appended to it.
-func RandomName(prefix string) string {
-	return fmt.Sprintf("%s-%06d", prefix, rand.Int31n(1000000))
-}
-
-// IsStatic returns true iff the given file is a static binary.
-func IsStatic(filename string) (bool, error) {
-	f, err := elf.Open(filename)
-	if err != nil {
-		return false, err
-	}
-	for _, prog := range f.Progs {
-		if prog.Type == elf.PT_INTERP {
-			return false, nil // Has interpreter.
-		}
-	}
-	return true, nil
-}
-
-// TestIndicesForShard returns indices for this test shard based on the
-// TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars.
-//
-// If either of the env vars are not present, then the function will return all
-// tests. If there are more shards than there are tests, then the returned list
-// may be empty.
-func TestIndicesForShard(numTests int) ([]int, error) {
-	var (
-		shardIndex = 0
-		shardTotal = 1
-	)
-
-	indexStr, totalStr := os.Getenv("TEST_SHARD_INDEX"), os.Getenv("TEST_TOTAL_SHARDS")
-	if indexStr != "" && totalStr != "" {
-		// Parse index and total to ints.
-		var err error
-		shardIndex, err = strconv.Atoi(indexStr)
-		if err != nil {
-			return nil, fmt.Errorf("invalid TEST_SHARD_INDEX %q: %v", indexStr, err)
-		}
-		shardTotal, err = strconv.Atoi(totalStr)
-		if err != nil {
-			return nil, fmt.Errorf("invalid TEST_TOTAL_SHARDS %q: %v", totalStr, err)
-		}
-	}
-
-	// Calculate!
-	var indices []int
-	numBlocks := int(math.Ceil(float64(numTests) / float64(shardTotal)))
-	for i := 0; i < numBlocks; i++ {
-		pick := i*shardTotal + shardIndex
-		if pick < numTests {
-			indices = append(indices, pick)
-		}
-	}
-	return indices, nil
-}
diff --git a/runsc/testutil/testutil_runfiles.go b/runsc/testutil/testutil_runfiles.go
deleted file mode 100644
index ece9ea9a1..000000000
--- a/runsc/testutil/testutil_runfiles.go
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testutil
-
-import (
-	"fmt"
-	"os"
-	"path/filepath"
-)
-
-// FindFile searchs for a file inside the test run environment. It returns the
-// full path to the file. It fails if none or more than one file is found.
-func FindFile(path string) (string, error) {
-	wd, err := os.Getwd()
-	if err != nil {
-		return "", err
-	}
-
-	// The test root is demarcated by a path element called "__main__". Search for
-	// it backwards from the working directory.
-	root := wd
-	for {
-		dir, name := filepath.Split(root)
-		if name == "__main__" {
-			break
-		}
-		if len(dir) == 0 {
-			return "", fmt.Errorf("directory __main__ not found in %q", wd)
-		}
-		// Remove ending slash to loop around.
-		root = dir[:len(dir)-1]
-	}
-
-	// Annoyingly, bazel adds the build type to the directory path for go
-	// binaries, but not for c++ binaries. We use two different patterns to
-	// to find our file.
-	patterns := []string{
-		// Try the obvious path first.
-		filepath.Join(root, path),
-		// If it was a go binary, use a wildcard to match the build
-		// type. The pattern is: /test-path/__main__/directories/*/file.
-		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
-	}
-
-	for _, p := range patterns {
-		matches, err := filepath.Glob(p)
-		if err != nil {
-			// "The only possible returned error is ErrBadPattern,
-			// when pattern is malformed." -godoc
-			return "", fmt.Errorf("error globbing %q: %v", p, err)
-		}
-		switch len(matches) {
-		case 0:
-			// Try the next pattern.
-		case 1:
-			// We found it.
-			return matches[0], nil
-		default:
-			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
-		}
-	}
-	return "", fmt.Errorf("file %q not found", path)
-}
diff --git a/scripts/iptables_tests.sh b/scripts/iptables_tests.sh
index 0f46909ac..c8da1f32d 100755
--- a/scripts/iptables_tests.sh
+++ b/scripts/iptables_tests.sh
@@ -17,14 +17,5 @@
 source $(dirname $0)/common.sh
 
 install_runsc_for_test iptables --net-raw
-
-# Build the docker image for the test.
-run //test/iptables/runner:runner-image --norun
-
-test //test/iptables:iptables_test \
-  "--test_arg=--runtime=runc" \
-  "--test_arg=--image=bazel/test/iptables/runner:runner-image"
-
-test //test/iptables:iptables_test \
-  "--test_arg=--runtime=${RUNTIME}" \
-  "--test_arg=--image=bazel/test/iptables/runner:runner-image"
+test //test/iptables:iptables_test --test_arg=--runtime=runc
+test //test/iptables:iptables_test --test_arg=--runtime=${RUNTIME}
diff --git a/test/cmd/test_app/BUILD b/test/cmd/test_app/BUILD
new file mode 100644
index 000000000..98ba5a3d9
--- /dev/null
+++ b/test/cmd/test_app/BUILD
@@ -0,0 +1,21 @@
+load("//tools:defs.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "test_app",
+    testonly = 1,
+    srcs = [
+        "fds.go",
+        "test_app.go",
+    ],
+    pure = True,
+    visibility = ["//runsc/container:__pkg__"],
+    deps = [
+        "//pkg/test/testutil",
+        "//pkg/unet",
+        "//runsc/flag",
+        "@com_github_google_subcommands//:go_default_library",
+        "@com_github_kr_pty//:go_default_library",
+    ],
+)
diff --git a/test/cmd/test_app/fds.go b/test/cmd/test_app/fds.go
new file mode 100644
index 000000000..a7658eefd
--- /dev/null
+++ b/test/cmd/test_app/fds.go
@@ -0,0 +1,185 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"context"
+	"io/ioutil"
+	"log"
+	"os"
+	"time"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+const fileContents = "foobarbaz"
+
+// fdSender will open a file and send the FD over a unix domain socket.
+type fdSender struct {
+	socketPath string
+}
+
+// Name implements subcommands.Command.Name.
+func (*fdSender) Name() string {
+	return "fd_sender"
+}
+
+// Synopsis implements subcommands.Command.Synopsys.
+func (*fdSender) Synopsis() string {
+	return "creates a file and sends the FD over the socket"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*fdSender) Usage() string {
+	return "fd_sender <flags>"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (fds *fdSender) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&fds.socketPath, "socket", "", "path to socket")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (fds *fdSender) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if fds.socketPath == "" {
+		log.Fatalf("socket flag must be set")
+	}
+
+	dir, err := ioutil.TempDir("", "")
+	if err != nil {
+		log.Fatalf("TempDir failed: %v", err)
+	}
+
+	fileToSend, err := ioutil.TempFile(dir, "")
+	if err != nil {
+		log.Fatalf("TempFile failed: %v", err)
+	}
+	defer fileToSend.Close()
+
+	if _, err := fileToSend.WriteString(fileContents); err != nil {
+		log.Fatalf("Write(%q) failed: %v", fileContents, err)
+	}
+
+	// Receiver may not be started yet, so try connecting in a poll loop.
+	var s *unet.Socket
+	if err := testutil.Poll(func() error {
+		var err error
+		s, err = unet.Connect(fds.socketPath, true /* SEQPACKET, so we can send empty message with FD */)
+		return err
+	}, 10*time.Second); err != nil {
+		log.Fatalf("Error connecting to socket %q: %v", fds.socketPath, err)
+	}
+	defer s.Close()
+
+	w := s.Writer(true)
+	w.ControlMessage.PackFDs(int(fileToSend.Fd()))
+	if _, err := w.WriteVec([][]byte{[]byte{'a'}}); err != nil {
+		log.Fatalf("Error sending FD %q over socket %q: %v", fileToSend.Fd(), fds.socketPath, err)
+	}
+
+	log.Print("FD SENDER exiting successfully")
+	return subcommands.ExitSuccess
+}
+
+// fdReceiver receives an FD from a unix domain socket and does things to it.
+type fdReceiver struct {
+	socketPath string
+}
+
+// Name implements subcommands.Command.Name.
+func (*fdReceiver) Name() string {
+	return "fd_receiver"
+}
+
+// Synopsis implements subcommands.Command.Synopsys.
+func (*fdReceiver) Synopsis() string {
+	return "reads an FD from a unix socket, and then does things to it"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*fdReceiver) Usage() string {
+	return "fd_receiver <flags>"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (fdr *fdReceiver) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&fdr.socketPath, "socket", "", "path to socket")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (fdr *fdReceiver) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if fdr.socketPath == "" {
+		log.Fatalf("Flags cannot be empty, given: socket: %q", fdr.socketPath)
+	}
+
+	ss, err := unet.BindAndListen(fdr.socketPath, true /* packet */)
+	if err != nil {
+		log.Fatalf("BindAndListen(%q) failed: %v", fdr.socketPath, err)
+	}
+	defer ss.Close()
+
+	var s *unet.Socket
+	c := make(chan error, 1)
+	go func() {
+		var err error
+		s, err = ss.Accept()
+		c <- err
+	}()
+
+	select {
+	case err := <-c:
+		if err != nil {
+			log.Fatalf("Accept() failed: %v", err)
+		}
+	case <-time.After(10 * time.Second):
+		log.Fatalf("Timeout waiting for accept")
+	}
+
+	r := s.Reader(true)
+	r.EnableFDs(1)
+	b := [][]byte{{'a'}}
+	if n, err := r.ReadVec(b); n != 1 || err != nil {
+		log.Fatalf("ReadVec got n=%d err %v (wanted 0, nil)", n, err)
+	}
+
+	fds, err := r.ExtractFDs()
+	if err != nil {
+		log.Fatalf("ExtractFD() got err %v", err)
+	}
+	if len(fds) != 1 {
+		log.Fatalf("ExtractFD() got %d FDs, wanted 1", len(fds))
+	}
+	fd := fds[0]
+
+	file := os.NewFile(uintptr(fd), "received file")
+	defer file.Close()
+	if _, err := file.Seek(0, os.SEEK_SET); err != nil {
+		log.Fatalf("Seek(0, 0) failed: %v", err)
+	}
+
+	got, err := ioutil.ReadAll(file)
+	if err != nil {
+		log.Fatalf("ReadAll failed: %v", err)
+	}
+	if string(got) != fileContents {
+		log.Fatalf("ReadAll got %q want %q", string(got), fileContents)
+	}
+
+	log.Print("FD RECEIVER exiting successfully")
+	return subcommands.ExitSuccess
+}
diff --git a/test/cmd/test_app/test_app.go b/test/cmd/test_app/test_app.go
new file mode 100644
index 000000000..3ba4f38f8
--- /dev/null
+++ b/test/cmd/test_app/test_app.go
@@ -0,0 +1,394 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary test_app is like a swiss knife for tests that need to run anything
+// inside the sandbox. New functionality can be added with new commands.
+package main
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net"
+	"os"
+	"os/exec"
+	"regexp"
+	"strconv"
+	sys "syscall"
+	"time"
+
+	"github.com/google/subcommands"
+	"github.com/kr/pty"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+func main() {
+	subcommands.Register(subcommands.HelpCommand(), "")
+	subcommands.Register(subcommands.FlagsCommand(), "")
+	subcommands.Register(new(capability), "")
+	subcommands.Register(new(fdReceiver), "")
+	subcommands.Register(new(fdSender), "")
+	subcommands.Register(new(forkBomb), "")
+	subcommands.Register(new(ptyRunner), "")
+	subcommands.Register(new(reaper), "")
+	subcommands.Register(new(syscall), "")
+	subcommands.Register(new(taskTree), "")
+	subcommands.Register(new(uds), "")
+
+	flag.Parse()
+
+	exitCode := subcommands.Execute(context.Background())
+	os.Exit(int(exitCode))
+}
+
+type uds struct {
+	fileName   string
+	socketPath string
+}
+
+// Name implements subcommands.Command.Name.
+func (*uds) Name() string {
+	return "uds"
+}
+
+// Synopsis implements subcommands.Command.Synopsys.
+func (*uds) Synopsis() string {
+	return "creates unix domain socket client and server. Client sends a contant flow of sequential numbers. Server prints them to --file"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*uds) Usage() string {
+	return "uds <flags>"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (c *uds) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&c.fileName, "file", "", "name of output file")
+	f.StringVar(&c.socketPath, "socket", "", "path to socket")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (c *uds) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if c.fileName == "" || c.socketPath == "" {
+		log.Fatalf("Flags cannot be empty, given: fileName: %q, socketPath: %q", c.fileName, c.socketPath)
+		return subcommands.ExitFailure
+	}
+	outputFile, err := os.OpenFile(c.fileName, os.O_WRONLY|os.O_CREATE, 0666)
+	if err != nil {
+		log.Fatal("error opening output file:", err)
+	}
+
+	defer os.Remove(c.socketPath)
+
+	listener, err := net.Listen("unix", c.socketPath)
+	if err != nil {
+		log.Fatalf("error listening on socket %q: %v", c.socketPath, err)
+	}
+
+	go server(listener, outputFile)
+	for i := 0; ; i++ {
+		conn, err := net.Dial("unix", c.socketPath)
+		if err != nil {
+			log.Fatal("error dialing:", err)
+		}
+		if _, err := conn.Write([]byte(strconv.Itoa(i))); err != nil {
+			log.Fatal("error writing:", err)
+		}
+		conn.Close()
+		time.Sleep(100 * time.Millisecond)
+	}
+}
+
+func server(listener net.Listener, out *os.File) {
+	buf := make([]byte, 16)
+
+	for {
+		c, err := listener.Accept()
+		if err != nil {
+			log.Fatal("error accepting connection:", err)
+		}
+		nr, err := c.Read(buf)
+		if err != nil {
+			log.Fatal("error reading from buf:", err)
+		}
+		data := buf[0:nr]
+		fmt.Fprint(out, string(data)+"\n")
+	}
+}
+
+type taskTree struct {
+	depth int
+	width int
+	pause bool
+}
+
+// Name implements subcommands.Command.
+func (*taskTree) Name() string {
+	return "task-tree"
+}
+
+// Synopsis implements subcommands.Command.
+func (*taskTree) Synopsis() string {
+	return "creates a tree of tasks"
+}
+
+// Usage implements subcommands.Command.
+func (*taskTree) Usage() string {
+	return "task-tree <flags>"
+}
+
+// SetFlags implements subcommands.Command.
+func (c *taskTree) SetFlags(f *flag.FlagSet) {
+	f.IntVar(&c.depth, "depth", 1, "number of levels to create")
+	f.IntVar(&c.width, "width", 1, "number of tasks at each level")
+	f.BoolVar(&c.pause, "pause", false, "whether the tasks should pause perpetually")
+}
+
+// Execute implements subcommands.Command.
+func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	stop := testutil.StartReaper()
+	defer stop()
+
+	if c.depth == 0 {
+		log.Printf("Child sleeping, PID: %d\n", os.Getpid())
+		select {}
+	}
+	log.Printf("Parent %d sleeping, PID: %d\n", c.depth, os.Getpid())
+
+	var cmds []*exec.Cmd
+	for i := 0; i < c.width; i++ {
+		cmd := exec.Command(
+			"/proc/self/exe", c.Name(),
+			"--depth", strconv.Itoa(c.depth-1),
+			"--width", strconv.Itoa(c.width),
+			"--pause", strconv.FormatBool(c.pause))
+		cmd.Stdout = os.Stdout
+		cmd.Stderr = os.Stderr
+
+		if err := cmd.Start(); err != nil {
+			log.Fatal("failed to call self:", err)
+		}
+		cmds = append(cmds, cmd)
+	}
+
+	for _, c := range cmds {
+		c.Wait()
+	}
+
+	if c.pause {
+		select {}
+	}
+
+	return subcommands.ExitSuccess
+}
+
+type forkBomb struct {
+	delay time.Duration
+}
+
+// Name implements subcommands.Command.
+func (*forkBomb) Name() string {
+	return "fork-bomb"
+}
+
+// Synopsis implements subcommands.Command.
+func (*forkBomb) Synopsis() string {
+	return "creates child process until the end of times"
+}
+
+// Usage implements subcommands.Command.
+func (*forkBomb) Usage() string {
+	return "fork-bomb <flags>"
+}
+
+// SetFlags implements subcommands.Command.
+func (c *forkBomb) SetFlags(f *flag.FlagSet) {
+	f.DurationVar(&c.delay, "delay", 100*time.Millisecond, "amount of time to delay creation of child")
+}
+
+// Execute implements subcommands.Command.
+func (c *forkBomb) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	time.Sleep(c.delay)
+
+	cmd := exec.Command("/proc/self/exe", c.Name())
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		log.Fatal("failed to call self:", err)
+	}
+	return subcommands.ExitSuccess
+}
+
+type reaper struct{}
+
+// Name implements subcommands.Command.
+func (*reaper) Name() string {
+	return "reaper"
+}
+
+// Synopsis implements subcommands.Command.
+func (*reaper) Synopsis() string {
+	return "reaps all children in a loop"
+}
+
+// Usage implements subcommands.Command.
+func (*reaper) Usage() string {
+	return "reaper <flags>"
+}
+
+// SetFlags implements subcommands.Command.
+func (*reaper) SetFlags(*flag.FlagSet) {}
+
+// Execute implements subcommands.Command.
+func (c *reaper) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	stop := testutil.StartReaper()
+	defer stop()
+	select {}
+}
+
+type syscall struct {
+	sysno uint64
+}
+
+// Name implements subcommands.Command.
+func (*syscall) Name() string {
+	return "syscall"
+}
+
+// Synopsis implements subcommands.Command.
+func (*syscall) Synopsis() string {
+	return "syscall makes a syscall"
+}
+
+// Usage implements subcommands.Command.
+func (*syscall) Usage() string {
+	return "syscall <flags>"
+}
+
+// SetFlags implements subcommands.Command.
+func (s *syscall) SetFlags(f *flag.FlagSet) {
+	f.Uint64Var(&s.sysno, "syscall", 0, "syscall to call")
+}
+
+// Execute implements subcommands.Command.
+func (s *syscall) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if _, _, errno := sys.Syscall(uintptr(s.sysno), 0, 0, 0); errno != 0 {
+		fmt.Printf("syscall(%d, 0, 0...) failed: %v\n", s.sysno, errno)
+	} else {
+		fmt.Printf("syscall(%d, 0, 0...) success\n", s.sysno)
+	}
+	return subcommands.ExitSuccess
+}
+
+type capability struct {
+	enabled  uint64
+	disabled uint64
+}
+
+// Name implements subcommands.Command.
+func (*capability) Name() string {
+	return "capability"
+}
+
+// Synopsis implements subcommands.Command.
+func (*capability) Synopsis() string {
+	return "checks if effective capabilities are set/unset"
+}
+
+// Usage implements subcommands.Command.
+func (*capability) Usage() string {
+	return "capability [--enabled=number] [--disabled=number]"
+}
+
+// SetFlags implements subcommands.Command.
+func (c *capability) SetFlags(f *flag.FlagSet) {
+	f.Uint64Var(&c.enabled, "enabled", 0, "")
+	f.Uint64Var(&c.disabled, "disabled", 0, "")
+}
+
+// Execute implements subcommands.Command.
+func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if c.enabled == 0 && c.disabled == 0 {
+		fmt.Println("One of the flags must be set")
+		return subcommands.ExitUsageError
+	}
+
+	status, err := ioutil.ReadFile("/proc/self/status")
+	if err != nil {
+		fmt.Printf("Error reading %q: %v\n", "proc/self/status", err)
+		return subcommands.ExitFailure
+	}
+	re := regexp.MustCompile("CapEff:\t([0-9a-f]+)\n")
+	matches := re.FindStringSubmatch(string(status))
+	if matches == nil || len(matches) != 2 {
+		fmt.Printf("Effective capabilities not found in\n%s\n", status)
+		return subcommands.ExitFailure
+	}
+	caps, err := strconv.ParseUint(matches[1], 16, 64)
+	if err != nil {
+		fmt.Printf("failed to convert capabilities %q: %v\n", matches[1], err)
+		return subcommands.ExitFailure
+	}
+
+	if c.enabled != 0 && (caps&c.enabled) != c.enabled {
+		fmt.Printf("Missing capabilities, want: %#x: got: %#x\n", c.enabled, caps)
+		return subcommands.ExitFailure
+	}
+	if c.disabled != 0 && (caps&c.disabled) != 0 {
+		fmt.Printf("Extra capabilities found, dont_want: %#x: got: %#x\n", c.disabled, caps)
+		return subcommands.ExitFailure
+	}
+
+	return subcommands.ExitSuccess
+}
+
+type ptyRunner struct{}
+
+// Name implements subcommands.Command.
+func (*ptyRunner) Name() string {
+	return "pty-runner"
+}
+
+// Synopsis implements subcommands.Command.
+func (*ptyRunner) Synopsis() string {
+	return "runs the given command with an open pty terminal"
+}
+
+// Usage implements subcommands.Command.
+func (*ptyRunner) Usage() string {
+	return "pty-runner [command]"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (*ptyRunner) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.
+func (*ptyRunner) Execute(_ context.Context, fs *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus {
+	c := exec.Command(fs.Args()[0], fs.Args()[1:]...)
+	f, err := pty.Start(c)
+	if err != nil {
+		fmt.Printf("pty.Start failed: %v", err)
+		return subcommands.ExitFailure
+	}
+	defer f.Close()
+
+	// Copy stdout from the command to keep this process alive until the
+	// subprocess exits.
+	io.Copy(os.Stdout, f)
+
+	return subcommands.ExitSuccess
+}
diff --git a/test/e2e/BUILD b/test/e2e/BUILD
index 76e04f878..44cce0e3b 100644
--- a/test/e2e/BUILD
+++ b/test/e2e/BUILD
@@ -20,9 +20,9 @@ go_test(
     deps = [
         "//pkg/abi/linux",
         "//pkg/bits",
-        "//runsc/dockerutil",
+        "//pkg/test/dockerutil",
+        "//pkg/test/testutil",
         "//runsc/specutils",
-        "//runsc/testutil",
     ],
 )
 
diff --git a/test/e2e/exec_test.go b/test/e2e/exec_test.go
index 594c8e752..6a63b1232 100644
--- a/test/e2e/exec_test.go
+++ b/test/e2e/exec_test.go
@@ -23,6 +23,8 @@ package integration
 
 import (
 	"fmt"
+	"os"
+	"os/exec"
 	"strconv"
 	"strings"
 	"syscall"
@@ -31,23 +33,23 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bits"
-	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
 // Test that exec uses the exact same capability set as the container.
 func TestExecCapabilities(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("exec-capabilities-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container.
-	if err := d.Run("alpine", "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
+	// Check that capability.
 	matches, err := d.WaitForOutputSubmatch("CapEff:\t([0-9a-f]+)\n", 5*time.Second)
 	if err != nil {
 		t.Fatalf("WaitForOutputSubmatch() timeout: %v", err)
@@ -59,7 +61,7 @@ func TestExecCapabilities(t *testing.T) {
 	t.Log("Root capabilities:", want)
 
 	// Now check that exec'd process capabilities match the root.
-	got, err := d.Exec("grep", "CapEff:", "/proc/self/status")
+	got, err := d.Exec(dockerutil.RunOpts{}, "grep", "CapEff:", "/proc/self/status")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
@@ -72,16 +74,16 @@ func TestExecCapabilities(t *testing.T) {
 // Test that 'exec --privileged' adds all capabilities, except for CAP_NET_RAW
 // which is removed from the container when --net-raw=false.
 func TestExecPrivileged(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("exec-privileged-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container with all capabilities dropped.
-	if err := d.Run("--cap-drop=all", "alpine", "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image:   "basic/alpine",
+		CapDrop: []string{"all"},
+	}, "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Check that all capabilities where dropped from container.
 	matches, err := d.WaitForOutputSubmatch("CapEff:\t([0-9a-f]+)\n", 5*time.Second)
@@ -100,9 +102,11 @@ func TestExecPrivileged(t *testing.T) {
 		t.Fatalf("Container should have no capabilities: %x", containerCaps)
 	}
 
-	// Check that 'exec --privileged' adds all capabilities, except
-	// for CAP_NET_RAW.
-	got, err := d.ExecWithFlags([]string{"--privileged"}, "grep", "CapEff:", "/proc/self/status")
+	// Check that 'exec --privileged' adds all capabilities, except for
+	// CAP_NET_RAW.
+	got, err := d.Exec(dockerutil.RunOpts{
+		Privileged: true,
+	}, "grep", "CapEff:", "/proc/self/status")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
@@ -114,97 +118,99 @@ func TestExecPrivileged(t *testing.T) {
 }
 
 func TestExecJobControl(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("exec-job-control-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container.
-	if err := d.Run("alpine", "sleep", "1000"); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "sleep", "1000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Exec 'sh' with an attached pty.
-	cmd, ptmx, err := d.ExecWithTerminal("sh")
-	if err != nil {
+	if _, err := d.Exec(dockerutil.RunOpts{
+		Pty: func(cmd *exec.Cmd, ptmx *os.File) {
+			// Call "sleep 100 | cat" in the shell. We pipe to cat
+			// so that there will be two processes in the
+			// foreground process group.
+			if _, err := ptmx.Write([]byte("sleep 100 | cat\n")); err != nil {
+				t.Fatalf("error writing to pty: %v", err)
+			}
+
+			// Give shell a few seconds to start executing the sleep.
+			time.Sleep(2 * time.Second)
+
+			// Send a ^C to the pty, which should kill sleep and
+			// cat, but not the shell.  \x03 is ASCII "end of
+			// text", which is the same as ^C.
+			if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
+				t.Fatalf("error writing to pty: %v", err)
+			}
+
+			// The shell should still be alive at this point. Sleep
+			// should have exited with code 2+128=130. We'll exit
+			// with 10 plus that number, so that we can be sure
+			// that the shell did not get signalled.
+			if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
+				t.Fatalf("error writing to pty: %v", err)
+			}
+
+			// Exec process should exit with code 10+130=140.
+			ps, err := cmd.Process.Wait()
+			if err != nil {
+				t.Fatalf("error waiting for exec process: %v", err)
+			}
+			ws := ps.Sys().(syscall.WaitStatus)
+			if !ws.Exited() {
+				t.Errorf("ws.Exited got false, want true")
+			}
+			if got, want := ws.ExitStatus(), 140; got != want {
+				t.Errorf("ws.ExitedStatus got %d, want %d", got, want)
+			}
+		},
+	}, "sh"); err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
-	defer ptmx.Close()
-
-	// Call "sleep 100 | cat" in the shell.  We pipe to cat so that there
-	// will be two processes in the foreground process group.
-	if _, err := ptmx.Write([]byte("sleep 100 | cat\n")); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// Give shell a few seconds to start executing the sleep.
-	time.Sleep(2 * time.Second)
-
-	// Send a ^C to the pty, which should kill sleep and cat, but not the
-	// shell.  \x03 is ASCII "end of text", which is the same as ^C.
-	if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// The shell should still be alive at this point. Sleep should have
-	// exited with code 2+128=130. We'll exit with 10 plus that number, so
-	// that we can be sure that the shell did not get signalled.
-	if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// Exec process should exit with code 10+130=140.
-	ps, err := cmd.Process.Wait()
-	if err != nil {
-		t.Fatalf("error waiting for exec process: %v", err)
-	}
-	ws := ps.Sys().(syscall.WaitStatus)
-	if !ws.Exited() {
-		t.Errorf("ws.Exited got false, want true")
-	}
-	if got, want := ws.ExitStatus(), 140; got != want {
-		t.Errorf("ws.ExitedStatus got %d, want %d", got, want)
-	}
 }
 
 // Test that failure to exec returns proper error message.
 func TestExecError(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("exec-error-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container.
-	if err := d.Run("alpine", "sleep", "1000"); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "sleep", "1000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
-	_, err := d.Exec("no_can_find")
+	// Attempt to exec a binary that doesn't exist.
+	out, err := d.Exec(dockerutil.RunOpts{}, "no_can_find")
 	if err == nil {
 		t.Fatalf("docker exec didn't fail")
 	}
-	if want := `error finding executable "no_can_find" in PATH`; !strings.Contains(err.Error(), want) {
-		t.Fatalf("docker exec wrong error, got: %s, want: .*%s.*", err.Error(), want)
+	if want := `error finding executable "no_can_find" in PATH`; !strings.Contains(out, want) {
+		t.Fatalf("docker exec wrong error, got: %s, want: .*%s.*", out, want)
 	}
 }
 
 // Test that exec inherits environment from run.
 func TestExecEnv(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("exec-env-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container with env FOO=BAR.
-	if err := d.Run("-e", "FOO=BAR", "alpine", "sleep", "1000"); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+		Env:   []string{"FOO=BAR"},
+	}, "sleep", "1000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Exec "echo $FOO".
-	got, err := d.Exec("/bin/sh", "-c", "echo $FOO")
+	got, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", "echo $FOO")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
@@ -216,17 +222,19 @@ func TestExecEnv(t *testing.T) {
 // TestRunEnvHasHome tests that run always has HOME environment set.
 func TestRunEnvHasHome(t *testing.T) {
 	// Base alpine image does not have any environment variables set.
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("run-env-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Exec "echo $HOME". The 'bin' user's home dir is '/bin'.
-	got, err := d.RunFg("--user", "bin", "alpine", "/bin/sh", "-c", "echo $HOME")
+	got, err := d.Run(dockerutil.RunOpts{
+		Image: "basic/alpine",
+		User:  "bin",
+	}, "/bin/sh", "-c", "echo $HOME")
 	if err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
+
+	// Check that the directory matches.
 	if got, want := strings.TrimSpace(got), "/bin"; got != want {
 		t.Errorf("bad output from 'docker run'. Got %q; Want %q.", got, want)
 	}
@@ -235,18 +243,17 @@ func TestRunEnvHasHome(t *testing.T) {
 // Test that exec always has HOME environment set, even when not set in run.
 func TestExecEnvHasHome(t *testing.T) {
 	// Base alpine image does not have any environment variables set.
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("exec-env-home-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
-	if err := d.Run("alpine", "sleep", "1000"); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "sleep", "1000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Exec "echo $HOME", and expect to see "/root".
-	got, err := d.Exec("/bin/sh", "-c", "echo $HOME")
+	got, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", "echo $HOME")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
@@ -258,12 +265,14 @@ func TestExecEnvHasHome(t *testing.T) {
 	newUID := 1234
 	newHome := "/foo/bar"
 	cmd := fmt.Sprintf("mkdir -p -m 777 %q && adduser foo -D -u %d -h %q", newHome, newUID, newHome)
-	if _, err := d.Exec("/bin/sh", "-c", cmd); err != nil {
+	if _, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", cmd); err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
 
 	// Execute the same as the new user and expect newHome.
-	got, err = d.ExecAsUser(strconv.Itoa(newUID), "/bin/sh", "-c", "echo $HOME")
+	got, err = d.Exec(dockerutil.RunOpts{
+		User: strconv.Itoa(newUID),
+	}, "/bin/sh", "-c", "echo $HOME")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go
index cc4fbbaed..404e37689 100644
--- a/test/e2e/integration_test.go
+++ b/test/e2e/integration_test.go
@@ -27,14 +27,15 @@ import (
 	"net"
 	"net/http"
 	"os"
+	"os/exec"
 	"strconv"
 	"strings"
 	"syscall"
 	"testing"
 	"time"
 
-	"gvisor.dev/gvisor/runsc/dockerutil"
-	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 // httpRequestSucceeds sends a request to a given url and checks that the status is OK.
@@ -53,65 +54,66 @@ func httpRequestSucceeds(client http.Client, server string, port int) error {
 
 // TestLifeCycle tests a basic Create/Start/Stop docker container life cycle.
 func TestLifeCycle(t *testing.T) {
-	if err := dockerutil.Pull("nginx"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("lifecycle-test")
-	if err := d.Create("-p", "80", "nginx"); err != nil {
-		t.Fatal("docker create failed:", err)
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	// Start the container.
+	if err := d.Create(dockerutil.RunOpts{
+		Image: "basic/nginx",
+		Ports: []int{80},
+	}); err != nil {
+		t.Fatalf("docker create failed: %v", err)
 	}
 	if err := d.Start(); err != nil {
-		d.CleanUp()
-		t.Fatal("docker start failed:", err)
+		t.Fatalf("docker start failed: %v", err)
 	}
 
-	// Test that container is working
+	// Test that container is working.
 	port, err := d.FindPort(80)
 	if err != nil {
-		t.Fatal("docker.FindPort(80) failed: ", err)
+		t.Fatalf("docker.FindPort(80) failed: %v", err)
 	}
 	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatal("WaitForHTTP() timeout:", err)
+		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 	client := http.Client{Timeout: time.Duration(2 * time.Second)}
 	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
-		t.Error("http request failed:", err)
+		t.Errorf("http request failed: %v", err)
 	}
 
 	if err := d.Stop(); err != nil {
-		d.CleanUp()
-		t.Fatal("docker stop failed:", err)
+		t.Fatalf("docker stop failed: %v", err)
 	}
 	if err := d.Remove(); err != nil {
-		t.Fatal("docker rm failed:", err)
+		t.Fatalf("docker rm failed: %v", err)
 	}
 }
 
 func TestPauseResume(t *testing.T) {
-	const img = "gcr.io/gvisor-presubmit/python-hello"
 	if !testutil.IsCheckpointSupported() {
-		t.Log("Checkpoint is not supported, skipping test.")
-		return
+		t.Skip("Checkpoint is not supported.")
 	}
 
-	if err := dockerutil.Pull(img); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("pause-resume-test")
-	if err := d.Run("-p", "8080", img); err != nil {
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	// Start the container.
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/python",
+		Ports: []int{8080}, // See Dockerfile.
+	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Find where port 8080 is mapped to.
 	port, err := d.FindPort(8080)
 	if err != nil {
-		t.Fatal("docker.FindPort(8080) failed:", err)
+		t.Fatalf("docker.FindPort(8080) failed: %v", err)
 	}
 
 	// Wait until it's up and running.
 	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatal("WaitForHTTP() timeout:", err)
+		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 
 	// Check that container is working.
@@ -121,7 +123,7 @@ func TestPauseResume(t *testing.T) {
 	}
 
 	if err := d.Pause(); err != nil {
-		t.Fatal("docker pause failed:", err)
+		t.Fatalf("docker pause failed: %v", err)
 	}
 
 	// Check if container is paused.
@@ -137,12 +139,12 @@ func TestPauseResume(t *testing.T) {
 	}
 
 	if err := d.Unpause(); err != nil {
-		t.Fatal("docker unpause failed:", err)
+		t.Fatalf("docker unpause failed: %v", err)
 	}
 
 	// Wait until it's up and running.
 	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatal("WaitForHTTP() timeout:", err)
+		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 
 	// Check if container is working again.
@@ -152,43 +154,43 @@ func TestPauseResume(t *testing.T) {
 }
 
 func TestCheckpointRestore(t *testing.T) {
-	const img = "gcr.io/gvisor-presubmit/python-hello"
 	if !testutil.IsCheckpointSupported() {
-		t.Log("Pause/resume is not supported, skipping test.")
-		return
+		t.Skip("Pause/resume is not supported.")
 	}
 
-	if err := dockerutil.Pull(img); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("save-restore-test")
-	if err := d.Run("-p", "8080", img); err != nil {
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	// Start the container.
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/python",
+		Ports: []int{8080}, // See Dockerfile.
+	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
+	// Create a snapshot.
 	if err := d.Checkpoint("test"); err != nil {
-		t.Fatal("docker checkpoint failed:", err)
+		t.Fatalf("docker checkpoint failed: %v", err)
 	}
-
 	if _, err := d.Wait(30 * time.Second); err != nil {
-		t.Fatal(err)
+		t.Fatalf("wait failed: %v", err)
 	}
 
 	// TODO(b/143498576): Remove Poll after github.com/moby/moby/issues/38963 is fixed.
 	if err := testutil.Poll(func() error { return d.Restore("test") }, 15*time.Second); err != nil {
-		t.Fatal("docker restore failed:", err)
+		t.Fatalf("docker restore failed: %v", err)
 	}
 
 	// Find where port 8080 is mapped to.
 	port, err := d.FindPort(8080)
 	if err != nil {
-		t.Fatal("docker.FindPort(8080) failed:", err)
+		t.Fatalf("docker.FindPort(8080) failed: %v", err)
 	}
 
 	// Wait until it's up and running.
 	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatal("WaitForHTTP() timeout:", err)
+		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 
 	// Check if container is working again.
@@ -200,26 +202,28 @@ func TestCheckpointRestore(t *testing.T) {
 
 // Create client and server that talk to each other using the local IP.
 func TestConnectToSelf(t *testing.T) {
-	d := dockerutil.MakeDocker("connect-to-self-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Creates server that replies "server" and exists. Sleeps at the end because
 	// 'docker exec' gets killed if the init process exists before it can finish.
-	if err := d.Run("ubuntu:trusty", "/bin/sh", "-c", "echo server | nc -l -p 8080 && sleep 1"); err != nil {
-		t.Fatal("docker run failed:", err)
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/ubuntu",
+	}, "/bin/sh", "-c", "echo server | nc -l -p 8080 && sleep 1"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Finds IP address for host.
-	ip, err := d.Exec("/bin/sh", "-c", "cat /etc/hosts | grep ${HOSTNAME} | awk '{print $1}'")
+	ip, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", "cat /etc/hosts | grep ${HOSTNAME} | awk '{print $1}'")
 	if err != nil {
-		t.Fatal("docker exec failed:", err)
+		t.Fatalf("docker exec failed: %v", err)
 	}
 	ip = strings.TrimRight(ip, "\n")
 
 	// Runs client that sends "client" to the server and exits.
-	reply, err := d.Exec("/bin/sh", "-c", fmt.Sprintf("echo client | nc %s 8080", ip))
+	reply, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", fmt.Sprintf("echo client | nc %s 8080", ip))
 	if err != nil {
-		t.Fatal("docker exec failed:", err)
+		t.Fatalf("docker exec failed: %v", err)
 	}
 
 	// Ensure both client and server got the message from each other.
@@ -227,21 +231,22 @@ func TestConnectToSelf(t *testing.T) {
 		t.Errorf("Error on server, want: %q, got: %q", want, reply)
 	}
 	if _, err := d.WaitForOutput("^client\n$", 1*time.Second); err != nil {
-		t.Fatal("docker.WaitForOutput(client) timeout:", err)
+		t.Fatalf("docker.WaitForOutput(client) timeout: %v", err)
 	}
 }
 
 func TestMemLimit(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("cgroup-test")
-	cmd := "cat /proc/meminfo | grep MemTotal: | awk '{print $2}'"
-	out, err := d.RunFg("--memory=500MB", "alpine", "sh", "-c", cmd)
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	allocMemory := 500 * 1024
+	out, err := d.Run(dockerutil.RunOpts{
+		Image:  "basic/alpine",
+		Memory: allocMemory, // In kB.
+	}, "sh", "-c", "cat /proc/meminfo | grep MemTotal: | awk '{print $2}'")
 	if err != nil {
-		t.Fatal("docker run failed:", err)
+		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Remove warning message that swap isn't present.
 	if strings.HasPrefix(out, "WARNING") {
@@ -252,27 +257,30 @@ func TestMemLimit(t *testing.T) {
 		out = lines[1]
 	}
 
+	// Ensure the memory matches what we want.
 	got, err := strconv.ParseUint(strings.TrimSpace(out), 10, 64)
 	if err != nil {
 		t.Fatalf("failed to parse %q: %v", out, err)
 	}
-	if want := uint64(500 * 1024); got != want {
+	if want := uint64(allocMemory); got != want {
 		t.Errorf("MemTotal got: %d, want: %d", got, want)
 	}
 }
 
 func TestNumCPU(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("cgroup-test")
-	cmd := "cat /proc/cpuinfo | grep 'processor.*:' | wc -l"
-	out, err := d.RunFg("--cpuset-cpus=0", "alpine", "sh", "-c", cmd)
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	// Read how many cores are in the container.
+	out, err := d.Run(dockerutil.RunOpts{
+		Image: "basic/alpine",
+		Extra: []string{"--cpuset-cpus=0"},
+	}, "sh", "-c", "cat /proc/cpuinfo | grep 'processor.*:' | wc -l")
 	if err != nil {
-		t.Fatal("docker run failed:", err)
+		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
+	// Ensure it matches what we want.
 	got, err := strconv.Atoi(strings.TrimSpace(out))
 	if err != nil {
 		t.Fatalf("failed to parse %q: %v", out, err)
@@ -284,39 +292,39 @@ func TestNumCPU(t *testing.T) {
 
 // TestJobControl tests that job control characters are handled properly.
 func TestJobControl(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("job-control-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container with an attached PTY.
-	_, ptmx, err := d.RunWithPty("alpine", "sh")
-	if err != nil {
+	if _, err := d.Run(dockerutil.RunOpts{
+		Image: "basic/alpine",
+		Pty: func(_ *exec.Cmd, ptmx *os.File) {
+			// Call "sleep 100" in the shell.
+			if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil {
+				t.Fatalf("error writing to pty: %v", err)
+			}
+
+			// Give shell a few seconds to start executing the sleep.
+			time.Sleep(2 * time.Second)
+
+			// Send a ^C to the pty, which should kill sleep, but
+			// not the shell.  \x03 is ASCII "end of text", which
+			// is the same as ^C.
+			if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
+				t.Fatalf("error writing to pty: %v", err)
+			}
+
+			// The shell should still be alive at this point. Sleep
+			// should have exited with code 2+128=130. We'll exit
+			// with 10 plus that number, so that we can be sure
+			// that the shell did not get signalled.
+			if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
+				t.Fatalf("error writing to pty: %v", err)
+			}
+		},
+	}, "sh"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer ptmx.Close()
-	defer d.CleanUp()
-
-	// Call "sleep 100" in the shell.
-	if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// Give shell a few seconds to start executing the sleep.
-	time.Sleep(2 * time.Second)
-
-	// Send a ^C to the pty, which should kill sleep, but not the shell.
-	// \x03 is ASCII "end of text", which is the same as ^C.
-	if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// The shell should still be alive at this point. Sleep should have
-	// exited with code 2+128=130. We'll exit with 10 plus that number, so
-	// that we can be sure that the shell did not get signalled.
-	if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
 
 	// Wait for the container to exit.
 	got, err := d.Wait(5 * time.Second)
@@ -332,14 +340,25 @@ func TestJobControl(t *testing.T) {
 // TestTmpFile checks that files inside '/tmp' are not overridden. In addition,
 // it checks that working dir is created if it doesn't exit.
 func TestTmpFile(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	// Should work without ReadOnly
+	if _, err := d.Run(dockerutil.RunOpts{
+		Image:   "basic/alpine",
+		WorkDir: "/tmp/foo/bar",
+	}, "touch", "/tmp/foo/bar/file"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
 	}
-	d := dockerutil.MakeDocker("tmp-file-test")
-	if err := d.Run("-w=/tmp/foo/bar", "--read-only", "alpine", "touch", "/tmp/foo/bar/file"); err != nil {
-		t.Fatal("docker run failed:", err)
+
+	// Expect failure.
+	if _, err := d.Run(dockerutil.RunOpts{
+		Image:    "basic/alpine",
+		WorkDir:  "/tmp/foo/bar",
+		ReadOnly: true,
+	}, "touch", "/tmp/foo/bar/file"); err == nil {
+		t.Fatalf("docker run expected failure, but succeeded")
 	}
-	defer d.CleanUp()
 }
 
 func TestMain(m *testing.M) {
diff --git a/test/e2e/regression_test.go b/test/e2e/regression_test.go
index 2488be383..327a2174c 100644
--- a/test/e2e/regression_test.go
+++ b/test/e2e/regression_test.go
@@ -18,7 +18,7 @@ import (
 	"strings"
 	"testing"
 
-	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
 )
 
 // Test that UDS can be created using overlay when parent directory is in lower
@@ -27,19 +27,19 @@ import (
 // Prerequisite: the directory where the socket file is created must not have
 // been open for write before bind(2) is called.
 func TestBindOverlay(t *testing.T) {
-	if err := dockerutil.Pull("ubuntu:trusty"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("bind-overlay-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
-	cmd := "nc -l -U /var/run/sock & p=$! && sleep 1 && echo foobar-asdf | nc -U /var/run/sock && wait $p"
-	got, err := d.RunFg("ubuntu:trusty", "bash", "-c", cmd)
+	// Run the container.
+	got, err := d.Run(dockerutil.RunOpts{
+		Image: "basic/ubuntu",
+	}, "bash", "-c", "nc -l -U /var/run/sock & p=$! && sleep 1 && echo foobar-asdf | nc -U /var/run/sock && wait $p")
 	if err != nil {
-		t.Fatal("docker run failed:", err)
+		t.Fatalf("docker run failed: %v", err)
 	}
 
+	// Check the output contains what we want.
 	if want := "foobar-asdf"; !strings.Contains(got, want) {
 		t.Fatalf("docker run output is missing %q: %s", want, got)
 	}
-	defer d.CleanUp()
 }
diff --git a/test/image/BUILD b/test/image/BUILD
index 7392ac54e..e749e47d4 100644
--- a/test/image/BUILD
+++ b/test/image/BUILD
@@ -22,8 +22,8 @@ go_test(
     ],
     visibility = ["//:sandbox"],
     deps = [
-        "//runsc/dockerutil",
-        "//runsc/testutil",
+        "//pkg/test/dockerutil",
+        "//pkg/test/testutil",
     ],
 )
 
diff --git a/test/image/image_test.go b/test/image/image_test.go
index 0a1e19d6f..2e3543109 100644
--- a/test/image/image_test.go
+++ b/test/image/image_test.go
@@ -28,24 +28,29 @@ import (
 	"log"
 	"net/http"
 	"os"
-	"path/filepath"
 	"strings"
 	"testing"
 	"time"
 
-	"gvisor.dev/gvisor/runsc/dockerutil"
-	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 func TestHelloWorld(t *testing.T) {
-	d := dockerutil.MakeDocker("hello-test")
-	if err := d.Run("hello-world"); err != nil {
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	// Run the basic container.
+	out, err := d.Run(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "echo", "Hello world!")
+	if err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
-	if _, err := d.WaitForOutput("Hello from Docker!", 5*time.Second); err != nil {
-		t.Fatalf("docker didn't say hello: %v", err)
+	// Check the output.
+	if !strings.Contains(out, "Hello world!") {
+		t.Fatalf("docker didn't say hello: got %s", out)
 	}
 }
 
@@ -102,27 +107,22 @@ func testHTTPServer(t *testing.T, port int) {
 }
 
 func TestHttpd(t *testing.T) {
-	if err := dockerutil.Pull("httpd"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("http-test")
-
-	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
-	if err != nil {
-		t.Fatalf("PrepareFiles() failed: %v", err)
-	}
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container.
-	mountArg := dockerutil.MountArg(dir, "/usr/local/apache2/htdocs", dockerutil.ReadOnly)
-	if err := d.Run("-p", "80", mountArg, "httpd"); err != nil {
+	d.CopyFiles("/usr/local/apache2/htdocs", "test/image/latin10k.txt")
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/httpd",
+		Ports: []int{80},
+	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Find where port 80 is mapped to.
 	port, err := d.FindPort(80)
 	if err != nil {
-		t.Fatalf("docker.FindPort(80) failed: %v", err)
+		t.Fatalf("FindPort(80) failed: %v", err)
 	}
 
 	// Wait until it's up and running.
@@ -134,27 +134,22 @@ func TestHttpd(t *testing.T) {
 }
 
 func TestNginx(t *testing.T) {
-	if err := dockerutil.Pull("nginx"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("net-test")
-
-	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
-	if err != nil {
-		t.Fatalf("PrepareFiles() failed: %v", err)
-	}
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container.
-	mountArg := dockerutil.MountArg(dir, "/usr/share/nginx/html", dockerutil.ReadOnly)
-	if err := d.Run("-p", "80", mountArg, "nginx"); err != nil {
+	d.CopyFiles("/usr/share/nginx/html", "test/image/latin10k.txt")
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/nginx",
+		Ports: []int{80},
+	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Find where port 80 is mapped to.
 	port, err := d.FindPort(80)
 	if err != nil {
-		t.Fatalf("docker.FindPort(80) failed: %v", err)
+		t.Fatalf("FindPort(80) failed: %v", err)
 	}
 
 	// Wait until it's up and running.
@@ -166,99 +161,58 @@ func TestNginx(t *testing.T) {
 }
 
 func TestMysql(t *testing.T) {
-	if err := dockerutil.Pull("mysql"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("mysql-test")
+	server := dockerutil.MakeDocker(t)
+	defer server.CleanUp()
 
 	// Start the container.
-	if err := d.Run("-e", "MYSQL_ROOT_PASSWORD=foobar123", "mysql"); err != nil {
+	if err := server.Spawn(dockerutil.RunOpts{
+		Image: "basic/mysql",
+		Env:   []string{"MYSQL_ROOT_PASSWORD=foobar123"},
+	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Wait until it's up and running.
-	if _, err := d.WaitForOutput("port: 3306  MySQL Community Server", 3*time.Minute); err != nil {
-		t.Fatalf("docker.WaitForOutput() timeout: %v", err)
+	if _, err := server.WaitForOutput("port: 3306  MySQL Community Server", 3*time.Minute); err != nil {
+		t.Fatalf("WaitForOutput() timeout: %v", err)
 	}
 
-	client := dockerutil.MakeDocker("mysql-client-test")
-	dir, err := dockerutil.PrepareFiles("test/image/mysql.sql")
-	if err != nil {
-		t.Fatalf("PrepareFiles() failed: %v", err)
-	}
+	// Generate the client and copy in the SQL payload.
+	client := dockerutil.MakeDocker(t)
+	defer client.CleanUp()
 
-	// Tell mysql client to connect to the server and execute the file in verbose
-	// mode to verify the output.
-	args := []string{
-		dockerutil.LinkArg(&d, "mysql"),
-		dockerutil.MountArg(dir, "/sql", dockerutil.ReadWrite),
-		"mysql",
-		"mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql",
-	}
-	if err := client.Run(args...); err != nil {
+	// Tell mysql client to connect to the server and execute the file in
+	// verbose mode to verify the output.
+	client.CopyFiles("/sql", "test/image/mysql.sql")
+	client.Link("mysql", server)
+	if _, err := client.Run(dockerutil.RunOpts{
+		Image: "basic/mysql",
+	}, "mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer client.CleanUp()
 
 	// Ensure file executed to the end and shutdown mysql.
-	if _, err := client.WaitForOutput("--------------\nshutdown\n--------------", 15*time.Second); err != nil {
-		t.Fatalf("docker.WaitForOutput() timeout: %v", err)
-	}
-	if _, err := d.WaitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil {
-		t.Fatalf("docker.WaitForOutput() timeout: %v", err)
+	if _, err := server.WaitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil {
+		t.Fatalf("WaitForOutput() timeout: %v", err)
 	}
 }
 
-func TestPythonHello(t *testing.T) {
-	// TODO(b/136503277): Once we have more complete python runtime tests,
-	// we can drop this one.
-	const img = "gcr.io/gvisor-presubmit/python-hello"
-	if err := dockerutil.Pull(img); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("python-hello-test")
-	if err := d.Run("-p", "8080", img); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
+func TestTomcat(t *testing.T) {
+	d := dockerutil.MakeDocker(t)
 	defer d.CleanUp()
 
-	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(8080)
-	if err != nil {
-		t.Fatalf("docker.FindPort(8080) failed: %v", err)
-	}
-
-	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatalf("WaitForHTTP() timeout: %v", err)
-	}
-
-	// Ensure that content is being served.
-	url := fmt.Sprintf("http://localhost:%d", port)
-	resp, err := http.Get(url)
-	if err != nil {
-		t.Errorf("Error reaching http server: %v", err)
-	}
-	if want := http.StatusOK; resp.StatusCode != want {
-		t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want)
-	}
-}
-
-func TestTomcat(t *testing.T) {
-	if err := dockerutil.Pull("tomcat:8.0"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("tomcat-test")
-	if err := d.Run("-p", "8080", "tomcat:8.0"); err != nil {
+	// Start the server.
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/tomcat",
+		Ports: []int{8080},
+	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Find where port 8080 is mapped to.
 	port, err := d.FindPort(8080)
 	if err != nil {
-		t.Fatalf("docker.FindPort(8080) failed: %v", err)
+		t.Fatalf("FindPort(8080) failed: %v", err)
 	}
 
 	// Wait until it's up and running.
@@ -278,28 +232,22 @@ func TestTomcat(t *testing.T) {
 }
 
 func TestRuby(t *testing.T) {
-	if err := dockerutil.Pull("ruby"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("ruby-test")
-
-	dir, err := dockerutil.PrepareFiles("test/image/ruby.rb", "test/image/ruby.sh")
-	if err != nil {
-		t.Fatalf("PrepareFiles() failed: %v", err)
-	}
-	if err := os.Chmod(filepath.Join(dir, "ruby.sh"), 0333); err != nil {
-		t.Fatalf("os.Chmod(%q, 0333) failed: %v", dir, err)
-	}
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
-	if err := d.Run("-p", "8080", dockerutil.MountArg(dir, "/src", dockerutil.ReadOnly), "ruby", "/src/ruby.sh"); err != nil {
+	// Execute the ruby workload.
+	d.CopyFiles("/src", "test/image/ruby.rb", "test/image/ruby.sh")
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/ruby",
+		Ports: []int{8080},
+	}, "/src/ruby.sh"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Find where port 8080 is mapped to.
 	port, err := d.FindPort(8080)
 	if err != nil {
-		t.Fatalf("docker.FindPort(8080) failed: %v", err)
+		t.Fatalf("FindPort(8080) failed: %v", err)
 	}
 
 	// Wait until it's up and running, 'gem install' can take some time.
@@ -326,18 +274,17 @@ func TestRuby(t *testing.T) {
 }
 
 func TestStdio(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("stdio-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	wantStdout := "hello stdout"
 	wantStderr := "bonjour stderr"
 	cmd := fmt.Sprintf("echo %q; echo %q 1>&2;", wantStdout, wantStderr)
-	if err := d.Run("alpine", "/bin/sh", "-c", cmd); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "/bin/sh", "-c", cmd); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	for _, want := range []string{wantStdout, wantStderr} {
 		if _, err := d.WaitForOutput(want, 5*time.Second); err != nil {
diff --git a/test/image/ruby.sh b/test/image/ruby.sh
old mode 100644
new mode 100755
diff --git a/test/iptables/BUILD b/test/iptables/BUILD
index 6bb3b82b5..3e29ca90d 100644
--- a/test/iptables/BUILD
+++ b/test/iptables/BUILD
@@ -14,7 +14,7 @@ go_library(
     ],
     visibility = ["//test/iptables:__subpackages__"],
     deps = [
-        "//runsc/testutil",
+        "//pkg/test/testutil",
     ],
 )
 
@@ -23,14 +23,14 @@ go_test(
     srcs = [
         "iptables_test.go",
     ],
+    data = ["//test/iptables/runner"],
     library = ":iptables",
     tags = [
         "local",
         "manual",
     ],
     deps = [
-        "//pkg/log",
-        "//runsc/dockerutil",
-        "//runsc/testutil",
+        "//pkg/test/dockerutil",
+        "//pkg/test/testutil",
     ],
 )
diff --git a/test/iptables/README.md b/test/iptables/README.md
index cc8a2fcac..b9f44bd40 100644
--- a/test/iptables/README.md
+++ b/test/iptables/README.md
@@ -38,7 +38,7 @@ Build the testing Docker container. Re-run this when you modify the test code in
 this directory:
 
 ```bash
-$ bazel run //test/iptables/runner:runner-image -- --norun
+$ make load-iptables
 ```
 
 Run an individual test via:
diff --git a/test/iptables/iptables.go b/test/iptables/iptables.go
index 2e565d988..16cb4f4da 100644
--- a/test/iptables/iptables.go
+++ b/test/iptables/iptables.go
@@ -18,12 +18,19 @@ package iptables
 import (
 	"fmt"
 	"net"
+	"time"
 )
 
 // IPExchangePort is the port the container listens on to receive the IP
 // address of the local process.
 const IPExchangePort = 2349
 
+// TerminalStatement is the last statement in the test runner.
+const TerminalStatement = "Finished!"
+
+// TestTimeout is the timeout used for all tests.
+const TestTimeout = 10 * time.Minute
+
 // A TestCase contains one action to run in the container and one to run
 // locally. The actions run concurrently and each must succeed for the test
 // pass.
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 493d69052..334d8e676 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -15,28 +15,14 @@
 package iptables
 
 import (
-	"flag"
 	"fmt"
 	"net"
-	"os"
-	"path"
 	"testing"
-	"time"
 
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/dockerutil"
-	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
-const timeout = 18 * time.Second
-
-var image = flag.String("image", "bazel/test/iptables/runner:runner-image", "image to run tests in")
-
-type result struct {
-	output string
-	err    error
-}
-
 // singleTest runs a TestCase. Each test follows a pattern:
 // - Create a container.
 // - Get the container's IP.
@@ -46,77 +32,45 @@ type result struct {
 //
 // Container output is logged to $TEST_UNDECLARED_OUTPUTS_DIR if it exists, or
 // to stderr.
-func singleTest(test TestCase) error {
+func singleTest(t *testing.T, test TestCase) {
 	if _, ok := Tests[test.Name()]; !ok {
-		return fmt.Errorf("no test found with name %q. Has it been registered?", test.Name())
+		t.Fatalf("no test found with name %q. Has it been registered?", test.Name())
 	}
 
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
 	// Create and start the container.
-	cont := dockerutil.MakeDocker("gvisor-iptables")
-	defer cont.CleanUp()
-	resultChan := make(chan *result)
-	go func() {
-		output, err := cont.RunFg("--cap-add=NET_ADMIN", *image, "-name", test.Name())
-		logContainer(output, err)
-		resultChan <- &result{output, err}
-	}()
+	d.CopyFiles("/runner", "test/iptables/runner/runner")
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image:  "iptables",
+		CapAdd: []string{"NET_ADMIN"},
+	}, "/runner/runner", "-name", test.Name()); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
 
 	// Get the container IP.
-	ip, err := getIP(cont)
+	ip, err := d.FindIP()
 	if err != nil {
-		return fmt.Errorf("failed to get container IP: %v", err)
+		t.Fatalf("failed to get container IP: %v", err)
 	}
 
 	// Give the container our IP.
 	if err := sendIP(ip); err != nil {
-		return fmt.Errorf("failed to send IP to container: %v", err)
+		t.Fatalf("failed to send IP to container: %v", err)
 	}
 
 	// Run our side of the test.
-	errChan := make(chan error)
-	go func() {
-		errChan <- test.LocalAction(ip)
-	}()
-
-	// Wait for both the container and local tests to finish.
-	var res *result
-	to := time.After(timeout)
-	for localDone := false; res == nil || !localDone; {
-		select {
-		case res = <-resultChan:
-			log.Infof("Container finished.")
-		case err, localDone = <-errChan:
-			log.Infof("Local finished.")
-			if err != nil {
-				return fmt.Errorf("local test failed: %v", err)
-			}
-		case <-to:
-			return fmt.Errorf("timed out after %f seconds", timeout.Seconds())
-		}
+	if err := test.LocalAction(ip); err != nil {
+		t.Fatalf("LocalAction failed: %v", err)
 	}
 
-	return res.err
-}
-
-func getIP(cont dockerutil.Docker) (net.IP, error) {
-	// The container might not have started yet, so retry a few times.
-	var ipStr string
-	to := time.After(timeout)
-	for ipStr == "" {
-		ipStr, _ = cont.FindIP()
-		select {
-		case <-to:
-			return net.IP{}, fmt.Errorf("timed out getting IP after %f seconds", timeout.Seconds())
-		default:
-			time.Sleep(250 * time.Millisecond)
-		}
-	}
-	ip := net.ParseIP(ipStr)
-	if ip == nil {
-		return net.IP{}, fmt.Errorf("invalid IP: %q", ipStr)
+	// Wait for the final statement. This structure has the side effect
+	// that all container logs will appear within the individual test
+	// context.
+	if _, err := d.WaitForOutput(TerminalStatement, TestTimeout); err != nil {
+		t.Fatalf("test failed: %v", err)
 	}
-	log.Infof("Container has IP of %s", ipStr)
-	return ip, nil
 }
 
 func sendIP(ip net.IP) error {
@@ -132,7 +86,7 @@ func sendIP(ip net.IP) error {
 		conn = c
 		return err
 	}
-	if err := testutil.Poll(cb, timeout); err != nil {
+	if err := testutil.Poll(cb, TestTimeout); err != nil {
 		return fmt.Errorf("timed out waiting to send IP, most recent error: %v", err)
 	}
 	if _, err := conn.Write([]byte{0}); err != nil {
@@ -141,281 +95,184 @@ func sendIP(ip net.IP) error {
 	return nil
 }
 
-func logContainer(output string, err error) {
-	msg := fmt.Sprintf("Container error: %v\nContainer output:\n%v", err, output)
-	if artifactsDir := os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); artifactsDir != "" {
-		fpath := path.Join(artifactsDir, "container.log")
-		if file, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE, 0644); err != nil {
-			log.Warningf("Failed to open log file %q: %v", fpath, err)
-		} else {
-			defer file.Close()
-			if _, err := file.Write([]byte(msg)); err == nil {
-				return
-			}
-			log.Warningf("Failed to write to log file %s: %v", fpath, err)
-		}
-	}
-
-	// We couldn't write to the output directory -- just log to stderr.
-	log.Infof(msg)
-}
-
 func TestFilterInputDropUDP(t *testing.T) {
-	if err := singleTest(FilterInputDropUDP{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropUDP{})
 }
 
 func TestFilterInputDropUDPPort(t *testing.T) {
-	if err := singleTest(FilterInputDropUDPPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropUDPPort{})
 }
 
 func TestFilterInputDropDifferentUDPPort(t *testing.T) {
-	if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropDifferentUDPPort{})
 }
 
 func TestFilterInputDropAll(t *testing.T) {
-	if err := singleTest(FilterInputDropAll{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropAll{})
 }
 
 func TestFilterInputDropOnlyUDP(t *testing.T) {
-	if err := singleTest(FilterInputDropOnlyUDP{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropOnlyUDP{})
 }
 
 func TestNATRedirectUDPPort(t *testing.T) {
 	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
-	if err := singleTest(NATRedirectUDPPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, NATRedirectUDPPort{})
 }
 
 func TestNATRedirectTCPPort(t *testing.T) {
 	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
-	if err := singleTest(NATRedirectTCPPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, NATRedirectTCPPort{})
 }
 
 func TestNATDropUDP(t *testing.T) {
 	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
-	if err := singleTest(NATDropUDP{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, NATDropUDP{})
 }
 
 func TestNATAcceptAll(t *testing.T) {
 	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
-	if err := singleTest(NATAcceptAll{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, NATAcceptAll{})
 }
 
 func TestFilterInputDropTCPDestPort(t *testing.T) {
-	if err := singleTest(FilterInputDropTCPDestPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropTCPDestPort{})
 }
 
 func TestFilterInputDropTCPSrcPort(t *testing.T) {
-	if err := singleTest(FilterInputDropTCPSrcPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropTCPSrcPort{})
 }
 
 func TestFilterInputCreateUserChain(t *testing.T) {
-	if err := singleTest(FilterInputCreateUserChain{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputCreateUserChain{})
 }
 
 func TestFilterInputDefaultPolicyAccept(t *testing.T) {
-	if err := singleTest(FilterInputDefaultPolicyAccept{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDefaultPolicyAccept{})
 }
 
 func TestFilterInputDefaultPolicyDrop(t *testing.T) {
-	if err := singleTest(FilterInputDefaultPolicyDrop{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDefaultPolicyDrop{})
 }
 
 func TestFilterInputReturnUnderflow(t *testing.T) {
-	if err := singleTest(FilterInputReturnUnderflow{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputReturnUnderflow{})
 }
 
 func TestFilterOutputDropTCPDestPort(t *testing.T) {
 	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("filter OUTPUT isn't supported yet (gvisor.dev/issue/170).")
-	if err := singleTest(FilterOutputDropTCPDestPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterOutputDropTCPDestPort{})
 }
 
 func TestFilterOutputDropTCPSrcPort(t *testing.T) {
 	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("filter OUTPUT isn't supported yet (gvisor.dev/issue/170).")
-	if err := singleTest(FilterOutputDropTCPSrcPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterOutputDropTCPSrcPort{})
 }
 
 func TestFilterOutputAcceptTCPOwner(t *testing.T) {
-	if err := singleTest(FilterOutputAcceptTCPOwner{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterOutputAcceptTCPOwner{})
 }
 
 func TestFilterOutputDropTCPOwner(t *testing.T) {
-	if err := singleTest(FilterOutputDropTCPOwner{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterOutputDropTCPOwner{})
 }
 
 func TestFilterOutputAcceptUDPOwner(t *testing.T) {
-	if err := singleTest(FilterOutputAcceptUDPOwner{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterOutputAcceptUDPOwner{})
 }
 
 func TestFilterOutputDropUDPOwner(t *testing.T) {
-	if err := singleTest(FilterOutputDropUDPOwner{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterOutputDropUDPOwner{})
 }
 
 func TestFilterOutputOwnerFail(t *testing.T) {
-	if err := singleTest(FilterOutputOwnerFail{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterOutputOwnerFail{})
 }
 
 func TestJumpSerialize(t *testing.T) {
-	if err := singleTest(FilterInputSerializeJump{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputSerializeJump{})
 }
 
 func TestJumpBasic(t *testing.T) {
-	if err := singleTest(FilterInputJumpBasic{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputJumpBasic{})
 }
 
 func TestJumpReturn(t *testing.T) {
-	if err := singleTest(FilterInputJumpReturn{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputJumpReturn{})
 }
 
 func TestJumpReturnDrop(t *testing.T) {
-	if err := singleTest(FilterInputJumpReturnDrop{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputJumpReturnDrop{})
 }
 
 func TestJumpBuiltin(t *testing.T) {
-	if err := singleTest(FilterInputJumpBuiltin{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputJumpBuiltin{})
 }
 
 func TestJumpTwice(t *testing.T) {
-	if err := singleTest(FilterInputJumpTwice{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputJumpTwice{})
 }
 
 func TestInputDestination(t *testing.T) {
-	if err := singleTest(FilterInputDestination{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDestination{})
 }
 
 func TestInputInvertDestination(t *testing.T) {
-	if err := singleTest(FilterInputInvertDestination{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputInvertDestination{})
 }
 
 func TestOutputDestination(t *testing.T) {
-	if err := singleTest(FilterOutputDestination{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterOutputDestination{})
 }
 
 func TestOutputInvertDestination(t *testing.T) {
-	if err := singleTest(FilterOutputInvertDestination{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterOutputInvertDestination{})
 }
 
 func TestNATOutRedirectIP(t *testing.T) {
 	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
-	if err := singleTest(NATOutRedirectIP{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, NATOutRedirectIP{})
 }
 
 func TestNATOutDontRedirectIP(t *testing.T) {
 	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
-	if err := singleTest(NATOutDontRedirectIP{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, NATOutDontRedirectIP{})
 }
 
 func TestNATOutRedirectInvert(t *testing.T) {
 	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
-	if err := singleTest(NATOutRedirectInvert{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, NATOutRedirectInvert{})
 }
 
 func TestNATPreRedirectIP(t *testing.T) {
 	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
-	if err := singleTest(NATPreRedirectIP{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, NATPreRedirectIP{})
 }
 
 func TestNATPreDontRedirectIP(t *testing.T) {
 	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
-	if err := singleTest(NATPreDontRedirectIP{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, NATPreDontRedirectIP{})
 }
 
 func TestNATPreRedirectInvert(t *testing.T) {
 	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
-	if err := singleTest(NATPreRedirectInvert{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, NATPreRedirectInvert{})
 }
 
 func TestNATRedirectRequiresProtocol(t *testing.T) {
 	// TODO(gvisor.dev/issue/170): Enable when supported.
 	t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).")
-	if err := singleTest(NATRedirectRequiresProtocol{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, NATRedirectRequiresProtocol{})
 }
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index 134391e8d..2a00677be 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -20,7 +20,7 @@ import (
 	"os/exec"
 	"time"
 
-	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 const iptablesBinary = "iptables"
diff --git a/test/iptables/runner/BUILD b/test/iptables/runner/BUILD
index b9199387a..24504a1b9 100644
--- a/test/iptables/runner/BUILD
+++ b/test/iptables/runner/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "container_image", "go_binary", "go_image")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
@@ -6,18 +6,7 @@ go_binary(
     name = "runner",
     testonly = 1,
     srcs = ["main.go"],
-    deps = ["//test/iptables"],
-)
-
-container_image(
-    name = "iptables-base",
-    base = "@iptables-test//image",
-)
-
-go_image(
-    name = "runner-image",
-    testonly = 1,
-    srcs = ["main.go"],
-    base = ":iptables-base",
+    pure = True,
+    visibility = ["//test/iptables:__subpackages__"],
     deps = ["//test/iptables"],
 )
diff --git a/test/iptables/runner/main.go b/test/iptables/runner/main.go
index 3c794114e..6f77c0684 100644
--- a/test/iptables/runner/main.go
+++ b/test/iptables/runner/main.go
@@ -46,6 +46,9 @@ func main() {
 	if err := test.ContainerAction(ip); err != nil {
 		log.Fatalf("Failed running test %q: %v", *name, err)
 	}
+
+	// Emit the final line.
+	log.Printf("%s", iptables.TerminalStatement)
 }
 
 // getIP listens for a connection from the local process and returns the source
diff --git a/test/packetdrill/packetdrill_test.sh b/test/packetdrill/packetdrill_test.sh
index c8268170f..922547d65 100755
--- a/test/packetdrill/packetdrill_test.sh
+++ b/test/packetdrill/packetdrill_test.sh
@@ -85,23 +85,26 @@ if [[ ! -x "${INIT_SCRIPT-}" ]]; then
   exit 2
 fi
 
+function new_net_prefix() {
+  # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
+  echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)"
+}
+
 # Variables specific to the control network and interface start with CTRL_.
 # Variables specific to the test network and interface start with TEST_.
 # Variables specific to the DUT start with DUT_.
 # Variables specific to the test runner start with TEST_RUNNER_.
 declare -r PACKETDRILL="/packetdrill/gtests/net/packetdrill/packetdrill"
 # Use random numbers so that test networks don't collide.
-declare -r CTRL_NET="ctrl_net-$(shuf -i 0-99999999 -n 1)"
-declare -r TEST_NET="test_net-$(shuf -i 0-99999999 -n 1)"
+declare CTRL_NET="ctrl_net-$(shuf -i 0-99999999 -n 1)"
+declare CTRL_NET_PREFIX=$(new_net_prefix)
+declare TEST_NET="test_net-$(shuf -i 0-99999999 -n 1)"
+declare TEST_NET_PREFIX=$(new_net_prefix)
 declare -r tolerance_usecs=100000
 # On both DUT and test runner, testing packets are on the eth2 interface.
 declare -r TEST_DEVICE="eth2"
 # Number of bits in the *_NET_PREFIX variables.
 declare -r NET_MASK="24"
-function new_net_prefix() {
-  # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
-  echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)"
-}
 # Last bits of the DUT's IP address.
 declare -r DUT_NET_SUFFIX=".10"
 # Control port.
@@ -137,23 +140,21 @@ function finish {
 trap finish EXIT
 
 # Subnet for control packets between test runner and DUT.
-declare CTRL_NET_PREFIX=$(new_net_prefix)
 while ! docker network create \
   "--subnet=${CTRL_NET_PREFIX}.0/${NET_MASK}" "${CTRL_NET}"; do
   sleep 0.1
-  declare CTRL_NET_PREFIX=$(new_net_prefix)
+  CTRL_NET_PREFIX=$(new_net_prefix)
+  CTRL_NET="ctrl_net-$(shuf -i 0-99999999 -n 1)"
 done
 
 # Subnet for the packets that are part of the test.
-declare TEST_NET_PREFIX=$(new_net_prefix)
 while ! docker network create \
   "--subnet=${TEST_NET_PREFIX}.0/${NET_MASK}" "${TEST_NET}"; do
   sleep 0.1
-  declare TEST_NET_PREFIX=$(new_net_prefix)
+  TEST_NET_PREFIX=$(new_net_prefix)
+  TEST_NET="test_net-$(shuf -i 0-99999999 -n 1)"
 done
 
-docker pull "${IMAGE_TAG}"
-
 # Create the DUT container and connect to network.
 DUT=$(docker create ${RUNTIME_ARG} --privileged --rm \
   --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG})
diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go
index 9335909c0..3f340c6bc 100644
--- a/test/packetimpact/testbench/dut.go
+++ b/test/packetimpact/testbench/dut.go
@@ -132,7 +132,7 @@ func (dut *DUT) CreateBoundSocket(typ, proto int32, addr net.IP) (int32, uint16)
 		copy(sa.Addr[:], addr.To16())
 		dut.Bind(fd, &sa)
 	} else {
-		dut.t.Fatal("unknown ip addr type for remoteIP")
+		dut.t.Fatalf("unknown ip addr type for remoteIP")
 	}
 	sa := dut.GetSockName(fd)
 	var port int
diff --git a/test/packetimpact/tests/test_runner.sh b/test/packetimpact/tests/test_runner.sh
index 2be3c17c3..46d63d5e5 100755
--- a/test/packetimpact/tests/test_runner.sh
+++ b/test/packetimpact/tests/test_runner.sh
@@ -107,21 +107,24 @@ if [[ ! -f "${TESTBENCH_BINARY-}" ]]; then
   exit 2
 fi
 
+function new_net_prefix() {
+  # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
+  echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)"
+}
+
 # Variables specific to the control network and interface start with CTRL_.
 # Variables specific to the test network and interface start with TEST_.
 # Variables specific to the DUT start with DUT_.
 # Variables specific to the test bench start with TESTBENCH_.
 # Use random numbers so that test networks don't collide.
-declare -r CTRL_NET="ctrl_net-${RANDOM}${RANDOM}"
-declare -r TEST_NET="test_net-${RANDOM}${RANDOM}"
+declare CTRL_NET="ctrl_net-${RANDOM}${RANDOM}"
+declare CTRL_NET_PREFIX=$(new_net_prefix)
+declare TEST_NET="test_net-${RANDOM}${RANDOM}"
+declare TEST_NET_PREFIX=$(new_net_prefix)
 # On both DUT and test bench, testing packets are on the eth2 interface.
 declare -r TEST_DEVICE="eth2"
 # Number of bits in the *_NET_PREFIX variables.
 declare -r NET_MASK="24"
-function new_net_prefix() {
-  # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
-  echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)"
-}
 # Last bits of the DUT's IP address.
 declare -r DUT_NET_SUFFIX=".10"
 # Control port.
@@ -130,6 +133,7 @@ declare -r CTRL_PORT="40000"
 declare -r TESTBENCH_NET_SUFFIX=".20"
 declare -r TIMEOUT="60"
 declare -r IMAGE_TAG="gcr.io/gvisor-presubmit/packetimpact"
+
 # Make sure that docker is installed.
 docker --version
 
@@ -169,19 +173,19 @@ function finish {
 trap finish EXIT
 
 # Subnet for control packets between test bench and DUT.
-declare CTRL_NET_PREFIX=$(new_net_prefix)
 while ! docker network create \
   "--subnet=${CTRL_NET_PREFIX}.0/${NET_MASK}" "${CTRL_NET}"; do
   sleep 0.1
-  declare CTRL_NET_PREFIX=$(new_net_prefix)
+  CTRL_NET_PREFIX=$(new_net_prefix)
+  CTRL_NET="ctrl_net-${RANDOM}${RANDOM}"
 done
 
 # Subnet for the packets that are part of the test.
-declare TEST_NET_PREFIX=$(new_net_prefix)
 while ! docker network create \
   "--subnet=${TEST_NET_PREFIX}.0/${NET_MASK}" "${TEST_NET}"; do
   sleep 0.1
-  declare TEST_NET_PREFIX=$(new_net_prefix)
+  TEST_NET_PREFIX=$(new_net_prefix)
+  TEST_NET="test_net-${RANDOM}${RANDOM}"
 done
 
 docker pull "${IMAGE_TAG}"
diff --git a/test/root/BUILD b/test/root/BUILD
index 05166673a..17e51e66e 100644
--- a/test/root/BUILD
+++ b/test/root/BUILD
@@ -33,14 +33,12 @@ go_test(
     ],
     visibility = ["//:sandbox"],
     deps = [
-        "//runsc/boot",
+        "//pkg/test/criutil",
+        "//pkg/test/dockerutil",
+        "//pkg/test/testutil",
         "//runsc/cgroup",
         "//runsc/container",
-        "//runsc/criutil",
-        "//runsc/dockerutil",
         "//runsc/specutils",
-        "//runsc/testutil",
-        "//test/root/testdata",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
         "@com_github_syndtr_gocapability//capability:go_default_library",
diff --git a/test/root/cgroup_test.go b/test/root/cgroup_test.go
index 679342def..8876d0d61 100644
--- a/test/root/cgroup_test.go
+++ b/test/root/cgroup_test.go
@@ -26,9 +26,9 @@ import (
 	"testing"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/cgroup"
-	"gvisor.dev/gvisor/runsc/dockerutil"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 func verifyPid(pid int, path string) error {
@@ -56,54 +56,70 @@ func verifyPid(pid int, path string) error {
 	return fmt.Errorf("got: %v, want: %d", gots, pid)
 }
 
-// TestCgroup sets cgroup options and checks that cgroup was properly configured.
 func TestMemCGroup(t *testing.T) {
-	allocMemSize := 128 << 20
-	if err := dockerutil.Pull("python"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("memusage-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start a new container and allocate the specified about of memory.
-	args := []string{
-		"--memory=256MB",
-		"python",
-		"python",
-		"-c",
-		fmt.Sprintf("import time; s = 'a' * %d; time.sleep(100)", allocMemSize),
-	}
-	if err := d.Run(args...); err != nil {
-		t.Fatal("docker create failed:", err)
+	allocMemSize := 128 << 20
+	allocMemLimit := 2 * allocMemSize
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image:  "basic/python",
+		Memory: allocMemLimit / 1024, // Must be in Kb.
+	}, "python", "-c", fmt.Sprintf("import time; s = 'a' * %d; time.sleep(100)", allocMemSize)); err != nil {
+		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
+	// Extract the ID to lookup the cgroup.
 	gid, err := d.ID()
 	if err != nil {
 		t.Fatalf("Docker.ID() failed: %v", err)
 	}
 	t.Logf("cgroup ID: %s", gid)
 
-	path := filepath.Join("/sys/fs/cgroup/memory/docker", gid, "memory.usage_in_bytes")
-	memUsage := 0
-
 	// Wait when the container will allocate memory.
+	memUsage := 0
 	start := time.Now()
-	for time.Now().Sub(start) < 30*time.Second {
+	for time.Since(start) < 30*time.Second {
+		// Sleep for a brief period of time after spawning the
+		// container (so that Docker can create the cgroup etc.
+		// or after looping below (so the application can start).
+		time.Sleep(100 * time.Millisecond)
+
+		// Read the cgroup memory limit.
+		path := filepath.Join("/sys/fs/cgroup/memory/docker", gid, "memory.limit_in_bytes")
 		outRaw, err := ioutil.ReadFile(path)
 		if err != nil {
-			t.Fatalf("failed to read %q: %v", path, err)
+			// It's possible that the container does not exist yet.
+			continue
 		}
 		out := strings.TrimSpace(string(outRaw))
+		memLimit, err := strconv.Atoi(out)
+		if err != nil {
+			t.Fatalf("Atoi(%v): %v", out, err)
+		}
+		if memLimit != allocMemLimit {
+			// The group may not have had the correct limit set yet.
+			continue
+		}
+
+		// Read the cgroup memory usage.
+		path = filepath.Join("/sys/fs/cgroup/memory/docker", gid, "memory.max_usage_in_bytes")
+		outRaw, err = ioutil.ReadFile(path)
+		if err != nil {
+			t.Fatalf("error reading usage: %v", err)
+		}
+		out = strings.TrimSpace(string(outRaw))
 		memUsage, err = strconv.Atoi(out)
 		if err != nil {
 			t.Fatalf("Atoi(%v): %v", out, err)
 		}
+		t.Logf("read usage: %v, wanted: %v", memUsage, allocMemSize)
 
-		if memUsage > allocMemSize {
+		// Are we done?
+		if memUsage >= allocMemSize {
 			return
 		}
-
-		time.Sleep(100 * time.Millisecond)
 	}
 
 	t.Fatalf("%vMB is less than %vMB", memUsage>>20, allocMemSize>>20)
@@ -111,10 +127,8 @@ func TestMemCGroup(t *testing.T) {
 
 // TestCgroup sets cgroup options and checks that cgroup was properly configured.
 func TestCgroup(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("cgroup-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// This is not a comprehensive list of attributes.
 	//
@@ -179,10 +193,11 @@ func TestCgroup(t *testing.T) {
 			want: "5",
 		},
 		{
-			arg:  "--blkio-weight=750",
-			ctrl: "blkio",
-			file: "blkio.weight",
-			want: "750",
+			arg:            "--blkio-weight=750",
+			ctrl:           "blkio",
+			file:           "blkio.weight",
+			want:           "750",
+			skipIfNotFound: true, // blkio groups may not be available.
 		},
 	}
 
@@ -191,12 +206,15 @@ func TestCgroup(t *testing.T) {
 		args = append(args, attr.arg)
 	}
 
-	args = append(args, "alpine", "sleep", "10000")
-	if err := d.Run(args...); err != nil {
-		t.Fatal("docker create failed:", err)
+	// Start the container.
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+		Extra: args, // Cgroup arguments.
+	}, "sleep", "10000"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
+	// Lookup the relevant cgroup ID.
 	gid, err := d.ID()
 	if err != nil {
 		t.Fatalf("Docker.ID() failed: %v", err)
@@ -245,17 +263,21 @@ func TestCgroup(t *testing.T) {
 	}
 }
 
+// TestCgroup sets cgroup options and checks that cgroup was properly configured.
 func TestCgroupParent(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("cgroup-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
-	parent := testutil.RandomName("runsc")
-	if err := d.Run("--cgroup-parent", parent, "alpine", "sleep", "10000"); err != nil {
-		t.Fatal("docker create failed:", err)
+	// Construct a known cgroup name.
+	parent := testutil.RandomID("runsc-")
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+		Extra: []string{fmt.Sprintf("--cgroup-parent=%s", parent)},
+	}, "sleep", "10000"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
+
+	// Extract the ID to look up the cgroup.
 	gid, err := d.ID()
 	if err != nil {
 		t.Fatalf("Docker.ID() failed: %v", err)
diff --git a/test/root/chroot_test.go b/test/root/chroot_test.go
index be0f63d18..a306132a4 100644
--- a/test/root/chroot_test.go
+++ b/test/root/chroot_test.go
@@ -24,17 +24,20 @@ import (
 	"strings"
 	"testing"
 
-	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
 )
 
 // TestChroot verifies that the sandbox is chroot'd and that mounts are cleaned
 // up after the sandbox is destroyed.
 func TestChroot(t *testing.T) {
-	d := dockerutil.MakeDocker("chroot-test")
-	if err := d.Run("alpine", "sleep", "10000"); err != nil {
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "sleep", "10000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	pid, err := d.SandboxPid()
 	if err != nil {
@@ -76,11 +79,14 @@ func TestChroot(t *testing.T) {
 }
 
 func TestChrootGofer(t *testing.T) {
-	d := dockerutil.MakeDocker("chroot-test")
-	if err := d.Run("alpine", "sleep", "10000"); err != nil {
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "sleep", "10000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// It's tricky to find gofers. Get sandbox PID first, then find parent. From
 	// parent get all immediate children, remove the sandbox, and everything else
diff --git a/test/root/crictl_test.go b/test/root/crictl_test.go
index 3f90c4c6a..85007dcce 100644
--- a/test/root/crictl_test.go
+++ b/test/root/crictl_test.go
@@ -16,6 +16,7 @@ package root
 
 import (
 	"bytes"
+	"encoding/json"
 	"fmt"
 	"io"
 	"io/ioutil"
@@ -29,16 +30,58 @@ import (
 	"testing"
 	"time"
 
-	"gvisor.dev/gvisor/runsc/criutil"
-	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/criutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
-	"gvisor.dev/gvisor/test/root/testdata"
 )
 
 // Tests for crictl have to be run as root (rather than in a user namespace)
 // because crictl creates named network namespaces in /var/run/netns/.
 
+// SimpleSpec returns a JSON config for a simple container that runs the
+// specified command in the specified image.
+func SimpleSpec(name, image string, cmd []string, extra map[string]interface{}) string {
+	s := map[string]interface{}{
+		"metadata": map[string]string{
+			"name": name,
+		},
+		"image": map[string]string{
+			"image": testutil.ImageByName(image),
+		},
+		"log_path": fmt.Sprintf("%s.log", name),
+	}
+	if len(cmd) > 0 { // Omit if empty.
+		s["command"] = cmd
+	}
+	for k, v := range extra {
+		s[k] = v // Extra settings.
+	}
+	v, err := json.Marshal(s)
+	if err != nil {
+		// This shouldn't happen.
+		panic(err)
+	}
+	return string(v)
+}
+
+// Sandbox is a default JSON config for a sandbox.
+var Sandbox = `{
+    "metadata": {
+        "name": "default-sandbox",
+        "namespace": "default",
+        "attempt": 1,
+        "uid": "hdishd83djaidwnduwk28bcsb"
+    },
+    "linux": {
+    },
+    "log_directory": "/tmp"
+}
+`
+
+// Httpd is a JSON config for an httpd container.
+var Httpd = SimpleSpec("httpd", "basic/httpd", nil, nil)
+
 // TestCrictlSanity refers to b/112433158.
 func TestCrictlSanity(t *testing.T) {
 	// Setup containerd and crictl.
@@ -47,9 +90,9 @@ func TestCrictlSanity(t *testing.T) {
 		t.Fatalf("failed to setup crictl: %v", err)
 	}
 	defer cleanup()
-	podID, contID, err := crictl.StartPodAndContainer("httpd", testdata.Sandbox, testdata.Httpd)
+	podID, contID, err := crictl.StartPodAndContainer("basic/httpd", Sandbox, Httpd)
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("start failed: %v", err)
 	}
 
 	// Look for the httpd page.
@@ -59,10 +102,38 @@ func TestCrictlSanity(t *testing.T) {
 
 	// Stop everything.
 	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
-		t.Fatal(err)
+		t.Fatalf("stop failed: %v", err)
 	}
 }
 
+// HttpdMountPaths is a JSON config for an httpd container with additional
+// mounts.
+var HttpdMountPaths = SimpleSpec("httpd", "basic/httpd", nil, map[string]interface{}{
+	"mounts": []map[string]interface{}{
+		map[string]interface{}{
+			"container_path": "/var/run/secrets/kubernetes.io/serviceaccount",
+			"host_path":      "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/volumes/kubernetes.io~secret/default-token-2rpfx",
+			"readonly":       true,
+		},
+		map[string]interface{}{
+			"container_path": "/etc/hosts",
+			"host_path":      "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/etc-hosts",
+			"readonly":       false,
+		},
+		map[string]interface{}{
+			"container_path": "/dev/termination-log",
+			"host_path":      "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/containers/httpd/d1709580",
+			"readonly":       false,
+		},
+		map[string]interface{}{
+			"container_path": "/usr/local/apache2/htdocs/test",
+			"host_path":      "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064",
+			"readonly":       true,
+		},
+	},
+	"linux": map[string]interface{}{},
+})
+
 // TestMountPaths refers to b/117635704.
 func TestMountPaths(t *testing.T) {
 	// Setup containerd and crictl.
@@ -71,9 +142,9 @@ func TestMountPaths(t *testing.T) {
 		t.Fatalf("failed to setup crictl: %v", err)
 	}
 	defer cleanup()
-	podID, contID, err := crictl.StartPodAndContainer("httpd", testdata.Sandbox, testdata.HttpdMountPaths)
+	podID, contID, err := crictl.StartPodAndContainer("basic/httpd", Sandbox, HttpdMountPaths)
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("start failed: %v", err)
 	}
 
 	// Look for the directory available at /test.
@@ -83,7 +154,7 @@ func TestMountPaths(t *testing.T) {
 
 	// Stop everything.
 	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
-		t.Fatal(err)
+		t.Fatalf("stop failed: %v", err)
 	}
 }
 
@@ -95,14 +166,16 @@ func TestMountOverSymlinks(t *testing.T) {
 		t.Fatalf("failed to setup crictl: %v", err)
 	}
 	defer cleanup()
-	podID, contID, err := crictl.StartPodAndContainer("k8s.gcr.io/busybox", testdata.Sandbox, testdata.MountOverSymlink)
+
+	spec := SimpleSpec("busybox", "basic/resolv", []string{"sleep", "1000"}, nil)
+	podID, contID, err := crictl.StartPodAndContainer("basic/resolv", Sandbox, spec)
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("start failed: %v", err)
 	}
 
 	out, err := crictl.Exec(contID, "readlink", "/etc/resolv.conf")
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("readlink failed: %v, out: %s", err, out)
 	}
 	if want := "/tmp/resolv.conf"; !strings.Contains(string(out), want) {
 		t.Fatalf("/etc/resolv.conf is not pointing to %q: %q", want, string(out))
@@ -110,11 +183,11 @@ func TestMountOverSymlinks(t *testing.T) {
 
 	etc, err := crictl.Exec(contID, "cat", "/etc/resolv.conf")
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("cat failed: %v, out: %s", err, etc)
 	}
 	tmp, err := crictl.Exec(contID, "cat", "/tmp/resolv.conf")
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("cat failed: %v, out: %s", err, out)
 	}
 	if tmp != etc {
 		t.Fatalf("file content doesn't match:\n\t/etc/resolv.conf: %s\n\t/tmp/resolv.conf: %s", string(etc), string(tmp))
@@ -122,7 +195,7 @@ func TestMountOverSymlinks(t *testing.T) {
 
 	// Stop everything.
 	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
-		t.Fatal(err)
+		t.Fatalf("stop failed: %v", err)
 	}
 }
 
@@ -135,16 +208,16 @@ func TestHomeDir(t *testing.T) {
 		t.Fatalf("failed to setup crictl: %v", err)
 	}
 	defer cleanup()
-	contSpec := testdata.SimpleSpec("root", "k8s.gcr.io/busybox", []string{"sleep", "1000"})
-	podID, contID, err := crictl.StartPodAndContainer("k8s.gcr.io/busybox", testdata.Sandbox, contSpec)
+	contSpec := SimpleSpec("root", "basic/busybox", []string{"sleep", "1000"}, nil)
+	podID, contID, err := crictl.StartPodAndContainer("basic/busybox", Sandbox, contSpec)
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("start failed: %v", err)
 	}
 
 	t.Run("root container", func(t *testing.T) {
 		out, err := crictl.Exec(contID, "sh", "-c", "echo $HOME")
 		if err != nil {
-			t.Fatal(err)
+			t.Fatalf("exec failed: %v, out: %s", err, out)
 		}
 		if got, want := strings.TrimSpace(string(out)), "/root"; got != want {
 			t.Fatalf("Home directory invalid. Got %q, Want : %q", got, want)
@@ -153,32 +226,47 @@ func TestHomeDir(t *testing.T) {
 
 	t.Run("sub-container", func(t *testing.T) {
 		// Create a sub container in the same pod.
-		subContSpec := testdata.SimpleSpec("subcontainer", "k8s.gcr.io/busybox", []string{"sleep", "1000"})
-		subContID, err := crictl.StartContainer(podID, "k8s.gcr.io/busybox", testdata.Sandbox, subContSpec)
+		subContSpec := SimpleSpec("subcontainer", "basic/busybox", []string{"sleep", "1000"}, nil)
+		subContID, err := crictl.StartContainer(podID, "basic/busybox", Sandbox, subContSpec)
 		if err != nil {
-			t.Fatal(err)
+			t.Fatalf("start failed: %v", err)
 		}
 
 		out, err := crictl.Exec(subContID, "sh", "-c", "echo $HOME")
 		if err != nil {
-			t.Fatal(err)
+			t.Fatalf("exec failed: %v, out: %s", err, out)
 		}
 		if got, want := strings.TrimSpace(string(out)), "/root"; got != want {
 			t.Fatalf("Home directory invalid. Got %q, Want: %q", got, want)
 		}
 
 		if err := crictl.StopContainer(subContID); err != nil {
-			t.Fatal(err)
+			t.Fatalf("stop failed: %v", err)
 		}
 	})
 
 	// Stop everything.
 	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
-		t.Fatal(err)
+		t.Fatalf("stop failed: %v", err)
 	}
 
 }
 
+// containerdConfigTemplate is a .toml config for containerd. It contains a
+// formatting verb so the runtime field can be set via fmt.Sprintf.
+const containerdConfigTemplate = `
+disabled_plugins = ["restart"]
+[plugins.linux]
+  runtime = "%s"
+  runtime_root = "/tmp/test-containerd/runsc"
+  shim = "/usr/local/bin/gvisor-containerd-shim"
+  shim_debug = true
+
+[plugins.cri.containerd.runtimes.runsc]
+  runtime_type = "io.containerd.runtime.v1.linux"
+  runtime_engine = "%s"
+`
+
 // setup sets up before a test. Specifically it:
 // * Creates directories and a socket for containerd to utilize.
 // * Runs containerd and waits for it to reach a "ready" state for testing.
@@ -213,50 +301,52 @@ func setup(t *testing.T) (*criutil.Crictl, func(), error) {
 	if err != nil {
 		t.Fatalf("error discovering runtime path: %v", err)
 	}
-	config, err := testutil.WriteTmpFile("containerd-config", testdata.ContainerdConfig(runtime))
+	config, configCleanup, err := testutil.WriteTmpFile("containerd-config", fmt.Sprintf(containerdConfigTemplate, runtime, runtime))
 	if err != nil {
 		t.Fatalf("failed to write containerd config")
 	}
-	cleanups = append(cleanups, func() { os.RemoveAll(config) })
+	cleanups = append(cleanups, configCleanup)
 
 	// Start containerd.
-	containerd := exec.Command(getContainerd(),
+	cmd := exec.Command(getContainerd(),
 		"--config", config,
 		"--log-level", "debug",
 		"--root", containerdRoot,
 		"--state", containerdState,
 		"--address", sockAddr)
+	startupR, startupW := io.Pipe()
+	defer startupR.Close()
+	defer startupW.Close()
+	stderr := &bytes.Buffer{}
+	stdout := &bytes.Buffer{}
+	cmd.Stderr = io.MultiWriter(startupW, stderr)
+	cmd.Stdout = io.MultiWriter(startupW, stdout)
 	cleanups = append(cleanups, func() {
-		if err := testutil.KillCommand(containerd); err != nil {
-			log.Printf("error killing containerd: %v", err)
-		}
+		t.Logf("containerd stdout: %s", stdout.String())
+		t.Logf("containerd stderr: %s", stderr.String())
 	})
-	containerdStderr, err := containerd.StderrPipe()
-	if err != nil {
-		t.Fatalf("failed to get containerd stderr: %v", err)
-	}
-	containerdStdout, err := containerd.StdoutPipe()
-	if err != nil {
-		t.Fatalf("failed to get containerd stdout: %v", err)
-	}
-	if err := containerd.Start(); err != nil {
+
+	// Start the process.
+	if err := cmd.Start(); err != nil {
 		t.Fatalf("failed running containerd: %v", err)
 	}
 
-	// Wait for containerd to boot. Then put all containerd output into a
-	// buffer to be logged at the end of the test.
-	testutil.WaitUntilRead(containerdStderr, "Start streaming server", nil, 10*time.Second)
-	stdoutBuf := &bytes.Buffer{}
-	stderrBuf := &bytes.Buffer{}
-	go func() { io.Copy(stdoutBuf, containerdStdout) }()
-	go func() { io.Copy(stderrBuf, containerdStderr) }()
+	// Wait for containerd to boot.
+	if err := testutil.WaitUntilRead(startupR, "Start streaming server", nil, 10*time.Second); err != nil {
+		t.Fatalf("failed to start containerd: %v", err)
+	}
+
+	// Kill must be the last cleanup (as it will be executed first).
+	cc := criutil.NewCrictl(t, sockAddr)
 	cleanups = append(cleanups, func() {
-		t.Logf("containerd stdout: %s", string(stdoutBuf.Bytes()))
-		t.Logf("containerd stderr: %s", string(stderrBuf.Bytes()))
+		cc.CleanUp() // Remove tmp files, etc.
+		if err := testutil.KillCommand(cmd); err != nil {
+			log.Printf("error killing containerd: %v", err)
+		}
 	})
 
 	cleanup.Release()
-	return criutil.NewCrictl(20*time.Second, sockAddr), cleanupFunc, nil
+	return cc, cleanupFunc, nil
 }
 
 // httpGet GETs the contents of a file served from a pod on port 80.
diff --git a/test/root/main_test.go b/test/root/main_test.go
index d74dec85f..9fb17e0dd 100644
--- a/test/root/main_test.go
+++ b/test/root/main_test.go
@@ -21,7 +21,7 @@ import (
 	"testing"
 
 	"github.com/syndtr/gocapability/capability"
-	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
diff --git a/test/root/oom_score_adj_test.go b/test/root/oom_score_adj_test.go
index 22488b05d..9a3cecd97 100644
--- a/test/root/oom_score_adj_test.go
+++ b/test/root/oom_score_adj_test.go
@@ -20,10 +20,9 @@ import (
 	"testing"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 var (
@@ -40,15 +39,6 @@ var (
 // TestOOMScoreAdjSingle tests that oom_score_adj is set properly in a
 // single container sandbox.
 func TestOOMScoreAdjSingle(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
-	conf := testutil.TestConfig(t)
-	conf.RootDir = rootDir
-
 	ppid, err := specutils.GetParentPid(os.Getpid())
 	if err != nil {
 		t.Fatalf("getting parent pid: %v", err)
@@ -89,11 +79,11 @@ func TestOOMScoreAdjSingle(t *testing.T) {
 
 	for _, testCase := range testCases {
 		t.Run(testCase.Name, func(t *testing.T) {
-			id := testutil.UniqueContainerID()
+			id := testutil.RandomContainerID()
 			s := testutil.NewSpecWithArgs("sleep", "1000")
 			s.Process.OOMScoreAdj = testCase.OOMScoreAdj
 
-			containers, cleanup, err := startContainers(conf, []*specs.Spec{s}, []string{id})
+			containers, cleanup, err := startContainers(t, []*specs.Spec{s}, []string{id})
 			if err != nil {
 				t.Fatalf("error starting containers: %v", err)
 			}
@@ -131,15 +121,6 @@ func TestOOMScoreAdjSingle(t *testing.T) {
 // TestOOMScoreAdjMulti tests that oom_score_adj is set properly in a
 // multi-container sandbox.
 func TestOOMScoreAdjMulti(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
-	conf := testutil.TestConfig(t)
-	conf.RootDir = rootDir
-
 	ppid, err := specutils.GetParentPid(os.Getpid())
 	if err != nil {
 		t.Fatalf("getting parent pid: %v", err)
@@ -257,7 +238,7 @@ func TestOOMScoreAdjMulti(t *testing.T) {
 				}
 			}
 
-			containers, cleanup, err := startContainers(conf, specs, ids)
+			containers, cleanup, err := startContainers(t, specs, ids)
 			if err != nil {
 				t.Fatalf("error starting containers: %v", err)
 			}
@@ -321,7 +302,7 @@ func TestOOMScoreAdjMulti(t *testing.T) {
 func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 	var specs []*specs.Spec
 	var ids []string
-	rootID := testutil.UniqueContainerID()
+	rootID := testutil.RandomContainerID()
 
 	for i, cmd := range cmds {
 		spec := testutil.NewSpecWithArgs(cmd...)
@@ -335,35 +316,48 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 				specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer,
 				specutils.ContainerdSandboxIDAnnotation:     rootID,
 			}
-			ids = append(ids, testutil.UniqueContainerID())
+			ids = append(ids, testutil.RandomContainerID())
 		}
 		specs = append(specs, spec)
 	}
 	return specs, ids
 }
 
-func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*container.Container, func(), error) {
-	if len(conf.RootDir) == 0 {
-		panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
-	}
-
-	var containers []*container.Container
-	var bundles []string
-	cleanup := func() {
+func startContainers(t *testing.T, specs []*specs.Spec, ids []string) ([]*container.Container, func(), error) {
+	var (
+		containers []*container.Container
+		cleanups   []func()
+	)
+	cleanups = append(cleanups, func() {
 		for _, c := range containers {
 			c.Destroy()
 		}
-		for _, b := range bundles {
-			os.RemoveAll(b)
+	})
+	cleanupAll := func() {
+		for _, c := range cleanups {
+			c()
 		}
 	}
+	localClean := specutils.MakeCleanup(cleanupAll)
+	defer localClean.Clean()
+
+	// All containers must share the same root.
+	rootDir, cleanup, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	cleanups = append(cleanups, cleanup)
+
+	// Point this to from the configuration.
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+
 	for i, spec := range specs {
-		bundleDir, err := testutil.SetupBundleDir(spec)
+		bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
 		if err != nil {
-			cleanup()
-			return nil, nil, fmt.Errorf("error setting up container: %v", err)
+			return nil, nil, fmt.Errorf("error setting up bundle: %v", err)
 		}
-		bundles = append(bundles, bundleDir)
+		cleanups = append(cleanups, cleanup)
 
 		args := container.Args{
 			ID:        ids[i],
@@ -372,15 +366,15 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*c
 		}
 		cont, err := container.New(conf, args)
 		if err != nil {
-			cleanup()
 			return nil, nil, fmt.Errorf("error creating container: %v", err)
 		}
 		containers = append(containers, cont)
 
 		if err := cont.Start(conf); err != nil {
-			cleanup()
 			return nil, nil, fmt.Errorf("error starting container: %v", err)
 		}
 	}
-	return containers, cleanup, nil
+
+	localClean.Release()
+	return containers, cleanupAll, nil
 }
diff --git a/test/root/runsc_test.go b/test/root/runsc_test.go
index 90373e2db..25204bebb 100644
--- a/test/root/runsc_test.go
+++ b/test/root/runsc_test.go
@@ -28,8 +28,8 @@ import (
 
 	"github.com/cenkalti/backoff"
 	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 // TestDoKill checks that when "runsc do..." is killed, the sandbox process is
diff --git a/test/root/testdata/BUILD b/test/root/testdata/BUILD
deleted file mode 100644
index 6859541ad..000000000
--- a/test/root/testdata/BUILD
+++ /dev/null
@@ -1,18 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "testdata",
-    srcs = [
-        "busybox.go",
-        "containerd_config.go",
-        "httpd.go",
-        "httpd_mount_paths.go",
-        "sandbox.go",
-        "simple.go",
-    ],
-    visibility = [
-        "//:sandbox",
-    ],
-)
diff --git a/test/root/testdata/busybox.go b/test/root/testdata/busybox.go
deleted file mode 100644
index e4dbd2843..000000000
--- a/test/root/testdata/busybox.go
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testdata
-
-// MountOverSymlink is a JSON config for a container that /etc/resolv.conf is a
-// symlink to /tmp/resolv.conf.
-var MountOverSymlink = `
-{
-        "metadata": {
-                "name": "busybox"
-        },
-        "image": {
-                "image": "k8s.gcr.io/busybox"
-        },
-        "command": [
-                "sleep",
-                "1000"
-        ]
-}
-`
diff --git a/test/root/testdata/containerd_config.go b/test/root/testdata/containerd_config.go
deleted file mode 100644
index e12f1ec88..000000000
--- a/test/root/testdata/containerd_config.go
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package testdata contains data required for root tests.
-package testdata
-
-import "fmt"
-
-// containerdConfigTemplate is a .toml config for containerd. It contains a
-// formatting verb so the runtime field can be set via fmt.Sprintf.
-const containerdConfigTemplate = `
-disabled_plugins = ["restart"]
-[plugins.linux]
-  runtime = "%s"
-  runtime_root = "/tmp/test-containerd/runsc"
-  shim = "/usr/local/bin/gvisor-containerd-shim"
-  shim_debug = true
-
-[plugins.cri.containerd.runtimes.runsc]
-  runtime_type = "io.containerd.runtime.v1.linux"
-  runtime_engine = "%s"
-`
-
-// ContainerdConfig returns a containerd config file with the specified
-// runtime.
-func ContainerdConfig(runtime string) string {
-	return fmt.Sprintf(containerdConfigTemplate, runtime, runtime)
-}
diff --git a/test/root/testdata/httpd.go b/test/root/testdata/httpd.go
deleted file mode 100644
index 45d5e33d4..000000000
--- a/test/root/testdata/httpd.go
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testdata
-
-// Httpd is a JSON config for an httpd container.
-const Httpd = `
-{
-  "metadata": {
-    "name": "httpd"
-  },
-  "image":{
-    "image": "httpd"
-  },
-  "mounts": [
-  ],
-  "linux": {
-  },
-  "log_path": "httpd.log"
-}
-`
diff --git a/test/root/testdata/httpd_mount_paths.go b/test/root/testdata/httpd_mount_paths.go
deleted file mode 100644
index ac3f4446a..000000000
--- a/test/root/testdata/httpd_mount_paths.go
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testdata
-
-// HttpdMountPaths is a JSON config for an httpd container with additional
-// mounts.
-const HttpdMountPaths = `
-{
-  "metadata": {
-    "name": "httpd"
-  },
-  "image":{
-    "image": "httpd"
-  },
-  "mounts": [
-      {
-        "container_path": "/var/run/secrets/kubernetes.io/serviceaccount",
-        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/volumes/kubernetes.io~secret/default-token-2rpfx",
-        "readonly": true
-      },
-      {
-        "container_path": "/etc/hosts",
-        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/etc-hosts",
-        "readonly": false
-      },
-      {
-        "container_path": "/dev/termination-log",
-        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/containers/httpd/d1709580",
-        "readonly": false
-      },
-      {
-        "container_path": "/usr/local/apache2/htdocs/test",
-        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064",
-        "readonly": true
-      }
-  ],
-  "linux": {
-  },
-  "log_path": "httpd.log"
-}
-`
diff --git a/test/root/testdata/sandbox.go b/test/root/testdata/sandbox.go
deleted file mode 100644
index 0db210370..000000000
--- a/test/root/testdata/sandbox.go
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testdata
-
-// Sandbox is a default JSON config for a sandbox.
-const Sandbox = `
-{
-    "metadata": {
-        "name": "default-sandbox",
-        "namespace": "default",
-        "attempt": 1,
-        "uid": "hdishd83djaidwnduwk28bcsb"
-    },
-    "linux": {
-    },
-    "log_directory": "/tmp"
-}
-`
diff --git a/test/root/testdata/simple.go b/test/root/testdata/simple.go
deleted file mode 100644
index 1cca53f0c..000000000
--- a/test/root/testdata/simple.go
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testdata
-
-import (
-	"encoding/json"
-	"fmt"
-)
-
-// SimpleSpec returns a JSON config for a simple container that runs the
-// specified command in the specified image.
-func SimpleSpec(name, image string, cmd []string) string {
-	cmds, err := json.Marshal(cmd)
-	if err != nil {
-		// This shouldn't happen.
-		panic(err)
-	}
-	return fmt.Sprintf(`
-{
-        "metadata": {
-                "name": %q
-        },
-        "image": {
-                "image": %q
-        },
-        "command": %s
-	}
-`, name, image, cmds)
-}
diff --git a/test/runner/BUILD b/test/runner/BUILD
index 9959ef9b0..6833c9986 100644
--- a/test/runner/BUILD
+++ b/test/runner/BUILD
@@ -12,8 +12,8 @@ go_binary(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
+        "//pkg/test/testutil",
         "//runsc/specutils",
-        "//runsc/testutil",
         "//test/runner/gtest",
         "//test/uds",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/test/runner/runner.go b/test/runner/runner.go
index 0d3742f71..14c9cbc47 100644
--- a/test/runner/runner.go
+++ b/test/runner/runner.go
@@ -32,8 +32,8 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
 	"gvisor.dev/gvisor/test/runner/gtest"
 	"gvisor.dev/gvisor/test/uds"
 )
@@ -115,20 +115,20 @@ func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) {
 //
 // Returns an error if the sandboxed application exits non-zero.
 func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
-	bundleDir, err := testutil.SetupBundleDir(spec)
+	bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
 	if err != nil {
 		return fmt.Errorf("SetupBundleDir failed: %v", err)
 	}
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		return fmt.Errorf("SetupRootDir failed: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	name := tc.FullName()
-	id := testutil.UniqueContainerID()
+	id := testutil.RandomContainerID()
 	log.Infof("Running test %q in container %q", name, id)
 	specutils.LogSpec(spec)
 
diff --git a/test/runtimes/BUILD b/test/runtimes/BUILD
index 2c472bf8d..4cd627222 100644
--- a/test/runtimes/BUILD
+++ b/test/runtimes/BUILD
@@ -1,20 +1,7 @@
-# These packages are used to run language runtime tests inside gVisor sandboxes.
-
-load("//tools:defs.bzl", "go_binary", "go_test")
-load("//test/runtimes:build_defs.bzl", "runtime_test")
+load("//test/runtimes:defs.bzl", "runtime_test")
 
 package(licenses = ["notice"])
 
-go_binary(
-    name = "runner",
-    testonly = 1,
-    srcs = ["runner.go"],
-    deps = [
-        "//runsc/dockerutil",
-        "//runsc/testutil",
-    ],
-)
-
 runtime_test(
     name = "go1.12",
     blacklist_file = "blacklist_go1.12.csv",
@@ -44,10 +31,3 @@ runtime_test(
     blacklist_file = "blacklist_python3.7.3.csv",
     lang = "python",
 )
-
-go_test(
-    name = "blacklist_test",
-    size = "small",
-    srcs = ["blacklist_test.go"],
-    library = ":runner",
-)
diff --git a/test/runtimes/README.md b/test/runtimes/README.md
deleted file mode 100644
index 42d722553..000000000
--- a/test/runtimes/README.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Runtimes Tests Dockerfiles
-
-The Dockerfiles defined under this path are configured to host the execution of
-the runtimes language tests. Each Dockerfile can support the language indicated
-by its directory.
-
-The following runtimes are currently supported:
-
--   Go 1.12
--   Java 11
--   Node.js 12
--   PHP 7.3
--   Python 3.7
-
-### Building and pushing the images:
-
-The canonical source of images is the
-[gvisor-presubmit container registry](https://gcr.io/gvisor-presubmit/). You can
-build new images with the following command:
-
-```bash
-$ cd images
-$ docker build -f Dockerfile_$LANG [-t $NAME] .
-```
-
-To push them to our container registry, set the tag in the command above to
-`gcr.io/gvisor-presubmit/$LANG`, then push them. (Note that you will need
-appropriate permissions to the `gvisor-presubmit` GCP project.)
-
-```bash
-gcloud docker -- push gcr.io/gvisor-presubmit/$LANG
-```
-
-#### Running in Docker locally:
-
-1) [Install and configure Docker](https://docs.docker.com/install/)
-
-2) Pull the image you want to run:
-
-```bash
-$ docker pull gcr.io/gvisor-presubmit/$LANG
-```
-
-3) Run docker with the image.
-
-```bash
-$ docker run [--runtime=runsc] --rm -it $NAME [FLAG]
-```
-
-Running the command with no flags will cause all the available tests to execute.
-
-Flags can be added for additional functionality:
-
--   --list: Print a list of all available tests
--   --test &lt;name&gt;: Run a single test from the list of available tests
--   --v: Print the language version
diff --git a/test/runtimes/blacklist_test.go b/test/runtimes/blacklist_test.go
deleted file mode 100644
index 0ff69ab18..000000000
--- a/test/runtimes/blacklist_test.go
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
-	"flag"
-	"os"
-	"testing"
-)
-
-func TestMain(m *testing.M) {
-	flag.Parse()
-	os.Exit(m.Run())
-}
-
-// Test that the blacklist parses without error.
-func TestBlacklists(t *testing.T) {
-	bl, err := getBlacklist()
-	if err != nil {
-		t.Fatalf("error parsing blacklist: %v", err)
-	}
-	if *blacklistFile != "" && len(bl) == 0 {
-		t.Errorf("got empty blacklist for file %q", *blacklistFile)
-	}
-}
diff --git a/test/runtimes/build_defs.bzl b/test/runtimes/build_defs.bzl
deleted file mode 100644
index 92e275a76..000000000
--- a/test/runtimes/build_defs.bzl
+++ /dev/null
@@ -1,75 +0,0 @@
-"""Defines a rule for runtime test targets."""
-
-load("//tools:defs.bzl", "go_test", "loopback")
-
-def runtime_test(
-        name,
-        lang,
-        image_repo = "gcr.io/gvisor-presubmit",
-        image_name = None,
-        blacklist_file = None,
-        shard_count = 50,
-        size = "enormous"):
-    """Generates sh_test and blacklist test targets for a given runtime.
-
-    Args:
-      name: The name of the runtime being tested. Typically, the lang + version.
-          This is used in the names of the generated test targets.
-      lang: The language being tested.
-      image_repo: The docker repository containing the proctor image to run.
-          i.e., the prefix to the fully qualified docker image id.
-      image_name: The name of the image in the image_repo.
-          Defaults to the test name.
-      blacklist_file: A test blacklist to pass to the runtime test's runner.
-      shard_count: See Bazel common test attributes.
-      size: See Bazel common test attributes.
-    """
-    if image_name == None:
-        image_name = name
-    args = [
-        "--lang",
-        lang,
-        "--image",
-        "/".join([image_repo, image_name]),
-    ]
-    data = [
-        ":runner",
-        loopback,
-    ]
-    if blacklist_file:
-        args += ["--blacklist_file", "test/runtimes/" + blacklist_file]
-        data += [blacklist_file]
-
-        # Add a test that the blacklist parses correctly.
-        blacklist_test(name, blacklist_file)
-
-    sh_test(
-        name = name + "_test",
-        srcs = ["runner.sh"],
-        args = args,
-        data = data,
-        size = size,
-        shard_count = shard_count,
-        tags = [
-            # Requires docker and runsc to be configured before the test runs.
-            "local",
-            # Don't include test target in wildcard target patterns.
-            "manual",
-        ],
-    )
-
-def blacklist_test(name, blacklist_file):
-    """Test that a blacklist parses correctly."""
-    go_test(
-        name = name + "_blacklist_test",
-        library = ":runner",
-        srcs = ["blacklist_test.go"],
-        args = ["--blacklist_file", "test/runtimes/" + blacklist_file],
-        data = [blacklist_file],
-    )
-
-def sh_test(**kwargs):
-    """Wraps the standard sh_test."""
-    native.sh_test(
-        **kwargs
-    )
diff --git a/test/runtimes/defs.bzl b/test/runtimes/defs.bzl
new file mode 100644
index 000000000..f836dd952
--- /dev/null
+++ b/test/runtimes/defs.bzl
@@ -0,0 +1,79 @@
+"""Defines a rule for runtime test targets."""
+
+load("//tools:defs.bzl", "go_test")
+
+def _runtime_test_impl(ctx):
+    # Construct arguments.
+    args = [
+        "--lang",
+        ctx.attr.lang,
+        "--image",
+        ctx.attr.image,
+    ]
+    if ctx.attr.blacklist_file:
+        args += [
+            "--blacklist_file",
+            ctx.files.blacklist_file[0].short_path,
+        ]
+
+    # Build a runner.
+    runner = ctx.actions.declare_file("%s-executer" % ctx.label.name)
+    runner_content = "\n".join([
+        "#!/bin/bash",
+        "%s %s\n" % (ctx.files._runner[0].short_path, " ".join(args)),
+    ])
+    ctx.actions.write(runner, runner_content, is_executable = True)
+
+    # Return the runner.
+    return [DefaultInfo(
+        executable = runner,
+        runfiles = ctx.runfiles(
+            files = ctx.files._runner + ctx.files.blacklist_file + ctx.files._proctor,
+            collect_default = True,
+            collect_data = True,
+        ),
+    )]
+
+_runtime_test = rule(
+    implementation = _runtime_test_impl,
+    attrs = {
+        "image": attr.string(
+            mandatory = False,
+        ),
+        "lang": attr.string(
+            mandatory = True,
+        ),
+        "blacklist_file": attr.label(
+            mandatory = False,
+            allow_single_file = True,
+        ),
+        "_runner": attr.label(
+            default = "//test/runtimes/runner:runner",
+        ),
+        "_proctor": attr.label(
+            default = "//test/runtimes/proctor:proctor",
+        ),
+    },
+    test = True,
+)
+
+def runtime_test(name, **kwargs):
+    _runtime_test(
+        name = name,
+        image = name,  # Resolved as images/runtimes/%s.
+        tags = [
+            "local",
+            "manual",
+        ],
+        **kwargs
+    )
+
+def blacklist_test(name, blacklist_file):
+    """Test that a blacklist parses correctly."""
+    go_test(
+        name = name + "_blacklist_test",
+        library = ":runner",
+        srcs = ["blacklist_test.go"],
+        args = ["--blacklist_file", "test/runtimes/" + blacklist_file],
+        data = [blacklist_file],
+    )
diff --git a/test/runtimes/images/proctor/BUILD b/test/runtimes/images/proctor/BUILD
deleted file mode 100644
index 85e004c45..000000000
--- a/test/runtimes/images/proctor/BUILD
+++ /dev/null
@@ -1,26 +0,0 @@
-load("//tools:defs.bzl", "go_binary", "go_test")
-
-package(licenses = ["notice"])
-
-go_binary(
-    name = "proctor",
-    srcs = [
-        "go.go",
-        "java.go",
-        "nodejs.go",
-        "php.go",
-        "proctor.go",
-        "python.go",
-    ],
-    visibility = ["//test/runtimes/images:__subpackages__"],
-)
-
-go_test(
-    name = "proctor_test",
-    size = "small",
-    srcs = ["proctor_test.go"],
-    library = ":proctor",
-    deps = [
-        "//runsc/testutil",
-    ],
-)
diff --git a/test/runtimes/images/proctor/go.go b/test/runtimes/images/proctor/go.go
deleted file mode 100644
index 3e2d5d8db..000000000
--- a/test/runtimes/images/proctor/go.go
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
-	"fmt"
-	"os"
-	"os/exec"
-	"regexp"
-	"strings"
-)
-
-var (
-	goTestRegEx = regexp.MustCompile(`^.+\.go$`)
-
-	// Directories with .dir contain helper files for tests.
-	// Exclude benchmarks and stress tests.
-	goDirFilter = regexp.MustCompile(`^(bench|stress)\/.+$|^.+\.dir.+$`)
-)
-
-// Location of Go tests on disk.
-const goTestDir = "/usr/local/go/test"
-
-// goRunner implements TestRunner for Go.
-//
-// There are two types of Go tests: "Go tool tests" and "Go tests on disk".
-// "Go tool tests" are found and executed using `go tool dist test`. "Go tests
-// on disk" are found in the /usr/local/go/test directory and are executed
-// using `go run run.go`.
-type goRunner struct{}
-
-var _ TestRunner = goRunner{}
-
-// ListTests implements TestRunner.ListTests.
-func (goRunner) ListTests() ([]string, error) {
-	// Go tool dist test tests.
-	args := []string{"tool", "dist", "test", "-list"}
-	cmd := exec.Command("go", args...)
-	cmd.Stderr = os.Stderr
-	out, err := cmd.Output()
-	if err != nil {
-		return nil, fmt.Errorf("failed to list: %v", err)
-	}
-	var toolSlice []string
-	for _, test := range strings.Split(string(out), "\n") {
-		toolSlice = append(toolSlice, test)
-	}
-
-	// Go tests on disk.
-	diskSlice, err := search(goTestDir, goTestRegEx)
-	if err != nil {
-		return nil, err
-	}
-	// Remove items from /bench/, /stress/ and .dir files
-	diskFiltered := diskSlice[:0]
-	for _, file := range diskSlice {
-		if !goDirFilter.MatchString(file) {
-			diskFiltered = append(diskFiltered, file)
-		}
-	}
-
-	return append(toolSlice, diskFiltered...), nil
-}
-
-// TestCmd implements TestRunner.TestCmd.
-func (goRunner) TestCmd(test string) *exec.Cmd {
-	// Check if test exists on disk by searching for file of the same name.
-	// This will determine whether or not it is a Go test on disk.
-	if strings.HasSuffix(test, ".go") {
-		// Test has suffix ".go" which indicates a disk test, run it as such.
-		cmd := exec.Command("go", "run", "run.go", "-v", "--", test)
-		cmd.Dir = goTestDir
-		return cmd
-	}
-
-	// No ".go" suffix, run as a tool test.
-	return exec.Command("go", "tool", "dist", "test", "-run", test)
-}
diff --git a/test/runtimes/images/proctor/java.go b/test/runtimes/images/proctor/java.go
deleted file mode 100644
index 8b362029d..000000000
--- a/test/runtimes/images/proctor/java.go
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
-	"fmt"
-	"os"
-	"os/exec"
-	"regexp"
-	"strings"
-)
-
-// Directories to exclude from tests.
-var javaExclDirs = regexp.MustCompile(`(^(sun\/security)|(java\/util\/stream)|(java\/time)| )`)
-
-// Location of java tests.
-const javaTestDir = "/root/test/jdk"
-
-// javaRunner implements TestRunner for Java.
-type javaRunner struct{}
-
-var _ TestRunner = javaRunner{}
-
-// ListTests implements TestRunner.ListTests.
-func (javaRunner) ListTests() ([]string, error) {
-	args := []string{
-		"-dir:" + javaTestDir,
-		"-ignore:quiet",
-		"-a",
-		"-listtests",
-		":jdk_core",
-		":jdk_svc",
-		":jdk_sound",
-		":jdk_imageio",
-	}
-	cmd := exec.Command("jtreg", args...)
-	cmd.Stderr = os.Stderr
-	out, err := cmd.Output()
-	if err != nil {
-		return nil, fmt.Errorf("jtreg -listtests : %v", err)
-	}
-	var testSlice []string
-	for _, test := range strings.Split(string(out), "\n") {
-		if !javaExclDirs.MatchString(test) {
-			testSlice = append(testSlice, test)
-		}
-	}
-	return testSlice, nil
-}
-
-// TestCmd implements TestRunner.TestCmd.
-func (javaRunner) TestCmd(test string) *exec.Cmd {
-	args := []string{
-		"-noreport",
-		"-dir:" + javaTestDir,
-		test,
-	}
-	return exec.Command("jtreg", args...)
-}
diff --git a/test/runtimes/images/proctor/nodejs.go b/test/runtimes/images/proctor/nodejs.go
deleted file mode 100644
index bd57db444..000000000
--- a/test/runtimes/images/proctor/nodejs.go
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
-	"os/exec"
-	"path/filepath"
-	"regexp"
-)
-
-var nodejsTestRegEx = regexp.MustCompile(`^test-[^-].+\.js$`)
-
-// Location of nodejs tests relative to working dir.
-const nodejsTestDir = "test"
-
-// nodejsRunner implements TestRunner for NodeJS.
-type nodejsRunner struct{}
-
-var _ TestRunner = nodejsRunner{}
-
-// ListTests implements TestRunner.ListTests.
-func (nodejsRunner) ListTests() ([]string, error) {
-	testSlice, err := search(nodejsTestDir, nodejsTestRegEx)
-	if err != nil {
-		return nil, err
-	}
-	return testSlice, nil
-}
-
-// TestCmd implements TestRunner.TestCmd.
-func (nodejsRunner) TestCmd(test string) *exec.Cmd {
-	args := []string{filepath.Join("tools", "test.py"), test}
-	return exec.Command("/usr/bin/python", args...)
-}
diff --git a/test/runtimes/images/proctor/php.go b/test/runtimes/images/proctor/php.go
deleted file mode 100644
index 9115040e1..000000000
--- a/test/runtimes/images/proctor/php.go
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
-	"os/exec"
-	"regexp"
-)
-
-var phpTestRegEx = regexp.MustCompile(`^.+\.phpt$`)
-
-// phpRunner implements TestRunner for PHP.
-type phpRunner struct{}
-
-var _ TestRunner = phpRunner{}
-
-// ListTests implements TestRunner.ListTests.
-func (phpRunner) ListTests() ([]string, error) {
-	testSlice, err := search(".", phpTestRegEx)
-	if err != nil {
-		return nil, err
-	}
-	return testSlice, nil
-}
-
-// TestCmd implements TestRunner.TestCmd.
-func (phpRunner) TestCmd(test string) *exec.Cmd {
-	args := []string{"test", "TESTS=" + test}
-	return exec.Command("make", args...)
-}
diff --git a/test/runtimes/images/proctor/proctor.go b/test/runtimes/images/proctor/proctor.go
deleted file mode 100644
index b54abe434..000000000
--- a/test/runtimes/images/proctor/proctor.go
+++ /dev/null
@@ -1,163 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Binary proctor runs the test for a particular runtime. It is meant to be
-// included in Docker images for all runtime tests.
-package main
-
-import (
-	"flag"
-	"fmt"
-	"log"
-	"os"
-	"os/exec"
-	"os/signal"
-	"path/filepath"
-	"regexp"
-	"syscall"
-)
-
-// TestRunner is an interface that must be implemented for each runtime
-// integrated with proctor.
-type TestRunner interface {
-	// ListTests returns a string slice of tests available to run.
-	ListTests() ([]string, error)
-
-	// TestCmd returns an *exec.Cmd that will run the given test.
-	TestCmd(test string) *exec.Cmd
-}
-
-var (
-	runtime  = flag.String("runtime", "", "name of runtime")
-	list     = flag.Bool("list", false, "list all available tests")
-	testName = flag.String("test", "", "run a single test from the list of available tests")
-	pause    = flag.Bool("pause", false, "cause container to pause indefinitely, reaping any zombie children")
-)
-
-func main() {
-	flag.Parse()
-
-	if *pause {
-		pauseAndReap()
-		panic("pauseAndReap should never return")
-	}
-
-	if *runtime == "" {
-		log.Fatalf("runtime flag must be provided")
-	}
-
-	tr, err := testRunnerForRuntime(*runtime)
-	if err != nil {
-		log.Fatalf("%v", err)
-	}
-
-	// List tests.
-	if *list {
-		tests, err := tr.ListTests()
-		if err != nil {
-			log.Fatalf("failed to list tests: %v", err)
-		}
-		for _, test := range tests {
-			fmt.Println(test)
-		}
-		return
-	}
-
-	var tests []string
-	if *testName == "" {
-		// Run every test.
-		tests, err = tr.ListTests()
-		if err != nil {
-			log.Fatalf("failed to get all tests: %v", err)
-		}
-	} else {
-		// Run a single test.
-		tests = []string{*testName}
-	}
-	for _, test := range tests {
-		cmd := tr.TestCmd(test)
-		cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
-		if err := cmd.Run(); err != nil {
-			log.Fatalf("FAIL: %v", err)
-		}
-	}
-}
-
-// testRunnerForRuntime returns a new TestRunner for the given runtime.
-func testRunnerForRuntime(runtime string) (TestRunner, error) {
-	switch runtime {
-	case "go":
-		return goRunner{}, nil
-	case "java":
-		return javaRunner{}, nil
-	case "nodejs":
-		return nodejsRunner{}, nil
-	case "php":
-		return phpRunner{}, nil
-	case "python":
-		return pythonRunner{}, nil
-	}
-	return nil, fmt.Errorf("invalid runtime %q", runtime)
-}
-
-// pauseAndReap is like init. It runs forever and reaps any children.
-func pauseAndReap() {
-	// Get notified of any new children.
-	ch := make(chan os.Signal, 1)
-	signal.Notify(ch, syscall.SIGCHLD)
-
-	for {
-		if _, ok := <-ch; !ok {
-			// Channel closed. This should not happen.
-			panic("signal channel closed")
-		}
-
-		// Reap the child.
-		for {
-			if cpid, _ := syscall.Wait4(-1, nil, syscall.WNOHANG, nil); cpid < 1 {
-				break
-			}
-		}
-	}
-}
-
-// search is a helper function to find tests in the given directory that match
-// the regex.
-func search(root string, testFilter *regexp.Regexp) ([]string, error) {
-	var testSlice []string
-
-	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
-		if err != nil {
-			return err
-		}
-
-		name := filepath.Base(path)
-
-		if info.IsDir() || !testFilter.MatchString(name) {
-			return nil
-		}
-
-		relPath, err := filepath.Rel(root, path)
-		if err != nil {
-			return err
-		}
-		testSlice = append(testSlice, relPath)
-		return nil
-	})
-	if err != nil {
-		return nil, fmt.Errorf("walking %q: %v", root, err)
-	}
-
-	return testSlice, nil
-}
diff --git a/test/runtimes/images/proctor/proctor_test.go b/test/runtimes/images/proctor/proctor_test.go
deleted file mode 100644
index 6bb61d142..000000000
--- a/test/runtimes/images/proctor/proctor_test.go
+++ /dev/null
@@ -1,127 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
-	"io/ioutil"
-	"os"
-	"path/filepath"
-	"reflect"
-	"regexp"
-	"strings"
-	"testing"
-
-	"gvisor.dev/gvisor/runsc/testutil"
-)
-
-func touch(t *testing.T, name string) {
-	t.Helper()
-	f, err := os.Create(name)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if err := f.Close(); err != nil {
-		t.Fatal(err)
-	}
-}
-
-func TestSearchEmptyDir(t *testing.T) {
-	td, err := ioutil.TempDir(testutil.TmpDir(), "searchtest")
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer os.RemoveAll(td)
-
-	var want []string
-
-	testFilter := regexp.MustCompile(`^test-[^-].+\.tc$`)
-	got, err := search(td, testFilter)
-	if err != nil {
-		t.Errorf("search error: %v", err)
-	}
-
-	if !reflect.DeepEqual(got, want) {
-		t.Errorf("Found %#v; want %#v", got, want)
-	}
-}
-
-func TestSearch(t *testing.T) {
-	td, err := ioutil.TempDir(testutil.TmpDir(), "searchtest")
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer os.RemoveAll(td)
-
-	// Creating various files similar to the test filter regex.
-	files := []string{
-		"emp/",
-		"tee/",
-		"test-foo.tc",
-		"test-foo.tc",
-		"test-bar.tc",
-		"test-sam.tc",
-		"Test-que.tc",
-		"test-brett",
-		"test--abc.tc",
-		"test---xyz.tc",
-		"test-bool.TC",
-		"--test-gvs.tc",
-		" test-pew.tc",
-		"dir/test_baz.tc",
-		"dir/testsnap.tc",
-		"dir/test-luk.tc",
-		"dir/nest/test-ok.tc",
-		"dir/dip/diz/goog/test-pack.tc",
-		"dir/dip/diz/wobble/thud/test-cas.e",
-		"dir/dip/diz/wobble/thud/test-cas.tc",
-	}
-	want := []string{
-		"dir/dip/diz/goog/test-pack.tc",
-		"dir/dip/diz/wobble/thud/test-cas.tc",
-		"dir/nest/test-ok.tc",
-		"dir/test-luk.tc",
-		"test-bar.tc",
-		"test-foo.tc",
-		"test-sam.tc",
-	}
-
-	for _, item := range files {
-		if strings.HasSuffix(item, "/") {
-			// This item is a directory, create it.
-			if err := os.MkdirAll(filepath.Join(td, item), 0755); err != nil {
-				t.Fatal(err)
-			}
-		} else {
-			// This item is a file, create the directory and touch file.
-			// Create directory in which file should be created
-			fullDirPath := filepath.Join(td, filepath.Dir(item))
-			if err := os.MkdirAll(fullDirPath, 0755); err != nil {
-				t.Fatal(err)
-			}
-			// Create file with full path to file.
-			touch(t, filepath.Join(td, item))
-		}
-	}
-
-	testFilter := regexp.MustCompile(`^test-[^-].+\.tc$`)
-	got, err := search(td, testFilter)
-	if err != nil {
-		t.Errorf("search error: %v", err)
-	}
-
-	if !reflect.DeepEqual(got, want) {
-		t.Errorf("Found %#v; want %#v", got, want)
-	}
-}
diff --git a/test/runtimes/images/proctor/python.go b/test/runtimes/images/proctor/python.go
deleted file mode 100644
index b9e0fbe6f..000000000
--- a/test/runtimes/images/proctor/python.go
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
-	"fmt"
-	"os"
-	"os/exec"
-	"strings"
-)
-
-// pythonRunner implements TestRunner for Python.
-type pythonRunner struct{}
-
-var _ TestRunner = pythonRunner{}
-
-// ListTests implements TestRunner.ListTests.
-func (pythonRunner) ListTests() ([]string, error) {
-	args := []string{"-m", "test", "--list-tests"}
-	cmd := exec.Command("./python", args...)
-	cmd.Stderr = os.Stderr
-	out, err := cmd.Output()
-	if err != nil {
-		return nil, fmt.Errorf("failed to list: %v", err)
-	}
-	var toolSlice []string
-	for _, test := range strings.Split(string(out), "\n") {
-		toolSlice = append(toolSlice, test)
-	}
-	return toolSlice, nil
-}
-
-// TestCmd implements TestRunner.TestCmd.
-func (pythonRunner) TestCmd(test string) *exec.Cmd {
-	args := []string{"-m", "test", test}
-	return exec.Command("./python", args...)
-}
diff --git a/test/runtimes/proctor/BUILD b/test/runtimes/proctor/BUILD
new file mode 100644
index 000000000..50a26d182
--- /dev/null
+++ b/test/runtimes/proctor/BUILD
@@ -0,0 +1,27 @@
+load("//tools:defs.bzl", "go_binary", "go_test")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "proctor",
+    srcs = [
+        "go.go",
+        "java.go",
+        "nodejs.go",
+        "php.go",
+        "proctor.go",
+        "python.go",
+    ],
+    pure = True,
+    visibility = ["//test/runtimes:__pkg__"],
+)
+
+go_test(
+    name = "proctor_test",
+    size = "small",
+    srcs = ["proctor_test.go"],
+    library = ":proctor",
+    deps = [
+        "//pkg/test/testutil",
+    ],
+)
diff --git a/test/runtimes/proctor/go.go b/test/runtimes/proctor/go.go
new file mode 100644
index 000000000..3e2d5d8db
--- /dev/null
+++ b/test/runtimes/proctor/go.go
@@ -0,0 +1,90 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"regexp"
+	"strings"
+)
+
+var (
+	goTestRegEx = regexp.MustCompile(`^.+\.go$`)
+
+	// Directories with .dir contain helper files for tests.
+	// Exclude benchmarks and stress tests.
+	goDirFilter = regexp.MustCompile(`^(bench|stress)\/.+$|^.+\.dir.+$`)
+)
+
+// Location of Go tests on disk.
+const goTestDir = "/usr/local/go/test"
+
+// goRunner implements TestRunner for Go.
+//
+// There are two types of Go tests: "Go tool tests" and "Go tests on disk".
+// "Go tool tests" are found and executed using `go tool dist test`. "Go tests
+// on disk" are found in the /usr/local/go/test directory and are executed
+// using `go run run.go`.
+type goRunner struct{}
+
+var _ TestRunner = goRunner{}
+
+// ListTests implements TestRunner.ListTests.
+func (goRunner) ListTests() ([]string, error) {
+	// Go tool dist test tests.
+	args := []string{"tool", "dist", "test", "-list"}
+	cmd := exec.Command("go", args...)
+	cmd.Stderr = os.Stderr
+	out, err := cmd.Output()
+	if err != nil {
+		return nil, fmt.Errorf("failed to list: %v", err)
+	}
+	var toolSlice []string
+	for _, test := range strings.Split(string(out), "\n") {
+		toolSlice = append(toolSlice, test)
+	}
+
+	// Go tests on disk.
+	diskSlice, err := search(goTestDir, goTestRegEx)
+	if err != nil {
+		return nil, err
+	}
+	// Remove items from /bench/, /stress/ and .dir files
+	diskFiltered := diskSlice[:0]
+	for _, file := range diskSlice {
+		if !goDirFilter.MatchString(file) {
+			diskFiltered = append(diskFiltered, file)
+		}
+	}
+
+	return append(toolSlice, diskFiltered...), nil
+}
+
+// TestCmd implements TestRunner.TestCmd.
+func (goRunner) TestCmd(test string) *exec.Cmd {
+	// Check if test exists on disk by searching for file of the same name.
+	// This will determine whether or not it is a Go test on disk.
+	if strings.HasSuffix(test, ".go") {
+		// Test has suffix ".go" which indicates a disk test, run it as such.
+		cmd := exec.Command("go", "run", "run.go", "-v", "--", test)
+		cmd.Dir = goTestDir
+		return cmd
+	}
+
+	// No ".go" suffix, run as a tool test.
+	return exec.Command("go", "tool", "dist", "test", "-run", test)
+}
diff --git a/test/runtimes/proctor/java.go b/test/runtimes/proctor/java.go
new file mode 100644
index 000000000..8b362029d
--- /dev/null
+++ b/test/runtimes/proctor/java.go
@@ -0,0 +1,71 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"regexp"
+	"strings"
+)
+
+// Directories to exclude from tests.
+var javaExclDirs = regexp.MustCompile(`(^(sun\/security)|(java\/util\/stream)|(java\/time)| )`)
+
+// Location of java tests.
+const javaTestDir = "/root/test/jdk"
+
+// javaRunner implements TestRunner for Java.
+type javaRunner struct{}
+
+var _ TestRunner = javaRunner{}
+
+// ListTests implements TestRunner.ListTests.
+func (javaRunner) ListTests() ([]string, error) {
+	args := []string{
+		"-dir:" + javaTestDir,
+		"-ignore:quiet",
+		"-a",
+		"-listtests",
+		":jdk_core",
+		":jdk_svc",
+		":jdk_sound",
+		":jdk_imageio",
+	}
+	cmd := exec.Command("jtreg", args...)
+	cmd.Stderr = os.Stderr
+	out, err := cmd.Output()
+	if err != nil {
+		return nil, fmt.Errorf("jtreg -listtests : %v", err)
+	}
+	var testSlice []string
+	for _, test := range strings.Split(string(out), "\n") {
+		if !javaExclDirs.MatchString(test) {
+			testSlice = append(testSlice, test)
+		}
+	}
+	return testSlice, nil
+}
+
+// TestCmd implements TestRunner.TestCmd.
+func (javaRunner) TestCmd(test string) *exec.Cmd {
+	args := []string{
+		"-noreport",
+		"-dir:" + javaTestDir,
+		test,
+	}
+	return exec.Command("jtreg", args...)
+}
diff --git a/test/runtimes/proctor/nodejs.go b/test/runtimes/proctor/nodejs.go
new file mode 100644
index 000000000..bd57db444
--- /dev/null
+++ b/test/runtimes/proctor/nodejs.go
@@ -0,0 +1,46 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"os/exec"
+	"path/filepath"
+	"regexp"
+)
+
+var nodejsTestRegEx = regexp.MustCompile(`^test-[^-].+\.js$`)
+
+// Location of nodejs tests relative to working dir.
+const nodejsTestDir = "test"
+
+// nodejsRunner implements TestRunner for NodeJS.
+type nodejsRunner struct{}
+
+var _ TestRunner = nodejsRunner{}
+
+// ListTests implements TestRunner.ListTests.
+func (nodejsRunner) ListTests() ([]string, error) {
+	testSlice, err := search(nodejsTestDir, nodejsTestRegEx)
+	if err != nil {
+		return nil, err
+	}
+	return testSlice, nil
+}
+
+// TestCmd implements TestRunner.TestCmd.
+func (nodejsRunner) TestCmd(test string) *exec.Cmd {
+	args := []string{filepath.Join("tools", "test.py"), test}
+	return exec.Command("/usr/bin/python", args...)
+}
diff --git a/test/runtimes/proctor/php.go b/test/runtimes/proctor/php.go
new file mode 100644
index 000000000..9115040e1
--- /dev/null
+++ b/test/runtimes/proctor/php.go
@@ -0,0 +1,42 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"os/exec"
+	"regexp"
+)
+
+var phpTestRegEx = regexp.MustCompile(`^.+\.phpt$`)
+
+// phpRunner implements TestRunner for PHP.
+type phpRunner struct{}
+
+var _ TestRunner = phpRunner{}
+
+// ListTests implements TestRunner.ListTests.
+func (phpRunner) ListTests() ([]string, error) {
+	testSlice, err := search(".", phpTestRegEx)
+	if err != nil {
+		return nil, err
+	}
+	return testSlice, nil
+}
+
+// TestCmd implements TestRunner.TestCmd.
+func (phpRunner) TestCmd(test string) *exec.Cmd {
+	args := []string{"test", "TESTS=" + test}
+	return exec.Command("make", args...)
+}
diff --git a/test/runtimes/proctor/proctor.go b/test/runtimes/proctor/proctor.go
new file mode 100644
index 000000000..b54abe434
--- /dev/null
+++ b/test/runtimes/proctor/proctor.go
@@ -0,0 +1,163 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary proctor runs the test for a particular runtime. It is meant to be
+// included in Docker images for all runtime tests.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"os/exec"
+	"os/signal"
+	"path/filepath"
+	"regexp"
+	"syscall"
+)
+
+// TestRunner is an interface that must be implemented for each runtime
+// integrated with proctor.
+type TestRunner interface {
+	// ListTests returns a string slice of tests available to run.
+	ListTests() ([]string, error)
+
+	// TestCmd returns an *exec.Cmd that will run the given test.
+	TestCmd(test string) *exec.Cmd
+}
+
+var (
+	runtime  = flag.String("runtime", "", "name of runtime")
+	list     = flag.Bool("list", false, "list all available tests")
+	testName = flag.String("test", "", "run a single test from the list of available tests")
+	pause    = flag.Bool("pause", false, "cause container to pause indefinitely, reaping any zombie children")
+)
+
+func main() {
+	flag.Parse()
+
+	if *pause {
+		pauseAndReap()
+		panic("pauseAndReap should never return")
+	}
+
+	if *runtime == "" {
+		log.Fatalf("runtime flag must be provided")
+	}
+
+	tr, err := testRunnerForRuntime(*runtime)
+	if err != nil {
+		log.Fatalf("%v", err)
+	}
+
+	// List tests.
+	if *list {
+		tests, err := tr.ListTests()
+		if err != nil {
+			log.Fatalf("failed to list tests: %v", err)
+		}
+		for _, test := range tests {
+			fmt.Println(test)
+		}
+		return
+	}
+
+	var tests []string
+	if *testName == "" {
+		// Run every test.
+		tests, err = tr.ListTests()
+		if err != nil {
+			log.Fatalf("failed to get all tests: %v", err)
+		}
+	} else {
+		// Run a single test.
+		tests = []string{*testName}
+	}
+	for _, test := range tests {
+		cmd := tr.TestCmd(test)
+		cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
+		if err := cmd.Run(); err != nil {
+			log.Fatalf("FAIL: %v", err)
+		}
+	}
+}
+
+// testRunnerForRuntime returns a new TestRunner for the given runtime.
+func testRunnerForRuntime(runtime string) (TestRunner, error) {
+	switch runtime {
+	case "go":
+		return goRunner{}, nil
+	case "java":
+		return javaRunner{}, nil
+	case "nodejs":
+		return nodejsRunner{}, nil
+	case "php":
+		return phpRunner{}, nil
+	case "python":
+		return pythonRunner{}, nil
+	}
+	return nil, fmt.Errorf("invalid runtime %q", runtime)
+}
+
+// pauseAndReap is like init. It runs forever and reaps any children.
+func pauseAndReap() {
+	// Get notified of any new children.
+	ch := make(chan os.Signal, 1)
+	signal.Notify(ch, syscall.SIGCHLD)
+
+	for {
+		if _, ok := <-ch; !ok {
+			// Channel closed. This should not happen.
+			panic("signal channel closed")
+		}
+
+		// Reap the child.
+		for {
+			if cpid, _ := syscall.Wait4(-1, nil, syscall.WNOHANG, nil); cpid < 1 {
+				break
+			}
+		}
+	}
+}
+
+// search is a helper function to find tests in the given directory that match
+// the regex.
+func search(root string, testFilter *regexp.Regexp) ([]string, error) {
+	var testSlice []string
+
+	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+
+		name := filepath.Base(path)
+
+		if info.IsDir() || !testFilter.MatchString(name) {
+			return nil
+		}
+
+		relPath, err := filepath.Rel(root, path)
+		if err != nil {
+			return err
+		}
+		testSlice = append(testSlice, relPath)
+		return nil
+	})
+	if err != nil {
+		return nil, fmt.Errorf("walking %q: %v", root, err)
+	}
+
+	return testSlice, nil
+}
diff --git a/test/runtimes/proctor/proctor_test.go b/test/runtimes/proctor/proctor_test.go
new file mode 100644
index 000000000..6ef2de085
--- /dev/null
+++ b/test/runtimes/proctor/proctor_test.go
@@ -0,0 +1,127 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"reflect"
+	"regexp"
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/test/testutil"
+)
+
+func touch(t *testing.T, name string) {
+	t.Helper()
+	f, err := os.Create(name)
+	if err != nil {
+		t.Fatalf("error creating file %q: %v", name, err)
+	}
+	if err := f.Close(); err != nil {
+		t.Fatalf("error closing file %q: %v", name, err)
+	}
+}
+
+func TestSearchEmptyDir(t *testing.T) {
+	td, err := ioutil.TempDir(testutil.TmpDir(), "searchtest")
+	if err != nil {
+		t.Fatalf("error creating searchtest: %v", err)
+	}
+	defer os.RemoveAll(td)
+
+	var want []string
+
+	testFilter := regexp.MustCompile(`^test-[^-].+\.tc$`)
+	got, err := search(td, testFilter)
+	if err != nil {
+		t.Errorf("search error: %v", err)
+	}
+
+	if !reflect.DeepEqual(got, want) {
+		t.Errorf("Found %#v; want %#v", got, want)
+	}
+}
+
+func TestSearch(t *testing.T) {
+	td, err := ioutil.TempDir(testutil.TmpDir(), "searchtest")
+	if err != nil {
+		t.Fatalf("error creating searchtest: %v", err)
+	}
+	defer os.RemoveAll(td)
+
+	// Creating various files similar to the test filter regex.
+	files := []string{
+		"emp/",
+		"tee/",
+		"test-foo.tc",
+		"test-foo.tc",
+		"test-bar.tc",
+		"test-sam.tc",
+		"Test-que.tc",
+		"test-brett",
+		"test--abc.tc",
+		"test---xyz.tc",
+		"test-bool.TC",
+		"--test-gvs.tc",
+		" test-pew.tc",
+		"dir/test_baz.tc",
+		"dir/testsnap.tc",
+		"dir/test-luk.tc",
+		"dir/nest/test-ok.tc",
+		"dir/dip/diz/goog/test-pack.tc",
+		"dir/dip/diz/wobble/thud/test-cas.e",
+		"dir/dip/diz/wobble/thud/test-cas.tc",
+	}
+	want := []string{
+		"dir/dip/diz/goog/test-pack.tc",
+		"dir/dip/diz/wobble/thud/test-cas.tc",
+		"dir/nest/test-ok.tc",
+		"dir/test-luk.tc",
+		"test-bar.tc",
+		"test-foo.tc",
+		"test-sam.tc",
+	}
+
+	for _, item := range files {
+		if strings.HasSuffix(item, "/") {
+			// This item is a directory, create it.
+			if err := os.MkdirAll(filepath.Join(td, item), 0755); err != nil {
+				t.Fatalf("error making directory: %v", err)
+			}
+		} else {
+			// This item is a file, create the directory and touch file.
+			// Create directory in which file should be created
+			fullDirPath := filepath.Join(td, filepath.Dir(item))
+			if err := os.MkdirAll(fullDirPath, 0755); err != nil {
+				t.Fatalf("error making directory: %v", err)
+			}
+			// Create file with full path to file.
+			touch(t, filepath.Join(td, item))
+		}
+	}
+
+	testFilter := regexp.MustCompile(`^test-[^-].+\.tc$`)
+	got, err := search(td, testFilter)
+	if err != nil {
+		t.Errorf("search error: %v", err)
+	}
+
+	if !reflect.DeepEqual(got, want) {
+		t.Errorf("Found %#v; want %#v", got, want)
+	}
+}
diff --git a/test/runtimes/proctor/python.go b/test/runtimes/proctor/python.go
new file mode 100644
index 000000000..b9e0fbe6f
--- /dev/null
+++ b/test/runtimes/proctor/python.go
@@ -0,0 +1,49 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"strings"
+)
+
+// pythonRunner implements TestRunner for Python.
+type pythonRunner struct{}
+
+var _ TestRunner = pythonRunner{}
+
+// ListTests implements TestRunner.ListTests.
+func (pythonRunner) ListTests() ([]string, error) {
+	args := []string{"-m", "test", "--list-tests"}
+	cmd := exec.Command("./python", args...)
+	cmd.Stderr = os.Stderr
+	out, err := cmd.Output()
+	if err != nil {
+		return nil, fmt.Errorf("failed to list: %v", err)
+	}
+	var toolSlice []string
+	for _, test := range strings.Split(string(out), "\n") {
+		toolSlice = append(toolSlice, test)
+	}
+	return toolSlice, nil
+}
+
+// TestCmd implements TestRunner.TestCmd.
+func (pythonRunner) TestCmd(test string) *exec.Cmd {
+	args := []string{"-m", "test", test}
+	return exec.Command("./python", args...)
+}
diff --git a/test/runtimes/runner.go b/test/runtimes/runner.go
deleted file mode 100644
index 3c98f4570..000000000
--- a/test/runtimes/runner.go
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Binary runner runs the runtime tests in a Docker container.
-package main
-
-import (
-	"encoding/csv"
-	"flag"
-	"fmt"
-	"io"
-	"os"
-	"sort"
-	"strings"
-	"testing"
-	"time"
-
-	"gvisor.dev/gvisor/runsc/dockerutil"
-	"gvisor.dev/gvisor/runsc/testutil"
-)
-
-var (
-	lang          = flag.String("lang", "", "language runtime to test")
-	image         = flag.String("image", "", "docker image with runtime tests")
-	blacklistFile = flag.String("blacklist_file", "", "file containing blacklist of tests to exclude, in CSV format with fields: test name, bug id, comment")
-)
-
-// Wait time for each test to run.
-const timeout = 5 * time.Minute
-
-func main() {
-	flag.Parse()
-	if *lang == "" || *image == "" {
-		fmt.Fprintf(os.Stderr, "lang and image flags must not be empty\n")
-		os.Exit(1)
-	}
-
-	os.Exit(runTests())
-}
-
-// runTests is a helper that is called by main. It exists so that we can run
-// defered functions before exiting. It returns an exit code that should be
-// passed to os.Exit.
-func runTests() int {
-	// Get tests to blacklist.
-	blacklist, err := getBlacklist()
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "Error getting blacklist: %s\n", err.Error())
-		return 1
-	}
-
-	// Create a single docker container that will be used for all tests.
-	d := dockerutil.MakeDocker("gvisor-" + *lang)
-	defer d.CleanUp()
-
-	// Get a slice of tests to run. This will also start a single Docker
-	// container that will be used to run each test. The final test will
-	// stop the Docker container.
-	tests, err := getTests(d, blacklist)
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "%s\n", err.Error())
-		return 1
-	}
-
-	m := testing.MainStart(testDeps{}, tests, nil, nil)
-	return m.Run()
-}
-
-// getTests returns a slice of tests to run, subject to the shard size and
-// index.
-func getTests(d dockerutil.Docker, blacklist map[string]struct{}) ([]testing.InternalTest, error) {
-	// Pull the image.
-	if err := dockerutil.Pull(*image); err != nil {
-		return nil, fmt.Errorf("docker pull %q failed: %v", *image, err)
-	}
-
-	// Run proctor with --pause flag to keep container alive forever.
-	if err := d.Run(*image, "--pause"); err != nil {
-		return nil, fmt.Errorf("docker run failed: %v", err)
-	}
-
-	// Get a list of all tests in the image.
-	list, err := d.Exec("/proctor", "--runtime", *lang, "--list")
-	if err != nil {
-		return nil, fmt.Errorf("docker exec failed: %v", err)
-	}
-
-	// Calculate a subset of tests to run corresponding to the current
-	// shard.
-	tests := strings.Fields(list)
-	sort.Strings(tests)
-	indices, err := testutil.TestIndicesForShard(len(tests))
-	if err != nil {
-		return nil, fmt.Errorf("TestsForShard() failed: %v", err)
-	}
-
-	var itests []testing.InternalTest
-	for _, tci := range indices {
-		// Capture tc in this scope.
-		tc := tests[tci]
-		itests = append(itests, testing.InternalTest{
-			Name: tc,
-			F: func(t *testing.T) {
-				// Is the test blacklisted?
-				if _, ok := blacklist[tc]; ok {
-					t.Skipf("SKIP: blacklisted test %q", tc)
-				}
-
-				var (
-					now    = time.Now()
-					done   = make(chan struct{})
-					output string
-					err    error
-				)
-
-				go func() {
-					fmt.Printf("RUNNING %s...\n", tc)
-					output, err = d.Exec("/proctor", "--runtime", *lang, "--test", tc)
-					close(done)
-				}()
-
-				select {
-				case <-done:
-					if err == nil {
-						fmt.Printf("PASS: %s (%v)\n\n", tc, time.Since(now))
-						return
-					}
-					t.Errorf("FAIL: %s (%v):\n%s\n", tc, time.Since(now), output)
-				case <-time.After(timeout):
-					t.Errorf("TIMEOUT: %s (%v):\n%s\n", tc, time.Since(now), output)
-				}
-			},
-		})
-	}
-	return itests, nil
-}
-
-// getBlacklist reads the blacklist file and returns a set of test names to
-// exclude.
-func getBlacklist() (map[string]struct{}, error) {
-	blacklist := make(map[string]struct{})
-	if *blacklistFile == "" {
-		return blacklist, nil
-	}
-	file, err := testutil.FindFile(*blacklistFile)
-	if err != nil {
-		return nil, err
-	}
-	f, err := os.Open(file)
-	if err != nil {
-		return nil, err
-	}
-	defer f.Close()
-
-	r := csv.NewReader(f)
-
-	// First line is header. Skip it.
-	if _, err := r.Read(); err != nil {
-		return nil, err
-	}
-
-	for {
-		record, err := r.Read()
-		if err == io.EOF {
-			break
-		}
-		if err != nil {
-			return nil, err
-		}
-		blacklist[record[0]] = struct{}{}
-	}
-	return blacklist, nil
-}
-
-// testDeps implements testing.testDeps (an unexported interface), and is
-// required to use testing.MainStart.
-type testDeps struct{}
-
-func (f testDeps) MatchString(a, b string) (bool, error)       { return a == b, nil }
-func (f testDeps) StartCPUProfile(io.Writer) error             { return nil }
-func (f testDeps) StopCPUProfile()                             {}
-func (f testDeps) WriteProfileTo(string, io.Writer, int) error { return nil }
-func (f testDeps) ImportPath() string                          { return "" }
-func (f testDeps) StartTestLog(io.Writer)                      {}
-func (f testDeps) StopTestLog() error                          { return nil }
diff --git a/test/runtimes/runner.sh b/test/runtimes/runner.sh
deleted file mode 100755
index a8d9a3460..000000000
--- a/test/runtimes/runner.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -euf -x -o pipefail
-
-echo -- "$@"
-
-# Create outputs dir if it does not exist.
-if [[ -n "${TEST_UNDECLARED_OUTPUTS_DIR}" ]]; then
-  mkdir -p "${TEST_UNDECLARED_OUTPUTS_DIR}"
-  chmod a+rwx "${TEST_UNDECLARED_OUTPUTS_DIR}"
-fi
-
-# Update the timestamp on the shard status file. Bazel looks for this.
-touch "${TEST_SHARD_STATUS_FILE}"
-
-# Get location of runner binary.
-readonly runner=$(find "${TEST_SRCDIR}" -name runner)
-
-# Pass the arguments of this script directly to the runner.
-exec "${runner}" "$@"
-
diff --git a/test/runtimes/runner/BUILD b/test/runtimes/runner/BUILD
new file mode 100644
index 000000000..63924b9c5
--- /dev/null
+++ b/test/runtimes/runner/BUILD
@@ -0,0 +1,21 @@
+load("//tools:defs.bzl", "go_binary", "go_test")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "runner",
+    testonly = 1,
+    srcs = ["main.go"],
+    visibility = ["//test/runtimes:__pkg__"],
+    deps = [
+        "//pkg/test/dockerutil",
+        "//pkg/test/testutil",
+    ],
+)
+
+go_test(
+    name = "blacklist_test",
+    size = "small",
+    srcs = ["blacklist_test.go"],
+    library = ":runner",
+)
diff --git a/test/runtimes/runner/blacklist_test.go b/test/runtimes/runner/blacklist_test.go
new file mode 100644
index 000000000..0ff69ab18
--- /dev/null
+++ b/test/runtimes/runner/blacklist_test.go
@@ -0,0 +1,37 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"flag"
+	"os"
+	"testing"
+)
+
+func TestMain(m *testing.M) {
+	flag.Parse()
+	os.Exit(m.Run())
+}
+
+// Test that the blacklist parses without error.
+func TestBlacklists(t *testing.T) {
+	bl, err := getBlacklist()
+	if err != nil {
+		t.Fatalf("error parsing blacklist: %v", err)
+	}
+	if *blacklistFile != "" && len(bl) == 0 {
+		t.Errorf("got empty blacklist for file %q", *blacklistFile)
+	}
+}
diff --git a/test/runtimes/runner/main.go b/test/runtimes/runner/main.go
new file mode 100644
index 000000000..57540e00e
--- /dev/null
+++ b/test/runtimes/runner/main.go
@@ -0,0 +1,189 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary runner runs the runtime tests in a Docker container.
+package main
+
+import (
+	"encoding/csv"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"sort"
+	"strings"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+)
+
+var (
+	lang          = flag.String("lang", "", "language runtime to test")
+	image         = flag.String("image", "", "docker image with runtime tests")
+	blacklistFile = flag.String("blacklist_file", "", "file containing blacklist of tests to exclude, in CSV format with fields: test name, bug id, comment")
+)
+
+// Wait time for each test to run.
+const timeout = 5 * time.Minute
+
+func main() {
+	flag.Parse()
+	if *lang == "" || *image == "" {
+		fmt.Fprintf(os.Stderr, "lang and image flags must not be empty\n")
+		os.Exit(1)
+	}
+	os.Exit(runTests())
+}
+
+// runTests is a helper that is called by main. It exists so that we can run
+// defered functions before exiting. It returns an exit code that should be
+// passed to os.Exit.
+func runTests() int {
+	// Get tests to blacklist.
+	blacklist, err := getBlacklist()
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Error getting blacklist: %s\n", err.Error())
+		return 1
+	}
+
+	// Construct the shared docker instance.
+	d := dockerutil.MakeDocker(testutil.DefaultLogger(*lang))
+	defer d.CleanUp()
+
+	// Get a slice of tests to run. This will also start a single Docker
+	// container that will be used to run each test. The final test will
+	// stop the Docker container.
+	tests, err := getTests(d, blacklist)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "%s\n", err.Error())
+		return 1
+	}
+
+	m := testing.MainStart(testDeps{}, tests, nil, nil)
+	return m.Run()
+}
+
+// getTests executes all tests as table tests.
+func getTests(d *dockerutil.Docker, blacklist map[string]struct{}) ([]testing.InternalTest, error) {
+	// Start the container.
+	d.CopyFiles("/proctor", "test/runtimes/proctor/proctor")
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: fmt.Sprintf("runtimes/%s", *image),
+	}, "/proctor/proctor", "--pause"); err != nil {
+		return nil, fmt.Errorf("docker run failed: %v", err)
+	}
+
+	// Get a list of all tests in the image.
+	list, err := d.Exec(dockerutil.RunOpts{}, "/proctor/proctor", "--runtime", *lang, "--list")
+	if err != nil {
+		return nil, fmt.Errorf("docker exec failed: %v", err)
+	}
+
+	// Calculate a subset of tests to run corresponding to the current
+	// shard.
+	tests := strings.Fields(list)
+	sort.Strings(tests)
+	indices, err := testutil.TestIndicesForShard(len(tests))
+	if err != nil {
+		return nil, fmt.Errorf("TestsForShard() failed: %v", err)
+	}
+
+	var itests []testing.InternalTest
+	for _, tci := range indices {
+		// Capture tc in this scope.
+		tc := tests[tci]
+		itests = append(itests, testing.InternalTest{
+			Name: tc,
+			F: func(t *testing.T) {
+				// Is the test blacklisted?
+				if _, ok := blacklist[tc]; ok {
+					t.Skipf("SKIP: blacklisted test %q", tc)
+				}
+
+				var (
+					now    = time.Now()
+					done   = make(chan struct{})
+					output string
+					err    error
+				)
+
+				go func() {
+					fmt.Printf("RUNNING %s...\n", tc)
+					output, err = d.Exec(dockerutil.RunOpts{}, "/proctor/proctor", "--runtime", *lang, "--test", tc)
+					close(done)
+				}()
+
+				select {
+				case <-done:
+					if err == nil {
+						fmt.Printf("PASS: %s (%v)\n\n", tc, time.Since(now))
+						return
+					}
+					t.Errorf("FAIL: %s (%v):\n%s\n", tc, time.Since(now), output)
+				case <-time.After(timeout):
+					t.Errorf("TIMEOUT: %s (%v):\n%s\n", tc, time.Since(now), output)
+				}
+			},
+		})
+	}
+
+	return itests, nil
+}
+
+// getBlacklist reads the blacklist file and returns a set of test names to
+// exclude.
+func getBlacklist() (map[string]struct{}, error) {
+	blacklist := make(map[string]struct{})
+	if *blacklistFile == "" {
+		return blacklist, nil
+	}
+	f, err := os.Open(*blacklistFile)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	r := csv.NewReader(f)
+
+	// First line is header. Skip it.
+	if _, err := r.Read(); err != nil {
+		return nil, err
+	}
+
+	for {
+		record, err := r.Read()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return nil, err
+		}
+		blacklist[record[0]] = struct{}{}
+	}
+	return blacklist, nil
+}
+
+// testDeps implements testing.testDeps (an unexported interface), and is
+// required to use testing.MainStart.
+type testDeps struct{}
+
+func (f testDeps) MatchString(a, b string) (bool, error)       { return a == b, nil }
+func (f testDeps) StartCPUProfile(io.Writer) error             { return nil }
+func (f testDeps) StopCPUProfile()                             {}
+func (f testDeps) WriteProfileTo(string, io.Writer, int) error { return nil }
+func (f testDeps) ImportPath() string                          { return "" }
+func (f testDeps) StartTestLog(io.Writer)                      {}
+func (f testDeps) StopTestLog() error                          { return nil }
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index 2207b9b34..3c22aec24 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -5,18 +5,14 @@ load("@io_bazel_rules_go//go:def.bzl", "GoLibrary", _go_binary = "go_binary", _g
 load("@io_bazel_rules_go//proto:def.bzl", _go_grpc_library = "go_grpc_library", _go_proto_library = "go_proto_library")
 load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test")
 load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar")
-load("@io_bazel_rules_docker//go:image.bzl", _go_image = "go_image")
-load("@io_bazel_rules_docker//container:container.bzl", _container_image = "container_image")
 load("@pydeps//:requirements.bzl", _py_requirement = "requirement")
 load("@com_github_grpc_grpc//bazel:cc_grpc_library.bzl", _cc_grpc_library = "cc_grpc_library")
 
-container_image = _container_image
 cc_library = _cc_library
 cc_flags_supplier = _cc_flags_supplier
 cc_proto_library = _cc_proto_library
 cc_test = _cc_test
 cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
-go_image = _go_image
 go_embed_data = _go_embed_data
 gtest = "@com_google_googletest//:gtest"
 grpcpp = "@com_github_grpc_grpc//:grpc++"
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 33240e7f4..cdaf281f3 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,7 +7,7 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/bazeldefs:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _grpcpp = "grpcpp", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/bazeldefs:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _grpcpp = "grpcpp", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
 load("//tools/bazeldefs:platforms.bzl", _default_platform = "default_platform", _platforms = "platforms")
 load("//tools/bazeldefs:tags.bzl", "go_suffixes")
 load("//tools/nogo:defs.bzl", "nogo_test")
@@ -19,12 +19,10 @@ cc_grpc_library = _cc_grpc_library
 cc_library = _cc_library
 cc_test = _cc_test
 cc_toolchain = _cc_toolchain
-container_image = _container_image
 default_installer = _default_installer
 default_net_util = _default_net_util
 gbenchmark = _gbenchmark
 go_embed_data = _go_embed_data
-go_image = _go_image
 go_test = _go_test
 gtest = _gtest
 grpcpp = _grpcpp
-- 
cgit v1.2.3


From 2e8c35b506654172243ea46918d27c897fca568c Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 23 Apr 2020 13:00:34 -0700
Subject: Add basic GitHub labeler workflow.

This is the first automated GitHub actions workflow, and it simply applies
labels to pull request in a best-effort fashion.

PiperOrigin-RevId: 308112191
---
 .github/labeler.yml           | 42 ++++++++++++++++++++++++++++++++++++++++++
 .github/workflows/labeler.yml | 11 +++++++++++
 2 files changed, 53 insertions(+)
 create mode 100644 .github/labeler.yml
 create mode 100644 .github/workflows/labeler.yml

diff --git a/.github/labeler.yml b/.github/labeler.yml
new file mode 100644
index 000000000..b6a17051c
--- /dev/null
+++ b/.github/labeler.yml
@@ -0,0 +1,42 @@
+"arch: arm":
+  - "**/*_arm64.*"
+  - "**/*_aarch64.*"
+"arch: x86_64":
+  - "**/*_amd64.*"
+  - "**/*_x86.*"
+"area: bazel":
+  - "**/BUILD"
+  - "**/*.bzl"
+"area: docs":
+  - "**/g3doc/**"
+  - "**/README.md"
+"area: filesystem":
+  - "pkg/sentry/fs/**"
+  - "pkg/sentry/vfs/**"
+  - "pkg/sentry/fsimpl/**"
+"area: hostinet":
+  - "pkg/sentry/socket/hostinet/**"
+"area: networking":
+  - "pkg/tcpip/**"
+  - "pkg/sentry/socket/**"
+"area: kernel":
+  - "pkg/sentry/arch/**"
+  - "pkg/sentry/kernel/**"
+  - "pkg/sentry/syscalls/**"
+"area: mm":
+  - "pkg/sentry/mm/**"
+"area: tests":
+  - "**/tests/**"
+  - "**/*_test.go"
+  - "**/test/**"
+"area: tooling":
+  - "tools/**"
+"dependencies":
+  - "WORKSPACE"
+  - "go.mod"
+  - "go.sum"
+"platform: kvm":
+  - "pkg/sentry/platform/kvm/**"
+  - "pkg/sentry/platform/ring0/**"
+"platform: ptrace":
+  - "pkg/sentry/platform/ptrace/**"
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
new file mode 100644
index 000000000..b5fd10352
--- /dev/null
+++ b/.github/workflows/labeler.yml
@@ -0,0 +1,11 @@
+name: "Labeler"
+on:
+- pull_request
+
+jobs:
+  label:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/labeler@v2
+      with:
+        repo-token: "${{ secrets.GITHUB_TOKEN }}"
-- 
cgit v1.2.3


From cc5de905e628c5e9aca7e7a333d6dd9638719b6a Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Thu, 23 Apr 2020 13:11:23 -0700
Subject: Fix test output so that filenames have the correct path.

Tested:
  Intentionally introduce an error and then run:
  blaze test --test_output=streamed //third_party/gvisor/test/packetimpact/tests:tcp_outside_the_window_linux_test
PiperOrigin-RevId: 308114194
---
 test/packetimpact/tests/test_runner.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/packetimpact/tests/test_runner.sh b/test/packetimpact/tests/test_runner.sh
index 46d63d5e5..e938de782 100755
--- a/test/packetimpact/tests/test_runner.sh
+++ b/test/packetimpact/tests/test_runner.sh
@@ -262,7 +262,10 @@ sleep 3
 # Start a packetimpact test on the test bench.  The packetimpact test sends and
 # receives packets and also sends POSIX socket commands to the posix_server to
 # be executed on the DUT.
-docker exec -t "${TESTBENCH}" \
+docker exec \
+  -e XML_OUTPUT_FILE="/test.xml" \
+  -e TEST_TARGET \
+  -t "${TESTBENCH}" \
   /bin/bash -c "${DOCKER_TESTBENCH_BINARY} \
   ${EXTRA_TEST_ARGS[@]-} \
   --posix_server_ip=${CTRL_NET_PREFIX}${DUT_NET_SUFFIX} \
-- 
cgit v1.2.3


From 5042ea7e2cbdc0c04fd454583589a3b1e152f95d Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 23 Apr 2020 15:35:56 -0700
Subject: Add vfs.MkdirOptions.ForSyntheticMountpoint.

PiperOrigin-RevId: 308143529
---
 pkg/sentry/fsimpl/gofer/directory.go  | 147 +++++++++++-----
 pkg/sentry/fsimpl/gofer/filesystem.go | 323 ++++++++++++++++++++++------------
 pkg/sentry/fsimpl/gofer/gofer.go      | 177 ++++++++++++-------
 pkg/sentry/fsimpl/gofer/gofer_test.go |   2 +-
 pkg/sentry/vfs/filesystem.go          |   3 +-
 pkg/sentry/vfs/options.go             |  19 ++
 runsc/boot/vfs.go                     |   6 +
 7 files changed, 461 insertions(+), 216 deletions(-)

diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index d02691232..c67766ab2 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -21,8 +21,10 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func (d *dentry) isDir() bool {
@@ -41,15 +43,46 @@ func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
 	d.children[name] = child
 }
 
-// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop !=
-// InteropModeShared.
-func (d *dentry) cacheNegativeChildLocked(name string) {
+// Preconditions: d.dirMu must be locked. d.isDir().
+func (d *dentry) cacheNegativeLookupLocked(name string) {
+	// Don't cache negative lookups if InteropModeShared is in effect (since
+	// this makes remote lookup unavoidable), or if d.isSynthetic() (in which
+	// case the only files in the directory are those for which a dentry exists
+	// in d.children). Instead, just delete any previously-cached dentry.
+	if d.fs.opts.interop == InteropModeShared || d.isSynthetic() {
+		delete(d.children, name)
+		return
+	}
 	if d.children == nil {
 		d.children = make(map[string]*dentry)
 	}
 	d.children[name] = nil
 }
 
+// createSyntheticDirectory creates a synthetic directory with the given name
+// in d.
+//
+// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain
+// a child with the given name.
+func (d *dentry) createSyntheticDirectoryLocked(name string, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) {
+	d2 := &dentry{
+		refs:      1, // held by d
+		fs:        d.fs,
+		mode:      uint32(mode) | linux.S_IFDIR,
+		uid:       uint32(kuid),
+		gid:       uint32(kgid),
+		blockSize: usermem.PageSize, // arbitrary
+		handle: handle{
+			fd: -1,
+		},
+	}
+	d2.pf.dentry = d2
+	d2.vfsd.Init(d2)
+
+	d.cacheNewChildLocked(d2, name)
+	d.syntheticChildren++
+}
+
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
@@ -77,7 +110,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 		fd.dirents = ds
 	}
 
-	if d.fs.opts.interop != InteropModeShared {
+	if d.cachedMetadataAuthoritative() {
 		d.touchAtime(fd.vfsfd.Mount())
 	}
 
@@ -108,10 +141,10 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 	// filesystem.renameMu is needed for d.parent, and must be locked before
 	// dentry.dirMu.
 	d.fs.renameMu.RLock()
+	defer d.fs.renameMu.RUnlock()
 	d.dirMu.Lock()
 	defer d.dirMu.Unlock()
 	if d.dirents != nil {
-		d.fs.renameMu.RUnlock()
 		return d.dirents, nil
 	}
 
@@ -132,51 +165,81 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 			NextOff: 2,
 		},
 	}
-	d.fs.renameMu.RUnlock()
-	off := uint64(0)
-	const count = 64 * 1024 // for consistency with the vfs1 client
-	d.handleMu.RLock()
-	defer d.handleMu.RUnlock()
-	if !d.handleReadable {
-		// This should not be possible because a readable handle should have
-		// been opened when the calling directoryFD was opened.
-		panic("gofer.dentry.getDirents called without a readable handle")
-	}
-	for {
-		p9ds, err := d.handle.file.readdir(ctx, off, count)
-		if err != nil {
-			return nil, err
+	var realChildren map[string]struct{}
+	if !d.isSynthetic() {
+		if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared {
+			// Record the set of children d actually has so that we don't emit
+			// duplicate entries for synthetic children.
+			realChildren = make(map[string]struct{})
 		}
-		if len(p9ds) == 0 {
-			// Cache dirents for future directoryFDs if permitted.
-			if d.fs.opts.interop != InteropModeShared {
-				d.dirents = dirents
+		off := uint64(0)
+		const count = 64 * 1024 // for consistency with the vfs1 client
+		d.handleMu.RLock()
+		if !d.handleReadable {
+			// This should not be possible because a readable handle should
+			// have been opened when the calling directoryFD was opened.
+			d.handleMu.RUnlock()
+			panic("gofer.dentry.getDirents called without a readable handle")
+		}
+		for {
+			p9ds, err := d.handle.file.readdir(ctx, off, count)
+			if err != nil {
+				d.handleMu.RUnlock()
+				return nil, err
+			}
+			if len(p9ds) == 0 {
+				d.handleMu.RUnlock()
+				break
+			}
+			for _, p9d := range p9ds {
+				if p9d.Name == "." || p9d.Name == ".." {
+					continue
+				}
+				dirent := vfs.Dirent{
+					Name:    p9d.Name,
+					Ino:     p9d.QID.Path,
+					NextOff: int64(len(dirents) + 1),
+				}
+				// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
+				// DMSOCKET.
+				switch p9d.Type {
+				case p9.TypeSymlink:
+					dirent.Type = linux.DT_LNK
+				case p9.TypeDir:
+					dirent.Type = linux.DT_DIR
+				default:
+					dirent.Type = linux.DT_REG
+				}
+				dirents = append(dirents, dirent)
+				if realChildren != nil {
+					realChildren[p9d.Name] = struct{}{}
+				}
 			}
-			return dirents, nil
+			off = p9ds[len(p9ds)-1].Offset
 		}
-		for _, p9d := range p9ds {
-			if p9d.Name == "." || p9d.Name == ".." {
+	}
+	// Emit entries for synthetic children.
+	if d.syntheticChildren != 0 {
+		for _, child := range d.children {
+			if child == nil || !child.isSynthetic() {
 				continue
 			}
-			dirent := vfs.Dirent{
-				Name:    p9d.Name,
-				Ino:     p9d.QID.Path,
-				NextOff: int64(len(dirents) + 1),
-			}
-			// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
-			// DMSOCKET.
-			switch p9d.Type {
-			case p9.TypeSymlink:
-				dirent.Type = linux.DT_LNK
-			case p9.TypeDir:
-				dirent.Type = linux.DT_DIR
-			default:
-				dirent.Type = linux.DT_REG
+			if _, ok := realChildren[child.name]; ok {
+				continue
 			}
-			dirents = append(dirents, dirent)
+			dirents = append(dirents, vfs.Dirent{
+				Name:    child.name,
+				Type:    uint8(atomic.LoadUint32(&child.mode) >> 12),
+				Ino:     child.ino,
+				NextOff: int64(len(dirents) + 1),
+			})
 		}
-		off = p9ds[len(p9ds)-1].Offset
 	}
+	// Cache dirents for future directoryFDs if permitted.
+	if d.cachedMetadataAuthoritative() {
+		d.dirents = dirents
+	}
+	return dirents, nil
 }
 
 // Seek implements vfs.FileDescriptionImpl.Seek.
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index eba4aabe8..98ccb42fd 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -29,14 +29,16 @@ import (
 
 // Sync implements vfs.FilesystemImpl.Sync.
 func (fs *filesystem) Sync(ctx context.Context) error {
-	// Snapshot current dentries and special files.
+	// Snapshot current syncable dentries and special files.
 	fs.syncMu.Lock()
-	ds := make([]*dentry, 0, len(fs.dentries))
-	for d := range fs.dentries {
+	ds := make([]*dentry, 0, len(fs.syncableDentries))
+	for d := range fs.syncableDentries {
+		d.IncRef()
 		ds = append(ds, d)
 	}
 	sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs))
 	for sffd := range fs.specialFileFDs {
+		sffd.vfsfd.IncRef()
 		sffds = append(sffds, sffd)
 	}
 	fs.syncMu.Unlock()
@@ -47,9 +49,6 @@ func (fs *filesystem) Sync(ctx context.Context) error {
 
 	// Sync regular files.
 	for _, d := range ds {
-		if !d.TryIncRef() {
-			continue
-		}
 		err := d.syncSharedHandle(ctx)
 		d.DecRef()
 		if err != nil && retErr == nil {
@@ -60,9 +59,6 @@ func (fs *filesystem) Sync(ctx context.Context) error {
 	// Sync special files, which may be writable but do not use dentry shared
 	// handles (so they won't be synced by the above).
 	for _, sffd := range sffds {
-		if !sffd.vfsfd.TryIncRef() {
-			continue
-		}
 		err := sffd.Sync(ctx)
 		sffd.vfsfd.DecRef()
 		if err != nil && retErr == nil {
@@ -114,8 +110,8 @@ func putDentrySlice(ds *[]*dentry) {
 // to *ds.
 //
 // Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached
-// metadata must be up to date.
+// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata
+// must be up to date.
 //
 // Postconditions: The returned dentry's cached metadata is up to date.
 func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
@@ -148,7 +144,7 @@ afterSymlink:
 		if err := rp.CheckMount(&d.parent.vfsd); err != nil {
 			return nil, err
 		}
-		if fs.opts.interop == InteropModeShared && d != d.parent {
+		if d != d.parent && !d.cachedMetadataAuthoritative() {
 			_, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask())
 			if err != nil {
 				return nil, err
@@ -195,7 +191,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil
 		return nil, syserror.ENAMETOOLONG
 	}
 	child, ok := parent.children[name]
-	if ok && fs.opts.interop != InteropModeShared {
+	if (ok && fs.opts.interop != InteropModeShared) || parent.isSynthetic() {
 		// Whether child is nil or not, it is cached information that is
 		// assumed to be correct.
 		return child, nil
@@ -206,7 +202,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil
 	return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds)
 }
 
-// Preconditions: As for getChildLocked.
+// Preconditions: As for getChildLocked. !parent.isSynthetic().
 func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
 	qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
 	if err != nil && err != syserror.ENOENT {
@@ -220,24 +216,41 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 			child.updateFromP9Attrs(attrMask, &attr)
 			return child, nil
 		}
-		// The file at this path has changed or no longer exists. Remove
-		// the stale dentry from the tree, and re-evaluate its caching
-		// status (i.e. if it has 0 references, drop it).
+		if file.isNil() && child.isSynthetic() {
+			// We have a synthetic file, and no remote file has arisen to
+			// replace it.
+			return child, nil
+		}
+		// The file at this path has changed or no longer exists. Mark the
+		// dentry invalidated, and re-evaluate its caching status (i.e. if it
+		// has 0 references, drop it). Wait to update parent.children until we
+		// know what to replace the existing dentry with (i.e. one of the
+		// returns below), to avoid a redundant map access.
 		vfsObj.InvalidateDentry(&child.vfsd)
+		if child.isSynthetic() {
+			// Normally we don't mark invalidated dentries as deleted since
+			// they may still exist (but at a different path), and also for
+			// consistency with Linux. However, synthetic files are guaranteed
+			// to become unreachable if their dentries are invalidated, so
+			// treat their invalidation as deletion.
+			child.setDeleted()
+			parent.syntheticChildren--
+			child.decRefLocked()
+			parent.dirents = nil
+		}
 		*ds = appendDentry(*ds, child)
 	}
 	if file.isNil() {
 		// No file exists at this path now. Cache the negative lookup if
 		// allowed.
-		if fs.opts.interop != InteropModeShared {
-			parent.cacheNegativeChildLocked(name)
-		}
+		parent.cacheNegativeLookupLocked(name)
 		return nil, nil
 	}
 	// Create a new dentry representing the file.
 	child, err = fs.newDentry(ctx, file, qid, attrMask, &attr)
 	if err != nil {
 		file.close(ctx)
+		delete(parent.children, name)
 		return nil, err
 	}
 	parent.cacheNewChildLocked(child, name)
@@ -252,8 +265,9 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 // rp.Start().Impl().(*dentry)). It does not check that the returned directory
 // is searchable by the provider of rp.
 //
-// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop ==
-// InteropModeShared, then d's cached metadata must be up to date.
+// Preconditions: fs.renameMu must be locked. !rp.Done(). If
+// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to
+// date.
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
 	for !rp.Final() {
 		d.dirMu.Lock()
@@ -275,7 +289,7 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
 // Preconditions: fs.renameMu must be locked.
 func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
 	d := rp.Start().Impl().(*dentry)
-	if fs.opts.interop == InteropModeShared {
+	if !d.cachedMetadataAuthoritative() {
 		// Get updated metadata for rp.Start() as required by fs.stepLocked().
 		if err := d.updateFromGetattr(ctx); err != nil {
 			return nil, err
@@ -297,16 +311,17 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 }
 
 // doCreateAt checks that creating a file at rp is permitted, then invokes
-// create to do so.
+// createInRemoteDir (if the parent directory is a real remote directory) or
+// createInSyntheticDir (if the parent directory is synthetic) to do so.
 //
 // Preconditions: !rp.Done(). For the final path component in rp,
 // !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 	start := rp.Start().Impl().(*dentry)
-	if fs.opts.interop == InteropModeShared {
+	if !start.cachedMetadataAuthoritative() {
 		// Get updated metadata for start as required by
 		// fs.walkParentDirLocked().
 		if err := start.updateFromGetattr(ctx); err != nil {
@@ -340,6 +355,20 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	defer mnt.EndWrite()
 	parent.dirMu.Lock()
 	defer parent.dirMu.Unlock()
+	if parent.isSynthetic() {
+		if child := parent.children[name]; child != nil {
+			return syserror.EEXIST
+		}
+		if createInSyntheticDir == nil {
+			return syserror.EPERM
+		}
+		if err := createInSyntheticDir(parent, name); err != nil {
+			return err
+		}
+		parent.touchCMtime()
+		parent.dirents = nil
+		return nil
+	}
 	if fs.opts.interop == InteropModeShared {
 		// The existence of a dentry at name would be inconclusive because the
 		// file it represents may have been deleted from the remote filesystem,
@@ -348,21 +377,21 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 		// will fail with EEXIST like we would have. If the RPC succeeds, and a
 		// stale dentry exists, the dentry will fail revalidation next time
 		// it's used.
-		return create(parent, name)
+		return createInRemoteDir(parent, name)
 	}
 	if child := parent.children[name]; child != nil {
 		return syserror.EEXIST
 	}
 	// No cached dentry exists; however, there might still be an existing file
 	// at name. As above, we attempt the file creation RPC anyway.
-	if err := create(parent, name); err != nil {
+	if err := createInRemoteDir(parent, name); err != nil {
 		return err
 	}
+	if child, ok := parent.children[name]; ok && child == nil {
+		// Delete the now-stale negative dentry.
+		delete(parent.children, name)
+	}
 	parent.touchCMtime()
-	// Either parent.children[name] doesn't exist (in which case this is a
-	// no-op) or is nil (in which case this erases the now-stale information
-	// that the file doesn't exist).
-	delete(parent.children, name)
 	parent.dirents = nil
 	return nil
 }
@@ -373,7 +402,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 	start := rp.Start().Impl().(*dentry)
-	if fs.opts.interop == InteropModeShared {
+	if !start.cachedMetadataAuthoritative() {
 		// Get updated metadata for start as required by
 		// fs.walkParentDirLocked().
 		if err := start.updateFromGetattr(ctx); err != nil {
@@ -421,8 +450,10 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	// only revalidating the dentry if that fails (indicating that the existing
 	// dentry is a mount point).
 	if child != nil {
+		child.dirMu.Lock()
+		defer child.dirMu.Unlock()
 		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
-			if fs.opts.interop != InteropModeShared {
+			if parent.cachedMetadataAuthoritative() {
 				return err
 			}
 			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
@@ -437,13 +468,37 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 		}
 	}
 	flags := uint32(0)
+	// If a dentry exists, use it for best-effort checks on its deletability.
 	if dir {
-		if child != nil && !child.isDir() {
-			vfsObj.AbortDeleteDentry(&child.vfsd)
-			return syserror.ENOTDIR
+		if child != nil {
+			// child must be an empty directory.
+			if child.syntheticChildren != 0 {
+				// This is definitely not an empty directory, irrespective of
+				// fs.opts.interop.
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+				return syserror.ENOTEMPTY
+			}
+			// If InteropModeShared is in effect and the first call to
+			// PrepareDeleteDentry above succeeded, then child wasn't
+			// revalidated (so we can't expect its file type to be correct) and
+			// individually revalidating its children (to confirm that they
+			// still exist) would be a waste of time.
+			if child.cachedMetadataAuthoritative() {
+				if !child.isDir() {
+					vfsObj.AbortDeleteDentry(&child.vfsd)
+					return syserror.ENOTDIR
+				}
+				for _, grandchild := range child.children {
+					if grandchild != nil {
+						vfsObj.AbortDeleteDentry(&child.vfsd)
+						return syserror.ENOTEMPTY
+					}
+				}
+			}
 		}
 		flags = linux.AT_REMOVEDIR
 	} else {
+		// child must be a non-directory file.
 		if child != nil && child.isDir() {
 			vfsObj.AbortDeleteDentry(&child.vfsd)
 			return syserror.EISDIR
@@ -455,28 +510,36 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 			return syserror.ENOTDIR
 		}
 	}
-	err = parent.file.unlinkAt(ctx, name, flags)
-	if err != nil {
-		if child != nil {
-			vfsObj.AbortDeleteDentry(&child.vfsd)
-		}
-		return err
-	}
-	if fs.opts.interop != InteropModeShared {
-		parent.touchCMtime()
-		if dir {
-			parent.decLinks()
+	if parent.isSynthetic() {
+		if child == nil {
+			return syserror.ENOENT
 		}
-		parent.cacheNegativeChildLocked(name)
-		parent.dirents = nil
 	} else {
-		delete(parent.children, name)
+		err = parent.file.unlinkAt(ctx, name, flags)
+		if err != nil {
+			if child != nil {
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+			}
+			return err
+		}
 	}
 	if child != nil {
-		child.setDeleted()
 		vfsObj.CommitDeleteDentry(&child.vfsd)
+		child.setDeleted()
+		if child.isSynthetic() {
+			parent.syntheticChildren--
+			child.decRefLocked()
+		}
 		ds = appendDentry(ds, child)
 	}
+	parent.cacheNegativeLookupLocked(name)
+	if parent.cachedMetadataAuthoritative() {
+		parent.dirents = nil
+		parent.touchCMtime()
+		if dir {
+			parent.decLinks()
+		}
+	}
 	return nil
 }
 
@@ -554,7 +617,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 	start := rp.Start().Impl().(*dentry)
-	if fs.opts.interop == InteropModeShared {
+	if !start.cachedMetadataAuthoritative() {
 		// Get updated metadata for start as required by
 		// fs.walkParentDirLocked().
 		if err := start.updateFromGetattr(ctx); err != nil {
@@ -577,20 +640,32 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 		}
 		// 9P2000.L supports hard links, but we don't.
 		return syserror.EPERM
-	})
+	}, nil)
 }
 
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	creds := rp.Credentials()
 	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
-		creds := rp.Credentials()
 		if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
-			return err
+			if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
+				return err
+			}
+			ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err)
+			parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID)
 		}
 		if fs.opts.interop != InteropModeShared {
 			parent.incLinks()
 		}
 		return nil
+	}, func(parent *dentry, name string) error {
+		if !opts.ForSyntheticMountpoint {
+			// Can't create non-synthetic files in synthetic directories.
+			return syserror.EPERM
+		}
+		parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID)
+		parent.incLinks()
+		return nil
 	})
 }
 
@@ -600,7 +675,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		creds := rp.Credentials()
 		_, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
 		return err
-	})
+	}, nil)
 }
 
 // OpenAt implements vfs.FilesystemImpl.OpenAt.
@@ -620,7 +695,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 
 	start := rp.Start().Impl().(*dentry)
-	if fs.opts.interop == InteropModeShared {
+	if !start.cachedMetadataAuthoritative() {
 		// Get updated metadata for start as required by fs.stepLocked().
 		if err := start.updateFromGetattr(ctx); err != nil {
 			return nil, err
@@ -643,6 +718,10 @@ afterTrailingSymlink:
 	parent.dirMu.Lock()
 	child, err := fs.stepLocked(ctx, rp, parent, &ds)
 	if err == syserror.ENOENT && mayCreate {
+		if parent.isSynthetic() {
+			parent.dirMu.Unlock()
+			return nil, syserror.EPERM
+		}
 		fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts)
 		parent.dirMu.Unlock()
 		return fd, err
@@ -702,8 +781,10 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 		if opts.Flags&linux.O_DIRECT != 0 {
 			return nil, syserror.EINVAL
 		}
-		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
-			return nil, err
+		if !d.isSynthetic() {
+			if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
+				return nil, err
+			}
 		}
 		fd := &directoryFD{}
 		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
@@ -733,6 +814,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 }
 
 // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
+// !d.isSynthetic().
 func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
 	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 		return nil, err
@@ -811,7 +893,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 	child.refs = 1
 	// Insert the dentry into the tree.
 	d.cacheNewChildLocked(child, name)
-	if d.fs.opts.interop != InteropModeShared {
+	if d.cachedMetadataAuthoritative() {
 		d.touchCMtime()
 		d.dirents = nil
 	}
@@ -888,7 +970,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	defer mnt.EndWrite()
 
 	oldParent := oldParentVD.Dentry().Impl().(*dentry)
-	if fs.opts.interop == InteropModeShared {
+	if !oldParent.cachedMetadataAuthoritative() {
 		if err := oldParent.updateFromGetattr(ctx); err != nil {
 			return err
 		}
@@ -933,35 +1015,22 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if newParent.isDeleted() {
 		return syserror.ENOENT
 	}
-	replaced := newParent.children[newName]
-	// This is similar to unlinkAt, except:
-	//
-	// - If a dentry exists for the file to be replaced, we revalidate it
-	// unconditionally (instead of only if PrepareRenameDentry fails) for
-	// simplicity.
-	//
-	// - If rp.MustBeDir(), then we need a dentry representing the replaced
-	// file regardless to confirm that it's a directory.
-	if replaced != nil || rp.MustBeDir() {
-		replaced, err = fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds)
-		if err != nil {
-			return err
-		}
-		if replaced != nil {
-			if replaced.isDir() {
-				if !renamed.isDir() {
-					return syserror.EISDIR
-				}
-			} else {
-				if rp.MustBeDir() || renamed.isDir() {
-					return syserror.ENOTDIR
-				}
-			}
-		}
+	replaced, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds)
+	if err != nil {
+		return err
 	}
 	var replacedVFSD *vfs.Dentry
 	if replaced != nil {
 		replacedVFSD = &replaced.vfsd
+		if replaced.isDir() {
+			if !renamed.isDir() {
+				return syserror.EISDIR
+			}
+		} else {
+			if rp.MustBeDir() || renamed.isDir() {
+				return syserror.ENOTDIR
+			}
+		}
 	}
 
 	if oldParent == newParent && oldName == newName {
@@ -972,27 +1041,47 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
 		return err
 	}
-	if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
-		vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
-		return err
+
+	// Update the remote filesystem.
+	if !renamed.isSynthetic() {
+		if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
+			vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+			return err
+		}
+	} else if replaced != nil && !replaced.isSynthetic() {
+		// We are replacing an existing real file with a synthetic one, so we
+		// need to unlink the former.
+		flags := uint32(0)
+		if replaced.isDir() {
+			flags = linux.AT_REMOVEDIR
+		}
+		if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil {
+			vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+			return err
+		}
 	}
-	if fs.opts.interop != InteropModeShared {
-		oldParent.cacheNegativeChildLocked(oldName)
-		oldParent.dirents = nil
-		newParent.dirents = nil
-		if renamed.isDir() {
-			oldParent.decLinks()
-			newParent.incLinks()
+
+	// Update the dentry tree.
+	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
+	if replaced != nil {
+		replaced.setDeleted()
+		if replaced.isSynthetic() {
+			newParent.syntheticChildren--
+			replaced.decRefLocked()
 		}
-		oldParent.touchCMtime()
-		newParent.touchCMtime()
-		renamed.touchCtime()
-	} else {
-		delete(oldParent.children, oldName)
+		ds = appendDentry(ds, replaced)
 	}
+	oldParent.cacheNegativeLookupLocked(oldName)
+	// We don't use newParent.cacheNewChildLocked() since we don't want to mess
+	// with reference counts and queue oldParent for checkCachingLocked if the
+	// parent isn't actually changing.
 	if oldParent != newParent {
-		appendDentry(ds, oldParent)
+		ds = appendDentry(ds, oldParent)
 		newParent.IncRef()
+		if renamed.isSynthetic() {
+			oldParent.syntheticChildren--
+			newParent.syntheticChildren++
+		}
 	}
 	renamed.parent = newParent
 	renamed.name = newName
@@ -1000,11 +1089,25 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		newParent.children = make(map[string]*dentry)
 	}
 	newParent.children[newName] = renamed
-	if replaced != nil {
-		replaced.setDeleted()
-		appendDentry(ds, replaced)
+
+	// Update metadata.
+	if renamed.cachedMetadataAuthoritative() {
+		renamed.touchCtime()
+	}
+	if oldParent.cachedMetadataAuthoritative() {
+		oldParent.dirents = nil
+		oldParent.touchCMtime()
+		if renamed.isDir() {
+			oldParent.decLinks()
+		}
+	}
+	if newParent.cachedMetadataAuthoritative() {
+		newParent.dirents = nil
+		newParent.touchCMtime()
+		if renamed.isDir() {
+			newParent.incLinks()
+		}
 	}
-	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
 	return nil
 }
 
@@ -1051,6 +1154,10 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 	if err != nil {
 		return linux.Statfs{}, err
 	}
+	// If d is synthetic, invoke statfs on the first ancestor of d that isn't.
+	for d.isSynthetic() {
+		d = d.parent
+	}
 	fsstat, err := d.file.statFS(ctx)
 	if err != nil {
 		return linux.Statfs{}, err
@@ -1080,7 +1187,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 		creds := rp.Credentials()
 		_, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
 		return err
-	})
+	}, nil)
 }
 
 // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 293df2545..8b4e91d17 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -27,8 +27,9 @@
 //             dentry.handleMu
 //               dentry.dataMu
 //
-// Locking dentry.dirMu in multiple dentries requires holding
-// filesystem.renameMu for writing.
+// Locking dentry.dirMu in multiple dentries requires that either ancestor
+// dentries are locked before descendant dentries, or that filesystem.renameMu
+// is locked for writing.
 package gofer
 
 import (
@@ -102,11 +103,12 @@ type filesystem struct {
 	cachedDentries    dentryList
 	cachedDentriesLen uint64
 
-	// dentries contains all dentries in this filesystem. specialFileFDs
-	// contains all open specialFileFDs. These fields are protected by syncMu.
-	syncMu         sync.Mutex
-	dentries       map[*dentry]struct{}
-	specialFileFDs map[*specialFileFD]struct{}
+	// syncableDentries contains all dentries in this filesystem for which
+	// !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs.
+	// These fields are protected by syncMu.
+	syncMu           sync.Mutex
+	syncableDentries map[*dentry]struct{}
+	specialFileFDs   map[*specialFileFD]struct{}
 }
 
 type filesystemOptions struct {
@@ -187,7 +189,8 @@ const (
 	// InteropModeShared is appropriate when there are users of the remote
 	// filesystem that may mutate its state other than the client.
 	//
-	// - The client must verify cached filesystem state before using it.
+	// - The client must verify ("revalidate") cached filesystem state before
+	// using it.
 	//
 	// - Client changes to filesystem state must be sent to the remote
 	// filesystem synchronously.
@@ -376,14 +379,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 
 	// Construct the filesystem object.
 	fs := &filesystem{
-		mfp:            mfp,
-		opts:           fsopts,
-		uid:            creds.EffectiveKUID,
-		gid:            creds.EffectiveKGID,
-		client:         client,
-		clock:          ktime.RealtimeClockFromContext(ctx),
-		dentries:       make(map[*dentry]struct{}),
-		specialFileFDs: make(map[*specialFileFD]struct{}),
+		mfp:              mfp,
+		opts:             fsopts,
+		uid:              creds.EffectiveKUID,
+		gid:              creds.EffectiveKGID,
+		client:           client,
+		clock:            ktime.RealtimeClockFromContext(ctx),
+		syncableDentries: make(map[*dentry]struct{}),
+		specialFileFDs:   make(map[*specialFileFD]struct{}),
 	}
 	fs.vfsfs.Init(vfsObj, &fstype, fs)
 
@@ -409,7 +412,7 @@ func (fs *filesystem) Release() {
 	mf := fs.mfp.MemoryFile()
 
 	fs.syncMu.Lock()
-	for d := range fs.dentries {
+	for d := range fs.syncableDentries {
 		d.handleMu.Lock()
 		d.dataMu.Lock()
 		if d.handleWritable {
@@ -444,9 +447,11 @@ type dentry struct {
 	vfsd vfs.Dentry
 
 	// refs is the reference count. Each dentry holds a reference on its
-	// parent, even if disowned. refs is accessed using atomic memory
-	// operations. When refs reaches 0, the dentry may be added to the cache or
-	// destroyed. If refs==-1 the dentry has already been destroyed.
+	// parent, even if disowned. An additional reference is held on all
+	// synthetic dentries until they are unlinked or invalidated. When refs
+	// reaches 0, the dentry may be added to the cache or destroyed. If refs ==
+	// -1, the dentry has already been destroyed. refs is accessed using atomic
+	// memory operations.
 	refs int64
 
 	// fs is the owning filesystem. fs is immutable.
@@ -465,6 +470,12 @@ type dentry struct {
 	// We don't support hard links, so each dentry maps 1:1 to an inode.
 
 	// file is the unopened p9.File that backs this dentry. file is immutable.
+	//
+	// If file.isNil(), this dentry represents a synthetic file, i.e. a file
+	// that does not exist on the remote filesystem. As of this writing, this
+	// is only possible for a directory created with
+	// MkdirOptions.ForSyntheticMountpoint == true.
+	// TODO(gvisor.dev/issue/1476): Support synthetic sockets (and pipes).
 	file p9file
 
 	// If deleted is non-zero, the file represented by this dentry has been
@@ -484,15 +495,21 @@ type dentry struct {
 	// - Mappings of child filenames to dentries representing those children.
 	//
 	// - Mappings of child filenames that are known not to exist to nil
-	// dentries (only if InteropModeShared is not in effect).
+	// dentries (only if InteropModeShared is not in effect and the directory
+	// is not synthetic).
 	//
 	// children is protected by dirMu.
 	children map[string]*dentry
 
-	// If this dentry represents a directory, InteropModeShared is not in
-	// effect, and dirents is not nil, it is a cache of all entries in the
-	// directory, in the order they were returned by the server. dirents is
-	// protected by dirMu.
+	// If this dentry represents a directory, syntheticChildren is the number
+	// of child dentries for which dentry.isSynthetic() == true.
+	// syntheticChildren is protected by dirMu.
+	syntheticChildren int
+
+	// If this dentry represents a directory,
+	// dentry.cachedMetadataAuthoritative() == true, and dirents is not nil, it
+	// is a cache of all entries in the directory, in the order they were
+	// returned by the server. dirents is protected by dirMu.
 	dirents []vfs.Dirent
 
 	// Cached metadata; protected by metadataMu and accessed using atomic
@@ -589,6 +606,8 @@ func dentryAttrMask() p9.AttrMask {
 // initially has no references, but is not cached; it is the caller's
 // responsibility to set the dentry's reference count and/or call
 // dentry.checkCachingLocked() as appropriate.
+//
+// Preconditions: !file.isNil().
 func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) {
 	if !mask.Mode {
 		ctx.Warningf("can't create gofer.dentry without file type")
@@ -612,10 +631,10 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 		},
 	}
 	d.pf.dentry = d
-	if mask.UID {
+	if mask.UID && attr.UID != auth.NoID {
 		d.uid = uint32(attr.UID)
 	}
-	if mask.GID {
+	if mask.GID && attr.GID != auth.NoID {
 		d.gid = uint32(attr.GID)
 	}
 	if mask.Size {
@@ -642,11 +661,19 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 	d.vfsd.Init(d)
 
 	fs.syncMu.Lock()
-	fs.dentries[d] = struct{}{}
+	fs.syncableDentries[d] = struct{}{}
 	fs.syncMu.Unlock()
 	return d, nil
 }
 
+func (d *dentry) isSynthetic() bool {
+	return d.file.isNil()
+}
+
+func (d *dentry) cachedMetadataAuthoritative() bool {
+	return d.fs.opts.interop != InteropModeShared || d.isSynthetic()
+}
+
 // updateFromP9Attrs is called to update d's metadata after an update from the
 // remote filesystem.
 func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
@@ -691,6 +718,7 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
 	d.metadataMu.Unlock()
 }
 
+// Preconditions: !d.isSynthetic()
 func (d *dentry) updateFromGetattr(ctx context.Context) error {
 	// Use d.handle.file, which represents a 9P fid that has been opened, in
 	// preference to d.file, which represents a 9P fid that has not. This may
@@ -758,7 +786,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	defer mnt.EndWrite()
 	setLocalAtime := false
 	setLocalMtime := false
-	if d.fs.opts.interop != InteropModeShared {
+	if d.cachedMetadataAuthoritative() {
 		// Timestamp updates will be handled locally.
 		setLocalAtime = stat.Mask&linux.STATX_ATIME != 0
 		setLocalMtime = stat.Mask&linux.STATX_MTIME != 0
@@ -771,35 +799,37 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	}
 	d.metadataMu.Lock()
 	defer d.metadataMu.Unlock()
-	if stat.Mask != 0 {
-		if err := d.file.setAttr(ctx, p9.SetAttrMask{
-			Permissions:        stat.Mask&linux.STATX_MODE != 0,
-			UID:                stat.Mask&linux.STATX_UID != 0,
-			GID:                stat.Mask&linux.STATX_GID != 0,
-			Size:               stat.Mask&linux.STATX_SIZE != 0,
-			ATime:              stat.Mask&linux.STATX_ATIME != 0,
-			MTime:              stat.Mask&linux.STATX_MTIME != 0,
-			ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW,
-			MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW,
-		}, p9.SetAttr{
-			Permissions:      p9.FileMode(stat.Mode),
-			UID:              p9.UID(stat.UID),
-			GID:              p9.GID(stat.GID),
-			Size:             stat.Size,
-			ATimeSeconds:     uint64(stat.Atime.Sec),
-			ATimeNanoSeconds: uint64(stat.Atime.Nsec),
-			MTimeSeconds:     uint64(stat.Mtime.Sec),
-			MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
-		}); err != nil {
-			return err
+	if !d.isSynthetic() {
+		if stat.Mask != 0 {
+			if err := d.file.setAttr(ctx, p9.SetAttrMask{
+				Permissions:        stat.Mask&linux.STATX_MODE != 0,
+				UID:                stat.Mask&linux.STATX_UID != 0,
+				GID:                stat.Mask&linux.STATX_GID != 0,
+				Size:               stat.Mask&linux.STATX_SIZE != 0,
+				ATime:              stat.Mask&linux.STATX_ATIME != 0,
+				MTime:              stat.Mask&linux.STATX_MTIME != 0,
+				ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW,
+				MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW,
+			}, p9.SetAttr{
+				Permissions:      p9.FileMode(stat.Mode),
+				UID:              p9.UID(stat.UID),
+				GID:              p9.GID(stat.GID),
+				Size:             stat.Size,
+				ATimeSeconds:     uint64(stat.Atime.Sec),
+				ATimeNanoSeconds: uint64(stat.Atime.Nsec),
+				MTimeSeconds:     uint64(stat.Mtime.Sec),
+				MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
+			}); err != nil {
+				return err
+			}
+		}
+		if d.fs.opts.interop == InteropModeShared {
+			// There's no point to updating d's metadata in this case since
+			// it'll be overwritten by revalidation before the next time it's
+			// used anyway. (InteropModeShared inhibits client caching of
+			// regular file data, so there's no cache to truncate either.)
+			return nil
 		}
-	}
-	if d.fs.opts.interop == InteropModeShared {
-		// There's no point to updating d's metadata in this case since it'll
-		// be overwritten by revalidation before the next time it's used
-		// anyway. (InteropModeShared inhibits client caching of regular file
-		// data, so there's no cache to truncate either.)
-		return nil
 	}
 	now := d.fs.clock.Now().Nanoseconds()
 	if stat.Mask&linux.STATX_MODE != 0 {
@@ -897,6 +927,15 @@ func (d *dentry) DecRef() {
 	}
 }
 
+// decRefLocked decrements d's reference count without calling
+// d.checkCachingLocked, even if d's reference count reaches 0; callers are
+// responsible for ensuring that d.checkCachingLocked will be called later.
+func (d *dentry) decRefLocked() {
+	if refs := atomic.AddInt64(&d.refs, -1); refs < 0 {
+		panic("gofer.dentry.decRefLocked() called without holding a reference")
+	}
+}
+
 // checkCachingLocked should be called after d's reference count becomes 0 or it
 // becomes disowned.
 //
@@ -1013,11 +1052,11 @@ func (d *dentry) destroyLocked() {
 	if !d.file.isNil() {
 		d.file.close(ctx)
 		d.file = p9file{}
+		// Remove d from the set of syncable dentries.
+		d.fs.syncMu.Lock()
+		delete(d.fs.syncableDentries, d)
+		d.fs.syncMu.Unlock()
 	}
-	// Remove d from the set of all dentries.
-	d.fs.syncMu.Lock()
-	delete(d.fs.dentries, d)
-	d.fs.syncMu.Unlock()
 	// Drop the reference held by d on its parent without recursively locking
 	// d.fs.renameMu.
 	if d.parent != nil {
@@ -1040,6 +1079,9 @@ func (d *dentry) setDeleted() {
 // We only support xattrs prefixed with "user." (see b/148380782). Currently,
 // there is no need to expose any other xattrs through a gofer.
 func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+	if d.file.isNil() {
+		return nil, nil
+	}
 	xattrMap, err := d.file.listXattr(ctx, size)
 	if err != nil {
 		return nil, err
@@ -1054,6 +1096,9 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
 }
 
 func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+	if d.file.isNil() {
+		return "", syserror.ENODATA
+	}
 	if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
 		return "", err
 	}
@@ -1064,6 +1109,9 @@ func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vf
 }
 
 func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+	if d.file.isNil() {
+		return syserror.EPERM
+	}
 	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
 		return err
 	}
@@ -1074,6 +1122,9 @@ func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vf
 }
 
 func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+	if d.file.isNil() {
+		return syserror.EPERM
+	}
 	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
 		return err
 	}
@@ -1083,7 +1134,7 @@ func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name
 	return d.file.removeXattr(ctx, name)
 }
 
-// Preconditions: d.isRegularFile() || d.isDirectory().
+// Preconditions: !d.file.isNil(). d.isRegularFile() || d.isDirectory().
 func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
 	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
 	// O_TRUNC).
@@ -1213,7 +1264,7 @@ func (fd *fileDescription) dentry() *dentry {
 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	d := fd.dentry()
 	const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME)
-	if d.fs.opts.interop == InteropModeShared && opts.Mask&(validMask) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
+	if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
 		// TODO(jamieliu): Use specialFileFD.handle.file for the getattr if
 		// available?
 		if err := d.updateFromGetattr(ctx); err != nil {
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
index 4041fb252..adff39490 100644
--- a/pkg/sentry/fsimpl/gofer/gofer_test.go
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -24,7 +24,7 @@ import (
 
 func TestDestroyIdempotent(t *testing.T) {
 	fs := filesystem{
-		dentries: make(map[*dentry]struct{}),
+		syncableDentries: make(map[*dentry]struct{}),
 		opts: filesystemOptions{
 			// Test relies on no dentry being held in the cache.
 			maxCachedDentries: 0,
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 74577bc2f..20e5bb072 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -443,8 +443,7 @@ type FilesystemImpl interface {
 	// Errors:
 	//
 	// - If extended attributes are not supported by the filesystem,
-	// ListxattrAt returns nil. (See FileDescription.Listxattr for an
-	// explanation.)
+	// ListxattrAt returns ENOTSUP.
 	//
 	// - If the size of the list (including a NUL terminating byte after every
 	// entry) would exceed size, ERANGE may be returned. Note that
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 534528ce6..022bac127 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -33,6 +33,25 @@ type GetDentryOptions struct {
 type MkdirOptions struct {
 	// Mode is the file mode bits for the created directory.
 	Mode linux.FileMode
+
+	// If ForSyntheticMountpoint is true, FilesystemImpl.MkdirAt() may create
+	// the given directory in memory only (as opposed to persistent storage).
+	// The created directory should be able to support the creation of
+	// subdirectories with ForSyntheticMountpoint == true. It does not need to
+	// support the creation of subdirectories with ForSyntheticMountpoint ==
+	// false, or files of other types.
+	//
+	// FilesystemImpls are permitted to ignore the ForSyntheticMountpoint
+	// option.
+	//
+	// The ForSyntheticMountpoint option exists because, unlike mount(2), the
+	// OCI Runtime Specification permits the specification of mount points that
+	// do not exist, under the expectation that container runtimes will create
+	// them. (More accurately, the OCI Runtime Specification completely fails
+	// to document this feature, but it's implemented by runc.)
+	// ForSyntheticMountpoint allows such mount points to be created even when
+	// the underlying persistent filesystem is immutable.
+	ForSyntheticMountpoint bool
 }
 
 // MknodOptions contains options to VirtualFilesystem.MknodAt() and
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 82083c57d..bce3a3593 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -251,6 +251,12 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
 	// All writes go to upper, be paranoid and make lower readonly.
 	opts.ReadOnly = useOverlay
 
+	if err := c.k.VFS().MkdirAt(ctx, creds, target, &vfs.MkdirOptions{
+		ForSyntheticMountpoint: true,
+	}); err != nil && err != syserror.EEXIST {
+		// Log a warning, but attempt the mount anyway.
+		log.Warningf("Failed to create mount point at %q: %v", submount.Destination, err)
+	}
 	if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil {
 		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
 	}
-- 
cgit v1.2.3


From 93dd47146185ec7004f514e23bad9f225f55efb1 Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Thu, 23 Apr 2020 15:47:59 -0700
Subject: Enable automated marshalling for epoll events.

Ensure we use the correct architecture-specific defintion of epoll
event, and use go-marshal for serialization.

PiperOrigin-RevId: 308145677
---
 pkg/abi/linux/epoll_amd64.go                   |  4 ++-
 pkg/abi/linux/epoll_arm64.go                   |  4 ++-
 pkg/sentry/kernel/epoll/BUILD                  |  1 +
 pkg/sentry/kernel/epoll/epoll.go               | 20 +++---------
 pkg/sentry/syscalls/epoll.go                   |  3 +-
 pkg/sentry/syscalls/linux/sys_epoll.go         | 27 ++--------------
 pkg/sentry/syscalls/linux/vfs2/BUILD           |  1 -
 pkg/sentry/syscalls/linux/vfs2/epoll.go        |  7 ++--
 pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go | 44 --------------------------
 9 files changed, 20 insertions(+), 91 deletions(-)
 delete mode 100644 pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go

diff --git a/pkg/abi/linux/epoll_amd64.go b/pkg/abi/linux/epoll_amd64.go
index 34ff18009..7e74b1143 100644
--- a/pkg/abi/linux/epoll_amd64.go
+++ b/pkg/abi/linux/epoll_amd64.go
@@ -12,11 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build amd64
+
 package linux
 
 // EpollEvent is equivalent to struct epoll_event from epoll(2).
 //
-// +marshal
+// +marshal slice:EpollEventSlice
 type EpollEvent struct {
 	Events uint32
 	// Linux makes struct epoll_event::data a __u64. We represent it as
diff --git a/pkg/abi/linux/epoll_arm64.go b/pkg/abi/linux/epoll_arm64.go
index f86c35329..a35939cc9 100644
--- a/pkg/abi/linux/epoll_arm64.go
+++ b/pkg/abi/linux/epoll_arm64.go
@@ -12,11 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build arm64
+
 package linux
 
 // EpollEvent is equivalent to struct epoll_event from epoll(2).
 //
-// +marshal
+// +marshal slice:EpollEventSlice
 type EpollEvent struct {
 	Events uint32
 	// Linux makes struct epoll_event a __u64, necessitating 4 bytes of padding
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD
index dedf0fa15..75eedd5a2 100644
--- a/pkg/sentry/kernel/epoll/BUILD
+++ b/pkg/sentry/kernel/epoll/BUILD
@@ -24,6 +24,7 @@ go_library(
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
+        "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/refs",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 592650923..3d78cd48f 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -30,19 +31,6 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// Event describes the event mask that was observed and the user data to be
-// returned when one of the events occurs. It has this format to match the linux
-// format to avoid extra copying/allocation when writing events to userspace.
-type Event struct {
-	// Events is the event mask containing the set of events that have been
-	// observed on an entry.
-	Events uint32
-
-	// Data is an opaque 64-bit value provided by the caller when adding the
-	// entry, and returned to the caller when the entry reports an event.
-	Data [2]int32
-}
-
 // EntryFlags is a bitmask that holds an entry's flags.
 type EntryFlags int
 
@@ -227,9 +215,9 @@ func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask {
 }
 
 // ReadEvents returns up to max available events.
-func (e *EventPoll) ReadEvents(max int) []Event {
+func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent {
 	var local pollEntryList
-	var ret []Event
+	var ret []linux.EpollEvent
 
 	e.listsMu.Lock()
 
@@ -251,7 +239,7 @@ func (e *EventPoll) ReadEvents(max int) []Event {
 		}
 
 		// Add event to the array that will be returned to caller.
-		ret = append(ret, Event{
+		ret = append(ret, linux.EpollEvent{
 			Events: uint32(ready),
 			Data:   entry.userData,
 		})
diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go
index 87dcad18b..d9fb808c0 100644
--- a/pkg/sentry/syscalls/epoll.go
+++ b/pkg/sentry/syscalls/epoll.go
@@ -17,6 +17,7 @@ package syscalls
 import (
 	"time"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -118,7 +119,7 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error {
 }
 
 // WaitEpoll implements the epoll_wait(2) linux syscall.
-func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]epoll.Event, error) {
+func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEvent, error) {
 	// Get epoll from the file descriptor.
 	epollfile := t.GetFile(fd)
 	if epollfile == nil {
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index 3ab93fbde..51bf205cf 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -21,7 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
 	"gvisor.dev/gvisor/pkg/sentry/syscalls"
 	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -72,7 +71,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	var data [2]int32
 	if op != linux.EPOLL_CTL_DEL {
 		var e linux.EpollEvent
-		if _, err := t.CopyIn(eventAddr, &e); err != nil {
+		if _, err := e.CopyIn(t, eventAddr); err != nil {
 			return 0, nil, err
 		}
 
@@ -105,28 +104,6 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 }
 
-// copyOutEvents copies epoll events from the kernel to user memory.
-func copyOutEvents(t *kernel.Task, addr usermem.Addr, e []epoll.Event) error {
-	const itemLen = 12
-	buffLen := len(e) * itemLen
-	if _, ok := addr.AddLength(uint64(buffLen)); !ok {
-		return syserror.EFAULT
-	}
-
-	b := t.CopyScratchBuffer(buffLen)
-	for i := range e {
-		usermem.ByteOrder.PutUint32(b[i*itemLen:], e[i].Events)
-		usermem.ByteOrder.PutUint32(b[i*itemLen+4:], uint32(e[i].Data[0]))
-		usermem.ByteOrder.PutUint32(b[i*itemLen+8:], uint32(e[i].Data[1]))
-	}
-
-	if _, err := t.CopyOutBytes(addr, b); err != nil {
-		return err
-	}
-
-	return nil
-}
-
 // EpollWait implements the epoll_wait(2) linux syscall.
 func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	epfd := args[0].Int()
@@ -140,7 +117,7 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	}
 
 	if len(r) != 0 {
-		if err := copyOutEvents(t, eventsAddr, r); err != nil {
+		if _, err := linux.CopyEpollEventSliceOut(t, eventsAddr, r); err != nil {
 			return 0, nil, err
 		}
 	}
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index 6ff2d84d2..f6fb0f219 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -6,7 +6,6 @@ go_library(
     name = "vfs2",
     srcs = [
         "epoll.go",
-        "epoll_unsafe.go",
         "execve.go",
         "fd.go",
         "filesystem.go",
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
index 5a938cee2..34c90ae3e 100644
--- a/pkg/sentry/syscalls/linux/vfs2/epoll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -28,6 +28,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+var sizeofEpollEvent = (*linux.EpollEvent)(nil).SizeBytes()
+
 // EpollCreate1 implements Linux syscall epoll_create1(2).
 func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	flags := args[0].Int()
@@ -124,7 +126,7 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	maxEvents := int(args[2].Int())
 	timeout := int(args[3].Int())
 
-	const _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS
+	var _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS
 	if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS {
 		return 0, nil, syserror.EINVAL
 	}
@@ -157,7 +159,8 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		maxEvents -= n
 		if n != 0 {
 			// Copy what we read out.
-			copiedEvents, err := copyOutEvents(t, eventsAddr, events[:n])
+			copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events[:n])
+			copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
 			eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent)
 			total += copiedEvents
 			if err != nil {
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go
deleted file mode 100644
index 825f325bf..000000000
--- a/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs2
-
-import (
-	"reflect"
-	"runtime"
-	"unsafe"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/gohacks"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-const sizeofEpollEvent = int(unsafe.Sizeof(linux.EpollEvent{}))
-
-func copyOutEvents(t *kernel.Task, addr usermem.Addr, events []linux.EpollEvent) (int, error) {
-	if len(events) == 0 {
-		return 0, nil
-	}
-	// Cast events to a byte slice for copying.
-	var eventBytes []byte
-	eventBytesHdr := (*reflect.SliceHeader)(unsafe.Pointer(&eventBytes))
-	eventBytesHdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(&events[0])))
-	eventBytesHdr.Len = len(events) * sizeofEpollEvent
-	eventBytesHdr.Cap = len(events) * sizeofEpollEvent
-	copiedBytes, err := t.CopyOutBytes(addr, eventBytes)
-	runtime.KeepAlive(events)
-	copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
-	return copiedEvents, err
-}
-- 
cgit v1.2.3


From eccae0f77d3708d591119488f427eca90de7c711 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 23 Apr 2020 17:27:24 -0700
Subject: Remove View.First() and View.RemoveFirst()

These methods let users eaily break the VectorisedView abstraction, and
allowed netstack to slip into pseudo-enforcement of the "all headers are
in the first View" invariant. Removing them and replacing with PullUp(n)
breaks this reliance and will make it easier to add iptables support and
rework network buffer management.

The new View.PullUp(n) method is low cost in the common case, when when
all the headers fit in the first View.

PiperOrigin-RevId: 308163542
---
 pkg/sentry/socket/netfilter/tcp_matcher.go |   5 +-
 pkg/sentry/socket/netfilter/udp_matcher.go |   5 +-
 pkg/tcpip/buffer/view.go                   |  55 ++++++++++----
 pkg/tcpip/buffer/view_test.go              | 113 +++++++++++++++++++++++++++++
 pkg/tcpip/link/loopback/loopback.go        |  10 +--
 pkg/tcpip/link/rawfile/BUILD               |   9 ++-
 pkg/tcpip/link/rawfile/rawfile_test.go     |  46 ++++++++++++
 pkg/tcpip/link/rawfile/rawfile_unsafe.go   |   6 +-
 pkg/tcpip/link/sharedmem/sharedmem_test.go |   2 +-
 pkg/tcpip/link/sniffer/sniffer.go          |  65 +++++++++++++----
 pkg/tcpip/network/arp/arp.go               |   5 +-
 pkg/tcpip/network/ipv4/icmp.go             |  20 +++--
 pkg/tcpip/network/ipv4/ipv4.go             |  12 ++-
 pkg/tcpip/network/ipv6/icmp.go             |  74 ++++++++++++-------
 pkg/tcpip/network/ipv6/icmp_test.go        |   3 +-
 pkg/tcpip/network/ipv6/ipv6.go             |   6 +-
 pkg/tcpip/stack/forwarder_test.go          |  13 ++--
 pkg/tcpip/stack/iptables.go                |  22 +++++-
 pkg/tcpip/stack/iptables_targets.go        |  23 ++++--
 pkg/tcpip/stack/nic.go                     |  34 +++------
 pkg/tcpip/stack/packet_buffer.go           |   8 +-
 pkg/tcpip/stack/stack_test.go              |  10 ++-
 pkg/tcpip/stack/transport_test.go          |   5 +-
 pkg/tcpip/transport/icmp/endpoint.go       |   8 +-
 pkg/tcpip/transport/tcp/segment.go         |  29 +++++---
 pkg/tcpip/transport/tcp/tcp_test.go        |   4 +-
 pkg/tcpip/transport/udp/endpoint.go        |   6 +-
 pkg/tcpip/transport/udp/protocol.go        |   9 ++-
 28 files changed, 458 insertions(+), 149 deletions(-)
 create mode 100644 pkg/tcpip/link/rawfile/rawfile_test.go

diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index ff1cfd8f6..55c0f04f3 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -121,12 +121,13 @@ func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa
 		tcpHeader = header.TCP(pkt.TransportHeader)
 	} else {
 		// The TCP header hasn't been parsed yet. We have to do it here.
-		if len(pkt.Data.First()) < header.TCPMinimumSize {
+		hdr, ok := pkt.Data.PullUp(header.TCPMinimumSize)
+		if !ok {
 			// There's no valid TCP header here, so we hotdrop the
 			// packet.
 			return false, true
 		}
-		tcpHeader = header.TCP(pkt.Data.First())
+		tcpHeader = header.TCP(hdr)
 	}
 
 	// Check whether the source and destination ports are within the
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index 3359418c1..04d03d494 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -120,12 +120,13 @@ func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa
 		udpHeader = header.UDP(pkt.TransportHeader)
 	} else {
 		// The UDP header hasn't been parsed yet. We have to do it here.
-		if len(pkt.Data.First()) < header.UDPMinimumSize {
+		hdr, ok := pkt.Data.PullUp(header.UDPMinimumSize)
+		if !ok {
 			// There's no valid UDP header here, so we hotdrop the
 			// packet.
 			return false, true
 		}
-		udpHeader = header.UDP(pkt.Data.First())
+		udpHeader = header.UDP(hdr)
 	}
 
 	// Check whether the source and destination ports are within the
diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go
index 8ec5d5d5c..f01217c91 100644
--- a/pkg/tcpip/buffer/view.go
+++ b/pkg/tcpip/buffer/view.go
@@ -77,7 +77,8 @@ func NewVectorisedView(size int, views []View) VectorisedView {
 	return VectorisedView{views: views, size: size}
 }
 
-// TrimFront removes the first "count" bytes of the vectorised view.
+// TrimFront removes the first "count" bytes of the vectorised view. It panics
+// if count > vv.Size().
 func (vv *VectorisedView) TrimFront(count int) {
 	for count > 0 && len(vv.views) > 0 {
 		if count < len(vv.views[0]) {
@@ -86,7 +87,7 @@ func (vv *VectorisedView) TrimFront(count int) {
 			return
 		}
 		count -= len(vv.views[0])
-		vv.RemoveFirst()
+		vv.removeFirst()
 	}
 }
 
@@ -104,7 +105,7 @@ func (vv *VectorisedView) Read(v View) (copied int, err error) {
 		count -= len(vv.views[0])
 		copy(v[copied:], vv.views[0])
 		copied += len(vv.views[0])
-		vv.RemoveFirst()
+		vv.removeFirst()
 	}
 	if copied == 0 {
 		return 0, io.EOF
@@ -126,7 +127,7 @@ func (vv *VectorisedView) ReadToVV(dstVV *VectorisedView, count int) (copied int
 		count -= len(vv.views[0])
 		dstVV.AppendView(vv.views[0])
 		copied += len(vv.views[0])
-		vv.RemoveFirst()
+		vv.removeFirst()
 	}
 	return copied
 }
@@ -162,22 +163,37 @@ func (vv *VectorisedView) Clone(buffer []View) VectorisedView {
 	return VectorisedView{views: append(buffer[:0], vv.views...), size: vv.size}
 }
 
-// First returns the first view of the vectorised view.
-func (vv *VectorisedView) First() View {
+// PullUp returns the first "count" bytes of the vectorised view. If those
+// bytes aren't already contiguous inside the vectorised view, PullUp will
+// reallocate as needed to make them contiguous. PullUp fails and returns false
+// when count > vv.Size().
+func (vv *VectorisedView) PullUp(count int) (View, bool) {
 	if len(vv.views) == 0 {
-		return nil
+		return nil, count == 0
+	}
+	if count <= len(vv.views[0]) {
+		return vv.views[0][:count], true
+	}
+	if count > vv.size {
+		return nil, false
 	}
-	return vv.views[0]
-}
 
-// RemoveFirst removes the first view of the vectorised view.
-func (vv *VectorisedView) RemoveFirst() {
-	if len(vv.views) == 0 {
-		return
+	newFirst := NewView(count)
+	i := 0
+	for offset := 0; offset < count; i++ {
+		copy(newFirst[offset:], vv.views[i])
+		if count-offset < len(vv.views[i]) {
+			vv.views[i].TrimFront(count - offset)
+			break
+		}
+		offset += len(vv.views[i])
+		vv.views[i] = nil
 	}
-	vv.size -= len(vv.views[0])
-	vv.views[0] = nil
-	vv.views = vv.views[1:]
+	// We're guaranteed that i > 0, since count is too large for the first
+	// view.
+	vv.views[i-1] = newFirst
+	vv.views = vv.views[i-1:]
+	return newFirst, true
 }
 
 // Size returns the size in bytes of the entire content stored in the vectorised view.
@@ -225,3 +241,10 @@ func (vv *VectorisedView) Readers() []bytes.Reader {
 	}
 	return readers
 }
+
+// removeFirst panics when len(vv.views) < 1.
+func (vv *VectorisedView) removeFirst() {
+	vv.size -= len(vv.views[0])
+	vv.views[0] = nil
+	vv.views = vv.views[1:]
+}
diff --git a/pkg/tcpip/buffer/view_test.go b/pkg/tcpip/buffer/view_test.go
index 106e1994c..c56795c7b 100644
--- a/pkg/tcpip/buffer/view_test.go
+++ b/pkg/tcpip/buffer/view_test.go
@@ -16,6 +16,7 @@
 package buffer
 
 import (
+	"bytes"
 	"reflect"
 	"testing"
 )
@@ -370,3 +371,115 @@ func TestVVRead(t *testing.T) {
 		})
 	}
 }
+
+var pullUpTestCases = []struct {
+	comment string
+	in      VectorisedView
+	count   int
+	want    []byte
+	result  VectorisedView
+	ok      bool
+}{
+	{
+		comment: "simple case",
+		in:      vv(2, "12"),
+		count:   1,
+		want:    []byte("1"),
+		result:  vv(2, "12"),
+		ok:      true,
+	},
+	{
+		comment: "entire View",
+		in:      vv(2, "1", "2"),
+		count:   1,
+		want:    []byte("1"),
+		result:  vv(2, "1", "2"),
+		ok:      true,
+	},
+	{
+		comment: "spanning across two Views",
+		in:      vv(3, "1", "23"),
+		count:   2,
+		want:    []byte("12"),
+		result:  vv(3, "12", "3"),
+		ok:      true,
+	},
+	{
+		comment: "spanning across all Views",
+		in:      vv(5, "1", "23", "45"),
+		count:   5,
+		want:    []byte("12345"),
+		result:  vv(5, "12345"),
+		ok:      true,
+	},
+	{
+		comment: "count = 0",
+		in:      vv(1, "1"),
+		count:   0,
+		want:    []byte{},
+		result:  vv(1, "1"),
+		ok:      true,
+	},
+	{
+		comment: "count = size",
+		in:      vv(1, "1"),
+		count:   1,
+		want:    []byte("1"),
+		result:  vv(1, "1"),
+		ok:      true,
+	},
+	{
+		comment: "count too large",
+		in:      vv(3, "1", "23"),
+		count:   4,
+		want:    nil,
+		result:  vv(3, "1", "23"),
+		ok:      false,
+	},
+	{
+		comment: "empty vv",
+		in:      vv(0, ""),
+		count:   1,
+		want:    nil,
+		result:  vv(0, ""),
+		ok:      false,
+	},
+	{
+		comment: "empty vv, count = 0",
+		in:      vv(0, ""),
+		count:   0,
+		want:    nil,
+		result:  vv(0, ""),
+		ok:      true,
+	},
+	{
+		comment: "empty views",
+		in:      vv(3, "", "1", "", "23"),
+		count:   2,
+		want:    []byte("12"),
+		result:  vv(3, "12", "3"),
+		ok:      true,
+	},
+}
+
+func TestPullUp(t *testing.T) {
+	for _, c := range pullUpTestCases {
+		got, ok := c.in.PullUp(c.count)
+
+		// Is the return value right?
+		if ok != c.ok {
+			t.Errorf("Test %q failed when calling PullUp(%d) on %v. Got an ok of %t. Want %t",
+				c.comment, c.count, c.in, ok, c.ok)
+		}
+		if bytes.Compare(got, View(c.want)) != 0 {
+			t.Errorf("Test %q failed when calling PullUp(%d) on %v. Got %v. Want %v",
+				c.comment, c.count, c.in, got, c.want)
+		}
+
+		// Is the underlying structure right?
+		if !reflect.DeepEqual(c.in, c.result) {
+			t.Errorf("Test %q failed when calling PullUp(%d). Got vv with structure %v. Wanted %v",
+				c.comment, c.count, c.in, c.result)
+		}
+	}
+}
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index 1e2255bfa..073c84ef9 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -98,13 +98,13 @@ func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	// Reject the packet if it's shorter than an ethernet header.
-	if vv.Size() < header.EthernetMinimumSize {
+	// There should be an ethernet header at the beginning of vv.
+	hdr, ok := vv.PullUp(header.EthernetMinimumSize)
+	if !ok {
+		// Reject the packet if it's shorter than an ethernet header.
 		return tcpip.ErrBadAddress
 	}
-
-	// There should be an ethernet header at the beginning of vv.
-	linkHeader := header.Ethernet(vv.First()[:header.EthernetMinimumSize])
+	linkHeader := header.Ethernet(hdr)
 	vv.TrimFront(len(linkHeader))
 	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), stack.PacketBuffer{
 		Data:       vv,
diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD
index 14b527bc2..9cc08d0e2 100644
--- a/pkg/tcpip/link/rawfile/BUILD
+++ b/pkg/tcpip/link/rawfile/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -18,3 +18,10 @@ go_library(
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
+
+go_test(
+    name = "rawfile_test",
+    size = "small",
+    srcs = ["rawfile_test.go"],
+    library = ":rawfile",
+)
diff --git a/pkg/tcpip/link/rawfile/rawfile_test.go b/pkg/tcpip/link/rawfile/rawfile_test.go
new file mode 100644
index 000000000..8f14ba761
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/rawfile_test.go
@@ -0,0 +1,46 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package rawfile
+
+import (
+	"syscall"
+	"testing"
+)
+
+func TestNonBlockingWrite3ZeroLength(t *testing.T) {
+	fd, err := syscall.Open("/dev/null", syscall.O_WRONLY, 0)
+	if err != nil {
+		t.Fatalf("failed to open /dev/null: %v", err)
+	}
+	defer syscall.Close(fd)
+
+	if err := NonBlockingWrite3(fd, []byte{}, []byte{0}, nil); err != nil {
+		t.Fatalf("failed to write: %v", err)
+	}
+}
+
+func TestNonBlockingWrite3Nil(t *testing.T) {
+	fd, err := syscall.Open("/dev/null", syscall.O_WRONLY, 0)
+	if err != nil {
+		t.Fatalf("failed to open /dev/null: %v", err)
+	}
+	defer syscall.Close(fd)
+
+	if err := NonBlockingWrite3(fd, nil, []byte{0}, nil); err != nil {
+		t.Fatalf("failed to write: %v", err)
+	}
+}
diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
index 44e25d475..92efd0bf8 100644
--- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go
+++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
@@ -76,9 +76,13 @@ func NonBlockingWrite3(fd int, b1, b2, b3 []byte) *tcpip.Error {
 
 	// We have two buffers. Build the iovec that represents them and issue
 	// a writev syscall.
+	var base *byte
+	if len(b1) > 0 {
+		base = &b1[0]
+	}
 	iovec := [3]syscall.Iovec{
 		{
-			Base: &b1[0],
+			Base: base,
 			Len:  uint64(len(b1)),
 		},
 		{
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 27ea3f531..33f640b85 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -674,7 +674,7 @@ func TestSimpleReceive(t *testing.T) {
 		// Wait for packet to be received, then check it.
 		c.waitForPackets(1, time.After(5*time.Second), "Timeout waiting for packet")
 		c.mu.Lock()
-		rcvd := []byte(c.packets[0].vv.First())
+		rcvd := []byte(c.packets[0].vv.ToView())
 		c.packets = c.packets[:0]
 		c.mu.Unlock()
 
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index be2537a82..0799c8f4d 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -171,11 +171,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
 func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	writer := e.writer
 	if writer == nil && atomic.LoadUint32(&LogPackets) == 1 {
-		first := pkt.Header.View()
-		if len(first) == 0 {
-			first = pkt.Data.First()
-		}
-		logPacket(prefix, protocol, first, gso)
+		logPacket(prefix, protocol, pkt, gso)
 	}
 	if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 {
 		totalLength := pkt.Header.UsedLength() + pkt.Data.Size()
@@ -238,7 +234,7 @@ func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 // Wait implements stack.LinkEndpoint.Wait.
 func (e *endpoint) Wait() { e.lower.Wait() }
 
-func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.View, gso *stack.GSO) {
+func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) {
 	// Figure out the network layer info.
 	var transProto uint8
 	src := tcpip.Address("unknown")
@@ -247,28 +243,49 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 	size := uint16(0)
 	var fragmentOffset uint16
 	var moreFragments bool
+
+	// Create a clone of pkt, including any headers if present. Avoid allocating
+	// backing memory for the clone.
+	views := [8]buffer.View{}
+	vv := buffer.NewVectorisedView(0, views[:0])
+	vv.AppendView(pkt.Header.View())
+	vv.Append(pkt.Data)
+
 	switch protocol {
 	case header.IPv4ProtocolNumber:
-		ipv4 := header.IPv4(b)
+		hdr, ok := vv.PullUp(header.IPv4MinimumSize)
+		if !ok {
+			return
+		}
+		ipv4 := header.IPv4(hdr)
 		fragmentOffset = ipv4.FragmentOffset()
 		moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments
 		src = ipv4.SourceAddress()
 		dst = ipv4.DestinationAddress()
 		transProto = ipv4.Protocol()
 		size = ipv4.TotalLength() - uint16(ipv4.HeaderLength())
-		b = b[ipv4.HeaderLength():]
+		vv.TrimFront(int(ipv4.HeaderLength()))
 		id = int(ipv4.ID())
 
 	case header.IPv6ProtocolNumber:
-		ipv6 := header.IPv6(b)
+		hdr, ok := vv.PullUp(header.IPv6MinimumSize)
+		if !ok {
+			return
+		}
+		ipv6 := header.IPv6(hdr)
 		src = ipv6.SourceAddress()
 		dst = ipv6.DestinationAddress()
 		transProto = ipv6.NextHeader()
 		size = ipv6.PayloadLength()
-		b = b[header.IPv6MinimumSize:]
+		vv.TrimFront(header.IPv6MinimumSize)
 
 	case header.ARPProtocolNumber:
-		arp := header.ARP(b)
+		hdr, ok := vv.PullUp(header.ARPSize)
+		if !ok {
+			return
+		}
+		vv.TrimFront(header.ARPSize)
+		arp := header.ARP(hdr)
 		log.Infof(
 			"%s arp %v (%v) -> %v (%v) valid:%v",
 			prefix,
@@ -284,7 +301,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 
 	// We aren't guaranteed to have a transport header - it's possible for
 	// writes via raw endpoints to contain only network headers.
-	if minSize, ok := transportProtocolMinSizes[tcpip.TransportProtocolNumber(transProto)]; ok && len(b) < minSize {
+	if minSize, ok := transportProtocolMinSizes[tcpip.TransportProtocolNumber(transProto)]; ok && vv.Size() < minSize {
 		log.Infof("%s %v -> %v transport protocol: %d, but no transport header found (possible raw packet)", prefix, src, dst, transProto)
 		return
 	}
@@ -297,7 +314,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 	switch tcpip.TransportProtocolNumber(transProto) {
 	case header.ICMPv4ProtocolNumber:
 		transName = "icmp"
-		icmp := header.ICMPv4(b)
+		hdr, ok := vv.PullUp(header.ICMPv4MinimumSize)
+		if !ok {
+			break
+		}
+		icmp := header.ICMPv4(hdr)
 		icmpType := "unknown"
 		if fragmentOffset == 0 {
 			switch icmp.Type() {
@@ -330,7 +351,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 
 	case header.ICMPv6ProtocolNumber:
 		transName = "icmp"
-		icmp := header.ICMPv6(b)
+		hdr, ok := vv.PullUp(header.ICMPv6MinimumSize)
+		if !ok {
+			break
+		}
+		icmp := header.ICMPv6(hdr)
 		icmpType := "unknown"
 		switch icmp.Type() {
 		case header.ICMPv6DstUnreachable:
@@ -361,7 +386,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 
 	case header.UDPProtocolNumber:
 		transName = "udp"
-		udp := header.UDP(b)
+		hdr, ok := vv.PullUp(header.UDPMinimumSize)
+		if !ok {
+			break
+		}
+		udp := header.UDP(hdr)
 		if fragmentOffset == 0 && len(udp) >= header.UDPMinimumSize {
 			srcPort = udp.SourcePort()
 			dstPort = udp.DestinationPort()
@@ -371,7 +400,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 
 	case header.TCPProtocolNumber:
 		transName = "tcp"
-		tcp := header.TCP(b)
+		hdr, ok := vv.PullUp(header.TCPMinimumSize)
+		if !ok {
+			break
+		}
+		tcp := header.TCP(hdr)
 		if fragmentOffset == 0 && len(tcp) >= header.TCPMinimumSize {
 			offset := int(tcp.DataOffset())
 			if offset < header.TCPMinimumSize {
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 7acbfa0a8..cf73a939e 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -93,7 +93,10 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuf
 }
 
 func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
-	v := pkt.Data.First()
+	v, ok := pkt.Data.PullUp(header.ARPSize)
+	if !ok {
+		return
+	}
 	h := header.ARP(v)
 	if !h.IsValid() {
 		return
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
index c4bf1ba5c..4cbefe5ab 100644
--- a/pkg/tcpip/network/ipv4/icmp.go
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -25,7 +25,11 @@ import (
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
 func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
-	h := header.IPv4(pkt.Data.First())
+	h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return
+	}
+	hdr := header.IPv4(h)
 
 	// We don't use IsValid() here because ICMP only requires that the IP
 	// header plus 8 bytes of the transport header be included. So it's
@@ -34,12 +38,12 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 	//
 	// Drop packet if it doesn't have the basic IPv4 header or if the
 	// original source address doesn't match the endpoint's address.
-	if len(h) < header.IPv4MinimumSize || h.SourceAddress() != e.id.LocalAddress {
+	if hdr.SourceAddress() != e.id.LocalAddress {
 		return
 	}
 
-	hlen := int(h.HeaderLength())
-	if pkt.Data.Size() < hlen || h.FragmentOffset() != 0 {
+	hlen := int(hdr.HeaderLength())
+	if pkt.Data.Size() < hlen || hdr.FragmentOffset() != 0 {
 		// We won't be able to handle this if it doesn't contain the
 		// full IPv4 header, or if it's a fragment not at offset 0
 		// (because it won't have the transport header).
@@ -48,15 +52,15 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 
 	// Skip the ip header, then deliver control message.
 	pkt.Data.TrimFront(hlen)
-	p := h.TransportProtocol()
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
+	p := hdr.TransportProtocol()
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
 func (e *endpoint) handleICMP(r *stack.Route, pkt stack.PacketBuffer) {
 	stats := r.Stats()
 	received := stats.ICMP.V4PacketsReceived
-	v := pkt.Data.First()
-	if len(v) < header.ICMPv4MinimumSize {
+	v, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize)
+	if !ok {
 		received.Invalid.Increment()
 		return
 	}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 104aafbed..17202cc7a 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -328,7 +328,11 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
 	// The packet already has an IP header, but there are a few required
 	// checks.
-	ip := header.IPv4(pkt.Data.First())
+	h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return tcpip.ErrInvalidOptionValue
+	}
+	ip := header.IPv4(h)
 	if !ip.IsValid(pkt.Data.Size()) {
 		return tcpip.ErrInvalidOptionValue
 	}
@@ -378,7 +382,11 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuf
 // HandlePacket is called by the link layer when new ipv4 packets arrive for
 // this endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
-	headerView := pkt.Data.First()
+	headerView, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		r.Stats().IP.MalformedPacketsReceived.Increment()
+		return
+	}
 	h := header.IPv4(headerView)
 	if !h.IsValid(pkt.Data.Size()) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index b68983d10..bdf3a0d25 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -28,7 +28,11 @@ import (
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
 func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
-	h := header.IPv6(pkt.Data.First())
+	h, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	if !ok {
+		return
+	}
+	hdr := header.IPv6(h)
 
 	// We don't use IsValid() here because ICMP only requires that up to
 	// 1280 bytes of the original packet be included. So it's likely that it
@@ -36,17 +40,21 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 	//
 	// Drop packet if it doesn't have the basic IPv6 header or if the
 	// original source address doesn't match the endpoint's address.
-	if len(h) < header.IPv6MinimumSize || h.SourceAddress() != e.id.LocalAddress {
+	if hdr.SourceAddress() != e.id.LocalAddress {
 		return
 	}
 
 	// Skip the IP header, then handle the fragmentation header if there
 	// is one.
 	pkt.Data.TrimFront(header.IPv6MinimumSize)
-	p := h.TransportProtocol()
+	p := hdr.TransportProtocol()
 	if p == header.IPv6FragmentHeader {
-		f := header.IPv6Fragment(pkt.Data.First())
-		if !f.IsValid() || f.FragmentOffset() != 0 {
+		f, ok := pkt.Data.PullUp(header.IPv6FragmentHeaderSize)
+		if !ok {
+			return
+		}
+		fragHdr := header.IPv6Fragment(f)
+		if !fragHdr.IsValid() || fragHdr.FragmentOffset() != 0 {
 			// We can't handle fragments that aren't at offset 0
 			// because they don't have the transport headers.
 			return
@@ -55,19 +63,19 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 		// Skip fragmentation header and find out the actual protocol
 		// number.
 		pkt.Data.TrimFront(header.IPv6FragmentHeaderSize)
-		p = f.TransportProtocol()
+		p = fragHdr.TransportProtocol()
 	}
 
 	// Deliver the control packet to the transport endpoint.
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
 func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.PacketBuffer, hasFragmentHeader bool) {
 	stats := r.Stats().ICMP
 	sent := stats.V6PacketsSent
 	received := stats.V6PacketsReceived
-	v := pkt.Data.First()
-	if len(v) < header.ICMPv6MinimumSize {
+	v, ok := pkt.Data.PullUp(header.ICMPv6HeaderSize)
+	if !ok {
 		received.Invalid.Increment()
 		return
 	}
@@ -76,11 +84,9 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	// Validate ICMPv6 checksum before processing the packet.
 	//
-	// Only the first view in vv is accounted for by h. To account for the
-	// rest of vv, a shallow copy is made and the first view is removed.
 	// This copy is used as extra payload during the checksum calculation.
 	payload := pkt.Data.Clone(nil)
-	payload.RemoveFirst()
+	payload.TrimFront(len(h))
 	if got, want := h.Checksum(), header.ICMPv6Checksum(h, iph.SourceAddress(), iph.DestinationAddress(), payload); got != want {
 		received.Invalid.Increment()
 		return
@@ -101,34 +107,40 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 	switch h.Type() {
 	case header.ICMPv6PacketTooBig:
 		received.PacketTooBig.Increment()
-		if len(v) < header.ICMPv6PacketTooBigMinimumSize {
+		hdr, ok := pkt.Data.PullUp(header.ICMPv6PacketTooBigMinimumSize)
+		if !ok {
 			received.Invalid.Increment()
 			return
 		}
 		pkt.Data.TrimFront(header.ICMPv6PacketTooBigMinimumSize)
-		mtu := h.MTU()
+		mtu := header.ICMPv6(hdr).MTU()
 		e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), pkt)
 
 	case header.ICMPv6DstUnreachable:
 		received.DstUnreachable.Increment()
-		if len(v) < header.ICMPv6DstUnreachableMinimumSize {
+		hdr, ok := pkt.Data.PullUp(header.ICMPv6DstUnreachableMinimumSize)
+		if !ok {
 			received.Invalid.Increment()
 			return
 		}
 		pkt.Data.TrimFront(header.ICMPv6DstUnreachableMinimumSize)
-		switch h.Code() {
+		switch header.ICMPv6(hdr).Code() {
 		case header.ICMPv6PortUnreachable:
 			e.handleControl(stack.ControlPortUnreachable, 0, pkt)
 		}
 
 	case header.ICMPv6NeighborSolicit:
 		received.NeighborSolicit.Increment()
-		if len(v) < header.ICMPv6NeighborSolicitMinimumSize || !isNDPValid() {
+		if pkt.Data.Size() < header.ICMPv6NeighborSolicitMinimumSize || !isNDPValid() {
 			received.Invalid.Increment()
 			return
 		}
 
-		ns := header.NDPNeighborSolicit(h.NDPPayload())
+		// The remainder of payload must be only the neighbor solicitation, so
+		// payload.ToView() always returns the solicitation. Per RFC 6980 section 5,
+		// NDP messages cannot be fragmented. Also note that in the common case NDP
+		// datagrams are very small and ToView() will not incur allocations.
+		ns := header.NDPNeighborSolicit(payload.ToView())
 		it, err := ns.Options().Iter(true)
 		if err != nil {
 			// If we have a malformed NDP NS option, drop the packet.
@@ -286,12 +298,16 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	case header.ICMPv6NeighborAdvert:
 		received.NeighborAdvert.Increment()
-		if len(v) < header.ICMPv6NeighborAdvertSize || !isNDPValid() {
+		if pkt.Data.Size() < header.ICMPv6NeighborAdvertSize || !isNDPValid() {
 			received.Invalid.Increment()
 			return
 		}
 
-		na := header.NDPNeighborAdvert(h.NDPPayload())
+		// The remainder of payload must be only the neighbor advertisement, so
+		// payload.ToView() always returns the advertisement. Per RFC 6980 section
+		// 5, NDP messages cannot be fragmented. Also note that in the common case
+		// NDP datagrams are very small and ToView() will not incur allocations.
+		na := header.NDPNeighborAdvert(payload.ToView())
 		it, err := na.Options().Iter(true)
 		if err != nil {
 			// If we have a malformed NDP NA option, drop the packet.
@@ -363,14 +379,15 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	case header.ICMPv6EchoRequest:
 		received.EchoRequest.Increment()
-		if len(v) < header.ICMPv6EchoMinimumSize {
+		icmpHdr, ok := pkt.Data.PullUp(header.ICMPv6EchoMinimumSize)
+		if !ok {
 			received.Invalid.Increment()
 			return
 		}
 		pkt.Data.TrimFront(header.ICMPv6EchoMinimumSize)
 		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize)
 		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize))
-		copy(packet, h)
+		copy(packet, icmpHdr)
 		packet.SetType(header.ICMPv6EchoReply)
 		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data))
 		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
@@ -384,7 +401,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 
 	case header.ICMPv6EchoReply:
 		received.EchoReply.Increment()
-		if len(v) < header.ICMPv6EchoMinimumSize {
+		if pkt.Data.Size() < header.ICMPv6EchoMinimumSize {
 			received.Invalid.Increment()
 			return
 		}
@@ -406,8 +423,9 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 	case header.ICMPv6RouterAdvert:
 		received.RouterAdvert.Increment()
 
-		p := h.NDPPayload()
-		if len(p) < header.NDPRAMinimumSize || !isNDPValid() {
+		// Is the NDP payload of sufficient size to hold a Router
+		// Advertisement?
+		if pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRAMinimumSize || !isNDPValid() {
 			received.Invalid.Increment()
 			return
 		}
@@ -425,7 +443,11 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 			return
 		}
 
-		ra := header.NDPRouterAdvert(p)
+		// The remainder of payload must be only the router advertisement, so
+		// payload.ToView() always returns the advertisement. Per RFC 6980 section
+		// 5, NDP messages cannot be fragmented. Also note that in the common case
+		// NDP datagrams are very small and ToView() will not incur allocations.
+		ra := header.NDPRouterAdvert(payload.ToView())
 		opts := ra.Options()
 
 		// Are options valid as per the wire format?
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index bd099a7f8..d412ff688 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -166,7 +166,8 @@ func TestICMPCounts(t *testing.T) {
 		},
 		{
 			typ:  header.ICMPv6NeighborSolicit,
-			size: header.ICMPv6NeighborSolicitMinimumSize},
+			size: header.ICMPv6NeighborSolicitMinimumSize,
+		},
 		{
 			typ:       header.ICMPv6NeighborAdvert,
 			size:      header.ICMPv6NeighborAdvertMinimumSize,
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 331b0817b..486725131 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -171,7 +171,11 @@ func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffe
 // HandlePacket is called by the link layer when new ipv6 packets arrive for
 // this endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
-	headerView := pkt.Data.First()
+	headerView, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	if !ok {
+		r.Stats().IP.MalformedPacketsReceived.Increment()
+		return
+	}
 	h := header.IPv6(headerView)
 	if !h.IsValid(pkt.Data.Size()) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
index e9c652042..c7c663498 100644
--- a/pkg/tcpip/stack/forwarder_test.go
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -70,7 +70,10 @@ func (f *fwdTestNetworkEndpoint) ID() *NetworkEndpointID {
 
 func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt PacketBuffer) {
 	// Consume the network header.
-	b := pkt.Data.First()
+	b, ok := pkt.Data.PullUp(fwdTestNetHeaderLen)
+	if !ok {
+		return
+	}
 	pkt.Data.TrimFront(fwdTestNetHeaderLen)
 
 	// Dispatch the packet to the transport protocol.
@@ -473,7 +476,7 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
 		t.Fatal("packet not forwarded")
 	}
 
-	b := p.Pkt.Header.View()
+	b := p.Pkt.Data.ToView()
 	if b[0] != 3 {
 		t.Fatalf("got b[0] = %d, want = 3", b[0])
 	}
@@ -517,7 +520,7 @@ func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
 			t.Fatal("packet not forwarded")
 		}
 
-		b := p.Pkt.Header.View()
+		b := p.Pkt.Data.ToView()
 		if b[0] != 3 {
 			t.Fatalf("got b[0] = %d, want = 3", b[0])
 		}
@@ -564,7 +567,7 @@ func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
 			t.Fatal("packet not forwarded")
 		}
 
-		b := p.Pkt.Header.View()
+		b := p.Pkt.Data.ToView()
 		if b[0] != 3 {
 			t.Fatalf("got b[0] = %d, want = 3", b[0])
 		}
@@ -619,7 +622,7 @@ func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
 
 		// The first 5 packets (address 3 to 7) should not be forwarded
 		// because their address resolutions are interrupted.
-		b := p.Pkt.Header.View()
+		b := p.Pkt.Data.ToView()
 		if b[0] < 8 {
 			t.Fatalf("got b[0] = %d, want b[0] >= 8", b[0])
 		}
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
index 6c0a4b24d..6b91159d4 100644
--- a/pkg/tcpip/stack/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -212,6 +212,11 @@ func (it *IPTables) Check(hook Hook, pkt PacketBuffer) bool {
 // CheckPackets runs pkts through the rules for hook and returns a map of packets that
 // should not go forward.
 //
+// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+//
+// TODO(gvisor.dev/issue/170): pk.NetworkHeader will always be set as a
+// precondition.
+//
 // NOTE: unlike the Check API the returned map contains packets that should be
 // dropped.
 func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList) (drop map[*PacketBuffer]struct{}) {
@@ -226,7 +231,9 @@ func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList) (drop map[*Pa
 	return drop
 }
 
-// Precondition: pkt.NetworkHeader is set.
+// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// TODO(gvisor.dev/issue/170): pk.NetworkHeader will always be set as a
+// precondition.
 func (it *IPTables) checkChain(hook Hook, pkt PacketBuffer, table Table, ruleIdx int) chainVerdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
@@ -271,14 +278,21 @@ func (it *IPTables) checkChain(hook Hook, pkt PacketBuffer, table Table, ruleIdx
 	return chainDrop
 }
 
-// Precondition: pk.NetworkHeader is set.
+// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// TODO(gvisor.dev/issue/170): pk.NetworkHeader will always be set as a
+// precondition.
 func (it *IPTables) checkRule(hook Hook, pkt PacketBuffer, table Table, ruleIdx int) (RuleVerdict, int) {
 	rule := table.Rules[ruleIdx]
 
 	// If pkt.NetworkHeader hasn't been set yet, it will be contained in
-	// pkt.Data.First().
+	// pkt.Data.
 	if pkt.NetworkHeader == nil {
-		pkt.NetworkHeader = pkt.Data.First()
+		var ok bool
+		pkt.NetworkHeader, ok = pkt.Data.PullUp(header.IPv4MinimumSize)
+		if !ok {
+			// Precondition has been violated.
+			panic(fmt.Sprintf("iptables checks require IPv4 headers of at least %d bytes", header.IPv4MinimumSize))
+		}
 	}
 
 	// Check whether the packet matches the IP header filter.
diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go
index 7b4543caf..8be61f4b1 100644
--- a/pkg/tcpip/stack/iptables_targets.go
+++ b/pkg/tcpip/stack/iptables_targets.go
@@ -96,9 +96,12 @@ func (rt RedirectTarget) Action(pkt PacketBuffer) (RuleVerdict, int) {
 	newPkt := pkt.Clone()
 
 	// Set network header.
-	headerView := newPkt.Data.First()
+	headerView, ok := newPkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return RuleDrop, 0
+	}
 	netHeader := header.IPv4(headerView)
-	newPkt.NetworkHeader = headerView[:header.IPv4MinimumSize]
+	newPkt.NetworkHeader = headerView
 
 	hlen := int(netHeader.HeaderLength())
 	tlen := int(netHeader.TotalLength())
@@ -117,10 +120,14 @@ func (rt RedirectTarget) Action(pkt PacketBuffer) (RuleVerdict, int) {
 		if newPkt.TransportHeader != nil {
 			udpHeader = header.UDP(newPkt.TransportHeader)
 		} else {
-			if len(pkt.Data.First()) < header.UDPMinimumSize {
+			if pkt.Data.Size() < header.UDPMinimumSize {
+				return RuleDrop, 0
+			}
+			hdr, ok := newPkt.Data.PullUp(header.UDPMinimumSize)
+			if !ok {
 				return RuleDrop, 0
 			}
-			udpHeader = header.UDP(newPkt.Data.First())
+			udpHeader = header.UDP(hdr)
 		}
 		udpHeader.SetDestinationPort(rt.MinPort)
 	case header.TCPProtocolNumber:
@@ -128,10 +135,14 @@ func (rt RedirectTarget) Action(pkt PacketBuffer) (RuleVerdict, int) {
 		if newPkt.TransportHeader != nil {
 			tcpHeader = header.TCP(newPkt.TransportHeader)
 		} else {
-			if len(pkt.Data.First()) < header.TCPMinimumSize {
+			if pkt.Data.Size() < header.TCPMinimumSize {
 				return RuleDrop, 0
 			}
-			tcpHeader = header.TCP(newPkt.TransportHeader)
+			hdr, ok := newPkt.Data.PullUp(header.TCPMinimumSize)
+			if !ok {
+				return RuleDrop, 0
+			}
+			tcpHeader = header.TCP(hdr)
 		}
 		// TODO(gvisor.dev/issue/170): Need to recompute checksum
 		// and implement nat connection tracking to support TCP.
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 016dbe15e..0c2b1f36a 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1203,12 +1203,12 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 		n.stack.stats.IP.PacketsReceived.Increment()
 	}
 
-	if len(pkt.Data.First()) < netProto.MinimumPacketSize() {
+	netHeader, ok := pkt.Data.PullUp(netProto.MinimumPacketSize())
+	if !ok {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
-
-	src, dst := netProto.ParseAddresses(pkt.Data.First())
+	src, dst := netProto.ParseAddresses(netHeader)
 
 	if n.stack.handleLocal && !n.isLoopback() && n.getRef(protocol, src) != nil {
 		// The source address is one of our own, so we never should have gotten a
@@ -1289,22 +1289,8 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 
 func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
 	// TODO(b/143425874) Decrease the TTL field in forwarded packets.
-
-	firstData := pkt.Data.First()
-	pkt.Data.RemoveFirst()
-
-	if linkHeaderLen := int(n.linkEP.MaxHeaderLength()); linkHeaderLen == 0 {
-		pkt.Header = buffer.NewPrependableFromView(firstData)
-	} else {
-		firstDataLen := len(firstData)
-
-		// pkt.Header should have enough capacity to hold n.linkEP's headers.
-		pkt.Header = buffer.NewPrependable(firstDataLen + linkHeaderLen)
-
-		// TODO(b/151227689): avoid copying the packet when forwarding
-		if n := copy(pkt.Header.Prepend(firstDataLen), firstData); n != firstDataLen {
-			panic(fmt.Sprintf("copied %d bytes, expected %d", n, firstDataLen))
-		}
+	if linkHeaderLen := int(n.linkEP.MaxHeaderLength()); linkHeaderLen != 0 {
+		pkt.Header = buffer.NewPrependable(linkHeaderLen)
 	}
 
 	if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, pkt); err != nil {
@@ -1332,12 +1318,13 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 	// validly formed.
 	n.stack.demux.deliverRawPacket(r, protocol, pkt)
 
-	if len(pkt.Data.First()) < transProto.MinimumPacketSize() {
+	transHeader, ok := pkt.Data.PullUp(transProto.MinimumPacketSize())
+	if !ok {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
 
-	srcPort, dstPort, err := transProto.ParsePorts(pkt.Data.First())
+	srcPort, dstPort, err := transProto.ParsePorts(transHeader)
 	if err != nil {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
@@ -1375,11 +1362,12 @@ func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcp
 	// ICMPv4 only guarantees that 8 bytes of the transport protocol will
 	// be present in the payload. We know that the ports are within the
 	// first 8 bytes for all known transport protocols.
-	if len(pkt.Data.First()) < 8 {
+	transHeader, ok := pkt.Data.PullUp(8)
+	if !ok {
 		return
 	}
 
-	srcPort, dstPort, err := transProto.ParsePorts(pkt.Data.First())
+	srcPort, dstPort, err := transProto.ParsePorts(transHeader)
 	if err != nil {
 		return
 	}
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index dc125f25e..7d36f8e84 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -37,7 +37,13 @@ type PacketBuffer struct {
 	Data buffer.VectorisedView
 
 	// Header holds the headers of outbound packets. As a packet is passed
-	// down the stack, each layer adds to Header.
+	// down the stack, each layer adds to Header. Note that forwarded
+	// packets don't populate Headers on their way out -- their headers and
+	// payload are never parsed out and remain in Data.
+	//
+	// TODO(gvisor.dev/issue/170): Forwarded packets don't currently
+	// populate Header, but should. This will be doable once early parsing
+	// (https://github.com/google/gvisor/pull/1995) is supported.
 	Header buffer.Prependable
 
 	// These fields are used by both inbound and outbound packets. They
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index c7634ceb1..d45d2cc1f 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -95,16 +95,18 @@ func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffe
 	f.proto.packetCount[int(f.id.LocalAddress[0])%len(f.proto.packetCount)]++
 
 	// Consume the network header.
-	b := pkt.Data.First()
+	b, ok := pkt.Data.PullUp(fakeNetHeaderLen)
+	if !ok {
+		return
+	}
 	pkt.Data.TrimFront(fakeNetHeaderLen)
 
 	// Handle control packets.
 	if b[2] == uint8(fakeControlProtocol) {
-		nb := pkt.Data.First()
-		if len(nb) < fakeNetHeaderLen {
+		nb, ok := pkt.Data.PullUp(fakeNetHeaderLen)
+		if !ok {
 			return
 		}
-
 		pkt.Data.TrimFront(fakeNetHeaderLen)
 		f.dispatcher.DeliverTransportControlPacket(tcpip.Address(nb[1:2]), tcpip.Address(nb[0:1]), fakeNetNumber, tcpip.TransportProtocolNumber(nb[2]), stack.ControlPortUnreachable, 0, pkt)
 		return
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 3084e6593..a611e44ab 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -642,10 +642,11 @@ func TestTransportForwarding(t *testing.T) {
 		t.Fatal("Response packet not forwarded")
 	}
 
-	if dst := p.Pkt.Header.View()[0]; dst != 3 {
+	hdrs := p.Pkt.Data.ToView()
+	if dst := hdrs[0]; dst != 3 {
 		t.Errorf("Response packet has incorrect destination addresss: got = %d, want = 3", dst)
 	}
-	if src := p.Pkt.Header.View()[1]; src != 1 {
+	if src := hdrs[1]; src != 1 {
 		t.Errorf("Response packet has incorrect source addresss: got = %d, want = 3", src)
 	}
 }
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index feef8dca0..b1d820372 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -747,15 +747,15 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	// Only accept echo replies.
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
-		h := header.ICMPv4(pkt.Data.First())
-		if h.Type() != header.ICMPv4EchoReply {
+		h, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize)
+		if !ok || header.ICMPv4(h).Type() != header.ICMPv4EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 			return
 		}
 	case header.IPv6ProtocolNumber:
-		h := header.ICMPv6(pkt.Data.First())
-		if h.Type() != header.ICMPv6EchoReply {
+		h, ok := pkt.Data.PullUp(header.ICMPv6MinimumSize)
+		if !ok || header.ICMPv6(h).Type() != header.ICMPv6EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 			return
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 40461fd31..7712ce652 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -144,7 +144,11 @@ func (s *segment) logicalLen() seqnum.Size {
 // TCP checksum and stores the checksum and result of checksum verification in
 // the csum and csumValid fields of the segment.
 func (s *segment) parse() bool {
-	h := header.TCP(s.data.First())
+	h, ok := s.data.PullUp(header.TCPMinimumSize)
+	if !ok {
+		return false
+	}
+	hdr := header.TCP(h)
 
 	// h is the header followed by the payload. We check that the offset to
 	// the data respects the following constraints:
@@ -156,12 +160,16 @@ func (s *segment) parse() bool {
 	// N.B. The segment has already been validated as having at least the
 	//      minimum TCP size before reaching here, so it's safe to read the
 	//      fields.
-	offset := int(h.DataOffset())
-	if offset < header.TCPMinimumSize || offset > len(h) {
+	offset := int(hdr.DataOffset())
+	if offset < header.TCPMinimumSize {
+		return false
+	}
+	hdrWithOpts, ok := s.data.PullUp(offset)
+	if !ok {
 		return false
 	}
 
-	s.options = []byte(h[header.TCPMinimumSize:offset])
+	s.options = []byte(hdrWithOpts[header.TCPMinimumSize:])
 	s.parsedOptions = header.ParseTCPOptions(s.options)
 
 	// Query the link capabilities to decide if checksum validation is
@@ -173,18 +181,19 @@ func (s *segment) parse() bool {
 		s.data.TrimFront(offset)
 	}
 	if verifyChecksum {
-		s.csum = h.Checksum()
+		hdr = header.TCP(hdrWithOpts)
+		s.csum = hdr.Checksum()
 		xsum := s.route.PseudoHeaderChecksum(ProtocolNumber, uint16(s.data.Size()))
-		xsum = h.CalculateChecksum(xsum)
+		xsum = hdr.CalculateChecksum(xsum)
 		s.data.TrimFront(offset)
 		xsum = header.ChecksumVV(s.data, xsum)
 		s.csumValid = xsum == 0xffff
 	}
 
-	s.sequenceNumber = seqnum.Value(h.SequenceNumber())
-	s.ackNumber = seqnum.Value(h.AckNumber())
-	s.flags = h.Flags()
-	s.window = seqnum.Size(h.WindowSize())
+	s.sequenceNumber = seqnum.Value(hdr.SequenceNumber())
+	s.ackNumber = seqnum.Value(hdr.AckNumber())
+	s.flags = hdr.Flags()
+	s.window = seqnum.Size(hdr.WindowSize())
 	return true
 }
 
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index ab1014c7f..286c66cf5 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -3548,7 +3548,7 @@ func TestReceivedInvalidSegmentCountIncrement(t *testing.T) {
 		AckNum:  c.IRS.Add(1),
 		RcvWnd:  30000,
 	})
-	tcpbuf := vv.First()[header.IPv4MinimumSize:]
+	tcpbuf := vv.ToView()[header.IPv4MinimumSize:]
 	tcpbuf[header.TCPDataOffset] = ((header.TCPMinimumSize - 1) / 4) << 4
 
 	c.SendSegment(vv)
@@ -3575,7 +3575,7 @@ func TestReceivedIncorrectChecksumIncrement(t *testing.T) {
 		AckNum:  c.IRS.Add(1),
 		RcvWnd:  30000,
 	})
-	tcpbuf := vv.First()[header.IPv4MinimumSize:]
+	tcpbuf := vv.ToView()[header.IPv4MinimumSize:]
 	// Overwrite a byte in the payload which should cause checksum
 	// verification to fail.
 	tcpbuf[(tcpbuf[header.TCPDataOffset]>>4)*4] = 0x4
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index edb54f0be..756ab913a 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -1250,8 +1250,8 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 // endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
 	// Get the header then trim it from the view.
-	hdr := header.UDP(pkt.Data.First())
-	if int(hdr.Length()) > pkt.Data.Size() {
+	hdr, ok := pkt.Data.PullUp(header.UDPMinimumSize)
+	if !ok || int(header.UDP(hdr).Length()) > pkt.Data.Size() {
 		// Malformed packet.
 		e.stack.Stats().UDP.MalformedPacketsReceived.Increment()
 		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
@@ -1286,7 +1286,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		senderAddress: tcpip.FullAddress{
 			NIC:  r.NICID(),
 			Addr: id.RemoteAddress,
-			Port: hdr.SourcePort(),
+			Port: header.UDP(hdr).SourcePort(),
 		},
 	}
 	packet.data = pkt.Data
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 6e31a9bac..52af6de22 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -68,8 +68,13 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 // that don't match any existing endpoint.
 func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
 	// Get the header then trim it from the view.
-	hdr := header.UDP(pkt.Data.First())
-	if int(hdr.Length()) > pkt.Data.Size() {
+	h, ok := pkt.Data.PullUp(header.UDPMinimumSize)
+	if !ok {
+		// Malformed packet.
+		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
+		return true
+	}
+	if int(header.UDP(h).Length()) > pkt.Data.Size() {
 		// Malformed packet.
 		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
 		return true
-- 
cgit v1.2.3


From 696feaf10c9339a57d177a913e847ddb488ece69 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Thu, 23 Apr 2020 17:32:59 -0700
Subject: Port devpts to VFS2.

PiperOrigin-RevId: 308164359
---
 pkg/sentry/fs/tty/line_discipline.go        |   4 +
 pkg/sentry/fs/tty/master.go                 |   4 +
 pkg/sentry/fs/tty/queue.go                  |   4 +
 pkg/sentry/fs/tty/slave.go                  |   4 +
 pkg/sentry/fs/tty/terminal.go               |   4 +
 pkg/sentry/fsimpl/devpts/BUILD              |  43 +++
 pkg/sentry/fsimpl/devpts/devpts.go          | 207 +++++++++++++
 pkg/sentry/fsimpl/devpts/devpts_test.go     |  56 ++++
 pkg/sentry/fsimpl/devpts/line_discipline.go | 449 ++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/devpts/master.go          | 226 ++++++++++++++
 pkg/sentry/fsimpl/devpts/queue.go           | 240 +++++++++++++++
 pkg/sentry/fsimpl/devpts/slave.go           | 186 ++++++++++++
 pkg/sentry/fsimpl/devpts/terminal.go        | 124 ++++++++
 pkg/sentry/fsimpl/devtmpfs/devtmpfs.go      |  13 +-
 pkg/sentry/fsimpl/kernfs/filesystem.go      |   2 +-
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go |   9 +-
 16 files changed, 1570 insertions(+), 5 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/devpts/BUILD
 create mode 100644 pkg/sentry/fsimpl/devpts/devpts.go
 create mode 100644 pkg/sentry/fsimpl/devpts/devpts_test.go
 create mode 100644 pkg/sentry/fsimpl/devpts/line_discipline.go
 create mode 100644 pkg/sentry/fsimpl/devpts/master.go
 create mode 100644 pkg/sentry/fsimpl/devpts/queue.go
 create mode 100644 pkg/sentry/fsimpl/devpts/slave.go
 create mode 100644 pkg/sentry/fsimpl/devpts/terminal.go

diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go
index 12b1c6097..2e9dd2d55 100644
--- a/pkg/sentry/fs/tty/line_discipline.go
+++ b/pkg/sentry/fs/tty/line_discipline.go
@@ -27,6 +27,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 const (
 	// canonMaxBytes is the number of bytes that fit into a single line of
 	// terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE
@@ -443,3 +445,5 @@ func (l *lineDiscipline) peek(b []byte) int {
 	}
 	return size
 }
+
+// LINT.ThenChange(../../fsimpl/devpts/line_discipline.go)
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index f62da49bd..fe07fa929 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -26,6 +26,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // masterInodeOperations are the fs.InodeOperations for the master end of the
 // Terminal (ptmx file).
 //
@@ -232,3 +234,5 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
 		unimpl.EmitUnimplementedEvent(ctx)
 	}
 }
+
+// LINT.ThenChange(../../fsimpl/devpts/master.go)
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go
index 1ca79c0b2..ceabb9b1e 100644
--- a/pkg/sentry/fs/tty/queue.go
+++ b/pkg/sentry/fs/tty/queue.go
@@ -25,6 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // waitBufMaxBytes is the maximum size of a wait buffer. It is based on
 // TTYB_DEFAULT_MEM_LIMIT.
 const waitBufMaxBytes = 131072
@@ -234,3 +236,5 @@ func (q *queue) waitBufAppend(b []byte) {
 	q.waitBuf = append(q.waitBuf, b)
 	q.waitBufLen += uint64(len(b))
 }
+
+// LINT.ThenChange(../../fsimpl/devpts/queue.go)
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index 6a2dbc576..9871f6fc6 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -25,6 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // slaveInodeOperations are the fs.InodeOperations for the slave end of the
 // Terminal (pts file).
 //
@@ -172,3 +174,5 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem
 		return 0, syserror.ENOTTY
 	}
 }
+
+// LINT.ThenChange(../../fsimpl/devpts/slave.go)
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index 5883f26db..ddcccf4da 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -23,6 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // Terminal is a pseudoterminal.
 //
 // +stateify savable
@@ -126,3 +128,5 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
 	}
 	return tm.slaveKTTY
 }
+
+// LINT.ThenChange(../../fsimpl/devpts/terminal.go)
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
new file mode 100644
index 000000000..585764223
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -0,0 +1,43 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+licenses(["notice"])
+
+go_library(
+    name = "devpts",
+    srcs = [
+        "devpts.go",
+        "line_discipline.go",
+        "master.go",
+        "queue.go",
+        "slave.go",
+        "terminal.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/safemem",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/unimpl",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
+
+go_test(
+    name = "devpts_test",
+    size = "small",
+    srcs = ["devpts_test.go"],
+    library = ":devpts",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/contexttest",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
new file mode 100644
index 000000000..07a69b940
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -0,0 +1,207 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package devpts provides a filesystem implementation that behaves like
+// devpts.
+package devpts
+
+import (
+	"fmt"
+	"math"
+	"sort"
+	"strconv"
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the filesystem name.
+const Name = "devpts"
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+var _ vfs.FilesystemType = (*FilesystemType)(nil)
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	// No data allowed.
+	if opts.Data != "" {
+		return nil, nil, syserror.EINVAL
+	}
+
+	fs, root := fstype.newFilesystem(vfsObj, creds)
+	return fs.VFSFilesystem(), root.VFSDentry(), nil
+}
+
+// newFilesystem creates a new devpts filesystem with root directory and ptmx
+// master inode. It returns the filesystem and root Dentry.
+func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*kernfs.Filesystem, *kernfs.Dentry) {
+	fs := &kernfs.Filesystem{}
+	fs.Init(vfsObj, fstype)
+
+	// Construct the root directory. This is always inode id 1.
+	root := &rootInode{
+		slaves: make(map[uint32]*slaveInode),
+	}
+	root.InodeAttrs.Init(creds, 1, linux.ModeDirectory|0555)
+	root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	root.dentry.Init(root)
+
+	// Construct the pts master inode and dentry. Linux always uses inode
+	// id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx.
+	master := &masterInode{
+		root: root,
+	}
+	master.InodeAttrs.Init(creds, 2, linux.ModeCharacterDevice|0666)
+	master.dentry.Init(master)
+
+	// Add the master as a child of the root.
+	links := root.OrderedChildren.Populate(&root.dentry, map[string]*kernfs.Dentry{
+		"ptmx": &master.dentry,
+	})
+	root.IncLinks(links)
+
+	return fs, &root.dentry
+}
+
+// rootInode is the root directory inode for the devpts mounts.
+type rootInode struct {
+	kernfs.AlwaysValid
+	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.OrderedChildren
+
+	// Keep a reference to this inode's dentry.
+	dentry kernfs.Dentry
+
+	// master is the master pty inode. Immutable.
+	master *masterInode
+
+	// root is the root directory inode for this filesystem. Immutable.
+	root *rootInode
+
+	// mu protects the fields below.
+	mu sync.Mutex
+
+	// slaves maps pty ids to slave inodes.
+	slaves map[uint32]*slaveInode
+
+	// nextIdx is the next pty index to use. Must be accessed atomically.
+	//
+	// TODO(b/29356795): reuse indices when ptys are closed.
+	nextIdx uint32
+}
+
+var _ kernfs.Inode = (*rootInode)(nil)
+
+// allocateTerminal creates a new Terminal and installs a pts node for it.
+func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+	if i.nextIdx == math.MaxUint32 {
+		return nil, syserror.ENOMEM
+	}
+	idx := i.nextIdx
+	i.nextIdx++
+
+	// Sanity check that slave with idx does not exist.
+	if _, ok := i.slaves[idx]; ok {
+		panic(fmt.Sprintf("pty index collision; index %d already exists", idx))
+	}
+
+	// Create the new terminal and slave.
+	t := newTerminal(idx)
+	slave := &slaveInode{
+		root: i,
+		t:    t,
+	}
+	// Linux always uses pty index + 3 as the inode id. See
+	// fs/devpts/inode.c:devpts_pty_new().
+	slave.InodeAttrs.Init(creds, uint64(idx+3), linux.ModeCharacterDevice|0600)
+	slave.dentry.Init(slave)
+	i.slaves[idx] = slave
+
+	return t, nil
+}
+
+// masterClose is called when the master end of t is closed.
+func (i *rootInode) masterClose(t *Terminal) {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+
+	// Sanity check that slave with idx exists.
+	if _, ok := i.slaves[t.n]; !ok {
+		panic(fmt.Sprintf("pty with index %d does not exist", t.n))
+	}
+	delete(i.slaves, t.n)
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *rootInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &kernfs.GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	return fd.VFSFileDescription(), nil
+}
+
+// Lookup implements kernfs.Inode.Lookup.
+func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	idx, err := strconv.ParseUint(name, 10, 32)
+	if err != nil {
+		return nil, syserror.ENOENT
+	}
+	i.mu.Lock()
+	defer i.mu.Unlock()
+	if si, ok := i.slaves[uint32(idx)]; ok {
+		si.dentry.IncRef()
+		return si.dentry.VFSDentry(), nil
+
+	}
+	return nil, syserror.ENOENT
+}
+
+// IterDirents implements kernfs.Inode.IterDirents.
+func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+	ids := make([]int, 0, len(i.slaves))
+	for id := range i.slaves {
+		ids = append(ids, int(id))
+	}
+	sort.Ints(ids)
+	for _, id := range ids[relOffset:] {
+		dirent := vfs.Dirent{
+			Name:    strconv.FormatUint(uint64(id), 10),
+			Type:    linux.DT_CHR,
+			Ino:     i.slaves[uint32(id)].InodeAttrs.Ino(),
+			NextOff: offset + 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
+		}
+		offset++
+	}
+	return offset, nil
+}
diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go
new file mode 100644
index 000000000..b7c149047
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/devpts_test.go
@@ -0,0 +1,56 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func TestSimpleMasterToSlave(t *testing.T) {
+	ld := newLineDiscipline(linux.DefaultSlaveTermios)
+	ctx := contexttest.Context(t)
+	inBytes := []byte("hello, tty\n")
+	src := usermem.BytesIOSequence(inBytes)
+	outBytes := make([]byte, 32)
+	dst := usermem.BytesIOSequence(outBytes)
+
+	// Write to the input queue.
+	nw, err := ld.inputQueueWrite(ctx, src)
+	if err != nil {
+		t.Fatalf("error writing to input queue: %v", err)
+	}
+	if nw != int64(len(inBytes)) {
+		t.Fatalf("wrote wrong length: got %d, want %d", nw, len(inBytes))
+	}
+
+	// Read from the input queue.
+	nr, err := ld.inputQueueRead(ctx, dst)
+	if err != nil {
+		t.Fatalf("error reading from input queue: %v", err)
+	}
+	if nr != int64(len(inBytes)) {
+		t.Fatalf("read wrong length: got %d, want %d", nr, len(inBytes))
+	}
+
+	outStr := string(outBytes[:nr])
+	inStr := string(inBytes)
+	if outStr != inStr {
+		t.Fatalf("written and read strings do not match: got %q, want %q", outStr, inStr)
+	}
+}
diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go
new file mode 100644
index 000000000..e201801d6
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/line_discipline.go
@@ -0,0 +1,449 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"bytes"
+	"unicode/utf8"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// LINT.IfChange
+
+const (
+	// canonMaxBytes is the number of bytes that fit into a single line of
+	// terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE
+	// in include/linux/tty.h.
+	canonMaxBytes = 4096
+
+	// nonCanonMaxBytes is the maximum number of bytes that can be read at
+	// a time in noncanonical mode.
+	nonCanonMaxBytes = canonMaxBytes - 1
+
+	spacesPerTab = 8
+)
+
+// lineDiscipline dictates how input and output are handled between the
+// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
+// pages are good resources for how to affect the line discipline:
+//
+//   * termios(3)
+//   * tty_ioctl(4)
+//
+// This file corresponds most closely to drivers/tty/n_tty.c.
+//
+// lineDiscipline has a simple structure but supports a multitude of options
+// (see the above man pages). It consists of two queues of bytes: one from the
+// terminal master to slave (the input queue) and one from slave to master (the
+// output queue). When bytes are written to one end of the pty, the line
+// discipline reads the bytes, modifies them or takes special action if
+// required, and enqueues them to be read by the other end of the pty:
+//
+//       input from terminal    +-------------+   input to process (e.g. bash)
+//    +------------------------>| input queue |---------------------------+
+//    |   (inputQueueWrite)     +-------------+     (inputQueueRead)      |
+//    |                                                                   |
+//    |                                                                   v
+// masterFD                                                            slaveFD
+//    ^                                                                   |
+//    |                                                                   |
+//    |   output to terminal   +--------------+    output from process    |
+//    +------------------------| output queue |<--------------------------+
+//        (outputQueueRead)    +--------------+    (outputQueueWrite)
+//
+// Lock order:
+//  termiosMu
+//    inQueue.mu
+//      outQueue.mu
+//
+// +stateify savable
+type lineDiscipline struct {
+	// sizeMu protects size.
+	sizeMu sync.Mutex `state:"nosave"`
+
+	// size is the terminal size (width and height).
+	size linux.WindowSize
+
+	// inQueue is the input queue of the terminal.
+	inQueue queue
+
+	// outQueue is the output queue of the terminal.
+	outQueue queue
+
+	// termiosMu protects termios.
+	termiosMu sync.RWMutex `state:"nosave"`
+
+	// termios is the terminal configuration used by the lineDiscipline.
+	termios linux.KernelTermios
+
+	// column is the location in a row of the cursor. This is important for
+	// handling certain special characters like backspace.
+	column int
+
+	// masterWaiter is used to wait on the master end of the TTY.
+	masterWaiter waiter.Queue `state:"zerovalue"`
+
+	// slaveWaiter is used to wait on the slave end of the TTY.
+	slaveWaiter waiter.Queue `state:"zerovalue"`
+}
+
+func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
+	ld := lineDiscipline{termios: termios}
+	ld.inQueue.transformer = &inputQueueTransformer{}
+	ld.outQueue.transformer = &outputQueueTransformer{}
+	return &ld
+}
+
+// getTermios gets the linux.Termios for the tty.
+func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	// We must copy a Termios struct, not KernelTermios.
+	t := l.termios.ToTermios()
+	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return 0, err
+}
+
+// setTermios sets a linux.Termios for the tty.
+func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	l.termiosMu.Lock()
+	defer l.termiosMu.Unlock()
+	oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
+	// We must copy a Termios struct, not KernelTermios.
+	var t linux.Termios
+	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	l.termios.FromTermios(t)
+
+	// If canonical mode is turned off, move bytes from inQueue's wait
+	// buffer to its read buffer. Anything already in the read buffer is
+	// now readable.
+	if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) {
+		l.inQueue.mu.Lock()
+		l.inQueue.pushWaitBufLocked(l)
+		l.inQueue.readable = true
+		l.inQueue.mu.Unlock()
+		l.slaveWaiter.Notify(waiter.EventIn)
+	}
+
+	return 0, err
+}
+
+func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	l.sizeMu.Lock()
+	defer l.sizeMu.Unlock()
+	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return err
+}
+
+func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	l.sizeMu.Lock()
+	defer l.sizeMu.Unlock()
+	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return err
+}
+
+func (l *lineDiscipline) masterReadiness() waiter.EventMask {
+	// We don't have to lock a termios because the default master termios
+	// is immutable.
+	return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
+}
+
+func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
+}
+
+func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	return l.inQueue.readableSize(ctx, io, args)
+}
+
+func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	n, pushed, err := l.inQueue.read(ctx, dst, l)
+	if err != nil {
+		return 0, err
+	}
+	if n > 0 {
+		l.masterWaiter.Notify(waiter.EventOut)
+		if pushed {
+			l.slaveWaiter.Notify(waiter.EventIn)
+		}
+		return n, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	n, err := l.inQueue.write(ctx, src, l)
+	if err != nil {
+		return 0, err
+	}
+	if n > 0 {
+		l.slaveWaiter.Notify(waiter.EventIn)
+		return n, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	return l.outQueue.readableSize(ctx, io, args)
+}
+
+func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	n, pushed, err := l.outQueue.read(ctx, dst, l)
+	if err != nil {
+		return 0, err
+	}
+	if n > 0 {
+		l.slaveWaiter.Notify(waiter.EventOut)
+		if pushed {
+			l.masterWaiter.Notify(waiter.EventIn)
+		}
+		return n, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	n, err := l.outQueue.write(ctx, src, l)
+	if err != nil {
+		return 0, err
+	}
+	if n > 0 {
+		l.masterWaiter.Notify(waiter.EventIn)
+		return n, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+// transformer is a helper interface to make it easier to stateify queue.
+type transformer interface {
+	// transform functions require queue's mutex to be held.
+	transform(*lineDiscipline, *queue, []byte) int
+}
+
+// outputQueueTransformer implements transformer. It performs line discipline
+// transformations on the output queue.
+//
+// +stateify savable
+type outputQueueTransformer struct{}
+
+// transform does output processing for one end of the pty. See
+// drivers/tty/n_tty.c:do_output_char for an analogous kernel function.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+// * q.mu must be held.
+func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int {
+	// transformOutput is effectively always in noncanonical mode, as the
+	// master termios never has ICANON set.
+
+	if !l.termios.OEnabled(linux.OPOST) {
+		q.readBuf = append(q.readBuf, buf...)
+		if len(q.readBuf) > 0 {
+			q.readable = true
+		}
+		return len(buf)
+	}
+
+	var ret int
+	for len(buf) > 0 {
+		size := l.peek(buf)
+		cBytes := append([]byte{}, buf[:size]...)
+		ret += size
+		buf = buf[size:]
+		// We're guaranteed that cBytes has at least one element.
+		switch cBytes[0] {
+		case '\n':
+			if l.termios.OEnabled(linux.ONLRET) {
+				l.column = 0
+			}
+			if l.termios.OEnabled(linux.ONLCR) {
+				q.readBuf = append(q.readBuf, '\r', '\n')
+				continue
+			}
+		case '\r':
+			if l.termios.OEnabled(linux.ONOCR) && l.column == 0 {
+				continue
+			}
+			if l.termios.OEnabled(linux.OCRNL) {
+				cBytes[0] = '\n'
+				if l.termios.OEnabled(linux.ONLRET) {
+					l.column = 0
+				}
+				break
+			}
+			l.column = 0
+		case '\t':
+			spaces := spacesPerTab - l.column%spacesPerTab
+			if l.termios.OutputFlags&linux.TABDLY == linux.XTABS {
+				l.column += spaces
+				q.readBuf = append(q.readBuf, bytes.Repeat([]byte{' '}, spacesPerTab)...)
+				continue
+			}
+			l.column += spaces
+		case '\b':
+			if l.column > 0 {
+				l.column--
+			}
+		default:
+			l.column++
+		}
+		q.readBuf = append(q.readBuf, cBytes...)
+	}
+	if len(q.readBuf) > 0 {
+		q.readable = true
+	}
+	return ret
+}
+
+// inputQueueTransformer implements transformer. It performs line discipline
+// transformations on the input queue.
+//
+// +stateify savable
+type inputQueueTransformer struct{}
+
+// transform does input processing for one end of the pty. Characters read are
+// transformed according to flags set in the termios struct. See
+// drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel
+// function.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+// * q.mu must be held.
+func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int {
+	// If there's a line waiting to be read in canonical mode, don't write
+	// anything else to the read buffer.
+	if l.termios.LEnabled(linux.ICANON) && q.readable {
+		return 0
+	}
+
+	maxBytes := nonCanonMaxBytes
+	if l.termios.LEnabled(linux.ICANON) {
+		maxBytes = canonMaxBytes
+	}
+
+	var ret int
+	for len(buf) > 0 && len(q.readBuf) < canonMaxBytes {
+		size := l.peek(buf)
+		cBytes := append([]byte{}, buf[:size]...)
+		// We're guaranteed that cBytes has at least one element.
+		switch cBytes[0] {
+		case '\r':
+			if l.termios.IEnabled(linux.IGNCR) {
+				buf = buf[size:]
+				ret += size
+				continue
+			}
+			if l.termios.IEnabled(linux.ICRNL) {
+				cBytes[0] = '\n'
+			}
+		case '\n':
+			if l.termios.IEnabled(linux.INLCR) {
+				cBytes[0] = '\r'
+			}
+		}
+
+		// In canonical mode, we discard non-terminating characters
+		// after the first 4095.
+		if l.shouldDiscard(q, cBytes) {
+			buf = buf[size:]
+			ret += size
+			continue
+		}
+
+		// Stop if the buffer would be overfilled.
+		if len(q.readBuf)+size > maxBytes {
+			break
+		}
+		buf = buf[size:]
+		ret += size
+
+		// If we get EOF, make the buffer available for reading.
+		if l.termios.LEnabled(linux.ICANON) && l.termios.IsEOF(cBytes[0]) {
+			q.readable = true
+			break
+		}
+
+		q.readBuf = append(q.readBuf, cBytes...)
+
+		// Anything written to the readBuf will have to be echoed.
+		if l.termios.LEnabled(linux.ECHO) {
+			l.outQueue.writeBytes(cBytes, l)
+			l.masterWaiter.Notify(waiter.EventIn)
+		}
+
+		// If we finish a line, make it available for reading.
+		if l.termios.LEnabled(linux.ICANON) && l.termios.IsTerminating(cBytes) {
+			q.readable = true
+			break
+		}
+	}
+
+	// In noncanonical mode, everything is readable.
+	if !l.termios.LEnabled(linux.ICANON) && len(q.readBuf) > 0 {
+		q.readable = true
+	}
+
+	return ret
+}
+
+// shouldDiscard returns whether c should be discarded. In canonical mode, if
+// too many bytes are enqueued, we keep reading input and discarding it until
+// we find a terminating character. Signal/echo processing still occurs.
+//
+// Precondition:
+// * l.termiosMu must be held for reading.
+// * q.mu must be held.
+func (l *lineDiscipline) shouldDiscard(q *queue, cBytes []byte) bool {
+	return l.termios.LEnabled(linux.ICANON) && len(q.readBuf)+len(cBytes) >= canonMaxBytes && !l.termios.IsTerminating(cBytes)
+}
+
+// peek returns the size in bytes of the next character to process. As long as
+// b isn't empty, peek returns a value of at least 1.
+func (l *lineDiscipline) peek(b []byte) int {
+	size := 1
+	// If UTF-8 support is enabled, runes might be multiple bytes.
+	if l.termios.IEnabled(linux.IUTF8) {
+		_, size = utf8.DecodeRune(b)
+	}
+	return size
+}
+
+// LINT.ThenChange(../../fs/tty/line_discipline.go)
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
new file mode 100644
index 000000000..60340c28e
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -0,0 +1,226 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/unimpl"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// LINT.IfChange
+
+// masterInode is the inode for the master end of the Terminal.
+type masterInode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	// Keep a reference to this inode's dentry.
+	dentry kernfs.Dentry
+
+	// root is the devpts root inode.
+	root *rootInode
+}
+
+var _ kernfs.Inode = (*masterInode)(nil)
+
+// Open implements kernfs.Inode.Open.
+func (mi *masterInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	t, err := mi.root.allocateTerminal(rp.Credentials())
+	if err != nil {
+		return nil, err
+	}
+
+	mi.IncRef()
+	fd := &masterFileDescription{
+		inode: mi,
+		t:     t,
+	}
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		mi.DecRef()
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (mi *masterInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	statx, err := mi.InodeAttrs.Stat(vfsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	statx.Blksize = 1024
+	statx.RdevMajor = linux.TTYAUX_MAJOR
+	statx.RdevMinor = linux.PTMX_MINOR
+	return statx, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat
+func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask&linux.STATX_SIZE != 0 {
+		return syserror.EINVAL
+	}
+	return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
+}
+
+type masterFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+
+	inode *masterInode
+	t     *Terminal
+}
+
+var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (mfd *masterFileDescription) Release() {
+	mfd.inode.root.masterClose(mfd.t)
+	mfd.inode.DecRef()
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (mfd *masterFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	mfd.t.ld.masterWaiter.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (mfd *masterFileDescription) EventUnregister(e *waiter.Entry) {
+	mfd.t.ld.masterWaiter.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (mfd *masterFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return mfd.t.ld.masterReadiness()
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (mfd *masterFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	return mfd.t.ld.outputQueueRead(ctx, dst)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+	return mfd.t.ld.inputQueueWrite(ctx, src)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	switch cmd := args[1].Uint(); cmd {
+	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
+		// Get the number of bytes in the output queue read buffer.
+		return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args)
+	case linux.TCGETS:
+		// N.B. TCGETS on the master actually returns the configuration
+		// of the slave end.
+		return mfd.t.ld.getTermios(ctx, io, args)
+	case linux.TCSETS:
+		// N.B. TCSETS on the master actually affects the configuration
+		// of the slave end.
+		return mfd.t.ld.setTermios(ctx, io, args)
+	case linux.TCSETSW:
+		// TODO(b/29356795): This should drain the output queue first.
+		return mfd.t.ld.setTermios(ctx, io, args)
+	case linux.TIOCGPTN:
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+	case linux.TIOCSPTLCK:
+		// TODO(b/29356795): Implement pty locking. For now just pretend we do.
+		return 0, nil
+	case linux.TIOCGWINSZ:
+		return 0, mfd.t.ld.windowSize(ctx, io, args)
+	case linux.TIOCSWINSZ:
+		return 0, mfd.t.ld.setWindowSize(ctx, io, args)
+	case linux.TIOCSCTTY:
+		// Make the given terminal the controlling terminal of the
+		// calling process.
+		return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+	default:
+		maybeEmitUnimplementedEvent(ctx, cmd)
+		return 0, syserror.ENOTTY
+	}
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return mfd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return mfd.inode.Stat(fs, opts)
+}
+
+// maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid.
+func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
+	switch cmd {
+	case linux.TCGETS,
+		linux.TCSETS,
+		linux.TCSETSW,
+		linux.TCSETSF,
+		linux.TIOCGWINSZ,
+		linux.TIOCSWINSZ,
+		linux.TIOCSETD,
+		linux.TIOCSBRK,
+		linux.TIOCCBRK,
+		linux.TCSBRK,
+		linux.TCSBRKP,
+		linux.TIOCSTI,
+		linux.TIOCCONS,
+		linux.FIONBIO,
+		linux.TIOCEXCL,
+		linux.TIOCNXCL,
+		linux.TIOCGEXCL,
+		linux.TIOCGSID,
+		linux.TIOCGETD,
+		linux.TIOCVHANGUP,
+		linux.TIOCGDEV,
+		linux.TIOCMGET,
+		linux.TIOCMSET,
+		linux.TIOCMBIC,
+		linux.TIOCMBIS,
+		linux.TIOCGICOUNT,
+		linux.TCFLSH,
+		linux.TIOCSSERIAL,
+		linux.TIOCGPTPEER:
+
+		unimpl.EmitUnimplementedEvent(ctx)
+	}
+}
+
+// LINT.ThenChange(../../fs/tty/master.go)
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
new file mode 100644
index 000000000..29a6be858
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -0,0 +1,240 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// LINT.IfChange
+
+// waitBufMaxBytes is the maximum size of a wait buffer. It is based on
+// TTYB_DEFAULT_MEM_LIMIT.
+const waitBufMaxBytes = 131072
+
+// queue represents one of the input or output queues between a pty master and
+// slave. Bytes written to a queue are added to the read buffer until it is
+// full, at which point they are written to the wait buffer. Bytes are
+// processed (i.e. undergo termios transformations) as they are added to the
+// read buffer. The read buffer is readable when its length is nonzero and
+// readable is true.
+//
+// +stateify savable
+type queue struct {
+	// mu protects everything in queue.
+	mu sync.Mutex `state:"nosave"`
+
+	// readBuf is buffer of data ready to be read when readable is true.
+	// This data has been processed.
+	readBuf []byte
+
+	// waitBuf contains data that can't fit into readBuf. It is put here
+	// until it can be loaded into the read buffer. waitBuf contains data
+	// that hasn't been processed.
+	waitBuf    [][]byte
+	waitBufLen uint64
+
+	// readable indicates whether the read buffer can be read from.  In
+	// canonical mode, there can be an unterminated line in the read buffer,
+	// so readable must be checked.
+	readable bool
+
+	// transform is the the queue's function for transforming bytes
+	// entering the queue. For example, transform might convert all '\r's
+	// entering the queue to '\n's.
+	transformer
+}
+
+// readReadiness returns whether q is ready to be read from.
+func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	if len(q.readBuf) > 0 && q.readable {
+		return waiter.EventIn
+	}
+	return waiter.EventMask(0)
+}
+
+// writeReadiness returns whether q is ready to be written to.
+func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	if q.waitBufLen < waitBufMaxBytes {
+		return waiter.EventOut
+	}
+	return waiter.EventMask(0)
+}
+
+// readableSize writes the number of readable bytes to userspace.
+func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	var size int32
+	if q.readable {
+		size = int32(len(q.readBuf))
+	}
+
+	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return err
+
+}
+
+// read reads from q to userspace. It returns the number of bytes read as well
+// as whether the read caused more readable data to become available (whether
+// data was pushed from the wait buffer to the read buffer).
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	if !q.readable {
+		return 0, false, syserror.ErrWouldBlock
+	}
+
+	if dst.NumBytes() > canonMaxBytes {
+		dst = dst.TakeFirst(canonMaxBytes)
+	}
+
+	n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dst safemem.BlockSeq) (uint64, error) {
+		src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(q.readBuf))
+		n, err := safemem.CopySeq(dst, src)
+		if err != nil {
+			return 0, err
+		}
+		q.readBuf = q.readBuf[n:]
+
+		// If we read everything, this queue is no longer readable.
+		if len(q.readBuf) == 0 {
+			q.readable = false
+		}
+
+		return n, nil
+	}))
+	if err != nil {
+		return 0, false, err
+	}
+
+	// Move data from the queue's wait buffer to its read buffer.
+	nPushed := q.pushWaitBufLocked(l)
+
+	return int64(n), nPushed > 0, nil
+}
+
+// write writes to q from userspace.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	// Copy data into the wait buffer.
+	n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(src safemem.BlockSeq) (uint64, error) {
+		copyLen := src.NumBytes()
+		room := waitBufMaxBytes - q.waitBufLen
+		// If out of room, return EAGAIN.
+		if room == 0 && copyLen > 0 {
+			return 0, syserror.ErrWouldBlock
+		}
+		// Cap the size of the wait buffer.
+		if copyLen > room {
+			copyLen = room
+			src = src.TakeFirst64(room)
+		}
+		buf := make([]byte, copyLen)
+
+		// Copy the data into the wait buffer.
+		dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))
+		n, err := safemem.CopySeq(dst, src)
+		if err != nil {
+			return 0, err
+		}
+		q.waitBufAppend(buf)
+
+		return n, nil
+	}))
+	if err != nil {
+		return 0, err
+	}
+
+	// Push data from the wait to the read buffer.
+	q.pushWaitBufLocked(l)
+
+	return n, nil
+}
+
+// writeBytes writes to q from b.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+func (q *queue) writeBytes(b []byte, l *lineDiscipline) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	// Write to the wait buffer.
+	q.waitBufAppend(b)
+	q.pushWaitBufLocked(l)
+}
+
+// pushWaitBufLocked fills the queue's read buffer with data from the wait
+// buffer.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+// * q.mu must be locked.
+func (q *queue) pushWaitBufLocked(l *lineDiscipline) int {
+	if q.waitBufLen == 0 {
+		return 0
+	}
+
+	// Move data from the wait to the read buffer.
+	var total int
+	var i int
+	for i = 0; i < len(q.waitBuf); i++ {
+		n := q.transform(l, q, q.waitBuf[i])
+		total += n
+		if n != len(q.waitBuf[i]) {
+			// The read buffer filled up without consuming the
+			// entire buffer.
+			q.waitBuf[i] = q.waitBuf[i][n:]
+			break
+		}
+	}
+
+	// Update wait buffer based on consumed data.
+	q.waitBuf = q.waitBuf[i:]
+	q.waitBufLen -= uint64(total)
+
+	return total
+}
+
+// Precondition: q.mu must be locked.
+func (q *queue) waitBufAppend(b []byte) {
+	q.waitBuf = append(q.waitBuf, b)
+	q.waitBufLen += uint64(len(b))
+}
+
+// LINT.ThenChange(../../fs/tty/queue.go)
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
new file mode 100644
index 000000000..e7e50d51e
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/slave.go
@@ -0,0 +1,186 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// LINT.IfChange
+
+// slaveInode is the inode for the slave end of the Terminal.
+type slaveInode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	// Keep a reference to this inode's dentry.
+	dentry kernfs.Dentry
+
+	// root is the devpts root inode.
+	root *rootInode
+
+	// t is the connected Terminal.
+	t *Terminal
+}
+
+var _ kernfs.Inode = (*slaveInode)(nil)
+
+// Open implements kernfs.Inode.Open.
+func (si *slaveInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	si.IncRef()
+	fd := &slaveFileDescription{
+		inode: si,
+	}
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		si.DecRef()
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+
+}
+
+// Valid implements kernfs.Inode.Valid.
+func (si *slaveInode) Valid(context.Context) bool {
+	// Return valid if the slave still exists.
+	si.root.mu.Lock()
+	defer si.root.mu.Unlock()
+	_, ok := si.root.slaves[si.t.n]
+	return ok
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (si *slaveInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	statx, err := si.InodeAttrs.Stat(vfsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	statx.Blksize = 1024
+	statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR
+	statx.RdevMinor = si.t.n
+	return statx, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat
+func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask&linux.STATX_SIZE != 0 {
+		return syserror.EINVAL
+	}
+	return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
+}
+
+type slaveFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+
+	inode *slaveInode
+}
+
+var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil)
+
+// Release implements fs.FileOperations.Release.
+func (sfd *slaveFileDescription) Release() {
+	sfd.inode.DecRef()
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) {
+	sfd.inode.t.ld.slaveWaiter.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return sfd.inode.t.ld.slaveReadiness()
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	return sfd.inode.t.ld.inputQueueRead(ctx, dst)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+	return sfd.inode.t.ld.outputQueueWrite(ctx, src)
+}
+
+// Ioctl implements vfs.FileDescripionImpl.Ioctl.
+func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	switch cmd := args[1].Uint(); cmd {
+	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
+		// Get the number of bytes in the input queue read buffer.
+		return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args)
+	case linux.TCGETS:
+		return sfd.inode.t.ld.getTermios(ctx, io, args)
+	case linux.TCSETS:
+		return sfd.inode.t.ld.setTermios(ctx, io, args)
+	case linux.TCSETSW:
+		// TODO(b/29356795): This should drain the output queue first.
+		return sfd.inode.t.ld.setTermios(ctx, io, args)
+	case linux.TIOCGPTN:
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+	case linux.TIOCGWINSZ:
+		return 0, sfd.inode.t.ld.windowSize(ctx, io, args)
+	case linux.TIOCSWINSZ:
+		return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args)
+	case linux.TIOCSCTTY:
+		// Make the given terminal the controlling terminal of the
+		// calling process.
+		return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
+	default:
+		maybeEmitUnimplementedEvent(ctx, cmd)
+		return 0, syserror.ENOTTY
+	}
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return sfd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return sfd.inode.Stat(fs, opts)
+}
+
+// LINT.ThenChange(../../fs/tty/slave.go)
diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go
new file mode 100644
index 000000000..b44e673d8
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/terminal.go
@@ -0,0 +1,124 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// LINT.IfChanges
+
+// Terminal is a pseudoterminal.
+//
+// +stateify savable
+type Terminal struct {
+	// n is the terminal index. It is immutable.
+	n uint32
+
+	// ld is the line discipline of the terminal. It is immutable.
+	ld *lineDiscipline
+
+	// masterKTTY contains the controlling process of the master end of
+	// this terminal. This field is immutable.
+	masterKTTY *kernel.TTY
+
+	// slaveKTTY contains the controlling process of the slave end of this
+	// terminal. This field is immutable.
+	slaveKTTY *kernel.TTY
+}
+
+func newTerminal(n uint32) *Terminal {
+	termios := linux.DefaultSlaveTermios
+	t := Terminal{
+		n:          n,
+		ld:         newLineDiscipline(termios),
+		masterKTTY: &kernel.TTY{Index: n},
+		slaveKTTY:  &kernel.TTY{Index: n},
+	}
+	return &t
+}
+
+// setControllingTTY makes tm the controlling terminal of the calling thread
+// group.
+func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("setControllingTTY must be called from a task context")
+	}
+
+	return task.ThreadGroup().SetControllingTTY(tm.tty(isMaster), args[2].Int())
+}
+
+// releaseControllingTTY removes tm as the controlling terminal of the calling
+// thread group.
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("releaseControllingTTY must be called from a task context")
+	}
+
+	return task.ThreadGroup().ReleaseControllingTTY(tm.tty(isMaster))
+}
+
+// foregroundProcessGroup gets the process group ID of tm's foreground process.
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("foregroundProcessGroup must be called from a task context")
+	}
+
+	ret, err := task.ThreadGroup().ForegroundProcessGroup(tm.tty(isMaster))
+	if err != nil {
+		return 0, err
+	}
+
+	// Write it out to *arg.
+	_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return 0, err
+}
+
+// foregroundProcessGroup sets tm's foreground process.
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("setForegroundProcessGroup must be called from a task context")
+	}
+
+	// Read in the process group ID.
+	var pgid int32
+	if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
+		AddressSpaceActive: true,
+	}); err != nil {
+		return 0, err
+	}
+
+	ret, err := task.ThreadGroup().SetForegroundProcessGroup(tm.tty(isMaster), kernel.ProcessGroupID(pgid))
+	return uintptr(ret), err
+}
+
+func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
+	if isMaster {
+		return tm.masterKTTY
+	}
+	return tm.slaveKTTY
+}
+
+// LINT.ThenChange(../../fs/tty/terminal.go)
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index 64f1b142c..142ee53b0 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -163,16 +163,25 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v
 func (a *Accessor) UserspaceInit(ctx context.Context) error {
 	actx := a.wrapContext(ctx)
 
-	// systemd: src/shared/dev-setup.c:dev_setup()
+	// Initialize symlinks.
 	for _, symlink := range []struct {
 		source string
 		target string
 	}{
-		// /proc/kcore is not implemented.
+		// systemd: src/shared/dev-setup.c:dev_setup()
 		{source: "fd", target: "/proc/self/fd"},
 		{source: "stdin", target: "/proc/self/fd/0"},
 		{source: "stdout", target: "/proc/self/fd/1"},
 		{source: "stderr", target: "/proc/self/fd/2"},
+		// /proc/kcore is not implemented.
+
+		// Linux implements /dev/ptmx as a device node, but advises
+		// container implementations to create /dev/ptmx as a symlink
+		// to pts/ptmx (Documentation/filesystems/devpts.txt). Systemd
+		// follows this advice (src/nspawn/nspawn.c:setup_pts()), while
+		// LXC tries to create a bind mount and falls back to a symlink
+		// (src/lxc/conf.c:lxc_setup_devpts()).
+		{source: "ptmx", target: "pts/ptmx"},
 	} {
 		if err := a.vfsObj.SymlinkAt(actx, a.creds, a.pathOperationAt(symlink.source), symlink.target); err != nil {
 			return fmt.Errorf("failed to create symlink %q => %q: %v", symlink.source, symlink.target, err)
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 3164d022c..1d46dba25 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -391,7 +391,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
 	// appropriate bits in rp), but are returned by
 	// FileDescriptionImpl.StatusFlags().
-	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK
 	ats := vfs.AccessTypesForOpenFlags(&opts)
 
 	// Do not create new file.
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 9f526359e..a946645f6 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -216,6 +216,11 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMo
 	atomic.StoreUint32(&a.nlink, nlink)
 }
 
+// Ino returns the inode id.
+func (a *InodeAttrs) Ino() uint64 {
+	return atomic.LoadUint64(&a.ino)
+}
+
 // Mode implements Inode.Mode.
 func (a *InodeAttrs) Mode() linux.FileMode {
 	return linux.FileMode(atomic.LoadUint32(&a.mode))
@@ -359,8 +364,8 @@ func (o *OrderedChildren) Destroy() {
 // cache. Populate returns the number of directories inserted, which the caller
 // may use to update the link count for the parent directory.
 //
-// Precondition: d.Impl() must be a kernfs Dentry. d must represent a directory
-// inode. children must not contain any conflicting entries already in o.
+// Precondition: d must represent a directory inode. children must not contain
+// any conflicting entries already in o.
 func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 {
 	var links uint32
 	for name, child := range children {
-- 
cgit v1.2.3


From f01f2132d8d3e551579cba9a1b942b4b70d83f21 Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Thu, 23 Apr 2020 18:18:54 -0700
Subject: Enable automated marshalling for mempolicy syscalls.

PiperOrigin-RevId: 308170679
---
 pkg/abi/linux/mm.go                        | 17 +++++++++++------
 pkg/sentry/kernel/task.go                  |  2 +-
 pkg/sentry/kernel/task_sched.go            |  4 ++--
 pkg/sentry/mm/mm.go                        |  3 ++-
 pkg/sentry/mm/syscalls.go                  |  4 ++--
 pkg/sentry/syscalls/linux/sys_mempolicy.go | 18 +++++++++---------
 6 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go
index cd043dac3..07cc1895e 100644
--- a/pkg/abi/linux/mm.go
+++ b/pkg/abi/linux/mm.go
@@ -90,14 +90,19 @@ const (
 	MS_SYNC       = 1 << 2
 )
 
+// NumaPolicy is the NUMA memory policy for a memory range. See numa(7).
+//
+// +marshal
+type NumaPolicy int32
+
 // Policies for get_mempolicy(2)/set_mempolicy(2).
 const (
-	MPOL_DEFAULT    = 0
-	MPOL_PREFERRED  = 1
-	MPOL_BIND       = 2
-	MPOL_INTERLEAVE = 3
-	MPOL_LOCAL      = 4
-	MPOL_MAX        = 5
+	MPOL_DEFAULT    NumaPolicy = 0
+	MPOL_PREFERRED  NumaPolicy = 1
+	MPOL_BIND       NumaPolicy = 2
+	MPOL_INTERLEAVE NumaPolicy = 3
+	MPOL_LOCAL      NumaPolicy = 4
+	MPOL_MAX        NumaPolicy = 5
 )
 
 // Flags for get_mempolicy(2).
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index e5d133d6c..f48247c94 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -484,7 +484,7 @@ type Task struct {
 	// bit.
 	//
 	// numaPolicy and numaNodeMask are protected by mu.
-	numaPolicy   int32
+	numaPolicy   linux.NumaPolicy
 	numaNodeMask uint64
 
 	// netns is the task's network namespace. netns is never nil.
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 8b148db35..09366b60c 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -653,14 +653,14 @@ func (t *Task) SetNiceness(n int) {
 }
 
 // NumaPolicy returns t's current numa policy.
-func (t *Task) NumaPolicy() (policy int32, nodeMask uint64) {
+func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 	return t.numaPolicy, t.numaNodeMask
 }
 
 // SetNumaPolicy sets t's numa policy.
-func (t *Task) SetNumaPolicy(policy int32, nodeMask uint64) {
+func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 	t.numaPolicy = policy
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 34d3bde7a..6db7c3d40 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -35,6 +35,7 @@
 package mm
 
 import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
@@ -286,7 +287,7 @@ type vma struct {
 	mlockMode memmap.MLockMode
 
 	// numaPolicy is the NUMA policy for this vma set by mbind().
-	numaPolicy int32
+	numaPolicy linux.NumaPolicy
 
 	// numaNodemask is the NUMA nodemask for this vma set by mbind().
 	numaNodemask uint64
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index c5dfa5972..3f496aa9f 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -974,7 +974,7 @@ func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error
 }
 
 // NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR).
-func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (int32, uint64, error) {
+func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (linux.NumaPolicy, uint64, error) {
 	mm.mappingMu.RLock()
 	defer mm.mappingMu.RUnlock()
 	vseg := mm.vmas.FindSegment(addr)
@@ -986,7 +986,7 @@ func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (int32, uint64, error) {
 }
 
 // SetNumaPolicy implements the semantics of Linux's mbind().
-func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy int32, nodemask uint64) error {
+func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy linux.NumaPolicy, nodemask uint64) error {
 	if !addr.IsPageAligned() {
 		return syserror.EINVAL
 	}
diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go
index ac934dc6f..9b4a5c3f1 100644
--- a/pkg/sentry/syscalls/linux/sys_mempolicy.go
+++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go
@@ -162,10 +162,10 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 			if err != nil {
 				return 0, nil, err
 			}
-			policy = 0 // maxNodes == 1
+			policy = linux.MPOL_DEFAULT // maxNodes == 1
 		}
 		if mode != 0 {
-			if _, err := t.CopyOut(mode, policy); err != nil {
+			if _, err := policy.CopyOut(t, mode); err != nil {
 				return 0, nil, err
 			}
 		}
@@ -199,10 +199,10 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 		if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE {
 			return 0, nil, syserror.EINVAL
 		}
-		policy = 0 // maxNodes == 1
+		policy = linux.MPOL_DEFAULT // maxNodes == 1
 	}
 	if mode != 0 {
-		if _, err := t.CopyOut(mode, policy); err != nil {
+		if _, err := policy.CopyOut(t, mode); err != nil {
 			return 0, nil, err
 		}
 	}
@@ -216,7 +216,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 
 // SetMempolicy implements the syscall set_mempolicy(2).
 func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	modeWithFlags := args[0].Int()
+	modeWithFlags := linux.NumaPolicy(args[0].Int())
 	nodemask := args[1].Pointer()
 	maxnode := args[2].Uint()
 
@@ -233,7 +233,7 @@ func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
 	length := args[1].Uint64()
-	mode := args[2].Int()
+	mode := linux.NumaPolicy(args[2].Int())
 	nodemask := args[3].Pointer()
 	maxnode := args[4].Uint()
 	flags := args[5].Uint()
@@ -258,9 +258,9 @@ func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	return 0, nil, err
 }
 
-func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags int32, nodemask usermem.Addr, maxnode uint32) (int32, uint64, error) {
-	flags := modeWithFlags & linux.MPOL_MODE_FLAGS
-	mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS
+func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nodemask usermem.Addr, maxnode uint32) (linux.NumaPolicy, uint64, error) {
+	flags := linux.NumaPolicy(modeWithFlags & linux.MPOL_MODE_FLAGS)
+	mode := linux.NumaPolicy(modeWithFlags &^ linux.MPOL_MODE_FLAGS)
 	if flags == linux.MPOL_MODE_FLAGS {
 		// Can't specify both mode flags simultaneously.
 		return 0, 0, syserror.EINVAL
-- 
cgit v1.2.3


From 79542417fe97a62ee86aa211ac559bcc5cac5e5e Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Thu, 23 Apr 2020 18:20:43 -0700
Subject: Fix Layer merge and add unit tests

mergo was improperly merging nil and empty strings

PiperOrigin-RevId: 308170862
---
 WORKSPACE                                  |   7 --
 test/packetimpact/testbench/BUILD          |   6 +-
 test/packetimpact/testbench/layers.go      |  44 +++++++-----
 test/packetimpact/testbench/layers_test.go | 109 +++++++++++++++++++++++++++++
 4 files changed, 141 insertions(+), 25 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index 3bf5cc9c1..c86e0fcdc 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -188,13 +188,6 @@ go_repository(
     version = "v0.0.0-20171129191014-dec09d789f3d",
 )
 
-go_repository(
-    name = "com_github_imdario_mergo",
-    importpath = "github.com/imdario/mergo",
-    sum = "h1:CGgOkSJeqMRmt0D9XLWExdT4m4F1vd3FV3VPt+0VxkQ=",
-    version = "v0.3.8",
-)
-
 go_repository(
     name = "com_github_kr_pretty",
     importpath = "github.com/kr/pretty",
diff --git a/test/packetimpact/testbench/BUILD b/test/packetimpact/testbench/BUILD
index b6a254882..3ceceb9d7 100644
--- a/test/packetimpact/testbench/BUILD
+++ b/test/packetimpact/testbench/BUILD
@@ -23,7 +23,6 @@ go_library(
         "//test/packetimpact/proto:posix_server_go_proto",
         "@com_github_google_go-cmp//cmp:go_default_library",
         "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
-        "@com_github_imdario_mergo//:go_default_library",
         "@com_github_mohae_deepcopy//:go_default_library",
         "@org_golang_google_grpc//:go_default_library",
         "@org_golang_google_grpc//keepalive:go_default_library",
@@ -37,5 +36,8 @@ go_test(
     size = "small",
     srcs = ["layers_test.go"],
     library = ":testbench",
-    deps = ["//pkg/tcpip"],
+    deps = [
+        "//pkg/tcpip",
+        "@com_github_mohae_deepcopy//:go_default_library",
+    ],
 )
diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go
index 5ce324f0d..01e99567d 100644
--- a/test/packetimpact/testbench/layers.go
+++ b/test/packetimpact/testbench/layers.go
@@ -22,7 +22,6 @@ import (
 
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
-	"github.com/imdario/mergo"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -111,13 +110,31 @@ func equalLayer(x, y Layer) bool {
 	return cmp.Equal(x, y, opt, cmpopts.IgnoreTypes(LayerBase{}))
 }
 
-// mergeLayer merges other in layer. Any non-nil value in other overrides the
-// corresponding value in layer. If other is nil, no action is performed.
-func mergeLayer(layer, other Layer) error {
-	if other == nil {
+// mergeLayer merges y into x. Any fields for which y has a non-nil value, that
+// value overwrite the corresponding fields in x.
+func mergeLayer(x, y Layer) error {
+	if y == nil {
 		return nil
 	}
-	return mergo.Merge(layer, other, mergo.WithOverride)
+	if reflect.TypeOf(x) != reflect.TypeOf(y) {
+		return fmt.Errorf("can't merge %T into %T", y, x)
+	}
+	vx := reflect.ValueOf(x).Elem()
+	vy := reflect.ValueOf(y).Elem()
+	t := vy.Type()
+	for i := 0; i < vy.NumField(); i++ {
+		t := t.Field(i)
+		if t.Anonymous {
+			// Ignore the LayerBase in the Layer struct.
+			continue
+		}
+		v := vy.Field(i)
+		if v.IsNil() {
+			continue
+		}
+		vx.Field(i).Set(v)
+	}
+	return nil
 }
 
 func stringLayer(l Layer) string {
@@ -243,8 +260,7 @@ func (l *Ether) length() int {
 	return header.EthernetMinimumSize
 }
 
-// merge overrides the values in l with the values from other but only in fields
-// where the value is not nil.
+// merge implements Layer.merge.
 func (l *Ether) merge(other Layer) error {
 	return mergeLayer(l, other)
 }
@@ -399,8 +415,7 @@ func (l *IPv4) length() int {
 	return int(*l.IHL)
 }
 
-// merge overrides the values in l with the values from other but only in fields
-// where the value is not nil.
+// merge implements Layer.merge.
 func (l *IPv4) merge(other Layer) error {
 	return mergeLayer(l, other)
 }
@@ -544,8 +559,7 @@ func (l *TCP) length() int {
 	return int(*l.DataOffset)
 }
 
-// merge overrides the values in l with the values from other but only in fields
-// where the value is not nil.
+// merge implements Layer.merge.
 func (l *TCP) merge(other Layer) error {
 	return mergeLayer(l, other)
 }
@@ -622,8 +636,7 @@ func (l *UDP) length() int {
 	return int(*l.Length)
 }
 
-// merge overrides the values in l with the values from other but only in fields
-// where the value is not nil.
+// merge implements Layer.merge.
 func (l *UDP) merge(other Layer) error {
 	return mergeLayer(l, other)
 }
@@ -659,8 +672,7 @@ func (l *Payload) length() int {
 	return len(l.Bytes)
 }
 
-// merge overrides the values in l with the values from other but only in fields
-// where the value is not nil.
+// merge implements Layer.merge.
 func (l *Payload) merge(other Layer) error {
 	return mergeLayer(l, other)
 }
diff --git a/test/packetimpact/testbench/layers_test.go b/test/packetimpact/testbench/layers_test.go
index c99cf6312..f07ec5eb2 100644
--- a/test/packetimpact/testbench/layers_test.go
+++ b/test/packetimpact/testbench/layers_test.go
@@ -17,6 +17,7 @@ package testbench
 import (
 	"testing"
 
+	"github.com/mohae/deepcopy"
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
@@ -52,6 +53,114 @@ func TestLayerMatch(t *testing.T) {
 	}
 }
 
+func TestLayerMergeMismatch(t *testing.T) {
+	tcp := &TCP{}
+	otherTCP := &TCP{}
+	ipv4 := &IPv4{}
+	ether := &Ether{}
+	for _, tt := range []struct {
+		a, b    Layer
+		success bool
+	}{
+		{tcp, tcp, true},
+		{tcp, otherTCP, true},
+		{tcp, ipv4, false},
+		{tcp, ether, false},
+		{tcp, nil, true},
+
+		{otherTCP, otherTCP, true},
+		{otherTCP, ipv4, false},
+		{otherTCP, ether, false},
+		{otherTCP, nil, true},
+
+		{ipv4, ipv4, true},
+		{ipv4, ether, false},
+		{ipv4, nil, true},
+
+		{ether, ether, true},
+		{ether, nil, true},
+	} {
+		if err := tt.a.merge(tt.b); (err == nil) != tt.success {
+			t.Errorf("%s.merge(%s) got %s, wanted the opposite", tt.a, tt.b, err)
+		}
+		if tt.b != nil {
+			if err := tt.b.merge(tt.a); (err == nil) != tt.success {
+				t.Errorf("%s.merge(%s) got %s, wanted the opposite", tt.b, tt.a, err)
+			}
+		}
+	}
+}
+
+func TestLayerMerge(t *testing.T) {
+	zero := Uint32(0)
+	one := Uint32(1)
+	two := Uint32(2)
+	empty := []byte{}
+	foo := []byte("foo")
+	bar := []byte("bar")
+	for _, tt := range []struct {
+		a, b Layer
+		want Layer
+	}{
+		{&TCP{AckNum: nil}, &TCP{AckNum: nil}, &TCP{AckNum: nil}},
+		{&TCP{AckNum: nil}, &TCP{AckNum: zero}, &TCP{AckNum: zero}},
+		{&TCP{AckNum: nil}, &TCP{AckNum: one}, &TCP{AckNum: one}},
+		{&TCP{AckNum: nil}, &TCP{AckNum: two}, &TCP{AckNum: two}},
+		{&TCP{AckNum: nil}, nil, &TCP{AckNum: nil}},
+
+		{&TCP{AckNum: zero}, &TCP{AckNum: nil}, &TCP{AckNum: zero}},
+		{&TCP{AckNum: zero}, &TCP{AckNum: zero}, &TCP{AckNum: zero}},
+		{&TCP{AckNum: zero}, &TCP{AckNum: one}, &TCP{AckNum: one}},
+		{&TCP{AckNum: zero}, &TCP{AckNum: two}, &TCP{AckNum: two}},
+		{&TCP{AckNum: zero}, nil, &TCP{AckNum: zero}},
+
+		{&TCP{AckNum: one}, &TCP{AckNum: nil}, &TCP{AckNum: one}},
+		{&TCP{AckNum: one}, &TCP{AckNum: zero}, &TCP{AckNum: zero}},
+		{&TCP{AckNum: one}, &TCP{AckNum: one}, &TCP{AckNum: one}},
+		{&TCP{AckNum: one}, &TCP{AckNum: two}, &TCP{AckNum: two}},
+		{&TCP{AckNum: one}, nil, &TCP{AckNum: one}},
+
+		{&TCP{AckNum: two}, &TCP{AckNum: nil}, &TCP{AckNum: two}},
+		{&TCP{AckNum: two}, &TCP{AckNum: zero}, &TCP{AckNum: zero}},
+		{&TCP{AckNum: two}, &TCP{AckNum: one}, &TCP{AckNum: one}},
+		{&TCP{AckNum: two}, &TCP{AckNum: two}, &TCP{AckNum: two}},
+		{&TCP{AckNum: two}, nil, &TCP{AckNum: two}},
+
+		{&Payload{Bytes: nil}, &Payload{Bytes: nil}, &Payload{Bytes: nil}},
+		{&Payload{Bytes: nil}, &Payload{Bytes: empty}, &Payload{Bytes: empty}},
+		{&Payload{Bytes: nil}, &Payload{Bytes: foo}, &Payload{Bytes: foo}},
+		{&Payload{Bytes: nil}, &Payload{Bytes: bar}, &Payload{Bytes: bar}},
+		{&Payload{Bytes: nil}, nil, &Payload{Bytes: nil}},
+
+		{&Payload{Bytes: empty}, &Payload{Bytes: nil}, &Payload{Bytes: empty}},
+		{&Payload{Bytes: empty}, &Payload{Bytes: empty}, &Payload{Bytes: empty}},
+		{&Payload{Bytes: empty}, &Payload{Bytes: foo}, &Payload{Bytes: foo}},
+		{&Payload{Bytes: empty}, &Payload{Bytes: bar}, &Payload{Bytes: bar}},
+		{&Payload{Bytes: empty}, nil, &Payload{Bytes: empty}},
+
+		{&Payload{Bytes: foo}, &Payload{Bytes: nil}, &Payload{Bytes: foo}},
+		{&Payload{Bytes: foo}, &Payload{Bytes: empty}, &Payload{Bytes: empty}},
+		{&Payload{Bytes: foo}, &Payload{Bytes: foo}, &Payload{Bytes: foo}},
+		{&Payload{Bytes: foo}, &Payload{Bytes: bar}, &Payload{Bytes: bar}},
+		{&Payload{Bytes: foo}, nil, &Payload{Bytes: foo}},
+
+		{&Payload{Bytes: bar}, &Payload{Bytes: nil}, &Payload{Bytes: bar}},
+		{&Payload{Bytes: bar}, &Payload{Bytes: empty}, &Payload{Bytes: empty}},
+		{&Payload{Bytes: bar}, &Payload{Bytes: foo}, &Payload{Bytes: foo}},
+		{&Payload{Bytes: bar}, &Payload{Bytes: bar}, &Payload{Bytes: bar}},
+		{&Payload{Bytes: bar}, nil, &Payload{Bytes: bar}},
+	} {
+		a := deepcopy.Copy(tt.a).(Layer)
+		if err := a.merge(tt.b); err != nil {
+			t.Errorf("%s.merge(%s) = %s, wanted nil", tt.a, tt.b, err)
+			continue
+		}
+		if a.String() != tt.want.String() {
+			t.Errorf("%s.merge(%s) merge result got %s, want %s", tt.a, tt.b, a, tt.want)
+		}
+	}
+}
+
 func TestLayerStringFormat(t *testing.T) {
 	for _, tt := range []struct {
 		name string
-- 
cgit v1.2.3


From 40a712c57cd78c51c9875ae04b5e795113c75e62 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 24 Apr 2020 08:19:11 -0700
Subject: Refactor syscall.Fstat calls in hostfs.

Just call syscall.Fstat directly each time mode/file owner are needed. This
feels more natural than using i.getPermissions().

PiperOrigin-RevId: 308257405
---
 pkg/sentry/fsimpl/host/host.go | 35 +++++++++++++----------------------
 1 file changed, 13 insertions(+), 22 deletions(-)

diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index ae94cfa6e..7847e3cc2 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -169,31 +169,22 @@ func fileFlagsFromHostFD(fd int) (int, error) {
 
 // CheckPermissions implements kernfs.Inode.
 func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
-	mode, uid, gid, err := i.getPermissions()
-	if err != nil {
+	var s syscall.Stat_t
+	if err := syscall.Fstat(i.hostFD, &s); err != nil {
 		return err
 	}
-	return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid)
+	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid))
 }
 
 // Mode implements kernfs.Inode.
 func (i *inode) Mode() linux.FileMode {
-	mode, _, _, err := i.getPermissions()
-	// Retrieving the mode from the host fd using fstat(2) should not fail.
-	// If the syscall does not succeed, something is fundamentally wrong.
-	if err != nil {
-		panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err))
-	}
-	return linux.FileMode(mode)
-}
-
-func (i *inode) getPermissions() (linux.FileMode, auth.KUID, auth.KGID, error) {
-	// Retrieve metadata.
 	var s syscall.Stat_t
 	if err := syscall.Fstat(i.hostFD, &s); err != nil {
-		return 0, 0, 0, err
+		// Retrieving the mode from the host fd using fstat(2) should not fail.
+		// If the syscall does not succeed, something is fundamentally wrong.
+		panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err))
 	}
-	return linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid), nil
+	return linux.FileMode(s.Mode)
 }
 
 // Stat implements kernfs.Inode.
@@ -326,11 +317,11 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 	if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
 		return syserror.EPERM
 	}
-	mode, uid, gid, err := i.getPermissions()
-	if err != nil {
+	var hostStat syscall.Stat_t
+	if err := syscall.Fstat(i.hostFD, &hostStat); err != nil {
 		return err
 	}
-	if err := vfs.CheckSetStat(ctx, creds, &s, mode.Permissions(), uid, gid); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, &s, linux.FileMode(hostStat.Mode&linux.PermissionsMask), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
 		return err
 	}
 
@@ -374,11 +365,11 @@ func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptio
 }
 
 func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) {
-	mode, _, _, err := i.getPermissions()
-	if err != nil {
+	var s syscall.Stat_t
+	if err := syscall.Fstat(i.hostFD, &s); err != nil {
 		return nil, err
 	}
-	fileType := mode.FileType()
+	fileType := s.Mode & linux.FileTypeMask
 	if fileType == syscall.S_IFSOCK {
 		if i.isTTY {
 			return nil, errors.New("cannot use host socket as TTY")
-- 
cgit v1.2.3


From 1b88c63b3e6b330c8399bf92f148cc80374bee18 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 24 Apr 2020 10:02:22 -0700
Subject: Move hostfs mount to Kernel struct.

This is needed to set up host fds passed through a Unix socket. Note that
the host package depends on kernel, so we cannot set up the hostfs mount
directly in Kernel.Init as we do for sockfs and pipefs.

Also, adjust sockfs to make its setup look more like hostfs's and pipefs's.

PiperOrigin-RevId: 308274053
---
 pkg/sentry/fsimpl/host/host.go     | 16 +++++++--------
 pkg/sentry/fsimpl/sockfs/sockfs.go | 26 ++++++++++-------------
 pkg/sentry/kernel/kernel.go        | 42 ++++++++++++++++++++++++++++----------
 runsc/boot/fds.go                  |  7 +------
 runsc/boot/loader.go               | 13 ++++++++++++
 5 files changed, 64 insertions(+), 40 deletions(-)

diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 7847e3cc2..a26b13067 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -42,7 +42,7 @@ type filesystemType struct{}
 
 // GetFilesystem implements FilesystemType.GetFilesystem.
 func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	panic("cannot instaniate a host filesystem")
+	panic("host.filesystemType.GetFilesystem should never be called")
 }
 
 // Name implements FilesystemType.Name.
@@ -55,14 +55,14 @@ type filesystem struct {
 	kernfs.Filesystem
 }
 
-// NewMount returns a new disconnected mount in vfsObj that may be passed to ImportFD.
-func NewMount(vfsObj *vfs.VirtualFilesystem) (*vfs.Mount, error) {
+// NewFilesystem sets up and returns a new hostfs filesystem.
+//
+// Note that there should only ever be one instance of host.filesystem,
+// a global mount for host fds.
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem {
 	fs := &filesystem{}
-	fs.Init(vfsObj, &filesystemType{})
-	vfsfs := fs.VFSFilesystem()
-	// NewDisconnectedMount will take an additional reference on vfsfs.
-	defer vfsfs.DecRef()
-	return vfsObj.NewDisconnectedMount(vfsfs, nil, &vfs.MountOptions{})
+	fs.Init(vfsObj, filesystemType{})
+	return fs.VFSFilesystem()
 }
 
 // ImportFD sets up and returns a vfs.FileDescription from a donated fd.
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
index 3f7ad1d65..632cfde88 100644
--- a/pkg/sentry/fsimpl/sockfs/sockfs.go
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -24,26 +24,12 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// NewFilesystem creates a new sockfs filesystem.
-//
-// Note that there should only ever be one instance of sockfs.Filesystem,
-// backing a global socket mount.
-func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem {
-	fs, _, err := filesystemType{}.GetFilesystem(nil, vfsObj, nil, "", vfs.GetFilesystemOptions{})
-	if err != nil {
-		panic("failed to create sockfs filesystem")
-	}
-	return fs
-}
-
 // filesystemType implements vfs.FilesystemType.
 type filesystemType struct{}
 
 // GetFilesystem implements FilesystemType.GetFilesystem.
 func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	fs := &filesystem{}
-	fs.Init(vfsObj, fsType)
-	return fs.VFSFilesystem(), nil, nil
+	panic("sockfs.filesystemType.GetFilesystem should never be called")
 }
 
 // Name implements FilesystemType.Name.
@@ -60,6 +46,16 @@ type filesystem struct {
 	kernfs.Filesystem
 }
 
+// NewFilesystem sets up and returns a new sockfs filesystem.
+//
+// Note that there should only ever be one instance of sockfs.Filesystem,
+// backing a global socket mount.
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem {
+	fs := &filesystem{}
+	fs.Init(vfsObj, filesystemType{})
+	return fs.VFSFilesystem()
+}
+
 // inode implements kernfs.Inode.
 //
 // TODO(gvisor.dev/issue/1476): Add device numbers to this inode (which are
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index fef60e636..c91b9dce2 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -227,11 +227,6 @@ type Kernel struct {
 	// by extMu.
 	nextSocketEntry uint64
 
-	// socketMount is a disconnected vfs.Mount, not included in k.vfs,
-	// representing a sockfs.filesystem. socketMount is used to back
-	// VirtualDentries representing anonymous sockets.
-	socketMount *vfs.Mount
-
 	// deviceRegistry is used to save/restore device.SimpleDevices.
 	deviceRegistry struct{} `state:".(*device.Registry)"`
 
@@ -255,10 +250,22 @@ type Kernel struct {
 	// VFS keeps the filesystem state used across the kernel.
 	vfs vfs.VirtualFilesystem
 
+	// hostMount is the Mount used for file descriptors that were imported
+	// from the host.
+	hostMount *vfs.Mount
+
 	// pipeMount is the Mount used for pipes created by the pipe() and pipe2()
 	// syscalls (as opposed to named pipes created by mknod()).
 	pipeMount *vfs.Mount
 
+	// socketMount is the Mount used for sockets created by the socket() and
+	// socketpair() syscalls. There are several cases where a socket dentry will
+	// not be contained in socketMount:
+	// 1. Socket files created by mknod()
+	// 2. Socket fds imported from the host (Kernel.hostMount is used for these)
+	// 3. Socket files created by binding Unix sockets to a file path
+	socketMount *vfs.Mount
+
 	// If set to true, report address space activation waits as if the task is in
 	// external wait so that the watchdog doesn't report the task stuck.
 	SleepForAddressSpaceActivation bool
@@ -377,7 +384,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 		defer socketFilesystem.DecRef()
 		socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})
 		if err != nil {
-			return fmt.Errorf("failed to initialize socket mount: %v", err)
+			return fmt.Errorf("failed to create sockfs mount: %v", err)
 		}
 		k.socketMount = socketMount
 	}
@@ -1526,11 +1533,6 @@ func (k *Kernel) ListSockets() []*SocketEntry {
 	return socks
 }
 
-// SocketMount returns the global socket mount.
-func (k *Kernel) SocketMount() *vfs.Mount {
-	return k.socketMount
-}
-
 // supervisorContext is a privileged context.
 type supervisorContext struct {
 	context.NoopSleeper
@@ -1629,7 +1631,25 @@ func (k *Kernel) VFS() *vfs.VirtualFilesystem {
 	return &k.vfs
 }
 
+// SetHostMount sets the hostfs mount.
+func (k *Kernel) SetHostMount(mnt *vfs.Mount) {
+	if k.hostMount != nil {
+		panic("Kernel.hostMount cannot be set more than once")
+	}
+	k.hostMount = mnt
+}
+
+// HostMount returns the hostfs mount.
+func (k *Kernel) HostMount() *vfs.Mount {
+	return k.hostMount
+}
+
 // PipeMount returns the pipefs mount.
 func (k *Kernel) PipeMount() *vfs.Mount {
 	return k.pipeMount
 }
+
+// SocketMount returns the sockfs mount.
+func (k *Kernel) SocketMount() *vfs.Mount {
+	return k.socketMount
+}
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
index 7e49f6f9f..0cbd63857 100644
--- a/runsc/boot/fds.go
+++ b/runsc/boot/fds.go
@@ -89,14 +89,9 @@ func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kern
 	fdTable := k.NewFDTable()
 	defer fdTable.DecRef()
 
-	hostMount, err := vfshost.NewMount(k.VFS())
-	if err != nil {
-		return nil, fmt.Errorf("creating host mount: %w", err)
-	}
-
 	for appFD, hostFD := range stdioFDs {
 		// TODO(gvisor.dev/issue/1482): Add TTY support.
-		appFile, err := vfshost.ImportFD(hostMount, hostFD, false)
+		appFile, err := vfshost.ImportFD(k.HostMount(), hostFD, false)
 		if err != nil {
 			return nil, err
 		}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 096b0e9f0..3f41d8357 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -36,6 +36,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
 	"gvisor.dev/gvisor/pkg/sentry/fs/user"
+	vfs2host "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -46,6 +47,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2"
 	"gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -329,6 +331,17 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("creating pod mount hints: %v", err)
 	}
 
+	if kernel.VFS2Enabled {
+		// Set up host mount that will be used for imported fds.
+		hostFilesystem := vfs2host.NewFilesystem(k.VFS())
+		defer hostFilesystem.DecRef()
+		hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
+		if err != nil {
+			return nil, fmt.Errorf("failed to create hostfs mount: %v", err)
+		}
+		k.SetHostMount(hostMount)
+	}
+
 	// Make host FDs stable between invocations. Host FDs must map to the exact
 	// same number when the sandbox is restored. Otherwise the wrong FD will be
 	// used.
-- 
cgit v1.2.3


From 2cc0fd42f462f3942230c4b33ca2825e2a28765d Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 24 Apr 2020 11:43:49 -0700
Subject: Fixes for procfs

- Return ENOENT for /proc/[pid]/task if task is zoombied or terminated
- Allow directory to be Seek() to the end
- Construct synthetic files for /proc/[pid]/ns/*
- Changed GenericDirectoryFD.Init to not register with FileDescription,
  otherwise other implementation cannot change behavior.

Updates #1195,1193

PiperOrigin-RevId: 308294649
---
 pkg/sentry/fsimpl/devpts/devpts.go          |  6 ++-
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go    | 28 +++++++++--
 pkg/sentry/fsimpl/kernfs/filesystem.go      |  4 +-
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go |  8 +--
 pkg/sentry/fsimpl/kernfs/kernfs.go          |  2 +-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go     | 10 ++--
 pkg/sentry/fsimpl/kernfs/symlink.go         |  2 +-
 pkg/sentry/fsimpl/proc/subtasks.go          | 49 ++++++++++++++++++-
 pkg/sentry/fsimpl/proc/task.go              |  7 ++-
 pkg/sentry/fsimpl/proc/task_fds.go          | 14 ++++--
 pkg/sentry/fsimpl/proc/task_files.go        | 76 +++++++++++++++++++++++++++--
 pkg/sentry/fsimpl/proc/tasks.go             |  6 ++-
 pkg/sentry/fsimpl/proc/tasks_files.go       |  4 +-
 pkg/sentry/fsimpl/sys/sys.go                |  4 +-
 pkg/sentry/vfs/vfs.go                       |  8 +++
 15 files changed, 192 insertions(+), 36 deletions(-)

diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index 07a69b940..f36bf50fc 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -161,8 +161,10 @@ func (i *rootInode) masterClose(t *Terminal) {
 
 // Open implements kernfs.Inode.Open.
 func (i *rootInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index e8a4670b8..dd5806301 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -15,6 +15,8 @@
 package kernfs
 
 import (
+	"math"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -43,15 +45,27 @@ type GenericDirectoryFD struct {
 	off      int64
 }
 
-// Init initializes a GenericDirectoryFD.
-func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) error {
+// NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
+// dentry.
+func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
+	fd := &GenericDirectoryFD{}
+	if err := fd.Init(children, opts); err != nil {
+		return nil, err
+	}
+	if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return fd, nil
+}
+
+// Init initializes a GenericDirectoryFD. Use it when overriding
+// GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the
+// correct implementation.
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, opts *vfs.OpenOptions) error {
 	if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
 		// Can't open directories for writing.
 		return syserror.EISDIR
 	}
-	if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
-		return err
-	}
 	fd.children = children
 	return nil
 }
@@ -187,6 +201,10 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
 		// Use offset as given.
 	case linux.SEEK_CUR:
 		offset += fd.off
+	case linux.SEEK_END:
+		// TODO(gvisor.dev/issue/1193): This can prevent new files from showing up
+		// if they are added after SEEK_END.
+		offset = math.MaxInt64
 	default:
 		return 0, syserror.EINVAL
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 1d46dba25..3ccd92fc5 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -82,7 +82,7 @@ afterSymlink:
 	}
 	// Resolve any symlink at current path component.
 	if rp.ShouldFollowSymlink() && next.isSymlink() {
-		targetVD, targetPathname, err := next.inode.Getlink(ctx)
+		targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount())
 		if err != nil {
 			return nil, err
 		}
@@ -477,7 +477,7 @@ afterTrailingSymlink:
 	}
 	child := childVFSD.Impl().(*Dentry)
 	if rp.ShouldFollowSymlink() && child.isSymlink() {
-		targetVD, targetPathname, err := child.inode.Getlink(ctx)
+		targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount())
 		if err != nil {
 			return nil, err
 		}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index a946645f6..02f35a675 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -182,7 +182,7 @@ func (InodeNotSymlink) Readlink(context.Context) (string, error) {
 }
 
 // Getlink implements Inode.Getlink.
-func (InodeNotSymlink) Getlink(context.Context) (vfs.VirtualDentry, string, error) {
+func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) {
 	return vfs.VirtualDentry{}, "", syserror.EINVAL
 }
 
@@ -568,8 +568,10 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.F
 
 // Open implements kernfs.Inode.
 func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
+	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index f5041824f..95cf6dc24 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -439,5 +439,5 @@ type inodeSymlink interface {
 	//
 	// - If the inode is not a symlink, Getlink returns (zero-value
 	// VirtualDentry, "", EINVAL).
-	Getlink(ctx context.Context) (vfs.VirtualDentry, string, error)
+	Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error)
 }
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 465451f35..0964d5456 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -117,8 +117,8 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
 }
 
 func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
@@ -147,8 +147,10 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 }
 
 func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 018aa503c..0aa6dc979 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -56,7 +56,7 @@ func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
 }
 
 // Getlink implements Inode.Getlink.
-func (s *StaticSymlink) Getlink(_ context.Context) (vfs.VirtualDentry, string, error) {
+func (s *StaticSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) {
 	return vfs.VirtualDentry{}, s.target, nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index a21313666..28ec2484a 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -88,6 +88,9 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
 	if len(tasks) == 0 {
 		return offset, syserror.ENOENT
 	}
+	if relOffset >= int64(len(tasks)) {
+		return offset, nil
+	}
 
 	tids := make([]int, 0, len(tasks))
 	for _, tid := range tasks {
@@ -110,10 +113,52 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
 	return offset, nil
 }
 
+type subtasksFD struct {
+	kernfs.GenericDirectoryFD
+
+	task *kernel.Task
+}
+
+func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	if fd.task.ExitState() >= kernel.TaskExitZombie {
+		return syserror.ENOENT
+	}
+	return fd.GenericDirectoryFD.IterDirents(ctx, cb)
+}
+
+// Seek implements vfs.FileDecriptionImpl.Seek.
+func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	if fd.task.ExitState() >= kernel.TaskExitZombie {
+		return 0, syserror.ENOENT
+	}
+	return fd.GenericDirectoryFD.Seek(ctx, offset, whence)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	if fd.task.ExitState() >= kernel.TaskExitZombie {
+		return linux.Statx{}, syserror.ENOENT
+	}
+	return fd.GenericDirectoryFD.Stat(ctx, opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if fd.task.ExitState() >= kernel.TaskExitZombie {
+		return syserror.ENOENT
+	}
+	return fd.GenericDirectoryFD.SetStat(ctx, opts)
+}
+
 // Open implements kernfs.Inode.
 func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd := &subtasksFD{task: i.task}
+	if err := fd.Init(&i.OrderedChildren, &opts); err != nil {
+		return nil, err
+	}
+	if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 888afc0fd..e2790d35b 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -44,6 +44,7 @@ type taskInode struct {
 var _ kernfs.Inode = (*taskInode)(nil)
 
 func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry {
+	// TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited.
 	contents := map[string]*kernfs.Dentry{
 		"auxv":      newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}),
 		"cmdline":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
@@ -102,8 +103,10 @@ func (i *taskInode) Valid(ctx context.Context) bool {
 
 // Open implements kernfs.Inode.
 func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 046265eca..a7622f1b6 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -143,8 +143,10 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 
 // Open implements kernfs.Inode.
 func (i *fdDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
@@ -207,7 +209,7 @@ func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
 	return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry())
 }
 
-func (s *fdSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
 	file, _ := getTaskFD(s.task, s.fd)
 	if file == nil {
 		return vfs.VirtualDentry{}, "", syserror.ENOENT
@@ -268,8 +270,10 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
 
 // Open implements kernfs.Inode.
 func (i *fdInfoDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index f3173e197..410cc3552 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -622,7 +622,7 @@ func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
 }
 
 // Getlink implements kernfs.Inode.Getlink.
-func (s *exeSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
 	if !kernel.ContextCanTrace(ctx, s.task, false) {
 		return vfs.VirtualDentry{}, "", syserror.EACCES
 	}
@@ -754,9 +754,79 @@ func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) {
 }
 
 // Getlink implements Inode.Getlink.
-func (s *namespaceSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
 	if err := checkTaskState(s.task); err != nil {
 		return vfs.VirtualDentry{}, "", err
 	}
-	return s.StaticSymlink.Getlink(ctx)
+
+	// Create a synthetic inode to represent the namespace.
+	dentry := &kernfs.Dentry{}
+	dentry.Init(&namespaceInode{})
+	vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry())
+	vd.IncRef()
+	dentry.DecRef()
+	return vd, "", nil
+}
+
+// namespaceInode is a synthetic inode created to represent a namespace in
+// /proc/[pid]/ns/*.
+type namespaceInode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+}
+
+var _ kernfs.Inode = (*namespaceInode)(nil)
+
+// Init initializes a namespace inode.
+func (i *namespaceInode) Init(creds *auth.Credentials, ino uint64, perm linux.FileMode) {
+	if perm&^linux.PermissionsMask != 0 {
+		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
+	}
+	i.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm)
+}
+
+// Open implements Inode.Open.
+func (i *namespaceInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &namespaceFD{inode: i}
+	i.IncRef()
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// namespace FD is a synthetic file that represents a namespace in
+// /proc/[pid]/ns/*.
+type namespaceFD struct {
+	vfs.FileDescriptionDefaultImpl
+
+	vfsfd vfs.FileDescription
+	inode *namespaceInode
+}
+
+var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
+
+// Stat implements FileDescriptionImpl.
+func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return fd.inode.Stat(vfs, opts)
+}
+
+// SetStat implements FileDescriptionImpl.
+func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+	creds := auth.CredentialsFromContext(ctx)
+	return fd.inode.SetStat(ctx, vfs, creds, opts)
+}
+
+// Release implements FileDescriptionImpl.
+func (fd *namespaceFD) Release() {
+	fd.inode.DecRef()
+}
+
+// OnClose implements FileDescriptionImpl.
+func (*namespaceFD) OnClose(context.Context) error {
+	return nil
 }
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 9f2ef8200..26518ed03 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -202,8 +202,10 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 
 // Open implements kernfs.Inode.
 func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 4621e2de0..92007df81 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -63,7 +63,7 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
 	return strconv.FormatUint(uint64(tgid), 10), nil
 }
 
-func (s *selfSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
 	target, err := s.Readlink(ctx)
 	return vfs.VirtualDentry{}, target, err
 }
@@ -106,7 +106,7 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
 	return fmt.Sprintf("%d/task/%d", tgid, tid), nil
 }
 
-func (s *threadSelfSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
 	target, err := s.Readlink(ctx)
 	return vfs.VirtualDentry{}, target, err
 }
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 5c617270e..34e8e0cbe 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -106,8 +106,8 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set
 
 // Open implements kernfs.Inode.Open.
 func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index cb5bbd781..9015f2cc1 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -797,6 +797,14 @@ type VirtualDentry struct {
 	dentry *Dentry
 }
 
+// MakeVirtualDentry creates a VirtualDentry.
+func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry {
+	return VirtualDentry{
+		mount:  mount,
+		dentry: dentry,
+	}
+}
+
 // Ok returns true if vd is not empty. It does not require that a reference is
 // held.
 func (vd VirtualDentry) Ok() bool {
-- 
cgit v1.2.3


From 632b104aff3fedf7798447eedc5662c973525c66 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 24 Apr 2020 12:36:14 -0700
Subject: Plumb context.Context into kernfs.Inode.Open().

PiperOrigin-RevId: 308304793
---
 pkg/sentry/fsimpl/devpts/devpts.go             |  2 +-
 pkg/sentry/fsimpl/devpts/master.go             |  2 +-
 pkg/sentry/fsimpl/devpts/slave.go              |  2 +-
 pkg/sentry/fsimpl/host/host.go                 | 10 +++++-----
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go |  2 +-
 pkg/sentry/fsimpl/kernfs/filesystem.go         |  8 ++++----
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go    |  4 ++--
 pkg/sentry/fsimpl/kernfs/kernfs.go             |  2 +-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go        |  4 ++--
 pkg/sentry/fsimpl/pipefs/pipefs.go             |  5 ++---
 pkg/sentry/fsimpl/proc/subtasks.go             |  2 +-
 pkg/sentry/fsimpl/proc/task.go                 |  2 +-
 pkg/sentry/fsimpl/proc/task_fds.go             |  4 ++--
 pkg/sentry/fsimpl/proc/task_files.go           |  2 +-
 pkg/sentry/fsimpl/proc/tasks.go                |  2 +-
 pkg/sentry/fsimpl/sockfs/sockfs.go             |  2 +-
 pkg/sentry/fsimpl/sys/sys.go                   |  2 +-
 runsc/boot/fds.go                              |  2 +-
 18 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index f36bf50fc..181d765d3 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -160,7 +160,7 @@ func (i *rootInode) masterClose(t *Terminal) {
 }
 
 // Open implements kernfs.Inode.Open.
-func (i *rootInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 60340c28e..04a292927 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -46,7 +46,7 @@ type masterInode struct {
 var _ kernfs.Inode = (*masterInode)(nil)
 
 // Open implements kernfs.Inode.Open.
-func (mi *masterInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	t, err := mi.root.allocateTerminal(rp.Credentials())
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
index e7e50d51e..0a98dc896 100644
--- a/pkg/sentry/fsimpl/devpts/slave.go
+++ b/pkg/sentry/fsimpl/devpts/slave.go
@@ -48,7 +48,7 @@ type slaveInode struct {
 var _ kernfs.Inode = (*slaveInode)(nil)
 
 // Open implements kernfs.Inode.Open.
-func (si *slaveInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	si.IncRef()
 	fd := &slaveFileDescription{
 		inode: si,
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index a26b13067..1e53b5c1b 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -66,7 +66,7 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem {
 }
 
 // ImportFD sets up and returns a vfs.FileDescription from a donated fd.
-func ImportFD(mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) {
+func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) {
 	fs, ok := mnt.Filesystem().Impl().(*kernfs.Filesystem)
 	if !ok {
 		return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl())
@@ -108,7 +108,7 @@ func ImportFD(mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, err
 	// i.open will take a reference on d.
 	defer d.DecRef()
 
-	return i.open(d.VFSDentry(), mnt)
+	return i.open(ctx, d.VFSDentry(), mnt)
 }
 
 // inode implements kernfs.Inode.
@@ -360,11 +360,11 @@ func (i *inode) Destroy() {
 }
 
 // Open implements kernfs.Inode.
-func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	return i.open(vfsd, rp.Mount())
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	return i.open(ctx, vfsd, rp.Mount())
 }
 
-func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) {
+func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) {
 	var s syscall.Stat_t
 	if err := syscall.Fstat(i.hostFD, &s); err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index d8bddbafa..c7779fc11 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -53,7 +53,7 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.Dy
 }
 
 // Open implements Inode.Open.
-func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &DynamicBytesFD{}
 	if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 3ccd92fc5..9e8d80414 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -406,7 +406,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
-		return inode.Open(rp, vfsd, opts)
+		return inode.Open(ctx, rp, vfsd, opts)
 	}
 
 	// May create new file.
@@ -425,7 +425,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
-		return inode.Open(rp, vfsd, opts)
+		return inode.Open(ctx, rp, vfsd, opts)
 	}
 afterTrailingSymlink:
 	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
@@ -466,7 +466,7 @@ afterTrailingSymlink:
 		}
 		child := childVFSD.Impl().(*Dentry)
 		parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
-		return child.inode.Open(rp, childVFSD, opts)
+		return child.inode.Open(ctx, rp, childVFSD, opts)
 	}
 	if err != nil {
 		return nil, err
@@ -499,7 +499,7 @@ afterTrailingSymlink:
 	if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
-	return child.inode.Open(rp, &child.vfsd, opts)
+	return child.inode.Open(ctx, rp, &child.vfsd, opts)
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 02f35a675..615592d5f 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -525,7 +525,7 @@ type InodeSymlink struct {
 }
 
 // Open implements Inode.Open.
-func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	return nil, syserror.ELOOP
 }
 
@@ -567,7 +567,7 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.F
 }
 
 // Open implements kernfs.Inode.
-func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 95cf6dc24..732837933 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -308,7 +308,7 @@ type Inode interface {
 	//
 	// Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing
 	// the inode on which Open() is being called.
-	Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
+	Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
 }
 
 type inodeRefs interface {
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 0964d5456..a9f671bc8 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -116,7 +116,7 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
 	return &dir.dentry
 }
 
-func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
 	if err != nil {
 		return nil, err
@@ -146,7 +146,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 	return &dir.dentry
 }
 
-func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index faf3179bc..d6bd67467 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -129,9 +129,8 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.
 }
 
 // Open implements kernfs.Inode.Open.
-func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	// FIXME(b/38173783): kernfs does not plumb Context here.
-	return i.pipe.Open(context.Background(), rp.Mount(), vfsd, opts.Flags)
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags)
 }
 
 // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 28ec2484a..a5cfa8333 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -151,7 +151,7 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro
 }
 
 // Open implements kernfs.Inode.
-func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &subtasksFD{task: i.task}
 	if err := fd.Init(&i.OrderedChildren, &opts); err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index e2790d35b..66419d91b 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -102,7 +102,7 @@ func (i *taskInode) Valid(ctx context.Context) bool {
 }
 
 // Open implements kernfs.Inode.
-func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index a7622f1b6..8ad976073 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -142,7 +142,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 }
 
 // Open implements kernfs.Inode.
-func (i *fdDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
 	if err != nil {
 		return nil, err
@@ -269,7 +269,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
 }
 
 // Open implements kernfs.Inode.
-func (i *fdInfoDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 410cc3552..515f25327 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -788,7 +788,7 @@ func (i *namespaceInode) Init(creds *auth.Credentials, ino uint64, perm linux.Fi
 }
 
 // Open implements Inode.Open.
-func (i *namespaceInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &namespaceFD{inode: i}
 	i.IncRef()
 	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 26518ed03..5aeda8c9b 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -201,7 +201,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 }
 
 // Open implements kernfs.Inode.
-func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
index 632cfde88..5ce50625b 100644
--- a/pkg/sentry/fsimpl/sockfs/sockfs.go
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -69,7 +69,7 @@ type inode struct {
 }
 
 // Open implements kernfs.Inode.Open.
-func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	return nil, syserror.ENXIO
 }
 
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 34e8e0cbe..f8d25d35e 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -105,7 +105,7 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set
 }
 
 // Open implements kernfs.Inode.Open.
-func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
 	if err != nil {
 		return nil, err
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
index 0cbd63857..7e7a31fbd 100644
--- a/runsc/boot/fds.go
+++ b/runsc/boot/fds.go
@@ -91,7 +91,7 @@ func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kern
 
 	for appFD, hostFD := range stdioFDs {
 		// TODO(gvisor.dev/issue/1482): Add TTY support.
-		appFile, err := vfshost.ImportFD(k.HostMount(), hostFD, false)
+		appFile, err := vfshost.ImportFD(ctx, k.HostMount(), hostFD, false)
 		if err != nil {
 			return nil, err
 		}
-- 
cgit v1.2.3


From 1ceee045294a6059093851645968f5a7e00a58f3 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 24 Apr 2020 12:45:33 -0700
Subject: Do not copy tcpip.CancellableTimer

A CancellableTimer's AfterFunc timer instance creates a closure over the
CancellableTimer's address. This closure makes a CancellableTimer unsafe
to copy.

No behaviour change, existing tests pass.

PiperOrigin-RevId: 308306664
---
 pkg/tcpip/stack/ndp.go  | 31 +++++++++++++++++++++----------
 pkg/tcpip/timer.go      | 25 ++++++++++++++++++++++---
 pkg/tcpip/timer_test.go | 16 ++++++++--------
 3 files changed, 51 insertions(+), 21 deletions(-)

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 193a9dfde..c11d62f97 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -412,20 +412,33 @@ type dadState struct {
 // defaultRouterState holds data associated with a default router discovered by
 // a Router Advertisement (RA).
 type defaultRouterState struct {
-	invalidationTimer tcpip.CancellableTimer
+	// Timer to invalidate the default router.
+	//
+	// May not be nil.
+	invalidationTimer *tcpip.CancellableTimer
 }
 
 // onLinkPrefixState holds data associated with an on-link prefix discovered by
 // a Router Advertisement's Prefix Information option (PI) when the NDP
 // configurations was configured to do so.
 type onLinkPrefixState struct {
-	invalidationTimer tcpip.CancellableTimer
+	// Timer to invalidate the on-link prefix.
+	//
+	// May not be nil.
+	invalidationTimer *tcpip.CancellableTimer
 }
 
 // slaacPrefixState holds state associated with a SLAAC prefix.
 type slaacPrefixState struct {
-	deprecationTimer  tcpip.CancellableTimer
-	invalidationTimer tcpip.CancellableTimer
+	// Timer to deprecate the prefix.
+	//
+	// May not be nil.
+	deprecationTimer *tcpip.CancellableTimer
+
+	// Timer to invalidate the prefix.
+	//
+	// May not be nil.
+	invalidationTimer *tcpip.CancellableTimer
 
 	// Nonzero only when the address is not valid forever.
 	validUntil time.Time
@@ -775,7 +788,6 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
 	}
 
 	rtr.invalidationTimer.StopLocked()
-
 	delete(ndp.defaultRouters, ip)
 
 	// Let the integrator know a discovered default router is invalidated.
@@ -804,7 +816,7 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
 	}
 
 	state := defaultRouterState{
-		invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+		invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
 			ndp.invalidateDefaultRouter(ip)
 		}),
 	}
@@ -834,7 +846,7 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration)
 	}
 
 	state := onLinkPrefixState{
-		invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+		invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
 			ndp.invalidateOnLinkPrefix(prefix)
 		}),
 	}
@@ -859,7 +871,6 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
 	}
 
 	s.invalidationTimer.StopLocked()
-
 	delete(ndp.onLinkPrefixes, prefix)
 
 	// Let the integrator know a discovered on-link prefix is invalidated.
@@ -979,7 +990,7 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 	}
 
 	state := slaacPrefixState{
-		deprecationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+		deprecationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
 			state, ok := ndp.slaacPrefixes[prefix]
 			if !ok {
 				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the deprecated SLAAC prefix %s", prefix))
@@ -987,7 +998,7 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 
 			ndp.deprecateSLAACAddress(state.ref)
 		}),
-		invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+		invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
 			state, ok := ndp.slaacPrefixes[prefix]
 			if !ok {
 				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the invalidated SLAAC prefix %s", prefix))
diff --git a/pkg/tcpip/timer.go b/pkg/tcpip/timer.go
index 67f66fc72..59f3b391f 100644
--- a/pkg/tcpip/timer.go
+++ b/pkg/tcpip/timer.go
@@ -88,6 +88,9 @@ func (t *cancellableTimerInstance) stop() {
 //
 // The term "related work" is defined as some work that needs to be done while
 // holding some lock that the timer must also hold while doing some work.
+//
+// Note, it is not safe to copy a CancellableTimer as its timer instance creates
+// a closure over the address of the CancellableTimer.
 type CancellableTimer struct {
 	// The active instance of a cancellable timer.
 	instance cancellableTimerInstance
@@ -154,12 +157,28 @@ func (t *CancellableTimer) Reset(d time.Duration) {
 	}
 }
 
-// MakeCancellableTimer returns an unscheduled CancellableTimer with the given
+// Lock is a no-op used by the copylocks checker from go vet.
+//
+// See CancellableTimer for details about why it shouldn't be copied.
+//
+// See https://github.com/golang/go/issues/8005#issuecomment-190753527 for more
+// details about the copylocks checker.
+func (*CancellableTimer) Lock() {}
+
+// Unlock is a no-op used by the copylocks checker from go vet.
+//
+// See CancellableTimer for details about why it shouldn't be copied.
+//
+// See https://github.com/golang/go/issues/8005#issuecomment-190753527 for more
+// details about the copylocks checker.
+func (*CancellableTimer) Unlock() {}
+
+// NewCancellableTimer returns an unscheduled CancellableTimer with the given
 // locker and fn.
 //
 // fn MUST NOT attempt to lock locker.
 //
 // Callers must call Reset to schedule the timer to fire.
-func MakeCancellableTimer(locker sync.Locker, fn func()) CancellableTimer {
-	return CancellableTimer{locker: locker, fn: fn}
+func NewCancellableTimer(locker sync.Locker, fn func()) *CancellableTimer {
+	return &CancellableTimer{locker: locker, fn: fn}
 }
diff --git a/pkg/tcpip/timer_test.go b/pkg/tcpip/timer_test.go
index 730134906..b4940e397 100644
--- a/pkg/tcpip/timer_test.go
+++ b/pkg/tcpip/timer_test.go
@@ -43,7 +43,7 @@ func TestCancellableTimerReassignment(t *testing.T) {
 			// that has an active timer (even if it has been stopped as a stopped
 			// timer may be blocked on a lock before it can check if it has been
 			// stopped while another goroutine holds the same lock).
-			timer = tcpip.MakeCancellableTimer(&lock, func() {
+			timer = *tcpip.NewCancellableTimer(&lock, func() {
 				wg.Done()
 			})
 			timer.Reset(shortDuration)
@@ -59,7 +59,7 @@ func TestCancellableTimerFire(t *testing.T) {
 	ch := make(chan struct{})
 	var lock sync.Mutex
 
-	timer := tcpip.MakeCancellableTimer(&lock, func() {
+	timer := tcpip.NewCancellableTimer(&lock, func() {
 		ch <- struct{}{}
 	})
 	timer.Reset(shortDuration)
@@ -85,7 +85,7 @@ func TestCancellableTimerResetFromLongDuration(t *testing.T) {
 	ch := make(chan struct{})
 	var lock sync.Mutex
 
-	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
 	timer.Reset(middleDuration)
 
 	lock.Lock()
@@ -116,7 +116,7 @@ func TestCancellableTimerResetFromShortDuration(t *testing.T) {
 	var lock sync.Mutex
 
 	lock.Lock()
-	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
 	timer.Reset(shortDuration)
 	timer.StopLocked()
 	lock.Unlock()
@@ -153,7 +153,7 @@ func TestCancellableTimerImmediatelyStop(t *testing.T) {
 
 	for i := 0; i < 1000; i++ {
 		lock.Lock()
-		timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+		timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
 		timer.Reset(shortDuration)
 		timer.StopLocked()
 		lock.Unlock()
@@ -174,7 +174,7 @@ func TestCancellableTimerStoppedResetWithoutLock(t *testing.T) {
 	var lock sync.Mutex
 
 	lock.Lock()
-	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
 	timer.Reset(shortDuration)
 	timer.StopLocked()
 	lock.Unlock()
@@ -205,7 +205,7 @@ func TestManyCancellableTimerResetAfterBlockedOnLock(t *testing.T) {
 	var lock sync.Mutex
 
 	lock.Lock()
-	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
 	timer.Reset(shortDuration)
 	for i := 0; i < 10; i++ {
 		// Sleep until the timer fires and gets blocked trying to take the lock.
@@ -237,7 +237,7 @@ func TestManyCancellableTimerResetUnderLock(t *testing.T) {
 	var lock sync.Mutex
 
 	lock.Lock()
-	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
 	timer.Reset(shortDuration)
 	for i := 0; i < 10; i++ {
 		timer.StopLocked()
-- 
cgit v1.2.3


From f87964e829f438175edcc0264adc7ce7b3d83842 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 24 Apr 2020 12:51:37 -0700
Subject: kokoro: save all files from test.outputs/

If a test fails by timeout, bazel doesn't generate outputs.zip.

PiperOrigin-RevId: 308307815
---
 scripts/common_build.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/common_build.sh b/scripts/common_build.sh
index 3be0bb21c..4fe1067d2 100755
--- a/scripts/common_build.sh
+++ b/scripts/common_build.sh
@@ -70,8 +70,8 @@ function collect_logs() {
     for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do
       junitparser merge `find $d -name test.xml` $d/test.xml
       cat $d/shard_*_of_*/test.log > $d/test.log
-      if ls -l $d/shard_*_of_*/test.outputs/outputs.zip 2>/dev/null; then
-        zip -r -1 "$d/outputs.zip" $d/shard_*_of_*/test.outputs/outputs.zip
+      if ls -ld $d/shard_*_of_*/test.outputs 2>/dev/null; then
+        zip -r -1 "$d/outputs.zip" $d/shard_*_of_*/test.outputs
       fi
     done
     find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf
-- 
cgit v1.2.3


From f13f26d17da56d585fd9857a81175bbd0be8ce60 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 24 Apr 2020 13:45:31 -0700
Subject: Port SCM Rights to VFS2.

Fixes #1477.

PiperOrigin-RevId: 308317511
---
 pkg/sentry/fs/host/control.go             |   6 +-
 pkg/sentry/fsimpl/host/BUILD              |   3 +
 pkg/sentry/fsimpl/host/control.go         |  96 ++++++++++++++++++++++
 pkg/sentry/socket/control/BUILD           |   6 +-
 pkg/sentry/socket/control/control.go      |  24 +++++-
 pkg/sentry/socket/control/control_vfs2.go | 131 ++++++++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/vfs2/socket.go  |   2 +-
 7 files changed, 260 insertions(+), 8 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/host/control.go
 create mode 100644 pkg/sentry/socket/control/control_vfs2.go

diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go
index 52c0504b6..39299b7e4 100644
--- a/pkg/sentry/fs/host/control.go
+++ b/pkg/sentry/fs/host/control.go
@@ -23,6 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 )
 
+// LINT.IfChange
+
 type scmRights struct {
 	fds []int
 }
@@ -32,8 +34,6 @@ func newSCMRights(fds []int) control.SCMRights {
 }
 
 // Files implements control.SCMRights.Files.
-//
-// TODO(gvisor.dev/issue/2017): Port to VFS2.
 func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFiles, bool) {
 	n := max
 	var trunc bool
@@ -93,3 +93,5 @@ func fdsToFiles(ctx context.Context, fds []int) []*fs.File {
 	}
 	return files
 }
+
+// LINT.ThenChange(../../fsimpl/host/control.go)
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index 44dd9f672..2dcb03a73 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -5,6 +5,7 @@ licenses(["notice"])
 go_library(
     name = "host",
     srcs = [
+        "control.go",
         "host.go",
         "ioctl_unsafe.go",
         "tty.go",
@@ -23,6 +24,8 @@ go_library(
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/control",
+        "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/unimpl",
         "//pkg/sentry/vfs",
         "//pkg/sync",
diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go
new file mode 100644
index 000000000..b9082a20f
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/control.go
@@ -0,0 +1,96 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/control"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+type scmRights struct {
+	fds []int
+}
+
+func newSCMRights(fds []int) control.SCMRightsVFS2 {
+	return &scmRights{fds}
+}
+
+// Files implements control.SCMRights.Files.
+func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFilesVFS2, bool) {
+	n := max
+	var trunc bool
+	if l := len(c.fds); n > l {
+		n = l
+	} else if n < l {
+		trunc = true
+	}
+
+	rf := control.RightsFilesVFS2(fdsToFiles(ctx, c.fds[:n]))
+
+	// Only consume converted FDs (fdsToFiles may convert fewer than n FDs).
+	c.fds = c.fds[len(rf):]
+	return rf, trunc
+}
+
+// Clone implements transport.RightsControlMessage.Clone.
+func (c *scmRights) Clone() transport.RightsControlMessage {
+	// Host rights never need to be cloned.
+	return nil
+}
+
+// Release implements transport.RightsControlMessage.Release.
+func (c *scmRights) Release() {
+	for _, fd := range c.fds {
+		syscall.Close(fd)
+	}
+	c.fds = nil
+}
+
+// If an error is encountered, only files created before the error will be
+// returned. This is what Linux does.
+func fdsToFiles(ctx context.Context, fds []int) []*vfs.FileDescription {
+	files := make([]*vfs.FileDescription, 0, len(fds))
+	for _, fd := range fds {
+		// Get flags. We do it here because they may be modified
+		// by subsequent functions.
+		fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0)
+		if errno != 0 {
+			ctx.Warningf("Error retrieving host FD flags: %v", error(errno))
+			break
+		}
+
+		// Create the file backed by hostFD.
+		file, err := ImportFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, false /* isTTY */)
+		if err != nil {
+			ctx.Warningf("Error creating file from host FD: %v", err)
+			break
+		}
+
+		if err := file.SetStatusFlags(ctx, auth.CredentialsFromContext(ctx), uint32(fileFlags&linux.O_NONBLOCK)); err != nil {
+			ctx.Warningf("Error setting flags on host FD file: %v", err)
+			break
+		}
+
+		files = append(files, file)
+	}
+	return files
+}
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 4d42d29cb..ca16d0381 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -4,7 +4,10 @@ package(licenses = ["notice"])
 
 go_library(
     name = "control",
-    srcs = ["control.go"],
+    srcs = [
+        "control.go",
+        "control_vfs2.go",
+    ],
     imports = [
         "gvisor.dev/gvisor/pkg/sentry/fs",
     ],
@@ -18,6 +21,7 @@ go_library(
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
         "//pkg/syserror",
         "//pkg/tcpip",
         "//pkg/usermem",
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 8834a1e1a..8b439a078 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -41,6 +41,8 @@ type SCMCredentials interface {
 	Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID)
 }
 
+// LINT.IfChange
+
 // SCMRights represents a SCM_RIGHTS socket control message.
 type SCMRights interface {
 	transport.RightsControlMessage
@@ -142,6 +144,8 @@ func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte, flag
 	return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds)
 }
 
+// LINT.ThenChange(./control_vfs2.go)
+
 // scmCredentials represents an SCM_CREDENTIALS socket control message.
 //
 // +stateify savable
@@ -537,11 +541,19 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 	}
 
 	if len(fds) > 0 {
-		rights, err := NewSCMRights(t, fds)
-		if err != nil {
-			return socket.ControlMessages{}, err
+		if kernel.VFS2Enabled {
+			rights, err := NewSCMRightsVFS2(t, fds)
+			if err != nil {
+				return socket.ControlMessages{}, err
+			}
+			cmsgs.Unix.Rights = rights
+		} else {
+			rights, err := NewSCMRights(t, fds)
+			if err != nil {
+				return socket.ControlMessages{}, err
+			}
+			cmsgs.Unix.Rights = rights
 		}
-		cmsgs.Unix.Rights = rights
 	}
 
 	return cmsgs, nil
@@ -566,6 +578,8 @@ func MakeCreds(t *kernel.Task) SCMCredentials {
 	return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID}
 }
 
+// LINT.IfChange
+
 // New creates default control messages if needed.
 func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transport.ControlMessages {
 	return transport.ControlMessages{
@@ -573,3 +587,5 @@ func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transpo
 		Rights:      rights,
 	}
 }
+
+// LINT.ThenChange(./control_vfs2.go)
diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go
new file mode 100644
index 000000000..fd08179be
--- /dev/null
+++ b/pkg/sentry/socket/control/control_vfs2.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package control
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// SCMRightsVFS2 represents a SCM_RIGHTS socket control message.
+type SCMRightsVFS2 interface {
+	transport.RightsControlMessage
+
+	// Files returns up to max RightsFiles.
+	//
+	// Returned files are consumed and ownership is transferred to the caller.
+	// Subsequent calls to Files will return the next files.
+	Files(ctx context.Context, max int) (rf RightsFilesVFS2, truncated bool)
+}
+
+// RightsFiles represents a SCM_RIGHTS socket control message. A reference is
+// maintained for each vfs.FileDescription and is release either when an FD is created or
+// when the Release method is called.
+type RightsFilesVFS2 []*vfs.FileDescription
+
+// NewSCMRightsVFS2 creates a new SCM_RIGHTS socket control message
+// representation using local sentry FDs.
+func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) {
+	files := make(RightsFilesVFS2, 0, len(fds))
+	for _, fd := range fds {
+		file := t.GetFileVFS2(fd)
+		if file == nil {
+			files.Release()
+			return nil, syserror.EBADF
+		}
+		files = append(files, file)
+	}
+	return &files, nil
+}
+
+// Files implements SCMRights.Files.
+func (fs *RightsFilesVFS2) Files(ctx context.Context, max int) (RightsFilesVFS2, bool) {
+	n := max
+	var trunc bool
+	if l := len(*fs); n > l {
+		n = l
+	} else if n < l {
+		trunc = true
+	}
+	rf := (*fs)[:n]
+	*fs = (*fs)[n:]
+	return rf, trunc
+}
+
+// Clone implements transport.RightsControlMessage.Clone.
+func (fs *RightsFilesVFS2) Clone() transport.RightsControlMessage {
+	nfs := append(RightsFilesVFS2(nil), *fs...)
+	for _, nf := range nfs {
+		nf.IncRef()
+	}
+	return &nfs
+}
+
+// Release implements transport.RightsControlMessage.Release.
+func (fs *RightsFilesVFS2) Release() {
+	for _, f := range *fs {
+		f.DecRef()
+	}
+	*fs = nil
+}
+
+// rightsFDsVFS2 gets up to the specified maximum number of FDs.
+func rightsFDsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, max int) ([]int32, bool) {
+	files, trunc := rights.Files(t, max)
+	fds := make([]int32, 0, len(files))
+	for i := 0; i < max && len(files) > 0; i++ {
+		fd, err := t.NewFDFromVFS2(0, files[0], kernel.FDFlags{
+			CloseOnExec: cloexec,
+		})
+		files[0].DecRef()
+		files = files[1:]
+		if err != nil {
+			t.Warningf("Error inserting FD: %v", err)
+			// This is what Linux does.
+			break
+		}
+
+		fds = append(fds, int32(fd))
+	}
+	return fds, trunc
+}
+
+// PackRightsVFS2 packs as many FDs as will fit into the unused capacity of buf.
+func PackRightsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, buf []byte, flags int) ([]byte, int) {
+	maxFDs := (cap(buf) - len(buf) - linux.SizeOfControlMessageHeader) / 4
+	// Linux does not return any FDs if none fit.
+	if maxFDs <= 0 {
+		flags |= linux.MSG_CTRUNC
+		return buf, flags
+	}
+	fds, trunc := rightsFDsVFS2(t, rights, cloexec, maxFDs)
+	if trunc {
+		flags |= linux.MSG_CTRUNC
+	}
+	align := t.Arch().Width()
+	return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds)
+}
+
+// NewVFS2 creates default control messages if needed.
+func NewVFS2(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRightsVFS2) transport.ControlMessages {
+	return transport.ControlMessages{
+		Credentials: makeCreds(t, socketOrEndpoint),
+		Rights:      rights,
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index b1ede32f0..10b668477 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -804,7 +804,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla
 	}
 
 	if cms.Unix.Rights != nil {
-		controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
+		controlData, mflags = control.PackRightsVFS2(t, cms.Unix.Rights.(control.SCMRightsVFS2), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
 	}
 
 	// Copy the address to the caller.
-- 
cgit v1.2.3


From c60613475c92185c9b15468d0de87b321ef2b4d7 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 24 Apr 2020 14:10:28 -0700
Subject: Standardize all Docker images.

This change moves all Docker images to a standard location, and abstracts the
build process so that they can be maintained in an automated fashion. This also
allows the images to be architecture-independent.

All images will now be referred to by the test framework via the canonical
`gvisor.dev/images/<name>`, where `<name>` is a function of the path within the
source tree.

In a subsequent change, continuous integration will be added so that the images
will always be correct and available locally.

In the end, using `bazel` for Docker containers is simply not possible. Given
that we already have the need to use `make` with the base container (for
Docker), we extend this approach to get more flexibility.

This change also adds a self-documenting and powerful Makefile that is intended
to replace the collection of scripts in scripts. Canonical (self-documenting)
targets can be added here for targets that understand which images need to be
loaded and/or built.

PiperOrigin-RevId: 308322438
---
 .travis.yml                                  |  11 +-
 CONTRIBUTING.md                              |  31 +----
 Dockerfile                                   |   9 --
 Makefile                                     | 199 ++++++++++++++++++++++-----
 images/BUILD                                 |  11 ++
 images/Makefile                              |  93 +++++++++++++
 images/README.md                             |  61 ++++++++
 images/basic/alpine/Dockerfile               |   1 +
 images/basic/busybox/Dockerfile              |   1 +
 images/basic/httpd/Dockerfile                |   1 +
 images/basic/mysql/Dockerfile                |   1 +
 images/basic/nginx/Dockerfile                |   1 +
 images/basic/python/Dockerfile               |   2 +
 images/basic/resolv/Dockerfile               |   1 +
 images/basic/ruby/Dockerfile                 |   1 +
 images/basic/tomcat/Dockerfile               |   1 +
 images/basic/ubuntu/Dockerfile               |   1 +
 images/default/Dockerfile                    |  11 ++
 images/iptables/Dockerfile                   |   2 +
 images/packetdrill/Dockerfile                |   8 ++
 images/packetimpact/Dockerfile               |  16 +++
 images/runtimes/go1.12/Dockerfile            |   4 +
 images/runtimes/java11/Dockerfile            |  22 +++
 images/runtimes/nodejs12.4.0/Dockerfile      |  21 +++
 images/runtimes/php7.3.6/Dockerfile          |  19 +++
 images/runtimes/python3.7.3/Dockerfile       |  21 +++
 pkg/test/testutil/testutil.go                |  38 +----
 scripts/build.sh                             |   4 -
 scripts/common.sh                            |  22 ---
 scripts/docker_tests.sh                      |   2 +
 scripts/hostnet_tests.sh                     |   2 +
 scripts/iptables_tests.sh                    |   6 +-
 scripts/kvm_tests.sh                         |   2 +
 scripts/make_tests.sh                        |   5 -
 scripts/overlay_tests.sh                     |   2 +
 scripts/packetdrill_tests.sh                 |   2 +
 scripts/packetimpact_tests.sh                |   2 +
 scripts/root_tests.sh                        |   3 +-
 scripts/swgso_tests.sh                       |   2 +
 test/iptables/runner/Dockerfile              |   4 -
 test/packetdrill/Dockerfile                  |   9 --
 test/packetimpact/tests/Dockerfile           |  17 ---
 test/root/BUILD                              |   1 +
 test/runtimes/images/Dockerfile_go1.12       |  10 --
 test/runtimes/images/Dockerfile_java11       |  30 ----
 test/runtimes/images/Dockerfile_nodejs12.4.0 |  28 ----
 test/runtimes/images/Dockerfile_php7.3.6     |  27 ----
 test/runtimes/images/Dockerfile_python3.7.3  |  30 ----
 tools/bazel.mk                               | 106 ++++++++++++++
 tools/installers/BUILD                       |   8 ++
 tools/installers/images.sh                   |  24 ++++
 tools/make_repository.sh                     |  31 ++++-
 tools/vm/defs.bzl                            |   5 +-
 53 files changed, 665 insertions(+), 307 deletions(-)
 delete mode 100644 Dockerfile
 create mode 100644 images/BUILD
 create mode 100644 images/Makefile
 create mode 100644 images/README.md
 create mode 100644 images/basic/alpine/Dockerfile
 create mode 100644 images/basic/busybox/Dockerfile
 create mode 100644 images/basic/httpd/Dockerfile
 create mode 100644 images/basic/mysql/Dockerfile
 create mode 100644 images/basic/nginx/Dockerfile
 create mode 100644 images/basic/python/Dockerfile
 create mode 100644 images/basic/resolv/Dockerfile
 create mode 100644 images/basic/ruby/Dockerfile
 create mode 100644 images/basic/tomcat/Dockerfile
 create mode 100644 images/basic/ubuntu/Dockerfile
 create mode 100644 images/default/Dockerfile
 create mode 100644 images/iptables/Dockerfile
 create mode 100644 images/packetdrill/Dockerfile
 create mode 100644 images/packetimpact/Dockerfile
 create mode 100644 images/runtimes/go1.12/Dockerfile
 create mode 100644 images/runtimes/java11/Dockerfile
 create mode 100644 images/runtimes/nodejs12.4.0/Dockerfile
 create mode 100644 images/runtimes/php7.3.6/Dockerfile
 create mode 100644 images/runtimes/python3.7.3/Dockerfile
 delete mode 100644 test/iptables/runner/Dockerfile
 delete mode 100644 test/packetdrill/Dockerfile
 delete mode 100644 test/packetimpact/tests/Dockerfile
 delete mode 100644 test/runtimes/images/Dockerfile_go1.12
 delete mode 100644 test/runtimes/images/Dockerfile_java11
 delete mode 100644 test/runtimes/images/Dockerfile_nodejs12.4.0
 delete mode 100644 test/runtimes/images/Dockerfile_php7.3.6
 delete mode 100644 test/runtimes/images/Dockerfile_python3.7.3
 create mode 100644 tools/bazel.mk
 create mode 100755 tools/installers/images.sh

diff --git a/.travis.yml b/.travis.yml
index acbd3d61b..40c8773fa 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,22 +1,19 @@
-language: minimal
-sudo: required
+language: shell
 dist: xenial
 cache:
   directories:
     - /home/travis/.cache/bazel/
+os: linux
 services:
   - docker
-matrix:
+jobs:
   include:
    - os: linux
      arch: amd64
-     env: RUNSC_PATH=./bazel-bin/runsc/linux_amd64_pure_stripped/runsc
    - os: linux
      arch: arm64
-     env: RUNSC_PATH=./bazel-bin/runsc/linux_arm64_pure_stripped/runsc
 script:
-   - uname -a
-   - make DOCKER_RUN_OPTIONS="" BAZEL_OPTIONS="build runsc:runsc" bazel && $RUNSC_PATH --alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do ls
+   - uname -a && make smoke-test
 branches:
   except:
   # Skip copybara branches.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ad8e710da..423cf7a34 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -108,32 +108,15 @@ ignored.
 
 ### Build and test with Docker
 
-`scripts/dev.sh` is a convenient script that builds and installs `runsc` as a
-new Docker runtime for you. The scripts tries to extract the runtime name from
-your local environment and will print it at the end. You can also customize it.
-The script creates one regular runtime and another with debug flags enabled.
-Here are a few examples:
+Running `make dev` is a convenient way to build and install `runsc` as a Docker
+runtime. The output of this command will show the runtimes installed.
+
+You may use `make refresh` to refresh the binary after any changes. For example:
 
 ```bash
-# Default case (inside branch my-branch)
-$ scripts/dev.sh
-...
-Runtimes my-branch and my-branch-d (debug enabled) setup.
-Use --runtime=my-branch with your Docker command.
-  docker run --rm --runtime=my-branch --rm hello-world
-
-If you rebuild, use scripts/dev.sh --refresh.
-Logs are in: /tmp/my-branch/logs
-
-# --refresh just updates the runtime binary and doesn't restart docker.
-$ git/my_branch> scripts/dev.sh --refresh
-
-# Using a custom runtime name
-$ git/my_branch> scripts/dev.sh my-runtime
-...
-Runtimes my-runtime and my-runtime-d (debug enabled) setup.
-Use --runtime=my-runtime with your Docker command.
-  docker run --rm --runtime=my-runtime --rm hello-world
+make dev
+docker run --rm --runtime=my-branch --rm hello-world
+make refresh
 ```
 
 ### The small print
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index 0fac71710..000000000
--- a/Dockerfile
+++ /dev/null
@@ -1,9 +0,0 @@
-FROM fedora:31
-
-RUN  dnf install -y dnf-plugins-core && dnf copr enable -y vbatts/bazel
-
-RUN dnf install -y bazel2 git gcc make golang gcc-c++ glibc-devel python3 which python3-pip python3-devel libffi-devel openssl-devel pkg-config glibc-static libstdc++-static patch
-
-RUN pip install pycparser
-
-WORKDIR /gvisor
diff --git a/Makefile b/Makefile
index d9531fbd5..c56c6ed48 100644
--- a/Makefile
+++ b/Makefile
@@ -1,50 +1,173 @@
-UID := $(shell id -u ${USER})
-GID := $(shell id -g ${USER})
-GVISOR_BAZEL_CACHE := $(shell readlink -f ~/.cache/bazel/)
+#!/usr/bin/make -f
 
-# The  --privileged is required to run tests.
-DOCKER_RUN_OPTIONS ?= --privileged
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
-all: runsc
+# Described below.
+OPTIONS :=
+TARGETS := //runsc
+ARGS    :=
 
-docker-build:
-	docker build -t gvisor-bazel .
+default: runsc
+.PHONY: default
 
-bazel-shutdown:
-	docker exec -i gvisor-bazel bazel shutdown && \
-	docker kill gvisor-bazel
+## usage: make <target>
+##         or
+##        make <build|test|copy|run|sudo> OPTIONS="..." TARGETS="..." ARGS="..."
+##
+## Basic targets.
+##
+##   This Makefile wraps basic build and test targets for ease-of-use. Bazel
+##   is run inside a canonical Docker container in order to simplify up-front
+##   requirements.
+##
+##   There are common arguments that may be passed to targets. These are:
+##     OPTIONS - Build or test options.
+##     TARGETS - The bazel targets.
+##     ARGS    - Arguments for run or sudo.
+##
+##   Additionally, the copy target expects a DESTINATION to be provided.
+##
+##   For example, to build runsc using this Makefile, you can run:
+##     make build OPTIONS="" TARGETS="//runsc"'
+##
+help: ## Shows all targets and help from the Makefile (this message).
+	@grep --no-filename -E '^([a-z.A-Z_-]+:.*?|)##' $(MAKEFILE_LIST) | \
+		awk 'BEGIN {FS = "(:.*?|)## ?"}; { \
+			if (length($$1) > 0) { \
+				printf "  \033[36m%-20s\033[0m %s\n", $$1, $$2; \
+			} else { \
+				printf "%s\n", $$2; \
+			} \
+		}'
+build: ## Builds the given $(TARGETS) with the given $(OPTIONS). E.g. make build TARGETS=runsc
+test:  ## Tests the given $(TARGETS) with the given $(OPTIONS). E.g. make test TARGETS=pkg/buffer:buffer_test
+copy:  ## Copies the given $(TARGETS) to the given $(DESTINATION). E.g. make copy TARGETS=runsc DESTINATION=/tmp
+run:   ## Runs the given $(TARGETS), built with $(OPTIONS), using $(ARGS). E.g. make run TARGETS=runsc ARGS=-version
+sudo:  ## Runs the given $(TARGETS) as per run, but using "sudo -E". E.g. make sudo TARGETS=test/root:root_test ARGS=-test.v
+.PHONY: help build test copy run sudo
 
-bazel-server-start: docker-build
-	mkdir -p "$(GVISOR_BAZEL_CACHE)" && \
-	docker run -d --rm --name gvisor-bazel \
-		--user 0:0 \
-		-v "$(GVISOR_BAZEL_CACHE):$(HOME)/.cache/bazel/" \
-		-v "$(CURDIR):$(CURDIR)" \
-		--workdir "$(CURDIR)" \
-		--tmpfs /tmp:rw,exec \
-		$(DOCKER_RUN_OPTIONS) \
-		gvisor-bazel \
-		sh -c "while :; do sleep 100; done" && \
-	docker exec --user 0:0 -i gvisor-bazel sh -c "groupadd --gid $(GID) --non-unique gvisor && useradd --uid $(UID) --non-unique --gid $(GID) -d $(HOME) gvisor"
+# Load all bazel wrappers.
+#
+# This file should define the basic "build", "test", "run" and "sudo" rules, in
+# addition to the $(BRANCH_NAME) variable.
+ifneq (,$(wildcard tools/google.mk))
+include tools/google.mk
+else
+include tools/bazel.mk
+endif
 
-bazel-server:
-	docker exec gvisor-bazel true || \
-	$(MAKE) bazel-server-start
+##
+## Docker image targets.
+##
+##   Images used by the tests must also be built and available locally.
+##   The canonical test targets defined below will automatically load
+##   relevant images. These can be loaded or built manually via these
+##   targets.
+##
+##   (*) Note that you may provide an ARCH parameter in order to build
+##   and load images from an alternate archiecture (using qemu). When
+##   bazel is run as a server, this has the effect of running an full
+##   cross-architecture chain, and can produce cross-compiled binaries.
+##
+define images
+$(1)-%: ## Image tool: $(1) a given image (also may use 'all-images').
+	@$(MAKE) -C images $$@
+endef
+rebuild-...: ## Rebuild the given image. Also may use 'rebuild-all-images'.
+$(eval $(call images,rebuild))
+push-...: ## Push the given image. Also may use 'push-all-images'.
+$(eval $(call images,pull))
+pull-...: ## Pull the given image. Also may use 'pull-all-images'.
+$(eval $(call images,push))
+load-...: ## Load (pull or rebuild) the given image. Also may use 'load-all-images'.
+$(eval $(call images,load))
+list-images: ## List all available images.
+	@$(MAKE) -C images $$@
 
-BAZEL_OPTIONS := build runsc
-bazel: bazel-server
-	docker exec -u $(UID):$(GID) -i gvisor-bazel bazel $(BAZEL_OPTIONS)
+##
+## Canonical build and test targets.
+##
+##   These targets are used by continuous integration and provide
+##   convenient entrypoints for testing changes. If you're adding a
+##   new subsystem or workflow, consider adding a new target here.
+##
+runsc: ## Builds the runsc binary.
+	@$(MAKE) build TARGETS="//runsc"
+.PHONY: runsc
 
-bazel-alias:
-	@echo "alias bazel='docker exec -u $(UID):$(GID) -i gvisor-bazel bazel'"
+smoke-test: ## Runs a simple smoke test after build runsc.
+	@$(MAKE) run DOCKER_RUN_OPTIONS="" ARGS="--alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do true"
+.PHONY: smoke-tests
 
-runsc:
-	$(MAKE) BAZEL_OPTIONS="build runsc" bazel
+unit-tests: ## Runs all unit tests in pkg runsc and tools.
+	@$(MAKE) test OPTIONS="pkg/... runsc/... tools/..."
+.PHONY: unit-tests
 
-tests:
-	$(MAKE) BAZEL_OPTIONS="test --test_tag_filters runsc_ptrace //test/syscalls/..." bazel
+tests: ## Runs all local ptrace system call tests.
+	@$(MAKE) test OPTIONS="--test_tag_filter runsc_ptrace test/syscalls/..."
+.PHONY: tests
 
-unit-tests:
-	$(MAKE) BAZEL_OPTIONS="test //pkg/... //runsc/... //tools/..." bazel
+##
+## Development helpers and tooling.
+##
+##   These targets faciliate local development by automatically
+##   installing and configuring a runtime. Several variables may
+##   be used here to tweak the installation:
+##     RUNTIME         - The name of the installed runtime (default: branch).
+##     RUNTIME_DIR     - Where the runtime will be installed (default: temporary directory with the $RUNTIME).
+##     RUNTIME_BIN     - The runtime binary (default: $RUNTIME_DIR/runsc).
+##     RUNTIME_LOG_DIR - The logs directory (default: $RUNTIME_DIR/logs).
+##     RUNTIME_LOGS    - The log pattern (default: $RUNTIME_LOG_DIR/runsc.log.%TEST%.%TIMESTAMP%.%COMMAND%).
+##
+ifeq (,$(BRANCH_NAME))
+RUNTIME     := runsc
+RUNTIME_DIR := $(shell dirname $(shell mktemp -u))/runsc
+else
+RUNTIME     := $(BRANCH_NAME)
+RUNTIME_DIR := $(shell dirname $(shell mktemp -u))/$(BRANCH_NAME)
+endif
+RUNTIME_BIN     := $(RUNTIME_DIR)/runsc
+RUNTIME_LOG_DIR := $(RUNTIME_DIR)/logs
+RUNTIME_LOGS    := $(RUNTIME_LOG_DIR)/runsc.log.%TEST%.%TIMESTAMP%.%COMMAND%
 
-.PHONY: docker-build bazel-shutdown bazel-server-start bazel-server bazel runsc tests
+dev: ## Installs a set of local runtimes. Requires sudo.
+	@$(MAKE) refresh ARGS="--net-raw"
+	@$(MAKE) configure RUNTIME="$(RUNTIME)" ARGS="--net-raw"
+	@$(MAKE) configure RUNTIME="$(RUNTIME)-d" ARGS="--net-raw --debug --strace --log-packets"
+	@$(MAKE) configure RUNTIME="$(RUNTIME)-p" ARGS="--net-raw --profile"
+	@sudo systemctl restart docker
+.PHONY: dev
+
+refresh: ## Refreshes the runtime binary (for development only). Must have called 'dev' or 'test-install' first.
+	@mkdir -p "$(RUNTIME_DIR)"
+	@$(MAKE) copy TARGETS=runsc DESTINATION="$(RUNTIME_BIN)" && chmod 0755 "$(RUNTIME_BIN)"
+.PHONY: install
+
+test-install: ## Installs the runtime for testing. Requires sudo.
+	@$(MAKE) refresh ARGS="--net-raw --TESTONLY-test-name-env=RUNSC_TEST_NAME --debug --strace --log-packets $(ARGS)"
+	@$(MAKE) configure
+	@sudo systemctl restart docker
+.PHONY: install-test
+
+configure: ## Configures a single runtime. Requires sudo. Typically called from dev or test-install.
+	@sudo sudo "$(RUNTIME_BIN)" install --experimental=true --runtime="$(RUNTIME)" -- --debug-log "$(RUNTIME_LOGS)" $(ARGS)
+	@echo "Installed runtime \"$(RUNTIME)\" @ $(RUNTIME_BIN)"
+	@echo "Logs are in: $(RUNTIME_LOG_DIR)"
+	@sudo rm -rf "$(RUNTIME_LOG_DIR)" && mkdir -p "$(RUNTIME_LOG_DIR)"
+.PHONY: configure
+
+test-runtime: ## A convenient wrapper around test that provides the runtime argument. Target must still be provided.
+	@$(MAKE) test OPTIONS="$(OPTIONS) --test_arg=--runtime=$(RUNTIME)"
+.PHONY: runtime-test
diff --git a/images/BUILD b/images/BUILD
new file mode 100644
index 000000000..a50f388e9
--- /dev/null
+++ b/images/BUILD
@@ -0,0 +1,11 @@
+package(licenses = ["notice"])
+
+# The images filegroup is definitely not a hermetic target, and requires Make
+# to do anything meaningful with. However, this will be slurped up and used by
+# the tools/installer/images.sh installer, which will ensure that all required
+# images are available locally when running vm_tests.
+filegroup(
+    name = "images",
+    srcs = glob(["**"]),
+    visibility = ["//tools/installers:__pkg__"],
+)
diff --git a/images/Makefile b/images/Makefile
new file mode 100644
index 000000000..1485607bd
--- /dev/null
+++ b/images/Makefile
@@ -0,0 +1,93 @@
+#!/usr/bin/make -f
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ARCH is the architecture used for the build. This may be overriden at the
+# command line in order to perform a cross-build (in a limited capacity).
+ARCH := $(shell uname -m)
+
+# Note that the image prefixes used here must match the image mangling in
+# runsc/testutil.MangleImage. Names are mangled in this way to ensure that all
+# tests are using locally-defined images (that are consistent and idempotent).
+REMOTE_IMAGE_PREFIX ?= gcr.io/gvisor-presubmit
+LOCAL_IMAGE_PREFIX ?= gvisor.dev/images
+ALL_IMAGES := $(subst /,_,$(subst ./,,$(shell find . -name Dockerfile -exec dirname {} \;)))
+ifneq ($(ARCH),$(shell uname -m))
+DOCKER_PLATFORM_ARGS := --platform=$(ARCH)
+else
+DOCKER_PLATFORM_ARGS :=
+endif
+
+list-all-images:
+	@for image in $(ALL_IMAGES); do echo $${image}; done
+.PHONY: list-build-images
+
+%-all-images:
+	@$(MAKE) $(patsubst %,$*-%,$(ALL_IMAGES))
+
+# tag is a function that returns the tag name, given an image.
+#
+# The tag constructed is used to memoize the image generated (see README.md).
+# This scheme is used to enable aggressive caching in a central repository, but
+# ensuring that images will always be sourced using the local files if there
+# are changes.
+path = $(subst _,/,$(1))
+tag = $(shell find $(call path,$(1)) -type f -print | sort | xargs -n 1 sha256sum | sha256sum - | cut -c 1-16)
+remote_image = $(REMOTE_IMAGE_PREFIX)/$(subst _,/,$(1))_$(ARCH):$(call tag,$(1))
+local_image = $(LOCAL_IMAGE_PREFIX)/$(subst _,/,$(1))
+
+# rebuild builds the image locally. Only the "remote" tag will be applied. Note
+# we need to explicitly repull the base layer in order to ensure that the
+# architecture is correct. Note that we use the term "rebuild" here to avoid
+# conflicting with the bazel "build" terminology, which is used elsewhere.
+rebuild-%: register-cross
+	FROM=$(shell grep FROM $(call path,$*)/Dockerfile | cut -d' ' -f2-) && \
+		docker pull $(DOCKER_PLATFORM_ARGS) $$FROM
+	T=$$(mktemp -d) && cp -a $(call path,$*)/* $$T && \
+		docker build $(DOCKER_PLATFORM_ARGS) -t $(call remote_image,$*) $$T && \
+		rm -rf $$T
+
+# pull will check the "remote" image and pull if necessary. If the remote image
+# must be pulled, then it will tag with the latest local target. Note that pull
+# may fail if the remote image is not available.
+pull-%:
+	docker pull $(DOCKER_PLATFORM_ARGS) $(call remote_image,$*)
+
+# load will either pull the "remote" or build it locally. This is the preferred
+# entrypoint, as it should never file. The local tag should always be set after
+# this returns (either by the pull or the build).
+load-%:
+	docker inspect $(call remote_image,$*) >/dev/null 2>&1 || $(MAKE) pull-$* || $(MAKE) rebuild-$*
+	docker tag $(call remote_image,$*) $(call local_image,$*)
+
+# push pushes the remote image, after either pulling (to validate that the tag
+# already exists) or building manually.
+push-%: load-%
+	docker push $(call remote_image,$*)
+
+# register-cross registers the necessary qemu binaries for cross-compilation.
+# This may be used by any target that may execute containers that are not the
+# native format.
+register-cross:
+ifneq ($(ARCH),$(shell uname -m))
+ifeq (,$(wildcard /proc/sys/fs/binfmt_misc/qemu-*))
+	docker run --rm --privileged multiarch/qemu-user-static --reset --persistent yes
+else
+	@true # Already registered.
+endif
+else
+	@true # No cross required.
+endif
+.PHONY: register-cross
diff --git a/images/README.md b/images/README.md
new file mode 100644
index 000000000..d2efb5db4
--- /dev/null
+++ b/images/README.md
@@ -0,0 +1,61 @@
+# Container Images
+
+This directory contains all images used by tests.
+
+Note that all these images must be pushed to the testing project hosted on
+[Google Container Registry][gcr]. This will happen automatically as part of
+continuous integration. This will speed up loading as images will not need to be
+built from scratch for each test run.
+
+Image tooling is accessible via `make`, specifically via `tools/images.mk`.
+
+## Why make?
+
+Make is used because it can bootstrap the `default` image, which contains
+`bazel` and all other parts of the toolchain.
+
+## Listing images
+
+To list all images, use `make list-all-images` from the top-level directory.
+
+## Loading and referencing images
+
+To build a specific image, use `make load-<image>` from the top-level directory.
+This will ensure that an image `gvisor.dev/images/<image>:latest` is available.
+
+Images should always be referred to via the `gvisor.dev/images` canonical path.
+This tag exists only locally, but serves to decouple tests from the underlying
+image infrastructure.
+
+The continuous integration system can either take fine-grained dependencies on
+single images via individual `load` targets, or pull all images via a single
+`load-all-images` invocation.
+
+## Adding new images
+
+To add a new image, create a new directory under `images` containing a
+Dockerfile and any other files that the image requires. You may choose to add to
+an existing subdirectory if applicable, or create a new one.
+
+All images will be tagged and memoized using a hash of the directory contents.
+As a result, every image should be made completely reproducible if possible.
+This means using fixed tags and fixed versions whenever feasible.
+
+Notes that images should also be made architecture-independent if possible. The
+build scripts will handling loading the appropriate architecture onto the
+machine and tagging it with the single canonical tag.
+
+Add a `load-<image>` dependency in the Makefile if the image is required for a
+particular set of tests. This target will pull the tag from the image repository
+if available.
+
+## Building and pushing images
+
+All images can be built manually by running `build-<image>` and pushed using
+`push-<image>`. Note that you can also use `build-all-images` and
+`push-all-images`. Note that pushing will require appropriate permissions in the
+project.
+
+The continuous integration system can either take fine-grained dependencies on
+individual `push` targets, or ensure all images are up-to-date with a single
+`push-all-images` invocation.
diff --git a/images/basic/alpine/Dockerfile b/images/basic/alpine/Dockerfile
new file mode 100644
index 000000000..12b26040a
--- /dev/null
+++ b/images/basic/alpine/Dockerfile
@@ -0,0 +1 @@
+FROM alpine:3.11.5
diff --git a/images/basic/busybox/Dockerfile b/images/basic/busybox/Dockerfile
new file mode 100644
index 000000000..79b3f683a
--- /dev/null
+++ b/images/basic/busybox/Dockerfile
@@ -0,0 +1 @@
+FROM busybox:1.31.1
diff --git a/images/basic/httpd/Dockerfile b/images/basic/httpd/Dockerfile
new file mode 100644
index 000000000..83bc0ed88
--- /dev/null
+++ b/images/basic/httpd/Dockerfile
@@ -0,0 +1 @@
+FROM httpd:2.4.43
diff --git a/images/basic/mysql/Dockerfile b/images/basic/mysql/Dockerfile
new file mode 100644
index 000000000..95da9c48d
--- /dev/null
+++ b/images/basic/mysql/Dockerfile
@@ -0,0 +1 @@
+FROM mysql:8.0.19
diff --git a/images/basic/nginx/Dockerfile b/images/basic/nginx/Dockerfile
new file mode 100644
index 000000000..af2e62526
--- /dev/null
+++ b/images/basic/nginx/Dockerfile
@@ -0,0 +1 @@
+FROM nginx:1.17.9
diff --git a/images/basic/python/Dockerfile b/images/basic/python/Dockerfile
new file mode 100644
index 000000000..acf07cca9
--- /dev/null
+++ b/images/basic/python/Dockerfile
@@ -0,0 +1,2 @@
+FROM python:3
+ENTRYPOINT ["python", "-m", "http.server", "8080"]
diff --git a/images/basic/resolv/Dockerfile b/images/basic/resolv/Dockerfile
new file mode 100644
index 000000000..13665bdaf
--- /dev/null
+++ b/images/basic/resolv/Dockerfile
@@ -0,0 +1 @@
+FROM k8s.gcr.io/busybox:latest
diff --git a/images/basic/ruby/Dockerfile b/images/basic/ruby/Dockerfile
new file mode 100644
index 000000000..d290418fb
--- /dev/null
+++ b/images/basic/ruby/Dockerfile
@@ -0,0 +1 @@
+FROM ruby:2.7.1
diff --git a/images/basic/tomcat/Dockerfile b/images/basic/tomcat/Dockerfile
new file mode 100644
index 000000000..c7db39a36
--- /dev/null
+++ b/images/basic/tomcat/Dockerfile
@@ -0,0 +1 @@
+FROM tomcat:8.0
diff --git a/images/basic/ubuntu/Dockerfile b/images/basic/ubuntu/Dockerfile
new file mode 100644
index 000000000..331b71343
--- /dev/null
+++ b/images/basic/ubuntu/Dockerfile
@@ -0,0 +1 @@
+FROM ubuntu:trusty
diff --git a/images/default/Dockerfile b/images/default/Dockerfile
new file mode 100644
index 000000000..2d0bb5ba5
--- /dev/null
+++ b/images/default/Dockerfile
@@ -0,0 +1,11 @@
+FROM fedora:31
+RUN dnf install -y dnf-plugins-core && dnf copr enable -y vbatts/bazel
+RUN dnf install -y git gcc make golang gcc-c++ glibc-devel python3 which python3-pip python3-devel libffi-devel openssl-devel pkg-config glibc-static libstdc++-static patch
+RUN pip install pycparser
+RUN dnf install -y bazel3
+RUN curl https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-289.0.0-linux-x86_64.tar.gz | \
+    tar zxvf - google-cloud-sdk && \
+    google-cloud-sdk/install.sh && \
+    ln -s /google-cloud-sdk/bin/gcloud /usr/bin/gcloud
+WORKDIR /workspace
+ENTRYPOINT ["/usr/bin/bazel"]
diff --git a/images/iptables/Dockerfile b/images/iptables/Dockerfile
new file mode 100644
index 000000000..efd91cb80
--- /dev/null
+++ b/images/iptables/Dockerfile
@@ -0,0 +1,2 @@
+FROM ubuntu
+RUN apt update && apt install -y iptables
diff --git a/images/packetdrill/Dockerfile b/images/packetdrill/Dockerfile
new file mode 100644
index 000000000..7a006c85f
--- /dev/null
+++ b/images/packetdrill/Dockerfile
@@ -0,0 +1,8 @@
+FROM ubuntu:bionic
+RUN apt-get update && apt-get install -y net-tools git iptables iputils-ping \
+        netcat tcpdump jq tar bison flex make
+RUN hash -r
+RUN git clone --branch packetdrill-v2.0 \
+        https://github.com/google/packetdrill.git
+RUN cd packetdrill/gtests/net/packetdrill && ./configure && make
+CMD /bin/bash
diff --git a/images/packetimpact/Dockerfile b/images/packetimpact/Dockerfile
new file mode 100644
index 000000000..87aa99ef2
--- /dev/null
+++ b/images/packetimpact/Dockerfile
@@ -0,0 +1,16 @@
+FROM ubuntu:bionic
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        # iptables to disable OS native packet processing.
+        iptables \
+        # nc to check that the posix_server is running.
+        netcat \
+        # tcpdump to log brief packet sniffing.
+        tcpdump \
+        # ip link show to display MAC addresses.
+        iproute2 \
+        # tshark to log verbose packet sniffing.
+        tshark \
+        # killall for cleanup.
+        psmisc
+RUN hash -r
+CMD /bin/bash
diff --git a/images/runtimes/go1.12/Dockerfile b/images/runtimes/go1.12/Dockerfile
new file mode 100644
index 000000000..cb2944062
--- /dev/null
+++ b/images/runtimes/go1.12/Dockerfile
@@ -0,0 +1,4 @@
+# Go is easy, since we already have everything we need to compile the proctor
+# binary and run the tests in the golang Docker image.
+FROM golang:1.12
+RUN ["go", "tool", "dist", "test", "-compile-only"]
diff --git a/images/runtimes/java11/Dockerfile b/images/runtimes/java11/Dockerfile
new file mode 100644
index 000000000..03bc8aaf1
--- /dev/null
+++ b/images/runtimes/java11/Dockerfile
@@ -0,0 +1,22 @@
+FROM ubuntu:bionic
+RUN apt-get update && apt-get install -y \
+  autoconf \
+  build-essential \
+  curl \
+  make \
+  openjdk-11-jdk \
+  unzip \
+  zip
+
+# Download the JDK test library.
+WORKDIR /root
+RUN set -ex \
+ && curl -fsSL --retry 10 -o /tmp/jdktests.tar.gz http://hg.openjdk.java.net/jdk/jdk11/archive/76072a077ee1.tar.gz/test \
+ && tar -xzf /tmp/jdktests.tar.gz \
+ && mv jdk11-76072a077ee1/test test \
+ && rm -f /tmp/jdktests.tar.gz
+
+# Install jtreg and add to PATH.
+RUN curl -o jtreg.tar.gz https://ci.adoptopenjdk.net/view/Dependencies/job/jtreg/lastSuccessfulBuild/artifact/jtreg-4.2.0-tip.tar.gz
+RUN tar -xzf jtreg.tar.gz
+ENV PATH="/root/jtreg/bin:$PATH"
diff --git a/images/runtimes/nodejs12.4.0/Dockerfile b/images/runtimes/nodejs12.4.0/Dockerfile
new file mode 100644
index 000000000..d17924b62
--- /dev/null
+++ b/images/runtimes/nodejs12.4.0/Dockerfile
@@ -0,0 +1,21 @@
+FROM ubuntu:bionic
+RUN apt-get update && apt-get install -y \
+  curl \
+  dumb-init \
+  g++ \
+  make \
+  python
+
+WORKDIR /root
+ARG VERSION=v12.4.0
+RUN curl -o node-${VERSION}.tar.gz https://nodejs.org/dist/${VERSION}/node-${VERSION}.tar.gz
+RUN tar -zxf node-${VERSION}.tar.gz
+
+WORKDIR /root/node-${VERSION}
+RUN ./configure
+RUN make
+RUN make test-build
+
+# Including dumb-init emulates the Linux "init" process, preventing the failure
+# of tests involving worker processes.
+ENTRYPOINT ["/usr/bin/dumb-init"]
diff --git a/images/runtimes/php7.3.6/Dockerfile b/images/runtimes/php7.3.6/Dockerfile
new file mode 100644
index 000000000..e5f67f79c
--- /dev/null
+++ b/images/runtimes/php7.3.6/Dockerfile
@@ -0,0 +1,19 @@
+FROM ubuntu:bionic
+RUN apt-get update && apt-get install -y \
+  autoconf \
+  automake \
+  bison \
+  build-essential \
+  curl \
+  libtool \
+  libxml2-dev \
+  re2c
+
+WORKDIR /root
+ARG VERSION=7.3.6
+RUN curl -o php-${VERSION}.tar.gz https://www.php.net/distributions/php-${VERSION}.tar.gz
+RUN tar -zxf php-${VERSION}.tar.gz
+
+WORKDIR /root/php-${VERSION}
+RUN ./configure
+RUN make
diff --git a/images/runtimes/python3.7.3/Dockerfile b/images/runtimes/python3.7.3/Dockerfile
new file mode 100644
index 000000000..4d1e1e221
--- /dev/null
+++ b/images/runtimes/python3.7.3/Dockerfile
@@ -0,0 +1,21 @@
+FROM ubuntu:bionic
+RUN apt-get update && apt-get install -y \
+  curl \
+  gcc \
+  libbz2-dev \
+  libffi-dev \
+  liblzma-dev \
+  libreadline-dev \
+  libssl-dev \
+  make \
+  zlib1g-dev
+
+# Use flags -LJO to follow the html redirect and download .tar.gz.
+WORKDIR /root
+ARG VERSION=3.7.3
+RUN curl -LJO https://github.com/python/cpython/archive/v${VERSION}.tar.gz
+RUN tar -zxf cpython-${VERSION}.tar.gz
+
+WORKDIR /root/cpython-${VERSION}
+RUN ./configure --with-pydebug
+RUN make -s -j2
diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go
index d75ceca3d..ee8c78014 100644
--- a/pkg/test/testutil/testutil.go
+++ b/pkg/test/testutil/testutil.go
@@ -57,42 +57,10 @@ func IsCheckpointSupported() bool {
 	return *checkpoint
 }
 
-// nameToActual is used by ImageByName (for now).
-var nameToActual = map[string]string{
-	"basic/alpine":          "alpine",
-	"basic/busybox":         "busybox:1.31.1",
-	"basic/httpd":           "httpd",
-	"basic/mysql":           "mysql",
-	"basic/nginx":           "nginx",
-	"basic/python":          "gcr.io/gvisor-presubmit/python-hello",
-	"basic/resolv":          "k8s.gcr.io/busybox",
-	"basic/ruby":            "ruby",
-	"basic/tomcat":          "tomcat:8.0",
-	"basic/ubuntu":          "ubuntu:trusty",
-	"iptables":              "gcr.io/gvisor-presubmit/iptables-test",
-	"packetdrill":           "gcr.io/gvisor-presubmit/packetdrill",
-	"packetimpact":          "gcr.io/gvisor-presubmit/packetimpact",
-	"runtimes/go1.12":       "gcr.io/gvisor-presubmit/go1.12",
-	"runtimes/java11":       "gcr.io/gvisor-presubmit/java11",
-	"runtimes/nodejs12.4.0": "gcr.io/gvisor-presubmit/nodejs12.4.0",
-	"runtimes/php7.3.6":     "gcr.io/gvisor-presubmit/php7.3.6",
-	"runtimes/python3.7.3":  "gcr.io/gvisor-presubmit/python3.7.3",
-}
-
-// ImageByName mangles the image name used locally.
-//
-// For now, this is implemented as a static lookup table. In a subsequent
-// change, this will be used to reference a locally-generated image.
+// ImageByName mangles the image name used locally. This depends on the image
+// build infrastructure in images/ and tools/vm.
 func ImageByName(name string) string {
-	actual, ok := nameToActual[name]
-	if !ok {
-		panic(fmt.Sprintf("unknown image: %v", name))
-	}
-	// A terrible hack, for now execute a manual pull.
-	if out, err := exec.Command("docker", "pull", actual).CombinedOutput(); err != nil {
-		panic(fmt.Sprintf("error pulling image %q -> %q: %v, out: %s", name, actual, err, string(out)))
-	}
-	return actual
+	return fmt.Sprintf("gvisor.dev/images/%s", name)
 }
 
 // ConfigureExePath configures the executable for runsc in the test environment.
diff --git a/scripts/build.sh b/scripts/build.sh
index 7c9c99800..e821e8624 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -16,9 +16,6 @@
 
 source $(dirname $0)/common.sh
 
-# Install required packages for make_repository.sh et al.
-apt_install dpkg-sig coreutils apt-utils xz-utils
-
 # Build runsc.
 runsc=$(build -c opt //runsc)
 
@@ -45,7 +42,6 @@ if [[ -v KOKORO_REPO_KEY ]]; then
   repo=$(tools/make_repository.sh \
           "${KOKORO_KEYSTORE_DIR}/${KOKORO_REPO_KEY}" \
           gvisor-bot@google.com \
-          main \
           "${KOKORO_ARTIFACTS_DIR}" \
           ${pkgs})
 fi
diff --git a/scripts/common.sh b/scripts/common.sh
index bc6ba71e8..3ca699e4a 100755
--- a/scripts/common.sh
+++ b/scripts/common.sh
@@ -84,25 +84,3 @@ function install_runsc() {
   # Restart docker to pick up the new runtime configuration.
   sudo systemctl restart docker
 }
-
-# Installs the given packages. Note that the package names should be verified to
-# be correct, otherwise this may result in a loop that spins until time out.
-function apt_install() {
-  while true; do
-    sudo apt-get update &&
-      sudo apt-get install -y "$@" &&
-      true
-    result="${?}"
-    case $result in
-      0)
-        break
-        ;;
-      100)
-        # 100 is the error code that apt-get returns.
-        ;;
-      *)
-        exit $result
-        ;;
-    esac
-  done
-}
diff --git a/scripts/docker_tests.sh b/scripts/docker_tests.sh
index 72ba05260..931ce1aa4 100755
--- a/scripts/docker_tests.sh
+++ b/scripts/docker_tests.sh
@@ -16,5 +16,7 @@
 
 source $(dirname $0)/common.sh
 
+make load-all-images
+
 install_runsc_for_test docker
 test_runsc //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/hostnet_tests.sh b/scripts/hostnet_tests.sh
index 41298293d..992db50dd 100755
--- a/scripts/hostnet_tests.sh
+++ b/scripts/hostnet_tests.sh
@@ -16,6 +16,8 @@
 
 source $(dirname $0)/common.sh
 
+make load-all-images
+
 # Install the runtime and perform basic tests.
 install_runsc_for_test hostnet --network=host
 test_runsc --test_arg=-checkpoint=false //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/iptables_tests.sh b/scripts/iptables_tests.sh
index c8da1f32d..2a8c24907 100755
--- a/scripts/iptables_tests.sh
+++ b/scripts/iptables_tests.sh
@@ -16,6 +16,8 @@
 
 source $(dirname $0)/common.sh
 
+make load-iptables
+
 install_runsc_for_test iptables --net-raw
-test //test/iptables:iptables_test --test_arg=--runtime=runc
-test //test/iptables:iptables_test --test_arg=--runtime=${RUNTIME}
+test //test/iptables:iptables_test "--test_arg=--runtime=runc"
+test //test/iptables:iptables_test "--test_arg=--runtime=${RUNTIME}"
diff --git a/scripts/kvm_tests.sh b/scripts/kvm_tests.sh
index 5662401df..619571c74 100755
--- a/scripts/kvm_tests.sh
+++ b/scripts/kvm_tests.sh
@@ -16,6 +16,8 @@
 
 source $(dirname $0)/common.sh
 
+make load-all-images
+
 # Ensure that KVM is loaded, and we can use it.
 (lsmod | grep -E '^(kvm_intel|kvm_amd)') || sudo modprobe kvm
 sudo chmod a+rw /dev/kvm
diff --git a/scripts/make_tests.sh b/scripts/make_tests.sh
index 79426756d..dbf1bba77 100755
--- a/scripts/make_tests.sh
+++ b/scripts/make_tests.sh
@@ -16,10 +16,5 @@
 
 source $(dirname $0)/common.sh
 
-top_level=$(git rev-parse --show-toplevel 2>/dev/null)
-[[ $? -eq 0 ]] && cd "${top_level}" || exit 1
-
-make
 make runsc
-make BAZEL_OPTIONS="build //..." bazel
 make bazel-shutdown
diff --git a/scripts/overlay_tests.sh b/scripts/overlay_tests.sh
index 2a1f12c0b..448864953 100755
--- a/scripts/overlay_tests.sh
+++ b/scripts/overlay_tests.sh
@@ -16,6 +16,8 @@
 
 source $(dirname $0)/common.sh
 
+make load-all-images
+
 # Install the runtime and perform basic tests.
 install_runsc_for_test overlay --overlay
 test_runsc //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/packetdrill_tests.sh b/scripts/packetdrill_tests.sh
index fc6bef79c..f0fc444c8 100755
--- a/scripts/packetdrill_tests.sh
+++ b/scripts/packetdrill_tests.sh
@@ -16,5 +16,7 @@
 
 source $(dirname $0)/common.sh
 
+make load-packetdrill
+
 install_runsc_for_test runsc-d
 test_runsc $(bazel query "attr(tags, manual, tests(//test/packetdrill/...))")
diff --git a/scripts/packetimpact_tests.sh b/scripts/packetimpact_tests.sh
index 027d11e64..17fc43f27 100755
--- a/scripts/packetimpact_tests.sh
+++ b/scripts/packetimpact_tests.sh
@@ -16,5 +16,7 @@
 
 source $(dirname $0)/common.sh
 
+make load-packetimpact
+
 install_runsc_for_test runsc-d
 test_runsc $(bazel query "attr(tags, packetimpact, tests(//test/packetimpact/...))")
diff --git a/scripts/root_tests.sh b/scripts/root_tests.sh
index 4e4fcc76b..d629bf2aa 100755
--- a/scripts/root_tests.sh
+++ b/scripts/root_tests.sh
@@ -16,6 +16,8 @@
 
 source $(dirname $0)/common.sh
 
+make load-all-images
+
 # Reinstall the latest containerd shim.
 declare -r base="https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim"
 declare -r latest=$(mktemp --tmpdir gvisor-containerd-shim-latest.XXXXXX)
@@ -28,4 +30,3 @@ sudo mv ${shim_path} /usr/local/bin/gvisor-containerd-shim
 # Run the tests that require root.
 install_runsc_for_test root
 run_as_root //test/root:root_test --runtime=${RUNTIME}
-
diff --git a/scripts/swgso_tests.sh b/scripts/swgso_tests.sh
index 0de2df1d2..c67f2fe5c 100755
--- a/scripts/swgso_tests.sh
+++ b/scripts/swgso_tests.sh
@@ -16,6 +16,8 @@
 
 source $(dirname $0)/common.sh
 
+make load-all-images
+
 # Install the runtime and perform basic tests.
 install_runsc_for_test swgso --software-gso=true --gso=false
 test_runsc //test/image:image_test //test/e2e:integration_test
diff --git a/test/iptables/runner/Dockerfile b/test/iptables/runner/Dockerfile
deleted file mode 100644
index b77db44a1..000000000
--- a/test/iptables/runner/Dockerfile
+++ /dev/null
@@ -1,4 +0,0 @@
-# This Dockerfile builds the image hosted at
-# gcr.io/gvisor-presubmit/iptables-test.
-FROM ubuntu
-RUN apt update && apt install -y iptables
diff --git a/test/packetdrill/Dockerfile b/test/packetdrill/Dockerfile
deleted file mode 100644
index 4b75e9527..000000000
--- a/test/packetdrill/Dockerfile
+++ /dev/null
@@ -1,9 +0,0 @@
-FROM ubuntu:bionic
-
-RUN apt-get update && apt-get install -y net-tools git iptables iputils-ping \
-        netcat tcpdump jq tar bison flex make
-RUN hash -r
-RUN git clone --branch packetdrill-v2.0 \
-        https://github.com/google/packetdrill.git
-RUN cd packetdrill/gtests/net/packetdrill && ./configure && make
-CMD /bin/bash
diff --git a/test/packetimpact/tests/Dockerfile b/test/packetimpact/tests/Dockerfile
deleted file mode 100644
index 9075bc555..000000000
--- a/test/packetimpact/tests/Dockerfile
+++ /dev/null
@@ -1,17 +0,0 @@
-FROM ubuntu:bionic
-
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
-        # iptables to disable OS native packet processing.
-        iptables \
-        # nc to check that the posix_server is running.
-        netcat \
-        # tcpdump to log brief packet sniffing.
-        tcpdump \
-        # ip link show to display MAC addresses.
-        iproute2 \
-        # tshark to log verbose packet sniffing.
-        tshark \
-        # killall for cleanup.
-        psmisc
-RUN hash -r
-CMD /bin/bash
diff --git a/test/root/BUILD b/test/root/BUILD
index 17e51e66e..639e293e3 100644
--- a/test/root/BUILD
+++ b/test/root/BUILD
@@ -48,6 +48,7 @@ go_test(
 
 vm_test(
     name = "root_vm_test",
+    size = "large",
     shard_count = 1,
     targets = [
         "//tools/installers:shim",
diff --git a/test/runtimes/images/Dockerfile_go1.12 b/test/runtimes/images/Dockerfile_go1.12
deleted file mode 100644
index ab9d6abf3..000000000
--- a/test/runtimes/images/Dockerfile_go1.12
+++ /dev/null
@@ -1,10 +0,0 @@
-# Go is easy, since we already have everything we need to compile the proctor
-# binary and run the tests in the golang Docker image.
-FROM golang:1.12
-ADD ["proctor/", "/go/src/proctor/"]
-RUN ["go", "build", "-o", "/proctor", "/go/src/proctor"]
-
-# Pre-compile the tests so we don't need to do so in each test run.
-RUN ["go", "tool", "dist", "test", "-compile-only"]
-
-ENTRYPOINT ["/proctor", "--runtime=go"]
diff --git a/test/runtimes/images/Dockerfile_java11 b/test/runtimes/images/Dockerfile_java11
deleted file mode 100644
index 9b7c3d5a3..000000000
--- a/test/runtimes/images/Dockerfile_java11
+++ /dev/null
@@ -1,30 +0,0 @@
-# Compile the proctor binary.
-FROM golang:1.12 AS golang
-ADD ["proctor/", "/go/src/proctor/"]
-RUN ["go", "build", "-o", "/proctor", "/go/src/proctor"]
-
-FROM ubuntu:bionic
-RUN apt-get update && apt-get install -y \
-  autoconf \
-  build-essential \
-  curl \
-  make \
-  openjdk-11-jdk \
-  unzip \
-  zip
-
-# Download the JDK test library.
-WORKDIR /root
-RUN set -ex \
- && curl -fsSL --retry 10 -o /tmp/jdktests.tar.gz http://hg.openjdk.java.net/jdk/jdk11/archive/76072a077ee1.tar.gz/test \
- && tar -xzf /tmp/jdktests.tar.gz \
- && mv jdk11-76072a077ee1/test test \
- && rm -f /tmp/jdktests.tar.gz
-
-# Install jtreg and add to PATH.
-RUN curl -o jtreg.tar.gz https://ci.adoptopenjdk.net/view/Dependencies/job/jtreg/lastSuccessfulBuild/artifact/jtreg-4.2.0-tip.tar.gz
-RUN tar -xzf jtreg.tar.gz
-ENV PATH="/root/jtreg/bin:$PATH"
-
-COPY --from=golang /proctor /proctor
-ENTRYPOINT ["/proctor", "--runtime=java"]
diff --git a/test/runtimes/images/Dockerfile_nodejs12.4.0 b/test/runtimes/images/Dockerfile_nodejs12.4.0
deleted file mode 100644
index 26f68b487..000000000
--- a/test/runtimes/images/Dockerfile_nodejs12.4.0
+++ /dev/null
@@ -1,28 +0,0 @@
-# Compile the proctor binary.
-FROM golang:1.12 AS golang
-ADD ["proctor/", "/go/src/proctor/"]
-RUN ["go", "build", "-o", "/proctor", "/go/src/proctor"]
-
-FROM ubuntu:bionic
-RUN apt-get update && apt-get install -y \
-  curl \
-  dumb-init \
-  g++ \
-  make \
-  python
-
-WORKDIR /root
-ARG VERSION=v12.4.0
-RUN curl -o node-${VERSION}.tar.gz https://nodejs.org/dist/${VERSION}/node-${VERSION}.tar.gz
-RUN tar -zxf node-${VERSION}.tar.gz
-
-WORKDIR /root/node-${VERSION}
-RUN ./configure
-RUN make
-RUN make test-build
-
-COPY --from=golang /proctor /proctor
-
-# Including dumb-init emulates the Linux "init" process, preventing the failure
-# of tests involving worker processes.
-ENTRYPOINT ["/usr/bin/dumb-init", "/proctor", "--runtime=nodejs"]
diff --git a/test/runtimes/images/Dockerfile_php7.3.6 b/test/runtimes/images/Dockerfile_php7.3.6
deleted file mode 100644
index e6b4c6329..000000000
--- a/test/runtimes/images/Dockerfile_php7.3.6
+++ /dev/null
@@ -1,27 +0,0 @@
-# Compile the proctor binary.
-FROM golang:1.12 AS golang
-ADD ["proctor/", "/go/src/proctor/"]
-RUN ["go", "build", "-o", "/proctor", "/go/src/proctor"]
-
-FROM ubuntu:bionic
-RUN apt-get update && apt-get install -y \
-  autoconf \
-  automake \
-  bison \
-  build-essential \
-  curl \
-  libtool \
-  libxml2-dev \
-  re2c
-
-WORKDIR /root
-ARG VERSION=7.3.6
-RUN curl -o php-${VERSION}.tar.gz https://www.php.net/distributions/php-${VERSION}.tar.gz
-RUN tar -zxf php-${VERSION}.tar.gz
-
-WORKDIR /root/php-${VERSION}
-RUN ./configure
-RUN make
-
-COPY --from=golang /proctor /proctor
-ENTRYPOINT ["/proctor", "--runtime=php"]
diff --git a/test/runtimes/images/Dockerfile_python3.7.3 b/test/runtimes/images/Dockerfile_python3.7.3
deleted file mode 100644
index 905cd22d7..000000000
--- a/test/runtimes/images/Dockerfile_python3.7.3
+++ /dev/null
@@ -1,30 +0,0 @@
-# Compile the proctor binary.
-FROM golang:1.12 AS golang
-ADD ["proctor/", "/go/src/proctor/"]
-RUN ["go", "build", "-o", "/proctor", "/go/src/proctor"]
-
-FROM ubuntu:bionic
-
-RUN apt-get update && apt-get install -y \
-  curl \
-  gcc \
-  libbz2-dev \
-  libffi-dev \
-  liblzma-dev \
-  libreadline-dev \
-  libssl-dev \
-  make \
-  zlib1g-dev
-
-# Use flags -LJO to follow the html redirect and download .tar.gz.
-WORKDIR /root
-ARG VERSION=3.7.3
-RUN curl -LJO https://github.com/python/cpython/archive/v${VERSION}.tar.gz
-RUN tar -zxf cpython-${VERSION}.tar.gz
-
-WORKDIR /root/cpython-${VERSION}
-RUN ./configure --with-pydebug
-RUN make -s -j2
-
-COPY --from=golang /proctor /proctor
-ENTRYPOINT ["/proctor", "--runtime=python"]
diff --git a/tools/bazel.mk b/tools/bazel.mk
new file mode 100644
index 000000000..45fbbecca
--- /dev/null
+++ b/tools/bazel.mk
@@ -0,0 +1,106 @@
+#!/usr/bin/make -f
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See base Makefile.
+BRANCH_NAME := $(shell (git branch --show-current 2>/dev/null || \
+			git rev-parse --abbrev-ref HEAD 2>/dev/null) | \
+			xargs -n 1 basename 2>/dev/null)
+
+# Bazel container configuration (see below).
+USER ?= gvisor
+DOCKER_NAME ?= gvisor-bazel
+DOCKER_RUN_OPTIONS ?= --privileged
+BAZEL_CACHE := $(shell readlink -m ~/.cache/bazel/)
+GCLOUD_CONFIG := $(shell readlink -m ~/.config/gcloud/)
+DOCKER_SOCKET := /var/run/docker.sock
+
+# Non-configurable.
+UID := $(shell id -u ${USER})
+GID := $(shell id -g ${USER})
+FULL_DOCKER_RUN_OPTIONS := $(DOCKER_RUN_OPTIONS)
+FULL_DOCKER_RUN_OPTIONS += -v "$(BAZEL_CACHE):$(BAZEL_CACHE)"
+FULL_DOCKER_RUN_OPTIONS += -v "$(GCLOUD_CONFIG):$(GCLOUD_CONFIG)"
+FULL_DOCKER_RUN_OPTIONS += -v "$(DOCKER_SOCKET):$(DOCKER_SOCKET)"
+
+##
+## Bazel helpers.
+##
+##   This file supports targets that wrap bazel in a running Docker
+##   container to simplify development. Some options are available to
+##   control the behavior of this container:
+##     USER               - The in-container user.
+##     DOCKER_RUN_OPTIONS - Options for the container (default: --privileged, required for tests).
+##     DOCKER_NAME        - The container name (default: gvisor-bazel-HASH).
+##     BAZEL_CACHE        - The bazel cache directory (default: detected).
+##     GCLOUD_CONFIG      - The gcloud config directory (detect: detected).
+##     DOCKER_SOCKET      - The Docker socket (default: detected).
+##
+bazel-server-start: load-default ## Starts the bazel server.
+	docker run -d --rm \
+	        --name $(DOCKER_NAME) \
+		--user 0:0 \
+		-v "$(CURDIR):$(CURDIR)" \
+		--workdir "$(CURDIR)" \
+		--tmpfs /tmp:rw,exec \
+		--entrypoint "" \
+		$(FULL_DOCKER_RUN_OPTIONS) \
+		gvisor.dev/images/default \
+		sh -c "groupadd --gid $(GID) --non-unique $(USER) && \
+		       useradd --uid $(UID) --non-unique --no-create-home --gid $(GID) -d $(HOME) $(USER) && \
+	               bazel version && \
+		       while :; do sleep 3600; done"
+	@while :; do if docker logs $(DOCKER_NAME) 2>/dev/null | grep "Build label:" >/dev/null; then break; fi; sleep 1; done
+.PHONY: bazel-server-start
+
+bazel-shutdown: ## Shuts down a running bazel server.
+	@docker exec --user $(UID):$(GID) $(DOCKER_NAME) bazel shutdown; rc=$$?; docker kill $(DOCKER_NAME) || [[ $$rc -ne 0 ]]
+.PHONY: bazel-shutdown
+
+bazel-alias: ## Emits an alias that can be used within the shell.
+	@echo "alias bazel='docker exec --user $(UID):$(GID) -i $(DOCKER_NAME) bazel'"
+.PHONY: bazel-alias
+
+bazel-server: ## Ensures that the server exists. Used as an internal target.
+	@docker exec $(DOCKER_NAME) true || $(MAKE) bazel-server-start
+.PHONY: bazel-server
+
+build_paths = docker exec --user $(UID):$(GID) -i $(DOCKER_NAME) sh -c 'bazel build $(OPTIONS) $(TARGETS) 2>&1 \
+		| tee /dev/fd/2 \
+		| grep -E "^  bazel-bin/" \
+		| awk "{print $$1;}"' \
+		| xargs -n 1 -I {} sh -c "$(1)"
+
+build: bazel-server
+	@$(call build_paths,echo {})
+.PHONY: build
+
+copy: bazel-server
+ifeq (,$(DESTINATION))
+	$(error Destination not provided.)
+endif
+	@$(call build_paths,cp -a {} $(DESTINATION))
+
+run: bazel-server
+	@$(call build_paths,{} $(ARGS))
+.PHONY: run
+
+sudo: bazel-server
+	@$(call build_paths,sudo -E {} $(ARGS))
+.PHONY: sudo
+
+test: bazel-server
+	@docker exec --user $(UID):$(GID) -i $(DOCKER_NAME) bazel test $(OPTIONS) $(TARGETS)
+.PHONY: test
diff --git a/tools/installers/BUILD b/tools/installers/BUILD
index d78a265ca..caa7b1983 100644
--- a/tools/installers/BUILD
+++ b/tools/installers/BUILD
@@ -16,6 +16,14 @@ sh_binary(
     data = [":runsc"],
 )
 
+sh_binary(
+    name = "images",
+    srcs = ["images.sh"],
+    data = [
+        "//images",
+    ],
+)
+
 sh_binary(
     name = "master",
     srcs = ["master.sh"],
diff --git a/tools/installers/images.sh b/tools/installers/images.sh
new file mode 100755
index 000000000..52e750f57
--- /dev/null
+++ b/tools/installers/images.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeuo pipefail
+
+# Find the images directory.
+for images in $(find . -type d -name images); do
+  if [[ -f "${images}"/Makefile ]]; then
+    make -C "${images}" load-all-images
+  fi
+done
diff --git a/tools/make_repository.sh b/tools/make_repository.sh
index 27ffbc9f3..c91fd283c 100755
--- a/tools/make_repository.sh
+++ b/tools/make_repository.sh
@@ -17,14 +17,37 @@
 # Parse arguments. We require more than two arguments, which are the private
 # keyring, the e-mail associated with the signer, and the list of packages.
 if [ "$#" -le 3 ]; then
-  echo "usage: $0 <private-key> <signer-email> <component> <root> <packages...>"
+  echo "usage: $0 <private-key> <signer-email> <root> <packages...>"
   exit 1
 fi
 declare -r private_key=$(readlink -e "$1"); shift
 declare -r signer="$1"; shift
-declare -r component="$1"; shift
 declare -r root="$1"; shift
 
+# Ensure that we have the correct packages installed.
+function apt_install() {
+  while true; do
+    sudo apt-get update &&
+      sudo apt-get install -y "$@" &&
+      true
+    result="${?}"
+    case $result in
+      0)
+        break
+        ;;
+      100)
+        # 100 is the error code that apt-get returns.
+        ;;
+      *)
+        exit $result
+        ;;
+    esac
+  done
+}
+dpkg-sig --help >/dev/null       || apt_install dpkg-sig
+apt-ftparchive --help >/dev/null || apt_install apt-utils
+xz --help >/dev/null             || apt_install xz-utils
+
 # Verbose from this point.
 set -xeo pipefail
 
@@ -78,7 +101,7 @@ for dir in "${root}"/pool/*/binary-*; do
   name=$(basename "${dir}")
   arch=${name##binary-}
   arches+=("${arch}")
-  repo_packages="${tmpdir}"/"${component}"/"${name}"
+  repo_packages="${tmpdir}"/main/"${name}"
   mkdir -p "${repo_packages}"
   (cd "${root}" && apt-ftparchive --arch "${arch}" packages pool > "${repo_packages}"/Packages)
   (cd "${repo_packages}" && cat Packages | gzip > Packages.gz)
@@ -91,7 +114,7 @@ APT {
   FTPArchive {
     Release {
       Architectures "${arches[@]}";
-      Components "${component}";
+      Components "main";
     };
   };
 };
diff --git a/tools/vm/defs.bzl b/tools/vm/defs.bzl
index 24bf0aabc..61feefcbc 100644
--- a/tools/vm/defs.bzl
+++ b/tools/vm/defs.bzl
@@ -183,7 +183,10 @@ def vm_test(
     """
     targets = kwargs.pop("targets", [])
     if installers == None:
-        installers = ["//tools/installers:head"]
+        installers = [
+            "//tools/installers:head",
+            "//tools/installers:images",
+        ]
     targets = installers + targets
     if default_installer():
         targets = [default_installer()] + targets
-- 
cgit v1.2.3


From d5776be3fbcc9e71c449b7b41786929734ce47e2 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Fri, 24 Apr 2020 14:41:33 -0700
Subject: Improve and update packetimpact README.md

PiperOrigin-RevId: 308328860
---
 test/packetimpact/README.md | 304 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 227 insertions(+), 77 deletions(-)

diff --git a/test/packetimpact/README.md b/test/packetimpact/README.md
index ece4dedc6..a82ad996a 100644
--- a/test/packetimpact/README.md
+++ b/test/packetimpact/README.md
@@ -29,24 +29,24 @@ There are a few ways to write networking tests for gVisor currently:
 
 The right choice depends on the needs of the test.
 
-Feature       | Go unit test | syscall test | packetdrill | packetimpact
-------------- | ------------ | ------------ | ----------- | ------------
-Multiplatform | no           | **YES**      | **YES**     | **YES**
-Concise       | no           | somewhat     | somewhat    | **VERY**
-Control-flow  | **YES**      | **YES**      | no          | **YES**
-Flexible      | **VERY**     | no           | somewhat    | **VERY**
+Feature        | Go unit test | syscall test | packetdrill | packetimpact
+-------------- | ------------ | ------------ | ----------- | ------------
+Multi-platform | no           | **YES**      | **YES**     | **YES**
+Concise        | no           | somewhat     | somewhat    | **VERY**
+Control-flow   | **YES**      | **YES**      | no          | **YES**
+Flexible       | **VERY**     | no           | somewhat    | **VERY**
 
 ### Go unit tests
 
 If the test depends on the internals of gVisor and doesn't need to run on Linux
 or other platforms for comparison purposes, a Go unit test can be appropriate.
 They can observe internals of gVisor networking. The downside is that they are
-**not concise** and **not multiplatform**. If you require insight on gVisor
+**not concise** and **not multi-platform**. If you require insight on gVisor
 internals, this is the right choice.
 
 ### Syscall tests
 
-Syscall tests are **multiplatform** but cannot examine the internals of gVisor
+Syscall tests are **multi-platform** but cannot examine the internals of gVisor
 networking. They are **concise**. They can use **control-flow** structures like
 conditionals, for loops, and variables. However, they are limited to only what
 the POSIX interface provides so they are **not flexible**. For example, you
@@ -57,7 +57,7 @@ protocols, wrong sequence numbers, etc.
 
 ### Packetdrill tests
 
-Packetdrill tests are **multiplatform** and can run against both Linux and
+Packetdrill tests are **multi-platform** and can run against both Linux and
 gVisor. They are **concise** and use a special packetdrill scripting language.
 They are **more flexible** than a syscall test in that they can send packets
 that a syscall test would have difficulty sending, like a packet with a
@@ -73,7 +73,7 @@ other side supports window scaling, for example.
 Packetimpact tests are similar to Packetdrill tests except that they are written
 in Go instead of the packetdrill scripting language. That gives them all the
 **control-flow** abilities of Go (loops, functions, variables, etc). They are
-**multiplatform** in the same way as packetdrill tests but even more
+**multi-platform** in the same way as packetdrill tests but even more
 **flexible** because Go is more expressive than the scripting language of
 packetdrill. However, Go is **not as concise** as the packetdrill language. Many
 design decisions below are made to mitigate that.
@@ -81,21 +81,27 @@ design decisions below are made to mitigate that.
 ## How it works
 
 ```
-    +--------------+               +--------------+
-    |              |   TEST NET    |              |
-    |              | <===========> |    Device    |
-    |    Test      |               |    Under     |
-    |    Bench     |               |    Test      |
-    |              | <===========> |    (DUT)     |
-    |              |  CONTROL NET  |              |
-    +--------------+               +--------------+
+     Testbench                           Device-Under-Test (DUT)
+    +-------------------+               +------------------------+
+    |                   |   TEST NET    |                        |
+    | rawsockets.go <-->| <===========> | <---+                  |
+    |           ^       |               |     |                  |
+    |           |       |               |     |                  |
+    |           v       |               |     |                  |
+    |     unittest      |               |     |                  |
+    |           ^       |               |     |                  |
+    |           |       |               |     |                  |
+    |           v       |               |     v                  |
+    |         dut.go <========gRPC========> posix server         |
+    |                   |  CONTROL NET  |                        |
+    +-------------------+               +------------------------+
 ```
 
-Two docker containers are created by a script, one for the test bench and the
-other for the device under test (DUT). The script connects the two containers
-with a control network and test network. It also does some other tasks like
-waiting until the DUT is ready before starting the test and disabling Linux
-networking that would interfere with the test bench.
+Two docker containers are created by a "runner" script, one for the testbench
+and the other for the device under test (DUT). The script connects the two
+containers with a control network and test network. It also does some other
+tasks like waiting until the DUT is ready before starting the test and disabling
+Linux networking that would interfere with the test bench.
 
 ### DUT
 
@@ -220,7 +226,8 @@ func (i *Injector) Send(b []byte) {...}
     container and in practice, the container doesn't recognize binaries built on
     the host if they use cgo.
 *   Both gVisor and gopacket have the ability to read and write pcap files
-    without cgo but that is insufficient here.
+    without cgo but that is insufficient here because we can't just replay pcap
+    files, we need a more dynamic solution.
 *   The sniffer and injector can't share a socket because they need to be bound
     differently.
 *   Sniffing could have been done asynchronously with channels, obviating the
@@ -270,11 +277,10 @@ but with a pointer for each field that may be `nil`.
     *   Many functions, one per field, like: `filterByFlag(myBytes, SYN)`,
         `filterByLength(myBytes, 20)`, `filterByNextProto(myBytes, 0x8000)`,
         etc.
-    *   Using pointers allows us to combine `Layer`s with a one-line call to
-        `mergo.Merge(...)`. So the default `Layers` can be overridden by a
-        `Layers` with just the TCP conection's src/dst which can be overridden
-        by one with just a test specific TCP window size. Each override is
-        specified as just one call to `mergo.Merge`.
+    *   Using pointers allows us to combine `Layer`s with reflection. So the
+        default `Layers` can be overridden by a `Layers` with just the TCP
+        conection's src/dst which can be overridden by one with just a test
+        specific TCP window size.
     *   It's a proven way to separate the details of a packet from the byte
         format as shown by scapy's success.
 *   Use packetgo. It's more general than parsing packets with gVisor. However:
@@ -334,6 +340,14 @@ type Layer interface {
 }
 ```
 
+The `next` and `prev` make up a link listed so that each layer can get at the
+information in the layer around it. This is necessary for some protocols, like
+TCP that needs the layer before and payload after to compute the checksum. Any
+sequence of `Layer` structs is valid so long as the parser and `toBytes`
+functions can map from type to protool number and vice-versa. When the mapping
+fails, an error is emitted explaining what functionality is missing. The
+solution is either to fix the ordering or implement the missing protocol.
+
 For each `Layer` there is also a parsing function. For example, this one is for
 Ethernet:
 
@@ -392,81 +406,217 @@ for {
 ##### Alternatives considered
 
 *   Don't use previous and next pointers.
-    *   Each layer may need to be able to interrogate the layers aroung it, like
+    *   Each layer may need to be able to interrogate the layers around it, like
         for computing the next protocol number or total length. So *some*
         mechanism is needed for a `Layer` to see neighboring layers.
     *   We could pass the entire array `Layers` to the `toBytes()` function.
         Passing an array to a method that includes in the array the function
         receiver itself seems wrong.
 
-#### Connections
+#### `layerState`
 
-Using `Layers` above, we can create connection structures to maintain state
-about connections. For example, here is the `TCPIPv4` struct:
+`Layers` represents the different headers of a packet but a connection includes
+more state. For example, a TCP connection needs to keep track of the next
+expected sequence number and also the next sequence number to send. This is
+stored in a `layerState` struct. This is the `layerState` for TCP:
 
+```go
+// tcpState maintains state about a TCP connection.
+type tcpState struct {
+    out, in                   TCP
+    localSeqNum, remoteSeqNum *seqnum.Value
+    synAck                    *TCP
+    portPickerFD              int
+    finSent                   bool
+}
 ```
-type TCPIPv4 struct {
-  outgoing     Layers
-  incoming     Layers
-  localSeqNum  uint32
-  remoteSeqNum uint32
-  sniffer      Sniffer
-  injector     Injector
-  t            *testing.T
+
+The next sequence numbers for each side of the connection are stored. `out` and
+`in` have defaults for the TCP header, such as the expected source and
+destination ports for outgoing packets and incoming packets.
+
+##### `layerState` interface
+
+```go
+// layerState stores the state of a layer of a connection.
+type layerState interface {
+    // outgoing returns an outgoing layer to be sent in a frame.
+    outgoing() Layer
+
+    // incoming creates an expected Layer for comparing against a received Layer.
+    // Because the expectation can depend on values in the received Layer, it is
+    // an input to incoming. For example, the ACK number needs to be checked in a
+    // TCP packet but only if the ACK flag is set in the received packet.
+    incoming(received Layer) Layer
+
+    // sent updates the layerState based on the Layer that was sent. The input is
+    // a Layer with all prev and next pointers populated so that the entire frame
+    // as it was sent is available.
+    sent(sent Layer) error
+
+    // received updates the layerState based on a Layer that is receieved. The
+    // input is a Layer with all prev and next pointers populated so that the
+    // entire frame as it was receieved is available.
+    received(received Layer) error
+
+    // close frees associated resources held by the LayerState.
+    close() error
 }
 ```
 
-`TCPIPv4` contains an `outgoing Layers` which holds the defaults for the
-connection, such as the source and destination MACs, IPs, and ports. When
-`outgoing.toBytes()` is called a valid packet for this TCPIPv4 flow is built.
+`outgoing` generates the default Layer for an outgoing packet. For TCP, this
+would be a `TCP` with the source and destination ports populated. Because they
+are static, they are stored inside the `out` member of `tcpState`. However, the
+sequence numbers change frequently so the outgoing sequence number is stored in
+the `localSeqNum` and put into the output of outgoing for each call.
+
+`incoming` does the same functions for packets that arrive but instead of
+generating a packet to send, it generates an expect packet for filtering packets
+that arrive. For example, if a `TCP` header arrives with the wrong ports, it can
+be ignored as belonging to a different connection. `incoming` needs the received
+header itself as an input because the filter may depend on the input. For
+example, the expected sequence number depends on the flags in the TCP header.
+
+`sent` and `received` are run for each header that is actually sent or received
+and used to update the internal state. `incoming` and `outgoing` should *not* be
+used for these purpose. For example, `incoming` is called on every packet that
+arrives but only packets that match ought to actually update the state.
+`outgoing` is called to created outgoing packets and those packets are always
+sent, so unlike `incoming`/`received`, there is one `outgoing` call for each
+`sent` call.
+
+`close` cleans up after the layerState. For example, TCP and UDP need to keep a
+port reserved and then release it.
+
+#### Connections
+
+Using `layerState` above, we can create connections.
 
-It also contains `incoming Layers` which holds filter for incoming packets that
-belong to this flow. `incoming.match(Layers)` is used on received bytes to check
-if they are part of the flow.
+```go
+// Connection holds a collection of layer states for maintaining a connection
+// along with sockets for sniffer and injecting packets.
+type Connection struct {
+    layerStates []layerState
+    injector    Injector
+    sniffer     Sniffer
+    t           *testing.T
+}
+```
 
-The `sniffer` and `injector` are for receiving and sending raw packet bytes. The
-`localSeqNum` and `remoteSeqNum` are updated by `Send` and `Recv` so that
-outgoing packets will have, by default, the correct sequence number and ack
-number.
+The connection stores an array of `layerState` in the order that the headers
+should be present in the frame to send. For example, Ether then IPv4 then TCP.
+The injector and sniffer are for writing and reading frames. A `*testing.T` is
+stored so that internal errors can be reported directly without code in the unit
+test.
 
-TCPIPv4 provides some functions:
+The `Connection` has some useful functions:
 
+```go
+// Close frees associated resources held by the Connection.
+func (conn *Connection) Close() {...}
+// CreateFrame builds a frame for the connection with layer overriding defaults
+// of the innermost layer and additionalLayers added after it.
+func (conn *Connection) CreateFrame(layer Layer, additionalLayers ...Layer) Layers {...}
+// SendFrame sends a frame on the wire and updates the state of all layers.
+func (conn *Connection) SendFrame(frame Layers) {...}
+// Send a packet with reasonable defaults. Potentially override the final layer
+// in the connection with the provided layer and add additionLayers.
+func (conn *Connection) Send(layer Layer, additionalLayers ...Layer) {...}
+// Expect a frame with the final layerStates layer matching the provided Layer
+// within the timeout specified. If it doesn't arrive in time, it returns nil.
+func (conn *Connection) Expect(layer Layer, timeout time.Duration) (Layer, error) {...}
+// ExpectFrame expects a frame that matches the provided Layers within the
+// timeout specified. If it doesn't arrive in time, it returns nil.
+func (conn *Connection) ExpectFrame(layers Layers, timeout time.Duration) (Layers, error) {...}
+// Drain drains the sniffer's receive buffer by receiving packets until there's
+// nothing else to receive.
+func (conn *Connection) Drain() {...}
 ```
-func (conn *TCPIPv4) Send(tcp TCP) {...}
-func (conn *TCPIPv4) Recv(timeout time.Duration) *TCP {...}
+
+`CreateFrame` uses the `[]layerState` to create a frame to send. The first
+argument is for overriding defaults in the last header of the frame, because
+this is the most common need. For a TCPIPv4 connection, this would be the TCP
+header. Optional additionalLayers can be specified to add to the frame being
+created, such as a `Payload` for `TCP`.
+
+`SendFrame` sends the frame to the DUT. It is combined with `CreateFrame` to
+make `Send`. For unittests with basic sending needs, `Send` can be used. If more
+control is needed over the frame, it can be made with `CreateFrame`, modified in
+the unit test, and then sent with `SendFrame`.
+
+On the receiving side, there is `Expect` and `ExpectFrame`. Like with the
+sending side, there are two forms of each function, one for just the last header
+and one for the whole frame. The expect functions use the `[]layerState` to
+create a template for the expected incoming frame. That frame is then overridden
+by the values in the first argument. Finally, a loop starts sniffing packets on
+the wire for frames. If a matching frame is found before the timeout, it is
+returned without error. If not, nil is returned and the error contains text of
+all the received frames that didn't match. Exactly one of the outputs will be
+non-nil, even if no frames are received at all.
+
+`Drain` sniffs and discards all the frames that have yet to be received. A
+common way to write a test is:
+
+```go
+conn.Drain() // Discard all outstanding frames.
+conn.Send(...) // Send a frame with overrides.
+// Now expect a frame with a certain header and fail if it doesn't arrive.
+if _, err := conn.Expect(...); err != nil { t.Fatal(...) }
 ```
 
-`Send(tcp TCP)` uses [mergo](https://github.com/imdario/mergo) to merge the
-provided `TCP` (a `Layer`) into `outgoing`. This way the user can specify
-concisely just which fields of `outgoing` to modify. The packet is sent using
-the `injector`.
+Or for a test where we want to check that no frame arrives:
 
-`Recv(timeout time.Duration)` reads packets from the sniffer until either the
-timeout has elapsed or a packet that matches `incoming` arrives.
+```go
+if gotOne, _ := conn.Expect(...); gotOne != nil { t.Fatal(...) }
+```
+
+#### Specializing `Connection`
 
-Using those, we can perform a TCP 3-way handshake without too much code:
+Because there are some common combinations of `layerState` into `Connection`,
+they are defined:
 
 ```go
-func (conn *TCPIPv4) Handshake() {
-  syn := uint8(header.TCPFlagSyn)
-  synack := uint8(header.TCPFlagSyn)
-  ack := uint8(header.TCPFlagAck)
-  conn.Send(TCP{Flags: &syn}) // Send a packet with all defaults but set TCP-SYN.
-
-  // Wait for the SYN-ACK response.
-  for {
-    newTCP := conn.Recv(time.Second)  // This already filters by MAC, IP, and ports.
-    if TCP{Flags: &synack}.match(newTCP) {
-      break // Only if it's a SYN-ACK proceed.
-    }
-  }
+// TCPIPv4 maintains the state for all the layers in a TCP/IPv4 connection.
+type TCPIPv4 Connection
+// UDPIPv4 maintains the state for all the layers in a UDP/IPv4 connection.
+type UDPIPv4 Connection
+```
+
+Each has a `NewXxx` function to create a new connection with reasonable
+defaults. They also have functions that call the underlying `Connection`
+functions but with specialization and tighter type-checking. For example:
 
-  conn.Send(TCP{Flags: &ack}) // Send an ACK. The seq and ack numbers are set correctly.
+```go
+func (conn *TCPIPv4) Send(tcp TCP, additionalLayers ...Layer) {
+    (*Connection)(conn).Send(&tcp, additionalLayers...)
+}
+func (conn *TCPIPv4) Drain() {
+    conn.sniffer.Drain()
+}
+```
+
+They may also have some accessors to get or set the internal state of the
+connection:
+
+```go
+func (conn *TCPIPv4) state() *tcpState {
+    state, ok := conn.layerStates[len(conn.layerStates)-1].(*tcpState)
+    if !ok {
+        conn.t.Fatalf("expected final state of %v to be tcpState", conn.layerStates)
+    }
+    return state
+}
+func (conn *TCPIPv4) RemoteSeqNum() *seqnum.Value {
+    return conn.state().remoteSeqNum
+}
+func (conn *TCPIPv4) LocalSeqNum() *seqnum.Value {
+    return conn.state().localSeqNum
 }
 ```
 
-The handshake code is part of the testbench utilities so tests can share this
-common sequence, making tests even more concise.
+Unittests will in practice use these functions and not the functions on
+`Connection`. For example, `NewTCPIPv4()` and then call `Send` on that rather
+than cast is to a `Connection` and call `Send` on that cast result.
 
 ##### Alternatives considered
 
-- 
cgit v1.2.3


From 3d860530a904004aea5bc95e6331b3b11cec1877 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Fri, 24 Apr 2020 15:02:33 -0700
Subject: Better error message from ExpectFrame

Display the errors as diffs between the expected and wanted frame.

PiperOrigin-RevId: 308333271
---
 test/packetimpact/testbench/connections.go        |  94 +++++----
 test/packetimpact/testbench/layers.go             | 245 ++++++++++++++++++++++
 test/packetimpact/testbench/layers_test.go        |  80 +++++++
 test/packetimpact/tests/fin_wait2_timeout_test.go |   4 +-
 4 files changed, 376 insertions(+), 47 deletions(-)

diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index 952a717e0..42a90a859 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -21,7 +21,6 @@ import (
 	"fmt"
 	"math/rand"
 	"net"
-	"strings"
 	"testing"
 	"time"
 
@@ -66,14 +65,16 @@ func pickPort() (int, uint16, error) {
 
 // layerState stores the state of a layer of a connection.
 type layerState interface {
-	// outgoing returns an outgoing layer to be sent in a frame.
+	// outgoing returns an outgoing layer to be sent in a frame. It should not
+	// update layerState, that is done in layerState.sent.
 	outgoing() Layer
 
 	// incoming creates an expected Layer for comparing against a received Layer.
 	// Because the expectation can depend on values in the received Layer, it is
 	// an input to incoming. For example, the ACK number needs to be checked in a
-	// TCP packet but only if the ACK flag is set in the received packet. The
-	// calles takes ownership of the returned Layer.
+	// TCP packet but only if the ACK flag is set in the received packet. It
+	// should not update layerState, that is done in layerState.received. The
+	// caller takes ownership of the returned Layer.
 	incoming(received Layer) Layer
 
 	// sent updates the layerState based on the Layer that was sent. The input is
@@ -363,44 +364,33 @@ type Connection struct {
 	t           *testing.T
 }
 
-// match tries to match each Layer in received against the incoming filter. If
-// received is longer than layerStates then that may still count as a match. The
-// reverse is never a match. override overrides the default matchers for each
-// Layer.
-func (conn *Connection) match(override, received Layers) bool {
-	var layersToMatch int
-	if len(override) < len(conn.layerStates) {
-		layersToMatch = len(conn.layerStates)
-	} else {
-		layersToMatch = len(override)
-	}
-	if len(received) < layersToMatch {
-		return false
-	}
-	for i := 0; i < layersToMatch; i++ {
-		var toMatch Layer
-		if i < len(conn.layerStates) {
-			s := conn.layerStates[i]
-			toMatch = s.incoming(received[i])
-			if toMatch == nil {
-				return false
-			}
-			if i < len(override) {
-				if err := toMatch.merge(override[i]); err != nil {
-					conn.t.Fatalf("failed to merge: %s", err)
-				}
-			}
-		} else {
-			toMatch = override[i]
-			if toMatch == nil {
-				conn.t.Fatalf("expect the overriding layers to be non-nil")
-			}
-		}
-		if !toMatch.match(received[i]) {
-			return false
+// Returns the default incoming frame against which to match. If received is
+// longer than layerStates then that may still count as a match. The reverse is
+// never a match and nil is returned.
+func (conn *Connection) incoming(received Layers) Layers {
+	if len(received) < len(conn.layerStates) {
+		return nil
+	}
+	in := Layers{}
+	for i, s := range conn.layerStates {
+		toMatch := s.incoming(received[i])
+		if toMatch == nil {
+			return nil
 		}
+		in = append(in, toMatch)
+	}
+	return in
+}
+
+func (conn *Connection) match(override, received Layers) bool {
+	toMatch := conn.incoming(received)
+	if toMatch == nil {
+		return false // Not enough layers in gotLayers for matching.
 	}
-	return true
+	if err := toMatch.merge(override); err != nil {
+		return false // Failing to merge is not matching.
+	}
+	return toMatch.match(received)
 }
 
 // Close frees associated resources held by the Connection.
@@ -470,6 +460,16 @@ func (conn *Connection) recvFrame(timeout time.Duration) Layers {
 	return parse(parseEther, b)
 }
 
+// layersError stores the Layers that we got and the Layers that we wanted to
+// match.
+type layersError struct {
+	got, want Layers
+}
+
+func (e *layersError) Error() string {
+	return e.got.diff(e.want)
+}
+
 // Expect a frame with the final layerStates layer matching the provided Layer
 // within the timeout specified. If it doesn't arrive in time, it returns nil.
 func (conn *Connection) Expect(layer Layer, timeout time.Duration) (Layer, error) {
@@ -485,21 +485,25 @@ func (conn *Connection) Expect(layer Layer, timeout time.Duration) (Layer, error
 		return gotFrame[len(conn.layerStates)-1], nil
 	}
 	conn.t.Fatal("the received frame should be at least as long as the expected layers")
-	return nil, fmt.Errorf("the received frame should be at least as long as the expected layers")
+	panic("unreachable")
 }
 
 // ExpectFrame expects a frame that matches the provided Layers within the
-// timeout specified. If it doesn't arrive in time, it returns nil.
+// timeout specified. If one arrives in time, the Layers is returned without an
+// error. If it doesn't arrive in time, it returns nil and error is non-nil.
 func (conn *Connection) ExpectFrame(layers Layers, timeout time.Duration) (Layers, error) {
 	deadline := time.Now().Add(timeout)
-	var allLayers []string
+	var errs error
 	for {
 		var gotLayers Layers
 		if timeout = time.Until(deadline); timeout > 0 {
 			gotLayers = conn.recvFrame(timeout)
 		}
 		if gotLayers == nil {
-			return nil, fmt.Errorf("got %d packets:\n%s", len(allLayers), strings.Join(allLayers, "\n"))
+			if errs == nil {
+				return nil, fmt.Errorf("got no frames matching %v during %s", layers, timeout)
+			}
+			return nil, fmt.Errorf("got no frames matching %v during %s: got %w", layers, timeout, errs)
 		}
 		if conn.match(layers, gotLayers) {
 			for i, s := range conn.layerStates {
@@ -509,7 +513,7 @@ func (conn *Connection) ExpectFrame(layers Layers, timeout time.Duration) (Layer
 			}
 			return gotLayers, nil
 		}
-		allLayers = append(allLayers, fmt.Sprintf("%s", gotLayers))
+		errs = multierr.Combine(errs, &layersError{got: gotLayers, want: conn.incoming(gotLayers)})
 	}
 }
 
diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go
index 01e99567d..2cbbbb318 100644
--- a/test/packetimpact/testbench/layers.go
+++ b/test/packetimpact/testbench/layers.go
@@ -22,6 +22,7 @@ import (
 
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
+	"go.uber.org/multierr"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -720,3 +721,247 @@ func (ls *Layers) match(other Layers) bool {
 	}
 	return true
 }
+
+// layerDiff stores the diffs for each field along with the label for the Layer.
+// If rows is nil, that means that there was no diff.
+type layerDiff struct {
+	label string
+	rows  []layerDiffRow
+}
+
+// layerDiffRow stores the fields and corresponding values for two got and want
+// layers. If the value was nil then the string stored is the empty string.
+type layerDiffRow struct {
+	field, got, want string
+}
+
+// diffLayer extracts all differing fields between two layers.
+func diffLayer(got, want Layer) []layerDiffRow {
+	vGot := reflect.ValueOf(got).Elem()
+	vWant := reflect.ValueOf(want).Elem()
+	if vGot.Type() != vWant.Type() {
+		return nil
+	}
+	t := vGot.Type()
+	var result []layerDiffRow
+	for i := 0; i < t.NumField(); i++ {
+		t := t.Field(i)
+		if t.Anonymous {
+			// Ignore the LayerBase in the Layer struct.
+			continue
+		}
+		vGot := vGot.Field(i)
+		vWant := vWant.Field(i)
+		gotString := ""
+		if !vGot.IsNil() {
+			gotString = fmt.Sprint(reflect.Indirect(vGot))
+		}
+		wantString := ""
+		if !vWant.IsNil() {
+			wantString = fmt.Sprint(reflect.Indirect(vWant))
+		}
+		result = append(result, layerDiffRow{t.Name, gotString, wantString})
+	}
+	return result
+}
+
+// layerType returns a concise string describing the type of the Layer, like
+// "TCP", or "IPv6".
+func layerType(l Layer) string {
+	return reflect.TypeOf(l).Elem().Name()
+}
+
+// diff compares Layers and returns a representation of the difference. Each
+// Layer in the Layers is pairwise compared. If an element in either is nil, it
+// is considered a match with the other Layer. If two Layers have differing
+// types, they don't match regardless of the contents. If two Layers have the
+// same type then the fields in the Layer are pairwise compared. Fields that are
+// nil always match. Two non-nil fields only match if they point to equal
+// values. diff returns an empty string if and only if *ls and other match.
+func (ls *Layers) diff(other Layers) string {
+	var allDiffs []layerDiff
+	// Check the cases where one list is longer than the other, where one or both
+	// elements are nil, where the sides have different types, and where the sides
+	// have the same type.
+	for i := 0; i < len(*ls) || i < len(other); i++ {
+		if i >= len(*ls) {
+			// Matching ls against other where other is longer than ls. missing
+			// matches everything so we just include a label without any rows. Having
+			// no rows is a sign that there was no diff.
+			allDiffs = append(allDiffs, layerDiff{
+				label: "missing matches " + layerType(other[i]),
+			})
+			continue
+		}
+
+		if i >= len(other) {
+			// Matching ls against other where ls is longer than other. missing
+			// matches everything so we just include a label without any rows. Having
+			// no rows is a sign that there was no diff.
+			allDiffs = append(allDiffs, layerDiff{
+				label: layerType((*ls)[i]) + " matches missing",
+			})
+			continue
+		}
+
+		if (*ls)[i] == nil && other[i] == nil {
+			// Matching ls against other where both elements are nil. nil matches
+			// everything so we just include a label without any rows. Having no rows
+			// is a sign that there was no diff.
+			allDiffs = append(allDiffs, layerDiff{
+				label: "nil matches nil",
+			})
+			continue
+		}
+
+		if (*ls)[i] == nil {
+			// Matching ls against other where the element in ls is nil. nil matches
+			// everything so we just include a label without any rows. Having no rows
+			// is a sign that there was no diff.
+			allDiffs = append(allDiffs, layerDiff{
+				label: "nil matches " + layerType(other[i]),
+			})
+			continue
+		}
+
+		if other[i] == nil {
+			// Matching ls against other where the element in other is nil. nil
+			// matches everything so we just include a label without any rows. Having
+			// no rows is a sign that there was no diff.
+			allDiffs = append(allDiffs, layerDiff{
+				label: layerType((*ls)[i]) + " matches nil",
+			})
+			continue
+		}
+
+		if reflect.TypeOf((*ls)[i]) == reflect.TypeOf(other[i]) {
+			// Matching ls against other where both elements have the same type. Match
+			// each field pairwise and only report a diff if there is a mismatch,
+			// which is only when both sides are non-nil and have differring values.
+			diff := diffLayer((*ls)[i], other[i])
+			var layerDiffRows []layerDiffRow
+			for _, d := range diff {
+				if d.got == "" || d.want == "" || d.got == d.want {
+					continue
+				}
+				layerDiffRows = append(layerDiffRows, layerDiffRow{
+					d.field,
+					d.got,
+					d.want,
+				})
+			}
+			if len(layerDiffRows) > 0 {
+				allDiffs = append(allDiffs, layerDiff{
+					label: layerType((*ls)[i]),
+					rows:  layerDiffRows,
+				})
+			} else {
+				allDiffs = append(allDiffs, layerDiff{
+					label: layerType((*ls)[i]) + " matches " + layerType(other[i]),
+					// Having no rows is a sign that there was no diff.
+				})
+			}
+			continue
+		}
+		// Neither side is nil and the types are different, so we'll display one
+		// side then the other.
+		allDiffs = append(allDiffs, layerDiff{
+			label: layerType((*ls)[i]) + " doesn't match " + layerType(other[i]),
+		})
+		diff := diffLayer((*ls)[i], (*ls)[i])
+		layerDiffRows := []layerDiffRow{}
+		for _, d := range diff {
+			if len(d.got) == 0 {
+				continue
+			}
+			layerDiffRows = append(layerDiffRows, layerDiffRow{
+				d.field,
+				d.got,
+				"",
+			})
+		}
+		allDiffs = append(allDiffs, layerDiff{
+			label: layerType((*ls)[i]),
+			rows:  layerDiffRows,
+		})
+
+		layerDiffRows = []layerDiffRow{}
+		diff = diffLayer(other[i], other[i])
+		for _, d := range diff {
+			if len(d.want) == 0 {
+				continue
+			}
+			layerDiffRows = append(layerDiffRows, layerDiffRow{
+				d.field,
+				"",
+				d.want,
+			})
+		}
+		allDiffs = append(allDiffs, layerDiff{
+			label: layerType(other[i]),
+			rows:  layerDiffRows,
+		})
+	}
+
+	output := ""
+	// These are for output formatting.
+	maxLabelLen, maxFieldLen, maxGotLen, maxWantLen := 0, 0, 0, 0
+	foundOne := false
+	for _, l := range allDiffs {
+		if len(l.label) > maxLabelLen && len(l.rows) > 0 {
+			maxLabelLen = len(l.label)
+		}
+		if l.rows != nil {
+			foundOne = true
+		}
+		for _, r := range l.rows {
+			if len(r.field) > maxFieldLen {
+				maxFieldLen = len(r.field)
+			}
+			if l := len(fmt.Sprint(r.got)); l > maxGotLen {
+				maxGotLen = l
+			}
+			if l := len(fmt.Sprint(r.want)); l > maxWantLen {
+				maxWantLen = l
+			}
+		}
+	}
+	if !foundOne {
+		return ""
+	}
+	for _, l := range allDiffs {
+		if len(l.rows) == 0 {
+			output += "(" + l.label + ")\n"
+			continue
+		}
+		for i, r := range l.rows {
+			var label string
+			if i == 0 {
+				label = l.label + ":"
+			}
+			output += fmt.Sprintf(
+				"%*s %*s %*v %*v\n",
+				maxLabelLen+1, label,
+				maxFieldLen+1, r.field+":",
+				maxGotLen, r.got,
+				maxWantLen, r.want,
+			)
+		}
+	}
+	return output
+}
+
+// merge merges the other Layers into ls. If the other Layers is longer, those
+// additional Layer structs are added to ls. The errors from merging are
+// collected and returned.
+func (ls *Layers) merge(other Layers) error {
+	var errs error
+	for i, o := range other {
+		if i < len(*ls) {
+			errs = multierr.Combine(errs, (*ls)[i].merge(o))
+		} else {
+			*ls = append(*ls, o)
+		}
+	}
+	return errs
+}
diff --git a/test/packetimpact/testbench/layers_test.go b/test/packetimpact/testbench/layers_test.go
index f07ec5eb2..96f72de5b 100644
--- a/test/packetimpact/testbench/layers_test.go
+++ b/test/packetimpact/testbench/layers_test.go
@@ -313,3 +313,83 @@ func TestConnectionMatch(t *testing.T) {
 		})
 	}
 }
+
+func TestLayersDiff(t *testing.T) {
+	for _, tt := range []struct {
+		x, y Layers
+		want string
+	}{
+		{
+			Layers{&Ether{Type: NetworkProtocolNumber(12)}, &TCP{DataOffset: Uint8(5), SeqNum: Uint32(5)}},
+			Layers{&Ether{Type: NetworkProtocolNumber(13)}, &TCP{DataOffset: Uint8(7), SeqNum: Uint32(6)}},
+			"Ether:       Type: 12 13\n" +
+				"  TCP:     SeqNum:  5  6\n" +
+				"       DataOffset:  5  7\n",
+		},
+		{
+			Layers{&Ether{Type: NetworkProtocolNumber(12)}, &UDP{SrcPort: Uint16(123)}},
+			Layers{&Ether{Type: NetworkProtocolNumber(13)}, &TCP{DataOffset: Uint8(7), SeqNum: Uint32(6)}},
+			"Ether:       Type:  12 13\n" +
+				"(UDP doesn't match TCP)\n" +
+				"  UDP:    SrcPort: 123   \n" +
+				"  TCP:     SeqNum:      6\n" +
+				"       DataOffset:      7\n",
+		},
+		{
+			Layers{&UDP{SrcPort: Uint16(123)}},
+			Layers{&Ether{Type: NetworkProtocolNumber(13)}, &TCP{DataOffset: Uint8(7), SeqNum: Uint32(6)}},
+			"(UDP doesn't match Ether)\n" +
+				"  UDP: SrcPort: 123   \n" +
+				"Ether:    Type:     13\n" +
+				"(missing matches TCP)\n",
+		},
+		{
+			Layers{nil, &UDP{SrcPort: Uint16(123)}},
+			Layers{&Ether{Type: NetworkProtocolNumber(13)}, &TCP{DataOffset: Uint8(7), SeqNum: Uint32(6)}},
+			"(nil matches Ether)\n" +
+				"(UDP doesn't match TCP)\n" +
+				"UDP:    SrcPort: 123  \n" +
+				"TCP:     SeqNum:     6\n" +
+				"     DataOffset:     7\n",
+		},
+		{
+			Layers{&Ether{Type: NetworkProtocolNumber(13)}, &IPv4{IHL: Uint8(4)}, &TCP{DataOffset: Uint8(7), SeqNum: Uint32(6)}},
+			Layers{&Ether{Type: NetworkProtocolNumber(13)}, &IPv4{IHL: Uint8(6)}, &TCP{DataOffset: Uint8(7), SeqNum: Uint32(6)}},
+			"(Ether matches Ether)\n" +
+				"IPv4: IHL: 4 6\n" +
+				"(TCP matches TCP)\n",
+		},
+		{
+			Layers{&Payload{Bytes: []byte("foo")}},
+			Layers{&Payload{Bytes: []byte("bar")}},
+			"Payload: Bytes: [102 111 111] [98 97 114]\n",
+		},
+		{
+			Layers{&Payload{Bytes: []byte("")}},
+			Layers{&Payload{}},
+			"",
+		},
+		{
+			Layers{&Payload{Bytes: []byte("")}},
+			Layers{&Payload{Bytes: []byte("")}},
+			"",
+		},
+		{
+			Layers{&UDP{}},
+			Layers{&TCP{}},
+			"(UDP doesn't match TCP)\n" +
+				"(UDP)\n" +
+				"(TCP)\n",
+		},
+	} {
+		if got := tt.x.diff(tt.y); got != tt.want {
+			t.Errorf("%s.diff(%s) = %q, want %q", tt.x, tt.y, got, tt.want)
+		}
+		if tt.x.match(tt.y) != (tt.x.diff(tt.y) == "") {
+			t.Errorf("match and diff of %s and %s disagree", tt.x, tt.y)
+		}
+		if tt.y.match(tt.x) != (tt.y.diff(tt.x) == "") {
+			t.Errorf("match and diff of %s and %s disagree", tt.y, tt.x)
+		}
+	}
+}
diff --git a/test/packetimpact/tests/fin_wait2_timeout_test.go b/test/packetimpact/tests/fin_wait2_timeout_test.go
index b98594f94..99dc77f9a 100644
--- a/test/packetimpact/tests/fin_wait2_timeout_test.go
+++ b/test/packetimpact/tests/fin_wait2_timeout_test.go
@@ -61,8 +61,8 @@ func TestFinWait2Timeout(t *testing.T) {
 					t.Fatalf("expected a RST packet within a second but got none: %s", err)
 				}
 			} else {
-				if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, 10*time.Second); err == nil {
-					t.Fatalf("expected no RST packets within ten seconds but got one: %s", err)
+				if got, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, 10*time.Second); got != nil || err == nil {
+					t.Fatalf("expected no RST packets within ten seconds but got one: %s", got)
 				}
 			}
 		})
-- 
cgit v1.2.3


From dfff265fe422499af3bbe7d58e8db35ba32304f5 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Fri, 24 Apr 2020 15:55:11 -0700
Subject: Add ICMP6 param problem test

Tested:
  When run on Linux, a correct ICMPv6 response is received.  On netstack, no
  ICMPv6 response is received.
PiperOrigin-RevId: 308343113
---
 pkg/tcpip/header/ipv6.go                           |  10 +-
 test/packetimpact/testbench/connections.go         | 176 ++++++++++++++---
 test/packetimpact/testbench/layers.go              | 212 +++++++++++++++++++--
 test/packetimpact/tests/BUILD                      |  13 ++
 .../tests/icmpv6_param_problem_test.go             |  73 +++++++
 test/packetimpact/tests/test_runner.sh             |  41 +++-
 6 files changed, 484 insertions(+), 41 deletions(-)
 create mode 100644 test/packetimpact/tests/icmpv6_param_problem_test.go

diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index 76e88e9b3..ba80b64a8 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -27,7 +27,9 @@ const (
 	// IPv6PayloadLenOffset is the offset of the PayloadLength field in
 	// IPv6 header.
 	IPv6PayloadLenOffset = 4
-	nextHdr              = 6
+	// IPv6NextHeaderOffset is the offset of the NextHeader field in
+	// IPv6 header.
+	IPv6NextHeaderOffset = 6
 	hopLimit             = 7
 	v6SrcAddr            = 8
 	v6DstAddr            = v6SrcAddr + IPv6AddressSize
@@ -163,7 +165,7 @@ func (b IPv6) HopLimit() uint8 {
 
 // NextHeader returns the value of the "next header" field of the ipv6 header.
 func (b IPv6) NextHeader() uint8 {
-	return b[nextHdr]
+	return b[IPv6NextHeaderOffset]
 }
 
 // TransportProtocol implements Network.TransportProtocol.
@@ -223,7 +225,7 @@ func (b IPv6) SetDestinationAddress(addr tcpip.Address) {
 
 // SetNextHeader sets the value of the "next header" field of the ipv6 header.
 func (b IPv6) SetNextHeader(v uint8) {
-	b[nextHdr] = v
+	b[IPv6NextHeaderOffset] = v
 }
 
 // SetChecksum implements Network.SetChecksum. Given that IPv6 doesn't have a
@@ -235,7 +237,7 @@ func (IPv6) SetChecksum(uint16) {
 func (b IPv6) Encode(i *IPv6Fields) {
 	b.SetTOS(i.TrafficClass, i.FlowLabel)
 	b.SetPayloadLength(i.PayloadLength)
-	b[nextHdr] = i.NextHeader
+	b[IPv6NextHeaderOffset] = i.NextHeader
 	b[hopLimit] = i.HopLimit
 	b.SetSourceAddress(i.SrcAddr)
 	b.SetDestinationAddress(i.DstAddr)
diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index 42a90a859..2280bd4ee 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -34,33 +34,60 @@ import (
 
 var localIPv4 = flag.String("local_ipv4", "", "local IPv4 address for test packets")
 var remoteIPv4 = flag.String("remote_ipv4", "", "remote IPv4 address for test packets")
+var localIPv6 = flag.String("local_ipv6", "", "local IPv6 address for test packets")
+var remoteIPv6 = flag.String("remote_ipv6", "", "remote IPv6 address for test packets")
 var localMAC = flag.String("local_mac", "", "local mac address for test packets")
 var remoteMAC = flag.String("remote_mac", "", "remote mac address for test packets")
 
-// pickPort makes a new socket and returns the socket FD and port. The caller
-// must close the FD when done with the port if there is no error.
-func pickPort() (int, uint16, error) {
-	fd, err := unix.Socket(unix.AF_INET, unix.SOCK_STREAM, 0)
+// pickPort makes a new socket and returns the socket FD and port. The domain
+// should be AF_INET or AF_INET6. The caller must close the FD when done with
+// the port if there is no error.
+func pickPort(domain, typ int) (fd int, port uint16, err error) {
+	fd, err = unix.Socket(domain, typ, 0)
 	if err != nil {
 		return -1, 0, err
 	}
-	var sa unix.SockaddrInet4
-	copy(sa.Addr[0:4], net.ParseIP(*localIPv4).To4())
-	if err := unix.Bind(fd, &sa); err != nil {
-		unix.Close(fd)
+	defer func() {
+		if err != nil {
+			err = multierr.Append(err, unix.Close(fd))
+		}
+	}()
+	var sa unix.Sockaddr
+	switch domain {
+	case unix.AF_INET:
+		var sa4 unix.SockaddrInet4
+		copy(sa4.Addr[:], net.ParseIP(*localIPv4).To4())
+		sa = &sa4
+	case unix.AF_INET6:
+		var sa6 unix.SockaddrInet6
+		copy(sa6.Addr[:], net.ParseIP(*localIPv6).To16())
+		sa = &sa6
+	default:
+		return -1, 0, fmt.Errorf("invalid domain %d, it should be one of unix.AF_INET or unix.AF_INET6", domain)
+	}
+	if err = unix.Bind(fd, sa); err != nil {
 		return -1, 0, err
 	}
 	newSockAddr, err := unix.Getsockname(fd)
 	if err != nil {
-		unix.Close(fd)
 		return -1, 0, err
 	}
-	newSockAddrInet4, ok := newSockAddr.(*unix.SockaddrInet4)
-	if !ok {
-		unix.Close(fd)
-		return -1, 0, fmt.Errorf("can't cast Getsockname result to SockaddrInet4")
+	switch domain {
+	case unix.AF_INET:
+		newSockAddrInet4, ok := newSockAddr.(*unix.SockaddrInet4)
+		if !ok {
+			return -1, 0, fmt.Errorf("can't cast Getsockname result %T to SockaddrInet4", newSockAddr)
+		}
+		return fd, uint16(newSockAddrInet4.Port), nil
+	case unix.AF_INET6:
+		newSockAddrInet6, ok := newSockAddr.(*unix.SockaddrInet6)
+		if !ok {
+			return -1, 0, fmt.Errorf("can't cast Getsockname result %T to SockaddrInet6", newSockAddr)
+		}
+		return fd, uint16(newSockAddrInet6.Port), nil
+	default:
+		return -1, 0, fmt.Errorf("invalid domain %d, it should be one of unix.AF_INET or unix.AF_INET6", domain)
 	}
-	return fd, uint16(newSockAddrInet4.Port), nil
 }
 
 // layerState stores the state of a layer of a connection.
@@ -123,7 +150,7 @@ func newEtherState(out, in Ether) (*etherState, error) {
 }
 
 func (s *etherState) outgoing() Layer {
-	return &s.out
+	return deepcopy.Copy(&s.out).(Layer)
 }
 
 // incoming implements layerState.incoming.
@@ -168,7 +195,7 @@ func newIPv4State(out, in IPv4) (*ipv4State, error) {
 }
 
 func (s *ipv4State) outgoing() Layer {
-	return &s.out
+	return deepcopy.Copy(&s.out).(Layer)
 }
 
 // incoming implements layerState.incoming.
@@ -188,6 +215,54 @@ func (*ipv4State) close() error {
 	return nil
 }
 
+// ipv6State maintains state about an IPv6 connection.
+type ipv6State struct {
+	out, in IPv6
+}
+
+var _ layerState = (*ipv6State)(nil)
+
+// newIPv6State creates a new ipv6State.
+func newIPv6State(out, in IPv6) (*ipv6State, error) {
+	lIP := tcpip.Address(net.ParseIP(*localIPv6).To16())
+	rIP := tcpip.Address(net.ParseIP(*remoteIPv6).To16())
+	s := ipv6State{
+		out: IPv6{SrcAddr: &lIP, DstAddr: &rIP},
+		in:  IPv6{SrcAddr: &rIP, DstAddr: &lIP},
+	}
+	if err := s.out.merge(&out); err != nil {
+		return nil, err
+	}
+	if err := s.in.merge(&in); err != nil {
+		return nil, err
+	}
+	return &s, nil
+}
+
+// outgoing returns an outgoing layer to be sent in a frame.
+func (s *ipv6State) outgoing() Layer {
+	return deepcopy.Copy(&s.out).(Layer)
+}
+
+func (s *ipv6State) incoming(Layer) Layer {
+	return deepcopy.Copy(&s.in).(Layer)
+}
+
+func (s *ipv6State) sent(Layer) error {
+	// Nothing to do.
+	return nil
+}
+
+func (s *ipv6State) received(Layer) error {
+	// Nothing to do.
+	return nil
+}
+
+// close cleans up any resources held.
+func (s *ipv6State) close() error {
+	return nil
+}
+
 // tcpState maintains state about a TCP connection.
 type tcpState struct {
 	out, in                   TCP
@@ -206,8 +281,8 @@ func SeqNumValue(v seqnum.Value) *seqnum.Value {
 }
 
 // newTCPState creates a new TCPState.
-func newTCPState(out, in TCP) (*tcpState, error) {
-	portPickerFD, localPort, err := pickPort()
+func newTCPState(domain int, out, in TCP) (*tcpState, error) {
+	portPickerFD, localPort, err := pickPort(domain, unix.SOCK_STREAM)
 	if err != nil {
 		return nil, err
 	}
@@ -310,8 +385,8 @@ type udpState struct {
 var _ layerState = (*udpState)(nil)
 
 // newUDPState creates a new udpState.
-func newUDPState(out, in UDP) (*udpState, error) {
-	portPickerFD, localPort, err := pickPort()
+func newUDPState(domain int, out, in UDP) (*udpState, error) {
+	portPickerFD, localPort, err := pickPort(domain, unix.SOCK_DGRAM)
 	if err != nil {
 		return nil, err
 	}
@@ -330,7 +405,7 @@ func newUDPState(out, in UDP) (*udpState, error) {
 }
 
 func (s *udpState) outgoing() Layer {
-	return &s.out
+	return deepcopy.Copy(&s.out).(Layer)
 }
 
 // incoming implements layerState.incoming.
@@ -422,7 +497,7 @@ func (conn *Connection) CreateFrame(layer Layer, additionalLayers ...Layer) Laye
 
 // SendFrame sends a frame on the wire and updates the state of all layers.
 func (conn *Connection) SendFrame(frame Layers) {
-	outBytes, err := frame.toBytes()
+	outBytes, err := frame.ToBytes()
 	if err != nil {
 		conn.t.Fatalf("can't build outgoing TCP packet: %s", err)
 	}
@@ -536,7 +611,7 @@ func NewTCPIPv4(t *testing.T, outgoingTCP, incomingTCP TCP) TCPIPv4 {
 	if err != nil {
 		t.Fatalf("can't make ipv4State: %s", err)
 	}
-	tcpState, err := newTCPState(outgoingTCP, incomingTCP)
+	tcpState, err := newTCPState(unix.AF_INET, outgoingTCP, incomingTCP)
 	if err != nil {
 		t.Fatalf("can't make tcpState: %s", err)
 	}
@@ -633,6 +708,59 @@ func (conn *TCPIPv4) SynAck() *TCP {
 	return conn.state().synAck
 }
 
+// IPv6Conn maintains the state for all the layers in a IPv6 connection.
+type IPv6Conn Connection
+
+// NewIPv6Conn creates a new IPv6Conn connection with reasonable defaults.
+func NewIPv6Conn(t *testing.T, outgoingIPv6, incomingIPv6 IPv6) IPv6Conn {
+	etherState, err := newEtherState(Ether{}, Ether{})
+	if err != nil {
+		t.Fatalf("can't make EtherState: %s", err)
+	}
+	ipv6State, err := newIPv6State(outgoingIPv6, incomingIPv6)
+	if err != nil {
+		t.Fatalf("can't make IPv6State: %s", err)
+	}
+
+	injector, err := NewInjector(t)
+	if err != nil {
+		t.Fatalf("can't make injector: %s", err)
+	}
+	sniffer, err := NewSniffer(t)
+	if err != nil {
+		t.Fatalf("can't make sniffer: %s", err)
+	}
+
+	return IPv6Conn{
+		layerStates: []layerState{etherState, ipv6State},
+		injector:    injector,
+		sniffer:     sniffer,
+		t:           t,
+	}
+}
+
+// SendFrame sends a frame on the wire and updates the state of all layers.
+func (conn *IPv6Conn) SendFrame(frame Layers) {
+	(*Connection)(conn).SendFrame(frame)
+}
+
+// CreateFrame builds a frame for the connection with ipv6 overriding the ipv6
+// layer defaults and additionalLayers added after it.
+func (conn *IPv6Conn) CreateFrame(ipv6 IPv6, additionalLayers ...Layer) Layers {
+	return (*Connection)(conn).CreateFrame(&ipv6, additionalLayers...)
+}
+
+// Close to clean up any resources held.
+func (conn *IPv6Conn) Close() {
+	(*Connection)(conn).Close()
+}
+
+// ExpectFrame expects a frame that matches the provided Layers within the
+// timeout specified. If it doesn't arrive in time, it returns nil.
+func (conn *IPv6Conn) ExpectFrame(frame Layers, timeout time.Duration) (Layers, error) {
+	return (*Connection)(conn).ExpectFrame(frame, timeout)
+}
+
 // Drain drains the sniffer's receive buffer by receiving packets until there's
 // nothing else to receive.
 func (conn *TCPIPv4) Drain() {
@@ -652,7 +780,7 @@ func NewUDPIPv4(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv4 {
 	if err != nil {
 		t.Fatalf("can't make ipv4State: %s", err)
 	}
-	tcpState, err := newUDPState(outgoingUDP, incomingUDP)
+	tcpState, err := newUDPState(unix.AF_INET, outgoingUDP, incomingUDP)
 	if err != nil {
 		t.Fatalf("can't make udpState: %s", err)
 	}
diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go
index 2cbbbb318..817f5c261 100644
--- a/test/packetimpact/testbench/layers.go
+++ b/test/packetimpact/testbench/layers.go
@@ -36,14 +36,14 @@ import (
 type Layer interface {
 	fmt.Stringer
 
-	// toBytes converts the Layer into bytes. In places where the Layer's field
+	// ToBytes converts the Layer into bytes. In places where the Layer's field
 	// isn't nil, the value that is pointed to is used. When the field is nil, a
 	// reasonable default for the Layer is used. For example, "64" for IPv4 TTL
 	// and a calculated checksum for TCP or IP. Some layers require information
 	// from the previous or next layers in order to compute a default, such as
 	// TCP's checksum or Ethernet's type, so each Layer has a doubly-linked list
 	// to the layer's neighbors.
-	toBytes() ([]byte, error)
+	ToBytes() ([]byte, error)
 
 	// match checks if the current Layer matches the provided Layer. If either
 	// Layer has a nil in a given field, that field is considered matching.
@@ -174,7 +174,8 @@ func (l *Ether) String() string {
 	return stringLayer(l)
 }
 
-func (l *Ether) toBytes() ([]byte, error) {
+// ToBytes implements Layer.ToBytes.
+func (l *Ether) ToBytes() ([]byte, error) {
 	b := make([]byte, header.EthernetMinimumSize)
 	h := header.Ethernet(b)
 	fields := &header.EthernetFields{}
@@ -190,8 +191,9 @@ func (l *Ether) toBytes() ([]byte, error) {
 		switch n := l.next().(type) {
 		case *IPv4:
 			fields.Type = header.IPv4ProtocolNumber
+		case *IPv6:
+			fields.Type = header.IPv6ProtocolNumber
 		default:
-			// TODO(b/150301488): Support more protocols, like IPv6.
 			return nil, fmt.Errorf("ethernet header's next layer is unrecognized: %#v", n)
 		}
 	}
@@ -246,6 +248,8 @@ func parseEther(b []byte) (Layer, layerParser) {
 	switch h.Type() {
 	case header.IPv4ProtocolNumber:
 		nextParser = parseIPv4
+	case header.IPv6ProtocolNumber:
+		nextParser = parseIPv6
 	default:
 		// Assume that the rest is a payload.
 		nextParser = parsePayload
@@ -286,7 +290,8 @@ func (l *IPv4) String() string {
 	return stringLayer(l)
 }
 
-func (l *IPv4) toBytes() ([]byte, error) {
+// ToBytes implements Layer.ToBytes.
+func (l *IPv4) ToBytes() ([]byte, error) {
 	b := make([]byte, header.IPv4MinimumSize)
 	h := header.IPv4(b)
 	fields := &header.IPv4Fields{
@@ -421,6 +426,186 @@ func (l *IPv4) merge(other Layer) error {
 	return mergeLayer(l, other)
 }
 
+// IPv6 can construct and match an IPv6 encapsulation.
+type IPv6 struct {
+	LayerBase
+	TrafficClass  *uint8
+	FlowLabel     *uint32
+	PayloadLength *uint16
+	NextHeader    *uint8
+	HopLimit      *uint8
+	SrcAddr       *tcpip.Address
+	DstAddr       *tcpip.Address
+}
+
+func (l *IPv6) String() string {
+	return stringLayer(l)
+}
+
+// ToBytes implements Layer.ToBytes.
+func (l *IPv6) ToBytes() ([]byte, error) {
+	b := make([]byte, header.IPv6MinimumSize)
+	h := header.IPv6(b)
+	fields := &header.IPv6Fields{
+		HopLimit: 64,
+	}
+	if l.TrafficClass != nil {
+		fields.TrafficClass = *l.TrafficClass
+	}
+	if l.FlowLabel != nil {
+		fields.FlowLabel = *l.FlowLabel
+	}
+	if l.PayloadLength != nil {
+		fields.PayloadLength = *l.PayloadLength
+	} else {
+		for current := l.next(); current != nil; current = current.next() {
+			fields.PayloadLength += uint16(current.length())
+		}
+	}
+	if l.NextHeader != nil {
+		fields.NextHeader = *l.NextHeader
+	} else {
+		switch n := l.next().(type) {
+		case *TCP:
+			fields.NextHeader = uint8(header.TCPProtocolNumber)
+		case *UDP:
+			fields.NextHeader = uint8(header.UDPProtocolNumber)
+		case *ICMPv6:
+			fields.NextHeader = uint8(header.ICMPv6ProtocolNumber)
+		default:
+			// TODO(b/150301488): Support more protocols as needed.
+			return nil, fmt.Errorf("ToBytes can't deduce the IPv6 header's next protocol: %#v", n)
+		}
+	}
+	if l.HopLimit != nil {
+		fields.HopLimit = *l.HopLimit
+	}
+	if l.SrcAddr != nil {
+		fields.SrcAddr = *l.SrcAddr
+	}
+	if l.DstAddr != nil {
+		fields.DstAddr = *l.DstAddr
+	}
+	h.Encode(fields)
+	return h, nil
+}
+
+// parseIPv6 parses the bytes assuming that they start with an ipv6 header and
+// continues parsing further encapsulations.
+func parseIPv6(b []byte) (Layer, layerParser) {
+	h := header.IPv6(b)
+	tos, flowLabel := h.TOS()
+	ipv6 := IPv6{
+		TrafficClass:  &tos,
+		FlowLabel:     &flowLabel,
+		PayloadLength: Uint16(h.PayloadLength()),
+		NextHeader:    Uint8(h.NextHeader()),
+		HopLimit:      Uint8(h.HopLimit()),
+		SrcAddr:       Address(h.SourceAddress()),
+		DstAddr:       Address(h.DestinationAddress()),
+	}
+	var nextParser layerParser
+	switch h.TransportProtocol() {
+	case header.TCPProtocolNumber:
+		nextParser = parseTCP
+	case header.UDPProtocolNumber:
+		nextParser = parseUDP
+	case header.ICMPv6ProtocolNumber:
+		nextParser = parseICMPv6
+	default:
+		// Assume that the rest is a payload.
+		nextParser = parsePayload
+	}
+	return &ipv6, nextParser
+}
+
+func (l *IPv6) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *IPv6) length() int {
+	return header.IPv6MinimumSize
+}
+
+// merge overrides the values in l with the values from other but only in fields
+// where the value is not nil.
+func (l *IPv6) merge(other Layer) error {
+	return mergeLayer(l, other)
+}
+
+// ICMPv6 can construct and match an ICMPv6 encapsulation.
+type ICMPv6 struct {
+	LayerBase
+	Type       *header.ICMPv6Type
+	Code       *byte
+	Checksum   *uint16
+	NDPPayload []byte
+}
+
+func (l *ICMPv6) String() string {
+	// TODO(eyalsoha): Do something smarter here when *l.Type is ParameterProblem?
+	// We could parse the contents of the Payload as if it were an IPv6 packet.
+	return stringLayer(l)
+}
+
+// ToBytes implements Layer.ToBytes.
+func (l *ICMPv6) ToBytes() ([]byte, error) {
+	b := make([]byte, header.ICMPv6HeaderSize+len(l.NDPPayload))
+	h := header.ICMPv6(b)
+	if l.Type != nil {
+		h.SetType(*l.Type)
+	}
+	if l.Code != nil {
+		h.SetCode(*l.Code)
+	}
+	copy(h.NDPPayload(), l.NDPPayload)
+	if l.Checksum != nil {
+		h.SetChecksum(*l.Checksum)
+	} else {
+		ipv6 := l.prev().(*IPv6)
+		h.SetChecksum(header.ICMPv6Checksum(h, *ipv6.SrcAddr, *ipv6.DstAddr, buffer.VectorisedView{}))
+	}
+	return h, nil
+}
+
+// ICMPv6Type is a helper routine that allocates a new ICMPv6Type value to store
+// v and returns a pointer to it.
+func ICMPv6Type(v header.ICMPv6Type) *header.ICMPv6Type {
+	return &v
+}
+
+// Byte is a helper routine that allocates a new byte value to store
+// v and returns a pointer to it.
+func Byte(v byte) *byte {
+	return &v
+}
+
+// parseICMPv6 parses the bytes assuming that they start with an ICMPv6 header.
+func parseICMPv6(b []byte) (Layer, layerParser) {
+	h := header.ICMPv6(b)
+	icmpv6 := ICMPv6{
+		Type:       ICMPv6Type(h.Type()),
+		Code:       Byte(h.Code()),
+		Checksum:   Uint16(h.Checksum()),
+		NDPPayload: h.NDPPayload(),
+	}
+	return &icmpv6, nil
+}
+
+func (l *ICMPv6) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *ICMPv6) length() int {
+	return header.ICMPv6HeaderSize + len(l.NDPPayload)
+}
+
+// merge overrides the values in l with the values from other but only in fields
+// where the value is not nil.
+func (l *ICMPv6) merge(other Layer) error {
+	return mergeLayer(l, other)
+}
+
 // TCP can construct and match a TCP encapsulation.
 type TCP struct {
 	LayerBase
@@ -439,7 +624,8 @@ func (l *TCP) String() string {
 	return stringLayer(l)
 }
 
-func (l *TCP) toBytes() ([]byte, error) {
+// ToBytes implements Layer.ToBytes.
+func (l *TCP) ToBytes() ([]byte, error) {
 	b := make([]byte, header.TCPMinimumSize)
 	h := header.TCP(b)
 	if l.SrcPort != nil {
@@ -504,7 +690,7 @@ func layerChecksum(l Layer, protoNumber tcpip.TransportProtocolNumber) (uint16,
 	}
 	var payloadBytes buffer.VectorisedView
 	for current := l.next(); current != nil; current = current.next() {
-		payload, err := current.toBytes()
+		payload, err := current.ToBytes()
 		if err != nil {
 			return 0, fmt.Errorf("can't get bytes for next header: %s", payload)
 		}
@@ -578,7 +764,8 @@ func (l *UDP) String() string {
 	return stringLayer(l)
 }
 
-func (l *UDP) toBytes() ([]byte, error) {
+// ToBytes implements Layer.ToBytes.
+func (l *UDP) ToBytes() ([]byte, error) {
 	b := make([]byte, header.UDPMinimumSize)
 	h := header.UDP(b)
 	if l.SrcPort != nil {
@@ -661,7 +848,8 @@ func parsePayload(b []byte) (Layer, layerParser) {
 	return &payload, nil
 }
 
-func (l *Payload) toBytes() ([]byte, error) {
+// ToBytes implements Layer.ToBytes.
+func (l *Payload) ToBytes() ([]byte, error) {
 	return l.Bytes, nil
 }
 
@@ -697,11 +885,13 @@ func (ls *Layers) linkLayers() {
 	}
 }
 
-func (ls *Layers) toBytes() ([]byte, error) {
+// ToBytes converts the Layers into bytes. It creates a linked list of the Layer
+// structs and then concatentates the output of ToBytes on each Layer.
+func (ls *Layers) ToBytes() ([]byte, error) {
 	ls.linkLayers()
 	outBytes := []byte{}
 	for _, l := range *ls {
-		layerBytes, err := l.toBytes()
+		layerBytes, err := l.ToBytes()
 		if err != nil {
 			return nil, err
 		}
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 47c722ccd..42f87e3f3 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -96,6 +96,19 @@ packetimpact_go_test(
     ],
 )
 
+packetimpact_go_test(
+    name = "icmpv6_param_problem",
+    srcs = ["icmpv6_param_problem_test.go"],
+    # TODO(b/153485026): Fix netstack then remove the line below.
+    netstack = False,
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
 sh_binary(
     name = "test_runner",
     srcs = ["test_runner.sh"],
diff --git a/test/packetimpact/tests/icmpv6_param_problem_test.go b/test/packetimpact/tests/icmpv6_param_problem_test.go
new file mode 100644
index 000000000..b48e55df4
--- /dev/null
+++ b/test/packetimpact/tests/icmpv6_param_problem_test.go
@@ -0,0 +1,73 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package icmpv6_param_problem_test
+
+import (
+	"encoding/binary"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+// TestICMPv6ParamProblemTest sends a packet with a bad next header. The DUT
+// should respond with an ICMPv6 Parameter Problem message.
+func TestICMPv6ParamProblemTest(t *testing.T) {
+	dut := tb.NewDUT(t)
+	defer dut.TearDown()
+	conn := tb.NewIPv6Conn(t, tb.IPv6{}, tb.IPv6{})
+	defer conn.Close()
+	ipv6 := tb.IPv6{
+		// 254 is reserved and used for experimentation and testing. This should
+		// cause an error.
+		NextHeader: tb.Uint8(254),
+	}
+	icmpv6 := tb.ICMPv6{
+		Type:       tb.ICMPv6Type(header.ICMPv6EchoRequest),
+		NDPPayload: []byte("hello world"),
+	}
+
+	toSend := conn.CreateFrame(ipv6, &icmpv6)
+	conn.SendFrame(toSend)
+
+	// Build the expected ICMPv6 payload, which includes an index to the
+	// problematic byte and also the problematic packet as described in
+	// https://tools.ietf.org/html/rfc4443#page-12 .
+	ipv6Sent := toSend[1:]
+	expectedPayload, err := ipv6Sent.ToBytes()
+	if err != nil {
+		t.Fatalf("can't convert %s to bytes: %s", ipv6Sent, err)
+	}
+
+	// The problematic field is the NextHeader.
+	b := make([]byte, 4)
+	binary.BigEndian.PutUint32(b, header.IPv6NextHeaderOffset)
+	expectedPayload = append(b, expectedPayload...)
+	expectedICMPv6 := tb.ICMPv6{
+		Type:       tb.ICMPv6Type(header.ICMPv6ParamProblem),
+		NDPPayload: expectedPayload,
+	}
+
+	paramProblem := tb.Layers{
+		&tb.Ether{},
+		&tb.IPv6{},
+		&expectedICMPv6,
+	}
+	timeout := time.Second
+	if _, err := conn.ExpectFrame(paramProblem, timeout); err != nil {
+		t.Errorf("expected %s within %s but got none: %s", paramProblem, timeout, err)
+	}
+}
diff --git a/test/packetimpact/tests/test_runner.sh b/test/packetimpact/tests/test_runner.sh
index e938de782..706441cce 100755
--- a/test/packetimpact/tests/test_runner.sh
+++ b/test/packetimpact/tests/test_runner.sh
@@ -192,6 +192,8 @@ docker pull "${IMAGE_TAG}"
 
 # Create the DUT container and connect to network.
 DUT=$(docker create ${RUNTIME_ARG} --privileged --rm \
+  --cap-add NET_ADMIN \
+  --sysctl net.ipv6.conf.all.disable_ipv6=0 \
   --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG})
 docker network connect "${CTRL_NET}" \
   --ip "${CTRL_NET_PREFIX}${DUT_NET_SUFFIX}" "${DUT}" \
@@ -203,6 +205,8 @@ docker start "${DUT}"
 
 # Create the test bench container and connect to network.
 TESTBENCH=$(docker create --privileged --rm \
+  --cap-add NET_ADMIN \
+  --sysctl net.ipv6.conf.all.disable_ipv6=0 \
   --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG})
 docker network connect "${CTRL_NET}" \
   --ip "${CTRL_NET_PREFIX}${TESTBENCH_NET_SUFFIX}" "${TESTBENCH}" \
@@ -237,6 +241,32 @@ declare -r REMOTE_MAC=$(docker exec -t "${DUT}" ip link show \
   "${TEST_DEVICE}" | tail -1 | cut -d' ' -f6)
 declare -r LOCAL_MAC=$(docker exec -t "${TESTBENCH}" ip link show \
   "${TEST_DEVICE}" | tail -1 | cut -d' ' -f6)
+declare REMOTE_IPV6=$(docker exec -t "${DUT}" ip addr show scope link \
+  "${TEST_DEVICE}" | grep inet6 | cut -d' ' -f6 | cut -d'/' -f1)
+declare -r LOCAL_IPV6=$(docker exec -t "${TESTBENCH}" ip addr show scope link \
+  "${TEST_DEVICE}" | grep inet6 | cut -d' ' -f6 | cut -d'/' -f1)
+
+# Netstack as DUT doesn't assign IPv6 addresses automatically so do it if
+# needed.  Convert the MAC address to an IPv6 link local address as described in
+# RFC 4291 page 20: https://tools.ietf.org/html/rfc4291#page-20
+if [[ -z "${REMOTE_IPV6}" ]]; then
+  # Split the octets of the MAC into an array of strings.
+  IFS=":" read -a REMOTE_OCTETS <<< "${REMOTE_MAC}"
+  # Flip the global bit.
+  REMOTE_OCTETS[0]=$(printf '%x' "$((0x${REMOTE_OCTETS[0]} ^ 2))")
+  # Add the IPv6 address.
+  docker exec "${DUT}" \
+    ip addr add $(printf 'fe80::%02x%02x:%02xff:fe%02x:%02x%02x/64' \
+    "0x${REMOTE_OCTETS[0]}" "0x${REMOTE_OCTETS[1]}" "0x${REMOTE_OCTETS[2]}" \
+    "0x${REMOTE_OCTETS[3]}" "0x${REMOTE_OCTETS[4]}" "0x${REMOTE_OCTETS[5]}") \
+    scope link \
+    dev "${TEST_DEVICE}"
+  # Re-extract the IPv6 address.
+  # TODO(eyalsoha): Add "scope link" below when netstack supports correctly
+  # creating link-local IPv6 addresses.
+  REMOTE_IPV6=$(docker exec -t "${DUT}" ip addr show \
+    "${TEST_DEVICE}" | grep inet6 | cut -d' ' -f6 | cut -d'/' -f1)
+fi
 
 declare -r DOCKER_TESTBENCH_BINARY="/$(basename ${TESTBENCH_BINARY})"
 docker cp -L "${TESTBENCH_BINARY}" "${TESTBENCH}:${DOCKER_TESTBENCH_BINARY}"
@@ -245,7 +275,10 @@ if [[ -z "${TSHARK-}" ]]; then
   # Run tcpdump in the test bench unbuffered, without dns resolution, just on
   # the interface with the test packets.
   docker exec -t "${TESTBENCH}" \
-    tcpdump -S -vvv -U -n -i "${TEST_DEVICE}" net "${TEST_NET_PREFIX}/24" &
+    tcpdump -S -vvv -U -n -i "${TEST_DEVICE}" \
+    net "${TEST_NET_PREFIX}/24" or \
+    host "${REMOTE_IPV6}" or \
+    host "${LOCAL_IPV6}" &
 else
   # Run tshark in the test bench unbuffered, without dns resolution, just on the
   # interface with the test packets.
@@ -253,7 +286,9 @@ else
     tshark -V -l -n -i "${TEST_DEVICE}" \
     -o tcp.check_checksum:TRUE \
     -o udp.check_checksum:TRUE \
-    host "${TEST_NET_PREFIX}${TESTBENCH_NET_SUFFIX}" &
+    net "${TEST_NET_PREFIX}/24" or \
+    host "${REMOTE_IPV6}" or \
+    host "${LOCAL_IPV6}" &
 fi
 
 # tcpdump and tshark take time to startup
@@ -272,6 +307,8 @@ docker exec \
   --posix_server_port=${CTRL_PORT} \
   --remote_ipv4=${TEST_NET_PREFIX}${DUT_NET_SUFFIX} \
   --local_ipv4=${TEST_NET_PREFIX}${TESTBENCH_NET_SUFFIX} \
+  --remote_ipv6=${REMOTE_IPV6} \
+  --local_ipv6=${LOCAL_IPV6} \
   --remote_mac=${REMOTE_MAC} \
   --local_mac=${LOCAL_MAC} \
   --device=${TEST_DEVICE}" && true
-- 
cgit v1.2.3


From 4af39dd1c522f7852312ecbfd3678892fc656322 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 24 Apr 2020 18:15:26 -0700
Subject: Propagate PID limit from OCI to sandbox cgroup

Closes #2489

PiperOrigin-RevId: 308362434
---
 runsc/cgroup/cgroup.go   | 14 ++++++++++++--
 test/root/cgroup_test.go |  6 ++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index 653ca5f52..fa40ee509 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -45,13 +45,13 @@ var controllers = map[string]controller{
 	"memory":   &memory{},
 	"net_cls":  &networkClass{},
 	"net_prio": &networkPrio{},
+	"pids":     &pids{},
 
 	// These controllers either don't have anything in the OCI spec or is
-	// irrevalant for a sandbox, e.g. pids.
+	// irrelevant for a sandbox.
 	"devices":    &noop{},
 	"freezer":    &noop{},
 	"perf_event": &noop{},
-	"pids":       &noop{},
 	"systemd":    &noop{},
 }
 
@@ -525,3 +525,13 @@ func (*networkPrio) set(spec *specs.LinuxResources, path string) error {
 	}
 	return nil
 }
+
+type pids struct{}
+
+func (*pids) set(spec *specs.LinuxResources, path string) error {
+	if spec.Pids == nil {
+		return nil
+	}
+	val := strconv.FormatInt(spec.Pids.Limit, 10)
+	return setValue(path, "pids.max", val)
+}
diff --git a/test/root/cgroup_test.go b/test/root/cgroup_test.go
index 8876d0d61..d0634b5c3 100644
--- a/test/root/cgroup_test.go
+++ b/test/root/cgroup_test.go
@@ -199,6 +199,12 @@ func TestCgroup(t *testing.T) {
 			want:           "750",
 			skipIfNotFound: true, // blkio groups may not be available.
 		},
+		{
+			arg:  "--pids-limit=1000",
+			ctrl: "pids",
+			file: "pids.max",
+			want: "1000",
+		},
 	}
 
 	args := make([]string, 0, len(attrs))
-- 
cgit v1.2.3


From 15a822a1936e295cb6418df7ddf445d8500dfb2e Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Fri, 24 Apr 2020 18:22:21 -0700
Subject: VFS2: Get HelloWorld image tests to pass with VFS2

This change includes:
- Modifications to loader_test.go to get TestCreateMountNamespace to
pass with VFS2.
- Changes necessary to get TestHelloWorld in image tests to pass with
VFS2. This means runsc can run the hello-world container with docker
on VSF2.

Note: Containers that use sockets will not run with these changes.
See "//test/image/...". Any tests here with sockets currently fail
(which is all of them but HelloWorld).
PiperOrigin-RevId: 308363072
---
 pkg/sentry/fsimpl/gofer/directory.go |   1 +
 runsc/boot/BUILD                     |   2 +
 runsc/boot/loader.go                 |  13 +--
 runsc/boot/loader_test.go            | 152 ++++++++++++++++++++++++-----------
 runsc/boot/vfs.go                    |  78 ++++++++++++++----
 scripts/docker_tests.sh              |   3 +
 6 files changed, 183 insertions(+), 66 deletions(-)

diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index c67766ab2..55f9ed911 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -75,6 +75,7 @@ func (d *dentry) createSyntheticDirectoryLocked(name string, mode linux.FileMode
 		handle: handle{
 			fd: -1,
 		},
+		nlink: uint32(2),
 	}
 	d2.pf.dentry = d2
 	d2.vfsd.Init(d2)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 69dcc74f2..ed3c8f546 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -119,11 +119,13 @@ go_test(
     library = ":boot",
     deps = [
         "//pkg/control/server",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel",
+        "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/unet",
         "//runsc/fsgofer",
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 3f41d8357..f6ea4c102 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -625,11 +625,14 @@ func (l *Loader) run() error {
 
 	// l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again
 	// either in createFDTable() during initial start or in descriptor.initAfterLoad()
-	// during restore, we can release l.stdioFDs now.
-	for _, fd := range l.stdioFDs {
-		err := syscall.Close(fd)
-		if err != nil {
-			return fmt.Errorf("close dup()ed stdioFDs: %v", err)
+	// during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
+	// passed FDs, so only close for VFS1.
+	if !kernel.VFS2Enabled {
+		for _, fd := range l.stdioFDs {
+			err := syscall.Close(fd)
+			if err != nil {
+				return fmt.Errorf("close dup()ed stdioFDs: %v", err)
+			}
 		}
 	}
 
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index e7c71734f..55d27a632 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -26,11 +26,13 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/runsc/fsgofer"
@@ -107,14 +109,12 @@ func startGofer(root string) (int, func(), error) {
 	return sandboxEnd, cleanup, nil
 }
 
-func createLoader(vfsEnabled bool) (*Loader, func(), error) {
+func createLoader(vfsEnabled bool, spec *specs.Spec) (*Loader, func(), error) {
 	fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10]))
 	if err != nil {
 		return nil, nil, err
 	}
 	conf := testConfig()
-	spec := testSpec()
-
 	conf.VFS2 = vfsEnabled
 
 	sandEnd, cleanup, err := startGofer(spec.Root.Path)
@@ -161,7 +161,7 @@ func TestRunVFS2(t *testing.T) {
 }
 
 func doRun(t *testing.T, vfsEnabled bool) {
-	l, cleanup, err := createLoader(vfsEnabled)
+	l, cleanup, err := createLoader(vfsEnabled, testSpec())
 	if err != nil {
 		t.Fatalf("error creating loader: %v", err)
 	}
@@ -210,7 +210,7 @@ func TestStartSignalVFS2(t *testing.T) {
 }
 
 func doStartSignal(t *testing.T, vfsEnabled bool) {
-	l, cleanup, err := createLoader(vfsEnabled)
+	l, cleanup, err := createLoader(vfsEnabled, testSpec())
 	if err != nil {
 		t.Fatalf("error creating loader: %v", err)
 	}
@@ -258,18 +258,19 @@ func doStartSignal(t *testing.T, vfsEnabled bool) {
 
 }
 
-// Test that MountNamespace can be created with various specs.
-func TestCreateMountNamespace(t *testing.T) {
-	testCases := []struct {
-		name string
-		// Spec that will be used to create the mount manager.  Note
-		// that we can't mount procfs without a kernel, so each spec
-		// MUST contain something other than procfs mounted at /proc.
-		spec specs.Spec
-		// Paths that are expected to exist in the resulting fs.
-		expectedPaths []string
-	}{
-		{
+type CreateMountTestcase struct {
+	name string
+	// Spec that will be used to create the mount manager.  Note
+	// that we can't mount procfs without a kernel, so each spec
+	// MUST contain something other than procfs mounted at /proc.
+	spec specs.Spec
+	// Paths that are expected to exist in the resulting fs.
+	expectedPaths []string
+}
+
+func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+	testCases := []*CreateMountTestcase{
+		&CreateMountTestcase{
 			// Only proc.
 			name: "only proc mount",
 			spec: specs.Spec{
@@ -311,7 +312,7 @@ func TestCreateMountNamespace(t *testing.T) {
 			// /dev, and /sys.
 			expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"},
 		},
-		{
+		&CreateMountTestcase{
 			// Mounts are nested inside each other.
 			name: "nested mounts",
 			spec: specs.Spec{
@@ -355,7 +356,7 @@ func TestCreateMountNamespace(t *testing.T) {
 			expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux",
 				"/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"},
 		},
-		{
+		&CreateMountTestcase{
 			name: "mount inside /dev",
 			spec: specs.Spec{
 				Root: &specs.Root{
@@ -398,40 +399,47 @@ func TestCreateMountNamespace(t *testing.T) {
 			},
 			expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"},
 		},
-		{
-			name: "mounts inside mandatory mounts",
-			spec: specs.Spec{
-				Root: &specs.Root{
-					Path:     os.TempDir(),
-					Readonly: true,
+	}
+
+	vfsCase := &CreateMountTestcase{
+		name: "mounts inside mandatory mounts",
+		spec: specs.Spec{
+			Root: &specs.Root{
+				Path:     os.TempDir(),
+				Readonly: true,
+			},
+			Mounts: []specs.Mount{
+				{
+					Destination: "/proc",
+					Type:        "tmpfs",
 				},
-				Mounts: []specs.Mount{
-					{
-						Destination: "/proc",
-						Type:        "tmpfs",
-					},
-					// We don't include /sys, and /tmp in
-					// the spec, since they will be added
-					// automatically.
-					//
-					// Instead, add submounts inside these
-					// directories and make sure they are
-					// visible under the mandatory mounts.
-					{
-						Destination: "/sys/bar",
-						Type:        "tmpfs",
-					},
-					{
-						Destination: "/tmp/baz",
-						Type:        "tmpfs",
-					},
+				// TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
+				//  MkDirAt in VFS2 (and remove the reduntant append).
+				// {
+				//		Destination: "/sys/bar",
+				//		Type:        "tmpfs",
+				//	},
+				//
+				{
+					Destination: "/tmp/baz",
+					Type:        "tmpfs",
 				},
 			},
-			expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"},
 		},
+		expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
 	}
 
-	for _, tc := range testCases {
+	if !vfs2 {
+		vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
+		vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
+	}
+	return append(testCases, vfsCase)
+}
+
+// Test that MountNamespace can be created with various specs.
+func TestCreateMountNamespace(t *testing.T) {
+
+	for _, tc := range createMountTestcases(false /* vfs2 */) {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
 			ctx := contexttest.Context(t)
@@ -466,6 +474,56 @@ func TestCreateMountNamespace(t *testing.T) {
 	}
 }
 
+// Test that MountNamespace can be created with various specs.
+func TestCreateMountNamespaceVFS2(t *testing.T) {
+
+	for _, tc := range createMountTestcases(true /* vfs2 */) {
+		t.Run(tc.name, func(t *testing.T) {
+			defer resetSyscallTable()
+
+			spec := testSpec()
+			spec.Mounts = tc.spec.Mounts
+			spec.Root = tc.spec.Root
+
+			l, loaderCleanup, err := createLoader(true /* VFS2 Enabled */, spec)
+			if err != nil {
+				t.Fatalf("failed to create loader: %v", err)
+			}
+			defer l.Destroy()
+			defer loaderCleanup()
+
+			mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
+			if err := mntr.processHints(l.conf); err != nil {
+				t.Fatalf("failed process hints: %v", err)
+			}
+
+			ctx := l.rootProcArgs.NewContext(l.k)
+			mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs)
+			if err != nil {
+				t.Fatalf("failed to setupVFS2: %v", err)
+			}
+
+			root := mns.Root()
+			defer root.DecRef()
+			for _, p := range tc.expectedPaths {
+
+				target := &vfs.PathOperation{
+					Root:  root,
+					Start: root,
+					Path:  fspath.Parse(p),
+				}
+
+				if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
+					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
+				} else {
+					d.DecRef()
+				}
+
+			}
+		})
+	}
+}
+
 // TestRestoreEnvironment tests that the correct mounts are collected from the spec and config
 // in order to build the environment for restoring.
 func TestRestoreEnvironment(t *testing.T) {
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index bce3a3593..0b9b0b436 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -17,6 +17,7 @@ package boot
 import (
 	"fmt"
 	"path"
+	"sort"
 	"strconv"
 	"strings"
 
@@ -192,14 +193,9 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
 		return nil, fmt.Errorf("register filesystems: %w", err)
 	}
 
-	fd := c.fds.remove()
-
-	opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",")
-
-	log.Infof("Mounting root over 9P, ioFD: %d", fd)
-	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts})
+	mns, err := c.createMountNamespaceVFS2(ctx, conf, creds)
 	if err != nil {
-		return nil, fmt.Errorf("setting up mountnamespace: %w", err)
+		return nil, fmt.Errorf("creating mount namespace: %w", err)
 	}
 
 	rootProcArgs.MountNamespaceVFS2 = mns
@@ -212,8 +208,23 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
 	return mns, nil
 }
 
+func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
+
+	fd := c.fds.remove()
+	opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",")
+
+	log.Infof("Mounting root over 9P, ioFD: %d", fd)
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts})
+	if err != nil {
+		return nil, fmt.Errorf("setting up mount namespace: %w", err)
+	}
+	return mns, nil
+}
+
 func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
 
+	c.prepareMountsVFS2()
+
 	for _, submount := range c.mounts {
 		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
 		if err := c.mountSubmountVFS2(ctx, conf, mns, creds, &submount); err != nil {
@@ -226,6 +237,11 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config,
 	return c.checkDispenser()
 }
 
+func (c *containerMounter) prepareMountsVFS2() {
+	// Sort the mounts so that we don't place children before parents.
+	sort.Slice(c.mounts, func(i, j int) bool { return len(c.mounts[i].Destination) < len(c.mounts[j].Destination) })
+}
+
 // TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1 version.
 func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *specs.Mount) error {
 	root := mns.Root()
@@ -236,11 +252,21 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
 		Path:  fspath.Parse(submount.Destination),
 	}
 
-	_, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount)
+	fsName, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount)
 	if err != nil {
 		return fmt.Errorf("mountOptions failed: %w", err)
 	}
 
+	if fsName == "" {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil
+	}
+
+	if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
+		return err
+	}
+	log.Debugf("directory exists or made directory for submount: %s", submount.Destination)
+
 	opts := &vfs.MountOptions{
 		GetFilesystemOptions: vfs.GetFilesystemOptions{
 			Data: strings.Join(options, ","),
@@ -251,12 +277,6 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
 	// All writes go to upper, be paranoid and make lower readonly.
 	opts.ReadOnly = useOverlay
 
-	if err := c.k.VFS().MkdirAt(ctx, creds, target, &vfs.MkdirOptions{
-		ForSyntheticMountpoint: true,
-	}); err != nil && err != syserror.EEXIST {
-		// Log a warning, but attempt the mount anyway.
-		log.Warningf("Failed to create mount point at %q: %v", submount.Destination, err)
-	}
 	if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil {
 		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
 	}
@@ -314,3 +334,33 @@ func p9MountOptionsVFS2(fd int, fa FileAccessType) []string {
 	}
 	return opts
 }
+
+func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
+
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(currentPath),
+	}
+
+	_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
+	switch {
+
+	case err == syserror.ENOENT:
+		if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil {
+			return err
+		}
+
+		mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
+		if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil {
+			return fmt.Errorf("failed to makedir for mount %+v: %w", target, err)
+		}
+		return nil
+
+	case err != nil:
+		return fmt.Errorf("stat failed for mount %+v: %w", target, err)
+
+	default:
+		return nil
+	}
+}
diff --git a/scripts/docker_tests.sh b/scripts/docker_tests.sh
index 931ce1aa4..dce0a4085 100755
--- a/scripts/docker_tests.sh
+++ b/scripts/docker_tests.sh
@@ -20,3 +20,6 @@ make load-all-images
 
 install_runsc_for_test docker
 test_runsc //test/image:image_test //test/e2e:integration_test
+
+install_runsc_for_test docker --vfs2
+test_runsc //test/image:image_test --test_filter=.*TestHelloWorld
-- 
cgit v1.2.3


From c9199bab927e901947c1647de248433aa3d439fb Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Fri, 24 Apr 2020 19:59:05 -0700
Subject: More descriptive error message for missing docker image.

Tested:
  Ran a packetimpact test after `docker image rm` and examined the message.
PiperOrigin-RevId: 308370603
---
 pkg/test/dockerutil/dockerutil.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pkg/test/dockerutil/dockerutil.go b/pkg/test/dockerutil/dockerutil.go
index baa8fc2f2..5f2af9f3b 100644
--- a/pkg/test/dockerutil/dockerutil.go
+++ b/pkg/test/dockerutil/dockerutil.go
@@ -353,7 +353,10 @@ func (d *Docker) run(r RunOpts, command string, p ...string) (string, error) {
 
 // Create calls 'docker create' with the arguments provided.
 func (d *Docker) Create(r RunOpts, args ...string) error {
-	_, err := d.run(r, "create", args...)
+	out, err := d.run(r, "create", args...)
+	if strings.Contains(out, "Unable to find image") {
+		return fmt.Errorf("unable to find image, did you remember to `make load-%s`: %w", r.Image, err)
+	}
 	return err
 }
 
-- 
cgit v1.2.3


From 17ac90a2033a7646dca3dac405b4b0f589e95478 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Sat, 25 Apr 2020 11:26:18 -0700
Subject: Add container tests passing with VFS2

Several tests are passing after getting TestAppExitStatus (run /bin/true)
changes. Make versions that run via VFS2 so that we know what is and isn't
working.

In addition, fix bug in VFSFile ReadFull. For the TestExePath test in
container_test.go, the case "unmasked" will return 0 bytes read with no
EOF err, causing the ReadFull call to spin.

PiperOrigin-RevId: 308428126
---
 runsc/container/container_test.go | 61 +++++++++++++++++++++++++++++++++++----
 1 file changed, 55 insertions(+), 6 deletions(-)

diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 3ff89f38c..a1d4d3b7e 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -281,6 +281,18 @@ func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
 	return cs
 }
 
+func configsWithVFS2(t *testing.T, opts []configOption) map[string]*boot.Config {
+	vfs1 := configs(t, opts...)
+	vfs2 := configs(t, opts...)
+
+	for key, value := range vfs2 {
+		value.VFS2 = true
+		vfs1[key+"VFS2"] = value
+	}
+
+	return vfs1
+}
+
 // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle.
 // It verifies after each step that the container can be loaded from disk, and
 // has the correct status.
@@ -290,7 +302,7 @@ func TestLifecycle(t *testing.T) {
 	childReaper.Start()
 	defer childReaper.Stop()
 
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all) {
 		t.Run(name, func(t *testing.T) {
 			// The container will just sleep for a long time.  We will kill it before
 			// it finishes sleeping.
@@ -464,7 +476,7 @@ func TestExePath(t *testing.T) {
 		t.Fatalf("error making directory: %v", err)
 	}
 
-	for name, conf := range configs(t, overlay) {
+	for name, conf := range configsWithVFS2(t, []configOption{overlay}) {
 		t.Run(name, func(t *testing.T) {
 			for _, test := range []struct {
 				path    string
@@ -1329,7 +1341,7 @@ func TestRunNonRoot(t *testing.T) {
 // TestMountNewDir checks that runsc will create destination directory if it
 // doesn't exit.
 func TestMountNewDir(t *testing.T) {
-	for name, conf := range configs(t, overlay) {
+	for name, conf := range configsWithVFS2(t, []configOption{overlay}) {
 		t.Run(name, func(t *testing.T) {
 			root, err := ioutil.TempDir(testutil.TmpDir(), "root")
 			if err != nil {
@@ -1358,7 +1370,7 @@ func TestMountNewDir(t *testing.T) {
 }
 
 func TestReadonlyRoot(t *testing.T) {
-	for name, conf := range configs(t, overlay) {
+	for name, conf := range configsWithVFS2(t, []configOption{overlay}) {
 		t.Run(name, func(t *testing.T) {
 			spec := testutil.NewSpecWithArgs("/bin/touch", "/foo")
 			spec.Root.Readonly = true
@@ -1476,7 +1488,7 @@ func TestUIDMap(t *testing.T) {
 }
 
 func TestReadonlyMount(t *testing.T) {
-	for name, conf := range configs(t, overlay) {
+	for name, conf := range configsWithVFS2(t, []configOption{overlay}) {
 		t.Run(name, func(t *testing.T) {
 			dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
 			spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
@@ -1526,6 +1538,14 @@ func TestReadonlyMount(t *testing.T) {
 // TestAbbreviatedIDs checks that runsc supports using abbreviated container
 // IDs in place of full IDs.
 func TestAbbreviatedIDs(t *testing.T) {
+	doAbbreviatedIDsTest(t, false)
+}
+
+func TestAbbreviatedIDsVFS2(t *testing.T) {
+	doAbbreviatedIDsTest(t, true)
+}
+
+func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) {
 	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
@@ -1534,6 +1554,7 @@ func TestAbbreviatedIDs(t *testing.T) {
 
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
+	conf.VFS2 = vfs2
 
 	cids := []string{
 		"foo-" + testutil.RandomContainerID(),
@@ -1589,9 +1610,19 @@ func TestAbbreviatedIDs(t *testing.T) {
 }
 
 func TestGoferExits(t *testing.T) {
+	doGoferExitTest(t, false)
+}
+
+func TestGoferExitsVFS2(t *testing.T) {
+	doGoferExitTest(t, true)
+}
+
+func doGoferExitTest(t *testing.T, vfs2 bool) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
 	conf := testutil.TestConfig(t)
+	conf.VFS2 = vfs2
 	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
@@ -1711,7 +1742,7 @@ func TestUserLog(t *testing.T) {
 }
 
 func TestWaitOnExitedSandbox(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all) {
 		t.Run(name, func(t *testing.T) {
 			// Run a shell that sleeps for 1 second and then exits with a
 			// non-zero code.
@@ -1764,8 +1795,17 @@ func TestWaitOnExitedSandbox(t *testing.T) {
 }
 
 func TestDestroyNotStarted(t *testing.T) {
+	doDestroyNotStartedTest(t, false)
+}
+
+func TestDestroyNotStartedVFS2(t *testing.T) {
+	doDestroyNotStartedTest(t, true)
+}
+
+func doDestroyNotStartedTest(t *testing.T, vfs2 bool) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "100")
 	conf := testutil.TestConfig(t)
+	conf.VFS2 = vfs2
 	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
@@ -1789,9 +1829,18 @@ func TestDestroyNotStarted(t *testing.T) {
 
 // TestDestroyStarting attempts to force a race between start and destroy.
 func TestDestroyStarting(t *testing.T) {
+	doDestroyNotStartedTest(t, false)
+}
+
+func TestDestroyStartedVFS2(t *testing.T) {
+	doDestroyNotStartedTest(t, true)
+}
+
+func doDestroyStartingTest(t *testing.T, vfs2 bool) {
 	for i := 0; i < 10; i++ {
 		spec := testutil.NewSpecWithArgs("/bin/sleep", "100")
 		conf := testutil.TestConfig(t)
+		conf.VFS2 = vfs2
 		rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 		if err != nil {
 			t.Fatalf("error setting up container: %v", err)
-- 
cgit v1.2.3


From 3c67754663f424f2ebbc0ff2a4c80e30618d5355 Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Sat, 25 Apr 2020 23:54:56 -0700
Subject: Enable automated marshalling for signals and the arch package.

PiperOrigin-RevId: 308472331
---
 pkg/abi/linux/BUILD                                |  2 +
 pkg/abi/linux/ptrace_amd64.go                      | 52 ++++++++++++++++++++++
 pkg/abi/linux/ptrace_arm64.go                      | 29 ++++++++++++
 pkg/sentry/arch/BUILD                              |  4 +-
 pkg/sentry/arch/arch_aarch64.go                    | 26 ++++++-----
 pkg/sentry/arch/arch_amd64.go                      | 13 +++---
 pkg/sentry/arch/arch_arm64.go                      |  2 +
 pkg/sentry/arch/arch_state_aarch64.go              | 38 ----------------
 pkg/sentry/arch/arch_state_x86.go                  | 42 -----------------
 pkg/sentry/arch/arch_x86.go                        | 25 ++++++-----
 pkg/sentry/arch/arch_x86_impl.go                   |  4 +-
 pkg/sentry/arch/signal.go                          |  3 ++
 pkg/sentry/arch/signal_act.go                      |  4 ++
 pkg/sentry/arch/signal_stack.go                    |  3 ++
 pkg/sentry/kernel/task_signals.go                  |  8 ++--
 pkg/sentry/platform/kvm/kvm_arm64.go               |  5 +--
 pkg/sentry/platform/kvm/kvm_test.go                | 36 +++++++--------
 pkg/sentry/platform/kvm/testutil/BUILD             |  1 +
 pkg/sentry/platform/kvm/testutil/testutil_amd64.go | 17 +++----
 pkg/sentry/platform/kvm/testutil/testutil_arm64.go | 13 +++---
 pkg/sentry/platform/ptrace/ptrace_amd64.go         |  7 ++-
 pkg/sentry/platform/ptrace/ptrace_arm64.go         |  5 +--
 pkg/sentry/platform/ptrace/ptrace_unsafe.go        |  4 +-
 pkg/sentry/platform/ptrace/subprocess.go           |  8 ++--
 pkg/sentry/platform/ptrace/subprocess_amd64.go     | 16 +++----
 pkg/sentry/platform/ptrace/subprocess_arm64.go     | 16 +++----
 pkg/sentry/platform/ring0/BUILD                    |  1 +
 pkg/sentry/platform/ring0/defs.go                  |  9 ++--
 pkg/sentry/platform/ring0/entry_amd64.go           |  6 +--
 pkg/sentry/platform/ring0/gen_offsets/BUILD        |  1 +
 pkg/sentry/platform/ring0/offsets_amd64.go         |  5 ++-
 pkg/sentry/platform/ring0/offsets_arm64.go         |  5 ++-
 pkg/sentry/syscalls/linux/sys_signal.go            | 10 ++---
 33 files changed, 224 insertions(+), 196 deletions(-)
 create mode 100644 pkg/abi/linux/ptrace_amd64.go
 create mode 100644 pkg/abi/linux/ptrace_arm64.go
 delete mode 100644 pkg/sentry/arch/arch_state_aarch64.go

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 59b0e138a..114b516e2 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -44,6 +44,8 @@ go_library(
         "poll.go",
         "prctl.go",
         "ptrace.go",
+        "ptrace_amd64.go",
+        "ptrace_arm64.go",
         "rseq.go",
         "rusage.go",
         "sched.go",
diff --git a/pkg/abi/linux/ptrace_amd64.go b/pkg/abi/linux/ptrace_amd64.go
new file mode 100644
index 000000000..ed3881e27
--- /dev/null
+++ b/pkg/abi/linux/ptrace_amd64.go
@@ -0,0 +1,52 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+// PtraceRegs is the set of CPU registers exposed by ptrace. Source:
+// syscall.PtraceRegs.
+//
+// +marshal
+// +stateify savable
+type PtraceRegs struct {
+	R15      uint64
+	R14      uint64
+	R13      uint64
+	R12      uint64
+	Rbp      uint64
+	Rbx      uint64
+	R11      uint64
+	R10      uint64
+	R9       uint64
+	R8       uint64
+	Rax      uint64
+	Rcx      uint64
+	Rdx      uint64
+	Rsi      uint64
+	Rdi      uint64
+	Orig_rax uint64
+	Rip      uint64
+	Cs       uint64
+	Eflags   uint64
+	Rsp      uint64
+	Ss       uint64
+	Fs_base  uint64
+	Gs_base  uint64
+	Ds       uint64
+	Es       uint64
+	Fs       uint64
+	Gs       uint64
+}
diff --git a/pkg/abi/linux/ptrace_arm64.go b/pkg/abi/linux/ptrace_arm64.go
new file mode 100644
index 000000000..6147738b3
--- /dev/null
+++ b/pkg/abi/linux/ptrace_arm64.go
@@ -0,0 +1,29 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package linux
+
+// PtraceRegs is the set of CPU registers exposed by ptrace. Source:
+// syscall.PtraceRegs.
+//
+// +marshal
+// +stateify savable
+type PtraceRegs struct {
+	Regs   [31]uint64
+	Sp     uint64
+	Pc     uint64
+	Pstate uint64
+}
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index e27f21e5e..901e0f320 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -11,7 +11,6 @@ go_library(
         "arch_amd64.go",
         "arch_amd64.s",
         "arch_arm64.go",
-        "arch_state_aarch64.go",
         "arch_state_x86.go",
         "arch_x86.go",
         "arch_x86_impl.go",
@@ -26,11 +25,11 @@ go_library(
         "syscalls_amd64.go",
         "syscalls_arm64.go",
     ],
+    marshal = True,
     visibility = ["//:sandbox"],
     deps = [
         ":registers_go_proto",
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/context",
         "//pkg/cpuid",
         "//pkg/log",
@@ -38,6 +37,7 @@ go_library(
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
+        "//tools/go_marshal/marshal",
     ],
 )
 
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index c29e1b841..529980267 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -17,18 +17,20 @@
 package arch
 
 import (
+	"encoding/binary"
 	"fmt"
 	"io"
-	"syscall"
 
-	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
 	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// Registers represents the CPU registers for this architecture.
+type Registers = linux.PtraceRegs
+
 const (
 	// SyscallWidth is the width of insturctions.
 	SyscallWidth = 4
@@ -90,7 +92,7 @@ func NewFloatingPointData() *FloatingPointData {
 // file ensures it's only built on aarch64).
 type State struct {
 	// The system registers.
-	Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"`
+	Regs Registers
 
 	// Our floating point state.
 	aarch64FPState `state:"wait"`
@@ -226,25 +228,27 @@ func (s *State) RegisterMap() (map[string]uintptr, error) {
 
 // PtraceGetRegs implements Context.PtraceGetRegs.
 func (s *State) PtraceGetRegs(dst io.Writer) (int, error) {
-	return dst.Write(binary.Marshal(nil, usermem.ByteOrder, s.ptraceGetRegs()))
+	regs := s.ptraceGetRegs()
+	n, err := regs.WriteTo(dst)
+	return int(n), err
 }
 
-func (s *State) ptraceGetRegs() syscall.PtraceRegs {
+func (s *State) ptraceGetRegs() Registers {
 	return s.Regs
 }
 
-var ptraceRegsSize = int(binary.Size(syscall.PtraceRegs{}))
+var registersSize = (*Registers)(nil).SizeBytes()
 
 // PtraceSetRegs implements Context.PtraceSetRegs.
 func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
-	var regs syscall.PtraceRegs
-	buf := make([]byte, ptraceRegsSize)
+	var regs Registers
+	buf := make([]byte, registersSize)
 	if _, err := io.ReadFull(src, buf); err != nil {
 		return 0, err
 	}
-	binary.Unmarshal(buf, usermem.ByteOrder, &regs)
+	regs.UnmarshalUnsafe(buf)
 	s.Regs = regs
-	return ptraceRegsSize, nil
+	return registersSize, nil
 }
 
 // PtraceGetFPRegs implements Context.PtraceGetFPRegs.
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index 85d6acc0f..3b3a0a272 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -22,7 +22,6 @@ import (
 	"math/rand"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -301,8 +300,10 @@ func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
 	// PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and
 	// u_debugreg, returning 0 or silently no-oping for other fields
 	// respectively.
-	if addr < uintptr(ptraceRegsSize) {
-		buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs())
+	if addr < uintptr(registersSize) {
+		regs := c.ptraceGetRegs()
+		buf := make([]byte, regs.SizeBytes())
+		regs.MarshalUnsafe(buf)
 		return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil
 	}
 	// Note: x86 debug registers are missing.
@@ -314,8 +315,10 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error {
 	if addr&7 != 0 || addr >= userStructSize {
 		return syscall.EIO
 	}
-	if addr < uintptr(ptraceRegsSize) {
-		buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs())
+	if addr < uintptr(registersSize) {
+		regs := c.ptraceGetRegs()
+		buf := make([]byte, regs.SizeBytes())
+		regs.MarshalUnsafe(buf)
 		usermem.ByteOrder.PutUint64(buf[addr:], uint64(data))
 		_, err := c.PtraceSetRegs(bytes.NewBuffer(buf))
 		return err
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index db99c5acb..ada7ac7b8 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build arm64
+
 package arch
 
 import (
diff --git a/pkg/sentry/arch/arch_state_aarch64.go b/pkg/sentry/arch/arch_state_aarch64.go
deleted file mode 100644
index 0136a85ad..000000000
--- a/pkg/sentry/arch/arch_state_aarch64.go
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build arm64
-
-package arch
-
-import (
-	"syscall"
-)
-
-type syscallPtraceRegs struct {
-	Regs   [31]uint64
-	Sp     uint64
-	Pc     uint64
-	Pstate uint64
-}
-
-// saveRegs is invoked by stateify.
-func (s *State) saveRegs() syscallPtraceRegs {
-	return syscallPtraceRegs(s.Regs)
-}
-
-// loadRegs is invoked by stateify.
-func (s *State) loadRegs(r syscallPtraceRegs) {
-	s.Regs = syscall.PtraceRegs(r)
-}
diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go
index aa31169e0..19ce99d25 100644
--- a/pkg/sentry/arch/arch_state_x86.go
+++ b/pkg/sentry/arch/arch_state_x86.go
@@ -18,7 +18,6 @@ package arch
 
 import (
 	"fmt"
-	"syscall"
 
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -90,44 +89,3 @@ func (s *State) afterLoadFPState() {
 	// Copy to the new, aligned location.
 	copy(s.x86FPState, old)
 }
-
-// +stateify savable
-type syscallPtraceRegs struct {
-	R15      uint64
-	R14      uint64
-	R13      uint64
-	R12      uint64
-	Rbp      uint64
-	Rbx      uint64
-	R11      uint64
-	R10      uint64
-	R9       uint64
-	R8       uint64
-	Rax      uint64
-	Rcx      uint64
-	Rdx      uint64
-	Rsi      uint64
-	Rdi      uint64
-	Orig_rax uint64
-	Rip      uint64
-	Cs       uint64
-	Eflags   uint64
-	Rsp      uint64
-	Ss       uint64
-	Fs_base  uint64
-	Gs_base  uint64
-	Ds       uint64
-	Es       uint64
-	Fs       uint64
-	Gs       uint64
-}
-
-// saveRegs is invoked by stateify.
-func (s *State) saveRegs() syscallPtraceRegs {
-	return syscallPtraceRegs(s.Regs)
-}
-
-// loadRegs is invoked by stateify.
-func (s *State) loadRegs(r syscallPtraceRegs) {
-	s.Regs = syscall.PtraceRegs(r)
-}
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index 7fc4c0473..dc458b37f 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -21,7 +21,7 @@ import (
 	"io"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
@@ -30,6 +30,9 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// Registers represents the CPU registers for this architecture.
+type Registers = linux.PtraceRegs
+
 // System-related constants for x86.
 const (
 	// SyscallWidth is the width of syscall, sysenter, and int 80 insturctions.
@@ -267,10 +270,12 @@ func (s *State) RegisterMap() (map[string]uintptr, error) {
 
 // PtraceGetRegs implements Context.PtraceGetRegs.
 func (s *State) PtraceGetRegs(dst io.Writer) (int, error) {
-	return dst.Write(binary.Marshal(nil, usermem.ByteOrder, s.ptraceGetRegs()))
+	regs := s.ptraceGetRegs()
+	n, err := regs.WriteTo(dst)
+	return int(n), err
 }
 
-func (s *State) ptraceGetRegs() syscall.PtraceRegs {
+func (s *State) ptraceGetRegs() Registers {
 	regs := s.Regs
 	// These may not be initialized.
 	if regs.Cs == 0 || regs.Ss == 0 || regs.Eflags == 0 {
@@ -306,16 +311,16 @@ func (s *State) ptraceGetRegs() syscall.PtraceRegs {
 	return regs
 }
 
-var ptraceRegsSize = int(binary.Size(syscall.PtraceRegs{}))
+var registersSize = (*Registers)(nil).SizeBytes()
 
 // PtraceSetRegs implements Context.PtraceSetRegs.
 func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
-	var regs syscall.PtraceRegs
-	buf := make([]byte, ptraceRegsSize)
+	var regs Registers
+	buf := make([]byte, registersSize)
 	if _, err := io.ReadFull(src, buf); err != nil {
 		return 0, err
 	}
-	binary.Unmarshal(buf, usermem.ByteOrder, &regs)
+	regs.UnmarshalUnsafe(buf)
 	// Truncate segment registers to 16 bits.
 	regs.Cs = uint64(uint16(regs.Cs))
 	regs.Ds = uint64(uint16(regs.Ds))
@@ -369,7 +374,7 @@ func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
 	}
 	regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable)
 	s.Regs = regs
-	return ptraceRegsSize, nil
+	return registersSize, nil
 }
 
 // isUserSegmentSelector returns true if the given segment selector specifies a
@@ -538,7 +543,7 @@ const (
 func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
 	switch regset {
 	case _NT_PRSTATUS:
-		if maxlen < ptraceRegsSize {
+		if maxlen < registersSize {
 			return 0, syserror.EFAULT
 		}
 		return s.PtraceGetRegs(dst)
@@ -558,7 +563,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int,
 func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
 	switch regset {
 	case _NT_PRSTATUS:
-		if maxlen < ptraceRegsSize {
+		if maxlen < registersSize {
 			return 0, syserror.EFAULT
 		}
 		return s.PtraceSetRegs(src)
diff --git a/pkg/sentry/arch/arch_x86_impl.go b/pkg/sentry/arch/arch_x86_impl.go
index 3edf40764..0c73fcbfb 100644
--- a/pkg/sentry/arch/arch_x86_impl.go
+++ b/pkg/sentry/arch/arch_x86_impl.go
@@ -17,8 +17,6 @@
 package arch
 
 import (
-	"syscall"
-
 	"gvisor.dev/gvisor/pkg/cpuid"
 )
 
@@ -28,7 +26,7 @@ import (
 // +stateify savable
 type State struct {
 	// The system registers.
-	Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"`
+	Regs Registers
 
 	// Our floating point state.
 	x86FPState `state:"wait"`
diff --git a/pkg/sentry/arch/signal.go b/pkg/sentry/arch/signal.go
index 8b03d0187..c9fb55d00 100644
--- a/pkg/sentry/arch/signal.go
+++ b/pkg/sentry/arch/signal.go
@@ -22,6 +22,7 @@ import (
 // SignalAct represents the action that should be taken when a signal is
 // delivered, and is equivalent to struct sigaction.
 //
+// +marshal
 // +stateify savable
 type SignalAct struct {
 	Handler  uint64
@@ -43,6 +44,7 @@ func (s *SignalAct) DeserializeTo(other *SignalAct) {
 // SignalStack represents information about a user stack, and is equivalent to
 // stack_t.
 //
+// +marshal
 // +stateify savable
 type SignalStack struct {
 	Addr  uint64
@@ -64,6 +66,7 @@ func (s *SignalStack) DeserializeTo(other *SignalStack) {
 // SignalInfo represents information about a signal being delivered, and is
 // equivalent to struct siginfo in linux kernel(linux/include/uapi/asm-generic/siginfo.h).
 //
+// +marshal
 // +stateify savable
 type SignalInfo struct {
 	Signo int32 // Signal number
diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go
index f9ca2e74e..32173aa20 100644
--- a/pkg/sentry/arch/signal_act.go
+++ b/pkg/sentry/arch/signal_act.go
@@ -14,6 +14,8 @@
 
 package arch
 
+import "gvisor.dev/gvisor/tools/go_marshal/marshal"
+
 // Special values for SignalAct.Handler.
 const (
 	// SignalActDefault is SIG_DFL and specifies that the default behavior for
@@ -71,6 +73,8 @@ func (s SignalAct) HasRestorer() bool {
 // NativeSignalAct is a type that is equivalent to struct sigaction in the
 // guest architecture.
 type NativeSignalAct interface {
+	marshal.Marshallable
+
 	// SerializeFrom copies the data in the host SignalAct s into this object.
 	SerializeFrom(s *SignalAct)
 
diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go
index e58f055c7..0fa738a1d 100644
--- a/pkg/sentry/arch/signal_stack.go
+++ b/pkg/sentry/arch/signal_stack.go
@@ -18,6 +18,7 @@ package arch
 
 import (
 	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
 const (
@@ -55,6 +56,8 @@ func (s *SignalStack) Contains(sp usermem.Addr) bool {
 // NativeSignalStack is a type that is equivalent to stack_t in the guest
 // architecture.
 type NativeSignalStack interface {
+	marshal.Marshallable
+
 	// SerializeFrom copies the data in the host SignalStack s into this
 	// object.
 	SerializeFrom(s *SignalStack)
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 7d25e98f7..79766cafe 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -716,7 +716,7 @@ func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (a
 func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error {
 	n := t.Arch().NewSignalAct()
 	n.SerializeFrom(s)
-	_, err := t.CopyOut(addr, n)
+	_, err := n.CopyOut(t, addr)
 	return err
 }
 
@@ -725,7 +725,7 @@ func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error {
 func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) {
 	n := t.Arch().NewSignalAct()
 	var s arch.SignalAct
-	if _, err := t.CopyIn(addr, n); err != nil {
+	if _, err := n.CopyIn(t, addr); err != nil {
 		return s, err
 	}
 	n.DeserializeTo(&s)
@@ -737,7 +737,7 @@ func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) {
 func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error {
 	n := t.Arch().NewSignalStack()
 	n.SerializeFrom(s)
-	_, err := t.CopyOut(addr, n)
+	_, err := n.CopyOut(t, addr)
 	return err
 }
 
@@ -746,7 +746,7 @@ func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error
 func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) {
 	n := t.Arch().NewSignalStack()
 	var s arch.SignalStack
-	if _, err := t.CopyIn(addr, n); err != nil {
+	if _, err := n.CopyIn(t, addr); err != nil {
 		return s, err
 	}
 	n.DeserializeTo(&s)
diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go
index 716198712..29d457a7e 100644
--- a/pkg/sentry/platform/kvm/kvm_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_arm64.go
@@ -17,8 +17,7 @@
 package kvm
 
 import (
-	"syscall"
-
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
 
@@ -37,7 +36,7 @@ type userFpsimdState struct {
 }
 
 type userRegs struct {
-	Regs    syscall.PtraceRegs
+	Regs    arch.Registers
 	sp_el1  uint64
 	elr_el1 uint64
 	spsr    [KVM_NR_SPSR]uint64
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index c42752d50..6c8f4fa28 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -117,10 +117,10 @@ func TestKernelFloatingPoint(t *testing.T) {
 	})
 }
 
-func applicationTest(t testHarness, useHostMappings bool, target func(), fn func(*vCPU, *syscall.PtraceRegs, *pagetables.PageTables) bool) {
+func applicationTest(t testHarness, useHostMappings bool, target func(), fn func(*vCPU, *arch.Registers, *pagetables.PageTables) bool) {
 	// Initialize registers & page tables.
 	var (
-		regs syscall.PtraceRegs
+		regs arch.Registers
 		pt   *pagetables.PageTables
 	)
 	testutil.SetTestTarget(&regs, target)
@@ -154,7 +154,7 @@ func applicationTest(t testHarness, useHostMappings bool, target func(), fn func
 }
 
 func TestApplicationSyscall(t *testing.T) {
-	applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
 			Registers:          regs,
@@ -168,7 +168,7 @@ func TestApplicationSyscall(t *testing.T) {
 		}
 		return false
 	})
-	applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
 			Registers:          regs,
@@ -184,7 +184,7 @@ func TestApplicationSyscall(t *testing.T) {
 }
 
 func TestApplicationFault(t *testing.T) {
-	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		testutil.SetTouchTarget(regs, nil) // Cause fault.
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
@@ -199,7 +199,7 @@ func TestApplicationFault(t *testing.T) {
 		}
 		return false
 	})
-	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		testutil.SetTouchTarget(regs, nil) // Cause fault.
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
@@ -216,7 +216,7 @@ func TestApplicationFault(t *testing.T) {
 }
 
 func TestRegistersSyscall(t *testing.T) {
-	applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		testutil.SetTestRegs(regs) // Fill values for all registers.
 		for {
 			var si arch.SignalInfo
@@ -239,7 +239,7 @@ func TestRegistersSyscall(t *testing.T) {
 }
 
 func TestRegistersFault(t *testing.T) {
-	applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		testutil.SetTestRegs(regs) // Fill values for all registers.
 		for {
 			var si arch.SignalInfo
@@ -263,7 +263,7 @@ func TestRegistersFault(t *testing.T) {
 }
 
 func TestSegments(t *testing.T) {
-	applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		testutil.SetTestSegments(regs)
 		for {
 			var si arch.SignalInfo
@@ -287,7 +287,7 @@ func TestSegments(t *testing.T) {
 }
 
 func TestBounce(t *testing.T) {
-	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		go func() {
 			time.Sleep(time.Millisecond)
 			c.BounceToKernel()
@@ -302,7 +302,7 @@ func TestBounce(t *testing.T) {
 		}
 		return false
 	})
-	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		go func() {
 			time.Sleep(time.Millisecond)
 			c.BounceToKernel()
@@ -321,7 +321,7 @@ func TestBounce(t *testing.T) {
 }
 
 func TestBounceStress(t *testing.T) {
-	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		randomSleep := func() {
 			// O(hundreds of microseconds) is appropriate to ensure
 			// different overlaps and different schedules.
@@ -357,7 +357,7 @@ func TestBounceStress(t *testing.T) {
 
 func TestInvalidate(t *testing.T) {
 	var data uintptr // Used below.
-	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		testutil.SetTouchTarget(regs, &data) // Read legitimate value.
 		for {
 			var si arch.SignalInfo
@@ -398,7 +398,7 @@ func IsFault(err error, si *arch.SignalInfo) bool {
 }
 
 func TestEmptyAddressSpace(t *testing.T) {
-	applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
 			Registers:          regs,
@@ -412,7 +412,7 @@ func TestEmptyAddressSpace(t *testing.T) {
 		}
 		return false
 	})
-	applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
 			Registers:          regs,
@@ -471,7 +471,7 @@ func BenchmarkApplicationSyscall(b *testing.B) {
 		i int // Iteration includes machine.Get() / machine.Put().
 		a int // Count for ErrContextInterrupt.
 	)
-	applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
 			Registers:          regs,
@@ -493,7 +493,7 @@ func BenchmarkApplicationSyscall(b *testing.B) {
 
 func BenchmarkKernelSyscall(b *testing.B) {
 	// Note that the target passed here is irrelevant, we never execute SwitchToUser.
-	applicationTest(b, true, testutil.Getpid, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(b, true, testutil.Getpid, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		// iteration does not include machine.Get() / machine.Put().
 		for i := 0; i < b.N; i++ {
 			testutil.Getpid()
@@ -508,7 +508,7 @@ func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) {
 		i int
 		a int
 	)
-	applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
 			Registers:          regs,
diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD
index f7605df8a..f7feb8683 100644
--- a/pkg/sentry/platform/kvm/testutil/BUILD
+++ b/pkg/sentry/platform/kvm/testutil/BUILD
@@ -13,4 +13,5 @@ go_library(
         "testutil_arm64.s",
     ],
     visibility = ["//pkg/sentry/platform/kvm:__pkg__"],
+    deps = ["//pkg/sentry/arch"],
 )
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go
index 4c108abbf..8048eedec 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go
+++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go
@@ -18,19 +18,20 @@ package testutil
 
 import (
 	"reflect"
-	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // TwiddleSegments reads segments into known registers.
 func TwiddleSegments()
 
 // SetTestTarget sets the rip appropriately.
-func SetTestTarget(regs *syscall.PtraceRegs, fn func()) {
+func SetTestTarget(regs *arch.Registers, fn func()) {
 	regs.Rip = uint64(reflect.ValueOf(fn).Pointer())
 }
 
 // SetTouchTarget sets rax appropriately.
-func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) {
+func SetTouchTarget(regs *arch.Registers, target *uintptr) {
 	if target != nil {
 		regs.Rax = uint64(reflect.ValueOf(target).Pointer())
 	} else {
@@ -39,12 +40,12 @@ func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) {
 }
 
 // RewindSyscall rewinds a syscall RIP.
-func RewindSyscall(regs *syscall.PtraceRegs) {
+func RewindSyscall(regs *arch.Registers) {
 	regs.Rip -= 2
 }
 
 // SetTestRegs initializes registers to known values.
-func SetTestRegs(regs *syscall.PtraceRegs) {
+func SetTestRegs(regs *arch.Registers) {
 	regs.R15 = 0x15
 	regs.R14 = 0x14
 	regs.R13 = 0x13
@@ -64,7 +65,7 @@ func SetTestRegs(regs *syscall.PtraceRegs) {
 }
 
 // CheckTestRegs checks that registers were twiddled per TwiddleRegs.
-func CheckTestRegs(regs *syscall.PtraceRegs, full bool) (err error) {
+func CheckTestRegs(regs *arch.Registers, full bool) (err error) {
 	if need := ^uint64(0x15); regs.R15 != need {
 		err = addRegisterMismatch(err, "R15", regs.R15, need)
 	}
@@ -121,13 +122,13 @@ var fsData uint64 = 0x55
 var gsData uint64 = 0x85
 
 // SetTestSegments initializes segments to known values.
-func SetTestSegments(regs *syscall.PtraceRegs) {
+func SetTestSegments(regs *arch.Registers) {
 	regs.Fs_base = uint64(reflect.ValueOf(&fsData).Pointer())
 	regs.Gs_base = uint64(reflect.ValueOf(&gsData).Pointer())
 }
 
 // CheckTestSegments checks that registers were twiddled per TwiddleSegments.
-func CheckTestSegments(regs *syscall.PtraceRegs) (err error) {
+func CheckTestSegments(regs *arch.Registers) (err error) {
 	if regs.Rax != fsData {
 		err = addRegisterMismatch(err, "Rax", regs.Rax, fsData)
 	}
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
index 40b2e4acc..ca902c8c1 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
@@ -19,16 +19,17 @@ package testutil
 import (
 	"fmt"
 	"reflect"
-	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // SetTestTarget sets the rip appropriately.
-func SetTestTarget(regs *syscall.PtraceRegs, fn func()) {
+func SetTestTarget(regs *arch.Registers, fn func()) {
 	regs.Pc = uint64(reflect.ValueOf(fn).Pointer())
 }
 
 // SetTouchTarget sets rax appropriately.
-func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) {
+func SetTouchTarget(regs *arch.Registers, target *uintptr) {
 	if target != nil {
 		regs.Regs[8] = uint64(reflect.ValueOf(target).Pointer())
 	} else {
@@ -37,19 +38,19 @@ func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) {
 }
 
 // RewindSyscall rewinds a syscall RIP.
-func RewindSyscall(regs *syscall.PtraceRegs) {
+func RewindSyscall(regs *arch.Registers) {
 	regs.Pc -= 4
 }
 
 // SetTestRegs initializes registers to known values.
-func SetTestRegs(regs *syscall.PtraceRegs) {
+func SetTestRegs(regs *arch.Registers) {
 	for i := 0; i <= 30; i++ {
 		regs.Regs[i] = uint64(i) + 1
 	}
 }
 
 // CheckTestRegs checks that registers were twiddled per TwiddleRegs.
-func CheckTestRegs(regs *syscall.PtraceRegs, full bool) (err error) {
+func CheckTestRegs(regs *arch.Registers, full bool) (err error) {
 	for i := 0; i <= 30; i++ {
 		if need := ^uint64(i + 1); regs.Regs[i] != need {
 			err = addRegisterMismatch(err, fmt.Sprintf("R%d", i), regs.Regs[i], need)
diff --git a/pkg/sentry/platform/ptrace/ptrace_amd64.go b/pkg/sentry/platform/ptrace/ptrace_amd64.go
index 24fc5dc62..3b9a870a5 100644
--- a/pkg/sentry/platform/ptrace/ptrace_amd64.go
+++ b/pkg/sentry/platform/ptrace/ptrace_amd64.go
@@ -15,9 +15,8 @@
 package ptrace
 
 import (
-	"syscall"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // fpRegSet returns the GETREGSET/SETREGSET register set type to be used.
@@ -28,12 +27,12 @@ func fpRegSet(useXsave bool) uintptr {
 	return linux.NT_PRFPREG
 }
 
-func stackPointer(r *syscall.PtraceRegs) uintptr {
+func stackPointer(r *arch.Registers) uintptr {
 	return uintptr(r.Rsp)
 }
 
 // x86 use the fs_base register to store the TLS pointer which can be
-// get/set in "func (t *thread) get/setRegs(regs *syscall.PtraceRegs)".
+// get/set in "func (t *thread) get/setRegs(regs *arch.Registers)".
 // So both of the get/setTLS() operations are noop here.
 
 // getTLS gets the thread local storage register.
diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64.go b/pkg/sentry/platform/ptrace/ptrace_arm64.go
index 4db28c534..5c869926a 100644
--- a/pkg/sentry/platform/ptrace/ptrace_arm64.go
+++ b/pkg/sentry/platform/ptrace/ptrace_arm64.go
@@ -15,9 +15,8 @@
 package ptrace
 
 import (
-	"syscall"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // fpRegSet returns the GETREGSET/SETREGSET register set type to be used.
@@ -25,6 +24,6 @@ func fpRegSet(_ bool) uintptr {
 	return linux.NT_PRFPREG
 }
 
-func stackPointer(r *syscall.PtraceRegs) uintptr {
+func stackPointer(r *arch.Registers) uintptr {
 	return uintptr(r.Sp)
 }
diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
index 6c0ed7b3e..8b72d24e8 100644
--- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go
+++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
@@ -24,7 +24,7 @@ import (
 )
 
 // getRegs gets the general purpose register set.
-func (t *thread) getRegs(regs *syscall.PtraceRegs) error {
+func (t *thread) getRegs(regs *arch.Registers) error {
 	iovec := syscall.Iovec{
 		Base: (*byte)(unsafe.Pointer(regs)),
 		Len:  uint64(unsafe.Sizeof(*regs)),
@@ -43,7 +43,7 @@ func (t *thread) getRegs(regs *syscall.PtraceRegs) error {
 }
 
 // setRegs sets the general purpose register set.
-func (t *thread) setRegs(regs *syscall.PtraceRegs) error {
+func (t *thread) setRegs(regs *arch.Registers) error {
 	iovec := syscall.Iovec{
 		Base: (*byte)(unsafe.Pointer(regs)),
 		Len:  uint64(unsafe.Sizeof(*regs)),
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 773ddb1ed..2389423b0 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -63,7 +63,7 @@ type thread struct {
 	// initRegs are the initial registers for the first thread.
 	//
 	// These are used for the register set for system calls.
-	initRegs syscall.PtraceRegs
+	initRegs arch.Registers
 }
 
 // threadPool is a collection of threads.
@@ -317,7 +317,7 @@ const (
 )
 
 func (t *thread) dumpAndPanic(message string) {
-	var regs syscall.PtraceRegs
+	var regs arch.Registers
 	message += "\n"
 	if err := t.getRegs(&regs); err == nil {
 		message += dumpRegs(&regs)
@@ -423,7 +423,7 @@ func (t *thread) init() {
 // This is _not_ for use by application system calls, rather it is for use when
 // a system call must be injected into the remote context (e.g. mmap, munmap).
 // Note that clones are handled separately.
-func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
+func (t *thread) syscall(regs *arch.Registers) (uintptr, error) {
 	// Set registers.
 	if err := t.setRegs(regs); err != nil {
 		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
@@ -461,7 +461,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 // syscallIgnoreInterrupt ignores interrupts on the system call thread and
 // restarts the syscall if the kernel indicates that should happen.
 func (t *thread) syscallIgnoreInterrupt(
-	initRegs *syscall.PtraceRegs,
+	initRegs *arch.Registers,
 	sysno uintptr,
 	args ...arch.SyscallArgument) (uintptr, error) {
 	for {
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index cd74945e7..84b699f0d 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -41,7 +41,7 @@ const (
 // resetSysemuRegs sets up emulation registers.
 //
 // This should be called prior to calling sysemu.
-func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
+func (t *thread) resetSysemuRegs(regs *arch.Registers) {
 	regs.Cs = t.initRegs.Cs
 	regs.Ss = t.initRegs.Ss
 	regs.Ds = t.initRegs.Ds
@@ -53,7 +53,7 @@ func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
 // createSyscallRegs sets up syscall registers.
 //
 // This should be called to generate registers for a system call.
-func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs {
+func createSyscallRegs(initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) arch.Registers {
 	// Copy initial registers.
 	regs := *initRegs
 
@@ -82,18 +82,18 @@ func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch
 }
 
 // isSingleStepping determines if the registers indicate single-stepping.
-func isSingleStepping(regs *syscall.PtraceRegs) bool {
+func isSingleStepping(regs *arch.Registers) bool {
 	return (regs.Eflags & arch.X86TrapFlag) != 0
 }
 
 // updateSyscallRegs updates registers after finishing sysemu.
-func updateSyscallRegs(regs *syscall.PtraceRegs) {
+func updateSyscallRegs(regs *arch.Registers) {
 	// Ptrace puts -ENOSYS in rax on syscall-enter-stop.
 	regs.Rax = regs.Orig_rax
 }
 
 // syscallReturnValue extracts a sensible return from registers.
-func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) {
+func syscallReturnValue(regs *arch.Registers) (uintptr, error) {
 	rval := int64(regs.Rax)
 	if rval < 0 {
 		return 0, syscall.Errno(-rval)
@@ -101,7 +101,7 @@ func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) {
 	return uintptr(rval), nil
 }
 
-func dumpRegs(regs *syscall.PtraceRegs) string {
+func dumpRegs(regs *arch.Registers) string {
 	var m strings.Builder
 
 	fmt.Fprintf(&m, "Registers:\n")
@@ -143,7 +143,7 @@ func (t *thread) adjustInitRegsRip() {
 }
 
 // Pass the expected PPID to the child via R15 when creating stub process.
-func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
+func initChildProcessPPID(initregs *arch.Registers, ppid int32) {
 	initregs.R15 = uint64(ppid)
 	// Rbx has to be set to 1 when creating stub process.
 	initregs.Rbx = 1
@@ -156,7 +156,7 @@ func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
 //
 // Note that this should only be called after verifying that the signalInfo has
 // been generated by the kernel.
-func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
+func patchSignalInfo(regs *arch.Registers, signalInfo *arch.SignalInfo) {
 	if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
 		signalInfo.Signo = int32(linux.SIGSEGV)
 
diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go
index 7f5c393f0..bd618fae8 100644
--- a/pkg/sentry/platform/ptrace/subprocess_arm64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go
@@ -41,13 +41,13 @@ const (
 // resetSysemuRegs sets up emulation registers.
 //
 // This should be called prior to calling sysemu.
-func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
+func (t *thread) resetSysemuRegs(regs *arch.Registers) {
 }
 
 // createSyscallRegs sets up syscall registers.
 //
 // This should be called to generate registers for a system call.
-func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs {
+func createSyscallRegs(initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) arch.Registers {
 	// Copy initial registers (Pc, Sp, etc.).
 	regs := *initRegs
 
@@ -78,7 +78,7 @@ func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch
 }
 
 // isSingleStepping determines if the registers indicate single-stepping.
-func isSingleStepping(regs *syscall.PtraceRegs) bool {
+func isSingleStepping(regs *arch.Registers) bool {
 	// Refer to the ARM SDM D2.12.3: software step state machine
 	// return (regs.Pstate.SS == 1) && (MDSCR_EL1.SS == 1).
 	//
@@ -89,13 +89,13 @@ func isSingleStepping(regs *syscall.PtraceRegs) bool {
 }
 
 // updateSyscallRegs updates registers after finishing sysemu.
-func updateSyscallRegs(regs *syscall.PtraceRegs) {
+func updateSyscallRegs(regs *arch.Registers) {
 	// No special work is necessary.
 	return
 }
 
 // syscallReturnValue extracts a sensible return from registers.
-func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) {
+func syscallReturnValue(regs *arch.Registers) (uintptr, error) {
 	rval := int64(regs.Regs[0])
 	if rval < 0 {
 		return 0, syscall.Errno(-rval)
@@ -103,7 +103,7 @@ func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) {
 	return uintptr(rval), nil
 }
 
-func dumpRegs(regs *syscall.PtraceRegs) string {
+func dumpRegs(regs *arch.Registers) string {
 	var m strings.Builder
 
 	fmt.Fprintf(&m, "Registers:\n")
@@ -125,7 +125,7 @@ func (t *thread) adjustInitRegsRip() {
 }
 
 // Pass the expected PPID to the child via X7 when creating stub process
-func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
+func initChildProcessPPID(initregs *arch.Registers, ppid int32) {
 	initregs.Regs[7] = uint64(ppid)
 	// R9 has to be set to 1 when creating stub process.
 	initregs.Regs[9] = 1
@@ -138,7 +138,7 @@ func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
 //
 // Note that this should only be called after verifying that the signalInfo has
 // been generated by the kernel.
-func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
+func patchSignalInfo(regs *arch.Registers, signalInfo *arch.SignalInfo) {
 	if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
 		signalInfo.Signo = int32(linux.SIGSEGV)
 
diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD
index b69520030..679b287c3 100644
--- a/pkg/sentry/platform/ring0/BUILD
+++ b/pkg/sentry/platform/ring0/BUILD
@@ -79,6 +79,7 @@ go_library(
     deps = [
         "//pkg/cpuid",
         "//pkg/safecopy",
+        "//pkg/sentry/arch",
         "//pkg/sentry/platform/ring0/pagetables",
         "//pkg/usermem",
     ],
diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go
index 86fd5ed58..e6daf24df 100644
--- a/pkg/sentry/platform/ring0/defs.go
+++ b/pkg/sentry/platform/ring0/defs.go
@@ -15,8 +15,7 @@
 package ring0
 
 import (
-	"syscall"
-
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
 )
 
@@ -72,7 +71,7 @@ type CPU struct {
 
 	// registers is a set of registers; these may be used on kernel system
 	// calls and exceptions via the Registers function.
-	registers syscall.PtraceRegs
+	registers arch.Registers
 
 	// hooks are kernel hooks.
 	hooks Hooks
@@ -83,14 +82,14 @@ type CPU struct {
 // This is explicitly safe to call during KernelException and KernelSyscall.
 //
 //go:nosplit
-func (c *CPU) Registers() *syscall.PtraceRegs {
+func (c *CPU) Registers() *arch.Registers {
 	return &c.registers
 }
 
 // SwitchOpts are passed to the Switch function.
 type SwitchOpts struct {
 	// Registers are the user register state.
-	Registers *syscall.PtraceRegs
+	Registers *arch.Registers
 
 	// FloatingPointState is a byte pointer where floating point state is
 	// saved and restored.
diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go
index a5ce67885..7fa43c2f5 100644
--- a/pkg/sentry/platform/ring0/entry_amd64.go
+++ b/pkg/sentry/platform/ring0/entry_amd64.go
@@ -17,7 +17,7 @@
 package ring0
 
 import (
-	"syscall"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // This is an assembly function.
@@ -41,7 +41,7 @@ func swapgs()
 // The return code is the vector that interrupted execution.
 //
 // See stubs.go for a note regarding the frame size of this function.
-func sysret(*CPU, *syscall.PtraceRegs) Vector
+func sysret(*CPU, *arch.Registers) Vector
 
 // "iret is the cadillac of CPL switching."
 //
@@ -50,7 +50,7 @@ func sysret(*CPU, *syscall.PtraceRegs) Vector
 // iret is nearly identical to sysret, except an iret is used to fully restore
 // all user state. This must be called in cases where all registers need to be
 // restored.
-func iret(*CPU, *syscall.PtraceRegs) Vector
+func iret(*CPU, *arch.Registers) Vector
 
 // exception is the generic exception entry.
 //
diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD
index 4cae10459..549f3d228 100644
--- a/pkg/sentry/platform/ring0/gen_offsets/BUILD
+++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD
@@ -27,6 +27,7 @@ go_binary(
     visibility = ["//pkg/sentry/platform/ring0:__pkg__"],
     deps = [
         "//pkg/cpuid",
+        "//pkg/sentry/arch",
         "//pkg/sentry/platform/ring0/pagetables",
         "//pkg/usermem",
     ],
diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go
index 85cc3fdad..b8ab120a0 100644
--- a/pkg/sentry/platform/ring0/offsets_amd64.go
+++ b/pkg/sentry/platform/ring0/offsets_amd64.go
@@ -20,7 +20,8 @@ import (
 	"fmt"
 	"io"
 	"reflect"
-	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // Emit prints architecture-specific offsets.
@@ -64,7 +65,7 @@ func Emit(w io.Writer) {
 	fmt.Fprintf(w, "#define SyscallInt80               0x%02x\n", SyscallInt80)
 	fmt.Fprintf(w, "#define Syscall                    0x%02x\n", Syscall)
 
-	p := &syscall.PtraceRegs{}
+	p := &arch.Registers{}
 	fmt.Fprintf(w, "\n// Ptrace registers.\n")
 	fmt.Fprintf(w, "#define PTRACE_R15      0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer())
 	fmt.Fprintf(w, "#define PTRACE_R14      0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer())
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index 057fb5c69..f3de962f0 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -20,7 +20,8 @@ import (
 	"fmt"
 	"io"
 	"reflect"
-	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // Emit prints architecture-specific offsets.
@@ -87,7 +88,7 @@ func Emit(w io.Writer) {
 	fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall)
 	fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException)
 
-	p := &syscall.PtraceRegs{}
+	p := &arch.Registers{}
 	fmt.Fprintf(w, "\n// Ptrace registers.\n")
 	fmt.Fprintf(w, "#define PTRACE_R0       0x%02x\n", reflect.ValueOf(&p.Regs[0]).Pointer()-reflect.ValueOf(p).Pointer())
 	fmt.Fprintf(w, "#define PTRACE_R1       0x%02x\n", reflect.ValueOf(&p.Regs[1]).Pointer()-reflect.ValueOf(p).Pointer())
diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go
index 7e1747a0c..582d37e03 100644
--- a/pkg/sentry/syscalls/linux/sys_signal.go
+++ b/pkg/sentry/syscalls/linux/sys_signal.go
@@ -355,7 +355,7 @@ func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 func RtSigpending(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
 	pending := t.PendingSignals()
-	_, err := t.CopyOut(addr, pending)
+	_, err := pending.CopyOut(t, addr)
 	return 0, nil, err
 }
 
@@ -392,7 +392,7 @@ func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
 
 	if siginfo != 0 {
 		si.FixSignalCodeForUser()
-		if _, err := t.CopyOut(siginfo, si); err != nil {
+		if _, err := si.CopyOut(t, siginfo); err != nil {
 			return 0, nil, err
 		}
 	}
@@ -411,7 +411,7 @@ func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
 	// same way), and that the code is in the allowed set. This same logic
 	// appears below in RtSigtgqueueinfo and should be kept in sync.
 	var info arch.SignalInfo
-	if _, err := t.CopyIn(infoAddr, &info); err != nil {
+	if _, err := info.CopyIn(t, infoAddr); err != nil {
 		return 0, nil, err
 	}
 	info.Signo = int32(sig)
@@ -455,7 +455,7 @@ func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker
 
 	// Copy in the info. See RtSigqueueinfo above.
 	var info arch.SignalInfo
-	if _, err := t.CopyIn(infoAddr, &info); err != nil {
+	if _, err := info.CopyIn(t, infoAddr); err != nil {
 		return 0, nil, err
 	}
 	info.Signo = int32(sig)
@@ -485,7 +485,7 @@ func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 
 	// Copy in the signal mask.
 	var mask linux.SignalSet
-	if _, err := t.CopyIn(sigset, &mask); err != nil {
+	if _, err := mask.CopyIn(t, sigset); err != nil {
 		return 0, nil, err
 	}
 	mask &^= kernel.UnblockableSignals
-- 
cgit v1.2.3